I. Introduction
This article builds a convolutional neural network (CNN) in Python to recognize MNIST handwritten-digit images. The digits range from 0 to 9, and the dataset is split into 60,000 training samples and 10,000 test samples.
II. Building the Convolutional Neural Network
The biggest difference between a convolutional neural network and an ordinary neural network is that it is built mainly from convolution and pooling layers rather than fully connected layers. Convolution and pooling layers not only learn the spatial structure of the data, they also reduce the number of model parameters and make learning more efficient. The network here is built in seven parts: data preprocessing, the convolution layer, the pooling layer, the fully connected layer (Affine), the activation functions, the optimizer, and the loss function.
1. Data Preprocessing
Since the MNIST data is stored as gzip-compressed files, we first need to decompress it and define how the images are reshaped. Start by importing the required modules and defining the data files and array shapes:
import numpy as np
import gzip
key_file = {
    'train_img': 'mnist/train-images-idx3-ubyte.gz',
    'train_label': 'mnist/train-labels-idx1-ubyte.gz',
    'test_img': 'mnist/t10k-images-idx3-ubyte.gz',
    'test_label': 'mnist/t10k-labels-idx1-ubyte.gz'
}
train_num = 60000
test_num = 10000
img_dim = (1, 28, 28)
img_size = 784
Next, write the function that decompresses and loads the data:
def load_mnist(normalize=True, flatten=True, one_hot_label=False):
    dataset = {}
    for key, file in key_file.items():
        if 'label' not in key:
            with gzip.open(file, 'rb') as f:
                # image files: skip the 16-byte header, then read the raw pixels
                data = np.frombuffer(f.read(), np.uint8, offset=16)
                data = data.reshape(-1, img_size)
                dataset[key] = data
        else:
            with gzip.open(file, 'rb') as f:
                # label files: skip the 8-byte header
                labels = np.frombuffer(f.read(), np.uint8, offset=8)
                dataset[key] = labels
    if normalize:
        for key in ('train_img', 'test_img'):
            dataset[key] = dataset[key].astype(np.float32) / 255.0
    if one_hot_label:
        for key in ('train_label', 'test_label'):
            dataset[key] = np.eye(10)[dataset[key]]
    if not flatten:
        for key in ('train_img', 'test_img'):
            dataset[key] = dataset[key].reshape(-1, 1, 28, 28)
    return (dataset['train_img'], dataset['train_label']), (dataset['test_img'], dataset['test_label'])
The flatten parameter controls whether each image is returned as a flat vector of 784 values or kept as a 1x28x28 array; normalize scales the pixel values from 0-255 to the range [0, 1], and one_hot_label converts the labels to one-hot vectors.
Split the data into a training set and a test set:
(x_train, y_train), (x_test, y_test) = load_mnist(flatten=False)
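As a quick sanity check (assuming the gzip files sit in the mnist/ directory defined in key_file above), the loaded arrays should have the following shapes:
print(x_train.shape, y_train.shape)  # expected: (60000, 1, 28, 28) (60000,)
print(x_test.shape, y_test.shape)    # expected: (10000, 1, 28, 28) (10000,)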
2. Convolution Layer
Because of the 4-D structure of the input, a naive implementation of the convolution needs several nested for loops and is very slow. The im2col function unrolls the input so that it lines up with the filters, turning the 4-D data into a 2-D matrix; the convolution then becomes a single matrix multiplication, which is much more efficient.
Implementation of im2col:
def im2col(input_data, filter_h, filter_w, stride=1, pad=0):
    N, C, H, W = input_data.shape
    out_h = (H + 2*pad - filter_h) // stride + 1
    out_w = (W + 2*pad - filter_w) // stride + 1
    img = np.pad(input_data, [(0, 0), (0, 0), (pad, pad), (pad, pad)], 'constant')
    col = np.zeros((N, C, filter_h, filter_w, out_h, out_w))
    for y in range(filter_h):
        y_max = y + stride*out_h
        for x in range(filter_w):
            x_max = x + stride*out_w
            col[:, :, y, x, :, :] = img[:, :, y:y_max:stride, x:x_max:stride]
    col = col.transpose(0, 4, 5, 1, 2, 3).reshape(N*out_h*out_w, -1)
    return col
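A small illustration of what im2col produces, using made-up random input: for two single-channel 28x28 images and a 5x5 filter with stride 1 and no padding, there are 24x24 filter positions per image, so the result has 2*24*24 rows and 1*5*5 columns.
x = np.random.rand(2, 1, 28, 28)             # two single-channel 28x28 images
col = im2col(x, filter_h=5, filter_w=5, stride=1, pad=0)
print(col.shape)                              # (1152, 25) = (2*24*24, 1*5*5)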
When the convolution layer backpropagates, the gradients computed on the im2col-expanded data have to be folded back into the original image shape, so we also need the inverse operation of im2col.
Definition of col2im (the inverse of im2col):
def col2im(col, input_shape, filter_h, filter_w, stride=1, pad=0):
    N, C, H, W = input_shape
    out_h = (H + 2*pad - filter_h) // stride + 1
    out_w = (W + 2*pad - filter_w) // stride + 1
    col = col.reshape(N, out_h, out_w, C, filter_h, filter_w).transpose(0, 3, 4, 5, 1, 2)
    img = np.zeros((N, C, H + 2*pad + stride - 1, W + 2*pad + stride - 1))
    for y in range(filter_h):
        y_max = y + stride*out_h
        for x in range(filter_w):
            x_max = x + stride*out_w
            img[:, :, y:y_max:stride, x:x_max:stride] += col[:, :, y, x, :, :]
    return img[:, :, pad:H + pad, pad:W + pad]
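Note that col2im sums values that land on the same pixel, which is exactly the accumulation the gradient needs; only when the windows do not overlap (stride equal to the filter size) is it an exact inverse of im2col, as this small check with made-up data illustrates:
x = np.random.rand(2, 1, 28, 28)
col = im2col(x, 2, 2, stride=2, pad=0)        # non-overlapping 2x2 windows
print(np.allclose(col2im(col, x.shape, 2, 2, stride=2, pad=0), x))  # True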
The convolution layer itself defines both the forward pass (forward) and the backward pass (backward):
class Convolution:
    def __init__(self, W, b, stride=1, pad=0):
        self.W = W                # filters, shape (FN, C, FH, FW)
        self.b = b                # biases, shape (FN,)
        self.stride = stride
        self.pad = pad
        self.x = None             # cached for the backward pass
        self.col = None
        self.col_W = None
        self.dW = None            # gradients
        self.db = None

    def forward(self, x):
        FN, C, FH, FW = self.W.shape
        N, C, H, W = x.shape
        out_h = 1 + int((H + 2*self.pad - FH) / self.stride)
        out_w = 1 + int((W + 2*self.pad - FW) / self.stride)
        # unroll the input and the filters, then convolve as one matrix product
        col = im2col(x, FH, FW, self.stride, self.pad)
        col_W = self.W.reshape(FN, -1).T
        out = np.dot(col, col_W) + self.b
        out = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)
        self.x = x
        self.col = col
        self.col_W = col_W
        return out

    def backward(self, dout):
        FN, C, FH, FW = self.W.shape
        dout = dout.transpose(0, 2, 3, 1).reshape(-1, FN)
        self.db = np.sum(dout, axis=0)
        self.dW = np.dot(self.col.T, dout)
        self.dW = self.dW.transpose(1, 0).reshape(FN, C, FH, FW)
        dcol = np.dot(dout, self.col_W.T)
        # fold the column gradient back into the input image shape
        dx = col2im(dcol, self.x.shape, FH, FW, self.stride, self.pad)
        return dx
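A shape check with made-up weights (30 filters of size 5x5, stride 1, no padding, matching the configuration used later in SimpleConvNet):
W = 0.01 * np.random.randn(30, 1, 5, 5)
b = np.zeros(30)
conv = Convolution(W, b, stride=1, pad=0)
print(conv.forward(np.random.rand(10, 1, 28, 28)).shape)  # (10, 30, 24, 24)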
3. Pooling Layer
The pooling layer downsamples the output of the convolution layer, shrinking it along the height and width dimensions. There are two common kinds of pooling, max pooling and average pooling, and the stride is usually set equal to the size of the pooling window. Max pooling is used here, i.e. taking the maximum value inside each pooling window.
Definition of the pooling layer:
class Pooling:
    def __init__(self, pool_h, pool_w, stride=1, pad=0):
        self.pool_h = pool_h
        self.pool_w = pool_w
        self.stride = stride
        self.pad = pad
        self.x = None
        self.arg_max = None

    def forward(self, x):
        N, C, H, W = x.shape
        out_h = int(1 + (H - self.pool_h) / self.stride)
        out_w = int(1 + (W - self.pool_w) / self.stride)
        col = im2col(x, self.pool_h, self.pool_w, self.stride, self.pad)
        col = col.reshape(-1, self.pool_h*self.pool_w)
        arg_max = np.argmax(col, axis=1)
        out = np.max(col, axis=1)
        out = out.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2)
        self.x = x
        self.arg_max = arg_max
        return out

    def backward(self, dout):
        dout = dout.transpose(0, 2, 3, 1)
        pool_size = self.pool_h * self.pool_w
        dmax = np.zeros((dout.size, pool_size))
        dmax[np.arange(self.arg_max.size), self.arg_max.flatten()] = dout.flatten()
        dmax = dmax.reshape(dout.shape + (pool_size,))
        dcol = dmax.reshape(dmax.shape[0] * dmax.shape[1] * dmax.shape[2], -1)
        dx = col2im(dcol, self.x.shape, self.pool_h, self.pool_w, self.stride, self.pad)
        return dx
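For example, 2x2 max pooling with stride 2 halves the height and width (again with made-up input):
pool = Pooling(pool_h=2, pool_w=2, stride=2)
print(pool.forward(np.random.rand(10, 30, 24, 24)).shape)  # (10, 30, 12, 12)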
4. Fully Connected Layer (Affine)
The output stage of the network is built from fully connected (Affine) layers: they flatten the pooled feature maps and map them to the ten class scores.
Definition of the affine layer:
class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.original_x_shape = None
        self.dW = None
        self.db = None

    def forward(self, x):
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x
        out = np.dot(self.x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        dx = dx.reshape(*self.original_x_shape)
        return dx
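A quick shape check: an input of shape (N, 30, 12, 12) is flattened to (N, 4320) and multiplied with a (4320, 100) weight matrix (sizes made up to match the layers above):
affine = Affine(0.01 * np.random.randn(30 * 12 * 12, 100), np.zeros(100))
print(affine.forward(np.random.rand(10, 30, 12, 12)).shape)  # (10, 100)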
5. Activation Functions
Commonly used activation functions in deep learning include sigmoid, ReLU, tanh and softmax. Since MNIST is a multi-class image classification problem, ReLU and softmax are the ones used here.
ReLU maps every input less than or equal to 0 to 0 and passes positive inputs through unchanged, which makes it very cheap to compute. It is defined as follows:
class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx
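A tiny example of the forward pass:
relu = Relu()
print(relu.forward(np.array([[-1.0, 0.5], [2.0, -3.0]])))
# [[0.  0.5]
#  [2.  0. ]]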
Softmax is used for multi-class outputs: it turns the raw scores into probabilities that sum to 1 across the classes. It is defined as follows:
def softmax(x):
    if x.ndim == 2:
        x = x.T
        x = x - np.max(x, axis=0)   # subtract the max for numerical stability
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T
    x = x - np.max(x)
    return np.exp(x) / np.sum(np.exp(x))
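A quick check that each row of the output sums to 1:
scores = np.array([[0.3, 2.9, 4.0]])
print(softmax(scores))               # approximately [[0.018 0.245 0.737]]
print(softmax(scores).sum())         # 1.0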
6. Optimizer
Commonly used optimizers in deep learning include SGD (stochastic gradient descent), Momentum, AdaGrad and Adam. Adam converges quickly and reliably, so it is the optimizer used here.
Definition of the Adam optimizer:
class Adam:
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.m = None   # first-moment estimates
        self.v = None   # second-moment estimates

    def update(self, params, grads):
        if self.m is None:
            self.m, self.v = {}, {}
            for key, val in params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)
        self.iter += 1
        # learning rate with the bias correction for m and v folded in
        lr_t = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter)
        for key in params.keys():
            self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
            self.v[key] += (1 - self.beta2) * (grads[key] ** 2 - self.v[key])
            params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)
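A minimal usage sketch on a toy parameter dictionary (the parameters are updated in place, which is what SimpleConvNet below relies on):
params = {'W': np.array([1.0, -2.0]), 'b': np.array([0.5])}
grads = {'W': np.array([0.1, -0.3]), 'b': np.array([0.05])}
optimizer = Adam(lr=0.001)
optimizer.update(params, grads)
print(params['W'], params['b'])      # values shifted slightly against the gradients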
7. Loss Function
MNIST is a multi-class classification task, so cross-entropy is used as the loss; it is differentiable, so the model can be optimized by gradient descent.
Definition of the cross-entropy loss:
def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    # if the labels are one-hot vectors, convert them to class indices
    if t.size == y.size:
        t = t.argmax(axis=1)
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
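For example, for a single prediction where the network assigns probability 0.8 to the true class, the loss is -log(0.8):
y = np.array([0.1, 0.1, 0.8])          # softmax output
t = np.array([0, 0, 1])                # one-hot label, true class is 2
print(cross_entropy_error(y, t))       # about 0.223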
The softmax output and the cross-entropy loss are combined into one layer, whose backward pass takes the simple form (y - t) / batch_size:
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None   # softmax output
        self.t = None   # labels

    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:   # one-hot labels
            dx = (self.y - self.t) / batch_size
        else:                            # labels given as class indices
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        return dx
Finally, assemble the whole network as SimpleConvNet:
SimpleConvNet provides predict (inference), loss (loss computation), accuracy (accuracy evaluation), gradient (gradient computation by backpropagation), save_params (saving the weights) and load_params (loading the weights). It needs OrderedDict and pickle in addition to NumPy.
from collections import OrderedDict
import pickle

class SimpleConvNet:
    """Conv - ReLU - Pool - Affine - ReLU - Affine - SoftmaxWithLoss"""
    def __init__(self, input_dim=(1, 28, 28),
                 conv_param={'filter_num': 30, 'filter_size': 5, 'pad': 0, 'stride': 1},
                 hidden_size=100, output_size=10, weight_init_std=0.01):
        filter_num = conv_param['filter_num']
        filter_size = conv_param['filter_size']
        filter_pad = conv_param['pad']
        filter_stride = conv_param['stride']
        input_size = input_dim[1]
        conv_output_size = (input_size - filter_size + 2*filter_pad) / filter_stride + 1
        pool_output_size = int(filter_num * (conv_output_size/2) * (conv_output_size/2))

        # initialize the weights
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(filter_num,
                            input_dim[0], filter_size, filter_size)
        self.params['b1'] = np.zeros(filter_num)
        self.params['W2'] = weight_init_std * np.random.randn(pool_output_size, hidden_size)
        self.params['b2'] = np.zeros(hidden_size)
        self.params['W3'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b3'] = np.zeros(output_size)

        # assemble the layers in order
        self.layers = OrderedDict()
        self.layers['Conv1'] = Convolution(self.params['W1'], self.params['b1'],
                                           conv_param['stride'], conv_param['pad'])
        self.layers['Relu1'] = Relu()
        self.layers['Pool1'] = Pooling(pool_h=2, pool_w=2, stride=2)
        self.layers['Affine1'] = Affine(self.params['W2'], self.params['b2'])
        self.layers['Relu2'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W3'], self.params['b3'])
        self.last_layer = SoftmaxWithLoss()

    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    def loss(self, x, t):
        y = self.predict(x)
        return self.last_layer.forward(y, t)

    def accuracy(self, x, t, batch_size=100):
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        acc = 0.0
        for i in range(int(x.shape[0] / batch_size)):
            tx = x[i*batch_size:(i+1)*batch_size]
            tt = t[i*batch_size:(i+1)*batch_size]
            y = self.predict(tx)
            y = np.argmax(y, axis=1)
            acc += np.sum(y == tt)
        return acc / x.shape[0]

    def gradient(self, x, t):
        # forward pass
        self.loss(x, t)
        # backward pass
        dout = 1
        dout = self.last_layer.backward(dout)
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)
        # collect the gradients of each parameterized layer
        grads = {}
        grads['W1'], grads['b1'] = self.layers['Conv1'].dW, self.layers['Conv1'].db
        grads['W2'], grads['b2'] = self.layers['Affine1'].dW, self.layers['Affine1'].db
        grads['W3'], grads['b3'] = self.layers['Affine2'].dW, self.layers['Affine2'].db
        return grads

    def save_params(self, file_name='params.pkl'):
        params = {}
        for key, val in self.params.items():
            params[key] = val
        with open(file_name, 'wb') as f:
            pickle.dump(params, f)

    def load_params(self, file_name='params.pkl'):
        with open(file_name, 'rb') as f:
            params = pickle.load(f)
        for key, val in params.items():
            self.params[key] = val
        for i, key in enumerate(['Conv1', 'Affine1', 'Affine2']):
            self.layers[key].W = self.params['W' + str(i + 1)]
            self.layers[key].b = self.params['b' + str(i + 1)]
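A minimal training-loop sketch that ties the pieces together; the batch size, learning rate and iteration count below are illustrative choices, not tuned values.
# Minimal training sketch: mini-batch training driven by the Adam optimizer.
# Batch size, learning rate and iteration count are illustrative choices.
network = SimpleConvNet()
optimizer = Adam(lr=0.001)
batch_size = 100

for i in range(10000):
    batch_mask = np.random.choice(x_train.shape[0], batch_size)
    x_batch = x_train[batch_mask]
    t_batch = y_train[batch_mask]
    grads = network.gradient(x_batch, t_batch)   # backpropagation
    optimizer.update(network.params, grads)      # Adam step, updates params in place
    if i % 1000 == 0:
        print(i, network.loss(x_batch, t_batch))

print('test accuracy:', network.accuracy(x_test, y_test))
network.save_params('params.pkl')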
With this convolutional neural network, the best test accuracy obtained is about 99%.
Accuracy could be pushed further with Dropout (which makes larger architectures practical), data augmentation, or pretrained convolutional networks.
Overall, convolutional neural networks have a clear advantage in image recognition, but they also bring challenges, such as how to tune the hyperparameters sensibly. Searching from scratch is computationally expensive and time-consuming, although past experience can suggest reasonable values to try. Deep learning still has plenty of room to grow; keep learning and trying new methods, just as Geoffrey Hinton did when he developed Dropout, reportedly inspired in part by banks' fraud-prevention practices.