Handwritten Digit Recognition with a Convolutional Neural Network in Python

I. Introduction

This article builds a CNN (convolutional neural network) in Python and uses it to recognize MNIST handwritten digit images. The digits range from 0 to 9, and the dataset is split into 60,000 training images and 10,000 test images.

II. Building the Convolutional Neural Network

The biggest difference between a convolutional neural network and an ordinary neural network is that it is built mainly from convolution and pooling layers, whereas an ordinary network consists mainly of fully connected layers. Convolution and pooling layers not only learn the spatial structure of the data, they also reduce the number of model parameters, which makes learning more efficient. The network here is built in seven parts: data preprocessing, the convolution layer, the pooling layer, the fully connected (affine) layer, the activation functions, the optimizer, and the loss function.

1. Data Preprocessing

The MNIST data is stored as compressed files, so it has to be decompressed and the images have to be reshaped. First, import the required modules and define the data files and shape constants:

import numpy as np
import gzip
# The next two imports are used later by SimpleConvNet
# (ordered layer stacking and saving/loading of the weights).
from collections import OrderedDict
import pickle

key_file = {
    'train_img': 'mnist/train-images-idx3-ubyte.gz',
    'train_label': 'mnist/train-labels-idx1-ubyte.gz',
    'test_img': 'mnist/t10k-images-idx3-ubyte.gz',
    'test_label': 'mnist/t10k-labels-idx1-ubyte.gz'
}

train_num = 60000
test_num = 10000
img_dim = (1, 28, 28)
img_size = 784

Next, write the function that decompresses and loads the data:

def load_mnist(normalize=True, flatten=True, one_hot_label=False):
    dataset = {}
    for key, file in key_file.items():
        if 'label' not in key:
            # Image files: 16-byte header, then one unsigned byte per pixel
            with gzip.open(file, 'rb') as f:
                data = np.frombuffer(f.read(), np.uint8, offset=16)
                data = data.reshape(-1, img_size)
            dataset[key] = data
        else:
            # Label files: 8-byte header, then one unsigned byte per label
            with gzip.open(file, 'rb') as f:
                labels = np.frombuffer(f.read(), np.uint8, offset=8)
            dataset[key] = labels

    if normalize:
        # Scale the pixel values from 0-255 to 0.0-1.0
        for key in ('train_img', 'test_img'):
            dataset[key] = dataset[key].astype(np.float32) / 255.0

    if not flatten:
        for key in ('train_img', 'test_img'):
            dataset[key] = dataset[key].reshape(-1, 1, 28, 28)

    # one_hot_label is accepted but not used here; labels are returned as
    # integer class indices, which the loss function defined later also handles
    return (dataset['train_img'], dataset['train_label']), (dataset['test_img'], dataset['test_label'])

The flatten parameter controls whether each image is flattened into a one-dimensional array of 784 values or kept as a 1×28×28 array, and normalize scales the pixel values from 0-255 to the 0.0-1.0 range.

Split the data into a training set and a test set:

(x_train, y_train), (x_test, y_test) = load_mnist(flatten=False)
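
As a quick sanity check, the shapes of the returned arrays can be printed. With flatten=False every image keeps its 1×28×28 layout:

print(x_train.shape, y_train.shape)   # (60000, 1, 28, 28) (60000,)
print(x_test.shape, y_test.shape)     # (10000, 1, 28, 28) (10000,)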

2. Convolution Layer

Because of the shape of the data, a naive convolution would require several nested for loops and would be far too slow. The im2col function unrolls the input so that it lines up with the filters, turning the multi-dimensional data into a 2-D matrix; the convolution then becomes a single matrix multiplication, which is much more efficient.

Implementation of im2col:

def im2col(input_data, filter_h, filter_w, stride=1, pad=0):
    N, C, H, W = input_data.shape
    out_h = (H + 2*pad - filter_h) // stride + 1
    out_w = (W + 2*pad - filter_w) // stride + 1

    # Pad the spatial dimensions only
    img = np.pad(input_data, [(0, 0), (0, 0), (pad, pad), (pad, pad)], 'constant')
    col = np.zeros((N, C, filter_h, filter_w, out_h, out_w))

    # Gather every filter position into a 6-D buffer using strided slicing
    for y in range(filter_h):
        y_max = y + stride*out_h
        for x in range(filter_w):
            x_max = x + stride*out_w
            col[:, :, y, x, :, :] = img[:, :, y:y_max:stride, x:x_max:stride]

    # Rearrange into a 2-D matrix: one row per output position, one column per filter weight
    col = col.transpose(0, 4, 5, 1, 2, 3).reshape(N*out_h*out_w, -1)
    return col
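
As a quick illustration (the random input below is only a placeholder), applying im2col to a batch of ten MNIST-sized images with a 5×5 filter produces one row per output position and one column per filter weight:

x = np.random.rand(10, 1, 28, 28)          # batch of 10 single-channel 28x28 images
col = im2col(x, 5, 5, stride=1, pad=0)
print(col.shape)                            # (5760, 25): 10*24*24 positions, 1*5*5 weights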

During the backward pass of the convolution layer, the data expanded by im2col is not convenient to work with, so the inverse transformation is also needed.

Definition of col2im (the inverse of im2col):

def col2im(col, input_shape, filter_h, filter_w, stride=1, pad=0):
    N, C, H, W = input_shape
    out_h = (H + 2*pad - filter_h) // stride + 1
    out_w = (W + 2*pad - filter_w) // stride + 1
    col = col.reshape(N, out_h, out_w, C, filter_h, filter_w).transpose(0, 3, 4, 5, 1, 2)

    img = np.zeros((N, C, H + 2*pad + stride - 1, W + 2*pad + stride - 1))
    # Scatter the columns back into image positions; overlapping positions are accumulated
    for y in range(filter_h):
        y_max = y + stride*out_h
        for x in range(filter_w):
            x_max = x + stride*out_w
            img[:, :, y:y_max:stride, x:x_max:stride] += col[:, :, y, x, :, :]

    # Strip the padding before returning
    return img[:, :, pad:H + pad, pad:W + pad]
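
A small check of the pair (added here for illustration only): with non-overlapping windows, for example a 2×2 filter and stride 2, every pixel falls into exactly one patch, so col2im reconstructs the input exactly; with overlapping windows the overlapping contributions are summed instead.

x = np.arange(16, dtype=float).reshape(1, 1, 4, 4)   # one 4x4 single-channel image
col = im2col(x, 2, 2, stride=2)                       # (4, 4): 4 windows of 4 values each
back = col2im(col, x.shape, 2, 2, stride=2)
print(np.allclose(back, x))                           # True for non-overlapping windows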

The convolution layer itself needs a forward pass (forward) and a backward pass (backward):

class Convolution:
    def __init__(self, W, b, stride=1, pad=0):
        self.W = W                  # filters, shape (FN, C, FH, FW)
        self.b = b                  # biases, shape (FN,)
        self.stride = stride
        self.pad = pad
        # intermediate values kept for the backward pass
        self.x = None
        self.col = None
        self.col_W = None
        # gradients
        self.dW = None
        self.db = None

    def forward(self, x):
        FN, C, FH, FW = self.W.shape
        N, C, H, W = x.shape
        out_h = 1 + int((H + 2*self.pad - FH) / self.stride)
        out_w = 1 + int((W + 2*self.pad - FW) / self.stride)

        # Expand the input into rows and the filters into columns,
        # so the convolution becomes a single matrix product
        col = im2col(x, FH, FW, self.stride, self.pad)
        col_W = self.W.reshape(FN, -1).T
        out = np.dot(col, col_W) + self.b
        out = out.reshape(N, out_h, out_w, -1).transpose(0, 3, 1, 2)

        self.x = x
        self.col = col
        self.col_W = col_W
        return out

    def backward(self, dout):
        FN, C, FH, FW = self.W.shape
        dout = dout.transpose(0, 2, 3, 1).reshape(-1, FN)

        self.db = np.sum(dout, axis=0)
        self.dW = np.dot(self.col.T, dout)
        self.dW = self.dW.transpose(1, 0).reshape(FN, C, FH, FW)

        # Gradient with respect to the input, folded back to image shape
        dcol = np.dot(dout, self.col_W.T)
        dx = col2im(dcol, self.x.shape, FH, FW, self.stride, self.pad)
        return dx
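
A quick shape check of the layer (the random weights and input are placeholders, used only to show the forward pass): 30 filters of size 5×5 applied to 28×28 images give 24×24 feature maps.

W = np.random.randn(30, 1, 5, 5) * 0.01
b = np.zeros(30)
conv = Convolution(W, b, stride=1, pad=0)
out = conv.forward(np.random.rand(10, 1, 28, 28))
print(out.shape)                          # (10, 30, 24, 24)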

3. Pooling Layer

The pooling layer downsamples the output of the convolution layer, shrinking it along the height and width. There are two common kinds of pooling, max pooling and average pooling, and the stride is usually set equal to the size of the pooling window. Max pooling is used here, i.e. the maximum value in each pooling window is taken.

Pooling layer definition:

class Pooling:
    def __init__(self, pool_h, pool_w, stride=1, pad=0):
        self.pool_h = pool_h
        self.pool_w = pool_w
        self.stride = stride
        self.pad = pad
        self.x = None
        self.arg_max = None   # index of the max inside each window, needed for backward

    def forward(self, x):
        N, C, H, W = x.shape
        out_h = int(1 + (H - self.pool_h) / self.stride)
        out_w = int(1 + (W - self.pool_w) / self.stride)

        # Expand the windows with im2col, then take the max of each row
        col = im2col(x, self.pool_h, self.pool_w, self.stride, self.pad)
        col = col.reshape(-1, self.pool_h*self.pool_w)

        arg_max = np.argmax(col, axis=1)
        out = np.max(col, axis=1)
        out = out.reshape(N, out_h, out_w, C).transpose(0, 3, 1, 2)

        self.x = x
        self.arg_max = arg_max
        return out

    def backward(self, dout):
        dout = dout.transpose(0, 2, 3, 1)

        # Route the gradient only to the position that produced the max
        pool_size = self.pool_h * self.pool_w
        dmax = np.zeros((dout.size, pool_size))
        dmax[np.arange(self.arg_max.size), self.arg_max.flatten()] = dout.flatten()
        dmax = dmax.reshape(dout.shape + (pool_size,))

        dcol = dmax.reshape(dmax.shape[0] * dmax.shape[1] * dmax.shape[2], -1)
        dx = col2im(dcol, self.x.shape, self.pool_h, self.pool_w, self.stride, self.pad)
        return dx
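
Applied to the convolution output from the previous example, a 2×2 max pool with stride 2 halves the height and width (again just a shape illustration):

pool = Pooling(pool_h=2, pool_w=2, stride=2)
pooled = pool.forward(out)    # 'out' is the (10, 30, 24, 24) feature map from above
print(pooled.shape)           # (10, 30, 12, 12)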

4. Fully Connected Layer (Affine)

The final output stage of the network is built from fully connected (affine) layers, which map the extracted features to the ten class scores.

Affine layer definition:

class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.original_x_shape = None
        self.dW = None
        self.db = None

    def forward(self, x):
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x
        out = np.dot(self.x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        dx = dx.reshape(*self.original_x_shape)
        return dx

5. Activation Functions

Commonly used activation functions in deep learning include the sigmoid, ReLU, tanh and softmax functions. Since MNIST is a multi-class image classification task, ReLU and softmax are the ones used here.

ReLU outputs 0 for any input less than or equal to 0 and passes positive inputs through unchanged, which keeps both the forward and backward computations very cheap. It is defined as follows:

class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx
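
A one-line illustration of the behaviour described above (the input values are made up for demonstration):

relu = Relu()
print(relu.forward(np.array([-2.0, 0.0, 3.0])))   # [0. 0. 3.]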

The softmax function is used for the multi-class output; it turns the scores into values that sum to 1 across the classes. It is defined as follows:

def softmax(x):
    if x.ndim == 2:
        # Batched input: work column-wise after transposing
        x = x.T
        x = x - np.max(x, axis=0)   # subtract the max for numerical stability
        y = np.exp(x) / np.sum(np.exp(x), axis=0)
        return y.T

    x = x - np.max(x)
    return np.exp(x) / np.sum(np.exp(x))
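
For example (the scores are made up for illustration), each row of the softmax output is positive and sums to 1:

scores = np.array([[1.0, 2.0, 3.0], [0.0, 0.0, 0.0]])
probs = softmax(scores)
print(probs.sum(axis=1))   # [1. 1.]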

6. Optimizer

Commonly used optimizers in deep learning include SGD (stochastic gradient descent), Momentum, AdaGrad and Adam. Adam is used here because it converges quickly and reliably.

Adam optimizer definition:

class Adam:
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1   # decay rate of the first-moment estimate
        self.beta2 = beta2   # decay rate of the second-moment estimate
        self.iter = 0
        self.m = None
        self.v = None

    def update(self, params, grads):
        if self.m is None:
            self.m, self.v = {}, {}
            for key, val in params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)

        self.iter += 1
        # Bias-corrected learning rate
        lr_t = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter)

        for key in params.keys():
            self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
            self.v[key] += (1 - self.beta2) * (grads[key] ** 2 - self.v[key])
            params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)
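
A toy update loop (the quadratic objective and its gradient below are made up purely to show the calling convention: Adam only needs params and grads dictionaries with matching keys):

params = {'w': np.array([5.0])}
optimizer = Adam(lr=0.1)
for _ in range(200):
    grads = {'w': 2 * params['w']}   # gradient of the toy objective w**2
    optimizer.update(params, grads)
print(params['w'])                   # has moved from 5.0 to near 0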

7. Loss Function

MNIST is a multi-class classification task, so cross-entropy is used as the loss; combined with softmax it has a simple gradient, which makes it convenient for gradient-based optimization.

Cross-entropy loss definition:

def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)

    # If the labels are one-hot encoded, convert them to class indices
    if t.size == y.size:
        t = t.argmax(axis=1)

    batch_size = y.shape[0]
    # 1e-7 avoids log(0)
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size
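
For instance (the probabilities are invented for illustration), a confident correct prediction gives a small loss and an unconfident one a larger loss:

t = np.array([2])                                               # true class index
print(cross_entropy_error(np.array([[0.05, 0.05, 0.9]]), t))    # about 0.105
print(cross_entropy_error(np.array([[0.4, 0.3, 0.3]]), t))      # about 1.204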

The loss also needs a backward pass, so softmax and the cross-entropy loss are combined into a single layer:

class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None
        self.t = None

    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:
            dx = (self.y - self.t) / batch_size
        else:
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        return dx
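
A minimal check of the combined layer (scores and label made up for illustration): the backward pass returns the familiar "softmax output minus one-hot label" gradient divided by the batch size.

layer = SoftmaxWithLoss()
loss = layer.forward(np.array([[2.0, 1.0, 0.1]]), np.array([0]))
print(loss)               # about 0.417
print(layer.backward())   # approximately [[-0.341  0.242  0.099]]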

Finally, assemble the overall convolutional neural network model (SimpleConvNet):

SimpleConvNet provides predict (prediction), loss (loss computation), accuracy (accuracy computation), gradient (gradient computation), save_params (saving the weights) and load_params (loading the weights).

class SimpleConvNet:
    """conv - relu - pool - affine - relu - affine - softmax"""

    def __init__(self, input_dim=(1, 28, 28),
                 conv_param={'filter_num': 30, 'filter_size': 5, 'pad': 0, 'stride': 1},
                 hidden_size=100, output_size=10, weight_init_std=0.01):
        filter_num = conv_param['filter_num']
        filter_size = conv_param['filter_size']
        filter_pad = conv_param['pad']
        filter_stride = conv_param['stride']
        input_size = input_dim[1]
        conv_output_size = (input_size - filter_size + 2*filter_pad) / filter_stride + 1
        pool_output_size = int(filter_num * (conv_output_size/2) * (conv_output_size/2))

        # Initialise the weights with small Gaussian noise and the biases with zeros
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(filter_num,
                                                              input_dim[0], filter_size, filter_size)
        self.params['b1'] = np.zeros(filter_num)
        self.params['W2'] = weight_init_std * np.random.randn(pool_output_size, hidden_size)
        self.params['b2'] = np.zeros(hidden_size)
        self.params['W3'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b3'] = np.zeros(output_size)

        # Stack the layers in forward order
        self.layers = OrderedDict()
        self.layers['Conv1'] = Convolution(self.params['W1'], self.params['b1'],
                                           conv_param['stride'], conv_param['pad'])
        self.layers['Relu1'] = Relu()
        self.layers['Pool1'] = Pooling(pool_h=2, pool_w=2, stride=2)
        self.layers['Affine1'] = Affine(self.params['W2'], self.params['b2'])
        self.layers['Relu2'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W3'], self.params['b3'])
        self.last_layer = SoftmaxWithLoss()

    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    def loss(self, x, t):
        y = self.predict(x)
        return self.last_layer.forward(y, t)

    def accuracy(self, x, t, batch_size=100):
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        acc = 0.0
        for i in range(int(x.shape[0] / batch_size)):
            tx = x[i*batch_size:(i+1)*batch_size]
            tt = t[i*batch_size:(i+1)*batch_size]
            y = self.predict(tx)
            y = np.argmax(y, axis=1)
            acc += np.sum(y == tt)
        return acc / x.shape[0]

    def gradient(self, x, t):
        # Forward pass to populate the intermediate values
        self.loss(x, t)

        # Backward pass through the layers in reverse order
        dout = 1
        dout = self.last_layer.backward(dout)
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # Collect the gradients computed inside each layer
        grads = {}
        grads['W1'], grads['b1'] = self.layers['Conv1'].dW, self.layers['Conv1'].db
        grads['W2'], grads['b2'] = self.layers['Affine1'].dW, self.layers['Affine1'].db
        grads['W3'], grads['b3'] = self.layers['Affine2'].dW, self.layers['Affine2'].db
        return grads

    def save_params(self, file_name='params.pkl'):
        params = {}
        for key, val in self.params.items():
            params[key] = val
        with open(file_name, 'wb') as f:
            pickle.dump(params, f)

    def load_params(self, file_name='params.pkl'):
        with open(file_name, 'rb') as f:
            params = pickle.load(f)
        for key, val in params.items():
            self.params[key] = val
        # Re-bind the loaded weights to the layer objects
        for i, key in enumerate(['Conv1', 'Affine1', 'Affine2']):
            self.layers[key].W = self.params['W' + str(i + 1)]
            self.layers[key].b = self.params['b' + str(i + 1)]
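
The article does not list the training loop itself; the sketch below shows one way the pieces above could be combined. The batch size, learning rate and iteration count are illustrative assumptions, not necessarily the settings behind the reported result.

# Minimal training sketch (hyperparameters are assumptions)
network = SimpleConvNet()
optimizer = Adam(lr=0.001)

batch_size = 100
iterations = 6000                      # about 10 epochs over 60,000 training images

for i in range(iterations):
    # Draw a random mini-batch
    batch_mask = np.random.choice(x_train.shape[0], batch_size)
    x_batch = x_train[batch_mask]
    t_batch = y_train[batch_mask]

    # Backpropagation and parameter update (the params are updated in place)
    grads = network.gradient(x_batch, t_batch)
    optimizer.update(network.params, grads)

    if i % 600 == 0:
        print('iter', i, 'loss', network.loss(x_batch, t_batch))

print('test accuracy:', network.accuracy(x_test, y_test))
network.save_params('params.pkl')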

With this convolutional neural network, the best test accuracy obtained was 99%.

Accuracy could be pushed further with Dropout (which makes it practical to train a larger architecture), data augmentation, or a pretrained convolutional network.

Overall, convolutional neural networks have a clear advantage for image recognition, but they also bring challenges, for example how to tune the hyperparameters sensibly. Searching for them from scratch is computationally expensive and takes a lot of time, so it helps to draw on past experience and try values that have worked well before. Deep learning still has plenty of room to develop; keep learning and applying new methods, just as Geoffrey Hinton did when developing Dropout, one of whose inspirations came from a bank's anti-fraud mechanism.
