I wrote a small deep-learning package in pure NumPy, modeled after PyTorch, and used it to build a CNN that reaches 99.33% test accuracy on MNIST.
The package includes neural-network layers (convolution, linear), loss functions (MSE, Cross Entropy), and optimizers (Adam, RMSProp).
Full code is at EthanLifeGreat/NumPy_CNN on GitHub: convolutional neural network modules implemented in pure NumPy, with a network built from these modules achieving over 99% test accuracy on the MNIST dataset. (github.com)
First, let's look at how a network is written once the modules are in place:
class ConvNetwork(SequentialNeuralNetwork):
def __init__(self, output_size):
# NOTE: feel free to change structure and seed
        hidden_size = 1024  # width of the fully-connected layers; not defined elsewhere in this preview, change as you like
        sequential = list()
sequential.append(ConvolutionModule(1, 32, window_size=(5, 5), stride=(1, 1), padding=(2, 2)))
sequential.append(ReluModule())
sequential.append(MaxPoolModule(window_size=(2, 2), stride=(2, 2)))
sequential.append(DropoutModule(p=0.15))
sequential.append(ConvolutionModule(32, 64, window_size=(3, 3), stride=(1, 1), padding=(1, 1)))
sequential.append(ReluModule())
sequential.append(MaxPoolModule(window_size=(2, 2), stride=(2, 2)))
sequential.append(DropoutModule(p=0.15))
sequential.append(LinearModule(7 * 7 * 64, hidden_size))
sequential.append(ReluModule())
sequential.append(LinearModule(hidden_size, hidden_size))
sequential.append(ReluModule())
sequential.append(DropoutModule(p=0.25))
sequential.append(LinearModule(hidden_size, hidden_size))
sequential.append(ReluModule())
sequential.append(LinearModule(hidden_size, hidden_size))
sequential.append(ReluModule())
sequential.append(DropoutModule(p=0.25))
sequential.append(LinearModule(hidden_size, output_size))
loss_func = CrossEntropyLoss()
optimizer = Adam(lr=1e-3)
super().__init__(sequential, loss_func, optimizer)
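The SequentialNeuralNetwork base class is not shown in this article. As a rough sketch of what it has to do (an illustration only, not the repo's actual code; names and details here are assumptions), the training loop could look like this:

import copy

class SequentialNeuralNetwork:
    def __init__(self, sequential, loss_func, optimizer):
        self.sequential = sequential          # ordered list of modules
        self.loss_func = loss_func            # callable returning (loss, dLoss/dy_hat)
        # every weighted module needs its own optimizer state (m, v, t)
        self.optimizers = [copy.deepcopy(optimizer) if hasattr(m, 'w') else None
                           for m in sequential]

    def forward(self, x):
        for module in self.sequential:
            x = module.forward(x)
        return x

    def train_step(self, x, y):
        y_hat = self.forward(x)
        loss, dy = self.loss_func(y_hat, y)
        # backpropagate in reverse order: each module maps dL/dy to dL/dx
        for module in reversed(self.sequential):
            dy = module.backward(dy)
        # gradient step on every module that holds weights
        for module, opt in zip(self.sequential, self.optimizers):
            if opt is not None:
                module.w -= opt.step(module.dw)
        return loss

At inference time, modules such as Dropout would be called through their predict method instead of forward, so no extra rescaling is needed.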
Here are the definitions of some of the modules used in the class above:
The code below is a preview only; it is incomplete and will not run as-is. For the full, runnable version (and a usage tutorial), see the GitHub repository linked at the top of this article.
import numpy as np
from numpy import einsum, matmul


class Adam:
def __init__(self, lr=1e-3, beta1=0.9, beta2=0.999, eps=1e-8):
self.alpha = lr
self.beta1 = beta1
self.beta2 = beta2
self.eps = eps
self.m = 0
self.v = 0
self.t = 0
def step(self, dw):
self.t += 1
g = dw
m = (self.beta1 * self.m + (1 - self.beta1) * g)
v = (self.beta2 * self.v + (1 - self.beta2) * g ** 2)
alpha = self.alpha * np.sqrt(1 - self.beta2 ** self.t) / (1 - self.beta1 ** self.t)
dw = alpha * m / (np.sqrt(v) + self.eps)
self.m = m
self.v = v
return dw
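Note that the bias correction is folded into the step size (alpha * sqrt(1 - beta2^t) / (1 - beta1^t)), which is the efficient one-pass form from the Adam paper rather than computing m_hat and v_hat explicitly. A quick toy check of the step() interface, assuming the caller updates weights as w -= optimizer.step(grad):

import numpy as np

# Toy sanity check (not from the repo): minimise f(w) = (w - 3)^2 with the Adam class above.
opt = Adam(lr=1e-1)
w = np.array([0.0])
for _ in range(200):
    grad = 2 * (w - 3.0)        # df/dw
    w -= opt.step(grad)         # step() returns the update to subtract
print(w)                        # close to 3.0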
class ReluModule(NeuralNetworkModule):
    # ReLU layer: y = max(x, 0)
def __init__(self):
self.x = None
def forward(self, x):
self.x = x
y = x.copy()
y[x < 0] = 0
return y
def backward(self, dy):
d = np.ones_like(self.x)
d[self.x < 0] = 0
return d * dy
class DropoutModule(NeuralNetworkModule):
def __init__(self, p):
self.keep_prob = 1-p
self.mask = None
def forward(self, x):
self.mask = np.random.binomial(1, self.keep_prob, size=x.shape)
y = x * self.mask / self.keep_prob
return y
def backward(self, dy):
        dx = dy * self.mask / self.keep_prob  # match the forward scaling (inverted dropout)
return dx
def predict(self, x):
return x
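Dividing by keep_prob during training (inverted dropout) keeps the expected activation equal to the input, which is why predict can return x unchanged at test time. A quick illustration of that scaling (not part of the package):

import numpy as np

np.random.seed(0)
p = 0.15
x = np.ones((100000, 1))
mask = np.random.binomial(1, 1 - p, size=x.shape)
y = x * mask / (1 - p)
print(y.mean())   # approximately 1.0, i.e. E[y] == x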
class LinearModule(OptimizableModule):
    # Linear layer: Y = [X, 1] @ W (bias folded into the last row of W)
def __init__(self, input_size, output_size, random=True):
super().__init__()
if random:
std_v = 1. / np.sqrt(input_size)
self.w = (np.random.uniform(-std_v, std_v, [input_size + 1, output_size]))
else:
self.w = np.zeros([input_size + 1, output_size])
self.dw = None
self.x_1 = None
self.input_shape = None
self.input_size = input_size
def forward(self, x):
num_samples = x.shape[0]
self.input_shape = x.shape
if len(x.shape) > 2:
# Squeezing
x = x.reshape(num_samples, self.input_size)
self.x_1 = np.concatenate([x, np.ones([num_samples, 1])], axis=1)
y = matmul(self.x_1, self.w)
return y
def backward(self, dy):
self.dw = einsum('ij,ih->hj', dy, self.x_1) / dy.shape[0]
dx = matmul(dy, np.transpose(self.w))[:, :-1]
dx = dx.reshape(self.input_shape)
return dx
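A finite-difference gradient check is a convenient way to validate backward implementations like this one. The sketch below assumes the full package (including OptimizableModule) is importable from the repo; it compares the analytic dx with a numerical estimate of d(sum(y * dy))/dx:

import numpy as np

# Gradient check for LinearModule.backward (illustrative; requires the full package).
np.random.seed(0)
layer = LinearModule(input_size=4, output_size=3)
x = np.random.randn(2, 4)
dy = np.random.randn(2, 3)

layer.forward(x)
dx = layer.backward(dy)                   # analytic gradient w.r.t. x

eps, num_dx = 1e-6, np.zeros_like(x)
for i in range(x.shape[0]):
    for j in range(x.shape[1]):
        x_pos, x_neg = x.copy(), x.copy()
        x_pos[i, j] += eps
        x_neg[i, j] -= eps
        f_pos = np.sum(layer.forward(x_pos) * dy)
        f_neg = np.sum(layer.forward(x_neg) * dy)
        num_dx[i, j] = (f_pos - f_neg) / (2 * eps)

print(np.max(np.abs(dx - num_dx)))        # should be on the order of 1e-9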
class ConvolutionModule(OptimizableModule):
# !!!!! should satisfy: 2p + x - w == (y - 1) * s !!!!!
def __init__(self, num_channels_in, num_channels_out, window_size, padding, stride=(1, 1), random=True):
# weight_size: [height, width]
super().__init__()
# unrolled weight
self.window_size = window_size
self.stride = stride
self.padding = padding
self.y_width = 0
self.y_height = 0
self.zero_image_shape = None
self.weight_size = (window_size[0] * window_size[1] * (num_channels_in + 1), num_channels_out)
self.num_channels_in = num_channels_in
self.num_channels_out = num_channels_out
if random:
std_v = 1. / np.sqrt(np.prod(self.weight_size) * num_channels_in)
self.w = np.random.uniform(-std_v, std_v, self.weight_size)
else:
self.w = np.zeros(self.weight_size)
self.dw = np.zeros_like(self.w)
self.x1p = None
def batch_unroll(self, x, weight_size, stride):
# x is a 4d tensor([batch_size, height, width, channels])
unrolled_x, y_height, y_width = batch_image_unroll(x, weight_size, stride)
self.zero_image_shape = x.shape
self.y_width, self.y_height = y_width, y_height
return unrolled_x
def batch_roll(self, unrolled_x, weight_size, stride):
# unrolled_x is a 3d tensor([batch_size, y_height * y_width, w_height * w_width * channels])
y_width, y_height = self.y_width, self.y_height
num_channels = self.num_channels_in + 1
x = batch_image_roll(unrolled_x, self.zero_image_shape, weight_size, stride, y_height, y_width, num_channels)
return x
@staticmethod
def batch_matmul(x, w):
return einsum('ijk,kh->ijh', x, w)
def batch_convolve(self, x, w):
# x is bias-included, batched, unrolled input
# w is unrolled kernel
y = self.batch_matmul(x, w)
return y
def forward(self, x):
# x_size = [batch_size, height, width, num_channels_in]
assert len(x.shape) == 4
num_samples, x_height, x_width = x.shape[0], x.shape[1], x.shape[2]
x_1 = np.pad(x, ((0, 0), (0, 0), (0, 0), (0, 1)), 'constant', constant_values=1)
x_1_p = np.pad(x_1, ((0, 0), (self.padding[0], self.padding[0]),
(self.padding[1], self.padding[1]), (0, 0)), 'constant', constant_values=0)
unrolled_x1p = self.batch_unroll(x_1_p, self.window_size, self.stride)
self.x1p = unrolled_x1p
unrolled_y = self.batch_convolve(self.x1p, self.w)
y = unrolled_y.reshape([num_samples, self.y_height, self.y_width, self.num_channels_out])
return y
def backward(self, dy):
num_samples, y_height, y_width = dy.shape[0], dy.shape[1], dy.shape[2]
dy_unrolled = dy.reshape([num_samples, y_height * y_width, self.num_channels_out])
dw = einsum('ijk,ikh->jh', np.transpose(self.x1p, axes=(0, 2, 1)), dy_unrolled)
self.dw = dw / num_samples
dx1p_unrolled = self.batch_matmul(dy_unrolled, np.transpose(self.w))
dx1p = self.batch_roll(dx1p_unrolled, self.window_size, self.stride)
        # strip off the zero padding (this slicing assumes padding > 0 in both dimensions)
        dx1 = dx1p[:, self.padding[0]:-self.padding[0], self.padding[1]:-self.padding[1], :]
return dx1[:, :, :, :-1]
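The constraint in the comment at the top of the class, 2p + x - w == (y - 1) * s, is just the usual output-size formula y = (x + 2p - w) / s + 1 rearranged. For the two convolution layers used in ConvNetwork on 28x28 MNIST images it works out as follows, which is also where the 7 * 7 * 64 input size of the first LinearModule comes from:

def conv_output_size(x, w, p, s):
    # y = (x + 2p - w) / s + 1; must divide evenly for this implementation
    return (x + 2 * p - w) // s + 1

print(conv_output_size(28, 5, 2, 1))   # 28: 5x5 window, padding 2, stride 1, then 2x2 pool -> 14
print(conv_output_size(14, 3, 1, 1))   # 14: 3x3 window, padding 1, stride 1, then 2x2 pool -> 7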
class MaxPoolModule(NeuralNetworkModule):
def __init__(self, window_size, stride):
self.window_size = window_size
self.stride = stride
self.mask = None
self.y_height = 0
self.y_width = 0
self.output_size = 0
self.zero_image_shape = None
def forward(self, x):
num_samples, num_channels = x.shape[0], x.shape[3]
unrolled_x, y_height, y_width = batch_image_unroll(x, self.window_size, self.stride)
self.zero_image_shape = x.shape
self.output_size = num_samples * y_height * y_width * num_channels
unrolled_x = unrolled_x.reshape([num_samples, y_height * y_width, np.prod(self.window_size), num_channels])
unrolled_x_ = np.transpose(unrolled_x, axes=(0, 1, 3, 2)). \
reshape(self.output_size, np.prod(self.window_size))
y = np.max(unrolled_x, axis=2).reshape([num_samples, y_height, y_width, num_channels])
self.mask = np.argmax(unrolled_x_, axis=1)
self.y_height, self.y_width = y_height, y_width
return y
def backward(self, dy):
# dy is a batched gradients with size [num_samples, y_height, y_width, num_channels]
num_samples, y_height, y_width, num_channels = dy.shape[0], dy.shape[1], dy.shape[2], dy.shape[3]
dy_unrolled = dy.reshape([num_samples, y_height * y_width, 1, num_channels])
dx_unrolled_ = np.zeros([self.output_size, np.prod(self.window_size)])
dx_unrolled_[np.arange(self.output_size), self.mask] = dy_unrolled.ravel()
dx_unrolled = np.transpose(dx_unrolled_.reshape(
[num_samples, y_height * y_width, num_channels, np.prod(self.window_size)]), axes=(0, 1, 3, 2))
dx_unrolled = dx_unrolled.reshape([num_samples, y_height * y_width, np.prod(self.window_size) * num_channels])
dx = batch_image_roll(dx_unrolled, self.zero_image_shape,
self.window_size, self.stride, y_height, y_width, num_channels)
return dx
class CrossEntropyLoss:
    def __call__(self, y_hat, y):
        # y_hat: raw logits; y: one-hot labels
        y_prob = softmax(y_hat)
        # return the loss (summed over the batch) and its derivative w.r.t. the logits
        return np.sum(-np.log(y_prob) * y), y_prob - y
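The softmax helper is not included in this preview. For a one-hot label y, the gradient of the softmax cross-entropy loss with respect to the logits is exactly y_prob - y, which is the second value returned above. A sketch of a numerically stable, row-wise softmax (the repo's own helper may differ):

import numpy as np

def softmax(y_hat):
    # subtract the row-wise max before exponentiating to avoid overflow
    shifted = y_hat - np.max(y_hat, axis=1, keepdims=True)
    e = np.exp(shifted)
    return e / np.sum(e, axis=1, keepdims=True)

# tiny check of the gradient formula for a one-hot label
logits = np.array([[2.0, 1.0, 0.1]])
y = np.array([[1.0, 0.0, 0.0]])
loss = np.sum(-np.log(softmax(logits)) * y)
grad = softmax(logits) - y
print(loss, grad)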