ConvolutionalNetworks.ipynb
Implement conv_forward_naive in cs231n/layers.py. This is a naive implementation, so you don't have to worry much about efficiency.
def conv_forward_naive(x, w, b, conv_param):
"""
A naive implementation of the forward pass for a convolutional layer.
The input consists of N data points, each with C channels, height H and
width W. We convolve each input with F different filters, where each filter
spans all C channels and has height HH and width WW.
Input:
- x: Input data of shape (N, C, H, W)
- w: Filter weights of shape (F, C, HH, WW)
- b: Biases, of shape (F,)
- conv_param: A dictionary with the following keys:
- 'stride': The number of pixels between adjacent receptive fields in the
horizontal and vertical directions.
- 'pad': The number of pixels that will be used to zero-pad the input.
      During padding, 'pad' zeros should be placed symmetrically (i.e., equally on both
      sides) along the height and width axes of the input. Be careful not to modify
      the original input x directly.
Returns a tuple of:
- out: Output data, of shape (N, F, H', W') where H' and W' are given by
H' = 1 + (H + 2 * pad - HH) / stride
W' = 1 + (W + 2 * pad - WW) / stride
- cache: (x, w, b, conv_param)
"""
out = None
# Hint: you can use the function np.pad for padding. #
stride,pad = conv_param["stride"],conv_param["pad"]
N, C, H, W = x.shape
F, C, HH, WW = w.shape
x_pad = np.pad(x,((0,0),(0,0),(pad,pad),(pad,pad)),
mode = 'constant',constant_values = 0)
# out of shape (N,F,out_H,out_W)
out_H = 1 + (H + 2 * pad - HH) // stride
out_W = 1 + (W + 2 * pad - WW) // stride
out = np.zeros((N,F,out_H,out_W))
for i in range(out_H):
for j in range(out_W):
            # the receptive-field window that produces output position (i, j)
            x_pad_masked = x_pad[:, :, i*stride:i*stride+HH, j*stride:j*stride+WW]
            # compute each output channel separately
            for f in range(F):
                # broadcasting applies this to all N inputs at once
                # https://cs231n.github.io/python-numpy-tutorial/
                out[:, f, i, j] = np.sum(x_pad_masked * w[f, :, :, :], axis=(1, 2, 3))
    # add the bias to each output channel; b is broadcast as (1, F, 1, 1)
    # out = out + (b)[None, :, None, None]
    # broadcasting rule: when ranks differ, 1s are prepended to the smaller
    # shape until both shapes have the same length
    out = out + b.reshape((1, F, 1, 1))
cache = (x, w, b, conv_param)
return out, cache
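Before moving on, a quick shape sanity check in the spirit of the notebook's test cell (a sketch; the notebook also compares against hard-coded correct values, omitted here):
x_shape = (2, 3, 4, 4)
w_shape = (3, 3, 4, 4)
x = np.linspace(-0.1, 0.5, num=np.prod(x_shape)).reshape(x_shape)
w = np.linspace(-0.2, 0.3, num=np.prod(w_shape)).reshape(w_shape)
b = np.linspace(-0.1, 0.2, num=3)
conv_param = {'stride': 2, 'pad': 1}
out, _ = conv_forward_naive(x, w, b, conv_param)
print(out.shape)  # (2, 3, 2, 2): H' = 1 + (4 + 2*1 - 4) // 2 = 2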
To check the implementation and get a better sense of the kinds of operations a convolutional layer can perform, we set up an input containing two images and hand-pick filters that implement common image-processing operations (grayscale conversion and edge detection). The convolution forward pass applies each of these filters to each input image; we then visualize the results as a sanity check.
# Note: scipy.misc.imread and imresize were removed in newer SciPy releases;
# recent versions of the assignment use imageio / PIL for the same purpose.
from scipy.misc import imread, imresize
kitten, puppy = imread('kitten.jpg'), imread('puppy.jpg')
# kitten is wide, and puppy is already square
d = kitten.shape[1] - kitten.shape[0]
kitten_cropped = kitten[:, d//2:-d//2, :]
img_size = 200 # Make this smaller if it runs too slow
x = np.zeros((2, 3, img_size, img_size))
x[0, :, :, :] = imresize(puppy, (img_size, img_size)).transpose((2, 0, 1))
x[1, :, :, :] = imresize(kitten_cropped, (img_size, img_size)).transpose((2, 0, 1))
# Set up convolutional weights holding 2 filters, each 3x3
w = np.zeros((2, 3, 3, 3))
# The first filter converts the image to grayscale.
# Set up the red, green, and blue channels of the filter.
w[0, 0, :, :] = [[0, 0, 0], [0, 0.3, 0], [0, 0, 0]]
w[0, 1, :, :] = [[0, 0, 0], [0, 0.6, 0], [0, 0, 0]]
w[0, 2, :, :] = [[0, 0, 0], [0, 0.1, 0], [0, 0, 0]]
# Second filter detects horizontal edges in the blue channel.
w[1, 2, :, :] = [[1, 2, 1], [0, 0, 0], [-1, -2, -1]]
# Vector of biases. We don't need any bias for the grayscale
# filter, but for the edge detection filter we want to add 128
# to each output so that nothing is negative.
b = np.array([0, 128])
# Compute the result of convolving each input in x with each filter in w,
# offsetting by b, and storing the results in out.
out, _ = conv_forward_naive(x, w, b, {'stride': 1, 'pad': 1})
def imshow_noax(img, normalize=True):
""" Tiny helper to show images as uint8 and remove axis labels """
if normalize:
img_max, img_min = np.max(img), np.min(img)
img = 255.0 * (img - img_min) / (img_max - img_min)
plt.imshow(img.astype('uint8'))
plt.gca().axis('off')
# Show the original images and the results of the conv operation
plt.subplot(2, 3, 1)
imshow_noax(puppy, normalize=False)
plt.title('Original image')
plt.subplot(2, 3, 2)
imshow_noax(out[0, 0])
plt.title('Grayscale')
plt.subplot(2, 3, 3)
imshow_noax(out[0, 1])
plt.title('Edges')
plt.subplot(2, 3, 4)
imshow_noax(kitten_cropped, normalize=False)
plt.subplot(2, 3, 5)
imshow_noax(out[1, 0])
plt.subplot(2, 3, 6)
imshow_noax(out[1, 1])
plt.show()
As the visualization shows, the first filter performs a grayscale-like conversion: each output pixel is 0.3*R + 0.6*G + 0.1*B. The second filter detects horizontal edges in the blue channel.
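Since the grayscale filter has a single nonzero tap at its center, and stride 1 with pad 1 preserves the spatial size, the first output channel should equal the per-pixel weighted channel sum exactly. A quick check of that claim (not part of the original notebook):
# filter 0 only looks at the center pixel, so convolving with it is the
# same as a per-pixel weighted sum of the three channels
r, g, bl = x[0, 0], x[0, 1], x[0, 2]
gray_direct = 0.3 * r + 0.6 * g + 0.1 * bl
print(np.max(np.abs(out[0, 0] - gray_direct)))  # should be ~0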
Next, the backward pass. Again the goal is just correctness, not efficiency.
def conv_backward_naive(dout, cache):
"""
A naive implementation of the backward pass for a convolutional layer.
Inputs:
- dout: Upstream derivatives.
- cache: A tuple of (x, w, b, conv_param) as in conv_forward_naive
Returns a tuple of:
- dx: Gradient with respect to x
- dw: Gradient with respect to w
- db: Gradient with respect to b
"""
dx, dw, db = None, None, None
# TODO: Implement the convolutional backward pass.
(x, w, b, conv_param) = cache
stride, pad = conv_param["stride"], conv_param["pad"]
N, C, H, W = x.shape
F, C, HH, WW = w.shape
out_H = 1 + (H + 2 * pad - HH) // stride
out_W = 1 + (W + 2 * pad - WW) // stride
x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)),
mode='constant', constant_values=0)
    dx = np.zeros_like(x)
    dw = np.zeros_like(w)
    dx_pad = np.zeros_like(x_pad)
    # each bias touches every output position of its channel
    db = np.sum(dout, axis=(0, 2, 3))
for i in range(out_H):
for j in range(out_W):
            # input window of shape (N, C, HH, WW)
            x_pad_masked = x_pad[:, :, i*stride:i*stride+HH, j*stride:j*stride+WW]
            # compute the gradient of each output channel separately
            for f in range(F):
                # upstream gradient for one output channel, shape (N, 1, 1, 1)
                dout_C = dout[:, f:f+1, i:i+1, j:j+1]
                # of shape (C, HH, WW)
                dw[f, :, :, :] += np.sum(dout_C * x_pad_masked, axis=0)
                # of shape (N, C, HH, WW) = (N, 1, 1, 1) * (C, HH, WW)
                dx_pad[:, :, i*stride:i*stride+HH, j*stride:j*stride+WW] += dout_C * w[f, :, :, :]
            # for n in range(N):  # equivalent per-sample computation of dx_pad
            #     # [F, C, HH, WW] * [F, 1, 1, 1] -> [C, HH, WW]
            #     dx_pad[n, :, i*stride:i*stride+HH, j*stride:j*stride+WW] += \
            #         np.sum(w * (dout[n, :, i, j])[:, None, None, None], axis=0)
    # strip the padding to map dx_pad back onto dx, of shape (N, C, H, W);
    # slicing pad:pad+H (rather than pad:-pad) also handles pad == 0 correctly
    dx = dx_pad[:, :, pad:pad+H, pad:pad+W]
return dx, dw, db
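The notebook verifies this backward pass with a numeric gradient check. A sketch of that cell, assuming the course helpers eval_numerical_gradient_array and rel_error are in scope:
from cs231n.gradient_check import eval_numerical_gradient_array
np.random.seed(231)
x = np.random.randn(4, 3, 5, 5)
w = np.random.randn(2, 3, 3, 3)
b = np.random.randn(2,)
dout = np.random.randn(4, 2, 5, 5)
conv_param = {'stride': 1, 'pad': 1}
out, cache = conv_forward_naive(x, w, b, conv_param)
dx, dw, db = conv_backward_naive(dout, cache)
dx_num = eval_numerical_gradient_array(lambda x: conv_forward_naive(x, w, b, conv_param)[0], x, dout)
dw_num = eval_numerical_gradient_array(lambda w: conv_forward_naive(x, w, b, conv_param)[0], w, dout)
db_num = eval_numerical_gradient_array(lambda b: conv_forward_naive(x, w, b, conv_param)[0], b, dout)
print('dx error: ', rel_error(dx, dx_num))  # errors should be around e-8 or less
print('dw error: ', rel_error(dw, dw_num))
print('db error: ', rel_error(db, db_num))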
Likewise, the max-pooling layers below aim for correctness, not efficiency.
def max_pool_forward_naive(x, pool_param):
"""
A naive implementation of the forward pass for a max-pooling layer.
Inputs:
- x: Input data, of shape (N, C, H, W)
- pool_param: dictionary with the following keys:
- 'pool_height': The height of each pooling region
- 'pool_width': The width of each pooling region
- 'stride': The distance between adjacent pooling regions
    No padding is necessary here; the output size is given below.
Returns a tuple of:
- out: Output data, of shape (N, C, H', W') where H' and W' are given by
H' = 1 + (H - pool_height) / stride
W' = 1 + (W - pool_width) / stride
- cache: (x, pool_param)
"""
out = None
# TODO: Implement the max-pooling forward pass
N,C,H,W = x.shape
pool_height = pool_param["pool_height"]
pool_width = pool_param["pool_width"]
stride = pool_param["stride"]
outH = 1 + (H - pool_height) // stride
outW = 1 + (W - pool_width) // stride
out = np.zeros((N,C,outH,outW))
for i in range(outH):
for j in range(outW):
            # pooling window, of shape (N, C, pool_height, pool_width)
            x_pool_mask = x[:, :, i*stride:i*stride+pool_height, j*stride:j*stride+pool_width]
            # max over the window; the result slice has shape (N, C)
            out[:, :, i, j] = np.max(x_pool_mask, axis=(2, 3))
cache = (x, pool_param)
return out, cache
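A tiny hand-checkable example (not from the notebook): with 2x2 windows and stride 2, each output entry is the max of one quadrant of the input.
x = np.arange(16.0).reshape(1, 1, 4, 4)
pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}
out, _ = max_pool_forward_naive(x, pool_param)
print(out[0, 0])  # [[ 5.  7.]
                  #  [13. 15.]]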
def max_pool_backward_naive(dout, cache):
"""
A naive implementation of the backward pass for a max-pooling layer.
Inputs:
- dout: Upstream derivatives
- cache: A tuple of (x, pool_param) as in the forward pass.
Returns:
- dx: Gradient with respect to x
"""
dx = None
# TODO: Implement the max-pooling backward pass
(x, pool_param) = cache
N,C,H,W = x.shape
HH = pool_param["pool_height"]
WW = pool_param["pool_width"]
stride = pool_param["stride"]
dx = np.zeros_like(x)
outH = 1 + (H - HH) // stride
outW = 1 + (W - WW) // stride
for i in range(outH):
for j in range(outW):
            # pooling window, of shape (N, C, HH, WW)
            x_pool_mask = x[:, :, i * stride:i * stride + HH, j * stride:j * stride + WW]
            # find the max, then build a boolean mask of where it occurs;
            # note: when the max is not unique, every tied entry receives gradient
# (N,C,1,1)
max_x_masked = np.max(x_pool_mask, axis=(2, 3),keepdims=True)
# of shape (N,C,HH,WW)
temp_binary_mask = (x_pool_mask == max_x_masked)
dx[:, :, i * stride: i * stride + HH, j * stride: j * stride + WW] += temp_binary_mask * dout[:, :, i:i+1, j:j+1]
return dx
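To see the tie caveat from the comment above in action (a hypothetical example, not in the notebook): with two equal maxima in one window, both receive the full upstream gradient.
x = np.array([[[[1., 1.],
                [0., 0.]]]])  # (1, 1, 2, 2), tie in the top row
dout = np.ones((1, 1, 1, 1))
pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}
_, cache = max_pool_forward_naive(x, pool_param)
print(max_pool_backward_naive(dout, cache))  # both tied entries get gradient 1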
Implementing fast convolution and pooling layers is a challenge. To spare you the trouble, the starter code provides fast implementations of the forward and backward passes for convolution and pooling layers in cs231n/fast_layers.py; read through them if you have time.
The fast convolution implementation depends on a Cython extension. To compile it, run the following from the cs231n directory:
python setup.py build_ext --inplace
NOTE: The fast implementation for pooling will only perform optimally if the pooling regions are non-overlapping and tile the input. If these conditions are not met then the fast pooling implementation will not be much faster than the naive implementation.
Here is a performance comparison between the naive and fast implementations:
# Rel errors should be around e-9 or less
from cs231n.fast_layers import conv_forward_fast, conv_backward_fast
from time import time
np.random.seed(231)
x = np.random.randn(100, 3, 31, 31)
w = np.random.randn(25, 3, 3, 3)
b = np.random.randn(25,)
dout = np.random.randn(100, 25, 16, 16)
conv_param = {'stride': 2, 'pad': 1}
t0 = time()
out_naive, cache_naive = conv_forward_naive(x, w, b, conv_param)
t1 = time()
out_fast, cache_fast = conv_forward_fast(x, w, b, conv_param)
t2 = time()
print('Testing conv_forward_fast:')
print('Naive: %fs' % (t1 - t0))
print('Fast: %fs' % (t2 - t1))
print('Speedup: %fx' % ((t1 - t0) / (t2 - t1)))
print('Difference: ', rel_error(out_naive, out_fast))
t0 = time()
dx_naive, dw_naive, db_naive = conv_backward_naive(dout, cache_naive)
t1 = time()
dx_fast, dw_fast, db_fast = conv_backward_fast(dout, cache_fast)
t2 = time()
print('\nTesting conv_backward_fast:')
print('Naive: %fs' % (t1 - t0))
print('Fast: %fs' % (t2 - t1))
print('Speedup: %fx' % ((t1 - t0) / (t2 - t1)))
print('dx difference: ', rel_error(dx_naive, dx_fast))
print('dw difference: ', rel_error(dw_naive, dw_fast))
print('db difference: ', rel_error(db_naive, db_fast))
Testing conv_forward_fast:
Naive: 0.125854s
Fast: 0.005614s
Speedup: 22.417803x
Difference: 4.926407851494105e-11
Testing conv_backward_fast:
Naive: 0.269846s
Fast: 0.004650s
Speedup: 58.027019x
dx difference: 1.383704034070129e-11
dw difference: 3.75105216164263e-13
db difference: 0.0
# Relative errors should be close to 0.0
from cs231n.fast_layers import max_pool_forward_fast, max_pool_backward_fast
np.random.seed(231)
x = np.random.randn(100, 3, 32, 32)
dout = np.random.randn(100, 3, 16, 16)
pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}
t0 = time()
out_naive, cache_naive = max_pool_forward_naive(x, pool_param)
t1 = time()
out_fast, cache_fast = max_pool_forward_fast(x, pool_param)
t2 = time()
print('Testing pool_forward_fast:')
print('Naive: %fs' % (t1 - t0))
print('fast: %fs' % (t2 - t1))
print('speedup: %fx' % ((t1 - t0) / (t2 - t1)))
print('difference: ', rel_error(out_naive, out_fast))
t0 = time()
dx_naive = max_pool_backward_naive(dout, cache_naive)
t1 = time()
dx_fast = max_pool_backward_fast(dout, cache_fast)
t2 = time()
print('\nTesting pool_backward_fast:')
print('Naive: %fs' % (t1 - t0))
print('fast: %fs' % (t2 - t1))
print('speedup: %fx' % ((t1 - t0) / (t2 - t1)))
print('dx difference: ', rel_error(dx_naive, dx_fast))
Testing pool_forward_fast:
Naive: 0.007368s
fast: 0.001482s
speedup: 4.969926x
difference: 0.0
Testing pool_backward_fast:
Naive: 0.015857s
fast: 0.007847s
speedup: 2.020660x
dx difference: 0.0
Now we combine the layers implemented above into the common composite ("sandwich") modules.
conv + ReLU
def conv_relu_forward(x, w, b, conv_param):
"""
A convenience layer that performs a convolution followed by a ReLU.
Inputs:
- x: Input to the convolutional layer
- w, b, conv_param: Weights and parameters for the convolutional layer
Returns a tuple of:
- out: Output from the ReLU
- cache: Object to give to the backward pass
"""
a, conv_cache = conv_forward_fast(x, w, b, conv_param)
out, relu_cache = relu_forward(a)
cache = (conv_cache, relu_cache)
return out, cache
def conv_relu_backward(dout, cache):
"""
Backward pass for the conv-relu convenience layer.
"""
conv_cache, relu_cache = cache
da = relu_backward(dout, relu_cache)
dx, dw, db = conv_backward_fast(da, conv_cache)
return dx, dw, db
conv + ReLU + max pooling
def conv_relu_pool_forward(x, w, b, conv_param, pool_param):
"""
Convenience layer that performs a convolution, a ReLU, and a pool.
Inputs:
- x: Input to the convolutional layer
- w, b, conv_param: Weights and parameters for the convolutional layer
- pool_param: Parameters for the pooling layer
Returns a tuple of:
- out: Output from the pooling layer
- cache: Object to give to the backward pass
"""
a, conv_cache = conv_forward_fast(x, w, b, conv_param)
s, relu_cache = relu_forward(a)
out, pool_cache = max_pool_forward_fast(s, pool_param)
cache = (conv_cache, relu_cache, pool_cache)
return out, cache
def conv_relu_pool_backward(dout, cache):
"""
Backward pass for the conv-relu-pool convenience layer
"""
conv_cache, relu_cache, pool_cache = cache
ds = max_pool_backward_fast(dout, pool_cache)
da = relu_backward(ds, relu_cache)
dx, dw, db = conv_backward_fast(da, conv_cache)
return dx, dw, db
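The notebook gradient-checks these sandwich layers as well. A sketch for conv_relu_pool, again assuming eval_numerical_gradient_array and rel_error are in scope:
from cs231n.gradient_check import eval_numerical_gradient_array
np.random.seed(231)
x = np.random.randn(2, 3, 16, 16)
w = np.random.randn(3, 3, 3, 3)
b = np.random.randn(3,)
dout = np.random.randn(2, 3, 8, 8)  # conv preserves 16x16, 2x2 pool halves it
conv_param = {'stride': 1, 'pad': 1}
pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}
out, cache = conv_relu_pool_forward(x, w, b, conv_param, pool_param)
dx, dw, db = conv_relu_pool_backward(dout, cache)
dx_num = eval_numerical_gradient_array(lambda x: conv_relu_pool_forward(x, w, b, conv_param, pool_param)[0], x, dout)
print('dx error: ', rel_error(dx_num, dx))  # should be around e-8 or less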
Now that you have implemented all the necessary layers, we can put them together into a simple convolutional network.
Open the file cs231n/classifiers/cnn.py and complete the implementation of the ThreeLayerConvNet class. Remember you can use the fast/sandwich layers (already imported for you) in your implementation.
class ThreeLayerConvNet(object):
"""
A three-layer convolutional network with the following architecture:
conv - relu - 2x2 max pool - affine - relu - affine - softmax
The network operates on minibatches of data that have shape (N, C, H, W)
consisting of N images, each with height H and width W and with C input
channels.
"""
def __init__(self, input_dim=(3, 32, 32), num_filters=32, filter_size=7,
hidden_dim=100, num_classes=10, weight_scale=1e-3, reg=0.0,
dtype=np.float32):
"""Initialize a new network.
Inputs:
- input_dim: Tuple (C, H, W) giving size of input data
- num_filters: Number of filters to use in the convolutional layer
- filter_size: Width/height of filters to use in the convolutional layer
- hidden_dim: Number of units to use in the fully-connected hidden layer
- num_classes: Number of scores to produce from the final affine layer.
- weight_scale: Scalar giving standard deviation for random initialization
of weights.
- reg: Scalar giving L2 regularization strength
- dtype: numpy datatype to use for computation.
"""
self.params = {}
self.reg = reg
self.dtype = dtype
############################################################################
# TODO: Initialize weights and biases for the three-layer convolutional #
# network. Weights should be initialized from a Gaussian centered at 0.0 #
# with standard deviation equal to weight_scale; biases should be #
# initialized to zero. All weights and biases should be stored in the #
# dictionary self.params. Store weights and biases for the convolutional #
# layer using the keys 'W1' and 'b1'; use keys 'W2' and 'b2' for the #
# weights and biases of the hidden affine layer, and keys 'W3' and 'b3' #
# for the weights and biases of the output affine layer. #
# #
# IMPORTANT: For this assignment, you can assume that the padding #
# and stride of the first convolutional layer are chosen so that #
# **the width and height of the input are preserved**. Take a look at #
# the start of the loss() function to see how that happens. #
############################################################################
""""""
        # conv layer
        # of shape (C_out, C_in, HH, WW)
        self.params['W1'] = weight_scale * np.random.randn(num_filters, input_dim[0], filter_size, filter_size)
        # of shape (C_out,)
        self.params['b1'] = np.zeros(num_filters)
        # hidden affine layer: the conv layer preserves the spatial size (because
        # of the padding), but the 2x2 max pool halves the height and width
        # input of shape (N, C_out, H/2, W/2), output of shape (N, hidden_dim)
        hidden_input_dim = num_filters * (input_dim[1] // 2) * (input_dim[2] // 2)
        self.params['W2'] = weight_scale * np.random.randn(hidden_input_dim, hidden_dim)
        self.params['b2'] = np.zeros(hidden_dim)
        # output affine (classification) layer
        self.params['W3'] = weight_scale * np.random.randn(hidden_dim, num_classes)
        self.params['b3'] = np.zeros(num_classes)
############################################################################
# END OF YOUR CODE #
############################################################################
for k, v in self.params.items():
self.params[k] = v.astype(dtype)
def loss(self, X, y=None):
"""
Evaluate loss and gradient for the three-layer convolutional network.
Input / output: Same API as TwoLayerNet in fc_net.py.
"""
W1, b1 = self.params['W1'], self.params['b1']
W2, b2 = self.params['W2'], self.params['b2']
W3, b3 = self.params['W3'], self.params['b3']
# pass conv_param to the forward pass for the convolutional layer
# Padding and stride chosen to preserve the input spatial size
filter_size = W1.shape[2]
conv_param = {'stride': 1, 'pad': (filter_size - 1) // 2}
# pass pool_param to the forward pass for the max-pooling layer
pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}
scores = None
############################################################################
# TODO: Implement the forward pass for the three-layer convolutional net, #
# computing the class scores for X and storing them in the scores #
# variable. #
# #
# Remember you can use the functions defined in cs231n/fast_layers.py and #
# cs231n/layer_utils.py in your implementation (already imported). #
############################################################################
        # conv - relu - 2x2 max pool
        conv_out, conv_cache = conv_relu_pool_forward(X, W1, b1, conv_param, pool_param)
        # hidden affine - relu
        fc_out, fc_cache = affine_relu_forward(conv_out, W2, b2)
        # output affine layer: class scores of shape (N, num_classes),
        # before any softmax normalization
        scores, scores_cache = affine_forward(fc_out, W3, b3)
if y is None:
return scores
loss, grads = 0, {}
############################################################################
# TODO: Implement the backward pass for the three-layer convolutional net, #
# storing the loss and gradients in the loss and grads variables. Compute #
# data loss using softmax, and make sure that grads[k] holds the gradients #
# for self.params[k]. Don't forget to add L2 regularization! #
# #
# NOTE: To ensure that your implementation matches ours and you pass the #
# automated tests, make sure that your L2 regularization includes a factor #
# of 0.5 to simplify the expression for the gradient. #
############################################################################
        loss, dout = softmax_loss(scores, y)
        # gradients of the output affine layer
        ds, grads["W3"], grads["b3"] = affine_backward(dout, scores_cache)
        loss += self.reg * 0.5 * np.sum(W3 * W3)
        grads["W3"] += self.reg * W3
        # gradients of the hidden affine layer
        da, grads["W2"], grads["b2"] = affine_relu_backward(ds, fc_cache)
        loss += self.reg * 0.5 * np.sum(W2 * W2)
        grads["W2"] += self.reg * W2
        # gradients of the conv layer
        dx, grads["W1"], grads["b1"] = conv_relu_pool_backward(da, conv_cache)
        loss += self.reg * 0.5 * np.sum(W1 * W1)
        grads["W1"] += self.reg * W1
return loss, grads
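After building the model, the notebook first sanity-checks the initial loss: with softmax over 10 classes and freshly initialized weights it should be close to log(10) ≈ 2.303, and turning on regularization should make it slightly larger. A sketch of that check:
model = ThreeLayerConvNet()
N = 50
X = np.random.randn(N, 3, 32, 32)
y = np.random.randint(10, size=N)
loss, grads = model.loss(X, y)
print('Initial loss (no regularization): ', loss)    # ~2.303
model.reg = 0.5
loss, grads = model.loss(X, y)
print('Initial loss (with regularization): ', loss)  # slightly larger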
A nice trick is to train your model with just a few training samples. You should be able to overfit small datasets, which will result in very high training accuracy and comparatively low validation accuracy.
np.random.seed(231)
num_train = 100
small_data = {
'X_train': data['X_train'][:num_train],
'y_train': data['y_train'][:num_train],
'X_val': data['X_val'],
'y_val': data['y_val'],
}
model = ThreeLayerConvNet(weight_scale=1e-2)
solver = Solver(model, small_data,
num_epochs=15, batch_size=50,
update_rule='adam',
optim_config={
'learning_rate': 1e-3,
},
verbose=True, print_every=1)
solver.train()
Plotting the loss, training accuracy, and validation accuracy should show clear overfitting:
plt.subplot(2, 1, 1)
plt.plot(solver.loss_history, 'o')
plt.xlabel('iteration')
plt.ylabel('loss')
plt.subplot(2, 1, 2)
plt.plot(solver.train_acc_history, '-o')
plt.plot(solver.val_acc_history, '-o')
plt.legend(['train', 'val'], loc='upper left')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()
By training the three-layer convolutional network for one epoch, you should achieve greater than 40% accuracy on the training set:
model = ThreeLayerConvNet(weight_scale=0.001, hidden_dim=500, reg=0.001)
solver = Solver(model, data,
num_epochs=1, batch_size=50,
update_rule='adam',
optim_config={
'learning_rate': 1e-3,
},
verbose=True, print_every=20)
solver.train()
Visualize the first-layer convolution filters, which have shape (C_out, C_in, HH, WW).
First, a helper that tiles N images into a grid:
from math import ceil, sqrt

def visualize_grid(Xs, ubound=255.0, padding=1):
"""
Reshape a 4D tensor of image data to a grid for easy visualization.
Inputs:
- Xs: Data of shape (N, H, W, C)
- ubound: Output grid will have values scaled to the range [0, ubound]
- padding: The number of blank pixels between elements of the grid
"""
(N, H, W, C) = Xs.shape
grid_size = int(ceil(sqrt(N)))
grid_height = H * grid_size + padding * (grid_size - 1)
grid_width = W * grid_size + padding * (grid_size - 1)
grid = np.zeros((grid_height, grid_width, C))
next_idx = 0
y0, y1 = 0, H
for y in range(grid_size):
x0, x1 = 0, W
for x in range(grid_size):
if next_idx < N:
img = Xs[next_idx]
low, high = np.min(img), np.max(img)
grid[y0:y1, x0:x1] = ubound * (img - low) / (high - low)
# grid[y0:y1, x0:x1] = Xs[next_idx]
next_idx += 1
x0 += W + padding
x1 += W + padding
y0 += H + padding
y1 += H + padding
# grid_max = np.max(grid)
# grid_min = np.min(grid)
# grid = ubound * (grid - grid_min) / (grid_max - grid_min)
return grid
Now display the C_out first-layer filters:
from cs231n.vis_utils import visualize_grid
grid = visualize_grid(model.params['W1'].transpose(0, 2, 3, 1))
plt.imshow(grid.astype('uint8'))
plt.axis('off')
plt.gcf().set_size_inches(5, 5)
plt.show()
We already saw that batch normalization is a very useful technique for training deep fully-connected networks. As proposed in the original paper [3], batch normalization can also be used for convolutional networks, but we need to tweak it a bit; the modification will be called “spatial batch normalization.”
Normally batch normalization accepts inputs of shape (N, D) and produces outputs of shape (N, D), where we normalize across the minibatch dimension N. For data coming from convolutional layers, batch normalization needs to accept inputs of shape (N, C, H, W) and produce outputs of shape (N, C, H, W), where the N dimension gives the minibatch size and the (H, W) dimensions give the spatial size of the feature map.
If the feature map was produced using convolutions, then we expect the statistics of each feature channel to be relatively consistent both between different images and between different locations within the same image. Therefore spatial batch normalization computes a mean and variance for each of the C feature channels by computing statistics over both the minibatch dimension N and the spatial dimensions H and W.
[3] Sergey Ioffe and Christian Szegedy, “Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift”, ICML 2015.
In the file cs231n/layers.py, implement the forward pass for spatial batch normalization in the function spatial_batchnorm_forward.
def spatial_batchnorm_forward(x, gamma, beta, bn_param):
"""
Computes the forward pass for spatial batch normalization.
Inputs:
- x: Input data of shape (N, C, H, W)
- gamma: Scale parameter, of shape (C,)
- beta: Shift parameter, of shape (C,)
- bn_param: Dictionary with the following keys:
- mode: 'train' or 'test'; required
- eps: Constant for numeric stability
- momentum: Constant for running mean / variance. momentum=0 means that
old information is discarded completely at every time step, while
momentum=1 means that new information is never incorporated. The
default of momentum=0.9 should work well in most situations.
      - running_mean: Array of shape (C,) giving running mean of features
      - running_var: Array of shape (C,) giving running variance of features
Returns a tuple of:
- out: Output data, of shape (N, C, H, W)
- cache: Values needed for the backward pass
"""
out, cache = None, None
###########################################################################
# TODO: Implement the forward pass for spatial batch normalization. #
# #
# HINT: You can implement spatial batch normalization by calling the #
# vanilla version of batch normalization you implemented above. #
# Your implementation should be very short; ours is less than five lines. #
###########################################################################
    N, C, H, W = x.shape
    # move channels last, then flatten: (N, C, H, W) -> (N*H*W, C)
    x_flat = x.transpose(0, 2, 3, 1).reshape(-1, C)
    temp_out, cache = batchnorm_forward(x_flat, gamma, beta, bn_param)
    out = temp_out.reshape(N, H, W, C).transpose(0, 3, 1, 2)
    # equivalent variant using (0, 3, 2, 1) transposes:
    # temp_out, cache = batchnorm_forward(x.transpose(0, 3, 2, 1).reshape((N * H * W, C)), gamma, beta, bn_param)
    # out = temp_out.reshape(N, W, H, C).transpose(0, 3, 2, 1)
return out, cache
Note that for an input tensor of shape (N, C, H, W), spatial batch normalization treats the values at every spatial position of every image as samples of a single per-channel distribution; it is exactly ordinary batch normalization applied to the input reshaped to (N*H*W, C).
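A small check of that equivalence (not part of the notebook): the per-channel moments of the flattened view match the moments taken directly over the (N, H, W) axes.
N, C, H, W = 2, 3, 4, 5
x = np.random.randn(N, C, H, W)
flat = x.transpose(0, 2, 3, 1).reshape(-1, C)  # (N*H*W, C)
print(np.allclose(flat.mean(axis=0), x.mean(axis=(0, 2, 3))))  # True
print(np.allclose(flat.var(axis=0), x.var(axis=(0, 2, 3))))    # True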
Now the notebook's test:
np.random.seed(231)
# Check the training-time forward pass by checking means and variances
# of features both before and after spatial batch normalization
N, C, H, W = 2, 3, 4, 5
x = 4 * np.random.randn(N, C, H, W) + 10
print('Before spatial batch normalization:')
print(' Shape: ', x.shape)
print(' Means: ', x.mean(axis=(0, 2, 3)))
print(' Stds: ', x.std(axis=(0, 2, 3)))
# Means should be close to zero and stds close to one
gamma, beta = np.ones(C), np.zeros(C)
bn_param = {'mode': 'train'}
out, _ = spatial_batchnorm_forward(x, gamma, beta, bn_param)
print('After spatial batch normalization:')
print(' Shape: ', out.shape)
print(' Means: ', out.mean(axis=(0, 2, 3)))
print(' Stds: ', out.std(axis=(0, 2, 3)))
# Means should be close to beta and stds close to gamma
gamma, beta = np.asarray([3, 4, 5]), np.asarray([6, 7, 8])
out, _ = spatial_batchnorm_forward(x, gamma, beta, bn_param)
print('After spatial batch normalization (nontrivial gamma, beta):')
print(' Shape: ', out.shape)
print(' Means: ', out.mean(axis=(0, 2, 3)))
print(' Stds: ', out.std(axis=(0, 2, 3)))
Before spatial batch normalization:
Shape: (2, 3, 4, 5)
Means: [9.33463814 8.90909116 9.11056338]
Stds: [3.61447857 3.19347686 3.5168142 ]
After spatial batch normalization:
Shape: (2, 3, 4, 5)
Means: [ 6.18949336e-16 5.99520433e-16 -1.22124533e-16]
Stds: [0.99999962 0.99999951 0.9999996 ]
After spatial batch normalization (nontrivial gamma, beta):
Shape: (2, 3, 4, 5)
Means: [6. 7. 8.]
Stds: [2.99999885 3.99999804 4.99999798]
In the file cs231n/layers.py, implement the backward pass for spatial batch normalization in the function spatial_batchnorm_backward.
def spatial_batchnorm_backward(dout, cache):
"""
Computes the backward pass for spatial batch normalization.
Inputs:
- dout: Upstream derivatives, of shape (N, C, H, W)
- cache: Values from the forward pass
Returns a tuple of:
- dx: Gradient with respect to inputs, of shape (N, C, H, W)
- dgamma: Gradient with respect to scale parameter, of shape (C,)
- dbeta: Gradient with respect to shift parameter, of shape (C,)
"""
dx, dgamma, dbeta = None, None, None
###########################################################################
# TODO: Implement the backward pass for spatial batch normalization. #
# #
# HINT: You can implement spatial batch normalization by calling the #
# vanilla version of batch normalization you implemented above. #
# Your implementation should be very short; ours is less than five lines. #
###########################################################################
N,C,H,W = dout.shape
dout = dout.transpose(0,2,3,1).reshape(-1,C)
dx,dgamma,dbeta = batchnorm_backward(dout,cache)
dx = dx.reshape(N,H,W,C).transpose(0,3,1,2)
return dx, dgamma, dbeta
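As with the other layers, the notebook gradient-checks this backward pass. A sketch, assuming eval_numerical_gradient_array and rel_error are in scope:
np.random.seed(231)
N, C, H, W = 2, 3, 4, 5
x = 5 * np.random.randn(N, C, H, W) + 12
gamma = np.random.randn(C)
beta = np.random.randn(C)
dout = np.random.randn(N, C, H, W)
bn_param = {'mode': 'train'}
_, cache = spatial_batchnorm_forward(x, gamma, beta, bn_param)
dx, dgamma, dbeta = spatial_batchnorm_backward(dout, cache)
dx_num = eval_numerical_gradient_array(lambda x: spatial_batchnorm_forward(x, gamma, beta, bn_param)[0], x, dout)
dg_num = eval_numerical_gradient_array(lambda g: spatial_batchnorm_forward(x, g, beta, bn_param)[0], gamma, dout)
db_num = eval_numerical_gradient_array(lambda b: spatial_batchnorm_forward(x, gamma, b, bn_param)[0], beta, dout)
print('dx error: ', rel_error(dx_num, dx))
print('dgamma error: ', rel_error(dg_num, dgamma))
print('dbeta error: ', rel_error(db_num, dbeta))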
In the previous notebook, we mentioned that Layer Normalization is an alternative normalization technique that mitigates the batch size limitations of Batch Normalization. However, as the authors of [4] observed, Layer Normalization does not perform as well as Batch Normalization when used with Convolutional Layers:
With fully connected layers, all the hidden units in a layer tend to make similar contributions to the final prediction, and re-centering and rescaling the summed inputs to a layer works well. However, the assumption of similar contributions is no longer true for convolutional neural networks. The large number of the hidden units whose receptive fields lie near the boundary of the image are rarely turned on and thus have very different statistics from the rest of the hidden units within the same layer.
The authors of [5] propose an intermediary technique. In contrast to Layer Normalization, where you normalize over the entire feature per-datapoint, they suggest a consistent splitting of each per-datapoint feature into G groups, and a per-group per-datapoint normalization instead.
Even though an assumption of equal contribution is still being made within each group, the authors hypothesize that this is not as problematic, as innate grouping arises within features for visual recognition. One example they use to illustrate this is that many high-performance handcrafted features in traditional Computer Vision have terms that are explicitly grouped together. Take for example Histogram of Oriented Gradients [6]-- after computing histograms per spatially local block, each per-block histogram is normalized before being concatenated together to form the final feature vector.
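Concretely, for an input of shape (N, C, H, W), group normalization boils down to a reshape plus per-group moments. A minimal numpy sketch of the idea (shapes chosen arbitrarily):
N, C, H, W, G = 2, 6, 4, 4, 3
x = np.random.randn(N, C, H, W)
xg = x.reshape(N, G, C // G, H, W)              # split channels into G groups
mu = xg.mean(axis=(2, 3, 4), keepdims=True)     # one mean per (sample, group)
var = xg.var(axis=(2, 3, 4), keepdims=True)     # one variance per (sample, group)
x_hat = ((xg - mu) / np.sqrt(var + 1e-5)).reshape(N, C, H, W)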
You will now implement Group Normalization. Note that this normalization technique that you are to implement in the following cells was introduced and published to arXiv less than a month ago – this truly is still an ongoing and excitingly active field of research!
[4] Ba, Jimmy Lei, Jamie Ryan Kiros, and Geoffrey E. Hinton. “Layer Normalization.” stat 1050 (2016): 21.
[5] Wu, Yuxin, and Kaiming He. “Group Normalization.” arXiv preprint arXiv:1803.08494 (2018).
[6] N. Dalal and B. Triggs. Histograms of oriented gradients for human detection. In Computer Vision and Pattern Recognition (CVPR), 2005.
In the file cs231n/layers.py, implement the forward pass for group normalization in the function spatial_groupnorm_forward.
def spatial_groupnorm_forward(x, gamma, beta, G, gn_param):
"""
Computes the forward pass for spatial group normalization.
In contrast to layer normalization, group normalization splits each entry
in the data into G contiguous pieces, which it then normalizes independently.
Per feature shifting and scaling are then applied to the data,
in a manner identical to that of batch normalization and layer normalization.
Inputs:
- x: Input data of shape (N, C, H, W)
- gamma: Scale parameter, of shape (C,)
- beta: Shift parameter, of shape (C,)
    - G: Integer number of groups to split into, should be a divisor of C
- gn_param: Dictionary with the following keys:
- eps: Constant for numeric stability
Returns a tuple of:
- out: Output data, of shape (N, C, H, W)
- cache: Values needed for the backward pass
"""
out, cache = None, None
eps = gn_param.get('eps',1e-5)
###########################################################################
# TODO: Implement the forward pass for spatial group normalization. #
# This will be extremely similar to the layer norm implementation. #
# In particular, think about how you could transform the matrix so that #
# the bulk of the code is similar to both train-time batch normalization #
# and layer normalization! #
###########################################################################
    N, C, H, W = x.shape
    # split channels into G groups: (N, C, H, W) -> (N*G, C/G * H * W)
    x_grouped = x.reshape(N * G, -1)
    # the rest mirrors layer norm: transpose to (D', N') with
    # D' = C/G * H * W and N' = N*G, so the math looks like batch norm
    x_T = x_grouped.T                          # (D', N')
    mean = np.mean(x_T, axis=0)                # (N',)
    var = np.var(x_T, axis=0)                  # (N',)
    x_hat = (x_T - mean) / np.sqrt(var + eps)  # (D', N')
    x_hat = x_hat.T.reshape(N, C, H, W)
    # gamma of shape (C,), beta of shape (C,): lift to (1, C, 1, 1) for broadcasting
    if len(gamma.shape) == 1:
        gamma = gamma[None, :, None, None]
    if len(beta.shape) == 1:
        beta = beta[None, :, None, None]
    out = x_hat * gamma + beta
    # cache everything in the 5D layout the backward pass expects:
    # x as (N, G, C/G, H, W), mean and var as (N, G, 1, 1, 1)
    cache = (gamma, x_hat, x.reshape(N, G, C // G, H, W),
             mean.reshape(N, G, 1, 1, 1), var.reshape(N, G, 1, 1, 1), eps)
return out, cache
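A quick check in the same spirit as the batch-norm test above (a sketch; the shapes are illustrative): per (sample, group) means should be near 0 and stds near 1.
np.random.seed(231)
N, C, H, W, G = 2, 6, 4, 5, 2
x = 4 * np.random.randn(N, C, H, W) + 10
gamma, beta = np.ones(C), np.zeros(C)
out, _ = spatial_groupnorm_forward(x, gamma, beta, G, {})
out_g = out.reshape(N * G, -1)  # one row per (sample, group)
print('Means: ', out_g.mean(axis=1))  # close to 0
print('Stds:  ', out_g.std(axis=1))   # close to 1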
In the file cs231n/layers.py, implement the backward pass for spatial group normalization in the function spatial_groupnorm_backward.
def spatial_groupnorm_backward(dout, cache):
"""
Computes the backward pass for spatial group normalization.
Inputs:
- dout: Upstream derivatives, of shape (N, C, H, W)
- cache: Values from the forward pass
Returns a tuple of:
- dx: Gradient with respect to inputs, of shape (N, C, H, W)
    - dgamma: Gradient with respect to scale parameter, of shape (1, C, 1, 1)
    - dbeta: Gradient with respect to shift parameter, of shape (1, C, 1, 1)
"""
###########################################################################
# TODO: Implement the backward pass for spatial group normalization. #
# This will be extremely similar to the layer norm implementation. #
###########################################################################
# x of shape (N,G,C/G,H,W)
# x_hat of shape (N,C,H,W)
gamma, x_hat, x, mean, var, eps = cache
dgamma = np.sum(dout * x_hat, axis=(0,2,3),keepdims=True) #(N,C,H,W) -> (1,C,1,1)
dbeta = np.sum(dout, axis=(0,2,3),keepdims=True) #(N,C,H,W) -> (1,C,1,1)
    # compute dx with the chain rule, one branch at a time
N, G, C_G, H, W = x.shape
dx_hat = dout * gamma
# (N,G,C/G,H,W)
dx_hat = dx_hat.reshape(*x.shape)
    # (C/G, H, W, N, G) -- (D', N'); from here on it mirrors the batch norm backward
dx_hat = dx_hat.transpose(2,3,4,0,1)
# (C/G,H,W, N,G) -- (D',N')
x = x.transpose(2, 3, 4, 0, 1)
mean = mean.transpose(2, 3, 4, 0, 1) # (1,1,1, N,G)
var = var.transpose(2, 3, 4, 0, 1) # (1,1,1, N,G)
div = 1 / (np.sqrt(var + eps))
    # branch 1: the direct path, of shape (D', N')
dx_1 = div * dx_hat
    # branch 2: through the mean, of shape (1, 1, 1, N, G)
dmean = np.sum(- div * dx_hat,axis=(0,1,2),keepdims=True)
    # branch 3: through the variance, of shape (1, 1, 1, N, G)
dvar = np.sum(- 0.5 * (x - mean) * (div**3) * dx_hat,axis=(0,1,2),keepdims=True)
    # the mean's contribution inside the variance
newN = C_G * H * W
dmean += np.sum(-2 * (x - mean) * dvar,axis=(0,1,2),keepdims = True) / newN
    # x_i's contribution inside the variance
dx_2 = 2 * (x - mean) * dvar / newN
dx = dx_1 + dx_2 + dmean / newN
# (C/G,H,W, N,G) -> (N,G, C/G,H,W)
dx = dx.transpose(3,4,0,1,2)
dx = dx.reshape(*dout.shape)
return dx, dgamma, dbeta
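And a final gradient-check sketch for group normalization, assuming eval_numerical_gradient_array and rel_error are in scope (gamma is passed as (1, C, 1, 1) to match the shape of dgamma returned above):
np.random.seed(231)
N, C, H, W, G = 2, 6, 4, 5, 2
x = 5 * np.random.randn(N, C, H, W) + 12
gamma = np.random.randn(1, C, 1, 1)
beta = np.random.randn(1, C, 1, 1)
dout = np.random.randn(N, C, H, W)
gn_param = {}
_, cache = spatial_groupnorm_forward(x, gamma, beta, G, gn_param)
dx, dgamma, dbeta = spatial_groupnorm_backward(dout, cache)
dx_num = eval_numerical_gradient_array(lambda x: spatial_groupnorm_forward(x, gamma, beta, G, gn_param)[0], x, dout)
print('dx error: ', rel_error(dx_num, dx))  # should be close to 0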
Finally, you would use the modules implemented above to train a suitable CNN on the CIFAR-10 dataset.
The latest version of the assignment moves this part into the next notebook, where the CNN is built in PyTorch or TensorFlow instead; that is more convenient and also allows training on a GPU. See the next assignment for the details.