Let's implement a convolutional neural network. Honestly, this one is fairly challenging, so we'll take it step by step.
Below are some of the learning materials I used to understand convolutional neural networks.
The idea behind the convolutional layer's forward pass is fairly simple; the fiddly part is writing the code. See the comments.
def conv_forward_naive(x, w, b, conv_param):
"""A naive implementation of the forward pass for a convolutional layer.
The input consists of N data points, each with C channels, height H and
width W. We convolve each input with F different filters, where each filter
spans all C channels and has height HH and width WW.
Input:
- x: Input data of shape (N, C, H, W)
- w: Filter weights of shape (F, C, HH, WW)
- b: Biases, of shape (F,)
- conv_param: A dictionary with the following keys:
- 'stride': The number of pixels between adjacent receptive fields in the
horizontal and vertical directions.
- 'pad': The number of pixels that will be used to zero-pad the input.
    During padding, 'pad' zeros should be placed symmetrically (i.e. equally on both sides)
    along the height and width axes of the input. Be careful not to modify the original
input x directly.
Returns a tuple of:
- out: Output data, of shape (N, F, H', W') where H' and W' are given by
H' = 1 + (H + 2 * pad - HH) / stride
W' = 1 + (W + 2 * pad - WW) / stride
- cache: (x, w, b, conv_param)
"""
out = None
###########################################################################
# TODO: Implement the convolutional forward pass. #
# Hint: you can use the function np.pad for padding. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Gather the quantities we need
    N, C, H_input, W_input = x.shape  # N samples, C channels, height H_input, width W_input
    F, C_w_, HH, WW = w.shape  # F filters, C_w_ channels, height HH, width WW
    stride = conv_param["stride"]  # stride
    pad = conv_param["pad"]  # amount of zero padding
    # Compute the output height and width
    out_H = int(1 + (H_input + 2 * pad - HH) / stride)
    out_W = int(1 + (W_input + 2 * pad - WW) / stride)
    # Zero-pad x with pad zeros on each side of the height and width axes
    x_pad = np.pad(x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), "constant", constant_values=0)
    # Reshape the filters into an F x (C * HH * WW) matrix (convenient for dot products)
    w_row = w.reshape(F, -1)
    # Allocate the output so the loops below can fill it in
    out = np.zeros((N, F, out_H, out_W))
    # Do the convolution
    for n in range(N):  # loop over samples
        for f in range(F):  # loop over filters
            for i in range(out_H):  # loop over output rows
                for j in range(out_W):  # loop over output columns
                    # Slice out the current receptive field
                    window = x_pad[n, :, i * stride:i * stride + HH, j * stride:j * stride + WW]
                    # Flatten the window into a single row
                    window_row = window.reshape(1, -1)
                    # Dot the window with the filter and add the bias
                    out[n, f, i, j] = np.sum(window_row * w_row[f, :]) + b[f]
    # Store the padded x in the cache (saves re-padding in the backward pass)
    x = x_pad
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
cache = (x, w, b, conv_param)
return out, cache
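To convince yourself the output-shape formula H' = 1 + (H + 2 * pad - HH) / stride is right, a quick check like the one below helps. This is a minimal sketch of my own (not part of the assignment); it assumes numpy is imported as np and conv_forward_naive is defined as above, and the sizes are made up.
import numpy as np

# Hypothetical sizes: 2 images, 3 channels, 8x8 inputs, 4 filters of size 3x3.
x = np.random.randn(2, 3, 8, 8)
w = np.random.randn(4, 3, 3, 3)
b = np.random.randn(4)
conv_param = {"stride": 1, "pad": 1}

out, _ = conv_forward_naive(x, w, b, conv_param)
# With stride 1 and pad 1, a 3x3 filter preserves the spatial size: 1 + (8 + 2 - 3) / 1 = 8.
assert out.shape == (2, 4, 8, 8), out.shape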
The convolutional layer's gradient is also fairly simple: the layer only does sums of products, so if you think it through carefully it falls out naturally. If it's still unclear, read the code comments.
def conv_backward_naive(dout, cache):
"""A naive implementation of the backward pass for a convolutional layer.
Inputs:
- dout: Upstream derivatives.
- cache: A tuple of (x, w, b, conv_param) as in conv_forward_naive
Returns a tuple of:
- dx: Gradient with respect to x
- dw: Gradient with respect to w
- db: Gradient with respect to b
"""
dx, dw, db = None, None, None
###########################################################################
# TODO: Implement the convolutional backward pass. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Gather the quantities we need
    x, w, b, conv_param = cache
    N, C, H_input, W_input = x.shape  # note: x here is the padded input cached by the forward pass
    F, C_w_, HH, WW = w.shape  # F filters, C_w_ channels, height HH, width WW
    stride = conv_param["stride"]  # stride
    pad = conv_param["pad"]  # amount of zero padding
    # Compute the output height and width (x is already padded, so no extra 2 * pad here)
    out_H = int(1 + (H_input - HH) / stride)
    out_W = int(1 + (W_input - WW) / stride)
    # Allocate space for dx, dw, db
    dx = np.zeros_like(x)
    dw = np.zeros_like(w)
    db = np.zeros_like(b)
    for n in range(N):
        for f in range(F):
            for i in range(out_H):
                for j in range(out_W):
                    # Slice out the current receptive field
                    window = x[n, :, i * stride:i * stride + HH, j * stride:j * stride + WW]
                    # Accumulate db
                    db[f] += dout[n, f, i, j]
                    # Accumulate dw
                    dw[f] += window * dout[n, f, i, j]
                    # Accumulate dx
                    dx[n, :, i * stride:i * stride + HH, j * stride:j * stride + WW] += w[f] * dout[n, f, i, j]
    # Strip the padding from dx
    dx = dx[:, :, pad:H_input - pad, pad:W_input - pad]
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return dx, dw, db
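To gain some confidence in the backward pass, here is a minimal self-contained check of my own (the notebook itself uses eval_numerical_gradient_array for the full check): it compares the analytic db[0] against a central-difference estimate, assuming numpy as np and the two functions above.
import numpy as np

x = np.random.randn(2, 3, 7, 7)
w = np.random.randn(3, 3, 3, 3)
b = np.random.randn(3)
conv_param = {"stride": 1, "pad": 1}

out, cache = conv_forward_naive(x, w, b, conv_param)
dout = np.random.randn(*out.shape)
_, _, db = conv_backward_naive(dout, cache)

# Central-difference estimate of db[0] for the scalar function sum(out * dout)
h = 1e-6
bp, bm = b.copy(), b.copy()
bp[0] += h
bm[0] -= h
num_db0 = (np.sum(conv_forward_naive(x, w, bp, conv_param)[0] * dout)
           - np.sum(conv_forward_naive(x, w, bm, conv_param)[0] * dout)) / (2 * h)
print(db[0], num_db0)  # the two numbers should match closely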
This one is also fairly simple; watching the lecture is enough to understand it.
def max_pool_forward_naive(x, pool_param):
"""A naive implementation of the forward pass for a max-pooling layer.
Inputs:
- x: Input data, of shape (N, C, H, W)
- pool_param: dictionary with the following keys:
- 'pool_height': The height of each pooling region
- 'pool_width': The width of each pooling region
- 'stride': The distance between adjacent pooling regions
No padding is necessary here, eg you can assume:
- (H - pool_height) % stride == 0
- (W - pool_width) % stride == 0
Returns a tuple of:
- out: Output data, of shape (N, C, H', W') where H' and W' are given by
H' = 1 + (H - pool_height) / stride
W' = 1 + (W - pool_width) / stride
- cache: (x, pool_param)
"""
out = None
###########################################################################
# TODO: Implement the max-pooling forward pass #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Gather the quantities we need
    N, C, H, W = x.shape  # N samples, C channels, height H, width W
    pool_height = pool_param["pool_height"]  # pooling window height
    pool_width = pool_param["pool_width"]  # pooling window width
    stride = pool_param["stride"]  # stride
    # Compute the output height and width
    out_H = int(1 + (H - pool_height) / stride)
    out_W = int(1 + (W - pool_width) / stride)
    # Allocate space for out
    out = np.zeros((N, C, out_H, out_W))
    for n in range(N):
        for c in range(C):
            for i in range(out_H):
                for j in range(out_W):
                    # Slice out the current pooling window
                    window = x[n, c, i * stride:i * stride + pool_height, j * stride:j * stride + pool_width]
                    # Take the maximum of the window
                    out[n, c, i, j] = np.max(window)
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
cache = (x, pool_param)
return out, cache
Because this is max pooling, x only contributes to the loss where x[i, j] is the maximum of some pooling window; in that case it gets a gradient, otherwise its gradient is 0. This is analogous to ReLU: when computing the ReLU gradient, inputs with x < 0 get no gradient because they had no effect on the loss. If it's still unclear, look at the code.
def max_pool_backward_naive(dout, cache):
"""A naive implementation of the backward pass for a max-pooling layer.
Inputs:
- dout: Upstream derivatives
- cache: A tuple of (x, pool_param) as in the forward pass.
Returns:
- dx: Gradient with respect to x
"""
dx = None
###########################################################################
# TODO: Implement the max-pooling backward pass #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Gather the quantities we need
    x, pool_param = cache
    N, C, H, W = x.shape  # N samples, C channels, height H, width W
    pool_height = pool_param["pool_height"]  # pooling window height
    pool_width = pool_param["pool_width"]  # pooling window width
    stride = pool_param["stride"]  # stride
    # Compute the output height and width
    out_H = int(1 + (H - pool_height) / stride)
    out_W = int(1 + (W - pool_width) / stride)
    # Allocate space for dx
    dx = np.zeros_like(x)
    for n in range(N):
        for c in range(C):
            for i in range(out_H):
                for j in range(out_W):
                    # Slice out the current pooling window
                    window = x[n, c, i * stride:i * stride + pool_height, j * stride:j * stride + pool_width]
                    # Find the (flattened) index of the maximum inside the window
                    max_index = np.argmax(window)
                    # Route the upstream gradient to that position only
                    dx[n, c, i * stride + max_index // pool_width, j * stride + max_index % pool_width] += dout[n, c, i, j]
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return dx
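The only slightly fiddly part above is turning np.argmax's flattened index back into (row, col) coordinates inside the window. A tiny sketch (assuming numpy as np, with a made-up 2x2 window) of what that mapping does:
import numpy as np

window = np.array([[1.0, 5.0],
                   [3.0, 2.0]])           # a 2x2 pooling window
max_index = np.argmax(window)             # argmax flattens the window first -> 1
row, col = max_index // window.shape[1], max_index % window.shape[1]
print(row, col, window[row, col])         # 0 1 5.0 -- the position of the maximum
# np.unravel_index(max_index, window.shape) does the same conversion in one call.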
This part lets us see how much slower our hand-written convolution is than the provided fast implementation.
Note!!! I modified the contents of this cell (for my local setup). If you are running on Google Colab, no changes are needed; just leave the cell as it is.
Note!!!!! I am running this locally in PyCharm, i.e. in a local environment, and when I ran it locally it threw the following error:
Compiling im2col_cython.pyx because it changed.
[1/1] Cythonizing im2col_cython.pyx
E:\anaconda\envs\mlDev\lib\site-packages\Cython\Compiler\Main.py:381: FutureWarning: Cython directive 'language_level' not set, using '3str' for now (Py3). This has changed from earlier releases! File: E:\LeeZed\project\Python\AI\cs231n\assignment2\cs231n\im2col_cython.pyx
tree = Parsing.p_module(s, pxd, full_module_name)
Error compiling Cython file:
------------------------------------------------------------
...
cdef int N = x.shape[0]
cdef int C = x.shape[1]
cdef int H = x.shape[2]
cdef int W = x.shape[3]
cdef int HH = (H + 2 * padding - field_height) / stride + 1
^
------------------------------------------------------------
im2col_cython.pyx:19:60: Cannot assign type 'double' to 'int'
Error compiling Cython file:
------------------------------------------------------------
...
cdef int C = x.shape[1]
cdef int H = x.shape[2]
cdef int W = x.shape[3]
cdef int HH = (H + 2 * padding - field_height) / stride + 1
cdef int WW = (W + 2 * padding - field_width) / stride + 1
^
------------------------------------------------------------
im2col_cython.pyx:20:59: Cannot assign type 'double' to 'int'
Error compiling Cython file:
------------------------------------------------------------
...
def col2im_cython(np.ndarray[DTYPE_t, ndim=2] cols, int N, int C, int H, int W,
int field_height, int field_width, int padding, int stride):
cdef np.ndarray x = np.empty((N, C, H, W), dtype=cols.dtype)
cdef int HH = (H + 2 * padding - field_height) / stride + 1
^
------------------------------------------------------------
im2col_cython.pyx:60:60: Cannot assign type 'double' to 'int'
Error compiling Cython file:
------------------------------------------------------------
...
def col2im_cython(np.ndarray[DTYPE_t, ndim=2] cols, int N, int C, int H, int W,
int field_height, int field_width, int padding, int stride):
cdef np.ndarray x = np.empty((N, C, H, W), dtype=cols.dtype)
cdef int HH = (H + 2 * padding - field_height) / stride + 1
cdef int WW = (W + 2 * padding - field_width) / stride + 1
^
------------------------------------------------------------
im2col_cython.pyx:61:59: Cannot assign type 'double' to 'int'
Error compiling Cython file:
------------------------------------------------------------
...
def col2im_6d_cython(np.ndarray[DTYPE_t, ndim=6] cols, int N, int C, int H, int W,
int HH, int WW, int pad, int stride):
cdef np.ndarray x = np.empty((N, C, H, W), dtype=cols.dtype)
cdef int out_h = (H + 2 * pad - HH) / stride + 1
^
------------------------------------------------------------
im2col_cython.pyx:112:49: Cannot assign type 'double' to 'int'
Error compiling Cython file:
------------------------------------------------------------
...
def col2im_6d_cython(np.ndarray[DTYPE_t, ndim=6] cols, int N, int C, int H, int W,
int HH, int WW, int pad, int stride):
cdef np.ndarray x = np.empty((N, C, H, W), dtype=cols.dtype)
cdef int out_h = (H + 2 * pad - HH) / stride + 1
cdef int out_w = (W + 2 * pad - WW) / stride + 1
^
------------------------------------------------------------
im2col_cython.pyx:113:49: Cannot assign type 'double' to 'int'
Traceback (most recent call last):
File "E:\LeeZed\project\Python\AI\cs231n\assignment2\cs231n\setup.py", line 12, in
setup(ext_modules=cythonize(extensions),)
File "E:\anaconda\envs\mlDev\lib\site-packages\Cython\Build\Dependencies.py", line 1134, in cythonize
cythonize_one(*args)
File "E:\anaconda\envs\mlDev\lib\site-packages\Cython\Build\Dependencies.py", line 1301, in cythonize_one
raise CompileError(None, pyx_file)
Cython.Compiler.Errors.CompileError: im2col_cython.pyx
After some searching, I found the cause: newer Cython compiles with Python 3 semantics by default (hence the 'language_level' warning above), under which `/` is true division and produces a double, which cannot be assigned to a `cdef int`. In any case, the fix is simply to edit the file im2col_cython.pyx:
Add the following at the very top of the file:
#cython: language_level=2
Then re-run that cell and it compiles fine.
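For reference, after the fix the very top of im2col_cython.pyx looks roughly like this (a sketch; only the first line is the change I made, the rest is the file's existing header as I remember it). Re-running the cell just rebuilds the extension, which locally is essentially `python setup.py build_ext --inplace` inside the cs231n directory.
#cython: language_level=2
import numpy as np
cimport numpy as np
cimport cython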
Damn, it really is that much faster.
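If you want to quantify it yourself, a rough timing sketch like the one below (my own, not from the notebook) compares the naive loops against the provided im2col-based version; it assumes the cs231n package is importable and the Cython extension built successfully.
import time
import numpy as np
from cs231n.fast_layers import conv_forward_fast

x = np.random.randn(16, 3, 32, 32)
w = np.random.randn(8, 3, 3, 3)
b = np.random.randn(8)
conv_param = {"stride": 1, "pad": 1}

t0 = time.time()
out_naive, _ = conv_forward_naive(x, w, b, conv_param)
t1 = time.time()
out_fast, _ = conv_forward_fast(x, w, b, conv_param)
t2 = time.time()

print("naive: %.3fs, fast: %.3fs, speedup: %.1fx" % (t1 - t0, t2 - t1, (t1 - t0) / (t2 - t1)))
print("max abs difference:", np.max(np.abs(out_naive - out_fast)))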
Now let's complete the __init__ and loss functions. With the experience from the earlier assignments, this is familiar territory.
Just work out for yourself the dimensions (i.e. the sizes) of each layer's weights at init time; there is nothing tricky here.
def __init__(
self,
input_dim=(3, 32, 32),
num_filters=32,
filter_size=7,
hidden_dim=100,
num_classes=10,
weight_scale=1e-3,
reg=0.0,
dtype=np.float32,
):
"""
Initialize a new network.
Inputs:
- input_dim: Tuple (C, H, W) giving size of input data
- num_filters: Number of filters to use in the convolutional layer
- filter_size: Width/height of filters to use in the convolutional layer
- hidden_dim: Number of units to use in the fully-connected hidden layer
- num_classes: Number of scores to produce from the final affine layer.
- weight_scale: Scalar giving standard deviation for random initialization
of weights.
- reg: Scalar giving L2 regularization strength
- dtype: numpy datatype to use for computation.
"""
self.params = {}
self.reg = reg
self.dtype = dtype
############################################################################
# TODO: Initialize weights and biases for the three-layer convolutional #
# network. Weights should be initialized from a Gaussian centered at 0.0 #
# with standard deviation equal to weight_scale; biases should be #
# initialized to zero. All weights and biases should be stored in the #
# dictionary self.params. Store weights and biases for the convolutional #
# layer using the keys 'W1' and 'b1'; use keys 'W2' and 'b2' for the #
# weights and biases of the hidden affine layer, and keys 'W3' and 'b3' #
# for the weights and biases of the output affine layer. #
# #
# IMPORTANT: For this assignment, you can assume that the padding #
# and stride of the first convolutional layer are chosen so that #
# **the width and height of the input are preserved**. Take a look at #
# the start of the loss() function to see how that happens. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# conv - relu - 2x2 max pool - affine - relu - affine - softmax
    C, H, W = input_dim  # channels, height and width of the input
    # Convolutional layer: num_filters filters of size C x filter_size x filter_size
    self.params["W1"] = np.random.normal(0, weight_scale, (num_filters, C, filter_size, filter_size))
    self.params["b1"] = np.zeros(num_filters)
    # Hidden affine layer: the 2x2 max pool halves H and W, hence num_filters * (H/2) * (W/2) inputs
    self.params["W2"] = np.random.normal(0, weight_scale, (num_filters * H * W // 4, hidden_dim))
    self.params["b2"] = np.zeros(hidden_dim)
    # Output affine layer
    self.params["W3"] = np.random.normal(0, weight_scale, (hidden_dim, num_classes))
    self.params["b3"] = np.zeros(num_classes)
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
for k, v in self.params.items():
self.params[k] = v.astype(dtype)
def loss(self, X, y=None):
"""
Evaluate loss and gradient for the three-layer convolutional network.
Input / output: Same API as TwoLayerNet in fc_net.py.
"""
W1, b1 = self.params["W1"], self.params["b1"]
W2, b2 = self.params["W2"], self.params["b2"]
W3, b3 = self.params["W3"], self.params["b3"]
# pass conv_param to the forward pass for the convolutional layer
# Padding and stride chosen to preserve the input spatial size
filter_size = W1.shape[2]
conv_param = {"stride": 1, "pad": (filter_size - 1) // 2}
# pass pool_param to the forward pass for the max-pooling layer
pool_param = {"pool_height": 2, "pool_width": 2, "stride": 2}
scores = None
############################################################################
# TODO: Implement the forward pass for the three-layer convolutional net, #
# computing the class scores for X and storing them in the scores #
# variable. #
# #
# Remember you can use the functions defined in cs231n/fast_layers.py and #
# cs231n/layer_utils.py in your implementation (already imported). #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# conv - relu - 2x2 max pool - affine - relu - affine - softmax
    out1, cache1 = conv_relu_pool_forward(X, W1, b1, conv_param, pool_param)  # conv - relu - pool
    out2, cache2 = affine_relu_forward(out1, W2, b2)  # hidden affine - relu
    scores, cache3 = affine_forward(out2, W3, b3)  # output affine
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
if y is None:
return scores
loss, grads = 0, {}
############################################################################
# TODO: Implement the backward pass for the three-layer convolutional net, #
# storing the loss and gradients in the loss and grads variables. Compute #
# data loss using softmax, and make sure that grads[k] holds the gradients #
# for self.params[k]. Don't forget to add L2 regularization! #
# #
# NOTE: To ensure that your implementation matches ours and you pass the #
# automated tests, make sure that your L2 regularization includes a factor #
# of 0.5 to simplify the expression for the gradient. #
############################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Compute the loss
    loss, dout = softmax_loss(scores, y)
    loss += 0.5 * self.reg * (np.sum(W1 ** 2) + np.sum(W2 ** 2) + np.sum(W3 ** 2))  # L2 regularization
    # Compute the gradients by backpropagating in reverse order
    dout, grads["W3"], grads["b3"] = affine_backward(dout, cache3)  # output affine
    dout, grads["W2"], grads["b2"] = affine_relu_backward(dout, cache2)  # hidden affine - relu
    dout, grads["W1"], grads["b1"] = conv_relu_pool_backward(dout, cache1)  # conv - relu - pool
    # Add the gradient of the regularization term
    grads["W3"] += self.reg * W3
    grads["W2"] += self.reg * W2
    grads["W1"] += self.reg * W1
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
############################################################################
# END OF YOUR CODE #
############################################################################
return loss, grads
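A quick sanity check I find useful here (a minimal sketch of my own; it assumes the class is ThreeLayerConvNet in cs231n/classifiers/cnn.py as in the assignment, and that the fast layers from the previous section are available): with small weight_scale and no regularization, the initial softmax loss over 10 classes should be close to log(10) ≈ 2.3.
import numpy as np
from cs231n.classifiers.cnn import ThreeLayerConvNet

model = ThreeLayerConvNet(reg=0.0)
X = np.random.randn(50, 3, 32, 32)
y = np.random.randint(10, size=50)

loss, _ = model.loss(X, y)
print(loss)  # should be roughly log(10) ~= 2.303 before any training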
Now for spatial batch normalization. The assignment also tells us we can directly call the batchnorm_forward we already wrote, and the whole thing should take fewer than five lines of code.
Since the prompt says so, we don't need to over-think it.
The connection between spatial batch normalization and ordinary batch normalization is as follows.
In other words, we need to reshape the data into a 2D matrix of size (N * H * W, C). You can think of it this way:
1. Each row of the matrix is the C channel values (e.g. the RGB values) at one spatial position of one image.
2. Each column of the matrix is one channel's values across all positions of all images in the current batch.
Batchnorm then normalizes each column, which guarantees that the values normalized together come from the same distribution (the same channel), even though the amount of data per column is different from the ordinary case; a small sketch of the reshape follows.
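A small sketch (assuming numpy as np, with made-up sizes) of the reshape that turns (N, C, H, W) into an (N*H*W, C) matrix whose rows are per-pixel channel vectors:
import numpy as np

N, C, H, W = 2, 3, 4, 4
x = np.random.randn(N, C, H, W)

# Move the channel axis last, then flatten everything else: each row is one pixel's C values.
x2d = np.moveaxis(x, 1, -1).reshape(-1, C)       # shape (N*H*W, C) = (32, 3)
print(x2d.shape)
# Row 0 is the channel vector at image 0, position (0, 0):
print(np.allclose(x2d[0], x[0, :, 0, 0]))        # True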
def spatial_batchnorm_forward(x, gamma, beta, bn_param):
"""Computes the forward pass for spatial batch normalization.
Inputs:
- x: Input data of shape (N, C, H, W)
- gamma: Scale parameter, of shape (C,)
- beta: Shift parameter, of shape (C,)
- bn_param: Dictionary with the following keys:
- mode: 'train' or 'test'; required
- eps: Constant for numeric stability
- momentum: Constant for running mean / variance. momentum=0 means that
old information is discarded completely at every time step, while
momentum=1 means that new information is never incorporated. The
default of momentum=0.9 should work well in most situations.
- running_mean: Array of shape (D,) giving running mean of features
- running_var Array of shape (D,) giving running variance of features
Returns a tuple of:
- out: Output data, of shape (N, C, H, W)
- cache: Values needed for the backward pass
"""
out, cache = None, None
###########################################################################
# TODO: Implement the forward pass for spatial batch normalization. #
# #
# HINT: You can implement spatial batch normalization by calling the #
# vanilla version of batch normalization you implemented above. #
# Your implementation should be very short; ours is less than five lines. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    N, C, H, W = x.shape  # N samples, C channels, height H, width W
    x = np.moveaxis(x, 1, -1).reshape(-1, C)  # move the channel axis last, then flatten to (N*H*W, C)
    out, cache = batchnorm_forward(x, gamma, beta, bn_param)  # reuse the vanilla batchnorm
    out = np.moveaxis(out.reshape(N, H, W, C), -1, 1)  # reshape back to (N, H, W, C) and move C back to axis 1
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return out, cache
Once you understand the analysis of the forward pass, the backward pass is not hard.
def spatial_batchnorm_backward(dout, cache):
"""Computes the backward pass for spatial batch normalization.
Inputs:
- dout: Upstream derivatives, of shape (N, C, H, W)
- cache: Values from the forward pass
Returns a tuple of:
- dx: Gradient with respect to inputs, of shape (N, C, H, W)
- dgamma: Gradient with respect to scale parameter, of shape (C,)
- dbeta: Gradient with respect to shift parameter, of shape (C,)
"""
dx, dgamma, dbeta = None, None, None
###########################################################################
# TODO: Implement the backward pass for spatial batch normalization. #
# #
# HINT: You can implement spatial batch normalization by calling the #
# vanilla version of batch normalization you implemented above. #
# Your implementation should be very short; ours is less than five lines. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    N, C, H, W = dout.shape  # N samples, C channels, height H, width W
    dout = np.moveaxis(dout, 1, -1).reshape(-1, C)  # move the channel axis last, then flatten to (N*H*W, C)
    dx, dgamma, dbeta = batchnorm_backward(dout, cache)  # reuse the vanilla batchnorm backward
    dx = np.moveaxis(dx.reshape(N, H, W, C), -1, 1)  # reshape back to (N, H, W, C) and move C back to axis 1
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return dx, dgamma, dbeta
Here is a video that helped me understand this part.
def spatial_groupnorm_forward(x, gamma, beta, G, gn_param):
"""Computes the forward pass for spatial group normalization.
In contrast to layer normalization, group normalization splits each entry in the data into G
contiguous pieces, which it then normalizes independently. Per-feature shifting and scaling
are then applied to the data, in a manner identical to that of batch normalization and layer
normalization.
Inputs:
- x: Input data of shape (N, C, H, W)
- gamma: Scale parameter, of shape (1, C, 1, 1)
- beta: Shift parameter, of shape (1, C, 1, 1)
    - G: Integer number of groups to split into, should be a divisor of C
- gn_param: Dictionary with the following keys:
- eps: Constant for numeric stability
Returns a tuple of:
- out: Output data, of shape (N, C, H, W)
- cache: Values needed for the backward pass
"""
out, cache = None, None
eps = gn_param.get("eps", 1e-5)
###########################################################################
# TODO: Implement the forward pass for spatial group normalization. #
# This will be extremely similar to the layer norm implementation. #
# In particular, think about how you could transform the matrix so that #
# the bulk of the code is similar to both train-time batch normalization #
# and layer normalization! #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    N, C, H, W = x.shape  # N samples, C channels, height H, width W
    # Split the C channels into G groups of C // G channels each
    x = x.reshape(N, G, C // G, H, W)  # reshape to a 5D array
    x_mean = np.mean(x, axis=(2, 3, 4), keepdims=True)  # per-(sample, group) mean
    x_var = np.var(x, axis=(2, 3, 4), keepdims=True)  # per-(sample, group) variance
    x_norm = (x - x_mean) / np.sqrt(x_var + eps)  # normalize
    x_norm = x_norm.reshape(N, C, H, W)  # back to a 4D array
    out = gamma * x_norm + beta  # scale and shift
    cache = (x, x_norm, x_mean, x_var, gamma, beta, G, eps)  # cache for the backward pass
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return out, cache
You can either write the backward pass from scratch, or reshape the tensors into the form expected by the layer normalization we wrote earlier and call that function.
def spatial_groupnorm_backward(dout, cache):
"""Computes the backward pass for spatial group normalization.
Inputs:
- dout: Upstream derivatives, of shape (N, C, H, W)
- cache: Values from the forward pass
Returns a tuple of:
- dx: Gradient with respect to inputs, of shape (N, C, H, W)
- dgamma: Gradient with respect to scale parameter, of shape (1, C, 1, 1)
- dbeta: Gradient with respect to shift parameter, of shape (1, C, 1, 1)
"""
dx, dgamma, dbeta = None, None, None
###########################################################################
# TODO: Implement the backward pass for spatial group normalization. #
# This will be extremely similar to the layer norm implementation. #
###########################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    x, x_norm, x_mean, x_var, gamma, beta, G, eps = cache  # unpack the cache
    N, C, H, W = dout.shape  # N samples, C channels, height H, width W
    # dgamma and dbeta are summed over every axis except the channel axis
    dgamma = np.sum(dout * x_norm, axis=(0, 2, 3), keepdims=True)  # shape (1, C, 1, 1)
    dbeta = np.sum(dout, axis=(0, 2, 3), keepdims=True)  # shape (1, C, 1, 1)
    # Prepare the grouped view of the data
    x = x.reshape(N, G, C // G, H, W)  # x from the cache is already grouped; this reshape is a no-op for safety
    m = C // G * H * W  # number of elements normalized together in each group
    dx_norm = (dout * gamma).reshape(N, G, C // G, H, W)
    dx_var = np.sum(dx_norm * (x - x_mean) * (-0.5) * np.power((x_var + eps), -1.5), axis=(2, 3, 4), keepdims=True)
    dx_mean = np.sum(dx_norm * (-1) / np.sqrt(x_var + eps), axis=(2, 3, 4), keepdims=True) + dx_var * np.sum(-2 * (x - x_mean), axis=(2, 3, 4),
                                                                                                             keepdims=True) / m
    dx = dx_norm / np.sqrt(x_var + eps) + dx_var * 2 * (x - x_mean) / m + dx_mean / m
    dx = dx.reshape(N, C, H, W)
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
###########################################################################
# END OF YOUR CODE #
###########################################################################
return dx, dgamma, dbeta