X is a 4-D tensor of shape (m, n_H, n_W, n_C); zero_pad pads the n_H and n_W dimensions with zeros and returns X_pad.
import numpy as np

def zero_pad(X, pad):
    X_pad = np.pad( X, ((0, 0),(pad, pad),(pad, pad),(0, 0)), 'constant', constant_values=0 )
    return X_pad
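A quick shape check (a minimal sketch; the inputs are arbitrary random values):

X = np.random.randn(4, 3, 3, 2)    # 4 samples, 3x3 spatial, 2 channels
X_pad = zero_pad(X, 2)
print(X.shape, X_pad.shape)        # (4, 3, 3, 2) (4, 7, 7, 2)
print(X_pad[0, 0, :, 0])           # a border row of the padded tensor: all zeros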
Operation: Z = conv( A_prev, W ) + b
A_prev.shape = (m, n_H_prev, n_W_prev, n_C_prev)
W.shape = (f, f, n_C_prev, n_C)  # n_C filters in total
b.shape = (1, 1, 1, n_C)
Z.shape = (m, n_H, n_W, n_C)
def conv_forward( A_prev, W, b, hparameters ):
    # Retrieve the dimensions of each tensor
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (f, f, n_C_prev, n_C) = W.shape
    # Retrieve the hyperparameters
    stride = hparameters['stride']
    pad = hparameters['pad']
    # Compute the spatial dimensions of the output
    n_H = int( ( n_H_prev + 2 * pad - f ) / stride ) + 1
    n_W = int( ( n_W_prev + 2 * pad - f ) / stride ) + 1
    # Initialize the output volume
    Z = np.zeros( (m, n_H, n_W, n_C) )
    # Apply zero padding
    A_prev_temp = A_prev  # keep the unpadded input for backprop
    A_prev = zero_pad( A_prev, pad )
    for i in range(m):                # loop over the m samples
        for h in range(n_H):          # loop over every spatial position
            for w in range(n_W):
                for c in range(n_C):  # loop over the n_C filters
                    # Locate the slice of A_prev
                    h_start, w_start = h * stride, w * stride
                    h_end, w_end = h_start + f, w_start + f
                    A_prev_slice = A_prev[i, h_start:h_end, w_start:w_end, :]
                    # Perform the convolution:
                    # A_prev_slice and W[:, :, :, c] are both 3-D tensors;
                    # the bias is added once, outside the sum (summing
                    # A_prev_slice * W + b would add b f*f*n_C_prev times)
                    Z[i, h, w, c] = np.sum( A_prev_slice * W[:, :, :, c] ) + float( b[0, 0, 0, c] )
    cache = (A_prev_temp, W, b, hparameters)  # cached for backprop
    return Z, cache
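A shape sanity check (a minimal sketch; sizes and hyperparameters are arbitrary):

np.random.seed(1)
A_prev = np.random.randn(2, 5, 7, 4)
W = np.random.randn(3, 3, 4, 8)
b = np.random.randn(1, 1, 1, 8)
hparameters = {'pad': 1, 'stride': 2}
Z, cache = conv_forward(A_prev, W, b, hparameters)
print(Z.shape)  # (2, 3, 4, 8): n_H = (5 + 2 - 3)//2 + 1 = 3, n_W = (7 + 2 - 3)//2 + 1 = 4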
Operation: A = pool( A_prev )
A_prev.shape = (m, n_H_prev, n_W_prev, n_C_prev)
A.shape = (m, n_H, n_W, n_C)
Pooling hyperparameters: an f×f window and a stride.
Note: it helps to view pooling as Z = pool( A_prev ) (playing the role of Z = W * A_prev + b), followed by A = Z (the equivalent of a linear activation); this makes the backward pass easier to reason about.
def pool_forward( A_prev, hparameters, mode='max' ):
    # Retrieve the dimensions of A_prev
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    # Retrieve the hyperparameters
    f = hparameters['f']
    stride = hparameters['stride']
    # Compute the dimensions of the output
    n_H = int( ( n_H_prev - f ) / stride ) + 1
    n_W = int( ( n_W_prev - f ) / stride ) + 1
    n_C = n_C_prev
    # Initialize the output
    A = np.zeros( (m, n_H, n_W, n_C) )
    for i in range(m):                # loop over the m samples
        for h in range(n_H):          # loop over every spatial position
            for w in range(n_W):
                for c in range(n_C):  # loop over the n_C channels
                    # Locate the slice of A_prev
                    h_start, w_start = h * stride, w * stride
                    h_end, w_end = h_start + f, w_start + f
                    A_prev_slice = A_prev[i, h_start:h_end, w_start:w_end, c]
                    # Perform the pooling operation
                    if mode == 'max':
                        A[i, h, w, c] = np.max( A_prev_slice )
                    elif mode == 'average':
                        A[i, h, w, c] = np.mean( A_prev_slice )
    cache = (A_prev, hparameters)  # cached for backprop
    return A, cache
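Another minimal shape check (arbitrary inputs):

np.random.seed(1)
A_prev = np.random.randn(2, 4, 4, 3)
hparameters = {'f': 2, 'stride': 2}
A, cache = pool_forward(A_prev, hparameters, mode='max')
print(A.shape)  # (2, 2, 2, 3)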
[Tips] In the convolution, A_prev_slice = A_prev[i, h_start:h_end, w_start:w_end, :] selects all channels; in pooling, A_prev_slice = A_prev[i, h_start:h_end, w_start:w_end, c] selects only the current channel.
Key line of the forward pass: Z[i, h, w, c] = np.sum( A_prev_slice * W[:, :, :, c] ) + float( b[0, 0, 0, c] )
Key line of the backward pass: dA_prev_slice += W[:, :, :, c] * dZ[i, h, w, c]
Key lines for the parameter gradients:
dW[:, :, :, c] += A_prev_slice * dZ[i, h, w, c]
db[:, :, :, c] += dZ[i, h, w, c]
# Forward pass: Z = conv( A_prev, W ) + b
# Backward pass: given dZ, propagate back to dA_prev and compute the parameter gradients dW, db
def conv_backward( dZ, cache ):
    (A_prev, W, b, hparameters) = cache
    # Retrieve the dimensions of each tensor
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    (f, f, n_C_prev, n_C) = W.shape
    (m, n_H, n_W, n_C) = dZ.shape
    # Retrieve the hyperparameters
    stride = hparameters['stride']
    pad = hparameters['pad']
    # Initialize dA_prev, dW, db with the same shapes as A_prev, W, b
    dA_prev = np.zeros_like( A_prev )
    dW = np.zeros_like( W )
    db = np.zeros_like( b )
    # Apply zero padding
    A_prev = zero_pad( A_prev, pad )
    dA_prev = zero_pad( dA_prev, pad )
    for i in range(m):                # loop over the m samples
        for h in range(n_H):          # loop over every spatial position
            for w in range(n_W):
                for c in range(n_C):  # loop over the n_C filters
                    # Locate the slices of A_prev and dA_prev
                    h_start, w_start = h * stride, w * stride
                    h_end, w_end = h_start + f, w_start + f
                    A_prev_slice = A_prev[i, h_start:h_end, w_start:w_end, :]
                    # dA_prev_slice is a view into dA_prev, so += below updates dA_prev in place
                    dA_prev_slice = dA_prev[i, h_start:h_end, w_start:w_end, :]
                    # Backpropagate to the input (accumulated, since windows may overlap)
                    dA_prev_slice += W[:, :, :, c] * dZ[i, h, w, c]
                    # Accumulate the parameter gradients as well
                    dW[:, :, :, c] += A_prev_slice * dZ[i, h, w, c]
                    db[:, :, :, c] += dZ[i, h, w, c]
    # Remove the zero padding (assumes pad > 0)
    dA_prev = dA_prev[:, pad:-pad, pad:-pad, :]
    return dA_prev, dW, db
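A finite-difference check is a quick way to gain confidence in conv_backward. This is a minimal sketch (sizes, the perturbed entry, and eps are arbitrary) that compares the analytic db against a numerical estimate under the loss L = np.sum(Z):

np.random.seed(1)
A_prev = np.random.randn(2, 4, 4, 3)
W = np.random.randn(2, 2, 3, 4)
b = np.random.randn(1, 1, 1, 4)
hp = {'pad': 1, 'stride': 1}
Z, cache = conv_forward(A_prev, W, b, hp)
dZ = np.ones_like(Z)                   # dL/dZ for L = np.sum(Z)
_, dW, db = conv_backward(dZ, cache)
eps = 1e-6
b_plus = b.copy()
b_plus[0, 0, 0, 0] += eps              # perturb one bias entry
Z_plus, _ = conv_forward(A_prev, W, b_plus, hp)
db_num = (np.sum(Z_plus) - np.sum(Z)) / eps
print(db[0, 0, 0, 0], db_num)          # the two values should agree closely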
For max pooling, the backward pass only needs to remember which position the maximum came from, so we design a create_mask function that implements the following mapping:

$$A = \begin{bmatrix} 1 & 3 \\ 4 & 2 \end{bmatrix} \rightarrow M = \begin{bmatrix} 0 & 0 \\ 1 & 0 \end{bmatrix}$$
def create_mask( A ):
    mask = A == A.max()
    return mask
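Checking against the example above (a minimal sketch; NumPy prints the boolean mask as False/True):

A = np.array([[1, 3], [4, 2]])
print(create_mask(A))
# [[False False]
#  [ True False]]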
For average pooling, we need a matrix that splits 1 evenly; for a 2x2 pooling window, that is the 2x2 matrix

$$M = \begin{bmatrix} \frac{1}{4} & \frac{1}{4} \\ \frac{1}{4} & \frac{1}{4} \end{bmatrix}$$
def create_matrix( shape ):
    (n_H, n_W) = shape
    matrix = np.ones( shape ) / (n_H * n_W)
    return matrix
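And the corresponding check:

print(create_matrix((2, 2)))
# [[0.25 0.25]
#  [0.25 0.25]]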
Key lines of the forward pass:
A[i, h, w, c] = np.max( A_prev_slice )
A[i, h, w, c] = np.mean( A_prev_slice )
Key lines of the backward pass:
mask = create_mask( A_prev_slice )
dA_prev_slice += mask * dA[i, h, w, c]
matrix = create_matrix( (f, f) )
dA_prev_slice += matrix * dA[i, h, w, c]
# Forward pass: A = pool( A_prev )
# Backward pass: given dA, propagate back to dA_prev
def pool_backward( dA, cache, mode='max' ):
    (A_prev, hparameters) = cache
    # Retrieve the dimensions of each tensor
    (m, n_H, n_W, n_C) = dA.shape
    (m, n_H_prev, n_W_prev, n_C_prev) = A_prev.shape
    # Retrieve the hyperparameters
    stride = hparameters['stride']
    f = hparameters['f']
    # Initialize dA_prev
    dA_prev = np.zeros_like( A_prev )
    for i in range(m):                # loop over the m samples
        for h in range(n_H):          # loop over every spatial position
            for w in range(n_W):
                for c in range(n_C):  # loop over the n_C channels
                    # Locate the slices of A_prev and dA_prev
                    h_start, w_start = h * stride, w * stride
                    h_end, w_end = h_start + f, w_start + f
                    A_prev_slice = A_prev[i, h_start:h_end, w_start:w_end, c]
                    # dA_prev_slice is a view into dA_prev, so += below updates dA_prev in place
                    dA_prev_slice = dA_prev[i, h_start:h_end, w_start:w_end, c]
                    if mode == 'max':
                        # route the gradient to the position of the maximum
                        mask = create_mask( A_prev_slice )
                        dA_prev_slice += mask * dA[i, h, w, c]
                    elif mode == 'average':
                        # spread the gradient evenly over the f x f window
                        matrix = create_matrix( (f, f) )
                        dA_prev_slice += matrix * dA[i, h, w, c]
    return dA_prev
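As a final sanity check (a minimal sketch): with average pooling and non-overlapping windows (stride = f), distributing each gradient entry over its window preserves the total gradient:

np.random.seed(1)
A_prev = np.random.randn(2, 4, 4, 3)
hp = {'f': 2, 'stride': 2}
A, cache = pool_forward(A_prev, hp, mode='average')
dA = np.random.randn(*A.shape)
dA_prev = pool_backward(dA, cache, mode='average')
print(np.allclose(np.sum(dA_prev), np.sum(dA)))  # True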