For a single iteration:
input → compute the linear step and cache linear_cache → compute the activation and cache activation_cache → backward propagation → update the parameters
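Put into code, one such iteration looks like the short sketch below. It is only a sketch: X, Y and layer_dims are made-up placeholder data, and it assumes numpy, the functions implemented in the rest of this section, and the course's sigmoid/relu helpers (sketched further down) have all been loaded.

np.random.seed(1)
layer_dims = [5, 4, 1]                          # illustrative: 5 inputs, one hidden layer, 1 output
X = np.random.randn(5, 10)                      # 10 made-up training examples
Y = (np.random.rand(1, 10) > 0.5).astype(float) # made-up binary labels

parameters = initialize_parameters_deep(layer_dims)       # done once, before iterating
AL, caches = L_model_forward(X, parameters)               # forward pass, caching linear/activation values
cost = compute_cost(AL, Y)                                # cross-entropy cost
grads = L_model_backward(AL, Y, caches)                   # backward pass
parameters = update_parameters(parameters, grads, learning_rate=0.0075)  # gradient-descent step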
import numpy as np

# GRADED FUNCTION: initialize_parameters_deep
def initialize_parameters_deep(layer_dims):
"""
Arguments:
layer_dims -- python array (list) containing the dimensions of each layer in our network
Returns:
parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
Wl -- weight matrix of shape (layer_dims[l], layer_dims[l-1])
bl -- bias vector of shape (layer_dims[l], 1)
"""
np.random.seed(3)
parameters = {}
L = len(layer_dims) # number of layers in the network
for l in range(1, L):
### START CODE HERE ### (≈ 2 lines of code)
parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 0.01
parameters['b' + str(l)] = np.zeros((layer_dims[l],1))
### END CODE HERE ###
assert(parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l-1]))
assert(parameters['b' + str(l)].shape == (layer_dims[l], 1))
return parameters
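A quick check of the shapes this produces (the three-layer dimensions here are just an example):

parameters = initialize_parameters_deep([5, 4, 3])
for key in ["W1", "b1", "W2", "b2"]:
    print(key, parameters[key].shape)
# W1 (4, 5)   b1 (4, 1)   W2 (3, 4)   b2 (3, 1)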
# GRADED FUNCTION: linear_forward
def linear_forward(A, W, b):
"""
    Implement the linear part of a layer's forward propagation.
    Arguments:
    A -- activations from the previous layer (or the input data)
    W -- weights matrix of the current layer
    b -- bias vector of the current layer
    Returns:
    Z -- the input of the activation function, also called the pre-activation parameter
    cache -- a python tuple containing "A", "W" and "b"; stored for computing the backward pass efficiently
"""
### START CODE HERE ### (≈ 1 line of code)
Z = np.dot(W,A)+b
### END CODE HERE ###
assert(Z.shape == (W.shape[0], A.shape[1]))
cache = (A, W, b)
return Z, cache
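For example, with a made-up layer of 3 units fed by 5 units and a batch of 4 examples, the shape assertion above corresponds to:

np.random.seed(1)
A_prev = np.random.randn(5, 4)     # 5 units in the previous layer, 4 examples
W = np.random.randn(3, 5)          # 3 units in the current layer
b = np.zeros((3, 1))
Z, linear_cache = linear_forward(A_prev, W, b)
print(Z.shape)                     # (3, 4): one pre-activation row per unit, one column per example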
# GRADED FUNCTION: linear_activation_forward
def linear_activation_forward(A_prev, W, b, activation):
"""
Implement the forward propagation for the LINEAR->ACTIVATION layer
Arguments:
A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
b -- bias vector, numpy array of shape (size of the current layer, 1)
activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"
Returns:
A -- the output of the activation function, also called the post-activation value
cache -- a python tuple containing "linear_cache" and "activation_cache";
stored for computing the backward pass efficiently
"""
if activation == "sigmoid":
# Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
### START CODE HERE ### (≈ 2 lines of code)
Z, linear_cache = linear_forward(A_prev,W,b)
A, activation_cache = sigmoid(Z)
### END CODE HERE ###
elif activation == "relu":
# Inputs: "A_prev, W, b". Outputs: "A, activation_cache".
### START CODE HERE ### (≈ 2 lines of code)
Z, linear_cache = linear_forward(A_prev,W,b)
A, activation_cache = relu(Z)
### END CODE HERE ###
assert (A.shape == (W.shape[0], A_prev.shape[1]))
cache = (linear_cache, activation_cache)
return A, cache
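The sigmoid and relu functions called here are not defined in this notebook; the course supplies them in a helper file. A minimal sketch consistent with how they are used above (each returns the activation together with Z as its activation_cache) would be:

import numpy as np

def sigmoid(Z):
    # Sigmoid activation; caches Z for the backward pass
    A = 1 / (1 + np.exp(-Z))
    cache = Z
    return A, cache

def relu(Z):
    # ReLU activation; caches Z for the backward pass
    A = np.maximum(0, Z)
    cache = Z
    return A, cache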
# GRADED FUNCTION: L_model_forward
def L_model_forward(X, parameters):
"""
    Implement forward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID model:
    the first L-1 layers use the relu activation and layer L uses sigmoid; every layer stores its cache.
    Returns:
    AL -- the activation value of the last layer
    caches -- list of the L caches, indexed 0 to L-1
"""
caches = []
A = X
L = len(parameters) // 2 # number of layers in the neural network
    # Compute and cache the activations of the first L-1 layers; range(1, L) does not include layer L
for l in range(1, L):
A_prev = A
###---------- START CODE HERE -------------### (≈ 2 lines of code)
A, cache = linear_activation_forward(A_prev, parameters['W'+str(l)], parameters['b'+str(l)], activation = 'relu')
caches.append(cache)
###---------- END CODE HERE ---------- ###
    # Compute and cache the sigmoid activation of layer L
### ----------START CODE HERE---------- ### (≈ 2 lines of code)
AL, cache = linear_activation_forward(A, parameters['W'+str(L)], parameters['b'+str(L)], activation = 'sigmoid')
caches.append(cache)
### ----------END CODE HERE---------- ###
assert(AL.shape == (1,X.shape[1]))
return AL, caches
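A quick smoke test with a made-up 2-layer architecture (assuming the sigmoid/relu helpers sketched above):

np.random.seed(1)
parameters = initialize_parameters_deep([4, 3, 1])
X = np.random.randn(4, 6)                  # 6 examples
AL, caches = L_model_forward(X, parameters)
print(AL.shape, len(caches))               # (1, 6) 2  -- one cache per layer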
With forward propagation in place, the next pieces are the cost and backward propagation. Compute the cost first, because I want to know whether the model is actually learning.
Compute the cross-entropy cost $J$, using the following formula:
$$J = -\frac{1}{m} \sum\limits_{i = 1}^{m} \left( y^{(i)}\log\left(a^{[L](i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right) \right) \tag{7}$$
# GRADED FUNCTION: compute_cost
def compute_cost(AL, Y):
m = Y.shape[1]
cost = -1/m*np.sum(Y*np.log(AL)+(1-Y)*np.log(1-AL))
    cost = np.squeeze(cost)    # make sure cost ends up as a plain scalar (e.g. turns [[17]] into 17)
assert(cost.shape == ())
return cost
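A tiny check with hand-picked numbers:

Y  = np.array([[1, 0, 1]])
AL = np.array([[0.8, 0.1, 0.6]])
print(compute_cost(AL, Y))    # ≈ 0.2798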
Just as with forward propagation, you will implement helper functions for backward propagation. Remember that backpropagation is used to compute the gradients of the loss function with respect to the parameters.
- Backward propagation is split into two functions. The first is the Linear backward building block: it takes dZ and the cache as input and outputs dA_prev, dW, db.
- The second is the Linear-Activation backward function: by wrapping Linear backward, it computes dA_prev, dW, db for each layer.
From the forward functions above we know the cache comes in two parts:
--activation_cache: stores each layer's pre-activation value Z
--purpose: used in backpropagation to compute dZ
--linear_cache: stores each layer's A_prev, W, b
--purpose: used in backpropagation to compute dA_prev, dW, db
$$dZ^{[l]} = dA^{[l]} * g'(Z^{[l]}) \tag{11}$$
For the linear part of a layer l's backward pass, the input is dZ and the outputs are dW, db, and dA_prev.
The three outputs $(dW^{[l]}, db^{[l]}, dA^{[l-1]})$ are computed using the input $dZ^{[l]}$. Here are the formulas you need:
$$dW^{[l]} = \frac{\partial \mathcal{J} }{\partial W^{[l]}} = \frac{1}{m} dZ^{[l]} A^{[l-1] T} \tag{8}$$
$$db^{[l]} = \frac{\partial \mathcal{J} }{\partial b^{[l]}} = \frac{1}{m} \sum_{i = 1}^{m} dZ^{[l](i)} \tag{9}$$
$$dA^{[l-1]} = \frac{\partial \mathcal{L} }{\partial A^{[l-1]}} = W^{[l] T} dZ^{[l]} \tag{10}$$
# GRADED FUNCTION: linear_backward
def linear_backward(dZ, cache):
"""
Implement the linear portion of backward propagation for a single layer (layer l)
Arguments:
dZ -- Gradient of the cost with respect to the linear output (of current layer l)
cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer
Returns:
dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
dW -- Gradient of the cost with respect to W (current layer l), same shape as W
db -- Gradient of the cost with respect to b (current layer l), same shape as b
"""
A_prev, W, b = cache
m = A_prev.shape[1]
### START CODE HERE ### (≈ 3 lines of code)
dW = 1/m*np.dot(dZ,A_prev.T)
db = 1/m*np.sum(dZ,axis = 1,keepdims = True)
dA_prev = np.dot(W.T,dZ)
### END CODE HERE ###
assert (dA_prev.shape == A_prev.shape)
assert (dW.shape == W.shape)
assert (db.shape == b.shape)
return dA_prev, dW, db
# GRADED FUNCTION: linear_activation_backward
def linear_activation_backward(dA, cache, activation):
"""
Implement the backward propagation for the LINEAR->ACTIVATION layer.
Arguments:
dA -- post-activation gradient for current layer l
cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"
Returns:
dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
dW -- Gradient of the cost with respect to W (current layer l), same shape as W
db -- Gradient of the cost with respect to b (current layer l), same shape as b
"""
linear_cache, activation_cache = cache
if activation == "relu":
### START CODE HERE ### (≈ 2 lines of code)
dZ = relu_backward(dA, activation_cache)
dA_prev, dW, db = linear_backward(dZ, linear_cache)
### END CODE HERE ###
elif activation == "sigmoid":
### START CODE HERE ### (≈ 2 lines of code)
dZ = sigmoid_backward(dA, activation_cache)
dA_prev, dW, db = linear_backward(dZ, linear_cache)
### END CODE HERE ###
return dA_prev, dW, db
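Like sigmoid and relu, the relu_backward and sigmoid_backward helpers come from the course's helper file. A minimal sketch that matches equation (11), $dZ^{[l]} = dA^{[l]} * g'(Z^{[l]})$, and the way they are called above would be:

import numpy as np

def relu_backward(dA, activation_cache):
    # g'(Z) is 1 where Z > 0 and 0 elsewhere
    Z = activation_cache
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0
    return dZ

def sigmoid_backward(dA, activation_cache):
    # g'(Z) = s * (1 - s), with s = sigmoid(Z)
    Z = activation_cache
    s = 1 / (1 + np.exp(-Z))
    dZ = dA * s * (1 - s)
    return dZ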
Now you will implement the backward function for the whole network.
Recall that when you implemented the L_model_forward function, at each iteration you stored a cache containing (X, W, b and Z). In the backpropagation module you will use these variables to compute the gradients.
Therefore, in the L_model_backward function, you will iterate backward through all the hidden layers, starting from layer L. At each step, you will use the cached values of layer l to backpropagate through layer l (the original notebook's Figure 5 illustrates this backward pass).
Propagation idea (recall that $Z^{[l]} = W^{[l]} A^{[l-1]} + b^{[l]}$):
For the last layer, layer L:
1. First compute dAL.
2. Then use dAL and the cache to compute the gradients dA, dW, db.
Then for layers L-1 down to 1:
1. Use dA[l] and the cache of layer l to compute the gradients dA[l-1], dW[l], db[l].
Tips:
A for loop such as for l in range(L)
does not include L itself, because Python indexing starts at 0,
which is what the function below relies on.
## Explanation: range excludes its right endpoint
## for l in range(L) gives l = 0 to L-1, i.e. all layers
## for l in range(L-1) gives l = 0 to L-2, i.e. every layer except the last
## To compute the gradients of the second-to-last layer (layer L-1), we need the dA already produced by the last layer (layer L).
## For example, with L = 5: dAL = dA5; layer 5 uses dA5 to compute dA[L-1] = dA4, the input of layer 4; layer 4 then uses dA4 to compute dA3, and so on.
## In the loop for l in reversed(range(L-1)), with L-1 = 4, the value 4 itself is excluded, so l actually takes the values 3, 2, 1, 0 (see the quick check after the snippet below); caches[l] is therefore the cache of layer l+1, which is why its gradients dW and db are stored with the index l + 1 while dA_prev is stored under l.
for l in reversed(range(L-1)):
    # e.g. with L-1 = 4, l takes the values 3, 2, 1, 0 (4 itself is excluded), so the layer being processed is layer l+1
current_cache = caches[l]
dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads['dA'+str(l+1)],current_cache,activation = 'relu')
grads["dA" + str(l)] = dA_prev_temp
grads["dW" + str(l + 1)] = dW_temp
grads["db" + str(l + 1)] = db_temp
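A quick check of those loop indices (L = 5 is purely illustrative):

L = 5
print(list(reversed(range(L - 1))))    # [3, 2, 1, 0]
# the loop therefore reads caches[3] ... caches[0] (layers 4 down to 1)
# and stores their gradients under the keys dW4/db4 ... dW1/db1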
The complete function is below:
# GRADED FUNCTION: L_model_backward
def L_model_backward(AL, Y, caches):
"""
Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group
Arguments:
AL -- probability vector, output of the forward propagation (L_model_forward())
Y -- true "label" vector (containing 0 if non-cat, 1 if cat)
caches -- list of caches containing:
every cache of linear_activation_forward() with "relu" (it's caches[l], for l in range(L-1) i.e l = 0...L-2)
the cache of linear_activation_forward() with "sigmoid" (it's caches[L-1])
Returns:
grads -- A dictionary with the gradients
grads["dA" + str(l)] = ...
grads["dW" + str(l)] = ...
grads["db" + str(l)] = ...
"""
grads = {}
L = len(caches) # the number of layers
m = AL.shape[1]
Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL
# Initializing the backpropagation
### START CODE HERE ### (1 line of code)
dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
### END CODE HERE ###
# Lth layer (SIGMOID -> LINEAR) gradients. Inputs: "dAL, current_cache". Outputs: "grads["dAL-1"], grads["dWL"], grads["dbL"]
### START CODE HERE ### (approx. 2 lines)
current_cache = caches[L-1]
grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation = 'sigmoid')
### END CODE HERE ###
# Loop from l=L-2 to l=0
for l in reversed(range(L-1)):
# lth layer: (RELU -> LINEAR) gradients.
# Inputs: "grads["dA" + str(l + 1)], current_cache". Outputs: "grads["dA" + str(l)] , grads["dW" + str(l + 1)] , grads["db" + str(l + 1)]
### START CODE HERE ### (approx. 5 lines)
current_cache = caches[l]
dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads['dA'+str(l+1)],current_cache,activation = 'relu')
grads["dA" + str(l)] = dA_prev_temp
grads["dW" + str(l + 1)] = dW_temp
grads["db" + str(l + 1)] = db_temp
### END CODE HERE ###
return grads
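Finally, each parameter is updated with one step of gradient descent; the rule implemented by update_parameters below is:
$$W^{[l]} = W^{[l]} - \alpha \, dW^{[l]}$$
$$b^{[l]} = b^{[l]} - \alpha \, db^{[l]}$$
where $\alpha$ is the learning rate.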
# GRADED FUNCTION: update_parameters
def update_parameters(parameters, grads, learning_rate):
"""
Update parameters using gradient descent
Arguments:
parameters -- python dictionary containing your parameters
    grads -- python dictionary containing your gradients, output of L_model_backward
    learning_rate -- the learning rate, a scalar
Returns:
parameters -- python dictionary containing your updated parameters
parameters["W" + str(l)] = ...
parameters["b" + str(l)] = ...
"""
L = len(parameters) // 2 # number of layers in the neural network
# Update rule for each parameter. Use a for loop.
### START CODE HERE ### (≈ 3 lines of code)
for l in range(L):
parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate*grads['dW'+str(l+1)]
parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate*grads['db'+str(l+1)]
### END CODE HERE ###
return parameters
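Putting every helper together, a training loop could look like the sketch below. It is not one of the graded functions: the name, default learning rate and iteration count are illustrative, and the sigmoid/relu helpers sketched earlier are assumed to be available.

def L_layer_model_sketch(X, Y, layer_dims, learning_rate=0.0075, num_iterations=2500):
    # Repeats the single-iteration pipeline from the top of this section:
    # forward -> cost -> backward -> update, printing the cost as it (hopefully) decreases.
    np.random.seed(1)
    parameters = initialize_parameters_deep(layer_dims)
    for i in range(num_iterations):
        AL, caches = L_model_forward(X, parameters)
        cost = compute_cost(AL, Y)
        grads = L_model_backward(AL, Y, caches)
        parameters = update_parameters(parameters, grads, learning_rate)
        if i % 100 == 0:
            print("Cost after iteration %i: %f" % (i, cost))
    return parameters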