Table of contents link: Andrew Ng Deep Learning Study Notes Index
1. Outline of the Assignment
2. Initialization
3. Forward Propagation
4. Backward Propagation
5. L-layer Model
6. Training and Predicting
Note: this assignment follows the exercise Building your Deep Neural Network: Step by Step.
1. Outline of the Assignment
Packages:
dnn_utils: functions for computing the activations and their derivatives
testCases: test arrays used to verify that the functions work correctly
import numpy as np
import h5py
import matplotlib.pyplot as plt
from testCases import *
from dnn_utils import sigmoid, sigmoid_backward, relu, relu_backward
# %matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# %load_ext autoreload
# %autoreload 2
np.random.seed(1)
Steps for implementing the deep neural network:
(1) Initialize the parameters of the 2-layer and L-layer network architectures.
(2) Implement forward propagation:
①Implement the linear part of the forward pass to obtain Z[L] (L denotes layer L).
②Combine the linear step and the activation function into a single [Linear->Activation] unit.
③Stack this unit according to the architecture to build the hidden layers, then append a final [Linear->Sigmoid] to produce the output of the forward pass.
(3) Compute the loss.
(4) Implement backward propagation:
①Implement the linear part of the backward pass.
②Compute the gradient of the activation function.
③Combine ① and ② into a single backward unit.
④Stack ③ L-1 times (the ReLU layers) plus one sigmoid backward step to complete the backward pass.
(5) Update the parameters.
Every Z and A computed during the forward pass is cached for the backward pass, which avoids recomputation. Here L denotes the L-th layer, m the number of samples, and nh[L] the number of units in layer L; the backward pass itself is summarized in the gradients table below.
Matrix dimensions during the computation:
matrix | A[L], dA[L] | W[L], dW[L] | Z[L], dZ[L] |
---|---|---|---|
dim | (nh[L], m) | (nh[L], nh[L-1]) | (nh[L], m) |
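As a quick sanity check of the dimension table, here is a minimal sketch with hypothetical sizes (4 units in layer l, 5 units in layer l-1, 3 samples) that builds random matrices and confirms the shape of Z:
import numpy as np
# hypothetical sizes: nh[l] = 4, nh[l-1] = 5, m = 3
n_l, n_prev, m = 4, 5, 3
A_prev = np.random.randn(n_prev, m)   # A[l-1]: (nh[l-1], m)
W = np.random.randn(n_l, n_prev)      # W[l]:   (nh[l], nh[l-1])
b = np.zeros((n_l, 1))                # b[l]:   (nh[l], 1)
Z = np.dot(W, A_prev) + b             # b is broadcast across the m columns
print(Z.shape)                        # (4, 3) = (nh[l], m)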
Computing the gradients (the backward pass starts from dA[L] = -(Y/A[L] - (1-Y)/(1-A[L]))):
layers | gradients | back propagate |
---|---|---|
L | dZ[L] = dA[L]·σ'(Z[L]), dW[L] = (1/m)·dZ[L]·A[L-1].T, db[L] = (1/m)·Σ dZ[L] | dA[L-1] = W[L].T·dZ[L] |
L-1 | dZ[L-1] = dA[L-1]·relu'(Z[L-1]), dW[L-1] = (1/m)·dZ[L-1]·A[L-2].T, db[L-1] = (1/m)·Σ dZ[L-1] | dA[L-2] = W[L-1].T·dZ[L-1] |
··· | ··· | ··· |
1 | dZ[1] = dA[1]·relu'(Z[1]), dW[1] = (1/m)·dZ[1]·A[0].T, db[1] = (1/m)·Σ dZ[1] | dA[0] = W[1].T·dZ[1] |
2. Initialization
We need two functions: one to initialize the parameters of the 2-layer model and one for the L-layer model.
2.1 Two-layer model
np.random.randn() returns samples drawn from the standard normal distribution. Multiplying by a small constant such as 0.01 shrinks the initial weights, which speeds up convergence and avoids numerical problems (the cost evaluated to nan/inf when this scaling was omitted). For more on parameter initialization see 神经网络参数初始化的学问 and 神经网络中参数的初始化方法.
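A minimal sketch of the numerical problem mentioned above (hypothetical values): when the weights are too large, the pre-activation Z gets large, the sigmoid saturates to exactly 1.0 in float64, and the cross-entropy then evaluates log(0):
z = np.array([[40.0]])        # a large pre-activation, as produced by unscaled weights
a = 1 / (1 + np.exp(-z))      # sigmoid saturates to 1.0 in float64
print(a)                      # [[1.]]
print(np.log(1 - a))          # [[-inf]] -> the cost becomes inf/nan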
def initialize_params(n_x,n_h,n_y):
"""
    params:
        n_x: size of the input layer
        n_h: number of units in the hidden layer
        n_y: size of the output layer
    return: a dict containing
        W1: weight matrix mapping the input layer to the hidden layer, dim = [n_h, n_x]
        b1: bias vector of the hidden layer, dim = [n_h, 1]
        W2: weight matrix mapping the hidden layer to the output layer, dim = [n_y, n_h]
        b2: bias vector of the output layer, dim = [n_y, 1]
"""
    W1 = np.random.randn(n_h,n_x) * 0.01
b1 = np.zeros((n_h,1))
W2 = np.random.randn(n_y,n_h) * 0.01
b2 = np.zeros((n_y,1))
assert(W1.shape == ( n_h , n_x ))
assert(b1.shape == ( n_h , 1 ))
assert(W2.shape == ( n_y , n_h ))
assert(b2.shape == ( n_y , 1 ))
params = {
"W1": W1,
"b1": b1,
"W2": W2,
"b2": b2,
}
return params
"""
params = initialize_params(2,2,1)
Output
W1: [ 0.01744812, -0.00761207]
[ 0.00319039, -0.0024937 ]
b1: [0.]
[0.]
W2: [ 0.01462108, -0.02060141]
b2: [0.]
"""
2.2 L-layer model
def initialize_params_deep(layer_dims):
"""
    params:
        layer_dims: a list containing the number of units in each layer
    return: a dict of weights and biases
"""
np.random.seed(1)
layers = len(layer_dims)
params = {}
for layer in range(1,layers):
params["W" + str(layer)] = np.random.randn(layer_dims[layer],layer_dims[layer - 1]) * np.sqrt(2/layer_dims[layer-1])
params["b" + str(layer)] = np.zeros((layer_dims[layer],1))
assert(params["W" + str(layer)].shape == (layer_dims[layer],layer_dims[layer - 1]))
assert(params["b" + str(layer)].shape == (layer_dims[layer],1))
return params
"""
params = initialize_params_deep([4,3,2,1])
Output
{'W1': array([[ 1.14858562, -0.43257711, -0.37347383, -0.75870339],
[ 0.6119356 , -1.62743362, 1.23376823, -0.53825456],
[ 0.22559471, -0.17633148, 1.03386644, -1.45673947]]),
'b1': array([[0.],
[0.],
[0.]]),
'W2': array([[-0.26325254, -0.31357907, 0.92571887],
[-0.89805746, -0.14078704, -0.7167684 ]]),
'b2': array([[0.],
[0.]]),
'W3': array([[0.04221375, 0.58281521]]),
'b3': array([[0.]])}
"""
3. Forward Propagation
Because the computation is vectorized, the network structure is (L-1) layers of [Linear -> ReLU] followed by a final Linear -> Sigmoid (binary classification). The linear step is Z[L] = W[L]·A[L-1] + b[L].
3.1 Linear Forward
Computes Z and caches the previous layer's A together with the current layer's W and b.
def linear_forword(previous_A,W,b):
"""
params:
previous_A:values of previous layer(input data is A[0]), dim = [num_units of previous layer,num of samples]
W:weights matrix of current layer,dim = [num_units of current layer,num_units of previous layer]
b:bias vector,dim = [num_units of current layer,1]
return:
        Z: input of the activation, dim = [num_units of current layer, num of samples]
        cache: a tuple (previous_A, W, b), stored for computing the backward pass
"""
Z = np.dot(W,previous_A)+b
assert(Z.shape == (W.shape[0],previous_A.shape[1]))
cache = (previous_A,W,b)
return Z,cache
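A quick usage sketch of linear_forword with hypothetical sizes (3 units in the previous layer, 1 unit in the current layer, 2 samples):
np.random.seed(2)
A_prev = np.random.randn(3, 2)
W = np.random.randn(1, 3)
b = np.zeros((1, 1))
Z, cache = linear_forword(A_prev, W, b)
print(Z.shape)    # (1, 2)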
3.2 Activation Forward
Computes the activation value A of the current layer and caches the current layer's Z, W, b and the previous layer's A.
def activation_forwaor(previous_A,W,b,activation):
"""
params:
previous_A:values of previous layer(input data is A[0]), dim = [num_units of previous layer,num of samples]
W:weights matrix of current layer,dim = [num_units of current layer,num_units of previous layer]
b:bias vector,dim = [num_units of current layer,1]
activation:str "sigmoid" or "relu"
return:
A:values of current layer , dim = [num_units of current layer,num of samples]
        cache: a tuple containing the linear_cache and activation_cache of the current layer
->activation_cache stores current Z
->linear_cache stores "previous_A","current_W","current_b"
"""
if activation == "sigmoid":
Z,linear_cache = linear_forword(previous_A,W,b)
A,activation_cache = sigmoid(Z)
elif activation == "relu":
Z,linear_cache = linear_forword(previous_A,W,b)
A,activation_cache = relu(Z)
assert (A.shape == (W.shape[0],previous_A.shape[1]))
cache = (linear_cache,activation_cache)
return A,cache
3.3 Forward Propagation
def forword_propagate(X,params):
"""
params:
X:dim = [num of features,num of samples]
params:output of initialize_params_deep() containing W,b
return:
A_L:the last activation value
        caches: a list of caches:
            -> the cache of each [Linear -> ReLU] layer (L-1 of them, indices 0 to L-2)
            -> the cache of the final [Linear -> Sigmoid] layer (index L-1)
"""
caches = []
A = X
    L = len(params) // 2  # params stores one W and one b per layer
for layer in range(1,L):
A_pre = A
A,cache = activation_forwaor(A_pre,
params["W" + str(layer)],
params["b" + str(layer)],
activation = "relu")
caches.append(cache)
A_L,cache = activation_forwaor(A,
params["W" + str(L)],
params["b" + str(L)],
activation = "sigmoid")
caches.append(cache)
assert(A_L.shape == (1,X.shape[1]))
return A_L,caches
The caches list has the form:
[((A0, W1, b1), (Z1)),
((A1, W2, b2), (Z2)),
···]
where A, W, b and Z are all np.array.
3.4 Cost function
def cost(A_L,Y):
"""
    params:
        A_L: probability vector from the forward pass, dim = [1, m]
        Y: true label vector, dim = [1, m]
    return: cross-entropy cost
"""
m = Y.shape[1]
    cost = (-1 / m) * np.sum(np.multiply(Y, np.log(A_L)) + np.multiply(1 - Y, np.log(1 - A_L)))
cost = np.squeeze(cost)
assert(cost.shape == ())
return cost
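The cost computed above is the cross-entropy over the m training examples:
$$J = -\frac{1}{m}\sum_{i=1}^{m}\Big[y^{(i)}\log a^{[L](i)} + \big(1-y^{(i)}\big)\log\big(1-a^{[L](i)}\big)\Big]$$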
4. Backward Propagation
4.1 Linear Backward
Given dZ[L] of the current layer, the linear part of the backward pass computes dW[L] = (1/m)·dZ[L]·A[L-1].T, db[L] = (1/m)·Σ dZ[L] (summed over the samples), and dA[L-1] = W[L].T·dZ[L]:
def linear_backword(dZ,cache):
"""
params:
dZ: current layer dL/dZ
cache: a tuple -> (previous_A,W,b)
return:
dA_pre,dW,db
"""
previous_A,W,b = cache
m = previous_A.shape[1]
dW = (1 / m) * np.dot(dZ,previous_A.T)
db = (1 / m) * np.sum(dZ,axis = 1,keepdims = True)
dA_pre = np.dot(W.T,dZ)
assert(dA_pre.shape == previous_A.shape)
assert(dW.shape == W.shape)
return dA_pre,dW,db
4.2 Activation Backward
Computes dZ from dA, then calls linear_backword to return dA_pre, dW, db:
def activation_backword(dA,cache,activation):
"""
params:
dA: current layer dL/dA
cache: a tuple -> (linear_cache, activation_cache)
activation:str -> "relu" or "sigmoid"
return:
dA_pre,dW,db
"""
linear_cache,activation_cache = cache
if activation == "relu":
dZ = relu_backward(dA,activation_cache)
elif activation == "sigmoid":
dZ = sigmoid_backward(dA,activation_cache)
dA_pre,dW,db = linear_backword(dZ,linear_cache)
return dA_pre,dW,db
4.3 Backward Propagation
Because the output layer uses a different activation function than the hidden layers, it is handled separately.
def backword_propagate(A_L,Y,caches):
"""
    params:
        A_L: the last activation value (probability vector), dim = [1, m]
        Y: true label vector, dim = [1, m]
        caches: the list of caches from forword_propagate():
            -> the cache of each [Linear -> ReLU] layer (L-1 of them, indices 0 to L-2)
            -> the cache of the final [Linear -> Sigmoid] layer (index L-1)
return:
gradients:a dict contains dA, dW, db
"""
grads = {}
L = len(caches)
m = A_L.shape[1]
Y = Y.reshape(A_L.shape)
dA_L = -(np.divide(Y,A_L) - np.divide(1 - Y, 1 - A_L))
current_cache = caches[-1]
grads["dA" + str(L - 1)], grads["dW" + str(L)], grads["db" + str(L)] = activation_backword(dA_L,current_cache,activation = "sigmoid")
for layer in reversed(range(L - 1)):
current_cache = caches[layer]
dA_prev_temp, dW_temp, db_temp = activation_backword(grads["dA" + str(layer + 1)],current_cache,activation = "relu")
grads["dA" + str(layer)] = dA_prev_temp
grads["dW" + str(layer + 1)] = dW_temp
grads["db" + str(layer + 1)] = db_temp
return grads
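A minimal sketch of what the returned dictionary contains, using a hypothetical [2, 3, 1] architecture (2 input features, one hidden layer, 2 samples) and the functions defined above:
np.random.seed(3)
X = np.random.randn(2, 2)
Y = np.array([[1, 0]])
params = initialize_params_deep([2, 3, 1])
A_L, caches = forword_propagate(X, params)
grads = backword_propagate(A_L, Y, caches)
print(sorted(grads.keys()))
# ['dA0', 'dA1', 'dW1', 'dW2', 'db1', 'db2'] -> dW/db of each layer plus dA of the layer below it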
4.4 Optimize Parameters
def optimize_params(params,grads,learning_rate = 1e-3):
L = len(params) // 2
for layer in range(L):
params["W" + str(layer + 1)] = params["W" + str(layer + 1)] - learning_rate * grads["dW" + str(layer + 1)]
params["b" + str(layer + 1)] = params["b" + str(layer + 1)] - learning_rate * grads["db" + str(layer + 1)]
return params
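The update applied above is plain gradient descent with learning rate α:
$$W^{[l]} := W^{[l]} - \alpha\, dW^{[l]}, \qquad b^{[l]} := b^{[l]} - \alpha\, db^{[l]}$$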
5. L-layer Model
5.1 Model
① In the input layer_dims, layer_dims[0] is the number of features of X and layer_dims[-1] = 1 is the output layer. initialize_params_deep() creates len(layer_dims) - 1 weight matrices W, one per hidden layer plus the output layer; e.g. for layer_dims = [4,3,3,1] there are 3 such layers, giving W1, W2, W3.
② The caches collected by forword_propagate() are:
((A0, W1, b1), (Z1))
((A1, W2, b2), (Z2))
······
((A[L-1], W[L], b[L]), (Z[L])), where L = len(layer_dims) - 1,
and A_L is returned as the final output.
③ Each step of backword_propagate() yields dW and db of the current layer and dA of the previous layer.
def model(X,Y,layer_dims,learning_rate = 1e-3,epochs = 10):
np.random.seed(1)
costs = []
params = initialize_params_deep(layer_dims)
for epoch in range(epochs):
A_L, caches = forword_propagate(X,params)
loss = cost(A_L,Y)
grads = backword_propagate(A_L,Y,caches)
        params = optimize_params(params,grads,learning_rate=learning_rate)
costs.append(loss)
if epoch % 100 == 0:
print("epoch: %d, cost: %3.3f" % (epoch,loss))
plt.plot(np.squeeze(costs))
plt.ylabel('cost')
plt.xlabel('epochs')
plt.title("Learning rate =" + str(learning_rate))
plt.show()
return params
5.2 Predict
def predict(X,params):
m = X.shape[1]
predictions = np.zeros((1,m))
probs,caches = forword_propagate(X,params)
for i in range(probs.shape[1]):
predictions[0,i] = 1 if probs[0,i] >0.5 else 0
assert(predictions.shape == (1,m))
return predictions
6. Training and Predicting
train_dataset = h5py.File("./input/train_catvnoncat.h5","r")
train_dataset_x = np.array(train_dataset['train_set_x'][:])
train_dataset_y = np.array(train_dataset['train_set_y'][:])
test_dataset = h5py.File("./input/test_catvnoncat.h5","r")
test_dataset_x = np.array(test_dataset['test_set_x'][:])
test_dataset_y = np.array(test_dataset['test_set_y'][:])
train_x = train_dataset_x.reshape(train_dataset_x.shape[0],-1).T
train_y = train_dataset_y.reshape(train_dataset_y.shape[0],-1).T
test_x = test_dataset_x.reshape(test_dataset_x.shape[0],-1).T
test_y = test_dataset_y.reshape(test_dataset_y.shape[0],-1).T
print(train_x.shape,train_y.shape,test_x.shape,test_y.shape)
train_x = train_x / 255
test_x = test_x / 255
p = model(train_x,train_y,[12288,20,8,6,1],1e-2,2000)
y_pred_train = predict(train_x,p)
print("train_acc: %3.3f" % (1 - np.mean(np.abs(y_pred_train - train_y))))
y_pred_test = predict(test_x,p)
print("test_acc: %3.3f" % (1 - np.mean(np.abs(y_pred_test - test_y))))