Contents link: Andrew Ng Deep Learning Study Notes (table of contents)
1.Initialization
2.Regularization
3.Gradient Checking
Note: this post follows the course assignments Initialization, Regularization, and Gradient Checking.
1.Initialization
Neural networks are prone to vanishing and exploding gradients, and the weight initialization scheme has a large influence on this.
Before running the experiments, import the required packages and build the neural network model:
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
from init_utils import sigmoid, relu, compute_loss, forward_propagation, backward_propagation
from init_utils import update_parameters, predict, load_dataset, plot_decision_boundary, predict_dec
# %matplotlib inline
plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# load image dataset: blue/red dots in circles
train_X, train_Y, test_X, test_Y = load_dataset(is_plot=True)
def model(X,Y,learning_rate=1e-3,epochs=1000,initialization="he"):
"""
    params:
        X: input data, shape (number of features, number of samples)
        Y: labels, shape (number of classes, number of samples)
        initialization: "zeros", "random" or "he"; selects the parameter initialization scheme
    return:
        params: dict of the trained W and b matrices
"""
costs = []
gradients = {}
m = X.shape[1]
layer_dims = [X.shape[0],10,5,1]
if initialization == "zeros":
params = initialize_params_zeros(layer_dims)
elif initialization =="random":
params = initialize_params_random(layer_dims)
elif initialization == "he":
params = initialize_params_he(layer_dims)
for epoch in range(epochs):
AL,cache = forward_propagation(X,params)
cost = compute_loss(AL,Y)
grads = backward_propagation(X,Y,cache)
params = update_parameters(params,grads,learning_rate)
if epoch % 100 == 0:
print("epoch: %d, loss: %3.3f" %(epoch,cost))
costs.append(cost)
plt.plot(costs)
plt.ylabel('cost')
    plt.xlabel('epochs (per 100)')
plt.title(str(initialization) + " initialization")
return params
(1) Zero initialization
Conclusion: the weights never update (they stay at 0) and the loss does not decrease. Reason: initializing every weight to 0 fails to break symmetry, so all neurons in a layer compute the same function and the network effectively has a single neuron per layer. Moreover, because W appears in the backward-propagation chain, the hidden-layer dZ (and therefore every dW) come out as 0, so the weights receive zero updates. A minimal symmetry sketch follows the decision-boundary plot below.
def initialize_params_zeros(layer_dims):
"""
params:
a list of num of units for each layer, layer_dims[0]= n_x (num of features),layer_dims[-1]=classes (output)
return:
a dict of matrix containing:W1,W2,...,b1,b2
WL:dims=(layer_dims[L],layer_dims[L-1])
bL:dims=(layer_dims[L],1)
"""
params = {}
L = len(layer_dims)
for layer in range(1,L):
params["W"+str(layer)] = np.zeros((layer_dims[layer],layer_dims[layer-1]))
params["b"+str(layer)] = np.zeros((layer_dims[layer],1))
return params
params = model(train_X,train_Y,1e-3,1000,"zeros")
print(params)
"""
Output:
epoch: 0, loss: 0.693
epoch: 100, loss: 0.693
epoch: 200, loss: 0.693
epoch: 300, loss: 0.693
epoch: 400, loss: 0.693
epoch: 500, loss: 0.693
epoch: 600, loss: 0.693
epoch: 700, loss: 0.693
epoch: 800, loss: 0.693
epoch: 900, loss: 0.693
W1: [[0., 0.],[0., 0.], [0., 0.],[0., 0.],[0., 0.],[0., 0.],[0., 0.], [0., 0.],[0., 0.],[0., 0.]]
b1:[[0.],[0.],[0.], [0.],[0.],[0.],[0.],[0.],[0.],[0.]]
...
"""
plt.title("Zero initialization")
axes = plt.gca()
axes.set_xlim([-1.5,1.5])
axes.set_ylim([-1.5,1.5])
plot_decision_boundary(lambda x : predict_dec(params, x.T), train_X, np.squeeze(train_Y))
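To make the symmetry argument concrete, here is a minimal toy sketch (toy numbers, not part of the assignment code): with all-zero weights, every hidden unit receives the same input and produces the same activation, so gradient descent can never make the units differ.
import numpy as np
X_toy = np.array([[1.0, 2.0],
                  [3.0, 4.0]])                     # 2 features, 2 samples
W1_toy, b1_toy = np.zeros((3, 2)), np.zeros((3, 1))  # 3 identical hidden units
A1_toy = np.maximum(0, np.dot(W1_toy, X_toy) + b1_toy)  # ReLU: every row is identical (all zeros)
print(A1_toy)                                      # [[0. 0.] [0. 0.] [0. 0.]]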
(2) Random initialization with large weights
Conclusion: symmetry is broken and the loss does eventually fall, but it stays at infinity for many epochs and the classification result is poor. The infinite loss arises because, when W is too large, the pre-activation Z fed into the output sigmoid is huge, so the activation A saturates to (numerically) 0 or 1, and log(0) in the cross-entropy yields infinity. The loss also has not converged, so many more iterations would be needed. A small saturation sketch follows the output below.
def initialize_params_random(layer_dims):
params = {}
L = len(layer_dims)
for layer in range(1,L):
params["W"+str(layer)] = np.random.randn(layer_dims[layer],layer_dims[layer-1]) * 10
params["b"+str(layer)] = np.random.randn(layer_dims[layer],1)
return params
params = model(train_X,train_Y,1e-3,1000,"random")
print(params)
"""
Output:
epoch: 0, loss: inf
epoch: 100, loss: inf
epoch: 200, loss: inf
epoch: 300, loss: inf
epoch: 400, loss: inf
epoch: 500, loss: inf
epoch: 600, loss: inf
epoch: 700, loss: 1.672
epoch: 800, loss: 1.521
epoch: 900, loss: 1.379
W1:[[ 5.33056249, -11.8212424 ], [ 6.23367709, -1.00322052],[ 12.00468043, -14.69259439],[ 0.89135801, -1.80814575],[ 5.56556499, 4.97645911],[ 9.14892715, 3.45688351],[ 5.00728705, 2.93703616],[ 0.36286946, 2.89054346],[ 10.72640109, -11.3627793 ],[ 1.93136828, -4.8462639 ]]
...
"""
(3) He initialization
Conclusion: with ReLU activations, He initialization (scaling a standard Gaussian by sqrt(2 / n_prev)) gives much better results.
def initialize_params_he(layer_dims):
params = {}
L = len(layer_dims)
for layer in range(1,L):
params["W"+str(layer)] = np.random.randn(layer_dims[layer],layer_dims[layer-1]) * np.square(2 / layer_dims[layer-1])
params["b"+str(layer)] = np.zeros((layer_dims[layer],1))
return params
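He initialization draws each weight from a Gaussian scaled by sqrt(2 / n_prev), so the weights feeding layer l have variance of about 2 / layer_dims[l-1]. A quick illustrative sanity check of the scale (toy sizes, not the model above):
import numpy as np
n_prev = 1000
W_he = np.random.randn(10, n_prev) * np.sqrt(2 / n_prev)
print(W_he.var(), 2 / n_prev)   # both close to 0.002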
2.Regularization
(1) Imports and data loading
# import packages
import numpy as np
import matplotlib.pyplot as plt
from reg_utils import sigmoid, relu, plot_decision_boundary, initialize_parameters, load_2D_dataset, predict_dec
from reg_utils import compute_cost, predict, forward_propagation, backward_propagation, update_parameters
# %matplotlib inline
plt.rcParams['figure.figsize'] = (7.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
train_X, train_Y, test_X, test_Y = load_2D_dataset()
(2) Cost function with L2 regularization
def compute_cost_with_regularization(AL,Y,params,lambd):
m = Y.shape[1]
W1 = params["W1"]
W2 = params["W2"]
W3 = params["W3"]
cross_entropy_cost = compute_cost(AL,Y)
L2_cost = lambd * (np.sum(np.square(W1)) + np.sum(np.square(W2)) +np.sum(np.square(W3))) / (2 * m)
loss = cross_entropy_cost + L2_cost
return loss
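In formula form (restating what compute_cost_with_regularization implements), the regularized cost adds the scaled squared Frobenius norms of the weight matrices to the cross-entropy cost:
J_{reg} = -\frac{1}{m}\sum_{i=1}^{m}\Big(y^{(i)}\log a^{[L](i)} + \big(1 - y^{(i)}\big)\log\big(1 - a^{[L](i)}\big)\Big) + \frac{\lambda}{2m}\sum_{l=1}^{3}\big\lVert W^{[l]} \big\rVert_F^2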
(3) Forward propagation with dropout
def forward_propagation(X,params,keep_prob):
"""
    function: forward propagation for the 3-layer net with inverted dropout (keep_prob = 1 disables it)
    return:
        AL: output of the final sigmoid layer
        cache: tuple (Z1,Z2,Z3,A1,A2,AL,W1,W2,W3,b1,b2,b3,D1,D2) used by backward propagation
"""
np.random.seed(1)
W1 = params["W1"]
b1 = params["b1"]
W2 = params["W2"]
b2 = params["b2"]
W3 = params["W3"]
b3 = params["b3"]
Z1 = np.dot(W1,X) + b1
A1 = relu(Z1)
D1 = np.random.rand(A1.shape[0],A1.shape[1]) < keep_prob
A1 = A1 * D1 / keep_prob
Z2 = np.dot(W2,A1) + b2
A2 = relu(Z2)
D2 = np.random.rand(A2.shape[0],A2.shape[1]) < keep_prob
A2 = A2 * D2 / keep_prob
Z3 = np.dot(W3,A2) + b3
AL = sigmoid(Z3)
cache = (Z1,Z2,Z3,A1,A2,AL,W1,W2,W3,b1,b2,b3,D1,D2)
return AL,cache
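Dividing by keep_prob is the "inverted dropout" trick: it keeps the expected activation scale unchanged, so nothing needs rescaling at test time. A quick illustrative check with a toy tensor (names are ad hoc, not from the assignment):
import numpy as np
np.random.seed(0)
a_toy = np.ones((5, 100000))
kp = 0.8
d_toy = np.random.rand(*a_toy.shape) < kp     # dropout mask: keep each unit with prob kp
a_drop = a_toy * d_toy / kp                   # inverted dropout rescales the survivors
print(a_toy.mean(), a_drop.mean())            # both close to 1.0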
(4) Backward propagation (with L2 and dropout)
def backward_propagation(X,Y,cache,lambd,keep_prob):
m = X.shape[1]
Z1,Z2,Z3,A1,A2,AL,W1,W2,W3,b1,b2,b3,D1,D2 = cache
dZ3 = AL - Y
dW3 = (1 / m) * (np.dot(dZ3,A2.T) + (lambd * W3))
db3 = (1 / m) * np.sum(dZ3,axis=1,keepdims=True)
dA2 = np.dot(W3.T,dZ3)
dA2 = dA2 * D2 / keep_prob
dZ2 = np.multiply(dA2,np.int64(A2 > 0))
dW2 = (1 / m) * (np.dot(dZ2,A1.T) + (lambd * W2))
db2 = (1 / m) * np.sum(dZ2,axis=1,keepdims=True)
dA1 = np.dot(W2.T,dZ2)
dA1 = dA1 * D1 / keep_prob
dZ1 = np.multiply(dA1,np.int64(A1 > 0))
dW1 = (1 / m) * (np.dot(dZ1,X.T) + (lambd * W1))
db1 = (1 / m) * np.sum(dZ1,axis=1,keepdims=True)
grads = {"dZ3": dZ3, "dW3": dW3, "db3": db3,"dA2": dA2,
"dZ2": dZ2, "dW2": dW2, "db2": db2, "dA1": dA1,
"dZ1": dZ1, "dW1": dW1, "db1": db1}
return grads
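The only change L2 regularization makes to back propagation is the extra term in each dW (dropout only requires re-applying the masks D1, D2 as above), because the derivative of the penalty is:
\frac{\partial}{\partial W^{[l]}}\left(\frac{\lambda}{2m}\lVert W^{[l]}\rVert_F^2\right) = \frac{\lambda}{m}W^{[l]}
\quad\Longrightarrow\quad
dW^{[l]} = \frac{1}{m}\,dZ^{[l]}A^{[l-1]T} + \frac{\lambda}{m}W^{[l]}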
(5) Building the model
def model(X,Y,learning_rate = 0.3,epochs = 30000,lambd = 0,keep_prob = 1):
"""
params:
        lambd: L2 regularization strength (0 disables L2)
        keep_prob: probability of keeping a neuron active during dropout (1 disables dropout)
    return:
        params: dict of the trained W and b matrices
"""
grads = {}
costs = []
m = X.shape[1]
layer_dims = [X.shape[0],20,3,1]
# initialize params
params = initialize_parameters(layer_dims)
# gradients descent
for epoch in range(epochs):
AL,cache = forward_propagation(X,params,keep_prob)
cost = compute_cost_with_regularization(AL,Y,params,lambd)
grads = backward_propagation(X,Y,cache,lambd,keep_prob)
params = update_parameters(params,grads,learning_rate)
if epoch % 100 == 0:
print("epoch: %d, loss: %3.3f" % (epoch,cost))
costs.append(cost)
plt.rcParams['figure.figsize'] = (15.0, 4.0)
plt.subplot(1,2,1)
plt.plot(costs)
plt.ylabel('cost')
    plt.xlabel('epochs (per 100)')
plt.title("lambd: "+str(lambd)+" keep_prob: "+ str(keep_prob))
plt.subplot(1,2,2)
plt.title("lambd: "+str(lambd)+" keep_prob: "+ str(keep_prob))
axes = plt.gca()
axes.set_xlim([-0.75, 0.40])
axes.set_ylim([-0.75, 0.65])
plot_decision_boundary(lambda x: predict_dec(params, x.T), X, np.squeeze(Y))
return params
(6) Comparison
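A sketch of the three comparison runs; lambd=0.7 and keep_prob=0.86 are the values used in the corresponding Coursera assignment (an assumption here), and the exact curves and accuracies depend on the run:
# Sketch of the comparison (hyperparameter values assumed from the Coursera assignment)
params_baseline = model(train_X, train_Y)                   # no regularization
params_l2       = model(train_X, train_Y, lambd=0.7)        # L2 regularization
params_dropout  = model(train_X, train_Y, keep_prob=0.86)   # dropout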
3.Gradient Checking
(1) How does gradient checking work?
① Compute the gradient of every parameter with back propagation;
② Flatten all parameters W, b into a single vector; for each component in turn, add (and subtract) ε while holding the others fixed, run forward propagation to compute the loss, and form the numerical derivative;
③ Compare the numerical derivative with the back-propagation gradient (see the formulas below).
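In formulas, step ② uses a two-sided difference and step ③ uses a normalized distance:
grad\_approx = \frac{J(\theta + \varepsilon) - J(\theta - \varepsilon)}{2\varepsilon},
\qquad
difference = \frac{\lVert grad - grad\_approx \rVert_2}{\lVert grad \rVert_2 + \lVert grad\_approx \rVert_2}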
(2) 1-D gradient checking
Take the linear model J(θ) = θx, whose true gradient is dJ/dθ = x:
def forward_propagation(x,theta):
J = np.dot(theta,x)
return J
def backward_propagation(x,theta):
return x
# gradient checking
def gradient_checking(x,theta,epsilon = 1e-7):
theta_plus = theta + epsilon
theta_minus = theta - epsilon
J_plus = forward_propagation(x,theta_plus)
J_minus = forward_propagation(x,theta_minus)
grad_approx = (J_plus - J_minus) / (2 * epsilon)
grad = backward_propagation(x,theta)
#compute norm
numerator = np.linalg.norm(grad - grad_approx)
denominator = np.linalg.norm(grad) + np.linalg.norm(grad_approx)
difference = numerator / denominator
if difference < 1e-7:
print("correct")
else:
print("wrong")
return difference
x,theta = 2,4
dif = gradient_checking(x,theta)
print(dif)
"""
Output:
correct
2.919335883291695e-10
"""
(3) N-dimensional gradient checking
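The code below uses helper functions (dictionary_to_vector, vector_to_dictionary, gradients_to_vector) that are not defined in this post; in the Coursera assignment they ship with the provided utility modules, so presumably the imports look roughly like this:
# Presumed imports for this part (assumption: the assignment's gc_utils / testCases modules are available)
import numpy as np
from gc_utils import sigmoid, relu, dictionary_to_vector, vector_to_dictionary, gradients_to_vector
from testCases import gradient_check_n_test_case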
def forward_propagation_n(X,Y,params):
"""
    function: forward propagation for the 3-layer net, returning the cost for gradient checking
    return:
        cost: the cross-entropy cost
        cache: tuple (Z1,Z2,Z3,A1,A2,AL,W1,W2,W3,b1,b2,b3) used by backward propagation
"""
np.random.seed(1)
m = X.shape[1]
W1 = params["W1"]
b1 = params["b1"]
W2 = params["W2"]
b2 = params["b2"]
W3 = params["W3"]
b3 = params["b3"]
Z1 = np.dot(W1,X) + b1
A1 = relu(Z1)
Z2 = np.dot(W2,A1) + b2
A2 = relu(Z2)
Z3 = np.dot(W3,A2) + b3
AL = sigmoid(Z3)
    cost = (1 / m) * np.sum(np.multiply(-np.log(AL),Y) + np.multiply(-np.log(1 - AL), 1 - Y))
cache = (Z1,Z2,Z3,A1,A2,AL,W1,W2,W3,b1,b2,b3)
return cost,cache
def backward_propagation_n(X, Y, cache):
m = X.shape[1]
(Z1,Z2,Z3,A1,A2,AL,W1,W2,W3,b1,b2,b3) = cache
dZ3 = AL - Y
dW3 = (1 / m) * np.dot(dZ3, A2.T)
db3 = (1 / m) * np.sum(dZ3, axis=1, keepdims=True)
dA2 = np.dot(W3.T, dZ3)
dZ2 = np.multiply(dA2, np.int64(A2 > 0))
    dW2 = (1 / m) * np.dot(dZ2, A1.T) * 2  # deliberate bug kept from the assignment: the factor 2 should not be here
db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)
dA1 = np.dot(W2.T, dZ2)
dZ1 = np.multiply(dA1, np.int64(A1 > 0))
dW1 = (1 / m) * np.dot(dZ1, X.T)
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)
gradients = {"dZ3": dZ3, "dW3": dW3, "db3": db3,
"dA2": dA2, "dZ2": dZ2, "dW2": dW2, "db2": db2,
"dA1": dA1, "dZ1": dZ1, "dW1": dW1, "db1": db1}
return gradients
def gradient_checking_n(X,Y,params,grads,epsilon = 1e-7):
params_values,_ = dictionary_to_vector(params)
grad = gradients_to_vector(grads)
num_params = params_values.shape[0]
J_plus = np.zeros((num_params,1))
J_minus = np.zeros((num_params,1))
    grad_approx = np.zeros((num_params,1))
for i in range(num_params):
theta_plus = np.copy(params_values)
theta_plus[i][0] = theta_plus[i][0] + epsilon
J_plus[i],_ = forward_propagation_n(X,Y,vector_to_dictionary(theta_plus))
theta_minus = np.copy(params_values)
theta_minus[i][0] = theta_minus[i][0] - epsilon
J_minus[i],_ = forward_propagation_n(X,Y,vector_to_dictionary(theta_minus))
grad_approx[i] = (J_plus[i] - J_minus[i]) / (2 * epsilon)
numerator = np.linalg.norm(grad - grad_approx)
denominator = np.linalg.norm(grad) + np.linalg.norm(grad_approx)
difference = numerator / denominator
if difference < 1e-7:
print("correct")
    else:
        print("wrong")
    return difference