# A simple fully-connected neural network for basic binary classification.
# This file implements L2 regularization, dropout regularization, and the Adam
# optimization algorithm.
import math
import numpy as np
# Shuffle the dataset column-wise and split it into mini-batches of size
# mini_batch_size; the final batch holds the remainder and may be smaller.
def random_mini_batches(X, Y, mini_batch_size):
    """Return a list of (mini_batch_X, mini_batch_Y) tuples.

    X has shape (features, m) and Y has shape (labels, m); examples are
    columns, so shuffling and slicing both act on axis 1.
    """
    m = X.shape[1]
    order = list(np.random.permutation(m))
    X_shuf = X[:, order]
    Y_shuf = Y[:, order]
    batches = []
    num_full = m // mini_batch_size  # number of complete batches
    for k in range(num_full):
        lo = k * mini_batch_size
        hi = lo + mini_batch_size
        batches.append((X_shuf[:, lo:hi], Y_shuf[:, lo:hi]))
    # Leftover examples that don't fill a complete batch.
    if m % mini_batch_size:
        tail = num_full * mini_batch_size
        batches.append((X_shuf[:, tail:], Y_shuf[:, tail:]))
    return batches
# Parameter initialization.
def initialize_parameters(layer_dims):
    """Initialize weights and biases for each layer.

    Weights are drawn from a standard normal and scaled by 1/sqrt(fan_in)
    to mitigate vanishing/exploding gradients; biases start at zero.
    Returns a dict with keys 'W1', 'b1', ..., 'WL', 'bL'.
    """
    parameters = {}
    layer_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for layer, (fan_in, fan_out) in enumerate(layer_pairs, start=1):
        parameters['W%d' % layer] = np.random.randn(fan_out, fan_in) / np.sqrt(fan_in)
        parameters['b%d' % layer] = np.zeros((fan_out, 1))
    return parameters
def initialize_adam(parameters):
    """Create zero-initialized Adam moment estimates.

    Returns (v, s) where v holds the first-moment (mean) estimates and s the
    second-moment (uncentered variance) estimates, one array per gradient key
    ('dW1', 'db1', ...), each shaped like the matching parameter.
    """
    num_layers = len(parameters) // 2
    v, s = {}, {}
    for i in range(1, num_layers + 1):
        for prefix in ('W', 'b'):
            template = parameters[prefix + str(i)]
            v['d' + prefix + str(i)] = np.zeros_like(template)
            s['d' + prefix + str(i)] = np.zeros_like(template)
    return v, s
# One forward step through a single layer, with inverted dropout.
# 1 - keep_prob is the probability of dropping a unit; keep_prob is the
# probability of keeping it (keep_prob=1 disables dropout).
def forward(A_pre, W, b, activation, keep_prob):
    """Compute one layer's forward pass with inverted dropout.

    Args:
        A_pre: activations from the previous layer, shape (n_prev, m).
        W: weight matrix, shape (n, n_prev).
        b: bias column vector, shape (n, 1).
        activation: 'relu' or 'sigmoid'.
        keep_prob: probability of keeping each unit.

    Returns:
        A: post-activation output with the dropout mask applied, shape (n, m).
        Z: pre-activation linear output, shape (n, m).
        D: boolean dropout mask that was applied to A (needed for backprop).
    """
    Z = np.dot(W, A_pre) + b
    if activation == 'relu':
        A = np.maximum(0, Z)
    elif activation == 'sigmoid':
        A = 1 / (1 + np.exp(-Z))
    else:
        raise ValueError("activation must be 'relu' or 'sigmoid'")
    # BUG FIX: the original masked a zero array *before* computing the
    # activation and then overwrote it, so dropout never took effect.
    # Inverted dropout must zero units of the activation and rescale by
    # 1/keep_prob so the expected activation magnitude is unchanged.
    D = np.random.rand(A.shape[0], A.shape[1]) < keep_prob
    A = (A * D) / keep_prob
    return A, Z, D
# Full forward pass through the network.
def model_forward(X, parameters, keep_prob):
    """Run X through every layer: relu for hidden layers, sigmoid output.

    Caches each layer's A/Z/D (activation, pre-activation, dropout mask) so
    the backward pass can reuse them. The output layer never uses dropout,
    so it is computed with keep_prob=1.
    Returns (AL, cache) where AL is the final sigmoid output.
    """
    num_layers = len(parameters) // 2
    # Layer 0 entries: the input itself and an all-True mask placeholder.
    cache = {'A0': X, 'D0': np.ones(X.shape) > 0}
    A = X
    # Hidden layers 1..L-1 use relu with dropout.
    for layer in range(1, num_layers):
        A, Z, D = forward(A, parameters['W%d' % layer], parameters['b%d' % layer],
                          'relu', keep_prob)
        cache['A%d' % layer] = A
        cache['Z%d' % layer] = Z
        cache['D%d' % layer] = D
    # Output layer uses sigmoid and no dropout.
    AL, Z, D = forward(A, parameters['W%d' % num_layers], parameters['b%d' % num_layers],
                       'sigmoid', keep_prob=1)
    cache['A%d' % num_layers] = AL
    cache['Z%d' % num_layers] = Z
    cache['D%d' % num_layers] = D
    return AL, cache
# Cost computation.
def compute_cost(AL, Y, lambd, parameters, layer_num):
    """Cross-entropy cost plus an optional L2 weight penalty.

    Args:
        AL: network output (sigmoid activations), shape (1, m).
        Y: true labels in {0, 1}, shape (1, m).
        lambd: L2 regularization strength (0 disables it).
        parameters: dict holding 'W1'..'WL'.
        layer_num: number of layers L.

    Returns:
        Scalar cost.
    """
    m = AL.shape[1]
    # Binary cross-entropy averaged over the batch.
    cross_entropy_cost = -(1 / m) * np.sum(Y * np.log(AL) + (1 - Y) * np.log(1 - AL))
    # L2 penalty: sum of squared weights over ALL layers.
    # BUG FIX: the original loop ran range(1, layer_num) and silently skipped
    # the last layer's weights; a stray debug print was removed as well.
    L2_regularization_cost = 0.0
    for i in range(1, layer_num + 1):
        L2_regularization_cost += np.sum(np.square(parameters['W' + str(i)]))
    L2_regularization_cost *= lambd / (2 * m)
    return np.squeeze(cross_entropy_cost + L2_regularization_cost)
# One backward step through a single layer.
def back(dA, A, A_pre, activation, W, D, keep_prob, lambd):
    """Backpropagate through one layer, with L2 and inverted-dropout terms.

    dA is the gradient w.r.t. this layer's activation A; D is the dropout
    mask of the PREVIOUS layer, used to mask/rescale dA_pre on the way down.
    Returns (dA_pre, dW, db).
    """
    m = dA.shape[1]
    if activation == 'sigmoid':
        # sigmoid'(z) = a * (1 - a)
        dZ = dA * A * (1 - A)
    elif activation == 'relu':
        # relu' is 1 where the activation is positive, 0 elsewhere.
        dZ = np.array(dA, copy=True)
        dZ[A <= 0] = 0
    else:
        dZ = None
    # Weight gradient includes the L2 penalty's derivative (lambd/m) * W.
    dW = (1 / m) * np.dot(dZ, A_pre.T) + (lambd / m) * W
    db = (1 / m) * np.sum(dZ, axis=1).reshape(-1, 1)
    # Gradient into the previous layer, masked and rescaled to mirror the
    # inverted dropout applied in the forward pass.
    dA_pre = (np.dot(W.T, dZ) * D) / keep_prob
    return dA_pre, dW, db
# Full backward pass through the network.
def model_back(Y, cache, parameters, keep_prob, lambd, epsilon):
    """Compute gradients for every layer via backpropagation.

    Args:
        Y: true labels, shape (1, m).
        cache: per-layer A/Z/D values produced by model_forward.
        parameters: dict of W/b matrices.
        keep_prob: dropout keep probability used in the forward pass.
        lambd: L2 regularization strength.
        epsilon: small constant guarding against log/divide-by-zero.

    Returns:
        grads: dict with 'dW1', 'db1', ..., 'dWL', 'dbL'.
    """
    grads = {}
    layer_num = len(parameters) // 2
    AL = cache['A' + str(layer_num)]
    # Derivative of the cross-entropy loss w.r.t. AL.
    dA = -(np.divide(Y, AL + epsilon) - np.divide(1 - Y, 1 - (AL + epsilon)))
    # Output layer (sigmoid). Its own activations were not dropped, but the
    # gradient flowing into layer L-1 must still be masked by D[L-1] AND
    # rescaled by 1/keep_prob to mirror the inverted-dropout forward pass.
    # BUG FIX: the original passed keep_prob=1 here, so dA_pre was masked but
    # never rescaled, making the backward pass inconsistent with forward.
    dA_pre, dW, db = back(dA, AL, cache['A' + str(layer_num - 1)], 'sigmoid',
                          parameters['W' + str(layer_num)],
                          cache['D' + str(layer_num - 1)],
                          keep_prob=keep_prob, lambd=lambd)
    grads['dW' + str(layer_num)] = dW
    grads['db' + str(layer_num)] = db
    dA = dA_pre
    # Hidden layers (relu), from L-1 down to 1.
    for i in reversed(range(1, layer_num)):
        dA_pre, dW, db = back(dA, cache['A' + str(i)], cache['A' + str(i - 1)],
                              'relu', parameters['W' + str(i)],
                              cache['D' + str(i - 1)], keep_prob, lambd)
        grads['dW' + str(i)] = dW
        grads['db' + str(i)] = db
        dA = dA_pre
    return grads
# Plain (batch) gradient-descent parameter update.
def update_parameters(parameters, grads, learning_rate):
    """Apply one vanilla gradient-descent step to every W/b pair."""
    num_layers = len(parameters) // 2
    for i in range(1, num_layers + 1):
        for prefix in ('W', 'b'):
            key = prefix + str(i)
            parameters[key] = parameters[key] - learning_rate * grads['d' + key]
    return parameters
def update_parameters_with_adam(parameters, grads, learning_rate, beta1, beta2, v, s, t, epsilon):
    """Apply one Adam update step to every parameter.

    v accumulates an exponentially-weighted average of the gradients (first
    moment), s of the squared gradients (second moment). Both are
    bias-corrected by 1/(1 - beta^t) before the update, and epsilon guards
    the division by sqrt(s).
    Returns the updated (parameters, v, s).
    """
    num_layers = len(parameters) // 2
    v_corrected = {}
    s_corrected = {}
    # Bias-correction denominators depend only on the timestep t.
    bias1 = 1 - np.power(beta1, t)
    bias2 = 1 - np.power(beta2, t)
    for i in range(1, num_layers + 1):
        for prefix in ('W', 'b'):
            gkey = 'd' + prefix + str(i)
            pkey = prefix + str(i)
            # Exponentially-weighted moment estimates.
            v[gkey] = beta1 * v[gkey] + (1 - beta1) * grads[gkey]
            s[gkey] = beta2 * s[gkey] + (1 - beta2) * np.square(grads[gkey])
            # Bias correction.
            v_corrected[gkey] = v[gkey] / bias1
            s_corrected[gkey] = s[gkey] / bias2
            # Parameter update.
            step = v_corrected[gkey] / (np.sqrt(s_corrected[gkey]) + epsilon)
            parameters[pkey] = parameters[pkey] - learning_rate * step
    return parameters, v, s
# Model training.
# keep_prob is the dropout keep probability; keep_prob=1 disables dropout.
# lambd is the L2 regularization strength; lambd=0 disables L2.
def model(X, Y, layers_dims, mini_batch_size=64, learning_rate=0.0075, num_iterations=3000, keep_prob=1.0, lambd=0.0,
          beta1=0.9, beta2=0.999, epsilon=1e-8, print_cost=False):
    """Train the network with mini-batch Adam.

    Args:
        X: inputs, shape (features, m). Y: labels, shape (1, m).
        layers_dims: layer sizes, e.g. [n_x, n_h1, ..., 1].
        mini_batch_size: batch size (1 = SGD, m = full-batch GD).
        num_iterations: number of epochs over the data.
        print_cost: when True, print the cost every 1000 epochs.

    Returns:
        The trained parameters dict.
    """
    # Report which regularizers are active.
    if keep_prob < 1:
        print("使用了dropout正则化")
    if lambd > 0:
        print("使用了L2正则化")
    # Report which gradient-descent flavour is in use.
    if mini_batch_size == 1:
        print("使用了随机梯度下降")
    elif mini_batch_size == X.shape[1]:
        print("使用了梯度下降")
    else:
        print("使用了小批量梯度下降")
    parameters = initialize_parameters(layers_dims)
    v, s = initialize_adam(parameters)
    layer_num = len(parameters) // 2
    t = 0  # Adam timestep, drives the bias correction
    cost = None
    for i in range(num_iterations):
        # BUG FIX: reshuffle the data every epoch. The original shuffled once
        # before the loop, so every epoch visited identical mini-batches,
        # losing the stochasticity mini-batching is supposed to provide.
        mini_batches = random_mini_batches(X, Y, mini_batch_size)
        for mini_batch_X, mini_batch_Y in mini_batches:
            # Forward pass: AL is the prediction, cache holds intermediates.
            AL, cache = model_forward(mini_batch_X, parameters, keep_prob)
            cost = compute_cost(AL, mini_batch_Y, lambd, parameters, layer_num)
            # Backward pass and Adam update.
            grads = model_back(mini_batch_Y, cache, parameters, keep_prob, lambd, epsilon)
            t += 1
            parameters, v, s = update_parameters_with_adam(parameters, grads, learning_rate,
                                                           beta1, beta2, v, s, t, epsilon)
        # Print once per epoch (the original printed for every mini-batch).
        if print_cost and i % 1000 == 0:
            print("Cost after iteration %i: %f" % (i, cost))
    return parameters
# Prediction / evaluation.
def predict(X, Y, parameters):
    """Predict binary labels for X and report accuracy against Y.

    Dropout must be disabled at inference time, so the forward pass runs
    with keep_prob=1.0. Prints and returns the fraction of correct labels.
    """
    m = Y.shape[1]
    probs, _ = model_forward(X, parameters, keep_prob=1.0)
    # Threshold the sigmoid outputs at 0.5 to get hard 0/1 labels.
    labels = (probs > 0.5).astype(int)
    accuracy = np.sum(labels == Y) / m
    print("准确率为:", accuracy)
    return accuracy