简单神经网络代码实现

今天实现了一下简单的神经网络,可以用于简单的二分类任务。代码如下:
本文实现了L2正则化、dropout正则化、Adam优化算法

import math
import numpy as np

# 随机打乱数据集,并且按mini_batch_size的大小把数据集划分为多个batch
def random_mini_batches(X, Y, mini_batch_size):
    """Shuffle the dataset column-wise and split it into mini-batches.

    Args:
        X: input data of shape (features, m) — one example per column.
        Y: labels of shape (1, m), aligned with X's columns.
        mini_batch_size: number of examples per batch; the final batch is
            smaller when m is not a multiple of this size.

    Returns:
        List of (mini_batch_X, mini_batch_Y) tuples covering all m examples.
    """
    m = X.shape[1]
    # One shared permutation keeps each X column paired with its Y label.
    order = list(np.random.permutation(m))
    X_shuffled = X[:, order]
    Y_shuffled = Y[:, order]
    # Stepping by mini_batch_size naturally yields the trailing partial
    # batch (if any) as the last slice.
    return [
        (X_shuffled[:, start:start + mini_batch_size],
         Y_shuffled[:, start:start + mini_batch_size])
        for start in range(0, m, mini_batch_size)
    ]


# 初始化参数
def initialize_parameters(layer_dims):
    """Initialize weights and biases for every layer of the network.

    Args:
        layer_dims: list of layer sizes, input layer first.

    Returns:
        Dict mapping 'W1'..'WL' to weight matrices of shape
        (layer_dims[l], layer_dims[l-1]) and 'b1'..'bL' to zero column
        vectors of shape (layer_dims[l], 1).
    """
    parameters = {}
    # Pair consecutive sizes: (fan_in, fan_out) for layers 1..L-1.
    for idx, (fan_in, fan_out) in enumerate(zip(layer_dims[:-1], layer_dims[1:]), start=1):
        # Scaling by 1/sqrt(fan_in) keeps activations from exploding or
        # vanishing as depth grows.
        parameters['W' + str(idx)] = np.random.randn(fan_out, fan_in) / np.sqrt(fan_in)
        parameters['b' + str(idx)] = np.zeros((fan_out, 1))
    return parameters


def initialize_adam(parameters):
    """Create zero-filled Adam moment accumulators matching the parameters.

    Args:
        parameters: dict with 'W1'..'WL' and 'b1'..'bL' entries.

    Returns:
        Tuple (v, s): first-moment and second-moment dicts keyed
        'dW1'..'dWL' / 'db1'..'dbL', each zero array shaped like the
        corresponding parameter.
    """
    num_layers = len(parameters) // 2
    v, s = {}, {}
    for layer in range(1, num_layers + 1):
        for prefix in ('W', 'b'):
            key = prefix + str(layer)
            # Separate arrays for v and s — they must not share storage.
            v['d' + key] = np.zeros_like(parameters[key])
            s['d' + key] = np.zeros_like(parameters[key])
    return v, s


# 一次前向传导
# 带有dropout正则化,1-keep_prob表示随机关闭神经元的概率,keep_prob表示保留神经元的概率
# One forward step with inverted dropout; 1-keep_prob is the probability of
# silencing a neuron, keep_prob the probability of keeping it.
def forward(A_pre, W, b, activation, keep_prob):
    """One layer's forward step: linear -> activation -> inverted dropout.

    Bug fix: the original applied the dropout mask to a zeros array and then
    overwrote it with the activation, so dropout never affected the output
    (while the backward pass still used the mask). The mask is now applied
    AFTER the activation.

    Args:
        A_pre: activations of the previous layer, shape (n_prev, m).
        W: weight matrix of shape (n, n_prev).
        b: bias column vector of shape (n, 1).
        activation: 'relu' or 'sigmoid'.
        keep_prob: probability of keeping a unit; 1 disables dropout.

    Returns:
        (A, Z, D): post-dropout activation, pre-activation, and the boolean
        dropout mask (needed by the backward pass).
    """
    Z = np.dot(W, A_pre) + b
    if activation == 'relu':
        A = np.maximum(0, Z)
    elif activation == 'sigmoid':
        A = 1 / (1 + np.exp(-Z))
    else:
        raise ValueError("unsupported activation: " + str(activation))
    # Inverted dropout: zero out units with probability 1-keep_prob, then
    # divide by keep_prob so the expected activation stays unchanged.
    D = np.random.rand(A.shape[0], A.shape[1]) < keep_prob
    A = (A * D) / keep_prob
    return A, Z, D


# 向前传导
def model_forward(X, parameters, keep_prob):
    """Run the full forward pass through all layers.

    All hidden layers use ReLU with dropout; the output layer uses sigmoid
    with dropout disabled (keep_prob=1).

    Args:
        X: input data, shape (features, m).
        parameters: dict of 'W1'..'WL' / 'b1'..'bL'.
        keep_prob: dropout keep probability for the hidden layers.

    Returns:
        (AL, cache): final activations and a cache of A/Z/D per layer for
        the backward pass ('A0'/'D0' cover the input layer).
    """
    num_layers = len(parameters) // 2
    # Seed the cache with the input and an all-True mask for layer 0.
    cache = {'A0': X, 'D0': np.ones(X.shape) > 0}
    A = X
    # Hidden layers: ReLU + dropout.
    for layer in range(1, num_layers):
        A, Z, D = forward(A, parameters['W' + str(layer)],
                          parameters['b' + str(layer)], 'relu', keep_prob)
        cache['A' + str(layer)] = A
        cache['Z' + str(layer)] = Z
        cache['D' + str(layer)] = D
    # Output layer: sigmoid, and no dropout on the prediction itself.
    AL, Z, D = forward(A, parameters['W' + str(num_layers)],
                       parameters['b' + str(num_layers)], 'sigmoid',
                       keep_prob=1)
    cache['A' + str(num_layers)] = AL
    cache['Z' + str(num_layers)] = Z
    cache['D' + str(num_layers)] = D
    return AL, cache


# 计算损失
# Compute the cost (cross-entropy + optional L2 penalty)
def compute_cost(AL, Y, lambd, parameters, layer_num):
    """Binary cross-entropy cost with an optional L2 regularization term.

    Bug fixes: removed a stray debug print of the raw cross-entropy, and the
    L2 sum now includes ALL weight matrices — the original loop stopped at
    layer_num-1 and silently left the output layer unregularized.

    Args:
        AL: sigmoid outputs of the last layer, shape (1, m).
        Y: ground-truth labels, shape (1, m).
        lambd: L2 strength; 0 disables regularization.
        parameters: dict holding 'W1'..'W<layer_num>'.
        layer_num: number of weight layers.

    Returns:
        Scalar cost value.
    """
    m = AL.shape[1]
    # Cross-entropy loss averaged over the batch.
    cross_entropy_cost = -(1 / m) * np.sum(Y * np.log(AL) + (1 - Y) * np.log(1 - AL))
    # Sum of squared weights over every layer, including the last.
    L2_regularization_cost = 0
    for i in range(1, layer_num + 1):
        L2_regularization_cost = L2_regularization_cost + np.sum(np.square(parameters['W' + str(i)]))
    L2_regularization_cost = (lambd / (2 * m)) * L2_regularization_cost
    cost = cross_entropy_cost + L2_regularization_cost
    return np.squeeze(cost)


# 一次向后传导
def back(dA, A, A_pre, activation, W, D, keep_prob, lambd):
    """One layer's backward step with L2 and inverted-dropout gradients.

    Args:
        dA: gradient of the cost w.r.t. this layer's activation, (n, m).
        A: this layer's activation (post-nonlinearity), (n, m).
        A_pre: previous layer's activation, (n_prev, m).
        activation: 'sigmoid' or 'relu'.
        W: this layer's weights, (n, n_prev).
        D: dropout mask that was applied to the PREVIOUS layer's activation.
        keep_prob: keep probability used for that mask.
        lambd: L2 strength, adds (lambd/m)*W to dW.

    Returns:
        (dA_pre, dW, db): gradients for the previous activation, the
        weights, and the bias.
    """
    m = dA.shape[1]
    dZ = None
    if activation == 'sigmoid':
        # sigmoid'(z) = a * (1 - a)
        dZ = dA * A * (1 - A)
    elif activation == 'relu':
        # ReLU passes the gradient only where the activation was positive.
        dZ = np.where(A > 0, dA, 0.0)
    dW = np.dot(dZ, A_pre.T) / m + (lambd / m) * W
    db = np.sum(dZ, axis=1, keepdims=True) / m
    # Propagate, then undo dropout the same way the forward pass applied it:
    # mask the dropped units and rescale by keep_prob.
    dA_pre = (np.dot(W.T, dZ) * D) / keep_prob
    return dA_pre, dW, db


# 向后传导
# Full backward pass
def model_back(Y, cache, parameters, keep_prob, lambd, epsilon):
    """Backward pass over all layers, producing the gradient dict.

    Bug fix: the last-layer call previously passed keep_prob=1. The output
    layer itself uses no dropout, but the dA_pre returned by that call is
    the gradient for layer layer_num-1, whose activation WAS dropped out in
    the forward pass — so its mask must be rescaled by the real keep_prob.

    Args:
        Y: labels, shape (1, m).
        cache: A/Z/D values stored by model_forward.
        parameters: dict of 'W1'..'WL' / 'b1'..'bL'.
        keep_prob: dropout keep probability used in the forward pass.
        lambd: L2 regularization strength.
        epsilon: small constant guarding the log-derivative against /0.

    Returns:
        Dict of gradients 'dW1'..'dWL' and 'db1'..'dbL'.
    """
    grads = {}
    layer_num = len(parameters) // 2
    # Output layer (sigmoid) first.
    AL = cache['A' + str(layer_num)]
    D = cache['D' + str(layer_num - 1)]
    A_pre = cache['A' + str(layer_num - 1)]
    W = parameters['W' + str(layer_num)]
    # d(cross-entropy)/dAL, stabilized with epsilon.
    dA = - (np.divide(Y, AL + epsilon) - np.divide(1 - Y, 1 - (AL + epsilon)))
    # keep_prob applies to dA_pre (layer layer_num-1), not to AL itself.
    dA_pre, dW, db = back(dA, AL, A_pre, 'sigmoid', W, D, keep_prob=keep_prob, lambd=lambd)
    grads['dW' + str(layer_num)] = dW
    grads['db' + str(layer_num)] = db
    dA = dA_pre
    # Hidden layers (ReLU), walking backwards; D0 is all ones, so the
    # (unused) input gradient is effectively unmasked.
    for i in reversed(range(1, layer_num)):
        A = cache['A' + str(i)]
        D = cache['D' + str(i - 1)]
        A_pre = cache['A' + str(i - 1)]
        W = parameters['W' + str(i)]
        dA_pre, dW, db = back(dA, A, A_pre, 'relu', W, D, keep_prob, lambd)
        grads['dW' + str(i)] = dW
        grads['db' + str(i)] = db
        dA = dA_pre
    return grads


# 使用batch梯度下降更新参数
def update_parameters(parameters, grads, learning_rate):
    """Apply one plain gradient-descent step to every parameter.

    Args:
        parameters: dict of 'W1'..'WL' / 'b1'..'bL' (updated in place).
        grads: dict of matching 'dW..'/'db..' gradients.
        learning_rate: step size.

    Returns:
        The updated parameters dict.
    """
    for layer in range(1, len(parameters) // 2 + 1):
        for prefix in ('W', 'b'):
            key = prefix + str(layer)
            parameters[key] = parameters[key] - learning_rate * grads['d' + key]
    return parameters


def update_parameters_with_adam(parameters, grads, learning_rate, beta1, beta2, v, s, t, epsilon):
    """Apply one Adam update step to every parameter.

    Args:
        parameters: dict of 'W1'..'WL' / 'b1'..'bL' (updated in place).
        grads: matching 'dW..'/'db..' gradient dict.
        learning_rate: base step size.
        beta1, beta2: exponential decay rates for the 1st/2nd moments.
        v, s: running first/second moment dicts (updated in place).
        t: 1-based step counter used for bias correction.
        epsilon: denominator guard against division by zero.

    Returns:
        (parameters, v, s) after the update.
    """
    num_layers = len(parameters) // 2
    # Bias-correction denominators are the same for every parameter.
    bias1 = 1 - np.power(beta1, t)
    bias2 = 1 - np.power(beta2, t)
    for layer in range(1, num_layers + 1):
        for prefix in ('W', 'b'):
            key = prefix + str(layer)
            g = grads['d' + key]
            # Exponentially weighted averages of the gradient and its square.
            v['d' + key] = beta1 * v['d' + key] + (1 - beta1) * g
            s['d' + key] = beta2 * s['d' + key] + (1 - beta2) * np.square(g)
            # Bias-corrected moment estimates.
            v_hat = v['d' + key] / bias1
            s_hat = s['d' + key] / bias2
            # Parameter step: adaptive per-coordinate learning rate.
            parameters[key] = parameters[key] - learning_rate * (v_hat / (np.sqrt(s_hat) + epsilon))
    return parameters, v, s


# 模型训练
# keep_prob为dropout正则化参数,keep_prob=1时,则没有开dropout正则
# lambd是L2正则化参数,lambd=0时,则没有开L2正则
# Train the network.
# keep_prob is the dropout parameter (keep_prob=1 disables dropout);
# lambd is the L2 parameter (lambd=0 disables L2 regularization).
def model(X, Y, layers_dims, mini_batch_size=64, learning_rate=0.0075, num_iterations=3000, keep_prob=1.0, lambd=0.0,
          beta1=0.9, beta2=0.999, epsilon=1e-8, print_cost=False):
    """Train the network with mini-batch Adam, optional dropout and L2.

    Args:
        X: training inputs, shape (features, m).
        Y: training labels, shape (1, m).
        layers_dims: layer sizes, input layer first.
        mini_batch_size: batch size (1 => SGD, m => full batch GD).
        learning_rate, beta1, beta2, epsilon: Adam hyperparameters.
        num_iterations: number of passes over the batch list.
        keep_prob: dropout keep probability (1 disables dropout).
        lambd: L2 strength (0 disables L2).
        print_cost: print the cost every 1000 iterations.

    Returns:
        The trained parameters dict.
    """
    # Report which regularizers are active.
    if keep_prob < 1:
        print("使用了dropout正则化")
    if lambd > 0:
        print("使用了L2正则化")
    # Report which gradient-descent flavor the batch size implies.
    if mini_batch_size == 1:
        print("使用了随机梯度下降")
    elif mini_batch_size == X.shape[1]:
        print("使用了梯度下降")
    else:
        print("使用了小批量梯度下降")
    parameters = initialize_parameters(layers_dims)
    v, s = initialize_adam(parameters)
    layer_num = len(parameters) // 2
    # NOTE(review): batches are shuffled once up front, not per epoch.
    batches = random_mini_batches(X, Y, mini_batch_size)
    t = 0
    for iteration in range(num_iterations):
        for batch_X, batch_Y in batches:
            # Forward pass: predictions plus cached intermediates.
            AL, cache = model_forward(batch_X, parameters, keep_prob)
            cost = compute_cost(AL, batch_Y, lambd, parameters, layer_num)
            if print_cost and iteration % 1000 == 0:
                print("Cost after iteration %i: %f" % (iteration, cost))
            # Backward pass and Adam step (t counts every batch update).
            grads = model_back(batch_Y, cache, parameters, keep_prob, lambd, epsilon)
            t += 1
            parameters, v, s = update_parameters_with_adam(parameters, grads, learning_rate, beta1, beta2, v, s, t,
                                                           epsilon)
    return parameters


# 预测函数
def predict(X, Y, parameters):
    """Predict labels for X, print and return the accuracy against Y.

    Args:
        X: inputs, shape (features, m).
        Y: true labels, shape (1, m).
        parameters: trained parameters dict.

    Returns:
        Fraction of examples classified correctly.
    """
    m = Y.shape[1]
    # Dropout must be off at inference time, hence keep_prob=1.0.
    probabilities, _ = model_forward(X, parameters, keep_prob=1.0)
    # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 labels.
    predictions = np.where(probabilities > 0.5, 1, 0)
    accuracy = np.sum(predictions == Y) / m
    print("准确率为:", accuracy)
    return accuracy


你可能感兴趣的:(python,神经网络,深度学习,机器学习)