[Daily] Hand-coding a three-layer backpropagation neural network (cross-entropy loss + regularization term + backward differentiation)

This was a course assignment. It is not particularly useful in itself, but hand-coding a dense network once really does deepen your understanding of neural networks, especially the backpropagation derivation.
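
For reference, the objective that the backward pass below differentiates is the mean cross-entropy plus an L2 weight-decay term. Here is a minimal sketch of the formulas, assuming a softmax output layer, one-hot targets, and the 1/N and wd/2 conventions (the post's exact normalization is not shown in the excerpt below):

E(W) = -\frac{1}{N}\sum_{n=1}^{N}\sum_{k=1}^{10} t_{nk}\,\log y_{nk} + \frac{wd}{2}\left(\lVert W_{hid}\rVert_F^2 + \lVert W_{out}\rVert_F^2\right), \qquad y_{\cdot n} = \mathrm{softmax}(W_{out}\,h_{\cdot n})

With this loss, the backward pass starts from \partial E/\partial z^{out} = (y - t)/N, which is what makes the softmax-plus-cross-entropy combination so convenient to differentiate.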

The resource has been uploaded, but CSDN has been acting up lately and I cannot change the download points for it, so I am leaving a Baidu Cloud (BDY) link instead.

#-*- coding:UTF-8 -*-
import numpy as np
import pandas as pd
from scipy.io import loadmat
from scipy.linalg import norm
import matplotlib.pyplot as plt

""" 百度云链接:https://pan.baidu.com/s/1kdSoUcK9PFjUEfRiUI9pdw """
""" 密码:iycw """

"""
	Nesterov's方法是先根据历史信息走到一个点,
	再根据那个点的gradient来走一段更新;
	这恰好与动量方法的次序是相反的,
	动量方法是先根据当前点的gradient来走一段更新,
	然后再根据历史信息往前面搞一段距离;
"""
def train(wd,n_hidden,n_iters,learning_rate,momentum_mul,do_early_stopping=False,minibatch_size=10,isNestrov=False):
	"""
		· wd: 权重衰减
		· n_hidden: 隐层结点数量
		· n_iters: 随机梯度下降迭代次数
		· learning_rate: 学习速率
		· momentum_mul: 速率衰减系数(这个系数将附加在前一次的动量上,然后瞎搞)
		· do_early_stopping: 是否提早结束(如果是则简单的输出过去epoch中最优的那个)
		· minibatch_size: 随机梯度下降的小批尺寸
		· inNestrov: 是否使用Nestrov方法
		· return: 数据集上的分类损失
	"""
	data_file = loadmat("data.mat",squeeze_me=True,struct_as_record=False)
	data = data_file["data"]											 # load the data
	"""
		· data.training.inputs —— 256×1000
		· data.training.targets —— 10×1000
		· data.validation.inputs —— 256×1000
		· data.validation.targets —— 10×1000
		· data.test.inputs —— 256×9000
		· data.test.targets —— 10×9000
	"""
	data_train = {"X":data.training.inputs,"y":data.training.targets}
	data_valid = {"X":data.validation.inputs,"y":data.validation.targets}
	data_test = {"X":data.test.inputs,"y":data.test.targets}
	n_train = data_train["X"].shape[1]									 # number of training samples
	params = initial_model(n_hidden)									 # initialize the parameters of the two weight matrices
	theta = model2theta(params)											 # flatten the two matrices into a single parameter vector
	test_gradient(params,data_train,wd,n_hidden)						 # check that the analytic gradient is correct
	v = 0																 # initialize the velocity
	loss_train = []														 # training-loss history
	loss_valid = []														 # validation-loss history
	best = {}															 # best parameters found so far
	if do_early_stopping:												 # early-stopping bookkeeping
		best["theta"] = 0
		best["loss_valid"] = np.inf
		best["iter"] = -1

	for t in range(n_iters+1):											 # stochastic gradient descent loop
		batch_start = (t*minibatch_size) % n_train						 # take one minibatch of samples per iteration
		data_batch = {
			"X": data_train["X"][:,batch_start:batch_start+minibatch_size],
			"y": data_train["y"][:,batch_start:batch_start+minibatch_size],
		}
		if isNestrov:													 # Nesterov's method
			temp = theta + momentum_mul*v								 # lookahead step along the previous velocity
			loss,grad = eval_obj_grad(theta2model(temp),data_batch,wd,n_hidden)
			grad_vec = model2theta(grad)								 # convert the gradient dict into a vector
			v = momentum_mul*v - grad_vec								 # actual descent direction for this step
			theta += learning_rate*v									 # apply the update (descend along v)

		else:															 # classic momentum
			loss,grad = eval_obj_grad(theta2model(theta),data_batch,wd,n_hidden)
			grad_vec = model2theta(grad)								 # convert the gradient dict into a vector
			v = momentum_mul*v - grad_vec								 # descent direction after the momentum adjustment
			theta += learning_rate*v									 # apply the update (descend along v)

		params = theta2model(theta)										 # convert the updated theta back into params format (the two weight matrices)
		loss = eval_obj(params,data_train,wd)							 # compute the training loss
		loss_train.append(loss)											 # record the training loss
		loss = eval_obj(params,data_valid,wd)							 # compute the validation loss
		loss_valid.append(loss)											 # record the validation loss
		if do_early_stopping and loss_valid[-1] < best["loss_valid"]:	 # keep the parameters that do best on the validation set
			best["theta"] = theta.copy()
			best["loss_valid"] = loss_valid[-1]
			best["iter"] = t
	# (the rest of train(), the loss plotting and final evaluation, is not included in this excerpt)

def initial_model(n_hid):												 # initialize the two weight matrices
	n_params = (256+10)*n_hid											 # total element count of the input->hidden & hidden->output weight matrices
	as_row_vector = np.cos(np.arange(n_params))							 # cos of [0,1,...,n_params-1]: a deterministic "pseudo-random" initialization
	params = {}
	params["W_hid"] = as_row_vector[:256*n_hid].reshape((n_hid,256))*0.1
	params["W_out"] = as_row_vector[256*n_hid:].reshape((10,n_hid))*0.1
	return params

def test_gradient(params,data,wd,n_hidden):								 # finite-difference gradient check
	loss,analytic_grad = eval_obj_grad(params,data,wd,n_hidden)			 # loss and analytic gradient at the current parameters
	num_checks = 100
	theta = model2theta(params)
	grad_ana = model2theta(analytic_grad)
	delta = 1e-4
	threshold = 1e-5
	for i in range(num_checks):
		ind = (i*1299283) % theta.size
		grad_ind_ana = grad_ana[ind]
		theta1 = theta.copy()
		theta1[ind] += delta
		l1 = eval_obj(theta2model(theta1),data,wd)
		theta2 = theta.copy()
		theta2[ind] -= delta
		l2 = eval_obj(theta2model(theta2),data,wd)
		grad_ind_fin = (l1-l2)/(2*delta)
		diff = abs(grad_ind_ana - grad_ind_fin)
		if diff > threshold:											 # analytic and numerical gradients disagree too much
			raise AssertionError("gradient check failed at index %d: analytic %e vs numerical %e" % (ind,grad_ind_ana,grad_ind_fin))
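
The listing above ends before the helper functions it relies on (model2theta, theta2model, eval_obj and eval_obj_grad), which is where the forward pass and the actual backpropagation derivation live. Below is a minimal sketch of what they could look like, assuming a sigmoid hidden layer, a softmax output with cross-entropy loss, and an L2 weight-decay term wd; the shapes match the ones used above (W_hid: n_hid×256, W_out: 10×n_hid, inputs 256×N, targets 10×N), but this is a reconstruction for illustration rather than the original post's code, and the forward helper is an added convenience.

def model2theta(params):
	# flatten the two weight matrices into a single parameter vector
	return np.concatenate([params["W_hid"].ravel(),params["W_out"].ravel()])

def theta2model(theta):
	# inverse of model2theta; n_hid is recovered from the vector length
	n_hid = theta.size // (256+10)
	return {"W_hid":theta[:256*n_hid].reshape((n_hid,256)),
			"W_out":theta[256*n_hid:].reshape((10,n_hid))}

def forward(params,X):
	# forward pass: sigmoid hidden layer, softmax output (assumed architecture)
	h = 1.0/(1.0+np.exp(-np.dot(params["W_hid"],X)))					 # n_hid × N hidden activations
	z_out = np.dot(params["W_out"],h)									 # 10 × N pre-softmax scores
	z_out -= z_out.max(axis=0,keepdims=True)							 # subtract the column max for numerical stability
	y = np.exp(z_out)/np.exp(z_out).sum(axis=0,keepdims=True)			 # softmax probabilities
	return h,y

def eval_obj(params,data,wd):
	# objective: mean cross-entropy plus L2 weight decay
	_,y = forward(params,data["X"])
	n = data["X"].shape[1]
	ce = -np.sum(data["y"]*np.log(y+1e-12))/n
	l2 = 0.5*wd*(np.sum(params["W_hid"]**2)+np.sum(params["W_out"]**2))
	return ce+l2

def eval_obj_grad(params,data,wd,n_hidden):
	# backpropagation for the assumed architecture (n_hidden is kept only to match the call sites above)
	X,t = data["X"],data["y"]
	n = X.shape[1]
	h,y = forward(params,X)
	delta_out = (y-t)/n													 # dE/dz_out for softmax + cross-entropy
	grad_W_out = np.dot(delta_out,h.T) + wd*params["W_out"]				 # gradient of the output weights
	delta_hid = np.dot(params["W_out"].T,delta_out)*h*(1-h)				 # backpropagate through the sigmoid
	grad_W_hid = np.dot(delta_hid,X.T) + wd*params["W_hid"]				 # gradient of the hidden weights
	loss = eval_obj(params,data,wd)
	return loss,{"W_hid":grad_W_hid,"W_out":grad_W_out}

With these in place, a call such as train(wd=0.001, n_hidden=30, n_iters=1000, learning_rate=0.1, momentum_mul=0.9, do_early_stopping=True, isNestrov=True) (hypothetical hyper-parameters, chosen only for illustration) runs the whole pipeline, and test_gradient checks the analytic gradients against finite differences before training starts.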

 
