I hand-implemented the forward and backward passes of a fully connected neural network with 10 hidden layers (mainly to finish a school assignment... TAT).
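For reference, these are the recurrences the code below implements, written with the same names as the cache keys ($A_0 = x$, $L$ = num_hlayer, $\odot$ = element-wise product):

$$
\begin{aligned}
\text{Forward: } & Z_{l+1} = A_l W_l + b_l, \quad A_{l+1} = \mathrm{ReLU}(Z_{l+1}) \ (l < L), \quad \hat{y} = Z_{L+1} \\
\text{Backward: } & dZ_{L+1} = \partial J / \partial \hat{y}, \quad dZ_{l+1} = dA_{l+1} \odot \mathbf{1}[Z_{l+1} > 0] \ (l < L) \\
& \partial J / \partial W_l = A_l^{\top} dZ_{l+1}, \quad \partial J / \partial b_l = \textstyle\sum_{\text{batch}} dZ_{l+1}, \quad dA_l = dZ_{l+1} W_l^{\top}
\end{aligned}
$$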
import torch

class MLP_Scratch:
    def __init__(self, in_dim, hidden_dim, out_dim, num_hlayer):
        """
        Args:
            in_dim: input data dimension
            hidden_dim: MLP hidden layer dimension
            out_dim: output dimension
            num_hlayer: number of hidden layers (excluding output layer)
        """
        # weights and biases: W0/b0 map the input, W1..W{num_hlayer-1} are the
        # hidden-to-hidden layers, W{num_hlayer}/b{num_hlayer} is the output layer
        self.parameters = dict()
        self.parameters['W0'] = torch.randn(in_dim, hidden_dim, dtype=torch.float64)
        self.parameters['b0'] = torch.randn(hidden_dim, dtype=torch.float64)
        for i in range(1, num_hlayer):
            self.parameters[f'W{i}'] = torch.randn(hidden_dim, hidden_dim, dtype=torch.float64)
            self.parameters[f'b{i}'] = torch.randn(hidden_dim, dtype=torch.float64)
        self.parameters[f'W{num_hlayer}'] = torch.randn(hidden_dim, out_dim, dtype=torch.float64)
        self.parameters[f'b{num_hlayer}'] = torch.randn(out_dim, dtype=torch.float64)
        # gradient buffers, one per parameter, with matching shapes
        self.grads = dict()
        self.grads['dJdW0'] = torch.zeros(in_dim, hidden_dim, dtype=torch.float64)
        self.grads['dJdb0'] = torch.zeros(hidden_dim, dtype=torch.float64)
        for i in range(1, num_hlayer):
            self.grads[f'dJdW{i}'] = torch.zeros(hidden_dim, hidden_dim, dtype=torch.float64)
            self.grads[f'dJdb{i}'] = torch.zeros(hidden_dim, dtype=torch.float64)
        self.grads[f'dJdW{num_hlayer}'] = torch.zeros(hidden_dim, out_dim, dtype=torch.float64)
        self.grads[f'dJdb{num_hlayer}'] = torch.zeros(out_dim, dtype=torch.float64)
        self.num_hlayer = num_hlayer
        # put all the cached values you need in self.cache
        self.cache = dict()
    def forward(self, x):
        """
        Args:
            x: tensor shape (batch_size, in_dim)
        """
        num_hlayer = self.num_hlayer
        self.cache['A0'] = x
        # input layer
        x = torch.add(x.mm(self.parameters['W0']), self.parameters['b0'])
        self.cache['Z1'] = x
        x = relu(x)
        self.cache['A1'] = x
        # hidden layers
        for i in range(1, num_hlayer):
            x = torch.add(x.mm(self.parameters[f'W{i}']), self.parameters[f'b{i}'])
            self.cache[f'Z{i+1}'] = x
            x = relu(x)
            self.cache[f'A{i+1}'] = x
        # output layer (no activation)
        x = torch.add(x.mm(self.parameters[f'W{num_hlayer}']), self.parameters[f'b{num_hlayer}'])
        self.cache[f'Z{num_hlayer+1}'] = x
        return x
    def backward(self, dJdy_hat):
        """
        Args:
            dJdy_hat: The gradient tensor of shape (batch_size, out_dim)
        """
        num_hlayer = self.num_hlayer
        # output layer has no activation, so its dZ is just dJdy_hat
        self.cache[f'dA{num_hlayer+1}'] = dJdy_hat
        self.cache[f'dZ{num_hlayer+1}'] = dJdy_hat
        self.grads[f'dJdW{num_hlayer}'] = self.cache[f'A{num_hlayer}'].T.mm(self.cache[f'dZ{num_hlayer+1}'])
        self.grads[f'dJdb{num_hlayer}'] = self.cache[f'dZ{num_hlayer+1}'].sum(dim=0)
        self.cache[f'dA{num_hlayer}'] = self.cache[f'dZ{num_hlayer+1}'].mm(self.parameters[f'W{num_hlayer}'].T)
        # walk back through the hidden layers: num_hlayer-1, ..., 1, 0
        for layer in range(num_hlayer - 1, -1, -1):
            # ReLU derivative: only pass gradient where the pre-activation was positive
            self.cache[f'dZ{layer+1}'] = self.cache[f'dA{layer+1}'] * (self.cache[f'Z{layer+1}'] > 0).to(dJdy_hat.dtype)
            self.grads[f'dJdW{layer}'] = self.cache[f'A{layer}'].T.mm(self.cache[f'dZ{layer+1}'])
            self.grads[f'dJdb{layer}'] = self.cache[f'dZ{layer+1}'].sum(dim=0)
            self.cache[f'dA{layer}'] = self.cache[f'dZ{layer+1}'].mm(self.parameters[f'W{layer}'].T)
    def clear_grad_and_cache(self):
        for grad in self.grads:
            self.grads[grad].zero_()
        self.cache = dict()
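The assignment only needs a single forward/backward pass, so no optimizer is included. If you wanted to actually train with this class, a minimal SGD update over the parameter and gradient dicts could look like the sketch below (sgd_step and lr are my own illustrative names, not part of the assignment):

def sgd_step(model, lr=1e-3):
    """One plain gradient-descent step: W <- W - lr * dJdW, same for biases."""
    for name in model.parameters:
        # grads are keyed as 'dJdW0', 'dJdb0', ... matching parameter keys 'W0', 'b0', ...
        model.parameters[name] -= lr * model.grads[f'dJd{name}']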
Hand-write the ReLU activation and MSE loss functions used above:
def relu(x):
    """
    Args:
        x: tensor of any shape
    """
    # element-wise max(x, 0); zeros_like keeps the dtype (float64) consistent
    return torch.max(x, torch.zeros_like(x))
def mse_loss(y_hat, y):
    """
    Args:
        y_hat: the prediction tensor (batch_size, out_dim)
        y: the label tensor (batch_size, out_dim)
    Return:
        J: scalar of loss
        dJdy_hat: The gradient tensor of shape (batch_size, out_dim)
    """
    # mean squared error over all elements, matching F.mse_loss's default reduction
    J = ((y_hat - y) ** 2).mean()
    dJdy_hat = 2 * (y_hat - y) / y.numel()
    return J, dJdy_hat
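Before comparing against autograd, a central-difference check is a handy way to spot-check backward() on its own. This is only a sketch: numerical_grad_entry and the step size eps are my own choices, and it perturbs a single entry of a weight matrix.

def numerical_grad_entry(model, x, y, name, i, j, eps=1e-6):
    """Central-difference estimate of dJ / d model.parameters[name][i, j]."""
    original = model.parameters[name][i, j].item()
    model.parameters[name][i, j] = original + eps
    J_plus, _ = mse_loss(model.forward(x), y)
    model.parameters[name][i, j] = original - eps
    J_minus, _ = mse_loss(model.forward(x), y)
    model.parameters[name][i, j] = original  # restore the perturbed entry
    # compare the result with model.grads[f'dJd{name}'][i, j] after model.backward(...)
    return (J_plus - J_minus).item() / (2 * eps)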
Run a forward and backward pass on some toy data:
if __name__ == '__main__':
    IN_DIM = 10
    HIDDEN_DIM = 21
    OUT_DIM = 1
    NUM_HLAYER = 10
    torch.manual_seed(0)  # fix random seed for reproducibility, please do not modify this line.
    x = torch.rand(1, IN_DIM)  # define input data
    x = x.double()  # set float64 accuracy, please do not modify this line.
    y = (torch.sum(x * x) / IN_DIM).reshape(1, 1)  # define label
    # model inference
    model_scratch = MLP_Scratch(in_dim=IN_DIM, hidden_dim=HIDDEN_DIM, out_dim=OUT_DIM, num_hlayer=NUM_HLAYER)
    model_scratch.clear_grad_and_cache()
    pred = model_scratch.forward(x)
    J, dJdy_hat = mse_loss(pred, y)
    model_scratch.backward(dJdy_hat)
    # compare with autograd model
    from mlp_autograd import MLP_AutoGrad
    import torch.nn.functional as F
    model_autograd = MLP_AutoGrad(in_dim=IN_DIM, hidden_dim=HIDDEN_DIM, out_dim=OUT_DIM, num_hlayer=NUM_HLAYER)
    # initialize model_autograd with the parameters of model_scratch;
    # named_parameters() alternates weight, bias, and nn.Linear stores its weight
    # as (out_features, in_features), hence the transpose
    for i, (name, param) in enumerate(model_autograd.named_parameters()):
        if i % 2 == 0:
            param.data = model_scratch.parameters[f'W{i//2}'].T
        else:
            param.data = model_scratch.parameters[f'b{i//2}']
    pred = model_autograd(x)
    loss = F.mse_loss(pred, y)
    loss.backward()
Check that the hand-derived gradients agree with the ones obtained via autograd:
    print((model_autograd.linear_in.weight.grad.data.T - model_scratch.grads['dJdW0']).norm().item() < 1e-5)
    print((model_autograd.linear_in.bias.grad.data - model_scratch.grads['dJdb0']).norm().item() < 1e-5)
    # expected print result: True, True
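The same comparison can be extended to every layer. The loop below mirrors the parameter-copy loop above and relies on the same assumption that named_parameters() alternates weights and biases; it is an illustrative extra check, not part of the assignment.

    all_match = True
    for i, (name, param) in enumerate(model_autograd.named_parameters()):
        # even indices are weights (transposed relative to the scratch layout), odd are biases
        scratch_key = f'dJdW{i//2}' if i % 2 == 0 else f'dJdb{i//2}'
        autograd_grad = param.grad.data.T if i % 2 == 0 else param.grad.data
        all_match = all_match and (autograd_grad - model_scratch.grads[scratch_key]).norm().item() < 1e-5
    print(all_match)  # expected: True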