[Deep Learning Basics] Implementing an MLP (Multi-Layer Perceptron) Neural Network by Hand (forward/backward propagation, ReLU activation, MSE loss)

This post walks through a hand-written implementation of forward and backward propagation for a fully connected network with 10 hidden layers (mostly to finish a school assignment... TAT).
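Using the same Z/A naming as the cache keys in the code below (A^0 = x, L = num_hlayer, and no activation on the output layer), the forward and backward recursions being implemented are:

Forward, for l = 1..L:
    Z^l = A^{l-1} W_{l-1} + b_{l-1},    A^l = ReLU(Z^l),    y_hat = A^L W_L + b_L

Backward, starting from dZ^{L+1} = dJ/dy_hat and walking from the output layer back to the input:
    dJ/dW_{l-1} = (A^{l-1})^T dZ^l,    dJ/db_{l-1} = sum over the batch of dZ^l,    dA^{l-1} = dZ^l (W_{l-1})^T
    and across each ReLU: dZ^l = dA^l ⊙ 1[Z^l > 0]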

import torch


class MLP_Scratch:
    def __init__(self, in_dim, hidden_dim, out_dim, num_hlayer):
        """
        Args:
            in_dim: input data dimension
            hidden_dim: MLP hidden layer dimension
            out_dim: output dimension
            num_hlayer: number of hidden layers (excluding output layer)
        """

        self.parameters = dict()
        self.parameters['W0'] = torch.randn(in_dim, hidden_dim, dtype=torch.float64)
        self.parameters['b0'] = torch.randn(hidden_dim, dtype=torch.float64)
        for i in range(1, num_hlayer):
            self.parameters[f'W{i}'] = torch.randn(hidden_dim, hidden_dim, dtype=torch.float64)
            self.parameters[f'b{i}'] = torch.randn(hidden_dim, dtype=torch.float64)
        self.parameters[f'W{num_hlayer}'] = torch.randn(hidden_dim, out_dim, dtype=torch.float64)
        self.parameters[f'b{num_hlayer}'] = torch.randn(out_dim, dtype=torch.float64)

        self.grads = dict()
        self.grads['dJdW0'] = torch.zeros(in_dim, hidden_dim, dtype=torch.float64)
        self.grads['dJdb0'] = torch.zeros(hidden_dim, dtype=torch.float64)
        for i in range(1, num_hlayer):
            self.grads[f'dJdW{i}'] = torch.zeros(hidden_dim, hidden_dim, dtype=torch.float64)
            self.grads[f'dJdb{i}'] = torch.zeros(hidden_dim, dtype=torch.float64)
        self.grads[f'dJdW{num_hlayer}'] = torch.zeros(hidden_dim, out_dim, dtype=torch.float64)
        self.grads[f'dJdb{num_hlayer}'] = torch.zeros(out_dim, dtype=torch.float64)

        self.num_hlayer = num_hlayer

        # put all the cache value you need in self.cache
        self.cache = dict()

    
    def forward(self, x):
        """
        Args:
            x: tensor shape (batch_size, in_dim)
        """
        # TODO: Implement the forward function
        num_hlayer = self.num_hlayer

        self.cache['A0'] = x
        # first hidden layer: Z1 = A0 @ W0 + b0, A1 = ReLU(Z1)
        x = torch.add(x.mm(self.parameters['W0']), self.parameters['b0'])
        self.cache['Z1'] = x
        x = relu(x)
        self.cache['A1'] = x

        # remaining hidden layers
        for i in range(1, num_hlayer):
            x = torch.add(x.mm(self.parameters[f'W{i}']), self.parameters[f'b{i}'])
            self.cache[f'Z{i+1}'] = x
            x = relu(x)
            self.cache[f'A{i+1}'] = x

        # output layer: linear only, no activation
        x = torch.add(x.mm(self.parameters[f'W{num_hlayer}']), self.parameters[f'b{num_hlayer}'])
        self.cache[f'Z{num_hlayer+1}'] = x

        return x

    
    def backward(self, dJdy_hat):
        """
        Args:
            dJdy_hat: The gradient tensor of shape (batch_size, out_dim)
        """
        # TODO: Implement the backward function
        num_hlayer = self.num_hlayer

        # output layer: no activation, so dZ = dA = dJ/dy_hat
        self.cache[f'dA{num_hlayer+1}'] = dJdy_hat
        self.cache[f'dZ{num_hlayer+1}'] = self.cache[f'dA{num_hlayer+1}']
        self.grads[f'dJdW{num_hlayer}'] = self.cache[f'A{num_hlayer}'].T.mm(self.cache[f'dZ{num_hlayer+1}'])
        self.grads[f'dJdb{num_hlayer}'] = self.cache[f'dZ{num_hlayer+1}'].sum(dim=0)
        self.cache[f'dA{num_hlayer}'] = self.cache[f'dZ{num_hlayer+1}'].mm(self.parameters[f'W{num_hlayer}'].T)

        # hidden layers, from the last one back to the first
        for layer in range(num_hlayer - 1, -1, -1):
            # ReLU backward: pass the gradient only where the pre-activation was positive
            self.cache[f'dZ{layer+1}'] = self.cache[f'dA{layer+1}'].clone()
            self.cache[f'dZ{layer+1}'][self.cache[f'Z{layer+1}'] <= 0] = 0
            self.grads[f'dJdW{layer}'] = self.cache[f'A{layer}'].T.mm(self.cache[f'dZ{layer+1}'])
            self.grads[f'dJdb{layer}'] = self.cache[f'dZ{layer+1}'].sum(dim=0)
            self.cache[f'dA{layer}'] = self.cache[f'dZ{layer+1}'].mm(self.parameters[f'W{layer}'].T)


    def clear_grad_and_cache(self):
        for grad in self.grads:
            self.grads[grad].zero_()
        self.cache = dict()
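
MLP_Scratch only computes and stores gradients; it never updates the parameters. As a minimal sketch (not part of the assignment code, with a hypothetical learning rate lr), a plain gradient-descent step on top of it could look like this:

def sgd_step(model, lr=1e-3):
    # hypothetical helper: one vanilla gradient-descent step on an MLP_Scratch instance
    for i in range(model.num_hlayer + 1):
        model.parameters[f'W{i}'] -= lr * model.grads[f'dJdW{i}']
        model.parameters[f'b{i}'] -= lr * model.grads[f'dJdb{i}']

Each training iteration would then call clear_grad_and_cache(), forward, mse_loss, backward, and finally sgd_step.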

Next, hand-write the ReLU activation and MSE loss functions used above:

def relu(x):
    """
    Args:
        x: tensor of any shape
    """
    # TODO: Implement the ReLU function
    return torch.max(x, torch.zeros_like(x))


def mse_loss(y_hat, y):
    """
    Args:
        y_hat: the prediction tensor (batch_size, out_dim)
        y: the label tensor (batch_size, out_dim)

    Return:
        J: scalar of loss
        dJdy_hat: The gradient tensor of shape (batch_size, out_dim)
    """
    # TODO: Implement the mse loss
    # mean over all elements, matching F.mse_loss with reduction='mean'
    J = ((y_hat - y) ** 2).mean()
    dJdy_hat = 2 * (y_hat - y) / y.numel()
    return J, dJdy_hat
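
With the mean reduction over all B * D output elements (B = batch size, D = out_dim), the loss and its upstream gradient are

    J = (1 / (B*D)) * sum_{i,j} (y_hat_{ij} - y_{ij})^2
    dJ/dy_hat_{ij} = 2 * (y_hat_{ij} - y_{ij}) / (B*D)

which is what F.mse_loss computes with its default reduction='mean'. In the example below B = D = 1, so the 1/(B*D) factor is simply 1.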

Now let's run it:

if __name__ == '__main__':
    IN_DIM = 10
    HIDDEN_DIM = 21
    OUT_DIM = 1
    NUM_HLAYER = 10

    torch.manual_seed(0)    #fix random seed for reproducibility, please do not modify this line.
    x = torch.rand(1, IN_DIM)                # TODO: define input data
    x = x.double()          #set float64 accuracy, please do not modify this line.
    y = (torch.sum(x*x)/IN_DIM).reshape(1, 1)                # TODO: define label
    #model inference
    model_scratch = MLP_Scratch(in_dim=IN_DIM, hidden_dim=HIDDEN_DIM, out_dim=OUT_DIM, num_hlayer=NUM_HLAYER)
    model_scratch.clear_grad_and_cache()
    pred = model_scratch.forward(x)
    J, dJdy_hat  = mse_loss(pred, y)
    model_scratch.backward(dJdy_hat)

    # compare with autograd model
    from mlp_autograd import MLP_AutoGrad
    import torch.nn.functional as F
    model_autograd = MLP_AutoGrad(in_dim=IN_DIM, hidden_dim=HIDDEN_DIM, out_dim=OUT_DIM, num_hlayer=NUM_HLAYER)
    # initialize model_autograd with parameters of model_scratch
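    # nn.Linear stores its weight as (out_features, in_features), hence the .T below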
    for i, (name, param) in enumerate(model_autograd.named_parameters()):
        if i % 2 == 0:
            param.data = model_scratch.parameters[f'W{i//2}'].T
        else:
            param.data = model_scratch.parameters[f'b{i//2}']
    pred = model_autograd(x)
    loss = F.mse_loss(pred, y)
    loss.backward()

Finally, check whether the hand-written gradients match the ones computed by autograd:

    print((model_autograd.linear_in.weight.grad.data.T - model_scratch.grads['dJdW0']).norm().item() < 1e-5)
    print((model_autograd.linear_in.bias.grad.data - model_scratch.grads['dJdb0']).norm().item() < 1e-5)
    # expected print result: True, True
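
The check above only covers the first layer and depends on the external mlp_autograd module. As an alternative sketch (not part of the original assignment), all layers can be verified directly against PyTorch autograd by re-running the same forward pass on requires_grad copies of the scratch parameters:

    # alternative check: compare every layer's gradients against autograd,
    # using requires_grad copies of the scratch parameters (no mlp_autograd needed)
    params = {k: v.clone().requires_grad_(True) for k, v in model_scratch.parameters.items()}
    a = x
    for i in range(NUM_HLAYER):
        a = relu(a.mm(params[f'W{i}']) + params[f'b{i}'])
    out = a.mm(params[f'W{NUM_HLAYER}']) + params[f'b{NUM_HLAYER}']
    ((out - y) ** 2).mean().backward()
    all_match = all(
        (params[f'W{i}'].grad - model_scratch.grads[f'dJdW{i}']).norm().item() < 1e-5
        and (params[f'b{i}'].grad - model_scratch.grads[f'dJdb{i}']).norm().item() < 1e-5
        for i in range(NUM_HLAYER + 1)
    )
    print(all_match)    # expected: True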
