A super simple PyTorch backpropagation example

This example stacks two layers of the form y = a*x + b. The given conditions are the values of x and of the target y, plus initial values for the two (a, b) pairs; the loss function is MSE. You can run just a single epoch and watch how backpropagation updates the parameters of each layer. It is recommended to set a breakpoint inside the backward function and step through the program while comparing it with the figure below.

[Figure 1]
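
For reference, here is a short sketch of the quantities the backward pass will compute, writing the two layers as h = a1*x + b1 and y_hat = a2*h + b2 (notation chosen here for illustration) and the loss as L = (1/n) * sum((y_hat - y)^2):

dL/dy_hat = (2/n) * (y_hat - y)
dL/da2 = dL/dy_hat^T · h        dL/db2 = sum(dL/dy_hat)
dL/dh  = dL/dy_hat · a2
dL/da1 = dL/dh^T · x            dL/db1 = sum(dL/dh)

These are exactly the grad_output, grad_weight, grad_bias, and grad_input values you will see at the breakpoint inside backward.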

# -*- coding:utf-8 -*-
# reference: https://pytorch.org/docs/stable/notes/extending.html
import torch
from torch import nn
from torch.autograd import Function
import numpy as np
from collections import OrderedDict

class LinearFunction2(Function):

    # Note that both forward and backward are @staticmethods
    @staticmethod
    # bias is an optional argument
    def forward(ctx, input, weight, bias=None):
        ctx.save_for_backward(input, weight, bias)
        output = input.mm(weight.t())
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        # This is a pattern that is very convenient - at the top of backward
        # unpack saved_tensors and initialize all gradients w.r.t. inputs to
        # None. Thanks to the fact that additional trailing Nones are
        # ignored, the return statement is simple even when the function has
        # optional inputs.

        # The incoming grad_output is the gradient of the loss w.r.t. this layer's
        # output y_hat. For MSE loss it is 2/n * (y_hat - y), where n is the number
        # of samples in the batch, so grad_output is an n x 1 matrix here.
        input, weight, bias = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        # These needs_input_grad checks are optional and there only to
        # improve efficiency. If you want to make your code simpler, you can
        # skip them. Returning gradients for inputs that don't require it is
        # not an error.

        # ctx.needs_input_grad[0] is True if the input tensor requires a gradient.
        # grad_input is then passed back to the previous layer as its grad_output,
        # so that layer can update its own weight and bias.
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)
        
        # If there is an earlier layer, `input` is that layer's forward output;
        # otherwise `input` is the network input x.
        # grad_output.t().mm(input) is a transpose followed by a matrix multiply.
        # grad_weight, scaled by the learning rate, updates this layer's weight;
        # grad_bias is used the same way for the bias.
        if ctx.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum(0)

        return grad_input, grad_weight, grad_bias

linear = LinearFunction2.apply
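
# A quick, optional sanity check of the hand-written backward: torch.autograd.gradcheck
# compares it against numerical gradients. gradcheck expects double-precision tensors
# with requires_grad=True; `_check_inputs` is just an arbitrary example name used here.
_check_inputs = (
    torch.randn(3, 1, dtype=torch.double, requires_grad=True),  # input x
    torch.randn(1, 1, dtype=torch.double, requires_grad=True),  # weight
    torch.randn(1, dtype=torch.double, requires_grad=True),     # bias
)
print('gradcheck passed:', torch.autograd.gradcheck(linear, _check_inputs, eps=1e-6, atol=1e-4))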

class Linear2(nn.Module):
    def __init__(self, input_features, output_features, bias=True, assign=None):
        super(Linear2, self).__init__()
        self.input_features = input_features
        self.output_features = output_features

        # nn.Parameter is a special kind of Tensor, that will get
        # automatically registered as Module's parameter once it's assigned
        # as an attribute. Parameters and buffers need to be registered, or
        # they won't appear in .parameters() (doesn't apply to buffers), and
        # won't be converted when e.g. .cuda() is called. You can use
        # .register_buffer() to register buffers.
        # nn.Parameters require gradients by default.
        self.weight = nn.Parameter(torch.Tensor(output_features, input_features))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(output_features))
        else:
            # You should always register all possible parameters, but the
            # optional ones can be None if you want.
            self.register_parameter('bias', None)

        # Not a very smart way to initialize weights
        self.weight.data.uniform_(-0.1, 0.1)
        if self.bias is not None:
            self.bias.data.uniform_(-0.1, 0.1)
        
        if assign:
            self.weight.data.fill_(assign[0])
            self.bias.data.fill_(assign[1])

    def forward(self, input):
        # See the autograd section for explanation of what happens here.
        return LinearFunction2.apply(input, self.weight, self.bias)

    def extra_repr(self):
        # (Optional)Set the extra information about this module. You can test
        # it by printing an object of this class.
        return 'input_features={}, output_features={}, bias={}'.format(
            self.input_features, self.output_features, self.bias is not None
        )
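
# A small illustration of extra_repr(): printing an instance of Linear2 includes
# the extra information in its repr, e.g.
#   Linear2(input_features=1, output_features=1, bias=True)
print(Linear2(1, 1, bias=True))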

x_train = np.array([[3.], [4.], [5.]], dtype=np.float32)
y_train = np.array([[2.], [3.], [4.]], dtype=np.float32)

x_train = torch.from_numpy(x_train)
y_train = torch.from_numpy(y_train)

criterion = nn.MSELoss()

model = torch.nn.Sequential(OrderedDict([
    ('l1', Linear2(1, 1, bias=True, assign=[0.5, 0.5])),
    ('l2', Linear2(1, 1, bias=True, assign=[1, 0])),
]))
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
num_epochs = 3
# model.weight.data.fill_(0.5)
# model.bias.data.fill_(0.5)
for epoch in range(num_epochs):
    inputs = x_train
    targets = y_train
    out = model(inputs)
    loss = criterion(out, targets)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
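
    # A per-epoch check (sketch): print each parameter's value and gradient so the
    # numbers can be compared with the formulas above. Parameter names like
    # 'l1.weight' come from the OrderedDict keys of the Sequential.
    # For the first epoch the hand computation gives:
    #   h = 0.5*x + 0.5 = [2.0, 2.5, 3.0],  y_hat = 1*h + 0 = [2.0, 2.5, 3.0]
    #   dL/dy_hat = 2/3 * (y_hat - y) = [0, -1/3, -2/3]
    #   l2: grad_weight = -17/6 ≈ -2.8333, grad_bias = -1
    #   l1: grad_weight = -14/3 ≈ -4.6667, grad_bias = -1
    #   after one SGD step with lr=1e-3:
    #   l1.weight ≈ 0.50467, l1.bias = 0.501, l2.weight ≈ 1.00283, l2.bias = 0.001
    print('epoch {}: loss = {:.6f}'.format(epoch, loss.item()))
    for name, param in model.named_parameters():
        print('  {}: value = {:.6f}, grad = {:.6f}'.format(
            name, param.item(), param.grad.item()))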
