# Inherit from Function
class LinearFunction(Function):
    """Custom autograd op computing ``input.mm(weight.t())`` plus an optional bias.

    Use it through ``LinearFunction.apply(input, weight, bias)``; autograd
    invokes ``forward`` and ``backward`` with the context object ``ctx``.
    ``input`` and ``weight`` must be 2-D because ``Tensor.mm`` is used.
    """

    # Note that both forward and backward are @staticmethods
    @staticmethod
    # bias is an optional argument
    def forward(ctx, input, weight, bias=None):
        """Compute the linear map and stash the tensors backward needs.

        Args:
            ctx: autograd context used to save tensors for ``backward``.
            input: 2-D tensor; multiplied by ``weight`` transposed.
            weight: 2-D tensor whose rows correspond to output columns.
            bias: optional 1-D tensor added to every row of the output.

        Returns:
            The tensor ``input.mm(weight.t())``, plus ``bias`` when given.
        """
        ctx.save_for_backward(input, weight, bias)
        output = input.mm(weight.t())
        if bias is not None:
            # Broadcast the bias across rows explicitly before the
            # in-place add.
            output += bias.unsqueeze(0).expand_as(output)
        return output

    # This function has only a single output, so it gets only one gradient
    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradients w.r.t. ``input``, ``weight`` and ``bias``.

        Args:
            ctx: the context populated by ``forward``.
            grad_output: gradient of the loss w.r.t. the forward output.

        Returns:
            A 3-tuple ``(grad_input, grad_weight, grad_bias)``; an entry is
            ``None`` when that input does not need a gradient (or when no
            bias was supplied).
        """
        # This is a pattern that is very convenient - at the top of backward
        # unpack saved_tensors and initialize all gradients w.r.t. inputs to
        # None. Thanks to the fact that additional trailing Nones are
        # ignored, the return statement is simple even when the function has
        # optional inputs.
        input, weight, bias = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None

        # These needs_input_grad checks are optional and there only to
        # improve efficiency. If you want to make your code simpler, you can
        # skip them. Returning gradients for inputs that don't require it is
        # not an error.
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)
        if ctx.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)
        if bias is not None and ctx.needs_input_grad[2]:
            # sum(0) already drops the reduced dimension. A further
            # squeeze(0) would turn a one-element bias gradient into a 0-d
            # tensor that no longer matches the bias shape.
            grad_bias = grad_output.sum(0)

        return grad_input, grad_weight, grad_bias
Now, to make it easier to use these custom ops, we recommend aliasing their apply method:
# Alias the op's ``apply`` entry point so it can be called like a plain
# function, e.g. ``linear(input, weight, bias)``.
linear = LinearFunction.apply
Here, we give an additional example of a function that is parametrized by non-Tensor arguments:
class MulConstant(Function):
    """Custom autograd op that multiplies a tensor by a non-Tensor constant.

    Use it through ``MulConstant.apply(tensor, constant)``.
    """

    @staticmethod
    def forward(ctx, tensor, constant):
        """Return ``tensor * constant`` and remember ``constant`` for backward.

        Args:
            ctx: autograd context; ``constant`` is stored on it as an attribute.
            tensor: the tensor to scale.
            constant: the non-Tensor multiplier.
        """
        # ctx is a context object that can be used to stash information
        # for backward computation
        ctx.constant = constant
        return tensor * constant

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient for ``tensor`` and ``None`` for ``constant``.

        Args:
            ctx: the context populated by ``forward``.
            grad_output: gradient of the loss w.r.t. the forward output.
        """
        # We return as many input gradients as there were arguments.
        # Gradients of non-Tensor arguments to forward must be None.
        return grad_output * ctx.constant, None
import torch.nn.functional as F
那么怎样为激活函数添加可学习的参数呢?这就涉及到nn.Module与nn.functional(即torch.nn.functional)的区别。nn.Module是一个包装好的类,具体定义了一个网络层,可以维护状态和存储参数信息;而nn.functional仅仅提供了一个计算,不会维护状态信息和存储参数。比如前面提到的ReLU等激活函数,以及dropout、pooling等没有训练参数的操作,可以使用functional模块。
那么我们要为自定义的激活函数添加可学习的参数,该怎么办呢?这还要回到前边提到的nn.Module与nn.functional之间的关系。我们知道继承于nn.Module的一个层是内含有可学习参数的,如权重weights和偏置bias。但是具体的操作,如卷积等还是由nn.functional来执行的。nn.Module只是对nn.functional进行了一层封装,使之可以维护状态和存储参数信息。对于简单的没有可学习的参数的层,可以用nn.functional模块中的操作,也可以用nn.Module中的操作,如nn.ReLU与F.relu等。不同的是nn.ReLU作为网络的一个层,而F.relu仅仅作为一个函数运算。
class Linear(nn.Module):
    """Linear layer that owns its parameters and delegates to ``LinearFunction``.

    Args:
        input_features: number of columns of the weight matrix.
        output_features: number of rows of the weight matrix (and the length
            of the bias vector).
        bias: when true, a learnable bias parameter is created; otherwise
            ``self.bias`` is registered as ``None``.
    """

    def __init__(self, input_features, output_features, bias=True):
        super(Linear, self).__init__()
        self.input_features = input_features
        self.output_features = output_features

        # nn.Parameter is a special kind of Tensor, that will get
        # automatically registered as Module's parameter once it's assigned
        # as an attribute. Parameters and buffers need to be registered, or
        # they won't appear in .parameters() (doesn't apply to buffers), and
        # won't be converted when e.g. .cuda() is called. You can use
        # .register_buffer() to register buffers.
        # nn.Parameters require gradients by default.
        self.weight = nn.Parameter(torch.empty(output_features, input_features))
        if bias:
            self.bias = nn.Parameter(torch.empty(output_features))
        else:
            # You should always register all possible parameters, but the
            # optional ones can be None if you want.
            self.register_parameter('bias', None)

        # Not a very smart way to initialize weights
        nn.init.uniform_(self.weight, -0.1, 0.1)
        # Test the registered parameter, not the boolean constructor flag,
        # so a bias-free layer does not try to initialize None.
        if self.bias is not None:
            nn.init.uniform_(self.bias, -0.1, 0.1)

    def forward(self, input):
        """Apply the layer to ``input`` via ``LinearFunction.apply``."""
        # See the autograd section for explanation of what happens here.
        return LinearFunction.apply(input, self.weight, self.bias)

    def extra_repr(self):
        """Return the summary string shown when the module is printed."""
        # (Optional)Set the extra information about this module. You can test
        # it by printing an object of this class.
        return 'in_features={}, out_features={}, bias={}'.format(
            self.input_features, self.output_features, self.bias is not None
        )
# NOTE(review): the three lines below are usage fragments, not runnable
# module-level code -- `self` and `input` are not defined at this scope.
# In this file Linear.__init__ requires `input_features` and
# `output_features`, and Linear.forward accepts only `input`, so these calls
# would need adjusting before use -- TODO confirm the intended example.
# One-off form: construct the module and call it in a single expression.
output = Linear()(input, self.weight, self.bias)
# Reusable form: keep the module as an attribute ...
self.linear = Linear()
# ... and call the stored module later.
output = self.linear(input, self.weight, self.bias)
首先,当我们继承Function(from torch.autograd import Function)时需要自定义前向和反向,这是PyTorch非常灵活的地方。要设置可学习参数需要利用nn.Module类进行封装。然后,是继承nn.Module类的问题。继承nn.Module类不需要自己写反向,只需要定义前向函数。这是因为nn.Module类中的操作是调用由一系列nn.functional模块中的函数操作来完成的,而这些操作均已经定义了反向求导的操作,所以在nn.Module类中无需再重新定义。其实对于可导的操作,在继承nn.Module类时会利用torch.autograd进行自动求导。通常我们在自定义损失函数时只需继承nn.Module类,将计算Loss的方法写在forward函数中,无需写反向,即可实现自动求反向操作。关于反向传播可以参看CS231n笔记|4 反向传播与神经网络。
