Neural networks such as CNNs are optimized with backpropagation (BP), which requires the error to be differentiable with respect to the weights; this is the precondition for applying BP. Some networks, however, contain operations that do not satisfy this condition.
When every operation in the network is differentiable, it is enough to subclass nn.Module and override only __init__ and forward; torch then computes the gradients automatically, as in the following example:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 3x3 square convolution kernel
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 6 * 6, 120)  # 6*6 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square you can only specify a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

net = Net()
print(net)
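Because every operation in Net is differentiable, a forward pass followed by loss.backward() is all it takes; autograd builds the graph and fills in the gradients. A minimal sketch, assuming PyTorch 0.4+ (the 32x32 input size and the MSE loss are illustrative choices, not part of the tutorial example):

input = torch.randn(1, 1, 32, 32)        # batch of one 32x32 single-channel image
target = torch.randn(1, 10)              # dummy target with the same shape as the output
output = net(input)                      # forward pass
loss = nn.MSELoss()(output, target)      # any differentiable loss works here
net.zero_grad()                          # clear old gradients
loss.backward()                          # autograd computes d(loss)/d(parameter)
print(net.conv1.weight.grad.shape)       # gradients are now populated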
When the network being built is not differentiable everywhere, it is usually because one particular operation is non-differentiable. In that case, subclass autograd.Function and define the backward method of that operation yourself. The following example illustrates how autograd.Function is used.
Suppose the custom LinearFunction below computes
y = x*w + b
and a later part of the graph computes z = f(y), where grad_output = dz/dy. By the chain rule:
1. dz/dx = dz/dy * dy/dx = grad_output * dy/dx = grad_output * w
2. dz/dw = dz/dy * dy/dw = grad_output * dy/dw = grad_output * x
3. dz/db = dz/dy * dy/db = grad_output * 1
These are exactly the three gradients that backward() must return (the corresponding matrix shapes are spelled out in the sketch below).
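In the matrix form used by the code, output = input.mm(weight.t()), so the same chain rule also fixes the shape of each gradient. A small sketch (the sizes 4, 3, 2 are arbitrary) showing how the three formulas line up with the backward() implementation below:

import torch

N, in_features, out_features = 4, 3, 2
input = torch.randn(N, in_features)               # x: (N, in)
weight = torch.randn(out_features, in_features)   # w: (out, in)
grad_output = torch.randn(N, out_features)        # dz/dy: (N, out), same shape as y

grad_input = grad_output.mm(weight)               # dz/dx = grad_output * w    -> (N, in)
grad_weight = grad_output.t().mm(input)           # dz/dw = grad_output^T * x  -> (out, in)
grad_bias = grad_output.sum(0)                    # dz/db = grad_output summed over the batch -> (out,)

print(grad_input.shape, grad_weight.shape, grad_bias.shape)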
from torch.autograd import Function
class LinearFunction(Function):
    # Subclass of torch.autograd.Function.
    # forward() and backward() must both be staticmethods.
    @staticmethod
    # The first argument is ctx, the second is input, the rest are optional.
    # ctx plays the role of self here: attributes stored on ctx can be read back in backward().
    # Everything passed into a custom Function's forward() arrives as plain Tensors:
    # before calling forward(), the autograd engine unpacks Variables into Tensors.
    def forward(ctx, input, weight, bias=None):
        print(type(input))
        ctx.save_for_backward(input, weight, bias)  # stash the tensors that backward() will need
        output = input.mm(weight.t())  # torch.t() transposes a 2D tensor
        if bias is not None:
            output += bias.unsqueeze(0).expand_as(output)
            # unsqueeze(0) inserts a new dimension 0; expand_as(tensor) is equivalent to
            # expand(tensor.size()), i.e. broadcasting the bias to the output's size
        return output

    @staticmethod
    def backward(ctx, grad_output):
        # grad_output is the gradient propagated back from the next operation up the graph
        input, weight, bias = ctx.saved_tensors
        grad_input = grad_weight = grad_bias = None
        # gradients with respect to input, weight and bias
        # only compute a gradient if the corresponding input actually requires one
        if ctx.needs_input_grad[0]:
            grad_input = grad_output.mm(weight)      # chain rule: dz/dx = grad_output * w
        if ctx.needs_input_grad[1]:
            grad_weight = grad_output.t().mm(input)  # chain rule: dz/dw = grad_output^T * x
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum(0).squeeze(0)
        return grad_input, grad_weight, grad_bias
It is usually convenient to wrap the new operation in a plain function and call that instead.
# It is recommended to wrap the new op in a function
def linear(input, weight, bias=None):
    # The first parentheses create a Function object (any arguments given here go to
    # __init__), the second invoke its __call__ operator, which runs forward() and
    # returns the result. Note: this instance-call style only works on legacy
    # PyTorch versions.
    return LinearFunction()(input, weight, bias)  # invokes forward()

# Alternatively (and on current PyTorch versions, necessarily), alias the apply method and call that:
linear = LinearFunction.apply
linear(input, weight, bias)
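The call on the last line assumes that input, weight and bias already exist. As a concrete, purely illustrative usage (assuming PyTorch 0.4+, where Variable is merged into Tensor), the custom op can be cross-checked against torch.nn.functional.linear, which computes the same input.mm(weight.t()) + bias:

import torch
import torch.nn.functional as F

input = torch.randn(4, 3, requires_grad=True)    # (batch, in_features)
weight = torch.randn(2, 3, requires_grad=True)   # (out_features, in_features)
bias = torch.randn(2, requires_grad=True)

out = LinearFunction.apply(input, weight, bias)
print(torch.allclose(out, F.linear(input, weight, bias)))  # True: same forward result

out.sum().backward()          # the custom backward() fills in the gradients
print(weight.grad.shape)      # torch.Size([2, 3])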
# check that the implemented backward() is correct
from torch.autograd import Variable, gradcheck

# gradcheck takes a tuple of tensors as input, checks whether the gradients
# evaluated with these tensors are close enough to numerical approximations,
# and returns True if they all verify this condition.
input = (Variable(torch.randn(20, 20).double(), requires_grad=True),
         Variable(torch.randn(30, 20).double(), requires_grad=True))
test = gradcheck(LinearFunction.apply, input, eps=1e-6, atol=1e-4)
print(test)  # prints True if everything is correct
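gradcheck compares the analytic gradients returned by backward() against finite-difference estimates, which is why the inputs above are cast to .double() and marked requires_grad=True: in single precision the numerical approximation is usually too noisy to pass the tolerance.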
The next step is to embed this custom-gradient function into a network module, with the architecture implemented via nn.Module:
import torch.nn as nn

class Linear(nn.Module):
    def __init__(self, input_features, output_features, bias=True):
        super(Linear, self).__init__()
        self.input_features = input_features
        self.output_features = output_features
        # nn.Parameter is a special kind of Variable that is automatically
        # registered as a parameter of the Module as soon as it is assigned.
        # Important: Parameters require gradients by default!
        self.weight = nn.Parameter(torch.Tensor(output_features, input_features))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(output_features))
        else:
            # You should always register all possible parameters, but the
            # optional ones can be None if you want.
            self.register_parameter('bias', None)

        # Not a very smart way to initialize weights
        self.weight.data.uniform_(-0.1, 0.1)
        if self.bias is not None:
            self.bias.data.uniform_(-0.1, 0.1)

    def forward(self, input):
        # See the autograd section for an explanation of what happens here.
        return LinearFunction.apply(input, self.weight, self.bias)
        # or (legacy style): return LinearFunction()(input, self.weight, self.bias)
The last two lines of forward show the two ways of invoking LinearFunction: via apply, or by creating an instance (legacy style). Note that you must not write LinearFunction().forward(input, self.weight, self.bias): that merely calls forward as an ordinary Python method, so when torch differentiates the forward of the Linear module it traces into LinearFunction.forward and differentiates the operations inside it automatically, and the custom backward defined in LinearFunction is never called.
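A quick, purely illustrative check that the module behaves like a normal layer (the sizes are arbitrary, and torch.randn with requires_grad assumes PyTorch 0.4+):

layer = Linear(3, 2)                  # in_features=3, out_features=2
x = torch.randn(4, 3, requires_grad=True)
y = layer(x)                          # goes through LinearFunction.forward
y.sum().backward()                    # goes through the custom LinearFunction.backward
print(layer.weight.grad.shape)        # torch.Size([2, 3]) -- gradients reach the Parameters
print(x.grad.shape)                   # torch.Size([4, 3])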
Finally, the example from the official tutorial:
# -*- coding: utf-8 -*-
import torch
from torch.autograd import Variable

class MyReLU(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """
    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input

dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor  # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Create random Tensors for weights, and wrap them in Variables.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # To apply our Function, we use Function.apply method. We alias this as 'relu'.
    relu = MyReLU.apply

    # Forward pass: compute predicted y using operations on Variables; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.data[0])

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # Manually zero the gradients after updating weights
    w1.grad.data.zero_()
    w2.grad.data.zero_()
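The listing above uses the pre-0.4 Variable API (loss.data[0], manual .data updates). On current PyTorch versions the same training loop can be written without Variable; a sketch of the equivalent modern loop, assuming MyReLU is defined as above:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
relu = MyReLU.apply
for t in range(500):
    y_pred = relu(x.mm(w1)).mm(w2)     # forward pass through the custom Function
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())              # .item() replaces loss.data[0]

    loss.backward()
    with torch.no_grad():              # update weights without tracking these ops
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()
        w2.grad.zero_()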
References:
https://blog.csdn.net/Hungryof/article/details/78346304
https://blog.csdn.net/tsq292978891/article/details/79364140