PyTorch has two main features: 1. Tensors, which are very similar to NumPy arrays but can also run on a GPU (I am not using one here); 2. automatic differentiation (autograd).
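A minimal sketch (not part of the original notes) of what "similar to NumPy, can run on a GPU" means in practice; the variable names here are just for illustration:

import torch
import numpy as np

a = torch.randn(3, 4)                                     # a Tensor, created much like np.random.randn(3, 4)
b = torch.from_numpy(np.ones((3, 4), dtype=np.float32))   # NumPy array -> Tensor
c = (a + b).numpy()                                       # CPU Tensor -> NumPy array

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
a_gpu = a.to(device)                                      # move the Tensor to the GPU if one is available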
Below is an implementation of a simple two-layer network (fc-ReLU-fc) using plain Tensors:
import torch
device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU
# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
# Create random input and output data
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)
learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y
    h = x.mm(w1)
    h_relu = h.clamp(min=0)  # clamp clips every element to the range [min, max]
    y_pred = h_relu.mm(w2)

    # Compute and print loss; loss is a scalar, and is stored in a PyTorch Tensor
    # of shape (); we can get its value as a Python number with loss.item().
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())  # even a scalar lives in a Tensor; .item() extracts the Python number

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)  # .t() returns the transpose
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
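A quick illustration of the Tensor operations used above (clamp, .t(), .item()) on a small tensor; this is only a sketch for reference, not part of the original code:

import torch

m = torch.tensor([[-1.0, 2.0], [3.0, -4.0]])
print(m.clamp(min=0))         # tensor([[0., 2.], [3., 0.]]) -- negative entries clipped to 0
print(m.t())                  # tensor([[-1., 3.], [2., -4.]]) -- transpose
print(m.pow(2).sum().item())  # 30.0 -- .item() turns a 0-dim Tensor into a Python number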
The next part introduces PyTorch's autograd. For a Tensor whose gradient we need, we pass requires_grad=True when creating it; its gradient is then stored in another Tensor, accessible as x.grad. Sometimes we want to perform operations on Tensors with requires_grad=True without tracking gradients (for example, the weight update itself); in that case we wrap those operations in a with torch.no_grad(): block, as shown in the code below.
import torch
device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)
learning_rate = 1e-6
for t in range(500):
    # Forward pass, condensed into a single line
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Backprop is also a single line; the gradients are stored automatically in the
    # .grad attribute of each Tensor created with requires_grad=True
    loss.backward()

    # Update weights using gradient descent. For this step we just want to mutate
    # the values of w1 and w2 in-place; we don't want to build up a computational
    # graph for the update steps, so we use the torch.no_grad() context manager
    # to prevent PyTorch from building a computational graph for the updates
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients of the parameters after each backward pass
        w1.grad.zero_()
        w2.grad.zero_()
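The reason for zeroing the gradients every iteration is that .backward() accumulates into .grad rather than overwriting it. A minimal sketch (not from the original notes) showing the accumulation:

import torch

x = torch.ones(3, requires_grad=True)
(2 * x).sum().backward()
print(x.grad)   # tensor([2., 2., 2.])
(2 * x).sum().backward()
print(x.grad)   # tensor([4., 4., 4.]) -- the second backward added onto the first
x.grad.zero_()  # reset before the next iteration, as in the training loop above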
We can also define our own autograd operations by subclassing torch.autograd.Function:
import torch


class MyReLU(torch.autograd.Function):
    @staticmethod  # methods marked @staticmethod can be called without creating an instance
    def forward(ctx, x):
        """
        In the forward pass we receive a context object and a Tensor containing the
        input; we must return a Tensor containing the output, and we can use the
        context object to cache objects for use in the backward pass.
        """
        ctx.save_for_backward(x)
        return x.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive the context object and a Tensor containing
        the gradient of the loss with respect to the output produced during the
        forward pass. We can retrieve cached data from the context object, and must
        compute and return the gradient of the loss with respect to the input to the
        forward function.
        """
        x, = ctx.saved_tensors  # the trailing comma unpacks the one-element tuple returned by saved_tensors
        grad_x = grad_output.clone()
        grad_x[x < 0] = 0
        return grad_x


device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and output
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Create random Tensors for weights.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; we call our
    # custom ReLU implementation using the MyReLU.apply function
    y_pred = MyReLU.apply(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    with torch.no_grad():
        # Update weights using gradient descent
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after running the backward pass
        w1.grad.zero_()
        w2.grad.zero_()
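As a sanity check on the custom backward (a sketch added here, not part of the original notes), torch.autograd.gradcheck compares the analytic gradient with a numerical estimate; it needs double-precision inputs with requires_grad=True:

check_input = torch.randn(20, dtype=torch.double, requires_grad=True)
# ReLU is not differentiable at exactly 0, so inputs extremely close to 0 could make
# the numerical check fail; with random doubles this is very unlikely.
print(torch.autograd.gradcheck(MyReLU.apply, (check_input,)))  # prints True if gradients match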
For complex networks, raw autograd is still too low-level. PyTorch's nn package defines a set of Modules, which can be thought of as the "layers" of a neural network. A Module receives input Tensors and produces output Tensors, and it holds its learnable parameters internally. The nn package also provides a set of common loss functions.
import torch

device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

N, D_in, H, D_out = 64, 1000, 100, 10

x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Tensors for its weight and bias.
# After constructing the model we use the .to() method to move it to the
# desired device.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device)

# The nn package also contains definitions of popular loss functions; in this
# case we will use Mean Squared Error (MSE) as our loss function.
loss_fn = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    # Forward pass: compute predicted y by passing x to the model. Module objects
    # override the __call__ operator so you can call them like functions. When
    # doing so you pass a Tensor of input data to the Module and it produces
    # a Tensor of output data.
    y_pred = model(x)

    # Compute and print loss. We pass Tensors containing the predicted and true
    # values of y, and the loss function returns a Tensor containing the loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute gradient of the loss with respect to all the learnable
    # parameters of the model. Internally, the parameters of each Module are stored
    # in Tensors with requires_grad=True, so this call will compute gradients for
    # all learnable parameters in the model.
    loss.backward()

    # Update the weights using gradient descent. Each parameter is a Tensor, so
    # we can access its data and gradients like we did before.
    with torch.no_grad():
        for param in model.parameters():
            param.data -= learning_rate * param.grad  # .data gives the raw value Tensor of the parameter
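To see the "internal Tensors for weight and bias" that each Linear layer holds, we can list the model's parameters; a small sketch, not from the original notes:

for name, param in model.named_parameters():
    print(name, tuple(param.shape), param.requires_grad)
# 0.weight (100, 1000) True   <- first Linear: (H, D_in)
# 0.bias   (100,)      True
# 2.weight (10, 100)   True   <- second Linear: (D_out, H)
# 2.bias   (10,)       True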
PyTorch's optim package implements common optimization algorithms such as Adam and RMSProp (note that in the previous programs we updated the parameters by hand).
import torch

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we will use Adam; the optim package contains many other
# optimization algorithms. The first argument to the Adam constructor tells the
# optimizer which Tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the Tensors it will update (which are the learnable weights
    # of the model)
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its parameters
    optimizer.step()
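Switching to another optimizer mentioned above, such as RMSProp or plain SGD with momentum, only requires changing the constructor; the rest of the training loop stays the same (a sketch, not from the original notes):

optimizer = torch.optim.RMSprop(model.parameters(), lr=1e-4)
# or:
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)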
We can also define our own models by subclassing torch.nn.Module:
import torch


class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """
        super(TwoLayerNet, self).__init__()  # initialize the base nn.Module before adding submodules
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary (differentiable) operations on Tensors.
        """
        h_relu = self.linear1(x).clamp(min=0)
        y_pred = self.linear2(h_relu)
        return y_pred


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above.
model = TwoLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

To summarize, the important packages to know when getting started with PyTorch are nn (losses and Module) and optim.