来源:https://pytorch.org/tutorials/beginner/nn_tutorial.html#refactor-using-optim
The dataset is MNIST (handwritten digits), and there are several ways to obtain it, including the requests-based download used in the tutorial; a brief sketch of that follows for reference. Below it, the raw data is converted back into an image for display.
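A minimal download-and-load sketch along the lines of the tutorial; the download URL is copied from the tutorial and may have moved, so treat it as an assumption:

from pathlib import Path
import pickle
import gzip
import requests

DATA_PATH = Path("data")
PATH = DATA_PATH / "mnist"
PATH.mkdir(parents=True, exist_ok=True)

# URL as given in the tutorial; it may change over time
URL = "https://github.com/pytorch/tutorials/raw/main/_static/"
FILENAME = "mnist.pkl.gz"

if not (PATH / FILENAME).exists():
    content = requests.get(URL + FILENAME).content
    (PATH / FILENAME).open("wb").write(content)

# the pickle holds (train, valid, test) splits of flattened 28x28 images
with gzip.open((PATH / FILENAME).as_posix(), "rb") as f:
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding="latin-1")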
from matplotlib import pyplot

pyplot.imshow(x_train[0].reshape((28, 28)), cmap="gray")
print(x_train.shape)
import torch

x_train, y_train, x_valid, y_valid = map(
    torch.tensor, (x_train, y_train, x_valid, y_valid)
)
import math
weights = torch.randn(784, 10) / math.sqrt(784)
weights.requires_grad_()
bias = torch.zeros(10, requires_grad=True)
Although PyTorch ships with many ready-made loss functions and activation functions, we can implement our own versions, and PyTorch will automatically generate fast GPU or vectorized CPU code for them.
def log_softmax(x):
    return x - x.exp().sum(-1).log().unsqueeze(-1)

def model(xb):
    return log_softmax(xb @ weights + bias)
Here @ stands for matrix multiplication.
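A quick check (not from the tutorial) that @ here is simply Python's matrix-multiplication operator, i.e. equivalent to torch.matmul:

a = torch.randn(2, 3)
b = torch.randn(3, 4)
print(torch.allclose(a @ b, torch.matmul(a, b)))  # True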
bs = 64 # batch size
xb = x_train[0:bs] # a mini-batch from x
preds = model(xb) # predictions
preds[0], preds.shape
print(preds[0], preds.shape)
Out:
tensor([-2.9153, -2.6091, -2.9060, -2.2049, -2.4748, -2.4453, -2.1476, -2.1597,
-2.7930, -1.4146], grad_fn=<SelectBackward>) torch.Size([64, 10])
def nll(input, target):
    return -input[range(target.shape[0]), target].mean()

loss_func = nll
yb = y_train[0:bs]
print(loss_func(preds, yb))
Out:
tensor(2.3504, grad_fn=<NegBackward>)
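A tiny illustration (not from the tutorial) of the indexing trick inside nll: input[range(n), target] picks out, for each row, the log-probability of that row's true class, and the loss is the mean of their negatives.

lp = torch.log(torch.tensor([[0.7, 0.2, 0.1],
                             [0.1, 0.8, 0.1]]))
tgt = torch.tensor([0, 1])
print(lp[range(2), tgt])  # log(0.7) and log(0.8), the true-class entries
print(nll(lp, tgt))       # -(log(0.7) + log(0.8)) / 2 ≈ 0.2899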
def accuracy(out, yb):
    preds = torch.argmax(out, dim=1)
    return (preds == yb).float().mean()
print(accuracy(preds, yb))
Out:
tensor(0.1562)
weights.grad.zero_() and bias.grad.zero_() reset the gradients so they are ready for the next gradient computation; without this the gradients would keep growing, because each backward() call adds to the stored gradients instead of overwriting them.
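A small standalone demonstration (not from the tutorial) of that accumulation behaviour:

w = torch.ones(2, requires_grad=True)
(w * 3).sum().backward()
print(w.grad)  # tensor([3., 3.])
(w * 3).sum().backward()
print(w.grad)  # tensor([6., 6.]) -- accumulated, not overwritten
w.grad.zero_()
print(w.grad)  # tensor([0., 0.]) -- ready for the next backward pass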
set_trace() lets you step through the loop and inspect the value of each variable.
from IPython.core.debugger import set_trace
lr = 0.5  # learning rate
epochs = 2  # how many epochs to train for
n = x_train.shape[0]  # number of training examples (n, c = x_train.shape in the tutorial)

for epoch in range(epochs):
    for i in range((n - 1) // bs + 1):
        # set_trace()
        start_i = i * bs
        end_i = start_i + bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        loss = loss_func(pred, yb)
        loss.backward()
        with torch.no_grad():
            weights -= weights.grad * lr
            bias -= bias.grad * lr
            weights.grad.zero_()
            bias.grad.zero_()
# check the loss and accuracy again
print(loss_func(model(xb), yb), accuracy(model(xb), yb))
Out:
tensor(0.0829, grad_fn=<NegBackward>) tensor(1.)
import torch.nn.functional as F

loss_func = F.cross_entropy

def model(xb):
    return xb @ weights + bias
print(loss_func(model(xb), yb), accuracy(model(xb), yb))
Out:
tensor(0.0829, grad_fn=<NllLossBackward>) tensor(1.)
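F.cross_entropy combines log_softmax and negative log likelihood in a single call, which is why the model no longer applies log_softmax itself. A quick consistency check against the hand-written versions (assuming log_softmax and nll from above are still in scope):

logits = xb @ weights + bias
print(F.cross_entropy(logits, yb))   # same value as ...
print(nll(log_softmax(logits), yb))  # ... the hand-written loss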
Note that Module here is written with a capital M; a lowercase Python module (a file of importable code) is a completely different concept.
from torch import nn

class Mnist_Logistic(nn.Module):
    def __init__(self):
        super().__init__()
        self.weights = nn.Parameter(torch.randn(784, 10) / math.sqrt(784))
        self.bias = nn.Parameter(torch.zeros(10))

    def forward(self, xb):
        return xb @ self.weights + self.bias

model = Mnist_Logistic()
print(loss_func(model(xb), yb))
Out:
tensor(2.3437, grad_fn=<NllLossBackward>)
Original version:

with torch.no_grad():
    weights -= weights.grad * lr
    bias -= bias.grad * lr
    weights.grad.zero_()
    bias.grad.zero_()

Improved version:

with torch.no_grad():
    for p in model.parameters(): p -= p.grad * lr
    model.zero_grad()
Wrapped into a fit() function:
def fit():
    for epoch in range(epochs):
        for i in range((n - 1) // bs + 1):
            start_i = i * bs
            end_i = start_i + bs
            xb = x_train[start_i:end_i]
            yb = y_train[start_i:end_i]
            pred = model(xb)
            loss = loss_func(pred, yb)
            loss.backward()
            with torch.no_grad():
                for p in model.parameters():
                    p -= p.grad * lr
                model.zero_grad()
fit()
print(loss_func(model(xb), yb))
Out:
tensor(0.0812, grad_fn=<NllLossBackward>)
Use nn.Linear to handle both the initialization and the forward computation for us.
Original version:

class Mnist_Logistic(nn.Module):
    def __init__(self):
        super().__init__()
        self.weights = nn.Parameter(torch.randn(784, 10) / math.sqrt(784))
        self.bias = nn.Parameter(torch.zeros(10))

    def forward(self, xb):
        return xb @ self.weights + self.bias

Improved version:

class Mnist_Logistic(nn.Module):
    def __init__(self):
        super().__init__()
        self.lin = nn.Linear(784, 10)

    def forward(self, xb):
        return self.lin(xb)
model = Mnist_Logistic()
print(loss_func(model(xb), yb))
Out:
tensor(2.3360, grad_fn=<NllLossBackward>)
fit()
print(loss_func(model(xb), yb))
Out:
tensor(0.0798, grad_fn=<NllLossBackward>)
Original version:

with torch.no_grad():
    for p in model.parameters(): p -= p.grad * lr
    model.zero_grad()

Improved version, using torch.optim (the manual update becomes just opt.step() followed by opt.zero_grad()):

from torch import optim

def get_model():
    model = Mnist_Logistic()
    return model, optim.SGD(model.parameters(), lr=lr)
model, opt = get_model()
print(loss_func(model(xb), yb))
for epoch in range(epochs):
    for i in range((n - 1) // bs + 1):
        start_i = i * bs
        end_i = start_i + bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        loss = loss_func(pred, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()
print(loss_func(model(xb), yb))
Out:
tensor(2.3593, grad_fn=<NllLossBackward>)
tensor(0.0821, grad_fn=<NllLossBackward>)
If opt is not passed in, no backward pass or parameter update is performed, so the same function can also be used to compute the validation loss.
def loss_batch(model, loss_func, xb, yb, opt=None):
    loss = loss_func(model(xb), yb)

    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()

    return loss.item(), len(xb)
import numpy as np

def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    for epoch in range(epochs):
        model.train()
        for xb, yb in train_dl:
            loss_batch(model, loss_func, xb, yb, opt)

        model.eval()
        with torch.no_grad():
            losses, nums = zip(
                *[loss_batch(model, loss_func, xb, yb) for xb, yb in valid_dl]
            )
        val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)

        print(epoch, val_loss)
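This fit() iterates over DataLoaders, which these notes never construct; following the tutorial, the training and validation tensors are wrapped in TensorDataset and DataLoader roughly like this:

from torch.utils.data import TensorDataset, DataLoader

train_ds = TensorDataset(x_train, y_train)
valid_ds = TensorDataset(x_valid, y_valid)

def get_data(train_ds, valid_ds, bs):
    # shuffle the training set each epoch; validation can use a larger batch
    return (
        DataLoader(train_ds, batch_size=bs, shuffle=True),
        DataLoader(valid_ds, batch_size=bs * 2),
    )

train_dl, valid_dl = get_data(train_ds, valid_ds, bs)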
class Mnist_CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(16, 16, kernel_size=3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(16, 10, kernel_size=3, stride=2, padding=1)

    def forward(self, xb):
        xb = xb.view(-1, 1, 28, 28)
        xb = F.relu(self.conv1(xb))
        xb = F.relu(self.conv2(xb))
        xb = F.relu(self.conv3(xb))
        xb = F.avg_pool2d(xb, 4)
        return xb.view(-1, xb.size(1))
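A quick shape sanity check (not from the tutorial): the three stride-2 convolutions take the 28x28 input to 14x14, 7x7 and then 4x4, and avg_pool2d(xb, 4) reduces that to 1x1, leaving one value per class.

m = Mnist_CNN()
print(m(torch.randn(2, 784)).shape)  # torch.Size([2, 10])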
lr = 0.1
model = Mnist_CNN()
opt = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
fit(epochs, model, loss_func, opt, train_dl, valid_dl)
Out:
0 0.346197331571579
1 0.26924657912254335
nn.Sequential is in fact a simpler way of defining a network. It is paired with Lambda, which turns an arbitrary function into a layer: "Lambda will create a layer that we can then use when defining a network with Sequential." Its definition is as follows:
class Lambda(nn.Module):
    def __init__(self, func):
        super().__init__()
        self.func = func

    def forward(self, x):
        return self.func(x)

def preprocess(x):
    return x.view(-1, 1, 28, 28)

model = nn.Sequential(
    Lambda(preprocess),
    nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1),
    nn.ReLU(),
    nn.Conv2d(16, 16, kernel_size=3, stride=2, padding=1),
    nn.ReLU(),
    nn.Conv2d(16, 10, kernel_size=3, stride=2, padding=1),
    nn.ReLU(),
    nn.AvgPool2d(4),
    Lambda(lambda x: x.view(x.size(0), -1)),
)
opt = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
fit(epochs, model, loss_func, opt, train_dl, valid_dl)
Out:
0 0.38947487244606016
1 0.279226434135437
If a CUDA-capable GPU is available, it can be used to speed up the code.
print(torch.cuda.is_available())
Out:
True
dev = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

def preprocess(x, y):
    return x.view(-1, 1, 28, 28).to(dev), y.to(dev)
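WrappedDataLoader is not defined in these notes; in the tutorial it is a thin wrapper that applies a preprocessing function (here the GPU transfer above) to every batch, along these lines:

class WrappedDataLoader:
    def __init__(self, dl, func):
        self.dl = dl
        self.func = func

    def __len__(self):
        return len(self.dl)

    def __iter__(self):
        for b in self.dl:
            yield (self.func(*b))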
train_dl, valid_dl = get_data(train_ds, valid_ds, bs)
train_dl = WrappedDataLoader(train_dl, preprocess)
valid_dl = WrappedDataLoader(valid_dl, preprocess)
model.to(dev)
opt = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
fit(epochs, model, loss_func, opt, train_dl, valid_dl)