PyTorch provides the elegantly designed modules and classes torch.nn, torch.optim, Dataset, and DataLoader to help you create and train neural networks. To fully utilize their power and customize them for your problem, you need to really understand exactly what they're doing. To develop that understanding, we will first train a basic neural net on the MNIST dataset without using any features from these modules; we will initially use only the most basic PyTorch tensor functionality. Then we will incrementally add one feature at a time from torch.nn, torch.optim, Dataset, or DataLoader, showing exactly what each piece does and how it makes the code either more concise or more flexible.
This tutorial assumes you already have PyTorch installed and are familiar with basic tensor operations; if you know how to work with Numpy arrays, PyTorch tensor operations will feel almost identical.
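If you are coming from Numpy, the correspondence is close enough that most idioms carry over directly. Here is a minimal side-by-side sketch (added for illustration, not part of the original tutorial):

import numpy as np
import torch

a_np = np.array([[1.0, 2.0], [3.0, 4.0]])
a_t = torch.tensor([[1.0, 2.0], [3.0, 4.0]])

# element-wise arithmetic, matrix multiplication and reductions use the same syntax
print(a_np * 2, a_t * 2)
print(a_np @ a_np.T, a_t @ a_t.T)
print(a_np.mean(), a_t.mean())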
We will use the classic MNIST dataset, which consists of black-and-white images of hand-drawn digits (0 to 9).

We will use pathlib for dealing with paths, and requests to download the dataset.
Download the dataset
from pathlib import Path
import requests

DATA_PATH = Path("data")
PATH = DATA_PATH / "mnist"
PATH.mkdir(parents=True, exist_ok=True)

URL = "https://github.com/pytorch/tutorials/raw/main/_static/"
FILENAME = "mnist.pkl.gz"

if not (PATH / FILENAME).exists():
    content = requests.get(URL + FILENAME).content
    (PATH / FILENAME).open("wb").write(content)
Load the dataset

This dataset is in numpy array format, and has been stored using pickle, a Python-specific format for serializing data.
import pickle
import gzip

DATA_PATH = Path("data")
PATH = DATA_PATH / "mnist"
PATH.mkdir(parents=True, exist_ok=True)
FILENAME = "mnist.pkl.gz"

with gzip.open((PATH / FILENAME).as_posix(), "rb") as f:
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding="latin-1")
Each image is 28 x 28, and is stored as a flattened row of length 784 (= 28 x 28). Before viewing one, we need to reshape it back to 2D.
from matplotlib import pyplot
import numpy as np
pyplot.imshow(x_train[0].reshape((28,28)), cmap="gray")
print(x_train.shape)
# (50000, 784)
PyTorch uses torch.tensor rather than numpy arrays, so we need to convert our data.
import torch
x_train, y_train, x_valid, y_valid = map(
    torch.tensor, (x_train, y_train, x_valid, y_valid)
)
n, c = x_train.shape
print(x_train, y_train)
print(x_train.shape)
print(y_train.min(), y_train.max())
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]) tensor([5, 0, 4,  ..., 8, 4, 8])
torch.Size([50000, 784])
tensor(0) tensor(9)
Let's first create a model using nothing but PyTorch tensor operations, without any of the higher-level wrappers.

PyTorch provides methods to create tensors filled with random values or zeros, which we will use to create the weights and bias of a simple linear model. These are just regular tensors, with one very special addition: we tell PyTorch that they require a gradient. This causes PyTorch to record all of the operations performed on them, so that the gradients can be computed automatically during back-propagation.

For the weights, we set requires_grad after the initialization by calling weights.requires_grad_(); the trailing _ signifies that the operation is performed in-place.
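For instance, here is a tiny illustration of that naming convention (an illustrative sketch, assuming torch has already been imported as above):

t = torch.zeros(3)
u = t.add(1)   # out-of-place: returns a new tensor, t is unchanged
t.add_(1)      # in-place (trailing underscore): modifies t itself
print(t, u)    # both now print tensor([1., 1., 1.])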
NOTE: We initialize the weights here with Xavier initialisation (by multiplying with 1/sqrt(n)).
import math
weights = torch.randn(784, 10) / math.sqrt(784)
weights.requires_grad_()
bias = torch.zeros(10, requires_grad=True)
Thanks to PyTorch's automatic differentiation, we can use any standard Python function as a model. So let's just write a plain matrix multiplication and broadcasted addition to create a simple linear model. We also need an activation function, so we'll write log_softmax and use it. Remember: although PyTorch provides lots of pre-written loss functions, activation functions, and so forth, you can easily write your own using plain Python. PyTorch will even create fast GPU or vectorized CPU code for your function automatically.
def log_softmax(x):
    return x - x.exp().sum(-1).log().unsqueeze(-1)

def model(xb):
    return log_softmax(xb @ weights + bias)
In the above, @ stands for matrix multiplication. We will call our model on one batch of data (in this case, 64 images); this is one forward pass. Note that at this stage our predictions won't be any better than random, since we start with random weights.
bs = 64  # batch size

xb = x_train[0:bs]  # a mini-batch from x
preds = model(xb)   # predictions
print(preds[0], preds.shape)
tensor([-2.5138, -1.8509, -2.5235, -3.0260, -2.1639, -2.7076, -1.9549, -2.2125,
        -2.2198, -2.3855], grad_fn=<SelectBackward0>) torch.Size([64, 10])
As you can see, the preds tensor contains not only the values but also a gradient function, which we will use later for back-propagation. Let's implement negative log-likelihood as the loss function (again, using just standard Python):
# negative log-likelihood
def nll(input, target):
    return -input[range(target.shape[0]), target].mean()

loss_func = nll
yb = y_train[0:bs]
print(loss_func(preds, yb))
Let's check our loss with our random model, so we can see whether we improve after a back-propagation pass later.

tensor(2.3346, grad_fn=<NegBackward0>)
Let's also implement a function to compute the accuracy of our model: for each prediction, if the index of the largest value matches the target value, the prediction is correct.
def accuracy(out, yb):
    preds = torch.argmax(out, dim=1)
    return (preds == yb).float().mean()

print(accuracy(preds, yb))
Let's record the accuracy of our random model, so we can see whether it improves as the loss improves.
tensor(0.1094)
We can now run a training loop. For each iteration, we will:
- select a mini-batch of data (of size bs)
- use the model to make predictions
- calculate the loss
- call loss.backward() to update the gradients of the model, in this case weights and bias

We then use those gradients to update the weights and bias inside torch.no_grad(), and reset the gradients to zero so they are ready for the next loop.

from IPython.core.debugger import set_trace
lr = 0.5     # learning rate
epochs = 10  # how many epochs to train for

for epoch in range(epochs):
    for i in range((n - 1) // bs + 1):
        # set_trace()
        # grab a mini-batch of data
        start_i = i * bs
        end_i = start_i + bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        # compute the loss
        loss = loss_func(pred, yb)
        # back-propagate the loss
        loss.backward()
        # update the parameters
        with torch.no_grad():
            weights -= weights.grad * lr
            bias -= bias.grad * lr
            weights.grad.zero_()
            bias.grad.zero_()

print(loss_func(model(xb), yb), accuracy(model(xb), yb))
That's it: we have created and trained a minimal neural network (in this case, a logistic regression, since we have no hidden layers) entirely by hand. Let's check the loss and accuracy and compare them with what we got earlier; we expect the loss to have decreased and the accuracy to have increased, and they have.

tensor(0.0528, grad_fn=<NegBackward0>)
# Complete code
import pickle
import gzip
import math
import torch
from pathlib import Path

DATA_PATH = Path("data")
PATH = DATA_PATH / "mnist"
PATH.mkdir(parents=True, exist_ok=True)
FILENAME = "mnist.pkl.gz"

# load the data into x_train, y_train and x_valid, y_valid
with gzip.open((PATH / FILENAME).as_posix(), "rb") as f:
    ((x_train, y_train), (x_valid, y_valid),
     _) = pickle.load(f, encoding="latin-1")

# convert to tensors
x_train, y_train, x_valid, y_valid = map(
    torch.tensor, (x_train, y_train, x_valid, y_valid))

# ## inspect the data
n, c = x_train.shape
# print(x_train, y_train)
# print(x_train.shape)
# print(y_train.min(), y_train.max())

# initialize the parameters
weights = torch.randn(784, 10) / math.sqrt(784)
weights.requires_grad_()
bias = torch.zeros(10, requires_grad=True)

# define the model
def log_softmax(x):
    return x - x.exp().sum(-1).log().unsqueeze(-1)

def model(xb):
    return log_softmax(xb @ weights + bias)

# define the loss function: negative log-likelihood
def nll(input, target):
    return -input[range(target.shape[0]), target].mean()

loss_func = nll

# define accuracy
def accuracy(out, yb):
    preds = torch.argmax(out, dim=1)
    return (preds == yb).float().mean()

# train the model
lr = 0.5    # learning rate
epochs = 5  # how many epochs to train for
bs = 64     # batch size

for epoch in range(epochs):
    for i in range((n - 1) // bs + 1):
        # grab a mini-batch of data
        start_i = i * bs
        end_i = start_i + bs
        xb = x_train[start_i:end_i]
        yb = y_train[start_i:end_i]
        pred = model(xb)
        # compute the loss
        loss = loss_func(pred, yb)
        # back-propagate the loss
        loss.backward()
        # update the parameters
        with torch.no_grad():
            weights -= weights.grad * lr
            bias -= bias.grad * lr
            weights.grad.zero_()
            bias.grad.zero_()

print(loss_func(model(xb), yb), accuracy(model(xb), yb))
tensor(0.0625, grad_fn=<NegBackward0>) tensor(1.)
We will now refactor our code so that it does the same thing as before, but starts taking advantage of PyTorch's nn classes to make it more concise and flexible.

The first and easiest step is to make the code shorter by replacing our hand-written activation and loss functions with those from torch.nn.functional (which is generally imported into the namespace F by convention). This module contains all the functions in the torch.nn library (whereas other parts of the library contain classes). As well as a wide range of loss and activation functions, you'll also find here some convenient functions for creating neural nets, such as pooling operations. (There are also functions for doing convolutions, linear layers, etc., but as we'll see, these are usually better handled using other parts of the library.)

If you're using negative log-likelihood loss and log-softmax activation, then PyTorch provides a single function, F.cross_entropy, that combines the two. So we can even remove the activation function from our model.
import torch.nn.functional as F

loss_func = F.cross_entropy

def model(xb):
    return xb @ weights + bias

print(loss_func(model(xb), yb), accuracy(model(xb), yb))
tensor(0.0625, grad_fn=<NegBackward0>) tensor(1.)
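To convince yourself that F.cross_entropy really does combine log_softmax and negative log-likelihood, you can compare it against the hand-written versions on a batch. This is just a sanity-check sketch, assuming the log_softmax, nll, weights, bias, xb and yb defined earlier are still in scope:

# F.cross_entropy(logits, target) should match nll(log_softmax(logits), target)
logits = xb @ weights + bias
manual = nll(log_softmax(logits), yb)
builtin = F.cross_entropy(logits, yb)
print(torch.allclose(manual, builtin, atol=1e-6))  # expected: True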
Next up, we'll use nn.Module and nn.Parameter for a clearer and more concise training loop. We subclass nn.Module (which itself is a class and able to keep track of state). In this case, we want to create a class that holds our weights, bias, and the method for the forward step. nn.Module has a number of attributes and methods (such as .parameters() and .zero_grad()) which we will be using.
NOTE: nn.Module (uppercase M) is a PyTorch-specific concept, and is a class we'll be using a lot. It is not to be confused with the Python concept of a (lowercase m) module, which is a file of Python code that can be imported.
################ Refactor 2
# # initialize the parameters
# weights = torch.randn(784, 10) / math.sqrt(784)
# weights.requires_grad_()
# bias = torch.zeros(10, requires_grad=True)
# # define the model
# def log_softmax(x):
#     return x - x.exp().sum(-1).log().unsqueeze(-1)
# def model(xb):
#     return log_softmax(xb @ weights + bias)
from torch import nn

class Mnist_Logistic(nn.Module):
    def __init__(self):
        super().__init__()
        self.weights = nn.Parameter(torch.randn(784, 10) / math.sqrt(784))
        self.bias = nn.Parameter(torch.zeros(10))

    def forward(self, xb):
        return xb @ self.weights + self.bias

# since the model is now an object rather than a function, we first instantiate it
model = Mnist_Logistic()
################ Refactor 2
# for epoch in range(epochs):
#     for i in range((n - 1) // bs + 1):
#         # grab a mini-batch of data
#         start_i = i * bs
#         end_i = start_i + bs
#         xb = x_train[start_i:end_i]
#         yb = y_train[start_i:end_i]
#         pred = model(xb)
#         # compute the loss
#         loss = loss_func(pred, yb)
#         # back-propagate the loss
#         loss.backward()
#         # update the parameters
#         with torch.no_grad():
#             weights -= weights.grad * lr
#             bias -= bias.grad * lr
#             weights.grad.zero_()
#             bias.grad.zero_()
def fit():
    for epoch in range(epochs):
        for i in range((n - 1) // bs + 1):
            start_i = i * bs
            end_i = start_i + bs
            xb = x_train[start_i:end_i]
            yb = y_train[start_i:end_i]
            pred = model(xb)
            loss = loss_func(pred, yb)
            loss.backward()
            with torch.no_grad():
                for p in model.parameters():
                    p -= p.grad * lr
                model.zero_grad()

fit()
PyTorch comes with many predefined layers that can greatly simplify our code, and often make it faster too.
class Mnist_Logistic(nn.Module):
    def __init__(self):
        super().__init__()
        ################ Refactor 3
        # self.weights = nn.Parameter(torch.randn(784, 10) / math.sqrt(784))
        # self.bias = nn.Parameter(torch.zeros(10))
        self.linear = nn.Linear(784, 10)

    def forward(self, xb):
        ################ Refactor 3
        # return xb @ self.weights + self.bias
        return self.linear(xb)
PyTorch also has a package with various optimization algorithms, torch.optim. We can use the step method of an optimizer to update the parameters, instead of updating each one manually.
################ Refactor 4
from torch import optim

# model = Mnist_Logistic()
def get_model():
    model = Mnist_Logistic()
    return model, optim.SGD(model.parameters(), lr=lr)

model, opt = get_model()

################ Refactor 4
# update the parameters
# with torch.no_grad():
#     for p in model.parameters():
#         p -= p.grad * lr
#     model.zero_grad()
opt.step()
opt.zero_grad()
TensorDataset wraps x_train and y_train into a single dataset, which makes it easy to iterate over and slice both together.
################## Refactor 5
from torch.utils.data import TensorDataset

train_ds = TensorDataset(x_train, y_train)

################## Refactor 5
# start_i = i * bs
# end_i = start_i + bs
# xb = x_train[start_i:end_i]
# yb = y_train[start_i:end_i]
xb, yb = train_ds[i*bs: i*bs+bs]
DataLoader manages batches for us, making it easy to iterate over the data one mini-batch at a time.
################## Refactor 6
from torch.utils.data import DataLoader

train_dl = DataLoader(train_ds, batch_size=bs)

################## Refactor 6
# def fit():
#     for epoch in range(epochs):
#         for i in range((n - 1) // bs + 1):
#             ################## Refactor 5
#             # start_i = i * bs
#             # end_i = start_i + bs
#             # xb = x_train[start_i:end_i]
#             # yb = y_train[start_i:end_i]
#             xb, yb = train_ds[i*bs: i*bs+bs]
#             pred = model(xb)
#             # compute the loss
#             loss = loss_func(pred, yb)
#             # back-propagate the loss
#             loss.backward()
#             ################ Refactor 4
#             # update the parameters
#             # with torch.no_grad():
#             #     for p in model.parameters():
#             #         p -= p.grad * lr
#             #     model.zero_grad()
#             opt.step()
#             opt.zero_grad()
def fit():
    for epoch in range(epochs):
        for xb, yb in train_dl:
            pred = model(xb)
            # compute the loss
            loss = loss_func(pred, yb)
            # back-propagate the loss
            loss.backward()
            ################ Refactor 4
            # update the parameters via the optimizer
            opt.step()
            opt.zero_grad()
# Complete code --- Refactor 1 through Refactor 6
import pickle
import gzip
import math
import torch
import torch.nn.functional as F
from pathlib import Path
from torch import nn
from torch import optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

DATA_PATH = Path("data")
PATH = DATA_PATH / "mnist"
PATH.mkdir(parents=True, exist_ok=True)
FILENAME = "mnist.pkl.gz"

# load the data into x_train, y_train and x_valid, y_valid
with gzip.open((PATH / FILENAME).as_posix(), "rb") as f:
    ((x_train, y_train), (x_valid, y_valid),
     _) = pickle.load(f, encoding="latin-1")

# convert to tensors
x_train, y_train, x_valid, y_valid = map(
    torch.tensor, (x_train, y_train, x_valid, y_valid))

# hyperparameters (defined before get_model(), which needs lr)
lr = 0.01   # learning rate
epochs = 5  # how many epochs to train for
bs = 64     # batch size

################ Refactor 2
# # initialize the parameters
# weights = torch.randn(784, 10) / math.sqrt(784)
# weights.requires_grad_()
# bias = torch.zeros(10, requires_grad=True)
# # define the model
# def log_softmax(x):
#     return x - x.exp().sum(-1).log().unsqueeze(-1)
# def model(xb):
#     return log_softmax(xb @ weights + bias)
class Mnist_Logistic(nn.Module):
    def __init__(self):
        super().__init__()
        ################ Refactor 3
        # self.weights = nn.Parameter(torch.randn(784, 10) / math.sqrt(784))
        # self.bias = nn.Parameter(torch.zeros(10))
        self.linear = nn.Linear(784, 10)

    def forward(self, xb):
        ################ Refactor 3
        # return xb @ self.weights + self.bias
        return self.linear(xb)

################ Refactor 4
# model = Mnist_Logistic()
def get_model():
    model = Mnist_Logistic()
    return model, optim.SGD(model.parameters(), lr=lr)

model, opt = get_model()

################ Refactor 1
# define the loss function
# # negative log-likelihood
# def nll(input, target):
#     return -input[range(target.shape[0]), target].mean()
# loss_func = nll
loss_func = F.cross_entropy

def accuracy(out, yb):
    """Fraction of predictions whose argmax matches the target."""
    preds = torch.argmax(out, dim=1)
    return (preds == yb).float().mean()

################ Refactor 2
# for epoch in range(epochs):
#     for i in range((n - 1) // bs + 1):
#         # grab a mini-batch of data
#         start_i = i * bs
#         end_i = start_i + bs
#         xb = x_train[start_i:end_i]
#         yb = y_train[start_i:end_i]
#         pred = model(xb)
#         # compute the loss
#         loss = loss_func(pred, yb)
#         # back-propagate the loss
#         loss.backward()
#         # update the parameters
#         with torch.no_grad():
#             weights -= weights.grad * lr
#             bias -= bias.grad * lr
#             weights.grad.zero_()
#             bias.grad.zero_()
################## Refactor 5
train_ds = TensorDataset(x_train, y_train)
################## Refactor 6
train_dl = DataLoader(train_ds, batch_size=bs)

################## Refactor 6
# def fit():
#     for epoch in range(epochs):
#         for i in range((n - 1) // bs + 1):
#             ################## Refactor 5
#             # start_i = i * bs
#             # end_i = start_i + bs
#             # xb = x_train[start_i:end_i]
#             # yb = y_train[start_i:end_i]
#             xb, yb = train_ds[i*bs: i*bs+bs]
#             pred = model(xb)
#             # compute the loss
#             loss = loss_func(pred, yb)
#             # back-propagate the loss
#             loss.backward()
#             ################ Refactor 4
#             # update the parameters
#             # with torch.no_grad():
#             #     for p in model.parameters():
#             #         p -= p.grad * lr
#             #     model.zero_grad()
#             opt.step()
#             opt.zero_grad()
def fit():
    for epoch in range(epochs):
        for xb, yb in train_dl:
            pred = model(xb)
            # compute the loss
            loss = loss_func(pred, yb)
            # back-propagate the loss
            loss.backward()
            ################ Refactor 4
            # update the parameters via the optimizer
            opt.step()
            opt.zero_grad()

fit()

# check the loss and accuracy on a mini-batch of training data
xb, yb = x_train[0:bs], y_train[0:bs]
print(loss_func(model(xb), yb), accuracy(model(xb), yb))
# Complete code --- after all refactors
import pickle
import gzip
import torch
import torch.nn.functional as F
from pathlib import Path
from torch import nn
from torch import optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

DATA_PATH = Path("data")
PATH = DATA_PATH / "mnist"
PATH.mkdir(parents=True, exist_ok=True)
FILENAME = "mnist.pkl.gz"

# load the data into x_train, y_train and x_valid, y_valid
with gzip.open((PATH / FILENAME).as_posix(), "rb") as f:
    ((x_train, y_train), (x_valid, y_valid),
     _) = pickle.load(f, encoding="latin-1")

# convert to tensors
x_train, y_train, x_valid, y_valid = map(
    torch.tensor, (x_train, y_train, x_valid, y_valid))

# define the model
class Mnist_Logistic(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(784, 10)

    def forward(self, xb):
        return self.linear(xb)

def get_model():
    model = Mnist_Logistic()
    return model, optim.SGD(model.parameters(), lr=lr)

# loss function
loss_func = F.cross_entropy

def accuracy(out, yb):
    """Fraction of predictions whose argmax matches the target."""
    preds = torch.argmax(out, dim=1)
    return (preds == yb).float().mean()

# hyperparameters
lr = 0.01   # learning rate
epochs = 5  # how many epochs to train for
bs = 64     # batch size

# instantiate the model and optimizer
model, opt = get_model()

# dataset and dataloader
train_ds = TensorDataset(x_train, y_train)
train_dl = DataLoader(train_ds, batch_size=bs)

def fit():
    for epoch in range(epochs):
        for xb, yb in train_dl:
            pred = model(xb)
            # compute the loss
            loss = loss_func(pred, yb)
            # back-propagate the loss
            loss.backward()
            # update the parameters via the optimizer
            opt.step()
            opt.zero_grad()

fit()

# check the loss and accuracy on a mini-batch of training data
xb, yb = x_train[0:bs], y_train[0:bs]
print(loss_func(model(xb), yb), accuracy(model(xb), yb))
We have now built a complete training pipeline. We should also add a validation set, so we can tell whether the model is overfitting.

Shuffling the training data is important to prevent correlation between batches and overfitting. On the other hand, the validation loss is identical whether we shuffle the validation set or not; since shuffling takes extra time, there is no point in shuffling the validation data.

Because the validation set needs no back-propagation and therefore uses less memory, we can use a validation batch size that is twice as large as the training batch size.
# Complete code --- after all refactors, with a validation set
import pickle
import gzip
import numpy as np
import torch
import torch.nn.functional as F
from pathlib import Path
from torch import nn
from torch import optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

DATA_PATH = Path("data")
PATH = DATA_PATH / "mnist"
PATH.mkdir(parents=True, exist_ok=True)
FILENAME = "mnist.pkl.gz"

# load the data into x_train, y_train and x_valid, y_valid
with gzip.open((PATH / FILENAME).as_posix(), "rb") as f:
    ((x_train, y_train), (x_valid, y_valid),
     _) = pickle.load(f, encoding="latin-1")

# convert to tensors
x_train, y_train, x_valid, y_valid = map(
    torch.tensor, (x_train, y_train, x_valid, y_valid))

# define the model
class Mnist_Logistic(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(784, 10)

    def forward(self, xb):
        return self.linear(xb)

def get_model():
    model = Mnist_Logistic()
    return model, optim.SGD(model.parameters(), lr=lr)

# loss function
loss_func = F.cross_entropy

def accuracy(out, yb):
    """Fraction of predictions whose argmax matches the target."""
    preds = torch.argmax(out, dim=1)
    return (preds == yb).float().mean()

def loss_batch(model, loss_func, xb, yb, opt=None):
    """Compute the loss for one batch; update the parameters if an optimizer is passed."""
    loss = loss_func(model(xb), yb)
    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()
    return loss.item(), len(xb)

# hyperparameters
lr = 0.01   # learning rate
epochs = 5  # how many epochs to train for
bs = 64     # batch size

# datasets
train_ds = TensorDataset(x_train, y_train)
# train_dl = DataLoader(train_ds, batch_size=bs)
valid_ds = TensorDataset(x_valid, y_valid)
# valid_dl = DataLoader(valid_ds, batch_size=bs * 2)

def get_data(train_ds, valid_ds, bs):
    # shuffle the training data; the validation set uses a doubled batch size
    return (
        DataLoader(train_ds, batch_size=bs, shuffle=True),
        DataLoader(valid_ds, batch_size=bs * 2),
    )

def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    for epoch in range(epochs):
        model.train()
        for xb, yb in train_dl:
            loss_batch(model, loss_func, xb, yb, opt)

        model.eval()
        with torch.no_grad():
            losses, nums = zip(
                *[loss_batch(model, loss_func, xb, yb) for xb, yb in valid_dl]
            )
        val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
        print(epoch, val_loss)

train_dl, valid_dl = get_data(train_ds, valid_ds, bs)
model, opt = get_model()
fit(epochs, model, loss_func, opt, train_dl, valid_dl)

# check the loss and accuracy on a mini-batch of training data
xb, yb = x_train[0:bs], y_train[0:bs]
print(loss_func(model(xb), yb), accuracy(model(xb), yb))