原文链接:http://chenhao.space/post/7f30013.html
import torch
from time import time
print(torch.__version__)
1.1.0
a = torch.ones(1000)
b = torch.ones(1000)
将这两个向量按元素逐一做标量加法:
# Baseline for the comparison below: add the two vectors one element at a
# time in a Python loop and report the elapsed wall-clock time.
start = time()
c = torch.zeros(1000)
for i in range(1000):
    # One indexed read per operand and one indexed write per element.
    c[i] = a[i] + b[i]
print(time() - start)
0.014781951904296875
将这两个向量直接做矢量加法:
start = time()
d = a + b
print(time() - start)
0.0003459453582763672
结果很明显,后者比前者更省时。因此,我们应该尽可能采用矢量计算,以提升计算效率。
广播机制例子:
a = torch.ones(3)
b = 10
print(a + b)
tensor([11., 11., 11.])
设房屋的面积为 x1 ,房龄为 x2 ,售出价格为 y 。我们需要建立基于输入 x1 和 x2 来计算输出 y 的表达式,也就是模型(model)。顾名思义,线性回归假设输出与各个输入之间是线性关系:
$$\hat{y} = x_1 w_1 + x_2 w_2 + b,$$
其中 w1 和 w2 是权重(weight), b 是偏差(bias),且均为标量。
%matplotlib inline
import torch
from IPython import display
from matplotlib import pyplot as plt
import numpy as np
import random
print(torch.__version__)
torch.set_default_tensor_type('torch.FloatTensor')
1.1.0
num_inputs = 2
num_examples = 1000
true_w = [2, -3.4]
true_b = 4.2
features = torch.randn(num_examples, num_inputs)
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float)
# torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float)
# 是增加的随机噪声,噪声代表了数据集中无意义的干扰
# 注意,features的每一行是一个长度为2的向量,而labels的每一行是一个长度为1的向量(标量)。
print(features[0], labels[0])
print(labels.size())
tensor([1.0796, 0.3098]) tensor(5.3111)
torch.Size([1000])
def use_svg_display():
    """Display figures as SVG (vector graphics) via IPython's ``display``."""
    figure_format = 'svg'
    display.set_matplotlib_formats(figure_format)
def set_figsize(figsize=(3.5, 2.5)):
    """Switch to SVG figure output and set matplotlib's default figure size.

    Args:
        figsize: Value stored under the ``'figure.figsize'`` rcParam.
            Defaults to ``(3.5, 2.5)``.
    """
    use_svg_display()
    # Store the requested size in matplotlib's global rc settings.
    rc_params = plt.rcParams
    rc_params['figure.figsize'] = figsize
# # 在../d2lzh_pytorch里面添加上面两个函数后就可以这样导入
# import sys
# sys.path.append("..")
# from d2lzh_pytorch import *
set_figsize()
plt.scatter(features[:, 1].numpy(), labels.numpy(), 1);
"""
在训练模型的时候,我们需要遍历数据集并不断读取小批量数据样本。
这里我们定义一个函数:它每次返回batch_size(批量大小)个随机样本的特征和标签。
"""
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def data_iter(batch_size, features, labels):
    """Yield mini-batches of ``(features, labels)`` in random order.

    The sample indices are shuffled with the ``random`` module once per pass,
    then consumed in consecutive chunks of ``batch_size``. Every sample is
    yielded exactly once; the final batch is shorter when the number of
    samples is not a multiple of ``batch_size``.

    Args:
        batch_size: Number of samples per batch.
        features: Tensor whose dimension 0 indexes the samples.
        labels: Tensor whose dimension 0 indexes the samples, aligned with
            ``features``.

    Yields:
        A tuple ``(X, y)`` holding the rows of ``features`` and ``labels``
        selected by the same index batch.
    """
    total = len(features)
    order = list(range(total))
    random.shuffle(order)  # samples are read in random order
    for begin in range(0, total, batch_size):
        # The last chunk may contain fewer than batch_size samples.
        end = min(begin + batch_size, total)
        batch_idx = torch.LongTensor(order[begin:end])
        yield features.index_select(0, batch_idx), labels.index_select(0, batch_idx)
让我们读取第一个小批量数据样本并打印。每个批量的特征形状为(10, 2),分别对应批量大小和输入个数;标签形状为批量大小。
batch_size = 10
for X, y in data_iter(batch_size, features, labels):
print(X, '\n', y)
break
tensor([[ 0.0758, -0.3826],
[-0.0732, -2.8249],
[-0.7557, 0.7309],
[-0.6299, -0.1189],
[-0.9627, 0.0947],
[ 1.3580, 1.2370],
[-0.3616, 0.3718],
[-0.5052, -1.4169],
[-1.4944, -1.5984],
[-0.2773, -0.4932]])
tensor([ 5.6482, 13.6693, 0.2113, 3.3605, 1.9548, 2.7224, 2.2263, 7.9824,
6.6399, 5.3169])
我们将权重初始化成均值为0、标准差为0.01的正态随机数,偏差则初始化成0。
w = torch.tensor(np.random.normal(0, 0.01, (num_inputs, 1)), dtype=torch.float)
b = torch.zeros(1)
之后的模型训练中,需要对这些参数求梯度来迭代参数的值,因此我们需要将它们的requires_grad属性设为True,以便自动求梯度模块跟踪这些参数。
w.requires_grad_(requires_grad=True)
b.requires_grad_(requires_grad=True)
tensor([0.], requires_grad=True)
使用mm()函数做矩阵乘法。
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def linreg(X, w, b):
    """Linear regression model: the matrix product of ``X`` and ``w`` plus ``b``.

    Args:
        X: 2-D input tensor (``mm`` multiplies matrices).
        w: 2-D weight tensor whose row count matches the columns of ``X``.
        b: Bias added to the matrix product.

    Returns:
        The tensor ``X.mm(w) + b``.
    """
    projection = X.mm(w)
    return projection + b
我们需要把真实值y变形成预测值y_hat的形状。以下函数返回的结果也将和y_hat的形状相同。
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def squared_loss(y_hat, y):
    """Element-wise squared error divided by two.

    Args:
        y_hat: Predicted values.
        y: True values; viewed with the shape of ``y_hat`` before subtracting,
            so it must hold the same number of elements.

    Returns:
        A tensor shaped like ``y_hat`` containing ``(y_hat - y) ** 2 / 2``
        for each element (no reduction is applied).
    """
    target = y.view(y_hat.size())
    residual = y_hat - target
    return residual ** 2 / 2
以下的sgd函数实现了小批量随机梯度下降算法。它通过不断迭代模型参数来优化损失函数。这里自动求梯度模块计算得来的梯度是一个批量样本的梯度和。我们将它除以批量大小来得到平均值。
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def sgd(params, lr, batch_size):
    """Apply one mini-batch stochastic gradient descent step in place.

    Each parameter is updated as ``param -= lr * param.grad / batch_size``.
    The division by ``batch_size`` turns a gradient accumulated from a summed
    batch loss into a per-sample average.

    Args:
        params: Iterable of tensors to update.
        lr: Learning rate.
        batch_size: Number of samples the gradient was summed over.
    """
    # Update under no_grad() instead of through ``param.data``: the step is
    # still excluded from the autograd graph, but autograd keeps tracking
    # in-place modifications of the parameters.
    with torch.no_grad():
        for param in params:
            if param.grad is None:
                # No gradient was computed for this parameter; leave it as is.
                continue
            param -= lr * param.grad / batch_size
在训练中,我们将多次迭代模型参数。在每次迭代中,我们根据当前读取的小批量数据样本(特征X和标签y),通过调用反向函数backward计算小批量随机梯度,并调用优化算法sgd迭代模型参数。由于我们之前设批量大小batch_size为10,每个小批量的损失(即squared_loss的返回值)的形状为(10, 1)。回忆一下“自动求梯度”一节。由于这个损失并不是一个标量,而PyTorch的backward()在不传入梯度参数时只能作用于标量,所以我们先调用.sum()对其中元素求和得到标量l,再运行l.backward()求该变量有关模型参数的梯度。
在一个迭代周期(epoch)中,我们将完整遍历一遍data_iter函数,并对训练数据集中所有样本都使用一次(假设样本数能够被批量大小整除)。这里的迭代周期个数num_epochs和学习率lr都是超参数,分别设3和0.03。在实践中,大多超参数都需要通过反复试错来不断调节。虽然迭代周期数设得越大模型可能越有效,但是训练时间可能过长。
lr = 0.03
num_epochs = 3
net = linreg
loss = squared_loss
# 训练模型一共需要num_epochs个迭代周期
for epoch in range(num_epochs):
    # Each epoch consumes every mini-batch produced by data_iter once.
    # X and y are the features and labels of the current mini-batch.
    for X, y in data_iter(batch_size, features, labels):
        # Sum the per-sample losses into a scalar before calling backward().
        l = loss(net(X, w, b), y).sum()
        l.backward()  # gradients of the mini-batch loss w.r.t. w and b
        sgd([w, b], lr, batch_size)  # one mini-batch SGD update
        # Gradients accumulate across backward() calls, so reset them.
        w.grad.data.zero_()
        b.grad.data.zero_()
    # After the epoch, report the mean loss over the full data set.
    train_l = loss(net(features, w, b), labels)
    print('epoch %d, loss %f' % (epoch + 1, train_l.mean().item()))
epoch 1, loss 0.000051
epoch 2, loss 0.000051
epoch 3, loss 0.000050
print(true_w, '\n', w)
print(true_b, '\n', b)
[2, -3.4]
tensor([[ 1.9999],
[-3.3999]], requires_grad=True)
4.2
tensor([4.2008], requires_grad=True)
import torch
from torch import nn
import numpy as np
torch.manual_seed(1)
print(torch.__version__)
torch.set_default_tensor_type('torch.FloatTensor')
1.1.0
num_inputs = 2
num_examples = 100
true_w = [2, -3.4]
true_b = 4.2
features = torch.tensor(np.random.normal(0, 1, (num_examples, num_inputs)), dtype=torch.float)
labels = true_w[0] * features[:, 0] + true_w[1] * features[:, 1] + true_b
# 加上随机噪声
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float)
features
tensor([[ 0.1243, -0.5666],
[ 1.7075, -1.6025],
[ 0.8301, 0.2019],
[ 1.0534, 0.1100],
[ 2.4399, -0.4741],
[-0.0539, 0.0243],
[ 1.4430, -1.4239],
[-0.6309, -0.6679],
[-0.2309, -0.5457],
[ 1.6393, 1.2493],
[-0.0947, 0.1160],
[ 0.4428, -0.8794],
[-1.5046, 1.1385],
[ 1.3436, -0.7787],
[-1.7918, -1.1573],
[-1.5081, -0.3098],
[ 1.1218, 0.2899],
[ 0.1286, -0.2290],
[-0.1047, 0.5363],
[-0.2209, -0.3525],
[ 0.1808, -0.5736],
[-1.2324, 0.2134],
[ 0.3638, 0.9431],
[-2.2382, -1.2264],
[-0.3424, -1.4127],
[-0.6259, -0.5414],
[ 1.5641, -1.0523],
[-0.1465, 1.9687],
[-2.0239, 1.3050],
[ 0.6236, 0.9445],
[-1.8673, -0.2439],
[ 1.0122, -0.5010],
[ 0.7602, -0.6323],
[-1.1825, -0.4863],
[ 0.4238, -1.1245],
[ 0.6046, 0.0247],
[-0.0610, 0.0560],
[ 0.8459, 1.9671],
[-1.5314, 1.9764],
[-0.0863, 0.3367],
[ 0.4751, -1.4216],
[-2.3350, -1.5692],
[-0.0806, 1.0371],
[ 2.2874, -0.2344],
[ 0.7814, 0.5793],
[ 0.3327, -0.3436],
[ 1.1383, -1.5865],
[ 0.3443, -1.1161],
[-0.1329, -0.5532],
[ 0.4722, -1.0481],
[-0.1027, 0.3339],
[ 1.7872, -0.5495],
[ 0.5177, -0.9145],
[-0.7556, -0.3341],
[ 1.3349, 0.1399],
[-0.3999, 0.6756],
[ 0.7278, -0.0186],
[ 0.1343, -2.8902],
[ 1.3208, -0.0030],
[ 0.7743, -0.4380],
[-1.4138, -0.1755],
[-0.2573, -1.5179],
[ 1.5327, -0.1858],
[ 0.0104, -1.1359],
[-0.8267, 0.2327],
[-0.3347, 1.2213],
[-0.2930, 1.4084],
[ 0.0039, -0.7128],
[ 0.2927, 1.3857],
[-1.3342, 1.7952],
[ 1.3093, 0.1347],
[ 0.6866, -1.5838],
[ 0.2249, 1.3329],
[-2.1616, -0.8585],
[-0.1465, -0.4043],
[ 1.3356, -0.3467],
[ 0.1955, 0.6419],
[ 0.6358, 2.5126],
[-1.9304, 0.1323],
[ 0.1517, 0.7876],
[-1.2069, 1.0423],
[ 0.1724, 0.2650],
[-1.1125, -0.3688],
[-0.5661, -0.1372],
[ 0.5544, -1.0634],
[ 0.3861, -0.2862],
[-2.0400, -0.1196],
[-0.2448, 0.6680],
[ 0.6154, 1.1794],
[ 0.4105, -0.6895],
[ 1.8495, 0.4164],
[ 0.0693, -1.4827],
[ 0.2276, -0.8581],
[-0.1439, -0.1953],
[ 0.6906, -1.6262],
[-0.3355, 1.4109],
[ 0.1661, -1.3578],
[ 0.1400, 0.2831],
[-2.2244, -0.0052],
[ 0.1870, 0.6322]])
labels
tensor([ 6.3582, 13.0468, 5.1703, 5.9370, 10.6816, 4.0108, 11.9330, 5.2226,
5.5883, 3.2406, 3.6132, 8.0711, -2.6950, 9.5377, 4.5423, 2.2213,
5.4405, 5.2333, 2.1839, 4.9762, 6.5070, 0.9976, 1.7188, 3.8831,
8.3345, 4.7859, 10.8829, -2.7822, -4.2824, 2.2112, 1.3012, 7.9229,
7.8523, 3.5080, 8.8797, 5.3386, 3.8763, -0.7937, -5.6060, 2.8852,
9.9732, 4.8737, 0.5205, 9.5834, 3.8199, 6.0169, 11.8725, 8.6902,
5.8258, 8.7158, 2.8549, 9.6557, 8.3736, 3.8157, 6.3899, 1.1137,
5.7034, 14.2933, 6.8431, 7.2523, 1.9778, 8.8505, 7.8973, 8.0876,
1.7574, -0.6365, -1.1802, 6.6373, 0.0778, -4.5783, 6.3599, 10.9425,
0.1049, 2.7995, 5.2798, 8.0636, 2.4182, -3.0564, -0.1136, 1.8242,
-1.7579, 3.6434, 3.2396, 3.5254, 8.9399, 5.9625, 0.5436, 1.4391,
1.4254, 7.3642, 6.4810, 9.3788, 7.5766, 4.5753, 11.1097, -1.2708,
9.1500, 3.5305, -0.2608, 2.4223])
import torch.utils.data as Data
batch_size = 10
# 将训练数据的特征和标签组合
# 将数据集转成 torch 能识别的 dataset
dataset = Data.TensorDataset(features, labels)
# print(dict(dataset))
# 把 dataset 放入 DataLoader,可以生成一个迭代器,从而我们可以方便的进行批处理。
data_iter = Data.DataLoader(
    dataset=dataset,        # the TensorDataset built from features and labels
    batch_size=batch_size,  # mini-batch size
    shuffle=False,          # whether to shuffle the data (shuffling is usually preferable)
    num_workers=2,          # number of worker subprocesses used to load the data
)
dataset:Dataset类型,从其中加载数据
batch_size:int,可选。每个batch加载多少样本
shuffle:bool,可选。为True时表示每个epoch都对数据进行洗牌
sampler:Sampler,可选。从数据集中采样样本的方法。
num_workers:int,可选。加载数据时使用多少子进程。默认值为0,表示在主进程中加载数据。
collate_fn:callable,可选。
pin_memory:bool,可选。
drop_last:bool,可选。True表示如果最后剩下不完全的batch,丢弃。False表示不丢弃。
for X, y in data_iter:
print(X, '\n', y)
break
tensor([[ 0.1243, -0.5666],
[ 1.7075, -1.6025],
[ 0.8301, 0.2019],
[ 1.0534, 0.1100],
[ 2.4399, -0.4741],
[-0.0539, 0.0243],
[ 1.4430, -1.4239],
[-0.6309, -0.6679],
[-0.2309, -0.5457],
[ 1.6393, 1.2493]])
tensor([ 6.3582, 13.0468, 5.1703, 5.9370, 10.6816, 4.0108, 11.9330, 5.2226,
5.5883, 3.2406])
class LinearNet(nn.Module):
    """Linear regression model made of a single ``nn.Linear`` layer.

    Args:
        n_feature: Number of input features per sample.
    """

    def __init__(self, n_feature):
        super().__init__()
        # The submodule keeps its original attribute name so the structure
        # printed by print(net) and the state_dict keys stay unchanged.
        self.LinearNet = nn.Linear(n_feature, 1)

    def forward(self, x):
        """Return the layer's output for ``x``; one value per input row."""
        # The layer is registered as ``self.LinearNet``; the previous
        # reference to ``self.linear`` raised AttributeError on every call.
        y = self.LinearNet(x)
        return y
net = LinearNet(num_inputs)
print(net) # 打印出网络结构
LinearNet(
(LinearNet): Linear(in_features=2, out_features=1, bias=True)
)
Sequential实例可以看作是一个串联各个层的容器。在构造模型时,我们在该容器中依次添加层。当给定输入数据时,容器中的每一层将依次计算并将输出作为下一层的输入。
# 写法一
net = nn.Sequential(
nn.Linear(num_inputs, 1) # nn.Linear(input_size, output_size)
# 此处还可以传入其他层
)
# 写法二
net = nn.Sequential()
net.add_module('linear', nn.Linear(num_inputs, 1))
# net.add_module
# 写法三
from collections import OrderedDict
net = nn.Sequential(
OrderedDict(
[
('linear', nn.Linear(num_inputs, 1))
# ......
]
)
)
print(net)
print(net[0])
Sequential(
(linear): Linear(in_features=2, out_features=1, bias=True)
)
Linear(in_features=2, out_features=1, bias=True)
print(list(net.parameters())) # weight 和 bias
[Parameter containing:
tensor([[0.1465, 0.2274]], requires_grad=True), Parameter containing:
tensor([0.5282], requires_grad=True)]
for param in net.parameters():
print(param)
Parameter containing:
tensor([[0.2274, 0.5282]], requires_grad=True)
Parameter containing:
tensor([0.6705], requires_grad=True)
from torch.nn import init
init.normal_(net[0].weight, mean=0.0, std=0.01)
init.constant_(net[0].bias, val=0.0)
# 也可以直接修改bias的data: net[0].bias.data.fill_(0)
Parameter containing:
tensor([0.], requires_grad=True)
for param in net.parameters():
print(param)
Parameter containing:
tensor([[ 0.0031, -0.0114]], requires_grad=True)
Parameter containing:
tensor([0.], requires_grad=True)
loss = nn.MSELoss()
import torch.optim as optim
optimizer = optim.SGD(net.parameters(), lr=0.03)
print(optimizer)
SGD (
Parameter Group 0
dampening: 0
lr: 0.03
momentum: 0
nesterov: False
weight_decay: 0
)
# 为不同子网络设置不同的学习率
# optimizer =optim.SGD([
# # 如果对某个参数不指定学习率,就使用最外层的默认学习率
# {'params': net.subnet1.parameters()}, # lr=0.03
# {'params': net.subnet2.parameters(), 'lr': 0.01}
# ], lr=0.03)
# # 调整学习率
# for param_group in optimizer.param_groups:
# param_group['lr'] *= 0.1 # 学习率为之前的0.1倍
num_epochs = 3
for epoch in range(1, num_epochs + 1):
    # X holds a mini-batch of feature vectors, y the matching scalar labels.
    for X, y in data_iter:
        # Clear gradients left over from the previous step
        # (equivalent to net.zero_grad() here).
        optimizer.zero_grad()
        output = net(X)
        # View the labels as a column so they match the shape of the output.
        l = loss(output, y.view(-1, 1))
        l.backward()
        optimizer.step()
    # Report the loss of the last mini-batch seen in this epoch.
    print('epoch %d, loss: %f' % (epoch, l.item()))
epoch 1, loss: 11.934128
epoch 2, loss: 2.813488
epoch 3, loss: 0.671115
dense = net[0]
print(true_w, dense.weight.data)
print(true_b, dense.bias.data)
[2, -3.4] tensor([[ 1.8862, -3.0119]])
4.2 tensor([3.6755])