torch.nn.functional.linear
torch.nn.Linear
类API
>>> layer1 = torch.nn.Linear(100, 10)
# 这里使用类API
# 定义一个全连接层,输入100个单元,输出10个,可以理解成初始化的一个(100, 10)的矩阵
>>> layer2 = torch.nn.Linear(10, 1)
>>> x = torch.rand(1,100)
# 定义一个(1, 100)的矩阵
>>> x = layer1(x)
# x经过layer1全连接层的运算
>>> x
tensor([[-0.1354, 0.1530, 0.1946, -0.1349, 0.6149, -0.0482, 0.1025, -0.8483,
-1.0567, -0.5853]], grad_fn=<AddmmBackward>)
>>> x.shape
torch.Size([1, 10])
# 可以发现乘完以后变成了(1, 10)的矩阵
>>> x = layer2(x)
# x再经过layer2层运算
>>> x
tensor([[-0.2182]], grad_fn=<AddmmBackward>)
>>> x.shape
torch.Size([1, 1])
>>> layer1.weight.shape
torch.Size([10, 100])
# 可以通过weight属性查看当前层的权值,计算的时候会将权值矩阵进行转置后才进行运算,所以是(10, 100)而不是(100, 10)
>>> layer1.bias
Parameter containing:
tensor([ 0.0049, -0.0081, -0.0541, -0.0301, 0.0320, -0.0621, 0.0072, -0.0024,
-0.0339, 0.0456], requires_grad=True)
# 可以通过bias属性查看当前层的偏置值
函数式API
>>> x = torch.rand(1,100)
>>> w = torch.rand(10, 100)
>>> x = torch.nn.functional.linear(x, w)
# 可以看出函数式API需要我们自己定义初始化权值,然后直接调用即可
>>> x
tensor([[25.9789, 23.4787, 24.2929, 25.8615, 22.0681, 23.1044, 22.0457, 22.0386,
23.0654, 24.6127]])
torch.nn.functional.dropout
torch.nn.Dropout
>>> a = torch.rand(20)
>>> torch.nn.functional.dropout(a, 0.2)
tensor([1.2178, 1.0375, 0.0555, 0.0307, 0.3235, 0.0000, 0.5209, 0.0000, 0.3346,
1.2383, 0.3606, 1.0937, 0.0000, 0.2957, 0.9463, 0.2932, 0.8088, 0.4445,
0.5565, 0.0241])
# 以0.2的概率随机将节点置为0,其余节点会乘以1/(1-0.2)=1.25进行缩放(所以结果中会出现大于1的值)
torch.nn.functional.batch_norm
torch.nn.BatchNorm2d
>>> x1 = torch.rand(1, 3, 784)
# 3通道的1d数据
>>> layer1 = torch.nn.BatchNorm1d(3)
# 1d批标准化层,3通道
>>> layer1.weight
Parameter containing:
tensor([1., 1., 1.], requires_grad=True)
# 可以看出batch_norm层的权值全是1
>>> layer1(x1)
tensor([[[-0.0625, -0.1859, -0.3823, ..., 0.6668, -0.7487, 0.8913],
[ 0.0115, -0.1149, 0.1470, ..., -0.1546, 0.3012, 0.2472],
[ 1.5185, -0.4740, -0.8664, ..., 0.6266, 0.2797, -0.2975]]],
grad_fn=<NativeBatchNormBackward>)
# 可以看到数据都被标准化了
>>> layer1(x1).shape
torch.Size([1, 3, 784])
>>> x2 = torch.rand(1, 3, 28, 28)
# 3通道的2d数据
>>> layer2 = torch.nn.BatchNorm2d(3)
>>> layer2(x2)
tensor([[[[-0.0378, -0.3922, 0.2255, ..., -0.1469, -0.3016, 0.2384],
[-0.3901, -0.0220, -0.3118, ..., -0.2492, 0.1705, -0.0599],
[-0.1309, -0.3064, -0.2001, ..., -0.0613, -0.1838, 0.1335],
...,
[ 0.9022, -0.3031, 1.0695, ..., -0.8257, -0.6438, -0.2672],
[-0.1015, 1.1482, 1.0834, ..., 0.6641, -0.8632, -0.2418],
[-1.2068, -0.7443, 0.8346, ..., 0.1213, 0.4528, -0.5756]]]],
grad_fn=<NativeBatchNormBackward>)
>>> layer2(x2).shape
# 经过batch_norm只是将每个通道的数据标准化为均值接近0、标准差接近1(并不会把数据变成高斯分布),也不会改变数据形状
torch.Size([1, 3, 28, 28])
>>> x2.mean()
tensor(0.4942)
# 原来数据的平均值
>>> x2.std()
tensor(0.2899)
# 原来数据的标准差
>>> layer2(x2).mean()
tensor(-2.1211e-08, grad_fn=<MeanBackward0>)
# 经过batch_norm的平均值,可以看出经过batch_norm层后数据平均值变成接近0
>>> layer2(x2).std()
tensor(1.0002, grad_fn=<StdBackward0>)
# 经过batch_norm的标准差,可以看出经过batch_norm层后数据标准差变成接近1
torch.nn.functional.conv2d
torch.nn.Conv2d
>>> x = torch.rand(1, 1, 28, 28)
>>> layer = torch.nn.Conv2d(1, 3, kernel_size=3, stride=1, padding=0)
# 设置输入通道为1,输出通道为3,filter大小为3x3,步长为1,边框不补0
>>> layer.weight
Parameter containing:
tensor([[[[-0.1893, 0.1177, -0.2837],
[ 0.1116, 0.0348, 0.3011],
[-0.1871, -0.0722, -0.1843]]],
...,
[[[ 0.0083, -0.0784, 0.1592],
[-0.1896, 0.0082, -0.0146],
[-0.2069, -0.0147, -0.1899]]]], requires_grad=True)
# 可以查看初始化权值
>>> layer.weight.shape
torch.Size([3, 1, 3, 3])
# 格式分别代表输出通道3,输入通道1,尺寸为3x3
>>> layer.bias.shape
torch.Size([3])
# 查看初始化偏置
>>> layer(x)
tensor([[[[-0.0494, -0.1396, -0.0690, ..., -0.1382, -0.0539, -0.1876],
[-0.2185, -0.0116, -0.1287, ..., 0.1233, -0.0091, 0.0407],
[-0.0648, 0.0506, -0.1971, ..., -0.2013, 0.1151, -0.0026],
...,
[-0.4974, -0.5449, -0.4583, ..., -0.7153, -0.1890, -0.7381],
[-0.4254, -0.6051, -0.2578, ..., -0.4957, -0.4128, -0.4875],
[-0.5392, -0.4214, -0.5671, ..., -0.2785, -0.6113, -0.3150]]]],
grad_fn=<ThnnConv2DBackward>)
# 进行一次卷积运算,实际是魔法方法__call__里调用了forward方法
>>> layer(x).shape
torch.Size([1, 3, 26, 26])
# 可以看到计算后由于边框不补0,而滤波器大小为3x3,所以结果的长宽就变成了(height-3+1, width-3+1)
>>> layer1 = torch.nn.Conv2d(1, 3, kernel_size=3, stride=1, padding=1)
# 这里边缘补0
>>> layer1(x).shape
torch.Size([1, 3, 28, 28])
# 可以看到由于边缘补0,所以大小没变
>>> layer2 = torch.nn.Conv2d(1, 3, kernel_size=3, stride=2, padding=0)
# 这里步长改成2
>>> layer2(x).shape
torch.Size([1, 3, 13, 13])
# 结果的长宽就变成了(floor((height-3)/2)+1, floor((width-3)/2)+1),这里即(13, 13)
>>> layer3 = torch.nn.Conv2d(1, 3, kernel_size=3, stride=2, padding=1)
# 这里边缘补0,且步长改成2
>>> layer3(x).shape
torch.Size([1, 3, 14, 14])
# 可以看到结果的长宽就变成了(floor((height+2-3)/2)+1, floor((width+2-3)/2)+1),对于偶数尺寸即(height/2, width/2)
>>> x = torch.rand(1, 1, 28, 28)
>>> w = torch.rand(3, 1, 3, 3)
# 输出3通道,输入1通道,尺寸3x3
>>> b = torch.rand(3)
# 偏置长度要和通道数一样
>>> layer = torch.nn.functional.conv2d(x, w, b, stride=1, padding=1)
>>> layer
tensor([[[[2.1963, 2.6321, 3.4186, ..., 3.2495, 3.1609, 2.5473],
[2.5637, 3.4892, 4.0079, ..., 4.1167, 4.4497, 3.1637],
[2.7618, 3.2788, 3.2314, ..., 4.7185, 4.3128, 2.6393],
...,
[1.3735, 2.3738, 1.8388, ..., 2.9912, 2.6638, 1.5941],
[2.1967, 2.0466, 2.0095, ..., 3.3192, 2.9521, 2.2673],
[1.6091, 2.1341, 1.5108, ..., 2.1684, 2.4585, 1.7931]]]])
>>> layer.shape
torch.Size([1, 3, 28, 28])
torch.nn.functional.max_pool2d
torch.nn.MaxPool2d
torch.nn.functional.avg_pool2d
torch.nn.AvgPool2d
>>> x = torch.rand(1, 1, 28, 28)
>>> layer = torch.nn.MaxPool2d(3, stride=2)
# 尺寸3x3,步长为2
>>> layer(x)
tensor([[[[0.9301, 0.9342, 0.9606, 0.9922, 0.9754, 0.9055, 0.7142, 0.9882,
0.9803, 0.8054, 0.9903, 0.9903, 0.9426],
...,
[0.8873, 0.8873, 0.9324, 0.9876, 0.9566, 0.9225, 0.9673, 0.9675,
0.9977, 0.9977, 0.9552, 0.9552, 0.8689]]]])
>>> layer(x).shape
torch.Size([1, 1, 13, 13])
>>> torch.nn.functional.avg_pool2d(x, 3, stride=2)
tensor([[[[0.5105, 0.6301, 0.5491, 0.4691, 0.5788, 0.4525, 0.3903, 0.5718,
0.6259, 0.3388, 0.4169, 0.6122, 0.4760],
...,
[0.4705, 0.5332, 0.4150, 0.5000, 0.5686, 0.5325, 0.6241, 0.4926,
0.4646, 0.3121, 0.2975, 0.5203, 0.5701]]]])
>>> torch.nn.functional.avg_pool2d(x, 3, stride=2).shape
torch.Size([1, 1, 13, 13])
torch.flatten()
>>> torch.flatten(torch.rand(2,2))
tensor([0.1339, 0.5694, 0.9034, 0.6025])
>>> torch.flatten(torch.rand(2,2)).shape
torch.Size([4])
常用于定义词向量,可以理解embedding层定义了一个词典用来存储和表示所有的词向量,而传入的数据则会根据索引找到对应的词向量
torch.nn.functional.embedding
torch.nn.Embedding
>>> embed = torch.nn.Embedding(10, 2)
# 定义了10个词向量,每个词向量用格式为(1, 2)的tensor表示
>>> words = torch.tensor([0, 1, 2, 0])
# 定义一句话,里面有4个词,那么可以看出第一个和最后一个词相同
>>> embed(words)
# 经过嵌入层索引可以看到4个词对应的词向量如下,也可以看出第一个和最后一个词索引相同,所以值是一样的
tensor([[-0.0019, 1.6786],
[ 0.3118, -1.6250],
[ 1.6038, 1.5044],
[-0.0019, 1.6786]], grad_fn=<EmbeddingBackward>)
>>> embed.weight
# 再看embedding层的权重,可以发现这就是定义了一个词向量表,并且会随着训练而更新,从而找出词与词之间的关系
Parameter containing:
tensor([[-1.8939e-03, 1.6786e+00],
[ 3.1179e-01, -1.6250e+00],
[ 1.6038e+00, 1.5044e+00],
[-6.2278e-01, -2.5135e-01],
[ 1.6210e+00, -5.6379e-01],
[-7.3388e-02, -2.0099e+00],
[ 8.7655e-01, 2.4011e-01],
[-2.5685e+00, 2.6756e-01],
[ 4.9723e-01, -8.3784e-01],
[ 4.2338e-01, -1.9839e+00]], requires_grad=True)
torch.nn.RNN
会返回计算后总体的输出,以及最后一个时间戳上的输出,通过下面代码可以证明最后一个时间戳的输出和总体输出的最后一个是一样的
>>> x = torch.randn(10, 3, 100)
# 模拟句子序列:有10个单词(序列长度是10),共3句话,每个单词用100维向量表示
# input:[seq_len, batch, input_size],如果希望batch_size放第一个,可以设置batch_first=True
>>> layer = torch.nn.RNN(input_size=100, hidden_size=20, num_layers=4)
>>> layer
RNN(100, 20, num_layers=4)
>>> out, h = layer(x)
# 返回output和hidden
>>> out.shape
torch.Size([10, 3, 20])
# 所有时间戳上的状态
# output:[seq_len, batch, hidden_size]
>>> h.shape
torch.Size([4, 3, 20])
# 最后一个时间戳上的hidden
# hidden:[num_layers, batch, hidden_size]
>>> h[-1]
# 最后一层的最后一个时间戳上的输出(因为num_layers的值为4,所以要取第四个,对于num_layers参数的解释,下面会说)
tensor([[ 3.5205e-01, 3.6580e-01, -5.6378e-01, -9.9363e-02, 3.8728e-03,
-5.0282e-01, 1.4762e-01, -2.5631e-01, -8.8786e-03, 1.2912e-01,
4.7565e-01, -8.8090e-02, -3.9374e-02, 3.1736e-02, 3.1264e-01,
2.8091e-01, 5.0764e-01, 2.9722e-01, -3.6929e-01, -5.1096e-02],
...
[ 5.4770e-01, 4.8047e-01, -5.2541e-01, 2.5208e-01, -4.0260e-04,
-2.3619e-01, -2.1128e-01, -1.1262e-01, -6.2672e-02, 3.5301e-01,
-4.1065e-02, -3.5043e-02, -4.3008e-01, -1.8410e-01, 2.5826e-01,
3.5430e-02, 2.5651e-01, 4.5170e-01, -5.4705e-01, -2.4720e-01]],
grad_fn=<SelectBackward>)
>>> out[-1]
# 所有状态的最后一个输出,可以看到是一样的
tensor([[ 3.5205e-01, 3.6580e-01, -5.6378e-01, -9.9363e-02, 3.8728e-03,
-5.0282e-01, 1.4762e-01, -2.5631e-01, -8.8786e-03, 1.2912e-01,
4.7565e-01, -8.8090e-02, -3.9374e-02, 3.1736e-02, 3.1264e-01,
2.8091e-01, 5.0764e-01, 2.9722e-01, -3.6929e-01, -5.1096e-02],
...
[ 5.4770e-01, 4.8047e-01, -5.2541e-01, 2.5208e-01, -4.0260e-04,
-2.3619e-01, -2.1128e-01, -1.1262e-01, -6.2672e-02, 3.5301e-01,
-4.1065e-02, -3.5043e-02, -4.3008e-01, -1.8410e-01, 2.5826e-01,
3.5430e-02, 2.5651e-01, 4.5170e-01, -5.4705e-01, -2.4720e-01]],
grad_fn=<SelectBackward>)
num_layers参数的理解:rnn的基本参数都挺好理解因为其他深度学习框架基本也都一样,而比较特殊的就是num_layers参数,其实也很简单,顾名思义就是代表着有几层rnn,直接一口气帮你定义好直接计算,省的你自己再去定义一堆rnn,然后一层一层的算过去,比如上面的示例代码设置num_layers=4,那么上面的代码可以替换成下面这种:
>>> x = torch.randn(10, 3, 100)
>>> layer1 = torch.nn.RNN(input_size=100, hidden_size=20, num_layers=1)
# 把上面示例代码中num_layers=4的rnn改成4个为1的rnn
>>> layer2 = torch.nn.RNN(input_size=20, hidden_size=20, num_layers=1)
# 因为第一层的hidden是20,所以后几层的输入都是20
>>> layer3 = torch.nn.RNN(input_size=20, hidden_size=20, num_layers=1)
>>> layer4 = torch.nn.RNN(input_size=20, hidden_size=20, num_layers=1)
>>> out, h = layer1(x)
>>> out, h = layer2(out)
>>> out, h = layer3(out)
>>> out, h = layer4(out)
>>> out.shape
torch.Size([10, 3, 20])
作者:dawsonenjoy
链接:https://www.jianshu.com/p/5460b7fa3ec4
来源:简书
著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。
torch.nn.LSTM
因为LSTM是基于RNN并添加了门控制,因此返回的时候比RNN要多返回一个cell单元,格式和hidden一样
>>> x = torch.randn(10, 3, 100)
>>> layer = torch.nn.LSTM(input_size=100, hidden_size=20, num_layers=4)
>>> layer
LSTM(100, 20, num_layers=4)
>>> out, (h, c) = layer(x)
# 返回output、hidden和cell
>>> out.shape
torch.Size([10, 3, 20])
>>> h.shape
torch.Size([4, 3, 20])
>>> c.shape
torch.Size([4, 3, 20])
# 可以看出和hidden格式一样
# cell:[num_layers, batch_size, hidden_size]
这里给一个通过前三个数预测后一个数的模型代码示例
# -----------------------------------
# 模块导入
import numpy
import torch
from torch import nn
# -----------------------------------
# Data preprocessing
# Build a toy dataset from the integers 0..data_length-1: every window of
# seq_length consecutive values is an input and the value right after the
# window is its label, e.g. 1, 2, 3 -> 4.
data_length = 30
seq_length = 3
number = list(range(data_length))
window_starts = range(data_length - seq_length)
li_x = [number[start:start + seq_length] for start in window_starts]
li_y = [number[start + seq_length] for start in window_starts]
# Shape (27, 1, 3). nn.LSTM without batch_first reads its input as
# (seq_len, batch, input_size), so the 27 windows are consumed as ONE sequence
# of 27 steps with batch size 1, and the 3 numbers of a window are the features
# of a single step.
data_x = numpy.reshape(li_x, (len(li_x), 1, seq_length))
# Scale the inputs into [0, 1) and convert them to a float32 tensor.
data_x = torch.from_numpy(data_x / float(data_length)).float()
# One-hot encode the labels: row k has a single 1 in column li_y[k].
data_y = torch.zeros(len(li_y), data_length).scatter_(1, torch.tensor(li_y).unsqueeze(1), 1).float()
# Resulting shapes:
#   data_x.shape == torch.Size([27, 1, 3])
#   data_y.shape == torch.Size([27, 30])  (27 samples, 30 classes)
# -----------------------------------
# 定义网络模型
class net(nn.Module):
    """LSTM -> fully connected -> softmax classifier.

    Args:
        input_size: Number of features per time step fed to the LSTM.
        hidden_size: Size of the LSTM hidden state.
        output_size: Number of output classes.
        num_layer: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layer):
        super().__init__()
        self.layer1 = nn.LSTM(input_size, hidden_size, num_layer)
        self.layer2 = nn.Linear(hidden_size, output_size)
        # forward() always hands this layer a 2-D tensor; name the class axis
        # explicitly because nn.Softmax() without `dim` is deprecated and
        # guesses the axis from the number of dimensions.
        self.layer3 = nn.Softmax(dim=1)

    def forward(self, x):
        """Return per-step class probabilities.

        Args:
            x: Input passed straight to the LSTM (no batch_first), i.e.
                (seq_len, batch, input_size).

        Returns:
            Tensor of shape (seq_len * batch, output_size); each row sums to 1.
        """
        # Only the per-step outputs are used; the final (h, c) state is dropped.
        x, _ = self.layer1(x)
        hidden = x.size(-1)
        # Collapse every leading axis so the linear layer sees a 2-D matrix.
        x = x.reshape(-1, hidden)
        x = self.layer2(x)
        return self.layer3(x)
model = net(seq_length, 32, data_length, 4)
# -----------------------------------
# Loss function and optimizer
loss_fun = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# -----------------------------------
# Train the model
# To see how far off the freshly initialised model is, predict once before
# training:
# result = model(data_x)
# for target, pred in zip(data_y, result):
#     print("{} -> {}".format(target.argmax().item(), pred.argmax().item()))
# Train for 500 epochs, reporting the loss every 50 epochs.
for epoch in range(500):
    output = model(data_x)
    # MSELoss takes (input, target); the value is the same either way, but the
    # conventional order keeps the prediction in the `input` slot.
    loss = loss_fun(output, data_y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if (epoch + 1) % 50 == 0:
        # Report the number of completed epochs (50, 100, ...) rather than the
        # 0-based loop index.
        print('Epoch: {}, Loss: {}'.format(epoch + 1, loss.item()))
# -----------------------------------
# Predict
# No gradients are needed for inference.
with torch.no_grad():
    result = model(data_x)
for target, pred in zip(data_y, result):
    print("正确结果:{},预测:{}".format(target.argmax().item(), pred.argmax().item()))
# 结果:
# 正确结果:3,预测:3
# 正确结果:4,预测:4
# 正确结果:5,预测:5
# 正确结果:6,预测:6
# 正确结果:7,预测:7
# 正确结果:8,预测:8
# 正确结果:9,预测:9
# 正确结果:10,预测:10
# 正确结果:11,预测:11
# 正确结果:12,预测:12
# 正确结果:13,预测:13
# 正确结果:14,预测:14
# 正确结果:15,预测:15
# 正确结果:16,预测:16
# 正确结果:17,预测:21
# 正确结果:18,预测:18
# 正确结果:19,预测:27
# 正确结果:20,预测:21
# 正确结果:21,预测:21
# 正确结果:22,预测:21
# 正确结果:23,预测:21
# 正确结果:24,预测:24
# 正确结果:25,预测:25
# 正确结果:26,预测:26
# 正确结果:27,预测:27
# 正确结果:28,预测:28
# 正确结果:29,预测:29
torch.nn.Sequential
>>> net = torch.nn.Sequential(
torch.nn.Linear(100, 10),
torch.nn.Dropout(0.7),
torch.nn.ReLU(),
torch.nn.Linear(10, 1)
)
>>> net
Sequential(
(0): Linear(in_features=100, out_features=10, bias=True)
(1): Dropout(p=0.7)
(2): ReLU()
(3): Linear(in_features=10, out_features=1, bias=True)
)
# 可以直接查看网络结构
序列化模型修改
序列化模型可以理解成一个列表,里面按顺序存放了所有的网络层,官方也提供了往序列化模型里添加网络层的方法add_module(name, layer),而修改则可以索引到对应的层直接修改,删除可以通过del关键字删除
>>> seq = nn.Sequential(nn.Linear(10, 20), nn.Linear(20, 1))
>>> seq
Sequential(
(0): Linear(in_features=10, out_features=20, bias=True)
(1): Linear(in_features=20, out_features=1, bias=True)
)
>>> seq.add_module("tanh", nn.Tanh())
# 在最后面添加一个tanh激活层
>>> seq
# 可以看到添加成功
Sequential(
(0): Linear(in_features=10, out_features=20, bias=True)
(1): Linear(in_features=20, out_features=1, bias=True)
(tanh): Tanh()
)
>>> seq[2]
Tanh()
>>> seq[2] = nn.ReLU()
# 修改第三层为relu
>>> seq
# 可以看到修改成功
Sequential(
(0): Linear(in_features=10, out_features=20, bias=True)
(1): Linear(in_features=20, out_features=1, bias=True)
(tanh): ReLU()
)
>>> del seq[2]
# 删除第三层
>>> seq
# 可以看到删除成功
Sequential(
(0): Linear(in_features=10, out_features=20, bias=True)
(1): Linear(in_features=20, out_features=1, bias=True)
)
torch.nn.Parameter
>>> layer = nn.Linear(2, 1)
>>> layer.weight
Parameter containing:
tensor([[0.6619, 0.2653]], requires_grad=True)
>>> type(layer.weight)
# 参数的数据类型
<class 'torch.nn.parameter.Parameter'>
>>> layer.weight = torch.rand(2, 1, requires_grad=True)
# 直接赋值张量会报错
Traceback (most recent call last):
File "" , line 1, in <module>
layer.weight = torch.rand(2, 1)
File "D:\python\lib\site-packages\torch\nn\modules\module.py", line 604, in __setattr__
.format(torch.typename(value), name))
TypeError: cannot assign 'torch.FloatTensor' as parameter 'weight' (torch.nn.Parameter or None expected)
>>> layer.weight = nn.parameter.Parameter(torch.rand(2, 1))
# 赋值Parameter类型的则可以
>>> layer.weight
# 可以看到修改成功
Parameter containing:
tensor([[0.7412],
[0.9723]], requires_grad=True)
torch.nn.Module()
自定义全连接层
import torch
class Dense(torch.nn.Module):
    """Custom fully connected layer followed by ReLU.

    Computes ``relu(x @ w.T + b)``.

    Args:
        input_shape: Number of input features.
        output_shape: Number of output features.
    """

    def __init__(self, input_shape, output_shape):
        # The parent initialiser sets up the internal bookkeeping that
        # registers parameters and sub-modules assigned below.
        super().__init__()
        # Wrapping tensors in Parameter marks them as requiring gradients and
        # makes them visible through parameters() / named_parameters().
        # The weight is stored as (output_shape, input_shape) and transposed
        # in forward().
        self.w = torch.nn.Parameter(torch.randn(output_shape, input_shape))
        self.b = torch.nn.Parameter(torch.rand(output_shape))
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Apply the affine map and then ReLU to ``x``.

        Args:
            x: Tensor whose last dimension has ``input_shape`` features.

        Returns:
            Tensor whose last dimension has ``output_shape`` features.
        """
        # A fully connected layer is a matrix product plus a bias.
        x = x @ self.w.t() + self.b
        return self.relu(x)

    # __call__ is intentionally not overridden: torch.nn.Module.__call__
    # already dispatches to forward() and additionally runs registered hooks,
    # which a hand-written __call__ would silently skip.
# Build a Dense layer with 10 input features and 1 output feature.
layer = Dense(10, 1)
# A batch of 2 samples with 10 features each.
x = torch.rand(2, 10)
# Calling the layer runs its forward pass.
output = layer(x)
print(output)
# Example output (values differ between runs because the weights are random):
# tensor([[0.1780],
# [0.0000]], grad_fn=<ReluBackward0>)
# NOTE(review): the grad_fn name was lost in the original paste; <ReluBackward0>
# is presumed from the final ReLU and may differ by PyTorch version.
如果希望训练过程当中,对某些网络层的权重不进行训练的话(该场景在迁移学习当中比较常见),可以将该层权重、偏置等参数的requires_grad属性设置为False。
>>> net = torch.nn.Sequential(
torch.nn.Linear(100, 10),
torch.nn.Dropout(0.7),
torch.nn.ReLU(),
torch.nn.Linear(10, 1)
)
>>> net
Sequential(
(0): Linear(in_features=100, out_features=10, bias=True)
(1): Dropout(p=0.7, inplace=False)
(2): ReLU()
(3): Linear(in_features=10, out_features=1, bias=True)
)
>>> for name, value in net.named_parameters():
print(name, value.requires_grad)
# 可以看到网络层的两个全连接层的权重和偏置都可求导
0.weight True
0.bias True
3.weight True
3.bias True
>>> net[0].weight.requires_grad = False
# 冻结第一个全连接的权重
>>> net[0].bias.requires_grad = False
>>> for name, value in net.named_parameters():
print(name, value.requires_grad)
# 可以看到第一个全连接的权重和偏置都被冻结
0.weight False
0.bias False
3.weight True
3.bias True
torch.optim
Adam SGD
optim = torch.optim.Adam(model.parameters(), lr=0.01)
torch.nn.functional.sigmoid
torch.nn.Sigmoid
torch.sigmoid
torch.nn.functional.tanh
torch.nn.Tanh
torch.tanh
torch.nn.functional.relu
torch.nn.ReLU
torch.nn.functional.softmax
torch.nn.Softmax
torch.softmax
x = torch.linspace(-100, 100,10)
torch.sigmoid(x)
torch.nn.functional.sigmoid(x)
torch.nn.Sigmoid()(x)
均方误差
torch.nn.functional.mse_loss
torch.nn.MSELoss
>>> x = torch.tensor(1.)
>>> w = torch.tensor(2.)
>>> b = torch.tensor(0.)
>>> y = torch.tensor(1.)
>>> mse = torch.nn.functional.mse_loss(y, w*x+b)
# 计算y=wx+b在点(1, 1)时的均方差:(1 - (2*1+0))^2 = 1
>>> mse
tensor(1.)
交叉熵
torch.nn.functional.cross_entropy
torch.nn.CrossEntropyLoss
要注意的是目标y的格式要求为Long类型,值为每个one-hot数据对应的argmax处
>>> output = torch.rand(5,10)
# 输出结果,假设5条数据,每条数据有10个类别的得分(未经过softmax的logits)
>>> target = torch.tensor([0, 2, 9, 9, 2]).long()
# 目标y,数据必须为long型,值为每条数据所属类别的索引,比如第一个0代表第一条数据属于第0类
>>> output.shape
torch.Size([5, 10])
>>> target.shape
torch.Size([5])
# 目标y的要求还要求是1维的
>>> loss = torch.nn.CrossEntropyLoss()
>>> loss(output, target)
tensor(2.2185)
二元交叉熵
torch.nn.functional.binary_cross_entropy
torch.nn.BCELoss
>>> batch_size = 5
# 5条数据
>>> output = torch.rand(batch_size, 1)
>>> output
# 每条数据预测为正类的概率(取值必须在0到1之间)
tensor([[0.3546],
[0.9064],
[0.0617],
[0.2839],
[0.3106]])
>>> target = torch.ones(batch_size, 1)
>>> target
# 正确的概率是1
tensor([[1.],
[1.],
[1.],
[1.],
[1.]])
>>> loss = torch.nn.BCELoss()
>>> loss(output, target)
tensor(1.2697)
torch.autograd.grad
定义了自动求导,传入第一个参数是对应的公式,第二个参数是一个列表,里面存放所有要求导的变量,并且在求导前的变量需要通过requires_grad_()方法来声明该公式的某个变量是需要求导的(或者在定义时就设置requires_grad参数为True),返回一个元组,里面是对应每个变量的求导信息
>>> x = torch.tensor(1.)
>>> w = torch.tensor(2.)
>>> b = torch.tensor(0.)
>>> y = torch.tensor(1.)
>>> mse = torch.nn.functional.mse_loss(y, w*x+b)
# 模拟一个y=wx+b的函数
>>> mse
tensor(1.)
>>> torch.autograd.grad(mse, [w])
# 报错:此时没有任何变量声明过需要求导
>>> w.requires_grad_()
tensor(2., requires_grad=True)
# 声明w需要求导,可以看到一开始就定义w = torch.tensor(2., requires_grad=True)效果也是一样的
# 加下划线代表为in-place类型,直接对w进行修改,也可以替换成:w.requires_grad = True
>>> torch.autograd.grad(mse, [w])
# 报错,因为mse里的w还是之前的w,需要更新一下mse里的w
>>> mse = torch.nn.functional.mse_loss(y, w*x+b)
# 更新mse里的w为声明了需要求导的w
>>> torch.autograd.grad(mse, [w])
(tensor(2.),)
# 可以看出mse对第一个变量w求偏导的结果为2,计算过程:
# mse为:(y-(wx+b))^2
# 对w求偏导为:-2x(y-(wx+b))
# 代入数值:-2*1*(1-(2*1+0)) = 2
torch.Tensor.backward
torch.save(model.state_dict(), "ckpt.mdl")
model.load_state_dict(torch.load("ckpt.mdl"))
对于所有继承自torch.nn.Module下的网络,保存时首先通过内置的方法state_dict()返回当前模型的所有参数,然后通过torch.save()方法保存成文件(也可以不保存参数,直接保存模型,但这样可控性低,不推荐);载入时通过torch.load()方法载入文件,并通过内置的load_state_dict()方法载入所有的参数。
>>> layer = torch.nn.Linear(10, 1)
>>> layer.state_dict()
OrderedDict([('weight', tensor([[-0.1597, 0.0573, 0.0976, -0.1028, -0.1264, -0.0400, 0.0308, 0.2192,
-0.0150, -0.3148]])), ('bias', tensor([0.0557]))])
# 可以看到layer里定义的参数配置
>>> torch.save(layer.state_dict(), "ckpt.mdl")
# 现在保存这个网络参数
>>> layer1 = torch.nn.Linear(10, 1)
# 新建一个网络
>>> layer1.state_dict()
OrderedDict([('weight', tensor([[-0.2506, -0.2960, -0.3083, 0.0629, 0.1707, 0.3018, 0.2345, -0.1922,
-0.0527, -0.1894]])), ('bias', tensor([-0.0069]))])
# 显然layer1参数和layer的不一样
>>> layer1.load_state_dict(torch.load("ckpt.mdl"))
# layer1载入前面的layer网络参数
>>> layer1.state_dict()
OrderedDict([('weight', tensor([[-0.1597, 0.0573, 0.0976, -0.1028, -0.1264, -0.0400, 0.0308, 0.2192,
-0.0150, -0.3148]])), ('bias', tensor([0.0557]))])
# 可以发现layer1的参数变得和layer保存的参数一样
torchvision提供了很多计算机视觉相关的数据集,以及较流行的模型。