Pytorch实战入门(一):MLP
Pytorch实战入门(二):CNN与MNIST
Pytorch实战入门(三):迁移学习
环境:
Python 3.7.4
Pytorch 1.3.1
任务:
构建一个多层感知机(Multi-Layer Perceptron,MLP)学习随机生成的输入和输出数据。
PS:MLP 的别名挺多,其实就是全连接网络,有的也叫人工神经网络(Artificial Neural Network,ANN);其中的全连接层(Fully Connected Layers)在搭网络时一般取层名为 FC;而 FCN 一般指全卷积网络(Fully Convolutional Networks)。
网络输入为1000维,输出为10维,随机生成64组数据用作训练;网络包含三层:输入层(1000个神经元)、隐藏层(100个神经元)、输出层(10个神经元)。
流程:
API 查询: Pytorch官网
若对神经网络基础不熟悉建议 3Blue1Brown 的深度学习视频(p1,p2,p3)
import numpy as np
# Two-layer fully connected network (no bias terms) trained on random data
# with a hand-written NumPy forward pass, backward pass and SGD update.
N = 64        # number of training samples
D_in = 1000   # input dimension, i.e. number of input-layer neurons
H = 100       # number of hidden-layer neurons
D_out = 10    # output dimension, i.e. number of output-layer neurons

# Randomly generated training data.
x = np.random.randn(N, D_in)   # [64, 1000]: 64 inputs of 1000 dimensions
y = np.random.randn(N, D_out)  # [64, 10]: the 64 corresponding targets

# Random weight initialisation (this network ignores the bias).
w1 = np.random.randn(D_in, H)   # [1000, 100] input -> hidden weights
w2 = np.random.randn(H, D_out)  # [100, 10] hidden -> output weights

learning_rate = 1e-6

# Train for 500 iterations of full-batch gradient descent.
for it in range(500):
    # Forward pass.
    z1 = x.dot(w1)          # [64, 100]
    a1 = np.maximum(z1, 0)  # ReLU: negatives become 0, positives pass through
    y_pred = a1.dot(w2)     # [64, 10]

    # Loss: sum of squared errors (summed, not averaged).
    loss = np.square(y_pred - y).sum()
    print(it, loss)

    # Backward pass. The derivation is omitted; the shapes noted on each line
    # give a quick sanity check of the matrix products.
    grad_y_pred = 2.0 * (y_pred - y)  # [64, 10]
    grad_w2 = a1.T.dot(grad_y_pred)   # [100, 10] = [100, 64] @ [64, 10]
    grad_a1 = grad_y_pred.dot(w2.T)   # [64, 100] = [64, 10] @ [10, 100]
    grad_z1 = grad_a1.copy()
    grad_z1[z1 < 0] = 0               # ReLU passes no gradient where z1 < 0
    grad_w1 = x.T.dot(grad_z1)        # [1000, 100] = [1000, 64] @ [64, 100]

    # Gradient-descent weight update.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
输出:
0 36093135.37125358
1 32398017.92254483
2 30489201.634114698
3 25977952.979760103
......
497 5.757372208142832e-06
498 5.501219493141868e-06
499 5.256529222978903e-06
import torch
# Same two-layer network as a NumPy version would build, but on torch tensors;
# gradients are still derived and applied by hand.
N = 64        # number of training samples
D_in = 1000   # input dimension
H = 100       # hidden-layer width
D_out = 10    # output dimension

# torch.randn() is the tensor counterpart of np.random.randn().
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)

learning_rate = 1e-6

for it in range(500):
    # Forward pass.
    z1 = x.mm(w1)         # mm() is the matrix product (NumPy: dot())
    a1 = z1.clamp(min=0)  # clamp(min, max) bounds values; min=0 acts as ReLU
    y_pred = a1.mm(w2)

    # Loss: pow(2) replaces np.square; the summed loss is a tensor, so
    # .item() extracts the plain Python number.
    loss = (y_pred - y).pow(2).sum().item()
    print(it, loss)

    # Backward pass: hand-written gradients.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = a1.t().mm(grad_y_pred)  # .t() is the transpose (NumPy: .T)
    grad_a1 = grad_y_pred.mm(w2.t())
    grad_z1 = grad_a1.clone()         # .clone() copies (NumPy: .copy())
    grad_z1[z1 < 0] = 0               # ReLU passes no gradient where z1 < 0
    grad_w1 = x.t().mm(grad_z1)

    # Gradient-descent weight update.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
输出:
0 32200542.0
1 31099402.0
2 35729468.0
......
497 0.00011954964429605752
498 0.00011727648234227672
499 0.00011476786312414333
import torch
# Two-layer network on random data where autograd computes the gradients and
# the weights are still updated by hand.
N = 64        # number of training samples
D_in = 1000   # input dimension
H = 100       # hidden-layer width
D_out = 10    # output dimension

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# requires_grad=True enables automatic differentiation for these tensors
# (the default is False in PyTorch 1.3.1).
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

learning_rate = 1e-6

for it in range(500):
    # Forward pass: autograd does not need the intermediate activations to be
    # named, so the whole pass fits on one line.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Loss: sum of squared errors, kept as a tensor so backward() can run.
    loss = (y_pred - y).pow(2).sum()
    print(it, loss.item())

    # Backward pass: fills w1.grad and w2.grad.
    loss.backward()

    # Weight update; no_grad() keeps the in-place updates out of the graph.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad  # .grad holds the matching gradient
        w2 -= learning_rate * w2.grad
        # Gradients accumulate across backward() calls, so reset them before
        # the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()
import torch
import torch.nn as nn
# Two-layer network built from torch.nn modules; parameters are still updated
# by hand from the gradients autograd produces.
N = 64        # number of training samples
D_in = 1000   # input dimension
H = 100       # hidden-layer width
D_out = 10    # output dimension

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Build the network directly with nn.Sequential (no bias terms).
model = nn.Sequential(
    nn.Linear(D_in, H, bias=False),
    nn.ReLU(),
    nn.Linear(H, D_out, bias=False),
)

# Re-initialise both weight matrices with nn.init.normal_ (defaults).
nn.init.normal_(model[0].weight)
nn.init.normal_(model[2].weight)

# Loss function: squared error summed over all elements.
loss_fn = nn.MSELoss(reduction='sum')

learning_rate = 1e-6

for it in range(500):
    # Forward pass; equivalent to model.forward(x).
    y_pred = model(x)

    # Loss.
    loss = loss_fn(y_pred, y)
    print(it, loss.item())

    # Backward pass: populates .grad on every model parameter.
    loss.backward()

    # Weight update, kept out of the autograd graph.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

    # Clear the gradients so the next backward() does not accumulate on them.
    model.zero_grad()
import torch
import torch.nn as nn
# Two-layer network trained through a torch.optim optimizer.
N = 64        # number of training samples
D_in = 1000   # input dimension
H = 100       # hidden-layer width
D_out = 10    # output dimension

x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = nn.Sequential(
    nn.Linear(D_in, H, bias=False),
    nn.ReLU(),
    nn.Linear(H, D_out, bias=False),
)

# Weight initialisation.
nn.init.normal_(model[0].weight)
nn.init.normal_(model[2].weight)

# Loss function: squared error summed over all elements.
loss_fn = nn.MSELoss(reduction='sum')

# The optimizer performs the gradient-descent step. Previously an Adam
# optimizer was constructed and then immediately overwritten by SGD, so it was
# never used. The choice is now explicit; SGD stays the default so the
# effective behaviour is unchanged. Set optimizer_name to "adam" to compare.
optimizer_name = "sgd"
if optimizer_name == "adam":
    learning_rate = 1e-4  # Adam is typically run with 1e-3 to 1e-4
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
else:
    learning_rate = 1e-6  # learning rate used for SGD in this example
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for it in range(500):
    # Forward pass.
    y_pred = model(x)

    # Loss.
    loss = loss_fn(y_pred, y)
    print(it, loss.item())

    # Backward pass: populates .grad on every model parameter.
    loss.backward()

    # Apply the update, then clear gradients for the next iteration.
    optimizer.step()
    optimizer.zero_grad()
使用 Adam 优化器,并用 nn.init.normal_() 初始化权重时的输出:
0 22112800.0
1 22049420.0
2 21986150.0
......
497 4801638.0
498 4786025.0
499 4770454.0
不调用 nn.init.normal_()(使用默认初始化)时的输出:
0 665.029296875
1 648.2096557617188
2 631.8590087890625
......
497 1.3149491451258655e-07
498 1.2488904133078904e-07
499 1.1852827697111934e-07
使用 SGD 优化器,并用 nn.init.normal_() 初始化权重时的输出:
0 30186936.0
1 27895552.0
2 28996608.0
......
497 3.768844544538297e-05
498 3.709576412802562e-05
499 3.662251037894748e-05
不调用 nn.init.normal_()(使用默认初始化)时的输出:
0 663.9910278320312
1 663.4700927734375
2 662.9500122070312
......
497 470.6908264160156
498 470.3971252441406
499 470.1034240722656
可以看出使用不同的优化器时,权重的初始化方式会很大程度影响网络训练的效果和速度,其中具体原理尚待补充研究。
import torch
import torch.nn as nn
# Problem sizes: sample count, input width, hidden width, output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training inputs and targets (x is drawn first, then y).
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))
# Defining the model as a class pays off when building more complex models.
class TwoLayerNet(nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear, without biases."""

    def __init__(self, D_in, H, D_out):
        """Create the two bias-free linear layers.

        Args:
            D_in: size of each input sample.
            H: number of hidden units.
            D_out: size of each output sample.
        """
        super().__init__()
        self.linear1 = nn.Linear(D_in, H, bias=False)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(H, D_out, bias=False)

    def forward(self, x):
        """Return the network output for input batch ``x``."""
        y_pred = self.linear2(self.relu(self.linear1(x)))
        return y_pred
# Model: the class-based two-layer network.
model = TwoLayerNet(D_in, H, D_out)

# Loss: squared error summed over all elements.
loss_fn = nn.MSELoss(reduction='sum')

# Optimizer: Adam over all model parameters.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for it in range(500):
    # Forward pass.
    y_pred = model(x)

    # Loss.
    loss = loss_fn(y_pred, y)
    print(it, loss.item())

    # Backward pass: populates .grad on every model parameter.
    loss.backward()

    # Update the model parameters, then clear gradients for the next iteration.
    optimizer.step()
    optimizer.zero_grad()