An MLP in PyTorch is usually written in the following form:
import torch
import torch.nn as nn
N = 64
D_in = 1000
H = 100
D_out = 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# Build the model as a class; this pays off when constructing more complex models
class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.linear1 = nn.Linear(D_in, H, bias=False)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(H, D_out, bias=False)

    def forward(self, x):
        y_pred = self.linear2(self.relu(self.linear1(x)))
        return y_pred
# model
model = TwoLayerNet(D_in, H, D_out)
# loss
loss_fn = nn.MSELoss(reduction='sum')
# optimizer
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for it in range(500):
    # Forward pass
    y_pred = model(x)
    # Loss
    loss = loss_fn(y_pred, y)
    print(it, loss.item())
    # Backward pass
    loss.backward()
    # Update model parameters
    optimizer.step()
    optimizer.zero_grad()
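As a side note (this sketch is my addition, not part of the original post): for a simple stack of layers like this one, the same model can also be assembled with nn.Sequential instead of subclassing nn.Module. Assuming the same sizes, loss, and optimizer settings as above:

import torch
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# The same two-layer network as an nn.Sequential container
model = nn.Sequential(
    nn.Linear(D_in, H, bias=False),
    nn.ReLU(),
    nn.Linear(H, D_out, bias=False),
)
loss_fn = nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
for it in range(500):
    loss = loss_fn(model(x), y)  # forward pass + loss
    optimizer.zero_grad()        # clear old gradients
    loss.backward()              # backward pass
    optimizer.step()             # parameter update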
For comparison, the same network written with NumPy arrays and with raw torch tensors, respectively, looks as follows. First the NumPy version:
import numpy as np
N = 64       # number of training samples
D_in = 1000  # dimension of the training data, which is also the number of input-layer neurons
H = 100      # number of hidden-layer neurons
D_out = 10   # number of output-layer neurons, which is also the dimension of the training labels
# Randomly generate training data
x = np.random.randn(N, D_in)   # [64, 1000]: 64 inputs, each 1000-dimensional
y = np.random.randn(N, D_out)  # [64, 10]: the 64 corresponding 10-dimensional targets
# Randomly initialize the network weights (biases are omitted in this network)
w1 = np.random.randn(D_in, H)   # [1000, 100]: weights between the input and hidden layers
w2 = np.random.randn(H, D_out)  # [100, 10]: weights between the hidden and output layers
learning_rate = 1e-6  # learning rate
# Train the network for 500 iterations
for it in range(500):
    # Forward pass
    z1 = x.dot(w1)          # x @ w1, shape [64, 100]
    a1 = np.maximum(z1, 0)  # ReLU activation: values below 0 become 0, values above 0 are unchanged
    y_pred = a1.dot(w2)     # a1 @ w2, shape [64, 10]
    # Loss
    loss = np.square(y_pred - y).sum()  # summed squared-error loss, matching MSELoss(reduction='sum')
    print(it, loss)
    # Backward pass
    # Gradients: the derivation of the formulas is omitted here; the shape annotations below give a quick sanity check
    grad_y_pred = 2.0 * (y_pred - y)  # [64, 10]
    grad_w2 = a1.T.dot(grad_y_pred)   # [100, 10] = [100, 64] * [64, 10]
    grad_a1 = grad_y_pred.dot(w2.T)   # [64, 100] = [64, 10] * [10, 100]
    grad_z1 = grad_a1.copy()
    grad_z1[z1 < 0] = 0               # [64, 100]
    grad_w1 = x.T.dot(grad_z1)        # [1000, 100] = [1000, 64] * [64, 100]
    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
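The comment above skips the derivation of the gradient formulas. One quick way to check them (this snippet is my addition, not from the original post; the tiny sizes and the checked entry (0, 0) are arbitrary) is a central finite-difference comparison against the analytic grad_w2:

import numpy as np

np.random.seed(0)
N, D_in, H, D_out = 4, 5, 3, 2  # tiny sizes so the check runs instantly
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

def loss_fn(w1, w2):
    a1 = np.maximum(x.dot(w1), 0)
    return np.square(a1.dot(w2) - y).sum()

# Analytic gradient, using the same formulas as the training loop above
a1 = np.maximum(x.dot(w1), 0)
grad_y_pred = 2.0 * (a1.dot(w2) - y)
grad_w2 = a1.T.dot(grad_y_pred)

# Numerical gradient for entry (0, 0) of w2 via central differences
eps = 1e-6
w2_plus, w2_minus = w2.copy(), w2.copy()
w2_plus[0, 0] += eps
w2_minus[0, 0] -= eps
numeric = (loss_fn(w1, w2_plus) - loss_fn(w1, w2_minus)) / (2 * eps)
print(grad_w2[0, 0], numeric)  # the two values should agree closely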
And the same thing written with torch tensors:
import torch
N = 64
D_in = 1000
H = 100
D_out = 10
# np.random.randn() becomes torch.randn()
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)
learning_rate = 1e-6
for it in range(500):
    # Forward pass
    z1 = x.mm(w1)         # dot() becomes mm()
    a1 = z1.clamp(min=0)  # np.maximum() becomes .clamp(min, max), which clips values into the given range
    y_pred = a1.mm(w2)
    # Loss
    loss = (y_pred - y).pow(2).sum().item()
    # np.square becomes .pow(2); the computed loss is a tensor, and .item() extracts its Python value
    print(it, loss)
    # Backward pass
    # Gradients
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = a1.t().mm(grad_y_pred)  # .T becomes .t()
    grad_a1 = grad_y_pred.mm(w2.t())
    grad_z1 = grad_a1.clone()         # .copy() becomes .clone()
    grad_z1[z1 < 0] = 0
    grad_w1 = x.t().mm(grad_z1)
    # Update weights
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
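A natural intermediate step between this fully manual version and the nn.Module version at the top (this sketch is my addition, not from the original post) is to keep the hand-written update rule but let autograd compute the gradients, by marking the weights with requires_grad=True:

import torch

N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
w1 = torch.randn(D_in, H, requires_grad=True)  # track gradients on the weights
w2 = torch.randn(H, D_out, requires_grad=True)
learning_rate = 1e-6
for it in range(500):
    y_pred = x.mm(w1).clamp(min=0).mm(w2)  # forward pass
    loss = (y_pred - y).pow(2).sum()
    print(it, loss.item())
    loss.backward()            # autograd fills w1.grad and w2.grad
    with torch.no_grad():      # the updates themselves must not be tracked
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        w1.grad.zero_()        # clear the accumulated gradients
        w2.grad.zero_()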
(Reposted from, with a line-by-line explanation: Pytorch实战入门(一):MLP_pytorch mlp神经网络_秋山丶雪绪的博客-CSDN博客)
Here, optimizer.zero_grad(), loss.backward(), and optimizer.step() do the following:
optimizer.zero_grad(): resets the accumulated gradients to zero so they are ready for the next iteration (the sketch after this list illustrates why this is needed);
loss.backward(): computes the gradients using the backpropagation formulas;
optimizer.step(): updates the weights of every layer according to the computed gradients.
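As a quick illustration (this snippet is my addition, not from the referenced post; the toy variable w is made up for the example), backward() adds new gradients on top of whatever is already stored in .grad, which is why they have to be cleared each iteration:

import torch

w = torch.ones(1, requires_grad=True)
loss = (2 * w).sum()
loss.backward()
print(w.grad)         # tensor([2.])
loss = (2 * w).sum()  # build a fresh graph for the same computation
loss.backward()
print(w.grad)         # tensor([4.]): the new gradient was added on top of the old one
w.grad.zero_()        # conceptually what optimizer.zero_grad() does for every parameter
print(w.grad)         # tensor([0.])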
Reposted from; for the relevant source code and further explanation, see: 理解optimizer.zero_grad(), loss.backward(), optimizer.step()的作用及原理_self.optimizer.step()_潜行隐耀的博客-CSDN博客