from __future__ import print_function
import torch
torch.__version__
'1.4.0'
A fully connected ReLU network with one hidden layer and no bias, trained to predict y from x with an L2 loss.
This implementation uses numpy alone to compute the forward pass, the loss, and backpropagation.
A numpy ndarray is a plain n-dimensional array. It knows nothing about deep learning, gradients, or computation graphs; it is simply a data structure for numerical computation.
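As a tiny illustration (the shapes here are arbitrary), the two ndarray operations the network below is built from are matrix multiplication (dot) and an elementwise maximum (np.maximum, which is exactly ReLU):

import numpy as np

a = np.random.randn(2, 3)   # a small random matrix
b = np.random.randn(3, 4)
h = a.dot(b)                # matrix multiplication, shape (2, 4)
h_relu = np.maximum(h, 0)   # elementwise max with 0, i.e. ReLU
print(h_relu.shape)         # (2, 4)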
import numpy as np

N, D_in, H, D_out = 64, 1000, 100, 10
# Randomly generate some training data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)
# Randomly initialize the weights (standard normal here; other
# distributions work too, but this performed well in testing)
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)
# Learning rate
learning_rate = 1e-6
# Start training
for it in range(500):
    # Forward pass
    h = x.dot(w1)              # N * H
    h_relu = np.maximum(h, 0)  # N * H
    y_pred = h_relu.dot(w2)    # N * D_out
    # Compute the loss (mean squared error over the batch)
    loss = np.sum(np.square(y_pred - y)) / N
    # Print the loss every 50 iterations
    if it % 50 == 0:
        print(it, loss)
    # Backward pass: compute the gradients by hand.
    # These are the gradients of the un-normalized sum-of-squares loss;
    # the constant 1/N factor is effectively absorbed into the learning rate.
    grad_y_pred = 2.0 * (y_pred - y)      # N * D_out
    grad_w2 = h_relu.T.dot(grad_y_pred)   # H * D_out
    grad_h_relu = grad_y_pred.dot(w2.T)   # N * H
    grad_h = grad_h_relu.copy()           # N * H
    grad_h[h < 0] = 0                     # N * H
    grad_w1 = x.T.dot(grad_h)             # D_in * H
    # Update the weights w1 and w2
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
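The hand-derived gradients above follow from the chain rule. Writing $L = \sum (y_{pred} - y)^2$ (the constant $1/N$ factor only rescales every gradient uniformly):

$$\frac{\partial L}{\partial y_{pred}} = 2\,(y_{pred} - y), \qquad \frac{\partial L}{\partial w_2} = h_{relu}^{T}\,\frac{\partial L}{\partial y_{pred}}, \qquad \frac{\partial L}{\partial h_{relu}} = \frac{\partial L}{\partial y_{pred}}\, w_2^{T}$$

$$\frac{\partial L}{\partial h} = \frac{\partial L}{\partial h_{relu}} \odot \mathbb{1}[h > 0], \qquad \frac{\partial L}{\partial w_1} = x^{T}\,\frac{\partial L}{\partial h}$$

where $\odot\, \mathbb{1}[h > 0]$ is the ReLU gate that zeroes the entries with $h < 0$, which is exactly what grad_h[h < 0] = 0 does in the code.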
As the output below shows, the model's loss keeps decreasing.
0 453837.5006405384
50 254.0963990846821
100 12.189041646454672
150 1.02029282379215
200 0.10376308159967264
250 0.011484592626939395
300 0.0013329704334778575
350 0.00015973632332052156
400 1.9599518664087285e-05
450 2.4488332542399643e-06
This time we use PyTorch tensors to build the forward pass, compute the loss, and run backpropagation.
A PyTorch Tensor is very similar to a numpy ndarray, with one key difference: a PyTorch Tensor can run computations on either the CPU or the GPU. To compute on the GPU, the Tensor must be moved to a CUDA device.
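A minimal sketch of moving a tensor between devices (falling back to the CPU when no CUDA device is available):

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
t = torch.randn(3, 4)   # created on the CPU by default
t = t.to(device)        # now on the GPU, if one is available
print(t.device)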
N, D_in, H, D_out = 64, 1000, 100, 10
# Randomly generate some training data
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# Randomly initialize the weights
w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)
# Learning rate
learning_rate = 1e-6
for it in range(500):
    # Forward pass
    h = x.mm(w1)               # mm is torch's matrix multiplication
    # torch.clamp(input, min, max, out=None) -> Tensor; clamp(min=0) acts as ReLU
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)
    # Compute the loss
    loss = (y_pred - y).pow(2).sum() / N
    # numpy equivalent: loss = np.sum(np.square(y_pred - y)) / N
    if it % 50 == 0:
        print(it, loss.item())   # .item() extracts the Python number from a scalar tensor
    # Backward pass: compute the gradients by hand
    grad_y_pred = 2.0 * (y_pred - y)
    # torch.t(input, out=None) -> Tensor; t() transposes a matrix
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)
    # Update the weights w1 and w2
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
After porting the numpy implementation to PyTorch, the loss decreases just the same.
0 376953.78125
50 205.20765686035156
100 8.080462455749512
150 0.5182541608810425
200 0.03946444019675255
250 0.003323344048112631
300 0.0003024951438419521
350 3.265954364906065e-05
400 5.786065230495296e-06
450 1.7886110299514257e-06
An important PyTorch feature is autograd: once the forward pass is defined and the loss computed, PyTorch can automatically compute the gradients of every model parameter.
A PyTorch Tensor represents a node in a computation graph. If x is a Tensor with x.requires_grad=True, then x.grad is another Tensor that holds the current gradient of some scalar (usually the loss) with respect to x.
$$y = w \cdot x + b; \qquad x, w, b = 1, 2, 3$$
x = torch.tensor(1., requires_grad=True)
w = torch.tensor(2., requires_grad=True)
b = torch.tensor(3., requires_grad=True)
y = w * x + b  # 2*1 + 3 = 5
y.backward()
print(w.grad)
print(x.grad)
print(b.grad)
print(y.requires_grad)
tensor(1.)
tensor(2.)
tensor(1.)
True
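These values match the analytic derivatives: $\partial y/\partial w = x = 1$, $\partial y/\partial x = w = 2$, and $\partial y/\partial b = 1$. y.requires_grad is True because y was computed from tensors that require gradients.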
N, D_in, H, D_out = 64, 1000, 100, 10
# Randomly generate some training data
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# Randomly initialize the weights
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)
# Learning rate
learning_rate = 1e-6
for it in range(500):
    # Forward pass: matrix multiply, ReLU, matrix multiply
    y_pred = x.mm(w1).clamp(min=0).mm(w2)
    # Compute the loss (summed, not averaged, squared error this time)
    loss = (y_pred - y).pow(2).sum()
    if it % 50 == 0:
        print(it, loss.item())
    # Backward pass: autograd computes all the gradients for us
    loss.backward()
    with torch.no_grad():
        # Update the weights w1 and w2
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # Reset the gradients to zero for the next iteration
        w1.grad.zero_()
        w2.grad.zero_()
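Two details matter here: the updates run inside torch.no_grad() so that the parameter updates themselves are not recorded in the computation graph, and the .grad buffers are zeroed after each step because backward() accumulates gradients rather than overwriting them. Note also that the loss is now the summed (not averaged) squared error, which is why the printed values start much larger than in the previous runs.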
0 34451024.0
50 8608.970703125
100 195.85415649414062
150 8.930438041687012
200 0.5644952654838562
250 0.04160516336560249
300 0.00351862539537251
350 0.0004813229606952518
400 0.00012497704301495105
450 5.010930181015283e-05
This time we build the network with PyTorch's nn package. PyTorch autograd still builds the computation graph and computes the gradients for us automatically.
# neural network
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10
# Randomly generate some training data (drop the device argument if you have no GPU)
x = torch.randn(N, D_in, device='cuda')
y = torch.randn(N, D_out, device='cuda')

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H, bias=True),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out, bias=True),
)
# Move the model to the GPU
model = model.cuda()
# Re-initialize the weights from a standard normal distribution. The choice of
# initialization interacts with the optimizer: the optimizers used further below
# do not work well with this normal initialization, so it is commented out there.
torch.nn.init.normal_(model[0].weight)
torch.nn.init.normal_(model[2].weight)
# Loss function: summed squared error
loss_fn = torch.nn.MSELoss(reduction='sum')
# Learning rate
learning_rate = 1e-6
for it in range(500):
    # Forward pass (calling the model invokes its forward method)
    y_pred = model(x)
    # Compute the loss
    loss = loss_fn(y_pred, y)
    if it % 50 == 0:
        print(it, loss.item())
    # Backward pass: compute the gradients
    loss.backward()
    # Update the parameters
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad
    # Reset the gradients of all model parameters to zero
    model.zero_grad()
0 33782936.0
50 12429.576171875
100 396.4686584472656
150 22.655574798583984
200 1.6235853433609009
250 0.12922528386116028
300 0.011064080521464348
350 0.0012060991721227765
400 0.000243198883254081
450 8.291941048810259e-05
model
Sequential(
  (0): Linear(in_features=1000, out_features=100, bias=True)
  (1): ReLU()
  (2): Linear(in_features=100, out_features=10, bias=True)
)
print(model[0])
print(model[1])
print(model[2])
Linear(in_features=1000, out_features=100, bias=True)
ReLU()
Linear(in_features=100, out_features=10, bias=True)
print(model[0].weight, model[0].weight.size())
The weight has shape (100, 1000): 1000 is the number of input features and 100 is the number of hidden units (nn.Linear stores its weight as (out_features, in_features)).
Parameter containing:
tensor([[ 2.9527, 0.0705, 0.6567, ..., 1.8047, 0.0522, 0.4627],
[-1.0993, -0.1027, 1.6518, ..., 1.9525, -1.0062, 0.7210],
[ 0.1221, -1.4789, -0.2117, ..., 1.3389, 1.6465, -1.3678],
...,
[-0.7976, 0.9299, 1.3768, ..., 0.6766, -3.0162, -0.4211],
[-1.1172, 0.3629, -1.1479, ..., 2.1785, -1.0729, 0.7226],
[-0.9535, 1.3007, -0.3081, ..., 1.5089, -2.0301, 0.5141]],
device='cuda:0', requires_grad=True) torch.Size([100, 1000])
model[0].bias
The bias is a 100-dimensional vector, one entry per hidden unit.
Parameter containing:
tensor([-0.0829, -0.1126, -0.0959, -0.1200, -0.0569, -0.1234, -0.1879, -0.1558,
-0.1204, -0.0943, -0.0494, -0.0972, -0.0946, -0.1077, -0.1254, -0.1321,
-0.0894, -0.0993, -0.0949, -0.1338, -0.1247, -0.1498, -0.0898, -0.0614,
-0.1405, -0.0832, -0.0792, -0.1756, -0.0647, -0.0518, -0.0877, -0.0283,
-0.1253, -0.1160, -0.0896, -0.0896, -0.0209, -0.0955, -0.1371, -0.1240,
-0.1514, -0.0774, -0.0422, -0.0517, -0.1434, -0.0883, -0.0226, -0.1384,
-0.0290, -0.0724, -0.0603, -0.0943, -0.1584, -0.0428, -0.0475, -0.0807,
-0.1504, -0.0699, -0.1765, -0.1281, -0.0665, -0.1321, -0.1663, -0.1214,
-0.0727, -0.0616, -0.0429, -0.0895, -0.1085, -0.0662, -0.1271, -0.0838,
-0.0610, -0.0896, -0.0548, -0.1340, -0.1241, -0.0455, -0.0441, -0.2305,
-0.1060, -0.1799, -0.0398, -0.1280, -0.0790, -0.1453, -0.0916, -0.0527,
-0.0966, -0.0351, -0.0508, -0.0573, -0.0838, -0.0924, -0.1029, -0.0669,
-0.0871, -0.1145, -0.0758, -0.1035], device='cuda:0',
requires_grad=True)
This time, instead of updating the model weights by hand, we let the optim package update the parameters for us.
The optim package provides a variety of optimization methods, including SGD+momentum, RMSprop, Adam, and more.
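A minimal sketch of how these optimizers are constructed (the tiny Linear model and the learning rates below are placeholder values for illustration):

import torch

toy = torch.nn.Linear(10, 2)  # a placeholder model, just to supply parameters
optimizer = torch.optim.SGD(toy.parameters(), lr=1e-2, momentum=0.9)  # SGD with momentum
# optimizer = torch.optim.RMSprop(toy.parameters(), lr=1e-3)          # RMSprop
# optimizer = torch.optim.Adam(toy.parameters(), lr=1e-4)             # Adam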
# neural network
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10
# Randomly generate some training data
x = torch.randn(N, D_in, device='cuda')
y = torch.randn(N, D_out, device='cuda')

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H, bias=True),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out, bias=True),
)
# Move the model to the GPU
model = model.cuda()
# Keep the default Linear initialization; normal initialization is left out here
# torch.nn.init.normal_(model[0].weight)
# torch.nn.init.normal_(model[2].weight)
# Loss function
loss_fn = torch.nn.MSELoss(reduction='sum')
# Learning rate
learning_rate = 1e-4
# Optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
for it in range(500):
    # Forward pass
    y_pred = model(x)
    # Compute the loss
    loss = loss_fn(y_pred, y)
    if it % 50 == 0:
        print(it, loss.item())
    # Backward pass: compute the gradients
    loss.backward()
    # Take one optimization step, then reset the gradients
    optimizer.step()
    optimizer.zero_grad()
0 649.89404296875
50 32.93153381347656
100 2.3027215003967285
150 0.25849246978759766
200 0.03555472940206528
250 0.005513873882591724
300 0.0009421082213521004
350 0.00016964430687949061
400 3.1613410101272166e-05
450 6.031679731677286e-06
# neural network
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10
# Randomly generate some training data
x = torch.randn(N, D_in, device='cuda')
y = torch.randn(N, D_out, device='cuda')

model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H, bias=True),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out, bias=True),
)
# Move the model to the GPU
model = model.cuda()
# Loss function
loss_fn = torch.nn.MSELoss(reduction='sum')
# Learning rate
learning_rate = 1e-4
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for it in range(500):
    # Forward pass
    y_pred = model(x)
    # Compute the loss
    loss = loss_fn(y_pred, y)
    if it % 50 == 0:
        print(it, loss.item())
    # Backward pass: compute the gradients
    loss.backward()
    # Take one optimization step, then reset the gradients
    optimizer.step()
    optimizer.zero_grad()
0 631.9619140625
50 178.90069580078125
100 41.89125061035156
150 6.091819763183594
200 0.6236540079116821
250 0.053772542625665665
300 0.003956200089305639
350 0.0002552031655795872
400 1.3869885151507333e-05
450 5.88077000429621e-07
We can also define the model as a class inheriting from nn.Module. Whenever a model is more complex than a plain Sequential stack, an nn.Module subclass is the way to define it.
# neural network
import torch.nn as nn

N, D_in, H, D_out = 64, 1000, 100, 10
# Randomly generate some training data
x = torch.randn(N, D_in, device='cuda')
y = torch.randn(N, D_out, device='cuda')

class TwoLayerModel(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerModel, self).__init__()
        # Define the model architecture
        self.liner1 = torch.nn.Linear(D_in, H, bias=True)
        self.liner2 = torch.nn.Linear(H, D_out, bias=True)

    def forward(self, x):
        # clamp(min=0) applies the ReLU between the two linear layers
        y_pred = self.liner2(self.liner1(x).clamp(min=0))
        return y_pred

model = TwoLayerModel(D_in, H, D_out)
# Move the model to the GPU
model = model.cuda()
# Loss function
loss_fn = torch.nn.MSELoss(reduction='sum')
# Learning rate
learning_rate = 1e-4
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for it in range(500):
    # Forward pass
    y_pred = model(x)
    # Compute the loss
    loss = loss_fn(y_pred, y)
    if it % 50 == 0:
        print(it, loss.item())
    # Backward pass: compute the gradients
    loss.backward()
    # Take one optimization step, then reset the gradients
    optimizer.step()
    optimizer.zero_grad()
0 641.8615112304688
50 175.04244995117188
100 37.29473876953125
150 5.148418426513672
200 0.4981759190559387
250 0.03344475477933884
300 0.0012865143362432718
350 2.4406330339843407e-05
400 1.8774034060697886e-07
450 5.518275836280395e-10
model
TwoLayerModel(
  (liner1): Linear(in_features=1000, out_features=100, bias=True)
  (liner2): Linear(in_features=100, out_features=10, bias=True)
)
This article has walked, step by step, through refining a pure numpy implementation into a full PyTorch one.