# Source (内容来源): Bilibili video
# "最好的PyTorch的入门与实战教程(16小时实战)" (a 16-hour hands-on PyTorch tutorial).
import torch
import numpy as np
# --- Creating tensors -------------------------------------------------------
torch.empty(5, 3)  # 5x3 matrix whose memory is left uninitialized
x1 = torch.rand(5, 3)  # randomly initialized matrix
x2 = torch.zeros(5, 3)  # all-zero matrix
x3 = torch.zeros(5, 3, dtype=torch.long)  # all zeros, dtype switched to long
# Equivalent: x3 = torch.zeros(5, 3).long()
x4 = torch.tensor([5.5, 3])  # build a tensor directly from data
# new_* methods build a tensor from an existing one and reuse its properties
# (for example the dtype) unless they are overridden explicitly.
x5 = x4.new_ones(5, 3)
x6 = x4.new_ones(5, 3, dtype=torch.double)
torch.rand_like(x5, dtype=torch.float)  # random tensor shaped like x5

# --- Inspecting the shape ---------------------------------------------------
x5.shape
# size is a method: without the call parentheses the expression is just the
# bound method object, not the shape.
x5.size()
# --- Arithmetic -------------------------------------------------------------
y1 = torch.rand(5, 3)
print(y1)

# Addition: function form and operator form give the same result.
torch.add(x1, y1)
x1 + y1

# Addition writing into a pre-allocated output tensor.
result = torch.empty(5, 3)
torch.add(x1, y1, out=result)
print(result)

# In-place addition: the sum is stored back into y1.
# Every in-place method ends with "_", e.g. x.copy_(y) and x.t_() modify x.
y1 += x1
print(y1)

# --- Indexing ---------------------------------------------------------------
# NumPy-style indexing works on tensors as well.
print(y1[:, 1:])  # keep every row, drop column 0
print(y1[1:, 1:])  # drop row 0 and column 0
# --- Reshaping with Tensor.view ---------------------------------------------
x7 = torch.randn(4, 4)
y2 = x7.view(16)  # 1-D tensor with 16 elements
y3 = x7.view(2, 8)  # 2x8 matrix
# A -1 lets view infer that dimension (16 / 2 = 8). Only one -1 is allowed,
# and the other dimensions must divide the 16 elements evenly, so
# view(-1, 5) raises an error.
y3 = x7.view(2, -1)

# --- Single-element tensors -------------------------------------------------
# .item() turns a one-element tensor into a plain Python number.
x8 = torch.randn(1)
# .detach() is the recommended replacement for the legacy .data attribute;
# the result is still a tensor.
print(x8.detach())
# None here: x8 was not created with requires_grad=True and no backward pass
# has run, so there is no gradient to show.
print(x8.grad)
print(x8.item())  # a Python number
print(y3.transpose(1, 0))  # transpose of y3
# --- Converting between NumPy arrays and tensors ----------------------------
# The tensor and the array returned by .numpy() share memory, so a write
# through one of them is visible through the other.
a = torch.ones(5)
b = a.numpy()
b[1] = 2
print(a)

# NumPy ndarray -> tensor: the in-place update of c below is visible in d.
c = np.ones(5)
d = torch.from_numpy(c)
c += 1
print(c)
print(d)
# --- CUDA tensors -----------------------------------------------------------
# Everything in this branch runs only when a CUDA device is available.
if torch.cuda.is_available():
    device = torch.device("cuda")  # a CUDA device object
    y = torch.ones_like(x7, device=device)  # create a tensor directly on the GPU
    x7 = x7.to(device)  # or just pass a string: .to("cuda")
    z = x7 + y
    print(z)
    print(z.to("cpu", torch.double))  # .to can change device and dtype together
# NumPy operates on the CPU, so move a tensor there before converting:
# y.to("cpu").data.numpy()
# y.cpu().data.numpy()
# Two-layer network in plain NumPy: one hidden layer, no bias terms, trained
# to predict y from x with a sum-of-squares (L2) loss.
#
#     h      = x  . w1
#     h_relu = max(0, h)
#     y_pred = h_relu . w2
#
# A NumPy ndarray is a plain n-dimensional array, so the forward pass, the
# gradients and the weight update are all written out by hand.
import numpy as np

# N samples, D_in input features, H hidden units, D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training data.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialized weights.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass.
    h = x.dot(w1)  # (N, H)
    h_relu = np.maximum(h, 0)  # (N, H)
    y_pred = h_relu.dot(w2)  # (N, D_out)

    # Loss: sum of squared errors.
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backward pass: gradients of the loss, derived by hand.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = x.T.dot(grad_h)

    # Gradient-descent update of both weight matrices.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
# The same two-layer network, now on torch tensors but still with a
# hand-written backward pass (autograd is not used here).
import torch

# N samples, D_in input features, H hidden units, D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Randomly initialized weights.
w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)

learning_rate = 1e-6
for t in range(500):
    # Forward pass.
    h = x.mm(w1)  # (N, H) matrix multiplication
    h_relu = h.clamp(min=0)  # (N, H); clamp pins values below min to min
    y_pred = h_relu.mm(w2)  # (N, D_out)

    # Loss: sum of squared errors, converted to a Python number.
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backward pass: gradients of the loss, derived by hand.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = x.t().mm(grad_h)

    # Gradient-descent update of both weight matrices.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
# The same two-layer network with gradients computed by autograd instead of a
# hand-written backward pass.
import torch

# N samples, D_in input features, H hidden units, D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# requires_grad=True makes autograd track operations on the weights.
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass in one expression; equivalent to
    #   h = x.mm(w1); h_relu = h.clamp(min=0); y_pred = h_relu.mm(w2)
    y_pred = x.mm(w1).clamp(min=0).mm(w2)  # (N, D_out)

    # Loss: sum of squared errors; kept as a tensor so it carries the graph.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Backward pass: fills w1.grad and w2.grad.
    loss.backward()

    # The update itself must not be recorded in the computation graph.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad
        # Gradients accumulate across backward() calls, so reset them.
        w1.grad.zero_()
        w2.grad.zero_()
# Build the network with the nn library; autograd builds the computation
# graph and computes the gradients. The parameter update is still manual.
import torch
import torch.nn as nn

# N samples, D_in input features, H hidden units, D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = nn.Sequential(
    nn.Linear(D_in, H),  # w_1 * x + b_1
    nn.ReLU(),
    nn.Linear(H, D_out),
)
# Author's note: re-initializing the weights from a normal distribution made
# this model train much better.
nn.init.normal_(model[0].weight)
nn.init.normal_(model[2].weight)
# model = model.cuda()

loss_fn = nn.MSELoss(reduction='sum')
learning_rate = 1e-6
for t in range(500):
    # Forward pass; calling the model runs model.forward().
    y_pred = model(x)

    # Loss (also builds the computation graph).
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear old gradients so they do not accumulate, then backpropagate.
    model.zero_grad()
    loss.backward()

    # Manual gradient-descent step over every parameter of the model.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad
# Build the network with the nn library, let autograd compute the gradients,
# and let a torch.optim optimizer (Adam) apply the parameter updates.
import torch
import torch.nn as nn

# N samples, D_in input features, H hidden units, D_out outputs.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training data.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

model = nn.Sequential(
    nn.Linear(D_in, H),  # w_1 * x + b_1
    nn.ReLU(),
    nn.Linear(H, D_out),
)
# model = model.cuda()

loss_fn = nn.MSELoss(reduction='sum')
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Author's notes: Adam is usually run with a learning rate around 1e-3 to
# 1e-4. With SGD the initial weights had to be re-initialized from a normal
# distribution for the loss to become small; the reason was not investigated.
for t in range(500):
    # Forward pass; calling the model runs model.forward().
    y_pred = model(x)

    # Loss (also builds the computation graph).
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear old gradients so they do not accumulate, then backpropagate.
    optimizer.zero_grad()
    loss.backward()

    # Let the optimizer update the model parameters.
    optimizer.step()
# Setup for the next example: the network is built with the nn library and
# autograd builds the computation graph and computes the gradients.
import torch
import torch.nn as nn

# Number of samples, input features, hidden units and outputs.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random training inputs and targets.
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# Convention: create every sub-module (every layer with learnable parameters)
# in __init__, which defines the model's structure; forward() only wires them.
class TwoLayerNet(torch.nn.Module):
    """Two-layer fully connected network without bias terms.

    Computes ``linear2(max(0, linear1(x)))``.
    """

    def __init__(self, D_in, H, D_out):
        """Create the two bias-free linear layers.

        Args:
            D_in: number of input features.
            H: number of hidden units.
            D_out: number of output features.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H, bias=False)
        self.linear2 = torch.nn.Linear(H, D_out, bias=False)

    def forward(self, x):
        """Run the forward pass and return the prediction for ``x``."""
        hidden = self.linear1(x).clamp(min=0)  # ReLU via clamp
        y_pred = self.linear2(hidden)
        return y_pred
# Train the TwoLayerNet module with Adam on the random data defined above.
model = TwoLayerNet(D_in, H, D_out)
loss_fn = nn.MSELoss(reduction='sum')
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Author's notes: Adam is usually run with a learning rate around 1e-3 to
# 1e-4. With SGD the initial weights had to be re-initialized from a normal
# distribution for the loss to become small; the reason was not investigated.
for t in range(500):
    # Forward pass; calling the model runs model.forward().
    y_pred = model(x)

    # Loss (also builds the computation graph).
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Clear old gradients so they do not accumulate, then backpropagate.
    optimizer.zero_grad()
    loss.backward()

    # Let the optimizer update the model parameters.
    optimizer.step()