Updated continuously as I study. Updates completed on 2020.08.14.
Reference course: 刘二大人's《PyTorch深度学习实践》(PyTorch Deep Learning Practice).
This chapter builds on forward propagation through the computation graph (evaluating the expression) and backpropagation (differentiating it), which need no further elaboration here.
Core formula
The gradient descent update is w := w - α · ∂cost/∂w.
Gradient descent is essentially a greedy method: each step moves in the direction that decreases the cost fastest. When the cost function is convex, this greedy strategy is guaranteed to find the optimum, because every local minimum is also the global minimum.
In addition, the learning rate α must not be too large, otherwise the iteration may fail to converge.
An example of the gradient descent algorithm (you really have to write it yourself to learn it):
def compute(x):
    return w * x  # linear model y = w * x

def cost(xs, ys):
    total = 0
    for x, y in zip(xs, ys):
        total += (compute(x) - y) ** 2
    return total / len(xs)  # mean squared error over the dataset

def gradient(xs, ys):
    grad = 0
    for x, y in zip(xs, ys):
        grad += 2 * x * (w * x - y)
    return grad / len(xs)  # analytic gradient d(cost)/dw

w = 1.0
xs = [1.0, 2.0, 3.0]
ys = [2.0, 4.0, 6.0]
print('f(4) before training:' + str(compute(4)))
for i in range(1000):
    tempCost = cost(xs, ys)
    tempGradient = gradient(xs, ys)
    w -= 0.01 * tempGradient  # learning rate 0.01
    print('cost:' + str(tempCost) + '\t' + 'w:' + str(w))
print('f(4) after training:' + str(compute(4)))
The figure shows a two-layer neural network; a nonlinear function (here the sigmoid function, introduced in Wu's course) is applied to the output of each layer.
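As a minimal sketch of what that figure describes (the layer sizes here are assumptions chosen purely for illustration), the two-layer computation with a sigmoid after each layer looks like:

import torch

x = torch.randn(3, 1)                           # hypothetical input with 3 features
W1, b1 = torch.randn(4, 3), torch.randn(4, 1)   # hypothetical first-layer parameters
W2, b2 = torch.randn(1, 4), torch.randn(1, 1)   # hypothetical second-layer parameters
h = torch.sigmoid(W1 @ x + b1)                  # nonlinearity after the first layer
y = torch.sigmoid(W2 @ h + b2)                  # and after the second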
import torch

def compute(x):
    return w * x  # w is a Tensor; x is automatically promoted to a Tensor, so the result is a Tensor

def loss(x, y):
    return (compute(x) - y) ** 2  # returns a Tensor

x_data = [1.0, 2.0, 3.0]
y_data = [2.0, 4.0, 6.0]
w = torch.Tensor([1.0])  # the initial value must be wrapped in brackets
w.requires_grad = True   # mark w as requiring gradients; every node in the computation graph that w participates in then carries gradient information

for i in range(100):
    for x, y in zip(x_data, y_data):  # stochastic gradient descent: one sample at a time
        lossResult = loss(x, y)
        lossResult.backward()         # backpropagation
        w.data -= 0.01 * w.grad.data
        w.grad.data.zero_()           # reset w's gradient after each sample to avoid accumulation
    print('Time:' + str(i+1) + '\tw:' + str(w.data) + '\tresult:' + str(compute(4).data))
Design steps:
- Prepare the dataset
- Design the model that computes the estimate ŷ (i.e. build the computation graph)
- Construct the loss function and the optimizer with the PyTorch API
- Train: forward pass to compute the loss, backward pass to compute gradients, then update the weights
Example:
import torch

# Step 1
x_data = torch.Tensor([[1.0], [2.0], [3.0]])
y_data = torch.Tensor([[2.0], [4.0], [6.0]])

# Step 2
class LinearModule(torch.nn.Module):  # must inherit from nn.Module, the base class of network modules, and implement __init__() and forward()
    def __init__(self):
        super(LinearModule, self).__init__()
        self.linear = torch.nn.Linear(1, 1)  # the arguments are the input and output feature dimensions; the layer holds the weight w and bias b as Tensors

    def forward(self, x):
        y_pred = self.linear(x)  # linear is a Linear instance; the class implements __call__(), so the instance can be called like a function, which invokes forward()
        return y_pred

model = LinearModule()  # instantiate; model itself is callable
# e.g. model(torch.Tensor([[1.0]])) computes the prediction for x = 1

# Step 3
criterion = torch.nn.MSELoss(size_average=False)  # True also works (it averages instead of summing); newer PyTorch versions use reduction='sum' / 'mean'
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)  # lr is the learning rate

# Step 4
for epoch in range(1000):
    # forward
    y_pred = model(x_data)
    loss = criterion(y_pred, y_data)
    print(epoch, loss.data)
    # backward
    optimizer.zero_grad()  # clear the gradients
    loss.backward()
    # update
    optimizer.step()

# print the results
print(model.linear.weight.item())  # w
print(model.linear.bias.item())    # b
Classification problems
- Logistic regression is used for classification. A classification problem outputs the probability of each class and takes the most probable class as the result.
- For the earlier example, the output changes from a score to the probability of passing the exam.
- The logistic function (written σ) maps the real-valued output into [0, 1]. The loss function changes accordingly, to the binary cross entropy -(y·log(y') + (1-y)·log(1-y')).
# Noting only what is distinctive about the logistic regression model
import torch

class LogisticModel(torch.nn.Module):
    def __init__(self):
        super(LogisticModel, self).__init__()
        self.linear = torch.nn.Linear(1, 1)

    def forward(self, x):
        y_pred = torch.sigmoid(self.linear(x))  # apply the sigmoid to the original ŷ (torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid)
        return y_pred

criterion = torch.nn.BCELoss(size_average=False)  # binary cross entropy; whether it averages affects the effective learning rate
Logistic model with multi-dimensional input: each sample is now a vector, and the output y' for a single sample is still a probability (y' ∈ [0, 1]). Compared with one-dimensional input, w*x changes from a scalar product to a vector inner product.
Merging the N per-sample equations into one matrix operation, i.e. ŷ = σ(Xw + b) with the N sample vectors stacked as the rows of X, exploits vectorization to speed up training.
import torch
import numpy as np

xy = np.loadtxt('diabetes.csv.gz', delimiter=',', dtype=np.float32)
x_data = torch.from_numpy(xy[:, :-1])
y_data = torch.from_numpy(xy[:, [-1]])

class MultiDimensionLogisticModel(torch.nn.Module):
    def __init__(self):
        super(MultiDimensionLogisticModel, self).__init__()
        self.linear1 = torch.nn.Linear(8, 6)  # input dimension 8 (each sample has 8 features), output dimension 6
        self.linear2 = torch.nn.Linear(6, 4)
        self.linear3 = torch.nn.Linear(4, 1)  # after three layers the output is one-dimensional
        self.sigmoid = torch.nn.Sigmoid()     # the sigmoid adds the nonlinearity to the computation graph

    def forward(self, x):
        x = self.sigmoid(self.linear1(x))
        x = self.sigmoid(self.linear2(x))
        x = self.sigmoid(self.linear3(x))
        return x

model = MultiDimensionLogisticModel()
criterion = torch.nn.BCELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

for epoch in range(1000):
    # forward
    y_pred = model(x_data)  # the whole dataset is processed at once; the next chapter processes one batch at a time
    loss = criterion(y_pred, y_data)
    print(epoch, loss.data)
    # backward
    optimizer.zero_grad()
    loss.backward()
    # update
    optimizer.step()
When using a DataLoader for the input, you additionally define a class that inherits from Dataset (an abstract class) and implements the __init__(), __getitem__(), and __len__() methods.
# Step 4
for epoch in range(training_epochs):  # run training_epochs passes over the whole dataset
    for i in range(total_batch):      # process one batch at a time
        # TODO
Dataset and DataLoader
The DataLoader iterates in units of batches; the batch size is specified when the loader is created.
A Dataset returns one sample at a time, whereas the DataLoader returns a batch of samples at a time.
Example
import torch
import numpy as np
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

# Step 1
class DiabetesDataset(Dataset):
    def __init__(self, filepath):
        xy = np.loadtxt(filepath, delimiter=',', dtype=np.float32)
        self.x_data = torch.from_numpy(xy[:, :-1])
        self.y_data = torch.from_numpy(xy[:, [-1]])
        self.len = xy.shape[0]  # extra step compared with not using a Dataset

    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        return self.len

dataset = DiabetesDataset('diabetes.csv.gz')
# dataset is the dataset to load from, batch_size sets the number of samples per batch,
# shuffle controls whether the sample order is shuffled, num_workers is the number of loading processes
train_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True, num_workers=2)

# Step 2 (unchanged)
class MultiDimensionLogisticModel(torch.nn.Module):
    def __init__(self):
        super(MultiDimensionLogisticModel, self).__init__()
        self.linear1 = torch.nn.Linear(8, 6)
        self.linear2 = torch.nn.Linear(6, 4)
        self.linear3 = torch.nn.Linear(4, 1)
        self.sigmoid = torch.nn.Sigmoid()  # the sigmoid adds the nonlinearity to the computation graph

    def forward(self, x):
        x = self.sigmoid(self.linear1(x))
        x = self.sigmoid(self.linear2(x))
        x = self.sigmoid(self.linear3(x))
        return x

model = MultiDimensionLogisticModel()

# Step 3 (unchanged)
criterion = torch.nn.BCELoss(size_average=False)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Step 4
for epoch in range(100):
    for i, data in enumerate(train_loader, 0):  # each iteration yields the index and one batch
        x, y = data  # x and y are Tensors: x holds a batch of inputs, y the corresponding batch of labels
        # forward
        y_pred = model(x)
        loss = criterion(y_pred, y)
        print(epoch, i, loss.item())
        # backward
        optimizer.zero_grad()
        loss.backward()
        # update
        optimizer.step()
Introducing a Softmax layer
Compared with binary classification, multi-class classification must output a probability for every class. Treating each class as an independent binary problem can leave the class probabilities summing to something other than 1. The fix is to replace the Sigmoid layer before the final output with a Softmax layer.
How the Softmax layer is used in practice
① Computing it directly with numpy
import numpy as np

y = np.array([1, 0, 0])          # one-hot label
z = np.array([0.2, 0.1, -0.1])   # input to the Softmax
y_pred = np.exp(z) / np.exp(z).sum()
loss = (-y * np.log(y_pred)).sum()
② Use the NLLLoss loss function: it takes the network output after Softmax and Log, together with the label, and produces the loss. CrossEntropyLoss, used below, fuses LogSoftmax and NLLLoss, so it is fed the raw (pre-softmax) scores directly; a sketch of the explicit NLLLoss call follows the code block.
import torch

y = torch.LongTensor([2, 0, 1])  # note: the labels must be a LongTensor
# z_1 and z_2 are the raw outputs of the last layer, before the Softmax, so each row does not sum to 1
# each row is the score vector for one sample; there are three samples
z_1 = torch.Tensor([[0.1, 0.2, 0.9],
                    [1.1, 0.1, 0.2],
                    [0.2, 2.1, 0.1]])
z_2 = torch.Tensor([[0.9, 0.2, 0.1],
                    [0.1, 0.1, 0.5],
                    [0.2, 0.1, 0.7]])
criterion = torch.nn.CrossEntropyLoss()
print(criterion(z_1, y), criterion(z_2, y))  # z_1 matches the labels better, so its loss is smaller
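A minimal sketch of the explicit NLLLoss route described in ② (reusing z_1 and y from above, not part of the original snippet): apply log-softmax yourself and feed the result to NLLLoss; the value should match criterion(z_1, y).

log_probs = torch.nn.functional.log_softmax(z_1, dim=1)  # Softmax followed by Log
print(torch.nn.NLLLoss()(log_probs, y))                   # same value as CrossEntropyLoss on the raw scores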
# Convert the PIL image to a Tensor
from torchvision import transforms

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307, ), (0.3081, ))  # mean and std of the MNIST pixels
])
② Designing the model
view reshapes the N inputs of size 1×28×28 into N rows of 784 values each.
Note that the last layer has no activation (CrossEntropyLoss expects the raw logits).
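A minimal sketch of the fully connected design described above (the hidden-layer sizes are an assumption made for illustration, not taken from this note):

import torch
import torch.nn.functional as F

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.l1 = torch.nn.Linear(784, 512)
        self.l2 = torch.nn.Linear(512, 256)
        self.l3 = torch.nn.Linear(256, 128)
        self.l4 = torch.nn.Linear(128, 10)

    def forward(self, x):
        x = x.view(-1, 784)       # flatten N x 1 x 28 x 28 into N rows of 784
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        x = F.relu(self.l3(x))
        return self.l4(x)         # no activation on the last layer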
How images are stored
RGB (raster): the input image has shape 3×w×h; the channels correspond to red/green/blue, and each element takes a value from 0 to 255.
Vector graphics: store the drawing instructions of the image; the distinguishing feature is that no pixel blocks appear when scaling up.
Overview of image convolution
Convolving an image essentially operates on a local patch of the image across all of its channels. The operation changes the image's c, w and h.
Multi-channel convolution
It is essentially the sum of the single-channel convolutions. Note that the number of channels of the kernel must equal the number of input channels.
This operation turns an n-channel input into a 1-channel output.
To obtain an m-channel output, prepare m such kernels; the corresponding weight is then a four-dimensional tensor of shape m × n × kernel_width × kernel_height.
import torch

in_channels, out_channels = 5, 10
width, height = 100, 100
kernel_size = 3
batch_size = 1
input = torch.randn(batch_size, in_channels, width, height)  # shape (1, 5, 100, 100)
conv_layer = torch.nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size)
output = conv_layer(input)  # shape (1, 10, 98, 98): each spatial dimension shrinks by kernel_size - 1
import torch

input = [3, 4, 6, 5, 7,
         2, 4, 6, 8, 2,
         1, 6, 7, 8, 4,
         9, 7, 4, 6, 2,
         3, 7, 5, 4, 1]
input = torch.Tensor(input).view(1, 1, 5, 5)  # dimensions: Batch, Channel, Width, Height
conv_layer = torch.nn.Conv2d(1, 1, kernel_size=3, padding=1, bias=False)  # first two arguments: input channels, output channels
kernel = torch.Tensor([1, 2, 3, 4, 5, 6, 7, 8, 9]).view(1, 1, 3, 3)  # output channels, input channels, Width, Height
conv_layer.weight.data = kernel.data
output = conv_layer(input)  # padding=1 keeps the 5x5 size
Stride
Changing padding=1 in the code above to stride=2 yields a 2×2 result, as sketched below.
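A minimal sketch of that change, reusing input and kernel from the block above:

conv_layer = torch.nn.Conv2d(1, 1, kernel_size=3, stride=2, bias=False)  # stride 2, no padding
conv_layer.weight.data = kernel.data
output = conv_layer(input)
print(output.shape)  # torch.Size([1, 1, 2, 2])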
Max Pooling layer
A MaxPooling layer with kernel_size=2 halves both the rows and the columns of the input.
The operation is independent of the channels: the number of channels is unchanged.
import torch

input = [3, 4, 6, 5,
         2, 4, 6, 8,
         1, 6, 7, 8,
         9, 7, 4, 6]
input = torch.Tensor(input).view(1, 1, 4, 4)
maxpooling_layer = torch.nn.MaxPool2d(kernel_size=2)
output = maxpooling_layer(input)  # 2x2 result: the maximum of each 2x2 block, i.e. [[4, 8], [9, 8]]
import torch
import torch.nn.functional as F

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = torch.nn.Conv2d(1, 10, kernel_size=5)   # convolution layer 1
        self.conv2 = torch.nn.Conv2d(10, 20, kernel_size=5)  # convolution layer 2
        self.pooling = torch.nn.MaxPool2d(2)
        self.linear = torch.nn.Linear(320, 10)  # 320 inputs (20 channels x 4 x 4 after two conv+pool stages on 28x28), 10 outputs (one per class)

    def forward(self, x):
        batch_size = x.size(0)
        x = self.pooling(F.relu(self.conv1(x)))
        x = self.pooling(F.relu(self.conv2(x)))
        x = x.view(batch_size, -1)  # flatten for the fully connected layer
        x = self.linear(x)
        return x

model = Net()
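As a quick sanity check (assuming MNIST-sized 1×28×28 inputs, which the snippet itself does not state), one dummy batch confirms the 320-feature flatten and the 10-way output:

dummy = torch.randn(1, 1, 28, 28)
print(model(dummy).shape)  # torch.Size([1, 10])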
# To run on the GPU, add to the code above
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# and during training and testing, after input, target = data, also add
input, target = input.to(device), target.to(device)
Inception Module
Apply several different convolutions to the same input and merge the results, letting training decide which branches work best.
The different paths may only change the number of channels, not the width or height (because the results must be merged at the end).
The Concatenate operation joins the different convolution results along the channel dimension.
# For each branch, the module definitions belong in __init__ and the calls belong in forward
# Branch 1
self.branch_pool = nn.Conv2d(in_channels, 24, kernel_size=1)
branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
branch_pool = self.branch_pool(branch_pool)
# Branch 2
self.branch1x1 = nn.Conv2d(in_channels, 16, kernel_size=1)
branch1x1 = self.branch1x1(x)
# Branch 3
self.branch5x5_1 = nn.Conv2d(in_channels, 16, kernel_size=1)
self.branch5x5_2 = nn.Conv2d(16, 24, kernel_size=5, padding=2)
branch5x5 = self.branch5x5_1(x)
branch5x5 = self.branch5x5_2(branch5x5)
# Branch 4
self.branch3x3_1 = nn.Conv2d(in_channels, 16, kernel_size=1)
self.branch3x3_2 = nn.Conv2d(16, 24, kernel_size=3, padding=1)
self.branch3x3_3 = nn.Conv2d(24, 24, kernel_size=3, padding=1)
branch3x3 = self.branch3x3_1(x)
branch3x3 = self.branch3x3_2(branch3x3)
branch3x3 = self.branch3x3_3(branch3x3)
# Concatenate
outputs = [branch1x1, branch5x5, branch3x3, branch_pool]
return torch.cat(outputs, dim=1)  # in (batch, channel, width, height) the channel dimension has index 1, so results are joined along the channels
Full example
import torch
from torch import nn
import torch.nn.functional as F

# The Inception block shown in the figure
class InceptionA(nn.Module):
    def __init__(self, in_channels):
        super(InceptionA, self).__init__()
        # define the convolution parts of each branch:
        # branch 2
        self.branch1x1 = nn.Conv2d(in_channels, 16, kernel_size=1)
        # branch 3
        self.branch5x5_1 = nn.Conv2d(in_channels, 16, kernel_size=1)
        self.branch5x5_2 = nn.Conv2d(16, 24, kernel_size=5, padding=2)
        # branch 4
        self.branch3x3_1 = nn.Conv2d(in_channels, 16, kernel_size=1)
        self.branch3x3_2 = nn.Conv2d(16, 24, kernel_size=3, padding=1)
        self.branch3x3_3 = nn.Conv2d(24, 24, kernel_size=3, padding=1)
        # branch 1
        self.branch_pool = nn.Conv2d(in_channels, 24, kernel_size=1)

    def forward(self, x):
        # branch 2
        branch1x1 = self.branch1x1(x)
        # branch 3
        branch5x5 = self.branch5x5_1(x)
        branch5x5 = self.branch5x5_2(branch5x5)
        # branch 4
        branch3x3 = self.branch3x3_1(x)
        branch3x3 = self.branch3x3_2(branch3x3)
        branch3x3 = self.branch3x3_3(branch3x3)
        # branch 1
        branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
        branch_pool = self.branch_pool(branch_pool)
        # Concatenate
        outputs = [branch1x1, branch5x5, branch3x3, branch_pool]
        return torch.cat(outputs, dim=1)  # 16 + 24 + 24 + 24 = 88 output channels

# A model built around the Inception block
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(88, 20, kernel_size=5)  # 88 is the channel count after the first Inception block
        self.incep1 = InceptionA(in_channels=10)
        self.incep2 = InceptionA(in_channels=20)
        self.mp = nn.MaxPool2d(2)                      # pooling does not change the channel count
        self.linear = nn.Linear(1408, 10)              # 1408 = 88 channels x 4 x 4 for 28x28 inputs

    def forward(self, x):
        in_size = x.size(0)
        x = F.relu(self.mp(self.conv1(x)))  # convolution, then pooling, then activation
        x = self.incep1(x)
        x = F.relu(self.mp(self.conv2(x)))
        x = self.incep2(x)
        x = x.view(in_size, -1)
        x = self.linear(x)
        return x
from torch import nn
import torch.nn.functional as F

class ResidualBlock(nn.Module):
    def __init__(self, channel):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv2d(channel, channel, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(channel, channel, kernel_size=3, padding=1)

    def forward(self, x):
        y = F.relu(self.conv1(x))
        y = self.conv2(y)   # note: add the input first, then activate
        y = F.relu(x + y)
        return y

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5)
        self.rb1 = ResidualBlock(16)
        self.rb2 = ResidualBlock(32)
        self.mp = nn.MaxPool2d(2)
        self.linear = nn.Linear(512, 10)  # 512 = 32 channels x 4 x 4 for 28x28 inputs

    def forward(self, x):
        in_size = x.size(0)
        x = self.mp(F.relu(self.conv1(x)))
        x = self.rb1(x)
        x = self.mp(F.relu(self.conv2(x)))
        x = self.rb2(x)
        x = x.view(in_size, -1)
        x = self.linear(x)
        return x
When using an RNNCell, be clear about the dimension relationships between inputs and outputs.
With batchSize=1, seqLen=3, inputSize=4, hiddenSize=2:
input.shape=(batchSize, inputSize), output.shape=(batchSize, hiddenSize), dataset.shape=(seqLen, batchSize, inputSize)
import torch
batch_size = 1
seq_len = 3
input_size = 4
hidden_size = 2
cell = torch.nn.RNNCell(input_size=input_size, hidden_size=hidden_size)
dataset = torch.randn(seq_len, batch_size, input_size)
hidden = torch.zeros(batch_size, hidden_size)
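A minimal sketch (not in the original notes) of driving the cell defined above over the sequence, one time step at a time, feeding the hidden state back in:

for t, x_t in enumerate(dataset):   # x_t.shape = (batch_size, input_size)
    hidden = cell(x_t, hidden)      # hidden.shape = (batch_size, hidden_size)
    print('step', t, hidden.shape)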
For torch.nn.RNN (which processes the whole sequence at once):
input.shape=(seqLen, batchSize, inputSize)
h0.shape=(numLayers, batchSize, hiddenSize)
output.shape=(seqLen, batchSize, hiddenSize)
hn.shape=(numLayers, batchSize, hiddenSize)
import torch
batch_size = 1
seq_len = 3
input_size = 4
hidden_size = 2
num_layers = 1
cell = torch.nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers)
inputs = torch.randn(seq_len, batch_size, input_size)
hidden = torch.zeros(num_layers, batch_size, hidden_size)
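For reference, a small assumed usage sketch of one forward pass through this RNN and the resulting shapes:

out, hidden_n = cell(inputs, hidden)
print(out.shape)       # (seq_len, batch_size, hidden_size)
print(hidden_n.shape)  # (num_layers, batch_size, hidden_size)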
Converting the string "hello" into "ohlol"
First turn each character of the input into a one-hot vector; these vectors are the RNNCell inputs.
Here inputSize=4 and seqLen=5.
import torch

# prepare the input
input_size = 4
hidden_size = 4
batch_size = 1
idx2char = ['e', 'h', 'l', 'o']
x_data = [1, 0, 2, 2, 3]  # 'hello'
y_data = [3, 1, 2, 3, 2]  # 'ohlol'
one_hot_lookup = [[1, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 1, 0],
                  [0, 0, 0, 1]]
x_one_hot = [one_hot_lookup[x] for x in x_data]  # one-hot vectors, the RNNCell inputs
inputs = torch.Tensor(x_one_hot).view(-1, batch_size, input_size)  # inputs.shape=(seqLen, batchSize, inputSize)
labels = torch.LongTensor(y_data).view(-1, 1)  # labels.shape=(seqLen, 1)

# an RNNCell that processes one character at a time
class Model(torch.nn.Module):
    def __init__(self, input_size, hidden_size, batch_size):
        super(Model, self).__init__()
        self.batch_size = batch_size
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.rnncell = torch.nn.RNNCell(input_size=self.input_size, hidden_size=self.hidden_size)

    def forward(self, input, hidden):
        hidden = self.rnncell(input, hidden)  # input.shape=(batchSize, inputSize); hidden.shape=(batchSize, hiddenSize)
        return hidden

    def init_hidden(self):
        return torch.zeros(self.batch_size, self.hidden_size)  # h0

net = Model(input_size, hidden_size, batch_size)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.1)

for epoch in range(15):
    loss = 0
    optimizer.zero_grad()
    hidden = net.init_hidden()
    # each epoch steps through the whole sequence
    # inputs.shape=(seqLen, batchSize, inputSize); input.shape=(batchSize, inputSize)
    for input, label in zip(inputs, labels):
        hidden = net(input, hidden)
        loss += criterion(hidden, label)  # sum the per-step losses so the whole sequence forms one computation graph
    loss.backward()
    optimizer.step()
import torch

input_size = 4
hidden_size = 4
num_layers = 1
batch_size = 1
seq_len = 5
idx2char = ['e', 'h', 'l', 'o']
x_data = [1, 0, 2, 2, 3]
y_data = [3, 1, 2, 3, 2]
one_hot_lookup = [[1, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 1, 0],
                  [0, 0, 0, 1]]
x_one_hot = [one_hot_lookup[x] for x in x_data]
inputs = torch.Tensor(x_one_hot).view(seq_len, batch_size, input_size)
labels = torch.LongTensor(y_data)  # shape (seqLen * batchSize,)

class Model(torch.nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, batch_size, seq_len):
        super(Model, self).__init__()
        self.input_size = input_size
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.rnn = torch.nn.RNN(self.input_size, self.hidden_size, self.num_layers)

    def forward(self, input):
        hidden = torch.zeros(self.num_layers, self.batch_size, self.hidden_size)  # h0
        out, _ = self.rnn(input, hidden)
        return out.view(-1, self.hidden_size)  # shape (seqLen * batchSize, hiddenSize)

net = Model(input_size, hidden_size, num_layers, batch_size, seq_len)  # arguments in the order __init__ expects
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.05)

for epoch in range(15):
    optimizer.zero_grad()
    outputs = net(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
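To see what the network actually predicts, a small hedged addition (not part of the original snippet): decode the most probable character at each step with idx2char.

_, idx = outputs.max(dim=1)                                     # index of the most probable class per time step
print('Predicted:', ''.join(idx2char[int(i)] for i in idx))     # should converge towards 'ohlol'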