softmax
Distillation uses a temperature-scaled softmax:

$$q_i = \frac{e^{z_i/T}}{\sum_{j} e^{z_j/T}}$$

where T is the distillation temperature; with T = 1 this reduces to the ordinary softmax.
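For intuition, here is a minimal sketch of the temperature-scaled softmax in PyTorch; the logits below are made-up values purely for illustration:

import torch
import torch.nn.functional as F

z = torch.tensor([4.0, 1.0, 0.2])   # example logits (made up)
for T in (1.0, 7.0):
    q = F.softmax(z / T, dim=0)     # temperature-scaled softmax
    print(T, q)
# A larger T flattens the distribution, exposing more of the teacher's
# "dark knowledge" about the non-target classes; T = 1 is the plain softmax.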
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import torchvision
from torch import nn
from torchvision import transforms
from torch.utils.data import DataLoader
# from torchinfo import summary
from tqdm import tqdm
# Set the random seed for reproducibility
torch.manual_seed(0)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use a cloud GPU
# Let cuDNN pick the fastest convolution algorithms
torch.backends.cudnn.benchmark = True
After this runs, the MNIST dataset is downloaded into the "dataset/" folder.
# Load the training set
train_dataset = torchvision.datasets.MNIST(
    root="dataset/",                  # directory where the MNIST data is stored
    train=True,                       # train=True loads the training split
    transform=transforms.ToTensor(),  # convert images to tensors
    download=True
)
# Load the test set
test_dataset = torchvision.datasets.MNIST(
    root="dataset/",
    train=False,                      # train=False loads the test split
    transform=transforms.ToTensor(),  # convert images to tensors
    download=True
)
train_loder = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loder = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)  # each iteration yields batch_size samples
# Teacher model: three fully connected layers fc1, fc2, fc3
class TeacherModel(nn.Module):
    def __init__(self, in_channels=1, num_classes=10):
        super(TeacherModel, self).__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(784, 1200)
        self.fc2 = nn.Linear(1200, 1200)
        self.fc3 = nn.Linear(1200, num_classes)
        self.dropout = nn.Dropout(p=0.5)  # dropout to reduce overfitting

    def forward(self, x):
        x = x.view(-1, 784)
        x = self.fc1(x)
        x = self.relu(x)  # ReLU activation in the forward pass
        x = self.fc2(x)
        x = self.dropout(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x
model = TeacherModel()
criterion = nn.CrossEntropyLoss()  # cross-entropy loss
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # Adam optimizer with learning rate 1e-4
epochs = 6  # train for 6 epochs
for epoch in range(epochs):
    model.train()
    for data, targets in tqdm(train_loder):
        # Forward pass
        preds = model(data)
        loss = criterion(preds, targets)
        # Backward pass and weight update
        optimizer.zero_grad()  # reset gradients to zero
        loss.backward()
        optimizer.step()

    # Evaluate on the test set
    model.eval()
    num_correct = 0
    num_samples = 0
    with torch.no_grad():
        for x, y in test_loder:
            preds = model(x)
            predictions = preds.max(1).indices
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
        acc = (num_correct / num_samples).item()
    model.train()
    print(("Epoch:{}\t Accuracy:{:.4f}").format(epoch + 1, acc))

teacher_model = model
# Student model
class StudentModel(nn.Module):
    def __init__(self, in_channels=1, num_classes=10):
        super(StudentModel, self).__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(784, 20)
        self.fc2 = nn.Linear(20, 20)
        self.fc3 = nn.Linear(20, num_classes)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = x.view(-1, 784)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.dropout(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x
model = StudentModel()  # first train the student model from scratch
# Cross-entropy loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
epochs = 3
# Train on the training set
for epoch in range(epochs):
    model.train()
    for data, targets in tqdm(train_loder):
        # Forward pass
        preds = model(data)
        loss = criterion(preds, targets)
        # Backward pass and weight update
        optimizer.zero_grad()  # reset gradients to zero
        loss.backward()
        optimizer.step()

    # Evaluate on the test set
    model.eval()
    num_correct = 0
    num_samples = 0
    with torch.no_grad():
        for x, y in test_loder:
            preds = model(x)
            predictions = preds.max(1).indices
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
        acc = (num_correct / num_samples).item()
    model.train()
    print(("Epoch:{}\t Accuracy:{:.4f}").format(epoch + 1, acc))

student_model_scratch = model
Training the student model with knowledge distillation
# The pretrained teacher model
teacher_model.eval()
# A fresh student model
model = StudentModel()
model.train()
# Distillation temperature
temp = 7
# hard_loss: ordinary cross entropy against the ground-truth labels
hard_loss = nn.CrossEntropyLoss()
# Weight of the hard loss
alpha = 0.3
# soft_loss: KL divergence between the softened output distributions
soft_loss = nn.KLDivLoss(reduction="batchmean")
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
epochs = 3
for epoch in range(epochs):
    for data, targets in tqdm(train_loder):
        # Teacher predictions (no gradients needed)
        with torch.no_grad():
            teacher_preds = teacher_model(data)
        # Student predictions
        student_preds = model(data)
        student_loss = hard_loss(student_preds, targets)
        # Soft loss on the temperature-softened outputs
        # (KLDivLoss expects log-probabilities as its first argument)
        distillation_loss = soft_loss(
            F.log_softmax(student_preds / temp, dim=1),
            F.softmax(teacher_preds / temp, dim=1)
        )
        # Weighted sum of hard_loss and soft_loss
        loss = alpha * student_loss + (1 - alpha) * distillation_loss
        # Backward pass and weight update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate on the test set
    model.eval()
    num_correct = 0
    num_samples = 0
    with torch.no_grad():
        for x, y in test_loder:
            preds = model(x)
            predictions = preds.max(1).indices
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
        acc = (num_correct / num_samples).item()
    model.train()
    print(("Epoch:{}\t Accuracy:{:.4f}").format(epoch + 1, acc))
import torch.nn as nn

nn.Linear creates a fully connected layer. Note that in 2-D image tasks the input and output of a fully connected layer are usually 2-D tensors of shape [batch_size, size], unlike convolutional layers, which expect 4-D input and output tensors.

in_features is the size of the input 2-D tensor, i.e. the size in [batch_size, size].

out_features is the size of the output 2-D tensor, i.e. the output has shape [batch_size, out_features]; it also equals the number of neurons in this fully connected layer. In terms of tensor shapes, the layer transforms an input of shape [batch_size, in_features] into an output of shape [batch_size, out_features].
import torch as t
from torch import nn

# in_features is determined by the shape of the input tensor; out_features determines the output shape
connected_layer = nn.Linear(in_features=64*64*3, out_features=1)
# Suppose the input image has shape [64, 64, 3]
input = t.randn(1, 64, 64, 3)
# The 4-D tensor must be flattened to 2-D before it can be fed to the fully connected layer
input = input.view(1, 64*64*3)
print(input.shape)
output = connected_layer(input)  # apply the fully connected layer
print(output.shape)
# Output:
# torch.Size([1, 12288])
# torch.Size([1, 1])
nn.functional

torch.nn.functional (commonly imported as F) provides the functional counterparts of the layers in torch.nn, together with many loss and activation functions.

import torch.nn.functional as F

loss_func = F.cross_entropy
loss = loss_func(model(x), y)
loss.backward()

Here loss.backward() computes the model's gradients, i.e. the gradients of the loss with respect to the weights and biases.
from torch.utils.data import DataLoader

DataLoader is a data loader: it combines a dataset with a sampler and can use multiple worker processes to load the data. During training it splits the training data into mini-batches and yields one batch at a time until all the data has been consumed; in effect it prepares the data for iteration.

torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, sampler=None, batch_sampler=None, num_workers=0, collate_fn=<function default_collate>,
pin_memory=False, drop_last=False, timeout=0, worker_init_fn=None)
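As a rough usage sketch (the toy TensorDataset and batch size here are illustrative, not part of the tutorial code above):

import torch
from torch.utils.data import DataLoader, TensorDataset

# Toy dataset: 100 random samples with integer labels (illustrative only)
xs = torch.randn(100, 784)
ys = torch.randint(0, 10, (100,))
loader = DataLoader(TensorDataset(xs, ys), batch_size=32, shuffle=True, num_workers=0)

for batch_x, batch_y in loader:
    # each iteration yields one mini-batch
    print(batch_x.shape, batch_y.shape)   # torch.Size([32, 784]) torch.Size([32])
    break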
model.train()

model.train() enables Batch Normalization and Dropout. If the model contains BN (Batch Normalization) layers or Dropout, call model.train() during training: for BN it ensures each batch's own mean and variance are used, and for Dropout it randomly selects a subset of connections to train and update the parameters.
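A small sketch of the effect on a Dropout layer (the sizes here are arbitrary, chosen only for illustration):

import torch
from torch import nn

torch.manual_seed(0)
drop = nn.Dropout(p=0.5)
x = torch.ones(1, 6)

drop.train()     # training mode: dropout is active, surviving entries are scaled by 1/(1-p)
print(drop(x))   # roughly half the entries are zeroed, the rest become 2.0

drop.eval()      # evaluation mode: dropout is a no-op
print(drop(x))   # all ones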
optimizer.zero_grad()

optimizer.zero_grad() sets the gradients to zero, i.e. the derivatives of the loss with respect to the weights become 0.

Why does PyTorch need optimizer.zero_grad() for every batch? When backward() propagates through the network parameters, gradients are accumulated rather than replaced; since there is normally no reason to mix the gradients of two different batches, zero_grad() is called once per batch.

In PyTorch code, each batch therefore usually runs something like:

# zero the parameter gradients
optimizer.zero_grad()              # reset gradients to zero
# forward + backward + optimize
outputs = net(inputs)              # forward pass to get predictions
loss = criterion(outputs, labels)  # compute the loss
loss.backward()                    # backward pass to compute gradients
optimizer.step()                   # update all parameters
tqdm is a library for showing a progress bar for loops. The name comes from taqadum (تقدّم), Arabic for "progress". tqdm adds a progress indicator to long loops: you only need to wrap any iterator as tqdm(iterator). It is a fast, highly extensible progress-bar toolkit.

import time
from tqdm import *

for i in tqdm(range(1000)):
    time.sleep(.01)  # the bar advances every 0.01 s, so the total run takes about 1000 * 0.01 = 10 s
numpy.indices() returns an array representing the indices of a grid: each sub-array holds the index values 0, 1, ... varying only along the corresponding axis.

# Python program explaining
# the numpy.indices() function
# importing numpy as geek
import numpy as geek

gfg = geek.indices((2, 3))
print(gfg)
# Output:
# [[[0 0 0]
#   [1 1 1]]
#  [[0 1 2]
#   [0 1 2]]]
loss.backward() itself is simple: it computes the gradients of the current tensor with respect to the leaf nodes of the computation graph.

The usual pattern is:

optimizer.zero_grad()   # clear the old gradients
loss.backward()         # backpropagate to compute the current gradients
optimizer.step()        # update the network parameters from the gradients

or, when the loss is accumulated over several iterations:

for i in range(num):
    loss += Loss(input, target)
optimizer.zero_grad()   # clear the old gradients
loss.backward()         # backpropagate to compute the current gradients
optimizer.step()        # update the network parameters from the gradients
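For completeness, here is a runnable sketch of a gradient-accumulation variant of this pattern, stepping once every few mini-batches; the tiny model, random data, and accum_steps value are made up purely for illustration:

import torch
from torch import nn

model = nn.Linear(4, 2)                                   # toy model (illustrative)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
accum_steps = 4                                           # accumulate gradients over 4 mini-batches

optimizer.zero_grad()
for step in range(8):
    x = torch.randn(8, 4)                                 # fake mini-batch
    y = torch.randint(0, 2, (8,))
    loss = criterion(model(x), y) / accum_steps           # scale so the accumulated gradient averages the batches
    loss.backward()                                       # gradients accumulate across backward() calls
    if (step + 1) % accum_steps == 0:
        optimizer.step()                                  # update with the accumulated gradient
        optimizer.zero_grad()                             # then clear it for the next group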
import torch
import numpy as np
import pandas as pd
from torch import nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader
# from torchinfo import summary
from tqdm import tqdm
import matplotlib.pyplot as plt
# Set the random seed for reproducibility
torch.manual_seed(0)
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Let cuDNN pick the fastest convolution algorithms
torch.backends.cudnn.benchmark = True
# Load the MNIST dataset
# Training set
train_dataset = torchvision.datasets.MNIST(
    root="dataset/",
    train=True,
    transform=transforms.ToTensor(),
    download=True
)
# Test set
test_dataset = torchvision.datasets.MNIST(
    root="dataset/",
    train=False,
    transform=transforms.ToTensor(),
    download=True
)
train_loder = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loder = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)
# Teacher model
class TeacherModel(nn.Module):
    def __init__(self, in_channels=1, num_classes=10):
        super(TeacherModel, self).__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(784, 1200)
        self.fc2 = nn.Linear(1200, 1200)
        self.fc3 = nn.Linear(1200, num_classes)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = x.view(-1, 784)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.dropout(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x
model = TeacherModel()
criterion = nn.CrossEntropyLoss()  # cross-entropy loss
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)  # Adam optimizer with learning rate 1e-4
epochs = 1  # number of training epochs
for epoch in range(epochs):
    model.train()
    for data, targets in tqdm(train_loder):
        # Forward pass
        preds = model(data)
        loss = criterion(preds, targets)
        # Backward pass and weight update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate on the test set
    model.eval()
    num_correct = 0
    num_samples = 0
    with torch.no_grad():
        for x, y in test_loder:
            preds = model(x)
            predictions = preds.max(1).indices
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
        acc = (num_correct / num_samples).item()
    model.train()
    print(("Epoch:{}\t Accuracy:{:.4f}").format(epoch + 1, acc))

teacher_model = model
# Student model
class StudentModel(nn.Module):
    def __init__(self, in_channels=1, num_classes=10):
        super(StudentModel, self).__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(784, 20)
        self.fc2 = nn.Linear(20, 20)
        self.fc3 = nn.Linear(20, num_classes)
        # self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        x = x.view(-1, 784)
        x = self.fc1(x)
        # x = self.dropout(x)
        x = self.relu(x)
        x = self.fc2(x)
        # x = self.dropout(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x
model = StudentModel()  # first train the student model from scratch
# Cross-entropy loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
epochs = 3
# Train on the training set
for epoch in range(epochs):
    model.train()
    for data, targets in tqdm(train_loder):
        # Forward pass
        preds = model(data)
        loss = criterion(preds, targets)
        # Backward pass and weight update
        optimizer.zero_grad()  # reset gradients to zero
        loss.backward()
        optimizer.step()

    # Evaluate on the test set
    model.eval()
    num_correct = 0
    num_samples = 0
    with torch.no_grad():
        for x, y in test_loder:
            preds = model(x)
            predictions = preds.max(1).indices
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
        acc = (num_correct / num_samples).item()
    model.train()
    print(("Epoch:{}\t Accuracy:{:.4f}").format(epoch + 1, acc))

student_model_scratch = model
# The pretrained teacher model
teacher_model.eval()
# A fresh student model
model = StudentModel()
model.train()
# Distillation temperature
temp = 7
# hard_loss: ordinary cross entropy against the ground-truth labels
hard_loss = nn.CrossEntropyLoss()
# Weight of the hard loss
alpha = 0.3
# soft_loss: KL divergence between the softened output distributions
soft_loss = nn.KLDivLoss(reduction="batchmean")
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
epochs = 3
for epoch in range(epochs):
    for data, targets in tqdm(train_loder):
        # Teacher predictions (no gradients needed)
        with torch.no_grad():
            teacher_preds = teacher_model(data)
        # Student predictions
        student_preds = model(data)
        student_loss = hard_loss(student_preds, targets)
        # Soft loss on the temperature-softened outputs
        # (KLDivLoss expects log-probabilities as its first argument)
        distillation_loss = soft_loss(
            F.log_softmax(student_preds / temp, dim=1),
            F.softmax(teacher_preds / temp, dim=1)
        )
        # Weighted sum of hard_loss and soft_loss
        loss = alpha * student_loss + (1 - alpha) * distillation_loss
        # Backward pass and weight update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate on the test set
    model.eval()
    num_correct = 0
    num_samples = 0
    with torch.no_grad():
        for x, y in test_loder:
            preds = model(x)
            predictions = preds.max(1).indices
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
        acc = (num_correct / num_samples).item()
    model.train()
    print(("Epoch:{}\t Accuracy:{:.4f}").format(epoch + 1, acc))