PyTorch Learning Notes (14): Knowledge Distillation

Knowledge distillation, in which a large teacher network assists the training of a small student network, has become a common technique for model compression. The compact network we designed earlier has limited capacity and therefore a limited ability to capture the features of the dataset, whereas the outputs of a large teacher network carry richer semantic information. We therefore use the more complex network from the previous posts as the teacher and the compact network as the student. We freeze the teacher's weights, apply min-max scaling to the outputs of both the student and the teacher, and take the mean squared error between the two scaled outputs as the loss that supervises the student's training. (Hinton's paper instead uses the teacher's temperature-softened softmax output as the supervision target for the student's softmax layer, which differs slightly from our approach.) After training, the student network reaches 98.9% accuracy, a modest improvement.
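For reference, Hinton's formulation is usually implemented along the lines of the minimal sketch below: a KL-divergence term between the temperature-softened student and teacher distributions (scaled by T*T), plus an alpha-weighted cross-entropy term against the hard labels. The function name hinton_kd_loss and the values of T and alpha are illustrative assumptions; the code later in this post uses simpler losses and does not include the hard-label term or the T*T factor.

import torch.nn.functional as F

def hinton_kd_loss(student_logits, teacher_logits, labels, T=5.0, alpha=0.7):
    # Soft-target term: KL divergence between the temperature-softened
    # student and teacher distributions; the T*T factor keeps gradient
    # magnitudes comparable across different temperatures.
    soft = F.kl_div(
        F.log_softmax(student_logits / T, dim=1),
        F.softmax(teacher_logits / T, dim=1),
        reduction='batchmean'
    ) * (T * T)
    # Hard-target term: ordinary cross entropy against the true labels.
    hard = F.cross_entropy(student_logits, labels)
    return alpha * soft + (1.0 - alpha) * hard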
The full knowledge distillation code used in this post (min-max scaling + MSE loss) follows:

import random

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch as t
import torch.nn as nn
import torch.utils.data as Data
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from torchsummary import summary
from torchvision.transforms import transforms

class train_mini_mnist(t.utils.data.Dataset):
    def __init__(self):
        self.X,self.y=load_digits(return_X_y=True)
        self.X_train,self.X_test,self.y_train,self.y_test=train_test_split(self.X,self.y,random_state=0)

    def __getitem__(self, index):
        img, target = np.array(self.X_train[index].reshape(8,8),dtype=int), int(self.y_train[index])
        img=transforms.ToPILImage()(img)
        img=img.rotate(random.randint(-20,20))  # random rotation for data augmentation; newly exposed corners are filled with 0
        img=transforms.ToTensor()(img)
        return img/15.,target  # scale the 0-16 pixel values to roughly [0, 1]

    def __len__(self):
        return len(self.y_train)

class test_mini_mnist(t.utils.data.Dataset):
    def __init__(self):
        self.X,self.y=load_digits(return_X_y=True)
        self.X=self.X/15.
        self.X_train,self.X_test,self.y_train,self.y_test=train_test_split(self.X,self.y,random_state=0)
    def __getitem__(self, index):
        return t.tensor(self.X_test[index].reshape(1,8,8),dtype=torch.float32),self.y_test[index]
    def __len__(self):
        return len(self.y_test)

BATCH_SIZE=8
LEARNING_RATE=4e-3
EPOCHES=100

train_data=train_mini_mnist()
test_data=test_mini_mnist()

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = Data.DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=True)

class CompactNet(nn.Module):
    def __init__(self):
        super(CompactNet, self).__init__()
        self.conv1 = nn.Sequential(#(1, 8, 8)
            nn.Conv2d(in_channels=1, out_channels=4, kernel_size=(3,1), stride=1, padding=(1,0)),
            nn.Conv2d(in_channels=4, out_channels=6, kernel_size=(1,3), stride=1, padding=(0,1)),
            nn.BatchNorm2d(6),
            nn.ReLU(),
            nn.Conv2d(in_channels=6, out_channels=8, kernel_size=(3,1), stride=1, padding=(1,0)),
            nn.Conv2d(in_channels=8, out_channels=12, kernel_size=(1,3), stride=1, padding=(0,1)),
            nn.BatchNorm2d(12),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=12, out_channels=16, kernel_size=(3,1), stride=1, padding=(1,0)),
            nn.Conv2d(in_channels=16, out_channels=24, kernel_size=(1,3), stride=1, padding=(0,1)),
            nn.BatchNorm2d(24),
            nn.ReLU(),
            nn.Conv2d(in_channels=24, out_channels=16, kernel_size=(1,1), stride=1, padding=0),
            nn.Conv2d(in_channels=16, out_channels=8, kernel_size=(1,1), stride=1, padding=0),
            nn.BatchNorm2d(8),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)#(8,2,2)
        )
        self.fc = nn.Sequential(
            nn.Linear(8*2*2, 32),
            nn.Dropout(0.5),
            nn.Linear(32, 10)
            )
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0), -1)  # flatten
        x = self.fc(x)
        return x


class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.conv1 = nn.Sequential(#(1, 8, 8)
            nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3, stride=1, padding=1),#(8, 8, 8) 
            nn.BatchNorm2d(8),
            nn.ReLU(),
            nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3, stride=1, padding=1),#(16, 8, 8) 
            nn.BatchNorm2d(16),
            nn.ReLU(),#(16,8,8)
            nn.MaxPool2d(kernel_size=2)#(16,4,4)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),#(32, 4, 4) 
            nn.BatchNorm2d(32),
            nn.ReLU(),#(32,4,4)
            nn.MaxPool2d(kernel_size=2)#(32,2,2)
        )
        self.fc = nn.Sequential(
            nn.Linear(32*2*2, 64),
            nn.Dropout(0.5),
            nn.Linear(64, 10)
            )
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0), -1)  # flatten
        x = self.fc(x)
        return x

def eval_on_dataloader(my_net,name,loader,length):
    acc = 0.0
    with torch.no_grad():
        for data in loader:
            images, labels = data
            outputs = my_net(images)
            predict_y = torch.max(outputs, dim=1)[1]  # torch.max returns (values, indices); [1] selects the predicted class indices
            acc += (predict_y == labels).sum().item()
        accurate = acc / length
        return accurate

def plot_train_and_test_result(train_accs,test_accs):
    epochs=np.arange(1,len(train_accs)+1,dtype=np.int32)
    plt.plot(epochs,train_accs,label="train_accuracy")
    plt.plot(epochs,test_accs,label="test_accuracy")
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend()

def min_max_scale(array):
    up=torch.max(array)
    down=torch.min(array)
    return (array-down)/(up-down)

def loss_func(outputs,soft_labels):  # sum of squared errors between the min-max-scaled outputs
    scaled_outputs=min_max_scale(outputs)
    scaled_soft_labels=min_max_scale(soft_labels)
    # Alternatively, one could use the cross entropy against the teacher's temperature-softened softmax output (Hinton's method).
    diff=scaled_outputs-scaled_soft_labels
    return (diff**2).sum()
    
compact_net=torch.load("compact_net.pkl")  # pre-trained compact student network
print(compact_net.parameters)
summary(compact_net, (1, 8, 8))

net=torch.load("net0.9955555555555555.pkl")  # teacher network
# Freeze the teacher network's weights.
for param in net.parameters():
    param.requires_grad = False
print(net.parameters)
summary(net, (1, 8, 8))

# The optimizer must update the student's (compact_net) parameters; the teacher's weights stay frozen.
optim = torch.optim.Adam(compact_net.parameters(), lr=LEARNING_RATE, weight_decay=3e-4)

for name,parameters in net.named_parameters():
    print(name,":",parameters.size())
best_acc = 0.0

train_accs,test_accs=[],[]
tmplist=[]
for epoch in range(EPOCHES):

    compact_net.train()  # switch to training mode

    
    for step, data in enumerate(train_loader, start=0):
        images, labels = data
        optim.zero_grad()  # clear the optimizer's gradients
        logits = compact_net(images)  # student network's output logits
        soft_labels = net(images)  # teacher's outputs serve as soft labels
        loss = loss_func(logits, soft_labels)  # compute the distillation loss
        tmplist.append(float(loss))
        if len(tmplist) == 1000:
            # Debug: plot the distribution of the last 1000 loss values (plt.show() pauses training until the window is closed).
            plt.hist(tmplist, 100, histtype="stepfilled", facecolor="r", alpha=1)
            plt.show()
            tmplist = []
        loss.backward()  # backpropagate to compute gradients
        optim.step()  # apply the optimizer update

        rate = (step+1)/len(train_loader)
        a = "*" * int(rate * 50)
        b = "." * int((1 - rate) * 50)
        print("\rtrain loss: {:^3.0f}%[{}->{}]{:.4f}".format(int(rate*100), a, b, loss), end="")
    print()

    compact_net.eval()  # switch to evaluation mode

    train_acc=eval_on_dataloader(compact_net,"train",train_loader,train_data.__len__())
    test_acc=eval_on_dataloader(compact_net,"test",test_loader,test_data.__len__())
    train_accs.append(train_acc)
    test_accs.append(test_acc)
    print("epoch:",epoch,"train_acc:",train_acc," test_acc:",test_acc)
    if test_acc>=best_acc:
        best_acc=test_acc
        torch.save(compact_net, 'compact_net'+str(best_acc)+'.pkl')

print('Finished Training')
plot_train_and_test_result(train_accs,test_accs)
torch.save(compact_net, 'compact_net_final.pkl')  # save the final student network
plt.show()



The code for distillation with Hinton's method (supervising the student with the teacher's temperature-softened softmax output) is as follows:

import random

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch as t
import torch.nn as nn
import torch.utils.data as Data
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from torchsummary import summary
from torchvision.transforms import transforms

class train_mini_mnist(t.utils.data.Dataset):
    def __init__(self):
        self.X,self.y=load_digits(return_X_y=True)
        self.X_train,self.X_test,self.y_train,self.y_test=train_test_split(self.X,self.y,random_state=0)

    def __getitem__(self, index):
        img, target = np.array(self.X_train[index].reshape(8,8),dtype=int), int(self.y_train[index])
        img=transforms.ToPILImage()(img)
        img=img.rotate(random.randint(-20,20))  # random rotation for data augmentation; newly exposed corners are filled with 0
        img=transforms.ToTensor()(img)
        return img/15.,target  # scale the 0-16 pixel values to roughly [0, 1]

    def __len__(self):
        return len(self.y_train)

class test_mini_mnist(t.utils.data.Dataset):
    def __init__(self):
        self.X,self.y=load_digits(return_X_y=True)
        self.X=self.X/15.
        self.X_train,self.X_test,self.y_train,self.y_test=train_test_split(self.X,self.y,random_state=0)
    def __getitem__(self, index):
        return t.tensor(self.X_test[index].reshape(1,8,8),dtype=torch.float32),self.y_test[index]
    def __len__(self):
        return len(self.y_test)

BATCH_SIZE=8
LEARNING_RATE=4e-3
EPOCHES=100
T=5  # distillation temperature: the larger T is, the softer (flatter) the teacher's distribution becomes

train_data=train_mini_mnist()
test_data=test_mini_mnist()

train_loader = Data.DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = Data.DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=True)

class CompactNet(nn.Module):
    def __init__(self):
        super(CompactNet, self).__init__()
        self.conv1 = nn.Sequential(#(1, 8, 8)
            nn.Conv2d(in_channels=1, out_channels=4, kernel_size=(3,1), stride=1, padding=(1,0)),
            nn.Conv2d(in_channels=4, out_channels=6, kernel_size=(1,3), stride=1, padding=(0,1)),
            nn.BatchNorm2d(6),
            nn.ReLU(),
            nn.Conv2d(in_channels=6, out_channels=8, kernel_size=(3,1), stride=1, padding=(1,0)),
            nn.Conv2d(in_channels=8, out_channels=12, kernel_size=(1,3), stride=1, padding=(0,1)),
            nn.BatchNorm2d(12),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=12, out_channels=16, kernel_size=(3,1), stride=1, padding=(1,0)),
            nn.Conv2d(in_channels=16, out_channels=24, kernel_size=(1,3), stride=1, padding=(0,1)),
            nn.BatchNorm2d(24),
            nn.ReLU(),
            nn.Conv2d(in_channels=24, out_channels=16, kernel_size=(1,1), stride=1, padding=0),
            nn.Conv2d(in_channels=16, out_channels=8, kernel_size=(1,1), stride=1, padding=0),
            nn.BatchNorm2d(8),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)#(8,2,2)
        )
        self.fc = nn.Sequential(
            nn.Linear(8*2*2, 32),
            nn.Dropout(0.5),
            nn.Linear(32, 10)
            )
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0), -1)  # flatten
        x = self.fc(x)
        return x


class SimpleNet(nn.Module):
    def __init__(self):
        super(SimpleNet, self).__init__()
        self.conv1 = nn.Sequential(#(1, 8, 8)
            nn.Conv2d(in_channels=1, out_channels=8, kernel_size=3, stride=1, padding=1),#(8, 8, 8) 
            nn.BatchNorm2d(8),
            nn.ReLU(),
            nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3, stride=1, padding=1),#(16, 8, 8) 
            nn.BatchNorm2d(16),
            nn.ReLU(),#(16,8,8)
            nn.MaxPool2d(kernel_size=2)#(16,4,4)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),#(32, 4, 4) 
            nn.BatchNorm2d(32),
            nn.ReLU(),#(32,4,4)
            nn.MaxPool2d(kernel_size=2)#(32,2,2)
        )
        self.fc = nn.Sequential(
            nn.Linear(32*2*2, 64),
            nn.Dropout(0.5),
            nn.Linear(64, 10)
            )
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0), -1)  # flatten
        x = self.fc(x)
        return x

def eval_on_dataloader(my_net,name,loader,length):
    acc = 0.0
    with torch.no_grad():
        for data in loader:
            images, labels = data
            outputs = my_net(images)
            predict_y = torch.max(outputs, dim=1)[1]  # torch.max returns (values, indices); [1] selects the predicted class indices
            acc += (predict_y == labels).sum().item()
        accurate = acc / length
        return accurate

def plot_train_and_test_result(train_accs,test_accs):
    epochs=np.arange(1,len(train_accs)+1,dtype=np.int32)
    plt.plot(epochs,train_accs,label="train_accuracy")
    plt.plot(epochs,test_accs,label="test_accuracy")
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.legend()

def soften(array):
    # Temperature-softened softmax: dividing the logits by T flattens the distribution.
    return nn.Softmax(dim=1)(array/T)

def loss_func(outputs,soft_labels):  # cross entropy against the teacher's softened outputs
    outputs=nn.Softmax(dim=1)(outputs)
    soft_labels=soften(soft_labels)
    # Note: only the teacher's logits are divided by T here; the student's softmax is left unsoftened.
    loss=-(soft_labels * torch.log(outputs)).sum()
    return loss/15.  # scale down the loss magnitude
    
compact_net=torch.load("compact_net.pkl")  # pre-trained compact student network
print(compact_net.parameters)
summary(compact_net, (1, 8, 8))

net=torch.load("net0.9955555555555555.pkl")  # teacher network
# Freeze the teacher network's weights.
for param in net.parameters():
    param.requires_grad = False
print(net.parameters)
summary(net, (1, 8, 8))

# The optimizer must update the student's (compact_net) parameters; the teacher's weights stay frozen.
optim = torch.optim.Adam(compact_net.parameters(), lr=LEARNING_RATE, weight_decay=3e-4)

for name,parameters in net.named_parameters():
    print(name,":",parameters.size())
best_acc = 0.0

train_accs,test_accs=[],[]
tmplist=[]
for epoch in range(EPOCHES):

    compact_net.train()  # switch to training mode

    
    for step, data in enumerate(train_loader, start=0):
        images, labels = data
        optim.zero_grad()  # clear the optimizer's gradients
        logits = compact_net(images)  # student network's output logits
        soft_labels = net(images)  # teacher's outputs serve as soft targets
        loss = loss_func(logits, soft_labels)  # compute the distillation loss
        tmplist.append(float(loss))
        loss.backward()  # backpropagate to compute gradients
        optim.step()  # apply the optimizer update

        rate = (step+1)/len(train_loader)
        a = "*" * int(rate * 50)
        b = "." * int((1 - rate) * 50)
        print("\rtrain loss: {:^3.0f}%[{}->{}]{:.4f}".format(int(rate*100), a, b, loss), end="")
    print()

    compact_net.eval()  # switch to evaluation mode

    train_acc=eval_on_dataloader(compact_net,"train",train_loader,train_data.__len__())
    test_acc=eval_on_dataloader(compact_net,"test",test_loader,test_data.__len__())
    train_accs.append(train_acc)
    test_accs.append(test_acc)
    print("epoch:",epoch,"train_acc:",train_acc," test_acc:",test_acc)
    if test_acc>=best_acc:
        best_acc=test_acc
        torch.save(compact_net, 'compact_net'+str(best_acc)+'.pkl')

print('Finished Training')
plot_train_and_test_result(train_accs,test_accs)
torch.save(compact_net, 'compact_net_final.pkl')  # save the final student network
plt.show()



The two approaches give similar results; perhaps this is close to the fitting limit of a network this small. Next we will try increasing the network's capacity, aiming to push the accuracy above 99%; one possible way to widen the network is sketched below.
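As a rough illustration of what adding capacity could look like, the hypothetical WiderCompactNet below roughly doubles the channel counts of CompactNet while keeping the factorized 3x1/1x3 convolutions, the pooling layout, and the 8x8 input. The class name and the specific channel numbers are illustrative assumptions, not a tested architecture.

import torch.nn as nn

class WiderCompactNet(nn.Module):
    def __init__(self):
        super(WiderCompactNet, self).__init__()
        self.conv1 = nn.Sequential(  # input: (1, 8, 8)
            nn.Conv2d(1, 8, kernel_size=(3, 1), padding=(1, 0)),
            nn.Conv2d(8, 12, kernel_size=(1, 3), padding=(0, 1)),
            nn.BatchNorm2d(12),
            nn.ReLU(),
            nn.Conv2d(12, 16, kernel_size=(3, 1), padding=(1, 0)),
            nn.Conv2d(16, 24, kernel_size=(1, 3), padding=(0, 1)),
            nn.BatchNorm2d(24),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)  # (24, 4, 4)
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(24, 32, kernel_size=(3, 1), padding=(1, 0)),
            nn.Conv2d(32, 48, kernel_size=(1, 3), padding=(0, 1)),
            nn.BatchNorm2d(48),
            nn.ReLU(),
            nn.Conv2d(48, 16, kernel_size=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)  # (16, 2, 2)
        )
        self.fc = nn.Sequential(
            nn.Linear(16 * 2 * 2, 48),
            nn.Dropout(0.5),
            nn.Linear(48, 10)
        )

    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(x.size(0), -1)  # flatten
        return self.fc(x)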
