PyTorch now supports model quantization and ships two quantization backends, qnnpack and fbgemm. qnnpack quantizes weights per tensor, i.e. a single scale and zero-point for the whole tensor, which amounts to a global scheme; fbgemm quantizes weights per channel, a finer-grained local scheme with one scale and zero-point per output channel. Note that this distinction applies only to the weights: activations are quantized per tensor under both backends. A quantization example is given below. At present PyTorch only supports 8-bit quantization. In my own experiments, shallow networks lose little classification accuracy after quantization, but deeper networks need targeted optimization: for example, increase the bit width for layers whose weight or activation distributions are poorly suited to 8 bits, or constrain the magnitude of weights and activations during training, so that a low bit width can still preserve high accuracy.
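To make the per-tensor vs. per-channel distinction concrete, here is a minimal standalone sketch (not part of the script below; the matrix shape and scale choice are just illustrative) that quantizes the same weight matrix both ways using PyTorch's public torch.quantize_per_tensor and torch.quantize_per_channel APIs:

import torch

w = torch.randn(4, 3)  # a toy weight matrix with 4 output channels
# per-tensor (qnnpack-style weights): one scale and zero-point for the whole tensor
scale = w.abs().max().item() / 127
q_t = torch.quantize_per_tensor(w, scale=scale, zero_point=0, dtype=torch.qint8)
# per-channel (fbgemm-style weights): one scale and zero-point per output channel (axis 0)
scales = w.abs().max(dim=1).values / 127
zero_points = torch.zeros(4, dtype=torch.int64)
q_c = torch.quantize_per_channel(w, scales, zero_points, axis=0, dtype=torch.qint8)
print(q_t.q_scale())                 # a single scale for the whole tensor
print(q_c.q_per_channel_scales())    # one scale per output channel

Because the per-channel version keeps a separate scale for each output channel, it tracks channels with very different weight ranges more accurately, which is why fbgemm typically loses less accuracy.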
The implementation carries on from the previous post, with only the quantization steps added. The code is as follows:
import os
import torch
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
from valid_simple_cnn5 import Net, QuantNet, get_dataloader, train, test, calibrate, print_size_of_model, time_model_evaluation
#---------------------------------------------------
#setting
input_shape = [1, 1, 28, 28]
BATCH_SIZE = 1
EPOCH = 10
GAMMA = 0.7
LR = 1.0
SAVE_MODEL = True
LOG_INTERVAL = 100
N_TEST_ITER = 3
QUANT_BACKEND = "fbgemm" # fbgemm for x86 / qnnpack for arm
N_CALIBRATION = 10
SEED = 777
N_CPU = os.cpu_count()
DATA_PATH = "./"
PARAM_PATH = "mnist_qcnn05.pt"
SCRIPTED_PARAM_PATH = "mnist_jit_qcnn.pt"
'''USE_CUDA = True
GPU_ID = [3]
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(i) for i in GPU_ID])
device = torch.device("cuda" if torch.cuda.is_available() and USE_CUDA else "cpu")
'''
#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
torch.manual_seed(SEED)
TRAIN = False
#---------------------------------------------------
#training
model = Net().to(device)
print('model = ',model)
train_loader, test_loader = get_dataloader(DATA_PATH, BATCH_SIZE, N_CPU)
if os.path.exists(PARAM_PATH) and TRAIN == False:
    print('start load model...')
    model.load_state_dict(torch.load(PARAM_PATH, map_location=torch.device('cpu')))
    print('Loaded parameters from', PARAM_PATH)
else:
    print('start training...')
    optimizer = optim.Adadelta(model.parameters(), lr=LR)
    scheduler = StepLR(optimizer, step_size=1, gamma=GAMMA)
    for epoch in range(1, EPOCH + 1):
        train(model, device, train_loader, optimizer, epoch, LOG_INTERVAL)
        test(model, device, test_loader)
        scheduler.step()
    if SAVE_MODEL:
        torch.save(model.state_dict(), PARAM_PATH)
#---------------------------------------------------
#testing
print('testing acc...')
print('model = ',model)
test(model, "cpu", test_loader)
# print_size_of_model(model)
# time_model_evaluation(model, test_loader, N_TEST_ITER)
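# print_size_of_model and time_model_evaluation (imported above) presumably
# report the checkpoint size on disk and the CPU inference latency; uncommenting
# them here gives a float32 baseline to compare against after quantization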
#--------------------------------------------------------------
#PyTorch post-training static quantization: fuse -> set qconfig -> prepare -> calibrate -> convert
#--------------------------------------------------------------
#layer fusion: fold each ReLU into the preceding Conv/Linear so each pair is
#quantized as a single unit (the module names must match the attributes of Net)
model.eval() # fusion and post-training quantization expect eval mode
model = torch.quantization.fuse_modules(model,[["conv1","relu1"],
["conv2","relu2"],
["conv3","relu3"],
["conv4","relu4"],
["conv5","relu5"],
["conv6","relu6"],
["fc1","relu7"]])
#quantization backend (qnnpack quantizes weights per tensor, fbgemm per channel)
QUANT_BACKEND = "qnnpack" # note: this overrides the fbgemm choice in the settings above
torch.backends.quantized.engine = QUANT_BACKEND # the runtime engine must match the qconfig backend
model.qconfig = torch.quantization.get_default_qconfig(QUANT_BACKEND)
#prepare(), not prepare_qat (that one is for quantization-aware training),
#inserts observers for post-training static quantization
torch.quantization.prepare(model,inplace=True)
#run calibration batches through the model so the observers record the
#min/max of each activation distribution
calibrate(model,train_loader,N_CALIBRATION)
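# calibrate() is imported from valid_simple_cnn5; presumably it just feeds
# N_CALIBRATION batches from train_loader through the prepared model under
# torch.no_grad(), so the observers can record activation ranges - labels and
# gradients are not needed at this stage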
print('calibration done')
#test(model, "cpu", test_loader)
print('model = ',model)
torch.save(model.state_dict(),'mnist_qcnn_fuse03.pt')
#convert the calibrated model into an actual quantized model: weights become
#int8 and float modules are swapped for their quantized counterparts
torch.quantization.convert(model,inplace=True)
print('model = ',model)
torch.save(model.state_dict(), 'mnist_qcnn_quant0.pt')
print('quant testing...')
test(model,'cpu',test_loader)
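#--------------------------------------------------------------
#optional sanity check (my addition, not in the original script): compare the
#on-disk size of the float and quantized checkpoints saved above; the int8
#state_dict should be roughly 4x smaller than the float32 one
print('float checkpoint: %.3f MB' % (os.path.getsize(PARAM_PATH) / 1e6))
print('quantized checkpoint: %.3f MB' % (os.path.getsize('mnist_qcnn_quant0.pt') / 1e6))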
The remaining code can be downloaded from this link: (click here)
I will keep updating and explaining quantization-related topics in future posts. My understanding is limited, so please point out anything inaccurate; I hope this helps.