Install onnxruntime:
pip install -i https://mirror.baidu.com/pypi/simple onnxruntime==1.10.0
As before, the MobileNetV3 classification network is used as the example; the first article covered converting the PyTorch model to ONNX.
float32 ------> int8 ONNX
Reference: the official ONNX Runtime quantization documentation (onnx quantization)
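The export step from the first article is not repeated here, but as a minimal sketch of how such a float32 ONNX model can be produced (the torchvision stand-in model, the 1-channel first conv, the 1x1x128x128 dummy input, and the file/tensor names below are assumptions, not the original code):

import torch
import torchvision

# Stand-in for the trained grayscale classifier from the first article (assumption):
# take torchvision's MobileNetV3-Small and swap the first conv to accept 1 channel.
model = torchvision.models.mobilenet_v3_small()
model.features[0][0] = torch.nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1, bias=False)
model.eval()

dummy_input = torch.randn(1, 1, 128, 128)  # matches the (1, 1, 128, 128) input used for benchmarking below
torch.onnx.export(model,
                  dummy_input,
                  "mobilenetv3.onnx",
                  input_names=["input"],
                  output_names=["output"],
                  opset_version=13)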
import os
import numpy as np
import time
from PIL import Image
import onnxruntime
from onnxruntime.quantization import quantize_static, CalibrationDataReader, QuantFormat, QuantType
class DataReader(CalibrationDataReader):
    def __init__(self, calibration_image_folder, augmented_model_path=None):
        self.image_folder = calibration_image_folder
        self.augmented_model_path = augmented_model_path
        self.preprocess_flag = True
        self.enum_data_dicts = []
        self.datasize = 0

    def get_next(self):
        # On the first call, load and preprocess all calibration images,
        # then hand them out one by one as feed dicts.
        if self.preprocess_flag:
            self.preprocess_flag = False
            session = onnxruntime.InferenceSession(self.augmented_model_path, None)
            (_, _, height, width) = session.get_inputs()[0].shape
            nchw_data_list = preprocess_func(self.image_folder, height, width, size_limit=0)
            input_name = session.get_inputs()[0].name
            self.datasize = len(nchw_data_list)
            self.enum_data_dicts = iter([{input_name: nchw_data} for nchw_data in nchw_data_list])
        return next(self.enum_data_dicts, None)

def preprocess_func(images_folder, height, width, size_limit=0):
    """Load calibration images, preprocessed the same way as during PyTorch training."""
    image_names = os.listdir(images_folder)
    if size_limit > 0 and len(image_names) >= size_limit:
        batch_filenames = [image_names[i] for i in range(size_limit)]
    else:
        batch_filenames = image_names
    unconcatenated_batch_data = []
    for image_name in batch_filenames:
        image_filepath = images_folder + '/' + image_name
        # Resize to the model's input size and convert to single-channel grayscale
        pillow_img = Image.open(image_filepath).resize((width, height)).convert("L")
        input_data = np.float32(pillow_img) / 255.            # same normalization as in training
        input_data = input_data[np.newaxis, np.newaxis, :, :]  # HW -> NCHW, i.e. (1, 1, H, W)
        unconcatenated_batch_data.append(input_data)
    # Stack into (N, 1, 1, H, W): iterating over it yields one (1, 1, H, W) input per image
    batch_data = np.stack(unconcatenated_batch_data, axis=0)
    return batch_data

def benchmark(model_path):
    """
    Measure inference speed of an ONNX model.
    :param model_path: path to the .onnx file
    :return:
    """
    session = onnxruntime.InferenceSession(model_path)
    input_name = session.get_inputs()[0].name

    total = 0.0
    runs = 10
    # Dummy input; the shape must match the model. This model takes grayscale images,
    # hence (1, 1, ...); for a 3-channel model it would be (1, 3, ...).
    input_data = np.zeros((1, 1, 128, 128), np.float32)
    # warming up
    _ = session.run([], {input_name: input_data})
    for i in range(runs):
        start = time.perf_counter()
        _ = session.run([], {input_name: input_data})
        end = (time.perf_counter() - start) * 1000
        total += end
        print(f"{end:.2f}ms")
    total /= runs
    print(f"Avg: {total:.2f}ms")

def main():
    input_model_path = 'mobilenetv3.onnx'           # input float32 ONNX model
    output_model_path = 'mobilenetv3-quant.onnx'    # output quantized model
    calibration_dataset_path = './data/calibrate'   # folder with calibration images
    # Calibration data loader; note that its image preprocessing must match
    # the data loading/preprocessing used when training the PyTorch model.
    dr = DataReader(calibration_dataset_path, input_model_path)
    # Run static quantization
    quantize_static(input_model_path,
                    output_model_path,
                    dr,
                    quant_format=QuantFormat.QDQ,
                    per_channel=False,
                    weight_type=QuantType.QInt8)
    print("Quantization finished")
    print("float32 benchmark")
    benchmark(input_model_path)
    print("int8 benchmark")
    benchmark(output_model_path)


if __name__ == "__main__":
    main()
QuantFormat.QDQ quantizes the model by inserting QuantizeLinear/DequantizeLinear node pairs on the tensors.
Open the model in Netron to inspect it.
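Besides Netron, the inserted nodes can also be counted programmatically with the onnx package (a small sketch reusing the output file name from the script above):

import onnx
from collections import Counter

model = onnx.load("mobilenetv3-quant.onnx")
op_counts = Counter(node.op_type for node in model.graph.node)
# In QDQ format, quantization shows up as QuantizeLinear/DequantizeLinear pairs around the tensors
print("QuantizeLinear:", op_counts["QuantizeLinear"])
print("DequantizeLinear:", op_counts["DequantizeLinear"])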
The model size shrinks to about 1/4 of the original, and accuracy again drops by only 0.02%. Unlike the PyTorch before/after-quantization tests, however, there is no speedup on either Intel or AMD CPUs; the PaddlePaddle documentation reports the same observation.
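To roughly reproduce the size and output-consistency observations, here is a small sanity-check sketch (it reuses the file names from the script above; a single random input is of course no substitute for evaluating accuracy on a real test set):

import os
import numpy as np
import onnxruntime

fp32_path = "mobilenetv3.onnx"
int8_path = "mobilenetv3-quant.onnx"
print(f"fp32: {os.path.getsize(fp32_path) / 1e6:.2f} MB, "
      f"int8: {os.path.getsize(int8_path) / 1e6:.2f} MB")

# Compare outputs of the two models on the same random input
sess_fp32 = onnxruntime.InferenceSession(fp32_path)
sess_int8 = onnxruntime.InferenceSession(int8_path)
input_name = sess_fp32.get_inputs()[0].name

x = np.random.rand(1, 1, 128, 128).astype(np.float32)
out_fp32 = sess_fp32.run(None, {input_name: x})[0]
out_int8 = sess_int8.run(None, {input_name: x})[0]
print("max abs output diff:", np.abs(out_fp32 - out_int8).max())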
Inference times measured in a Python environment:
PyTorch model: 40ms
Quantized PyTorch model: 10ms
ONNX model: 4ms
Quantized ONNX model: 4ms
So the speed advantage of ONNX Runtime itself is still very clear.