本文主要是记录学习 openvino_notebooks 302-pytorch-quantization-aware-training 文档的一些收获,对于yolov5在cpu部署感兴趣的可以参考下。
此文档的目的是为了了解openvino如何降低模型部署的推理时间,同时尽可能保证精度。
此文档一共提供了两种PTQ量化方式(POT和NNCF),下面分别介绍。
首先两者都需要使用数据进行校准(calibration),借助于yolov5优秀的代码,这里直接调用它的dataloader即可。
def create_data_source():
    """Create the YOLOv5 dataloader that supplies calibration data.

    Returns:
        Element ``[0]`` of what ``create_dataloader`` returns for the
        ``"val"`` entry of the dataset description loaded from
        ``DATASET_CONFIG``.
    """
    data = check_dataset(DATASET_CONFIG)
    val_dataloader = create_dataloader(
        data["val"], imgsz=640, batch_size=1, stride=32, pad=0.5, workers=1
    )[0]
    return val_dataloader
获取yolov5原始的data之后,对于POT,需要再实现openvino.tools.pot.api
里面的DataLoader
类, 有两个必须要实现的方法__init__
和__getitem__
.
from openvino.tools.pot.api import DataLoader
class YOLOv5POTDataLoader(DataLoader):
    """Adapter exposing a YOLOv5 dataloader through the POT ``DataLoader`` API.

    Batches are pulled sequentially from the wrapped loader; the index passed
    to ``__getitem__`` does not select which batch is returned.
    """

    def __init__(self, data_source):
        """Keep a reference to ``data_source`` and open an iterator over it.

        Args:
            data_source: Iterable loader that also exposes a ``dataset``
                attribute supporting ``len()``.
        """
        # The base class is initialised with an empty config dict.
        super().__init__({})
        self._data_loader = data_source
        self._data_iter = iter(self._data_loader)

    def __len__(self):
        """Return the length of the wrapped loader's ``dataset``."""
        return len(self._data_loader.dataset)

    def __getitem__(self, item):
        """Return the next batch of the wrapped loader.

        Args:
            item: Index supplied by the caller. It is only echoed back in the
                result; the data always comes from the internal iterator.

        Returns:
            ``((item, annotation), img)`` where ``img`` is the batch image
            tensor converted to float, divided by 255 and turned into a
            NumPy array, and ``annotation`` is a dict describing the batch.
        """
        try:
            batch_data = next(self._data_iter)
        except StopIteration:
            # Wrapped loader is exhausted: restart it and keep serving data.
            self._data_iter = iter(self._data_loader)
            batch_data = next(self._data_iter)

        # Each batch must unpack into exactly these four values.
        im, target, path, shape = batch_data
        im = im.float()
        im /= 255  # presumably maps 8-bit pixel values into [0, 1]
        # The image tensor is 4-D; the second dimension is not used here.
        nb, _, height, width = im.shape
        img = im.cpu().detach().numpy()
        target = target.cpu().detach().numpy()

        annotation = {
            "image_path": path,
            "target": target,
            "batch_size": nb,
            "shape": shape,
            "width": width,
            "height": height,
            "img": img,
        }
        return (item, annotation), img
对于POT,量化参数需要通过配置字典(configuration dictionary)定义。包括algorithms
,描述算法的参数,engine
描述推理流程的参数(optional),model
描述float模型的路径。
# Algorithms section: a single DefaultQuantization pass and its parameters.
algorithms_config = [
    {
        "name": "DefaultQuantization",
        "params": {
            "preset": "mixed",
            "stat_subset_size": 300,
            "target_device": "CPU",
        },
    },
]

# Engine section: handed to IEEngine as its config.
engine_config = {"device": "CPU"}

# Model section: name of the float model plus the paths of its .xml/.bin pair.
model_config = {
    "model_name": f"{MODEL_NAME}",
    "model": fp32_path,
    # The weights file sits next to the .xml with a .bin extension.
    "weights": fp32_path.replace(".xml", ".bin"),
}
配置好之后,通过以下代码加载模型和创建pipeline
from openvino.tools.pot.engines.ie_engine import IEEngine
from openvino.tools.pot.graph import load_model
from openvino.tools.pot.pipeline.initializer import create_pipeline
# Load the model described by `model_config` into POT's model representation.
pot_model = load_model(model_config)
# Create the engine from `engine_config`; it draws its data from `pot_data_loader`
# (used for metric calculation and statistics collection).
engine = IEEngine(config=engine_config, data_loader=pot_data_loader)
# Build the pipeline of compression algorithms listed in `algorithms_config`,
# bound to the engine created above.
pipeline = create_pipeline(algorithms_config, engine)
通过调用pipeline.run(pot_model)
来开始POT量化,可以通过save_model
来保存,optionally,可以通过compress_model_weights
来对模型的权重进行压缩。这里不是很理解,我还以为int8之后模型的大小会自动变小呢?
from openvino.tools.pot.graph.model_utils import compress_model_weights
from openvino.tools.pot.graph import load_model, save_model
# Run the quantization pipeline on the loaded POT model.
compressed_model = pipeline.run(pot_model)
# Optional step: compress the weights of the quantized model. The return value
# is not used, so this presumably modifies `compressed_model` in place.
compress_model_weights(compressed_model)
# Directory that receives the INT8 IR files.
optimized_save_dir = Path(f"{MODEL_PATH}/POT_INT8_openvino_model/")
# Save under the original model name with an "_int8" suffix.
save_model(compressed_model, optimized_save_dir, model_config["model_name"] + "_int8")
# Expected location of the saved .xml (assumes model_config["model_name"] equals MODEL_NAME).
pot_int8_path = f"{optimized_save_dir}/{MODEL_NAME}_int8.xml"
nncf不需要再实现一个Dataloader类,它只需要将yolov5中的val_dataloader 封装进nncf.Dataset
中即可。此外,可以将data预处理的逻辑写进去,以使数据满足model的输入要求,上面的POT也实现了该功能。
import nncf
def transform_fn(data_item):
    """Turn one item of the data source into the model's input array.

    Args:
        data_item: Indexable item yielded by the data source; element ``0``
            must be the torch tensor holding the input images.

    Returns:
        The images as a NumPy array: converted to float and divided by 255.
    """
    images = data_item[0]
    # float conversion first, then scaling (presumably 8-bit pixels -> [0, 1])
    scaled = images.float() / 255
    return scaled.cpu().detach().numpy()
# Wrap the framework-specific data source into an `nncf.Dataset`; `transform_fn`
# is passed along so each item can be converted into the model's input.
nncf_calibration_dataset = nncf.Dataset(data_source, transform_fn)
nncf有多种量化方式。nncf.quantize
用来进行 DefaultQuantization Algorithm, nncf.quantize_with_accuracy_control
用来进行 AccuracyAwareQuantization。它也有一些参数,诸如preset
, model_type
, subset_size
, fast_bias_correction
等,当前我只知道subset_size
代表数据集的数量,其它还不是很明确。本文用到的配置如下
# Passed to nncf.quantize as `subset_size` (presumably the number of calibration samples).
subset_size = 300
# Quantization preset passed to nncf.quantize as `preset`.
preset = nncf.QuantizationPreset.MIXED
通过调用nncf.quantize
来进行nncf量化,通过调用openvino.runtime.serialize
保存模型。
# `serialize` is called at the end of this snippet, so import it with `Core`.
from openvino.runtime import Core, serialize

# Read the FP32 IR that serves as the quantization input.
core = Core()
ov_model = core.read_model(fp32_path)
# Quantize using the calibration dataset, preset and subset size defined earlier.
quantized_model = nncf.quantize(
    ov_model, nncf_calibration_dataset, preset=preset, subset_size=subset_size
)
# Write the quantized model to disk.
nncf_int8_path = f"{MODEL_PATH}/NNCF_INT8_openvino_model/{MODEL_NAME}_int8.xml"
serialize(quantized_model, nncf_int8_path)
这是在我本地测试一张1600*1600的c++推理结果,分为模型加载(init)和推理(infer)两部分,在效果没有明显差别的情况下,POT和nncf性能都提升了4倍左右。
下面是我将jupyter中的代码移植到本地yolov5中的完整代码。
import torch
import nncf
import sys
from openvino.tools import mo
from openvino.runtime import serialize
from pathlib import Path
sys.path.append('.')
sys.path.append('..')
from utils.dataloaders import create_dataloader
from utils.general import check_dataset
from openvino.tools.pot.api import DataLoader
from openvino.tools.pot.engines.ie_engine import IEEngine
from openvino.tools.pot.graph import load_model
from openvino.tools.pot.pipeline.initializer import create_pipeline
from openvino.tools.pot.graph.model_utils import compress_model_weights
from openvino.tools.pot.graph import save_model
from openvino.runtime import Core
from export import attempt_load, yaml_save
from val import run as validation_fn
# Image size passed as `imgsz` when the calibration dataloader is created.
IMAGE_SIZE = 1600
# ONNX model that is converted to OpenVINO IR by onnx2mo().
ONNX_PATH = './weights/best.onnx'
# Base name used in every generated IR file name.
MODEL_NAME = "yolov5l"
# Root directory under which the per-precision IR folders are written.
MODEL_PATH = "./weights"
# Dataset description file handed to check_dataset().
DATASET_CONFIG = "mydata/0612.yaml"
# Paths of the FP32 / FP16 IR .xml files; the FP32 one is the quantization input.
fp32_path = f"{MODEL_PATH}/FP32_openvino_model/{MODEL_NAME}_fp32.xml"
fp16_path = f"{MODEL_PATH}/FP16_openvino_model/{MODEL_NAME}_fp16.xml"
class YOLOv5POTDataLoader(DataLoader):
    """Adapter exposing a YOLOv5 dataloader through the POT ``DataLoader`` API.

    Batches are pulled sequentially from the wrapped loader; the index passed
    to ``__getitem__`` does not select which batch is returned.
    """

    def __init__(self, data_source):
        """Keep a reference to ``data_source`` and open an iterator over it.

        Args:
            data_source: Iterable loader that also exposes a ``dataset``
                attribute supporting ``len()``.
        """
        # The base class is initialised with an empty config dict.
        super().__init__({})
        self._data_loader = data_source
        self._data_iter = iter(self._data_loader)

    def __len__(self):
        """Return the length of the wrapped loader's ``dataset``."""
        return len(self._data_loader.dataset)

    def __getitem__(self, index):
        """Return the next batch of the wrapped loader.

        Args:
            index: Index supplied by the caller. It is only echoed back in
                the result; the data always comes from the internal iterator.

        Returns:
            ``((index, annotation), img)`` where ``img`` is the batch image
            tensor converted to float, divided by 255 and turned into a
            NumPy array, and ``annotation`` is a dict describing the batch.
        """
        try:
            batch_data = next(self._data_iter)
        except StopIteration:
            # Wrapped loader is exhausted: restart it and keep serving data.
            self._data_iter = iter(self._data_loader)
            batch_data = next(self._data_iter)

        # Each batch must unpack into exactly these four values.
        im, target, path, shape = batch_data
        im = im.float()
        im /= 255  # presumably maps 8-bit pixel values into [0, 1]
        # The image tensor is 4-D; the second dimension is not used here.
        nb, _, height, width = im.shape
        img = im.cpu().detach().numpy()
        target = target.cpu().detach().numpy()

        annotation = {
            "image_path": path,
            "target": target,
            "batch_size": nb,
            "shape": shape,
            "width": width,
            "height": height,
            "img": img,
        }
        return (index, annotation), img
def onnx2mo(onnx_path):
    """Convert an ONNX model to OpenVINO IR in FP32 and FP16 variants.

    Each variant is converted with ``mo.convert_model`` and written with
    ``serialize`` to
    ``{MODEL_PATH}/<PRECISION>_openvino_model/{MODEL_NAME}_<precision>.xml``.

    Args:
        onnx_path: Path of the ONNX model to convert.
    """
    # (precision tag, extra keyword arguments for mo.convert_model)
    variants = (
        ("FP32", {}),
        ("FP16", {"compress_to_fp16": True}),
    )
    for tag, convert_kwargs in variants:
        ir_path = f"{MODEL_PATH}/{tag}_openvino_model/{MODEL_NAME}_{tag.lower()}.xml"
        print(f"Export ONNX to OpenVINO {tag} IR to: {ir_path}")
        model = mo.convert_model(onnx_path, **convert_kwargs)
        serialize(model, ir_path)
def create_data_source():
    """Create the YOLOv5 dataloader that supplies calibration data.

    Note that the ``'train'`` entry of the dataset description is the one
    being loaded here.

    Returns:
        Element ``[0]`` of what ``create_dataloader`` returns.
    """
    data = check_dataset(DATASET_CONFIG)
    dataloader = create_dataloader(
        data['train'],
        imgsz=IMAGE_SIZE,
        batch_size=1,
        stride=32,
        pad=0.5,
        workers=1,
    )[0]
    return dataloader
# transform used when wrapping the data source into an nncf.Dataset
def transform_fn(data_item):
    """Turn one item of the data source into the model's input array.

    Args:
        data_item: Indexable item yielded by the data source; element ``0``
            must be the torch tensor holding the input images.

    Returns:
        The images as a NumPy array: converted to float and divided by 255.
    """
    images = data_item[0]
    # float conversion first, then scaling (presumably 8-bit pixels -> [0, 1])
    scaled = images.float() / 255
    return scaled.cpu().detach().numpy()
# --- POT configuration ---
# Algorithms section: a single DefaultQuantization pass and its parameters.
algorithms_config = [
    {
        "name": "DefaultQuantization",
        "params": {
            "preset": "mixed",
            "stat_subset_size": 300,
            "target_device": "CPU",
        },
    },
]

# Engine section: handed to IEEngine as its config.
engine_config = {"device": "CPU"}

# Model section: name of the float model plus the paths of its .xml/.bin pair.
model_config = {
    "model_name": f"{MODEL_NAME}",
    "model": fp32_path,
    # The weights file sits next to the .xml with a .bin extension.
    "weights": fp32_path.replace(".xml", ".bin"),
}

# --- NNCF configuration ---
# Both values are forwarded to nncf.quantize by quant_nncf().
subset_size = 80
preset = nncf.QuantizationPreset.MIXED
# quantization with pot
def quant_pot():
    """Quantize the model with POT and save the INT8 IR.

    Reads the module-level ``pipeline`` and ``pot_model`` objects, which the
    ``__main__`` section creates before calling this function.

    Returns:
        str: Path where the INT8 ``.xml`` is expected after ``save_model``
        (assumes ``model_config["model_name"]`` equals ``MODEL_NAME``).
    """
    compressed_model = pipeline.run(pot_model)
    # Return value unused; presumably compresses the weights in place.
    compress_model_weights(compressed_model)

    optimized_save_dir = Path(f"{MODEL_PATH}/POT_INT8_openvino_model/")
    int8_model_name = model_config["model_name"] + "_int8"
    save_model(compressed_model, optimized_save_dir, int8_model_name)

    # Previously computed and discarded; returned so callers can use it.
    return f"{optimized_save_dir}/{MODEL_NAME}_int8.xml"
# quantization with nncf
def quant_nncf():
    """Quantize the FP32 IR with NNCF and write the INT8 IR.

    Reads the module-level ``nncf_calibration_dataset`` (created in the
    ``__main__`` section) together with ``preset`` and ``subset_size``.

    Returns:
        str: Path the quantized model was serialized to.
    """
    ov_model = Core().read_model(fp32_path)
    quantized_model = nncf.quantize(
        ov_model,
        nncf_calibration_dataset,
        preset=preset,
        subset_size=subset_size,
    )
    nncf_int8_path = f"{MODEL_PATH}/NNCF_INT8_openvino_model/{MODEL_NAME}_int8.xml"
    serialize(quantized_model, nncf_int8_path)
    return nncf_int8_path
if __name__ == '__main__':
    # Convert the ONNX model to IR first: both quantizers read the FP32 IR.
    onnx2mo(ONNX_PATH)

    # One YOLOv5 dataloader feeds both calibration wrappers.
    data_source = create_data_source()
    pot_data_loader = YOLOv5POTDataLoader(data_source)
    print('create yolov5 pot data done')
    nncf_calibration_dataset = nncf.Dataset(data_source, transform_func=transform_fn)
    print('wrap data source into nncf.dataset object. ')

    # The names below must stay at module level: quant_pot() and quant_nncf()
    # read them as globals.
    # Load model as POT model representation
    pot_model = load_model(model_config)
    # Initialize the engine for metric calculation and statistics collection.
    engine = IEEngine(config=engine_config, data_loader=pot_data_loader)
    # Create a pipeline of compression algorithms.
    pipeline = create_pipeline(algorithms_config, engine)

    quant_pot()
    print('pot quant done.')
    quant_nncf()
    print('nncf quant done.')