YOLOv5 Quantized Deployment (Based on OpenVINO and TensorRT)

YOLOv5 OpenVINO Quantized Deployment

First, download the YOLOv5 source code and install the Python dependencies for YOLOv5 and OpenVINO.

git clone https://github.com/ultralytics/yolov5.git
cd yolov5
pip install -r requirements.txt && pip install openvino openvino-dev

Then, convert the pretrained PyTorch model to an OpenVINO FP32 IR model with the export.py script provided by YOLOv5 (the export also produces an intermediate ONNX file, which the quantization script below reads).

python export.py --weights yolov5n.pt --imgsz 640 --batch-size 1 --include openvino
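To sanity-check the conversion, you can load the exported IR with the OpenVINO Python runtime and inspect its input and output ports (a minimal sketch; ./yolov5n_openvino_model/yolov5n.xml is the path export.py writes by default):

from openvino.runtime import Core

core = Core()
# read the IR produced by export.py and list its ports
model = core.read_model("./yolov5n_openvino_model/yolov5n.xml")
print("inputs:", model.inputs)
print("outputs:", model.outputs)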

The quantization code below is adapted from: https://github.com/openvinotoolkit/openvino_notebooks/tree/main/notebooks/111-yolov5-quantization-migration

from pathlib import Path
from utils.dataloaders import create_dataloader
from utils.general import check_dataset
from export import attempt_load, yaml_save
from val import run as validation_fn
from openvino.tools import mo
from openvino.runtime import serialize
from openvino.tools.pot.api import DataLoader
from openvino.tools.pot.engines.ie_engine import IEEngine
from openvino.tools.pot.graph import load_model, save_model
from openvino.tools.pot.graph.model_utils import compress_model_weights
from openvino.tools.pot.pipeline.initializer import create_pipeline


IMAGE_SIZE = 640
MODEL_NAME = "yolov5n"
DATASET_CONFIG = "./data/coco128.yaml"


class YOLOv5POTDataLoader(DataLoader):
    '''POT DataLoader implementation wrapping the YOLOv5 validation dataloader.'''
    def __init__(self, data_source):
        super().__init__({})
        self._data_loader = data_source
        self._data_iter = iter(self._data_loader)

    def __len__(self):
        return len(self._data_loader.dataset)

    def __getitem__(self, item):
        try:
            batch_data = next(self._data_iter)
        except StopIteration:
            self._data_iter = iter(self._data_loader)
            batch_data = next(self._data_iter)

        im, target, path, shape = batch_data

        im = im.float()
        im /= 255
        nb, _, height, width = im.shape
        img = im.cpu().detach().numpy()
        target = target.cpu().detach().numpy()

        annotation = dict()
        annotation["image_path"] = path
        annotation["target"] = target
        annotation["batch_size"] = nb
        annotation["shape"] = shape
        annotation["width"] = width
        annotation["height"] = height
        annotation["img"] = img

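        # POT's IEEngine expects samples shaped as ((id, annotation), input_image)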
        return (item, annotation), img


if __name__ == "__main__":
    '''Conversion of the YOLOv5 model to OpenVINO'''
    onnx_path = f"./{MODEL_NAME}.onnx"

    # fp32 IR model
    fp32_path = f"./FP32_openvino_model/{MODEL_NAME}_fp32.xml"

    print(f"Export ONNX to OpenVINO FP32 IR to: {fp32_path}")
    model = mo.convert_model(onnx_path)
    serialize(model, fp32_path)

    # fp16 IR model
    fp16_path = f"./FP16_openvino_model/{MODEL_NAME}_fp16.xml"

    print(f"Export ONNX to OpenVINO FP16 IR to: {fp16_path}")
    model = mo.convert_model(onnx_path, compress_to_fp16=True)
    serialize(model, fp16_path)

    '''Prepare dataset for quantization'''
    data = check_dataset(DATASET_CONFIG)
    data_source = create_dataloader(data["val"], imgsz=IMAGE_SIZE, batch_size=1, stride=32, pad=0.5, workers=0)[0]
    pot_data_loader = YOLOv5POTDataLoader(data_source)

    '''Configure quantization pipeline'''
    algorithms_config = [
        {
            "name": "DefaultQuantization",
            "params": {
                "preset": "mixed",
                "stat_subset_size": 300,
                "target_device": "CPU"
            },
        }
    ]

    engine_config = {"device": "CPU"}

    model_config = {
        "model_name": f"{MODEL_NAME}",
        "model": fp32_path,
        "weights": fp32_path.replace(".xml", ".bin"),
    }

    pot_model = load_model(model_config)

    engine = IEEngine(config=engine_config, data_loader=pot_data_loader)

    pipeline = create_pipeline(algorithms_config, engine)

    '''Perform model optimization'''
    compressed_model = pipeline.run(pot_model)
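    # compress_model_weights shrinks the resulting .bin file by storing weights in INT8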
    compress_model_weights(compressed_model)
    optimized_save_dir = Path(f"./POT_INT8_openvino_model/")
    save_model(compressed_model, optimized_save_dir, model_config["model_name"] + "_int8")
    pot_int8_path = f"{optimized_save_dir}/{MODEL_NAME}_int8.xml"

    '''Compare accuracy FP32 and INT8 models'''
    model = attempt_load(f"./{MODEL_NAME}.pt", device="cpu", inplace=True, fuse=True) 
    metadata = {"stride": int(max(model.stride)), "names": model.names}  # model metadata
    yaml_save(Path(pot_int8_path).with_suffix(".yaml"), metadata)
    yaml_save(Path(fp32_path).with_suffix(".yaml"), metadata)

    print("Checking the accuracy of the original model:")
    fp32_metrics = validation_fn(
        data=DATASET_CONFIG,
        weights=Path(fp32_path).parent,
        batch_size=1,
        workers=0,
        plots=False,
        device="cpu",
        iou_thres=0.65,
    )

    fp32_ap5 = fp32_metrics[0][2]
    fp32_ap_full = fp32_metrics[0][3]
    print(f"[email protected] = {fp32_ap5}")
    print(f"[email protected]:.95 = {fp32_ap_full}")

    print("Checking the accuracy of the POT int8 model:")
    int8_metrics = validation_fn(
        data=DATASET_CONFIG,
        weights=Path(pot_int8_path).parent,
        batch_size=1,
        workers=0,
        plots=False,
        device="cpu",
        iou_thres=0.65,
    )

    pot_int8_ap5 = int8_metrics[0][2]
    pot_int8_ap_full = int8_metrics[0][3]
    print(f"[email protected] = {pot_int8_ap5}")
    print(f"[email protected]:.95 = {pot_int8_ap_full}")

Python inference:

python detect.py --weights ./POT_INT8_openvino_model

C++ inference (see the article "Deploying YOLOv5 Models with the OpenVINO C++ API"):

#include <iostream>
#include <string>
#include <vector>
#include <ctime>
#include <opencv2/opencv.hpp>
#include <openvino/openvino.hpp>


/* ---------  Please modify the path of yolov5 model and image -----------*/
std::string model_file = "yolov5n_int8.xml";
std::string image_file = "bus.jpg";

const std::vector<std::string> class_names = {
	"person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
	"fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
	"elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
	"skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
	"tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
	"sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
	"potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
	"microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
	"hair drier", "toothbrush" };


cv::Mat letterbox(cv::Mat& img, std::vector<float>& paddings, std::vector<int> new_shape = { 640, 640 })
{
	// Get current image shape [height, width]
	int img_h = img.rows;
	int img_w = img.cols;

	// Compute scale ratio(new / old) and target resized shape
	float scale = std::min(new_shape[1] * 1.0 / img_h, new_shape[0] * 1.0 / img_w);
	int resize_h = int(round(img_h * scale));
	int resize_w = int(round(img_w * scale));
	paddings[0] = scale;

	// Compute padding
	int pad_h = new_shape[1] - resize_h;
	int pad_w = new_shape[0] - resize_w;

	// Resize and pad image while meeting stride-multiple constraints
	cv::Mat resized_img;
	cv::resize(img, resized_img, cv::Size(resize_w, resize_h));

	// divide padding into 2 sides
	float half_h = pad_h * 1.0 / 2;
	float half_w = pad_w * 1.0 / 2;
	paddings[1] = half_h;
	paddings[2] = half_w;

	// Compute padding border
	int top = int(round(half_h - 0.1));
	int bottom = int(round(half_h + 0.1));
	int left = int(round(half_w - 0.1));
	int right = int(round(half_w + 0.1));

	// Add border
	cv::copyMakeBorder(resized_img, resized_img, top, bottom, left, right, cv::BORDER_CONSTANT, cv::Scalar(114, 114, 114));

	return resized_img;
}


int main(int argc, char* argv[]) 
{
	// -------- Get OpenVINO runtime version --------
	std::cout << ov::get_openvino_version().description << ':' << ov::get_openvino_version().buildNumber << std::endl;

	// -------- Step 1. Initialize OpenVINO Runtime Core --------
	ov::Core core;

	// -------- Step 2. Compile the Model --------
	auto compiled_model = core.compile_model(model_file, "CPU"); 

	// -------- Step 3. Create an Inference Request --------
	ov::InferRequest infer_request = compiled_model.create_infer_request();

	clock_t start = clock();
	// -------- Step 4. Read a picture file and do the preprocess --------
	cv::Mat img = cv::imread(image_file); //Load a picture into memory
	std::vector<float> paddings(3);       //scale, half_h, half_w
	cv::Mat resized_img = letterbox(img, paddings); //resize to (640,640) by letterbox
	cv::Mat blob = cv::dnn::blobFromImage(resized_img, 1 / 255.0, cv::Size(640, 640), cv::Scalar(0, 0, 0), true); 	// BGR->RGB, u8(0-255)->f32(0.0-1.0), HWC->NCHW

	// -------- Step 5. Feed the blob into the input node of YOLOv5 -------	
	auto input_port = compiled_model.input(); // Get input port for model with one input
	ov::Tensor input_tensor(input_port.get_element_type(), input_port.get_shape(), blob.ptr(0)); // Create tensor from external memory
	infer_request.set_input_tensor(input_tensor); // Set input tensor for model with one input

	// -------- Step 6. Start inference --------
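	// Run 100 iterations so the elapsed time below reflects aggregate latency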
	for (size_t i = 0; i < 100; i++)
		infer_request.infer();

	// -------- Step 7. Get the inference result --------
	auto output = infer_request.get_output_tensor(0);
	auto output_shape = output.get_shape();
	std::cout << "The shape of output tensor:" << output_shape << std::endl;
	cv::Mat output_buffer(output_shape[1], output_shape[2], CV_32F, output.data()); 

	// -------- Step 8. Post-process the inference result -----------
	float conf_threshold = 0.25;
	float nms_threshold = 0.5;
	std::vector<cv::Rect> boxes;
	std::vector<int> class_ids;
	std::vector<float> class_scores;
	std::vector<float> confidences;
	for (int i = 0; i < output_buffer.rows; i++) 
	{
		float confidence = output_buffer.at<float>(i, 4);
		if (confidence < conf_threshold) 
			continue;

		cv::Mat classes_scores = output_buffer.row(i).colRange(5, 85);
		cv::Point class_id;
		double score;
		cv::minMaxLoc(classes_scores, NULL, &score, NULL, &class_id);

		if (score > 0.25)
		{
			float cx = output_buffer.at<float>(i, 0);
			float cy = output_buffer.at<float>(i, 1);
			float w = output_buffer.at<float>(i, 2);
			float h = output_buffer.at<float>(i, 3);
			int left = static_cast<int>((cx - 0.5 * w - paddings[2]) / paddings[0]);
			int top = static_cast<int>((cy - 0.5 * h - paddings[1]) / paddings[0]);
			int width = static_cast<int>(w / paddings[0]);
			int height = static_cast<int>(h / paddings[0]);
			cv::Rect box;
			box.x = left;
			box.y = top;
			box.width = width;
			box.height = height;

			boxes.push_back(box);
			class_ids.push_back(class_id.x);
			class_scores.push_back(score);
			confidences.push_back(confidence);
		}
	}

	// NMS
	std::vector<int> indices;
	cv::dnn::NMSBoxes(boxes, confidences, conf_threshold, nms_threshold, indices);

	clock_t end = clock();
	std::cout << end - start << std::endl;

	// -------- Step 9. Visualize the detection results -----------
	for (size_t i = 0; i < indices.size(); i++) 
	{
		int index = indices[i];
		int class_id = class_ids[index];
		cv::rectangle(img, boxes[index], cv::Scalar(0, 0, 255), 2, 8);
		std::string label = class_names[class_id] + ":" + std::to_string(class_scores[index]);
		cv::putText(img, label, cv::Point(boxes[index].tl().x, boxes[index].tl().y - 10), cv::FONT_HERSHEY_SIMPLEX, .5, cv::Scalar(255, 0, 0));
	}

	cv::imshow("YOLOv5 OpenVINO Inference C++ Demo", img);
	cv::waitKey(0);

	return 0;
}

Timing the C++ demo's 100-iteration inference loop on an i7-12700 CPU for the FP32, FP16, and INT8 models gives the following (3 runs each):
yolov5n_fp32: 1599ms 2040ms 1514ms
yolov5n_fp16: 1505ms 2078ms 1514ms
yolov5n_int8: 856ms 861ms 852ms

FP32 and FP16 inference times are essentially the same (as expected: the OpenVINO CPU plugin upconverts FP16 IR weights to FP32 for execution), while INT8 cuts inference time roughly in half.
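For a more controlled latency measurement, the benchmark_app tool installed with the openvino-dev package can also be pointed at the same IR files, e.g. for the INT8 model produced above:

benchmark_app -m ./POT_INT8_openvino_model/yolov5n_int8.xml -d CPU -api sync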

YOLOv5 TensorRT Quantized Deployment

This part builds on the excellent tensorrtx project: https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5
It provides both Python and C++ inference code, and it is impressively thorough; with such a good wheel available, why not use it directly?
For environment setup I also referred to a blog post on configuring TensorRT YOLOv5-6.0 deployment and tensorrtx video-stream inference on Windows.
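Roughly, the tensorrtx workflow is to dump the .pt weights to a .wts file and then build a serialized engine from it. A sketch of the commands follows; binary names and flags vary across repo versions, and INT8 additionally requires enabling it in config.h plus providing a calibration image set, so check the repo README:

python gen_wts.py -w yolov5n.pt -o yolov5n.wts    # dump PyTorch weights to .wts
./yolov5_det -s yolov5n.wts yolov5n.engine n      # serialize an engine for the yolov5n variant
./yolov5_det -d yolov5n.engine ../images          # run detection on a folder of images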
I measured single-image inference times for the various YOLOv5 models (C++, RTX 3070 GPU); the two numbers per model are the first and a subsequent inference:
yolov5n-int8: 2ms 1ms
yolov5s-int8: 2ms 1ms
yolov5m-int8: 3ms 2ms
yolov5l-int8: 4ms 3ms
yolov5x-int8: 7ms 6ms

yolov5n-fp16: 1ms 1ms
yolov5s-fp16: 2ms 2ms
yolov5m-fp16: 4ms 3ms
yolov5l-fp16: 6ms 5ms
yolov5x-fp16: 10ms 9ms

yolov5n-fp32: 424ms 2ms
yolov5s-fp32: 389ms 4ms
yolov5m-fp32: 401ms 9ms
yolov5l-fp32: 422ms 17ms
yolov5x-fp32: 30ms 28ms
On my machine, the first inference of the FP32 yolov5n through yolov5l models takes noticeably longer; I am not sure why.
