YOLOv7迁移昇腾NPU推理测试

1、准备源码和模型文件:下载YOLOv7官方源码和预训练模型(yolov7.pt)。

2、使用源码脚本将模型导出为onnx格式:python3 export.py --weights yolov7.pt --grid --simplify --topk-all 100 --img-size 640 640 --max-wh 640

3、模型转换,使用昇腾atc模型转换工具将onnx模型转换为om格式。测试设备为昇腾Atlas300I推理卡,需要将soc_version设置为Ascend310,--framework=5表示原始模型为ONNX格式。转换命令:atc --model=yolov7.onnx --framework=5 --output=yolov7 --soc_version=Ascend310

4、使用昇腾acl接口编写推理脚本。acl同时提供C++和Python接口,这里用Python接口(pyACL)进行测试。主要步骤:acl初始化(设置device、创建context)、加载模型(获取模型id、模型描述,并根据模型信息在npu上申请输入输出内存)、执行推理(拷贝输入数据、执行推理、拷贝输出数据)、释放资源(销毁context、复位device、acl去初始化)。

om模型推理脚本:

import acl
import numpy as np


class ACL_inference(object):
    """Minimal pyACL wrapper that runs one offline (.om) model on an Ascend NPU.

    Expected call order: construct -> ``init(model_path)`` ->
    ``load_input_data(img)`` -> ``execute()`` -> ``process_output()`` ->
    ``destory()`` (also available under the correctly spelled ``destroy()``).
    """

    def __init__(self, device_id):
        """Initialise ACL, select ``device_id`` and create a context on it.

        Args:
            device_id: index of the NPU device to run on.

        Raises:
            RuntimeError: if selecting the device or creating the context
                reports a non-zero ACL return code.
        """
        self.device_id = device_id
        # The return code of acl.init() is deliberately left unchecked, as in
        # the original code: creating a second instance in the same process
        # would otherwise start failing here.
        acl.init()
        self._check(acl.rt.set_device(self.device_id), "acl.rt.set_device")
        self.context, ret = acl.rt.create_context(self.device_id)
        self._check(ret, "acl.rt.create_context")
        # Memory-copy direction flags passed to acl.rt.memcpy.
        self.ACL_MEMCPY_HOST_TO_DEVICE = 1
        self.ACL_MEMCPY_DEVICE_TO_HOST = 2
        # Allocation policy passed to acl.rt.malloc.
        # NOTE(review): in the ACL headers the value 2 appears to be
        # ACL_MEM_MALLOC_NORMAL_ONLY (HUGE_ONLY being 1) -- confirm. The name
        # and value are kept unchanged so existing behaviour is preserved.
        self.ACL_MEM_MALLOC_HUGE_ONLY = 2
        self.model_id = None
        self.model_desc = None
        self.load_input_dataset = None
        self.load_output_dataset = None
        # One dict per model input/output:
        # {"buffer": device pointer, "size": bytes, "data": aclDataBuffer}.
        self.input_data = []
        self.output_data = []

    @staticmethod
    def _check(ret, what):
        """Raise RuntimeError when an ACL call returned a non-zero code."""
        if ret != 0:
            raise RuntimeError(f"{what} failed, ret={ret}")

    def init(self, model_path):
        """Load the .om model and allocate device buffers for its inputs/outputs.

        Args:
            model_path: path of the offline model file.

        Raises:
            RuntimeError: if loading the model, reading its description or
                allocating a buffer fails.
        """
        self.model_id, ret = acl.mdl.load_from_file(model_path)
        self._check(ret, "acl.mdl.load_from_file")
        self.model_desc = acl.mdl.create_desc()
        self._check(acl.mdl.get_desc(self.model_desc, self.model_id),
                    "acl.mdl.get_desc")
        self.gen_input_dataset()
        self.gen_output_dataset()

    def _fill_dataset(self, dataset, count, size_by_index, records):
        """Allocate ``count`` device buffers and attach them to ``dataset``.

        ``size_by_index`` is the acl.mdl size getter for inputs or outputs;
        every allocated buffer is appended to ``records`` so it can be
        released later.
        """
        for index in range(count):
            buffer_size = size_by_index(self.model_desc, index)
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_HUGE_ONLY)
            self._check(ret, "acl.rt.malloc")
            data = acl.create_data_buffer(buffer, buffer_size)
            # Record before attaching so the memory is still released by
            # destory() if attaching to the dataset fails.
            records.append({"buffer": buffer, "size": buffer_size, "data": data})
            _, ret = acl.mdl.add_dataset_buffer(dataset, data)
            self._check(ret, "acl.mdl.add_dataset_buffer")

    def gen_output_dataset(self):
        """Create the output dataset with one device buffer per model output."""
        self.load_output_dataset = acl.mdl.create_dataset()
        # Number of outputs declared by the model.
        output_size = acl.mdl.get_num_outputs(self.model_desc)
        self._fill_dataset(self.load_output_dataset, output_size,
                           acl.mdl.get_output_size_by_index, self.output_data)

    def gen_input_dataset(self):
        """Create the input dataset with one device buffer per model input."""
        self.load_input_dataset = acl.mdl.create_dataset()
        input_size = acl.mdl.get_num_inputs(self.model_desc)
        self._fill_dataset(self.load_input_dataset, input_size,
                           acl.mdl.get_input_size_by_index, self.input_data)

    def process_output(self):
        """Copy every model output back to the host as a numpy array.

        Returns:
            A list with one array per model output, interpreted as float32
            and reshaped to the dimensions reported by the model description.

        Raises:
            RuntimeError: if querying dimensions, allocating host memory or
                copying from the device fails.
        """
        inference_result = []
        for i, item in enumerate(self.output_data):
            # get_output_dims returns (dims_info, ret). The original indexed
            # that tuple with the loop index, which only worked for output 0.
            dims, ret = acl.mdl.get_output_dims(self.model_desc, i)
            self._check(ret, "acl.mdl.get_output_dims")
            shape = tuple(dims["dims"])
            size = item["size"]
            buffer_host, ret = acl.rt.malloc_host(size)
            self._check(ret, "acl.rt.malloc_host")
            try:
                # Transfer the inference output from device to host.
                ret = acl.rt.memcpy(buffer_host, size, item["buffer"], size,
                                    self.ACL_MEMCPY_DEVICE_TO_HOST)
                self._check(ret, "acl.rt.memcpy (device to host)")
                bytes_out = acl.util.ptr_to_bytes(buffer_host, size)
                # copy() makes the array own its memory before the staging
                # buffer below is released.
                data = np.frombuffer(bytes_out, dtype=np.float32).reshape(shape).copy()
            finally:
                # The staging buffer was previously leaked on every call.
                acl.rt.free_host(buffer_host)
            inference_result.append(data)
        return inference_result

    def load_input_data(self, img):
        """Copy ``img`` into the device buffer of the first model input.

        Args:
            img: numpy array whose raw bytes match the model's first input
                exactly (layout and dtype are the caller's responsibility).

        Raises:
            ValueError: if the byte size of ``img`` differs from the input
                buffer size; copying anyway would read past the end of the
                host data or silently truncate it.
            RuntimeError: if the host-to-device copy fails.
        """
        bytes_data = img.tobytes()
        expected = self.input_data[0]["size"]
        if len(bytes_data) != expected:
            raise ValueError(
                f"input data is {len(bytes_data)} bytes, model expects {expected} bytes")
        np_ptr = acl.util.bytes_to_ptr(bytes_data)
        # Transfer the image data from host to device.
        ret = acl.rt.memcpy(self.input_data[0]["buffer"], expected, np_ptr,
                            expected, self.ACL_MEMCPY_HOST_TO_DEVICE)
        self._check(ret, "acl.rt.memcpy (host to device)")

    def execute(self):
        """Run one synchronous inference on the loaded model.

        Raises:
            RuntimeError: if model execution reports a non-zero return code.
        """
        ret = acl.mdl.execute(self.model_id, self.load_input_dataset,
                              self.load_output_dataset)
        self._check(ret, "acl.mdl.execute")

    def destory(self):
        """Release model, buffers, context and device, then finalize ACL.

        Teardown is best-effort: return codes are ignored so that every step
        is attempted even if an earlier one fails.
        """
        if self.model_id is not None:
            acl.mdl.unload(self.model_id)
            self.model_id = None
        if self.model_desc is not None:
            acl.mdl.destroy_desc(self.model_desc)
            self.model_desc = None
        for records in (self.input_data, self.output_data):
            for record in records:
                acl.destroy_data_buffer(record["data"])
                acl.rt.free(record["buffer"])
            records.clear()
        if self.load_input_dataset is not None:
            acl.mdl.destroy_dataset(self.load_input_dataset)
            self.load_input_dataset = None
        if self.load_output_dataset is not None:
            acl.mdl.destroy_dataset(self.load_output_dataset)
            self.load_output_dataset = None
        acl.rt.destroy_context(self.context)
        acl.rt.reset_device(self.device_id)
        acl.finalize()

    # Correctly spelled alias; ``destory`` is kept for existing callers.
    destroy = destory

5、推理测试脚本。整体推理过程为图像预处理、加载模型、加载输入数据、执行推理、获取输出结果、图像后处理。

import cv2
import time
from NMS import nms, draw
# End-to-end test: preprocess one image, run it through the .om model on the
# NPU, then apply NMS and draw the detections.
time_ = time.time()
image_path = '1.jpg'
img_ = cv2.imread(image_path)
# cv2.imread reports a missing/unreadable file by returning None, which would
# otherwise surface as an obscure AttributeError on the next line.
if img_ is None:
    raise FileNotFoundError(f'cannot read image: {image_path}')
# Ratios between the original image size and the 640x640 network input;
# handed to draw() below (presumably to map boxes back -- see NMS.draw).
x_scale = img_.shape[1] / 640
y_scale = img_.shape[0] / 640
# OpenCV decodes images as BGR, while the YOLOv7 reference pipeline feeds the
# network RGB, so swap the channel order before normalising.
img = cv2.cvtColor(img_, cv2.COLOR_BGR2RGB)
img = img / 255.
img = cv2.resize(img, (640, 640))
# HWC -> CHW, then add the batch dimension: (1, 3, 640, 640).
img = np.transpose(img, (2, 0, 1))
img = np.expand_dims(img, axis=0)
img = img.astype(np.float32)
print(f'image preprocess time cost:{time.time() - time_}')
model_path = './yolov7.om'
device_id = 0
model = ACL_inference(device_id)
try:
    model.init(model_path)
    time0 = time.time()
    model.load_input_data(img)
    print(f'data copy to device time cost:{time.time() - time0}')
    time1 = time.time()
    model.execute()
    print(f'device inference time cost:{time.time() - time1}')
    time2 = time.time()
    # First output of the model, first (only) image of the batch.
    result = model.process_output()[0][0]
    print(f'data copy to host time cost:{time.time() - time2}')
finally:
    # Always release the NPU context/device, even if a step above failed.
    model.destory()
print(result.shape)
# Thresholds 0.35 / 0.45 are passed through to the project's nms() helper
# (presumably confidence and IoU thresholds -- confirm against NMS.nms).
result = nms(result, 0.35, 0.45)
print(result)
result_img = draw(img_, x_scale, y_scale, result)
cv2.imwrite('result.jpg', result_img)

测试图像:

输出结果:

YOLOv7迁移昇腾NPU推理测试_第1张图片

 

6、推理结果分析

从推理结果看,精度方面检测结果基本正常;性能方面,模型在NPU上的推理耗时约为30毫秒,基本可以满足30fps以内的视频实时推理。目前主要的时间消耗在图像解码和数据传输上,后续可以采用NPU上的DVPP模块进行图像/视频解码,并在atc转换时加入AIPP图像预处理功能,以减少数据预处理时间。

你可能感兴趣的:(YOLO,深度学习,人工智能)