1. Prepare the source code and model file: download the official yolov7 source code and the pretrained model (yolov7.pt).
2. Export the model to ONNX with the export script from the repository: python3 export.py --weights yolov7.pt --grid --simplify --topk-all 100 --img-size 640 640 --max-wh 640
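Before converting, the exported model can optionally be sanity-checked on the host. Below is a minimal sketch, assuming the onnxruntime package is installed (not part of the original steps); the exact input/output names and shapes depend on the export options used above:
import onnxruntime as ort

session = ort.InferenceSession("yolov7.onnx", providers=["CPUExecutionProvider"])
for inp in session.get_inputs():
    print("input:", inp.name, inp.shape, inp.type)    # typically something like images, [1, 3, 640, 640], float32
for out in session.get_outputs():
    print("output:", out.name, out.shape, out.type)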
3. Model conversion: use the Ascend ATC tool to convert the ONNX model to OM format. The test device is an Ascend Atlas 300I inference card, so soc_version must be set to Ascend310. Conversion command: atc --model=yolov7.onnx --framework=5 --output=yolov7 --soc_version=Ascend310
4. Write the inference script with the Ascend ACL API. ACL provides both C++ and Python interfaces; the Python interface is used here. The main steps are: ACL initialization (set the device and create a context), model loading (get the model ID and model description, and allocate input/output memory on the NPU according to the model description), and inference execution (copy input data to the device, run inference, copy output data back to the host).
OM model inference script:
import acl
import numpy as np


class ACL_inference(object):
    def __init__(self, device_id):
        self.device_id = device_id
        # initialize ACL, bind the device and create a context
        acl.init()
        acl.rt.set_device(self.device_id)
        self.context, _ = acl.rt.create_context(self.device_id)
        # memcpy directions and memory allocation policy (aclrtMemcpyKind / aclrtMemMallocPolicy)
        self.ACL_MEMCPY_HOST_TO_DEVICE = 1
        self.ACL_MEMCPY_DEVICE_TO_HOST = 2
        self.ACL_MEM_MALLOC_NORMAL_ONLY = 2
        self.model_id = None
        self.model_desc = None
        self.load_input_dataset = None
        self.load_output_dataset = None
        self.input_data = []
        self.output_data = []

    def init(self, model_path):
        # load the om model and query its description
        self.model_id, _ = acl.mdl.load_from_file(model_path)
        self.model_desc = acl.mdl.create_desc()
        acl.mdl.get_desc(self.model_desc, self.model_id)
        self.gen_input_dataset()
        self.gen_output_dataset()

    def gen_output_dataset(self):
        self.load_output_dataset = acl.mdl.create_dataset()
        # get the number of model outputs
        output_size = acl.mdl.get_num_outputs(self.model_desc)
        # allocate device memory for each output and add it to the aclmdlDataset
        for i in range(output_size):
            buffer_size = acl.mdl.get_output_size_by_index(self.model_desc, i)
            # allocate output memory on the device
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_NORMAL_ONLY)
            data = acl.create_data_buffer(buffer, buffer_size)
            _, ret = acl.mdl.add_dataset_buffer(self.load_output_dataset, data)
            self.output_data.append({"buffer": buffer, "size": buffer_size})

    def gen_input_dataset(self):
        self.load_input_dataset = acl.mdl.create_dataset()
        input_size = acl.mdl.get_num_inputs(self.model_desc)
        for i in range(input_size):
            buffer_size = acl.mdl.get_input_size_by_index(self.model_desc, i)
            buffer, ret = acl.rt.malloc(buffer_size, self.ACL_MEM_MALLOC_NORMAL_ONLY)
            data = acl.create_data_buffer(buffer, buffer_size)
            _, ret = acl.mdl.add_dataset_buffer(self.load_input_dataset, data)
            self.input_data.append({"buffer": buffer, "size": buffer_size})

    def process_output(self):
        inference_result = []
        for i, item in enumerate(self.output_data):
            dims, ret = acl.mdl.get_output_dims(self.model_desc, i)
            shape = tuple(dims["dims"])
            buffer_host, ret = acl.rt.malloc_host(item["size"])
            # copy the inference output from Device to Host
            acl.rt.memcpy(buffer_host, item["size"], item["buffer"],
                          item["size"], self.ACL_MEMCPY_DEVICE_TO_HOST)
            bytes_out = acl.util.ptr_to_bytes(buffer_host, item["size"])
            # the output tensors of this model are float32
            data = np.frombuffer(bytes_out, dtype=np.float32).reshape(shape)
            inference_result.append(data)
            acl.rt.free_host(buffer_host)
        return inference_result

    def load_input_data(self, img):
        bytes_data = img.tobytes()
        np_ptr = acl.util.bytes_to_ptr(bytes_data)
        # copy the image data from Host to Device
        acl.rt.memcpy(self.input_data[0]["buffer"], self.input_data[0]["size"], np_ptr,
                      self.input_data[0]["size"], self.ACL_MEMCPY_HOST_TO_DEVICE)

    def execute(self):
        acl.mdl.execute(self.model_id, self.load_input_dataset, self.load_output_dataset)

    def destroy(self):
        # release the model, device memory and ACL resources
        acl.mdl.unload(self.model_id)
        for item in self.input_data + self.output_data:
            acl.rt.free(item["buffer"])
        acl.rt.destroy_context(self.context)
        acl.rt.reset_device(self.device_id)
        acl.finalize()
5. Inference test script. The overall pipeline is: image preprocessing, model loading, input data loading, inference execution, output retrieval, and image post-processing.
import cv2
import time
import numpy as np
from NMS import nms, draw  # the author's post-processing helpers (a sketch is given after this script)
from acl_inference import ACL_inference  # the ACL_inference class from step 4 (module name assumed)
time_ = time.time()
image_path = '1.jpg'
img_ = cv2.imread(image_path)
x_scale = img_.shape[1] / 640
y_scale = img_.shape[0] / 640
img = img_ / 255.
img = cv2.resize(img, (640, 640))
img = np.transpose(img, (2, 0, 1))
img = np.expand_dims(img, axis=0)
img = img.astype(np.float32)
print(f'image preprocess time cost:{time.time() - time_}')
model_path = './yolov7.om'
device_id = 0
model = ACL_inference(device_id)
model.init(model_path)
time0 = time.time()
model.load_input_data(img)
print(f'data copy to device time cost:{time.time() - time0}')
time1 = time.time()
model.execute()
print(f'device inference time cost:{time.time() - time1}')
time2 = time.time()
result = model.process_output()[0][0]
print(f'data copy to host time cost:{time.time() - time2}')
model.destroy()
print(result.shape)
result = nms(result, 0.35, 0.45)
print(result)
result_img = draw(img_, x_scale, y_scale, result)
cv2.imwrite('result.jpg', result_img)
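The NMS module imported at the top of this script is not listed in this post. For reference, here is a minimal sketch of what nms() and draw() could look like, assuming each prediction row follows the standard YOLOv7 layout [cx, cy, w, h, obj_conf, 80 class scores] in 640x640 input coordinates; it uses class-agnostic NMS via OpenCV and is not the author's implementation:
import cv2
import numpy as np

def nms(pred, conf_thres=0.35, iou_thres=0.45):
    # pred: (N, 85) array; returns a list of [x1, y1, x2, y2, score, class_id]
    scores = pred[:, 4] * pred[:, 5:].max(axis=1)
    cls_ids = pred[:, 5:].argmax(axis=1)
    keep = scores > conf_thres
    pred, scores, cls_ids = pred[keep], scores[keep], cls_ids[keep]
    # convert center format (cx, cy, w, h) to corner format (x1, y1, x2, y2)
    boxes = np.empty((pred.shape[0], 4), dtype=np.float32)
    boxes[:, 0] = pred[:, 0] - pred[:, 2] / 2
    boxes[:, 1] = pred[:, 1] - pred[:, 3] / 2
    boxes[:, 2] = pred[:, 0] + pred[:, 2] / 2
    boxes[:, 3] = pred[:, 1] + pred[:, 3] / 2
    # class-agnostic NMS via OpenCV (expects [x, y, w, h] boxes)
    wh_boxes = [[float(x1), float(y1), float(x2 - x1), float(y2 - y1)] for x1, y1, x2, y2 in boxes]
    idx = cv2.dnn.NMSBoxes(wh_boxes, scores.astype(float).tolist(), conf_thres, iou_thres)
    idx = np.array(idx).reshape(-1)
    return [[*boxes[i], scores[i], cls_ids[i]] for i in idx]

def draw(img, x_scale, y_scale, detections):
    # map boxes from 640x640 back to the original image size and draw them
    for x1, y1, x2, y2, score, cls_id in detections:
        p1 = (int(x1 * x_scale), int(y1 * y_scale))
        p2 = (int(x2 * x_scale), int(y2 * y_scale))
        cv2.rectangle(img, p1, p2, (0, 255, 0), 2)
        cv2.putText(img, f"{int(cls_id)}: {score:.2f}", (p1[0], max(p1[1] - 5, 0)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 255, 0), 2)
    return img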
Test image:
Output result:
6. Analysis of the inference results
Judging from the results, accuracy is basically sufficient for normal inference. On the performance side, inference on the NPU takes about 30 ms, which is roughly enough for real-time video inference at up to 30 fps; most of the remaining time is spent on image decoding and data transfer. Further optimization could use the DVPP module on the NPU for image/video decoding, and add AIPP image preprocessing during ATC conversion, to reduce the data preprocessing time.
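As an illustration of the AIPP idea above, a static AIPP configuration passed to atc via --insert_op_conf can move the uint8-to-float conversion and the 1/255 normalization (and optionally a BGR-to-RGB channel swap) onto the NPU, so the host only needs to deliver a resized uint8 image. The snippet below is a sketch only; the exact field names and supported options should be checked against the AIPP documentation for the CANN version in use:
aipp_op {
    aipp_mode : static
    input_format : RGB888_U8
    src_image_size_w : 640
    src_image_size_h : 640
    csc_switch : false
    rbuv_swap_switch : false
    min_chn_0 : 0
    min_chn_1 : 0
    min_chn_2 : 0
    var_reci_chn_0 : 0.0039216
    var_reci_chn_1 : 0.0039216
    var_reci_chn_2 : 0.0039216
}
With such a configuration the conversion command would become something like atc --model=yolov7.onnx --framework=5 --output=yolov7 --soc_version=Ascend310 --insert_op_conf=aipp.cfg, and the host-side division by 255 and the float32 conversion in the preprocessing script could be dropped.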