song@song-Mi-Gaming-Laptop-15-6:~$ cd onnx2trt
song@song-Mi-Gaming-Laptop-15-6:~/onnx2trt$ python setup6.py
[TensorRT] WARNING: onnx2trt_utils.cpp:220: Your ONNX model has been generated with INT64 weights, while TensorRT does not natively support INT64. Attempting to cast down to INT32.
[TensorRT] ERROR: x: dynamic input is missing dimensions in profile 0.
[TensorRT] ERROR: Network validation failed.
engine built successfully!
Traceback (most recent call last):
  File "setup6.py", line 160, in <module>
    f.write(trt_engine.serialize())
AttributeError: 'NoneType' object has no attribute 'serialize'
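The log shows the real failure happening inside build_engine: TensorRT rejects the network because the dynamic input named "x" is not covered by optimization profile 0, so build_engine returns None. The script still prints "engine built successfully!" and then crashes when it calls serialize() on that None. The profile in the code below registers its shapes under the name "the_input", which suggests a mismatch with the model's actual input tensor name. A quick way to confirm the real name (a minimal sketch, assuming the onnx Python package is installed):

import onnx
model = onnx.load('cls.onnx')
print([inp.name for inp in model.graph.input])  # expected to print something like ['x']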
# -*- coding:UTF-8 -*-
# @Time : 2021/5/24
# @Author : favorxin
# @Func   : Inference code that uses TensorRT to accelerate an ONNX model
import os
import sys
import time
import math
import copy
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda
from PIL import Image
### Standard TensorRT setup: logger and explicit-batch flag
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
device = 'cuda:0'
### Convert a size in GiB to bytes (val * 2**30)
def GiB(val):
    return val * 1 << 30
### Read the ONNX model and build the engine
def build_engine(onnx_path, using_half, engine_file, dynamic_input=True):
    trt.init_libnvinfer_plugins(None, '')
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_batch_size = 1  # always 1 for explicit batch
        config = builder.create_builder_config()
        config.max_workspace_size = GiB(4)  # allow up to 4 GiB of workspace while building the engine
        if using_half:
            config.set_flag(trt.BuilderFlag.FP16)  # half precision (FP16)
        # Load the ONNX model and parse it in order to populate the TensorRT network.
        with open(onnx_path, 'rb') as model:
            if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return None
        ### For dynamic input, the optimization profile needs three shapes: minimum, optimal, and maximum, in that order.
        ### Keep the (batch_size, channel, height, width) order; putting the channel dimension last did not work for me and breaks dynamic input.
        if dynamic_input:
            profile = builder.create_optimization_profile()
            profile.set_shape("the_input", (1, 1, 32, 80), (1, 1, 32, 148), (1, 1, 32, 250))
            config.add_optimization_profile(profile)
        return builder.build_engine(network, config)
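# NOTE (hedged): the error log above ("x: dynamic input is missing dimensions in profile 0") suggests
# that the tensor name passed to set_shape() must match the ONNX model's actual input name, which
# appears to be "x" rather than "the_input" for this model. A sketch that avoids hard-coding the name,
# assuming the network has a single input:
#     input_name = network.get_input(0).name
#     profile.set_shape(input_name, (1, 1, 32, 80), (1, 1, 32, 148), (1, 1, 32, 250))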
### Allocate host and device memory for the inputs and outputs
def allocate_buffers(engine, is_explicit_batch=False, input_shape=None, output_shape=18):
    inputs = []
    outputs = []
    bindings = []
    class HostDeviceMem(object):
        def __init__(self, host_mem, device_mem):
            self.host = host_mem
            self.device = device_mem
        def __str__(self):
            return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
        def __repr__(self):
            return self.__str__()
    for binding in engine:
        dims = engine.get_binding_shape(binding)
        ### Fix the dynamic input/output dimensions here so the buffer sizes can be computed.
        if dims[-1] == -1 and len(dims) == 4:
            assert (input_shape is not None)
            dims[-1] = input_shape
        elif dims[-2] == -1 and len(dims) == 3:
            assert (output_shape is not None)
            dims[-2] = output_shape
        size = trt.volume(dims) * engine.max_batch_size  # maximum batch size needed for inference
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate page-locked host memory and matching device memory
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings
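# Example (hedged, based on the profile above): an input binding reported as (1, 1, 32, -1) is fixed to
# (1, 1, 32, input_shape), and a 3-D output binding reported as (1, -1, C) is fixed to (1, output_shape, C),
# so trt.volume() can compute a concrete buffer size for both.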
### Preprocess the input image; returns the preprocessed image and its width after preprocessing.
def preprocess_image(imagepath):
    img = Image.open(imagepath)
    img = img.convert('L')
    width, height = img.size[0], img.size[1]
    scale = height * 1.0 / 32
    new_width = int(width / scale)
    img = img.resize([new_width, 32], Image.ANTIALIAS)
    img = np.array(img).astype(np.float32) / 255.0 - 0.5
    X = img.reshape([1, 1, 32, new_width])
    return X, new_width
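# Example: a 100x40 (width x height) image gives scale = 40 / 32 = 1.25 and new_width = 80, so the
# network input becomes a (1, 1, 32, 80) float32 array. Note that Image.ANTIALIAS was removed in
# Pillow 10; on newer Pillow versions use Image.LANCZOS instead.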
### Compute the final model output size from the input image width. (My model has dynamic input and
### dynamic output, so the output size must be computed in order to allocate the output buffer.)
def compute_out_shape(input_shape):
    x2 = input_shape
    x2_ft = math.floor((x2 - 5 + 2 * 2) / 2) + 1
    for i in range(2):
        x2_ft = int(x2_ft / 2) if x2_ft % 2 == 0 else int((x2_ft - 1) / 2)
    return x2_ft
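# Worked example: for input width 80, math.floor((80 - 5 + 2 * 2) / 2) + 1 = 40, and the two halvings
# give 40 -> 20 -> 10, so a 32x80 input yields an output length of 10. The formula mirrors a stride-2
# 5x5 convolution with padding 2 followed by two stride-2 downsamplings (an assumption about the model).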
### Run inference with the engine
def profile_trt(engine, imagepath, batch_size):
    assert (engine is not None)
    ### Determine the output size so that the input and output buffers can be allocated.
    input_image, input_shape = preprocess_image(imagepath)
    output_shape = compute_out_shape(input_shape)
    segment_inputs, segment_outputs, segment_bindings = allocate_buffers(engine, True, input_shape, output_shape)
    stream = cuda.Stream()
    with engine.create_execution_context() as context:
        context.active_optimization_profile = 0
        origin_inputshape = context.get_binding_shape(0)
        # The input image width is dynamic here, so the last dimension is -1 and is fixed from the actual image size.
        if (origin_inputshape[-1] == -1 and len(origin_inputshape) == 4):
            origin_inputshape[-1] = input_shape
            context.set_binding_shape(0, (origin_inputshape))
        # The model output is dynamic in its second-to-last dimension, which is determined by the input image width.
        elif (origin_inputshape[-2] == -1 and len(origin_inputshape) == 3):
            origin_inputshape[-2] = output_shape
            context.set_binding_shape(0, (origin_inputshape))
        segment_inputs[0].host = input_image
        start_time = time.time()
        [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in segment_inputs]
        context.execute_async(bindings=segment_bindings, stream_handle=stream.handle)
        [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in segment_outputs]
        stream.synchronize()
        use_time = time.time() - start_time
        infer_out = [out.host for out in segment_outputs]
        results = infer_out[0]
    return results, use_time
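# NOTE (hedged): with an explicit-batch engine, execute_async() is deprecated in recent TensorRT
# releases; the equivalent call is
#     context.execute_async_v2(bindings=segment_bindings, stream_handle=stream.handle)
# which simply drops the implicit batch_size argument and runs the same inference.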
if __name__ == '__main__':
    onnx_path = 'cls.onnx'
    usinghalf = True
    batch_size = 1
    imagepath = './data/num_5_true.bmp'
    engine_file = 'cls.engine'
    init_engine = True
    load_engine = True
    ### Build the engine from the ONNX model. This step is fairly slow, so the engine is normally
    ### saved to disk to make later inference convenient.
    if init_engine:
        trt_engine = build_engine(onnx_path, usinghalf, engine_file, dynamic_input=True)
        if trt_engine is None:
            sys.exit('ERROR: engine build failed, see the TensorRT errors above.')
        print('engine built successfully!')
        with open(engine_file, "wb") as f:
            f.write(trt_engine.serialize())
        print('save engine successfully')
    ### Run inference with the engine created above. For routine inference init_engine can be set to
    ### False, because the engine has already been saved.
    if load_engine:
        trt.init_libnvinfer_plugins(None, '')
        with open(engine_file, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            trt_engine = runtime.deserialize_cuda_engine(f.read())
        trt_result, use_time = profile_trt(trt_engine, imagepath, batch_size)
        print(trt_result)