After a neural network has been trained, TensorRT can compress it, optimize it, and deploy it at runtime without the overhead of the training framework. TensorRT improves a network's latency, throughput, and efficiency by fusing layers, selecting optimized kernels, and performing normalization and conversion to optimized matrix math according to the specified precision.
Inference deployment workflow. First, convert the ONNX model into a serialized TensorRT engine with trtexec:
trtexec --onnx=yolov5.onnx --saveEngine=yolov5.engine
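If you prefer to stay in Python, the same ONNX-to-engine conversion can be done with the TensorRT builder API instead of trtexec. The following is a minimal sketch, assuming TensorRT 8.x (older releases use config.max_workspace_size instead of set_memory_pool_limit) and omitting error handling beyond the parser check:

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)
with open("yolov5.onnx", "rb") as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError("failed to parse the ONNX model")
config = builder.create_builder_config()
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1 GiB of build workspace
# config.set_flag(trt.BuilderFlag.FP16)  # optionally build a lower-precision engine
serialized_engine = builder.build_serialized_network(network, config)
with open("yolov5.engine", "wb") as f:
    f.write(serialized_engine)

trtexec also accepts a --fp16 flag if you want a reduced-precision engine from the command line.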
Imports:
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # creates and manages a CUDA context automatically
Load the engine and print its binding information:
TENSORRT_ENGINE_PATH_PY = './yolov5.engine'
# If a serialized engine exists, use it instead of building an engine.
logger = trt.Logger(trt.Logger.WARNING)
with open(TENSORRT_ENGINE_PATH_PY, "rb") as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

for binding in engine:
    size = trt.volume(engine.get_binding_shape(binding)) * 1
    dims = engine.get_binding_shape(binding)
    print(size)
    print(dims)
    print(binding)
    print(engine.binding_is_input(binding))
    dtype = trt.nptype(engine.get_binding_dtype(binding))
    print("dtype = ", dtype)

context = engine.create_execution_context()
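Note that the binding-based getters used above (get_binding_shape, binding_is_input, get_binding_dtype) are deprecated in newer TensorRT releases (8.5 and later) in favor of the tensor API. A rough equivalent of the inspection loop under that API, as a sketch, would be:

for i in range(engine.num_io_tensors):
    name = engine.get_tensor_name(i)
    shape = engine.get_tensor_shape(name)
    dtype = trt.nptype(engine.get_tensor_dtype(name))
    mode = engine.get_tensor_mode(name)  # trt.TensorIOMode.INPUT or OUTPUT
    print(name, shape, dtype, mode)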
Feed the input data and run inference:
# with get_engine("net.engine") as engine, engine.create_execution_context() as context:
h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)  # host buffer for the model input
h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)  # host buffer for the model output
# Allocate device memory for inputs and outputs.
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
# Create a stream in which to copy inputs/outputs and run inference.
stream = cuda.Stream()
# set the host input data
np.copyto(h_input, inputs[0].ravel())
# np.copyto(h_input, img_in.unsqueeze_(0))
# print(h_input)
# Transfer input data to the GPU.
cuda.memcpy_htod_async(d_input, h_input, stream)
# Run inference.
context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
# Transfer predictions back from the GPU.
cuda.memcpy_dtoh_async(h_output, d_output, stream)
# Synchronize the stream
stream.synchronize()
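The snippet above assumes inputs[0] already holds a preprocessed image. For a YOLOv5-style engine the input is typically a 1x3x640x640 float32 tensor in RGB, scaled to [0, 1]. One possible preprocessing sketch with OpenCV (preprocess is just an illustrative helper, the 640x640 size is an assumption about your export, and a plain resize is used instead of YOLOv5's letterbox padding for brevity):

import cv2
import numpy as np

def preprocess(image_path, input_size=(640, 640)):
    # Read a BGR image, resize to the network input size, convert to normalized CHW float32.
    img = cv2.imread(image_path)
    img = cv2.resize(img, input_size)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    img = np.transpose(img, (2, 0, 1))            # HWC -> CHW
    return np.ascontiguousarray(img[None, ...])   # add the batch dimension

inputs = [preprocess('test.jpg')]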
h_output now holds the raw output of the model; any post-processing is applied to this array afterwards.
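What that post-processing looks like depends on the model. For a standard 80-class YOLOv5 export, h_output can usually be reshaped to (num_boxes, 85), where each row holds cx, cy, w, h, an objectness score, and 80 class scores. A hedged filtering sketch (the 85-column layout is an assumption about your export, and non-maximum suppression is omitted):

pred = h_output.reshape(-1, 85)               # (num_boxes, 4 box coords + 1 objectness + 80 classes)
conf = pred[:, 4] * pred[:, 5:].max(axis=1)   # objectness * best class score
keep = conf > 0.25                            # confidence threshold
boxes = pred[keep, :4]                        # cx, cy, w, h in network-input pixels
classes = pred[keep, 5:].argmax(axis=1)
# Apply non-maximum suppression and rescale boxes to the original image size here.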
Complete code:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import os.path as osp
import os
import sys
# sys.path.append(os.getcwd())
sys.path.append('./')
import numpy as np
from scipy.special import softmax
import pandas as pd
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
def main():
    TENSORRT_ENGINE_PATH_PY = './net.engine'
    # If a serialized engine exists, use it instead of building an engine.
    logger = trt.Logger(trt.Logger.WARNING)
    with open(TENSORRT_ENGINE_PATH_PY, "rb") as f, trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * 1  # * batch size (1 here)
        dims = engine.get_binding_shape(binding)
        print(size)
        print(dims)
        print(binding)
        print(engine.binding_is_input(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        print("dtype = ", dtype)

    context = engine.create_execution_context()

    input_file = './data/'
    # Collect the image files under the input directory.
    files = []
    if osp.isfile(input_file):
        files = [input_file]
    else:
        files = os.listdir(input_file)
        files = [
            file for file in files
            if (file.endswith(".jpg") or file.endswith(".png"))
        ]
        files = [osp.join(input_file, file) for file in files]

    batch_num = 1
    for st_idx in range(0, len(files), batch_num):
        ed_idx = min(st_idx + batch_num, len(files))  # exclusive end index of this batch
        # with get_engine("net.engine") as engine, engine.create_execution_context() as context:
        h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)
        h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)
        # Allocate device memory for inputs and outputs.
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference.
        stream = cuda.Stream()
        # Set the host input data.
        # h_input = img_in
        inputs = [....]  # the data you want to feed to the model
        np.copyto(h_input, inputs.ravel())
        # np.copyto(h_input, img_in.unsqueeze_(0))
        # print(h_input)
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input, h_input, stream)
        # Run inference.
        context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream.
        stream.synchronize()
        # results = net_model([inputs[0]])[output_layer]
        # print(" outputs:")
        # print(np.array(results))
        batched_outputs = [h_output]
        for i in range(batch_num):
            output = softmax(batched_outputs[i])
            predict_cla = np.argsort(-output)[0]
            pre_probability = output[predict_cla]
            # print(f'\n file: {batch_file_name[i]}, pre_class: {predict_cla}, probability: {pre_probability}\n')
    print("finish!")


if __name__ == '__main__':
    main()
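One design note on the complete code above: the pinned host buffers, device buffers, and CUDA stream are created inside the batch loop, so they are re-allocated for every image even though the binding shapes never change. They can be allocated once before the loop and reused. A sketch of such a split (allocate_buffers and do_inference are illustrative helper names, not TensorRT APIs):

def allocate_buffers(context):
    # Allocate pinned host buffers and device buffers once; reuse them for every batch.
    h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)
    h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)
    d_input = cuda.mem_alloc(h_input.nbytes)
    d_output = cuda.mem_alloc(h_output.nbytes)
    stream = cuda.Stream()
    return h_input, h_output, d_input, d_output, stream

def do_inference(context, h_input, h_output, d_input, d_output, stream):
    # Copy the input to the GPU, run the engine asynchronously, copy the result back.
    cuda.memcpy_htod_async(d_input, h_input, stream)
    context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(h_output, d_output, stream)
    stream.synchronize()
    return h_output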