Getting Started with TensorRT

     After a neural network has been trained, TensorRT can compress and optimize it and deploy it at runtime without any framework overhead. TensorRT improves the network's latency, throughput, and efficiency by fusing layers, selecting optimized kernels, and, according to the precision you specify, performing normalization and conversion to optimized matrix-math routines.

The inference deployment workflow:

  • Install the TensorRT-related packages (tensorrt, pycuda); the easiest route is a ready-to-use image, for example an NVIDIA NGC TensorRT container.

  • Convert the ONNX model into engine format, for example with trtexec (a Python builder-API alternative is sketched after this list):
trtexec --onnx=yolov5.onnx --saveEngine=yolov5.engine
  • Write the inference code (detailed below).
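
As an alternative to the trtexec command line, the same ONNX-to-engine conversion can be done from Python with the TensorRT builder API. This is a minimal sketch assuming TensorRT 8.x; the file paths are placeholders and the FP16 flag is optional:

import tensorrt as trt

ONNX_PATH = './yolov5.onnx'      # placeholder paths
ENGINE_PATH = './yolov5.engine'

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
# Explicit-batch network definition, as required for ONNX models.
network = builder.create_network(
    1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

with open(ONNX_PATH, 'rb') as f:
    if not parser.parse(f.read()):
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        raise RuntimeError('failed to parse the ONNX model')

config = builder.create_builder_config()
config.set_flag(trt.BuilderFlag.FP16)  # optional: build with FP16 precision

# Serialize the optimized engine and write it to disk.
serialized_engine = builder.build_serialized_network(network, config)
with open(ENGINE_PATH, 'wb') as f:
    f.write(serialized_engine)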

Imports

import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # initializes the CUDA context

Load the engine and print its binding information

TENSORRT_ENGINE_PATH_PY = './yolov5.engine'
# If a serialized engine exists, use it instead of building an engine.
logger = trt.Logger(trt.Logger.WARNING)
with open(TENSORRT_ENGINE_PATH_PY, "rb") as f, trt.Runtime(logger) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# Iterate over the engine's I/O bindings and print their shapes, names, and dtypes.
for binding in engine:
    size = trt.volume(engine.get_binding_shape(binding)) * 1  # * 1: batch size of 1
    dims = engine.get_binding_shape(binding)
    print(size)
    print(dims)
    print(binding)
    print(engine.binding_is_input(binding))
    dtype = trt.nptype(engine.get_binding_dtype(binding))
    print("dtype = ", dtype)
context = engine.create_execution_context()

Allocate buffers, feed the input data, and run inference

# with get_engine("net.engine") as engine, engine.create_execution_context() as context:
# Allocate page-locked host buffers sized from the input/output bindings.
h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)   # data fed to the model
h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)  # model output

# Allocate device memory for inputs and outputs.
d_input = cuda.mem_alloc(h_input.nbytes)
d_output = cuda.mem_alloc(h_output.nbytes)
# Create a stream in which to copy inputs/outputs and run inference.
stream = cuda.Stream()

# Set the host input data (inputs[0] is your preprocessed image tensor).
np.copyto(h_input, inputs[0].ravel())

# Transfer input data to the GPU.
cuda.memcpy_htod_async(d_input, h_input, stream)
# Run inference.
context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
# Transfer predictions back from the GPU.
cuda.memcpy_dtoh_async(h_output, d_output, stream)
# Synchronize the stream so h_output is valid before reading it.
stream.synchronize()

h_output now holds the raw model output; post-processing is applied to it afterwards, as sketched below.
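
For a standard 640×640 YOLOv5 export, h_output is a flat array that can be reshaped back to the output binding's shape. The sketch below assumes an output of shape (1, 25200, 85) (x, y, w, h, objectness, 80 class scores) and only applies a confidence threshold; a complete pipeline would also run NMS:

import numpy as np

# Assumed output shape for a 640x640 YOLOv5 model; read the real shape from
# engine.get_binding_shape(1) rather than hard-coding it.
pred = h_output.reshape(1, 25200, 85)[0]

conf_thres = 0.25
scores = pred[:, 4] * pred[:, 5:].max(axis=1)   # objectness * best class score
keep = scores > conf_thres

boxes = pred[keep, :4]                          # (cx, cy, w, h) in input-image pixels
class_ids = pred[keep, 5:].argmax(axis=1)
print(f'{keep.sum()} candidate boxes above the threshold (NMS still needed)')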

Full code (this example loads a generic classification engine, ./net.engine, and applies softmax to its output):

#!/usr/bin/env python

# -*- coding:utf-8 -*-


import os.path as osp
import os
import sys
# sys.path.append(os.getcwd())
sys.path.append('./')


import numpy as np
from scipy.special import softmax
import pandas as pd
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit




def main():

    TENSORRT_ENGINE_PATH_PY = './net.engine'
    # If a serialized engine exists, use it instead of building an engine.
    logger = trt.Logger(trt.Logger.WARNING)
    with open(TENSORRT_ENGINE_PATH_PY, "rb") as f, trt.Runtime(logger) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())

    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * 1
        dims = engine.get_binding_shape(binding)
        print(size)
        print(dims)
        print(binding)
        print(engine.binding_is_input(binding))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        print("dtype = ", dtype)
    context = engine.create_execution_context()


    input_file='./data/'
    


    # Collect the image files (.jpg / .png) under the input path
    files = []
    if osp.isfile(input_file):
        files = [input_file]
    else:
        files = os.listdir(input_file)
        files = [
            file for file in files
            if (file.endswith(".jpg") or file.endswith(".png"))
        ]
        files = [osp.join(input_file, file) for file in files]

    batch_num = 1
    for st_idx in range(0, len(files), batch_num):
        # Exclusive end index of the current batch slice: files[st_idx:ed_idx]
        ed_idx = min(st_idx + batch_num, len(files))
        # with get_engine("net.engine") as engine, engine.create_execution_context() as context:
        h_input = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(0)), dtype=np.float32)
        h_output = cuda.pagelocked_empty(trt.volume(context.get_binding_shape(1)), dtype=np.float32)

        # Allocate device memory for inputs and outputs.
        d_input = cuda.mem_alloc(h_input.nbytes)
        d_output = cuda.mem_alloc(h_output.nbytes)
        # Create a stream in which to copy inputs/outputs and run inference.
        stream = cuda.Stream()


        # set the host input data
        # h_input = img_in
        inputs = [...]  # placeholder: the preprocessed data you feed into the model
        np.copyto(h_input, inputs[0].ravel())
        # np.copyto(h_input, img_in.unsqueeze_(0))

        # print(h_input)
        # Transfer input data to the GPU.
        cuda.memcpy_htod_async(d_input, h_input, stream)
        # Run inference.
        context.execute_async_v2(bindings=[int(d_input), int(d_output)], stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(h_output, d_output, stream)
        # Synchronize the stream
        stream.synchronize()


       # results = net_model([inputs[0]])[output_layer]

        # print(" outputs:")
        # print(np.array(results))
        batched_outputs = [h_output]
        for i in range(batch_num):
            output = softmax(batched_outputs[i])
            predict_cla = np.argsort(-output)[0]
            pre_probability = output[predict_cla]
            # print(f'\n file: {batch_file_name[i]},pre_class:{predict_cla},probability:{pre_probability}\n')
 

   
    print("finish!")







if __name__ == '__main__':
    main()
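
The inputs = [...] placeholder above stands for your preprocessed data. Below is a minimal sketch of a hypothetical preprocess helper, assuming OpenCV is installed and the model takes an RGB image normalized to [0, 1]; adjust input_size and the normalization to match your network:

import cv2
import numpy as np

def preprocess(image_path, input_size=224):
    # Load an image and convert it to the NCHW float32 tensor the engine expects.
    img = cv2.imread(image_path)                    # BGR, HWC, uint8
    img = cv2.resize(img, (input_size, input_size))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.astype(np.float32) / 255.0            # scale to [0, 1]
    img = img.transpose(2, 0, 1)[None]              # HWC -> NCHW with batch dim
    return np.ascontiguousarray(img)

# e.g. inside the batch loop:
# inputs = [preprocess(f) for f in files[st_idx:ed_idx]]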


