Notes on TensorRT Pitfalls

1. Download the matching packages

Check the CUDA version with nvcc -V; mine is 10.2.
Check the cuDNN version: cat /usr/local/cuda/include/cudnn.h | grep CUDNN_MAJOR -A 2
Mine is cuDNN 7.5.0.
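Note: from cuDNN 8 onward the version macros live in cudnn_version.h instead of cudnn.h, so after the upgrade described below the equivalent check is:

cat /usr/local/cuda/include/cudnn_version.h | grep CUDNN_MAJOR -A 2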

Download the TensorRT package from the official site, choosing the build that matches your OS, CUDA, and cuDNN versions:
https://developer.nvidia.cn/nvidia-tensorrt-8x-download

Since I need TensorRT 8.x, which requires cuDNN 8.2 at minimum, I also have to download a newer cuDNN:
https://developer.nvidia.com/rdp/cudnn-archive
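For a user-local install of the cuDNN tarball, a minimal sketch (the archive name below is an assumption; use the file you actually downloaded) that produces the directory referenced by the exports later:

tar -xzvf cudnn-10.2-linux-x64-v8.2.1.32.tgz  # assumed archive name
mkdir -p /home/work/guopei/cuda
cp -r cuda/include cuda/lib64 /home/work/guopei/cuda/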

2. Environment setup

cd TensorRT-8.0.1.6/python/
pip install tensorrt-8.0.1.6-cp37-none-linux_x86_64.whl # pick the wheel matching your Python version
Add cuDNN to the runtime library path:

export LD_LIBRARY_PATH=/home/work/guopei/cuda/lib64:$LD_LIBRARY_PATH

Add TensorRT to the runtime library path:

export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/work/guopei/TensorRT-8.0.1.6/lib

Add the bin directory to PATH to make trtexec available:
export PATH=/home/work/guopei/TensorRT-8.0.1.6/bin:$PATH
trtexec --onnx=resnet50/model.onnx --saveEngine=resnet_engine.trt --explicitBatch --device=5

Note: before converting ONNX to TRT, it is best to simplify the model first:
python -m onnxsim test1.onnx test2.onnx
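The onnxsim module is provided by the onnx-simplifier package; if the command above fails with a missing-module error, install it first:

pip install onnx-simplifier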

3. Test the installation

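A quick sanity check, as a minimal sketch: import the Python packages and create a Builder. If the LD_LIBRARY_PATH exports above are wrong, the import or the Builder construction fails with a library-loading error.

import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # initializes CUDA and creates a default context

print(trt.__version__)        # expect 8.0.1.6
print(cuda.Device(0).name())  # the GPU that inference will run on
builder = trt.Builder(trt.Logger(trt.Logger.WARNING))  # succeeds only if libnvinfer resolves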

To inspect an ONNX model's structure, use Netron: https://lutzroeder.github.io/netron/

4. Code example

import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import numpy as np

EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
def GiB(val):
    return val * (1 << 30)  # size of `val` gibibytes in bytes

def ONNX_to_TRT(onnx_model_path=None, trt_engine_path=None, fp16_mode=False):
    """
    TensorRT V8 only.
    Builds a cudaEngine and saves the serialized engine file (fixed input shapes only).

    fp16_mode: if True, build the engine with fp16 enabled
    onnx_model_path: path of the ONNX weights to load
    trt_engine_path: path where the TRT engine file is saved
    """
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(EXPLICIT_BATCH)
    parser = trt.OnnxParser(network, TRT_LOGGER)
    
    config = builder.create_builder_config()
    config.max_workspace_size = GiB(1)
    if fp16_mode:
        config.set_flag(trt.BuilderFlag.FP16)
    with open(onnx_model_path, 'rb') as model:
        assert parser.parse(model.read()), 'failed to parse the ONNX model'
    serialized_engine = builder.build_serialized_network(network, config)

    with open(trt_engine_path, 'wb') as f:
        f.write(serialized_engine)  # write the serialized engine to disk

    print('TensorRT engine saved to ' + trt_engine_path)
    print('============ONNX->TensorRT SUCCESS============')

class TrtModel():
    '''
    TensorRT inference wrapper
    '''
    def __init__(self, trt_path):
        self.ctx = cuda.Device(0).make_context()  # dedicated CUDA context; popped again in destroy()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(trt_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('binding:', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size  # max_batch_size is 1 for explicit-batch engines
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size
    
    def __call__(self, img_np_nchw):
        '''
        Run TensorRT inference.
        :param img_np_nchw: input image as an NCHW numpy array
        '''
        self.ctx.push()

        # Restore
        stream = self.stream
        context = self.context
        engine = self.engine
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings

        np.copyto(host_inputs[0], img_np_nchw.ravel())
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # the engine was built with EXPLICIT_BATCH, so use the v2 execution API
        context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        stream.synchronize()
        self.ctx.pop()
        return host_outputs[0]


    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
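A minimal usage sketch of the two pieces above (the paths and the 1x3x640x640 input shape are placeholders, not values from a real engine):

ONNX_to_TRT('model.onnx', 'model.trt', fp16_mode=True)  # one-off conversion
model = TrtModel('model.trt')
dummy = np.random.randn(1, 3, 640, 640).astype(np.float32)  # must match the engine's fixed input shape
out = model(dummy)  # returns a flat numpy array; reshape to the model's output shape
model.destroy()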

yolov5-face test code:

import os
import sys
import cv2
import copy
import torch
import argparse
root_path=os.path.dirname(os.path.abspath(os.path.dirname(__file__))) # project root: the parent of this file's directory
sys.path.append(root_path)  # add the project root to sys.path
from utils.general import check_img_size,non_max_suppression_face,scale_coords,xyxy2xywh
from utils.datasets import letterbox
from detect_face import scale_coords_landmarks,show_results
from torch2trt.trt_model import TrtModel
cur_path=os.path.abspath(os.path.dirname(__file__))


def img_process(img_path,long_side=512,stride_max=32):
    '''
    Image preprocessing
    '''
    orgimg=cv2.imread(img_path)
    img0 = copy.deepcopy(orgimg)
    h0, w0 = orgimg.shape[:2]  # orig hw
    r = long_side/ max(h0, w0)  # resize image to img_size
    if r != 1:  # always resize down, only resize up if training with augmentation
        interp = cv2.INTER_AREA if r < 1 else cv2.INTER_LINEAR
        img0 = cv2.resize(img0, (int(w0 * r), int(h0 * r)), interpolation=interp)

    imgsz = check_img_size(long_side, s=stride_max)  # check img_size

    img = letterbox(img0, new_shape=imgsz,auto=False)[0] # auto=True: minimal padded rectangle; auto=False: fixed square size
    # Convert
    img = img[:, :, ::-1].transpose(2, 0, 1).copy()  # BGR to RGB, HWC to CHW
    img = torch.from_numpy(img)
    img = img.float()  # uint8 to fp16/32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)
    return img,orgimg

def img_vis(img,orgimg,pred,vis_thres = 0.3):
    '''
    Visualize predictions.
    vis_thres: confidence threshold for drawing a detection
    '''

    print('img.shape: ', img.shape)
    print('orgimg.shape: ', orgimg.shape)

    no_vis_nums=0
    # Process detections
    for i, det in enumerate(pred):  # detections per image
        gn = torch.tensor(orgimg.shape)[[1, 0, 1, 0]]  # normalization gain whwh
        gn_lks = torch.tensor(orgimg.shape)[[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]]  # normalization gain landmarks
        if len(det):
            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], orgimg.shape).round()

            # Print results
            for c in det[:, -1].unique():
                n = (det[:, -1] == c).sum()  # detections per class

            det[:, 5:15] = scale_coords_landmarks(img.shape[2:], det[:, 5:15], orgimg.shape).round()

            for j in range(det.size()[0]):
                if det[j, 4].cpu().numpy() < vis_thres:
                    no_vis_nums+=1
                    continue

                xywh = (xyxy2xywh(det[j, :4].view(1, 4)) / gn).view(-1).tolist()
                conf = det[j, 4].cpu().numpy()
                landmarks = (det[j, 5:15].view(1, 10) / gn_lks).view(-1).tolist()
                class_num = det[j, 15].cpu().numpy()
                orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)

    cv2.imwrite(cur_path+'/result.jpg', orgimg)
    print('result save in '+cur_path+'/result.jpg')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--img_path', type=str, default=cur_path+"/test.jpg", help='img path') 
    parser.add_argument('--trt_path', type=str, required=True, help='trt_path') 
    parser.add_argument('--output_shape', type=int, nargs=3, default=[1,25200,16], help='input[1,3,640,640] ->  output[1,25200,16]') 
    opt = parser.parse_args()


    img,orgimg=img_process(opt.img_path) 
    model=TrtModel(opt.trt_path)
    # pred=model(img.numpy()).reshape(opt.output_shape) # forward
    pred=model(img.numpy()).reshape([1, 16128, 16]) # forward; [1,16128,16] matches the 512x512 input from img_process (the default output_shape assumes a 640x640 input)
    model.destroy()

    # Apply NMS
    pred = non_max_suppression_face(torch.from_numpy(pred), conf_thres=0.3, iou_thres=0.5)

    # ============ Visualization ============
    img_vis(img,orgimg,pred)
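Assuming the script is saved as torch2trt/test.py inside the yolov5-face repo (a guess based on the imports above), a typical invocation is:

python torch2trt/test.py --trt_path yolov5-face.trt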
