TensorRT Inference

1. References

Step 1: Install TensorRT 8.0.0.3 + onnx 1.8.0 + onnx_tensorrt on Ubuntu 18.04
Step 2: Convert a PyTorch model to an ONNX model: steps and common pitfalls
Step 3: Import the ONNX model into TensorRT, build an optimized engine, and run inference on the GPU
Additional TensorRT resources
Understanding TensorRT in Depth (1): The TensorRT Python API Explained

2. Key Notes

  1. For every input tensor and output tensor, two buffers must be allocated: one in host memory (Host) and one in GPU memory (Device).
  2. Host memory is allocated with `pycuda.driver.cuda.pagelocked_empty(shape, dtype)`. `shape` is usually obtained via `trt.volume(engine.get_binding_shape(id))` and should be understood as the number of elements (not the size in bytes). `dtype` is the data type, given either as `np.float32` or as `trt.float32`.
  3. Device memory is allocated with `pycuda.driver.cuda.mem_alloc(buffer.nbytes)`, where `buffer` can be an ndarray or the result of the `pagelocked_empty()` call above.
  4. Data is copied from Host to Device with `pycuda.driver.cuda.memcpy_htod(dest, src)`, where `dest` is the result of `mem_alloc` and `src` is a numpy array or a `pagelocked_empty` buffer.
  5. Data is copied from Device to Host with `pycuda.driver.cuda.memcpy_dtoh(dest, src)`, where `dest` is a numpy array or a `pagelocked_empty` buffer and `src` is the result of `mem_alloc`.
  6. A binding can be thought of as a port representing an input tensor or an output tensor; a binding can be looked up by `id` or by `name`. Inference takes `bindings` as input, and each entry is a memory address, i.e. `int(buffer)`.
  7. `bindings` is an array containing the addresses of all input/output device buffers, obtained directly via `int(buffer)`, where `buffer` is the result of `mem_alloc` (a minimal sketch of this workflow follows the list).
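
A minimal sketch of this allocate/copy workflow (the element count, output size, and random data are illustrative placeholders, not values read from a real engine):

import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit  # creates and activates a CUDA context
import tensorrt as trt

# Illustrative element count and dtype; in practice use trt.volume(engine.get_binding_shape(id)).
size = 1 * 3 * 240 * 240
dtype = np.float32

host_in = cuda.pagelocked_empty(size, dtype)     # Host buffer (page-locked memory)
device_in = cuda.mem_alloc(host_in.nbytes)       # Device buffer (GPU memory)
host_out = cuda.pagelocked_empty(1000, dtype)    # e.g. a 1000-class output, illustrative
device_out = cuda.mem_alloc(host_out.nbytes)

host_in[:] = np.random.rand(size).astype(dtype)  # fill the host input buffer
cuda.memcpy_htod(device_in, host_in)             # Host -> Device

# bindings = list of device buffer addresses, passed to context.execute_* during inference
bindings = [int(device_in), int(device_out)]

# ... run inference here ...

cuda.memcpy_dtoh(host_out, device_out)           # Device -> Host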

3. Related Files

common.py

import time
import os
import argparse
import configs

import numpy as np
import pycuda.driver as cuda
import tensorrt as trt


try:
    # Sometimes python2 does not understand FileNotFoundError
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError

def GiB(val):
    return val * 1 << 30

def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=[]):
    '''
    Parses sample arguments.
    Args:
        description (str): Description of the sample.
        subfolder (str): The subfolder containing data relevant to this sample
        find_files (str): A list of filenames to find. Each filename will be replaced with an absolute path.
    Returns:
        str: Path of data directory.
    Raises:
        FileNotFoundError
    '''
    kDEFAULT_DATA_ROOT = os.path.abspath("/usr/src/tensorrt/data")

    # Standard command-line arguments for all samples.
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("-d", "--datadir", help="Location of the TensorRT sample data directory.")
    args, unknown_args = parser.parse_known_args()

    # If data directory is not specified, use the default.
    data_root = args.datadir if args.datadir else kDEFAULT_DATA_ROOT
    # If the subfolder exists, append it to the path, otherwise use the provided path as-is.
    subfolder_path = os.path.join(data_root, subfolder)
    if not os.path.exists(subfolder_path):
        print("WARNING: " + subfolder_path + " does not exist. Using " + data_root + " instead.")
    data_path = subfolder_path if os.path.exists(subfolder_path) else data_root

    # Make sure data directory exists.
    if not (os.path.exists(data_path)):
        raise FileNotFoundError(data_path + " does not exist. Please provide the correct data path with the -d option.")

    # Find all requested files.
    for index, f in enumerate(find_files):
        find_files[index] = os.path.abspath(os.path.join(data_path, f))
        if not os.path.exists(find_files[index]):
            raise FileNotFoundError(find_files[index] + " does not exist. Please provide the correct data path with the -d option.")
    if find_files:
        return data_path, find_files
    else:
        return data_path

# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()

# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine, context):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for i, binding in enumerate(engine):
        size = trt.volume(context.get_binding_shape(i))
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream

# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size):
    # Transfer input data to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference. The engines in this article are built with an explicit batch
    # dimension, so use execute_async_v2 (batch_size is kept only for signature compatibility).
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

onnx2trt.py

import numpy as np
import pycuda.driver as cudadriver
import tensorrt as trt
import torch
import os
import time
import common

from PIL import Image
import cv2
import torchvision

def ONNX_build_engine(onnx_file_path, write_engine=True):
    '''
    Build a TensorRT engine by parsing an ONNX file.
    :param onnx_file_path: path to the ONNX file
    :return: engine
    '''
    G_LOGGER = trt.Logger(trt.Logger.WARNING)
    # 1. Required for dynamic input: the network must be created with the EXPLICIT_BATCH flag
    explicit_batch = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    batch_size = 8  # maximum batch size supported during TensorRT inference
    with trt.Builder(G_LOGGER) as builder, builder.create_network(explicit_batch) as network, \
            trt.OnnxParser(network, G_LOGGER) as parser:
        builder.max_batch_size = batch_size
        config = builder.create_builder_config()
        config.max_workspace_size = common.GiB(2)  
        config.set_flag(trt.BuilderFlag.FP16)
        print('Loading ONNX file from path {}...'.format(onnx_file_path))
        with open(onnx_file_path, 'rb') as model:
            print('Beginning ONNX file parsing')
            if not parser.parse(model.read()):
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                raise RuntimeError('Failed to parse the ONNX file')
        print('Completed parsing of ONNX file')
        print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
        # Key step: create an optimization profile for the dynamic input,
        # specifying its minimum, optimal (typical), and maximum shapes.
        profile = builder.create_optimization_profile()
        # Call profile.set_shape once per dynamic input; the name must match the input name used when exporting to ONNX.
        # TensorRT 6 and later support dynamic input shapes: each dynamic input must be bound to a profile
        # that specifies its min, opt, and max shapes, and shapes outside this range raise an error.
        profile.set_shape("inputs", (1, 3, 240, 240), (8, 3, 240, 240), (16, 3, 480, 480))
        config.add_optimization_profile(profile)

        engine = builder.build_engine(network, config)
        print("Completed creating Engine")
        # Serialize and save the engine file
        if write_engine:
            engine_file_path = 'efficientnet_b1.trt'
            with open(engine_file_path, "wb") as f:
                f.write(engine.serialize())
        return engine

onnx_file_path = r'/path/to/efficientnet_b1.onnx'
write_engine = True
engine = ONNX_build_engine(onnx_file_path, write_engine)
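
Once the engine has been serialized, later runs can deserialize it directly instead of re-parsing the ONNX file. A minimal sketch, assuming the `efficientnet_b1.trt` file written above:

import tensorrt as trt

G_LOGGER = trt.Logger(trt.Logger.WARNING)

# Deserialize the engine written by ONNX_build_engine().
with open('efficientnet_b1.trt', 'rb') as f, trt.Runtime(G_LOGGER) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())

# Inspect the bindings (ports) of the engine: name, shape, and whether each is an input.
for i in range(engine.num_bindings):
    print(i, engine.get_binding_name(i), engine.get_binding_shape(i), engine.binding_is_input(i))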

trt_infer.py

import sys
sys.path.append('../')

import common
import cv2
import configs
import time

import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

TRT_LOGGER = trt.Logger()


# Softmax implemented with numpy
def softmax(out_np, dim):
    s_value = np.exp(out_np) / np.sum(np.exp(out_np), axis=dim, keepdims=True)
    return s_value


class FaceClassify(object):
    def __init__(self, configs):
        self.engine_path = configs.face_classify_engine
        self.input_size = configs.classify_input_size
        self.image_size = self.input_size[1:]
        self.MEAN = configs.classify_mean
        self.STD = configs.classify_std
        self.engine = self.get_engine()
        self.context = self.engine.create_execution_context()


    def get_engine(self):
        # If a serialized engine exists, use it instead of building an engine.
        with open(self.engine_path, 'rb') as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())


    def detect(self, image_src, cuda_ctx = pycuda.autoinit.context):
        cuda_ctx.push()

        IN_IMAGE_H, IN_IMAGE_W = self.image_size

        # Input
        img_in = cv2.cvtColor(image_src, cv2.COLOR_BGR2RGB)
        img_in = cv2.resize(img_in, (IN_IMAGE_W, IN_IMAGE_H), interpolation=cv2.INTER_LINEAR)

        img_in = np.transpose(img_in, (2, 0, 1)).astype(np.float32)  # (3, 240, 240)
        img_in /= 255.0  # normalize to [0, 1]

        # Normalize with the configured per-channel mean/std,
        # e.g. mean = (0.485, 0.456, 0.406), std = (0.229, 0.224, 0.225).
        mean = np.array(self.MEAN, dtype=np.float32).reshape(3, 1, 1)
        std = np.array(self.STD, dtype=np.float32).reshape(3, 1, 1)
        img_in = ((img_in - mean) / std).astype(np.float32)
        img_in = np.expand_dims(img_in, axis=0)  # (1, 3, 240, 240)

        img_in = np.ascontiguousarray(img_in)

        # Dynamic input: set the actual input shape on the execution context
        self.context.active_optimization_profile = 0
        origin_inputshape = self.context.get_binding_shape(0)
        origin_inputshape[0], origin_inputshape[1], origin_inputshape[2], origin_inputshape[3] = img_in.shape
        self.context.set_binding_shape(0, origin_inputshape)  # if input sizes differ between calls, update the context binding shape to match the current input

        inputs, outputs, bindings, stream = common.allocate_buffers(self.engine, self.context)
        # Do inference
        inputs[0].host = img_in
        trt_outputs = common.do_inference(self.context, bindings=bindings, inputs=inputs, outputs=outputs,
                                          stream=stream, batch_size=1)
        if cuda_ctx:
            cuda_ctx.pop()

        labels_sm = softmax(trt_outputs, dim=1)
        labels_max = np.argmax(labels_sm, axis=1)

        return labels_max.item()
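
A hypothetical usage example; it assumes the `configs` module imported above provides `face_classify_engine`, `classify_input_size`, `classify_mean`, and `classify_std` (the attribute names used in `__init__`), and the image path is illustrative only:

if __name__ == '__main__':
    classifier = FaceClassify(configs)
    image = cv2.imread('/path/to/face.jpg')  # BGR image read by OpenCV
    start = time.time()
    label = classifier.detect(image)
    print('predicted label: {}, inference time: {:.3f}s'.format(label, time.time() - start))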
