第一步:ubuntu18.04装TensorRT8.0.0.3 + onnx1.8.0 + onnx_tensorrt
第二步:pytorch模型转onnx模型步骤和可能遇到的问题
第三步:onnx模型导入tensorrt生成优化engine + 在GPU上推理
Additional TensorRT resources
深入理解 TensorRT (1) TensorRT Python API 详解
- `pycuda.driver.pagelocked_empty(shape, dtype)`(下文代码中 `import pycuda.driver as cuda`,即 `cuda.pagelocked_empty`):shape 一般通过 `trt.volume(engine.get_binding_shape(id))` 得到,可以理解为元素数量(而不是内存大小)。dtype 就是数据类型,可以写成 `np.float32` 或 `trt.float32` 的形式(下文代码用 `trt.nptype(...)` 把 TensorRT 类型转换成 numpy 类型)。
- `pycuda.driver.mem_alloc(buffer.nbytes)`:buffer 可以是 ndarray,也可以是前面 `pagelocked_empty()` 的结果。
- `pycuda.driver.memcpy_htod(dest, src)`:dest 是 `mem_alloc` 的结果,src 是 numpy / `pagelocked_empty` 的结果。
- `pycuda.driver.memcpy_dtoh(dest, src)`:dest 是 numpy / `pagelocked_empty` 的结果,src 是 `mem_alloc` 的结果。
- input tensor 与 output tensor 都可以通过 id 或 name 获取对应的 binding。在模型推理过程中,需要以 bindings 作为输入,其具体数值为内存地址,即 `int(buffer)`,其中 buffer 就是 `mem_alloc` 的结果。
import time
import os
import argparse
import configs
import numpy as np
import pycuda.driver as cuda
import tensorrt as trt
# Compatibility shim: Python 2 has no built-in FileNotFoundError, so probing
# the name raises NameError there and it is aliased to IOError instead.
try:
    FileNotFoundError
except NameError:
    FileNotFoundError = IOError
def GiB(val):
    '''
    Converts a size expressed in gibibytes to a number of bytes.
    Args:
        val (int or float): Size in GiB. Fractional values (e.g. 0.5) are accepted.
    Returns:
        int: val * 2**30, truncated toward zero for fractional inputs.
    '''
    # The parentheses matter: `val * 1 << 30` parses as `(val * 1) << 30`,
    # which raises TypeError for floats because `<<` is integer-only.
    return int(val * (1 << 30))
def find_sample_data(description="Runs a TensorRT Python sample", subfolder="", find_files=None):
    '''
    Parses the sample's command-line arguments and locates its data directory.
    Args:
        description (str): Description of the sample, shown in the argparse help.
        subfolder (str): The subfolder containing data relevant to this sample.
        find_files (list of str): Filenames to find. Each entry is replaced
            in place with its absolute path. Defaults to no files.
    Returns:
        str: Path of the data directory, when find_files is empty or omitted.
        tuple: (data directory path, find_files) when find_files is non-empty.
    Raises:
        FileNotFoundError: If the data directory or a requested file does not exist.
    '''
    # A None default avoids sharing one mutable list object across calls.
    if find_files is None:
        find_files = []

    default_data_root = os.path.abspath("/usr/src/tensorrt/data")

    # Standard command-line arguments for all samples. Unknown arguments are
    # ignored so that a sample can define its own options as well.
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("-d", "--datadir", help="Location of the TensorRT sample data directory.")
    args, _ = parser.parse_known_args()

    # If data directory is not specified, use the default.
    data_root = args.datadir if args.datadir else default_data_root

    # If the subfolder exists, use it; otherwise fall back to the data root.
    subfolder_path = os.path.join(data_root, subfolder)
    if os.path.exists(subfolder_path):
        data_path = subfolder_path
    else:
        print("WARNING: " + subfolder_path + " does not exist. Using " + data_root + " instead.")
        data_path = data_root

    # Make sure data directory exists.
    if not os.path.exists(data_path):
        raise FileNotFoundError(data_path + " does not exist. Please provide the correct data path with the -d option.")

    # Resolve every requested file; the caller's list is updated in place,
    # as documented above.
    for index, filename in enumerate(find_files):
        find_files[index] = os.path.abspath(os.path.join(data_path, filename))
        if not os.path.exists(find_files[index]):
            raise FileNotFoundError(find_files[index] + " does not exist. Please provide the correct data path with the -d option.")

    if find_files:
        return data_path, find_files
    return data_path
# Small record pairing a host buffer with its device buffer; reads better
# than passing a bare 2-tuple around.
class HostDeviceMem(object):
    """Holds one host-side buffer and the matching device-side buffer."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        # Same layout as before: a label line followed by each buffer's str().
        pieces = ("Host:", str(self.host), "Device:", str(self.device))
        return "\n".join(pieces)

    def __repr__(self):
        # repr intentionally mirrors the human-readable form.
        return self.__str__()
# Allocates all buffers required for an engine, i.e. host/device inputs/outputs.
def allocate_buffers(engine, context):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    Buffer sizes come from ``context.get_binding_shape``, so any shape set on
    the context before this call is what determines the allocation.

    Args:
        engine: Engine to iterate over; each iteration yields one binding.
        context: Execution context whose binding shapes size the buffers.

    Returns:
        tuple: ``(inputs, outputs, bindings, stream)`` where ``inputs`` and
        ``outputs`` are lists of HostDeviceMem, ``bindings`` holds
        ``int(device_mem)`` for every binding in engine order, and ``stream``
        is a newly created ``cuda.Stream``.
    """
    inputs, outputs, bindings = [], [], []
    stream = cuda.Stream()
    for index, binding in enumerate(engine):
        # Number of elements (not bytes) in this binding.
        element_count = trt.volume(context.get_binding_shape(index))
        np_dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Page-locked host buffer plus a device buffer of identical byte size.
        host_buffer = cuda.pagelocked_empty(element_count, np_dtype)
        device_buffer = cuda.mem_alloc(host_buffer.nbytes)
        bindings.append(int(device_buffer))
        # File the pair under inputs or outputs.
        destination = inputs if engine.binding_is_input(binding) else outputs
        destination.append(HostDeviceMem(host_buffer, device_buffer))
    return inputs, outputs, bindings, stream
# This function is generalized for multiple inputs/outputs.
# inputs and outputs are expected to be lists of HostDeviceMem objects.
def do_inference(context, bindings, inputs, outputs, stream, batch_size):
    """Copy inputs to the device, run the context, and fetch the outputs.

    Args:
        context: Execution context; its ``execute_async`` is called.
        bindings: Device addresses passed straight to ``execute_async``.
        inputs: List of HostDeviceMem whose ``host`` data is uploaded.
        outputs: List of HostDeviceMem whose ``host`` buffers receive results.
        stream: Stream on which every copy and the execution are queued.
        batch_size: Batch size forwarded to ``execute_async``.

    Returns:
        list: The ``host`` buffer of every entry in ``outputs``.
    """
    # Queue the host-to-device copies.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Queue the inference itself.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Queue the device-to-host copies of the predictions.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Block until everything queued on the stream has completed.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
import numpy as np
import pycuda.driver as cudadriver
import tensorrt as trt
import torch
import os
import time
import common
from PIL import Image
import cv2
import torchvision
def ONNX_build_engine(onnx_file_path, write_engine=True, engine_file_path='efficientnet_b1.trt'):
    """Build a TensorRT engine from an ONNX file and optionally serialize it.

    The network is created in explicit-batch mode with FP16 enabled and a
    single optimization profile for the dynamic input named "inputs".

    Args:
        onnx_file_path (str): Path of the ONNX model to parse.
        write_engine (bool): When True, write the serialized engine to
            ``engine_file_path``.
        engine_file_path (str): Destination of the serialized engine.

    Returns:
        The built engine, or None when the ONNX file could not be parsed or
        the builder failed to produce an engine (details are printed).
    """
    G_LOGGER = trt.Logger(trt.Logger.WARNING)
    # Dynamic input, step 1: the network must be created with EXPLICIT_BATCH.
    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    # Intended as the largest batch size supported at inference time.
    # NOTE(review): the optimization profile below allows a batch of up to 16
    # - confirm which limit is intended.
    batch_size = 8
    with trt.Builder(G_LOGGER) as builder, builder.create_network(explicit_batch) as network, \
            trt.OnnxParser(network, G_LOGGER) as parser:
        builder.max_batch_size = batch_size
        config = builder.create_builder_config()
        config.max_workspace_size = common.GiB(2)
        config.set_flag(trt.BuilderFlag.FP16)

        print('Loading ONNX file from path {}...'.format(onnx_file_path))
        with open(onnx_file_path, 'rb') as model:
            print('Beginning ONNX file parsing')
            parsed = parser.parse(model.read())
        # parse() reports failure through its return value; without this
        # check a broken model would only surface later as an unrelated crash.
        if not parsed:
            print('ERROR: Failed to parse the ONNX file {}'.format(onnx_file_path))
            for error_index in range(parser.num_errors):
                print(parser.get_error(error_index))
            return None
        print('Completed parsing of ONNX file')

        print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
        # Key step for dynamic input: every dynamic input needs a profile
        # entry giving its minimum, typical (optimal) and maximum shapes.
        # Shapes outside this range are rejected at run time. The name must
        # match the input name used when exporting to ONNX; add one
        # set_shape call per dynamic input.
        profile = builder.create_optimization_profile()
        profile.set_shape("inputs", (1, 3, 240, 240), (8, 3, 240, 240), (16, 3, 480, 480))
        config.add_optimization_profile(profile)

        engine = builder.build_engine(network, config)
        if engine is None:
            # Avoid calling serialize() on None when the build fails.
            print('ERROR: Failed to build an engine from file {}'.format(onnx_file_path))
            return None
        print("Completed creating Engine")

        # Save the serialized engine to disk.
        if write_engine:
            with open(engine_file_path, "wb") as f:
                f.write(engine.serialize())
        return engine
# Script entry: build the engine from the ONNX model below as soon as this
# code runs. The path is a placeholder and must point at a real ONNX file.
onnx_file_path = r'/path/to/efficientnet_b1.onnx'
# Passed to ONNX_build_engine so the serialized engine is also written to disk.
write_engine = True
engine = ONNX_build_engine(onnx_file_path, write_engine)
import sys
sys.path.append('../')
import common
import cv2
import configs
import time
import numpy as np
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
# Module-level TensorRT logger, handed to trt.Runtime when an engine is deserialized.
TRT_LOGGER = trt.Logger()
# NumPy re-implementation of softmax.
def softmax(out_np, dim):
    """Compute a numerically stable softmax along one axis.

    Args:
        out_np: Array-like of scores (an ndarray, or a list of ndarrays).
        dim (int): Axis along which the values are normalized.

    Returns:
        numpy.ndarray: Array with the same shape as ``np.asarray(out_np)``
        whose entries along ``dim`` sum to 1.
    """
    logits = np.asarray(out_np)
    # Subtracting the per-axis maximum leaves the result mathematically
    # unchanged but keeps exp() from overflowing to inf (and hence NaN).
    shifted = logits - np.max(logits, axis=dim, keepdims=True)
    exp_values = np.exp(shifted)
    return exp_values / np.sum(exp_values, axis=dim, keepdims=True)
class FaceClassify(object):
    """Image classifier that runs a serialized TensorRT engine.

    The engine is deserialized once in the constructor; ``detect`` then
    preprocesses one BGR image, runs inference and returns the index of the
    highest-scoring class.
    """

    def __init__(self, configs):
        """Read settings from ``configs`` and load the engine.

        Args:
            configs: Object providing ``face_classify_engine`` (engine file
                path), ``classify_input_size`` (its elements after the first
                are used as (height, width)), ``classify_mean`` and
                ``classify_std`` (per-channel values, three entries used).
        """
        self.engine_path = configs.face_classify_engine
        self.input_size = configs.classify_input_size
        self.image_size = self.input_size[1:]
        self.MEAN = configs.classify_mean
        self.STD = configs.classify_std
        self.engine = self.get_engine()
        self.context = self.engine.create_execution_context()

    def get_engine(self):
        """Deserialize and return the engine stored at ``self.engine_path``."""
        # The context manager closes the file handle, which was previously leaked.
        with open(self.engine_path, 'rb') as f:
            serialized_engine = f.read()
        runtime = trt.Runtime(TRT_LOGGER)
        return runtime.deserialize_cuda_engine(serialized_engine)

    def _preprocess(self, image_src):
        """Turn a BGR image into a normalized float32 array of shape (1, 3, H, W)."""
        in_h, in_w = self.image_size
        img = cv2.cvtColor(image_src, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (in_w, in_h), interpolation=cv2.INTER_LINEAR)
        img = np.transpose(img, (2, 0, 1)).astype(np.float32)  # HWC -> CHW
        img /= 255.0  # scale to [0, 1]
        # Per-channel mean/std broadcast over H and W. float64 is kept so the
        # arithmetic matches the former explicit full-size mean/std planes.
        mean = np.asarray(self.MEAN[:3], dtype=np.float64).reshape(3, 1, 1)
        std = np.asarray(self.STD[:3], dtype=np.float64).reshape(3, 1, 1)
        img = ((img - mean) / std).astype(np.float32)
        img = np.expand_dims(img, axis=0)  # add the batch dimension
        return np.ascontiguousarray(img)

    def detect(self, image_src, cuda_ctx = pycuda.autoinit.context):
        """Classify one image and return the predicted class index.

        Args:
            image_src: BGR image array (it is converted with COLOR_BGR2RGB).
            cuda_ctx: CUDA context made current for the duration of the call.
                A falsy value skips the push/pop entirely.

        Returns:
            int: Index of the class with the highest softmax score.
        """
        # Push and pop are now guarded by the same condition, and the pop
        # runs even when preprocessing or inference raises.
        if cuda_ctx:
            cuda_ctx.push()
        try:
            img_in = self._preprocess(image_src)

            # Dynamic input: tell the context the actual shape of this input
            # before buffers are sized from the binding shapes.
            self.context.active_optimization_profile = 0
            self.context.set_binding_shape(0, tuple(img_in.shape))
            inputs, outputs, bindings, stream = common.allocate_buffers(self.engine, self.context)

            # Copy into the page-locked host buffer rather than replacing it,
            # so the asynchronous upload reads from page-locked memory.
            np.copyto(inputs[0].host, img_in.ravel())

            # Do inference
            trt_outputs = common.do_inference(self.context, bindings=bindings, inputs=inputs, outputs=outputs,
                                              stream=stream, batch_size=1)
        finally:
            if cuda_ctx:
                cuda_ctx.pop()

        labels_sm = softmax(trt_outputs, dim=1)
        labels_max = np.argmax(labels_sm, axis=1)
        return labels_max.item()