TensorRT installation:
Follow the official Deep Learning SDK Documentation.
The tar installation tends to be more reliable than the deb installation.
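To confirm the tar install is visible to Python, a minimal check (if the import fails, the lib directory of the extracted tar usually has to be added to LD_LIBRARY_PATH first):

# Verify the TensorRT Python bindings installed from the tar package.
import tensorrt as trt
print(trt.__version__)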
onnx installation:
Note that the protobuf version should be 3.9.
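A quick sanity check of the versions in the current environment (a small sketch, nothing project-specific):

# Confirm that protobuf is the expected 3.9.x and that onnx imports cleanly.
import google.protobuf
import onnx
print(google.protobuf.__version__)  # expected: 3.9.x
print(onnx.__version__)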
Problem encountered: the ONNX export itself succeeds, but when converting to TensorRT the layer count shrinks after dead-layer removal, and parsing the ONNX file returns false.
This is caused by the source code of PyTorch's upsampling path.
After a long investigation, including inspecting the model structure in Netron, the problem appears to be operator 1041, a Gather op: when indices=2, TensorRT cannot parse it, which leads to the failure above. The simplest fix is to modify the PyTorch source directly. The operator comes from F.interpolate, i.e. the code in torch/nn/functional.py, which is modified as follows.
(You can find the location of the file by running python, then import torch, then typing torch, which prints the path of the installed package.)
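For example, a small sketch (the exact path depends on the environment) that prints where the installed torch package and its nn/functional.py live:

# F.interpolate is implemented in torch/nn/functional.py of the installed package.
import os
import torch
print(torch.__file__)
print(os.path.join(os.path.dirname(torch.__file__), "nn", "functional.py"))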
Original code:
    if torch._C._get_tracing_state():
        return [(torch.floor(input.size(i + 2) * torch.tensor(float(scale_factors[i])))) for i in range(dim)]
    else:
        return [int(math.floor(int(input.size(i + 2)) * scale_factors[i])) for i in range(dim)]
Modified to (this skips the tracing branch, so the output sizes are computed as plain Python ints and baked into the exported graph as constants, and the problematic Gather op is no longer emitted):
    return [int(math.floor(input.size(i + 2) * scale_factors[i])) for i in range(dim)]
Reference links:
Mask_RCNN的TensorRT加速 (TensorRT acceleration of Mask_RCNN)
upsample后onnx无法转为tensorrt (ONNX with upsample cannot be converted to TensorRT)
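For reference, a minimal toy example (a hypothetical one-layer module, not the project's network) that goes through the same F.interpolate code path during ONNX export:

# Exporting this module traces the size computation in torch/nn/functional.py
# discussed above; with the patch applied, the sizes become constants in the graph.
import torch
import torch.nn as nn
import torch.nn.functional as F

class Up(nn.Module):
    def forward(self, x):
        return F.interpolate(x, scale_factor=2, mode='nearest')

torch.onnx.export(Up(), torch.randn(1, 3, 80, 80), "up.onnx", verbose=True)

The full export script for the project's network is below.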
from network.model import get_net
import torchvision
import torch
from torch.autograd import Variable
import onnx
print(torch.__version__)
input_name = ['input']
output_name = ['output']
input = Variable(torch.randn(1, 3, 320, 320)).cuda()
#model = torchvision.models.resnet50(pretrained=True).cuda()
num_layers=50
net = get_net(num_layers,onnx_mode=True,num_class=2, head_conv=64)
use_gpu = True
trained_model='../result/loss_24_right_2000/sarnet_5_loss_258.245988227427.pth'
if use_gpu:
    net = net.cuda()
net.load_state_dict(torch.load(trained_model))
model = net
print("model is {}".format(model))
model.eval()
export_onnx_file = "test.onnx"
torch.onnx.export(model, input, export_onnx_file, input_names=input_name, output_names=output_name, verbose=True)
test = onnx.load(export_onnx_file)
onnx.checker.check_model(test)
print("==> Passed")
Visualization: the exported ONNX model can be opened in Netron to inspect its structure.
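For example, assuming the netron Python package is installed (it also works as a standalone app):

# Open the exported model in Netron to inspect the graph and locate the offending op.
import netron
netron.start("test.onnx")

The script below then builds a TensorRT engine from test.onnx and runs inference with it.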
import pycuda.autoinit
import numpy as np
import pycuda.driver as cuda
import tensorrt as trt
import torch
import os
import time
from PIL import Image
import cv2
import torchvision
import sys
sys.path.append("..")
from network.model import get_net

TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
def get_img_np_nchw(filename):
    image = cv2.imread(filename)
    image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_cv = cv2.resize(image_cv, (320, 320))
    miu = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img_np = np.array(image_cv, dtype=float) / 255.
    r = (img_np[:, :, 0] - miu[0]) / std[0]
    g = (img_np[:, :, 1] - miu[1]) / std[1]
    b = (img_np[:, :, 2] - miu[2]) / std[2]
    img_np_t = np.array([r, g, b])
    img_np_nchw = np.expand_dims(img_np_t, axis=0)
    return img_np_nchw
class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """Within this context, host_mem means the CPU memory and device_mem means the GPU memory."""
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="",
               fp16_mode=False, int8_mode=False, save_engine=False):
    """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it."""
    def build_engine(max_batch_size, save_engine):
        """Takes an ONNX file and creates a TensorRT engine to run inference with."""
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network() as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:
            builder.max_workspace_size = 1 << 30  # Your workspace size
            builder.max_batch_size = max_batch_size
            # pdb.set_trace()
            builder.fp16_mode = fp16_mode  # Default: False
            builder.int8_mode = int8_mode  # Default: False
            if int8_mode:
                # To be updated
                raise NotImplementedError
            # Parse model file
            if not os.path.exists(onnx_file_path):
                quit('ONNX file {} not found'.format(onnx_file_path))
            print('Loading ONNX file from path {}...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:
                print('Beginning ONNX file parsing')
                parsed = parser.parse(model.read())
                print(parsed)
            # last_layer = network.get_layer(network.num_layers - 1)
            # network.mark_output(last_layer.get_output(0))
            print('Completed parsing of ONNX file')
            print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
            engine = builder.build_cuda_engine(network)
            print("Completed creating Engine")
            if save_engine:
                with open(engine_file_path, "wb") as f:
                    f.write(engine.serialize())
            return engine

    if os.path.exists(engine_file_path):
        # If a serialized engine exists, load it instead of building a new one.
        print("Reading engine from file {}".format(engine_file_path))
        with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    else:
        return build_engine(max_batch_size, save_engine)
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer data from CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]

def postprocess_the_outputs(h_outputs, shape_of_output):
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs
max_batch_size = 1
onnx_model_path = 'test.onnx'
TRT_LOGGER = trt.Logger()
filename="dog.jpg"
img_np_nchw = get_img_np_nchw(filename)
img_np_nchw = img_np_nchw.astype(dtype=np.float32)
# These two modes depend on the hardware
fp16_mode = False
int8_mode = False
trt_engine_path = 'test.trt'
save_engine = True
engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode, int8_mode,save_engine)
context = engine.create_execution_context()
inputs, outputs, bindings, stream = allocate_buffers(engine) # input, output: host # bindings
shape_of_output = (max_batch_size, 12800)
print("img_np_nchw.shape is {}".format(img_np_nchw.shape))
inputs[0].host = img_np_nchw
#inputs[0].host = img_np_nchw.reshape(-1)
t1 = time.time()
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
t2 = time.time()
print("trt_outputs[0].shape is {}".format(trt_outputs[0].shape))
print("trt_outputs[1].shape is {}".format(trt_outputs[1].shape))
shape_of_output=(1,2,80,80)
feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
print("feat.shape is {}".format(feat.shape))
print('TensorRT ok')
print("Inference time with the TensorRT engine: {}".format(t2-t1))