pytorch2onnx2tensorrt

Installing TensorRT and ONNX

Installing TensorRT:
Follow the official guide:
Deep Learning SDK Documentation
The tar installation is more reliable than the deb installation.

Installing ONNX
Note that the protobuf version should be 3.9.
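After installing, a quick sanity check (a minimal sketch; it assumes the onnx, protobuf, and TensorRT Python packages are importable) is to print the installed versions:

# Environment sanity check: print the versions of the relevant packages.
import onnx
import google.protobuf
import tensorrt as trt

print("onnx:", onnx.__version__)
print("protobuf:", google.protobuf.__version__)  # expected to be a 3.9.x release
print("tensorrt:", trt.__version__)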

Model Deployment

Problem encountered: the ONNX export itself works, but when converting to TensorRT the number of layers drops after dead-layer removal, and parsing the ONNX file returns false.
This is caused by the source code of PyTorch's upsampling.
After a long investigation, including inspecting the model structure in Netron, the analysis points to operator 1041, a Gather operator: when indices=2, TensorRT cannot parse it, which leads to this problem. So the fix is simply to modify the PyTorch source. The operator comes from F.interpolate, whose implementation is in functional.py; the modification is shown below.
To find where that file is located, start a Python interpreter and inspect the torch module:

python
import torch
torch

The printed module path points to the installation directory.
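A more direct way (a small sketch; it assumes a standard pip or conda install of PyTorch) is to print the file that defines F.interpolate:

# Print the path of functional.py, which contains F.interpolate.
import torch.nn.functional as F
print(F.__file__)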

Original source:
        if torch._C._get_tracing_state():
            return [(torch.floor(input.size(i + 2) * torch.tensor(float(scale_factors[i])))) for i in range(dim)]
        else:
            return [int(math.floor(int(input.size(i + 2)) * scale_factors[i])) for i in range(dim)]
 
Modified to:
        return [int(math.floor(input.size(i + 2) * scale_factors[i])) for i in range(dim)]

References:
TensorRT acceleration for Mask_RCNN
ONNX with upsample cannot be converted to TensorRT
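If patching the installed PyTorch source is undesirable, a commonly used alternative (a sketch only; the wrapper below is hypothetical and not part of the original project) is to force the upsampling target size to constant Python ints before export, so the exporter emits a resize with fixed sizes instead of the Shape/Gather subgraph that the TensorRT parser rejects:

# Hypothetical wrapper: bake the spatial size into the trace as constants,
# so the exported graph has no Shape -> Gather chain for the upsample.
import torch
import torch.nn as nn
import torch.nn.functional as F

class ConstSizeUpsample(nn.Module):
    def __init__(self, scale_factor=2, mode='nearest'):
        super(ConstSizeUpsample, self).__init__()
        self.scale_factor = scale_factor
        self.mode = mode

    def forward(self, x):
        # int(...) turns the traced shape into a plain Python constant.
        h = int(x.shape[2]) * self.scale_factor
        w = int(x.shape[3]) * self.scale_factor
        return F.interpolate(x, size=(h, w), mode=self.mode)

The trade-off is that the exported model is tied to the input resolution used at export time (here 320x320).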

PyTorch to ONNX

from network.model import get_net

import torchvision
import torch
from torch.autograd import Variable
import onnx
print(torch.__version__)

input_name = ['input']
output_name = ['output']
input = Variable(torch.randn(1, 3, 320, 320)).cuda()
#model = torchvision.models.resnet50(pretrained=True).cuda()

num_layers = 50
net = get_net(num_layers, onnx_mode=True, num_class=2, head_conv=64)
use_gpu = True
trained_model = '../result/loss_24_right_2000/sarnet_5_loss_258.245988227427.pth'
if use_gpu:
    net = net.cuda()
net.load_state_dict(torch.load(trained_model))
model = net
print("model is {}".format(model))
model.eval()
export_onnx_file = "test.onnx"
torch.onnx.export(model, input, export_onnx_file, input_names=input_name, output_names=output_name, verbose=True)
test = onnx.load(export_onnx_file)
onnx.checker.check_model(test)
print("==> Passed")

ONNX to TensorRT

For verbose parser output while debugging, create the logger as:
TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)

import pycuda.autoinit
import numpy as np
import pycuda.driver as cuda
import tensorrt as trt
import torch
import os
import time
from PIL import Image
import cv2
import torchvision

import sys
sys.path.append("..")
from network.model import get_net

def get_img_np_nchw(filename):
    image = cv2.imread(filename)
    image_cv = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image_cv = cv2.resize(image_cv, (320, 320))
    miu = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    img_np = np.array(image_cv, dtype=float) / 255.
    r = (img_np[:, :, 0] - miu[0]) / std[0]
    g = (img_np[:, :, 1] - miu[1]) / std[1]
    b = (img_np[:, :, 2] - miu[2]) / std[2]
    img_np_t = np.array([r, g, b])
    img_np_nchw = np.expand_dims(img_np_t, axis=0)
    return img_np_nchw

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """Within this context, host_mom means the cpu memory and device means the GPU memory
        """
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream


def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="", \
               fp16_mode=False, int8_mode=False, save_engine=False,
               ):
    """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it."""
    def build_engine(max_batch_size, save_engine):
        """Takes an ONNX file and creates a TensorRT engine to run inference with"""
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network() as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:
            builder.max_workspace_size = 1 << 30  # Your workspace size
            builder.max_batch_size = max_batch_size
            # pdb.set_trace()
            builder.fp16_mode = fp16_mode  # Default: False
            builder.int8_mode = int8_mode  # Default: False
            if int8_mode:
                # To be updated
                raise NotImplementedError
            # Parse model file
            if not os.path.exists(onnx_file_path):
                quit('ONNX file {} not found'.format(onnx_file_path))
            print('Loading ONNX file from path {}...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:
                print('Beginning ONNX file parsing')
                if not parser.parse(model.read()):
                    # Print the parser errors so the failing ONNX op can be located.
                    for i in range(parser.num_errors):
                        print(parser.get_error(i))
            #last_layer = network.get_layer(network.num_layers - 1)
            #network.mark_output(last_layer.get_output(0))
            print('Completed parsing of ONNX file')
            print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
            engine = builder.build_cuda_engine(network)
            print("Completed creating Engine")

            if save_engine:
                with open(engine_file_path, "wb") as f:
                    f.write(engine.serialize())
            return engine

    if os.path.exists(engine_file_path):
        # If a serialized engine exists, load it instead of building a new one.
        print("Reading engine from file {}".format(engine_file_path))
        with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    else:
        return build_engine(max_batch_size, save_engine)


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer data from CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]


def postprocess_the_outputs(h_outputs, shape_of_output):
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs


max_batch_size = 1
onnx_model_path = 'test.onnx'
TRT_LOGGER = trt.Logger()
filename="dog.jpg"
img_np_nchw = get_img_np_nchw(filename)
img_np_nchw = img_np_nchw.astype(dtype=np.float32)


# These two modes depend on the hardware
fp16_mode = False
int8_mode = False
trt_engine_path = 'test.trt'
save_engine = True
engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode, int8_mode,save_engine)
context = engine.create_execution_context()
inputs, outputs, bindings, stream = allocate_buffers(engine) # input, output: host # bindings
shape_of_output = (max_batch_size, 12800)
print("img_np_nchw.shape is {}".format(img_np_nchw.shape))
inputs[0].host = img_np_nchw
#inputs[0].host = img_np_nchw.reshape(-1)
t1 = time.time()
trt_outputs = do_inference(context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
t2 = time.time()
print("trt_outputs[0].shape is {}".format(trt_outputs[0].shape))
print("trt_outputs[1].shape is {}".format(trt_outputs[1].shape))
shape_of_output=(1,2,80,80)
feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
print("feat.shape is {}".format(feat.shape))
print('TensorRT ok')
print("Inference time with the TensorRT engine: {}".format(t2-t1))



