Runtime environment:

Component | Host version (Ubuntu 18.04) | AGX version (JetPack 4.3) |
---|---|---|
cuda | 10.0.130 | - |
cudnn | 7.6.5.32 | - |
tensorrt | 7.0.0.11-1+cuda10.0 | - |
pytorch | 1.4.0+cu100 | 1.2.0 |
torchvision | 0.5.0+cu100 | 0.4.0 |
Project repository:
https://gitee.com/n__n/Ultra-Fast-Lane-Detection.git
Converting the .pth model to ONNX
#!/usr/bin/python3
import torch
import torch.onnx as tox
from model.model import parsingNet
from utils.common import merge_config
from utils.dist_utils import dist_print

if __name__ == "__main__":
    torch.backends.cudnn.benchmark = True
    args, cfg = merge_config()
    dist_print('start testing...')
    assert cfg.backbone in ['18', '34', '50', '101',
                            '152', '50next', '101next', '50wide', '101wide']

    # CULane uses 18 row anchors; other datasets are not handled here.
    if cfg.dataset == 'CULane':
        cls_num_per_lane = 18
    else:
        raise NotImplementedError

    net = parsingNet(pretrained=False, backbone=cfg.backbone,
                     cls_dim=(cfg.griding_num + 1, cls_num_per_lane, 4),
                     use_aux=False).cuda()

    # Checkpoints saved from (Distributed)DataParallel prefix parameter names
    # with 'module.'; strip that prefix before loading.
    state_dict = torch.load(cfg.test_model, map_location='cuda')['model']
    compatible_state_dict = {}
    for k, v in state_dict.items():
        if 'module.' in k:
            compatible_state_dict[k[7:]] = v
        else:
            compatible_state_dict[k] = v
    net.load_state_dict(compatible_state_dict, strict=False)
    net.cuda()
    net.eval()

    # Export with a fixed 1x3x288x800 input, matching the model's input resolution.
    dummy_input = torch.randn(1, 3, 288, 800).cuda()
    input_names = ["input"]
    output_names = ["output"]
    tox.export(net, dummy_input, "culane_18.onnx", verbose=True,
               input_names=input_names, output_names=output_names)
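Before moving on to TensorRT, it is worth sanity-checking the exported graph. A minimal sketch (not part of the original script) using the onnx and onnxruntime packages, assuming the CULane config with griding_num = 200:

import numpy as np
import onnx
import onnxruntime as ort

onnx_model = onnx.load("culane_18.onnx")
onnx.checker.check_model(onnx_model)  # raises if the graph is malformed

sess = ort.InferenceSession("culane_18.onnx", providers=["CPUExecutionProvider"])
dummy = np.random.randn(1, 3, 288, 800).astype(np.float32)
ort_out = sess.run(["output"], {"input": dummy})[0]
print("onnxruntime output shape:", ort_out.shape)  # expect (1, 201, 18, 4) for CULane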
Loading the ONNX model and saving a TensorRT engine
import os
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit  # initializes the CUDA context

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

class HostDeviceMem(object):
    def __init__(self, host_mem, device_mem):
        """Within this context, host_mem means the CPU (page-locked) memory
        and device_mem means the GPU memory."""
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()
def allocate_buffers(engine):
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate host and device buffers.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
def get_engine(max_batch_size=1, onnx_file_path="", engine_file_path="",
               fp16_mode=False, int8_mode=False, save_engine=False):
    """Attempts to load a serialized engine if available, otherwise builds a new TensorRT engine and saves it."""

    def build_engine(max_batch_size, save_engine):
        """Takes an ONNX file and creates a TensorRT engine to run inference with."""
        # NOTE: the ONNX parser in TensorRT 7 expects an explicit-batch network.
        # If parsing fails here, pass
        # 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH) to create_network().
        with trt.Builder(TRT_LOGGER) as builder, \
                builder.create_network() as network, \
                trt.OnnxParser(network, TRT_LOGGER) as parser:
            builder.max_workspace_size = 1 << 30  # workspace size for layer tactics
            builder.max_batch_size = max_batch_size
            builder.fp16_mode = fp16_mode  # Default: False
            builder.int8_mode = int8_mode  # Default: False
            if int8_mode:
                # INT8 calibration is not implemented here.
                raise NotImplementedError

            # Parse the model file.
            if not os.path.exists(onnx_file_path):
                quit('ONNX file {} not found'.format(onnx_file_path))
            print('Loading ONNX file from path {}...'.format(onnx_file_path))
            with open(onnx_file_path, 'rb') as model:
                print('Beginning ONNX file parsing')
                if not parser.parse(model.read()):
                    for i in range(parser.num_errors):
                        print(parser.get_error(i))
                    quit('Failed to parse ONNX file {}'.format(onnx_file_path))
            print('Completed parsing of ONNX file')

            print('Building an engine from file {}; this may take a while...'.format(onnx_file_path))
            engine = builder.build_cuda_engine(network)
            print("Completed creating Engine")
            if save_engine:
                with open(engine_file_path, "wb") as f:
                    f.write(engine.serialize())
            return engine

    if os.path.exists(engine_file_path):
        # If a serialized engine exists, load it instead of building a new one.
        print("Reading engine from file {}".format(engine_file_path))
        with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
            return runtime.deserialize_cuda_engine(f.read())
    else:
        return build_engine(max_batch_size, save_engine)
def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    # Transfer input data from the CPU to the GPU.
    [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in inputs]
    # Run inference.
    context.execute_async(batch_size=batch_size, bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in outputs]
    # Synchronize the stream.
    stream.synchronize()
    # Return only the host outputs (flat numpy arrays).
    return [out.host for out in outputs]

def postprocess_the_outputs(h_outputs, shape_of_output):
    # The host buffer is a flat array; reshape it to the engine's output shape.
    h_outputs = h_outputs.reshape(*shape_of_output)
    return h_outputs
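To actually build and cache the engine, a rough sketch of driver code (not from the original post; file names and flags are placeholders):

onnx_model_path = "culane_18.onnx"
trt_engine_path = "culane_18_fp16.trt"

# Builds an FP16 engine from the ONNX file on the first run and serializes it;
# subsequent runs deserialize the cached engine instead.
engine = get_engine(max_batch_size=1,
                    onnx_file_path=onnx_model_path,
                    engine_file_path=trt_engine_path,
                    fp16_mode=True,
                    save_engine=True)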
Loading an image and running inference
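The snippet below relies on an img_transforms pipeline that is not shown. A sketch of the usual preprocessing for this model (resize to the 288x800 input the network was exported with, plus ImageNet normalization), assuming torchvision transforms; the image file name is a placeholder:

from PIL import Image
import torchvision.transforms as transforms

img_transforms = transforms.Compose([
    transforms.Resize((288, 800)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])
img2 = Image.open("test.jpg")  # hypothetical input image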
x = img_transforms(img2)
imgs = x.unsqueeze(0).cuda() + 1
if self.engine is None:
    # Build (or deserialize) the engine once and cache it on the instance.
    self.engine = get_engine(max_batch_size, onnx_model_path, trt_engine_path, fp16_mode, int8_mode)
    self.context = self.engine.create_execution_context()
inputs, outputs, bindings, stream = allocate_buffers(self.engine)
# Feed the preprocessed image to the input binding.
inputs[0].host = imgs.data.cpu().numpy()
trt_outputs = do_inference(self.context, bindings=bindings, inputs=inputs, outputs=outputs, stream=stream)
# shape_of_output must match the engine's output, e.g. (1, 201, 18, 4) for the CULane model.
feat = postprocess_the_outputs(trt_outputs[0], shape_of_output)
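For the CULane model, feat has shape (1, 201, 18, 4): 200 grid columns plus a background ("no lane") bin, 18 row anchors, 4 lanes. A rough sketch of decoding it into per-row lane positions, loosely following the upstream demo (griding_num = 200 is assumed; mapping back to pixel coordinates additionally needs the row anchors and the original image size):

import numpy as np
import scipy.special

def decode_lanes(feat, griding_num=200):
    # feat: (1, griding_num + 1, num_rows, num_lanes) raw network output
    out = feat[0]
    out = out[:, ::-1, :]  # the upstream demo flips the row-anchor axis
    prob = scipy.special.softmax(out[:-1, :, :], axis=0)  # drop the background bin
    idx = np.arange(griding_num).reshape(-1, 1, 1) + 1
    loc = np.sum(prob * idx, axis=0)                      # expected grid index per (row, lane)
    loc[np.argmax(out, axis=0) == griding_num] = 0        # 0 marks "no lane"
    return loc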