I've recently been exploring how to deploy PyTorch models with TensorRT. After reading through a lot of example code I've summarized the steps below; converting other models follows much the same recipe, so it seemed worth writing down.
The PyTorch-to-TensorRT workflow:
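1. Load the trained PyTorch model and weights.
2. Export the model to ONNX with torch.onnx.export, then simplify the graph with onnx-simplifier.
3. Parse the ONNX file with TensorRT's OnnxParser and build a serialized engine (here a BatchedNMS_TRT plugin is also appended so NMS runs on the GPU).
4. Allocate host/device buffers, copy the input in, execute the engine, and copy the detections back out.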
Here is a complete walkthrough, taken from yolov5-tensorrt:
import sys
sys.path.append('yolov5')
from models.common import *
from utils.torch_utils import *
from utils.datasets import *
from yolo import *
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
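# importing pycuda.autoinit creates and activates a CUDA context;
# without it the cuda.* calls below would fail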
import time
import copy
import numpy as np
import os
from onnxsim import simplify
import onnx
import struct
import yaml
import torchvision
device = select_device('0')  # CUDA device
weights = 'yolov5s.pt'  # trained weights
model_config = 'yolov5s.yaml'  # model definition yaml
TRT_LOGGER = trt.Logger(trt.Logger.INFO)  # a logger is required to build an engine
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
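# TensorRT's ONNX parser requires an explicit-batch network definition, hence this flag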
image_loader = LoadImages('yolov5/inference/images', img_size=640)  # folder of images to run inference on
_, input_img, _, _ = next(iter(image_loader))  # pull one preprocessed image from the loader
input_img = input_img.astype(np.float32)  # use float32 (np.float is removed in recent NumPy)
input_img /= 255.0  # normalize to [0, 1]
input_img = np.expand_dims(input_img, axis=0)  # add a batch dimension
with open(model_config) as f:
    cfg = yaml.load(f, Loader=yaml.FullLoader)  # model config dict
num_classes = cfg['nc']
# nms config
conf_thres = 0.4
iou_thres = 0.5
max_det = 300
# nms GPU
topK = 512 # max supported is 4096, if conf_thres is low, such as 0.001, use larger number.
keepTopK = max_det
def GiB(val):
return val * 1 << 30
# Unlike yolov5/utils.non_max_suppression, xywh2xyxy(x[:, :4]) is no longer needed here (it is folded into Detect())
def non_max_suppression(prediction, conf_thres=0.1, iou_thres=0.6, merge=False, classes=None, agnostic=False):
"""Performs Non-Maximum Suppression (NMS) on inference results
    This is yolov5's own NMS post-processing, not part of the TRT engine; it is only
    used by the torch baseline below, so you can skip it.
Returns:
detections with shape: nx6 (x1, y1, x2, y2, conf, cls)
"""
if prediction.dtype is torch.float16:
prediction = prediction.float() # to FP32
nc = prediction[0].shape[1] - 5 # number of classes
xc = prediction[..., 4] > conf_thres # candidates
# Settings
min_wh, max_wh = 2, 4096 # (pixels) minimum and maximum box width and height
max_det = 300 # maximum number of detections per image
time_limit = 10.0 # seconds to quit after
redundant = True # require redundant detections
multi_label = nc > 1 # multiple labels per box (adds 0.5ms/img)
t = time.time()
output = [None] * prediction.shape[0]
for xi, x in enumerate(prediction): # image index, image inference
# Apply constraints
# x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0 # width-height
x = x[xc[xi]] # confidence
# If none remain process next image
if not x.shape[0]:
continue
# Compute conf
x[:, 5:] *= x[:, 4:5] # conf = obj_conf * cls_conf
# Box (center x, center y, width, height) to (x1, y1, x2, y2)
box = x[:, :4] #xywh2xyxy(x[:, :4])
# Detections matrix nx6 (xyxy, conf, cls)
if multi_label:
i, j = (x[:, 5:] > conf_thres).nonzero().t()
x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
else: # best class only
conf, j = x[:, 5:].max(1, keepdim=True)
x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]
# Filter by class
if classes:
x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]
# Apply finite constraint
# if not torch.isfinite(x).all():
# x = x[torch.isfinite(x).all(1)]
# If none remain process next image
n = x.shape[0] # number of boxes
if not n:
continue
# Sort by confidence
# x = x[x[:, 4].argsort(descending=True)]
# Batched NMS
c = x[:, 5:6] * (0 if agnostic else max_wh) # classes
boxes, scores = x[:, :4] + c, x[:, 4] # boxes (offset by class), scores
i = torchvision.ops.boxes.nms(boxes, scores, iou_thres)
if i.shape[0] > max_det: # limit detections
i = i[:max_det]
if merge and (1 < n < 3E3): # Merge NMS (boxes merged using weighted mean)
try: # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
iou = box_iou(boxes[i], boxes) > iou_thres # iou matrix
weights = iou * scores[None] # box weights
x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True) # merged boxes
if redundant:
i = i[iou.sum(1) > 1] # require redundancy
except: # possible CUDA error https://github.com/ultralytics/yolov3/issues/1139
print(x, i, x.shape, i.shape)
pass
output[xi] = x[i]
if (time.time() - t) > time_limit:
break # time limit exceeded
return output
def load_model():  # load the PyTorch model and weights
# Load model
model = Model(model_config)
ckpt = torch.load(weights, map_location=torch.device('cpu'))
    ckpt['model'] = {k: v for k, v in ckpt['model'].state_dict().items()
                     if model.state_dict()[k].numel() == v.numel()}
model.load_state_dict(ckpt['model'], strict=False)
model.eval()
return model
def export_onnx(model, batch_size):  # export the model to ONNX
_,_,x,y = input_img.shape
img = torch.zeros((batch_size, 3, x, y))
torch.onnx.export(model, (img), 'yolov5_{}.onnx'.format(batch_size),
input_names=["data"], output_names=["prediction"], verbose=True, opset_version=11, operator_export_type=torch.onnx.OperatorExportTypes.ONNX
)
def simplify_onnx(onnx_path):  # fold constants and strip redundant ops so TensorRT parses more easily
model = onnx.load(onnx_path)
model_simp, check = simplify(model)
assert check, "Simplified ONNX model could not be validated"
onnx.save(model_simp, onnx_path)
def build_engine(onnx_path, using_half):
"""
ONNX模型转换为trt序列化模型
Args:
onnx_path:
using_half:
Returns:
"""
trt.init_libnvinfer_plugins(None, '')
engine_file = onnx_path.replace(".onnx", ".engine")
    if os.path.exists(engine_file):  # if a serialized engine already exists, load and return it; otherwise build one
with open(engine_file, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
return runtime.deserialize_cuda_engine(f.read())
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
builder.max_batch_size = 1 # always 1 for explicit batch
config = builder.create_builder_config()
config.max_workspace_size = GiB(1)
if using_half:
config.set_flag(trt.BuilderFlag.FP16)
# Load the Onnx model and parse it in order to populate the TensorRT network.
with open(onnx_path, 'rb') as model:
if not parser.parse(model.read()):
                print('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
return None
previous_output = network.get_output(0)
network.unmark_output(previous_output)
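        # The raw "prediction" output from Detect() has shape (bs, num_boxes, 5 + nc):
        # 4 box coords (already xyxy, per the note above non_max_suppression), 1 objectness
        # score, then nc class scores. The slices below split it into the boxes /
        # objectness / class-score tensors that the BatchedNMS_TRT plugin expects.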
# slice boxes, obj_score, class_scores
strides = trt.Dims([1,1,1])
starts = trt.Dims([0,0,0])
bs, num_boxes, _ = previous_output.shape
shapes = trt.Dims([bs, num_boxes, 4])
boxes = network.add_slice(previous_output, starts, shapes, strides)
starts[2] = 4
shapes[2] = 1
obj_score = network.add_slice(previous_output, starts, shapes, strides)
starts[2] = 5
shapes[2] = num_classes
scores = network.add_slice(previous_output, starts, shapes, strides)
indices = network.add_constant(trt.Dims([num_classes]), trt.Weights(np.zeros(num_classes, np.int32)))
gather_layer = network.add_gather(obj_score.get_output(0), indices.get_output(0), 2)
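        # gathering with nc zero-valued indices tiles the single objectness channel
        # to (bs, num_boxes, nc) so it can be multiplied element-wise with the class scores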
# scores = obj_score * class_scores => [bs, num_boxes, nc]
updated_scores = network.add_elementwise(gather_layer.get_output(0), scores.get_output(0), trt.ElementWiseOperation.PROD)
# reshape box to [bs, num_boxes, 1, 4]
reshaped_boxes = network.add_shuffle(boxes.get_output(0))
reshaped_boxes.reshape_dims = trt.Dims([0,0,1,4])
# add batchedNMSPlugin, inputs:[boxes:(bs, num, 1, 4), scores:(bs, num, 1)]
trt.init_libnvinfer_plugins(TRT_LOGGER, "")
registry = trt.get_plugin_registry()
assert(registry)
creator = registry.get_plugin_creator("BatchedNMS_TRT", "1")
assert(creator)
fc = []
fc.append(trt.PluginField("shareLocation", np.array([1], dtype=np.int), trt.PluginFieldType.INT32))
fc.append(trt.PluginField("backgroundLabelId", np.array([-1], dtype=np.int), trt.PluginFieldType.INT32))
fc.append(trt.PluginField("numClasses", np.array([num_classes], dtype=np.int), trt.PluginFieldType.INT32))
fc.append(trt.PluginField("topK", np.array([topK], dtype=np.int), trt.PluginFieldType.INT32))
fc.append(trt.PluginField("keepTopK", np.array([keepTopK], dtype=np.int), trt.PluginFieldType.INT32))
fc.append(trt.PluginField("scoreThreshold", np.array([conf_thres], dtype=np.float32), trt.PluginFieldType.FLOAT32))
fc.append(trt.PluginField("iouThreshold", np.array([iou_thres], dtype=np.float32), trt.PluginFieldType.FLOAT32))
fc.append(trt.PluginField("isNormalized", np.array([0], dtype=np.int), trt.PluginFieldType.INT32))
fc.append(trt.PluginField("clipBoxes", np.array([0], dtype=np.int), trt.PluginFieldType.INT32))
fc = trt.PluginFieldCollection(fc)
nms_layer = creator.create_plugin("nms_layer", fc)
layer = network.add_plugin_v2([reshaped_boxes.get_output(0), updated_scores.get_output(0)], nms_layer)
layer.get_output(0).name = "num_detections"
layer.get_output(1).name = "nmsed_boxes"
layer.get_output(2).name = "nmsed_scores"
layer.get_output(3).name = "nmsed_classes"
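        # per the BatchedNMS_TRT plugin documentation, the four outputs are:
        #   num_detections: (bs, 1)           valid detections per image
        #   nmsed_boxes:    (bs, keepTopK, 4)
        #   nmsed_scores:   (bs, keepTopK)
        #   nmsed_classes:  (bs, keepTopK)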
for i in range(4):
network.mark_output(layer.get_output(i))
return builder.build_engine(network, config)
def allocate_buffers(engine, is_explicit_batch=False, dynamic_shapes=[]):
    # allocate host/device buffers; this step is generic for any engine
inputs = []
outputs = []
bindings = []
class HostDeviceMem(object):
def __init__(self, host_mem, device_mem):
self.host = host_mem
self.device = device_mem
def __str__(self):
return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
def __repr__(self):
return self.__str__()
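    # pagelocked (pinned) host memory is what lets the async memcpys in profile_trt
    # overlap with the stream; each binding gets one flat 1-D host array plus a
    # matching device allocation, in engine binding order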
for binding in engine:
dims = engine.get_binding_shape(binding)
print(dims)
if dims[0] == -1:
assert(len(dynamic_shapes) > 0)
dims[0] = dynamic_shapes[0]
size = trt.volume(dims) * engine.max_batch_size
dtype = trt.nptype(engine.get_binding_dtype(binding))
# Allocate host and device buffers
host_mem = cuda.pagelocked_empty(size, dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
# Append the device buffer to device bindings.
bindings.append(int(device_mem))
# Append to the appropriate list.
if engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings
def profile_trt(engine, batch_size, num_warmups=10, num_iters=100):  # run and benchmark TRT inference
assert(engine is not None)
    input_img_array = np.array([input_img] * batch_size)  # replicate the input to fill the batch
    yolo_inputs, yolo_outputs, yolo_bindings = allocate_buffers(engine, True)  # buffer allocation
    stream = cuda.Stream()  # pycuda stream for async copies and execution
with engine.create_execution_context() as context:
total_duration = 0.
total_compute_duration = 0.
total_pre_duration = 0.
total_post_duration = 0.
for iteration in range(num_iters):
pre_t = time.time()
# set host data
            img = input_img_array.astype(np.float32)  # ensure contiguous float32 input
            yolo_inputs[0].host = img  # stage the input in the host buffer
            # Transfer input data from host (CPU) to device (GPU).
            [cuda.memcpy_htod_async(inp.device, inp.host, stream) for inp in yolo_inputs]
            stream.synchronize()  # wait for the copies to finish
            start_t = time.time()
            # Run inference.
            context.execute_async_v2(bindings=yolo_bindings, stream_handle=stream.handle)
            stream.synchronize()
            end_t = time.time()
            # Transfer predictions back from device to host.
            [cuda.memcpy_dtoh_async(out.host, out.device, stream) for out in yolo_outputs]
            stream.synchronize()
post_t = time.time()
duration = post_t - pre_t
compute_duration = end_t - start_t
pre_duration = start_t - pre_t
post_duration = post_t - end_t
if iteration >= num_warmups:
total_duration += duration
total_compute_duration += compute_duration
total_post_duration += post_duration
total_pre_duration += pre_duration
print("avg GPU time: {}".format(total_duration/(num_iters - num_warmups)))
print("avg GPU compute time: {}".format(total_compute_duration/(num_iters - num_warmups)))
print("avg pre time: {}".format(total_pre_duration/(num_iters - num_warmups)))
print("avg post time: {}".format(total_post_duration/(num_iters - num_warmups)))
        # unpack the flat 1-D host buffers and reshape them; the plugin returns
        # keepTopK entries per image, of which the first num_det are valid
num_det = int(yolo_outputs[0].host[0, ...])
boxes = np.array(yolo_outputs[1].host).reshape(batch_size, -1, 4)[0, 0:num_det, 0:4]
scores = np.array(yolo_outputs[2].host).reshape(batch_size, -1, 1)[0, 0:num_det, 0:1]
classes = np.array(yolo_outputs[3].host).reshape(batch_size, -1, 1)[0, 0:num_det, 0:1]
return [np.concatenate([boxes, scores, classes], -1)]
def profile_torch(model, using_half, batch_size, num_warmups=10, num_iters=100):  # plain PyTorch inference, as a baseline against TRT
model.to(device)
total_duration = 0.
total_compute_duration = 0.
total_pre_duration = 0.
total_post_duration = 0.
if using_half:
model.half()
for iteration in range(num_iters):
pre_t = time.time()
# set host data
img = torch.from_numpy(input_img).float().to(device)
if using_half:
img = img.half()
start_t = time.time()
        pred = model(img)  # forward pass
        output = non_max_suppression(pred[0], conf_thres, iou_thres)
        end_t = time.time()
        [p.cpu() for p in pred]  # pull results off the GPU
post_t = time.time()
duration = post_t - pre_t
compute_duration = end_t - start_t
pre_duration = start_t - pre_t
post_duration = post_t - end_t
if iteration >= num_warmups:
total_duration += duration
total_compute_duration += compute_duration
total_post_duration += post_duration
total_pre_duration += pre_duration
print("avg GPU time: {}".format(total_duration/(num_iters - num_warmups)))
print("avg GPU compute time: {}".format(total_compute_duration/(num_iters - num_warmups)))
print("avg pre time: {}".format(total_pre_duration/(num_iters - num_warmups)))
print("avg post time: {}".format(total_post_duration/(num_iters - num_warmups)))
return [output[0].cpu().numpy()]
if __name__ == '__main__':
    batch_size = 1  # batch size only applies to TRT; the torch baseline runs on non-batched data
using_half = False
onnx_path = 'yolov5_{}.onnx'.format(batch_size)
with torch.no_grad():
        model = load_model()  # load the PyTorch model
        export_onnx(model, batch_size)  # export to ONNX
        simplify_onnx(onnx_path)  # simplify the ONNX graph
        trt_result = profile_trt(build_engine(onnx_path, using_half), batch_size, 10, 100)  # build the engine and run TRT inference
        if using_half:
            model.half()
        torch_result = profile_torch(model, using_half, batch_size, 10, 100)  # torch baseline inference
print(trt_result)
print(torch_result)
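The same recipe (load weights, torch.onnx.export, onnx-simplifier, OnnxParser plus build_engine) carries over to most models; yolov5 only needs the extra slicing and NMS-plugin surgery because its head packs boxes and scores into one output tensor. As a minimal, generic sketch, assuming the same TensorRT 7.x-era Python API used above (MyModel, the input shape, and the file names are placeholders):
import torch
import tensorrt as trt

model = MyModel().eval()  # placeholder: any torch.nn.Module
dummy = torch.zeros(1, 3, 224, 224)  # fixed shape for an explicit-batch engine
torch.onnx.export(model, dummy, 'model.onnx',
                  input_names=['data'], output_names=['out'], opset_version=11)

logger = trt.Logger(trt.Logger.INFO)
flag = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
with trt.Builder(logger) as builder, \
        builder.create_network(flag) as network, \
        trt.OnnxParser(network, logger) as parser:
    with open('model.onnx', 'rb') as f:
        if not parser.parse(f.read()):
            raise RuntimeError(parser.get_error(0))
    config = builder.create_builder_config()
    config.max_workspace_size = 1 << 30  # 1 GiB of builder scratch space
    engine = builder.build_engine(network, config)
    with open('model.engine', 'wb') as f:
        f.write(engine.serialize())  # cache the engine so later runs just deserialize it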