系统环境:
ubuntu 1804
cuda11.3
tensorrt 8.2.06
显卡2080
pytorch 1.10.0
onnx 1.10.2
onnx-simplifier 0.3.6
步骤1:导出onnx 模型(参考https://github.com/shouxieai/tensorRT_Pro)
1.1 下载yolov5项目并修改相关代码,目的是减少导出onnx的复杂度,只保留一个输出便于后处理
# Download the yolov5 project and open the model file to patch
git clone git@github.com:ultralytics/yolov5.git
cd yolov5/models
gedit yolo.py
#更改yolov5/models/yolo.py Detect.forward函数
# yolov5/models/yolo.py
# bs, _, ny, nx = x[i].shape # x(bs,255,20,20) to x(bs,3,20,20,85)
# x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
# 修改为:
bs, _, ny, nx = x[i].shape # x(bs,255,20,20) to x(bs,3,20,20,85)
bs = -1
ny = int(ny)
nx = int(nx)
x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
# yolov5/models/yolo.py
# z.append(y.view(bs, -1, self.no))
# 修改为:
z.append(y.view(bs, self.na * ny * nx, self.no))
############# 对于 yolov5-6.0 #####################
# yolov5/models/yolo.py Detect.forward函数
# if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
# self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)
# 修改为:
if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
self.grid[i], self.anchor_grid[i] = self._make_grid(nx, ny, i)
# disconnect for pytorch trace
anchor_grid = (self.anchors[i].clone() * self.stride[i]).view(1, -1, 1, 1, 2)
# y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh
# 修改为:
y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * anchor_grid # wh
# wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i] # wh
# 修改为:
wh = (y[..., 2:4] * 2) ** 2 * anchor_grid # wh
#return x if self.training else (torch.cat(z, 1), x)
#修改为
return torch.cat(z, 1)
1.2 模型导出
修改yolov5/export.py,目的是只保留onnx模型动态batch
############# for yolov5-6.0 #####################
# yolov5/export.py, export_onnx function
# torch.onnx.export(model, im, f, verbose=False, opset_version=opset,
# training=torch.onnx.TrainingMode.TRAINING if train else torch.onnx.TrainingMode.EVAL,
# do_constant_folding=not train,
# input_names=['images'],
# output_names=['output'],
# dynamic_axes={'images': {0: 'batch', 2: 'height', 3: 'width'}, # shape(1,3,640,640)
# 'output': {0: 'batch', 1: 'anchors'} # shape(1,25200,85)
# } if dynamic else None)
# change to (only axis 0, the batch axis, stays dynamic for both the input and the output):
torch.onnx.export(model, im, f, verbose=False, opset_version=opset,
training=torch.onnx.TrainingMode.TRAINING if train else torch.onnx.TrainingMode.EVAL,
do_constant_folding=not train,
input_names=['images'],
output_names=['output'],
dynamic_axes={'images': {0: 'batch'}, # shape(1,3,640,640)
'output': {0: 'batch'} # shape(1,25200,85)
} if dynamic else None)
# Enter the yolov5 directory (where export.py lives) and export the ONNX model
cd yolov5
python export.py --weights=yolov5s.pt --dynamic --include=onnx --opset=11 --simplify
然后就生成了yolov5s.onnx文件,用https://netron.app/ 打开可以看到模型结构,可以发现模型现在只有1个输出,维度是[Batch_size,25200,85]
步骤2:生成tensorrt引擎(官网现在推荐用trtexec,但我的tensorrt是pip安装的,不支持)
*坑比较多,动态维度支持必须添加profile
import tensorrt as trt
# Build the logger, builder, network definition and ONNX parser.
logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

# Parse the ONNX file and print every error the parser recorded.
success = parser.parse_from_file("yolov5s.onnx")
for idx in range(parser.num_errors):
    print(parser.get_error(idx))
if not success:
    # Stop here: there is no point building an engine from a network that
    # failed to parse.
    raise RuntimeError("failed to parse yolov5s.onnx")
print('Construction sucess!!!')

# The dynamic batch dimension needs an optimization profile.
# set_shape takes the shapes in the order (min, opt, max).
profile = builder.create_optimization_profile()
profile.set_shape("images", (1, 3, 640, 640), (8, 3, 640, 640), (16, 3, 640, 640))
config = builder.create_builder_config()
config.add_optimization_profile(profile)
config.max_workspace_size = 1 << 30  # 1 GiB

serialized_engine = builder.build_serialized_network(network, config)
if serialized_engine is None:
    # Fail with a clear message instead of a TypeError from f.write(None).
    raise RuntimeError("failed to build the TensorRT engine")
with open("yolov5.engine", "wb") as f:
    print('正在写入engine文件...')
    f.write(serialized_engine)
print('构建引擎成功!!!')
步骤3:从engine文件反序列化构建tensorrt引擎
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit
import numpy as np
import cv2
import time
# Read the serialized engine back from disk and deserialize it.
with open("yolov5.engine", "rb") as f:
    serialized_engine = f.read()
logger = trt.Logger(trt.Logger.WARNING)
runtime = trt.Runtime(logger)
engine = runtime.deserialize_cuda_engine(serialized_engine)
if engine is None:
    # Fail here rather than with an AttributeError on the first use of engine.
    raise RuntimeError("failed to deserialize yolov5.engine")
步骤4:推理
4.1设置一个推理用的BATCH_SIZE
# Batch size used for inference in the steps below.
BATCH_SIZE=8
4.2创建上下文管理context并获取相关buffer,每当batch要变化时,要重新set_binding_shape,并且需要重新申请buffer
# Create the execution context from the deserialized engine.
context = engine.create_execution_context()
context.set_binding_shape(0, (BATCH_SIZE, 3, 640, 640)) # Very important: sets the concrete shape of binding 0, whose batch dimension is dynamic
inputs, outputs, bindings, stream = allocate_buffers(engine,max_batch_size=BATCH_SIZE) # build the input buffers, output buffers, binding pointers and stream
*相关函数和类
# Simple helper data class that's a little nicer to use than a 2-tuple.
class HostDeviceMem(object):
    """Pair a host-side buffer with its matching device-side buffer."""

    def __init__(self, host_mem, device_mem):
        """Store the host buffer as ``host`` and the device buffer as ``device``."""
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        """Return a two-section dump of the host and device members."""
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return self.__str__()
def allocate_buffers(engine, max_batch_size=16):
    """Allocate a host/device buffer pair for every binding of ``engine``.

    Args:
        engine: TensorRT engine whose bindings are iterated.
        max_batch_size: Value substituted for a leading dimension of -1
            (dynamic batch) when sizing the buffers.

    Returns:
        ``(inputs, outputs, bindings, stream)``: the ``HostDeviceMem`` pairs of
        the input bindings, those of the output bindings, the device buffer
        addresses as ints in binding order, and a new ``cuda.Stream``.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        dims = engine.get_binding_shape(binding)
        if dims[0] == -1:
            # Dynamic batch dimension: size the buffer for the requested batch.
            assert max_batch_size is not None
            dims[0] = max_batch_size
        size = trt.volume(dims) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        # Allocate a page-locked host buffer and a device buffer of the same
        # byte size.
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        # Append the device buffer to device bindings.
        bindings.append(int(device_mem))
        # Append to the appropriate list.
        if engine.binding_is_input(binding):
            inputs.append(HostDeviceMem(host_mem, device_mem))
        else:
            outputs.append(HostDeviceMem(host_mem, device_mem))
    return inputs, outputs, bindings, stream
4.3 构建数据,数据大小要和inputs的buffer大小一致
img = cv2.imread('../dog.jpg')
# Build a batch of BATCH_SIZE identical images. pre_process returns a single
# (3, 640, 640) image, so a leading batch axis is added before repeating;
# repeating that array along axis 0 directly would duplicate each channel
# plane instead and mix channels across the batch items.
batch_data = np.repeat(pre_process(img)[np.newaxis], BATCH_SIZE, 0)
*相关函数
def pre_process(img):
    """Turn an image array into the normalized network input layout.

    The image is resized to 640x640, converted from BGR to RGB, transposed
    from height-width-channel to channel-height-width order, cast to float32
    and scaled by 1/255.

    Args:
        img: Image array in BGR channel order with channels on the last axis.

    Returns:
        A float32 array of shape (3, 640, 640).
    """
    print('original image shape', img.shape)
    img = cv2.resize(img, (640, 640))
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img = img.transpose((2, 0, 1)).astype(np.float32)
    img /= 255.0
    return img
4.4 数据拷贝到inputs内,然后执行推理
# Copy the flattened batch into the host buffer of the first input, then run inference.
np.copyto(inputs[0].host, batch_data.ravel())
result = do_inference_v2(context, bindings, inputs, outputs, stream)[0]
result = np.reshape(result,[BATCH_SIZE,-1,85])
print(result.shape)
# Output:
# (8, 25200, 85)
# Eight images are inferred in one call, so eight results come back at once.
# Meaning of the 85 values: ct_x,ct_y,w,h,conf_box,conf_class0,conf_class1,....conf_class79
# ct_x,ct_y,w,h are expressed at the 640*640 scale
# The conf values have already been passed through sigmoid or softmax (per the original author's note)
*相关函数
def do_inference_v2(context, bindings, inputs, outputs, stream):
    """Run one inference pass on ``stream`` and return the host output buffers.

    Args:
        context: Execution context whose ``execute_async_v2`` is invoked.
        bindings: Binding addresses passed to ``execute_async_v2``.
        inputs: Objects with ``host`` and ``device`` members, copied host to device.
        outputs: Objects with ``host`` and ``device`` members, copied device to host.
        stream: Stream on which the copies and the execution are enqueued.

    Returns:
        The ``host`` member of every entry in ``outputs``, in order.
    """
    # Transfer input data to the GPU.
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    # Run inference.
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    # Transfer predictions back from the GPU.
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    # Wait for all work enqueued above before reading the host buffers.
    stream.synchronize()
    # Return only the host outputs.
    return [out.host for out in outputs]
4.5 结果绘制出来看看
# Take the predictions of batch item 4 and draw the boxes that survive
# filtering and NMS onto the image resized to 640x640.
result = result[4]
img = cv2.resize(img, (640, 640))
boxes, confs, classes = filter_boxes(result, 0.5)
boxes, confs, classes = non_max_suppression(boxes, confs, classes)
for box, conf, cls in zip(boxes, confs, classes):
    x1, y1, x2, y2 = np.int32(box)
    cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)
cv2.imwrite('tmp.jpg', img)
*相关函数
def filter_boxes(pred, threshold):
    """Keep confident predictions and convert their boxes to corner format.

    Args:
        pred: Array whose last axis holds center x, center y, width, height,
            box confidence, then the per-class scores. It is not modified.
        threshold: A row is kept only when its box confidence (index 4) is
            strictly greater than this value.

    Returns:
        ``(boxes, confs, classes)`` for the kept rows: ``boxes`` as
        (x1, y1, x2, y2), ``confs`` the highest class score of each row and
        ``classes`` the index of that score.
    """
    result = pred.copy()  # work on a copy so the caller's array is untouched
    # Center/size -> corners: top-left first, then bottom-right derived from it.
    result[..., :2] = result[..., :2] - result[..., 2:4] * 0.5
    result[..., 2:4] = result[..., :2] + result[..., 2:4]
    result_selected = result[np.where(result[..., 4] > threshold)]
    boxes = result_selected[..., :4]
    classes = np.argmax(result_selected[..., 5:], axis=-1)
    confs = np.max(result_selected[..., 5:], axis=-1)
    return boxes, confs, classes
def non_max_suppression(boxes, confs, classes, iou_thres=0.6):
    """Greedy non-maximum suppression over corner-format boxes.

    Boxes are visited in descending confidence order; each visited box is
    kept and every remaining box whose IoU with it exceeds ``iou_thres`` is
    dropped. Class labels are not taken into account when suppressing.

    Args:
        boxes: Array of shape (N, 4) holding (x1, y1, x2, y2).
        confs: Array of N confidence scores.
        classes: Array of N class indices.
        iou_thres: Boxes with IoU less than or equal to this value survive.

    Returns:
        ``(boxes, confs, classes)`` restricted to the kept entries, in
        descending confidence order.
    """
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    # The +1 treats coordinates as inclusive pixel indices.
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = confs.flatten().argsort()[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        rest = order[1:]
        # Intersection of the current box with every remaining box.
        xx1 = np.maximum(x1[i], x1[rest])
        yy1 = np.maximum(y1[i], y1[rest])
        xx2 = np.minimum(x2[i], x2[rest])
        yy2 = np.minimum(y2[i], y2[rest])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[rest] - inter)
        inds = np.where(ovr <= iou_thres)[0]
        # inds indexes into order[1:], hence the +1 to map back into order.
        order = order[inds + 1]
    boxes = boxes[keep]
    confs = confs[keep]
    classes = classes[keep]
    return boxes, confs, classes