My environment: Ubuntu 18.04, kernel 5.4, CUDA 11.2, RTX 3050, CMake 3.22, no cuDNN.
In your terminal virtual environment, run:
pip install onnx
pip install onnxruntime-gpu (this second command is only needed when running inference through ONNX)
Note: at run time you must supply an img_size that matches the img_size you will later use in detect, and it also seems it must be a multiple of 64.
After the changes, inference runs, but the output contains no detection boxes.
Cause: after some debugging, it turns out the pred tensor from PyTorch has shape 1x30240x30, where 30 = 4 (xywh) + 1 (confidence) + 8 (landmarks) + 17 (classes), while the ONNX pred has shape 1x3x96x96x30. The onnxruntime output still needs decoding/post-processing that export.py does not perform; that logic lives in main.py under the torch2tensorrt folder. So the next step is to convert directly to TensorRT.
ps: if the line model.model[-1].export = True # set Detect() layer export=True is commented out, the convolution outputs are correct, but the model can no longer be converted to ONNX.
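To see this for yourself, the undecoded ONNX outputs can be inspected with onnxruntime; a minimal sketch (the model path is a placeholder for your own export):

import numpy as np
import onnxruntime as ort

# Minimal sketch: print the raw, undecoded output shapes of the export.
sess = ort.InferenceSession("2000+3000.onnx")
inp = sess.get_inputs()[0]
print(inp.name, inp.shape)  # fixed input shape baked in at export time
dummy = np.zeros([d if isinstance(d, int) else 1 for d in inp.shape], dtype=np.float32)
for out in sess.run(None, {inp.name: dummy}):
    print(out.shape)  # raw grid tensors, e.g. (1, 3, 96, 96, 30)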
Three ways to install CUDA and cuDNN on Ubuntu 16.04 (all personally verified) - 隔壁老王's blog - CSDN
ps: read the first comment under that post first
https://developer.nvidia.cn/nvidia-tensorrt-7x-download
Download the .tar package.
TensorRT deployment of yolov5 [tensorrt+cudnn@host] - epic_Lin's blog - CSDN
After installing per that blogger's walkthrough, running import tensorrt raised:
libcudnn.so.8: cannot open shared object file: ...
Fix: edit the environment variables.
In a terminal, run gedit ~/.bashrc
Comment out the line added earlier:
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/TensorRT-7.2.2.3/lib
and add:
export LD_LIBRARY_PATH="/opt/TensorRT-7.2.2.3/lib:$LD_LIBRARY_PATH"
export PATH="/usr/local/cuda-11.1/bin:$PATH"
export LD_LIBRARY_PATH="/usr/local/cuda-11.1/lib64:$LD_LIBRARY_PATH"
Save and close (ps: substitute your own version numbers and paths).
Then run source ~/.bashrc
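To confirm the paths now resolve, a quick sanity check from the same terminal:
python -c "import tensorrt; print(tensorrt.__version__)"
If this prints your TensorRT version without the libcudnn error, the library paths are set correctly.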
After that, the script runs from the command line, but running it inside PyCharm still raises the original error.
Cause: the newly configured path was not added to PyCharm's environment variables.
Fix:
In PyCharm, open Edit Configurations (top right) --> Environment --> Environment variables,
and paste /usr/local/cuda-11.1/lib64 into LD_LIBRARY_PATH there.
TensorFlow runs in the terminal and in a PyCharm opened from the terminal, but not in a PyCharm opened directly - Youpeng - cnblogs
Execution stalls at: Building an engine from file/home/luoxinhao/桌面/yolov5-face-master-lxh/yoloface训练/torch2tensorrt/2000+3000esay.onnx' this may take a while...
Fix: wait about ten minutes (it really is just that slow...).
The original code:
import os
import sys
import cv2
import copy
import torch

root_path = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))  # project root: current file's directory, then one level up
sys.path.append(root_path)  # add the project root to sys.path

from utils.general import check_img_size, non_max_suppression_face, scale_coords, xyxy2xywh
from utils.datasets import letterbox
from torch2tensorrt.yolo_trt_model import YoloTrtModel
from detect_face import scale_coords_landmarks, show_results

cur_path = os.path.abspath(os.path.dirname(__file__))

def img_process(img_path, long_side=320, stride_max=32):
    '''
    Image preprocessing
    '''
    orgimg = cv2.imread(img_path)
    img0 = copy.deepcopy(orgimg)
    h0, w0 = orgimg.shape[:2]  # orig hw
    r = long_side / max(h0, w0)  # resize image to img_size
    if r != 1:  # always resize down, only resize up if training with augmentation
        interp = cv2.INTER_AREA if r < 1 else cv2.INTER_LINEAR
        img0 = cv2.resize(img0, (int(w0 * r), int(h0 * r)), interpolation=interp)
    imgsz = check_img_size(long_side, s=stride_max)  # check img_size
    img = letterbox(img0, new_shape=imgsz, auto=False)[0]  # auto=True: minimal rectangle; auto=False: fixed size
    # Convert
    img = img[:, :, ::-1].transpose(2, 0, 1).copy()  # BGR to RGB, to 3xHxW
    img = torch.from_numpy(img)
    img = img.float()  # uint8 to fp16/32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)
    return img, orgimg

def img_vis(img, orgimg, pred, device, vis_thres=0.6):
    '''
    Visualize predictions
    vis_thres: visualization threshold
    '''
    print('img.shape: ', img.shape)
    print('orgimg.shape: ', orgimg.shape)
    no_vis_nums = 0
    # Process detections
    for i, det in enumerate(pred):  # detections per image
        gn = torch.tensor(orgimg.shape)[[1, 0, 1, 0]].to(device)  # normalization gain whwh
        gn_lks = torch.tensor(orgimg.shape)[[1, 0, 1, 0, 1, 0, 1, 0, 1, 0]].to(device)  # normalization gain landmarks
        if len(det):
            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], orgimg.shape).round()
            # Print results
            for c in det[:, -1].unique():
                n = (det[:, -1] == c).sum()  # detections per class
            det[:, 5:15] = scale_coords_landmarks(img.shape[2:], det[:, 5:15], orgimg.shape).round()
            for j in range(det.size()[0]):
                if det[j, 4].cpu().numpy() < vis_thres:
                    no_vis_nums += 1
                    continue
                xywh = (xyxy2xywh(det[j, :4].view(1, 4)) / gn).view(-1).tolist()
                conf = det[j, 4].cpu().numpy()
                landmarks = (det[j, 5:15].view(1, 10) / gn_lks).view(-1).tolist()
                class_num = det[j, 15].cpu().numpy()
                orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)
    cv2.imwrite(cur_path + '/result.jpg', orgimg)
    print('result saved to ' + cur_path + '/result.jpg')

if __name__ == '__main__':
    # ============ Parameters ================
    img_path = cur_path + "/sample.jpg"  # test image path
    device = "cuda:0"
    onnx_model_path = cur_path + "/2000+3000.onnx"  # ONNX model path
    fp16_mode = True  # True for FP16 inference
    # ============ Image preprocessing ================
    img, orgimg = img_process(img_path)  # [1,3,320,320]
    # ============ TensorRT inference ================
    # Initialize the TensorRT engine
    yolo_trt_model = YoloTrtModel(device, onnx_model_path, fp16_mode)
    # Timed section = tensorrt inference + torch post-processing
    pred = yolo_trt_model(img.cpu().numpy())  # tensorrt inference
    pred = yolo_trt_model.after_process(pred, device)  # torch post-processing
    # Apply NMS
    pred = non_max_suppression_face(pred, conf_thres=0.3, iou_thres=0.5)
    # ============ Visualization ================
    img_vis(img, orgimg, pred, device)
After the changes:
import os
import sys
import cv2
import copy
import torch

root_path = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))  # project root: current file's directory, then one level up
sys.path.append(root_path)  # add the project root to sys.path

from utils.general import check_img_size, non_max_suppression_face, scale_coords, xyxy2xywh
from utils.datasets import letterbox
from torch2tensorrt.yolo_trt_model import YoloTrtModel
from detect_face import scale_coords_landmarks, show_results

cur_path = os.path.abspath(os.path.dirname(__file__))

def img_process(img_path, long_side=640, stride_max=32):
    '''
    Image preprocessing
    '''
    orgimg = cv2.imread(img_path)
    img0 = copy.deepcopy(orgimg)
    h0, w0 = orgimg.shape[:2]  # orig hw
    r = long_side / max(h0, w0)  # resize image to img_size
    if r != 1:  # always resize down, only resize up if training with augmentation
        interp = cv2.INTER_AREA if r < 1 else cv2.INTER_LINEAR
        img0 = cv2.resize(img0, (int(w0 * r), int(h0 * r)), interpolation=interp)
    imgsz = check_img_size(long_side, s=stride_max)  # check img_size
    img = letterbox(img0, new_shape=imgsz, auto=False)[0]  # auto=True: minimal rectangle; auto=False: fixed size
    # Convert
    img = img[:, :, ::-1].transpose(2, 0, 1).copy()  # BGR to RGB, to 3xHxW
    img = torch.from_numpy(img)
    img = img.float()  # uint8 to fp16/32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)
    return img, orgimg

def img_vis(img, orgimg, pred, device, vis_thres=0.6):
    '''
    Visualize predictions
    vis_thres: visualization threshold
    '''
    print('img.shape: ', img.shape)
    print('orgimg.shape: ', orgimg.shape)
    no_vis_nums = 0
    # Process detections
    for i, det in enumerate(pred):  # detections per image
        gn = torch.tensor(orgimg.shape)[[1, 0, 1, 0]].to(device)  # normalization gain whwh
        gn_lks = torch.tensor(orgimg.shape)[[1, 0, 1, 0, 1, 0, 1, 0]].to(device)  # normalization gain landmarks (two entries removed)
        if len(det):
            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], orgimg.shape).round()
            # Print results
            for c in det[:, -1].unique():
                n = (det[:, -1] == c).sum()  # detections per class
            det[:, 5:13] = scale_coords_landmarks(img.shape[2:], det[:, 5:13], orgimg.shape).round()  # 15 -> 13
            for j in range(det.size()[0]):
                if det[j, 4].cpu().numpy() < vis_thres:
                    no_vis_nums += 1
                    continue
                xywh = (xyxy2xywh(det[j, :4].view(1, 4)) / gn).view(-1).tolist()
                conf = det[j, 4].cpu().numpy()
                landmarks = (det[j, 5:13].view(1, 8) / gn_lks).view(-1).tolist()  # 15 -> 13, 10 -> 8
                class_num = det[j, 13].cpu().numpy()  # 15 -> 13
                orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)
    cv2.imwrite(cur_path + '/result.jpg', orgimg)
    print('result saved to ' + cur_path + '/result.jpg')

if __name__ == '__main__':
    # ============ Parameters ================
    img_path = cur_path + "/sample.jpeg"  # test image path
    device = "cuda:0"
    onnx_model_path = cur_path + "/2000+3000.onnx"  # ONNX model path
    fp16_mode = True  # True for FP16 inference
    # ============ Image preprocessing ================
    img, orgimg = img_process(img_path)  # [1,3,640,640]
    # ============ TensorRT inference ================
    # Initialize the TensorRT engine
    yolo_trt_model = YoloTrtModel(device, onnx_model_path, fp16_mode)
    # Timed section = tensorrt inference + torch post-processing
    pred = yolo_trt_model(img.cpu().numpy())  # tensorrt inference
    pred = yolo_trt_model.after_process(pred, device)  # torch post-processing
    # Apply NMS
    pred = non_max_suppression_face(pred, conf_thres=0.3, iou_thres=0.5)
    # ============ Visualization ================
    img_vis(img, orgimg, pred, device)
First, point the model path at the .trt file instead of the .onnx file.
Then go into YoloTrtModel and comment out the ONNX-->TensorRT conversion code.
ps: watch the image-size match.
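For reference, loading a serialized engine with the TensorRT 7 Python API looks roughly like this; a sketch of what the project-specific Init_TensorRT helper is assumed to do internally (load_engine is an illustrative name, not from the repo):

import tensorrt as trt

TRT_LOGGER = trt.Logger(trt.Logger.WARNING)

def load_engine(trt_engine_path):
    # Deserialize a previously built .trt file so startup skips the slow
    # ONNX parsing / engine building step.
    with open(trt_engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())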
pycuda._driver.LogicError: cuMemcpyHtoDAsync failed: invalid argument
Cause: the input image size does not match the model's. (The ONNX export has a fixed input size, so the TRT engine's is fixed too; the only thing you can change is the input image size.)
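One way to catch the mismatch before the host-to-device copy is to compare the input array against the engine's binding shape (TensorRT 7 API; engine is a deserialized engine as above, binding 0 the network input; check_input is an illustrative helper):

def check_input(engine, img):
    # img: the numpy array about to be copied to the device
    expected = tuple(engine.get_binding_shape(0))  # e.g. (1, 3, 640, 640)
    assert tuple(img.shape) == expected, \
        "input %s does not match engine binding %s" % (img.shape, expected)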
ValueError: cannot reshape array of size 576000 into shape (1,3,80,80,16)
Cause: I changed the model to 4 landmarks and 17 classes, so the output feature shapes must be rewritten: 16 becomes 30 (xywh + confidence + landmarks + classes).
# output feature shapes
self.stride8_shape=(1,3,80,80,16)
self.stride16_shape=(1,3,40,40,16)
self.stride32_shape=(1,3,20,20,16)
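The rule behind these shapes: per anchor the head predicts 4 box values + 1 objectness + 2 coordinates per landmark + one score per class, on grids downsampled by 8/16/32. A quick sketch of the arithmetic for this model (4 landmarks, 17 classes, 640x640 input):

n_landmarks, nc = 4, 17
no = 4 + 1 + 2 * n_landmarks + nc  # 30 values per anchor
for s in (8, 16, 32):
    print((1, 3, 640 // s, 640 // s, no))  # (1,3,80,80,30) (1,3,40,40,30) (1,3,20,20,30)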
Fix: modify the function def after_process(self, pred, device).
Original code:
def after_process(self, pred, device):
    '''
    PyTorch post-processing
    pred: tensorrt output
    device: "cuda:0"
    '''
    # strides 8, 16, 32
    stride = torch.tensor([8., 16., 32.]).to(device)
    x = [torch.from_numpy(pred[0]).to(device), torch.from_numpy(pred[1]).to(device), torch.from_numpy(pred[2]).to(device)]
    # ===== extracted from models/yolo.py =====
    no = 16  # 4 box coords + 1 confidence + 10 landmark coords + 1 class
    nl = 3
    grid = [torch.zeros(1).to(device)] * nl
    anchor_grid = torch.tensor([[[[[[4., 5.]]],
                                  [[[8., 10.]]],
                                  [[[13., 16.]]]]],
                                [[[[[23., 29.]]],
                                  [[[43., 55.]]],
                                  [[[73., 105.]]]]],
                                [[[[[146., 217.]]],
                                  [[[231., 300.]]],
                                  [[[335., 433.]]]]]]).to(device)
    z = []
    for i in range(len(x)):
        bs, ny, nx = x[i].shape[0], x[i].shape[2], x[i].shape[3]
        if grid[i].shape[2:4] != x[i].shape[2:4]:
            grid[i] = self._make_grid(nx, ny).to(x[i].device)
        y = torch.full_like(x[i], 0)
        y[..., [0, 1, 2, 3, 4, 15]] = x[i][..., [0, 1, 2, 3, 4, 15]].sigmoid()
        y[..., 5:15] = x[i][..., 5:15]
        # y = x[i].sigmoid()
        y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + grid[i].to(x[i].device)) * stride[i]  # xy
        y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * anchor_grid[i]  # wh
        # y[..., 5:15] = y[..., 5:15] * 8 - 4
        y[..., 5:7] = y[..., 5:7] * anchor_grid[i] + grid[i].to(x[i].device) * stride[i]  # landmark x1 y1
        y[..., 7:9] = y[..., 7:9] * anchor_grid[i] + grid[i].to(x[i].device) * stride[i]  # landmark x2 y2
        y[..., 9:11] = y[..., 9:11] * anchor_grid[i] + grid[i].to(x[i].device) * stride[i]  # landmark x3 y3
        y[..., 11:13] = y[..., 11:13] * anchor_grid[i] + grid[i].to(x[i].device) * stride[i]  # landmark x4 y4
        y[..., 13:15] = y[..., 13:15] * anchor_grid[i] + grid[i].to(x[i].device) * stride[i]  # landmark x5 y5
        # y[..., 5:7] = (y[..., 5:7] * 2 - 1) * anchor_grid[i]  # landmark x1 y1
        # y[..., 7:9] = (y[..., 7:9] * 2 - 1) * anchor_grid[i]  # landmark x2 y2
        # y[..., 9:11] = (y[..., 9:11] * 2 - 1) * anchor_grid[i]  # landmark x3 y3
        # y[..., 11:13] = (y[..., 11:13] * 2 - 1) * anchor_grid[i]  # landmark x4 y4
        # y[..., 13:15] = (y[..., 13:15] * 2 - 1) * anchor_grid[i]  # landmark x5 y5
        z.append(y.view(bs, -1, no))
    return torch.cat(z, 1)
Modified version:
def after_process(self, pred, device):
    '''
    PyTorch post-processing
    pred: tensorrt output
    device: "cuda:0"
    '''
    # strides 8, 16, 32
    stride = torch.tensor([8., 16., 32.]).to(device)
    x = [torch.from_numpy(pred[0]).to(device), torch.from_numpy(pred[1]).to(device), torch.from_numpy(pred[2]).to(device)]
    # ===== extracted from models/yolo.py =====
    no = 30  # 4 box coords + 1 confidence + 8 landmark coords + 17 classes
    nc = 17
    nl = 3
    grid = [torch.zeros(1).to(device)] * nl
    anchor_grid = torch.tensor([[[[[[4., 5.]]],
                                  [[[8., 10.]]],
                                  [[[13., 16.]]]]],
                                [[[[[23., 29.]]],
                                  [[[43., 55.]]],
                                  [[[73., 105.]]]]],
                                [[[[[146., 217.]]],
                                  [[[231., 300.]]],
                                  [[[335., 433.]]]]]]).to(device)
    z = []
    for i in range(len(x)):  # x is a list; iterate over its elements
        bs, ny, nx = x[i].shape[0], x[i].shape[2], x[i].shape[3]
        if grid[i].shape[2:4] != x[i].shape[2:4]:
            grid[i] = self._make_grid(nx, ny).to(x[i].device)
        y = torch.full_like(x[i], 0)
        class_range = list(range(5)) + list(range(13, 13 + nc))
        y[..., class_range] = x[i][..., class_range].sigmoid()  # sigmoid everything except the landmark values
        y[..., 5:13] = x[i][..., 5:13]
        # y = x[i].sigmoid()
        y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + grid[i].to(x[i].device)) * stride[i]  # xy
        y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * anchor_grid[i]  # wh
        # y[..., 5:15] = y[..., 5:15] * 8 - 4
        y[..., 5:7] = y[..., 5:7] * anchor_grid[i] + grid[i].to(x[i].device) * stride[i]  # landmark x1 y1
        y[..., 7:9] = y[..., 7:9] * anchor_grid[i] + grid[i].to(x[i].device) * stride[i]  # landmark x2 y2
        y[..., 9:11] = y[..., 9:11] * anchor_grid[i] + grid[i].to(x[i].device) * stride[i]  # landmark x3 y3
        y[..., 11:13] = y[..., 11:13] * anchor_grid[i] + grid[i].to(x[i].device) * stride[i]  # landmark x4 y4
        # y[..., 13:15] = y[..., 13:15] * anchor_grid[i] + grid[i].to(x[i].device) * stride[i]  # landmark x5 y5
        # y[..., 5:7] = (y[..., 5:7] * 2 - 1) * anchor_grid[i]  # landmark x1 y1
        # y[..., 7:9] = (y[..., 7:9] * 2 - 1) * anchor_grid[i]  # landmark x2 y2
        # y[..., 9:11] = (y[..., 9:11] * 2 - 1) * anchor_grid[i]  # landmark x3 y3
        # y[..., 11:13] = (y[..., 11:13] * 2 - 1) * anchor_grid[i]  # landmark x4 y4
        # y[..., 13:15] = (y[..., 13:15] * 2 - 1) * anchor_grid[i]  # landmark x5 y5
        z.append(y.view(bs, -1, no))
    return torch.cat(z, 1)
Engine initialization: 1.2982511520385742
tensorrt inference: 0.005248308181762695
torch post-processing: 0.0032684803009033203
NMS: 0.0015017986297607422
As the numbers show, the torch post-processing and NMS still run in PyTorch rather than in TensorRT, so these two stages account for a noticeable share of the total time.
Since main.py bundles the ONNX-->TensorRT conversion and TensorRT inference together, offers no way to adjust certain parameters, and cannot run on video, I decided to rewrite main.py as trtdetect.py.
Problem encountered: cv2.namedWindow() hangs and no image window appears.
Cause: the OpenCV environment is misconfigured; Qt is not installed.
Fix: search for qt in PyCharm's package manager and install it from there (pip install qt inside the virtual environment fails).
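A minimal smoke test for the GUI backend (the image path is a placeholder):

import cv2

img = cv2.imread("sample.jpg")  # placeholder path
cv2.namedWindow("test", cv2.WINDOW_NORMAL)
cv2.imshow("test", img)
cv2.waitKey(0)  # hangs without ever showing a window if the Qt backend is missing
cv2.destroyAllWindows()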
After the changes, trtdetect.py looks like this:
import os
import sys
import cv2
import copy
import torch
import argparse

root_path = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))  # project root: current file's directory, then one level up
sys.path.append(root_path)  # add the project root to sys.path

from utils.general import check_img_size, non_max_suppression_face, scale_coords, xyxy2xywh
from utils.datasets import letterbox
from torch2tensorrt.yolo_trt_model import YoloTrtModel
from detect_face import scale_coords_landmarks, show_results
from utils.torch_utils import time_synchronized

cur_path = os.path.abspath(os.path.dirname(__file__))

def img_process(img_path, img1=str(0), long_side=640, stride_max=32):
    '''
    Image preprocessing
    If the input is an image path, img_path is that path and img1 is 0.
    If the input is video, img_path is irrelevant and img1 is the raw frame.
    Whatever the input looks like, the image fed to detection ends up at the ONNX input size.
    (The last two parameters turn out to be unused: the input is always the original image,
    and the output size is always the ONNX size.)
    '''
    if img1 != str(0):  # video input, i.e. source is 0 (the default is none)
        orgimg = img1  # use the video frame directly
    else:  # not video
        orgimg = cv2.imread(img_path)
    img0 = copy.deepcopy(orgimg)
    # h0, w0 = orgimg.shape[:2]  # orig hw
    # r = long_side / max(h0, w0)  # resize image to img_size
    # if r != 1:  # always resize down, only resize up if training with augmentation
    #     interp = cv2.INTER_AREA if r < 1 else cv2.INTER_LINEAR
    #     img0 = cv2.resize(img0, (int(w0 * r), int(h0 * r)), interpolation=interp)
    #
    # imgsz = check_img_size(long_side, s=stride_max)  # check img_size
    img = letterbox(img0, new_shape=opt.trt_size, auto=False)[0]  # auto=True: minimal rectangle; False: fixed size. img0 is the unscaled image; img is the letterboxed rectangle.
    # Convert
    img = img[:, :, ::-1].transpose(2, 0, 1).copy()  # BGR to RGB, to 3xHxW
    img = torch.from_numpy(img)
    img = img.float()  # uint8 to fp16/32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)
    return img, orgimg

def img_vis(img, orgimg, pred, device, vis_thres=0.6, save_jpg=True):
    '''
    Visualize predictions
    vis_thres: visualization threshold
    img is only passed in for the print below.
    By default images are saved; video frames are shown with imshow and not saved.
    '''
    print(' img.shape: ', img.shape)
    print(' orgimg.shape: ', orgimg.shape)
    no_vis_nums = 0
    # Process detections
    for i, det in enumerate(pred):  # detections per image
        gn = torch.tensor(orgimg.shape)[[1, 0, 1, 0]].to(device)  # normalization gain whwh
        gn_lks = torch.tensor(orgimg.shape)[[1, 0, 1, 0, 1, 0, 1, 0]].to(device)  # normalization gain landmarks (two entries removed)
        if len(det):
            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], orgimg.shape).round()
            # Print results
            for c in det[:, -1].unique():
                n = (det[:, -1] == c).sum()  # detections per class
            det[:, 5:13] = scale_coords_landmarks(img.shape[2:], det[:, 5:13], orgimg.shape).round()  # 15 -> 13
            for j in range(det.size()[0]):
                if det[j, 4].cpu().numpy() < vis_thres:
                    no_vis_nums += 1
                    continue
                xywh = (xyxy2xywh(det[j, :4].view(1, 4)) / gn).view(-1).tolist()
                conf = det[j, 4].cpu().numpy()
                landmarks = (det[j, 5:13].view(1, 8) / gn_lks).view(-1).tolist()  # 15 -> 13, 10 -> 8
                class_num = det[j, 13].cpu().numpy()  # 15 -> 13
                orgimg = show_results(orgimg, xywh, conf, landmarks, class_num)
    if save_jpg:
        cv2.imwrite(cur_path + '/result.jpg', orgimg)
        print('result saved to ' + cur_path + '/result.jpg')

if __name__ == '__main__':
    # ============ Parameters ================
    parser = argparse.ArgumentParser()  # all paths are relative to the project root (yoloface训练)
    parser.add_argument('--weights', type=str, default='weights/2000+3000.trt', help='weights path')  # from yolov5/models/
    parser.add_argument('--trt_size', nargs='+', type=int, default=[640, 640], help='must same as onnx, input 2 numbers')  # used in letterbox; must match the onnx model
    parser.add_argument('--source', type=str, default='data/images/1.jpeg', help='file/dir/URL/glob, 0 for webcam')  # source; 0, 1, 2 or 200 are treated as video streams
    parser.add_argument('--cls', type=int, default=17, help='class numbers')  # used when initializing the engine; must match the onnx model
    parser.add_argument('--conf_thres', type=float, default=0.25, help='confidence threshold')  # used in NMS
    parser.add_argument('--iou_thres', type=float, default=0.45, help='NMS IoU threshold')  # used in NMS
    opt = parser.parse_args()
    if len(opt.trt_size) != 2:
        print("error: --trt_size must input 2 numbers")
        exit()
    # ============ Parameter handling =============
    device = "cuda:0"
    fp16_mode = True  # True for FP16 inference
    onnx_model_path = root_path + "/" + opt.weights  # model path
    # print(opt.img_size)
    # ============ Initialize the TensorRT engine ================
    t0 = time_synchronized()  # timing
    yolo_trt_model = YoloTrtModel(device, onnx_model_path, fp16_mode, opt.cls, opt.trt_size[0], opt.trt_size[1])
    t = time_synchronized()  # timing
    print("engine initialization:", t - t0)
    # ============ Video or image? ================
    if opt.source in (str(0), str(1), str(2), str(200)):
        capture = cv2.VideoCapture(int(opt.source))
        while True:
            t1 = time_synchronized()
            ret, frame = capture.read()  # ret is the status flag, frame is one video frame
            img, orgimg = img_process(opt.source, frame)  # frame preprocessing
            t2 = time_synchronized()  # preprocessing time
            pred = yolo_trt_model(img.cpu().numpy())  # tensorrt inference
            t3 = time_synchronized()  # tensorrt inference time
            pred = yolo_trt_model.after_process(pred, device)  # torch post-processing
            t4 = time_synchronized()  # torch post-processing time
            pred = non_max_suppression_face(pred, conf_thres=opt.conf_thres, iou_thres=opt.iou_thres)  # NMS
            t5 = time_synchronized()  # NMS time
            img_vis(img, orgimg, pred, device, save_jpg=False)  # visualization
            cv2.imshow("video", orgimg)  # orgimg is the same array as frame, annotated in place
            c = cv2.waitKey(1)
            if c == 27:  # press Esc to quit
                break
            t6 = time_synchronized()  # visualization time
            print("frame read:", t2 - t1)
            print("tensorrt inference:", t3 - t2)
            print("torch post-processing:", t4 - t3)
            print("NMS:", t5 - t4)
            print("visualization:", t6 - t5)
            print("total:", t6 - t1)
    else:
        img_path = root_path + "/" + opt.source  # image path
        t1 = time_synchronized()
        img, orgimg = img_process(img_path)  # image preprocessing
        t2 = time_synchronized()  # preprocessing time
        pred = yolo_trt_model(img.cpu().numpy())  # tensorrt inference
        t3 = time_synchronized()  # tensorrt inference time
        pred = yolo_trt_model.after_process(pred, device)  # torch post-processing
        t4 = time_synchronized()  # torch post-processing time
        pred = non_max_suppression_face(pred, conf_thres=opt.conf_thres, iou_thres=opt.iou_thres)  # NMS
        t5 = time_synchronized()  # NMS time
        img_vis(img, orgimg, pred, device)
        t6 = time_synchronized()  # visualization time
        print("image preprocessing:", t2 - t1)
        print("tensorrt inference:", t3 - t2)
        print("torch post-processing:", t4 - t3)
        print("NMS:", t5 - t4)
        print("visualization:", t6 - t5)
At the same time, the onnx-->trt conversion statement in YoloTrtModel() is commented out, and its __init__ is changed as follows:
def __init__(self, device_id="cuda:0", onnx_model_path=None, fp16_mode=False, cls=17, x=640, y=640):
    '''
    device_id: "cuda:0"
    onnx_model_path: path to the onnx model
    output_size: output size, e.g. (1,-1)
    fp16_mode: True for FP16 inference
    cls: number of classes
    x, y: image width and height
    '''
    trt_engine_path = onnx_model_path.replace('.onnx', '.trt')
    # Initialize TensorRT and load the trt engine file
    self.model_params = Init_TensorRT(trt_engine_path)
    self.cls = cls
    # output feature shapes
    self.stride8_shape = (1, 3, x // 8, y // 8, 13 + cls)
    self.stride16_shape = (1, 3, x // 16, y // 16, 13 + cls)
    self.stride32_shape = (1, 3, x // 32, y // 32, 13 + cls)
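Here 13 + cls is 4 box values + 1 confidence + 8 landmark coordinates; with cls=17 that reproduces the 30-channel output from before, and the grid sizes now follow the engine input size (x, y) instead of being hard-coded for 640.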
main.py then keeps only the onnx-->trt conversion:
if __name__ == '__main__':
    # ============ Parameters ================
    # img_path = cur_path + "/sample.jpeg"  # test image path
    # device = "cuda:0"
    onnx_model_path = cur_path + "/2000+3000.onnx"  # ONNX model path
    fp16_mode = True  # True for FP16 inference
    trt_engine_path = onnx_model_path.replace('.onnx', '.trt')
    ONNX_to_TensorRT(fp16_mode=fp16_mode, onnx_model_path=onnx_model_path, trt_engine_path=trt_engine_path)
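With the .trt engine produced by this main.py in place, trtdetect.py can then be run with the arguments defined above, for example (assuming it sits next to main.py in torch2tensorrt/):
python torch2tensorrt/trtdetect.py --weights weights/2000+3000.trt --trt_size 640 640 --source data/images/1.jpeg --cls 17
or with --source 0 to read from a webcam.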