板卡:MLU270-S4
模型:yolov7
模型链接:https://github.com/WongKinYiu/yolov7.git
pt文件:https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt
git clone https://github.com/WongKinYiu/yolov7.git
cd yolov7
wget https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt
pip install seaborn
备注:由于 YOLOv7 官方提供的模型是在高版本 PyTorch 下训练并保存的,在低版本 PyTorch(如 1.3)上运行时,需要先把权重另存为非 zip(nozip)格式的 pt 文件,同时需要增加对 SiLU 激活函数的支持。主要修改如下:
备注:需要在 attempt_load 中取消 fuse() 调用;如果不取消 fuse,会出现模型结构和权重对不上的情况。
--- a/models/experimental.py
+++ b/models/experimental.py
@@ -250,7 +250,8 @@ def attempt_load(weights, map_location=None):
for w in weights if isinstance(weights, list) else [weights]:
attempt_download(w)
ckpt = torch.load(w, map_location=map_location) # load
- model.append(ckpt['ema' if ckpt.get('ema') else 'model'].float().fuse().eval()) # FP32 model
+ #model.append(ckpt['ema' if ckpt.get('ema') else 'model'].float().fuse().eval()) # FP32 model
+ model.append(ckpt['ema' if ckpt.get('ema') else 'model'].float().eval()) # FP32 model
diff --git a/utils/datasets.py b/utils/datasets.py
index b4e56ad..9130546 100644
--- a/utils/datasets.py
+++ b/utils/datasets.py
@@ -24,7 +24,7 @@ import pickle
from copy import deepcopy
#from pycocotools import mask as maskUtils
from torchvision.utils import save_image
-from torchvision.ops import roi_pool, roi_align, ps_roi_pool, ps_roi_align
+#from torchvision.ops import roi_pool, roi_align, ps_roi_pool, ps_roi_align
from utils.general import check_requirements, xyxy2xywh, xywh2xyxy, xywhn2xyxy, xyn2xy, segment2box, segments2boxes, \
resample_segments, clean_str
@@ -188,7 +188,8 @@ class LoadImages: # for inference
#print(f'image {self.count}/{self.nf} {path}: ', end='')
# Padded resize
- img = letterbox(img0, self.img_size, stride=self.stride)[0]
+ #img = letterbox(img0, self.img_size, stride=self.stride)[0]
+ img = letterbox(img0, self.img_size, auto=False,stride=self.stride)[0]
# Convert
img = img[:, :, ::-1].transpose(2, 0, 1) # BGR to RGB, to 3x416x416
+def get_model(opt):
+ model = Model(opt.cfg).to(torch.device('cpu')).eval()
+ weights = opt.weights[0]
+ state_dict = torch.load(weights, map_location=torch.device('cpu'))
+ model.load_state_dict(state_dict,strict=False)
+ return model
+
+def get_empty_model(opt):
+ # Create model
+ model = Model(opt.cfg).to(torch.device('cpu')).eval()
+ model.model[-1].mlu_detection_output = opt.mlu_det
+ model.model[-1].conf_thres = opt.conf_thres
+ model.model[-1].iou_thres = opt.iou_thres
+ return model
diff --git a/models/yolo.py b/models/yolo.py
old mode 100644
new mode 100755
index 95a019c..e0f08c0
--- a/models/yolo.py
+++ b/models/yolo.py
@@ -26,7 +26,7 @@ class Detect(nn.Module):
end2end = False
include_nms = False
concat = False
-
+ mlu_detection_output = False
def __init__(self, nc=80, anchors=(), ch=()): # detection layer
super(Detect, self).__init__()
self.nc = nc # number of classes
@@ -39,10 +39,34 @@ class Detect(nn.Module):
self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2)) # shape(nl,1,na,1,1,2)
self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch) # output conv
+ self.img_h = 640
+ self.img_w = 640
+ self.conf_thres = 0.2
+ self.iou_thres = 0.45
+ self.maxBoxNum = 1024
+ # self.anchors_list = [[12., 16., 19., 36., 40., 28.], [36., 75., 76., 55., 72., 146.], [142., 110., 192., 243., 459., 401.]]
+ self.anchors_list = list(np.array(anchors).flatten())
+ self.num_anchors = len(self.anchors_list)
+
def forward(self, x):
# x = x.copy() # for profiling
z = [] # inference output
self.training |= self.export
+
+ if x[0].device.type == 'mlu':
+ for i in range(self.nl):
+ x[i] = self.m[i](x[i])
+ y = x[i].sigmoid()
+ z.append(y)
+ if self.mlu_detection_output:
+ print('\nyolo_detection_output,nc:{} anchors:{} \n conf_thres:{} iou_thres:{} img_w:{} img_h:{} \n'.format(self.nc,self.anchors_list,self.conf_thres, self.iou_thres,self.img_w,self.img_h))
+ detect_out = torch.ops.torch_mlu.yolov5_detection_output(z[0], z[1], z[2],
+ self.anchors_list,self.nc, self.num_anchors,
+ self.img_h, self.img_w, self.conf_thres, self.iou_thres, self.maxBoxNum)
+ return detect_out
+ else:
+ return tuple(z)
+
备注:
默认 mlu_detection_output 为 False,此时采用 CPU 进行 YOLO detect(后处理)操作。
SDK 中没有 YOLOv7 专用的 detection_output 算子,暂时使用 YOLOv5 的算子(yolov5_detection_output),单张图片的测试结果接近。如后续测试发现问题,可以直接改用 CPU 进行 NMS 操作。
diff --git a/models/common.py b/models/common.py
index edb5edc..1de2919 100644
--- a/models/common.py
+++ b/models/common.py
@@ -8,9 +8,9 @@ import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
-from torchvision.ops import DeformConv2d
+#from torchvision.ops import DeformConv2d
from PIL import Image
-from torch.cuda import amp
+#from torch.cuda import amp
diff --git a/detect.py b/detect.py
old mode 100644
new mode 100755
index 5e0c441..eeed3f5
--- a/detect.py
+++ b/detect.py
@@ -7,6 +7,12 @@ import torch
import torch.backends.cudnn as cudnn
from numpy import random
+import torch.nn as nn
+from utils.activations import Hardswish, SiLU
+nn.modules.activation.SiLU = SiLU
+nn.modules.activation.Hardswish=Hardswish
+nn.SiLU = SiLU
+nn.Hardswish = Hardswish
from models.experimental import attempt_load
from utils.datasets import LoadStreams, LoadImages
from utils.general import check_img_size, check_requirements, check_imshow, non_max_suppression, apply_classifier, \
@@ -31,7 +37,11 @@ def detect(save_img=False):
half = device.type != 'cpu' # half precision only supported on CUDA
# Load model
- model = attempt_load(weights, map_location=device) # load FP32 model
+ # model = attempt_load(weights, map_location=device) # load FP32 model
+ from models.yolo import get_model
+ model = get_model(opt)
+ #print(model)
+
stride = int(model.stride.max()) # model stride
imgsz = check_img_size(imgsz, s=stride) # check img_size
@@ -183,6 +193,7 @@ if __name__ == '__main__':
parser.add_argument('--name', default='exp', help='save results to project/name')
parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
parser.add_argument('--no-trace', action='store_true', help='don`t trace model')
+ parser.add_argument('--cfg', type=str, default='yolor-csp-c.yaml', help='model.yaml')
opt = parser.parse_args()
print(opt)
#check_requirements(exclude=('pycocotools', 'thop'))
备注:
转换成 nozip 格式的模型,可以在训练模型的容器或者 PyTorch 版本大于等于 1.6 的环境中进行。
转换后的模型需分别在 PyTorch 版本大于等于 1.6 的环境和 MLU 容器内基于 CPU 运行,以验证模型的正确性。
python mlu/gen_unzipmodel.py
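转换程序(mlu/gen_unzipmodel.py)如下。注意:脚本把 yolov7_unzip.pt 保存在当前工作目录,而后续命令使用的是 mlu/weight/yolov7_unzip.pt,运行后需要将生成的文件移动(或拷贝)到 mlu/weight/ 目录: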
import argparse
import time
from pathlib import Path
import cv2
import torch
import torch.backends.cudnn as cudnn
from numpy import random
import sys
import os
# Put the directory one level above this script's directory on sys.path so
# that the `models` package can be imported below.
script_dir = os.path.dirname(os.path.abspath(__file__))
prj_dir = os.path.dirname(script_dir)
print(prj_dir)
sys.path.append(prj_dir)

from models.experimental import attempt_load

weights = 'yolov7.pt'
pt_file = "yolov7_unzip.pt"

# Load the checkpoint on the CPU.
model = attempt_load(weights, map_location='cpu')  # load FP32 model

# Save only the state dict, with the zipfile-based serialization turned off.
print("save no zipfile ...")
unzipped_state = model.state_dict()
torch.save(unzipped_state, pt_file, _use_new_zipfile_serialization=False)
print("save %s end ..." % pt_file)
python detect.py --weights yolov7.pt --conf 0.25 --img-size 640 --source inference/images/horses.jpg --no-trace
python mlu/detect.py --weights mlu/weight/yolov7_unzip.pt --conf 0.25 --img-size 640 --source inference/images/horses.jpg --cfg ./cfg/deploy/yolov7.yaml --no-trace
## 模型量化
---
# model = attempt_load(weights, map_location=device) # load FP32 model
from models.yolo import get_model
model = get_model(opt)
# print(model)
# 配置量化参数
import torch_mlu
import torch_mlu.core.mlu_model as ct
import torch_mlu.core.mlu_quantize as mlu_quantize
qconfig={'use_avg':False, 'data_scale':1.0, 'firstconv':False, 'per_channel': False}
# 调用量化接口
quantized_net = mlu_quantize.quantize_dynamic_mlu(model,qconfig_spec=qconfig, dtype='int8', gen_quant=True)
# 设置为推理模式
quantized_net = quantized_net.eval().float()
model = quantized_net
---
# 保存量化模型
qua_weight = opt.qua_weight
print("SAVE quantize model:",qua_weight)
torch.save(model.state_dict(),qua_weight)
---
#增加qua_weight参数
parser.add_argument('--qua_weight', type=str,default='yolov7_intx.pth', help='model.pt path(s)')
python mlu/mlu_quant.py --weights mlu/weight/yolov7_unzip.pt --conf 0.25 --img-size 640 --source inference/images/horses.jpg --cfg ./cfg/deploy/yolov7.yaml --qua_weight mlu/weight/yolov7_intx.pth --no-trace
#模型加载
from models.yolo import get_model
model = get_model(opt)
# print(model)
import torch_mlu
import torch_mlu.core.mlu_model as ct
import torch_mlu.core.mlu_quantize as mlu_quantize
from postprocess import MLU_PostProcessYoloV7,PostProcessPytorchYoloV7,draw_image
from models.yolo import get_empty_model
model = get_empty_model(opt)
stride = 32
# stride = int(model.stride.max()) # model stride
imgsz = check_img_size(imgsz, s=stride) # check img_size
#配置 MLU core number
ct.set_core_number(opt.core_number)
# 设置输入图片的通道顺序,以决定首层卷积对三通道输入的补齐通道顺序。默认是 RGBA 顺序
#ct.set_input_format(0)
#配置MLU core类型
ct.set_core_version(opt.mcore)
torch.set_grad_enabled(False)
if opt.fake_device:
print("fake_device mode")
ct.set_device(-1)
device = ct.mlu_device()
print("run on %s ..."%device)
# 加载量化模型
weight = weights[0]
quantized_net = torch_mlu.core.mlu_quantize.quantize_dynamic_mlu(model)
print('weight:',weight)
state_dict = torch.load(weight)
quantized_net.load_state_dict(state_dict, strict=False)
# 设置为推理模式
quantized_net = quantized_net.eval().float()
quantized_net.to(device)
model = quantized_net
# 设置在线融合模式
if opt.jit:
if opt.save:
ct.save_as_cambricon(opt.mname)
example = torch.randn(opt.batch_size, 3, imgsz, imgsz,dtype=torch.float)
trace_input = torch.randn(1, 3, imgsz, imgsz,dtype=torch.float)
if opt.half_input:
print('half_input ')
trace_input = trace_input.type(torch.HalfTensor)
example = example.type(torch.HalfTensor)
print("jit trace example shape",example.shape)
model = torch.jit.trace(model,trace_input.to(device),check_trace=False)
#如果是生成离线模型,推理一次,直接退出,会保存离线模型
if opt.save:
print("save offline model mname: ",opt.mname)
model(example.to(device))
ct.save_as_cambricon('')
exit(0)
if opt.mlu_det:
postproc = MLU_PostProcessYoloV7()
else:
postproc = PostProcessPytorchYoloV7(conf_thres=opt.conf_thres,iou_thres=opt.iou_thres)
names = postproc.names
---
#推理及后处理部分
---
# Inference
t1 = time_synchronized()
with torch.no_grad(): # Calculating gradients would cause a GPU memory leak
detect_out = model(img)
# pred = model(img, augment=opt.augment)[0]
t2 = time_synchronized()
if len(detect_out) == 1:
pred = detect_out.cpu().type(torch.FloatTensor) if opt.half_input else detect_out.cpu()
else:
pred = [out.cpu().type(torch.FloatTensor) for out in detect_out]
# print("mlu pred:{} {}".format(type(pred),pred))
# from mlu.tools.dump_npy import save_npy
# save_npy(pred,"mlu")
# # from postprocess import draw_image
if opt.mlu_det:
pred = postproc.get_boxes(pred)
p, s, im0, frame = path, '', im0s, getattr(dataset, 'frame', 0)
p = Path(p) # to Path
save_path = str(save_dir / p.name) # img.jpg
print(save_path)
draw_image(pred, img, im0s, path, save_path, names)
exit(0)
pred = postproc.yolo_det(pred)[0]
备注:后处理的实现参见 postprocess.py,MLU 推理的完整代码参见 mlu_detect.py。
#逐层运行+CPU NMS
python mlu/mlu_detect.py --weights mlu/weight/yolov7_intx.pth --conf 0.25 --img-size 640 --source inference/images/horses.jpg --cfg ./cfg/deploy/yolov7.yaml
# 融合模型运行+CPU NMS
python mlu/mlu_detect.py --weights mlu/weight/yolov7_intx.pth --conf 0.25 --img-size 640 --source inference/images/horses.jpg --cfg ./cfg/deploy/yolov7.yaml --jit
#生成MLU270离线模型 4B4C(融合模型 + CPU NMS)
python mlu/mlu_detect.py --weights mlu/weight/yolov7_intx.pth --conf 0.25 --img-size 640 --source inference/images/horses.jpg --cfg ./cfg/deploy/yolov7.yaml --jit --mcore MLU270 --mname yolov7_4b4c --core 4 --batch 4 --save
#逐层运行 + MLU yolo detection output
python mlu/mlu_detect.py --weights mlu/weight/yolov7_intx.pth --conf 0.25 --img-size 640 --source inference/images/horses.jpg --cfg ./cfg/deploy/yolov7.yaml --mlu_det
#融合模型运行 + MLU yolo detection output
python mlu/mlu_detect.py --weights mlu/weight/yolov7_intx.pth --conf 0.25 --img-size 640 --source inference/images/horses.jpg --cfg ./cfg/deploy/yolov7.yaml --jit --mlu_det
#生成MLU270离线模型 4B4C
python mlu/mlu_detect.py --weights mlu/weight/yolov7_intx.pth --conf 0.25 --img-size 640 --source inference/images/horses.jpg --cfg ./cfg/deploy/yolov7.yaml --jit --mlu_det --mcore MLU270 --mname yolov7_4b4c --core 4 --batch 4 --save
#生成MLU270离线模型 1B4C
python mlu/mlu_detect.py --weights mlu/weight/yolov7_intx.pth --conf 0.25 --img-size 640 --source inference/images/horses.jpg --cfg ./cfg/deploy/yolov7.yaml --jit --mlu_det --mcore MLU270 --mname yolov7_1b4c --core 4 --batch 1 --save
#生成MLU220离线模型 1B4C
python mlu/mlu_detect.py --weights mlu/weight/yolov7_intx.pth --conf 0.25 --img-size 640 --source inference/images/horses.jpg --cfg ./cfg/deploy/yolov7.yaml --jit --mlu_det --mcore MLU220 --mname mlu220_yolov7_1b4c --core 4 --batch 1 --save