1. Training: use the Ultralytics (U-version) PyTorch repo: https://github.com/ultralytics/yolov5
RuyiStudio quantization does not support the Focus layer, so the model structure is modified: the Focus layer is replaced with a plain convolution. (The replacement is not numerically equivalent to Focus, which does a space-to-depth slice before its conv, so the model is retrained with the modified yaml; a sketch of the original Focus op is given right after the yaml.) The upsample layers can also be replaced with deconvolution (transposed convolution). The modified structure is as follows:
# Parameters
nc: 1 # number of classes
depth_multiple: 0.33 # model depth multiple
width_multiple: 0.50 # layer channel multiple
#anchors:
# - [10,13, 16,30, 33,23] # P3/8
# - [30,61, 62,45, 59,119] # P4/16
# - [116,90, 156,198, 373,326] # P5/32
anchors:
- [48,56, 55,146, 129,94] # P3/8
- [126,221, 80,364, 233,145] # P4/16
- [182,433, 349,259, 396,499] # P5/32
# YOLOv5 backbone
backbone:
  # [from, number, module, args]
  # [[-1, 1, Focus, [64, 3]],  # 0-P1/2
  [[-1, 1, Conv, [64, 3, 2]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, C3, [128]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 9, C3, [256]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 9, C3, [512]],
   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
   [-1, 1, SPP, [1024, [5, 9, 13]]],
   [-1, 3, C3, [1024, False]],  # 9
  ]
# YOLOv5 head
head:
  [[-1, 1, Conv, [512, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   # [-1, 1, nn.ConvTranspose2d, [256, 256, 2, 2]],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, C3, [512, False]],  # 13
   [-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   # [-1, 1, nn.ConvTranspose2d, [128, 128, 2, 2]],
   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
   [-1, 3, C3, [256, False]],  # 17 (P3/8-small)
   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 14], 1, Concat, [1]],  # cat head P4
   [-1, 3, C3, [512, False]],  # 20 (P4/16-medium)
   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 10], 1, Concat, [1]],  # cat head P5
   [-1, 3, C3, [1024, False]],  # 23 (P5/32-large)
   [[17, 20, 23], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
  ]
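For reference, a minimal sketch of what the original Focus layer computes (simplified from the Ultralytics implementation; the BatchNorm/activation wrapped in its Conv helper is omitted). The strided slicing is the part the quantizer cannot handle, which is why the yaml above starts with a plain stride-2 Conv instead:
import torch
import torch.nn as nn

class Focus(nn.Module):
    # Space-to-depth: four strided slices concatenated on the channel axis,
    # followed by a convolution. Input (b, c, h, w) -> (b, 4c, h/2, w/2) -> conv.
    def __init__(self, c1, c2, k=1):
        super().__init__()
        self.conv = nn.Conv2d(c1 * 4, c2, k, 1, k // 2)

    def forward(self, x):
        return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2],
                                    x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1))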
2. PyTorch to ONNX
Export with the export.py that ships with https://github.com/ultralytics/yolov5, setting the opset to 10. Use the model after simplification with onnx-simplifier.
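If your version of export.py has no simplify option, the exported model can be simplified manually. A minimal sketch (assumes onnx and onnx-simplifier are installed, and that the exported file is named yolov5s.onnx):
import onnx
from onnxsim import simplify

model = onnx.load("yolov5s.onnx")       # model exported by export.py (opset 10)
model_sim, ok = simplify(model)         # fold constants, remove redundant ops
assert ok, "simplified model failed the check"
onnx.save(model_sim, "yolov5s_sim.onnx")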
3. ONNX to Caffe
Reference: https://blog.csdn.net/weixin_41012399/article/details/120066576?spm=1001.2014.3001.5501
Note that you will very likely hit a KeyError: xxx here. It is caused by extra nodes: the PyTorch-to-ONNX export in the previous step adds post-processing nodes after the three Detect convolutions (some posts suggest setting train=False; I tried it and it did not help). Remove the extra nodes manually; see the sketch below.
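One way to cut the graph is onnx.utils.extract_model (available in recent onnx releases): keep everything up to the outputs of the three 1x1 Detect convolutions and drop the decode nodes after them. The tensor names below are hypothetical; look up the real ones in your graph (for example with Netron):
import onnx

onnx.utils.extract_model(
    "yolov5s_sim.onnx",            # simplified model from step 2
    "yolov5s_sim_cut.onnx",        # model truncated at the three Detect conv outputs
    input_names=["images"],
    output_names=["output_p3", "output_p4", "output_p5"],  # hypothetical names
)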
4. Testing the Caffe model
Reference: https://blog.csdn.net/weixin_41012399/article/details/120066576?spm=1001.2014.3001.5501
The main problem here was that the predicted box sizes were far off; after a long investigation the cause turned out to be in the post-processing. The anchors used to compute wh are not the ones you computed yourself with k-means: you must use the anchor_grid parameter stored in the Detect layer of the model trained in step 1 (YOLOv5's AutoAnchor may rewrite the anchors during training, so the checkpoint values can differ from the yaml). Otherwise the box sizes are completely wrong; the sketch below reads anchor_grid straight from the checkpoint.
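A minimal sketch for inspecting anchor_grid in the trained checkpoint (assumes the checkpoint was saved by the Ultralytics trainer with a Detect layer like the one shown below; run it from inside the yolov5 repo so the pickled model classes can be resolved; best.pt is a placeholder path):
import torch

ckpt = torch.load("best.pt", map_location="cpu")    # placeholder checkpoint path
model = ckpt["model"].float()                       # Ultralytics checkpoints store the model under 'model'
detect = model.model[-1]                            # Detect is the last module
print(detect.anchor_grid.view(detect.nl, detect.na, 2))  # pixel-space anchors per level: use these in the Caffe post-processing
print(detect.stride)                                # strides, e.g. tensor([ 8., 16., 32.])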
Post-processing in PyTorch (the Detect layer; a few debug lines for comparing against the Caffe output are left in):
class Detect(nn.Module):
    stride = None  # strides computed during build
    onnx_dynamic = False  # ONNX export parameter

    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        super().__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors
        self.grid = [torch.zeros(1)] * self.nl  # init grid
        a = torch.tensor(anchors).float().view(self.nl, -1, 2)
        self.register_buffer('anchors', a)  # shape(nl,na,2)
        self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2))  # shape(nl,1,na,1,1,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv
        self.inplace = inplace  # use in-place ops (e.g. slice assignment)

    def forward(self, x):
        # x = x.copy()  # for profiling
        z = []  # inference output
        # debug: Caffe output dumped to a text file, loaded here for side-by-side comparison
        f = open('/home/zhanglu/yolov5-fishi/tensorrt/yolov5_caffe-master/build/output_caffe.txt', 'r')
        a = f.read()
        b = a.split('\n')
        ind = 0
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            if not self.training:  # inference
                if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)
                y = x[i].sigmoid()
                if self.inplace:
                    y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    a = self.anchor_grid[i]  # debug: inspect the anchors actually used
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh: uses anchor_grid, which is stored in the model's Detect layer
                else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
                    xy = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i].view(1, self.na, 1, 1, 2)  # wh
                    y = torch.cat((xy, wh, y[..., 4:]), -1)
                z.append(y.view(bs, -1, self.no))
        return x if self.training else (torch.cat(z, 1), x)
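The snippet above calls self._make_grid, which is not shown; in that era of the Ultralytics repo it is roughly the following static method of Detect:
    @staticmethod
    def _make_grid(nx=20, ny=20):
        # (x, y) cell offsets for an nx-by-ny feature map, shape (1, 1, ny, nx, 2)
        yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()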
Post-processing in Caffe:
void postProcessParall(const int height, const int width, int scale_idx, float postThres, float * origin_output, vector<int> Strides, vector<Anchor> Anchors, vector<Bbox> *bboxes)
{
    Bbox bbox;
    float cx, cy, w_b, h_b, score;
    int cid;
    const float *ptr = (float *)origin_output;
    cout << "ptr shape: " << *ptr << endl;  // debug: prints the first output value
    for(unsigned long a=0; a<3; ++a){
        for(unsigned long h=0; h<height; ++h){
            for(unsigned long w=0; w<width; ++w){
                const float *cls_ptr = ptr + 5;
                cid = argmax(cls_ptr, cls_ptr+NUM_CLASS);
                score = sigmoid(ptr[4]) * sigmoid(cls_ptr[cid]);
                if(score>=postThres){
                    cx = (sigmoid(ptr[0]) * 2.f - 0.5f + static_cast<float>(w)) * static_cast<float>(Strides[scale_idx]);
                    cy = (sigmoid(ptr[1]) * 2.f - 0.5f + static_cast<float>(h)) * static_cast<float>(Strides[scale_idx]);
                    w_b = powf(sigmoid(ptr[2]) * 2.f, 2) * Anchors[scale_idx * 3 + a].width*0.5;   // the anchors here are supplied manually; they must match the anchor_grid above
                    h_b = powf(sigmoid(ptr[3]) * 2.f, 2) * Anchors[scale_idx * 3 + a].height*0.5;
                    bbox.xmin = clip(cx - w_b / 2, 0.F, static_cast<float>(INPUT_W - 1));
                    bbox.ymin = clip(cy - h_b / 2, 0.f, static_cast<float>(INPUT_H - 1));
                    bbox.xmax = clip(cx + w_b / 2, 0.f, static_cast<float>(INPUT_W - 1));
                    bbox.ymax = clip(cy + h_b / 2, 0.f, static_cast<float>(INPUT_H - 1));
                    bbox.score = score;
                    bbox.cid = cid;
                    //std::cout<< "bbox.cid : " << bbox.cid << std::endl;
                    bboxes->push_back(bbox);
                }
                ptr += 5 + NUM_CLASS;
            }
        }
    }
}
One more issue is image pre-processing. The PyTorch pipeline does not pad the image to a fixed square with gray borders; it rescales while keeping the aspect ratio. For example, with the input size set to (640, 640), PyTorch actually preprocesses to something around 640, adjusted up or down to the nearest multiple of 32, so the Caffe/NNIE input has to be prepared accordingly (a simplified letterbox sketch follows the reference link below).
For the U-version YOLOv5 input pre-processing, see: https://blog.csdn.net/weixin_41012399/article/details/120907304
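A simplified letterbox sketch, based on the letterbox function in the Ultralytics repo (utils/datasets.py or utils/augmentations.py depending on the version); auto=True reproduces the PyTorch behaviour described above, auto=False pads to the full fixed input size a Caffe/NNIE model expects:
import cv2

def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), stride=32, auto=True):
    # Resize keeping the aspect ratio, then pad: either to the nearest stride
    # multiple (auto=True, Ultralytics dataloader behaviour) or to the full
    # new_shape (auto=False, fixed-input Caffe/NNIE behaviour).
    h, w = img.shape[:2]
    r = min(new_shape[0] / h, new_shape[1] / w)
    new_unpad = (int(round(w * r)), int(round(h * r)))         # (width, height) for cv2.resize
    dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1]
    if auto:
        dw, dh = dw % stride, dh % stride
    dw, dh = dw / 2, dh / 2                                     # pad both sides equally
    img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    return cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)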
5. Quantization: in step 1 the training input is RGB and normalized (divided by 255), so just make sure the quantization options match that pre-processing (channel order and normalization).
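For reference, the pre-processing that the quantization settings need to reproduce looks roughly like this (illustrative only; sample.jpg is a placeholder, and the plain resize stands in for the letterbox above):
import cv2
import numpy as np

img = cv2.imread("sample.jpg")                      # placeholder path; OpenCV loads BGR, HWC, uint8
img = cv2.resize(img, (640, 640))                   # stand-in for the letterbox above
img = img[:, :, ::-1].transpose(2, 0, 1)            # BGR -> RGB, HWC -> CHW (matches training input)
img = np.ascontiguousarray(img, dtype=np.float32) / 255.0  # normalize to [0, 1], no mean subtraction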
6. Hisilicon NNIE inference:
Reference post-processing code: https://gitee.com/shopping-tang/yolo_v5_nnie/tree/master
Here, again, make sure the anchors used are the anchor_grid values from the Detect layer.