Deploying YOLOv5 on the HiSilicon Hi3516CV500

1. Train with the Ultralytics PyTorch repo: https://github.com/ultralytics/yolov5
RuyiStudio does not support the Focus layer at quantization time, so modify the network: replace the Focus layer with a plain convolution. You can also replace the upsampling layers with deconvolution. The resulting structure is as follows (a simplified sketch of the original Focus layer follows the config):

# Parameters
nc: 1  # number of classes
depth_multiple: 0.33  # model depth multiple
width_multiple: 0.50  # layer channel multiple
#anchors:
#  - [10,13, 16,30, 33,23]  # P3/8
#  - [30,61, 62,45, 59,119]  # P4/16
#  - [116,90, 156,198, 373,326]  # P5/32

anchors:
  - [48,56, 55,146, 129,94]  # P3/8
  - [126,221, 80,364, 233,145]  # P4/16
  - [182,433, 349,259, 396,499]  # P5/32

# YOLOv5 backbone
backbone:
  # [from, number, module, args]
#  [[-1, 1, Focus, [64, 3]],  # 0-P1/2
   [[-1, 1, Conv, [64, 3, 2]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, C3, [128]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 9, C3, [256]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 9, C3, [512]],
   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
   [-1, 1, SPP, [1024, [5, 9, 13]]],
   [-1, 3, C3, [1024, False]],  # 9
  ]

# YOLOv5 head
head:
  [[-1, 1, Conv, [512, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
#   [-1, 1, nn.ConvTranspose2d, [256, 256, 2, 2]],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, C3, [512, False]],  # 13

   [-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
#   [-1, 1, nn.ConvTranspose2d, [128, 128, 2, 2]],
   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
   [-1, 3, C3, [256, False]],  # 17 (P3/8-small)

   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 14], 1, Concat, [1]],  # cat head P4
   [-1, 3, C3, [512, False]],  # 20 (P4/16-medium)

   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 10], 1, Concat, [1]],  # cat head P5
   [-1, 3, C3, [1024, False]],  # 23 (P5/32-large)

   [[17, 20, 23], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
  ]
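
For context, here is what the removed Focus layer did, in simplified form (a sketch; the real module wraps the conv with BN and an activation): it slices the input into four pixel-phase sub-images, a space-to-depth step the NNIE mapper cannot handle, and then convolves, which is why a plain 3x3 stride-2 Conv is a mapper-friendly substitute:

import torch
import torch.nn as nn

class Focus(nn.Module):
    # Simplified YOLOv5 Focus: space-to-depth slicing + conv.
    # x(b,c,h,w) -> four phase slices concatenated to (b,4c,h/2,w/2) -> conv.
    def __init__(self, c1, c2, k=3):
        super().__init__()
        self.conv = nn.Conv2d(c1 * 4, c2, k, 1, k // 2)

    def forward(self, x):
        return self.conv(torch.cat([x[..., ::2, ::2], x[..., 1::2, ::2],
                                    x[..., ::2, 1::2], x[..., 1::2, 1::2]], 1))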

2. PyTorch to ONNX
Export with the export.py that ships with https://github.com/ultralytics/yolov5, choosing opset 10, and use the model simplified with onnx-simplifier; see the sketch below.
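For example (a sketch; flag names vary across yolov5 versions, and older releases keep export.py under models/ with the opset hardcoded):

python export.py --weights best.pt --img-size 640 --batch-size 1 --opset 10
python -m onnxsim best.onnx best-sim.onnx
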
3. ONNX to Caffe
Reference: https://blog.csdn.net/weixin_41012399/article/details/120066576?spm=1001.2014.3001.5501
Note that you will very likely hit a KeyError: xxx here. It is a node problem: the PyTorch-to-ONNX export in the previous step leaves extra post-processing nodes in the graph (some posts suggest exporting with train=False; I tried it and it did not help). Remove the extra nodes manually, as in the sketch below.
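One way to cut them off is onnx.utils.extract_model (a sketch; the output tensor names below are hypothetical: open the graph in Netron and use the outputs of the three 1x1 detection convolutions as the cut points):

import onnx

# Output tensor names are placeholders; read the real ones from your graph.
onnx.utils.extract_model('best-sim.onnx', 'best-cut.onnx',
                         input_names=['images'],
                         output_names=['out_p3', 'out_p4', 'out_p5'])
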
4. Testing the Caffe model
Reference: https://blog.csdn.net/weixin_41012399/article/details/120066576?spm=1001.2014.3001.5501
The main problem I hit here was that the predicted boxes were wildly off in size. After a long hunt, the cause turned out to be in the post-processing: the anchors used to compute wh are not the ones I had computed myself with k-means. You must use the anchor_grid parameter saved in the Detect layer of the model trained in step 1; otherwise the box sizes come out completely wrong.
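
To check the values that must be carried over, read them straight from the checkpoint (a minimal sketch; 'best.pt' is a placeholder path). Note that during model construction the anchors buffer is divided by the stride, so it holds grid units, while anchor_grid stays in input-image pixels; the Caffe-side decode needs the pixel values:

import torch

ckpt = torch.load('best.pt', map_location='cpu')  # placeholder path
detect = ckpt['model'].model[-1]  # the Detect layer is the last module
print(detect.stride)              # e.g. tensor([ 8., 16., 32.])
print(detect.anchor_grid.view(detect.nl, detect.na, 2))  # anchors in pixels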
Post-processing in PyTorch:

import torch
import torch.nn as nn


class Detect(nn.Module):
    stride = None  # strides computed during build
    onnx_dynamic = False  # ONNX export parameter

    def __init__(self, nc=80, anchors=(), ch=(), inplace=True):  # detection layer
        super().__init__()
        self.nc = nc  # number of classes
        self.no = nc + 5  # number of outputs per anchor
        self.nl = len(anchors)  # number of detection layers
        self.na = len(anchors[0]) // 2  # number of anchors
        self.grid = [torch.zeros(1)] * self.nl  # init grid
        a = torch.tensor(anchors).float().view(self.nl, -1, 2)
        self.register_buffer('anchors', a)  # shape(nl,na,2)
        self.register_buffer('anchor_grid', a.clone().view(self.nl, 1, -1, 1, 1, 2))  # shape(nl,1,na,1,1,2)
        self.m = nn.ModuleList(nn.Conv2d(x, self.no * self.na, 1) for x in ch)  # output conv
        self.inplace = inplace  # use in-place ops (e.g. slice assignment)

    def forward(self, x):
        # x = x.copy()  # for profiling
        z = []  # inference output
        for i in range(self.nl):
            x[i] = self.m[i](x[i])  # conv
            bs, _, ny, nx = x[i].shape  # x(bs,255,20,20) to x(bs,3,20,20,85)
            x[i] = x[i].view(bs, self.na, self.no, ny, nx).permute(0, 1, 3, 4, 2).contiguous()
            if not self.training:  # inference
                if self.grid[i].shape[2:4] != x[i].shape[2:4] or self.onnx_dynamic:
                    self.grid[i] = self._make_grid(nx, ny).to(x[i].device)

                y = x[i].sigmoid()
                if self.inplace:
                    y[..., 0:2] = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    y[..., 2:4] = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i]  # wh; uses anchor_grid, which is saved in the model's Detect layer
                else:  # for YOLOv5 on AWS Inferentia https://github.com/ultralytics/yolov5/pull/2953
                    xy = (y[..., 0:2] * 2. - 0.5 + self.grid[i]) * self.stride[i]  # xy
                    wh = (y[..., 2:4] * 2) ** 2 * self.anchor_grid[i].view(1, self.na, 1, 1, 2)  # wh
                    y = torch.cat((xy, wh, y[..., 4:]), -1)
                z.append(y.view(bs, -1, self.no))
        return x if self.training else (torch.cat(z, 1), x)

    @staticmethod
    def _make_grid(nx=20, ny=20):
        yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
        return torch.stack((xv, yv), 2).view((1, 1, ny, nx, 2)).float()

Post-processing in Caffe:

void postProcessParall(const int height, const int width, int scale_idx, float postThres, float * origin_output, vector<int> Strides, vector<Anchor> Anchors, vector<Bbox> *bboxes)
{
    Bbox bbox;
    float cx, cy, w_b, h_b, score;
    int cid;
    const float *ptr = (float *)origin_output;

    for(unsigned long a=0; a<3; ++a){
        for(unsigned long h=0; h<height; ++h){
            for(unsigned long w=0; w<width; ++w){
                const float *cls_ptr =  ptr + 5;
                cid = argmax(cls_ptr, cls_ptr+NUM_CLASS);
                score = sigmoid(ptr[4]) * sigmoid(cls_ptr[cid]);
                if(score>=postThres){
                    cx = (sigmoid(ptr[0]) * 2.f - 0.5f + static_cast<float>(w)) * static_cast<float>(Strides[scale_idx]);
                    cy = (sigmoid(ptr[1]) * 2.f - 0.5f + static_cast<float>(h)) * static_cast<float>(Strides[scale_idx]);
                    w_b = powf(sigmoid(ptr[2]) * 2.f, 2) * Anchors[scale_idx * 3 + a].width;  // wh uses the Anchors passed in here; they must match the anchor_grid values above.
                    h_b = powf(sigmoid(ptr[3]) * 2.f, 2) * Anchors[scale_idx * 3 + a].height;

                    bbox.xmin = clip(cx - w_b / 2, 0.F, static_cast<float>(INPUT_W - 1));
                    bbox.ymin = clip(cy - h_b / 2, 0.f, static_cast<float>(INPUT_H - 1));
                    bbox.xmax = clip(cx + w_b / 2, 0.f, static_cast<float>(INPUT_W - 1));
                    bbox.ymax = clip(cy + h_b / 2, 0.f, static_cast<float>(INPUT_H - 1));
                    bbox.score = score;
                    bbox.cid = cid;
                    //std::cout<< "bbox.cid : " << bbox.cid << std::endl;
                    bboxes->push_back(bbox);
                }
                ptr += 5 + NUM_CLASS;
            }
        }
    }
}
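
Note that the pointer walk above assumes each scale's output blob is laid out as (anchor, height, width, 5 + NUM_CLASS), which matches the permuted tensor x[i].view(bs, na, no, ny, nx).permute(0, 1, 3, 4, 2) in the PyTorch code; if your Caffe output keeps the raw (na*(5+NUM_CLASS), ny, nx) channel layout, the indexing must be adapted accordingly.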

Another issue is image preprocessing. The PyTorch pipeline does not pad the input out to the full square; it resizes proportionally and only pads to the nearest multiple of 32 (rectangular inference). For example, with the input set to (640, 640), PyTorch preprocesses to roughly 640 per side, rounded up or down to a multiple of 32; see the condensed letterbox sketch below.
Reference for the Ultralytics YOLOv5 input preprocessing: https://blog.csdn.net/weixin_41012399/article/details/120907304
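
A condensed sketch of the repo's letterbox routine (proportional resize, then pad with gray value 114 only up to the next multiple of the stride; the deployment side must reproduce this):

import cv2

def letterbox(img, new_shape=(640, 640), color=(114, 114, 114), stride=32):
    # Proportional resize, then pad only to the nearest multiple of `stride`
    # (YOLOv5 rectangular inference) instead of padding out to a full square.
    h, w = img.shape[:2]
    r = min(new_shape[0] / h, new_shape[1] / w)
    new_unpad = (int(round(w * r)), int(round(h * r)))  # (width, height)
    dw = (new_shape[1] - new_unpad[0]) % stride  # width padding
    dh = (new_shape[0] - new_unpad[1]) % stride  # height padding
    dw, dh = dw / 2, dh / 2  # split the padding between the two sides
    if (w, h) != new_unpad:
        img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)
    top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
    left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
    img = cv2.copyMakeBorder(img, top, bottom, left, right,
                             cv2.BORDER_CONSTANT, value=color)
    return img, r, (dw, dh)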
5. Quantization: training in step 1 used RGB input with normalization (pixel values scaled by 1/255). Just make sure the quantization options match this; see the illustrative cfg fragment below.
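For reference, an illustrative nnie_mapper .cfg fragment (the values here are assumptions to verify against the RuyiStudio/NNIE documentation; the point is that RGB_order, norm_type, and data_scale must reproduce the RGB input and 1/255 scaling used in training):

[image_list] ./imageList.txt
[image_type] 1
[RGB_order] RGB
[norm_type] 3
[data_scale] 0.0039216
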
6. Inference on the HiSilicon side:
Reference post-processing code: https://gitee.com/shopping-tang/yolo_v5_nnie/tree/master
Here, again, make sure the anchors used are the anchor_grid values from the Detect layer.
