PyTorch Source Code Walkthrough Series - YOLOv4 Core Tricks Explained (2): Anchor Boxes and Inference

The YOLO Detection Method

Before the code, a quick note on the name: "you only look once" (YOLO) is a jab at sliding-window detection.
A sliding window has to look at the image many times; YOLO looks only once, because its special labeling scheme reduces object detection to a single-step regression problem.
YOLO's labels are:

P: objectness -- is there an object here (a confidence score)
x, y, w, h: box center coordinates plus width and height
c: class confidence (one value per class)

So with a single class (C = 1), one label holds 6 values; multiply that by however many grid cells and anchor boxes (usually 3 per cell) there are.
Per grid cell, the channel dimension is therefore (5 + C) * num_anchors -- important! For COCO (C = 80) with 3 anchors, that is (5 + 80) * 3 = 255 channels per output.
With the principle clear, let's look at the code:

AnchorBox

YOLOv4 does not use adaptive anchors (the anchor sizes are hand-written in advance); understanding how these fixed anchors work will also make YOLOv5's auto-anchor easier to follow later.
Let's look at the anchor boxes first. The example below builds the smallest set of anchors, following the source implementation.

num_anchors = 9
stride = 8  # the other two heads use strides 16 and 32
anchor_mask = [0, 1, 2]  # the other heads use masks [3, 4, 5] and [6, 7, 8]
anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401]
anchor_step = len(anchors) // num_anchors  # 2 values (w, h) per anchor
masked_anchors = []
# Very simple: take the first 6 anchor values and divide them by the stride
for m in anchor_mask:
    masked_anchors += anchors[m * anchor_step:(m + 1) * anchor_step]
masked_anchors = [anchor / stride for anchor in masked_anchors]
# [1.5, 2.0, 2.375, 4.5, 5.0, 3.5] -- these masked_anchors are our smallest anchors; see the inference code below for how they are used
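
As a quick cross-check (my own sketch, not from the repo), the same loop produces the masked anchors of all three heads at once:

anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401]
anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
strides = [8, 16, 32]
anchor_step = len(anchors) // 9  # 2 values (w, h) per anchor
for mask, stride in zip(anchor_masks, strides):
    masked = []
    for m in mask:
        masked += anchors[m * anchor_step:(m + 1) * anchor_step]
    print([a / stride for a in masked])
# stride  8 -> [1.5, 2.0, 2.375, 4.5, 5.0, 3.5]
# stride 16 -> [2.25, 4.6875, 4.75, 3.4375, 4.5, 9.125]
# stride 32 -> [4.4375, 3.4375, 6.0, 7.59375, 14.34375, 12.53125]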

YOLO Inference Code

Let's start from inference: an image is fed into the network (see the previous post).
The code is long, but the core is simple: combine the network output's predicted offsets with the anchor priors to compute the real boxes.

def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1, validation=False):
    # This is the dimension check described above
    assert (output.size(1) == (5 + num_classes) * num_anchors)
    # batch size and feature-map height/width, used throughout the rest of the function
    batch = output.size(0)
    H = output.size(2)
    W = output.size(3)
    # The code below does only one thing: slice output's channel dimension (e.g. for 3 anchors):
    # [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ]
    # into bxy = [ 6 ], bwh = [ 6 ], det_conf = [ 3 ], cls_conf = [ num_classes * 3 ]

    bxy_list = []
    bwh_list = []
    det_confs_list = []
    cls_confs_list = []
    # Loop over the channel layout once per anchor
    for i in range(num_anchors):
        # begin/end span all the labels of one anchor -- compare with the label list above!
        begin = i * (5 + num_classes)
        end = (i + 1) * (5 + num_classes)
        # x, y labels
        bxy_list.append(output[:, begin : begin + 2])
        # w, h labels
        bwh_list.append(output[:, begin + 2 : begin + 4])
        # P label (objectness confidence)
        det_confs_list.append(output[:, begin + 4 : begin + 5])
        # class confidences
        cls_confs_list.append(output[:, begin + 5 : end])

    # The code below concats the lists back into single tensors
    # Shape: [batch, num_anchors * 2, H, W]
    bxy = torch.cat(bxy_list, dim=1)
    # Shape: [batch, num_anchors * 2, H, W]
    bwh = torch.cat(bwh_list, dim=1)
    # Shape: [batch, num_anchors, H, W]
    det_confs = torch.cat(det_confs_list, dim=1)
    # Shape: [batch, num_anchors * H * W]
    det_confs = det_confs.view(output.size(0), num_anchors * output.size(2) * output.size(3))
    # Shape: [batch, num_anchors * num_classes, H, W]
    cls_confs = torch.cat(cls_confs_list, dim=1)
    # Shape: [batch, num_anchors, num_classes, H * W]
    cls_confs = cls_confs.view(output.size(0), num_anchors, num_classes, output.size(2) * output.size(3))
    # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes] 
    # class confidence: gather the class confidences of every cell and every anchor
    cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(output.size(0), num_anchors * output.size(2) * output.size(3), num_classes)

    # Apply sigmoid() and exp(); scale_x_y defaults to 1
    # This block can look opaque at first: it turns the raw outputs into per-cell offsets and
    # anchor scaling ratios, following the standard YOLO decoding:
    # b_x = sigmoid(t_x) + c_x      b_y = sigmoid(t_y) + c_y
    # b_w = p_w * exp(t_w)          b_h = p_h * exp(t_h)
    # (the grid add and the anchor multiply happen further down)
    bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1)
    bwh = torch.exp(bwh)
    det_confs = torch.sigmoid(det_confs)
    cls_confs = torch.sigmoid(cls_confs)

    # Prepare the grid offsets: grid_x / grid_y hold each YOLO cell's x / y index
    grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, W - 1, W), axis=0).repeat(H, 0), axis=0), axis=0)
    grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(W, 1), axis=0), axis=0)
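    # Illustration (not in the source): with W = 4, H = 2,
    # grid_x[0, 0] = [[0, 1, 2, 3], [0, 1, 2, 3]] -- each cell's column index
    # grid_y[0, 0] = [[0, 0, 0, 0], [1, 1, 1, 1]] -- each cell's row index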
    
    # Using [1.5, 2.0, 2.375, 4.5, 5.0, 3.5] as the example: here w is [1.5, 2.375, 5.0] and h is [2.0, 4.5, 3.5],
    # i.e. the scaling ratios of the three anchor priors (1.5, 2.0), (2.375, 4.5), (5.0, 3.5)
    anchor_w = []
    anchor_h = []
    for i in range(num_anchors):
        anchor_w.append(anchors[i * 2])
        anchor_h.append(anchors[i * 2 + 1])
    # GPU: put the grid tensors on the same device as the output
    device = None
    cuda_check = output.is_cuda
    if cuda_check:
        device = output.get_device()

    bx_list = []
    by_list = []
    bw_list = []
    bh_list = []

    # Loop over anchors: bx/by apply the cell offsets, bw/bh scale by the anchor priors
    for i in range(num_anchors):
        ii = i * 2
        # Shape: [batch, 1, H, W] -- bx is the x position: the predicted offset added directly to grid_x
        bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32)
        # Shape: [batch, 1, H, W] -- by is the y position: the predicted offset added directly to grid_y
        by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32)
        # Shape: [batch, 1, H, W] -- bw is the width: the exp() scale multiplied by the anchor's width prior
        bw = bwh[:, ii : ii + 1] * anchor_w[i]
        # Shape: [batch, 1, H, W] -- bh is the height: the exp() scale multiplied by the anchor's height prior
        bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i]

        bx_list.append(bx)
        by_list.append(by)
        bw_list.append(bw)
        bh_list.append(bh)


    ############################################################
    # bbox assembly -- the rest mostly packages the output format #
    ############################################################
    # Concat the per-anchor offsets and scales back into tensors
    # Shape: [batch, num_anchors, H, W]
    bx = torch.cat(bx_list, dim=1)
    # Shape: [batch, num_anchors, H, W]
    by = torch.cat(by_list, dim=1)
    # Shape: [batch, num_anchors, H, W]
    bw = torch.cat(bw_list, dim=1)
    # Shape: [batch, num_anchors, H, W]
    bh = torch.cat(bh_list, dim=1)

    # Shape: [batch, 2 * num_anchors, H, W]
    bx_bw = torch.cat((bx, bw), dim=1)
    # Shape: [batch, 2 * num_anchors, H, W]
    by_bh = torch.cat((by, bh), dim=1)

    # Normalize coordinates to [0, 1] by the feature-map size
    bx_bw /= W
    by_bh /= H

    # Shape: [batch, num_anchors * H * W, 1]
    bx = bx_bw[:, :num_anchors].view(batch, num_anchors * H * W, 1)
    by = by_bh[:, :num_anchors].view(batch, num_anchors * H * W, 1)
    bw = bx_bw[:, num_anchors:].view(batch, num_anchors * H * W, 1)
    bh = by_bh[:, num_anchors:].view(batch, num_anchors * H * W, 1)
    # convert the center position to the top-left corner, then the bottom-right
    bx1 = bx - bw * 0.5
    by1 = by - bh * 0.5
    bx2 = bx1 + bw
    by2 = by1 + bh
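    # Illustration: center (0.5, 0.5) with size (0.2, 0.4) becomes corners (0.4, 0.3) and (0.6, 0.7)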
    # The rest packages the results for the return value
    # Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4]
    boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(batch, num_anchors * H * W, 1, 4)
    # boxes = boxes.repeat(1, 1, num_classes, 1)

    # boxes:     [batch, num_anchors * H * W, 1, 4]
    # cls_confs: [batch, num_anchors * H * W, num_classes]
    # det_confs: [batch, num_anchors * H * W]

    det_confs = det_confs.view(batch, num_anchors * H * W, 1)
    confs = cls_confs * det_confs

    # boxes: [batch, num_anchors * H * W, 1, 4]
    # confs: [batch, num_anchors * H * W, num_classes]

    return  boxes, confs
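
To make the shapes concrete, here is a minimal smoke test (my own sketch, not from the repo): feed a random tensor shaped like the stride-8 head of a 416x416 input through yolo_forward. The boxes are meaningless; only the shapes matter.

import numpy as np  # yolo_forward above also relies on np
import torch

num_classes = 80
num_anchors = 3
masked_anchors = [1.5, 2.0, 2.375, 4.5, 5.0, 3.5]  # the stride-8 anchors from earlier
output = torch.randn(1, (5 + num_classes) * num_anchors, 52, 52)
boxes, confs = yolo_forward(output, conf_thresh=0.4, num_classes=num_classes,
                            anchors=masked_anchors, num_anchors=num_anchors, scale_x_y=1.0)
print(boxes.shape)  # torch.Size([1, 8112, 1, 4]) -- 3 * 52 * 52 = 8112
print(confs.shape)  # torch.Size([1, 8112, 80])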

A single forward pass yields results for 3 anchors per output.
From the network architecture in the previous post, we know YOLOv4 uses 9 anchors in total,
spread over 3 outputs, so we merge those results:

def get_region_boxes(boxes_and_confs):
    boxes_list = []
    confs_list = []
    for item in boxes_and_confs:
        boxes_list.append(item[0])
        confs_list.append(item[1])
    # Familiar by now: convert the lists to tensors by concatenating along dim 1
    # boxes: [batch, num1 + num2 + num3, 1, 4]
    # confs: [batch, num1 + num2 + num3, num_classes]
    boxes = torch.cat(boxes_list, dim=1)
    confs = torch.cat(confs_list, dim=1)
    return [boxes, confs]
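
Putting the two together, a hedged end-to-end sketch (random tensors standing in for the three head outputs of a 416x416, 80-class model):

import torch

anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401]
heads = []
for stride, mask in zip([8, 16, 32], [[0, 1, 2], [3, 4, 5], [6, 7, 8]]):
    hw = 416 // stride  # 52, 26, 13
    out = torch.randn(1, (5 + 80) * 3, hw, hw)  # stand-in for a real head output
    m_anchors = [anchors[i * 2 + j] / stride for i in mask for j in (0, 1)]
    heads.append(yolo_forward(out, 0.4, 80, m_anchors, 3, 1.0))

boxes, confs = get_region_boxes(heads)
print(boxes.shape)  # [1, 3 * (52*52 + 26*26 + 13*13), 1, 4] = [1, 10647, 1, 4]
print(confs.shape)  # [1, 10647, 80]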

This completes the full inference pipeline.
Now we load an image, resize it, and call the model to get results; but we still need NMS to prune the redundant boxes. For NMS, see my earlier article.

The Remaining Code

# If you followed everything above, this step is easy: load the image, then resize to 416 (or 416 + 96*N)
img = cv2.imread(imgfile)
sized = cv2.resize(img, (width, height))
# cv2 loads BGR; convert to RGB
sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB)
# then call the function below
def do_detect(model, img, conf_thresh, nms_thresh, use_cuda=1):
    # inference mode
    model.eval()
    # Convert the 3-dim image to B C H W and divide by 255 -- standard preprocessing
    if type(img) == np.ndarray and len(img.shape) == 3:  # cv2 image
        img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0)
    elif type(img) == np.ndarray and len(img.shape) == 4:
        img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0)
    else:
        print("unknown image type")
        exit(-1)
    # GPU acceleration
    if use_cuda:
        img = img.cuda()
    # Old style -- recent PyTorch no longer needs Variable
    img = torch.autograd.Variable(img)
    output = model(img)
    return post_processing(img, conf_thresh, nms_thresh, output)

def post_processing(img, conf_thresh, nms_thresh, output):
    # All the parameters below appeared earlier; scroll up if anything is unclear
    # anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401]
    # num_anchors = 9
    # anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
    # strides = [8, 16, 32]
    # anchor_step = len(anchors) // num_anchors
    # NMS on boxes and confs -- in object detection, whenever you see these two together, it is almost always NMS
    # [batch, num, 1, 4]
    box_array = output[0]
    # [batch, num, num_classes]
    confs = output[1]

    if type(box_array).__name__ != 'ndarray':
        box_array = box_array.cpu().detach().numpy()
        confs = confs.cpu().detach().numpy()

    num_classes = confs.shape[2]

    # [batch, num, 4]
    box_array = box_array[:, :, 0]

    # [batch, num, num_classes] --> [batch, num]
    max_conf = np.max(confs, axis=2)
    max_id = np.argmax(confs, axis=2)

    bboxes_batch = []
    for i in range(box_array.shape[0]):
       
        argwhere = max_conf[i] > conf_thresh
        l_box_array = box_array[i, argwhere, :]
        l_max_conf = max_conf[i, argwhere]
        l_max_id = max_id[i, argwhere]

        bboxes = []
        # Loop over all classes and run per-class NMS. The NMS code is omitted in this post (a minimal
        # nms_cpu sketch follows after this function); note that recent torchvision also ships NMS:
        # from torchvision.ops import nms
        for j in range(num_classes):

            cls_argwhere = l_max_id == j
            ll_box_array = l_box_array[cls_argwhere, :]
            ll_max_conf = l_max_conf[cls_argwhere]
            ll_max_id = l_max_id[cls_argwhere]

            keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh)
            
            if (keep.size > 0):
                ll_box_array = ll_box_array[keep, :]
                ll_max_conf = ll_max_conf[keep]
                ll_max_id = ll_max_id[keep]

                for k in range(ll_box_array.shape[0]):
                    bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2], ll_box_array[k, 3], ll_max_conf[k], ll_max_conf[k], ll_max_id[k]])
        
        bboxes_batch.append(bboxes)

    return bboxes_batch
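
Since post_processing calls nms_cpu, here is a minimal greedy IoU-based sketch of it (a standard NMS implementation written for illustration, not copied from the repo; boxes are numpy arrays in (x1, y1, x2, y2) format, and np is numpy):

def nms_cpu(boxes, confs, nms_thresh=0.5):
    # boxes: [N, 4], confs: [N]
    if boxes.shape[0] == 0:
        return np.array([], dtype=np.int64)
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)
    order = confs.argsort()[::-1]  # indices sorted by confidence, highest first
    keep = []
    while order.size > 0:
        best = order[0]
        keep.append(best)
        # intersection of the best box with all remaining boxes
        xx1 = np.maximum(x1[best], x1[order[1:]])
        yy1 = np.maximum(y1[best], y1[order[1:]])
        xx2 = np.minimum(x2[best], x2[order[1:]])
        yy2 = np.minimum(y2[best], y2[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
        iou = inter / (areas[best] + areas[order[1:]] - inter)
        # keep only boxes whose overlap with the best box is below the threshold
        order = order[np.where(iou <= nms_thresh)[0] + 1]
    return np.array(keep)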
