Before the code, one quick note: the English name "you only look once" (YOLO) is a dig at sliding-window detection. A sliding window has to look at the image many times; YOLO only looks once, because its special labeling scheme reduces object detection to a single-shot regression problem.
YOLO's label layout is:

| P | x, y, w, h | c |
|---|---|---|
| objectness confidence (is an object present?) | center coordinates and width/height | class confidences |

So with a single class (C = 1), one label holds 6 values; multiply that by the number of grid cells and by the number of anchor boxes (usually 3 per cell). In other words, the per-grid-cell channel dimension is (5 + C) * num_anchors. Important!
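As a quick sanity check of that dimension (my own example): with COCO's 80 classes and 3 anchors per scale, each YOLO head has to output 255 channels, which is exactly the number you see in YOLO configs.

```python
num_classes = 80  # COCO
num_anchors = 3   # anchors per scale
# P + (x, y, w, h) + one confidence per class, repeated for each anchor
channels = (5 + num_classes) * num_anchors
print(channels)   # 255
```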
Once the principle is clear, let's look at the code.

YOLOv4 does not use adaptive anchor boxes (they are written down by hand in advance). Understanding how they work here will also make v5's auto-anchor easier to grasp later.

First the anchor boxes. The example below builds the smallest set of anchors; this is how the source code implements it:
```python
num_anchors = 9
stride = 8  # the other two heads use stride 16 and 32
anchor_mask = [0, 1, 2]  # the other two masks are [3, 4, 5] and [6, 7, 8]
anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401]
anchor_step = len(anchors) // num_anchors  # 2 values (w, h) per anchor
masked_anchors = []
# Very simple: take the first 6 anchor values and divide them by the stride
for m in anchor_mask:
    masked_anchors += anchors[m * anchor_step:(m + 1) * anchor_step]
masked_anchors = [anchor / stride for anchor in masked_anchors]
# [1.5, 2.0, 2.375, 4.5, 5.0, 3.5] -- these masked_anchors are our smallest
# anchors; see the inference code below for how they are used
```
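To see the same arithmetic for all three heads at once, here is a small helper (the function name is mine, not from the repo; the math is identical to the loop above):

```python
def get_masked_anchors(anchors, anchor_mask, stride, anchor_step=2):
    """Pick the (w, h) pairs of one head and rescale them to grid units."""
    masked = []
    for m in anchor_mask:
        masked += anchors[m * anchor_step:(m + 1) * anchor_step]
    return [a / stride for a in masked]

anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401]
for mask, stride in zip([[0, 1, 2], [3, 4, 5], [6, 7, 8]], [8, 16, 32]):
    print(stride, get_masked_anchors(anchors, mask, stride))
# 8  [1.5, 2.0, 2.375, 4.5, 5.0, 3.5]
# 16 [2.25, 4.6875, 4.75, 3.4375, 4.5, 9.125]
# 32 [4.4375, 3.4375, 6.0, 7.59375, 14.34375, 12.53125]
```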
Let's start from inference: the image goes into the network (see the previous chapter). The code is long, but the core idea is simple: combine the output offsets with the anchor priors to compute the real boxes.
```python
import numpy as np
import torch


def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y,
                 only_objectness=1, validation=False):
    # The dimension check described above: (5 + C) * num_anchors
    assert (output.size(1) == (5 + num_classes) * num_anchors)
    batch = output.size(0)
    H = output.size(2)
    W = output.size(3)

    # The code below does exactly one thing: slice the second (channel) dimension
    # of output (here for 3 anchors):
    #   [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ]
    # into bxy = [ 6 ], bwh = [ 6 ], det_conf = [ 3 ], cls_conf = [ num_classes * 3 ]
    bxy_list = []
    bwh_list = []
    det_confs_list = []
    cls_confs_list = []
    # One pass over the label layout per anchor
    for i in range(num_anchors):
        # begin/end span all the labels of one anchor. Compare with the table above!
        begin = i * (5 + num_classes)
        end = (i + 1) * (5 + num_classes)
        # x, y labels
        bxy_list.append(output[:, begin : begin + 2])
        # w, h labels
        bwh_list.append(output[:, begin + 2 : begin + 4])
        # P label (objectness confidence)
        det_confs_list.append(output[:, begin + 4 : begin + 5])
        # class confidences
        cls_confs_list.append(output[:, begin + 5 : end])
    # The code below just uses concat to merge the lists into tensors
    # Shape: [batch, num_anchors * 2, H, W]
    bxy = torch.cat(bxy_list, dim=1)
    # Shape: [batch, num_anchors * 2, H, W]
    bwh = torch.cat(bwh_list, dim=1)
    # Shape: [batch, num_anchors, H, W]
    det_confs = torch.cat(det_confs_list, dim=1)
    # Shape: [batch, num_anchors * H * W]
    det_confs = det_confs.view(batch, num_anchors * H * W)
    # Shape: [batch, num_anchors * num_classes, H, W]
    cls_confs = torch.cat(cls_confs_list, dim=1)
    # Shape: [batch, num_anchors, num_classes, H * W]
    cls_confs = cls_confs.view(batch, num_anchors, num_classes, H * W)
    # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes]
    # Class confidences: aggregate over every grid cell and every anchor
    cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, num_classes)
    # Apply sigmoid() and exp() to the slices (scale_x_y defaults to 1).
    # This part can look baffling at first. What it computes is, for each grid
    # cell, the offset of the prediction inside that cell and the scaling ratio
    # relative to the anchor prior: the real x is grid position + sigmoid(x offset),
    # and w, h are scaled with exp()
    bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1)
    bwh = torch.exp(bwh)
    det_confs = torch.sigmoid(det_confs)
    cls_confs = torch.sigmoid(cls_confs)
    # Prepare the offsets: grid_x and grid_y index YOLO's grid cells
    grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, W - 1, W), axis=0).repeat(H, 0), axis=0), axis=0)
    grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(W, 1), axis=0), axis=0)
    # Using [1.5, 2.0, 2.375, 4.5, 5.0, 3.5] as the example: w is [1.5, 2.375, 5.0]
    # and h is [2.0, 4.5, 3.5], i.e. the scaling ratios of the three anchor boxes
    # (1.5, 2.0), (2.375, 4.5) and (5.0, 3.5)
    anchor_w = []
    anchor_h = []
    for i in range(num_anchors):
        anchor_w.append(anchors[i * 2])
        anchor_h.append(anchors[i * 2 + 1])
    # GPU support
    device = None
    cuda_check = output.is_cuda
    if cuda_check:
        device = output.get_device()
    bx_list = []
    by_list = []
    bw_list = []
    bh_list = []
    # Loop over the anchors: bx, by apply the grid offsets; bw, bh match the anchor boxes
    for i in range(num_anchors):
        ii = i * 2
        # Shape: [batch, 1, H, W]. bx is the x offset, added directly onto the grid
        bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32)
        # Shape: [batch, 1, H, W]. by is the y offset, added directly onto the grid
        by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32)
        # Shape: [batch, 1, H, W]. bw is the width scaling, multiplied by the
        # anchor's width to land on the corresponding anchor box
        bw = bwh[:, ii : ii + 1] * anchor_w[i]
        # Shape: [batch, 1, H, W]. bh is the height scaling, multiplied by the
        # anchor's height to land on the corresponding anchor box
        bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i]
        bx_list.append(bx)
        by_list.append(by)
        bw_list.append(bw)
        bh_list.append(bh)

    ########################################################
    # bbox generation: from here on it is mostly formatting #
    ########################################################
    # Merge the per-anchor offsets and scalings back into tensors
    # Shape: [batch, num_anchors, H, W]
    bx = torch.cat(bx_list, dim=1)
    # Shape: [batch, num_anchors, H, W]
    by = torch.cat(by_list, dim=1)
    # Shape: [batch, num_anchors, H, W]
    bw = torch.cat(bw_list, dim=1)
    # Shape: [batch, num_anchors, H, W]
    bh = torch.cat(bh_list, dim=1)
    # Shape: [batch, 2 * num_anchors, H, W]
    bx_bw = torch.cat((bx, bw), dim=1)
    # Shape: [batch, 2 * num_anchors, H, W]
    by_bh = torch.cat((by, bh), dim=1)
    # Normalize to [0, 1]
    bx_bw /= W
    by_bh /= H
    # Shape: [batch, num_anchors * H * W, 1]
    bx = bx_bw[:, :num_anchors].view(batch, num_anchors * H * W, 1)
    by = by_bh[:, :num_anchors].view(batch, num_anchors * H * W, 1)
    bw = bx_bw[:, num_anchors:].view(batch, num_anchors * H * W, 1)
    bh = by_bh[:, num_anchors:].view(batch, num_anchors * H * W, 1)
    # Convert center coordinates to corner coordinates
    bx1 = bx - bw * 0.5
    by1 = by - bh * 0.5
    bx2 = bx1 + bw
    by2 = by1 + bh
    # The rest packages the return values
    # Shape: [batch, num_anchors * H * W, 4] -> [batch, num_anchors * H * W, 1, 4]
    boxes = torch.cat((bx1, by1, bx2, by2), dim=2).view(batch, num_anchors * H * W, 1, 4)
    # boxes = boxes.repeat(1, 1, num_classes, 1)
    # boxes:     [batch, num_anchors * H * W, 1, 4]
    # cls_confs: [batch, num_anchors * H * W, num_classes]
    # det_confs: [batch, num_anchors * H * W]
    det_confs = det_confs.view(batch, num_anchors * H * W, 1)
    confs = cls_confs * det_confs
    # boxes: [batch, num_anchors * H * W, 1, 4]
    # confs: [batch, num_anchors * H * W, num_classes]
    return boxes, confs
```
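A quick smoke test of the decode (my own example, not from the repo): feed a random tensor shaped like the stride-8 head of a 416 input and check the output shapes.

```python
import torch

num_classes = 80
masked_anchors = [1.5, 2.0, 2.375, 4.5, 5.0, 3.5]  # the stride-8 anchors from above
# 416 / 8 = 52, so the stride-8 head is [batch, (5 + 80) * 3, 52, 52]
dummy = torch.randn(1, (5 + num_classes) * 3, 52, 52)
boxes, confs = yolo_forward(dummy, conf_thresh=0.4, num_classes=num_classes,
                            anchors=masked_anchors, num_anchors=3, scale_x_y=1.0)
print(boxes.shape)  # torch.Size([1, 8112, 1, 4]), since 3 * 52 * 52 = 8112
print(confs.shape)  # torch.Size([1, 8112, 80])
```

To make the decode concrete: if sigmoid(tx) = 0.6 in the grid cell at x = 10, then bx = 10.6 cells; if exp(tw) = 1.2 with anchor width 1.5, then bw = 1.8 cells; both are divided by W at the end to normalize to [0, 1].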
One forward pass produces the results for 3 anchors. From the network architecture in the previous chapter we know YOLOv4 uses 9 anchors in total, corresponding to 3 output heads, so we merge those results:
```python
def get_region_boxes(boxes_and_confs):
    boxes_list = []
    confs_list = []
    for item in boxes_and_confs:
        boxes_list.append(item[0])
        confs_list.append(item[1])
    # This should look familiar by now: list to tensor, cat along dim 1
    # boxes: [batch, num1 + num2 + num3, 1, 4]
    # confs: [batch, num1 + num2 + num3, num_classes]
    boxes = torch.cat(boxes_list, dim=1)
    confs = torch.cat(confs_list, dim=1)
    return [boxes, confs]
```
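Extending the smoke test to all three heads (again my own example): for a 416 input the grids are 52, 26 and 13, and merging yields the familiar 10647 candidate boxes.

```python
import torch

num_classes = 80
grids = [52, 26, 13]  # strides 8 / 16 / 32 on a 416 input
masked = [[1.5, 2.0, 2.375, 4.5, 5.0, 3.5],
          [2.25, 4.6875, 4.75, 3.4375, 4.5, 9.125],
          [4.4375, 3.4375, 6.0, 7.59375, 14.34375, 12.53125]]
outs = []
for g, a in zip(grids, masked):
    head = torch.randn(1, (5 + num_classes) * 3, g, g)  # dummy head output
    outs.append(yolo_forward(head, 0.4, num_classes, a, 3, 1.0))
boxes, confs = get_region_boxes(outs)
print(boxes.shape)  # torch.Size([1, 10647, 1, 4]), 3 * (52*52 + 26*26 + 13*13) = 10647
print(confs.shape)  # torch.Size([1, 10647, 80])
```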
This gives us the complete inference pipeline. Now we load an image, resize it, and call the model to get results. But as we know, we still need NMS to clean up the redundant boxes; for NMS, see my earlier article.
If you've followed everything so far, this step is easy: load the image, then resize it to 416 (or 416 + 96 * N).

```python
import cv2

img = cv2.imread(imgfile)
sized = cv2.resize(img, (width, height))
# cv2 loads images as BGR; convert to RGB
sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB)
```

Then call the following:
```python
def do_detect(model, img, conf_thresh, nms_thresh, use_cuda=1):
    # inference mode
    model.eval()
    # Convert the image layout to B C H W and divide by 255: standard preprocessing
    if type(img) == np.ndarray and len(img.shape) == 3:  # cv2 image
        img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0)
    elif type(img) == np.ndarray and len(img.shape) == 4:
        img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0)
    else:
        print("unknown image type")
        exit(-1)
    # GPU support
    if use_cuda:
        img = img.cuda()
    # Old-style code: recent PyTorch no longer needs Variable
    img = torch.autograd.Variable(img)
    output = model(img)
    return post_processing(img, conf_thresh, nms_thresh, output)
```
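A usage sketch tying it together. Here `model` is assumed to be the YOLOv4 network from the previous chapter with trained weights already loaded, and 'dog.jpg' is a placeholder path:

```python
import cv2

img = cv2.imread('dog.jpg')  # placeholder test image
sized = cv2.cvtColor(cv2.resize(img, (416, 416)), cv2.COLOR_BGR2RGB)
bboxes = do_detect(model, sized, conf_thresh=0.4, nms_thresh=0.6, use_cuda=0)[0]
# each entry: [x1, y1, x2, y2, conf, conf, class_id], coordinates normalized to [0, 1]
```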
Finally, post_processing filters by confidence and runs per-class NMS:

```python
def post_processing(img, conf_thresh, nms_thresh, output):
    # The parameters below have all appeared before; scroll up if anything is unclear
    # anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401]
    # num_anchors = 9
    # anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
    # strides = [8, 16, 32]
    # anchor_step = len(anchors) // num_anchors

    # Run NMS on box and conf. In object detection, whenever you see these two
    # together you know NMS is next (if you don't know NMS yet, learn it first)
    # [batch, num, 1, 4]
    box_array = output[0]
    # [batch, num, num_classes]
    confs = output[1]
    if type(box_array).__name__ != 'ndarray':
        box_array = box_array.cpu().detach().numpy()
        confs = confs.cpu().detach().numpy()
    num_classes = confs.shape[2]
    # [batch, num, 4]
    box_array = box_array[:, :, 0]
    # [batch, num, num_classes] --> [batch, num]
    max_conf = np.max(confs, axis=2)
    max_id = np.argmax(confs, axis=2)
    bboxes_batch = []
    for i in range(box_array.shape[0]):
        # keep only the candidates above the confidence threshold
        argwhere = max_conf[i] > conf_thresh
        l_box_array = box_array[i, argwhere, :]
        l_max_conf = max_conf[i, argwhere]
        l_max_id = max_id[i, argwhere]
        bboxes = []
        # Loop over all classes and run NMS per class. The NMS code isn't pasted
        # here; note that recent torchvision already ships it:
        # from torchvision.ops import nms
        for j in range(num_classes):
            cls_argwhere = l_max_id == j
            ll_box_array = l_box_array[cls_argwhere, :]
            ll_max_conf = l_max_conf[cls_argwhere]
            ll_max_id = l_max_id[cls_argwhere]
            keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh)
            if (keep.size > 0):
                ll_box_array = ll_box_array[keep, :]
                ll_max_conf = ll_max_conf[keep]
                ll_max_id = ll_max_id[keep]
                for k in range(ll_box_array.shape[0]):
                    # note: the confidence is stored twice (the repo keeps both slots)
                    bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2],
                                   ll_box_array[k, 3], ll_max_conf[k], ll_max_conf[k], ll_max_id[k]])
        bboxes_batch.append(bboxes)
    return bboxes_batch
```
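The `nms_cpu` used above isn't pasted in this post. For reference, here is a minimal greedy-IoU sketch of what such a function typically does (in practice you can simply call `torchvision.ops.nms`):

```python
import numpy as np

def nms_cpu(boxes, confs, nms_thresh):
    # boxes: [N, 4] as x1, y1, x2, y2; confs: [N]
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)
    order = confs.argsort()[::-1]  # highest confidence first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        # IoU of the kept box against all remaining boxes
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1) * np.maximum(0.0, yy2 - yy1)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        # drop everything that overlaps the kept box too much
        order = order[1:][iou <= nms_thresh]
    return np.array(keep)
```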