Take the COCO dataset as an example: 80 classes in total.
YOLOv5 detects small, medium, and large objects on three scales, with strides 8, 16, and 32 respectively. At each location on each scale, num_anchors anchors are tiled; normally num_anchors = 3.
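For reference, the default COCO anchors shipped with the official yolov5 configs (width/height pairs in pixels on the network input), written out here as a Python list:

anchors = [
    [(10, 13), (16, 30), (33, 23)],       # stride 8: small objects
    [(30, 61), (62, 45), (59, 119)],      # stride 16: medium objects
    [(116, 90), (156, 198), (373, 326)],  # stride 32: large objects
]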
Assume the input image tensor has shape [N, 3, H, W]. The output feature maps on these three strides then have shapes:
[N, num_anchors, H/8, W/8, num_classes + 5]
[N, num_anchors, H/16, W/16, num_classes + 5]
[N, num_anchors, H/32, W/32, num_classes + 5]
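As a quick sanity check of these shapes, assuming a 640x640 input and the 80 COCO classes:

N, H, W, num_anchors, num_classes = 1, 640, 640, 3, 80
for stride in (8, 16, 32):
    print(stride, (N, num_anchors, H // stride, W // stride, num_classes + 5))
# 8  (1, 3, 80, 80, 85)
# 16 (1, 3, 40, 40, 85)
# 32 (1, 3, 20, 20, 85)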
The core of training is the loss computation, which splits into two parts: build_targets and compute_loss.
build_targets establishes the matching between ground-truth boxes and anchors.
tcls, tbox, indices, anch = [], [], [], []
na, nt = self.na, targets.shape[0]  # number of anchors per location, number of targets
g = 0.5  # offset threshold: a center within 0.5 of a cell border also activates the neighboring cell
off = torch.tensor([[0, 0], [1, 0], [0, 1], [-1, 0], [0, -1]], device=targets.device).float() * g
gain = torch.ones(7, device=targets.device)
# Generate anchor indices of shape [na, nt].
ai = torch.arange(na, device=targets.device).float().view(na, 1).repeat(1, nt)
# targets: [nt, 6] => [na, nt, 6]
# ai: [na, nt] => [na, nt, 1]; indexing with None appends a new dimension.
# Concatenating the two along dim=2 yields [na, nt, 7].
# Each item in targets is [image_id, cls, xc, yc, w, h, anchor_id].
# [xc, yc, w, h] are still normalized by the original image width and height,
# exactly as they were when the ground truth was built.
targets = torch.cat((targets.repeat(na, 1, 1), ai[:, :, None]), 2)
for i in range(self.nl):  # for each stride's feature map
    anchors = self.anchors[i]  # anchor sizes tiled on this feature map, [num_anchors, 2]
    gain[2:6] = torch.tensor(p[i].shape)[[3, 2, 3, 2]]  # [1, 1, w, h, w, h, 1]
    # Match targets to anchors
    t = targets * gain  # [na, nt, 7], boxes now in grid units
    if nt:
        # Matches
        # wh ratio between targets and anchors: [na, nt, 2] / [na, 1, 2]
        r = t[:, :, 4:6] / anchors[:, None]
        # torch.max(a, b) is an element-wise maximum.
        # tensor.max(dim) reduces over a dimension and returns a pair:
        # [0] holds the maxima, [1] holds their indices.
        # The point is to keep, for each target, only the anchors whose
        # size is a reasonable fit: the worse of the two wh ratios must stay below 4.
        j = torch.max(r, 1. / r).max(2)[0] < 4.0  # 4.0 is the anchor_t hyperparameter default
        t = t[j]  # [num_valid, 7]
        # Offsets
        gxy = t[:, 2:4]  # object center position in grid units
        gxi = gain[[2, 3]] - gxy  # same position measured from the far corner
        j, k = ((gxy % 1. < g) & (gxy > 1.)).T
        l, m = ((gxi % 1. < g) & (gxi > 1.)).T
        j = torch.stack((torch.ones_like(j), j, k, l, m))
        t = t.repeat((5, 1, 1))[j]  # [num_valid, 7] -> [5, num_valid, 7], then masked by j
        # [1, num_valid, 2] + [5, 1, 2] broadcasts to [5, num_valid, 2]:
        # each sample in t can take any of the 5 offsets in `off`,
        # but only the components where j is True are kept.
        # The purpose, as I understand it, is to associate each target with its
        # neighboring grid cells as well, providing more positive samples.
        offsets = (torch.zeros_like(gxy)[None] + off[:, None])[j]
    else:
        t = targets[0]
        offsets = 0
    # Define
    b, c = t[:, :2].long().T  # image index, class
    gxy = t[:, 2:4]
    gwh = t[:, 4:6]
    gij = (gxy - offsets).long()  # grid cell that owns this target
    gi, gj = gij.T
    # Append
    a = t[:, 6].long()  # anchor indices
    # (image_id, anchor_id, h_id, w_id)
    indices.append((b, a, gj.clamp_(0, gain[3] - 1), gi.clamp_(0, gain[2] - 1)))
    # [cx_offset, cy_offset, w, h]
    tbox.append(torch.cat((gxy - gij, gwh), 1))
    # associated anchors, [num_valid, 2]
    anch.append(anchors[a])
    tcls.append(c)
return tcls, tbox, indices, anch
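To make the matching rule concrete, here is a self-contained toy sketch (values chosen for illustration, not taken from the YOLOv5 source) of the wh-ratio test on the stride-8 feature map:

import torch

anchors = torch.tensor([[10., 13.], [16., 30.], [33., 23.]]) / 8.  # stride-8 anchors in grid units
t_wh = torch.tensor([[1.0, 1.0]])                                  # one target's wh in grid units, [nt=1, 2]

r = t_wh[None] / anchors[:, None]     # [na, nt, 2]
fit = torch.max(r, 1. / r).max(2)[0]  # worst-direction ratio, [na, nt]
print(fit)        # tensor([[1.6250], [3.7500], [4.1250]])
print(fit < 4.0)  # tensor([[ True], [ True], [False]]) -> the third anchor is rejected

# The offset step then adds neighboring cells. For a center at grid coords (3.2, 5.8):
#   x % 1 = 0.2 < 0.5 -> the cell to the left (gi=2) also becomes positive
#   y % 1 = 0.8, but (grid_h - 5.8) % 1 = 0.2 < 0.5 -> the cell below (gj=6) too
# so this target yields positives at cells (3, 5), (2, 5), and (3, 6).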
The loss consists of three parts: cls_loss, box_loss, and obj_loss. cls_loss supervises classification, box_loss supervises bounding-box regression, and obj_loss supervises whether an object is present in a grid cell. The loss is computed on every feature-map level.
lcls, lbox, lobj = torch.zeros(1, device=device), torch.zeros(1, device=device), torch.zeros(1, device=device)
tcls, tbox, indices, anchors = self.build_targets(p, targets)  # targets
for i, pi in enumerate(p):  # for each stride's feature map
    # b: [num_valid], image indices
    # a: [num_valid], indices of the anchors on the current stride's feature map
    # gj: [num_valid], grid y indices
    # gi: [num_valid], grid x indices
    b, a, gj, gi = indices[i]
    tobj = torch.zeros_like(pi[..., 0], device=device)
    n = b.shape[0]  # number of positive samples
    if n:
        ps = pi[b, a, gj, gi]  # predictions at the positive locations, [num_valid, num_classes + 5]
        # Bbox regression
        pxy = ps[:, :2].sigmoid() * 2. - 0.5                # decoded center offset, in (-0.5, 1.5)
        pwh = (ps[:, 2:4].sigmoid() * 2) ** 2 * anchors[i]  # decoded wh, in (0, 4 * anchor)
        pbox = torch.cat((pxy, pwh), 1)
        iou = bbox_iou(pbox.T, tbox[i], x1y1x2y2=False, CIoU=True)
        lbox += (1.0 - iou).mean()
        # Objectness
        # self.gr blends a constant 1.0 target with the IoU; with the YOLOv5
        # default self.gr = 1.0 the target is the clamped IoU itself.
        tobj[b, a, gj, gi] = (1.0 - self.gr) + self.gr * iou.detach().clamp(0).type(tobj.dtype)
        # Classification
        if self.nc > 1:  # only when there is more than one class
            t = torch.full_like(ps[:, 5:], self.cn, device=device)  # cn: negative label (0 without smoothing)
            t[range(n), tcls[i]] = self.cp                          # cp: positive label (1 without smoothing)
            lcls += self.BCEcls(ps[:, 5:], t)
    lobj += self.BCEobj(pi[..., 4], tobj) * self.balance[i]  # obj loss, weighted per level
loss = lbox + lobj + lcls
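Note that in the official implementation the three terms are further scaled by per-loss gains and the batch size before backprop; a minimal sketch using the default values from yolov5's hyp.scratch.yaml:

lbox *= 0.05  # hyp['box']
lobj *= 1.0   # hyp['obj'] (additionally rescaled with image size and level count in the original code)
lcls *= 0.5   # hyp['cls']
loss = (lbox + lobj + lcls) * bs  # bs: batch size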
Whether a grid cell contains an object is a binary classification problem, so BCE loss is used here:
self.BCEobj = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.0]))
lobj = self.BCEobj(pi[..., 4], tobj)
The class scores are likewise supervised with BCE, i.e. one binary classifier per class rather than a softmax:
self.BCEcls = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([1.0]))
lcls = self.BCEcls(ps[:, 5:], t)
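For intuition, BCEWithLogitsLoss applies the sigmoid internally before the cross-entropy; a tiny numeric check:

import torch
import torch.nn as nn

bce = nn.BCEWithLogitsLoss()
logits = torch.tensor([2.0, -1.0])
target = torch.tensor([1.0, 0.0])
print(bce(logits, target))  # ~0.2201, the mean of -log(sigmoid(2)) and -log(1 - sigmoid(-1))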
CIoU is used to express the bounding-box regression loss.
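For reference, a minimal sketch of the CIoU computation for boxes in [xc, yc, w, h] format (written from the published CIoU definition; not the exact bbox_iou code in YOLOv5):

import math
import torch

def ciou(box1, box2, eps=1e-7):
    # box1, box2: [n, 4] boxes in (xc, yc, w, h) format
    b1x1, b1y1 = box1[:, 0] - box1[:, 2] / 2, box1[:, 1] - box1[:, 3] / 2
    b1x2, b1y2 = box1[:, 0] + box1[:, 2] / 2, box1[:, 1] + box1[:, 3] / 2
    b2x1, b2y1 = box2[:, 0] - box2[:, 2] / 2, box2[:, 1] - box2[:, 3] / 2
    b2x2, b2y2 = box2[:, 0] + box2[:, 2] / 2, box2[:, 1] + box2[:, 3] / 2
    # plain IoU
    inter = (torch.min(b1x2, b2x2) - torch.max(b1x1, b2x1)).clamp(0) * \
            (torch.min(b1y2, b2y2) - torch.max(b1y1, b2y1)).clamp(0)
    union = box1[:, 2] * box1[:, 3] + box2[:, 2] * box2[:, 3] - inter + eps
    iou = inter / union
    # squared center distance over squared diagonal of the smallest enclosing box
    cw = torch.max(b1x2, b2x2) - torch.min(b1x1, b2x1)
    ch = torch.max(b1y2, b2y2) - torch.min(b1y1, b2y1)
    c2 = cw ** 2 + ch ** 2 + eps
    rho2 = (box1[:, 0] - box2[:, 0]) ** 2 + (box1[:, 1] - box2[:, 1]) ** 2
    # aspect-ratio consistency penalty
    v = (4 / math.pi ** 2) * (torch.atan(box2[:, 2] / (box2[:, 3] + eps)) -
                              torch.atan(box1[:, 2] / (box1[:, 3] + eps))) ** 2
    alpha = v / (1 - iou + v + eps)
    return iou - rho2 / c2 - alpha * v  # lbox is then (1.0 - ciou).mean()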