本文是阅读 GitHub 上 Few-shot Object Detection via Feature Reweighting 开源代码时的笔记,需要源码的读者可以自行前往获取。对应的论文是
“Few-shot Object Detection via Feature Reweighting, ICCV 2019”。
这边由于项目相关需要用到few shot,进行了相关的论文学习,先行进行记录
博主主要需要查看部分loss计算,所以着重到loss相关的方面,其他部分需根据源码自行查看。
源码主要用到了两个网络配置文件,其中一个是 darknet_dynamic.cfg,它是在 yolov2 的配置基础上修改得到的:主体 backbone 仍然是 darknet19,只是在检测卷积层之前加入了一个由 reweight 向量构成的卷积层。
关于yolov2的部分loss参考可以看这里
关于Yolov2的loss function二三事
darknet源码阅读之region_layer.c
主要看两个函数:
build_targets
class RegionLoss 的 forward函数
def forward(self, output, target):
    """Decode the raw head output into boxes and build the loss targets.

    This is the first half of the method as presented in this write-up: it
    turns the raw network output into per-anchor box predictions on the
    feature-map scale and passes them to ``build_targets``.  The loss terms
    are computed afterwards from the values that call returns.

    Args:
        output: raw detection-head output.  It is viewed as
            ``(nB, nA, 5 + nC, nH, nW)`` below, so dim 1 must hold
            ``nA * (5 + nC)`` channels.
        target: ground-truth rows; a 3-D tensor is flattened to 2-D first.
    """
    if target.dim() == 3:
        target = target.view(-1, target.size(-1))
    output, target = neg_filter(output, target)

    # Batch size, anchor/class counts and feature-map size.
    nB = output.data.size(0)
    nA = self.num_anchors
    nC = self.num_classes
    nH = output.data.size(2)
    nW = output.data.size(3)
    nCells = nB * nA * nH * nW

    # Split the channel axis into box offsets, confidence and class scores.
    output = output.view(nB, nA, (5+nC), nH, nW)
    x = torch.sigmoid(output[:, :, 0])
    y = torch.sigmoid(output[:, :, 1])
    w = output[:, :, 2]
    h = output[:, :, 3]
    conf = torch.sigmoid(output[:, :, 4])
    # Class scores, one row per (image, anchor, cell): [nB*nA*nH*nW, nC].
    # reshape (not view) because the slice is not contiguous.
    cls = output[:, :, 5:5+nC]
    cls = cls.reshape(nB*nA, nC, nH*nW).transpose(1, 2).contiguous().view(nCells, nC)

    # Convert tx, ty, tw, th into x, y, w, h on the feature-map scale:
    # cell offset + sigmoid for the centre, anchor size * exp for the extent.
    grid_x = torch.linspace(0, nW-1, nW).repeat(nH, 1).repeat(nB*nA, 1, 1).view(nCells).cuda()
    grid_y = torch.linspace(0, nH-1, nH).repeat(nW, 1).t().repeat(nB*nA, 1, 1).view(nCells).cuda()
    anchor_table = torch.Tensor(self.anchors).view(nA, int(self.anchor_step))
    anchor_w = anchor_table.index_select(1, torch.LongTensor([0])).cuda()
    anchor_h = anchor_table.index_select(1, torch.LongTensor([1])).cuda()
    anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH*nW).view(nCells)
    anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, nH*nW).view(nCells)

    # The predictions are 4-D while the grids/anchors are flat; flatten the
    # predictions explicitly so the element-wise ops see matching shapes.
    pred_boxes = torch.cuda.FloatTensor(4, nCells)
    pred_boxes[0] = x.data.reshape(-1) + grid_x
    pred_boxes[1] = y.data.reshape(-1) + grid_y
    pred_boxes[2] = torch.exp(w.data.reshape(-1)) * anchor_w
    pred_boxes[3] = torch.exp(h.data.reshape(-1)) * anchor_h
    pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4))

    # Match the decoded predictions against the ground truth.
    nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(
        pred_boxes, target.data, self.anchors, nA, nC,
        nH, nW, self.noobject_scale, self.object_scale, self.thresh, self.seen)
def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale, sil_thresh, seen):
    """Build regression/confidence/class targets and loss masks for a batch.

    Args:
        pred_boxes: predicted boxes on the feature-map scale, one
            ``(x, y, w, h)`` row per (image, anchor, cell), ordered
            image -> anchor -> row -> column.
        target: per-image ground truth.  Box ``t`` of image ``b`` occupies
            ``target[b][t*5 : t*5+5]`` as ``(class, x, y, w, h)``; the list
            ends at the first box whose ``x`` is 0.  x/w are multiplied by
            ``nW`` and y/h by ``nH``, so they are presumably normalised to
            [0, 1] -- confirm against the data loader.
        anchors: flat list with ``len(anchors) // num_anchors`` values per
            anchor: ``(w, h)`` or ``(w, h, x_offset, y_offset)``.
        num_anchors: number of anchors per cell.
        num_classes: number of classes (not needed by the computation).
        nH, nW: feature-map height and width.
        noobject_scale: confidence weight for cells without an assigned box.
        object_scale: confidence weight for cells with an assigned box.
        sil_thresh: predictions whose best IoU with any ground-truth box
            exceeds this value get a confidence weight of 0.
        seen: progress counter supplied by the caller; while it is below
            12800 every cell also regresses towards the anchor prior.

    Returns:
        ``(nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th,
        tconf, tcls)``: the number of ground-truth boxes, the number of
        assigned predictions with IoU > 0.5, and tensors of shape
        ``(nB, nA, nH, nW)``.
    """
    nB = target.size(0)
    nA = num_anchors
    anchor_step = len(anchors)//num_anchors

    # Defaults: every cell is "no object", nothing is regressed or classified.
    conf_mask = torch.ones(nB, nA, nH, nW) * noobject_scale
    coord_mask = torch.zeros(nB, nA, nH, nW)
    cls_mask = torch.zeros(nB, nA, nH, nW)
    tx = torch.zeros(nB, nA, nH, nW)
    ty = torch.zeros(nB, nA, nH, nW)
    tw = torch.zeros(nB, nA, nH, nW)
    th = torch.zeros(nB, nA, nH, nW)
    tconf = torch.zeros(nB, nA, nH, nW)
    tcls = torch.zeros(nB, nA, nH, nW)

    nAnchors = nA*nH*nW
    nPixels = nH*nW

    # Pass 1: for every prediction, find its best IoU over all ground-truth
    # boxes of the image and silence the confidently overlapping ones.
    for b in range(nB):
        cur_pred_boxes = pred_boxes[b*nAnchors:(b+1)*nAnchors].t()
        cur_ious = torch.zeros(nAnchors)
        for t in range(cfg.max_boxes):
            if target[b][t*5+1] == 0:
                break
            # Ground-truth box on the feature-map scale.
            gx = target[b][t*5+1]*nW
            gy = target[b][t*5+2]*nH
            gw = target[b][t*5+3]*nW
            gh = target[b][t*5+4]*nH
            # Pair every prediction with this box and keep the running max IoU.
            cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t()
            cur_ious = torch.max(cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False))
        # Predictions above the threshold are not penalised as background.
        conf_mask[b][torch.reshape(cur_ious > sil_thresh, (nA, nH, nW))] = 0

    # Early in training every cell regresses towards the anchor prior.
    if seen < 12800:
        if anchor_step == 4:
            anchor_table = torch.FloatTensor(anchors).view(nA, anchor_step)
            # Columns 2 and 3 hold the anchor's x and y offsets respectively.
            tx = anchor_table.index_select(1, torch.LongTensor([2])).view(1, nA, 1, 1).repeat(nB, 1, nH, nW)
            ty = anchor_table.index_select(1, torch.LongTensor([3])).view(1, nA, 1, 1).repeat(nB, 1, nH, nW)
        else:
            tx.fill_(0.5)
            ty.fill_(0.5)
        tw.zero_()
        th.zero_()
        coord_mask.fill_(1)

    nGT = 0        # number of ground-truth boxes in the batch
    nCorrect = 0   # assigned predictions whose IoU with their box is > 0.5

    # Pass 2: assign each ground-truth box to its best-matching anchor in the
    # cell that contains the box centre.
    for b in range(nB):
        for t in range(cfg.max_boxes):
            if target[b][t*5+1] == 0:
                break
            nGT = nGT + 1
            best_iou = 0.0
            best_n = -1
            min_dist = 10000
            gx = target[b][t*5+1] * nW
            gy = target[b][t*5+2] * nH
            gi = int(gx)
            gj = int(gy)
            gw = target[b][t*5+3]*nW
            gh = target[b][t*5+4]*nH
            # Compare shapes only: both boxes are placed at the origin.
            gt_box = [0, 0, gw, gh]
            for n in range(nA):
                aw = anchors[anchor_step*n]
                ah = anchors[anchor_step*n+1]
                anchor_box = [0, 0, aw, ah]
                iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False)
                if anchor_step == 4:
                    ax = anchors[anchor_step*n+2]
                    ay = anchors[anchor_step*n+3]
                    dist = pow(((gi+ax) - gx), 2) + pow(((gj+ay) - gy), 2)
                # Keep the anchor with the highest IoU; with 4-value anchors
                # an IoU tie is broken by the smaller centre distance.
                if iou > best_iou:
                    best_iou = iou
                    best_n = n
                elif anchor_step == 4 and iou == best_iou and dist < min_dist:
                    best_iou = iou
                    best_n = n
                    min_dist = dist

            # The prediction made by the chosen anchor in the chosen cell.
            gt_box = [gx, gy, gw, gh]
            pred_box = pred_boxes[b*nAnchors+best_n*nPixels+gj*nW+gi]

            # Enable all three losses at the assigned location.
            coord_mask[b][best_n][gj][gi] = 1
            cls_mask[b][best_n][gj][gi] = 1
            conf_mask[b][best_n][gj][gi] = object_scale
            # Express the ground truth in the network's output parameterisation.
            tx[b][best_n][gj][gi] = target[b][t*5+1] * nW - gi
            ty[b][best_n][gj][gi] = target[b][t*5+2] * nH - gj
            tw[b][best_n][gj][gi] = math.log(gw/anchors[anchor_step*best_n])
            th[b][best_n][gj][gi] = math.log(gh/anchors[anchor_step*best_n+1])
            # The confidence target is the IoU of the current prediction.
            iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False)
            tconf[b][best_n][gj][gi] = iou
            tcls[b][best_n][gj][gi] = target[b][t*5]
            if iou > 0.5:
                nCorrect = nCorrect + 1

    return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls
cls_mask = (cls_mask == 1)
if cfg.metayolo:
tcls.zero_()
nProposals = int((conf > 0.25).float().sum().data[0])
tx = Variable(tx.cuda())
ty = Variable(ty.cuda())
tw = Variable(tw.cuda())
th = Variable(th.cuda())
tconf = Variable(tconf.cuda())
tcls = Variable(tcls.view(-1)[cls_mask].long().cuda())
coord_mask = Variable(coord_mask.cuda())
conf_mask = Variable(conf_mask.cuda().sqrt())
cls_mask = Variable(cls_mask.view(-1, 1).repeat(1,nC).cuda())
cls = cls[cls_mask].view(-1, nC)
# 进行损失计算,这里class用的交叉熵计算损失,需要注意
loss_x = self.coord_scale * nn.MSELoss(size_average=False)(x*coord_mask, tx*coord_mask)/2.0
loss_y = self.coord_scale * nn.MSELoss(size_average=False)(y*coord_mask, ty*coord_mask)/2.0
loss_w = self.coord_scale * nn.MSELoss(size_average=False)(w*coord_mask, tw*coord_mask)/2.0
loss_h = self.coord_scale * nn.MSELoss(size_average=False)(h*coord_mask, th*coord_mask)/2.0
loss_conf = nn.MSELoss(size_average=False)(conf*conf_mask, tconf*conf_mask)/2.0
loss_cls = self.class_scale * nn.CrossEntropyLoss(size_average=False)(cls, tcls)
loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls
return loss
总体看来,这里的损失计算方式和 yolov2 没有很大的不同,但 target 的生成方式估计会有些许差别,需要再进去细看。这部分我先参照 yolov2 的情形进行测试,测试结果待补充。