参考代码:https://github.com/amdegroot/ssd.pytorch
https://github.com/acm5656/ssd_pytorch
import Config
from itertools import product as product
from math import sqrt as sqrt
import torch
#获取先验框
def default_prior_box():
    """Build the SSD default (prior) boxes for every feature map in ``Config``.

    For each cell of each feature map the boxes are emitted in this order:
    one square box of scale ``s_k``, one square box of scale
    ``sqrt(s_k * s_{k+1})``, then two boxes per entry of
    ``Config.aspect_ratios[k]`` (the ratio and its reciprocal). Every box is
    ``[cx, cy, w, h]`` expressed relative to ``Config.image_size``.

    Returns:
        A list with one tensor per feature map. Each tensor is viewed as
        ``[f, f, -1]`` (``f`` being that feature map's size) and clamped to
        ``[0, 1]``. The tensors are moved to the GPU when ``Config.use_cuda``
        is truthy.
    """
    prior_layers = []
    for k, f in enumerate(Config.feature_map):
        # These values depend only on the layer, so compute them once per
        # layer rather than once per cell.
        f_k = Config.image_size / Config.steps[k]
        s_k = Config.sk[k] / Config.image_size
        # Config.sk needs one more entry than Config.feature_map for k + 1.
        s_k_prime = sqrt(s_k * Config.sk[k + 1] / Config.image_size)
        ratio_roots = [sqrt(ar) for ar in Config.aspect_ratios[k]]

        coords = []
        for i, j in product(range(f), repeat=2):
            # Centre of cell (row i, column j).
            cx = (j + 0.5) / f_k
            cy = (i + 0.5) / f_k
            coords += [cx, cy, s_k, s_k]
            coords += [cx, cy, s_k_prime, s_k_prime]
            for root in ratio_roots:
                coords += [cx, cy, s_k * root, s_k / root]
                coords += [cx, cy, s_k / root, s_k * root]

        layer = torch.Tensor(coords)
        if Config.use_cuda:
            layer = layer.cuda()
        layer = layer.view(f, f, -1).contiguous()
        layer.clamp_(max=1, min=0)
        prior_layers.append(layer)
    return prior_layers
"""
encode和decode是根据论文中的g^{c_x}那些公式来进行变换的,不过代码实现和公式略有不同,多了variance
具体可参见https://blog.csdn.net/h__ang/article/details/90316220
http://caffecn.cn/?/question/640
https://zhuanlan.zhihu.com/p/39399799
"""
def encode(match_boxes, prior_box, variances):
    """Encode matched ground-truth boxes as offsets relative to the priors.

    Args:
        match_boxes: tensor ``[num_priors, 4]`` in corner form
            ``(x_min, y_min, x_max, y_max)``.
        prior_box: tensor ``[num_priors, 4]`` in centre form
            ``(cx, cy, w, h)``.
        variances: two-element sequence; ``variances[0]`` scales the centre
            offsets and ``variances[1]`` scales the log size ratios.

    Returns:
        Tensor ``[num_priors, 4]`` holding ``(g_cx, g_cy, g_w, g_h)``, the
        regression targets for the localisation loss.
    """
    prior_centers = prior_box[:, :2]
    prior_sizes = prior_box[:, 2:]
    top_left = match_boxes[:, :2]
    bottom_right = match_boxes[:, 2:]

    # Offset of the matched box centre from the prior centre, normalised by
    # the prior size and the first variance.
    center_offset = (top_left + bottom_right) / 2 - prior_centers
    center_offset = center_offset / (variances[0] * prior_sizes)

    # Log of the matched-to-prior size ratio, normalised by the second variance.
    size_ratio = (bottom_right - top_left) / prior_sizes
    log_size = torch.log(size_ratio) / variances[1]

    return torch.cat([center_offset, log_size], 1)
#从[x,y,w,h]改成[x_min.y_min,x_max,y_max]
def change_prior_box(box):
    """Convert centre-form boxes ``[cx, cy, w, h]`` to corner form.

    Args:
        box: tensor whose four columns are ``cx, cy, w, h``.

    Returns:
        Tensor with columns ``x_min, y_min, x_max, y_max``; it is moved to the
        GPU when ``Config.use_cuda`` is truthy.
    """
    center = box[:, :2]
    half_size = box[:, 2:] / 2
    corners = torch.cat((center - half_size, center + half_size), 1)
    return corners.cuda() if Config.use_cuda else corners
# 计算两个box的交集
def insersect(box1, box2):
    """Return the pairwise intersection areas between two sets of boxes.

    Both inputs are in corner form ``(x_min, y_min, x_max, y_max)``.

    Args:
        box1: tensor of shape ``[A, 4]``.
        box2: tensor of shape ``[B, 4]``.

    Returns:
        Tensor of shape ``[A, B]``; entry ``(a, b)`` is the overlap area of
        ``box1[a]`` and ``box2[b]``, or 0 when they do not overlap.
    """
    # Broadcasting [A, 1, 2] against [1, B, 2] pairs every box in box1 with
    # every box in box2 without copying either tensor.
    lower_right = torch.min(box1[:, 2:].unsqueeze(1), box2[:, 2:].unsqueeze(0))
    upper_left = torch.max(box1[:, :2].unsqueeze(1), box2[:, :2].unsqueeze(0))
    # A negative extent means the pair does not overlap along that axis.
    extent = torch.clamp(lower_right - upper_left, min=0)
    return extent[:, :, 0] * extent[:, :, 1]
def jaccard(box_a, box_b):
    """Compute the pairwise Jaccard overlap (IoU) of two sets of boxes.

    IoU = area(A ∩ B) / (area(A) + area(B) - area(A ∩ B)).

    Args:
        box_a: tensor ``[A, 4]`` in corner form ``(x_min, y_min, x_max, y_max)``.
        box_b: tensor ``[B, 4]`` in corner form.

    Returns:
        Tensor ``[A, B]`` of IoU values.
    """
    overlap = insersect(box_a, box_b)  # [A, B]
    width_a = box_a[:, 2] - box_a[:, 0]
    height_a = box_a[:, 3] - box_a[:, 1]
    width_b = box_b[:, 2] - box_b[:, 0]
    height_b = box_b[:, 3] - box_b[:, 1]
    # [A, 1] and [1, B] broadcast against the [A, B] intersection.
    area_a = (width_a * height_a).unsqueeze(1)
    area_b = (width_b * height_b).unsqueeze(0)
    union = area_a + area_b - overlap
    return overlap / union
def point_form(boxes):
    """Convert centre-form boxes ``[cx, cy, w, h]`` to corner form.

    Args:
        boxes: tensor whose four columns are ``cx, cy, w, h``.

    Returns:
        Tensor with columns ``x_min, y_min, x_max, y_max``.
    """
    centers = boxes[:, :2]
    half_sizes = boxes[:, 2:] / 2
    return torch.cat((centers - half_sizes, centers + half_sizes), 1)
def match(threshold, truths, priors, labels, loc_t, conf_t, idx,
          variances=(0.1, 0.2)):
    """Match priors to ground-truth boxes and write the training targets.

    Every prior is assigned the ground-truth box it overlaps most. In
    addition, each ground-truth box claims its best-overlapping prior, so
    every ground truth ends up with at least one prior. Priors whose best
    overlap is below ``threshold`` are labelled 0 (background).

    Args:
        threshold: (float) minimum Jaccard overlap for a prior to keep its
            matched class instead of background.
        truths: (tensor) ground-truth boxes in corner form,
            Shape: [num_objects, 4].
        priors: (tensor) default boxes in centre form, Shape: [num_priors, 4].
        labels: (tensor) class index of each ground-truth box; it is shifted
            by +1 here so that 0 can denote background.
        loc_t: (tensor) output buffer; ``loc_t[idx]`` receives the encoded
            offsets, Shape of the slice: [num_priors, 4].
        conf_t: (tensor) output buffer; ``conf_t[idx]`` receives the class
            target of every prior, Shape of the slice: [num_priors].
        idx: (int) position of the current image in ``loc_t`` / ``conf_t``.
        variances: two-element sequence forwarded to ``encode``. Defaults to
            ``(0.1, 0.2)``, the values that used to be hard-coded here.

    Returns:
        None. The results are written into ``loc_t`` and ``conf_t``.
    """
    # [num_objects, num_priors]; priors are converted to corner form first.
    overlaps = jaccard(truths, point_form(priors))

    # Best prior for each ground truth, Shape: [num_objects].
    _, best_prior_idx = overlaps.max(1)
    # Best ground truth for each prior, Shape: [num_priors].
    best_truth_overlap, best_truth_idx = overlaps.max(0)

    # Give the priors claimed by a ground truth an overlap of 2 so that the
    # threshold test below can never turn them into background.
    best_truth_overlap.index_fill_(0, best_prior_idx, 2)
    # Point each claimed prior at the ground truth that claimed it. Done one
    # at a time, so if two ground truths claim the same prior the later one
    # wins.
    for j in range(best_prior_idx.size(0)):
        best_truth_idx[best_prior_idx[j]] = j

    matches = truths[best_truth_idx]          # Shape: [num_priors, 4]
    conf = labels[best_truth_idx] + 1         # Shape: [num_priors]
    conf[best_truth_overlap < threshold] = 0  # label as background

    loc_t[idx] = encode(matches, priors, variances)
    conf_t[idx] = conf
def log_sum_exp(x):
    """Compute a numerically stable ``log(sum(exp(x)))`` along dimension 1.

    The maximum of each row is subtracted before exponentiating and added
    back afterwards. Using the per-row maximum (rather than one maximum for
    the whole tensor) keeps rows that lie far below the global maximum from
    underflowing to ``-inf``.

    Args:
        x: tensor of scores with the values to reduce along dimension 1.

    Returns:
        Tensor with dimension 1 reduced to size 1 (``keepdim=True``).
    """
    # The shift is a constant as far as autograd is concerned.
    row_max = x.detach().max(1, keepdim=True)[0]
    return torch.log(torch.sum(torch.exp(x - row_max), 1, keepdim=True)) + row_max
#和encode操作相反
def decode(loc, priors, variances):
    """Turn predicted offsets back into corner-form boxes.

    This reverses the offset encoding used for regression at train time.

    Args:
        loc (tensor): location predictions from the loc layers,
            Shape: [num_priors, 4].
        priors (tensor): prior boxes in centre-offset form,
            Shape: [num_priors, 4].
        variances: (list[float]) variances of the prior boxes.

    Return:
        Decoded boxes as ``(x_min, y_min, x_max, y_max)``,
        Shape: [num_priors, 4].
    """
    prior_centers = priors[:, :2]
    prior_sizes = priors[:, 2:]

    centers = prior_centers + loc[:, :2] * variances[0] * prior_sizes
    sizes = prior_sizes * torch.exp(loc[:, 2:] * variances[1])

    # Centre/size -> corners: min = centre - size / 2, max = size + min.
    top_left = centers - sizes / 2
    bottom_right = sizes + top_left
    return torch.cat((top_left, bottom_right), 1)
def nms(boxes, scores, overlap=0.5, top_k=200):
    """Apply non-maximum suppression at test time to avoid detecting too many
    overlapping bounding boxes for a given object.

    The boxes are visited from highest to lowest score. Each visited box is
    kept, and every remaining box whose IoU with it is greater than
    ``overlap`` is discarded. For multi-class detection, call this once per
    class.

    Args:
        boxes: (tensor) The location preds for the img in corner form
            ``(x1, y1, x2, y2)``, Shape: [num_priors, 4].
        scores: (tensor) The class pred scores for the img,
            Shape: [num_priors].
        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
        top_k: (int) The maximum number of box preds to consider; only the
            ``top_k`` highest-scoring boxes take part.

    Return:
        ``(keep, count)``: ``keep`` is a long tensor with one slot per score
        whose first ``count`` entries are the indices of the kept boxes in
        descending score order (the remaining slots stay 0), and ``count`` is
        the number of kept boxes.
    """
    keep = scores.new(scores.size(0)).zero_().long()
    if boxes.numel() == 0:
        return keep, 0

    # Only indices are returned, so autograd is not needed. Detaching also
    # lets the function accept boxes that require grad.
    boxes = boxes.detach()
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    # Areas use plain (x2 - x1) * (y2 - y1), without a "+ 1" pixel term.
    area = torch.mul(x2 - x1, y2 - y1)

    _, idx = scores.sort(0)  # ascending order
    idx = idx[-top_k:]       # indices of the top-k largest scores

    count = 0
    while idx.numel() > 0:
        i = idx[-1]  # index of the current highest score
        keep[count] = i
        count += 1
        if idx.size(0) == 1:
            break
        idx = idx[:-1]  # drop the kept element from the candidates

        # Intersection of box i with every remaining candidate.
        xx1 = torch.clamp(x1[idx], min=x1[i])  # max(x1[i], x1[idx])
        yy1 = torch.clamp(y1[idx], min=y1[i])  # max(y1[i], y1[idx])
        xx2 = torch.clamp(x2[idx], max=x2[i])  # min(x2[i], x2[idx])
        yy2 = torch.clamp(y2[idx], max=y2[i])  # min(y2[i], y2[idx])
        w = torch.clamp(xx2 - xx1, min=0.0)
        h = torch.clamp(yy2 - yy1, min=0.0)
        inter = w * h

        # IoU = inter / (area(i) + area(other) - inter)
        union = (area[idx] - inter) + area[i]
        iou = inter / union
        # Keep only the candidates with IoU <= overlap.
        idx = idx[iou.le(overlap)]
    return keep, count
if __name__ == '__main__':
    # Smoke check: build the prior boxes from the current Config and print them.
    print(default_prior_box())
在训练过程中,首先要确定训练图片中的ground truth(真实目标)与哪个先验框来进行匹配,与之匹配的先验框所对应的边界框将负责预测它。在Yolo中,ground truth的中心落在哪个单元格,该单元格中与其IOU最大的边界框负责预测它。但是在SSD中却完全不一样,SSD的先验框与ground truth的匹配原则主要有两点。首先,对于图片中每个ground truth,找到与其IOU最大的先验框,该先验框与其匹配,这样,可以保证每个ground truth一定与某个先验框匹配。通常称与ground truth匹配的先验框为正样本,反之,若一个先验框没有与任何ground truth进行匹配,那么该先验框只能与背景匹配,就是负样本。一个图片中ground truth是非常少的,而先验框却很多,如果仅按第一个原则匹配,很多先验框会是负样本,正负样本极其不平衡,所以需要第二个原则。第二个原则是:对于剩余的未匹配先验框,若其与某个ground truth的IOU大于某个阈值(一般是0.5),那么该先验框也与这个ground truth进行匹配。
尽管一个ground truth可以与多个先验框匹配,但是ground truth相对先验框还是太少了,所以负样本相对正样本会很多。为了保证正负样本尽量平衡,SSD采用了hard negative mining,就是对负样本进行抽样,抽样时按照置信度误差(预测背景的置信度越小,误差越大)进行降序排列,选取误差较大的top-k作为训练的负样本,以保证正负样本比例接近1:3。