SSD PyTorch Source Code Analysis (Excluding Model Construction, Training, and Testing) and a Breakdown of the Training Strategy

Reference code:
https://github.com/amdegroot/ssd.pytorch
https://github.com/acm5656/ssd_pytorch

import Config
from itertools import product
from math import sqrt
import torch
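
# The Config module imported above is a local settings file that is not shown
# in this post. A minimal sketch of the values this code relies on, assuming
# the standard SSD300 configuration from the paper (these exact values are an
# assumption; check the referenced repositories for the real file):
#
#     feature_map = [38, 19, 10, 5, 3, 1]       # sizes of the 6 detection feature maps
#     steps = [8, 16, 32, 64, 100, 300]         # effective stride of each feature map
#     sk = [30, 60, 111, 162, 213, 264, 315]    # prior-box scales in pixels; 7 entries so sk[k+1] is valid
#     aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
#     image_size = 300
#     use_cuda = torch.cuda.is_available()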

# Generate the default (prior) boxes for every feature map
def default_prior_box():
    mean_layer = []
    for k,f in enumerate(Config.feature_map):
        mean = []
        for i,j in product(range(f),repeat=2):
            f_k = Config.image_size/Config.steps[k]
            cx = (j+0.5)/f_k
            cy = (i+0.5)/f_k

            s_k = Config.sk[k]/Config.image_size
            mean += [cx,cy,s_k,s_k]

            s_k_prime = sqrt(s_k * Config.sk[k+1]/Config.image_size)
            mean += [cx,cy,s_k_prime,s_k_prime]
            for ar in Config.aspect_ratios[k]:
                mean += [cx, cy, s_k * sqrt(ar), s_k/sqrt(ar)]
                mean += [cx, cy, s_k / sqrt(ar), s_k * sqrt(ar)]
        if Config.use_cuda:
            mean = torch.Tensor(mean).cuda().view(Config.feature_map[k], Config.feature_map[k], -1).contiguous()
        else:
            mean = torch.Tensor(mean).view( Config.feature_map[k],Config.feature_map[k],-1).contiguous()
        mean.clamp_(max=1, min=0)
        mean_layer.append(mean)

    return mean_layer

"""
encode和decode是根据论文中的g^{c_x}那些公式来进行变换的,不过代码实现和公式略有不同,多了variance
具体可参见https://blog.csdn.net/h__ang/article/details/90316220
http://caffecn.cn/?/question/640
https://zhuanlan.zhihu.com/p/39399799
"""
def encode(match_boxes,prior_box,variances):
    """
    这里是对g^{cx}和g^{cy}都进行了计算
    此处的match_boxes坐标应为(x_min,y_min,x_max,y_max),这样下式计算的实际就是
    x = (x_min + x_max) / 2 - x_{prior_box}...
    这和论文中的公式一致
    """
    g_cxcy = (match_boxes[:, :2] + match_boxes[:, 2:])/2 - prior_box[:, :2]
    # encode variance
    g_cxcy /= (variances[0] * prior_box[:, 2:])
    # match wh / prior wh
    g_wh = (match_boxes[:, 2:] - match_boxes[:, :2]) / prior_box[:, 2:]
    g_wh = torch.log(g_wh) / variances[1]
    # return target for smooth_l1_loss
    return torch.cat([g_cxcy, g_wh], 1)  # [num_priors,4]

# Convert boxes from [cx, cy, w, h] to [x_min, y_min, x_max, y_max]
def change_prior_box(box):
    if Config.use_cuda:
        return torch.cat((box[:, :2] - box[:, 2:]/2,     # xmin, ymin
                         box[:, :2] + box[:, 2:]/2), 1).cuda()  # xmax, ymax
    else:
        return torch.cat((box[:, :2] - box[:, 2:]/2,     # xmin, ymin
                         box[:, :2] + box[:, 2:]/2), 1)
# Compute the intersection area of two sets of boxes
def insersect(box1,box2):
    label_num = box1.size(0)
    box_num = box2.size(0)
    # The boxes here must be in (x_min, y_min, x_max, y_max) form, so the operations
    # below compute min(x_max1, x_max2), min(y_max1, y_max2) and max(x_min1, x_min2),
    # max(y_min1, y_min2), i.e. the corners of the intersection of each ground-truth
    # box with each prior box
    """ We resize both tensors to [A,B,2] without new malloc:
    [A,2] -> [A,1,2] -> [A,B,2]
    [B,2] -> [1,B,2] -> [A,B,2]
    Then we compute the area of intersect between box_a and box_b.
    Args:
      box_a: (tensor) bounding boxes, Shape: [A,4].
      box_b: (tensor) bounding boxes, Shape: [B,4].
    Return:
      (tensor) intersection area, Shape: [A,B].
    """
    max_xy = torch.min(
        box1[:,2:].unsqueeze(1).expand(label_num,box_num,2),
        box2[:,2:].unsqueeze(0).expand(label_num,box_num,2)
    )
    min_xy = torch.max(
        box1[:,:2].unsqueeze(1).expand(label_num,box_num,2),
        box2[:,:2].unsqueeze(0).expand(label_num,box_num,2)
    )
    # If max_xy is smaller than min_xy there is no intersection, so clamp to 0;
    # inter holds the width and height of each intersection
    inter = torch.clamp((max_xy-min_xy),min=0)
    # return the intersection areas
    return inter[:,:,0]*inter[:,:,1]

def jaccard(box_a, box_b):
    """计算jaccard比
    公式:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
    """
    inter = insersect(box_a, box_b)
    area_a = ((box_a[:, 2]-box_a[:, 0]) *
              (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter)  # [A,B]
    area_b = ((box_b[:, 2]-box_b[:, 0]) *
              (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter)  # [A,B]
    union = area_a + area_b - inter
    return inter / union  # [A,B]
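
# A quick sanity check of jaccard (a hypothetical toy example, not from the
# reference repos): two unit squares offset by half their width give
# IoU = 0.5 / (1 + 1 - 0.5) = 1/3.
box_a_demo = torch.Tensor([[0.0, 0.0, 1.0, 1.0]])
box_b_demo = torch.Tensor([[0.5, 0.0, 1.5, 1.0]])
print(jaccard(box_a_demo, box_b_demo))  # tensor([[0.3333]])
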
def point_form(boxes):
    # Convert boxes from (cx, cy, w, h) form to (xmin, ymin, xmax, ymax) form
    return torch.cat((boxes[:, :2] - boxes[:, 2:]/2,     # xmin, ymin
                     boxes[:, :2] + boxes[:, 2:]/2), 1)  # xmax, ymax
def match(threshold, truths, priors, labels, loc_t, conf_t, idx):
    """计算default box和实际位置的jaccard比,计算出每个box的最大jaccard比的种类和每个种类的最大jaccard比的box
    Args:
        threshold: (float) jaccard比的阈值.
        truths: (tensor) 实际位置.
        priors: (tensor) default box
        labels: (tensor) 一个图片实际包含的类别数.
        loc_t: (tensor) 需要存储每个box不同类别中的最大jaccard比.
        conf_t: (tensor) 存储每个box的最大jaccard比的类别.
        idx: (int) 当前的批次
    """
    # Compute the Jaccard overlap
    overlaps = jaccard(
        truths,
        # convert priors to (x_min, y_min, x_max, y_max) form
        point_form(priors)
    )
    # [1,num_objects] best prior for each ground truth:
    # for each ground truth, the prior with the highest Jaccard overlap
    best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True)
    # [1,num_priors] best ground truth for each prior:
    # for each prior, the ground truth with the highest Jaccard overlap
    best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True)
    best_truth_idx.squeeze_(0)
    best_truth_overlap.squeeze_(0)
    best_prior_idx.squeeze_(1)
    best_prior_overlap.squeeze_(1)
    # Set the overlap of each ground truth's best prior to 2,
    # so that it always survives the threshold filtering below
    best_truth_overlap.index_fill_(0, best_prior_idx, 2)

    # Ensure each ground truth's best prior is assigned back to that ground truth
    for j in range(best_prior_idx.size(0)):
        best_truth_idx[best_prior_idx[j]] = j
    matches = truths[best_truth_idx]          # Shape: [num_priors,4]
    conf = labels[best_truth_idx] + 1         # Shape: [num_priors]
    conf[best_truth_overlap < threshold] = 0  # label as background
    # Encode the matched boxes into regression targets; see the loc-loss formulas in the paper
    loc = encode(matches, priors,(0.1,0.2))
    loc_t[idx] = loc    # [num_priors,4] encoded offsets to learn
    conf_t[idx] = conf  # [num_priors] top class label for each prior


def log_sum_exp(x):
    """Utility function for computing log_sum_exp in a numerically stable way.
    This will be used to determine the unaveraged confidence loss across
    all examples in a batch.
    Args:
        x (tensor): conf_preds from the conf layers
    """
    x_max = x.data.max()
    return torch.log(torch.sum(torch.exp(x - x_max), 1, keepdim=True)) + x_max
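
# log_sum_exp uses the usual max-shift trick for numerical stability; note that
# the global maximum is subtracted rather than a per-row one, which gives the
# same result. A hypothetical cross-check against PyTorch's built-in:
x_demo = torch.randn(4, 21) * 10  # moderately large logits
assert torch.allclose(log_sum_exp(x_demo), torch.logsumexp(x_demo, dim=1, keepdim=True))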

# The inverse of the encode operation above
def decode(loc, priors, variances):
    """Decode locations from predictions using priors to undo
    the encoding we did for offset regression at train time.
    Args:
        loc (tensor): location predictions for loc layers,
            Shape: [num_priors,4]
        priors (tensor): Prior boxes in center-offset form.
            Shape: [num_priors,4].
        variances: (list[float]) Variances of priorboxes
    Return:
        decoded bounding box predictions
    """

    boxes = torch.cat((
        priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
        priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
    boxes[:, :2] -= boxes[:, 2:] / 2
    boxes[:, 2:] += boxes[:, :2]
    return boxes
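
# Since decode inverts encode, a round trip should recover the original boxes
# (a hypothetical check: truths in point form, priors in center-offset form):
priors_demo = torch.Tensor([[0.50, 0.50, 0.20, 0.20],
                            [0.30, 0.70, 0.10, 0.30]])
truths_demo = torch.Tensor([[0.40, 0.40, 0.60, 0.65],
                            [0.25, 0.60, 0.40, 0.90]])
loc_demo = encode(truths_demo, priors_demo, (0.1, 0.2))
assert torch.allclose(decode(loc_demo, priors_demo, (0.1, 0.2)), truths_demo, atol=1e-6)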

def nms(boxes, scores, overlap=0.5, top_k=200):
    """Apply non-maximum suppression at test time to avoid detecting too many
    overlapping bounding boxes for a given object.
    Args:
        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
        scores: (tensor) The class predscores for the img, Shape:[num_priors].
        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
        top_k: (int) The Maximum number of box preds to consider.
    Return:
        The indices of the kept boxes with respect to num_priors.
    """
        """
    输入:
        boxes: 存储一个图片的所有预测框。[num_positive,4].
        scores:置信度。如果为多分类则需要将nms函数套在一个循环内。[num_positive].
        overlap: nms抑制时iou的阈值.
        top_k: 先选取置信度前top_k个框再进行nms.
    返回:
        nms后剩余预测框的索引.
    """

    # keep holds the indices of the boxes that survive suppression
    keep = scores.new(scores.size(0)).zero_().long()
    if boxes.numel() == 0:
        return keep, 0
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    # note there is no "x2 - x1 + 1" here: the +1 convention applies to integer
    # pixel coordinates, while these boxes use continuous (normalized) coordinates
    area = torch.mul(x2 - x1, y2 - y1)  # areas of all boxes, computed in one vectorized op
    v, idx = scores.sort(0)  # sort in ascending order
    # I = I[v >= 0.01]
    idx = idx[-top_k:]  # indices of the top-k largest vals
    xx1 = boxes.new()
    yy1 = boxes.new()
    xx2 = boxes.new()
    yy2 = boxes.new()
    w = boxes.new()
    h = boxes.new()

    # keep = torch.Tensor()
    count = 0
    # loop while there are still candidate boxes left
    while idx.numel() > 0:
        i = idx[-1]  # index of current largest val
        # keep.append(i)
        keep[count] = i
        count += 1
        if idx.size(0) == 1:
            break
        idx = idx[:-1]  # remove kept element from view
        # load bboxes of next highest vals
        # gather the coordinates of the remaining boxes into xx/yy; at this point
        # idx no longer contains the index of the top-scoring box
        torch.index_select(x1, 0, idx, out=xx1)
        torch.index_select(y1, 0, idx, out=yy1)
        torch.index_select(x2, 0, idx, out=xx2)
        torch.index_select(y2, 0, idx, out=yy2)
        # clip the remaining boxes against box i to obtain the intersection
        # rectangle (clamp broadcasts the scalar coordinates of box i)
        xx1 = torch.clamp(xx1, min=x1[i])  # max(x1[i], xx1)
        yy1 = torch.clamp(yy1, min=y1[i])  # max(y1[i], yy1)
        xx2 = torch.clamp(xx2, max=x2[i])  # min(x2[i], xx2)
        yy2 = torch.clamp(yy2, max=y2[i])  # min(y2[i], yy2)
        w.resize_as_(xx2)
        h.resize_as_(yy2)
        w = xx2 - xx1                # w = min(x2, xx2) - max(x1, xx1)
        h = yy2 - yy1                # h = min(y2, yy2) - max(y1, yy1)
        w = torch.clamp(w, min=0.0)  # zero width where there is no horizontal overlap
        h = torch.clamp(h, min=0.0)  # zero height where there is no vertical overlap
        inter = w*h
        # IoU = i / (area(a) + area(b) - i)
        rem_areas = torch.index_select(area, 0, idx)  # areas of the remaining boxes
        union = (rem_areas - inter) + area[i]
        IoU = inter/union  # store result in iou
        # keep only elements with an IoU <= overlap (note the use of le())
        idx = idx[IoU.le(overlap)]
    return keep, count
if __name__ == '__main__':
    mean = default_prior_box()
    print(mean)
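
A hypothetical usage example for nms: of the three boxes below, the second heavily overlaps the first but has a lower score, so it is suppressed; the third is disjoint and survives.

boxes = torch.Tensor([[0.10, 0.10, 0.50, 0.50],
                      [0.12, 0.10, 0.52, 0.50],
                      [0.60, 0.60, 0.90, 0.90]])
scores = torch.Tensor([0.9, 0.6, 0.8])
keep, count = nms(boxes, scores, overlap=0.5, top_k=200)
print(keep[:count])  # tensor([0, 2])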

During training, the first step is to decide which prior box each ground truth in the training image is matched to; the bounding box predicted from that prior is then responsible for detecting it. In YOLO, whichever grid cell contains the center of the ground truth is responsible for it, and within that cell the bounding box with the highest IoU predicts it. SSD works quite differently, matching priors to ground truths by two rules. First, for every ground truth in the image, the prior box with the highest IoU is matched to it, which guarantees that every ground truth is matched to some prior. Priors matched to a ground truth are called positive samples; a prior matched to no ground truth can only be matched to the background and becomes a negative sample. Since an image contains very few ground truths but a great many priors, matching by the first rule alone would leave almost all priors negative and make the positive/negative balance extreme, so a second rule is needed: any remaining unmatched prior whose IoU with some ground truth exceeds a threshold (typically 0.5) is also matched to that ground truth.
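
Both rules can be seen in a toy call to the match function above (a hypothetical example; priors are in center-offset form, as match expects):

truths = torch.Tensor([[0.1, 0.1, 0.4, 0.4]])     # one ground truth, point form
labels = torch.LongTensor([0])                    # class index before the +1 background shift
priors = torch.Tensor([[0.25, 0.25, 0.30, 0.30],  # IoU 1.0 -> best prior for the ground truth (rule 1)
                       [0.25, 0.25, 0.25, 0.25],  # IoU ~0.69 > 0.5 -> matched by rule 2
                       [0.25, 0.25, 0.05, 0.05],  # IoU ~0.03 -> background
                       [0.80, 0.80, 0.20, 0.20]]) # IoU 0 -> background
loc_t = torch.Tensor(1, 4, 4)
conf_t = torch.LongTensor(1, 4)
match(0.5, truths, priors, labels, loc_t, conf_t, 0)
print(conf_t[0])  # tensor([1, 1, 0, 0]): the first two priors become positives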

Although one ground truth can be matched to multiple priors, ground truths are still far scarcer than priors, so negatives greatly outnumber positives. To keep the two roughly balanced, SSD uses hard negative mining: the negatives are sorted in descending order of confidence loss (the lower the predicted background confidence, the larger the loss) and only the top-k hardest ones are kept as training negatives, so that the negative-to-positive ratio stays close to 3:1.
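
A minimal sketch of that selection step (assuming per-prior confidence losses have already been computed; this mirrors the double-argsort trick used in the reference repo's MultiBoxLoss, but the function and names here are illustrative):

def hard_negative_mining(conf_loss, conf_t, negpos_ratio=3):
    # conf_loss: [batch, num_priors] confidence loss of every prior
    # conf_t:    [batch, num_priors] matched labels from match(), 0 = background
    pos = conf_t > 0                                  # positive (matched) priors
    conf_loss = conf_loss.clone()
    conf_loss[pos] = 0                                # exclude positives from the ranking
    _, loss_idx = conf_loss.sort(1, descending=True)  # prior indices ordered by loss
    _, idx_rank = loss_idx.sort(1)                    # rank of each prior in that ordering
    num_pos = pos.long().sum(1, keepdim=True)
    num_neg = torch.clamp(negpos_ratio * num_pos, max=pos.size(1) - 1)
    neg = idx_rank < num_neg                          # keep only the hardest 3x negatives
    return pos, neg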
