目标检测MobilenetV2-SSD300 with focal loss流水线

0.背景


想学习检测,yolov3很多基于darknet,不是很方便,转向同样轻量级的SSD

1.介绍

https://zhuanlan.zhihu.com/p/79854543

https://zhuanlan.zhihu.com/p/79933177

https://zhuanlan.zhihu.com/p/66332452


1.1 数据集

本次采用VOC07+12,20类,加上背景21类

1.2网络结构

原始SSD300,共6个尺度,从38*38逐步/2,最后到1*1,mobile版本从19*19开始

在这里插入图片描述

    def forward(self, x):
        """Run the MobileNetV2-SSD network on an image batch.

        Feature maps are gathered from the backbone and from the extra
        layers; each one is passed through its localization head and its
        confidence head, and the head outputs are flattened per image and
        concatenated.

        Args:
            x: input image or batch of images. The shape notes kept from the
                original code assume [batch, 3, 300, 300].

        Return:
            Depending on ``self.phase``:
            "test":
                the result of ``self.detect`` applied to the location
                predictions viewed as [batch, -1, 4], the softmax of the
                confidence predictions viewed as [batch, -1, num_classes],
                and the priors cast to the tensor type of the last feature
                map that was fed to the heads.
            anything else (training):
                tuple ``(loc, conf, priors)`` where loc is viewed as
                [batch, -1, 4], conf as [batch, -1, num_classes], and
                priors is ``self.priors`` unchanged.
        """
        backbone = self.mobilenet
        feature_maps = []

        # Backbone stem: convolution -> batch norm -> activation.
        x = backbone.activation(backbone.bn1(backbone.conv1(x)))

        # The first five bottlenecks give the first detection source.
        # Author's logged shape for batch 32 at 300x300: [32, 96, 19, 19].
        for block in backbone.bottlenecks[:5]:
            x = block(x)
        feature_maps.append(x)

        # Remaining bottlenecks plus the final conv/bn/activation give the
        # second source. Author's logged shape: [32, 1280, 10, 10].
        for block in backbone.bottlenecks[5:]:
            x = block(x)
        x = backbone.activation(backbone.bn_last(backbone.conv_last(x)))
        feature_maps.append(x)

        # Extra layers: the output of every odd-indexed layer is a source.
        for layer_no, layer in enumerate(self.extras):
            x = layer(x)
            if layer_no % 2 == 1:
                feature_maps.append(x)

        # Multibox heads. Moving channels last ([B, C, H, W] -> [B, H, W, C])
        # keeps the values predicted at one location adjacent after
        # flattening.
        loc_parts = []
        conf_parts = []
        for feat, loc_head, conf_head in zip(feature_maps, self.loc, self.conf):
            loc_parts.append(loc_head(feat).permute(0, 2, 3, 1).contiguous())
            conf_parts.append(conf_head(feat).permute(0, 2, 3, 1).contiguous())
            # The test branch reads its tensor type from the last feature
            # map consumed here.
            x = feat

        loc = torch.cat([part.view(part.size(0), -1) for part in loc_parts], 1)
        conf = torch.cat([part.view(part.size(0), -1) for part in conf_parts], 1)

        boxes = loc.view(loc.size(0), -1, 4)
        scores = conf.view(conf.size(0), -1, self.num_classes)

        if self.phase == "test":
            return self.detect(
                boxes,                               # loc preds
                self.softmax(scores),                # conf preds
                self.priors.type(type(x.data)),      # default boxes
            )
        return (boxes, scores, self.priors)

简单来说,自己总结

从网络中抽取   96*19*19,1280*10*10  512*5*5  256*3*3  256*2*2 128*1*1  进行后续操作
原始 SSD是 38*38 19*19 10*10 5*5 3*3 1*1,有不同

通过  loc  和  conf  提取特征信息,分别再形成6个feature,用于计算loss,设定6层每个点的anchor数量依次为  4、6、6、6、6、6
所以  loc 和 conf 不同的只是通道数,  4*num_anchor,  21*num_anchor

例如 loc的6个输出
        torch.Size([32, 16, 19, 19])
        torch.Size([32, 24, 10, 10])
        torch.Size([32, 24, 5, 5])
        torch.Size([32, 24, 3, 3])
        torch.Size([32, 24, 2, 2])
        torch.Size([32, 24, 1, 1])

再把 loc  和  conf   压扁 成 Batch_size*x

train:返回             output = (
                loc.view(loc.size(0), -1, 4),    #  b*n*4,n为一张图所有的框,6个尺度
                conf.view(conf.size(0), -1, self.num_classes),
                self.priors  # #顺序anchor  1_min,1_max,2_max_,2_min,(3_max,3_min
 后面5个尺度才有)  [cx,cy,w,h] 按照column,row顺序来的,feature,大小  n*4     
cx,cy是网格点中心,cx,cy,w,h 均为0-1的数
            )


test:            output = self.detect(
                loc.view(loc.size(0), -1, 4),                   # loc preds
                self.softmax(conf.view(conf.size(0), -1,
                             self.num_classes)),                # conf preds,NLL要自己softmax
                self.priors.type(type(x.data))                  # default boxes
            )

1.3 VOC 数据类型

读取XML文件得到

    def __init__(self, root,
                 image_sets=[('2007', 'trainval'), ('2012', 'trainval')],
                 transform=None, target_transform=VOCAnnotationTransform(),
                 dataset_name='VOC0712'):
        """Index every image listed in the requested VOC image sets.

        Args:
            root: directory that contains the ``VOC<year>`` folders.
            image_sets: iterable of ``(year, set_name)`` pairs; for each pair
                the file ``<root>/VOC<year>/ImageSets/Main/<set_name>.txt``
                is read, one image id per line.
            transform: optional callable stored for use on (image, boxes,
                labels) when items are loaded.
            target_transform: optional callable stored for converting the
                parsed XML annotation when items are loaded.
            dataset_name: name stored on the instance.
        """
        self.root = root
        self.image_set = image_sets
        self.transform = transform
        self.target_transform = target_transform
        self.name = dataset_name
        # Path templates, filled with a (rootpath, image_id) pair.
        self._annopath = osp.join('%s', 'Annotations', '%s.xml')
        self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg')
        # One (rootpath, image_id) entry per image, e.g.
        # ('.../VOC2007', '000001'), in image-set order (07 then 12).
        self.ids = list()
        for (year, name) in image_sets:
            rootpath = osp.join(self.root, 'VOC' + year)
            list_path = osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')
            # Use a context manager so the list file is closed promptly
            # instead of being left to the garbage collector.
            with open(list_path) as list_file:
                for line in list_file:
                    self.ids.append((rootpath, line.strip()))

    def __getitem__(self, index):
        """Return ``(image, target)`` for the sample at ``index``.

        Delegates to ``pull_item`` and drops the height and width it also
        returns. Per the original author's note, the image is a C*H*W tensor
        and each target row is [xmin, ymin, xmax, ymax, label_ind] with
        coordinates in the 0-1 range.
        """
        image, target, _height, _width = self.pull_item(index)
        return image, target

    def __len__(self):
        """Return the number of indexed images."""
        sample_count = len(self.ids)
        return sample_count

    def pull_item(self, index):
        """Load the image and annotation of the sample at ``index``.

        Args:
            index: position in ``self.ids``.

        Return:
            tuple ``(image, target, height, width)``: the image as a tensor
            with the channel axis moved first, the (optionally transformed)
            annotation, and the height and width of the image as read from
            disk. Per the original author's note, each target row is
            [xmin, ymin, xmax, ymax, label_ind] with coordinates in 0-1
            relative to the image's own width and height.
        """
        sample_id = self.ids[index]

        annotation = ET.parse(self._annopath % sample_id).getroot()
        image = cv2.imread(self._imgpath % sample_id)
        height, width, _channels = image.shape

        if self.target_transform is not None:
            annotation = self.target_transform(annotation, width, height)

        if self.transform is not None:
            annotation = np.array(annotation)
            box_columns = annotation[:, :4]
            label_column = annotation[:, 4]
            image, boxes, labels = self.transform(image, box_columns, label_column)
            # Reverse the channel order (original note: "to rgb").
            image = image[:, :, (2, 1, 0)]
            # Re-attach the labels as a fifth column next to the boxes.
            annotation = np.hstack((boxes, np.expand_dims(labels, axis=1)))

        # H*W*C array -> C*H*W tensor.
        image_tensor = torch.from_numpy(image).permute(2, 0, 1)
        return image_tensor, annotation, height, width

2. loss 计算

自定义  MultiBoxLoss(nn.Module),在forward中实现loss计算

class MultiBoxLoss(nn.Module):
    """SSD Weighted Loss Function
    Compute Targets:
        1) Produce Confidence Target Indices by matching  ground truth boxes
           with (default) 'priorboxes' that have jaccard index > threshold parameter
           (default threshold: 0.5).
        2) Produce localization target by 'encoding' variance into offsets of ground
           truth boxes and their matched  'priorboxes'.
        3) Hard negative mining to filter the excessive number of negative examples
           that comes with using a large number of default bounding boxes.
           (default negative:positive ratio 3:1)
    Objective Loss:
        L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
        Where, Lconf is the CrossEntropy Loss (or the focal loss when
        cfg.USE_FL is set) and Lloc is the SmoothL1 Loss
        weighted by α which is set to 1 by cross val.
        Args:
            c: class confidences,
            l: predicted boxes,
            g: ground truth boxes
            N: number of matched default boxes
        See: https://arxiv.org/pdf/1512.02325.pdf for more details.

            criterion = MultiBoxLoss(cfg['num_classes'], 0.5, True, 0, True, 3, 0.5,
                             False, args.cuda)
    """

    def __init__(self, num_classes, overlap_thresh, prior_for_matching,
                 bkg_label, neg_mining, neg_pos, neg_overlap, encode_target,
                 use_gpu=True):
        """Store the loss configuration.

        The trailing comments give the values used in the example call shown
        in the class docstring.
        """
        super(MultiBoxLoss, self).__init__()
        self.use_gpu = use_gpu
        self.num_classes = num_classes  # 21
        self.threshold = overlap_thresh  # 0.5
        self.background_label = bkg_label  # 0
        self.encode_target = encode_target   # False
        self.use_prior_for_matching = prior_for_matching  # True
        self.do_neg_mining = neg_mining  # True
        self.negpos_ratio = neg_pos  # 3, i.e. negatives:positives = 3:1
        self.neg_overlap = neg_overlap  # 0.5
        self.variance = [0.1, 0.2]

    def forward(self, predictions, targets):
        """Multibox Loss
        Args:
            predictions (tuple): A tuple containing loc preds, conf preds,
            and prior boxes from SSD net.
                conf shape: torch.size(batch_size,num_priors,num_classes)
                loc shape: torch.size(batch_size,num_priors,4)
                priors shape: torch.size(num_priors,4), (cx, cy, w, h) in 0-1

            targets (tensor): Ground truth boxes and labels for a batch,
                shape: [batch_size,num_objs,5] (last idx is the label).

        Return:
            tuple ``(loss_l, loss_c)``: the summed localization loss and the
            summed confidence loss, each divided by the number of positive
            (matched) priors in the batch. When the batch has no positive
            prior the divisor is 1, so both losses are 0 rather than NaN.
        """
        loc_data, conf_data, priors = predictions
        num = loc_data.size(0)  # batch size
        priors = priors[:loc_data.size(1), :]  # [num_priors, 4]
        num_priors = priors.size(0)
        num_classes = self.num_classes

        # Match priors (default boxes) and ground truth boxes, one image at
        # a time. match() fills row ``idx`` of both target tensors in place.
        loc_t = torch.Tensor(num, num_priors, 4)
        conf_t = torch.LongTensor(num, num_priors)
        for idx in range(num):
            target = targets[idx]          # rows: [xmin, ymin, xmax, ymax, label]
            truths = target[:, :-1].data   # [num_objs, 4]
            labels = target[:, -1].data    # [num_objs]
            defaults = priors.data
            match(self.threshold, truths, defaults, self.variance, labels,
                  loc_t, conf_t, idx)

        if self.use_gpu:
            loc_t = loc_t.cuda()
            conf_t = conf_t.cuda()

        # Positive priors are those matched to a non-background class.
        pos = conf_t > 0  # [batch, num_priors] mask

        # Localization Loss (Smooth L1), computed on positive priors only.
        pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data)  # [batch, num_priors, 4]
        loc_p = loc_data[pos_idx].view(-1, 4)
        loc_t = loc_t[pos_idx].view(-1, 4)
        loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum')

        # Per-prior confidence loss, used only to rank negatives for mining.
        batch_conf = conf_data.view(-1, self.num_classes)
        loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1))

        # Hard Negative Mining
        loss_c = loss_c.view(num, -1)
        loss_c[pos] = 0  # exclude positives from the ranking
        _, loss_idx = loss_c.sort(1, descending=True)  # hardest first, per image
        _, idx_rank = loss_idx.sort(1)  # rank of every prior within its image
        num_pos = pos.long().sum(1, keepdim=True)  # positives per image, [batch, 1]
        # Keep negpos_ratio negatives per positive, capped at num_priors - 1.
        num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1)
        neg = idx_rank < num_neg.expand_as(idx_rank)  # [batch, num_priors] mask

        # Confidence Loss Including Positive and Negative Examples
        pos_idx = pos.unsqueeze(2).expand_as(conf_data)
        neg_idx = neg.unsqueeze(2).expand_as(conf_data)
        conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes)  # [n, num_classes]
        targets_weighted = conf_t[(pos+neg).gt(0)]  # [n]

        if cfg.USE_FL:
            # Per-class weights, shape [num_classes, 1]: class 0 (the
            # background index produced by match()) is down-weighted to 0.25
            # and every other class gets 0.75.
            alpha = torch.Tensor([[0.25]] + [[0.75]] * (num_classes - 1))
            compute_c_loss = focal_loss.FocalLoss(alpha=alpha, gamma=2, class_num=num_classes, size_average=False)
            loss_c = compute_c_loss(conf_p, targets_weighted)
        else:
            loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum')

        # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N
        # Clamp N to at least 1: a batch without any positive prior would
        # otherwise divide 0 by 0 and yield NaN losses.
        N = num_pos.sum().clamp(min=1).float()

        loss_l /= N
        loss_c /= N

        return loss_l, loss_c

匹配的过程很精彩,包括 IOU 计算

# -*- coding: utf-8 -*-
import torch


def point_form(boxes):
    """Convert center-size boxes to corner (point) form.

    Used to bring prior boxes into the same representation as the
    point-form ground truth before comparing them.

    Args:
        boxes: (tensor) boxes as (cx, cy, w, h), one per row.
    Return:
        (tensor) the same boxes as (xmin, ymin, xmax, ymax).
    """
    centers = boxes[:, :2]
    half_extent = boxes[:, 2:] / 2
    top_left = centers - half_extent
    bottom_right = centers + half_extent
    return torch.cat((top_left, bottom_right), 1)


def center_size(boxes):
    """Convert corner (point) form boxes to center-size form.

    Args:
        boxes: (tensor) boxes as (xmin, ymin, xmax, ymax), one per row.
    Return:
        (tensor) the same boxes as (cx, cy, w, h).
    """
    # Both halves must be wrapped in ONE tuple: torch.cat takes a sequence
    # of tensors followed by the dimension. The previous parenthesisation
    # passed the two halves as separate positional arguments and raised.
    return torch.cat(((boxes[:, 2:] + boxes[:, :2]) / 2,   # cx, cy
                      boxes[:, 2:] - boxes[:, :2]), 1)     # w, h


def intersect(box_a, box_b):
    """Compute the pairwise intersection area of two sets of boxes.

    The corner coordinates are broadcast against each other:
    [A,2] -> [A,1,2] and [B,2] -> [1,B,2], giving [A,B,2] results.

    Args:
      box_a: (tensor) bounding boxes in point form, Shape: [A,4].
      box_b: (tensor) bounding boxes in point form, Shape: [B,4].
    Return:
      (tensor) intersection area, Shape: [A,B].
    """
    # Lower-right corner of each overlap: the smaller of the two max corners.
    lower_right = torch.min(box_a[:, 2:].unsqueeze(1), box_b[:, 2:].unsqueeze(0))
    # Upper-left corner of each overlap: the larger of the two min corners.
    upper_left = torch.max(box_a[:, :2].unsqueeze(1), box_b[:, :2].unsqueeze(0))
    # Negative extents mean the pair does not overlap; clamp them to zero.
    extent = (lower_right - upper_left).clamp(min=0)
    return extent[:, :, 0] * extent[:, :, 1]


def jaccard(box_a, box_b):
    """Compute the pairwise jaccard overlap (IoU) of two sets of boxes.

    The jaccard overlap is the intersection over union of two boxes:
        A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)

    Args:
        box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
        box_b: (tensor) Prior boxes in point form, Shape: [num_priors,4]
    Return:
        jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
    """
    overlap = intersect(box_a, box_b)  # [A,B]

    width_a = box_a[:, 2] - box_a[:, 0]
    height_a = box_a[:, 3] - box_a[:, 1]
    width_b = box_b[:, 2] - box_b[:, 0]
    height_b = box_b[:, 3] - box_b[:, 1]

    # Spread each set's areas over the [A,B] grid: box_a along rows,
    # box_b along columns.
    area_a = (width_a * height_a).unsqueeze(1).expand_as(overlap)
    area_b = (width_b * height_b).unsqueeze(0).expand_as(overlap)

    return overlap / (area_a + area_b - overlap)


def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx):
    """Match each prior box with the ground truth box of the highest jaccard
    overlap, encode the bounding boxes, and write the resulting targets into
    row ``idx`` of ``loc_t`` and ``conf_t``.

    Args:
        threshold: (float) The overlap threshold used when matching boxes
            (0.5 in the loss above).
        truths: (tensor) Ground truth boxes in point form, Shape: [num_obj, 4].
        priors: (tensor) Prior boxes in center-size form (cx, cy, w, h),
            Shape: [num_priors, 4].
        variances: (list[float]) Variances of the prior boxes, e.g. [0.1, 0.2].
        labels: (tensor) All the class labels for the image, Shape: [num_obj].
        loc_t: (tensor) Tensor to be filled w/ encoded location targets,
            Shape: [batch, num_priors, 4].
        conf_t: (tensor) Tensor to be filled w/ matched class indices for
            conf preds, Shape: [batch, num_priors]. 0 means background and
            ground-truth label k is stored as k + 1.
        idx: (int) current batch index
    Return:
        None. ``loc_t[idx]`` and ``conf_t[idx]`` are modified in place.
    """
    # An image without any ground-truth box cannot be matched (the max over
    # an empty dimension below would raise), so mark every prior as
    # background. The location targets are zeroed for determinism.
    if truths.numel() == 0:
        loc_t[idx] = 0
        conf_t[idx] = 0
        return

    # IoU between every ground truth and every prior: [num_obj, num_priors]
    overlaps = jaccard(
        truths,
        point_form(priors)
    )
    # (Bipartite Matching)
    # Best prior for each ground truth: [num_obj, 1]
    _, best_prior_idx = overlaps.max(1, keepdim=True)
    # Best ground truth for each prior: [1, num_priors]
    best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True)
    best_truth_idx.squeeze_(0)       # [num_priors]
    best_truth_overlap.squeeze_(0)   # [num_priors]
    best_prior_idx.squeeze_(1)       # [num_obj]
    # Give each ground truth's best prior an overlap of 2 so that it always
    # passes the threshold test below.
    best_truth_overlap.index_fill_(0, best_prior_idx, 2)
    # TODO refactor: index  best_prior_idx with long tensor
    # Ensure every gt matches with its prior of max overlap. Kept as a loop
    # so that, when two ground truths share a best prior, the later one wins.
    for j in range(best_prior_idx.size(0)):
        best_truth_idx[best_prior_idx[j]] = j

    matches = truths[best_truth_idx]          # Shape: [num_priors,4]
    conf = labels[best_truth_idx] + 1         # Shape: [num_priors]; shift so 0 is background
    conf[best_truth_overlap < threshold] = 0  # label as background
    loc = encode(matches, priors, variances)  # Shape: [num_priors,4]
    loc_t[idx] = loc    # [num_priors,4] encoded offsets to learn
    conf_t[idx] = conf  # [num_priors] top class label for each prior


def encode(matched, priors, variances):
    """Turn matched ground truth boxes into regression targets relative to
    their prior boxes, scaled by the prior-box variances.

    Args:
        matched: (tensor) Coords of ground truth for each prior in point-form
            Shape: [num_priors, 4].
        priors: (tensor) Prior boxes in center-size form (cx, cy, w, h)
            Shape: [num_priors,4].
        variances: (list[float]) Variances of priorboxes, e.g. [0.1, 0.2]
    Return:
        encoded boxes (tensor), Shape: [num_priors, 4]
    """
    prior_centers = priors[:, :2]
    prior_sizes = priors[:, 2:]
    gt_min = matched[:, :2]
    gt_max = matched[:, 2:]

    # Center offset, normalised by the prior size and the first variance.
    center_offset = ((gt_min + gt_max) / 2 - prior_centers) / (variances[0] * prior_sizes)
    # Log of the size ratio, divided by the second variance.
    size_ratio = torch.log((gt_max - gt_min) / prior_sizes) / variances[1]

    # Target for smooth_l1_loss: [num_priors, 4]
    return torch.cat([center_offset, size_ratio], 1)


# Adapted from https://github.com/Hakuyume/chainer-ssd
def decode(loc, priors, variances):
    """Invert the training-time offset encoding: turn location predictions
    back into point-form boxes using the prior boxes.

    Args:
        loc (tensor): location predictions for loc layers,
            Shape: [num_priors,4]
        priors (tensor): Prior boxes in center-size form.
            Shape: [num_priors,4].
        variances: (list[float]) Variances of priorboxes
    Return:
        decoded bounding box predictions as (xmin, ymin, xmax, ymax)
    """
    # Undo the center and size encodings.
    centers = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
    sizes = priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])

    # Center-size -> corner form.
    top_left = centers - sizes / 2
    bottom_right = sizes + top_left
    return torch.cat((top_left, bottom_right), 1)


def log_sum_exp(x):
    """Compute log(sum(exp(x))) along dimension 1.

    The global maximum of ``x`` is subtracted before exponentiating and
    added back afterwards. The result is used for the unaveraged confidence
    loss across all examples in a batch.

    Args:
        x (tensor): conf_preds from conf layers, (batch*num_priors, num_classes)
    Return:
        (tensor) one value per row, shape (rows, 1).
    """
    peak = x.data.max()
    shifted_sum = torch.exp(x - peak).sum(1, keepdim=True)
    return torch.log(shifted_sum) + peak


NMS的实现

def nms(boxes, scores, overlap=0.5, top_k=200):
    """Apply non-maximum suppression at test time to avoid detecting too many
    overlapping bounding boxes for a given object.

    Args:
        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
        scores: (tensor) The class predscores for the img, Shape:[num_priors].
        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
        top_k: (int) The Maximum number of box preds to consider.
    Return:
        tuple ``(keep, count)``: ``keep`` is a long tensor with one slot per
        score whose first ``count`` entries are the indices of the kept
        boxes (with respect to num_priors), highest score first; the
        remaining entries are 0. For empty input ``count`` is 0.
    """
    keep = scores.new(scores.size(0)).zero_().long()
    if boxes.numel() == 0:
        # Same (keep, count) shape as the normal path so callers can always
        # unpack two values.
        return keep, 0
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    area = torch.mul(x2 - x1, y2 - y1)
    _, idx = scores.sort(0)  # sort in ascending order
    idx = idx[-top_k:]  # indices of the top-k largest vals

    count = 0
    while idx.numel() > 0:
        i = idx[-1]  # index of current largest val
        keep[count] = i
        count += 1
        if idx.size(0) == 1:
            break
        idx = idx[:-1]  # remove kept element from view
        # Corners of the overlap between the kept box and each remaining
        # candidate. Plain indexing replaces index_select(..., out=buffer),
        # whose buffers had to be resized on every pass.
        xx1 = torch.clamp(x1[idx], min=x1[i])
        yy1 = torch.clamp(y1[idx], min=y1[i])
        xx2 = torch.clamp(x2[idx], max=x2[i])
        yy2 = torch.clamp(y2[idx], max=y2[i])
        # Non-overlapping pairs give negative extents; clamp them to zero.
        w = torch.clamp(xx2 - xx1, min=0.0)
        h = torch.clamp(yy2 - yy1, min=0.0)
        inter = w * h
        # IoU = i / (area(a) + area(b) - i)
        rem_areas = area[idx]  # areas of the remaining candidates
        union = (rem_areas - inter) + area[i]
        IoU = inter / union
        # keep only elements with an IoU <= overlap
        idx = idx[IoU.le(overlap)]
    return keep, count

AP计算

https://zhuanlan.zhihu.com/p/70667071


https://zhuanlan.zhihu.com/p/70667071  这个写的太好了,以这个为主

import numpy as np

def voc_ap(rec, prec, use_07_metric=False):
    """ ap = voc_ap(rec, prec, [use_07_metric])
    Compute VOC AP given precision and recall.
    If use_07_metric is true, uses the
    VOC 07 11 point method (default:False).

    Args:
        rec: 1-D numpy array of recall values.
        prec: 1-D numpy array of precision values, same length as rec.
        use_07_metric: use the 11-point interpolation instead of the area
            under the precision envelope.
    Return:
        the average precision as a float.
    """
    if use_07_metric:
        # 11 point metric: average, over recall thresholds 0.0, 0.1, ...,
        # 1.0, of the best precision reached at or beyond the threshold.
        ap = 0.
        for t in np.arange(0., 1.1, 0.1):
            if np.sum(rec >= t) == 0:
                p = 0
            else:
                p = np.max(prec[rec >= t])
            ap = ap + p / 11.
    else:
        # correct AP calculation
        # first append sentinel values at both ends
        mrec = np.concatenate(([0.], rec, [1.]))
        mpre = np.concatenate(([0.], prec, [0.]))

        # compute the precision envelope: each entry becomes the maximum of
        # itself and everything to its right (running max from the end).
        mpre = np.maximum.accumulate(mpre[::-1])[::-1]

        # to calculate area under PR curve, look for points
        # where X axis (recall) changes value
        i = np.where(mrec[1:] != mrec[:-1])[0]

        # sum the rectangles (delta recall) * (envelope precision)
        ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
    return ap

# Worked example: recall/precision pairs fed to voc_ap with the default
# (area under the envelope) method.
rec = np.array([0.0666, 0.1333,0.1333, 0.4, 0.4666])
prec = np.array([1., 0.6666, 0.6666, 0.4285, 0.3043])
ap = voc_ap(rec, prec)

print(ap) # expected AP: about 0.2456

 

你可能感兴趣的:(pytorch,目标检测)