FCOS 并没有使用RPN来回归边界框,而是采用逐像素的方法来回归边界框,只不过这部分的代码名称依然是RPN而已。下面打印了这部分的网络结构。

  (head): FCOSHead(
    (cls_tower): Sequential(
      (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): GroupNorm(32, 256, eps=1e-05, affine=True)
      (2): ReLU()
      (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (4): GroupNorm(32, 256, eps=1e-05, affine=True)
      (5): ReLU()
      (6): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (7): GroupNorm(32, 256, eps=1e-05, affine=True)
      (8): ReLU()
      (9): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (10): GroupNorm(32, 256, eps=1e-05, affine=True)
      (11): ReLU()
    (bbox_tower): Sequential(
      (0): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): GroupNorm(32, 256, eps=1e-05, affine=True)
      (2): ReLU()
      (3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (4): GroupNorm(32, 256, eps=1e-05, affine=True)
      (5): ReLU()
      (6): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (7): GroupNorm(32, 256, eps=1e-05, affine=True)
      (8): ReLU()
      (9): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (10): GroupNorm(32, 256, eps=1e-05, affine=True)
      (11): ReLU()
    (cls_logits): Conv2d(256, 80, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (bbox_pred): Conv2d(256, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (centerness): Conv2d(256, 1, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (scales): ModuleList(
      (0): Scale()
      (1): Scale()
      (2): Scale()
      (3): Scale()
      (4): Scale()
  (box_selector_test): FCOSPostProcessor()

1)FCOSHead 部分

i. cls_tower 和 bbox_tower


cls_tower_list = []  # list:5 {Tensor:(1,256,100,140),Tensor:(1,256,50,70),Tensor:(1,256,25,35),Tensor:(1,256,13,18),Tensor:(1,256,7,9)}
box_tower_list = []  # list:5 {Tensor:(1,256,100,140),Tensor:(1,256,50,70),Tensor:(1,256,25,35),Tensor:(1,256,13,18),Tensor:(1,256,7,9)}

ii. cls_logits


logits = []  # list:5 {Tensor:(1,80,100,140),Tensor:(1,80,50,70),Tensor:(1,80,25,35),Tensor:(1,80,13,18),Tensor:(1,80,7,9)}

iii. bbox_pred


bbox_reg = []  # list:5 {Tensor:(1,4,100,140),Tensor:(1,4,50,70),Tensor:(1,4,25,35),Tensor:(1,4,13,18),Tensor:(1,4,7,9)}

iv. centerness


centerness = []  # list:5 {Tensor:(1,1,100,140),Tensor:(1,1,50,70),Tensor:(1,1,25,35),Tensor:(1,1,13,18),Tensor:(1,1,7,9)}

v. FCOSHead 类


class FCOSHead(torch.nn.Module):
    """Prediction head shared by all FPN levels.

    Two parallel towers (classification / box regression) of
    Conv3x3 -> GroupNorm -> ReLU stacks feed three 3x3 prediction convs:
    class logits, 4-d box offsets and a 1-channel centerness map.
    """

    def __init__(self, cfg, in_channels):  # in_channels: 256
        """
        Arguments:
            cfg: config node; the ``cfg.MODEL.FCOS.*`` entries read below are used.
            in_channels (int): number of channels of the input feature
        """
        super(FCOSHead, self).__init__()
        # TODO: Implement the sigmoid version first.
        num_classes = cfg.MODEL.FCOS.NUM_CLASSES - 1  # e.g. 80 (background dropped)
        self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES  # e.g. [8, 16, 32, 64, 128]
        self.norm_reg_targets = cfg.MODEL.FCOS.NORM_REG_TARGETS  # e.g. True
        self.centerness_on_reg = cfg.MODEL.FCOS.CENTERNESS_ON_REG  # e.g. True
        self.use_dcn_in_tower = cfg.MODEL.FCOS.USE_DCN_IN_TOWER  # e.g. False

        cls_tower = []
        bbox_tower = []
        num_convs = cfg.MODEL.FCOS.NUM_CONVS  # e.g. 4
        for i in range(num_convs):
            # Only the last conv of each tower may be a deformable conv.
            if self.use_dcn_in_tower and i == num_convs - 1:
                conv_func = DFConv2d
            else:
                conv_func = nn.Conv2d

            # Each tower stage is Conv3x3 -> GroupNorm(32) -> ReLU and keeps
            # both the channel count and the spatial size unchanged.
            for tower in (cls_tower, bbox_tower):
                tower.append(
                    conv_func(
                        in_channels,
                        in_channels,
                        kernel_size=3,
                        stride=1,
                        padding=1,
                        bias=True
                    )
                )
                tower.append(nn.GroupNorm(32, in_channels))
                tower.append(nn.ReLU())

        self.add_module('cls_tower', nn.Sequential(*cls_tower))
        self.add_module('bbox_tower', nn.Sequential(*bbox_tower))
        self.cls_logits = nn.Conv2d(
            in_channels, num_classes, kernel_size=3, stride=1,
            padding=1
        )
        self.bbox_pred = nn.Conv2d(
            in_channels, 4, kernel_size=3, stride=1,
            padding=1
        )
        self.centerness = nn.Conv2d(
            in_channels, 1, kernel_size=3, stride=1,
            padding=1
        )

        # initialization
        for modules in [self.cls_tower, self.bbox_tower,
                        self.cls_logits, self.bbox_pred,
                        self.centerness]:
            for layer in modules.modules():
                if isinstance(layer, nn.Conv2d):
                    torch.nn.init.normal_(layer.weight, std=0.01)
                    torch.nn.init.constant_(layer.bias, 0)

        # initialize the bias for focal loss
        prior_prob = cfg.MODEL.FCOS.PRIOR_PROB  # e.g. 0.01
        bias_value = -math.log((1 - prior_prob) / prior_prob)  # about -4.595 for 0.01
        torch.nn.init.constant_(self.cls_logits.bias, bias_value)

        # One learnable scalar per FPN level: the head is shared across levels
        # but each level regresses a different size range.
        self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in range(5)])

    def forward(self, x):
        """Run the shared head on every FPN level.

        Arguments:
            x: iterable of per-level feature maps coming from the FPN.

        Returns:
            (logits, bbox_reg, centerness): three lists with one tensor per
            level. Spatial size equals that of the input level; channel
            counts are num_classes, 4 and 1 respectively
            (e.g. (1, 80, 100, 140), (1, 4, 100, 140), (1, 1, 100, 140)
            for the first level of an 800x1120 input).
        """
        logits = []
        bbox_reg = []
        centerness = []
        for level, feature in enumerate(x):
            cls_tower = self.cls_tower(feature)
            box_tower = self.bbox_tower(feature)

            logits.append(self.cls_logits(cls_tower))
            # Centerness is predicted from the regression tower or from the
            # classification tower depending on the config.
            if self.centerness_on_reg:
                centerness.append(self.centerness(box_tower))
            else:
                centerness.append(self.centerness(cls_tower))

            bbox_pred = self.scales[level](self.bbox_pred(box_tower))
            if self.norm_reg_targets:
                bbox_pred = F.relu(bbox_pred)
                if self.training:
                    # Training targets are already normalised by the stride.
                    bbox_reg.append(bbox_pred)
                else:
                    # At inference map the normalised offsets back to pixels.
                    bbox_reg.append(bbox_pred * self.fpn_strides[level])
            else:
                bbox_reg.append(torch.exp(bbox_pred))
        return logits, bbox_reg, centerness

## ------ self.scales = nn.ModuleList([Scale(init_value=1.0) for _ in range(5)])


bbox_pred = self.scales[l](self.bbox_pred(box_tower))

这是因为Head接收FPN 5层不同尺寸的特征(五层特征共享head),其回归范围不同,所以需要用缩放因子scale对回归结果进行缩放。

class Scale(nn.Module):
    """Multiply the input by a single learnable scalar."""

    def __init__(self, init_value=1.0):
        super().__init__()
        # One-element float32 parameter holding the scale factor.
        initial = torch.tensor([init_value], dtype=torch.float32)
        self.scale = nn.Parameter(initial)

    def forward(self, input):
        scaled = input * self.scale
        return scaled

打印了以下5个scale 的值,如下所示

number l is 0
self.scale is Parameter containing:
tensor([0.9034], device='cuda:0', requires_grad=True)
number l is 1
self.scale is Parameter containing:
tensor([0.9520], device='cuda:0', requires_grad=True)
number l is 2
self.scale is Parameter containing:
tensor([0.9570], device='cuda:0', requires_grad=True)
number l is 3
self.scale is Parameter containing:
tensor([0.9397], device='cuda:0', requires_grad=True)
number l is 4
self.scale is Parameter containing:
tensor([0.8791], device='cuda:0', requires_grad=True)

## ------- nn.ModuleList


2) FCOSModule 部分



class FCOSModule(torch.nn.Module):
    """
    Module for FCOS computation. Takes feature maps from the backbone and
    FCOS outputs and losses. Only Test on FPN now.
    """

    def __init__(self, cfg, in_channels):
        super(FCOSModule, self).__init__()

        head = FCOSHead(cfg, in_channels)

        box_selector_test = make_fcos_postprocessor(cfg)

        loss_evaluator = make_fcos_loss_evaluator(cfg)
        self.head = head
        self.box_selector_test = box_selector_test
        self.loss_evaluator = loss_evaluator
        self.fpn_strides = cfg.MODEL.FCOS.FPN_STRIDES  # e.g. [8, 16, 32, 64, 128]

    def forward(self, images, features, targets=None):
        """
        Arguments:
            images (ImageList): images for which we want to compute the predictions
            features (list[Tensor]): features computed from the images that are
                used for computing the predictions. Each tensor in the list
                correspond to different feature levels
            targets (list[BoxList]): ground-truth boxes present in the image (optional)

        Returns:
            boxes (list[BoxList]): the predicted boxes from the RPN, one BoxList per
                image.
            losses (dict[Tensor]): the losses for the model during training. During
                testing, it is an empty dict.
        """
        box_cls, box_regression, centerness = self.head(features)
        locations = self.compute_locations(features)

        if self.training:
            return self._forward_train(
                locations, box_cls,
                box_regression,
                centerness, targets
            )
        else:  # inference / demo
            return self._forward_test(
                locations, box_cls, box_regression,
                centerness, images.image_sizes
            )

    def _forward_train(self, locations, box_cls, box_regression, centerness, targets):
        """Compute the three FCOS losses; no boxes are returned in training."""
        loss_box_cls, loss_box_reg, loss_centerness = self.loss_evaluator(
            locations, box_cls, box_regression, centerness, targets
        )
        losses = {
            "loss_cls": loss_box_cls,
            "loss_reg": loss_box_reg,
            "loss_centerness": loss_centerness
        }
        return None, losses

    def _forward_test(self, locations, box_cls, box_regression, centerness, image_sizes):
        """Post-process the head outputs into boxes; the loss dict is empty."""
        boxes = self.box_selector_test(
            locations, box_cls, box_regression,
            centerness, image_sizes
        )
        return boxes, {}

    def compute_locations(self, features):
        """Return, per feature level, the image-space point of every cell.

        Arguments:
            features: per-level feature maps; only the last two dimensions
                (height, width) and the device are read.

        Returns:
            list[Tensor]: one (H*W, 2) tensor of (x, y) points per level.
        """
        locations = []
        for level, feature in enumerate(features):
            h, w = feature.size()[-2:]  # spatial size of this level
            locations_per_level = self.compute_locations_per_level(
                h, w, self.fpn_strides[level],
                feature.device
            )
            locations.append(locations_per_level)
        return locations

    def compute_locations_per_level(self, h, w, stride, device):
        """Build the (H*W, 2) grid of image-space (x, y) points for one level."""
        # Multiplying by the stride maps feature cells back to image pixels;
        # each cell covers a stride x stride patch, so points are stride apart.
        shifts_x = torch.arange(
            0, w * stride, step=stride,
            dtype=torch.float32, device=device
        )
        shifts_y = torch.arange(
            0, h * stride, step=stride,
            dtype=torch.float32, device=device
        )
        shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
        shift_x = shift_x.reshape(-1)
        shift_y = shift_y.reshape(-1)
        # Offset by stride // 2 so each point sits at the centre of its patch.
        locations = torch.stack((shift_x, shift_y), dim=1) + stride // 2
        return locations

首先,它的输入如下所示(代码在generalized_rcnn.py 的forward部分),包括原图像的Tensor,骨干网络(ResNet和FPN)的输出特征,以及ground_truth(demo过程没有,Train过程有)

images = to_image_list(images)
features = self.backbone(images.tensors)
proposals, proposal_losses = self.rpn(images, features, targets)


i. box_cls, box_regression, centerness = self.head(features)


ii.  location = self.compute_locations(features)

返回FPN不同层特征各个位置对应于原图像的位置。因为FCOSHead输出的各个层级的特征尺寸大小与其接收的 来自FPN输出的各个层级的特征尺寸大小一样,并没有改变,改变的只有通道数,即FCOSHead并没有改变输入特征尺寸的大小,而是只改变了输入通道数,所以这个location就是分类得分图(logits)各个位置 反射回 原图像对应的各个位置。

iii.  boxes= self.box_selector_test(locations, box_cls, box_regression,centerness, image_sizes )


class FCOSPostProcessor(torch.nn.Module):
    """
    Performs post-processing on the outputs of the RetinaNet boxes.
    This is only used in the testing.
    """
    def __init__(
        self,
        pre_nms_thresh,
        pre_nms_top_n,
        nms_thresh,
        fpn_post_nms_top_n,
        min_size,
        num_classes,
        bbox_aug_enabled=False
    ):
        """
        Arguments:
            pre_nms_thresh (float): score threshold for candidates (e.g. 0.05)
            pre_nms_top_n (int): max candidates kept per level and image (e.g. 1000)
            nms_thresh (float): IoU threshold passed to NMS (e.g. 0.6)
            fpn_post_nms_top_n (int): max detections per image after NMS (e.g. 100)
            min_size (int): minimum box side length kept (e.g. 0)
            num_classes (int): number of classes including background (e.g. 81)
            bbox_aug_enabled (bool): when True, the cross-level NMS step is skipped
        """
        super(FCOSPostProcessor, self).__init__()
        self.pre_nms_thresh = pre_nms_thresh
        self.pre_nms_top_n = pre_nms_top_n
        self.nms_thresh = nms_thresh
        self.fpn_post_nms_top_n = fpn_post_nms_top_n
        self.min_size = min_size
        self.num_classes = num_classes
        self.bbox_aug_enabled = bbox_aug_enabled

    def forward_for_single_feature_map(
            self, locations, box_cls,
            box_regression, centerness,
            image_sizes):
        """
        Arguments:
            locations: tensor of size H*W, 2 with the image-space (x, y) point
                of every feature-map cell
            box_cls: tensor of size N, C, H, W
            box_regression: tensor of size N, 4, H, W
            centerness: tensor of size N, 1, H, W
            image_sizes: per-image (h, w) pairs

        Returns:
            list[BoxList]: one BoxList per image for this feature level
        """
        N, C, H, W = box_cls.shape  # N == 1 for the demo, batch size otherwise

        # put in the same format as locations
        box_cls = box_cls.view(N, C, H, W).permute(0, 2, 3, 1)  # (N, H, W, C)
        box_cls = box_cls.reshape(N, -1, C).sigmoid()  # (N, H*W, C), values in (0, 1)
        box_regression = box_regression.view(N, 4, H, W).permute(0, 2, 3, 1)  # (N, H, W, 4)
        box_regression = box_regression.reshape(N, -1, 4)  # (N, H*W, 4)
        centerness = centerness.view(N, 1, H, W).permute(0, 2, 3, 1)  # (N, H, W, 1)
        centerness = centerness.reshape(N, -1).sigmoid()  # (N, H*W)

        # Boolean mask of (location, class) pairs scoring above the threshold.
        candidate_inds = box_cls > self.pre_nms_thresh
        # Number of candidates per image, capped at pre_nms_top_n.
        # .contiguous() is needed before .view() on this tensor.
        pre_nms_top_n = candidate_inds.contiguous().view(N, -1).sum(1)
        pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n)

        # multiply the classification scores with centerness scores
        box_cls = box_cls * centerness[:, :, None]

        results = []
        for i in range(N):
            per_box_cls = box_cls[i]
            per_candidate_inds = candidate_inds[i]
            per_box_cls = per_box_cls[per_candidate_inds]  # scores of the candidates

            # Rows are (location index, class index) of every candidate.
            per_candidate_nonzeros = per_candidate_inds.nonzero()
            per_box_loc = per_candidate_nonzeros[:, 0]
            # +1 because label 0 is reserved for background.
            per_class = per_candidate_nonzeros[:, 1] + 1

            per_box_regression = box_regression[i]  # (H*W, 4)
            per_box_regression = per_box_regression[per_box_loc]  # (K, 4)
            per_locations = locations[per_box_loc]  # (K, 2)

            per_pre_nms_top_n = pre_nms_top_n[i]

            # More candidates than allowed: keep only the highest-scoring ones
            # (returned in no particular order because sorted=False).
            if per_candidate_inds.sum().item() > per_pre_nms_top_n.item():
                per_box_cls, top_k_indices = \
                    per_box_cls.topk(per_pre_nms_top_n, sorted=False)
                per_class = per_class[top_k_indices]
                per_box_regression = per_box_regression[top_k_indices]
                per_locations = per_locations[top_k_indices]

            # Decode (l, t, r, b) distances around (x, y) into x1, y1, x2, y2.
            detections = torch.stack([
                per_locations[:, 0] - per_box_regression[:, 0],
                per_locations[:, 1] - per_box_regression[:, 1],
                per_locations[:, 0] + per_box_regression[:, 2],
                per_locations[:, 1] + per_box_regression[:, 3],
            ], dim=1)

            h, w = image_sizes[i]
            boxlist = BoxList(detections, (int(w), int(h)), mode="xyxy")
            boxlist.add_field("labels", per_class)
            # Square root of (class score * centerness) is stored as the score.
            boxlist.add_field("scores", torch.sqrt(per_box_cls))
            boxlist = boxlist.clip_to_image(remove_empty=False)  # clamp to image bounds
            boxlist = remove_small_boxes(boxlist, self.min_size)
            results.append(boxlist)

        return results

    def forward(self, locations, box_cls, box_regression, centerness, image_sizes):
        """
        Arguments:
            locations: list[tensor], one entry per feature level
            box_cls: list[tensor]
            box_regression: list[tensor]
            centerness: list[tensor]
            image_sizes: list[(h, w)]
        Returns:
            boxlists (list[BoxList]): the post-processed boxes, one BoxList
                per image, after box decoding and (unless bbox_aug_enabled) NMS
        """
        sampled_boxes = []  # one list[BoxList] per feature level
        for _, (l, o, b, c) in enumerate(zip(locations, box_cls, box_regression, centerness)):
            sampled_boxes.append(
                self.forward_for_single_feature_map(
                    l, o, b, c, image_sizes
                )
            )

        # Regroup from per-level lists to per-image tuples, then merge levels.
        boxlists = list(zip(*sampled_boxes))
        boxlists = [cat_boxlist(boxlist) for boxlist in boxlists]
        if not self.bbox_aug_enabled:
            boxlists = self.select_over_all_levels(boxlists)

        return boxlists

    # TODO very similar to filter_results from PostProcessor
    # but filter_results is per image
    # TODO Yang: solve this issue in the future. No good solution
    # right now.
    def select_over_all_levels(self, boxlists):
        """Apply multi-class NMS per image and cap the number of detections."""
        num_images = len(boxlists)
        results = []
        for i in range(num_images):
            # multiclass nms
            result = boxlist_ml_nms(boxlists[i], self.nms_thresh)
            number_of_detections = len(result)

            # Limit to max_per_image detections **over all classes**
            if number_of_detections > self.fpn_post_nms_top_n > 0:
                cls_scores = result.get_field("scores")
                # The k-th smallest score with k = n - top_n + 1 is the score
                # of the top_n-th best detection; use it as the threshold.
                image_thresh, _ = torch.kthvalue(
                    cls_scores.cpu(),
                    number_of_detections - self.fpn_post_nms_top_n + 1
                )
                keep = cls_scores >= image_thresh.item()
                keep = torch.nonzero(keep).squeeze(1)
                result = result[keep]
            results.append(result)
        return results

def make_fcos_postprocessor(config):
    """Build the inference-time FCOSPostProcessor from the config node."""
    pre_nms_thresh = config.MODEL.FCOS.INFERENCE_TH  # e.g. 0.05
    pre_nms_top_n = config.MODEL.FCOS.PRE_NMS_TOP_N  # e.g. 1000
    nms_thresh = config.MODEL.FCOS.NMS_TH  # e.g. 0.6
    fpn_post_nms_top_n = config.TEST.DETECTIONS_PER_IMG  # e.g. 100
    bbox_aug_enabled = config.TEST.BBOX_AUG.ENABLED  # e.g. False

    box_selector = FCOSPostProcessor(
        pre_nms_thresh=pre_nms_thresh,
        pre_nms_top_n=pre_nms_top_n,
        nms_thresh=nms_thresh,
        fpn_post_nms_top_n=fpn_post_nms_top_n,
        min_size=0,
        num_classes=config.MODEL.FCOS.NUM_CLASSES,
        bbox_aug_enabled=bbox_aug_enabled
    )

    return box_selector
candidate_inds = box_cls > self.pre_nms_thresh  
pre_nms_top_n = candidate_inds.contiguous().view(N, -1).sum(1) 
pre_nms_top_n = pre_nms_top_n.clamp(max=self.pre_nms_top_n)


self.pre_nms_thresh = pre_nms_thresh  # 0.05

阈值来筛选正负样本,根据论文中所述,大于0.05的为正样本,其余的为负样本。然后看看一共有多少个正样本,sum(1)函数参考[python] 关于sum函数:sum(-1)、sum(1)、sum(0)以及keepdims=True的说明,最后利用clamp函数(参考pytorch:torch.clamp())保留最多1000个正样本。

(ii). box_cls = box_cls * centerness[:, :, None]

这个[:,:,None]的操作就是为原来的数组添加一维,参考切片[:, None, None]的含义。 将分类分支与中心质量分支相乘,对应论文中来惩罚距离目标中心较远的点产生的低质量边界框。


per_box_cls = per_box_cls[per_candidate_inds]
per_candidate_nonzeros = per_candidate_inds.nonzero()   
per_box_loc = per_candidate_nonzeros[:, 0]  
per_class = per_candidate_nonzeros[:, 1] + 1 

这里首先拿出分类输出中正样本的点,per_candidate_inds是bool型,数组[bool]就会拿出对应bool=1 位置处的数值得到一行数组。如下图举例所示。

FCOS代码(二)(demo过程) RPN网络结构_第1张图片

 然后再配合nonzero函数(参考python nonzero()函数的用法)得到这些正样本的点在分类输出中的位置。注意现在分类输出的shape是(H*W,C),第一维表示的是位置信息(二维坐标被拉成一维了),第二维是类别信息。所以接下来就是分开把这两个信息拿出来了,per_candidate_nonzeros[:, 1] + 1 操作是因为背景也算了一类(在predictor.py 的类别列表中,第0类为背景),所以真实的类别标号要在通道索引的基础上加1。



(iv) per_box_cls, top_k_indices = \
                    per_box_cls.topk(per_pre_nms_top_n, sorted=False)

这里之前先判断了一下正样本数是否大于1000个,大于1000个则利用 torch.topk函数(参考pytorch -- topk())选取得分最高的前1000个样本;sorted=False 表示返回的这1000个结果不再按得分大小排序。

(v)detections = torch.stack([
                per_locations[:, 0] - per_box_regression[:, 0],
                per_locations[:, 1] - per_box_regression[:, 1],
                per_locations[:, 0] + per_box_regression[:, 2],
                per_locations[:, 1] + per_box_regression[:, 3],
            ], dim=1)


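FCOS 的回归目标定义为 $l = x - x_0,\ t = y - y_0,\ r = x_1 - x,\ b = y_1 - y$,其中 (l,t,r,b) 为网络输出的回归向量,(x,y) 为 locations 中与分类输出各位置对应的原图坐标;把上式中的 x0,y0,x1,y1 单独移到一边,即得 $x_0 = x - l,\ y_0 = y - t,\ x_1 = x + r,\ y_1 = y + b$,也就是上面代码中的四行。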

(vi)boxlist = BoxList(detections, (int(w), int(h)), mode="xyxy")
            boxlist.add_field("labels", per_class)
            boxlist.add_field("scores", torch.sqrt(per_box_cls))
            boxlist = boxlist.clip_to_image(remove_empty=False)
            boxlist = remove_small_boxes(boxlist, self.min_size)

至此这里引出了 BoxList这个类,代码如下所示。除了利用下面这个类中的属性self.extra_fields = {} (对应于boxlist.add_field函数)来添加字典,以及boxlist.clip_to_image用来平滑调整超出图像的边界框以外,还用了remove_small_boxes函数。

def remove_small_boxes(boxlist, min_size):
    """
    Only keep boxes with both sides >= min_size

    Arguments:
        boxlist (Boxlist)
        min_size (int)

    Returns:
        The input indexed by the kept positions (``boxlist[keep]``).
    """
    # TODO maybe add an API for querying the ws / hs
    # convert() returns a BoxList; .bbox is its underlying (K, 4) tensor,
    # here in (x, y, w, h) layout.
    xywh_boxes = boxlist.convert("xywh").bbox
    _, _, ws, hs = xywh_boxes.unbind(dim=1)  # one (K,) tensor per column
    # Indices of boxes whose width AND height reach min_size.
    keep = (
        (ws >= min_size) & (hs >= min_size)
    ).nonzero().squeeze(1)
    return boxlist[keep]

这里利用的语句包括:self.bbox.split(1, dim=-1) (参考torch.split()),bbox = torch.cat( (xmin, ymin, xmax - xmin + TO_REMOVE, ymax - ymin + TO_REMOVE), dim=-1 )  (参考PyTorch的torch.cat), _, _, ws, hs = xywh_boxes.unbind(dim=1)  (参考PyTorch 函数解释:torch.narrow()、torch.unbind())。

class BoxList(object):
    """
    This class represents a set of bounding boxes.
    The bounding boxes are represented as a Nx4 Tensor.
    In order to uniquely determine the bounding boxes with respect
    to an image, we also store the corresponding image dimensions.
    They can contain extra information that is specific to each bounding box, such as
    labels.
    """

    def __init__(self, bbox, image_size, mode="xyxy"):
        """
        Arguments:
            bbox: Nx4 tensor (or anything ``torch.as_tensor`` accepts)
            image_size: (image_width, image_height)
            mode (str): "xyxy" or "xywh"

        Raises:
            ValueError: if bbox is not 2-d with a last dimension of 4, or
                if mode is not one of the two supported values.
        """
        device = bbox.device if isinstance(bbox, torch.Tensor) else torch.device("cpu")
        bbox = torch.as_tensor(bbox, dtype=torch.float32, device=device)
        if bbox.ndimension() != 2:
            raise ValueError(
                "bbox should have 2 dimensions, got {}".format(bbox.ndimension())
            )
        if bbox.size(-1) != 4:
            raise ValueError(
                "last dimension of bbox should have a "
                "size of 4, got {}".format(bbox.size(-1))
            )
        if mode not in ("xyxy", "xywh"):
            raise ValueError("mode should be 'xyxy' or 'xywh'")

        self.bbox = bbox
        self.size = image_size  # (image_width, image_height)
        self.mode = mode
        self.extra_fields = {}

    def add_field(self, field, field_data):
        """Attach per-box data (e.g. labels or scores) under the given name."""
        self.extra_fields[field] = field_data

    def get_field(self, field):
        """Return the data stored under ``field``; KeyError if absent."""
        return self.extra_fields[field]

    def has_field(self, field):
        """Return True if ``field`` has been added."""
        return field in self.extra_fields

    def fields(self):
        """Return the list of extra-field names."""
        return list(self.extra_fields.keys())

    def _copy_extra_fields(self, bbox):
        """Copy (by reference) all extra fields of ``bbox`` into this object."""
        for k, v in bbox.extra_fields.items():
            self.extra_fields[k] = v

    def convert(self, mode):
        """Return the boxes in ``mode``; returns self when already in it."""
        if mode not in ("xyxy", "xywh"):
            raise ValueError("mode should be 'xyxy' or 'xywh'")
        if mode == self.mode:
            return self
        # we only have two modes, so don't need to check
        # self.mode
        xmin, ymin, xmax, ymax = self._split_into_xyxy()  # four (N, 1) tensors
        if mode == "xyxy":
            bbox = torch.cat((xmin, ymin, xmax, ymax), dim=-1)
            bbox = BoxList(bbox, self.size, mode=mode)
        else:
            # Widths/heights are inclusive of both end pixels, hence the +1.
            TO_REMOVE = 1
            bbox = torch.cat(
                (xmin, ymin, xmax - xmin + TO_REMOVE, ymax - ymin + TO_REMOVE), dim=-1
            )
            bbox = BoxList(bbox, self.size, mode=mode)
        # Keep labels/scores etc. attached to the converted boxes.
        bbox._copy_extra_fields(self)
        return bbox

    def _split_into_xyxy(self):
        """Return (xmin, ymin, xmax, ymax) as four (N, 1) tensors."""
        if self.mode == "xyxy":
            xmin, ymin, xmax, ymax = self.bbox.split(1, dim=-1)
            return xmin, ymin, xmax, ymax
        elif self.mode == "xywh":
            TO_REMOVE = 1
            xmin, ymin, w, h = self.bbox.split(1, dim=-1)
            return (
                xmin,
                ymin,
                xmin + (w - TO_REMOVE).clamp(min=0),
                ymin + (h - TO_REMOVE).clamp(min=0),
            )
        else:
            raise RuntimeError("Should not be here")

    def resize(self, size, *args, **kwargs):
        """
        Returns a resized copy of this bounding box

        :param size: The requested size in pixels, as a 2-tuple:
            (width, height).
        """

        ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(size, self.size))
        if ratios[0] == ratios[1]:
            # Uniform scaling: every coordinate scales by the same factor,
            # which is valid in both box modes.
            ratio = ratios[0]
            scaled_box = self.bbox * ratio
            bbox = BoxList(scaled_box, size, mode=self.mode)
            # bbox._copy_extra_fields(self)
            for k, v in self.extra_fields.items():
                # Non-tensor fields are resized through their own resize().
                if not isinstance(v, torch.Tensor):
                    v = v.resize(size, *args, **kwargs)
                bbox.add_field(k, v)
            return bbox

        ratio_width, ratio_height = ratios
        xmin, ymin, xmax, ymax = self._split_into_xyxy()
        scaled_xmin = xmin * ratio_width
        scaled_xmax = xmax * ratio_width
        scaled_ymin = ymin * ratio_height
        scaled_ymax = ymax * ratio_height
        scaled_box = torch.cat(
            (scaled_xmin, scaled_ymin, scaled_xmax, scaled_ymax), dim=-1
        )
        bbox = BoxList(scaled_box, size, mode="xyxy")
        # bbox._copy_extra_fields(self)
        for k, v in self.extra_fields.items():
            if not isinstance(v, torch.Tensor):
                v = v.resize(size, *args, **kwargs)
            bbox.add_field(k, v)

        return bbox.convert(self.mode)

    def transpose(self, method):
        """
        Transpose bounding box (flip or rotate in 90 degree steps)
        :param method: One of :py:attr:`PIL.Image.FLIP_LEFT_RIGHT`,
          :py:attr:`PIL.Image.FLIP_TOP_BOTTOM`, :py:attr:`PIL.Image.ROTATE_90`,
          :py:attr:`PIL.Image.ROTATE_180`, :py:attr:`PIL.Image.ROTATE_270`,
          :py:attr:`PIL.Image.TRANSPOSE` or :py:attr:`PIL.Image.TRANSVERSE`.
          Only the two flips are implemented.
        """
        if method not in (FLIP_LEFT_RIGHT, FLIP_TOP_BOTTOM):
            raise NotImplementedError(
                "Only FLIP_LEFT_RIGHT and FLIP_TOP_BOTTOM implemented"
            )

        image_width, image_height = self.size
        xmin, ymin, xmax, ymax = self._split_into_xyxy()
        if method == FLIP_LEFT_RIGHT:
            TO_REMOVE = 1
            transposed_xmin = image_width - xmax - TO_REMOVE
            transposed_xmax = image_width - xmin - TO_REMOVE
            transposed_ymin = ymin
            transposed_ymax = ymax
        elif method == FLIP_TOP_BOTTOM:
            transposed_xmin = xmin
            transposed_xmax = xmax
            transposed_ymin = image_height - ymax
            transposed_ymax = image_height - ymin

        transposed_boxes = torch.cat(
            (transposed_xmin, transposed_ymin, transposed_xmax, transposed_ymax), dim=-1
        )
        bbox = BoxList(transposed_boxes, self.size, mode="xyxy")
        # bbox._copy_extra_fields(self)
        for k, v in self.extra_fields.items():
            if not isinstance(v, torch.Tensor):
                v = v.transpose(method)
            bbox.add_field(k, v)
        return bbox.convert(self.mode)

    def crop(self, box):
        """
        Crops a rectangular region from this bounding box. The box is a
        4-tuple defining the left, upper, right, and lower pixel
        coordinate.
        """
        xmin, ymin, xmax, ymax = self._split_into_xyxy()
        w, h = box[2] - box[0], box[3] - box[1]
        cropped_xmin = (xmin - box[0]).clamp(min=0, max=w)
        cropped_ymin = (ymin - box[1]).clamp(min=0, max=h)
        cropped_xmax = (xmax - box[0]).clamp(min=0, max=w)
        cropped_ymax = (ymax - box[1]).clamp(min=0, max=h)

        # TODO should I filter empty boxes here?

        cropped_box = torch.cat(
            (cropped_xmin, cropped_ymin, cropped_xmax, cropped_ymax), dim=-1
        )
        bbox = BoxList(cropped_box, (w, h), mode="xyxy")
        # bbox._copy_extra_fields(self)
        for k, v in self.extra_fields.items():
            if not isinstance(v, torch.Tensor):
                v = v.crop(box)
            bbox.add_field(k, v)
        return bbox.convert(self.mode)

    # Tensor-like methods

    def to(self, device):
        """Return a copy with the boxes and movable fields on ``device``."""
        bbox = BoxList(self.bbox.to(device), self.size, self.mode)
        for k, v in self.extra_fields.items():
            if hasattr(v, "to"):
                v = v.to(device)
            bbox.add_field(k, v)
        return bbox

    def __getitem__(self, item):
        """Index boxes and every extra field with the same ``item``."""
        bbox = BoxList(self.bbox[item], self.size, self.mode)
        for k, v in self.extra_fields.items():
            bbox.add_field(k, v[item])
        return bbox

    def __len__(self):
        return self.bbox.shape[0]

    def clip_to_image(self, remove_empty=True):
        """Clamp the box coordinates in place to the image bounds.

        Columns 0/2 are clamped to [0, width - 1] and columns 1/3 to
        [0, height - 1]. When ``remove_empty`` is True, boxes whose
        column 2 <= column 0 or column 3 <= column 1 are dropped.
        """
        TO_REMOVE = 1
        self.bbox[:, 0].clamp_(min=0, max=self.size[0] - TO_REMOVE)
        self.bbox[:, 1].clamp_(min=0, max=self.size[1] - TO_REMOVE)
        self.bbox[:, 2].clamp_(min=0, max=self.size[0] - TO_REMOVE)
        self.bbox[:, 3].clamp_(min=0, max=self.size[1] - TO_REMOVE)
        if remove_empty:
            box = self.bbox
            keep = (box[:, 3] > box[:, 1]) & (box[:, 2] > box[:, 0])
            return self[keep]
        return self

    def area(self):
        """Return the (N,) tensor of box areas (pixel-inclusive in xyxy mode)."""
        box = self.bbox
        if self.mode == "xyxy":
            TO_REMOVE = 1
            area = (box[:, 2] - box[:, 0] + TO_REMOVE) * (box[:, 3] - box[:, 1] + TO_REMOVE)
        elif self.mode == "xywh":
            area = box[:, 2] * box[:, 3]
        else:
            raise RuntimeError("Should not be here")

        return area

    def copy_with_fields(self, fields, skip_missing=False):
        """Return a BoxList sharing the boxes but carrying only ``fields``.

        Raises:
            KeyError: if a requested field is missing and ``skip_missing``
                is False.
        """
        bbox = BoxList(self.bbox, self.size, self.mode)
        if not isinstance(fields, (list, tuple)):
            fields = [fields]
        for field in fields:
            if self.has_field(field):
                bbox.add_field(field, self.get_field(field))
            elif not skip_missing:
                raise KeyError("Field '{}' not found in {}".format(field, self))
        return bbox

    def __repr__(self):
        s = self.__class__.__name__ + "("
        s += "num_boxes={}, ".format(len(self))
        s += "image_width={}, ".format(self.size[0])
        s += "image_height={}, ".format(self.size[1])
        s += "mode={})".format(self.mode)
        return s


boxlists = [cat_boxlist(boxlist) for boxlist in boxlists] 引出下面函数。

def cat_boxlist(bboxes):
    """
    Concatenates a list of BoxList (having the same image size) into a
    single BoxList

    Arguments:
        bboxes (list[BoxList])
    """
    assert isinstance(bboxes, (list, tuple))
    assert all(isinstance(bbox, BoxList) for bbox in bboxes)

    # All inputs must share the image size, e.g. (1120, 800).
    size = bboxes[0].size
    assert all(bbox.size == size for bbox in bboxes)

    # ... the same box mode, e.g. "xyxy".
    mode = bboxes[0].mode
    assert all(bbox.mode == mode for bbox in bboxes)

    # ... and the same set of extra fields, e.g. {"labels", "scores"}.
    fields = set(bboxes[0].fields())
    assert all(set(bbox.fields()) == fields for bbox in bboxes)

    # Stack the boxes of all inputs (e.g. the 5 FPN levels) along dim 0.
    cat_boxes = BoxList(_cat([bbox.bbox for bbox in bboxes], dim=0), size, mode)

    # Stack each extra field the same way so rows stay aligned with the boxes.
    for field in fields:
        data = _cat([bbox.get_field(field) for bbox in bboxes], dim=0)
        cat_boxes.add_field(field, data)

    return cat_boxes

这里的fields = set(bboxes[0].fields())利用了set函数,它可以删除重复数据(参考python的set()函数),.fields()函数来自BoxList类内,定义如下:

def fields(self):
    """Return the names of all extra fields as a list."""
    names = list(self.extra_fields)
    return names

返回该类内属性extra_fields中的键,即 "labels","scores"。然后

cat_boxes = BoxList(_cat([bbox.bbox for bbox in bboxes], dim=0), size, mode)


def _cat(tensors, dim=0):
    Efficient version of torch.cat that avoids a copy if there is only a single element in a list
    assert isinstance(tensors, (list, tuple))
    if len(tensors) == 1:
        return tensors[0]
    return torch.cat(tensors, dim)

torch.cat函数(参考PyTorch的torch.cat)将来自5层的预测输出结果bbox连接在一起,这一步的主要作用是把预测结果的回归向量放到了一起。而且这里还重新定义了一个新的BoxList实例类cat_boxes,然后通过下面的代码 在cat_boxes实例属性 extra_fields 字典中写入对应的并放到一起的 {'labels': Tensor:(737, ) ,  'scores':Tensor:(737,  ) }  (<----举例)。

for field in fields:  # 'labels','scores'
    data = _cat([bbox.get_field(field) for bbox in bboxes], dim=0)
    cat_boxes.add_field(field, data)


if not self.bbox_aug_enabled:  # True demo时
    boxlists = self.select_over_all_levels(boxlists)

跳到 select_over_all_levels(self, boxlists):函数里,result = boxlist_ml_nms(boxlists[i], self.nms_thresh)引出boxlist_ml_nms函数,如下所示,这个_box_ml_nms在源代码里并没有直接的NMS非极大值抑制函数,而是用的接口(底层的C语言)编译的开源NMS。然后

image_thresh, _ = torch.kthvalue(
                    number_of_detections - self.fpn_post_nms_top_n + 1

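中利用torch.kthvalue函数(参考Pytorch学习之torch----比较操作(Comparison Ops))求出第 (number_of_detections - fpn_post_nms_top_n + 1) 小的得分作为阈值,只保留得分不低于该阈值的检测框,也就是当NMS后的检测框超过100个时只保留得分最高的100个。至此后处理过程FCOSPostProcessor类运行完毕,返回最终的results——一个列表,其中每张图像对应一个BoxList类实例。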

def boxlist_ml_nms(boxlist, nms_thresh, max_proposals=-1,
                   score_field="scores", label_field="labels"):
    """
    Performs non-maximum suppression on a boxlist, with scores specified
    in a boxlist field via score_field.

    Arguments:
        boxlist(BoxList)
        nms_thresh (float): if <= 0, the input is returned unchanged
        max_proposals (int): if > 0, then only the top max_proposals are kept
            after non-maximum suppression
        score_field (str)
        label_field (str)
    """
    if nms_thresh <= 0:
        return boxlist
    mode = boxlist.mode  # remembered so the result can be converted back
    boxlist = boxlist.convert("xyxy")  # no-op when already in "xyxy" mode
    boxes = boxlist.bbox  # e.g. Tensor:(737, 4)
    scores = boxlist.get_field(score_field)  # e.g. Tensor:(737,)
    labels = boxlist.get_field(label_field)  # e.g. Tensor:(737,)
    # _box_ml_nms is defined outside this function; it receives the labels as
    # floats and its result is used below as an index into the box list.
    keep = _box_ml_nms(boxes, scores, labels.float(), nms_thresh)
    if max_proposals > 0:
        keep = keep[: max_proposals]
    boxlist = boxlist[keep]
    return boxlist.convert(mode)


def _forward_test(self, locations, box_cls, box_regression, centerness, image_sizes):
    boxes = self.box_selector_test(
        locations, box_cls, box_regression, 
        centerness, image_sizes
    return boxes, {}

这里返回的是处理后的 BoxList类实例(每张图像一个)以及一个空字典,这也是FCOSModule类的返回值(forward函数里的)

else:  # demo
    return self._forward_test(
        locations, box_cls, box_regression, 
        centerness, images.image_sizes

后面的{}是因为 在 generalized_rcnn.py中调用FCOSModule(即 self.rpn)的语句如下所示:

proposals, proposal_losses = self.rpn(images, features, targets)

这是demo过程,所以不需要返回损失函数。至此,代码rpn(并没有用到anchor,这是anchor free 的)这部分结束了。
