maskrcnn_benchmark 代码详解之 poolers.py

前言:

  在目标检测的深度网络中,最后一个步骤就是RoI层。其中RoI Pooling会将RPN提取的各种形状的边框进行池化,从而形成统一尺度的特征层,这一过程中将涉及到ROIAlign操作。Pooler中的scales是一个数组,代表原始图片变换到FPN的各个特征层需要的变换比例,比如到Stage2是1/4,到Stage3是1/8,以此类推。其代码详解为:

# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import torch
import torch.nn.functional as F
from torch import nn

from maskrcnn_benchmark.layers import ROIAlign

from .utils import cat


class LevelMapper(object):
    """Pick the FPN level each RoI should be pooled from.

    Implements the assignment heuristic of the FPN paper (Eqn. 1): the
    larger the RoI, the higher the level it is mapped to; small RoIs are
    mapped to lower levels.
    """

    def __init__(self, k_min, k_max, canonical_scale=224, canonical_level=4, eps=1e-6):
        """
        Arguments:
            k_min (int): lowest level an RoI may be assigned to
            k_max (int): highest level an RoI may be assigned to
            canonical_scale (int): reference RoI side length (``s0``); the
                default 224 follows the ImageNet pre-training image size
            canonical_level (int): level assigned to an RoI whose side
                length equals ``canonical_scale`` (``lvl0``)
            eps (float): small constant added inside the logarithm
        """
        # Bounds used to clamp the computed level.
        self.k_min, self.k_max = k_min, k_max
        # Reference side length and the level it corresponds to.
        self.s0 = canonical_scale
        self.lvl0 = canonical_level
        # Keeps the log2 argument strictly positive for zero-area boxes.
        self.eps = eps

    def __call__(self, boxlists):
        """
        Arguments:
            boxlists (list[BoxList])

        Returns:
            Tensor with one entry per box (all boxlists concatenated in
            order) holding the assigned level as an offset from ``k_min``,
            i.e. 0 means level ``k_min``.
        """
        # Side length of each box, taken as the square root of its area.
        areas = cat([boxlist.area() for boxlist in boxlists])
        side = torch.sqrt(areas)

        # Eqn.(1) in FPN paper
        raw_lvls = torch.floor(self.lvl0 + torch.log2(side / self.s0 + self.eps))
        # Restrict the result to the levels that actually exist.
        clamped_lvls = torch.clamp(raw_lvls, min=self.k_min, max=self.k_max)
        return clamped_lvls.to(torch.int64) - self.k_min


class Pooler(nn.Module):
    """
    Pooler for Detection with or without FPN.
    It currently hard-code ROIAlign in the implementation,
    but that can be made more generic later on.
    Also, the requirement of passing the scales is not strictly necessary, as they
    can be inferred from the size of the feature map / size of original image,
    which is available thanks to the BoxList.
    """

    def __init__(self, output_size, scales, sampling_ratio):
        """
        Arguments:
            output_size (tuple[int] or list[int]): (height, width) of the pooled
                region
            scales (list[float]): one spatial scale per feature level; a
                ROIAlign layer is created for each entry
            sampling_ratio (int): sampling ratio forwarded to ROIAlign
        """
        super(Pooler, self).__init__()
        # One ROIAlign layer per scale, in the same order as `scales`.
        self.poolers = nn.ModuleList(
            [
                ROIAlign(
                    output_size, spatial_scale=scale, sampling_ratio=sampling_ratio
                )
                for scale in scales
            ]
        )
        self.output_size = output_size
        # get the levels in the feature map by leveraging the fact that the network always
        # downsamples by a factor of 2 at each level.
        # Lowest level, derived from the first scale (e.g. 1/4 -> 2).
        lvl_min = -torch.log2(torch.tensor(scales[0], dtype=torch.float32)).item()
        # Highest level, derived from the last scale.
        lvl_max = -torch.log2(torch.tensor(scales[-1], dtype=torch.float32)).item()
        self.map_levels = LevelMapper(lvl_min, lvl_max)

    def convert_to_roi_format(self, boxes):
        """Flatten a list of BoxLists into a single RoI tensor.

        Arguments:
            boxes (list[BoxList])

        Returns:
            Tensor whose first column is the position of the originating
            BoxList in `boxes` and whose remaining columns are that box's
            `bbox` values.
        """
        concat_boxes = cat([b.bbox for b in boxes], dim=0)
        device, dtype = concat_boxes.device, concat_boxes.dtype
        # The index column shares dtype/device with the boxes so the two can
        # be concatenated.
        ids = cat(
            [
                torch.full((len(b), 1), i, dtype=dtype, device=device)
                for i, b in enumerate(boxes)
            ],
            dim=0,
        )
        rois = torch.cat([ids, concat_boxes], dim=1)
        return rois

    def forward(self, x, boxes):
        """
        Arguments:
            x (list[Tensor]): feature maps for each level
            boxes (list[BoxList]): boxes to be used to perform the pooling operation.
        Returns:
            result (Tensor)
        """
        # Number of feature levels this pooler was built for.
        num_levels = len(self.poolers)
        rois = self.convert_to_roi_format(boxes)
        if num_levels == 1:
            # Single-level case: no level assignment needed.
            return self.poolers[0](x[0], rois)

        # Level (as an offset from the lowest level) each RoI is pooled from.
        levels = self.map_levels(boxes)

        num_rois = len(rois)
        num_channels = x[0].shape[1]
        # Use both height and width so that non-square output sizes get a
        # correctly shaped buffer (previously the height was used twice).
        out_h, out_w = self.output_size[0], self.output_size[1]

        dtype, device = x[0].dtype, x[0].device
        # Pre-allocate the output; each level fills in the rows of its RoIs.
        result = torch.zeros(
            (num_rois, num_channels, out_h, out_w),
            dtype=dtype,
            device=device,
        )
        for level, (per_level_feature, pooler) in enumerate(zip(x, self.poolers)):
            # Row indices of the RoIs assigned to this level.
            idx_in_level = torch.nonzero(levels == level).squeeze(1)
            rois_per_level = rois[idx_in_level]
            # Pool those RoIs from this level's feature map and scatter the
            # results back to their original positions.
            result[idx_in_level] = pooler(per_level_feature, rois_per_level).to(dtype)

        return result


def make_pooler(cfg, head_name):
    """Build a Pooler from the configuration of the given head.

    Arguments:
        cfg: configuration object; ``cfg.MODEL[head_name]`` must provide
            POOLER_RESOLUTION, POOLER_SCALES and POOLER_SAMPLING_RATIO
        head_name: key of the head section inside ``cfg.MODEL``

    Returns:
        Pooler with a square output of side POOLER_RESOLUTION.
    """
    head_cfg = cfg.MODEL[head_name]
    # Side length of the (square) pooled output.
    side = head_cfg.POOLER_RESOLUTION
    return Pooler(
        output_size=(side, side),
        # Spatial scale of each feature level handed to the pooler.
        scales=head_cfg.POOLER_SCALES,
        # Sampling ratio forwarded to ROIAlign.
        sampling_ratio=head_cfg.POOLER_SAMPLING_RATIO,
    )

 

你可能感兴趣的:(maskrcnn,benchmark)