Faster R-CNN Network Code for Object Detection, Part 3: The Model

Table of Contents

  • model/region_proposal_network.py
    • def _enumerate_shifted_anchor_torch
    • class RegionProposalNetwork(nn.Module)
  • model/faster_rcnn_vgg16.py
    • class VGG16RoIHead
  • model/faster_rcnn.py

model/region_proposal_network.py

def _enumerate_shifted_anchor_torch:


def _enumerate_shifted_anchor_torch(anchor_base, feat_stride, height, width):
    #  Enumerate all shifted anchors:
    #
    # add A anchors (1, A, 4) to
    # cell K shifts (K, 1, 4) to get
    # shift anchors (K, A, 4)
    # reshape to (K*A, 4) shifted anchors
    # return (K*A, 4)

    # !TODO: add support for torch.CudaTensor
    # xp = cuda.get_array_module(anchor_base)
    # NOTE: despite the name, this version still computes with NumPy
    # (the upstream code leaves `xp` undefined; fixed to np here).
    import numpy as np
    shift_y = np.arange(0, height * feat_stride, feat_stride)  # 1-D vector of y offsets
    shift_x = np.arange(0, width * feat_stride, feat_stride)   # 1-D vector of x offsets
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)  # 2-D grids, one entry per cell
    shift = np.stack((shift_y.ravel(), shift_x.ravel(),
                      shift_y.ravel(), shift_x.ravel()), axis=1)
    #  ravel() flattens each grid so entries line up with cell coordinates;
    #  np.stack(axis=1) packs them into rows of (y, x, y, x) shifts.

    A = anchor_base.shape[0]  # 9 base anchors per position
    K = shift.shape[0]        # number of anchor positions (height * width)
    anchor = anchor_base.reshape((1, A, 4)) + \
             shift.reshape((1, K, 4)).transpose((1, 0, 2))
    anchor = anchor.reshape((K * A, 4)).astype(np.float32)
    return anchor
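
To see the broadcasting at work, here is a minimal, self-contained sketch with toy values (the all-zero anchor_base is a hypothetical stand-in for generate_anchor_base()): 9 base anchors tiled over a 2x3 feature map with stride 16 give K*A = 6*9 = 54 anchors.

import numpy as np

anchor_base = np.zeros((9, 4), dtype=np.float32)  # stand-in for the 9 base anchors
feat_stride, height, width = 16, 2, 3

shift_y = np.arange(0, height * feat_stride, feat_stride)      # [0, 16]
shift_x = np.arange(0, width * feat_stride, feat_stride)       # [0, 16, 32]
shift_x, shift_y = np.meshgrid(shift_x, shift_y)               # each (2, 3)
shift = np.stack((shift_y.ravel(), shift_x.ravel(),
                  shift_y.ravel(), shift_x.ravel()), axis=1)   # (6, 4)

A, K = anchor_base.shape[0], shift.shape[0]
anchor = anchor_base.reshape((1, A, 4)) + \
         shift.reshape((1, K, 4)).transpose((1, 0, 2))         # broadcast to (6, 9, 4)
print(anchor.reshape((K * A, 4)).shape)                        # (54, 4)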

class RegionProposalNetwork(nn.Module):


class RegionProposalNetwork(nn.Module):
    """Region Proposal Network introduced in Faster R-CNN.RPN引入Faster R-CNN

    This is Region Proposal Network introduced in Faster R-CNN [#]_.
    This takes features extracted from images and proposes
    class-agnostic bounding boxes around "objects".

    .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
    Faster R-CNN: Towards Real-Time Object Detection with \
    Region Proposal Networks. NIPS 2015.

    Args:
        in_channels (int): The channel size of input.
        mid_channels (int): The channel size of the intermediate tensor.
        ratios (list of floats): Ratios of width to height of
            the anchors.
        anchor_scales (list of numbers): Areas of the anchors.
            Those areas will be the product of the square of an element in
            :obj:`anchor_scales` and the original area of the reference
            window.
        feat_stride (int): Stride size after extracting features from an
            image.
        initialW (callable): Initial weight value. If :obj:`None` then this
            function uses Gaussian distribution scaled by 0.1 to
            initialize weight.
            May also be a callable that takes an array and edits its values.
        proposal_creator_params (dict): Keyword parameters for
            :class:`model.utils.creator_tools.ProposalCreator`.

    .. seealso::
        :class:`~model.utils.creator_tools.ProposalCreator`

    """

    def __init__(
            self, in_channels=512, mid_channels=512, ratios=[0.5, 1, 2],
            anchor_scales=[8, 16, 32], feat_stride=16,
            proposal_creator_params=dict(),
    ):
        super(RegionProposalNetwork, self).__init__()
        self.anchor_base = generate_anchor_base(
            anchor_scales=anchor_scales, ratios=ratios)  # generate the 9 base anchors
        self.feat_stride = feat_stride   # downsampling factor of the extractor (VGG16's four poolings give 16)
        self.proposal_layer = ProposalCreator(self, **proposal_creator_params)
        # selects proposals from the generated anchors
        n_anchor = self.anchor_base.shape[0]
        self.conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)  # 3x3 convolution
        self.score = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0)
        # 1x1 convolution, first branch, for classification (no padding)
        self.loc = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0)
        # 1x1 convolution, second branch, for regression (no padding)
        normal_init(self.conv1, 0, 0.01)   # Gaussian weight initialization
        normal_init(self.score, 0, 0.01)
        normal_init(self.loc, 0, 0.01)

    def forward(self, x, img_size, scale=1.):
        """Forward Region Proposal Network.RPN网络前向传播

        Here are notations.

        * :math:`N` is batch size.
        * :math:`C` channel size of the input.
        * :math:`H` and :math:`W` are height and width of the input feature.
        * :math:`A` is number of anchors assigned to each pixel.

        Args:
            x (~torch.autograd.Variable): Features extracted from images.
                Its shape is :math:`(N, C, H, W)`.
            img_size (tuple of ints): A tuple :obj:`height, width`,
                which contains image size after scaling.
            scale (float): The amount of scaling done to the input images after
                reading them from files.

        Returns:
            (~torch.autograd.Variable, ~torch.autograd.Variable, array, array, array):

            This is a tuple of the following five values.

            * **rpn_locs**: Predicted bounding box offsets and scales for \
                anchors. Its shape is :math:`(N, H W A, 4)`.
            * **rpn_scores**:  Predicted foreground scores for \
                anchors. Its shape is :math:`(N, H W A, 2)`.
            * **rois**: A bounding box array containing coordinates of \
                proposal boxes.  This is a concatenation of bounding box \
                arrays from multiple images in the batch. \
                Its shape is :math:`(R', 4)`. Given :math:`R_i` predicted \
                bounding boxes from the :math:`i` th image, \
                :math:`R' = \\sum _{i=1} ^ N R_i`.
            * **roi_indices**: An array containing indices of the images to \
                which the RoIs correspond. Its shape is :math:`(R',)`.
            * **anchor**: Coordinates of enumerated shifted anchors. \
                Its shape is :math:`(H W A, 4)`.

        """
        n, _, hh, ww = x.shape  # (batch, channels, height, width)
        anchor = _enumerate_shifted_anchor(
            np.array(self.anchor_base),
            self.feat_stride, hh, ww)
        # self.feat_stride = 16; self.anchor_base holds the 9 anchors for one position.
        # _enumerate_shifted_anchor tiles them over the whole feature map,
        # giving every anchor in original-image coordinates.

        n_anchor = anchor.shape[0] // (hh * ww)  # // is integer division: (hh*ww*9) / (hh*ww) = 9
        h = F.relu(self.conv1(x))   # 512 3x3 convolutions -> (512, H/16, W/16); x is the feature map

        rpn_locs = self.loc(h)  # second branch: bounding-box regression
        # n_anchor(9)*4 1x1 convolutions regress the coordinate offsets -> (9*4, hh, ww)
        # UNNOTE: check whether need contiguous
        # A: Yes
        rpn_locs = rpn_locs.permute(0, 2, 3, 1).contiguous().view(n, -1, 4)
        # reshape to (n, hh*ww*9, 4)
        # permute reorders the tensor's dimensions; contiguous() lays the tensor
        # out contiguously in memory; view() works like reshape, with -1 inferring
        # the remaining dimension from the others.

        rpn_scores = self.score(h)  # first branch: classification
        # n_anchor(9)*2 1x1 convolutions predict the labels -> (9*2, hh, ww)
        rpn_scores = rpn_scores.permute(0, 2, 3, 1).contiguous()
        rpn_softmax_scores = F.softmax(rpn_scores.view(n, hh, ww, n_anchor, 2), dim=4)
        rpn_fg_scores = rpn_softmax_scores[:, :, :, :, 1].contiguous()  # foreground probability per anchor
        rpn_fg_scores = rpn_fg_scores.view(n, -1)  # foreground probabilities of all anchors
        rpn_scores = rpn_scores.view(n, -1, 2)  # raw network outputs for every anchor on the feature map

        rois = list()  # collect proposals for every image in the batch
        roi_indices = list()
        for i in range(n):  # n is the batch size
            roi = self.proposal_layer(
                rpn_locs[i].cpu().data.numpy(),
                rpn_fg_scores[i].cpu().data.numpy(),
                anchor, img_size,
                scale=scale)
            # select proposals from the anchors; rpn_locs[i] has shape (hh*ww*9, 4),
            # rpn_fg_scores[i] has shape (hh*ww*9,), anchor has shape (hh*ww*9, 4),
            # and img_size is (H, W). Keep the top 12000 by score, then NMS leaves
            # about 2000 approximate target boxes G^.
            batch_index = i * np.ones((len(roi),), dtype=np.int32)
            rois.append(roi)  # rois为所有batch_size的roi
            roi_indices.append(batch_index)

        rois = np.concatenate(rois, axis=0)
        #  concatenated along rows (the batch distinction is dropped; each row
        #  holds one proposal's four coordinates)
        roi_indices = np.concatenate(roi_indices, axis=0)
        return rpn_locs, rpn_scores, rois, roi_indices, anchor
        # rpn_locs: (n, hh*ww*9, 4); rpn_scores: (n, hh*ww*9, 2); rois: (~2000, 4);
        # roi_indices is not used later (batch size is 1); anchor: (hh*ww*9, 4)
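
The permute/contiguous/view dance above is easy to get wrong, so here is a minimal, self-contained sketch of the shape flow with toy sizes (batch 1, 9 anchors, a 2x3 feature map); the random tensors are hypothetical stand-ins for the conv outputs:

import torch
import torch.nn.functional as F

n, n_anchor, hh, ww = 1, 9, 2, 3
rpn_locs = torch.randn(n, n_anchor * 4, hh, ww)    # raw loc output (N, 9*4, H, W)
rpn_scores = torch.randn(n, n_anchor * 2, hh, ww)  # raw score output (N, 9*2, H, W)

rpn_locs = rpn_locs.permute(0, 2, 3, 1).contiguous().view(n, -1, 4)
print(rpn_locs.shape)  # torch.Size([1, 54, 4]), i.e. (N, H*W*A, 4)

rpn_scores = rpn_scores.permute(0, 2, 3, 1).contiguous()
rpn_softmax_scores = F.softmax(rpn_scores.view(n, hh, ww, n_anchor, 2), dim=4)
rpn_fg_scores = rpn_softmax_scores[..., 1].contiguous().view(n, -1)
print(rpn_fg_scores.shape)  # torch.Size([1, 54]), one foreground score per anchor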

model/faster_rcnn_vgg16.py

class VGG16RoIHead:


class VGG16RoIHead(nn.Module):
    """Faster R-CNN Head for VGG-16 based implementation.
    This class is used as a head for Faster R-CNN.
    This outputs class-wise localizations and classification based on feature
    maps in the given RoIs.
    
    Args:
        n_class (int): The number of classes possibly including the background.
        roi_size (int): Height and width of the feature maps after RoI-pooling.
        spatial_scale (float): Scale by which RoI coordinates are resized to
            match the feature map.
        classifier (nn.Module): Two fully connected layers ported from VGG-16.

    """

    def __init__(self, n_class, roi_size, spatial_scale,
                 classifier):
        # n_class includes the background
        super(VGG16RoIHead, self).__init__()

        self.classifier = classifier  # the last two fully connected layers of VGG16
        self.cls_loc = nn.Linear(4096, n_class * 4)  # fully connected layer for per-class box coordinates
        self.score = nn.Linear(4096, n_class)  # fully connected layer for class scores

        normal_init(self.cls_loc, 0, 0.001)
        normal_init(self.score, 0, 0.01)  # Gaussian initialization of the FC weights

        self.n_class = n_class  # 21 classes including background
        self.roi_size = roi_size  # 7
        self.spatial_scale = spatial_scale   # 1/16
        self.roi = RoIPooling2D(self.roi_size, self.roi_size, self.spatial_scale)
        # pools RoIs of different sizes to one fixed size; the pooled features
        # have shape [300, 512, 7, 7]. Implemented with CuPy JIT compilation.

    def forward(self, x, rois, roi_indices):
        """Forward the chain.

        We assume that there are :math:`N` batches.

        Args:
            x (Variable): 4D image variable.
            rois (Tensor): A bounding box array containing coordinates of
                proposal boxes.  This is a concatenation of bounding box
                arrays from multiple images in the batch.
                Its shape is :math:`(R', 4)`. Given :math:`R_i` proposed
                RoIs from the :math:`i` th image,
                :math:`R' = \\sum _{i=1} ^ N R_i`.
            roi_indices (Tensor): An array containing indices of images to
                which the bounding boxes correspond. Its shape is :math:`(R',)`.

        """
        # in case roi_indices is an ndarray
        roi_indices = at.totensor(roi_indices).float()
        rois = at.totensor(rois).float()
        indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1)
        # NOTE: important: yx -> xy. The column reorder below turns
        # (index, ymin, xmin, ymax, xmax) into (index, xmin, ymin, xmax, ymax).
        xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
        indices_and_rois = xy_indices_and_rois.contiguous()

        pool = self.roi(x, indices_and_rois)   # x is the 4D feature variable
        # RoIPooling from roi_module.py
        pool = pool.view(pool.size(0), -1)   # flatten each pooled RoI
        fc7 = self.classifier(pool)  # the two fully connected layers
        roi_cls_locs = self.cls_loc(fc7)   # fully connected layer: per-class box coordinates
        roi_scores = self.score(fc7)     # fully connected layer: class scores
        return roi_cls_locs, roi_scores
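
The yx -> xy swap is worth a quick sanity check. A minimal sketch with one hypothetical RoI (values made up for illustration):

import torch as t

roi_indices = t.tensor([0.])                   # image index of the RoI
rois = t.tensor([[10., 20., 110., 220.]])      # (ymin, xmin, ymax, xmax)

indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1)  # (1, 5)
xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]     # -> (index, x1, y1, x2, y2)
print(xy_indices_and_rois)  # tensor([[  0.,  20.,  10., 220., 110.]])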
class RoI(Function):
    def __init__(self, outh, outw, spatial_scale):
        self.forward_fn = load_kernel('roi_forward', kernel_forward)
        self.backward_fn = load_kernel('roi_backward', kernel_backward)
        self.outh, self.outw, self.spatial_scale = outh, outw, spatial_scale

    def forward(self, x, rois):
        # pools RoIs of different sizes to one fixed size; the pooled features
        # have shape [300, 512, 7, 7], i.e. every RoI's features become 7x7.
        # NOTE: MAKE SURE input is contiguous too
        x = x.contiguous()  # lay out contiguously in memory
        rois = rois.contiguous()  # lay out contiguously in memory
        self.in_size = B, C, H, W = x.size()
        self.N = N = rois.size(0)  # N should be 2000, the number of RoIs for one image
        output = t.zeros(N, C, self.outh, self.outw).cuda()
        self.argmax_data = t.zeros(N, C, self.outh, self.outw).int().cuda()
        self.rois = rois  # shape (N, 5): image index plus four coordinates
        args = [x.data_ptr(), rois.data_ptr(),
                output.data_ptr(),
                self.argmax_data.data_ptr(),
                self.spatial_scale, C, H, W,
                self.outh, self.outw,
                output.numel()]
        # data_ptr() returns the memory address of the tensor's first element
        # (like a raw pointer); numel() returns the number of elements in a tensor
        stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        self.forward_fn(args=args,
                        block=(CUDA_NUM_THREADS, 1, 1),
                        grid=(GET_BLOCKS(output.numel()), 1, 1),
                        stream=stream)
        #  This step is the key to RoI pooling: the kernel in roi_cupy is
        #  JIT-compiled and launched through CuPy.
        return output

    def backward(self, grad_output):
        # NOTE: IMPORTANT: keep grad_output contiguous
        # TODO: input
        grad_output = grad_output.contiguous()
        B, C, H, W = self.in_size
        grad_input = t.zeros(self.in_size).cuda()
        stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
        args = [grad_output.data_ptr(),
                self.argmax_data.data_ptr(),
                self.rois.data_ptr(),
                grad_input.data_ptr(),
                self.N, self.spatial_scale, C, H, W, self.outh, self.outw,
                grad_input.numel()]
        self.backward_fn(args=args,
                         block=(CUDA_NUM_THREADS, 1, 1),
                         grid=(GET_BLOCKS(grad_input.numel()), 1, 1),
                         stream=stream
                         )
        return grad_input, None
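
The block/grid arithmetic above follows the usual one-thread-per-output-element pattern. A sketch of the launch-size helper, assuming the common definition from roi_module.py (the constant 1024 is that file's thread count, quoted from memory):

CUDA_NUM_THREADS = 1024

def GET_BLOCKS(N, K=CUDA_NUM_THREADS):
    # ceiling division: enough blocks of K threads to cover N elements
    return (N + K - 1) // K

print(GET_BLOCKS(300 * 512 * 7 * 7))  # 7350 blocks for a [300, 512, 7, 7] output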


class RoIPooling2D(t.nn.Module):

    def __init__(self, outh, outw, spatial_scale):
        super(RoIPooling2D, self).__init__()
        self.RoI = RoI(outh, outw, spatial_scale)

    def forward(self, x, rois):
        return self.RoI(x, rois)
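
This hand-rolled CuPy kernel predates native RoI pooling in PyTorch; today the same operation ships as torchvision.ops.roi_pool, which accepts boxes in the (batch_index, x1, y1, x2, y2) layout built above. A sketch of the equivalent call:

import torch
from torchvision.ops import roi_pool

x = torch.randn(1, 512, 37, 50)                     # a (N, C, H/16, W/16) feature map
boxes = torch.tensor([[0., 20., 10., 220., 110.]])  # (batch_index, x1, y1, x2, y2)
pool = roi_pool(x, boxes, output_size=(7, 7), spatial_scale=1. / 16)
print(pool.shape)  # torch.Size([1, 512, 7, 7])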

model/faster_rcnn.py
