Tensorflow2.0:Faster RCNN 代码详解(二)

本篇重点解析《Tensorflow2.0:Faster RCNN 代码详解(一)》中所引用的rpn_heads文件的代码,该文件实现了Faster RCNN中的RPN网络。

import tensorflow as tf
from tensorflow.keras import layers
from detection.core.anchor import anchor_generator, anchor_target
from detection.core.loss import losses
from detection.core.bbox import transforms
from detection.utils.misc import *

根据上述导入语句可以看出,rpn_heads文件引入了losses、anchor_generator、anchor_target、transforms和misc。其中,losses、anchor_generator和anchor_target比较关键,将在后面几个部分分别给出解析。

第一部分 针对rpn_heads文件代码解析

class RPNHead(tf.keras.Model):
    '''Region Proposal Network (RPN) head of Faster R-CNN.

    Predicts, for every anchor on every pyramid level, a foreground /
    background score and a box refinement, computes the RPN losses and
    turns the predictions into region proposals.
    '''

    def __init__(self,
                 anchor_scales=(32, 64, 128, 256, 512),
                 anchor_ratios=(0.5, 1, 2),
                 anchor_feature_strides=(4, 8, 16, 32, 64),
                 proposal_count=2000,
                 nms_threshold=0.7,
                 target_means=(0., 0., 0., 0.),
                 target_stds=(0.1, 0.1, 0.2, 0.2),
                 num_rpn_deltas=256,
                 positive_fraction=0.5,
                 pos_iou_thr=0.7,
                 neg_iou_thr=0.3,
                 **kwags):
        r'''Network head of Region Proposal Network.

                                      / - rpn_cls (1x1 conv)
        input - rpn_conv (3x3 conv) -
                                      \ - rpn_reg (1x1 conv)

        Args:
            anchor_scales: 1D array of anchor sizes in pixels.
            anchor_ratios: 1D array of anchor ratios of width/height.
            anchor_feature_strides: Stride of the feature map relative
                to the image in pixels.
            proposal_count: int. RPN proposals kept after non-maximum
                suppression.
            nms_threshold: float. Non-maximum suppression threshold to
                filter RPN proposals.
            target_means: [4] Bounding box refinement mean.
            target_stds: [4] Bounding box refinement standard deviation.
            num_rpn_deltas: int. Number of anchors sampled per image.
            positive_fraction: float. Share of sampled anchors that may
                be positive.
            pos_iou_thr: float. IoU at or above which an anchor is positive.
            neg_iou_thr: float. IoU below which an anchor is negative.
            **kwags: forwarded to tf.keras.Model.
        '''
        # NOTE(review): the signature was truncated in this listing. The
        # defaults for proposal_count (2000), num_rpn_deltas (256),
        # positive_fraction (0.5), pos_iou_thr (0.7) and neg_iou_thr (0.3)
        # follow the values quoted in the accompanying comments;
        # nms_threshold=0.7 is not visible anywhere here -- confirm against
        # the upstream repository.
        super(RPNHead, self).__init__(**kwags)
        self.proposal_count = proposal_count
        self.nms_threshold = nms_threshold
        self.target_means = target_means
        self.target_stds = target_stds

        self.generator = anchor_generator.AnchorGenerator(
            scales=anchor_scales,
            ratios=anchor_ratios,
            feature_strides=anchor_feature_strides)
        self.anchor_target = anchor_target.AnchorTarget(
            target_means=target_means,
            target_stds=target_stds,
            num_rpn_deltas=num_rpn_deltas,
            positive_fraction=positive_fraction,
            pos_iou_thr=pos_iou_thr,
            neg_iou_thr=neg_iou_thr)
        self.rpn_class_loss = losses.rpn_class_loss
        self.rpn_bbox_loss = losses.rpn_bbox_loss

        # NOTE(review): the three Conv2D calls below were cut off after
        # their first line in this listing; any further keyword arguments
        # (e.g. initializer or layer name) are not visible and must be
        # restored from the upstream source.
        # Shared convolutional base of the RPN
        self.rpn_conv_shared = layers.Conv2D(512, (3, 3), padding='same')
        # Two scores (background / foreground) per anchor in each cell.
        self.rpn_class_raw = layers.Conv2D(len(anchor_ratios) * 2, (1, 1))
        # Four box deltas per anchor in each cell.
        self.rpn_delta_pred = layers.Conv2D(len(anchor_ratios) * 4, (1, 1))

    def call(self, inputs, training=True):
        '''Run the RPN head on every level of the feature pyramid.

        Args:
            inputs: iterable of feature maps, one per pyramid level, each
                [batch_size, feat_map_height, feat_map_width, channels].
            training: not used by this method.

        Returns:
            rpn_class_logits: [batch_size, num_anchors, 2]
            rpn_probs: [batch_size, num_anchors, 2]
            rpn_deltas: [batch_size, num_anchors, 4]
        '''
        # Example from the article (1216x1216 padded image, five levels):
        #   (1, 304, 304, 256), (1, 152, 152, 256), (1, 76, 76, 256),
        #   (1, 38, 38, 256), (1, 19, 19, 256)
        # For the first level:
        #   rpn_class_raw output: (1, 304, 304, 6)   3 anchors x (bg, fg)
        #   rpn_class_logits:     (1, 277248, 2)     304 * 304 * 3 anchors
        #   rpn_delta_pred output:(1, 304, 304, 12)  3 anchors x 4 deltas
        #   rpn_deltas:           (1, 277248, 4)
        layer_outputs = []
        for feat in inputs:
            # Spatial size is preserved ('same' padding); channels become 512.
            shared = self.rpn_conv_shared(feat)
            shared = tf.nn.relu(shared)

            # Objectness logits, flattened to one row per anchor.
            x = self.rpn_class_raw(shared)  # e.g. (1, 304, 304, 6)
            rpn_class_logits = tf.reshape(x, [tf.shape(x)[0], -1, 2])
            # Softmax over the last axis gives (bg prob, fg prob) per anchor.
            rpn_probs = tf.nn.softmax(rpn_class_logits)

            # Box refinement deltas, flattened to one row per anchor.
            x = self.rpn_delta_pred(shared)
            rpn_deltas = tf.reshape(x, [tf.shape(x)[0], -1, 4])

            layer_outputs.append([rpn_class_logits, rpn_probs, rpn_deltas])

        # Per-level shapes in the example:
        #   (1, 277248, 2) (1, 277248, 2) (1, 277248, 4)
        #   (1, 69312, 2)  (1, 69312, 2)  (1, 69312, 4)
        #   (1, 17328, 2)  (1, 17328, 2)  (1, 17328, 4)
        #   (1, 4332, 2)   (1, 4332, 2)   (1, 4332, 4)
        #   (1, 1083, 2)   (1, 1083, 2)   (1, 1083, 4)

        # layer_outputs holds one [logits, probs, deltas] triple per level.
        # zip(*...) regroups it into three tuples (all logits, all probs,
        # all deltas), each of which is concatenated along the anchor axis.
        outputs = [tf.concat(list(per_level), axis=1)
                   for per_level in zip(*layer_outputs)]
        rpn_class_logits, rpn_probs, rpn_deltas = outputs
        # Example output shapes: (1, 369303, 2) (1, 369303, 2) (1, 369303, 4)
        return rpn_class_logits, rpn_probs, rpn_deltas

    def loss(self, rpn_class_logits, rpn_deltas, gt_boxes, gt_class_ids, img_metas):
        '''Compute the RPN classification and box-regression losses.

        Args:
            rpn_class_logits: [batch_size, num_anchors, 2]
            rpn_deltas: [batch_size, num_anchors, 4]
            gt_boxes: [batch_size, num_gt_boxes, (y1, x1, y2, x2)]
            gt_class_ids: [batch_size, num_gt_boxes]
            img_metas: [batch_size, 11]

        Returns:
            (rpn_class_loss, rpn_bbox_loss)
        '''
        # 1. Generate every anchor box plus a per-image flag telling
        #    whether it is valid (1) or lies in the padded area (0).
        #    anchors: [num_anchors, (y1, x1, y2, x2)]
        #    valid_flags: [batch_size, num_anchors]
        anchors, valid_flags = self.generator.generate_pyramid_anchors(img_metas)

        # 2. Build the training targets: sample anchors per image and
        #    compute (dy, dx, log(dh), log(dw)) for the positive ones.
        #    rpn_target_matchs: [batch_size, num_anchors]
        #        (1 = positive, -1 = negative, 0 = neutral)
        #    rpn_target_deltas: [batch_size, num_rpn_deltas, 4]
        rpn_target_matchs, rpn_target_deltas = self.anchor_target.build_targets(
            anchors, valid_flags, gt_boxes, gt_class_ids)

        # 3. Classification loss and box-regression loss.
        rpn_class_loss = self.rpn_class_loss(rpn_target_matchs, rpn_class_logits)
        rpn_bbox_loss = self.rpn_bbox_loss(rpn_target_deltas, rpn_target_matchs, rpn_deltas)

        return rpn_class_loss, rpn_bbox_loss

    def get_proposals(self, rpn_probs, rpn_deltas, img_metas, with_probs=False):
        '''Calculate proposals for every image in the batch.

        Args:
            rpn_probs: [batch_size, num_anchors, (bg prob, fg prob)]
            rpn_deltas: [batch_size, num_anchors, (dy, dx, log(dh), log(dw))]
            img_metas: [batch_size, 11]
            with_probs: bool.

        Returns:
            proposals_list: list of [num_proposals, (y1, x1, y2, x2)] in
                normalized coordinates if with_probs is False.
                Otherwise, the shape of proposals in proposals_list is
                [num_proposals, (y1, x1, y2, x2, score)]

        Note that num_proposals is no more than proposal_count. And different
        images in one batch may have different num_proposals.
        '''
        # anchors: [num_anchors, 4]; valid_flags: [batch_size, num_anchors]
        anchors, valid_flags = self.generator.generate_pyramid_anchors(img_metas)
        # Keep only the foreground probability: [batch_size, num_anchors]
        rpn_probs = rpn_probs[:, :, 1]
        # Padded image shape per image, e.g. [[1216, 1216]]
        pad_shapes = calc_pad_shapes(img_metas)
        proposals_list = [
            self._get_proposals_single(
                rpn_probs[i], rpn_deltas[i], anchors, valid_flags[i], pad_shapes[i], with_probs)
            for i in range(img_metas.shape[0])
        ]
        return proposals_list

    def _get_proposals_single(self, rpn_probs, rpn_deltas, anchors, valid_flags, img_shape, with_probs):
        '''Calculate proposals for a single image.

        Args:
            rpn_probs: [num_anchors]
            rpn_deltas: [num_anchors, (dy, dx, log(dh), log(dw))]
            anchors: [num_anchors, (y1, x1, y2, x2)] anchors defined in pixel coordinates.
            valid_flags: [num_anchors]
            img_shape: np.ndarray. [2]. (img_height, img_width)
            with_probs: bool.

        Returns:
            proposals: [num_proposals, (y1, x1, y2, x2)] in normalized
                coordinates, with the score appended as a fifth column
                when with_probs is True.
        '''
        H, W = img_shape

        # 1. Drop anchors flagged as invalid. tf.boolean_mask keeps the
        #    rows where the mask is True, e.g. [369303] => [215169].
        valid_flags = tf.cast(valid_flags, tf.bool)
        rpn_probs = tf.boolean_mask(rpn_probs, valid_flags)
        rpn_deltas = tf.boolean_mask(rpn_deltas, valid_flags)
        anchors = tf.boolean_mask(anchors, valid_flags)

        # 2. Keep at most the 6000 highest-scoring anchors before NMS to
        #    limit its cost. tf.nn.top_k returns the indices of the k
        #    largest values, sorted in descending order.
        pre_nms_limit = min(6000, anchors.shape[0])  # min(6000, 215169) => 6000
        ix = tf.nn.top_k(rpn_probs, pre_nms_limit, sorted=True).indices
        rpn_probs = tf.gather(rpn_probs, ix)
        rpn_deltas = tf.gather(rpn_deltas, ix)
        anchors = tf.gather(anchors, ix)

        # 3. Refine the anchors with the predicted deltas, clip them to
        #    the image and normalize the coordinates.
        # NOTE(review): target_means/target_stds are presumably used by
        # delta2bbox to undo the normalization applied by bbox2delta when
        # the targets were built -- confirm in detection.core.bbox.transforms.
        proposals = transforms.delta2bbox(anchors, rpn_deltas, self.target_means, self.target_stds)
        # Clip to the valid image area, [pre_nms_limit, 4]
        window = tf.constant([0., 0., H, W], dtype=tf.float32)
        proposals = transforms.bbox_clip(proposals, window)
        # Normalize the coordinates by the image height and width.
        proposals = proposals / tf.constant([H, W, H, W], dtype=tf.float32)

        # 4. Non-maximum suppression: keeps at most proposal_count boxes,
        #    discarding boxes whose IoU with a higher-scoring box exceeds
        #    nms_threshold.
        indices = tf.image.non_max_suppression(proposals, rpn_probs, self.proposal_count, self.nms_threshold)
        proposals = tf.gather(proposals, indices)  # [num_proposals, (y1, x1, y2, x2)]
        if with_probs:
            proposal_probs = tf.expand_dims(tf.gather(rpn_probs, indices), axis=1)
            proposals = tf.concat([proposals, proposal_probs], axis=1)
        return proposals

第二部分 针对loss文件代码解析

import tensorflow as tf
from tensorflow import keras

def smooth_l1_loss(y_true, y_pred):
    '''Implements the element-wise Smooth-L1 loss.

    Args:
        y_true: target values; typically [N, 4], but could be any shape.
        y_pred: predicted values, same shape as y_true.

    Returns:
        Tensor of per-element losses: 0.5 * d**2 where d = |y_true - y_pred|
        is below 1, and d - 0.5 elsewhere. No reduction is applied.
    '''
    diff = tf.abs(y_true - y_pred)
    # 1.0 where the quadratic branch applies, 0.0 where the linear one does.
    less_than_one = tf.cast(tf.less(diff, 1.0), tf.float32)
    loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5)
    return loss

def rpn_class_loss(target_matchs, rpn_class_logits):
    '''RPN anchor classifier loss.

    Args:
        target_matchs: [batch_size, num_anchors]. Anchor match type. 1=positive,
            -1=negative, 0=neutral anchor.
        rpn_class_logits: [batch_size, num_anchors, 2]. RPN classifier logits for FG/BG.

    Returns:
        Scalar mean cross-entropy over the non-neutral anchors, or 0.0 when
        there are none.
    '''
    # Map the match values to class labels: 1 for positive anchors and
    # 0 for everything else.
    anchor_class = tf.cast(tf.equal(target_matchs, 1), tf.int32)
    # Positive and negative anchors both contribute to the loss; only the
    # neutral ones (match value 0) are left out.
    indices = tf.where(tf.not_equal(target_matchs, 0))
    anchor_class = tf.gather_nd(anchor_class, indices)
    # Logits of the contributing anchors.
    rpn_class_logits = tf.gather_nd(rpn_class_logits, indices)

    num_classes = rpn_class_logits.shape[-1]
    # Two-class cross-entropy computed directly from the logits.
    loss = keras.losses.categorical_crossentropy(tf.one_hot(anchor_class, depth=num_classes),
                                                 rpn_class_logits, from_logits=True)

    # Guard against an empty selection, whose mean would be NaN.
    loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)
    return loss

def rpn_bbox_loss(target_deltas, target_matchs, rpn_deltas):
    '''Return the RPN bounding box loss.

    Args:
        target_deltas: [batch, num_rpn_deltas, (dy, dx, log(dh), log(dw))].
            Uses 0 padding to fill in unused bbox deltas.
        target_matchs: [batch, anchors]. Anchor match type. 1=positive,
            -1=negative, 0=neutral anchor.
        rpn_deltas: [batch, anchors, (dy, dx, log(dh), log(dw))]

    Returns:
        Scalar mean Smooth-L1 loss over the positive anchors, or 0.0 when
        there are none.
    '''
    def batch_pack(x, counts, num_rows):
        '''Picks different number of values from each row
        in x depending on the values in counts.
        '''
        outputs = []
        for i in range(num_rows):
            outputs.append(x[i, :counts[i]])
        return tf.concat(outputs, axis=0)

    # Positive anchors contribute to the loss, but negative and
    # neutral anchors (match value of 0 or -1) don't.
    indices = tf.where(tf.equal(target_matchs, 1))

    # Pick bbox deltas that contribute to the loss
    rpn_deltas = tf.gather_nd(rpn_deltas, indices)

    # Trim target bounding box deltas to the same length as rpn_deltas:
    # per image, only the first `count` rows are real targets, the rest
    # is zero padding.
    batch_counts = tf.reduce_sum(tf.cast(tf.equal(target_matchs, 1), tf.int32), axis=1)
    # NOTE(review): the third argument was cut off in this listing; the
    # batch size of target_deltas is the only value consistent with
    # batch_pack's num_rows parameter -- confirm against upstream.
    target_deltas = batch_pack(target_deltas, batch_counts,
                               target_deltas.shape.as_list()[0])

    loss = smooth_l1_loss(target_deltas, rpn_deltas)
    # Guard against an empty selection, whose mean would be NaN.
    loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)
    return loss

def rcnn_class_loss(target_matchs_list, rcnn_class_logits_list):
    '''Loss for the classifier head of Faster RCNN.

    Args:
        target_matchs_list: list of [num_rois]. Integer class IDs. Uses zero
            padding to fill in the array.
        rcnn_class_logits_list: list of [num_rois, num_classes]

    Returns:
        Scalar mean cross-entropy over all ROIs in the batch, or 0.0 when
        there are none.
    '''
    # Merge the per-image lists into single batch-wide tensors.
    class_ids = tf.concat(target_matchs_list, 0)
    class_logits = tf.concat(rcnn_class_logits_list, 0)
    class_ids = tf.cast(class_ids, 'int64')

    num_classes = class_logits.shape[-1]
    loss = keras.losses.categorical_crossentropy(tf.one_hot(class_ids, depth=num_classes),
                                                 class_logits, from_logits=True)

    # Guard against an empty batch, whose mean would be NaN.
    loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)
    return loss

def rcnn_bbox_loss(target_deltas_list, target_matchs_list, rcnn_deltas_list):
    '''Loss for Faster R-CNN bounding box refinement.

    Args:
        target_deltas_list: list of [num_positive_rois, (dy, dx, log(dh), log(dw))]
        target_matchs_list: list of [num_rois]. Integer class IDs.
        rcnn_deltas_list: list of [num_rois, num_classes, (dy, dx, log(dh), log(dw))]

    Returns:
        Scalar mean Smooth-L1 loss over the positive ROIs, or 0.0 when
        there are none.
    '''
    # Merge the per-image lists into single batch-wide tensors.
    target_deltas = tf.concat(target_deltas_list, 0)
    target_class_ids = tf.concat(target_matchs_list, 0)
    rcnn_deltas = tf.concat(rcnn_deltas_list, 0)

    # Only positive ROIs contribute to the loss. And only
    # the right class_id of each ROI. Get their indices.
    positive_roi_ix = tf.where(target_class_ids > 0)[:, 0]
    positive_roi_class_ids = tf.cast(
        tf.gather(target_class_ids, positive_roi_ix), tf.int64)
    # (roi index, class id) pairs select one delta row per positive ROI.
    indices = tf.stack([positive_roi_ix, positive_roi_class_ids], axis=1)
    # Gather the predicted deltas that contribute to the loss
    rcnn_deltas = tf.gather_nd(rcnn_deltas, indices)

    # Smooth-L1 Loss
    loss = smooth_l1_loss(target_deltas, rcnn_deltas)
    # Guard against an empty selection, whose mean would be NaN.
    loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)

    return loss

第三部分 针对anchor_target文件代码解析

import tensorflow as tf
from detection.core.bbox import geometry, transforms
from detection.utils.misc import trim_zeros

class AnchorTarget:
    '''
    For every generated anchor box ([num_anchors, 4]), create its
    rpn_target_matchs and rpn_target_deltas, which are used to train
    the RPN network.
    '''

    def __init__(self,
                 target_means=(0., 0., 0., 0.),
                 target_stds=(0.1, 0.1, 0.2, 0.2),
                 num_rpn_deltas=256,
                 positive_fraction=0.5,
                 pos_iou_thr=0.7,
                 neg_iou_thr=0.3):
        '''Compute regression and classification targets for anchors.

        Args:
            target_means: [4]. Bounding box refinement mean for RPN.
            target_stds: [4]. Bounding box refinement standard deviation for RPN.
            num_rpn_deltas: int. Maximal number of Anchors per image to feed to rpn heads.
            positive_fraction: float. Maximal share of positive anchors
                among the num_rpn_deltas sampled ones.
            pos_iou_thr: float. IoU at or above which an anchor is positive.
            neg_iou_thr: float. IoU below which an anchor is negative.
        '''
        # NOTE(review): the last four parameters were cut out of the
        # signature in this listing; their defaults follow the values
        # quoted in the comments of _build_single_target (256 * 0.5,
        # IoU 0.7 / 0.3) -- confirm against the upstream repository.
        self.target_means = target_means
        self.target_stds = target_stds
        self.num_rpn_deltas = num_rpn_deltas
        self.positive_fraction = positive_fraction
        self.pos_iou_thr = pos_iou_thr
        self.neg_iou_thr = neg_iou_thr

    def build_targets(self, anchors, valid_flags, gt_boxes, gt_class_ids):
        '''
        Given the anchors and GT boxes, compute overlaps and identify positive
        anchors and deltas to refine them to match their corresponding GT boxes.

        Args:
            anchors: [num_anchors, (y1, x1, y2, x2)] in image coordinates.
            valid_flags: [batch_size, num_anchors]
            gt_boxes: [batch_size, num_gt_boxes, (y1, x1, y2, x2)] in image
                coordinates. batch_size = 1 usually
            gt_class_ids: [batch_size, num_gt_boxes] Integer class IDs.

        Returns:
            rpn_target_matchs: [batch_size, num_anchors] matches between anchors and GT boxes.
                1 = positive anchor, -1 = negative anchor, 0 = neutral anchor
            rpn_target_deltas: [batch_size, num_rpn_deltas, (dy, dx, log(dh), log(dw))]
                Anchor bbox deltas.
        '''
        rpn_target_matchs = []
        rpn_target_deltas = []
        num_imgs = gt_class_ids.shape[0]  # batch size, usually 1
        for i in range(num_imgs):
            target_match, target_delta = self._build_single_target(
                anchors, valid_flags[i], gt_boxes[i], gt_class_ids[i])
            # Collect the per-image results so they can be stacked below.
            rpn_target_matchs.append(target_match)
            rpn_target_deltas.append(target_delta)
        rpn_target_matchs = tf.stack(rpn_target_matchs)
        rpn_target_deltas = tf.stack(rpn_target_deltas)
        # Targets are constants with respect to the network parameters.
        rpn_target_matchs = tf.stop_gradient(rpn_target_matchs)
        rpn_target_deltas = tf.stop_gradient(rpn_target_deltas)
        return rpn_target_matchs, rpn_target_deltas

    def _build_single_target(self, anchors, valid_flags, gt_boxes, gt_class_ids):
        '''Compute targets per instance.

        Samples at most num_rpn_deltas anchors of one image and computes
        the deltas between the positive ones and their ground-truth boxes.

        Args:
            anchors: [num_anchors, (y1, x1, y2, x2)]
            valid_flags: [num_anchors]
            gt_class_ids: [num_gt_boxes]. Not used by this method.
            gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]

        Returns:
            target_matchs: [num_anchors]. 1 = positive, -1 = negative,
                0 = neutral; at most num_rpn_deltas entries are non-zero.
            target_deltas: [num_rpn_deltas, (dy, dx, log(dh), log(dw))],
                zero-padded after the rows of the positive anchors.
        '''
        gt_boxes, _ = trim_zeros(gt_boxes)  # remove padded zero boxes, [new_N, 4]
        target_matchs = tf.zeros(anchors.shape[0], dtype=tf.int32)  # [num_anchors]
        # Compute overlaps [num_anchors, num_gt_boxes]
        overlaps = geometry.compute_overlaps(anchors, gt_boxes)

        # Match anchors to GT Boxes
        # If an anchor overlaps ANY GT box with IoU >= pos_iou_thr then it's positive.
        # If an anchor overlaps ALL GT box with IoU < neg_iou_thr then it's negative.
        # Neutral anchors are those that don't match the conditions above,
        # and they don't influence the loss function.
        # However, don't keep any GT box unmatched (rare, but happens). Instead,
        # match it to the closest anchor (even if its max IoU is < neg_iou_thr).

        # 1. Set negative anchors first. They get overwritten below if a GT
        #    box is matched to them.
        # tf.argmax gives the position of the maximum, tf.reduce_max its value.
        anchor_iou_argmax = tf.argmax(overlaps, axis=1)   # [num_anchors], best GT box per anchor
        anchor_iou_max = tf.reduce_max(overlaps, axis=[1])  # [num_anchors], IoU with that GT box

        # Negative anchors get the value -1.
        target_matchs = tf.where(anchor_iou_max < self.neg_iou_thr,
                                 -tf.ones(anchors.shape[0], dtype=tf.int32), target_matchs)

        # 2. Anchors flagged as invalid are reset to 0 (neutral).
        target_matchs = tf.where(tf.equal(valid_flags, 1),
                                 target_matchs, tf.zeros(anchors.shape[0], dtype=tf.int32))

        # Anchors whose best IoU reaches pos_iou_thr become positive (1).
        target_matchs = tf.where(anchor_iou_max >= self.pos_iou_thr,
                                 tf.ones(anchors.shape[0], dtype=tf.int32), target_matchs)

        # 3. Make sure every GT box has at least one positive anchor: the
        #    anchor with the highest IoU for each GT box is set to 1.
        gt_iou_argmax = tf.argmax(overlaps, axis=0)  # [N_gt_boxes]
        # scatter_update writes the value 1 at the given indices; it needs a
        # variable to write into, hence the tf.Variable wrapper.
        target_matchs = tf.compat.v1.scatter_update(tf.Variable(target_matchs), gt_iou_argmax, 1)

        # 4. Limit the number of sampled anchors.
        # Positive anchors: if there are more than
        # num_rpn_deltas * positive_fraction, reset a random surplus to
        # neutral (0).
        ids = tf.where(tf.equal(target_matchs, 1))  # [N_pos_anchors, 1]
        ids = tf.squeeze(ids, 1)  # [N_pos_anchors]
        extra = ids.shape.as_list()[0] - int(self.num_rpn_deltas * self.positive_fraction)
        if extra > 0:  # number of surplus positive anchors
            ids = tf.random.shuffle(ids)[:extra]
            target_matchs = tf.compat.v1.scatter_update(target_matchs, ids, 0)
        # Negative anchors: keep num_rpn_deltas minus the number of
        # positives and reset a random surplus to neutral (0).
        ids = tf.where(tf.equal(target_matchs, -1))  # [N_neg_anchors, 1]
        ids = tf.squeeze(ids, 1)
        extra = ids.shape.as_list()[0] - (self.num_rpn_deltas -
            tf.reduce_sum(tf.cast(tf.equal(target_matchs, 1), tf.int32)))
        if extra > 0:
            # Reset the extra ones to neutral
            ids = tf.random.shuffle(ids)[:extra]
            target_matchs = tf.compat.v1.scatter_update(target_matchs, ids, 0)

        # 5. For positive anchors, gather their coordinates and those of
        #    the GT box each one overlaps most.
        ids = tf.where(tf.equal(target_matchs, 1))  # [N_pos_anchors, 1]
        a = tf.gather_nd(anchors, ids)   # [N_pos_anchors, 4]

        anchor_idx = tf.gather_nd(anchor_iou_argmax, ids)   # best GT index per positive anchor
        gt = tf.gather(gt_boxes, anchor_idx)     # [N_pos_anchors, 4]

        # 6. Shift and scale needed to turn each positive anchor into its
        #    GT box: [N_pos_anchors, (dy, dx, log(dh), log(dw))]
        target_deltas = transforms.bbox2delta(a, gt, self.target_means, self.target_stds)

        # Zero-pad at the end up to num_rpn_deltas rows.
        padding = tf.maximum(self.num_rpn_deltas - tf.shape(target_deltas)[0], 0)
        target_deltas = tf.pad(target_deltas, [(0, padding), (0, 0)])

        return target_matchs, target_deltas

第四部分 针对anchor_generator文件代码解析

import tensorflow as tf
from detection.utils.misc import calc_img_shapes, calc_batch_padded_shape

class AnchorGenerator:
    '''
    This class operates on the padded image, e.g. [1216, 1216], and
    generates anchor boxes for each point in the padded image, with
    stride = feature_strides.

    An "anchor" here is one cell of a feature map, whose size in the image
    is feature_stride. Each pyramid level uses a single scale, so every
    cell yields len(ratios) anchor boxes: the scale sets the box size and
    the ratio (width/height) its shape.

    number of anchors on a level = (1216 // feature_stride)^2
    number of anchor boxes on a level = number of anchors * len(ratios)
    '''

    def __init__(self,
                 scales=(32, 64, 128, 256, 512),
                 ratios=(0.5, 1, 2),
                 feature_strides=(4, 8, 16, 32, 64)):
        '''Anchor Generator

        Args:
            scales: 1D array of anchor sizes in pixels.
            ratios: 1D array of anchor ratios of width/height.
            feature_strides: Stride of the feature map relative to the image in pixels.
        '''
        self.scales = scales
        self.ratios = ratios
        self.feature_strides = feature_strides

    def generate_pyramid_anchors(self, img_metas):
        '''Generate the multi-level anchors for Region Proposal Network

        Args:
            img_metas: [batch_size, 11]

        Returns:
            anchors: [num_anchors, (y1, x1, y2, x2)] in image coordinates.
            valid_flags: [batch_size, num_anchors]. 1 marks an anchor box
                whose center lies inside the image, 0 one that does not.
        '''
        pad_shape = calc_batch_padded_shape(img_metas)  # e.g. [1216, 1216]
        # e.g. [(304, 304), (152, 152), (76, 76), (38, 38), (19, 19)]
        feature_shapes = [(pad_shape[0] // stride, pad_shape[1] // stride)
                          for stride in self.feature_strides]
        anchors = [
            self._generate_level_anchors(level, feature_shape)
            for level, feature_shape in enumerate(feature_shapes)
        ]  # e.g. [277248, 4], [69312, 4], [17328, 4], [4332, 4], [1083, 4]
        anchors = tf.concat(anchors, axis=0)  # e.g. [369303, 4]

        # generate valid flags, one row per image
        img_shapes = calc_img_shapes(img_metas)  # e.g. (800, 1067)
        valid_flags = [
            self._generate_valid_flags(anchors, img_shapes[i])
            for i in range(img_shapes.shape[0])
        ]
        valid_flags = tf.stack(valid_flags, axis=0)

        # tf.stop_gradient keeps these tensors out of gradient computation.
        anchors = tf.stop_gradient(anchors)
        valid_flags = tf.stop_gradient(valid_flags)
        return anchors, valid_flags

    def _generate_valid_flags(self, anchors, img_shape):
        '''Flag the anchor boxes that lie on the padded area.

        An anchor box counts as inside the image when its center is.

        Args:
            anchors: [num_anchors, (y1, x1, y2, x2)] in image coordinates.
            img_shape: sequence whose first two entries are the image
                height and width.

        Returns:
            valid_flags: [num_anchors]. 1 = inside the image, 0 = outside.
        '''
        y_center = (anchors[:, 2] + anchors[:, 0]) / 2  # [num_anchors]
        x_center = (anchors[:, 3] + anchors[:, 1]) / 2
        valid_flags = tf.ones(anchors.shape[0], dtype=tf.int32)  # [num_anchors]
        zeros = tf.zeros(anchors.shape[0], dtype=tf.int32)
        # set boxes whose center is out of image area as invalid.
        valid_flags = tf.where(y_center <= img_shape[0], valid_flags, zeros)
        valid_flags = tf.where(x_center <= img_shape[1], valid_flags, zeros)
        return valid_flags

    def _generate_level_anchors(self, level, feature_shape):
        '''Generate the anchors given the spatial shape of feature map.

        Every feature-map cell gets len(ratios) boxes centered on the
        cell's shift position (cell index * stride).

        Args:
            level: int. Index of the pyramid level; selects the scale and
                the feature stride.
            feature_shape: (height, width)

        Returns:
            Tensor [anchors_num, (y1, x1, y2, x2)]
        '''
        scale = self.scales[level]  # e.g. 32
        ratios = self.ratios  # e.g. (0.5, 1, 2)
        feature_stride = self.feature_strides[level]  # e.g. 4
        # Get all combinations of the level's single scale and the ratios.
        scales, ratios = tf.meshgrid([float(scale)], ratios)

        scales = tf.reshape(scales, [-1])  # e.g. [32, 32, 32]
        ratios = tf.reshape(ratios, [-1])  # e.g. [0.5, 1, 2]
        # Enumerate heights and widths from scales and ratios
        heights = scales / tf.sqrt(ratios)  # e.g. [45, 32, 22]
        widths = scales * tf.sqrt(ratios)   # e.g. [22, 32, 45]
        # Enumerate shifts in image space, e.g. [0, 4, ..., 1216-4]
        shifts_y = tf.multiply(tf.range(feature_shape[0]), feature_stride)  # e.g. shape (304,)
        shifts_x = tf.multiply(tf.range(feature_shape[1]), feature_stride)
        # tf.meshgrid builds the coordinate grid of all (y, x) shifts.
        shifts_x, shifts_y = tf.cast(shifts_x, tf.float32), tf.cast(shifts_y, tf.float32)
        shifts_x, shifts_y = tf.meshgrid(shifts_x, shifts_y)  # e.g. [304, 304]

        # Enumerate combinations of shifts, widths, and heights:
        # each result is [num_cells, len(ratios)], e.g. [92416, 3]
        box_widths, box_centers_x = tf.meshgrid(widths, shifts_x)
        box_heights, box_centers_y = tf.meshgrid(heights, shifts_y)

        # Reshape to get a list of (y, x) and a list of (h, w)
        box_centers = tf.reshape(tf.stack([box_centers_y, box_centers_x], axis=2), (-1, 2))
        box_sizes = tf.reshape(tf.stack([box_heights, box_widths], axis=2), (-1, 2))

        # Convert to corner coordinates (y1, x1, y2, x2), e.g. [277248, 4]
        boxes = tf.concat([box_centers - 0.5 * box_sizes,
                           box_centers + 0.5 * box_sizes], axis=1)

        return boxes
