Object Detection Series 5: Faster R-CNN, Part 2 (Code Implementation)

Based on the Heima Programmer (黑马程序员) tutorial.

(1) train.py

# train
import os
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
from detection.datasets import coco, data_generator
from detection.models.detectors import faster_rcnn

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

# Added to work around "Failed to get convolution algorithm. This is probably because cuDNN failed to initialize":
# restrict TensorFlow to GPU 0 (CUDA_VISIBLE_DEVICES expects a device index, not a device name).
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

def train():
    # 1. Build the COCO training dataset with the preprocessing pipeline (flip, normalize, rescale, pad).
    train_dataset = coco.CocoDataSet('./data/coco2017', 'train',
                         flip_ratio=0.5,
                         pad_mode='fixed',
                         mean=(123.675, 116.28, 103.53),
                         std=(1., 1., 1.),
                         scale=(800, 1216))
    print('=====', train_dataset)

    # 2. Wrap the dataset in a generator; each sample yields 4 multi-dimensional arrays
    #    (image, image meta, bounding boxes, labels).
    train_generator = data_generator.DataGenerator(train_dataset)
    print("+++++", train_generator)

    # 3. Build a tf.data.Dataset from the Python generator.
    tf_dataset = tf.data.Dataset.from_generator(train_generator,
                                                (tf.float32, tf.float32, tf.float32, tf.float32))
    print('------', tf_dataset)
    # 4. Batch size 1 (one image per step), with prefetching and shuffling.
    tf_dataset = tf_dataset.batch(1).prefetch(100).shuffle(100)

    # 5. Build the model and train: first get the number of classes (including background).
    num_classes = len(train_dataset.get_categories())
    print('-=-=-=-=', num_classes)
    # 6. Build the model.
    model = faster_rcnn.FasterRCNN(num_classes=num_classes)
    # 7. Define the optimizer.
    optimizer = tf.keras.optimizers.SGD(1e-3, momentum=0.9, nesterov=True)
    # 8. Training loop: 1 epoch.
    for epoch in range(1):
        # After batching, each element of tf_dataset gains a leading batch dimension.
        for (batch, inputs) in enumerate(tf_dataset):
            # The 4 tensors are: images, image meta info, GT bounding boxes, GT labels.
            batch_imgs, batch_metas, batch_bboxes, batch_labels = inputs
            print("batch_imgs shape:", batch_imgs.shape)
            print(batch_imgs, batch_metas, batch_bboxes, batch_labels)

            with tf.GradientTape() as tape:
                # Loss of each sub-network.
                rpn_class_loss, rpn_bbox_loss, rcnn_class_loss, rcnn_bbox_loss = \
                    model((batch_imgs, batch_metas, batch_bboxes, batch_labels), training=True)
                # Total loss = RPN losses + RCNN losses.
                loss = rpn_class_loss + rpn_bbox_loss + rcnn_class_loss + rcnn_bbox_loss
            # Compute gradients of the total loss.
            grads = tape.gradient(loss, model.trainable_variables)
            # Apply the gradients to the model variables.
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            print("epoch: %d, batch: %d, loss: %f" % (epoch + 1, batch + 1, loss))


def test():

    train_dataset = coco.CocoDataSet('./data/coco2017', 'val')

    # 获取数据和模型
    train_generator = data_generator.DataGenerator(train_dataset)
    tf_dataset = tf.data.Dataset.from_generator(train_generator,
                                                (tf.float32, tf.float32, tf.float32, tf.float32))
    tf_dataset = tf_dataset.batch(1).prefetch(100).shuffle(100)
    num_classes = len(train_dataset.get_categories())
    model = faster_rcnn.FasterRCNN(num_classes=num_classes)
    print(num_classes)

    for (batch, inputs) in enumerate(tf_dataset):
        img, img_meta, _, _ = inputs
        print(img, img_meta)

        detections_list = model((img, img_meta), training=False)

        print(detections_list)


if __name__ == '__main__':
    train()
    # test()

core
(2) anchor_generator

# anchor_generator
import tensorflow as tf
from detection.utils.misc import calc_img_shapes, calc_batch_padded_shape

class AnchorGenerator:
    """
    This class operate on padded iamge, eg. [1216, 1216]
    and generate scales*ratios number of anchor boxes for each point in
    padded image, with stride = feature_strides
    number of anchor = (1216 // feature_stride)^2
    number of anchor boxes = number of anchor * (scales_len*ratio_len)
    """
    def __init__(self, 
                 scales=(32, 64, 128, 256, 512), 
                 ratios=(0.5, 1, 2), 
                 feature_strides=(4, 8, 16, 32, 64)):
        '''
        Anchor Generator
        
        Attributes
        ---
            scales: 1D array of anchor sizes in pixels.
            ratios: 1D array of anchor ratios of width/height.
            feature_strides: Stride of the feature map relative to the image in pixels.
        '''
        self.scales = scales
        self.ratios = ratios
        self.feature_strides = feature_strides
     
    def generate_pyramid_anchors(self, img_metas):
        '''
        Generate the multi-level anchors for Region Proposal Network
        
        Args
        ---
            img_metas: [batch_size, 11]
        
        Returns
        ---
            anchors: [num_anchors, (y1, x1, y2, x2)] in image coordinates.
            valid_flags: [batch_size, num_anchors]
        '''
        # generate anchors
        pad_shape = calc_batch_padded_shape(img_metas) # [1216, 1216]
        # : [(304, 304), (152, 152), (76, 76), (38, 38), (19, 19)]
        feature_shapes = [(pad_shape[0] // stride, pad_shape[1] // stride)
                          for stride in self.feature_strides]
        anchors = [
            self._generate_level_anchors(level, feature_shape)
            for level, feature_shape in enumerate(feature_shapes)
        ] # [277248, 4], [69312, 4], [17328, 4], [4332, 4], [1083, 4]
        anchors = tf.concat(anchors, axis=0) # [369303, 4]
        # print('total anchors:', anchors.shape)
        # print('---------')

        # generate valid flags
        img_shapes = calc_img_shapes(img_metas) # (800, 1067)
        valid_flags = [
            self._generate_valid_flags(anchors, img_shapes[i])
            for i in range(img_shapes.shape[0])
        ]
        valid_flags = tf.stack(valid_flags, axis=0)
        
        anchors = tf.stop_gradient(anchors)
        valid_flags = tf.stop_gradient(valid_flags)
        
        return anchors, valid_flags
    
    def _generate_valid_flags(self, anchors, img_shape):
        '''
        Remove anchor boxes that lie on the padded area.

        Args
        ---
            anchors: [num_anchors, (y1, x1, y2, x2)] in image coordinates.
            img_shape: Tuple. (height, width, channels)
            
        Returns
        ---
            valid_flags: [num_anchors]
        '''
        y_center = (anchors[:, 2] + anchors[:, 0]) / 2 # [369300]
        x_center = (anchors[:, 3] + anchors[:, 1]) / 2
        
        valid_flags = tf.ones(anchors.shape[0], dtype=tf.int32) # [369300]
        zeros = tf.zeros(anchors.shape[0], dtype=tf.int32)
        # set boxes whose center is out of image area as invalid.
        valid_flags = tf.where(y_center <= img_shape[0], valid_flags, zeros)
        valid_flags = tf.where(x_center <= img_shape[1], valid_flags, zeros)
        
        return valid_flags
    
    def _generate_level_anchors(self, level, feature_shape):
        '''Generate the anchors given the spatial shape of the feature map.

        For the 1216x1216 padded input, the per-level debug output looks like
        (ratios are [0.5, 1., 2.] at every level):

        scale: 32,  pos: (304, 304), boxes: (277248, 4)
        scale: 64,  pos: (152, 152), boxes: (69312, 4)
        scale: 128, pos: (76, 76),   boxes: (17328, 4)
        scale: 256, pos: (38, 38),   boxes: (4332, 4)
        scale: 512, pos: (19, 19),   boxes: (1083, 4)
        total anchors: (369303, 4)

        Args
        ---
            feature_shape: (height, width)

        Returns
        ---
            anchors: [anchors_num, (y1, x1, y2, x2)]
        '''
        scale = self.scales[level]
        ratios = self.ratios
        feature_stride = self.feature_strides[level]
        
        # Get all combinations of scales and ratios
        scales, ratios = tf.meshgrid([float(scale)], ratios)
        scales = tf.reshape(scales, [-1]) # [32, 32, 32]
        ratios = tf.reshape(ratios, [-1]) # [0.5, 1, 2]
        
        # Enumerate heights and widths from scales and ratios
        heights = scales / tf.sqrt(ratios) # [45, 32, 22], square root
        widths = scales * tf.sqrt(ratios)  # [22, 32, 45]

        # Enumerate shifts in feature space, [0, 4, ..., 1216-4]
        shifts_y = tf.multiply(tf.range(feature_shape[0]), feature_stride)
        shifts_x = tf.multiply(tf.range(feature_shape[1]), feature_stride)
        
        shifts_x, shifts_y = tf.cast(shifts_x, tf.float32), tf.cast(shifts_y, tf.float32)
        shifts_x, shifts_y = tf.meshgrid(shifts_x, shifts_y) # [304, 304, 2] coordinates

        # Enumerate combinations of shifts, widths, and heights # mesh A: [3] B:[304,304]=>[92416] =>[92416,3,2]
        box_widths, box_centers_x = tf.meshgrid(widths, shifts_x)
        box_heights, box_centers_y = tf.meshgrid(heights, shifts_y)

        # Reshape to get a list of (y, x) and a list of (h, w)
        box_centers = tf.reshape(tf.stack([box_centers_y, box_centers_x], axis=2), (-1, 2))
        box_sizes = tf.reshape(tf.stack([box_heights, box_widths], axis=2), (-1, 2))

        # Convert to corner coordinates (y1, x1, y2, x2): [304x304, 3, 4] => [277248, 4]
        boxes = tf.concat([box_centers - 0.5 * box_sizes,
                           box_centers + 0.5 * box_sizes], axis=1)
        # print('scale:', scale)
        # print('ratios:', ratios)
        # print('pos:', shifts_x.shape, shifts_y.shape)
        # print('boxes:', boxes.shape)
        return boxes
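
To make the per-level anchor counts quoted in the comments above concrete, here is a minimal standalone sketch (assuming the 1216x1216 padded input used throughout this post) that reproduces them:

# Sketch only: reproduces the anchor counts from the docstring above for a
# hypothetical 1216x1216 padded image.
pad = 1216
ratios = (0.5, 1, 2)
feature_strides = (4, 8, 16, 32, 64)

total = 0
for stride in feature_strides:
    fh, fw = pad // stride, pad // stride      # feature map size at this level
    n = fh * fw * len(ratios)                  # len(ratios) anchors per feature map cell
    total += n
    print(stride, (fh, fw), n)                 # e.g. 4 (304, 304) 277248
print('total anchors:', total)                 # 369303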

(3) anchor_target

# anchor_target
import tensorflow as tf
from detection.core.bbox import geometry, transforms
from detection.utils.misc import trim_zeros



class AnchorTarget:
    """
    for every generated anchors boxes: [326393, 4],
    create its rpn_target_matchs and rpn_target_matchs
    which is used to train RPN network.
    """
    def __init__(self,
                 target_means=(0., 0., 0., 0.), 
                 target_stds=(0.1, 0.1, 0.2, 0.2),
                 num_rpn_deltas=256,
                 positive_fraction=0.5,
                 pos_iou_thr=0.7,
                 neg_iou_thr=0.3):
        '''
        Compute regression and classification targets for anchors.
        
        Attributes
        ---
            target_means: [4]. Bounding box refinement mean for RPN.
            target_stds: [4]. Bounding box refinement standard deviation for RPN.
            num_rpn_deltas: int. Maximal number of Anchors per image to feed to rpn heads.
            positive_fraction: float.
            pos_iou_thr: float.
            neg_iou_thr: float.
        '''
        self.target_means = target_means
        self.target_stds = target_stds
        self.num_rpn_deltas = num_rpn_deltas
        self.positive_fraction = positive_fraction
        self.pos_iou_thr = pos_iou_thr
        self.neg_iou_thr = neg_iou_thr

    def build_targets(self, anchors, valid_flags, gt_boxes, gt_class_ids):
        '''
        Given the anchors and GT boxes, compute overlaps and identify positive
        anchors and deltas to refine them to match their corresponding GT boxes.

        Args
        ---
            anchors: [num_anchors, (y1, x1, y2, x2)] in image coordinates.
            valid_flags: [batch_size, num_anchors]
            gt_boxes: [batch_size, num_gt_boxes, (y1, x1, y2, x2)] in image 
                coordinates. batch_size = 1 usually
            gt_class_ids: [batch_size, num_gt_boxes] Integer class IDs.

        Returns
        ---
            rpn_target_matchs: [batch_size, num_anchors] matches between anchors and GT boxes.
                1 = positive anchor, -1 = negative anchor, 0 = neutral anchor
            rpn_target_deltas: [batch_size, num_rpn_deltas, (dy, dx, log(dh), log(dw))] 
                Anchor bbox deltas.
        '''
        rpn_target_matchs = []
        rpn_target_deltas = []
        
        num_imgs = gt_class_ids.shape[0] # namely, batchsz , 1
        for i in range(num_imgs):
            target_match, target_delta = self._build_single_target(
                anchors, valid_flags[i], gt_boxes[i], gt_class_ids[i])
            rpn_target_matchs.append(target_match)
            rpn_target_deltas.append(target_delta)
        
        rpn_target_matchs = tf.stack(rpn_target_matchs)
        rpn_target_deltas = tf.stack(rpn_target_deltas)
        
        rpn_target_matchs = tf.stop_gradient(rpn_target_matchs)
        rpn_target_deltas = tf.stop_gradient(rpn_target_deltas)
        
        return rpn_target_matchs, rpn_target_deltas

    def _build_single_target(self, anchors, valid_flags, gt_boxes, gt_class_ids):
        '''Compute targets per instance.
        
        Args
        ---
            anchors: [num_anchors, (y1, x1, y2, x2)]
            valid_flags: [num_anchors]
            gt_class_ids: [num_gt_boxes]
            gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]
        
        Returns
        ---
            target_matchs: [num_anchors]
            target_deltas: [num_rpn_deltas, (dy, dx, log(dh), log(dw))] 
        '''
        gt_boxes, _ = trim_zeros(gt_boxes) # remove padded zero boxes, [new_N, 4]
        
        target_matchs = tf.zeros(anchors.shape[0], dtype=tf.int32) # [369303]
        
        # Compute overlaps [num_anchors, num_gt_boxes]: 369303 vs 10 => [369303, 10]
        overlaps = geometry.compute_overlaps(anchors, gt_boxes)

        # Match anchors to GT Boxes
        # If an anchor overlaps ANY GT box with IoU >= 0.7 then it's positive.
        # If an anchor overlaps ALL GT box with IoU < 0.3 then it's negative.
        # Neutral anchors are those that don't match the conditions above,
        # and they don't influence the loss function.
        # However, don't keep any GT box unmatched (rare, but happens). Instead,
        # match it to the closest anchor (even if its max IoU is < 0.3).
        
        neg_values = tf.constant([0, -1])
        pos_values = tf.constant([0, 1])
        
        # 1. Set negative anchors first. They get overwritten below if a GT box is
        # matched to them. [N_anchors, N_gt_boxes]
        anchor_iou_argmax = tf.argmax(overlaps, axis=1) # [369303] index of the closest gt box for each anchor
        anchor_iou_max = tf.reduce_max(overlaps, axis=[1]) # [369303] IoU with the closest gt box for each anchor
        # if an anchor box overlap all GT box with IoU < 0.3, marked as -1 background
        target_matchs = tf.where(anchor_iou_max < self.neg_iou_thr, 
                                 -tf.ones(anchors.shape[0], dtype=tf.int32), target_matchs)

        # filter invalid anchors
        target_matchs = tf.where(tf.equal(valid_flags, 1),
                                 target_matchs, tf.zeros(anchors.shape[0], dtype=tf.int32))
        # if an anchor overlap with any GT box with IoU > 0.7, marked as foreground
        # 2. Set anchors with high overlap as positive.
        target_matchs = tf.where(anchor_iou_max >= self.pos_iou_thr, 
                                 tf.ones(anchors.shape[0], dtype=tf.int32), target_matchs)

        # 3. Set an anchor for each GT box (regardless of IoU value).        
        gt_iou_argmax = tf.argmax(overlaps, axis=0) # [N_gt_boxes]
        target_matchs = tf.compat.v1.scatter_update(tf.Variable(target_matchs), gt_iou_argmax, 1)
        # update corresponding value=>1 for GT boxes' closest boxes
        
        # Subsample to balance positive and negative anchors
        # Don't let positives be more than half the anchors
        ids = tf.where(tf.equal(target_matchs, 1))  # [N_pos_anchors, 1], [15, 1]
        ids = tf.squeeze(ids, 1) # [15]
        extra = ids.shape.as_list()[0] - int(self.num_rpn_deltas * self.positive_fraction) # 256*0.5
        if extra > 0: # extra means the redundant pos_anchors
            # Reset the extra random ones to neutral
            ids = tf.random.shuffle(ids)[:extra]
            target_matchs = tf.compat.v1.scatter_update(target_matchs, ids, 0)
        # Same for negative proposals
        ids = tf.where(tf.equal(target_matchs, -1)) # [213748, 1]
        ids = tf.squeeze(ids, 1)
        extra = ids.shape.as_list()[0] - (self.num_rpn_deltas - # 213748 - (256 - num_of_pos_anchors:15)
            tf.reduce_sum(tf.cast(tf.equal(target_matchs, 1), tf.int32)))
        if extra > 0: # 213507, so many negative anchors!
            # Reset the extra ones to neutral
            ids = tf.random.shuffle(ids)[:extra]
            target_matchs = tf.compat.v1.scatter_update(target_matchs, ids, 0)
        # We only need 256 anchors, ideally half positive and half negative.
        
        # For positive anchors, compute shift and scale needed to transform them
        # to match the corresponding GT boxes.
        ids = tf.where(tf.equal(target_matchs, 1)) # [15]
        
        a = tf.gather_nd(anchors, ids) # [369303, 4], [15] => [15, 4]
        anchor_idx = tf.gather_nd(anchor_iou_argmax, ids) # closest gt box index for each of the 15 positive anchors
        gt = tf.gather(gt_boxes, anchor_idx) # closest gt box coordinates for those anchors
        # a: [15, 4] positive anchors, gt: [15, 4] their closest gt boxes
        target_deltas = transforms.bbox2delta(
            a, gt, self.target_means, self.target_stds)
        # target_deltas: [15, (dy, dx, log(dh), log(dw))]
        padding = tf.maximum(self.num_rpn_deltas - tf.shape(target_deltas)[0], 0) # 256-15
        target_deltas = tf.pad(target_deltas, [(0, padding), (0, 0)]) #padding to [256,4], last padding 0

        return target_matchs, target_deltas
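
A quick sanity check of the subsampling budget above, using the example counts from the inline comments (15 positive anchors, 213748 negatives); this is only an illustration, not part of the module:

# Illustration of the anchor subsampling budget (example numbers from the comments above).
num_rpn_deltas, positive_fraction = 256, 0.5
num_pos = 15                                                  # positive anchors found in this image
extra_pos = num_pos - int(num_rpn_deltas * positive_fraction) # 15 - 128 < 0, so all positives are kept
num_neg = 213748                                              # negative anchors found in this image
extra_neg = num_neg - (num_rpn_deltas - num_pos)              # all but 256 - 15 = 241 negatives reset to neutral
print(max(extra_pos, 0), extra_neg)                           # 0 213507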

(4) bbox_target

# bbox_target
import numpy as np
import tensorflow as tf

from detection.core.bbox import geometry, transforms
from detection.utils.misc import *

class ProposalTarget:

    def __init__(self,
                 target_means=(0., 0., 0., 0.),
                 target_stds=(0.1, 0.1, 0.2, 0.2), 
                 num_rcnn_deltas=256,
                 positive_fraction=0.25,
                 pos_iou_thr=0.5,
                 neg_iou_thr=0.5):
        '''
        Compute regression and classification targets for proposals.
        
        Attributes
        ---
            target_means: [4]. Bounding box refinement mean for RCNN.
            target_stds: [4]. Bounding box refinement standard deviation for RCNN.
            num_rcnn_deltas: int. Maximal number of RoIs per image to feed to bbox heads.

        '''
        self.target_means = target_means
        self.target_stds = target_stds
        self.num_rcnn_deltas = num_rcnn_deltas
        self.positive_fraction = positive_fraction
        self.pos_iou_thr = pos_iou_thr
        self.neg_iou_thr = neg_iou_thr
            
    def build_targets(self, proposals_list, gt_boxes, gt_class_ids, img_metas):
        '''
        Generates detection targets for images. Subsamples proposals and
        generates target class IDs, bounding box deltas for each.
        
        Args
        ---
            proposals_list: list of [num_proposals, (y1, x1, y2, x2)] in normalized coordinates.
            gt_boxes: [batch_size, num_gt_boxes, (y1, x1, y2, x2)] in image coordinates.
            gt_class_ids: [batch_size, num_gt_boxes] Integer class IDs.
            img_metas: [batch_size, 11]
            
        Returns
        ---
            rois_list: list of [num_rois, (y1, x1, y2, x2)] in normalized coordinates
            rcnn_target_matchs_list: list of [num_rois]. Integer class IDs.
            rcnn_target_deltas_list: list of [num_positive_rois, (dy, dx, log(dh), log(dw))].
            
        Note that self.num_rcnn_deltas >= num_rois > num_positive_rois. And different 
           images in one batch may have different num_rois and num_positive_rois.
        '''
        
        pad_shapes = calc_pad_shapes(img_metas) # [[1216, 1216]]
        
        rois_list = []
        rcnn_target_matchs_list = []
        rcnn_target_deltas_list = []
        
        for i in range(img_metas.shape[0]):
            rois, target_matchs, target_deltas = self._build_single_target(
                proposals_list[i], gt_boxes[i], gt_class_ids[i], pad_shapes[i])
            rois_list.append(rois) # [192, 4], including pos/neg anchors
            rcnn_target_matchs_list.append(target_matchs) # positive target label, and padding with zero for neg
            rcnn_target_deltas_list.append(target_deltas) # positive target deltas, and padding with zero for neg
        
        return rois_list, rcnn_target_matchs_list, rcnn_target_deltas_list
    
    def _build_single_target(self, proposals, gt_boxes, gt_class_ids, img_shape):
        '''
        Args
        ---
            proposals: [num_proposals, (y1, x1, y2, x2)] in normalized coordinates.
            gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]
            gt_class_ids: [num_gt_boxes]
            img_shape: np.ndarray. [2]. (img_height, img_width)
            
        Returns
        ---
            rois: [num_rois, (y1, x1, y2, x2)]
            target_matchs: [num_positive_rois]
            target_deltas: [num_positive_rois, (dy, dx, log(dh), log(dw))]
        '''
        H, W = img_shape # 1216, 1216
        
        
        gt_boxes, non_zeros = trim_zeros(gt_boxes) # [7, 4], remove padded zero boxes
        gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros) # [7]
        # normalize (y1, x1, y2, x2) => 0~1
        gt_boxes = gt_boxes / tf.constant([H, W, H, W], dtype=tf.float32)
        # [2k, 4] with [7, 4] => [2k, 7] overlap scores
        overlaps = geometry.compute_overlaps(proposals, gt_boxes)
        anchor_iou_argmax = tf.argmax(overlaps, axis=1) # [2000] index of the closest gt box for each proposal
        roi_iou_max = tf.reduce_max(overlaps, axis=1) # [2000] IoU with the closest gt box for each proposal
        # roi_iou_max: [2000],
        positive_roi_bool = (roi_iou_max >= self.pos_iou_thr) #[2000]
        positive_indices = tf.where(positive_roi_bool)[:, 0] #[48, 1] =>[48]
        # get all positive indices, namely get all pos_anchor indices
        negative_indices = tf.where(roi_iou_max < self.neg_iou_thr)[:, 0]
        # get all negative anchor indices
        # Subsample ROIs. Aim for 33% positive
        # Positive ROIs
        positive_count = int(self.num_rcnn_deltas * self.positive_fraction) # 256 * 0.25 = 64
        positive_indices = tf.random.shuffle(positive_indices)[:positive_count] # keep at most 64 positives
        positive_count = tf.shape(positive_indices)[0] # 34
        
        # Negative ROIs. Add enough to maintain positive:negative ratio.
        r = 1.0 / self.positive_fraction
        negative_count = tf.cast(r * tf.cast(positive_count, tf.float32), tf.int32) - positive_count #102
        negative_indices = tf.random.shuffle(negative_indices)[:negative_count] #[102]
        
        # Gather selected ROIs, based on remove redundant pos/neg indices
        positive_rois = tf.gather(proposals, positive_indices) # [34, 4]
        negative_rois = tf.gather(proposals, negative_indices) # [102, 4]
        
        # Assign positive ROIs to GT boxes.
        positive_overlaps = tf.gather(overlaps, positive_indices) # [34, 7]
        roi_gt_box_assignment = tf.argmax(positive_overlaps, axis=1) # [34] index of the closest gt box for each positive ROI
        roi_gt_boxes = tf.gather(gt_boxes, roi_gt_box_assignment) # [34, 4]
        target_matchs = tf.gather(gt_class_ids, roi_gt_box_assignment) # [34]
        # target_matchs, target_deltas all get!!
        # proposal: [34, 4], target: [34, 4]
        target_deltas = transforms.bbox2delta(positive_rois, roi_gt_boxes, self.target_means, self.target_stds)
        # [34, 4] [102, 4]
        rois = tf.concat([positive_rois, negative_rois], axis=0)
        
        N = tf.shape(negative_rois)[0] # 102
        target_matchs = tf.pad(target_matchs, [(0, N)]) # [34] padding after with [N]
        
        target_matchs = tf.stop_gradient(target_matchs) # [34+102]
        target_deltas = tf.stop_gradient(target_deltas) # [34, 4]
        # rois: [34+102, 4]
        return rois, target_matchs, target_deltas
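
The ROI sampling above aims for roughly 25% positives. A small sketch of that arithmetic, using the example counts from the inline comments (34 positives):

# Illustration of the ROI sampling ratio (example numbers from the comments above).
num_rcnn_deltas, positive_fraction = 256, 0.25
positive_cap = int(num_rcnn_deltas * positive_fraction)    # at most 64 positive ROIs
positive_count = 34                                        # e.g. 34 proposals pass the IoU >= 0.5 test
r = 1.0 / positive_fraction                                # 4.0
negative_count = int(r * positive_count) - positive_count  # 102 negatives -> 25% of the ROIs are positive
print(positive_count + negative_count)                     # 136 ROIs fed to the RCNN head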

(5) geometry

import tensorflow as tf

def compute_overlaps(boxes1, boxes2):
    '''Computes IoU overlaps between two sets of boxes.
    boxes1, boxes2: [N, (y1, x1, y2, x2)].
    '''
    # 1. Tile boxes2 and repeat boxes1. This allows us to compare
    # every boxes1 against every boxes2 without loops.
    # TF doesn't have an equivalent to np.repeat() so simulate it
    # using tf.tile() and tf.reshape().
    b1 = tf.reshape(tf.tile(tf.expand_dims(boxes1, 1),
                            [1, 1, tf.shape(boxes2)[0]]), [-1, 4])
    b2 = tf.tile(boxes2, [tf.shape(boxes1)[0], 1])
    # 2. Compute intersections
    b1_y1, b1_x1, b1_y2, b1_x2 = tf.split(b1, 4, axis=1)
    b2_y1, b2_x1, b2_y2, b2_x2 = tf.split(b2, 4, axis=1)
    y1 = tf.maximum(b1_y1, b2_y1)
    x1 = tf.maximum(b1_x1, b2_x1)
    y2 = tf.minimum(b1_y2, b2_y2)
    x2 = tf.minimum(b1_x2, b2_x2)
    intersection = tf.maximum(x2 - x1, 0) * tf.maximum(y2 - y1, 0)
    # 3. Compute unions
    b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)
    b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)
    union = b1_area + b2_area - intersection
    # 4. Compute IoU and reshape to [boxes1, boxes2]
    iou = intersection / union
    overlaps = tf.reshape(iou, [tf.shape(boxes1)[0], tf.shape(boxes2)[0]])
    return overlaps
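
A minimal usage sketch for compute_overlaps (hypothetical boxes, assuming this runs in the same file in eager mode):

# Sketch: IoU of one box against two boxes; box format is (y1, x1, y2, x2).
import tensorflow as tf
boxes1 = tf.constant([[0., 0., 10., 10.]])
boxes2 = tf.constant([[0., 0., 10., 10.],
                      [5., 5., 15., 15.]])
print(compute_overlaps(boxes1, boxes2).numpy())
# approximately [[1.0, 0.1429]]: IoU of the shifted box is 25 / (100 + 100 - 25)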

(6) transforms

# transforms
import tensorflow as tf

from detection.utils.misc import *

def bbox2delta(box, gt_box, target_means, target_stds):
    '''Compute refinement needed to transform box to gt_box.
    
    Args
    ---
        box: [..., (y1, x1, y2, x2)]
        gt_box: [..., (y1, x1, y2, x2)]
        target_means: [4]
        target_stds: [4]
    '''
    target_means = tf.constant(
        target_means, dtype=tf.float32)
    target_stds = tf.constant(
        target_stds, dtype=tf.float32)
    
    box = tf.cast(box, tf.float32)
    gt_box = tf.cast(gt_box, tf.float32)

    height = box[..., 2] - box[..., 0]
    width = box[..., 3] - box[..., 1]
    center_y = box[..., 0] + 0.5 * height
    center_x = box[..., 1] + 0.5 * width

    gt_height = gt_box[..., 2] - gt_box[..., 0]
    gt_width = gt_box[..., 3] - gt_box[..., 1]
    gt_center_y = gt_box[..., 0] + 0.5 * gt_height
    gt_center_x = gt_box[..., 1] + 0.5 * gt_width

    dy = (gt_center_y - center_y) / height
    dx = (gt_center_x - center_x) / width
    dh = tf.math.log(gt_height / height)
    dw = tf.math.log(gt_width / width)

    delta = tf.stack([dy, dx, dh, dw], axis=-1)
    delta = (delta - target_means) / target_stds
    
    return delta

def delta2bbox(box, delta, target_means, target_stds):
    '''Compute bounding box based on roi and delta.
    
    Args
    ---
        box: [N, (y1, x1, y2, x2)] box to update
        delta: [N, (dy, dx, log(dh), log(dw))] refinements to apply
        target_means: [4]
        target_stds: [4]
    '''
    target_means = tf.constant(
        target_means, dtype=tf.float32)
    target_stds = tf.constant(
        target_stds, dtype=tf.float32)
    delta = delta * target_stds + target_means    
    # Convert to y, x, h, w
    height = box[:, 2] - box[:, 0]
    width = box[:, 3] - box[:, 1]
    center_y = box[:, 0] + 0.5 * height
    center_x = box[:, 1] + 0.5 * width
    
    # Apply delta
    center_y += delta[:, 0] * height
    center_x += delta[:, 1] * width
    height *= tf.exp(delta[:, 2])
    width *= tf.exp(delta[:, 3])
    
    # Convert back to y1, x1, y2, x2
    y1 = center_y - 0.5 * height
    x1 = center_x - 0.5 * width
    y2 = y1 + height
    x2 = x1 + width
    result = tf.stack([y1, x1, y2, x2], axis=1)
    return result

def bbox_clip(box, window):
    '''
    Args
    ---
        box: [N, (y1, x1, y2, x2)]
        window: [4] in the form y1, x1, y2, x2
    '''
    # Split
    wy1, wx1, wy2, wx2 = tf.split(window, 4)
    y1, x1, y2, x2 = tf.split(box, 4, axis=1)
    # Clip
    y1 = tf.maximum(tf.minimum(y1, wy2), wy1)
    x1 = tf.maximum(tf.minimum(x1, wx2), wx1)
    y2 = tf.maximum(tf.minimum(y2, wy2), wy1)
    x2 = tf.maximum(tf.minimum(x2, wx2), wx1)
    clipped = tf.concat([y1, x1, y2, x2], axis=1)
    clipped.set_shape((clipped.shape[0], 4))
    return clipped

def bbox_flip(bboxes, width):
    '''
    Flip bboxes horizontally.
    
    Args
    ---
        bboxes: [..., 4]
        width: Int or Float
    '''
    y1, x1, y2, x2 = tf.split(bboxes, 4, axis=-1)
    
    new_x1 = width - x2
    new_x2 = width - x1
    
    flipped = tf.concat([y1, new_x1, y2, new_x2], axis=-1)
    
    return flipped



def bbox_mapping(box, img_meta):
    '''
    Args
    ---
        box: [N, 4]
        img_meta: [11]
    '''
    img_meta = parse_image_meta(img_meta)
    scale = img_meta['scale']
    flip = img_meta['flip']
    
    box = box * scale
    if tf.equal(flip, 1):
        box = bbox_flip(box, img_meta['img_shape'][1])
    
    return box

def bbox_mapping_back(box, img_meta):
    '''
    Args
    ---
        box: [N, 4]
        img_meta: [11]
    '''
    img_meta = parse_image_meta(img_meta)
    scale = img_meta['scale']
    flip = img_meta['flip']
    if tf.equal(flip, 1):
        box = bbox_flip(box, img_meta['img_shape'][1])
    box = box / scale
    
    return box
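
As a sanity check on bbox2delta and delta2bbox above: with the same means/stds they are inverses of each other. A small round-trip sketch (zero means, unit stds, hypothetical boxes, run in the same file):

# Round-trip sketch: delta2bbox(box, bbox2delta(box, gt)) should recover gt.
import tensorflow as tf
box = tf.constant([[10., 10., 50., 90.]])   # (y1, x1, y2, x2)
gt  = tf.constant([[12.,  8., 60., 100.]])
means, stds = (0., 0., 0., 0.), (1., 1., 1., 1.)
delta = bbox2delta(box, gt, means, stds)
print(delta2bbox(box, delta, means, stds).numpy())   # ~[[ 12.   8.  60. 100.]]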

(7) loss

# loss
import tensorflow as tf
from tensorflow import keras


def smooth_l1_loss(y_true, y_pred):
    '''Implements Smooth-L1 loss.
    
    Args
    ---
        y_true and y_pred are typically: [N, 4], but could be any shape.
    '''
    diff = tf.abs(y_true - y_pred)
    less_than_one = tf.cast(tf.less(diff, 1.0), tf.float32)
    loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5)
    return loss
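
# A quick check of the piecewise behaviour above (illustrative values, not part of the original code):
#   diff = 0.5 -> 0.5 * 0.5**2 = 0.125   (quadratic branch, |diff| < 1)
#   diff = 3.0 -> 3.0 - 0.5    = 2.5     (linear branch,   |diff| >= 1)
# i.e. smooth_l1_loss(tf.zeros(2), tf.constant([0.5, 3.0])) evaluates to [0.125, 2.5].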



def rpn_class_loss(target_matchs, rpn_class_logits):
    '''RPN anchor classifier loss.
    
    Args
    ---
        target_matchs: [batch_size, num_anchors]. Anchor match type. 1=positive,
            -1=negative, 0=neutral anchor.
        rpn_class_logits: [batch_size, num_anchors, 2]. RPN classifier logits for FG/BG.
    '''

    # Get anchor classes. Convert the -1/+1 match to 0/1 values.
    anchor_class = tf.cast(tf.equal(target_matchs, 1), tf.int32)
    # Positive and Negative anchors contribute to the loss,
    # but neutral anchors (match value = 0) don't.
    indices = tf.where(tf.not_equal(target_matchs, 0))
    # Pick rows that contribute to the loss and filter out the rest.
    rpn_class_logits = tf.gather_nd(rpn_class_logits, indices)
    anchor_class = tf.gather_nd(anchor_class, indices)
    # Cross entropy loss
    # loss = tf.losses.sparse_softmax_cross_entropy(labels=anchor_class,
    #                                               logits=rpn_class_logits)

    num_classes = rpn_class_logits.shape[-1]
    # print(rpn_class_logits.shape)
    loss = keras.losses.categorical_crossentropy(tf.one_hot(anchor_class, depth=num_classes),
                                                 rpn_class_logits, from_logits=True)

    
    loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)
    return loss


def rpn_bbox_loss(target_deltas, target_matchs, rpn_deltas):
    '''Return the RPN bounding box loss graph.
    
    Args
    ---
        target_deltas: [batch, num_rpn_deltas, (dy, dx, log(dh), log(dw))].
            Uses 0 padding to fill in unused bbox deltas.
        target_matchs: [batch, anchors]. Anchor match type. 1=positive,
            -1=negative, 0=neutral anchor.
        rpn_deltas: [batch, anchors, (dy, dx, log(dh), log(dw))]
    '''
    def batch_pack(x, counts, num_rows):
        '''Picks different number of values from each row
        in x depending on the values in counts.
        '''
        outputs = []
        for i in range(num_rows):
            outputs.append(x[i, :counts[i]])
        return tf.concat(outputs, axis=0)
    
    # Positive anchors contribute to the loss, but negative and
    # neutral anchors (match value of 0 or -1) don't.
    indices = tf.where(tf.equal(target_matchs, 1))

    # Pick bbox deltas that contribute to the loss
    rpn_deltas = tf.gather_nd(rpn_deltas, indices)

    # Trim target bounding box deltas to the same length as rpn_deltas.
    batch_counts = tf.reduce_sum(tf.cast(tf.equal(target_matchs, 1), tf.int32), axis=1)
    target_deltas = batch_pack(target_deltas, batch_counts,
                              target_deltas.shape.as_list()[0])

    loss = smooth_l1_loss(target_deltas, rpn_deltas)
    
    loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)
    
    return loss





def rcnn_class_loss(target_matchs_list, rcnn_class_logits_list):
    '''Loss for the classifier head of Faster RCNN.
    
    Args
    ---
        target_matchs_list: list of [num_rois]. Integer class IDs. Uses zero
            padding to fill in the array.
        rcnn_class_logits_list: list of [num_rois, num_classes]
    '''
    
    class_ids = tf.concat(target_matchs_list, 0)
    class_logits = tf.concat(rcnn_class_logits_list, 0)
    class_ids = tf.cast(class_ids, 'int64')
    
    # loss = tf.losses.sparse_softmax_cross_entropy(labels=class_ids,
    #                                               logits=class_logits)

    num_classes = class_logits.shape[-1]
    # print(class_logits.shape)
    loss = keras.losses.categorical_crossentropy(tf.one_hot(class_ids, depth=num_classes),
                                                 class_logits, from_logits=True)


    loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)
    return loss


def rcnn_bbox_loss(target_deltas_list, target_matchs_list, rcnn_deltas_list):
    '''Loss for Faster R-CNN bounding box refinement.
    
    Args
    ---
        target_deltas_list: list of [num_positive_rois, (dy, dx, log(dh), log(dw))]
        target_matchs_list: list of [num_rois]. Integer class IDs.
        rcnn_deltas_list: list of [num_rois, num_classes, (dy, dx, log(dh), log(dw))]
    '''
    
    target_deltas = tf.concat(target_deltas_list, 0)
    target_class_ids = tf.concat(target_matchs_list, 0)
    rcnn_deltas = tf.concat(rcnn_deltas_list, 0)

    # Only positive ROIs contribute to the loss. And only
    # the right class_id of each ROI. Get their indices.
    positive_roi_ix = tf.where(target_class_ids > 0)[:, 0]
    positive_roi_class_ids = tf.cast(
        tf.gather(target_class_ids, positive_roi_ix), tf.int64)
    indices = tf.stack([positive_roi_ix, positive_roi_class_ids], axis=1)
    
    # Gather the deltas (predicted and true) that contribute to loss
    rcnn_deltas = tf.gather_nd(rcnn_deltas, indices)

    # Smooth-L1 Loss
    loss = smooth_l1_loss(target_deltas, rcnn_deltas)
    loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)

    return loss

Datasets

(8) coco

# coco
import os.path as osp
import cv2
import numpy as np
from pycocotools.coco import COCO

from detection.datasets import transforms, utils


"""
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession


config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)
"""

class CocoDataSet(object):
    def __init__(self, dataset_dir, subset,
                 flip_ratio=0,
                 pad_mode='fixed',
                 mean=(0, 0, 0),
                 std=(1, 1, 1),
                 scale=(1024, 800),
                 debug=False):
        '''Load a subset of the COCO dataset.
        
        Attributes
        ---
            dataset_dir: The root directory of the COCO dataset.
            subset: What to load (train, val).
            flip_ratio: Float. The ratio of flipping an image and its bounding boxes.
            pad_mode: Which padding method to use ('fixed' or 'non-fixed').
            mean: Tuple. Image mean.
            std: Tuple. Image standard deviation.
            scale: Tuple of two integers.
        '''
        
        if subset not in ['train', 'val']:
            raise AssertionError('subset must be "train" or "val".')

        self.coco = COCO("{}/annotations/instances_{}2017.json".format(dataset_dir, subset))

        # get the mapping from original category ids to labels
        self.cat_ids = self.coco.getCatIds()
        self.cat2label = {
            cat_id: i + 1
            for i, cat_id in enumerate(self.cat_ids)
        }
        
        self.img_ids, self.img_infos = self._filter_imgs()
        
        if debug:
            self.img_ids, self.img_infos = self.img_ids[:50], self.img_infos[:50]
            
        self.image_dir = "{}/images/{}2017".format(dataset_dir, subset)
        
        self.flip_ratio = flip_ratio
        
        if pad_mode in ['fixed', 'non-fixed']:
            self.pad_mode = pad_mode
        elif subset == 'train':
            self.pad_mode = 'fixed'
        else:
            self.pad_mode = 'non-fixed'
        
        self.img_transform = transforms.ImageTransform(scale, mean, std, pad_mode)
        self.bbox_transform = transforms.BboxTransform()
        
        
    def _filter_imgs(self, min_size=32):
        '''Filter images too small or without ground truths.
        
        Args
        ---
            min_size: the minimal size of the image.
        '''
        # Filter images without ground truths.
        all_img_ids = list(set([_['image_id'] for _ in self.coco.anns.values()]))
        # Filter images too small.
        img_ids = []
        img_infos = []
        for i in all_img_ids:
            info = self.coco.loadImgs(i)[0]
            
            ann_ids = self.coco.getAnnIds(imgIds=i)
            ann_info = self.coco.loadAnns(ann_ids)
            ann = self._parse_ann_info(ann_info)
            
            if min(info['width'], info['height']) >= min_size and ann['labels'].shape[0] != 0:
                img_ids.append(i)
                img_infos.append(info)
        return img_ids, img_infos
        
    def _load_ann_info(self, idx):
        img_id = self.img_ids[idx]
        ann_ids = self.coco.getAnnIds(imgIds=img_id)
        ann_info = self.coco.loadAnns(ann_ids)
        return ann_info

    def _parse_ann_info(self, ann_info):
        '''Parse bbox annotation.
        
        Args
        ---
            ann_info (list[dict]): Annotation info of an image.
            
        Returns
        ---
            dict: A dict containing the following keys: bboxes, 
                bboxes_ignore, labels.
        '''
        gt_bboxes = []
        gt_labels = []
        gt_bboxes_ignore = []

        for i, ann in enumerate(ann_info):
            if ann.get('ignore', False):
                continue
            x1, y1, w, h = ann['bbox']
            if ann['area'] <= 0 or w < 1 or h < 1:
                continue
            bbox = [y1, x1, y1 + h - 1, x1 + w - 1]
            if ann['iscrowd']:
                gt_bboxes_ignore.append(bbox)
            else:
                gt_bboxes.append(bbox)
                gt_labels.append(self.cat2label[ann['category_id']])

        if gt_bboxes:
            gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
            gt_labels = np.array(gt_labels, dtype=np.int64)
        else:
            gt_bboxes = np.zeros((0, 4), dtype=np.float32)
            gt_labels = np.array([], dtype=np.int64)

        if gt_bboxes_ignore:
            gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
        else:
            gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)

        ann = dict(
            bboxes=gt_bboxes, labels=gt_labels, bboxes_ignore=gt_bboxes_ignore)


        return ann
    
    def __len__(self):
        return len(self.img_infos)
    
    def __getitem__(self, idx):
        '''Load the image and its bboxes for the given index.
        
        Args
        ---
            idx: the index of images.
            
        Returns
        ---
            tuple: A tuple containing the following items: image, 
                bboxes, labels.
        '''
        img_info = self.img_infos[idx]
        ann_info = self._load_ann_info(idx)
        
        # load the image.
        img = cv2.imread(osp.join(self.image_dir, img_info['file_name']), cv2.IMREAD_COLOR)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        
        ori_shape = img.shape
        
        # Load the annotation.
        ann = self._parse_ann_info(ann_info)
        bboxes = ann['bboxes']
        labels = ann['labels']
        
        flip = True if np.random.rand() < self.flip_ratio else False
        
        # Handle the image
        img, img_shape, scale_factor = self.img_transform(img, flip)

        pad_shape = img.shape
        
        # Handle the annotation.
        bboxes, labels = self.bbox_transform(
            bboxes, labels, img_shape, scale_factor, flip)
        
        # Handle the meta info.
        img_meta_dict = dict({
            'ori_shape': ori_shape,
            'img_shape': img_shape,
            'pad_shape': pad_shape,
            'scale_factor': scale_factor,
            'flip': flip
        })

        img_meta = utils.compose_image_meta(img_meta_dict)
        
        return img, img_meta, bboxes, labels
    
    def get_categories(self):
        '''Get list of category names. 
        
        Returns
        ---
            list: A list of category names.
            
        Note that the first item 'bg' means background.
        '''
        return ['bg'] + [self.coco.loadCats(i)[0]["name"] for i in self.cat2label.keys()]
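
Note the bbox format conversion in _parse_ann_info above: COCO annotations store [x1, y1, w, h], while this codebase works with [y1, x1, y2, x2] and inclusive corners (hence the "- 1"). A tiny illustration with hypothetical values:

# Illustration of the conversion done in _parse_ann_info (hypothetical annotation values).
x1, y1, w, h = 100., 50., 30., 20.          # COCO 'bbox' field: [x1, y1, w, h]
bbox = [y1, x1, y1 + h - 1, x1 + w - 1]     # -> [50.0, 100.0, 69.0, 129.0] as (y1, x1, y2, x2)
print(bbox)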


(9) data_generator

# data_generator
import numpy as np



class DataGenerator:
    """
    读取每一个图片的信息
    """

    def __init__(self, dataset, shuffle=False):
        self.dataset = dataset
        self.shuffle = shuffle
    
    def __call__(self):
        indices = np.arange(len(self.dataset))
        if self.shuffle:
            np.random.shuffle(indices)

        for img_idx in indices:
            img, img_meta, bbox, label = self.dataset[img_idx]
            yield img, img_meta, bbox, label

(10) transforms

# transforms
import numpy as np

from detection.datasets.utils import *


class ImageTransform(object):
    '''Preprocess the image.
    
        1. rescale the image to expected size
        2. normalize the image
        3. flip the image (if needed)
        4. pad the image (if needed)
    '''
    def __init__(self,
                 scale=(800, 1333),
                 mean=(0, 0, 0),
                 std=(1, 1, 1),
                 pad_mode='fixed'):
        self.scale = scale
        self.mean = mean
        self.std = std
        self.pad_mode = pad_mode

        self.impad_size = max(scale) if pad_mode == 'fixed' else 64

    def __call__(self, img, flip=False):
        img, scale_factor = imrescale(img, self.scale)
        img_shape = img.shape
        img = imnormalize(img, self.mean, self.std)
          
        if flip:
            img = img_flip(img)
        if self.pad_mode == 'fixed':
            img = impad_to_square(img, self.impad_size)

        else: # 'non-fixed'
            img = impad_to_multiple(img, self.impad_size)
        
        return img, img_shape, scale_factor


class BboxTransform(object):
    '''Preprocess ground truth bboxes.
    
        1. rescale bboxes according to image size
        2. flip bboxes (if needed)
    '''
    def __init__(self):
        pass
    
    def __call__(self, bboxes, labels, 
                 img_shape, scale_factor, flip=False):
 
        bboxes = bboxes * scale_factor
        if flip:
            bboxes = bbox_flip(bboxes, img_shape)
            
        bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[0])
        bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[1])
            
        return bboxes, labels

(11) datasets.utils

# datasets.utils

import cv2
import numpy as np

###########################################
#
# Utility Functions for 
# Image Preprocessing and Data Augmentation
#
###########################################

def img_flip(img):
    '''Flip the image horizontally
    
    Args
    ---
        img: [height, width, channel]
    
    Returns
    ---
        np.ndarray: the flipped image.
    '''
    return np.fliplr(img)

def bbox_flip(bboxes, img_shape):
    '''Flip bboxes horizontally.
    
    Args
    ---
        bboxes: [..., 4]
        img_shape: Tuple. (height, width)
    
    Returns
    ---
        np.ndarray: the flipped bboxes.
    '''
    w = img_shape[1]
    flipped = bboxes.copy()
    flipped[..., 1] = w - bboxes[..., 3] - 1
    flipped[..., 3] = w - bboxes[..., 1] - 1
    return flipped

def impad_to_square(img, pad_size):
    '''Pad an image so that each edge equals pad_size.
    
    Args
    ---
        img: [height, width, channels]. Image to be padded
        pad_size: Int.
    
    Returns
    ---
        ndarray: The padded image with shape of 
            [pad_size, pad_size, channels].
    '''
    shape = (pad_size, pad_size, img.shape[-1])
    
    pad = np.zeros(shape, dtype=img.dtype)
    pad[:img.shape[0], :img.shape[1], ...] = img
    return pad

def impad_to_multiple(img, divisor):
    '''Pad an image so that each edge is a multiple of divisor.
    
    Args
    ---
        img: [height, width, channels]. Image to be padded.
        divisor: Int. Padded image edges will be multiples of divisor.
    
    Returns
    ---
        ndarray: The padded image.
    '''
    pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor
    pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor
    shape = (pad_h, pad_w, img.shape[-1])
    
    pad = np.zeros(shape, dtype=img.dtype)
    pad[:img.shape[0], :img.shape[1], ...] = img
    return pad

def imrescale(img, scale):
    '''Resize image while keeping the aspect ratio.
    
    Args
    ---
        img: [height, width, channels]. The input image.
        scale: Tuple of 2 integers. The image will be rescaled as large as
            possible within the scale.
    
    Returns
    ---
        np.ndarray: the scaled image.
    ''' 
    h, w = img.shape[:2]
    
    max_long_edge = max(scale)
    max_short_edge = min(scale)
    scale_factor = min(max_long_edge / max(h, w),
                       max_short_edge / min(h, w))
    
    new_size = (int(w * float(scale_factor) + 0.5),
                int(h * float(scale_factor) + 0.5))

    rescaled_img = cv2.resize(
        img, new_size, interpolation=cv2.INTER_LINEAR)
    
    return rescaled_img, scale_factor
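
# Worked example of the rescaling above (hypothetical 600x800 input, training scale (800, 1216)):
#   scale_factor = min(1216 / 800, 800 / 600) = 1.333...
#   new_size     = (int(800 * 1.3333 + 0.5), int(600 * 1.3333 + 0.5)) = (1067, 800)
# so the rescaled image is 800x1067 (h x w), matching the (800, 1067) img_shape mentioned
# in the anchor_generator comments.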

def imnormalize(img, mean, std):
    '''Normalize the image.
    
    Args
    ---
        img: [height, width, channel]
        mean: Tuple or np.ndarray. [3]
        std: Tuple or np.ndarray. [3]
    
    Returns
    ---
        np.ndarray: the normalized image.
    '''
    img = (img - mean) / std    
    return img.astype(np.float32)

def imdenormalize(norm_img, mean, std):
    '''Denormalize the image.
    
    Args
    ---
        norm_img: [height, width, channel]
        mean: Tuple or np.ndarray. [3]
        std: Tuple or np.ndarray. [3]
    
    Returns
    ---
        np.ndarray: the denormalized image.
    '''
    img = norm_img * std + mean
    return img.astype(np.float32)

#######################################
#
# Utility Functions for Data Formatting
#
#######################################

def get_original_image(img, img_meta, 
                       mean=(0, 0, 0), std=(1, 1, 1)):
    '''Recover the original image.
    
    Args
    ---
        img: np.ndarray. [height, width, channel]. 
            The transformed image.
        img_meta: np.ndarray. [11]
        mean: Tuple or np.ndarray. [3]
        std: Tuple or np.ndarray. [3]
    
    Returns
    ---
        np.ndarray: the original image.
    '''
    img_meta_dict = parse_image_meta(img_meta)
    ori_shape = img_meta_dict['ori_shape']
    img_shape = img_meta_dict['img_shape']
    flip = img_meta_dict['flip']
    
    img = img[:img_shape[0], :img_shape[1]]
    if flip:
        img = img_flip(img)
    img = cv2.resize(img, (ori_shape[1], ori_shape[0]), 
                     interpolation=cv2.INTER_LINEAR)
    img = imdenormalize(img, mean, std)
    return img

def compose_image_meta(img_meta_dict):
    '''Takes attributes of an image and puts them in one 1D array.

    Args
    ---
        img_meta_dict: dict

    Returns
    ---
        img_meta: np.ndarray
    '''
    ori_shape = img_meta_dict['ori_shape']
    img_shape = img_meta_dict['img_shape']
    pad_shape = img_meta_dict['pad_shape']
    scale_factor = img_meta_dict['scale_factor']
    flip = 1 if img_meta_dict['flip'] else 0
    img_meta = np.array(
        ori_shape +               # size=3
        img_shape +               # size=3
        pad_shape +               # size=3
        tuple([scale_factor]) +   # size=1
        tuple([flip])             # size=1
    ).astype(np.float32)

    return img_meta

def parse_image_meta(img_meta):
    '''Parses an array that contains image attributes to its components.

    Args
    ---
        meta: [11]

    Returns
    ---
        a dict of the parsed values.
    '''
    ori_shape = img_meta[0:3]
    img_shape = img_meta[3:6]
    pad_shape = img_meta[6:9]
    scale_factor = img_meta[9]
    flip = img_meta[10]
    return {
        'ori_shape': ori_shape.astype(np.int32),
        'img_shape': img_shape.astype(np.int32),
        'pad_shape': pad_shape.astype(np.int32),
        'scale_factor': scale_factor.astype(np.float32),
        'flip': flip.astype(bool),
    }
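
A small sketch of the 11-element img_meta vector these two helpers pack and unpack (hypothetical values, just to show the 3 + 3 + 3 + 1 + 1 slot layout):

# Sketch of a compose_image_meta / parse_image_meta round trip (hypothetical shapes).
meta = compose_image_meta({
    'ori_shape': (427, 640, 3),      # original image shape
    'img_shape': (800, 1199, 3),     # shape after rescaling
    'pad_shape': (1216, 1216, 3),    # shape after padding
    'scale_factor': 1.8735,
    'flip': False,
})
print(meta.shape)                    # (11,)
print(parse_image_meta(meta))        # dict with ori_shape / img_shape / pad_shape / scale_factor / flip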

model

(12) resnet

'''ResNet model for Keras.

# Reference:
- [Deep Residual Learning for Image Recognition](
    https://arxiv.org/abs/1512.03385)

'''
# resnet
import tensorflow as tf
from tensorflow.keras import layers

class _Bottleneck(tf.keras.Model):

    def __init__(self, filters, block, 
                 downsampling=False, stride=1, **kwargs):
        super(_Bottleneck, self).__init__(**kwargs)

        filters1, filters2, filters3 = filters
        conv_name_base = 'res' + block + '_branch'
        bn_name_base   = 'bn'  + block + '_branch'

        self.downsampling = downsampling
        self.stride = stride
        self.out_channel = filters3
        
        self.conv2a = layers.Conv2D(filters1, (1, 1), strides=(stride, stride),
                                    kernel_initializer='he_normal',
                                    name=conv_name_base + '2a')
        self.bn2a = layers.BatchNormalization(name=bn_name_base + '2a')

        self.conv2b = layers.Conv2D(filters2, (3, 3), padding='same',
                                    kernel_initializer='he_normal',
                                    name=conv_name_base + '2b')
        self.bn2b = layers.BatchNormalization(name=bn_name_base + '2b')

        self.conv2c = layers.Conv2D(filters3, (1, 1),
                                    kernel_initializer='he_normal',
                                    name=conv_name_base + '2c')
        self.bn2c = layers.BatchNormalization(name=bn_name_base + '2c')
         
        if self.downsampling:
            self.conv_shortcut = layers.Conv2D(filters3, (1, 1), strides=(stride, stride),
                                               kernel_initializer='he_normal',
                                               name=conv_name_base + '1')
            self.bn_shortcut = layers.BatchNormalization(name=bn_name_base + '1')     
    
    def call(self, inputs, training=False):
        x = self.conv2a(inputs)
        x = self.bn2a(x, training=training)
        x = tf.nn.relu(x)
        
        x = self.conv2b(x)
        x = self.bn2b(x, training=training)
        x = tf.nn.relu(x)
        
        x = self.conv2c(x)
        x = self.bn2c(x, training=training)
        
        if self.downsampling:
            shortcut = self.conv_shortcut(inputs)
            shortcut = self.bn_shortcut(shortcut, training=training)
        else:
            shortcut = inputs
            
        x += shortcut
        x = tf.nn.relu(x)
        
        return x
    
    def compute_output_shape(self, input_shape):
        shape = tf.TensorShape(input_shape).as_list()

        shape[1] = shape[1] // self.stride
        shape[2] = shape[2] // self.stride
        shape[-1] = self.out_channel
        return tf.TensorShape(shape)        
        

class ResNet(tf.keras.Model):

    def __init__(self, depth, **kwargs):
        super(ResNet, self).__init__(**kwargs)
              
        if depth not in [50, 101]:
            raise AssertionError('depth must be 50 or 101.')
        self.depth = depth
    
        self.padding = layers.ZeroPadding2D((3, 3))
        self.conv1 = layers.Conv2D(64, (7, 7),
                                   strides=(2, 2),
                                   kernel_initializer='he_normal',
                                   name='conv1')
        self.bn_conv1 = layers.BatchNormalization(name='bn_conv1')
        self.max_pool = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')
        
        self.res2a = _Bottleneck([64, 64, 256], block='2a',
                                 downsampling=True, stride=1)
        self.res2b = _Bottleneck([64, 64, 256], block='2b')
        self.res2c = _Bottleneck([64, 64, 256], block='2c')
        
        self.res3a = _Bottleneck([128, 128, 512], block='3a', 
                                 downsampling=True, stride=2)
        self.res3b = _Bottleneck([128, 128, 512], block='3b')
        self.res3c = _Bottleneck([128, 128, 512], block='3c')
        self.res3d = _Bottleneck([128, 128, 512], block='3d')
        
        self.res4a = _Bottleneck([256, 256, 1024], block='4a', 
                                 downsampling=True, stride=2)
        self.res4b = _Bottleneck([256, 256, 1024], block='4b')
        self.res4c = _Bottleneck([256, 256, 1024], block='4c')
        self.res4d = _Bottleneck([256, 256, 1024], block='4d')
        self.res4e = _Bottleneck([256, 256, 1024], block='4e')
        self.res4f = _Bottleneck([256, 256, 1024], block='4f')
        if self.depth == 101:
            self.res4g = _Bottleneck([256, 256, 1024], block='4g')
            self.res4h = _Bottleneck([256, 256, 1024], block='4h')
            self.res4i = _Bottleneck([256, 256, 1024], block='4i')
            self.res4j = _Bottleneck([256, 256, 1024], block='4j')
            self.res4k = _Bottleneck([256, 256, 1024], block='4k')
            self.res4l = _Bottleneck([256, 256, 1024], block='4l')
            self.res4m = _Bottleneck([256, 256, 1024], block='4m')
            self.res4n = _Bottleneck([256, 256, 1024], block='4n')
            self.res4o = _Bottleneck([256, 256, 1024], block='4o')
            self.res4p = _Bottleneck([256, 256, 1024], block='4p')
            self.res4q = _Bottleneck([256, 256, 1024], block='4q')
            self.res4r = _Bottleneck([256, 256, 1024], block='4r')
            self.res4s = _Bottleneck([256, 256, 1024], block='4s')
            self.res4t = _Bottleneck([256, 256, 1024], block='4t')
            self.res4u = _Bottleneck([256, 256, 1024], block='4u')
            self.res4v = _Bottleneck([256, 256, 1024], block='4v')
            self.res4w = _Bottleneck([256, 256, 1024], block='4w') 
        
        self.res5a = _Bottleneck([512, 512, 2048], block='5a', 
                                 downsampling=True, stride=2)
        self.res5b = _Bottleneck([512, 512, 2048], block='5b')
        self.res5c = _Bottleneck([512, 512, 2048], block='5c')
        
        
        self.out_channel = (256, 512, 1024, 2048)
    
    def call(self, inputs, training=True):
        x = self.padding(inputs)
        x = self.conv1(x)
        x = self.bn_conv1(x, training=training)
        x = tf.nn.relu(x)
        x = self.max_pool(x)
        
        x = self.res2a(x, training=training)
        x = self.res2b(x, training=training)
        C2 = x = self.res2c(x, training=training)
        
        x = self.res3a(x, training=training)
        x = self.res3b(x, training=training)
        x = self.res3c(x, training=training)
        C3 = x = self.res3d(x, training=training)
        
        x = self.res4a(x, training=training)
        x = self.res4b(x, training=training)
        x = self.res4c(x, training=training)
        x = self.res4d(x, training=training)
        x = self.res4e(x, training=training)
        x = self.res4f(x, training=training)
        if self.depth == 101:
            x = self.res4g(x, training=training)
            x = self.res4h(x, training=training)
            x = self.res4i(x, training=training)
            x = self.res4j(x, training=training)
            x = self.res4k(x, training=training)
            x = self.res4l(x, training=training)
            x = self.res4m(x, training=training)
            x = self.res4n(x, training=training)
            x = self.res4o(x, training=training)
            x = self.res4p(x, training=training)
            x = self.res4q(x, training=training)
            x = self.res4r(x, training=training)
            x = self.res4s(x, training=training)
            x = self.res4t(x, training=training)
            x = self.res4u(x, training=training)
            x = self.res4v(x, training=training)
            x = self.res4w(x, training=training) 
        C4 = x
        
        x = self.res5a(x, training=training)
        x = self.res5b(x, training=training)
        C5 = x = self.res5c(x, training=training)
        
        return (C2, C3, C4, C5)
    
    def compute_output_shape(self, input_shape):
        shape = tf.TensorShape(input_shape).as_list()
        batch, H, W, C = shape
        
        C2_shape = tf.TensorShape([batch, H //  4, W //  4, self.out_channel[0]])
        C3_shape = tf.TensorShape([batch, H //  8, W //  8, self.out_channel[1]])
        C4_shape = tf.TensorShape([batch, H // 16, W // 16, self.out_channel[2]])
        C5_shape = tf.TensorShape([batch, H // 32, W // 32, self.out_channel[3]])
        
        return (C2_shape, C3_shape, C4_shape, C5_shape)
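
As with the FPN module below, a quick shape check helps confirm the backbone's output strides. This is a minimal sketch (assuming the imports at the top of this module and eager execution; the 1216x1216 input matches the padded training size):

if __name__ == '__main__':
    # quick sanity check of the backbone output shapes (illustrative only)
    imgs = tf.random.normal((1, 1216, 1216, 3))
    backbone = ResNet(depth=101)
    C2, C3, C4, C5 = backbone(imgs, training=False)
    # expected strides 4/8/16/32:
    # (1, 304, 304, 256) (1, 152, 152, 512) (1, 76, 76, 1024) (1, 38, 38, 2048)
    print(C2.shape, C3.shape, C4.shape, C5.shape)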

(13) bbox_head

# bbox_head
import tensorflow as tf
from    tensorflow.keras import layers

from detection.core.bbox import transforms
from detection.core.loss import losses
from detection.utils.misc import *

class BBoxHead(tf.keras.Model):
    def __init__(self, num_classes, 
                 pool_size=(7, 7),
                 target_means=(0., 0., 0., 0.), 
                 target_stds=(0.1, 0.1, 0.2, 0.2),
                 min_confidence=0.7,
                 nms_threshold=0.3,
                 max_instances=100,
                 **kwargs):
        super(BBoxHead, self).__init__(**kwargs)
        
        self.num_classes = num_classes
        self.pool_size = tuple(pool_size)
        self.target_means = target_means
        self.target_stds = target_stds
        self.min_confidence = min_confidence
        self.nms_threshold = nms_threshold
        self.max_instances = max_instances
        
        self.rcnn_class_loss = losses.rcnn_class_loss
        self.rcnn_bbox_loss = losses.rcnn_bbox_loss
        
        self.rcnn_class_conv1 = layers.Conv2D(1024, self.pool_size, 
                                              padding='valid', name='rcnn_class_conv1')
        
        self.rcnn_class_bn1 = layers.BatchNormalization(name='rcnn_class_bn1')
        
        self.rcnn_class_conv2 = layers.Conv2D(1024, (1, 1), 
                                              name='rcnn_class_conv2')
        
        self.rcnn_class_bn2 = layers.BatchNormalization(name='rcnn_class_bn2')
        
        self.rcnn_class_logits = layers.Dense(num_classes, name='rcnn_class_logits')
        
        self.rcnn_delta_fc = layers.Dense(num_classes * 4, name='rcnn_bbox_fc')
        
    def call(self, inputs, training=True):
        '''
        Args
        ---
            pooled_rois_list: List of [num_rois, pool_size, pool_size, channels]
        
        Returns
        ---
            rcnn_class_logits_list: List of [num_rois, num_classes]
            rcnn_probs_list: List of [num_rois, num_classes]
            rcnn_deltas_list: List of [num_rois, num_classes, (dy, dx, log(dh), log(dw))]
        '''
        pooled_rois_list = inputs
        num_pooled_rois_list = [pooled_rois.shape[0] for pooled_rois in pooled_rois_list]
        pooled_rois = tf.concat(pooled_rois_list, axis=0)
        
        x = self.rcnn_class_conv1(pooled_rois)
        x = self.rcnn_class_bn1(x, training=training)
        x = tf.nn.relu(x)
        
        x = self.rcnn_class_conv2(x)
        x = self.rcnn_class_bn2(x, training=training)
        x = tf.nn.relu(x)
        
        x = tf.squeeze(tf.squeeze(x, 2), 1)
        
        logits = self.rcnn_class_logits(x)
        probs = tf.nn.softmax(logits)
        
        deltas = self.rcnn_delta_fc(x)
        deltas = tf.reshape(deltas, (-1, self.num_classes, 4))
        

        rcnn_class_logits_list = tf.split(logits, num_pooled_rois_list, 0)
        rcnn_probs_list = tf.split(probs, num_pooled_rois_list, 0)
        rcnn_deltas_list = tf.split(deltas, num_pooled_rois_list, 0)

            
        return rcnn_class_logits_list, rcnn_probs_list, rcnn_deltas_list

    def loss(self, 
             rcnn_class_logits_list, rcnn_deltas_list, 
             rcnn_target_matchs_list, rcnn_target_deltas_list):
        """

        :param rcnn_class_logits_list:
        :param rcnn_deltas_list:
        :param rcnn_target_matchs_list:
        :param rcnn_target_deltas_list:
        :return:
        """
        rcnn_class_loss = self.rcnn_class_loss(
            rcnn_target_matchs_list, rcnn_class_logits_list)
        rcnn_bbox_loss = self.rcnn_bbox_loss(
            rcnn_target_deltas_list, rcnn_target_matchs_list, rcnn_deltas_list)
        
        return rcnn_class_loss, rcnn_bbox_loss
        
    def get_bboxes(self, rcnn_probs_list, rcnn_deltas_list, rois_list, img_metas):
        '''
        Args
        ---
            rcnn_probs_list: List of [num_rois, num_classes]
            rcnn_deltas_list: List of [num_rois, num_classes, (dy, dx, log(dh), log(dw))]
            rois_list: List of [num_rois, (y1, x1, y2, x2)]
            img_metas: [batch_size, 11]
        
        Returns
        ---
            detections_list: List of [num_detections, (y1, x1, y2, x2, class_id, score)]
                coordinates are in pixel coordinates.
        '''
        
        pad_shapes = calc_pad_shapes(img_metas)
        detections_list = [
            self._get_bboxes_single(
                rcnn_probs_list[i], rcnn_deltas_list[i], rois_list[i], pad_shapes[i])
            for i in range(img_metas.shape[0])
        ]
        return detections_list  
    
    def _get_bboxes_single(self, rcnn_probs, rcnn_deltas, rois, img_shape):
        '''
        Args
        ---
            rcnn_probs: [num_rois, num_classes]
            rcnn_deltas: [num_rois, num_classes, (dy, dx, log(dh), log(dw))]
            rois: [num_rois, (y1, x1, y2, x2)]
            img_shape: np.ndarray. [2]. (img_height, img_width)       
        '''
        H, W = img_shape   
        # Class IDs per ROI
        class_ids = tf.argmax(rcnn_probs, axis=1, output_type=tf.int32)
        
        # Class probability of the top class of each ROI
        indices = tf.stack([tf.range(rcnn_probs.shape[0]), class_ids], axis=1)
        class_scores = tf.gather_nd(rcnn_probs, indices)
        # Class-specific bounding box deltas
        deltas_specific = tf.gather_nd(rcnn_deltas, indices)
        # Apply bounding box deltas
        # Shape: [num_rois, (y1, x1, y2, x2)] in normalized coordinates        
        refined_rois = transforms.delta2bbox(rois, deltas_specific, self.target_means, self.target_stds)
        
        # Clip boxes to image window
        refined_rois *= tf.constant([H, W, H, W], dtype=tf.float32)
        window = tf.constant([0., 0., H * 1., W * 1.], dtype=tf.float32)
        refined_rois = transforms.bbox_clip(refined_rois, window)
        
        
        # Filter out background boxes
        keep = tf.where(class_ids > 0)[:, 0]
        
        # Filter out low confidence boxes
        if self.min_confidence:
            conf_keep = tf.where(class_scores >= self.min_confidence)[:, 0]
            keep = tf.compat.v2.sets.intersection(tf.expand_dims(keep, 0),
                                            tf.expand_dims(conf_keep, 0))
            keep = tf.sparse.to_dense(keep)[0]
            
        # Apply per-class NMS
        # 1. Prepare variables
        pre_nms_class_ids = tf.gather(class_ids, keep)
        pre_nms_scores = tf.gather(class_scores, keep)
        pre_nms_rois = tf.gather(refined_rois,   keep)
        unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0]

        def nms_keep_map(class_id):
            '''Apply Non-Maximum Suppression on ROIs of the given class.'''
            # Indices of ROIs of the given class
            ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0]
            # Apply NMS
            class_keep = tf.image.non_max_suppression(
                    tf.gather(pre_nms_rois, ixs),
                    tf.gather(pre_nms_scores, ixs),
                    max_output_size=self.max_instances,
                    iou_threshold=self.nms_threshold)
            # Map indices
            class_keep = tf.gather(keep, tf.gather(ixs, class_keep))
            return class_keep

        # 2. Map over class IDs
        nms_keep = []
        for i in range(unique_pre_nms_class_ids.shape[0]):
            nms_keep.append(nms_keep_map(unique_pre_nms_class_ids[i]))
        nms_keep = tf.concat(nms_keep, axis=0)
        
        # 3. Compute intersection between keep and nms_keep
        keep = tf.compat.v2.sets.intersection(tf.expand_dims(keep, 0),
                                        tf.expand_dims(nms_keep, 0))
        keep = tf.sparse.to_dense(keep)[0]
        # Keep top detections
        roi_count = self.max_instances
        class_scores_keep = tf.gather(class_scores, keep)
        num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count)
        top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1]
        keep = tf.gather(keep, top_ids)  
        
        detections = tf.concat([
            tf.gather(refined_rois, keep),
            tf.cast(tf.gather(class_ids, keep), tf.float32)[..., tf.newaxis],
            tf.gather(class_scores, keep)[..., tf.newaxis]
            ], axis=1)
        
        return detections
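
A similar shape check for the box head (a minimal sketch; the ROI count of 192 and the 81 COCO classes are illustrative values):

if __name__ == '__main__':
    # quick sanity check of the BBoxHead output shapes (illustrative only)
    pooled_rois_list = [tf.random.normal((192, 7, 7, 256))]
    head = BBoxHead(num_classes=81)
    logits_list, probs_list, deltas_list = head(pooled_rois_list, training=False)
    # expected: (192, 81) (192, 81) (192, 81, 4)
    print(logits_list[0].shape, probs_list[0].shape, deltas_list[0].shape)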

(14) faster-rcnn

# faster-rcnn
import tensorflow as tf

from detection.models.backbones import resnet
from detection.models.necks import fpn
from detection.models.rpn_heads import rpn_head
from detection.models.bbox_heads import bbox_head
from detection.models.roi_extractors import roi_align
from detection.models.detectors.test_mixins import RPNTestMixin, BBoxTestMixin

from detection.core.bbox import bbox_target


class FasterRCNN(tf.keras.Model, RPNTestMixin, BBoxTestMixin):

    def __init__(self, num_classes, **kwargs):
        super(FasterRCNN, self).__init__(**kwargs)
        # number of object classes (including the background class)
        self.NUM_CLASSES = num_classes
        
        # RPN configuration
        # Anchor attributes
        self.ANCHOR_SCALES = (32, 64, 128, 256, 512)           # anchor sizes in pixels, one scale per pyramid level
        self.ANCHOR_RATIOS = (0.5, 1, 2)                       # anchor aspect ratios (width / height)
        self.ANCHOR_FEATURE_STRIDES = (4, 8, 16, 32, 64)       # stride of each feature map relative to the input image, in pixels
        
        # Bounding box refinement mean and standard deviation
        # used to normalize the RPN regression targets; the deltas are assumed to be zero-mean
        self.RPN_TARGET_MEANS = (0., 0., 0., 0.)
        self.RPN_TARGET_STDS = (0.1, 0.1, 0.2, 0.2)
        
        # RPN training configuration
        # number of anchors sampled per image for the RPN loss
        self.PRN_BATCH_SIZE = 256
        # fraction of sampled anchors that are foreground
        self.RPN_POS_FRAC = 0.5
        self.RPN_POS_IOU_THR = 0.7
        self.RPN_NEG_IOU_THR = 0.3

        # ROIs kept configuration
        # keep up to 2000 proposals after NMS
        self.PRN_PROPOSAL_COUNT = 2000
        # NMS threshold for RPN proposals
        self.PRN_NMS_THRESHOLD = 0.7
        
        # RCNN configuration
        # Bounding box refinement mean and standard deviation
        # used to normalize the RCNN regression targets
        self.RCNN_TARGET_MEANS = (0., 0., 0., 0.)
        self.RCNN_TARGET_STDS = (0.1, 0.1, 0.2, 0.2)
        
        # ROI feature size
        # every proposal is pooled to a 7x7 feature map
        self.POOL_SIZE = (7, 7)
        
        # RCNN training configuration
        # number of ROIs sampled per image for the RCNN loss
        self.RCNN_BATCH_SIZE = 256
        # fraction of sampled ROIs that are foreground
        self.RCNN_POS_FRAC = 0.25
        self.RCNN_POS_IOU_THR = 0.5
        self.RCNN_NEG_IOU_THR = 0.5
        
        # Boxes kept configuration
        # minimum classification score to keep a final detection
        self.RCNN_MIN_CONFIDENCE = 0.7
        self.RCNN_NME_THRESHOLD = 0.3
        self.RCNN_MAX_INSTANCES = 100
        
        # Target generator for the second stage.
        self.bbox_target = bbox_target.ProposalTarget(
            # means used to normalize the RCNN regression targets
            target_means=self.RCNN_TARGET_MEANS,
            # note: this passes the RPN stds; it should probably be self.RCNN_TARGET_STDS,
            # but the two tuples happen to be identical here
            target_stds=self.RPN_TARGET_STDS,
            # number of ROIs sampled per image
            num_rcnn_deltas=self.RCNN_BATCH_SIZE,
            # fraction of sampled ROIs that are foreground
            positive_fraction=self.RCNN_POS_FRAC,
            # IoU threshold for positive ROIs
            pos_iou_thr=self.RCNN_POS_IOU_THR,
            # IoU threshold for negative ROIs
            neg_iou_thr=self.RCNN_NEG_IOU_THR)
                
        # Modules: backbone, FPN neck, RPN head, ROI align, and the RCNN box head
        # 1 backbone network
        self.backbone = resnet.ResNet(
            depth=101, 
            name='res_net')
        # 2 FPN neck
        self.neck = fpn.FPN(
            name='fpn')
        # 3 RPN classification / regression head
        self.rpn_head = rpn_head.RPNHead(
            # anchor sizes in pixels
            anchor_scales=self.ANCHOR_SCALES,
            # anchor aspect ratios (width / height)
            anchor_ratios=self.ANCHOR_RATIOS,
            # feature-map strides relative to the input image, in pixels
            anchor_feature_strides=self.ANCHOR_FEATURE_STRIDES,
            # number of proposals kept after NMS
            proposal_count=self.PRN_PROPOSAL_COUNT,
            # NMS threshold for proposals
            nms_threshold=self.PRN_NMS_THRESHOLD,
            # means used to normalize the regression targets
            target_means=self.RPN_TARGET_MEANS,
            # standard deviations used to normalize the regression targets
            target_stds=self.RPN_TARGET_STDS,
            # number of anchors sampled per image for the RPN loss
            num_rpn_deltas=self.PRN_BATCH_SIZE,
            # fraction of sampled anchors that are foreground
            positive_fraction=self.RPN_POS_FRAC,
            # IoU threshold for positive anchors
            pos_iou_thr=self.RPN_POS_IOU_THR,
            # IoU threshold for negative anchors
            neg_iou_thr=self.RPN_NEG_IOU_THR,
            name='rpn_head')
        # 4 ROI Align layer
        self.roi_align = roi_align.PyramidROIAlign(
            # output size of each pooled region
            pool_shape=self.POOL_SIZE,
            name='pyramid_roi_align')

        # 5 RCNN box head
        self.bbox_head = bbox_head.BBoxHead(
            # number of classes
            num_classes=self.NUM_CLASSES,
            # pooled ROI size
            pool_size=self.POOL_SIZE,
            # means used to normalize the RCNN regression targets
            target_means=self.RCNN_TARGET_MEANS,
            # standard deviations used to normalize the RCNN regression targets
            target_stds=self.RCNN_TARGET_STDS,
            # minimum score to keep a detection
            min_confidence=self.RCNN_MIN_CONFIDENCE,
            # NMS threshold for final detections
            nms_threshold=self.RCNN_NME_THRESHOLD,
            max_instances=self.RCNN_MAX_INSTANCES,
            name='b_box_head')

    def call(self, inputs, training=True):
        """

        :param inputs: [1, 1216, 1216, 3], [1, 11], [1, 14, 4], [1, 14]
        :param training:
        :return:
        """
        if training: # training
            imgs, img_metas, gt_boxes, gt_class_ids = inputs
        else: # inference
            imgs, img_metas = inputs

        # backbone output shapes (C2..C5), e.g.
        # [1, 304, 304, 256] => [1, 152, 152, 512] => [1, 76, 76, 1024] => [1, 38, 38, 2048]
        C2, C3, C4, C5 = self.backbone(imgs, 
                                       training=training)
        # FPN output shapes (P2..P6), e.g.
        # [1, 304, 304, 256], [1, 152, 152, 256], [1, 76, 76, 256], [1, 38, 38, 256], [1, 19, 19, 256]
        P2, P3, P4, P5, P6 = self.neck([C2, C3, C4, C5], 
                                       training=training)
        # keep the pyramid levels in lists: the RPN uses P2-P6, the RCNN head uses P2-P5
        rpn_feature_maps = [P2, P3, P4, P5, P6]
        rcnn_feature_maps = [P2, P3, P4, P5]
        # [1, 369303, 2], [1, 369303, 2], [1, 369303, 4]: predictions for all anchors over every pyramid level
        rpn_class_logits, rpn_probs, rpn_deltas = self.rpn_head(
            rpn_feature_maps, training=training)

        # [369303, 4] => [215169, 4], valid => [6000, 4], performance =>[2000, 4],  NMS
        proposals_list = self.rpn_head.get_proposals(
            rpn_probs, rpn_deltas, img_metas)

        if training: # build the target labels and target deltas for these proposals
            rois_list, rcnn_target_matchs_list, rcnn_target_deltas_list = \
                self.bbox_target.build_targets(
                    proposals_list, gt_boxes, gt_class_ids, img_metas)
        else:
            rois_list = proposals_list
        # rois_list only contains coordinates; rcnn_feature_maps holds the P2-P5 feature maps => [192, 7, 7, 256]
        pooled_regions_list = self.roi_align(
            (rois_list, rcnn_feature_maps, img_metas), training=training)
        # [192, 81], [192, 81], [192, 81, 4]
        rcnn_class_logits_list, rcnn_probs_list, rcnn_deltas_list = \
            self.bbox_head(pooled_regions_list, training=training)
        if training:
            rpn_class_loss, rpn_bbox_loss = self.rpn_head.loss(
                rpn_class_logits, rpn_deltas, gt_boxes, gt_class_ids, img_metas)
            
            rcnn_class_loss, rcnn_bbox_loss = self.bbox_head.loss(
                rcnn_class_logits_list, rcnn_deltas_list, 
                rcnn_target_matchs_list, rcnn_target_deltas_list)
            
            return [rpn_class_loss, rpn_bbox_loss, 
                    rcnn_class_loss, rcnn_bbox_loss]
        else:
            detections_list = self.bbox_head.get_bboxes(
                rcnn_probs_list, rcnn_deltas_list, rois_list, img_metas)
        
            return detections_list

(15) test_mixins

# test_mixins
import numpy as np
import tensorflow as tf

from detection.core.bbox import transforms
from detection.utils.misc import *

class RPNTestMixin:
    
    def simple_test_rpn(self, img, img_meta):
        '''
        Args
        ---
            img: np.ndarray. [height, width, channel]
            img_meta: np.ndarray. [11]
        
        '''
        imgs = tf.Variable(np.expand_dims(img, 0))
        img_metas = tf.Variable(np.expand_dims(img_meta, 0))

        x = self.backbone(imgs, training=False)
        x = self.neck(x, training=False)
        
        rpn_class_logits, rpn_probs, rpn_deltas = self.rpn_head(x, training=False)
        
        proposals_list = self.rpn_head.get_proposals(
            rpn_probs, rpn_deltas, img_metas, with_probs=False)

        return proposals_list[0]
    
class BBoxTestMixin(object):
    
    def _unmold_detections(self, detections_list, img_metas):
        return [
            self._unmold_single_detection(detections_list[i], img_metas[i])
            for i in range(img_metas.shape[0])
        ]

    def _unmold_single_detection(self, detections, img_meta):
        zero_ix = tf.where(tf.not_equal(detections[:, 4], 0))
        detections = tf.gather_nd(detections, zero_ix)

        # Extract boxes, class_ids, scores, and class-specific masks
        boxes = detections[:, :4]
        class_ids = tf.cast(detections[:, 4], tf.int32)
        scores = detections[:, 5]

        boxes = transforms.bbox_mapping_back(boxes, img_meta)

        return {'rois': boxes.numpy(),
                'class_ids': class_ids.numpy(),
                'scores': scores.numpy()}

    def simple_test_bboxes(self, img, img_meta, proposals):
        '''
        Args
        ---
            img: np.ndarray. [height, width, channel]
            img_meta: np.ndarray. [11]
        
        '''
        imgs = tf.Variable(np.expand_dims(img, 0))
        img_metas = tf.Variable(np.expand_dims(img_meta, 0))
        rois_list = [tf.Variable(proposals)]
        
        x = self.backbone(imgs, training=False)
        P2, P3, P4, P5, _ = self.neck(x, training=False)
        
        rcnn_feature_maps = [P2, P3, P4, P5]
        
        
        pooled_regions_list = self.roi_align(
            (rois_list, rcnn_feature_maps, img_metas), training=False)

        rcnn_class_logits_list, rcnn_probs_list, rcnn_deltas_list = \
            self.bbox_head(pooled_regions_list, training=False)
        
        detections_list = self.bbox_head.get_bboxes(
            rcnn_probs_list, rcnn_deltas_list, rois_list, img_metas)
        
        return self._unmold_detections(detections_list, img_metas)[0]
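
A hedged sketch of how these mixins are meant to be driven at inference time (assuming `model` is a built FasterRCNN instance with trained weights, and `img`, `img_meta` come from the COCO dataset code):

# illustrative inference flow using the two mixins (not a complete script)
proposals = model.simple_test_rpn(img, img_meta)            # [num_proposals, 4], normalized coordinates
result = model.simple_test_bboxes(img, img_meta, proposals)
print(result['rois'].shape, result['class_ids'].shape, result['scores'].shape)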

(16) fpn

'''
FPN model for Keras.

# Reference:
- [Feature Pyramid Networks for Object Detection](
    https://arxiv.org/abs/1612.03144)

'''
# fpn
import tensorflow as tf
from tensorflow.keras import layers


class FPN(tf.keras.Model):

    def __init__(self, out_channels=256, **kwargs):
        '''
        Feature Pyramid Networks
        
        Attributes
        ---
            out_channels: int. the channels of pyramid feature maps.
        '''
        super(FPN, self).__init__(**kwargs)
        
        self.out_channels = out_channels
        
        self.fpn_c2p2 = layers.Conv2D(out_channels, (1, 1), 
                                      kernel_initializer='he_normal', name='fpn_c2p2')
        self.fpn_c3p3 = layers.Conv2D(out_channels, (1, 1), 
                                      kernel_initializer='he_normal', name='fpn_c3p3')
        self.fpn_c4p4 = layers.Conv2D(out_channels, (1, 1), 
                                      kernel_initializer='he_normal', name='fpn_c4p4')
        self.fpn_c5p5 = layers.Conv2D(out_channels, (1, 1), 
                                      kernel_initializer='he_normal', name='fpn_c5p5')
        
        self.fpn_p3upsampled = layers.UpSampling2D(size=(2, 2), name='fpn_p3upsampled')
        self.fpn_p4upsampled = layers.UpSampling2D(size=(2, 2), name='fpn_p4upsampled')
        self.fpn_p5upsampled = layers.UpSampling2D(size=(2, 2), name='fpn_p5upsampled')
        
        
        self.fpn_p2 = layers.Conv2D(out_channels, (3, 3), padding='SAME', 
                                    kernel_initializer='he_normal', name='fpn_p2')
        self.fpn_p3 = layers.Conv2D(out_channels, (3, 3), padding='SAME', 
                                    kernel_initializer='he_normal', name='fpn_p3')
        self.fpn_p4 = layers.Conv2D(out_channels, (3, 3), padding='SAME', 
                                    kernel_initializer='he_normal', name='fpn_p4')
        self.fpn_p5 = layers.Conv2D(out_channels, (3, 3), padding='SAME', 
                                    kernel_initializer='he_normal', name='fpn_p5')
        
        self.fpn_p6 = layers.MaxPooling2D(pool_size=(1, 1), strides=2, name='fpn_p6')
        
            
    def call(self, inputs, training=True):
        C2, C3, C4, C5 = inputs
        
        P5 = self.fpn_c5p5(C5)
        P4 = self.fpn_c4p4(C4) + self.fpn_p5upsampled(P5)
        P3 = self.fpn_c3p3(C3) + self.fpn_p4upsampled(P4)
        P2 = self.fpn_c2p2(C2) + self.fpn_p3upsampled(P3)
        
        # Attach 3x3 conv to all P layers to get the final feature maps.
        P2 = self.fpn_p2(P2)
        P3 = self.fpn_p3(P3)
        P4 = self.fpn_p4(P4)
        P5 = self.fpn_p5(P5)
        
        # subsampling from P5 with stride of 2.
        P6 = self.fpn_p6(P5)
        
        return [P2, P3, P4, P5, P6]
        
    def compute_output_shape(self, input_shape):
        C2_shape, C3_shape, C4_shape, C5_shape = input_shape
        
        C2_shape, C3_shape, C4_shape, C5_shape = \
            C2_shape.as_list(), C3_shape.as_list(), C4_shape.as_list(), C5_shape.as_list()
        
        C6_shape = [C5_shape[0], (C5_shape[1] + 1) // 2, (C5_shape[2] + 1) // 2, self.out_channels]
        
        C2_shape[-1] = self.out_channels
        C3_shape[-1] = self.out_channels
        C4_shape[-1] = self.out_channels
        C5_shape[-1] = self.out_channels
        
        return [tf.TensorShape(C2_shape),
                tf.TensorShape(C3_shape),
                tf.TensorShape(C4_shape),
                tf.TensorShape(C5_shape),
                tf.TensorShape(C6_shape)]

if __name__ == '__main__':
    
    C2 = tf.random.normal((2, 256, 256,  256))
    C3 = tf.random.normal((2, 128, 128,  512))
    C4 = tf.random.normal((2,  64,  64, 1024))
    C5 = tf.random.normal((2,  32,  32, 2048))
    
    fpn = FPN()
    
    P2, P3, P4, P5, P6 = fpn([C2, C3, C4, C5])
    
    print('P2 shape:', P2.shape.as_list())
    print('P3 shape:', P3.shape.as_list())
    print('P4 shape:', P4.shape.as_list())
    print('P5 shape:', P5.shape.as_list())
    print('P6 shape:', P6.shape.as_list())
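
If run as-is, this check should print 256-channel maps at every level: P2 [2, 256, 256, 256], P3 [2, 128, 128, 256], P4 [2, 64, 64, 256], P5 [2, 32, 32, 256] and P6 [2, 16, 16, 256], P6 being P5 subsampled by the stride-2 pooling.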

(17) roi_align

# roi_align
import tensorflow as tf

from detection.utils.misc import *


class PyramidROIAlign(tf.keras.layers.Layer):

    def __init__(self, pool_shape, **kwargs):
        '''
        Implements ROI Pooling on multiple levels of the feature pyramid.

        Attributes
        ---
            pool_shape: (height, width) of the output pooled regions.
                Example: (7, 7)
        '''
        super(PyramidROIAlign, self).__init__(**kwargs)

        self.pool_shape = tuple(pool_shape)

    def call(self, inputs, training=True):
        '''
        Args
        ---
            rois_list: list of [num_rois, (y1, x1, y2, x2)] in normalized coordinates.
            feature_map_list: List of [batch, height, width, channels].
                feature maps from different levels of the pyramid.
            img_metas: [batch_size, 11]

        Returns
        ---
            pooled_rois_list: list of [num_rois, pooled_height, pooled_width, channels].
                The width and height are those specific in the pool_shape in the layer
                constructor.
        '''
        rois_list, feature_map_list, img_metas = inputs # [2000 ,4], list:[P2, P3, P4, P5]

        pad_shapes = calc_pad_shapes(img_metas)
        
        pad_areas = pad_shapes[:, 0] * pad_shapes[:, 1] # 1216*1216
        
        num_rois_list = [rois.shape.as_list()[0] for rois in rois_list] # data:[2000]
        roi_indices = tf.constant(
            [i for i in range(len(rois_list)) for _ in range(rois_list[i].shape.as_list()[0])],
            dtype=tf.int32
        ) #[0.....], shape:[2000]
        
        areas = tf.constant(#              range(1)                               range(2000)
            [pad_areas[i] for i in range(pad_areas.shape[0]) for _ in range(num_rois_list[i])],
            dtype=tf.float32
        )#[1216*1216, 1216*1216,...], shape:[2000]


        rois = tf.concat(rois_list, axis=0) # [2000, 4]
        
        # Assign each ROI to a level in the pyramid based on the ROI area.
        y1, x1, y2, x2 = tf.split(rois, 4, axis=1) # 4 of [2000, 1]
        h = y2 - y1 # [2000, 1]
        w = x2 - x1 # [2000, 1]
        
        # Equation 1 in the Feature Pyramid Networks paper. Account for
        # the fact that our coordinates are normalized here.
        # e.g. a 224x224 ROI (in pixels) maps to P4

        roi_level = tf.math.log( # [2000]
                    tf.sqrt(tf.squeeze(h * w, 1))
                    / tf.cast((224.0 / tf.sqrt(areas * 1.0)), tf.float32)
                    ) / tf.math.log(2.0)
        roi_level = tf.minimum(5, tf.maximum( # [2000], clamp to [2-5]
            2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
        # roi_level indicates which pyramid level each ROI is pooled from

        
        # Loop through levels and apply ROI pooling to each. P2 to P5.
        pooled_rois = []
        roi_to_level = []
        for i, level in enumerate(range(2, 6)): # 2,3,4,5
            ix = tf.where(tf.equal(roi_level, level)) # [1999, 1], means 1999 of 2000 select P2
            level_rois = tf.gather_nd(rois, ix) # boxes to crop, [1999, 4]

            # ROI indices for crop_and_resize.
            level_roi_indices = tf.gather_nd(roi_indices, ix) # [1999], data: [0, ..., 0]

            # Keep track of which roi is mapped to which level
            roi_to_level.append(ix)

            # Stop gradient propogation to ROI proposals
            level_rois = tf.stop_gradient(level_rois)
            level_roi_indices = tf.stop_gradient(level_roi_indices)

            # Crop and Resize
            # From Mask R-CNN paper: "We sample four regular locations, so
            # that we can evaluate either max or average pooling. In fact,
            # interpolating only a single value at each bin center (without
            # pooling) is nearly as effective."
            #
            # Here we use the simplified approach of a single value per bin,
            # which is how it's done in tf.crop_and_resize()
            # Result: [batch * num_rois, pool_height, pool_width, channels]
            pooled_rois.append(tf.image.crop_and_resize(
                feature_map_list[i], level_rois, level_roi_indices, self.pool_shape,
                method="bilinear")) # [1, 304, 304, 256], [1999, 4], [1999], [2]=[7,7]=>[1999,7,7,256]
        # [1999, 7, 7, 256], [], [], [1,7,7,256] => [2000, 7, 7, 256]
        # Pack pooled features into one tensor
        pooled_rois = tf.concat(pooled_rois, axis=0)

        # Pack roi_to_level mapping into one array and add another
        # column representing the order of pooled rois
        roi_to_level = tf.concat(roi_to_level, axis=0) # [2000, 1], 1999 of P2, and 1 other P
        roi_range = tf.expand_dims(tf.range(tf.shape(roi_to_level)[0]), 1) # [2000, 1], 0~1999
        roi_to_level = tf.concat([tf.cast(roi_to_level, tf.int32), roi_range],
                                 axis=1) # [2000, 2], (P, range)

        # Rearrange pooled features to match the order of the original rois
        # Sort roi_to_level by batch index, then by roi index.
        # TF doesn't have a way to sort by two columns, so merge them and sort.
        sorting_tensor = roi_to_level[:, 0] * 100000 + roi_to_level[:, 1]
        ix = tf.nn.top_k(sorting_tensor, k=tf.shape( # k=2000
            roi_to_level)[0]).indices[::-1]# reverse the order
        ix = tf.gather(roi_to_level[:, 1], ix) # [2000]
        pooled_rois = tf.gather(pooled_rois, ix) # [2000, 7, 7, 256]
        # 2000 of [7, 7, 256]
        pooled_rois_list = tf.split(pooled_rois, num_rois_list, axis=0)
        return pooled_rois_list
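
To make the level assignment concrete, here is a small worked example of the mapping above (Equation 1 of the FPN paper), assuming a 1216x1216 padded image; the numbers are purely illustrative:

# worked example of the ROI-to-level mapping used in PyramidROIAlign
import math

img_area = 1216.0 * 1216.0
h = w = 224.0 / 1216.0                         # a 224x224 ROI in normalized coordinates
level = math.log2(math.sqrt(h * w) / (224.0 / math.sqrt(img_area)))
level = min(5, max(2, 4 + round(level)))
print(level)                                   # 4 -> this ROI is pooled from P4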

(18) rpn_head

# rpn_head
import  tensorflow as tf
from    tensorflow.keras import layers

from detection.core.bbox import transforms
from detection.utils.misc import *

from detection.core.anchor import anchor_generator, anchor_target
from detection.core.loss import losses

class RPNHead(tf.keras.Model):

    def __init__(self, 
                 anchor_scales=(32, 64, 128, 256, 512), 
                 anchor_ratios=(0.5, 1, 2), 
                 anchor_feature_strides=(4, 8, 16, 32, 64),
                 proposal_count=2000, 
                 nms_threshold=0.7, 
                 target_means=(0., 0., 0., 0.), 
                 target_stds=(0.1, 0.1, 0.2, 0.2), 
                 num_rpn_deltas=256,
                 positive_fraction=0.5,
                 pos_iou_thr=0.7,
                 neg_iou_thr=0.3,
                 **kwargs):
        '''
        Network head of Region Proposal Network.

                                      / - rpn_cls (1x1 conv)
        input - rpn_conv (3x3 conv) -
                                      \ - rpn_reg (1x1 conv)

        Attributes
        ---
            anchor_scales: 1D array of anchor sizes in pixels.
            anchor_ratios: 1D array of anchor ratios of width/height.
            anchor_feature_strides: Stride of the feature map relative 
                to the image in pixels.
            proposal_count: int. RPN proposals kept after non-maximum 
                suppression.
            nms_threshold: float. Non-maximum suppression threshold to 
                filter RPN proposals.
            target_means: [4] Bounding box refinement mean.
            target_stds: [4] Bounding box refinement standard deviation.
            num_rpn_deltas: int.
            positive_fraction: float.
            pos_iou_thr: float.
            neg_iou_thr: float.
        '''
        super(RPNHead, self).__init__(**kwargs)
        
        self.proposal_count = proposal_count
        self.nms_threshold = nms_threshold
        self.target_means = target_means
        self.target_stds = target_stds

        self.generator = anchor_generator.AnchorGenerator(
            scales=anchor_scales, 
            ratios=anchor_ratios, 
            feature_strides=anchor_feature_strides)
        
        self.anchor_target = anchor_target.AnchorTarget(
            target_means=target_means, 
            target_stds=target_stds,
            num_rpn_deltas=num_rpn_deltas,
            positive_fraction=positive_fraction,
            pos_iou_thr=pos_iou_thr,
            neg_iou_thr=neg_iou_thr)
        
        self.rpn_class_loss = losses.rpn_class_loss
        self.rpn_bbox_loss = losses.rpn_bbox_loss
        
        
        # Shared convolutional base of the RPN
        self.rpn_conv_shared = layers.Conv2D(512, (3, 3), padding='same',
                                             kernel_initializer='he_normal', 
                                             name='rpn_conv_shared')
        
        self.rpn_class_raw = layers.Conv2D(len(anchor_ratios) * 2, (1, 1),
                                           kernel_initializer='he_normal', 
                                           name='rpn_class_raw')

        self.rpn_delta_pred = layers.Conv2D(len(anchor_ratios) * 4, (1, 1),
                                           kernel_initializer='he_normal', 
                                           name='rpn_bbox_pred')
        
    def call(self, inputs, training=True):
        '''
        Args
        ---
            inputs: [batch_size, feat_map_height, feat_map_width, channels] 
                one level of pyramid feat-maps.
        
        Returns
        ---
            rpn_class_logits: [batch_size, num_anchors, 2]
            rpn_probs: [batch_size, num_anchors, 2]
            rpn_deltas: [batch_size, num_anchors, 4]
        '''
        
        layer_outputs = []
        
        for feat in inputs: # for every anchors feature maps
            """
            (1, 304, 304, 256)
            (1, 152, 152, 256)
            (1, 76, 76, 256)
            (1, 38, 38, 256)
            (1, 19, 19, 256)
            rpn_class_raw: (1, 304, 304, 6)
            rpn_class_logits: (1, 277248, 2)
            rpn_delta_pred: (1, 304, 304, 12)
            rpn_deltas: (1, 277248, 4)
            rpn_class_raw: (1, 152, 152, 6)
            rpn_class_logits: (1, 69312, 2)
            rpn_delta_pred: (1, 152, 152, 12)
            rpn_deltas: (1, 69312, 4)
            rpn_class_raw: (1, 76, 76, 6)
            rpn_class_logits: (1, 17328, 2)
            rpn_delta_pred: (1, 76, 76, 12)
            rpn_deltas: (1, 17328, 4)
            rpn_class_raw: (1, 38, 38, 6)
            rpn_class_logits: (1, 4332, 2)
            rpn_delta_pred: (1, 38, 38, 12)
            rpn_deltas: (1, 4332, 4)
            rpn_class_raw: (1, 19, 19, 6)
            rpn_class_logits: (1, 1083, 2)
            rpn_delta_pred: (1, 19, 19, 12)
            rpn_deltas: (1, 1083, 4)

            """
            # print(feat.shape)
            shared = self.rpn_conv_shared(feat)
            shared = tf.nn.relu(shared)

            x = self.rpn_class_raw(shared)
            # print('rpn_class_raw:', x.shape)
            rpn_class_logits = tf.reshape(x, [tf.shape(x)[0], -1, 2])
            rpn_probs = tf.nn.softmax(rpn_class_logits)
            # print('rpn_class_logits:', rpn_class_logits.shape)

            x = self.rpn_delta_pred(shared)
            # print('rpn_delta_pred:', x.shape)
            rpn_deltas = tf.reshape(x, [tf.shape(x)[0], -1, 4])
            # print('rpn_deltas:', rpn_deltas.shape)
            
            layer_outputs.append([rpn_class_logits, rpn_probs, rpn_deltas])
            # print(rpn_class_logits.shape, rpn_probs.shape, rpn_deltas.shape)
            """
            (1, 277248, 2) (1, 277248, 2) (1, 277248, 4)
            (1, 69312, 2) (1, 69312, 2) (1, 69312, 4)
            (1, 17328, 2) (1, 17328, 2) (1, 17328, 4)
            (1, 4332, 2) (1, 4332, 2) (1, 4332, 4)
            (1, 1083, 2) (1, 1083, 2) (1, 1083, 4)

            """

        outputs = list(zip(*layer_outputs))
        outputs = [tf.concat(list(o), axis=1) for o in outputs]
        rpn_class_logits, rpn_probs, rpn_deltas = outputs
        # (1, 369303, 2) (1, 369303, 2) (1, 369303, 4)
        # print(rpn_class_logits.shape, rpn_probs.shape, rpn_deltas.shape)
        
        return rpn_class_logits, rpn_probs, rpn_deltas

    def loss(self, rpn_class_logits, rpn_deltas, gt_boxes, gt_class_ids, img_metas):
        """

        :param rpn_class_logits: [N, 2]
        :param rpn_deltas: [N, 4]
        :param gt_boxes:  [GT_N]
        :param gt_class_ids:  [GT_N]
        :param img_metas: [11]
        :return:
        """
        # valid_flags indicates anchors located in padded area or not.
        anchors, valid_flags = self.generator.generate_pyramid_anchors(img_metas)

        # build classification and regression targets for the sampled anchors
        rpn_target_matchs, rpn_target_deltas = self.anchor_target.build_targets(
            anchors, valid_flags, gt_boxes, gt_class_ids)
        
        rpn_class_loss = self.rpn_class_loss(
            rpn_target_matchs, rpn_class_logits)
        rpn_bbox_loss = self.rpn_bbox_loss(
            rpn_target_deltas, rpn_target_matchs, rpn_deltas)
        
        return rpn_class_loss, rpn_bbox_loss
    
    def get_proposals(self, 
                      rpn_probs, 
                      rpn_deltas, 
                      img_metas, 
                      with_probs=False):
        '''
        Calculate proposals.
        
        Args
        ---
            rpn_probs: [batch_size, num_anchors, (bg prob, fg prob)]
            rpn_deltas: [batch_size, num_anchors, (dy, dx, log(dh), log(dw))]
            img_metas: [batch_size, 11]
            with_probs: bool.
        
        Returns
        ---
            proposals_list: list of [num_proposals, (y1, x1, y2, x2)] in 
                normalized coordinates if with_probs is False. 
                Otherwise, the shape of proposals in proposals_list is 
                [num_proposals, (y1, x1, y2, x2, score)]
        
        Note that num_proposals is no more than proposal_count. And different 
           images in one batch may have different num_proposals.
        '''
        anchors, valid_flags = self.generator.generate_pyramid_anchors(img_metas)
        # [369303, 4], [b, 11]
        # [b, N, (background prob, foreground prob)], get anchor's foreground prob, [1, 369303]
        rpn_probs = rpn_probs[:, :, 1]
        # [[1216, 1216]]
        pad_shapes = calc_pad_shapes(img_metas)
        
        proposals_list = [
            self._get_proposals_single(
                rpn_probs[i], rpn_deltas[i], anchors, valid_flags[i], pad_shapes[i], with_probs)
            for i in range(img_metas.shape[0])
        ]
        
        return proposals_list
    
    def _get_proposals_single(self, 
                              rpn_probs, 
                              rpn_deltas, 
                              anchors, 
                              valid_flags, 
                              img_shape, 
                              with_probs):
        '''
        Calculate proposals.
        
        Args
        ---
            rpn_probs: [num_anchors]
            rpn_deltas: [num_anchors, (dy, dx, log(dh), log(dw))]
            anchors: [num_anchors, (y1, x1, y2, x2)] anchors defined in 
                pixel coordinates.
            valid_flags: [num_anchors]
            img_shape: np.ndarray. [2]. (img_height, img_width)
            with_probs: bool.
        
        Returns
        ---
            proposals: [num_proposals, (y1, x1, y2, x2)] in normalized 
                coordinates.
        '''
        
        H, W = img_shape
        
        # filter invalid anchors, int => bool
        valid_flags = tf.cast(valid_flags, tf.bool)
        # [369303] => [215169], respectively
        rpn_probs = tf.boolean_mask(rpn_probs, valid_flags)
        rpn_deltas = tf.boolean_mask(rpn_deltas, valid_flags)
        anchors = tf.boolean_mask(anchors, valid_flags)

        # Improve performance
        pre_nms_limit = min(6000, anchors.shape[0]) # min(6000, 215169) => 6000
        ix = tf.nn.top_k(rpn_probs, pre_nms_limit, sorted=True).indices
        # [215169] => [6000], respectively
        rpn_probs = tf.gather(rpn_probs, ix)
        rpn_deltas = tf.gather(rpn_deltas, ix)
        anchors = tf.gather(anchors, ix)
        
        # Get refined anchors, => [6000, 4]
        proposals = transforms.delta2bbox(anchors, rpn_deltas, 
                                          self.target_means, self.target_stds)
        # clipping to valid area, [6000, 4]
        window = tf.constant([0., 0., H, W], dtype=tf.float32)
        proposals = transforms.bbox_clip(proposals, window)
        
        # Normalize, (y1, x1, y2, x2)
        proposals = proposals / tf.constant([H, W, H, W], dtype=tf.float32)
        
        # NMS, indices: [2000]
        indices = tf.image.non_max_suppression(
            proposals, rpn_probs, self.proposal_count, self.nms_threshold)
        proposals = tf.gather(proposals, indices) # [2000, 4]
        
        if with_probs:
            proposal_probs = tf.expand_dims(tf.gather(rpn_probs, indices), axis=1)
            proposals = tf.concat([proposals, proposal_probs], axis=1)
   
        return proposals
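
For a standalone check of the head itself, a minimal sketch with random pyramid features (the feature sizes match the 1216x1216 example above; anchor generation and proposal selection need real image metas and are not exercised here):

if __name__ == '__main__':
    # quick sanity check of the RPN head output shapes (illustrative only)
    feats = [tf.random.normal((1, s, s, 256)) for s in (304, 152, 76, 38, 19)]
    rpn = RPNHead()
    rpn_class_logits, rpn_probs, rpn_deltas = rpn(feats, training=False)
    # expected: (1, 369303, 2) (1, 369303, 2) (1, 369303, 4)
    print(rpn_class_logits.shape, rpn_probs.shape, rpn_deltas.shape)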

(19) misc

# misc
import tensorflow as tf

def trim_zeros(boxes, name=None):
    '''
    Often boxes are represented with matrices of shape [N, 4] and
    are padded with zeros. This removes the zero boxes.
    
    Args
    ---
        boxes: [N, 4] matrix of boxes.
    
    Returns
    ---
        boxes: [M, 4] the non-zero boxes.
        non_zeros: [N] a 1D boolean mask identifying the rows that were kept.
    '''
    non_zeros = tf.cast(tf.reduce_sum(tf.abs(boxes), axis=1), tf.bool)
    boxes = tf.boolean_mask(boxes, non_zeros, name=name)
    return boxes, non_zeros

def parse_image_meta(meta):
    '''
    Parses a tensor that contains image attributes to its components.
    
    Args
    ---
        meta: [..., 11]

    Returns
    ---
        a dict of the parsed tensors.
    '''
    meta = meta.numpy()
    ori_shape = meta[..., 0:3]
    img_shape = meta[..., 3:6]
    pad_shape = meta[..., 6:9]
    scale = meta[..., 9]  
    flip = meta[..., 10]
    return {
        'ori_shape': ori_shape,
        'img_shape': img_shape,
        'pad_shape': pad_shape,
        'scale': scale,
        'flip': flip
    }

def calc_batch_padded_shape(meta):
    '''
    Args
    ---
        meta: [batch_size, 11]
    
    Returns
    ---
        np.ndarray. Tuple of (height, width)
    '''
    return tf.cast(tf.reduce_max(meta[:, 6:8], axis=0), tf.int32).numpy()

def calc_img_shapes(meta):
    '''
    Args
    ---
        meta: [..., 11]
    
    Returns
    ---
        np.ndarray. [..., (height, width)]
    '''
    return tf.cast(meta[..., 3:5], tf.int32).numpy()


def calc_pad_shapes(meta):
    '''
    Args
    ---
        meta: [..., 11]
    
    Returns
    ---
        np.ndarray. [..., (height, width)]
    '''
    return tf.cast(meta[..., 6:8], tf.int32).numpy()
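
The exact layout of the 11-element meta vector is produced by the dataset code and is not shown here; the sketch below only mirrors the slicing used by the helpers above, with illustrative numbers for a 600x800 (height x width) image scaled by 1.52 and padded to 1216x1216:

if __name__ == '__main__':
    # illustrative meta vector matching the slicing in the helpers above
    meta = tf.constant([[600., 800., 3.,      # ori_shape  (raw image H, W, C)
                         912., 1216., 3.,     # img_shape  after resizing
                         1216., 1216., 3.,    # pad_shape  after padding
                         1.52,                # scale factor
                         0.]])                # horizontal-flip flag
    print(calc_pad_shapes(meta))              # [[1216 1216]]
    print(calc_img_shapes(meta))              # [[ 912 1216]]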
