# train
import os
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
from detection.datasets import coco, data_generator
from detection.models.detectors import faster_rcnn
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
## Added to work around "Failed to get convolution algorithm. This is probably because cuDNN failed to initialize"
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
def train():
# 1. Build the COCO dataset wrapper and get the preprocessed dataset
train_dataset = coco.CocoDataSet('./data/coco2017', 'train',
flip_ratio=0.5,
pad_mode='fixed',
mean=(123.675, 116.28, 103.53),
std=(1., 1., 1.),
scale=(800, 1216))
print('=====', train_dataset)
# 2. Data handling: wrap the dataset in a generator; each sample is a tuple of 4 multi-dimensional arrays
# tf.data.Dataset
train_generator = data_generator.DataGenerator(train_dataset)
print("+++++", train_generator)
# 3. train_generator is a callable that yields (img, img_meta, bboxes, labels); wrap it in a tf.data.Dataset
tf_dataset = tf.data.Dataset.from_generator(train_generator,
(tf.float32, tf.float32, tf.float32, tf.float32))
print('------', tf_dataset)
# 4. Batch size is 1 (one image per batch); prefetch and shuffle
tf_dataset = tf_dataset.batch(1).prefetch(100).shuffle(100)
# 5. Build the model and train
num_classes = len(train_dataset.get_categories())
print('-=-=-=-=', num_classes)
# 6. Create the model
model = faster_rcnn.FasterRCNN(num_classes=num_classes)
# 7. Define the optimizer
optimizer = tf.keras.optimizers.SGD(1e-3, momentum=0.9, nesterov=True)
# 8. Training loop: 1 epoch
for epoch in range(1):
# Because of .batch(1) above, every tensor carries a leading batch dimension
for (batch, inputs) in enumerate(tf_dataset):
# The 4 tensors are: images, image meta info, ground-truth boxes, and ground-truth labels
batch_imgs, batch_metas, batch_bboxes, batch_labels = inputs
print("batch_imgs形状:",batch_imgs.shape)
print(batch_imgs, batch_metas, batch_bboxes, batch_labels)
with tf.GradientTape() as tape:
# Losses from each head
rpn_class_loss, rpn_bbox_loss, rcnn_class_loss, rcnn_bbox_loss = \
model((batch_imgs, batch_metas, batch_bboxes, batch_labels), training=True)
# Total loss = RPN losses + RCNN losses
loss = rpn_class_loss + rpn_bbox_loss + rcnn_class_loss + rcnn_bbox_loss
# Compute gradients of the loss
grads = tape.gradient(loss, model.trainable_variables)
# Apply the gradients to the trainable variables
optimizer.apply_gradients(zip(grads, model.trainable_variables))
print("迭代次数:%d, batch大小:%d, 损失大小:%f" % (epoch+1, batch+1, loss))
def test():
train_dataset = coco.CocoDataSet('./data/coco2017', 'val')
# Build the dataset and the model
train_generator = data_generator.DataGenerator(train_dataset)
tf_dataset = tf.data.Dataset.from_generator(train_generator,
(tf.float32, tf.float32, tf.float32, tf.float32))
tf_dataset = tf_dataset.batch(1).prefetch(100).shuffle(100)
num_classes = len(train_dataset.get_categories())
model = faster_rcnn.FasterRCNN(num_classes=num_classes)
print(num_classes)
for (batch, inputs) in enumerate(tf_dataset):
img, img_meta, _, _ = inputs
print(img, img_meta)
detections_list = model((img, img_meta), training=False)
print(detections_list)
if __name__ == '__main__':
train()
# test()
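A note on the cuDNN workaround at the top of this file: CUDA_VISIBLE_DEVICES only selects which GPU is visible. The commented-out InteractiveSession block in the coco module below attacks the same "Failed to get convolution algorithm" error by enabling memory growth; a minimal TF2-native sketch of that idea (not part of this repo) is:

import tensorflow as tf

# Enable memory growth on every visible GPU so TensorFlow does not grab all
# GPU memory up front, a common cause of the cuDNN initialization failure.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)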
core
(2) anchor_generator
# anchor_generator
import tensorflow as tf
from detection.utils.misc import calc_img_shapes, calc_batch_padded_shape
class AnchorGenerator:
"""
This class operate on padded iamge, eg. [1216, 1216]
and generate scales*ratios number of anchor boxes for each point in
padded image, with stride = feature_strides
number of anchor = (1216 // feature_stride)^2
number of anchor boxes = number of anchor * (scales_len*ratio_len)
"""
def __init__(self,
scales=(32, 64, 128, 256, 512),
ratios=(0.5, 1, 2),
feature_strides=(4, 8, 16, 32, 64)):
'''
Anchor Generator
Attributes
---
scales: 1D array of anchor sizes in pixels.
ratios: 1D array of anchor ratios of width/height.
feature_strides: Stride of the feature map relative to the image in pixels.
'''
self.scales = scales
self.ratios = ratios
self.feature_strides = feature_strides
def generate_pyramid_anchors(self, img_metas):
'''
Generate the multi-level anchors for Region Proposal Network
Args
---
img_metas: [batch_size, 11]
Returns
---
anchors: [num_anchors, (y1, x1, y2, x2)] in image coordinates.
valid_flags: [batch_size, num_anchors]
'''
# generate anchors
pad_shape = calc_batch_padded_shape(img_metas) # [1216, 1216]
# : [(304, 304), (152, 152), (76, 76), (38, 38), (19, 19)]
feature_shapes = [(pad_shape[0] // stride, pad_shape[1] // stride)
for stride in self.feature_strides]
anchors = [
self._generate_level_anchors(level, feature_shape)
for level, feature_shape in enumerate(feature_shapes)
] # [277248, 4], [69312, 4], [17328, 4], [4332, 4], [1083, 4]
anchors = tf.concat(anchors, axis=0) # [369303, 4]
# print('total anchors:', anchors.shape)
# print('---------')
# generate valid flags
img_shapes = calc_img_shapes(img_metas) # (800, 1067)
valid_flags = [
self._generate_valid_flags(anchors, img_shapes[i])
for i in range(img_shapes.shape[0])
]
valid_flags = tf.stack(valid_flags, axis=0)
anchors = tf.stop_gradient(anchors)
valid_flags = tf.stop_gradient(valid_flags)
return anchors, valid_flags
def _generate_valid_flags(self, anchors, img_shape):
'''
Mark anchor boxes whose centers fall on the padded area as invalid.
Args
---
anchors: [num_anchors, (y1, x1, y2, x2)] in image coordinates.
img_shape: Tuple. (height, width, channels)
Returns
---
valid_flags: [num_anchors]
'''
y_center = (anchors[:, 2] + anchors[:, 0]) / 2 # [369300]
x_center = (anchors[:, 3] + anchors[:, 1]) / 2
valid_flags = tf.ones(anchors.shape[0], dtype=tf.int32) # [369300]
zeros = tf.zeros(anchors.shape[0], dtype=tf.int32)
# set boxes whose center is out of image area as invalid.
valid_flags = tf.where(y_center <= img_shape[0], valid_flags, zeros)
valid_flags = tf.where(x_center <= img_shape[1], valid_flags, zeros)
return valid_flags
def _generate_level_anchors(self, level, feature_shape):
'''Generate the anchors given the spatial shape of the feature map.
For a padded image of [1216, 1216] the per-level results are:
scale 32,  ratios (0.5, 1, 2), positions (304, 304) -> boxes (277248, 4)
scale 64,  ratios (0.5, 1, 2), positions (152, 152) -> boxes (69312, 4)
scale 128, ratios (0.5, 1, 2), positions (76, 76)   -> boxes (17328, 4)
scale 256, ratios (0.5, 1, 2), positions (38, 38)   -> boxes (4332, 4)
scale 512, ratios (0.5, 1, 2), positions (19, 19)   -> boxes (1083, 4)
total anchors: (369303, 4)
Args
---
feature_shape: (height, width)
Returns
---
anchors: tf.Tensor. [anchors_num, (y1, x1, y2, x2)]
'''
scale = self.scales[level]
ratios = self.ratios
feature_stride = self.feature_strides[level]
# Get all combinations of scales and ratios
scales, ratios = tf.meshgrid([float(scale)], ratios)
scales = tf.reshape(scales, [-1]) # [32, 32, 32]
ratios = tf.reshape(ratios, [-1]) # [0.5, 1, 2]
# Enumerate heights and widths from scales and ratios
heights = scales / tf.sqrt(ratios) # [45, 32, 22], square root
widths = scales * tf.sqrt(ratios) # [22, 32, 45]
# Enumerate shifts in feature space, [0, 4, ..., 1216-4]
shifts_y = tf.multiply(tf.range(feature_shape[0]), feature_stride)
shifts_x = tf.multiply(tf.range(feature_shape[1]), feature_stride)
shifts_x, shifts_y = tf.cast(shifts_x, tf.float32), tf.cast(shifts_y, tf.float32)
shifts_x, shifts_y = tf.meshgrid(shifts_x, shifts_y) # [304, 304, 2] coordinates
# Enumerate combinations of shifts, widths, and heights # mesh A: [3] B:[304,304]=>[92416] =>[92416,3,2]
box_widths, box_centers_x = tf.meshgrid(widths, shifts_x)
box_heights, box_centers_y = tf.meshgrid(heights, shifts_y)
# Reshape to get a list of (y, x) and a list of (h, w)
box_centers = tf.reshape(tf.stack([box_centers_y, box_centers_x], axis=2), (-1, 2))
box_sizes = tf.reshape(tf.stack([box_heights, box_widths], axis=2), (-1, 2))
# Convert to corner coordinates (y1, x1, y2, x2) [304x304, 3, 4] => [277248, 4]
boxes = tf.concat([box_centers - 0.5 * box_sizes,
box_centers + 0.5 * box_sizes], axis=1)
# print('scale:', scale)
# print('ratios:', ratios)
# print('pos:', shifts_x.shape, shifts_y.shape)
# print('boxes:', boxes.shape)
return boxes
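To make the scale/ratio arithmetic above concrete, here is a minimal standalone sketch (not part of the repo) of how one level's anchor heights and widths follow from a single scale and the three ratios; each anchor keeps an area of roughly scale**2 while its width/height equals the ratio:

import tensorflow as tf

scale = 32.0
ratios = tf.constant([0.5, 1.0, 2.0])
heights = scale / tf.sqrt(ratios)        # ~[45.25, 32.00, 22.63]
widths = scale * tf.sqrt(ratios)         # ~[22.63, 32.00, 45.25]
print((widths / heights).numpy())        # [0.5, 1.0, 2.0]: the aspect ratio equals the ratio
print((heights * widths).numpy())        # ~[1024, 1024, 1024]: the area stays ~scale**2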
(3) anchor_target
# anchor_target
import tensorflow as tf
from detection.core.bbox import geometry, transforms
from detection.utils.misc import trim_zeros
class AnchorTarget:
"""
for every generated anchors boxes: [326393, 4],
create its rpn_target_matchs and rpn_target_matchs
which is used to train RPN network.
"""
def __init__(self,
target_means=(0., 0., 0., 0.),
target_stds=(0.1, 0.1, 0.2, 0.2),
num_rpn_deltas=256,
positive_fraction=0.5,
pos_iou_thr=0.7,
neg_iou_thr=0.3):
'''
Compute regression and classification targets for anchors.
Attributes
---
target_means: [4]. Bounding box refinement mean for RPN.
target_stds: [4]. Bounding box refinement standard deviation for RPN.
num_rpn_deltas: int. Maximal number of Anchors per image to feed to rpn heads.
positive_fraction: float.
pos_iou_thr: float.
neg_iou_thr: float.
'''
self.target_means = target_means
self.target_stds = target_stds
self.num_rpn_deltas = num_rpn_deltas
self.positive_fraction = positive_fraction
self.pos_iou_thr = pos_iou_thr
self.neg_iou_thr = neg_iou_thr
def build_targets(self, anchors, valid_flags, gt_boxes, gt_class_ids):
'''
Given the anchors and GT boxes, compute overlaps and identify positive
anchors and deltas to refine them to match their corresponding GT boxes.
Args
---
anchors: [num_anchors, (y1, x1, y2, x2)] in image coordinates.
valid_flags: [batch_size, num_anchors]
gt_boxes: [batch_size, num_gt_boxes, (y1, x1, y2, x2)] in image
coordinates. batch_size = 1 usually
gt_class_ids: [batch_size, num_gt_boxes] Integer class IDs.
Returns
---
rpn_target_matchs: [batch_size, num_anchors] matches between anchors and GT boxes.
1 = positive anchor, -1 = negative anchor, 0 = neutral anchor
rpn_target_deltas: [batch_size, num_rpn_deltas, (dy, dx, log(dh), log(dw))]
Anchor bbox deltas.
'''
rpn_target_matchs = []
rpn_target_deltas = []
num_imgs = gt_class_ids.shape[0] # namely, batchsz , 1
for i in range(num_imgs):
target_match, target_delta = self._build_single_target(
anchors, valid_flags[i], gt_boxes[i], gt_class_ids[i])
rpn_target_matchs.append(target_match)
rpn_target_deltas.append(target_delta)
rpn_target_matchs = tf.stack(rpn_target_matchs)
rpn_target_deltas = tf.stack(rpn_target_deltas)
rpn_target_matchs = tf.stop_gradient(rpn_target_matchs)
rpn_target_deltas = tf.stop_gradient(rpn_target_deltas)
return rpn_target_matchs, rpn_target_deltas
def _build_single_target(self, anchors, valid_flags, gt_boxes, gt_class_ids):
'''Compute targets per instance.
Args
---
anchors: [num_anchors, (y1, x1, y2, x2)]
valid_flags: [num_anchors]
gt_class_ids: [num_gt_boxes]
gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]
Returns
---
target_matchs: [num_anchors]
target_deltas: [num_rpn_deltas, (dy, dx, log(dh), log(dw))]
'''
gt_boxes, _ = trim_zeros(gt_boxes) # remove padded zero boxes, [new_N, 4]
target_matchs = tf.zeros(anchors.shape[0], dtype=tf.int32) # [369303]
# Compute overlaps [num_anchors, num_gt_boxes] 369303 vs 10 => [369303, 10]
overlaps = geometry.compute_overlaps(anchors, gt_boxes)
# Match anchors to GT Boxes
# If an anchor overlaps ANY GT box with IoU >= 0.7 then it's positive.
# If an anchor overlaps ALL GT box with IoU < 0.3 then it's negative.
# Neutral anchors are those that don't match the conditions above,
# and they don't influence the loss function.
# However, don't keep any GT box unmatched (rare, but happens). Instead,
# match it to the closest anchor (even if its max IoU is < 0.3).
neg_values = tf.constant([0, -1])
pos_values = tf.constant([0, 1])
# 1. Set negative anchors first. They get overwritten below if a GT box is
# matched to them. [N_anchors, N_gt_boxes]
anchor_iou_argmax = tf.argmax(overlaps, axis=1) # [369303] index of the closest GT box for each anchor
anchor_iou_max = tf.reduce_max(overlaps, axis=[1]) # [369303] IoU with the closest GT box for each anchor
# If an anchor overlaps all GT boxes with IoU < 0.3, mark it as -1 (background)
target_matchs = tf.where(anchor_iou_max < self.neg_iou_thr,
-tf.ones(anchors.shape[0], dtype=tf.int32), target_matchs)
# filter invalid anchors
target_matchs = tf.where(tf.equal(valid_flags, 1),
target_matchs, tf.zeros(anchors.shape[0], dtype=tf.int32))
# If an anchor overlaps any GT box with IoU >= 0.7, mark it as foreground
# 2. Set anchors with high overlap as positive.
target_matchs = tf.where(anchor_iou_max >= self.pos_iou_thr,
tf.ones(anchors.shape[0], dtype=tf.int32), target_matchs)
# 3. Set an anchor for each GT box (regardless of IoU value).
gt_iou_argmax = tf.argmax(overlaps, axis=0) # [N_gt_boxes]
target_matchs = tf.compat.v1.scatter_update(tf.Variable(target_matchs), gt_iou_argmax, 1)
# update corresponding value=>1 for GT boxes' closest boxes
# Subsample to balance positive and negative anchors
# Don't let positives be more than half the anchors
ids = tf.where(tf.equal(target_matchs, 1)) # [N_pos_anchors, 1], [15, 1]
ids = tf.squeeze(ids, 1) # [15]
extra = ids.shape.as_list()[0] - int(self.num_rpn_deltas * self.positive_fraction) # 256*0.5
if extra > 0: # extra means the redundant pos_anchors
# Reset the extra random ones to neutral
ids = tf.random.shuffle(ids)[:extra]
target_matchs = tf.compat.v1.scatter_update(target_matchs, ids, 0)
# Same for negative proposals
ids = tf.where(tf.equal(target_matchs, -1)) # [213748, 1]
ids = tf.squeeze(ids, 1)
extra = ids.shape.as_list()[0] - (self.num_rpn_deltas - # 213748 - (256 - num_of_pos_anchors:15)
tf.reduce_sum(tf.cast(tf.equal(target_matchs, 1), tf.int32)))
if extra > 0: # 213507, so many negative anchors!
# Reset the extra ones to neutral
ids = tf.random.shuffle(ids)[:extra]
target_matchs = tf.compat.v1.scatter_update(target_matchs, ids, 0)
# We only need 256 anchors, ideally half positive and half negative.
# For positive anchors, compute shift and scale needed to transform them
# to match the corresponding GT boxes.
ids = tf.where(tf.equal(target_matchs, 1)) # [15]
a = tf.gather_nd(anchors, ids) # [369303, 4], [15] => [15, 4]
anchor_idx = tf.gather_nd(anchor_iou_argmax, ids) # index of the closest GT box for each positive anchor
gt = tf.gather(gt_boxes, anchor_idx) # coordinates of the closest GT box for each positive anchor
# a: [15, 4] positive anchors, gt: [15, 4] closest GT boxes, one per anchor
target_deltas = transforms.bbox2delta(
a, gt, self.target_means, self.target_stds)
# target_deltas: [15, (dy, dx, log(dh), log(dw))]
padding = tf.maximum(self.num_rpn_deltas - tf.shape(target_deltas)[0], 0) # 256-15
target_deltas = tf.pad(target_deltas, [(0, padding), (0, 0)]) #padding to [256,4], last padding 0
return target_matchs, target_deltas
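A toy sketch of the three labeling rules in _build_single_target (IoU >= 0.7 is positive, IoU < 0.3 against every GT box is negative, and the best anchor of each GT box is forced positive), using a hand-made 3x2 overlaps matrix. The sketch uses tf.tensor_scatter_nd_update in place of the tf.Variable / tf.compat.v1.scatter_update combination above, which is equivalent for this purpose:

import tensorflow as tf

overlaps = tf.constant([[0.75, 0.10],    # anchor 0: IoU 0.75 with GT 0 -> positive
                        [0.20, 0.25],    # anchor 1: all IoU < 0.3      -> negative
                        [0.40, 0.55]])   # anchor 2: in between         -> neutral
pos_iou_thr, neg_iou_thr = 0.7, 0.3

anchor_iou_max = tf.reduce_max(overlaps, axis=1)
target = tf.zeros(3, dtype=tf.int32)
target = tf.where(anchor_iou_max < neg_iou_thr, -tf.ones(3, tf.int32), target)
target = tf.where(anchor_iou_max >= pos_iou_thr, tf.ones(3, tf.int32), target)
print(target.numpy())    # [ 1 -1  0]

# Rule 3: the closest anchor of every GT box becomes positive even below 0.7,
# so GT 1 (best IoU 0.55 at anchor 2) still gets one positive anchor.
gt_iou_argmax = tf.argmax(overlaps, axis=0)
target = tf.tensor_scatter_nd_update(target, tf.expand_dims(gt_iou_argmax, 1),
                                     tf.ones_like(gt_iou_argmax, dtype=tf.int32))
print(target.numpy())    # [ 1 -1  1]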
(4) bbox_target
# bbox_target
import numpy as np
import tensorflow as tf
from detection.core.bbox import geometry, transforms
from detection.utils.misc import *
class ProposalTarget:
def __init__(self,
target_means=(0., 0., 0., 0.),
target_stds=(0.1, 0.1, 0.2, 0.2),
num_rcnn_deltas=256,
positive_fraction=0.25,
pos_iou_thr=0.5,
neg_iou_thr=0.5):
'''
Compute regression and classification targets for proposals.
Attributes
---
target_means: [4]. Bounding box refinement mean for RCNN.
target_stds: [4]. Bounding box refinement standard deviation for RCNN.
num_rcnn_deltas: int. Maximal number of RoIs per image to feed to bbox heads.
'''
self.target_means = target_means
self.target_stds = target_stds
self.num_rcnn_deltas = num_rcnn_deltas
self.positive_fraction = positive_fraction
self.pos_iou_thr = pos_iou_thr
self.neg_iou_thr = neg_iou_thr
def build_targets(self, proposals_list, gt_boxes, gt_class_ids, img_metas):
'''
Generates detection targets for images. Subsamples proposals and
generates target class IDs, bounding box deltas for each.
Args
---
proposals_list: list of [num_proposals, (y1, x1, y2, x2)] in normalized coordinates.
gt_boxes: [batch_size, num_gt_boxes, (y1, x1, y2, x2)] in image coordinates.
gt_class_ids: [batch_size, num_gt_boxes] Integer class IDs.
img_metas: [batch_size, 11]
Returns
---
rois_list: list of [num_rois, (y1, x1, y2, x2)] in normalized coordinates
rcnn_target_matchs_list: list of [num_rois]. Integer class IDs.
rcnn_target_deltas_list: list of [num_positive_rois, (dy, dx, log(dh), log(dw))].
Note that self.num_rcnn_deltas >= num_rois > num_positive_rois. And different
images in one batch may have different num_rois and num_positive_rois.
'''
pad_shapes = calc_pad_shapes(img_metas) # [[1216, 1216]]
rois_list = []
rcnn_target_matchs_list = []
rcnn_target_deltas_list = []
for i in range(img_metas.shape[0]):
rois, target_matchs, target_deltas = self._build_single_target(
proposals_list[i], gt_boxes[i], gt_class_ids[i], pad_shapes[i])
rois_list.append(rois) # [192, 4], including pos/neg anchors
rcnn_target_matchs_list.append(target_matchs) # positive target label, and padding with zero for neg
rcnn_target_deltas_list.append(target_deltas) # positive target deltas, and padding with zero for neg
return rois_list, rcnn_target_matchs_list, rcnn_target_deltas_list
def _build_single_target(self, proposals, gt_boxes, gt_class_ids, img_shape):
'''
Args
---
proposals: [num_proposals, (y1, x1, y2, x2)] in normalized coordinates.
gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]
gt_class_ids: [num_gt_boxes]
img_shape: np.ndarray. [2]. (img_height, img_width)
Returns
---
rois: [num_rois, (y1, x1, y2, x2)]
target_matchs: [num_positive_rois]
target_deltas: [num_positive_rois, (dy, dx, log(dh), log(dw))]
'''
H, W = img_shape # 1216, 1216
gt_boxes, non_zeros = trim_zeros(gt_boxes) # [7, 4], remove padded zero boxes
gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros) # [7]
# normalize (y1, x1, y2, x2) => 0~1
gt_boxes = gt_boxes / tf.constant([H, W, H, W], dtype=tf.float32)
# [2000, 4] with [7, 4] => [2000, 7] overlap scores
overlaps = geometry.compute_overlaps(proposals, gt_boxes)
anchor_iou_argmax = tf.argmax(overlaps, axis=1) # [2000] index of the closest GT box for each proposal
roi_iou_max = tf.reduce_max(overlaps, axis=1) # [2000] IoU with the closest GT box for each proposal
# roi_iou_max: [2000],
positive_roi_bool = (roi_iou_max >= self.pos_iou_thr) #[2000]
positive_indices = tf.where(positive_roi_bool)[:, 0] #[48, 1] =>[48]
# get all positive indices, namely get all pos_anchor indices
negative_indices = tf.where(roi_iou_max < self.neg_iou_thr)[:, 0]
# get all negative anchor indices
# Subsample ROIs. Aim for 25% positive
# Positive ROIs
positive_count = int(self.num_rcnn_deltas * self.positive_fraction) # 256 * 0.25 = 64
positive_indices = tf.random.shuffle(positive_indices)[:positive_count] # [256*0.25]=64, at most get 64
positive_count = tf.shape(positive_indices)[0] # 34
# Negative ROIs. Add enough to maintain positive:negative ratio.
r = 1.0 / self.positive_fraction
negative_count = tf.cast(r * tf.cast(positive_count, tf.float32), tf.int32) - positive_count #102
negative_indices = tf.random.shuffle(negative_indices)[:negative_count] #[102]
# Gather selected ROIs, based on remove redundant pos/neg indices
positive_rois = tf.gather(proposals, positive_indices) # [34, 4]
negative_rois = tf.gather(proposals, negative_indices) # [102, 4]
# Assign positive ROIs to GT boxes.
positive_overlaps = tf.gather(overlaps, positive_indices) # [34, 7]
roi_gt_box_assignment = tf.argmax(positive_overlaps, axis=1) # [34] index of the closest GT box for each positive RoI
roi_gt_boxes = tf.gather(gt_boxes, roi_gt_box_assignment) # [34, 4]
target_matchs = tf.gather(gt_class_ids, roi_gt_box_assignment) # [34]
# target_matchs and target_deltas are now computed
# proposal: [34, 4], target: [34, 4]
target_deltas = transforms.bbox2delta(positive_rois, roi_gt_boxes, self.target_means, self.target_stds)
# [34, 4] [102, 4]
rois = tf.concat([positive_rois, negative_rois], axis=0)
N = tf.shape(negative_rois)[0] # 102
target_matchs = tf.pad(target_matchs, [(0, N)]) # [34] padding after with [N]
target_matchs = tf.stop_gradient(target_matchs) # [34+102]
target_deltas = tf.stop_gradient(target_deltas) # [34, 4]
# rois: [34+102, 4]
return rois, target_matchs, target_deltas
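The RoI sampling arithmetic in _build_single_target is worth spelling out; a small standalone sketch with the numbers used in the comments above (34 proposals above the positive IoU threshold):

num_rcnn_deltas, positive_fraction = 256, 0.25

positive_cap = int(num_rcnn_deltas * positive_fraction)       # 64: at most 64 positive RoIs
positive_count = min(34, positive_cap)                         # 34 positives survive here
r = 1.0 / positive_fraction                                    # 4.0
negative_count = int(r * positive_count) - positive_count      # 102 negatives
print(positive_count, negative_count)                          # 34 102 -> 136 RoIs, exactly 25% positive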
(5) geometry
import tensorflow as tf
def compute_overlaps(boxes1, boxes2):
'''Computes IoU overlaps between two sets of boxes.
boxes1, boxes2: [N, (y1, x1, y2, x2)].
'''
# 1. Tile boxes2 and repeat boxes1. This allows us to compare
# every box in boxes1 against every box in boxes2 without loops.
# TF doesn't have an equivalent to np.repeat() so simulate it
# using tf.tile() and tf.reshape().
b1 = tf.reshape(tf.tile(tf.expand_dims(boxes1, 1),
[1, 1, tf.shape(boxes2)[0]]), [-1, 4])
b2 = tf.tile(boxes2, [tf.shape(boxes1)[0], 1])
# 2. Compute intersections
b1_y1, b1_x1, b1_y2, b1_x2 = tf.split(b1, 4, axis=1)
b2_y1, b2_x1, b2_y2, b2_x2 = tf.split(b2, 4, axis=1)
y1 = tf.maximum(b1_y1, b2_y1)
x1 = tf.maximum(b1_x1, b2_x1)
y2 = tf.minimum(b1_y2, b2_y2)
x2 = tf.minimum(b1_x2, b2_x2)
intersection = tf.maximum(x2 - x1, 0) * tf.maximum(y2 - y1, 0)
# 3. Compute unions
b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)
b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)
union = b1_area + b2_area - intersection
# 4. Compute IoU and reshape to [boxes1, boxes2]
iou = intersection / union
overlaps = tf.reshape(iou, [tf.shape(boxes1)[0], tf.shape(boxes2)[0]])
return overlaps
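A quick numeric check of compute_overlaps (a sketch, assuming the module path detection.core.bbox.geometry used by the imports above): two 10x10 boxes, one identical and one shifted by half, give IoU 1.0 and 25/175 ≈ 0.143.

import tensorflow as tf
from detection.core.bbox.geometry import compute_overlaps

boxes1 = tf.constant([[0., 0., 10., 10.]])            # (y1, x1, y2, x2)
boxes2 = tf.constant([[0., 0., 10., 10.],             # identical box
                      [5., 5., 15., 15.]])            # overlaps on a 5x5 patch
print(compute_overlaps(boxes1, boxes2).numpy())       # [[1.0, 0.1428...]]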
(6) transforms
# transforms
import tensorflow as tf
from detection.utils.misc import *
def bbox2delta(box, gt_box, target_means, target_stds):
'''Compute refinement needed to transform box to gt_box.
Args
---
box: [..., (y1, x1, y2, x2)]
gt_box: [..., (y1, x1, y2, x2)]
target_means: [4]
target_stds: [4]
'''
target_means = tf.constant(
target_means, dtype=tf.float32)
target_stds = tf.constant(
target_stds, dtype=tf.float32)
box = tf.cast(box, tf.float32)
gt_box = tf.cast(gt_box, tf.float32)
height = box[..., 2] - box[..., 0]
width = box[..., 3] - box[..., 1]
center_y = box[..., 0] + 0.5 * height
center_x = box[..., 1] + 0.5 * width
gt_height = gt_box[..., 2] - gt_box[..., 0]
gt_width = gt_box[..., 3] - gt_box[..., 1]
gt_center_y = gt_box[..., 0] + 0.5 * gt_height
gt_center_x = gt_box[..., 1] + 0.5 * gt_width
dy = (gt_center_y - center_y) / height
dx = (gt_center_x - center_x) / width
dh = tf.math.log(gt_height / height)
dw = tf.math.log(gt_width / width)
delta = tf.stack([dy, dx, dh, dw], axis=-1)
delta = (delta - target_means) / target_stds
return delta
def delta2bbox(box, delta, target_means, target_stds):
'''Compute bounding box based on roi and delta.
Args
---
box: [N, (y1, x1, y2, x2)] box to update
delta: [N, (dy, dx, log(dh), log(dw))] refinements to apply
target_means: [4]
target_stds: [4]
'''
target_means = tf.constant(
target_means, dtype=tf.float32)
target_stds = tf.constant(
target_stds, dtype=tf.float32)
delta = delta * target_stds + target_means
# Convert to y, x, h, w
height = box[:, 2] - box[:, 0]
width = box[:, 3] - box[:, 1]
center_y = box[:, 0] + 0.5 * height
center_x = box[:, 1] + 0.5 * width
# Apply delta
center_y += delta[:, 0] * height
center_x += delta[:, 1] * width
height *= tf.exp(delta[:, 2])
width *= tf.exp(delta[:, 3])
# Convert back to y1, x1, y2, x2
y1 = center_y - 0.5 * height
x1 = center_x - 0.5 * width
y2 = y1 + height
x2 = x1 + width
result = tf.stack([y1, x1, y2, x2], axis=1)
return result
def bbox_clip(box, window):
'''
Args
---
box: [N, (y1, x1, y2, x2)]
window: [4] in the form y1, x1, y2, x2
'''
# Split
wy1, wx1, wy2, wx2 = tf.split(window, 4)
y1, x1, y2, x2 = tf.split(box, 4, axis=1)
# Clip
y1 = tf.maximum(tf.minimum(y1, wy2), wy1)
x1 = tf.maximum(tf.minimum(x1, wx2), wx1)
y2 = tf.maximum(tf.minimum(y2, wy2), wy1)
x2 = tf.maximum(tf.minimum(x2, wx2), wx1)
clipped = tf.concat([y1, x1, y2, x2], axis=1)
clipped.set_shape((clipped.shape[0], 4))
return clipped
def bbox_flip(bboxes, width):
'''
Flip bboxes horizontally.
Args
---
bboxes: [..., 4]
width: Int or Float
'''
y1, x1, y2, x2 = tf.split(bboxes, 4, axis=-1)
new_x1 = width - x2
new_x2 = width - x1
flipped = tf.concat([y1, new_x1, y2, new_x2], axis=-1)
return flipped
def bbox_mapping(box, img_meta):
'''
Args
---
box: [N, 4]
img_meta: [11]
'''
img_meta = parse_image_meta(img_meta)
scale = img_meta['scale']
flip = img_meta['flip']
box = box * scale
if tf.equal(flip, 1):
box = bbox_flip(box, img_meta['img_shape'][1])
return box
def bbox_mapping_back(box, img_meta):
'''
Args
---
box: [N, 4]
img_meta: [11]
'''
img_meta = parse_image_meta(img_meta)
scale = img_meta['scale']
flip = img_meta['flip']
if tf.equal(flip, 1):
box = bbox_flip(box, img_meta['img_shape'][1])
box = box / scale
return box
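bbox2delta and delta2bbox are exact inverses of each other (up to float error); a round-trip sketch using the RPN means/stds from the detector configuration:

import tensorflow as tf
from detection.core.bbox.transforms import bbox2delta, delta2bbox

means, stds = (0., 0., 0., 0.), (0.1, 0.1, 0.2, 0.2)
box = tf.constant([[10., 10., 50., 90.]])      # proposal (y1, x1, y2, x2)
gt = tf.constant([[12., 8., 60., 100.]])       # ground-truth box

delta = bbox2delta(box, gt, means, stds)       # normalized (dy, dx, log(dh), log(dw))
recovered = delta2bbox(box, delta, means, stds)
print(recovered.numpy())                        # ~[[12., 8., 60., 100.]]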
(7) loss
# loss
import tensorflow as tf
from tensorflow import keras
def smooth_l1_loss(y_true, y_pred):
'''Implements Smooth-L1 loss.
Args
---
y_true and y_pred are typically: [N, 4], but could be any shape.
'''
diff = tf.abs(y_true - y_pred)
less_than_one = tf.cast(tf.less(diff, 1.0), tf.float32)
loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5)
return loss
def rpn_class_loss(target_matchs, rpn_class_logits):
'''RPN anchor classifier loss.
Args
---
target_matchs: [batch_size, num_anchors]. Anchor match type. 1=positive,
-1=negative, 0=neutral anchor.
rpn_class_logits: [batch_size, num_anchors, 2]. RPN classifier logits for FG/BG.
'''
# Get anchor classes. Convert the -1/+1 match to 0/1 values.
anchor_class = tf.cast(tf.equal(target_matchs, 1), tf.int32)
# Positive and Negative anchors contribute to the loss,
# but neutral anchors (match value = 0) don't.
indices = tf.where(tf.not_equal(target_matchs, 0))
# Pick rows that contribute to the loss and filter out the rest.
rpn_class_logits = tf.gather_nd(rpn_class_logits, indices)
anchor_class = tf.gather_nd(anchor_class, indices)
# Cross entropy loss
# loss = tf.losses.sparse_softmax_cross_entropy(labels=anchor_class,
# logits=rpn_class_logits)
num_classes = rpn_class_logits.shape[-1]
# print(rpn_class_logits.shape)
loss = keras.losses.categorical_crossentropy(tf.one_hot(anchor_class, depth=num_classes),
rpn_class_logits, from_logits=True)
loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)
return loss
def rpn_bbox_loss(target_deltas, target_matchs, rpn_deltas):
'''Return the RPN bounding box loss graph.
Args
---
target_deltas: [batch, num_rpn_deltas, (dy, dx, log(dh), log(dw))].
Uses 0 padding to fill in unused bbox deltas.
target_matchs: [batch, anchors]. Anchor match type. 1=positive,
-1=negative, 0=neutral anchor.
rpn_deltas: [batch, anchors, (dy, dx, log(dh), log(dw))]
'''
def batch_pack(x, counts, num_rows):
'''Picks different number of values from each row
in x depending on the values in counts.
'''
outputs = []
for i in range(num_rows):
outputs.append(x[i, :counts[i]])
return tf.concat(outputs, axis=0)
# Positive anchors contribute to the loss, but negative and
# neutral anchors (match value of 0 or -1) don't.
indices = tf.where(tf.equal(target_matchs, 1))
# Pick bbox deltas that contribute to the loss
rpn_deltas = tf.gather_nd(rpn_deltas, indices)
# Trim target bounding box deltas to the same length as rpn_deltas.
batch_counts = tf.reduce_sum(tf.cast(tf.equal(target_matchs, 1), tf.int32), axis=1)
target_deltas = batch_pack(target_deltas, batch_counts,
target_deltas.shape.as_list()[0])
loss = smooth_l1_loss(target_deltas, rpn_deltas)
loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)
return loss
def rcnn_class_loss(target_matchs_list, rcnn_class_logits_list):
'''Loss for the classifier head of Faster RCNN.
Args
---
target_matchs_list: list of [num_rois]. Integer class IDs. Uses zero
padding to fill in the array.
rcnn_class_logits_list: list of [num_rois, num_classes]
'''
class_ids = tf.concat(target_matchs_list, 0)
class_logits = tf.concat(rcnn_class_logits_list, 0)
class_ids = tf.cast(class_ids, 'int64')
# loss = tf.losses.sparse_softmax_cross_entropy(labels=class_ids,
# logits=class_logits)
num_classes = class_logits.shape[-1]
# print(class_logits.shape)
loss = keras.losses.categorical_crossentropy(tf.one_hot(class_ids, depth=num_classes),
class_logits, from_logits=True)
loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)
return loss
def rcnn_bbox_loss(target_deltas_list, target_matchs_list, rcnn_deltas_list):
'''Loss for Faster R-CNN bounding box refinement.
Args
---
target_deltas_list: list of [num_positive_rois, (dy, dx, log(dh), log(dw))]
target_matchs_list: list of [num_rois]. Integer class IDs.
rcnn_deltas_list: list of [num_rois, num_classes, (dy, dx, log(dh), log(dw))]
'''
target_deltas = tf.concat(target_deltas_list, 0)
target_class_ids = tf.concat(target_matchs_list, 0)
rcnn_deltas = tf.concat(rcnn_deltas_list, 0)
# Only positive ROIs contribute to the loss. And only
# the right class_id of each ROI. Get their indices.
positive_roi_ix = tf.where(target_class_ids > 0)[:, 0]
positive_roi_class_ids = tf.cast(
tf.gather(target_class_ids, positive_roi_ix), tf.int64)
indices = tf.stack([positive_roi_ix, positive_roi_class_ids], axis=1)
# Gather the deltas (predicted and true) that contribute to loss
rcnn_deltas = tf.gather_nd(rcnn_deltas, indices)
# Smooth-L1 Loss
loss = smooth_l1_loss(target_deltas, rcnn_deltas)
loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)
return loss
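Worked values for smooth_l1_loss (a sketch, importing from detection.core.loss.losses as the detector does): the loss is quadratic for |diff| < 1 and linear beyond it.

import tensorflow as tf
from detection.core.loss.losses import smooth_l1_loss

y_true = tf.constant([0.0, 0.0, 0.0])
y_pred = tf.constant([0.5, 1.0, 3.0])           # |diff| = 0.5, 1.0, 3.0
print(smooth_l1_loss(y_true, y_pred).numpy())   # [0.125, 0.5, 2.5]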
Datasets
(8) coco
# coco
import os.path as osp
import cv2
import numpy as np
from pycocotools.coco import COCO
from detection.datasets import transforms, utils
"""
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)
"""
class CocoDataSet(object):
def __init__(self, dataset_dir, subset,
flip_ratio=0,
pad_mode='fixed',
mean=(0, 0, 0),
std=(1, 1, 1),
scale=(1024, 800),
debug=False):
'''Load a subset of the COCO dataset.
Attributes
---
dataset_dir: The root directory of the COCO dataset.
subset: What to load (train, val).
flip_ratio: Float. The ratio of flipping an image and its bounding boxes.
pad_mode: Which padding method to use ('fixed' or 'non-fixed').
mean: Tuple. Image mean.
std: Tuple. Image standard deviation.
scale: Tuple of two integers.
'''
if subset not in ['train', 'val']:
raise AssertionError('subset must be "train" or "val".')
self.coco = COCO("{}/annotations/instances_{}2017.json".format(dataset_dir, subset))
# get the mapping from original category ids to labels
self.cat_ids = self.coco.getCatIds()
self.cat2label = {
cat_id: i + 1
for i, cat_id in enumerate(self.cat_ids)
}
self.img_ids, self.img_infos = self._filter_imgs()
if debug:
self.img_ids, self.img_infos = self.img_ids[:50], self.img_infos[:50]
self.image_dir = "{}/images/{}2017".format(dataset_dir, subset)
self.flip_ratio = flip_ratio
if pad_mode in ['fixed', 'non-fixed']:
self.pad_mode = pad_mode
elif subset == 'train':
self.pad_mode = 'fixed'
else:
self.pad_mode = 'non-fixed'
self.img_transform = transforms.ImageTransform(scale, mean, std, pad_mode)
self.bbox_transform = transforms.BboxTransform()
def _filter_imgs(self, min_size=32):
'''Filter images too small or without ground truths.
Args
---
min_size: the minimal size of the image.
'''
# Filter images without ground truths.
all_img_ids = list(set([_['image_id'] for _ in self.coco.anns.values()]))
# Filter images too small.
img_ids = []
img_infos = []
for i in all_img_ids:
info = self.coco.loadImgs(i)[0]
ann_ids = self.coco.getAnnIds(imgIds=i)
ann_info = self.coco.loadAnns(ann_ids)
ann = self._parse_ann_info(ann_info)
if min(info['width'], info['height']) >= min_size and ann['labels'].shape[0] != 0:
img_ids.append(i)
img_infos.append(info)
return img_ids, img_infos
def _load_ann_info(self, idx):
img_id = self.img_ids[idx]
ann_ids = self.coco.getAnnIds(imgIds=img_id)
ann_info = self.coco.loadAnns(ann_ids)
return ann_info
def _parse_ann_info(self, ann_info):
'''Parse bbox annotation.
Args
---
ann_info (list[dict]): Annotation info of an image.
Returns
---
dict: A dict containing the following keys: bboxes,
bboxes_ignore, labels.
'''
gt_bboxes = []
gt_labels = []
gt_bboxes_ignore = []
for i, ann in enumerate(ann_info):
if ann.get('ignore', False):
continue
x1, y1, w, h = ann['bbox']
if ann['area'] <= 0 or w < 1 or h < 1:
continue
bbox = [y1, x1, y1 + h - 1, x1 + w - 1]
if ann['iscrowd']:
gt_bboxes_ignore.append(bbox)
else:
gt_bboxes.append(bbox)
gt_labels.append(self.cat2label[ann['category_id']])
if gt_bboxes:
gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
gt_labels = np.array(gt_labels, dtype=np.int64)
else:
gt_bboxes = np.zeros((0, 4), dtype=np.float32)
gt_labels = np.array([], dtype=np.int64)
if gt_bboxes_ignore:
gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32)
else:
gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32)
ann = dict(
bboxes=gt_bboxes, labels=gt_labels, bboxes_ignore=gt_bboxes_ignore)
return ann
def __len__(self):
return len(self.img_infos)
def __getitem__(self, idx):
'''Load the image and its bboxes for the given index.
Args
---
idx: the index of images.
Returns
---
tuple: A tuple containing the following items: image,
bboxes, labels.
'''
img_info = self.img_infos[idx]
ann_info = self._load_ann_info(idx)
# load the image.
img = cv2.imread(osp.join(self.image_dir, img_info['file_name']), cv2.IMREAD_COLOR)
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
ori_shape = img.shape
# Load the annotation.
ann = self._parse_ann_info(ann_info)
bboxes = ann['bboxes']
labels = ann['labels']
flip = True if np.random.rand() < self.flip_ratio else False
# Handle the image
img, img_shape, scale_factor = self.img_transform(img, flip)
pad_shape = img.shape
# Handle the annotation.
bboxes, labels = self.bbox_transform(
bboxes, labels, img_shape, scale_factor, flip)
# Handle the meta info.
img_meta_dict = dict({
'ori_shape': ori_shape,
'img_shape': img_shape,
'pad_shape': pad_shape,
'scale_factor': scale_factor,
'flip': flip
})
img_meta = utils.compose_image_meta(img_meta_dict)
return img, img_meta, bboxes, labels
def get_categories(self):
'''Get list of category names.
Returns
---
list: A list of category names.
Note that the first item 'bg' means background.
'''
return ['bg'] + [self.coco.loadCats(i)[0]["name"] for i in self.cat2label.keys()]
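The bbox handling in _parse_ann_info converts COCO's [x1, y1, w, h] annotation format into the [y1, x1, y2, x2] order used everywhere else in this codebase, with inclusive corner coordinates; a tiny standalone sketch:

import numpy as np

coco_bbox = [20.0, 30.0, 100.0, 50.0]            # COCO annotation: x1, y1, w, h
x1, y1, w, h = coco_bbox
bbox = [y1, x1, y1 + h - 1, x1 + w - 1]          # same formula as _parse_ann_info
print(np.array(bbox, dtype=np.float32))          # [ 30.  20.  79. 119.]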
(9) data_generator
# data_generator
import numpy as np
class DataGenerator:
"""
Reads the dataset and yields one image at a time: (img, img_meta, bboxes, labels).
"""
def __init__(self, dataset, shuffle=False):
self.dataset = dataset
self.shuffle = shuffle
def __call__(self):
indices = np.arange(len(self.dataset))
if self.shuffle:
np.random.shuffle(indices)
for img_idx in indices:
img, img_meta, bbox, label = self.dataset[img_idx]
yield img, img_meta, bbox, label
(10) transforms
# transforms
import numpy as np
from detection.datasets.utils import *
class ImageTransform(object):
'''Preprocess the image.
1. rescale the image to expected size
2. normalize the image
3. flip the image (if needed)
4. pad the image (if needed)
'''
def __init__(self,
scale=(800, 1333),
mean=(0, 0, 0),
std=(1, 1, 1),
pad_mode='fixed'):
self.scale = scale
self.mean = mean
self.std = std
self.pad_mode = pad_mode
self.impad_size = max(scale) if pad_mode == 'fixed' else 64
def __call__(self, img, flip=False):
img, scale_factor = imrescale(img, self.scale)
img_shape = img.shape
img = imnormalize(img, self.mean, self.std)
if flip:
img = img_flip(img)
if self.pad_mode == 'fixed':
img = impad_to_square(img, self.impad_size)
else: # 'non-fixed'
img = impad_to_multiple(img, self.impad_size)
return img, img_shape, scale_factor
class BboxTransform(object):
'''Preprocess ground truth bboxes.
1. rescale bboxes according to image size
2. flip bboxes (if needed)
'''
def __init__(self):
pass
def __call__(self, bboxes, labels,
img_shape, scale_factor, flip=False):
bboxes = bboxes * scale_factor
if flip:
bboxes = bbox_flip(bboxes, img_shape)
bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[0])
bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[1])
return bboxes, labels
(11) datasets.utils
# datasets.utils
import cv2
import numpy as np
###########################################
#
# Utility Functions for
# Image Preprocessing and Data Augmentation
#
###########################################
def img_flip(img):
'''Flip the image horizontally
Args
---
img: [height, width, channel]
Returns
---
np.ndarray: the flipped image.
'''
return np.fliplr(img)
def bbox_flip(bboxes, img_shape):
'''Flip bboxes horizontally.
Args
---
bboxes: [..., 4]
img_shape: Tuple. (height, width)
Returns
---
np.ndarray: the flipped bboxes.
'''
w = img_shape[1]
flipped = bboxes.copy()
flipped[..., 1] = w - bboxes[..., 3] - 1
flipped[..., 3] = w - bboxes[..., 1] - 1
return flipped
def impad_to_square(img, pad_size):
'''Pad an image so that each edge equals pad_size.
Args
---
img: [height, width, channels]. Image to be padded
pad_size: Int.
Returns
---
ndarray: The padded image with shape of
[pad_size, pad_size, channels].
'''
shape = (pad_size, pad_size, img.shape[-1])
pad = np.zeros(shape, dtype=img.dtype)
pad[:img.shape[0], :img.shape[1], ...] = img
return pad
def impad_to_multiple(img, divisor):
'''Pad an image so that each edge is a multiple of divisor.
Args
---
img: [height, width, channels]. Image to be padded.
divisor: Int. Padded image edges will be multiples of divisor.
Returns
---
ndarray: The padded image.
'''
pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor
pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor
shape = (pad_h, pad_w, img.shape[-1])
pad = np.zeros(shape, dtype=img.dtype)
pad[:img.shape[0], :img.shape[1], ...] = img
return pad
def imrescale(img, scale):
'''Resize image while keeping the aspect ratio.
Args
---
img: [height, width, channels]. The input image.
scale: Tuple of 2 integers. The image will be rescaled
as large as possible within this scale.
Returns
---
np.ndarray: the scaled image.
'''
h, w = img.shape[:2]
max_long_edge = max(scale)
max_short_edge = min(scale)
scale_factor = min(max_long_edge / max(h, w),
max_short_edge / min(h, w))
new_size = (int(w * float(scale_factor) + 0.5),
int(h * float(scale_factor) + 0.5))
rescaled_img = cv2.resize(
img, new_size, interpolation=cv2.INTER_LINEAR)
return rescaled_img, scale_factor
def imnormalize(img, mean, std):
'''Normalize the image.
Args
---
img: [height, width, channel]
mean: Tuple or np.ndarray. [3]
std: Tuple or np.ndarray. [3]
Returns
---
np.ndarray: the normalized image.
'''
img = (img - mean) / std
return img.astype(np.float32)
def imdenormalize(norm_img, mean, std):
'''Denormalize the image.
Args
---
norm_img: [height, width, channel]
mean: Tuple or np.ndarray. [3]
std: Tuple or np.ndarray. [3]
Returns
---
np.ndarray: the denormalized image.
'''
img = norm_img * std + mean
return img.astype(np.float32)
#######################################
#
# Utility Functions for Data Formatting
#
#######################################
def get_original_image(img, img_meta,
mean=(0, 0, 0), std=(1, 1, 1)):
'''Recover the original image.
Args
---
img: np.ndarray. [height, width, channel].
The transformed image.
img_meta: np.ndarray. [11]
mean: Tuple or np.ndarray. [3]
std: Tuple or np.ndarray. [3]
Returns
---
np.ndarray: the original image.
'''
img_meta_dict = parse_image_meta(img_meta)
ori_shape = img_meta_dict['ori_shape']
img_shape = img_meta_dict['img_shape']
flip = img_meta_dict['flip']
img = img[:img_shape[0], :img_shape[1]]
if flip:
img = img_flip(img)
img = cv2.resize(img, (ori_shape[1], ori_shape[0]),
interpolation=cv2.INTER_LINEAR)
img = imdenormalize(img, mean, std)
return img
def compose_image_meta(img_meta_dict):
'''Takes attributes of an image and puts them in one 1D array.
Args
---
img_meta_dict: dict
Returns
---
img_meta: np.ndarray
'''
ori_shape = img_meta_dict['ori_shape']
img_shape = img_meta_dict['img_shape']
pad_shape = img_meta_dict['pad_shape']
scale_factor = img_meta_dict['scale_factor']
flip = 1 if img_meta_dict['flip'] else 0
img_meta = np.array(
ori_shape + # size=3
img_shape + # size=3
pad_shape + # size=3
tuple([scale_factor]) + # size=1
tuple([flip]) # size=1
).astype(np.float32)
return img_meta
def parse_image_meta(img_meta):
'''Parses an array that contains image attributes to its components.
Args
---
meta: [11]
Returns
---
a dict of the parsed values.
'''
ori_shape = img_meta[0:3]
img_shape = img_meta[3:6]
pad_shape = img_meta[6:9]
scale_factor = img_meta[9]
flip = img_meta[10]
return {
'ori_shape': ori_shape.astype(np.int32),
'img_shape': img_shape.astype(np.int32),
'pad_shape': pad_shape.astype(np.int32),
'scale_factor': scale_factor.astype(np.float32),
'flip': flip.astype(bool),
}
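compose_image_meta and parse_image_meta are inverses over the 11-element layout (3 values for ori_shape, 3 for img_shape, 3 for pad_shape, 1 scale_factor, 1 flip); a round-trip sketch using hypothetical shape values:

from detection.datasets.utils import compose_image_meta, parse_image_meta

meta = compose_image_meta(dict(
    ori_shape=(480, 640, 3),        # original image shape
    img_shape=(800, 1067, 3),       # shape after rescaling
    pad_shape=(1216, 1216, 3),      # shape after padding to square
    scale_factor=1.6667,
    flip=False))
print(meta.shape)                   # (11,)
print(parse_image_meta(meta))       # the same values, recovered from the flat array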
(12) resnet
'''ResNet model for Keras.
# Reference:
- [Deep Residual Learning for Image Recognition](
https://arxiv.org/abs/1512.03385)
'''
# resnet
import tensorflow as tf
from tensorflow.keras import layers
class _Bottleneck(tf.keras.Model):
def __init__(self, filters, block,
downsampling=False, stride=1, **kwargs):
super(_Bottleneck, self).__init__(**kwargs)
filters1, filters2, filters3 = filters
conv_name_base = 'res' + block + '_branch'
bn_name_base = 'bn' + block + '_branch'
self.downsampling = downsampling
self.stride = stride
self.out_channel = filters3
self.conv2a = layers.Conv2D(filters1, (1, 1), strides=(stride, stride),
kernel_initializer='he_normal',
name=conv_name_base + '2a')
self.bn2a = layers.BatchNormalization(name=bn_name_base + '2a')
self.conv2b = layers.Conv2D(filters2, (3, 3), padding='same',
kernel_initializer='he_normal',
name=conv_name_base + '2b')
self.bn2b = layers.BatchNormalization(name=bn_name_base + '2b')
self.conv2c = layers.Conv2D(filters3, (1, 1),
kernel_initializer='he_normal',
name=conv_name_base + '2c')
self.bn2c = layers.BatchNormalization(name=bn_name_base + '2c')
if self.downsampling:
self.conv_shortcut = layers.Conv2D(filters3, (1, 1), strides=(stride, stride),
kernel_initializer='he_normal',
name=conv_name_base + '1')
self.bn_shortcut = layers.BatchNormalization(name=bn_name_base + '1')
def call(self, inputs, training=False):
x = self.conv2a(inputs)
x = self.bn2a(x, training=training)
x = tf.nn.relu(x)
x = self.conv2b(x)
x = self.bn2b(x, training=training)
x = tf.nn.relu(x)
x = self.conv2c(x)
x = self.bn2c(x, training=training)
if self.downsampling:
shortcut = self.conv_shortcut(inputs)
shortcut = self.bn_shortcut(shortcut, training=training)
else:
shortcut = inputs
x += shortcut
x = tf.nn.relu(x)
return x
def compute_output_shape(self, input_shape):
shape = tf.TensorShape(input_shape).as_list()
shape[1] = shape[1] // self.stride
shape[2] = shape[2] // self.stride
shape[-1] = self.out_channel
return tf.TensorShape(shape)
class ResNet(tf.keras.Model):
def __init__(self, depth, **kwargs):
super(ResNet, self).__init__(**kwargs)
if depth not in [50, 101]:
raise AssertionError('depth must be 50 or 101.')
self.depth = depth
self.padding = layers.ZeroPadding2D((3, 3))
self.conv1 = layers.Conv2D(64, (7, 7),
strides=(2, 2),
kernel_initializer='he_normal',
name='conv1')
self.bn_conv1 = layers.BatchNormalization(name='bn_conv1')
self.max_pool = layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')
self.res2a = _Bottleneck([64, 64, 256], block='2a',
downsampling=True, stride=1)
self.res2b = _Bottleneck([64, 64, 256], block='2b')
self.res2c = _Bottleneck([64, 64, 256], block='2c')
self.res3a = _Bottleneck([128, 128, 512], block='3a',
downsampling=True, stride=2)
self.res3b = _Bottleneck([128, 128, 512], block='3b')
self.res3c = _Bottleneck([128, 128, 512], block='3c')
self.res3d = _Bottleneck([128, 128, 512], block='3d')
self.res4a = _Bottleneck([256, 256, 1024], block='4a',
downsampling=True, stride=2)
self.res4b = _Bottleneck([256, 256, 1024], block='4b')
self.res4c = _Bottleneck([256, 256, 1024], block='4c')
self.res4d = _Bottleneck([256, 256, 1024], block='4d')
self.res4e = _Bottleneck([256, 256, 1024], block='4e')
self.res4f = _Bottleneck([256, 256, 1024], block='4f')
if self.depth == 101:
self.res4g = _Bottleneck([256, 256, 1024], block='4g')
self.res4h = _Bottleneck([256, 256, 1024], block='4h')
self.res4i = _Bottleneck([256, 256, 1024], block='4i')
self.res4j = _Bottleneck([256, 256, 1024], block='4j')
self.res4k = _Bottleneck([256, 256, 1024], block='4k')
self.res4l = _Bottleneck([256, 256, 1024], block='4l')
self.res4m = _Bottleneck([256, 256, 1024], block='4m')
self.res4n = _Bottleneck([256, 256, 1024], block='4n')
self.res4o = _Bottleneck([256, 256, 1024], block='4o')
self.res4p = _Bottleneck([256, 256, 1024], block='4p')
self.res4q = _Bottleneck([256, 256, 1024], block='4q')
self.res4r = _Bottleneck([256, 256, 1024], block='4r')
self.res4s = _Bottleneck([256, 256, 1024], block='4s')
self.res4t = _Bottleneck([256, 256, 1024], block='4t')
self.res4u = _Bottleneck([256, 256, 1024], block='4u')
self.res4v = _Bottleneck([256, 256, 1024], block='4v')
self.res4w = _Bottleneck([256, 256, 1024], block='4w')
self.res5a = _Bottleneck([512, 512, 2048], block='5a',
downsampling=True, stride=2)
self.res5b = _Bottleneck([512, 512, 2048], block='5b')
self.res5c = _Bottleneck([512, 512, 2048], block='5c')
self.out_channel = (256, 512, 1024, 2048)
def call(self, inputs, training=True):
x = self.padding(inputs)
x = self.conv1(x)
x = self.bn_conv1(x, training=training)
x = tf.nn.relu(x)
x = self.max_pool(x)
x = self.res2a(x, training=training)
x = self.res2b(x, training=training)
C2 = x = self.res2c(x, training=training)
x = self.res3a(x, training=training)
x = self.res3b(x, training=training)
x = self.res3c(x, training=training)
C3 = x = self.res3d(x, training=training)
x = self.res4a(x, training=training)
x = self.res4b(x, training=training)
x = self.res4c(x, training=training)
x = self.res4d(x, training=training)
x = self.res4e(x, training=training)
x = self.res4f(x, training=training)
if self.depth == 101:
x = self.res4g(x, training=training)
x = self.res4h(x, training=training)
x = self.res4i(x, training=training)
x = self.res4j(x, training=training)
x = self.res4k(x, training=training)
x = self.res4l(x, training=training)
x = self.res4m(x, training=training)
x = self.res4n(x, training=training)
x = self.res4o(x, training=training)
x = self.res4p(x, training=training)
x = self.res4q(x, training=training)
x = self.res4r(x, training=training)
x = self.res4s(x, training=training)
x = self.res4t(x, training=training)
x = self.res4u(x, training=training)
x = self.res4v(x, training=training)
x = self.res4w(x, training=training)
C4 = x
x = self.res5a(x, training=training)
x = self.res5b(x, training=training)
C5 = x = self.res5c(x, training=training)
return (C2, C3, C4, C5)
def compute_output_shape(self, input_shape):
shape = tf.TensorShape(input_shape).as_list()
batch, H, W, C = shape
C2_shape = tf.TensorShape([batch, H // 4, W // 4, self.out_channel[0]])
C3_shape = tf.TensorShape([batch, H // 8, W // 8, self.out_channel[1]])
C4_shape = tf.TensorShape([batch, H // 16, W // 16, self.out_channel[2]])
C5_shape = tf.TensorShape([batch, H // 32, W // 32, self.out_channel[3]])
return (C2_shape, C3_shape, C4_shape, C5_shape)
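A quick shape check for the backbone (a sketch; ResNet-50 is used here only to keep it light, while the detector above uses depth=101 with identical output shapes and strides): feeding a dummy 1216x1216 image reproduces the C2-C5 shapes quoted in the detector's call() comments.

import tensorflow as tf
from detection.models.backbones.resnet import ResNet

backbone = ResNet(depth=50)
dummy = tf.zeros([1, 1216, 1216, 3])
C2, C3, C4, C5 = backbone(dummy, training=False)
for name, c in zip(['C2', 'C3', 'C4', 'C5'], [C2, C3, C4, C5]):
    print(name, c.shape)   # (1, 304, 304, 256) ... (1, 38, 38, 2048), strides 4/8/16/32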
(13) bbox_head
# bbox_head
import tensorflow as tf
from tensorflow.keras import layers
from detection.core.bbox import transforms
from detection.core.loss import losses
from detection.utils.misc import *
class BBoxHead(tf.keras.Model):
def __init__(self, num_classes,
pool_size=(7, 7),
target_means=(0., 0., 0., 0.),
target_stds=(0.1, 0.1, 0.2, 0.2),
min_confidence=0.7,
nms_threshold=0.3,
max_instances=100,
**kwargs):
super(BBoxHead, self).__init__(**kwargs)
self.num_classes = num_classes
self.pool_size = tuple(pool_size)
self.target_means = target_means
self.target_stds = target_stds
self.min_confidence = min_confidence
self.nms_threshold = nms_threshold
self.max_instances = max_instances
self.rcnn_class_loss = losses.rcnn_class_loss
self.rcnn_bbox_loss = losses.rcnn_bbox_loss
self.rcnn_class_conv1 = layers.Conv2D(1024, self.pool_size,
padding='valid', name='rcnn_class_conv1')
self.rcnn_class_bn1 = layers.BatchNormalization(name='rcnn_class_bn1')
self.rcnn_class_conv2 = layers.Conv2D(1024, (1, 1),
name='rcnn_class_conv2')
self.rcnn_class_bn2 = layers.BatchNormalization(name='rcnn_class_bn2')
self.rcnn_class_logits = layers.Dense(num_classes, name='rcnn_class_logits')
self.rcnn_delta_fc = layers.Dense(num_classes * 4, name='rcnn_bbox_fc')
def call(self, inputs, training=True):
'''
Args
---
pooled_rois_list: List of [num_rois, pool_size, pool_size, channels]
Returns
---
rcnn_class_logits_list: List of [num_rois, num_classes]
rcnn_probs_list: List of [num_rois, num_classes]
rcnn_deltas_list: List of [num_rois, num_classes, (dy, dx, log(dh), log(dw))]
'''
pooled_rois_list = inputs
num_pooled_rois_list = [pooled_rois.shape[0] for pooled_rois in pooled_rois_list]
pooled_rois = tf.concat(pooled_rois_list, axis=0)
x = self.rcnn_class_conv1(pooled_rois)
x = self.rcnn_class_bn1(x, training=training)
x = tf.nn.relu(x)
x = self.rcnn_class_conv2(x)
x = self.rcnn_class_bn2(x, training=training)
x = tf.nn.relu(x)
x = tf.squeeze(tf.squeeze(x, 2), 1)
logits = self.rcnn_class_logits(x)
probs = tf.nn.softmax(logits)
deltas = self.rcnn_delta_fc(x)
deltas = tf.reshape(deltas, (-1, self.num_classes, 4))
rcnn_class_logits_list = tf.split(logits, num_pooled_rois_list, 0)
rcnn_probs_list = tf.split(probs, num_pooled_rois_list, 0)
rcnn_deltas_list = tf.split(deltas, num_pooled_rois_list, 0)
return rcnn_class_logits_list, rcnn_probs_list, rcnn_deltas_list
def loss(self,
rcnn_class_logits_list, rcnn_deltas_list,
rcnn_target_matchs_list, rcnn_target_deltas_list):
"""
:param rcnn_class_logits_list:
:param rcnn_deltas_list:
:param rcnn_target_matchs_list:
:param rcnn_target_deltas_list:
:return:
"""
rcnn_class_loss = self.rcnn_class_loss(
rcnn_target_matchs_list, rcnn_class_logits_list)
rcnn_bbox_loss = self.rcnn_bbox_loss(
rcnn_target_deltas_list, rcnn_target_matchs_list, rcnn_deltas_list)
return rcnn_class_loss, rcnn_bbox_loss
def get_bboxes(self, rcnn_probs_list, rcnn_deltas_list, rois_list, img_metas):
'''
Args
---
rcnn_probs_list: List of [num_rois, num_classes]
rcnn_deltas_list: List of [num_rois, num_classes, (dy, dx, log(dh), log(dw))]
rois_list: List of [num_rois, (y1, x1, y2, x2)]
img_metas: [batch_size, 11]
Returns
---
detections_list: List of [num_detections, (y1, x1, y2, x2, class_id, score)]
coordinates are in pixel coordinates.
'''
pad_shapes = calc_pad_shapes(img_metas)
detections_list = [
self._get_bboxes_single(
rcnn_probs_list[i], rcnn_deltas_list[i], rois_list[i], pad_shapes[i])
for i in range(img_metas.shape[0])
]
return detections_list
def _get_bboxes_single(self, rcnn_probs, rcnn_deltas, rois, img_shape):
'''
Args
---
rcnn_probs: [num_rois, num_classes]
rcnn_deltas: [num_rois, num_classes, (dy, dx, log(dh), log(dw))]
rois: [num_rois, (y1, x1, y2, x2)]
img_shape: np.ndarray. [2]. (img_height, img_width)
'''
H, W = img_shape
# Class IDs per ROI
class_ids = tf.argmax(rcnn_probs, axis=1, output_type=tf.int32)
# Class probability of the top class of each ROI
indices = tf.stack([tf.range(rcnn_probs.shape[0]), class_ids], axis=1)
class_scores = tf.gather_nd(rcnn_probs, indices)
# Class-specific bounding box deltas
deltas_specific = tf.gather_nd(rcnn_deltas, indices)
# Apply bounding box deltas
# Shape: [num_rois, (y1, x1, y2, x2)] in normalized coordinates
refined_rois = transforms.delta2bbox(rois, deltas_specific, self.target_means, self.target_stds)
# Clip boxes to image window
refined_rois *= tf.constant([H, W, H, W], dtype=tf.float32)
window = tf.constant([0., 0., H * 1., W * 1.], dtype=tf.float32)
refined_rois = transforms.bbox_clip(refined_rois, window)
# Filter out background boxes
keep = tf.where(class_ids > 0)[:, 0]
# Filter out low confidence boxes
if self.min_confidence:
conf_keep = tf.where(class_scores >= self.min_confidence)[:, 0]
keep = tf.compat.v2.sets.intersection(tf.expand_dims(keep, 0),
tf.expand_dims(conf_keep, 0))
keep = tf.sparse.to_dense(keep)[0]
# Apply per-class NMS
# 1. Prepare variables
pre_nms_class_ids = tf.gather(class_ids, keep)
pre_nms_scores = tf.gather(class_scores, keep)
pre_nms_rois = tf.gather(refined_rois, keep)
unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0]
def nms_keep_map(class_id):
'''Apply Non-Maximum Suppression on ROIs of the given class.'''
# Indices of ROIs of the given class
ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0]
# Apply NMS
class_keep = tf.image.non_max_suppression(
tf.gather(pre_nms_rois, ixs),
tf.gather(pre_nms_scores, ixs),
max_output_size=self.max_instances,
iou_threshold=self.nms_threshold)
# Map indices
class_keep = tf.gather(keep, tf.gather(ixs, class_keep))
return class_keep
# 2. Map over class IDs
nms_keep = []
for i in range(unique_pre_nms_class_ids.shape[0]):
nms_keep.append(nms_keep_map(unique_pre_nms_class_ids[i]))
nms_keep = tf.concat(nms_keep, axis=0)
# 3. Compute intersection between keep and nms_keep
keep = tf.compat.v2.sets.intersection(tf.expand_dims(keep, 0),
tf.expand_dims(nms_keep, 0))
keep = tf.sparse.to_dense(keep)[0]
# Keep top detections
roi_count = self.max_instances
class_scores_keep = tf.gather(class_scores, keep)
num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count)
top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1]
keep = tf.gather(keep, top_ids)
detections = tf.concat([
tf.gather(refined_rois, keep),
tf.cast(tf.gather(class_ids, keep), tf.float32)[..., tf.newaxis],
tf.gather(class_scores, keep)[..., tf.newaxis]
], axis=1)
return detections
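The class-specific delta lookup in _get_bboxes_single (tf.stack of row indices with class ids, then tf.gather_nd) is easy to see on toy tensors; a standalone sketch:

import tensorflow as tf

num_rois, num_classes = 3, 4
rcnn_deltas = tf.reshape(tf.range(num_rois * num_classes * 4, dtype=tf.float32),
                         (num_rois, num_classes, 4))        # [num_rois, num_classes, 4]
class_ids = tf.constant([2, 0, 3], dtype=tf.int32)          # predicted class per RoI

indices = tf.stack([tf.range(num_rois), class_ids], axis=1)  # [[0, 2], [1, 0], [2, 3]]
deltas_specific = tf.gather_nd(rcnn_deltas, indices)         # [3, 4]: one delta row per RoI
print(deltas_specific.numpy())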
(14) faster_rcnn
# faster_rcnn
import tensorflow as tf
from detection.models.backbones import resnet
from detection.models.necks import fpn
from detection.models.rpn_heads import rpn_head
from detection.models.bbox_heads import bbox_head
from detection.models.roi_extractors import roi_align
from detection.models.detectors.test_mixins import RPNTestMixin, BBoxTestMixin
from detection.core.bbox import bbox_target
class FasterRCNN(tf.keras.Model, RPNTestMixin, BBoxTestMixin):
def __init__(self, num_classes, **kwargs):
super(FasterRCNN, self).__init__(**kwargs)
# Number of classes (including background)
self.NUM_CLASSES = num_classes
# RPN configuration
# Anchor attributes
self.ANCHOR_SCALES = (32, 64, 128, 256, 512) # anchor scales in pixels
self.ANCHOR_RATIOS = (0.5, 1, 2) # anchor aspect ratios (width/height)
self.ANCHOR_FEATURE_STRIDES = (4, 8, 16, 32, 64) # feature map strides relative to the input image
# Bounding box refinement mean and standard deviation
self.RPN_TARGET_MEANS = (0., 0., 0., 0.)
self.RPN_TARGET_STDS = (0.1, 0.1, 0.2, 0.2)
# RPN training configuration
self.PRN_BATCH_SIZE = 256
# fraction of positive (foreground) anchors per batch
self.RPN_POS_FRAC = 0.5
self.RPN_POS_IOU_THR = 0.7
self.RPN_NEG_IOU_THR = 0.3
# ROIs kept configuration
# keep 2000 proposals after NMS
self.PRN_PROPOSAL_COUNT = 2000
# NMS threshold for proposals
self.PRN_NMS_THRESHOLD = 0.7
# RCNN configuration
# Bounding box refinement mean and standard deviation
self.RCNN_TARGET_MEANS = (0., 0., 0., 0.)
self.RCNN_TARGET_STDS = (0.1, 0.1, 0.2, 0.2)
# ROI feature size: each RoI is pooled to 7x7
self.POOL_SIZE = (7, 7)
# RCNN training configuration
self.RCNN_BATCH_SIZE = 256
# fraction of positive (foreground) RoIs per batch
self.RCNN_POS_FRAC = 0.25
self.RCNN_POS_IOU_THR = 0.5
self.RCNN_NEG_IOU_THR = 0.5
# Boxes kept configuration
# minimum detection confidence
self.RCNN_MIN_CONFIDENCE = 0.7
self.RCNN_NME_THRESHOLD = 0.3
self.RCNN_MAX_INSTANCES = 100
# Target generator for the second stage.
self.bbox_target = bbox_target.ProposalTarget(
# target means
target_means=self.RCNN_TARGET_MEANS,
# target stds; the RPN and RCNN stds share the same values (0.1, 0.1, 0.2, 0.2)
target_stds=self.RCNN_TARGET_STDS,
# number of sampled RoIs per image
num_rcnn_deltas=self.RCNN_BATCH_SIZE,
# fraction of positive RoIs
positive_fraction=self.RCNN_POS_FRAC,
# positive IoU threshold
pos_iou_thr=self.RCNN_POS_IOU_THR,
# negative IoU threshold
neg_iou_thr=self.RCNN_NEG_IOU_THR)
# Modules: backbone, FPN neck, RPN head, RoIAlign, and the RCNN bbox head
# 1. Backbone network
self.backbone = resnet.ResNet(
depth=101,
name='res_net')
# 2. FPN neck
self.neck = fpn.FPN(
name='fpn')
# 3. RPN classification/regression head
self.rpn_head = rpn_head.RPNHead(
# anchor scales in pixels
anchor_scales=self.ANCHOR_SCALES,
# anchor aspect ratios (width/height)
anchor_ratios=self.ANCHOR_RATIOS,
# feature map strides relative to the input image
anchor_feature_strides=self.ANCHOR_FEATURE_STRIDES,
# number of proposals kept by the RPN
proposal_count=self.PRN_PROPOSAL_COUNT,
# NMS threshold for proposals
nms_threshold=self.PRN_NMS_THRESHOLD,
# bbox refinement target means
target_means=self.RPN_TARGET_MEANS,
# bbox refinement target stds
target_stds=self.RPN_TARGET_STDS,
# number of sampled anchors per image
num_rpn_deltas=self.PRN_BATCH_SIZE,
# fraction of positive anchors
positive_fraction=self.RPN_POS_FRAC,
# positive IoU threshold
pos_iou_thr=self.RPN_POS_IOU_THR,
# negative IoU threshold
neg_iou_thr=self.RPN_NEG_IOU_THR,
name='rpn_head')
# 4. RoIAlign layer
self.roi_align = roi_align.PyramidROIAlign(
# RoI pooling size
pool_shape=self.POOL_SIZE,
name='pyramid_roi_align')
# 5. RCNN classification/regression head
self.bbox_head = bbox_head.BBoxHead(
# number of classes
num_classes=self.NUM_CLASSES,
# RoI pooling size
pool_size=self.POOL_SIZE,
# RCNN target means
target_means=self.RCNN_TARGET_MEANS,
# RCNN target stds
target_stds=self.RCNN_TARGET_STDS,
# minimum detection confidence
min_confidence=self.RCNN_MIN_CONFIDENCE,
# NMS threshold for detections
nms_threshold=self.RCNN_NME_THRESHOLD,
max_instances=self.RCNN_MAX_INSTANCES,
name='b_box_head')
def call(self, inputs, training=True):
"""
:param inputs: [1, 1216, 1216, 3], [1, 11], [1, 14, 4], [1, 14]
:param training:
:return:
"""
if training: # training
imgs, img_metas, gt_boxes, gt_class_ids = inputs
else: # inference
imgs, img_metas = inputs
# Backbone output shapes (C2..C5):
# [1, 304, 304, 256] => [1, 152, 152, 512] => [1, 76, 76, 1024] => [1, 38, 38, 2048]
C2, C3, C4, C5 = self.backbone(imgs,
training=training)
# 每层输出的形状
# [1, 304, 304, 256] <= [1, 152, 152, 256]<=[1,76,76,256]<=[1,38,38,256]=>[1,19,19,256]
P2, P3, P4, P5, P6 = self.neck([C2, C3, C4, C5],
training=training)
# 用一个列表存储了各个输出,这里是要用到金字塔,就是用多个层
rpn_feature_maps = [P2, P3, P4, P5, P6]
rcnn_feature_maps = [P2, P3, P4, P5]
# [1, 369303, 2] [1, 369303, 2], [1, 369303, 4], includes all anchors on pyramid level of features
# 得到
rpn_class_logits, rpn_probs, rpn_deltas = self.rpn_head(
rpn_feature_maps, training=training)
# [369303, 4] => [215169, 4], valid => [6000, 4], performance =>[2000, 4], NMS
proposals_list = self.rpn_head.get_proposals(
rpn_probs, rpn_deltas, img_metas)
if training: # build the target labels and target deltas for these proposals
rois_list, rcnn_target_matchs_list, rcnn_target_deltas_list = \
self.bbox_target.build_targets(
proposals_list, gt_boxes, gt_class_ids, img_metas)
else:
rois_list = proposals_list
# rois_list only contains coordinates; rcnn_feature_maps holds the pyramid features => [192, 7, 7, 256]
pooled_regions_list = self.roi_align(
(rois_list, rcnn_feature_maps, img_metas), training=training)
# [192, 81], [192, 81], [192, 81, 4]
rcnn_class_logits_list, rcnn_probs_list, rcnn_deltas_list = \
self.bbox_head(pooled_regions_list, training=training)
if training:
rpn_class_loss, rpn_bbox_loss = self.rpn_head.loss(
rpn_class_logits, rpn_deltas, gt_boxes, gt_class_ids, img_metas)
rcnn_class_loss, rcnn_bbox_loss = self.bbox_head.loss(
rcnn_class_logits_list, rcnn_deltas_list,
rcnn_target_matchs_list, rcnn_target_deltas_list)
return [rpn_class_loss, rpn_bbox_loss,
rcnn_class_loss, rcnn_bbox_loss]
else:
detections_list = self.bbox_head.get_bboxes(
rcnn_probs_list, rcnn_deltas_list, rois_list, img_metas)
return detections_list
(15) test_mixins
# test_mixins
import numpy as np
import tensorflow as tf
from detection.core.bbox import transforms
from detection.utils.misc import *
class RPNTestMixin:
def simple_test_rpn(self, img, img_meta):
'''
Args
---
img: np.ndarray. [height, width, channel]
img_meta: np.ndarray. [11]
'''
imgs = tf.Variable(np.expand_dims(img, 0))
img_metas = tf.Variable(np.expand_dims(img_meta, 0))
x = self.backbone(imgs, training=False)
x = self.neck(x, training=False)
rpn_class_logits, rpn_probs, rpn_deltas = self.rpn_head(x, training=False)
proposals_list = self.rpn_head.get_proposals(
rpn_probs, rpn_deltas, img_metas, with_probs=False)
return proposals_list[0]
class BBoxTestMixin(object):
def _unmold_detections(self, detections_list, img_metas):
return [
self._unmold_single_detection(detections_list[i], img_metas[i])
for i in range(img_metas.shape[0])
]
def _unmold_single_detection(self, detections, img_meta):
zero_ix = tf.where(tf.not_equal(detections[:, 4], 0))
detections = tf.gather_nd(detections, zero_ix)
# Extract boxes, class_ids, scores, and class-specific masks
boxes = detections[:, :4]
class_ids = tf.cast(detections[:, 4], tf.int32)
scores = detections[:, 5]
boxes = transforms.bbox_mapping_back(boxes, img_meta)
return {'rois': boxes.numpy(),
'class_ids': class_ids.numpy(),
'scores': scores.numpy()}
def simple_test_bboxes(self, img, img_meta, proposals):
'''
Args
---
img: np.ndarray. [height, width, channel]
img_meta: np.ndarray. [11]
proposals: [num_proposals, (y1, x1, y2, x2)] in normalized coordinates
'''
imgs = tf.Variable(np.expand_dims(img, 0))
img_metas = tf.Variable(np.expand_dims(img_meta, 0))
rois_list = [tf.Variable(proposals)]
x = self.backbone(imgs, training=False)
P2, P3, P4, P5, _ = self.neck(x, training=False)
rcnn_feature_maps = [P2, P3, P4, P5]
pooled_regions_list = self.roi_align(
(rois_list, rcnn_feature_maps, img_metas), training=False)
rcnn_class_logits_list, rcnn_probs_list, rcnn_deltas_list = \
self.bbox_head(pooled_regions_list, training=False)
detections_list = self.bbox_head.get_bboxes(
rcnn_probs_list, rcnn_deltas_list, rois_list, img_metas)
return self._unmold_detections(detections_list, img_metas)[0]
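Since FasterRCNN mixes in both of these classes, single-image inference can also be driven directly through the helpers instead of call(). A minimal sketch, assuming model is a trained FasterRCNN and img / img_meta come from one preprocessed dataset sample (how that sample is obtained is not shown here):
# Hypothetical single-image inference through the test mixins.
proposals = model.simple_test_rpn(img, img_meta)  # [num_proposals, 4] in normalized coordinates
result = model.simple_test_bboxes(img, img_meta, proposals)
print(result['rois'].shape)        # [num_detections, 4], mapped back to the original image
print(result['class_ids'].shape)   # [num_detections]
print(result['scores'].shape)      # [num_detections]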
(16) fpn
'''
FPN model for Keras.
# Reference:
- [Feature Pyramid Networks for Object Detection](
https://arxiv.org/abs/1612.03144)
'''
# fpn
import tensorflow as tf
from tensorflow.keras import layers
class FPN(tf.keras.Model):
def __init__(self, out_channels=256, **kwargs):
'''
Feature Pyramid Networks
Attributes
---
out_channels: int. the channels of pyramid feature maps.
'''
super(FPN, self).__init__(**kwargs)
self.out_channels = out_channels
self.fpn_c2p2 = layers.Conv2D(out_channels, (1, 1),
kernel_initializer='he_normal', name='fpn_c2p2')
self.fpn_c3p3 = layers.Conv2D(out_channels, (1, 1),
kernel_initializer='he_normal', name='fpn_c3p3')
self.fpn_c4p4 = layers.Conv2D(out_channels, (1, 1),
kernel_initializer='he_normal', name='fpn_c4p4')
self.fpn_c5p5 = layers.Conv2D(out_channels, (1, 1),
kernel_initializer='he_normal', name='fpn_c5p5')
self.fpn_p3upsampled = layers.UpSampling2D(size=(2, 2), name='fpn_p3upsampled')
self.fpn_p4upsampled = layers.UpSampling2D(size=(2, 2), name='fpn_p4upsampled')
self.fpn_p5upsampled = layers.UpSampling2D(size=(2, 2), name='fpn_p5upsampled')
self.fpn_p2 = layers.Conv2D(out_channels, (3, 3), padding='SAME',
kernel_initializer='he_normal', name='fpn_p2')
self.fpn_p3 = layers.Conv2D(out_channels, (3, 3), padding='SAME',
kernel_initializer='he_normal', name='fpn_p3')
self.fpn_p4 = layers.Conv2D(out_channels, (3, 3), padding='SAME',
kernel_initializer='he_normal', name='fpn_p4')
self.fpn_p5 = layers.Conv2D(out_channels, (3, 3), padding='SAME',
kernel_initializer='he_normal', name='fpn_p5')
self.fpn_p6 = layers.MaxPooling2D(pool_size=(1, 1), strides=2, name='fpn_p6')
def call(self, inputs, training=True):
C2, C3, C4, C5 = inputs
P5 = self.fpn_c5p5(C5)
P4 = self.fpn_c4p4(C4) + self.fpn_p5upsampled(P5)
P3 = self.fpn_c3p3(C3) + self.fpn_p4upsampled(P4)
P2 = self.fpn_c2p2(C2) + self.fpn_p3upsampled(P3)
# Attach 3x3 conv to all P layers to get the final feature maps.
P2 = self.fpn_p2(P2)
P3 = self.fpn_p3(P3)
P4 = self.fpn_p4(P4)
P5 = self.fpn_p5(P5)
# subsampling from P5 with stride of 2.
P6 = self.fpn_p6(P5)
return [P2, P3, P4, P5, P6]
def compute_output_shape(self, input_shape):
C2_shape, C3_shape, C4_shape, C5_shape = input_shape
C2_shape, C3_shape, C4_shape, C5_shape = \
C2_shape.as_list(), C3_shape.as_list(), C4_shape.as_list(), C5_shape.as_list()
C6_shape = [C5_shape[0], (C5_shape[1] + 1) // 2, (C5_shape[2] + 1) // 2, self.out_channels]
C2_shape[-1] = self.out_channels
C3_shape[-1] = self.out_channels
C4_shape[-1] = self.out_channels
C5_shape[-1] = self.out_channels
return [tf.TensorShape(C2_shape),
tf.TensorShape(C3_shape),
tf.TensorShape(C4_shape),
tf.TensorShape(C5_shape),
tf.TensorShape(C6_shape)]
if __name__ == '__main__':
C2 = tf.random.normal((2, 256, 256, 256))
C3 = tf.random.normal((2, 128, 128, 512))
C4 = tf.random.normal((2, 64, 64, 1024))
C5 = tf.random.normal((2, 32, 32, 2048))
fpn = FPN()
P2, P3, P4, P5, P6 = fpn([C2, C3, C4, C5])
print('P2 shape:', P2.shape.as_list())
print('P3 shape:', P3.shape.as_list())
print('P4 shape:', P4.shape.as_list())
print('P5 shape:', P5.shape.as_list())
print('P6 shape:', P6.shape.as_list())
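For reference, the expected output of this quick test follows directly from the layer definitions: the 1x1 lateral and 3x3 SAME convolutions keep the spatial size of each input level, and fpn_p6 halves P5.
# Expected printout (channel count fixed at out_channels=256):
# P2 shape: [2, 256, 256, 256]
# P3 shape: [2, 128, 128, 256]
# P4 shape: [2, 64, 64, 256]
# P5 shape: [2, 32, 32, 256]
# P6 shape: [2, 16, 16, 256]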
(17) roi_align
# roi_align
import tensorflow as tf
from detection.utils.misc import *
class PyramidROIAlign(tf.keras.layers.Layer):
def __init__(self, pool_shape, **kwargs):
'''
Implements ROI Pooling on multiple levels of the feature pyramid.
Attributes
---
pool_shape: (height, width) of the output pooled regions.
Example: (7, 7)
'''
super(PyramidROIAlign, self).__init__(**kwargs)
self.pool_shape = tuple(pool_shape)
def call(self, inputs, training=True):
'''
Args
---
rois_list: list of [num_rois, (y1, x1, y2, x2)] in normalized coordinates.
feature_map_list: List of [batch, height, width, channels].
feature maps from different levels of the pyramid.
img_metas: [batch_size, 11]
Returns
---
pooled_rois_list: list of [num_rois, pooled_height, pooled_width, channels].
The width and height are those specific in the pool_shape in the layer
constructor.
'''
rois_list, feature_map_list, img_metas = inputs # [2000 ,4], list:[P2, P3, P4, P5]
pad_shapes = calc_pad_shapes(img_metas)
pad_areas = pad_shapes[:, 0] * pad_shapes[:, 1] # 1216*1216
num_rois_list = [rois.shape.as_list()[0] for rois in rois_list] # data:[2000]
roi_indices = tf.constant(
[i for i in range(len(rois_list)) for _ in range(rois_list[i].shape.as_list()[0])],
dtype=tf.int32
) #[0.....], shape:[2000]
areas = tf.constant(  # one padded-image area value repeated for every ROI
[pad_areas[i] for i in range(pad_areas.shape[0]) for _ in range(num_rois_list[i])],
dtype=tf.float32
)#[1216*1216, 1216*1216,...], shape:[2000]
rois = tf.concat(rois_list, axis=0) # [2000, 4]
# Assign each ROI to a level in the pyramid based on the ROI area.
y1, x1, y2, x2 = tf.split(rois, 4, axis=1) # 4 of [2000, 1]
h = y2 - y1 # [2000, 1]
w = x2 - x1 # [2000, 1]
# Equation 1 in the Feature Pyramid Networks paper. Account for
# the fact that our coordinates are normalized here.
# e.g. a 224x224 ROI (in pixels) maps to P4
roi_level = tf.math.log( # [2000]
tf.sqrt(tf.squeeze(h * w, 1))
/ tf.cast((224.0 / tf.sqrt(areas * 1.0)), tf.float32)
) / tf.math.log(2.0)
roi_level = tf.minimum(5, tf.maximum( # [2000], clamp to [2-5]
2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
# roi_level indicates which pyramid level each ROI should be pooled from
# Loop through levels and apply ROI pooling to each. P2 to P5.
pooled_rois = []
roi_to_level = []
for i, level in enumerate(range(2, 6)): # 2,3,4,5
ix = tf.where(tf.equal(roi_level, level)) # [1999, 1], means 1999 of 2000 select P2
level_rois = tf.gather_nd(rois, ix) # boxes to crop, [1999, 4]
# ROI indices for crop_and_resize.
level_roi_indices = tf.gather_nd(roi_indices, ix) # [1999], data: [0, ..., 0]
# Keep track of which roi is mapped to which level
roi_to_level.append(ix)
# Stop gradient propagation to ROI proposals
level_rois = tf.stop_gradient(level_rois)
level_roi_indices = tf.stop_gradient(level_roi_indices)
# Crop and Resize
# From Mask R-CNN paper: "We sample four regular locations, so
# that we can evaluate either max or average pooling. In fact,
# interpolating only a single value at each bin center (without
# pooling) is nearly as effective."
#
# Here we use the simplified approach of a single value per bin,
# which is how it's done in tf.image.crop_and_resize()
# Result: [batch * num_rois, pool_height, pool_width, channels]
pooled_rois.append(tf.image.crop_and_resize(
feature_map_list[i], level_rois, level_roi_indices, self.pool_shape,
method="bilinear")) # [1, 304, 304, 256], [1999, 4], [1999], [2]=[7,7]=>[1999,7,7,256]
# [1999, 7, 7, 256], [], [], [1,7,7,256] => [2000, 7, 7, 256]
# Pack pooled features into one tensor
pooled_rois = tf.concat(pooled_rois, axis=0)
# Pack roi_to_level mapping into one array and add another
# column representing the order of pooled rois
roi_to_level = tf.concat(roi_to_level, axis=0) # [2000, 1], 1999 of P2, and 1 other P
roi_range = tf.expand_dims(tf.range(tf.shape(roi_to_level)[0]), 1) # [2000, 1], 0~1999
roi_to_level = tf.concat([tf.cast(roi_to_level, tf.int32), roi_range],
axis=1) # [2000, 2], (P, range)
# Rearrange pooled features to match the order of the original rois
# Sort roi_to_level by batch index, then by roi index.
# TF doesn't have a way to sort by two columns, so merge them into a single key and sort.
sorting_tensor = roi_to_level[:, 0] * 100000 + roi_to_level[:, 1]
ix = tf.nn.top_k(sorting_tensor, k=tf.shape( # k=2000
roi_to_level)[0]).indices[::-1]# reverse the order
ix = tf.gather(roi_to_level[:, 1], ix) # [2000]
pooled_rois = tf.gather(pooled_rois, ix) # [2000, 7, 7, 256]
# 2000 of [7, 7, 256]
pooled_rois_list = tf.split(pooled_rois, num_rois_list, axis=0)
return pooled_rois_list
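The level assignment above is Equation 1 of the FPN paper rewritten for normalized box coordinates: since h and w are fractions of the padded image, sqrt(h*w)*sqrt(pad_area) is the ROI size in pixels, and the ROI is routed to level clamp(4 + round(log2(roi_size / 224)), 2, 5). A small standalone sketch of the same rule (fpn_level is a hypothetical helper written only for illustration):
import math

def fpn_level(h_norm, w_norm, pad_area, k0=4):
    # ROI size in pixels = sqrt(h*w) * sqrt(pad_area), with h, w normalized
    roi_px = math.sqrt(h_norm * w_norm * pad_area)
    level = k0 + round(math.log2(roi_px / 224.0))
    return min(5, max(2, level))

# A 224x224-pixel ROI in a 1216x1216 padded image maps to P4;
# a 56x56 ROI maps to P2 and a 900x900 ROI clamps to P5.
print(fpn_level(224/1216, 224/1216, 1216*1216))  # 4
print(fpn_level(56/1216, 56/1216, 1216*1216))    # 2
print(fpn_level(900/1216, 900/1216, 1216*1216))  # 5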
(18) rpn_head
# rpn_head
import tensorflow as tf
from tensorflow.keras import layers
from detection.core.bbox import transforms
from detection.utils.misc import *
from detection.core.anchor import anchor_generator, anchor_target
from detection.core.loss import losses
class RPNHead(tf.keras.Model):
def __init__(self,
anchor_scales=(32, 64, 128, 256, 512),
anchor_ratios=(0.5, 1, 2),
anchor_feature_strides=(4, 8, 16, 32, 64),
proposal_count=2000,
nms_threshold=0.7,
target_means=(0., 0., 0., 0.),
target_stds=(0.1, 0.1, 0.2, 0.2),
num_rpn_deltas=256,
positive_fraction=0.5,
pos_iou_thr=0.7,
neg_iou_thr=0.3,
**kwags):
'''
Network head of Region Proposal Network.
/ - rpn_cls (1x1 conv)
input - rpn_conv (3x3 conv) -
\ - rpn_reg (1x1 conv)
Attributes
---
anchor_scales: 1D array of anchor sizes in pixels.
anchor_ratios: 1D array of anchor ratios of width/height.
anchor_feature_strides: Stride of the feature map relative
to the image in pixels.
proposal_count: int. RPN proposals kept after non-maximum
suppression.
nms_threshold: float. Non-maximum suppression threshold to
filter RPN proposals.
target_means: [4] Bounding box refinement mean.
target_stds: [4] Bounding box refinement standard deviation.
num_rpn_deltas: int.
positive_fraction: float.
pos_iou_thr: float.
neg_iou_thr: float.
'''
super(RPNHead, self).__init__(**kwags)
self.proposal_count = proposal_count
self.nms_threshold = nms_threshold
self.target_means = target_means
self.target_stds = target_stds
self.generator = anchor_generator.AnchorGenerator(
scales=anchor_scales,
ratios=anchor_ratios,
feature_strides=anchor_feature_strides)
self.anchor_target = anchor_target.AnchorTarget(
target_means=target_means,
target_stds=target_stds,
num_rpn_deltas=num_rpn_deltas,
positive_fraction=positive_fraction,
pos_iou_thr=pos_iou_thr,
neg_iou_thr=neg_iou_thr)
self.rpn_class_loss = losses.rpn_class_loss
self.rpn_bbox_loss = losses.rpn_bbox_loss
# Shared convolutional base of the RPN
self.rpn_conv_shared = layers.Conv2D(512, (3, 3), padding='same',
kernel_initializer='he_normal',
name='rpn_conv_shared')
self.rpn_class_raw = layers.Conv2D(len(anchor_ratios) * 2, (1, 1),
kernel_initializer='he_normal',
name='rpn_class_raw')
self.rpn_delta_pred = layers.Conv2D(len(anchor_ratios) * 4, (1, 1),
kernel_initializer='he_normal',
name='rpn_bbox_pred')
def call(self, inputs, training=True):
'''
Args
---
inputs: list of [batch_size, feat_map_height, feat_map_width, channels]
feature maps, one entry per pyramid level.
Returns
---
rpn_class_logits: [batch_size, num_anchors, 2]
rpn_probs: [batch_size, num_anchors, 2]
rpn_deltas: [batch_size, num_anchors, 4]
'''
layer_outputs = []
for feat in inputs: # iterate over the pyramid feature maps
"""
(1, 304, 304, 256)
(1, 152, 152, 256)
(1, 76, 76, 256)
(1, 38, 38, 256)
(1, 19, 19, 256)
rpn_class_raw: (1, 304, 304, 6)
rpn_class_logits: (1, 277248, 2)
rpn_delta_pred: (1, 304, 304, 12)
rpn_deltas: (1, 277248, 4)
rpn_class_raw: (1, 152, 152, 6)
rpn_class_logits: (1, 69312, 2)
rpn_delta_pred: (1, 152, 152, 12)
rpn_deltas: (1, 69312, 4)
rpn_class_raw: (1, 76, 76, 6)
rpn_class_logits: (1, 17328, 2)
rpn_delta_pred: (1, 76, 76, 12)
rpn_deltas: (1, 17328, 4)
rpn_class_raw: (1, 38, 38, 6)
rpn_class_logits: (1, 4332, 2)
rpn_delta_pred: (1, 38, 38, 12)
rpn_deltas: (1, 4332, 4)
rpn_class_raw: (1, 19, 19, 6)
rpn_class_logits: (1, 1083, 2)
rpn_delta_pred: (1, 19, 19, 12)
rpn_deltas: (1, 1083, 4)
"""
# print(feat.shape)
shared = self.rpn_conv_shared(feat)
shared = tf.nn.relu(shared)
x = self.rpn_class_raw(shared)
# print('rpn_class_raw:', x.shape)
rpn_class_logits = tf.reshape(x, [tf.shape(x)[0], -1, 2])
rpn_probs = tf.nn.softmax(rpn_class_logits)
# print('rpn_class_logits:', rpn_class_logits.shape)
x = self.rpn_delta_pred(shared)
# print('rpn_delta_pred:', x.shape)
rpn_deltas = tf.reshape(x, [tf.shape(x)[0], -1, 4])
# print('rpn_deltas:', rpn_deltas.shape)
layer_outputs.append([rpn_class_logits, rpn_probs, rpn_deltas])
# print(rpn_class_logits.shape, rpn_probs.shape, rpn_deltas.shape)
"""
(1, 277248, 2) (1, 277248, 2) (1, 277248, 4)
(1, 69312, 2) (1, 69312, 2) (1, 69312, 4)
(1, 17328, 2) (1, 17328, 2) (1, 17328, 4)
(1, 4332, 2) (1, 4332, 2) (1, 4332, 4)
(1, 1083, 2) (1, 1083, 2) (1, 1083, 4)
"""
outputs = list(zip(*layer_outputs))
outputs = [tf.concat(list(o), axis=1) for o in outputs]
rpn_class_logits, rpn_probs, rpn_deltas = outputs
# (1, 369303, 2) (1, 369303, 2) (1, 369303, 4)
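# Where 369303 comes from: 3 anchors (one per ratio) at every position of every
# pyramid level: 304^2*3 + 152^2*3 + 76^2*3 + 38^2*3 + 19^2*3
# = 277248 + 69312 + 17328 + 4332 + 1083 = 369303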
# print(rpn_class_logits.shape, rpn_probs.shape, rpn_deltas.shape)
return rpn_class_logits, rpn_probs, rpn_deltas
def loss(self, rpn_class_logits, rpn_deltas, gt_boxes, gt_class_ids, img_metas):
"""
:param rpn_class_logits: [batch_size, num_anchors, 2]
:param rpn_deltas: [batch_size, num_anchors, 4]
:param gt_boxes: [batch_size, num_gt_boxes, 4]
:param gt_class_ids: [batch_size, num_gt_boxes]
:param img_metas: [batch_size, 11]
:return:
"""
# valid_flags indicates anchors located in padded area or not.
anchors, valid_flags = self.generator.generate_pyramid_anchors(img_metas)
# build classification and regression targets for the sampled anchors
rpn_target_matchs, rpn_target_deltas = self.anchor_target.build_targets(
anchors, valid_flags, gt_boxes, gt_class_ids)
rpn_class_loss = self.rpn_class_loss(
rpn_target_matchs, rpn_class_logits)
rpn_bbox_loss = self.rpn_bbox_loss(
rpn_target_deltas, rpn_target_matchs, rpn_deltas)
return rpn_class_loss, rpn_bbox_loss
def get_proposals(self,
rpn_probs,
rpn_deltas,
img_metas,
with_probs=False):
'''
Calculate proposals.
Args
---
rpn_probs: [batch_size, num_anchors, (bg prob, fg prob)]
rpn_deltas: [batch_size, num_anchors, (dy, dx, log(dh), log(dw))]
img_metas: [batch_size, 11]
with_probs: bool.
Returns
---
proposals_list: list of [num_proposals, (y1, x1, y2, x2)] in
normalized coordinates if with_probs is False.
Otherwise, the shape of proposals in proposals_list is
[num_proposals, (y1, x1, y2, x2, score)]
Note that num_proposals is no more than proposal_count. And different
images in one batch may have different num_proposals.
'''
anchors, valid_flags = self.generator.generate_pyramid_anchors(img_metas)
# [369303, 4], [b, 11]
# [b, N, (background prob, foreground prob)], get anchor's foreground prob, [1, 369303]
rpn_probs = rpn_probs[:, :, 1]
# [[1216, 1216]]
pad_shapes = calc_pad_shapes(img_metas)
proposals_list = [
self._get_proposals_single(
rpn_probs[i], rpn_deltas[i], anchors, valid_flags[i], pad_shapes[i], with_probs)
for i in range(img_metas.shape[0])
]
return proposals_list
def _get_proposals_single(self,
rpn_probs,
rpn_deltas,
anchors,
valid_flags,
img_shape,
with_probs):
'''
Calculate proposals.
Args
---
rpn_probs: [num_anchors]
rpn_deltas: [num_anchors, (dy, dx, log(dh), log(dw))]
anchors: [num_anchors, (y1, x1, y2, x2)] anchors defined in
pixel coordinates.
valid_flags: [num_anchors]
img_shape: np.ndarray. [2]. (img_height, img_width)
with_probs: bool.
Returns
---
proposals: [num_proposals, (y1, x1, y2, x2)] in normalized
coordinates.
'''
H, W = img_shape
# filter invalid anchors, int => bool
valid_flags = tf.cast(valid_flags, tf.bool)
# [369303] => [215169], respectively
rpn_probs = tf.boolean_mask(rpn_probs, valid_flags)
rpn_deltas = tf.boolean_mask(rpn_deltas, valid_flags)
anchors = tf.boolean_mask(anchors, valid_flags)
# Improve performance
pre_nms_limit = min(6000, anchors.shape[0]) # min(6000, 215169) => 6000
ix = tf.nn.top_k(rpn_probs, pre_nms_limit, sorted=True).indices
# [215169] => [6000], respectively
rpn_probs = tf.gather(rpn_probs, ix)
rpn_deltas = tf.gather(rpn_deltas, ix)
anchors = tf.gather(anchors, ix)
# Get refined anchors, => [6000, 4]
proposals = transforms.delta2bbox(anchors, rpn_deltas,
self.target_means, self.target_stds)
# clipping to valid area, [6000, 4]
window = tf.constant([0., 0., H, W], dtype=tf.float32)
proposals = transforms.bbox_clip(proposals, window)
# Normalize, (y1, x1, y2, x2)
proposals = proposals / tf.constant([H, W, H, W], dtype=tf.float32)
# NMS, indices: [2000]
indices = tf.image.non_max_suppression(
proposals, rpn_probs, self.proposal_count, self.nms_threshold)
proposals = tf.gather(proposals, indices) # [2000, 4]
if with_probs:
proposal_probs = tf.expand_dims(tf.gather(rpn_probs, indices), axis=1)
proposals = tf.concat([proposals, proposal_probs], axis=1)
return proposals
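tf.image.non_max_suppression only returns the indices of the boxes to keep, which is why _get_proposals_single follows it with tf.gather. A tiny self-contained example of that pattern with made-up boxes:
import tensorflow as tf

# Three boxes in (y1, x1, y2, x2); the first two overlap heavily, the third is separate.
boxes = tf.constant([[0.0, 0.0, 0.5, 0.5],
                     [0.02, 0.02, 0.52, 0.52],
                     [0.6, 0.6, 0.9, 0.9]])
scores = tf.constant([0.9, 0.8, 0.7])
keep = tf.image.non_max_suppression(boxes, scores, max_output_size=2000, iou_threshold=0.7)
print(tf.gather(boxes, keep).numpy())  # the second box is suppressed; boxes 0 and 2 remain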
(19) misc
# misc
import tensorflow as tf
def trim_zeros(boxes, name=None):
'''
Often boxes are represented with matrices of shape [N, 4] and
are padded with zeros. This removes zero boxes.
Args
---
boxes: [N, 4] matrix of boxes, possibly padded with all-zero rows.
Returns
---
boxes: [M, 4] the non-zero boxes.
non_zeros: [N] a 1D boolean mask identifying the rows that were kept.
'''
non_zeros = tf.cast(tf.reduce_sum(tf.abs(boxes), axis=1), tf.bool)
boxes = tf.boolean_mask(boxes, non_zeros, name=name)
return boxes, non_zeros
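# Example: for boxes = [[10., 20., 30., 40.], [0., 0., 0., 0.]] the second row sums to 0,
# so non_zeros = [True, False] and only the first box is returned.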
def parse_image_meta(meta):
'''
Parses a tensor that contains image attributes to its components.
Args
---
meta: [..., 11]
Returns
---
a dict of the parsed tensors.
'''
meta = meta.numpy()
ori_shape = meta[..., 0:3]
img_shape = meta[..., 3:6]
pad_shape = meta[..., 6:9]
scale = meta[..., 9]
flip = meta[..., 10]
return {
'ori_shape': ori_shape,
'img_shape': img_shape,
'pad_shape': pad_shape,
'scale': scale,
'flip': flip
}
def calc_batch_padded_shape(meta):
'''
Args
---
meta: [batch_size, 11]
Returns
---
np.ndarray. Tuple of (height, width)
'''
return tf.cast(tf.reduce_max(meta[:, 6:8], axis=0), tf.int32).numpy()
def calc_img_shapes(meta):
'''
Args
---
meta: [..., 11]
Returns
---
np.ndarray. [..., (height, width)]
'''
return tf.cast(meta[..., 3:5], tf.int32).numpy()
def calc_pad_shapes(meta):
'''
Args
---
meta: [..., 11]
Returns
---
np.ndarray. [..., (height, width)]
'''
return tf.cast(meta[..., 6:8], tf.int32).numpy()
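To make the 11-element img_meta vector concrete, here is a hypothetical example; the numeric values are made up, only the slicing layout comes from the functions above (assumed to be in scope):
import tensorflow as tf

# (ori_h, ori_w, ori_c, img_h, img_w, img_c, pad_h, pad_w, pad_c, scale, flip)
meta = tf.constant([[480., 640., 3., 800., 1067., 3., 1216., 1216., 3., 1.667, 0.]])
print(parse_image_meta(meta)['scale'])   # [1.667]
print(calc_img_shapes(meta))             # [[ 800 1067]]
print(calc_pad_shapes(meta))             # [[1216 1216]]
print(calc_batch_padded_shape(meta))     # [1216 1216]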