This post focuses on the rpn_heads file referenced in Tensorflow2.0:Faster RCNN 代码详解(一). The code in that file implements the RPN (Region Proposal Network) of Faster RCNN.
import tensorflow as tf
from tensorflow.keras import layers
from detection.core.anchor import anchor_generator, anchor_target
from detection.core.loss import losses
from detection.core.bbox import transforms
from detection.utils.misc import *
As the imports above show, the rpn_heads file pulls in losses, anchor_generator, anchor_target, transforms, and misc. Of these, losses, anchor_generator, and anchor_target are the key pieces; they are analyzed in the remaining parts.
class RPNHead(tf.keras.Model):
def __init__(self,
anchor_scales=(32, 64, 128, 256, 512),
anchor_ratios=(0.5, 1, 2),
anchor_feature_strides=(4, 8, 16, 32, 64),
proposal_count=2000,
nms_threshold=0.7,
target_means=(0., 0., 0., 0.),
target_stds=(0.1, 0.1, 0.2, 0.2),
num_rpn_deltas=256,
positive_fraction=0.5,
pos_iou_thr=0.7,
neg_iou_thr=0.3,
                 **kwargs):
'''
Network head of Region Proposal Network.
/ - rpn_cls (1x1 conv)
input - rpn_conv (3x3 conv) -
\ - rpn_reg (1x1 conv)
Attributes
---
anchor_scales: 1D array of anchor sizes in pixels.
anchor_ratios: 1D array of anchor ratios of width/height.
anchor_feature_strides: Stride of the feature map relative
to the image in pixels.
proposal_count: int. RPN proposals kept after non-maximum
suppression.
nms_threshold: float. Non-maximum suppression threshold to
filter RPN proposals.
target_means: [4] Bounding box refinement mean.
target_stds: [4] Bounding box refinement standard deviation.
num_rpn_deltas: int.
positive_fraction: float.
pos_iou_thr: float.
neg_iou_thr: float.
'''
        super(RPNHead, self).__init__(**kwargs)
self.proposal_count = proposal_count
self.nms_threshold = nms_threshold
self.target_means = target_means
self.target_stds = target_stds
self.generator = anchor_generator.AnchorGenerator(
scales=anchor_scales,
ratios=anchor_ratios,
feature_strides=anchor_feature_strides)
self.anchor_target = anchor_target.AnchorTarget(
target_means=target_means,
target_stds=target_stds,
num_rpn_deltas=num_rpn_deltas,
positive_fraction=positive_fraction,
pos_iou_thr=pos_iou_thr,
neg_iou_thr=neg_iou_thr)
self.rpn_class_loss = losses.rpn_class_loss
self.rpn_bbox_loss = losses.rpn_bbox_loss
# Shared convolutional base of the RPN
self.rpn_conv_shared = layers.Conv2D(512, (3, 3), padding='same',
kernel_initializer='he_normal',
name='rpn_conv_shared')
self.rpn_class_raw = layers.Conv2D(len(anchor_ratios) * 2, (1, 1),
kernel_initializer='he_normal',
name='rpn_class_raw')
self.rpn_delta_pred = layers.Conv2D(len(anchor_ratios) * 4, (1, 1),
kernel_initializer='he_normal',
name='rpn_bbox_pred')
def call(self, inputs, training=True):
'''
Args
---
        inputs: list of [batch_size, feat_map_height, feat_map_width, channels]
            feature maps, one entry per pyramid level.
Returns
---
rpn_class_logits: [batch_size, num_anchors, 2]
rpn_probs: [batch_size, num_anchors, 2]
rpn_deltas: [batch_size, num_anchors, 4]
'''
layer_outputs = []
        for feat in inputs:  # iterate over every level of the pyramid feature maps
            """
            # The five levels of feature maps:
            (1, 304, 304, 256)
            (1, 152, 152, 256)
            (1, 76, 76, 256)
            (1, 38, 38, 256)
            (1, 19, 19, 256)
            Taking one feature map (1, 304, 304, 256) as an example:
            class branch:
            rpn_class_raw: (1, 304, 304, 6)   # each cell has 3 anchors, and each anchor gets two scores: one foreground, one background
            rpn_class_logits: (1, 277248, 2)  # the two class scores of every anchor
            box branch:
            rpn_delta_pred: (1, 304, 304, 12) # each anchor gets four numbers encoding its box refinement
            rpn_deltas: (1, 277248, 4)        # the four offsets of every anchor
            """
            # The feature map keeps its spatial size; the shared conv outputs 512 channels
            shared = self.rpn_conv_shared(feat)
            shared = tf.nn.relu(shared)
            # Class branch: raw per-anchor scores
            x = self.rpn_class_raw(shared)  # (1, 304, 304, 6)
            rpn_class_logits = tf.reshape(x, [tf.shape(x)[0], -1, 2])  # (1, 277248, 2)
            # Compute the foreground and background probability of every anchor
            # Method: tf.nn.softmax normalizes along the last axis, i.e. over each pair of logits
            rpn_probs = tf.nn.softmax(rpn_class_logits)
            # Box branch: the predicted offsets of every anchor
            x = self.rpn_delta_pred(shared)
            rpn_deltas = tf.reshape(x, [tf.shape(x)[0], -1, 4])
layer_outputs.append([rpn_class_logits, rpn_probs, rpn_deltas])
"""
Return:
(1, 277248, 2) (1, 277248, 2) (1, 277248, 4)
(1, 69312, 2) (1, 69312, 2) (1, 69312, 4)
(1, 17328, 2) (1, 17328, 2) (1, 17328, 4)
(1, 4332, 2) (1, 4332, 2) (1, 4332, 4)
(1, 1083, 2) (1, 1083, 2) (1, 1083, 4)
"""
        # Convolving each feature map yields, for every anchor, its class logits,
        # class probabilities, and box deltas (dy, dx, log(dh), log(dw)).
        # Why zip helps here: layer_outputs is five lists of three tensors each;
        # zip(*layer_outputs) regroups them into three tuples of five tensors,
        # one tuple per output type (the * unpacks layer_outputs into zip's arguments)
        outputs = list(zip(*layer_outputs))
outputs = [tf.concat(list(o), axis=1) for o in outputs]
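        # A minimal sketch of the regrouping (hypothetical tensor names for the five levels):
        #   layer_outputs = [[logits_P2, probs_P2, deltas_P2], ..., [logits_P6, probs_P6, deltas_P6]]
        #   list(zip(*layer_outputs)) -> [(logits_P2, ..., logits_P6),
        #                                 (probs_P2, ..., probs_P6),
        #                                 (deltas_P2, ..., deltas_P6)]
        #   tf.concat(..., axis=1) then stitches each tuple along the anchor axis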
rpn_class_logits, rpn_probs, rpn_deltas = outputs
        # Output formats: [batch_size, num_anchors, class], [batch_size, num_anchors, probability],
        # [batch_size, num_anchors, (dy, dx, log(dh), log(dw))]
        # Output shapes: (1, 369303, 2) (1, 369303, 2) (1, 369303, 4)
return rpn_class_logits, rpn_probs, rpn_deltas
    # Compute the losses
    def loss(self, rpn_class_logits, rpn_deltas, gt_boxes, gt_class_ids, img_metas):
        """
        :param rpn_class_logits: [batch_size, num_anchors, 2]
        :param rpn_deltas: [batch_size, num_anchors, 4]
        :param gt_boxes: [batch_size, num_gt_boxes, 4]
        :param gt_class_ids: [batch_size, num_gt_boxes]
        :param img_metas: [batch_size, 11]
        :return: rpn_class_loss, rpn_bbox_loss
        """
        # 1. Generate the coordinates of all anchor boxes from the feature maps
        # Specifically: generate every anchor box on the feature maps, together with a
        # valid_flags label marking whether the box crosses the image boundary
        # (0 = invalid, crosses the boundary; 1 = valid)
        # Output shapes: anchors: [num_anchors, 4=(y1, x1, y2, x2)]
        #                valid_flags: [batch_size, num_anchors]
anchors, valid_flags = self.generator.generate_pyramid_anchors(img_metas)
        # 2. Compute the ground-truth targets t*
        # Specifically: for each image in the batch, randomly sample 256 anchors,
        # ideally half negative and half positive, and compute the four offsets
        # (dy, dx, log(dh), log(dw)) between each positive anchor and its GT box
        # Output shapes: rpn_target_matchs: [batch_size, num_anchors]
        #                (1 = positive anchor, -1 = negative anchor, 0 = neutral anchor)
        #                rpn_target_deltas: [batch_size, num_rpn_deltas, 4=(dy, dx, log(dh), log(dw))]
rpn_target_matchs, rpn_target_deltas = self.anchor_target.build_targets(
anchors, valid_flags, gt_boxes, gt_class_ids)
        # 3. Compute the classification loss and the box-regression loss
        # Classification loss
        rpn_class_loss = self.rpn_class_loss(rpn_target_matchs, rpn_class_logits)
        # Box-regression loss on the offsets
        rpn_bbox_loss = self.rpn_bbox_loss(rpn_target_deltas, rpn_target_matchs, rpn_deltas)
return rpn_class_loss, rpn_bbox_loss
    # 4. Output the proposal regions (assembled from the proposals of each individual image)
def get_proposals(self, rpn_probs, rpn_deltas, img_metas, with_probs=False):
'''
Calculate proposals.
Args
---
rpn_probs: [batch_size, num_anchors, (bg prob, fg prob)]
rpn_deltas: [batch_size, num_anchors, (dy, dx, log(dh), log(dw))]
img_metas: [batch_size, 11]
with_probs: bool.
Returns
---
proposals_list: list of [num_proposals, (y1, x1, y2, x2)] in
normalized coordinates if with_probs is False.
Otherwise, the shape of proposals in proposals_list is
[num_proposals, (y1, x1, y2, x2, score)]
Note that num_proposals is no more than proposal_count. And different
images in one batch may have different num_proposals.
'''
anchors, valid_flags = self.generator.generate_pyramid_anchors(img_metas)
# [369303, 4], [b, 11]
# [b, N, (background prob, foreground prob)], get anchor's foreground prob, [1, 369303]
rpn_probs = rpn_probs[:, :, 1]
# [[1216, 1216]]
pad_shapes = calc_pad_shapes(img_metas)
proposals_list = [
self._get_proposals_single(
rpn_probs[i], rpn_deltas[i], anchors, valid_flags[i], pad_shapes[i], with_probs)
for i in range(img_metas.shape[0])
]
return proposals_list
    # Generate the proposal regions for a single image
def _get_proposals_single(self, rpn_probs, rpn_deltas, anchors, valid_flags, img_shape, with_probs):
'''
Calculate proposals.
Args
---
rpn_probs: [num_anchors]
rpn_deltas: [num_anchors, (dy, dx, log(dh), log(dw))]
anchors: [num_anchors, (y1, x1, y2, x2)] anchors defined in pixel coordinates.
valid_flags: [num_anchors]
img_shape: np.ndarray. [2]. (img_height, img_width)
with_probs: bool.
Returns
---
proposals: [num_proposals, (y1, x1, y2, x2)] in normalized
coordinates.
'''
        # 1. Remove the anchor boxes that cross the image boundary
        # Method: tf.boolean_mask keeps the entries at the True positions of the mask
H, W = img_shape
# filter invalid anchors, int => bool
valid_flags = tf.cast(valid_flags, tf.bool)
# [369303] => [215169], respectively
rpn_probs = tf.boolean_mask(rpn_probs, valid_flags)
rpn_deltas = tf.boolean_mask(rpn_deltas, valid_flags)
anchors = tf.boolean_mask(anchors, valid_flags)
        # 2. Rank the anchors by score (foreground probability) and keep the top 6000
        # Method: tf.nn.top_k returns the k largest values of each row together with
        #         their indices; sorted=True returns them in descending order
        # Improve performance
pre_nms_limit = min(6000, anchors.shape[0]) # min(6000, 215169) => 6000
ix = tf.nn.top_k(rpn_probs, pre_nms_limit, sorted=True).indices
# [215169] => [6000], respectively
rpn_probs = tf.gather(rpn_probs, ix)
rpn_deltas = tf.gather(rpn_deltas, ix)
anchors = tf.gather(anchors, ix)
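        # A toy illustration of the top_k + gather pattern (hypothetical values):
        #   tf.nn.top_k([0.1, 0.9, 0.4], k=2, sorted=True).indices -> [1, 2]
        #   tf.gather([0.1, 0.9, 0.4], [1, 2]) -> [0.9, 0.4]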
        # 3. Decode the refined coordinates of the 6000 anchors, then keep 2000 of them via NMS
        # 3a. Compute the proposals of the 6000 anchors, format [y1, x1, y2, x2], shape [6000, 4]
        # The deltas must first be de-normalized with target_means/target_stds, because the
        # regression targets were normalized the same way when they were built
        # (see the sketch after this method)
proposals = transforms.delta2bbox(anchors, rpn_deltas, self.target_means, self.target_stds)
# clipping to valid area, [6000, 4]
window = tf.constant([0., 0., H, W], dtype=tf.float32)
proposals = transforms.bbox_clip(proposals, window)
        # Normalize the coordinates to [0, 1]
proposals = proposals / tf.constant([H, W, H, W], dtype=tf.float32)
        # 3b. Remove redundant proposals with NMS; indices: [2000]
        # Method: tf.image.non_max_suppression(boxes, scores, max_output_size, iou_threshold)
        #         keeps boxes in descending score order, dropping any box whose IoU with an
        #         already-kept box exceeds the threshold
indices = tf.image.non_max_suppression(proposals, rpn_probs, self.proposal_count, self.nms_threshold)
        proposals = tf.gather(proposals, indices)  # shape [2000, 4], format [y1, x1, y2, x2] (top-left and bottom-right corners)
if with_probs:
proposal_probs = tf.expand_dims(tf.gather(rpn_probs, indices), axis=1)
proposals = tf.concat([proposals, proposal_probs], axis=1)
return proposals
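The question raised above, why delta2bbox needs the means and stds, has a simple answer: the regression targets were normalized when they were built (bbox2delta subtracts target_means and divides by target_stds), so the decoder must undo that normalization before applying the offsets. Below is a minimal sketch of what a delta2bbox-style decoder typically does; it is an assumption about transforms.delta2bbox inferred from the parameters it receives, not the repository's verbatim code:
import tensorflow as tf
def delta2bbox_sketch(anchors, deltas, target_means, target_stds):
    # Undo the target normalization: training targets were (t - mean) / std
    deltas = deltas * tf.constant(target_stds) + tf.constant(target_means)
    # Anchor geometry in (center, size) form
    height = anchors[:, 2] - anchors[:, 0]
    width = anchors[:, 3] - anchors[:, 1]
    center_y = anchors[:, 0] + 0.5 * height
    center_x = anchors[:, 1] + 0.5 * width
    # Apply the offsets: shift the center, rescale the size
    center_y += deltas[:, 0] * height
    center_x += deltas[:, 1] * width
    height *= tf.exp(deltas[:, 2])
    width *= tf.exp(deltas[:, 3])
    # Back to corner form (y1, x1, y2, x2)
    return tf.stack([center_y - 0.5 * height, center_x - 0.5 * width,
                     center_y + 0.5 * height, center_x + 0.5 * width], axis=1)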
Part 2: analysis of the losses file code
import tensorflow as tf
from tensorflow import keras
def smooth_l1_loss(y_true, y_pred):
'''Implements Smooth-L1 loss.
Args
---
y_true and y_pred are typically: [N, 4], but could be any shape.
'''
diff = tf.abs(y_true - y_pred)
less_than_one = tf.cast(tf.less(diff, 1.0), tf.float32)
loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5)
return loss
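A quick sanity check of the piecewise behavior (toy values, not from the repository): for |diff| < 1 the loss is quadratic, 0.5 * diff^2, otherwise linear, |diff| - 0.5:
y_true = tf.constant([[0.0, 0.0, 0.0, 0.0]])
y_pred = tf.constant([[0.5, -0.5, 2.0, -3.0]])
print(smooth_l1_loss(y_true, y_pred).numpy())
# -> [[0.125 0.125 1.5   2.5  ]]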
def rpn_class_loss(target_matchs, rpn_class_logits):
'''RPN anchor classifier loss.
Args
---
target_matchs: [batch_size, num_anchors]. Anchor match type. 1=positive,
-1=negative, 0=neutral anchor.
rpn_class_logits: [batch_size, num_anchors, 2]. RPN classifier logits for FG/BG.
'''
    # Only non-neutral anchors contribute to the classification loss:
    # positive (1) and negative (-1) anchors are kept, neutral (0) anchors are skipped
    anchor_class = tf.cast(tf.equal(target_matchs, 1), tf.int32)  # [batch, num_anchors]: 1 = FG, 0 = BG
    # Indices of the non-neutral anchors
    indices = tf.where(tf.not_equal(target_matchs, 0))
    anchor_class = tf.gather_nd(anchor_class, indices)
    # Gather the FG/BG logits of those anchors
    rpn_class_logits = tf.gather_nd(rpn_class_logits, indices)
num_classes = rpn_class_logits.shape[-1]
    # Cross-entropy loss over the two classes (FG/BG), computed from logits
loss = keras.losses.categorical_crossentropy(tf.one_hot(anchor_class, depth=num_classes),
rpn_class_logits, from_logits=True)
loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)
return loss
def rpn_bbox_loss(target_deltas, target_matchs, rpn_deltas):
'''Return the RPN bounding box loss graph.
Args
---
target_deltas: [batch, num_rpn_deltas, (dy, dx, log(dh), log(dw))].
        Uses 0 padding to fill in unused bbox deltas.
target_matchs: [batch, anchors]. Anchor match type. 1=positive,
-1=negative, 0=neutral anchor.
rpn_deltas: [batch, anchors, (dy, dx, log(dh), log(dw))]
'''
def batch_pack(x, counts, num_rows):
'''Picks different number of values from each row
in x depending on the values in counts.
'''
outputs = []
for i in range(num_rows):
outputs.append(x[i, :counts[i]])
return tf.concat(outputs, axis=0)
# Positive anchors contribute to the loss, but negative and
# neutral anchors (match value of 0 or -1) don't.
indices = tf.where(tf.equal(target_matchs, 1))
# Pick bbox deltas that contribute to the loss
rpn_deltas = tf.gather_nd(rpn_deltas, indices)
# Trim target bounding box deltas to the same length as rpn_deltas.
batch_counts = tf.reduce_sum(tf.cast(tf.equal(target_matchs, 1), tf.int32), axis=1)
target_deltas = batch_pack(target_deltas, batch_counts,
target_deltas.shape.as_list()[0])
loss = smooth_l1_loss(target_deltas, rpn_deltas)
loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)
return loss
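A toy illustration of batch_pack's trimming logic (hypothetical values): with counts = [2, 1] it takes the first 2 rows of batch item 0 and the first row of batch item 1, then concatenates them. The snippet below replicates the helper inline:
x = tf.constant([[[1., 1.], [2., 2.], [0., 0.]],
                 [[3., 3.], [0., 0.], [0., 0.]]])  # zero-padded per batch item
counts = [2, 1]
packed = tf.concat([x[i, :counts[i]] for i in range(2)], axis=0)
print(packed.numpy())  # -> [[1. 1.] [2. 2.] [3. 3.]], shape [3, 2]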
def rcnn_class_loss(target_matchs_list, rcnn_class_logits_list):
'''Loss for the classifier head of Faster RCNN.
Args
---
target_matchs_list: list of [num_rois]. Integer class IDs. Uses zero
padding to fill in the array.
rcnn_class_logits_list: list of [num_rois, num_classes]
'''
class_ids = tf.concat(target_matchs_list, 0)
class_logits = tf.concat(rcnn_class_logits_list, 0)
class_ids = tf.cast(class_ids, 'int64')
    num_classes = class_logits.shape[-1]
loss = keras.losses.categorical_crossentropy(tf.one_hot(class_ids, depth=num_classes),
class_logits, from_logits=True)
loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)
return loss
def rcnn_bbox_loss(target_deltas_list, target_matchs_list, rcnn_deltas_list):
'''Loss for Faster R-CNN bounding box refinement.
Args
---
target_deltas_list: list of [num_positive_rois, (dy, dx, log(dh), log(dw))]
target_matchs_list: list of [num_rois]. Integer class IDs.
rcnn_deltas_list: list of [num_rois, num_classes, (dy, dx, log(dh), log(dw))]
'''
target_deltas = tf.concat(target_deltas_list, 0)
target_class_ids = tf.concat(target_matchs_list, 0)
rcnn_deltas = tf.concat(rcnn_deltas_list, 0)
# Only positive ROIs contribute to the loss. And only
    # the right class_id of each ROI. Get their indices.
positive_roi_ix = tf.where(target_class_ids > 0)[:, 0]
positive_roi_class_ids = tf.cast(
tf.gather(target_class_ids, positive_roi_ix), tf.int64)
indices = tf.stack([positive_roi_ix, positive_roi_class_ids], axis=1)
# Gather the deltas (predicted and true) that contribute to loss
rcnn_deltas = tf.gather_nd(rcnn_deltas, indices)
# Smooth-L1 Loss
loss = smooth_l1_loss(target_deltas, rcnn_deltas)
loss = tf.reduce_mean(loss) if tf.size(loss) > 0 else tf.constant(0.0)
return loss
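The tf.stack / tf.gather_nd pair above picks, for each positive ROI, the delta row belonging to its ground-truth class. A toy example (hypothetical values: 2 ROIs, 3 classes):
rcnn_deltas = tf.reshape(tf.range(24, dtype=tf.float32), [2, 3, 4])  # [num_rois, num_classes, 4]
indices = tf.constant([[0, 2], [1, 1]], dtype=tf.int64)  # (roi 0, class 2) and (roi 1, class 1)
print(tf.gather_nd(rcnn_deltas, indices).numpy())
# -> [[ 8.  9. 10. 11.]    deltas of roi 0 for class 2
#     [16. 17. 18. 19.]]   deltas of roi 1 for class 1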
Part 3: analysis of the anchor_target file code
import tensorflow as tf
from detection.core.bbox import geometry, transforms
from detection.utils.misc import trim_zeros
class AnchorTarget:
"""
for every generated anchors boxes: [326393, 4],
create its rpn_target_matchs and rpn_target_matchs
which is used to train RPN network.
"""
def __init__(self,
target_means=(0., 0., 0., 0.),
target_stds=(0.1, 0.1, 0.2, 0.2),
num_rpn_deltas=256,
positive_fraction=0.5,
pos_iou_thr=0.7,
neg_iou_thr=0.3):
'''
Compute regression and classification targets for anchors.
Attributes
---
target_means: [4]. Bounding box refinement mean for RPN.
target_stds: [4]. Bounding box refinement standard deviation for RPN.
num_rpn_deltas: int. Maximal number of Anchors per image to feed to rpn heads.
positive_fraction: float.
pos_iou_thr: float.
neg_iou_thr: float.
'''
self.target_means = target_means
self.target_stds = target_stds
self.num_rpn_deltas = num_rpn_deltas
self.positive_fraction = positive_fraction
self.pos_iou_thr = pos_iou_thr
self.neg_iou_thr = neg_iou_thr
def build_targets(self, anchors, valid_flags, gt_boxes, gt_class_ids):
'''
Given the anchors and GT boxes, compute overlaps and identify positive
anchors and deltas to refine them to match their corresponding GT boxes.
Args
---
anchors: [num_anchors, (y1, x1, y2, x2)] in image coordinates.
valid_flags: [batch_size, num_anchors]
gt_boxes: [batch_size, num_gt_boxes, (y1, x1, y2, x2)] in image
coordinates. batch_size = 1 usually
gt_class_ids: [batch_size, num_gt_boxes] Integer class IDs.
Returns
---
rpn_target_matchs: [batch_size, num_anchors] matches between anchors and GT boxes.
1 = positive anchor, -1 = negative anchor, 0 = neutral anchor
rpn_target_deltas: [batch_size, num_rpn_deltas, (dy, dx, log(dh), log(dw))]
Anchor bbox deltas.
'''
rpn_target_matchs = []
rpn_target_deltas = []
        num_imgs = gt_class_ids.shape[0]  # i.e. the batch size, typically 1
for i in range(num_imgs):
target_match, target_delta = self._build_single_target(
anchors, valid_flags[i], gt_boxes[i], gt_class_ids[i])
rpn_target_matchs.append(target_match)
rpn_target_deltas.append(target_delta)
rpn_target_matchs = tf.stack(rpn_target_matchs)
rpn_target_deltas = tf.stack(rpn_target_deltas)
rpn_target_matchs = tf.stop_gradient(rpn_target_matchs)
rpn_target_deltas = tf.stop_gradient(rpn_target_deltas)
return rpn_target_matchs, rpn_target_deltas
    # For each image, randomly sample 256 anchors and compute the offsets (dy, dx, log(dh), log(dw)) between each positive anchor and its ground-truth box
def _build_single_target(self, anchors, valid_flags, gt_boxes, gt_class_ids):
''' Compute targets per instance.
Args
---
anchors: [num_anchors, (y1, x1, y2, x2)]
valid_flags: [num_anchors]
gt_class_ids: [num_gt_boxes]
gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]
Returns
---
        target_matchs: [num_anchors]  # at most 256 entries are non-zero: 1 = the anchor contains an object, -1 = it does not, 0 = neutral
        target_deltas: [num_rpn_deltas, (dy, dx, log(dh), log(dw))]  # zero-padded to num_rpn_deltas rows
'''
gt_boxes, _ = trim_zeros(gt_boxes) # remove padded zero boxes, [new_N, 4]
        target_matchs = tf.zeros(anchors.shape[0], dtype=tf.int32)  # shape [326393]
# Compute overlaps [num_anchors, num_gt_boxes] 326393 vs 10 => [326393, 10]
overlaps = geometry.compute_overlaps(anchors, gt_boxes)
# Match anchors to GT Boxes
# If an anchor overlaps ANY GT box with IoU >= 0.7 then it's positive.
# If an anchor overlaps ALL GT box with IoU < 0.3 then it's negative.
# Neutral anchors are those that don't match the conditions above,
# and they don't influence the loss function.
# However, don't keep any GT box unmatched (rare, but happens). Instead,
# match it to the closest anchor (even if its max IoU is < 0.3).
neg_values = tf.constant([0, -1])
pos_values = tf.constant([0, 1])
        # Classify all anchors: 1 if the anchor contains an object, -1 if not; the criterion is IoU
        # 1. Set negative anchors first. They get overwritten below if a GT box is
        # matched to them. [N_anchors, N_gt_boxes]
        # Method: tf.argmax returns the index of the maximum; tf.reduce_max returns the maximum value
        anchor_iou_argmax = tf.argmax(overlaps, axis=1)  # [326393] index of the closest gt box for each anchor
        anchor_iou_max = tf.reduce_max(overlaps, axis=[1])  # [326393] overlap score with that closest gt box
        # Negative anchors get the value -1
        # Method: tf.where(condition, x, y) picks from x where condition is True, else from y
target_matchs = tf.where(anchor_iou_max < self.neg_iou_thr,
-tf.ones(anchors.shape[0], dtype=tf.int32), target_matchs)
        # 2. Anchors that cross the image boundary are reset to 0 (neutral)
target_matchs = tf.where(tf.equal(valid_flags, 1),
target_matchs, tf.zeros(anchors.shape[0], dtype=tf.int32))
# if an anchor overlap with any GT box with IoU > 0.7, marked as foreground
        # Positive anchors get the value 1
target_matchs = tf.where(anchor_iou_max >= self.pos_iou_thr,
tf.ones(anchors.shape[0], dtype=tf.int32), target_matchs)
        # 3. Match any GT box that is still unmatched to the anchor with the highest IoU against it
        # Method: tf.compat.v1.scatter_update(ref, indices, updates) writes `updates` into the
        #         variable `ref` at positions `indices` and returns the updated variable
        gt_iou_argmax = tf.argmax(overlaps, axis=0)  # [N_gt_boxes]
        target_matchs = tf.compat.v1.scatter_update(tf.Variable(target_matchs), gt_iou_argmax, 1)
        # update the corresponding value to 1 for each GT box's closest anchor
        # 4. Limit the number of positive and negative anchors
        # Cap the positives at 128 (num_rpn_deltas * positive_fraction)
        # Specifically: if there are more than 256 * 0.5 positive anchors, randomly pick the
        #               surplus and reset their value in target_matchs to 0
        # Method: tf.random.shuffle shuffles the order
        # Method: tf.squeeze removes dimensions of size 1; tf.where combined with tf.equal
        #         locates the entries equal to a given value
ids = tf.where(tf.equal(target_matchs, 1)) # [N_pos_anchors, 1], [15, 1]
ids = tf.squeeze(ids, 1) # [15]
extra = ids.shape.as_list()[0] - int(self.num_rpn_deltas * self.positive_fraction) # 256*0.5
if extra > 0: # extra means the redundant pos_anchors
ids = tf.random.shuffle(ids)[:extra]
target_matchs = tf.compat.v1.scatter_update(target_matchs, ids, 0)
        # Cap the negatives at num_rpn_deltas minus the number of positives
ids = tf.where(tf.equal(target_matchs, -1)) # [213748, 1]
ids = tf.squeeze(ids, 1)
extra = ids.shape.as_list()[0] - (self.num_rpn_deltas - # 213748 - (256 - num_of_pos_anchors:15)
tf.reduce_sum(tf.cast(tf.equal(target_matchs, 1), tf.int32)))
if extra > 0: # 213507, so many negative anchors!
            # Reset the extra ones to neutral
ids = tf.random.shuffle(ids)[:extra]
target_matchs = tf.compat.v1.scatter_update(target_matchs, ids, 0)
        # We only need 256 anchors in total, ideally half positive and half negative.
# For positive anchors, compute shift and scale needed to transform them
# to match the corresponding GT boxes.
        # 5. Collect the coordinates of the positive anchors and of their matching GT boxes
        # Positive anchor coordinates: those of the positive anchors among the 256 sampled
        # GT coordinates: for each positive anchor, the GT box with the highest IoU against it
        # Method: tf.gather_nd and tf.gather both collect values at given positions;
        #         tf.gather_nd indexes with multi-dimensional indices, tf.gather along one axis
        ids = tf.where(tf.equal(target_matchs, 1))  # [15, 1]
        a = tf.gather_nd(anchors, ids)  # [369303, 4] indexed by [15, 1] => [15, 4]
        anchor_idx = tf.gather_nd(anchor_iou_argmax, ids)  # index of the closest gt box for each positive anchor
        gt = tf.gather(gt_boxes, anchor_idx)  # coordinates of those closest gt boxes, [15, 4]
        # a: [15, 4] positive anchors; gt: [15, 4] their matching gt boxes
        # 6. Compute (dy, dx, log(dh), log(dw)) for the positive anchors
target_deltas = transforms.bbox2delta(a, gt, self.target_means, self.target_stds)
# target_deltas: [15, (dy,dx,logw,logh)]
padding = tf.maximum(self.num_rpn_deltas - tf.shape(target_deltas)[0], 0) # 256-15
        target_deltas = tf.pad(target_deltas, [(0, padding), (0, 0)])  # zero-pad to [256, 4]
return target_matchs, target_deltas
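Since tf.compat.v1.scatter_update does the heavy lifting in steps 3 and 4 above, here is a toy demonstration of its behavior (hypothetical values):
v = tf.Variable([0, 0, 0, 0, 0])
v = tf.compat.v1.scatter_update(v, [1, 3], [1, 1])  # write 1 at indices 1 and 3
print(v.numpy())  # -> [0 1 0 1 0]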
Part 4: analysis of the anchor_generator file code
import tensorflow as tf
from detection.utils.misc import calc_img_shapes, calc_batch_padded_shape
class AnchorGenerator:
    # An anchor here corresponds to one cell of the feature map, of size feature_stride.
    # Each cell produces len(scales) * len(ratios) prior boxes (anchor boxes); the scale
    # sets the box size and the ratio sets its width/height proportion.
    """
    This class operates on the padded image, e.g. [1216, 1216],
    and generates scales * ratios anchor boxes for each point of the
    padded image, with stride = feature_strides
    number of anchors = (1216 // feature_stride)^2
    number of anchor boxes = number of anchors * (scales_len * ratios_len)
    """
def __init__(self,
scales=(32, 64, 128, 256, 512),
ratios=(0.5, 1, 2),
feature_strides=(4, 8, 16, 32, 64)):
'''
Anchor Generator
Attributes
---
scales: 1D array of anchor sizes in pixels.
ratios: 1D array of anchor ratios of width/height.
feature_strides: Stride of the feature map relative to the image in pixels.
'''
self.scales = scales
self.ratios = ratios
self.feature_strides = feature_strides
def generate_pyramid_anchors(self, img_metas):
'''
Generate the multi-level anchors for Region Proposal Network
Args
---
img_metas: [batch_size, 11]
Returns
---
anchors: [num_anchors, (y1, x1, y2, x2)] in image coordinates.
valid_flags: [batch_size, num_anchors]
'''
        # Returns: the corner coordinates of the prior box generated at every anchor position;
        # valid_flags has shape [369303], where 1 means the prior box stays inside the image
        # Specifically: for each cell of each feature map, the top-left and bottom-right
        # coordinates (y1, x1, y2, x2) of the anchor boxes centered on that cell
        pad_shape = calc_batch_padded_shape(img_metas)  # [1216, 1216]
        # [(304, 304), (152, 152), (76, 76), (38, 38), (19, 19)]
feature_shapes = [(pad_shape[0] // stride, pad_shape[1] // stride)
for stride in self.feature_strides]
anchors = [
self._generate_level_anchors(level, feature_shape)
for level, feature_shape in enumerate(feature_shapes)
] # [277248, 4], [69312, 4], [17328, 4], [4332, 4], [1083, 4]
anchors = tf.concat(anchors, axis=0) # [369303, 4]
# generate valid flags
img_shapes = calc_img_shapes(img_metas) # (800, 1067)
valid_flags = [
self._generate_valid_flags(anchors, img_shapes[i])
for i in range(img_shapes.shape[0])
]
valid_flags = tf.stack(valid_flags, axis=0)
        # Method: tf.stop_gradient keeps these tensors out of any gradient computation
anchors = tf.stop_gradient(anchors)
valid_flags = tf.stop_gradient(valid_flags)
return anchors, valid_flags
    # Decide whether an anchor lies inside the (unpadded) image
    # Method: an anchor is considered inside the image if its center is inside the image
    # Implemented with tf.where
def _generate_valid_flags(self, anchors, img_shape):
'''
        Remove the anchor boxes that lie on the padded area
---
anchors: [num_anchors, (y1, x1, y2, x2)] in image coordinates.
        img_shape: Tuple. (height, width)
Returns
---
valid_flags: [num_anchors]
'''
        y_center = (anchors[:, 2] + anchors[:, 0]) / 2  # [369303]
        x_center = (anchors[:, 3] + anchors[:, 1]) / 2
        valid_flags = tf.ones(anchors.shape[0], dtype=tf.int32)  # [369303]
        zeros = tf.zeros(anchors.shape[0], dtype=tf.int32)
# set boxes whose center is out of image area as invalid.
        # 1 if the anchor's center is inside the image, 0 otherwise
valid_flags = tf.where(y_center <= img_shape[0], valid_flags, zeros)
valid_flags = tf.where(x_center <= img_shape[1], valid_flags, zeros)
return valid_flags
    # Generate, for every cell of the feature map, the top-left and bottom-right coordinates (y1, x1, y2, x2) of the anchor boxes centered on it
def _generate_level_anchors(self, level, feature_shape):
'''Generate the anchors given the spatial shape of feature map.
scale: 32
ratios: tf.Tensor([0.5 1. 2. ], shape=(3,), dtype=float32)
pos: (256, 256) (256, 256)
---
feature_shape: (height, width)
Returns
---
        tf.Tensor [anchors_num, (y1, x1, y2, x2)]
'''
scale = self.scales[level] # 32
ratios = self.ratios # 0.5 1 2
feature_stride = self.feature_strides[level] # 4
# Get all combinations of scales and ratios
        # tf.meshgrid: repeats a from one row into size(b) rows, and turns b into a column repeated into size(a) columns
scales, ratios = tf.meshgrid([float(scale)], ratios)
scales = tf.reshape(scales, [-1]) # [32, 32, 32]
ratios = tf.reshape(ratios, [-1]) # [0.5, 1, 2]
# Enumerate heights and widths from scales and ratios
heights = scales / tf.sqrt(ratios) # [45, 32, 22], square root
widths = scales * tf.sqrt(ratios) # [22, 32, 45]
        # shifts_x / shifts_y are the pixel coordinates of the top-left corner of each cell, used as the anchor centers
# Enumerate shifts in feature space, [0, 4, ..., 1216-4]
shifts_y = tf.multiply(tf.range(feature_shape[0]), feature_stride) # shape(304,)
shifts_x = tf.multiply(tf.range(feature_shape[1]), feature_stride)
        # Method: tf.meshgrid builds the coordinate grid
shifts_x, shifts_y = tf.cast(shifts_x, tf.float32), tf.cast(shifts_y, tf.float32)
shifts_x, shifts_y = tf.meshgrid(shifts_x, shifts_y) # [304, 304]
        # Enumerate combinations of shifts, widths, and heights
        # tf.meshgrid flattens its inputs: A: [3], B: [304, 304] => [92416]; outputs are [92416, 3]
        box_widths, box_centers_x = tf.meshgrid(widths, shifts_x)  # both [92416, 3]
        box_heights, box_centers_y = tf.meshgrid(heights, shifts_y)
# Reshape to get a list of (y, x) and a list of (h, w)
box_centers = tf.reshape(tf.stack([box_centers_y, box_centers_x], axis=2), (-1, 2))
box_sizes = tf.reshape(tf.stack([box_heights, box_widths], axis=2), (-1, 2))
        # Convert to corner coordinates (y1, x1, y2, x2): [304x304x3, 4] => [277248, 4]
boxes = tf.concat([box_centers - 0.5 * box_sizes,
box_centers + 0.5 * box_sizes], axis=1)
return boxes
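To see the double-meshgrid pattern at work on a tiny scale, the following stand-alone snippet uses a hypothetical 2x2 feature map with stride 4 and two anchor widths instead of three; tf.meshgrid flattens its second argument, so every cell center gets paired with every width:
import tensorflow as tf
widths = tf.constant([2., 4.])
shifts = tf.cast(tf.range(2) * 4, tf.float32)  # [0., 4.]
shifts_x, shifts_y = tf.meshgrid(shifts, shifts)  # both [2, 2]
box_widths, box_centers_x = tf.meshgrid(widths, shifts_x)  # both [4, 2]
print(box_centers_x.numpy())  # [[0. 0.] [4. 4.] [0. 0.] [4. 4.]] each center repeated per width
print(box_widths.numpy())     # [[2. 4.] [2. 4.] [2. 4.] [2. 4.]] each width repeated per center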