接上一篇文章Faster RCNN源码解析(1).
第二阶段我会拆分为3个模块,在下面详细介绍。
RPN
关于faster rcnn中RPN的介绍大家可以自己看paper或者找点论坛看看, medium, CSDN, 知乎等都有大量的资料做介绍, 本文只站在源码的角度给你介绍每一步的实现, 所以就不阐述原理了,见谅~~
代码入口
lib/model/train_val.py
# Construct the computation graph
lr, train_op = self.construct_graph(sess)
lr是学习率, train_op是训练网络的一系列操作。
让我们走进construct_graph函数
lib/model/train_val.py
def construct_graph(self, sess):
    """Build the training graph inside ``sess.graph`` and set up the optimizer.

    Besides returning the learning rate and the training op, this sets
    ``self.optimizer``, ``self.saver``, ``self.writer`` and ``self.valwriter``.

    Args:
        sess: Session whose ``graph`` is used as the default graph while
            the ops are created.

    Returns:
        A ``(lr, train_op)`` tuple: the non-trainable learning-rate variable
        and the op returned by ``self.optimizer.apply_gradients``.
    """
    with sess.graph.as_default():
        # Set the random seed for tensorflow
        tf.set_random_seed(cfg.RNG_SEED)
        # Build the main computation graph
        layers = self.net.create_architecture('TRAIN', self.imdb.num_classes, tag='default',
                                              anchor_scales=cfg.ANCHOR_SCALES,
                                              anchor_ratios=cfg.ANCHOR_RATIOS)
        # Define the loss
        loss = layers['total_loss']
        # Set learning rate and momentum
        lr = tf.Variable(cfg.TRAIN.LEARNING_RATE, trainable=False)
        self.optimizer = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)
        # Compute the gradients with regard to the loss
        gvs = self.optimizer.compute_gradients(loss)
        # Double the gradient of the bias if set
        if cfg.TRAIN.DOUBLE_BIAS:
            final_gvs = []
            with tf.variable_scope('Gradient_Mult') as scope:
                for grad, var in gvs:
                    # Variables whose name contains '/biases:' get scale 2,
                    # every other variable keeps its gradient untouched.
                    scale = 1.
                    if cfg.TRAIN.DOUBLE_BIAS and '/biases:' in var.name:
                        scale *= 2.
                    # Only insert a multiply op when the scale actually differs from 1.
                    if not np.allclose(scale, 1.0):
                        grad = tf.multiply(grad, scale)
                    final_gvs.append((grad, var))
            train_op = self.optimizer.apply_gradients(final_gvs)
        else:
            train_op = self.optimizer.apply_gradients(gvs)
        # We will handle the snapshots ourselves
        self.saver = tf.train.Saver(max_to_keep=100000)
        # Write the train and validation information to tensorboard
        self.writer = tf.summary.FileWriter(self.tbdir, sess.graph)
        self.valwriter = tf.summary.FileWriter(self.tbvaldir)
    return lr, train_op
代码其实将流程阐述的非常清楚,我再废话给大家总结一下~~
- 给tensorflow设置随机种子seed(为啥要这样,可以百度一下)
- 建立一个计算图computational graph(重点,下面介绍)
- 定义了一个执行Momentum算法的优化器
accumulation = momentum * accumulation + gradient
variable -= learning_rate * accumulation
- 计算损失参数的梯度self.optimizer.compute_gradients(loss)
- 将梯度应用于变量self.optimizer.apply_gradients(gvs), 返回值就是train_op
- 定义Saver(用于快照-缓存), writer, valwriter(把信息及时传入tensorboard)
然后走进create_architecture函数
lib/nets/network.py
def create_architecture(self, mode, num_classes, tag=None,
                        anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)):
    """Create the input placeholders, build the network and attach losses.

    Args:
        mode: 'TEST' selects the inference path (bbox predictions are
            de-normalized); any other value goes through the loss/summary
            path. ``self._build_network`` is told to train only when
            ``mode == 'TRAIN'``.
        num_classes: Number of classes, stored on ``self._num_classes``.
        tag: Required tag stored on ``self._tag``; must not be None even
            though the signature has a default.
        anchor_scales: Anchor scales; their count is stored in
            ``self._num_scales``.
        anchor_ratios: Anchor aspect ratios; their count is stored in
            ``self._num_ratios``.

    Returns:
        dict containing 'rois', the entries of ``self._losses`` (non-test
        path only) and the entries of ``self._predictions``.
    """
    self._image = tf.placeholder(tf.float32, shape=[1, None, None, 3])
    self._im_info = tf.placeholder(tf.float32, shape=[3])
    self._gt_boxes = tf.placeholder(tf.float32, shape=[None, 5])
    self._tag = tag

    self._num_classes = num_classes
    self._mode = mode
    self._anchor_scales = anchor_scales
    self._num_scales = len(anchor_scales)

    self._anchor_ratios = anchor_ratios
    self._num_ratios = len(anchor_ratios)

    # One anchor per (scale, ratio) combination.
    self._num_anchors = self._num_scales * self._num_ratios

    training = mode == 'TRAIN'
    testing = mode == 'TEST'

    # Identity comparison is the correct way to test for None.
    assert tag is not None

    # handle most of the regularizers here
    weights_regularizer = tf.contrib.layers.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY)
    if cfg.TRAIN.BIAS_DECAY:
        biases_regularizer = weights_regularizer
    else:
        biases_regularizer = tf.no_regularizer

    # list as many types of layers as possible, even if they are not used now
    with arg_scope([slim.conv2d, slim.conv2d_in_plane,
                    slim.conv2d_transpose, slim.separable_conv2d, slim.fully_connected],
                   weights_regularizer=weights_regularizer,
                   biases_regularizer=biases_regularizer,
                   biases_initializer=tf.constant_initializer(0.0)):
        rois, cls_prob, bbox_pred = self._build_network(training)

    layers_to_output = {'rois': rois}

    for var in tf.trainable_variables():
        self._train_summaries.append(var)

    if testing:
        # Undo the target normalization: repeat the per-coordinate stds/means
        # once per class, then scale and shift the predicted deltas.
        stds = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (self._num_classes))
        means = np.tile(np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (self._num_classes))
        self._predictions["bbox_pred"] *= stds
        self._predictions["bbox_pred"] += means
    else:
        self._add_losses()
        layers_to_output.update(self._losses)

        val_summaries = []
        with tf.device("/cpu:0"):
            val_summaries.append(self._add_gt_image_summary())
            for key, var in self._event_summaries.items():
                val_summaries.append(tf.summary.scalar(key, var))
            for key, var in self._score_summaries.items():
                self._add_score_summary(key, var)
            for var in self._act_summaries:
                self._add_act_summary(var)
            for var in self._train_summaries:
                self._add_train_summary(var)

        self._summary_op = tf.summary.merge_all()
        self._summary_op_val = tf.summary.merge(val_summaries)

    layers_to_output.update(self._predictions)

    return layers_to_output
很多人(包括我自己)对tensorflow还不是很熟悉,所以这里还是给大家概括一下程序流程
- 给network的成员变量赋值
- 定义权重weights的正则regularizer
- 建立网络self._build_network(training) (重点)
- 定义损失函数, 包括RPN class loss, RPN bbox loss,整个RCNN网络的class loss和最终确定的物体边框bbox loss, 细节可以看这个函数_add_losses
- 更新一下tensorboard用得到的参数
然后我们了解一下_build_network函数
lib/nets/network.py
def _build_network(self, is_training=True):
    """Assemble the detection network: head, anchors, RPN, RoI pooling, tail.

    Args:
        is_training: Forwarded to the head, RPN, tail and classification
            builders.

    Returns:
        Tuple ``(rois, cls_prob, bbox_pred)``.

    Raises:
        NotImplementedError: if ``cfg.POOLING_MODE`` is not 'crop'.
    """
    # select initializers
    if cfg.TRAIN.TRUNCATED:
        initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01)
        initializer_bbox = tf.truncated_normal_initializer(mean=0.0, stddev=0.001)
    else:
        initializer = tf.random_normal_initializer(mean=0.0, stddev=0.01)
        initializer_bbox = tf.random_normal_initializer(mean=0.0, stddev=0.001)

    # Backbone feature map shared by the RPN and the RoI pooling.
    net_conv = self._image_to_head(is_training)
    with tf.variable_scope(self._scope, self._scope):
        # build the anchors for the image
        self._anchor_component()
        # region proposal network
        rois = self._region_proposal(net_conv, is_training, initializer)
        # region of interest pooling
        if cfg.POOLING_MODE == 'crop':
            pool5 = self._crop_pool_layer(net_conv, rois, "pool5")
        else:
            raise NotImplementedError

    fc7 = self._head_to_tail(pool5, is_training)
    with tf.variable_scope(self._scope, self._scope):
        # region classification
        cls_prob, bbox_pred = self._region_classification(fc7, is_training,
                                                          initializer, initializer_bbox)

    # Expose every prediction tensor to the score summaries.
    self._score_summaries.update(self._predictions)

    return rois, cls_prob, bbox_pred
- 初始化权重weight, 用截断的normal initializer或者随机的normal initializer
- 构建主干网络前端_image_to_head
- 构建anchors
- 构建RPN
- ROI pooling 调用函数_crop_pool_layer
- 构建主干网络的尾部 fc7 = self._head_to_tail(pool5, is_training)
- object分类以及边框预测的回归
各位是不是一脸萌币。。。不要紧, 下面我会给大家详细介绍上述的每一个步骤。
构建主干网络前端
_image_to_head是类Network的一个抽象方法(abstract method), 以它的实现类Resnet 101为例
lib/nets/resnet_v1.py
def _image_to_head(self, is_training, reuse=None):
    """Build the backbone up to (but excluding) the last ResNet block.

    The base layers and the first ``cfg.RESNET.FIXED_BLOCKS`` blocks are
    built with ``is_training=False``; the remaining blocks, except the last
    entry of ``self._blocks``, follow ``is_training``.

    Args:
        is_training: Training flag for the non-fixed blocks.
        reuse: Forwarded to ``resnet_v1.resnet_v1``.

    Returns:
        The resulting feature map, also stored in ``self._layers['head']``
        and appended to ``self._act_summaries``.
    """
    assert (0 <= cfg.RESNET.FIXED_BLOCKS <= 3)
    # Now the base is always fixed during training
    with slim.arg_scope(resnet_arg_scope(is_training=False)):
        net_conv = self._build_base()
    if cfg.RESNET.FIXED_BLOCKS > 0:
        # Fixed blocks are built in inference mode.
        with slim.arg_scope(resnet_arg_scope(is_training=False)):
            net_conv, _ = resnet_v1.resnet_v1(net_conv,
                                              self._blocks[0:cfg.RESNET.FIXED_BLOCKS],
                                              global_pool=False,
                                              include_root_block=False,
                                              reuse=reuse,
                                              scope=self._scope)
    if cfg.RESNET.FIXED_BLOCKS < 3:
        # Remaining blocks, stopping before the last entry of self._blocks.
        with slim.arg_scope(resnet_arg_scope(is_training=is_training)):
            net_conv, _ = resnet_v1.resnet_v1(net_conv,
                                              self._blocks[cfg.RESNET.FIXED_BLOCKS:-1],
                                              global_pool=False,
                                              include_root_block=False,
                                              reuse=reuse,
                                              scope=self._scope)

    self._act_summaries.append(net_conv)
    self._layers['head'] = net_conv

    return net_conv
def _build_base(self):
    """Build the initial layers (conv1 + pool1) on top of ``self._image``.

    Returns:
        The output tensor of the 3x3, stride-2 max pooling layer.
    """
    with tf.variable_scope(self._scope, self._scope):
        # 7x7 convolution with 64 filters and stride 2.
        net = resnet_utils.conv2d_same(self._image, 64, 7, stride=2, scope='conv1')
        # Pad one pixel on each spatial side, then pool with VALID padding.
        net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]])
        net = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', scope='pool1')

    # _image_to_head assigns the result of this call, so it must be returned.
    return net
我会在下一篇文章中专门介绍resnet, 这里还是只做一个流程的简介。
- 调用_build_base函数手动建立初始的几层: input -> 64 * 7 * 7 filters, stride = 2 -> padding -> max pooling
- 构建网络主干, 因为之前定义过self._blocks
# Four ResNet stages; the last two use stride 1.
# NOTE(review): the unit counts 3/8/36/3 look like the ResNet-152 layout
# (ResNet-101 would presumably be 3/4/23/3) — confirm against the depth in use.
self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
                resnet_v1_block('block2', base_depth=128, num_units=8, stride=2),
                # use stride 1 for the last conv4 layer
                resnet_v1_block('block3', base_depth=256, num_units=36, stride=1),
                resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)]
调用slim的resnet_v1的接口实现这段网络resnet_v1.resnet_v1()
构建anchors
anchor 是什么?这里借用一点知乎作者马塔的回答:
anchor的本质是什么,本质是将相同尺寸的 conv5_3 层的输出,倒推得到不同尺寸的输入。接下来是anchor的窗口尺寸,详细说下这个尺寸的来源,最基本的anchor只有一个尺寸,是16*16的尺寸,然后设定了基本的面积scale是(8,16,32),用这三个scale乘以16就得到了三个面积尺寸(128^2,256^2,512^2),然后在每个面积尺寸下,取三种不同的长宽比例(1:1,1:2,2:1).这样一来,我们得到了一共9种面积尺寸各异的anchor。示意图如下:
不过这个示意图其实比较有误导性,首先它图中的9个框并不是在同一个中心点的,而实际上,是应该在每个特征图的每个点作为中心点生成9 个框; 其次,生成的 anchor 尺寸大小不是以特征图为基准的,甚至毫无关系,而是以 anchor ratio 和 anchor scale 得到最终的大小,并且其最大的 anchor 也基本和 resize 之后的图大小相当。
在 generate_anchors 代码文件中,可以看到如下数据,
# anchors =
# -83 -39 100 56
# -175 -87 192 104
# -359 -183 376 200
# -55 -55 72 72
# -119 -119 136 136
# -247 -247 264 264
# -35 -79 52 96
# -79 -167 96 184
# -167 -343 184 360
这就是生成的最基本的9个anchor,这个anchor的坐标是xyxy类型的,它表示了图片的左上角的第1个9个anchor的坐标,后面用到的所有anchor都是用它在特征图上平移得到的(它代表的坐标是resize 后的图片坐标而不是原图)。
至于这个anchor到底是怎么用的,这个是理解整个问题的关键。
上面我们已经得到了基础网络最终的conv5_3 输出为1×38×67×1024(1024是层数),在这个特征参数的基础上,通过一个3x3的滑动窗口,在这个38×67的区域上进行滑动,stride=1,padding=2,这样一来,滑动得到的就是38×67个3x3的窗口。
对于每个3x3的窗口,计算这个滑动窗口的中心点所对应的原始图片的中心点。然后作者假定,这个3x3窗口,是从原始图片上通过SPP池化得到的,而这个池化的区域的面积以及比例,就是一个个的anchor。换句话说,对于每个3x3窗口,作者假定它来自9种不同原始区域的池化,但是这些池化在原始图片中的中心点,都完全一样。这个中心点,就是刚才提到的,3x3窗口中心点所对应的原始图片中的中心点。如此一来,在每个窗口位置,我们都可以根据9个不同长宽比例、不同面积的anchor,逆向推导出它所对应的原始图片中的一个区域,这个区域的尺寸以及坐标,都是已知的。而这个区域,就是我们想要的 proposal。所以我们通过滑动窗口和anchor,成功得到了 38×67×9 个原始图片的proposal。接下来,每个proposal我们只输出6个参数:每个 proposal 和 ground truth 进行比较得到的前景概率和背景概率(2个参数)(对应 cls_score);由于每个 proposal 和 ground truth 位置及尺寸上的差异,从 proposal 通过平移放缩得到 ground truth 需要的4个平移放缩参数(对应 bbox_pred)。
加上一点我的理解,anchor 是用来做多尺度的目标检测的,它是用来代替图像金字塔和特征金字塔的,它为什么可以达到这样的目的?可以看看它的最后一层的输出是 M×N×(9×2), 如果我们只看它在特征图 M×N 个特征点的第一个点的第一个卷积核,它代表了什么含义?它相当于用这个卷积核去综合图片该点附近(3×3,上一步进行了3×3的卷积)的信息,判断有没有第一个尺寸的目标,也就是说每个卷积核都负责了一个尺寸的目标检测,那么18个卷积核,每2个负责一个任务,就达到了多尺度目标检测的目的,很巧妙的一个思路,从最终的效果来看,它实际上就是一个多尺度的目标热力图,或者用作者的话说,就相当于一个‘注意力’机制。
另外值得提出的是这里使用的是全卷积结构(3×3的卷积,然后接1×1的卷积),也就是说 M×N 也是一个二维结构,和原图的像素二维结构是对应的,那么我们就能相应的判断出该特征点对应的原图是否存在目标。 个人感觉,理解了这里的全卷积结构和 anchor 的机制,整个 faster rcnn 就明晰很多了。
最后明确的一点就是在代码中,anchor,proposal,rois ,boxes 代表的含义其实都是一样的,都是推荐的区域或者框,不过有所区别的地方在于这几个名词有一个递进的关系,最开始的是锚定的框 anchor,数量最多有约20000个(根据resize后的图片大小不同而有数量有所变化),然后是RPN网络推荐的框 proposal,数量较多,train时候有2000个,最后是实际分类时候用到的 rois 框,每张图片有256个;最后得到的结果就是 boxes。
好了, 以上就是转自知乎对于anchor的一个详细解释,我知道大家是来看代码的
入口在network.py的 _build_network函数中
# build the anchors for the image
self._anchor_component()
......
......
def _anchor_component(self):
    """Create the anchor tensor for the current image.

    Stores the result in ``self._anchors`` (static shape [None, 4]) and the
    number of anchors in ``self._anchor_length`` (scalar).
    """
    with tf.variable_scope('ANCHOR_' + self._tag) as scope:
        # just to get the shape right
        # Feature-map size: im_info[0] / im_info[1] divided by the feature
        # stride, rounded up.
        height = tf.to_int32(tf.ceil(self._im_info[0] / np.float32(self._feat_stride[0])))
        width = tf.to_int32(tf.ceil(self._im_info[1] / np.float32(self._feat_stride[0])))
        if cfg.USE_E2E_TF:
            # Pure-TensorFlow anchor generation.
            anchors, anchor_length = generate_anchors_pre_tf(
                height,
                width,
                self._feat_stride,
                self._anchor_scales,
                self._anchor_ratios
            )
        else:
            # Python implementation wrapped as a graph op.
            anchors, anchor_length = tf.py_func(generate_anchors_pre,
                                                [height, width,
                                                 self._feat_stride, self._anchor_scales, self._anchor_ratios],
                                                [tf.float32, tf.int32], name="generate_anchors")
        # Declare the static shapes explicitly.
        anchors.set_shape([None, 4])
        anchor_length.set_shape([])
        self._anchors = anchors
        self._anchor_length = anchor_length
- 首先计算好偏移量
- 生成初始的9个anchor
lib/layer_utils/snippets.py
def generate_anchors_pre_tf(height, width, feat_stride=16, anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)):
    """Shift the base anchors to every feature-map position, as TF ops.

    Args:
        height: Feature-map height (number of vertical shifts).
        width: Feature-map width (number of horizontal shifts).
        feat_stride: Multiplier applied to the grid indices to obtain shifts.
        anchor_scales: Scales forwarded to ``generate_anchors``.
        anchor_ratios: Aspect ratios forwarded to ``generate_anchors``.

    Returns:
        Tuple ``(anchors, length)``: a float32 tensor reshaped to
        ``(length, 4)`` and ``length = K * A``, with ``K = width * height``
        and ``A`` the number of base anchors.
    """
    shift_x = tf.range(width) * feat_stride  # width
    shift_y = tf.range(height) * feat_stride  # height
    shift_x, shift_y = tf.meshgrid(shift_x, shift_y)
    sx = tf.reshape(shift_x, shape=(-1,))
    sy = tf.reshape(shift_y, shape=(-1,))
    # One (sx, sy, sx, sy) row per grid cell, added to all four box coordinates.
    shifts = tf.transpose(tf.stack([sx, sy, sx, sy]))
    K = tf.multiply(width, height)
    # Reshape to (K, 1, 4) so it broadcasts against the (1, A, 4) base anchors.
    shifts = tf.transpose(tf.reshape(shifts, shape=[1, K, 4]), perm=(1, 0, 2))

    anchors = generate_anchors(ratios=np.array(anchor_ratios), scales=np.array(anchor_scales))
    A = anchors.shape[0]
    anchor_constant = tf.constant(anchors.reshape((1, A, 4)), dtype=tf.int32)

    length = K * A
    anchors_tf = tf.reshape(tf.add(anchor_constant, shifts), shape=(length, 4))

    return tf.cast(anchors_tf, dtype=tf.float32), length
生成anchor的代码一目了然,这个脚本对系统环境没有要求,所以大家也可以直接运行该文件,打点断点调试,就会很清楚整个流程。
lib/layer_utils/generate_anchors.py
def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
                     scales=2 ** np.arange(3, 6)):
    """
    Build the reference anchor windows: every aspect ratio combined with
    every scale, relative to a (0, 0, base_size - 1, base_size - 1) window.

    Returns one row per (ratio, scale) pair, grouped by ratio.
    """
    reference_window = np.array([1, 1, base_size, base_size]) - 1
    per_ratio = _ratio_enum(reference_window, ratios)

    # Expand each ratio-specific window over all scales, keeping ratio order.
    stacked = []
    for row in range(per_ratio.shape[0]):
        stacked.append(_scale_enum(per_ratio[row, :], scales))
    return np.vstack(stacked)
def _whctrs(anchor):
    """
    Convert an (x1, y1, x2, y2) window with inclusive corners into
    (width, height, x center, y center).
    """
    left, top, right, bottom = anchor[0], anchor[1], anchor[2], anchor[3]
    # Corners are inclusive pixel coordinates, hence the +1.
    width = right - left + 1
    height = bottom - top + 1
    center_x = left + 0.5 * (width - 1)
    center_y = top + 0.5 * (height - 1)
    return width, height, center_x, center_y
def _mkanchors(ws, hs, x_ctr, y_ctr):
    """
    Build one (x1, y1, x2, y2) window per (width, height) pair from the
    vectors ws and hs, all centred on (x_ctr, y_ctr).
    """
    # Column vectors of half-extents, so hstack yields one row per window.
    half_w = 0.5 * (ws[:, np.newaxis] - 1)
    half_h = 0.5 * (hs[:, np.newaxis] - 1)
    return np.hstack((x_ctr - half_w,
                      y_ctr - half_h,
                      x_ctr + half_w,
                      y_ctr + half_h))
def _ratio_enum(anchor, ratios):
    """
    Produce one window per aspect ratio, keeping the centre of `anchor`.

    Widths are the rounded square root of area / ratio, and heights are the
    rounded width * ratio.
    """
    width, height, center_x, center_y = _whctrs(anchor)
    area = width * height
    widths = np.round(np.sqrt(area / ratios))
    heights = np.round(widths * ratios)
    return _mkanchors(widths, heights, center_x, center_y)
def _scale_enum(anchor, scales):
    """
    Produce one window per scale by multiplying the width and height of
    `anchor` by each scale, keeping the same centre.
    """
    width, height, center_x, center_y = _whctrs(anchor)
    return _mkanchors(width * scales, height * scales, center_x, center_y)
构建RPN
RPN层利用anchors在图片上的滑动,与256(512)个3*3的滑窗做卷积,生成全连接层。其主要作用有二:
- 预测proposal的中心锚点对应的坐标x,y以及宽高w,h
- 判断proposal区域是前景还是背景
代码入口
# region proposal network
rois = self._region_proposal(net_conv, is_training, initializer)
lib/nets/network.py
def _region_proposal(self, net_conv, is_training, initializer):
    """Build the region proposal network on top of ``net_conv``.

    Args:
        net_conv: Backbone feature map.
        is_training: Selects the training or test proposal path and sets
            ``trainable`` on the RPN convolutions.
        initializer: Weights initializer for the RPN convolutions.

    Returns:
        The ``rois`` tensor. The RPN scores, probabilities, predictions and
        rois are also stored in ``self._predictions``.

    Raises:
        NotImplementedError: at test time when ``cfg.TEST.MODE`` is neither
            'nms' nor 'top'.
    """
    # Shared 3x3 convolution feeding both RPN heads.
    rpn = slim.conv2d(net_conv, cfg.RPN_CHANNELS, [3, 3], trainable=is_training, weights_initializer=initializer,
                      scope="rpn_conv/3x3")
    self._act_summaries.append(rpn)
    # Two scores per anchor.
    rpn_cls_score = slim.conv2d(rpn, self._num_anchors * 2, [1, 1], trainable=is_training,
                                weights_initializer=initializer,
                                padding='VALID', activation_fn=None, scope='rpn_cls_score')
    # change it so that the score has 2 as its channel size
    rpn_cls_score_reshape = self._reshape_layer(rpn_cls_score, 2, 'rpn_cls_score_reshape')
    rpn_cls_prob_reshape = self._softmax_layer(rpn_cls_score_reshape, "rpn_cls_prob_reshape")
    rpn_cls_pred = tf.argmax(tf.reshape(rpn_cls_score_reshape, [-1, 2]), axis=1, name="rpn_cls_pred")
    # Back to num_anchors * 2 channels after the softmax.
    rpn_cls_prob = self._reshape_layer(rpn_cls_prob_reshape, self._num_anchors * 2, "rpn_cls_prob")
    # Four box deltas per anchor.
    rpn_bbox_pred = slim.conv2d(rpn, self._num_anchors * 4, [1, 1], trainable=is_training,
                                weights_initializer=initializer,
                                padding='VALID', activation_fn=None, scope='rpn_bbox_pred')
    if is_training:
        rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
        rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor")
        # Try to have a deterministic order for the computing graph, for reproducibility
        with tf.control_dependencies([rpn_labels]):
            rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois")
    else:
        if cfg.TEST.MODE == 'nms':
            rois, _ = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
        elif cfg.TEST.MODE == 'top':
            rois, _ = self._proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
        else:
            raise NotImplementedError

    self._predictions["rpn_cls_score"] = rpn_cls_score
    self._predictions["rpn_cls_score_reshape"] = rpn_cls_score_reshape
    self._predictions["rpn_cls_prob"] = rpn_cls_prob
    self._predictions["rpn_cls_pred"] = rpn_cls_pred
    self._predictions["rpn_bbox_pred"] = rpn_bbox_pred
    self._predictions["rois"] = rois

    return rois
_region_proposal这段代码逻辑看细节的话会有点绕,所以先回顾一下整个faster rcnn的流程图
_region_proposal 函数分别做了以下几件事:
- 将特征图[60 * 40 * 256] (取决于原始图片的像素还有缩放scale还有选择的特征提取网络) 与 256个 3*3 的filters卷积(进一步提取特征)得到[60 * 40 * 256]的图
- 与18个1*1的filters做卷积,也就是9*2,对应着每个像素9个anchor,乘以2表示每个anchor对应2个scores,分别表示前景或者背景。通过reshape -> softmax -> reshape 获取了目标是否是物体的预测以及得分。输出的参数有两个rpn_cls_pred(预测结果),rpn_cls_prob(前景和背景的概率)。需要监督的信息是Y=0,1,表示这个区域是否是ground truth。
- 与36个1*1的filters做卷积,也就是 9 * 4, 得到Anchor Box的坐标信息,其实是偏移量
ground truth:标定的框也对应一个中心点位置坐标x,y和宽高w,h
anchor box: 中心点位置坐标x_a,y_a和宽高w_a,h_a
所以,偏移量:
△x=(x-x_a)/w_a △y=(y-y_a)/h_a
△w=log(w/w_a) △h=log(h/h_a)
通过ground truth box与预测的anchor box之间的差异来进行学习,从而使RPN网络中的权重能够学习到预测box的能力
- rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois"), 这一步获取2000个proposals
- rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor"),计算每个anchor的label值
- rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois"),选取__C.TRAIN.BATCH_SIZE个正样本和负样本作为训练的一个mini batch。这一步的作用是给RPN提供的proposals分配标签,计算proposals和ground truth boxes的偏移量,用于网络最后一层(bbox_pred)回归参数的学习。
接下来详细为大家介绍一下步骤4,5,6
4、rois, roi_scores = self._proposal_layer(rpn_cls_prob, rpn_bbox_pred, "rois")
lib/nets/network.py
def _proposal_layer(self, rpn_cls_prob, rpn_bbox_pred, name):
    """Turn RPN outputs into region proposals.

    Args:
        rpn_cls_prob: RPN class probabilities.
        rpn_bbox_pred: RPN box regression output.
        name: Variable scope name for the created ops.

    Returns:
        Tuple ``(rois, rpn_scores)`` with static shapes [None, 5] and
        [None, 1].
    """
    with tf.variable_scope(name) as scope:
        if cfg.USE_E2E_TF:
            # Pure-TensorFlow implementation.
            rois, rpn_scores = proposal_layer_tf(
                rpn_cls_prob,
                rpn_bbox_pred,
                self._im_info,
                self._mode,
                self._feat_stride,
                self._anchors,
                self._num_anchors
            )
        else:
            # Python implementation wrapped as a graph op.
            rois, rpn_scores = tf.py_func(proposal_layer,
                                          [rpn_cls_prob, rpn_bbox_pred, self._im_info, self._mode,
                                           self._feat_stride, self._anchors, self._num_anchors],
                                          [tf.float32, tf.float32], name="proposal")

        # Declare the static shapes explicitly.
        rois.set_shape([None, 5])
        rpn_scores.set_shape([None, 1])

    return rois, rpn_scores
proposal_layer_tf 的流程如下
- 从config文件读取配置 post_nms_topN(执行NMS算法后proposal的数量), nms_thresh(NMS 阈值)
- 原始anchor给出的proposal通过学习参数rpn_bbox_pred,转换为与ground truth接近的边框,裁剪掉超出图片的部分
- 执行NMS算法,获取最终的proposals
用下图一个案例来对NMS算法进行简单介绍
如上图所示,一共有6个识别为人的框,每一个框有一个置信率。
现在需要消除多余的:
- 按置信率排序: 0.95, 0.9, 0.9, 0.8, 0.7, 0.7
- 取最大0.95的框为一个物体框
- 剩余5个框中,去掉与0.95框重叠率IoU大于0.6(可以另行设置),则保留0.9, 0.8, 0.7三个框
- 重复上面的步骤,直到没有框了,0.9为一个框
选出来的为: 0.95, 0.9
lib/layer_utils/proposal_layer.py
def proposal_layer_tf(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors):
    """Decode, clip and NMS-filter RPN proposals using TensorFlow ops.

    Args:
        rpn_cls_prob: RPN probabilities; the channels from ``num_anchors``
            onward are used as the proposal scores.
        rpn_bbox_pred: RPN box deltas, reshaped to (-1, 4).
        im_info: Image info; the first two entries are used for clipping.
        cfg_key: Key into ``cfg`` (str or UTF-8 encoded bytes) selecting the
            NMS settings.
        _feat_stride: Unused here; kept for interface compatibility.
        anchors: Anchor boxes matching the rows of the reshaped deltas.
        num_anchors: Number of anchors per feature-map position.

    Returns:
        Tuple ``(blob, scores)``: ``blob`` has a leading zero batch-index
        column followed by the four box coordinates; ``scores`` is reshaped
        to (-1, 1).
    """
    if isinstance(cfg_key, bytes):
        cfg_key = cfg_key.decode('utf-8')
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH

    # Get the scores and bounding boxes
    scores = rpn_cls_prob[:, :, :, num_anchors:]
    scores = tf.reshape(scores, shape=(-1,))
    rpn_bbox_pred = tf.reshape(rpn_bbox_pred, shape=(-1, 4))

    # Apply the predicted deltas to the anchors, then clip to the image.
    proposals = bbox_transform_inv_tf(anchors, rpn_bbox_pred)
    proposals = clip_boxes_tf(proposals, im_info[:2])

    # Non-maximal suppression
    indices = tf.image.non_max_suppression(proposals, scores, max_output_size=post_nms_topN, iou_threshold=nms_thresh)

    boxes = tf.gather(proposals, indices)
    boxes = tf.to_float(boxes)
    scores = tf.gather(scores, indices)
    scores = tf.reshape(scores, shape=(-1, 1))

    # Only support single image as input
    batch_inds = tf.zeros((tf.shape(indices)[0], 1), dtype=tf.float32)
    blob = tf.concat([batch_inds, boxes], 1)

    return blob, scores
- bbox_transform_inv_tf: 每个anchor的边框根据之前学习得到的偏移量(这里的偏移量就是需要学习的rpn_bbox_pred)做位移和缩放,获取最终的预测边框。也就是将原始proposal A, 通过学习rpn_bbox_pred中的参数,得到一个与ground truth G 相近的预测边框 G'。
- clip_boxes_tf: 剪裁掉超出原始图片边框的部分。
lib/model/bbox_transform.py
def bbox_transform_inv_tf(boxes, deltas):
    """Apply (dx, dy, dw, dh) deltas to (x1, y1, x2, y2) boxes.

    The centre is shifted by the delta times the box size, and the size is
    scaled by exp(delta). Returns the decoded boxes as columns
    (x1, y1, x2, y2).
    """
    boxes = tf.cast(boxes, deltas.dtype)

    # Size and centre of the input boxes.
    box_w = tf.subtract(boxes[:, 2], boxes[:, 0]) + 1.0
    box_h = tf.subtract(boxes[:, 3], boxes[:, 1]) + 1.0
    center_x = tf.add(boxes[:, 0], box_w * 0.5)
    center_y = tf.add(boxes[:, 1], box_h * 0.5)

    # Shifted centres and rescaled sizes.
    new_center_x = tf.add(tf.multiply(deltas[:, 0], box_w), center_x)
    new_center_y = tf.add(tf.multiply(deltas[:, 1], box_h), center_y)
    new_w = tf.multiply(tf.exp(deltas[:, 2]), box_w)
    new_h = tf.multiply(tf.exp(deltas[:, 3]), box_h)

    corners = [
        tf.subtract(new_center_x, new_w * 0.5),
        tf.subtract(new_center_y, new_h * 0.5),
        tf.add(new_center_x, new_w * 0.5),
        tf.add(new_center_y, new_h * 0.5),
    ]
    return tf.stack(corners, axis=1)
def clip_boxes_tf(boxes, im_info):
    """Clamp box coordinates to [0, limit - 1].

    x coordinates (columns 0 and 2) are limited by ``im_info[1]`` and
    y coordinates (columns 1 and 3) by ``im_info[0]``.
    """
    # Index into im_info giving the limit for each of the four columns.
    limit_index = (1, 0, 1, 0)
    clipped = [
        tf.maximum(tf.minimum(boxes[:, col], im_info[idx] - 1), 0)
        for col, idx in enumerate(limit_index)
    ]
    return tf.stack(clipped, axis=1)
5、rpn_labels = self._anchor_target_layer(rpn_cls_score, "anchor")
这一步要给 self._anchors 中的所有 anchor 赋以一个 label 值:
- 1 : 正样本
- 0 : 负样本
- -1 : 非样本, 不用于训练
同时初始化一些参数用于后面计算损失函数(关于损失函数, 大家可以去看https://blog.csdn.net/wfei101/article/details/79809332,我觉得讲的比论文清楚的多):
- rpn_bbox_targets: RPN网络边框回归的ground truth
- rpn_bbox_inside_weights: label为1的行,也就是目标区域为前景的行,参数为[1.0 1.0 1.0 1.0],其余为0
- rpn_bbox_outside_weights: label为0或者1的行,参数为[1.0 1.0 1.0 1.0] / len(fg+bg)
lib/nets/network.py
def _anchor_target_layer(self, rpn_cls_score, name):
    """Wrap ``anchor_target_layer`` as a graph op and record its outputs.

    Args:
        rpn_cls_score: RPN classification scores passed to the Python layer.
        name: Variable scope name for the created ops.

    Returns:
        ``rpn_labels`` cast to int32. The labels, bbox targets and the
        inside/outside weights are also stored in ``self._anchor_targets``
        and merged into ``self._score_summaries``.
    """
    with tf.variable_scope(name) as scope:
        rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = tf.py_func(
            anchor_target_layer,
            [rpn_cls_score, self._gt_boxes, self._im_info, self._feat_stride, self._anchors, self._num_anchors],
            [tf.float32, tf.float32, tf.float32, tf.float32],
            name="anchor_target")

        # Declare the static shapes explicitly.
        rpn_labels.set_shape([1, 1, None, None])
        rpn_bbox_targets.set_shape([1, None, None, self._num_anchors * 4])
        rpn_bbox_inside_weights.set_shape([1, None, None, self._num_anchors * 4])
        rpn_bbox_outside_weights.set_shape([1, None, None, self._num_anchors * 4])

        # The Python layer returns float32 labels; convert them to int32.
        rpn_labels = tf.to_int32(rpn_labels, name="to_int32")
        self._anchor_targets['rpn_labels'] = rpn_labels
        self._anchor_targets['rpn_bbox_targets'] = rpn_bbox_targets
        self._anchor_targets['rpn_bbox_inside_weights'] = rpn_bbox_inside_weights
        self._anchor_targets['rpn_bbox_outside_weights'] = rpn_bbox_outside_weights

        self._score_summaries.update(self._anchor_targets)

    return rpn_labels
lib/layer_utils/anchor_target_layer.py
def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride, all_anchors, num_anchors):
    """Same as the anchor target layer in original Fast/er RCNN.

    Labels every anchor as positive (1), negative (0) or ignored (-1) and
    computes box regression targets and loss weights for the RPN.

    Args:
        rpn_cls_score: RPN score map; only ``shape[1:3]`` (height, width)
            is used.
        gt_boxes: Ground-truth boxes, one row per box, five columns (the
            first four are the box coordinates).
        im_info: Image info; ``im_info[0]`` is used as the height bound and
            ``im_info[1]`` as the width bound.
        _feat_stride: Unused here; kept for interface compatibility.
        all_anchors: All anchors, one (x1, y1, x2, y2) row each.
        num_anchors: Number of anchors per feature-map position (A).

    Returns:
        Tuple ``(rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights,
        rpn_bbox_outside_weights)``. Labels are reshaped to
        ``(1, 1, A * height, width)``; the other three arrays to
        ``(1, height, width, A * 4)``.
    """
    A = num_anchors
    total_anchors = all_anchors.shape[0]

    # allow boxes to sit over the edge by a small amount
    _allowed_border = 0

    # map of shape (..., H, W)
    height, width = rpn_cls_score.shape[1:3]

    # only keep anchors inside the image
    inds_inside = np.where(
        (all_anchors[:, 0] >= -_allowed_border) &
        (all_anchors[:, 1] >= -_allowed_border) &
        (all_anchors[:, 2] < im_info[1] + _allowed_border) &  # width
        (all_anchors[:, 3] < im_info[0] + _allowed_border)  # height
    )[0]

    # keep only inside anchors
    anchors = all_anchors[inds_inside, :]

    # label: 1 is positive, 0 is negative, -1 is dont care
    labels = np.empty((len(inds_inside),), dtype=np.float32)
    labels.fill(-1)

    # overlaps between the anchors and the gt boxes
    # overlaps (ex, gt)
    # np.float was removed from NumPy; np.float64 is the same dtype.
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float64),
        np.ascontiguousarray(gt_boxes, dtype=np.float64))
    # Best ground-truth box for each anchor.
    argmax_overlaps = overlaps.argmax(axis=1)
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
    # Best overlap achieved for each ground-truth box, then every anchor
    # that reaches that value (ties included).
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    gt_max_overlaps = overlaps[gt_argmax_overlaps,
                               np.arange(overlaps.shape[1])]
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

    if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # assign bg labels first so that positive labels can clobber them
        # first set the negatives
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # fg label: for each gt, anchor with highest overlap
    labels[gt_argmax_overlaps] = 1

    # fg label: above threshold IOU
    labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

    if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
        # assign bg labels last so that negative labels can clobber positives
        labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

    # subsample positive labels if we have too many
    num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        disable_inds = npr.choice(
            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1

    # subsample negative labels if we have too many
    num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        disable_inds = npr.choice(
            bg_inds, size=(len(bg_inds) - num_bg), replace=False)
        labels[disable_inds] = -1

    # Regression targets of every inside anchor towards its best gt box.
    bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

    bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    # only the positive ones have regression targets
    bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)

    bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:
        # uniform weighting of examples (given non-uniform sampling)
        num_examples = np.sum(labels >= 0)
        positive_weights = np.ones((1, 4)) * 1.0 / num_examples
        negative_weights = np.ones((1, 4)) * 1.0 / num_examples
    else:
        assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) &
                (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1))
        positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT /
                            np.sum(labels == 1))
        negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) /
                            np.sum(labels == 0))
    bbox_outside_weights[labels == 1, :] = positive_weights
    bbox_outside_weights[labels == 0, :] = negative_weights

    # map up to original set of anchors
    labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
    bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
    bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)

    # labels
    labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
    labels = labels.reshape((1, 1, A * height, width))
    rpn_labels = labels

    # bbox_targets
    rpn_bbox_targets = bbox_targets.reshape((1, height, width, A * 4))

    # bbox_inside_weights
    rpn_bbox_inside_weights = bbox_inside_weights.reshape((1, height, width, A * 4))

    # bbox_outside_weights
    rpn_bbox_outside_weights = bbox_outside_weights.reshape((1, height, width, A * 4))

    return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights
def _compute_targets(ex_rois, gt_rois):
    """Return float32 box regression targets from ex_rois to gt_rois.

    ex_rois must have 4 columns and gt_rois 5 columns; only the first four
    columns of gt_rois are used as box coordinates.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 5

    gt_coords = gt_rois[:, :4]
    deltas = bbox_transform(ex_rois, gt_coords)
    return deltas.astype(np.float32, copy=False)
lib/model/bbox_transform.py
def bbox_transform(ex_rois, gt_rois):
    """Compute (dx, dy, dw, dh) regression targets from ex_rois to gt_rois.

    Both inputs hold one (x1, y1, x2, y2) box per row. The result has one
    row per box: centre offsets normalized by the ex box size, followed by
    the log of the size ratios.
    """

    def _size_and_center(rois):
        # Width, height and centre of each box.
        w = rois[:, 2] - rois[:, 0] + 1.0
        h = rois[:, 3] - rois[:, 1] + 1.0
        return w, h, rois[:, 0] + 0.5 * w, rois[:, 1] + 0.5 * h

    src_w, src_h, src_cx, src_cy = _size_and_center(ex_rois)
    dst_w, dst_h, dst_cx, dst_cy = _size_and_center(gt_rois)

    rows = (
        (dst_cx - src_cx) / src_w,
        (dst_cy - src_cy) / src_h,
        np.log(dst_w / src_w),
        np.log(dst_h / src_h),
    )
    return np.vstack(rows).transpose()
6、rois, _ = self._proposal_target_layer(rois, roi_scores, "rpn_rois")
从之前RPN给出的2000个proposal中选出__C.TRAIN.BATCH_SIZE(128, 其中25%是前景, 75%是背景)作为训练的一批。
lib/nets/network.py
def _proposal_target_layer(self, rois, roi_scores, name):
    """Wrap ``proposal_target_layer`` as a graph op and record its outputs.

    Args:
        rois: Proposals produced by the RPN.
        roi_scores: Scores of those proposals.
        name: Variable scope name for the created ops.

    Returns:
        Tuple ``(rois, roi_scores)`` of the sampled proposals. The rois,
        int32 labels, bbox targets and weights are also stored in
        ``self._proposal_targets`` and merged into ``self._score_summaries``.
    """
    with tf.variable_scope(name) as scope:
        rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights = tf.py_func(
            proposal_target_layer,
            [rois, roi_scores, self._gt_boxes, self._num_classes],
            [tf.float32, tf.float32, tf.float32, tf.float32, tf.float32, tf.float32],
            name="proposal_target")

        # Declare the static shapes explicitly; the first dimension is
        # cfg.TRAIN.BATCH_SIZE.
        rois.set_shape([cfg.TRAIN.BATCH_SIZE, 5])
        roi_scores.set_shape([cfg.TRAIN.BATCH_SIZE])
        labels.set_shape([cfg.TRAIN.BATCH_SIZE, 1])
        bbox_targets.set_shape([cfg.TRAIN.BATCH_SIZE, self._num_classes * 4])
        bbox_inside_weights.set_shape([cfg.TRAIN.BATCH_SIZE, self._num_classes * 4])
        bbox_outside_weights.set_shape([cfg.TRAIN.BATCH_SIZE, self._num_classes * 4])

        self._proposal_targets['rois'] = rois
        self._proposal_targets['labels'] = tf.to_int32(labels, name="to_int32")
        self._proposal_targets['bbox_targets'] = bbox_targets
        self._proposal_targets['bbox_inside_weights'] = bbox_inside_weights
        self._proposal_targets['bbox_outside_weights'] = bbox_outside_weights

        self._score_summaries.update(self._proposal_targets)

        return rois, roi_scores
lib/layer_utils/proposal_target_layer.py
def proposal_target_layer(rpn_rois, rpn_scores, gt_boxes, _num_classes):
    """
    Sample RPN proposals and pair them with ground-truth targets.

    Returns (rois, roi_scores, labels, bbox_targets, bbox_inside_weights,
    bbox_outside_weights): the classification labels and box regression
    targets for the sampled proposals. Outside weights are 1.0 wherever the
    inside weights are positive, otherwise 0.0.
    """
    # Candidate ROIs (0, x1, y1, x2, y2) coming from the RPN
    # (i.e., rpn.proposal_layer.ProposalLayer), or any other source
    candidate_rois = rpn_rois
    candidate_scores = rpn_scores

    # Optionally add the ground-truth boxes themselves as candidates
    if cfg.TRAIN.USE_GT:
        zero_col = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
        gt_as_rois = np.hstack((zero_col, gt_boxes[:, :-1]))
        candidate_rois = np.vstack((candidate_rois, gt_as_rois))
        # not sure if it a wise appending, but anyway i am not using it
        candidate_scores = np.vstack((candidate_scores, zero_col))

    num_images = 1
    rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images
    fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)

    # Draw the sample together with its labels and regression targets
    sampled = _sample_rois(
        candidate_rois, candidate_scores, gt_boxes, fg_rois_per_image,
        rois_per_image, _num_classes)
    labels, rois, roi_scores, bbox_targets, bbox_inside_weights = sampled

    target_cols = _num_classes * 4
    rois = rois.reshape(-1, 5)
    roi_scores = roi_scores.reshape(-1)
    labels = labels.reshape(-1, 1)
    bbox_targets = bbox_targets.reshape(-1, target_cols)
    bbox_inside_weights = bbox_inside_weights.reshape(-1, target_cols)
    bbox_outside_weights = (bbox_inside_weights > 0).astype(np.float32)

    return rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights
def _get_bbox_regression_labels(bbox_target_data, num_classes):
    """Expand compact regression targets into the per-class layout.

    bbox_target_data is N x (class, tx, ty, tw, th). For each row with
    class > 0, the four targets are written into columns
    [4 * class, 4 * class + 4) of the output; all other entries stay zero.

    Returns:
        bbox_target (ndarray): N x 4K blob of regression targets
        bbox_inside_weights (ndarray): N x 4K blob of loss weights
    """
    class_ids = bbox_target_data[:, 0]
    bbox_targets = np.zeros((class_ids.size, 4 * num_classes), dtype=np.float32)
    bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)

    # Only rows with class > 0 receive targets and weights.
    for row in np.where(class_ids > 0)[0]:
        first_col = int(4 * class_ids[row])
        cols = slice(first_col, first_col + 4)
        bbox_targets[row, cols] = bbox_target_data[row, 1:]
        bbox_inside_weights[row, cols] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS

    return bbox_targets, bbox_inside_weights
def _compute_targets(ex_rois, gt_rois, labels):
    """Return N x (label, dx, dy, dw, dh) float32 regression targets.

    Targets are optionally normalized by the configured means and stds.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    deltas = bbox_transform(ex_rois, gt_rois)
    if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
        # Optionally normalize targets by a precomputed mean and stdev
        means = np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
        stds = np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)
        deltas = (deltas - means) / stds

    # Prepend the label as the first column.
    label_col = labels[:, np.newaxis]
    return np.hstack((label_col, deltas)).astype(np.float32, copy=False)
def _sample_rois(all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
    """Generate a random sample of RoIs comprising foreground and background
    examples.

    Args:
        all_rois: Candidate RoIs; columns 1..4 are the box coordinates.
        all_scores: Scores aligned with ``all_rois``.
        gt_boxes: Ground-truth boxes; columns 0..3 are coordinates and
            column 4 is the class label.
        fg_rois_per_image: Maximum number of foreground RoIs to sample.
        rois_per_image: Total number of RoIs to sample.
        num_classes: Number of classes for the expanded regression targets.

    Returns:
        Tuple ``(labels, rois, roi_scores, bbox_targets,
        bbox_inside_weights)`` for the sampled RoIs, foreground first.

    Raises:
        ValueError: if no RoI qualifies as foreground or background.
    """
    # overlaps: (rois x gt_boxes)
    # np.float was removed from NumPy; np.float64 is the same dtype.
    overlaps = bbox_overlaps(
        np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float64),
        np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float64))
    gt_assignment = overlaps.argmax(axis=1)
    max_overlaps = overlaps.max(axis=1)
    labels = gt_boxes[gt_assignment, 4]

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
                       (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]

    # Small modification to the original version where we ensure a fixed number of regions are sampled
    if fg_inds.size > 0 and bg_inds.size > 0:
        fg_rois_per_image = min(fg_rois_per_image, fg_inds.size)
        fg_inds = npr.choice(fg_inds, size=int(fg_rois_per_image), replace=False)
        bg_rois_per_image = rois_per_image - fg_rois_per_image
        # Sample with replacement only when there are too few backgrounds.
        to_replace = bg_inds.size < bg_rois_per_image
        bg_inds = npr.choice(bg_inds, size=int(bg_rois_per_image), replace=to_replace)
    elif fg_inds.size > 0:
        to_replace = fg_inds.size < rois_per_image
        fg_inds = npr.choice(fg_inds, size=int(rois_per_image), replace=to_replace)
        fg_rois_per_image = rois_per_image
    elif bg_inds.size > 0:
        to_replace = bg_inds.size < rois_per_image
        bg_inds = npr.choice(bg_inds, size=int(rois_per_image), replace=to_replace)
        fg_rois_per_image = 0
    else:
        # Previously this dropped into pdb, which hangs non-interactive runs.
        raise ValueError(
            'No foreground or background RoIs available for sampling')

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Select sampled values from various arrays:
    labels = labels[keep_inds]
    # Clamp labels for the background RoIs to 0
    labels[int(fg_rois_per_image):] = 0
    rois = all_rois[keep_inds]
    roi_scores = all_scores[keep_inds]

    bbox_target_data = _compute_targets(
        rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)

    bbox_targets, bbox_inside_weights = \
        _get_bbox_regression_labels(bbox_target_data, num_classes)

    return labels, rois, roi_scores, bbox_targets, bbox_inside_weights
ROI层
这一层的输入是原始图片经过特征网络处理过的feature map和rois。
ROI主要做了3件事:
- 因为rois是在原始图片下的坐标,我们第一步需要将rois的坐标映射到feature map上
- 将映射后的区域划分为大小相等的sections
- 对每个section做max pooling
代码还是很清晰的
lib/nets/network.py
def _crop_pool_layer(self, bottom, rois, name):
    """Crop every RoI out of ``bottom`` and max-pool the crop.

    Args:
        bottom: feature map tensor; axis 1 is used as height and axis 2 as
            width when normalizing the box coordinates.
        rois: tensor whose column 0 is the batch index and columns 1-4 are
            x1, y1, x2, y2.
        name: variable scope under which the ops are created.

    Returns:
        The result of a 2x2 max pooling over crops resized to twice
        ``cfg.POOLING_SIZE`` on each side.
    """
    def _column(index, label):
        # One column of rois, kept 2-D so the columns can be concatenated.
        return tf.slice(rois, [0, index], [-1, 1], name=label)

    with tf.variable_scope(name):
        batch_ids = tf.squeeze(_column(0, "batch_id"), [1])

        # Extents used to normalize the box coordinates: (feature map
        # size - 1) scaled by the feature stride.
        feat_shape = tf.shape(bottom)
        height = (tf.to_float(feat_shape[1]) - 1.) * np.float32(self._feat_stride[0])
        width = (tf.to_float(feat_shape[2]) - 1.) * np.float32(self._feat_stride[0])

        x1 = _column(1, "x1") / width
        y1 = _column(2, "y1") / height
        x2 = _column(3, "x2") / width
        y2 = _column(4, "y2") / height

        # Boxes are passed in (y1, x1, y2, x2) order. Gradients would not
        # reach rois anyway; stopping them explicitly saves time.
        normalized_boxes = tf.stop_gradient(tf.concat([y1, x1, y2, x2], axis=1))

        # Crop at twice the pooling size so the 2x2 max pool below brings
        # the result down to the target size.
        crop_side = cfg.POOLING_SIZE * 2
        crops = tf.image.crop_and_resize(
            bottom,
            normalized_boxes,
            tf.to_int32(batch_ids),
            [crop_side, crop_side],
            name="crops")

    return slim.max_pool2d(crops, [2, 2], padding='SAME')
bboxes = tf.stop_gradient(tf.concat([y1, x1, y2, x2], axis=1)) 这行代码主要是为了节约计算时间,因为反向传播到这一步就停止了。
crops = tf.image.crop_and_resize(bottom, bboxes, tf.to_int32(batch_ids), [pre_pool_size, pre_pool_size], name="crops") 这行代码就是将feature map分为14 * 14的section,主要是为了适配不同size图片的feature map。
return slim.max_pool2d(crops, [2, 2], padding='SAME') 这行代码给出了返回结果,实现了filter为2 * 2的max pooling, 所以输出的结果是一个7 * 7的图。
构建全连接层
fc7 = self._head_to_tail(pool5, is_training)
下面代码的实现不用过多解释了吧:它用resnet_v1.resnet_v1跑了ResNet的最后一个block(self._blocks[-1:]),再用reduce_mean在空间维度上做平均池化,得到的fc7起到了FC层的作用
lib/nets/resnet_v1.py
def _head_to_tail(self, pool5, is_training, reuse=None):
    """Run the pooled RoI features through the last ResNet block.

    Args:
        pool5: pooled RoI feature tensor fed into the block.
        is_training: forwarded to ``resnet_arg_scope``.
        reuse: forwarded to ``resnet_v1.resnet_v1``.

    Returns:
        The block output averaged over axes 1 and 2.
    """
    arg_scope = resnet_arg_scope(is_training=is_training)
    with slim.arg_scope(arg_scope):
        # Only the final block is applied here; the root block is skipped.
        tail_blocks = self._blocks[-1:]
        block_out, _ = resnet_v1.resnet_v1(
            pool5,
            tail_blocks,
            global_pool=False,
            include_root_block=False,
            reuse=reuse,
            scope=self._scope)
        # average pooling done by reduce_mean
        pooled = tf.reduce_mean(block_out, axis=[1, 2])
    return pooled
区域分类和目标边框的回归
cls_prob, bbox_pred = self._region_classification(fc7, is_training,
initializer, initializer_bbox)
这段代码就是整个网络的输出部分。
lib/nets/network.py
def _region_classification(self, fc7, is_training, initializer, initializer_bbox):
    """Build the classification and box-regression output heads.

    Args:
        fc7: feature tensor fed to both fully connected heads.
        is_training: controls whether the head weights are trainable.
        initializer: weights initializer for the class-score layer.
        initializer_bbox: weights initializer for the box-regression layer.

    Returns:
        Tuple ``(cls_prob, bbox_pred)``. The class scores, predicted class,
        class probabilities and box predictions are also stored in
        ``self._predictions``.
    """
    # Class scores: one output per class, no activation.
    scores = slim.fully_connected(
        fc7,
        self._num_classes,
        weights_initializer=initializer,
        trainable=is_training,
        activation_fn=None,
        scope='cls_score')
    probabilities = self._softmax_layer(scores, "cls_prob")
    predicted_class = tf.argmax(scores, axis=1, name="cls_pred")

    # Box regression: four outputs per class, no activation.
    box_deltas = slim.fully_connected(
        fc7,
        self._num_classes * 4,
        weights_initializer=initializer_bbox,
        trainable=is_training,
        activation_fn=None,
        scope='bbox_pred')

    outputs = (
        ("cls_score", scores),
        ("cls_pred", predicted_class),
        ("cls_prob", probabilities),
        ("bbox_pred", box_deltas),
    )
    for key, tensor in outputs:
        self._predictions[key] = tensor

    return probabilities, box_deltas
到此整个faster rcnn的训练部分已经结束,testing部分的代码容易一些,跟训练也差不多,本文就不赘述了。