Last time we left off at this call in train.py:
final_bbox, final_scores, final_category, loss_dict = faster_rcnn.build_whole_detection_network(
    input_img_batch=img_batch,
    gtboxes_batch=gtboxes_and_label)
Now let's look at build_whole_detection_network(), which lives in build_whole_network.py (Ctrl+B on the function in PyCharm jumps to its definition). Let's take it slowly:
if self.is_training:
    # make sure the shape is [-1, 5]; -1 lets TF infer the number of rows
    gtboxes_batch = tf.reshape(gtboxes_batch, [-1, 5])
    # tf.cast converts the tensor to tf.float32
    gtboxes_batch = tf.cast(gtboxes_batch, tf.float32)

# shape of the image batch: (1, H, W, C) -- the shape itself, not the pixel data
img_shape = tf.shape(input_img_batch)
########################################
P_list = self.build_base_network(input_img_batch)  # [P2, P3, P4, P5, P6 feature maps]
More concretely, we need to look at:
def build_base_network(self, input_img_batch):
    # string.startswith(X) -- checks whether string begins with the prefix X, returns a bool
    if self.base_network_name.startswith('resnet_v1'):
        # builds the resnet backbone, applies the FPN idea to get P2~P6,
        # and returns the feature maps, one per pyramid level
        return resnet.resnet_base(input_img_batch, scope_name=self.base_network_name, is_training=self.is_training)
    elif self.base_network_name.startswith('MobilenetV2'):
        return mobilenet_v2.mobilenetv2_base(input_img_batch, is_training=self.is_training)
    else:
        raise ValueError('Sry, we only support resnet or mobilenet_v2')
The key line is this one:
return resnet.resnet_base(input_img_batch, scope_name=self.base_network_name, is_training=self.is_training)
Jump to its definition in resnet.py and take it apart:
def resnet_base(img_batch, scope_name, is_training=True):
    '''
    this code is derived from light-head rcnn.
    https://github.com/zengarden/light_head_rcnn
    It is convenient to freeze blocks. So we adapt this mode.
    '''
    if scope_name == 'resnet_v1_50':
        middle_num_units = 6
    elif scope_name == 'resnet_v1_101':
        middle_num_units = 23
    else:
        raise NotImplementedError('We only support resnet_v1_50 or resnet_v1_101. Check your network name....yjr')
This just dispatches on which resnet variant is in use; 50 and 101 differ only in the number of units in block3.
blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
          resnet_v1_block('block2', base_depth=128, num_units=4, stride=2),
          resnet_v1_block('block3', base_depth=256, num_units=middle_num_units, stride=2),
          resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)]
# when use fpn . stride list is [1, 2, 2]
I haven't worked through this part completely (it's a bit convoluted), but roughly speaking it builds the block structure of a resnet_v1_50.
with slim.arg_scope(resnet_arg_scope(is_training=False)):
    with tf.variable_scope(scope_name, scope_name):
        # Do the first few layers manually, because 'SAME' padding can behave inconsistently
        # for images of different sizes: sometimes 0, sometimes 1
        # the stem convolution
        net = resnet_utils.conv2d_same(
            img_batch, 64, 7, stride=2, scope='conv1')
        # tf.pad fills with zeros; [0, 0] means no padding along that dimension
        net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]])
        net = slim.max_pool2d(
            net, [3, 3], stride=2, padding='VALID', scope='pool1')
So: convolve the image down to a 64-channel feature map with a stride-2 7x7 conv, zero-pad the spatial dimensions, then max-pool.
To see what tf.pad actually produces, run something like the snippet below:
import tensorflow as tf

t = tf.constant([[[1, 2], [2, 3], [2, 1]],
                 [[1, 2], [2, 3], [2, 1]]])
print(t)  # just the symbolic tensor, shape (2, 3, 2)
# pad 1 row on each side of dim 0, 2 before dim 1, 1 before dim 2
a = tf.pad(t, [[1, 1], [2, 0], [1, 0]])
with tf.Session() as sess:
    print("----------------------------------")
    a = sess.run(a)
    print(a)
    print(a.shape)  # (4, 5, 3)
Next:
# when building the base network, not_freezed decides (per block) whether
# the feature-extraction layers are trained further or kept frozen
not_freezed = [False] * cfgs.FIXED_BLOCKS + (4 - cfgs.FIXED_BLOCKS) * [True]
Here cfgs.FIXED_BLOCKS = 0, so this evaluates to [True, True, True, True].
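A quick check of what the same expression yields for the other possible settings:

# not_freezed for every possible FIXED_BLOCKS value:
for FIXED_BLOCKS in range(4):
    print([False] * FIXED_BLOCKS + (4 - FIXED_BLOCKS) * [True])
# [True, True, True, True]     FIXED_BLOCKS = 0: train all four blocks
# [False, True, True, True]    FIXED_BLOCKS = 1: freeze block1
# [False, False, True, True]   FIXED_BLOCKS = 2: freeze block1~2
# [False, False, False, True]  FIXED_BLOCKS = 3: freeze block1~3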
with slim.arg_scope(resnet_arg_scope(is_training=(is_training and not_freezed[0]))):
    C2, end_points_C2 = resnet_v1.resnet_v1(net,
                                            blocks[0:1],
                                            global_pool=False,
                                            include_root_block=False,
                                            scope=scope_name)
The same pattern repeats for the remaining blocks, producing the C2~C5 feature maps.
feature_dict = {'C2': end_points_C2['{}/block1/unit_2/bottleneck_v1'.format(scope_name)],
                'C3': end_points_C3['{}/block2/unit_3/bottleneck_v1'.format(scope_name)],
                'C4': end_points_C4['{}/block3/unit_{}/bottleneck_v1'.format(scope_name, middle_num_units - 1)],
                'C5': end_points_C5['{}/block4/unit_3/bottleneck_v1'.format(scope_name)],
                # 'C5': end_points_C5['{}/block4'.format(scope_name)],
                }
Each level is stored under its own key in a dict. Next comes building the feature pyramid:
pyramid_dict = {}  # start from an empty dict
with tf.variable_scope('build_pyramid'):
    with slim.arg_scope([slim.conv2d], weights_regularizer=slim.l2_regularizer(cfgs.WEIGHT_DECAY),
                        activation_fn=None, normalizer_fn=None):

        P5 = slim.conv2d(C5,
                         num_outputs=256,
                         kernel_size=[1, 1],
                         stride=1, scope='build_P5')
        if "P6" in cfgs.LEVLES:
            P6 = slim.max_pool2d(P5, kernel_size=[1, 1], stride=2, scope='build_P6')
            pyramid_dict['P6'] = P6

        pyramid_dict['P5'] = P5
So the top levels are built first: P5 from C5 with a 1x1 conv, and P6 from P5 with a stride-2 max pool.
for level in range(4, 1, -1):  # build [P4, P3, P2]
    # the ['P%d' % level] part is the key; the right-hand side builds the value
    pyramid_dict['P%d' % level] = fusion_two_layer(C_i=feature_dict["C%d" % level],
                                                   P_j=pyramid_dict["P%d" % (level + 1)],
                                                   scope='build_P%d' % level)
This is FPN's top-down pathway with lateral connections: the coarser level P_{j+1} is upsampled 2x by bilinear interpolation and fused pixel-wise (with 0.5/0.5 weights) with the same-scale C_x. That yields the three levels P4, P3, P2.
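fusion_two_layer itself isn't pasted in this post, so here is a minimal sketch of what it presumably does, matching the call above (bilinear 2x upsampling plus a 1x1 lateral conv, blended with the 0.5/0.5 weights just mentioned; the repo's exact implementation may differ in details):

import tensorflow as tf
import tensorflow.contrib.slim as slim

def fusion_two_layer(C_i, P_j, scope):
    # top-down FPN fusion: upsample the coarser P_{j+1} to C_i's spatial
    # size, squeeze C_i to 256 channels with a 1x1 conv, then blend them
    with tf.variable_scope(scope):
        h, w = tf.shape(C_i)[1], tf.shape(C_i)[2]
        upsample_p = tf.image.resize_bilinear(P_j, size=[h, w])      # 2x upsample
        reduce_dim_c = slim.conv2d(C_i, num_outputs=256, kernel_size=[1, 1],
                                   stride=1, scope='reduce_dim')     # lateral 1x1 conv
        return 0.5 * upsample_p + 0.5 * reduce_dim_c                 # pixel-wise fusion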
for level in range(4, 1, -1):
    pyramid_dict['P%d' % level] = slim.conv2d(
        pyramid_dict['P%d' % level],
        num_outputs=256, kernel_size=[3, 3],
        padding="SAME", stride=1, scope="fuse_P%d" % level)
Each fused level then goes through a stride-1 3x3 conv with 256 output channels. Think of it as unifying the outputs while smoothing away the aliasing introduced by upsampling.
return [pyramid_dict[level_name] for level_name in cfgs.LEVLES]
So resnet_base in resnet.py ultimately returns a list [P2, P3, P4, P5, P6], each Px being one feature level.
Back in build_whole_network.py:
P_list = self.build_base_network(input_img_batch)
This line gives us P_list, a list whose elements are the pyramid feature maps.
#############################
with tf.variable_scope('build_rpn',
                       regularizer=slim.l2_regularizer(cfgs.WEIGHT_DECAY)):

    fpn_cls_score = []
    fpn_box_pred = []
    for level_name, p in zip(cfgs.LEVLES, P_list):  # pair each level name with its feature map
        if cfgs.SHARE_HEADS:
            reuse_flag = None if level_name == cfgs.LEVLES[0] else True
            # one shared set of head variables for every level
            scope_list = ['rpn_conv/3x3', 'rpn_cls_score', 'rpn_bbox_pred']
        else:
            reuse_flag = None
            scope_list = ['rpn_conv/3x3_%s' % level_name, 'rpn_cls_score_%s' % level_name, 'rpn_bbox_pred_%s' % level_name]
The code below is the heart of the RPN: the feature map first goes through a 3x3 conv, then two 1x1 convs predict the box offsets and separate foreground from background.
        # the 3x3 conv
        rpn_conv3x3 = slim.conv2d(
            p, 512, [3, 3],
            trainable=self.is_training, weights_initializer=cfgs.INITIALIZER, padding="SAME",
            activation_fn=tf.nn.relu,
            scope=scope_list[0],
            reuse=reuse_flag)
        # fg/bg score for every anchor at every location
        rpn_cls_score = slim.conv2d(rpn_conv3x3, self.num_anchors_per_location*2, [1, 1], stride=1,
                                    trainable=self.is_training, weights_initializer=cfgs.INITIALIZER,
                                    activation_fn=None, padding="VALID",
                                    scope=scope_list[1],
                                    reuse=reuse_flag)
        # box regression offsets
        rpn_box_pred = slim.conv2d(rpn_conv3x3, self.num_anchors_per_location*4, [1, 1], stride=1,
                                   trainable=self.is_training, weights_initializer=cfgs.BBOX_INITIALIZER,
                                   activation_fn=None, padding="VALID",
                                   scope=scope_list[2],
                                   reuse=reuse_flag)
        # flatten to one row per anchor: scores (fg, bg) and offsets (dx, dy, dw, dh)
        rpn_box_pred = tf.reshape(rpn_box_pred, [-1, 4])
        rpn_cls_score = tf.reshape(rpn_cls_score, [-1, 2])

        # the loop accumulates one entry per level (5 or 6 of them);
        # each entry has shape [-1, 2] / [-1, 4]
        fpn_cls_score.append(rpn_cls_score)
        fpn_box_pred.append(rpn_box_pred)
    fpn_cls_score = tf.concat(fpn_cls_score, axis=0, name='fpn_cls_score')  # shape [-1, 2]
    fpn_box_pred = tf.concat(fpn_box_pred, axis=0, name='fpn_box_pred')  # shape [-1, 4]
    fpn_cls_prob = slim.softmax(fpn_cls_score, scope='fpn_cls_prob')
To sum up: this block performs the RPN's coarse classification and box regression, ending with fpn_cls_prob of shape (-1, 2) and fpn_box_pred of shape (-1, 4).
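A toy shape check of those reshapes, with made-up sizes (H = 4, W = 5, k = 3 anchors per location):

import numpy as np

H, W, k = 4, 5, 3
rpn_cls_score = np.zeros((1, H, W, k * 2))   # 2 fg/bg scores per anchor
rpn_box_pred = np.zeros((1, H, W, k * 4))    # 4 offsets per anchor

# after the reshapes in the loop above:
print(rpn_cls_score.reshape(-1, 2).shape)    # (60, 2)  -> H*W*k rows
print(rpn_box_pred.reshape(-1, 4).shape)     # (60, 4)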
#######################################################
all_anchors = []
for i in range(len(cfgs.LEVLES)):
    level_name, p = cfgs.LEVLES[i], P_list[i]
    # p_h / p_w are scalar tensors: the spatial size of this level
    p_h, p_w = tf.shape(p)[1], tf.shape(p)[2]
    featuremap_height = tf.cast(p_h, tf.float32)
    featuremap_width = tf.cast(p_w, tf.float32)
    anchors = anchor_utils.make_anchors(base_anchor_size=cfgs.BASE_ANCHOR_SIZE_LIST[i],
                                        anchor_scales=cfgs.ANCHOR_SCALES,
                                        anchor_ratios=cfgs.ANCHOR_RATIOS,
                                        featuremap_height=featuremap_height,
                                        featuremap_width=featuremap_width,
                                        stride=cfgs.ANCHOR_STRIDE_LIST[i],
                                        name="make_anchors_for%s" % level_name)
anchor_utils.make_anchors is defined in anchor_utils.py:
def make_anchors(base_anchor_size, anchor_scales, anchor_ratios,
                 featuremap_height, featuremap_width,
                 stride, name='make_anchors'):
    with tf.variable_scope(name):
        base_anchor = tf.constant([0, 0, base_anchor_size, base_anchor_size], tf.float32)  # [x_center, y_center, w, h]

        # enumerate every possible (w, h) combination of scales and ratios, as flat lists
        ws, hs = enum_ratios(enum_scales(base_anchor, anchor_scales),
                             anchor_ratios)  # per locations ws and hs

        # !!!!!!!!!!!!!!!!!!!!!!!!!!!!
        # the anchor centers on the ORIGINAL image corresponding to each feature-map cell
        x_centers = tf.range(featuremap_width, dtype=tf.float32) * stride
        y_centers = tf.range(featuremap_height, dtype=tf.float32) * stride

        if cfgs.USE_CENTER_OFFSET:
            x_centers = x_centers + stride/2.
            y_centers = y_centers + stride/2.

        # meshgrid pairs every x with every y, then every center with every (w, h);
        # see the small demo below if this looks opaque
        x_centers, y_centers = tf.meshgrid(x_centers, y_centers)

        ws, x_centers = tf.meshgrid(ws, x_centers)
        hs, y_centers = tf.meshgrid(hs, y_centers)

        # pair the coordinates into center points, shape [H*W, k, 2]
        anchor_centers = tf.stack([x_centers, y_centers], 2)
        # flatten to [-1, 2]
        anchor_centers = tf.reshape(anchor_centers, [-1, 2])

        box_sizes = tf.stack([ws, hs], axis=2)
        box_sizes = tf.reshape(box_sizes, [-1, 2])
        print(box_sizes)

        # turn the (center, size) pairs into (x1, y1, x2, y2) corner coordinates
        anchors = tf.concat([anchor_centers - 0.5*box_sizes,
                             anchor_centers + 0.5*box_sizes], axis=1)
        return anchors
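If the meshgrid gymnastics above look opaque (they puzzled me too), this tiny NumPy demo with 2 centers per axis and 2 anchor shapes shows how every center gets paired with every (w, h):

import numpy as np

# 2 x-centers, 2 y-centers, 2 anchor shapes -> 2*2*2 = 8 anchors
x_centers, y_centers = np.array([8., 24.]), np.array([8., 24.])
ws, hs = np.array([16., 32.]), np.array([16., 32.])

x_centers, y_centers = np.meshgrid(x_centers, y_centers)  # all (x, y) grid pairs
ws, x_centers = np.meshgrid(ws, x_centers)                # pair every center with every w
hs, y_centers = np.meshgrid(hs, y_centers)                # pair every center with every h

centers = np.stack([x_centers, y_centers], axis=2).reshape(-1, 2)
sizes = np.stack([ws, hs], axis=2).reshape(-1, 2)
print(np.hstack([centers - 0.5 * sizes, centers + 0.5 * sizes]))  # (8, 4) corner boxes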
Back in build_whole_network.py: make_anchors gives us, for each feature level, the coordinates (on the original image) of the anchors attached to every feature point.
    all_anchors.append(anchors)
all_anchors = tf.concat(all_anchors, axis=0, name='all_anchors_of_FPN')
Same drill as with fpn_cls_prob earlier: the per-level groups are concatenated into one big row-per-anchor tensor. You can think of this as densely sampling boxes over the whole input image.
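To get a feel for how dense this sampling is, a back-of-the-envelope count (purely illustrative numbers: a 600x600 input, strides [4, 8, 16, 32, 64], 3 anchors per location; the real values come from cfgs):

img_size, strides, k = 600, [4, 8, 16, 32, 64], 3  # hypothetical config
total = sum((img_size // s) ** 2 * k for s in strides)
print(total)  # 89697 anchors over P2~P6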
#################################################
rois, roi_scores = postprocess_rpn_proposals(rpn_bbox_pred=fpn_box_pred,
                                             rpn_cls_prob=fpn_cls_prob,
                                             img_shape=img_shape,
                                             anchors=all_anchors,
                                             is_training=self.is_training)
postprocess_rpn_proposals takes as input the RPN's predicted offsets, its predicted class probabilities (foreground/background), the image shape, and the anchor boxes.
postprocess_rpn_proposals is defined in proposal_opr.py. The rough idea:
1. take the ~12000 anchors with the highest foreground probability;
2. apply the regressed position offsets to correct those anchors, giving candidate RoIs;
3. run non-maximum suppression (NMS) and keep the ~2000 highest-scoring RoIs.
# decode: apply the RPN's predicted offsets to the anchors
# (a NumPy sketch of this transform follows after this block)
decode_boxes = encode_and_decode.decode_boxes(encoded_boxes=rpn_bbox_pred,
                                              reference_boxes=anchors,
                                              scale_factors=cfgs.ANCHOR_SCALE_FACTORS)

# clip the decoded boxes: any box sticking out of the image
# is clamped back to the image boundary
decode_boxes = boxes_utils.clip_boxes_to_img_boundaries(decode_boxes=decode_boxes,
                                                        img_shape=img_shape)

if pre_nms_topN > 0:
    # keep at most pre_nms_topN (12000) boxes -- take the min in case there are fewer
    pre_nms_topN = tf.minimum(pre_nms_topN, tf.shape(decode_boxes)[0], name='avoid_unenough_boxes')
    # cls_prob: the top-k foreground probabilities, sorted descending;
    # top_k_indices: their positions in the original tensor
    cls_prob, top_k_indices = tf.nn.top_k(cls_prob, k=pre_nms_topN)
    # tf.gather pulls out the boxes belonging to those indices,
    # so decode_boxes ends up sorted by score as well
    decode_boxes = tf.gather(decode_boxes, top_k_indices)

# 4. NMS
# returns a 1-D integer tensor of shape [M] (M <= max_output_size):
# the indices of the boxes that survive suppression
keep = tf.image.non_max_suppression(
    boxes=decode_boxes,
    scores=cls_prob,
    max_output_size=post_nms_topN,
    iou_threshold=nms_thresh)

# gather the final boxes and their probabilities
final_boxes = tf.gather(decode_boxes, keep)
final_probs = tf.gather(cls_prob, keep)

# return the filtered boxes and probs
return final_boxes, final_probs
That's my walkthrough of proposal_opr.py =-= : regress the anchors with the RPN outputs, clip them, and run NMS. At this point we basically have our regions of interest; the first round of filtering is done!
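For reference, a minimal NumPy sketch of the decode step (the standard Faster R-CNN inverse transform; this is my paraphrase, ignoring the optional scale_factors, not the repo's exact code):

import numpy as np

def decode_boxes(deltas, anchors):
    # anchors and output boxes are (x1, y1, x2, y2); deltas are (dx, dy, dw, dh)
    aw, ah = anchors[:, 2] - anchors[:, 0], anchors[:, 3] - anchors[:, 1]
    ax, ay = anchors[:, 0] + 0.5 * aw, anchors[:, 1] + 0.5 * ah
    cx, cy = deltas[:, 0] * aw + ax, deltas[:, 1] * ah + ay      # shift the center
    w, h = np.exp(deltas[:, 2]) * aw, np.exp(deltas[:, 3]) * ah  # rescale width/height
    return np.stack([cx - 0.5 * w, cy - 0.5 * h,
                     cx + 0.5 * w, cy + 0.5 * h], axis=1)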
Back to build_whole_network.py:
if self.is_training:
    with tf.variable_scope('sample_anchors_minibatch'):
        # tf.py_func(func, inputs of func, output types of func)
        # this samples the 256-anchor minibatch: it returns the sampled RPN
        # labels fpn_labels and the regression targets fpn_bbox_targets
        fpn_labels, fpn_bbox_targets = \
            tf.py_func(
                anchor_target_layer,
                [gtboxes_batch, img_shape, all_anchors],
                [tf.float32, tf.float32])
        fpn_bbox_targets = tf.reshape(fpn_bbox_targets, [-1, 4])
        fpn_labels = tf.to_int32(fpn_labels, name="to_int32")
        fpn_labels = tf.reshape(fpn_labels, [-1])

        # draw the positive/negative samples onto the summary image
        self.add_anchor_img_smry(input_img_batch, all_anchors, fpn_labels)
tf.py_func() lets you drop an arbitrary Python function into the graph: anchor_target_layer is the function, [gtboxes_batch, img_shape, all_anchors] are its inputs, and [tf.float32, tf.float32] declares the dtypes of its outputs.
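If tf.py_func is new to you, a self-contained toy (TF 1.x) shows the pattern:

import numpy as np
import tensorflow as tf

def double_np(x):
    # an ordinary NumPy function, opaque to the TF graph
    return (x * 2).astype(np.float32)

inp = tf.constant([1., 2., 3.])
out = tf.py_func(double_np, [inp], [tf.float32])[0]  # wrap it as a graph op
out.set_shape([3])  # py_func loses static shape info; restore it by hand

with tf.Session() as sess:
    print(sess.run(out))  # [2. 4. 6.]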
anchor_target_layer lives in anchor_target_layer_without_boxweight.py.
For this part I strongly recommend this post: https://blog.csdn.net/u014380165/article/details/80379812
I'll borrow some of its annotations below (don't flame me =-=, the author really does explain it in detail).
def anchor_target_layer(gt_boxes, img_shape, all_anchors, is_restrict_bg=False):
    total_anchors = all_anchors.shape[0]  # total number of anchors
    img_h, img_w = img_shape[1], img_shape[2]  # image height and width
    gt_boxes = gt_boxes[:, :-1]  # drop the label column, keep (x1, y1, x2, y2)

    _allowed_border = 0
    # inds_inside: indices of the anchors whose 4 corners all lie inside the image
    if cfgs.IS_FILTER_OUTSIDE_BOXES:
        inds_inside = np.where(
            (all_anchors[:, 0] >= -_allowed_border) &
            (all_anchors[:, 1] >= -_allowed_border) &
            (all_anchors[:, 2] < img_w + _allowed_border) &  # width
            (all_anchors[:, 3] < img_h + _allowed_border)  # height
        )[0]
    else:
        inds_inside = range(all_anchors.shape[0])  # keep everything, in order

    # filter out the anchors that are not entirely inside the image
    anchors = all_anchors[inds_inside, :]

    # initialize the labels as "don't care": fill everything with -1
    labels = np.empty((len(inds_inside),), dtype=np.float32)
    labels.fill(-1)
Now the crucial part (this is where that blog post saved me):
    # overlaps is a (num_anchors, num_gt) table of anchor-vs-GT IoU values
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float),
        np.ascontiguousarray(gt_boxes, dtype=np.float))

    # argmax over each row: for every anchor, the index of the gt box it overlaps most
    argmax_overlaps = overlaps.argmax(axis=1)
    # max_overlaps: the actual IoU value of that best match
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]

    # argmax over each column (one column per ground-truth box):
    # for every gt, which anchor overlaps it the most
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    # the IoU values of those per-gt best anchors
    gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
    # fish out ALL anchors that tie the per-gt maximum IoU
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
    if not cfgs.TRAIN_RPN_CLOOBER_POSITIVES:
        # assign background first: any anchor whose best IoU is below 0.3
        labels[max_overlaps < cfgs.RPN_IOU_NEGATIVE_THRESHOLD] = 0

    # two kinds of foreground label: 1) the anchor with the highest IoU for each
    # gt object, 2) any anchor whose IoU with some object exceeds 0.7
    labels[gt_argmax_overlaps] = 1
    labels[max_overlaps >= cfgs.RPN_IOU_POSITIVE_THRESHOLD] = 1

    if cfgs.TRAIN_RPN_CLOOBER_POSITIVES:
        # background assigned last, so negatives can clobber positives:
        # anchors whose best IoU against every object is below 0.3
        labels[max_overlaps < cfgs.RPN_IOU_NEGATIVE_THRESHOLD] = 0
    '''
    toy IoU table (rows = anchors, columns = gt boxes):
        0.1 0.2 0.1 0.1
        0.2 0.1 0.5 0.4
        0.7 0.1 0.6 0.6
    Column 1's best anchor is row 0 (IoU 0.2), so row 0 gets label 1 via
    gt_argmax_overlaps -- yet its max IoU is below 0.3, so when the clobbering
    background pass runs afterwards, that anchor can end up a negative sample.
    '''
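You can reproduce the two argmax passes on that toy table:

import numpy as np

overlaps = np.array([[0.1, 0.2, 0.1, 0.1],    # rows: anchors, columns: gt boxes
                     [0.2, 0.1, 0.5, 0.4],
                     [0.7, 0.1, 0.6, 0.6]])
print(overlaps.argmax(axis=1))  # [1 2 0] -> best gt for each anchor
print(overlaps.argmax(axis=0))  # [2 0 2 2] -> best anchor for each gt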
    # subsample if we have too many of either kind
    # number of foreground samples: 256 * 0.5 = 128
    num_fg = int(cfgs.RPN_MINIBATCH_SIZE * cfgs.RPN_POSITIVE_RATE)
    # indices of the positive samples
    fg_inds = np.where(labels == 1)[0]
    # if there are more than 128 positives, randomly demote
    # the surplus ones to -1 (don't care)
    if len(fg_inds) > num_fg:
        disable_inds = npr.choice(
            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1

    # number of negatives = 256 - number of positives (so 128 when the
    # positives max out); if there are more negatives than that,
    # randomly demote the surplus to -1 as well
    num_bg = cfgs.RPN_MINIBATCH_SIZE - np.sum(labels == 1)
    if is_restrict_bg:
        num_bg = max(num_bg, num_fg * 1.5)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        disable_inds = npr.choice(
            bg_inds, size=(len(bg_inds) - num_bg), replace=False)
        labels[disable_inds] = -1
    # at this point every anchor has its sample label (still only fg/bg, no classes)

    # _compute_targets computes, for each anchor, the regression factors
    # (t_xcenter, t_ycenter, t_w, t_h) towards the gt box it overlaps most
    # (a NumPy sketch of this encoding follows after the recap below)
    bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

    # everything so far was computed only for the anchors inside the image,
    # but the final result must cover ALL anchors, so _unmap scatters the
    # processed values back into full-size arrays
    labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)

    rpn_labels = labels.reshape((-1, 1))  # now we have labels for all anchors
    bbox_targets = bbox_targets.reshape((-1, 4))
    rpn_bbox_targets = bbox_targets

    # return the per-anchor labels and the regression targets (the t factors)
    return rpn_labels, rpn_bbox_targets
So what did all that do? In short: it keeps the anchors inside the image, labels each one as foreground (1), background (0) or don't-care (-1) based on IoU with the ground truth, subsamples a 256-anchor minibatch, and computes the regression targets towards each anchor's best-matching gt box.
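_compute_targets boils down to the standard box-to-delta encoding, i.e. the inverse of the decode sketch earlier (again my paraphrase, without scale factors, not the repo's exact code):

import numpy as np

def encode_boxes(anchors, gt_boxes):
    # both inputs are (x1, y1, x2, y2); output rows are (dx, dy, dw, dh)
    aw, ah = anchors[:, 2] - anchors[:, 0], anchors[:, 3] - anchors[:, 1]
    ax, ay = anchors[:, 0] + 0.5 * aw, anchors[:, 1] + 0.5 * ah
    gw, gh = gt_boxes[:, 2] - gt_boxes[:, 0], gt_boxes[:, 3] - gt_boxes[:, 1]
    gx, gy = gt_boxes[:, 0] + 0.5 * gw, gt_boxes[:, 1] + 0.5 * gh
    return np.stack([(gx - ax) / aw, (gy - ay) / ah,
                     np.log(gw / aw), np.log(gh / ah)], axis=1)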
Back in build_whole_network.py. Everything so far served the RPN and never touched the real class information; from here on, the RCNN part takes over to produce the corresponding rois, targets, and so on.
So fpn_labels, fpn_bbox_targets here are exactly the rpn_labels, rpn_bbox_targets returned above.
Then come some summary drawings and accuracy calculations; they're easy enough to follow, so I'll skip them.
Next up is this code, at the same nesting level as the with block above:
with tf.control_dependencies([fpn_labels]):
    with tf.variable_scope('sample_RCNN_minibatch'):
        rois, labels, bbox_targets = \
            tf.py_func(proposal_target_layer,
                       [rois, gtboxes_batch],
                       [tf.float32, tf.float32, tf.float32])
        rois = tf.reshape(rois, [-1, 4])
        labels = tf.to_int32(labels)
        labels = tf.reshape(labels, [-1])
        # regression targets, shape (-1, 4*(20+1)): the box coordinates
        # of each roi for every class
        bbox_targets = tf.reshape(bbox_targets, [-1, 4*(cfgs.CLASS_NUM+1)])
        self.add_roi_batch_img_smry(input_img_batch, rois, labels)
proposal_target_layer is in proposal_target_layer.py. Careful here =-= : the "anchors" at this stage are the rois. I nearly cried when I realized this; I misread them as all_anchors and wasted a whole morning digging through references.
def proposal_target_layer(rpn_rois, gt_boxes):
    """
    Assign object detection proposals to ground-truth targets. Produces proposal
    classification labels and bounding-box regression targets.
    """
    # Proposal ROIs (x1, y1, x2, y2) coming from RPN
    # gt_boxes (x1, y1, x2, y2, label)
    if cfgs.ADD_GTBOXES_TO_TRAIN:
        # gt_boxes[:, :-1] drops the last (label) column;
        # vstack stacks the gt boxes under the rpn rois, row by row
        all_rois = np.vstack((rpn_rois, gt_boxes[:, :-1]))
    else:
        all_rois = rpn_rois

    # if cfgs.FAST_RCNN_MINIBATCH_SIZE == -1 then rois_per_image = np.inf,
    # otherwise rois_per_image = cfgs.FAST_RCNN_MINIBATCH_SIZE = 256,
    # i.e. the number of rois sampled per image
    rois_per_image = np.inf if cfgs.FAST_RCNN_MINIBATCH_SIZE == -1 else cfgs.FAST_RCNN_MINIBATCH_SIZE

    # number of foreground rois per image: 256 * 0.25 = 64
    fg_rois_per_image = np.round(cfgs.FAST_RCNN_POSITIVE_RATE * rois_per_image)

    # _sample_rois picks the rois that will actually be trained on for
    # classification, along with their class and coordinate ground truth
    # (what the box-loss computation needs; the bbox_inside_weights of the
    # original Faster R-CNN are not produced in this repo). This is a second
    # round of filtering, and the labels become real per-class labels,
    # no longer just fg/bg.
    # Note bbox_targets has shape (-1, num_classes[incl. background] * 4);
    # within each row only 4 of the values are non-zero, the rest are 0.
    labels, rois, bbox_targets = _sample_rois(all_rois, gt_boxes, fg_rois_per_image,
                                              rois_per_image, cfgs.CLASS_NUM+1)

    rois = rois.reshape(-1, 4)
    labels = labels.reshape(-1)
    bbox_targets = bbox_targets.reshape(-1, (cfgs.CLASS_NUM+1) * 4)

    return rois, labels, bbox_targets
Finally it returns the rois (now filtered twice), the labels, and bbox_targets (the coordinate boxes).
Back in build_whole_network.py:
if self.is_training:
    rois_list, labels, bbox_targets = self.assign_levels(all_rois=rois,
                                                         labels=labels,
                                                         bbox_targets=bbox_targets)
else:
    # the rois each Px level gets back; the assignment rule is sketched below
    rois_list = self.assign_levels(all_rois=rois)  # rois_list: [P2_rois, P3_rois, P
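assign_levels distributes each roi over the pyramid. The FPN paper's rule (its Eq. 1) is k = floor(k0 + log2(sqrt(w*h)/224)) with k0 = 4, clipped to [P2, P5]; a NumPy sketch of that rule (the repo's exact constants may differ):

import numpy as np

def assign_levels(rois, k0=4, min_level=2, max_level=5):
    # rois: (N, 4) as (x1, y1, x2, y2); bigger rois map to coarser levels
    w, h = rois[:, 2] - rois[:, 0], rois[:, 3] - rois[:, 1]
    k = np.floor(k0 + np.log2(np.sqrt(w * h) / 224.0))
    return np.clip(k, min_level, max_level).astype(np.int32)

print(assign_levels(np.array([[0., 0., 224., 224.],
                              [0., 0., 56., 56.]])))  # [4 2]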
##########################################
5. Building the Fast R-CNN head
bbox_pred, cls_score = self.build_fastrcnn(P_list=P_list, rois_list=rois_list,
                                           img_shape=img_shape)
cls_prob = slim.softmax(cls_score, 'cls_prob')  # softmax turns the scores into probabilities
## below is the code of the build_fastrcnn function
def build_fastrcnn(self, P_list, rois_list, img_shape):
    with tf.variable_scope('Fast-RCNN'):
        # 5. ROI Pooling
        with tf.variable_scope('rois_pooling'):
            pooled_features_list = []
            for level_name, p, rois in zip(cfgs.LEVLES, P_list, rois_list):  # exclude P6_rois
                # roi-pool every roi of this level. Under the hood this calls
                # tf.image.crop_and_resize with box_ind=tf.zeros(shape=[N, ], dtype=tf.int32),
                # i.e. every roi is cropped from image 0 of the batch (the only image);
                # each roi becomes 14x14, then a 3x3 conv follows, and the roi features
                # come back (a sketch of the crop_and_resize idea follows after this function)
                pooled_features = self.roi_pooling(feature_maps=p, rois=rois, img_shape=img_shape,
                                                   scope=level_name)
                # collect the pooled features of every level
                pooled_features_list.append(pooled_features)

            # all levels' roi features stacked along axis 0
            pooled_features = tf.concat(pooled_features_list, axis=0)
        # 6. inference rois in Fast-RCNN to obtain fc_flatten features
        if self.base_network_name.startswith('resnet'):
            # two fully connected layers
            fc_flatten = resnet.restnet_head(inputs=pooled_features,
                                             is_training=self.is_training,
                                             scope_name=self.base_network_name)
        elif self.base_network_name.startswith('Mobile'):
            fc_flatten = mobilenet_v2.mobilenetv2_head(inputs=pooled_features,
                                                       is_training=self.is_training)
        else:
            raise NotImplementedError('only support resnet and mobilenet')
        # 7. cls and reg in Fast-RCNN
        with slim.arg_scope([slim.fully_connected], weights_regularizer=slim.l2_regularizer(cfgs.WEIGHT_DECAY)):

            # classification scores, shape (128, 20+1)
            cls_score = slim.fully_connected(fc_flatten,
                                             num_outputs=cfgs.CLASS_NUM+1,
                                             weights_initializer=cfgs.INITIALIZER,
                                             activation_fn=None, trainable=self.is_training,
                                             scope='cls_fc')
            # regressed box positions, shape (128, 4*(20+1))
            bbox_pred = slim.fully_connected(fc_flatten,
                                             num_outputs=(cfgs.CLASS_NUM+1)*4,
                                             weights_initializer=cfgs.BBOX_INITIALIZER,
                                             activation_fn=None, trainable=self.is_training,
                                             scope='reg_fc')
            # for convenience. It also produces (cls_num + 1) bboxes
            cls_score = tf.reshape(cls_score, [-1, cfgs.CLASS_NUM+1])
            bbox_pred = tf.reshape(bbox_pred, [-1, 4*(cfgs.CLASS_NUM+1)])

            # return the box predictions and the class scores
            return bbox_pred, cls_score
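The roi_pooling mentioned in the comments above is essentially tf.image.crop_and_resize; here is a minimal sketch of the idea (my own toy version with hypothetical argument names, assuming rois in image pixels, image size given as floats, and a single image in the batch):

import tensorflow as tf

def roi_pooling_sketch(feature_map, rois, img_h, img_w, crop_size=14):
    # feature_map: (1, H, W, C); rois: (N, 4) as (x1, y1, x2, y2) in image pixels.
    # crop_and_resize wants normalized (y1, x1, y2, x2) boxes.
    x1, y1, x2, y2 = tf.unstack(rois, axis=1)
    boxes = tf.stack([y1 / img_h, x1 / img_w, y2 / img_h, x2 / img_w], axis=1)
    n = tf.shape(rois)[0]
    return tf.image.crop_and_resize(feature_map, boxes,
                                    box_ind=tf.zeros([n], dtype=tf.int32),  # every roi from image 0
                                    crop_size=[crop_size, crop_size])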
Next is the last part of build_whole_detection_network: the loss.
loss_dict = self.build_loss(rpn_box_pred=fpn_box_pred,
                            rpn_bbox_targets=fpn_bbox_targets,
                            rpn_cls_score=fpn_cls_score,
                            rpn_labels=fpn_labels,
                            bbox_pred=bbox_pred,
                            bbox_targets=bbox_targets,
                            cls_score=cls_score,
                            labels=labels)

final_bbox, final_scores, final_category = self.postprocess_fastrcnn(rois=rois,
                                                                     bbox_ppred=bbox_pred,
                                                                     scores=cls_prob,
                                                                     img_shape=img_shape)
return final_bbox, final_scores, final_category, loss_dict
I'll cover these in a later post; this one has gone through a lot already.