Faster R-CNN Code Walkthrough (Part 2): build_whole_network.py

Last time we left off at this call in train.py:

final_bbox, final_scores, final_category, loss_dict = faster_rcnn.build_whole_detection_network(
            input_img_batch=img_batch,
            gtboxes_batch=gtboxes_and_label)

Now let's look at the build_whole_detection_network() function, which lives in build_whole_network.py; in PyCharm, Ctrl+B on the call jumps straight to it. Let's take it slowly:

if self.is_training:
    # make sure the shape is [-1, 5]; -1 lets TF infer the number of rows
    # tf.cast converts the data to tf.float32
    gtboxes_batch = tf.reshape(gtboxes_batch, [-1, 5])
    gtboxes_batch = tf.cast(gtboxes_batch, tf.float32)
# image shape: (1, H, W, C) -- this is the shape tensor, not the actual pixel data
img_shape = tf.shape(input_img_batch)

########################################

1. Build the base network

P_list = self.build_base_network(input_img_batch)  # [P2 feature map, P3 feature map, P4, P5, P6]

More concretely, look at:

def build_base_network(self, input_img_batch):
    # string.startswith(X) checks whether string begins with X and returns a bool
    if self.base_network_name.startswith('resnet_v1'):
        # builds the resnet backbone, applies the FPN idea to get P2~P6,
        # and returns them as a list of conv feature maps
        return resnet.resnet_base(input_img_batch, scope_name=self.base_network_name, is_training=self.is_training)
    elif self.base_network_name.startswith('MobilenetV2'):
        return mobilenet_v2.mobilenetv2_base(input_img_batch, is_training=self.is_training)
    else:
        raise ValueError('Sry, we only support resnet or mobilenet_v2')

The key line is:

return resnet.resnet_base(input_img_batch, scope_name=self.base_network_name, is_training=self.is_training)

Jump into resnet.py and break it down:

def resnet_base(img_batch, scope_name, is_training=True):
    '''
    this code is derived from light-head rcnn.
    https://github.com/zengarden/light_head_rcnn

    It is convenient to freeze blocks. So we adapt this mode.
    '''
    if scope_name == 'resnet_v1_50':
        middle_num_units = 6
    elif scope_name == 'resnet_v1_101':
        middle_num_units = 23
    else:
        raise NotImplementedError('We only support resnet_v1_50 or resnet_v1_101. Check your network name....yjr')

This picks the configuration for each supported depth: resnet_v1_50 uses 6 units in its third block, resnet_v1_101 uses 23.

blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
          resnet_v1_block('block2', base_depth=128, num_units=4, stride=2),
          resnet_v1_block('block3', base_depth=256, num_units=middle_num_units, stride=2),
          resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)]
# when use fpn . stride list is [1, 2, 2]

I haven't fully untangled this part (it's a bit winding), but roughly it declares the four residual blocks that make up the resnet_v1_50-style backbone.

with slim.arg_scope(resnet_arg_scope(is_training=False)):
    with tf.variable_scope(scope_name, scope_name):
        # Do the first few layers manually, because 'SAME' padding can behave inconsistently
        # for images of different sizes: sometimes 0, sometimes 1
        # the initial 7x7 convolution
        net = resnet_utils.conv2d_same(
            img_batch, 64, 7, stride=2, scope='conv1')
        # tf.pad zero-pads; [0, 0] means "leave this dimension alone"
        net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]])
        net = slim.max_pool2d(
            net, [3, 3], stride=2, padding='VALID', scope='pool1')

The image is convolved to 64 channels at stride 2, zero-padded, then max-pooled at stride 2, leaving the features at 1/4 resolution. If tf.pad is unfamiliar, run the snippet below to see what it produces:

import tensorflow as tf

t = tf.constant([[[1, 2], [2, 3], [2, 1]],
                 [[1, 2], [2, 3], [2, 1]]])
print(t)
a = tf.pad(t, [[1, 1], [2, 0], [1, 0]])
with tf.Session() as sess:
    print("----------------------------------")
    a = sess.run(a)
    print(a)
    print(a.shape)

With that padding spec, the (2, 3, 2) tensor becomes (4, 5, 3): one zero slice before and after axis 0, two zero rows before axis 1, and one zero column before axis 2. Next:

# not_freezed decides, block by block, whether the feature extractor keeps training
not_freezed = [False] * cfgs.FIXED_BLOCKS + (4 - cfgs.FIXED_BLOCKS) * [True]

With cfgs.FIXED_BLOCKS = 0 this evaluates to [True, True, True, True], i.e. no block is frozen.

with slim.arg_scope(resnet_arg_scope(is_training=(is_training and not_freezed[0]))):
    C2, end_points_C2 = resnet_v1.resnet_v1(net,
                                            blocks[0:1],
                                            global_pool=False,
                                            include_root_block=False,
                                            scope=scope_name)

Analogous calls run the remaining blocks, producing the C2~C5 feature maps.

feature_dict = {'C2': end_points_C2['{}/block1/unit_2/bottleneck_v1'.format(scope_name)],
                'C3': end_points_C3['{}/block2/unit_3/bottleneck_v1'.format(scope_name)],
                'C4': end_points_C4['{}/block3/unit_{}/bottleneck_v1'.format(scope_name, middle_num_units - 1)],
                'C5': end_points_C5['{}/block4/unit_3/bottleneck_v1'.format(scope_name)],
                # 'C5': end_points_C5['{}/block4'.format(scope_name)],
                }

Each level is gathered into a dict.

Next comes building the feature pyramid on top of these:

pyramid_dict = {}  # start from an empty dict
with tf.variable_scope('build_pyramid'):
    with slim.arg_scope([slim.conv2d], weights_regularizer=slim.l2_regularizer(cfgs.WEIGHT_DECAY),
                        activation_fn=None, normalizer_fn=None):

        P5 = slim.conv2d(C5,
                         num_outputs=256,
                         kernel_size=[1, 1],
                         stride=1, scope='build_P5')
        if "P6" in cfgs.LEVLES:
            P6 = slim.max_pool2d(P5, kernel_size=[1, 1], stride=2, scope='build_P6')
            pyramid_dict['P6'] = P6

        pyramid_dict['P5'] = P5

This builds the top of the pyramid: P5 via a 1x1 conv on C5, and P6 by max-pooling P5.

for level in range(4, 1, -1):  # build [P4, P3, P2]
    # the [] holds the key; the right-hand side builds the value
    pyramid_dict['P%d' % level] = fusion_two_layer(C_i=feature_dict["C%d" % level],
                                                   P_j=pyramid_dict["P%d" % (level + 1)],
                                                   scope='build_P%d' % level)

This is FPN's lateral connection: the coarser pyramid level is upsampled 2x by bilinear interpolation and fused pixel-wise with the same-level C_x at 0.5/0.5 weights. That produces the remaining three levels P4, P3, and P2.
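Based purely on that description, a minimal sketch of what fusion_two_layer does might look like this (not the repo's exact code; the scope names are mine):

import tensorflow as tf
import tensorflow.contrib.slim as slim

def fusion_two_layer(C_i, P_j, scope):
    # C_i: [1, h, w, c] backbone feature; P_j: [1, h/2, w/2, 256] coarser pyramid level
    with tf.variable_scope(scope):
        h, w = tf.shape(C_i)[1], tf.shape(C_i)[2]
        # upsample the coarser level to C_i's spatial size
        upsample_p = tf.image.resize_bilinear(P_j, size=[h, w], name='up_sample')
        # 1x1 conv so C_i also carries 256 channels
        reduce_dim_c = slim.conv2d(C_i, num_outputs=256, kernel_size=[1, 1],
                                   stride=1, scope='reduce_dim')
        # pixel-wise fusion at 0.5/0.5 weights, as described above
        return 0.5 * upsample_p + 0.5 * reduce_dim_c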

for level in range(4, 1, -1):
    pyramid_dict['P%d' % level] = slim.conv2d(
        pyramid_dict['P%d' % level],
        num_outputs=256, kernel_size=[3, 3],
        padding="SAME", stride=1, scope="fuse_P%d" % level)

Each fused level then passes through a 3x3 conv (stride 1, 256 output channels); this unifies the outputs across levels and smooths the aliasing introduced by upsampling.

return [pyramid_dict[level_name] for level_name in cfgs.LEVLES]

So resnet_base in resnet.py ultimately returns the list [P2, P3, P4, P5, P6], where each Px is one feature map.

Back in build_whole_network.py:

P_list = self.build_base_network(input_img_batch)

This call simply gives us P_list, a list whose elements are the pyramid feature maps.
#############################

2. Build the RPN

with tf.variable_scope('build_rpn',
                       regularizer=slim.l2_regularizer(cfgs.WEIGHT_DECAY)):

    fpn_cls_score = []
    fpn_box_pred = []
    for level_name, p in zip(cfgs.LEVLES, P_list):  # pair each level name with its feature map
        if cfgs.SHARE_HEADS:
            # share one head across levels: create variables on the first level, reuse afterwards
            reuse_flag = None if level_name == cfgs.LEVLES[0] else True
            scope_list = ['rpn_conv/3x3', 'rpn_cls_score', 'rpn_bbox_pred']
        else:
            reuse_flag = None
            scope_list = ['rpn_conv/3x3_%s' % level_name, 'rpn_cls_score_%s' % level_name,
                          'rpn_bbox_pred_%s' % level_name]

The code below is the heart of the RPN: each feature map first goes through a 3x3 conv, then two sibling 1x1 convs predict the box offsets and separate foreground from background.

# the shared 3x3 conv
rpn_conv3x3 = slim.conv2d(
    p, 512, [3, 3],
    trainable=self.is_training, weights_initializer=cfgs.INITIALIZER, padding="SAME",
    activation_fn=tf.nn.relu,
    scope=scope_list[0],
    reuse=reuse_flag)
# foreground/background score for every anchor
rpn_cls_score = slim.conv2d(rpn_conv3x3, self.num_anchors_per_location*2, [1, 1], stride=1,
                            trainable=self.is_training, weights_initializer=cfgs.INITIALIZER,
                            activation_fn=None, padding="VALID",
                            scope=scope_list[1],
                            reuse=reuse_flag)
# box offsets
rpn_box_pred = slim.conv2d(rpn_conv3x3, self.num_anchors_per_location*4, [1, 1], stride=1,
                           trainable=self.is_training, weights_initializer=cfgs.BBOX_INITIALIZER,
                           activation_fn=None, padding="VALID",
                           scope=scope_list[2],
                           reuse=reuse_flag)
# flatten to one row per anchor: scores as (bg, fg), offsets as (dx, dy, dw, dh)
rpn_box_pred = tf.reshape(rpn_box_pred, [-1, 4])
rpn_cls_score = tf.reshape(rpn_cls_score, [-1, 2])

# over the for loop, fpn_cls_score / fpn_box_pred each accumulate one entry
# per level (5~6 tensors), of shape [-1, 2] and [-1, 4] respectively
fpn_cls_score.append(rpn_cls_score)
fpn_box_pred.append(rpn_box_pred)
fpn_cls_score = tf.concat(fpn_cls_score, axis=0, name='fpn_cls_score')  # shape [-1, 2]
fpn_box_pred = tf.concat(fpn_box_pred, axis=0, name='fpn_box_pred')    # shape [-1, 4]
fpn_cls_prob = slim.softmax(fpn_cls_score, scope='fpn_cls_prob')

To summarize: the above performs the RPN's coarse classification and box regression, ending with fpn_cls_prob of shape (-1, 2) and fpn_box_pred of shape (-1, 4).
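A quick shape sanity check: suppose the input is 800x800 and each location has 3 anchors (a made-up config; the real count is len(cfgs.ANCHOR_SCALES) * len(cfgs.ANCHOR_RATIOS)). P2 has stride 4, so its map is 200x200 and contributes 200 x 200 x 3 = 120,000 rows; P6 at stride 64 contributes only about 13 x 13 x 3 ≈ 500. Concatenating all levels gives one long [-1, 2] score tensor and one [-1, 4] offset tensor.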
#######################################################

3. Generate anchors

all_anchors = []
for i in range(len(cfgs.LEVLES)):
    level_name, p = cfgs.LEVLES[i], P_list[i]
    # p_h / p_w are the (scalar) height and width of this level's feature map
    p_h, p_w = tf.shape(p)[1], tf.shape(p)[2]
    featuremap_height = tf.cast(p_h, tf.float32)
    featuremap_width = tf.cast(p_w, tf.float32)
    anchors = anchor_utils.make_anchors(base_anchor_size=cfgs.BASE_ANCHOR_SIZE_LIST[i],
                                        anchor_scales=cfgs.ANCHOR_SCALES,
                                        anchor_ratios=cfgs.ANCHOR_RATIOS,
                                        featuremap_height=featuremap_height,
                                        featuremap_width=featuremap_width,
                                        stride=cfgs.ANCHOR_STRIDE_LIST[i],
                                        name="make_anchors_for%s" % level_name)

anchor_utils.make_anchors is defined in anchor_utils.py:

def make_anchors(base_anchor_size, anchor_scales, anchor_ratios,
                 featuremap_height, featuremap_width,
                 stride, name='make_anchors'):
    with tf.variable_scope(name):
        base_anchor = tf.constant([0, 0, base_anchor_size, base_anchor_size], tf.float32)  # [x_center, y_center, w, h]
        # enumerate every (w, h) combination of scales and ratios, as flat lists
        ws, hs = enum_ratios(enum_scales(base_anchor, anchor_scales),
                             anchor_ratios)  # per location ws and hs
        # !!!!!!!!!!!!!!!!!!!!!!!!!!!!
        # each feature-map cell maps back to an anchor center on the original image
        x_centers = tf.range(featuremap_width, dtype=tf.float32) * stride
        y_centers = tf.range(featuremap_height, dtype=tf.float32) * stride

        if cfgs.USE_CENTER_OFFSET:
            x_centers = x_centers + stride/2.
            y_centers = y_centers + stride/2.

        # this tripped me up at first: meshgrid expands the 1-D coordinate vectors
        # into every (x, y) combination, then pairs every center with every (w, h)
        x_centers, y_centers = tf.meshgrid(x_centers, y_centers)
        ws, x_centers = tf.meshgrid(ws, x_centers)
        hs, y_centers = tf.meshgrid(hs, y_centers)
        # stack into (x, y) coordinate pairs
        anchor_centers = tf.stack([x_centers, y_centers], 2)
        # flatten to [-1, 2]
        anchor_centers = tf.reshape(anchor_centers, [-1, 2])

        box_sizes = tf.stack([ws, hs], axis=2)
        box_sizes = tf.reshape(box_sizes, [-1, 2])
        print(box_sizes)
        # and finally the (x1, y1, x2, y2) coordinates of every anchor
        anchors = tf.concat([anchor_centers - 0.5*box_sizes,
                             anchor_centers + 0.5*box_sizes], axis=1)
        return anchors
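To demystify the meshgrid steps, here's a tiny NumPy re-enactment (a toy 2x2 feature map, stride 16, and two made-up anchor shapes; none of these numbers come from the repo):

import numpy as np

stride = 16
ws = np.array([32., 64.])          # two hypothetical anchor widths
hs = np.array([64., 32.])          # the matching heights
x_centers = np.arange(2) * stride  # [0, 16]
y_centers = np.arange(2) * stride

# every (x, y) combination: a 2x2 grid of centers
x_centers, y_centers = np.meshgrid(x_centers, y_centers)
# pair every anchor shape with every center: shapes become (4, 2)
ws, x_centers = np.meshgrid(ws, x_centers.ravel())
hs, y_centers = np.meshgrid(hs, y_centers.ravel())

centers = np.stack([x_centers, y_centers], axis=2).reshape(-1, 2)  # (8, 2)
sizes = np.stack([ws, hs], axis=2).reshape(-1, 2)                  # (8, 2)
anchors = np.hstack([centers - 0.5 * sizes, centers + 0.5 * sizes])
print(anchors.shape)  # (8, 4): 2x2 locations x 2 shapes per location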

Back in build_whole_network.py: the function above gives us, for every feature point of every level, the coordinates of its anchor boxes on the original image.

all_anchors.append(anchors)
all_anchors = tf.concat(all_anchors, axis=0, name='all_anchors_of_FPN')

Same pattern as fpn_cls_prob earlier: collect the per-level tensors, then concatenate them into one row-per-anchor tensor. You can think of this as a dense sampling of boxes over the original image.
#################################################

4. Postprocess the RPN proposals: decode, clip, NMS

rois, roi_scores = postprocess_rpn_proposals(rpn_bbox_pred=fpn_box_pred,
                                             rpn_cls_prob=fpn_cls_prob,
                                             img_shape=img_shape,
                                             anchors=all_anchors,
                                             is_training=self.is_training)

The inputs to postprocess_rpn_proposals are the RPN's predicted offsets, its predicted class probabilities (foreground/background), the image shape, and the anchor boxes.

postprocess_rpn_proposals is defined in proposal_opr.py. The rough idea:

  • take the 12000 anchors with the highest foreground probability
  • apply the regressed offsets to correct those 12000 anchors' positions, yielding RoIs
  • run non-maximum suppression (NMS) and keep the 2000 highest-scoring RoIs
# decode: apply the RPN's predicted offsets to the anchor boxes to get proposals
decode_boxes = encode_and_decode.decode_boxes(encoded_boxes=rpn_bbox_pred,
                                              reference_boxes=anchors,
                                              scale_factors=cfgs.ANCHOR_SCALE_FACTORS)
# clip: some boxes stick out past the image boundary, so constrain them to it
decode_boxes = boxes_utils.clip_boxes_to_img_boundaries(decode_boxes=decode_boxes,
                                                        img_shape=img_shape)
if pre_nms_topN > 0:
    # keep at most pre_nms_topN (12000) boxes, or fewer if there aren't enough
    pre_nms_topN = tf.minimum(pre_nms_topN, tf.shape(decode_boxes)[0], name='avoid_unenough_boxes')
    # cls_prob holds the foreground scores; tf.nn.top_k returns the
    # pre_nms_topN highest scores together with their indices
    cls_prob, top_k_indices = tf.nn.top_k(cls_prob, k=pre_nms_topN)
    # tf.gather picks out the corresponding boxes, now sorted by score
    decode_boxes = tf.gather(decode_boxes, top_k_indices)

# 4. NMS: returns a 1-D integer tensor of shape [M] (M <= max_output_size)
# holding the indices of the boxes selected from the decode_boxes tensor
keep = tf.image.non_max_suppression(
    boxes=decode_boxes,
    scores=cls_prob,
    max_output_size=post_nms_topN,
    iou_threshold=nms_thresh)
# gather the final boxes
final_boxes = tf.gather(decode_boxes, keep)
final_probs = tf.gather(cls_prob, keep)
# return the filtered boxes and their probabilities
return final_boxes, final_probs

That's proposal_opr.py in a nutshell: regress the anchors with the RPN outputs, clip, and apply NMS. At this point we essentially have the regions of interest; the first round of screening is done!
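For the record, the decode step follows the standard Faster R-CNN box parameterization; a NumPy sketch of the idea (ignoring the cfgs.ANCHOR_SCALE_FACTORS option) looks like this:

import numpy as np

def decode_boxes_np(deltas, anchors):
    """deltas: (N, 4) predicted (tx, ty, tw, th); anchors: (N, 4) as (x1, y1, x2, y2)."""
    wa = anchors[:, 2] - anchors[:, 0]
    ha = anchors[:, 3] - anchors[:, 1]
    xa = anchors[:, 0] + 0.5 * wa
    ya = anchors[:, 1] + 0.5 * ha
    # shift the anchor center and rescale its width/height
    x = deltas[:, 0] * wa + xa
    y = deltas[:, 1] * ha + ya
    w = wa * np.exp(deltas[:, 2])
    h = ha * np.exp(deltas[:, 3])
    return np.stack([x - 0.5 * w, y - 0.5 * h, x + 0.5 * w, y + 0.5 * h], axis=1)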

Back in build_whole_network.py:

if self.is_training:
    with tf.variable_scope('sample_anchors_minibatch'):
        # tf.py_func(func, inputs of func, output dtypes of func)
        # this picks the 256-anchor minibatch and returns the sampled RPN
        # labels fpn_labels and regression targets fpn_bbox_targets
        fpn_labels, fpn_bbox_targets = \
            tf.py_func(
                anchor_target_layer,
                [gtboxes_batch, img_shape, all_anchors],
                [tf.float32, tf.float32])
        fpn_bbox_targets = tf.reshape(fpn_bbox_targets, [-1, 4])
        fpn_labels = tf.to_int32(fpn_labels, name="to_int32")
        fpn_labels = tf.reshape(fpn_labels, [-1])
        # draw the positive/negative samples onto the summary image
        self.add_anchor_img_smry(input_img_batch, all_anchors, fpn_labels)

tf.py_func() wraps an ordinary Python function as a TensorFlow op: anchor_target_layer is the function, [gtboxes_batch, img_shape, all_anchors] is the list of its inputs, and [tf.float32, tf.float32] gives the dtypes of its outputs.
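If tf.py_func is unfamiliar, here's a minimal self-contained TF 1.x example (the function and values are mine):

import numpy as np
import tensorflow as tf

def double_it(x):          # a plain Python/NumPy function
    return (x * 2).astype(np.float32)

inp = tf.constant([1., 2., 3.], dtype=tf.float32)
out = tf.py_func(double_it, [inp], [tf.float32])  # inputs list, output dtypes list

with tf.Session() as sess:
    print(sess.run(out))   # [array([2., 4., 6.], dtype=float32)]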

anchor_target_layer is defined in anchor_target_layer_without_boxweight.py.
I strongly recommend this post: https://blog.csdn.net/u014380165/article/details/80379812

I'm borrowing some of its annotations below (don't flame me =-=); the author explains everything in real detail.

def anchor_target_layer(gt_boxes, img_shape, all_anchors, is_restrict_bg=False):
    total_anchors = all_anchors.shape[0]  # total number of anchors
    img_h, img_w = img_shape[1], img_shape[2]  # image height and width
    gt_boxes = gt_boxes[:, :-1]  # drop the label column, keeping (x1, y1, x2, y2)

    _allowed_border = 0
    # inds_inside holds the indices of anchors whose 4 corners all lie inside the image
    if cfgs.IS_FILTER_OUTSIDE_BOXES:
        inds_inside = np.where(
            (all_anchors[:, 0] >= -_allowed_border) &
            (all_anchors[:, 1] >= -_allowed_border) &
            (all_anchors[:, 2] < img_w + _allowed_border) &  # width
            (all_anchors[:, 3] < img_h + _allowed_border)  # height
        )[0]
    else:
        inds_inside = range(all_anchors.shape[0])  # just take every index in order

    # drop anchors that are not fully inside the image
    anchors = all_anchors[inds_inside, :]
    # initialize all labels to -1 ("don't care")
    labels = np.empty((len(inds_inside),), dtype=np.float32)
    labels.fill(-1)

Now the key part (this is where that post saved me):

# overlaps is a matrix: rows are anchors, columns are gt boxes, entries are IoU
overlaps = bbox_overlaps(
    np.ascontiguousarray(anchors, dtype=np.float),
    np.ascontiguousarray(gt_boxes, dtype=np.float))
# for each anchor (row), the column index of the gt box with the highest IoU, shape (n,)
argmax_overlaps = overlaps.argmax(axis=1)
# max_overlaps holds the actual IoU values
max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]

# for each column (one per ground-truth box), the row index of the anchor
# that overlaps that gt the most
gt_argmax_overlaps = overlaps.argmax(axis=0)
# the IoU of that best anchor for each gt
gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
# fish out every anchor that attains one of those per-gt maximum IoUs
gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

if not cfgs.TRAIN_RPN_CLOOBER_POSITIVES:
    # background: an anchor whose best IoU against every gt box is below 0.3
    labels[max_overlaps < cfgs.RPN_IOU_NEGATIVE_THRESHOLD] = 0

# two ways to be foreground: (1) the anchor with the highest IoU for some gt box,
# (2) any anchor whose IoU with some gt box is >= 0.7
labels[gt_argmax_overlaps] = 1
labels[max_overlaps >= cfgs.RPN_IOU_POSITIVE_THRESHOLD] = 1

if cfgs.TRAIN_RPN_CLOOBER_POSITIVES:
    # background assigned last, so it can overwrite ("clobber") positives
    labels[max_overlaps < cfgs.RPN_IOU_NEGATIVE_THRESHOLD] = 0
'''
Example overlaps matrix (rows = anchors, columns = gt boxes):
0.1  0.2  0.1  0.1
0.2  0.1  0.5  0.4
0.7  0.1  0.6  0.6
Anchor 0 is the best match for the second gt column (IoU 0.2), so it gets
labeled positive, yet its max IoU is < 0.3; depending on which assignment
runs last, that same anchor could end up positive or negative.
'''

# minibatch subsampling
# number of foreground samples = 256 * 0.5 = 128
num_fg = int(cfgs.RPN_MINIBATCH_SIZE * cfgs.RPN_POSITIVE_RATE)
# indices of the positive samples
fg_inds = np.where(labels == 1)[0]
# if there are more than 128 positives, randomly demote the surplus to -1 (ignored)
if len(fg_inds) > num_fg:
    disable_inds = npr.choice(
        fg_inds, size=(len(fg_inds) - num_fg), replace=False)
    labels[disable_inds] = -1

# number of background samples = 256 - (number of positives): 128 when the
# positives are capped at 128, more when there are fewer positives
# if there are too many negatives, likewise randomly demote the surplus to -1
num_bg = cfgs.RPN_MINIBATCH_SIZE - np.sum(labels == 1)
if is_restrict_bg:
    num_bg = max(num_bg, num_fg * 1.5)
bg_inds = np.where(labels == 0)[0]
if len(bg_inds) > num_bg:
    disable_inds = npr.choice(
        bg_inds, size=(len(bg_inds) - num_bg), replace=False)
    labels[disable_inds] = -1
# every anchor now has a sample label (still just foreground/background,
# not real per-class labels)

# _compute_targets computes, for each anchor, the regression factors
# (t_xcenter, t_ycenter, t_w, t_h) towards the gt box it overlaps most
bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

# everything so far operated only on the anchors inside the image and
# ignored those crossing the boundary; the final output must cover ALL
# anchors, so _unmap merges the processed values back into the full set
labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)

rpn_labels = labels.reshape((-1, 1))  # labels for every anchor
bbox_targets = bbox_targets.reshape((-1, 4))
rpn_bbox_targets = bbox_targets
# rpn_bbox_targets now covers every anchor, fully processed

# return the labels and the regression targets (the t factors)
return rpn_labels, rpn_bbox_targets

So what did all that do? Briefly:

  • filter out anchors that stick outside the (resized) image, and count the rest
  • compute the overlaps (IoU) matrix between the surviving anchors and the gt_boxes
  • assign positive / negative / don't-care labels based on the IoUs
  • subsample to balance the minibatch
  • compute the scale/shift factors between each anchor and its highest-IoU gt box (see the sketch right after this list)
  • unmap the subsampled labels and factors back onto the full anchor set
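Here's the sketch promised above: _compute_targets boils down to the standard Faster R-CNN encoding, the exact inverse of the decode shown earlier (again ignoring any scale factors):

import numpy as np

def encode_boxes_np(anchors, gt):
    """Both (N, 4) as (x1, y1, x2, y2); returns (tx, ty, tw, th) per anchor."""
    wa = anchors[:, 2] - anchors[:, 0]
    ha = anchors[:, 3] - anchors[:, 1]
    xa = anchors[:, 0] + 0.5 * wa
    ya = anchors[:, 1] + 0.5 * ha
    wg = gt[:, 2] - gt[:, 0]
    hg = gt[:, 3] - gt[:, 1]
    xg = gt[:, 0] + 0.5 * wg
    yg = gt[:, 1] + 0.5 * hg
    # how far to shift and rescale each anchor so it matches its gt box
    return np.stack([(xg - xa) / wa, (yg - ya) / ha,
                     np.log(wg / wa), np.log(hg / ha)], axis=1)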

Back in build_whole_network.py.
Everything so far served the RPN and never touched the real class labels; the R-CNN part below produces the class-aware rois, targets, and so on.
So fpn_labels, fpn_bbox_targets = rpn_labels, rpn_bbox_targets.
Then there's some summary drawing and accuracy computation, which is straightforward, so I'll skip it.
Next comes this code, at the same nesting level as the with above:

with tf.control_dependencies([fpn_labels]):
    with tf.variable_scope('sample_RCNN_minibatch'):
        rois, labels, bbox_targets = \
            tf.py_func(proposal_target_layer,
                       [rois, gtboxes_batch],
                       [tf.float32, tf.float32, tf.float32])
        rois = tf.reshape(rois, [-1, 4])
        labels = tf.to_int32(labels)
        labels = tf.reshape(labels, [-1])
        # regression targets, shape (-1, 4*21): per-class box coordinates for each roi
        bbox_targets = tf.reshape(bbox_targets, [-1, 4*(cfgs.CLASS_NUM+1)])
        self.add_roi_batch_img_smry(input_img_batch, rois, labels)

proposal_target_layer is in proposal_target_layer.py. Careful here =-=: the "anchors" inside this function are actually the rois. I misread them as all_anchors and wasted a whole morning chasing references.

def proposal_target_layer(rpn_rois, gt_boxes):
    """
    Assign object detection proposals to ground-truth targets. Produces proposal
    classification labels and bounding-box regression targets.
    """
    # Proposal ROIs (x1, y1, x2, y2) coming from RPN
    # gt_boxes (x1, y1, x2, y2, label)

    if cfgs.ADD_GTBOXES_TO_TRAIN:
        # gt_boxes[:, :-1] drops the label column;
        # np.vstack stacks the gt boxes under the rpn rois, row by row
        all_rois = np.vstack((rpn_rois, gt_boxes[:, :-1]))
    else:
        all_rois = rpn_rois

    # if cfgs.FAST_RCNN_MINIBATCH_SIZE == -1 then rois_per_image = np.inf,
    # otherwise rois_per_image = cfgs.FAST_RCNN_MINIBATCH_SIZE = 256;
    # i.e. this caps the number of rois per image
    rois_per_image = np.inf if cfgs.FAST_RCNN_MINIBATCH_SIZE == -1 else cfgs.FAST_RCNN_MINIBATCH_SIZE
    # fg_rois_per_image = 256 * 0.25 = 64 foreground rois per image
    fg_rois_per_image = np.round(cfgs.FAST_RCNN_POSITIVE_RATE * rois_per_image)

    # _sample_rois picks the rois used for classification training and produces
    # their class and coordinate ground truth (plus what the box loss needs,
    # e.g. the bbox_inside_weights of the original Fast R-CNN code).
    # This is a second round of screening, and the labels are now real classes,
    # not just foreground/background.
    # Note bbox_targets has shape (-1, num_classes[incl. background] * 4);
    # within each row only 4 of the values are non-zero, the rest are 0
    labels, rois, bbox_targets = _sample_rois(all_rois, gt_boxes, fg_rois_per_image,
                                              rois_per_image, cfgs.CLASS_NUM+1)

    rois = rois.reshape(-1, 4)
    labels = labels.reshape(-1)
    bbox_targets = bbox_targets.reshape(-1, (cfgs.CLASS_NUM+1) * 4)

    return rois, labels, bbox_targets
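That (CLASS_NUM+1)*4 layout is worth a picture: each roi fills only the 4 slots belonging to its own class, and everything else stays zero. A toy sketch (the helper name is mine, not the repo's):

import numpy as np

def expand_targets(compact_targets, labels, num_classes):
    """compact_targets: (N, 4); labels: (N,) class ids. Returns (N, 4*num_classes)."""
    expanded = np.zeros((len(labels), 4 * num_classes), dtype=np.float32)
    for i, cls in enumerate(labels):
        if cls > 0:  # background rois (class 0) keep all-zero targets
            expanded[i, 4 * cls:4 * cls + 4] = compact_targets[i]
    return expanded

# e.g. a roi of class 3 with 21 classes fills columns 12..15 only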

In the end it returns the rois (screened twice now), along with labels and bbox_targets (the per-class box coordinates).
Back to build_whole_network.py.

I'll re-read that part myself =-=; it didn't go down all that smoothly.

if self.is_training:
    rois_list, labels, bbox_targets = self.assign_levels(all_rois=rois,
                                                         labels=labels,
                                                         bbox_targets=bbox_targets)
else:
    # split the rois across the pyramid levels (each Px gets its own rois)
    rois_list = self.assign_levels(all_rois=rois)  # rois_list: [P2_rois, P3_rois, P4_rois, P5_rois]
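assign_levels distributes the rois over pyramid levels using the FPN paper's rule k = floor(k0 + log2(sqrt(w*h)/224)) with k0 = 4, clipped to [P2, P5]; a NumPy sketch of that rule (the repo's exact rounding and clipping may differ):

import numpy as np

def assign_level(rois, min_level=2, max_level=5):
    """rois: (N, 4) as (x1, y1, x2, y2); returns the pyramid level per roi."""
    w = rois[:, 2] - rois[:, 0]
    h = rois[:, 3] - rois[:, 1]
    # FPN paper, Eq. (1): bigger rois are pooled from coarser levels
    k = np.floor(4 + np.log2(np.sqrt(w * h) / 224.0))
    return np.clip(k, min_level, max_level).astype(np.int32)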

##########################################
5. Build Fast R-CNN

bbox_pred, cls_score = self.build_fastrcnn(P_list=P_list, rois_list=rois_list,
                                           img_shape=img_shape)
cls_prob = slim.softmax(cls_score, 'cls_prob')  # softmax turns the scores into probabilities

# below is the body of build_fastrcnn
def build_fastrcnn(self, P_list, rois_list, img_shape):
    with tf.variable_scope('Fast-RCNN'):
        # 5. ROI Pooling
        with tf.variable_scope('rois_pooling'):
            pooled_features_list = []
            for level_name, p, rois in zip(cfgs.LEVLES, P_list, rois_list):  # exclude P6_rois
                # roi-pool each level's rois against its feature map.
                # roi_pooling calls tf.image.crop_and_resize with
                # box_ind=tf.zeros(shape=[N, ], dtype=tf.int32), meaning every roi
                # is cropped from image 0 of the batch (i.e. this image); each roi
                # is resized to 14x14 and run through a 3x3 conv, giving roi_features
                pooled_features = self.roi_pooling(feature_maps=p, rois=rois, img_shape=img_shape,
                                                   scope=level_name)
                # collect the pooled features from every level
                pooled_features_list.append(pooled_features)
            # concatenate the rois from all levels along axis 0
            pooled_features = tf.concat(pooled_features_list, axis=0)

        # 6. inference rois in Fast-RCNN to obtain fc_flatten features
        if self.base_network_name.startswith('resnet'):
            # run the resnet head to get the flattened fc features
            fc_flatten = resnet.restnet_head(inputs=pooled_features,
                                             is_training=self.is_training,
                                             scope_name=self.base_network_name)
        elif self.base_network_name.startswith('Mobile'):
            fc_flatten = mobilenet_v2.mobilenetv2_head(inputs=pooled_features,
                                                       is_training=self.is_training)
        else:
            raise NotImplementedError('only support resnet and mobilenet')

        # 7. cls and reg in Fast-RCNN
        with slim.arg_scope([slim.fully_connected], weights_regularizer=slim.l2_regularizer(cfgs.WEIGHT_DECAY)):
            # classification scores, shape (128, 20+1)
            cls_score = slim.fully_connected(fc_flatten,
                                             num_outputs=cfgs.CLASS_NUM+1,
                                             weights_initializer=cfgs.INITIALIZER,
                                             activation_fn=None, trainable=self.is_training,
                                             scope='cls_fc')
            # box regression, shape (128, 4*(20+1))
            bbox_pred = slim.fully_connected(fc_flatten,
                                             num_outputs=(cfgs.CLASS_NUM+1)*4,
                                             weights_initializer=cfgs.BBOX_INITIALIZER,
                                             activation_fn=None, trainable=self.is_training,
                                             scope='reg_fc')
            # for convenience, it also produces (cls_num + 1) boxes per roi

            cls_score = tf.reshape(cls_score, [-1, cfgs.CLASS_NUM+1])
            bbox_pred = tf.reshape(bbox_pred, [-1, 4*(cfgs.CLASS_NUM+1)])
            # return the box predictions and the class scores
            return bbox_pred, cls_score
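Since roi_pooling leans on tf.image.crop_and_resize, here's a minimal standalone TF 1.x example: boxes are normalized [y1, x1, y2, x2] and box_ind says which batch image each box is cropped from.

import tensorflow as tf

image = tf.random_normal([1, 64, 64, 256])          # one feature map
boxes = tf.constant([[0.0, 0.0, 0.5, 0.5],          # top-left quarter
                     [0.25, 0.25, 1.0, 1.0]])       # bottom-right 3/4
box_ind = tf.zeros(shape=[2], dtype=tf.int32)       # both crops come from image 0
crops = tf.image.crop_and_resize(image, boxes, box_ind, crop_size=[14, 14])

with tf.Session() as sess:
    print(sess.run(tf.shape(crops)))  # [2, 14, 14, 256]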

Next is the last part of build_whole_detection_network: the loss.

loss_dict = self.build_loss(rpn_box_pred=fpn_box_pred,
                            rpn_bbox_targets=fpn_bbox_targets,
                            rpn_cls_score=fpn_cls_score,
                            rpn_labels=fpn_labels,
                            bbox_pred=bbox_pred,
                            bbox_targets=bbox_targets,
                            cls_score=cls_score,
                            labels=labels)

final_bbox, final_scores, final_category = self.postprocess_fastrcnn(rois=rois,
                                                                     bbox_ppred=bbox_pred,
                                                                     scores=cls_prob,
                                                                     img_shape=img_shape)
return final_bbox, final_scores, final_category, loss_dict

I'll cover build_loss and postprocess_fastrcnn in a later post; this one has covered plenty already.
