FasterRCNN (Part 1): Analysis of the Network Code

I. Define an RCNN class as the foundation

1. self.build_network(self, sess, is_training=True): builds the overall network skeleton.

     with tf.variable_scope('RCNN'):

        initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01)
        initializer_bbox = tf.truncated_normal_initializer(mean=0.0, stddev=0.001)

        net=self.VGG(is_training)

        rpn_cls_prob,rpn_bbox_pred,rpn_cls_score,rpn_cls_score_reshape=self.RPN(net,is_training)

        rois=self.build_proposals(is_training,rpn_cls_prob,rpn_bbox_pred,rpn_cls_score)

        cls_score,cls_prob,bbox_pred=self.build_predictions(net,rois,is_training,initializer,initializer_bbox)

        return rois, cls_prob, bbox_pred

The raw feature-extraction step uses the classic VGG16 network.
The resulting feature map net is then passed into the RPN layer.
The RPN's outputs, the foreground/background classification of each proposal (cls) and the concrete position predictions, are passed to the function that generates the ROIs.
Finally, the ROIs are fed into the prediction head.

2. self.VGG(self, is_training): the image feature-extraction layers

     with tf.variable_scope("VGG"):
        with slim.arg_scope([slim.conv2d],stride=1,padding="SAME",activation_fn=tf.nn.relu):
            with slim.arg_scope([slim.max_pool2d],stride=2,padding="VALID"):
                conv1_1=slim.conv2d(self._image,64,[3,3],scope='conv1_1_3x3')
                conv1_2=slim.conv2d(conv1_1,64,[3,3],scope='conv1_2_3x3')
                self._vgg["conv1_2"]=conv1_2
                pool1=slim.max_pool2d(conv1_2,[2,2],scope='pool1_2x2')

                conv2_1=slim.conv2d(pool1,128,[3,3],scope='conv2_1_3x3')
                conv2_2=slim.conv2d(conv2_1,128,[3,3],scope='conv2_2_3x3')
                self._vgg["conv2_2"]=conv2_2
                pool2=slim.max_pool2d(conv2_2,[2,2],scope='pool2_2x2')

                conv3_1=slim.conv2d(pool2,256,[3,3],scope='conv3_1_3x3')
                conv3_2=slim.conv2d(conv3_1,256,[3,3],scope='conv3_2_3x3')
                conv3_3=slim.conv2d(conv3_2,256,[3,3],scope='conv3_3_3x3')
                self._vgg["conv3_3"]=conv3_3
                pool3=slim.max_pool2d(conv3_3,[2,2],scope='pool3_2x2')

                conv4_1=slim.conv2d(pool3,512,[3,3],scope='conv4_1_3x3')
                conv4_2=slim.conv2d(conv4_1,512,[3,3],scope='conv4_2_3x3')
                conv4_3=slim.conv2d(conv4_2,512,[3,3],scope='conv4_3_3x3')
                self._vgg["conv4_3"]=conv4_3
                pool4=slim.max_pool2d(conv4_3,[2,2],scope='pool4_2x2')

                conv5_1=slim.conv2d(pool4,512,[3,3],scope='conv5_1_3x3')
                conv5_2=slim.conv2d(conv5_1,512,[3,3],scope='conv5_2_3x3')
                conv5_3=slim.conv2d(conv5_2,512,[3,3],scope='conv5_3_3x3')
                self._vgg["conv5_3"]=conv5_3

                return conv5_3
                # returns a tensor with 512 channels and variable height/width

Every convolutional layer in the VGG network uses a 3x3 kernel with padding='SAME' and stride 1, and every pooling layer is 2x2 with stride 2 and padding='VALID'. The convolutions therefore leave height and width unchanged, while each pooling halves them, so the final feature map is 1/16 the input size. Since the convolutional layers repeat, slim.repeat() could be used directly, as sketched below.
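A minimal sketch (assuming the same slim arg_scope defaults as in the code above, not the author's code) of how slim.repeat would compress the repeated blocks:

    import tensorflow.contrib.slim as slim

    # slim.repeat applies the same layer N times, auto-numbering the scopes
    # (conv1/conv1_1, conv1/conv1_2, ...); the result is identical to the
    # hand-unrolled version above.
    net = slim.repeat(self._image, 2, slim.conv2d, 64, [3, 3], scope='conv1')
    net = slim.max_pool2d(net, [2, 2], scope='pool1')
    net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
    net = slim.max_pool2d(net, [2, 2], scope='pool2')
    net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
    net = slim.max_pool2d(net, [2, 2], scope='pool3')
    net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
    net = slim.max_pool2d(net, [2, 2], scope='pool4')
    net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')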

3. self.RPN(tensor, is_training)

The RPN layer replaces the selective search step used by Fast R-CNN, giving a large speedup.

    with tf.variable_scope("RPN"):
        rpn_conv1=slim.conv2d(tensor,512,[3,3],padding="SAME",stride=1,
            trainable=is_training,
            weights_initializer=tf.truncated_normal_initializer(0.01),
            activation_fn=tf.nn.relu,name="rpn_conv1")

        rpn_cls_score=slim.conv2d(rpn_conv1,self._num_anchors*2,[1,1],padding="VALID",stride=1,
            trainable=is_training,
            weights_initializer=tf.truncated_normal_initializer(0.01),
            activation_fn=tf.nn.sigmoid,name="rpn_cls")#Each anchor binary classification

        rpn_cls_score_reshape=self._reshape_layer(rpn_cls_score,2,'rpn_cls_score_reshape')
        rpn_cls_prob_reshape=self._softmax_layer(rpn_cls_score_reshape,"rpn_cls_prob_reshape")
        rpn_cls_prob=self._reshape_layer(rpn_cls_prob_reshape,self._num_anchors*2,"rpn_cls_prob")
        rpn_bbox_pred=slim.conv2d(rpn,self._num_anchors*4,[1,1],trainable=is_training,
            weights_initializer=initializer,padding='VALID',activation_fn=None,scope='rpn_bbox_pred')

        self._rpn["rpn_conv1"]=rpn_conv1
        self._rpn["rpn_cls_prob"]=rpn_cls_prob
        self._rpn["rpn_bbox_pred"]=rpn_bbox_pred
        self._rpn["rpn_cls_score"]=rpn_cls_score
        self._rpn["rpn_cls_score_reshape"]=rpn_cls_score_reshape

        return rpn_cls_prob,rpn_bbox_pred,rpn_cls_score,rpn_cls_score_reshape

The RPN first applies a 3x3 convolution to the VGG feature map (possibly for semantic/spatial mixing; I saw that suggested on Zhihu), then uses two 1x1 convolutions: one for binary classification (background vs. object) and one for position regression. The classification conv outputs 9x2 channels (9 anchors, two classes each; the raw scores are turned into probabilities by a softmax applied after reshaping), and the regression conv outputs 9x4 channels (9 anchors, 4 position parameters each, no activation). The RPN is a fully convolutional network, so it imposes no constraint on the input image size.
This code also uses self._reshape_layer(self, bottom, num_dim, name), which rearranges the tensor so that its channel dimension becomes num_dim. Because this is a two-way classification, the scores of all 9 anchors must be laid out so that the softmax, self._softmax_layer(self, bottom, name), is applied over exactly 2 channels. Both helper functions are listed below.

def _softmax_layer(self,bottom,name):
    if name=='rpn_cls_prob_reshape':
        input_shape=tf.shape(bottom)
        bottom_reshaped=tf.reshape(bottom,[-1,input_shape[-1]])
        reshaped_score=tf.nn.softmax(bottom_reshaped,name=name)

        return tf.reshape(reshaped_score,input_shape)

    return tf.nn.softmax(bottom,name=name)

def _reshape_layer(self,bottom,num_dim,name):
    input_shape=tf.shape(bottom)
    with tf.variable_scope(name):
        # change the channel to the caffe format
        to_caffe=tf.transpose(bottom,[0,3,1,2])
        # then force it to have channel 2
        reshaped=tf.reshape(to_caffe,tf.concat(axis=0,
            values=[[self._batch_size],[num_dim,-1],[input_shape[2]]]))
        # then swap the channel back
        to_tf=tf.transpose(reshaped,[0,2,3,1])
        return to_tf
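To make the reshape concrete, here is a NumPy sketch (hypothetical H and W) of the shape flow for a 9-anchor score map; _softmax_layer then flattens to [-1, 2], so softmax runs over each anchor's two fg/bg scores:

    import numpy as np

    H, W = 4, 5
    bottom = np.zeros((1, H, W, 18))            # [N, H, W, 9*2] RPN scores
    to_caffe = bottom.transpose(0, 3, 1, 2)     # [N, 18, H, W]  caffe order
    reshaped = to_caffe.reshape(1, 2, -1, W)    # [N, 2, 9*H, W] force 2 channels
    to_tf = reshaped.transpose(0, 2, 3, 1)      # [N, 9*H, W, 2] back to TF order
    print(to_tf.shape)                          # (1, 36, 5, 2)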

4. self.build_proposals(self, is_training, rpn_cls_prob, rpn_bbox_pred, rpn_cls_score)

This function returns the ROIs (regions of interest).

    if is_training:
        with tf.variable_scope("rois"):
            rois,roi_scores=tf.py_func(proposal_layer,
                [rpn_cls_prob,rpn_bbox_pred,self._im_info,self._mode,
                self._feat_stride,self._anchors,self._num_anchors],
                [tf.float32,tf.float32])
            rois.set_shape([None,5])
            roi_scores.set_shape([None,1])

        with tf.variable_scope("anchor"):
            rpn_labels,rpn_bbox_targets,rpn_bbox_inside_weights,rpn_bbox_outside_weights=tf.py_func(
                anchor_target_layer,
                [rpn_cls_score,self._gt_boxes,self._im_info,self._feat_stride,self._anchors,self._num_anchors],
                [tf.float32,tf.float32,tf.float32,tf.float32])

            rpn_labels.set_shape([1,1,None,None])
            rpn_bbox_targets.set_shape([1,None,None,self._num_anchors*4])
            rpn_bbox_inside_weights.set_shape([1,None,None,self._num_anchors*4])
            rpn_bbox_outside_weights.set_shape([1,None,None,self._num_anchors*4])

            rpn_labels=tf.to_int32(rpn_labels,name="to_int32")
            self._anchors_targets["rpn_labels"]=rpn_labels
            self._anchors_targets["rpn_bbox_targets"]=rpn_bbox_targets
            self._anchors_targets["rpn_bbox_inside_weights"]=rpn_bbox_inside_weights
            self._anchors_targets["rpn_bbox_outside_weights"]=rpn_bbox_outside_weights

        with tf.control_dependencies([rpn_labels]):
            with tf.variable_scope("rpn_rois"):
                rois,roi_scores,labels,bbox_targets,bbox_inside_weights,bbox_outside_weights=tf.py_func(
                    proposal_target_layer,
                    [rois,roi_scores,self._gt_boxes,self._num_classes],
                    [tf.float32,tf.float32,tf.float32,tf.float32,tf.float32,tf.float32])

                rois.set_shape([self._training_anchor_batch_size,5])
                roi_scores.set_shape([self._training_anchor_batch_size])
                labels.set_shape([self._training_anchor_batch_size, 1])
                bbox_targets.set_shape([self._training_anchor_batch_size,self._num_classes*4])
                bbox_inside_weights.set_shape([self._training_anchor_batch_size,self._num_classes*4])
                bbox_outside_weights.set_shape([self._training_anchor_batch_size,self._num_classes*4])

                self._proposal_targets['rois']=rois
                self._proposal_targets['labels']=tf.to_int32(labels, name="to_int32")
                self._proposal_targets['bbox_targets']=bbox_targets
                self._proposal_targets['bbox_inside_weights']=bbox_inside_weights
                self._proposal_targets['bbox_outside_weights']=bbox_outside_weights
    else:
        with tf.variable_scope("rois"):
            rois,rpn_scores=tf.py_func(proposal_top_layer,
                [rpn_cls_prob,rpn_bbox_pred,self._im_info,
                self._feat_stride,self._anchors,self._num_anchors],
                [tf.float32,tf.float32])
            rois.set_shape([cfg.FLAGS.rpn_top_n,5])
            rpn_scores.set_shape([cfg.FLAGS.rpn_top_n,1])

    return rois

First, during training, proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, mode, _feat_stride, anchors, num_anchors) is called. Its code follows.

if type(mode)==bytes:
    mode=mode.decode('utf-8')

if mode=="TRAIN":
    pre_nms_topN=cfg.FLAGS.rpn_train_pre_nms_top_n
    post_nms_topN=cfg.FLAGS.rpn_train_post_nms_top_n
    nms_thresh=cfg.FLAGS.rpn_train_nms_thresh
else:
    pre_nms_topN=cfg.FLAGS.rpn_test_pre_nms_top_n
    post_nms_topN=cfg.FLAGS.rpn_test_post_nms_top_n
    nms_thresh=cfg.FLAGS.rpn_test_nms_thresh

im_info=im_info[0]
# Get the scores and bounding boxes
scores=rpn_cls_prob[:,:,:,num_anchors:]
rpn_bbox_pred=rpn_bbox_pred.reshape((-1, 4))
scores=scores.reshape((-1, 1))
proposals=bbox_transform_inv(anchors,rpn_bbox_pred)
proposals=clip_boxes(proposals,im_info[:2])

# Pick the top region proposals
order=scores.ravel().argsort()[::-1]
if pre_nms_topN > 0:
    order=order[:pre_nms_topN]
proposals=proposals[order,:]
scores=scores[order]

# Non-maximal suppression
keep=nms(np.hstack((proposals, scores)),nms_thresh)

# Pick the top region proposals after NMS
if post_nms_topN > 0:
    keep=keep[:post_nms_topN]
proposals=proposals[keep, :]
scores=scores[keep]

# Only support single image as input
batch_inds=np.zeros((proposals.shape[0], 1),dtype=np.float32)
blob=np.hstack((batch_inds,proposals.astype(np.float32,copy=False)))

return blob,scores
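As an aside, here is a sketch of the cfg flags this function reads. The train-time values 12000/2000/0.7 are stated below; the test-time values are my assumption (the standard Faster R-CNN defaults), not taken from the repo:

    import tensorflow as tf

    # Assumed defaults (standard Faster R-CNN settings); cfg.FLAGS is
    # assumed to be built from tf.app.flags.
    tf.app.flags.DEFINE_integer('rpn_train_pre_nms_top_n', 12000, 'proposals kept before NMS (train)')
    tf.app.flags.DEFINE_integer('rpn_train_post_nms_top_n', 2000, 'proposals kept after NMS (train)')
    tf.app.flags.DEFINE_float('rpn_train_nms_thresh', 0.7, 'NMS IoU threshold (train)')
    tf.app.flags.DEFINE_integer('rpn_test_pre_nms_top_n', 6000, 'proposals kept before NMS (test)')
    tf.app.flags.DEFINE_integer('rpn_test_post_nms_top_n', 300, 'proposals kept after NMS (test)')
    tf.app.flags.DEFINE_float('rpn_test_nms_thresh', 0.7, 'NMS IoU threshold (test)')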

The actual arguments are explained later. Inside the function: cfg is a .py module holding constant configuration values (sketched above). pre_nms_topN and post_nms_topN are the numbers of anchors kept after the two rounds of filtering, 12000 and 2000 respectively during training, and nms_thresh is the NMS threshold, 0.7 (the training value). The function then uses bbox_transform_inv(boxes, deltas), which returns all predicted proposals. Its code follows.

if boxes.shape[0]==0:
    return np.zeros((0,deltas.shape[1]),dtype=deltas.dtype)

boxes=boxes.astype(deltas.dtype,copy=False)
widths=boxes[:,2]-boxes[:,0]+1.0
heights=boxes[:,3]-boxes[:,1]+1.0
ctr_x=boxes[:,0]+0.5*widths
ctr_y=boxes[:,1]+0.5*heights

dx=deltas[:,0::4]
dy=deltas[:,1::4]
dw=deltas[:,2::4]
dh=deltas[:,3::4]

pred_ctr_x=dx*widths[:,np.newaxis]+ctr_x[:,np.newaxis]
pred_ctr_y=dy*heights[:,np.newaxis]+ctr_y[:,np.newaxis]
pred_w=np.exp(dw)*widths[:,np.newaxis]
pred_h=np.exp(dh)*heights[:,np.newaxis]

pred_boxes=np.zeros(deltas.shape,dtype=deltas.dtype)
# x1
pred_boxes[:,0::4]=pred_ctr_x-0.5*pred_w
# y1
pred_boxes[:,1::4]=pred_ctr_y-0.5*pred_h
# x2
pred_boxes[:,2::4]=pred_ctr_x+0.5*pred_w
# y2
pred_boxes[:,3::4]=pred_ctr_y+0.5*pred_h

return pred_boxes

This mainly combines the anchors with the convolutional regression outputs to estimate each proposal's center and size. The clip_boxes(boxes, im_shape) function below then removes unrealistic coordinates by clamping every box to the image boundary. The score is the probability that a given proposal is text (foreground) rather than background. The code follows, with a small worked example after it.

 boxes[:,0::4]=np.maximum(np.minimum(boxes[:,0::4],im_shape[1]-1),0)
 boxes[:,1::4]=np.maximum(np.minimum(boxes[:,1::4],im_shape[0]-1),0)
 boxes[:,2::4]=np.maximum(np.minimum(boxes[:,2::4],im_shape[1]-1),0)
 boxes[:,3::4]=np.maximum(np.minimum(boxes[:,3::4],im_shape[0]-1),0)
 return boxes
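A hand-computed example (made-up numbers, assuming the two functions above are in scope) of decoding one delta against one anchor and clipping the result:

    import numpy as np

    # One 16x16 anchor at the origin and one predicted (dx, dy, dw, dh).
    anchor = np.array([[0., 0., 15., 15.]])    # w = h = 16, center (8, 8)
    delta = np.array([[0.1, 0.2, 0.3, 0.4]])

    pred = bbox_transform_inv(anchor, delta)
    # new center: (8 + 0.1*16, 8 + 0.2*16) = (9.6, 11.2)
    # new size:   (16*e^0.3, 16*e^0.4) ≈ (21.60, 23.87)
    # pred ≈ [[-1.20, -0.73, 20.40, 23.13]]

    clipped = clip_boxes(pred, np.array([20, 20]))  # image height 20, width 20
    # coordinates clamped to [0, 19]: clipped ≈ [[0., 0., 19., 19.]]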

Once the predictions are obtained, the top pre_nms_topN of them are kept and NMS is applied. The code of nms(dets, thresh) follows.

 if dets.shape[0] == 0:
    return []
 else:
    x1=dets[:, 0]
    y1=dets[:, 1]
    x2=dets[:, 2]
    y2=dets[:, 3]
    scores=dets[:, 4]

    areas=(x2-x1+1)*(y2-y1+1)
    order=scores.argsort()[::-1]

    keep=[]
    while order.size>0:
        i=order[0]
        keep.append(i)
        xx1=np.maximum(x1[i],x1[order[1:]])
        yy1=np.maximum(y1[i],y1[order[1:]])
        xx2=np.minimum(x2[i],x2[order[1:]])
        yy2=np.minimum(y2[i],y2[order[1:]])

        w=np.maximum(0.0,xx2-xx1+1)
        h=np.maximum(0.0,yy2-yy1+1)
        inter=w*h
        ovr=inter/(areas[i]+areas[order[1:]]-inter)

        inds=np.where(ovr<=thresh)[0]
        order=order[inds+1]

    return keep

There are many explanations of NMS elsewhere, so I won't go into detail here; the toy run below shows the effect.
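A quick sanity check with made-up detections: two heavily overlapping boxes collapse to the higher-scoring one, while a distant box survives.

    import numpy as np

    # Three detections as [x1, y1, x2, y2, score].
    dets = np.array([
        [0.,  0.,  10., 10., 0.9],
        [1.,  1.,  11., 11., 0.8],   # IoU with box 0 ≈ 0.70 > 0.5, suppressed
        [20., 20., 30., 30., 0.7],   # far away, kept
    ])
    print(nms(dets, 0.5))            # -> [0, 2]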

Returning to build_proposals, the next call is anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride, all_anchors, num_anchors). Straight to the code:

A=num_anchors
total_anchors=all_anchors.shape[0]
K=total_anchors/num_anchors
im_info=im_info[0]

# allow boxes to sit over the edge by a small amount
_allowed_border=0

# map of shape (..., H, W)
height,width=rpn_cls_score.shape[1:3]

# only keep anchors inside the image
inds_inside=np.where(
    (all_anchors[:,0]>=-_allowed_border)&
    (all_anchors[:,1]>=-_allowed_border)&
    (all_anchors[:,2]<im_info[1]+_allowed_border)&  # width
    (all_anchors[:,3]<im_info[0]+_allowed_border)  # height
)[0]

# keep only inside anchors
anchors=all_anchors[inds_inside,:]

# label: 1 is positive, 0 is negative, -1 is don't care
labels=np.empty((len(inds_inside),),dtype=np.float32)
labels.fill(-1)

# overlaps between the anchors and the gt boxes
overlaps=bbox_overlaps(
    np.ascontiguousarray(anchors,dtype=np.float),
    np.ascontiguousarray(gt_boxes,dtype=np.float))
argmax_overlaps=overlaps.argmax(axis=1)
max_overlaps=overlaps[np.arange(len(inds_inside)),argmax_overlaps]
gt_argmax_overlaps=overlaps.argmax(axis=0)
gt_max_overlaps=overlaps[gt_argmax_overlaps,np.arange(overlaps.shape[1])]
gt_argmax_overlaps=np.where(overlaps==gt_max_overlaps)[0]

if not cfg.FLAGS.rpn_clobber_positives:
    # assign bg labels first so that positive labels can clobber them
    labels[max_overlaps<cfg.FLAGS.rpn_negative_overlap]=0

# fg label: for each gt, the anchor with highest overlap
labels[gt_argmax_overlaps]=1

# fg label: above threshold IoU
labels[max_overlaps>=cfg.FLAGS.rpn_positive_overlap]=1

if cfg.FLAGS.rpn_clobber_positives:
    # assign bg labels last so that negative labels can clobber positives
    labels[max_overlaps<cfg.FLAGS.rpn_negative_overlap]=0

# subsample positive labels if we have too many
num_fg=int(cfg.FLAGS.rpn_fg_fraction*cfg.FLAGS.rpn_batchsize)
fg_inds=np.where(labels==1)[0]
if len(fg_inds)>num_fg:
    disable_inds=npr.choice(
        fg_inds,size=(len(fg_inds)-num_fg),replace=False)
    labels[disable_inds]=-1

# subsample negative labels if we have too many
num_bg=cfg.FLAGS.rpn_batchsize-np.sum(labels == 1)
bg_inds=np.where(labels==0)[0]
if len(bg_inds)>num_bg:
    disable_inds=npr.choice(
        bg_inds,size=(len(bg_inds)-num_bg),replace=False)
    labels[disable_inds]=-1

bbox_targets=_compute_targets(anchors, gt_boxes[argmax_overlaps, :])

bbox_inside_weights=np.zeros((len(inds_inside), 4),dtype=np.float32)
# only the positive ones have regression targets
bbox_inside_weights[labels==1,:]=np.array(cfg.FLAGS2["bbox_inside_weights"])

bbox_outside_weights=np.zeros((len(inds_inside),4),dtype=np.float32)
if cfg.FLAGS.rpn_positive_weight<0:
    # uniform weighting of examples (given non-uniform sampling)
    num_examples=np.sum(labels>=0)
    positive_weights=np.ones((1,4))*1.0/num_examples
    negative_weights=np.ones((1,4))*1.0/num_examples
else:
    assert ((cfg.FLAGS.rpn_positive_weight > 0) &
            (cfg.FLAGS.rpn_positive_weight < 1))
    positive_weights=(cfg.FLAGS.rpn_positive_weight /
                        np.sum(labels==1))
    negative_weights=((1.0-cfg.FLAGS.rpn_positive_weight) /
                        np.sum(labels==0))
bbox_outside_weights[labels==1,:]=positive_weights
bbox_outside_weights[labels==0,:]=negative_weights

# map up to original set of anchors
labels=_unmap(labels,total_anchors,inds_inside,fill=-1)
bbox_targets=_unmap(bbox_targets,total_anchors,inds_inside,fill=0)
bbox_inside_weights=_unmap(bbox_inside_weights,total_anchors,inds_inside,fill=0)
bbox_outside_weights=_unmap(bbox_outside_weights,total_anchors,inds_inside,fill=0)

# labels
labels=labels.reshape((1,height,width,A)).transpose(0,3,1,2)
labels=labels.reshape((1,1,A*height,width))
rpn_labels=labels

# bbox_targets
bbox_targets=bbox_targets.reshape((1,height,width,A*4))

rpn_bbox_targets=bbox_targets
# bbox_inside_weights
bbox_inside_weights=bbox_inside_weights.reshape((1,height,width,A*4))

rpn_bbox_inside_weights=bbox_inside_weights

# bbox_outside_weights
bbox_outside_weights=bbox_outside_weights.reshape((1,height,width,A*4))

rpn_bbox_outside_weights=bbox_outside_weights
return rpn_labels,rpn_bbox_targets,rpn_bbox_inside_weights,rpn_bbox_outside_weights

It first calls bbox_overlaps(boxes, query_boxes) to build overlaps, which is simply the IoU of every anchor against every ground-truth box; look it up if unfamiliar (forgive my laziness):

N=boxes.shape[0]
K=query_boxes.shape[0]
overlaps=np.zeros((N,K),dtype=np.float)
for k in range(K):
    box_area = (
        (query_boxes[k,2]-query_boxes[k,0]+1)*
        (query_boxes[k,3]-query_boxes[k,1]+1)
    )
    for n in range(N):
        iw=(
            min(boxes[n,2],query_boxes[k,2])-
            max(boxes[n,0],query_boxes[k,0])+1
        )
        if iw>0:
            ih=(
                min(boxes[n,3],query_boxes[k,3])-
                max(boxes[n,1],query_boxes[k,1])+1
            )
            if ih>0:
                ua=float(
                    (boxes[n, 2]-boxes[n,0]+1)*
                    (boxes[n, 3]-boxes[n,1]+1)+
                    box_area-iw*ih
                )
                overlaps[n,k]=iw*ih/ua
return overlaps
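A tiny check with hypothetical boxes (the first pair matches the NMS example above):

    import numpy as np

    boxes = np.array([[0., 0., 10., 10.]])
    query = np.array([[1., 1., 11., 11.], [20., 20., 30., 30.]])
    print(bbox_overlaps(boxes, query))
    # [[0.7042 0.]]  i.e. 100 / (121 + 121 - 100), then no overlap at all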

Positives are then labeled 1 and negatives 0, and 256 anchors are sampled (rpn_batchsize), ideally 128 positive and 128 negative; when one class falls short, the other fills the remainder. Next, rpn_bbox_targets is generated by calling _compute_targets(ex_rois, gt_rois), whose code follows:

    assert ex_rois.shape[0]==gt_rois.shape[0]
    assert ex_rois.shape[1]==4
    assert gt_rois.shape[1]==5

    return bbox_transform(ex_rois,gt_rois[:,:4]).astype(np.float32,copy=False)

which in turn calls bbox_transform(ex_rois, gt_rois). This function produces the offsets, i.e. the gap between each anchor and its gt_box. The code follows:

ex_widths=ex_rois[:,2]-ex_rois[:,0]+1.0
ex_heights=ex_rois[:,3]-ex_rois[:,1]+1.0
ex_ctr_x=ex_rois[:,0]+0.5*ex_widths
ex_ctr_y=ex_rois[:,1]+0.5*ex_heights

gt_widths=gt_rois[:,2]-gt_rois[:,0]+1.0
gt_heights=gt_rois[:,3]-gt_rois[:,1]+1.0
gt_ctr_x=gt_rois[:,0]+0.5*gt_widths
gt_ctr_y=gt_rois[:,1]+0.5*gt_heights

targets_dx=(gt_ctr_x-ex_ctr_x)/ex_widths
targets_dy=(gt_ctr_y-ex_ctr_y)/ex_heights
targets_dw=np.log(gt_widths/ex_widths)
targets_dh=np.log(gt_heights/ex_heights)

targets=np.vstack(
    (targets_dx,targets_dy,targets_dw,targets_dh)).transpose()
return targets
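A worked example with made-up boxes (assuming bbox_transform is in scope); these are the deltas that, fed back through bbox_transform_inv, would essentially recover the ground-truth box:

    import numpy as np

    anchor = np.array([[0., 0., 15., 15.]])   # w = h = 16, center (8, 8)
    gt = np.array([[2., 3., 18., 20.]])       # w = 17, h = 18, center (10.5, 12)

    print(bbox_transform(anchor, gt))
    # dx = (10.5-8)/16 = 0.15625   dy = (12-8)/16 = 0.25
    # dw = log(17/16) ≈ 0.0606     dh = log(18/16) ≈ 0.1178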

Next, rpn_bbox_inside_weights and rpn_bbox_outside_weights are generated; this is essentially data alignment, and it is easy to follow if you have read carefully up to here. One more helper is used, _unmap(data, count, inds, fill=0), which scatters values computed for the inside-image anchors back over the full anchor set:

if len(data.shape)==1:
    ret=np.empty((count,),dtype=np.float32)
    ret.fill(fill)
    ret[inds]=data
else:
    ret=np.empty((count,)+data.shape[1:],dtype=np.float32)
    ret.fill(fill)
    ret[inds, :]=data
return ret
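For instance (toy numbers), labels computed for three inside-image anchors scattered back into a set of seven:

    import numpy as np

    labels_inside = np.array([1., 0., 1.])   # labels for anchors inside the image
    inds_inside = np.array([0, 2, 5])        # their indices in the full anchor set
    print(_unmap(labels_inside, 7, inds_inside, fill=-1))
    # [ 1. -1.  0. -1. -1.  1. -1.]  (everything else is "don't care")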

Finally, the arrays are reshaped into the layouts needed downstream.

Back in build_proposals, the next call is proposal_target_layer(rpn_rois, rpn_scores, gt_boxes, _num_classes).

This function is similar in spirit to the previous one: there, the offsets between anchors and gt_boxes were computed; here, the offsets between the predicted boxes and gt_boxes are computed.

all_rois = rpn_rois
all_scores = rpn_scores

# Include ground-truth boxes in the set of candidate rois
if cfg.FLAGS.proposal_use_gt:
    zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
    all_rois = np.vstack(
        (all_rois, np.hstack((zeros, gt_boxes[:, :-1])))
    )
    # not sure if it a wise appending, but anyway i am not using it
    all_scores = np.vstack((all_scores, zeros))

num_images = 1
rois_per_image = cfg.FLAGS.batch_size / num_images
fg_rois_per_image = np.round(
    cfg.FLAGS.proposal_fg_fraction * rois_per_image)

# Sample rois with classification labels and bounding box regression
# targets
labels, rois, roi_scores, bbox_targets, bbox_inside_weights = _sample_rois(
    all_rois, all_scores, gt_boxes, fg_rois_per_image,
    rois_per_image, _num_classes)

rois = rois.reshape(-1, 5)
roi_scores = roi_scores.reshape(-1)
labels = labels.reshape(-1, 1)
bbox_targets = bbox_targets.reshape(-1, _num_classes * 4)
bbox_inside_weights = bbox_inside_weights.reshape(-1, _num_classes * 4)
bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32)

return rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights

This function first uses _sample_rois(all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):

overlaps = bbox_overlaps(
    np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float),
    np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float))
gt_assignment = overlaps.argmax(axis=1)
max_overlaps = overlaps.max(axis=1)
labels = gt_boxes[gt_assignment, 4]

# Select foreground RoIs as those with >= FG_THRESH overlap
fg_inds = np.where(max_overlaps >= cfg.FLAGS.roi_fg_threshold)[0]
# Guard against the case when an image has fewer than fg_rois_per_image
# Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
bg_inds = np.where((max_overlaps < cfg.FLAGS.roi_bg_threshold_high) &
                   (max_overlaps >= cfg.FLAGS.roi_bg_threshold_low))[0]

# Small modification to the original version where we ensure a fixed number of regions are sampled
if fg_inds.size > 0 and bg_inds.size > 0:
    fg_rois_per_image = min(fg_rois_per_image, fg_inds.size)
    fg_inds = npr.choice(fg_inds, size=int(
        fg_rois_per_image), replace=False)
    bg_rois_per_image = rois_per_image - fg_rois_per_image
    to_replace = bg_inds.size < bg_rois_per_image
    bg_inds = npr.choice(bg_inds, size=int(
        bg_rois_per_image), replace=to_replace)
elif fg_inds.size > 0:
    to_replace = fg_inds.size < rois_per_image
    fg_inds = npr.choice(fg_inds, size=int(
        rois_per_image), replace=to_replace)
    fg_rois_per_image = rois_per_image
elif bg_inds.size > 0:
    to_replace = bg_inds.size < rois_per_image
    bg_inds = npr.choice(bg_inds, size=int(
        rois_per_image), replace=to_replace)
    fg_rois_per_image = 0
else:
    import pdb
    pdb.set_trace()

# The indices that we're selecting (both fg and bg)
keep_inds = np.append(fg_inds, bg_inds)
# Select sampled values from various arrays:
labels = labels[keep_inds]
# Clamp labels for the background RoIs to 0
labels[int(fg_rois_per_image):] = 0
rois = all_rois[keep_inds]
roi_scores = all_scores[keep_inds]

bbox_target_data = _compute_targets(
    rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)

bbox_targets, bbox_inside_weights = \
    _get_bbox_regression_labels(bbox_target_data, num_classes)

return labels, rois, roi_scores, bbox_targets, bbox_inside_weights

which in turn calls the following two helpers:

def _get_bbox_regression_labels(bbox_target_data, num_classes):
    """Bounding-box regression targets (bbox_target_data) are stored in a
    compact form N x (class, tx, ty, tw, th)

    This function expands those targets into the 4-of-4*K representation used
    by the network (i.e. only one class has non-zero targets).

    Returns:
        bbox_target (ndarray): N x 4K blob of regression targets
        bbox_inside_weights (ndarray): N x 4K blob of loss weights
    """

    clss = bbox_target_data[:, 0]
    bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
    bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
    inds = np.where(clss > 0)[0]
    for ind in inds:
        cls = clss[ind]
        start = int(4 * cls)
        end = start + 4
        bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
        bbox_inside_weights[ind, start:end] = cfg.FLAGS2["bbox_inside_weights"]
    return bbox_targets, bbox_inside_weights


def _compute_targets(ex_rois, gt_rois, labels):
    """Compute bounding-box regression targets for an image."""

    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    targets = bbox_transform(ex_rois, gt_rois)
    if cfg.FLAGS.bbox_normalize_targets_precomputed:
        # Optionally normalize targets by a precomputed mean and stdev
        targets = ((targets - np.array(cfg.FLAGS2["bbox_normalize_means"]))
                   / np.array(cfg.FLAGS2["bbox_normalize_stds"]))
    return np.hstack(
        (labels[:, np.newaxis], targets)).astype(np.float32, copy=False)
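A toy run (made-up numbers) of _get_bbox_regression_labels above with num_classes=3: a single RoI of class 2 has its four targets written into columns 8..11 of the 4K-wide blob, and everything else stays zero.

    import numpy as np

    # One row of bbox_target_data: (class, tx, ty, tw, th).
    bbox_target_data = np.array([[2, 0.1, 0.2, 0.3, 0.4]], dtype=np.float32)
    bbox_targets, inside_w = _get_bbox_regression_labels(bbox_target_data, 3)
    print(bbox_targets)
    # [[0. 0. 0. 0.  0. 0. 0. 0.  0.1 0.2 0.3 0.4]]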

The process is basically the same as before, so I won't repeat it.

5. self.build_predictions(self, net, rois, is_training, initializer, initializer_bbox)

After the non-trainable functions above (which mostly just process data), we are back inside the network proper (honestly, the network is the easy part to understand; the data transformations are the real headache).

    pool5=self._crop_pool_layer(net,rois,"pool5")

    pool5_flat=slim.flatten(pool5,scope='flatten')

    # Fully connected layers
    fc6=slim.fully_connected(pool5_flat,4096,scope='fc6')
    if is_training:
        fc6=slim.dropout(fc6,keep_prob=0.5,is_training=True,scope='dropout6')

    fc7=slim.fully_connected(fc6,4096,scope='fc7')
    if is_training:
        fc7=slim.dropout(fc7,keep_prob=0.5,is_training=True,scope='dropout7')

    # Scores and predictions
    cls_score=slim.fully_connected(fc7,self._num_classes,
        weights_initializer=initializer,trainable=is_training,activation_fn=None,scope='cls_score')
    cls_prob=self._softmax_layer(cls_score, "cls_prob")
    bbox_prediction=slim.fully_connected(fc7,self._num_classes*4,
        weights_initializer=initializer_bbox,trainable=is_training,activation_fn=None,scope='bbox_pred')

    return cls_score,cls_prob,bbox_prediction

Very simple. It uses two helper functions: _crop_pool_layer, shown below, and _softmax_layer, which was already listed in section 3.

def _crop_pool_layer(self,bottom,rois,name):
    with tf.variable_scope(name):
        batch_ids=tf.squeeze(tf.slice(rois,[0,0],[-1,1],name="batch_id"),[1])
        # Get the normalized coordinates of bboxes
        bottom_shape=tf.shape(bottom)
        height=(tf.to_float(bottom_shape[1])-1.)*np.float32(self._feat_stride[0])
        width=(tf.to_float(bottom_shape[2])-1.)*np.float32(self._feat_stride[0])
        x1=tf.slice(rois,[0, 1],[-1,1],name="x1")/width
        y1=tf.slice(rois,[0, 2],[-1,1],name="y1")/height
        x2=tf.slice(rois,[0, 3],[-1,1],name="x2")/width
        y2=tf.slice(rois,[0, 4],[-1,1],name="y2")/height
        # Won't be backpropagated to rois anyway, but to save time
        bboxes=tf.stop_gradient(tf.concat([y1,x1,y2,x2],axis=1))
        pre_pool_size=cfg.FLAGS.roi_pooling_size*2
        crops=tf.image.crop_and_resize(bottom,bboxes,tf.to_int32(batch_ids),
            [pre_pool_size,pre_pool_size],name="crops")

    return slim.max_pool2d(crops,[2,2],padding='SAME')
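A minimal sketch of the crop-and-pool idea behind _crop_pool_layer (the 14x14 crop and 7x7 output assume roi_pooling_size=7, the usual VGG16 setting; this is not code from the repo):

    import tensorflow as tf

    # Crop each RoI from the feature map at a normalized [y1, x1, y2, x2] box,
    # resize it to a fixed 14x14 grid, then 2x2 max-pool down to 7x7.
    feature_map = tf.placeholder(tf.float32, [1, None, None, 512])
    boxes = tf.constant([[0.1, 0.1, 0.5, 0.5]])  # one normalized RoI (toy values)
    box_ind = tf.constant([0])                   # it comes from image 0 of the batch
    crops = tf.image.crop_and_resize(feature_map, boxes, box_ind, [14, 14])
    pooled = tf.nn.max_pool(crops, [1, 2, 2, 1], [1, 2, 2, 1], 'SAME')  # [1,7,7,512]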


It should be clear from a read-through. That completes the network part: cls_prob is used for the (two-way) classification and bbox_prediction for predicting positions.

The code comes from GitHub. I'm still a beginner at deep learning and may have misunderstood things; feel free to raise issues and discuss!

My understanding drew on many references:
[Source-code analysis] Text-Detection-with-FRCN
Learning Faster R-CNN from an implementation perspective (with a minimal implementation)
The FasterRCNN series
