def build_network(self, sess, is_training=True):
    """Build the network skeleton.

    Chains the four stages under the 'RCNN' variable scope: VGG feature
    extraction -> RPN -> RoI proposals -> per-RoI predictions.

    Args:
        sess: Not referenced in this method; kept so callers need not change.
        is_training: Forwarded unchanged to every sub-builder.

    Returns:
        Tuple ``(rois, cls_prob, bbox_pred)``.
    """
    with tf.variable_scope('RCNN'):
        # The bbox head gets an initializer with a 10x smaller stddev.
        initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01)
        initializer_bbox = tf.truncated_normal_initializer(mean=0.0, stddev=0.001)

        net = self.VGG(is_training)
        rpn_cls_prob, rpn_bbox_pred, rpn_cls_score, rpn_cls_score_reshape = \
            self.RPN(net, is_training)
        rois = self.build_proposals(
            is_training, rpn_cls_prob, rpn_bbox_pred, rpn_cls_score)
        cls_score, cls_prob, bbox_pred = self.build_predictions(
            net, rois, is_training, initializer, initializer_bbox)
        return rois, cls_prob, bbox_pred
原始特征提取步骤使用经典的VGG16网络
然后将得到的输出net
传到RPN层里
再将得到的对于proposal的预测是前景还是背景(cls),以及具体位置预测传入生成ROIS的函数里
最后将ROIS放入预测函数。
def VGG(self, is_training):
    """Extract the image feature map with a VGG16-style convolution stack.

    Every convolution is 3x3, stride 1, 'SAME' padding with ReLU, so it keeps
    the spatial size; every pooling is 2x2, stride 2, 'VALID', halving it.
    Four pooling layers are applied, so the output is 1/16 of the input size.
    The last convolution of each stage is also stored in ``self._vgg``.

    Args:
        is_training: Not referenced in this method.

    Returns:
        The ``conv5_3`` tensor (512 channels).
    """
    with tf.variable_scope("VGG"):
        with slim.arg_scope([slim.conv2d], stride=1, padding="SAME",
                            activation_fn=tf.nn.relu):
            with slim.arg_scope([slim.max_pool2d], stride=2, padding="VALID"):
                # Stage 1: 64 channels.
                conv1_1 = slim.conv2d(self._image, 64, [3, 3], scope='conv1_1_3x3')
                conv1_2 = slim.conv2d(conv1_1, 64, [3, 3], scope='conv1_2_3x3')
                self._vgg["conv1_2"] = conv1_2
                pool1 = slim.max_pool2d(conv1_2, [2, 2], scope='pool1_2x2')

                # Stage 2: 128 channels.
                conv2_1 = slim.conv2d(pool1, 128, [3, 3], scope='conv2_1_3x3')
                conv2_2 = slim.conv2d(conv2_1, 128, [3, 3], scope='conv2_2_3x3')
                self._vgg["conv2_2"] = conv2_2
                pool2 = slim.max_pool2d(conv2_2, [2, 2], scope='pool2_2x2')

                # Stage 3: 256 channels.
                conv3_1 = slim.conv2d(pool2, 256, [3, 3], scope='conv3_1_3x3')
                conv3_2 = slim.conv2d(conv3_1, 256, [3, 3], scope='conv3_2_3x3')
                conv3_3 = slim.conv2d(conv3_2, 256, [3, 3], scope='conv3_3_3x3')
                self._vgg["conv3_3"] = conv3_3
                pool3 = slim.max_pool2d(conv3_3, [2, 2], scope='pool3_2x2')

                # Stage 4: 512 channels.
                conv4_1 = slim.conv2d(pool3, 512, [3, 3], scope='conv4_1_3x3')
                conv4_2 = slim.conv2d(conv4_1, 512, [3, 3], scope='conv4_2_3x3')
                conv4_3 = slim.conv2d(conv4_2, 512, [3, 3], scope='conv4_3_3x3')
                self._vgg["conv4_3"] = conv4_3
                pool4 = slim.max_pool2d(conv4_3, [2, 2], scope='pool4_2x2')

                # Stage 5: 512 channels, no pooling afterwards.
                conv5_1 = slim.conv2d(pool4, 512, [3, 3], scope='conv5_1_3x3')
                conv5_2 = slim.conv2d(conv5_1, 512, [3, 3], scope='conv5_2_3x3')
                conv5_3 = slim.conv2d(conv5_2, 512, [3, 3], scope='conv5_3_3x3')
                self._vgg["conv5_3"] = conv5_3
                return conv5_3
# Yields a tensor with 512 channels and unfixed height and width
VGG网络中每一个卷积层都是使用的3*3的卷积核,padding=‘SAME’,步长为1,池化层都是2*2,步长为2,padding=‘VALID’,即卷积层不改变长宽,池化层使之缩小二分之一,最后变成1/16。由于都是重复的卷积层,可以直接使用slim.repeat()函数。
self.RPN(tensor,is_training)
rpn层,代替了fastRCNN网络中的selectivesearch,速度上升了很多。
def RPN(self, tensor, is_training):
    """Region proposal network head on top of the backbone feature map.

    A 3x3 convolution is followed by two sibling 1x1 convolutions: one with
    ``num_anchors * 2`` channels for the per-anchor two-way classification
    and one with ``num_anchors * 4`` channels for box regression. All
    intermediate tensors are also stored in ``self._rpn``.

    Args:
        tensor: Feature map to run the RPN on.
        is_training: Passed as ``trainable`` to every convolution.

    Returns:
        Tuple ``(rpn_cls_prob, rpn_bbox_pred, rpn_cls_score,
        rpn_cls_score_reshape)``.
    """
    # Keyword arguments are required here: the first positional parameter of
    # tf.truncated_normal_initializer is the mean, not the stddev.
    initializer = tf.truncated_normal_initializer(mean=0.0, stddev=0.01)
    with tf.variable_scope("RPN"):
        # slim.conv2d names its variable scope through `scope`; it does not
        # accept a `name` keyword.
        rpn_conv1 = slim.conv2d(tensor, 512, [3, 3], padding="SAME", stride=1,
                                trainable=is_training,
                                weights_initializer=initializer,
                                activation_fn=tf.nn.relu, scope="rpn_conv1")
        # Two scores per anchor (binary classification).
        # NOTE(review): a sigmoid here is followed by a softmax below, which
        # bounds the resulting probabilities; the reference implementation
        # uses activation_fn=None -- confirm this is intended.
        rpn_cls_score = slim.conv2d(rpn_conv1, self._num_anchors * 2, [1, 1],
                                    padding="VALID", stride=1,
                                    trainable=is_training,
                                    weights_initializer=initializer,
                                    activation_fn=tf.nn.sigmoid, scope="rpn_cls")
        # Regroup channels into pairs, softmax over each pair, then restore
        # the num_anchors * 2 channel layout.
        rpn_cls_score_reshape = self._reshape_layer(
            rpn_cls_score, 2, 'rpn_cls_score_reshape')
        rpn_cls_prob_reshape = self._softmax_layer(
            rpn_cls_score_reshape, "rpn_cls_prob_reshape")
        rpn_cls_prob = self._reshape_layer(
            rpn_cls_prob_reshape, self._num_anchors * 2, "rpn_cls_prob")
        # Four regression outputs per anchor, no activation.
        rpn_bbox_pred = slim.conv2d(rpn_conv1, self._num_anchors * 4, [1, 1],
                                    trainable=is_training,
                                    weights_initializer=initializer,
                                    padding='VALID', activation_fn=None,
                                    scope='rpn_bbox_pred')

        self._rpn["rpn_conv1"] = rpn_conv1
        self._rpn["rpn_cls_prob"] = rpn_cls_prob
        self._rpn["rpn_bbox_pred"] = rpn_bbox_pred
        self._rpn["rpn_cls_score"] = rpn_cls_score
        self._rpn["rpn_cls_score_reshape"] = rpn_cls_score_reshape
        return rpn_cls_prob, rpn_bbox_pred, rpn_cls_score, rpn_cls_score_reshape
RPN层首先对VGG得到的feature map进行3x3的卷积(可能是为了语义空间转换?某乎看到的。。),然后利用两个1x1的卷积分别进行二分类(背景还是目标)和位置回归。进行分类的卷积核通道数为9×2(9个anchor,每个anchor二分类,sigmoid作为激活函数),进行位置回归的卷积核通道数为9×4(9个anchor,每个anchor有4个位置参数,没有激活函数)。RPN是一个全卷积网络(fully convolutional network),这样对输入图片的尺寸就没有要求了。
在这段代码里还用到啦self._reshape_layer(self,bottom,num_dim,name)
,这个函数主要是将tensor的通道数修改为num_dim,因为是进行二分类,所以需要将9个anchor都进行softmax的操作,即使用函数self._softmax_layer(self,bottom,name)
。下面是这两个函数。
def _softmax_layer(self, bottom, name):
    """Apply softmax over the last axis of ``bottom``.

    For the RPN score tensor (``name == 'rpn_cls_prob_reshape'``) the input
    is first flattened to 2-D, softmaxed row by row, and reshaped back to its
    original dynamic shape. Any other name gets a plain ``tf.nn.softmax``.

    Args:
        bottom: Input score tensor.
        name: Op name; also selects the RPN-specific path.

    Returns:
        Tensor of probabilities with the same shape as ``bottom``.
    """
    if name == 'rpn_cls_prob_reshape':
        input_shape = tf.shape(bottom)
        # Collapse all leading axes so each row holds one score group.
        bottom_reshaped = tf.reshape(bottom, [-1, input_shape[-1]])
        reshaped_score = tf.nn.softmax(bottom_reshaped, name=name)
        return tf.reshape(reshaped_score, input_shape)
    return tf.nn.softmax(bottom, name=name)
def _reshape_layer(self, bottom, num_dim, name):
    """Regroup the channel axis of ``bottom`` so that it has ``num_dim`` channels.

    The tensor is transposed to channel-first order, reshaped to
    ``[batch_size, num_dim, -1, input_shape[2]]``, and transposed back to
    channel-last order.

    Args:
        bottom: 4-D input tensor (indexed as [0, 1, 2, 3] by the transposes).
        num_dim: Channel count of the result.
        name: Variable scope under which the ops are created.

    Returns:
        The reshaped tensor in channel-last order.
    """
    input_shape = tf.shape(bottom)
    with tf.variable_scope(name):
        # Change the channel to the caffe (channel-first) format.
        to_caffe = tf.transpose(bottom, [0, 3, 1, 2])
        # Then force it to have num_dim channels; -1 absorbs the rest.
        reshaped = tf.reshape(
            to_caffe,
            tf.concat(axis=0,
                      values=[[self._batch_size], [num_dim, -1], [input_shape[2]]]))
        # Then swap the channel axis back to the end.
        to_tf = tf.transpose(reshaped, [0, 2, 3, 1])
        return to_tf
self.build_proposals(self,is_training,rpn_cls_prob,rpn_bbox_pred,rpn_cls_score)
这个函数返回的是ROIs(Regions of Interest)
def build_proposals(self, is_training, rpn_cls_prob, rpn_bbox_pred, rpn_cls_score):
    """Turn RPN outputs into regions of interest (RoIs).

    In training mode three numpy layers are wrapped with ``tf.py_func``:
    ``proposal_layer`` (candidate RoIs), ``anchor_target_layer`` (RPN
    training targets, stored in ``self._anchors_targets``) and
    ``proposal_target_layer`` (sampled RoIs plus detection targets, stored in
    ``self._proposal_targets``). In test mode only ``proposal_top_layer`` runs.

    Args:
        is_training: Selects the training or the test path.
        rpn_cls_prob: RPN class probabilities.
        rpn_bbox_pred: RPN box regression output.
        rpn_cls_score: Raw RPN class scores (training path only).

    Returns:
        The ``rois`` tensor; its static shape is set to ``[N, 5]``.
    """
    if is_training:
        with tf.variable_scope("rois"):
            # Named `roi_scores` (not `rois_score`) because that is the name
            # consumed by proposal_target_layer below.
            rois, roi_scores = tf.py_func(
                proposal_layer,
                [rpn_cls_prob, rpn_bbox_pred, self._im_info, self._mode,
                 self._feat_stride, self._anchors, self._num_anchors],
                [tf.float32, tf.float32])
            # py_func outputs carry no static shape; restore it by hand.
            rois.set_shape([None, 5])
            roi_scores.set_shape([None, 1])

        with tf.variable_scope("anchor"):
            (rpn_labels, rpn_bbox_targets,
             rpn_bbox_inside_weights, rpn_bbox_outside_weights) = tf.py_func(
                anchor_target_layer,
                [rpn_cls_score, self._gt_boxes, self._im_info,
                 self._feat_stride, self._anchors, self._num_anchors],
                [tf.float32, tf.float32, tf.float32, tf.float32])
            rpn_labels.set_shape([1, 1, None, None])
            rpn_bbox_targets.set_shape([1, None, None, self._num_anchors * 4])
            rpn_bbox_inside_weights.set_shape([1, None, None, self._num_anchors * 4])
            rpn_bbox_outside_weights.set_shape([1, None, None, self._num_anchors * 4])
            rpn_labels = tf.to_int32(rpn_labels, name="to_int32")
            self._anchors_targets["rpn_labels"] = rpn_labels
            self._anchors_targets["rpn_bbox_targets"] = rpn_bbox_targets
            self._anchors_targets["rpn_bbox_inside_weights"] = rpn_bbox_inside_weights
            self._anchors_targets["rpn_bbox_outside_weights"] = rpn_bbox_outside_weights

        # Make the RoI sampling op depend on the anchor targets.
        with tf.control_dependencies([rpn_labels]):
            with tf.variable_scope("rpn_rois"):
                (rois, roi_scores, labels, bbox_targets,
                 bbox_inside_weights, bbox_outside_weights) = tf.py_func(
                    proposal_target_layer,
                    [rois, roi_scores, self._gt_boxes, self._num_classes],
                    [tf.float32, tf.float32, tf.float32,
                     tf.float32, tf.float32, tf.float32])
                batch = self._training_anchor_batch_size
                rois.set_shape([batch, 5])
                roi_scores.set_shape([batch])
                labels.set_shape([batch, 1])
                bbox_targets.set_shape([batch, self._num_classes * 4])
                bbox_inside_weights.set_shape([batch, self._num_classes * 4])
                bbox_outside_weights.set_shape([batch, self._num_classes * 4])
                self._proposal_targets['rois'] = rois
                self._proposal_targets['labels'] = tf.to_int32(labels, name="to_int32")
                self._proposal_targets['bbox_targets'] = bbox_targets
                self._proposal_targets['bbox_inside_weights'] = bbox_inside_weights
                self._proposal_targets['bbox_outside_weights'] = bbox_outside_weights
    else:
        with tf.variable_scope("rois"):
            rois, rpn_scores = tf.py_func(
                proposal_top_layer,
                [rpn_cls_prob, rpn_bbox_pred, self._im_info,
                 self._feat_stride, self._anchors, self._num_anchors],
                [tf.float32, tf.float32])
            rois.set_shape([cfg.FLAGS.rpn_top_n, 5])
            rpn_scores.set_shape([cfg.FLAGS.rpn_top_n, 1])
    return rois
def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, mode, _feat_stride,
                   anchors, num_anchors):
    """Convert RPN outputs into scored region proposals.

    Steps: decode the regression deltas against the anchors, clip the boxes
    to the image, keep the ``pre_nms_topN`` best-scoring boxes, run NMS, and
    keep the ``post_nms_topN`` survivors.

    Args:
        rpn_cls_prob: RPN probabilities; the channels from ``num_anchors``
            onward are used as the scores.
        rpn_bbox_pred: RPN regression deltas; reshaped to ``(-1, 4)``.
        im_info: Image info; row 0 is used and its first two entries are
            passed to ``clip_boxes`` as the image shape.
        mode: ``"TRAIN"`` selects the training thresholds, anything else the
            test thresholds. May arrive as ``bytes`` (decoded as UTF-8).
        _feat_stride: Not referenced in this function.
        anchors: Anchor boxes matching the rows of the reshaped deltas.
        num_anchors: Number of anchors per feature-map location.

    Returns:
        Tuple ``(blob, scores)`` where ``blob`` is the kept proposals with a
        leading all-zero batch-index column (float32).
    """
    if isinstance(mode, bytes):
        mode = mode.decode('utf-8')

    if mode == "TRAIN":
        pre_nms_topN = cfg.FLAGS.rpn_train_pre_nms_top_n
        post_nms_topN = cfg.FLAGS.rpn_train_post_nms_top_n
        nms_thresh = cfg.FLAGS.rpn_train_nms_thresh
    else:
        pre_nms_topN = cfg.FLAGS.rpn_test_pre_nms_top_n
        post_nms_topN = cfg.FLAGS.rpn_test_post_nms_top_n
        nms_thresh = cfg.FLAGS.rpn_test_nms_thresh

    im_info = im_info[0]

    # Get the scores and bounding boxes.
    scores = rpn_cls_prob[:, :, :, num_anchors:]
    rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4))
    scores = scores.reshape((-1, 1))
    proposals = bbox_transform_inv(anchors, rpn_bbox_pred)
    proposals = clip_boxes(proposals, im_info[:2])

    # Pick the top region proposals (descending score).
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]

    # Non-maximal suppression.
    keep = nms(np.hstack((proposals, scores)), nms_thresh)

    # Pick the top region proposals after NMS.
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]

    # Only a single image is supported, so every batch index is 0.
    batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
    blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
    return blob, scores
函数实参后面再解释。现在先看函数内部,cfg是一个存储常量值的.py文件,pre_nms_topN和post_nms_topN分别是两次筛选anchor后剩余的数量,第一次为12000,后一次为2000,nms_thresh为nms的阈值,0.7。(train中的值)后面又使用了另一个函数,bbox_transform_inv(boxes,deltas)
返回的是所有预测的proposal,代码如下
def bbox_transform_inv(boxes, deltas):
    """Apply regression deltas to boxes and return the predicted boxes.

    Args:
        boxes: ``(N, 4)`` array of ``[x1, y1, x2, y2]`` boxes.
        deltas: ``(N, 4 * k)`` array of ``[dx, dy, dw, dh]`` groups.

    Returns:
        Array with the shape and dtype of ``deltas`` holding
        ``[x1, y1, x2, y2]`` for every group. When ``boxes`` is empty an
        empty ``(0, deltas.shape[1])`` array is returned.
    """
    if boxes.shape[0] == 0:
        return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)

    boxes = boxes.astype(deltas.dtype, copy=False)

    # Box sizes use the inclusive-pixel convention (+1).
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    dx = deltas[:, 0::4]
    dy = deltas[:, 1::4]
    dw = deltas[:, 2::4]
    dh = deltas[:, 3::4]

    # Centers shift proportionally to the box size; sizes scale by exp(delta).
    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
    pred_w = np.exp(dw) * widths[:, np.newaxis]
    pred_h = np.exp(dh) * heights[:, np.newaxis]

    pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w  # x1
    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h  # y1
    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w  # x2
    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h  # y2
    return pred_boxes
主要是通过对anchors与卷积得到的结果进行运算,得到中心点与宽高的估计值。下面的clip_boxes(boxes,im_shape)
函数是去掉估计值中不合实际的点,即超出或者不足的位置点。score是此proposal为背景还是文本的概率。代码如下,
def clip_boxes(boxes, im_shape):
    """Clip boxes to the image boundaries, in place.

    Args:
        boxes: ``(N, 4 * k)`` array of ``[x1, y1, x2, y2]`` groups; modified
            in place.
        im_shape: Sequence whose first entry is the image height and second
            the image width.

    Returns:
        The same ``boxes`` array, with x limited to ``[0, width - 1]`` and y
        limited to ``[0, height - 1]``.
    """
    max_x = im_shape[1] - 1
    max_y = im_shape[0] - 1
    boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], max_x), 0)  # x1
    boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], max_y), 0)  # y1
    boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], max_x), 0)  # x2
    boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], max_y), 0)  # y2
    return boxes
得到预测之后,筛选出前pre_nms_topN个预测,再进行nms,nms(dets,thresh)
函数的代码如下,
def nms(dets, thresh):
    """Greedy non-maximum suppression.

    Args:
        dets: ``(N, 5)`` array whose columns are ``x1, y1, x2, y2, score``.
        thresh: Boxes whose IoU with an already kept box exceeds this value
            are dropped.

    Returns:
        List of kept row indices into ``dets``, highest score first. An empty
        ``dets`` yields ``[]``.
    """
    if dets.shape[0] == 0:
        return []

    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    scores = dets[:, 4]

    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        # The best remaining box is always kept.
        i = order[0]
        keep.append(i)
        rest = order[1:]

        # Intersection of the kept box with every remaining box.
        xx1 = np.maximum(x1[i], x1[rest])
        yy1 = np.maximum(y1[i], y1[rest])
        xx2 = np.minimum(x2[i], x2[rest])
        yy2 = np.minimum(y2[i], y2[rest])
        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[rest] - inter)

        # `inds` indexes into `rest`, hence the +1 to index into `order`.
        inds = np.where(ovr <= thresh)[0]
        order = order[inds + 1]
    return keep
有很多关于nms的解释,这里不多说。
def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride,
                        all_anchors, num_anchors):
    """Produce RPN training targets (labels and box regression targets).

    Anchors lying fully inside the image are labelled 1 (foreground),
    0 (background) or -1 (ignored) from their overlap with the ground-truth
    boxes, then sub-sampled to at most ``cfg.FLAGS.rpn_batchsize`` examples.
    Results are mapped back onto the full anchor set and reshaped to the
    feature-map layout.

    NOTE(review): in the source text the span between the third inside-image
    test and the foreground sub-sampling was lost (everything between a '<'
    and the following '>' was stripped). That span is reconstructed here
    from the py-faster-rcnn reference implementation; the flag names
    ``rpn_negative_overlap`` and ``rpn_fg_fraction`` are taken from that
    reference -- confirm them against the project's config.

    Args:
        rpn_cls_score: RPN score map; only its height and width (axes 1, 2)
            are read.
        gt_boxes: Ground-truth boxes; the first four columns are coordinates.
        im_info: Image info; row 0 is used as ``[height, width, ...]``.
        _feat_stride: Not referenced in this function.
        all_anchors: ``(total_anchors, 4)`` anchor boxes.
        num_anchors: Number of anchors per feature-map location.

    Returns:
        Tuple ``(rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights,
        rpn_bbox_outside_weights)``.
    """
    A = num_anchors
    total_anchors = all_anchors.shape[0]
    im_info = im_info[0]

    # Allow boxes to sit over the edge by a small amount.
    _allowed_border = 0

    # Map of shape (..., H, W).
    height, width = rpn_cls_score.shape[1:3]

    # Only keep anchors inside the image.
    inds_inside = np.where(
        (all_anchors[:, 0] >= -_allowed_border) &
        (all_anchors[:, 1] >= -_allowed_border) &
        (all_anchors[:, 2] < im_info[1] + _allowed_border) &  # width
        (all_anchors[:, 3] < im_info[0] + _allowed_border)    # height
    )[0]
    anchors = all_anchors[inds_inside, :]

    # Label: 1 is positive, 0 is negative, -1 is "don't care".
    labels = np.empty((len(inds_inside),), dtype=np.float32)
    labels.fill(-1)

    # Overlaps between the anchors and the ground-truth boxes.
    overlaps = bbox_overlaps(
        np.ascontiguousarray(anchors, dtype=np.float64),
        np.ascontiguousarray(gt_boxes, dtype=np.float64))
    argmax_overlaps = overlaps.argmax(axis=1)
    max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
    gt_argmax_overlaps = overlaps.argmax(axis=0)
    gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
    gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

    if not cfg.FLAGS.rpn_clobber_positives:
        # Assign bg labels first so that positive labels can clobber them.
        labels[max_overlaps < cfg.FLAGS.rpn_negative_overlap] = 0

    # Foreground: for each ground-truth box, the anchor with highest overlap.
    labels[gt_argmax_overlaps] = 1

    # Foreground: overlap above the positive threshold.
    labels[max_overlaps >= cfg.FLAGS.rpn_positive_overlap] = 1

    if cfg.FLAGS.rpn_clobber_positives:
        # Assign bg labels last so that negative labels can clobber positives.
        labels[max_overlaps < cfg.FLAGS.rpn_negative_overlap] = 0

    # Subsample positive labels if we have too many.
    num_fg = int(cfg.FLAGS.rpn_fg_fraction * cfg.FLAGS.rpn_batchsize)
    fg_inds = np.where(labels == 1)[0]
    if len(fg_inds) > num_fg:
        disable_inds = npr.choice(
            fg_inds, size=(len(fg_inds) - num_fg), replace=False)
        labels[disable_inds] = -1

    # Subsample negative labels if we have too many.
    num_bg = cfg.FLAGS.rpn_batchsize - np.sum(labels == 1)
    bg_inds = np.where(labels == 0)[0]
    if len(bg_inds) > num_bg:
        disable_inds = npr.choice(
            bg_inds, size=(len(bg_inds) - num_bg), replace=False)
        labels[disable_inds] = -1

    bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

    bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    # Only the positive ones have regression targets.
    bbox_inside_weights[labels == 1, :] = np.array(cfg.FLAGS2["bbox_inside_weights"])

    bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
    if cfg.FLAGS.rpn_positive_weight < 0:
        # Uniform weighting of examples (given non-uniform sampling).
        num_examples = np.sum(labels >= 0)
        positive_weights = np.ones((1, 4)) * 1.0 / num_examples
        negative_weights = np.ones((1, 4)) * 1.0 / num_examples
    else:
        assert ((cfg.FLAGS.rpn_positive_weight > 0) &
                (cfg.FLAGS.rpn_positive_weight < 1))
        positive_weights = (cfg.FLAGS.rpn_positive_weight /
                            np.sum(labels == 1))
        negative_weights = ((1.0 - cfg.FLAGS.rpn_positive_weight) /
                            np.sum(labels == 0))
    bbox_outside_weights[labels == 1, :] = positive_weights
    bbox_outside_weights[labels == 0, :] = negative_weights

    # Map up to the original set of anchors.
    labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
    bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
    bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
    bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)

    # Labels: (1, H, W, A) -> (1, A, H, W) -> (1, 1, A * H, W).
    labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
    rpn_labels = labels.reshape((1, 1, A * height, width))

    # Regression targets and weights: (1, H, W, A * 4).
    rpn_bbox_targets = bbox_targets.reshape((1, height, width, A * 4))
    rpn_bbox_inside_weights = bbox_inside_weights.reshape((1, height, width, A * 4))
    rpn_bbox_outside_weights = bbox_outside_weights.reshape((1, height, width, A * 4))
    return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights
首先调用了bbox_overlaps(boxes,query_boxes)
生成了overlaps,其实就是IoU,具体请自行百度(原谅我的懒惰),
def bbox_overlaps(boxes, query_boxes):
    """Compute the pairwise intersection-over-union (IoU) of two box sets.

    Args:
        boxes: ``(N, 4)`` array of ``[x1, y1, x2, y2]`` boxes.
        query_boxes: ``(K, 4)`` array of ``[x1, y1, x2, y2]`` boxes.

    Returns:
        ``(N, K)`` float64 array; entry ``[n, k]`` is the IoU of ``boxes[n]``
        and ``query_boxes[k]`` (0 where they do not intersect).
    """
    N = boxes.shape[0]
    K = query_boxes.shape[0]
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    overlaps = np.zeros((N, K), dtype=np.float64)
    for k in range(K):
        box_area = (
            (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
            (query_boxes[k, 3] - query_boxes[k, 1] + 1)
        )
        for n in range(N):
            # Intersection width; non-positive means no horizontal overlap.
            iw = (
                min(boxes[n, 2], query_boxes[k, 2]) -
                max(boxes[n, 0], query_boxes[k, 0]) + 1
            )
            if iw > 0:
                ih = (
                    min(boxes[n, 3], query_boxes[k, 3]) -
                    max(boxes[n, 1], query_boxes[k, 1]) + 1
                )
                if ih > 0:
                    # Union area = area(a) + area(b) - intersection.
                    ua = float(
                        (boxes[n, 2] - boxes[n, 0] + 1) *
                        (boxes[n, 3] - boxes[n, 1] + 1) +
                        box_area - iw * ih
                    )
                    overlaps[n, k] = iw * ih / ua
    return overlaps
然后将正样本设置为1,负样本为0。然后筛选出了256个anchors,正负样本各128,不够就用另一个填充。然后生成了rpn_bbox_targets,这里调用了_compute_targets(ex_rois,gt_rois)
函数,代码如下,
def _compute_targets(ex_rois, gt_rois):
    """Compute bounding-box regression targets for the anchors of one image.

    NOTE(review): a three-argument ``_compute_targets`` with the same name
    appears later in this text; presumably the two live in different
    modules -- confirm before placing both in one file.

    Args:
        ex_rois: ``(N, 4)`` anchor boxes.
        gt_rois: ``(N, 5)`` matched ground-truth rows; only the first four
            columns are used.

    Returns:
        float32 array of targets produced by ``bbox_transform``.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 5

    targets = bbox_transform(ex_rois, gt_rois[:, :4])
    return targets.astype(np.float32, copy=False)
同时这个函数又调用了bbox_transform(ex_rois,gt_rois)
函数,这个函数生成的是偏移量,就是anchor与gt_box的差距值,代码如下,
def bbox_transform(ex_rois, gt_rois):
    """Compute the regression offsets that map ``ex_rois`` onto ``gt_rois``.

    Args:
        ex_rois: ``(N, 4)`` array of ``[x1, y1, x2, y2]`` source boxes.
        gt_rois: ``(N, 4)`` array of ``[x1, y1, x2, y2]`` target boxes.

    Returns:
        ``(N, 4)`` array of ``[dx, dy, dw, dh]``: center shifts normalized by
        the source size, and log ratios of target size to source size.
    """
    # Source boxes: size (inclusive-pixel convention) and center.
    ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
    ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
    ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
    ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights

    # Target boxes: size and center.
    gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
    gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
    gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
    gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights

    targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
    targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
    targets_dw = np.log(gt_widths / ex_widths)
    targets_dh = np.log(gt_heights / ex_heights)

    # Stack as rows, then transpose to one row per box.
    targets = np.vstack(
        (targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
    return targets
接着又生成了rpn_bbox_outside_weights和rpn_bbox_inside_weight,其实就是做了下数据对齐,从前面认真看过来,很好理解,这里用到了另一个函数_unmap(data,count,inds,fill=0)
,
if len(data.shape)==1:
ret=np.empty((count,),dtype=np.float32)
ret.fill(fill)
ret[inds]=data
else:
ret=np.empty((count,)+data.shape[1:],dtype=np.float32)
ret.fill(fill)
ret[inds, :]=data
return ret
最后就是将数据的shape改变成需要用的形式。
proposal_target_layer(rpn_rois,rpn_scores,gt_boxes,_num_classes)
这个函数与上面那个部分功能类似,在上一个函数中,所求的是anchor与gt_box的差距值,在这求的是卷积得到的bbox与gt_box的差距值。
def proposal_target_layer(rpn_rois, rpn_scores, gt_boxes, _num_classes):
    """Sample RoIs and build their classification / regression targets.

    Args:
        rpn_rois: Candidate RoIs; rows are reshaped to 5 columns on output.
        rpn_scores: Scores matching ``rpn_rois``.
        gt_boxes: Ground-truth boxes; the last column is dropped when the
            boxes are appended to the candidate set.
        _num_classes: Number of classes; sizes the per-class target columns.

    Returns:
        Tuple ``(rois, roi_scores, labels, bbox_targets,
        bbox_inside_weights, bbox_outside_weights)``.
    """
    all_rois = rpn_rois
    all_scores = rpn_scores

    # Include ground-truth boxes in the set of candidate rois.
    if cfg.FLAGS.proposal_use_gt:
        zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
        all_rois = np.vstack(
            (all_rois, np.hstack((zeros, gt_boxes[:, :-1])))
        )
        # Not sure if it is a wise appending, but the scores are unused.
        all_scores = np.vstack((all_scores, zeros))

    num_images = 1
    rois_per_image = cfg.FLAGS.batch_size / num_images
    fg_rois_per_image = np.round(
        cfg.FLAGS.proposal_fg_fraction * rois_per_image)

    # Sample rois with classification labels and bounding box regression
    # targets.
    labels, rois, roi_scores, bbox_targets, bbox_inside_weights = _sample_rois(
        all_rois, all_scores, gt_boxes, fg_rois_per_image,
        rois_per_image, _num_classes)

    rois = rois.reshape(-1, 5)
    roi_scores = roi_scores.reshape(-1)
    labels = labels.reshape(-1, 1)
    bbox_targets = bbox_targets.reshape(-1, _num_classes * 4)
    bbox_inside_weights = bbox_inside_weights.reshape(-1, _num_classes * 4)
    # Outside weights are 1 wherever an inside weight is positive, else 0.
    bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32)
    return rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights
这个函数首先用到了_sample_rois(all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, num_classes)
函数,
def _sample_rois(all_rois, all_scores, gt_boxes, fg_rois_per_image,
                 rois_per_image, num_classes):
    """Draw a random sample of foreground and background RoIs.

    Args:
        all_rois: Candidate RoIs; columns 1..4 are the box coordinates.
        all_scores: Scores matching ``all_rois``.
        gt_boxes: Ground-truth boxes; columns 0..3 are coordinates and
            column 4 is the class label.
        fg_rois_per_image: Maximum number of foreground RoIs to sample.
        rois_per_image: Total number of RoIs to sample.
        num_classes: Number of classes, forwarded to
            ``_get_bbox_regression_labels``.

    Returns:
        Tuple ``(labels, rois, roi_scores, bbox_targets,
        bbox_inside_weights)``.

    Raises:
        RuntimeError: If there is neither a foreground nor a background
            candidate to sample from.
    """
    # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
    overlaps = bbox_overlaps(
        np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float64),
        np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float64))
    gt_assignment = overlaps.argmax(axis=1)
    max_overlaps = overlaps.max(axis=1)
    labels = gt_boxes[gt_assignment, 4]

    # Select foreground RoIs as those with >= FG_THRESH overlap.
    fg_inds = np.where(max_overlaps >= cfg.FLAGS.roi_fg_threshold)[0]
    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI).
    bg_inds = np.where((max_overlaps < cfg.FLAGS.roi_bg_threshold_high) &
                       (max_overlaps >= cfg.FLAGS.roi_bg_threshold_low))[0]

    # Small modification to the original version: a fixed number of regions
    # is always sampled, with replacement when a pool is too small.
    if fg_inds.size > 0 and bg_inds.size > 0:
        fg_rois_per_image = min(fg_rois_per_image, fg_inds.size)
        fg_inds = npr.choice(
            fg_inds, size=int(fg_rois_per_image), replace=False)
        bg_rois_per_image = rois_per_image - fg_rois_per_image
        to_replace = bg_inds.size < bg_rois_per_image
        bg_inds = npr.choice(
            bg_inds, size=int(bg_rois_per_image), replace=to_replace)
    elif fg_inds.size > 0:
        to_replace = fg_inds.size < rois_per_image
        fg_inds = npr.choice(
            fg_inds, size=int(rois_per_image), replace=to_replace)
        fg_rois_per_image = rois_per_image
    elif bg_inds.size > 0:
        to_replace = bg_inds.size < rois_per_image
        bg_inds = npr.choice(
            bg_inds, size=int(rois_per_image), replace=to_replace)
        fg_rois_per_image = 0
    else:
        # The original dropped into pdb here, which hangs unattended runs.
        raise RuntimeError(
            "_sample_rois: no foreground or background RoIs to sample from")

    # The indices that we're selecting (foreground first, then background).
    keep_inds = np.append(fg_inds, bg_inds)
    labels = labels[keep_inds]
    # Clamp labels for the background RoIs to 0.
    labels[int(fg_rois_per_image):] = 0
    rois = all_rois[keep_inds]
    roi_scores = all_scores[keep_inds]

    bbox_target_data = _compute_targets(
        rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)
    bbox_targets, bbox_inside_weights = \
        _get_bbox_regression_labels(bbox_target_data, num_classes)
    return labels, rois, roi_scores, bbox_targets, bbox_inside_weights
然后在这个里面调用了下面两个
def _get_bbox_regression_labels(bbox_target_data, num_classes):
    """Expand compact regression targets into the per-class layout.

    Bounding-box regression targets (bbox_target_data) are stored in a
    compact form N x (class, tx, ty, tw, th). This function expands those
    targets into the 4-of-4*K representation used by the network (i.e. only
    one class has non-zero targets).

    Args:
        bbox_target_data: ``(N, 5)`` array of ``(class, tx, ty, tw, th)``.
        num_classes: Number of classes ``K``.

    Returns:
        bbox_target (ndarray): N x 4K blob of regression targets
        bbox_inside_weights (ndarray): N x 4K blob of loss weights
    """
    clss = bbox_target_data[:, 0]
    bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
    bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
    # Rows with class 0 keep all-zero targets and weights.
    inds = np.where(clss > 0)[0]
    for ind in inds:
        cls = clss[ind]
        # Each class owns a contiguous group of four columns.
        start = int(4 * cls)
        end = start + 4
        bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
        bbox_inside_weights[ind, start:end] = cfg.FLAGS2["bbox_inside_weights"]
    return bbox_targets, bbox_inside_weights
def _compute_targets(ex_rois, gt_rois, labels):
    """Compute bounding-box regression targets for an image.

    Args:
        ex_rois: ``(N, 4)`` sampled RoI boxes.
        gt_rois: ``(N, 4)`` matched ground-truth boxes.
        labels: ``(N,)`` class labels, prepended as the first column.

    Returns:
        ``(N, 5)`` float32 array of ``(label, dx, dy, dw, dh)``.
    """
    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 4

    targets = bbox_transform(ex_rois, gt_rois)
    if cfg.FLAGS.bbox_normalize_targets_precomputed:
        # Optionally normalize targets by a precomputed mean and stdev.
        targets = ((targets - np.array(cfg.FLAGS2["bbox_normalize_means"]))
                   / np.array(cfg.FLAGS2["bbox_normalize_stds"]))
    return np.hstack(
        (labels[:, np.newaxis], targets)).astype(np.float32, copy=False)
过程和上一个基本相同,不再赘述。
self.build_predictions(self, net, rois, is_training, initializer, initializer_bbox)
经过上面一步不需要训练的函数(主要是对数据进行处理)之后,又回到了网络里面(说实话感觉网络好理解,数据变换是真烦人)
def build_predictions(self, net, rois, is_training, initializer, initializer_bbox):
    """Build the per-RoI classification and box regression heads.

    RoI features are cropped and pooled from ``net``, flattened, passed
    through two 4096-unit fully connected layers (with dropout when
    training), and fed to two sibling output layers.

    Args:
        net: Backbone feature map.
        rois: RoIs to pool features for.
        is_training: Enables dropout and marks the output layers trainable.
        initializer: Weight initializer of the class-score layer.
        initializer_bbox: Weight initializer of the box-regression layer.

    Returns:
        Tuple ``(cls_score, cls_prob, bbox_prediction)``, as unpacked by
        ``build_network``.
    """
    pool5 = self._crop_pool_layer(net, rois, "pool5")
    pool5_flat = slim.flatten(pool5, scope='flatten')

    # Fully connected layers.
    fc6 = slim.fully_connected(pool5_flat, 4096, scope='fc6')
    if is_training:
        fc6 = slim.dropout(fc6, keep_prob=0.5, is_training=True, scope='dropout6')
    fc7 = slim.fully_connected(fc6, 4096, scope='fc7')
    if is_training:
        fc7 = slim.dropout(fc7, keep_prob=0.5, is_training=True, scope='dropout7')

    # Scores and predictions.
    cls_score = slim.fully_connected(
        fc7, self._num_classes,
        weights_initializer=initializer, trainable=is_training,
        activation_fn=None, scope='cls_score')
    cls_prob = self._softmax_layer(cls_score, "cls_prob")
    bbox_prediction = slim.fully_connected(
        fc7, self._num_classes * 4,
        weights_initializer=initializer_bbox, trainable=is_training,
        activation_fn=None, scope='bbox_pred')
    # Without this return the caller's three-way unpacking fails on None.
    return cls_score, cls_prob, bbox_prediction
非常简单,用到了两个函数
def _crop_pool_layer(self, bottom, rois, name):
    """Crop RoI features from ``bottom`` and max-pool them to a fixed size.

    Each RoI is cropped and resized to twice ``cfg.FLAGS.roi_pooling_size``
    per side, then reduced by a 2x2 max pooling.

    Args:
        bottom: Feature map to crop from.
        rois: ``(N, 5)`` tensor of ``(batch_id, x1, y1, x2, y2)``.
        name: Variable scope under which the crop ops are created.

    Returns:
        The pooled crops, one per RoI.
    """
    with tf.variable_scope(name):
        batch_ids = tf.squeeze(tf.slice(rois, [0, 0], [-1, 1], name="batch_id"), [1])
        # Get the normalized coordinates of the boxes: RoI coordinates are
        # divided by (feature size - 1) * feature stride.
        bottom_shape = tf.shape(bottom)
        height = (tf.to_float(bottom_shape[1]) - 1.) * np.float32(self._feat_stride[0])
        width = (tf.to_float(bottom_shape[2]) - 1.) * np.float32(self._feat_stride[0])
        x1 = tf.slice(rois, [0, 1], [-1, 1], name="x1") / width
        y1 = tf.slice(rois, [0, 2], [-1, 1], name="y1") / height
        x2 = tf.slice(rois, [0, 3], [-1, 1], name="x2") / width
        y2 = tf.slice(rois, [0, 4], [-1, 1], name="y2") / height
        # Won't be backpropagated to rois anyway, but to save time.
        # crop_and_resize expects the order [y1, x1, y2, x2].
        bboxes = tf.stop_gradient(tf.concat([y1, x1, y2, x2], axis=1))
        pre_pool_size = cfg.FLAGS.roi_pooling_size * 2
        crops = tf.image.crop_and_resize(
            bottom, bboxes, tf.to_int32(batch_ids),
            [pre_pool_size, pre_pool_size], name="crops")
    return slim.max_pool2d(crops, [2, 2], padding='SAME')
def _softmax_layer(self, bottom, name):
    """Softmax over the last axis, with a reshape round-trip for RPN scores.

    When ``name`` is ``'rpn_cls_prob_reshape'`` the input is flattened to
    2-D before the softmax and restored to its dynamic shape afterwards;
    otherwise ``tf.nn.softmax`` is applied directly.

    Args:
        bottom: Input score tensor.
        name: Op name; also selects the RPN-specific path.

    Returns:
        Probabilities with the same shape as ``bottom``.
    """
    if name == 'rpn_cls_prob_reshape':
        input_shape = tf.shape(bottom)
        # One row per score group, so the softmax normalizes each group.
        bottom_reshaped = tf.reshape(bottom, [-1, input_shape[-1]])
        reshaped_score = tf.nn.softmax(bottom_reshaped, name=name)
        return tf.reshape(reshaped_score, input_shape)
    return tf.nn.softmax(bottom, name=name)
看看就能懂,不多解释,至此,网络部分的所有内容完成,cls_prob用于二分类,bbox_prediction用于预测位置。
*
*
*
*
*
代码来自github~~~初学深度学习,尚是小白一只,可能理解的不对,有问题欢迎一块讨论~
理解过程参考了好多资料
[源码分析]Text-Detection-with-FRCN
从编程实现角度学习Faster R-CNN(附极简实现)
FasterRCNN系列