https://blog.csdn.net/weixin_42206075/article/details/112026162
https://blog.csdn.net/weixin_42206075/article/details/111984086
The two blog posts above explained the training stages of Faster RCNN; this post covers the code for the Faster RCNN inference (prediction) pass.
After running the step1~4 .py files, the three files shown in the figure above are produced: two are trained weight checkpoints, and one is a dictionary of the RoI boxes generated by the RPN. With those in place, run inference.py to make predictions.
***unified_network\config.py***
CLASSES = ["aeroplane", "bicycle", "bird", "boat", "bottle",
           "bus", "car", "cat", "chair", "cow",
           "diningtable", "dog", "horse", "motorbike", "person",
           "pottedplant", "sheep", "sofa", "train", "tvmonitor"] # the 20 detection classes
RATIO = [0.5, 1.0, 2.0] # anchor box aspect ratios
SCALE = [128, 256, 512] # anchor box square side lengths
POOLED_H = 7 # RoI pooling output height
POOLED_W = 7 # RoI pooling output width
IMG_H = 600 # height of the image fed into the Faster RCNN network
IMG_W = 800 # width of the image fed into the Faster RCNN network
NMS_THRESHOLD = 0.7 # IoU threshold used by the NMS that filters RPN proposals (also the positive-sample IoU cutoff when training the RPN)
NUMS_PROPOSAL = 300 # number of RoIs the RPN hands to Fast RCNN: ~2000 per image during training, 300 at inference
MINIBATCH = 256 # number of anchors sampled per image when training the RPN (unused at inference)
BATCHSIZE = 1 # training batch size
LEARNING_RATE = 0.001 # initial learning rate
WEIGHT_DECAY = 0.0005 # weight decay coefficient
MOMENTUM = 0.9 # momentum optimizer parameter
EPSILON = 1e-10 # smoothing constant
XML_PATH = "./VOCdevkit/VOC2007/Annotations/" # path to the xml annotation files
IMG_PATH = "./VOCdevkit/VOC2007/JPEGImages/" # path to the image files
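RATIO and SCALE define 3 × 3 = 9 anchors per feature-map cell. generate_anchors itself lives in rpn_proposal/utils.py and isn't shown in this post, so here is a minimal sketch of the standard scheme (VGG16, stride 16), just to make the (17100, 4) anchor shape that appears in the comments below concrete; generate_anchors_sketch and its exact ordering/ratio convention are assumptions, not the repo's code.

import numpy as np

def generate_anchors_sketch(img_h=600, img_w=800, stride=16,
                            ratios=(0.5, 1.0, 2.0), scales=(128, 256, 512)):
    # Hypothetical stand-in for rpn_proposal.utils.generate_anchors:
    # one (x_center, y_center, w, h) anchor per (cell, ratio, scale).
    fm_h, fm_w = -(-img_h // stride), -(-img_w // stride) # ceil division: 38, 50
    anchors = []
    for i in range(fm_h):
        for j in range(fm_w):
            cx, cy = j * stride + stride / 2.0, i * stride + stride / 2.0
            for r in ratios:
                for s in scales:
                    # keep the area s*s; ratio is read as w/h here (the repo
                    # may use the opposite convention)
                    w, h = s * np.sqrt(r), s / np.sqrt(r)
                    anchors.append([cx, cy, w, h])
    return np.array(anchors, dtype=np.float32)

print(generate_anchors_sketch().shape) # (17100, 4), i.e. 38 * 50 * 9 anchors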
***unified_network\vggnet.py(vgg_16 function)***
Purpose: same as rpn_proposal\vggnet.py; builds the VGG16 backbone and returns the last conv layer's feature map (conv5_3) as the RPN's input.
# TF1/contrib imports these helpers come from (matching tf.contrib's slim vgg implementation):
import tensorflow as tf
from tensorflow.contrib import layers
from tensorflow.contrib.framework.python.ops import arg_scope
from tensorflow.contrib.framework.python.ops import variable_scope
from tensorflow.contrib.layers.python.layers import layers as layers_lib
from tensorflow.contrib.layers.python.layers import utils

def vgg_16(inputs,
           num_classes=1000,
           is_training=True,
           dropout_keep_prob=0.5,
           scope='vgg_16'):
    """Oxford Net VGG 16-Layers version D, truncated at conv5_3.

    Args:
      inputs: a tensor of size [batch_size, height, width, channels].
      num_classes: number of predicted classes (unused in this truncated version).
      is_training: whether or not the model is being trained (unused here).
      dropout_keep_prob: dropout keep probability (unused here).
      scope: optional scope for the variables.

    Returns:
      the conv5_3 feature map, shared by the RPN and the Fast RCNN head.
    """
    # Subtract the per-channel ImageNet mean, then scale by 1/255.
    inputs -= tf.constant([123.68, 116.779, 103.939])
    inputs /= 255
    with variable_scope.variable_scope(scope, 'vgg_16', [inputs]) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with arg_scope(
                [layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d],
                outputs_collections=end_points_collection):
            # conv1 and conv2 are frozen (trainable=False); conv3-conv5 are fine-tuned.
            net = layers_lib.repeat(
                inputs, 2, layers.conv2d, 64, [3, 3], scope='conv1', trainable=False)
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool1', padding="SAME")
            net = layers_lib.repeat(net, 2, layers.conv2d, 128, [3, 3], scope='conv2', trainable=False)
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool2', padding="SAME")
            net = layers_lib.repeat(net, 3, layers.conv2d, 256, [3, 3], scope='conv3')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool3', padding="SAME")
            net = layers_lib.repeat(net, 3, layers.conv2d, 512, [3, 3], scope='conv4')
            net = layers_lib.max_pool2d(net, [2, 2], scope='pool4', padding="SAME")
            net = layers_lib.repeat(net, 3, layers.conv2d, 512, [3, 3], scope='conv5')
            end_points = utils.convert_collection_to_dict(end_points_collection)
            return end_points["vgg_16/conv5/conv5_3"]
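For a 600x800 input, the four SAME-padded 2x2 poolings each halve the spatial size, giving a conv5_3 map of ceil(600/16) x ceil(800/16) = 38 x 50 with 512 channels, which is exactly the shape comment you'll see in unified_net below. A quick sanity check using the vgg_16 defined above:

imgs = tf.placeholder(tf.float32, [1, 600, 800, 3])
feat = vgg_16(imgs)
print(feat.shape) # (1, 38, 50, 512)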
***unified_network\ops.py(rpn2proposal function)***
Purpose: decodes the RPN's two prediction heads (classification and regression) from offsets into boxes, filters them with NMS, and returns the surviving proposal boxes.
def rpn2proposal(cls, reg, anchors):
    cls, reg = cls[0], reg[0] # strip the batch dimension (batch size is 1)
    anchors = tf.constant(anchors, dtype=tf.float32)
    # reg and anchors both have shape (17100, 4); decode the offsets into corner boxes:
    # normal_bbox is (x1, y1, x2, y2), reverse_bbox is (y1, x1, y2, x2) for tf.image ops.
    normal_bbox, reverse_bbox = offset2bbox(reg, anchors)
    score = tf.nn.softmax(cls)[:, 1] # foreground probability of each anchor
    box_idx = tf.image.non_max_suppression(reverse_bbox, score, max_output_size=NUMS_PROPOSAL,
                                           iou_threshold=NMS_THRESHOLD)
    reverse_bbox = tf.nn.embedding_lookup(reverse_bbox, box_idx)
    normal_bbox = tf.nn.embedding_lookup(normal_bbox, box_idx)
    temp = tf.constant([[IMG_H, IMG_W, IMG_H, IMG_W]], dtype=tf.float32)
    reverse_bbox = reverse_bbox / temp # normalize coordinates to [0, 1] for RoI pooling
    bbox_idx = tf.zeros([NUMS_PROPOSAL], dtype=tf.int32) # every RoI belongs to batch image 0
    return normal_bbox, reverse_bbox, bbox_idx
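offset2bbox (also in unified_network/ops.py, not shown in this post) inverts the standard Faster RCNN offset encoding: given predicted offsets (tx, ty, tw, th) and reference boxes (xa, ya, wa, ha), it recovers x = tx*wa + xa, y = ty*ha + ya, w = wa*exp(tw), h = ha*exp(th). Below is a minimal sketch of what it plausibly computes, inferred from how its two outputs are used here; the real implementation may differ.

def offset2bbox_sketch(reg, anchors):
    # Hypothetical stand-in for unified_network.ops.offset2bbox.
    # reg, anchors: (N, 4) tensors; anchors are (x_center, y_center, w, h).
    tx, ty, tw, th = reg[:, 0], reg[:, 1], reg[:, 2], reg[:, 3]
    xa, ya, wa, ha = anchors[:, 0], anchors[:, 1], anchors[:, 2], anchors[:, 3]
    # Invert the Faster RCNN offset parameterization.
    x = tx * wa + xa
    y = ty * ha + ya
    w = wa * tf.exp(tw)
    h = ha * tf.exp(th)
    x1, y1, x2, y2 = x - w / 2, y - h / 2, x + w / 2, y + h / 2
    normal_bbox = tf.stack([x1, y1, x2, y2], axis=1)   # (x1, y1, x2, y2)
    reverse_bbox = tf.stack([y1, x1, y2, x2], axis=1)  # (y1, x1, y2, x2) for tf.image ops
    return normal_bbox, reverse_bbox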
***unified_network\vggnet.py(roi_fc function)***
Purpose: builds the RoI pooling layer and the two fully connected layers (fc6, fc7) at the tail of Faster RCNN.
def roi_fc(inputs, boxes, box_idx, scope='vgg_16'):
    with variable_scope.variable_scope(scope, 'vgg_16', [inputs]) as sc:
        end_points_collection = sc.original_name_scope + '_end_points'
        # Collect outputs for conv2d, fully_connected and max_pool2d.
        with arg_scope(
                [layers.conv2d, layers_lib.fully_connected, layers_lib.max_pool2d],
                outputs_collections=end_points_collection):
            # RoI pooling crops every proposal out of the shared feature map
            # and resizes it to POOLED_H x POOLED_W (7x7).
            net = roi_pooling(inputs, boxes, box_idx)
            # fc6/fc7 are conv2d layers: a 7x7 VALID conv over a 7x7 map
            # collapses it to 1x1, which is equivalent to a fully connected layer.
            net = layers.conv2d(net, 4096, [7, 7], padding='VALID', scope='fc6')
            # NOTE: is_training is hard-coded to True, so dropout stays active
            # even at inference; for pure prediction it should be False.
            net = layers_lib.dropout(net, 0.5, is_training=True, scope='dropout6')
            net = layers.conv2d(net, 4096, [1, 1], scope='fc7')
            net = layers_lib.dropout(net, 0.5, is_training=True, scope='dropout7')
            return net
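roi_pooling is a repo helper that isn't listed here. Since rpn2proposal hands it boxes normalized to [0, 1] in (y1, x1, y2, x2) order together with per-box batch indices, it is almost certainly built on tf.image.crop_and_resize. Below is a sketch of one common variant (crop to 14x14, then 2x2 max-pool down to 7x7; the repo might instead crop straight to 7x7), assuming POOLED_H and POOLED_W from config.py:

def roi_pooling_sketch(feature_map, boxes, box_idx):
    # Hypothetical stand-in for the repo's roi_pooling.
    # feature_map: (1, 38, 50, 512); boxes: (300, 4) normalized (y1, x1, y2, x2);
    # box_idx: (300,) int32 batch indices (all zero for batch size 1).
    # Crop each RoI and bilinearly resize it to 14x14 ...
    crops = tf.image.crop_and_resize(feature_map, boxes, box_idx,
                                     crop_size=[POOLED_H * 2, POOLED_W * 2])
    # ... then 2x2 max-pool down to POOLED_H x POOLED_W.
    return tf.nn.max_pool(crops, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                          padding="SAME") # (300, 7, 7, 512)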
***unified_network\networks.py(unified_net function)***
Purpose: takes a raw input image and returns the final Faster RCNN regression head and classification head, plus the 300 RoI boxes selected by the RPN.
def unified_net(inputs, anchors):
    inputs = vgg_16(inputs) # shared feature map, shape=(1, 38, 50, 512)
    rpn_cls, rpn_reg = rpn(inputs) # shape=(1, 38*50*9, 2), shape=(1, 38*50*9, 4)
    normal_bbox, reverse_bbox, bbox_idx = rpn2proposal(rpn_cls, rpn_reg, anchors)
    inputs = roi_fc(inputs, reverse_bbox, bbox_idx) # shape=(300, 1, 1, 4096)
    inputs = tf.squeeze(inputs, axis=[1, 2]) # shape=(300, 4096)
    cls = fully_connected("classification", inputs, len(CLASSES) + 1) # 20 classes + background
    reg = fully_connected("regression", inputs, 4)
    return cls, reg, normal_bbox
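rpn and fully_connected are repo helpers. Note that fully_connected here takes the scope name first, unlike tf.contrib.layers.fully_connected; the scope names "classification" and "regression" matter because inference.py (below) collects variables by those scopes when restoring checkpoints. A minimal sketch consistent with the call sites (the internals are an assumption):

def fully_connected_sketch(name, inputs, num_out):
    # Hypothetical stand-in for the repo's fully_connected(name, inputs, num_out).
    with tf.variable_scope(name):
        in_dim = inputs.get_shape().as_list()[-1] # 4096
        w = tf.get_variable("weight", [in_dim, num_out],
                            initializer=tf.truncated_normal_initializer(stddev=0.01))
        b = tf.get_variable("bias", [num_out], initializer=tf.zeros_initializer())
        return tf.matmul(inputs, w) + b # raw logits / offsets, no activation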
***inference.py***
Purpose: the main entry point.
import tensorflow as tf
import numpy as np
from PIL import Image
from unified_network.networks import unified_net
from unified_network.ops import offset2bbox, non_max_suppression
from rpn_proposal.utils import generate_anchors, draw_box
from unified_network.config import IMG_H, IMG_W

anchors = generate_anchors()

def inference():
    imgs = tf.placeholder(tf.float32, [1, IMG_H, IMG_W, 3])
    cls, reg, proposal = unified_net(imgs, anchors) # shape=(300, 21), shape=(300, 4), shape=(300, 4)
    # The 300 preliminary RPN proposals come back as corners (x1, y1, x2, y2);
    # re-encode them as (x, y, w, h) so offset2bbox can apply the Fast RCNN
    # regression offsets on top of them.
    x0, y0, x1, y1 = proposal[:, 0:1], proposal[:, 1:2], proposal[:, 2:3], proposal[:, 3:4]
    x, y, w, h = (x0 + x1) / 2, (y0 + y1) / 2, x1 - x0, y1 - y0
    proposal = tf.concat([x, y, w, h], axis=1)
    normal_bbox, reverse_bbox = offset2bbox(reg, proposal)
    cls = tf.nn.softmax(cls)
    # Per-class NMS over the refined boxes (this is the repo's own helper,
    # not tf.image.non_max_suppression).
    boxes, score, classes = non_max_suppression(cls, reverse_bbox)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    # The Fast RCNN weights (VGG backbone + the two heads) and the RPN weights
    # live in separate checkpoints, so restore them with two Savers.
    fast_rcnn_var = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="vgg_16") + \
                    tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="classification") + \
                    tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="regression")
    rpn_var = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="rpn")
    saver = tf.train.Saver(fast_rcnn_var)
    saver.restore(sess, "./fast_rcnn/model/model.ckpt")
    saver = tf.train.Saver(rpn_var)
    saver.restore(sess, "./rpn_proposal/model/model.ckpt")
    IMGS = np.array(Image.open("./road.jpg").resize([IMG_W, IMG_H]))
    [BBOX, SCORE, CLS] = sess.run([boxes, score, classes],
                                  feed_dict={imgs: IMGS[np.newaxis]})
    # Convert the kept corner boxes back to (x, y, w, h) for drawing.
    X0, Y0, X1, Y1 = BBOX[:, 0:1], BBOX[:, 1:2], BBOX[:, 2:3], BBOX[:, 3:4]
    X, Y, W, H = (X0 + X1) / 2, (Y0 + Y1) / 2, X1 - X0, Y1 - Y0
    BBOX = np.concatenate((X, Y, W, H), axis=-1)
    Image.fromarray(np.uint8(draw_box(IMGS, BBOX, CLS))).show()

if __name__ == "__main__":
    inference()
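The non_max_suppression imported from unified_network/ops.py is the repo's own per-class version: it takes the (300, 21) softmax scores and the (300, 4) refined boxes and returns the kept boxes, their scores, and their class indices for draw_box. Its internals aren't shown in this post; the sketch below is one plausible reconstruction, with the score/IoU thresholds, the per-class cap, and the background-at-column-0 layout all assumptions:

def per_class_nms_sketch(cls_prob, boxes, score_threshold=0.5,
                         iou_threshold=0.3, max_per_class=20):
    # Hypothetical stand-in for unified_network.ops.non_max_suppression.
    # cls_prob: (300, 21) softmax output; boxes: (300, 4) corner boxes.
    kept_boxes, kept_scores, kept_classes = [], [], []
    num_cols = cls_prob.get_shape().as_list()[1] # 21
    for c in range(1, num_cols): # column 0 is assumed to be background
        scores_c = cls_prob[:, c]
        idx = tf.image.non_max_suppression(boxes, scores_c,
                                           max_output_size=max_per_class,
                                           iou_threshold=iou_threshold)
        b, s = tf.gather(boxes, idx), tf.gather(scores_c, idx)
        keep = s > score_threshold # drop low-confidence survivors
        kept_boxes.append(tf.boolean_mask(b, keep))
        kept_scores.append(tf.boolean_mask(s, keep))
        kept_classes.append(tf.boolean_mask(tf.fill(tf.shape(idx), c - 1), keep)) # index into CLASSES
    return (tf.concat(kept_boxes, axis=0),
            tf.concat(kept_scores, axis=0),
            tf.concat(kept_classes, axis=0))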