Part 2 of the AVOD code walkthrough. This post starts on the two main parts of the network, the RPN stage and the AVOD stage; their main bodies live in core/avod_model.py and core/rpn_model.py, with some related processing interleaved along the way. This part covers the network input.
def build(self):
rpn_model = self._rpn_model
# Share the same prediction dict as RPN
        # rpn_model.build(): the first (proposal) stage
prediction_dict = rpn_model.build()
        # anchors kept after NMS
top_anchors = prediction_dict[RpnModel.PRED_TOP_ANCHORS]
ground_plane = rpn_model.placeholders[RpnModel.PL_GROUND_PLANE]
        # class labels (1 = car)
class_labels = rpn_model.placeholders[RpnModel.PL_LABEL_CLASSES]
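Right after this, avod_model.py projects these top anchors into the BEV and image views so that ROI features can be cropped for the second stage. Below is a rough, hypothetical numpy sketch of projecting one 6-dim anchor [x, y, z, dim_x, dim_y, dim_z] onto the BEV plane (the real code lives in avod.core.anchor_projector; the helper name and extents here are only illustrative):
import numpy as np

def project_anchor_to_bev(anchor, bev_x_extents, bev_z_extents):
    """Project an [x, y, z, dx, dy, dz] anchor onto the BEV (camera x-z) plane."""
    x, _, z, dx, _, dz = anchor
    # axis-aligned footprint of the box on the ground plane
    x1, x2 = x - dx / 2.0, x + dx / 2.0
    z1, z2 = z - dz / 2.0, z + dz / 2.0
    # normalize into [0, 1] using the BEV area extents
    x_range = bev_x_extents[1] - bev_x_extents[0]
    z_range = bev_z_extents[1] - bev_z_extents[0]
    return [(x1 - bev_x_extents[0]) / x_range,
            (z1 - bev_z_extents[0]) / z_range,
            (x2 - bev_x_extents[0]) / x_range,
            (z2 - bev_z_extents[0]) / z_range]

print(project_anchor_to_bev([0.0, 1.7, 20.0, 3.9, 1.6, 1.6],
                            bev_x_extents=(-40.0, 40.0),
                            bev_z_extents=(0.0, 70.0)))
The first stage itself is rpn_model.build():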
def build(self):
# Setup input placeholders
        # a pile of input info: just reserves placeholder slots for the inputs
self._set_up_input_pls()
# Setup feature extractors
        # feature maps of the input BEV and RGB images
self._set_up_feature_extractors()
# proposal_input
        # BEV and image feature maps after the 1x1 bottleneck convolutions
bev_proposal_input = self.bev_bottleneck
img_proposal_input = self.img_bottleneck
        # fusion parameter: 'mean' fusion divides the summed features by 2
fusion_mean_div_factor = 2.0
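For reference, the 'mean' fusion that follows later in rpn_model.py simply adds the cropped BEV and image ROI features and divides by this factor. A minimal standalone sketch, with illustrative shapes:
import tensorflow as tf

bev_rois = tf.placeholder(tf.float32, [None, 3, 3, 1])  # ROIs cropped from the BEV bottleneck
img_rois = tf.placeholder(tf.float32, [None, 3, 3, 1])  # ROIs cropped from the image bottleneck
fusion_mean_div_factor = 2.0
# element-wise mean of the two modalities
rpn_fusion_out = tf.divide(tf.add(bev_rois, img_rois), fusion_mean_div_factor)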
self._set_up_feature_extractors convolves the input BEV and RGB images to extract their features.
def _set_up_feature_extractors(self):
"""Sets up feature extractors and stores feature maps and
bottlenecks as member variables.
"""
        # feature map of the BEV input (256 channels after conv4)
self.bev_feature_maps, self.bev_end_points = \
self._bev_feature_extractor.build(
self._bev_preprocessed,
self._bev_pixel_size,
self._is_training)
        # feature map of the RGB input; same operations as for the BEV
self.img_feature_maps, self.img_end_points = \
self._img_feature_extractor.build(
self._img_preprocessed,
self._img_pixel_size,
self._is_training)
        # BEV bottleneck: a 1x1 convolution that collapses the feature map
        # to a single channel
        with tf.variable_scope('bev_bottleneck'):
self.bev_bottleneck = slim.conv2d(
self.bev_feature_maps,
1, [1, 1],
scope='bottleneck',
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': self._is_training})
        # image bottleneck: same operation as for the BEV
with tf.variable_scope('img_bottleneck'):
self.img_bottleneck = slim.conv2d(
self.img_feature_maps,
1, [1, 1],
scope='bottleneck',
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': self._is_training})
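The point of the two bottlenecks is to shrink the channel dimension to 1 before ROI features are cropped for a very large number of anchors. A small shape check (the input size is illustrative):
import tensorflow as tf
import tensorflow.contrib.slim as slim

feature_maps = tf.placeholder(tf.float32, [1, 350, 400, 256])  # e.g. upsampled BEV features
bottleneck = slim.conv2d(feature_maps, 1, [1, 1], scope='bottleneck')
print(bottleneck.get_shape().as_list())  # [1, 350, 400, 1]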
self._bev_feature_extractor is the feature extraction network for the BEV input; self._img_feature_extractor applies the same operations to the RGB image.
def build(self,
inputs,
input_pixel_size,
is_training,
scope='bev_vgg'):
""" Modified VGG for BEV feature extraction
Note: All the fully_connected layers have been transformed to conv2d
layers and are implemented in the main model.
Args:
inputs: a tensor of size [batch_size, height, width, channels].
input_pixel_size: size of the input (H x W)
            is_training: True for training, False for validation/testing.
scope: Optional scope for the variables.
Returns:
The last op containing the log predictions and end_points dict.
"""
        # vgg settings come from avod_car_example.config (rpn_config)
vgg_config = self.config
        # inputs: the preprocessed BEV input batch
with slim.arg_scope(self.vgg_arg_scope(
weight_decay=vgg_config.l2_weight_decay)):
with tf.variable_scope(scope, 'bev_vgg', [inputs]) as sc:
                # name of the collection that will gather each layer's outputs
end_points_collection = sc.name + '_end_points'
# Collect outputs for conv2d, fully_connected and max_pool2d.
with slim.arg_scope([slim.conv2d, slim.max_pool2d],
outputs_collections=end_points_collection):
                    # vgg_conv1 = [2, 32]: two repeated 3x3 convs, 32 output channels
net = slim.repeat(inputs,
vgg_config.vgg_conv1[0],
slim.conv2d,
vgg_config.vgg_conv1[1],
[3, 3],
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': is_training},
scope='conv1')
                    # max_pool2d has no trainable parameters
net = slim.max_pool2d(net, [2, 2], scope='pool1')
                    # vgg_conv2 = [2, 64]: two convs, 64 output channels
net = slim.repeat(net,
vgg_config.vgg_conv2[0],
slim.conv2d,
vgg_config.vgg_conv2[1],
[3, 3],
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': is_training},
scope='conv2')
net = slim.max_pool2d(net, [2, 2], scope='pool2')
                    # vgg_conv3 = [3, 128]: three convs, 128 output channels
net = slim.repeat(net,
vgg_config.vgg_conv3[0],
slim.conv2d,
vgg_config.vgg_conv3[1],
[3, 3],
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': is_training},
scope='conv3')
net = slim.max_pool2d(net, [2, 2], scope='pool3')
                    # vgg_conv4 = [3, 256]: three convs, 256 output channels
net = slim.repeat(net,
vgg_config.vgg_conv4[0],
slim.conv2d,
vgg_config.vgg_conv4[1],
[3, 3],
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': is_training},
scope='conv4')
with tf.variable_scope('upsampling'):
# This extractor downsamples the input by a factor
# of 8 (3 maxpool layers)
downsampling_factor = 8
downsampled_shape = input_pixel_size / downsampling_factor
                        # x4, i.e. 1/2 of the original input image size
upsampled_shape = \
downsampled_shape * vgg_config.upsampling_multiplier
                        # bilinear upsampling: the target size in net becomes 1/2
                        # of the original input. This seems slightly at odds with
                        # the paper, since the size here is not the full input size.
feature_maps_out = tf.image.resize_bilinear(
net, upsampled_shape)
                # Convert end_points_collection into an end_points dict.
end_points = slim.utils.convert_collection_to_dict(
end_points_collection)
return feature_maps_out, end_points
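To make the upsampling comment concrete, here is the shape arithmetic with an illustrative BEV input size (the actual size comes from the config):
import numpy as np

input_pixel_size = np.array([700.0, 800.0])  # illustrative H x W of the BEV input
downsampling_factor = 8                      # three 2x2 max-pools
upsampling_multiplier = 4                    # vgg_config.upsampling_multiplier

downsampled_shape = input_pixel_size / downsampling_factor    # [ 87.5  100. ]
upsampled_shape = downsampled_shape * upsampling_multiplier   # [350.  400.], i.e. input / 2
print(upsampled_shape)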
self._set_up_input_pls sets up the inputs. A few details here are still not entirely clear to me, but they do not really affect understanding.
def _set_up_input_pls(self):
"""Sets up input placeholders by adding them to self._placeholders.
Keys are defined as self.PL_*.
"""
# Combine config data
        # input size and depth; the depth is 6. Some config files do not set _bev_pixel_size
bev_dims = np.append(self._bev_pixel_size, self._bev_depth)
        # BEV input
with tf.variable_scope('bev_input'):
# Placeholder for BEV image input, to be filled in with feed_dict
            # PL_BEV_INPUT is the placeholder name; bev_dims is [H, W, 6]
bev_input_placeholder = self._add_placeholder(tf.float32, bev_dims,
self.PL_BEV_INPUT)
            # add a batch dimension at axis 0
self._bev_input_batches = tf.expand_dims(
bev_input_placeholder, axis=0)
            # preprocessing just resizes the input to _bev_pixel_size;
            # some configs do not constrain the BEV size
            self._bev_preprocessed = \
self._bev_feature_extractor.preprocess_input(
self._bev_input_batches, self._bev_pixel_size)
# Summary Images
            # split along the depth axis into 6 single-channel maps
bev_summary_images = tf.split(
bev_input_placeholder, self._bev_depth, axis=2)
tf.summary.image("bev_maps", bev_summary_images,
max_outputs=self._bev_depth)
        # RGB image input
with tf.variable_scope('img_input'):
            # Take variable-size input images: [None, None, 3]
img_input_placeholder = self._add_placeholder(
tf.float32,
[None, None, self._img_depth],
self.PL_IMG_INPUT)
self._img_input_batches = tf.expand_dims(
img_input_placeholder, axis=0)
self._img_preprocessed = \
self._img_feature_extractor.preprocess_input(
self._img_input_batches, self._img_pixel_size)
# Summary Image
tf.summary.image("rgb_image", self._img_preprocessed,
max_outputs=2)
        # labels; _add_placeholder takes (dtype, shape, name)
with tf.variable_scope('pl_labels'):
            # anchors in 6-dim format [x, y, z, dim_x, dim_y, dim_z]
self._add_placeholder(tf.float32, [None, 6],
self.PL_LABEL_ANCHORS)
            # 3D boxes in box_3d format: [x, y, z, l, w, h, ry]
self._add_placeholder(tf.float32, [None, 7],
self.PL_LABEL_BOXES_3D)
            # label classes
self._add_placeholder(tf.float32, [None],
self.PL_LABEL_CLASSES)
# Placeholders for anchors
with tf.variable_scope('pl_anchors'):
self._add_placeholder(tf.float32, [None, 6],
self.PL_ANCHORS)
            # anchor IoUs with the ground-truth boxes
self._add_placeholder(tf.float32, [None],
self.PL_ANCHOR_IOUS)
            # these should be the six regression targets (Δtx, Δty, Δtz, Δdx, Δdy, Δdz)
self._add_placeholder(tf.float32, [None, 6],
self.PL_ANCHOR_OFFSETS)
            # anchor classes
self._add_placeholder(tf.float32, [None],
self.PL_ANCHOR_CLASSES)
        # anchors projected onto the BEV
with tf.variable_scope('bev_anchor_projections'):
            # top-left and bottom-right corner coordinates [x1, y1, x2, y2]
self._add_placeholder(tf.float32, [None, 4],
self.PL_BEV_ANCHORS)
            # normalized version (box coordinates scaled into [0, 1])
self._bev_anchors_norm_pl = self._add_placeholder(
tf.float32, [None, 4], self.PL_BEV_ANCHORS_NORM)
        # anchors projected onto the RGB image
        with tf.variable_scope('img_anchor_projections'):
self._add_placeholder(tf.float32, [None, 4],
self.PL_IMG_ANCHORS)
self._img_anchors_norm_pl = self._add_placeholder(
tf.float32, [None, 4], self.PL_IMG_ANCHORS_NORM)
        # per-sample info: the calibration matrix (from the KITTI calib txt
        # files), image index and ground plane
        with tf.variable_scope('sample_info'):
# the calib matrix shape is (3 x 4)
            # PL_CALIB_P2
self._add_placeholder(
tf.float32, [3, 4], self.PL_CALIB_P2)
            # PL_IMG_IDX
self._add_placeholder(tf.int32,
shape=[1],
name=self.PL_IMG_IDX)
            # PL_GROUND_PLANE
self._add_placeholder(tf.float32, [4], self.PL_GROUND_PLANE)
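All of these calls go through one small helper. A minimal sketch of what _add_placeholder most likely looks like: it creates the placeholder and registers it in the placeholders dict under its name, which is why rpn_model.placeholders[RpnModel.PL_GROUND_PLANE] works in avod_model.py above:
def _add_placeholder(self, dtype, shape, name):
    placeholder = tf.placeholder(dtype, shape, name)
    self.placeholders[name] = placeholder
    return placeholder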