This part mainly consists of the following:
Generating the input data for the RPN: the per-class clustered anchor size information, and the anchor information generated for each individual sample.
base_dir = avod/
config = avod/avod/configs/mb_preprocessing/rpn_cars(cyclists,pedestrians,people).config
Main related module call chain:
scripts/preprocessing/gen_min_batches.py->avod/builders/dataset_builder.py(build_kitti_dataset)->avod/datasets/kitti/kitti_dataset.py(KittiDataset)->avod/datasets/kitti/kitti_utils.py(KittiUtils)->avod/core/mini_batch_utils.py(MiniBatchUtils.preprocess_rpn_mini_batches)->avod/core/mini_batch_preprocessor.py(MiniBatchPreprocessor.preprocess)->avod/core/anchor_generator/grid_anchor_3d_generator.py(GridAnchor3dGenerator.generate)
Data preprocessing: mini_batch anchor generation
The AVOD preprocessing step gen_minbatch consists of two parts: clustering the box sizes of each class, and using the clustering results to generate per-class anchor information that serves as the RPN input data.
The anchor info is laid out as [max_gt_2d_iou, max_gt_3d_iou, (6 x offsets), class_index]: the anchor's IoU with its best-matching ground truth (in 2D and 3D), the anchor-to-ground-truth offsets, and the index of the matched class.
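As a minimal illustration, a row of the saved anchors_info array could be unpacked as follows, assuming the 9-column layout described above (the exact column order is defined in mini_batch_utils; the file name is hypothetical):
import numpy as np

# Hypothetical sketch: unpack one row of a saved anchors_info array
anchors_info = np.load('000123.npy')           # assumed per-sample output file
row = anchors_info[0]
max_gt_iou_2d, max_gt_iou_3d = row[0], row[1]  # IoU with the best-matching gt
offsets = row[2:8]                             # 6 offsets to the matched gt
class_index = row[8]                           # index of the matched class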
The specific steps are shown in the core code below:
# mini_batch_preprocessor.py:49
def preprocess(self, indices):
"""Preprocesses anchor info and saves info to files
Args:
indices (int array): sample indices to process.
If None, processes all samples
"""
# Get anchor stride for class (default 0.5)
anchor_strides = self._anchor_strides
dataset = self._dataset
dataset_utils = self._dataset.kitti_utils
classes_name = dataset.classes_name
# Make folder if it doesn't exist yet
output_dir = self.mini_batch_utils.get_file_path(classes_name,
anchor_strides,
sample_name=None)
os.makedirs(output_dir, exist_ok=True)
# Get clusters for class
# The generated cluster sizes are used as the anchor sizes
all_clusters_sizes, _ = dataset.get_cluster_info()
# Initialize the 3D anchor generator
anchor_generator = grid_anchor_3d_generator.GridAnchor3dGenerator()
# Load indices of data_split
all_samples = dataset.sample_list
if indices is None:
indices = np.arange(len(all_samples))
num_samples = len(indices)
# For each image in the dataset, save info on the anchors
for sample_idx in indices:
# Get image name for given cluster
sample_name = all_samples[sample_idx].name
img_idx = int(sample_name)
# Check for existing files and skip to the next
if self._check_for_existing(classes_name, anchor_strides,
sample_name):
print("{} / {}: Sample already preprocessed".format(
sample_idx + 1, num_samples, sample_name))
continue
# Get ground truth and filter based on difficulty
ground_truth_list = obj_utils.read_labels(dataset.label_dir,
img_idx)
# Filter objects to dataset classes
filtered_gt_list = dataset_utils.filter_labels(ground_truth_list)
filtered_gt_list = np.asarray(filtered_gt_list)
# Filtering by class has no valid ground truth, skip this image
if len(filtered_gt_list) == 0:
print("{} / {} No {}s for sample {} "
"(Ground Truth Filter)".format(
sample_idx + 1, num_samples,
classes_name, sample_name))
# Output an empty file and move on to the next image.
self._save_to_file(classes_name, anchor_strides, sample_name)
continue
# Get ground plane
ground_plane = obj_utils.get_road_plane(img_idx,
dataset.planes_dir)
image = Image.open(dataset.get_rgb_image_path(sample_name))
image_shape = [image.size[1], image.size[0]]
# Generate sliced 2D voxel grid for filtering
# Generate the 2D voxel grid; only the BEV information inside the image view is kept
vx_grid_2d = dataset_utils.create_sliced_voxel_grid_2d(
sample_name,
source=dataset.bev_source,
image_shape=image_shape)
# List for merging all anchors
all_anchor_boxes_3d = []
# Create anchors for each class
for class_idx in range(len(dataset.classes)):
# Generate anchors for all classes
# Generate 3D anchors from each class's anchor sizes, the stride and the ground plane
grid_anchor_boxes_3d = anchor_generator.generate(
area_3d=self._area_extents,
anchor_3d_sizes=all_clusters_sizes[class_idx],
anchor_stride=self._anchor_strides[class_idx],
ground_plane=ground_plane)
all_anchor_boxes_3d.extend(grid_anchor_boxes_3d)
# Filter empty anchors
all_anchor_boxes_3d = np.asarray(all_anchor_boxes_3d)
anchors = box_3d_encoder.box_3d_to_anchor(all_anchor_boxes_3d)
empty_anchor_filter = anchor_filter.get_empty_anchor_filter_2d(
anchors, vx_grid_2d, self._density_threshold)
# Calculate anchor info
# Compute the IoU between every anchor and the ground truth to find each anchor's matched target
anchors_info = self._calculate_anchors_info(
all_anchor_boxes_3d, empty_anchor_filter, filtered_gt_list)
anchor_ious = anchors_info[:, self.mini_batch_utils.col_ious]
valid_iou_indices = np.where(anchor_ious > 0.0)[0]
print("{} / {}:"
"{:>6} anchors, "
"{:>6} iou > 0.0, "
"for {:>3} {}(s) for sample {}".format(
sample_idx + 1, num_samples,
len(anchors_info),
len(valid_iou_indices),
len(filtered_gt_list), classes_name, sample_name
))
# Save anchors info
self._save_to_file(classes_name, anchor_strides,
sample_name, anchors_info)
The steps for generating the 3D anchors are:
Determine the anchor generation range (area_extents).
Generate the anchor center points according to the stride.
Generate the size and rotation distributions and combine everything into the anchor matrix.
def tile_anchors_3d(area_extents,
anchor_3d_sizes,
anchor_stride,
ground_plane):
"""
Tiles anchors over the area extents by using meshgrids to
generate combinations of (x, y, z), (l, w, h) and ry.
Args:
area_extents: [[min_x, max_x], [min_y, max_y], [min_z, max_z]]
anchor_3d_sizes: list of 3d anchor sizes N x (l, w, h)
anchor_stride: stride lengths (x_stride, z_stride)
ground_plane: coefficients of the ground plane e.g. [0, -1, 0, 0]
Returns:
boxes: list of 3D anchors in box_3d format N x [x, y, z, l, w, h, ry]
"""
# Convert sizes to ndarray
# Because of the KITTI camera coordinate system: x and z span the ground plane, while y is the height axis
anchor_3d_sizes = np.asarray(anchor_3d_sizes)
anchor_stride_x = anchor_stride[0]
anchor_stride_z = anchor_stride[1]
anchor_rotations = np.asarray([0, np.pi / 2.0])
x_start = area_extents[0][0] + anchor_stride[0] / 2.0
x_end = area_extents[0][1]
x_centers = np.array(np.arange(x_start, x_end, step=anchor_stride_x),
dtype=np.float32)
z_start = area_extents[2][1] - anchor_stride[1] / 2.0
z_end = area_extents[2][0]
z_centers = np.array(np.arange(z_start, z_end, step=-anchor_stride_z),
dtype=np.float32)
# Use ranges for substitution
size_indices = np.arange(0, len(anchor_3d_sizes))
rotation_indices = np.arange(0, len(anchor_rotations))
# Generate matrix for substitution
# e.g. for two sizes and two rotations
# [[x0, z0, 0, 0], [x0, z0, 0, 1], [x0, z0, 1, 0], [x0, z0, 1, 1],
# [x1, z0, 0, 0], [x1, z0, 0, 1], [x1, z0, 1, 0], [x1, z0, 1, 1], ...]
before_sub = np.stack(np.meshgrid(x_centers,
z_centers,
size_indices,
rotation_indices),
axis=4).reshape(-1, 4)
# Place anchors on the ground plane
# Use the meshgrid above to place the anchor center points on the ground plane
a, b, c, d = ground_plane
all_x = before_sub[:, 0]
all_z = before_sub[:, 1]
all_y = -(a * all_x + c * all_z + d) / b
# Create empty matrix to return
num_anchors = len(before_sub)
all_anchor_boxes_3d = np.zeros((num_anchors, 7))
# Fill in x, y, z
all_anchor_boxes_3d[:, 0:3] = np.stack((all_x, all_y, all_z), axis=1)
# Fill in shapes
sizes = anchor_3d_sizes[np.asarray(before_sub[:, 2], np.int32)]
all_anchor_boxes_3d[:, 3:6] = sizes
# Fill in rotations
rotations = anchor_rotations[np.asarray(before_sub[:, 3], np.int32)]
all_anchor_boxes_3d[:, 6] = rotations
return all_anchor_boxes_3d
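For intuition, here is the anchor count that tile_anchors_3d would produce under assumed values (roughly 80 m x 70 m ground-plane extents, a 0.5 m stride, two clustered sizes and two rotations); the actual numbers depend on the config and clusters:
# Illustrative anchor count for assumed extents/stride/cluster values
area_extents = [[-40.0, 40.0], [-5.0, 3.0], [0.0, 70.0]]  # assumed [x, y, z] ranges
anchor_stride = (0.5, 0.5)                                 # assumed x/z stride
num_sizes = 2       # e.g. two clustered (l, w, h) sizes for the class
num_rotations = 2   # 0 and pi/2
num_x = int((area_extents[0][1] - area_extents[0][0]) / anchor_stride[0])  # 160
num_z = int((area_extents[2][1] - area_extents[2][0]) / anchor_stride[1])  # 140
print(num_x * num_z * num_sizes * num_rotations)  # 89600 anchors before filtering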
The overall AVOD model consists of three parts: backbone + RPN + AVOD network; see avod_paperreading for details.
The backbone is a VGG+FPN style structure extended with a BEV feature branch (the 3D lidar data is converted into 2D BEV features) that is later fused with the image features. The RPN generates region proposals, and the AVOD network performs the final object classification and box regression.
base_dir = avod/
Main related module call chain:
config = avod/config/pyramid_cars_with_aug_example.config
scripts/run_training.py->avod/avod/core/trainer.py(builds the model, input data, losses, ops, etc.)->avod/avod/core/models/avod_model.py->avod/avod/core/models/rpn_model.py
Data preprocessing
Unlike the pre-generated data described above, the preprocessing at training time operates on the raw input data. It consists of the following parts:
Reading and filtering the 3D point cloud:
After the point cloud is read in, points outside the image view have to be removed. Two filters are involved: a ground_plane_filter and an image_filter. The former is mainly used for generating the BEV features (voxel spaces are built per height slice and point features are encoded per voxel; see the BEV generation below), while the latter removes the points that fall outside the camera view; a hedged sketch of this image-view filtering is given below.
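A minimal sketch of the image-view filtering idea (not the repository's exact API): project the points into the image with the calibration matrix and keep only those that land inside the image bounds:
import numpy as np

def image_view_filter(points, p2, image_shape):
    # points: (N, 3) in camera coordinates, p2: 3x4 camera projection matrix,
    # image_shape: (height, width). Illustrative sketch only.
    pts_h = np.hstack([points, np.ones((points.shape[0], 1))])  # homogeneous (N, 4)
    proj = pts_h @ p2.T                                          # (N, 3)
    u = proj[:, 0] / proj[:, 2]
    v = proj[:, 1] / proj[:, 2]
    mask = (proj[:, 2] > 0) & \
           (u >= 0) & (u < image_shape[1]) & \
           (v >= 0) & (v < image_shape[0])
    return points[mask]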
BEV map generation
The BEV maps are built from the filtered point cloud: within the height range [height_lo, height_hi] (relative to the ground_plane), num_slices slices are taken along the y axis; each slice is divided into cells of voxel_size, and the maximum point height inside each cell is used as the feature. This yields a feature map of size (bev_width/voxel_size) x (bev_height/voxel_size) x (num_slices+1), where the extra channel stores the density information. The code is as follows:
#avod/avod/datasets/kitti/kitti_utils.py:109
def generate_bev(self,
source,
point_cloud,
ground_plane,
area_extents,
voxel_size):
"""Generates the BEV maps dictionary. One height map is created for
each slice of the point cloud. One density map is created for
the whole point cloud.
Args:
source: point cloud source
point_cloud: point cloud (3, N)
ground_plane: ground plane coefficients
area_extents: 3D area extents
[[min_x, max_x], [min_y, max_y], [min_z, max_z]]
voxel_size: voxel size in m
Returns:
BEV maps dictionary
height_maps: list of height maps
density_map: density map
"""
# Get the point cloud points
all_points = np.transpose(point_cloud)
height_maps = []
for slice_idx in range(self.num_slices):
height_lo = self.height_lo + slice_idx * self.height_per_division
height_hi = height_lo + self.height_per_division
# slice_filter keeps the points of each slice by their height relative to the ground_plane
slice_filter = self.kitti_utils.create_slice_filter(
point_cloud,
area_extents,
ground_plane,
height_lo,
height_hi)
# Apply slice filter
slice_points = all_points[slice_filter]
if len(slice_points) > 1:
# Create Voxel Grid 2D
voxel_grid_2d = VoxelGrid2D()
voxel_grid_2d.voxelize_2d(
slice_points, voxel_size,
extents=area_extents,
ground_plane=ground_plane,
create_leaf_layout=False)
# Remove y values (all 0)
voxel_indices = voxel_grid_2d.voxel_indices[:, [0, 2]]
# Create empty BEV images
height_map = np.zeros((voxel_grid_2d.num_divisions[0],
voxel_grid_2d.num_divisions[2]))
# Only update pixels where voxels have max height values,
# and normalize by height of slices
# Build the height_map holding the maximum height per voxel
voxel_grid_2d.heights = voxel_grid_2d.heights - height_lo
height_map[voxel_indices[:, 0], voxel_indices[:, 1]] = \
np.asarray(voxel_grid_2d.heights) / self.height_per_division
height_maps.append(height_map)
# Rotate height maps 90 degrees
# (transpose and flip) is faster than np.rot90
# Presumably due to the different coordinate conventions of the image and BEV maps
height_maps_out = [np.flip(height_maps[map_idx].transpose(), axis=0)
for map_idx in range(len(height_maps))]
# Density filter, computed over the full height range
density_slice_filter = self.kitti_utils.create_slice_filter(
point_cloud,
area_extents,
ground_plane,
self.height_lo,
self.height_hi)
density_points = all_points[density_slice_filter]
# Create Voxel Grid 2D
density_voxel_grid_2d = VoxelGrid2D()
density_voxel_grid_2d.voxelize_2d(
density_points,
voxel_size,
extents=area_extents,
ground_plane=ground_plane,
create_leaf_layout=False)
# Generate density map
density_voxel_indices_2d = \
density_voxel_grid_2d.voxel_indices[:, [0, 2]]
density_map = self._create_density_map(
num_divisions=density_voxel_grid_2d.num_divisions,
voxel_indices_2d=density_voxel_indices_2d,
num_pts_per_voxel=density_voxel_grid_2d.num_pts_in_voxel,
norm_value=self.NORM_VALUES[source])
bev_maps = dict()
bev_maps['height_maps'] = height_maps_out
bev_maps['density_map'] = density_map
return bev_maps
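Under assumed (typical KITTI-style) settings the resulting BEV input size can be worked out directly; the actual values come from the config:
# Illustrative BEV feature map size, assuming 80 m x 70 m BEV extents,
# voxel_size = 0.1 and num_slices = 5
bev_extents_x, bev_extents_z = 80.0, 70.0
voxel_size = 0.1
num_slices = 5
bev_width = int(bev_extents_x / voxel_size)   # 800
bev_height = int(bev_extents_z / voxel_size)  # 700
num_channels = num_slices + 1                 # 5 height maps + 1 density map
print(bev_width, bev_height, num_channels)    # 800 700 6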
Data augmentation
Augmentation is applied while the input data is read. For cars the default augmentations are flipping and pca_jitter; a hedged sketch of the flipping operation is given below.
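As an illustration of the flipping augmentation (not the repository's exact implementation), horizontally flipping a sample mirrors the image, the point x coordinates and the box centers/headings:
import numpy as np

def flip_sample(image, points, boxes_3d):
    # Illustrative horizontal flip. points: (N, 3) in camera coordinates,
    # boxes_3d: (M, 7) rows of [x, y, z, l, w, h, ry].
    image_flipped = image[:, ::-1]                      # mirror image columns
    points_flipped = points.copy()
    points_flipped[:, 0] = -points_flipped[:, 0]        # mirror x
    boxes_flipped = boxes_3d.copy()
    boxes_flipped[:, 0] = -boxes_flipped[:, 0]          # mirror box center x
    boxes_flipped[:, 6] = np.pi - boxes_flipped[:, 6]   # mirror heading angle
    return image_flipped, points_flipped, boxes_flipped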
Backbone
The backbone (feature extractor) has two branches, BEV and image, with essentially the same structure (see the code below). It can be summarized as conv1*2->pool1->conv2*2->pool2->conv3*2->pool3->conv4->(upconv3+concat3+fusion3)->(upconv2+concat2+fusion2)->(upconv1+concat1+fusion1).
#avod/core/feature_extractors/bev_vgg_pyramid.py:30
def build(self,
inputs,
input_pixel_size,
is_training,
scope='bev_vgg_pyr'):
""" Modified VGG for BEV feature extraction with pyramid features
Args:
inputs: a tensor of size [batch_size, height, width, channels].
input_pixel_size: size of the input (H x W)
is_training: True for training, False for validation/testing.
scope: Optional scope for the variables.
Returns:
The last op containing the log predictions and end_points dict.
"""
vgg_config = self.config
with slim.arg_scope(self.vgg_arg_scope(
weight_decay=vgg_config.l2_weight_decay)):
with tf.variable_scope(scope, 'bev_vgg_pyr', [inputs]) as sc:
end_points_collection = sc.name + '_end_points'
# Collect outputs for conv2d, fully_connected and max_pool2d.
with slim.arg_scope([slim.conv2d, slim.max_pool2d],
outputs_collections=end_points_collection):
# Pad 700 to 704 to allow even divisions for max pooling
padded = tf.pad(inputs, [[0, 0], [4, 0], [0, 0], [0, 0]])
# Encoder
conv1 = slim.repeat(padded,
vgg_config.vgg_conv1[0],
slim.conv2d,
vgg_config.vgg_conv1[1],
[3, 3],
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': is_training},
scope='conv1')
pool1 = slim.max_pool2d(conv1, [2, 2], scope='pool1')
conv2 = slim.repeat(pool1,
vgg_config.vgg_conv2[0],
slim.conv2d,
vgg_config.vgg_conv2[1],
[3, 3],
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': is_training},
scope='conv2')
pool2 = slim.max_pool2d(conv2, [2, 2], scope='pool2')
conv3 = slim.repeat(pool2,
vgg_config.vgg_conv3[0],
slim.conv2d,
vgg_config.vgg_conv3[1],
[3, 3],
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': is_training},
scope='conv3')
pool3 = slim.max_pool2d(conv3, [2, 2], scope='pool3')
conv4 = slim.repeat(pool3,
vgg_config.vgg_conv4[0],
slim.conv2d,
vgg_config.vgg_conv4[1],
[3, 3],
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': is_training},
scope='conv4')
# Decoder (upsample and fuse features)
upconv3 = slim.conv2d_transpose(
conv4,
vgg_config.vgg_conv3[1],
[3, 3],
stride=2,
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': is_training},
scope='upconv3')
concat3 = tf.concat(
(conv3, upconv3), axis=3, name='concat3')
pyramid_fusion3 = slim.conv2d(
concat3,
vgg_config.vgg_conv2[1],
[3, 3],
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': is_training},
scope='pyramid_fusion3')
upconv2 = slim.conv2d_transpose(
pyramid_fusion3,
vgg_config.vgg_conv2[1],
[3, 3],
stride=2,
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': is_training},
scope='upconv2')
concat2 = tf.concat(
(conv2, upconv2), axis=3, name='concat2')
pyramid_fusion_2 = slim.conv2d(
concat2,
vgg_config.vgg_conv1[1],
[3, 3],
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': is_training},
scope='pyramid_fusion2')
upconv1 = slim.conv2d_transpose(
pyramid_fusion_2,
vgg_config.vgg_conv1[1],
[3, 3],
stride=2,
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': is_training},
scope='upconv1')
concat1 = tf.concat(
(conv1, upconv1), axis=3, name='concat1')
pyramid_fusion1 = slim.conv2d(
concat1,
vgg_config.vgg_conv1[1],
[3, 3],
normalizer_fn=slim.batch_norm,
normalizer_params={
'is_training': is_training},
scope='pyramid_fusion1')
# Slice off padded area
sliced = pyramid_fusion1[:, 4:]
feature_maps_out = sliced
# Convert end_points_collection into a end_point dict.
end_points = slim.utils.convert_collection_to_dict(
end_points_collection)
return feature_maps_out, end_points
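For reference, a rough shape walkthrough under an assumed 700 x 800 BEV input (padded to 704 as in the code above): the three poolings reduce the resolution to 88 x 100 at conv4, and the three stride-2 upconv/fusion steps bring it back up before the padding is sliced off, so the output feature map keeps the input's spatial resolution:
# Illustrative spatial sizes for an assumed 700 x 800 BEV input
h, w = 700, 800
h = h + 4                      # pad 700 -> 704 for even pooling
for _ in range(3):             # pool1, pool2, pool3
    h, w = h // 2, w // 2
print(h, w)                    # 88 100 (conv4 resolution)
for _ in range(3):             # upconv3, upconv2, upconv1 (stride 2)
    h, w = h * 2, w * 2
print(h - 4, w)                # 700 800 after slicing off the padding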
RPN Model
The features from the backbone (feature extraction) each pass through a 1x1 convolution (bottleneck) to produce the input features of the proposal network. The default config enables path_drop: with some probability the image or the BEV path gets no input, similar to dropout (see avod/avod/core/models/rpn_model.py:create_path_drop_masks). The 3D anchors are then projected onto the BEV map and the image: the former by projecting directly onto the ground_plane, the latter through the mapping between lidar and image coordinates (taking the enclosing 2D box). The resulting proposal features are crop_and_resized to the same size given by roi_crop_size in the config, and then fused (mean fusion by default). The fused feature goes through two branches, each made of three convolution layers (fully connected layers in the paper, conv2d in the actual code), predicting objectness and offsets; this forms the first-stage proposals. The proposals are then used in two ways: on one hand, top-k NMS (note that the NMS is performed jointly over all classes) selects the proposals fed to the second stage; on the other hand, gen_mini_batch builds a mini-batch (512 samples by default, half positive and half negative) used to compute the objectness and regression losses (smooth L1). Note that the mini-batch is built by random shuffling: the positives are shuffled and up to half of the batch (256) is taken from them, and if there are not enough positives the remainder is filled with negatives. Class imbalance is not taken into account, so classes with few samples may converge slowly or not at all (a numpy sketch of this sampling is given after the code). The build method is as follows:
#rpn_model.py:280, deleted some code for summary
def build(self):
# Setup input placeholders
self._set_up_input_pls()
# Setup feature extractors
self._set_up_feature_extractors()
bev_proposal_input = self.bev_bottleneck
img_proposal_input = self.img_bottleneck
fusion_mean_div_factor = 2.0
# If both img and bev probabilites are set to 1.0, don't do
# path drop.
if not (self._path_drop_probabilities[0] ==
self._path_drop_probabilities[1] == 1.0):
with tf.variable_scope('rpn_path_drop'):
random_values = tf.random_uniform(shape=[3],
minval=0.0,
maxval=1.0)
img_mask, bev_mask = self.create_path_drop_masks(
self._path_drop_probabilities[0],
self._path_drop_probabilities[1],
random_values)
img_proposal_input = tf.multiply(img_proposal_input,
img_mask)
bev_proposal_input = tf.multiply(bev_proposal_input,
bev_mask)
self.img_path_drop_mask = img_mask
self.bev_path_drop_mask = bev_mask
# Overwrite the division factor
fusion_mean_div_factor = img_mask + bev_mask
with tf.variable_scope('proposal_roi_pooling'):
with tf.variable_scope('box_indices'):
def get_box_indices(boxes):
proposals_shape = boxes.get_shape().as_list()
if any(dim is None for dim in proposals_shape):
proposals_shape = tf.shape(boxes)
ones_mat = tf.ones(proposals_shape[:2], dtype=tf.int32)
multiplier = tf.expand_dims(
tf.range(start=0, limit=proposals_shape[0]), 1)
return tf.reshape(ones_mat * multiplier, [-1])
bev_boxes_norm_batches = tf.expand_dims(
self._bev_anchors_norm_pl, axis=0)
# These should be all 0's since there is only 1 image
tf_box_indices = get_box_indices(bev_boxes_norm_batches)
# Do ROI Pooling on BEV
bev_proposal_rois = tf.image.crop_and_resize(
bev_proposal_input,
self._bev_anchors_norm_pl,
tf_box_indices,
self._proposal_roi_crop_size)
# Do ROI Pooling on image
img_proposal_rois = tf.image.crop_and_resize(
img_proposal_input,
self._img_anchors_norm_pl,
tf_box_indices,
self._proposal_roi_crop_size)
with tf.variable_scope('proposal_roi_fusion'):
rpn_fusion_out = None
if self._fusion_method == 'mean':
tf_features_sum = tf.add(bev_proposal_rois, img_proposal_rois)
rpn_fusion_out = tf.divide(tf_features_sum,
fusion_mean_div_factor)
elif self._fusion_method == 'concat':
rpn_fusion_out = tf.concat(
[bev_proposal_rois, img_proposal_rois], axis=3)
else:
raise ValueError('Invalid fusion method', self._fusion_method)
# TODO: move this section into an separate AnchorPredictor class
with tf.variable_scope('anchor_predictor', 'ap', [rpn_fusion_out]):
tensor_in = rpn_fusion_out
# Parse rpn layers config
layers_config = self._config.layers_config.rpn_config
l2_weight_decay = layers_config.l2_weight_decay
if l2_weight_decay > 0:
weights_regularizer = slim.l2_regularizer(l2_weight_decay)
else:
weights_regularizer = None
with slim.arg_scope([slim.conv2d],
weights_regularizer=weights_regularizer):
# Use conv2d instead of fully_connected layers.
cls_fc6 = slim.conv2d(tensor_in,
layers_config.cls_fc6,
self._proposal_roi_crop_size,
padding='VALID',
scope='cls_fc6')
cls_fc6_drop = slim.dropout(cls_fc6,
layers_config.keep_prob,
is_training=self._is_training,
scope='cls_fc6_drop')
cls_fc7 = slim.conv2d(cls_fc6_drop,
layers_config.cls_fc7,
[1, 1],
scope='cls_fc7')
cls_fc7_drop = slim.dropout(cls_fc7,
layers_config.keep_prob,
is_training=self._is_training,
scope='cls_fc7_drop')
cls_fc8 = slim.conv2d(cls_fc7_drop,
2,
[1, 1],
activation_fn=None,
scope='cls_fc8')
objectness = tf.squeeze(
cls_fc8, [1, 2],
name='cls_fc8/squeezed')
# Use conv2d instead of fully_connected layers.
reg_fc6 = slim.conv2d(tensor_in,
layers_config.reg_fc6,
self._proposal_roi_crop_size,
padding='VALID',
scope='reg_fc6')
reg_fc6_drop = slim.dropout(reg_fc6,
layers_config.keep_prob,
is_training=self._is_training,
scope='reg_fc6_drop')
reg_fc7 = slim.conv2d(reg_fc6_drop,
layers_config.reg_fc7,
[1, 1],
scope='reg_fc7')
reg_fc7_drop = slim.dropout(reg_fc7,
layers_config.keep_prob,
is_training=self._is_training,
scope='reg_fc7_drop')
reg_fc8 = slim.conv2d(reg_fc7_drop,
6,
[1, 1],
activation_fn=None,
scope='reg_fc8')
offsets = tf.squeeze(
reg_fc8, [1, 2],
name='reg_fc8/squeezed')
# Return the proposals
with tf.variable_scope('proposals'):
anchors = self.placeholders[self.PL_ANCHORS]
# Decode anchor regression offsets
with tf.variable_scope('decoding'):
regressed_anchors = anchor_encoder.offset_to_anchor(
anchors, offsets)
with tf.variable_scope('bev_projection'):
_, bev_proposal_boxes_norm = anchor_projector.project_to_bev(
regressed_anchors, self._bev_extents)
with tf.variable_scope('softmax'):
objectness_softmax = tf.nn.softmax(objectness)
with tf.variable_scope('nms'):
objectness_scores = objectness_softmax[:, 1]
# Do NMS on regressed anchors
top_indices = tf.image.non_max_suppression(
bev_proposal_boxes_norm, objectness_scores,
max_output_size=self._nms_size,
iou_threshold=self._nms_iou_thresh)
top_anchors = tf.gather(regressed_anchors, top_indices)
top_objectness_softmax = tf.gather(objectness_scores,
top_indices)
# top_offsets = tf.gather(offsets, top_indices)
# top_objectness = tf.gather(objectness, top_indices)
# Get mini batch
all_ious_gt = self.placeholders[self.PL_ANCHOR_IOUS]
all_offsets_gt = self.placeholders[self.PL_ANCHOR_OFFSETS]
all_classes_gt = self.placeholders[self.PL_ANCHOR_CLASSES]
with tf.variable_scope('mini_batch'):
mini_batch_utils = self.dataset.kitti_utils.mini_batch_utils
mini_batch_mask, _ = \
mini_batch_utils.sample_rpn_mini_batch(all_ious_gt)
# Ground Truth Tensors
with tf.variable_scope('one_hot_classes'):
# Anchor classification ground truth
# Object / Not Object
min_pos_iou = \
self.dataset.kitti_utils.mini_batch_utils.rpn_pos_iou_range[0]
objectness_classes_gt = tf.cast(
tf.greater_equal(all_ious_gt, min_pos_iou),
dtype=tf.int32)
objectness_gt = tf.one_hot(
objectness_classes_gt, depth=2,
on_value=1.0 - self._config.label_smoothing_epsilon,
off_value=self._config.label_smoothing_epsilon)
# Mask predictions for mini batch
with tf.variable_scope('prediction_mini_batch'):
objectness_masked = tf.boolean_mask(objectness, mini_batch_mask)
offsets_masked = tf.boolean_mask(offsets, mini_batch_mask)
with tf.variable_scope('ground_truth_mini_batch'):
objectness_gt_masked = tf.boolean_mask(
objectness_gt, mini_batch_mask)
offsets_gt_masked = tf.boolean_mask(all_offsets_gt,
mini_batch_mask)
# Specify the tensors to evaluate
predictions = dict()
# Temporary predictions for debugging
# predictions['anchor_ious'] = anchor_ious
# predictions['anchor_offsets'] = all_offsets_gt
if self._train_val_test in ['train', 'val']:
# All anchors
predictions[self.PRED_ANCHORS] = anchors
# Mini-batch masks
predictions[self.PRED_MB_MASK] = mini_batch_mask
# Mini-batch predictions
predictions[self.PRED_MB_OBJECTNESS] = objectness_masked
predictions[self.PRED_MB_OFFSETS] = offsets_masked
# Mini batch ground truth
predictions[self.PRED_MB_OFFSETS_GT] = offsets_gt_masked
predictions[self.PRED_MB_OBJECTNESS_GT] = objectness_gt_masked
# Proposals after nms
predictions[self.PRED_TOP_INDICES] = top_indices
predictions[self.PRED_TOP_ANCHORS] = top_anchors
predictions[
self.PRED_TOP_OBJECTNESS_SOFTMAX] = top_objectness_softmax
else:
# self._train_val_test == 'test'
predictions[self.PRED_TOP_ANCHORS] = top_anchors
predictions[
self.PRED_TOP_OBJECTNESS_SOFTMAX] = top_objectness_softmax
return predictions
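A minimal numpy sketch of the mini-batch sampling behaviour described above (the actual implementation is mini_batch_utils.sample_rpn_mini_batch and is written in TensorFlow); parameter names are illustrative:
import numpy as np

def sample_rpn_mini_batch_np(ious, pos_iou_range, neg_iou_range,
                             mini_batch_size=512):
    # Take up to mini_batch_size // 2 random positives and fill the
    # remainder with random negatives; no per-class balancing is done.
    pos_indices = np.where((ious >= pos_iou_range[0]) &
                           (ious <= pos_iou_range[1]))[0]
    neg_indices = np.where((ious >= neg_iou_range[0]) &
                           (ious < neg_iou_range[1]))[0]
    np.random.shuffle(pos_indices)
    pos_indices = pos_indices[:mini_batch_size // 2]
    np.random.shuffle(neg_indices)
    neg_indices = neg_indices[:mini_batch_size - len(pos_indices)]
    mask = np.zeros(len(ious), dtype=bool)
    mask[pos_indices] = True
    mask[neg_indices] = True
    return mask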
AVOD Model
The AVOD network takes the top-k anchor proposals from the first stage, computes the corresponding anchor projections onto the BEV and image feature maps, and applies the same crop_and_resize operation. The ROI features then go through fusion followed by n*(fc+fc_drop) layers to predict the class, the offsets and the angle vector (early fusion by default: the features are fused first and then fed through the following layers). After the predictions are produced, the ground truth is projected onto the BEV map, and the same strategy is used to build a mini-batch and to select the top anchors (NMS in BEV), producing the corresponding objectness, offset and angle losses. The mini-batch losses are used to train the model; the top anchors form the final predictions, and their loss does not appear to be used. Note that the offset loss is computed in a 3D box representation (the box_4c encoding proposed in the paper; a rough sketch of this encoding is given at the end of this section). The relevant code is as follows:
#avod_model.py:123 deleted code for summary
def build(self):
rpn_model = self._rpn_model
# Share the same prediction dict as RPN
prediction_dict = rpn_model.build()
top_anchors = prediction_dict[RpnModel.PRED_TOP_ANCHORS]
ground_plane = rpn_model.placeholders[RpnModel.PL_GROUND_PLANE]
class_labels = rpn_model.placeholders[RpnModel.PL_LABEL_CLASSES]
with tf.variable_scope('avod_projection'):
if self._config.expand_proposals_xz > 0.0:
expand_length = self._config.expand_proposals_xz
# Expand anchors along x and z
with tf.variable_scope('expand_xz'):
expanded_dim_x = top_anchors[:, 3] + expand_length
expanded_dim_z = top_anchors[:, 5] + expand_length
expanded_anchors = tf.stack([
top_anchors[:, 0],
top_anchors[:, 1],
top_anchors[:, 2],
expanded_dim_x,
top_anchors[:, 4],
expanded_dim_z
], axis=1)
avod_projection_in = expanded_anchors
else:
avod_projection_in = top_anchors
with tf.variable_scope('bev'):
# Project top anchors into bev and image spaces
bev_proposal_boxes, bev_proposal_boxes_norm = \
anchor_projector.project_to_bev(
avod_projection_in,
self.dataset.kitti_utils.bev_extents)
# Reorder projected boxes into [y1, x1, y2, x2]
bev_proposal_boxes_tf_order = \
anchor_projector.reorder_projected_boxes(
bev_proposal_boxes)
bev_proposal_boxes_norm_tf_order = \
anchor_projector.reorder_projected_boxes(
bev_proposal_boxes_norm)
with tf.variable_scope('img'):
image_shape = tf.cast(tf.shape(
rpn_model.placeholders[RpnModel.PL_IMG_INPUT])[0:2],
tf.float32)
img_proposal_boxes, img_proposal_boxes_norm = \
anchor_projector.tf_project_to_image_space(
avod_projection_in,
rpn_model.placeholders[RpnModel.PL_CALIB_P2],
image_shape)
# Only reorder the normalized img
img_proposal_boxes_norm_tf_order = \
anchor_projector.reorder_projected_boxes(
img_proposal_boxes_norm)
bev_feature_maps = rpn_model.bev_feature_maps
img_feature_maps = rpn_model.img_feature_maps
if not (self._path_drop_probabilities[0] ==
self._path_drop_probabilities[1] == 1.0):
with tf.variable_scope('avod_path_drop'):
img_mask = rpn_model.img_path_drop_mask
bev_mask = rpn_model.bev_path_drop_mask
img_feature_maps = tf.multiply(img_feature_maps,
img_mask)
bev_feature_maps = tf.multiply(bev_feature_maps,
bev_mask)
else:
bev_mask = tf.constant(1.0)
img_mask = tf.constant(1.0)
# ROI Pooling
with tf.variable_scope('avod_roi_pooling'):
def get_box_indices(boxes):
proposals_shape = boxes.get_shape().as_list()
if any(dim is None for dim in proposals_shape):
proposals_shape = tf.shape(boxes)
ones_mat = tf.ones(proposals_shape[:2], dtype=tf.int32)
multiplier = tf.expand_dims(
tf.range(start=0, limit=proposals_shape[0]), 1)
return tf.reshape(ones_mat * multiplier, [-1])
bev_boxes_norm_batches = tf.expand_dims(
bev_proposal_boxes_norm, axis=0)
# These should be all 0's since there is only 1 image
tf_box_indices = get_box_indices(bev_boxes_norm_batches)
# Do ROI Pooling on BEV
bev_rois = tf.image.crop_and_resize(
bev_feature_maps,
bev_proposal_boxes_norm_tf_order,
tf_box_indices,
self._proposal_roi_crop_size,
name='bev_rois')
# Do ROI Pooling on image
img_rois = tf.image.crop_and_resize(
img_feature_maps,
img_proposal_boxes_norm_tf_order,
tf_box_indices,
self._proposal_roi_crop_size,
name='img_rois')
# Fully connected layers (Box Predictor)
avod_layers_config = self.model_config.layers_config.avod_config
fc_output_layers = \
avod_fc_layers_builder.build(
layers_config=avod_layers_config,
input_rois=[bev_rois, img_rois],
input_weights=[bev_mask, img_mask],
num_final_classes=self._num_final_classes,
box_rep=self._box_rep,
top_anchors=top_anchors,
ground_plane=ground_plane,
is_training=self._is_training)
all_cls_logits = \
fc_output_layers[avod_fc_layers_builder.KEY_CLS_LOGITS]
all_offsets = fc_output_layers[avod_fc_layers_builder.KEY_OFFSETS]
# This may be None
all_angle_vectors = \
fc_output_layers.get(avod_fc_layers_builder.KEY_ANGLE_VECTORS)
with tf.variable_scope('softmax'):
all_cls_softmax = tf.nn.softmax(
all_cls_logits)
######################################################
# Subsample mini_batch for the loss function
######################################################
# Get the ground truth tensors
anchors_gt = rpn_model.placeholders[RpnModel.PL_LABEL_ANCHORS]
if self._box_rep in ['box_3d', 'box_4ca']:
boxes_3d_gt = rpn_model.placeholders[RpnModel.PL_LABEL_BOXES_3D]
orientations_gt = boxes_3d_gt[:, 6]
elif self._box_rep in ['box_8c', 'box_8co', 'box_4c']:
boxes_3d_gt = rpn_model.placeholders[RpnModel.PL_LABEL_BOXES_3D]
else:
raise NotImplementedError('Ground truth tensors not implemented')
# Project anchor_gts to 2D bev
with tf.variable_scope('avod_gt_projection'):
bev_anchor_boxes_gt, _ = anchor_projector.project_to_bev(
anchors_gt, self.dataset.kitti_utils.bev_extents)
bev_anchor_boxes_gt_tf_order = \
anchor_projector.reorder_projected_boxes(bev_anchor_boxes_gt)
with tf.variable_scope('avod_box_list'):
# Convert to box_list format
anchor_box_list_gt = box_list.BoxList(bev_anchor_boxes_gt_tf_order)
anchor_box_list = box_list.BoxList(bev_proposal_boxes_tf_order)
# Get the mini-batch mask, the class label indices and the indices of the matched ground truths
mb_mask, mb_class_label_indices, mb_gt_indices = \
self.sample_mini_batch(
anchor_box_list_gt=anchor_box_list_gt,
anchor_box_list=anchor_box_list,
class_labels=class_labels)
# Create classification one_hot vector
with tf.variable_scope('avod_one_hot_classes'):
mb_classification_gt = tf.one_hot(
mb_class_label_indices,
depth=self._num_final_classes,
on_value=1.0 - self._config.label_smoothing_epsilon,
off_value=(self._config.label_smoothing_epsilon /
self.dataset.num_classes))
# TODO: Don't create a mini batch in test mode
# Mask predictions
with tf.variable_scope('avod_apply_mb_mask'):
# Classification
mb_classifications_logits = tf.boolean_mask(
all_cls_logits, mb_mask)
mb_classifications_softmax = tf.boolean_mask(
all_cls_softmax, mb_mask)
# Offsets
mb_offsets = tf.boolean_mask(all_offsets, mb_mask)
# Angle Vectors
if all_angle_vectors is not None:
mb_angle_vectors = tf.boolean_mask(all_angle_vectors, mb_mask)
else:
mb_angle_vectors = None
# Encode anchor offsets
with tf.variable_scope('avod_encode_mb_anchors'):
mb_anchors = tf.boolean_mask(top_anchors, mb_mask)
if self._box_rep == 'box_3d':
# Gather corresponding ground truth anchors for each mb sample
mb_anchors_gt = tf.gather(anchors_gt, mb_gt_indices)
mb_offsets_gt = anchor_encoder.tf_anchor_to_offset(
mb_anchors, mb_anchors_gt)
# Gather corresponding ground truth orientation for each
# mb sample
mb_orientations_gt = tf.gather(orientations_gt,
mb_gt_indices)
elif self._box_rep in ['box_8c', 'box_8co']:
# Get boxes_3d ground truth mini-batch and convert to box_8c
mb_boxes_3d_gt = tf.gather(boxes_3d_gt, mb_gt_indices)
if self._box_rep == 'box_8c':
mb_boxes_8c_gt = \
box_8c_encoder.tf_box_3d_to_box_8c(mb_boxes_3d_gt)
elif self._box_rep == 'box_8co':
mb_boxes_8c_gt = \
box_8c_encoder.tf_box_3d_to_box_8co(mb_boxes_3d_gt)
# Convert proposals: anchors -> box_3d -> box8c
proposal_boxes_3d = \
box_3d_encoder.anchors_to_box_3d(top_anchors, fix_lw=True)
proposal_boxes_8c = \
box_8c_encoder.tf_box_3d_to_box_8c(proposal_boxes_3d)
# Get mini batch offsets
mb_boxes_8c = tf.boolean_mask(proposal_boxes_8c, mb_mask)
mb_offsets_gt = box_8c_encoder.tf_box_8c_to_offsets(
mb_boxes_8c, mb_boxes_8c_gt)
# Flatten the offsets to a (N x 24) vector
mb_offsets_gt = tf.reshape(mb_offsets_gt, [-1, 24])
elif self._box_rep in ['box_4c', 'box_4ca']:
# Get ground plane for box_4c conversion
ground_plane = self._rpn_model.placeholders[
self._rpn_model.PL_GROUND_PLANE]
# Convert gt boxes_3d -> box_4c
mb_boxes_3d_gt = tf.gather(boxes_3d_gt, mb_gt_indices)
mb_boxes_4c_gt = box_4c_encoder.tf_box_3d_to_box_4c(
mb_boxes_3d_gt, ground_plane)
# Convert proposals: anchors -> box_3d -> box_4c
proposal_boxes_3d = \
box_3d_encoder.anchors_to_box_3d(top_anchors, fix_lw=True)
proposal_boxes_4c = \
box_4c_encoder.tf_box_3d_to_box_4c(proposal_boxes_3d,
ground_plane)
# Get mini batch
mb_boxes_4c = tf.boolean_mask(proposal_boxes_4c, mb_mask)
mb_offsets_gt = box_4c_encoder.tf_box_4c_to_offsets(
mb_boxes_4c, mb_boxes_4c_gt)
if self._box_rep == 'box_4ca':
# Gather corresponding ground truth orientation for each
# mb sample
mb_orientations_gt = tf.gather(orientations_gt,
mb_gt_indices)
else:
raise NotImplementedError(
'Anchor encoding not implemented for', self._box_rep)
######################################################
# Final Predictions
######################################################
# Get orientations from angle vectors
if all_angle_vectors is not None:
with tf.variable_scope('avod_orientation'):
all_orientations = \
orientation_encoder.tf_angle_vector_to_orientation(
all_angle_vectors)
# Apply offsets to regress proposals
with tf.variable_scope('avod_regression'):
if self._box_rep == 'box_3d':
prediction_anchors = \
anchor_encoder.offset_to_anchor(top_anchors,
all_offsets)
elif self._box_rep in ['box_8c', 'box_8co']:
# Reshape the 24-dim regressed offsets to (N x 3 x 8)
reshaped_offsets = tf.reshape(all_offsets,
[-1, 3, 8])
# Given the offsets, get the boxes_8c
prediction_boxes_8c = \
box_8c_encoder.tf_offsets_to_box_8c(proposal_boxes_8c,
reshaped_offsets)
# Convert corners back to box3D
prediction_boxes_3d = \
box_8c_encoder.box_8c_to_box_3d(prediction_boxes_8c)
# Convert the box_3d to anchor format for nms
prediction_anchors = \
box_3d_encoder.tf_box_3d_to_anchor(prediction_boxes_3d)
elif self._box_rep in ['box_4c', 'box_4ca']:
# Convert predictions box_4c -> box_3d
prediction_boxes_4c = \
box_4c_encoder.tf_offsets_to_box_4c(proposal_boxes_4c,
all_offsets)
prediction_boxes_3d = \
box_4c_encoder.tf_box_4c_to_box_3d(prediction_boxes_4c,
ground_plane)
# Convert to anchor format for nms
prediction_anchors = \
box_3d_encoder.tf_box_3d_to_anchor(prediction_boxes_3d)
else:
raise NotImplementedError('Regression not implemented for',
self._box_rep)
# Apply Non-oriented NMS in BEV
with tf.variable_scope('avod_nms'):
bev_extents = self.dataset.kitti_utils.bev_extents
with tf.variable_scope('bev_projection'):
# Project predictions into BEV
avod_bev_boxes, _ = anchor_projector.project_to_bev(
prediction_anchors, bev_extents)
avod_bev_boxes_tf_order = \
anchor_projector.reorder_projected_boxes(
avod_bev_boxes)
# Get top score from second column onward
all_top_scores = tf.reduce_max(all_cls_logits[:, 1:], axis=1)
# Apply NMS in BEV
nms_indices = tf.image.non_max_suppression(
avod_bev_boxes_tf_order,
all_top_scores,
max_output_size=self._nms_size,
iou_threshold=self._nms_iou_threshold)
# Gather predictions from NMS indices
top_classification_logits = tf.gather(all_cls_logits,
nms_indices)
top_classification_softmax = tf.gather(all_cls_softmax,
nms_indices)
top_prediction_anchors = tf.gather(prediction_anchors,
nms_indices)
if self._box_rep == 'box_3d':
top_orientations = tf.gather(
all_orientations, nms_indices)
elif self._box_rep in ['box_8c', 'box_8co']:
top_prediction_boxes_3d = tf.gather(
prediction_boxes_3d, nms_indices)
top_prediction_boxes_8c = tf.gather(
prediction_boxes_8c, nms_indices)
elif self._box_rep == 'box_4c':
top_prediction_boxes_3d = tf.gather(
prediction_boxes_3d, nms_indices)
top_prediction_boxes_4c = tf.gather(
prediction_boxes_4c, nms_indices)
elif self._box_rep == 'box_4ca':
top_prediction_boxes_3d = tf.gather(
prediction_boxes_3d, nms_indices)
top_prediction_boxes_4c = tf.gather(
prediction_boxes_4c, nms_indices)
top_orientations = tf.gather(
all_orientations, nms_indices)
else:
raise NotImplementedError('NMS gather not implemented for',
self._box_rep)
if self._train_val_test in ['train', 'val']:
# Additional entries are added to the shared prediction_dict
# Mini batch predictions
prediction_dict[self.PRED_MB_CLASSIFICATION_LOGITS] = \
mb_classifications_logits
prediction_dict[self.PRED_MB_CLASSIFICATION_SOFTMAX] = \
mb_classifications_softmax
prediction_dict[self.PRED_MB_OFFSETS] = mb_offsets
# Mini batch ground truth
prediction_dict[self.PRED_MB_CLASSIFICATIONS_GT] = \
mb_classification_gt
prediction_dict[self.PRED_MB_OFFSETS_GT] = mb_offsets_gt
# Top NMS predictions
prediction_dict[self.PRED_TOP_CLASSIFICATION_LOGITS] = \
top_classification_logits
prediction_dict[self.PRED_TOP_CLASSIFICATION_SOFTMAX] = \
top_classification_softmax
prediction_dict[self.PRED_TOP_PREDICTION_ANCHORS] = \
top_prediction_anchors
# Mini batch predictions (for debugging)
prediction_dict[self.PRED_MB_MASK] = mb_mask
# prediction_dict[self.PRED_MB_POS_MASK] = mb_pos_mask
prediction_dict[self.PRED_MB_CLASS_INDICES_GT] = \
mb_class_label_indices
# All predictions (for debugging)
prediction_dict[self.PRED_ALL_CLASSIFICATIONS] = \
all_cls_logits
prediction_dict[self.PRED_ALL_OFFSETS] = all_offsets
# Path drop masks (for debugging)
prediction_dict['bev_mask'] = bev_mask
prediction_dict['img_mask'] = img_mask
else:
# self._train_val_test == 'test'
prediction_dict[self.PRED_TOP_CLASSIFICATION_SOFTMAX] = \
top_classification_softmax
prediction_dict[self.PRED_TOP_PREDICTION_ANCHORS] = \
top_prediction_anchors
if self._box_rep == 'box_3d':
prediction_dict[self.PRED_MB_ANCHORS_GT] = mb_anchors_gt
prediction_dict[self.PRED_MB_ORIENTATIONS_GT] = mb_orientations_gt
prediction_dict[self.PRED_MB_ANGLE_VECTORS] = mb_angle_vectors
prediction_dict[self.PRED_TOP_ORIENTATIONS] = top_orientations
# For debugging
prediction_dict[self.PRED_ALL_ANGLE_VECTORS] = all_angle_vectors
elif self._box_rep in ['box_8c', 'box_8co']:
prediction_dict[self.PRED_TOP_PREDICTION_BOXES_3D] = \
top_prediction_boxes_3d
# Store the corners before converting for visualization purposes
prediction_dict[self.PRED_TOP_BOXES_8C] = top_prediction_boxes_8c
elif self._box_rep == 'box_4c':
prediction_dict[self.PRED_TOP_PREDICTION_BOXES_3D] = \
top_prediction_boxes_3d
prediction_dict[self.PRED_TOP_BOXES_4C] = top_prediction_boxes_4c
elif self._box_rep == 'box_4ca':
if self._train_val_test in ['train', 'val']:
prediction_dict[self.PRED_MB_ORIENTATIONS_GT] = \
mb_orientations_gt
prediction_dict[self.PRED_MB_ANGLE_VECTORS] = mb_angle_vectors
prediction_dict[self.PRED_TOP_PREDICTION_BOXES_3D] = \
top_prediction_boxes_3d
prediction_dict[self.PRED_TOP_BOXES_4C] = top_prediction_boxes_4c
prediction_dict[self.PRED_TOP_ORIENTATIONS] = top_orientations
else:
raise NotImplementedError('Prediction dict not implemented for',
self._box_rep)
# prediction_dict[self.PRED_MAX_IOUS] = max_ious
# prediction_dict[self.PRED_ALL_IOUS] = all_ious
return prediction_dict
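As a rough illustration of the box_4c representation mentioned above (see the paper and box_4c_encoder for the exact corner ordering, sign conventions and offset definition), a 3D box [x, y, z, l, w, h, ry] can be encoded as its four BEV corner points plus two heights relative to the ground plane:
import numpy as np

def box_3d_to_box_4c_sketch(box_3d, ground_plane):
    # Illustrative only: encode [x, y, z, l, w, h, ry] as
    # [x1..x4, z1..z4, h1, h2], with h1/h2 measured from the ground plane.
    x, y, z, l, w, h, ry = box_3d
    a, b, c, d = ground_plane
    # Four corners of the box footprint in the x-z (BEV) plane
    corners = np.array([[ l / 2.0,  w / 2.0],
                        [ l / 2.0, -w / 2.0],
                        [-l / 2.0, -w / 2.0],
                        [-l / 2.0,  w / 2.0]])
    rot = np.array([[np.cos(ry), -np.sin(ry)],
                    [np.sin(ry),  np.cos(ry)]])   # rotation about the y axis
    corners = corners @ rot + np.array([x, z])
    # Ground height below the box center, then the two face heights above it
    ground_y = -(a * x + c * z + d) / b
    h1 = ground_y - y      # bottom face height above the ground plane
    h2 = h1 + h            # top face height above the ground plane
    return np.concatenate([corners[:, 0], corners[:, 1], [h1, h2]])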