主要是回忆SSD源码解析
在稍微熟悉了流程之后重新梳理SSD
特征提取
使用inception_v2进特征提取
inception_v2的特征层shape
onv2d_1a_7x7 : (3, 112, 112, 256)
MaxPool_2a_3x3 : (3, 56, 56, 256)
Conv2d_2b_1x1 : (3, 56, 56, 256)
Conv2d_2c_3x3 : (3, 56, 56, 256)
MaxPool_3a_3x3 : (3, 28, 28, 256)
Mixed_3b : (3, 28, 28, 1024)
Mixed_3c : (3, 28, 28, 1024)
Mixed_4a : (3, 14, 14, 1536)
Mixed_4b : (3, 14, 14, 1024)
Mixed_4c : (3, 14, 14, 1024)
Mixed_4d : (3, 14, 14, 1024)
Mixed_4e : (3, 14, 14, 1024)
Mixed_5a : (3, 7, 7, 1536)
Mixed_5b : (3, 7, 7, 1184)
Mixed_5c : (3, 7, 7, 1184)
在SSD中使用的是final_endpoint='Mixed_5c'
如下:
def extract_features(self, preprocessed_inputs):
"""Extract features from preprocessed inputs.
Args:
preprocessed_inputs: a [batch, height, width, channels] float tensor
representing a batch of images.
Returns:
feature_maps: a list of tensors where the ith tensor has shape
[batch, height_i, width_i, depth_i]
"""
preprocessed_inputs = shape_utils.check_min_image_dim(
33, preprocessed_inputs)
feature_map_layout = {
'from_layer': ['Mixed_4c', 'Mixed_5c', '', '', '', ''],
'layer_depth': [-1, -1, 512, 256, 256, 128],
'use_explicit_padding': self._use_explicit_padding,
'use_depthwise': self._use_depthwise,
}
with slim.arg_scope(self._conv_hyperparams_fn()):
with tf.variable_scope('InceptionV2',
reuse=self._reuse_weights) as scope:
_, image_features = inception_v2.inception_v2_base(
ops.pad_to_multiple(preprocessed_inputs, self._pad_to_multiple),
final_endpoint='Mixed_5c',
min_depth=self._min_depth,
depth_multiplier=self._depth_multiplier,
scope=scope)
feature_maps = feature_map_generators.multi_resolution_feature_maps(
feature_map_layout=feature_map_layout,
depth_multiplier=self._depth_multiplier,
min_depth=self._min_depth,
insert_1x1_conv=True,
image_features=image_features)
return feature_maps.values()
上面可以看出SSD使用了元inception_v2的'Mixed_4c', 'Mixed_5c'。后面还跟了几个空字符串。
在feature_map_generators_test.py中可以看到,feature_map会根据最后一层的特征图大小自动添加4层,如下:
这里是使用Mixed_3c,Mixed_4c,Mixed_5c,再添加最后三层。
expected_feature_map_shapes = {
'Mixed_3c': (4, 28, 28, 256),
'Mixed_4c': (4, 14, 14, 576),
'Mixed_5c': (4, 7, 7, 1024),
'Mixed_5c_2_Conv2d_3_3x3_s2_512': (4, 4, 4, 512),
'Mixed_5c_2_Conv2d_4_3x3_s2_256': (4, 2, 2, 256),
'Mixed_5c_2_Conv2d_5_3x3_s2_256': (4, 1, 1, 256)}
所以extract_features会返回6个特征层。
拿到featur_map之后需要在feature_map上进行box location 和classes score。
在config文件中,提到使用convolutional_box_predictor 进行预测
box_predictor {
convolutional_box_predictor {
使用的·还是box_predictot.py中
class ConvolutionalBoxPredictor(BoxPredictor):
def _predict(self, image_features, num_predictions_per_location):
# Add a slot for the background class.
num_class_slots = self.num_classes + 1
net = image_features
with slim.arg_scope(self._conv_hyperparams), \
slim.arg_scope([slim.dropout], is_training=self._is_training):
# Add additional conv layers before the class predictor.
features_depth = static_shape.get_depth(image_features.get_shape())
depth = max(min(features_depth, self._max_depth), self._min_depth)
tf.logging.info('depth of additional conv before box predictor: {}'.
format(depth))
if depth > 0 and self._num_layers_before_predictor > 0:
for i in range(self._num_layers_before_predictor):
net = slim.conv2d(
net, depth, [1, 1], scope='Conv2d_%d_1x1_%d' % (i, depth))
with slim.arg_scope([slim.conv2d], activation_fn=None,
normalizer_fn=None, normalizer_params=None):
box_encodings = slim.conv2d(
net, num_predictions_per_location * self._box_code_size,
[self._kernel_size, self._kernel_size],
scope='BoxEncodingPredictor')
if self._use_dropout:
net = slim.dropout(net, keep_prob=self._dropout_keep_prob)
class_predictions_with_background = slim.conv2d(
net, num_predictions_per_location * num_class_slots,
[self._kernel_size, self._kernel_size], scope='ClassPredictor',
biases_initializer=tf.constant_initializer(
self._class_prediction_bias_init))
if self._apply_sigmoid_to_scores:
class_predictions_with_background = tf.sigmoid(
class_predictions_with_background)
combined_feature_map_shape = shape_utils.combined_static_and_dynamic_shape(
image_features)
box_encodings = tf.reshape(
box_encodings, tf.stack([combined_feature_map_shape[0],
combined_feature_map_shape[1] *
combined_feature_map_shape[2] *
num_predictions_per_location,
1, self._box_code_size]))
class_predictions_with_background = tf.reshape(
class_predictions_with_background,
tf.stack([combined_feature_map_shape[0],
combined_feature_map_shape[1] *
combined_feature_map_shape[2] *
num_predictions_per_location,
num_class_slots]))
return {BOX_ENCODINGS: box_encodings,
CLASS_PREDICTIONS_WITH_BACKGROUND:
class_predictions_with_background}
这里代码比较乱吧,从后往前看,输出box_encodings和class_predictions_with_background。
如果需要改extract_features而进行模型创新的话可以改extract_features函数就行了。
其实SSD比较烦的是它的检测模型的postprocess
def postprocess(self, prediction_dict, true_image_shapes):
# 判断数据合法性
if ('box_encodings' not in prediction_dict or
'class_predictions_with_background' not in prediction_dict):
raise ValueError('prediction_dict does not contain expected entries.')
with tf.name_scope('Postprocessor'):
# 获取预测结果
preprocessed_images = prediction_dict['preprocessed_inputs']
box_encodings = prediction_dict['box_encodings']
class_predictions = prediction_dict['class_predictions_with_background']
# 转换bbox信息
detection_boxes, detection_keypoints = self._batch_decode(box_encodings)
detection_boxes = tf.expand_dims(detection_boxes, axis=2)
# 将logits转换为predictions
detection_scores_with_background = self._score_conversion_fn(
class_predictions)
detection_scores = tf.slice(detection_scores_with_background, [0, 0, 1],
[-1, -1, -1])
additional_fields = None
if detection_keypoints is not None:
additional_fields = {
fields.BoxListFields.keypoints: detection_keypoints}
# 通过nms算法筛选bbox
(nmsed_boxes, nmsed_scores, nmsed_classes, _, nmsed_additional_fields,
num_detections) = self._non_max_suppression_fn(
detection_boxes,
detection_scores,
clip_window=self._compute_clip_window(
preprocessed_images, true_image_shapes),
additional_fields=additional_fields)
# 封装返回结果
detection_dict = {
fields.DetectionResultFields.detection_boxes: nmsed_boxes,
fields.DetectionResultFields.detection_scores: nmsed_scores,
fields.DetectionResultFields.detection_classes: nmsed_classes,
fields.DetectionResultFields.num_detections:
tf.to_float(num_detections)
}
if (nmsed_additional_fields is not None and
fields.BoxListFields.keypoints in nmsed_additional_fields):
detection_dict[fields.DetectionResultFields.detection_keypoints] = (
nmsed_additional_fields[fields.BoxListFields.keypoints])
return detection_dict
loss在下面
def loss(self, prediction_dict, true_image_shapes, scope=None):
with tf.name_scope(scope, 'Loss', prediction_dict.values()):
# keypoints 相关操作
keypoints = None
if self.groundtruth_has_field(fields.BoxListFields.keypoints):
keypoints = self.groundtruth_lists(fields.BoxListFields.keypoints)
# 获取预测 targets(用于后续计算损失函数)
weights = None
if self.groundtruth_has_field(fields.BoxListFields.weights):
weights = self.groundtruth_lists(fields.BoxListFields.weights)
(batch_cls_targets, batch_cls_weights, batch_reg_targets,
batch_reg_weights, match_list) = self._assign_targets(
self.groundtruth_lists(fields.BoxListFields.boxes),
self.groundtruth_lists(fields.BoxListFields.classes),
keypoints, weights)
if self._add_summaries:
self._summarize_target_assignment(
self.groundtruth_lists(fields.BoxListFields.boxes), match_list)
# 二次筛选样本
# 如需要设置 正例和反例 的比例,则在这一步实现
if self._random_example_sampler:
batch_sampled_indicator = tf.to_float(
shape_utils.static_or_dynamic_map_fn(
self._minibatch_subsample_fn,
[batch_cls_targets, batch_cls_weights],
dtype=tf.bool,
parallel_iterations=self._parallel_iterations,
back_prop=True))
batch_reg_weights = tf.multiply(batch_sampled_indicator,
batch_reg_weights)
batch_cls_weights = tf.multiply(batch_sampled_indicator,
batch_cls_weights)
# 分别计算位置误差与分类误差(通过`Loss`子类对象)
location_losses = self._localization_loss(
prediction_dict['box_encodings'],
batch_reg_targets,
ignore_nan_targets=True,
weights=batch_reg_weights)
cls_losses = ops.reduce_sum_trailing_dimensions(
self._classification_loss(
prediction_dict['class_predictions_with_background'],
batch_cls_targets,
weights=batch_cls_weights),
ndims=2)
# hard example 相关
if self._hard_example_miner:
(localization_loss, classification_loss) = self._apply_hard_mining(
location_losses, cls_losses, prediction_dict, match_list)
if self._add_summaries:
self._hard_example_miner.summarize()
else:
if self._add_summaries:
class_ids = tf.argmax(batch_cls_targets, axis=2)
flattened_class_ids = tf.reshape(class_ids, [-1])
flattened_classification_losses = tf.reshape(cls_losses, [-1])
self._summarize_anchor_classification_loss(
flattened_class_ids, flattened_classification_losses)
localization_loss = tf.reduce_sum(location_losses)
classification_loss = tf.reduce_sum(cls_losses)
# Optionally normalize by number of positive matches
normalizer = tf.constant(1.0, dtype=tf.float32)
if self._normalize_loss_by_num_matches:
normalizer = tf.maximum(tf.to_float(tf.reduce_sum(batch_reg_weights)),
1.0)
localization_loss_normalizer = normalizer
if self._normalize_loc_loss_by_codesize:
localization_loss_normalizer *= self._box_coder.code_size
localization_loss = tf.multiply((self._localization_loss_weight /
localization_loss_normalizer),
localization_loss,
name='localization_loss')
classification_loss = tf.multiply((self._classification_loss_weight /
normalizer), classification_loss,
name='classification_loss')
# 封装返回结果
loss_dict = {
str(localization_loss.op.name): localization_loss,
str(classification_loss.op.name): classification_loss
}
return loss_dict
参考:
TensorFlow Object Detection API 源码(2) 组件介绍