object_detection API Source Code Reading Notes (13): Slightly Changing the feature_extractor

As a learning record, I'm jotting down my rather mixed feelings. I haven't finished reading through the whole project, and I've already forgotten many of the workflows, but...

An extremely simple modification turned out to work.

I use vgg_a from vgg.py as the feature extractor for the whole model.

The code is as follows:

# Modified vgg_a in slim's nets/vgg.py: a stride argument is added so the
# network can return early at an overall stride of 8 or 16. The unused
# classification arguments are kept so the signature matches the original.
def vgg_a(inputs,
          num_classes=1000,
          is_training=True,
          dropout_keep_prob=0.5,
          spatial_squeeze=True,
          scope='vgg_a',
          fc_conv_padding='VALID',
          global_pool=False,
          stride=8):
  with tf.variable_scope(scope, 'vgg_a', [inputs]) as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    # Collect outputs for conv2d, fully_connected and max_pool2d.
    with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                        outputs_collections=end_points_collection):
      # Note: the dict is created before any layers are built, so only the
      # entries added by hand below actually end up in end_points.
      end_points = slim.utils.convert_collection_to_dict(end_points_collection)
      net = slim.repeat(inputs, 1, slim.conv2d, 64, [3, 3], scope='conv1')
      net = slim.max_pool2d(net, [2, 2], scope='pool1')
      end_points['pool1'] = net
      net = slim.repeat(net, 1, slim.conv2d, 128, [3, 3], scope='conv2')
      net = slim.max_pool2d(net, [2, 2], scope='pool2')
      end_points['pool2'] = net
      net = slim.repeat(net, 2, slim.conv2d, 256, [3, 3], scope='conv3')
      net = slim.max_pool2d(net, [2, 2], scope='pool3')
      end_points['pool3'] = net
      if stride == 8:
        # Overall stride 8: stop after pool3.
        return net, end_points
      net = slim.repeat(net, 2, slim.conv2d, 512, [3, 3], scope='conv4')
      net = slim.max_pool2d(net, [2, 2], scope='pool4')
      end_points['pool4'] = net
      if stride == 16:
        # Overall stride 16: stop after pool4.
        return net, end_points
      net = slim.repeat(net, 2, slim.conv2d, 512, [3, 3], scope='conv5')
      net = slim.max_pool2d(net, [2, 2], scope='pool5')
      end_points['pool5'] = net
      # The original fully-connected (implemented as conv) head is dropped.
      return net, end_points

The return values here are the output of the last layer that was kept and the end_points dictionary.
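To make the contract concrete, here is a minimal sketch of mine (not from the original project) showing how the modified vgg_a can be called, assuming TF 1.x with tf.contrib.slim and the patched nets/vgg.py on the import path:

import tensorflow as tf
from nets import vgg  # the patched slim vgg.py shown above

images = tf.placeholder(tf.float32, [1, 224, 224, 3])
# With stride=16 the function returns right after pool4.
net, end_points = vgg.vgg_a(images, stride=16)
print(net.get_shape().as_list())   # [1, 14, 14, 512]
print(sorted(end_points.keys()))   # ['pool1', 'pool2', 'pool3', 'pool4']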

Modifying the Resnet V1 Faster R-CNN implementation

I didn't dare to touch anything else: following the test file for the Resnet V1 Faster R-CNN implementation, I made only these simple changes and left the rest alone (a test-style shape check follows the listing below).



"""Resnet V1 Faster R-CNN implementation.

See "Deep Residual Learning for Image Recognition" by He et al., 2015.
https://arxiv.org/abs/1512.03385

Note: this implementation assumes that the classification checkpoint used
to finetune this model is trained using the same configuration as that of
the MSRA provided checkpoints
(see https://github.com/KaimingHe/deep-residual-networks), e.g., with
same preprocessing, batch norm scaling, etc.
"""
import tensorflow as tf

from object_detection.meta_architectures import faster_rcnn_meta_arch
from nets import resnet_utils
from nets import resnet_v1
from nets import vgg
slim = tf.contrib.slim


class FasterRCNNResnetV1FeatureExtractor(
    faster_rcnn_meta_arch.FasterRCNNFeatureExtractor):
  """Faster R-CNN Resnet V1 feature extractor implementation."""

  def __init__(self,
               architecture,
               resnet_model,
               is_training,
               first_stage_features_stride,
               batch_norm_trainable=False,
               reuse_weights=None,
               weight_decay=0.0):
    """Constructor.

    Args:
      architecture: Architecture name of the Resnet V1 model.
      resnet_model: Definition of the Resnet V1 model.
      is_training: See base class.
      first_stage_features_stride: See base class.
      batch_norm_trainable: See base class.
      reuse_weights: See base class.
      weight_decay: See base class.

    Raises:
      ValueError: If `first_stage_features_stride` is not 8 or 16.
    """
    if first_stage_features_stride != 8 and first_stage_features_stride != 16:
      raise ValueError('`first_stage_features_stride` must be 8 or 16.')
    self._architecture = architecture
    self._resnet_model = resnet_model
    super(FasterRCNNResnetV1FeatureExtractor, self).__init__(
        is_training, first_stage_features_stride, batch_norm_trainable,
        reuse_weights, weight_decay)

  def preprocess(self, resized_inputs):
    """Faster R-CNN Resnet V1 preprocessing.

    VGG style channel mean subtraction as described here:
    https://gist.github.com/ksimonyan/211839e770f7b538e2d8#file-readme-md

    Args:
      resized_inputs: A [batch, height_in, width_in, channels] float32 tensor
        representing a batch of images with values between 0 and 255.0.

    Returns:
      preprocessed_inputs: A [batch, height_out, width_out, channels] float32
        tensor representing a batch of images.

    """
    channel_means = [123.68, 116.779, 103.939]
    return resized_inputs - [[channel_means]]

  def _extract_proposal_features(self, preprocessed_inputs, scope):
    """Extracts first stage RPN features.

    Args:
      preprocessed_inputs: A [batch, height, width, channels] float32 tensor
        representing a batch of images.
      scope: A scope name.

    Returns:
      rpn_feature_map: A tensor with shape [batch, height, width, depth]
      activations: A dictionary mapping feature extractor tensor names to
        tensors

    Raises:
      InvalidArgumentError: If the spatial size of `preprocessed_inputs`
        (height or width) is less than 33.
      ValueError: If the created network is missing the required activation.
    """
    if len(preprocessed_inputs.get_shape().as_list()) != 4:
      raise ValueError('`preprocessed_inputs` must be 4 dimensional, got a '
                       'tensor of shape %s' % preprocessed_inputs.get_shape())
    shape_assert = tf.Assert(
        tf.logical_and(
            tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
            tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
        ['image size must at least be 33 in both height and width.'])

    with tf.control_dependencies([shape_assert]):
      # Disables batchnorm for fine-tuning with smaller batch sizes.
      # TODO(chensun): Figure out if it is needed when image
      # batch size is bigger.
      with slim.arg_scope(vgg.vgg_arg_scope(weight_decay=self._weight_decay)):
        net, end_point = vgg.vgg_a(
            preprocessed_inputs,
            num_classes=1000,
            is_training=True,
            dropout_keep_prob=0.5,
            spatial_squeeze=True,
            scope='vgg_a',
            fc_conv_padding='VALID',
            global_pool=False,
            stride=self._first_stage_features_stride)

    #   with slim.arg_scope(
    #       resnet_utils.resnet_arg_scope(
    #           batch_norm_epsilon=1e-5,
    #           batch_norm_scale=True,
    #           weight_decay=self._weight_decay)):
    #     with tf.variable_scope(
    #         self._architecture, reuse=self._reuse_weights) as var_scope:
    #       _, activations = self._resnet_model(
    #           preprocessed_inputs,
    #           num_classes=None,
    #           is_training=self._train_batch_norm,
    #           global_pool=False,
    #           output_stride=self._first_stage_features_stride,
    #           spatial_squeeze=False,
    #           scope=var_scope)
    #
    # handle = scope + '/%s/block1' % self._architecture
    return net, end_point

  def _extract_box_classifier_features(self, proposal_feature_maps, scope):
    """Extracts second stage box classifier features.

    Args:
      proposal_feature_maps: A 4-D float tensor with shape
        [batch_size * self.max_num_proposals, crop_height, crop_width, depth]
        representing the feature map cropped to each proposal.
      scope: A scope name (unused).

    Returns:
      proposal_classifier_features: A 4-D float tensor with shape
        [batch_size * self.max_num_proposals, height, width, depth]
        representing box classifier features for each proposal.
    """
    with slim.arg_scope(vgg.vgg_arg_scope(weight_decay=self._weight_decay)):
      proposal_classifier_features = slim.conv2d(
          proposal_feature_maps, num_outputs=2048, kernel_size=3, scope='conv8')
    # with tf.variable_scope(self._architecture, reuse=self._reuse_weights):
    #   with slim.arg_scope(
    #       resnet_utils.resnet_arg_scope(
    #           batch_norm_epsilon=1e-5,
    #           batch_norm_scale=True,
    #           weight_decay=self._weight_decay)):
    #     with slim.arg_scope([slim.batch_norm],
    #                         is_training=self._train_batch_norm):
    #       blocks = [
    #           resnet_utils.Block('block2', resnet_v1.bottleneck, [{
    #               'depth': 2048,
    #               'depth_bottleneck': 512,
    #               'stride': 1
    #           }] * 3)
    #       ]
    #       proposal_classifier_features = resnet_utils.stack_blocks_dense(
    #           proposal_feature_maps, blocks)
    return proposal_classifier_features


class FasterRCNNResnet101FeatureExtractor(FasterRCNNResnetV1FeatureExtractor):
  """Faster R-CNN Resnet 101 feature extractor implementation."""

  def __init__(self,
               is_training,
               first_stage_features_stride,
               batch_norm_trainable=False,
               reuse_weights=None,
               weight_decay=0.0):
    """Constructor.

    Args:
      is_training: See base class.
      first_stage_features_stride: See base class.
      batch_norm_trainable: See base class.
      reuse_weights: See base class.
      weight_decay: See base class.

    Raises:
      ValueError: If `first_stage_features_stride` is not 8 or 16,
        or if `architecture` is not supported.
    """
    super(FasterRCNNResnet101FeatureExtractor, self).__init__(
        'resnet_v1_101', resnet_v1.resnet_v1_101, is_training,
        first_stage_features_stride, batch_norm_trainable,
        reuse_weights, weight_decay)
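In the spirit of the test file mentioned above, a hedged first-stage shape check of mine (assuming the modified file is saved back as object_detection/models/faster_rcnn_resnet_v1_feature_extractor.py):

import tensorflow as tf
from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn

extractor = frcnn.FasterRCNNResnet101FeatureExtractor(
    is_training=False, first_stage_features_stride=16)
images = tf.placeholder(tf.float32, [1, 224, 224, 3])
rpn_feature_map, _ = extractor._extract_proposal_features(images, scope='TestScope')
print(rpn_feature_map.get_shape().as_list())  # [1, 14, 14, 512] at stride 16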

The config file I use is the resnet101 one, so the other extractor classes in this file can be dropped.
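Because the change lives inside the Resnet V1 extractor file itself, the existing 'faster_rcnn_resnet101' entry in model_builder.py already routes to it, so no builder change is needed for this first attempt. A quick check of mine:

from object_detection.builders import model_builder

cls = model_builder.FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP['faster_rcnn_resnet101']
print(cls.__name__)  # FasterRCNNResnet101FeatureExtractor, now VGG-backed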

The modification worked:

(figure)

Model:

(figure)

Loss:

(figure)

pb file:

(figure)

Detection image:

(figure)

This was only a quick hack; in the test file I noticed that, surprisingly, the second stage of the Resnet V1 Faster R-CNN implementation does not change the spatial size of its input, so I made my changes along the lines of that test file.
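For reference, a hedged shape check of mine for that second stage: with initial_crop_size 14 and the 2x2 max pool applied by the meta architecture (as in the standard configs), crops reach _extract_box_classifier_features at 7x7x512, and the single SAME-padded conv above keeps that spatial size while raising the depth to 2048:

import tensorflow as tf
from object_detection.models import faster_rcnn_resnet_v1_feature_extractor as frcnn

extractor = frcnn.FasterRCNNResnet101FeatureExtractor(
    is_training=False, first_stage_features_stride=16)
crops = tf.placeholder(tf.float32, [300, 7, 7, 512])
features = extractor._extract_box_classifier_features(crops, scope='unused')
print(features.get_shape().as_list())  # [300, 7, 7, 2048]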

Update

Today I reworked the changes.
Create faster_rcnn_vgg_feature_extractor.py:

# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Faster R-CNN VGG feature extractor, adapted from the Resnet V1 version."""

import tensorflow as tf

from object_detection.meta_architectures import faster_rcnn_meta_arch
from nets import vgg
slim = tf.contrib.slim


class FasterRCNNVGGFeatureExtractor(
    faster_rcnn_meta_arch.FasterRCNNFeatureExtractor):
  """Faster R-CNN Resnet V1 feature extractor implementation."""
  def __init__(self,
               is_training,
               first_stage_features_stride,
               batch_norm_trainable=False,
               reuse_weights=None,
               weight_decay=0.0):
    print('first_stage_features_stride', first_stage_features_stride)
    if first_stage_features_stride != 8 and first_stage_features_stride != 16:
      raise ValueError('`first_stage_features_stride` must be 8 or 16.')

    super(FasterRCNNVGGFeatureExtractor, self).__init__(
        is_training, first_stage_features_stride, batch_norm_trainable,
        reuse_weights, weight_decay)

  def preprocess(self, resized_inputs):

    channel_means = [123.68, 116.779, 103.939]
    return resized_inputs - [[channel_means]]

  def _extract_proposal_features(self, preprocessed_inputs, scope):

    if len(preprocessed_inputs.get_shape().as_list()) != 4:
      raise ValueError('`preprocessed_inputs` must be 4 dimensional, got a '
                       'tensor of shape %s' % preprocessed_inputs.get_shape())
    shape_assert = tf.Assert(
        tf.logical_and(
            tf.greater_equal(tf.shape(preprocessed_inputs)[1], 33),
            tf.greater_equal(tf.shape(preprocessed_inputs)[2], 33)),
        ['image size must at least be 33 in both height and width.'])

    with tf.control_dependencies([shape_assert]):
      # Disables batchnorm for fine-tuning with smaller batch sizes.
      # TODO(chensun): Figure out if it is needed when image
      # batch size is bigger.
      with slim.arg_scope(vgg.vgg_arg_scope(weight_decay=self._weight_decay)):
        net, end_point = vgg.vgg_a(
            preprocessed_inputs,
            num_classes=1000,
            is_training=True,
            dropout_keep_prob=0.5,
            spatial_squeeze=True,
            scope='vgg_a',
            fc_conv_padding='VALID',
            global_pool=False,
            stride=self._first_stage_features_stride)


    return net, end_point

  def _extract_box_classifier_features(self, proposal_feature_maps, scope):

    with slim.arg_scope(vgg.vgg_arg_scope(weight_decay=self._weight_decay)):
      net = slim.repeat(proposal_feature_maps, 2, slim.conv2d, 2048, [3, 3],
                        scope='conv8')
      net = slim.max_pool2d(net, [2, 2], scope='pool8')
      # Feed the pooled features (not the raw crops) into the final conv stack.
      proposal_classifier_features = slim.repeat(net, 3, slim.conv2d, 64, [3, 3],
                                                 scope='conv9')
    return proposal_classifier_features
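Under the same assumptions (7x7x512 crops after the meta architecture's crop-and-pool step), a sketch of mine of the expected shapes for this updated second stage; unlike the earlier single-conv version, pool8 now halves the spatial size:

import tensorflow as tf
from object_detection.models import faster_rcnn_vgg_feature_extractor as frcnn_vgg

extractor = frcnn_vgg.FasterRCNNVGGFeatureExtractor(
    is_training=False, first_stage_features_stride=16)
crops = tf.placeholder(tf.float32, [300, 7, 7, 512])
features = extractor._extract_box_classifier_features(crops, scope='unused')
print(features.get_shape().as_list())  # [300, 3, 3, 64] after conv8 / pool8 / conv9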

Add faster_rcnn_vgg to model_builder.py:

from object_detection.models import faster_rcnn_vgg_feature_extractor as frcnn_vgg


# Register it in the feature extractor map:
FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP = {
    'faster_rcnn_nas':
    frcnn_nas.FasterRCNNNASFeatureExtractor,
    'faster_rcnn_vgg':
    frcnn_vgg.FasterRCNNVGGFeatureExtractor,
    'faster_rcnn_pnas':
    frcnn_pnas.FasterRCNNPNASFeatureExtractor,
    'faster_rcnn_inception_resnet_v2':
    frcnn_inc_res.FasterRCNNInceptionResnetV2FeatureExtractor,
    'faster_rcnn_inception_v2':
    frcnn_inc_v2.FasterRCNNInceptionV2FeatureExtractor,
    'faster_rcnn_resnet50':
    frcnn_resnet_v1.FasterRCNNResnet50FeatureExtractor,
    'faster_rcnn_resnet101':
    frcnn_resnet_v1.FasterRCNNResnet101FeatureExtractor,
    'faster_rcnn_resnet152':
    frcnn_resnet_v1.FasterRCNNResnet152FeatureExtractor,
}
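With the registration in place, the type string used in the config below should resolve to the new class (a hedged sanity check of mine):

from object_detection.builders import model_builder

cls = model_builder.FASTER_RCNN_FEATURE_EXTRACTOR_CLASS_MAP['faster_rcnn_vgg']
extractor = cls(is_training=True, first_stage_features_stride=16)
print(type(extractor).__name__)  # FasterRCNNVGGFeatureExtractor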

Modify the config file:

model {
  faster_rcnn {
    num_classes: 20
    image_resizer {
      keep_aspect_ratio_resizer {
        min_dimension: 600
        max_dimension: 1024
      }
    }
    feature_extractor {
      type: 'faster_rcnn_vgg'
      first_stage_features_stride: 16
    }
    first_stage_anchor_generator {
      grid_anchor_generator {
        scales: [0.25, 0.5, 1.0, 2.0]
        aspect_ratios: [0.5, 1.0, 2.0]
        height_stride: 16
        width_stride: 16
      }
    }
    first_stage_box_predictor_conv_hyperparams {
      op: CONV
      regularizer {
        l2_regularizer {
          weight: 0.0
        }
      }
      initializer {
        truncated_normal_initializer {
          stddev: 0.01
        }
      }
    }
    first_stage_nms_score_threshold: 0.0
    first_stage_nms_iou_threshold: 0.7
    first_stage_max_proposals: 300
    first_stage_localization_loss_weight: 2.0
    first_stage_objectness_loss_weight: 1.0
    initial_crop_size: 14
    maxpool_kernel_size: 2
    maxpool_stride: 2
    second_stage_box_predictor {
      mask_rcnn_box_predictor {
        use_dropout: false
        dropout_keep_probability: 1.0
        fc_hyperparams {
          op: FC
          regularizer {
            l2_regularizer {
              weight: 0.0
            }
          }
          initializer {
            variance_scaling_initializer {
              factor: 1.0
              uniform: true
              mode: FAN_AVG
            }
          }
        }
      }
    }
    second_stage_post_processing {
      batch_non_max_suppression {
        score_threshold: 0.0
        iou_threshold: 0.6
        max_detections_per_class: 100
        max_total_detections: 300
      }
      score_converter: SOFTMAX
    }
    second_stage_localization_loss_weight: 2.0
    second_stage_classification_loss_weight: 1.0
  }
}

train_config: {
  batch_size: 1
  optimizer {
    momentum_optimizer: {
      learning_rate: {
        manual_step_learning_rate {
          initial_learning_rate: 0.0003
          schedule {
            step: 900000
            learning_rate: .00003
          }
          schedule {
            step: 1200000
            learning_rate: .000003
          }
        }
      }
      momentum_optimizer_value: 0.9
    }
    use_moving_average: false
  }
  gradient_clipping_by_norm: 10.0
  #fine_tune_checkpoint: "faster_rcnn_vgg_coco_11_06_2017/model.ckpt"
  from_detection_checkpoint: false
  data_augmentation_options {
    random_horizontal_flip {
    }
  }
}

train_input_reader: {
  tf_record_input_reader {
    input_path: "data/voc2012_trian.record"
  }
  label_map_path: "data/pascal_label_map.pbtxt"
}

eval_config: {
  num_examples: 100
  # Note: The below line limits the evaluation process to 100 evaluations.
  # Remove the below line to evaluate indefinitely.
  max_evals: 100
  eval_interval_secs: 5
  #metrics_set:"pascal_voc_detection_metrics"
  metrics_set: "coco_detection_metrics"
}

eval_input_reader: {
  tf_record_input_reader {
    input_path: "data/voc2012_val.record"
  }
  label_map_path: "data/pascal_label_map.pbtxt"
  shuffle: false
  num_readers: 1
  num_epochs: 1

}
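A rough back-of-the-envelope check of mine that the strides in the config line up: with first_stage_features_stride 16, the four VALID 2x2 pools reduce a 600x1024 input to a 37x64 feature grid, and height_stride/width_stride 16 in grid_anchor_generator keeps the anchors aligned with those cells, 4 scales x 3 aspect ratios per cell:

h, w = 600, 1024
for _ in range(4):       # pool1..pool4 -> overall stride 16
    h, w = h // 2, w // 2
print(h, w)              # 37 64 feature-map cells
print(h * w * 4 * 3)     # 28416 first-stage anchors in total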

How to tune the parameters is something I'll have to figure out slowly...
