SSD Source Code Walkthrough

This continues from the SSD source-code test above.

Download the source:

GitHub link

If you want to get up to speed with the SSD source quickly, read a walkthrough of the simplified SSD code instead. I learn best going from easy to hard, so I first read a simple version of the code and then a more fully featured one, comparing the two as I went. The two codebases are broadly similar; note that the simplified version needs some changes before it can be trained.

demo_ssd.py

Let's start with the demo code, which is essentially the code from notebooks/ssd_tests.ipynb:

# demo_ssd.py
# coding=utf-8

import os
import math
import random
 
import numpy as np
import tensorflow as tf
import cv2
 
slim = tf.contrib.slim
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import sys
sys.path.append('../')
from nets import ssd_vgg_300, ssd_common, np_methods
from preprocessing import ssd_vgg_preprocessing
from notebooks import visualization
# TensorFlow session: grow memory when needed. TF, DO NOT USE ALL MY GPU MEMORY!!!
gpu_options = tf.GPUOptions(allow_growth=True)
config = tf.ConfigProto(log_device_placement=False, gpu_options=gpu_options)
isess = tf.InteractiveSession(config=config)
# Input placeholder.
net_shape = (300, 300)
data_format = 'NHWC'
img_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
# Evaluation pre-processing: resize to SSD net shape.
image_pre, labels_pre, bboxes_pre, bbox_img = ssd_vgg_preprocessing.preprocess_for_eval(
    img_input, None, None, net_shape, data_format, resize=ssd_vgg_preprocessing.Resize.WARP_RESIZE)
image_4d = tf.expand_dims(image_pre, 0)
 
# Define the SSD model.
reuse = True if 'ssd_net' in locals() else None
ssd_net = ssd_vgg_300.SSDNet()
with slim.arg_scope(ssd_net.arg_scope(data_format=data_format)):
    predictions, localisations, _, _ = ssd_net.net(image_4d, is_training=False, reuse=reuse)
 
# Restore SSD model.
ckpt_filename = '../checkpoints/ssd_300_vgg.ckpt'
# ckpt_filename = '../checkpoints/VGG_VOC0712_SSD_300x300_ft_iter_120000.ckpt'
isess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(isess, ckpt_filename)
 
# SSD default anchor boxes.
ssd_anchors = ssd_net.anchors(net_shape)
 
 
# Main image processing routine.
def process_image(img, select_threshold=0.5, nms_threshold=.45, net_shape=(300, 300)):
    # Run SSD network.
    rimg, rpredictions, rlocalisations, rbbox_img = isess.run([image_4d, predictions, localisations, bbox_img],
                                                              feed_dict={img_input: img})
 
    # Get classes and bboxes from the net outputs.
    rclasses, rscores, rbboxes = np_methods.ssd_bboxes_select(
        rpredictions, rlocalisations, ssd_anchors,
        select_threshold=select_threshold, img_shape=net_shape, num_classes=21, decode=True)
 
    rbboxes = np_methods.bboxes_clip(rbbox_img, rbboxes)
    rclasses, rscores, rbboxes = np_methods.bboxes_sort(rclasses, rscores, rbboxes, top_k=400)
    rclasses, rscores, rbboxes = np_methods.bboxes_nms(rclasses, rscores, rbboxes, nms_threshold=nms_threshold)
    # Resize bboxes to original image shape. Note: useless for Resize.WARP!
    rbboxes = np_methods.bboxes_resize(rbbox_img, rbboxes)
    return rclasses, rscores, rbboxes
# Test on some demo image and visualize output.
# Folder of test images
path = '../demo/'
image_names = sorted(os.listdir(path))
# Index of the image in the folder; -1 means the last one
img = mpimg.imread(path + image_names[-1])
rclasses, rscores, rbboxes =  process_image(img)
 
# visualization.bboxes_draw_on_img(img, rclasses, rscores, rbboxes, visualization.colors_plasma)
visualization.plt_bboxes(img, rclasses, rscores, rbboxes)

Here the input size is set to (300, 300) and the data format to 'NHWC' (N x H x W x C). The image that is read in is first passed through preprocess_for_eval, which resizes it to net_shape. This leaves an open question: what is the returned bboxes_pre = [] for? And why is bbox_img = [0, 0, 1, 1]? The answers come later; for now, keep following the flow:

net_shape = (300, 300)
data_format = 'NHWC'
img_input = tf.placeholder(tf.uint8, shape=(None, None, 3))
# Evaluation pre-processing: resize to SSD net shape.
image_pre, labels_pre, bboxes_pre, bbox_img = ssd_vgg_preprocessing.preprocess_for_eval(
    img_input, None, None, net_shape, data_format, resize=ssd_vgg_preprocessing.Resize.WARP_RESIZE)
image_4d = tf.expand_dims(image_pre, 0)

Next, build the SSD model, which produces the prediction tensors predictions and localisations.
Note: predictions and localisations are the raw per-anchor network outputs. Boxes are only kept later, once their class score exceeds a threshold; no NMS or other deduplication has happened at this point.
The outputs are lists with one tensor per feature layer, of shape localisations.shape = [None, h, w, n_anchors, 4] and
predictions.shape = [None, h, w, n_anchors, num_classes], where None is the batch size. The sketch below makes this concrete.
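To check those shapes, a quick probe along these lines (a sketch, run in the same session as the demo code above; the exact static shapes assume the default 300x300 input and batch size 1) should show:

# Hypothetical shape check: predictions and localisations are lists with one
# tensor per feature layer (block4, block7, block8, block9, block10, block11).
for p, l in zip(predictions, localisations):
    print(p.shape, l.shape)
# (1, 38, 38, 4, 21) (1, 38, 38, 4, 4)
# (1, 19, 19, 6, 21) (1, 19, 19, 6, 4)
# (1, 10, 10, 6, 21) (1, 10, 10, 6, 4)
# (1, 5, 5, 6, 21)   (1, 5, 5, 6, 4)
# (1, 3, 3, 4, 21)   (1, 3, 3, 4, 4)
# (1, 1, 1, 4, 21)   (1, 1, 1, 4, 4)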

ssd_net = ssd_vgg_300.SSDNet()
with slim.arg_scope(ssd_net.arg_scope(data_format=data_format)):
    predictions, localisations, _, _ = ssd_net.net(image_4d, is_training=False, reuse=reuse)
 

Next is the usual routine: restore the parameters from the ckpt file.

# Restore SSD model.
ckpt_filename = '../checkpoints/ssd_300_vgg.ckpt'
# ckpt_filename = '../checkpoints/VGG_VOC0712_SSD_300x300_ft_iter_120000.ckpt'
isess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(isess, ckpt_filename)
 

Now comes the important part. This is where SSD differs from Faster R-CNN and YOLO: the way the default boxes are set up. ssd_anchors is built as follows:

# SSD default boxes.
ssd_anchors = ssd_net.anchors(net_shape)

ssd_anchors is computed by anchors() with img_shape = (300, 300).
It returns all the default anchor boxes on SSD's feature maps (block4, block7, block8, block9, block10, block11); SSD-VGG300 has 8732 of them in total, as the quick check below confirms.
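A quick way to verify the 8732 figure from the configuration values quoted later in this post:

# Anchors per cell are 4 or 6 depending on each layer's anchor_ratios (see below).
feat_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
boxes_per_cell = [4, 6, 6, 6, 4, 4]
total = sum(h * w * n for (h, w), n in zip(feat_shapes, boxes_per_cell))
print(total)  # 8732 = 5776 + 2166 + 600 + 150 + 36 + 4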

Finally, read an image and feed it through the SSD network for detection.

image_names = sorted(os.listdir(path))
# Index of the image in the folder; -1 means the last one
img = mpimg.imread(path + image_names[-1])
rclasses, rscores, rbboxes =  process_image(img)

So what does process_image actually do?

def process_image(img, select_threshold=0.5, nms_threshold=.45, net_shape=(300, 300)):
    # Run SSD network.
    rimg, rpredictions, rlocalisations, rbbox_img = isess.run([image_4d, predictions, localisations, bbox_img],
                                                              feed_dict={img_input: img})
 
    # Get classes and bboxes from the net outputs.
    rclasses, rscores, rbboxes = np_methods.ssd_bboxes_select(
        rpredictions, rlocalisations, ssd_anchors,
        select_threshold=select_threshold, img_shape=net_shape, num_classes=21, decode=True)
 
    rbboxes = np_methods.bboxes_clip(rbbox_img, rbboxes)
    rclasses, rscores, rbboxes = np_methods.bboxes_sort(rclasses, rscores, rbboxes, top_k=400)
    rclasses, rscores, rbboxes = np_methods.bboxes_nms(rclasses, rscores, rbboxes, nms_threshold=nms_threshold)
    # Resize bboxes to original image shape. Note: useless for Resize.WARP!
    rbboxes = np_methods.bboxes_resize(rbbox_img, rbboxes)
    return rclasses, rscores, rbboxes

You can see that rbbox_img evaluates to [0, 0, 1, 1], i.e. the full original image in relative coordinates. bboxes_pre would hold the ground-truth object boxes, but at test time there are no ground-truth boxes, which is why bboxes_pre comes back empty.

Now we can see what np_methods is for: selecting the right boxes and running NMS.
ssd_bboxes_select() also decodes the coordinates, from the regression encoding relative to each anchor's [x, y, w, h] into [ymin, xmin, ymax, xmax]; the details are covered later.

rclasses, rscores, rbboxes = np_methods.ssd_bboxes_select(
        rpredictions, rlocalisations, ssd_anchors,
        select_threshold=select_threshold, img_shape=net_shape, num_classes=21, decode=True)
 

Keep only the top_k highest-scoring boxes:

rclasses, rscores, rbboxes = np_methods.bboxes_sort(rclasses, rscores, rbboxes, top_k=400)

Apply non-maximum suppression, dropping any box that overlaps a higher-scoring box by more than nms_threshold; a minimal sketch of the idea follows the code:

rclasses, rscores, rbboxes = np_methods.bboxes_nms(rclasses, rscores, rbboxes, nms_threshold=nms_threshold)
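For reference, here is a minimal NumPy sketch of greedy NMS. The repo's bboxes_nms is equivalent in spirit (and additionally keeps overlapping boxes when their classes differ), so treat this only as an illustration:

import numpy as np

def greedy_nms(scores, bboxes, nms_threshold=0.45):
    """scores assumed sorted descending (bboxes_sort above guarantees this);
    bboxes are [ymin, xmin, ymax, xmax]."""
    def iou(box, boxes):
        ymin = np.maximum(box[0], boxes[:, 0])
        xmin = np.maximum(box[1], boxes[:, 1])
        ymax = np.minimum(box[2], boxes[:, 2])
        xmax = np.minimum(box[3], boxes[:, 3])
        inter = np.maximum(ymax - ymin, 0.) * np.maximum(xmax - xmin, 0.)
        area_a = (box[2] - box[0]) * (box[3] - box[1])
        area_b = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
        return inter / (area_a + area_b - inter)

    keep = np.ones(scores.size, dtype=bool)
    for i in range(scores.size - 1):
        if keep[i]:
            overlap = iou(bboxes[i], bboxes[i + 1:])
            # suppress lower-scoring boxes that overlap box i too much
            keep[i + 1:] = np.logical_and(keep[i + 1:], overlap <= nms_threshold)
    return keep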

This rescales the boxes back into the original image frame (with WARP_RESIZE and rbbox_img = [0, 0, 1, 1] it changes nothing, as the code comment notes); the coordinates remain fractions of the original image.

rbboxes = np_methods.bboxes_resize(rbbox_img, rbboxes)

Finally, visualize the predictions. The coordinates at this point are fractions of the original image, so multiplying by the image size gives pixel values.

visualization.plt_bboxes(img, rclasses, rscores, rbboxes)
[Figure: detection results visualized on the demo image]
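If pixel coordinates are needed rather than fractions, a conversion along these lines (hypothetical, not part of the repo) is enough:

# rbboxes holds [ymin, xmin, ymax, xmax] as fractions of the original image.
height, width = img.shape[:2]
bboxes_px = rbboxes * np.array([height, width, height, width])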

That wraps up demo_ssd.py.

Now let's dig into the internals.

ssd_vgg_300.py

The structure here is a bit tangled; probably only the author can keep it all straight.

[Figure: overview of the structure of ssd_vgg_300.py]

As usual, we only follow the main thread: the places where SSDNet shows up in demo.py, as below.

# demo.py
# ...
ssd_net = ssd_vgg_300.SSDNet()
with slim.arg_scope(ssd_net.arg_scope(data_format=data_format)):
    predictions, localisations, _, _ = ssd_net.net(image_4d, is_training=False, reuse=reuse)
# ...
ssd_anchors = ssd_net.anchors(net_shape)

SSDNet()

First comes the parameter structure SSD needs, the SSDParams namedtuple:

SSDParams = namedtuple('SSDParameters', ['img_shape',           # input image size
                                         'num_classes',         # number of classes
                                         'no_annotation_label', # label for un-annotated regions
                                         'feat_layers',         # feature layers
                                         'feat_shapes',         # feature layer shapes
                                         'anchor_size_bounds',  # lower/upper bound of anchor size, as fractions of the input image
                                         'anchor_sizes',        # base anchor sizes
                                         'anchor_ratios',       # anchor aspect ratios
                                         'anchor_steps',        # scaling of each feature map relative to the input image
                                         'anchor_offset',       # offset of the anchor center within a cell
                                         'normalizations',      # whether to apply (L2) normalization
                                         'prior_scaling'        # scaling factors used when regressing feature-map anchors to ground-truth boxes: (0.1, 0.1, 0.2, 0.2)
                                         ])
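Since SSDParams is a plain namedtuple, a custom configuration can be derived from the defaults with _replace. A small illustrative example (the value 11 is arbitrary):

# Derive a custom parameter set from the defaults (illustrative only).
params = ssd_vgg_300.SSDNet.default_params._replace(num_classes=11)
ssd_net = ssd_vgg_300.SSDNet(params)   # __init__ accepts any SSDParams instance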

A rough map of SSDNet's overall structure:

[Figure: overall structure of SSDNet]

At a high level, SSDNet is not that complicated. Starting from demo.py, these are the main entry points:

ssd_net = ssd_vgg_300.SSDNet()
with slim.arg_scope(ssd_net.arg_scope(data_format=data_format)):
    predictions, localisations, _, _ = ssd_net.net(image_4d, is_training=False, reuse=reuse)
ssd_anchors = ssd_net.anchors(net_shape)

SSDNet's initializer:

class SSDNet(object):
    """Implementation of the SSD VGG-based 300 network.

    The default features layers with 300x300 image input are:
      conv4 ==> 38 x 38
      conv7 ==> 19 x 19
      conv8 ==> 10 x 10
      conv9 ==> 5 x 5
      conv10 ==> 3 x 3
      conv11 ==> 1 x 1
    The default image size used to train this network is 300x300.
    """
    default_params = SSDParams(             # default parameters
        img_shape=(300, 300),
        num_classes=21,  # 21 object classes, background included
        no_annotation_label=21,
        feat_layers=['block4', 'block7', 'block8', 'block9', 'block10', 'block11'], # feature layer names
        feat_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)],       # feature layer sizes
        anchor_size_bounds=[0.15, 0.90],
        # anchor_size_bounds=[0.20, 0.90],                                        # the paper uses default box sizes of 0.2x300 ~ 0.9x300; this code effectively uses [45, 270]
        anchor_sizes=[(21., 45.),   # base anchor sizes (s_k, s_{k+1}) given directly per feature map, e.g. (21., 45.) for the first; 6 feature maps are used for prediction
                      (45., 99.),   # smaller boxes capture more local detail of the input image; larger ones capture more global context
                      (99., 153.),
                      (153., 207.),
                      (207., 261.),
                      (261., 315.)],
        # anchor_sizes=[(30., 60.),
        #               (60., 111.),
        #               (111., 162.),
        #               (162., 213.),
        #               (213., 264.),
        #               (264., 315.)],
        anchor_ratios=[[2, .5],             # aspect ratios (and hence box counts) per feature-map cell; e.g. block4: 4 default boxes
                       [2, .5, 3, 1./3],    # block7: 6 default boxes (4 from ratios + the default 1:1 + one extra = 6)
                       [2, .5, 3, 1./3],    # block8: 6 default boxes
                       [2, .5, 3, 1./3],    # block9: 6 default boxes
                       [2, .5],             # block10: 4 default boxes
                       [2, .5]],            # block11: 4 default boxes. Note: ratio=1 is implicit, plus one extra box of size sqrt(s_k * s_{k+1}); see the code below
        anchor_steps=[8, 16, 32, 64, 100, 300],   # scale factor mapping each feature map back to the original image
        anchor_offset=0.5,                        # anchor centers sit at the middle of each feature-map cell, hence offset=0.5
        normalizations=[20, -1, -1, -1, -1, -1],  # normalize if > 0, otherwise skip; only block4 is normalized, because this early layer has large activations and needs L2 normalization (per pixel, across the channel dimension) to stay in scale with the later detection layers
        prior_scaling=[0.1, 0.1, 0.2, 0.2]        # scaling (y, x, h, w) between targets and reference boxes on the feature maps, used when decoding
        )

    def __init__(self, params=None):      # initialize the network parameters
        """Init the SSD net with some parameters. Use the default ones
        if none provided.
        """
        if isinstance(params, SSDParams):  # if params are provided, use them; otherwise fall back to the defaults
            self.params = params           # isinstance is a Python builtin: True if arg 1 is an instance of arg 2
        else:
            self.params = SSDNet.default_params

SSDNet's initializer does very little: it just installs the default parameters. A quick rundown:

  • img_shape=(300, 300): input image size
  • num_classes=21: 21 object classes, background included
  • no_annotation_label=21
  • feat_layers=['block4', 'block7', 'block8', 'block9', 'block10', 'block11']: names of the feature layers; ssd_vgg uses 6 feature layers for the final detection
  • feat_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]: sizes of the 6 feature maps
  • anchor_size_bounds=[0.15, 0.90]: minimum and maximum anchor size, as fractions of the input image (see the sketch after this list for how such bounds relate to per-layer sizes)
  • anchor_ratios: aspect ratios (and hence box counts) per feature-map cell
  • anchor_steps=[8, 16, 32, 64, 100, 300]: scale factors mapping the feature maps back to the original image
  • anchor_offset=0.5: anchor centers sit at the middle of each feature-map cell
  • normalizations=[20, -1, -1, -1, -1, -1]: normalize if > 0, otherwise skip; only block4 is normalized, because this early layer has large activations and needs L2 normalization (per pixel, across the channel dimension) to stay in scale with the later detection layers
  • prior_scaling=[0.1, 0.1, 0.2, 0.2]: scaling (y, x, h, w) between targets and reference boxes, used when decoding
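For context, the SSD paper derives per-layer scales from such bounds with s_k = s_min + (s_max - s_min)(k - 1)/(m - 1). This repo hard-codes anchor_sizes instead of computing them, so the following is only a sketch of where ranges like the [45, 270] mentioned in the comments come from:

import numpy as np

# Paper-style scale schedule from anchor_size_bounds (a sketch; the repo does
# not actually compute anchor_sizes this way).
s_min, s_max, m = 0.15, 0.90, 6
scales = s_min + (s_max - s_min) * np.arange(m) / (m - 1)
print(scales * 300)  # -> [ 45.  90. 135. 180. 225. 270.] pixels on a 300x300 input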

Now let's look at SSD's argument scope, since the test code builds the network inside it:

def ssd_arg_scope(weight_decay=0.0005, data_format='NHWC'):  # weight decay = 0.0005, the coefficient of the L2 regularization term
    """Defines the VGG arg scope.

    Args:
      weight_decay: The l2 regularization coefficient.

    Returns:
      An arg_scope.
    """
    with slim.arg_scope([slim.conv2d, slim.fully_connected],
                        activation_fn=tf.nn.relu,
                        weights_regularizer=slim.l2_regularizer(weight_decay),
                        weights_initializer=tf.contrib.layers.xavier_initializer(),
                        biases_initializer=tf.zeros_initializer()):
        with slim.arg_scope([slim.conv2d, slim.max_pool2d],
                            padding='SAME',
                            data_format=data_format):
            with slim.arg_scope([custom_layers.pad2d,
                                 custom_layers.l2_normalization,
                                 custom_layers.channel_to_last],
                                data_format=data_format) as sc:
                return sc

This sets the defaults shared by slim.conv2d and slim.fully_connected (ReLU activation, L2 weight regularization, Xavier initialization, zero bias initialization); slim.conv2d and slim.max_pool2d additionally default to 'SAME' padding and the given data_format; and the custom layers custom_layers.pad2d, custom_layers.l2_normalization and custom_layers.channel_to_last also receive data_format.
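A minimal sketch of how the scope is consumed: any slim.conv2d built inside it inherits those defaults without restating them (demo_conv is a made-up scope name, for illustration only):

# Layers created inside the scope pick up ReLU, the L2 regularizer,
# Xavier init, zero biases and 'SAME' padding automatically.
with slim.arg_scope(ssd_arg_scope(weight_decay=0.0005)):
    demo = slim.conv2d(image_4d, 64, [3, 3], scope='demo_conv')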

Now into SSDNet.net():

def net(self, inputs,                            # define the network model; inputs is the image
            is_training=True,                        # training or not
            update_feat_shapes=True,                 # whether to update the feature layer shapes
            dropout_keep_prob=0.5,                   # dropout keep probability 0.5
            prediction_fn=slim.softmax,              # use softmax for the class predictions
            reuse=None,
            scope='ssd_300_vgg'):                    # network name: ssd_300_vgg (VGG base network, 300x300 training input)
        """SSD network definition.
        """
        r = ssd_net(inputs,                               # r holds the network outputs
                    num_classes=self.params.num_classes,
                    feat_layers=self.params.feat_layers,
                    anchor_sizes=self.params.anchor_sizes,
                    anchor_ratios=self.params.anchor_ratios,
                    normalizations=self.params.normalizations,
                    is_training=is_training,
                    dropout_keep_prob=dropout_keep_prob,
                    prediction_fn=prediction_fn,
                    reuse=reuse,
                    scope=scope)
        # Update feature shapes (try at least!)    # as I understand it, this step lets you feed different feature layers than the exact blocks in the paper
        if update_feat_shapes:                                               # update the feature map shapes?
            shapes = ssd_feat_shapes_from_net(r[0], self.params.feat_shapes)  # from the predictions (r[0]) and the default shapes, produce the updated list of feature map shapes
            self.params = self.params._replace(feat_shapes=shapes)        # replace the stored feature map shapes with the updated ones
        return r

It delegates to ssd_net(), so let's look at that:

def ssd_net(inputs,                                             # define the SSD network structure
            num_classes=SSDNet.default_params.num_classes,      # number of classes
            feat_layers=SSDNet.default_params.feat_layers,      # feature layers
            anchor_sizes=SSDNet.default_params.anchor_sizes,
            anchor_ratios=SSDNet.default_params.anchor_ratios,
            normalizations=SSDNet.default_params.normalizations, # normalization settings
            is_training=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
    """SSD net definition.
    """
    # if data_format == 'NCHW':
    #     inputs = tf.transpose(inputs, perm=(0, 3, 1, 2))

    # End_points collect relevant activations for external use.
    end_points = {}   # collects the output of every block
    with tf.variable_scope(scope, 'ssd_300_vgg', [inputs], reuse=reuse):
        # Original VGG-16 blocks.
        net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')    # first VGG16 conv block: 2 repeated 3x3 convs, 64 channels
        end_points['block1'] = net                                              # store the conv1_2 output in end_points under name='block1'
        net = slim.max_pool2d(net, [2, 2], scope='pool1')
        # Block 2.
        net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')      # 2 repeated 3x3 convs, 128 channels
        end_points['block2'] = net                                              # store the conv2_2 output under name='block2'
        net = slim.max_pool2d(net, [2, 2], scope='pool2')
        # Block 3.
        net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')      # 3 repeated 3x3 convs, 256 channels
        end_points['block3'] = net                                              # store the conv3_3 output under name='block3'
        net = slim.max_pool2d(net, [2, 2], scope='pool3')
        # Block 4.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')       # 3 repeated 3x3 convs, 512 channels
        end_points['block4'] = net                                               # store the conv4_3 output under name='block4'
        net = slim.max_pool2d(net, [2, 2], scope='pool4')
        # Block 5.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')       # 3 repeated 3x3 convs, 512 channels
        end_points['block5'] = net                                               # store the conv5_3 output under name='block5'
        net = slim.max_pool2d(net, [3, 3], stride=1, scope='pool5')

        # Additional SSD blocks.                                                  # VGG's fully connected layers are dropped
        # Block 6: let's dilate the hell out of it!
        net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')              # dilated (atrous) convolution on the last VGG pooling output
        end_points['block6'] = net                                                # store the conv6 output under name='block6'
        net = tf.layers.dropout(net, rate=dropout_keep_prob, training=is_training) # dropout layer (note: tf.layers.dropout's rate is the drop probability, so passing the keep probability here is a quirk of this code)
        # Block 7: 1x1 conv. Because the fuck.
        net = slim.conv2d(net, 1024, [1, 1], scope='conv7')                          # 1x1 conv on the dropout output, 1024 channels, name='block7'
        end_points['block7'] = net
        net = tf.layers.dropout(net, rate=dropout_keep_prob, training=is_training)   # another dropout after the conv

        # Block 8/9/10/11: 1x1 and 3x3 convolutions stride 2 (except lasts).
        end_point = 'block8'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 256, [1, 1], scope='conv1x1')           # 1x1 conv followed by a stride-2 3x3 conv, 512 output channels, name='block8'
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block9'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')                 # 1x1 conv followed by a stride-2 3x3 conv, 256 output channels, name='block9'
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block10'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')               # 1x1 conv followed by a 3x3 conv, 256 output channels, name='block10'
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block11'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')              # 1x1 conv followed by a 3x3 conv, 256 output channels, name='block11'
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
        end_points[end_point] = net

        # Prediction and localisations layers. # prediction and localization heads
        predictions = []
        logits = []
        localisations = []
        for i, layer in enumerate(feat_layers):               # iterate over the feature layers
            with tf.variable_scope(layer + '_box'):           # open a named scope
                p, l = ssd_multibox_layer(end_points[layer],  # multibox prediction on this feature layer: returns per-anchor class scores p and box regressions l
                                          num_classes,        # number of classes
                                          anchor_sizes[i],    # base anchor sizes (shared by all anchors of one feature map)
                                          anchor_ratios[i],   # anchor aspect ratios
                                          normalizations[i])  # per-layer normalization; currently only the first feature map is normalized
            # collect the predictions of every layer
            predictions.append(prediction_fn(p))  # prediction_fn is softmax, giving class probabilities
            logits.append(p)                      # raw (pre-softmax) per-anchor class scores are kept in logits
            localisations.append(l)               # predicted location information

        return predictions, localisations, logits, end_points  # return class predictions, location predictions, raw class scores, and the feature layers
ssd_net.default_image_size = 300

The main body of the SSD network is simple: block1 through block11 are all convolutional, and from block8 onward the custom padding/convolution structure is used.

At the end of the function, ssd_multibox_layer builds the box and class predictions on every collected feature map, predicting at each cell. For a w x h feature map, the box output has shape [h, w, num_anchors * 4] and the class output has shape [h, w, num_anchors * c], with c = 21 here; the sketch below makes this concrete for block4.
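Concretely, for block4 (38x38 map, 4 anchors per cell, 21 classes), a quick check of the head sizes (a sketch):

num_anchors, num_classes = 4, 21
print(num_anchors * 4)            # 16 channels from conv_loc -> reshaped to (1, 38, 38, 4, 4)
print(num_anchors * num_classes)  # 84 channels from conv_cls -> reshaped to (1, 38, 38, 4, 21)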

The return values:

  • predictions: the class predictions after softmax
  • logits: the raw (pre-softmax) class scores for every anchor of every cell
  • localisations: the predicted location information
  • end_points: the output of each feature map

Near the end of the network, ssd_multibox_layer runs the prediction heads over each feature map, as follows:

def ssd_multibox_layer(inputs,                    # input feature layer
                       num_classes,               # number of classes
                       sizes,                     # base anchor sizes
                       ratios=[1],                # default anchor aspect ratio is 1
                       normalization=-1,          # no normalization by default
                       bn_normalization=False):
    """Construct a multibox layer, return a class and localization predictions.
    """
    net = inputs
    if normalization > 0:    # if a positive integer is given, apply L2 normalization
        net = custom_layers.l2_normalization(net, scaling=True)    # normalize along the channel dimension, then multiply by a learned gamma scale
    # Number of anchors.
    num_anchors = len(sizes) + len(ratios)  # anchors per cell for each feature map: [4, 6, 6, 6, 4, 4]

    # Location.     # 4 coordinates per anchor
    num_loc_pred = num_anchors * 4    # output channels needed for the coordinates at each cell = anchors * 4
    loc_pred = slim.conv2d(net, num_loc_pred, [3, 3], activation_fn=None,   # a 3x3 conv over the feature map produces the location output
                           scope='conv_loc')                                # localization head: output is [feat_h, feat_w, anchors-per-cell * 4]
    loc_pred = custom_layers.channel_to_last(loc_pred)
    loc_pred = tf.reshape(loc_pred,                                         # reshape so every anchor's box prediction is separate: [..., feat_h, feat_w, anchors, 4]
                          tensor_shape(loc_pred, 4)[:-1]+[num_anchors, 4])
    # Class prediction.                                                # class head
    num_cls_pred = num_anchors * num_classes                            # output channels needed for the classes at each cell = anchors * num_classes
    cls_pred = slim.conv2d(net, num_cls_pred, [3, 3], activation_fn=None, # a 3x3 conv produces the class output: [feat_h, feat_w, anchors-per-cell * num_classes]
                           scope='conv_cls')
    cls_pred = custom_layers.channel_to_last(cls_pred)
    cls_pred = tf.reshape(cls_pred,
                          tensor_shape(cls_pred, 4)[:-1]+[num_anchors, num_classes]) # reshape to [..., feat_h, feat_w, anchors, num_classes]
    return cls_pred, loc_pred  # return the class and box-location prediction tensors

Here cls_pred is the network's per-anchor class prediction for each cell, and loc_pred is the per-anchor box-coordinate prediction.

Back in net(), notice that the feature map shapes also get refreshed: in case the feat_shapes configured in SSDParams do not match the actual network tensors, feat_shapes is replaced by the shapes inferred from the network.
As follows:

if update_feat_shapes:                                               # update the feature map shapes?
    shapes = ssd_feat_shapes_from_net(r[0], self.params.feat_shapes)  # from the predictions and the default shapes, produce the updated list of feature map shapes
    self.params = self.params._replace(feat_shapes=shapes)            # replace the stored feature map shapes with the updated ones
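For completeness, the gist of ssd_feat_shapes_from_net, reconstructed here as a rough sketch (assumed behavior, not a verbatim copy of the repo), is to read the static shape of each prediction tensor and fall back to the defaults when a dimension is unknown:

# Rough sketch of what ssd_feat_shapes_from_net does:
def feat_shapes_from_net_sketch(predictions, default_shapes=None):
    feat_shapes = []
    for p in predictions:
        shape = p.get_shape().as_list()[1:4]   # static [h, w, anchors] of this layer
        if None in shape:                      # unknown at graph-build time
            return default_shapes
        feat_shapes.append(shape)
    return feat_shapes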

OK!

At this point, this line of the demo has been fully traced:

# demo.py
predictions, localisations, _, _ = ssd_net.net(image_4d, is_training=False, reuse=reuse)

On to this part:

# demo.py
ssd_anchors = ssd_net.anchors(net_shape)

Back to SSDNet once more, this time for anchors():
given the original image size (img_shape = (300, 300)), it returns the position and size of every reference anchor box on every feature layer.

    def anchors(self, img_shape, dtype=np.float32):             # input: original image size; returns position and size info (y, x, h, w) of every reference anchor on every feature layer
        """Compute the default anchor boxes, given an image shape.
        """
        return ssd_anchors_all_layers(img_shape,                # the key function: computes the reference anchor positions and sizes on all feature layers
                                      self.params.feat_shapes,
                                      self.params.anchor_sizes,
                                      self.params.anchor_ratios,
                                      self.params.anchor_steps,
                                      self.params.anchor_offset,
                                      dtype)

Note the input parameters here:

  • self.params.feat_shapes = [(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)]
  • self.params.anchor_sizes = [(21., 45.), (45., 99.), (99., 153.), (153., 207.), (207., 261.), (261., 315.)]
  • self.params.anchor_ratios = [[2, .5], [2, .5, 3, 1./3], [2, .5, 3, 1./3], [2, .5, 3, 1./3], [2, .5], [2, .5]]
  • self.params.anchor_steps = [8, 16, 32, 64, 100, 300]
  • self.params.anchor_offset = 0.5

Jump into ssd_anchors_all_layers():

def ssd_anchors_all_layers(img_shape,                 # compute anchor coordinates on all feature maps; input: original image size
                           layers_shape,              # shape of each feature layer
                           anchor_sizes,              # base anchor sizes per feature map
                           anchor_ratios,             # list of anchor aspect ratios
                           anchor_steps,              # scale of each feature map relative to the original image
                           offset=0.5,                # offset of the anchor center within each feature-map cell
                           dtype=np.float32):
    """Compute anchor boxes for all feature layers.
    """
    layers_anchors = []           # holds the anchor position/size info of all feature maps
    for i, s in enumerate(layers_shape):                 # the 6 feature map shapes; e.g. the 0th is 38x38
        anchor_bboxes = ssd_anchor_one_layer(img_shape, s,     # compute the anchor positions and sizes of one feature map
                                             anchor_sizes[i],    # base anchor sizes of the i-th feature map; e.g. (21., 45.) for the 0th
                                             anchor_ratios[i],   # aspect ratios of the i-th feature map; e.g. [2, .5] for the 0th
                                             anchor_steps[i],    # scale of the i-th feature map relative to the original image; e.g. 8 for the 0th
                                             offset=offset, dtype=dtype)  # offset of the anchor center within each cell
        layers_anchors.append(anchor_bboxes)              # store the per-point anchors (4 or 6) of each of the 6 feature maps
    return layers_anchors

And from there into:

def ssd_anchor_one_layer(img_shape,         # compute the coordinates and sizes of all anchors on one feature map (values are normalized by the image size)
                         feat_shape,
                         sizes,
                         ratios,
                         step,
                         offset=0.5,
                         dtype=np.float32):

    # Compute the position grid: simple way.
    # y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
    # y = (y.astype(dtype) + offset) / feat_shape[0]
    # x = (x.astype(dtype) + offset) / feat_shape[1]
    # Weird SSD-Caffe computation using steps values...    # anchor centers (y, x) normalized to the original image; values lie in (0, 1]
    y, x = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]      # for the first feature map (block4: 38x38): y = [[0,0,...,0],[1,1,...,1],...,[37,37,...,37]] and x = [[0,1,...,37],[0,1,...,37],...,[0,1,...,37]]
    y = (y.astype(dtype) + offset) * step / img_shape[0]   # shift the y coordinate of each cell to the cell center, scale up to the original image by step, then divide by the image size
    x = (x.astype(dtype) + offset) * step / img_shape[1]   # this yields every anchor center (x, y) as a fraction of the original image

    # Expand dims to support easy broadcasting.    # expand the center coordinates by one dimension
    y = np.expand_dims(y, axis=-1)   # for the first feature map, y has shape 38x38x1, and so does x
    x = np.expand_dims(x, axis=-1)

    # Compute relative height and width.
    # Tries to follow the original implementation of SSD for the order.
    num_anchors = len(sizes) + len(ratios)      # anchors per point of this feature map; e.g. 2 + 2 = 4 for the first map (block4: 38x38)
    h = np.zeros((num_anchors, ), dtype=dtype)     # for the first feature map, h and w each have shape (4,)
    w = np.zeros((num_anchors, ), dtype=dtype)
    # Add first anchor boxes with ratio=1.
    h[0] = sizes[0] / img_shape[0]            # first anchor: h[0] = base size / image height; e.g. h[0] = 21/300
    w[0] = sizes[0] / img_shape[1]            # first anchor: w[0] = base size / image width; e.g. w[0] = 21/300
    di = 1  # index offset into h and w
    if len(sizes) > 1:
        h[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[0]    # second anchor: h[1] = sqrt(s_k * s_{k+1}) / image height; e.g. h[1] = sqrt(21*45)/300
        w[1] = math.sqrt(sizes[0] * sizes[1]) / img_shape[1]    # second anchor: w[1] = sqrt(s_k * s_{k+1}) / image width; e.g. w[1] = sqrt(21*45)/300
        di += 1     # di = 2
    for i, r in enumerate(ratios):                            # iterate over the aspect ratios; for the first feature map r takes two values, 2 and 0.5, giving four anchor sizes in total (h[0]..h[3])
        h[i+di] = sizes[0] / img_shape[0] / math.sqrt(r)      # e.g. for the first feature map: h[2] = 21/300/sqrt(2), w[2] = 21/300*sqrt(2)
        w[i+di] = sizes[0] / img_shape[1] * math.sqrt(r)      # e.g. h[3] = 21/300/sqrt(0.5), w[3] = 21/300*sqrt(0.5)
    return y, x, h, w

A note on (y.astype(dtype) + offset) * step. On the feature map, y is treated as a pixel coordinate; for a [38, 38] feature map, y runs over [[0,0,...,0],[1,1,...,1],...,[37,37,...,37]], so the largest coordinate is 37. How do we map the [38, 38] feature map back onto the [300, 300] image?

Multiplying y by step (step = 8 here) maps it onto the 300-pixel image: each feature-map cell corresponds to 8 original pixels. Adding the offset first, (y + offset) * step, shifts each center to the middle of its 8-pixel cell. A small worked example:
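import numpy as np

# Anchor centers for block4 (38x38 feature map, step=8) as fractions of the
# 300-pixel input: the first cells land at 4/300, 12/300, 20/300, ...
y = (np.arange(38) + 0.5) * 8 / 300.
print(y[:3])   # [0.01333333 0.04       0.06666667]
print(y[-1])   # 1.0  (38 cells x 8 px = 304 px slightly overshoots the image)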

There is one more wrinkle: anchor_sizes. With the alternative setting anchor_sizes=[(30., 60.), (60., 111.), (111., 162.), (162., 213.), (213., 264.), (264., 315.)], boxes must be drawn at every point of each feature map. On the [38, 38] feature map the smallest box edge is 30 (measured in original-image pixels), which spans roughly 4 cells (30/8) of the [38, 38] map. This also suggests that SSD cannot reliably detect objects much smaller than its smallest anchor, around 30x30 pixels with these sizes (21x21 with the defaults).

h[0] = sizes[0] / img_shape[0] simply normalizes the box from original-image pixels into [0, 1].

A picture to illustrate:

[Figure: anchor-box layout on the feature map]

All of the box sizes:

sizes:  (21.0, 45.0)
h, w:  [[0.07       0.10246951 0.04949747 0.09899495]
 [0.07       0.10246951 0.09899495 0.04949747]]
sizes:  (45.0, 99.0)
h, w:  [[0.15       0.22248596 0.10606602 0.21213204 0.08660254 0.25980762]
 [0.15       0.22248596 0.21213204 0.10606602 0.25980762 0.08660254]]
sizes:  (99.0, 153.0)
h, w:  [[0.33       0.41024384 0.23334524 0.46669048 0.19052559 0.5715768 ]
 [0.33       0.41024384 0.46669048 0.23334524 0.5715768  0.19052559]]
sizes:  (153.0, 207.0)
h, w:  [[0.51       0.5932116  0.36062446 0.7212489  0.29444864 0.8833459 ]
 [0.51       0.5932116  0.7212489  0.36062446 0.8833459  0.29444864]]
sizes:  (207.0, 261.0)
h, w:  [[0.69       0.7747903  0.48790368 0.97580737]
 [0.69       0.7747903  0.97580737 0.48790368]]
sizes:  (261.0, 315.0)
h, w:  [[0.87      0.9557719 0.6151829 1.2303658]
 [0.87      0.9557719 1.2303658 0.6151829]]

The anchor boxes at each feature-map cell are fixed sizes determined by the SSD configuration; the network output supplies only offsets relative to them.

Now that all the anchor boxes are in hand, back to demo.py.

Next comes the post-processing in np_methods, non-maximum suppression included; interestingly, all of it runs in plain NumPy.

First, the detections are extracted:

rclasses, rscores, rbboxes = np_methods.ssd_bboxes_select(
        rpredictions, rlocalisations, ssd_anchors,
        select_threshold=select_threshold, img_shape=net_shape, num_classes=21, decode=True)
 

Jump to ssd_bboxes_select(), which pulls the boxes out of each feature map's output:

def ssd_bboxes_select(predictions_net,
                      localizations_net,
                      anchors_net,
                      select_threshold=0.5,
                      img_shape=(300, 300),
                      num_classes=21,
                      decode=True):
    """Extract classes, scores and bounding boxes from network output layers.

    Return:
      classes, scores, bboxes: Numpy arrays...
    """
    l_classes = []
    l_scores = []
    l_bboxes = []
    # l_layers = []
    # l_idxes = []
    for i in range(len(predictions_net)):
        classes, scores, bboxes = ssd_bboxes_select_layer(
            predictions_net[i], localizations_net[i], anchors_net[i],
            select_threshold, img_shape, num_classes, decode)
        l_classes.append(classes)
        l_scores.append(scores)
        l_bboxes.append(bboxes)
        # Debug information.
        # l_layers.append(i)
        # l_idxes.append((i, idxes))

    classes = np.concatenate(l_classes, 0)
    scores = np.concatenate(l_scores, 0)
    bboxes = np.concatenate(l_bboxes, 0)
    return classes, scores, bboxes

Before NMS, the per-box values must be extracted from each feature map's output, which is the job of
ssd_bboxes_select_layer():

def ssd_bboxes_select_layer(predictions_layer,
                            localizations_layer,
                            anchors_layer,
                            select_threshold=0.5,
                            img_shape=(300, 300),
                            num_classes=21,
                            decode=True):
    """Extract classes, scores and bounding boxes from features in one layer.

    Return:
      classes, scores, bboxes: Numpy arrays...
    """
    # First decode localizations features if necessary.
    if decode:
        localizations_layer = ssd_bboxes_decode(localizations_layer, anchors_layer)

    # Reshape features to: Batches x N x N_labels | 4.
    p_shape = predictions_layer.shape
    batch_size = p_shape[0] if len(p_shape) == 5 else 1
    predictions_layer = np.reshape(predictions_layer,
                                   (batch_size, -1, p_shape[-1]))
    l_shape = localizations_layer.shape
    localizations_layer = np.reshape(localizations_layer,
                                     (batch_size, -1, l_shape[-1]))

    # Boxes selection: use threshold or score > no-label criteria.
    if select_threshold is None or select_threshold == 0:
        # Class prediction and scores: assign 0. to 0-class
        classes = np.argmax(predictions_layer, axis=2)
        scores = np.amax(predictions_layer, axis=2)
        mask = (classes > 0)
        classes = classes[mask]
        scores = scores[mask]
        bboxes = localizations_layer[mask]
    else:
        sub_predictions = predictions_layer[:, :, 1:]
        idxes = np.where(sub_predictions > select_threshold)
        classes = idxes[-1]+1
        scores = sub_predictions[idxes]
        bboxes = localizations_layer[idxes[:-1]]

    return classes, scores, bboxes
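A tiny NumPy demo of the selection trick in the else branch above, with made-up scores (class 0 is background, which is why the column is dropped and the class indices are shifted back by 1):

import numpy as np

preds = np.array([[[0.9, 0.05, 0.05],     # shape [batch, anchors, classes]
                   [0.1, 0.7,  0.2]]])
sub = preds[:, :, 1:]                     # drop the background column
idxes = np.where(sub > 0.5)               # tuple (batch_idx, anchor_idx, class_idx)
print(idxes[-1] + 1)                      # [1]  -> class labels, shifted back
print(sub[idxes])                         # [0.7] -> their scores
# the localizations would be indexed with idxes[:-1], i.e. (batch_idx, anchor_idx)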

Decoding the box coordinates:

def ssd_bboxes_decode(feat_localizations,
                      anchor_bboxes,
                      prior_scaling=[0.1, 0.1, 0.2, 0.2]):
    """Compute the relative bounding boxes from the layer features and
    reference anchor bounding boxes.

    Return:
      numpy array Nx4: ymin, xmin, ymax, xmax
    """
    # Reshape for easier broadcasting.
    l_shape = feat_localizations.shape
    feat_localizations = np.reshape(feat_localizations,
                                    (-1, l_shape[-2], l_shape[-1]))
    yref, xref, href, wref = anchor_bboxes
    xref = np.reshape(xref, [-1, 1])
    yref = np.reshape(yref, [-1, 1])

    # Compute center, height and width
    cx = feat_localizations[:, :, 0] * wref * prior_scaling[0] + xref
    cy = feat_localizations[:, :, 1] * href * prior_scaling[1] + yref
    w = wref * np.exp(feat_localizations[:, :, 2] * prior_scaling[2])
    h = href * np.exp(feat_localizations[:, :, 3] * prior_scaling[3])
    # bboxes: ymin, xmin, xmax, ymax.
    bboxes = np.zeros_like(feat_localizations)
    bboxes[:, :, 0] = cy - h / 2.
    bboxes[:, :, 1] = cx - w / 2.
    bboxes[:, :, 2] = cy + h / 2.
    bboxes[:, :, 3] = cx + w / 2.
    # Back to original shape.
    bboxes = np.reshape(bboxes, l_shape)
    return bboxes

A note on the input parameters:
feat_localizations: the network's predicted box regressions
anchor_bboxes: the default anchor boxes (y, x, h, w) of the corresponding feature map
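A worked example with made-up numbers: with a zero regression output, the decoded box is exactly the anchor itself.

import numpy as np

yref, xref, href, wref = 0.5, 0.5, 0.07, 0.07   # one block4-style anchor
loc = np.zeros(4)                                # pretend network output
cx = loc[0] * wref * 0.1 + xref                  # 0.5
cy = loc[1] * href * 0.1 + yref                  # 0.5
w = wref * np.exp(loc[2] * 0.2)                  # 0.07
h = href * np.exp(loc[3] * 0.2)                  # 0.07
print([cy - h / 2, cx - w / 2, cy + h / 2, cx + w / 2])
# [0.465, 0.465, 0.535, 0.535] -> ymin, xmin, ymax, xmax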

References:
深刻解读SSD tensorflow及源码详解
SSD关键源码解析
目标检测|SSD原理与实现
SSD-Tensorflow超详细解析【一】:加载模型对图片进行测试
