
Single Shot MultiBox Detector (SSD)

tensorflow 代码



  1 # Copyright 2016 Paul Balanca. All Rights Reserved.
  2 #
  3 # Licensed under the Apache License, Version 2.0 (the "License");
  4 # you may not use this file except in compliance with the License.
  5 # You may obtain a copy of the License at
  6 #
  7 #
  8 #
  9 # Unless required by applicable law or agreed to in writing, software
 10 # distributed under the License is distributed on an "AS IS" BASIS,
 11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 # See the License for the specific language governing permissions and
 13 # limitations under the License.
 14 # ==============================================================================
 15 """Definition of 300 VGG-based SSD network.
 17 This model was initially introduced in:
 18 SSD: Single Shot MultiBox Detector
 19 Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
 20 Cheng-Yang Fu, Alexander C. Berg
 23 Two variants of the model are defined: the 300x300 and 512x512 models, the
 24 latter obtaining a slightly better accuracy on Pascal VOC.
 26 Usage:
 27     with slim.arg_scope(ssd_vgg.ssd_vgg()):
 28         outputs, end_points = ssd_vgg.ssd_vgg(inputs)
 30 This network is a port of the original Caffe model. The padding in TF and Caffe
 31 is slightly different, and can lead to a severe accuracy drop (精度严重下降) if not taken care
 32 of in the correct way!
 34 In Caffe, the output sizes of convolution and pooling layers are computed as
 35 follows: h_o = (h_i + 2 * pad_h - kernel_h) / stride_h + 1
 37 Nevertheless(然而), there is a subtle(微妙的) difference between both for stride > 1. In
 38 the case of convolution(在卷积的情况下):
 39     top_size = floor((bottom_size + 2*pad - kernel_size) / stride) + 1
 40 whereas for pooling:
 41     top_size = ceil((bottom_size + 2*pad - kernel_size) / stride) + 1
 42 Hence implicitly allowing some additional padding even if pad = 0 (隐含的允许一些额外的填充). This
 43 behaviour explains why pooling with stride and kernel of size 2 are behaving
 44 the same way in TensorFlow and Caffe.
 46 Nevertheless, this is not the case anymore for other kernel sizes (对于其他kernel,情况就不同了), hence
 47 motivating the use of a special padding layer for controlling these side-effects (鼓励使用特殊的填充层来控制这种副作用).
 49 @@ssd_vgg_300
 50 """
 51 import math
 52 from collections import namedtuple
 54 import numpy as np
 55 import tensorflow as tf
 57 import tf_extended as tfe
 58 from nets import custom_layers
 59 from nets import ssd_common
 61 slim = tf.contrib.slim
 64 # =========================================================================== #
 65 # SSD class definition.
 66 # =========================================================================== #
 67 #collections模块的namedtuple子类不仅可以使用item的index访问item,
 68 # 还可以通过item的name进行访问可以将namedtuple理解为c中的struct结构,
 69 # 其首先将各个item命名,然后对每个item赋予数据
 70 # nametuple(tuple名字,域名)
 71 SSDParams = namedtuple('SSDParameters', ['img_shape', #输入图像大小
 72                                          'num_classes', #类+1(背景)
 73                                          'no_annotation_label', #无标注标签????
 74                                          'feat_layers', #特征层
 75                                          'feat_shapes', #特征层形状
 76                                          'anchor_size_bounds',  #锚点框大小上下边界,相对于原图的比例值
 77                                          'anchor_sizes',    #初始锚点框尺寸
 78                                          'anchor_ratios',   #锚点框长宽比
 79                                          'anchor_steps',    #feature map相对于原图的缩小倍数,后面会解释
 80                                          'anchor_offset',   #锚点框中心的偏移
 81                                          'normalizations',  #是否正则化
 82                                          'prior_scaling'    ##特征图上每个目标与参考框间的尺寸缩放(y,x,h,w)解码时用到
 83                                          ])
 86 class SSDNet(object):
 87     """Implementation of the SSD VGG-based 300 network.
 89     The default features layers with 300x300 image input are:
 90       conv4 ==> 38 x 38
 91       conv7 ==> 19 x 19
 92       conv8 ==> 10 x 10
 93       conv9 ==> 5 x 5
 94       conv10 ==> 3 x 3
 95       conv11 ==> 1 x 1
 96     The default image size used to train this network is 300x300.
 97     """
 98     default_params = SSDParams( #默认参数
 99         img_shape=(300, 300),
100         num_classes=21,     #类数 + 1(背景)
101         no_annotation_label=21, #同上
102         feat_layers=['block4', 'block7', 'block8', 'block9', 'block10', 'block11'], #特征层名字
103         feat_shapes=[(38, 38), (19, 19), (10, 10), (5, 5), (3, 3), (1, 1)], #特征层尺寸
104         anchor_size_bounds=[0.15, 0.90],    #第一层feature map的default box缩放比例Sk,大小为:300x0.15,300x0.9
105         # anchor_size_bounds=[0.20, 0.90],  #论文中是300x0.2,300x0.9
107         #anchor的大小,一共6个比例,下面的是原图根据比例计算后的得到的实际anchor大小
108         #4,6,6,6,4,4(每层feature map的dafault box的个数)
109         #长宽都是有计算公式的,得到Sk后,通过公式得到h,w
110         anchor_sizes=[(21., 45.),   #h,w
111                       (45., 99.),
112                       (99., 153.),
113                       (153., 207.),
114                       (207., 261.),
115                       (261., 315.)],    #越小的anchor box,得到的信息越大,这个是相对于原图的大小,越来越大
116         # anchor_sizes=[(30., 60.),
117         #               (60., 111.),
118         #               (111., 162.),
119         #               (162., 213.),
120         #               (213., 264.),
121         #               (264., 315.)],
123         ##每个特征层上的每个特征点预测的box长宽比及数量,例如:[2, .5]:(1:1)、(2:1)、(1:2)、(1:1),这里是把重复的省去了
124         #实际上是有4个default box的
125         anchor_ratios=[[2, .5], #block4: def_boxes:4
126                        [2, .5, 3, 1./3],    #def_boxes:6   (ratios中的4个+默认的1:1+额外增加的一个(S'k)=6)
127                        [2, .5, 3, 1./3],    #def_boxes:6
128                        [2, .5, 3, 1./3],    #def_boxes:6
129                        [2, .5],     #def_boxes:4
130                        [2, .5]],    #def_boxes:4
131         anchor_steps=[8, 16, 32, 64, 100, 300], #8x38=304,16x19=304,32x10=320,64x5=320,100x3=300,1x300=300
132         anchor_offset=0.5,
133         #是否归一化,大于0则进行,否则不做归一化;
134         # 目前看来只对block_4进行正则化,因为该层比较靠前,其norm(范数)较大,需做L2正则化
135         # (仅仅对每个像素在channel维度做归一化)以保证和后面检测层差异不是很大;
136         normalizations=[20, -1, -1, -1, -1, -1],
137         prior_scaling=[0.1, 0.1, 0.2, 0.2]  #特征图上每个目标与参考框间的尺寸缩放(y,x,h,w)解码时用到
138         )
140     def __init__(self, params=None):    #网络参数初始化
141         """
142         Init the SSD net with some parameters. Use the default ones if none provided.
143         """
144         if isinstance(params, SSDParams):   #是否有参数输入,是则用输入的,否则使用默认的
145             self.params = params            #isinstance是python的內建函数,如果参数1与参数2的类型相同则返回true;
146         else:                               #
147             self.params = SSDNet.default_params
149     # ======================================================================= #
150     #定义网络模型
151     def net(self, inputs,
152             is_training=True,   #是否训练
153             update_feat_shapes=True,    #是否更新特征层的尺寸
154             dropout_keep_prob=0.5,      ##dropout=0.5
155             prediction_fn=slim.softmax, #采用softmax预测结果
156             reuse=None,
157             scope='ssd_300_vgg'):       #网络名:ssd_300_vgg(基础网络时VGG,输入训练图像size是300x300)
158         """
159         SSD network definition.
160         """
161         #网络输入参数
162         r = ssd_net(inputs,
163                     num_classes=self.params.num_classes,
164                     feat_layers=self.params.feat_layers,
165                     anchor_sizes=self.params.anchor_sizes,
166                     anchor_ratios=self.params.anchor_ratios,
167                     normalizations=self.params.normalizations,
168                     is_training=is_training,
169                     dropout_keep_prob=dropout_keep_prob,
170                     prediction_fn=prediction_fn,
171                     reuse=reuse,
172                     scope=scope)
173         # Update feature shapes (try at least!)
174         # 下面这步我的理解就是让读者自行更改特征层的输入,未必论文中介绍的那几个block
175         if update_feat_shapes:  #是否更新特征层图像尺寸?
176             #输入特征层图像尺寸以及inputs(应该是预测的特征尺寸),输出更新后的特征图尺寸列表
177             shapes = ssd_feat_shapes_from_net(r[0], self.params.feat_shapes)
178             #将更新的特征图尺寸shapes替换当前的特征图尺寸
179             self.params = self.params._replace(feat_shapes=shapes)
180         return r    ##更新网络输入参数r
182     # 定义权重衰减=0.0005,L2正则化项系数;数据类型是NHWC:[batch, height, width, channels]
183     def arg_scope(self, weight_decay=0.0005, data_format='NHWC'):
184         """Network arg_scope.
185         """
186         return ssd_arg_scope(weight_decay, data_format=data_format)
188     def arg_scope_caffe(self, caffe_scope):
189         """Caffe arg_scope used for weights importing.
190         """
191         return ssd_arg_scope_caffe(caffe_scope)
193     # ======================================================================= #
194     ##更新特征形状尺寸(来自预测结果)
195     def update_feature_shapes(self, predictions):
196         """Update feature shapes from predictions collection (Tensor or Numpy
197         array).
198         """
199         shapes = ssd_feat_shapes_from_net(predictions, self.params.feat_shapes)
200         self.params = self.params._replace(feat_shapes=shapes)
201     #输入原始图像尺寸;返回每个特征层每个参考锚点框的位置及尺寸信息(x,y,h,w)
202     def anchors(self, img_shape, dtype=np.float32):
203         """Compute the default anchor boxes, given an image shape.
204         """
205         return ssd_anchors_all_layers(img_shape,
206                                       self.params.feat_shapes,
207                                       self.params.anchor_sizes,
208                                       self.params.anchor_ratios,
209                                       self.params.anchor_steps,
210                                       self.params.anchor_offset,
211                                       dtype)
212     #编码,用于将标签信息,真实目标信息和锚点框信息编码在一起;得到预测真实框到参考框的转换值
213     def bboxes_encode(self, labels, bboxes, anchors,
214                       scope=None):
215         """Encode labels and bounding boxes.
216         """
217         return ssd_common.tf_ssd_bboxes_encode(
218             labels, bboxes, anchors,
219             self.params.num_classes,
220             self.params.no_annotation_label,    #未标注的标签(应该代表背景)
221             ignore_threshold=0.5,               #IOU筛选阈值
222             prior_scaling=self.params.prior_scaling,    #特征图目标与参考框间的尺寸缩放(0.1,0.1,0.2,0.2)
223             scope=scope)
224     #解码,用锚点框信息,锚点框与预测真实框间的转换值,得到真实的预测框(ymin,xmin,ymax,xmax)
225     def bboxes_decode(self, feat_localizations, anchors,
226                       scope='ssd_bboxes_decode'):
227         """Encode labels and bounding boxes.
228         """
229         return ssd_common.tf_ssd_bboxes_decode(
230             feat_localizations, anchors,
231             prior_scaling=self.params.prior_scaling,
232             scope=scope)
233     #通过SSD网络,得到检测到的bbox
234     def detected_bboxes(self, predictions, localisations,
235                         select_threshold=None, nms_threshold=0.5,
236                         clipping_bbox=None, top_k=400, keep_top_k=200):
237         """Get the detected bounding boxes from the SSD network output.
238         """
239         # Select top_k bboxes from predictions, and clip
240         # 选取top_k=400个框,并对框做修建(超出原图尺寸范围的切掉)
242         # 得到对应某个类别的得分值以及bbox
243         rscores, rbboxes = \
244             ssd_common.tf_ssd_bboxes_select(predictions, localisations,
245                                             select_threshold=select_threshold,
246                                             num_classes=self.params.num_classes)
247         #按照得分高低,筛选出400个bbox和对应得分
248         rscores, rbboxes = \
249             tfe.bboxes_sort(rscores, rbboxes, top_k=top_k)
250         # Apply NMS algorithm.
251         # 应用非极大值抑制,去掉与得分最高的bbox的重叠率大于nms_threshold=0.5的,保留200个
252         rscores, rbboxes = \
253             tfe.bboxes_nms_batch(rscores, rbboxes,
254                                  nms_threshold=nms_threshold,
255                                  keep_top_k=keep_top_k)
256         if clipping_bbox is not None:
257             rbboxes = tfe.bboxes_clip(clipping_bbox, rbboxes)
258         return rscores, rbboxes     #返回裁剪好的bbox和对应得分
260     # 尽管一个ground truth可以与多个先验框匹配,但是ground truth相对先验框还是太少了,
261     # 所以负样本相对正样本会很多。为了保证正负样本尽量平衡,SSD采用了hard negative mining,
262     # 就是对负样本进行抽样,抽样时按照置信度误差(预测背景的置信度越小(预测背景,但实际上不是背景的概率很大),误差越大)进行降序排列,
263     # 选取误差的较大的top-k作为训练的负样本,以保证正负样本比例接近1:3
264     def losses(self, logits, localisations,
265                gclasses, glocalisations, gscores,
266                match_threshold=0.5,
267                negative_ratio=3.,
268                alpha=1.,
269                label_smoothing=0.,
270                scope='ssd_losses'):
271         """
272         Define the SSD network losses.
273         """
274         return ssd_losses(logits, localisations,
275                           gclasses, glocalisations, gscores,
276                           match_threshold=match_threshold,
277                           negative_ratio=negative_ratio,
278                           alpha=alpha,
279                           label_smoothing=label_smoothing,
280                           scope=scope)
283 # =========================================================================== #
284 # SSD tools...
285 # =========================================================================== #
# Helper turning relative anchor size bounds into absolute per-layer sizes.
def ssd_size_bounds_to_values(size_bounds,
                              n_feat_layers,
                              img_shape=(300, 300)):
    """Compute the reference sizes of the anchor boxes from relative bounds.

    The absolute values are measured in pixels, based on the network
    default size (300 pixels). This function follows the computation
    performed in the original implementation of SSD in Caffe.

    Arguments:
      size_bounds: pair (min, max) of anchor sizes, as fractions of the image
        size.
      n_feat_layers: number of feature layers; the percent range is split
        into `n_feat_layers - 2` integer steps.
      img_shape: image shape; must be square.

    Return:
      list of pairs containing the absolute sizes at each scale. For each
      scale, the ratios only apply to the first value.

    Raises:
      AssertionError: if `img_shape` is not square.
      ZeroDivisionError: if `n_feat_layers` is 2.
    """
    assert img_shape[0] == img_shape[1]

    def _to_percent(bound):
        # A bare int(bound * 100) truncates binary floating-point error:
        # 0.29 * 100 evaluates to 28.999999999999996 and would become 28.
        # The small nudge recovers the intended integer percentage while
        # still truncating genuinely fractional percentages.
        return int(bound * 100 + 1e-9)

    img_size = img_shape[0]
    min_ratio = _to_percent(size_bounds[0])
    max_ratio = _to_percent(size_bounds[1])
    step = int(math.floor((max_ratio - min_ratio) / (n_feat_layers - 2)))
    # Start with the following smallest sizes.
    sizes = [[img_size * size_bounds[0] / 2, img_size * size_bounds[0]]]
    for ratio in range(min_ratio, max_ratio + 1, step):
        sizes.append((img_size * ratio / 100.,
                      img_size * (ratio + step) / 100.))
    return sizes
# Collect the feature shapes actually produced by the prediction layers.
def ssd_feat_shapes_from_net(predictions, default_shapes=None):
    """Try to obtain the feature shapes from the prediction layers.

    Arguments:
      predictions: iterable of prediction layers, each either a Numpy
        ndarray or an object exposing `get_shape().as_list()`.
      default_shapes: value returned as-is when any shape is not fully
        determined.

    Return:
      list with dimensions 1 to 3 of every prediction, or `default_shapes`
      as soon as one of those dimensions is None.
    """
    collected = []
    for pred in predictions:
        # Numpy arrays expose `.shape`; anything else is treated as a tensor.
        if isinstance(pred, np.ndarray):
            full_shape = pred.shape
        else:
            full_shape = pred.get_shape().as_list()
        dims = full_shape[1:4]
        # Problem: undetermined shape... fall back to the defaults.
        if None in dims:
            return default_shapes
        collected.append(dims)
    return collected
# Default (anchor) box generation for a single feature layer.
def ssd_anchor_one_layer(img_shape,
                         feat_shape,
                         sizes,
                         ratios,
                         step,
                         offset=0.5,
                         dtype=np.float32):
    """Compute SSD default anchor boxes for one feature layer.

    Determine the relative position grid of the centers, and the relative
    width and height.

    Arguments:
      img_shape: Image shape, used to express centers, heights and widths
        relative to the image;
      feat_shape: Feature shape, used for computing relative position grids;
      sizes: Absolute reference sizes (only the first two are used);
      ratios: Aspect ratios to use on this feature layer;
      step: Stride of one feature cell, measured on the image;
      offset: Grid offset of the center inside a cell;
      dtype: Numpy dtype of the returned arrays.

    Return:
      y, x, h, w: center grids of shape feat_shape + (1,), and 1-D arrays of
        len(sizes) + len(ratios) relative heights and widths.
    """
    img_h, img_w = img_shape[0], img_shape[1]

    # Cell indices of the feature map, e.g. 0..37 along each axis for block4.
    # SSD-Caffe style computation: centers are placed with the layer step
    # rather than by dividing by the feature map size.
    rows, cols = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
    y = (rows.astype(dtype) + offset) * step / img_h
    x = (cols.astype(dtype) + offset) * step / img_w
    # Trailing unit axis to support easy broadcasting against h and w.
    y = y[..., np.newaxis]
    x = x[..., np.newaxis]

    # Relative heights and widths, in the order of the original SSD code.
    num_anchors = len(sizes) + len(ratios)
    h = np.zeros((num_anchors, ), dtype=dtype)
    w = np.zeros((num_anchors, ), dtype=dtype)

    # First box: aspect ratio 1 at the smaller reference size.
    base_h = sizes[0] / img_h
    base_w = sizes[0] / img_w
    h[0] = base_h
    w[0] = base_w
    next_slot = 1
    if len(sizes) > 1:
        # Second box: aspect ratio 1 at the geometric mean of both sizes,
        # e.g. sqrt(21 * 45) / 300 for block4.
        geo_mean = math.sqrt(sizes[0] * sizes[1])
        h[1] = geo_mean / img_h
        w[1] = geo_mean / img_w
        next_slot += 1
    # Remaining boxes: the smaller size stretched by each aspect ratio.
    for slot, ratio in enumerate(ratios, next_slot):
        root = math.sqrt(ratio)
        h[slot] = base_h / root
        w[slot] = base_w * root
    return y, x, h, w
# Anchor boxes for every feature layer of the network.
def ssd_anchors_all_layers(img_shape,
                           layers_shape,
                           anchor_sizes,
                           anchor_ratios,
                           anchor_steps,
                           offset=0.5,
                           dtype=np.float32):
    """Compute anchor boxes for all feature layers.

    Arguments:
      img_shape: shape of the input image;
      layers_shape: shape of each feature layer (e.g. (38, 38) first);
      anchor_sizes: reference sizes per layer (e.g. (21., 45.) first);
      anchor_ratios: aspect ratios per layer (e.g. [2, .5] first);
      anchor_steps: stride of each layer on the image (e.g. 8 first);
      offset: offset of the anchor center inside a feature cell;
      dtype: Numpy dtype of the generated arrays.

    Return:
      list with one `ssd_anchor_one_layer` result per entry of
      `layers_shape`.
    """
    # The per-layer settings are looked up by the index of the layer shape.
    return [
        ssd_anchor_one_layer(img_shape, feat_shape,
                             anchor_sizes[idx],
                             anchor_ratios[idx],
                             anchor_steps[idx],
                             offset=offset, dtype=dtype)
        for idx, feat_shape in enumerate(layers_shape)
    ]
433 # =========================================================================== #
434 # Functional definition of VGG-based SSD 300.功能定义
435 # =========================================================================== #
# Shape of a tensor as a list, mixing static and dynamic dimensions.
def tensor_shape(x, rank=3):
    """Returns the dimensions of a tensor.

    Args:
      x: A N-D Tensor.
      rank: expected rank of `x`, used when its shape is not fully defined.
    Returns:
      A list of dimensions. Dimensions that are statically known are python
        integers, otherwise they are integer scalar tensors.
    """
    static = x.get_shape()
    if static.is_fully_defined():
        return static.as_list()
    # Fill every unknown static dimension with its run-time counterpart.
    known_dims = static.with_rank(rank).as_list()
    runtime_dims = tf.unstack(tf.shape(x), rank)
    return [known if known is not None else runtime
            for known, runtime in zip(known_dims, runtime_dims)]
# Localization and class prediction heads for one feature layer.
# Each head is a 3x3 convolution producing num_anchors * K channels, which is
# then reshaped so that the last two axes are [num_anchors, K], with K = 4
# for box coordinates and K = num_classes for class scores.
def ssd_multibox_layer(inputs,
                       num_classes,
                       sizes,
                       ratios=[1],
                       normalization=-1,
                       bn_normalization=False):
    """Construct a multibox layer, return a class and localization predictions.

    Arguments:
      inputs: feature layer to predict from;
      num_classes: number of classes;
      sizes: reference anchor sizes of this layer;
      ratios: anchor aspect ratios of this layer;
      normalization: L2-normalize `inputs` first when > 0;
      bn_normalization: not used by this function.

    Return:
      cls_pred, loc_pred: class and localization prediction tensors.
    """
    net = inputs
    if normalization > 0:
        # L2 normalization followed by a scaling factor.
        net = custom_layers.l2_normalization(net, scaling=True)
    # Number of anchors per feature cell.
    num_anchors = len(sizes) + len(ratios)

    def _head(values_per_anchor, scope):
        # 3x3 conv without activation, channels moved last, then the channel
        # axis is split into [num_anchors, values_per_anchor].
        pred = slim.conv2d(net, num_anchors * values_per_anchor, [3, 3],
                           activation_fn=None, scope=scope)
        pred = custom_layers.channel_to_last(pred)
        split_shape = tensor_shape(pred, 4)[:-1] + [num_anchors,
                                                    values_per_anchor]
        return tf.reshape(pred, split_shape)

    # Location: 4 values per anchor.
    loc_pred = _head(4, 'conv_loc')
    # Class prediction: num_classes values per anchor.
    cls_pred = _head(num_classes, 'conv_cls')
    return cls_pred, loc_pred
# Functional definition of the SSD network structure.
def ssd_net(inputs,
            num_classes=SSDNet.default_params.num_classes,
            feat_layers=SSDNet.default_params.feat_layers,
            anchor_sizes=SSDNet.default_params.anchor_sizes,
            anchor_ratios=SSDNet.default_params.anchor_ratios,
            normalizations=SSDNet.default_params.normalizations,
            is_training=True,
            dropout_keep_prob=0.5,
            prediction_fn=slim.softmax,
            reuse=None,
            scope='ssd_300_vgg'):
    """SSD net definition.

    Arguments:
      inputs: input image batch;
      num_classes: number of classes predicted per anchor;
      feat_layers: end-point names the multibox heads are attached to;
      anchor_sizes, anchor_ratios, normalizations: per-feature-layer
        settings, indexed in the same order as `feat_layers`;
      is_training: enables dropout when true;
      dropout_keep_prob: probability of keeping a unit in the dropout layers;
      prediction_fn: function applied to the class logits (softmax by
        default);
      reuse: variable reuse flag passed to the variable scope;
      scope: name of the variable scope.

    Return:
      predictions, localisations, logits, end_points: the first three are
      lists with one tensor per feature layer, the last one is a dict of the
      intermediate activations.
    """
    # NOTE: inputs are used as given; no NCHW transposition is done here.

    # `tf.layers.dropout` expects the fraction of units to DROP, whereas this
    # function is parameterized by the fraction to KEEP. Passing the keep
    # probability directly is only correct for 0.5.
    drop_rate = 1.0 - dropout_keep_prob

    # End_points collect relevant activations for external use.
    end_points = {}
    with tf.variable_scope(scope, 'ssd_300_vgg', [inputs], reuse=reuse):
        # Original VGG-16 blocks.
        # Block 1: 2 x (3x3 conv, 64 channels).
        net = slim.repeat(inputs, 2, slim.conv2d, 64, [3, 3], scope='conv1')
        end_points['block1'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool1')
        # Block 2: 2 x (3x3 conv, 128 channels).
        net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], scope='conv2')
        end_points['block2'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool2')
        # Block 3: 3 x (3x3 conv, 256 channels).
        net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3')
        end_points['block3'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool3')
        # Block 4: 3 x (3x3 conv, 512 channels).
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv4')
        end_points['block4'] = net
        net = slim.max_pool2d(net, [2, 2], scope='pool4')
        # Block 5: 3 x (3x3 conv, 512 channels), then a 3x3 stride-1 pooling.
        net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], scope='conv5')
        end_points['block5'] = net
        net = slim.max_pool2d(net, [3, 3], stride=1, scope='pool5')

        # Additional SSD blocks (convolutions only from here on).
        # Block 6: dilated (atrous) 3x3 convolution with rate 6.
        net = slim.conv2d(net, 1024, [3, 3], rate=6, scope='conv6')
        end_points['block6'] = net
        net = tf.layers.dropout(net, rate=drop_rate, training=is_training)
        # Block 7: 1x1 convolution.
        net = slim.conv2d(net, 1024, [1, 1], scope='conv7')
        end_points['block7'] = net
        net = tf.layers.dropout(net, rate=drop_rate, training=is_training)

        # Block 8/9/10/11: 1x1 and 3x3 convolutions stride 2 (except lasts).
        # Blocks 8 and 9 pad explicitly and then use a VALID stride-2 conv.
        end_point = 'block8'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 256, [1, 1], scope='conv1x1')
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 512, [3, 3], stride=2, scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block9'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = custom_layers.pad2d(net, pad=(1, 1))
            net = slim.conv2d(net, 256, [3, 3], stride=2, scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        # Blocks 10 and 11 shrink the map with unpadded (VALID) 3x3 convs.
        end_point = 'block10'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
        end_points[end_point] = net
        end_point = 'block11'
        with tf.variable_scope(end_point):
            net = slim.conv2d(net, 128, [1, 1], scope='conv1x1')
            net = slim.conv2d(net, 256, [3, 3], scope='conv3x3', padding='VALID')
        end_points[end_point] = net

        # Prediction and localisations layers: one multibox head per
        # selected feature layer.
        predictions = []
        logits = []
        localisations = []
        for i, layer in enumerate(feat_layers):
            with tf.variable_scope(layer + '_box'):
                p, l = ssd_multibox_layer(end_points[layer],
                                          num_classes,
                                          anchor_sizes[i],
                                          anchor_ratios[i],
                                          normalizations[i])
            predictions.append(prediction_fn(p))
            logits.append(p)
            localisations.append(l)

        return predictions, localisations, logits, end_points
ssd_net.default_image_size = 300
# Default arg_scope of the network; weight_decay is the L2 coefficient.
def ssd_arg_scope(weight_decay=0.0005, data_format='NHWC'):
    """Defines the VGG arg scope.

    Args:
      weight_decay: The l2 regularization coefficient.
      data_format: data format set on the convolution, pooling and custom
        layers.

    Returns:
      An arg_scope.
    """
    trainable_ops = [slim.conv2d, slim.fully_connected]
    spatial_ops = [slim.conv2d, slim.max_pool2d]
    custom_ops = [custom_layers.pad2d,
                  custom_layers.l2_normalization,
                  custom_layers.channel_to_last]

    # ReLU, L2 weight decay, Xavier weights and zero biases for conv / FC.
    with slim.arg_scope(trainable_ops,
                        activation_fn=tf.nn.relu,
                        weights_regularizer=slim.l2_regularizer(weight_decay),
                        weights_initializer=tf.contrib.layers.xavier_initializer(),
                        biases_initializer=tf.zeros_initializer()):
        # SAME padding and the requested data format for conv / pooling.
        with slim.arg_scope(spatial_ops,
                            padding='SAME',
                            data_format=data_format):
            # The custom layers only need the data format.
            with slim.arg_scope(custom_ops,
                                data_format=data_format) as scope_args:
                return scope_args
611 # =========================================================================== #
612 # Caffe scope: importing weights at initialization.
613 # =========================================================================== #
def ssd_arg_scope_caffe(caffe_scope):
    """Caffe scope definition, used to initialize layers from Caffe weights.

    Args:
      caffe_scope: Caffe scope object with loaded weights.

    Returns:
      An arg_scope.
    """
    conv_ops = [slim.conv2d]
    fc_ops = [slim.fully_connected]
    l2_norm_ops = [custom_layers.l2_normalization]
    padded_ops = [slim.conv2d, slim.max_pool2d]

    # Default network arg scope: convolutions take their weights and biases
    # from the Caffe scope initializers.
    with slim.arg_scope(conv_ops,
                        activation_fn=tf.nn.relu,
                        weights_initializer=caffe_scope.conv_weights_init(),
                        biases_initializer=caffe_scope.conv_biases_init()):
        with slim.arg_scope(fc_ops,
                            activation_fn=tf.nn.relu):
            # The L2 normalization scale is also initialized from Caffe.
            with slim.arg_scope(l2_norm_ops,
                                scale_initializer=caffe_scope.l2_norm_scale_init()):
                with slim.arg_scope(padded_ops,
                                    padding='SAME') as scope_args:
                    return scope_args
638 # =========================================================================== #
639 # SSD loss function.
640 # =========================================================================== #
def ssd_losses(logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
               negative_ratio=3.,
               alpha=1.,
               label_smoothing=0.,
               device='/cpu:0',
               scope=None):
    """Add the SSD training losses to the TF losses collection.

    Three terms are computed over all feature layers at once and each one is
    registered with `tf.losses.add_loss`:
      - cross-entropy on positive anchors (against the groundtruth class);
      - cross-entropy on the mined hard negative anchors (against background);
      - smooth-L1 localization loss on positive anchors, weighted by `alpha`.
    Every term is a masked sum divided by the batch size. The number of
    positives is only used to size the hard negative set; it is not used as
    a normalizer.

    Arguments:
      logits: (list of) predictions logits Tensors, one per feature layer;
      localisations: (list of) localisations Tensors;
      gclasses: (list of) groundtruth labels Tensors;
      glocalisations: (list of) groundtruth localisations Tensors;
      gscores: (list of) groundtruth score Tensors;
      match_threshold: an anchor is positive when its score is strictly
        greater than this value;
      negative_ratio: number of negatives kept per positive;
      alpha: weight of the localization term;
      label_smoothing: accepted for interface compatibility, not used here;
      device: accepted for interface compatibility, not used here;
      scope: optional name scope.

    Returns:
      None. The losses are only added to the TF losses collection.
    """
    with tf.name_scope(scope, 'ssd_losses'):
        lshape = tfe.get_shape(logits[0], 5)
        num_classes = lshape[-1]
        batch_size = lshape[0]

        # Flatten every feature layer to one row per anchor, then concatenate
        # all layers so the mining below ranks anchors across the whole net.
        logits = tf.concat(
            [tf.reshape(t, [-1, num_classes]) for t in logits], axis=0)
        gclasses = tf.concat(
            [tf.reshape(t, [-1]) for t in gclasses], axis=0)
        gscores = tf.concat(
            [tf.reshape(t, [-1]) for t in gscores], axis=0)
        localisations = tf.concat(
            [tf.reshape(t, [-1, 4]) for t in localisations], axis=0)
        glocalisations = tf.concat(
            [tf.reshape(t, [-1, 4]) for t in glocalisations], axis=0)
        dtype = logits.dtype

        # Positive matching mask: anchors whose groundtruth score exceeds
        # the matching threshold.
        pmask = gscores > match_threshold
        fpmask = tf.cast(pmask, dtype)
        n_positives = tf.reduce_sum(fpmask)

        # Hard negative mining: among the non-positive anchors, keep the ones
        # with the lowest predicted background probability (i.e. the largest
        # confidence error), so that negatives do not swamp positives.
        no_classes = tf.cast(pmask, tf.int32)
        predictions = slim.softmax(logits)
        # Negative candidates: not positive, and score above -0.5.
        nmask = tf.logical_and(tf.logical_not(pmask),
                               gscores > -0.5)
        fnmask = tf.cast(nmask, dtype)
        # Background probability for candidates; 1.0 for every other anchor
        # (1 - fnmask is 1 there) so that they rank last.
        nvalues = tf.where(nmask,
                           predictions[:, 0],
                           1. - fnmask)
        nvalues_flat = tf.reshape(nvalues, [-1])
        # Number of negative entries to select: negative_ratio per positive
        # plus batch_size, capped by the number of candidates.
        max_neg_entries = tf.cast(tf.reduce_sum(fnmask), tf.int32)
        n_neg = tf.cast(negative_ratio * n_positives, tf.int32) + batch_size
        n_neg = tf.minimum(n_neg, max_neg_entries)
        # top_k on the negated values returns the n_neg smallest background
        # probabilities; the last one is the selection threshold.
        # NOTE(review): with no negative candidate n_neg is 0 and val[-1]
        # indexes an empty tensor -- confirm this cannot happen in practice.
        val, _ = tf.nn.top_k(-nvalues_flat, k=n_neg)
        max_hard_pred = -val[-1]
        # Final negative mask: candidates strictly below the threshold.
        nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
        fnmask = tf.cast(nmask, dtype)

        # Cross-entropy on positives, against the groundtruth classes.
        with tf.name_scope('cross_entropy_pos'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=gclasses)
            # Masked sum divided by the batch size.
            loss = tf.div(tf.reduce_sum(loss * fpmask), batch_size, name='value')
            tf.losses.add_loss(loss)

        # Cross-entropy on the mined negatives. `no_classes` is 0 for every
        # non-positive anchor, so the target here is class 0.
        with tf.name_scope('cross_entropy_neg'):
            loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=no_classes)
            loss = tf.div(tf.reduce_sum(loss * fnmask), batch_size, name='value')
            tf.losses.add_loss(loss)

        # Localization loss: smooth L1 between predicted and groundtruth
        # encodings, on positive anchors only.
        with tf.name_scope('localization'):
            # Per-anchor weight (alpha on positives, 0 elsewhere), expanded
            # to broadcast over the 4 box coordinates.
            weights = tf.expand_dims(alpha * fpmask, axis=-1)
            loss = custom_layers.abs_smooth(localisations - glocalisations)
            loss = tf.div(tf.reduce_sum(loss * weights), batch_size, name='value')
            tf.losses.add_loss(loss)
def ssd_losses_old(logits, localisations,
                   gclasses, glocalisations, gscores,
                   match_threshold=0.5,
                   negative_ratio=3.,
                   alpha=1.,
                   label_smoothing=0.,
                   device='/cpu:0',
                   scope=None):
    """Previous, per-layer version of the SSD 300 VGG loss functions.

    The positive cross-entropy, negative cross-entropy and localization
    losses are computed independently for every feature layer, summed over
    the layers and added to the 'EXTRA_LOSSES' TF collection. Nothing is
    returned.

    Arguments:
      logits: (list of) predictions logits Tensors;
      localisations: (list of) localisations Tensors;
      gclasses: (list of) groundtruth labels Tensors;
      glocalisations: (list of) groundtruth localisations Tensors;
      gscores: (list of) groundtruth score Tensors;
      match_threshold: an anchor is positive when its score is strictly
        greater than this value;
      negative_ratio: number of negatives requested per positive;
      alpha: weight of the localization term;
      label_smoothing: not used by this function;
      device: device on which the loss ops are placed;
      scope: optional name scope.
    """
    with tf.device(device), tf.name_scope(scope, 'ssd_losses'):
        pos_losses = []
        neg_losses = []
        loc_losses = []
        for i, layer_logits in enumerate(logits):
            dtype = layer_logits.dtype
            with tf.name_scope('block_%i' % i):
                # Sizing weight: product of dimensions 1, 2 and 3 of the
                # rank-5 logits shape.
                layer_shape = tfe.get_shape(layer_logits, rank=5)
                wsize = layer_shape[1] * layer_shape[2] * layer_shape[3]

                # Positive mask.
                pmask = gscores[i] > match_threshold
                fpmask = tf.cast(pmask, dtype)
                n_positives = tf.reduce_sum(fpmask)

                # Negative candidates: not positive, and score above -0.5.
                no_classes = tf.cast(pmask, tf.int32)
                predictions = slim.softmax(layer_logits)
                nmask = tf.logical_and(tf.logical_not(pmask),
                                       gscores[i] > -0.5)
                fnmask = tf.cast(nmask, dtype)
                # Background probability for candidates, 1.0 elsewhere.
                nvalues = tf.where(nmask,
                                   predictions[:, :, :, :, 0],
                                   1. - fnmask)
                nvalues_flat = tf.reshape(nvalues, [-1])

                # Number of negative entries to select: at least 1/8 of all
                # anchors and 4 per batch entry, at most one more than the
                # number of candidates.
                n_neg = tf.cast(negative_ratio * n_positives, tf.int32)
                n_neg = tf.maximum(n_neg, tf.size(nvalues_flat) // 8)
                n_neg = tf.maximum(n_neg, tf.shape(nvalues)[0] * 4)
                max_neg_entries = 1 + tf.cast(tf.reduce_sum(fnmask), tf.int32)
                n_neg = tf.minimum(n_neg, max_neg_entries)

                # The n_neg-th smallest background probability is the
                # selection threshold.
                val, _ = tf.nn.top_k(-nvalues_flat, k=n_neg)
                max_hard_pred = -val[-1]
                # Final negative mask.
                nmask = tf.logical_and(nmask, nvalues < max_hard_pred)
                fnmask = tf.cast(nmask, dtype)

                # Cross-entropy on positives.
                with tf.name_scope('cross_entropy_pos'):
                    fpmask = wsize * fpmask
                    pos_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=layer_logits, labels=gclasses[i])
                    pos_loss = tf.losses.compute_weighted_loss(pos_loss, fpmask)
                    pos_losses.append(pos_loss)

                # Cross-entropy on the selected negatives.
                with tf.name_scope('cross_entropy_neg'):
                    fnmask = wsize * fnmask
                    neg_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                        logits=layer_logits, labels=no_classes)
                    neg_loss = tf.losses.compute_weighted_loss(neg_loss, fnmask)
                    neg_losses.append(neg_loss)

                # Localization loss: smooth L1 on positives. `fpmask` was
                # already multiplied by `wsize` above, so the weights carry
                # that factor as well.
                with tf.name_scope('localization'):
                    weights = tf.expand_dims(alpha * fpmask, axis=-1)
                    loc_loss = custom_layers.abs_smooth(
                        localisations[i] - glocalisations[i])
                    loc_loss = tf.losses.compute_weighted_loss(loc_loss, weights)
                    loc_losses.append(loc_loss)

        # Totals over all layers.
        with tf.name_scope('total'):
            total_cross_pos = tf.add_n(pos_losses, 'cross_entropy_pos')
            total_cross_neg = tf.add_n(neg_losses, 'cross_entropy_neg')
            total_cross = tf.add(total_cross_pos, total_cross_neg, 'cross_entropy')
            total_loc = tf.add_n(loc_losses, 'localization')

            # Add to the EXTRA_LOSSES TF collection.
            tf.add_to_collection('EXTRA_LOSSES', total_cross_pos)
            tf.add_to_collection('EXTRA_LOSSES', total_cross_neg)
            tf.add_to_collection('EXTRA_LOSSES', total_cross)
            tf.add_to_collection('EXTRA_LOSSES', total_loc)


  1 # Copyright 2015 Paul Balanca. All Rights Reserved.
  2 #
  3 # Licensed under the Apache License, Version 2.0 (the "License");
  4 # you may not use this file except in compliance with the License.
  5 # You may obtain a copy of the License at
  6 #
  7 #
  8 #
  9 # Unless required by applicable law or agreed to in writing, software
 10 # distributed under the License is distributed on an "AS IS" BASIS,
 11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 # See the License for the specific language governing permissions and
 13 # limitations under the License.
 14 # ==============================================================================
 15 """Implement some custom layers, not provided by TensorFlow.
 16 实现一些TensorFlow没有提供的自定义层
 17 Trying to follow as much as possible the style/standards used in
 18 tf.contrib.layers
 19 尽可能多地遵循这种风格/标准
 20 """
 21 import tensorflow as tf
 23 from tensorflow.contrib.framework.python.ops import add_arg_scope
 24 from tensorflow.contrib.layers.python.layers import initializers
 25 from tensorflow.contrib.framework.python.ops import variables
 26 from tensorflow.contrib.layers.python.layers import utils
 27 from tensorflow.python.ops import nn
 28 from tensorflow.python.ops import init_ops
 29 from tensorflow.python.ops import variable_scope
 32 def abs_smooth(x):
 33     """Smoothed absolute function. Useful to compute an L1 smooth error.
 34     #绝对平滑函数,用于计算L1平滑误差
 35     #当预测值与目标值相差很大时, 梯度容易爆炸,因此L1 loss对噪声(outliers)更鲁棒
 36     Define as:
 37         x^2 / 2         if abs(x) < 1
 38         abs(x) - 0.5    if abs(x) > 1
 39     We use here a differentiable definition using min(x) and abs(x). Clearly
 40     not optimal, but good enough for our purpose!
 41     """
 42     absx = tf.abs(x)
 43     minx = tf.minimum(absx, 1)
 44     r = 0.5 * ((absx - 1) * minx + absx)    #计算得到L1 smooth loss
 45     return r
 47 @add_arg_scope
 48 #L2正则化:稀疏正则化操作
 49 def l2_normalization(
 50         inputs,#输入特征层,[batch_size,h,w,c]
 51         scaling=False,#默认归一化后是否设置缩放变量gamma
 52         scale_initializer=init_ops.ones_initializer(),#scale初始化为1
 53         reuse=None,
 54         variables_collections=None,
 55         outputs_collections=None,
 56         data_format='NHWC',
 57         trainable=True,
 58         scope=None):
 59     """Implement L2 normalization on every feature (i.e. spatial normalization).
 60     对每个特性实现L2规范化,空间归一化
 61     Should be extended in some near future to other dimensions, providing a more
 62     flexible normalization framework.
 63     是否应该在不久的将来扩展到其他维度,提供更多灵活的标准化框架。
 64     Args:
 65       inputs: a 4-D tensor with dimensions [batch_size, height, width, channels].
 66       scaling: whether or not to add a post scaling operation along the dimensions
 67         which have been normalized.
 68       scale_initializer: An initializer for the weights.
 69       reuse: whether or not the layer and its variables should be reused. To be
 70         able to reuse the layer scope must be given.
 71       variables_collections: optional list of collections for all the variables or
 72         a dictionary containing a different list of collection per variable.
 73       outputs_collections: collection to add the outputs.
 74       data_format:  NHWC or NCHW data format.
 75       trainable: If `True` also add variables to the graph collection
 76         `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
 77       scope: Optional scope for `variable_scope`.
 78     Returns:
 79       A `Tensor` representing the output of the operation.
 80     """
 82     with variable_scope.variable_scope(
 83             scope, 'L2Normalization', [inputs], reuse=reuse) as sc:
 84         inputs_shape = inputs.get_shape()#得到输入特征层的维度信息
 85         inputs_rank = inputs_shape.ndims #维度数=4
 86         dtype = inputs.dtype.base_dtype#数据类型
 87         if data_format == 'NHWC':
 88             # norm_dim = tf.range(1, inputs_rank-1)
 89             norm_dim = tf.range(inputs_rank-1, inputs_rank)#需要正则化的维度是4-1=3即channel这个维度
 90             params_shape = inputs_shape[-1:]#通道数
 91         elif data_format == 'NCHW':
 92             # norm_dim = tf.range(2, inputs_rank)
 93             norm_dim = tf.range(1, 2)#需要正则化的维度是第1维,即channel这个维度
 94             params_shape = (inputs_shape[1])#通道数
 96         # Normalize along spatial dimensions.
 97         # 对通道所在维度进行正则化,其中epsilon是避免除0风险
 98         outputs = nn.l2_normalize(inputs, norm_dim, epsilon=1e-12)
 99         # Additional scaling.
100         # 判断是否对正则化后设置缩放变量
101         if scaling:
102             scale_collections = utils.get_variable_collections(
103                 variables_collections, 'scale')
104             scale = variables.model_variable('gamma',
105                                              shape=params_shape,
106                                              dtype=dtype,
107                                              initializer=scale_initializer,
108                                              collections=scale_collections,
109                                              trainable=trainable)
110             if data_format == 'NHWC':
111                 outputs = tf.multiply(outputs, scale)
112             elif data_format == 'NCHW':
113                 scale = tf.expand_dims(scale, axis=-1)
114                 scale = tf.expand_dims(scale, axis=-1)
115                 outputs = tf.multiply(outputs, scale)
116                 # outputs = tf.transpose(outputs, perm=(0, 2, 3, 1))
117         # 即返回L2_norm*gamma
118         return utils.collect_named_outputs(outputs_collections,
119                                            sc.original_name_scope, outputs)
@add_arg_scope
def pad2d(inputs,
          pad=(0, 0),
          mode='CONSTANT',
          data_format='NHWC',
          trainable=True,
          scope=None):
    """2D Padding layer, adding a symmetric padding to H and W dimensions.

    Aims to mimic padding in Caffe and MXNet, helping the port of models to
    TensorFlow. Tries to follow the naming convention of `tf.contrib.layers`.

    Args:
      inputs: 4D input Tensor;
      pad: 2-Tuple with padding values for H and W dimensions; each value is
        applied on both sides of its dimension;
      mode: Padding mode. C.f. `tf.pad`
      data_format: NHWC or NCHW data format.
      trainable: not used by this layer;
      scope: optional name scope.

    Returns:
      The padded Tensor.

    Raises:
      ValueError: if `data_format` is neither 'NHWC' nor 'NCHW'.
    """
    # Fail early with a clear message instead of hitting an unbound local.
    if data_format not in ('NHWC', 'NCHW'):
        raise ValueError(
            "data_format must be 'NHWC' or 'NCHW', got %r" % (data_format,))

    with tf.name_scope(scope, 'pad2d', [inputs]):
        # Padding shape: only the H and W axes are padded.
        pad_h = [pad[0], pad[0]]
        pad_w = [pad[1], pad[1]]
        if data_format == 'NHWC':
            paddings = [[0, 0], pad_h, pad_w, [0, 0]]
        else:
            paddings = [[0, 0], [0, 0], pad_h, pad_w]
        net = tf.pad(inputs, paddings, mode=mode)
        return net
@add_arg_scope
def channel_to_last(inputs,
                    data_format='NHWC',
                    scope=None):
    """Move the channel axis to the last dimension. Allows to
    provide a single output format whatever the input data format.

    Args:
      inputs: Input Tensor;
      data_format: NHWC or NCHW.
      scope: optional name scope.

    Return:
      Input in NHWC format: returned unchanged for 'NHWC', transposed with
      permutation (0, 2, 3, 1) for 'NCHW'.

    Raises:
      ValueError: if `data_format` is neither 'NHWC' nor 'NCHW'.
    """
    # Fail early with a clear message instead of hitting an unbound local.
    if data_format not in ('NHWC', 'NCHW'):
        raise ValueError(
            "data_format must be 'NHWC' or 'NCHW', got %r" % (data_format,))

    with tf.name_scope(scope, 'channel_to_last', [inputs]):
        if data_format == 'NHWC':
            net = inputs
        else:
            net = tf.transpose(inputs, perm=(0, 2, 3, 1))
        return net

  1 # Copyright 2015 Paul Balanca. All Rights Reserved.
  2 #
  3 # Licensed under the Apache License, Version 2.0 (the "License");
  4 # you may not use this file except in compliance with the License.
  5 # You may obtain a copy of the License at
  6 #
  7 #
  8 #
  9 # Unless required by applicable law or agreed to in writing, software
 10 # distributed under the License is distributed on an "AS IS" BASIS,
 11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 # See the License for the specific language governing permissions and
 13 # limitations under the License.
 14 # ==============================================================================
 15 """Shared function between different SSD implementations.
 16 """
 17 import numpy as np
 18 import tensorflow as tf
 19 import tf_extended as tfe
 22 # =========================================================================== #
 23 # TensorFlow implementation of boxes SSD encoding / decoding.
 24 # =========================================================================== #
 25 def tf_ssd_bboxes_encode_layer(labels,          #gt标签,1D的tensor
 26                                bboxes,          #Nx4的Tensor(float),真实的bbox
 27                                anchors_layer,      #参考锚点list
 28                                num_classes,        #分类类别数
 29                                no_annotation_label,
 30                                ignore_threshold=0.5,                #gt和锚点框间的匹配阈值,大于该值则为正样本
 31                                prior_scaling=[0.1, 0.1, 0.2, 0.2],  #真实值到预测值转换中用到的缩放
 32                                dtype=tf.float32):
 33     """Encode groundtruth labels and bounding boxes using SSD anchors from
 34     one layer.
 35     Arguments:
 36       labels: 1D Tensor(int64) containing groundtruth labels;
 37       bboxes: Nx4 Tensor(float) with bboxes relative coordinates;
 38       anchors_layer: Numpy array with layer anchors;
 39       matching_threshold: Threshold for positive match with groundtruth bboxes;
 40       prior_scaling: Scaling of encoded coordinates.
 41     Return:
 42       (target_labels, target_localizations, target_scores): Target Tensors.   返回:包含目标标签类别,目标位置,目标置信度的tesndor
 43     """
 44     # Anchors coordinates and volume.
 45     yref, xref, href, wref = anchors_layer   #此前每个特征图上点对应生成的锚点框作为参考框
 46     ymin = yref - href / 2.                  #求参考框的左上角点(xmin,ymin)和右下角点(xmax,ymax)
 47     xmin = xref - wref / 2.   #yref和xref的shape为(38,38,1);href和wref的shape为(4,)
 48     ymax = yref + href / 2.
 49     xmax = xref + wref / 2.   
 50     vol_anchors = (xmax - xmin) * (ymax - ymin) #求参考框面积vol_anchors
 52     # Initialize tensors...                            #shape表示每个特征图上总锚点数
 53     shape = (yref.shape[0], yref.shape[1], href.size)  #对于第一个特征图,shape=(38,38,4);第二个特征图的shape=(19,19,6)
 54     feat_labels = tf.zeros(shape, dtype=tf.int64)   #初始化每个特征图上的点对应的各个box所属标签维度 如:38x38x4
 55     feat_scores = tf.zeros(shape, dtype=dtype)      #初始化每个特征图上的点对应的各个box所属标目标的得分值维度 如:38x38x4
 57     feat_ymin = tf.zeros(shape, dtype=dtype)    #预测每个特征图每个点所属目标的坐标 ;如38x38x4;初始化为全0
 58     feat_xmin = tf.zeros(shape, dtype=dtype)
 59     feat_ymax = tf.ones(shape, dtype=dtype)
 60     feat_xmax = tf.ones(shape, dtype=dtype)
 62     def jaccard_with_anchors(bbox):                      #计算gt的框和参考锚点框的重合度
 63         """Compute jaccard score between a box and the anchors.
 64         """
 65         int_ymin = tf.maximum(ymin, bbox[0])           #计算重叠区域的坐标
 66         int_xmin = tf.maximum(xmin, bbox[1])
 67         int_ymax = tf.minimum(ymax, bbox[2])
 68         int_xmax = tf.minimum(xmax, bbox[3])
 69         h = tf.maximum(int_ymax - int_ymin, 0.)        #计算重叠区域的长与宽
 70         w = tf.maximum(int_xmax - int_xmin, 0.)
 71         # Volumes.
 72         inter_vol = h * w                                 #重叠区域的面积
 73         union_vol = vol_anchors - inter_vol \             #计算bbox和参考框的并集区域
 74             + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
 75         jaccard = tf.div(inter_vol, union_vol)           #计算IOU并返回该值
 76         return jaccard                                 
 78     def intersection_with_anchors(bbox):                #计算某个参考框包含真实框的得分情况
 79         """Compute intersection between score a box and the anchors.
 80         """
 81         int_ymin = tf.maximum(ymin, bbox[0])            #计算bbox和锚点框重叠区域的坐标和长宽
 82         int_xmin = tf.maximum(xmin, bbox[1])
 83         int_ymax = tf.minimum(ymax, bbox[2])
 84         int_xmax = tf.minimum(xmax, bbox[3])
 85         h = tf.maximum(int_ymax - int_ymin, 0.)
 86         w = tf.maximum(int_xmax - int_xmin, 0.)
 87         inter_vol = h * w                                 #重叠区域面积
 88         scores = tf.div(inter_vol, vol_anchors)           #将重叠区域面积除以参考框面积作为该参考框得分值;
 89         return scores
 91     def condition(i, feat_labels, feat_scores,
 92                   feat_ymin, feat_xmin, feat_ymax, feat_xmax):
 93         """Condition: check label index.
 94         """
 95         r = tf.less(i, tf.shape(labels)) # 逐元素比较大小,遍历labels,因为i在body返回的时候加1了
 96         return r[0]
 98     def body(i, feat_labels, feat_scores,                 #该函数大致意思是选择与gt box IOU最大的锚点框负责回归任务,并预测对应的边界框,如此循环
 99              feat_ymin, feat_xmin, feat_ymax, feat_xmax):
100         """Body: update feature labels, scores and bboxes.
101         Follow the original SSD paper for that purpose:
102           - assign values when jaccard > 0.5;
103           - only update if beat the score of other bboxes.
104         """
105         # Jaccard score.                                         #计算bbox与参考框的IOU值
106         label = labels[i]
107         bbox = bboxes[i]
108         jaccard = jaccard_with_anchors(bbox)
109         # Mask: check threshold + scores + no annotations + num_classes.
110         mask = tf.greater(jaccard, feat_scores)                  #当IOU大于feat_scores时,对应的mask至1,做筛选
111         # mask = tf.logical_and(mask, tf.greater(jaccard, matching_threshold))
112         mask = tf.logical_and(mask, feat_scores > -0.5)
113         mask = tf.logical_and(mask, label < num_classes)         #label满足<21
114         imask = tf.cast(mask, tf.int64)                          #将mask转换数据类型int型
115         fmask = tf.cast(mask, dtype)                             #将mask转换数据类型float型
116         # Update values using mask.
117         feat_labels = imask * label + (1 - imask) * feat_labels  #当mask=1,则feat_labels=1;否则为0,即背景
118         feat_scores = tf.where(mask, jaccard, feat_scores)       #tf.where表示如果mask为真则jaccard,否则为feat_scores
120         feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin    #选择与GT bbox IOU最大的框作为GT bbox,然后循环
121         feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin
122         feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax
123         feat_xmax = fmask * bbox[3] + (1 - fmask) * feat_xmax
125         # Check no annotation label: ignore these anchors...     #对没有标注标签的锚点框做忽视,应该是背景
126         # interscts = intersection_with_anchors(bbox)
127         # mask = tf.logical_and(interscts > ignore_threshold,
128         #                       label == no_annotation_label)
129         # # Replace scores by -1.
130         # feat_scores = tf.where(mask, -tf.cast(mask, dtype), feat_scores)
132         return [i+1, feat_labels, feat_scores,
133                 feat_ymin, feat_xmin, feat_ymax, feat_xmax]
134     # Main loop definition.
135     i = 0
136     [i, feat_labels, feat_scores,
137      feat_ymin, feat_xmin,
138      feat_ymax, feat_xmax] = tf.while_loop(condition, body,
139                                            [i, feat_labels, feat_scores,
140                                             feat_ymin, feat_xmin,
141                                             feat_ymax, feat_xmax])
142     # Transform to center / size.                               #转换为中心及长宽形式(计算补偿后的中心)
143     feat_cy = (feat_ymax + feat_ymin) / 2.  #真实预测值其实是边界框相对于先验框的转换值,encode就是为了求这个转换值
144     feat_cx = (feat_xmax + feat_xmin) / 2.  
145     feat_h = feat_ymax - feat_ymin
146     feat_w = feat_xmax - feat_xmin
147     # Encode features.
148     feat_cy = (feat_cy - yref) / href / prior_scaling[0]   #(预测真实边界框中心y-参考框中心y)/参考框高/缩放尺度
149     feat_cx = (feat_cx - xref) / wref / prior_scaling[1]   
150     feat_h = tf.log(feat_h / href) / prior_scaling[2]      #log(预测真实边界框高h/参考框高h)/缩放尺度
151     feat_w = tf.log(feat_w / wref) / prior_scaling[3]
152     # Use SSD ordering: x / y / w / h instead of ours.
153     feat_localizations = tf.stack([feat_cx, feat_cy, feat_w, feat_h], axis=-1)  #返回(cx转换值,cy转换值,w转换值,h转换值)形式的边界框的预测值(其实是预测框相对于参考框的转换)
154     return feat_labels, feat_localizations, feat_scores                         #返回目标标签,目标预测值(位置转换值),目标置信度
155     #经过我们回归得到的变换,经过变换得到真实框,所以这个地方损失函数其实是我们预测的是变换,我们实际的框和anchor之间的变换和我们预测的变换之间的loss。我们回归的是一种变换。并不是直接预测框,这个和YOLO是不一样的。和Faster RCNN是一样的
def tf_ssd_bboxes_encode(labels,
                         bboxes,
                         anchors,
                         num_classes,
                         no_annotation_label,
                         ignore_threshold=0.5,
                         prior_scaling=[0.1, 0.1, 0.2, 0.2],
                         dtype=tf.float32,
                         scope='ssd_bboxes_encode'):
    """Encode groundtruth labels and bounding boxes using SSD net anchors.
    Encoding boxes for all feature layers.

    Arguments:
      labels: 1D Tensor(int64) containing groundtruth labels;
      bboxes: Nx4 Tensor(float) with bboxes relative coordinates;
      anchors: List of Numpy array with layer anchors (y, x, h, w), y and x
        being the centers;
      num_classes: number of classes;
      no_annotation_label: label of non-annotated boxes;
      ignore_threshold: forwarded as is to the per-layer encoding;
      prior_scaling: Scaling of encoded coordinates;
      dtype: float type of the targets;
      scope: name scope.

    Return:
      (target_labels, target_localizations, target_scores):
        Each element is a list of target Tensors, one per feature layer.
    """
    with tf.name_scope(scope):
        # One (labels, localizations, scores) triple per feature layer.
        per_layer = []
        for i, anchors_layer in enumerate(anchors):
            with tf.name_scope('bboxes_encode_block_%i' % i):
                layer_labels, layer_loc, layer_scores = \
                    tf_ssd_bboxes_encode_layer(labels, bboxes, anchors_layer,
                                               num_classes, no_annotation_label,
                                               ignore_threshold,
                                               prior_scaling, dtype)
                per_layer.append((layer_labels, layer_loc, layer_scores))
        # Regroup the triples into three parallel lists.
        target_labels = [triple[0] for triple in per_layer]
        target_localizations = [triple[1] for triple in per_layer]
        target_scores = [triple[2] for triple in per_layer]
        return target_labels, target_localizations, target_scores
def tf_ssd_bboxes_decode_layer(feat_localizations,
                               anchors_layer,
                               prior_scaling=[0.1, 0.1, 0.2, 0.2]):
    """Compute the relative bounding boxes from the layer features and
    reference anchor bounding boxes.

    This inverts the anchor-relative encoding: the predicted transformation
    is applied to the anchors to recover the boxes.

    Arguments:
      feat_localizations: Tensor containing localization features; its last
        axis holds (cx, cy, w, h) encodings and four axes precede it.
      anchors_layer: numpy arrays (y, x, h, w) of the layer anchor boxes.
      prior_scaling: scaling factors of the encoding, in (cx, cy, w, h)
        order.

    Return:
      Tensor whose last axis holds ymin, xmin, ymax, xmax.
    """
    yref, xref, href, wref = anchors_layer

    # Encoded components, read from the last axis.
    enc_cx = feat_localizations[:, :, :, :, 0]
    enc_cy = feat_localizations[:, :, :, :, 1]
    enc_w = feat_localizations[:, :, :, :, 2]
    enc_h = feat_localizations[:, :, :, :, 3]

    # Center, width and height of the decoded boxes.
    center_x = enc_cx * wref * prior_scaling[0] + xref
    center_y = enc_cy * href * prior_scaling[1] + yref
    width = wref * tf.exp(enc_w * prior_scaling[2])
    height = href * tf.exp(enc_h * prior_scaling[3])

    # Corner coordinates.
    half_h = height / 2.
    half_w = width / 2.
    corners = [center_y - half_h,
               center_x - half_w,
               center_y + half_h,
               center_x + half_w]
    return tf.stack(corners, axis=-1)
def tf_ssd_bboxes_decode(feat_localizations,
                         anchors,
                         prior_scaling=[0.1, 0.1, 0.2, 0.2],
                         scope='ssd_bboxes_decode'):
    """Compute the relative bounding boxes from the SSD net features and
    reference anchors bounding boxes.

    Arguments:
      feat_localizations: List of Tensors containing localization features,
        one per feature layer.
      anchors: List of numpy array containing anchor boxes, one per layer.
      prior_scaling: scaling factors of the encoding.
      scope: name scope.

    Return:
      List of Tensors (one per layer) whose last axis holds
      ymin, xmin, ymax, xmax.
    """
    with tf.name_scope(scope):
        # Decode every layer with its own anchors.
        return [
            tf_ssd_bboxes_decode_layer(feat_localizations[idx],
                                       layer_anchors,
                                       prior_scaling)
            for idx, layer_anchors in enumerate(anchors)
        ]
245 # =========================================================================== #
246 # SSD boxes selection.
247 # =========================================================================== #
def tf_ssd_bboxes_select_layer(predictions_layer, localizations_layer,
                               select_threshold=None,
                               num_classes=21,
                               ignore_class=0,
                               scope=None):
    """Extract classes, scores and bounding boxes from features in one layer.
    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_layer: A SSD prediction layer;
      localizations_layer: A SSD localization layer;
      select_threshold: Classification threshold for selecting a box. All
        boxes under the threshold are set to 'zero'. If None, a threshold of
        0.0 is used.
      num_classes: number of classes, `ignore_class` included;
      ignore_class: class left out of the output (0 by default);
      scope: optional name scope.

    Return:
      d_scores, d_bboxes: Dictionary of scores and bboxes Tensors of
        size Batches X N (scores) and Batches X N x 4 (bboxes). Each key
        corresponding to a class.
    """
    if select_threshold is None:
        select_threshold = 0.0
    with tf.name_scope(scope, 'ssd_bboxes_select_layer',
                       [predictions_layer, localizations_layer]):
        # Reshape features: Batches x N x N_labels | 4
        pred_shape = tfe.get_shape(predictions_layer)
        flat_pred_shape = tf.stack([pred_shape[0], -1, pred_shape[-1]])
        predictions_layer = tf.reshape(predictions_layer, flat_pred_shape)
        loc_shape = tfe.get_shape(localizations_layer)
        flat_loc_shape = tf.stack([loc_shape[0], -1, loc_shape[-1]])
        localizations_layer = tf.reshape(localizations_layer, flat_loc_shape)

        d_scores = {}
        d_bboxes = {}
        for c in range(num_classes):
            if c == ignore_class:
                continue
            # Zero out the scores and boxes under the threshold.
            class_scores = predictions_layer[:, :, c]
            keep = tf.cast(tf.greater_equal(class_scores, select_threshold),
                           class_scores.dtype)
            d_scores[c] = class_scores * keep
            d_bboxes[c] = localizations_layer * tf.expand_dims(keep, axis=-1)

        return d_scores, d_bboxes
def tf_ssd_bboxes_select(predictions_net, localizations_net,
                         select_threshold=None,
                         num_classes=21,
                         ignore_class=0,
                         scope=None):
    """Filter class scores and boxes over all SSD output layers.

    Runs tf_ssd_bboxes_select_layer on every (prediction, localization) layer
    pair and concatenates the per-class results along axis 1.
    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_net: List of SSD prediction layers;
      localizations_net: List of localization layers, indexed in step with
        predictions_net;
      select_threshold: Classification threshold for selecting a box,
        forwarded unchanged to the per-layer selection (None means no
        threshold is applied there).
      num_classes: Number of classes, background included; forwarded to the
        per-layer selection.
      ignore_class: Class index to skip (0, the background class, by
        default); forwarded to the per-layer selection.
      scope: Optional name scope; 'ssd_bboxes_select' is the default name.

    Return:
      d_scores, d_bboxes: Dictionaries keyed by class index, holding the
        scores and the box coordinates of that class gathered from all
        layers.
    """
    with tf.name_scope(scope, 'ssd_bboxes_select',
                       [predictions_net, localizations_net]):
        # One {class: Tensor} dictionary per feature layer.
        layer_scores = []
        layer_bboxes = []
        for idx, layer_predictions in enumerate(predictions_net):
            scores_by_class, bboxes_by_class = tf_ssd_bboxes_select_layer(
                layer_predictions,
                localizations_net[idx],
                select_threshold,
                num_classes,
                ignore_class)
            layer_scores.append(scores_by_class)
            layer_bboxes.append(bboxes_by_class)

        # Merge the layers class by class; the first layer's keys drive the
        # set of classes.
        d_scores = {}
        d_bboxes = {}
        for class_id in layer_scores[0]:
            d_scores[class_id] = tf.concat(
                [entry[class_id] for entry in layer_scores], axis=1)
            d_bboxes[class_id] = tf.concat(
                [entry[class_id] for entry in layer_bboxes], axis=1)
        return d_scores, d_bboxes
def tf_ssd_bboxes_select_layer_all_classes(predictions_layer, localizations_layer,
                                           select_threshold=None):
    """Pick the best class, its score and the box for each anchor of a layer.

    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_layer: A SSD prediction layer; flattened here to
        Batches x N x N_labels.
      localizations_layer: A SSD localization layer; flattened here keeping
        its first and last dimensions. Assumed to be already decoded.
      select_threshold: Classification threshold for selecting a box. If None
        or 0, the best class is taken over all labels and the score is zeroed
        wherever that best class is class 0. Otherwise class 0 is excluded
        from the search, and both class and score are zeroed unless the score
        is strictly greater than the threshold.

    Return:
      classes, scores, bboxes: Best class index and its score per anchor, and
        the flattened localization Tensor (returned without masking).
    """
    # Collapse every axis between the batch axis and the last axis.
    pred_dims = tfe.get_shape(predictions_layer)
    flat_predictions = tf.reshape(
        predictions_layer, tf.stack([pred_dims[0], -1, pred_dims[-1]]))
    loc_dims = tfe.get_shape(localizations_layer)
    flat_localizations = tf.reshape(
        localizations_layer, tf.stack([loc_dims[0], -1, loc_dims[-1]]))

    no_threshold = select_threshold is None or select_threshold == 0
    if no_threshold:
        # Best label over every class; anchors won by class 0 score 0.
        classes = tf.argmax(flat_predictions, axis=2)
        scores = tf.reduce_max(flat_predictions, axis=2)
        is_object = tf.cast(classes > 0, scores.dtype)
        scores = scores * is_object
    else:
        # Search classes 1.. only; +1 maps the argmax back to label indices.
        foreground = flat_predictions[:, :, 1:]
        classes = tf.argmax(foreground, axis=2) + 1
        scores = tf.reduce_max(foreground, axis=2)
        # Only keep predictions higher than the threshold.
        above = tf.greater(scores, select_threshold)
        classes = classes * tf.cast(above, classes.dtype)
        scores = scores * tf.cast(above, scores.dtype)
    # Localizations are passed through as they are.
    return classes, scores, flat_localizations
def tf_ssd_bboxes_select_all_classes(predictions_net, localizations_net,
                                     select_threshold=None,
                                     scope=None):
    """Pick the best class, score and box per anchor over all SSD layers.

    Runs tf_ssd_bboxes_select_layer_all_classes on every (prediction,
    localization) layer pair and concatenates the results along axis 1.
    Batch-compatible: inputs are supposed to have batch-type shapes.

    Args:
      predictions_net: List of SSD prediction layers;
      localizations_net: List of localization layers, indexed in step with
        predictions_net;
      select_threshold: Classification threshold for selecting a box,
        forwarded unchanged to the per-layer selection.
      scope: Optional name scope; 'ssd_bboxes_select' is the default name.

    Return:
      classes, scores, bboxes: Tensors gathering the per-layer classes,
        scores and boxes of all feature layers.
    """
    with tf.name_scope(scope, 'ssd_bboxes_select',
                       [predictions_net, localizations_net]):
        # One (classes, scores, bboxes) triple per feature layer.
        layer_outputs = [
            tf_ssd_bboxes_select_layer_all_classes(layer_predictions,
                                                   localizations_net[idx],
                                                   select_threshold)
            for idx, layer_predictions in enumerate(predictions_net)
        ]

        # Stitch the layers together along the anchor axis.
        classes = tf.concat([out[0] for out in layer_outputs], axis=1)
        scores = tf.concat([out[1] for out in layer_outputs], axis=1)
        bboxes = tf.concat([out[2] for out in layer_outputs], axis=1)
        return classes, scores, bboxes


