Single Shot MultiBox Detector (arxiv.org)
清欢守护者:精读深度学习论文(16) SSD (zhuanlan.zhihu.com)
num_classes
的预测结果对应 ground Truth 一个数字。preprocess_for_train
和 preprocess_for_eval
,主要输入数据介绍如下:
image
:一张普通的RGB编码图片。labels
:shape为(N, ),N为图片中 Ground Truth 物体数量,记录所有物体类型标签。bboxes
:shape为(N, 4),其中数据分别是[ymin, xmin, ymax, xmax]
,数值范围都在(0, 1]之间。清欢守护者:Inside TF-Slim(13) preprocessing(图像增强相关) (zhuanlan.zhihu.com)
preprocess_for_train
流程介绍
preprocess_for_eval
流程介绍:
[0, 0, 1, 1]
的记录,后续通过该记录获取输出图片。ssd_vgg_300.py
或ssd_vgg_512.py
中的ssd_net
方法与ssd_multibox_layer
方法。predictions, localisations, logits, end_points
,分别表示bbox分类预测值(经过softmax)、bbox位移预测值、bbox分类预测值(未经过softmax)、模型节点。ssd_vgg_300.py
或 ssd_vgg_512.py
中的 ssd_anchors_all_layers
和 ssd_anchor_one_layer
。1:1
的anchor有特殊处理。ssd_anchors_all_layers
解析:
ssd_anchor_one_layer
来完成的。img_shape
:没什么难度,输入数据的尺寸(经过preprocessing之后的结果)。layers_shape
:要进行预测的特征图的尺寸列表。anchor_sizes
:anchor的基准尺寸(以像素为单位的边长,而非面积;本来是要进行计算,但这里直接给出数值),每个特征图分别获取。
anchor_ratios
:anchor的长宽比例(不包括1:1的anchor),每个特征图分别获取。anchor_steps
:特征图中每一点代表的实际像素值,每个特征图分别获取。offset
:anchor中心点在当前特征栅格中的位置。anchor_sizes, anchor_ratios, anchor_steps
# anchor_sizes, anchor_ratios and anchor_steps each hold one entry per
# feature map.
def ssd_anchors_all_layers(img_shape,
                           layers_shape,
                           anchor_sizes,
                           anchor_ratios,
                           anchor_steps,
                           offset=0.5,
                           dtype=np.float32):
    """Build the default anchors for every prediction feature map.

    Args:
        img_shape: forwarded unchanged to ``ssd_anchor_one_layer``.
        layers_shape: sequence with one feature-map shape per layer.
        anchor_sizes: per-layer anchor sizes, indexed in step with
            ``layers_shape``.
        anchor_ratios: per-layer aspect ratios, indexed the same way.
        anchor_steps: per-layer step values, indexed the same way.
        offset: forwarded unchanged to ``ssd_anchor_one_layer``.
        dtype: forwarded unchanged to ``ssd_anchor_one_layer``.

    Returns:
        A list holding one ``ssd_anchor_one_layer`` result per entry of
        ``layers_shape``, in the same order.
    """
    # Index lookups (rather than zip) keep the original failure mode: a
    # per-layer list shorter than layers_shape raises IndexError.
    return [
        ssd_anchor_one_layer(img_shape, feat_shape,
                             anchor_sizes[idx],
                             anchor_ratios[idx],
                             anchor_steps[idx],
                             offset=offset, dtype=dtype)
        for idx, feat_shape in enumerate(layers_shape)
    ]
ssd_anchor_one_layer
解析
y, x, h, w
:
x, y
代表中心点位置,shape为(N, M, 1),其中(N, M)为特征图尺寸。h, w
# h and w hold the anchor side lengths; each has shape (N,), where N is the
# number of anchors per feature-map cell.
def ssd_anchor_one_layer(img_shape,
                         feat_shape,
                         sizes,
                         ratios,
                         step,
                         offset=0.5,
                         dtype=np.float32):
    """Compute the default anchors of a single feature map.

    Args:
        img_shape: (height, width) used to normalise every coordinate.
        feat_shape: (rows, cols) of the feature map.
        sizes: anchor sizes; ``sizes[0]`` is the base size and, when present,
            ``sizes[1]`` feeds the second 1:1 anchor. Further entries are
            never read, although they still count towards the anchor total.
        ratios: aspect ratios applied to the base size.
        step: multiplier turning a feature-map index into an image offset.
        offset: position of the anchor centre inside a feature-map cell.
        dtype: numpy dtype of the returned arrays.

    Returns:
        Tuple ``(y, x, h, w)``. ``y`` and ``x`` are the centre coordinates
        with shape (rows, cols, 1); ``h`` and ``w`` are the side lengths with
        shape (len(sizes) + len(ratios),). All values are divided by the
        matching ``img_shape`` dimension.
    """
    img_h = img_shape[0]
    img_w = img_shape[1]

    # Anchor centres, one per feature-map cell.
    rows, cols = np.mgrid[0:feat_shape[0], 0:feat_shape[1]]
    y = np.expand_dims((rows.astype(dtype) + offset) * step / img_h, axis=-1)
    x = np.expand_dims((cols.astype(dtype) + offset) * step / img_w, axis=-1)

    # One slot per entry of `sizes` (the 1:1 anchors) plus one per ratio.
    num_anchors = len(sizes) + len(ratios)
    h = np.zeros((num_anchors, ), dtype=dtype)
    w = np.zeros((num_anchors, ), dtype=dtype)

    # First 1:1 anchor uses the base size directly.
    base = sizes[0]
    h[0] = base / img_h
    w[0] = base / img_w
    first_ratio_slot = 1

    # Optional second 1:1 anchor: geometric mean of the first two sizes.
    # At most two 1:1 anchors are ever produced.
    if len(sizes) > 1:
        geo_mean = math.sqrt(base * sizes[1])
        h[1] = geo_mean / img_h
        w[1] = geo_mean / img_w
        first_ratio_slot = 2

    # Remaining anchors stretch the base size by each aspect ratio.
    for slot, ratio in enumerate(ratios, start=first_ratio_slot):
        root = math.sqrt(ratio)
        h[slot] = base / img_h / root
        w[slot] = base / img_w * root
    return y, x, h, w
ssd_common.py
中的 tf_ssd_bboxes_encode
和 tf_ssd_bboxes_encode_layer
方法。tf_ssd_bboxes_encode
解析
tf_ssd_bboxes_encode_layer
完成。labels
和 bboxes
是之前preprocessing的结果。anchors
是之前获取的所有默认anchors,即一组y, x, h, w
# `anchors` holds the default anchors obtained earlier: one (y, x, h, w)
# group per feature layer.
def tf_ssd_bboxes_encode(labels,
                         bboxes,
                         anchors,
                         num_classes,
                         no_annotation_label,
                         ignore_threshold=0.5,
                         prior_scaling=[0.1, 0.1, 0.2, 0.2],
                         dtype=tf.float32,
                         scope='ssd_bboxes_encode'):
    """Encode ground-truth labels and boxes against every anchor layer.

    The per-layer work is delegated to ``tf_ssd_bboxes_encode_layer``; this
    function only iterates over the layers and collects the results.

    Args:
        labels: ground-truth labels, forwarded to the per-layer encoder.
        bboxes: ground-truth boxes, forwarded to the per-layer encoder.
        anchors: iterable with one anchor group per feature layer.
        num_classes: forwarded to the per-layer encoder.
        no_annotation_label: forwarded to the per-layer encoder.
        ignore_threshold: forwarded to the per-layer encoder.
        prior_scaling: forwarded to the per-layer encoder. The default is a
            list shared between calls; it is only passed on here.
        dtype: forwarded to the per-layer encoder.
        scope: name of the enclosing ``tf.name_scope``.

    Returns:
        Tuple ``(target_labels, target_localizations, target_scores)``; each
        element is a list with one tensor per anchor layer.
    """
    with tf.name_scope(scope):
        target_labels, target_localizations, target_scores = [], [], []
        for layer_idx, layer_anchors in enumerate(anchors):
            with tf.name_scope('bboxes_encode_block_%i' % layer_idx):
                encoded = tf_ssd_bboxes_encode_layer(
                    labels, bboxes, layer_anchors,
                    num_classes, no_annotation_label,
                    ignore_threshold,
                    prior_scaling, dtype)
                layer_labels, layer_loc, layer_scores = encoded
                target_labels.append(layer_labels)
                target_localizations.append(layer_loc)
                target_scores.append(layer_scores)
        return target_labels, target_localizations, target_scores
tf_ssd_bboxes_encode_layer
# Walkthrough of tf_ssd_bboxes_encode_layer.
def tf_ssd_bboxes_encode_layer(labels,
                               bboxes,
                               anchors_layer,
                               num_classes,
                               no_annotation_label,
                               ignore_threshold=0.5,
                               prior_scaling=[0.1, 0.1, 0.2, 0.2],
                               dtype=tf.float32):
    """Encode ground-truth labels and boxes against the anchors of one layer.

    Every ground-truth box is compared with every anchor of the layer. Each
    anchor keeps the label, Jaccard score and box of the ground truth it
    overlaps best; the kept box is then expressed as scaled offsets relative
    to the anchor.

    Args:
        labels: 1-D tensor of ground-truth labels, indexed as ``labels[i]``.
        bboxes: ground-truth boxes, indexed as ``bboxes[i]``, each read as
            [ymin, xmin, ymax, xmax].
        anchors_layer: tuple ``(yref, xref, href, wref)`` for this layer.
        num_classes: a ground truth is only used when ``label < num_classes``.
        no_annotation_label: only referenced by commented-out code below.
        ignore_threshold: only referenced by commented-out code below.
        prior_scaling: four divisors applied to the cy, cx, h and w offsets.
        dtype: dtype of the float tensors created here.

    Returns:
        Tuple ``(feat_labels, feat_localizations, feat_scores)``.
        ``feat_localizations`` stacks [cx, cy, w, h] on a new last axis.
    """
    # `y, x, h, w`
    # `x, y` are the anchor centres, shape (N, M, 1), (N, M) being the
    # feature-map size.
    # `h, w` are the side lengths, shape (N, ), N being the number of anchors
    # per feature-map cell.
    # All values lie in [0, 1]: they are positions relative to the whole image.
    yref, xref, href, wref = anchors_layer
    # Convert centre/size form into corner form.
    ymin = yref - href / 2.
    xmin = xref - wref / 2.
    ymax = yref + href / 2.
    xmax = xref + wref / 2.
    # Anchor areas.
    vol_anchors = (xmax - xmin) * (ymax - ymin)
    # shape is (feature_map_height, feature_map_width,
    # anchors_per_feature_map_point), i.e. one entry per anchor of the layer.
    shape = (yref.shape[0], yref.shape[1], href.size)
    # Initial state: label 0, score 0 and the box [0, 0, 1, 1] for every anchor.
    feat_labels = tf.zeros(shape, dtype=tf.int64)
    feat_scores = tf.zeros(shape, dtype=dtype)
    feat_ymin = tf.zeros(shape, dtype=dtype)
    feat_xmin = tf.zeros(shape, dtype=dtype)
    feat_ymax = tf.ones(shape, dtype=dtype)
    feat_xmax = tf.ones(shape, dtype=dtype)

    def jaccard_with_anchors(bbox):
        """Compute the Jaccard score (IoU) of one bbox with all anchors."""
        int_ymin = tf.maximum(ymin, bbox[0])
        int_xmin = tf.maximum(xmin, bbox[1])
        int_ymax = tf.minimum(ymax, bbox[2])
        int_xmax = tf.minimum(xmax, bbox[3])
        # Clamp at zero so disjoint boxes give an empty intersection.
        h = tf.maximum(int_ymax - int_ymin, 0.)
        w = tf.maximum(int_xmax - int_xmin, 0.)
        # Volumes.
        inter_vol = h * w
        union_vol = vol_anchors - inter_vol \
            + (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
        jaccard = tf.div(inter_vol, union_vol)
        return jaccard

    def intersection_with_anchors(bbox):
        """Compute the intersection of one bbox with each anchor, as a
        fraction of the anchor area.

        Only referenced from the commented-out code in `body`.
        """
        int_ymin = tf.maximum(ymin, bbox[0])
        int_xmin = tf.maximum(xmin, bbox[1])
        int_ymax = tf.minimum(ymax, bbox[2])
        int_xmax = tf.minimum(xmax, bbox[3])
        h = tf.maximum(int_ymax - int_ymin, 0.)
        w = tf.maximum(int_xmax - int_xmin, 0.)
        inter_vol = h * w
        scores = tf.div(inter_vol, vol_anchors)
        return scores

    def condition(i, feat_labels, feat_scores,
                  feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """ i < len(labels) """
        # tf.shape returns a 1-D tensor, so the comparison is too; take its
        # only element as the loop predicate.
        r = tf.less(i, tf.shape(labels))
        return r[0]

    def body(i, feat_labels, feat_scores,
             feat_ymin, feat_xmin, feat_ymax, feat_xmax):
        """Fold the i-th ground truth into the per-anchor state."""
        label = labels[i]
        bbox = bboxes[i]
        jaccard = jaccard_with_anchors(bbox)
        # Update condition:
        # cur_jaccard > scores && scores > -0.5 && label < num_classes
        # (the extra `jaccard > matching_threshold` test is commented out).
        mask = tf.greater(jaccard, feat_scores)
        # mask = tf.logical_and(mask, tf.greater(jaccard, matching_threshold))
        mask = tf.logical_and(mask, feat_scores > -0.5)
        mask = tf.logical_and(mask, label < num_classes)
        imask = tf.cast(mask, tf.int64)
        fmask = tf.cast(mask, dtype)
        # Anchors that satisfy the mask take this ground truth's label, score
        # and box; all other anchors keep their previous values.
        feat_labels = imask * label + (1 - imask) * feat_labels
        feat_scores = tf.where(mask, jaccard, feat_scores)
        feat_ymin = fmask * bbox[0] + (1 - fmask) * feat_ymin
        feat_xmin = fmask * bbox[1] + (1 - fmask) * feat_xmin
        feat_ymax = fmask * bbox[2] + (1 - fmask) * feat_ymax
        feat_xmax = fmask * bbox[3] + (1 - fmask) * feat_xmax
        # Check no annotation label: ignore these anchors...
        # interscts = intersection_with_anchors(bbox)
        # mask = tf.logical_and(interscts > ignore_threshold,
        #                       label == no_annotation_label)
        # # Replace scores by -1.
        # feat_scores = tf.where(mask, -tf.cast(mask, dtype), feat_scores)
        return [i+1, feat_labels, feat_scores,
                feat_ymin, feat_xmin, feat_ymax, feat_xmax]

    # In essence: iterate over every ground-truth label and compare that
    # ground truth with all anchors. The result gives, for each anchor, its
    # label, its score (best Jaccard with any ground truth) and the matching
    # ground-truth bbox.
    i = 0
    [i, feat_labels, feat_scores,
     feat_ymin, feat_xmin,
     feat_ymax, feat_xmax] = tf.while_loop(condition, body,
                                           [i, feat_labels, feat_scores,
                                            feat_ymin, feat_xmin,
                                            feat_ymax, feat_xmax])
    # Convert the matched boxes back to centre/size form.
    feat_cy = (feat_ymax + feat_ymin) / 2.
    feat_cx = (feat_xmax + feat_xmin) / 2.
    feat_h = feat_ymax - feat_ymin
    feat_w = feat_xmax - feat_xmin
    # Turn them into offsets relative to the anchors (the network predicts
    # these offsets) and apply the prior scaling.
    feat_cy = (feat_cy - yref) / href / prior_scaling[0]
    feat_cx = (feat_cx - xref) / wref / prior_scaling[1]
    feat_h = tf.log(feat_h / href) / prior_scaling[2]
    feat_w = tf.log(feat_w / wref) / prior_scaling[3]
    # Note the output order: x before y and w before h.
    feat_localizations = tf.stack([feat_cx, feat_cy, feat_w, feat_h], axis=-1)
    return feat_labels, feat_localizations, feat_scores
ssd_vgg_300.py
或 ssd_vgg_512.py
中的 ssd_losses
函数。logits
:anchor分类预测结果,每一个特征图一个数据。每个特征图中,数据shape为[batch_size, feature_map_height, feature_map_width, num_anchors, num_classes]
。localisations
:anchor的bbox预测结果,每个特征图一个数据。每个特征图中,数据shape为[batch_size, feature_map_height, feature_map_width, num_anchors, 4]
。gclasses, glocalisations, gscores
均来自Ground Truth 预处理。shape均为[feature_map_height, feature_map_width, num_anchors]
。negative_ratio
# Hard negative mining below is controlled by `negative_ratio`.
def ssd_losses(logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
               negative_ratio=3.,
               alpha=1.,
               label_smoothing=0.,
               device='/cpu:0',
               scope=None):
    """Register the SSD classification and localisation losses.

    Three losses are added through ``tf.losses.add_loss``: cross-entropy on
    positive anchors, cross-entropy on the selected hard negatives, and a
    smooth-L1 localisation loss on positive anchors. Each is a sum divided by
    the batch size. Nothing is returned.

    Args:
        logits: list with one class-logit tensor per feature map; the first
            one is read as rank 5 with batch size first and class count last.
        localisations: list of predicted box offsets, one per feature map.
        gclasses: list of ground-truth class targets, one per feature map.
        glocalisations: list of ground-truth box offsets, one per feature map.
        gscores: list of ground-truth matching scores, one per feature map.
        match_threshold: anchors with a score above this are positives.
        negative_ratio: negatives kept per positive before the batch-size
            term is added.
        alpha: weight applied to the localisation loss.
        label_smoothing: accepted but not used in this body.
        device: accepted but not used in this body.
        scope: optional name scope; defaults to 'ssd_losses'.
    """
    with tf.name_scope(scope, 'ssd_losses'):
        first_shape = tfe.get_shape(logits[0], 5)
        num_classes = first_shape[-1]
        batch_size = first_shape[0]

        # Flatten every per-layer tensor. The five reshapes stay interleaved
        # per layer so that graph ops are created in the original order.
        flat_logits, flat_gclasses, flat_gscores = [], [], []
        flat_loc, flat_gloc = [], []
        for idx, layer_logits in enumerate(logits):
            flat_logits.append(tf.reshape(layer_logits, [-1, num_classes]))
            flat_gclasses.append(tf.reshape(gclasses[idx], [-1]))
            flat_gscores.append(tf.reshape(gscores[idx], [-1]))
            flat_loc.append(tf.reshape(localisations[idx], [-1, 4]))
            flat_gloc.append(tf.reshape(glocalisations[idx], [-1, 4]))

        all_logits = tf.concat(flat_logits, axis=0)
        all_gclasses = tf.concat(flat_gclasses, axis=0)
        all_gscores = tf.concat(flat_gscores, axis=0)
        all_loc = tf.concat(flat_loc, axis=0)
        all_gloc = tf.concat(flat_gloc, axis=0)
        dtype = all_logits.dtype

        # Positives: anchors whose matching score exceeds the threshold.
        pos_mask = all_gscores > match_threshold
        pos_weights = tf.cast(pos_mask, dtype)
        n_positives = tf.reduce_sum(pos_weights)

        # Hard negative mining. The negative labels are the positive mask
        # cast to int32, so they are 0 wherever the anchor is not positive.
        negative_labels = tf.cast(pos_mask, tf.int32)
        predictions = slim.softmax(all_logits)
        not_positive = tf.logical_not(pos_mask)
        not_ignored = all_gscores > -0.5
        neg_mask = tf.logical_and(not_positive, not_ignored)
        neg_weights = tf.cast(neg_mask, dtype)
        # Candidate negatives carry their class-0 probability; every other
        # anchor gets 1.0 so it ranks last in the top-k below.
        class0_prob = predictions[:, 0]
        neg_values = tf.where(neg_mask, class0_prob, 1. - neg_weights)
        neg_values_flat = tf.reshape(neg_values, [-1])

        # Keep negative_ratio negatives per positive (plus batch_size), capped
        # at the number of available negatives.
        max_neg_entries = tf.cast(tf.reduce_sum(neg_weights), tf.int32)
        n_neg = tf.cast(negative_ratio * n_positives, tf.int32) + batch_size
        n_neg = tf.minimum(n_neg, max_neg_entries)
        # top_k on the negated values selects the n_neg smallest class-0
        # probabilities; the last one is the cut-off.
        top_vals, _ = tf.nn.top_k(-neg_values_flat, k=n_neg)
        max_hard_pred = -top_vals[-1]
        neg_mask = tf.logical_and(neg_mask, neg_values < max_hard_pred)
        neg_weights = tf.cast(neg_mask, dtype)

        # Classification loss on positives.
        with tf.name_scope('cross_entropy_pos'):
            pos_ce = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=all_logits, labels=all_gclasses)
            pos_loss = tf.div(tf.reduce_sum(pos_ce * pos_weights),
                              batch_size, name='value')
            tf.losses.add_loss(pos_loss)

        # Classification loss on the mined negatives.
        with tf.name_scope('cross_entropy_neg'):
            neg_ce = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=all_logits, labels=negative_labels)
            neg_loss = tf.div(tf.reduce_sum(neg_ce * neg_weights),
                              batch_size, name='value')
            tf.losses.add_loss(neg_loss)

        # Box regression loss (smooth L1), positives only.
        with tf.name_scope('localization'):
            loc_weights = tf.expand_dims(alpha * pos_weights, axis=-1)
            loc_err = custom_layers.abs_smooth(all_loc - all_gloc)
            loc_loss = tf.div(tf.reduce_sum(loc_err * loc_weights),
                              batch_size, name='value')
            tf.losses.add_loss(loc_loss)
def process_image(img, select_threshold=0.5, nms_threshold=.45, net_shape=(300, 300)):
    """Run the detector on one image and post-process its predictions.

    Relies on module-level names defined elsewhere (``isess``, ``image_4d``,
    ``predictions``, ``localisations``, ``bbox_img``, ``img_input``,
    ``ssd_anchors`` and ``np_methods``).

    Args:
        img: image fed to ``img_input``.
        select_threshold: score threshold passed to ``ssd_bboxes_select``.
        nms_threshold: threshold passed to ``bboxes_nms``.
        net_shape: passed to ``ssd_bboxes_select`` as ``img_shape``.

    Returns:
        Tuple ``(classes, scores, bboxes)`` as produced by the final
        ``np_methods`` steps.
    """
    # Run the model. The 4-D image is still fetched, as before, but unused.
    fetches = [image_4d, predictions, localisations, bbox_img]
    _, net_predictions, net_localisations, img_bbox = isess.run(
        fetches, feed_dict={img_input: img})

    # Filter boxes by score: yields class labels, scores and box bounds.
    det_classes, det_scores, det_bboxes = np_methods.ssd_bboxes_select(
        net_predictions, net_localisations, ssd_anchors,
        select_threshold=select_threshold, img_shape=net_shape,
        num_classes=21, decode=True)

    # Boxes must not extend beyond the original image.
    det_bboxes = np_methods.bboxes_clip(img_bbox, det_bboxes)

    # Sort by descending score, reordering classes and boxes to match.
    det_classes, det_scores, det_bboxes = np_methods.bboxes_sort(
        det_classes, det_scores, det_bboxes, top_k=400)

    # Non-maximum suppression.
    det_classes, det_scores, det_bboxes = np_methods.bboxes_nms(
        det_classes, det_scores, det_bboxes, nms_threshold=nms_threshold)

    # Rescale the boxes using the original image's bbox.
    det_bboxes = np_methods.bboxes_resize(img_bbox, det_bboxes)
    return det_classes, det_scores, det_bboxes
转载https://zhuanlan.zhihu.com/p/38013662