SSD网络在目标检测领域速度快、实时性好,而且检测精度也很高(mAP为74.3)。作为单阶段(one-stage)检测网络,效果是非常不错的。我们今天的主要任务就是解读SSD网络中的各种box及相关的源代码。
1.原文中出现的各种box解读
SSD之所以能预测目标在图片中的位置,是因为SSD对目标框的坐标(xmin,ymin,xmax,ymax)进行回归运算。图片中的任何一个矩形区域,由左上角top-left和右下角bottom-right两个顶点即可唯一确定。
ground truth box:是你训练集中,标注好的待检测类别的位置,即真实的位置。
prior box:是在feature map上每一个点上生成的某一类别图片的位置。feature map每一点生成4或6个box(具体生成多少个box,数量是事先指定的)。每一个box的长宽比例从[1,2,1/2,3,1/3]中选择;长宽比为1时,还会额外生成一个边长为sqrt(smax*smin)的box。
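default box:则是经过IoU阈值筛选后,剩下的可能性高的box。这个box才是会被真正送去回归(做回归前,先要对ground truth box进行相关的变换,变换过程为:由顶点坐标式→(x-central,y-central,w,h)→再转换为相对于default box的偏移量)。这个过程也被称为encode过程。原文中的转换公式如下(g表示ground truth box,d表示default box):
$$\hat{g}^{cx} = \frac{g^{cx} - d^{cx}}{d^{w}}, \qquad \hat{g}^{cy} = \frac{g^{cy} - d^{cy}}{d^{h}}, \qquad \hat{g}^{w} = \log\frac{g^{w}}{d^{w}}, \qquad \hat{g}^{h} = \log\frac{g^{h}}{d^{h}}$$
源码中在此基础上,还会把每一项再除以对应的variance。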
2.部分算法原理介绍
IoU计算:2个矩形的交集怎么计算呢?其实很简单,只需要确定相交区域的左上角顶点和右下角顶点即可。
交集的左上角顶点确定方法如下:
Ixmin=max(xmin_A,xmin_B); xmin_A,xmin_B分别表示矩形区域A,B的左上角顶点横坐标,后面自行类比,不再赘述。
Iymin=max(ymin_A,ymin_B);
交集的右下角顶点确定方法如下:
Ixmax=min(xmax_A,xmax_B);
Iymax=min(ymax_A,ymax_B);
若Ixmax<=Ixmin或Iymax<=Iymin,则两个矩形不相交,交集面积为0。
自己看不懂,可以画个草图演算,看究竟是不是这样。
3.源码解读
"""Some utils for SSD."""
import numpy as np
import tensorflow as tf
class BBoxUtility(object):
    """Utility class to do some stuff with bounding boxes and priors.

    # Arguments
        num_classes: Number of classes including background.
        priors: Priors and variances, numpy tensor of shape (num_priors, 8),
            priors[i] = [xmin, ymin, xmax, ymax, varxc, varyc, varw, varh].
        overlap_threshold: Threshold to assign box to a prior.
        nms_thresh: Nms threshold.
        top_k: Number of total bboxes to be kept per image after nms step.

    # References
        https://arxiv.org/abs/1512.02325
    """

    def __init__(self, num_classes, priors=None, overlap_threshold=0.5,
                 nms_thresh=0.45, top_k=400):
        self.num_classes = num_classes  # number of predicted classes
        self.priors = priors
        self.num_priors = 0 if priors is None else len(priors)
        self.overlap_threshold = overlap_threshold
        self._nms_thresh = nms_thresh
        self._top_k = top_k
        # Graph inputs that detection_out() feeds for every class of every
        # image (placeholder/Session style TensorFlow API).
        self.boxes = tf.placeholder(dtype='float32', shape=(None, 4))
        self.scores = tf.placeholder(dtype='float32', shape=(None,))
        # Non-maximum suppression op.
        self.nms = self._build_nms()
        # device_count={'GPU': 0} requests a session without GPU devices.
        self.sess = tf.Session(config=tf.ConfigProto(device_count={'GPU': 0}))

    def _build_nms(self):
        """Build the NMS op from the current top_k and nms threshold."""
        return tf.image.non_max_suppression(self.boxes, self.scores,
                                            self._top_k,
                                            iou_threshold=self._nms_thresh)

    @property
    def nms_thresh(self):
        """IoU threshold used by the NMS op."""
        return self._nms_thresh

    @nms_thresh.setter
    def nms_thresh(self, value):
        self._nms_thresh = value
        # The threshold is baked into the op, so the op has to be rebuilt.
        self.nms = self._build_nms()

    @property
    def top_k(self):
        """Maximum number of boxes the NMS op returns."""
        return self._top_k

    @top_k.setter
    def top_k(self, value):
        self._top_k = value
        # top_k is baked into the op, so the op has to be rebuilt.
        self.nms = self._build_nms()

    # IoU needs both the intersection and the union of two rectangles.
    def iou(self, box):
        """Compute intersection over union for the box with all priors.

        # Arguments
            box: Box, numpy tensor of shape (4,).

        # Return
            iou: Intersection over union,
                numpy tensor of shape (num_priors).
        """
        # compute intersection
        # top-left corner (xmin, ymin) of the overlap
        inter_upleft = np.maximum(self.priors[:, :2], box[:2])
        # bottom-right corner (xmax, ymax) of the overlap
        inter_botright = np.minimum(self.priors[:, 2:4], box[2:])
        # Negative extents mean "no overlap"; clamp them to zero.
        inter_wh = np.maximum(inter_botright - inter_upleft, 0)
        inter = inter_wh[:, 0] * inter_wh[:, 1]
        # compute union
        area_pred = (box[2] - box[0]) * (box[3] - box[1])
        area_gt = ((self.priors[:, 2] - self.priors[:, 0]) *
                   (self.priors[:, 3] - self.priors[:, 1]))
        union = area_pred + area_gt - inter
        # compute iou
        return inter / union

    # Encoding expresses a ground-truth box relative to the priors it is
    # assigned to, which is the form the regression is trained on.
    def encode_box(self, box, return_iou=True):
        """Encode box for training, do it only for assigned priors.

        # Arguments
            box: Box, numpy tensor of shape (4,).
            return_iou: Whether to concat iou to encoded values.

        # Return
            encoded_box: Tensor with encoded box, flattened from a
                numpy tensor of shape (num_priors, 4 + int(return_iou)).
                Rows of priors that are not assigned stay all zero.
        """
        iou = self.iou(box)
        encoded_box = np.zeros((self.num_priors, 4 + int(return_iou)))
        # Boolean mask, same shape as iou: priors overlapping enough.
        assign_mask = iou > self.overlap_threshold
        if not assign_mask.any():
            # No prior passes the threshold: fall back to the best one.
            assign_mask[iou.argmax()] = True
        if return_iou:
            # Store the iou of every assigned prior in the last column.
            encoded_box[assign_mask, -1] = iou[assign_mask]
        assigned_priors = self.priors[assign_mask]
        # Ground-truth box as centre (x, y) and size (w, h).
        box_center = 0.5 * (box[:2] + box[2:])
        box_wh = box[2:] - box[:2]
        # Assigned priors as centre (x, y) and size (w, h).
        assigned_priors_center = 0.5 * (assigned_priors[:, :2] +
                                        assigned_priors[:, 2:4])
        assigned_priors_wh = (assigned_priors[:, 2:4] -
                              assigned_priors[:, :2])
        # we encode variance
        encoded_center = box_center - assigned_priors_center
        encoded_center /= assigned_priors_wh
        encoded_center /= assigned_priors[:, -4:-2]
        encoded_wh = np.log(box_wh / assigned_priors_wh)
        encoded_wh /= assigned_priors[:, -2:]
        encoded_box[assign_mask, :2] = encoded_center
        encoded_box[assign_mask, 2:4] = encoded_wh
        return encoded_box.ravel()

    def assign_boxes(self, boxes):
        """Assign boxes to priors for training.

        # Arguments
            boxes: Box, numpy tensor of shape (num_boxes, 4 + num_classes),
                num_classes without background.

        # Return
            assignment: Tensor with assigned boxes,
                numpy tensor of shape (num_priors, 4 + num_classes + 8),
                priors in ground truth are fictitious,
                assignment[:, -8] has 1 if prior should be penalized
                    or in other words is assigned to some ground truth box,
                assignment[:, -7:] are all 0. See loss for more details.
        """
        assignment = np.zeros((self.num_priors, 4 + self.num_classes + 8))
        # Column 4 is set for every prior first and cleared below for the
        # priors that receive a ground-truth box.
        assignment[:, 4] = 1.0
        if len(boxes) == 0:
            return assignment
        encoded_boxes = np.apply_along_axis(self.encode_box, 1, boxes[:, :4])
        encoded_boxes = encoded_boxes.reshape(-1, self.num_priors, 5)
        # For every prior, pick the ground-truth box with the highest iou.
        best_iou = encoded_boxes[:, :, -1].max(axis=0)
        best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0)
        # iou is 0 for priors that no box was assigned to in encode_box.
        best_iou_mask = best_iou > 0
        best_iou_idx = best_iou_idx[best_iou_mask]
        assign_num = len(best_iou_idx)
        encoded_boxes = encoded_boxes[:, best_iou_mask, :]
        assignment[best_iou_mask, :4] = encoded_boxes[best_iou_idx,
                                                      np.arange(assign_num),
                                                      :4]
        assignment[best_iou_mask, 4] = 0
        assignment[best_iou_mask, 5:-8] = boxes[best_iou_idx, 4:]
        assignment[best_iou_mask, -8] = 1
        return assignment

    def decode_boxes(self, mbox_loc, mbox_priorbox, variances):
        """Convert bboxes from local predictions to shifted priors.

        This inverts the encoding done in encode_box: offsets are scaled
        back by the variances and the prior size, and the result is turned
        from (center_x, center_y, w, h) into (xmin, ymin, xmax, ymax).

        # Arguments
            mbox_loc: Numpy array of predicted locations.
            mbox_priorbox: Numpy array of prior boxes.
            variances: Numpy array of variances.

        # Return
            decode_bbox: Shifted priors, clipped to [0, 1].
        """
        prior_width = mbox_priorbox[:, 2] - mbox_priorbox[:, 0]
        prior_height = mbox_priorbox[:, 3] - mbox_priorbox[:, 1]
        prior_center_x = 0.5 * (mbox_priorbox[:, 2] + mbox_priorbox[:, 0])
        prior_center_y = 0.5 * (mbox_priorbox[:, 3] + mbox_priorbox[:, 1])
        decode_bbox_center_x = mbox_loc[:, 0] * prior_width * variances[:, 0]
        decode_bbox_center_x += prior_center_x
        # The y offset is relative to the prior HEIGHT (encode_box divides
        # the y offset by the prior height), not the width.
        decode_bbox_center_y = mbox_loc[:, 1] * prior_height * variances[:, 1]
        decode_bbox_center_y += prior_center_y
        decode_bbox_width = np.exp(mbox_loc[:, 2] * variances[:, 2])
        decode_bbox_width *= prior_width
        decode_bbox_height = np.exp(mbox_loc[:, 3] * variances[:, 3])
        decode_bbox_height *= prior_height
        decode_bbox_xmin = decode_bbox_center_x - 0.5 * decode_bbox_width
        decode_bbox_ymin = decode_bbox_center_y - 0.5 * decode_bbox_height
        decode_bbox_xmax = decode_bbox_center_x + 0.5 * decode_bbox_width
        decode_bbox_ymax = decode_bbox_center_y + 0.5 * decode_bbox_height
        decode_bbox = np.concatenate((decode_bbox_xmin[:, None],
                                      decode_bbox_ymin[:, None],
                                      decode_bbox_xmax[:, None],
                                      decode_bbox_ymax[:, None]), axis=-1)
        # Clip to the [0, 1] range.
        decode_bbox = np.minimum(np.maximum(decode_bbox, 0.0), 1.0)
        return decode_bbox

    def detection_out(self, predictions, background_label_id=0, keep_top_k=200,
                      confidence_threshold=0.01):
        """Do non maximum suppression (nms) on prediction results.

        # Arguments
            predictions: Numpy array of predicted values. Indexed as
                [image, prior, value]; the last axis is sliced as
                4 location values, then the class confidences, then
                4 prior box coordinates, then 4 variances.
            background_label_id: Label of background class.
            keep_top_k: Number of total bboxes to be kept per image
                after nms step.
            confidence_threshold: Only consider detections,
                whose confidences are larger than a threshold.

        # Return
            results: List of predictions for every picture. Each prediction is:
                [label, confidence, xmin, ymin, xmax, ymax]
        """
        mbox_loc = predictions[:, :, :4]           # predicted offsets
        variances = predictions[:, :, -4:]         # variances
        mbox_priorbox = predictions[:, :, -8:-4]   # prior box coordinates
        mbox_conf = predictions[:, :, 4:-8]        # class confidences
        results = []
        for i in range(len(mbox_loc)):
            image_results = []
            decode_bbox = self.decode_boxes(mbox_loc[i],
                                            mbox_priorbox[i], variances[i])
            for c in range(self.num_classes):
                if c == background_label_id:
                    continue
                c_confs = mbox_conf[i, :, c]
                c_confs_m = c_confs > confidence_threshold
                if not c_confs_m.any():
                    continue
                boxes_to_process = decode_bbox[c_confs_m]
                confs_to_process = c_confs[c_confs_m]
                feed_dict = {self.boxes: boxes_to_process,
                             self.scores: confs_to_process}
                idx = self.sess.run(self.nms, feed_dict=feed_dict)
                good_boxes = boxes_to_process[idx]
                confs = confs_to_process[idx][:, None]
                labels = c * np.ones((len(idx), 1))
                c_pred = np.concatenate((labels, confs, good_boxes),
                                        axis=1)
                image_results.extend(c_pred)
            if len(image_results) > 0:
                # Sort by confidence, highest first, and keep the best ones.
                image_results = np.array(image_results)
                argsort = np.argsort(image_results[:, 1])[::-1]
                image_results = image_results[argsort][:keep_top_k]
            results.append(image_results)
        return results