这篇博客里,我主要分析一下 Faster R-CNN 的测试过程是如何实现的。每个小节我都会以某个 py 文件的名字作为标题,表示以下内容是对该文件的分析。
test_net 的输入是 Faster R-CNN 网络、图片数据库(imdb)等信息;它对每张图片进行检测,把检测结果保存下来,最后调用评测函数,输出对这些图片里物体进行预测的准确率。
def test_net(net, imdb, max_per_image=100, thresh=0.05, vis=False):
    """Run detection on every image of an image database and evaluate it.

    Args:
        net: detection network, passed through to ``im_detect``.
        imdb: image database. This function uses ``image_index``,
            ``num_classes``, ``classes``, ``image_path_at``,
            ``evaluate_detections`` and, when ``cfg.TEST.HAS_RPN`` is
            false, ``roidb``.
        max_per_image: cap on the number of detections kept per image,
            counted over all classes. A value <= 0 disables the cap.
        thresh: detections whose class score is not strictly above this
            value are discarded.
        vis: when true, each class's detections are handed to
            ``vis_detections``.

    Side effects:
        Pickles all detections to ``detections.pkl`` inside the output
        directory and then calls ``imdb.evaluate_detections``.
    """
    num_images = len(imdb.image_index)
    # all detections are collected into:
    #     all_boxes[cls][image] = N x 5 array of detections in
    #     (x1, y1, x2, y2, score)
    all_boxes = [[[] for _ in xrange(num_images)]
                 for _ in xrange(imdb.num_classes)]

    # directory that receives the detection file and evaluation output
    output_dir = get_output_dir(imdb, net)

    # timers
    _t = {'im_detect': Timer(), 'misc': Timer()}

    if not cfg.TEST.HAS_RPN:
        roidb = imdb.roidb

    # process the images one by one
    for i in xrange(num_images):
        # filter out any ground truth boxes
        if cfg.TEST.HAS_RPN:
            # the network generates its own proposals
            box_proposals = None
        else:
            # The roidb may contain ground-truth rois; feeding them to the
            # detector would distort the evaluation, so keep only the rois
            # whose gt_classes entry is 0.
            box_proposals = roidb[i]['boxes'][roidb[i]['gt_classes'] == 0]

        # load the image
        im = cv2.imread(imdb.image_path_at(i))

        # Predicted boxes and their scores for this image:
        #   scores (ndarray): R x K array of object class scores
        #       (K includes background as object category 0)
        #   boxes (ndarray): R x (4*K) array of predicted bounding boxes
        _t['im_detect'].tic()
        scores, boxes = im_detect(net, im, box_proposals)
        _t['im_detect'].toc()

        _t['misc'].tic()
        # start at class 1: class 0 is the background class
        for j in xrange(1, imdb.num_classes):
            # rows whose score for class j exceeds the threshold
            inds = np.where(scores[:, j] > thresh)[0]
            cls_scores = scores[inds, j]
            cls_boxes = boxes[inds, j*4:(j+1)*4]
            # stack boxes and scores into N x 5 detections
            cls_dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \
                .astype(np.float32, copy=False)
            keep = nms(cls_dets, cfg.TEST.NMS)
            cls_dets = cls_dets[keep, :]
            # when vis is true, pass the image and boxes to the visualizer
            if vis:
                vis_detections(im, imdb.classes[j], cls_dets)
            all_boxes[j][i] = cls_dets

        # Limit to max_per_image detections *over all classes*
        if max_per_image > 0:
            image_scores = np.hstack([all_boxes[j][i][:, -1]
                                      for j in xrange(1, imdb.num_classes)])
            if len(image_scores) > max_per_image:
                # score of the max_per_image-th best detection; ties at this
                # score are all kept because of the >= comparison below
                image_thresh = np.sort(image_scores)[-max_per_image]
                for j in xrange(1, imdb.num_classes):
                    keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0]
                    all_boxes[j][i] = all_boxes[j][i][keep, :]
        _t['misc'].toc()

        # single-argument print(...) behaves the same on Python 2 and 3
        print('im_detect: {:d}/{:d} {:.3f}s {:.3f}s'
              .format(i + 1, num_images, _t['im_detect'].average_time,
                      _t['misc'].average_time))

    # save the detections
    det_file = os.path.join(output_dir, 'detections.pkl')
    with open(det_file, 'wb') as f:
        cPickle.dump(all_boxes, f, cPickle.HIGHEST_PROTOCOL)

    # evaluate the detections
    print('Evaluating detections')
    imdb.evaluate_detections(all_boxes, output_dir)
def evaluate_detections(self, all_boxes, output_dir):
# Python版本的评测(嗯再看看这个里面怎么实现评测的)
if self.config['matlab_eval']:
if self.config['cleanup']:
for cls in self._classes:
if cls == '__background__':
filename = self._get_voc_results_file_template().format(cls)
rec, prec, ap = voc_eval(
filename, annopath, imagesetfile, cls, cachedir, ovthresh=0.5,
def voc_eval(detpath,
"""rec, prec, ap = voc_eval(detpath,
Top level function that does the PASCAL VOC evaluation.
detpath: Path to detections
detpath.format(classname) should produce the detection results file.
annopath: Path to annotations
annopath.format(imagename) should be the xml annotations file.
imagesetfile: Text file containing the list of images, one image per line.
classname: Category name (duh)
cachedir: Directory for caching the annotations
[ovthresh]: Overlap threshold (default = 0.5)
[use_07_metric]: Whether to use VOC07's 11 point AP computation
(default False)
bbox_transform里包括bbox变换的内容(特征图像anchor box与原图bbox互相转换的代码)。大家分析代码的同时可以结合论文中的BBox回归的公式来看:
# 这个函数目的是得到anchor box学习目标(也就是偏移量)
def bbox_transform(ex_rois, gt_rois):
    """Compute the regression targets that map ``ex_rois`` onto ``gt_rois``.

    Both arguments are indexed as 2-D arrays whose first four columns are
    (x1, y1, x2, y2); row ``n`` of ``ex_rois`` is paired with row ``n`` of
    ``gt_rois``.

    Returns:
        Array with one row per box and four columns (dx, dy, dw, dh):
        the centre offsets normalised by the ex-box size and the log of
        the width/height ratios.
    """
    # convert the RoIs (anchor boxes) from corners to centre/size form;
    # the +1.0 treats the corner coordinates as inclusive pixel indices
    ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
    ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
    ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
    ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights

    # convert the target (ground-truth) boxes the same way
    gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
    gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
    gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
    gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights

    # offsets to be learned: t_x, t_y, t_w, t_h
    targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
    targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
    targets_dw = np.log(gt_widths / ex_widths)
    targets_dh = np.log(gt_heights / ex_heights)

    # stack as rows then transpose so each box gets one row of four targets
    targets = np.vstack(
        (targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
    return targets
# 这个函数是将偏移量加到Anchor Box上去,得到预测出来的Bbox。
def bbox_transform_inv(boxes, deltas):
    """Apply regression offsets to boxes and return the predicted boxes.

    Args:
        boxes: 2-D array whose first four columns are (x1, y1, x2, y2).
        deltas: 2-D array with the same number of rows; its columns are
            read in groups of four as (dx, dy, dw, dh).

    Returns:
        Array with the shape and dtype of ``deltas`` holding
        (x1, y1, x2, y2) for every group of four columns. When ``boxes``
        has no rows, an empty ``(0, deltas.shape[1])`` array is returned.
    """
    if boxes.shape[0] == 0:
        return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)

    # use one dtype for boxes and deltas
    boxes = boxes.astype(deltas.dtype, copy=False)

    # convert the boxes from corners to centre/size form
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * widths
    ctr_y = boxes[:, 1] + 0.5 * heights

    # pick the x, y, w, h offsets out of every group of four columns
    dx = deltas[:, 0::4]
    dy = deltas[:, 1::4]
    dw = deltas[:, 2::4]
    dh = deltas[:, 3::4]

    # apply the offsets; np.newaxis broadcasts each box over all groups
    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
    pred_w = np.exp(dw) * widths[:, np.newaxis]
    pred_h = np.exp(dh) * heights[:, np.newaxis]

    # convert back from centre/size form to corner form
    pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
    # x1
    pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
    # y1
    pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
    # x2
    pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w
    # y2
    pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h

    return pred_boxes
def clip_boxes(boxes, im_shape):
    """Clip boxes to image boundaries.

    Args:
        boxes: 2-D array whose columns are read in groups of four as
            (x1, y1, x2, y2). It is modified in place.
        im_shape: sequence whose element 0 bounds the y coordinates and
            element 1 bounds the x coordinates (height, width order).

    Returns:
        The same ``boxes`` array, with every coordinate limited to
        ``[0, bound - 1]``.
    """
    # x1 >= 0
    boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
    # y1 >= 0
    boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
    # x2 < im_shape[1]
    boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
    # y2 < im_shape[0]
    boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
    return boxes