我的项目是利用faster rcnn检测kiiti数据集,用原始nms,iters = 10000的情况下,得到的mAP = 0.586, 在改用soft nms后,其他参数均不变的情况下,得到的mAP = 0.622。算是挺大的改进了,所以分享一下具体实现。
我用的Faster-RCNN是tensorflow版本,github地址:Faster-RCNN_TF
Soft-NMS (Improving Object Detection With One Line of Code)
Paper:https://arxiv.org/pdf/1704.04503.pdf
code:https://github.com/bharatsingh430/soft-nms
论文解读可以参考:https://blog.csdn.net/lanyuxuan100/article/details/78767818
该篇论文主要focus在后处理NMS上,不得不承认,对于很多问题,后处理的方法会对结果产生几个点的影响。况且尝试起来非常容易,代价也很小,只需要替换一个函数就可以,所以大家不妨可以试验一下。
在文件里添加 cpu_soft_nms函数:
def cpu_soft_nms(np.ndarray[float, ndim=2] boxes, float sigma=0.5, float Nt=0.3, float threshold=0.001, unsigned int method=0):
cdef unsigned int N = boxes.shape[0]
cdef float iw, ih, box_area
cdef float ua
cdef int pos = 0
cdef float maxscore = 0
cdef int maxpos = 0
cdef float x1,x2,y1,y2,tx1,tx2,ty1,ty2,ts,area,weight,ov
for i in range(N):
maxscore = boxes[i, 4]
maxpos = i
tx1 = boxes[i,0]
ty1 = boxes[i,1]
tx2 = boxes[i,2]
ty2 = boxes[i,3]
ts = boxes[i,4]
pos = i + 1
# get max box
while pos < N:
if maxscore < boxes[pos, 4]:
maxscore = boxes[pos, 4]
maxpos = pos
pos = pos + 1
# add max box as a detection
boxes[i,0] = boxes[maxpos,0]
boxes[i,1] = boxes[maxpos,1]
boxes[i,2] = boxes[maxpos,2]
boxes[i,3] = boxes[maxpos,3]
boxes[i,4] = boxes[maxpos,4]
# swap ith box with position of max box
boxes[maxpos,0] = tx1
boxes[maxpos,1] = ty1
boxes[maxpos,2] = tx2
boxes[maxpos,3] = ty2
boxes[maxpos,4] = ts
tx1 = boxes[i,0]
ty1 = boxes[i,1]
tx2 = boxes[i,2]
ty2 = boxes[i,3]
ts = boxes[i,4]
pos = i + 1
# NMS iterations, note that N changes if detection boxes fall below threshold
while pos < N:
x1 = boxes[pos, 0]
y1 = boxes[pos, 1]
x2 = boxes[pos, 2]
y2 = boxes[pos, 3]
s = boxes[pos, 4]
area = (x2 - x1 + 1) * (y2 - y1 + 1)
iw = (min(tx2, x2) - max(tx1, x1) + 1)
if iw > 0:
ih = (min(ty2, y2) - max(ty1, y1) + 1)
if ih > 0:
ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih)
ov = iw * ih / ua #iou between max box and detection box
if method == 1: # linear
if ov > Nt:
weight = 1 - ov
else:
weight = 1
elif method == 2: # gaussian
weight = np.exp(-(ov * ov)/sigma)
else: # original NMS
if ov > Nt:
weight = 0
else:
weight = 1
boxes[pos, 4] = weight*boxes[pos, 4]
# if box score falls below threshold, discard the box by swapping with last box
# update N
if boxes[pos, 4] < threshold:
boxes[pos,0] = boxes[N-1, 0]
boxes[pos,1] = boxes[N-1, 1]
boxes[pos,2] = boxes[N-1, 2]
boxes[pos,3] = boxes[N-1, 3]
boxes[pos,4] = boxes[N-1, 4]
N = N - 1
pos = pos - 1
pos = pos + 1
keep = [i for i in range(N)]
return keep
将文件修改为:
from fast_rcnn.config import cfg
if cfg.USE_GPU_NMS:
from nms.gpu_nms import gpu_nms
#from nms.cpu_nms import cpu_nms
from nms.cpu_nms import cpu_nms, cpu_soft_nms
import numpy as np
def soft_nms(dets, sigma=0.5, Nt=0.3, threshold=0.001, method=1):
keep = cpu_soft_nms(np.ascontiguousarray(dets, dtype=np.float32),
np.float32(sigma), np.float32(Nt),
np.float32(threshold),
np.uint8(method))
return keep
def nms(dets, thresh, force_cpu=False):
"""Dispatch to either CPU or GPU NMS implementations."""
if dets.shape[0] == 0:
return []
if cfg.USE_GPU_NMS and not force_cpu:
return gpu_nms(dets, thresh, device_id=cfg.GPU_ID)
else:
return cpu_nms(dets, thresh)
在文件中添加__C.TEST.SOFT_NMS = 1
这个文件修改较多,将文件修改为:
from fast_rcnn.config import cfg, get_output_dir
import argparse
from utils.timer import Timer
import numpy as np
import cv2
#from utils.cython_nms import nms, nms_new
from fast_rcnn.nms_wrapper import nms, soft_nms
from utils.boxes_grid import get_boxes_grid
import cPickle
import heapq
from utils.blob import im_list_to_blob
import os
import math
from rpn_msr.generate import imdb_proposals_det
import tensorflow as tf
from fast_rcnn.bbox_transform import clip_boxes, bbox_transform_inv
import matplotlib.pyplot as plt
from tensorflow.python.client import timeline
import time
from multiprocessing import Pool
def _get_image_blob(im):
"""Converts an image into a network input.
Arguments:
im (ndarray): a color image in BGR order
Returns:
blob (ndarray): a data blob holding an image pyramid
im_scale_factors (list): list of image scales (relative to im) used
in the image pyramid
"""
im_orig = im.astype(np.float32, copy=True)
im_orig -= cfg.PIXEL_MEANS
im_shape = im_orig.shape
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
processed_ims = []
im_scale_factors = []
for target_size in cfg.TEST.SCALES:
im_scale = float(target_size) / float(im_size_min)
# Prevent the biggest axis from being more than MAX_SIZE
if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
interpolation=cv2.INTER_LINEAR)
im_scale_factors.append(im_scale)
processed_ims.append(im)
# Create a blob to hold the input images
blob = im_list_to_blob(processed_ims)
return blob, np.array(im_scale_factors)
def _get_rois_blob(im_rois, im_scale_factors):
"""Converts RoIs into network inputs.
Arguments:
im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates
im_scale_factors (list): scale factors as returned by _get_image_blob
Returns:
blob (ndarray): R x 5 matrix of RoIs in the image pyramid
"""
rois, levels = _project_im_rois(im_rois, im_scale_factors)
rois_blob = np.hstack((levels, rois))
return rois_blob.astype(np.float32, copy=False)
def _project_im_rois(im_rois, scales):
"""Project image RoIs into the image pyramid built by _get_image_blob.
Arguments:
im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates
scales (list): scale factors as returned by _get_image_blob
Returns:
rois (ndarray): R x 4 matrix of projected RoI coordinates
levels (list): image pyramid levels used by each projected RoI
"""
im_rois = im_rois.astype(np.float, copy=False)
scales = np.array(scales)
if len(scales) > 1:
widths = im_rois[:, 2] - im_rois[:, 0] + 1
heights = im_rois[:, 3] - im_rois[:, 1] + 1
areas = widths * heights
scaled_areas = areas[:, np.newaxis] * (scales[np.newaxis, :] ** 2)
diff_areas = np.abs(scaled_areas - 224 * 224)
levels = diff_areas.argmin(axis=1)[:, np.newaxis]
else:
levels = np.zeros((im_rois.shape[0], 1), dtype=np.int)
rois = im_rois * scales[levels]
return rois, levels
def _get_blobs(im, rois):
"""Convert an image and RoIs within that image into network inputs."""
if cfg.TEST.HAS_RPN:
blobs = {'data' : None, 'rois' : None}
blobs['data'], im_scale_factors = _get_image_blob(im)
else:
blobs = {'data' : None, 'rois' : None}
blobs['data'], im_scale_factors = _get_image_blob(im)
if cfg.IS_MULTISCALE:
if cfg.IS_EXTRAPOLATING:
blobs['rois'] = _get_rois_blob(rois, cfg.TEST.SCALES)
else:
blobs['rois'] = _get_rois_blob(rois, cfg.TEST.SCALES_BASE)
else:
blobs['rois'] = _get_rois_blob(rois, cfg.TEST.SCALES_BASE)
return blobs, im_scale_factors
def _clip_boxes(boxes, im_shape):
"""Clip boxes to image boundaries."""
# x1 >= 0
boxes[:, 0::4] = np.maximum(boxes[:, 0::4], 0)
# y1 >= 0
boxes[:, 1::4] = np.maximum(boxes[:, 1::4], 0)
# x2 < im_shape[1]
boxes[:, 2::4] = np.minimum(boxes[:, 2::4], im_shape[1] - 1)
# y2 < im_shape[0]
boxes[:, 3::4] = np.minimum(boxes[:, 3::4], im_shape[0] - 1)
return boxes
def _rescale_boxes(boxes, inds, scales):
"""Rescale boxes according to image rescaling."""
for i in xrange(boxes.shape[0]):
boxes[i,:] = boxes[i,:] / scales[int(inds[i])]
return boxes
def im_detect(sess, net, im, boxes=None):
"""Detect object classes in an image given object proposals.
Arguments:
net (caffe.Net): Fast R-CNN network to use
im (ndarray): color image to test (in BGR order)
boxes (ndarray): R x 4 array of object proposals
Returns:
scores (ndarray): R x K array of object class scores (K includes
background as object category 0)
boxes (ndarray): R x (4*K) array of predicted bounding boxes
"""
blobs, im_scales = _get_blobs(im, boxes)
# When mapping from image ROIs to feature map ROIs, there's some aliasing
# (some distinct image ROIs get mapped to the same feature ROI).
# Here, we identify duplicate feature ROIs, so we only compute features
# on the unique subset.
if cfg.DEDUP_BOXES > 0 and not cfg.TEST.HAS_RPN:
v = np.array([1, 1e3, 1e6, 1e9, 1e12])
hashes = np.round(blobs['rois'] * cfg.DEDUP_BOXES).dot(v)
_, index, inv_index = np.unique(hashes, return_index=True,
return_inverse=True)
blobs['rois'] = blobs['rois'][index, :]
boxes = boxes[index, :]
if cfg.TEST.HAS_RPN:
im_blob = blobs['data']
blobs['im_info'] = np.array(
[[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],
dtype=np.float32)
# forward pass
if cfg.TEST.HAS_RPN:
feed_dict={net.data: blobs['data'], net.im_info: blobs['im_info'], net.keep_prob: 1.0}
else:
feed_dict={net.data: blobs['data'], net.rois: blobs['rois'], net.keep_prob: 1.0}
run_options = None
run_metadata = None
if cfg.TEST.DEBUG_TIMELINE:
run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
run_metadata = tf.RunMetadata()
cls_score, cls_prob, bbox_pred, rois = sess.run([net.get_output('cls_score'), net.get_output('cls_prob'), net.get_output('bbox_pred'),net.get_output('rois')],
feed_dict=feed_dict,
options=run_options,
run_metadata=run_metadata)
if cfg.TEST.HAS_RPN:
assert len(im_scales) == 1, "Only single-image batch implemented"
boxes = rois[:, 1:5] / im_scales[0]
if cfg.TEST.SVM:
# use the raw scores before softmax under the assumption they
# were trained as linear SVMs
scores = cls_score
else:
# use softmax estimated probabilities
scores = cls_prob
if cfg.TEST.BBOX_REG:
# Apply bounding-box regression deltas
box_deltas = bbox_pred
pred_boxes = bbox_transform_inv(boxes, box_deltas)
pred_boxes = _clip_boxes(pred_boxes, im.shape)
else:
# Simply repeat the boxes, once for each class
pred_boxes = np.tile(boxes, (1, scores.shape[1]))
if cfg.DEDUP_BOXES > 0 and not cfg.TEST.HAS_RPN:
# Map scores and predictions back to the original set of boxes
scores = scores[inv_index, :]
pred_boxes = pred_boxes[inv_index, :]
if cfg.TEST.DEBUG_TIMELINE:
trace = timeline.Timeline(step_stats=run_metadata.step_stats)
trace_file = open(str(long(time.time() * 1000)) + '-test-timeline.ctf.json', 'w')
trace_file.write(trace.generate_chrome_trace_format(show_memory=False))
trace_file.close()
return scores, pred_boxes
def vis_detections(im, class_name, dets, thresh=0.8):
"""Visual debugging of detections."""
import matplotlib.pyplot as plt
#im = im[:, :, (2, 1, 0)]
for i in xrange(np.minimum(10, dets.shape[0])):
bbox = dets[i, :4]
score = dets[i, -1]
if score > thresh:
#plt.cla()
#plt.imshow(im)
plt.gca().add_patch(
plt.Rectangle((bbox[0], bbox[1]),
bbox[2] - bbox[0],
bbox[3] - bbox[1], fill=False,
edgecolor='g', linewidth=3)
)
plt.gca().text(bbox[0], bbox[1] - 2,
'{:s} {:.3f}'.format(class_name, score),
bbox=dict(facecolor='blue', alpha=0.5),
fontsize=14, color='white')
plt.title('{} {:.3f}'.format(class_name, score))
#plt.show()
def apply_nms(all_boxes, thresh):
"""Apply non-maximum suppression to all predicted boxes output by the
test_net method.
"""
num_classes = len(all_boxes)
num_images = len(all_boxes[0])
nms_boxes = [[[] for _ in xrange(num_images)]
for _ in xrange(num_classes)]
for cls_ind in xrange(num_classes):
for im_ind in xrange(num_images):
dets = all_boxes[cls_ind][im_ind]
if dets == []:
continue
x1 = dets[:, 0]
y1 = dets[:, 1]
x2 = dets[:, 2]
y2 = dets[:, 3]
scores = dets[:, 4]
inds = np.where((x2 > x1) & (y2 > y1) & (scores > cfg.TEST.DET_THRESHOLD))[0]
dets = dets[inds,:]
if dets == []:
continue
keep = nms(dets, thresh, force_cpu=True)
if len(keep) == 0:
continue
nms_boxes[cls_ind][im_ind] = dets[keep, :].copy()
return nms_boxes
def psoft(cls_dets):
keep = soft_nms(cls_dets, method=cfg.TEST.SOFT_NMS)
return cls_dets[keep]
def test_net(sess, net, imdb, weights_filename , max_per_image=300, thresh=0.0001, vis=False):
"""Test a Fast R-CNN network on an image database."""
num_images = len(imdb.image_index)
# all detections are collected into:
# all_boxes[cls][image] = N x 5 array of detections in
# (x1, y1, x2, y2, score)
all_boxes = [[[] for _ in xrange(num_images)]
for _ in xrange(imdb.num_classes)]
output_dir = get_output_dir(imdb, weights_filename)
# timers
_t = {'im_detect' : Timer(), 'misc' : Timer()}
if not cfg.TEST.HAS_RPN:
roidb = imdb.roidb
p = Pool(27)
for i in xrange(num_images):
# filter out any ground truth boxes
if cfg.TEST.HAS_RPN:
box_proposals = None
else:
# The roidb may contain ground-truth rois (for example, if the roidb
# comes from the training or val split). We only want to evaluate
# detection on the *non*-ground-truth rois. We select those the rois
# that have the gt_classes field set to 0, which means there's no
# ground truth.
box_proposals = roidb[i]['boxes'][roidb[i]['gt_classes'] == 0]
im = cv2.imread(imdb.image_path_at(i))
_t['im_detect'].tic()
scores, boxes = im_detect(sess, net, im, box_proposals)
_t['im_detect'].toc()
_t['misc'].tic()
if vis:
image = im[:, :, (2, 1, 0)]
plt.cla()
plt.imshow(image)
###add
commands = []
# skip j = 0, because it's the background class
for j in xrange(1, imdb.num_classes):
inds = np.where(scores[:, j] > thresh)[0]
cls_scores = scores[inds, j]
cls_boxes = boxes[inds, j*4:(j+1)*4]
cls_dets = np.hstack((cls_boxes, cls_scores[:, np.newaxis])) \
.astype(np.float32, copy=False)
commands.append(cls_dets)
nms_dets = p.map(psoft, commands)
for j in xrange(1, imdb.num_classes):
if vis:
vis_detections(im, imdb.classes[j], nms_dets[j-1])
all_boxes[j][i] = nms_dets[j-1]
if vis:
plt.show()
# Limit to max_per_image detections *over all classes*
if max_per_image > 0:
image_scores = np.hstack([all_boxes[j][i][:, -1]
for j in xrange(1, imdb.num_classes)])
if len(image_scores) > max_per_image:
image_thresh = np.sort(image_scores)[-max_per_image]
for j in xrange(1, imdb.num_classes):
keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0]
all_boxes[j][i] = all_boxes[j][i][keep, :]
_t['misc'].toc()
print 'im_detect: {:d}/{:d} {:.3f}s {:.3f}s' \
.format(i + 1, num_images, _t['im_detect'].average_time,
_t['misc'].average_time)
det_file = os.path.join(output_dir, 'detections.pkl')
with open(det_file, 'wb') as f:
cPickle.dump(all_boxes, f, cPickle.HIGHEST_PROTOCOL)
print 'Evaluating detections'
imdb.evaluate_detections(all_boxes, output_dir)
因为修改了cpu_nms的内容,所以工程需要重新编译:
cd Faster root/lib
make
重新用代码测试就可以了。
代码是昨晚改的,博客是今早写的,所以步骤可能有所遗漏,有什么问题还请大家指正~