算法的设计往往与名字有着绝对的关联性,目标定位检测即目标定位+检测。在深度学习中比较常用的目标定位检测方法有RCNN系列方法和YOLO系列方法。其中RCNN系列方法的定位过程和检测过程是分开的,即先定位目标,然后对定位出的目标进行分类,这种设计思路有利于节约运算资源,但却不利于实时的定位检测场景。与之相反的是YOLO系列算法,由于采用了将图像和位置坐标相融合的表述方式,该方法能够运用于实时场景中。
RCNN系列算法指的是RCNN、Fast-RCNN、Faster-RCNN等一系列由RCNN算法演变出的算法。这类算法通常采用两个步骤来实现对目标的定位及检测,即定位+检测。RCNN系列算法中可用的定位算法有很多,主要包括滑动窗口模型和选择性搜索模型等。特征分类网络一般采用ResNet系列模型及VGG系列模型。当然我们也可尝试使用GoogLeNet或者Inception系列模型进行训练,以提高复杂分类场景中的分类准确性。RCNN系列模型也被称为Two Stage模型。
Fast-RCNN论文链接
大致实现步骤如下:
a、输入图片
b、搜索目标区域
c、提取特征
d、图像分类
#基于选择性搜索的候选区域python基本实现
import cv2
if __name__ == '__main__':
    # Interactive demo: run OpenCV Selective Search (fast mode) on a fixed
    # image and display the region proposals that pass a size filter.
    # Keys: 'm' shows more proposals, 'l' shows fewer, 'q' quits.
    # NOTE: cv2.ximgproc is only available in the opencv-contrib build.

    # Enable OpenCV's optimized code paths; the thread pool is capped at one
    # thread (the original comment claimed a multithreaded speed-up).
    cv2.setUseOptimized(True)
    cv2.setNumThreads(1)

    # Read the image; cv2.imread returns None instead of raising on failure.
    image_path = '6.tif'
    im = cv2.imread(image_path)
    if im is None:
        raise SystemExit('Could not read image: {}'.format(image_path))

    # Downscale by a fixed factor before running the segmentation.
    ResizeValue = 2
    newHeight = int(im.shape[0] / ResizeValue)
    newWidth = int(im.shape[1] / ResizeValue)
    im = cv2.resize(im, (newWidth, newHeight))

    # Create the Selective Search object with default parameters, attach the
    # image, and run it in fast mode.
    ss = cv2.ximgproc.segmentation.createSelectiveSearchSegmentation()
    ss.setBaseImage(im)
    ss.switchToSelectiveSearchFast()
    rects = ss.process()
    print('Total Number of Region Proposals: {}'.format(len(rects)))

    # Number of region proposals considered for display, and the step by
    # which the 'm' / 'l' keys change that number.
    numShowRects = len(rects)
    increment = 50

    while True:
        # Draw on a fresh copy so earlier rectangles do not accumulate.
        imOut = im.copy()
        # Proposals passing the size filter in this redraw; rebuilt every
        # time so that repeated redraws do not append duplicates.
        rec_value = []
        for rect in rects[:numShowRects]:
            x, y, w, h = (int(v) for v in rect)
            # Keep only proposals with 350 < width < 400 and
            # 500 < height < 600 (in pixels of the resized image).
            # NOTE(review): the pasted source lost its indentation; recording
            # and drawing are assumed to both sit inside this filter.
            if 350 < w < 400 and 500 < h < 600:
                rec_value.append((x, y, w, h))
                cv2.rectangle(imOut, (x, y), (x + w, y + h), (0, 255, 0), 1, cv2.LINE_AA)

        # Show the result and block until a key is pressed.
        cv2.imshow("Output", imOut)
        k = cv2.waitKey(0) & 0xFF

        if k == ord('m'):
            # Show more proposals.
            numShowRects += increment
        elif k == ord('l') and numShowRects > increment:
            # Show fewer proposals, never dropping to zero or below.
            numShowRects -= increment
        elif k == ord('q'):
            break

    # Close the display window.
    cv2.destroyAllWindows()
总结:
RCNN系列算法本质上都是一样的,无非是对Two Stage的两个步骤反复优化。首先需要优化的必然是候选区域的生成方法,无论是滑动窗口法还是选择性搜索法,效率都不高,在此基础上Faster-RCNN提出了基于区域建议网络(RPN)和bbox regression的候选区域生成方法,将Two Stage方法真正意义上变成了完全基于深度学习的目标检测方法,并且提升了算法的效率。然后就是特征提取模型,一般特征提取模型有三种,即深度模型、广度模型和混合模型。这里深度模型指的是算法层数不断叠加的模型,典型的模型为ResNet系列模型;广度模型指的是通过不同的卷积窗口对图像特征进行提取的模型,典型的模型为VGG系列模型;还有就是混合模型,即既注重深度特征提取,又注重广度特征提取的模型,如Inception系列模型以及GoogLeNet等。
YoloV1原文链接
Yolo系列算法是典型的One Stage算法,同样,在算法设计上也注重目标区域的检测以及特征的分类,这里目标区域的检测是通过将图像划分为网格区域、并对各区域同时进行分类和定位的方式实现的。
代码实现:
import torch
import torch.nn as nn
from utils import SPP, SAM, BottleneckCSP, Conv
from backbone import resnet18
import numpy as np
import tools
class myYOLO(nn.Module):
    """Single-scale YOLO-style detector with a ResNet-18 backbone.

    The network predicts, for every cell of a stride-32 grid, one objectness
    logit, ``num_classes`` class logits and four box offsets
    ``[tx, ty, tw, th]``. With ``trainable=False`` the forward pass decodes,
    thresholds and NMS-filters the predictions of the first image in the
    batch; with ``trainable=True`` it returns the losses computed by
    ``tools.loss``.
    """

    def __init__(self, device, input_size=None, num_classes=20, trainable=False,
                 conf_thresh=0.01, nms_thresh=0.5, hr=False):
        """Build the detector.

        Args:
            device: torch device on which the grid and scale tensors live.
            input_size: indexable ``(height, width)`` of the network input.
                Required in practice: the ``None`` default fails as soon as
                the grid is built.
            num_classes: number of object classes.
            trainable: if True, ``forward`` returns losses instead of
                detections.
            conf_thresh: minimum class score kept by ``postprocess``.
            nms_thresh: IoU above which ``nms`` suppresses a box.
            hr: accepted for interface compatibility; unused in this class.
        """
        super().__init__()
        self.device = device
        self.num_classes = num_classes
        self.trainable = trainable
        self.conf_thresh = conf_thresh
        self.nms_thresh = nms_thresh
        self.stride = 32
        # Sets input_size, grid_cell, scale and scale_torch in one place.
        self.set_grid(input_size)

        # we use resnet18 as backbone
        self.backbone = resnet18(pretrained=True)

        # neck
        self.SPP = nn.Sequential(
            Conv(512, 256, k=1),
            SPP(),
            BottleneckCSP(256 * 4, 512, n=1, shortcut=False),
        )
        self.SAM = SAM(512)
        self.conv_set = BottleneckCSP(512, 512, n=3, shortcut=False)

        # 1x1 head: objectness + class logits + 4 box offsets per grid cell.
        self.pred = nn.Conv2d(512, 1 + self.num_classes + 4, 1)

    def create_grid(self, input_size):
        """Return the (x, y) index of every grid cell.

        Args:
            input_size: indexable ``(height, width)`` of the network input.

        Returns:
            Float tensor of shape ``(1, hs * ws, 2)`` on ``self.device``,
            where ``hs``/``ws`` are height/width floor-divided by the stride;
            cells are listed row by row and the last axis holds ``(x, y)``.
        """
        w, h = input_size[1], input_size[0]
        # generate grid cells
        ws, hs = w // self.stride, h // self.stride
        grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)])
        grid_xy = torch.stack([grid_x, grid_y], dim=-1).float()
        grid_xy = grid_xy.view(1, hs * ws, 2).to(self.device)
        return grid_xy

    def set_grid(self, input_size):
        """Rebuild the grid and the normalization scale for a new input size.

        Args:
            input_size: indexable ``(height, width)`` of the network input.
        """
        self.input_size = input_size
        self.grid_cell = self.create_grid(input_size)
        # (w, h, w, h): divides [xmin, ymin, xmax, ymax] into relative units.
        self.scale = np.array([[[input_size[1], input_size[0], input_size[1], input_size[0]]]])
        self.scale_torch = torch.tensor(self.scale.copy(), device=self.device).float()

    def decode_boxes(self, pred):
        """Convert raw box offsets to corner coordinates.

        input box :  [tx, ty, tw, th]
        output box : [xmin, ymin, xmax, ymax]

        The center is ``(sigmoid(t_xy) + cell index) * stride`` and the size
        is ``exp(t_wh)``. ``pred`` is left unmodified (the earlier version
        overwrote it in place).

        Args:
            pred: tensor whose last axis holds ``[tx, ty, tw, th]`` and whose
                leading axes broadcast against ``self.grid_cell``.

        Returns:
            Tensor shaped and typed like ``pred`` holding the corner boxes.
        """
        centers = (torch.sigmoid(pred[:, :, :2]) + self.grid_cell) * self.stride
        half_sizes = torch.exp(pred[:, :, 2:]) / 2

        # [c_x, c_y, w, h] -> [xmin, ymin, xmax, ymax]
        output = torch.zeros_like(pred)
        output[:, :, :2] = centers - half_sizes
        output[:, :, 2:] = centers + half_sizes
        return output

    def nms(self, dets, scores):
        """Pure Python NMS baseline.

        Args:
            dets: numpy array ``(N, 4)`` of ``[xmin, ymin, xmax, ymax]``.
            scores: numpy array ``(N,)`` of box scores.

        Returns:
            List of indices into ``dets`` of the kept boxes, highest score
            first.
        """
        x1 = dets[:, 0]  # xmin
        y1 = dets[:, 1]  # ymin
        x2 = dets[:, 2]  # xmax
        y2 = dets[:, 3]  # ymax

        areas = (x2 - x1) * (y2 - y1)   # the size of each bbox
        order = scores.argsort()[::-1]  # box indices by decreasing score

        keep = []                       # indices of the surviving boxes
        while order.size > 0:
            i = order[0]                # highest-scoring remaining box
            keep.append(i)
            rest = order[1:]

            xx1 = np.maximum(x1[i], x1[rest])
            yy1 = np.maximum(y1[i], y1[rest])
            xx2 = np.minimum(x2[i], x2[rest])
            yy2 = np.minimum(y2[i], y2[rest])

            # Floor at a tiny positive value so disjoint boxes give ~0 overlap.
            w = np.maximum(1e-28, xx2 - xx1)
            h = np.maximum(1e-28, yy2 - yy1)
            inter = w * h

            # IoU = intersection / (area_i + area_other - intersection)
            ovr = inter / (areas[i] + areas[rest] - inter)
            # keep only the boxes whose overlap does not exceed the threshold
            inds = np.where(ovr <= self.nms_thresh)[0]
            order = order[inds + 1]

        return keep

    def postprocess(self, all_local, all_conf, exchange=True, im_shape=None):
        """Score-threshold the predictions and apply per-class NMS.

        Args:
            all_local: numpy array ``(HxW, 4)`` of boxes, bsize = 1.
            all_conf: numpy array ``(HxW, num_classes)`` of class scores,
                bsize = 1.
            exchange: unused; kept for interface compatibility.
            im_shape: if not None, boxes are passed through
                ``self.clip_boxes(bbox_pred, im_shape)``.

        Returns:
            Tuple ``(bbox_pred, scores, cls_inds)`` of the surviving boxes,
            their scores and their class indices.
        """
        bbox_pred = all_local
        prob_pred = all_conf

        # Best class per box and its score.
        cls_inds = np.argmax(prob_pred, axis=1)
        scores = prob_pred[(np.arange(prob_pred.shape[0]), cls_inds)].copy()

        # threshold
        above = np.where(scores >= self.conf_thresh)
        bbox_pred = bbox_pred[above]
        scores = scores[above]
        cls_inds = cls_inds[above]

        # NMS, run independently for each class.
        # (np.int was removed from NumPy; a boolean mask needs no dtype alias.)
        keep = np.zeros(len(bbox_pred), dtype=bool)
        for i in range(self.num_classes):
            inds = np.where(cls_inds == i)[0]
            if len(inds) == 0:
                continue
            c_keep = self.nms(bbox_pred[inds], scores[inds])
            keep[inds[c_keep]] = True

        bbox_pred = bbox_pred[keep]
        scores = scores[keep]
        cls_inds = cls_inds[keep]

        if im_shape is not None:
            # clip
            # NOTE(review): clip_boxes is not defined in this class as shown;
            # confirm it is provided elsewhere before passing im_shape.
            bbox_pred = self.clip_boxes(bbox_pred, im_shape)

        return bbox_pred, scores, cls_inds

    def forward(self, x, target=None):
        """Run the detector.

        Args:
            x: input batch passed to the backbone.
            target: labels forwarded to ``tools.loss``; only used when
                ``self.trainable`` is True.

        Returns:
            Inference (``trainable=False``): ``(bboxes, scores, cls_inds)``
            numpy arrays for the first image of the batch, with boxes divided
            by the input size and clamped to ``[0, 1]``.
            Training: ``(conf_loss, cls_loss, txtytwth_loss, total_loss)``.
        """
        # backbone: only the last of the three returned feature maps is used
        _, _, C_5 = self.backbone(x)

        # head
        C_5 = self.SPP(C_5)
        C_5 = self.SAM(C_5)
        C_5 = self.conv_set(C_5)

        # pred: [B, C, H, W] -> [B, H*W, C]
        prediction = self.pred(C_5)
        prediction = prediction.view(C_5.size(0), 1 + self.num_classes + 4, -1).permute(0, 2, 1)

        # Divide prediction to obj_pred, txtytwth_pred and cls_pred
        # [B, H*W, 1]
        conf_pred = prediction[:, :, :1]
        # [B, H*W, num_cls]
        cls_pred = prediction[:, :, 1: 1 + self.num_classes]
        # [B, H*W, 4]
        txtytwth_pred = prediction[:, :, 1 + self.num_classes:]

        if self.trainable:
            conf_loss, cls_loss, txtytwth_loss, total_loss = tools.loss(
                pred_conf=conf_pred,
                pred_cls=cls_pred,
                pred_txtytwth=txtytwth_pred,
                label=target,
            )
            return conf_loss, cls_loss, txtytwth_loss, total_loss

        # test
        with torch.no_grad():
            # Index 0: inference handles only the first image of the batch.
            all_conf = torch.sigmoid(conf_pred)[0]
            all_bbox = torch.clamp((self.decode_boxes(txtytwth_pred) / self.scale_torch)[0], 0., 1.)
            # Class score = class probability * objectness.
            all_class = torch.softmax(cls_pred[0, :, :], 1) * all_conf

            # separate box pred and class conf
            all_class = all_class.to('cpu').numpy()
            all_bbox = all_bbox.to('cpu').numpy()

            bboxes, scores, cls_inds = self.postprocess(all_bbox, all_class)
            return bboxes, scores, cls_inds
总结:
Yolo系列算法是一种比较成熟的目标检测算法框架,基于这种框架的算法还在不断地迭代中,当然解决的问题也越来越细化,比如候选区精度、小尺度目标检测等。基本上YoloV3及以上版本的算法可以在很多场景下得到实际应用。当然,问题总是在不断地出现和得到补充的,期待能够看到更加高效准确的基于Yolo算法设计思路而来的新算法。