SSD is a single-stage object detection algorithm, full name Single Shot MultiBox Detector, playfully nicknamed "one shot, sure hit".
'''
1. Core idea: generate prior (default) boxes on multiple feature maps and run classification and regression on them.
2. Strengths: fast detection; predicting on feature layers of different scales covers large, medium and small objects.
3. Weaknesses: the prior-box settings are hand-tuned. Personally I see it as a simplified Faster R-CNN: Faster R-CNN does one extra round of proposal filtering, so it is slower, and it also uses fewer feature layers than SSD.
4. Improvements: SSD uses data augmentation, multiple feature maps and hard negative mining, which improve generalization, help recognize objects of all sizes, and ease the class-imbalance problem.
'''
Code pipeline
'''
1. Augment + Encoder : img[b,300,300,3]; boxes,cls --> encoder() --> y_true[b,8732,33]
2. Model   : img --> SSD300() --> y_pred[b,8732,33]
3. Loss    : Loss(y_true, y_pred) --> negative (background) conf loss + positive conf loss + localization loss
4. Predict : img --> SSD300() --> y_pred --> decoder --> NMS --> draw boxes and labels
'''
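To make the shapes above concrete, here is a minimal sketch (my own illustration, not repo code) of how one length-33 row of y_true/y_pred splits into the 4 + 1 + 20 + 8 layout used throughout this post:
import numpy as np

row = np.zeros(33)    # target/prediction vector for one of the 8732 priors
loc = row[:4]         # encoded cx, cy, w, h offsets
conf = row[4:25]      # 21 class confidences; index 4 itself is background
extra = row[25:33]    # 8 trailing slots: prior coords + variances in y_pred;
                      # in y_true, slot row[-8] doubles as the positive mask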
The data-processing part contains two key functions: the augmentation function img, y = get_random_data(annotation_line, self.image_size[0:2]) and the encoding function y = self.bbox_util.assign_boxes(y). The flow is as follows:
'''
1. Read the dataset annotation lines and shuffle them (a parsing sketch follows this list), e.g.:
ssd-keras-master/VOCdevkit/VOC2007/JPEGImages/000014.jpg 72,163,302,228,5 185,194,500,316,6 416,180,500,222,6 314,8,344,65,14 331,4,361,61,14 357,8,401,61,14
ssd-keras-master/VOCdevkit/VOC2007/JPEGImages/000028.jpg 63,18,374,500,7
2. Iterate over the lines and apply data augmentation.
3. Convert the ground-truth boxes to coordinates relative to the image, one-hot encode the classes, and stack boxes and classes together.
4. Encode the boxes and classes into y_true.
5. Normalize img; a generator batches the images together with the encoded y_true.
'''
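Step 1's annotation format is easy to parse by hand; a standalone sketch using the second sample line above:
import numpy as np

line = ("ssd-keras-master/VOCdevkit/VOC2007/JPEGImages/000028.jpg "
        "63,18,374,500,7")
parts = line.split()
image_path = parts[0]                                                # the .jpg path
boxes = np.array([list(map(int, b.split(','))) for b in parts[1:]])
print(boxes)  # [[ 63  18 374 500   7]] -> x1, y1, x2, y2, class_id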
def generate(self, train=True):
    while True:
        if train:
            # shuffle the training data
            shuffle(self.train_lines)
            lines = self.train_lines
        else:
            shuffle(self.val_lines)
            lines = self.val_lines
        inputs = []
        targets = []
        for annotation_line in lines:  # img, y = image_data, box_data
            # data augmentation; y.shape = [n, 4 + cls]
            img, y = self.get_random_data(annotation_line, self.image_size[0:2])
            if len(y) != 0:
                # normalize box coordinates to [0, 1]
                boxes = np.array(y[:, :4], dtype=np.float32)
                boxes[:, 0] = boxes[:, 0] / self.image_size[1]
                boxes[:, 1] = boxes[:, 1] / self.image_size[0]
                boxes[:, 2] = boxes[:, 2] / self.image_size[1]
                boxes[:, 3] = boxes[:, 3] / self.image_size[0]
                # one-hot class labels, background not included
                one_hot_label = np.eye(self.num_classes)[np.array(y[:, 4], np.int32)]
                # skip samples with degenerate boxes ('or', not 'and': one bad box is enough)
                if ((boxes[:, 3] - boxes[:, 1]) <= 0).any() or ((boxes[:, 2] - boxes[:, 0]) <= 0).any():
                    continue
                y = np.concatenate([boxes, one_hot_label], axis=-1)  # y[n, 4+20(one_hot)]
            # encoding: y[boxes + cls] --> y_true[8732, 4+1+cls+8]
            y = self.bbox_util.assign_boxes(y)
            inputs.append(img)
            targets.append(y)
            if len(targets) == self.batch_size:
                tmp_inp = np.array(inputs)
                tmp_targets = np.array(targets)
                inputs = []
                targets = []
                yield preprocess_input(tmp_inp), tmp_targets  # [b,300,300,3], [b,8732,33]
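For context, a hedged sketch of how this generator typically feeds training. The class name Generator, the constructor argument order and constants like BATCH_SIZE and multibox_loss are my assumptions, not necessarily the repo's exact API:
gen = Generator(bbox_util, BATCH_SIZE, train_lines, val_lines,
                (input_shape[0], input_shape[1]), NUM_CLASSES)
model.compile(optimizer=Adam(lr=5e-4), loss=multibox_loss)  # multibox_loss: hypothetical name
model.fit_generator(gen.generate(True),
                    steps_per_epoch=len(train_lines) // BATCH_SIZE,
                    validation_data=gen.generate(False),
                    validation_steps=len(val_lines) // BATCH_SIZE,
                    epochs=50)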
SSD augments the image and its boxes separately: scaling, cropping/placement, random horizontal flips and color (hue, saturation, value) jitter for the image, with matching scaling, flipping and shifting applied to the ground-truth boxes.
'''
img, y = self.get_random_data(annotation_line, self.image_size[0:2])
1. Rescale the image
2. Shift (place) the image
3. Randomly flip the image
4. Jitter hue, saturation and value
5. Rescale the ground-truth boxes
6. Flip the ground-truth boxes
7. Shift the ground-truth boxes
'''
def get_random_data(self, annotation_line, input_shape, jitter=.1, hue=.1, sat=1.1, val=1.1):
    '''Random preprocessing for real-time data augmentation'''
    line = annotation_line.split()
    image = Image.open(line[0])
    iw, ih = image.size
    h, w = input_shape
    box = np.array([np.array(list(map(int, box.split(',')))) for box in line[1:]])
    # resize the image with a jittered aspect ratio and scale
    new_ar = w/h * rand(1-jitter, 1+jitter) / rand(1-jitter, 1+jitter)
    scale = rand(.25, 2)
    if new_ar < 1:
        nh = int(scale * h)
        nw = int(nh * new_ar)
    else:
        nw = int(scale * w)
        nh = int(nw / new_ar)
    image = image.resize((nw, nh), Image.BICUBIC)
    # place the resized image at a random offset on a gray canvas (crop/shift)
    dx = int(rand(0, w - nw))
    dy = int(rand(0, h - nh))
    new_image = Image.new('RGB', (w, h), (128, 128, 128))
    new_image.paste(image, (dx, dy))
    image = new_image
    # flip the image horizontally with probability 0.5
    flip = rand() < .5
    if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT)
    # distort hue, saturation and value
    hue = rand(-hue, hue)
    sat = rand(1, sat) if rand() < .5 else 1/rand(1, sat)
    val = rand(1, val) if rand() < .5 else 1/rand(1, val)
    x = rgb_to_hsv(np.array(image)/255.)
    x[..., 0] += hue
    x[..., 0][x[..., 0] > 1] -= 1
    x[..., 0][x[..., 0] < 0] += 1
    x[..., 1] *= sat
    x[..., 2] *= val
    x[x > 1] = 1
    x[x < 0] = 0
    image_data = hsv_to_rgb(x)*255  # numpy array, 0 to 255
    # correct the boxes to match the transformed image
    box_data = np.zeros((len(box), 5))  # [n, 5]
    if len(box) > 0:
        np.random.shuffle(box)
        box[:, [0, 2]] = box[:, [0, 2]]*nw/iw + dx  # scale + shift
        box[:, [1, 3]] = box[:, [1, 3]]*nh/ih + dy
        if flip: box[:, [0, 2]] = w - box[:, [2, 0]]  # flip (numeric check below)
        box[:, 0:2][box[:, 0:2] < 0] = 0  # clip to the canvas
        box[:, 2][box[:, 2] > w] = w
        box[:, 3][box[:, 3] > h] = h
        box_w = box[:, 2] - box[:, 0]
        box_h = box[:, 3] - box[:, 1]
        box = box[np.logical_and(box_w > 1, box_h > 1)]  # discard degenerate boxes
        box_data = np.zeros((len(box), 5))
        box_data[:len(box)] = box
    if len(box) == 0:
        return image_data, []
    if (box_data[:, :4] > 0).any():
        return image_data, box_data
    else:
        return image_data, []  # [300,300,3], [n,5]
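A quick numeric check of the flip rule box[:, [0,2]] = w - box[:, [2,0]], with my own example values:
import numpy as np

w = 300
box = np.array([[50, 80, 120, 200, 7]])  # x1, y1, x2, y2, class
box[:, [0, 2]] = w - box[:, [2, 0]]      # mirror and swap x1/x2; y unchanged
print(box)                               # [[180  80 250 200   7]]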
'''
y = self.bbox_util.assign_boxes(y)
1. Build the template assignment[8732,33], where 33 = 4+1+20+8.
2. Encode every ground-truth box: find the priors that match it by IoU, then compute the center and width/height offsets with the encoding formulas.
3. Each prior may only match one ground-truth box, so per prior keep the encoding of the ground truth with the highest IoU.
'''
def assign_boxes(self, boxes):  # boxes.shape [-1, box + class]; builds y_true; y = self.bbox_util.assign_boxes(y)
    assignment = np.zeros((self.num_priors, 4 + self.num_classes + 8))  # assignment.shape (8732, 33); here y.shape = (7, 24)
    assignment[:, 4] = 1.0  # background probability
    if len(boxes) == 0:
        return assignment
    # encode every ground-truth box against all priors (encode_box sketch after this function);
    # encoded_boxes.shape = (7, 43660): 7 boxes in this image, 43660 = 8732 * 5
    encoded_boxes = np.apply_along_axis(self.encode_box, 1, boxes[:, :4])  # per prior: [4 offsets + iou]
    # per ground-truth box: encoded offsets plus IoU
    encoded_boxes = encoded_boxes.reshape(-1, self.num_priors, 5)  # (7, 8732, 5)
    # one ground truth may match several priors, but each prior fits only one
    # ground truth, so pick the best-matching ground truth per prior
    best_iou = encoded_boxes[:, :, -1].max(axis=0)         # (7, 8732) -> (8732,)
    best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0)  # index of the best ground truth per prior, (8732,)
    best_iou_mask = best_iou > 0                           # priors with IoU > 0
    best_iou_idx = best_iou_idx[best_iou_mask]             # e.g. best_iou_idx.shape = (64,)
    assign_num = len(best_iou_idx)                         # number of positive priors, e.g. 64
    # keep the encoding of the best-overlapping ground truth for each positive prior
    encoded_boxes = encoded_boxes[:, best_iou_mask, :]     # (7, 64, 5)
    assignment[:, :4][best_iou_mask] = encoded_boxes[best_iou_idx, np.arange(assign_num), :4]  # offsets
    # index 4 is the background probability: zero it for positives
    assignment[:, 4][best_iou_mask] = 0
    assignment[:, 5:-8][best_iou_mask] = boxes[best_iou_idx, 4:]  # class one-hot
    # slot -8 marks 'contains object'; the trailing 8 slots exist so y_true lines
    # up with y_pred, whose last 8 channels are the prior coords and variances
    assignment[:, -8][best_iou_mask] = 1
    # assignment now holds the targets the network should predict for this image
    return assignment  # assignment.shape = (8732, 33), 33 = 4 + 21 + 8
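assign_boxes relies on self.encode_box, which is not shown above. A minimal sketch of what it must do, written as the inverse of the decode_boxes function at the end of this post; the names self.iou and self.overlap_threshold and the prior layout [x1, y1, x2, y2, 4 variances] are assumptions about the repo's internals:
def encode_box(self, box):
    # IoU of this ground-truth box against all priors
    iou = self.iou(box)                       # (num_priors,)
    encoded = np.zeros((self.num_priors, 5))  # 4 offsets + iou per prior
    mask = iou > self.overlap_threshold       # e.g. 0.5
    if not mask.any():                        # always keep at least the best prior
        mask[iou.argmax()] = True
    encoded[mask, -1] = iou[mask]
    priors = self.priors[mask]
    box_center = 0.5 * (box[:2] + box[2:])
    box_wh = box[2:] - box[:2]
    prior_center = 0.5 * (priors[:, :2] + priors[:, 2:4])
    prior_wh = priors[:, 2:4] - priors[:, :2]
    # center offsets and log size ratios, divided by the variances,
    # mirroring the multiplications in decode_boxes
    encoded[mask, :2] = (box_center - prior_center) / prior_wh / priors[:, 4:6]
    encoded[mask, 2:4] = np.log(box_wh / prior_wh) / priors[:, 6:8]
    return encoded.ravel()                    # length num_priors * 5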
SSD uses VGG16 as its backbone. It taps the feature maps after the third and fourth downsampling stages, Conv4_3 and fc7, then keeps downsampling to get Conv6, Conv7, Conv8 and Conv9, for six feature layers in total used for classification and regression. Prior boxes are generated per feature layer.
'''
fea       shape          nums_anchors  reg           cls            anchor_minmax  wh_ratio
Conv4_3   [38,38,512]    4             [38,38,4x4]   [38,38,4x21]   [30,60]        [0.5,1,1,2]
fc7       [19,19,1024]   6             [19,19,6x4]   [19,19,6x21]   [60,111]       [1/3,0.5,1,1,2,3]
Conv6_2   [10,10,512]    6             [10,10,6x4]   [10,10,6x21]   [111,162]      [1/3,0.5,1,1,2,3]
Conv7_2   [ 5, 5,256]    6             [ 5, 5,6x4]   [ 5, 5,6x21]   [162,213]      [1/3,0.5,1,1,2,3]
Conv8_2   [ 3, 3,256]    4             [ 3, 3,4x4]   [ 3, 3,4x21]   [213,264]      [0.5,1,1,2]
Conv9_2   [ 1, 1,256]    4             [ 1, 1,4x4]   [ 1, 1,4x21]   [264,315]      [0.5,1,1,2]
38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4 = 8732
y_pred [b,8732,4+21+8]
'''
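The anchor_minmax column follows the SSD300 convention: scales from 20% to 90% of the 300 px input in equal steps, with Conv4_3 fixed at 10% and each layer's max_size equal to the next layer's min_size. A quick check:
img_size = 300
ratios = [10, 20, 37, 54, 71, 88, 105]         # percent of the input size
sizes = [img_size * r // 100 for r in ratios]  # [30, 60, 111, 162, 213, 264, 315]
minmax = list(zip(sizes[:-1], sizes[1:]))
print(minmax)  # [(30, 60), (60, 111), (111, 162), (162, 213), (213, 264), (264, 315)]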
'''
1. Each feature layer gets two 3x3 convolutions, one for regression and one for classification, with num_priors*4 and num_priors*num_classes output channels respectively.
'''
net = VGG16(input_tensor)
# ----------------- process the backbone features ----------------- #
# conv4_3: 38,38,512
net['conv4_3_norm'] = Normalize(20, name='conv4_3_norm')(net['conv4_3'])
num_priors = 4
# box regression head:
# num_priors is the number of priors per grid cell, 4 are the x,y,h,w adjustments
# 'conv4_3_norm_mbox_loc' (?,38,38,16)
net['conv4_3_norm_mbox_loc'] = Conv2D(num_priors * 4, kernel_size=(3,3), padding='same', name='conv4_3_norm_mbox_loc')(net['conv4_3_norm'])
net['conv4_3_norm_mbox_loc_flat'] = Flatten(name='conv4_3_norm_mbox_loc_flat')(net['conv4_3_norm_mbox_loc'])
# classification head: num_priors priors per cell, num_classes classes [?,38,38,4*21]
net['conv4_3_norm_mbox_conf'] = Conv2D(num_priors * num_classes, kernel_size=(3,3), padding='same', name='conv4_3_norm_mbox_conf')(net['conv4_3_norm'])
net['conv4_3_norm_mbox_conf_flat'] = Flatten(name='conv4_3_norm_mbox_conf_flat')(net['conv4_3_norm_mbox_conf'])
priorbox = PriorBox(img_size, 30.0, max_size=60.0, aspect_ratios=[2],
                    variances=[0.1, 0.1, 0.2, 0.2],
                    name='conv4_3_norm_mbox_priorbox')  # priorbox [?,5776,8]
net['conv4_3_norm_mbox_priorbox'] = priorbox(net['conv4_3_norm'])  # prior_boxes_tensor.shape: [batch_size, 5776, 8]
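The code above only builds the conv4_3 head; the other five feature layers get the same pair of 3x3 convolutions plus a PriorBox. A hedged sketch of the repeating pattern, where the dictionary keys and per-layer settings follow the table above but the repo spells each layer out by hand and its exact names may differ:
for layer, num_priors, min_size, max_size in [
        ('fc7', 6, 60.0, 111.0), ('conv6_2', 6, 111.0, 162.0),
        ('conv7_2', 6, 162.0, 213.0), ('conv8_2', 4, 213.0, 264.0),
        ('conv9_2', 4, 264.0, 315.0)]:
    ars = [2, 3] if num_priors == 6 else [2]  # flip=True adds 1/2 and 1/3 inside PriorBox
    net[layer + '_mbox_loc'] = Conv2D(num_priors * 4, (3, 3), padding='same',
                                      name=layer + '_mbox_loc')(net[layer])
    net[layer + '_mbox_conf'] = Conv2D(num_priors * num_classes, (3, 3), padding='same',
                                       name=layer + '_mbox_conf')(net[layer])
    net[layer + '_mbox_priorbox'] = PriorBox(img_size, min_size, max_size=max_size,
                                             aspect_ratios=ars,
                                             variances=[0.1, 0.1, 0.2, 0.2],
                                             name=layer + '_mbox_priorbox')(net[layer])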
'''
1. From the layer's min/max anchor sizes and aspect ratios, compute every prior's width and height (w, h).
2. From the feature-map size, generate the priors' center coordinates (cx, cy).
3. Combine the two to get the top-left and bottom-right corners of all priors on the input image.
'''
class PriorBox(Layer):  # e.g. PriorBox(img_size, 30.0, max_size=60.0, aspect_ratios=[2], variances=[0.1, 0.1, 0.2, 0.2])
    def __init__(self, img_size, min_size, max_size=None, aspect_ratios=None,
                 flip=True, variances=[0.1], clip=True, **kwargs):
        if K.image_dim_ordering() == 'tf':
            self.waxis = 2
            self.haxis = 1
        else:
            self.waxis = 3
            self.haxis = 2
        self.img_size = img_size  # (300, 300)
        if min_size <= 0:
            raise Exception('min_size must be positive.')
        self.min_size = min_size  # 30.0
        self.max_size = max_size  # 60.0
        self.aspect_ratios = [1.0]  # ends up as [1.0, 1.0, 2.0, 0.5]
        if max_size:
            if max_size < min_size:
                raise Exception('max_size must be greater than min_size.')
            self.aspect_ratios.append(1.0)
        if aspect_ratios:
            for ar in aspect_ratios:
                if ar in self.aspect_ratios:
                    continue
                self.aspect_ratios.append(ar)
                if flip:
                    self.aspect_ratios.append(1.0 / ar)
        self.variances = np.array(variances)  # variances = [0.1, 0.1, 0.2, 0.2]
        self.clip = clip  # the original hard-coded True here, ignoring the argument
        super(PriorBox, self).__init__(**kwargs)
def compute_output_shape(self, input_shape):
num_priors_ = len(self.aspect_ratios) # 4
layer_width = input_shape[self.waxis] # waxis = 2;layer_width = 38
layer_height = input_shape[self.haxis] # 38
num_boxes = num_priors_ * layer_width * layer_height # 4*38*38=5776
return (input_shape[0], num_boxes, 8)
    def call(self, x, mask=None):
        if hasattr(x, '_keras_shape'):
            input_shape = x._keras_shape  # (?, 38, 38, 512)
        elif hasattr(K, 'int_shape'):
            input_shape = K.int_shape(x)
        # ------------------ #
        #  feature-map size
        # ------------------ #
        layer_width = input_shape[self.waxis]   # 38
        layer_height = input_shape[self.haxis]  # 38
        img_width = self.img_size[0]   # 300
        img_height = self.img_size[1]  # 300
        box_widths = []   # [30.0, 42.43, 42.43, 21.21]
        box_heights = []  # [30.0, 42.43, 21.21, 42.43]
        for ar in self.aspect_ratios:
            if ar == 1 and len(box_widths) == 0:
                box_widths.append(self.min_size)
                box_heights.append(self.min_size)
            elif ar == 1 and len(box_widths) > 0:
                box_widths.append(np.sqrt(self.min_size * self.max_size))
                box_heights.append(np.sqrt(self.min_size * self.max_size))
            elif ar != 1:
                box_widths.append(self.min_size * np.sqrt(ar))
                box_heights.append(self.min_size / np.sqrt(ar))
        box_widths = 0.5 * np.array(box_widths)    # [15. 21.21320344 21.21320344 10.60660172]
        box_heights = 0.5 * np.array(box_heights)  # [15. 21.21320344 10.60660172 21.21320344]
        step_x = img_width / layer_width    # 300/38 = 7.89
        step_y = img_height / layer_height
        linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x,
                           layer_width)  # [3.9, ..., 296], spacing 7.89
        liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y,
                           layer_height)
        centers_x, centers_y = np.meshgrid(linx, liny)
        centers_x = centers_x.reshape(-1, 1)
        centers_y = centers_y.reshape(-1, 1)
        num_priors_ = len(self.aspect_ratios)  # 4
        # each prior needs its center twice: once for the top-left corner, once for the bottom-right
        prior_boxes = np.concatenate((centers_x, centers_y), axis=1)  # [1444, 2]
        prior_boxes = np.tile(prior_boxes, (1, 2 * num_priors_))      # [1444, 16]
        # top-left and bottom-right corners of each prior
        prior_boxes[:, ::4] -= box_widths
        prior_boxes[:, 1::4] -= box_heights
        prior_boxes[:, 2::4] += box_widths
        prior_boxes[:, 3::4] += box_heights
        # normalize to fractions of the image size
        prior_boxes[:, ::2] /= img_width   # img_width = 300
        prior_boxes[:, 1::2] /= img_height
        prior_boxes = prior_boxes.reshape(-1, 4)  # 1444*4 = 5776 -> [5776, 4]
        prior_boxes = np.minimum(np.maximum(prior_boxes, 0.0), 1.0)
        num_boxes = len(prior_boxes)  # 5776
        if len(self.variances) == 1:
            variances = np.ones((num_boxes, 4)) * self.variances[0]
        elif len(self.variances) == 4:
            variances = np.tile(self.variances, (num_boxes, 1))  # [5776, 4]
        else:
            raise Exception('Must provide one or four variances.')
        prior_boxes = np.concatenate((prior_boxes, variances), axis=1)  # [5776, 8]
        prior_boxes_tensor = K.expand_dims(K.variable(prior_boxes), 0)  # [1, 5776, 8]
        pattern = [tf.shape(x)[0], 1, 1]  # x[batch_size,38,38,512], pattern[batch_size,1,1]
        prior_boxes_tensor = tf.tile(prior_boxes_tensor, pattern)  # [batch_size, 5776, 8]
        return prior_boxes_tensor
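A quick numeric check of the box_widths/box_heights quoted in the comments above, for conv4_3 (min_size=30, max_size=60, effective aspect_ratios [1, 1, 2, 0.5]):
import numpy as np

min_size, max_size = 30.0, 60.0
ws = [min_size, np.sqrt(min_size * max_size),        # ar = 1, twice
      min_size * np.sqrt(2), min_size / np.sqrt(2)]  # ar = 2 and 0.5
print(np.round(ws, 2))  # [30.   42.43 42.43 21.21]  (heights swap the last two)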
When computing the classification loss, SSD treats priors matched to a ground-truth box as positives and background predictions as negatives. Since negatives vastly outnumber positives, the number of negatives kept is min(3 x positives, total priors - positives).
'''
1. Classification and localization loss for the positive samples.
2. Hard negative mining: rank the negatives by confidence and keep the top K; compute their classification loss.
3. Total loss = positive conf loss + negative conf loss + localization loss
(sketches of _softmax_loss and _l1_smooth_loss follow the function below)
'''
def compute_loss(self, y_true, y_pred):  # [b,8732,33]
    batch_size = tf.shape(y_true)[0]              # e.g. 4 images
    num_boxes = tf.to_float(tf.shape(y_true)[1])  # 8732.0
    # classification loss over all priors: 4+21+8 layout, classes are [4:-8]
    conf_loss = self._softmax_loss(y_true[:, :, 4:-8],
                                   y_pred[:, :, 4:-8])  # (4, 8732, 21) -> (4, 8732)
    loc_loss = self._l1_smooth_loss(y_true[:, :, :4],
                                    y_pred[:, :, :4])   # (4, 8732, 4) -> (4, 8732)
    # slot -8 is the positive mask set in assign_boxes
    num_pos = tf.reduce_sum(y_true[:, :, -8], axis=-1)  # shape=(4,), e.g. [12., 22., 19., 9.]
    pos_loc_loss = tf.reduce_sum(loc_loss * y_true[:, :, -8],
                                 axis=1)  # [4,8732] -> [4]
    pos_conf_loss = tf.reduce_sum(conf_loss * y_true[:, :, -8],
                                  axis=1)  # [4,8732] -> [4]
    # negatives: min(3 * positives, total - positives)
    num_neg = tf.minimum(self.neg_pos_ratio * num_pos,  # e.g. [36., 66., 57., 27.] vs [8720., 8710., 8713., 8723.]
                         num_boxes - num_pos)           # -> [36., 66., 57., 27.]
    pos_num_neg_mask = tf.greater(num_neg, 0)           # [True, True, True, True]
    has_min = tf.to_float(tf.reduce_any(pos_num_neg_mask))  # 1.0
    # if no image has negatives, fall back to a fixed negative count
    num_neg = tf.concat(axis=0, values=[num_neg,
                        [(1 - has_min) * self.negatives_for_hard]])  # e.g. [36., 66., 57., 27., 0.]
    num_neg_batch = tf.reduce_mean(tf.boolean_mask(num_neg,  # e.g. 46.5
                                                   tf.greater(num_neg, 0)))
    num_neg_batch = tf.to_int32(num_neg_batch)  # 46
    # class confidences start at [5:-8] (skip loc and background)
    confs_start = 4 + self.background_label_id + 1  # confs_start = 5
    confs_end = confs_start + self.num_classes - 1  # 25
    max_confs = tf.reduce_max(y_pred[:, :, confs_start:confs_end],
                              axis=2)  # (4, 8732, 20) -> (4, 8732)
    # hardest negatives: highest foreground confidence among non-positives
    _, indices = tf.nn.top_k(max_confs * (1 - y_true[:, :, -8]),  # indices.shape=(4,46)
                             k=num_neg_batch)  # num_neg_batch=46
    # flatten the (batch, prior) indices into 1-D gather indices
    batch_idx = tf.expand_dims(tf.range(0, batch_size), 1)  # (4, 1)
    batch_idx = tf.tile(batch_idx, (1, num_neg_batch))      # (4, 46)
    full_indices = (tf.reshape(batch_idx, [-1]) * tf.to_int32(num_boxes) +  # num_boxes=8732
                    tf.reshape(indices, [-1]))  # full_indices.shape: 4*46 = 184
    neg_conf_loss = tf.gather(tf.reshape(conf_loss, [-1]),
                              full_indices)  # (184,)
    neg_conf_loss = tf.reshape(neg_conf_loss,
                               [batch_size, num_neg_batch])  # [184] -> [4,46]
    neg_conf_loss = tf.reduce_sum(neg_conf_loss, axis=1)  # [4,46] -> [4]
    # avoid dividing by zero when an image has no positives
    num_pos = tf.where(tf.not_equal(num_pos, 0), num_pos,
                       tf.ones_like(num_pos))
    total_loss = tf.reduce_sum(pos_conf_loss) + tf.reduce_sum(neg_conf_loss)
    total_loss /= tf.reduce_sum(num_pos)
    total_loss += tf.reduce_sum(alpha * pos_loc_loss) / tf.reduce_sum(num_pos)
    return total_loss
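The two helpers used above are not shown; minimal sketches of their usual forms in this family of ssd_keras implementations (treat them as illustrations rather than the repo's verbatim code):
def _l1_smooth_loss(self, y_true, y_pred):
    # smooth L1: quadratic below 1, linear above, summed over the 4 coords
    abs_loss = tf.abs(y_true - y_pred)
    sq_loss = 0.5 * (y_true - y_pred) ** 2
    l1_loss = tf.where(tf.less(abs_loss, 1.0), sq_loss, abs_loss - 0.5)
    return tf.reduce_sum(l1_loss, -1)

def _softmax_loss(self, y_true, y_pred):
    # cross entropy, with predictions clipped away from 0 for numeric safety
    y_pred = tf.maximum(tf.minimum(y_pred, 1 - 1e-15), 1e-15)
    return -tf.reduce_sum(y_true * tf.log(y_pred), axis=-1)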
'''
1. Preprocess the image and run the model to get predictions.
2. Decode and filter the predictions.
3. Draw the boxes.
'''
def detect_image(self, image):
    image_shape = np.array(np.shape(image)[0:2])  # original image size
    # letterbox: resize with gray padding to keep the aspect ratio
    crop_img, x_offset, y_offset = letterbox_image(image, (self.model_image_size[0], self.model_image_size[1]))
    photo = np.array(crop_img, dtype=np.float64)  # photo.shape = (300, 300, 3)
    # preprocess (normalize) and add the batch dimension: [300,300,3] -> [1,300,300,3]
    photo = preprocess_input(np.reshape(photo, [1, self.model_image_size[0], self.model_image_size[1], 3]))
    preds = self.ssd_model.predict(photo)  # predictions (Concatenate) (None, 8732, 33), 4+21+8=33
    # decode the predictions --> filter --> NMS --> keep top_k
    results = self.bbox_util.detection_out(preds, confidence_threshold=self.confidence)
    if len(results[0]) <= 0:
        return image
    # keep boxes scoring above confidence; rows are [label, conf, det_xmin, det_ymin, det_xmax, det_ymax]
    det_label = results[0][:, 0]
    det_conf = results[0][:, 1]
    det_xmin, det_ymin, det_xmax, det_ymax = results[0][:, 2], results[0][:, 3], results[0][:, 4], results[0][:, 5]
    top_indices = [i for i, conf in enumerate(det_conf) if conf >= self.confidence]
    top_conf = det_conf[top_indices]
    top_label_indices = det_label[top_indices].tolist()
    top_xmin, top_ymin, top_xmax, top_ymax = np.expand_dims(det_xmin[top_indices], -1), np.expand_dims(
        det_ymin[top_indices], -1), np.expand_dims(det_xmax[top_indices], -1), np.expand_dims(det_ymax[top_indices],
                                                                                              -1)
    # remove the gray letterbox padding
    boxes = ssd_correct_boxes(top_ymin, top_xmin, top_ymax, top_xmax,  # [200, 4]
                              np.array([self.model_image_size[0], self.model_image_size[1]]), image_shape)
    font = ImageFont.truetype(font='model_data/simhei.ttf',
                              size=np.floor(3e-2 * np.shape(image)[1] + 0.5).astype('int32'))
    thickness = (np.shape(image)[0] + np.shape(image)[1]) // self.model_image_size[0]
    for i, c in enumerate(top_label_indices):  # e.g. [2.0, 15.0, 15.0, 15.0, 7.0]
        predicted_class = self.class_names[int(c) - 1]
        score = top_conf[i]
        top, left, bottom, right = boxes[i]  # np.shape(image) = (1330, 1330, 3)
        # pad the box by 5 px on each side
        top = top - 5
        left = left - 5
        bottom = bottom + 5
        right = right + 5
        # round and clamp so the box stays inside the image
        top = max(0, np.floor(top + 0.5).astype('int32'))
        left = max(0, np.floor(left + 0.5).astype('int32'))
        bottom = min(np.shape(image)[0], np.floor(bottom + 0.5).astype('int32'))
        right = min(np.shape(image)[1], np.floor(right + 0.5).astype('int32'))
        # draw the box
        label = '{} {:.2f}'.format(predicted_class, score)
        draw = ImageDraw.Draw(image)
        label_size = draw.textsize(label, font)
        label = label.encode('utf-8')
        print(label)
        # place the label above the box if there is room, else just inside it
        if top - label_size[1] >= 0:
            text_origin = np.array([left, top - label_size[1]])  # xy
        else:
            text_origin = np.array([left, top + 1])
        for t in range(thickness):  # 't' instead of 'i' to avoid shadowing the loop variable
            draw.rectangle(
                [left + t, top + t, right - t, bottom - t],
                outline=self.colors[int(c) - 1])  # box outline
        draw.rectangle(
            [tuple(text_origin), tuple(text_origin + label_size)],
            fill=self.colors[int(c) - 1])  # label background
        draw.text(text_origin, str(label, 'UTF-8'), fill=(0, 0, 0), font=font)  # label text
        del draw
    return image
'''
1. Decode
2. Filter by confidence
3. NMS
4. Keep the top_k
'''
def detection_out(self, predictions, background_label_id=0, keep_top_k=200,
                  confidence_threshold=0.5):
    # network output layout [4+21+4+4]: predicted offsets + confidences + priors + variances [0.1,0.1,0.2,0.2]
    mbox_loc = predictions[:, :, :4]          # (1, 8732, 4)
    # variances 0.1, 0.1, 0.2, 0.2
    variances = predictions[:, :, -4:]        # (1, 8732, 4)
    # prior boxes
    mbox_priorbox = predictions[:, :, -8:-4]  # (1, 8732, 4)
    # class confidences
    mbox_conf = predictions[:, :, 4:-8]       # (1, 8732, 21)
    results = []
    # process each image in the batch
    for i in range(len(mbox_loc)):
        results.append([])
        decode_bbox = self.decode_boxes(mbox_loc[i], mbox_priorbox[i], variances[i])  # decode, (8732, 4)
        for c in range(self.num_classes):
            if c == background_label_id:  # index 0 is the background
                continue
            c_confs = mbox_conf[i, :, c]
            c_confs_m = c_confs > confidence_threshold
            if len(c_confs[c_confs_m]) > 0:
                # boxes scoring above confidence_threshold for this class
                boxes_to_process = decode_bbox[c_confs_m]
                confs_to_process = c_confs[c_confs_m]
                # IoU-based non-maximum suppression
                feed_dict = {self.boxes: boxes_to_process,
                             self.scores: confs_to_process}
                idx = self.sess.run(self.nms, feed_dict=feed_dict)
                # keep the boxes that survive NMS
                good_boxes = boxes_to_process[idx]
                confs = confs_to_process[idx][:, None]  # to column shape
                # stack label, confidence and box coordinates
                labels = c * np.ones((len(idx), 1))  # c is the numeric class id
                c_pred = np.concatenate((labels, confs, good_boxes),
                                        axis=1)
                # append to this image's results
                results[-1].extend(c_pred)
        if len(results[-1]) > 0:
            # sort by confidence, descending
            results[-1] = np.array(results[-1])
            argsort = np.argsort(results[-1][:, 1])[::-1]
            results[-1] = results[-1][argsort]
            # keep the keep_top_k most confident detections
            results[-1] = results[-1][:keep_top_k]
    return results
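detection_out assumes self.boxes, self.scores and the self.nms op were built beforehand, typically in the utility's __init__. A minimal TF1 sketch of that setup; the attribute names self._top_k and self._nms_thresh and their values are my assumptions:
# inside the bbox utility's __init__ (a sketch):
self.boxes = tf.placeholder(dtype='float32', shape=(None, 4))
self.scores = tf.placeholder(dtype='float32', shape=(None,))
self.nms = tf.image.non_max_suppression(self.boxes, self.scores,
                                        self._top_k,                     # e.g. 400
                                        iou_threshold=self._nms_thresh)  # e.g. 0.45
self.sess = tf.Session()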
'''
1. Convert the priors' corner coordinates to center coordinates plus width and height.
2. Compute the predicted box's center, width and height from the offsets.
3. Convert back to the predicted box's top-left and bottom-right corners.
'''
def decode_boxes(self, mbox_loc, mbox_priorbox, variances):
    # 1. prior width/height and center: x1,y1,x2,y2 --> cx,cy,w,h
    prior_width = mbox_priorbox[:, 2] - mbox_priorbox[:, 0]
    prior_height = mbox_priorbox[:, 3] - mbox_priorbox[:, 1]
    prior_center_x = 0.5 * (mbox_priorbox[:, 2] + mbox_priorbox[:, 0])
    prior_center_y = 0.5 * (mbox_priorbox[:, 3] + mbox_priorbox[:, 1])
    # 2. offset of the predicted center from the prior center
    decode_bbox_center_x = mbox_loc[:, 0] * prior_width * variances[:, 0]
    decode_bbox_center_x += prior_center_x
    decode_bbox_center_y = mbox_loc[:, 1] * prior_height * variances[:, 1]
    decode_bbox_center_y += prior_center_y
    # predicted width and height
    decode_bbox_width = np.exp(mbox_loc[:, 2] * variances[:, 2])
    decode_bbox_width *= prior_width
    decode_bbox_height = np.exp(mbox_loc[:, 3] * variances[:, 3])
    decode_bbox_height *= prior_height
    # 3. top-left and bottom-right corners of the predicted box
    decode_bbox_xmin = decode_bbox_center_x - 0.5 * decode_bbox_width
    decode_bbox_ymin = decode_bbox_center_y - 0.5 * decode_bbox_height
    decode_bbox_xmax = decode_bbox_center_x + 0.5 * decode_bbox_width
    decode_bbox_ymax = decode_bbox_center_y + 0.5 * decode_bbox_height
    # stack the corners
    decode_bbox = np.concatenate((decode_bbox_xmin[:, None],
                                  decode_bbox_ymin[:, None],
                                  decode_bbox_xmax[:, None],
                                  decode_bbox_ymax[:, None]), axis=-1)
    # clip to [0, 1]
    decode_bbox = np.minimum(np.maximum(decode_bbox, 0.0), 1.0)
    return decode_bbox
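Finally, a quick roundtrip sanity check of the decode formulas (my own numbers): with zero predicted offsets, decoding must return the prior box itself.
import numpy as np

prior = np.array([[0.2, 0.2, 0.6, 0.6]])  # x1, y1, x2, y2 (fractions of the image)
var = np.array([[0.1, 0.1, 0.2, 0.2]])
loc = np.zeros((1, 4))                    # the network predicts no offset
pw, ph = prior[:, 2] - prior[:, 0], prior[:, 3] - prior[:, 1]
cx = 0.5 * (prior[:, 0] + prior[:, 2]) + loc[:, 0] * pw * var[:, 0]
cy = 0.5 * (prior[:, 1] + prior[:, 3]) + loc[:, 1] * ph * var[:, 1]
w, h = pw * np.exp(loc[:, 2] * var[:, 2]), ph * np.exp(loc[:, 3] * var[:, 3])
print(cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h)  # [0.2] [0.2] [0.6] [0.6]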