SSD(Single Shot MultiBox Detector),是一种非常优秀的one-stage方法。
one-stage算法是指目标的定位与分类同时完成:其主要思路是在图片的不同位置均匀地进行密集抽样,抽样时可以采用不同的尺度和长宽比,然后利用CNN提取特征后直接进行分类与回归。整个过程只需要一步,所以其优势是速度快。但是均匀密集采样的一个重要缺点是训练比较困难,这主要是因为正样本与负样本(背景)极其不均衡(参见Focal Loss),导致模型准确度稍低。
1.提取特征的SSD网络结构
#1.提取图像特征的网络结构
import colorsys
import os
import pickle
from random import shuffle

import keras.backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard
from keras.layers import Input,Conv2D,Dense,Flatten,Activation,GlobalAveragePooling2D,MaxPooling2D,Reshape,ZeroPadding2D,merge,concatenate
from keras.models import Model
from keras.optimizers import Adam
from PIL import ImageDraw, ImageFont
def VGG16(input_tensor):
    """Build the SSD feature extractor: a VGG16 backbone plus extra layers.

    Args:
        input_tensor: Keras input tensor. The shape comments below assume a
            300x300x3 image.

    Returns:
        dict mapping each layer name to its output tensor. The entries
        ``conv4_3``, ``fc7``, ``conv6_2``, ``conv7_2``, ``conv8_2`` and
        ``conv9_2`` are the six feature maps consumed by ``SSD300``.
    """
    net = {}
    net['input'] = input_tensor

    # Block 1: 300x300x3 -> 150x150x64
    net['conv1_1'] = Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same', name='conv1_1')(net['input'])
    net['conv1_2'] = Conv2D(64, kernel_size=(3, 3), activation='relu', padding='same', name='conv1_2')(net['conv1_1'])
    net['pool1'] = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool1')(net['conv1_2'])

    # Block 2: -> 75x75x128
    net['conv2_1'] = Conv2D(128, kernel_size=(3, 3), activation='relu', padding='same', name='conv2_1')(net['pool1'])
    net['conv2_2'] = Conv2D(128, kernel_size=(3, 3), activation='relu', padding='same', name='conv2_2')(net['conv2_1'])
    net['pool2'] = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool2')(net['conv2_2'])

    # Block 3: -> 38x38x256
    net['conv3_1'] = Conv2D(256, kernel_size=(3, 3), activation='relu', padding='same', name='conv3_1')(net['pool2'])
    net['conv3_2'] = Conv2D(256, kernel_size=(3, 3), activation='relu', padding='same', name='conv3_2')(net['conv3_1'])
    net['conv3_3'] = Conv2D(256, kernel_size=(3, 3), activation='relu', padding='same', name='conv3_3')(net['conv3_2'])
    net['pool3'] = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool3')(net['conv3_3'])

    # Block 4: conv4_3 is the first detection feature map (38x38x512).
    net['conv4_1'] = Conv2D(512, kernel_size=(3, 3), activation='relu', padding='same', name='conv4_1')(net['pool3'])
    net['conv4_2'] = Conv2D(512, kernel_size=(3, 3), activation='relu', padding='same', name='conv4_2')(net['conv4_1'])
    net['conv4_3'] = Conv2D(512, kernel_size=(3, 3), activation='relu', padding='same', name='conv4_3')(net['conv4_2'])
    net['pool4'] = MaxPooling2D((2, 2), strides=(2, 2), padding='same', name='pool4')(net['conv4_3'])  # 19x19x512

    # Block 5: pool5 uses stride 1, so the spatial size stays 19x19.
    net['conv5_1'] = Conv2D(512, kernel_size=(3, 3), activation='relu', padding='same', name='conv5_1')(net['pool4'])
    net['conv5_2'] = Conv2D(512, kernel_size=(3, 3), activation='relu', padding='same', name='conv5_2')(net['conv5_1'])
    net['conv5_3'] = Conv2D(512, kernel_size=(3, 3), activation='relu', padding='same', name='conv5_3')(net['conv5_2'])
    net['pool5'] = MaxPooling2D((3, 3), strides=(1, 1), padding='same', name='pool5')(net['conv5_3'])

    # fc6 is a dilated convolution; fc7 is the second detection feature map
    # (19x19x1024).
    net['fc6'] = Conv2D(1024, kernel_size=(3, 3), dilation_rate=(6, 6), activation='relu', padding='same', name='fc6')(net['pool5'])
    net['fc7'] = Conv2D(1024, kernel_size=(1, 1), activation='relu', padding='same', name='fc7')(net['fc6'])

    # conv6_2: third detection feature map (10x10x512). The intermediate
    # padded tensor is stored under the same key and then overwritten.
    net['conv6_1'] = Conv2D(256, kernel_size=(1, 1), activation='relu', padding='same', name='conv6_1')(net['fc7'])
    net['conv6_2'] = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv6_padding')(net['conv6_1'])
    net['conv6_2'] = Conv2D(512, kernel_size=(3, 3), strides=(2, 2), activation='relu', padding='valid', name='conv6_2')(net['conv6_2'])

    # conv7_2: fourth detection feature map (5x5x256).
    net['conv7_1'] = Conv2D(128, kernel_size=(1, 1), activation='relu', padding='same', name='conv7_1')(net['conv6_2'])
    net['conv7_2'] = ZeroPadding2D(padding=((1, 1), (1, 1)), name='conv7_padding')(net['conv7_1'])
    net['conv7_2'] = Conv2D(256, kernel_size=(3, 3), strides=(2, 2), activation='relu', padding='valid', name='conv7_2')(net['conv7_2'])

    # conv8_2: fifth detection feature map (3x3x256). It must be fed from
    # conv7_2 (the 5x5 map), not from conv7_1.
    net['conv8_1'] = Conv2D(128, kernel_size=(1, 1), activation='relu', padding='same', name='conv8_1')(net['conv7_2'])
    net['conv8_2'] = Conv2D(256, kernel_size=(3, 3), strides=(1, 1), activation='relu', padding='valid', name='conv8_2')(net['conv8_1'])

    # conv9_2: sixth detection feature map (1x1x256).
    net['conv9_1'] = Conv2D(128, kernel_size=(1, 1), activation='relu', padding='same', name='conv9_1')(net['conv8_2'])
    net['conv9_2'] = Conv2D(256, kernel_size=(3, 3), strides=(1, 1), activation='relu', padding='valid', name='conv9_2')(net['conv9_1'])
    return net
#2.ssd结构,对特征层进行处理,使得每一个特征层输出每个先验框的调整参数和每个先验框的类别
def SSD300(input_shape, num_classes=21):
    """Assemble the SSD300 detection model.

    Each of the six feature maps returned by ``VGG16`` gets a localisation
    head, a classification head and a ``PriorBox`` layer. The per-map
    outputs are flattened, concatenated and merged into one tensor.

    Args:
        input_shape: (height, width, channels) of the input image.
        num_classes: number of classes including background.

    Returns:
        keras Model whose output has shape
        (batch, num_boxes, 4 + num_classes + 8): box offsets, class
        probabilities, then prior-box corners and variances.
    """
    input_tensor = Input(shape=input_shape)
    # PriorBox reads img_size as (height, width); take it from the static
    # shape rather than by indexing the symbolic tensor.
    img_size = (input_shape[0], input_shape[1])

    net = VGG16(input_tensor)
    net['conv4_3_norm'] = Normalize(20, name='conv4_3_norm')(net['conv4_3'])

    # (source layer, priors per grid cell, min_size, max_size, extra ratios)
    head_specs = [
        ('conv4_3_norm', 4, 30.0, 60.0, [2]),
        ('fc7', 6, 60.0, 111.0, [2, 3]),
        ('conv6_2', 6, 111.0, 162.0, [2, 3]),
        ('conv7_2', 6, 162.0, 213.0, [2, 3]),
        ('conv8_2', 4, 213.0, 264.0, [2]),
        ('conv9_2', 4, 264.0, 315.0, [2]),
    ]
    for source, num_priors, min_size, max_size, ratios in head_specs:
        prefix = source + '_mbox_'
        features = net[source]
        # Four offset values per prior box.
        net[prefix + 'loc'] = Conv2D(num_priors * 4, kernel_size=(3, 3), padding='same',
                                     name=prefix + 'loc')(features)
        net[prefix + 'loc_flat'] = Flatten(name=prefix + 'loc_flat')(net[prefix + 'loc'])
        # One score per class per prior box.
        net[prefix + 'conf'] = Conv2D(num_priors * num_classes, kernel_size=(3, 3), padding='same',
                                      name=prefix + 'conf')(features)
        net[prefix + 'conf_flat'] = Flatten(name=prefix + 'conf_flat')(net[prefix + 'conf'])
        # Prior-box corners and variances for this feature map.
        priorbox = PriorBox(img_size, min_size, max_size=max_size, aspect_ratios=ratios,
                            variances=[0.1, 0.1, 0.2, 0.2], name=prefix + 'priorbox')
        net[prefix + 'priorbox'] = priorbox(features)

    # Stack the results of all feature maps along the box axis.
    sources = [spec[0] for spec in head_specs]
    net['mbox_loc'] = concatenate([net[s + '_mbox_loc_flat'] for s in sources],
                                  axis=1, name='mbox_loc')
    net['mbox_conf'] = concatenate([net[s + '_mbox_conf_flat'] for s in sources],
                                   axis=1, name='mbox_conf')
    net['mbox_priorbox'] = concatenate([net[s + '_mbox_priorbox'] for s in sources],
                                       axis=1, name='mbox_priorbox')

    flat_loc = net['mbox_loc']
    if hasattr(flat_loc, '_keras_shape'):
        num_boxes = flat_loc._keras_shape[-1] // 4
    else:
        num_boxes = K.int_shape(flat_loc)[-1] // 4

    net['mbox_loc'] = Reshape((num_boxes, 4), name='mbox_loc_final')(net['mbox_loc'])
    net['mbox_conf'] = Reshape((num_boxes, num_classes), name='mbox_conf_logits')(net['mbox_conf'])
    net['mbox_conf'] = Activation('softmax', name='mbox_conf_final')(net['mbox_conf'])
    net['predictions'] = concatenate([net['mbox_loc'], net['mbox_conf'], net['mbox_priorbox']],
                                     axis=2, name='predictions')
    model = Model(net['input'], net['predictions'])
    return model
2.计算所有先验框应该的位置
#2.计算所有先验框的位置坐标
import numpy as np
import tensorflow as tf
from keras.engine.topology import InputSpec
from keras.engine.topology import Layer
class Normalize(Layer):
    """L2-normalise the input along the channel axis, then rescale it.

    The scale is a per-channel weight ``gamma`` initialised to ``scale``
    and registered as trainable.

    Args:
        scale: initial value of every entry of ``gamma``.
        **kwargs: forwarded to ``Layer.__init__`` (e.g. ``name``).
    """

    def __init__(self, scale, **kwargs):
        if K.image_dim_ordering() == 'tf':
            self.axis = 3  # (batch, h, w, c)
        else:
            self.axis = 1  # (batch, c, h, w)
        self.scale = scale
        super(Normalize, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the per-channel ``gamma`` weight."""
        self.input_spec = [InputSpec(shape=input_shape)]
        shape = (input_shape[self.axis],)  # number of channels
        init_gamma = self.scale * np.ones(shape)
        self.gamma = K.variable(init_gamma, name='{}_gamma'.format(self.name))
        self.trainable_weights = [self.gamma]

    def call(self, x, mask=None):
        """Return ``l2_normalize(x, channel_axis) * gamma``."""
        output = K.l2_normalize(x, self.axis)
        output *= self.gamma
        return output
class PriorBox(Layer):
    """Keras layer that emits the prior (anchor) boxes of one feature map.

    The boxes depend only on the static shape of the input feature map,
    so they are computed with numpy and tiled over the batch dimension.

    Args:
        img_size: (height, width) of the network input image.
        min_size: side length of the smallest square prior; must be > 0.
        max_size: if given, a second square prior with side
            ``sqrt(min_size * max_size)`` is added.
        aspect_ratios: extra width/height ratios, e.g. ``[2]`` or ``[2, 3]``.
            The misspelled keyword ``aspect_rations`` is still accepted.
        flip: also add the reciprocal of every extra ratio.
        variances: one or four variance values appended to every box.
        clip: clamp box coordinates to [0, 1].
        **kwargs: forwarded to ``Layer.__init__`` (e.g. ``name``).

    Raises:
        Exception: if ``min_size`` is not positive, ``max_size`` is smaller
            than ``min_size``, or ``variances`` has neither one nor four
            entries (the last is raised from ``call``).
    """

    def __init__(self, img_size, min_size, max_size=None, aspect_ratios=None,
                 flip=True, variances=(0.1,), clip=True, **kwargs):
        # Backward compatibility with the original misspelled keyword.
        legacy_ratios = kwargs.pop('aspect_rations', None)
        if aspect_ratios is None:
            aspect_ratios = legacy_ratios
        if K.image_dim_ordering() == 'tf':
            self.haxis = 1
            self.waxis = 2
        else:
            self.haxis = 2
            self.waxis = 3
        self.img_size = img_size
        if min_size <= 0:
            raise Exception('min_size must be positive.')
        self.min_size = min_size
        self.max_size = max_size
        self.aspect_ratios = [1.0]
        if max_size:
            if max_size < min_size:
                raise Exception('max_size must be greater than min_size.')
            self.aspect_ratios.append(1.0)
        if aspect_ratios:
            for ar in aspect_ratios:
                if ar in self.aspect_ratios:
                    continue
                # Ends up as [1, 1, 2, 0.5] or [1, 1, 2, 0.5, 3, 0.33].
                self.aspect_ratios.append(ar)
                if flip:
                    self.aspect_ratios.append(1.0 / ar)
        self.variances = np.array(variances)
        self.clip = clip
        super(PriorBox, self).__init__(**kwargs)

    def compute_output_shape(self, input_shape):
        """Return (batch, boxes in this feature map, 8)."""
        num_priors = len(self.aspect_ratios)  # 4 or 6 in SSD300
        layer_width = input_shape[self.waxis]
        layer_height = input_shape[self.haxis]
        num_boxs = num_priors * layer_width * layer_height
        return (input_shape[0], num_boxs, 8)

    def call(self, x, mask=None):
        """Return a (batch, num_boxes, 8) tensor of prior boxes.

        The last axis holds xmin, ymin, xmax, ymax (as fractions of the
        image size) followed by the four variances.
        """
        if hasattr(x, '_keras_shape'):
            input_shape = x._keras_shape
        else:
            input_shape = K.int_shape(x)
        # Feature-map size, e.g. 38x38, 19x19, 10x10, ...
        layer_height = input_shape[self.haxis]
        layer_width = input_shape[self.waxis]
        # Input image size, e.g. 300x300.
        img_height = self.img_size[0]
        img_width = self.img_size[1]

        # 1. Width and height of every prior box shape.
        box_widths = []
        box_heights = []
        for ar in self.aspect_ratios:
            if ar == 1 and len(box_widths) == 0:
                # First (small) square.
                box_widths.append(self.min_size)
                box_heights.append(self.min_size)
            elif ar == 1 and len(box_widths) > 0:
                # Second (larger) square.
                box_widths.append(np.sqrt(self.min_size * self.max_size))
                box_heights.append(np.sqrt(self.min_size * self.max_size))
            elif ar != 1:
                # Rectangles for the remaining aspect ratios.
                box_widths.append(self.min_size * np.sqrt(ar))
                box_heights.append(self.min_size / np.sqrt(ar))
        # Keep half extents so they can be added to / subtracted from centres.
        box_widths = 0.5 * np.array(box_widths)
        box_heights = 0.5 * np.array(box_heights)

        # 2. Centre of every grid cell, in image pixels.
        step_x = img_width / layer_width
        step_y = img_height / layer_height
        linx = np.linspace(0.5 * step_x, img_width - 0.5 * step_x, layer_width)
        liny = np.linspace(0.5 * step_y, img_height - 0.5 * step_y, layer_height)
        centers_x, centers_y = np.meshgrid(linx, liny)
        centers_x = centers_x.reshape(-1, 1)
        centers_y = centers_y.reshape(-1, 1)

        # 3. Top-left and bottom-right corners of every prior box.
        num_priors = len(self.aspect_ratios)
        prior_boxs = np.concatenate((centers_x, centers_y), axis=1)
        # Repeat (cx, cy) twice per prior: one pair per corner.
        prior_boxs = np.tile(prior_boxs, (1, 2 * num_priors))
        prior_boxs[:, ::4] -= box_widths     # xmin
        prior_boxs[:, 1::4] -= box_heights   # ymin
        prior_boxs[:, 2::4] += box_widths    # xmax
        prior_boxs[:, 3::4] += box_heights   # ymax
        # Express coordinates as fractions of the image size.
        prior_boxs[:, ::2] /= img_width
        prior_boxs[:, 1::2] /= img_height
        prior_boxs = prior_boxs.reshape(-1, 4)
        if self.clip:
            prior_boxs = np.minimum(np.maximum(prior_boxs, 0.0), 1.0)

        # 4. Append the variances to every box.
        num_boxs = len(prior_boxs)
        if len(self.variances) == 1:
            variances = np.ones((num_boxs, 4)) * self.variances[0]
        elif len(self.variances) == 4:
            variances = np.tile(self.variances, (num_boxs, 1))
        else:
            raise Exception('Must provide one or four variances.')
        prior_boxs = np.concatenate((prior_boxs, variances), axis=1)

        # 5. Tile the constant boxes over the dynamic batch dimension.
        prior_boxs_tensor = K.expand_dims(K.variable(prior_boxs), 0)
        pattern = [tf.shape(x)[0], 1, 1]
        prior_boxs_tensor = tf.tile(prior_boxs_tensor, pattern)
        return prior_boxs_tensor
3.图像数据的预处理
#3.定义ssd类,输入图片的预处理和输出图片结果处理
#预测输入图像数据的处理
from PIL import Image
def letterbox_image(image, size):
    """Resize ``image`` to fit inside ``size`` and pad the rest with grey.

    The aspect ratio is preserved and the resized image is centred on a
    (128, 128, 128) canvas.

    Args:
        image: PIL image.
        size: (width, height) of the output canvas, e.g. (300, 300).

    Returns:
        (new_image, x_offset, y_offset): the padded image and the left/top
        padding as a fraction of the canvas width/height.
    """
    iw, ih = image.size
    w, h = size
    # The tighter of the two ratios makes the whole image fit.
    scale = min(w / iw, h / ih)
    nw = int(iw * scale)
    nh = int(ih * scale)

    image = image.resize((nw, nh), Image.BICUBIC)
    new_image = Image.new('RGB', size, (128, 128, 128))
    # paste() takes the upper-left corner as a single (x, y) tuple.
    pad_x = (w - nw) // 2
    pad_y = (h - nh) // 2
    new_image.paste(image, (pad_x, pad_y))
    # Normalise by the canvas size; this equals the former hard-coded /300
    # for a 300x300 canvas.
    x_offset, y_offset = pad_x / w, pad_y / h
    return new_image, x_offset, y_offset
def ssd_correct_boxes(top, left, bottom, right, input_shape, image_shape):
    """Map boxes from the letterboxed network input back to the source image.

    Args:
        top, left, bottom, right: arrays of shape (N, 1) holding box edges
            as fractions of the letterboxed input.
        input_shape: numpy array (height, width) of the network input.
        image_shape: numpy array (height, width) of the source image.

    Returns:
        Array of shape (N, 4) with rows (ymin, xmin, ymax, xmax) in source
        image pixels.
    """
    # Size of the image content inside the letterboxed input.
    new_shape = image_shape * np.min(input_shape / image_shape)
    # Grey padding on each side, as a fraction of the input size.
    offset = (input_shape - new_shape) / 2. / input_shape
    scale = input_shape / new_shape

    box_yx = np.concatenate(((top + bottom) / 2, (left + right) / 2), axis=-1)
    box_hw = np.concatenate((bottom - top, right - left), axis=-1)

    # Remove the padding, then stretch to the unpadded content.
    box_yx = (box_yx - offset) * scale
    box_hw *= scale

    box_mins = box_yx - (box_hw / 2.)
    box_maxes = box_yx + (box_hw / 2.)
    boxes = np.concatenate([
        box_mins[:, 0:1],
        box_mins[:, 1:2],
        box_maxes[:, 0:1],
        box_maxes[:, 1:2]
    ], axis=-1)
    # Fractions -> pixels of the source image.
    boxes *= np.concatenate([image_shape, image_shape], axis=-1)
    return boxes
class BBoxUtility(object):
    """Encode ground-truth boxes into SSD targets and decode predictions.

    Args:
        num_classes: number of classes including background.
        priors: array of shape (num_priors, 8); columns are xmin, ymin,
            xmax, ymax followed by four variances. May be None when only
            decoding is needed.
        overlap_threshold: IoU above which a prior is assigned to a box.
        nms_thresh: IoU threshold used by non-maximum suppression.
        top_k: maximum number of boxes kept by non-maximum suppression.
    """

    def __init__(self, num_classes, priors=None, overlap_threshold=0.5,
                 nms_thresh=0.45, top_k=400):
        self.num_classes = num_classes
        self.priors = priors
        self.num_priors = 0 if priors is None else len(priors)
        self.overlap_threshold = overlap_threshold
        self._nms_thresh = nms_thresh
        self._top_k = top_k
        self.boxes = tf.placeholder(dtype='float32', shape=(None, 4))
        self.scores = tf.placeholder(dtype='float32', shape=(None,))
        self.nms = self._build_nms()
        # NMS runs in its own CPU-only session.
        self.sess = tf.Session(config=tf.ConfigProto(device_count={'GPU': 0}))

    def _build_nms(self):
        """Build the NMS op from the current top_k and IoU threshold."""
        return tf.image.non_max_suppression(self.boxes, self.scores,
                                            self._top_k,
                                            iou_threshold=self._nms_thresh)

    @property
    def nms_thresh(self):
        return self._nms_thresh

    @nms_thresh.setter
    def nms_thresh(self, value):
        # The threshold is baked into the op, so rebuild it.
        self._nms_thresh = value
        self.nms = self._build_nms()

    @property
    def top_k(self):
        return self._top_k

    @top_k.setter
    def top_k(self, value):
        # top_k is baked into the op, so rebuild it.
        self._top_k = value
        self.nms = self._build_nms()

    def iou(self, box):
        """Return the IoU of one ground-truth ``box`` with every prior.

        Args:
            box: array (xmin, ymin, xmax, ymax).

        Returns:
            Array of shape (num_priors,).
        """
        # Intersection rectangle.
        inter_upleft = np.maximum(self.priors[:, :2], box[:2])
        inter_botright = np.minimum(self.priors[:, 2:4], box[2:])
        inter_wh = inter_botright - inter_upleft
        inter_wh = np.maximum(inter_wh, 0)
        inter = inter_wh[:, 0] * inter_wh[:, 1]
        # Area of the ground-truth box.
        area_true = (box[2] - box[0]) * (box[3] - box[1])
        # Area of every prior box.
        area_gt = (self.priors[:, 2] - self.priors[:, 0]) * (self.priors[:, 3] - self.priors[:, 1])
        union = area_true + area_gt - inter
        iou = inter / union
        return iou

    def encode_box(self, box, return_iou=True):
        """Encode one ground-truth box against all priors.

        Priors whose IoU exceeds ``overlap_threshold`` are assigned; if
        none does, the single best prior is assigned instead.

        Args:
            box: array (xmin, ymin, xmax, ymax).
            return_iou: append the IoU as a fifth column.

        Returns:
            Flattened array of num_priors * (4 + return_iou) values; rows
            of unassigned priors are zero.
        """
        iou = self.iou(box)
        encoded_box = np.zeros((self.num_priors, 4 + return_iou))
        assign_mask = iou > self.overlap_threshold
        if not assign_mask.any():
            assign_mask[iou.argmax()] = True
        if return_iou:
            encoded_box[:, -1][assign_mask] = iou[assign_mask]
        assigned_priors = self.priors[assign_mask]

        # Centre and size of the ground-truth box.
        box_center = 0.5 * (box[:2] + box[2:])
        box_wh = box[2:] - box[:2]
        # Centre and size of the assigned priors.
        assigned_priors_center = 0.5 * (assigned_priors[:, :2] +
                                        assigned_priors[:, 2:4])
        assigned_priors_wh = (assigned_priors[:, 2:4] -
                              assigned_priors[:, :2])

        # Inverse of decode_boxes: the offsets the network should predict.
        encoded_box[:, :2][assign_mask] = box_center - assigned_priors_center
        encoded_box[:, :2][assign_mask] /= assigned_priors_wh
        # Divide by the centre variances (prior columns -4 and -3).
        encoded_box[:, :2][assign_mask] /= assigned_priors[:, -4:-2]
        encoded_box[:, 2:4][assign_mask] = np.log(box_wh / assigned_priors_wh)
        # Divide by the size variances (prior columns -2 and -1).
        encoded_box[:, 2:4][assign_mask] /= assigned_priors[:, -2:]
        return encoded_box.ravel()

    def assign_boxes(self, boxes):
        """Build the training target for one image.

        Args:
            boxes: array (num_boxes, 4 + num_classes - 1): corners followed
                by a one-hot label over the non-background classes.

        Returns:
            Array (num_priors, 4 + num_classes + 8): encoded offsets, class
            one-hot (column 4 is background), then eight trailing columns
            of which the first flags priors that carry an object.
        """
        assignment = np.zeros((self.num_priors, 4 + self.num_classes + 8))
        # Every prior starts out as background.
        assignment[:, 4] = 1.0
        if len(boxes) == 0:
            return assignment
        # Encode every ground-truth box against all priors.
        encoded_boxes = np.apply_along_axis(self.encode_box, 1, boxes[:, :4])
        # -> (num_boxes, num_priors, 4 offsets + iou)
        encoded_boxes = encoded_boxes.reshape(-1, self.num_priors, 5)
        # For each prior, the ground-truth box with the highest IoU.
        best_iou = encoded_boxes[:, :, -1].max(axis=0)
        best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0)
        best_iou_mask = best_iou > 0
        best_iou_idx = best_iou_idx[best_iou_mask]
        assign_num = len(best_iou_idx)
        # Keep only the priors that were assigned to some box.
        encoded_boxes = encoded_boxes[:, best_iou_mask, :]
        assignment[:, :4][best_iou_mask] = encoded_boxes[best_iou_idx, np.arange(assign_num), :4]
        # Assigned priors are no longer background.
        assignment[:, 4][best_iou_mask] = 0
        assignment[:, 5:-8][best_iou_mask] = boxes[best_iou_idx, 4:]
        assignment[:, -8][best_iou_mask] = 1
        return assignment

    def decode_boxes(self, mbox_loc, mbox_priorbox, variances):
        """Apply predicted offsets to the priors.

        Args:
            mbox_loc: (num_priors, 4) predicted offsets.
            mbox_priorbox: (num_priors, 4) prior corners.
            variances: (num_priors, 4) variances.

        Returns:
            (num_priors, 4) array of xmin, ymin, xmax, ymax clipped to
            [0, 1].
        """
        prior_width = mbox_priorbox[:, 2] - mbox_priorbox[:, 0]
        prior_height = mbox_priorbox[:, 3] - mbox_priorbox[:, 1]
        prior_center_x = 0.5 * (mbox_priorbox[:, 2] + mbox_priorbox[:, 0])
        prior_center_y = 0.5 * (mbox_priorbox[:, 3] + mbox_priorbox[:, 1])

        # Predicted centre = prior centre + scaled offset.
        decode_bbox_center_x = mbox_loc[:, 0] * prior_width * variances[:, 0]
        decode_bbox_center_x += prior_center_x
        decode_bbox_center_y = mbox_loc[:, 1] * prior_height * variances[:, 1]
        decode_bbox_center_y += prior_center_y

        # Predicted size = prior size * exp(scaled offset).
        decode_bbox_width = np.exp(mbox_loc[:, 2] * variances[:, 2])
        decode_bbox_width *= prior_width
        decode_bbox_height = np.exp(mbox_loc[:, 3] * variances[:, 3])
        decode_bbox_height *= prior_height

        decode_bbox_xmin = decode_bbox_center_x - 0.5 * decode_bbox_width
        decode_bbox_ymin = decode_bbox_center_y - 0.5 * decode_bbox_height
        decode_bbox_xmax = decode_bbox_center_x + 0.5 * decode_bbox_width
        decode_bbox_ymax = decode_bbox_center_y + 0.5 * decode_bbox_height

        decode_bbox = np.concatenate((decode_bbox_xmin[:, None],
                                      decode_bbox_ymin[:, None],
                                      decode_bbox_xmax[:, None],
                                      decode_bbox_ymax[:, None]), axis=-1)
        # Keep the boxes inside the image.
        decode_bbox = np.minimum(np.maximum(decode_bbox, 0.0), 1.0)
        return decode_bbox

    def detection_out(self, predictions, background_label_id=0, keep_top_k=200,
                      confidence_threshold=0.5):
        """Turn raw network output into final detections.

        Args:
            predictions: array (batch, num_priors, 4 + num_classes + 8) as
                produced by the model.
            background_label_id: class index that is skipped.
            keep_top_k: maximum number of detections kept per image.
            confidence_threshold: minimum class score for a box to enter
                non-maximum suppression.

        Returns:
            List with one entry per image: an array whose rows are
            (label, confidence, xmin, ymin, xmax, ymax) sorted by
            descending confidence, or an empty list if nothing was found.
        """
        mbox_loc = predictions[:, :, :4]
        variances = predictions[:, :, -4:]
        mbox_priorbox = predictions[:, :, -8:-4]
        mbox_conf = predictions[:, :, 4:-8]
        results = []
        # One pass per image in the batch.
        for i in range(len(mbox_loc)):
            results.append([])
            decode_bbox = self.decode_boxes(mbox_loc[i], mbox_priorbox[i], variances[i])
            for c in range(self.num_classes):
                if c == background_label_id:
                    continue
                c_confs = mbox_conf[i, :, c]
                c_confs_m = c_confs > confidence_threshold
                if len(c_confs[c_confs_m]) > 0:
                    # Boxes scoring above the threshold for this class.
                    boxes_to_process = decode_bbox[c_confs_m]
                    confs_to_process = c_confs[c_confs_m]
                    # Non-maximum suppression.
                    feed_dict = {self.boxes: boxes_to_process,
                                 self.scores: confs_to_process}
                    idx = self.sess.run(self.nms, feed_dict=feed_dict)
                    good_boxes = boxes_to_process[idx]
                    confs = confs_to_process[idx][:, None]
                    # Stack label, confidence and box corners.
                    labels = c * np.ones((len(idx), 1))
                    c_pred = np.concatenate((labels, confs, good_boxes),
                                            axis=1)
                    results[-1].extend(c_pred)
            if len(results[-1]) > 0:
                # Sort by confidence and keep the best keep_top_k.
                results[-1] = np.array(results[-1])
                argsort = np.argsort(results[-1][:, 1])[::-1]
                results[-1] = results[-1][argsort]
                results[-1] = results[-1][:keep_top_k]
        return results
class SSD(object):
    """Inference wrapper: loads SSD300 weights and draws detections.

    Keyword arguments passed to the constructor override the entries of
    ``_defaults`` (``model_path``, ``classes_path``, ``model_image_size``,
    ``confidence``).
    """

    _defaults = {
        "model_path": 'model_data/ssd_weights.h5',
        "classes_path": 'model_data/voc_classes.txt',
        "model_image_size" : (300, 300, 3),
        "confidence": 0.5,
    }

    @classmethod
    def get_defaults(cls, n):
        """Return the default for ``n``, or an error string if unknown."""
        if n in cls._defaults:
            return cls._defaults[n]
        else:
            return "Unrecognized attribute name '" + n + "'"

    def __init__(self, **kwargs):
        """Load class names, build the model and create the box decoder."""
        self.__dict__.update(self._defaults)
        # Previously accepted but silently ignored.
        self.__dict__.update(kwargs)
        self.class_names = self._get_class()
        self.sess = K.get_session()
        self.generate()
        self.bbox_util = BBoxUtility(self.num_classes)

    def _get_class(self):
        """Read the class names, one per line, from ``classes_path``."""
        classes_path = os.path.expanduser(self.classes_path)
        with open(classes_path) as f:
            class_names = f.readlines()
        class_names = [c.strip() for c in class_names]
        return class_names

    def generate(self):
        """Build the model, load its weights and pick one colour per class."""
        model_path = os.path.expanduser(self.model_path)
        assert model_path.endswith('.h5'), 'Keras model or weights must be a .h5 file.'

        # +1 for the background class.
        self.num_classes = len(self.class_names) + 1

        # SSD300 is defined in this file; build it and load weights by name.
        self.ssd_model = SSD300(self.model_image_size, self.num_classes)
        self.ssd_model.load_weights(model_path, by_name=True)
        self.ssd_model.summary()
        print('{} model, anchors, and classes loaded.'.format(model_path))

        # Evenly spaced hues, converted to 0-255 RGB tuples.
        hsv_tuples = [(x / len(self.class_names), 1., 1.)
                      for x in range(len(self.class_names))]
        self.colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
        self.colors = list(
            map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)),
                self.colors))

    def detect_image(self, image):
        """Run detection on a PIL image and draw the results onto it.

        Args:
            image: PIL image; it is modified in place.

        Returns:
            The same image with boxes and labels drawn, or unchanged when
            nothing was detected.
        """
        image_shape = np.array(np.shape(image)[0:2])
        crop_img, x_offset, y_offset = letterbox_image(
            image, (self.model_image_size[0], self.model_image_size[1]))
        photo = np.array(crop_img, dtype=np.float64)

        photo = preprocess_input(
            np.reshape(photo, [1, self.model_image_size[0], self.model_image_size[1], 3]))
        preds = self.ssd_model.predict(photo)

        # Decode offsets and run non-maximum suppression.
        results = self.bbox_util.detection_out(preds, confidence_threshold=self.confidence)
        if len(results[0]) <= 0:
            return image

        # Columns: label, confidence, xmin, ymin, xmax, ymax.
        det_label = results[0][:, 0]
        det_conf = results[0][:, 1]
        det_xmin, det_ymin, det_xmax, det_ymax = (
            results[0][:, 2], results[0][:, 3], results[0][:, 4], results[0][:, 5])
        top_indices = [i for i, conf in enumerate(det_conf) if conf >= self.confidence]
        top_conf = det_conf[top_indices]
        top_label_indices = det_label[top_indices].tolist()
        top_xmin = np.expand_dims(det_xmin[top_indices], -1)
        top_ymin = np.expand_dims(det_ymin[top_indices], -1)
        top_xmax = np.expand_dims(det_xmax[top_indices], -1)
        top_ymax = np.expand_dims(det_ymax[top_indices], -1)

        # Undo the letterbox padding.
        boxes = ssd_correct_boxes(
            top_ymin, top_xmin, top_ymax, top_xmax,
            np.array([self.model_image_size[0], self.model_image_size[1]]), image_shape)

        font = ImageFont.truetype(
            font='model_data/simhei.ttf',
            size=np.floor(3e-2 * np.shape(image)[1] + 0.5).astype('int32'))
        thickness = (np.shape(image)[0] + np.shape(image)[1]) // self.model_image_size[0]

        for i, c in enumerate(top_label_indices):
            # Label 0 is background, so class names are offset by one.
            predicted_class = self.class_names[int(c) - 1]
            score = top_conf[i]

            # Grow the box by 5 px on every side, then clamp to the image.
            top, left, bottom, right = boxes[i]
            top = top - 5
            left = left - 5
            bottom = bottom + 5
            right = right + 5
            top = max(0, np.floor(top + 0.5).astype('int32'))
            left = max(0, np.floor(left + 0.5).astype('int32'))
            bottom = min(np.shape(image)[0], np.floor(bottom + 0.5).astype('int32'))
            right = min(np.shape(image)[1], np.floor(right + 0.5).astype('int32'))

            label = '{} {:.2f}'.format(predicted_class, score)
            draw = ImageDraw.Draw(image)
            # NOTE(review): ImageDraw.textsize is reportedly gone in recent
            # Pillow releases - confirm against the pinned Pillow version.
            label_size = draw.textsize(label, font)
            print(label)

            # Put the label above the box when there is room, else inside.
            if top - label_size[1] >= 0:
                text_origin = np.array([left, top - label_size[1]])
            else:
                text_origin = np.array([left, top + 1])

            # Nested 1-px rectangles give the outline its thickness.
            for t in range(thickness):
                draw.rectangle(
                    [left + t, top + t, right - t, bottom - t],
                    outline=self.colors[int(c) - 1])
            draw.rectangle(
                [tuple(text_origin), tuple(text_origin + label_size)],
                fill=self.colors[int(c) - 1])
            draw.text(text_origin, label, fill=(0, 0, 0), font=font)
            del draw
        return image

    def close_session(self):
        """Close the Keras backend session obtained in ``__init__``."""
        self.sess.close()
4.数据生成器
from keras.applications.imagenet_utils import preprocess_input
#训练输入数据生成器
def rand(a=0, b=1):
    """Return a uniform random float drawn from the range [a, b)."""
    return np.random.rand() * (b - a) + a
class Generator(object):
    """Training/validation batch generator with random augmentation.

    Args:
        bbox_util: BBoxUtility used to turn ground-truth boxes into the
            per-prior targets the loss expects.
        batch_size: number of images per yielded batch.
        train_lines: annotation lines used for training.
        val_lines: annotation lines used for validation.
        image_size: (height, width) of the network input.
        num_classes: number of classes including background.
    """

    def __init__(self, bbox_util, batch_size, train_lines, val_lines, image_size, num_classes):
        self.bbox_util = bbox_util
        self.batch_size = batch_size
        self.train_lines = train_lines
        self.val_lines = val_lines
        self.train_batches = len(train_lines)
        self.val_batches = len(val_lines)
        self.image_size = image_size
        # One-hot labels cover the non-background classes only.
        self.num_classes = num_classes - 1

    def get_random_data(self, annotation_line, input_shape, jitter=.1, hue=.1, sat=1.1, val=1.1):
        """Load one annotated image and apply random augmentation.

        Args:
            annotation_line: "path x1,y1,x2,y2,cls x1,y1,x2,y2,cls ...".
            input_shape: (height, width) of the output image.
            jitter: maximum relative aspect-ratio distortion.
            hue, sat, val: ranges of the random HSV distortion.

        Returns:
            (image_data, box_data): the augmented image as a float array
            in 0-255 and an (n, 5) array of boxes in output pixels, or
            ``[]`` in place of the boxes when none survive.
        """
        line = annotation_line.split()
        image = Image.open(line[0])
        iw, ih = image.size
        h, w = input_shape
        box = np.array([np.array(list(map(int, entry.split(',')))) for entry in line[1:]])

        # Random resize with a jittered aspect ratio.
        new_ar = w / h * rand(1 - jitter, 1 + jitter) / rand(1 - jitter, 1 + jitter)
        scale = rand(.25, 2)
        if new_ar < 1:
            nh = int(scale * h)
            nw = int(nh * new_ar)
        else:
            nw = int(scale * w)
            nh = int(nw / new_ar)
        image = image.resize((nw, nh), Image.BICUBIC)

        # Paste at a random position on a grey canvas of the target size.
        dx = int(rand(0, w - nw))
        dy = int(rand(0, h - nh))
        new_image = Image.new('RGB', (w, h), (128, 128, 128))
        new_image.paste(image, (dx, dy))
        image = new_image

        # Random horizontal flip.
        flip = rand() < .5
        if flip:
            image = image.transpose(Image.FLIP_LEFT_RIGHT)

        # Random hue / saturation / value distortion.
        hue = rand(-hue, hue)
        sat = rand(1, sat) if rand() < .5 else 1 / rand(1, sat)
        val = rand(1, val) if rand() < .5 else 1 / rand(1, val)
        # NOTE(review): rgb_to_hsv / hsv_to_rgb are not imported anywhere in
        # this file - presumably matplotlib.colors; confirm and import.
        x = rgb_to_hsv(np.array(image) / 255.)
        x[..., 0] += hue
        x[..., 0][x[..., 0] > 1] -= 1
        x[..., 0][x[..., 0] < 0] += 1
        x[..., 1] *= sat
        x[..., 2] *= val
        x[x > 1] = 1
        x[x < 0] = 0
        image_data = hsv_to_rgb(x) * 255

        # Apply the same resize / shift / flip to the boxes.
        box_data = np.zeros((len(box), 5))
        if len(box) > 0:
            np.random.shuffle(box)
            box[:, [0, 2]] = box[:, [0, 2]] * nw / iw + dx
            box[:, [1, 3]] = box[:, [1, 3]] * nh / ih + dy
            if flip:
                box[:, [0, 2]] = w - box[:, [2, 0]]
            # Clamp to the canvas.
            box[:, 0:2][box[:, 0:2] < 0] = 0
            box[:, 2][box[:, 2] > w] = w
            box[:, 3][box[:, 3] > h] = h
            box_w = box[:, 2] - box[:, 0]
            box_h = box[:, 3] - box[:, 1]
            # Discard boxes that collapsed to (almost) nothing.
            box = box[np.logical_and(box_w > 1, box_h > 1)]
            box_data = np.zeros((len(box), 5))
            box_data[:len(box)] = box
        if len(box) == 0:
            return image_data, []

        # Require at least one non-zero coordinate. Column 4 is the class
        # index, where 0 is a valid class, so it must not be tested here.
        if (box_data[:, :4] > 0).any():
            return image_data, box_data
        else:
            return image_data, []

    def generate(self, train=True):
        """Yield ``(images, targets)`` batches forever.

        Args:
            train: draw from the training lines if True, else from the
                validation lines.

        Yields:
            Tuple of the preprocessed image batch and the matching array
            of per-prior targets from ``bbox_util.assign_boxes``.
        """
        while True:
            if train:
                shuffle(self.train_lines)
                lines = self.train_lines
            else:
                shuffle(self.val_lines)
                lines = self.val_lines
            inputs = []
            targets = []
            for annotation_line in lines:
                img, y = self.get_random_data(annotation_line, self.image_size[0:2])
                if len(y) == 0:
                    continue
                # Normalise the corners to 0..1.
                boxes = np.array(y[:, :4], dtype=np.float32)
                boxes[:, 0] = boxes[:, 0] / self.image_size[1]
                boxes[:, 1] = boxes[:, 1] / self.image_size[0]
                boxes[:, 2] = boxes[:, 2] / self.image_size[1]
                boxes[:, 3] = boxes[:, 3] / self.image_size[0]
                # Column 4 holds the class index; convert it to one-hot.
                one_hot_label = np.eye(self.num_classes)[np.array(y[:, 4], np.int32)]
                # Skip the image if any box has negative height or width.
                if ((boxes[:, 3] - boxes[:, 1]) < 0).any() or ((boxes[:, 2] - boxes[:, 0]) < 0).any():
                    continue
                y = np.concatenate([boxes, one_hot_label], axis=-1)
                # Match the boxes to the priors so every target has the
                # fixed (num_priors, 4 + num_classes + 8) layout.
                y = self.bbox_util.assign_boxes(y)
                inputs.append(img)
                targets.append(y)
                if len(targets) == self.batch_size:
                    tmp_inp = np.array(inputs)
                    tmp_targets = np.array(targets)
                    inputs = []
                    targets = []
                    yield preprocess_input(tmp_inp), tmp_targets

    # Keep the original misspelled name working.
    generatr = generate
5.计算loss
#5.训练网络的loss计算,调整正负样本个数
class MultiboxLoss(object):
    """SSD multibox loss with hard negative mining.

    Args:
        num_classes: number of classes including background.
        alpha: weight of the localisation loss.
        neg_pos_ratio: maximum ratio of negative to positive boxes.
        background_label_id: must be 0.
        negatives_for_hard: number of negatives used when a batch has no
            positive box at all.

    Raises:
        Exception: if ``background_label_id`` is not 0.
    """

    def __init__(self, num_classes, alpha=1.0, neg_pos_ratio=3.0,
                 background_label_id=0, negatives_for_hard=100.0):
        self.num_classes = num_classes
        self.alpha = alpha
        self.neg_pos_ratio = neg_pos_ratio
        if background_label_id != 0:
            raise Exception('only 0 as background label id is supported')
        self.background_label_id = background_label_id
        # Misspelled attribute name from the original code, kept as an alias.
        self.backgroud_label_id = background_label_id
        self.negatives_for_hard = negatives_for_hard

    def _l1_smooth_loss(self, y_true, y_pred):
        """Smooth L1 loss summed over the last axis."""
        abs_loss = tf.abs(y_true - y_pred)
        sq_loss = 0.5 * (y_true - y_pred) ** 2
        # Quadratic where |error| < 1, linear elsewhere.
        l1_loss = tf.where(tf.less(abs_loss, 1.0), sq_loss, abs_loss - 0.5)
        return tf.reduce_sum(l1_loss, -1)

    def _softmax_loss(self, y_true, y_pred):
        """Cross-entropy over the last axis; y_pred holds probabilities."""
        # Guard against log(0).
        y_pred = tf.maximum(y_pred, 1e-7)
        softmax_loss = -tf.reduce_sum(y_true * tf.log(y_pred), axis=-1)
        return softmax_loss

    def compute_loss(self, y_true, y_pred):
        """Return the scalar multibox loss.

        Both tensors have shape (batch, num_boxes, 4 + num_classes + 8).
        In ``y_true`` column -8 is 1 for boxes that carry an object.
        """
        batch_size = tf.shape(y_true)[0]
        num_boxes = tf.to_float(tf.shape(y_true)[1])

        # Per-box losses, shape (batch, num_boxes).
        conf_loss = self._softmax_loss(y_true[:, :, 4:-8], y_pred[:, :, 4:-8])
        loc_loss = self._l1_smooth_loss(y_true[:, :, :4], y_pred[:, :, :4])

        # Per-image number of positives and their losses.
        num_pos = tf.reduce_sum(y_true[:, :, -8], axis=-1)
        pos_loc_loss = tf.reduce_sum(loc_loss * y_true[:, :, -8], axis=1)
        pos_conf_loss = tf.reduce_sum(conf_loss * y_true[:, :, -8], axis=1)

        # Number of negatives to mine per image.
        num_neg = tf.minimum(self.neg_pos_ratio * num_pos, num_boxes - num_pos)
        pos_num_neg_mask = tf.greater(num_neg, 0)
        # If no image asks for negatives, fall back to negatives_for_hard.
        has_min = tf.to_float(tf.reduce_any(pos_num_neg_mask))
        num_neg = tf.concat(axis=0, values=[num_neg, [(1 - has_min) * self.negatives_for_hard]])
        # One k shared by the whole batch: the mean of the non-zero counts.
        num_neg_batch = tf.reduce_mean(tf.boolean_mask(num_neg, tf.greater(num_neg, 0)))
        num_neg_batch = tf.to_int32(num_neg_batch)

        # Columns of the non-background class scores.
        confs_start = 4 + self.background_label_id + 1
        confs_end = confs_start + self.num_classes - 1
        # Highest object score per box; for a negative box this measures
        # how wrongly confident the network is.
        max_confs = tf.reduce_max(y_pred[:, :, confs_start:confs_end],
                                  axis=2)
        # The top-k most confident negatives are the hard negatives.
        _, indices = tf.nn.top_k(max_confs * (1 - y_true[:, :, -8]),
                                 k=num_neg_batch)

        # Convert (image, box) indices into indices of the flattened loss.
        batch_idx = tf.expand_dims(tf.range(0, batch_size), 1)
        batch_idx = tf.tile(batch_idx, (1, num_neg_batch))
        full_indices = (tf.reshape(batch_idx, [-1]) * tf.to_int32(num_boxes) +
                        tf.reshape(indices, [-1]))
        neg_conf_loss = tf.gather(tf.reshape(conf_loss, [-1]),
                                  full_indices)
        neg_conf_loss = tf.reshape(neg_conf_loss,
                                   [batch_size, num_neg_batch])
        neg_conf_loss = tf.reduce_sum(neg_conf_loss, axis=1)

        # The original stopped here without returning anything. Combine the
        # terms as in the standard SSD loss, normalised by the number of
        # positives (an image with none counts as one to avoid dividing by
        # zero).
        num_pos = tf.where(tf.not_equal(num_pos, 0), num_pos,
                           tf.ones_like(num_pos))
        total_loss = tf.reduce_sum(pos_conf_loss) + tf.reduce_sum(neg_conf_loss)
        total_loss /= tf.reduce_sum(num_pos)
        total_loss += tf.reduce_sum(self.alpha * pos_loc_loss) / tf.reduce_sum(num_pos)
        return total_loss
6.train
#train
if __name__ == '__main__':
    # Training script: train with the first 21 layers frozen (epochs 0-30),
    # then unfreeze them and fine-tune (epochs 30-50).
    log_dir = 'logs/'
    annotation_path = '2007_train.txt'
    NUM_CLASSES = 21
    input_shape = (300, 300, 3)
    BATCH_SIZE = 4
    val_split = 0.1

    # Precomputed prior boxes.
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # prior-box file that comes from a trusted source.
    with open('model_data/prior_boxes_ssd300.pkl', 'rb') as f:
        priors = pickle.load(f)
    bbox_util = BBoxUtility(NUM_CLASSES, priors)

    # Fixed seed so the train/validation split is reproducible; the seed is
    # released again afterwards.
    with open(annotation_path) as f:
        lines = f.readlines()
    np.random.seed(1)
    np.random.shuffle(lines)
    np.random.seed(None)
    num_val = int(len(lines) * val_split)
    num_train = len(lines) - num_val

    model = SSD300(input_shape, num_classes=NUM_CLASSES)
    # Load pretrained weights, skipping layers whose shapes do not match.
    model.load_weights('model_data/ssd_weights.h5', by_name=True, skip_mismatch=True)

    logging = TensorBoard(log_dir=log_dir)
    checkpoint = ModelCheckpoint(log_dir + 'ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5',
                                 monitor='val_loss', save_weights_only=True, save_best_only=True, period=1)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1)
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=6, verbose=1)

    gen = Generator(bbox_util, BATCH_SIZE, lines[:num_train], lines[num_train:],
                    (input_shape[0], input_shape[1]), NUM_CLASSES)

    def train_stage(learning_rate, epochs, initial_epoch):
        """Recompile with ``learning_rate`` and train up to ``epochs``."""
        model.compile(optimizer=Adam(lr=learning_rate),
                      loss=MultiboxLoss(NUM_CLASSES, neg_pos_ratio=3.0).compute_loss)
        model.fit_generator(gen.generate(True),
                            steps_per_epoch=num_train // BATCH_SIZE,
                            validation_data=gen.generate(False),
                            validation_steps=num_val // BATCH_SIZE,
                            epochs=epochs,
                            initial_epoch=initial_epoch,
                            callbacks=[logging, checkpoint, reduce_lr, early_stopping])

    # Stages 1 and 2: the first 21 layers stay frozen.
    for i in range(21):
        model.layers[i].trainable = False
    train_stage(1e-4, epochs=15, initial_epoch=0)
    train_stage(1e-5, epochs=30, initial_epoch=15)

    # Stage 3: unfreeze them and fine-tune at a lower rate.
    for i in range(21):
        model.layers[i].trainable = True
    train_stage(1e-6, epochs=50, initial_epoch=30)
参考:https://blog.csdn.net/weixin_44791964/article/details/102496765