FasterRCNN在FastRCNN的基础上,实现端到端的训练。算法分为3个部分。主干网络提取特征、RPN生成建议框、RCNN进行分类和回归。
FasterRCNN优点:
FasterRCNN缺点:
input[800,600,3] --> VGGNet(4次下采样) / ResNet50 --> Output[37,50,512]
(1) VGG16代码流程
VGG16以卷积和池化作为基本结构,输入[600,600,3]进行4次下采样(4个MaxPooling2D,第五个卷积块之后不再池化)得到base_layers[37,37,512]。
'''
base_layers = VGG16(inputs)
x --> Conv2D*2 + MaxPooling2D --> Conv2D*2 + MaxPooling2D --> Conv2D*3 + MaxPooling2D --> Conv2D*3 + MaxPooling2D --> Conv2D*3
'''
def VGG16(inputs):
    """Build the VGG16 convolutional backbone and return its last feature map.

    The network has five stages of 3x3 'same' ReLU convolutions
    (2, 2, 3, 3 and 3 layers).  Stages 1-4 are each followed by a 2x2,
    stride-2 max pool; stage 5 is not pooled, so the spatial size is
    halved four times in total.

    Args:
        inputs: input image tensor.

    Returns:
        The output tensor of ``block5_conv3`` (512 channels).
    """
    # (stage number, filters, conv layers in the stage, pooled afterwards?)
    stage_specs = (
        (1, 64, 2, True),
        (2, 128, 2, True),
        (3, 256, 3, True),
        (4, 512, 3, True),
        (5, 512, 3, False),
    )

    x = inputs
    for stage, filters, conv_count, pooled in stage_specs:
        for conv_idx in range(1, conv_count + 1):
            x = Conv2D(filters, (3, 3), activation='relu', padding='same',
                       name='block{}_conv{}'.format(stage, conv_idx))(x)
        if pooled:
            x = MaxPooling2D((2, 2), strides=(2, 2),
                             name='block{}_pool'.format(stage))(x)
    return x
(2) ResNet50代码流程
ResNet50以卷积和池化作为基本结构,输入[600,600,3]进行4次下采样得到base_layers[38,38,1024]。ResNet50有conv_block 、identity_block两个基础结构。
'''
base_layers = ResNet50(inputs)
conv_block :BottleNeck + ResNet(下采样)
identity_block :BottleNeck + ResNet(无下采样)
ResNet50 :ZCBAM --> conv_block + identity_block *2 --> conv_block + identity_block *3 --> conv_block + identity_block *5
'''
def identity_block(input_tensor, kernel_size, filters, stage, block):
    """Residual bottleneck block whose shortcut is the unmodified input.

    Main path: 1x1 conv -> BN -> ReLU -> ``kernel_size`` conv ('same') ->
    BN -> ReLU -> 1x1 conv -> BN.  The result is added to ``input_tensor``
    and passed through a final ReLU, so ``filters[2]`` must match the
    channel count of ``input_tensor`` for the add to be valid.

    Args:
        input_tensor: tensor entering the block.
        kernel_size: kernel size of the middle convolution.
        filters: three filter counts, one per convolution.
        stage: stage number, used only to build layer names.
        block: block label (e.g. 'b'), used only to build layer names.

    Returns:
        The output tensor of the block.
    """
    n_reduce, n_middle, n_expand = filters
    tag = str(stage) + block + '_branch'
    conv_prefix = 'res' + tag
    bn_prefix = 'bn' + tag

    out = Conv2D(n_reduce, (1, 1),
                 kernel_initializer=random_normal(stddev=0.02),
                 name=conv_prefix + '2a')(input_tensor)
    out = BatchNormalization(name=bn_prefix + '2a')(out)
    out = Activation('relu')(out)

    out = Conv2D(n_middle, kernel_size, padding='same',
                 kernel_initializer=random_normal(stddev=0.02),
                 name=conv_prefix + '2b')(out)
    out = BatchNormalization(name=bn_prefix + '2b')(out)
    out = Activation('relu')(out)

    out = Conv2D(n_expand, (1, 1),
                 kernel_initializer=random_normal(stddev=0.02),
                 name=conv_prefix + '2c')(out)
    out = BatchNormalization(name=bn_prefix + '2c')(out)

    # Residual connection followed by the block's final non-linearity.
    merged = layers.add([out, input_tensor])
    return Activation('relu')(merged)
def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2)):
    """Residual bottleneck block with a projection (1x1 conv) shortcut.

    Main path: strided 1x1 conv -> BN -> ReLU -> ``kernel_size`` conv
    ('same') -> BN -> ReLU -> 1x1 conv -> BN.  The shortcut is a 1x1 conv
    with the same ``strides`` and ``filters[2]`` output channels followed
    by BN, so both branches agree in shape before they are added.

    Args:
        input_tensor: tensor entering the block.
        kernel_size: kernel size of the middle convolution.
        filters: three filter counts, one per main-path convolution.
        stage: stage number, used only to build layer names.
        block: block label (e.g. 'a'), used only to build layer names.
        strides: strides of the first main-path conv and of the shortcut.

    Returns:
        The output tensor of the block.
    """
    n_reduce, n_middle, n_expand = filters
    tag = str(stage) + block + '_branch'
    conv_prefix = 'res' + tag
    bn_prefix = 'bn' + tag

    out = Conv2D(n_reduce, (1, 1), strides=strides,
                 kernel_initializer=random_normal(stddev=0.02),
                 name=conv_prefix + '2a')(input_tensor)
    out = BatchNormalization(name=bn_prefix + '2a')(out)
    out = Activation('relu')(out)

    out = Conv2D(n_middle, kernel_size, padding='same',
                 kernel_initializer=random_normal(stddev=0.02),
                 name=conv_prefix + '2b')(out)
    out = BatchNormalization(name=bn_prefix + '2b')(out)
    out = Activation('relu')(out)

    out = Conv2D(n_expand, (1, 1),
                 kernel_initializer=random_normal(stddev=0.02),
                 name=conv_prefix + '2c')(out)
    out = BatchNormalization(name=bn_prefix + '2c')(out)

    # Projection shortcut: matches the main path's stride and channel count.
    projected = Conv2D(n_expand, (1, 1), strides=strides,
                       kernel_initializer=random_normal(stddev=0.02),
                       name=conv_prefix + '1')(input_tensor)
    projected = BatchNormalization(name=bn_prefix + '1')(projected)

    merged = layers.add([out, projected])
    return Activation('relu')(merged)
def ResNet50(inputs):
    """Build the ResNet50 trunk up to stage 4 and return the shared feature map.

    Stem: zero padding -> 7x7/stride-2 conv -> BN -> ReLU -> 3x3/stride-2
    max pool.  Then stages 2, 3 and 4, each made of one ``conv_block``
    followed by 2, 3 and 5 ``identity_block`` calls respectively.  Stage 2
    keeps the resolution (strides (1, 1)); stages 3 and 4 halve it.

    For a 600x600x3 input the sizes are
    600 -> 300 -> 150 -> 150 -> 75 -> 38, ending at 38x38x1024.

    Args:
        inputs: input image tensor.

    Returns:
        The stage-4 output tensor (1024 channels).
    """
    x = ZeroPadding2D((3, 3))(inputs)
    x = Conv2D(64, (7, 7), strides=(2, 2), name='conv1')(x)
    x = BatchNormalization(name='bn_conv1')(x)
    x = Activation('relu')(x)
    x = MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x)

    # (stage, bottleneck filters, identity block labels, conv_block strides)
    stage_specs = (
        (2, [64, 64, 256], 'bc', (1, 1)),
        (3, [128, 128, 512], 'bcd', (2, 2)),
        (4, [256, 256, 1024], 'bcdef', (2, 2)),
    )
    for stage, filters, identity_labels, strides in stage_specs:
        x = conv_block(x, 3, filters, stage=stage, block='a', strides=strides)
        for label in identity_labels:
            x = identity_block(x, 3, filters, stage=stage, block=label)
    return x
'''
rpn = get_rpn(base_layers, num_anchors)
base_layers[38,38,1024] --> Conv2D((3, 3),512) --> Conv2D((1, 1),num_anchors) + Conv2D((1, 1),num_anchors * 4) --> Reshape --> [x_class, x_regr]
'''
def get_rpn(base_layers, num_anchors):
    """Attach the region proposal head to the shared feature map.

    A 512-channel 3x3 convolution mixes the features; two 1x1 convolutions
    then predict, per spatial position, one sigmoid score and four
    regression values for each of the ``num_anchors`` anchors.  Both
    outputs are flattened to one row per anchor.

    Args:
        base_layers: shared backbone feature map.
        num_anchors: number of anchors per feature-map position.

    Returns:
        ``[x_class, x_regr]`` reshaped to ``(-1, 1)`` and ``(-1, 4)`` per
        sample.
    """
    shared = Conv2D(512, (3, 3), padding='same', activation='relu',
                    kernel_initializer=random_normal(stddev=0.02),
                    name='rpn_conv1')(base_layers)

    scores = Conv2D(num_anchors, (1, 1), activation='sigmoid',
                    kernel_initializer=random_normal(stddev=0.02),
                    name='rpn_out_class')(shared)
    deltas = Conv2D(num_anchors * 4, (1, 1), activation='linear',
                    kernel_initializer=random_normal(stddev=0.02),
                    name='rpn_out_regress')(shared)

    flat_scores = Reshape((-1, 1), name="classification")(scores)
    flat_deltas = Reshape((-1, 4), name="regression")(deltas)
    return [flat_scores, flat_deltas]
'''
classifier = get_vgg_classifier(base_layers, roi_input, 7, num_classes)
base_layers, input_rois --> RoiPoolingConv(roi_size) --> vgg_classifier_layers --> TimeDistributed(Dense) + TimeDistributed(Dense) --> out_class, out_regr
'''
def get_vgg_classifier(base_layers, input_rois, roi_size=7, num_classes=21):
    """Build the per-RoI classification and box-regression heads (VGG variant).

    Each RoI is pooled from ``base_layers`` to ``roi_size`` x ``roi_size``,
    passed through ``vgg_classifier_layers`` and then through two
    time-distributed dense heads.

    Args:
        base_layers: shared backbone feature map.
        input_rois: RoI tensor fed to ``RoiPoolingConv``.
        roi_size: spatial size of the pooled RoI.
        num_classes: number of classes of the softmax head; the regression
            head emits ``4 * (num_classes - 1)`` values per RoI.

    Returns:
        ``[out_class, out_regr]``.
    """
    pooled_rois = RoiPoolingConv(roi_size)([base_layers, input_rois])
    features = vgg_classifier_layers(pooled_rois)

    class_head = Dense(num_classes, activation='softmax',
                       kernel_initializer=random_normal(stddev=0.02))
    out_class = TimeDistributed(
        class_head, name='dense_class_{}'.format(num_classes))(features)

    regress_head = Dense(4 * (num_classes - 1), activation='linear',
                         kernel_initializer=random_normal(stddev=0.02))
    out_regr = TimeDistributed(
        regress_head, name='dense_regress_{}'.format(num_classes))(features)

    return [out_class, out_regr]
RoIPooling : batch_size, 37, 37, 512 -> batch_size, num_rois, 7, 7, 512
'''
# batch_size, 37, 37, 512 -> batch_size, num_rois, 7, 7, 512
# roi_input = Input(shape=(None, 4))
# 用RPN得到的建议框在FeatureMap上截取下来,并Pooling。
'''
out = vgg_classifier_layers(out_roi_pool)
'''
def vgg_classifier_layers(x):
    """Flatten every pooled RoI and run it through two 4096-unit ReLU layers.

    All layers are wrapped in ``TimeDistributed`` so they are applied to
    each RoI independently.

    Args:
        x: pooled RoI tensor with one entry per RoI along axis 1.

    Returns:
        Tensor with a 4096-dimensional feature vector per RoI.
    """
    flattened = TimeDistributed(Flatten(name='flatten'))(x)
    hidden = TimeDistributed(Dense(4096, activation='relu'), name='fc1')(flattened)
    return TimeDistributed(Dense(4096, activation='relu'), name='fc2')(hidden)
'''
r_image = frcnn.detect_image(image)
(1) rpn网络进行预测得到置信度和预测偏移值
'''
preds = self.model_rpn.predict(photo)
1. model_rpn.predict: photo[1,600,600,3] --> model_rpn --> preds[x_class, x_regr, base_layers]
2. model_rpn: input[1,600,600,3] --> ResNet50(inputs) --> base_layers + num_anchors=9 --> get_rpn(base_layers, num_anchors) --> rpn
3. ResNet50: (inputs[1,600,600,3]) --> ZCBAM(None, 150, 150, 64) --> conv_block + identity_block*2 (None, 150, 150, 256) --> conv_block + identity_block*3 (None, 75, 75, 512)--> conv_block + identity_block*5 --> base_layers(None, 38, 38, 1024)
4. get_rpn: base_layers(None, 38, 38, 1024)--> Conv2D(512) (None, 38, 38, 512)--> Conv2D(num_anchors)(None, 38, 38, 9),Conv2D(num_anchors * 4)(None, 38, 38, 36) --> x_class(None, 12996, 1) , x_regr(None, 12996, 4) , base_layers(None, 38, 38, 1024)
'''
(1.1) ResNet50
'''
base_layers = ResNet50(inputs) # [1,600,600,3] --> (None, 38, 38, 1024)
'''
def ResNet50(inputs):
    """Build the ResNet50 trunk up to stage 4 and return the shared feature map.

    For a 600x600x3 input: the stem conv gives 300x300x64, the max pool
    150x150x64, stage 2 150x150x256, stage 3 75x75x512 and stage 4
    38x38x1024.

    Args:
        inputs: input image tensor.

    Returns:
        The stage-4 output tensor (1024 channels).
    """
    # Stem: pad, 7x7 stride-2 conv, BN, ReLU, 3x3 stride-2 max pool.
    x = ZeroPadding2D((3, 3))(inputs)
    x = Conv2D(64, (7, 7), strides=(2, 2), name='conv1')(x)
    x = BatchNormalization(name='bn_conv1')(x)
    x = Activation('relu')(x)
    x = MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x)

    # Each stage is one conv_block ('a') plus the listed identity blocks.
    # Stage 2 keeps the resolution; stages 3 and 4 halve it.
    stage_specs = (
        (2, [64, 64, 256], ('b', 'c'), (1, 1)),
        (3, [128, 128, 512], ('b', 'c', 'd'), (2, 2)),
        (4, [256, 256, 1024], ('b', 'c', 'd', 'e', 'f'), (2, 2)),
    )
    for stage, filters, identity_labels, strides in stage_specs:
        x = conv_block(x, 3, filters, stage=stage, block='a', strides=strides)
        for label in identity_labels:
            x = identity_block(x, 3, filters, stage=stage, block=label)
    return x
(1.2) get_rpn
'''
rpn = get_rpn(base_layers, num_anchors) # (None, 38, 38, 1024) --> x_class(None, 12996, 1) , x_regr(None, 12996, 4)
'''
def get_rpn(base_layers, num_anchors):
    """Attach the region proposal head and also pass the feature map through.

    A 512-channel 3x3 convolution is followed by two 1x1 convolutions that
    predict one sigmoid score and four regression values per anchor.  A
    score close to 1 means the anchor is predicted to contain an object.

    Args:
        base_layers: shared backbone feature map.
        num_anchors: number of anchors per feature-map position.

    Returns:
        ``[x_class, x_regr, base_layers]``; the first two are reshaped to
        ``(-1, 1)`` and ``(-1, 4)`` per sample, e.g. 12996 rows for a
        38x38 map with 9 anchors.
    """
    shared = Conv2D(512, (3, 3), padding='same', activation='relu',
                    kernel_initializer='normal', name='rpn_conv1')(base_layers)

    scores = Conv2D(num_anchors, (1, 1), activation='sigmoid',
                    kernel_initializer='uniform', name='rpn_out_class')(shared)
    deltas = Conv2D(num_anchors * 4, (1, 1), activation='linear',
                    kernel_initializer='zero', name='rpn_out_regress')(shared)

    flat_scores = Reshape((-1, 1), name="classification")(scores)
    flat_deltas = Reshape((-1, 4), name="regression")(deltas)
    return [flat_scores, flat_deltas, base_layers]
(2) 生成先验框
'''
anchors = get_anchors(self.get_img_output_length(width,height),width,height)
self.get_img_output_length(width,height): 根据输入特征层的大小得到输出特征的大小。600 --> [300,150,75,38]
[height:600,width:600]
'''
# 生成先验框
def get_anchors(shape, width, height):
    """Return all anchor boxes for a feature map, normalised to [0, 1].

    Args:
        shape: feature-map size passed on to ``shift``.
        width: input image width in pixels.
        height: input image height in pixels.

    Returns:
        Array with one ``[x_min, y_min, x_max, y_max]`` row per anchor,
        divided by the image size and clipped to the range 0..1.
    """
    base_anchors = generate_anchors()
    # Place a copy of every base anchor at every feature-map position.
    boxes = shift(shape, base_anchors)

    # Columns 0 and 2 are x coordinates, columns 1 and 3 are y coordinates.
    boxes[:, 0::2] = boxes[:, 0::2] / width
    boxes[:, 1::2] = boxes[:, 1::2] / height
    return np.clip(boxes, 0, 1)
(2.1) 生成框边长
'''
anchors = generate_anchors()
'''
def generate_anchors(sizes=None, ratios=None):
    """Generate the base anchors, centred on the origin.

    One anchor is produced for every (ratio, size) pair.  Rows are grouped
    by ratio, and within a group ordered by size.  With the defaults
    (sizes [128, 256, 512], ratios [[1, 1], [1, 2], [2, 1]]) the result is:

        [[ -64.,  -64.,   64.,   64.],
         [-128., -128.,  128.,  128.],
         [-256., -256.,  256.,  256.],
         [ -64., -128.,   64.,  128.],
         [-128., -256.,  128.,  256.],
         [-256., -512.,  256.,  512.],
         [-128.,  -64.,  128.,   64.],
         [-256., -128.,  256.,  128.],
         [-512., -256.,  512.,  256.]]

    Args:
        sizes: base edge lengths; defaults to ``config.anchor_box_scales``.
        ratios: ``[width_factor, height_factor]`` pairs; defaults to
            ``config.anchor_box_ratios``.

    Returns:
        Float array of shape ``(len(sizes) * len(ratios), 4)`` holding
        ``[x_min, y_min, x_max, y_max]`` rows.
    """
    if sizes is None:
        sizes = config.anchor_box_scales
    if ratios is None:
        ratios = config.anchor_box_ratios

    num_sizes = len(sizes)
    num_anchors = num_sizes * len(ratios)

    # Columns 2 and 3 temporarily hold width and height.
    anchors = np.zeros((num_anchors, 4))
    anchors[:, 2:] = np.tile(sizes, (2, len(ratios))).T

    # Scale each group of `num_sizes` rows by its ratio.  The group stride
    # must follow len(sizes); a fixed stride of 3 is only right for 3 sizes.
    for i, ratio in enumerate(ratios):
        rows = slice(num_sizes * i, num_sizes * (i + 1))
        anchors[rows, 2] *= ratio[0]
        anchors[rows, 3] *= ratio[1]

    # Convert (0, 0, w, h) into (-w/2, -h/2, w/2, h/2).
    anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T
    anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T
    return anchors
(2.2) 边长和中心点结合,每个像素点生成框
'''
network_anchors = shift(shape,anchors)
'''
def shift(shape, anchors, stride=config.rpn_stride):
    """Replicate the base anchors at the centre of every feature-map cell.

    Cell centres are ``(index + 0.5) * stride``; for example a 38-wide map
    gives indices 0..37, i.e. 0.5..37.5 times the stride.  ``shape[0]``
    sets the number of centres along x and ``shape[1]`` along y.

    Args:
        shape: feature-map size, indexed as described above.
        anchors: base anchors with ``[x_min, y_min, x_max, y_max]`` rows.
        stride: pixel distance between neighbouring cells.

    Returns:
        Array of shape ``(cells * len(anchors), 4)``, e.g. ``(12996, 4)``
        for a 38x38 map with 9 base anchors.
    """
    float_type = keras.backend.floatx()
    centers_x = (np.arange(0, shape[0], dtype=float_type) + 0.5) * stride
    centers_y = (np.arange(0, shape[1], dtype=float_type) + 0.5) * stride

    grid_x, grid_y = np.meshgrid(centers_x, centers_y)
    flat_x = np.reshape(grid_x, [-1])
    flat_y = np.reshape(grid_y, [-1])

    # One (x, y, x, y) offset row per cell, added to both box corners.
    offsets = np.stack([flat_x, flat_y, flat_x, flat_y], axis=1)

    anchor_count = np.shape(anchors)[0]
    cell_count = np.shape(offsets)[0]

    # Broadcast [1, A, 4] + [K, 1, 4] -> [K, A, 4], then flatten to rows.
    per_cell = (np.reshape(anchors, [1, anchor_count, 4])
                + np.array(np.reshape(offsets, [cell_count, 1, 4]), float_type))
    return np.reshape(per_cell, [cell_count * anchor_count, 4])
(3) 将预测结果进行解码 + nms筛选得到建议框
'''
rpn_results = self.bbox_util.detection_out(preds,anchors,1,confidence_threshold=0.8)
preds : x_class(None, 12996, 1) , x_regr(None, 12996, 4)
anchors(12996, 4)
'''
def detection_out(self, predictions, mbox_priorbox, num_classes, keep_top_k=300,
                  confidence_threshold=0.5):
    """Decode RPN predictions against the prior boxes and apply NMS.

    Args:
        predictions: sequence whose element 0 holds the confidences and
            element 1 the box regression outputs, both batched per image.
        mbox_priorbox: prior (anchor) boxes used for decoding.
        num_classes: number of confidence columns to process.
        keep_top_k: maximum number of boxes kept per image.
        confidence_threshold: boxes scoring at or below this are dropped
            before NMS.

    Returns:
        A list with one entry per image: either an empty list, or an
        array of ``[label, confidence, x_min, y_min, x_max, y_max]`` rows
        sorted by descending confidence and cut to ``keep_top_k`` rows.
    """
    confidences = predictions[0]
    locations = predictions[1]

    results = []
    for image_idx in range(len(locations)):
        detections = []
        # Turn regression outputs into corner coordinates.
        decoded = self.decode_boxes(locations[image_idx], mbox_priorbox)

        for c in range(num_classes):
            class_scores = confidences[image_idx, :, c]
            above = class_scores > confidence_threshold
            if not above.any():
                continue

            candidate_boxes = decoded[above]
            candidate_scores = class_scores[above]

            # Run the pre-built NMS op; it returns the indices to keep.
            kept = self.sess.run(self.nms, feed_dict={
                self.boxes: candidate_boxes,
                self.scores: candidate_scores,
            })

            labels = c * np.ones((len(kept), 1))
            rows = np.concatenate(
                (labels, candidate_scores[kept][:, None], candidate_boxes[kept]),
                axis=1)
            detections.extend(rows)

        if len(detections) > 0:
            detections = np.array(detections)
            # Highest confidence first, then keep the best keep_top_k.
            order = np.argsort(detections[:, 1])[::-1]
            detections = detections[order][:keep_top_k]
        results.append(detections)
    return results
(3.1) 解码得到预测框的左上角和右下角
'''
decode_bbox = self.decode_boxes(mbox_loc[i], mbox_priorbox)
'''
def decode_boxes(self, mbox_loc, mbox_priorbox):
    """Convert regression outputs into corner-format boxes.

    The first two regression values shift the prior's centre by a fraction
    of the prior's width/height; the last two scale the width/height
    exponentially.  Every regression value is divided by 4 before use.

    Args:
        mbox_loc: array of shape (N, 4) with the predicted
            ``[dx, dy, dw, dh]`` per prior.
        mbox_priorbox: array of shape (N, 4) with the priors as
            ``[x_min, y_min, x_max, y_max]``.

    Returns:
        Array of shape (N, 4) with ``[x_min, y_min, x_max, y_max]`` rows,
        clipped to the range 0..1.
    """
    p_xmin, p_ymin, p_xmax, p_ymax = (mbox_priorbox[:, k] for k in range(4))

    # Prior geometry in centre/size form.
    prior_w = p_xmax - p_xmin
    prior_h = p_ymax - p_ymin
    prior_cx = 0.5 * (p_xmax + p_xmin)
    prior_cy = 0.5 * (p_ymax + p_ymin)

    # Predicted centre and size.
    center_x = mbox_loc[:, 0] * prior_w / 4 + prior_cx
    center_y = mbox_loc[:, 1] * prior_h / 4 + prior_cy
    box_w = np.exp(mbox_loc[:, 2] / 4) * prior_w
    box_h = np.exp(mbox_loc[:, 3] / 4) * prior_h

    # Back to corner form, one column per coordinate.
    corners = np.stack((center_x - 0.5 * box_w,
                        center_y - 0.5 * box_h,
                        center_x + 0.5 * box_w,
                        center_y + 0.5 * box_h), axis=-1)
    return np.clip(corners, 0.0, 1.0)
(3.2) NMS
上一步解码后得到预测框,然后遍历预测框的每一个类别,选出置信度大于阈值的预测框,再根据上一步的结果进行NMS删除掉重复的框。
'''
# NMS graph op: returns the indices of the boxes to keep, at most
# self._top_k of them, using self._nms_thresh as the IoU threshold.
# It is evaluated in detection_out by feeding self.boxes / self.scores.
self.nms = tf.image.non_max_suppression(self.boxes, self.scores,
self._top_k,
iou_threshold=self._nms_thresh)
'''
(4) P_cls种类,置信度, P_regr
'''
[P_cls, P_regr] = self.model_classifier.predict([base_layer,ROIs])
'''
def get_classifier(base_layers, input_rois, num_rois, nb_classes=21, trainable=False):
    """Build the per-RoI classification and box-regression heads (ResNet variant).

    Each of the ``num_rois`` proposals is pooled from ``base_layers`` to
    14x14, passed through ``classifier_layers``, flattened, and fed to two
    time-distributed dense heads.

    Args:
        base_layers: shared backbone feature map.
        input_rois: proposal boxes fed to ``RoiPoolingConv``.
        num_rois: number of RoIs processed per forward pass.
        nb_classes: size of the softmax head; the regression head emits
            ``4 * (nb_classes - 1)`` values per RoI.
        trainable: accepted but not used by this function.

    Returns:
        ``[out_class, out_regr]``.
    """
    pooling_regions = 14
    # Shape of the pooled RoIs handed to classifier_layers.
    input_shape = (num_rois, 14, 14, 1024)

    pooled_rois = RoiPoolingConv(pooling_regions, num_rois)([base_layers, input_rois])

    # NOTE(review): trainable=True is hard-coded here; the `trainable`
    # argument of this function is never forwarded - confirm this is intended.
    features = classifier_layers(pooled_rois, input_shape=input_shape, trainable=True)
    features = TimeDistributed(Flatten())(features)

    class_head = Dense(nb_classes, activation='softmax', kernel_initializer='zero')
    out_class = TimeDistributed(
        class_head, name='dense_class_{}'.format(nb_classes))(features)

    regress_head = Dense(4 * (nb_classes - 1), activation='linear',
                         kernel_initializer='zero')
    out_regr = TimeDistributed(
        regress_head, name='dense_regress_{}'.format(nb_classes))(features)

    return [out_class, out_regr]
(4.1) 抠图
'''
out_roi_pool = RoiPoolingConv(pooling_regions, num_rois)([base_layers, input_rois])
'''
def call(self, x, mask=None):
    """Crop every RoI from the feature map and resize it to a fixed size.

    Args:
        x: two-element list ``[feature_map, rois]``.  Only the RoIs of the
            first batch entry are read; each RoI is ``(x, y, w, h)`` with
            ``(x, y)`` the top-left corner, cast to int32 before slicing.
        mask: unused.

    Returns:
        Tensor of shape ``(1, num_rois, pool_size, pool_size, nb_channels)``,
        e.g. ``[1, 32, 14, 14, 1024]``.
    """
    assert(len(x) == 2)
    feature_map, rois = x[0], x[1]

    pooled = []
    for roi_idx in range(self.num_rois):
        left, top, roi_w, roi_h = (
            K.cast(rois[0, roi_idx, k], 'int32') for k in range(4))

        # Cut the RoI out of the feature map and resize it.
        crop = feature_map[:, top:top + roi_h, left:left + roi_w, :]
        pooled.append(
            tf.image.resize_images(crop, (self.pool_size, self.pool_size)))

    stacked = K.concatenate(pooled, axis=0)
    stacked = K.reshape(
        stacked,
        (1, self.num_rois, self.pool_size, self.pool_size, self.nb_channels))
    # The axis order is already the desired one; this permutation keeps it.
    return K.permute_dimensions(stacked, (0, 1, 2, 3, 4))
(4.3) 对抠图卷积提取特征
'''
out = classifier_layers(out_roi_pool, input_shape=input_shape, trainable=True)
[1,32,14,14,1024] --> (None, 32, 1, 1, 2048)
'''
def classifier_layers(x, input_shape=(32, 14, 14, 1024), trainable=False):
    """Run the time-distributed ResNet stage 5 on the pooled RoIs.

    One strided ``conv_block_td`` followed by two ``identity_block_td``
    calls (filters 512/512/2048), then a 7x7 average pool applied to each
    RoI separately.

    Args:
        x: pooled RoI tensor.
        input_shape: shape forwarded to ``conv_block_td``.
        trainable: forwarded to every stage-5 block.

    Returns:
        Per-RoI feature tensor; presumably ``(None, 32, 1, 1, 2048)`` for
        the default ``input_shape`` - TODO confirm against the *_td blocks.
    """
    stage5_filters = [512, 512, 2048]
    out = conv_block_td(x, 3, stage5_filters, stage=5, block='a',
                        input_shape=input_shape, strides=(2, 2),
                        trainable=trainable)
    for label in ('b', 'c'):
        out = identity_block_td(out, 3, stage5_filters, stage=5, block=label,
                                trainable=trainable)
    # TimeDistributed applies the pooling to each RoI independently.
    return TimeDistributed(AveragePooling2D((7, 7)), name='avg_pool')(out)
(4.4) 分类、回归
'''
(None, 32, 1, 1, 2048) --> (None, 32, 2048) --> (None, 32, 21) + (None, 32, 80)
'''
# Flatten each RoI's features separately; presumably (None, 32, 2048) when
# classifier_layers yields (None, 32, 1, 1, 2048) - TODO confirm.
out = TimeDistributed(Flatten())(out)
# Softmax over nb_classes per RoI, e.g. (None, 32, 21) for nb_classes=21.
out_class = TimeDistributed(Dense(nb_classes, activation='softmax', kernel_initializer='zero'), name='dense_class_{}'.format(nb_classes))(out)
# Four regression values per non-background class and RoI, e.g. (None, 32, 80).
out_regr = TimeDistributed(Dense(4 * (nb_classes-1), activation='linear', kernel_initializer='zero'), name='dense_regress_{}'.format(nb_classes))(out)
'''
gen = Generator(bbox_util, lines, NUM_CLASSES, solid=True)
rpn_train = gen.generate()
'''
def generate(self):
    """Yield RPN training samples forever, one augmented image at a time.

    For every annotation line the image is augmented, its boxes are
    normalised to 0..1, anchors are generated and matched against the
    boxes, and the anchor labels are subsampled to at most 256 active
    anchors, of which at most half are positive.  Anchors taken out of the
    sample are labelled -1 (ignored).

    Yields:
        ``(image_batch, [classification, regression], boxes)`` where
        ``classification`` has shape (1, num_anchors, 1), ``regression``
        has shape (1, num_anchors, 5) with the anchor label in the last
        column, and ``boxes`` is the normalised ground truth with a
        leading batch axis.
    """
    while True:
        shuffle(self.train_lines)
        lines = self.train_lines
        for annotation_line in lines:
            # Data augmentation.
            img, y = self.get_random_data(annotation_line)
            height, width, _ = np.shape(img)

            if len(y) == 0:
                continue

            # Normalise the box corners to 0..1.
            boxes = np.array(y[:, :4], dtype=np.float32)
            boxes[:, 0] = boxes[:, 0] / width
            boxes[:, 1] = boxes[:, 1] / height
            boxes[:, 2] = boxes[:, 2] / width
            boxes[:, 3] = boxes[:, 3] / height

            box_heights = boxes[:, 3] - boxes[:, 1]
            box_widths = boxes[:, 2] - boxes[:, 0]
            if (box_heights <= 0).any() or (box_widths <= 0).any():
                continue

            y[:, :4] = boxes[:, :4]

            anchors = get_anchors(get_img_output_length(width, height), width, height)

            # Match ground-truth boxes to anchors and encode the targets.
            assignment = self.bbox_util.assign_boxes(y, anchors)

            num_regions = 256
            max_pos = num_regions // 2

            # Both names are views of `assignment`, so a label written
            # through `classification` also shows up in regression[:, -1].
            classification = assignment[:, 4]
            regression = assignment[:, :]

            # `arr[mask][idx] = v` assigns into a temporary copy and leaves
            # `arr` untouched, so integer row indices are used instead.
            pos_rows = np.flatnonzero(classification > 0)
            if len(pos_rows) > max_pos:
                drop = random.sample(range(len(pos_rows)), len(pos_rows) - max_pos)
                classification[pos_rows[drop]] = -1
                regression[pos_rows[drop], -1] = -1
            num_pos = min(len(pos_rows), max_pos)

            # Fill the rest of the sample with negatives so positives plus
            # negatives do not exceed num_regions.
            neg_rows = np.flatnonzero(classification == 0)
            if len(neg_rows) + num_pos > num_regions:
                drop = random.sample(range(len(neg_rows)),
                                     len(neg_rows) + num_pos - num_regions)
                classification[neg_rows[drop]] = -1

            classification = np.reshape(classification, [-1, 1])
            regression = np.reshape(regression, [-1, 5])

            tmp_inp = np.array(img)
            tmp_targets = [
                np.expand_dims(np.array(classification, dtype=np.float32), 0),
                np.expand_dims(np.array(regression, dtype=np.float32), 0),
            ]

            yield preprocess_input(np.expand_dims(tmp_inp, 0)), tmp_targets, np.expand_dims(y, 0)
(1) 数据增强
'''
img,y=self.get_random_data(annotation_line)
'''
def get_random_data(self, annotation_line, random=True, jitter=.1, hue=.1, sat=1.1, val=1.1, proc_img=True):
    '''Randomly augment one annotated image on the fly.

    annotation_line is split on whitespace: the first token is the image
    path, every following token is a comma-separated list of integers
    describing one box.  The first four values are treated as
    x_min, y_min, x_max, y_max; the fifth is presumably the class id
    (TODO confirm against the annotation writer).

    The image is randomly rescaled with aspect-ratio jitter, pasted at a
    random offset on a grey canvas of the target size, optionally flipped
    horizontally, and distorted in HSV space.  The boxes get the same
    geometric transform.

    Returns (image_data, box_data), or (image_data, []) when no usable box
    is left.  image_data is an RGB array scaled to 0..255.

    NOTE(review): the `random` and `proc_img` parameters are not read in
    this body.
    '''
    line = annotation_line.split()
    image = Image.open(line[0])
    iw, ih = image.size
    # Target canvas size: fixed when self.solid is set, else derived from the image.
    if self.solid:
        w,h = self.solid_shape
    else:
        w, h = get_new_img_size(iw, ih)
    box = np.array([np.array(list(map(int,box.split(',')))) for box in line[1:]])
    # resize image: jitter the aspect ratio and pick a random scale
    new_ar = w/h * rand(1-jitter,1+jitter)/rand(1-jitter,1+jitter)
    scale = rand(.25, 2)
    if new_ar < 1:
        nh = int(scale*h)
        nw = int(nh*new_ar)
    else:
        nw = int(scale*w)
        nh = int(nw/new_ar)
    image = image.resize((nw,nh), Image.BICUBIC)
    # place image at a random offset on a grey canvas; the offset is
    # negative when the resized image is larger than the canvas
    dx = int(rand(0, w-nw))
    dy = int(rand(0, h-nh))
    new_image = Image.new('RGB', (w,h), (128,128,128))
    new_image.paste(image, (dx, dy))
    image = new_image
    # flip image or not
    flip = rand()<.5
    if flip: image = image.transpose(Image.FLIP_LEFT_RIGHT)
    # distort image in HSV space
    hue = rand(-hue, hue)
    sat = rand(1, sat) if rand()<.5 else 1/rand(1, sat)
    val = rand(1, val) if rand()<.5 else 1/rand(1, val)
    x = rgb_to_hsv(np.array(image)/255.)
    x[..., 0] += hue
    # wrap the hue channel back into 0..1
    x[..., 0][x[..., 0]>1] -= 1
    x[..., 0][x[..., 0]<0] += 1
    x[..., 1] *= sat
    x[..., 2] *= val
    x[x>1] = 1
    x[x<0] = 0
    image_data = hsv_to_rgb(x)*255 # back to RGB, rescaled to 0..255
    # correct boxes: apply the same resize, offset and flip as the image
    box_data = np.zeros((len(box),5))
    if len(box)>0:
        np.random.shuffle(box)
        box[:, [0,2]] = box[:, [0,2]]*nw/iw + dx
        box[:, [1,3]] = box[:, [1,3]]*nh/ih + dy
        # flip: mirror the x coordinates and swap them so x_min stays first
        if flip: box[:, [0,2]] = w - box[:, [2,0]]
        # clamp the boxes to the canvas
        box[:, 0:2][box[:, 0:2]<0] = 0
        box[:, 2][box[:, 2]>w] = w
        box[:, 3][box[:, 3]>h] = h
        box_w = box[:, 2] - box[:, 0]
        box_h = box[:, 3] - box[:, 1]
        box = box[np.logical_and(box_w>1, box_h>1)] # discard invalid box
        box_data = np.zeros((len(box),5))
        box_data[:len(box)] = box
    if len(box) == 0:
        return image_data, []
    if (box_data[:,:4]>0).any():
        return image_data, box_data
    else:
        return image_data, []
(2) 获取先验框
'''
anchors = get_anchors(get_img_output_length(width,height),width,height)
'''
def get_anchors(shape, width, height):
    """Build the normalised anchor boxes for one image.

    Args:
        shape: feature-map size passed on to ``shift``.
        width: input image width in pixels.
        height: input image height in pixels.

    Returns:
        Array of ``[x_min, y_min, x_max, y_max]`` rows scaled by the image
        size and clipped to the range 0..1.
    """
    # Base anchors, replicated at every feature-map position.
    tiled = shift(shape, generate_anchors())

    # Scale x columns (0, 2) by the width and y columns (1, 3) by the height.
    for column, extent in enumerate((width, height, width, height)):
        tiled[:, column] = tiled[:, column] / extent

    return np.clip(tiled, 0, 1)
(3) 真实框与先验框配对,生成标签
'''
assignment = self.bbox_util.assign_boxes(y,anchors)
'''
def assign_boxes(self, boxes, anchors):
    """Match ground-truth boxes to anchors and build the RPN targets.

    Stores the anchors on the instance (self.priors / self.num_priors) and
    returns an array of shape (num_priors, 5): columns 0-3 hold the encoded
    regression target, column 4 the anchor label (1 = positive,
    0 = negative, -1 = ignored).

    NOTE(review): the exact outputs of self.ignore_box and self.encode_box
    are not visible here; the comments below assume they return, per
    prior, an IoU-like value that is 0 where the prior does not qualify.
    """
    self.num_priors = len(anchors)
    self.priors = anchors
    assignment = np.zeros((self.num_priors, 4 + 1))
    # Every anchor starts out as a negative (label 0).
    assignment[:, 4] = 0.0
    if len(boxes) == 0:
        return assignment
    # 1. Find the anchors to ignore.
    # ignore_box is evaluated once per ground-truth box.
    ingored_boxes = np.apply_along_axis(self.ignore_box, 1, boxes[:, :4])
    # Reshape to (n_boxes, num_priors, 1).
    ingored_boxes = ingored_boxes.reshape(-1, self.num_priors, 1)
    # (num_priors): largest value over all ground-truth boxes.
    ignore_iou = ingored_boxes[:, :, 0].max(axis=0)
    # (num_priors): anchors with a non-zero value are labelled -1.
    ignore_iou_mask = ignore_iou > 0
    assignment[:, 4][ignore_iou_mask] = -1
    # 2. Find the positives; each anchor is responsible for one box only.
    # encode_box is evaluated once per ground-truth box.
    encoded_boxes = np.apply_along_axis(self.encode_box, 1, boxes[:, :4])
    # Reshape to (n_boxes, num_priors, 5): four encoded values plus, in the
    # last column, the overlap score used for matching.
    encoded_boxes = encoded_boxes.reshape(-1, self.num_priors, 5)
    # (num_priors): best score per anchor over all ground-truth boxes.
    best_iou = encoded_boxes[:, :, -1].max(axis=0)
    # (num_priors): index of the ground-truth box achieving that score.
    best_iou_idx = encoded_boxes[:, :, -1].argmax(axis=0)
    # (num_priors): anchors matched to at least one box.
    best_iou_mask = best_iou > 0
    # For each matched anchor, the ground-truth box it belongs to.
    best_iou_idx = best_iou_idx[best_iou_mask]
    assign_num = len(best_iou_idx)
    # Keep only the matched anchors, then copy the encoding of each
    # anchor's best box into the regression columns.
    encoded_boxes = encoded_boxes[:, best_iou_mask, :]
    assignment[:, :4][best_iou_mask] = encoded_boxes[best_iou_idx,np.arange(assign_num),:4]
    # Column 4 is the anchor label; matched anchors become positives (1).
    # This overrides an earlier -1 for the same anchor.
    assignment[:, 4][best_iou_mask] = 1
    # assignment now holds the target the network should predict for this image.
    return assignment
(4) 确定正负样本
# Subsample the anchor labels to at most num_regions active anchors, with
# at most half of them positive.  Anchors removed from the sample get -1.
num_regions = 256
max_pos = num_regions // 2

# Both names are views of `assignment`, so a label written through
# `classification` is also visible in regression[:, -1].
classification = assignment[:, 4]
regression = assignment[:, :]

# `arr[mask][idx] = v` assigns into a temporary copy and leaves `arr`
# untouched, so integer row indices are used for the writes instead.
pos_rows = np.flatnonzero(classification > 0)
if len(pos_rows) > max_pos:
    drop = random.sample(range(len(pos_rows)), len(pos_rows) - max_pos)
    classification[pos_rows[drop]] = -1
    regression[pos_rows[drop], -1] = -1
num_pos = min(len(pos_rows), max_pos)

# Fill the rest of the sample with negatives so positives plus negatives
# do not exceed num_regions.
neg_rows = np.flatnonzero(classification == 0)
if len(neg_rows) + num_pos > num_regions:
    drop = random.sample(range(len(neg_rows)),
                         len(neg_rows) + num_pos - num_regions)
    classification[neg_rows[drop]] = -1

classification = np.reshape(classification, [-1, 1])
regression = np.reshape(regression, [-1, 5])

tmp_inp = np.array(img)
tmp_targets = [
    np.expand_dims(np.array(classification, dtype=np.float32), 0),
    np.expand_dims(np.array(regression, dtype=np.float32), 0),
]
(1) smooth_l1损失函数
'''
smooth_l1()
'''
def smooth_l1(sigma=1.0):
    """Create a smooth-L1 regression loss that only counts positive anchors.

    Args:
        sigma: controls where the loss switches from quadratic to linear.

    Returns:
        A loss function ``f(y_true, y_pred)`` where ``y_true`` has shape
        [batch_size, num_anchor, 4 + 1] (targets plus anchor state in the
        last column) and ``y_pred`` has shape [batch_size, num_anchor, 4].
    """
    sigma_squared = sigma ** 2

    def _smooth_l1(y_true, y_pred):
        backend = keras.backend

        targets = y_true[:, :, :-1]
        anchor_state = y_true[:, :, -1]

        # Only anchors labelled 1 (positive) contribute.
        positive = tf.where(backend.equal(anchor_state, 1))
        predicted = tf.gather_nd(y_pred, positive)
        expected = tf.gather_nd(targets, positive)

        # With d = |prediction - target|:
        #   0.5 * sigma^2 * d^2    if d < 1 / sigma^2
        #   d - 0.5 / sigma^2      otherwise
        abs_diff = backend.abs(predicted - expected)
        per_element = tf.where(
            backend.less(abs_diff, 1.0 / sigma_squared),
            0.5 * sigma_squared * backend.pow(abs_diff, 2),
            abs_diff - 0.5 / sigma_squared
        )

        # Average over the positives, guarding against division by zero.
        count = backend.maximum(1, backend.shape(positive)[0])
        count = backend.cast(count, dtype=backend.floatx())
        return backend.sum(per_element) / count

    return _smooth_l1
(2) cls_loss()损失函数
'''
def cls_loss(ratio=3):
    """Create the RPN classification loss.

    Binary cross-entropy is computed separately for positive anchors
    (state 1) and background anchors (state 0); anchors with state -1 are
    skipped.  Each part is summed and divided by its own anchor count
    (at least 1), and the background part is weighted by ``ratio``.

    Args:
        ratio: weight applied to the background term.

    Returns:
        A loss function ``f(y_true, y_pred)`` that reads the anchor state
        from the last column of ``y_true``.
    """
    def _cls_loss(y_true, y_pred):
        backend = keras.backend
        # -1: ignored, 0: background, 1: contains an object.
        anchor_state = y_true[:, :, -1]

        def _summed_bce(state):
            # Cross-entropy sum and anchor count for one anchor state.
            selected = tf.where(backend.equal(anchor_state, state))
            labels = tf.gather_nd(y_true, selected)
            predictions = tf.gather_nd(y_pred, selected)
            total = backend.sum(backend.binary_crossentropy(labels, predictions))
            count = backend.cast(backend.shape(selected)[0], backend.floatx())
            count = backend.maximum(backend.cast_to_floatx(1.0), count)
            return total, count

        object_total, object_count = _summed_bce(1)
        back_total, back_count = _summed_bce(0)

        object_loss = object_total / object_count
        back_loss = ratio * back_total / back_count
        return object_loss + back_loss

    return _cls_loss
'''