Mask-RCNN: Backbone + RPN block + (Reg,Cls)block + Mask block
(1) Backbone 用Resnet101提取下采样2次、3次、4次、5次的特征层,构造特征金字塔。
(2) RPN block根据第一步提取的特征金字塔和设置的先验框筛选出有目标的区域,并将有目标的区域resize到7x7x256大小。
(3) (Reg,Cls)block对第二步的结果进行分类和回归,得到预测框和类别。
(4) Mask block根据第三步的预测框对第一步的特征层截取并resize到14x14x256大小,对每个像素点分类。
a. Resnet101结构
# 1 ConvBlock
input->CBR+CBR+CB->o1
input->CB->o2
↓
o1+o2
↓
ReLU
# 2 IdentityBlock
input->CBR+CBR+CB->o1
↓
o1+input
↓
ReLU
# 3 Resnet101
input(1024,1024,3)
Conv
ZeroPad->Conv(s=2,512,512,64)->BN->ReLU->MaxPool(s=2,256,256,64)
↓
ConvBlock(256,256,256)
IdentityBlock*2 -> C2(h/4,w/4,256)
↓
ConvBlock(s=2,128,128,512)
IdentityBlock*3 -> C3(h/8,w/8,512)
↓
ConvBlock(s=2,64,64,1024)
IdentityBlock*22 -> C4(h/16,w/16,1024)
↓
ConvBlock(s=2,32,32,2048)
IdentityBlock*2 -> C5(h/32,w/32,2048)
b. 代码
from keras.layers import ZeroPadding2D,Conv2D,MaxPooling2D,BatchNormalization,Activation,Add
def identity_block(input_tensor, kernel_size, filters, stage, block,
                   use_bias=True, train_bn=True):
    """Residual bottleneck block whose shortcut is the unmodified input.

    Main path: 1x1 conv -> BN -> ReLU -> kxk conv ('same' padding) -> BN ->
    ReLU -> 1x1 conv -> BN. The main-path output is added to
    ``input_tensor`` and the sum goes through a final ReLU.

    Args:
        input_tensor: tensor fed to both the main path and the shortcut.
        kernel_size: side length of the square kernel of the middle conv.
        filters: sequence of three ints, the filter counts of the three convs.
        stage: stage label, used only to build layer names.
        block: block label (e.g. 'b', 'c'), used only to build layer names.
        use_bias: forwarded to every Conv2D.
        train_bn: forwarded as ``training`` to every BatchNormalization call.

    Returns:
        Output tensor of the final ReLU (layer 'res<stage><block>_out').
    """
    reduce_filters, mid_filters, expand_filters = filters
    tag = str(stage) + block
    conv_prefix = 'res' + tag + '_branch'
    bn_prefix = 'bn' + tag + '_branch'

    # 1x1 conv
    main = Conv2D(reduce_filters, (1, 1), name=conv_prefix + '2a',
                  use_bias=use_bias)(input_tensor)
    main = BatchNormalization(name=bn_prefix + '2a')(main, training=train_bn)
    main = Activation('relu')(main)

    # kxk conv, spatial size preserved by 'same' padding
    main = Conv2D(mid_filters, (kernel_size, kernel_size), padding='same',
                  name=conv_prefix + '2b', use_bias=use_bias)(main)
    main = BatchNormalization(name=bn_prefix + '2b')(main, training=train_bn)
    main = Activation('relu')(main)

    # 1x1 conv; no ReLU before the residual addition
    main = Conv2D(expand_filters, (1, 1), name=conv_prefix + '2c',
                  use_bias=use_bias)(main)
    main = BatchNormalization(name=bn_prefix + '2c')(main, training=train_bn)

    # Identity shortcut: the input is added as-is
    merged = Add()([main, input_tensor])
    return Activation('relu', name='res' + tag + '_out')(merged)
def conv_block(input_tensor, kernel_size, filters, stage, block,
               strides=(2, 2), use_bias=True, train_bn=True):
    """Residual bottleneck block with a projection (1x1 conv + BN) shortcut.

    Main path: strided 1x1 conv -> BN -> ReLU -> kxk conv ('same' padding)
    -> BN -> ReLU -> 1x1 conv -> BN. The shortcut applies a 1x1 conv with
    the same ``strides`` followed by BN. Both paths are added and the sum
    goes through a final ReLU.

    Args:
        input_tensor: tensor fed to both the main path and the shortcut.
        kernel_size: side length of the square kernel of the middle conv.
        filters: sequence of three ints, the filter counts of the three convs;
            the last one is also the shortcut's filter count.
        stage: stage label, used only to build layer names.
        block: block label (e.g. 'a'), used only to build layer names.
        strides: strides of the first main-path conv and of the shortcut conv.
        use_bias: forwarded to every Conv2D.
        train_bn: forwarded as ``training`` to every BatchNormalization call.

    Returns:
        Output tensor of the final ReLU (layer 'res<stage><block>_out').
    """
    reduce_filters, mid_filters, expand_filters = filters
    tag = str(stage) + block
    conv_prefix = 'res' + tag + '_branch'
    bn_prefix = 'bn' + tag + '_branch'

    # Main path: the first 1x1 conv carries the stride
    main = Conv2D(reduce_filters, (1, 1), strides=strides,
                  name=conv_prefix + '2a', use_bias=use_bias)(input_tensor)
    main = BatchNormalization(name=bn_prefix + '2a')(main, training=train_bn)
    main = Activation('relu')(main)

    main = Conv2D(mid_filters, (kernel_size, kernel_size), padding='same',
                  name=conv_prefix + '2b', use_bias=use_bias)(main)
    main = BatchNormalization(name=bn_prefix + '2b')(main, training=train_bn)
    main = Activation('relu')(main)

    main = Conv2D(expand_filters, (1, 1), name=conv_prefix + '2c',
                  use_bias=use_bias)(main)
    main = BatchNormalization(name=bn_prefix + '2c')(main, training=train_bn)

    # Projection shortcut: same stride and output filters as the main path
    shortcut = Conv2D(expand_filters, (1, 1), strides=strides,
                      name=conv_prefix + '1', use_bias=use_bias)(input_tensor)
    shortcut = BatchNormalization(name=bn_prefix + '1')(shortcut, training=train_bn)

    merged = Add()([main, shortcut])
    return Activation('relu', name='res' + tag + '_out')(merged)
def get_resnet(input_image, stage5=False, train_bn=True):
    """Build the ResNet backbone and return the feature map of each stage.

    Stage 4 uses one conv_block followed by 22 identity blocks.

    Args:
        input_image: input image tensor.
        stage5: when True, stage 5 is built and returned as C5.
        train_bn: forwarded as ``training`` to every BatchNormalization call.

    Returns:
        [C1, C2, C3, C4, C5]. C5 is None when ``stage5`` is False.
    """
    # Stage 1: zero-pad, 7x7 stride-2 conv, BN, ReLU, stride-2 max-pool
    stem = ZeroPadding2D((3, 3))(input_image)
    stem = Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=True)(stem)
    stem = BatchNormalization(name='bn_conv1')(stem, training=train_bn)
    stem = Activation('relu')(stem)
    # Height/4, Width/4, 64
    C1 = MaxPooling2D((3, 3), strides=(2, 2), padding="same")(stem)

    # Stage 2: stride 1, so the resolution stays at Height/4, Width/4 (256 channels)
    x = conv_block(C1, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), train_bn=train_bn)
    for label in ('b', 'c'):
        x = identity_block(x, 3, [64, 64, 256], stage=2, block=label, train_bn=train_bn)
    C2 = x

    # Stage 3: Height/8, Width/8, 512
    x = conv_block(x, 3, [128, 128, 512], stage=3, block='a', train_bn=train_bn)
    for label in ('b', 'c', 'd'):
        x = identity_block(x, 3, [128, 128, 512], stage=3, block=label, train_bn=train_bn)
    C3 = x

    # Stage 4: Height/16, Width/16, 1024
    x = conv_block(x, 3, [256, 256, 1024], stage=4, block='a', train_bn=train_bn)
    identity_blocks_in_stage4 = 22
    for offset in range(identity_blocks_in_stage4):
        # Block labels run 'b', 'c', ... (chr(98) == 'b')
        label = chr(ord('b') + offset)
        x = identity_block(x, 3, [256, 256, 1024], stage=4, block=label, train_bn=train_bn)
    C4 = x

    # Stage 5 (optional): Height/32, Width/32, 2048
    C5 = None
    if stage5:
        x = conv_block(x, 3, [512, 512, 2048], stage=5, block='a', train_bn=train_bn)
        for label in ('b', 'c'):
            x = identity_block(x, 3, [512, 512, 2048], stage=5, block=label, train_bn=train_bn)
        C5 = x

    return [C1, C2, C3, C4, C5]
a1. 结构
C2->Conv(256,256,256)-> Add(256,256,256)
| ↑
| upsample(256,256,256)
| ↑
C3->Conv(128,128,256)--->Add(128,128,256)
| ↑
| ↑
| upsample(128,128,256)
| ↑
C4->Conv(64,64,256)--->Add(64,64,256)
| _______↑
| ↑
| upsample(64,64,256)
| ↑
C5->Conv(32,32,256)->Conv(32,32,256)P5->MaxPool(s=2,16,16,256)P6
b1. 代码
# Build the FPN feature pyramid on top of the ResNet backbone.
# Fetch the backbone feature maps at the different downsampling levels (C1 is discarded).
_, C2, C3, C4, C5 = get_resnet(input_image, stage5=True, train_bn=config.TRAIN_BN)
# Top-down pathway: each level is the 2x-upsampled coarser level plus a 1x1 lateral conv.
# P5: height and width halved 5 times in total.
# Height/32, Width/32, config.TOP_DOWN_PYRAMID_SIZE channels (256 per the notes above)
P5 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c5p5')(C5)
# P4: height and width halved 4 times in total.
# Height/16, Width/16, same channel count
P4 = Add(name="fpn_p4add")([
UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),
Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c4p4')(C4)])
# P3: height and width halved 3 times in total.
# Height/8, Width/8, same channel count
P3 = Add(name="fpn_p3add")([
UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4),
Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c3p3')(C3)])
# P2: height and width halved 2 times in total.
# Height/4, Width/4, same channel count
P2 = Add(name="fpn_p2add")([
UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3),
Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name='fpn_c2p2')(C2)])
# Apply one 3x3 conv to every level; P2..P5 all end up with the same channel count.
# Height/4, Width/4
P2 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p2")(P2)
# Height/8, Width/8
P3 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p3")(P3)
# Height/16, Width/16
P4 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p4")(P4)
# Height/32, Width/32
P5 = Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p5")(P5)
# P6 is an extra level used only by the proposal network: P5 subsampled with stride 2
# (pool_size 1, so it just keeps every other pixel).
# Height/64, Width/64
P6 = MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)
# P2, P3, P4, P5, P6 are used to generate proposals.
rpn_feature_maps = [P2, P3, P4, P5, P6]
# P2, P3, P4, P5 are used by the classifier and mask heads.
mrcnn_feature_maps = [P2, P3, P4, P5]
a2. 生成预测偏移和类别
feature maps
↓
Conv(k=3,f=512)
↓
Conv(k=1,f=num_anchors*2)
Conv(k=1,f=num_anchors*4)
b2. 代码
#------------------------------------#
#   Each of the five pyramid feature
#   maps is fed to the RPN to obtain
#   proposals
#------------------------------------#
def rpn_graph(feature_map, anchors_per_location):
    """Build the RPN head on a single feature map.

    A shared 3x3 conv feeds two 1x1 conv heads: one for anchor class scores
    and one for box refinements.

    Args:
        feature_map: backbone feature map tensor.
        anchors_per_location: number of anchors per feature-map position.

    Returns:
        [rpn_class_logits, rpn_probs, rpn_bbox], where
        rpn_class_logits is reshaped to [batch, num_anchors, 2],
        rpn_probs is the softmax of rpn_class_logits, and
        rpn_bbox is reshaped to [batch, num_anchors, 4].
    """
    shared_features = Conv2D(512, (3, 3), padding='same', activation='relu',
                             name='rpn_conv_shared')(feature_map)

    # Class head: two scores per anchor
    class_raw = Conv2D(2 * anchors_per_location, (1, 1), padding='valid',
                       activation='linear', name='rpn_class_raw')(shared_features)
    rpn_class_logits = Reshape([-1, 2])(class_raw)
    rpn_probs = Activation("softmax", name="rpn_class_xxx")(rpn_class_logits)

    # Box head: four refinement values per anchor
    bbox_raw = Conv2D(anchors_per_location * 4, (1, 1), padding="valid",
                      activation='linear', name='rpn_bbox_pred')(shared_features)
    rpn_bbox = Reshape([-1, 4])(bbox_raw)

    return [rpn_class_logits, rpn_probs, rpn_bbox]
#------------------------------------#
#   Build the region proposal
#   network (RPN) model
#------------------------------------#
def build_rpn_model(anchors_per_location, depth):
    """Wrap rpn_graph in a Keras Model.

    Args:
        anchors_per_location: number of anchors per feature-map position.
        depth: channel count of the input feature map.

    Returns:
        A Model named "rpn_model" whose single input has shape
        [None, None, depth] and whose outputs are those of rpn_graph.
    """
    feature_input = Input(shape=[None, None, depth],
                          name="input_rpn_feature_map")
    rpn_outputs = rpn_graph(feature_input, anchors_per_location)
    return Model([feature_input], rpn_outputs, name="rpn_model")
a3. 解码
anchor(先验框):[cy,cx,h,w]
pred:[dy,dx,dh,dw]
↓
rx=dx*w+cx
ry=dy*h+cy
rh=exp(dh)*h
rw=exp(dw)*w
b3. 代码
#----------------------------------------------------------#
#   Proposal Layer
#   This part turns anchors (prior boxes) into proposals
#----------------------------------------------------------#
def apply_box_deltas_graph(boxes, deltas):
    """Decode boxes by applying refinement deltas.

    Args:
        boxes: [N, (y1, x1, y2, x2)] boxes to refine.
        deltas: [N, (dy, dx, dh, dw)] refinements; dy/dx are scaled by the
            box height/width, dh/dw are applied through exp().

    Returns:
        [N, (y1, x1, y2, x2)] tensor named "apply_box_deltas_out".
    """
    # Box size and centre
    box_h = boxes[:, 2] - boxes[:, 0]
    box_w = boxes[:, 3] - boxes[:, 1]
    ctr_y = boxes[:, 0] + 0.5 * box_h
    ctr_x = boxes[:, 1] + 0.5 * box_w

    # Shift the centre, then rescale the size
    new_ctr_y = ctr_y + deltas[:, 0] * box_h
    new_ctr_x = ctr_x + deltas[:, 1] * box_w
    new_h = box_h * tf.exp(deltas[:, 2])
    new_w = box_w * tf.exp(deltas[:, 3])

    # Back to corner coordinates
    y1 = new_ctr_y - 0.5 * new_h
    x1 = new_ctr_x - 0.5 * new_w
    y2 = y1 + new_h
    x2 = x1 + new_w
    return tf.stack([y1, x1, y2, x2], axis=1, name="apply_box_deltas_out")
def clip_boxes_graph(boxes, window):
    """Clamp box coordinates to lie inside a window.

    Args:
        boxes: [N, (y1, x1, y2, x2)]
        window: [4] in the form y1, x1, y2, x2

    Returns:
        [N, 4] tensor named "clipped_boxes" with every coordinate clamped
        to the window's range on its axis.
    """
    win_y1, win_x1, win_y2, win_x2 = tf.split(window, 4)
    box_y1, box_x1, box_y2, box_x2 = tf.split(boxes, 4, axis=1)

    def _clamp(values, low, high):
        # min against the upper bound first, then max against the lower bound
        return tf.maximum(tf.minimum(values, high), low)

    clipped = tf.concat(
        [
            _clamp(box_y1, win_y1, win_y2),
            _clamp(box_x1, win_x1, win_x2),
            _clamp(box_y2, win_y1, win_y2),
            _clamp(box_x2, win_x1, win_x2),
        ],
        axis=1, name="clipped_boxes")
    # Pin the static shape of the last axis to 4
    clipped.set_shape((clipped.shape[0], 4))
    return clipped
class ProposalLayer(Layer):
    """Turn RPN outputs and anchors into a fixed-size set of proposals.

    Steps: keep the top-scoring anchors, decode them with the predicted
    deltas, clip to the unit window [0, 0, 1, 1], run non-max suppression
    and zero-pad to ``proposal_count`` boxes per image.
    """

    def __init__(self, proposal_count, nms_threshold, config=None, **kwargs):
        """Store the proposal budget, the NMS IoU threshold and the config.

        ``call`` reads RPN_BBOX_STD_DEV, PRE_NMS_LIMIT and IMAGES_PER_GPU
        from ``config``, so a config object is needed before the layer runs.
        """
        super(ProposalLayer, self).__init__(**kwargs)
        self.config = config
        self.proposal_count = proposal_count
        self.nms_threshold = nms_threshold

    def call(self, inputs):
        """Compute proposals from inputs = [rpn_class, rpn_bbox, anchors].

        Returns a tensor of proposals with ``proposal_count`` rows per
        image (zero-padded), each row (y1, x1, y2, x2).
        """
        images_per_gpu = self.config.IMAGES_PER_GPU

        # Index 1 of the last axis is taken as the "contains an object" score
        scores = inputs[0][:, :, 1]
        # Refinement deltas, rescaled by RPN_BBOX_STD_DEV
        deltas = inputs[1] * np.reshape(self.config.RPN_BBOX_STD_DEV, [1, 1, 4])
        anchors = inputs[2]

        # Keep at most PRE_NMS_LIMIT top-scoring anchors before NMS
        pre_nms_limit = tf.minimum(self.config.PRE_NMS_LIMIT, tf.shape(anchors)[1])
        top_ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True,
                             name="top_anchors").indices

        def _take(values, idx):
            return tf.gather(values, idx)

        scores = utils.batch_slice([scores, top_ix], _take, images_per_gpu)
        deltas = utils.batch_slice([deltas, top_ix], _take, images_per_gpu)
        pre_nms_anchors = utils.batch_slice([anchors, top_ix], _take,
                                            images_per_gpu,
                                            names=["pre_nms_anchors"])

        # Decode the anchors: [batch, N, (y1, x1, y2, x2)]
        boxes = utils.batch_slice([pre_nms_anchors, deltas],
                                  lambda a, d: apply_box_deltas_graph(a, d),
                                  images_per_gpu,
                                  names=["refined_anchors"])

        # Keep the boxes inside the unit window
        window = np.array([0, 0, 1, 1], dtype=np.float32)
        boxes = utils.batch_slice(boxes,
                                  lambda b: clip_boxes_graph(b, window),
                                  images_per_gpu,
                                  names=["refined_anchors_clipped"])

        def nms(image_boxes, image_scores):
            # Non-max suppression for one image, zero-padded to proposal_count
            kept = tf.image.non_max_suppression(
                image_boxes, image_scores, self.proposal_count,
                self.nms_threshold, name="rpn_non_max_suppression")
            selected = tf.gather(image_boxes, kept)
            missing = tf.maximum(self.proposal_count - tf.shape(selected)[0], 0)
            return tf.pad(selected, [(0, missing), (0, 0)])

        return utils.batch_slice([boxes, scores], nms, images_per_gpu)

    def compute_output_shape(self, input_shape):
        """Static output shape: (batch, proposal_count, 4)."""
        return (None, self.proposal_count, 4)
a. 结构
Roi->RoiAlign(7,7,256)->Conv(1,1,1024)->Conv(1,1,1024)->(cls+reg)
Roi->RoiAlign(14,14,256)->Conv*4(14,14,256)->Deconv(28,28,256)->Conv(28,28,81)->mask
b. 代码
#------------------------------------#
#   Each of the five pyramid feature
#   maps is fed to the RPN to obtain
#   proposals
#------------------------------------#
def rpn_graph(feature_map, anchors_per_location):
    """Build the RPN head on a single feature map.

    A shared 3x3 conv feeds two 1x1 conv heads: one for anchor class scores
    and one for box refinements.

    Args:
        feature_map: backbone feature map tensor.
        anchors_per_location: number of anchors per feature-map position.

    Returns:
        [rpn_class_logits, rpn_probs, rpn_bbox], where
        rpn_class_logits is reshaped to [batch, num_anchors, 2],
        rpn_probs is the softmax of rpn_class_logits, and
        rpn_bbox is reshaped to [batch, num_anchors, 4].
    """
    shared_features = Conv2D(512, (3, 3), padding='same', activation='relu',
                             name='rpn_conv_shared')(feature_map)

    # Class head: two scores per anchor
    class_raw = Conv2D(2 * anchors_per_location, (1, 1), padding='valid',
                       activation='linear', name='rpn_class_raw')(shared_features)
    rpn_class_logits = Reshape([-1, 2])(class_raw)
    rpn_probs = Activation("softmax", name="rpn_class_xxx")(rpn_class_logits)

    # Box head: four refinement values per anchor
    bbox_raw = Conv2D(anchors_per_location * 4, (1, 1), padding="valid",
                      activation='linear', name='rpn_bbox_pred')(shared_features)
    rpn_bbox = Reshape([-1, 4])(bbox_raw)

    return [rpn_class_logits, rpn_probs, rpn_bbox]
#------------------------------------#
#   Build the region proposal
#   network (RPN) model
#------------------------------------#
def build_rpn_model(anchors_per_location, depth):
    """Wrap rpn_graph in a Keras Model.

    Args:
        anchors_per_location: number of anchors per feature-map position.
        depth: channel count of the input feature map.

    Returns:
        A Model named "rpn_model" whose single input has shape
        [None, None, depth] and whose outputs are those of rpn_graph.
    """
    feature_input = Input(shape=[None, None, depth],
                          name="input_rpn_feature_map")
    rpn_outputs = rpn_graph(feature_input, anchors_per_location)
    return Model([feature_input], rpn_outputs, name="rpn_model")
#------------------------------------#
#   Build the classifier model.
#   Its predictions refine the
#   proposals into the final boxes.
#------------------------------------#
def fpn_classifier_graph(rois, feature_maps, image_meta,
                         pool_size, num_classes, train_bn=True,
                         fc_layers_size=1024):
    """Build the classification / box-regression head on top of the FPN.

    Args:
        rois: proposal boxes passed to PyramidROIAlign.
        feature_maps: list of pyramid feature maps.
        image_meta: image meta tensor passed to PyramidROIAlign.
        pool_size: side length of the square ROI-aligned crop.
        num_classes: number of classes, sizing both output heads.
        train_bn: forwarded as ``training`` to the BatchNormalization layers.
        fc_layers_size: filter count of the two conv ("fully connected") layers.

    Returns:
        (mrcnn_class_logits, mrcnn_probs, mrcnn_bbox):
        class logits [batch, num_rois, num_classes], their softmax, and
        box deltas reshaped to [batch, num_rois, num_classes, 4].
    """
    # Crop the feature maps with the proposals.
    # Shape: [batch, num_rois, POOL_SIZE, POOL_SIZE, channels]
    pooled = PyramidROIAlign([pool_size, pool_size],
                             name="roi_align_classifier")([rois, image_meta] + feature_maps)

    # A pool_size x pool_size 'valid' conv collapses each crop to 1x1, which
    # acts like a fully connected layer.
    # Shape: [batch, num_rois, 1, 1, fc_layers_size]
    hidden = TimeDistributed(Conv2D(fc_layers_size, (pool_size, pool_size), padding="valid"),
                             name="mrcnn_class_conv1")(pooled)
    hidden = TimeDistributed(BatchNormalization(), name='mrcnn_class_bn1')(hidden, training=train_bn)
    hidden = Activation('relu')(hidden)

    # Second "fully connected" layer as a 1x1 conv
    hidden = TimeDistributed(Conv2D(fc_layers_size, (1, 1)),
                             name="mrcnn_class_conv2")(hidden)
    hidden = TimeDistributed(BatchNormalization(), name='mrcnn_class_bn2')(hidden, training=train_bn)
    hidden = Activation('relu')(hidden)

    # Drop the two singleton spatial axes: [batch, num_rois, fc_layers_size]
    shared = Lambda(lambda t: K.squeeze(K.squeeze(t, 3), 2),
                    name="pool_squeeze")(hidden)

    # Classifier head: class of the object inside each ROI
    mrcnn_class_logits = TimeDistributed(Dense(num_classes),
                                         name='mrcnn_class_logits')(shared)
    mrcnn_probs = TimeDistributed(Activation("softmax"),
                                  name="mrcnn_class")(mrcnn_class_logits)

    # Box head: per-class refinements,
    # [batch, num_rois, NUM_CLASSES * (dy, dx, log(dh), log(dw))]
    bbox_flat = TimeDistributed(Dense(num_classes * 4, activation='linear'),
                                name='mrcnn_bbox_fc')(shared)
    # Reshape to [batch, num_rois, NUM_CLASSES, (dy, dx, log(dh), log(dw))]
    mrcnn_bbox = Reshape((-1, num_classes, 4), name="mrcnn_bbox")(bbox_flat)

    return mrcnn_class_logits, mrcnn_probs, mrcnn_bbox
def build_fpn_mask_graph(rois, feature_maps, image_meta,
                         pool_size, num_classes, train_bn=True):
    """Build the mask head on top of the FPN.

    Args:
        rois: proposal boxes passed to PyramidROIAlign.
        feature_maps: list of pyramid feature maps.
        image_meta: image meta tensor passed to PyramidROIAlign.
        pool_size: side length of the square ROI-aligned crop.
        num_classes: number of classes, i.e. output mask channels.
        train_bn: forwarded as ``training`` to the BatchNormalization layers.

    Returns:
        Sigmoid mask tensor, one channel per class, at twice the pooled
        resolution: [batch, num_rois, 2*pool, 2*pool, num_classes].
    """
    # Crop the feature maps with the proposals.
    # Shape: [batch, num_rois, MASK_POOL_SIZE, MASK_POOL_SIZE, channels]
    x = PyramidROIAlign([pool_size, pool_size],
                        name="roi_align_mask")([rois, image_meta] + feature_maps)

    # Four identical 3x3 conv -> BN -> ReLU stages; 'same' padding keeps the
    # spatial size unchanged.
    for idx in range(1, 5):
        x = TimeDistributed(Conv2D(256, (3, 3), padding="same"),
                            name="mrcnn_mask_conv{}".format(idx))(x)
        x = TimeDistributed(BatchNormalization(),
                            name="mrcnn_mask_bn{}".format(idx))(x, training=train_bn)
        x = Activation('relu')(x)

    # Stride-2 transposed conv doubles the spatial resolution
    x = TimeDistributed(Conv2DTranspose(256, (2, 2), strides=2, activation="relu"),
                        name="mrcnn_mask_deconv")(x)
    # A 1x1 conv maps the channels to num_classes: one mask per class
    return TimeDistributed(Conv2D(num_classes, (1, 1), strides=1, activation="sigmoid"),
                           name="mrcnn_mask")(x)
#----------------------------------------------------------#
#   ROIAlign Layer
#   Crops feature-map content using the proposals
#----------------------------------------------------------#
def log2_graph(x):
    """Return the base-2 logarithm of x, computed as ln(x) / ln(2)."""
    natural_log = tf.log(x)
    return natural_log / tf.log(2.0)
def parse_image_meta_graph(meta):
    """Split a batch of image-meta rows into named components.

    Args:
        meta: 2-D array/tensor indexed as [batch, meta_field].

    Returns:
        dict with the column slices:
            "image_id": column 0
            "original_image_shape": columns 1-3
            "image_shape": columns 4-6
            "window": columns 7-10, (y1, x1, y2, x2) window of the image in pixels
            "scale": column 11
            "active_class_ids": columns 12 onwards
    """
    return {
        "image_id": meta[:, 0],
        "original_image_shape": meta[:, 1:4],
        "image_shape": meta[:, 4:7],
        "window": meta[:, 7:11],
        "scale": meta[:, 11],
        "active_class_ids": meta[:, 12:],
    }
class PyramidROIAlign(Layer):
    """ROI-align crops taken from the pyramid level that matches each box.

    Every box is assigned to one of the levels 2..5 from its size, cropped
    from that level's feature map with bilinear ``crop_and_resize``, and the
    crops are put back into the original (image, box) order.
    """

    def __init__(self, pool_shape, **kwargs):
        """pool_shape: (height, width) of every output crop."""
        super(PyramidROIAlign, self).__init__(**kwargs)
        self.pool_shape = tuple(pool_shape)

    def call(self, inputs):
        """inputs = [boxes, image_meta, feature_map_0, feature_map_1, ...].

        Returns crops shaped [batch, num_boxes, pool_h, pool_w, channels].
        """
        rois = inputs[0]
        image_meta = inputs[1]
        # Feature maps, each [batch, height, width, channels]; the loop below
        # uses the first four as levels 2..5.
        pyramid = inputs[2:]

        top, left, bottom, right = tf.split(rois, 4, axis=2)
        roi_h = bottom - top
        roi_w = right - left

        # Network input size, read from the first image's meta row
        net_shape = parse_image_meta_graph(image_meta)['image_shape'][0]
        image_area = tf.cast(net_shape[0] * net_shape[1], tf.float32)

        # Pick a pyramid level per box from its size relative to a 224-pixel
        # reference, clamped to the range [2, 5].
        level = log2_graph(tf.sqrt(roi_h * roi_w) / (224.0 / tf.sqrt(image_area)))
        level = 4 + tf.cast(tf.round(level), tf.int32)
        level = tf.minimum(5, tf.maximum(2, level))
        # [batch, num_boxes]
        level = tf.squeeze(level, 2)

        crops = []
        origins = []
        for map_index, pyramid_level in enumerate(range(2, 6)):
            # (image, box) indices of the boxes assigned to this level
            where = tf.where(tf.equal(level, pyramid_level))
            selected = tf.gather_nd(rois, where)
            origins.append(where)

            # Image each selected box belongs to
            image_index = tf.cast(where[:, 0], tf.int32)

            # No gradient flows back into the box coordinates or indices
            selected = tf.stop_gradient(selected)
            image_index = tf.stop_gradient(image_index)

            # [num_selected, pool_height, pool_width, channels]
            crops.append(tf.image.crop_and_resize(
                pyramid[map_index], selected, image_index, self.pool_shape,
                method="bilinear"))

        crops = tf.concat(crops, axis=0)

        # Append each crop's row position as a third column
        origins = tf.concat(origins, axis=0)
        row_ids = tf.expand_dims(tf.range(tf.shape(origins)[0]), 1)
        origins = tf.concat([tf.cast(origins, tf.int32), row_ids], axis=1)

        # Sort key: image index first, then box index. The 100000 multiplier
        # orders correctly only while box indices stay below 100000.
        sort_key = origins[:, 0] * 100000 + origins[:, 1]
        # top_k sorts descending; reverse for ascending order
        order = tf.nn.top_k(sort_key, k=tf.shape(origins)[0]).indices[::-1]
        order = tf.gather(origins[:, 2], order)
        crops = tf.gather(crops, order)

        # Restore the leading [batch, num_boxes] axes
        out_shape = tf.concat([tf.shape(rois)[:2], tf.shape(crops)[1:]], axis=0)
        return tf.reshape(crops, out_shape)

    def compute_output_shape(self, input_shape):
        """(batch, num_boxes) + pool_shape + (channels of the first feature map,)."""
        return input_shape[0][:2] + self.pool_shape + (input_shape[2][-1], )
#----------------------------------------------------------#
# Detection Layer
#----------------------------------------------------------#
def refine_detections_graph(rois, probs, deltas, window, config):
    """Refine classified proposals, filter overlaps and return detections.

    Inputs:
        rois: [N, (y1, x1, y2, x2)] in normalized coordinates
        probs: [N, num_classes]. Class probabilities.
        deltas: [N, num_classes, (dy, dx, log(dh), log(dw))]. Class-specific
                bounding box deltas.
        window: (y1, x1, y2, x2) in normalized coordinates. The part of the image
            that contains the image excluding the padding.
        config: object providing BBOX_STD_DEV, DETECTION_MIN_CONFIDENCE,
            DETECTION_MAX_INSTANCES and DETECTION_NMS_THRESHOLD.

    Returns detections shaped: [num_detections, (y1, x1, y2, x2, class_id, score)] where
        coordinates are normalized. The result is zero-padded to
        DETECTION_MAX_INSTANCES rows.
    """
    max_detections = config.DETECTION_MAX_INSTANCES

    # Best class per ROI, with its score and class-specific deltas
    best_class = tf.argmax(probs, axis=1, output_type=tf.int32)
    roi_class_index = tf.stack([tf.range(probs.shape[0]), best_class], axis=1)
    best_score = tf.gather_nd(probs, roi_class_index)
    best_deltas = tf.gather_nd(deltas, roi_class_index)

    # Decode the boxes and clip them to the window.
    # Shape: [boxes, (y1, x1, y2, x2)] in normalized coordinates
    decoded = apply_box_deltas_graph(rois, best_deltas * config.BBOX_STD_DEV)
    decoded = clip_boxes_graph(decoded, window)

    # Drop background (class 0) ...
    keep = tf.where(best_class > 0)[:, 0]
    # ... and, when a threshold is configured, low-confidence boxes
    if config.DETECTION_MIN_CONFIDENCE:
        confident = tf.where(best_score >= config.DETECTION_MIN_CONFIDENCE)[:, 0]
        keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
                                        tf.expand_dims(confident, 0))
        keep = tf.sparse_tensor_to_dense(keep)[0]

    # 1. Candidates that survived the filters
    pre_nms_keep = keep
    cand_class = tf.gather(best_class, pre_nms_keep)
    cand_score = tf.gather(best_score, pre_nms_keep)
    cand_boxes = tf.gather(decoded, pre_nms_keep)
    cand_unique_classes = tf.unique(cand_class)[0]

    def _nms_for_class(class_id):
        # NMS restricted to one class; returns ROI indices padded with -1
        members = tf.where(tf.equal(cand_class, class_id))[:, 0]
        survivors = tf.image.non_max_suppression(
            tf.gather(cand_boxes, members),
            tf.gather(cand_score, members),
            max_output_size=max_detections,
            iou_threshold=config.DETECTION_NMS_THRESHOLD)
        # Map back to indices into the original ROI list
        survivors = tf.gather(pre_nms_keep, tf.gather(members, survivors))
        missing = max_detections - tf.shape(survivors)[0]
        survivors = tf.pad(survivors, [(0, missing)],
                           mode='CONSTANT', constant_values=-1)
        # map_fn needs a fixed output shape
        survivors.set_shape([max_detections])
        return survivors

    # 2. Per-class non-max suppression
    nms_keep = tf.map_fn(_nms_for_class, cand_unique_classes, dtype=tf.int64)
    # 3. Flatten and remove the -1 padding
    nms_keep = tf.reshape(nms_keep, [-1])
    nms_keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0])
    # 4. Compute intersection between keep and nms_keep
    keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
                                    tf.expand_dims(nms_keep, 0))
    keep = tf.sparse_tensor_to_dense(keep)[0]

    # Keep the highest-scoring boxes, at most max_detections of them
    kept_scores = tf.gather(best_score, keep)
    num_keep = tf.minimum(tf.shape(kept_scores)[0], max_detections)
    top_ids = tf.nn.top_k(kept_scores, k=num_keep, sorted=True)[1]
    keep = tf.gather(keep, top_ids)

    # Arrange output as [N, (y1, x1, y2, x2, class_id, score)]
    detections = tf.concat([
        tf.gather(decoded, keep),
        tf.to_float(tf.gather(best_class, keep))[..., tf.newaxis],
        tf.gather(best_score, keep)[..., tf.newaxis]
    ], axis=1)

    # Zero-pad when fewer than max_detections boxes remain
    missing = max_detections - tf.shape(detections)[0]
    return tf.pad(detections, [(0, missing), (0, 0)], "CONSTANT")
def norm_boxes_graph(boxes, shape):
    """Convert boxes to normalized coordinates.

    Args:
        boxes: [..., (y1, x1, y2, x2)] boxes to normalize.
        shape: (height, width) used for the normalization.

    Returns:
        (boxes - [0, 0, 1, 1]) / ([h, w, h, w] - 1)
    """
    img_h, img_w = tf.split(tf.cast(shape, tf.float32), 2)
    denominator = tf.concat([img_h, img_w, img_h, img_w], axis=-1) - tf.constant(1.0)
    # y2/x2 are shifted by one before scaling
    offset = tf.constant([0., 0., 1., 1.])
    return tf.divide(boxes - offset, denominator)
class DetectionLayer(Layer):
    """Turn classified proposals and their box deltas into final detections."""

    def __init__(self, config=None, **kwargs):
        """Store the config; ``call`` reads IMAGES_PER_GPU, BATCH_SIZE and
        DETECTION_MAX_INSTANCES from it."""
        super(DetectionLayer, self).__init__(**kwargs)
        self.config = config

    def call(self, inputs):
        """inputs = [rois, mrcnn_class, mrcnn_bbox, image_meta].

        Returns [batch, num_detections, (y1, x1, y2, x2, class_id,
        class_score)] in normalized coordinates.
        """
        rois = inputs[0]
        class_probs = inputs[1]
        box_deltas = inputs[2]
        image_meta = inputs[3]

        # Express the image window in normalized coordinates, using the
        # image shape of the first meta row.
        meta = parse_image_meta_graph(image_meta)
        net_shape = meta['image_shape'][0]
        window = norm_boxes_graph(meta['window'], net_shape[:2])

        def _refine(img_rois, img_probs, img_deltas, img_window):
            return refine_detections_graph(img_rois, img_probs, img_deltas,
                                           img_window, self.config)

        # Run detection refinement graph on each item in the batch
        detections_batch = utils.batch_slice(
            [rois, class_probs, box_deltas, window],
            _refine,
            self.config.IMAGES_PER_GPU)

        out_shape = [self.config.BATCH_SIZE, self.config.DETECTION_MAX_INSTANCES, 6]
        return tf.reshape(detections_batch, out_shape)

    def compute_output_shape(self, input_shape):
        """Static output shape: (batch, DETECTION_MAX_INSTANCES, 6)."""
        return (None, self.config.DETECTION_MAX_INSTANCES, 6)
分类用交叉熵,回归用SmoothL1损失
a. 损失函数
# Wrap every loss function in a named Lambda layer. Each Lambda receives its
# inputs as a list and unpacks them into the loss function's arguments.
# RPN anchor classification loss: anchor match labels vs. RPN class logits.
rpn_class_loss = Lambda(lambda x: rpn_class_loss_graph(*x), name="rpn_class_loss")(
[input_rpn_match, rpn_class_logits])
# RPN box regression loss; also receives config as its first argument.
rpn_bbox_loss = Lambda(lambda x: rpn_bbox_loss_graph(config, *x), name="rpn_bbox_loss")(
[input_rpn_bbox, input_rpn_match, rpn_bbox])
# Classifier-head loss: target class ids vs. class logits, masked by active classes.
class_loss = Lambda(lambda x: mrcnn_class_loss_graph(*x), name="mrcnn_class_loss")(
[target_class_ids, mrcnn_class_logits, active_class_ids])
# Box-head regression loss.
bbox_loss = Lambda(lambda x: mrcnn_bbox_loss_graph(*x), name="mrcnn_bbox_loss")(
[target_bbox, target_class_ids, mrcnn_bbox])
# Mask-head loss.
mask_loss = Lambda(lambda x: mrcnn_mask_loss_graph(*x), name="mrcnn_mask_loss")(
[target_mask, target_class_ids, mrcnn_mask])
b. 代码
import tensorflow as tf
import keras.backend as K
import random
import numpy as np
import logging
from utils import utils
from utils.anchors import compute_backbone_shapes,generate_pyramid_anchors
############################################################
# Loss Functions
############################################################
def batch_pack_graph(x, counts, num_rows):
    """Concatenate a variable-length prefix of each row of x.

    Takes the first ``counts[i]`` entries of row ``i`` for every
    ``i < num_rows`` and concatenates them along axis 0.
    """
    row_prefixes = [x[row, :counts[row]] for row in range(num_rows)]
    return tf.concat(row_prefixes, axis=0)
def smooth_l1_loss(y_true, y_pred):
    """Element-wise Smooth-L1 loss.

    y_true and y_pred are typically: [N, 4], but could be any shape.
    Returns 0.5 * d**2 where d = |y_true - y_pred| < 1, and d - 0.5 elsewhere.
    """
    abs_err = K.abs(y_true - y_pred)
    # 1.0 where the error is in the quadratic zone, 0.0 elsewhere
    is_small = K.cast(K.less(abs_err, 1.0), "float32")
    quadratic_part = is_small * 0.5 * abs_err**2
    linear_part = (1 - is_small) * (abs_err - 0.5)
    return quadratic_part + linear_part
def rpn_class_loss_graph(rpn_match, rpn_class_logits):
    """RPN anchor classifier loss.

    rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive,
               -1=negative, 0=neutral anchor.
    rpn_class_logits: [batch, anchors, 2]. RPN classifier logits for BG/FG.

    Returns the mean sparse categorical cross-entropy over the non-neutral
    anchors, or 0.0 when there are none.
    """
    # Drop the trailing singleton axis
    match = tf.squeeze(rpn_match, -1)
    # Map the -1/+1 match values to 0/1 class labels
    labels = K.cast(K.equal(match, 1), tf.int32)
    # Only positive and negative anchors contribute; neutral ones (0) do not
    contributing = tf.where(K.not_equal(match, 0))
    logits = tf.gather_nd(rpn_class_logits, contributing)
    labels = tf.gather_nd(labels, contributing)
    per_anchor = K.sparse_categorical_crossentropy(target=labels,
                                                   output=logits,
                                                   from_logits=True)
    return K.switch(tf.size(per_anchor) > 0, K.mean(per_anchor), tf.constant(0.0))
def rpn_bbox_loss_graph(config, target_bbox, rpn_match, rpn_bbox):
    """Return the RPN bounding box loss graph.

    config: the model config object (IMAGES_PER_GPU is read from it).
    target_bbox: [batch, max positive anchors, (dy, dx, log(dh), log(dw))].
        Uses 0 padding to fill in unused bbox deltas.
    rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive,
               -1=negative, 0=neutral anchor.
    rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))]

    Returns the mean Smooth-L1 loss over the positive anchors, or 0.0 when
    there are none.
    """
    match = K.squeeze(rpn_match, -1)
    is_positive = K.equal(match, 1)

    # Only positive anchors contribute; negative and neutral ones do not
    positive_ix = tf.where(is_positive)
    predicted = tf.gather_nd(rpn_bbox, positive_ix)

    # Trim the zero-padded targets of every image to its positive-anchor count
    positives_per_image = K.sum(K.cast(K.equal(match, 1), tf.int32), axis=1)
    targets = batch_pack_graph(target_bbox, positives_per_image,
                               config.IMAGES_PER_GPU)

    per_coord = smooth_l1_loss(targets, predicted)
    return K.switch(tf.size(per_coord) > 0, K.mean(per_coord), tf.constant(0.0))
def mrcnn_class_loss_graph(target_class_ids, pred_class_logits,
                           active_class_ids):
    """Loss for the classifier head of Mask RCNN.

    target_class_ids: [batch, num_rois]. Integer class IDs. Uses zero
        padding to fill in the array.
    pred_class_logits: [batch, num_rois, num_classes]
    active_class_ids: [batch, num_classes]. Has a value of 1 for
        classes that are in the dataset of the image, and 0
        for classes that are not in the dataset.

    Returns the cross-entropy summed over ROIs whose predicted class is
    active, divided by the number of such ROIs. When no predicted class is
    active the result is 0 instead of NaN (0 / 0).
    """
    # During model building, Keras calls this function with
    # target_class_ids of type float32. Unclear why. Cast it
    # to int to get around it.
    target_class_ids = tf.cast(target_class_ids, 'int64')

    # Find predictions of classes that are not in the dataset.
    pred_class_ids = tf.argmax(pred_class_logits, axis=2)
    # TODO: Update this line to work with batch > 1. Right now it assumes all
    #       images in a batch have the same active_class_ids
    pred_active = tf.gather(active_class_ids[0], pred_class_ids)

    # Loss
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=target_class_ids, logits=pred_class_logits)

    # Erase losses of predictions of classes that are not in the active
    # classes of the image.
    loss = loss * pred_active

    # Compute the mean over the predictions that contribute to the loss.
    # Guard the denominator: if no prediction is active the numerator is 0
    # as well, so dividing by 1 yields 0 rather than NaN. For a non-zero
    # count the result is unchanged.
    active_count = tf.reduce_sum(pred_active)
    safe_count = tf.where(active_count > 0, active_count,
                          tf.ones_like(active_count))
    loss = tf.reduce_sum(loss) / safe_count
    return loss
def mrcnn_bbox_loss_graph(target_bbox, target_class_ids, pred_bbox):
    """Loss for Mask R-CNN bounding box refinement.

    target_bbox: [batch, num_rois, (dy, dx, log(dh), log(dw))]
    target_class_ids: [batch, num_rois]. Integer class IDs.
    pred_bbox: [batch, num_rois, num_classes, (dy, dx, log(dh), log(dw))]

    Returns the mean Smooth-L1 loss over positive ROIs, using only the
    deltas predicted for each ROI's target class; 0.0 when there are no
    positive ROIs.
    """
    # Merge the batch and ROI axes
    num_classes = K.int_shape(pred_bbox)[2]
    flat_class_ids = K.reshape(target_class_ids, (-1,))
    flat_targets = K.reshape(target_bbox, (-1, 4))
    flat_preds = K.reshape(pred_bbox, (-1, num_classes, 4))

    # Only positive ROIs (class id > 0) contribute, and only the deltas of
    # their target class.
    positive_rows = tf.where(flat_class_ids > 0)[:, 0]
    positive_classes = tf.cast(
        tf.gather(flat_class_ids, positive_rows), tf.int64)
    row_class_pairs = tf.stack([positive_rows, positive_classes], axis=1)

    # Gather the deltas (predicted and true) that contribute to loss
    y_true = tf.gather(flat_targets, positive_rows)
    y_pred = tf.gather_nd(flat_preds, row_class_pairs)

    # Smooth-L1 loss, or a constant 0 when nothing is positive
    per_coord = K.switch(tf.size(y_true) > 0,
                         smooth_l1_loss(y_true=y_true, y_pred=y_pred),
                         tf.constant(0.0))
    return K.mean(per_coord)
def mrcnn_mask_loss_graph(target_masks, target_class_ids, pred_masks):
    """Mask binary cross-entropy loss for the masks head.

    target_masks: [batch, num_rois, height, width].
        A float32 tensor of values 0 or 1. Uses zero padding to fill array.
    target_class_ids: [batch, num_rois]. Integer class IDs. Zero padded.
    pred_masks: [batch, proposals, height, width, num_classes] float32 tensor
                with values from 0 to 1.

    Returns the mean binary cross-entropy over positive ROIs, using only
    the mask predicted for each ROI's target class; 0.0 when there are no
    positive ROIs.
    """
    # Merge the first two axes of every input
    flat_class_ids = K.reshape(target_class_ids, (-1,))
    target_dims = tf.shape(target_masks)
    flat_targets = K.reshape(target_masks, (-1, target_dims[2], target_dims[3]))
    pred_dims = tf.shape(pred_masks)
    flat_preds = K.reshape(pred_masks,
                           (-1, pred_dims[2], pred_dims[3], pred_dims[4]))
    # Permute predicted masks to [N, num_classes, height, width]
    flat_preds = tf.transpose(flat_preds, [0, 3, 1, 2])

    # Only positive ROIs (class id > 0) contribute, and only the mask of
    # their target class.
    positive_rows = tf.where(flat_class_ids > 0)[:, 0]
    positive_classes = tf.cast(
        tf.gather(flat_class_ids, positive_rows), tf.int64)
    row_class_pairs = tf.stack([positive_rows, positive_classes], axis=1)

    # Gather the masks (predicted and true) that contribute to loss
    y_true = tf.gather(flat_targets, positive_rows)
    y_pred = tf.gather_nd(flat_preds, row_class_pairs)

    # Binary cross-entropy, or a constant 0 when there are no positive ROIs
    per_pixel = K.switch(tf.size(y_true) > 0,
                         K.binary_crossentropy(target=y_true, output=y_pred),
                         tf.constant(0.0))
    return K.mean(per_pixel)
############################################################
# Data Generator
############################################################
def load_image_gt(dataset, config, image_id, augment=False, augmentation=None,
                  use_mini_mask=False):
    """Load one image with its ground truth, resized for the network.

    Args:
        dataset: object providing load_image, load_mask, num_classes,
            source_class_ids and image_info.
        config: model config; IMAGE_MIN_DIM, IMAGE_MIN_SCALE, IMAGE_MAX_DIM,
            IMAGE_RESIZE_MODE and (with use_mini_mask) MINI_MASK_SHAPE are read.
        image_id: id of the image to load.
        augment: deprecated; when True the image and mask are randomly
            flipped left-right. Use ``augmentation`` instead.
        augmentation: optional imgaug augmenter applied to the image and,
            for the augmenters listed in MASK_AUGMENTERS, to the mask.
        use_mini_mask: when True, masks are reduced with utils.minimize_mask.

    Returns:
        (image, image_meta, class_ids, bbox, mask)
    """
    # 1. Load the image and its instance masks
    image = dataset.load_image(image_id)
    mask, class_ids = dataset.load_mask(image_id)
    # Shape before resizing
    original_shape = image.shape

    # 2. Resize/pad the image; also yields the window of the original image
    #    inside the new one, the scale factor and the padding applied
    image, window, scale, padding, crop = utils.resize_image(
        image,
        min_dim=config.IMAGE_MIN_DIM,
        min_scale=config.IMAGE_MIN_SCALE,
        max_dim=config.IMAGE_MAX_DIM,
        mode=config.IMAGE_RESIZE_MODE)
    # 3. Apply the same scale/padding/crop to the masks
    mask = utils.resize_mask(mask, scale, padding, crop)

    # 4. Optional random horizontal flip (deprecated path)
    if augment:
        logging.warning("'augment' is deprecated. Use 'augmentation' instead.")
        if random.randint(0, 1):
            image = np.fliplr(image)
            mask = np.fliplr(mask)

    if augmentation:
        import imgaug

        # Augmenters that are safe to apply to masks
        MASK_AUGMENTERS = ["Sequential", "SomeOf", "OneOf", "Sometimes",
                           "Fliplr", "Flipud", "CropAndPad",
                           "Affine", "PiecewiseAffine"]

        def hook(images, augmenter, parents, default):
            """Determines which augmenters to apply to masks."""
            return augmenter.__class__.__name__ in MASK_AUGMENTERS

        image_shape = image.shape
        mask_shape = mask.shape
        # Freeze the random state so image and mask get the same transform
        det = augmentation.to_deterministic()
        image = det.augment_image(image)
        mask = det.augment_image(mask.astype(np.uint8),
                                 hooks=imgaug.HooksImages(activator=hook))
        assert image.shape == image_shape, "Augmentation shouldn't change image size"
        assert mask.shape == mask_shape, "Augmentation shouldn't change mask size"
        # Builtin bool: the np.bool alias was removed in NumPy 1.24
        mask = mask.astype(bool)

    # 5. Drop instances whose mask is empty; mask is [h, w, num_instances]
    _idx = np.sum(mask, axis=(0, 1)) > 0
    mask = mask[:, :, _idx]
    class_ids = class_ids[_idx]

    # 6. Bounding boxes derived from the masks
    bbox = utils.extract_bboxes(mask)

    # Mark the classes that belong to this image's source dataset
    active_class_ids = np.zeros([dataset.num_classes], dtype=np.int32)
    source_class_ids = dataset.source_class_ids[dataset.image_info[image_id]["source"]]
    active_class_ids[source_class_ids] = 1

    # 7. Optionally shrink each mask to its bounding box
    if use_mini_mask:
        mask = utils.minimize_mask(bbox, mask, config.MINI_MASK_SHAPE)

    # Build the image meta vector
    image_meta = utils.compose_image_meta(image_id, original_shape, image.shape,
                                          window, scale, active_class_ids)
    return image, image_meta, class_ids, bbox, mask
def build_rpn_targets(image_shape, anchors, gt_class_ids, gt_boxes, config):
    """Label every anchor for RPN training and encode the positive ones.

    Args:
        image_shape: Accepted for interface compatibility; not read here.
        anchors: [num_anchors, (y1, x1, y2, x2)] anchor boxes.
        gt_class_ids: Class id per ground-truth box. Negative ids are treated
            as crowd annotations (per the original note: COCO ``iscrowd=1``
            regions that cover several objects which could not be separated).
        gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)] ground-truth boxes.
        config: Object providing ``RPN_TRAIN_ANCHORS_PER_IMAGE`` and
            ``RPN_BBOX_STD_DEV``.

    Returns:
        rpn_match: [num_anchors] int32; 1 = positive, -1 = negative,
            0 = neutral (ignored).
        rpn_bbox: [RPN_TRAIN_ANCHORS_PER_IMAGE, (dy, dx, log(dh), log(dw))]
            deltas for the positive anchors, filled from row 0; unused rows
            stay zero.
    """
    num_anchors = anchors.shape[0]
    budget = config.RPN_TRAIN_ANCHORS_PER_IMAGE
    rpn_match = np.zeros([num_anchors], dtype=np.int32)
    rpn_bbox = np.zeros((budget, 4))

    # Separate crowd boxes from regular ones. Anchors overlapping a crowd
    # box are later excluded from the negative set.
    crowd_ix = np.where(gt_class_ids < 0)[0]
    if crowd_ix.shape[0] == 0:
        outside_crowd = np.ones([num_anchors], dtype=bool)
    else:
        regular_ix = np.where(gt_class_ids > 0)[0]
        crowd_boxes = gt_boxes[crowd_ix]
        gt_class_ids = gt_class_ids[regular_ix]
        gt_boxes = gt_boxes[regular_ix]
        crowd_iou = utils.compute_overlaps(anchors, crowd_boxes)
        outside_crowd = np.amax(crowd_iou, axis=1) < 0.001

    # IoU table of shape [num_anchors, num_gt_boxes].
    overlaps = utils.compute_overlaps(anchors, gt_boxes)
    best_gt = np.argmax(overlaps, axis=1)
    best_iou = overlaps[np.arange(overlaps.shape[0]), best_gt]

    # 1. Best IoU below 0.3 and not on a crowd region -> negative.
    rpn_match[(best_iou < 0.3) & (outside_crowd)] = -1
    # 2. The anchor(s) with the highest IoU for each GT box -> positive.
    top_per_gt = np.argwhere(overlaps == np.max(overlaps, axis=0))[:, 0]
    rpn_match[top_per_gt] = 1
    # 3. Best IoU of at least 0.7 -> positive.
    rpn_match[best_iou >= 0.7] = 1

    # Balance the sample: positives may fill at most half of the budget.
    pos_ids = np.where(rpn_match == 1)[0]
    surplus = len(pos_ids) - (budget // 2)
    if surplus > 0:
        # Reset the surplus positives to neutral.
        rpn_match[np.random.choice(pos_ids, surplus, replace=False)] = 0

    # Negatives fill whatever the remaining positives leave of the budget.
    neg_ids = np.where(rpn_match == -1)[0]
    surplus = len(neg_ids) - (budget - np.sum(rpn_match == 1))
    if surplus > 0:
        # Reset the surplus negatives to neutral.
        rpn_match[np.random.choice(neg_ids, surplus, replace=False)] = 0

    # 4. Encode each remaining positive anchor against its best GT box.
    pos_ids = np.where(rpn_match == 1)[0]
    for row, anchor_ix in enumerate(pos_ids):
        anchor = anchors[anchor_ix]
        target = gt_boxes[best_gt[anchor_ix]]

        # Height, width and centre of the ground-truth box.
        t_h = target[2] - target[0]
        t_w = target[3] - target[1]
        t_cy = target[0] + 0.5 * t_h
        t_cx = target[1] + 0.5 * t_w

        # Height, width and centre of the anchor.
        a_h = anchor[2] - anchor[0]
        a_w = anchor[3] - anchor[1]
        a_cy = anchor[0] + 0.5 * a_h
        a_cx = anchor[1] + 0.5 * a_w

        # Centre offsets relative to anchor size, log-scaled size ratios.
        rpn_bbox[row] = [
            (t_cy - a_cy) / a_h,
            (t_cx - a_cx) / a_w,
            np.log(t_h / a_h),
            np.log(t_w / a_w),
        ]
        # Rescale the deltas by the configured standard deviations.
        rpn_bbox[row] /= config.RPN_BBOX_STD_DEV

    return rpn_match, rpn_bbox
def data_generator(dataset, config, shuffle=True, augment=False, augmentation=None,
                   batch_size=1, detection_targets=False,
                   no_augmentation_sources=None):
    """Endlessly yield ``(inputs, outputs)`` training batches.

    Images are visited cyclically; the id order is reshuffled at the start of
    every pass when ``shuffle`` is true. Images without any positive class id
    are skipped. Images from a source listed in ``no_augmentation_sources``
    are loaded with ``augmentation=None``.

    inputs list:
    - images: [batch, H, W, C] float32, after utils.mold_image()
    - image_meta: [batch, (meta data)] as returned by load_image_gt()
    - rpn_match: [batch, anchors, 1] Integer (1=positive anchor, -1=negative,
                 0=neutral)
    - rpn_bbox: [batch, RPN_TRAIN_ANCHORS_PER_IMAGE,
                 (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
    - gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs, zero padded
    - gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)], zero padded
    - gt_masks: [batch, height, width, MAX_GT_INSTANCES], height and width
                taken from the masks returned by load_image_gt()

    outputs list: always empty here; ``detection_targets`` is accepted but
        not read by this implementation.
    """
    exempt_sources = no_augmentation_sources or []
    image_ids = np.copy(dataset.image_ids)

    # 1. Anchors are computed once: [anchor_count, (y1, x1, y2, x2)].
    backbone_shapes = compute_backbone_shapes(config, config.IMAGE_SHAPE)
    anchors = generate_pyramid_anchors(config.RPN_ANCHOR_SCALES,
                                       config.RPN_ANCHOR_RATIOS,
                                       backbone_shapes,
                                       config.BACKBONE_STRIDES,
                                       config.RPN_ANCHOR_STRIDE)

    slot = 0      # position inside the batch being assembled
    cursor = -1   # position inside image_ids
    while True:
        cursor = (cursor + 1) % len(image_ids)
        if shuffle and cursor == 0:
            np.random.shuffle(image_ids)
        image_id = image_ids[cursor]

        # Load the image with its boxes and masks; exempt sources get no
        # imgaug augmentation.
        if dataset.image_info[image_id]['source'] in exempt_sources:
            chosen_augmentation = None
        else:
            chosen_augmentation = augmentation
        image, image_meta, gt_class_ids, gt_boxes, gt_masks = load_image_gt(
            dataset, config, image_id, augment=augment,
            augmentation=chosen_augmentation,
            use_mini_mask=config.USE_MINI_MASK)

        # Nothing to learn from an image without positive instances.
        if not np.any(gt_class_ids > 0):
            continue

        # RPN targets (labels and box deltas).
        rpn_match, rpn_bbox = build_rpn_targets(image.shape, anchors,
                                                gt_class_ids, gt_boxes, config)

        # Randomly keep MAX_GT_INSTANCES instances when there are more.
        instance_count = gt_boxes.shape[0]
        if instance_count > config.MAX_GT_INSTANCES:
            keep = np.random.choice(
                np.arange(instance_count), config.MAX_GT_INSTANCES, replace=False)
            gt_class_ids = gt_class_ids[keep]
            gt_boxes = gt_boxes[keep]
            gt_masks = gt_masks[:, :, keep]

        # Fresh buffers at the start of every batch.
        if slot == 0:
            batch_images = np.zeros(
                (batch_size,) + image.shape, dtype=np.float32)
            batch_image_meta = np.zeros(
                (batch_size,) + image_meta.shape, dtype=image_meta.dtype)
            batch_rpn_match = np.zeros(
                [batch_size, anchors.shape[0], 1], dtype=rpn_match.dtype)
            batch_rpn_bbox = np.zeros(
                [batch_size, config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4],
                dtype=rpn_bbox.dtype)
            batch_gt_class_ids = np.zeros(
                (batch_size, config.MAX_GT_INSTANCES), dtype=np.int32)
            batch_gt_boxes = np.zeros(
                (batch_size, config.MAX_GT_INSTANCES, 4), dtype=np.int32)
            batch_gt_masks = np.zeros(
                (batch_size, gt_masks.shape[0], gt_masks.shape[1],
                 config.MAX_GT_INSTANCES), dtype=gt_masks.dtype)

        # Copy this sample into its slot; instance arrays stay zero padded.
        batch_images[slot] = utils.mold_image(image.astype(np.float32), config)
        batch_image_meta[slot] = image_meta
        batch_rpn_match[slot] = rpn_match[:, np.newaxis]
        batch_rpn_bbox[slot] = rpn_bbox
        batch_gt_class_ids[slot, :gt_class_ids.shape[0]] = gt_class_ids
        batch_gt_boxes[slot, :gt_boxes.shape[0]] = gt_boxes
        batch_gt_masks[slot, :, :, :gt_masks.shape[-1]] = gt_masks
        slot += 1

        if slot < batch_size:
            continue

        # Batch complete: hand it out, then start a new one.
        inputs = [batch_images, batch_image_meta, batch_rpn_match, batch_rpn_bbox,
                  batch_gt_class_ids, batch_gt_boxes, batch_gt_masks]
        outputs = []
        yield inputs, outputs
        slot = 0