Previous chapter: Instance Segmentation (II) of the deep learning series, a detailed walk-through of the Mask R-CNN instance segmentation code and training on your own data
The code below is commented; with a little thought it is not hard to follow. For the parts that are harder to grasp, step through them in a debugger to get a more concrete picture.
1. Utility files
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# ============================================
# @Time : 2020/05/13 22:57
# @Author : WanDaoYi
# @FileName : image_utils.py
# ============================================
import numpy as np
import skimage.color
import skimage.io
import skimage.transform
from distutils.version import LooseVersion
from config import cfg
class ImageUtils(object):
def __init__(self):
self.mean_pixel = np.array(cfg.COMMON.MEAN_PIXEL)
pass
def parse_image_meta_graph(self, meta):
"""
Parses a tensor that contains image attributes to its components.
See compose_image_meta() for more details.
:param meta: [batch, meta length] where meta length depends on NUM_CLASSES
:return: Returns a dict of the parsed tensors.
"""
image_id = meta[:, 0]
original_image_shape = meta[:, 1:4]
image_shape = meta[:, 4:7]
window = meta[:, 7:11]  # (y1, x1, y2, x2) window of image in pixels
scale = meta[:, 11]
active_class_ids = meta[:, 12:]
return {
"image_id": image_id,
"original_image_shape": original_image_shape,
"image_shape": image_shape,
"window": window,
"scale": scale,
"active_class_ids": active_class_ids,
}
pass
def compose_image_meta(self, image_id, original_image_shape, image_shape,
window, scale, active_class_ids):
"""
Takes attributes of an image and puts them in one 1D array.
:param image_id: An int ID of the image. Useful for debugging.
:param original_image_shape: [H, W, C] before resizing or padding.
:param image_shape: [H, W, C] after resizing and padding
:param window: (y1, x1, y2, x2) in pixels. The area of the image where the real
image is (excluding the padding)
:param scale: The scaling factor applied to the original image (float32)
:param active_class_ids: List of class_ids available in the dataset from which
the image came. Useful if training on images from multiple datasets
where not all classes are present in all datasets.
:return:
"""
meta = np.array([image_id] + # size=1
list(original_image_shape) + # size=3
list(image_shape) + # size=3
list(window) +  # size=4 (y1, x1, y2, x2) in image coordinates
[scale] + # size=1
list(active_class_ids) # size=class_num
)
return meta
pass
def load_image(self, image_path):
"""
Load the specified image and return a [H,W,3] Numpy array.
:param image_path: image path
:return:
"""
# Load image
image = skimage.io.imread(image_path)
# If grayscale. Convert to RGB for consistency.
if image.ndim != 3:
image = skimage.color.gray2rgb(image)
# If has an alpha channel, remove it for consistency
if image.shape[-1] == 4:
image = image[..., :3]
return image
pass
def mold_image(self, images, mean_pixel):
"""
Expects an RGB image (or array of images) and subtracts
the mean pixel and converts it to float. Expects image
colors in RGB order.
:param images:
:param mean_pixel:
:return:
"""
return images.astype(np.float32) - np.array(mean_pixel)
pass
def mode_input(self, images_info_list):
"""
Takes a list of images and modifies them to the format expected
as an input to the neural network.
:param images_info_list: List of image matrices [height,width,depth]. Images can have
different sizes.
:return: returns 3 Numpy matrices:
molded_images_list: [N, h, w, 3]. Images resized and normalized.
image_metas_list: [N, length of meta data]. Details about each image.
windows_list: [N, (y1, x1, y2, x2)]. The portion of the image that has the
original image (padding excluded).
"""
molded_images_list = []
image_metas_list = []
windows_list = []
image_mi_dim = cfg.COMMON.IMAGE_MIN_DIM
image_max_dim = cfg.COMMON.IMAGE_MAX_DIM
image_min_scale = cfg.COMMON.IMAGE_MIN_SCALE
image_resize_mode = cfg.COMMON.IMAGE_RESIZE_MODE
for image_info in images_info_list:
# resize image
molded_image, window, scale, padding, crop = self.resize_image(image_info,
min_dim=image_mi_dim,
min_scale=image_min_scale,
max_dim=image_max_dim,
resize_mode=image_resize_mode)
molded_image = self.mold_image(molded_image, self.mean_pixel)
# Build image_meta
image_meta = self.compose_image_meta(0, image_info.shape, molded_image.shape, window, scale,
np.zeros([cfg.COMMON.CLASS_NUM], dtype=np.int32))
# Append
molded_images_list.append(molded_image)
image_metas_list.append(image_meta)
windows_list.append(window)
pass
# Pack into arrays
molded_images_list = np.stack(molded_images_list)
image_metas_list = np.stack(image_metas_list)
windows_list = np.stack(windows_list)
return molded_images_list, image_metas_list, windows_list
pass
def resize(self, image, output_shape, order=1, resize_mode="constant", cval=0, clip=True,
preserve_range=False, anti_aliasing=False, anti_aliasing_sigma=None):
"""
A wrapper for Scikit-Image resize().
Scikit-Image generates warnings on every call to resize() if it doesn't
receive the right parameters. The right parameters depend on the version
of skimage. This solves the problem by using different parameters per
version. And it provides a central place to control resizing defaults.
:param image:
:param output_shape:
:param order:
:param resize_mode:
:param cval:
:param clip:
:param preserve_range:
:param anti_aliasing:
:param anti_aliasing_sigma:
:return:
"""
if LooseVersion(skimage.__version__) >= LooseVersion("0.14"):
# New in 0.14: anti_aliasing. Default it to False for backward
# compatibility with skimage 0.13.
return skimage.transform.resize(image, output_shape,
order=order, mode=resize_mode, cval=cval, clip=clip,
preserve_range=preserve_range, anti_aliasing=anti_aliasing,
anti_aliasing_sigma=anti_aliasing_sigma)
else:
return skimage.transform.resize(image, output_shape,
order=order, mode=resize_mode, cval=cval, clip=clip,
preserve_range=preserve_range)
pass
def resize_image(self, image, min_dim=None, max_dim=None, min_scale=None, resize_mode="square"):
"""
resize an image keeping the aspect ratio unchanged.
:param image:
:param min_dim: if provided, resize the image such that it's smaller dimension == min_dim
:param max_dim: if provided, ensures that the image longest side doesn't
exceed this value.
:param min_scale: if provided, ensure that the image is scaled up by at least
this percent even if min_dim doesn't require it.
:param resize_mode: resizing mode.
none: No resizing. Return the image unchanged.
square: Resize and pad with zeros to get a square image
of size [max_dim, max_dim].
pad64: Pads width and height with zeros to make them multiples of 64.
If min_dim or min_scale are provided, it scales the image up
before padding. max_dim is ignored in this mode.
The multiple of 64 is needed to ensure smooth scaling of feature
maps up and down the 6 levels of the FPN pyramid (2**6=64).
crop: Picks random crops from the image. First, scales the image based
on min_dim and min_scale, then picks a random crop of
size min_dim x min_dim. Can be used in training only.
max_dim is not used in this mode.
:return:
image: the resized image
window: (y1, x1, y2, x2). If max_dim is provided, padding might
be inserted in the returned image. If so, this window is the
coordinates of the image part of the full image (excluding
the padding). The x2, y2 pixels are not included.
scale: The scale factor used to resize the image
padding: Padding added to the image [(top, bottom), (left, right), (0, 0)]
"""
# Keep track of image dtype and return results in the same dtype
image_dtype = image.dtype
# Default window (y1, x1, y2, x2) and default scale == 1.
h, w = image.shape[:2]
window = (0, 0, h, w)
scale = 1
padding = [(0, 0), (0, 0), (0, 0)]
crop = None
if resize_mode == "none":
return image, window, scale, padding, crop
pass
# Scale?
if min_dim:
# Scale up but not down
scale = max(1, min_dim / min(h, w))
pass
if min_scale and scale < min_scale:
scale = min_scale
pass
# Does it exceed max dim?
if max_dim and resize_mode == "square":
image_max = max(h, w)
if round(image_max * scale) > max_dim:
scale = max_dim / image_max
pass
pass
# Resize image using bilinear interpolation
if scale != 1:
image = self.resize(image, (round(h * scale), round(w * scale)), preserve_range=True)
pass
# Need padding or cropping?
if resize_mode == "square":
# Get new height and width
h, w = image.shape[:2]
top_pad = (max_dim - h) // 2
bottom_pad = max_dim - h - top_pad
left_pad = (max_dim - w) // 2
right_pad = max_dim - w - left_pad
padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)]
image = np.pad(image, padding, mode='constant', constant_values=0)
window = (top_pad, left_pad, h + top_pad, w + left_pad)
pass
elif resize_mode == "pad64":
h, w = image.shape[:2]
# Both sides must be divisible by 64
assert min_dim % 64 == 0, "Minimum dimension must be a multiple of 64"
# Height
if h % 64 > 0:
max_h = h - (h % 64) + 64
top_pad = (max_h - h) // 2
bottom_pad = max_h - h - top_pad
else:
top_pad = bottom_pad = 0
# Width
if w % 64 > 0:
max_w = w - (w % 64) + 64
left_pad = (max_w - w) // 2
right_pad = max_w - w - left_pad
else:
left_pad = right_pad = 0
padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)]
image = np.pad(image, padding, mode='constant', constant_values=0)
window = (top_pad, left_pad, h + top_pad, w + left_pad)
pass
elif resize_mode == "crop":
# Pick a random crop
h, w = image.shape[:2]
y = np.random.randint(0, (h - min_dim))
x = np.random.randint(0, (w - min_dim))
crop = (y, x, min_dim, min_dim)
image = image[y:y + min_dim, x:x + min_dim]
window = (0, 0, min_dim, min_dim)
pass
else:
raise Exception("Mode {} not supported".format(resize_mode))
pass
return image.astype(image_dtype), window, scale, padding, crop
pass
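Before moving on to the next file, here is a small standalone sketch (my own, not part of the project) that makes the layout of the image meta vector concrete. It builds a meta vector the same way compose_image_meta() does and slices it back the way parse_image_meta_graph() does; all the concrete values (the COCO-style class count of 81, the 640x480 input, the 1024 target size) are assumptions for illustration only.
import numpy as np

# Assumed values for illustration only
num_classes = 81                      # e.g. COCO: 80 classes + background
image_id = 0
original_shape = (480, 640, 3)        # [H, W, C] before resizing
molded_shape = (1024, 1024, 3)        # [H, W, C] after resize + padding
window = (128, 0, 896, 1024)          # (y1, x1, y2, x2) of the real image inside the padding
scale = 1.6                           # 640 -> 1024
active_class_ids = np.ones(num_classes, dtype=np.int32)

# Same layout as compose_image_meta(): 1 + 3 + 3 + 4 + 1 + num_classes values
meta = np.concatenate([[image_id], original_shape, molded_shape, window, [scale], active_class_ids])
print(meta.shape)          # (93,) for num_classes = 81

# Slicing it back mirrors parse_image_meta_graph() (which works on a [batch, N] tensor)
print(meta[0])             # image_id
print(meta[1:4])           # original_image_shape
print(meta[4:7])           # image_shape
print(meta[7:11])          # window
print(meta[11])            # scale
print(meta[12:].shape)     # active_class_ids, length num_classes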
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# ============================================
# @Time : 2020/05/13 12:06
# @Author : WanDaoYi
# @FileName : misc_utils.py
# ============================================
import math
import numpy as np
import tensorflow as tf
from utils.bbox_utils import BboxUtil
from config import cfg
class MiscUtils(object):
def __init__(self):
self.bbox_util = BboxUtil()
pass
def compute_backbone_shapes(self, image_shape, backbone_strides):
"""
Computes the width and height of each stage of the backbone network
:param image_shape: [h, w, c]
:param backbone_strides: The strides of each layer of the FPN Pyramid.
These values are based on a ResNet101 backbone.
:return: [N, (height, width)]. Where N is the number of stages
"""
return np.array(
[[int(math.ceil(image_shape[0] / stride)),
int(math.ceil(image_shape[1] / stride))] for stride in backbone_strides])
pass
def batch_slice(self, inputs, graph_fn, batch_size, names=None):
"""
Splits inputs into slices and feeds each slice to a copy of the given
computation graph and then combines the results. It allows you to run a
graph on a batch of inputs even if the graph is written to support one
instance only.
:param inputs: list of tensors. All must have the same first dimension length
:param graph_fn: A function that returns a TF tensor that's part of a graph.
:param batch_size: number of slices to divide the data into.
:param names: If provided, assigns names to the resulting tensors.
:return:
"""
if not isinstance(inputs, list):
inputs = [inputs]
outputs = []
for i in range(batch_size):
inputs_slice = [x[i] for x in inputs]
output_slice = graph_fn(*inputs_slice)
if not isinstance(output_slice, (tuple, list)):
output_slice = [output_slice]
outputs.append(output_slice)
# Change outputs from a list of slices where each is
# a list of outputs to a list of outputs and each has
# a list of slices
outputs = list(zip(*outputs))
if names is None:
names = [None] * len(outputs)
result = [tf.stack(o, axis=0, name=n)
for o, n in zip(outputs, names)]
if len(result) == 1:
result = result[0]
return result
pass
def trim_zeros_graph(self, boxes, name='trim_zeros'):
"""
Often boxes are represented with matrices of shape [N, 4] and
are padded with zeros. This removes zero boxes.
:param boxes: [N, 4] matrix of boxes.
:param name:
:return: boxes: [M, 4] the non-zero boxes; non_zeros: [N] a 1D boolean mask identifying the rows to keep
"""
non_zeros = tf.cast(tf.reduce_sum(tf.abs(boxes), axis=1), tf.bool)
boxes = tf.boolean_mask(boxes, non_zeros, name=name)
return boxes, non_zeros
pass
def detection_targets_graph(self, proposals, gt_class_ids, gt_boxes, gt_masks):
"""
Generates detection targets for one image. Subsamples proposals and
generates target class IDs, bounding box deltas, and masks for each.
:param proposals: [POST_NMS_ROIS_TRAINING, (y1, x1, y2, x2)] in normalized coordinates.
Might be zero padded if there are not enough proposals.
:param gt_class_ids: [MAX_GT_INSTANCES] int class IDs
:param gt_boxes: [MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates.
:param gt_masks: [height, width, MAX_GT_INSTANCES] of boolean type.
:return: Target ROIs and corresponding class IDs, bounding box shifts, and masks.
rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates
class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs. Zero padded.
deltas: [TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw))]
masks: [TRAIN_ROIS_PER_IMAGE, height, width]. Masks cropped to bbox
boundaries and resized to neural network output size.
Note: Returned arrays might be zero padded if not enough target ROIs.
"""
# Assertions
asserts = [tf.Assert(tf.greater(tf.shape(proposals)[0], 0), [proposals], name="roi_assertion"), ]
with tf.control_dependencies(asserts):
proposals = tf.identity(proposals)
pass
# Remove zero padding
proposals, _ = self.trim_zeros_graph(proposals, name="trim_proposals")
gt_boxes, non_zeros = self.trim_zeros_graph(gt_boxes, name="trim_gt_boxes")
gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros, name="trim_gt_class_ids")
gt_masks = tf.gather(gt_masks, tf.where(non_zeros)[:, 0], axis=2, name="trim_gt_masks")
# Handle COCO crowds
# A crowd box in COCO is a bounding box around several instances. Exclude
# them from training. A crowd box is given a negative class ID.
crowd_ix = tf.where(gt_class_ids < 0)[:, 0]
non_crowd_ix = tf.where(gt_class_ids > 0)[:, 0]
crowd_boxes = tf.gather(gt_boxes, crowd_ix)
gt_class_ids = tf.gather(gt_class_ids, non_crowd_ix)
gt_boxes = tf.gather(gt_boxes, non_crowd_ix)
gt_masks = tf.gather(gt_masks, non_crowd_ix, axis=2)
# Compute overlaps matrix [proposals, gt_boxes]
overlaps = self.bbox_util.overlaps_graph(proposals, gt_boxes)
# Compute overlaps with crowd boxes [proposals, crowd_boxes]
crowd_overlaps = self.bbox_util.overlaps_graph(proposals, crowd_boxes)
crowd_iou_max = tf.reduce_max(crowd_overlaps, axis=1)
no_crowd_bool = (crowd_iou_max < 0.001)
# Determine positive and negative ROIs
roi_iou_max = tf.reduce_max(overlaps, axis=1)
# 1. Positive ROIs are those with >= 0.5 IoU with a GT box
positive_roi_bool = (roi_iou_max >= 0.5)
positive_indices = tf.where(positive_roi_bool)[:, 0]
# 2. Negative ROIs are those with < 0.5 with every GT box. Skip crowds.
negative_indices = tf.where(tf.logical_and(roi_iou_max < 0.5, no_crowd_bool))[:, 0]
# Subsample ROIs. Aim for 33% positive
# Positive ROIs
positive_count = int(cfg.TRAIN.ROIS_PER_IMAGE * cfg.TRAIN.ROI_POSITIVE_RATIO)
positive_indices = tf.random_shuffle(positive_indices)[:positive_count]
positive_count = tf.shape(positive_indices)[0]
# Negative ROIs. Add enough to maintain positive:negative ratio.
r = 1.0 / cfg.TRAIN.ROI_POSITIVE_RATIO
negative_count = tf.cast(r * tf.cast(positive_count, tf.float32), tf.int32) - positive_count
negative_indices = tf.random_shuffle(negative_indices)[:negative_count]
# Gather selected ROIs
positive_rois = tf.gather(proposals, positive_indices)
negative_rois = tf.gather(proposals, negative_indices)
# Assign positive ROIs to GT boxes.
positive_overlaps = tf.gather(overlaps, positive_indices)
roi_gt_box_assignment = tf.cond(
tf.greater(tf.shape(positive_overlaps)[1], 0),
true_fn=lambda: tf.argmax(positive_overlaps, axis=1),
false_fn=lambda: tf.cast(tf.constant([]), tf.int64)
)
roi_gt_boxes = tf.gather(gt_boxes, roi_gt_box_assignment)
roi_gt_class_ids = tf.gather(gt_class_ids, roi_gt_box_assignment)
# Compute bbox refinement for positive ROIs
deltas = self.bbox_util.box_refinement_graph(positive_rois, roi_gt_boxes)
deltas /= np.array(cfg.COMMON.BBOX_STD_DEV)
# Assign positive ROIs to GT masks
# Permute masks to [N, height, width, 1]
transposed_masks = tf.expand_dims(tf.transpose(gt_masks, [2, 0, 1]), -1)
# Pick the right mask for each ROI
roi_masks = tf.gather(transposed_masks, roi_gt_box_assignment)
# Compute mask targets
boxes = positive_rois
if cfg.TRAIN.USE_MINI_MASK:
# Transform ROI coordinates from normalized image space
# to normalized mini-mask space.
y1, x1, y2, x2 = tf.split(positive_rois, 4, axis=1)
gt_y1, gt_x1, gt_y2, gt_x2 = tf.split(roi_gt_boxes, 4, axis=1)
gt_h = gt_y2 - gt_y1
gt_w = gt_x2 - gt_x1
y1 = (y1 - gt_y1) / gt_h
x1 = (x1 - gt_x1) / gt_w
y2 = (y2 - gt_y1) / gt_h
x2 = (x2 - gt_x1) / gt_w
boxes = tf.concat([y1, x1, y2, x2], 1)
box_ids = tf.range(0, tf.shape(roi_masks)[0])
masks = tf.image.crop_and_resize(tf.cast(roi_masks, tf.float32),
boxes, box_ids,
cfg.TRAIN.MASK_SHAPE)
# Remove the extra dimension from masks.
masks = tf.squeeze(masks, axis=3)
# Threshold mask pixels at 0.5 to have GT masks be 0 or 1 to use with
# binary cross entropy loss.
masks = tf.round(masks)
# Append negative ROIs and pad bbox deltas and masks that
# are not used for negative ROIs with zeros.
rois = tf.concat([positive_rois, negative_rois], axis=0)
N = tf.shape(negative_rois)[0]
P = tf.maximum(cfg.TRAIN.ROIS_PER_IMAGE - tf.shape(rois)[0], 0)
rois = tf.pad(rois, [(0, P), (0, 0)])
# roi_gt_boxes = tf.pad(roi_gt_boxes, [(0, N + P), (0, 0)])
roi_gt_class_ids = tf.pad(roi_gt_class_ids, [(0, N + P)])
deltas = tf.pad(deltas, [(0, N + P), (0, 0)])
masks = tf.pad(masks, [[0, N + P], (0, 0), (0, 0)])
return rois, roi_gt_class_ids, deltas, masks
pass
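A quick sanity check of the positive/negative ROI sampling in detection_targets_graph(). The sketch below is plain Python using assumed config values (the classic Mask R-CNN defaults of 200 ROIs per image and a 0.33 positive ratio); the project's cfg.TRAIN values may differ, but the arithmetic is the same and works out to roughly a 1:2 positive:negative split.
# Assumed config values (Mask R-CNN defaults); substitute your cfg.TRAIN values
rois_per_image = 200
roi_positive_ratio = 0.33

# Cap on positives, as in detection_targets_graph()
positive_count = int(rois_per_image * roi_positive_ratio)    # 66
# Negatives are added to keep the requested positive ratio overall
r = 1.0 / roi_positive_ratio
negative_count = int(r * positive_count) - positive_count    # 134
print(positive_count, negative_count, positive_count + negative_count)  # 66 134 200
Note that in the graph the actual positive count can be smaller when there are not enough positive proposals, in which case the negative count shrinks proportionally and the remainder is zero-padded.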
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# ============================================
# @Time : 2020/05/01 00:22
# @Author : WanDaoYi
# @FileName : mask_util.py
# ============================================
import warnings
import numpy as np
import scipy.ndimage
from utils.image_utils import ImageUtils
from pycocotools import mask as coco_mask_utils
from config import cfg
class MaskUtil(object):
def __init__(self):
self.coco_model_url = cfg.COMMON.COCO_MODEL_URL
self.image_utils = ImageUtils()
pass
# Compute the IoU overlaps between two sets of masks
def compute_overlaps_masks(self, masks1, masks2):
"""
:param masks1: [Height, Width, instances]
:param masks2: [Height, Width, instances]
:return: IoU overlaps between the two sets of masks
"""
# If either set of masks is empty, return an empty result
mask_flag = masks1.shape[-1] == 0 or masks2.shape[-1] == 0
if mask_flag:
return np.zeros((masks1.shape[-1], masks2.shape[-1]))
pass
# Flatten the masks and compute their areas
masks1 = np.reshape(masks1 > .5, (-1, masks1.shape[-1])).astype(np.float32)
masks2 = np.reshape(masks2 > .5, (-1, masks2.shape[-1])).astype(np.float32)
area1 = np.sum(masks1, axis=0)
area2 = np.sum(masks2, axis=0)
# intersections and union
intersections = np.dot(masks1.T, masks2)
union = area1[:, None] + area2[None, :] - intersections
overlaps = intersections / union
return overlaps
pass
def annotation_2_mask(self, annotation, height, width):
"""
Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
:param annotation: annotation info
:param height: image info of height
:param width: image info of width
:return: binary mask (numpy 2D array)
"""
segment = annotation['segmentation']
if isinstance(segment, list):
# polygon -- a single object might consist of multiple parts
# we merge all parts into one mask rle code
rles = coco_mask_utils.frPyObjects(segment, height, width)
rle = coco_mask_utils.merge(rles)
pass
elif isinstance(segment['counts'], list):
# uncompressed RLE
rle = coco_mask_utils.frPyObjects(segment, height, width)
pass
else:
# already in compressed RLE format
rle = segment
pass
mask = coco_mask_utils.decode(rle)
return mask
pass
def load_mask(self, data, image_id):
"""
Load instance masks for the given image.
Different datasets use different ways to store masks. This
function converts the different mask format to one format
in the form of a bitmap [height, width, instances].
:param data: The Dataset object to pick data from
:param image_id: image id of image
:return:
masks: A bool array of shape [height, width, instance count] with
one mask per instance.
class_ids: a 1D array of class IDs of the instance masks.
"""
image_info = data.image_info_list[image_id]
instance_masks = []
class_ids = []
annotations = data.image_info_list[image_id]["annotations"]
# Build mask of shape [height, width, instance_count] and list
# of class IDs that correspond to each channel of the mask.
for annotation in annotations:
class_id = data.class_from_source_map["coco.{}".format(annotation['category_id'])]
if class_id:
m = self.annotation_2_mask(annotation, image_info["height"], image_info["width"])
# Some objects are so small that they're less than 1 pixel area
# and end up rounded out. Skip those objects.
if m.max() < 1:
continue
pass
# Is it a crowd? If so, use a negative class ID.
if annotation['iscrowd']:
# Use negative class ID for crowds
class_id *= -1
# For crowd masks, annToMask() sometimes returns a mask
# smaller than the given dimensions. If so, resize it.
if m.shape[0] != image_info["height"] or m.shape[1] != image_info["width"]:
m = np.ones([image_info["height"], image_info["width"]], dtype=bool)
instance_masks.append(m)
class_ids.append(class_id)
pass
mask = np.stack(instance_masks, axis=2).astype(np.bool)
class_ids = np.array(class_ids, dtype=np.int32)
return mask, class_ids
pass
def resize_mask(self, mask, scale, padding, crop=None):
"""
resize a mask using the given scale and padding.
Typically, you get the scale and padding from resize_image() to
ensure both, the image and the mask, are resized consistently.
:param mask:
:param scale: mask scaling factor
:param padding: Padding to add to the mask in the form
[(top, bottom), (left, right), (0, 0)]
:param crop:
:return:
"""
# Suppress warning from scipy 0.13.0, the output shape of zoom() is
# calculated with round() instead of int()
with warnings.catch_warnings():
warnings.simplefilter("ignore")
mask = scipy.ndimage.zoom(mask, zoom=[scale, scale, 1], order=0)
if crop is not None:
y, x, h, w = crop
mask = mask[y:y + h, x:x + w]
else:
mask = np.pad(mask, padding, mode='constant', constant_values=0)
return mask
pass
def minimize_mask(self, bbox, mask, mini_shape):
"""
Resize masks to a smaller version to reduce memory load.
Mini-masks can be resized back to image scale using expand_masks()
:param bbox:
:param mask:
:param mini_shape:
:return:
"""
# In case a list is passed in (cfg.TRAIN.MINI_MASK_SHAPE is a list), convert it to a tuple
mini_shape = tuple(mini_shape)
mini_mask = np.zeros(mini_shape + (mask.shape[-1],), dtype=bool)
for i in range(mask.shape[-1]):
# Pick slice and cast to bool in case load_mask() returned wrong dtype
m = mask[:, :, i].astype(bool)
y1, x1, y2, x2 = bbox[i][:4]
m = m[y1:y2, x1:x2]
if m.size == 0:
raise Exception("Invalid bounding box with area of zero")
# Resize with bilinear interpolation
m = self.image_utils.resize(m, mini_shape)
mini_mask[:, :, i] = np.around(m).astype(np.bool)
return mini_mask
pass
def unmold_mask(self, mask, bbox, image_shape):
"""
Converts a mask generated by the neural network to a format similar
to its original shape.
:param mask: [height, width] of type float. A small, typically 28x28 mask.
:param bbox: [y1, x1, y2, x2]. The box to fit the mask in.
:param image_shape:
:return: return a binary mask with the same size as the original image.
"""
threshold = 0.5
y1, x1, y2, x2 = bbox
mask = self.image_utils.resize(mask, (y2 - y1, x2 - x1))
mask = np.where(mask >= threshold, 1, 0).astype(np.bool)
# Put the mask in the right location.
full_mask = np.zeros(image_shape[:2], dtype=np.bool)
full_mask[y1:y2, x1:x2] = mask
return full_mask
pass
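To see what compute_overlaps_masks() returns, here is a tiny self-contained NumPy example (written standalone, since constructing MaskUtil pulls in the project config): two 4x4 masks that share one row overlap with IoU = 4 / 12 = 1/3.
import numpy as np

# One instance per set, shape [height, width, instances]
masks1 = np.zeros((4, 4, 1), dtype=bool)
masks2 = np.zeros((4, 4, 1), dtype=bool)
masks1[0:2, :, 0] = True    # instance covers rows 0-1 (8 pixels)
masks2[1:3, :, 0] = True    # instance covers rows 1-2 (8 pixels)

# Same computation as compute_overlaps_masks()
m1 = np.reshape(masks1 > .5, (-1, masks1.shape[-1])).astype(np.float32)
m2 = np.reshape(masks2 > .5, (-1, masks2.shape[-1])).astype(np.float32)
area1 = np.sum(m1, axis=0)
area2 = np.sum(m2, axis=0)
intersections = np.dot(m1.T, m2)                        # shared row -> 4 pixels
union = area1[:, None] + area2[None, :] - intersections
print(intersections / union)                            # [[0.3333]]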
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# ============================================
# @Time : 2020/05/01 00:13
# @Author : WanDaoYi
# @FileName : bbox_utils.py
# ============================================
import numpy as np
import tensorflow as tf
from utils.image_utils import ImageUtils
from utils.mask_util import MaskUtil
from config import cfg
class BboxUtil(object):
def __init__(self):
self.image_utils = ImageUtils()
self.mask_util = MaskUtil()
pass
# Extract bounding boxes from masks
def extract_bboxes(self, mask):
"""
:param mask: [height, width, num_instances]. Mask pixels are either 1 or 0.
:return: bbox array [num_instances, (y1, x1, y2, x2)]
"""
# Number of instances regardless of class: only foreground/background is distinguished here; classes are handled by the detection head
num_instance = mask.shape[-1]
# Initialize the boxes array
boxes = np.zeros([num_instance, 4], dtype=np.int32)
for i in range(num_instance):
m = mask[:, :, i]
# bounding box
# Along the x axis
horizontal_indicies = np.where(np.any(m, axis=0))[0]
# Along the y axis
vertical_indicies = np.where(np.any(m, axis=1))[0]
if horizontal_indicies.shape[0]:
x1, x2 = horizontal_indicies[[0, -1]]
y1, y2 = vertical_indicies[[0, -1]]
# x2 and y2 should not be part of the box. Increment by 1.
# i.e. x2 and y2 are not included in the box: with x1 = 1, x2 = 5, y1 = 1, y2 = 5,
# the enclosed area does not contain (5, 5), so add 1 to push the bottom-right corner just outside the mask
x2 += 1
y2 += 1
pass
else:
# No mask for this instance. Might happen due to
# resizing or cropping. Set bbox to zeros
x1, x2, y1, y2 = 0, 0, 0, 0
pass
boxes[i] = np.array([y1, x1, y2, x2])
pass
return boxes.astype(np.int32)
pass
# Compute IoU of one box against an array of boxes
def compute_iou(self, box, boxes):
"""
:param box: (y1, x1, y2, x2)
:param boxes: [N, (y1, x1, y2, x2)]
:return: iou
"""
# Compute box areas
# area = (x2 - x1) * (y2 - y1)
box_area = (box[3] - box[1]) * (box[2] - box[0])
boxes_area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
# Compute intersection areas
x1 = np.maximum(box[1], boxes[:, 1])
x2 = np.minimum(box[3], boxes[:, 3])
y1 = np.maximum(box[0], boxes[:, 0])
y2 = np.minimum(box[2], boxes[:, 2])
intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
# Compute IoU
union = box_area + boxes_area[:] - intersection[:]
# iou = np.maximum(1.0 * inter_area / union_area, np.finfo(np.float32).eps)
iou = intersection / union
return iou
pass
# Compute IoU overlaps between two sets of boxes
def compute_overlaps(self, boxes1, boxes2):
"""
:param boxes1: [N, (y1, x1, y2, x2)]
:param boxes2: [N, (y1, x1, y2, x2)]
:return:
"""
# Allocate the overlaps matrix
overlaps = np.zeros((boxes1.shape[0], boxes2.shape[0]))
for i in range(overlaps.shape[1]):
box2 = boxes2[i]
overlaps[:, i] = self.compute_iou(box2, boxes1)
pass
return overlaps
pass
def overlaps_graph(self, boxes1, boxes2):
"""
Computes IoU overlaps between two sets of boxes.
:param boxes1: [N, (y1, x1, y2, x2)].
:param boxes2: [N, (y1, x1, y2, x2)].
:return:
"""
# 1. Tile boxes2 and repeat boxes1. This allows us to compare
# every boxes1 against every boxes2 without loops.
# TF doesn't have an equivalent to np.repeat() so simulate it
# using tf.tile() and tf.reshape.
b1 = tf.reshape(tf.tile(tf.expand_dims(boxes1, 1),
[1, 1, tf.shape(boxes2)[0]]), [-1, 4])
b2 = tf.tile(boxes2, [tf.shape(boxes1)[0], 1])
# 2. Compute intersections
b1_y1, b1_x1, b1_y2, b1_x2 = tf.split(b1, 4, axis=1)
b2_y1, b2_x1, b2_y2, b2_x2 = tf.split(b2, 4, axis=1)
y1 = tf.maximum(b1_y1, b2_y1)
x1 = tf.maximum(b1_x1, b2_x1)
y2 = tf.minimum(b1_y2, b2_y2)
x2 = tf.minimum(b1_x2, b2_x2)
intersection = tf.maximum(x2 - x1, 0) * tf.maximum(y2 - y1, 0)
# 3. Compute unions
b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)
b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)
union = b1_area + b2_area - intersection
# 4. Compute IoU and reshape to [boxes1, boxes2]
iou = intersection / union
overlaps = tf.reshape(iou, [tf.shape(boxes1)[0], tf.shape(boxes2)[0]])
return overlaps
pass
# Non-maximum suppression (NMS)
def non_max_suppression(self, boxes, scores, threshold):
"""
:param boxes: [N, (y1, x1, y2, x2)]. Note: (y2, x2) lies outside the box
:param scores: 1D array of box scores
:param threshold: IoU threshold used for suppression
:return:
"""
assert boxes.shape[0] > 0
if boxes.dtype.kind != "f":
boxes = boxes.astype(np.float32)
pass
# Get indices of boxes sorted by scores (highest first)
ixs = scores.argsort()[::-1]
pick = []
while len(ixs) > 0:
# Pick top box and add its index to the list
i = ixs[0]
pick.append(i)
# Compute IoU of the picked box with the rest
iou = self.compute_iou(boxes[i], boxes[ixs[1:]])
# Identify boxes with IoU over the threshold. This
# returns indices into ixs[1:], so add 1 to get
# indices into ixs.
remove_ixs = np.where(iou > threshold)[0] + 1
# Remove indices of the picked and overlapped boxes.
ixs = np.delete(ixs, remove_ixs)
ixs = np.delete(ixs, 0)
return np.array(pick, dtype=np.int32)
pass
# Box transform for bounding box regression:
# tx = (x - xa) / wa, ty = (y - ya) / ha,
# tw = log(w / wa), th = log(h / ha)
def apply_box_deltas(self, boxes, deltas):
"""
:param boxes: [N, (y1, x1, y2, x2)]. Note: (y2, x2) lies outside the box
:param deltas: [N, (dy, dx, log(dh), log(dw))]
:return:
"""
boxes = boxes.astype(np.float32)
# Convert to y, x, h, w
height = boxes[:, 2] - boxes[:, 0]
width = boxes[:, 3] - boxes[:, 1]
center_y = boxes[:, 0] + 0.5 * height
center_x = boxes[:, 1] + 0.5 * width
# Apply deltas
center_y += deltas[:, 0] * height
center_x += deltas[:, 1] * width
height *= np.exp(deltas[:, 2])
width *= np.exp(deltas[:, 3])
# Convert back to y1, x1, y2, x2
y1 = center_y - 0.5 * height
x1 = center_x - 0.5 * width
y2 = y1 + height
x2 = x1 + width
return np.stack([y1, x1, y2, x2], axis=1)
pass
# TF graph version of box-to-ground-truth refinement (bounding box regression)
# See the bounding box regression formulas above
def box_refinement_graph(self, box, gt_box):
"""
:param box: [N, (y1, x1, y2, x2)]
:param gt_box: [N, (y1, x1, y2, x2)]
:return:
"""
box = tf.cast(box, tf.float32)
gt_box = tf.cast(gt_box, tf.float32)
height = box[:, 2] - box[:, 0]
width = box[:, 3] - box[:, 1]
center_y = box[:, 0] + 0.5 * height
center_x = box[:, 1] + 0.5 * width
gt_height = gt_box[:, 2] - gt_box[:, 0]
gt_width = gt_box[:, 3] - gt_box[:, 1]
gt_center_y = gt_box[:, 0] + 0.5 * gt_height
gt_center_x = gt_box[:, 1] + 0.5 * gt_width
dy = (gt_center_y - center_y) / height
dx = (gt_center_x - center_x) / width
dh = tf.log(gt_height / height)
dw = tf.log(gt_width / width)
result = tf.stack([dy, dx, dh, dw], axis=1)
return result
pass
# NumPy version of box-to-ground-truth refinement (bounding box regression)
# See the bounding box regression formulas above
def box_refinement(self, box, gt_box):
"""
:param box: [N, (y1, x1, y2, x2)], assuming (y2, x2) lies outside the box
:param gt_box: [N, (y1, x1, y2, x2)]
:return:
"""
box = box.astype(np.float32)
gt_box = gt_box.astype(np.float32)
height = box[:, 2] - box[:, 0]
width = box[:, 3] - box[:, 1]
center_y = box[:, 0] + 0.5 * height
center_x = box[:, 1] + 0.5 * width
gt_height = gt_box[:, 2] - gt_box[:, 0]
gt_width = gt_box[:, 3] - gt_box[:, 1]
gt_center_y = gt_box[:, 0] + 0.5 * gt_height
gt_center_x = gt_box[:, 1] + 0.5 * gt_width
dy = (gt_center_y - center_y) / height
dx = (gt_center_x - center_x) / width
dh = np.log(gt_height / height)
dw = np.log(gt_width / width)
return np.stack([dy, dx, dh, dw], axis=1)
pass
# Convert boxes from pixel coordinates to normalized coordinates
def norm_boxes_graph(self, boxes, shape):
"""
:param boxes: [..., (y1, x1, y2, x2)] in pixel coordinates
:param shape: [..., (height, width)] in pixels
:return: [..., (y1, x1, y2, x2)] in normalized coordinates
Note: in pixel coordinates (y2, x2) is outside the box, but in normalized coordinates it is inside the box.
"""
h, w = tf.split(tf.cast(shape, tf.float32), 2)
scale = tf.concat([h, w, h, w], axis=-1) - tf.constant(1.0)
shift = tf.constant([0., 0., 1., 1.])
return tf.divide(boxes - shift, scale)
pass
def norm_boxes(self, boxes, shape):
"""
Converts boxes from pixel coordinates to normalized coordinates.
:param boxes: [N, (y1, x1, y2, x2)] in pixel coordinates
:param shape: [..., (height, width)] in pixels
:return: [N, (y1, x1, y2, x2)] in normalized coordinates
Note: In pixel coordinates (y2, x2) is outside the box.
But in normalized coordinates it's inside the box.
"""
h, w = shape
scale = np.array([h - 1, w - 1, h - 1, w - 1])
shift = np.array([0, 0, 1, 1])
return np.divide((boxes - shift), scale).astype(np.float32)
pass
def denorm_boxes(self, boxes, shape):
"""
Converts boxes from normalized coordinates to pixel coordinates.
:param boxes: [N, (y1, x1, y2, x2)] in normalized coordinates
:param shape: [..., (height, width)] in pixels
:return: [N, (y1, x1, y2, x2)] in pixel coordinates
Note: In pixel coordinates (y2, x2) is outside the box. But in normalized
coordinates it's inside the box.
"""
h, w = shape
scale = np.array([h - 1, w - 1, h - 1, w - 1])
shift = np.array([0, 0, 1, 1])
return np.around(np.multiply(boxes, scale) + shift).astype(np.int32)
pass
def apply_box_deltas_graph(self, boxes, deltas):
"""
Applies the given deltas to the given boxes.
:param boxes: [N, (y1, x1, y2, x2)] boxes to update
:param deltas: [N, (dy, dx, log(dh), log(dw))] refinements to apply
:return:
"""
# Convert to y, x, h, w
height = boxes[:, 2] - boxes[:, 0]
width = boxes[:, 3] - boxes[:, 1]
center_y = boxes[:, 0] + 0.5 * height
center_x = boxes[:, 1] + 0.5 * width
# Apply deltas
center_y += deltas[:, 0] * height
center_x += deltas[:, 1] * width
height *= tf.exp(deltas[:, 2])
width *= tf.exp(deltas[:, 3])
# Convert back to y1, x1, y2, x2
y1 = center_y - 0.5 * height
x1 = center_x - 0.5 * width
y2 = y1 + height
x2 = x1 + width
result = tf.stack([y1, x1, y2, x2], axis=1, name="apply_box_deltas_out")
return result
pass
def clip_boxes_graph(self, boxes, window):
"""
:param boxes: [N, (y1, x1, y2, x2)]
:param window: [4] in the form y1, x1, y2, x2
:return:
"""
# Split
wy1, wx1, wy2, wx2 = tf.split(window, 4)
y1, x1, y2, x2 = tf.split(boxes, 4, axis=1)
# Clip
y1 = tf.maximum(tf.minimum(y1, wy2), wy1)
x1 = tf.maximum(tf.minimum(x1, wx2), wx1)
y2 = tf.maximum(tf.minimum(y2, wy2), wy1)
x2 = tf.maximum(tf.minimum(x2, wx2), wx1)
clipped = tf.concat([y1, x1, y2, x2], axis=1, name="clipped_boxes")
clipped.set_shape((clipped.shape[0], 4))
return clipped
pass
def load_image_gt(self, data, image_id, augmentation=None, use_mini_mask=False):
"""
Load and return ground truth data for an image (image, mask, bounding boxes).
:param data: The Dataset object to pick data from
:param image_id: GT bounding boxes and masks for image id.
:param augmentation: Optional. An imgaug (https://github.com/aleju/imgaug) augmentation.
For example, passing imgaug.augmenters.Fliplr(0.5) flips images
right/left 50% of the time.
:param use_mini_mask: If False, returns full-size masks that are the same height
and width as the original image. These can be big, for example
1024x1024x100 (for 100 instances). Mini masks are smaller, typically,
224x224 and are generated by extracting the bounding box of the
object and resizing it to MINI_MASK_SHAPE.
:return:
image: [height, width, 3]
shape: the original shape of the image before resizing and cropping.
class_ids: [instance_count] Integer class IDs
bbox: [instance_count, (y1, x1, y2, x2)]
mask: [height, width, instance_count]. The height and width are those
of the image unless use_mini_mask is True, in which case they are
defined in MINI_MASK_SHAPE.
"""
# Load image and mask
image_path = data.image_info_list[image_id]["path"]
image = self.image_utils.load_image(image_path)
mask, class_ids = self.mask_util.load_mask(data, image_id)
original_shape = image.shape
image, window, scale, padding, crop = self.image_utils.resize_image(image,
min_dim=cfg.COMMON.IMAGE_MIN_DIM,
min_scale=cfg.COMMON.IMAGE_MIN_SCALE,
max_dim=cfg.COMMON.IMAGE_MAX_DIM,
resize_mode=cfg.COMMON.IMAGE_RESIZE_MODE)
mask = self.mask_util.resize_mask(mask, scale, padding, crop)
# Augmentation
# This requires the imgaug lib (https://github.com/aleju/imgaug)
if augmentation:
import imgaug
def hook(images, augmenter, parents, default):
"""Determines which augmenters to apply to masks."""
return augmenter.__class__.__name__ in cfg.TRAIN.MASK_AUGMENTERS
# Store shapes before augmentation to compare
image_shape = image.shape
mask_shape = mask.shape
# Make augmenters deterministic to apply similarly to images and masks
det = augmentation.to_deterministic()
image = det.augment_image(image)
# Change mask to np.uint8 because imgaug doesn't support np.bool
mask = det.augment_image(mask.astype(np.uint8), hooks=imgaug.HooksImages(activator=hook))
# Verify that shapes didn't change
assert image.shape == image_shape, "Augmentation shouldn't change image size"
assert mask.shape == mask_shape, "Augmentation shouldn't change mask size"
# Change mask back to bool
mask = mask.astype(np.bool)
pass
# Note that some boxes might be all zeros if the corresponding mask got cropped out.
# and here is to filter them out
_idx = np.sum(mask, axis=(0, 1)) > 0
mask = mask[:, :, _idx]
class_ids = class_ids[_idx]
# Bounding boxes. Note that some boxes might be all zeros
# if the corresponding mask got cropped out.
# bbox: [num_instances, (y1, x1, y2, x2)]
bbox = self.extract_bboxes(mask)
# Active classes
# Different datasets have different classes, so track the
# classes supported in the dataset of this image.
active_class_ids = np.zeros([data.class_num], dtype=np.int32)
source_class_ids = data.source_class_ids[data.image_info_list[image_id]["source"]]
active_class_ids[source_class_ids] = 1
# Resize masks to smaller size to reduce memory usage
if use_mini_mask:
mask = self.mask_util.minimize_mask(bbox, mask, cfg.TRAIN.MINI_MASK_SHAPE)
# Image meta data
image_meta = self.image_utils.compose_image_meta(image_id, original_shape, image.shape,
window, scale, active_class_ids)
return image, image_meta, class_ids, bbox, mask
pass
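A worked example of the box-delta arithmetic used by apply_box_deltas() / apply_box_deltas_graph(). The sketch re-implements the same formulas in plain NumPy so it runs without the project's config: a 10x10 box shifted by 10% of its size and scaled by 2 in each dimension becomes a 20x20 box centred at (6, 6).
import numpy as np

boxes = np.array([[0.0, 0.0, 10.0, 10.0]])                  # [N, (y1, x1, y2, x2)]
deltas = np.array([[0.1, 0.1, np.log(2.0), np.log(2.0)]])   # [N, (dy, dx, log(dh), log(dw))]

# Same steps as apply_box_deltas(): convert to centre/size, apply deltas, convert back
height = boxes[:, 2] - boxes[:, 0]
width = boxes[:, 3] - boxes[:, 1]
center_y = boxes[:, 0] + 0.5 * height + deltas[:, 0] * height
center_x = boxes[:, 1] + 0.5 * width + deltas[:, 1] * width
height = height * np.exp(deltas[:, 2])
width = width * np.exp(deltas[:, 3])
y1 = center_y - 0.5 * height
x1 = center_x - 0.5 * width
print(np.stack([y1, x1, y1 + height, x1 + width], axis=1))  # [[-4. -4. 16. 16.]]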
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# ============================================
# @Time : 2020/05/13 12:08
# @Author : WanDaoYi
# @FileName : anchor_utils.py
# ============================================
import numpy as np
from utils.misc_utils import MiscUtils
from utils.bbox_utils import BboxUtil
from config import cfg
class AnchorUtils(object):
def __init__(self):
self.misc_utils = MiscUtils()
self.bbox_utils = BboxUtil()
# Cache anchors and reuse if image shape is the same
self._anchor_cache = {}
# self.anchors = None
pass
def get_anchors(self, image_shape):
"""
:return: Returns anchor pyramid for the given image size
"""
if tuple(image_shape) not in self._anchor_cache:
# Generate Anchors
anchor = self.generate_pyramid_anchors(image_shape)
# Keep a copy of the latest anchors in pixel coordinates because
# it's used in inspect_model notebooks.
# TODO: Remove this after the notebooks are refactored to not use it
# self.anchors = anchor
self._anchor_cache[tuple(image_shape)] = self.bbox_utils.norm_boxes(anchor, image_shape[:2])
pass
return self._anchor_cache[tuple(image_shape)]
pass
def generate_pyramid_anchors(self, image_shape):
"""
Generate anchors at different levels of a feature pyramid.
Each scale is associated with a level of the pyramid,
but each ratio is used in all levels of the pyramid.
:param image_shape: [h, w, c]
:return: anchors: [N, (y1, x1, y2, x2)]
All generated anchors in one array.
Sorted with the same order of the given scales.
So, anchors of scale[0] come first, then anchors of scale[1], and so on.
"""
backbone_strides = cfg.COMMON.BACKBONE_STRIDES
# [N, (height, width)]. Where N is the number of stages
backbone_shape = self.misc_utils.compute_backbone_shapes(image_shape, backbone_strides)
# Anchors
# [anchor_count, (y1, x1, y2, x2)]
anchors = []
scales = cfg.COMMON.RPN_ANCHOR_SCALES
scales_len = len(scales)
for i in range(scales_len):
anchor_box = self.generate_anchors(scales[i], backbone_shape[i], backbone_strides[i])
anchors.append(anchor_box)
pass
return np.concatenate(anchors, axis=0)
pass
# generate anchor box
def generate_anchors(self, scales, backbone_shape, backbone_strides):
"""
:param scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
:param backbone_shape: [height, width] spatial shape of the feature map over which to generate anchors.
:param backbone_strides: Stride of the feature map relative to the image in pixels.
:return: anchor boxes in corner coordinates (y1, x1, y2, x2)
"""
# 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
ratios = cfg.COMMON.RPN_ANCHOR_RATIOS
# Stride of anchors on the feature map. For example,
# if the value is 2 then generate anchors for every other feature map pixel.
anchor_stride = cfg.COMMON.RPN_ANCHOR_STRIDE
# Get all combinations of scales and ratios
scales, ratios = np.meshgrid(np.array(scales), np.array(ratios))
scales = scales.flatten()
ratios = ratios.flatten()
# Enumerate heights and widths from scales and ratios
heights = scales / np.sqrt(ratios)
widths = scales * np.sqrt(ratios)
# Enumerate shifts in feature space
shifts_y = np.arange(0, backbone_shape[0], anchor_stride) * backbone_strides
shifts_x = np.arange(0, backbone_shape[1], anchor_stride) * backbone_strides
shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)
# Enumerate combinations of shifts, widths, and heights
box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
box_heights, box_centers_y = np.meshgrid(heights, shifts_y)
# Reshape to get a list of (y, x) and a list of (h, w)
box_centers = np.stack([box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])
# Convert to corner coordinates (y1, x1, y2, x2)
boxes = np.concatenate([box_centers - 0.5 * box_sizes, box_centers + 0.5 * box_sizes], axis=1)
return boxes
pass
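As a rough check on generate_pyramid_anchors(), the total anchor count depends only on the feature map sizes, the anchor stride, and the number of ratios. The sketch below assumes the common Mask R-CNN defaults (a 1024x1024 input, BACKBONE_STRIDES = [4, 8, 16, 32, 64], one scale per pyramid level, 3 ratios, RPN_ANCHOR_STRIDE = 1); your cfg.COMMON values may differ.
import math

image_shape = (1024, 1024, 3)          # assumed input size
backbone_strides = [4, 8, 16, 32, 64]  # assumed cfg.COMMON.BACKBONE_STRIDES
ratios_per_location = 3                # assumed len(cfg.COMMON.RPN_ANCHOR_RATIOS)
anchor_stride = 1                      # assumed cfg.COMMON.RPN_ANCHOR_STRIDE

total = 0
for stride in backbone_strides:
    h = math.ceil(image_shape[0] / stride)   # feature map height, as in compute_backbone_shapes()
    w = math.ceil(image_shape[1] / stride)   # feature map width
    locations = math.ceil(h / anchor_stride) * math.ceil(w / anchor_stride)
    total += locations * ratios_per_location # one scale per pyramid level
print(total)                                  # 261888 anchors before any filtering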
2. Common files
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# ============================================
# @Time : 2020/05/12 15:40
# @Author : WanDaoYi
# @FileName : common.py
# ============================================
import numpy as np
import tensorflow as tf
import keras.backend as K
import keras.layers as KL
import keras.models as KM
import keras.engine as KE
from utils.misc_utils import MiscUtils
from utils.bbox_utils import BboxUtil
from utils.image_utils import ImageUtils
from config import cfg
def log2_graph(x):
"""
Implementation of Log2. TF doesn't have a native implementation.
"""
return tf.log(x) / tf.log(2.0)
def conv_block(input_tensor, kernel_size, filters, stage, block,
strides=(2, 2), use_bias=True, train_flag=True):
"""
conv_block is the block that has a conv layer at shortcut
:param input_tensor: input tensor
:param kernel_size: default 3, the kernel size of middle conv layer at main path
:param filters: list of integers, the nb_filters of 3 conv layer at main path
:param stage: integer, current stage label, used for generating layer names
:param block: 'a','b'..., current block label, used for generating layer names
:param strides:
:param use_bias: Boolean. To use or not use a bias in conv layers.
:param train_flag: Boolean. Train or freeze Batch Norm layers
:return:
Note that from stage 3, the first conv layer at main path uses strides=(2, 2),
and the shortcut uses strides=(2, 2) as well
"""
nb_filter1, nb_filter2, nb_filter3 = filters
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = KL.Conv2D(nb_filter1, (1, 1), strides=strides,
name=conv_name_base + '2a', use_bias=use_bias)(input_tensor)
x = BatchNorm(name=bn_name_base + '2a')(x, training=train_flag)
x = KL.Activation('relu')(x)
x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same',
name=conv_name_base + '2b', use_bias=use_bias)(x)
x = BatchNorm(name=bn_name_base + '2b')(x, training=train_flag)
x = KL.Activation('relu')(x)
x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c', use_bias=use_bias)(x)
x = BatchNorm(name=bn_name_base + '2c')(x, training=train_flag)
shortcut = KL.Conv2D(nb_filter3, (1, 1), strides=strides,
name=conv_name_base + '1', use_bias=use_bias)(input_tensor)
shortcut = BatchNorm(name=bn_name_base + '1')(shortcut, training=train_flag)
x = KL.Add()([x, shortcut])
x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x)
return x
def identity_block(input_tensor, kernel_size, filters, stage, block,
use_bias=True, train_flag=True):
"""
The identity_block is the block that has no conv layer at shortcut
:param input_tensor: input tensor
:param kernel_size: default 3, the kernel size of middle conv layer at main path
:param filters: list of integers, the nb_filters of 3 conv layer at main path
:param stage: integer, current stage label, used for generating layer names
:param block: 'a','b'..., current block label, used for generating layer names
:param use_bias: Boolean. To use or not use a bias in conv layers.
:param train_flag: Boolean. Train or freeze Batch Norm layers
:return:
"""
nb_filter1, nb_filter2, nb_filter3 = filters
conv_name_base = 'res' + str(stage) + block + '_branch'
bn_name_base = 'bn' + str(stage) + block + '_branch'
x = KL.Conv2D(nb_filter1, (1, 1), name=conv_name_base + '2a',
use_bias=use_bias)(input_tensor)
x = BatchNorm(name=bn_name_base + '2a')(x, training=train_flag)
x = KL.Activation('relu')(x)
x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding='same',
name=conv_name_base + '2b', use_bias=use_bias)(x)
x = BatchNorm(name=bn_name_base + '2b')(x, training=train_flag)
x = KL.Activation('relu')(x)
x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + '2c',
use_bias=use_bias)(x)
x = BatchNorm(name=bn_name_base + '2c')(x, training=train_flag)
x = KL.Add()([x, input_tensor])
x = KL.Activation('relu', name='res' + str(stage) + block + '_out')(x)
return x
def build_fpn_mask_graph(rois, feature_maps, image_meta,
pool_size, class_num, train_flag=True):
"""
Builds the computation graph of the mask head of Feature Pyramid Network.
:param rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized coordinates.
:param feature_maps: List of feature maps from different layers of the pyramid,
[P2, P3, P4, P5]. Each has a different resolution.
:param image_meta: [batch, (meta data)] Image details. See compose_image_meta()
:param pool_size: The width of the square feature map generated from ROI Pooling.
:param class_num: number of classes, which determines the depth of the results
:param train_flag: Boolean. Train or freeze Batch Norm layers
:return: Masks [batch, num_rois, MASK_POOL_SIZE, MASK_POOL_SIZE, NUM_CLASSES]
"""
# ROI Pooling
# Shape: [batch, num_rois, MASK_POOL_SIZE, MASK_POOL_SIZE, channels]
x = PyramidROIAlign([pool_size, pool_size], name="roi_align_mask")([rois, image_meta] + feature_maps)
# Conv layers
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), name="mrcnn_mask_conv1")(x)
x = KL.TimeDistributed(BatchNorm(), name='mrcnn_mask_bn1')(x, training=train_flag)
x = KL.Activation('relu')(x)
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), name="mrcnn_mask_conv2")(x)
x = KL.TimeDistributed(BatchNorm(), name='mrcnn_mask_bn2')(x, training=train_flag)
x = KL.Activation('relu')(x)
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), name="mrcnn_mask_conv3")(x)
x = KL.TimeDistributed(BatchNorm(), name='mrcnn_mask_bn3')(x, training=train_flag)
x = KL.Activation('relu')(x)
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"), name="mrcnn_mask_conv4")(x)
x = KL.TimeDistributed(BatchNorm(), name='mrcnn_mask_bn4')(x, training=train_flag)
x = KL.Activation('relu')(x)
x = KL.TimeDistributed(KL.Conv2DTranspose(256, (2, 2), strides=2, activation="relu"), name="mrcnn_mask_deconv")(x)
x = KL.TimeDistributed(KL.Conv2D(class_num, (1, 1), strides=1, activation="sigmoid"), name="mrcnn_mask")(x)
return x
def rpn_graph(feature_map, anchors_per_location, rpn_anchor_stride):
"""
Builds the computation graph of Region Proposal Network.
:param feature_map: backbone features [batch, height, width, depth]
:param anchors_per_location: number of anchors per pixel in the feature map
:param rpn_anchor_stride: Controls the density of anchors.
Typically 1 (anchors for every pixel in the feature map),
or 2 (every other pixel).
:return:
rpn_class_logits: [batch, H * W * anchors_per_location, 2] Anchor classifier logits (before softmax)
rpn_probs: [batch, H * W * anchors_per_location, 2] Anchor classifier probabilities.
rpn_bbox: [batch, H * W * anchors_per_location, (dy, dx, log(dh), log(dw))] Deltas to be
applied to anchors.
"""
# TODO: check if stride of 2 causes alignment issues if the feature map
# is not even.
# Shared convolutional base of the RPN
shared = KL.Conv2D(512, (3, 3), padding='same', activation='relu',
strides=rpn_anchor_stride, name='rpn_conv_shared')(feature_map)
# Anchor Score. [batch, height, width, anchors per location * 2].
x = KL.Conv2D(2 * anchors_per_location, (1, 1), padding='valid',
activation='linear', name='rpn_class_raw')(shared)
# Reshape to [batch, anchors, 2]
rpn_class_logits = KL.Lambda(
lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 2]))(x)
# Softmax on last dimension of BG/FG.
rpn_probs = KL.Activation("softmax", name="rpn_class_xxx")(rpn_class_logits)
# Bounding box refinement. [batch, H, W, anchors per location * depth]
# where depth is [x, y, log(w), log(h)]
x = KL.Conv2D(anchors_per_location * 4, (1, 1), padding="valid",
activation='linear', name='rpn_bbox_pred')(shared)
# Reshape to [batch, anchors, 4]
rpn_bbox = KL.Lambda(lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 4]))(x)
return [rpn_class_logits, rpn_probs, rpn_bbox]
pass
def build_rpn_model(rpn_anchor_stride, anchors_per_location, depth):
"""
Builds a Keras model of the Region Proposal Network.
It wraps the RPN graph so it can be used multiple times with shared weights.
:param rpn_anchor_stride: Controls the density of anchors.
Typically 1 (anchors for every pixel in the feature map),
or 2 (every other pixel).
:param anchors_per_location: number of anchors per pixel in the feature map
:param depth: Depth of the backbone feature map.
:return: Returns a Keras Model object. The model outputs, when called, are:
rpn_class_logits: [batch, H * W * anchors_per_location, 2] Anchor classifier logits (before softmax)
rpn_probs: [batch, H * W * anchors_per_location, 2] Anchor classifier probabilities.
rpn_bbox: [batch, H * W * anchors_per_location, (dy, dx, log(dh),
log(dw))] Deltas to be applied to anchors.
"""
input_feature_map = KL.Input(shape=[None, None, depth], name="input_rpn_feature_map")
outputs = rpn_graph(input_feature_map, anchors_per_location, rpn_anchor_stride)
return KM.Model([input_feature_map], outputs, name="rpn_model")
pass
def fpn_classifier_graph(rois, feature_maps, image_meta, pool_size,
class_num, train_flag=True, fc_layers_size=1024):
"""
Builds the computation graph of the feature pyramid network classifier
and regressor heads.
:param rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized coordinates.
:param feature_maps: List of feature maps from different layers of the pyramid,
[P2, P3, P4, P5]. Each has a different resolution.
:param image_meta: [batch, (meta data)] Image details. See compose_image_meta()
:param class_num: number of classes, which determines the depth of the results
:param pool_size: The width of the square feature map generated from ROI Pooling.
:param train_flag: Boolean. Train or freeze Batch Norm layers
:param fc_layers_size: Size of the 2 FC layers
:return:
logits: [batch, num_rois, NUM_CLASSES] classifier logits (before softmax)
probs: [batch, num_rois, NUM_CLASSES] classifier probabilities
bbox_deltas: [batch, num_rois, NUM_CLASSES, (dy, dx, log(dh), log(dw))] Deltas to apply to
proposal boxes
"""
# ROI Pooling
# Shape: [batch, num_rois, POOL_SIZE, POOL_SIZE, channels]
x = PyramidROIAlign([pool_size, pool_size],
name="roi_align_classifier")([rois, image_meta] + feature_maps)
# Two 1024 FC layers (implemented with Conv2D for consistency)
x = KL.TimeDistributed(KL.Conv2D(fc_layers_size, (pool_size, pool_size), padding="valid"),
name="mrcnn_class_conv1")(x)
x = KL.TimeDistributed(BatchNorm(), name='mrcnn_class_bn1')(x, training=train_flag)
x = KL.Activation('relu')(x)
x = KL.TimeDistributed(KL.Conv2D(fc_layers_size, (1, 1)),
name="mrcnn_class_conv2")(x)
x = KL.TimeDistributed(BatchNorm(), name='mrcnn_class_bn2')(x, training=train_flag)
x = KL.Activation('relu')(x)
shared = KL.Lambda(lambda x: K.squeeze(K.squeeze(x, 3), 2),
name="pool_squeeze")(x)
# Classifier head
mrcnn_class_logits = KL.TimeDistributed(KL.Dense(class_num),
name='mrcnn_class_logits')(shared)
mrcnn_probs = KL.TimeDistributed(KL.Activation("softmax"),
name="mrcnn_class")(mrcnn_class_logits)
# BBox head
# [batch, num_rois, NUM_CLASSES * (dy, dx, log(dh), log(dw))]
x = KL.TimeDistributed(KL.Dense(class_num * 4, activation='linear'),
name='mrcnn_bbox_fc')(shared)
# Reshape to [batch, num_rois, NUM_CLASSES, (dy, dx, log(dh), log(dw))]
s = K.int_shape(x)
mrcnn_bbox = KL.Reshape((s[1], class_num, 4), name="mrcnn_bbox")(x)
return mrcnn_class_logits, mrcnn_probs, mrcnn_bbox
pass
def build_rpn_targets(anchors, gt_class_ids, gt_boxes):
"""
Given the anchors and GT boxes, compute overlaps and identify positive
anchors and deltas to refine them to match their corresponding GT boxes.
:param anchors: [num_anchors, (y1, x1, y2, x2)]
:param gt_class_ids: [num_gt_boxes] Integer class IDs.
:param gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]
:return:
rpn_match: [N] (int32) matches between anchors and GT boxes.
1 = positive anchor, -1 = negative anchor, 0 = neutral
rpn_bbox: [N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
"""
# RPN Match: 1 = positive anchor, -1 = negative anchor, 0 = neutral
rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32)
anchor_per_image = cfg.TRAIN.ANCHORS_PER_IMAGE
# RPN bounding boxes: [max anchors per image, (dy, dx, log(dh), log(dw))]
rpn_bbox = np.zeros((anchor_per_image, 4))
bbox_util = BboxUtil()
# Handle COCO crowds
# A crowd box in COCO is a bounding box around several instances. Exclude
# them from training. A crowd box is given a negative class ID.
crowd_ix = np.where(gt_class_ids < 0)[0]
if crowd_ix.shape[0] > 0:
# Filter out crowds from ground truth class IDs and boxes
non_crowd_ix = np.where(gt_class_ids > 0)[0]
crowd_boxes = gt_boxes[crowd_ix]
gt_class_ids = gt_class_ids[non_crowd_ix]
gt_boxes = gt_boxes[non_crowd_ix]
# Compute overlaps with crowd boxes [anchors, crowds]
crowd_overlaps = bbox_util.compute_overlaps(anchors, crowd_boxes)
crowd_iou_max = np.amax(crowd_overlaps, axis=1)
no_crowd_bool = (crowd_iou_max < 0.001)
pass
else:
# All anchors don't intersect a crowd
no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool)
pass
# Compute overlaps [num_anchors, num_gt_boxes]
overlaps = bbox_util.compute_overlaps(anchors, gt_boxes)
# Match anchors to GT Boxes
# If an anchor overlaps a GT box with IoU >= 0.7 then it's positive.
# If an anchor overlaps a GT box with IoU < 0.3 then it's negative.
# Neutral anchors are those that don't match the conditions above,
# and they don't influence the loss function.
# However, don't keep any GT box unmatched (rare, but happens). Instead,
# match it to the closest anchor (even if its max IoU is < 0.3).
#
# 1. Set negative anchors first. They get overwritten below if a GT box is
# matched to them. Skip boxes in crowd areas.
anchor_iou_argmax = np.argmax(overlaps, axis=1)
anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]
rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1
# 2. Set an anchor for each GT box (regardless of IoU value).
# If multiple anchors have the same IoU match all of them
gt_iou_argmax = np.argwhere(overlaps == np.max(overlaps, axis=0))[:, 0]
rpn_match[gt_iou_argmax] = 1
# 3. Set anchors with high overlap as positive.
rpn_match[anchor_iou_max >= 0.7] = 1
# Subsample to balance positive and negative anchors
# Don't let positives be more than half the anchors
ids = np.where(rpn_match == 1)[0]
extra = len(ids) - (anchor_per_image // 2)
if extra > 0:
# Reset the extra ones to neutral
ids = np.random.choice(ids, extra, replace=False)
rpn_match[ids] = 0
# Same for negative proposals
ids = np.where(rpn_match == -1)[0]
extra = len(ids) - (anchor_per_image - np.sum(rpn_match == 1))
if extra > 0:
# Reset the extra ones to neutral
ids = np.random.choice(ids, extra, replace=False)
rpn_match[ids] = 0
pass
# For positive anchors, compute shift and scale needed to transform them
# to match the corresponding GT boxes.
ids = np.where(rpn_match == 1)[0]
ix = 0 # index into rpn_bbox
# TODO: use box_refinement() rather than duplicating the code here
for i, a in zip(ids, anchors[ids]):
# Closest gt box (it might have IoU < 0.7)
gt = gt_boxes[anchor_iou_argmax[i]]
# Convert coordinates to center plus width/height.
# GT Box
gt_h = gt[2] - gt[0]
gt_w = gt[3] - gt[1]
gt_center_y = gt[0] + 0.5 * gt_h
gt_center_x = gt[1] + 0.5 * gt_w
# Anchor
a_h = a[2] - a[0]
a_w = a[3] - a[1]
a_center_y = a[0] + 0.5 * a_h
a_center_x = a[1] + 0.5 * a_w
# Compute the bbox refinement that the RPN should predict.
rpn_bbox[ix] = [
(gt_center_y - a_center_y) / a_h,
(gt_center_x - a_center_x) / a_w,
np.log(gt_h / a_h),
np.log(gt_w / a_w),
]
# Normalize
rpn_bbox_std_dev = np.array(cfg.COMMON.RPN_BBOX_STD_DEV)
rpn_bbox[ix] /= rpn_bbox_std_dev
ix += 1
return rpn_match, rpn_bbox
pass
def rpn_class_loss_graph(rpn_match, rpn_class_logits):
"""
RPN anchor classifier loss.
:param rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive,
-1=negative, 0=neutral anchor.
:param rpn_class_logits: [batch, anchors, 2]. RPN classifier logits for BG/FG.
:return:
"""
# Squeeze last dim to simplify
rpn_match = tf.squeeze(rpn_match, -1)
# Get anchor classes. Convert the -1/+1 match to 0/1 values.
anchor_class = K.cast(K.equal(rpn_match, 1), tf.int32)
# Positive and Negative anchors contribute to the loss,
# but neutral anchors (match value = 0) don't.
indices = tf.where(K.not_equal(rpn_match, 0))
# Pick rows that contribute to the loss and filter out the rest.
rpn_class_logits = tf.gather_nd(rpn_class_logits, indices)
anchor_class = tf.gather_nd(anchor_class, indices)
# Cross entropy loss
loss = K.sparse_categorical_crossentropy(target=anchor_class,
output=rpn_class_logits,
from_logits=True)
loss = K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0))
return loss
def batch_pack_graph(x, counts, num_rows):
"""
Picks different number of values from each row in x depending on the values in counts.
:param x:
:param counts:
:param num_rows:
:return:
"""
outputs = []
for i in range(num_rows):
outputs.append(x[i, :counts[i]])
return tf.concat(outputs, axis=0)
def smooth_l1_loss(y_true, y_pred):
"""
Implements Smooth-L1 loss. y_true and y_pred are typically: [N, 4], but could be any shape.
:param y_true:
:param y_pred:
:return:
"""
diff = K.abs(y_true - y_pred)
less_than_one = K.cast(K.less(diff, 1.0), "float32")
loss = (less_than_one * 0.5 * diff ** 2) + (1 - less_than_one) * (diff - 0.5)
return loss
def rpn_bbox_loss_graph(batch_size, target_bbox, rpn_match, rpn_bbox):
"""
:param batch_size:
:param target_bbox: [batch, max positive anchors, (dy, dx, log(dh), log(dw))].
Uses 0 padding to fill in unused bbox deltas.
:param rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive,
-1=negative, 0=neutral anchor.
:param rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))]
:return: Return the RPN bounding box loss graph.
"""
# Positive anchors contribute to the loss, but negative and
# neutral anchors (match value of 0 or -1) don't.
rpn_match = K.squeeze(rpn_match, -1)
indices = tf.where(K.equal(rpn_match, 1))
# Pick bbox deltas that contribute to the loss
rpn_bbox = tf.gather_nd(rpn_bbox, indices)
# Trim target bounding box deltas to the same length as rpn_bbox.
batch_counts = K.sum(K.cast(K.equal(rpn_match, 1), tf.int32), axis=1)
target_bbox = batch_pack_graph(target_bbox, batch_counts, batch_size)
loss = smooth_l1_loss(target_bbox, rpn_bbox)
loss = K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0))
return loss
def mrcnn_class_loss_graph(target_class_ids, pred_class_logits, active_class_ids):
"""
Loss for the classifier head of Mask RCNN.
:param target_class_ids: [batch, num_rois]. Integer class IDs. Uses zero
padding to fill in the array.
:param pred_class_logits: [batch, num_rois, num_classes]
:param active_class_ids: [batch, num_classes]. Has a value of 1 for
classes that are in the dataset of the image, and 0
for classes that are not in the dataset.
:return:
"""
# During model building, Keras calls this function with
# target_class_ids of type float32. Unclear why. Cast it
# to int to get around it.
target_class_ids = tf.cast(target_class_ids, 'int64')
# Find predictions of classes that are not in the dataset.
pred_class_ids = tf.argmax(pred_class_logits, axis=2)
# TODO: Update this line to work with batch > 1. Right now it assumes all
# images in a batch have the same active_class_ids
pred_active = tf.gather(active_class_ids[0], pred_class_ids)
# Loss
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=target_class_ids, logits=pred_class_logits)
# Erase losses of predictions of classes that are not in the active
# classes of the image.
loss = loss * pred_active
# Compute the loss mean. Use only predictions that contribute
# to the loss to get a correct mean.
loss = tf.reduce_sum(loss) / tf.reduce_sum(pred_active)
return loss
def mrcnn_bbox_loss_graph(target_bbox, target_class_ids, pred_bbox):
"""
Loss for Mask R-CNN bounding box refinement.
:param target_bbox: [batch, num_rois, (dy, dx, log(dh), log(dw))]
:param target_class_ids: [batch, num_rois]. Integer class IDs.
:param pred_bbox: [batch, num_rois, num_classes, (dy, dx, log(dh), log(dw))]
:return:
"""
# Reshape to merge batch and roi dimensions for simplicity.
target_class_ids = K.reshape(target_class_ids, (-1,))
target_bbox = K.reshape(target_bbox, (-1, 4))
pred_bbox = K.reshape(pred_bbox, (-1, K.int_shape(pred_bbox)[2], 4))
# Only positive ROIs contribute to the loss. And only
# the right class_id of each ROI. Get their indices.
positive_roi_ix = tf.where(target_class_ids > 0)[:, 0]
positive_roi_class_ids = tf.cast(
tf.gather(target_class_ids, positive_roi_ix), tf.int64)
indices = tf.stack([positive_roi_ix, positive_roi_class_ids], axis=1)
# Gather the deltas (predicted and true) that contribute to loss
target_bbox = tf.gather(target_bbox, positive_roi_ix)
pred_bbox = tf.gather_nd(pred_bbox, indices)
# Smooth-L1 Loss
loss = K.switch(tf.size(target_bbox) > 0,
smooth_l1_loss(y_true=target_bbox, y_pred=pred_bbox),
tf.constant(0.0))
loss = K.mean(loss)
return loss
def mrcnn_mask_loss_graph(target_masks, target_class_ids, pred_masks):
"""
Mask binary cross-entropy loss for the masks head.
:param target_masks: [batch, num_rois, height, width].
A float32 tensor of values 0 or 1. Uses zero padding to fill array.
:param target_class_ids: [batch, num_rois]. Integer class IDs. Zero padded.
:param pred_masks: [batch, proposals, height, width, num_classes] float32 tensor
with values from 0 to 1.
:return:
"""
# Reshape for simplicity. Merge first two dimensions into one.
target_class_ids = K.reshape(target_class_ids, (-1,))
mask_shape = tf.shape(target_masks)
target_masks = K.reshape(target_masks, (-1, mask_shape[2], mask_shape[3]))
pred_shape = tf.shape(pred_masks)
pred_masks = K.reshape(pred_masks,
(-1, pred_shape[2], pred_shape[3], pred_shape[4]))
# Permute predicted masks to [N, num_classes, height, width]
pred_masks = tf.transpose(pred_masks, [0, 3, 1, 2])
# Only positive ROIs contribute to the loss. And only
# the class specific mask of each ROI.
positive_ix = tf.where(target_class_ids > 0)[:, 0]
positive_class_ids = tf.cast(
tf.gather(target_class_ids, positive_ix), tf.int64)
indices = tf.stack([positive_ix, positive_class_ids], axis=1)
# Gather the masks (predicted and true) that contribute to loss
y_true = tf.gather(target_masks, positive_ix)
y_pred = tf.gather_nd(pred_masks, indices)
# Compute binary cross entropy. If no positive ROIs, then return 0.
# shape: [batch, roi, num_classes]
loss = K.switch(tf.size(y_true) > 0,
K.binary_crossentropy(target=y_true, output=y_pred),
tf.constant(0.0))
loss = K.mean(loss)
return loss
def refine_detections_graph(rois, probs, deltas, window):
"""
Refine classified proposals and filter overlaps and return final detections.
:param rois: [N, (y1, x1, y2, x2)] in normalized coordinates
:param probs: [N, num_classes]. Class probabilities.
:param deltas: [N, num_classes, (dy, dx, log(dh), log(dw))]. Class-specific bounding box deltas.
:param window: (y1, x1, y2, x2) in normalized coordinates.
The part of the image that contains the real image, excluding the padding.
:return: Returns detections shaped:
[num_detections, (y1, x1, y2, x2, class_id, score)] where coordinates are normalized.
"""
# Class IDs per ROI
class_ids = tf.argmax(probs, axis=1, output_type=tf.int32)
# Class probability of the top class of each ROI
indices = tf.stack([tf.range(probs.shape[0]), class_ids], axis=1)
class_scores = tf.gather_nd(probs, indices)
# Class-specific bounding box deltas
deltas_specific = tf.gather_nd(deltas, indices)
# Apply bounding box deltas
# Shape: [boxes, (y1, x1, y2, x2)] in normalized coordinates
bbox_utils = BboxUtil()
refined_rois = bbox_utils.apply_box_deltas_graph(rois, deltas_specific * cfg.COMMON.BBOX_STD_DEV)
# Clip boxes to image window
refined_rois = bbox_utils.clip_boxes_graph(refined_rois, window)
# TODO: Filter out boxes with zero area
# Filter out background boxes
keep = tf.where(class_ids > 0)[:, 0]
# Filter out low confidence boxes
detection_min_confidence = cfg.COMMON.DETECTION_MIN_CONFIDENCE
if detection_min_confidence:
conf_keep = tf.where(class_scores >= detection_min_confidence)[:, 0]
keep = tf.sets.set_intersection(tf.expand_dims(keep, 0), tf.expand_dims(conf_keep, 0))
keep = tf.sparse_tensor_to_dense(keep)[0]
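# Note: tf.sets.set_intersection, tf.sparse_tensor_to_dense and tf.to_float used in
# this function are TF 1.x names; the code assumes TF 1.x. Under TF 2.x they would
# need the tf.sets.intersection / tf.sparse.to_dense / tf.cast(..., tf.float32) equivalents.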
# Apply per-class NMS
# 1. Prepare variables
pre_nms_class_ids = tf.gather(class_ids, keep)
pre_nms_scores = tf.gather(class_scores, keep)
pre_nms_rois = tf.gather(refined_rois, keep)
unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0]
def nms_keep_map(class_id):
"""Apply Non-Maximum Suppression on ROIs of the given class."""
detection_max_instances = cfg.TEST.DETECTION_MAX_INSTANCES
# Indices of ROIs of the given class
ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0]
# Apply NMS
class_keep = tf.image.non_max_suppression(tf.gather(pre_nms_rois, ixs),
tf.gather(pre_nms_scores, ixs),
max_output_size=detection_max_instances,
iou_threshold=cfg.TEST.DETECTION_NMS_THRESHOLD)
# Map indices
class_keep = tf.gather(keep, tf.gather(ixs, class_keep))
# Pad with -1 so returned tensors have the same shape
gap = detection_max_instances - tf.shape(class_keep)[0]
class_keep = tf.pad(class_keep, [(0, gap)],
mode='CONSTANT', constant_values=-1)
# Set shape so map_fn() can infer result shape
class_keep.set_shape([detection_max_instances])
return class_keep
# 2. Map over class IDs
nms_keep = tf.map_fn(nms_keep_map, unique_pre_nms_class_ids,
dtype=tf.int64)
# 3. Merge results into one list, and remove -1 padding
nms_keep = tf.reshape(nms_keep, [-1])
nms_keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0])
# 4. Compute intersection between keep and nms_keep
keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
tf.expand_dims(nms_keep, 0))
keep = tf.sparse_tensor_to_dense(keep)[0]
# Keep top detections
roi_count = cfg.TEST.DETECTION_MAX_INSTANCES
class_scores_keep = tf.gather(class_scores, keep)
num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count)
top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1]
keep = tf.gather(keep, top_ids)
# Arrange output as [N, (y1, x1, y2, x2, class_id, score)]
# Coordinates are normalized.
detections = tf.concat([tf.gather(refined_rois, keep),
tf.to_float(tf.gather(class_ids, keep))[..., tf.newaxis],
tf.gather(class_scores, keep)[..., tf.newaxis]
], axis=1)
# Pad with zeros if detections < DETECTION_MAX_INSTANCES
gap = cfg.TEST.DETECTION_MAX_INSTANCES - tf.shape(detections)[0]
detections = tf.pad(detections, [(0, gap), (0, 0)], "CONSTANT")
return detections
class BatchNorm(KL.BatchNormalization):
"""Extends the Keras BatchNormalization class to allow a central place
to make changes if needed.
Batch normalization has a negative effect on training if batches are small
so this layer is often frozen (via setting in Config class) and functions
as a linear layer.
"""
def call(self, inputs, training=None):
"""
Note about training values:
None: Train BN layers. This is the normal mode
False: Freeze BN layers. Good when batch size is small
True: (don't use). Set layer in training mode even when making inferences
"""
return super(self.__class__, self).call(inputs, training=training)
class ProposalLayer(KE.Layer):
"""
Receives anchor scores and selects a subset to pass as proposals
to the second stage. Filtering is done based on anchor scores and
non-max suppression to remove overlaps. It also applies bounding
box refinement deltas to anchors.
Inputs:
rpn_probs: [batch, num_anchors, (bg prob, fg prob)]
rpn_bbox: [batch, num_anchors, (dy, dx, log(dh), log(dw))]
anchors: [batch, num_anchors, (y1, x1, y2, x2)] anchors in normalized coordinates
Returns:
Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)]
"""
def __init__(self, proposal_count, nms_threshold, batch_size, **kwargs):
super(ProposalLayer, self).__init__(**kwargs)
self.proposal_count = proposal_count
self.nms_threshold = nms_threshold
self.batch_size = batch_size
self.misc_utils = MiscUtils()
self.bbox_utils = BboxUtil()
pass
def call(self, inputs):
"""
Keras invokes this call() method when the layer is applied to its inputs.
:param inputs:
:return:
"""
# Box Scores. Use the foreground class confidence. [Batch, num_rois, 1]
scores = inputs[0][:, :, 1]
# Box deltas [batch, num_rois, 4]
deltas = inputs[1]
rpn_bbox_std_dev = np.array(cfg.COMMON.RPN_BBOX_STD_DEV)
deltas = deltas * np.reshape(rpn_bbox_std_dev, [1, 1, 4])
# Anchors
anchors = inputs[2]
# Improve performance by trimming to top anchors by score
# and doing the rest on the smaller subset.
pre_nms_limit = tf.minimum(cfg.COMMON.PRE_NMS_LIMIT, tf.shape(anchors)[1])
ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True, name="top_anchors").indices
scores = self.misc_utils.batch_slice([scores, ix], lambda x, y: tf.gather(x, y),
self.batch_size)
deltas = self.misc_utils.batch_slice([deltas, ix], lambda x, y: tf.gather(x, y),
self.batch_size)
pre_nms_anchors = self.misc_utils.batch_slice([anchors, ix],
lambda a, x: tf.gather(a, x),
self.batch_size,
names=["pre_nms_anchors"])
# Apply deltas to anchors to get refined anchors.
# [batch, N, (y1, x1, y2, x2)]
boxes = self.misc_utils.batch_slice([pre_nms_anchors, deltas],
lambda x, y: self.bbox_utils.apply_box_deltas_graph(x, y),
self.batch_size,
names=["refined_anchors"])
# Clip to image boundaries. Since we're in normalized coordinates,
# clip to 0..1 range. [batch, N, (y1, x1, y2, x2)]
window = np.array([0, 0, 1, 1], dtype=np.float32)
boxes = self.misc_utils.batch_slice(boxes,
lambda x: self.bbox_utils.clip_boxes_graph(x, window),
self.batch_size,
names=["refined_anchors_clipped"])
# Filter out small boxes
# According to Xinlei Chen's paper, this reduces detection accuracy
# for small objects, so we're skipping it.
# Non-max suppression
def nms(boxes, scores):
indices = tf.image.non_max_suppression(
boxes, scores, self.proposal_count,
self.nms_threshold, name="rpn_non_max_suppression")
proposals = tf.gather(boxes, indices)
# Pad if needed
padding = tf.maximum(self.proposal_count - tf.shape(proposals)[0], 0)
proposals = tf.pad(proposals, [(0, padding), (0, 0)])
return proposals
proposals = self.misc_utils.batch_slice([boxes, scores], nms, self.batch_size)
return proposals
def compute_output_shape(self, input_shape):
return (None, self.proposal_count, 4)
class DetectionTargetLayer(KE.Layer):
"""
Subsamples proposals and generates target box refinement, class_ids, and masks for each.
Inputs:
proposals: [batch, N, (y1, x1, y2, x2)] in normalized coordinates. Might
be zero padded if there are not enough proposals.
gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs.
gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized
coordinates.
gt_masks: [batch, height, width, MAX_GT_INSTANCES] of boolean type
Returns: Target ROIs and corresponding class IDs, bounding box shifts, and masks.
rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized
coordinates
target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
target_deltas: [batch, TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw)]
target_mask: [batch, TRAIN_ROIS_PER_IMAGE, height, width]
Masks cropped to bbox boundaries and resized to neural
network output size.
Note: Returned arrays might be zero padded if not enough target ROIs.
"""
def __init__(self, batch_size, **kwargs):
super(DetectionTargetLayer, self).__init__(**kwargs)
self.batch_size = batch_size
self.misc_utils = MiscUtils()
self.rois_per_image = cfg.TRAIN.ROIS_PER_IMAGE
self.mask_shape = cfg.TRAIN.MASK_SHAPE
pass
def call(self, inputs):
"""
Keras invokes this call() method when the layer is applied to its inputs.
:param inputs: the four tensors listed in the class docstring
:return:
"""
proposals = inputs[0]
gt_class_ids = inputs[1]
gt_boxes = inputs[2]
gt_masks = inputs[3]
# Slice the batch and run a graph for each slice
# TODO: Rename target_bbox to target_deltas for clarity
names = ["rois", "target_class_ids", "target_bbox", "target_mask"]
outputs = self.misc_utils.batch_slice([proposals, gt_class_ids, gt_boxes, gt_masks],
lambda w, x, y, z: self.misc_utils.detection_targets_graph(w, x, y, z),
self.batch_size, names=names)
return outputs
def compute_output_shape(self, input_shape):
return [
(None, self.rois_per_image, 4), # rois
(None, self.rois_per_image), # class_ids
(None, self.rois_per_image, 4), # deltas
(None, self.rois_per_image, self.mask_shape[0], self.mask_shape[1]) # masks
]
def compute_mask(self, inputs, mask=None):
return [None, None, None, None]
class PyramidROIAlign(KE.Layer):
"""
Implements ROI Pooling on multiple levels of the feature pyramid.
Inputs:
- boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized
coordinates. Possibly padded with zeros if not enough
boxes to fill the array.
- image_meta: [batch, (meta data)] Image details. See compose_image_meta()
- feature_maps: List of feature maps from different levels of the pyramid.
Each is [batch, height, width, channels]
Output:
Pooled regions in the shape: [batch, num_boxes, pool_height, pool_width, channels].
The width and height are those specified by pool_shape in the layer
constructor.
"""
def __init__(self, pool_shape, **kwargs):
super(PyramidROIAlign, self).__init__(**kwargs)
self.pool_shape = tuple(pool_shape)
self.image_utils = ImageUtils()
def call(self, inputs):
# Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
boxes = inputs[0]
# Image meta
# Holds details about the image. See compose_image_meta()
image_meta = inputs[1]
# Feature Maps. List of feature maps from different level of the
# feature pyramid. Each is [batch, height, width, channels]
feature_maps = inputs[2:]
# Assign each ROI to a level in the pyramid based on the ROI area.
y1, x1, y2, x2 = tf.split(boxes, 4, axis=2)
h = y2 - y1
w = x2 - x1
# Use shape of first image. Images in a batch must have the same size.
image_shape = self.image_utils.parse_image_meta_graph(image_meta)['image_shape'][0]
# Equation 1 in the Feature Pyramid Networks paper. Account for
# the fact that our coordinates are normalized here.
# e.g. a 224x224 ROI (in pixels) maps to P4
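# i.e. level k = k0 + log2(sqrt(w * h) / 224) with k0 = 4, then clipped to [2, 5]
# so every ROI is assigned to one of P2..P5.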
image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
roi_level = log2_graph(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area)))
roi_level = tf.minimum(5, tf.maximum(2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
roi_level = tf.squeeze(roi_level, 2)
# Loop through levels and apply ROI pooling to each. P2 to P5.
pooled = []
box_to_level = []
for i, level in enumerate(range(2, 6)):
ix = tf.where(tf.equal(roi_level, level))
level_boxes = tf.gather_nd(boxes, ix)
# Box indices for crop_and_resize.
box_indices = tf.cast(ix[:, 0], tf.int32)
# Keep track of which box is mapped to which level
box_to_level.append(ix)
# Stop gradient propagation to ROI proposals
level_boxes = tf.stop_gradient(level_boxes)
box_indices = tf.stop_gradient(box_indices)
# Crop and Resize
# From Mask R-CNN paper: "We sample four regular locations, so
# that we can evaluate either max or average pooling. In fact,
# interpolating only a single value at each bin center (without
# pooling) is nearly as effective."
#
# Here we use the simplified approach of a single value per bin,
# which is how it's done in tf.crop_and_resize()
# Result: [batch * num_boxes, pool_height, pool_width, channels]
pooled.append(tf.image.crop_and_resize(
feature_maps[i], level_boxes, box_indices, self.pool_shape,
method="bilinear"))
# Pack pooled features into one tensor
pooled = tf.concat(pooled, axis=0)
# Pack box_to_level mapping into one array and add another
# column representing the order of pooled boxes
box_to_level = tf.concat(box_to_level, axis=0)
box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1)
box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range],
axis=1)
# Rearrange pooled features to match the order of the original boxes
# Sort box_to_level by batch then box index
# TF doesn't have a way to sort by two columns, so merge them and sort.
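# The factor 100000 assumes fewer than 100000 boxes per image, so the batch index
# dominates the sort key and the box index only breaks ties.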
sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]
ix = tf.nn.top_k(sorting_tensor, k=tf.shape(
box_to_level)[0]).indices[::-1]
ix = tf.gather(box_to_level[:, 2], ix)
pooled = tf.gather(pooled, ix)
# Re-add the batch dimension
shape = tf.concat([tf.shape(boxes)[:2], tf.shape(pooled)[1:]], axis=0)
pooled = tf.reshape(pooled, shape)
return pooled
def compute_output_shape(self, input_shape):
return input_shape[0][:2] + self.pool_shape + (input_shape[2][-1],)
class DetectionLayer(KE.Layer):
"""
Takes classified proposal boxes and their bounding box deltas and
returns the final detection boxes.
Returns:
[batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] where
coordinates are normalized.
"""
def __init__(self, batch_size, **kwargs):
super(DetectionLayer, self).__init__(**kwargs)
self.batch_size = batch_size
self.detection_max_instances = cfg.TEST.DETECTION_MAX_INSTANCES
self.image_utils = ImageUtils()
self.bbox_utils = BboxUtil()
self.misc_utils = MiscUtils()
def call(self, inputs):
rois = inputs[0]
mrcnn_class = inputs[1]
mrcnn_bbox = inputs[2]
image_meta = inputs[3]
# Get windows of images in normalized coordinates. Windows are the area
# in the image that excludes the padding.
# Use the shape of the first image in the batch to normalize the window
# because we know that all images get resized to the same size.
m = self.image_utils.parse_image_meta_graph(image_meta)
image_shape = m['image_shape'][0]
window = self.bbox_utils.norm_boxes_graph(m['window'], image_shape[:2])
# Run detection refinement graph on each item in the batch
detections_batch = self.misc_utils.batch_slice([rois, mrcnn_class, mrcnn_bbox, window],
lambda x, y, w, z: refine_detections_graph(x, y, w, z),
self.batch_size)
# Reshape output
# [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] in
# normalized coordinates
return tf.reshape(detections_batch, [self.batch_size, self.detection_max_instances, 6])
def compute_output_shape(self, input_shape):
return (None, self.detection_max_instances, 6)
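Before moving on to the backbone, here is a small standalone NumPy sketch (not part of the project files; the anchor, delta and std-dev values are made up for illustration) of what ProposalLayer does to a single anchor: scale the predicted deltas by RPN_BBOX_STD_DEV, shift and resize the box around its center, then clip it to the normalized window [0, 0, 1, 1], mirroring apply_box_deltas_graph() and clip_boxes_graph().
import numpy as np

anchor = np.array([0.40, 0.40, 0.60, 0.60], dtype=np.float32)  # (y1, x1, y2, x2), normalized
delta = np.array([0.5, 0.0, 0.2, 0.0], dtype=np.float32)       # (dy, dx, log(dh), log(dw))
rpn_bbox_std_dev = np.array([0.1, 0.1, 0.2, 0.2], dtype=np.float32)  # typical config value

delta = delta * rpn_bbox_std_dev
# convert to center/size, apply the refinement, convert back to corners
h, w = anchor[2] - anchor[0], anchor[3] - anchor[1]
cy, cx = anchor[0] + 0.5 * h, anchor[1] + 0.5 * w
cy, cx = cy + delta[0] * h, cx + delta[1] * w
h, w = h * np.exp(delta[2]), w * np.exp(delta[3])
refined = np.array([cy - 0.5 * h, cx - 0.5 * w, cy + 0.5 * h, cx + 0.5 * w])
# clip to the normalized image window, like clip_boxes_graph with window = [0, 0, 1, 1]
refined = np.clip(refined, 0.0, 1.0)
print(refined)  # roughly [0.406, 0.400, 0.614, 0.600]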
3. Backbone network
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# ============================================
# @Time : 2020/05/12 15:40
# @Author : WanDaoYi
# @FileName : backbone.py
# ============================================
import keras.layers as kl
from m_rcnn import common
from config import cfg
def resnet_graph(input_image, architecture, stage5=False):
"""
ResNet backbone graph. Nothing tricky here, just count the layers and filters carefully.
:param input_image: input image info
:param architecture: Can be resNet50 or resNet101
:param stage5: Boolean. If False, stage5 of the network is not created
:return: [c1, c2, c3, c4, c5]
"""
train_flag = cfg.COMMON.TRAIN_FLAG
assert architecture in ["resNet50", "resNet101"]
# Stage 1
x = kl.ZeroPadding2D((3, 3))(input_image)
x = kl.Conv2D(64, (7, 7), strides=(2, 2), name='conv1', use_bias=True)(x)
x = common.BatchNorm(name='bn_conv1')(x, training=train_flag)
x = kl.Activation('relu')(x)
c1 = x = kl.MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x)
# Stage 2
x = common.conv_block(x, 3, [64, 64, 256], stage=2, block='a', strides=(1, 1), train_flag=train_flag)
x = common.identity_block(x, 3, [64, 64, 256], stage=2, block='b', train_flag=train_flag)
c2 = x = common.identity_block(x, 3, [64, 64, 256], stage=2, block='c', train_flag=train_flag)
# Stage 3
x = common.conv_block(x, 3, [128, 128, 512], stage=3, block='a', train_flag=train_flag)
x = common.identity_block(x, 3, [128, 128, 512], stage=3, block='b', train_flag=train_flag)
x = common.identity_block(x, 3, [128, 128, 512], stage=3, block='c', train_flag=train_flag)
c3 = x = common.identity_block(x, 3, [128, 128, 512], stage=3, block='d', train_flag=train_flag)
# Stage 4
x = common.conv_block(x, 3, [256, 256, 1024], stage=4, block='a', train_flag=train_flag)
block_count = {"resNet50": 5, "resNet101": 22}[architecture]
for i in range(block_count):
x = common.identity_block(x, 3, [256, 256, 1024], stage=4, block=chr(98 + i), train_flag=train_flag)
c4 = x
# Stage 5
if stage5:
x = common.conv_block(x, 3, [512, 512, 2048], stage=5, block='a', train_flag=train_flag)
x = common.identity_block(x, 3, [512, 512, 2048], stage=5, block='b', train_flag=train_flag)
c5 = common.identity_block(x, 3, [512, 512, 2048], stage=5, block='c', train_flag=train_flag)
else:
c5 = None
return [c1, c2, c3, c4, c5]
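As a rough sanity check on the backbone outputs (a standalone sketch, assuming a hypothetical 1024x1024 input, which satisfies the divisible-by-2^6 requirement checked later in build()): stage 1 downsamples twice (conv stride 2 plus max-pool stride 2) and each later stage downsamples once more, so c1..c5 sit at strides 4, 4, 8, 16, 32 relative to the input.
image_size = 1024  # hypothetical input size; any multiple of 64 behaves the same way
strides = {"c1": 4, "c2": 4, "c3": 8, "c4": 16, "c5": 32}  # cumulative downsampling per stage
for name, stride in strides.items():
    print(name, image_size // stride)  # c1 256, c2 256, c3 128, c4 64, c5 32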
4. Converting COCO data into model input data
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# ============================================
# @Time : 2020/05/11 20:49
# @Author : WanDaoYi
# @FileName : coco_dataset.py
# ============================================
from datetime import datetime
import os
import json
import itertools
import numpy as np
from collections import defaultdict
from config import cfg
def is_array_like(obj):
return hasattr(obj, '__iter__') and hasattr(obj, '__len__')
class CocoDataset(object):
def __init__(self, annotation_path, image_file_path):
"""
:param annotation_path: annotation json path, as ./instances_train2014.json
:param image_file_path: image file path, as ./train2014
"""
# file path
self.annotation_path = annotation_path
self.image_file_path = image_file_path
# dataset info
self.dataset = self.read_coco_json_data()
# class info
self.categories_dict = self.categories_info()
# image info
self.image_dict = self.images_info()
# annotations info, image to annotations info, class to image info
self.annotations_dict, self.image_2_annotations, self.categories_2_image = self.annotations_info()
self.image_info_list = []
# Background is always the first class
self.class_info_list = cfg.COMMON.DEFAULT_CLASS_INFO
# process the data
self.deal_data()
# number of classes
self.class_num = len(self.class_info_list)
# class id list
self.class_ids_list = np.arange(self.class_num)
# class names
self.class_names_list = [self.clean_name(c["name"]) for c in self.class_info_list]
# number of images
self.images_num = len(self.image_info_list)
# image id list
self._image_ids_list = np.arange(self.images_num)
# Map sources to class_ids they support
self.sources = list(set([i['source'] for i in self.class_info_list]))
self.source_class_ids = self.get_source_class_ids()
# Mapping from source class and image IDs to internal IDs
self.class_from_source_map = {"{}.{}".format(info['source'], info['id']): class_id
for info, class_id in zip(self.class_info_list, self.class_ids_list)}
self.image_from_source_map = {"{}.{}".format(info['source'], info['id']): image_id
for info, image_id in zip(self.image_info_list, self.image_ids_list)}
pass
@property
def image_ids_list(self):
return self._image_ids_list
def get_source_class_ids(self):
source_class_ids = {}
# Loop over dataset
for source in self.sources:
source_class_ids[source] = []
# Find classes that belong to this dataset
for i, info in enumerate(self.class_info_list):
# Include BG class in all datasets
if i == 0 or source == info['source']:
source_class_ids[source].append(i)
pass
pass
pass
return source_class_ids
pass
# load coco data
def read_coco_json_data(self):
# str to json
print("json load.....")
data_json = json.load(open(self.annotation_path, encoding="utf-8"))
assert type(data_json) == dict, 'annotation file format {} not supported'.format(type(data_json))
# json_key_list = [key_name for key_name in data_json]
# print(json_key_list)
return data_json
# parse category info
def categories_info(self):
categories_dict = dict()
if "categories" in self.dataset:
print("categories info...")
categories_info = self.dataset["categories"]
for categories in categories_info:
categories_dict[categories["id"]] = categories
pass
# categories_ids = [categories['id'] for categories in categories_info]
# print(categories_ids)
pass
return categories_dict
# parse image info
def images_info(self):
image_dict = dict()
if "images" in self.dataset:
print("images info...")
image_info_list = self.dataset["images"]
for image_info in image_info_list:
image_dict[image_info["id"]] = image_info
pass
pass
return image_dict
# parse annotation info, plus image-to-annotations and class-to-images maps
def annotations_info(self):
annotations_dict = dict()
image_2_annotations = defaultdict(list)
categories_2_image = defaultdict(list)
if "annotations" in self.dataset:
print("annotations info...")
annotations_list = self.dataset["annotations"]
for annotations in annotations_list:
annotations_dict[annotations["id"]] = annotations
image_2_annotations[annotations["image_id"]].append(annotations)
if "categories" in self.dataset:
categories_2_image[annotations["category_id"]].append(annotations["image_id"])
pass
pass
pass
return annotations_dict, image_2_annotations, categories_2_image
# image ids list
def get_image_ids(self, image_ids=[], class_ids=[]):
if len(image_ids) == 0 and len(class_ids) == 0:
ids = self.image_dict.keys()
else:
ids = set(image_ids)
for i, class_id in enumerate(class_ids):
if i == 0 and len(ids) == 0:
ids = set(self.categories_2_image[class_id])
else:
ids &= set(self.categories_2_image[class_id])
return list(ids)
pass
# class ids
def get_class_ids(self):
# class ids
categories_ids = sorted([categories['id'] for categories in self.dataset['categories']])
return categories_ids
pass
def get_annotation_ids(self, image_ids=[], class_ids=[], area_rang=[], is_crowd=False):
"""
:param image_ids : (int array) get annotation for given images
:param class_ids: (int array) get annotation for given classes
:param area_rang: (float array) get annotation for given area range (e.g. [0 inf])
:param is_crowd: (boolean) get annotation for given crowd label (False or True)
:return: annotation_ids: (int array) integer array of ann ids
"""
if len(image_ids) == len(class_ids) == len(area_rang) == 0:
annotations = self.dataset['annotations']
pass
else:
if len(image_ids) != 0:
lists = [self.image_2_annotations[image_id] for image_id in image_ids if
image_id in self.image_2_annotations]
annotations = list(itertools.chain.from_iterable(lists))
pass
else:
annotations = self.dataset['annotations']
pass
annotations = annotations if len(class_ids) == 0 else [ann for ann in annotations if ann['category_id'] in class_ids]
annotations = annotations if len(area_rang) == 0 else [ann for ann in annotations if ann['area'] > area_rang[0] and ann['area'] < area_rang[1]]
pass
if is_crowd:
annotation_ids = [annotation['id'] for annotation in annotations if annotation['iscrowd'] == is_crowd]
pass
else:
annotation_ids = [annotation['id'] for annotation in annotations]
pass
return annotation_ids
pass
def load_class_info(self, class_ids=[]):
return [self.categories_dict[class_ids]]
pass
def load_annotation_info(self, annotation_ids=[]):
if is_array_like(annotation_ids):
return [self.annotations_dict[annotation_id] for annotation_id in annotation_ids]
elif type(annotation_ids) == int:
return [self.annotations_dict[annotation_ids]]
pass
# add class info
def add_class(self, source, class_id, class_name):
"""
:param source: data source name
:param class_id: class id
:param class_name: class name
:return:
"""
assert "." not in source, "Source name cannot contain a dot"
# check whether the class already exists
for info_map in self.class_info_list:
class_info_flag = info_map["source"] == source and info_map["id"] == class_id
if class_info_flag:
# source.class_id combination already available, skip
return
pass
# add the new class info
info_map = {"source": source,
"id": class_id,
"name": class_name
}
self.class_info_list.append(info_map)
pass
# add image info
def add_image(self, source, image_id, path, **kwargs):
"""
:param source: data source name
:param image_id: image id
:param path: image path
:param kwargs: extra key/value attributes for the image
:return:
"""
image_info_map = {"id": image_id, "source": source, "path": path}
image_info_map.update(kwargs)
self.image_info_list.append(image_info_map)
pass
# process the data
def deal_data(self):
image_ids = []
class_ids = self.get_class_ids()
for class_id in class_ids:
image_ids.extend(list(self.get_image_ids(class_ids=[class_id])))
pass
# Remove duplicates
image_ids = list(set(image_ids))
# Add classes
for i in class_ids:
self.add_class("coco", i, self.load_class_info(i)[0]["name"])
pass
# Add images
for i in image_ids:
self.add_image(source="coco", image_id=i,
path=os.path.join(self.image_file_path, self.image_dict[i]['file_name']),
width=self.image_dict[i]["width"],
height=self.image_dict[i]["height"],
annotations=self.load_annotation_info(self.get_annotation_ids(
image_ids=[i], class_ids=class_ids, is_crowd=False)))
pass
# clean up the class name value
def clean_name(self, name):
"""
:param name: name value
:return:
"""
return ",".join(name.split(",")[: 1])
pass
if __name__ == "__main__":
# script start time
start_time = datetime.now()
print("开始时间: {}".format(start_time))
anno_file_path = "G:/deep_learning_demo/data/instance_segmentation/annotations"
train_anno_json_name = "instances_train2014.json"
val_anno_json_name = "instances_minival2014.json"
train_anno_json_data_path = os.path.join(anno_file_path, train_anno_json_name)
val_anno_json_data_path = os.path.join(anno_file_path, val_anno_json_name)
train_image_path = "G:/deep_learning_demo/data/instance_segmentation/train2014"
val_image_path = "G:/deep_learning_demo/data/instance_segmentation/val2014"
train_data = CocoDataset(train_anno_json_data_path, train_image_path)
dataset = train_data.dataset
dataset_key = [key for key in dataset]
print("dataset_key: {}".format(dataset_key))
print("dataset_type: {}".format(type(dataset)))
print("info_type: {}".format(type(dataset["info"])))
print("images_type: {}".format(type(dataset["images"])))
print("licenses_type: {}".format(type(dataset["licenses"])))
print("annotations_type: {}".format(type(dataset["annotations"])))
print("categories_type: {}".format(type(dataset["categories"])))
info_key = [key for key in dataset["info"]]
print("info_key: {}".format(info_key))
print("info: {}".format(dataset["info"]))
print("licenses: {}".format(dataset["licenses"]))
print("categories: {}".format(dataset["categories"]))
print("images_0-1: {}".format(dataset["images"][: 2]))
print("annotations_0-1: {}".format(dataset["annotations"][: 2]))
print("It's over!")
# script end time
end_time = datetime.now()
print("结束时间: {}, 训练模型耗时: {}".format(end_time, end_time - start_time))
pass
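To make the source-to-internal id mapping above concrete, here is a tiny standalone sketch (toy class list, not the full COCO categories) of how class_from_source_map is built and used: the key is "source.id" and the value is the dataset's internal class index, with BG always at index 0.
class_info_list = [
    {"source": "", "id": 0, "name": "BG"},
    {"source": "coco", "id": 1, "name": "person"},
    {"source": "coco", "id": 18, "name": "dog"},
]
class_ids_list = range(len(class_info_list))  # internal ids 0, 1, 2
class_from_source_map = {"{}.{}".format(info["source"], info["id"]): class_id
                         for info, class_id in zip(class_info_list, class_ids_list)}
print(class_from_source_map["coco.18"])  # -> 2, the internal id the network trains on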
5. Mask R-CNN model
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# ============================================
# @Time : 2020/05/12 15:42
# @Author : WanDaoYi
# @FileName : mask_rcnn.py
# ============================================
import h5py
import numpy as np
import tensorflow as tf
import keras.backend as k
import keras.layers as kl
import keras.models as km
from utils.bbox_utils import BboxUtil
from utils.anchor_utils import AnchorUtils
from utils.image_utils import ImageUtils
from utils.mask_util import MaskUtil
from m_rcnn import common
from m_rcnn import backbone
from config import cfg
# Conditional import to support versions of Keras before 2.2
try:
from keras.engine import saving
except ImportError:
# Keras before 2.2 used the 'topology' namespace.
from keras.engine import topology as saving
class MaskRCNN(object):
def __init__(self, train_flag=True):
"""
:param train_flag: True for training, False for inference
"""
self.train_flag = train_flag
self.bbox_util = BboxUtil()
self.anchor_utils = AnchorUtils()
self.image_utils = ImageUtils()
self.mask_util = MaskUtil()
# model weights path
self.model_path = cfg.TRAIN.MODEL_PATH if self.train_flag else cfg.TEST.COCO_MODEL_PATH
# batch size
self.batch_size = cfg.TRAIN.BATCH_SIZE if self.train_flag else cfg.TEST.BATCH_SIZE
# path to save trained models
self.save_model_path = cfg.TRAIN.SAVE_MODEL_PATH
self.backbone = cfg.COMMON.BACKBONE
self.backbone_strides = cfg.COMMON.BACKBONE_STRIDES
# input image shape
self.image_shape = np.array(cfg.COMMON.IMAGE_SHAPE)
# channel size of the top-down layers used to build the feature pyramid
self.top_down_pyramid_size = cfg.COMMON.TOP_DOWN_PYRAMID_SIZE
self.rpn_anchor_stride = cfg.COMMON.RPN_ANCHOR_STRIDE
self.rpn_anchor_ratios = cfg.COMMON.RPN_ANCHOR_RATIOS
self.rpn_nms_threshold = cfg.COMMON.RPN_NMS_THRESHOLD
self.class_num = cfg.COMMON.CLASS_NUM
self.rois_per_image = cfg.TRAIN.ROIS_PER_IMAGE
self.roi_positive_ratio = cfg.TRAIN.ROI_POSITIVE_RATIO
self.keras_model = self.build()
pass
def build(self):
# image shape
h, w, c = self.image_shape[:]
print("image_shape: {}".format(self.image_shape))
if h / 2 ** 6 != int(h / 2 ** 6) or w / 2 ** 6 != int(w / 2 ** 6):
raise Exception("Image size must be dividable by 2 at least 6 times "
"to avoid fractions when downscaling and upscaling."
"For example, use 256, 320, 384, 448, 512, ... etc. ")
# Inputs
input_image = kl.Input(shape=[None, None, c], name="input_image")
input_image_meta = kl.Input(shape=[cfg.COMMON.IMAGE_META_SIZE], name="input_image_meta")
# training branch
if self.train_flag:
# RPN GT
input_rpn_match = kl.Input(shape=[None, 1], name="input_rpn_match", dtype=tf.int32)
input_rpn_bbox = kl.Input(shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32)
# Detection GT (class IDs, bounding boxes, and masks)
# 1. GT Class IDs (zero padded)
input_gt_class_ids = kl.Input(shape=[None], name="input_gt_class_ids", dtype=tf.int32)
# 2. GT Boxes in pixels (zero padded)
# [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
input_gt_boxes = kl.Input(shape=[None, 4], name="input_gt_boxes", dtype=tf.float32)
# Normalize coordinates
gt_boxes = kl.Lambda(lambda x: self.bbox_util.norm_boxes_graph(x, k.shape(input_image)[1:3]))(
input_gt_boxes)
# 3. GT Masks (zero padded)
# [batch, height, width, MAX_GT_INSTANCES]
if cfg.TRAIN.USE_MINI_MASK:
min_h, min_w = cfg.TRAIN.MINI_MASK_SHAPE[:]
input_gt_masks = kl.Input(shape=[min_h, min_w, None], name="input_gt_masks", dtype=bool)
else:
input_gt_masks = kl.Input(shape=[h, w, None], name="input_gt_masks", dtype=bool)
pass
# anchor
anchors = self.anchor_utils.get_anchors(self.image_shape)
# Duplicate across the batch dimension because Keras requires it
# TODO: can this be optimized to avoid duplicating the anchors?
anchors = np.broadcast_to(anchors, (self.batch_size,) + anchors.shape)
# A hack to get around Keras's bad support for constants
anchors = kl.Lambda(lambda x: tf.Variable(anchors), name="anchors")(input_image)
pass
else:
# Anchors in normalized coordinates
anchors = kl.Input(shape=[None, 4], name="input_anchors")
# The inputs above are only needed for training; define them here so they are not undefined in the inference branch.
input_rpn_match = None
input_rpn_bbox = None
input_gt_class_ids = None
gt_boxes = None
input_gt_boxes = None
input_gt_masks = None
pass
# Build the shared convolutional layers.
# Bottom-up Layers
# Returns a list of the last layers of each stage, 5 in total.
# Here stage5=True, so C1..C5 are all created; C1 is not used by the FPN below.
_, c2, c3, c4, c5 = backbone.resnet_graph(input_image, self.backbone, stage5=True)
# Top-down Layers
# TODO: add assert to verify feature map sizes match what's in config
p5 = kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c5p5')(c5)
p4 = kl.Add(name="fpn_p4add")([kl.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(p5),
kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c4p4')(c4)])
p3 = kl.Add(name="fpn_p3add")([kl.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(p4),
kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c3p3')(c3)])
p2 = kl.Add(name="fpn_p2add")([kl.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(p3),
kl.Conv2D(self.top_down_pyramid_size, (1, 1), name='fpn_c2p2')(c2)])
# Attach 3x3 conv to all P layers to get the final feature maps.
p2 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME", name="fpn_p2")(p2)
p3 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME", name="fpn_p3")(p3)
p4 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME", name="fpn_p4")(p4)
p5 = kl.Conv2D(self.top_down_pyramid_size, (3, 3), padding="SAME", name="fpn_p5")(p5)
# P6 is used for the 5th anchor scale in RPN. Generated by
# subsampling from P5 with stride of 2.
p6 = kl.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(p5)
# Note that P6 is used in RPN, but not in the classifier heads.
rpn_feature_maps = [p2, p3, p4, p5, p6]
mrcnn_feature_maps = [p2, p3, p4, p5]
# RPN Model
rpn = common.build_rpn_model(self.rpn_anchor_stride, len(self.rpn_anchor_ratios), self.top_down_pyramid_size)
# Loop through pyramid layers
layer_outputs = [] # list of lists
for p in rpn_feature_maps:
layer_outputs.append(rpn([p]))
pass
# Concatenate layer outputs
# Convert from list of lists of level outputs to list of lists
# of outputs across levels.
# e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"]
outputs = list(zip(*layer_outputs))
outputs = [kl.Concatenate(axis=1, name=n)(list(o)) for o, n in zip(outputs, output_names)]
rpn_class_logits, rpn_class, rpn_bbox = outputs
# Generate proposals
# Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
# and zero padded.
proposal_count = cfg.TRAIN.POST_NMS_ROIS if self.train_flag else cfg.TEST.POST_NMS_ROIS
rpn_rois = common.ProposalLayer(proposal_count=proposal_count,
nms_threshold=self.rpn_nms_threshold,
batch_size=self.batch_size,
name="ROI")([rpn_class, rpn_bbox, anchors])
fc_layer_size = cfg.COMMON.FPN_CLASS_FC_LAYERS_SIZE
pool_size = cfg.COMMON.POOL_SIZE
mask_pool_size = cfg.COMMON.MASK_POOL_SIZE
train_or_freeze = cfg.COMMON.TRAIN_FLAG
if self.train_flag:
# Class ID mask to mark class IDs supported by the dataset the image
# came from.
active_class_ids = kl.Lambda(lambda x: self.image_utils.parse_image_meta_graph(x)["active_class_ids"])(
input_image_meta)
if not cfg.TRAIN.USE_RPN_ROIS:
# Ignore predicted ROIs and use ROIs provided as an input.
input_rois = kl.Input(shape=[proposal_count, 4], name="input_roi", dtype=np.int32)
# Normalize coordinates
target_rois = kl.Lambda(lambda x: self.bbox_util.norm_boxes_graph(x, k.shape(input_image)[1:3]))(
input_rois)
else:
target_rois = rpn_rois
input_rois = None
# Generate detection targets
# Subsamples proposals and generates target outputs for training
# Note that proposal class IDs, gt_boxes, and gt_masks are zero
# padded. Equally, returned rois and targets are zero padded.
rois, target_class_ids, target_bbox, target_mask = \
common.DetectionTargetLayer(self.batch_size, name="proposal_targets")([
target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])
# Network Heads
# TODO: verify that this handles zero padded ROIs
mrcnn_class_logits, mrcnn_class, mrcnn_bbox = common.fpn_classifier_graph(rois,
mrcnn_feature_maps,
input_image_meta,
pool_size,
self.class_num,
train_flag=train_or_freeze,
fc_layers_size=fc_layer_size)
mrcnn_mask = common.build_fpn_mask_graph(rois, mrcnn_feature_maps,
input_image_meta,
mask_pool_size,
self.class_num,
train_flag=train_or_freeze)
# TODO: clean up (use tf.identity if necessary)
output_rois = kl.Lambda(lambda x: x * 1, name="output_rois")(rois)
# Losses
rpn_class_loss = kl.Lambda(lambda x: common.rpn_class_loss_graph(*x), name="rpn_class_loss")(
[input_rpn_match, rpn_class_logits])
rpn_bbox_loss = kl.Lambda(lambda x: common.rpn_bbox_loss_graph(self.batch_size, *x), name="rpn_bbox_loss")(
[input_rpn_bbox, input_rpn_match, rpn_bbox])
class_loss = kl.Lambda(lambda x: common.mrcnn_class_loss_graph(*x), name="mrcnn_class_loss")(
[target_class_ids, mrcnn_class_logits, active_class_ids])
bbox_loss = kl.Lambda(lambda x: common.mrcnn_bbox_loss_graph(*x), name="mrcnn_bbox_loss")(
[target_bbox, target_class_ids, mrcnn_bbox])
mask_loss = kl.Lambda(lambda x: common.mrcnn_mask_loss_graph(*x), name="mrcnn_mask_loss")(
[target_mask, target_class_ids, mrcnn_mask])
# Model
inputs = [input_image, input_image_meta,
input_rpn_match, input_rpn_bbox, input_gt_class_ids, input_gt_boxes, input_gt_masks]
if not cfg.TRAIN.USE_RPN_ROIS:
inputs.append(input_rois)
outputs = [rpn_class_logits, rpn_class, rpn_bbox,
mrcnn_class_logits, mrcnn_class, mrcnn_bbox, mrcnn_mask,
rpn_rois, output_rois,
rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss]
model = km.Model(inputs, outputs, name='mask_rcnn')
pass
else:
# Network Heads
# Proposal classifier and BBox regressor heads
mrcnn_class_logits, mrcnn_class, mrcnn_bbox = common.fpn_classifier_graph(rpn_rois,
mrcnn_feature_maps,
input_image_meta,
pool_size,
self.class_num,
train_flag=train_or_freeze,
fc_layers_size=fc_layer_size)
# Detections
# output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in
# normalized coordinates
detections = common.DetectionLayer(self.batch_size, name="mrcnn_detection")([rpn_rois,
mrcnn_class,
mrcnn_bbox,
input_image_meta])
# Create masks for detections
detection_boxes = kl.Lambda(lambda x: x[..., :4])(detections)
mrcnn_mask = common.build_fpn_mask_graph(detection_boxes,
mrcnn_feature_maps,
input_image_meta,
mask_pool_size,
self.class_num,
train_flag=train_or_freeze)
model = km.Model([input_image, input_image_meta, anchors],
[detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, rpn_class, rpn_bbox],
name='mask_rcnn')
pass
# Add multi-GPU support when more than one GPU is configured
gpu_count = cfg.COMMON.GPU_COUNT
if gpu_count > 1:
from m_rcnn.parallel_model import ParallelModel
model = ParallelModel(model, gpu_count)
return model
pass
def load_weights(self, model_path, by_name=False, exclude=None):
"""
Modified version of the corresponding Keras function
with the addition of multi-GPU support and the ability
to exclude some layers from loading.
:param model_path:
:param by_name:
:param exclude: list of layer names to exclude
:return:
"""
if exclude:
by_name = True
pass
if h5py is None:
raise ImportError('`load_weights` requires h5py.')
pass
model_file = h5py.File(model_path, mode='r')
if 'layer_names' not in model_file.attrs and 'model_weights' in model_file:
model_file = model_file['model_weights']
# In multi-GPU training, we wrap the model. Get layers
# of the inner model because they have the weights.
keras_model = self.keras_model
layers = keras_model.inner_model.layers if hasattr(keras_model, "inner_model") else keras_model.layers
print("layers: {}".format(layers))
# Exclude some layers
if exclude:
layers = filter(lambda l: l.name not in exclude, layers)
if by_name:
saving.load_weights_from_hdf5_group_by_name(model_file, layers)
else:
saving.load_weights_from_hdf5_group(model_file, layers)
if hasattr(model_file, 'close'):
model_file.close()
pass
def generate_random_rois(self, image_shape, count, gt_boxes):
"""
Generates ROI proposals similar to what a region proposal network
would generate.
:param image_shape: [Height, Width, Depth]
:param count: Number of ROIs to generate
:param gt_boxes: [N, (y1, x1, y2, x2)] Ground truth boxes in pixels.
:return:
"""
# placeholder
rois = np.zeros((count, 4), dtype=np.int32)
# Generate random ROIs around GT boxes (90% of count)
rois_per_box = int(0.9 * count / gt_boxes.shape[0])
for i in range(gt_boxes.shape[0]):
gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[i]
h = gt_y2 - gt_y1
w = gt_x2 - gt_x1
# random boundaries
r_y1 = max(gt_y1 - h, 0)
r_y2 = min(gt_y2 + h, image_shape[0])
r_x1 = max(gt_x1 - w, 0)
r_x2 = min(gt_x2 + w, image_shape[1])
# To avoid generating boxes with zero area, we generate double what
# we need and filter out the extra. If we get fewer valid boxes
# than we need, we loop and try again.
while True:
y1y2 = np.random.randint(r_y1, r_y2, (rois_per_box * 2, 2))
x1x2 = np.random.randint(r_x1, r_x2, (rois_per_box * 2, 2))
# Filter out zero area boxes
threshold = 1
y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >= threshold][:rois_per_box]
x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >= threshold][:rois_per_box]
if y1y2.shape[0] == rois_per_box and x1x2.shape[0] == rois_per_box:
break
# Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
# into x1, y1, x2, y2 order
x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
box_rois = np.hstack([y1, x1, y2, x2])
rois[rois_per_box * i:rois_per_box * (i + 1)] = box_rois
# Generate random ROIs anywhere in the image (10% of count)
remaining_count = count - (rois_per_box * gt_boxes.shape[0])
# To avoid generating boxes with zero area, we generate double what
# we need and filter out the extra. If we get fewer valid boxes
# than we need, we loop and try again.
while True:
y1y2 = np.random.randint(0, image_shape[0], (remaining_count * 2, 2))
x1x2 = np.random.randint(0, image_shape[1], (remaining_count * 2, 2))
# Filter out zero area boxes
threshold = 1
y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >= threshold][:remaining_count]
x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >= threshold][:remaining_count]
if y1y2.shape[0] == remaining_count and x1x2.shape[0] == remaining_count:
break
# Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
# into x1, y1, x2, y2 order
x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
global_rois = np.hstack([y1, x1, y2, x2])
rois[-remaining_count:] = global_rois
return rois
pass
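# A quick numeric example of the split above (illustrative values only): with
# count = 200 and 3 GT boxes, rois_per_box = int(0.9 * 200 / 3) = 60, so 180 ROIs
# are sampled around the GT boxes and the remaining 20 anywhere in the image.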
def build_detection_targets(self, rpn_rois, gt_class_ids, gt_boxes, gt_masks):
"""
Generate targets for training Stage 2 classifier and mask heads.
This is not used in normal training. It's useful for debugging or to train
the Mask RCNN heads without using the RPN head.
:param rpn_rois: [N, (y1, x1, y2, x2)] proposal boxes.
:param gt_class_ids: [instance count] Integer class IDs
:param gt_boxes: [instance count, (y1, x1, y2, x2)]
:param gt_masks: [height, width, instance count] Ground truth masks. Can be full
size or mini-masks.
:return:
rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)]
class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
bboxes: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (y, x, log(h), log(w))]. Class-specific
bbox refinements.
masks: [TRAIN_ROIS_PER_IMAGE, height, width, NUM_CLASSES). Class specific masks cropped
to bbox boundaries and resized to neural network output size.
"""
assert rpn_rois.shape[0] > 0
assert gt_class_ids.dtype == np.int32, "Expected int but got {}".format(
gt_class_ids.dtype)
assert gt_boxes.dtype == np.int32, "Expected int but got {}".format(
gt_boxes.dtype)
assert gt_masks.dtype == np.bool_, "Expected bool but got {}".format(
gt_masks.dtype)
# It's common to add GT Boxes to ROIs but we don't do that here because
# according to XinLei Chen's paper, it doesn't help.
# Trim empty padding in gt_boxes and gt_masks parts
instance_ids = np.where(gt_class_ids > 0)[0]
assert instance_ids.shape[0] > 0, "Image must contain instances."
gt_class_ids = gt_class_ids[instance_ids]
gt_boxes = gt_boxes[instance_ids]
gt_masks = gt_masks[:, :, instance_ids]
# Compute areas of ROIs and ground truth boxes.
# rpn_roi_area = (rpn_rois[:, 2] - rpn_rois[:, 0]) * (rpn_rois[:, 3] - rpn_rois[:, 1])
# gt_box_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) * (gt_boxes[:, 3] - gt_boxes[:, 1])
# Compute overlaps [rpn_rois, gt_boxes]
overlaps = np.zeros((rpn_rois.shape[0], gt_boxes.shape[0]))
for i in range(overlaps.shape[1]):
gt = gt_boxes[i]
overlaps[:, i] = self.bbox_util.compute_iou(gt, rpn_rois)
pass
# Assign ROIs to GT boxes
rpn_roi_iou_argmax = np.argmax(overlaps, axis=1)
rpn_roi_iou_max = overlaps[np.arange(overlaps.shape[0]), rpn_roi_iou_argmax]
# GT box assigned to each ROI
rpn_roi_gt_boxes = gt_boxes[rpn_roi_iou_argmax]
rpn_roi_gt_class_ids = gt_class_ids[rpn_roi_iou_argmax]
# Positive ROIs are those with >= 0.5 IoU with a GT box.
fg_ids = np.where(rpn_roi_iou_max > 0.5)[0]
# Negative ROIs are those with max IoU 0.1-0.5 (hard example mining)
# TODO: To hard example mine or not to hard example mine, that's the question
# bg_ids = np.where((rpn_roi_iou_max >= 0.1) & (rpn_roi_iou_max < 0.5))[0]
bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
# Subsample ROIs. Aim for 33% foreground.
# FG
fg_roi_count = int(self.rois_per_image * self.roi_positive_ratio)
if fg_ids.shape[0] > fg_roi_count:
keep_fg_ids = np.random.choice(fg_ids, fg_roi_count, replace=False)
else:
keep_fg_ids = fg_ids
# BG
remaining = self.rois_per_image - keep_fg_ids.shape[0]
if bg_ids.shape[0] > remaining:
keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
else:
keep_bg_ids = bg_ids
# Combine indices of ROIs to keep
keep = np.concatenate([keep_fg_ids, keep_bg_ids])
# Need more?
remaining = self.rois_per_image - keep.shape[0]
if remaining > 0:
# Looks like we don't have enough samples to maintain the desired
# balance. Reduce requirements and fill in the rest. This is
# likely different from the Mask RCNN paper.
# There is a small chance we have neither fg nor bg samples.
if keep.shape[0] == 0:
# Pick bg regions with easier IoU threshold
bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
assert bg_ids.shape[0] >= remaining
keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
assert keep_bg_ids.shape[0] == remaining
keep = np.concatenate([keep, keep_bg_ids])
else:
# Fill the rest with repeated bg rois.
keep_extra_ids = np.random.choice(
keep_bg_ids, remaining, replace=True)
keep = np.concatenate([keep, keep_extra_ids])
assert keep.shape[0] == self.rois_per_image, \
"keep doesn't match ROI batch size {}, {}".format(keep.shape[0], self.rois_per_image)
# Reset the gt boxes assigned to BG ROIs.
rpn_roi_gt_boxes[keep_bg_ids, :] = 0
rpn_roi_gt_class_ids[keep_bg_ids] = 0
# For each kept ROI, assign a class_id, and for FG ROIs also add bbox refinement.
rois = rpn_rois[keep]
roi_gt_boxes = rpn_roi_gt_boxes[keep]
roi_gt_class_ids = rpn_roi_gt_class_ids[keep]
roi_gt_assignment = rpn_roi_iou_argmax[keep]
# Class-aware bbox deltas. [y, x, log(h), log(w)]
bboxes = np.zeros((self.rois_per_image, self.class_num, 4), dtype=np.float32)
pos_ids = np.where(roi_gt_class_ids > 0)[0]
bboxes[pos_ids, roi_gt_class_ids[pos_ids]] = self.bbox_util.box_refinement(rois[pos_ids],
roi_gt_boxes[pos_ids, :4])
# Normalize bbox refinements
bbox_std_dev = np.array(cfg.COMMON.BBOX_STD_DEV)
bboxes /= bbox_std_dev
# Generate class-specific target masks
masks = np.zeros((self.rois_per_image, self.image_shape[0], self.image_shape[1], self.class_num),
dtype=np.float32)
for i in pos_ids:
class_id = roi_gt_class_ids[i]
assert class_id > 0, "class id must be greater than 0"
gt_id = roi_gt_assignment[i]
class_mask = gt_masks[:, :, gt_id]
if cfg.TRAIN.USE_MINI_MASK:
# Create a mask placeholder, the size of the image
placeholder = np.zeros(self.image_shape[:2], dtype=bool)
# GT box
gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[gt_id]
gt_w = gt_x2 - gt_x1
gt_h = gt_y2 - gt_y1
# Resize mini mask to size of GT box
placeholder[gt_y1:gt_y2, gt_x1:gt_x2] = \
np.round(self.image_utils.resize(class_mask, (gt_h, gt_w))).astype(bool)
# Place the mini mask in the placeholder
class_mask = placeholder
# Pick part of the mask and resize it
y1, x1, y2, x2 = rois[i].astype(np.int32)
m = class_mask[y1:y2, x1:x2]
mask = self.image_utils.resize(m, self.image_shape)
masks[i, :, :, class_id] = mask
return rois, roi_gt_class_ids, bboxes, masks
pass
# #############################################################################################
# test
# #############################################################################################
def detect(self, images_info_list, verbose=0):
"""
Runs the detection pipeline.
:param images_info_list: List of images, potentially of different sizes.
:param verbose:
:return: a list of dicts, one dict per image. The dict contains:
rois: [N, (y1, x1, y2, x2)] detection bounding boxes
class_ids: [N] int class IDs
scores: [N] float probability scores for the class IDs
masks: [H, W, N] instance binary masks
"""
if verbose:
print("processing {} image_info.".format(len(images_info_list)))
for image_info in images_info_list:
print("image_info: {}".format(image_info))
pass
pass
# Mold inputs to format expected by the neural network
molded_images_list, image_metas_list, windows_list = self.image_utils.mode_input(images_info_list)
# Validate image sizes
# All images in a batch MUST be of the same size
image_shape = molded_images_list[0].shape
for g in molded_images_list[1:]:
assert g.shape == image_shape, \
"After resizing, all images must have the same size. Check IMAGE_RESIZE_MODE and image sizes."
pass
# Anchors
anchors = self.anchor_utils.get_anchors(image_shape)
# Duplicate across the batch dimension because Keras requires it
# TODO: can this be optimized to avoid duplicating the anchors?
anchors = np.broadcast_to(anchors, (cfg.TEST.BATCH_SIZE,) + anchors.shape)
if verbose:
print("molded_images_list: ", molded_images_list)
print("image_metas_list: ", image_metas_list)
print("anchors: ", anchors)
pass
# Run object detection
detections, _, _, mrcnn_mask, _, _, _ = \
self.keras_model.predict([molded_images_list, image_metas_list, anchors], verbose=0)
# Process detections
results_list = []
for i, image_info in enumerate(images_info_list):
molded_image_shape = molded_images_list[i].shape
final_rois, final_class_ids, final_scores, final_masks = self.un_mold_detections(detections[i],
mrcnn_mask[i],
image_info.shape,
molded_image_shape,
windows_list[i])
results_list.append({"rois": final_rois,
"class_ids": final_class_ids,
"scores": final_scores,
"masks": final_masks,
})
return results_list
pass
def un_mold_detections(self, detections, mrcnn_mask, original_image_shape,
image_shape, window):
"""
Reformats the detections of one image from the format of the neural
network output to a format suitable for use in the rest of the
application.
:param detections: [N, (y1, x1, y2, x2, class_id, score)] in normalized coordinates
:param mrcnn_mask: [N, height, width, num_classes]
:param original_image_shape: [H, W, C] Original image shape before resizing
:param image_shape: [H, W, C] Shape of the image after resizing and padding
:param window: [y1, x1, y2, x2] Pixel coordinates of box in the image where the real
image is excluding the padding.
:return:
boxes: [N, (y1, x1, y2, x2)] Bounding boxes in pixels
class_ids: [N] Integer class IDs for each bounding box
scores: [N] Float probability scores of the class_id
masks: [height, width, num_instances] Instance masks
"""
# How many detections do we have?
# Detections array is padded with zeros. Find the first class_id == 0.
zero_ix = np.where(detections[:, 4] == 0)[0]
n = zero_ix[0] if zero_ix.shape[0] > 0 else detections.shape[0]
# Extract boxes, class_ids, scores, and class-specific masks
boxes = detections[: n, :4]
class_ids = detections[: n, 4].astype(np.int32)
scores = detections[: n, 5]
masks = mrcnn_mask[np.arange(n), :, :, class_ids]
# Translate normalized coordinates in the resized image to pixel
# coordinates in the original image before resizing
window = self.bbox_util.norm_boxes(window, image_shape[:2])
wy1, wx1, wy2, wx2 = window
shift = np.array([wy1, wx1, wy1, wx1])
wh = wy2 - wy1 # window height
ww = wx2 - wx1 # window width
scale = np.array([wh, ww, wh, ww])
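# Worked example (illustrative numbers): for a 1024 x 1024 molded image whose
# real content occupies window (128, 0, 896, 1024), norm_boxes gives roughly
# window = (0.125, 0.0, 0.875, 1.0), so shift = [0.125, 0, 0.125, 0] and
# scale = [0.75, 1.0, 0.75, 1.0]. The division below then re-expresses the
# detections relative to the window instead of the whole padded image.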
# Convert boxes to normalized coordinates on the window
boxes = np.divide(boxes - shift, scale)
# Convert boxes to pixel coordinates on the original image
boxes = self.bbox_util.denorm_boxes(boxes, original_image_shape[:2])
# Filter out detections with zero area. Happens in early training when
# network weights are still random
exclude_ix = np.where(
(boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0]
if exclude_ix.shape[0] > 0:
boxes = np.delete(boxes, exclude_ix, axis=0)
class_ids = np.delete(class_ids, exclude_ix, axis=0)
scores = np.delete(scores, exclude_ix, axis=0)
masks = np.delete(masks, exclude_ix, axis=0)
n = class_ids.shape[0]
# Resize masks to original image size and set boundary threshold.
full_masks = []
for i in range(n):
# Convert neural network mask to full size mask
full_mask = self.mask_util.unmold_mask(masks[i], boxes[i], original_image_shape)
full_masks.append(full_mask)
pass
full_masks = np.stack(full_masks, axis=-1) if full_masks else np.empty(original_image_shape[:2] + (0,))
return boxes, class_ids, scores, full_masks
pass
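Before moving on to the multi-GPU wrapper, here is a minimal usage sketch for the test path above. It is not part of the project code: it assumes the model class defined in this file has already been instantiated in test mode as mask_model with trained weights loaded, and the image path is illustrative only.
import skimage.io
# Hypothetical usage of detect(); `mask_model` stands for an instance of the
# model class above, built in test mode with weights loaded (assumed setup).
image = skimage.io.imread("./images/demo.jpg")    # [H, W, 3] RGB image
results_list = mask_model.detect([image], verbose=0)
result = results_list[0]
print(result["rois"])           # [N, (y1, x1, y2, x2)] boxes in original-image pixels
print(result["class_ids"])      # [N] integer class IDs
print(result["scores"])         # [N] confidence scores
print(result["masks"].shape)    # [H, W, N] one boolean mask per instance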
#!/usr/bin/env python
# _*_ coding:utf-8 _*_
# ============================================
# @Time : 2020/05/15 12:44
# @Author : WanDaoYi
# @FileName : parallel_model.py
# ============================================
import tensorflow as tf
import keras.backend as K
import keras.layers as KL
import keras.models as KM
class ParallelModel(KM.Model):
"""
Subclasses the standard Keras Model and adds multi-GPU support.
It works by creating a copy of the model on each GPU. Then it slices
the inputs and sends a slice to each copy of the model, and then
merges the outputs together and applies the loss on the combined outputs.
"""
def __init__(self, keras_model, gpu_count):
"""
Class constructor.
:param keras_model: The Keras model to parallelize
:param gpu_count: Number of GPUs. When gpu_count is greater than 1, this wrapper is used to enable multi-GPU training.
"""
self.inner_model = keras_model
self.gpu_count = gpu_count
merged_outputs = self.make_parallel()
super(ParallelModel, self).__init__(inputs=self.inner_model.inputs,
outputs=merged_outputs)
def __getattribute__(self, attrname):
"""
Redirect loading and saving methods to the inner model. That's where the weights are stored
:param attrname:
:return:
"""
if 'load' in attrname or 'save' in attrname:
return getattr(self.inner_model, attrname)
return super(ParallelModel, self).__getattribute__(attrname)
def summary(self, *args, **kwargs):
"""
Override summary() to display summaries of both the wrapper and the inner model.
:param args:
:param kwargs:
:return:
"""
super(ParallelModel, self).summary(*args, **kwargs)
self.inner_model.summary(*args, **kwargs)
def make_parallel(self):
"""
Creates a new wrapper model that consists of multiple replicas of
the original model placed on different GPUs.
:return:
"""
# Slice inputs. Slice inputs on the CPU to avoid sending a copy
# of the full inputs to all GPUs. Saves on bandwidth and memory.
input_slices = {name: tf.split(x, self.gpu_count)
for name, x in zip(self.inner_model.input_names, self.inner_model.inputs)}
output_names = self.inner_model.output_names
outputs_all = []
for i in range(len(self.inner_model.outputs)):
outputs_all.append([])
# Run the model call() on each GPU to place the ops there
for i in range(self.gpu_count):
with tf.device('/gpu:%d' % i):
with tf.name_scope('tower_%d' % i):
# Run a slice of inputs through this replica
zipped_inputs = zip(self.inner_model.input_names,
self.inner_model.inputs)
inputs = [
KL.Lambda(lambda s: input_slices[name][i],
output_shape=lambda s: (None,) + s[1:])(tensor)
for name, tensor in zipped_inputs]
# Create the model replica and get the outputs
outputs = self.inner_model(inputs)
if not isinstance(outputs, list):
outputs = [outputs]
# Save the outputs for merging back together later
for l, o in enumerate(outputs):
outputs_all[l].append(o)
# Merge outputs on CPU
with tf.device('/cpu:0'):
merged = []
for outputs, name in zip(outputs_all, output_names):
# Concatenate or average outputs?
# Outputs usually have a batch dimension and we concatenate
# across it. If they don't, then the output is likely a loss
# or a metric value that gets averaged across the batch.
# Keras expects losses and metrics to be scalars.
if K.int_shape(outputs[0]) == ():
# Average
m = KL.Lambda(lambda o: tf.add_n(o) / len(outputs), name=name)(outputs)
else:
# Concatenate
m = KL.Concatenate(axis=0, name=name)(outputs)
merged.append(m)
return merged
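To make the wrapper's intended use concrete, here is a minimal sketch with a toy Keras model. It is only an illustration: the GPU count would normally come from the project config (the exact field name is not shown here), and actually running it needs at least that many visible GPUs.
import numpy as np
import keras.layers as KL
import keras.models as KM
# Toy single-GPU model (illustrative only).
toy_inputs = KL.Input(shape=(32,), name="toy_input")
toy_hidden = KL.Dense(64, activation="relu", name="toy_dense1")(toy_inputs)
toy_outputs = KL.Dense(10, name="toy_logits")(toy_hidden)
toy_model = KM.Model(toy_inputs, toy_outputs)
gpu_count = 2  # normally taken from the config; hard-coded here for the sketch
if gpu_count > 1:
    # Inputs are split across GPUs and the per-GPU outputs are merged back.
    toy_model = ParallelModel(toy_model, gpu_count)
toy_model.compile(optimizer="sgd", loss="mse")
# load_weights()/save_weights() are redirected to the inner single-GPU model by
# __getattribute__ above, so checkpoints stay usable for single-GPU inference.
toy_model.fit(np.random.rand(64, 32).astype("float32"),
              np.random.rand(64, 10).astype("float32"),
              batch_size=8, epochs=1)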
That's it, we're done here.