借鉴了slim模块的预处理过程,主要步骤如下
def preprocess_for_train(image, labels, bboxes,
out_shape, data_format='NHWC',
scope='ssd_preprocessing_train'):
fast_mode = False
with tf.name_scope(scope, 'ssd_preprocessing_train', [image, labels, bboxes]):
if image.get_shape().ndims != 3:
raise ValueError('Input must be of size [height, width, C>0]')
# 读入的数据可能是uint8,先转换成float32, 范围[0,1]
if image.dtype != tf.float32:
image = tf.image.convert_image_dtype(image, dtype=tf.float32)
tf_summary_image(image, bboxes, 'image_with_bboxes')
# 根据bbox对图像进行裁剪,同时对bbox进行处理
dst_image = image
dst_image, labels, bboxes, distort_bbox = \
distorted_bounding_box_crop(image, labels, bboxes,
min_object_covered=MIN_OBJECT_COVERED,
aspect_ratio_range=CROP_RATIO_RANGE)
# resize,因为bbox是相对位置,resize操作不需要处理bbox。此处实测与直接用tf.resize结果是一样的,不知道为什么作者搞的这么复杂
dst_image = tf_image.resize_image(dst_image, out_shape,
method=tf.image.ResizeMethod.BILINEAR,
align_corners=False)
tf_summary_image(dst_image, bboxes, 'image_shape_distorted')
# 随机左右翻转,同时对bbox进行处理
dst_image, bboxes = tf_image.random_flip_left_right(dst_image, bboxes)
# 随机改变色彩
dst_image = apply_with_random_selector(
dst_image,
lambda x, ordering: distort_color(x, ordering, fast_mode),
num_cases=4)
tf_summary_image(dst_image, bboxes, 'image_color_distorted')
# 把image恢复成[0, 255]范围,并进行中心化(减均值)
image = dst_image * 255.
image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN])
# 如果数据格式为NCHW,需进行转置
if data_format == 'NCHW':
image = tf.transpose(image, perm=(2, 0, 1))
return image, labels, bboxes
def preprocess_for_eval(image, labels, bboxes,
out_shape=EVAL_SIZE, data_format='NHWC',
difficults=None, resize=Resize.WARP_RESIZE,
scope='ssd_preprocessing_train'):
with tf.name_scope(scope):
if image.get_shape().ndims != 3:
raise ValueError('Input must be of size [height, width, C>0]')
image = tf.to_float(image)
image = tf_image_whitened(image, [_R_MEAN, _G_MEAN, _B_MEAN])
# bbox添加一个以整个图片为边界的框
bbox_img = tf.constant([[0., 0., 1., 1.]])
if bboxes is None:
bboxes = bbox_img
else:
bboxes = tf.concat([bbox_img, bboxes], axis=0)
#resize
if resize == Resize.NONE:
# No resizing...
pass
elif resize == Resize.CENTRAL_CROP:
# Central cropping of the image.
image, bboxes = tf_image.resize_image_bboxes_with_crop_or_pad(
image, bboxes, out_shape[0], out_shape[1])
elif resize == Resize.PAD_AND_RESIZE:
# Resize image first: find the correct factor...
shape = tf.shape(image)
factor = tf.minimum(tf.to_double(1.0),
tf.minimum(tf.to_double(out_shape[0] / shape[0]),
tf.to_double(out_shape[1] / shape[1])))
resize_shape = factor * tf.to_double(shape[0:2])
resize_shape = tf.cast(tf.floor(resize_shape), tf.int32)
image = tf_image.resize_image(image, resize_shape,
method=tf.image.ResizeMethod.BILINEAR,
align_corners=False)
# Pad to expected size.
image, bboxes = tf_image.resize_image_bboxes_with_crop_or_pad(
image, bboxes, out_shape[0], out_shape[1])
elif resize == Resize.WARP_RESIZE:
# Warp resize of the image.
image = tf_image.resize_image(image, out_shape,
method=tf.image.ResizeMethod.BILINEAR,
align_corners=False)
# Split back bounding boxes.
bbox_img = bboxes[0]
bboxes = bboxes[1:]
# Remove difficult boxes.
if difficults is not None:
mask = tf.logical_not(tf.cast(difficults, tf.bool))
labels = tf.boolean_mask(labels, mask)
bboxes = tf.boolean_mask(bboxes, mask)
# Image data format.
if data_format == 'NCHW':
image = tf.transpose(image, perm=(2, 0, 1))
return image, labels, bboxes, bbox_img
def distorted_bounding_box_crop(image,
labels,
bboxes,
min_object_covered=0.3,
aspect_ratio_range=(0.9, 1.1),
area_range=(0.1, 1.0),
max_attempts=200,
clip_bboxes=True,
scope=None):
with tf.name_scope(scope, 'distorted_bounding_box_crop', [image, bboxes]):
#随机截取图片,
#bbox_begin: 裁剪框的[offset_height, offset_width, 0]
#bbox_size: 裁剪框的[target_height, target_width, -1]
#distort_bbox: shape为 [1, 1, 4] 的三维矩阵,表示随机变形后的边界框
bbox_begin, bbox_size, distort_bbox = tf.image.sample_distorted_bounding_box(
tf.shape(image), #图像尺寸
bounding_boxes=tf.expand_dims(bboxes, 0), #ground truth边框
min_object_covered=min_object_covered, #裁剪框与其中一个边框的最小重合率
aspect_ratio_range=aspect_ratio_range, #长宽比的范围
area_range=area_range, #裁剪框必须包含该区域内的图像的一部分
max_attempts=max_attempts, #尝试生成图像指定约束的裁剪区域的次数。经过 max_attempts 次失败后,将返回整个图像。
use_image_if_no_bounding_boxes=True)
distort_bbox = distort_bbox[0, 0] #获取边界框,结果有4维,后2维才是实际的坐标
cropped_image = tf.slice(image, bbox_begin, bbox_size) #裁剪图片
cropped_image.set_shape([None, None, 3]) #恢复维度
# 修正bbox,并把与原bbox重合率低于threshold的label和bbox过滤掉
bboxes = tfe.bboxes_resize(distort_bbox, bboxes)
labels, bboxes = tfe.bboxes_filter_overlap(labels, bboxes,
threshold=BBOX_CROP_OVERLAP,
assign_negative=False)
return cropped_image, labels, bboxes, distort_bbox
def bboxes_resize(bbox_ref, bboxes, name=None):
# 处理bboxes是dict的情况
if isinstance(bboxes, dict):
with tf.name_scope(name, 'bboxes_resize_dict'):
d_bboxes = {}
for c in bboxes.keys():
d_bboxes[c] = bboxes_resize(bbox_ref, bboxes[c])
return d_bboxes
# Tensors inputs.
with tf.name_scope(name, 'bboxes_resize'):
# 把bbox的y_min和x_min修改为以bbox_ref的左上角为(0,0)
v = tf.stack([bbox_ref[0], bbox_ref[1], bbox_ref[0], bbox_ref[1]])
bboxes = bboxes - v
# 把坐标的相对值改为bbox_ref的高度和宽度
s = tf.stack([bbox_ref[2] - bbox_ref[0],
bbox_ref[3] - bbox_ref[1],
bbox_ref[2] - bbox_ref[0],
bbox_ref[3] - bbox_ref[1]])
bboxes = bboxes / s
return bboxes
def bboxes_filter_overlap(labels, bboxes,
threshold=0.5, assign_negative=False,
scope=None):
with tf.name_scope(scope, 'bboxes_filter', [labels, bboxes]):
#计算重合率
scores = bboxes_intersection(tf.constant([0, 0, 1, 1], bboxes.dtype),
bboxes)
#把重合率低于threshold的标签置为负,或者把bbox和label删除
mask = scores > threshold
if assign_negative:
labels = tf.where(mask, labels, -labels)
# bboxes = tf.where(mask, bboxes, bboxes)
else:
labels = tf.boolean_mask(labels, mask)
bboxes = tf.boolean_mask(bboxes, mask)
return labels, bboxes
def bboxes_intersection(bbox_ref, bboxes, name=None):
with tf.name_scope(name, 'bboxes_intersection'):
# Should be more efficient to first transpose.
bboxes = tf.transpose(bboxes)
bbox_ref = tf.transpose(bbox_ref)
# 重合部分的左上角和右下角坐标
int_ymin = tf.maximum(bboxes[0], bbox_ref[0])
int_xmin = tf.maximum(bboxes[1], bbox_ref[1])
int_ymax = tf.minimum(bboxes[2], bbox_ref[2])
int_xmax = tf.minimum(bboxes[3], bbox_ref[3])
#重合部分的高和宽
h = tf.maximum(int_ymax - int_ymin, 0.)
w = tf.maximum(int_xmax - int_xmin, 0.)
# 交集面积
inter_vol = h * w
# bbox面积
bboxes_vol = (bboxes[2] - bboxes[0]) * (bboxes[3] - bboxes[1])
#重合部分占bbox的比例
scores = tfe_math.safe_divide(inter_vol, bboxes_vol, 'intersection')
return scores
def random_flip_left_right(image, bboxes, seed=None):
# 左右翻转bbox,bbox形式为(y_min, x_min, y_max, x_max)
def flip_bboxes(bboxes):
bboxes = tf.stack([bboxes[:, 0], 1 - bboxes[:, 3],
bboxes[:, 2], 1 - bboxes[:, 1]], axis=-1)
return bboxes
# 翻转图片,参考tf的官方实现
with tf.name_scope('random_flip_left_right'):
image = ops.convert_to_tensor(image, name='image')
_Check3DImage(image, require_static=False)
uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
mirror_cond = math_ops.less(uniform_random, .5)
# Flip image.
result = control_flow_ops.cond(mirror_cond,
lambda: array_ops.reverse_v2(image, [1]),
lambda: image)
# 添加翻转bbox处理
bboxes = control_flow_ops.cond(mirror_cond,
lambda: flip_bboxes(bboxes),
lambda: bboxes)
return fix_image_flip_shape(image, result), bboxes
def distort_color(image, color_ordering=0, fast_mode=True, scope=None):
with tf.name_scope(scope, 'distort_color', [image]):
if fast_mode:
if color_ordering == 0:
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
else:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
else:
if color_ordering == 0:
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
elif color_ordering == 1:
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
elif color_ordering == 2:
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
elif color_ordering == 3:
image = tf.image.random_hue(image, max_delta=0.2)
image = tf.image.random_saturation(image, lower=0.5, upper=1.5)
image = tf.image.random_contrast(image, lower=0.5, upper=1.5)
image = tf.image.random_brightness(image, max_delta=32. / 255.)
else:
raise ValueError('color_ordering must be in [0, 3]')
# The random_* ops do not necessarily clamp.
return tf.clip_by_value(image, 0.0, 1.0)
def apply_with_random_selector(x, func, num_cases):
sel = tf.random_uniform([], maxval=num_cases, dtype=tf.int32)
# merge:for循环建立了4个通道,只要其中一个通道有合法数据,则从该通道继续向前处理。
# switch:返回两个通道,条件为False时,x传向通道0,为True时传向通道1,此处只取sel等于case为True的通道[1]
return control_flow_ops.merge([
func(control_flow_ops.switch(x, tf.equal(sel, case))[1], case)
for case in range(num_cases)])[0]
def tf_image_whitened(image, means=[_R_MEAN, _G_MEAN, _B_MEAN]):
if image.get_shape().ndims != 3:
raise ValueError('Input must be of size [height, width, C>0]')
num_channels = image.get_shape().as_list()[-1]
if len(means) != num_channels:
raise ValueError('len(means) must match the number of channels')
mean = tf.constant(means, dtype=image.dtype)
image = image - mean
return image