# Unsupervised Monocular Depth Estimation with Left-Right Consistency: monodepth_dataloader.py
# Source: monodepth
"""
Annotated copy of the code, kept for study purposes (comments only).
song
stay hungry stay foolish
"""
from __future__ import absolute_import, division, print_function
import tensorflow as tf
def string_length_tf(t):
    """Measure the length of the string held by tensor ``t``.

    The Python builtin ``len`` is wrapped with ``tf.py_func`` so that it is
    evaluated on the tensor's value when the graph runs.

    Args:
        t: tensor passed as the single input of the wrapped ``len`` call.

    Returns:
        The value returned by ``tf.py_func`` for the output list
        ``[tf.int64]``; callers in this file take element ``[0]`` of it.
    """
    length_outputs = tf.py_func(len, [t], [tf.int64])
    return length_outputs
class MonodepthDataloader(object):
    """monodepth dataloader

    Builds, at construction time, the TensorFlow graph that reads image
    paths from a list file, decodes and resizes the images and exposes them
    as ``left_image_batch`` / ``right_image_batch``.
    """

    def __init__(self, data_path, filenames_file, params, dataset, mode):
        """Build the input graph for the requested mode.

        Args:
            data_path: string joined (with ``tf.string_join``, no separator
                added) in front of every path read from the list file.
            filenames_file: text file read line by line; each line is split
                with ``tf.string_split`` and its first token (and second
                token, when the right image is needed) is used as a path.
            params: object providing ``do_stereo``, ``batch_size``,
                ``num_threads``, ``height`` and ``width``.
            dataset: dataset name; ``'cityscapes'`` enables the crop done in
                ``read_image``.
            mode: ``'train'`` or ``'test'``. For any other value both batch
                attributes are left as ``None``.
        """
        self.data_path = data_path
        self.params = params
        self.dataset = dataset
        self.mode = mode  # run mode, not the model

        # Both stay None unless the matching mode branch below assigns them.
        self.left_image_batch = None
        self.right_image_batch = None

        # Queue that feeds the list file to the line reader. shuffle=False
        # keeps the given order (string_input_producer shuffles by default).
        filename_queue = tf.train.string_input_producer([filenames_file], shuffle=False)

        # Each read returns a (key, value) pair; only the value is used.
        # Example pair for the KITTI list file:
        #   key:   b'kitti_train_files.txt:11987'
        #   value: b'2011_09_30/2011_09_30_drive_0033_sync/image_02/data/0000001585.jpg 2011_09_30/2011_09_30_drive_0033_sync/image_03/data/0000001585.jpg'
        text_reader = tf.TextLineReader()
        _, line = text_reader.read(filename_queue)

        # Separate the two image paths stored on the line.
        path_tokens = tf.string_split([line]).values

        # we load only one image for test, except if we trained a stereo model
        if mode != 'test' or self.params.do_stereo:
            # Training, or stereo testing: both views are needed.
            left_image_path = tf.string_join([self.data_path, path_tokens[0]])
            right_image_path = tf.string_join([self.data_path, path_tokens[1]])
            left_image_o = self.read_image(left_image_path)
            right_image_o = self.read_image(right_image_path)
        else:
            # Monocular testing: only the left view is read.
            left_image_path = tf.string_join([self.data_path, path_tokens[0]])
            left_image_o = self.read_image(left_image_path)

        if mode == 'train':
            # randomly flip images (data augmentation): when flipping, the
            # mirrored right image becomes the left one and vice versa.
            flip_draw = tf.random_uniform([], 0, 1)
            flipped_left = tf.cond(
                flip_draw > 0.5,
                lambda: tf.image.flip_left_right(right_image_o),
                lambda: left_image_o)
            flipped_right = tf.cond(
                flip_draw > 0.5,
                lambda: tf.image.flip_left_right(left_image_o),
                lambda: right_image_o)

            # randomly augment images (data augmentation)
            augment_draw = tf.random_uniform([], 0, 1)
            left_image, right_image = tf.cond(
                augment_draw > 0.5,
                lambda: self.augment_image_pair(flipped_left, flipped_right),
                lambda: (flipped_left, flipped_right))

            # Record the static shape: height and width unknown, 3 channels.
            left_image.set_shape([None, None, 3])
            right_image.set_shape([None, None, 3])

            # capacity = min_after_dequeue + (num_threads + a small safety margin) * batch_size
            # min_after_dequeue is the number of elements kept in the queue
            # after a dequeue; a larger value mixes more thoroughly but
            # starts more slowly and uses more memory.
            min_after_dequeue = 2048
            capacity = min_after_dequeue + 4 * params.batch_size

            # Pull out the tensors needed for one training batch.
            image_batches = tf.train.shuffle_batch(
                [left_image, right_image],
                batch_size=params.batch_size,
                capacity=capacity,
                min_after_dequeue=min_after_dequeue,
                num_threads=params.num_threads)
            self.left_image_batch, self.right_image_batch = image_batches

        elif mode == 'test':
            # Stack the image with its mirrored copy along a new axis 0, so
            # the batch holds two images.
            mirrored_left = tf.image.flip_left_right(left_image_o)
            self.left_image_batch = tf.stack([left_image_o, mirrored_left], 0)
            self.left_image_batch.set_shape([2, None, None, 3])

            if self.params.do_stereo:
                # Stereo testing: give the right image the same treatment.
                mirrored_right = tf.image.flip_left_right(right_image_o)
                self.right_image_batch = tf.stack([right_image_o, mirrored_right], 0)
                self.right_image_batch.set_shape([2, None, None, 3])

    def augment_image_pair(self, left_image, right_image):
        """Apply one random photometric augmentation to both images.

        The same gamma, brightness and per-channel colour factors are used
        for the left and the right image.

        Args:
            left_image: left image tensor.
            right_image: right image tensor.

        Returns:
            Tuple ``(left_image_aug, right_image_aug)`` with values clipped
            to the range [0, 1].
        """
        # randomly shift gamma: exponent drawn between 0.8 and 1.2
        gamma = tf.random_uniform([], 0.8, 1.2)
        left_image_aug = left_image ** gamma
        right_image_aug = right_image ** gamma

        # randomly shift brightness: factor drawn between 0.5 and 2.0
        brightness = tf.random_uniform([], 0.5, 2.0)
        left_image_aug = left_image_aug * brightness
        right_image_aug = right_image_aug * brightness

        # randomly shift color: one factor per channel, between 0.8 and 1.2
        channel_factors = tf.random_uniform([3], 0.8, 1.2)
        rows = tf.shape(left_image)[0]
        cols = tf.shape(left_image)[1]
        # Plane of ones with the left image's first two dimensions.
        white = tf.ones([rows, cols])
        color_planes = []
        for channel in range(3):
            color_planes.append(white * channel_factors[channel])
        color_image = tf.stack(color_planes, axis=2)
        left_image_aug = left_image_aug * color_image
        right_image_aug = right_image_aug * color_image

        # saturate: keep pixel values inside [0, 1]
        left_image_aug = tf.clip_by_value(left_image_aug, 0, 1)
        right_image_aug = tf.clip_by_value(right_image_aug, 0, 1)

        return left_image_aug, right_image_aug

    def read_image(self, image_path):
        """Read, decode, optionally crop, and resize one image.

        Args:
            image_path: string tensor holding the path of the image file.

        Returns:
            float32 image tensor resized to ``params.height`` x
            ``params.width``.
        """
        # tf.decode_image does not return the image size, this is an ugly workaround to handle both jpeg and png
        path_length = string_length_tf(image_path)[0]
        # The last three characters of the path are taken as the extension.
        extension = tf.substr(image_path, path_length - 3, 3)
        is_jpg = tf.equal(extension, 'jpg')

        def _decode_as_jpeg():
            return tf.image.decode_jpeg(tf.read_file(image_path))

        def _decode_as_png():
            return tf.image.decode_png(tf.read_file(image_path))

        # 'jpg' goes to the JPEG decoder; every other extension to the PNG one.
        image = tf.cond(is_jpg, _decode_as_jpeg, _decode_as_png)

        # if the dataset is cityscapes, we crop the last fifth to remove the car hood
        if self.dataset == 'cityscapes':
            full_height = tf.shape(image)[0]
            kept_rows = (full_height * 4) // 5
            image = image[:kept_rows, :, :]  # keep the top 4/5 of the rows

        # Convert to float32, then resize to the configured resolution.
        image = tf.image.convert_image_dtype(image, tf.float32)
        target_size = [self.params.height, self.params.width]
        image = tf.image.resize_images(image, target_size, tf.image.ResizeMethod.AREA)

        return image
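"""
Overall input flow, as implemented in monodepth_dataloader.py:
Train:
    list-file line -> left_image_o / right_image_o -> left_image / right_image (random flip and augmentation,
    then set_shape to a 3-D [H, W, 3] tensor) -> shuffle_batch -> left_image_batch / right_image_batch
Test:
    list-file line -> left_image_o -> left_image_batch (image stacked with its flipped copy, 4-D [2, H, W, 3])
"""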