The code is located at im2txt/data/build_mscoco_data.py. The data preprocessing works as follows:
the MS COCO images are stored in data/mscoco/raw-data/train2014
and data/mscoco/raw-data/val2014,
and the caption files are stored in data/mscoco/raw-data/annotations/captions_train2014.json
and data/mscoco/raw-data/annotations/captions_val2014.json.
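Before running the script it can be worth checking that this layout is actually in place. A small optional sketch (the paths are simply the ones listed above):

import os

# Optional sanity check that the raw MS COCO data is where the script expects it.
paths = [
    "data/mscoco/raw-data/train2014",
    "data/mscoco/raw-data/val2014",
    "data/mscoco/raw-data/annotations/captions_train2014.json",
    "data/mscoco/raw-data/annotations/captions_val2014.json",
]
for p in paths:
    print(p, "OK" if os.path.exists(p) else "MISSING")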
The preprocessing command is as follows:
# --train_image_dir:          MS COCO training set image directory
# --val_image_dir:            MS COCO validation set image directory
# --train_captions_file:      MS COCO training set captions file
# --val_captions_file:        MS COCO validation set captions file
# --output_dir:               directory for the preprocessed TFRecord files
# --word_counts_output_file:  vocabulary file, one "word count" pair per line
python im2txt/data/build_mscoco_data.py \
  --train_image_dir="data/mscoco/raw-data/train2014" \
  --val_image_dir="data/mscoco/raw-data/val2014" \
  --train_captions_file="data/mscoco/raw-data/annotations/captions_train2014.json" \
  --val_captions_file="data/mscoco/raw-data/annotations/captions_val2014.json" \
  --output_dir="data/mscoco" \
  --word_counts_output_file="data/mscoco/word_counts.txt"
The main function of the script is as follows:
def main(unused_argv):
  def _is_valid_num_shards(num_shards):
    """Returns True if num_shards is compatible with FLAGS.num_threads."""
    return num_shards < FLAGS.num_threads or not num_shards % FLAGS.num_threads

  assert _is_valid_num_shards(FLAGS.train_shards), (
      "Please make the FLAGS.num_threads commensurate with FLAGS.train_shards")
  assert _is_valid_num_shards(FLAGS.val_shards), (
      "Please make the FLAGS.num_threads commensurate with FLAGS.val_shards")
  assert _is_valid_num_shards(FLAGS.test_shards), (
      "Please make the FLAGS.num_threads commensurate with FLAGS.test_shards")

  if not tf.gfile.IsDirectory(FLAGS.output_dir):
    tf.gfile.MakeDirs(FLAGS.output_dir)

  # Load image metadata from caption files.
  # Open the MS COCO training captions file and extract (image id, image path,
  # reference captions) for every image, organized as a list of namedtuples.
  mscoco_train_dataset = _load_and_process_metadata(FLAGS.train_captions_file,
                                                    FLAGS.train_image_dir)
  # Do the same for the MS COCO validation captions file.
  mscoco_val_dataset = _load_and_process_metadata(FLAGS.val_captions_file,
                                                  FLAGS.val_image_dir)

  # Redistribute the MSCOCO data as follows:
  #   train_dataset = 100% of mscoco_train_dataset + 85% of mscoco_val_dataset.
  #   val_dataset = 5% of mscoco_val_dataset (for validation during training).
  #   test_dataset = 10% of mscoco_val_dataset (for final evaluation).
  # I.e. the original MS COCO train/val splits are re-partitioned into the
  # train/val/test splits that are actually used here.
  train_cutoff = int(0.85 * len(mscoco_val_dataset))
  val_cutoff = int(0.90 * len(mscoco_val_dataset))
  train_dataset = mscoco_train_dataset + mscoco_val_dataset[0:train_cutoff]
  val_dataset = mscoco_val_dataset[train_cutoff:val_cutoff]
  test_dataset = mscoco_val_dataset[val_cutoff:]

  # Create vocabulary from the training captions.
  # Collect all reference captions of the (re-partitioned) training images.
  train_captions = [c for image in train_dataset for c in image.captions]
  # Build the vocabulary dictionary from these captions.
  vocab = _create_vocab(train_captions)

  # Process each split and write it to TFRecord files.
  _process_dataset("train", train_dataset, vocab, FLAGS.train_shards)
  _process_dataset("val", val_dataset, vocab, FLAGS.val_shards)
  _process_dataset("test", test_dataset, vocab, FLAGS.test_shards)
The _load_and_process_metadata(captions_file, image_dir) function loads this data and organizes each image as [image id, image path, reference captions] (a list of ImageMetadata). The code is as follows:
# ImageMetadata is a namedtuple holding this information, so the fields can be
# accessed by name (image_id, filename, captions).
ImageMetadata = namedtuple("ImageMetadata", ["image_id", "filename", "captions"])


def _load_and_process_metadata(captions_file, image_dir):
  """Loads image metadata from a JSON file and processes the captions.

  Args:
    captions_file: JSON file containing caption annotations.
    image_dir: Directory containing the image files.

  Returns:
    A list of ImageMetadata: [image id, image path, reference captions].
  """
  with tf.gfile.FastGFile(captions_file, "r") as f:
    caption_data = json.load(f)

  # Extract the filenames.
  id_to_filename = [(x["id"], x["file_name"]) for x in caption_data["images"]]

  # Extract the captions. Each image_id is associated with multiple captions.
  # Collect every image and its reference captions.
  id_to_captions = {}
  for annotation in caption_data["annotations"]:
    # Image id.
    image_id = annotation["image_id"]
    # One caption for that image; most images have 5 captions, a few have 6 or 7.
    caption = annotation["caption"]
    id_to_captions.setdefault(image_id, [])
    id_to_captions[image_id].append(caption)

  assert len(id_to_filename) == len(id_to_captions)
  assert set([x[0] for x in id_to_filename]) == set(id_to_captions.keys())
  print("Loaded caption metadata for %d images from %s" %
        (len(id_to_filename), captions_file))

  # Process the captions and combine the data into a list of ImageMetadata.
  print("Processing captions.")
  image_metadata = []
  num_captions = 0
  for image_id, base_filename in id_to_filename:
    # Build the full path of the image.
    filename = os.path.join(image_dir, base_filename)
    # Tokenize each caption with nltk; every caption is wrapped with the start
    # and end word markers (FLAGS.start_word / FLAGS.end_word).
    captions = [_process_caption(c) for c in id_to_captions[image_id]]
    # Assemble the record [image id, image path, reference captions].
    image_metadata.append(ImageMetadata(image_id, filename, captions))
    num_captions += len(captions)
  print("Finished processing %d captions for %d images in %s" %
        (num_captions, len(id_to_filename), captions_file))

  return image_metadata


def _process_caption(caption):
  """Processes a caption string into a list of tokenized words.

  Args:
    caption: A string caption.

  Returns:
    A list of strings; the tokenized caption.
  """
  tokenized_caption = [FLAGS.start_word]
  tokenized_caption.extend(nltk.tokenize.word_tokenize(caption.lower()))
  tokenized_caption.append(FLAGS.end_word)
  return tokenized_caption
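For reference, a small hypothetical illustration of what _process_caption produces. It assumes the default start/end words "&lt;S&gt;" and "&lt;/S&gt;" and that the nltk "punkt" tokenizer data is installed; the caption string is made up:

import nltk

caption = "A man riding a wave on top of a surfboard."
tokens = ["<S>"] + nltk.tokenize.word_tokenize(caption.lower()) + ["</S>"]
print(tokens)
# ['<S>', 'a', 'man', 'riding', 'a', 'wave', 'on', 'top', 'of', 'a', 'surfboard', '.', '</S>']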
The re-partitioning itself is nothing special.
Once the previous step has produced the training set we actually use, we build a vocabulary from the reference captions of all training images (based on word frequency) and create a word_to_id
dictionary, so that the model can look up a word's id and then use that id to fetch the word's embedding vector. This is done by _create_vocab(captions)
(before calling it, the reference captions of all training images are collected into captions). The code is as follows:
def _create_vocab(captions):
  """Creates the vocabulary of word to word_id.

  The vocabulary is saved to disk in a text file of word counts. The id of each
  word in the file is its corresponding 0-based line number.

  Args:
    captions: A list of lists of strings; all training captions.

  Returns:
    A Vocabulary object.
  """
  print("Creating vocabulary.")
  # Count word frequencies over all captions with a Counter.
  counter = Counter()
  for c in captions:
    counter.update(c)
  print("Total words:", len(counter))

  # Filter uncommon words and sort by descending count.
  # Keep only words whose count is at least FLAGS.min_word_count, stored as
  # (word, count) pairs.
  word_counts = [x for x in counter.items() if x[1] >= FLAGS.min_word_count]
  # Sort by count in descending order.
  word_counts.sort(key=lambda x: x[1], reverse=True)
  print("Words in vocabulary:", len(word_counts))

  # Write out the word counts file.
  # Save each word and its count to the vocabulary file.
  with tf.gfile.FastGFile(FLAGS.word_counts_output_file, "w") as f:
    f.write("\n".join(["%s %d" % (w, c) for w, c in word_counts]))
  print("Wrote vocabulary file:", FLAGS.word_counts_output_file)

  # Create the vocabulary dictionary.
  # Build the word_to_id dictionary.
  # Keep only the sorted words; drop the counts.
  reverse_vocab = [x[0] for x in word_counts]
  unk_id = len(reverse_vocab)
  # x is the word, y is its position in the sorted vocabulary.
  vocab_dict = dict([(x, y) for (y, x) in enumerate(reverse_vocab)])
  # Any word not in the vocabulary is mapped to the unknown-word id (unk_id).
  vocab = Vocabulary(vocab_dict, unk_id)

  return vocab
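The Vocabulary class is not reproduced in this post. A minimal sketch of what it needs to provide, given how it is used above and in _to_sequence_example below (a word_to_id lookup that falls back to unk_id for out-of-vocabulary words), might look like:

class Vocabulary(object):
  """Minimal sketch of the vocabulary wrapper assumed above."""

  def __init__(self, vocab, unk_id):
    # vocab: dict mapping word -> id; unk_id: id used for unknown words.
    self._vocab = vocab
    self._unk_id = unk_id

  def word_to_id(self, word):
    # Return the word's id, or unk_id if the word is not in the vocabulary.
    if word in self._vocab:
      return self._vocab[word]
    return self._unk_id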
The data is then processed by _process_dataset(name, images, vocab, num_shards), which uses multiple threads for processing and storage. Here images is the (image id, image path, reference captions) data produced by the split in step 2, vocab is the word_to_id dictionary built in step 3, and num_shards is the number of TFRecord files to write (train: 256, val: 4, test: 8).
The relevant code is as follows:
def _process_dataset(name, images, vocab, num_shards):
  """Processes a complete data set and saves it as a TFRecord.

  Args:
    name: Unique identifier specifying the dataset.
    images: List of ImageMetadata.
    vocab: A Vocabulary object.
    num_shards: Integer number of shards for the output files.
  """
  # Break up each image into a separate entity for each caption.
  # Before this, each record held one image together with all of its captions;
  # here every caption becomes its own record of one image plus one caption
  # (so the image entries are duplicated).
  images = [ImageMetadata(image.image_id, image.filename, [caption])
            for image in images for caption in image.captions]

  # Shuffle the ordering of images. Make the randomization repeatable.
  random.seed(12345)
  random.shuffle(images)

  # Break the images into num_threads batches. Batch i is defined as
  # images[ranges[i][0]:ranges[i][1]].
  # Number of worker threads.
  num_threads = min(num_shards, FLAGS.num_threads)
  # Compute the start/end indices of the data handled by each thread.
  spacing = np.linspace(0, len(images), num_threads + 1).astype(np.int)
  ranges = []
  threads = []
  for i in xrange(len(spacing) - 1):
    ranges.append([spacing[i], spacing[i + 1]])

  # Create a mechanism for monitoring when all threads are finished.
  # TensorFlow thread coordinator.
  coord = tf.train.Coordinator()

  # Create a utility for decoding JPEG images to run sanity checks.
  # ImageDecoder turns a raw image byte string into RGB data: as in the earlier
  # post, the image is read with tf.gfile.FastGFile().read() and decoded with
  # tf.image.decode_jpeg(image, channels=3).
  decoder = ImageDecoder()

  # Launch a thread for each batch.
  print("Launching %d threads for spacings: %s" % (num_threads, ranges))
  for thread_index in xrange(len(ranges)):
    # Arguments for each worker: thread index, per-thread index ranges, split
    # name, all image records, the image decoder, the word_to_id vocabulary,
    # and the number of TFRecord shards.
    args = (thread_index, ranges, name, images, decoder, vocab, num_shards)
    t = threading.Thread(target=_process_image_files, args=args)
    t.start()
    threads.append(t)

  # Wait for all the threads to terminate.
  coord.join(threads)
  print("%s: Finished processing all %d image-caption pairs in data set '%s'." %
        (datetime.now(), len(images), name))
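To make the index bookkeeping concrete, here is a small hypothetical illustration of how np.linspace splits the shuffled image list into per-thread ranges (the numbers are made up for the example):

import numpy as np

# Hypothetical example: 1000 image-caption pairs split across 4 threads.
spacing = np.linspace(0, 1000, 4 + 1).astype(int)
ranges = [[spacing[i], spacing[i + 1]] for i in range(len(spacing) - 1)]
print(spacing)  # [   0  250  500  750 1000]
print(ranges)   # [[0, 250], [250, 500], [500, 750], [750, 1000]]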
# Thread worker function.
def _process_image_files(thread_index, ranges, name, images, decoder, vocab,
                         num_shards):
  """Processes and saves a subset of images as TFRecord files in one thread.

  Args:
    thread_index: Integer thread identifier within [0, len(ranges)].
    ranges: A list of pairs of integers specifying the ranges of the dataset to
      process in parallel.
    name: Unique identifier specifying the dataset.
    images: List of ImageMetadata.
    decoder: An ImageDecoder object.
    vocab: A Vocabulary object.
    num_shards: Integer number of shards for the output files.
  """
  # Each thread produces N shards where N = num_shards / num_threads. For
  # instance, if num_shards = 128, and num_threads = 2, then the first thread
  # would produce shards [0, 64).
  # In total num_shards TFRecord files are written by num_threads threads, so
  # each thread writes num_shards / num_threads files (the division must be
  # exact, hence the assert).
  num_threads = len(ranges)
  assert not num_shards % num_threads
  num_shards_per_batch = int(num_shards / num_threads)

  # This thread handles the images from ranges[thread_index][0] to
  # ranges[thread_index][1] and writes num_shards_per_batch TFRecord files,
  # so its range is split again to get the indices that go into each shard.
  shard_ranges = np.linspace(ranges[thread_index][0], ranges[thread_index][1],
                             num_shards_per_batch + 1).astype(int)
  # Number of images handled by this thread.
  num_images_in_thread = ranges[thread_index][1] - ranges[thread_index][0]

  counter = 0
  # Process and write one TFRecord shard at a time.
  for s in xrange(num_shards_per_batch):
    # Generate a sharded version of the file name, e.g. 'train-00002-of-00010'
    shard = thread_index * num_shards_per_batch + s
    output_filename = "%s-%.5d-of-%.5d" % (name, shard, num_shards)
    output_file = os.path.join(FLAGS.output_dir, output_filename)
    # TFRecord writer for this shard.
    writer = tf.python_io.TFRecordWriter(output_file)

    shard_counter = 0
    # Indices of the records that go into this shard.
    images_in_shard = np.arange(shard_ranges[s], shard_ranges[s + 1], dtype=int)
    for i in images_in_shard:
      # Record i: image id, image path and a single reference caption.
      image = images[i]

      # Build the SequenceExample for this image-caption pair.
      sequence_example = _to_sequence_example(image, decoder, vocab)
      if sequence_example is not None:
        # Write the record to the TFRecord file.
        # shard_counter counts records in this shard; counter counts records
        # handled by this thread.
        writer.write(sequence_example.SerializeToString())
        shard_counter += 1
        counter += 1

      if not counter % 1000:
        print("%s [thread %d]: Processed %d of %d items in thread batch." %
              (datetime.now(), thread_index, counter, num_images_in_thread))
        sys.stdout.flush()

    writer.close()
    print("%s [thread %d]: Wrote %d image-caption pairs to %s" %
          (datetime.now(), thread_index, shard_counter, output_file))
    sys.stdout.flush()
    shard_counter = 0
  print("%s [thread %d]: Wrote %d image-caption pairs to %d shards." %
        (datetime.now(), thread_index, counter, num_shards_per_batch))
  sys.stdout.flush()
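A small hypothetical example of the shard file names one thread produces, assuming num_shards = 256 and num_threads = 8 (so 32 shards per thread):

# Hypothetical shard names written by thread 1 of 8 for the "train" split.
num_shards, num_threads = 256, 8
num_shards_per_batch = num_shards // num_threads
thread_index = 1
for s in range(num_shards_per_batch):
    shard = thread_index * num_shards_per_batch + s
    print("train-%.5d-of-%.5d" % (shard, num_shards))
# train-00032-of-00256, train-00033-of-00256, ..., train-00063-of-00256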
# Build one image-caption pair into a SequenceExample.
def _to_sequence_example(image, decoder, vocab):
  """Builds a SequenceExample proto for an image-caption pair.

  Args:
    image: An ImageMetadata object.
    decoder: An ImageDecoder object.
    vocab: A Vocabulary object.

  Returns:
    A SequenceExample proto.
  """
  # Read the image at image.filename as a raw byte string; we still need to
  # make sure these bytes can be decoded into RGB data.
  with tf.gfile.FastGFile(image.filename, "r") as f:
    encoded_image = f.read()

  # This is only a sanity check; the decoded output is discarded here. The
  # decoding into RGB data happens later, in the model's input pipeline.
  try:
    decoder.decode_jpeg(encoded_image)
  except (tf.errors.InvalidArgumentError, AssertionError):
    print("Skipping file with invalid JPEG data: %s" % image.filename)
    return

  # 1. Context features: the image id and the raw image bytes.
  context = tf.train.Features(feature={
      "image/image_id": _int64_feature(image.image_id),
      "image/data": _bytes_feature(encoded_image),
  })

  assert len(image.captions) == 1
  caption = image.captions[0]
  # 2. Sequence features: the caption words and their ids, obtained by mapping
  # each word through the word_to_id vocabulary (i.e. its index in the
  # vocabulary).
  caption_ids = [vocab.word_to_id(word) for word in caption]
  feature_lists = tf.train.FeatureLists(feature_list={
      "image/caption": _bytes_feature_list(caption),
      "image/caption_ids": _int64_feature_list(caption_ids)
  })
  sequence_example = tf.train.SequenceExample(
      context=context, feature_lists=feature_lists)

  return sequence_example
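When the model later consumes these TFRecord files, each serialized SequenceExample has to be parsed back into its context and sequence features. A minimal sketch of such a parser (assuming TensorFlow 1.x and the feature keys used above) might look like:

import tensorflow as tf

def parse_sequence_example(serialized):
  # Sketch: parse one serialized SequenceExample written by this script.
  context, sequence = tf.parse_single_sequence_example(
      serialized,
      context_features={
          "image/data": tf.FixedLenFeature([], dtype=tf.string)
      },
      sequence_features={
          "image/caption_ids": tf.FixedLenSequenceFeature([], dtype=tf.int64)
      })
  encoded_image = context["image/data"]   # raw JPEG bytes
  caption_ids = sequence["image/caption_ids"]  # word ids of the caption
  return encoded_image, caption_ids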
A note on namedtuple: it is somewhat similar to a struct in C or C++, in that elements can be accessed by field name:

from collections import namedtuple

Student = namedtuple('Student', ['name', 'age'])

students = []
students.append(Student('xiaoming', 20))
students.append(Student('xiaofa', 19))
for msg in students:
    print(msg.name, msg.age)