Tensorflow2.0学习(10):TFRecord

Tfrecord简介

  • 什么是Tfrecord?
    TFRecord是TensorFlow中用来存储数据的一种统一格式。TFRecord其实是一种二进制文件,虽然它不如其他格式直观易读,但是它能更好地利用内存,更方便复制和移动,并且不需要单独的标签文件;理论上,它能保存所有的信息。
  • Tfrecord的结构
    TFRecord文件内部包含了多个tf.train.Example,而Example是protocol buffer(protobuf)数据标准的实现。一个Example消息体中包含一个tf.train.Features,其中存放了一系列的feature;每一个feature是一个key-value键值对,其中key是string类型,value是tf.train.Feature,其取值有三种:
    • bytes_list: 可以存储string 和byte两种数据类型。
    • float_list: 可以存储float(float32)与double(float64) 两种数据类型 。
    • int64_list: 可以存储:bool, enum, int32, uint32, int64, uint64 。

Tfrecord实战

  • 导包
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
# Report the TensorFlow build, the interpreter version, and the version of
# every library imported above, so the printed results can be reproduced.
print(tf.__version__)
print(sys.version_info)
for module in (mpl, np, pd, sklearn, tf, keras):
    print(module.__name__, module.__version__)
2.0.0
sys.version_info(major=3, minor=7, micro=6, releaselevel='final', serial=0)
matplotlib 3.1.3
numpy 1.18.1
pandas 1.0.0
sklearn 0.22.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf
  • 生成Example消息体中的tf.train.Features(由多个tf.train.Feature组成)
# TFRecord is a file format. Nesting of the messages stored in it:
#   1. tf.train.Example
#     1.1 tf.train.Features: {"key": tf.train.Feature}
#       1.1.1 tf.train.Feature: wraps one of
#             tf.train.BytesList / tf.train.FloatList / tf.train.Int64List

# BytesList holds bytes, so encode every title as UTF-8 first.
favorite_books = [
    title.encode('utf-8') for title in ['machine learning', 'cc150']
]

# bytes_list payload
favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
print(favorite_books_bytelist)

# float_list payload
hours_floatlist = tf.train.FloatList(value=[15.5, 9.5, 7.0, 8.0])
print(hours_floatlist)

# int64_list payload
age_int64list = tf.train.Int64List(value=[42])
print(age_int64list)

# Wrap each list in a tf.train.Feature and collect the resulting
# key -> Feature pairs into a single tf.train.Features message.
features = tf.train.Features(
    feature={
        "favorite_books": tf.train.Feature(bytes_list=favorite_books_bytelist),
        "hours": tf.train.Feature(float_list=hours_floatlist),
        "age": tf.train.Feature(int64_list=age_int64list),
    }
)
print(features)


value: "machine learning"
value: "cc150"

value: 15.5
value: 9.5
value: 7.0
value: 8.0

value: 42

feature {
  key: "age"
  value {
    int64_list {
      value: 42
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "machine learning"
      value: "cc150"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 15.5
      value: 9.5
      value: 7.0
      value: 8.0
    }
  }
}
  • 生成Example并序列化
# tf.train.Example adds one more wrapper layer around tf.train.Features.
example = tf.train.Example(features=features)
print(example)

# Serialize the Example message to its binary (bytes) representation;
# this byte string is what gets written into the TFRecord file.
serialized_example = example.SerializeToString()
print(serialized_example)
features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 42
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "machine learning"
        value: "cc150"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 15.5
        value: 9.5
        value: 7.0
        value: 8.0
      }
    }
  }
}

b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*'
  • 生成Tfrecords文件
# Store the serialized Example in a TFRecord file on disk.
output_dir = 'tfrecord_basic'
# makedirs(exist_ok=True) creates the directory when it is missing and does
# nothing when it already exists, replacing the race-prone
# "check os.path.exists, then os.mkdir" sequence.
os.makedirs(output_dir, exist_ok=True)
filename = 'test.tfrecords'
filename_fullpath = os.path.join(output_dir, filename)
# Open the TFRecord file and write the serialized record into it.
# The same record is written three times so the file holds three samples.
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    for _ in range(3):
        writer.write(serialized_example)
         
  • 读取Tfrecords为dataset格式
# Load the TFRecord file back as a tf.data.Dataset. Each element is one
# still-serialized Example, so printing it shows the raw bytes.
dataset = tf.data.TFRecordDataset(filenames=[filename_fullpath])
for serialized_example_tensor in dataset:
    print(serialized_example_tensor)
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*', shape=(), dtype=string)
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*', shape=(), dtype=string)
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*', shape=(), dtype=string)
  • 解析Tfrecord文件
# Parsing spec: for every feature key, the kind of feature and dtype to
# decode it as.
expected_feature = {
    "favorite_books": tf.io.VarLenFeature(dtype=tf.string),
    "hours": tf.io.VarLenFeature(dtype=tf.float32),
    "age": tf.io.FixedLenFeature(shape=[], dtype=tf.int64),
}

dataset = tf.data.TFRecordDataset(filenames=[filename_fullpath])
for serialized_example_tensor in dataset:
    # Decode one serialized Example according to the spec above.
    example = tf.io.parse_single_example(
        serialized=serialized_example_tensor,
        features=expected_feature,
    )
    # The VarLenFeature entry is converted from its sparse form to a dense
    # tensor before iterating over the individual titles.
    books = tf.sparse.to_dense(example["favorite_books"])
    for book in books:
        print(book.numpy().decode("UTF-8"))
machine learning
cc150
machine learning
cc150
machine learning
cc150
  • 将tfrecord存成压缩文件并解析
# Store the records in a compressed TFRecord file.
# The data is GZIP-compressed, so the file gets a '.gz' suffix; the previous
# '.zip' suffix named a different container format than the one written.
filename_fullpath_zip = filename_fullpath + '.gz'
# Writer options selecting GZIP compression.
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter(filename_fullpath_zip, options=options) as writer:
    # Write the same serialized Example three times as sample records.
    for _ in range(3):
        writer.write(serialized_example)

# Read the file back, passing the same compression type that was used
# when writing it.
dataset_zip = tf.data.TFRecordDataset([filename_fullpath_zip],
                                      compression_type='GZIP')
for serialized_example_tensor in dataset_zip:
    # Decode one serialized Example using the parsing spec defined earlier.
    example = tf.io.parse_single_example(
        serialized_example_tensor,
        expected_feature)
    # Convert the sparse favorite_books feature to a dense tensor so the
    # individual titles can be iterated and printed.
    books = tf.sparse.to_dense(example["favorite_books"])
    for book in books:
        print(book.numpy().decode("UTF-8"))
machine learning
cc150
machine learning
cc150
machine learning
cc150

你可能感兴趣的:(Tensorflow2.0学习(10):TFRecord)