TensorFlow Study Notes (4): the tf.data API

  1. tf.data.Dataset
  2. Reading CSV files into a dataset for training
  3. TFRecord

1. tf.data.Dataset

The typical tf.data.Dataset workflow is:
(1) create a dataset from source data;
(2) preprocess the data;
(3) iterate over the dataset and consume the elements (a minimal sketch follows).
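A minimal sketch of these three steps, using an in-memory NumPy array as the source:

# (1) create a dataset from source data
dataset = tf.data.Dataset.from_tensor_slices(np.arange(10))
# (2) preprocess: square each element and group into batches of 4
dataset = dataset.map(lambda x: x * x).batch(4)
# (3) iterate over the dataset
for batch in dataset:
    print(batch.numpy())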

1.1 Source Datasets

(1) Create from an array, list, dict, etc.; the input is converted to tensors

# Create a dataset; from_tensor_slices accepts arrays, lists, dicts, etc.
dataset = tf.data.Dataset.from_tensor_slices(np.arange(10))
for element in dataset:
    print(element)

(2) Create with tf.data.TextLineDataset (one element per text line)

dataset = tf.data.TextLineDataset(["file1.txt", "file2.txt"]) 

(3) Create with tf.data.TFRecordDataset

dataset = tf.data.TFRecordDataset(["file1.tfrecords", "file2.tfrecords"]) 

(4) Create a Dataset of file names from a glob pattern

dataset = tf.data.Dataset.list_files("/path/*.txt")

1.2 Methods

①. as_numpy_iterator(): converts every element of the dataset to NumPy and returns an iterator

print(list(dataset.as_numpy_iterator()))
"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] """

②. batch(batch_size, drop_remainder=False): groups consecutive elements into batches and returns a new dataset. With drop_remainder=True, the final partial batch (the array([9]) in the output below) is discarded; a comparison sketch follows the code.

dataset = dataset.batch(3)
print(list(dataset.as_numpy_iterator()))
"""[array([0, 1, 2]), array([3, 4, 5]), array([6, 7, 8]), array([9])]"""

③. interleave(map_func, cycle_length=AUTOTUNE, block_length=1, num_parallel_calls=None): takes cycle_length input elements at a time, maps each through map_func into a new dataset, then cycles over those datasets, pulling block_length elements from each in turn; a small numeric sketch follows the file-reading example below.

# Preprocess 4 files concurrently, and interleave blocks of 16 records 
# from each file. 
filenames = ["/var/data/file1.txt", "/var/data/file2.txt", 
             "/var/data/file3.txt", "/var/data/file4.txt"] 
dataset = tf.data.Dataset.from_tensor_slices(filenames) 
def parse_fn(filename): 
  return tf.data.Dataset.range(10) 
dataset = dataset.interleave(lambda x: 
    tf.data.TextLineDataset(x).map(parse_fn, num_parallel_calls=1), 
    cycle_length=4, block_length=16) 
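The effect of cycle_length and block_length is easier to see on a small in-memory sketch (independent of the file-reading code above):

dataset = tf.data.Dataset.range(3)
dataset = dataset.interleave(
    lambda x: tf.data.Dataset.from_tensors(x).repeat(4),
    cycle_length=2, block_length=2)
print(list(dataset.as_numpy_iterator()))
"""[0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 2, 2]"""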

2. Reading CSV files into a dataset for training

2.1 Generating the CSV files

(1) Imports

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import sys
import time
import tensorflow as tf
from tensorflow import keras
import pprint

print(tf.__version__)
print(sys.version_info)
for module in mpl,np,pd,sklearn,tf,keras:
    print(module.__name__,module.__version__)

(2) Load the data

from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
print(housing.DESCR)
print(housing.data.shape)
print(housing.target.shape)

(3) Split and standardize the data

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

x_train_all,x_test,y_train_all,y_test = train_test_split(
    housing.data, housing.target,random_state = 42)
x_train,x_valid,y_train,y_valid = train_test_split(
    x_train_all,y_train_all,random_state = 42)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

(4) Save the NumPy arrays as multiple CSV files

output_dir = os.path.join("generate_csv")
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

# Split `data` into n_parts CSV files named "{name_prefix}_{part_idx:02d}.csv"
def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []

    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            for row_index in row_indices:
                f.write(",".join([repr(col) for col in data[row_index]]))
                f.write('\n')
    return filenames
    
    
train_data = np.c_[x_train_scaled,y_train]
valid_data = np.c_[x_valid_scaled,y_valid]
test_data = np.c_[x_test_scaled,y_test]
header_cols = housing.feature_names + ["MedianHouseValue"]
header_str = ",".join(header_cols)

train_filenames = save_to_csv(output_dir,train_data,"train",header_str,n_parts=20)
valid_filenames = save_to_csv(output_dir,valid_data,"valid",header_str,n_parts=10)
test_filenames = save_to_csv(output_dir,test_data,"test",header_str,n_parts=5)
""" test_filenames:
['generate_csv\\test_00.csv',
 'generate_csv\\test_01.csv',
 'generate_csv\\test_02.csv',
 'generate_csv\\test_03.csv',
 'generate_csv\\test_04.csv']"""
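As a quick optional sanity check, one of the generated shards can be read back with pandas (pd was imported above):

print(pd.read_csv(test_filenames[0]).head(3))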

(5) Reading the CSV files
① Build a dataset of all the file names.
② Use interleave to read the files behind those names, turn each into a line dataset, and merge them into one.
③ The lines come back as strings, so each line must be parsed into numeric tensors.

def parse_csv_line(line, n_fields=9):
    # one NaN default per column; this also fixes the parsed dtype to float32
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])  # first 8 columns -> features
    y = tf.stack(parsed_fields[-1:])   # last column -> label
    return x, y

def csv_reader_dataset(filenames, n_readers=5, batch_size=32,
                       n_parse_threads=5, shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)  # dataset of file names
    # repeat() with no argument repeats indefinitely; interleave keeps pulling
    # file names from this dataset, and model.fit later bounds each epoch with
    # steps_per_epoch
    dataset = dataset.repeat()
    # interleave reads each named file as a TextLineDataset (skipping the
    # header line) and merges all of them into a single dataset
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers)
    # shuffle fills a buffer with shuffle_buffer_size elements, draws samples
    # from it at random, and refills it from the dataset as elements are taken
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parse_threads)
    # group batch_size samples per element
    dataset = dataset.batch(batch_size)
    return dataset

batch_size = 32
train_set = csv_reader_dataset(train_filenames,batch_size = batch_size)
valid_set = csv_reader_dataset(valid_filenames,batch_size = batch_size)
test_set = csv_reader_dataset(test_filenames,batch_size = batch_size)
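To verify the pipeline, one batch can be pulled from train_set and its shapes inspected (a quick check, not part of training):

for x_batch, y_batch in train_set.take(1):
    print(x_batch.shape, y_batch.shape)  # expected: (32, 8) (32, 1)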

(6) Training on the dataset

model = keras.models.Sequential([
    keras.layers.Dense(30,input_shape = [8],activation = 'relu'),
    keras.layers.Dense(1)])
opt = keras.optimizers.SGD(1e-3)
model.compile(loss = "mean_squared_error",
              optimizer = opt)
callbacks = [keras.callbacks.EarlyStopping(patience=5,min_delta=1e-2)]
his = model.fit(train_set,epochs=100,
                validation_data=valid_set,
                # the dataset repeats forever, so each epoch must be bounded explicitly:
                # steps_per_epoch = number_of_training_samples // batch_size
                steps_per_epoch = 11610 // batch_size,
                validation_steps = 3870 // batch_size,
                callbacks = callbacks)
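Since test_set repeats indefinitely as well, evaluation also needs an explicit step count; a sketch (5160 is the test-set size produced by the two splits above):

model.evaluate(test_set, steps = 5160 // batch_size)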

3. TFRecord

TFRecord is a file format built from nested protocol-buffer messages:
-> tf.train.Example
--> tf.train.Features -> {"key": tf.train.Feature}
---> tf.train.Feature -> tf.train.BytesList / FloatList / Int64List

3.1 TFRecord example

(1) Building a tf.train.Example

# Build the BytesList / FloatList / Int64List values
favorite_books = [name.encode('utf-8') for name in ["machine learning","deep learning"]]
favorite_books_bytelist = tf.train.BytesList(value = favorite_books)
print(favorite_books_bytelist)

hours_floatlist = tf.train.FloatList(value = [14.5,20,34,31.2])
print(hours_floatlist)

age_int64list = tf.train.Int64List(value = [24])
print(age_int64list)
# Build the Features map
features = tf.train.Features(
    feature = {
        "favorite_books":tf.train.Feature(bytes_list = favorite_books_bytelist),
        "hours":tf.train.Feature(float_list = hours_floatlist),
        "age":tf.train.Feature(int64_list = age_int64list)
    })
# Build the Example
example = tf.train.Example(features = features)
# Serialize the Example to a byte string
serialized_example = example.SerializeToString()
""" 压缩后:b'\nd\n5\n\x0efavorite_books\x12#\n!\n\x10machine laerning\n\rdeep learning\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01\x18\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00hA\x00\x00\xa0A\x00\x00\x08B\x9a\x99\xf9A'"""

(2) Writing a TFRecord file

# Write the serialized example into a TFRecord file
output_dir = os.path.join("tfrecord_basic")
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    
filename = "test.tfrecords"
filename_full_path = os.path.join(output_dir,filename)
with tf.io.TFRecordWriter(filename_full_path) as wr:
    for i in range(3):
        wr.write(serialized_example)
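A quick way to confirm the write succeeded (a sketch): count the records by iterating a TFRecordDataset over the file.

print(sum(1 for _ in tf.data.TFRecordDataset([filename_full_path])))  # expected: 3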

(3) Reading the file back and parsing it

#1. Describe the expected feature types
expected_features = {
    "favorite_books":tf.io.VarLenFeature(dtype=tf.string),
    "hours":tf.io.VarLenFeature(dtype=tf.float32),
    "age":tf.io.FixedLenFeature([],dtype=tf.int64),
}
#2. Read the file and parse each record
dataset = tf.data.TFRecordDataset([filename_full_path])
for serialized_example_tensor in dataset:
    example = tf.io.parse_single_example(serialized_example_tensor,
                                         expected_features)
#     print(example)  # VarLenFeature entries come back as SparseTensor and need further conversion
    
    books = tf.sparse.to_dense(example["favorite_books"])
    for book in books:
        print(book.numpy().decode("UTF-8"))
"""
machine laerning
deep learning
machine laerning
deep learning
machine laerning
deep learning
"""

(4) Writing and reading a GZIP-compressed TFRecord file

# Write the TFRecord with GZIP compression
filename_full_path_zip = filename_full_path + '.zip'
opt = tf.io.TFRecordOptions(compression_type = "GZIP")
with tf.io.TFRecordWriter(filename_full_path_zip,opt) as wr:
    for i in range(3):
        wr.write(serialized_example)
# Read the compressed TFRecord file
expected_features = {
    "favorite_books":tf.io.VarLenFeature(dtype=tf.string),
    "hours":tf.io.VarLenFeature(dtype=tf.float32),
    "age":tf.io.FixedLenFeature([],dtype=tf.int64),
}

dataset_zip = tf.data.TFRecordDataset([filename_full_path_zip],compression_type = "GZIP")  # just pass compression_type when reading
for serialized_example_tensor in dataset_zip:
    example = tf.io.parse_single_example(serialized_example_tensor,
                                         expected_features)
    books = tf.sparse.to_dense(example["favorite_books"])
    for book in books:
        print(book.numpy().decode("UTF-8"))

3.2 Using TFRecord in a training pipeline

(1) Read the CSV files into a dataset

# Collect the file names by prefix
source_dir = os.path.join("./generate_csv/")
def get_filenames_by_prefix(source_dir,prefix_name):
    all_files = os.listdir(source_dir)
    results = []
    for filename in all_files:
        if filename.startswith(prefix_name):
            results.append(os.path.join(source_dir,filename))
    return results
train_filenames = get_filenames_by_prefix(source_dir,"train")
valid_filenames = get_filenames_by_prefix(source_dir,"valid")
test_filenames = get_filenames_by_prefix(source_dir,"test")

# Read the CSV contents and build the dataset (same helpers as in section 2)
def parse_csv_line(line,n_fields=9):
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line,record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])
    y = tf.stack(parsed_fields[-1:])
    return x, y

def csv_reader_dataset(filenames,n_readers=5,batch_size=32,
                       n_parse_threads=5,shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    
    dataset = dataset.interleave(
        lambda filename:tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers)
   
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line,num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

batch_size = 32
train_set = csv_reader_dataset(train_filenames,batch_size = batch_size)
valid_set = csv_reader_dataset(valid_filenames,batch_size = batch_size)
test_set = csv_reader_dataset(test_filenames,batch_size = batch_size)

(2) Converting the dataset to TFRecord files

# Serialize one (x, y) sample into a tf.train.Example byte string
def serialize_example(x,y):
    input_feature = tf.train.FloatList(value = x)
    label_feature = tf.train.FloatList(value = y)
    feats = tf.train.Features(
        feature = {
            "input_feature":tf.train.Feature(float_list = input_feature),
            "label_feature":tf.train.Feature(float_list = label_feature)
        }
    )
    example = tf.train.Example(features = feats)
    return example.SerializeToString()
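A quick check of the helper on a single sample taken from the CSV dataset (a sketch, not needed for the conversion below):

for x_batch, y_batch in train_set.take(1):
    print(serialize_example(x_batch[0], y_batch[0]))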

# Write a CSV-backed dataset out as n_shards TFRecord files
def csv_dataset_to_tfrecord(base_filename,dataset,n_shards,
                            steps_per_shard,compression_type = None):
    opt = tf.io.TFRecordOptions(compression_type = compression_type)
    all_filenames = []
    for shard_id in range(n_shards):
        filename_fullpath = os.path.join('{}_{:05d}_of_{:05d}'.format(
            base_filename,shard_id,n_shards))
        with tf.io.TFRecordWriter(filename_fullpath,opt) as wr:
            for x_batch,y_batch in dataset.take(steps_per_shard):
                for x_example,y_example in zip(x_batch,y_batch):
                    wr.write(serialize_example(x_example,y_example))
        all_filenames.append(filename_fullpath)
    return all_filenames  
n_shards = 20
train_steps_per_shard = 11610 // batch_size // n_shards
valid_steps_per_shard = 3870 // batch_size // n_shards
test_steps_per_shard = 5160 // batch_size // n_shards

output_dir = os.path.join("generate_tfrecords")
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    
train_basename = os.path.join(output_dir,"train")
valid_basename = os.path.join(output_dir,"valid")
test_basename = os.path.join(output_dir,"test")

train_tfrecord_filenames = csv_dataset_to_tfrecord(
    train_basename,train_set,n_shards,train_steps_per_shard,None)
valid_tfrecord_filenames = csv_dataset_to_tfrecord(
    valid_basename,valid_set,n_shards,valid_steps_per_shard,None)
test_tfrecord_filenames = csv_dataset_to_tfrecord(
    test_basename,test_set,n_shards,test_steps_per_shard,None)

(3) Generating compressed TFRecord files, reading them back, and training

output_dir = os.path.join("generate_tfrecords_zip")
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    
train_basename = os.path.join(output_dir,"train")
valid_basename = os.path.join(output_dir,"valid")
test_basename = os.path.join(output_dir,"test")

train_tfrecord_filenames = csv_dataset_to_tfrecord(
    train_basename,train_set,n_shards,train_steps_per_shard,compression_type="GZIP")
valid_tfrecord_filenames = csv_dataset_to_tfrecord(
    valid_basename,valid_set,n_shards,valid_steps_per_shard,compression_type="GZIP")
test_tfrecord_filenames = csv_dataset_to_tfrecord(
    test_basename,test_set,n_shards,test_steps_per_shard,compression_type="GZIP")
# Feature spec used to parse each serialized Example back into tensors
expected_features = {
    "input_feature": tf.io.FixedLenFeature([8], dtype=tf.float32),
    "label_feature": tf.io.FixedLenFeature([1], dtype=tf.float32)
}
def parse_example(serialized_example):
    example = tf.io.parse_single_example(serialized_example, expected_features)
    return example["input_feature"], example["label_feature"]

def tfrecord_reader_dataset(filenames, n_readers=5,
                            batch_size=32, n_parse_threads=5,
                            shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename: tf.data.TFRecordDataset(filename, compression_type="GZIP"),
        cycle_length=n_readers)
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_example, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

batch_size = 32
tfrecord_train_set = tfrecord_reader_dataset(train_tfrecord_filenames,batch_size = batch_size)
tfrecord_valid_set = tfrecord_reader_dataset(valid_tfrecord_filenames,batch_size = batch_size)
tfrecord_test_set = tfrecord_reader_dataset(test_tfrecord_filenames,batch_size = batch_size)

model = keras.models.Sequential([
    keras.layers.Dense(30,input_shape = [8],activation = 'relu'),
    keras.layers.Dense(1)])
opt = keras.optimizers.SGD(1e-3)
model.compile(loss = "mean_squared_error",
              optimizer = opt)
callbacks = [keras.callbacks.EarlyStopping(patience=5,min_delta=1e-2)]
his = model.fit(tfrecord_train_set,epochs=100,
                validation_data=tfrecord_valid_set,
                # the dataset repeats forever, so each epoch must be bounded explicitly:
                # steps_per_epoch = number_of_training_samples // batch_size
                steps_per_epoch = 11610 // batch_size,
                validation_steps = 3870 // batch_size,
                callbacks = callbacks)
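As with the CSV pipeline, the test set can then be evaluated with an explicit step count; model.evaluate returns the mean-squared-error loss (a sketch, using the same 5160 test-set size as above):

model.evaluate(tfrecord_test_set, steps = 5160 // batch_size)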
