- tf.data.Dataset
- Reading CSV files as a dataset for training
- tfrecord
1. tf.data.Dataset
Typical tf.data.Dataset workflow:
(1) Create a dataset from source data;
(2) Preprocess the data;
(3) Iterate over the dataset and consume the data.
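A minimal sketch of the three steps, using a toy NumPy array as the source (the squaring map is just an illustrative preprocessing step):
import numpy as np
import tensorflow as tf

# (1) create a dataset from source data
dataset = tf.data.Dataset.from_tensor_slices(np.arange(5))
# (2) preprocess the data (here: square each element)
dataset = dataset.map(lambda x: x * x)
# (3) iterate over the dataset and consume the data
for element in dataset:
    print(element.numpy())
"""prints 0, 1, 4, 9, 16 (one value per line)"""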
1.1 Source Datasets
(1) Created from arrays, lists, etc., which are converted to tensors
# create a dataset
dataset = tf.data.Dataset.from_tensor_slices(np.arange(10))  # accepts arrays, lists, dicts, etc.
for element in dataset:
    print(element)
(2) Created with tf.data.TextLineDataset
dataset = tf.data.TextLineDataset(["file1.txt", "file2.txt"])
(3) Created with tf.data.TFRecordDataset
dataset = tf.data.TFRecordDataset(["file1.tfrecords", "file2.tfrecords"])
(4) Created from file names matching a pattern
dataset = tf.data.Dataset.list_files("/path/*.txt")
1.2 Methods
① as_numpy_iterator(): converts every element of the dataset to NumPy and returns an iterator
print(list(dataset.as_numpy_iterator()))
"""[0, 1, 2, 3, 4, 5, 6, 7, 8, 9] """
② batch(batch_size, drop_remainder=False): splits the dataset into batches and returns a dataset. When drop_remainder is True, the leftover samples that do not fill a full batch (the array([9]) in the output below) are dropped; a sketch with drop_remainder=True follows that output.
dataset = dataset.batch(3)
print(list(dataset.as_numpy_iterator()))
"""[array([0, 1, 2]), array([3, 4, 5]), array([6, 7, 8]), array([9])]"""
③ interleave(map_func, cycle_length=AUTOTUNE, block_length=1, num_parallel_calls=None)
: draws cycle_length elements from the dataset at a time, maps each through map_func to obtain cycle_length new datasets, and then takes block_length elements from each of those datasets in turn. A small ordering example follows the file-reading snippet below.
# Preprocess 4 files concurrently, and interleave blocks of 16 records
# from each file.
filenames = ["/var/data/file1.txt", "/var/data/file2.txt",
             "/var/data/file3.txt", "/var/data/file4.txt"]
dataset = tf.data.Dataset.from_tensor_slices(filenames)
def parse_fn(filename):
    return tf.data.Dataset.range(10)
dataset = dataset.interleave(lambda x:
    tf.data.TextLineDataset(x).map(parse_fn, num_parallel_calls=1),
    cycle_length=4, block_length=16)
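To make the cycle_length / block_length ordering concrete, here is a small self-contained sketch (not part of the original example): two source elements are expanded at a time, and block_length values are taken from each in turn.
dataset = tf.data.Dataset.range(1, 6)  # [1, 2, 3, 4, 5]
dataset = dataset.interleave(
    lambda x: tf.data.Dataset.from_tensors(x).repeat(6),
    cycle_length=2, block_length=4)
print(list(dataset.as_numpy_iterator()))
"""[1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 4, 4, 5, 5, 5, 5, 5, 5]"""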
2. Reading CSV files as a dataset for training
2.1 Generating the CSV files
(1) Import packages
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import sys
import time
import tensorflow as tf
from tensorflow import keras
import pprint
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)
(2) Load the data
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
print(housing.DESCR)
print(housing.data.shape)
print(housing.target.shape)
(3) Split and standardize the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
x_train_all,x_test,y_train_all,y_test = train_test_split(
housing.data, housing.target,random_state = 42)
x_train,x_valid,y_train,y_valid = train_test_split(
x_train_all,y_train_all,random_state = 42)
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)
(4) Write a NumPy array out as multiple CSV files
output_dir = os.path.join("generate_csv")
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []
    for file_idx, row_indices in enumerate(
            np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            for row_index in row_indices:
                f.write(",".join([repr(col) for col in data[row_index]]))
                f.write('\n')
    return filenames
train_data = np.c_[x_train_scaled,y_train]
valid_data = np.c_[x_valid_scaled,y_valid]
test_data = np.c_[x_test_scaled,y_test]
header_cols = housing.feature_names + ["MedianHouseValue"]
header_str = ",".join(header_cols)
train_filenames = save_to_csv(output_dir,train_data,"train",header_str,n_parts=20)
valid_filenames = save_to_csv(output_dir,valid_data,"valid",header_str,n_parts=10)
test_filenames = save_to_csv(output_dir,test_data,"test",header_str,n_parts=5)
""" test_filenames:
['generate_csv\\test_00.csv',
'generate_csv\\test_01.csv',
'generate_csv\\test_02.csv',
'generate_csv\\test_03.csv',
'generate_csv\\test_04.csv']"""
(5) Reading the CSV files
① Build a dataset from all of the file names
② Read the file behind each file name into its own dataset, then merge all of those datasets into one
③ The data is read back as strings, so it must be parsed into numeric values
def parse_csv_line(line, n_fields=9):
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])
    y = tf.stack(parsed_fields[-1:])
    return x, y
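As a quick sanity check (a sketch with a made-up CSV line, not a row from the generated files), parse_csv_line turns one comma-separated string into an (x, y) pair:
x, y = parse_csv_line("0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,1.5", n_fields=9)
print(x.shape, y.shape)
"""(8,) (1,)"""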
def csv_reader_dataset(filenames, n_readers=5, batch_size=32,
                       n_parse_threads=5, shuffle_buffer_size=10000):
    # build a dataset of the file names
    dataset = tf.data.Dataset.list_files(filenames)
    # repeat() with no argument repeats indefinitely; the pipeline is consumed
    # repeatedly during training, so the file-name dataset must not run out
    dataset = dataset.repeat()
    # interleave reads the file behind each file name into its own dataset and
    # merges them all into one; skip(1) drops the header row of each CSV file
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers)
    # shuffle fills a buffer with shuffle_buffer_size samples taken in order,
    # draws samples randomly from it, and keeps refilling it from the dataset
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parse_threads)
    # draw batch_size samples from the buffer at a time
    dataset = dataset.batch(batch_size)
    return dataset
batch_size = 32
train_set = csv_reader_dataset(train_filenames,batch_size = batch_size)
valid_set = csv_reader_dataset(valid_filenames,batch_size = batch_size)
test_set = csv_reader_dataset(test_filenames,batch_size = batch_size)
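To verify the pipeline, one batch can be taken and inspected (a sketch; the exact values depend on shuffling, but the shapes should be (batch_size, 8) and (batch_size, 1)):
for x_batch, y_batch in train_set.take(1):
    print(x_batch.shape, y_batch.shape)
"""(32, 8) (32, 1)"""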
(6) Training on the dataset
model = keras.models.Sequential([
    keras.layers.Dense(30, input_shape=[8], activation='relu'),
    keras.layers.Dense(1)])
opt = keras.optimizers.SGD(1e-3)
model.compile(loss="mean_squared_error",
              optimizer=opt)
callbacks = [keras.callbacks.EarlyStopping(patience=5, min_delta=1e-2)]
his = model.fit(train_set, epochs=100,
                validation_data=valid_set,
                # each step reads only batch_size samples, so steps_per_epoch
                # is needed for one epoch to cover the whole training set
                steps_per_epoch=11610 // batch_size,
                validation_steps=3870 // batch_size,
                callbacks=callbacks)
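After training, the test set can be evaluated the same way (a sketch; steps is required because csv_reader_dataset repeats indefinitely, and 5160 is the size of the test split of the 20640 samples):
model.evaluate(test_set, steps=5160 // batch_size)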
3. tfrecord
tfrecord is a file format that wraps its contents layer by layer:
-> tf.train.Example
  --> tf.train.Features -> {"key": tf.train.Feature}
    ---> tf.train.Feature -> tf.train.BytesList / FloatList / Int64List
3.1 tfrecord example
(1) Building a tf.train.Example
# build BytesList / FloatList / Int64List values
favorite_books = [name.encode('utf-8') for name in ["machine learning", "deep learning"]]
favorite_books_bytelist = tf.train.BytesList(value = favorite_books)
print(favorite_books_bytelist)
hours_floatlist = tf.train.FloatList(value = [14.5,20,34,31.2])
print(hours_floatlist)
age_int64list = tf.train.Int64List(value = [24])
print(age_int64list)
# build the Features
features = tf.train.Features(
    feature={
        "favorite_books": tf.train.Feature(bytes_list=favorite_books_bytelist),
        "hours": tf.train.Feature(float_list=hours_floatlist),
        "age": tf.train.Feature(int64_list=age_int64list)
    })
# build the Example
example = tf.train.Example(features=features)
# serialize the Example to a byte string
serialized_example = example.SerializeToString()
""" serialized: b'\nd\n5\n\x0efavorite_books\x12#\n!\n\x10machine learning\n\rdeep learning\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01\x18\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00hA\x00\x00\xa0A\x00\x00\x08B\x9a\x99\xf9A'"""
(2) Writing a tfrecord file
# write the tfrecord file
output_dir = os.path.join("tfrecord_basic")
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
filename = "test.tfrecords"
filename_full_path = os.path.join(output_dir, filename)
with tf.io.TFRecordWriter(filename_full_path) as wr:
    for i in range(3):
        wr.write(serialized_example)
(3) Parsing the records read back from the file
# 1. define the expected feature types
expected_features = {
    "favorite_books": tf.io.VarLenFeature(dtype=tf.string),
    "hours": tf.io.VarLenFeature(dtype=tf.float32),
    "age": tf.io.FixedLenFeature([], dtype=tf.int64),
}
# 2. read and parse the file
dataset = tf.data.TFRecordDataset([filename_full_path])
for serialized_example_tensor in dataset:
    example = tf.io.parse_single_example(serialized_example_tensor,
                                         expected_features)
    # print(example)  # VarLen features come back as SparseTensors and need further conversion
    books = tf.sparse.to_dense(example["favorite_books"])
    for book in books:
        print(book.numpy().decode("UTF-8"))
"""
machine laerning
deep learning
machine laerning
deep learning
machine laerning
deep learning
"""
(4) Writing with GZIP compression and reading it back
# write the tfrecord as a compressed file
filename_full_path_zip = filename_full_path + '.zip'
opt = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter(filename_full_path_zip, opt) as wr:
    for i in range(3):
        wr.write(serialized_example)
# read the compressed tfrecord file
expected_features = {
    "favorite_books": tf.io.VarLenFeature(dtype=tf.string),
    "hours": tf.io.VarLenFeature(dtype=tf.float32),
    "age": tf.io.FixedLenFeature([], dtype=tf.int64),
}
# just pass compression_type when building the dataset
dataset_zip = tf.data.TFRecordDataset([filename_full_path_zip],
                                      compression_type="GZIP")
for serialized_example_tensor in dataset_zip:
    example = tf.io.parse_single_example(serialized_example_tensor,
                                         expected_features)
    books = tf.sparse.to_dense(example["favorite_books"])
    for book in books:
        print(book.numpy().decode("UTF-8"))
3.2 tfrecord in practice
(1) Read the CSV files into a dataset
# collect the file name lists
source_dir = os.path.join("./generate_csv/")
def get_filenames_by_prefix(source_dir, prefix_name):
    all_files = os.listdir(source_dir)
    results = []
    for filename in all_files:
        if filename.startswith(prefix_name):
            results.append(os.path.join(source_dir, filename))
    return results
train_filenames = get_filenames_by_prefix(source_dir, "train")
valid_filenames = get_filenames_by_prefix(source_dir, "valid")
test_filenames = get_filenames_by_prefix(source_dir, "test")
# read the CSV contents and build the dataset (same helpers as in section 2)
def parse_csv_line(line, n_fields=9):
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])
    y = tf.stack(parsed_fields[-1:])
    return x, y
def csv_reader_dataset(filenames, n_readers=5, batch_size=32,
                       n_parse_threads=5, shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers)
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset
batch_size = 32
train_set = csv_reader_dataset(train_filenames,batch_size = batch_size)
valid_set = csv_reader_dataset(valid_filenames,batch_size = batch_size)
test_set = csv_reader_dataset(test_filenames,batch_size = batch_size)
(2) Converting the dataset to tfrecord files
# function that converts one sample into a serialized Example
def serialize_example(x, y):
    input_feature = tf.train.FloatList(value=x)
    label_feature = tf.train.FloatList(value=y)
    feats = tf.train.Features(
        feature={
            "input_feature": tf.train.Feature(float_list=input_feature),
            "label_feature": tf.train.Feature(float_list=label_feature)
        }
    )
    example = tf.train.Example(features=feats)
    return example.SerializeToString()
# function that converts the CSV-backed dataset into tfrecord shard files
def csv_dataset_to_tfrecord(base_filename, dataset, n_shards,
                            steps_per_shard, compression_type=None):
    opt = tf.io.TFRecordOptions(compression_type=compression_type)
    all_filenames = []
    for shard_id in range(n_shards):
        filename_fullpath = os.path.join('{}_{:05d}_of_{:05d}'.format(
            base_filename, shard_id, n_shards))
        with tf.io.TFRecordWriter(filename_fullpath, opt) as wr:
            for x_batch, y_batch in dataset.take(steps_per_shard):
                for x_example, y_example in zip(x_batch, y_batch):
                    wr.write(serialize_example(x_example, y_example))
        all_filenames.append(filename_fullpath)
    return all_filenames
n_shards = 20
train_steps_per_shard = 11610 // batch_size // n_shards
valid_steps_per_shard = 3870 // batch_size // n_shards
test_steps_per_shard = 5160 // batch_size // n_shards
output_dir = os.path.join("generate_tfrecords")
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
train_basename = os.path.join(output_dir,"train")
valid_basename = os.path.join(output_dir,"valid")
test_basename = os.path.join(output_dir,"test")
train_tfrecord_filenames = csv_dataset_to_tfrecord(
train_basename,train_set,n_shards,train_steps_per_shard,None)
valid_tfrecord_filenames = csv_dataset_to_tfrecord(
valid_basename,valid_set,n_shards,valid_steps_per_shard,None)
test_tfrecord_filenames = csv_dataset_to_tfrecord(
test_basename,test_set,n_shards,test_steps_per_shard,None)
(3) Generating compressed tfrecord files and training on them
output_dir = os.path.join("generate_tfrecords_zip")
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
train_basename = os.path.join(output_dir,"train")
valid_basename = os.path.join(output_dir,"valid")
test_basename = os.path.join(output_dir,"test")
train_tfrecord_filenames = csv_dataset_to_tfrecord(
train_basename,train_set,n_shards,train_steps_per_shard,compression_type="GZIP")
valid_tfrecord_filenames = csv_dataset_to_tfrecord(
valid_basename,valid_set,n_shards,valid_steps_per_shard,compression_type="GZIP")
test_tfrecord_filenames = csv_dataset_to_tfrecord(
test_basename,test_set,n_shards,test_steps_per_shard,compression_type="GZIP")
# feature spec used to parse each Example
expected_features = {
    "input_feature": tf.io.FixedLenFeature([8], dtype=tf.float32),
    "label_feature": tf.io.FixedLenFeature([1], dtype=tf.float32)
}
def parse_example(serialized_example):
    example = tf.io.parse_single_example(serialized_example, expected_features)
    return example["input_feature"], example["label_feature"]
def tfrecord_reader_dataset(filenames, n_readers=5,
                            batch_size=32, n_parse_threads=5,
                            shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename: tf.data.TFRecordDataset(filename, compression_type="GZIP"),
        cycle_length=n_readers)
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_example, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset
batch_size = 32
tfrecord_train_set = tfrecord_reader_dataset(train_tfrecord_filenames,batch_size = batch_size)
tfrecord_valid_set = tfrecord_reader_dataset(valid_tfrecord_filenames,batch_size = batch_size)
tfrecord_test_set = tfrecord_reader_dataset(test_tfrecord_filenames,batch_size = batch_size)
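As with the CSV pipeline, one batch can be inspected to confirm the shapes (a sketch; values depend on shuffling):
for x_batch, y_batch in tfrecord_train_set.take(1):
    print(x_batch.shape, y_batch.shape)
"""(32, 8) (32, 1)"""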
model = keras.models.Sequential([
    keras.layers.Dense(30, input_shape=[8], activation='relu'),
    keras.layers.Dense(1)])
opt = keras.optimizers.SGD(1e-3)
model.compile(loss="mean_squared_error",
              optimizer=opt)
callbacks = [keras.callbacks.EarlyStopping(patience=5, min_delta=1e-2)]
his = model.fit(tfrecord_train_set, epochs=100,
                validation_data=tfrecord_valid_set,
                # each step reads only batch_size samples, so steps_per_epoch
                # is needed for one epoch to cover the whole training set
                steps_per_epoch=11610 // batch_size,
                validation_steps=3870 // batch_size,
                callbacks=callbacks)
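Evaluation on the tfrecord test set works the same way (a sketch; steps is required because tfrecord_reader_dataset repeats indefinitely, and 5160 is the test split size assumed here):
model.evaluate(tfrecord_test_set, steps=5160 // batch_size)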