基于 tf.data API,我们可以使用简单的代码来构建复杂的输入管道。tf.data API 可以轻松处理大量数据、不同的数据格式以及复杂的转换。tf.data.Dataset 中每个元素包含一个或多个 Tensor 对象。
例如,在图片管道中,一个元素可能是单个训练样本,具有一对表示图片数据和标签的张量。
可以通过两种不同的方式来创建 tf.data.Dataset:
(1)直接从 Tensor 创建 Dataset
(例如 Dataset.from_tensor_slices());
当然 NumPy 数组也是可以的,TensorFlow 会自动将其转换为 Tensor。
(2)对一个或多个 tf.data.Dataset 对象应用变换
(例如 Dataset.zip)来创建 Dataset。
一个 Dataset 对象包含多个元素,每个元素的结构都相同。每个元素包含一个或多个 tf.Tensor 对象,这些对象被称为组件。
Dataset 的属性由构成该 Dataset 的元素的属性映射得到,元素可以是单个张量、张量元组,也可以是张量的嵌套元组。
import tensorflow as tf
# Build a Dataset from a one-dimensional Python list.
values = [1, 2, 3, 3, 43, 43, 43, 43, 54, 5, 34]
dataset = tf.data.Dataset.from_tensor_slices(values)
# Walk the dataset, showing each element as a tensor and then as a NumPy value.
for element in dataset:
    print(element)
    print(element.numpy())
import tensorflow as tf
# Build a Dataset from a two-dimensional (nested) Python list.
rows = [[1, 2], [3, 43], [43, 34]]
dataset = tf.data.Dataset.from_tensor_slices(rows)
# Walk the dataset, showing each element as a tensor and then as a NumPy value.
for element in dataset:
    print(element)
    print(element.numpy())
import tensorflow as tf
# Build a Dataset from a dictionary whose values are equal-length lists.
columns = {
    'a': [1, 2, 3, 4],
    'b': [5, 6, 7, 8],
    'c': [12, 23, 23, 34],
}
dataset = tf.data.Dataset.from_tensor_slices(columns)
# Print every element of the dataset.
for element in dataset:
    print(element)
通过 np.array 创建 Dataset,效果和第一种(一维列表)方式相同:
import tensorflow as tf
import numpy as np
# Build a Dataset from a NumPy array instead of a plain list.
source_array = np.array([1, 2, 3, 3, 43, 43, 43, 43, 54, 5, 34])
dataset = tf.data.Dataset.from_tensor_slices(source_array)
# Walk the dataset, showing each element as a tensor and then as a NumPy value.
for element in dataset:
    print(element)
    print(element.numpy())
import tensorflow as tf
import numpy as np
# Build a Dataset from a NumPy array.
source_array = np.array([1, 2, 3, 3, 43, 43, 43, 43, 54, 5, 34])
dataset = tf.data.Dataset.from_tensor_slices(source_array)
# Iterate over the first four elements only.
first_four = dataset.take(4)
for element in first_four:
    print(element)
    print(element.numpy())
# Use take(1) to pull out just the first element.
first_only = iter(dataset.take(1))
print(next(first_only).numpy())
import tensorflow as tf
# Start from a small two-column dataset and print it in its original order.
pairs = [[1, 2], [4, 5], [6, 7]]
dataset = tf.data.Dataset.from_tensor_slices(pairs)
for element in dataset:
    print(element.numpy())
# Shuffle the data; to shuffle across the whole dataset, pass its length as
# the buffer size (3 here).
dataset = dataset.shuffle(buffer_size=3)
for element in dataset:
    print(element.numpy())
import tensorflow as tf
pairs = [[1, 2], [4, 5], [6, 7]]
dataset = tf.data.Dataset.from_tensor_slices(pairs)
# repeat() replays the dataset; its argument is the number of repetitions.
dataset = dataset.repeat(3)
for element in dataset:
    print(element.numpy())
import tensorflow as tf
pairs = [[1, 2], [4, 5], [6, 7]]
dataset = tf.data.Dataset.from_tensor_slices(pairs)
# Print the data in its original order, followed by a separator line.
for element in dataset:
    print(element.numpy())
print('-------------------')
# Shuffle first, then repeat, so the output is both repeated and reordered.
shuffled = dataset.shuffle(buffer_size=3)
dataset = shuffled.repeat(3)
for element in dataset:
    print(element.numpy())
import tensorflow as tf
values = [1, 2, 4, 5, 6, 7]
dataset = tf.data.Dataset.from_tensor_slices(values)
# Print the data in its original order, followed by a separator line.
for element in dataset:
    print(element.numpy())
print('-------------------')
# Produce a larger stream of data for batch() to work on.
shuffled = dataset.shuffle(buffer_size=3)
dataset = shuffled.repeat(count=10)
# batch() groups consecutive elements; each iteration now yields three of them.
dataset = dataset.batch(3)
for batch in dataset:
    print(batch.numpy())
import tensorflow as tf
values = [1, 2, 4, 5, 6, 7]
dataset = tf.data.Dataset.from_tensor_slices(values)
# Apply tf.square to every element of the dataset.
dataset = dataset.map(map_func=tf.square)
for element in dataset:
    print(element.numpy())
import gzip
import numpy as np
import tensorflow as tf
def _load_idx_payload(path, header_bytes):
    """Return the uint8 payload of a gzip-compressed file, skipping its header."""
    with gzip.open(path, 'rb') as stream:
        return np.frombuffer(stream.read(), np.uint8, offset=header_bytes)


def get_data(data_dir="../../dataset/fashion-mnist"):
    """Load the Fashion-MNIST train and test splits from gzip archives.

    Args:
        data_dir: Directory holding the four ``*-ubyte.gz`` archives. The
            default is the relative path this script has always used, so
            existing ``get_data()`` calls behave as before.

    Returns:
        ``((x_train, y_train), (x_test, y_test))``. Each ``x`` is a uint8 array
        reshaped to ``(len(y), 28, 28)``; each ``y`` is a flat uint8 array.

    Raises:
        OSError: If an archive is missing or cannot be decompressed.
        ValueError: If an image payload does not contain exactly
            ``len(labels) * 28 * 28`` bytes.
    """
    # Local import so this function does not depend on the file's import block.
    import os

    def _path(file_name):
        return os.path.join(data_dir, file_name)

    # Label files are read with an 8-byte header offset, image files with 16.
    # Labels are read first because their count fixes the image array's
    # leading dimension.
    y_train = _load_idx_payload(_path("train-labels-idx1-ubyte.gz"), 8)
    x_train = _load_idx_payload(_path("train-images-idx3-ubyte.gz"), 16)
    x_train = x_train.reshape(len(y_train), 28, 28)

    y_test = _load_idx_payload(_path("t10k-labels-idx1-ubyte.gz"), 8)
    x_test = _load_idx_payload(_path("t10k-images-idx3-ubyte.gz"), 16)
    x_test = x_test.reshape(len(y_test), 28, 28)

    return (x_train, y_train), (x_test, y_test)
# Load the raw uint8 arrays and scale the pixel values by 1/255.
# NOTE: the spelling `train_lable` is kept so any later code that refers to
# this module-level name keeps working.
(train_image, train_lable), (test_image, test_label) = get_data()
train_image = train_image / 255
test_image = test_image / 255

# Single source of truth for the batch size: the pipelines and the step counts
# below must agree on it.
BATCH_SIZE = 64

ds_train_img = tf.data.Dataset.from_tensor_slices(train_image)
ds_train_lab = tf.data.Dataset.from_tensor_slices(train_lable)
# zip() pairs the image dataset with the label dataset element by element.
ds_train = tf.data.Dataset.zip((ds_train_img, ds_train_lab))
# The test set is built from a tuple of arrays directly.
ds_test = tf.data.Dataset.from_tensor_slices((test_image, test_label))

# repeat() without a count makes the training stream endless, so fit() relies
# on steps_per_epoch to know where an epoch ends.
ds_train = ds_train.shuffle(10000).repeat().batch(BATCH_SIZE)
ds_test = ds_test.batch(BATCH_SIZE)

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax'),
])
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])

# The first dimension of train_image is the total number of training images.
steps_per_epoch = train_image.shape[0] // BATCH_SIZE
# Round up: ds_test is finite and its last batch may be smaller than
# BATCH_SIZE. Floor division would stop one step early and leave those
# trailing samples out of every validation pass.
valication_epoch = (test_image.shape[0] + BATCH_SIZE - 1) // BATCH_SIZE
model.fit(ds_train,
          epochs=5,
          steps_per_epoch=steps_per_epoch,
          validation_data=ds_test,
          validation_steps=valication_epoch)