Reading Data with tf.keras
For small datasets whose feature data is already numeric, you can build the model input directly from np.array or pandas objects.
history = model.fit(
    train_dataset, train_labels,  # 2-D arrays or DataFrames
    epochs=EPOCHS, validation_split=0.2, verbose=0)
Three ways to convert a DataFrame to an np.array:
df.values
df.to_numpy()  # replaces df.as_matrix(), which was deprecated and removed in pandas 1.0
np.array(df)
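For illustration, a minimal sketch (the DataFrame df here is a made-up example) showing that the three conversions agree:
import numpy as np
import pandas as pd

df = pd.DataFrame({'x1': [1.0, 2.0], 'x2': [3.0, 4.0]})
a1 = df.values        # attribute form
a2 = df.to_numpy()    # method form, preferred since pandas 0.24
a3 = np.array(df)     # explicit NumPy conversion
print(np.array_equal(a1, a2) and np.array_equal(a2, a3))  # True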
For large datasets, build the training input with tf.data.
Using tf.data.Dataset with NumPy data: pass the two arrays (feature_array, label_array) as a tuple to tf.data.Dataset.from_tensor_slices to create a tf.data.Dataset.
BATCH_SIZE = 64
SHUFFLE_BUFFER_SIZE = 1000
# Build the training and test dataset iterators
train_dataset = tf.data.Dataset.from_tensor_slices((train_examples, train_labels)).\
    shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)  # shuffle and batch the dataset
test_dataset = tf.data.Dataset.from_tensor_slices((test_examples, test_labels)).batch(BATCH_SIZE)
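When fitting on a tf.data.Dataset, the labels travel inside the dataset, so model.fit receives the dataset alone; validation_split is not supported for dataset inputs, so pass a separate validation dataset instead. A minimal sketch (model is assumed to be an already compiled tf.keras model):
model.fit(train_dataset, epochs=EPOCHS,
          validation_data=test_dataset,  # illustrative only; prefer a held-out validation set
          verbose=0)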
To define a model with Keras and turn the columns of a CSV file into training inputs:
Method 1: use tf.data.experimental.make_csv_dataset to load CSV-format data into a tf.data.Dataset. If the file has no header row, pass the column names to the column_names argument of make_csv_dataset; to load only a subset of columns, use the select_columns argument. The label column must be identified explicitly, e.g. LABEL_COLUMN = 'label'.
import tensorflow as tf

TRAIN_DATA_URL = "./train.csv"
TEST_DATA_URL = "./eval.csv"
# The column holding the value the model should predict must be specified explicitly
LABEL_COLUMN = 'survived'

def get_dataset(file_path):
    dataset = tf.data.experimental.make_csv_dataset(
        file_path,
        batch_size=12,  # a small value so the example is easier to display
        label_name=LABEL_COLUMN,
        na_value="?",
        num_epochs=1,
        ignore_errors=True)
    return dataset

raw_train_data = get_dataset(TRAIN_DATA_URL)
raw_test_data = get_dataset(TEST_DATA_URL)
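Each element of the resulting dataset is a (features, label) pair, where features is an OrderedDict mapping column names to tensors of batch_size values. A quick sketch for inspecting the first batch:
for batch, label in raw_train_data.take(1):
    for key, value in batch.items():
        print("{:20s}: {}".format(key, value.numpy()))
    print("{:20s}: {}".format('label', label.numpy()))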
Method 2: load the CSV file with Pandas; build an input pipeline with tf.data to batch and shuffle the data; map the columns of the CSV to the input features used to train the model; then build, train, and evaluate the model with Keras.
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
# Read the data with pandas
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
dataframe = pd.read_csv(URL)
# Split into training, validation, and test sets
train, test = train_test_split(dataframe, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
# Build the input pipeline with tf.data
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('target')
    # Wrap the DataFrame in a dict so the columns line up with the feature_column handling below.
    # tf.keras.layers.DenseFeatures is a layer that produces a dense tensor from the given feature_columns.
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds
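To sanity-check the pipeline, pull one small batch and inspect it (a sketch; the batch size of 5 is arbitrary, and 'age' is one of the heart.csv columns):
example_ds = df_to_dataset(train, batch_size=5)
for feature_batch, label_batch in example_ds.take(1):
    print('Feature keys:', list(feature_batch.keys()))
    print('A batch of ages:', feature_batch['age'])
    print('A batch of targets:', label_batch)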
# Choose which feature_columns to use
feature_columns = []
# numeric cols
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca']:
    feature_columns.append(feature_column.numeric_column(header))
# bucketized cols
age = feature_column.numeric_column('age')
age_buckets = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)
# indicator cols
thal = feature_column.categorical_column_with_vocabulary_list(
    'thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)
# embedding cols
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)
# crossed cols
crossed_feature = feature_column.crossed_column([age_buckets, thal], hash_bucket_size=1000)
crossed_feature = feature_column.indicator_column(crossed_feature)
feature_columns.append(crossed_feature)
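To see the transformation a single feature column performs, wrap it in a DenseFeatures layer and call it on one batch of raw features; a small sketch reusing the example_ds batch from the inspection snippet above:
example_batch = next(iter(example_ds))[0]
def demo(fc):
    layer = layers.DenseFeatures(fc)
    print(layer(example_batch).numpy())
demo(age_buckets)   # one-hot encoding of the age bucket
demo(thal_one_hot)  # one-hot encoding over the 'thal' vocabulary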
# Build the feature layer
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)
# Build and train the model
model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(train_ds, validation_data=val_ds, epochs=5)
# Evaluate on the test set
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)
The steps to read CSV data files by hand are as follows:
Step 1: tf.data.Dataset.list_files(file_pattern) builds a dataset of CSV filenames.
filename_dataset = tf.data.Dataset.list_files(train_filenames)
for filename in filename_dataset:
    print(filename)
Output:
tf.Tensor(b'data/generate_csv/train_00019-of-00020.csv', shape=(), dtype=string)
tf.Tensor(b'data/generate_csv/train_00004-of-00020.csv', shape=(), dtype=string)
...
tf.Tensor(b'data/generate_csv/train_00006-of-00020.csv', shape=(), dtype=string)
Step 2: tf.data.TextLineDataset(filename) turns each CSV file's contents into a dataset of text lines; note the use of skip(1) to drop each file's header row.
dataset = filename_dataset.interleave(lambda filename: tf.data.TextLineDataset(filename).skip(1))  # skip(1) drops the header
for line in dataset.take(15):
    print(line.numpy())
Output:
b'0.801544314532886,0.27216142415910205,-0.11624392696666119,-0.2023115137272354,-0.5430515742518128,-0.021039615516440048,-0.5897620622908205,-0.08241845654707416,3.226'
b'0.4853051504718848,-0.8492418886278699,-0.06530126513877861,-0.023379656040017353,1.4974350551260218,-0.07790657783453239,-0.9023632702857819,0.7814514907892068,2.956'
...
b'1.1990412250459561,-0.04823952235146133,0.7491221281727167,0.1308828788491473,-0.060375323994361546,-0.02954897439374466,-0.5524365449182886,0.03243130523751367,5.00001'
Step 3: tf.io.decode_csv(records, record_defaults) parses one line of CSV content, converting the CSV record to tensors, with each column mapping to one tensor. The record_defaults argument is a list holding each field's default value, which also fixes that field's dtype.
sample_str = '1,2,3,4,5'
record_defaults = [tf.constant(0, dtype=tf.int32), 0, np.nan, "hello", tf.constant([]) ]
parsed_fields = tf.io.decode_csv(sample_str, record_defaults)
print(parsed_fields)
Output:
# five scalar tensors are parsed here
[<tf.Tensor: shape=(), dtype=int32, numpy=1>,
<tf.Tensor: shape=(), dtype=int32, numpy=2>,
<tf.Tensor: shape=(), dtype=float32, numpy=3.0>,
<tf.Tensor: shape=(), dtype=string, numpy=b'4'>,
<tf.Tensor: shape=(), dtype=float32, numpy=5.0>]
The full routine for parsing a CSV line into features and label is as follows:
def parse_csv_line(line, n_fields=9):
    defaults = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defaults)
    x = tf.stack(parsed_fields[0:-1])  # tf.stack joins the scalar fields into a 1-D feature vector
    y = tf.stack(parsed_fields[-1:])
    return x, y
dataset = dataset.map(lambda line: parse_csv_line(line, n_fields=9))
# Print one example
print(next(iter(dataset.take(1))))
Output:
(<tf.Tensor: shape=(8,), dtype=float32, numpy=
array([-1.119975 , -1.3298433 , 0.14190045, 0.4658137 , -0.10301778,
-0.10744184, -0.7950524 , 1.5304717 ], dtype=float32)>,
<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.66], dtype=float32)>)
if __name__ == '__main__':
    # Training data
    batch_size = 100
    dataset = tf.data.Dataset.list_files("/path/*.csv").flat_map(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1)
    )
    # Commonly used transformations: flat_map, map, filter, apply
    dataset = dataset.map(parse_csv_line, num_parallel_calls=50) \
        .shuffle(batch_size, reshuffle_each_iteration=True) \
        .batch(batch_size=batch_size, drop_remainder=True) \
        .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
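The batched dataset can then be fed straight to Keras; a minimal sketch, assuming the 8-feature/1-target layout produced by parse_csv_line above (the layer sizes are arbitrary):
model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(8,)),
    tf.keras.layers.Dense(1)
])
model.compile(optimizer='adam', loss='mse')
model.fit(dataset, epochs=10)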
Reading and Saving TFRecord Files