Every blog, every motto: Love is a carefully designed lie.
Data -> CSV files -> reading the CSV files with tf.data
Hands-on practice with tf.data
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras
print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)
from sklearn.datasets import fetch_california_housing
# California housing price prediction
housing = fetch_california_housing()
# Split the samples into train/validation/test sets
from sklearn.model_selection import train_test_split
x_train_all,x_test,y_train_all,y_test = train_test_split(housing.data,housing.target,random_state=7)
x_train,x_valid,y_train,y_valid = train_test_split(x_train_all,y_train_all,random_state=11)
print(x_train.shape,y_train.shape)
print(x_valid.shape,y_valid.shape)
print(x_test.shape,y_test.shape)
# Standardize the features (z-score scaling)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)
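StandardScaler fits the per-feature mean and standard deviation on the training set and reuses them for the validation and test sets. As a minimal sanity check, the same scaling can be reproduced with numpy directly (z = (x - mean) / std, column-wise):
# Verify that StandardScaler is column-wise z-score scaling fitted on the training data
manual_scaled = (x_train - x_train.mean(axis=0)) / x_train.std(axis=0)
print(np.allclose(manual_scaled, x_train_scaled))  # expected: True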
output_dir = 'generate_csv'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    # Split `data` into n_parts CSV shards named e.g. train_00.csv, train_01.csv, ...
    path_format = os.path.join(output_dir, '{}_{:02d}.csv')
    filenames = []
    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, 'wt', encoding='utf-8') as f:
            if header is not None:
                f.write(header + '\n')
            for row_index in row_indices:
                f.write(','.join([repr(col) for col in data[row_index]]))
                f.write('\n')
    return filenames
train_data = np.c_[x_train_scaled,y_train]
valid_data = np.c_[x_valid_scaled,y_valid]
test_data = np.c_[x_test_scaled,y_test]
header_cols = housing.feature_names + ['MedianHouseValue']
header_str = ','.join(header_cols)
train_filenames = save_to_csv(output_dir,train_data,'train',header_str,n_parts=20)
valid_filenames = save_to_csv(output_dir,valid_data,'valid',header_str,n_parts=10)
test_filenames = save_to_csv(output_dir,test_data,'test',header_str,n_parts=10)
# Read back the files generated above
# Print the file names
import pprint
print("train filenames: ")
pprint.pprint(train_filenames)
print("valid filenames: ")
pprint.pprint(valid_filenames)
print("test filenames: ")
pprint.pprint(test_filenames)
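As an optional sanity check, the header line and the first data row of one shard can be printed back:
# Peek at the first two lines (header + first data row) of one generated shard
with open(train_filenames[0], 'r', encoding='utf-8') as f:
    for _ in range(2):
        print(f.readline().strip())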
# Read the files
# 1. filename -> dataset
# 2. read file -> dataset -> datasets -> merge
# 3. parse csv
filename_dataset = tf.data.Dataset.list_files(train_filenames)
for filename in filename_dataset:
    print(filename)
n_readers = 5  # degree of parallelism: how many files are read at the same time
dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filename).skip(1),  # skip(1) skips the header line
    cycle_length=n_readers
)
for line in dataset.take(15):
    print(line.numpy())
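interleave maps each file name to a TextLineDataset and pulls lines from cycle_length inner datasets in round-robin order, which mixes records from different files. A toy sketch on a tiny range dataset (not part of the housing pipeline) shows the ordering:
# Toy illustration of interleave: with cycle_length=2, elements are taken alternately
# from the first two inner datasets before moving on to the third.
toy = tf.data.Dataset.range(3)
toy_interleaved = toy.interleave(
    lambda x: tf.data.Dataset.from_tensors(x).repeat(2),
    cycle_length=2
)
print([v.numpy() for v in toy_interleaved])  # [0, 1, 0, 1, 2, 2]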
# parse csv: turn a CSV string into tensors
# tf.io.decode_csv(records, record_defaults)
sample_str = '1,2,3,4,5'
record_defaults = [tf.constant(0,dtype=tf.int32)] * 5
parsed_fields = tf.io.decode_csv(sample_str,record_defaults)
print(parsed_fields)
sample_str = '1,2,3,4,5'
record_defaults = [tf.constant(0, dtype=tf.int32),
                   0,
                   np.nan,
                   'hello',
                   tf.constant([])]
parsed_fields = tf.io.decode_csv(sample_str,record_defaults)
print(parsed_fields)
try:
    parsed_fields = tf.io.decode_csv(',,,,', record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)
try:
    parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,7', record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)
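Both calls fail as expected: in ',,,,' the last field is empty but its default, tf.constant([]), marks it as required with no fallback value, while '1,2,3,4,5,6,7' provides seven fields even though record_defaults only describes five.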
# Parse one line of the dataset
def parse_csv_line(line, n_fields=9):
    # 8 features + 1 label, all parsed as float32 (default: nan)
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])  # features
    y = tf.stack(parsed_fields[-1:])   # label
    return x, y
parse_csv_line(b'-0.060214068004363165,0.7527628439249472,0.0835940301935345,-0.06250122441959183,-0.03497131082291674,-0.026442380178345683,1.0712234607868782,-1.3707331756959855,1.651',
n_fields=9)
Putting it all together
# The full pipeline
# 1. filename -> dataset
# 2. read file -> dataset -> datasets -> merge
# 3. parse csv
def csv_reader_dataset(filenames, n_readers=5, batch_size=32, n_parse_threads=5,
                       shuffle_buffer_size=10000):
    # 1. filename -> dataset
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()  # repeat indefinitely; epochs are bounded by steps_per_epoch in fit()
    # 2. file names -> text lines, reading n_readers files in parallel
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers
    )
    dataset = dataset.shuffle(shuffle_buffer_size)  # shuffle returns a new dataset, so assign it back
    # 3. parse each csv line into (features, label)
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset
# Test with a small batch size
train_set = csv_reader_dataset(train_filenames, batch_size=3)
for x_batch, y_batch in train_set.take(2):
    print("x:")
    pprint.pprint(x_batch)
    print("y:")
    pprint.pprint(y_batch)
# Build the model
model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu', input_shape=[8]),
    keras.layers.Dense(1),
])
# Print the model summary
model.summary()
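As a cross-check of the summary, the first Dense layer has 8 × 30 + 30 = 270 parameters and the output layer 30 × 1 + 1 = 31, for 301 trainable parameters in total.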
# Compile
model.compile(loss='mean_squared_error',optimizer="sgd")
# Callbacks: stop training early when the validation loss stops improving
callbacks = [keras.callbacks.EarlyStopping(patience=5,min_delta=1e-3)]
# Train
# Rebuild the datasets with the training batch size (the train_set above used batch_size=3 just for inspection)
batch_size = 32
train_set = csv_reader_dataset(train_filenames, batch_size=batch_size)
valid_set = csv_reader_dataset(valid_filenames, batch_size=batch_size)
test_set = csv_reader_dataset(test_filenames, batch_size=batch_size)
history = model.fit(train_set, validation_data=valid_set,
                    steps_per_epoch=11610 // batch_size,   # 11610 training samples
                    validation_steps=3870 // batch_size,   # 3870 validation samples
                    epochs=100, callbacks=callbacks)
model.evaluate(test_set, steps=5160 // batch_size)  # 5160 test samples
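matplotlib is imported at the top but never used; a minimal sketch for visualizing the recorded loss curves (the helper name plot_learning_curves is only illustrative):
# Plot training/validation loss from the fit() history
def plot_learning_curves(history):
    pd.DataFrame(history.history).plot(figsize=(8, 5))
    plt.grid(True)
    plt.gca().set_ylim(0, 1)
    plt.show()
plot_learning_curves(history)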