TensorFlow 2.x Learning Notes 7: Generating, Parsing, and Reading CSV Files

1. Data preparation, using the California housing price dataset

from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
# Split the data
from sklearn.model_selection import train_test_split

x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state = 7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state = 11)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)

Output: (11610, 8) (11610,)
        (3870, 8) (3870,)
        (5160, 8) (5160,)

# Preprocess the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)
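
As a quick sanity check (a minimal sketch, not part of the original notes), the scaled training features should now have roughly zero mean and unit variance per column:

import numpy as np
print(np.mean(x_train_scaled, axis=0))  # each entry close to 0
print(np.std(x_train_scaled, axis=0))   # each entry close to 1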

2. Generating the CSV files

output_dir = "generate_csv"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

'''
output_dir: directory where the CSV files are saved
data: the raw data, i.e. the housing data above
name_prefix: filename prefix, e.g. train, test, or valid
header: the first line written to each CSV file
n_parts: how many parts to split the train/valid/test set into
'''
def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    ## {:02d} formats the part index as a two-digit, zero-padded decimal,
    ## so the filenames end in 00, 01, 02, ...
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []
    '''
    ① np.array_split(arr, n_parts) splits arr into n_parts pieces, allowing
    uneven pieces when the length is not evenly divisible, and returns the
    pieces in a list. Here it splits the array of row indices (e.g. the 11610
    training indices) into n_parts index arrays, one per output file
    (see the small standalone sketch right after this function).

    ② enumerate pairs each element of a sequence or iterable with its index,
    much like a pandas Series, where every element has an index
    '''
    for file_idx, row_indices in  enumerate(
                                  np.array_split(np.arange(len(data)), 
                                                 n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            for row_index in row_indices:
                '''
                repr converts each element of the row to its string
                representation; ",".join then concatenates those strings
                into a single comma-separated line (a string, not a list).
                '''
                f.write(",".join([repr(col) for col in data[row_index]]))
                f.write('\n')
    return filenames
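
As a standalone illustration of the np.array_split / enumerate behaviour described in the comment above (not part of the pipeline; the array length 11 and n_parts=3 are arbitrary):

import numpy as np
# Split the indices 0..10 into 3 parts; uneven part sizes are allowed.
for part_idx, row_indices in enumerate(np.array_split(np.arange(11), 3)):
    print(part_idx, row_indices)
# 0 [0 1 2 3]
# 1 [4 5 6 7]
# 2 [ 8  9 10]
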
'''
Here train_data has shape (11610, 9),
     valid_data has shape (3870, 9),
     test_data has shape (5160, 9).
'''
train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]

'''
header_cols: [MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup,
              Latitude, Longitude, MedianHouseValue]
'''
header_cols = housing.feature_names + ["MedianHouseValue"]
header_str = ",".join(header_cols)

train_filenames = save_to_csv(output_dir, train_data, "train",
                              header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, "valid",
                              header_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, "test",
                             header_str, n_parts=10)

Let's take a look at the generated result:

import pprint
print("train filenames:")
pprint.pprint(train_filenames)
print("valid filenames:")
pprint.pprint(valid_filenames)
print("test filenames:")
pprint.pprint(test_filenames)

Output:
train filenames:
['generate_csv\\train_00.csv',
 'generate_csv\\train_01.csv',
 'generate_csv\\train_02.csv',
 'generate_csv\\train_03.csv',
 'generate_csv\\train_04.csv',
 'generate_csv\\train_05.csv',
 'generate_csv\\train_06.csv',
 'generate_csv\\train_07.csv',
 'generate_csv\\train_08.csv',
 'generate_csv\\train_09.csv',
 'generate_csv\\train_10.csv',
 'generate_csv\\train_11.csv',
 'generate_csv\\train_12.csv',
 'generate_csv\\train_13.csv',
 'generate_csv\\train_14.csv',
 'generate_csv\\train_15.csv',
 'generate_csv\\train_16.csv',
 'generate_csv\\train_17.csv',
 'generate_csv\\train_18.csv',
 'generate_csv\\train_19.csv']
valid filenames:
['generate_csv\\valid_00.csv',
 'generate_csv\\valid_01.csv',
 'generate_csv\\valid_02.csv',
 'generate_csv\\valid_03.csv',
 'generate_csv\\valid_04.csv',
 'generate_csv\\valid_05.csv',
 'generate_csv\\valid_06.csv',
 'generate_csv\\valid_07.csv',
 'generate_csv\\valid_08.csv',
 'generate_csv\\valid_09.csv']
test filenames:
['generate_csv\\test_00.csv',
 'generate_csv\\test_01.csv',
 'generate_csv\\test_02.csv',
 'generate_csv\\test_03.csv',
 'generate_csv\\test_04.csv',
 'generate_csv\\test_05.csv',
 'generate_csv\\test_06.csv',
 'generate_csv\\test_07.csv',
 'generate_csv\\test_08.csv',
 'generate_csv\\test_09.csv']

At this point the files have been written to disk, and the function returns the list of filenames.
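
To sanity-check one of the generated files, you can simply read it back; a minimal sketch, assuming pandas is available (any of the returned filenames works):

import pandas as pd
print(pd.read_csv(train_filenames[0]).head())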

3. Parsing the CSV files

  • Step 1: turn the list of filenames into a dataset, using train_filenames as an example:
import tensorflow as tf

filename_dataset = tf.data.Dataset.list_files(train_filenames)
for filename in filename_dataset:
    print(filename)

Output:
tf.Tensor(b'generate_csv\\train_04.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_09.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_10.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_11.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_08.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_12.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_07.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_15.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_02.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_05.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_03.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_01.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_19.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_16.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_17.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_13.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_14.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_18.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_06.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_00.csv', shape=(), dtype=string)
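
Note that tf.data.Dataset.list_files shuffles the filenames by default, which is why the order above does not match the original list. If you need a deterministic order, the shuffling can be switched off (a minimal sketch):

filename_dataset = tf.data.Dataset.list_files(train_filenames, shuffle=False)
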
  • Step 2: read the file contents into datasets and merge the per-file datasets into one, so the final dataset consists of the lines of all the CSV files, one line per element. This uses tf.data.TextLineDataset (with .skip(1) to drop each file's header row):
n_readers = 5
'''
TextLineDataset reads the contents of the file with the given name, one line
at a time. With cycle_length = 5, interleave works on 5 files at once: in each
cycle it pulls one line from each of those 5 files (so 5 lines per cycle, one
per file), and it keeps cycling over the same 5 files until all their lines are
consumed, after which it moves on to the next 5 files in the same way.
'''
dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filename).skip(1),
    cycle_length = n_readers
)
for line in dataset.take(1):
    print(line.numpy())

Output: b'0.6363646332204844,-1.0895425985107923,0.09260902815633619,
        -0.20538124656801682,1.2025670451003232,-0.03630122549633783,
        -0.6784101660505877,0.182235342347858,2.429'

Because every value was converted to a string when the files were written, what we read back is of type bytes, hence the b prefix.
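
If you prefer an ordinary Python string over bytes, decode it as usual (a small sketch):

for line in dataset.take(1):
    print(line.numpy().decode("utf-8"))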

  • Step 3: parse each line
'''
record_defaults sets the type (and default value) of each parsed field.
np.nan means float32; note that the number of entries must equal the number of fields.
Likewise, 0 means int32, a string literal means string, and an empty constant such as
tf.constant([]) also means float32 (a standalone illustration of these types follows
after the example below).
'''
def parse_csv_line(line, n_fields=9):
    # Every field defaults to float32 (via np.nan); one default per field.
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])  # first 8 fields: features
    y = tf.stack(parsed_fields[-1:])   # last field: label
    return x, y

parse_csv_line(b'-0.9868720801669367,0.832863080552588,'
               b'-0.18684708416901633,-0.14888949288707784,'
               b'-0.4532302419670616,-0.11504995754593579,'
               b'1.6730974284189664,-0.7465496877362412,1.138',
               n_fields=9)

Output (note that this is a tuple containing two elements, x and y):
(<tf.Tensor: id=153, shape=(8,), dtype=float32, numpy=
 array([-0.9868721 ,  0.8328631 , -0.18684709, -0.1488895 , -0.45323023,
        -0.11504996,  1.6730974 , -0.74654967], dtype=float32)>,
 <tf.Tensor: id=154, shape=(1,), dtype=float32, numpy=array([1.138],   
  dtype=float32)>)
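
To illustrate the other record_defaults types mentioned above, here is a standalone sketch with a made-up five-field line (not part of the housing pipeline):

sample_str = '1,2,3,4,5'
record_defaults = [
    tf.constant(0, dtype=tf.int32),  # int32
    0,                               # also int32
    np.nan,                          # float32
    "hello",                         # string
    tf.constant([])                  # float32; an empty constant also marks the field as required
]
parsed_fields = tf.io.decode_csv(sample_str, record_defaults=record_defaults)
print(parsed_fields)  # scalar tensors with values 1, 2, 3.0, b'4', 5.0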

4. Using the line-parsing function above to read all the CSV files

'''
filenames: the CSV file names, i.e. their paths
n_readers: the interleave cycle_length, i.e. how many files are read per cycle
batch_size: batch size
n_parse_threads: number of parallel calls used when parsing lines
shuffle_buffer_size: size of the shuffle buffer; the buffer is filled first and
shuffled samples are then drawn from it, so a larger buffer shuffles more thoroughly
'''
def csv_reader_dataset(filenames, n_readers=5,
                       batch_size=32, n_parse_threads=5,
                       shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    # Repeat indefinitely, so the dataset never runs out
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length = n_readers
    )
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

train_set = csv_reader_dataset(train_filenames, batch_size=3)
for x_batch, y_batch in train_set.take(2):
    print("x:")
    pprint.pprint(x_batch)
    print("y:")
    pprint.pprint(y_batch)

Output:
x:
<tf.Tensor: id=229, shape=(3, 8), dtype=float32, numpy=
array([[ 0.09734604,  0.75276285, -0.20218964, -0.19547   , -0.40605137,
         0.00678553, -0.81371516,  0.6566148 ],
       [-0.32652634,  0.4323619 , -0.09345459, -0.08402992,  0.8460036 ,
        -0.02663165, -0.56176794,  0.1422876 ],
       [ 0.4240821 ,  0.91296333, -0.04437482, -0.15297213, -0.24727628,
        -0.10539167,  0.86126745, -1.335779  ]], dtype=float32)>
y:
<tf.Tensor: id=230, shape=(3, 1), dtype=float32, numpy=
array([[1.119],
       [2.431],
       [3.955]], dtype=float32)>
x:
<tf.Tensor: id=233, shape=(3, 8), dtype=float32, numpy=
array([[ 0.48530516, -0.8492419 , -0.06530126, -0.02337966,  1.4974351 ,
        -0.07790658, -0.90236324,  0.78145146],
       [ 0.63034356,  1.8741661 , -0.06713215, -0.12543367, -0.19737554,
        -0.02272263, -0.69240725,  0.72652334],
       [-1.4803331 , -0.68904144, -0.35624704, -0.17255889, -0.82158846,
        -0.13823092,  1.9157133 , -1.0211904 ]], dtype=float32)>
y:
<tf.Tensor: id=234, shape=(3, 1), dtype=float32, numpy=
array([[2.956],
       [2.419],
       [0.928]], dtype=float32)>

Finally, we can use the datasets produced by this function to train a model. Below is a simple model as a demonstration:

batch_size = 32
train_set = csv_reader_dataset(train_filenames,
                               batch_size = batch_size)
valid_set = csv_reader_dataset(valid_filenames,
                               batch_size = batch_size)
test_set = csv_reader_dataset(test_filenames,
                              batch_size = batch_size)
from tensorflow import keras

model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu',
                       input_shape=[8]),
    keras.layers.Dense(1),
])
model.compile(loss="mean_squared_error", optimizer="sgd")
callbacks = [keras.callbacks.EarlyStopping(
    patience=5, min_delta=1e-2)]

history = model.fit(train_set,
                    validation_data = valid_set,
                    steps_per_epoch = 11610 // batch_size,
                    validation_steps = 3870 // batch_size,
                    epochs = 100,
                    callbacks = callbacks)
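
Since test_set was built above but not used during training, here is a quick evaluation sketch (the step count follows from the 5160 test samples):

model.evaluate(test_set, steps=5160 // batch_size)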

That's all for this note. If you find any mistakes, please point them out. Keep going!!!
