from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
# Split the data into train/validation/test sets
from sklearn.model_selection import train_test_split
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state=11)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)
Output:
(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)
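These sizes follow from train_test_split's default test_size of 0.25: the full dataset has 20640 samples, 25% of them (5160) become the test set, 25% of the remaining 15480 (3870) become the validation set, and the 11610 samples left over are the training set.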
# Preprocess the data: standardize the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)
import os
import numpy as np

output_dir = "generate_csv"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
'''
output_dir: directory in which the CSV files are saved
data: the original data, i.e. the housing data above
name_prefix: prefix for the file names, e.g. train, valid or test
header: the first line written to each CSV file
n_parts: how many parts to split the train/valid/test data into
'''
def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    # {:02d} formats the file index as a two-digit decimal number,
    # zero-padded on the left, which is why the names contain 01, 02, ...
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []
    '''
    1. np.array_split(arr, n_parts) splits arr into n_parts pieces, allowing
       uneven pieces when the length is not evenly divisible, and returns the
       pieces in a list. Here it splits the row indices (e.g. the 11610 training
       indices) into n_parts arrays of indices into the original data; see the
       small example after this function.
    2. enumerate turns a sequence or iterable into an indexed sequence, yielding
       (index, element) pairs, much like a pandas Series where every element
       has an index.
    '''
    for file_idx, row_indices in enumerate(
            np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            for row_index in row_indices:
                # repr converts each element of the row to a string;
                # ",".join then joins them into a single CSV line.
                f.write(",".join([repr(col) for col in data[row_index]]))
                f.write('\n')
    return filenames
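As a quick illustration of how np.array_split behaves, here is a toy example with 10 indices and 3 parts (the numbers are chosen only for demonstration):
# Toy example: split 10 indices into 3 parts of uneven size.
print(np.array_split(np.arange(10), 3))
# [array([0, 1, 2, 3]), array([4, 5, 6]), array([7, 8, 9])]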
'''
Here train_data has shape (11610, 9),
valid_data has shape (3870, 9),
and test_data has shape (5160, 9).
'''
train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]
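np.c_ stacks the scaled features and the targets column by column, which is why each row ends up with 9 values (8 features plus the label). A toy illustration:
# Toy illustration of np.c_: column-wise concatenation.
print(np.c_[np.array([[1, 2], [3, 4]]), np.array([5, 6])])
# [[1 2 5]
#  [3 4 6]]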
'''
header_cols: [MedInc, HouseAge, AveRooms, AveBedrms, Population, AveOccup,
Latitude, Longitude, MedianHouseValue]
'''
header_cols = housing.feature_names + ["MedianHouseValue"]
header_str = ",".join(header_cols)
train_filenames = save_to_csv(output_dir, train_data, "train",
                              header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, "valid",
                              header_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, "test",
                             header_str, n_parts=10)
Let's take a look at the files that were generated:
import pprint
print("train filenames:")
pprint.pprint(train_filenames)
print("valid filenames:")
pprint.pprint(valid_filenames)
print("test filenames:")
pprint.pprint(test_filenames)
Output:
train filenames:
['generate_csv\\train_00.csv',
'generate_csv\\train_01.csv',
'generate_csv\\train_02.csv',
'generate_csv\\train_03.csv',
'generate_csv\\train_04.csv',
'generate_csv\\train_05.csv',
'generate_csv\\train_06.csv',
'generate_csv\\train_07.csv',
'generate_csv\\train_08.csv',
'generate_csv\\train_09.csv',
'generate_csv\\train_10.csv',
'generate_csv\\train_11.csv',
'generate_csv\\train_12.csv',
'generate_csv\\train_13.csv',
'generate_csv\\train_14.csv',
'generate_csv\\train_15.csv',
'generate_csv\\train_16.csv',
'generate_csv\\train_17.csv',
'generate_csv\\train_18.csv',
'generate_csv\\train_19.csv']
valid filenames:
['generate_csv\\valid_00.csv',
'generate_csv\\valid_01.csv',
'generate_csv\\valid_02.csv',
'generate_csv\\valid_03.csv',
'generate_csv\\valid_04.csv',
'generate_csv\\valid_05.csv',
'generate_csv\\valid_06.csv',
'generate_csv\\valid_07.csv',
'generate_csv\\valid_08.csv',
'generate_csv\\valid_09.csv']
test filenames:
['generate_csv\\test_00.csv',
'generate_csv\\test_01.csv',
'generate_csv\\test_02.csv',
'generate_csv\\test_03.csv',
'generate_csv\\test_04.csv',
'generate_csv\\test_05.csv',
'generate_csv\\test_06.csv',
'generate_csv\\test_07.csv',
'generate_csv\\test_08.csv',
'generate_csv\\test_09.csv']
At this point the files have been saved, and what we get back are the individual file names. Next, build a tf.data dataset out of those file names:
import tensorflow as tf
filename_dataset = tf.data.Dataset.list_files(train_filenames)
for filename in filename_dataset:
    print(filename)
Output:
tf.Tensor(b'generate_csv\\train_04.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_09.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_10.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_11.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_08.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_12.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_07.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_15.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_02.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_05.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_03.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_01.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_19.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_16.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_17.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_13.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_14.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_18.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_06.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv\\train_00.csv', shape=(), dtype=string)
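Note that list_files shuffles the file names by default, which is why the order above differs from the order of train_filenames. If a deterministic order is ever needed, list_files accepts a shuffle argument; a small sketch (using a separate variable name so the pipeline below is unchanged):
ordered_filename_dataset = tf.data.Dataset.list_files(train_filenames, shuffle=False)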
n_readers = 5
'''
TextLineDataset reads the contents of the given file one line at a time.
With cycle_length = 5, interleave cycles over 5 files at once: in each round
it takes one line from each of those 5 files, so one round yields five lines
belonging to five different files. It keeps cycling over the same 5 files
until all of their lines are consumed, and only then moves on to the next 5
files and processes them in the same way.
'''
dataset = filename_dataset.interleave(
    # skip(1) drops the header line of each CSV file
    lambda filename: tf.data.TextLineDataset(filename).skip(1),
    cycle_length=n_readers
)
for line in dataset.take(1):
    print(line.numpy())
Output:
b'0.6363646332204844,-1.0895425985107923,0.09260902815633619,
-0.20538124656801682,1.2025670451003232,-0.03630122549633783,
-0.6784101660505877,0.182235342347858,2.429'
Because every value was converted to a string when the files were written, what we read back is of type bytes, which is why the printed line is prefixed with b.
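To see the interleave pattern in isolation, here is a small, self-contained toy example (not part of the housing pipeline) that interleaves three tiny row datasets with cycle_length=2:
# Toy example: interleave three small datasets, two at a time.
toy = tf.data.Dataset.from_tensor_slices([[1, 2, 3], [10, 20, 30], [100, 200, 300]])
interleaved = toy.interleave(
    lambda row: tf.data.Dataset.from_tensor_slices(row),
    cycle_length=2)
print([v.numpy() for v in interleaved])
# [1, 10, 2, 20, 3, 30, 100, 200, 300]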
'''
record_defaults sets the type (and default value) of each parsed field.
tf.constant(np.nan) stands for a float32 field; note that the number of
defaults must equal the number of fields being parsed. In addition, 0 means
int32, a string literal means string, and an empty default such as [] also
means float32.
'''
def parse_csv_line(line, n_fields=9):
    # every field defaults to float32 (NaN when missing)
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])   # the first 8 fields are the features
    y = tf.stack(parsed_fields[-1:])    # the last field is the label
    return x, y
parse_csv_line(b'-0.9868720801669367,0.832863080552588,'
               b'-0.18684708416901633,-0.14888949288707784,'
               b'-0.4532302419670616,-0.11504995754593579,'
               b'1.6730974284189664,-0.7465496877362412,1.138',
               n_fields=9)
Output (note that this is a tuple with two elements, x and y):
(<tf.Tensor: id=153, shape=(8,), dtype=float32, numpy=
array([-0.9868721 , 0.8328631 , -0.18684709, -0.1488895 , -0.45323023,
-0.11504996, 1.6730974 , -0.74654967], dtype=float32)>,
<tf.Tensor: id=154, shape=(1,), dtype=float32, numpy=array([1.138],
dtype=float32)>)
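To make the record_defaults rules above concrete, here is a small, hypothetical sketch that mixes the different kinds of defaults (an int, a float, an empty float default, and a string); the sample line and variable names are only for illustration:
# Hypothetical sketch: mixed record_defaults for the line "1,2.0,3.0,hello".
sample_defaults = [0, np.nan, tf.constant([], dtype=tf.float32), "unknown"]
fields = tf.io.decode_csv("1,2.0,3.0,hello", record_defaults=sample_defaults)
# fields[0] is int32, fields[1] and fields[2] are float32, fields[3] is string.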
'''
filenames: the CSV file names (i.e. their paths)
n_readers: cycle_length for interleave, i.e. how many files are read per cycle
batch_size: batch size
n_parse_threads: number of parallel calls used when parsing the lines
shuffle_buffer_size: size of the shuffle buffer; it should be at least as large
as the data you want shuffled. The buffer is first filled, then shuffled
elements are drawn from it.
'''
def csv_reader_dataset(filenames, n_readers=5,
                       batch_size=32, n_parse_threads=5,
                       shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    # repeat indefinitely, so the dataset never runs out
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers
    )
    # shuffle returns a new dataset, so the result must be assigned back
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset
train_set = csv_reader_dataset(train_filenames, batch_size=3)
for x_batch, y_batch in train_set.take(2):
    print("x:")
    pprint.pprint(x_batch)
    print("y:")
    pprint.pprint(y_batch)
Output:
x:
<tf.Tensor: id=229, shape=(3, 8), dtype=float32, numpy=
array([[ 0.09734604, 0.75276285, -0.20218964, -0.19547 , -0.40605137,
0.00678553, -0.81371516, 0.6566148 ],
[-0.32652634, 0.4323619 , -0.09345459, -0.08402992, 0.8460036 ,
-0.02663165, -0.56176794, 0.1422876 ],
[ 0.4240821 , 0.91296333, -0.04437482, -0.15297213, -0.24727628,
-0.10539167, 0.86126745, -1.335779 ]], dtype=float32)>
y:
<tf.Tensor: id=230, shape=(3, 1), dtype=float32, numpy=
array([[1.119],
[2.431],
[3.955]], dtype=float32)>
x:
<tf.Tensor: id=233, shape=(3, 8), dtype=float32, numpy=
array([[ 0.48530516, -0.8492419 , -0.06530126, -0.02337966, 1.4974351 ,
-0.07790658, -0.90236324, 0.78145146],
[ 0.63034356, 1.8741661 , -0.06713215, -0.12543367, -0.19737554,
-0.02272263, -0.69240725, 0.72652334],
[-1.4803331 , -0.68904144, -0.35624704, -0.17255889, -0.82158846,
-0.13823092, 1.9157133 , -1.0211904 ]], dtype=float32)>
y:
<tf.Tensor: id=234, shape=(3, 1), dtype=float32, numpy=
array([[2.956],
[2.419],
[0.928]], dtype=float32)>
batch_size = 32
train_set = csv_reader_dataset(train_filenames,
                               batch_size=batch_size)
valid_set = csv_reader_dataset(valid_filenames,
                               batch_size=batch_size)
test_set = csv_reader_dataset(test_filenames,
                              batch_size=batch_size)
from tensorflow import keras

model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu',
                       input_shape=[8]),
    keras.layers.Dense(1),
])
model.compile(loss="mean_squared_error", optimizer="sgd")
callbacks = [keras.callbacks.EarlyStopping(
    patience=5, min_delta=1e-2)]
history = model.fit(train_set,
                    validation_data=valid_set,
                    # the training set has 11610 samples
                    steps_per_epoch=11610 // batch_size,
                    validation_steps=3870 // batch_size,
                    epochs=100,
                    callbacks=callbacks)
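Because csv_reader_dataset repeats its data indefinitely, the test_set built above can be evaluated in the same way by giving the number of steps explicitly. A minimal sketch, assuming the 5160 test samples reported earlier:
# Evaluate on test_set; the step count is required since the dataset repeats forever.
model.evaluate(test_set, steps=5160 // batch_size)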
That's it for this walkthrough. If you spot any mistakes anywhere, corrections are very welcome!