文中先用经度、纬度作为横纵坐标,用 'median_house_value' 来作为 colormap:
# Scatter-plot the validation and training splits side by side:
# longitude on x, latitude on y, colour = median_house_value (scaled to [0, 1]).
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('california_housing_train.csv', sep=',')
# NOTE(review): the rows are NOT shuffled here, so tail/head give
# systematically different splits — this is the teaching point discussed below.
validation_examples = df.tail(5000)
training_examples = df.head(12000)
pd.set_option('display.max_columns', None)  # show every column in describe()
print(validation_examples.describe())
print(training_examples.describe())
plt.figure(1)
ax = plt.subplot(1, 2, 1)
# Colour values are divided by the column max so both plots use the 0-1 range.
plt.scatter(validation_examples['longitude'], validation_examples['latitude'], cmap='coolwarm',
c=validation_examples['median_house_value']/validation_examples['median_house_value'].max())
ax.set_autoscaley_on(True)
ax.set_autoscalex_on(True)
ax1 = plt.subplot(1, 2, 2)
plt.scatter(training_examples['longitude'], training_examples['latitude'], cmap='coolwarm',
c=training_examples['median_house_value']/training_examples['median_house_value'].max())
# NOTE(review): the two subplots do not share x/y limits, which makes the
# side-by-side comparison misleading — deliberately left as an exercise in the text.
plt.show()
不知道你是否看出问题了呢?
是的,这里我并没有将 x轴和y轴的lim统一,如果我们在同一个figure上画多张图,很多时候我们是为了对比,然而没有统一x轴y轴的lim很可能让我们对数据有误判。当我们统一坐标轴后依旧有问题,请自己写写看。
是的,统一坐标轴后,我们发现训练集和验证集的分布完全不一致。这是因为我们并没有将数据集的 index 打乱。(原文之前是打乱过的,这里我直接把打乱的代码 comment 掉了,并且没有给出被 comment 的代码。)不妨试着自己处理一下,需要用到 numpy 库。
这里是我处理好的图片:
因为所有方法之前提到过,这里就不再给出代码,请自己动手试一下。
“机器学习中的调试通常是数据调试而不是代码调试。
如果数据有误,即使最高级的机器学习代码也挽救不了局面。”
好了,下一步我们就该开始搭建我们的神经网络了,这里我新建了一个py文件:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.data import Dataset
df = pd.read_csv('california_housing_train.csv', sep=',')
# Scale the target to thousands of dollars so the loss stays in a small range.
df['median_house_value'] /= 1000
# Shuffle the rows so head()/tail() yield i.i.d. train/validation splits.
# BUG FIX: the original called df.sort_index() right after the permutation,
# which restored the on-disk row order and silently undid the shuffle.
df = df.reindex(np.random.permutation(df.index))
validation_examples = df.tail(5000)
training_examples = df.head(12000)
def input_features(features, targets, batch_size=1, shuffle=None, num_epochs=None):
    """Build a one-shot tf.data input pipeline over (features, targets).

    Args:
        features: pandas DataFrame of input columns.
        targets: pandas Series of labels, aligned with `features`.
        batch_size: number of examples per batch.
        shuffle: if truthy, shuffle individual examples before batching.
        num_epochs: passes over the data; None repeats indefinitely.

    Returns:
        (features, labels): tensors yielding the next batch; `features`
        is a dict keyed by column name.
    """
    # Convert each column into a plain numpy array keyed by column name.
    features = {key: np.array(value) for key, value in dict(features).items()}
    ds = Dataset.from_tensor_slices((features, targets))
    # BUG FIX: shuffle *examples* before batching; the original applied
    # .shuffle() after .batch(), which only reorders whole batches.
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)
    ds = ds.batch(batch_size).repeat(num_epochs)
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels
def add_layer(inputs, input_size, output_size, activation_function=None):
    """Fully-connected layer: inputs @ W + b, optionally passed through
    `activation_function`; returns the layer's output tensor."""
    w = tf.Variable(tf.random_normal([input_size, output_size]))
    b = tf.Variable(tf.zeros(output_size))
    pre_activation = tf.matmul(inputs, w) + b
    if activation_function is not None:
        return activation_function(pre_activation)
    return pre_activation
def loss(pred, ys):
    """Root-mean-squared error between predictions and targets.

    BUG FIX: `pred` comes out of the network with shape (N, 1) while `ys`
    arrives from the input pipeline with shape (N,), so `pred - ys` would
    broadcast to (N, N) and the "RMSE" would be computed over N*N pairwise
    differences. Reshape the targets to match the predictions first.
    """
    ys = tf.reshape(ys, tf.shape(pred))
    RMSE = tf.sqrt(tf.reduce_mean(tf.square(pred - ys)))
    return RMSE
def train_step(learning_rate, loss):
    """Return a training op that minimises `loss` with Adam at `learning_rate`."""
    optimizer = tf.train.AdamOptimizer(learning_rate)
    return optimizer.minimize(loss)
# Inputs: longitude and latitude; target: median_house_value (in $1000s).
train_x1 = training_examples[['longitude']].astype('float32')
train_x2 = training_examples[['latitude']].astype('float32')
train_y1 = training_examples['median_house_value'].astype('float32')
validation_x1 = validation_examples[['longitude']].astype('float32')
validation_x2 = validation_examples[['latitude']].astype('float32')
# BUG FIX: the validation target must be median_house_value; the original
# accidentally reused the 'latitude' column here.
validation_y1 = validation_examples['median_house_value'].astype('float32')
xs1, ys = input_features(train_x1, train_y1, batch_size=2000)
xs1 = tf.expand_dims(xs1['longitude'], -1)  # (batch,) -> (batch, 1)
xs2, _ = input_features(train_x2, train_y1, batch_size=2000, shuffle=False)
xs2 = tf.expand_dims(xs2['latitude'], -1)
xv1, v1 = input_features(validation_x1, validation_y1)
xv2, _ = input_features(validation_x2, validation_y1)
xv1 = tf.expand_dims(xv1['longitude'], -1)
xv2 = tf.expand_dims(xv2['latitude'], -1)
inputs = tf.concat([xs1, xs2], 1)  # (batch, 2): [longitude, latitude]
# 2 -> 10 (tanh) -> 1 network predicting house value.
l1 = add_layer(inputs, 2, 10, activation_function=tf.nn.tanh)
pred = add_layer(l1, 10, 1)
_loss = loss(pred, ys)
_train = train_step(0.01, _loss)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
for i in range(5000):
    sess.run(_train)
    if i % 50 == 0:
        # Training-batch RMSE with the current weights.
        print(sess.run(_loss))
这里可以看到xv的数据我们并没有用上,因为这里我们只是显示出了训练集的loss。
我们不妨考虑一下验证集的 loss,这里给出整体代码:
# coding=utf-8
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.data import Dataset
df = pd.read_csv('california_housing_train.csv', sep=',')
# Scale the target to thousands of dollars so the loss stays in a small range.
df['median_house_value'] /= 1000
# Shuffle the rows so head()/tail() yield i.i.d. train/validation splits.
# BUG FIX: the original called df.sort_index() right after the permutation,
# which restored the on-disk row order and silently undid the shuffle.
df = df.reindex(np.random.permutation(df.index))
validation_examples = df.tail(5000)
training_examples = df.head(12000)
def input_features(features, targets, batch_size=1, shuffle=None, num_epochs=None):
    """Build a one-shot tf.data input pipeline over (features, targets).

    Args:
        features: pandas DataFrame of input columns.
        targets: pandas Series of labels, aligned with `features`.
        batch_size: number of examples per batch.
        shuffle: if truthy, shuffle individual examples before batching.
        num_epochs: passes over the data; None repeats indefinitely.

    Returns:
        (features, labels): tensors yielding the next batch; `features`
        is a dict keyed by column name.
    """
    # Convert each column into a plain numpy array keyed by column name.
    features = {key: np.array(value) for key, value in dict(features).items()}
    ds = Dataset.from_tensor_slices((features, targets))
    # BUG FIX: shuffle *examples* before batching; the original applied
    # .shuffle() after .batch(), which only reorders whole batches.
    if shuffle:
        ds = ds.shuffle(buffer_size=10000)
    ds = ds.batch(batch_size).repeat(num_epochs)
    features, labels = ds.make_one_shot_iterator().get_next()
    return features, labels
def add_layer(inputs, input_size, output_size, activation_function=None):
    """Fully-connected layer: inputs @ W + b, optionally activated.

    Returns (weights, biases, outputs) so the trained parameters can be
    reused to build a parallel evaluation graph on other data.
    """
    w = tf.Variable(tf.random_normal([input_size, output_size]))
    b = tf.Variable(tf.zeros(output_size))
    pre_activation = tf.matmul(inputs, w) + b
    out = pre_activation if activation_function is None else activation_function(pre_activation)
    return w, b, out
def loss(pred, ys):
    """Root-mean-squared error between predictions and targets.

    BUG FIX: `pred` comes out of the network with shape (N, 1) while `ys`
    arrives from the input pipeline with shape (N,), so `pred - ys` would
    broadcast to (N, N) and the "RMSE" would be computed over N*N pairwise
    differences. Reshape the targets to match the predictions first.
    """
    ys = tf.reshape(ys, tf.shape(pred))
    RMSE = tf.sqrt(tf.reduce_mean(tf.square(pred - ys)))
    return RMSE
def train_step(learning_rate, loss):
    """Return a training op that minimises `loss` with Adam at `learning_rate`."""
    optimizer = tf.train.AdamOptimizer(learning_rate)
    return optimizer.minimize(loss)
# Inputs: longitude and latitude; target: median_house_value (in $1000s).
train_x1 = training_examples[['longitude']].astype('float32')
train_x2 = training_examples[['latitude']].astype('float32')
train_y1 = training_examples['median_house_value'].astype('float32')
validation_x1 = validation_examples[['longitude']].astype('float32')
validation_x2 = validation_examples[['latitude']].astype('float32')
validation_y1 = validation_examples['median_house_value'].astype('float32')
# One pipeline per feature, driven by the same targets; batch_size=2000.
xs1, ys = input_features(train_x1, train_y1, batch_size=2000)
xs1 = tf.expand_dims(xs1['longitude'], -1)  # (batch,) -> (batch, 1)
xs2, _ = input_features(train_x2, train_y1, batch_size=2000, shuffle=False)
xs2 = tf.expand_dims(xs2['latitude'], -1)
inputs = tf.concat([xs1, xs2], 1)  # (batch, 2): [longitude, latitude]
# 2 -> 10 (tanh) -> 1 network; the weights/biases are kept so the same
# network can be rebuilt on the validation data below.
w1, b1, l1 = add_layer(inputs, 2, 10, activation_function=tf.nn.tanh)
w2, b2, pred = add_layer(l1, 10, 1)
# NOTE(review): pred is (batch, 1) but ys is (batch,) — make sure loss()
# aligns these shapes, otherwise the subtraction broadcasts to (batch, batch).
_loss = loss(pred, ys)
_train = train_step(0.01, _loss)
# BUG FIX: the original rebuilt the validation ops (tf.matmul / tf.sqrt / ...)
# inside the training loop, adding new nodes to the graph every 50 steps and
# steadily bloating it. Build the validation graph ONCE, before training.
_inputs = np.concatenate((validation_x1, validation_x2), 1)  # (5000, 2)
validation_y2 = validation_y1[:, np.newaxis]  # (5000,) -> (5000, 1) to match _pred
_l1 = tf.nn.tanh(tf.matmul(_inputs, w1) + b1)
_pred = tf.matmul(_l1, w2) + b2
v_loss = tf.sqrt(tf.reduce_mean(tf.square(_pred - validation_y2), reduction_indices=[0]))
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
for i in range(5000):
    sess.run(_train)
    if i % 50 == 0:
        # Validation RMSE with the current weights.
        print(sess.run(v_loss))
我这里验证集 RMSE 的结果最终为 115.5(因初始化参数不同,结果可能不同)。
这里的图像就不再画了(因为之后会介绍tensorboard),这里的代码与书中略有不同,请酌情参考。