import numpy as np
import pandas as pd
import tensorflow as tf
df = pd.read_csv('california_housing_train.csv')
def z_score(features):
    # Standardize a column: subtract the mean and divide by the standard deviation.
    return (features - features.mean()) / features.std()

def log_score(features):
    # Log-scale a column; the +1 keeps zero values finite.
    return np.log(features + 1)

def gets_onehot(inputs, features, buckets):
    # Bucketize a numeric column and return its one-hot encoding as a tensor.
    # np.linspace gives `buckets` interior boundaries (both endpoints are dropped),
    # so bucketized_column produces buckets + 1 one-hot columns.
    _inputs = {features: inputs.values}
    df_fc = tf.feature_column.numeric_column(features)
    _range = np.linspace(inputs.min(), inputs.max(), buckets + 2)
    _range = np.delete(_range, -1)
    _range = np.delete(_range, 0)
    _column = tf.feature_column.bucketized_column(df_fc, list(_range))
    _tensor = tf.feature_column.input_layer(_inputs, [_column])
    return _tensor
# Derived per-person features, then log-scale the numeric columns.
df['per_rooms'] = df['total_rooms'] / df['population']
df['per_bedrooms'] = df['total_bedrooms'] / df['population']
df['households'] = log_score(df['households'])
df['housing_median_age'] = log_score(df['housing_median_age'])
df['median_house_value'] = log_score(df['median_house_value'])
df['per_rooms'] = log_score(df['per_rooms'])
df['per_bedrooms'] = log_score(df['per_bedrooms'])
df['median_income'] = log_score(df['median_income'])
# Bucketize latitude and longitude (6 boundaries -> 7 one-hot columns each).
sess = tf.Session()
df1 = sess.run(gets_onehot(df['latitude'], 'latitude', 6)).copy()
df2 = sess.run(gets_onehot(df['longitude'], 'longitude', 6)).copy()
df1 = pd.DataFrame(df1, columns=['latitude%d' % i for i in range(1, 8)])
df2 = pd.DataFrame(df2, columns=['longitude%d' % i for i in range(1, 8)])
df_total = pd.concat([df, df1, df2], axis=1)
df_total.to_csv('california_features.csv', index=False)
In this first Python script I did the feature engineering. Most columns are scaled with the log transform defined above (a z_score helper is provided as well), but there is no sensible way to z-score latitude and longitude, and a log transform doesn't handle them either, so for those two I went with bucketing (binning) instead and saved the result to another csv so that we can import it in the next script.
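As a quick sanity check on the bucketing, here is a minimal sketch that reuses the df, gets_onehot, and sess objects defined in the script above (the values shown in the comments are illustrative):

# Quick sanity check on the bucketing: 6 interior boundaries -> 7 one-hot columns.
lat = df['latitude']
boundaries = np.linspace(lat.min(), lat.max(), 6 + 2)[1:-1]  # drop both endpoints
print(boundaries)                      # the 6 bucket boundaries
onehot = sess.run(gets_onehot(lat, 'latitude', 6))
print(onehot.shape)                    # (num_rows, 7): one column per bucket
print(onehot[0])                       # a single one-hot row, e.g. [0. 0. 1. 0. 0. 0. 0.]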
import tensorflow as tf
from tensorflow.data import Dataset
import numpy as np
class dnn():
    def get_inputs(self, features, targets, batch_size=1, shuffle=True, num_epochs=100):
        # Build a Dataset input pipeline and return tensors from a one-shot iterator.
        # repeat(num_epochs) bounds how much data the iterator can emit; if the
        # training loop asks for more batches than that, you get OutOfRangeError.
        features = {key: np.array(value) for key, value in dict(features).items()}
        ds = Dataset.from_tensor_slices((features, targets))
        ds = ds.batch(batch_size).repeat(num_epochs)
        if shuffle:
            ds = ds.shuffle(10000)
        features, labels = ds.make_one_shot_iterator().get_next()
        return features, labels
    def add_layers(self, n_layer, inputs, input_size, output_size, activation_function=None):
        # One fully connected layer; the weights' L2 penalty goes into the 'losses' collection.
        layer_name = 'layer_%s' % n_layer
        with tf.name_scope(layer_name):
            with tf.name_scope('Weights'):
                weights = tf.Variable(tf.random_normal([input_size, output_size]))
                tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(0.1)(weights))
                tf.summary.histogram(layer_name + '/weights', weights)
            with tf.name_scope('biases'):
                biases = tf.Variable(tf.zeros(output_size) + 0.1)
                tf.summary.histogram(layer_name + '/biases', biases)
            with tf.name_scope('Wx_b'):
                wx_b = tf.matmul(inputs, weights) + biases
            if activation_function is None:
                outputs = wx_b
            else:
                outputs = activation_function(wx_b)
            tf.summary.histogram(layer_name + '/outputs', outputs)
            return outputs
    def loss(self, pred, targets):
        # RMSE plus the accumulated L2 regularization terms.
        with tf.name_scope('loss'):
            _loss_rmse = tf.sqrt(tf.reduce_mean(tf.square(pred - targets)))
            _loss = _loss_rmse + tf.add_n(tf.get_collection('losses'))
            tf.summary.scalar('loss', _loss)
            return _loss
    def train_step(self, learning_rate, loss):
        with tf.name_scope('train'):
            _train = tf.train.AdamOptimizer(learning_rate).minimize(loss)
            return _train
Next, I created a new Python file to build our fully connected neural network. This goes a bit beyond what the course has covered so far (because the loss oscillated towards the end, I added L2 regularization here). If you hit an OutOfRangeError while running, remember to check the num_epochs value passed to repeat() (it took me quite a while to track that down). For the training step I did not keep the FTRL optimizer from the course docs (its loss was consistently higher than Adam's over several runs) and used Adam instead; I will cover the Adam algorithm separately in a later post.
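To see where the OutOfRangeError comes from, a rough budget check helps (the numbers below are illustrative and simply mirror the scripts in this post):

# repeat(num_epochs) caps the pipeline at num_examples * num_epochs examples in total,
# and every sess.run() that touches the one-shot iterator (the train step, the merged
# summaries, the loss print) consumes one batch.
num_examples = 17000
batch_size = 100
num_epochs = 100
max_batches = num_examples * num_epochs // batch_size   # 17000 batches available
batches_used = 5000 + 2 * (5000 // 50)                   # train steps + summary/loss runs
assert batches_used <= max_batches, "num_epochs too small -> OutOfRangeError"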
import pandas as pd
import numpy as np
import tensorflow as tf
import california_neuralnetwork
df = pd.read_csv('california_features.csv')
# Shuffle the rows; calling sort_index() afterwards would undo the shuffle, so don't.
df = df.reindex(np.random.permutation(df.index))
training_examples = df.head(17000).astype('float32')
training_targets = df['median_house_value'].head(17000).astype('float32')
# features_validation = df.tail(5000).astype('float32')
_dnn = california_neuralnetwork.dnn()
xs, ys = _dnn.get_inputs(training_examples, training_targets, batch_size=100, shuffle=True)
# Make the labels [batch_size, 1] so they line up with the network's output shape.
ys = tf.expand_dims(ys, -1)
# Stack the selected feature columns into a single [batch_size, 19] input tensor.
feature_names = ['housing_median_age', 'households', 'median_income', 'per_rooms', 'per_bedrooms'] \
    + ['latitude%d' % i for i in range(1, 8)] \
    + ['longitude%d' % i for i in range(1, 8)]
xs_input = tf.concat([tf.expand_dims(xs[name], -1) for name in feature_names], axis=-1)
l1 = _dnn.add_layers('layer_1', xs_input, 19, 100, activation_function=tf.nn.tanh)
l2 = _dnn.add_layers('layer_2', l1, 100, 1, activation_function=None)
_loss = _dnn.loss(l2, ys)
_train_step = _dnn.train_step(0.005, _loss)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter('logs/', sess.graph)
for i in range(5000):
    sess.run(_train_step)
    if i % 50 == 0:
        # Each of these sess.run calls pulls a fresh batch from the one-shot iterator.
        result = sess.run(merged)
        writer.add_summary(result, i)
        print(sess.run(_loss))
Finally, let's run the network. I end up at roughly 0.63 (the result may differ slightly because of random initialization; if you don't want that, set the random seeds beforehand). I've commented out the validation data here, so the printed loss is on the training data. I may cover FileWriter properly later, and once we can save the whole network we'll look at the validation and test loss (after wrestling with the OutOfRange problem for half a day I really had no energy left to add anything else).
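If you do want reproducible runs, here is a minimal sketch of fixing the seeds (place it before the data shuffle and before any variables are created; the seed value 42 is arbitrary):

# Fix the random seeds so the row permutation and the weight initialization repeat.
import numpy as np
import tensorflow as tf
np.random.seed(42)        # controls np.random.permutation used to shuffle the rows
tf.set_random_seed(42)    # controls tf.random_normal used to initialize the weights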
That basically wraps up the fitting topic, and the overall skeleton is there. As for feature crosses, please experiment yourselves; there should be combinations that reach a lower loss.
I won't try more here. The original article suggests testing with just 2 or 3 feature combinations, whereas I used far too many (I threw in everything). Some of those features may not really be necessary.
One more note on df.corr():
We can use corr() to get the r coefficient (correlation coefficient) between features, but after checking with plt.scatter() the correlated features show no particularly clear linear relationship, so we can't use the r coefficient here to rule any feature out.
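For reference, a minimal sketch of that check (it assumes the same feature csv as above and that matplotlib is installed; the column choice is just an example):

# Correlation matrix for a few columns, plus a quick visual check of one pair.
import matplotlib.pyplot as plt
print(df[['median_income', 'median_house_value', 'per_rooms']].corr())
plt.scatter(df['median_income'], df['median_house_value'], s=1)
plt.xlabel('median_income (log-scaled)')
plt.ylabel('median_house_value (log-scaled)')
plt.show()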
As for room_per_person: a previous article mentioned that after the log transform we also cap values above 4 at 4. I haven't done that here; our source data may have issues, so please use your own judgment.
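If you want to add that cap, a one-liner along these lines would do it (the threshold 4 follows the earlier article; per_rooms is the column built in the first script):

# Cap the log-scaled per_rooms feature at 4 so extreme outliers don't dominate.
df['per_rooms'] = df['per_rooms'].clip(upper=4)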
As for how a particular feature should be handled, bucketing versus a boolean flag (or one-hot encoding), that is a judgment call that comes with experience (I don't have much, so my choices here may not be the best).
In any case, we should keep each feature's values within a bounded range; that makes it easier for the model to learn quickly (I may get to batch normalization later on).
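One simple way to bound a column, as a minimal sketch (min-max scaling to [0, 1]; this is an alternative to the z_score/log_score helpers above, not something the original scripts use):

def min_max_scale(features):
    # Linearly rescale a column into [0, 1].
    return (features - features.min()) / (features.max() - features.min())

# e.g. df['households'] = min_max_scale(df['households'])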
That's about all the issues I can think of for now, so I'll stop here.
One thing I forgot to mention: the code above writes files into a /logs folder; those files are what lets us visualize the network's training process with TensorBoard.
To use it:
pip3 install tensorboard
(recent TensorFlow releases already bundle TensorBoard; older ones shipped it as the tensorflow-tensorboard package)
Once it is installed, open a terminal, change into the directory that contains /logs, and run
tensorboard --logdir logs
It will report an address like localhost:6006.
Open localhost:6006 in a browser and you can see the graphs.