【学习笔记】特征组合编程练习

import numpy as np
import pandas as pd
import tensorflow as tf

# Load the raw housing CSV; the path is resolved relative to the current
# working directory.
df = pd.read_csv('california_housing_train.csv')

def z_score(features):
    """Standardize ``features`` by its own mean and standard deviation.

    Args:
        features: Any object supporting ``.mean()``, ``.std()`` and
            element-wise arithmetic (e.g. a pandas Series or DataFrame).

    Returns:
        ``(features - features.mean()) / features.std()``, with the same
        type as the arithmetic on ``features`` produces.
    """
    centered = features - features.mean()
    return centered / features.std()


def log_score(features):
    """Apply the natural-log transform ``log(x + 1)`` element-wise.

    Args:
        features: Numeric scalar, NumPy array or pandas object.

    Returns:
        ``np.log(features + 1)``; an input of 0 maps to 0.
    """
    shifted = features + 1
    return np.log(shifted)


def gets_onehot(inputs, features, buckets):
    """Build a one-hot tensor for a numeric column using equal-width bins.

    Args:
        inputs: Column of numeric values; must provide ``.min()``, ``.max()``
            and ``.values`` (e.g. a pandas Series).
        features: Name used as the feature-column key for ``inputs``.
        buckets: Number of interior bin boundaries. Because the boundaries
            split the range into one more interval than there are
            boundaries, the encoded output has ``buckets + 1`` slots.

    Returns:
        The tensor produced by ``tf.feature_column.input_layer`` for the
        bucketized column (not yet evaluated; run it in a session).
    """
    # linspace gives buckets + 2 evenly spaced points from min to max;
    # dropping the first and last leaves only the interior boundaries.
    boundaries = list(
        np.linspace(inputs.min(), inputs.max(), buckets + 2)[1:-1]
    )
    source_column = tf.feature_column.numeric_column(features)
    bucketized = tf.feature_column.bucketized_column(source_column, boundaries)
    return tf.feature_column.input_layer({features: inputs.values}, [bucketized])


# Per-capita ratios derived from the raw counts.
df['per_rooms'] = df['total_rooms'] / df['population']
df['per_bedrooms'] = df['total_bedrooms'] / df['population']

# Apply log(x + 1) to every numeric column that feeds the model, including
# the target column median_house_value.
for _col in ('households', 'housing_median_age', 'median_house_value',
             'per_rooms', 'per_bedrooms', 'median_income'):
    df[_col] = log_score(df[_col])

# Latitude/longitude are bucketized instead of log-scaled. Six interior
# boundaries yield seven one-hot columns per coordinate.
# The context manager releases the session once both tensors are evaluated.
with tf.Session() as sess:
    df1 = sess.run(gets_onehot(df['latitude'], 'latitude', 6))
    df2 = sess.run(gets_onehot(df['longitude'], 'longitude', 6))

df1 = pd.DataFrame(df1, columns=['latitude%d' % i for i in range(1, 8)])
df2 = pd.DataFrame(df2, columns=['longitude%d' % i for i in range(1, 8)])

df_total = pd.concat([df, df1, df2], axis=1)

# File name is all lower-case so it matches the name the training script
# reads ('california_features.csv'); the previous capitalised name only
# worked on case-insensitive filesystems.
df_total.to_csv('california_features.csv')

这里我在第一个Python文件中先进行了特征处理。可见我这里基本都是用log(x+1)处理的(z_score函数虽然定义了,但实际并没有用到)。但是对于经度、纬度,实在没办法用z分数处理,同时log也处理不了,这里我选用了分箱的方式。最后将结果存为另一个csv,以便我们之后导入。

 

import tensorflow as tf
from tensorflow.data import Dataset
import numpy as np

class dnn():
    """Helpers for assembling a small fully connected regression network.

    All methods build TensorFlow 1.x graph ops; nothing is executed until
    the returned tensors/ops are run in a ``tf.Session``.
    """

    def get_inputs(self, features, targets, batch_size=1, shuffle=True, num_epochs=100):
        """Create an input pipeline and return its next-batch tensors.

        Args:
            features: Mapping-like object (e.g. a pandas DataFrame) of
                column name -> values.
            targets: Label values aligned row-by-row with ``features``.
            batch_size: Number of rows per batch.
            shuffle: Whether to shuffle the stream.
            num_epochs: How many times the data is repeated. Running the
                returned tensors more often than the data allows raises
                ``tf.errors.OutOfRangeError`` from ``Session.run``.

        Returns:
            ``(features, labels)``: a dict of feature tensors keyed by
            column name and the label tensor for one batch.
        """
        features = {key: np.array(value) for key, value in dict(features).items()}
        ds = Dataset.from_tensor_slices((features, targets))
        ds = ds.batch(batch_size).repeat(num_epochs)
        if shuffle:
            # Applied after batch(), so whole batches are shuffled and the
            # buffer holds up to 10000 batches.
            ds = ds.shuffle(10000)
        # get_next() only adds an op to the graph. OutOfRangeError is raised
        # later by Session.run when the data is exhausted, so it cannot be
        # caught here; the former handler was unreachable and itself invalid
        # (unbound Session.run, make_initializer() without a dataset).
        features, labels = ds.make_one_shot_iterator().get_next()
        return features, labels

    def add_layers(self, n_layer, inputs, input_size, output_size, activation_function=None):
        """Append one dense layer to the graph.

        Args:
            n_layer: Layer identifier; the name scope is ``'layer_%s' % n_layer``.
            inputs: Input tensor whose last dimension is ``input_size``.
            input_size: Number of input units.
            output_size: Number of output units.
            activation_function: Callable applied to ``inputs @ W + b``, or
                ``None`` for a linear layer.

        Returns:
            The layer's output tensor.
        """
        layer_name = 'layer_%s' % n_layer
        with tf.name_scope(layer_name):
            with tf.name_scope('Weights'):
                weights = tf.Variable(tf.random_normal([input_size, output_size]))
                # The L2 penalty is collected here and summed into the
                # total loss by loss().
                tf.add_to_collection('losses', tf.contrib.layers.l2_regularizer(0.1)(weights))
                tf.summary.histogram(layer_name + '/weights', weights)
            with tf.name_scope('biases'):
                biases = tf.Variable(tf.zeros(output_size) + 0.1)
                tf.summary.histogram(layer_name + '/biases', biases)
            with tf.name_scope('Wx_b'):
                wx_b = tf.matmul(inputs, weights) + biases
            if activation_function is None:
                outputs = wx_b
            else:
                outputs = activation_function(wx_b)
            tf.summary.histogram(layer_name + '/outputs', outputs)
            return outputs

    def loss(self, pred, targets):
        """Return RMSE(pred, targets) plus all collected L2 penalties.

        Args:
            pred: Prediction tensor, e.g. shape ``(batch, 1)``.
            targets: Label tensor with the same number of elements as
                ``pred``; it is reshaped to ``pred``'s shape.

        Returns:
            Scalar loss tensor (also recorded as the ``'loss'`` summary).
        """
        with tf.name_scope('loss'):
            # Without this, (batch, 1) - (batch,) broadcasts to
            # (batch, batch) and the RMSE is taken over every
            # prediction/label pair instead of matching rows.
            targets = tf.reshape(targets, tf.shape(pred))
            _loss_rmse = tf.sqrt(tf.reduce_mean(tf.square(pred - targets)))
            _loss = _loss_rmse + tf.add_n(tf.get_collection('losses'))
            tf.summary.scalar('loss', _loss)
            return _loss

    def train_step(self, learning_rate, loss):
        """Return an Adam optimizer op that minimizes ``loss``.

        Args:
            learning_rate: Learning rate passed to ``tf.train.AdamOptimizer``.
            loss: Scalar tensor to minimize.
        """
        with tf.name_scope('train'):
            _train = tf.train.AdamOptimizer(learning_rate).minimize(loss)
            return _train


下一步我新建了一个Python文件来创建我们的全连接神经网络,这里有点超过所学内容了(由于最后loss出现了震荡的情况,我在这里加入了L2正则化)。如果你在运行中遇到OutOfRangeError的错误,请记得检查一下repeat中的epochs数值(改了好半天)。这里训练的方法我并没有沿用文档中的FTRL(运行多次,loss均高于Adam),而是选用了Adam算法,至于Adam算法我之后会专门讲解。

 

import pandas as pd
import numpy as np
import tensorflow as tf
import california_neuralnetwork

# Columns fed to the network, in the order they are concatenated.
FEATURE_COLUMNS = (
    ['housing_median_age', 'households', 'median_income',
     'per_rooms', 'per_bedrooms']
    + ['latitude%d' % i for i in range(1, 8)]
    + ['longitude%d' % i for i in range(1, 8)]
)

df = pd.read_csv('california_features.csv')
# Shuffle the rows. The former sort_index() call right after this restored
# the original order and so cancelled the shuffle.
df = df.reindex(np.random.permutation(df.index))
# Only the model inputs are passed on, so the label column is not part of
# the feature dict handed to the input pipeline.
training_examples = df[FEATURE_COLUMNS].head(17000).astype('float32')
training_targets = df['median_house_value'].head(17000).astype('float32')
# features_validation = df.tail(5000).astype('float32')
_dnn = california_neuralnetwork.dnn()

xs, ys = _dnn.get_inputs(training_examples, training_targets, batch_size=100, shuffle=True)
# Give the labels a trailing unit dimension so they line up with the
# network's single-unit output when the loss subtracts them.
ys = tf.expand_dims(ys, -1)

# Each feature tensor gets a trailing dimension, then all are joined
# side-by-side into one input matrix.
xs_input = tf.concat([tf.expand_dims(xs[name], -1) for name in FEATURE_COLUMNS], -1)

# add_layers() formats the name as 'layer_%s', so pass the bare number.
l1 = _dnn.add_layers(1, xs_input, len(FEATURE_COLUMNS), 100, activation_function=tf.nn.tanh)
l2 = _dnn.add_layers(2, l1, 100, 1, activation_function=None)
_loss = _dnn.loss(l2, ys)
_train_step = _dnn.train_step(0.005, _loss)

init = tf.global_variables_initializer()
merged = tf.summary.merge_all()

with tf.Session() as sess:
    sess.run(init)
    writer = tf.summary.FileWriter('logs/', sess.graph)

    for i in range(5000):
        if i % 50 == 0:
            # One run() call so the summary and printed loss describe the
            # same batch the optimizer step used; separate calls would each
            # pull a fresh batch from the iterator.
            _, result, loss_value = sess.run([_train_step, merged, _loss])
            writer.add_summary(result, i)
            print(loss_value)
        else:
            sess.run(_train_step)

    # Flush pending summaries to disk before the session is closed.
    writer.close()

最后让我们运行一下我们的网络,我这里最后接近0.63(因初始化数据不同,结果可能略有不同,如果你不希望出现这种情况,请提前设置好随机数种子)。我这里把验证数据comment掉了。 loss输出的为训练数据,之后可能会介绍FileWriter的用法,到时候我们储存整个网络后再看验证集或者测试集的loss(因为OutofRange的问题折腾了半天,实在无心再加别的东西了)。

至此,关于拟合的问题基本已经介绍完了,大体的框架也给了,至于特征组合,还请大家自己尝试,应该会有loss更低的方法。

我在这里就不再尝试了,原文希望用2、3个特征组合来检验,我这里用了太多的特征组合了(把所有的都用了)。有些特征未必真的需要。

在这里再提一下,df.corr()的问题:

我们可以通过corr来获得r系数(相关系数),但是我们用plt.scatter()检测后,发现相关的特征之间并没有特别明显的线性关系,因此我们不能在这里通过r系数来排除某些特征。

 

至于room_per_person 之前的文章提到过,我们用log处理后还会对高于4的值设置为4,这里并没有设置,我们的源数据可能存在问题,还请各位斟酌。

 

至于某特征的处理方法,是用分箱还是布尔值(或独热编码),这个需要我们用经验判断(我没什么经验,或许用的方法并不好):

不过,我们应把每个特征的数值都限定在一个范围内,这样更容易让机器快速地学习(后面可能会讲到批标准化的问题)。

 

暂时能想到的问题就这么多,先写到这吧。

 

忘了提,这里补充一句,原文的代码会生成一个/logs 文件夹下的文件,这个文件是让我们神经网络训练过程可视化的。

使用方法为:

pip3 install tensorboard
pip3 install tensorflow.tensorboard

安装好后,在终端把目录定位到 /logs的上层文件夹后

tensorboard --logdir logs

最后结果会返回一个localhost:6006。

然后我们打开浏览器就输入 localhost:6006就可以看到图了。

你可能感兴趣的:(【学习笔记】特征组合编程练习)