学习谷歌的机器学习速成课程时,有一个作业是利用线性回归预测房价。数据集有 8 个特征,这里仅使用其中一个,起到练习作用,因此不要指望仅靠单个特征就能训练出一个好的模型。
import math
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import os
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset
# Reduce TensorFlow log output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.logging.set_verbosity(tf.logging.ERROR)  # available levels: DEBUG INFO WARN ERROR FATAL

# Keep printed DataFrames compact: at most 10 rows and 9 columns, floats with one decimal.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 9)
pd.set_option('display.float_format', '{:.1f}'.format)

# Load the data set from a local CSV file.
# (An earlier, commented-out variant read it from
# https://storage.googleapis.com/mledu-datasets/california_housing_train.csv instead.)
california_housing_dataframe = pd.read_csv("california_housing_train.csv", sep=',')

# Put the rows in random order.
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.permutation(california_housing_dataframe.index)
)

# Rescale median_house_value so that it is expressed in thousands.
california_housing_dataframe["median_house_value"] = (
    california_housing_dataframe["median_house_value"] / 1000.0
)

# To inspect the data, print california_housing_dataframe.head() and
# california_housing_dataframe.describe() here.

# Build the model.
# 1. Feature and feature column. Double brackets select a DataFrame;
#    single brackets (california_housing_dataframe['total_rooms']) would give a Series.
my_feature = california_housing_dataframe[['total_rooms']]
feature_columns = [tf.feature_column.numeric_column('total_rooms')]  # TODO (left by the original author)

# 2. Target (label).
targets = california_housing_dataframe['median_house_value']

# 3. Linear regressor trained by gradient descent, with gradients clipped by norm at 5.0.
my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(
    tf.train.GradientDescentOptimizer(learning_rate=0.0000001),
    5.0,
)
linear_regressor = tf.estimator.LinearRegressor(
    feature_columns=feature_columns,
    optimizer=my_optimizer,
)
# 4.定义输入函数
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Build the (features, labels) pair that feeds the estimator.

    :param features: feature data; anything ``dict()`` accepts as a mapping
        of feature name to values
    :param targets: labels, sliced together with the features
    :param batch_size: batch size handed to ``Dataset.batch``
    :param shuffle: when true, shuffle the dataset with a buffer of 10000
    :param num_epochs: repeat count handed to ``Dataset.repeat``
    :return: the ``(features, labels)`` pair from the dataset's one-shot iterator
    """
    # Turn every feature column into a NumPy array, keyed by feature name.
    feature_arrays = {}
    for name, column in dict(features).items():
        feature_arrays[name] = np.array(column)

    dataset = Dataset.from_tensor_slices((feature_arrays, targets))
    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat(num_epochs)
    if shuffle:
        # Applied after batch/repeat, so the shuffled elements are whole batches.
        dataset = dataset.shuffle(buffer_size=10000)

    iterator = dataset.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()
    return batch_features, batch_labels
# 5. Train the regressor for 100 steps on the single feature.
_ = linear_regressor.train(input_fn=lambda: my_input_fn(my_feature, targets), steps=100)


# 6. Predict on the training data: a single pass without shuffling, so the
#    predictions come back in the same row order as `targets`.
def prediction_input_fn():
    return my_input_fn(my_feature, targets, num_epochs=1, shuffle=False)


predictions = linear_regressor.predict(input_fn=prediction_input_fn)
# Each item looks like {'predictions': array([...])}; keep the scalar value only.
predictions = np.array([item['predictions'][0] for item in predictions])

# 7. Measure the error and compare it with the spread of the labels.
mean_squared_error = metrics.mean_squared_error(targets, predictions)
root_mean_squared_error = math.sqrt(mean_squared_error)
min_house_value = california_housing_dataframe['median_house_value'].min()
max_house_value = california_housing_dataframe['median_house_value'].max()
max_min_difference = max_house_value - min_house_value
print('Root mean squared error(on train set): %.3f' % root_mean_squared_error)
print('Max. median house value(on train set): %.3f' % max_house_value)
print('Min. median house value(on train set): %.3f' % min_house_value)
print('Difference between Min. and Max.(on train set): %.3f' % max_min_difference)
# The error is large compared with the label range. Output of one run:
# Root mean squared error(on train set): 237.417
# Max. median house value(on train set): 500.001
# Min. median house value(on train set): 14.999
# Difference between Min. and Max.(on train set): 485.002

# Calibration data: predictions next to their targets.
calibration_data = pd.DataFrame()
calibration_data['prediction'] = pd.Series(predictions)
# `targets` still carries the shuffled row labels of the DataFrame, whereas the
# predictions are indexed 0..n-1. Assigning the raw values keeps each target on
# the same row as its prediction; assigning the Series itself would re-align by
# label and pair every prediction with some other row's target.
calibration_data['targets'] = np.asarray(targets)
print(calibration_data.describe())
# Output of one run:
#        prediction  targets
# count     17000.0  17000.0
# mean          0.1    207.3
# std           0.1    116.0
# min           0.0     15.0
# 25%           0.1    119.4
# 50%           0.1    180.4
# 75%           0.2    265.0
# max           1.9    500.0

# Visualisation: 300 random rows plus the learned line drawn across their x-range.
sample = california_housing_dataframe.sample(n=300)
x_0 = sample['total_rooms'].min()
x_1 = sample['total_rooms'].max()
weight = linear_regressor.get_variable_value('linear/linear_model/total_rooms/weights')[0]
bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')
y_0 = weight * x_0 + bias
y_1 = weight * x_1 + bias
plt.plot([x_0, x_1], [y_0, y_1], c='r')
plt.xlabel('total_rooms')
plt.ylabel('median_house_value')
plt.scatter(sample['total_rooms'], sample['median_house_value'])
plt.show()
图中显示的是从数据集中随机抽取的 300 个点,红线为模型的预测结果,可以看到拟合效果很差。
每隔若干步,显示目前为止训练的结果
import math
from IPython import display
from matplotlib import cm
from matplotlib import gridspec
from matplotlib import pyplot as plt
import os
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from tensorflow.python.data import Dataset
# Reduce TensorFlow log output.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.logging.set_verbosity(tf.logging.ERROR)  # available levels: DEBUG INFO WARN ERROR FATAL

# Keep printed DataFrames compact: at most 10 rows and 9 columns, floats with one decimal.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', 9)
pd.set_option('display.float_format', '{:.1f}'.format)

# Load the data set from a local CSV file.
california_housing_dataframe = pd.read_csv("california_housing_train.csv", sep=',')

# Put the rows in random order.
california_housing_dataframe = california_housing_dataframe.reindex(
    np.random.permutation(california_housing_dataframe.index)
)

# Rescale median_house_value so that it is expressed in thousands.
california_housing_dataframe["median_house_value"] = (
    california_housing_dataframe["median_house_value"] / 1000.0
)
# 4.定义输入函数
def my_input_fn(features, targets, batch_size=1, shuffle=True, num_epochs=None):
    """Build the (features, labels) pair that feeds the estimator.

    :param features: feature data; anything ``dict()`` accepts as a mapping
        of feature name to values
    :param targets: labels, sliced together with the features
    :param batch_size: batch size handed to ``Dataset.batch``
    :param shuffle: when true, shuffle the dataset with a buffer of 10000
    :param num_epochs: repeat count handed to ``Dataset.repeat``
    :return: the ``(features, labels)`` pair from the dataset's one-shot iterator
    """
    # Turn every feature column into a NumPy array, keyed by feature name.
    feature_arrays = {}
    for name, column in dict(features).items():
        feature_arrays[name] = np.array(column)

    dataset = Dataset.from_tensor_slices((feature_arrays, targets))
    dataset = dataset.batch(batch_size)
    dataset = dataset.repeat(num_epochs)
    if shuffle:
        # Applied after batch/repeat, so the shuffled elements are whole batches.
        dataset = dataset.shuffle(buffer_size=10000)

    iterator = dataset.make_one_shot_iterator()
    batch_features, batch_labels = iterator.get_next()
    return batch_features, batch_labels
def train_model(learning_rate, steps, batch_size, input_feature='total_rooms'):
    """Train a one-feature linear regressor and report progress period by period.

    Training is split into 10 periods. After each period the RMSE on the
    training data is printed and the line learned so far is drawn over a
    scatter plot of 300 sampled rows. At the end the RMSE-per-period curve is
    plotted and summary statistics of predictions versus targets are displayed.

    :param learning_rate: learning rate of the gradient descent optimizer
    :param steps: total number of training steps, spread over the periods
    :param batch_size: batch size used by the training input function
    :param input_feature: name of the ``california_housing_dataframe`` column
        used as the only input feature
    :return: DataFrame with a ``prediction`` and a ``targets`` column, one row
        per example, holding the predictions made after the last period
    """
    periods = 10  # number of progress reports (reporting granularity)
    steps_per_periods = steps / periods  # training steps between two reports

    my_label = 'median_house_value'
    my_feature_data = california_housing_dataframe[[input_feature]]  # model input
    targets = california_housing_dataframe[my_label]  # labels

    feature_columns = [tf.feature_column.numeric_column(input_feature)]

    def training_input_fn():
        return my_input_fn(my_feature_data, targets, batch_size=batch_size)

    def prediction_input_fn():
        # One unshuffled pass, so predictions keep the row order of `targets`.
        return my_input_fn(my_feature_data, targets, num_epochs=1, shuffle=False)

    # Gradient descent with gradient clipping.
    my_optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
    my_optimizer = tf.contrib.estimator.clip_gradients_by_norm(my_optimizer, 5.0)
    linear_regressor = tf.estimator.LinearRegressor(
        feature_columns=feature_columns, optimizer=my_optimizer)

    # Left panel: sampled data, onto which one learned line per period is drawn.
    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    plt.title('Learned line by period')
    plt.ylabel(my_label)
    plt.xlabel(input_feature)
    sample = california_housing_dataframe.sample(n=300)
    plt.scatter(sample[input_feature], sample[my_label])
    colors = np.linspace(0, 1, periods)
    cmap = cm.get_cmap('hsv')

    print('Training model ...')
    print('RMSE(on training set):')
    predictions = None
    root_mean_squared_errors = []
    for period in range(periods):
        # Continue training from the previous state, then re-evaluate.
        linear_regressor.train(input_fn=training_input_fn, steps=steps_per_periods)
        predictions = linear_regressor.predict(input_fn=prediction_input_fn)
        # Each item looks like {'predictions': array([0.015675], dtype=float32)}.
        predictions = np.array([item['predictions'][0] for item in predictions])

        root_mean_squared_error = math.sqrt(metrics.mean_squared_error(targets, predictions))
        print('period %02d : %.2f' % (period, root_mean_squared_error))
        root_mean_squared_errors.append(root_mean_squared_error)

        # Draw the current line. Start from the label range [0, max label],
        # map it back to x through the model, clamp x to the sampled feature
        # range so the line stays inside the scatter plot, then recompute y.
        y_extents = np.array([0, sample[my_label].max()])
        weight = linear_regressor.get_variable_value(
            'linear/linear_model/%s/weights' % input_feature)[0]
        bias = linear_regressor.get_variable_value('linear/linear_model/bias_weights')
        x_extents = (y_extents - bias) / weight
        x_extents = np.maximum(np.minimum(x_extents, sample[input_feature].max()),
                               sample[input_feature].min())
        y_extents = weight * x_extents + bias
        plt.plot(x_extents, y_extents, color=cmap(colors[period]),
                 label='period:{}'.format(period))
    print('Model training finished.')
    plt.legend(loc='best')

    # Right panel: RMSE after each period.
    plt.subplot(1, 2, 2)
    plt.ylabel('RMSE')
    plt.xlabel('Periods')
    plt.title('Root mean squared error vs. periods')
    plt.tight_layout()
    plt.plot(root_mean_squared_errors)
    plt.show()

    calibration_data = pd.DataFrame()
    calibration_data['prediction'] = pd.Series(predictions)
    # `targets` keeps the shuffled row labels of the DataFrame while the
    # predictions are indexed 0..n-1. Assigning the raw values keeps each
    # target on the same row as its prediction; assigning the Series would
    # re-align by label and mismatch the pairs.
    calibration_data['targets'] = targets.values
    display.display(calibration_data.describe())
    print("final RMSE(on training data): % .2f" % root_mean_squared_errors[-1])
    return calibration_data
# Run the training routine with the default input feature ('total_rooms').
train_model(learning_rate=0.00002, steps=500, batch_size=5)
Training model ...
RMSE(on training set):
period 00 : 225.63
period 01 : 214.42
period 02 : 204.44
period 03 : 195.69
period 04 : 188.50
period 05 : 181.34
period 06 : 176.10
period 07 : 172.26
period 08 : 169.46
period 09 : 167.89
Model training finished.
prediction targets
count 17000.0 17000.0
mean 113.1 207.3
std 93.3 116.0
min 0.1 15.0
25% 62.6 119.4
50% 91.0 180.4
75% 134.9 265.0
max 1623.7 500.0
final RMSE(on training data): 167.89
Process finished with exit code 0
可以看到,随着训练步数的增加,模型精度越来越高,误差越来越低。
不存在一套标准的调整超参数的方法。超参数的效果取决于数据。
以下法则仅供参考。
- 训练误差应稳步减小,一开始急剧减小,最后收敛
- 训练误差未收敛,尝试运行更长时间
- 训练误差减小速度慢,尝试提高学习率,看是否能够加快减小速度
- 如果学习率提高太多,反而导致收敛速度减慢或者导致发散
如果误差震荡,尝试降低学习率
批量大小过小也可能导致训练不稳定甚至发散;可以先从较大的值开始,再逐步减小,直到性能开始下降,从而试探出不影响性能的最小批量。
# Synthetic feature: rooms per person, capped at 5 to suppress outliers.
# Series.clip is the vectorized equivalent of applying min(x, 5) to every row.
california_housing_dataframe['rooms_per_person'] = (
    california_housing_dataframe['total_rooms'] / california_housing_dataframe['population']
).clip(upper=5)
train_model(learning_rate=0.05, steps=500, batch_size=5, input_feature='rooms_per_person')
在训练函数最后添加:
# Second figure (named '02') with two side-by-side panels.
# NOTE(review): calibration_data is presumably the local built inside
# train_model, since the text says to append this there - confirm placement.
# NOTE(review): if this runs after the function's plt.show(), another
# plt.show() may be needed for the figure to appear in a script - confirm.
plt.figure('02', figsize=(15, 6))
# Left panel: predictions against targets.
plt.subplot(1, 2, 1)
plt.scatter(calibration_data['prediction'], calibration_data['targets'])
# Right panel: histogram of the rooms_per_person feature.
plt.subplot(1, 2, 2)
california_housing_dataframe['rooms_per_person'].hist()
特征在有离群值(噪声)数据下的训练结果
预测值与目标值图,理想应该是一条直线。
以及特征的直方图分布。
之前的一个离群点已经去除:
预测值与目标值图,理想应该是一条直线。
以及特征的直方图分布。