学习笔记:用线性回归和随机梯度下降预测波士顿房价

# -*- coding: utf-8 -*-
"""
Created on Tue May 15 17:23:42 2018

@author: eagle
"""
# =============================================================================
# 线性回归预测波士顿房价
# =============================================================================

import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression  # ordinary least-squares linear regression
from sklearn.linear_model import SGDRegressor   # linear model fitted by stochastic gradient descent
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
from sklearn.preprocessing import StandardScaler

try:
    # train_test_split lives in model_selection since scikit-learn 0.18
    from sklearn.model_selection import train_test_split
except ImportError:
    # older releases only ship the original cross_validation module
    from sklearn.cross_validation import train_test_split

# Load the Boston housing data set.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this script needs a release that still ships it.
boston = load_boston()      # 506 samples in total
print(boston.DESCR)
X = boston.data
y = boston.target


# Hold out 25% of the samples for testing; the fixed seed keeps the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25,random_state = 33)
print('The max target value is',np.max(boston.target))  # maximum
print('The min target value is',np.min(boston.target))  # minimum
print('The average target value is',np.mean(boston.target))  # mean

# Standardize features and target with separate scalers, fitted on the
# training split only so no test-set statistics leak into training.
ss_X = StandardScaler()
ss_y = StandardScaler()

X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)
# StandardScaler works on 2-D input, so the 1-D targets become column vectors.
y_train = ss_y.fit_transform(y_train.reshape(-1,1))
y_test = ss_y.transform(y_test.reshape(-1,1))

# Fit and predict with the closed-form LinearRegression ...
lr=LinearRegression()
lr.fit(X_train,y_train)
lr_y_predict = lr.predict(X_test)
# ... and with SGDRegressor. It expects a 1-D target of shape (n_samples,);
# passing the column vector would trigger a DataConversionWarning, so flatten it.
sgdr=SGDRegressor()
sgdr.fit(X_train,y_train.ravel())
sgdr_y_predict = sgdr.predict(X_test)

# Map targets and predictions back to the original price scale once, so the
# error metrics below are reported in the data set's own units.
y_test_orig = ss_y.inverse_transform(y_test)
lr_pred_orig = ss_y.inverse_transform(lr_y_predict)
# SGDRegressor returns a 1-D array; give the scaler the same 2-D column
# layout it was fitted on instead of relying on broadcasting.
sgdr_pred_orig = ss_y.inverse_transform(sgdr_y_predict.reshape(-1,1))

# Evaluate LinearRegression (score() and r2_score both report R^2).
print('The scroe of LinearRegression is:',lr.score(X_test,y_test))
print('The r2-score of LinearRegression is:',r2_score(y_test,lr_y_predict))
print('The mean_squared_error of LinearRegression is',mean_squared_error(y_test_orig,lr_pred_orig))
print('The mean_absolute_error of LinearRegression is',mean_absolute_error(y_test_orig,lr_pred_orig))

# Evaluate SGDRegressor with the same metrics.
print('The scroe of SGDRegression is:',sgdr.score(X_test,y_test))
print('The r2-score of SGDRegression is:',r2_score(y_test,sgdr_y_predict))
print('The mean_squared_error of SGDRegression is',mean_squared_error(y_test_orig,sgdr_pred_orig))
print('The mean_absolute_error of SGDRegression is',mean_absolute_error(y_test_orig,sgdr_pred_orig))

代码如上。与书中不同的是,需要先用 reshape(-1,1) 把一维的目标值数组转换成二维的列向量,再进行标准化,否则会报错:

y_train = ss_y.fit_transform(y_train.reshape(-1,1))
y_test = ss_y.transform(y_test.reshape(-1,1))
运行结果表明:使用随机梯度下降方法估计参数的性能不及解析方法的线性回归,但是在数据规模庞大的情况下,随机梯度法会非常高效。(Scikit-learn 官网建议:数据规模超过 10 万时,推荐使用随机梯度法估计模型参数。)

你可能感兴趣的:(机器学习)