Author:龙箬
Data Science and Big Data Technology
Change the world with data!
CSDN@weixin_43975035
敬那大争之世,敬这小酌之时。
美国波士顿地区房价数据描述
# Bring in the Boston housing loader, the train/test splitter and NumPy.
# NOTE(review): load_boston was deprecated and later removed from newer
# scikit-learn releases — confirm the installed version still ships it.
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np

# Load the dataset bundle into `boston` and print its built-in description.
boston = load_boston()
print(boston.DESCR)

# Feature matrix and regression target taken straight from the bundle.
X, y = boston.data, boston.target

# Randomly hold out 25% of the samples for testing and keep the remaining 75%
# for training; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=33
)

# Summarise the spread of the regression target (max / min / mean).
print("The max target value is", np.max(y))
print("The min target value is", np.min(y))
print("The average target value is", np.mean(y))
运行结果如下:
The max target value is 50.0
The min target value is 5.0
The average target value is 22.532806324110677
训练与测试数据标准化处理
# StandardScaler is used to standardize both the features and the target.
from sklearn.preprocessing import StandardScaler

# One scaler per quantity so each can be inverted independently later on.
ss_X = StandardScaler()
ss_y = StandardScaler()

# Learn the feature statistics from the training split only, then apply the
# same transformation to both splits.
ss_X.fit(X_train)
X_train = ss_X.transform(X_train)
X_test = ss_X.transform(X_test)

# The scaler is fed column vectors, hence the reshape(-1, 1); after this step
# y_train and y_test are 2-D with a single column.
ss_y.fit(y_train.reshape(-1, 1))
y_train = ss_y.transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))
使用线性回归模型LinearRegression 和 SGDRegressor 分别对美国波士顿地区房价进行预测
# Both linear regressors come from sklearn.linear_model:
# LinearRegression and SGDRegressor (stochastic gradient descent).
from sklearn.linear_model import LinearRegression, SGDRegressor

# LinearRegression with default settings: estimate the parameters on the
# training split (target flattened to 1-D), then predict the test split.
lr = LinearRegression()
lr.fit(X_train, y_train.ravel())
lr_y_predict = lr.predict(X_test)

# SGDRegressor with default settings, trained and applied the same way.
sgdr = SGDRegressor()
sgdr.fit(X_train, y_train.ravel())
sgdr_y_predict = sgdr.predict(X_test)
使用三种回归评价机制以及两种调用R-squared 评价模块的方法,对模型的回归性能做出评价
# Evaluate LinearRegression with its built-in score method.
print("The value of default measurement of LinearRegression is", lr.score(X_test, y_test))

# r2_score, mean_squared_error and mean_absolute_error are the three
# regression metrics used for the rest of the evaluation.
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# R-squared computed through the metrics module (on the standardized target).
print("The value of R-squared of LinearRegression is", r2_score(y_test, lr_y_predict))

# Map the standardized values back to the original target scale before
# computing MSE / MAE. The prediction is reshaped to a single column because
# StandardScaler.inverse_transform rejects 1-D input on current scikit-learn.
lr_y_true_orig = ss_y.inverse_transform(y_test)
lr_y_pred_orig = ss_y.inverse_transform(lr_y_predict.reshape(-1, 1))

# Mean squared error on the original scale.
print("The mean squared error of LinearRegression is", mean_squared_error(lr_y_true_orig, lr_y_pred_orig))
# Mean absolute error on the original scale.
print("The mean absolute error of LinearRegression is", mean_absolute_error(lr_y_true_orig, lr_y_pred_orig))
运行结果如下:
The value of default measurement of LinearRegression is 0.675795501452948
The value of R-squared of LinearRegression is 0.675795501452948
The mean squared error of LinearRegression is 25.139236520353457
The mean absolute error of LinearRegression is 3.5325325437053983
# Evaluate SGDRegressor with its built-in score method.
print("The value of default measurement of SGDRegressor is", sgdr.score(X_test, y_test))

# r2_score, mean_squared_error and mean_absolute_error are the three
# regression metrics used for the evaluation.
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# R-squared computed through the metrics module (on the standardized target).
print("The value of R-squared of SGDRegressor is", r2_score(y_test, sgdr_y_predict))

# Map the standardized values back to the original target scale before
# computing MSE / MAE. The prediction is reshaped to a single column because
# StandardScaler.inverse_transform rejects 1-D input on current scikit-learn.
sgdr_y_true_orig = ss_y.inverse_transform(y_test)
sgdr_y_pred_orig = ss_y.inverse_transform(sgdr_y_predict.reshape(-1, 1))

# Mean squared error on the original scale.
print("The mean squared error of SGDRegressor is", mean_squared_error(sgdr_y_true_orig, sgdr_y_pred_orig))
# Mean absolute error on the original scale.
print("The mean absolute error of SGDRegressor is", mean_absolute_error(sgdr_y_true_orig, sgdr_y_pred_orig))
运行结果如下:
The value of default measurement of SGDRegressor is 0.6662431612486133
The value of R-squared of SGDRegressor is 0.6662431612486133
The mean squared error of SGDRegressor is 25.879937345899844
The mean absolute error of SGDRegressor is 3.4963271762098502
使用三种不同核函数配置的支持向量机回归模型进行训练,并且分别对测试数据做出预测
# Support vector machine (regression) model.
from sklearn.svm import SVR

# Linear kernel: fit on the training split, predict the test split.
linear_svr = SVR(kernel='linear')
linear_svr.fit(X_train, y_train.ravel())
linear_svr_y_predict = linear_svr.predict(X_test)

# Polynomial kernel: same training and prediction procedure.
poly_svr = SVR(kernel='poly')
poly_svr.fit(X_train, y_train.ravel())
poly_svr_y_predict = poly_svr.predict(X_test)

# Radial basis function kernel: same training and prediction procedure.
rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(X_train, y_train.ravel())
rbf_svr_y_predict = rbf_svr.predict(X_test)
对三种核函数配置下的支持向量机回归模型在相同测试集上进行性能评估
# Evaluate the three SVR configurations on the same test set with
# R-squared, MSE and MAE.
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# --- SVR with the linear kernel ---
# R-squared via the estimator's own score method.
print("R-squared value of linear SVR is", linear_svr.score(X_test, y_test))

# Map the standardized values back to the original target scale before
# computing MSE / MAE. The prediction is reshaped to a single column because
# StandardScaler.inverse_transform rejects 1-D input on current scikit-learn.
linear_svr_y_true_orig = ss_y.inverse_transform(y_test)
linear_svr_y_pred_orig = ss_y.inverse_transform(linear_svr_y_predict.reshape(-1, 1))

# Mean squared error on the original scale.
print("The mean squared error of linear SVR is", mean_squared_error(linear_svr_y_true_orig, linear_svr_y_pred_orig))
# Mean absolute error on the original scale.
print("The mean absolute error of linear SVR is", mean_absolute_error(linear_svr_y_true_orig, linear_svr_y_pred_orig))
运行结果如下:
R-squared value of linear SVR is 0.650659546421538
The mean squared error of linear SVR is 27.088311013556027
The mean absolute error of linear SVR is 3.4328013877599624
# --- SVR with the polynomial kernel ---
# The labels name the polynomial model explicitly; they were previously
# copy-pasted as "linear SVR", which mislabelled these results.
# R-squared via the estimator's own score method.
print("R-squared value of poly SVR is", poly_svr.score(X_test, y_test))

# Map the standardized values back to the original target scale before
# computing MSE / MAE. The prediction is reshaped to a single column because
# StandardScaler.inverse_transform rejects 1-D input on current scikit-learn.
poly_svr_y_true_orig = ss_y.inverse_transform(y_test)
poly_svr_y_pred_orig = ss_y.inverse_transform(poly_svr_y_predict.reshape(-1, 1))

# Mean squared error on the original scale.
print("The mean squared error of poly SVR is", mean_squared_error(poly_svr_y_true_orig, poly_svr_y_pred_orig))
# Mean absolute error on the original scale.
print("The mean absolute error of poly SVR is", mean_absolute_error(poly_svr_y_true_orig, poly_svr_y_pred_orig))
运行结果如下:
R-squared value of linear SVR is 0.40365065102550846
The mean squared error of linear SVR is 46.24170053103929
The mean absolute error of linear SVR is 3.73840737104651
# --- SVR with the radial basis function kernel ---
# The labels name the RBF model explicitly; they were previously copy-pasted
# as "linear SVR", which mislabelled these results.
# R-squared via the estimator's own score method.
print("R-squared value of rbf SVR is", rbf_svr.score(X_test, y_test))

# Map the standardized values back to the original target scale before
# computing MSE / MAE. The prediction is reshaped to a single column because
# StandardScaler.inverse_transform rejects 1-D input on current scikit-learn.
rbf_svr_y_true_orig = ss_y.inverse_transform(y_test)
rbf_svr_y_pred_orig = ss_y.inverse_transform(rbf_svr_y_predict.reshape(-1, 1))

# Mean squared error on the original scale.
print("The mean squared error of rbf SVR is", mean_squared_error(rbf_svr_y_true_orig, rbf_svr_y_pred_orig))
# Mean absolute error on the original scale.
print("The mean absolute error of rbf SVR is", mean_absolute_error(rbf_svr_y_true_orig, rbf_svr_y_pred_orig))
运行结果如下:
R-squared value of linear SVR is 0.7559887416340944
The mean squared error of linear SVR is 18.92094886153873
The mean absolute error of linear SVR is 2.6067819999501114
使用两种不同配置的K近邻回归模型对美国波士顿房价数据进行回归预测
# K-nearest-neighbours regressor.
from sklearn.neighbors import KNeighborsRegressor

# Uniform weighting: every neighbour contributes equally to the prediction
# (weights='uniform').
uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(X_train, y_train)
uni_knr_y_predict = uni_knr.predict(X_test)

# Distance weighting: neighbours are weighted by their distance
# (weights='distance').
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, y_train)
dis_knr_y_predict = dis_knr.predict(X_test)
对两种不同配置的K近邻回归模型在波士顿房价数据上进行预测性能的评估
# Ground truth mapped back to the original target scale, shared by both
# K-nearest-neighbours evaluations below.
knr_y_true_orig = ss_y.inverse_transform(y_test)

# Evaluate the uniform-weighted KNN model on the test set with R-squared,
# MSE and MAE. reshape(-1, 1) guarantees a single-column input for
# inverse_transform whichever shape predict returned.
# The R-squared label is spelled "KNeighbor" to match the sibling lines
# (it previously read "KNeighor").
uni_knr_y_pred_orig = ss_y.inverse_transform(uni_knr_y_predict.reshape(-1, 1))
print('R-squared value of uniform-weighted KNeighborRegression:', uni_knr.score(X_test, y_test))
print('The mean squared error of uniform-weighted KNeighborRegression:', mean_squared_error(knr_y_true_orig, uni_knr_y_pred_orig))
print('The mean absolute error of uniform-weighted KNeighborRegression:', mean_absolute_error(knr_y_true_orig, uni_knr_y_pred_orig))
print('\n')
# Evaluate the distance-weighted KNN model with the same three metrics.
dis_knr_y_pred_orig = ss_y.inverse_transform(dis_knr_y_predict.reshape(-1, 1))
print('R-squared value of distance-weighted KNeighborRegression:', dis_knr.score(X_test, y_test))
print('The mean squared error of distance-weighted KNeighborRegression:', mean_squared_error(knr_y_true_orig, dis_knr_y_pred_orig))
print('The mean absolute error of distance-weighted KNeighborRegression:', mean_absolute_error(knr_y_true_orig, dis_knr_y_pred_orig))
运行结果如下:
R-squared value of uniform-weighted KNeighorRegression: 0.6907212176346006
The mean squared error of uniform-weighted KNeighborRegression: 23.981877165354337
The mean absolute error of uniform-weighted KNeighborRegression: 2.9650393700787396
R-squared value of distance-weighted KNeighorRegression: 0.7201094821421603
The mean squared error of distance-weighted KNeighborRegression: 21.703073090490353
The mean absolute error of distance-weighted KNeighborRegression: 2.801125502210876
使用回归树对美国波士顿房价训练数据进行学习,并对测试数据进行预测
# Single regression tree from sklearn.tree.
from sklearn.tree import DecisionTreeRegressor

# Build a default-configured regression tree on the training split.
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)

# Predict the test split with that single tree and keep the result in
# dtr_y_predict.
dtr_y_predict = dtr.predict(X_test)
对单一回归树模型在美国波士顿房价测试数据上的预测性能进行评估
# Evaluate the default regression tree on the test set with R-squared, MSE
# and MAE. MSE / MAE are computed on the original target scale; the
# prediction is reshaped to a single column because
# StandardScaler.inverse_transform rejects 1-D input on current scikit-learn.
dtr_y_true_orig = ss_y.inverse_transform(y_test)
dtr_y_pred_orig = ss_y.inverse_transform(dtr_y_predict.reshape(-1, 1))
print('R-squared value of DecisionTreeRegressor:', dtr.score(X_test, y_test))
print('The mean squared error of DecisionTreeRegressor:', mean_squared_error(dtr_y_true_orig, dtr_y_pred_orig))
print('The mean absolute error of DecisionTreeRegressor:', mean_absolute_error(dtr_y_true_orig, dtr_y_pred_orig))
运行结果如下:
R-squared value of DecisionTreeRegressor: 0.6491683926437961
The mean squared error of DecisionTreeRegressor: 27.203937007874014
The mean absolute error of DecisionTreeRegressor: 3.2409448818897646
使用三种集成回归模型对美国波士顿房价训练数据进行学习,并对测试数据进行预测
# Three ensemble regressors from sklearn.ensemble: RandomForestRegressor,
# ExtraTreesRegressor and GradientBoostingRegressor.
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

# Random forest: train, then predict the test split into rfr_y_predict.
rfr = RandomForestRegressor(n_estimators=100)
rfr.fit(X_train, y_train.ravel())
rfr_y_predict = rfr.predict(X_test)

# Extremely randomized trees: train, then predict into etr_y_predict.
# The prediction must come from `etr` itself; it previously called
# rfr.predict, so the ExtraTrees MSE/MAE silently duplicated the random
# forest's numbers.
etr = ExtraTreesRegressor(n_estimators=100)
etr.fit(X_train, y_train.ravel())
etr_y_predict = etr.predict(X_test)

# Gradient boosting: train, then predict into gbr_y_predict.
# Same fix as above: predict with `gbr`, not `rfr`.
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train.ravel())
gbr_y_predict = gbr.predict(X_test)
对三种集成回归模型在美国波士顿房价测试数据上的回归预测性能进行评估
# Evaluate the random forest regressor on the test set with R-squared, MSE
# and MAE. MSE / MAE are computed on the original target scale; the
# prediction is reshaped to a single column because
# StandardScaler.inverse_transform rejects 1-D input on current scikit-learn.
rfr_y_true_orig = ss_y.inverse_transform(y_test)
rfr_y_pred_orig = ss_y.inverse_transform(rfr_y_predict.reshape(-1, 1))
print('R-squared value of RandomForestRegressor:', rfr.score(X_test, y_test))
print('The mean squared error of RandomForestRegressor:', mean_squared_error(rfr_y_true_orig, rfr_y_pred_orig))
print('The mean absolute error of RandomForestRegressor:', mean_absolute_error(rfr_y_true_orig, rfr_y_pred_orig))
运行结果如下:
R-squared value of RandomForestRegressor: 0.8403126589586196
The mean squared error of RandomForestRegressor: 12.382363149606311
The mean absolute error of RandomForestRegressor: 2.2978740157480315
# Evaluate the extremely randomized trees regressor on the test set with
# R-squared, MSE and MAE. MSE / MAE are computed on the original target
# scale; the prediction is reshaped to a single column because
# StandardScaler.inverse_transform rejects 1-D input on current scikit-learn.
etr_y_true_orig = ss_y.inverse_transform(y_test)
etr_y_pred_orig = ss_y.inverse_transform(etr_y_predict.reshape(-1, 1))
print('R-squared value of ExtraTreesRegressor:', etr.score(X_test, y_test))
print('The mean squared error of ExtraTreesRegressor:', mean_squared_error(etr_y_true_orig, etr_y_pred_orig))
print('The mean absolute error of ExtraTreesRegressor:', mean_absolute_error(etr_y_true_orig, etr_y_pred_orig))

# Print each feature's importance in the trained ExtraTrees model, ordered by
# ascending importance. The (importance, name) pairs are sorted as units:
# np.sort(..., axis=0) on the paired array sorted the two columns
# independently (and compared the numbers as strings), which attached
# importances to the wrong feature names.
etr_importance_pairs = sorted(
    zip(etr.feature_importances_, boston.feature_names),
    key=lambda pair: pair[0],
)
print(np.array(etr_importance_pairs))
运行结果如下:
R-squared value of ExtraTreesRegressor: 0.8074220003942867
The mean squared error of ExtraTreesRegressor: 12.382363149606311
The mean absolute error of ExtraTreesRegressor: 2.2978740157480315
[['0.0037140302386605003' 'AGE']
['0.01498325758119106' 'B']
['0.01631410124524496' 'CHAS']
['0.01816680723408173' 'CRIM']
['0.020351868409311566' 'DIS']
['0.02549299458236654' 'INDUS']
['0.02698827496023488' 'LSTAT']
['0.028856959852286382' 'NOX']
['0.03220974821429572' 'PTRATIO']
['0.05106954403895167' 'RAD']
['0.05664326701977359' 'RM']
['0.33153492025307013' 'TAX']
['0.37367422637053144' 'ZN']]
# Evaluate the gradient boosting regressor on the test set with R-squared,
# MSE and MAE. MSE / MAE are computed on the original target scale; the
# prediction is reshaped to a single column because
# StandardScaler.inverse_transform rejects 1-D input on current scikit-learn.
gbr_y_true_orig = ss_y.inverse_transform(y_test)
gbr_y_pred_orig = ss_y.inverse_transform(gbr_y_predict.reshape(-1, 1))
print('R-squared value of GradientBoostingRegressor:', gbr.score(X_test, y_test))
print('The mean squared error of GradientBoostingRegressor:', mean_squared_error(gbr_y_true_orig, gbr_y_pred_orig))
print('The mean absolute error of GradientBoostingRegressor:', mean_absolute_error(gbr_y_true_orig, gbr_y_pred_orig))
运行结果如下:
R-squared value of GradientBoostingRegressor: 0.8410763759525907
The mean squared error of GradientBoostingRegressor: 12.382363149606311
The mean absolute error of GradientBoostingRegressor: 2.2978740157480315
参考致谢:
范淼,李超.Python机器学习及实践——从零开始通往Kaggle竞赛之路
如有侵权,请联系侵删
需要本实验源数据及代码的小伙伴请联系QQ:2225872659