- 代码地址:appke/Los-House-Prices: 洛杉矶房价预测
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
数据集的准备
from sklearn.model_selection import train_test_split
# Load the raw table; the target column is 'SalePrice'.
train = pd.read_csv('datas/house_data.csv')
y = train['SalePrice']

# Features are everything except the row identifier and the target.
train1 = train.drop(columns=['Id', 'SalePrice'])

# One-hot encode the non-numeric columns and renumber rows from 0.
X = pd.get_dummies(train1).reset_index(drop=True)

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)
模型测评
from sklearn.metrics import mean_squared_error
def benchmark(model, testset, label):
    """Score a fitted regressor on a hold-out set and print the metrics.

    Two numbers are printed: ``RMSE`` (root mean squared error) and
    ``LRMSE`` (the same error computed on the natural logarithms of the
    targets and the predictions).

    Args:
        model: Fitted estimator exposing ``predict``.
        testset: Feature matrix handed to ``model.predict``.
        label: True target values aligned with ``testset``.

    Returns:
        The LRMSE. When any prediction is zero or negative the logarithm is
        undefined, so ``nan`` is returned instead of letting the metric
        fail on nan/-inf input.
    """
    pred = model.predict(testset)

    # Keep the original diagnostic for strictly negative predictions.
    if np.any(pred < 0):
        print('Neg Value')

    rmse = np.sqrt(mean_squared_error(label, pred))

    # np.log gives nan for negatives and -inf for zero; skip the log metric
    # in that case rather than passing non-finite values to the scorer.
    if np.any(pred <= 0):
        lrmse = float('nan')
    else:
        lrmse = np.sqrt(mean_squared_error(np.log(label), np.log(pred)))

    print('RMSE:', rmse)
    print('LRMSE:', lrmse)
    return lrmse
基础模型训练
ElasticNet
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
# Shuffled 10-fold splitter with a fixed seed, shared by the CV estimators below.
kfolds = KFold(n_splits=10, shuffle=True, random_state=123)

# Candidate l1_ratio values searched by ElasticNetCV.
e_l1ratio = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95]

# 150 log-spaced regularisation strengths from 10**-10 up to 10**2.8.
e_alphas = np.logspace(start=-10, stop=2.8, num=150)
def elastic_train_test(alpha, l1ratio):
    """Fit an ElasticNet pipeline for one (alpha, l1_ratio) pair and score it.

    The pipeline is ``RobustScaler`` followed by ``ElasticNetCV`` restricted
    to the single given ``alpha`` and ``l1ratio``. It is fitted on the
    module-level ``X_train``/``y_train`` and evaluated with ``benchmark`` on
    ``X_test``/``y_test``.

    Returns:
        The value returned by ``benchmark`` (the LRMSE).
    """
    regressor = ElasticNetCV(alphas=[alpha], l1_ratio=[l1ratio])
    pipeline = make_pipeline(RobustScaler(), regressor)
    pipeline.fit(X_train, y_train)
    return benchmark(pipeline, X_test, y_test)
# Single hand-picked setting as a baseline before the grid search.
elastic_train_test(alpha=50, l1ratio=0.5)
RMSE: 64803.88956616406
LRMSE: 0.3056812482960621
0.3056812482960621
# Search the full alpha / l1_ratio grid with ElasticNetCV's own cross-validation.
elastic_model = make_pipeline(
    RobustScaler(),
    ElasticNetCV(alphas=e_alphas, l1_ratio=e_l1ratio),
)
elastic_model.fit(X_train, y_train)
benchmark(elastic_model, X_test, y_test)
RMSE: 25991.07955736571
LRMSE: 0.12567210233778722
0.12567210233778722
# Regularisation strength selected by the fitted ElasticNetCV step.
elastic_model.named_steps['elasticnetcv'].alpha_
0.3432183268134919
# l1_ratio selected by the fitted ElasticNetCV step.
elastic_model.named_steps['elasticnetcv'].l1_ratio_
0.9
XGBoost训练
import xgboost as xgb
# Gradient-boosted trees: many shallow trees with a small learning rate and
# row/column subsampling.
# NOTE(review): newer XGBoost releases name this objective
# 'reg:squarederror' - confirm against the installed version before changing.
xg_reg = xgb.XGBRegressor(
    objective='reg:linear',
    colsample_bytree=0.7,
    learning_rate=0.01,
    max_depth=3,
    n_estimators=3400,
    subsample=0.7,
    nthread=6,
    seed=123,
)
xg_reg.fit(X_train, y_train)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=0.7, gamma=0, importance_type='gain',
learning_rate=0.01, max_delta_step=0, max_depth=3,
min_child_weight=1, missing=None, n_estimators=3400, n_jobs=1,
nthread=6, objective='reg:linear', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, seed=123, silent=True,
subsample=0.7)
# Score the boosted model on the hold-out split.
benchmark(model=xg_reg, testset=X_test, label=y_test)
RMSE: 22926.489730019464
LRMSE: 0.10024704840338212
0.10024704840338212
Stacking集成算法
底层算法
from mlxtend.regressor import StackingCVRegressor
# Penalty grid shared by the Ridge and Lasso base learners.
alphas_alt = np.logspace(start=-10, stop=2.8, num=150)

# Every base learner is a RobustScaler followed by a regressor; the linear
# models all cross-validate with the shared `kfolds` splitter.
ridge = make_pipeline(
    RobustScaler(),
    RidgeCV(alphas=alphas_alt, cv=kfolds),
)
lasso = make_pipeline(
    RobustScaler(),
    LassoCV(alphas=alphas_alt, cv=kfolds),
)
elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio),
)
xgboost = make_pipeline(
    RobustScaler(),
    xgb.XGBRegressor(
        objective='reg:linear',
        colsample_bytree=0.7,
        learning_rate=0.01,
        max_depth=3,
        n_estimators=3460,
        subsample=0.7,
        reg_alpha=0.00006,
        gamma=0,
        nthread=6,
        scale_pos_weight=1,
        seed=27,
    ),
)
上层算法
# Stack the four base pipelines. The XGBoost pipeline is also the
# meta-regressor, and use_features_in_secondary=True is passed so the
# second level is configured to use the original features as well.
stack_alg = StackingCVRegressor(
    regressors=(ridge, lasso, elasticnet, xgboost),
    meta_regressor=xgboost,
    use_features_in_secondary=True,
)

# The stacker is fitted on plain NumPy arrays rather than pandas objects.
stackX = np.array(X_train)
stacky = np.array(y_train)
stack_alg.fit(stackX, stacky)

# Score the stacked ensemble on the hold-out split. A stray bare `p` that
# preceded this call was an undefined name and has been removed.
benchmark(stack_alg, X_test, y_test)