12.房价预测集成学习Stacking Learning

  • 代码地址:appke/Los-House-Prices: 洛杉矶房价预测
import numpy as np 
import pandas as pd 
# 忽略警告信息
import warnings
warnings.filterwarnings("ignore")

数据集的准备

from sklearn.model_selection import train_test_split
# Load the raw housing table; the regression target is the 'SalePrice' column.
train = pd.read_csv('datas/house_data.csv')
y = train['SalePrice']

# Remove the identifier and the target, then expand categorical columns into
# indicator columns and renumber the rows from zero.
train1 = train.drop(columns=['Id', 'SalePrice'])
X = pd.get_dummies(train1).reset_index(drop=True)

# Keep 20% of the rows aside for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)
# tmp=train.isnull().sum()
# tmp[tmp>0] 

模型测评

from sklearn.metrics import mean_squared_error
def benchmark(model,testset,label):
    """Evaluate a fitted model on held-out data and print its error metrics.

    Args:
        model: Any object exposing a ``predict`` method.
        testset: Feature matrix handed to ``model.predict``.
        label: True target values aligned with ``testset``.

    Returns:
        The root-mean-squared error between the natural logs of ``label``
        and of the predictions (printed as ``LRMSE``).
    """
    predicted = model.predict(testset)

    # Only a warning is printed for negative predictions; evaluation continues.
    # NOTE(review): np.log of a non-positive prediction is not finite, so the
    # log-scale metric below is not meaningful in that case.
    negatives = predicted[predicted < 0]
    if len(negatives) > 0:
        print('Neg Value')

    # Both metrics are computed before anything is printed.
    mse = mean_squared_error(label, predicted)
    rmse = np.sqrt(mse)
    log_mse = mean_squared_error(np.log(label), np.log(predicted))
    lrmse = np.sqrt(log_mse)

    print('RMSE:', rmse)
    print('LRMSE:', lrmse)
    return lrmse

基础模型训练

ElasticNet

from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold
# Shuffled 10-fold splitter with a fixed seed, passed as ``cv`` to the
# cross-validated estimators defined later in the file.
kfolds = KFold(n_splits=10, random_state=123, shuffle=True)

# Candidate L1/L2 mixing ratios for the elastic-net search.
e_l1ratio = [
    0.1, 0.2, 0.3, 0.4, 0.5, 0.6,
    0.7, 0.8, 0.85, 0.9, 0.95,
]

# 150 log-spaced regularization strengths from 10**-10 to 10**2.8.
e_alphas = np.logspace(start=-10, stop=2.8, num=150)
def elastic_train_test(alpha,l1ratio):
    """Fit a robust-scaled elastic net with one fixed setting and score it.

    The estimator is an ``ElasticNetCV`` whose search grid holds only the
    given ``alpha`` and ``l1ratio``. It is trained on the module-level
    ``X_train``/``y_train`` and evaluated on ``X_test``/``y_test``.

    Args:
        alpha: Regularization strength to use.
        l1ratio: L1/L2 mixing ratio to use.

    Returns:
        The value returned by ``benchmark`` for the fitted pipeline.
    """
    single_point_enet = ElasticNetCV(alphas=[alpha], l1_ratio=[l1ratio])
    pipeline = make_pipeline(RobustScaler(), single_point_enet)
    pipeline.fit(X_train, y_train)
    return benchmark(pipeline, X_test, y_test)
elastic_train_test(50,0.5) 
RMSE: 64803.88956616406
LRMSE: 0.3056812482960621

0.3056812482960621
# Let ElasticNetCV choose alpha and l1_ratio from the full candidate grids,
# with the features robust-scaled first.
elastic_model = make_pipeline(
    RobustScaler(),
    ElasticNetCV(alphas=e_alphas, l1_ratio=e_l1ratio),
)
elastic_model.fit(X_train, y_train)
benchmark(elastic_model, X_test, y_test)
RMSE: 25991.07955736571
LRMSE: 0.12567210233778722

0.12567210233778722
# Regularization strength selected by the ElasticNetCV step (second pipeline step).
elastic_model.steps[1][1].alpha_
0.3432183268134919
# L1/L2 mixing ratio selected by the ElasticNetCV step (second pipeline step).
elastic_model.steps[1][1].l1_ratio_
0.9

XGBoost训练

import xgboost as xgb
# Gradient-boosted trees: many shallow trees with a small learning rate, each
# built on a 70% sample of the rows and of the columns.
# NOTE(review): newer XGBoost releases report 'reg:linear' as deprecated in
# favour of 'reg:squarederror' — confirm against the installed version.
xg_reg = xgb.XGBRegressor(
    objective='reg:linear',
    n_estimators=3400,
    learning_rate=0.01,
    max_depth=3,
    subsample=0.7,
    colsample_bytree=0.7,
    nthread=6,
    seed=123,
)
xg_reg.fit(X_train, y_train)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=0.7, gamma=0, importance_type='gain',
             learning_rate=0.01, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=None, n_estimators=3400, n_jobs=1,
             nthread=6, objective='reg:linear', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=123, silent=True,
             subsample=0.7)
# Score the fitted XGBoost regressor on the held-out split.
benchmark(xg_reg,X_test,y_test)
RMSE: 22926.489730019464
LRMSE: 0.10024704840338212

0.10024704840338212



Stacking集成算法

底层算法

from mlxtend.regressor import StackingCVRegressor
# Candidate regularization strengths for the ridge and lasso searches.
alphas_alt = np.logspace(start=-10, stop=2.8, num=150)

# Every base learner is a pipeline that robust-scales the features first and
# shares the same K-fold splitter for its internal cross-validation.
ridge = make_pipeline(
    RobustScaler(),
    RidgeCV(alphas=alphas_alt, cv=kfolds),
)
lasso = make_pipeline(
    RobustScaler(),
    LassoCV(alphas=alphas_alt, cv=kfolds),
)
elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(alphas=e_alphas, l1_ratio=e_l1ratio, cv=kfolds),
)
# NOTE(review): newer XGBoost releases report 'reg:linear' as deprecated in
# favour of 'reg:squarederror' — confirm against the installed version.
xgboost = make_pipeline(
    RobustScaler(),
    xgb.XGBRegressor(
        objective='reg:linear',
        n_estimators=3460,
        learning_rate=0.01,
        max_depth=3,
        subsample=0.7,
        colsample_bytree=0.7,
        reg_alpha=0.00006,
        gamma=0,
        scale_pos_weight=1,
        nthread=6,
        seed=27,
    ),
)

上层算法

# Stack the four base learners under an XGBoost meta-regressor.
# use_features_in_secondary=True also feeds the original training features to
# the meta-regressor, alongside the base learners' predictions.
stack_alg = StackingCVRegressor(
    regressors=(ridge, lasso, elasticnet, xgboost),
    meta_regressor=xgboost,
    use_features_in_secondary=True,
)

# Fit on plain NumPy arrays rather than the pandas objects.
stackX = np.array(X_train)
stacky = np.array(y_train)
stack_alg.fit(stackX, stacky)

# Score the stacked ensemble on the held-out split.
benchmark(stack_alg, X_test, y_test)

你可能感兴趣的:(洛杉矶房价预测)