[一周算法进阶]--任务三-模型融合

Task3.模型融合

用你目前评分最高的模型作为基准模型,和其他模型进行stacking融合,得到最终模型及评分结果。

1.导入相关包&读取数据

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer,OneHotEncoder,Imputer

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Load the raw table once and keep it untouched; the preprocessing below
# works on a separate copy.
data_original = pd.read_csv('data.csv', skipinitialspace=True)
data = data_original.copy(deep=True)
# Preview the first five rows (head() defaults to n=5).
data.head()
Unnamed: 0 custid trade_no bank_card_no low_volume_percent middle_volume_percent take_amount_in_later_12_month_highest trans_amount_increase_rate_lately trans_activity_month trans_activity_day ... loans_max_limit loans_avg_limit consfin_credit_limit consfin_credibility consfin_org_count_current consfin_product_count consfin_max_limit consfin_avg_limit latest_query_day loans_latest_day
0 5 2791858 20180507115231274000000023057383 卡号1 0.01 0.99 0 0.90 0.55 0.313 ... 2900.0 1688.0 1200.0 75.0 1.0 2.0 1200.0 1200.0 12.0 18.0
1 10 534047 20180507121002192000000023073000 卡号1 0.02 0.94 2000 1.28 1.00 0.458 ... 3500.0 1758.0 15100.0 80.0 5.0 6.0 22800.0 9360.0 4.0 2.0
2 12 2849787 20180507125159718000000023114911 卡号1 0.04 0.96 0 1.00 1.00 0.114 ... 1600.0 1250.0 4200.0 87.0 1.0 1.0 4200.0 4200.0 2.0 6.0
3 13 1809708 20180507121358683000000388283484 卡号1 0.00 0.96 2000 0.13 0.57 0.777 ... 3200.0 1541.0 16300.0 80.0 5.0 5.0 30000.0 12180.0 2.0 4.0
4 14 2499829 20180507115448545000000388205844 卡号1 0.01 0.99 0 0.46 1.00 0.175 ... 2300.0 1630.0 8300.0 79.0 2.0 2.0 8400.0 8250.0 22.0 120.0

5 rows × 90 columns

# Lift pandas' display truncation so every column and row is shown.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

2. 删除无关特征

# Discard the columns this analysis treats as irrelevant features.
data.drop(
    columns=['Unnamed: 0', 'custid', 'trade_no', 'bank_card_no', 'source', 'id_name'],
    inplace=True,
)
# Split what is left by dtype: object ('O') columns on one side, all other
# columns on the other, so each group can be preprocessed separately.
object_cols = [name for name, dtype in data.dtypes.items() if dtype == 'O']
data_obj = data[object_cols]
data_num = data.drop(columns=object_cols)

3.缺失值填充和热编码

# Missing-value imputation
# Non-object columns: replace missing entries with the column mean.
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in
# scikit-learn 0.22 (replacement: sklearn.impute.SimpleImputer); migrating
# also requires changing the import at the top of the file.
imputer = Imputer(strategy='mean')
mean_num = imputer.fit_transform(data_num)
# fit_transform returns a bare array, so re-attach the column names.
data_num = pd.DataFrame(mean_num, columns=data_num.columns)
# Object columns: carry the previous row's value forward into each gap.
data_obj = data_obj.ffill()

# One-hot encoding of 'reg_preference_for_trad'
encoder = LabelBinarizer()
reg_preference_1hot = encoder.fit_transform(data_obj[['reg_preference_for_trad']])
# Name each indicator column after the class it represents.
reg_preference_df = pd.DataFrame(reg_preference_1hot, columns=encoder.classes_)
# Swap the original categorical column for its indicator columns.
data_obj = pd.concat(
    [data_obj.drop(columns=['reg_preference_for_trad']), reg_preference_df],
    axis=1,
)

# Date columns 'latest_query_time' and 'loans_latest_time': derive a month
# and a weekday feature from each, then discard the raw columns.
time_cols = ('latest_query_time', 'loans_latest_time')
for time_col in time_cols:
    parsed = pd.to_datetime(data_obj[time_col])
    data_obj[f'{time_col}_month'] = parsed.dt.month
    data_obj[f'{time_col}_weekday'] = parsed.dt.weekday

data_obj = data_obj.drop(columns=list(time_cols))

# Re-assemble the full table from the two preprocessed halves.
data = pd.concat([data_num, data_obj], axis=1)
data.shape

(4754, 90)
# Train/test split
from sklearn.model_selection import train_test_split

# 'status' is the target; every other column is a feature.
X = data.drop(columns=['status'])
y = data['status']
# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2018
)

5.IV值与RF特征选择

# Hard-coded feature subset applied identically to both splits.
# NOTE(review): presumably the outcome of the IV / random-forest selection
# named in the section heading; the selection itself is not shown here.
selected_features = [
    'trans_amount_increase_rate_lately',
    'trans_activity_day',
    'first_transaction_time',
    'historical_trans_amount',
    'historical_trans_day',
    'rank_trad_1_month',
    'trans_amount_3_month',
    'top_trans_count_last_1_month',
    'trans_top_time_last_1_month',
    'consume_top_time_last_1_month',
    'trans_fail_top_count_enum_last_1_month',
    'trans_fail_top_count_enum_last_6_month',
    'trans_fail_top_count_enum_last_12_month',
    'max_cumulative_consume_later_1_month',
    'first_transaction_day',
    'trans_day_last_12_month',
    'apply_score',
    'loans_score',
    'loans_count',
    'loans_settle_count',
    'loans_overdue_count',
    'latest_three_month_loan',
    'history_suc_fee',
    'history_fail_fee',
    'latest_one_month_suc',
    'latest_one_month_fail',
    'consfin_credit_limit',
    'consfin_avg_limit',
    'latest_query_day',
    'loans_latest_day',
]
X_train = X_train[selected_features]
X_test = X_test[selected_features]
X_train.head()
trans_amount_increase_rate_lately trans_activity_day first_transaction_time historical_trans_amount historical_trans_day rank_trad_1_month trans_amount_3_month top_trans_count_last_1_month trans_top_time_last_1_month consume_top_time_last_1_month trans_fail_top_count_enum_last_1_month trans_fail_top_count_enum_last_6_month trans_fail_top_count_enum_last_12_month max_cumulative_consume_later_1_month first_transaction_day trans_day_last_12_month apply_score loans_score loans_count loans_settle_count loans_overdue_count latest_three_month_loan history_suc_fee history_fail_fee latest_one_month_suc latest_one_month_fail consfin_credit_limit consfin_avg_limit latest_query_day loans_latest_day
110 0.96 0.405 20170217.0 181770.0 150.0 0.85 15610.0 1.00 0.0 0.0 6.0 9.0 9.0 220.0 458.0 99.0 535.0 498.0 92.0 77.0 7.0 3.0 85.0 52.0 0.0 3.0 10600.0 8228.0 0.0 9.0
3394 0.87 0.205 20170331.0 63350.0 74.0 0.65 12200.0 0.40 14.0 14.0 1.0 4.0 9.0 470.0 416.0 82.0 540.0 510.0 19.0 16.0 3.0 1.0 22.0 11.0 1.0 0.0 16300.0 7160.0 30.0 27.0
3052 1.98 0.205 20141110.0 97190.0 93.0 0.45 33280.0 0.30 11.0 11.0 0.0 4.0 21.0 1950.0 1288.0 82.0 516.0 482.0 16.0 16.0 2.0 0.0 20.0 5.0 0.0 0.0 10400.0 10320.0 3.0 137.0
490 1.49 0.555 20130817.0 373700.0 356.0 0.30 61940.0 0.10 15.0 15.0 8.0 8.0 8.0 3090.0 1738.0 82.0 491.0 448.0 40.0 22.0 7.0 3.0 40.0 78.0 0.0 10.0 6600.0 6418.0 20.0 51.0
1 1.28 0.458 20160402.0 302910.0 224.0 0.35 10590.0 0.05 13.0 13.0 0.0 3.0 3.0 2100.0 779.0 84.0 653.0 635.0 37.0 36.0 0.0 2.0 49.0 4.0 2.0 1.0 15100.0 9360.0 4.0 2.0

6.stacking模型融合

# Feature standardization
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same fitted
# transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,roc_curve
from mlxtend.classifier import StackingClassifier

# --- Individual models -------------------------------------------------------
# The solver is pinned to 'liblinear' because the L1 penalty is not supported
# by the 'lbfgs' solver that newer scikit-learn releases use by default;
# 'liblinear' was the default in older releases, so results there are unchanged.
lr_model = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')
# probability=True is required so that predict_proba is available for AUC.
svm_model = svm.SVC(C=0.01, kernel='linear', probability=True)
dt_model = DecisionTreeClassifier(max_depth=5, min_samples_split=50, min_samples_leaf=60,
                                  max_features=9, random_state=2333)
xgb_model = XGBClassifier(learning_rate=0.1, n_estimators=80, max_depth=3, min_child_weight=5,
                          gamma=0.2, subsample=0.8, colsample_bytree=0.8, reg_alpha=1e-5,
                          objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
# NOTE(review): `gamma` is an XGBoost parameter name; confirm LightGBM honours
# it (LightGBM's own name for the minimum split gain is `min_split_gain`).
lgbm_model = LGBMClassifier(learning_rate=0.1, n_estimators=100, max_depth=3, min_child_weight=11,
                            gamma=0.1, subsample=0.5, colsample_bytree=0.9, reg_alpha=1e-5,
                            nthread=4, scale_pos_weight=1, seed=27)
gbdt_model = GradientBoostingClassifier(n_estimators=100)
rf_model = RandomForestClassifier(class_weight='balanced', n_estimators=100)

# --- Stacked ensemble --------------------------------------------------------
# LightGBM, GBDT and random forest are the base learners; their class
# probabilities (use_probas=True, not averaged) are the inputs of the
# logistic-regression meta-classifier.
# Fix: the original passed the undefined name `lgb_model` here, which raises
# NameError; the LightGBM model defined above is `lgbm_model`.
sclf_model = StackingClassifier(classifiers=[lgbm_model, gbdt_model, rf_model],
                                use_probas=True,
                                average_probas=False,
                                meta_classifier=lr_model)

# Display name -> estimator, in the order the models are trained and reported.
# rf_model is used only as a stacking base learner and is not listed on its own.
models = {'LR': lr_model, 'SVM': svm_model, 'DT': dt_model, 'GBDT': gbdt_model,
          'XGBoost': xgb_model, 'LGBM': lgbm_model, 'Stack': sclf_model}

---------------------------------------------------------------------------

ModuleNotFoundError                       Traceback (most recent call last)

 in ()
     12 from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
     13 from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,roc_curve
---> 14 from mlxtend.classifier import StackingClassifier
     15 
     16 lr_model = LogisticRegression(C = 0.1, penalty = 'l1')


ModuleNotFoundError: No module named 'mlxtend'
# Empty results table; the evaluation loop appends one row per model.
df_result = pd.DataFrame(
    columns=['model', 'accuracy', 'precision', 'recall', 'f1_score', 'auc']
)
# Label of the next row to write into df_result.
row = 0
# Evaluation helper
def evaluate(y_pre,y):
    """Score predicted labels against the true labels.

    Args:
        y_pre: predicted labels.
        y: true labels.

    Returns:
        A 4-tuple ``(accuracy, precision, recall, f1)``.
    """
    metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
    # Each metric is called as metric(y, y_pre): true labels first, which is
    # the reverse of this helper's own argument order.
    return tuple(metric(y, y_pre) for metric in metric_functions)

# Fit every model on the training split, score it on the test split and
# record one row of metrics per model.
for name, model in models.items():
    print(name,'start training...')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC AUC.
    positive_scores = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, positive_scores)
    # evaluate() returns (accuracy, precision, recall, f1) in table order.
    df_result.loc[row] = [name, *evaluate(y_pred, y_test), auc]
    row += 1
print(df_result)

ubuntu16不知怎么了,使用pip安装mlxtend始终报错,先把程序写上,明天继续安装。

你可能感兴趣的:(#,数据挖掘比赛整理)