用你目前评分最高的模型作为基准模型,和其他模型进行stacking融合,得到最终模型及评分结果。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer,OneHotEncoder,Imputer
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Load the raw table; spaces that follow a delimiter are skipped while parsing.
data_original = pd.read_csv('data.csv', skipinitialspace=True)
# Work on a copy so the frame as loaded stays available unchanged.
data = data_original.copy()
# Preview the first five rows (the default row count of head()).
data.head()
Unnamed: 0 | custid | trade_no | bank_card_no | low_volume_percent | middle_volume_percent | take_amount_in_later_12_month_highest | trans_amount_increase_rate_lately | trans_activity_month | trans_activity_day | ... | loans_max_limit | loans_avg_limit | consfin_credit_limit | consfin_credibility | consfin_org_count_current | consfin_product_count | consfin_max_limit | consfin_avg_limit | latest_query_day | loans_latest_day | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5 | 2791858 | 20180507115231274000000023057383 | 卡号1 | 0.01 | 0.99 | 0 | 0.90 | 0.55 | 0.313 | ... | 2900.0 | 1688.0 | 1200.0 | 75.0 | 1.0 | 2.0 | 1200.0 | 1200.0 | 12.0 | 18.0 |
1 | 10 | 534047 | 20180507121002192000000023073000 | 卡号1 | 0.02 | 0.94 | 2000 | 1.28 | 1.00 | 0.458 | ... | 3500.0 | 1758.0 | 15100.0 | 80.0 | 5.0 | 6.0 | 22800.0 | 9360.0 | 4.0 | 2.0 |
2 | 12 | 2849787 | 20180507125159718000000023114911 | 卡号1 | 0.04 | 0.96 | 0 | 1.00 | 1.00 | 0.114 | ... | 1600.0 | 1250.0 | 4200.0 | 87.0 | 1.0 | 1.0 | 4200.0 | 4200.0 | 2.0 | 6.0 |
3 | 13 | 1809708 | 20180507121358683000000388283484 | 卡号1 | 0.00 | 0.96 | 2000 | 0.13 | 0.57 | 0.777 | ... | 3200.0 | 1541.0 | 16300.0 | 80.0 | 5.0 | 5.0 | 30000.0 | 12180.0 | 2.0 | 4.0 |
4 | 14 | 2499829 | 20180507115448545000000388205844 | 卡号1 | 0.01 | 0.99 | 0 | 0.46 | 1.00 | 0.175 | ... | 2300.0 | 1630.0 | 8300.0 | 79.0 | 2.0 | 2.0 | 8400.0 | 8250.0 | 22.0 | 120.0 |
5 rows × 90 columns
# Lift the display limits so frames are shown with every column and row.
for option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option, None)

# Columns removed before modelling (identifiers and similar tags, judging by
# their names -- TODO confirm none of them is meant to be a feature).
dropped_cols = ['Unnamed: 0', 'custid', 'trade_no', 'bank_card_no', 'source', 'id_name']
data.drop(dropped_cols, axis=1, inplace=True)

# Separate object-dtype columns from the rest so each group can be
# preprocessed on its own.
is_object = data.dtypes == 'O'
object_cols = data.columns[is_object].tolist()
data_obj = data[object_cols]
data_num = data.drop(object_cols, axis=1)
# Fill missing values.
# Non-object columns: replace each missing entry with the column mean.
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in
# scikit-learn 0.22 (successor: sklearn.impute.SimpleImputer) -- confirm the
# installed version still provides it.
imputer = Imputer(strategy='mean')
mean_num = imputer.fit_transform(data_num)
data_num = pd.DataFrame(data=mean_num, columns=data_num.columns)
# Object columns: forward-fill, i.e. reuse the last valid value seen above.
# A gap in the very first row has nothing above it and stays missing.
data_obj = data_obj.ffill()
# One-hot encode the `reg_preference_for_trad` column.
encoder = LabelBinarizer()
reg_preference_1hot = encoder.fit_transform(data_obj[['reg_preference_for_trad']])
# Indicator columns are named after the classes the encoder discovered.
# NOTE(review): LabelBinarizer emits a single column for two-class input, so
# this naming presumably relies on the column having three or more distinct
# values -- confirm against the data.
reg_preference_df = pd.DataFrame(reg_preference_1hot, columns=encoder.classes_)
# Swap the raw column for its indicator columns.
data_obj = pd.concat(
    [data_obj.drop(['reg_preference_for_trad'], axis=1), reg_preference_df],
    axis=1,
)
# Derive calendar features from the two timestamp columns
# ('latest_query_time' and 'loans_latest_time'), then discard the raw values.
time_cols = ['latest_query_time', 'loans_latest_time']
for time_col in time_cols:
    parsed = pd.to_datetime(data_obj[time_col])
    data_obj[time_col + '_month'] = parsed.dt.month
    data_obj[time_col + '_weekday'] = parsed.dt.weekday
data_obj = data_obj.drop(time_cols, axis=1)

# Put the imputed numeric columns and the processed object columns back
# side by side in one frame.
data = pd.concat([data_num, data_obj], axis=1)
data.shape
(4754, 90)
# Split the data set: `status` is the target, every other column a feature.
from sklearn.model_selection import train_test_split
X = data.drop('status', axis=1)
y = data['status']
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2018)
# Feature subset used for modelling. The list is defined once and applied to
# both splits so the training and test matrices always share the same columns
# in the same order.
# NOTE(review): presumably the outcome of an earlier feature-selection step --
# confirm where this list comes from.
selected_features = [
    'trans_amount_increase_rate_lately', 'trans_activity_day',
    'first_transaction_time', 'historical_trans_amount',
    'historical_trans_day', 'rank_trad_1_month', 'trans_amount_3_month',
    'top_trans_count_last_1_month', 'trans_top_time_last_1_month',
    'consume_top_time_last_1_month',
    'trans_fail_top_count_enum_last_1_month',
    'trans_fail_top_count_enum_last_6_month',
    'trans_fail_top_count_enum_last_12_month',
    'max_cumulative_consume_later_1_month', 'first_transaction_day',
    'trans_day_last_12_month', 'apply_score', 'loans_score', 'loans_count',
    'loans_settle_count', 'loans_overdue_count', 'latest_three_month_loan',
    'history_suc_fee', 'history_fail_fee', 'latest_one_month_suc',
    'latest_one_month_fail', 'consfin_credit_limit', 'consfin_avg_limit',
    'latest_query_day', 'loans_latest_day',
]
X_train = X_train[selected_features]
X_test = X_test[selected_features]
X_train.head(5)
trans_amount_increase_rate_lately | trans_activity_day | first_transaction_time | historical_trans_amount | historical_trans_day | rank_trad_1_month | trans_amount_3_month | top_trans_count_last_1_month | trans_top_time_last_1_month | consume_top_time_last_1_month | trans_fail_top_count_enum_last_1_month | trans_fail_top_count_enum_last_6_month | trans_fail_top_count_enum_last_12_month | max_cumulative_consume_later_1_month | first_transaction_day | trans_day_last_12_month | apply_score | loans_score | loans_count | loans_settle_count | loans_overdue_count | latest_three_month_loan | history_suc_fee | history_fail_fee | latest_one_month_suc | latest_one_month_fail | consfin_credit_limit | consfin_avg_limit | latest_query_day | loans_latest_day | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
110 | 0.96 | 0.405 | 20170217.0 | 181770.0 | 150.0 | 0.85 | 15610.0 | 1.00 | 0.0 | 0.0 | 6.0 | 9.0 | 9.0 | 220.0 | 458.0 | 99.0 | 535.0 | 498.0 | 92.0 | 77.0 | 7.0 | 3.0 | 85.0 | 52.0 | 0.0 | 3.0 | 10600.0 | 8228.0 | 0.0 | 9.0 |
3394 | 0.87 | 0.205 | 20170331.0 | 63350.0 | 74.0 | 0.65 | 12200.0 | 0.40 | 14.0 | 14.0 | 1.0 | 4.0 | 9.0 | 470.0 | 416.0 | 82.0 | 540.0 | 510.0 | 19.0 | 16.0 | 3.0 | 1.0 | 22.0 | 11.0 | 1.0 | 0.0 | 16300.0 | 7160.0 | 30.0 | 27.0 |
3052 | 1.98 | 0.205 | 20141110.0 | 97190.0 | 93.0 | 0.45 | 33280.0 | 0.30 | 11.0 | 11.0 | 0.0 | 4.0 | 21.0 | 1950.0 | 1288.0 | 82.0 | 516.0 | 482.0 | 16.0 | 16.0 | 2.0 | 0.0 | 20.0 | 5.0 | 0.0 | 0.0 | 10400.0 | 10320.0 | 3.0 | 137.0 |
490 | 1.49 | 0.555 | 20130817.0 | 373700.0 | 356.0 | 0.30 | 61940.0 | 0.10 | 15.0 | 15.0 | 8.0 | 8.0 | 8.0 | 3090.0 | 1738.0 | 82.0 | 491.0 | 448.0 | 40.0 | 22.0 | 7.0 | 3.0 | 40.0 | 78.0 | 0.0 | 10.0 | 6600.0 | 6418.0 | 20.0 | 51.0 |
1 | 1.28 | 0.458 | 20160402.0 | 302910.0 | 224.0 | 0.35 | 10590.0 | 0.05 | 13.0 | 13.0 | 0.0 | 3.0 | 3.0 | 2100.0 | 779.0 | 84.0 | 653.0 | 635.0 | 37.0 | 36.0 | 0.0 | 2.0 | 49.0 | 4.0 | 2.0 | 1.0 | 15100.0 | 9360.0 | 4.0 | 2.0 |
# Standardise the features. The scaler learns its statistics from the training
# split only and the same transformation is then applied to both splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,roc_curve
from mlxtend.classifier import StackingClassifier
# Candidate classifiers.

# L1-regularised logistic regression. The solver is named explicitly: liblinear
# supports the L1 penalty, whereas the lbfgs default of newer scikit-learn
# releases rejects it.
lr_model = LogisticRegression(C=0.1, penalty='l1', solver='liblinear')
# probability=True is required so the SVM exposes predict_proba for AUC.
svm_model = svm.SVC(C=0.01, kernel='linear', probability=True)
dt_model = DecisionTreeClassifier(max_depth=5, min_samples_split=50, min_samples_leaf=60,
                                  max_features=9, random_state=2333)
xgb_model = XGBClassifier(learning_rate=0.1, n_estimators=80, max_depth=3, min_child_weight=5,
                          gamma=0.2, subsample=0.8, colsample_bytree=0.8, reg_alpha=1e-5,
                          objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
# NOTE(review): `gamma` is an XGBoost parameter name; LightGBM documents
# `min_split_gain` for the same purpose -- confirm this argument has an effect.
lgbm_model = LGBMClassifier(learning_rate=0.1, n_estimators=100, max_depth=3, min_child_weight=11,
                            gamma=0.1, subsample=0.5, colsample_bytree=0.9, reg_alpha=1e-5,
                            nthread=4, scale_pos_weight=1, seed=27)
gbdt_model = GradientBoostingClassifier(n_estimators=100)
rf_model = RandomForestClassifier(class_weight='balanced', n_estimators=100)

# Stacking ensemble: the class probabilities of the three first-level models
# are concatenated (not averaged) and fed to the logistic-regression
# meta-classifier. The first entry is `lgbm_model`; the previous spelling
# `lgb_model` referred to a name that is never defined.
sclf_model = StackingClassifier(classifiers=[lgbm_model, gbdt_model, rf_model],
                                use_probas=True,
                                average_probas=False,
                                meta_classifier=lr_model)

# Display name -> estimator, in the order the models are trained and reported.
models = {'LR': lr_model, 'SVM': svm_model, 'DT': dt_model, 'GBDT': gbdt_model,
          'XGBoost': xgb_model, 'LGBM': lgbm_model, 'Stack': sclf_model}
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
in ()
12 from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
13 from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,roc_curve
---> 14 from mlxtend.classifier import StackingClassifier
15
16 lr_model = LogisticRegression(C = 0.1, penalty = 'l1')
ModuleNotFoundError: No module named 'mlxtend'
# Empty results table: one row per evaluated model is appended later.
result_columns = ['model', 'accuracy', 'precision', 'recall', 'f1_score', 'auc']
df_result = pd.DataFrame(columns=result_columns)
# Index of the next row to write into df_result.
row = 0
# Evaluation helper.
def evaluate(y_pre, y):
    """Score predicted labels against the true labels.

    Args:
        y_pre: Predicted class labels.
        y: True class labels.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)`` as computed by the
        corresponding scikit-learn metric functions.
    """
    # The metric functions take the true labels first, then the predictions.
    metric_fns = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(metric(y, y_pre) for metric in metric_fns)
# Fit every candidate model on the training split and record its metrics on
# the held-out split as one row of df_result.
for name, model in models.items():
    print(name, 'start training...')
    model.fit(X_train, y_train)

    # Hard labels feed accuracy/precision/recall/F1.
    y_pred = model.predict(X_test)
    acc, p, r, f1 = evaluate(y_pred, y_test)

    # AUC is computed from the scores in the second probability column.
    y_proba = model.predict_proba(X_test)
    auc = roc_auc_score(y_test, y_proba[:, 1])

    df_result.loc[row] = [name, acc, p, r, f1, auc]
    row += 1

# Show the comparison table once all models have been evaluated.
print(df_result)
ubuntu16不知怎么了,使用pip安装mlxtend始终报错,先把程序写上,明天继续安装。