Table of Contents
- 1 数据导入
- 2 模型挑选
- 3 模型调参
- 3.1 设立初始参数
- 3.2 调解n_estimators
- 3.3 max_depth/num_leaves
- 3.4 min_child_samples/min_child_weight
- 3.5 subsample/colsample_bytree(0.6,1)
- 3.6 reg_alpha/reg_lamb
- 3.7 学习率
- 4 测试集生成结果
- 5 特征选择
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['KaiTi']
plt.rcParams['axes.unicode_minus'] = False
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns',None)
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
数据导入
op = pd.read_csv('op_done.csv',index_col='user')
base = pd.read_csv('base_done.csv',index_col='user')
tr = pd.read_csv('tr_done.csv',index_col='user')
label = pd.read_csv('train_label.csv',index_col='user')
sumbit = pd.read_csv('submit_example.csv',index_col='user')
train = label.join(base).join(op).join(tr)
train.fillna(0,inplace=True)
train.head()
|
label |
sex |
age |
provider |
level |
verified |
using_time |
regist_type |
card_a_cnt |
card_b_cnt |
card_c_cnt |
agreement1 |
op1_cnt |
op2_cnt |
card_d_cnt |
agreement_total |
service1_cnt |
service1_amt |
service2_cnt |
agreement2 |
agreement3 |
agreement4 |
acc_count |
login_cnt_period1 |
login_cnt_period2 |
ip_cnt |
login_cnt_avg |
login_days_cnt |
province |
city |
balance |
balance_avg |
balance1 |
balance1_avg |
balance2 |
balance2_avg |
service3 |
service3_level |
product1_amount |
product2_amount |
product3_amount |
product4_amount |
product5_amount |
product6_amount |
product7_cnt |
product7_fail_cnt |
op_time |
is_all |
116a2503b987ea81 |
b131ac74aa38a121 |
b2e7fa260df4998d |
type4 |
op_type0 |
op_type1 |
op_type2 |
op_type3 |
op_type4 |
mode0 |
mode1 |
mode2 |
mode3 |
mode4 |
channel0 |
channel1 |
channel2 |
channel3 |
channel4 |
ip_num |
platform_0 |
platform_1 |
platform_2 |
platform_3 |
platform_4 |
platform_5 |
tunnel_in_0 |
tunnel_in_1 |
tunnel_in_2 |
tunnel_in_3 |
tunnel_in_4 |
tunnel_out_0 |
tunnel_out_1 |
tunnel_out_2 |
tunnel_out_3 |
type1_0 |
type1_1 |
type1_2 |
type1_3 |
type1_4 |
type1_5 |
type1_6 |
type1_7 |
type1_8 |
type1_9 |
type1_10 |
type1_11 |
type1_12 |
type1_13 |
type1_14 |
type1_15 |
type1_16 |
type1_17 |
type1_18 |
type1_19 |
type2_0 |
type2_1 |
type2_2 |
type2_3 |
type2_4 |
type2_5 |
type2_6 |
type2_7 |
type2_8 |
type2_9 |
type2_10 |
type2_11 |
type2_12 |
type2_13 |
tr_time |
mean_amount |
ip_ture |
user |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Train_00000 |
0 |
0 |
24871 |
0 |
1 |
0 |
24712 |
1 |
24712 |
24712 |
24706 |
0 |
24731 |
24719 |
24706 |
24743 |
24706 |
24706 |
24706 |
1 |
0 |
0 |
24737 |
25041 |
24938 |
24719 |
24737 |
24749 |
1 |
0 |
14 |
14 |
1 |
1 |
16 |
5 |
0 |
4 |
1 |
1 |
1 |
0 |
0 |
1 |
24712 |
24706 |
102.0 |
0.705882 |
1.0 |
1.0 |
28.0 |
72.0 |
12.0 |
0.0 |
44.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
12.0 |
0.0 |
0.0 |
0.0 |
0.0 |
2.0 |
0.0 |
4.0 |
6.0 |
7.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
7.0 |
0.0 |
13.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
2.0 |
4.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
6.0 |
0.0 |
0.0 |
0.0 |
7.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
13.0 |
53330.307692 |
0.384615 |
Train_00001 |
1 |
0 |
24889 |
0 |
2 |
0 |
24716 |
1 |
24719 |
24719 |
24706 |
0 |
24712 |
24712 |
24706 |
24755 |
24706 |
24706 |
24706 |
1 |
0 |
0 |
24737 |
25443 |
24931 |
24731 |
24749 |
24737 |
2 |
1 |
3 |
8 |
6 |
6 |
1 |
5 |
0 |
4 |
2 |
3 |
1 |
0 |
0 |
6 |
24712 |
24706 |
18.0 |
0.333333 |
11.0 |
0.0 |
1.0 |
6.0 |
6.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
6.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
2.0 |
1.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
2.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
2.0 |
36098.000000 |
0.500000 |
Train_00002 |
0 |
0 |
24963 |
0 |
2 |
0 |
24736 |
7 |
24712 |
24712 |
24706 |
0 |
24712 |
24712 |
24706 |
24743 |
24706 |
24706 |
24706 |
1 |
0 |
0 |
24731 |
26584 |
26524 |
24774 |
24774 |
24859 |
2 |
0 |
8 |
8 |
1 |
1 |
9 |
5 |
0 |
4 |
1 |
1 |
1 |
0 |
0 |
1 |
24719 |
24719 |
8.0 |
0.125000 |
0.0 |
5.0 |
2.0 |
1.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
2.0 |
0.0 |
1.0 |
5.0 |
7.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
7.0 |
0.0 |
12.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
6.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
5.0 |
0.0 |
0.0 |
0.0 |
7.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
12.0 |
57329.583333 |
0.500000 |
Train_00003 |
0 |
0 |
24840 |
2 |
2 |
0 |
24719 |
3 |
24712 |
24712 |
24706 |
0 |
24719 |
24706 |
24706 |
24737 |
24706 |
24706 |
24706 |
0 |
1 |
0 |
24712 |
25571 |
25529 |
24908 |
24737 |
24846 |
2 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
0 |
4 |
1 |
1 |
1 |
0 |
0 |
1 |
24712 |
24706 |
108.0 |
0.472222 |
0.0 |
4.0 |
46.0 |
58.0 |
24.0 |
0.0 |
14.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
24.0 |
0.0 |
0.0 |
0.0 |
0.0 |
6.0 |
0.0 |
19.0 |
2.0 |
9.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
9.0 |
0.0 |
11.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
3.0 |
0.0 |
4.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
2.0 |
0.0 |
0.0 |
0.0 |
9.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
11.0 |
61652.454545 |
0.727273 |
Train_00004 |
0 |
0 |
24871 |
2 |
1 |
0 |
24707 |
3 |
24712 |
24712 |
24706 |
0 |
24706 |
24706 |
24706 |
24725 |
24706 |
24706 |
24706 |
0 |
0 |
0 |
24712 |
25838 |
25838 |
24755 |
24816 |
24767 |
1 |
0 |
9 |
6 |
1 |
1 |
10 |
3 |
0 |
4 |
1 |
1 |
1 |
0 |
0 |
1 |
24706 |
24706 |
5.0 |
0.200000 |
0.0 |
4.0 |
0.0 |
1.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
36689.000000 |
0.000000 |
train_X = train.iloc[:,1:].values
train_y = train.iloc[:,0].values
stand = StandardScaler()
train_X = stand.fit_transform(train_X)
test = sumbit.join(base).join(op).join(tr)
test.fillna(0,inplace =True)
test_X = test.iloc[:,1:].values
test_X = stand.fit_transform(test_X)
模型挑选
lr = LogisticRegression(random_state=2018)
svm = SVC(probability=True,random_state=2018)
forest=RandomForestClassifier(n_estimators=100,random_state=2018)
Gbdt=GradientBoostingClassifier(random_state=2018)
Xgbc=XGBClassifier(random_state=2018)
gbm=lgb.LGBMClassifier(random_state=2018)
model_name=["lr","svm","forest","Gbdt","Xgbc","gbm"]
def muti_score(model):
auc = cross_val_score(model, train_X, train_y, scoring='roc_auc', cv=3)
return auc.mean()
scores = []
for name in model_name:
model = eval(name)
socre = muti_score(model)
scores.append((name,socre))
scores
[('lr', 0.6374291913334925),
('svm', 0.42584336157620334),
('forest', 0.6732019222635085),
('Gbdt', 0.6995580705824883),
('Xgbc', 0.6890128512134231),
('gbm', 0.7027585172289985)]
模型调参
经过对比 gbdt和gbm效果较好
调参顺序 n_estimators -- max_depth/num_leaves -- min_child_samples/min_child_weight -- subsample/colsample_bytree --reg_alpha/reg_lambda -- 学习率
设立初始参数
params = {'boosting_type':'gbdt','objective': 'binary','subsample': 0.8,'colsample_bytree': 0.8}
gbm=lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 0.8,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': -1,
'min_child_samples': 20,
'min_child_weight': 0.001,
'min_split_gain': 0.0,
'n_estimators': 100,
'n_jobs': -1,
'num_leaves': 31,
'objective': 'binary',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': True,
'subsample': 0.8,
'subsample_for_bin': 200000,
'subsample_freq': 0}
调解n_estimators
param_1 = {'n_estimators':range(50,150,5)}
cv = GridSearchCV(gbm,param_grid=param_1,scoring='roc_auc',cv=5)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
plt.plot(result['param_n_estimators'],result['mean_test_score'])
0.718750789233066 {'n_estimators': 80}
[]
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-hqXl8ymd-1597326586451)(output_17_2.png)]
params.update(grid_result.best_params_)
gbm=lgb.LGBMClassifier(**params)
max_depth/num_leaves
param_2 = {'max_depth':range(5,9),'num_leaves ':range(20,50,2)}
cv = GridSearchCV(gbm,param_grid=param_2,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'max_depth': 8, 'num_leaves ': 20}
params.update({'max_depth': 8, 'num_leaves ': 20})
gbm=lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 0.8,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': 8,
'min_child_samples': 20,
'min_child_weight': 0.001,
'min_split_gain': 0.0,
'n_estimators': 80,
'n_jobs': -1,
'num_leaves': 31,
'objective': 'binary',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': True,
'subsample': 0.8,
'subsample_for_bin': 200000,
'subsample_freq': 0,
'num_leaves ': 20}
min_child_samples/min_child_weight
param_3 = {'min_child_samples':range(10,30,2),'min_child_weight':[i/1000 for i in range(0,20,2)]}
cv = GridSearchCV(gbm,param_grid=param_3,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'min_child_samples': 20, 'min_child_weight': 0.0}
params.update({'min_child_samples': 20, 'min_child_weight': 0.0})
gbm=lgb.LGBMClassifier(**params)
gbm.get_params()
{'boosting_type': 'gbdt',
'class_weight': None,
'colsample_bytree': 0.8,
'importance_type': 'split',
'learning_rate': 0.1,
'max_depth': 8,
'min_child_samples': 20,
'min_child_weight': 0.0,
'min_split_gain': 0.0,
'n_estimators': 80,
'n_jobs': -1,
'num_leaves': 31,
'objective': 'binary',
'random_state': None,
'reg_alpha': 0.0,
'reg_lambda': 0.0,
'silent': True,
'subsample': 0.8,
'subsample_for_bin': 200000,
'subsample_freq': 0,
'num_leaves ': 20}
subsample/colsample_bytree(0.6,1)
param_4 = {'subsample':[i/10 for i in range(6,10,1)],'colsample_bytree':[i/10 for i in range(6,10,1)]}
cv = GridSearchCV(gbm,param_grid=param_4,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
result = pd.DataFrame(grid_result.cv_results_)
0.7191457708890046 {'colsample_bytree': 0.8, 'subsample': 0.6}
reg_alpha/reg_lamb
param_5 = {'subsample':[i/10 for i in range(10)],'colsample_bytree':[i/10 for i in range(10)]}
cv = GridSearchCV(gbm,param_grid=param_5,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
0.7191457708890046 {'colsample_bytree': 0.8, 'subsample': 0.1}
学习率
param_6 = {'learning_rate':[i/100 for i in range(20)]}
cv = GridSearchCV(gbm,param_grid=param_6,scoring='roc_auc',cv=4)
grid_result = cv.fit(train_X, train_y)
print(grid_result.best_score_,grid_result.best_params_)
0.7191457708890046 {'learning_rate': 0.1}
测试集生成结果
gbm.fit(train_X, train_y)
LGBMClassifier(colsample_bytree=0.8, max_depth=8, min_child_weight=0.0,
n_estimators=80, num_leaves =20, objective='binary',
subsample=0.8)
gbm.feature_importances_
array([ 17, 82, 16, 13, 8, 86, 16, 12, 13, 15, 0, 25, 33,
12, 25, 0, 6, 1, 11, 12, 6, 49, 87, 61, 56, 51,
58, 36, 21, 20, 38, 20, 24, 7, 17, 14, 7, 2, 34,
5, 0, 0, 39, 60, 121, 41, 67, 35, 30, 68, 66, 53,
12, 49, 3, 12, 1, 18, 2, 9, 1, 24, 26, 8, 13,
1, 28, 18, 24, 9, 0, 5, 1, 0, 0, 0, 1, 23,
11, 19, 5, 0, 8, 5, 31, 11, 4, 6, 7, 92, 26,
0, 0, 12, 0, 0, 0, 1, 0, 16, 0, 0, 0, 35,
0, 5, 0, 10, 9, 16, 0, 0, 0, 3, 5, 0, 23,
107, 49])
train.iloc[:,1:].columns
Index(['sex', 'age', 'provider', 'level', 'verified', 'using_time',
'regist_type', 'card_a_cnt', 'card_b_cnt', 'card_c_cnt',
...
'type2_7', 'type2_8', 'type2_9', 'type2_10', 'type2_11', 'type2_12',
'type2_13', 'tr_time', 'mean_amount', 'ip_ture'],
dtype='object', length=119)
feature_importance = pd.DataFrame({'feature':train.iloc[:,1:].columns,'importance':gbm.feature_importances_})
feature_importance.sort_values(by='importance',ascending=False).head(20)
|
feature |
importance |
44 |
product7_fail_cnt |
121 |
117 |
mean_amount |
107 |
89 |
type1_7 |
92 |
22 |
login_cnt_period1 |
87 |
5 |
using_time |
86 |
1 |
age |
82 |
49 |
b2e7fa260df4998d |
68 |
46 |
is_all |
67 |
50 |
type4 |
66 |
23 |
login_cnt_period2 |
61 |
43 |
product7_cnt |
60 |
26 |
login_days_cnt |
58 |
24 |
ip_cnt |
56 |
51 |
op_type0 |
53 |
25 |
login_cnt_avg |
51 |
118 |
ip_ture |
49 |
21 |
acc_count |
49 |
53 |
op_type2 |
49 |
45 |
op_time |
41 |
42 |
product6_amount |
39 |
y_pre = gbm.predict(train_X)
y_pre = gbm.predict_proba(train_X)
roc_auc_score(train_y,y_pre[:,1])
0.7876456295250149
y = gbm.predict_proba(test_X)
y[:,1]
array([0.02967902, 0.44846496, 0.02377314, ..., 0.21914047, 0.28423991,
0.16758796])
test['prob'] = y[:,1]
pd.DataFrame(test.iloc[:,0]).to_csv('result.csv')
auc = cross_val_score(gbm, train_X, train_y, scoring='roc_auc', cv=10)
array([0.73246514, 0.72179747, 0.72288483, 0.72767674, 0.72240485,
0.72194103, 0.71986724, 0.71257605, 0.71155348, 0.7186749 ])
auc.mean()
0.7211841722940205
特征选择
list_feature = feature_importance.sort_values(by='importance',ascending=False)['feature'].to_list()
list_socre = []
for i in range(50,120,10):
fearture = list_feature[:i]
train_X = stand.fit_transform(train.loc[:,fearture].values)
auc = cross_val_score(gbm, train_X, train_y, scoring='roc_auc', cv=5)
list_socre.append((i,auc.mean()))
list_socre
[(50, 0.7164324796787287),
(60, 0.7178106094882282),
(70, 0.7200468611823796),
(80, 0.7193456575143582),
(90, 0.7190751868013574),
(100, 0.7190497035344566),
(110, 0.7182153617821309)]
test_X = stand.fit_transform(test.loc[:,list_feature[:70]].values)
train_X = stand.fit_transform(train.loc[:,list_feature[:70]].values)
gbm.fit(train_X,train_y)
LGBMClassifier(colsample_bytree=0.8, max_depth=8, min_child_weight=0.0,
n_estimators=80, num_leaves =20, objective='binary',
subsample=0.8)