Given a set of anonymized features, predict whether a customer will make a transaction.
The data fields are documented, and the training and test sets can be downloaded, at:
https://www.kaggle.com/c/santander-customer-transaction-prediction/data.
Load the required libraries and the data:
# EDA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# model
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import shap
import statsmodels.api as sm
# utilities
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)
sns.set_style('whitegrid')
# load data
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
Check the data dimensions and get a quick overview:
print(f'Shape: Train, {train.shape}; Test, {test.shape}')
display(train.describe())
Shape: Train, (200000, 202); Test, (200000, 201)
| | target | var_0 | var_1 | var_2 | … | var_199 |
|---|---|---|---|---|---|---|
| count | 200000.000000 | 200000.000000 | 200000.000000 | 200000.000000 | … | 200000.000000 |
| mean | 0.100490 | 10.679914 | -1.627622 | 10.715192 | … | -3.326537 |
| std | 0.300653 | 3.040051 | 4.050044 | 2.640894 | … | 10.438015 |
| min | 0.000000 | 0.408400 | -15.043400 | 2.117100 | … | -38.852800 |
| 25% | 0.000000 | 8.453850 | -4.740025 | 8.722475 | … | -11.208475 |
| 50% | 0.000000 | 10.524750 | -1.608050 | 10.580000 | … | -2.819550 |
| 75% | 0.000000 | 12.758200 | 1.358625 | 12.516700 | … | 4.836800 |
| max | 1.000000 | 20.315000 | 10.376800 | 19.353000 | … | 28.500700 |

(describe() output truncated to the first few and the last of the 200 var columns)
All values appear to be numeric; next check the dtypes and missing values:
train.info()
print(f'Number of columns with nan: {train.isna().any().sum()}')
RangeIndex: 200000 entries, 0 to 199999
Columns: 202 entries, ID_code to var_199
dtypes: float64(200), int64(1), object(1)
memory usage: 308.2+ MB
Number of columns with nan: 0
All feature columns are numeric; the single object column is the unique ID_code, which is dropped before modeling, and the dataset has no missing values.
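A minimal sketch of that preprocessing step (an illustration only; the modeling code below instead selects features by column name):

# Separate the label and drop the identifier column before modeling.
X = train.drop(['ID_code', 'target'], axis=1)
y = train['target']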
Check the distribution of the target in the training set:
sns.countplot(train['target'],palette='Set3')
print(f"Proportion of target=1: {train['target'].mean()*100:.3f}%")
Proportion of target=1: 10.049%
The feature names offer no hints about the data's meaning, so we start from summary statistics.
First, compare each feature's distribution across the two target classes; here are the first 25 features:
features=train.columns[2:27]   # first 25 feature columns
t0=train.loc[train['target']==0]
t1=train.loc[train['target']==1]
plt.figure(figsize=(18,22))
for i,val in enumerate(features):
    plt.subplot(5,5,i+1)
    sns.distplot(t0[val],hist=False,label="target = 0")
    sns.distplot(t1[val],hist=False,label="target = 1")
    plt.xlabel(val,fontsize=9)
    plt.tick_params(axis='x', which='major', labelsize=3, pad=-6)
    plt.tick_params(axis='y', which='major', labelsize=3)
plt.show()
For several features (e.g. var_2, var_6, var_12) the two classes clearly differ over part of the value range, so adding the frequency (count) of each feature value may help prediction.
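To put a number on these visual differences, here is a minimal sketch (an addition, not part of the original analysis) using a two-sample Kolmogorov–Smirnov test:

from scipy.stats import ks_2samp

# Larger KS statistics mean the class-conditional distributions differ more.
ks_stats = {f: ks_2samp(t0[f], t1[f]).statistic for f in features}
print(sorted(ks_stats.items(), key=lambda kv: -kv[1])[:5])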
Next, compare the distributions of several summary statistics between target=0 and target=1.
Standard deviation (per row, then per column), followed by the maximum:
# per-row standard deviation
plt.figure(figsize=(16,6))
sns.distplot(t0[features].std(axis=1),hist=True,bins=120,label='target = 0',kde=True)
sns.distplot(t1[features].std(axis=1),hist=True,bins=120,label='target = 1',kde=True)
plt.legend()
# per-column standard deviation
plt.figure(figsize=(16,6))
sns.distplot(t0[features].std(axis=0),hist=True,bins=120,label='target = 0',kde=True)
sns.distplot(t1[features].std(axis=0),hist=True,bins=120,label='target = 1',kde=True)
plt.legend()
# per-row maximum
plt.figure(figsize=(16,6))
sns.distplot(t0[features].max(axis=1),hist=True,bins=120,label='target = 0',kde=True)
sns.distplot(t1[features].max(axis=1),hist=True,bins=120,label='target = 1',kde=True)
plt.legend()
# per-column maximum
plt.figure(figsize=(16,6))
sns.distplot(t0[features].max(axis=0),hist=True,bins=120,label='target = 0',kde=True)
sns.distplot(t1[features].max(axis=0),hist=True,bins=120,label='target = 1',kde=True)
plt.legend()
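Row-level statistics like these can themselves be fed to a model, which is essentially what the first submission described at the end does. A minimal sketch, with the statistic set taken from that description (max/min, variance, skewness, kurtosis, median):

def row_stats(df, cols):
    # Aggregate per-row statistics over the given feature columns.
    return pd.DataFrame({
        'r_mean': df[cols].mean(axis=1),
        'r_std': df[cols].std(axis=1),
        'r_min': df[cols].min(axis=1),
        'r_max': df[cols].max(axis=1),
        'r_skew': df[cols].skew(axis=1),
        'r_kurt': df[cols].kurtosis(axis=1),
        'r_median': df[cols].median(axis=1),
    }, index=df.index)

train_stats = row_stats(train, features)   # would be concatenated to the feature matrix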
Compare the distributions between the training set and the test set:
Per-row and per-column means, then standard deviations:
features=train.columns[2:202]   # all 200 feature columns
# per-row mean
plt.figure(figsize=(16,6))
sns.distplot(train[features].mean(axis=1),hist=True,color='green',bins=120,label='train',kde=True)
sns.distplot(test[features].mean(axis=1),hist=True,color='red',bins=120,label='test',kde=True)
plt.legend()
# per-column mean
plt.figure(figsize=(16,6))
sns.distplot(train[features].mean(axis=0),hist=True,color='magenta',bins=120,label='train',kde=True)
sns.distplot(test[features].mean(axis=0),hist=True,color='darkblue',bins=120,label='test',kde=True)
plt.legend()
# per-column standard deviation
plt.figure(figsize=(16,6))
sns.distplot(train[features].std(axis=0),hist=True,color='blue',bins=120,label='train',kde=True)
sns.distplot(test[features].std(axis=0),hist=True,color='g',bins=120,label='test',kde=True)
plt.legend()
# per-row standard deviation
plt.figure(figsize=(16,6))
sns.distplot(train[features].std(axis=1),hist=True,color='magenta',bins=120,label='train',kde=True)
sns.distplot(test[features].std(axis=1),hist=True,color='darkblue',bins=120,label='test',kde=True)
plt.legend()
Check the pairwise correlations between features: the largest absolute correlation is only 0.009844, so there is essentially no linear correlation between any pair of features.
# absolute pairwise correlations, flattened and sorted ascending
correlations=train[features].corr().abs().unstack().sort_values(kind='quicksort').reset_index()
correlations=correlations.loc[correlations['level_0']!=correlations['level_1']]   # drop self-correlations
correlations.tail(10)
| | level_0 | level_1 | 0 |
|---|---|---|---|
| 39790 | var_183 | var_189 | 0.009359 |
| 39791 | var_189 | var_183 | 0.009359 |
| 39792 | var_174 | var_81 | 0.009490 |
| 39793 | var_81 | var_174 | 0.009490 |
| 39794 | var_81 | var_165 | 0.009714 |
| 39795 | var_165 | var_81 | 0.009714 |
| 39796 | var_53 | var_148 | 0.009788 |
| 39797 | var_148 | var_53 | 0.009788 |
| 39798 | var_26 | var_139 | 0.009844 |
| 39799 | var_139 | var_26 | 0.009844 |
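The same maximum can be checked directly (a convenience sketch added here). These near-zero correlations are what justify treating each variable independently, which the 200 single-variable LightGBM models below exploit:

corr = train[features].corr().abs().to_numpy()
np.fill_diagonal(corr, 0.0)                             # ignore the diagonal of 1s
print(f'Max off-diagonal |corr|: {corr.max():.6f}')     # ≈ 0.009844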
We now stack two models: 1. LightGBM; 2. Logistic regression.
# Add frequency-encoding (count) features
def encode_FE(df,col,test):
    # count how often each value of `col` appears in df
    cv=df[col].value_counts()
    nm=col+'_FE'
    df[nm]=df[col].map(cv)
    test[nm]=test[col].map(cv)
    test[nm].fillna(0,inplace=True)
    # downcast to the smallest unsigned integer type that fits
    if cv.max()<255:
        df[nm]=df[nm].astype('uint8')
        test[nm]=test[nm].astype('uint8')
    else:
        df[nm]=df[nm].astype('uint16')
        test[nm]=test[nm].astype('uint16')
    return

test['target']=-1
comb=pd.concat([train,test],axis=0,sort=True)
for i in range(200): encode_FE(comb,'var_'+str(i),test)
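Note that the counts are computed over train and test combined, which is the "magic" referenced in the print statements below. A quick sanity check of what the encoding produces (illustrative only):

# Each var_k_FE value is the number of rows (train+test) sharing that var_k value.
print(comb[['var_0','var_0_FE']].head())
print(comb['var_0_FE'].dtype)   # uint8 or uint16, depending on the maximum count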
Parameters and data preparation before cross-validation:
# take back the training rows
train = comb[:len(train)]
# LightGBM parameters: binary objective, scored by AUC as the competition requires
params={
    'learning_rate':0.04,
    'num_leaves':3,            # tiny trees: each model sees at most 2 features
    'metric':'auc',
    'boost_from_average':False,
    'feature_fraction':1.0,
    'max_depth':-1,
    'objective':'binary',
    'verbosity':10
}
# shuffle
train2=train.sample(frac=1.0,random_state=1)
num_vars=200
# all_oof: out-of-fold predictions from cross-validation
# all_oofB: the same, without the frequency features
# the leading column of ones is the intercept for the statsmodels Logit stacker
all_oof = np.zeros((len(train2),num_vars+1))
all_oof[:,0] = np.ones(len(train2))
all_oofB = np.zeros((len(train2),num_vars+1))
all_oofB[:,0] = np.ones(len(train2))
# all_preds: test-set predictions; all_predsB: the same, without the frequency features
all_preds = np.zeros((len(test),num_vars+1))
all_preds[:,0] = np.ones(len(test))
all_predsB = np.zeros((len(test),num_vars+1))
all_predsB[:,0] = np.ones(len(test))
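The training loop below builds its five folds by slicing train2 into contiguous 40,000-row blocks, which approximates random folds because the rows were just shuffled. A minimal alternative sketch using the StratifiedKFold imported earlier, which would also keep the ~10% positive rate identical in every fold:

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
for trn_idx, val_idx in skf.split(train2, train2['target']):
    trn_fold, val_fold = train2.iloc[trn_idx], train2.iloc[val_idx]
    # build lgb.Dataset objects and train exactly as in the loop below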
# Model training: one small LightGBM model per variable
evals_result={}
for j in range(num_vars):
    features=['var_'+str(j),'var_'+str(j)+'_FE']
    oof=np.zeros(len(train2))
    pred=np.zeros(len(test))
    # density plot of the variable for each class
    plt.figure(figsize=(16,8))
    plt.subplot(1,2,2)
    sns.distplot(train2[train2['target']==0]['var_'+str(j)],label='t=0')
    sns.distplot(train2[train2['target']==1]['var_'+str(j)],label='t=1')
    plt.legend()
    plt.yticks([])
    plt.xlabel('var_'+str(j))
    # mn,mx: range of the feature values
    # mnFE,mxFE: range of the feature's frequency values
    mn,mx=plt.xlim()
    mnFE=train2['var_'+str(j)+'_FE'].min()
    mxFE=train2['var_'+str(j)+'_FE'].max()
    # df: a grid of nunique*step points, used to visualize the predicted
    # probability of target=1 over (value, frequency) combinations
    step=50
    stepB=train2['var_'+str(j)+'_FE'].nunique()
    w=(mx-mn)/step
    x=w*(np.arange(0,step)+0.5)+mn   # x: center of each bin
    x2 = np.array([])
    for i in range(stepB):
        x2 = np.concatenate([x,x2])
    df = pd.DataFrame({'var_'+str(j):x2})
    df['var_'+str(j)+'_FE'] = mnFE + (mxFE-mnFE)/(stepB-1) * (df.index//step)
    df['pred'] = 0
    # 5-fold cross-validation, with the frequency feature
    for k in range(5):
        valid=train2.iloc[(40000*k):(40000*(k+1))]
        train=train2[~train2.index.isin(valid.index)]   # note: shadows the outer train DataFrame
        val=lgb.Dataset(valid[features],valid['target'])
        trn=lgb.Dataset(train[features],train['target'])
        model=lgb.train(params,train_set=trn,num_boost_round=750,valid_sets=[trn,val],verbose_eval=False,evals_result=evals_result)
        oof[(40000*k):(40000*(k+1))]=model.predict(valid[features],num_iteration=model.best_iteration)
        pred+=model.predict(test[features],num_iteration=model.best_iteration)/5.0
        df['pred'] += model.predict(df[features],num_iteration=model.best_iteration)/5.0
    val_auc=roc_auc_score(train2['target'],oof)
    print('VAR_'+str(j)+' with magic val_auc =',round(val_auc,5))
    all_oof[:,j+1] = oof
    all_preds[:,j+1] = pred
    # heatmap of the model's predictions over the (value, frequency) grid
    x = df['pred'].values
    x = np.reshape(x,(stepB,step))
    x = np.flip(x,axis=0)
    plt.subplot(1,2,1)
    sns.heatmap(x,cmap='RdBu_r',center=0.0)
    plt.title('VAR_'+str(j)+' Predictions with Magic',fontsize=16)
    plt.xticks(np.linspace(0,49,5),np.round(np.linspace(mn,mx,5),1))
    plt.xlabel('Var_'+str(j))
    s = min(mxFE-mnFE+1,20)
    plt.yticks(np.linspace(mnFE,mxFE,s)-0.5,np.linspace(mxFE,mnFE,s).astype('int'))
    plt.ylabel('Count')
    plt.show()
    # the same model without the frequency (FE) feature
    features = ['var_'+str(j)]
    oof = np.zeros(len(train2))
    preds = np.zeros(len(test))
    # plot densities
    plt.figure(figsize=(16,5))
    plt.subplot(1,2,2)
    sns.distplot(train2[train2['target']==0]['var_'+str(j)], label = 't=0')
    sns.distplot(train2[train2['target']==1]['var_'+str(j)], label = 't=1')
    plt.legend()
    plt.yticks([])
    plt.xlabel('Var_'+str(j))
    # make a grid of points for LGBM to predict
    mn,mx = plt.xlim()
    mnFE = train2['var_'+str(j)+'_FE'].min()
    mxFE = train2['var_'+str(j)+'_FE'].max()
    step = 50
    stepB = train2['var_'+str(j)+'_FE'].nunique()
    w = (mx-mn)/step
    x = w * (np.arange(0,step)+0.5) + mn
    x2 = np.array([])
    for i in range(stepB):
        x2 = np.concatenate([x,x2])
    df = pd.DataFrame({'var_'+str(j):x2})
    df['var_'+str(j)+'_FE'] = mnFE + (mxFE-mnFE)/(stepB-1) * (df.index//step)
    df['pred'] = 0
    # 5-fold cross-validation, without the frequency feature
    for k in range(5):
        valid = train2.iloc[k*40000:(k+1)*40000]
        train = train2[ ~train2.index.isin(valid.index) ]
        trn_data = lgb.Dataset(train[features], label=train['target'])
        val_data = lgb.Dataset(valid[features], label=valid['target'])
        model = lgb.train(params, trn_data, 750, valid_sets = [trn_data, val_data], verbose_eval=False, evals_result=evals_result)
        oof[k*40000:(k+1)*40000] = model.predict(valid[features], num_iteration=model.best_iteration)
        preds += model.predict(test[features], num_iteration=model.best_iteration)/5.0
        df['pred'] += model.predict(df[features], num_iteration=model.best_iteration)/5.0
    val_auc = roc_auc_score(train2['target'],oof)
    print('VAR_'+str(j)+' without magic val_auc =',round(val_auc,5))
    all_oofB[:,j+1] = oof
    all_predsB[:,j+1] = preds
    x = df['pred'].values
    x = np.reshape(x,(stepB,step))
    x = np.flip(x,axis=0)
    # plot LGBM predictions without using FE
    plt.subplot(1,2,1)
    sns.heatmap(x, cmap='RdBu_r', center=0.0)
    plt.title('VAR_'+str(j)+' Predictions without Magic',fontsize=16)
    plt.xticks(np.linspace(0,49,5),np.round(np.linspace(mn,mx,5),1))
    plt.xlabel('Var_'+str(j))
    plt.yticks([])
    plt.ylabel('')
    plt.show()
Taking VAR_2 as an example: adding the frequency feature gives a small lift in the single-variable validation AUC.
With the frequency feature:
VAR_2 with magic val_auc = 0.55043
On top of the out-of-fold predictions from the models above, fit a logistic regression stacker:
# ENSEMBLE MODEL WITHOUT FE
# the leading column of ones is the intercept (sm.Logit does not add one automatically)
logrB = sm.Logit(train2['target'], all_oofB[:,:num_vars+1])
logrB = logrB.fit(disp=0)
ensemble_predsB = logrB.predict(all_oofB[:,:num_vars+1])
ensemble_aucB = roc_auc_score(train2['target'],ensemble_predsB)
print('Combined Model without magic FE Val_AUC=',round(ensemble_aucB,5))
print()
# ENSEMBLE MODEL WITH FE
logr = sm.Logit(train2['target'], all_oof[:,:num_vars+1])
logr = logr.fit(disp=0)
# score the stacker on the out-of-fold matrix, not on the test-set predictions
ensemble_preds = logr.predict(all_oof[:,:num_vars+1])
ensemble_auc = roc_auc_score(train2['target'],ensemble_preds)
print('Combined Model with magic FE Val_AUC=',round(ensemble_auc,5))
print()
Combined Model without magic FE Val_AUC= 0.88619
Combined Model with magic FE Val_AUC= 0.91285
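To turn the fitted with-FE stacker into a submission, something like the following sketch would work (the sample_submission.csv filename is an assumption based on the usual layout of the competition's data page):

sub = pd.read_csv('sample_submission.csv')                 # assumed file name
sub['target'] = logr.predict(all_preds[:,:num_vars+1])     # all_preds already carries the intercept column
sub.to_csv('submission.csv', index=False)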
First submission: only some of the EDA summary statistics (max/min, variance, skewness, kurtosis, median, etc.) were added as features, and the score landed only around the top 42%.
Second submission: adding the stacked ensemble above (LightGBM + logistic regression) and keeping the frequency features raised the score to roughly the top 1.4% (rank 123).
Compared with blending, stacking makes it easier to reach a high score here.