import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import f_classif, SelectFromModel, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier as RFC
%matplotlib inline
plt.rcParams['font.family'] = ['SimHei']
plt.rcParams['axes.unicode_minus']=False
df = pd.read_table('userlostprob.txt')
df.head()
|
label |
sampleid |
d |
arrival |
iforderpv_24h |
decisionhabit_user |
historyvisit_7ordernum |
historyvisit_totalordernum |
hotelcr |
ordercanceledprecent |
... |
lowestprice_pre2 |
lasthtlordergap |
businessrate_pre2 |
cityuvs |
cityorders |
lastpvgap |
cr |
sid |
visitnum_oneyear |
h |
0 |
0 |
24636 |
2016-05-18 |
2016-05-18 |
0 |
NaN |
NaN |
NaN |
1.04 |
NaN |
... |
615.0 |
NaN |
0.29 |
12.880 |
3.147 |
NaN |
NaN |
7 |
NaN |
12 |
1 |
1 |
24637 |
2016-05-18 |
2016-05-18 |
0 |
NaN |
NaN |
NaN |
1.06 |
NaN |
... |
513.0 |
NaN |
0.53 |
17.933 |
4.913 |
NaN |
NaN |
33 |
NaN |
14 |
2 |
0 |
24641 |
2016-05-18 |
2016-05-19 |
0 |
NaN |
NaN |
NaN |
1.05 |
NaN |
... |
382.0 |
NaN |
0.60 |
3.993 |
0.760 |
NaN |
NaN |
10 |
NaN |
19 |
3 |
0 |
24642 |
2016-05-18 |
2016-05-18 |
0 |
NaN |
NaN |
NaN |
1.01 |
NaN |
... |
203.0 |
NaN |
0.18 |
3.220 |
0.660 |
NaN |
NaN |
8 |
NaN |
16 |
4 |
1 |
24644 |
2016-05-18 |
2016-05-19 |
0 |
NaN |
NaN |
NaN |
1.00 |
NaN |
... |
84.0 |
NaN |
NaN |
0.013 |
NaN |
NaN |
NaN |
1 |
NaN |
21 |
5 rows × 51 columns
# Class balance of the target column.
df['label'].value_counts()
0 500588
1 189357
Name: label, dtype: int64
# Preview the last rows of the raw table.
df.tail()
|
label |
sampleid |
d |
arrival |
iforderpv_24h |
decisionhabit_user |
historyvisit_7ordernum |
historyvisit_totalordernum |
hotelcr |
ordercanceledprecent |
... |
lowestprice_pre2 |
lasthtlordergap |
businessrate_pre2 |
cityuvs |
cityorders |
lastpvgap |
cr |
sid |
visitnum_oneyear |
h |
689940 |
1 |
2238419 |
2016-05-15 |
2016-05-17 |
1 |
19.0 |
NaN |
NaN |
1.06 |
NaN |
... |
406.0 |
NaN |
0.48 |
13.573 |
1.660 |
1034.0 |
1.0 |
5 |
119.0 |
18 |
689941 |
1 |
2238421 |
2016-05-15 |
2016-05-15 |
1 |
10.0 |
3.0 |
3.0 |
1.06 |
0.33 |
... |
199.0 |
713.0 |
0.51 |
2.880 |
0.513 |
179.0 |
2.0 |
15 |
1472.0 |
12 |
689942 |
0 |
2238422 |
2016-05-15 |
2016-05-17 |
0 |
NaN |
NaN |
NaN |
1.07 |
NaN |
... |
544.0 |
NaN |
0.45 |
15.293 |
2.067 |
0.0 |
NaN |
8 |
107.0 |
0 |
689943 |
0 |
2238425 |
2016-05-15 |
2016-05-17 |
0 |
NaN |
NaN |
NaN |
1.04 |
NaN |
... |
156.0 |
NaN |
0.29 |
2.467 |
0.333 |
NaN |
NaN |
4 |
NaN |
0 |
689944 |
0 |
2238426 |
2016-05-15 |
2016-05-15 |
0 |
NaN |
NaN |
NaN |
1.02 |
NaN |
... |
275.0 |
NaN |
NaN |
12.600 |
2.653 |
NaN |
NaN |
2 |
NaN |
11 |
5 rows × 51 columns
# Spot-check five random rows (no random_state is passed, so the rows differ per run).
df.sample(5)
|
label |
sampleid |
d |
arrival |
iforderpv_24h |
decisionhabit_user |
historyvisit_7ordernum |
historyvisit_totalordernum |
hotelcr |
ordercanceledprecent |
... |
lowestprice_pre2 |
lasthtlordergap |
businessrate_pre2 |
cityuvs |
cityorders |
lastpvgap |
cr |
sid |
visitnum_oneyear |
h |
477013 |
1 |
820235 |
2016-05-21 |
2016-05-21 |
0 |
15.0 |
NaN |
15.0 |
1.05 |
0.36 |
... |
582.0 |
18831.0 |
0.48 |
17.220 |
3.400 |
4242.0 |
1.33 |
446 |
906.0 |
9 |
426926 |
0 |
736598 |
2016-05-15 |
2016-05-15 |
0 |
1.0 |
NaN |
39.0 |
1.05 |
0.16 |
... |
978.0 |
12199.0 |
0.13 |
5.113 |
0.847 |
642.0 |
1.36 |
732 |
2583.0 |
8 |
628554 |
0 |
1072402 |
2016-05-20 |
2016-05-20 |
0 |
NaN |
NaN |
3.0 |
1.02 |
0.00 |
... |
147.0 |
55214.0 |
0.27 |
15.873 |
3.220 |
10002.0 |
1.11 |
186 |
905.0 |
19 |
248275 |
0 |
438633 |
2016-05-18 |
2016-06-09 |
0 |
19.0 |
2.0 |
28.0 |
1.02 |
0.78 |
... |
NaN |
3329.0 |
NaN |
1.320 |
0.087 |
145.0 |
1.12 |
449 |
17397.0 |
11 |
198972 |
0 |
356550 |
2016-05-19 |
2016-05-19 |
0 |
7.0 |
NaN |
2.0 |
1.04 |
0.50 |
... |
206.0 |
61467.0 |
0.32 |
20.480 |
5.153 |
13264.0 |
1.08 |
59 |
1522.0 |
20 |
5 rows × 51 columns
# Row and column count of the raw table.
df.shape
(689945, 51)
# Storage dtype of every column.
df.dtypes
label int64
sampleid int64
d object
arrival object
iforderpv_24h int64
decisionhabit_user float64
historyvisit_7ordernum float64
historyvisit_totalordernum float64
hotelcr float64
ordercanceledprecent float64
landhalfhours float64
ordercanncelednum float64
commentnums float64
starprefer float64
novoters float64
consuming_capacity float64
historyvisit_avghotelnum float64
cancelrate float64
historyvisit_visit_detailpagenum float64
delta_price1 float64
price_sensitive float64
hoteluv float64
businessrate_pre float64
ordernum_oneyear float64
cr_pre float64
avgprice float64
lowestprice float64
firstorder_bu float64
customereval_pre2 float64
delta_price2 float64
commentnums_pre float64
customer_value_profit float64
commentnums_pre2 float64
cancelrate_pre float64
novoters_pre2 float64
novoters_pre float64
ctrip_profits float64
deltaprice_pre2_t1 float64
lowestprice_pre float64
uv_pre float64
uv_pre2 float64
lowestprice_pre2 float64
lasthtlordergap float64
businessrate_pre2 float64
cityuvs float64
cityorders float64
lastpvgap float64
cr float64
sid int64
visitnum_oneyear float64
h int64
dtype: object
# Per-column non-null counts, dtypes and total memory usage.
df.info()
RangeIndex: 689945 entries, 0 to 689944
Data columns (total 51 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 label 689945 non-null int64
1 sampleid 689945 non-null int64
2 d 689945 non-null object
3 arrival 689945 non-null object
4 iforderpv_24h 689945 non-null int64
5 decisionhabit_user 385450 non-null float64
6 historyvisit_7ordernum 82915 non-null float64
7 historyvisit_totalordernum 386525 non-null float64
8 hotelcr 689148 non-null float64
9 ordercanceledprecent 447831 non-null float64
10 landhalfhours 661312 non-null float64
11 ordercanncelednum 447831 non-null float64
12 commentnums 622029 non-null float64
13 starprefer 464892 non-null float64
14 novoters 672918 non-null float64
15 consuming_capacity 463837 non-null float64
16 historyvisit_avghotelnum 387876 non-null float64
17 cancelrate 678227 non-null float64
18 historyvisit_visit_detailpagenum 307234 non-null float64
19 delta_price1 437146 non-null float64
20 price_sensitive 463837 non-null float64
21 hoteluv 689148 non-null float64
22 businessrate_pre 483896 non-null float64
23 ordernum_oneyear 447831 non-null float64
24 cr_pre 660548 non-null float64
25 avgprice 457261 non-null float64
26 lowestprice 687931 non-null float64
27 firstorder_bu 376993 non-null float64
28 customereval_pre2 661312 non-null float64
29 delta_price2 437750 non-null float64
30 commentnums_pre 598368 non-null float64
31 customer_value_profit 439123 non-null float64
32 commentnums_pre2 648457 non-null float64
33 cancelrate_pre 653015 non-null float64
34 novoters_pre2 657616 non-null float64
35 novoters_pre 648956 non-null float64
36 ctrip_profits 445187 non-null float64
37 deltaprice_pre2_t1 543180 non-null float64
38 lowestprice_pre 659689 non-null float64
39 uv_pre 660548 non-null float64
40 uv_pre2 661189 non-null float64
41 lowestprice_pre2 660664 non-null float64
42 lasthtlordergap 447831 non-null float64
43 businessrate_pre2 602960 non-null float64
44 cityuvs 682274 non-null float64
45 cityorders 651263 non-null float64
46 lastpvgap 592818 non-null float64
47 cr 457896 non-null float64
48 sid 689945 non-null int64
49 visitnum_oneyear 592910 non-null float64
50 h 689945 non-null int64
dtypes: float64(44), int64(5), object(2)
memory usage: 268.5+ MB
# Summary statistics reported at the 1/10/25/50/75/90/99th percentiles.
df.describe([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])
|
label |
sampleid |
iforderpv_24h |
decisionhabit_user |
historyvisit_7ordernum |
historyvisit_totalordernum |
hotelcr |
ordercanceledprecent |
landhalfhours |
ordercanncelednum |
... |
lowestprice_pre2 |
lasthtlordergap |
businessrate_pre2 |
cityuvs |
cityorders |
lastpvgap |
cr |
sid |
visitnum_oneyear |
h |
count |
689945.000000 |
6.899450e+05 |
689945.000000 |
385450.000000 |
82915.000000 |
386525.000000 |
689148.000000 |
447831.000000 |
661312.000000 |
447831.000000 |
... |
660664.000000 |
447831.000000 |
602960.000000 |
682274.000000 |
651263.000000 |
592818.000000 |
457896.000000 |
689945.000000 |
5.929100e+05 |
689945.000000 |
mean |
0.274452 |
6.285402e+05 |
0.193737 |
5.317048 |
1.856094 |
11.710487 |
1.060996 |
0.342119 |
6.086366 |
154.179369 |
... |
318.541812 |
101830.919400 |
0.368237 |
10.648278 |
2.253250 |
12049.409382 |
1.137476 |
153.702414 |
1.855185e+04 |
14.462315 |
std |
0.446238 |
4.146815e+05 |
0.395226 |
38.524483 |
2.103862 |
17.251429 |
0.045264 |
0.354210 |
12.413225 |
398.456986 |
... |
351.913035 |
122784.313864 |
0.219945 |
15.696682 |
3.538453 |
25601.374138 |
0.204789 |
277.807697 |
2.288603e+05 |
6.301575 |
min |
0.000000 |
2.463600e+04 |
0.000000 |
0.000000 |
1.000000 |
1.000000 |
1.000000 |
0.000000 |
0.000000 |
0.000000 |
... |
1.000000 |
0.000000 |
0.000000 |
0.007000 |
0.007000 |
0.000000 |
1.000000 |
0.000000 |
1.000000e+00 |
0.000000 |
1% |
0.000000 |
3.620588e+04 |
0.000000 |
1.000000 |
1.000000 |
1.000000 |
1.000000 |
0.000000 |
0.000000 |
0.000000 |
... |
52.000000 |
244.000000 |
0.010000 |
0.013000 |
0.007000 |
0.000000 |
1.000000 |
1.000000 |
2.100000e+01 |
0.000000 |
10% |
0.000000 |
1.398464e+05 |
0.000000 |
1.000000 |
1.000000 |
1.000000 |
1.010000 |
0.000000 |
0.000000 |
0.000000 |
... |
101.000000 |
3518.000000 |
0.050000 |
0.160000 |
0.033000 |
127.000000 |
1.000000 |
4.000000 |
1.610000e+02 |
6.000000 |
25% |
0.000000 |
3.123200e+05 |
0.000000 |
2.000000 |
1.000000 |
2.000000 |
1.030000 |
0.000000 |
0.000000 |
0.000000 |
... |
145.000000 |
14999.000000 |
0.170000 |
0.827000 |
0.127000 |
551.000000 |
1.000000 |
17.000000 |
4.710000e+02 |
11.000000 |
50% |
0.000000 |
5.996370e+05 |
0.000000 |
3.000000 |
1.000000 |
6.000000 |
1.050000 |
0.250000 |
0.000000 |
2.000000 |
... |
233.000000 |
46890.000000 |
0.400000 |
3.527000 |
0.627000 |
2848.000000 |
1.050000 |
62.000000 |
1.315000e+03 |
15.000000 |
75% |
1.000000 |
8.874600e+05 |
0.000000 |
5.000000 |
2.000000 |
14.000000 |
1.090000 |
0.570000 |
4.000000 |
153.000000 |
... |
388.000000 |
138953.000000 |
0.550000 |
13.327000 |
2.747000 |
10726.000000 |
1.210000 |
180.000000 |
3.141000e+03 |
20.000000 |
90% |
1.000000 |
1.059705e+06 |
1.000000 |
10.000000 |
3.000000 |
29.000000 |
1.120000 |
0.980000 |
27.000000 |
492.000000 |
... |
611.000000 |
311492.000000 |
0.650000 |
35.567000 |
7.547000 |
30384.900000 |
1.400000 |
392.000000 |
6.634000e+03 |
22.000000 |
99% |
1.000000 |
2.226893e+06 |
1.000000 |
27.000000 |
7.000000 |
82.000000 |
1.190000 |
1.000000 |
48.000000 |
1752.000000 |
... |
1464.000000 |
484734.000000 |
0.780000 |
66.007000 |
14.453000 |
138722.000000 |
2.000000 |
1212.000000 |
2.625670e+05 |
23.000000 |
max |
1.000000 |
2.238426e+06 |
1.000000 |
3167.000000 |
106.000000 |
711.000000 |
3.180000 |
1.000000 |
49.000000 |
13475.000000 |
... |
43700.000000 |
527026.000000 |
0.990000 |
67.140000 |
14.507000 |
194386.000000 |
11.000000 |
9956.000000 |
9.651192e+06 |
23.000000 |
12 rows × 49 columns
# Remove fully duplicated rows in place, then re-check the table size.
df.drop_duplicates(inplace=True)
df.shape
(689945, 51)
# Share of missing values per column, ordered from most complete to least
# complete, then relabelled with the display headers used by the plots below.
missing_share = df.isna().mean()
null = missing_share.reset_index().sort_values(by=0)
null_1 = null.rename(columns={'index': '特征', 0: '缺失比'})
null_1
|
特征 |
缺失比 |
0 |
label |
0.000000 |
48 |
sid |
0.000000 |
4 |
iforderpv_24h |
0.000000 |
50 |
h |
0.000000 |
2 |
d |
0.000000 |
1 |
sampleid |
0.000000 |
3 |
arrival |
0.000000 |
8 |
hotelcr |
0.001155 |
21 |
hoteluv |
0.001155 |
26 |
lowestprice |
0.002919 |
44 |
cityuvs |
0.011118 |
17 |
cancelrate |
0.016984 |
14 |
novoters |
0.024679 |
28 |
customereval_pre2 |
0.041500 |
10 |
landhalfhours |
0.041500 |
40 |
uv_pre2 |
0.041679 |
41 |
lowestprice_pre2 |
0.042440 |
39 |
uv_pre |
0.042608 |
24 |
cr_pre |
0.042608 |
38 |
lowestprice_pre |
0.043853 |
34 |
novoters_pre2 |
0.046857 |
33 |
cancelrate_pre |
0.053526 |
45 |
cityorders |
0.056065 |
35 |
novoters_pre |
0.059409 |
32 |
commentnums_pre2 |
0.060132 |
12 |
commentnums |
0.098437 |
43 |
businessrate_pre2 |
0.126075 |
30 |
commentnums_pre |
0.132731 |
49 |
visitnum_oneyear |
0.140642 |
46 |
lastpvgap |
0.140775 |
37 |
deltaprice_pre2_t1 |
0.212720 |
22 |
businessrate_pre |
0.298646 |
13 |
starprefer |
0.326190 |
20 |
price_sensitive |
0.327719 |
15 |
consuming_capacity |
0.327719 |
47 |
cr |
0.336330 |
25 |
avgprice |
0.337250 |
23 |
ordernum_oneyear |
0.350918 |
42 |
lasthtlordergap |
0.350918 |
11 |
ordercanncelednum |
0.350918 |
9 |
ordercanceledprecent |
0.350918 |
36 |
ctrip_profits |
0.354750 |
31 |
customer_value_profit |
0.363539 |
29 |
delta_price2 |
0.365529 |
19 |
delta_price1 |
0.366405 |
16 |
historyvisit_avghotelnum |
0.437816 |
7 |
historyvisit_totalordernum |
0.439774 |
5 |
decisionhabit_user |
0.441332 |
27 |
firstorder_bu |
0.453590 |
18 |
historyvisit_visit_detailpagenum |
0.554698 |
6 |
historyvisit_7ordernum |
0.879824 |
# Density of the per-column missing ratios.
plt.figure(figsize=(8,6))
# NOTE(review): newer seaborn releases deprecate `shade=` in favour of `fill=`;
# confirm the installed version before re-running this cell.
sns.kdeplot(null_1['缺失比'], shade=True)
# Bar chart of the same ratios, in the ascending order of null_1.
plt.figure(figsize=(8,6))
plt.bar(range(null_1.shape[0]), null_1['缺失比'], label='lost rate')
plt.legend(loc='best')
# Drop the one column that is almost entirely missing (about 88 % per null_1).
df = df.drop(['historyvisit_7ordernum'], axis=1)
df
|
label |
sampleid |
d |
arrival |
iforderpv_24h |
decisionhabit_user |
historyvisit_totalordernum |
hotelcr |
ordercanceledprecent |
landhalfhours |
... |
lowestprice_pre2 |
lasthtlordergap |
businessrate_pre2 |
cityuvs |
cityorders |
lastpvgap |
cr |
sid |
visitnum_oneyear |
h |
0 |
0 |
24636 |
2016-05-18 |
2016-05-18 |
0 |
NaN |
NaN |
1.04 |
NaN |
22.0 |
... |
615.0 |
NaN |
0.29 |
12.880 |
3.147 |
NaN |
NaN |
7 |
NaN |
12 |
1 |
1 |
24637 |
2016-05-18 |
2016-05-18 |
0 |
NaN |
NaN |
1.06 |
NaN |
0.0 |
... |
513.0 |
NaN |
0.53 |
17.933 |
4.913 |
NaN |
NaN |
33 |
NaN |
14 |
2 |
0 |
24641 |
2016-05-18 |
2016-05-19 |
0 |
NaN |
NaN |
1.05 |
NaN |
3.0 |
... |
382.0 |
NaN |
0.60 |
3.993 |
0.760 |
NaN |
NaN |
10 |
NaN |
19 |
3 |
0 |
24642 |
2016-05-18 |
2016-05-18 |
0 |
NaN |
NaN |
1.01 |
NaN |
2.0 |
... |
203.0 |
NaN |
0.18 |
3.220 |
0.660 |
NaN |
NaN |
8 |
NaN |
16 |
4 |
1 |
24644 |
2016-05-18 |
2016-05-19 |
0 |
NaN |
NaN |
1.00 |
NaN |
0.0 |
... |
84.0 |
NaN |
NaN |
0.013 |
NaN |
NaN |
NaN |
1 |
NaN |
21 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
689940 |
1 |
2238419 |
2016-05-15 |
2016-05-17 |
1 |
19.0 |
NaN |
1.06 |
NaN |
1.0 |
... |
406.0 |
NaN |
0.48 |
13.573 |
1.660 |
1034.0 |
1.0 |
5 |
119.0 |
18 |
689941 |
1 |
2238421 |
2016-05-15 |
2016-05-15 |
1 |
10.0 |
3.0 |
1.06 |
0.33 |
49.0 |
... |
199.0 |
713.0 |
0.51 |
2.880 |
0.513 |
179.0 |
2.0 |
15 |
1472.0 |
12 |
689942 |
0 |
2238422 |
2016-05-15 |
2016-05-17 |
0 |
NaN |
NaN |
1.07 |
NaN |
0.0 |
... |
544.0 |
NaN |
0.45 |
15.293 |
2.067 |
0.0 |
NaN |
8 |
107.0 |
0 |
689943 |
0 |
2238425 |
2016-05-15 |
2016-05-17 |
0 |
NaN |
NaN |
1.04 |
NaN |
0.0 |
... |
156.0 |
NaN |
0.29 |
2.467 |
0.333 |
NaN |
NaN |
4 |
NaN |
0 |
689944 |
0 |
2238426 |
2016-05-15 |
2016-05-15 |
0 |
NaN |
NaN |
1.02 |
NaN |
0.0 |
... |
275.0 |
NaN |
NaN |
12.600 |
2.653 |
NaN |
NaN |
2 |
NaN |
11 |
689945 rows × 50 columns
# Transposed summary statistics (one row per column) including the 1st/99th percentiles.
df.describe([0.01, 0.25, 0.5, 0.75, 0.99]).T
|
count |
mean |
std |
min |
1% |
25% |
50% |
75% |
99% |
max |
label |
689945.0 |
0.274452 |
0.446238 |
0.000 |
0.00000 |
0.000 |
0.000 |
1.000 |
1.000000e+00 |
1.000 |
sampleid |
689945.0 |
628540.209625 |
414681.498697 |
24636.000 |
36205.88000 |
312320.000 |
599637.000 |
887460.000 |
2.226893e+06 |
2238426.000 |
iforderpv_24h |
689945.0 |
0.193737 |
0.395226 |
0.000 |
0.00000 |
0.000 |
0.000 |
0.000 |
1.000000e+00 |
1.000 |
decisionhabit_user |
385450.0 |
5.317048 |
38.524483 |
0.000 |
1.00000 |
2.000 |
3.000 |
5.000 |
2.700000e+01 |
3167.000 |
historyvisit_totalordernum |
386525.0 |
11.710487 |
17.251429 |
1.000 |
1.00000 |
2.000 |
6.000 |
14.000 |
8.200000e+01 |
711.000 |
hotelcr |
689148.0 |
1.060996 |
0.045264 |
1.000 |
1.00000 |
1.030 |
1.050 |
1.090 |
1.190000e+00 |
3.180 |
ordercanceledprecent |
447831.0 |
0.342119 |
0.354210 |
0.000 |
0.00000 |
0.000 |
0.250 |
0.570 |
1.000000e+00 |
1.000 |
landhalfhours |
661312.0 |
6.086366 |
12.413225 |
0.000 |
0.00000 |
0.000 |
0.000 |
4.000 |
4.800000e+01 |
49.000 |
ordercanncelednum |
447831.0 |
154.179369 |
398.456986 |
0.000 |
0.00000 |
0.000 |
2.000 |
153.000 |
1.752000e+03 |
13475.000 |
commentnums |
622029.0 |
1272.090888 |
2101.871601 |
0.000 |
1.00000 |
115.000 |
514.000 |
1670.000 |
8.796000e+03 |
34189.000 |
starprefer |
464892.0 |
67.532304 |
19.175094 |
0.000 |
20.00000 |
53.300 |
69.400 |
80.300 |
1.000000e+02 |
100.000 |
novoters |
672918.0 |
1706.247901 |
2811.690007 |
1.000 |
1.00000 |
157.000 |
692.000 |
2196.000 |
1.157600e+04 |
45455.000 |
consuming_capacity |
463837.0 |
39.154140 |
23.240147 |
0.000 |
8.00000 |
22.000 |
33.000 |
51.000 |
1.000000e+02 |
100.000 |
historyvisit_avghotelnum |
387876.0 |
6.510179 |
41.045261 |
0.000 |
1.00000 |
2.000 |
4.000 |
7.000 |
2.900000e+01 |
3167.000 |
cancelrate |
678227.0 |
1051.604143 |
1509.066134 |
1.000 |
2.00000 |
137.000 |
503.000 |
1373.000 |
6.399000e+03 |
18930.000 |
historyvisit_visit_detailpagenum |
307234.0 |
37.153603 |
73.402891 |
1.000 |
1.00000 |
6.000 |
18.000 |
44.000 |
2.620000e+02 |
6199.000 |
delta_price1 |
437146.0 |
79.067012 |
512.942824 |
-99879.000 |
-1227.55000 |
-31.000 |
81.000 |
226.000 |
1.081000e+03 |
5398.000 |
price_sensitive |
463837.0 |
24.645863 |
26.685606 |
0.000 |
0.00000 |
5.000 |
16.000 |
33.000 |
1.000000e+02 |
100.000 |
hoteluv |
689148.0 |
95.092708 |
169.981527 |
0.007 |
0.16700 |
10.427 |
36.180 |
107.747 |
9.641130e+02 |
1722.613 |
businessrate_pre |
483896.0 |
0.372717 |
0.232791 |
0.000 |
0.01000 |
0.150 |
0.390 |
0.570 |
8.000000e-01 |
0.990 |
ordernum_oneyear |
447831.0 |
11.642061 |
17.137209 |
1.000 |
1.00000 |
2.000 |
6.000 |
14.000 |
8.100000e+01 |
711.000 |
cr_pre |
660548.0 |
1.062906 |
0.044588 |
1.000 |
1.00000 |
1.030 |
1.060 |
1.090 |
1.190000e+00 |
2.950 |
avgprice |
457261.0 |
422.458701 |
290.853332 |
1.000 |
91.00000 |
232.000 |
350.000 |
524.000 |
1.491000e+03 |
6383.000 |
lowestprice |
687931.0 |
318.806242 |
575.782415 |
-3.000 |
37.00000 |
116.000 |
200.000 |
380.000 |
1.823000e+03 |
100000.000 |
firstorder_bu |
376993.0 |
11.697795 |
2.746821 |
1.000 |
3.00000 |
12.000 |
13.000 |
13.000 |
1.700000e+01 |
21.000 |
customereval_pre2 |
661312.0 |
3.048519 |
1.226635 |
0.000 |
0.00000 |
2.000 |
3.000 |
4.000 |
5.500000e+00 |
6.000 |
delta_price2 |
437750.0 |
77.277208 |
391.413839 |
-43344.000 |
-949.00000 |
-29.000 |
69.000 |
198.000 |
1.018000e+03 |
5114.000 |
commentnums_pre |
598368.0 |
1415.159561 |
2329.418922 |
0.000 |
1.00000 |
137.000 |
592.000 |
1862.000 |
9.732000e+03 |
34189.000 |
customer_value_profit |
439123.0 |
3.038409 |
6.625281 |
-24.075 |
-0.29678 |
0.269 |
0.991 |
3.138 |
2.845100e+01 |
598.064 |
commentnums_pre2 |
648457.0 |
1313.388737 |
1719.513354 |
0.000 |
3.00000 |
270.000 |
768.000 |
1780.000 |
7.457000e+03 |
34189.000 |
cancelrate_pre |
653015.0 |
0.344422 |
0.179147 |
0.000 |
0.05000 |
0.230 |
0.320 |
0.420 |
1.000000e+00 |
1.000 |
novoters_pre2 |
657616.0 |
1787.197614 |
2316.712985 |
1.000 |
5.00000 |
391.000 |
1054.000 |
2413.000 |
1.001800e+04 |
45436.000 |
novoters_pre |
648956.0 |
1890.698450 |
3116.120062 |
1.000 |
2.00000 |
187.000 |
783.000 |
2453.000 |
1.383900e+04 |
45436.000 |
ctrip_profits |
445187.0 |
4.208495 |
9.314438 |
-44.313 |
-0.39300 |
0.340 |
1.347 |
4.320 |
4.075580e+01 |
600.820 |
deltaprice_pre2_t1 |
543180.0 |
3.283740 |
48.805880 |
-2296.000 |
-103.00000 |
-3.000 |
2.000 |
10.000 |
1.110000e+02 |
3324.000 |
lowestprice_pre |
659689.0 |
315.954583 |
463.723643 |
1.000 |
38.00000 |
118.000 |
208.000 |
385.000 |
1.750000e+03 |
100000.000 |
uv_pre |
660548.0 |
107.846076 |
186.731907 |
0.007 |
0.24000 |
12.533 |
42.500 |
124.707 |
1.047787e+03 |
1722.613 |
uv_pre2 |
661189.0 |
103.352990 |
157.117863 |
0.007 |
0.50000 |
17.563 |
51.287 |
126.200 |
8.567254e+02 |
1722.613 |
lowestprice_pre2 |
660664.0 |
318.541812 |
351.913035 |
1.000 |
52.00000 |
145.000 |
233.000 |
388.000 |
1.464000e+03 |
43700.000 |
lasthtlordergap |
447831.0 |
101830.919400 |
122784.313864 |
0.000 |
244.00000 |
14999.000 |
46890.000 |
138953.000 |
4.847340e+05 |
527026.000 |
businessrate_pre2 |
602960.0 |
0.368237 |
0.219945 |
0.000 |
0.01000 |
0.170 |
0.400 |
0.550 |
7.800000e-01 |
0.990 |
cityuvs |
682274.0 |
10.648278 |
15.696682 |
0.007 |
0.01300 |
0.827 |
3.527 |
13.327 |
6.600700e+01 |
67.140 |
cityorders |
651263.0 |
2.253250 |
3.538453 |
0.007 |
0.00700 |
0.127 |
0.627 |
2.747 |
1.445300e+01 |
14.507 |
lastpvgap |
592818.0 |
12049.409382 |
25601.374138 |
0.000 |
0.00000 |
551.000 |
2848.000 |
10726.000 |
1.387220e+05 |
194386.000 |
cr |
457896.0 |
1.137476 |
0.204789 |
1.000 |
1.00000 |
1.000 |
1.050 |
1.210 |
2.000000e+00 |
11.000 |
sid |
689945.0 |
153.702414 |
277.807697 |
0.000 |
1.00000 |
17.000 |
62.000 |
180.000 |
1.212000e+03 |
9956.000 |
visitnum_oneyear |
592910.0 |
18551.846682 |
228860.311117 |
1.000 |
21.00000 |
471.000 |
1315.000 |
3141.000 |
2.625670e+05 |
9651192.000 |
h |
689945.0 |
14.462315 |
6.301575 |
0.000 |
0.00000 |
11.000 |
15.000 |
20.000 |
2.300000e+01 |
23.000 |
# Distribution of the two price columns before capping.
df[['lowestprice_pre', 'lowestprice']].describe([0.01, 0.25, 0.5, 0.75, 0.99]).T
|
count |
mean |
std |
min |
1% |
25% |
50% |
75% |
99% |
max |
lowestprice_pre |
659689.0 |
315.954583 |
463.723643 |
1.0 |
38.0 |
118.0 |
208.0 |
385.0 |
1750.0 |
100000.0 |
lowestprice |
687931.0 |
318.806242 |
575.782415 |
-3.0 |
37.0 |
116.0 |
200.0 |
380.0 |
1823.0 |
100000.0 |
# Columns that are capped at their own 1st and 99th percentiles below.
col_block = ['lowestprice_pre', 'lowestprice']
def block_upper(x):
    """Cap a Series at its own 99th percentile.

    Values strictly above the 99 % quantile are replaced by that quantile;
    everything else, including NaN, is returned unchanged.
    """
    cap = x.quantile(0.99)
    too_high = x > cap
    return x.where(~too_high, cap)
def block_lower(x):
    """Floor a Series at its own 1st percentile.

    Values strictly below the 1 % quantile are replaced by that quantile;
    everything else, including NaN, is returned unchanged.
    """
    floor = x.quantile(0.01)
    too_low = x < floor
    return x.where(~too_low, floor)
# Cap each price column at its 99th percentile, then floor it at its 1st
# percentile. NOTE(review): the quantiles come from the full table, i.e.
# before the train/test split further down.
df[col_block] = df[col_block].apply(block_upper)
df[col_block] = df[col_block].apply(block_lower)
# Re-inspect the distribution after capping.
df[['lowestprice_pre', 'lowestprice']].describe([0.01, 0.25, 0.5, 0.75, 0.99]).T
|
count |
mean |
std |
min |
1% |
25% |
50% |
75% |
99% |
max |
lowestprice_pre |
659689.0 |
304.439507 |
287.192512 |
38.0 |
38.0 |
118.0 |
208.0 |
385.0 |
1750.0 |
1750.0 |
lowestprice |
687931.0 |
305.025771 |
297.382838 |
37.0 |
37.0 |
116.0 |
200.0 |
380.0 |
1823.0 |
1823.0 |
# Work on an independent copy so the cleaned table stays untouched.
df_copy = df.copy(deep=True)
# Features are every column from position 2 onwards; positions 0 and 1
# ('label' and 'sampleid' in the dtypes listing) are left out.
X = df_copy.iloc[:, 2:]
# Target is the column at position 0 ('label').
y = df_copy.iloc[:, 0]
X.head(10)
|
d |
arrival |
iforderpv_24h |
decisionhabit_user |
historyvisit_totalordernum |
hotelcr |
ordercanceledprecent |
landhalfhours |
ordercanncelednum |
commentnums |
... |
lowestprice_pre2 |
lasthtlordergap |
businessrate_pre2 |
cityuvs |
cityorders |
lastpvgap |
cr |
sid |
visitnum_oneyear |
h |
0 |
2016-05-18 |
2016-05-18 |
0 |
NaN |
NaN |
1.04 |
NaN |
22.0 |
NaN |
1089.0 |
... |
615.0 |
NaN |
0.29 |
12.880 |
3.147 |
NaN |
NaN |
7 |
NaN |
12 |
1 |
2016-05-18 |
2016-05-18 |
0 |
NaN |
NaN |
1.06 |
NaN |
0.0 |
NaN |
5612.0 |
... |
513.0 |
NaN |
0.53 |
17.933 |
4.913 |
NaN |
NaN |
33 |
NaN |
14 |
2 |
2016-05-18 |
2016-05-19 |
0 |
NaN |
NaN |
1.05 |
NaN |
3.0 |
NaN |
256.0 |
... |
382.0 |
NaN |
0.60 |
3.993 |
0.760 |
NaN |
NaN |
10 |
NaN |
19 |
3 |
2016-05-18 |
2016-05-18 |
0 |
NaN |
NaN |
1.01 |
NaN |
2.0 |
NaN |
NaN |
... |
203.0 |
NaN |
0.18 |
3.220 |
0.660 |
NaN |
NaN |
8 |
NaN |
16 |
4 |
2016-05-18 |
2016-05-19 |
0 |
NaN |
NaN |
1.00 |
NaN |
0.0 |
NaN |
NaN |
... |
84.0 |
NaN |
NaN |
0.013 |
NaN |
NaN |
NaN |
1 |
NaN |
21 |
5 |
2016-05-18 |
2016-05-20 |
0 |
NaN |
NaN |
1.02 |
NaN |
0.0 |
NaN |
15.0 |
... |
408.0 |
NaN |
NaN |
2.880 |
0.427 |
NaN |
NaN |
1 |
NaN |
21 |
6 |
2016-05-18 |
2016-05-25 |
0 |
NaN |
NaN |
1.12 |
NaN |
0.0 |
NaN |
2578.0 |
... |
145.0 |
NaN |
NaN |
4.427 |
0.493 |
NaN |
NaN |
1 |
NaN |
22 |
7 |
2016-05-18 |
2016-05-20 |
0 |
3.0 |
21.0 |
1.11 |
0.79 |
0.0 |
395.0 |
NaN |
... |
204.0 |
10475.0 |
0.53 |
12.713 |
1.987 |
7566.0 |
1.5 |
23 |
1265.0 |
17 |
8 |
2016-05-18 |
2016-05-19 |
0 |
13.0 |
NaN |
1.08 |
NaN |
0.0 |
NaN |
2572.0 |
... |
99.0 |
NaN |
0.41 |
5.393 |
0.860 |
15.0 |
1.0 |
20 |
596.0 |
20 |
9 |
2016-05-18 |
2016-06-08 |
1 |
2.0 |
7.0 |
1.07 |
0.86 |
47.0 |
6.0 |
NaN |
... |
191.0 |
18873.0 |
0.52 |
3.093 |
0.287 |
288.0 |
1.0 |
31 |
21926.0 |
7 |
10 rows × 48 columns
# Seeded hold-out split; no test_size is given, so scikit-learn's default applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# The raw date strings are not model inputs. Drop them from BOTH splits so the
# two frames keep the same columns, and rebind instead of dropping in place so
# the frames returned by the split are never mutated.
col_date = ['d', 'arrival']
X_train = X_train.drop(columns=col_date)
X_test = X_test.drop(columns=col_date)
X_train.shape
(517458, 46)
# ---- Column groups, one per imputation strategy ---------------------------
col = X_train.columns.tolist()
# Columns without missing values (see the missing-ratio table).
col_no = ['sid', 'iforderpv_24h', 'h']
# Filled with the most frequent value.
col_clf = ['decisionhabit_user']
# Columns that can be negative: filled with the median.
col_neg = ['delta_price1', 'delta_price2', 'customer_value_profit', 'ctrip_profits', 'deltaprice_pre2_t1']
# Heavily missing columns: filled with the sentinel -1.
col_35 = ['ordernum_oneyear', 'lasthtlordergap', 'ordercanncelednum',
          'ordercanceledprecent', 'ctrip_profits', 'historyvisit_avghotelnum', 'historyvisit_totalordernum',
          'decisionhabit_user', 'firstorder_bu', 'historyvisit_visit_detailpagenum']
# Widely spread columns (std > 100): filled with the median, except the four
# names removed below, which are handled by another group.
col_std = X_train.columns[X_train.describe(include='all').T['std'] > 100].to_list()
for _handled_elsewhere in ('sid', 'delta_price2', 'delta_price1', 'lasthtlordergap'):
    col_std.remove(_handled_elsewhere)
# Everything not claimed above: filled with the mean.
col_norm = list(set(col) - set(col_no + col_clf + col_neg + col_35))

# ---- Imputation -----------------------------------------------------------
# Every fill value is computed on X_train and then applied to BOTH splits, so
# the test set is transformed exactly like the training set and none of its
# own statistics are used. The steps run in the original order; a column that
# belongs to two groups keeps the value from the earlier step because it has
# no NaN left by the time the later step runs.

# mode() returns a DataFrame whose row labels are 0..k-1. Passing it straight
# to fillna aligns on the row index and fills only row label 0; .iloc[0]
# turns it into a per-column Series of fill values.
mode_fill = X_train[col_clf].mode().iloc[0]
X_train[col_clf] = X_train[col_clf].fillna(mode_fill)
X_test[col_clf] = X_test[col_clf].fillna(mode_fill)

neg_fill = X_train[col_neg].median()
X_train[col_neg] = X_train[col_neg].fillna(neg_fill)
X_test[col_neg] = X_test[col_neg].fillna(neg_fill)

X_train[col_35] = X_train[col_35].fillna(-1)
X_test[col_35] = X_test[col_35].fillna(-1)

std_fill = X_train[col_std].median()
X_train[col_std] = X_train[col_std].fillna(std_fill)
X_test[col_std] = X_test[col_std].fillna(std_fill)

norm_fill = X_train[col_norm].mean()
X_train[col_norm] = X_train[col_norm].fillna(norm_fill)
X_test[col_norm] = X_test[col_norm].fillna(norm_fill)

# Number of training columns that still contain NaN (expected: 0).
X_train.isnull().any().sum()
0
# Number of test columns that still contain NaN.
X_test.isnull().any().sum()
0
# Size of the training split after imputation.
X_train.shape
(517458, 46)
# Size of the test split after imputation.
X_test.shape
(172487, 48)
# Variance filter with the estimator's default threshold.
# NOTE(review): X_train_var is only inspected for its shape here; it is not
# used by the visible code that follows.
selector = VarianceThreshold()
X_train_var = selector.fit_transform(X_train)
X_train_var.shape
(517458, 46)
# Univariate F-test of every feature against the label.
f, p_values = f_classif(X_train, y_train)
# How many features have a p-value above 0.01.
(p_values > 0.01).sum()
6
# Keep only the features whose F-test p-value is at most 0.01.
col_f = X_train.columns[p_values <= 0.01]
X_train = X_train[col_f]
X_train.shape
(517458, 40)
# Apply the same column selection to the test split.
X_test = X_test[col_f]
X_test.shape
(172487, 40)
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index
# on all four objects.
X_train, X_test = X_train.reset_index(drop=True), X_test.reset_index(drop=True)
y_train, y_test = y_train.reset_index(drop=True), y_test.reset_index(drop=True)
# Fit a small seeded random forest and read its per-feature importances.
rfc = RFC(n_estimators=10, random_state=42)
rfc.fit(X_train, y_train)
importances = rfc.feature_importances_
importances
array([0.0116028 , 0.01896167, 0.01802851, 0.0150776 , 0.01591757,
0.02015549, 0.02034934, 0.0203898 , 0.01943307, 0.0211841 ,
0.01791408, 0.02056686, 0.02291317, 0.02291462, 0.02051174,
0.02323111, 0.02106971, 0.01054197, 0.02306534, 0.01938803,
0.0258305 , 0.02571515, 0.02508617, 0.02485002, 0.02534584,
0.02850795, 0.02408976, 0.02625239, 0.0279376 , 0.02743348,
0.02772216, 0.03058076, 0.02754623, 0.04068007, 0.03904556,
0.03655198, 0.03553142, 0.03861745, 0.04116244, 0.03829647])
# Sweep 20 importance thresholds from 0 to the largest importance. For each
# one, keep the features SelectFromModel selects at that threshold and record
# the mean 5-fold cross-validation score of the forest on them.
scores = []
thresholds = np.linspace(0, importances.max(), 20)
for threshold in thresholds:
    time0 = time()
    X_embedded = SelectFromModel(rfc, threshold=threshold).fit_transform(X_train, y_train)
    score = cross_val_score(rfc, X_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)
    # Format the elapsed duration as MM:SS:ffffff. A timedelta is used because
    # datetime.fromtimestamp() treats the duration as a wall-clock instant: it
    # applies the local UTC offset (wrong minutes in :30/:45 time zones) and
    # wraps after 59 minutes.
    elapsed = datetime.timedelta(seconds=time() - time0)
    minutes, seconds = divmod(elapsed.days * 86400 + elapsed.seconds, 60)
    print(f'{minutes:02d}:{seconds:02d}:{elapsed.microseconds:06d}')
plt.plot(thresholds, scores)
plt.show()
01:12:090613
01:13:526636
01:09:249811
01:08:676264
01:07:224143
01:06:510684
01:14:200829
01:13:976302
01:13:173232
01:07:879761
01:05:081422
01:03:703478
00:57:668075
00:55:913328
00:49:275084
00:47:250707
00:48:596243
00:53:333874
00:45:020932
00:53:343730
# Best mean cross-validation score found by the sweep.
max(scores)
0.9507844100831351
# Threshold at which that best score was reached (first occurrence).
thresholds[scores.index(max(scores))]
0.028163774952387383
# Take the winning threshold from the sweep results instead of a pasted
# literal, so this cell stays correct if the sweep is re-run.
best_threshold = thresholds[scores.index(max(scores))]
# `>=` matches how SelectFromModel applies its threshold during the sweep.
col_k = X_train.columns[importances >= best_threshold].to_list()
# .copy() gives an independent frame, so later column drops do not act on a
# slice of X_train.
X_train_embedded = X_train[col_k].copy()
X_train_embedded.head()
|
ctrip_profits |
lasthtlordergap |
cityuvs |
cityorders |
lastpvgap |
cr |
sid |
visitnum_oneyear |
h |
0 |
1.347 |
-1.0 |
3.787 |
0.387 |
2850.0 |
1.137405 |
3 |
1314.0 |
6 |
1 |
1.347 |
-1.0 |
0.127 |
0.007 |
2850.0 |
1.137405 |
7 |
1314.0 |
13 |
2 |
0.767 |
-1.0 |
18.973 |
3.600 |
7272.0 |
1.137405 |
457 |
348.0 |
12 |
3 |
15.433 |
1986.0 |
1.507 |
0.287 |
47.0 |
1.160000 |
430 |
20273.0 |
8 |
4 |
1.347 |
-1.0 |
1.433 |
0.167 |
20.0 |
1.000000 |
85 |
83.0 |
13 |
# Same feature subset for the test split.
X_test_embedded = X_test[col_k]
X_test_embedded.head()
|
ctrip_profits |
lasthtlordergap |
cityuvs |
cityorders |
lastpvgap |
cr |
sid |
visitnum_oneyear |
h |
0 |
3.940 |
7224.0 |
7.147 |
0.580000 |
539.0 |
1.150000 |
220 |
4542.0 |
4 |
1 |
1.347 |
40911.0 |
0.447 |
0.060000 |
3.0 |
1.137687 |
81 |
3156.0 |
3 |
2 |
0.887 |
-1.0 |
4.313 |
0.460000 |
6532.0 |
1.000000 |
81 |
1026.0 |
17 |
3 |
1.347 |
-1.0 |
0.460 |
0.053000 |
363.0 |
1.000000 |
27 |
349.0 |
22 |
4 |
1.540 |
82256.0 |
0.060 |
2.246314 |
41.0 |
1.170000 |
63 |
811.0 |
10 |
# Pairwise correlation of the selected features.
plt.figure(figsize=(10,8))
sns.heatmap(X_train_embedded.corr(), annot=True, linewidths=1)
# Remove 'cityuvs' from both splits (presumably because the heatmap shows it
# to be redundant with another feature - TODO confirm). The frames are rebound
# rather than modified in place because they were created by column-slicing
# X_train / X_test, and an in-place drop on such a slice triggers pandas'
# SettingWithCopyWarning.
X_train_embedded = X_train_embedded.drop(columns='cityuvs')
X_test_embedded = X_test_embedded.drop(columns='cityuvs')
# Checkpoint the prepared data to CSV ...
X_train_embedded.to_csv('X_train_embedded.csv')
X_test_embedded.to_csv('X_test_embedded.csv')
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')
# ... and load it back, using the first CSV column as the index.
X_train_embedded = pd.read_csv('X_train_embedded.csv', index_col=0)
X_test_embedded = pd.read_csv('X_test_embedded.csv', index_col=0)
y_train = pd.read_csv('y_train.csv', index_col=0)
y_test = pd.read_csv('y_test.csv', index_col=0)
# The labels come back as one-column frames; flatten them to 1-D arrays.
y_train = np.ravel(y_train)
y_train.shape
y_test = np.ravel(y_test)
y_test.shape
(172487,)
import scipy
# Training features plus the label in one frame; this is the table the
# binning helpers below read. concat aligns on the index, so this assumes
# X_train_embedded carries a 0..n-1 index matching the new Series - verify if
# the preceding cells change.
woe_data = pd.concat([X_train_embedded, pd.Series(y_train, name='label')], axis=1)
woe_data
|
ctrip_profits |
lasthtlordergap |
cityorders |
lastpvgap |
cr |
sid |
visitnum_oneyear |
h |
label |
0 |
1.347 |
-1.0 |
0.387 |
2850.0 |
1.137405 |
3 |
1314.0 |
6 |
0 |
1 |
1.347 |
-1.0 |
0.007 |
2850.0 |
1.137405 |
7 |
1314.0 |
13 |
0 |
2 |
0.767 |
-1.0 |
3.600 |
7272.0 |
1.137405 |
457 |
348.0 |
12 |
1 |
3 |
15.433 |
1986.0 |
0.287 |
47.0 |
1.160000 |
430 |
20273.0 |
8 |
1 |
4 |
1.347 |
-1.0 |
0.167 |
20.0 |
1.000000 |
85 |
83.0 |
13 |
0 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
517453 |
1.347 |
-1.0 |
0.113 |
4347.0 |
1.137405 |
8 |
278.0 |
21 |
0 |
517454 |
1.347 |
41045.0 |
0.520 |
2972.0 |
1.330000 |
25 |
1095.0 |
0 |
0 |
517455 |
1.347 |
113046.0 |
0.093 |
522.0 |
1.137405 |
120 |
6309.0 |
16 |
0 |
517456 |
-0.067 |
266544.0 |
0.600 |
28378.0 |
1.000000 |
22 |
100.0 |
9 |
0 |
517457 |
1.347 |
-1.0 |
0.420 |
2850.0 |
1.137405 |
5 |
1314.0 |
17 |
0 |
517458 rows × 9 columns
def get_woe(num_bins):
    """Build the per-bin weight-of-evidence table.

    num_bins is a sequence of (min, max, count_0, count_1) tuples, one per
    bin. The returned DataFrame has those four columns followed by 'total',
    'percentage', 'bad_rate', 'good%', 'bad%', 'good-bad' and 'woe', where
    woe = ln(good% / bad%).
    """
    table = pd.DataFrame(num_bins, columns=['min', 'max', 'count_0', 'count_1'])
    goods = table['count_0']
    bads = table['count_1']

    table['total'] = goods + bads
    table['percentage'] = table['total'] / table['total'].sum()
    table['bad_rate'] = bads / table['total']
    # Each bin's share of all label-0 rows and of all label-1 rows.
    table['good%'] = goods / goods.sum()
    table['bad%'] = bads / bads.sum()
    table['good-bad'] = table['good%'] - table['bad%']
    table['woe'] = np.log(table['good%'] / table['bad%'])
    return table
def get_iv(df):
    """Return the information value of a WOE table.

    The result is the sum over bins of 'good-bad' * 'woe'.
    """
    contributions = df['good-bad'] * df['woe']
    return contributions.sum()
def get_bin(X, q):
    """Quantile-bin one feature of ``woe_data`` and return its WOE table.

    X is a column name of the module-level ``woe_data`` frame. q is the
    requested number of quantile bins; bins with duplicate edges are dropped,
    so fewer than q bins may come back. Returns the DataFrame built by
    ``get_woe``, one row per bin.
    """
    data = woe_data.copy()
    data['qcut'], updown = pd.qcut(data[X], retbins=True, q=q, duplicates='drop')
    # observed=False keeps one entry per bin, in bin order, even when a class
    # has no rows in some bin. Without it the result depends on the pandas
    # default, and a dropped empty bin would make zip() pair counts with the
    # wrong bin edges.
    count_0 = data.loc[data['label'] == 0].groupby('qcut', observed=False)['label'].count()
    count_1 = data.loc[data['label'] == 1].groupby('qcut', observed=False)['label'].count()
    # One (lower edge, upper edge, count_0, count_1) tuple per bin.
    num_bins = [*zip(updown, updown[1:], count_0, count_1)]
    return get_woe(num_bins)
def get_graph(X, n=2, q=20):
    """Plot information value against bin count while merging similar bins.

    Starts from quantile bins of ``woe_data[X]`` and repeatedly merges the
    pair of adjacent bins whose label distributions are most alike (largest
    chi-square p-value). After every merge the IV of the current binning is
    recorded. The IV curve is shown once the bin count has dropped to ``n``.

    Args:
        X: Name of the feature column in the module-level ``woe_data``.
        n: Bin count at which merging stops; values below 1 are treated as 1.
        q: Quantile specification forwarded to ``pd.qcut`` for the initial
            binning (duplicate edges are dropped).
    """
    # Imported explicitly: a bare ``import scipy`` does not guarantee that the
    # ``scipy.stats`` submodule has been loaded.
    from scipy.stats import chi2_contingency

    label = woe_data['label']
    binned, updown = pd.qcut(woe_data[X], q=q, retbins=True, duplicates='drop')
    zero_rows = label == 0
    one_rows = label == 1
    # observed=False keeps empty bins (count 0) so counts stay aligned with
    # the bin edges regardless of the pandas default.
    count_0 = label[zero_rows].groupby(binned[zero_rows], observed=False).count()
    count_1 = label[one_rows].groupby(binned[one_rows], observed=False).count()
    num_bins = list(zip(updown, updown[1:], count_0, count_1))

    IV = []
    axisx = []
    # A single bin has no neighbour to merge with, so never go below 1.
    stop_at = max(n, 1)
    while len(num_bins) > stop_at:
        # p-value of the chi-square test for each pair of adjacent bins.
        pvs = [
            chi2_contingency([left[2:], right[2:]])[1]
            for left, right in zip(num_bins, num_bins[1:])
        ]
        i = pvs.index(max(pvs))
        lower, upper = num_bins[i], num_bins[i + 1]
        # Replace the two bins with one spanning both and carrying both counts.
        num_bins[i:i + 2] = [(lower[0], upper[1],
                              lower[2] + upper[2],
                              lower[3] + upper[3])]
        woe_df = get_woe(num_bins)
        axisx.append(len(num_bins))
        IV.append(get_iv(woe_df))

    plt.figure()
    plt.plot(axisx, IV)
    plt.xticks(axisx)
    plt.xlabel("number of box")
    plt.ylabel("IV")
    plt.show()
# Features whose IV-versus-bin-count curve is plotted, one figure each.
col_woe = [
    'ctrip_profits', 'lasthtlordergap', 'cityorders',
    'lastpvgap', 'cr', 'sid', 'visitnum_oneyear', 'h',
]
for feature in col_woe:
    print(feature)
    get_graph(feature)
ctrip_profits
lasthtlordergap
cityorders
lastpvgap
cr
sid
visitnum_oneyear
h
# WOE table for lasthtlordergap from 10 requested quantile bins (duplicate edges are dropped).
get_bin('lasthtlordergap', 10)
|
min |
max |
count_0 |
count_1 |
total |
percentage |
bad_rate |
good% |
bad% |
good-bad |
woe |
0 |
-1.0 |
2356.0 |
158912 |
48086 |
206998 |
0.400029 |
0.232302 |
0.423198 |
0.338741 |
0.084457 |
0.222603 |
1 |
2356.0 |
13291.0 |
30983 |
20754 |
51737 |
0.099983 |
0.401144 |
0.082511 |
0.146201 |
-0.063691 |
-0.572057 |
2 |
13291.0 |
29219.0 |
34175 |
17574 |
51749 |
0.100006 |
0.339601 |
0.091011 |
0.123800 |
-0.032789 |
-0.307683 |
3 |
29219.0 |
56455.0 |
35511 |
16229 |
51740 |
0.099989 |
0.313664 |
0.094569 |
0.114325 |
-0.019756 |
-0.189714 |
4 |
56455.0 |
110984.0 |
37245 |
14499 |
51744 |
0.099997 |
0.280206 |
0.099187 |
0.102138 |
-0.002951 |
-0.029318 |
5 |
110984.0 |
232020.0 |
39177 |
12572 |
51749 |
0.100006 |
0.242942 |
0.104332 |
0.088563 |
0.015769 |
0.163861 |
6 |
232020.0 |
527026.0 |
39500 |
12241 |
51741 |
0.099991 |
0.236582 |
0.105192 |
0.086232 |
0.018961 |
0.198753 |
# WOE table for cr from 10 requested quantile bins (duplicate edges are dropped).
get_bin('cr',10)
|
min |
max |
count_0 |
count_1 |
total |
percentage |
bad_rate |
good% |
bad% |
good-bad |
woe |
0 |
1.000000 |
1.120000 |
166695 |
42471 |
209166 |
0.404218 |
0.203049 |
0.443925 |
0.299186 |
0.144738 |
0.394588 |
1 |
1.120000 |
1.137405 |
135099 |
47248 |
182347 |
0.352390 |
0.259110 |
0.359781 |
0.332838 |
0.026944 |
0.077841 |
2 |
1.137405 |
1.170000 |
15752 |
7000 |
22752 |
0.043969 |
0.307665 |
0.041949 |
0.049311 |
-0.007362 |
-0.161699 |
3 |
1.170000 |
1.330000 |
37674 |
24461 |
62135 |
0.120077 |
0.393675 |
0.100329 |
0.172315 |
-0.071986 |
-0.540866 |
4 |
1.330000 |
11.000000 |
20283 |
20775 |
41058 |
0.079346 |
0.505992 |
0.054016 |
0.146349 |
-0.092334 |
-0.996724 |
# WOE table for cityorders from 10 requested quantile bins (duplicate edges are dropped).
get_bin('cityorders',10)
|
min |
max |
count_0 |
count_1 |
total |
percentage |
bad_rate |
good% |
bad% |
good-bad |
woe |
0 |
0.007000 |
0.033000 |
44555 |
11516 |
56071 |
0.108359 |
0.205382 |
0.118654 |
0.081124 |
0.037530 |
0.380231 |
1 |
0.033000 |
0.093000 |
39744 |
10584 |
50328 |
0.097260 |
0.210300 |
0.105842 |
0.074559 |
0.031283 |
0.350359 |
2 |
0.093000 |
0.200000 |
40294 |
11095 |
51389 |
0.099310 |
0.215902 |
0.107307 |
0.078159 |
0.029148 |
0.316952 |
3 |
0.200000 |
0.380000 |
37656 |
11761 |
49417 |
0.095500 |
0.237995 |
0.100281 |
0.082850 |
0.017431 |
0.190947 |
4 |
0.380000 |
0.753000 |
37170 |
14675 |
51845 |
0.100192 |
0.283055 |
0.098987 |
0.103378 |
-0.004391 |
-0.043400 |
5 |
0.753000 |
1.400000 |
35492 |
16268 |
51760 |
0.100027 |
0.314297 |
0.094519 |
0.114600 |
-0.020081 |
-0.192649 |
6 |
1.400000 |
2.255565 |
49463 |
16530 |
65993 |
0.127533 |
0.250481 |
0.131725 |
0.116445 |
0.015279 |
0.123292 |
7 |
2.255565 |
3.260000 |
25531 |
11738 |
37269 |
0.072023 |
0.314953 |
0.067991 |
0.082688 |
-0.014697 |
-0.195694 |
8 |
3.260000 |
6.633000 |
32872 |
18901 |
51773 |
0.100053 |
0.365074 |
0.087541 |
0.133148 |
-0.045607 |
-0.419350 |
9 |
6.633000 |
14.507000 |
32726 |
18887 |
51613 |
0.099743 |
0.365935 |
0.087152 |
0.133049 |
-0.045897 |
-0.423060 |
# WOE table for h from 10 requested quantile bins (duplicate edges are dropped).
get_bin('h',10)
|
min |
max |
count_0 |
count_1 |
total |
percentage |
bad_rate |
good% |
bad% |
good-bad |
woe |
0 |
0.0 |
6.0 |
42287 |
15678 |
57965 |
0.112019 |
0.270474 |
0.112614 |
0.110443 |
0.002171 |
0.019465 |
1 |
6.0 |
10.0 |
46850 |
22957 |
69807 |
0.134904 |
0.328864 |
0.124766 |
0.161720 |
-0.036954 |
-0.259428 |
2 |
10.0 |
12.0 |
34400 |
16455 |
50855 |
0.098279 |
0.323567 |
0.091610 |
0.115917 |
-0.024307 |
-0.235329 |
3 |
12.0 |
13.0 |
19815 |
8752 |
28567 |
0.055206 |
0.306367 |
0.052769 |
0.061653 |
-0.008884 |
-0.155599 |
4 |
13.0 |
15.0 |
38660 |
18137 |
56797 |
0.109762 |
0.319330 |
0.102955 |
0.127766 |
-0.024811 |
-0.215905 |
5 |
15.0 |
17.0 |
42537 |
18293 |
60830 |
0.117555 |
0.300723 |
0.113280 |
0.128865 |
-0.015585 |
-0.128901 |
6 |
17.0 |
19.0 |
40305 |
16122 |
56427 |
0.109047 |
0.285714 |
0.107336 |
0.113571 |
-0.006235 |
-0.056466 |
7 |
19.0 |
21.0 |
49680 |
15527 |
65207 |
0.126014 |
0.238119 |
0.132303 |
0.109380 |
0.022923 |
0.190266 |
8 |
21.0 |
22.0 |
31405 |
6631 |
38036 |
0.073505 |
0.174335 |
0.083634 |
0.046712 |
0.036922 |
0.582455 |
9 |
22.0 |
23.0 |
29564 |
3403 |
32967 |
0.063710 |
0.103224 |
0.078732 |
0.023972 |
0.054759 |
1.189144 |
# WOE table for ctrip_profits from 10 requested quantile bins (duplicate edges are dropped).
get_bin('ctrip_profits',10)
|
min |
max |
count_0 |
count_1 |
total |
percentage |
bad_rate |
good% |
bad% |
good-bad |
woe |
0 |
-44.313 |
0.147 |
37921 |
13979 |
51900 |
0.100298 |
0.269345 |
0.100987 |
0.098475 |
0.002512 |
0.025192 |
1 |
0.147 |
0.500 |
37673 |
13923 |
51596 |
0.099711 |
0.269846 |
0.100327 |
0.098080 |
0.002246 |
0.022645 |
2 |
0.500 |
1.147 |
37713 |
14400 |
52113 |
0.100710 |
0.276323 |
0.100433 |
0.101441 |
-0.001007 |
-0.009980 |
3 |
1.147 |
1.347 |
150615 |
44404 |
195019 |
0.376879 |
0.227691 |
0.401102 |
0.312803 |
0.088299 |
0.248641 |
4 |
1.347 |
1.587 |
8296 |
3333 |
11629 |
0.022473 |
0.286611 |
0.022093 |
0.023479 |
-0.001386 |
-0.060856 |
5 |
1.587 |
3.220 |
36089 |
15701 |
51790 |
0.100085 |
0.303167 |
0.096108 |
0.110605 |
-0.014497 |
-0.140493 |
6 |
3.220 |
7.327 |
35310 |
16403 |
51713 |
0.099937 |
0.317193 |
0.094034 |
0.115551 |
-0.021517 |
-0.206054 |
7 |
7.327 |
600.820 |
31886 |
19812 |
51698 |
0.099908 |
0.383226 |
0.084915 |
0.139565 |
-0.054650 |
-0.496877 |
# Keep the same five-feature subset for both the training and the test frame.
X_train_woe, X_test_woe = (
    frame[['ctrip_profits', 'lasthtlordergap', 'cityorders', 'cr', 'h']]
    for frame in (X_train_embedded, X_test_embedded)
)
# Sweep max_depth over 5..20 and keep the mean 5-fold CV score for each depth.
depth_grid = np.arange(5, 21, 1)
scores = []
time0 = time()
for i in depth_grid:
    rfc = RFC(n_estimators=10, max_depth=i, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)
# A timedelta is used for the elapsed time: formatting the duration as a
# timestamp would drop whole hours and depend on the local timezone offset.
print('花费时间:{}'.format(datetime.timedelta(seconds=time() - time0)))
print('最大分数为{},最大深度为{}'.format(max(scores), depth_grid[np.argmax(scores)]))
plt.figure(figsize=(10,8))
plt.plot(depth_grid, scores)
plt.show()
花费时间:03:27:049065
最大分数为0.8891446300224614,最大深度为20
# Sweep min_samples_split over 2..9 at max_depth=20, scoring by mean 5-fold CV.
split_grid = np.arange(2, 10, 1)
scores = []
time0 = time()
for i in split_grid:
    rfc = RFC(n_estimators=10, max_depth=20, min_samples_split=i, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)
# A timedelta is used for the elapsed time: formatting the duration as a
# timestamp would drop whole hours and depend on the local timezone offset.
print('花费时间:{}'.format(datetime.timedelta(seconds=time() - time0)))
print('最大分数为{},最小分割数为{}'.format(max(scores), split_grid[np.argmax(scores)]))
plt.figure(figsize=(10,8))
plt.plot(split_grid, scores)
plt.show()
花费时间:02:16:546873
最大分数为0.8891446300224614,最小分割数为2
# Sweep min_samples_leaf over 1..9 at max_depth=20 and min_samples_split=2,
# scoring by mean 5-fold CV.
leaf_grid = np.arange(1, 10, 1)
scores = []
time0 = time()
for i in leaf_grid:
    rfc = RFC(n_estimators=10, max_depth=20, min_samples_split=2, min_samples_leaf=i, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)
# A timedelta is used for the elapsed time: formatting the duration as a
# timestamp would drop whole hours and depend on the local timezone offset.
print('花费时间:{}'.format(datetime.timedelta(seconds=time() - time0)))
print('最大分数为{},小叶子节点样本数{}'.format(max(scores), leaf_grid[np.argmax(scores)]))
plt.figure(figsize=(10,8))
plt.plot(leaf_grid, scores)
plt.show()
花费时间:02:13:264602
最大分数为0.8891446300224614,小叶子节点样本数1
# Sweep n_estimators over 10..200 in steps of 10 at max_depth=20, scoring by
# mean 3-fold CV. Cumulative elapsed time is printed after every setting.
tree_grid = np.arange(10, 201, 10)
scores = []
time0 = time()
for i in tree_grid:
    rfc = RFC(n_estimators=i, max_depth=20, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=3, n_jobs=-1).mean()
    scores.append(score)
    # This sweep runs for more than an hour. Formatting the duration as a
    # timestamp with '%M:%S:%f' wrapped around after 59 minutes and depended
    # on the local timezone offset; a timedelta keeps the hours.
    print('花费时间:{}'.format(datetime.timedelta(seconds=time() - time0)))
print('最大分数为{},树数量{}'.format(max(scores), tree_grid[np.argmax(scores)]))
plt.figure(figsize=(10,8))
plt.plot(tree_grid, scores)
plt.show()
花费时间:00:33:316156
花费时间:01:39:789986
花费时间:03:21:423289
花费时间:05:40:185622
花费时间:08:30:948403
花费时间:11:56:105026
花费时间:15:51:769696
花费时间:20:36:914026
花费时间:25:38:212320
花费时间:31:10:214501
花费时间:37:27:759795
花费时间:44:10:525391
花费时间:51:28:059915
花费时间:59:16:547367
花费时间:07:41:280818
花费时间:16:38:253700
花费时间:26:07:070954
花费时间:36:06:570602
花费时间:48:40:747174
花费时间:00:14:880475
最大分数为0.8988864801364768,树数量100
# Fit the final forest with the tuned settings and report accuracy on both splits.
rfc = RFC(n_estimators=100, max_depth=20, random_state=42).fit(X_train_embedded, y_train)
print('训练集得分为{}'.format(rfc.score(X_train_embedded, y_train)))
# The test score must pair the test labels with the test features; scoring the
# training features against y_test mismatches the rows.
print('测试集得分为{}'.format(rfc.score(X_test_embedded, y_test)))
训练集得分为0.9144162424776504
测试集得分为0.8858812548192038
# Show the fitted forest's feature importances (one value per input feature).
rfc.feature_importances_
array([0.12193391, 0.12869867, 0.14163503, 0.13799971, 0.10331983,
0.12834216, 0.14547079, 0.09259991])
# Class probabilities for the test rows. These are compared against y_test in
# the ROC cell, so they must come from the test features, not the training ones.
y_scores = rfc.predict_proba(X_test_embedded)
y_scores
array([[0.42810513, 0.57189487],
[0.90260411, 0.09739589],
[0.87430575, 0.12569425],
...,
[0.3633656 , 0.6366344 ],
[0.79112542, 0.20887458],
[0.14388709, 0.85611291]])
from sklearn.metrics import roc_curve, auc
# Column 1 of y_scores is used as the score for the ROC curve.
# NOTE(review): y_scores must hold predictions for the same rows as y_test.
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:, 1])
# Area under the ROC curve.
roc_auc = auc(fpr, tpr)
roc_auc
0.9680887508287116
def draw_roc(roc_auc, fpr, tpr):
    """Draw a ROC curve together with the diagonal reference line.

    Args:
        roc_auc: Area under the curve, shown in the legend label.
        fpr: False-positive rates (x values).
        tpr: True-positive rates (y values).
    """
    _, axes = plt.subplots(figsize=(7, 5.5))
    curve_label = 'roc curve(area={})'.format(roc_auc)
    axes.plot(fpr, tpr, color='orange', label=curve_label)
    # Dashed diagonal from (0, 0) to (1, 1) for reference.
    axes.plot([0, 1], [0, 1], color='blue', linestyle='--')
    axes.set_xlabel('fpr')
    axes.set_ylabel('tpr')
    axes.set_xlim([0, 1])
    axes.set_ylim([0, 1.05])
    axes.set_title('ROC Curve')
    axes.legend(loc=4)
    plt.show()
# Plot the ROC curve computed above.
draw_roc(roc_auc, fpr, tpr)
# Start the RFM analysis from the raw frame: the sample id plus the three
# columns that are renamed to F, M and R in a later cell.
rfm = df[['sampleid','ordernum_oneyear','avgprice','lasthtlordergap']]
rfm.head()
|
sampleid |
ordernum_oneyear |
avgprice |
lasthtlordergap |
0 |
24636 |
NaN |
NaN |
NaN |
1 |
24637 |
NaN |
NaN |
NaN |
2 |
24641 |
NaN |
NaN |
NaN |
3 |
24642 |
NaN |
NaN |
NaN |
4 |
24644 |
NaN |
NaN |
NaN |
# Drop rows with any missing value, renumber the rows from 0, then rename the
# three measures to their RFM letters.
rfm = rfm.dropna()
rfm = rfm.reset_index(drop=True)
rfm = rfm.rename(columns={'ordernum_oneyear':'F', 'avgprice':'M', 'lasthtlordergap':'R'})
rfm.head()
|
sampleid |
F |
M |
R |
0 |
24650 |
21.0 |
363.0 |
10475.0 |
1 |
24653 |
7.0 |
307.0 |
18873.0 |
2 |
24655 |
1.0 |
343.0 |
32071.0 |
3 |
24658 |
33.0 |
1000.0 |
4616.0 |
4 |
24662 |
4.0 |
685.0 |
44830.0 |
# Rescale R by 1440 and round to whole units
# (presumably minutes -> days, 1440 minutes per day — TODO confirm the source unit).
rfm['R'] = rfm['R'].div(1440).round(0)
rfm.head()
|
sampleid |
F |
M |
R |
0 |
24650 |
21.0 |
363.0 |
7.0 |
1 |
24653 |
7.0 |
307.0 |
13.0 |
2 |
24655 |
1.0 |
343.0 |
22.0 |
3 |
24658 |
33.0 |
1000.0 |
3.0 |
4 |
24662 |
4.0 |
685.0 |
31.0 |
# Summary statistics per column, transposed so each column becomes a row.
rfm.describe().T
|
count |
mean |
std |
min |
25% |
50% |
75% |
max |
sampleid |
426425.0 |
629380.138599 |
414760.183032 |
24650.0 |
313549.0 |
600907.0 |
887813.0 |
2238403.0 |
F |
426425.0 |
12.137916 |
17.405419 |
1.0 |
3.0 |
6.0 |
14.0 |
711.0 |
M |
426425.0 |
421.604962 |
286.987700 |
1.0 |
233.0 |
351.0 |
523.0 |
6383.0 |
R |
426425.0 |
70.742163 |
84.844780 |
0.0 |
11.0 |
33.0 |
97.0 |
366.0 |
# Bin edges for the 1-5 scores. The top bin is open-ended (np.inf) rather than
# capped just above the currently observed maximum: a value beyond a fixed cap
# would fall outside every bin, become NaN, and make astype('int') raise.
f_bins = [-1, 3, 5, 7, 10, np.inf]
m_bins = [-1, 200, 400, 600, 800, np.inf]
r_bins = [-1, 3, 7, 30, 180, np.inf]
# R is scored in reverse: the smallest R values get the highest score.
rfm['R_score'] = pd.cut(rfm['R'], bins=r_bins, labels=[5,4,3,2,1]).astype('int')
rfm['F_score'] = pd.cut(rfm['F'], bins=f_bins, labels=[1,2,3,4,5]).astype('int')
rfm['M_score'] = pd.cut(rfm['M'], bins=m_bins, labels=[1,2,3,4,5]).astype('int')
rfm
|
sampleid |
F |
M |
R |
R_score |
F_score |
M_score |
0 |
24650 |
21.0 |
363.0 |
7.0 |
4 |
5 |
2 |
1 |
24653 |
7.0 |
307.0 |
13.0 |
3 |
3 |
2 |
2 |
24655 |
1.0 |
343.0 |
22.0 |
3 |
1 |
2 |
3 |
24658 |
33.0 |
1000.0 |
3.0 |
5 |
5 |
5 |
4 |
24662 |
4.0 |
685.0 |
31.0 |
2 |
2 |
4 |
... |
... |
... |
... |
... |
... |
... |
... |
426420 |
2238388 |
2.0 |
226.0 |
119.0 |
2 |
1 |
2 |
426421 |
2238389 |
4.0 |
461.0 |
0.0 |
5 |
2 |
3 |
426422 |
2238396 |
5.0 |
193.0 |
44.0 |
2 |
2 |
1 |
426423 |
2238397 |
1.0 |
258.0 |
87.0 |
2 |
1 |
2 |
426424 |
2238403 |
3.0 |
256.0 |
52.0 |
2 |
1 |
2 |
426425 rows × 7 columns
# Flag each score as 1 when it is above that score's mean, otherwise 0.
for _axis in ('R', 'F', 'M'):
    _axis_score = rfm[_axis + '_score']
    rfm[_axis + '_level'] = (_axis_score > _axis_score.mean()) * 1
rfm
|
sampleid |
F |
M |
R |
R_score |
F_score |
M_score |
R_level |
F_level |
M_level |
0 |
24650 |
21.0 |
363.0 |
7.0 |
4 |
5 |
2 |
1 |
1 |
0 |
1 |
24653 |
7.0 |
307.0 |
13.0 |
3 |
3 |
2 |
1 |
1 |
0 |
2 |
24655 |
1.0 |
343.0 |
22.0 |
3 |
1 |
2 |
1 |
0 |
0 |
3 |
24658 |
33.0 |
1000.0 |
3.0 |
5 |
5 |
5 |
1 |
1 |
1 |
4 |
24662 |
4.0 |
685.0 |
31.0 |
2 |
2 |
4 |
0 |
0 |
1 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
426420 |
2238388 |
2.0 |
226.0 |
119.0 |
2 |
1 |
2 |
0 |
0 |
0 |
426421 |
2238389 |
4.0 |
461.0 |
0.0 |
5 |
2 |
3 |
1 |
0 |
1 |
426422 |
2238396 |
5.0 |
193.0 |
44.0 |
2 |
2 |
1 |
0 |
0 |
0 |
426423 |
2238397 |
1.0 |
258.0 |
87.0 |
2 |
1 |
2 |
0 |
0 |
0 |
426424 |
2238403 |
3.0 |
256.0 |
52.0 |
2 |
1 |
2 |
0 |
0 |
0 |
426425 rows × 10 columns
# Concatenate the three 0/1 levels into a code such as '101' (order: R, F, M).
rfm['RFM'] = rfm['R_level'].astype('str') + rfm['F_level'].astype('str') + rfm['M_level'].astype('str')
# Translate each code into its segment name. The result is assigned back
# explicitly: an inplace replace on the selected column is a chained
# assignment and does not update the frame under pandas copy-on-write.
# All eight possible codes are listed, so map() leaves no value unmapped.
rfm['RFM'] = rfm['RFM'].map({
    '111': '重要价值用户',
    '101': '重要发展用户',
    '011': '重要保持用户',
    '001': '重要挽留用户',
    '110': '一般价值用户',
    '100': '一般发展用户',
    '010': '一般保持用户',
    '000': '一般挽留用户',
})
rfm
|
sampleid |
F |
M |
R |
R_score |
F_score |
M_score |
R_level |
F_level |
M_level |
RFM |
0 |
24650 |
21.0 |
363.0 |
7.0 |
4 |
5 |
2 |
1 |
1 |
0 |
一般价值用户 |
1 |
24653 |
7.0 |
307.0 |
13.0 |
3 |
3 |
2 |
1 |
1 |
0 |
一般价值用户 |
2 |
24655 |
1.0 |
343.0 |
22.0 |
3 |
1 |
2 |
1 |
0 |
0 |
一般发展用户 |
3 |
24658 |
33.0 |
1000.0 |
3.0 |
5 |
5 |
5 |
1 |
1 |
1 |
重要价值用户 |
4 |
24662 |
4.0 |
685.0 |
31.0 |
2 |
2 |
4 |
0 |
0 |
1 |
重要挽留用户 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
426420 |
2238388 |
2.0 |
226.0 |
119.0 |
2 |
1 |
2 |
0 |
0 |
0 |
一般挽留用户 |
426421 |
2238389 |
4.0 |
461.0 |
0.0 |
5 |
2 |
3 |
1 |
0 |
1 |
重要发展用户 |
426422 |
2238396 |
5.0 |
193.0 |
44.0 |
2 |
2 |
1 |
0 |
0 |
0 |
一般挽留用户 |
426423 |
2238397 |
1.0 |
258.0 |
87.0 |
2 |
1 |
2 |
0 |
0 |
0 |
一般挽留用户 |
426424 |
2238403 |
3.0 |
256.0 |
52.0 |
2 |
1 |
2 |
0 |
0 |
0 |
一般挽留用户 |
426425 rows × 11 columns
# Number of samples per RFM segment, with the segment name kept as a column.
rfm_new = pd.DataFrame(
    rfm.groupby('RFM', as_index=False)['sampleid'].count()
)
rfm_new
|
RFM |
sampleid |
0 |
一般价值用户 |
78592 |
1 |
一般保持用户 |
46850 |
2 |
一般发展用户 |
42275 |
3 |
一般挽留用户 |
83394 |
4 |
重要价值用户 |
63595 |
5 |
重要保持用户 |
38850 |
6 |
重要发展用户 |
20235 |
7 |
重要挽留用户 |
52634 |
# Pie chart of each segment's share of all samples, labelled by segment name.
plt.figure(figsize=(12,6))
plt.pie(
    rfm_new['sampleid'].div(rfm_new['sampleid'].sum()).tolist(),
    labels=rfm_new['RFM'].tolist(),
    autopct='%0.2f%%',
)
[Text(0.9207056795449674, 0.6019144886556893, '一般价值用户'),
Text(0.07432600254562635, 1.0974860570164833, '一般保持用户'),
Text(-0.6110720650087508, 0.9146534487804336, '一般发展用户'),
Text(-1.0982775444537256, 0.061534017816936265, '一般挽留用户'),
Text(-0.5691837587192285, -0.9412915854347425, '重要价值用户'),
Text(0.23025758705583027, -1.0756307189752563, '重要保持用户'),
Text(0.6623554620643879, -0.8782284679247601, '重要发展用户'),
Text(1.0183302486279413, -0.4159368999371846, '重要挽留用户')],
[Text(0.5022030979336185, 0.3283169938121941, '18.43%'),
Text(0.04054145593397801, 0.5986287583726272, '10.99%'),
Text(-0.33331203545931853, 0.49890188115296374, '9.91%'),
Text(-0.5990604787929412, 0.03356400971832887, '19.56%'),
Text(-0.31046386839230644, -0.5134317738734958, '14.91%'),
Text(0.1255950474849983, -0.5867076648955943, '9.11%'),
Text(0.3612847974896661, -0.47903370977714177, '4.75%'),
Text(0.5554528628879679, -0.22687467269300973, '12.34%')])