Ctrip Customer Churn Analysis Project (Personal Practice + Source Code)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import f_classif, SelectFromModel, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier as RFC
%matplotlib inline

plt.rcParams['font.family'] = ['SimHei']    # use a Chinese-capable font so Chinese labels render in plots
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly with that font
# Read the data file (tab-separated)
df = pd.read_table('userlostprob.txt')
# View the first five rows
df.head()
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
0 0 24636 2016-05-18 2016-05-18 0 NaN NaN NaN 1.04 NaN ... 615.0 NaN 0.29 12.880 3.147 NaN NaN 7 NaN 12
1 1 24637 2016-05-18 2016-05-18 0 NaN NaN NaN 1.06 NaN ... 513.0 NaN 0.53 17.933 4.913 NaN NaN 33 NaN 14
2 0 24641 2016-05-18 2016-05-19 0 NaN NaN NaN 1.05 NaN ... 382.0 NaN 0.60 3.993 0.760 NaN NaN 10 NaN 19
3 0 24642 2016-05-18 2016-05-18 0 NaN NaN NaN 1.01 NaN ... 203.0 NaN 0.18 3.220 0.660 NaN NaN 8 NaN 16
4 1 24644 2016-05-18 2016-05-19 0 NaN NaN NaN 1.00 NaN ... 84.0 NaN NaN 0.013 NaN NaN NaN 1 NaN 21

5 rows × 51 columns

# Check the label distribution
df['label'].value_counts()
0    500588
1    189357
Name: label, dtype: int64
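With 189,357 churners out of 689,945 samples, roughly 27% of users churn, so the classes are imbalanced and plain accuracy can be misleading. A quick check of the positive rate:

# Share of churners (label == 1); about 0.2745 here
df['label'].mean()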
# View the last five rows
df.tail()
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
689940 1 2238419 2016-05-15 2016-05-17 1 19.0 NaN NaN 1.06 NaN ... 406.0 NaN 0.48 13.573 1.660 1034.0 1.0 5 119.0 18
689941 1 2238421 2016-05-15 2016-05-15 1 10.0 3.0 3.0 1.06 0.33 ... 199.0 713.0 0.51 2.880 0.513 179.0 2.0 15 1472.0 12
689942 0 2238422 2016-05-15 2016-05-17 0 NaN NaN NaN 1.07 NaN ... 544.0 NaN 0.45 15.293 2.067 0.0 NaN 8 107.0 0
689943 0 2238425 2016-05-15 2016-05-17 0 NaN NaN NaN 1.04 NaN ... 156.0 NaN 0.29 2.467 0.333 NaN NaN 4 NaN 0
689944 0 2238426 2016-05-15 2016-05-15 0 NaN NaN NaN 1.02 NaN ... 275.0 NaN NaN 12.600 2.653 NaN NaN 2 NaN 11

5 rows × 51 columns

# View five random rows
df.sample(5)
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
477013 1 820235 2016-05-21 2016-05-21 0 15.0 NaN 15.0 1.05 0.36 ... 582.0 18831.0 0.48 17.220 3.400 4242.0 1.33 446 906.0 9
426926 0 736598 2016-05-15 2016-05-15 0 1.0 NaN 39.0 1.05 0.16 ... 978.0 12199.0 0.13 5.113 0.847 642.0 1.36 732 2583.0 8
628554 0 1072402 2016-05-20 2016-05-20 0 NaN NaN 3.0 1.02 0.00 ... 147.0 55214.0 0.27 15.873 3.220 10002.0 1.11 186 905.0 19
248275 0 438633 2016-05-18 2016-06-09 0 19.0 2.0 28.0 1.02 0.78 ... NaN 3329.0 NaN 1.320 0.087 145.0 1.12 449 17397.0 11
198972 0 356550 2016-05-19 2016-05-19 0 7.0 NaN 2.0 1.04 0.50 ... 206.0 61467.0 0.32 20.480 5.153 13264.0 1.08 59 1522.0 20

5 rows × 51 columns

# Data shape
df.shape
(689945, 51)
# Check data types
df.dtypes
label                                 int64
sampleid                              int64
d                                    object
arrival                              object
iforderpv_24h                         int64
decisionhabit_user                  float64
historyvisit_7ordernum              float64
historyvisit_totalordernum          float64
hotelcr                             float64
ordercanceledprecent                float64
landhalfhours                       float64
ordercanncelednum                   float64
commentnums                         float64
starprefer                          float64
novoters                            float64
consuming_capacity                  float64
historyvisit_avghotelnum            float64
cancelrate                          float64
historyvisit_visit_detailpagenum    float64
delta_price1                        float64
price_sensitive                     float64
hoteluv                             float64
businessrate_pre                    float64
ordernum_oneyear                    float64
cr_pre                              float64
avgprice                            float64
lowestprice                         float64
firstorder_bu                       float64
customereval_pre2                   float64
delta_price2                        float64
commentnums_pre                     float64
customer_value_profit               float64
commentnums_pre2                    float64
cancelrate_pre                      float64
novoters_pre2                       float64
novoters_pre                        float64
ctrip_profits                       float64
deltaprice_pre2_t1                  float64
lowestprice_pre                     float64
uv_pre                              float64
uv_pre2                             float64
lowestprice_pre2                    float64
lasthtlordergap                     float64
businessrate_pre2                   float64
cityuvs                             float64
cityorders                          float64
lastpvgap                           float64
cr                                  float64
sid                                   int64
visitnum_oneyear                    float64
h                                     int64
dtype: object
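Only 'd' (visit date) and 'arrival' (check-in date) are stored as object; they are date strings. They get dropped before modeling below, but one could first derive a booking lead-time feature from them. A sketch, not part of the original pipeline:

# Hypothetical feature: days between the visit date and the arrival date
lead_days = (pd.to_datetime(df['arrival']) - pd.to_datetime(df['d'])).dt.days
lead_days.describe()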
# View basic info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689945 entries, 0 to 689944
Data columns (total 51 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   label                             689945 non-null  int64  
 1   sampleid                          689945 non-null  int64  
 2   d                                 689945 non-null  object 
 3   arrival                           689945 non-null  object 
 4   iforderpv_24h                     689945 non-null  int64  
 5   decisionhabit_user                385450 non-null  float64
 6   historyvisit_7ordernum            82915 non-null   float64
 7   historyvisit_totalordernum        386525 non-null  float64
 8   hotelcr                           689148 non-null  float64
 9   ordercanceledprecent              447831 non-null  float64
 10  landhalfhours                     661312 non-null  float64
 11  ordercanncelednum                 447831 non-null  float64
 12  commentnums                       622029 non-null  float64
 13  starprefer                        464892 non-null  float64
 14  novoters                          672918 non-null  float64
 15  consuming_capacity                463837 non-null  float64
 16  historyvisit_avghotelnum          387876 non-null  float64
 17  cancelrate                        678227 non-null  float64
 18  historyvisit_visit_detailpagenum  307234 non-null  float64
 19  delta_price1                      437146 non-null  float64
 20  price_sensitive                   463837 non-null  float64
 21  hoteluv                           689148 non-null  float64
 22  businessrate_pre                  483896 non-null  float64
 23  ordernum_oneyear                  447831 non-null  float64
 24  cr_pre                            660548 non-null  float64
 25  avgprice                          457261 non-null  float64
 26  lowestprice                       687931 non-null  float64
 27  firstorder_bu                     376993 non-null  float64
 28  customereval_pre2                 661312 non-null  float64
 29  delta_price2                      437750 non-null  float64
 30  commentnums_pre                   598368 non-null  float64
 31  customer_value_profit             439123 non-null  float64
 32  commentnums_pre2                  648457 non-null  float64
 33  cancelrate_pre                    653015 non-null  float64
 34  novoters_pre2                     657616 non-null  float64
 35  novoters_pre                      648956 non-null  float64
 36  ctrip_profits                     445187 non-null  float64
 37  deltaprice_pre2_t1                543180 non-null  float64
 38  lowestprice_pre                   659689 non-null  float64
 39  uv_pre                            660548 non-null  float64
 40  uv_pre2                           661189 non-null  float64
 41  lowestprice_pre2                  660664 non-null  float64
 42  lasthtlordergap                   447831 non-null  float64
 43  businessrate_pre2                 602960 non-null  float64
 44  cityuvs                           682274 non-null  float64
 45  cityorders                        651263 non-null  float64
 46  lastpvgap                         592818 non-null  float64
 47  cr                                457896 non-null  float64
 48  sid                               689945 non-null  int64  
 49  visitnum_oneyear                  592910 non-null  float64
 50  h                                 689945 non-null  int64  
dtypes: float64(44), int64(5), object(2)
memory usage: 268.5+ MB
# Descriptive statistics with extra percentiles
df.describe([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])
label sampleid iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent landhalfhours ordercanncelednum ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
count 689945.000000 6.899450e+05 689945.000000 385450.000000 82915.000000 386525.000000 689148.000000 447831.000000 661312.000000 447831.000000 ... 660664.000000 447831.000000 602960.000000 682274.000000 651263.000000 592818.000000 457896.000000 689945.000000 5.929100e+05 689945.000000
mean 0.274452 6.285402e+05 0.193737 5.317048 1.856094 11.710487 1.060996 0.342119 6.086366 154.179369 ... 318.541812 101830.919400 0.368237 10.648278 2.253250 12049.409382 1.137476 153.702414 1.855185e+04 14.462315
std 0.446238 4.146815e+05 0.395226 38.524483 2.103862 17.251429 0.045264 0.354210 12.413225 398.456986 ... 351.913035 122784.313864 0.219945 15.696682 3.538453 25601.374138 0.204789 277.807697 2.288603e+05 6.301575
min 0.000000 2.463600e+04 0.000000 0.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 ... 1.000000 0.000000 0.000000 0.007000 0.007000 0.000000 1.000000 0.000000 1.000000e+00 0.000000
1% 0.000000 3.620588e+04 0.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 ... 52.000000 244.000000 0.010000 0.013000 0.007000 0.000000 1.000000 1.000000 2.100000e+01 0.000000
10% 0.000000 1.398464e+05 0.000000 1.000000 1.000000 1.000000 1.010000 0.000000 0.000000 0.000000 ... 101.000000 3518.000000 0.050000 0.160000 0.033000 127.000000 1.000000 4.000000 1.610000e+02 6.000000
25% 0.000000 3.123200e+05 0.000000 2.000000 1.000000 2.000000 1.030000 0.000000 0.000000 0.000000 ... 145.000000 14999.000000 0.170000 0.827000 0.127000 551.000000 1.000000 17.000000 4.710000e+02 11.000000
50% 0.000000 5.996370e+05 0.000000 3.000000 1.000000 6.000000 1.050000 0.250000 0.000000 2.000000 ... 233.000000 46890.000000 0.400000 3.527000 0.627000 2848.000000 1.050000 62.000000 1.315000e+03 15.000000
75% 1.000000 8.874600e+05 0.000000 5.000000 2.000000 14.000000 1.090000 0.570000 4.000000 153.000000 ... 388.000000 138953.000000 0.550000 13.327000 2.747000 10726.000000 1.210000 180.000000 3.141000e+03 20.000000
90% 1.000000 1.059705e+06 1.000000 10.000000 3.000000 29.000000 1.120000 0.980000 27.000000 492.000000 ... 611.000000 311492.000000 0.650000 35.567000 7.547000 30384.900000 1.400000 392.000000 6.634000e+03 22.000000
99% 1.000000 2.226893e+06 1.000000 27.000000 7.000000 82.000000 1.190000 1.000000 48.000000 1752.000000 ... 1464.000000 484734.000000 0.780000 66.007000 14.453000 138722.000000 2.000000 1212.000000 2.625670e+05 23.000000
max 1.000000 2.238426e+06 1.000000 3167.000000 106.000000 711.000000 3.180000 1.000000 49.000000 13475.000000 ... 43700.000000 527026.000000 0.990000 67.140000 14.507000 194386.000000 11.000000 9956.000000 9.651192e+06 23.000000

12 rows × 49 columns

# Drop duplicate rows (the shape below is unchanged, so there were none)
df.drop_duplicates(inplace=True)
df.shape
(689945, 51)
# Sort features by their missing-value ratio
null = df.isnull().mean().reset_index().sort_values(0)
null_1 = null.rename(columns={'index':'feature', 0:'missing_rate'})
null_1
feature missing_rate
0 label 0.000000
48 sid 0.000000
4 iforderpv_24h 0.000000
50 h 0.000000
2 d 0.000000
1 sampleid 0.000000
3 arrival 0.000000
8 hotelcr 0.001155
21 hoteluv 0.001155
26 lowestprice 0.002919
44 cityuvs 0.011118
17 cancelrate 0.016984
14 novoters 0.024679
28 customereval_pre2 0.041500
10 landhalfhours 0.041500
40 uv_pre2 0.041679
41 lowestprice_pre2 0.042440
39 uv_pre 0.042608
24 cr_pre 0.042608
38 lowestprice_pre 0.043853
34 novoters_pre2 0.046857
33 cancelrate_pre 0.053526
45 cityorders 0.056065
35 novoters_pre 0.059409
32 commentnums_pre2 0.060132
12 commentnums 0.098437
43 businessrate_pre2 0.126075
30 commentnums_pre 0.132731
49 visitnum_oneyear 0.140642
46 lastpvgap 0.140775
37 deltaprice_pre2_t1 0.212720
22 businessrate_pre 0.298646
13 starprefer 0.326190
20 price_sensitive 0.327719
15 consuming_capacity 0.327719
47 cr 0.336330
25 avgprice 0.337250
23 ordernum_oneyear 0.350918
42 lasthtlordergap 0.350918
11 ordercanncelednum 0.350918
9 ordercanceledprecent 0.350918
36 ctrip_profits 0.354750
31 customer_value_profit 0.363539
29 delta_price2 0.365529
19 delta_price1 0.366405
16 historyvisit_avghotelnum 0.437816
7 historyvisit_totalordernum 0.439774
5 decisionhabit_user 0.441332
27 firstorder_bu 0.453590
18 historyvisit_visit_detailpagenum 0.554698
6 historyvisit_7ordernum 0.879824
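Rather than scanning the table by eye, drop candidates can be picked programmatically; a sketch, with the 80% cutoff as an assumption (only historyvisit_7ordernum exceeds it here):

# Features whose missing ratio exceeds 80%
null_1[null_1['missing_rate'] > 0.8]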
# Density plot of the missing ratios
plt.figure(figsize=(8,6))
sns.kdeplot(null_1['missing_rate'], fill=True)  # fill=True replaces the deprecated shade=True

[Figure 1: density plot of the per-feature missing ratios]

# Bar chart of the per-feature missing ratios
plt.figure(figsize=(8,6))
plt.bar(range(null_1.shape[0]), null_1['missing_rate'], label='missing rate')
plt.legend(loc='best')

[Figure 2: bar chart of the per-feature missing ratios]

# Drop the column with too many missing values (historyvisit_7ordernum is ~88% missing)
df = df.drop(['historyvisit_7ordernum'], axis=1)
df
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_totalordernum hotelcr ordercanceledprecent landhalfhours ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
0 0 24636 2016-05-18 2016-05-18 0 NaN NaN 1.04 NaN 22.0 ... 615.0 NaN 0.29 12.880 3.147 NaN NaN 7 NaN 12
1 1 24637 2016-05-18 2016-05-18 0 NaN NaN 1.06 NaN 0.0 ... 513.0 NaN 0.53 17.933 4.913 NaN NaN 33 NaN 14
2 0 24641 2016-05-18 2016-05-19 0 NaN NaN 1.05 NaN 3.0 ... 382.0 NaN 0.60 3.993 0.760 NaN NaN 10 NaN 19
3 0 24642 2016-05-18 2016-05-18 0 NaN NaN 1.01 NaN 2.0 ... 203.0 NaN 0.18 3.220 0.660 NaN NaN 8 NaN 16
4 1 24644 2016-05-18 2016-05-19 0 NaN NaN 1.00 NaN 0.0 ... 84.0 NaN NaN 0.013 NaN NaN NaN 1 NaN 21
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
689940 1 2238419 2016-05-15 2016-05-17 1 19.0 NaN 1.06 NaN 1.0 ... 406.0 NaN 0.48 13.573 1.660 1034.0 1.0 5 119.0 18
689941 1 2238421 2016-05-15 2016-05-15 1 10.0 3.0 1.06 0.33 49.0 ... 199.0 713.0 0.51 2.880 0.513 179.0 2.0 15 1472.0 12
689942 0 2238422 2016-05-15 2016-05-17 0 NaN NaN 1.07 NaN 0.0 ... 544.0 NaN 0.45 15.293 2.067 0.0 NaN 8 107.0 0
689943 0 2238425 2016-05-15 2016-05-17 0 NaN NaN 1.04 NaN 0.0 ... 156.0 NaN 0.29 2.467 0.333 NaN NaN 4 NaN 0
689944 0 2238426 2016-05-15 2016-05-15 0 NaN NaN 1.02 NaN 0.0 ... 275.0 NaN NaN 12.600 2.653 NaN NaN 2 NaN 11

689945 rows × 50 columns

# Scan for outliers using wide percentiles
df.describe([0.01, 0.25, 0.5, 0.75, 0.99]).T
count mean std min 1% 25% 50% 75% 99% max
label 689945.0 0.274452 0.446238 0.000 0.00000 0.000 0.000 1.000 1.000000e+00 1.000
sampleid 689945.0 628540.209625 414681.498697 24636.000 36205.88000 312320.000 599637.000 887460.000 2.226893e+06 2238426.000
iforderpv_24h 689945.0 0.193737 0.395226 0.000 0.00000 0.000 0.000 0.000 1.000000e+00 1.000
decisionhabit_user 385450.0 5.317048 38.524483 0.000 1.00000 2.000 3.000 5.000 2.700000e+01 3167.000
historyvisit_totalordernum 386525.0 11.710487 17.251429 1.000 1.00000 2.000 6.000 14.000 8.200000e+01 711.000
hotelcr 689148.0 1.060996 0.045264 1.000 1.00000 1.030 1.050 1.090 1.190000e+00 3.180
ordercanceledprecent 447831.0 0.342119 0.354210 0.000 0.00000 0.000 0.250 0.570 1.000000e+00 1.000
landhalfhours 661312.0 6.086366 12.413225 0.000 0.00000 0.000 0.000 4.000 4.800000e+01 49.000
ordercanncelednum 447831.0 154.179369 398.456986 0.000 0.00000 0.000 2.000 153.000 1.752000e+03 13475.000
commentnums 622029.0 1272.090888 2101.871601 0.000 1.00000 115.000 514.000 1670.000 8.796000e+03 34189.000
starprefer 464892.0 67.532304 19.175094 0.000 20.00000 53.300 69.400 80.300 1.000000e+02 100.000
novoters 672918.0 1706.247901 2811.690007 1.000 1.00000 157.000 692.000 2196.000 1.157600e+04 45455.000
consuming_capacity 463837.0 39.154140 23.240147 0.000 8.00000 22.000 33.000 51.000 1.000000e+02 100.000
historyvisit_avghotelnum 387876.0 6.510179 41.045261 0.000 1.00000 2.000 4.000 7.000 2.900000e+01 3167.000
cancelrate 678227.0 1051.604143 1509.066134 1.000 2.00000 137.000 503.000 1373.000 6.399000e+03 18930.000
historyvisit_visit_detailpagenum 307234.0 37.153603 73.402891 1.000 1.00000 6.000 18.000 44.000 2.620000e+02 6199.000
delta_price1 437146.0 79.067012 512.942824 -99879.000 -1227.55000 -31.000 81.000 226.000 1.081000e+03 5398.000
price_sensitive 463837.0 24.645863 26.685606 0.000 0.00000 5.000 16.000 33.000 1.000000e+02 100.000
hoteluv 689148.0 95.092708 169.981527 0.007 0.16700 10.427 36.180 107.747 9.641130e+02 1722.613
businessrate_pre 483896.0 0.372717 0.232791 0.000 0.01000 0.150 0.390 0.570 8.000000e-01 0.990
ordernum_oneyear 447831.0 11.642061 17.137209 1.000 1.00000 2.000 6.000 14.000 8.100000e+01 711.000
cr_pre 660548.0 1.062906 0.044588 1.000 1.00000 1.030 1.060 1.090 1.190000e+00 2.950
avgprice 457261.0 422.458701 290.853332 1.000 91.00000 232.000 350.000 524.000 1.491000e+03 6383.000
lowestprice 687931.0 318.806242 575.782415 -3.000 37.00000 116.000 200.000 380.000 1.823000e+03 100000.000
firstorder_bu 376993.0 11.697795 2.746821 1.000 3.00000 12.000 13.000 13.000 1.700000e+01 21.000
customereval_pre2 661312.0 3.048519 1.226635 0.000 0.00000 2.000 3.000 4.000 5.500000e+00 6.000
delta_price2 437750.0 77.277208 391.413839 -43344.000 -949.00000 -29.000 69.000 198.000 1.018000e+03 5114.000
commentnums_pre 598368.0 1415.159561 2329.418922 0.000 1.00000 137.000 592.000 1862.000 9.732000e+03 34189.000
customer_value_profit 439123.0 3.038409 6.625281 -24.075 -0.29678 0.269 0.991 3.138 2.845100e+01 598.064
commentnums_pre2 648457.0 1313.388737 1719.513354 0.000 3.00000 270.000 768.000 1780.000 7.457000e+03 34189.000
cancelrate_pre 653015.0 0.344422 0.179147 0.000 0.05000 0.230 0.320 0.420 1.000000e+00 1.000
novoters_pre2 657616.0 1787.197614 2316.712985 1.000 5.00000 391.000 1054.000 2413.000 1.001800e+04 45436.000
novoters_pre 648956.0 1890.698450 3116.120062 1.000 2.00000 187.000 783.000 2453.000 1.383900e+04 45436.000
ctrip_profits 445187.0 4.208495 9.314438 -44.313 -0.39300 0.340 1.347 4.320 4.075580e+01 600.820
deltaprice_pre2_t1 543180.0 3.283740 48.805880 -2296.000 -103.00000 -3.000 2.000 10.000 1.110000e+02 3324.000
lowestprice_pre 659689.0 315.954583 463.723643 1.000 38.00000 118.000 208.000 385.000 1.750000e+03 100000.000
uv_pre 660548.0 107.846076 186.731907 0.007 0.24000 12.533 42.500 124.707 1.047787e+03 1722.613
uv_pre2 661189.0 103.352990 157.117863 0.007 0.50000 17.563 51.287 126.200 8.567254e+02 1722.613
lowestprice_pre2 660664.0 318.541812 351.913035 1.000 52.00000 145.000 233.000 388.000 1.464000e+03 43700.000
lasthtlordergap 447831.0 101830.919400 122784.313864 0.000 244.00000 14999.000 46890.000 138953.000 4.847340e+05 527026.000
businessrate_pre2 602960.0 0.368237 0.219945 0.000 0.01000 0.170 0.400 0.550 7.800000e-01 0.990
cityuvs 682274.0 10.648278 15.696682 0.007 0.01300 0.827 3.527 13.327 6.600700e+01 67.140
cityorders 651263.0 2.253250 3.538453 0.007 0.00700 0.127 0.627 2.747 1.445300e+01 14.507
lastpvgap 592818.0 12049.409382 25601.374138 0.000 0.00000 551.000 2848.000 10726.000 1.387220e+05 194386.000
cr 457896.0 1.137476 0.204789 1.000 1.00000 1.000 1.050 1.210 2.000000e+00 11.000
sid 689945.0 153.702414 277.807697 0.000 1.00000 17.000 62.000 180.000 1.212000e+03 9956.000
visitnum_oneyear 592910.0 18551.846682 228860.311117 1.000 21.00000 471.000 1315.000 3141.000 2.625670e+05 9651192.000
h 689945.0 14.462315 6.301575 0.000 0.00000 11.000 15.000 20.000 2.300000e+01 23.000
# Inspect the outlier columns: a negative price and implausibly high prices appear here
df[['lowestprice_pre', 'lowestprice']].describe([0.01, 0.25, 0.5, 0.75, 0.99]).T
count mean std min 1% 25% 50% 75% 99% max
lowestprice_pre 659689.0 315.954583 463.723643 1.0 38.0 118.0 208.0 385.0 1750.0 100000.0
lowestprice 687931.0 318.806242 575.782415 -3.0 37.0 116.0 200.0 380.0 1823.0 100000.0
# Columns to cap
col_block = ['lowestprice_pre', 'lowestprice']
# Capping (winsorization) functions: clip values beyond the 1st/99th percentiles
def block_upper(x):
    upper = x.quantile(0.99)
    out = x.mask(x > upper, upper)
    return out

def block_lower(x):
    lower = x.quantile(0.01)
    out = x.mask(x < lower, lower)
    return out
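An equivalent one-liner uses Series.clip with both quantile bounds at once; a sketch of the alternative:

# Equivalent capping via clip: winsorize at the 1st and 99th percentiles
def block_both(x):
    return x.clip(lower=x.quantile(0.01), upper=x.quantile(0.99))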
# Apply the caps (note: the quantiles are computed on the full dataset, before the train/test split)
df[col_block] = df[col_block].apply(block_upper)

df[col_block] = df[col_block].apply(block_lower)

df[['lowestprice_pre', 'lowestprice']].describe([0.01, 0.25, 0.5, 0.75, 0.99]).T
count mean std min 1% 25% 50% 75% 99% max
lowestprice_pre 659689.0 304.439507 287.192512 38.0 38.0 118.0 208.0 385.0 1750.0 1750.0
lowestprice 687931.0 305.025771 297.382838 37.0 37.0 116.0 200.0 380.0 1823.0 1823.0
# Deep copy so later changes don't modify the original DataFrame
df_copy = df.copy(deep=True)

# Features: drop the label and sampleid columns
X = df_copy.iloc[:, 2:]

# Label column
y = df_copy.iloc[:, 0]
X.head(10)
d arrival iforderpv_24h decisionhabit_user historyvisit_totalordernum hotelcr ordercanceledprecent landhalfhours ordercanncelednum commentnums ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
0 2016-05-18 2016-05-18 0 NaN NaN 1.04 NaN 22.0 NaN 1089.0 ... 615.0 NaN 0.29 12.880 3.147 NaN NaN 7 NaN 12
1 2016-05-18 2016-05-18 0 NaN NaN 1.06 NaN 0.0 NaN 5612.0 ... 513.0 NaN 0.53 17.933 4.913 NaN NaN 33 NaN 14
2 2016-05-18 2016-05-19 0 NaN NaN 1.05 NaN 3.0 NaN 256.0 ... 382.0 NaN 0.60 3.993 0.760 NaN NaN 10 NaN 19
3 2016-05-18 2016-05-18 0 NaN NaN 1.01 NaN 2.0 NaN NaN ... 203.0 NaN 0.18 3.220 0.660 NaN NaN 8 NaN 16
4 2016-05-18 2016-05-19 0 NaN NaN 1.00 NaN 0.0 NaN NaN ... 84.0 NaN NaN 0.013 NaN NaN NaN 1 NaN 21
5 2016-05-18 2016-05-20 0 NaN NaN 1.02 NaN 0.0 NaN 15.0 ... 408.0 NaN NaN 2.880 0.427 NaN NaN 1 NaN 21
6 2016-05-18 2016-05-25 0 NaN NaN 1.12 NaN 0.0 NaN 2578.0 ... 145.0 NaN NaN 4.427 0.493 NaN NaN 1 NaN 22
7 2016-05-18 2016-05-20 0 3.0 21.0 1.11 0.79 0.0 395.0 NaN ... 204.0 10475.0 0.53 12.713 1.987 7566.0 1.5 23 1265.0 17
8 2016-05-18 2016-05-19 0 13.0 NaN 1.08 NaN 0.0 NaN 2572.0 ... 99.0 NaN 0.41 5.393 0.860 15.0 1.0 20 596.0 20
9 2016-05-18 2016-06-08 1 2.0 7.0 1.07 0.86 47.0 6.0 NaN ... 191.0 18873.0 0.52 3.093 0.287 288.0 1.0 31 21926.0 7

10 rows × 48 columns

# Train/test split (default 75/25)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Drop the date columns from both sets
col_date = ['d', 'arrival']
X_train = X_train.drop(col_date, axis=1)
X_test = X_test.drop(col_date, axis=1)
X_train.shape
(517458, 46)
# Group features by type so each group gets a suitable imputation strategy
col = X_train.columns.tolist()
col_no = ['sid', 'iforderpv_24h', 'h'] # features with no missing values (the two date features are already dropped)
col_clf = ['decisionhabit_user'] # categorical-like feature
col_neg = ['delta_price1', 'delta_price2', 'customer_value_profit', 'ctrip_profits', 'deltaprice_pre2_t1'] # features containing negative values
col_35 = ['ordernum_oneyear', 'lasthtlordergap', 'ordercanncelednum',
          'ordercanceledprecent', 'ctrip_profits', 'historyvisit_avghotelnum', 'historyvisit_totalordernum',  # features with more than 35% missing values
          'decisionhabit_user', 'firstorder_bu', 'historyvisit_visit_detailpagenum']
col_std = X_train.columns[X_train.describe(include='all').T['std'] > 100].to_list() # columns with standard deviation greater than 100
col_std.remove('sid')
col_std.remove('delta_price2')
col_std.remove('delta_price1')
col_std.remove('lasthtlordergap')
col_norm = list(set(col) - set(col_no + col_clf + col_neg + col_35))
# Impute the training set: mode for the categorical-like feature, median for the negative-valued
# features, -1 as a missing indicator for the heavily missing group, mean for the rest
X_train[col_clf] = X_train[col_clf].fillna(X_train[col_clf].mode().iloc[0])    # mode() returns a DataFrame, so take its first row

X_train[col_neg] = X_train[col_neg].fillna(X_train[col_neg].median())

X_train[col_35] = X_train[col_35].fillna(-1)

X_train[col_std] = X_train[col_std].fillna(X_train[col_std].median())

X_train[col_norm] = X_train[col_norm].fillna(X_train[col_norm].mean())
# Impute the test set the same way (its statistics are recomputed on the test set here; see the note below)
X_test[col_clf] = X_test[col_clf].fillna(X_test[col_clf].mode().iloc[0])

X_test[col_neg] = X_test[col_neg].fillna(X_test[col_neg].median())

X_test[col_35] = X_test[col_35].fillna(-1)

X_test[col_std] = X_test[col_std].fillna(X_test[col_std].median())

X_test[col_norm] = X_test[col_norm].fillna(X_test[col_norm].mean())
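Strictly speaking, the test set should be imputed with statistics computed on the training set only; recomputing medians and means on the test set, as above, lets a little test-distribution information leak in. A leak-free variant might look like this (a sketch, not what was run here):

# Fit fill values on the training set, then apply the same values to the test set
fill_means = X_train[col_norm].mean()
fill_medians = X_train[col_neg].median()
X_test[col_norm] = X_test[col_norm].fillna(fill_means)
X_test[col_neg] = X_test[col_neg].fillna(fill_medians)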
# Confirm no missing values remain
X_train.isnull().any().sum()
0
X_test.isnull().any().sum()
0
X_train.shape
(517458, 46)
X_test.shape
(172487, 46)
# Variance filter
selector = VarianceThreshold()

X_train_var = selector.fit_transform(X_train)
X_train_var.shape
(517458, 46)
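With the default threshold=0, VarianceThreshold only removes constant columns, and none are constant here (46 columns in, 46 out). A stricter filter would pass an explicit threshold; the value below is a hypothetical example:

selector_strict = VarianceThreshold(threshold=0.01)  # example threshold, not used in this run
X_train_var_strict = selector_strict.fit_transform(X_train)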
# ANOVA F-test of each feature against the label
f, p_values = f_classif(X_train, y_train)

(p_values > 0.01).sum()
6
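So six features have p-values above 0.01, meaning their relationship with the label is not statistically significant at that level. To see which ones are about to be dropped:

# Features whose F-test p-value exceeds 0.01
X_train.columns[p_values > 0.01].to_list()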
# Keep only the features with F-test p-value <= 0.01
col_f = X_train.columns[p_values <= 0.01]
X_train = X_train[col_f]
X_train.shape
(517458, 40)
X_test = X_test[col_f]
X_test.shape
(172487, 40)
# Reset indices
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
# Fit a random forest to obtain feature importances
rfc = RFC(n_estimators=10, random_state=42)
importances = rfc.fit(X_train, y_train).feature_importances_
importances
array([0.0116028 , 0.01896167, 0.01802851, 0.0150776 , 0.01591757,
       0.02015549, 0.02034934, 0.0203898 , 0.01943307, 0.0211841 ,
       0.01791408, 0.02056686, 0.02291317, 0.02291462, 0.02051174,
       0.02323111, 0.02106971, 0.01054197, 0.02306534, 0.01938803,
       0.0258305 , 0.02571515, 0.02508617, 0.02485002, 0.02534584,
       0.02850795, 0.02408976, 0.02625239, 0.0279376 , 0.02743348,
       0.02772216, 0.03058076, 0.02754623, 0.04068007, 0.03904556,
       0.03655198, 0.03553142, 0.03861745, 0.04116244, 0.03829647])
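The bare array is hard to read; pairing the importances with the column names makes the ranking explicit:

# Map importances back to feature names, highest first
pd.Series(importances, index=X_train.columns).sort_values(ascending=False).head(10)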
# Embedded selection with cross-validation: for each importance threshold, select features and score them
scores = []
thresholds = np.linspace(0, importances.max(), 20)
for i in thresholds:
    time0 = time()
    X_embedded = SelectFromModel(rfc, threshold=i).fit_transform(X_train, y_train)
    score = cross_val_score(rfc, X_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)
    # hack: format elapsed seconds as a timestamp; only valid for runs under an hour
    print(datetime.datetime.fromtimestamp(time() - time0).strftime('%M:%S:%f'))

plt.plot(thresholds, scores)
plt.show()
01:12:090613
01:13:526636
01:09:249811
01:08:676264
01:07:224143
01:06:510684
01:14:200829
01:13:976302
01:13:173232
01:07:879761
01:05:081422
01:03:703478
00:57:668075
00:55:913328
00:49:275084
00:47:250707
00:48:596243
00:53:333874
00:45:020932
00:53:343730

[Figure 3: cross-validated score vs. importance threshold]

# Best cross-validated score
max(scores)
0.9507844100831351
# Threshold corresponding to the best score
thresholds[scores.index(max(scores))]
0.028163774952387383
col_k = X_train.columns[importances > 0.028163774952387383].to_list()
X_train_embedded = X_train[col_k]
X_train_embedded.head()
ctrip_profits lasthtlordergap cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
0 1.347 -1.0 3.787 0.387 2850.0 1.137405 3 1314.0 6
1 1.347 -1.0 0.127 0.007 2850.0 1.137405 7 1314.0 13
2 0.767 -1.0 18.973 3.600 7272.0 1.137405 457 348.0 12
3 15.433 1986.0 1.507 0.287 47.0 1.160000 430 20273.0 8
4 1.347 -1.0 1.433 0.167 20.0 1.000000 85 83.0 13
X_test_embedded = X_test[col_k]
X_test_embedded.head()
ctrip_profits lasthtlordergap cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
0 3.940 7224.0 7.147 0.580000 539.0 1.150000 220 4542.0 4
1 1.347 40911.0 0.447 0.060000 3.0 1.137687 81 3156.0 3
2 0.887 -1.0 4.313 0.460000 6532.0 1.000000 81 1026.0 17
3 1.347 -1.0 0.460 0.053000 363.0 1.000000 27 349.0 22
4 1.540 82256.0 0.060 2.246314 41.0 1.170000 63 811.0 10
# Correlation heatmap of the selected features
plt.figure(figsize=(10,8))
sns.heatmap(X_train_embedded.corr(), annot=True, linewidths=1)

[Figure 4: correlation heatmap of the selected features]
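Instead of reading the heatmap visually, strongly correlated pairs can be listed directly; a sketch, with the 0.8 cutoff as an assumption:

# Feature pairs whose absolute correlation exceeds 0.8
corr = X_train_embedded.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep each pair once
high = upper.stack()
high[high > 0.8]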

# Drop one of the highly correlated features (cityuvs is strongly correlated with cityorders);
# reassigning instead of dropping inplace avoids a SettingWithCopyWarning on the slice
X_train_embedded = X_train_embedded.drop('cityuvs', axis=1)
X_test_embedded = X_test_embedded.drop('cityuvs', axis=1)
# Save the cleaned data
X_train_embedded.to_csv('X_train_embedded.csv')
X_test_embedded.to_csv('X_test_embedded.csv')
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')
# Reload the data
X_train_embedded = pd.read_csv('X_train_embedded.csv', index_col=0)
X_test_embedded = pd.read_csv('X_test_embedded.csv', index_col=0)
y_train = pd.read_csv('y_train.csv', index_col=0)
y_test = pd.read_csv('y_test.csv', index_col=0)
y_train = np.ravel(y_train)
y_train.shape
(517458,)
y_test = np.ravel(y_test)
y_test.shape
(172487,)
# Glossary of the remaining features:
# 'lasthtlordergap': minutes since the user's last order within the past year
# 'cityorders': app orders submitted yesterday for the current city and check-in date
# 'lastpvgap': time since the user's last visit within the past year
# 'cr': user conversion rate
# 'sid': session id (sid=1 can be treated as a first visit)
# 'visitnum_oneyear': number of visits within the past year
# 'h': hour of the visit
import scipy.stats    # chi2_contingency is used in the binning below

# Recombine features and label for WOE binning
woe_data = pd.concat([X_train_embedded, pd.Series(y_train, name='label')], axis=1)
woe_data
ctrip_profits lasthtlordergap cityorders lastpvgap cr sid visitnum_oneyear h label
0 1.347 -1.0 0.387 2850.0 1.137405 3 1314.0 6 0
1 1.347 -1.0 0.007 2850.0 1.137405 7 1314.0 13 0
2 0.767 -1.0 3.600 7272.0 1.137405 457 348.0 12 1
3 15.433 1986.0 0.287 47.0 1.160000 430 20273.0 8 1
4 1.347 -1.0 0.167 20.0 1.000000 85 83.0 13 0
... ... ... ... ... ... ... ... ... ...
517453 1.347 -1.0 0.113 4347.0 1.137405 8 278.0 21 0
517454 1.347 41045.0 0.520 2972.0 1.330000 25 1095.0 0 0
517455 1.347 113046.0 0.093 522.0 1.137405 120 6309.0 16 0
517456 -0.067 266544.0 0.600 28378.0 1.000000 22 100.0 9 0
517457 1.347 -1.0 0.420 2850.0 1.137405 5 1314.0 17 0

517458 rows × 9 columns

# Compute the WOE table from a list of bins
def get_woe(num_bins):
    columns = ['min', 'max', 'count_0', 'count_1']
    df = pd.DataFrame(num_bins, columns=columns)
    
    df['total'] = df['count_0'] + df['count_1']
    df['percentage'] = df['total'] / df['total'].sum()
    df['bad_rate'] = df['count_1'] / df['total']
    df['good%'] = df['count_0'] / df['count_0'].sum()
    df['bad%'] = df['count_1'] / df['count_1'].sum()
    df['good-bad'] = df['good%'] - df['bad%']
    df['woe'] = np.log(df['good%'] / df['bad%'])
    return df
# Compute and return the IV value from a WOE table
def get_iv(df):
    iv = np.sum(df['good-bad'] * df['woe'])
    return iv
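For reference, the quantities these helpers compute are, for each bin i: WOE_i = ln(good%_i / bad%_i), and IV = sum_i (good%_i - bad%_i) * WOE_i, where good% and bad% are the bin's share of all non-churners and churners respectively. A larger IV means the binned feature separates the two classes better; features with very small IV carry little information about the label.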
# Return the detailed WOE table for a feature split into q quantile bins
def get_bin(X, q):
    df = woe_data.copy()
    df['qcut'], updown = pd.qcut(df[X], retbins=True, q=q, duplicates='drop')
    count_0 = df[df['label']==0].groupby('qcut').count()['label']
    count_1 = df[df['label']==1].groupby('qcut').count()['label']
    num_bins = [*zip(updown,updown[1:],count_0,count_1)]
    woe_df = get_woe(num_bins)
    
    return woe_df
# Plot IV vs. the number of bins while merging adjacent bins by chi-square p-value (ChiMerge-style)
def get_graph(X, n=2, q=20):
    df = woe_data.copy()
    df['qcut'], updown = pd.qcut(df[X], retbins=True, q=q, duplicates='drop')
    count_0 = df[df['label']==0].groupby('qcut').count()['label']
    count_1 = df[df['label']==1].groupby('qcut').count()['label']
    num_bins = [*zip(updown,updown[1:],count_0,count_1)]

    IV = []
    axisx = []

    while len(num_bins) > n:
        pvs = []
        for i in range(len(num_bins)-1):
            x1 = num_bins[i][2:]
            x2 = num_bins[i+1][2:]
            pv = scipy.stats.chi2_contingency([x1,x2])[1]
            pvs.append(pv)
        i = pvs.index(max(pvs))
        num_bins[i:i+2] = [(num_bins[i][0], num_bins[i+1][1],
                           num_bins[i][2] + num_bins[i+1][2],
                           num_bins[i][3] + num_bins[i+1][3])]
        woe_df = get_woe(num_bins)
        axisx.append(len(num_bins))
        IV.append(get_iv(woe_df))

    plt.figure()
    plt.plot(axisx, IV)
    plt.xticks(axisx)
    plt.xlabel("number of box")
    plt.ylabel("IV")
    plt.show()
col_woe = ['ctrip_profits', 'lasthtlordergap', 'cityorders',
          'lastpvgap', 'cr', 'sid', 'visitnum_oneyear', 'h']
for i in col_woe:
    print(i)
    get_graph(i)
ctrip_profits
[Figure 5: IV vs. number of bins for ctrip_profits]

lasthtlordergap
[Figure 6: IV vs. number of bins for lasthtlordergap]

cityorders
[Figure 7: IV vs. number of bins for cityorders]

lastpvgap
[Figure 8: IV vs. number of bins for lastpvgap]

cr
[Figure 9: IV vs. number of bins for cr]

sid
[Figure 10: IV vs. number of bins for sid]

visitnum_oneyear
[Figure 11: IV vs. number of bins for visitnum_oneyear]

h
[Figure 12: IV vs. number of bins for h]

# Users whose last order was 2,356-29,219 minutes ago churn at a relatively high rate
get_bin('lasthtlordergap', 10)
min max count_0 count_1 total percentage bad_rate good% bad% good-bad woe
0 -1.0 2356.0 158912 48086 206998 0.400029 0.232302 0.423198 0.338741 0.084457 0.222603
1 2356.0 13291.0 30983 20754 51737 0.099983 0.401144 0.082511 0.146201 -0.063691 -0.572057
2 13291.0 29219.0 34175 17574 51749 0.100006 0.339601 0.091011 0.123800 -0.032789 -0.307683
3 29219.0 56455.0 35511 16229 51740 0.099989 0.313664 0.094569 0.114325 -0.019756 -0.189714
4 56455.0 110984.0 37245 14499 51744 0.099997 0.280206 0.099187 0.102138 -0.002951 -0.029318
5 110984.0 232020.0 39177 12572 51749 0.100006 0.242942 0.104332 0.088563 0.015769 0.163861
6 232020.0 527026.0 39500 12241 51741 0.099991 0.236582 0.105192 0.086232 0.018961 0.198753
# Users with a conversion rate below 1.12 are retained more often
get_bin('cr',10)
min max count_0 count_1 total percentage bad_rate good% bad% good-bad woe
0 1.000000 1.120000 166695 42471 209166 0.404218 0.203049 0.443925 0.299186 0.144738 0.394588
1 1.120000 1.137405 135099 47248 182347 0.352390 0.259110 0.359781 0.332838 0.026944 0.077841
2 1.137405 1.170000 15752 7000 22752 0.043969 0.307665 0.041949 0.049311 -0.007362 -0.161699
3 1.170000 1.330000 37674 24461 62135 0.120077 0.393675 0.100329 0.172315 -0.071986 -0.540866
4 1.330000 11.000000 20283 20775 41058 0.079346 0.505992 0.054016 0.146349 -0.092334 -0.996724
# Retention gradually falls as the city order volume grows, except for a clear rebound in the 1.4-2.26 bin
get_bin('cityorders',10)
min max count_0 count_1 total percentage bad_rate good% bad% good-bad woe
0 0.007000 0.033000 44555 11516 56071 0.108359 0.205382 0.118654 0.081124 0.037530 0.380231
1 0.033000 0.093000 39744 10584 50328 0.097260 0.210300 0.105842 0.074559 0.031283 0.350359
2 0.093000 0.200000 40294 11095 51389 0.099310 0.215902 0.107307 0.078159 0.029148 0.316952
3 0.200000 0.380000 37656 11761 49417 0.095500 0.237995 0.100281 0.082850 0.017431 0.190947
4 0.380000 0.753000 37170 14675 51845 0.100192 0.283055 0.098987 0.103378 -0.004391 -0.043400
5 0.753000 1.400000 35492 16268 51760 0.100027 0.314297 0.094519 0.114600 -0.020081 -0.192649
6 1.400000 2.255565 49463 16530 65993 0.127533 0.250481 0.131725 0.116445 0.015279 0.123292
7 2.255565 3.260000 25531 11738 37269 0.072023 0.314953 0.067991 0.082688 -0.014697 -0.195694
8 3.260000 6.633000 32872 18901 51773 0.100053 0.365074 0.087541 0.133148 -0.045607 -0.419350
9 6.633000 14.507000 32726 18887 51613 0.099743 0.365935 0.087152 0.133049 -0.045897 -0.423060
# Users who visit after 7 pm churn less; daytime visitors churn more
get_bin('h',10)
min max count_0 count_1 total percentage bad_rate good% bad% good-bad woe
0 0.0 6.0 42287 15678 57965 0.112019 0.270474 0.112614 0.110443 0.002171 0.019465
1 6.0 10.0 46850 22957 69807 0.134904 0.328864 0.124766 0.161720 -0.036954 -0.259428
2 10.0 12.0 34400 16455 50855 0.098279 0.323567 0.091610 0.115917 -0.024307 -0.235329
3 12.0 13.0 19815 8752 28567 0.055206 0.306367 0.052769 0.061653 -0.008884 -0.155599
4 13.0 15.0 38660 18137 56797 0.109762 0.319330 0.102955 0.127766 -0.024811 -0.215905
5 15.0 17.0 42537 18293 60830 0.117555 0.300723 0.113280 0.128865 -0.015585 -0.128901
6 17.0 19.0 40305 16122 56427 0.109047 0.285714 0.107336 0.113571 -0.006235 -0.056466
7 19.0 21.0 49680 15527 65207 0.126014 0.238119 0.132303 0.109380 0.022923 0.190266
8 21.0 22.0 31405 6631 38036 0.073505 0.174335 0.083634 0.046712 0.036922 0.582455
9 22.0 23.0 29564 3403 32967 0.063710 0.103224 0.078732 0.023972 0.054759 1.189144
# Higher customer value does not always mean lower churn; the 1.147-1.347 bin has the lowest churn rate
get_bin('ctrip_profits',10)
min max count_0 count_1 total percentage bad_rate good% bad% good-bad woe
0 -44.313 0.147 37921 13979 51900 0.100298 0.269345 0.100987 0.098475 0.002512 0.025192
1 0.147 0.500 37673 13923 51596 0.099711 0.269846 0.100327 0.098080 0.002246 0.022645
2 0.500 1.147 37713 14400 52113 0.100710 0.276323 0.100433 0.101441 -0.001007 -0.009980
3 1.147 1.347 150615 44404 195019 0.376879 0.227691 0.401102 0.312803 0.088299 0.248641
4 1.347 1.587 8296 3333 11629 0.022473 0.286611 0.022093 0.023479 -0.001386 -0.060856
5 1.587 3.220 36089 15701 51790 0.100085 0.303167 0.096108 0.110605 -0.014497 -0.140493
6 3.220 7.327 35310 16403 51713 0.099937 0.317193 0.094034 0.115551 -0.021517 -0.206054
7 7.327 600.820 31886 19812 51698 0.099908 0.383226 0.084915 0.139565 -0.054650 -0.496877
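The conclusion drawn next can be checked numerically by scoring every candidate feature at a fixed bin count; a sketch using the helpers above, with 10 bins as an assumption:

# IV of each feature at (up to) 10 quantile bins; low-IV features are drop candidates
for c in col_woe:
    print(c, round(get_iv(get_bin(c, 10)), 4))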
# lastpvgap, sid and visitnum_oneyear have IV values that are too small, so drop these three features
X_train_woe = X_train_embedded[['ctrip_profits', 'lasthtlordergap', 'cityorders', 'cr', 'h']]
X_test_woe = X_test_embedded[['ctrip_profits', 'lasthtlordergap', 'cityorders', 'cr', 'h']]
# Learning curve over max_depth
scores = []
time0 = time()
for i in np.arange(5,21,1):
    rfc = RFC(n_estimators=10, max_depth=i, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)

print('Elapsed: {}'.format(datetime.datetime.fromtimestamp(time()-time0).strftime('%M:%S:%f')))
print('Best score: {}, best max_depth: {}'.format(max(scores), np.arange(5,21,1)[np.argmax(scores)]))
plt.figure(figsize=(10,8))
plt.plot(np.arange(5,21,1), scores)
plt.show()
Elapsed: 03:27:049065
Best score: 0.8891446300224614, best max_depth: 20

[Figure 13: cross-validated score vs. max_depth]

# Learning curve over min_samples_split
scores = []
time0 = time()
for i in np.arange(2,10,1):
    rfc = RFC(n_estimators=10, max_depth=20, min_samples_split=i, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)

print('Elapsed: {}'.format(datetime.datetime.fromtimestamp(time()-time0).strftime('%M:%S:%f')))
print('Best score: {}, best min_samples_split: {}'.format(max(scores), np.arange(2,10,1)[np.argmax(scores)]))
plt.figure(figsize=(10,8))
plt.plot(np.arange(2,10,1), scores)
plt.show()
Elapsed: 02:16:546873
Best score: 0.8891446300224614, best min_samples_split: 2

[Figure 14: cross-validated score vs. min_samples_split]

# Learning curve over min_samples_leaf
scores = []
time0 = time()
for i in np.arange(1,10,1):
    rfc = RFC(n_estimators=10, max_depth=20, min_samples_split=2, min_samples_leaf=i, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)

print('Elapsed: {}'.format(datetime.datetime.fromtimestamp(time()-time0).strftime('%M:%S:%f')))
print('Best score: {}, best min_samples_leaf: {}'.format(max(scores), np.arange(1,10,1)[np.argmax(scores)]))
plt.figure(figsize=(10,8))
plt.plot(np.arange(1,10,1), scores)
plt.show()
Elapsed: 02:13:264602
Best score: 0.8891446300224614, best min_samples_leaf: 1

[Figure 15: cross-validated score vs. min_samples_leaf]

# Learning curve over n_estimators (cv=3 here to save time)
scores = []
time0 = time()
for i in np.arange(10,201,10):
    rfc = RFC(n_estimators=i, max_depth=20, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=3, n_jobs=-1).mean()
    scores.append(score)
    # time0 is set once before the loop, so these readings are cumulative; the %M:%S format wraps past one hour
    print('Elapsed: {}'.format(datetime.datetime.fromtimestamp(time()-time0).strftime('%M:%S:%f')))
print('Best score: {}, best n_estimators: {}'.format(max(scores), np.arange(10,201,10)[np.argmax(scores)]))
plt.figure(figsize=(10,8))
plt.plot(np.arange(10,201,10), scores)
plt.show()
Elapsed: 00:33:316156
Elapsed: 01:39:789986
Elapsed: 03:21:423289
Elapsed: 05:40:185622
Elapsed: 08:30:948403
Elapsed: 11:56:105026
Elapsed: 15:51:769696
Elapsed: 20:36:914026
Elapsed: 25:38:212320
Elapsed: 31:10:214501
Elapsed: 37:27:759795
Elapsed: 44:10:525391
Elapsed: 51:28:059915
Elapsed: 59:16:547367
Elapsed: 07:41:280818
Elapsed: 16:38:253700
Elapsed: 26:07:070954
Elapsed: 36:06:570602
Elapsed: 48:40:747174
Elapsed: 00:14:880475
Best score: 0.8988864801364768, best n_estimators: 100

[Figure 16: cross-validated score vs. n_estimators]
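Tuning one hyperparameter at a time, as above, ignores interactions between them; a joint grid search is the usual alternative, though considerably slower. A sketch with a hypothetical grid:

from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 100, 150], 'max_depth': [15, 20, 25]}
grid = GridSearchCV(RFC(random_state=42), param_grid, cv=3, n_jobs=-1)
# grid.fit(X_train_embedded, y_train); grid.best_params_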

# Fit the final model with the chosen parameters
rfc = RFC(n_estimators=100, max_depth=20, random_state=42).fit(X_train_embedded, y_train)
# Scores on the training and test sets
print('Training score: {}'.format(rfc.score(X_train_embedded, y_train)))
print('Test score: {}'.format(rfc.score(X_test_embedded, y_test)))
Training score: 0.9144162424776504
Test score: 0.8858812548192038
# Feature importances of the final model
rfc.feature_importances_
array([0.12193391, 0.12869867, 0.14163503, 0.13799971, 0.10331983,
       0.12834216, 0.14547079, 0.09259991])
# Predicted probabilities on the test set (roc_curve below compares them against y_test)
y_scores = rfc.predict_proba(X_test_embedded)
y_scores
array([[0.42810513, 0.57189487],
       [0.90260411, 0.09739589],
       [0.87430575, 0.12569425],
       ...,
       [0.3633656 , 0.6366344 ],
       [0.79112542, 0.20887458],
       [0.14388709, 0.85611291]])
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:, 1])
roc_auc = auc(fpr, tpr)
roc_auc
0.9680887508287116
# Plot the ROC curve
def draw_roc(roc_auc, fpr, tpr):
    plt.subplots(figsize=(7,5.5))
    plt.plot(fpr, tpr, color='orange', label='ROC curve (area = {:.4f})'.format(roc_auc))
    plt.plot([0,1], [0,1], color='blue', linestyle='--')
    plt.xlabel('fpr')
    plt.ylabel('tpr')
    plt.xlim([0,1])
    plt.ylim([0,1.05])
    plt.title('ROC Curve')
    plt.legend(loc=4)
    plt.show()

draw_roc(roc_auc, fpr, tpr)

[Figure 17: ROC curve of the final model]
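Given the class imbalance, per-class precision and recall are worth checking alongside accuracy and AUC; a quick sketch:

from sklearn.metrics import classification_report

y_pred = rfc.predict(X_test_embedded)
print(classification_report(y_test, y_pred, target_names=['retained', 'churned']))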

# RFM analysis: select recency (R), frequency (F) and monetary (M) proxies
rfm = df[['sampleid','ordernum_oneyear','avgprice','lasthtlordergap']]
rfm.head()
sampleid ordernum_oneyear avgprice lasthtlordergap
0 24636 NaN NaN NaN
1 24637 NaN NaN NaN
2 24641 NaN NaN NaN
3 24642 NaN NaN NaN
4 24644 NaN NaN NaN
# Drop missing rows and rename the columns to F, M and R
rfm = rfm.dropna().reset_index(drop=True).rename(columns={'ordernum_oneyear':'F', 'avgprice':'M', 'lasthtlordergap':'R'})
rfm.head()
sampleid F M R
0 24650 21.0 363.0 10475.0
1 24653 7.0 307.0 18873.0
2 24655 1.0 343.0 32071.0
3 24658 33.0 1000.0 4616.0
4 24662 4.0 685.0 44830.0
# R turns out to be in minutes; convert it to days
rfm['R'] = round(rfm['R'] / 1440, 0)
rfm.head()
sampleid F M R
0 24650 21.0 363.0 7.0
1 24653 7.0 307.0 13.0
2 24655 1.0 343.0 22.0
3 24658 33.0 1000.0 3.0
4 24662 4.0 685.0 31.0
rfm.describe().T
count mean std min 25% 50% 75% max
sampleid 426425.0 629380.138599 414760.183032 24650.0 313549.0 600907.0 887813.0 2238403.0
F 426425.0 12.137916 17.405419 1.0 3.0 6.0 14.0 711.0
M 426425.0 421.604962 286.987700 1.0 233.0 351.0 523.0 6383.0
R 426425.0 70.742163 84.844780 0.0 11.0 33.0 97.0 366.0
# Bin R, F and M into five score levels based on the distributions and common practice
f_bins = [-1, 3, 5, 7, 10, 720]
m_bins = [-1, 200, 400, 600, 800, 7000]
r_bins = [-1, 3, 7, 30, 180, 370]

rfm['R_score'] = pd.cut(rfm['R'], bins=r_bins, labels=[5,4,3,2,1]).astype('int')
rfm['F_score'] = pd.cut(rfm['F'], bins=f_bins, labels=[1,2,3,4,5]).astype('int')
rfm['M_score'] = pd.cut(rfm['M'], bins=m_bins, labels=[1,2,3,4,5]).astype('int')

rfm
sampleid F M R R_score F_score M_score
0 24650 21.0 363.0 7.0 4 5 2
1 24653 7.0 307.0 13.0 3 3 2
2 24655 1.0 343.0 22.0 3 1 2
3 24658 33.0 1000.0 3.0 5 5 5
4 24662 4.0 685.0 31.0 2 2 4
... ... ... ... ... ... ... ...
426420 2238388 2.0 226.0 119.0 2 1 2
426421 2238389 4.0 461.0 0.0 5 2 3
426422 2238396 5.0 193.0 44.0 2 2 1
426423 2238397 1.0 258.0 87.0 2 1 2
426424 2238403 3.0 256.0 52.0 2 1 2

426425 rows × 7 columns

# Score above its mean is marked 1, otherwise 0
rfm['R_level'] = (rfm['R_score'] > rfm['R_score'].mean()) * 1
rfm['F_level'] = (rfm['F_score'] > rfm['F_score'].mean()) * 1
rfm['M_level'] = (rfm['M_score'] > rfm['M_score'].mean()) * 1

rfm
sampleid F M R R_score F_score M_score R_level F_level M_level
0 24650 21.0 363.0 7.0 4 5 2 1 1 0
1 24653 7.0 307.0 13.0 3 3 2 1 1 0
2 24655 1.0 343.0 22.0 3 1 2 1 0 0
3 24658 33.0 1000.0 3.0 5 5 5 1 1 1
4 24662 4.0 685.0 31.0 2 2 4 0 0 1
... ... ... ... ... ... ... ... ... ... ...
426420 2238388 2.0 226.0 119.0 2 1 2 0 0 0
426421 2238389 4.0 461.0 0.0 5 2 3 1 0 1
426422 2238396 5.0 193.0 44.0 2 2 1 0 0 0
426423 2238397 1.0 258.0 87.0 2 1 2 0 0 0
426424 2238403 3.0 256.0 52.0 2 1 2 0 0 0

426425 rows × 10 columns

# Combine the level flags into a 3-character RFM code, then map the codes to segment labels
rfm['RFM'] = rfm['R_level'].astype('str') + rfm['F_level'].astype('str') + rfm['M_level'].astype('str')
rfm['RFM'].replace(['111','101','011','001','110','100','010','000']
            , ['important value users','important development users','important retention users','important win-back users','general value users','general development users','general retention users','general win-back users'], inplace=True)
rfm
sampleid F M R R_score F_score M_score R_level F_level M_level RFM
0 24650 21.0 363.0 7.0 4 5 2 1 1 0 general value users
1 24653 7.0 307.0 13.0 3 3 2 1 1 0 general value users
2 24655 1.0 343.0 22.0 3 1 2 1 0 0 general development users
3 24658 33.0 1000.0 3.0 5 5 5 1 1 1 important value users
4 24662 4.0 685.0 31.0 2 2 4 0 0 1 important win-back users
... ... ... ... ... ... ... ... ... ... ... ...
426420 2238388 2.0 226.0 119.0 2 1 2 0 0 0 general win-back users
426421 2238389 4.0 461.0 0.0 5 2 3 1 0 1 important development users
426422 2238396 5.0 193.0 44.0 2 2 1 0 0 0 general win-back users
426423 2238397 1.0 258.0 87.0 2 1 2 0 0 0 general win-back users
426424 2238403 3.0 256.0 52.0 2 1 2 0 0 0 general win-back users

426425 rows × 11 columns

# Count the users of each segment type
rfm_new = pd.DataFrame(rfm.groupby('RFM', as_index=False)['sampleid'].agg('count'))
rfm_new
RFM sampleid
0 general value users 78592
1 general retention users 46850
2 general development users 42275
3 general win-back users 83394
4 important value users 63595
5 important retention users 38850
6 important development users 20235
7 important win-back users 52634
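Since the churn label is still available in df, one can also check how churn concentrates in each segment by joining the segments back to the labels; a sketch, not part of the original analysis:

# Churn rate per RFM segment
seg = rfm[['sampleid', 'RFM']].merge(df[['sampleid', 'label']], on='sampleid')
seg.groupby('RFM')['label'].mean().sort_values()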
# Pie chart of the segment shares
plt.figure(figsize=(12,6))
plt.pie((rfm_new['sampleid'] / rfm_new['sampleid'].sum()).to_list(), labels=rfm_new['RFM'].to_list(), autopct='%0.2f%%')
 [Text(0.9207056795449674, 0.6019144886556893, 'general value users'),
  Text(0.07432600254562635, 1.0974860570164833, 'general retention users'),
  Text(-0.6110720650087508, 0.9146534487804336, 'general development users'),
  Text(-1.0982775444537256, 0.061534017816936265, 'general win-back users'),
  Text(-0.5691837587192285, -0.9412915854347425, 'important value users'),
  Text(0.23025758705583027, -1.0756307189752563, 'important retention users'),
  Text(0.6623554620643879, -0.8782284679247601, 'important development users'),
  Text(1.0183302486279413, -0.4159368999371846, 'important win-back users')],
 [Text(0.5022030979336185, 0.3283169938121941, '18.43%'),
  Text(0.04054145593397801, 0.5986287583726272, '10.99%'),
  Text(-0.33331203545931853, 0.49890188115296374, '9.91%'),
  Text(-0.5990604787929412, 0.03356400971832887, '19.56%'),
  Text(-0.31046386839230644, -0.5134317738734958, '14.91%'),
  Text(0.1255950474849983, -0.5867076648955943, '9.11%'),
  Text(0.3612847974896661, -0.47903370977714177, '4.75%'),
  Text(0.5554528628879679, -0.22687467269300973, '12.34%')])

[Figure 18: pie chart of the RFM segment shares]

