Ctrip Customer Churn Analysis Project (Personal Practice + Source Code)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import f_classif, SelectFromModel, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier as RFC
%matplotlib inline

plt.rcParams['font.family'] = ['SimHei']    # use a Chinese-capable font so Chinese labels render in plots
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly with that font
# Read the data file (tab-separated)
df = pd.read_table('userlostprob.txt')
# View the first five rows
df.head()
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
0 0 24636 2016-05-18 2016-05-18 0 NaN NaN NaN 1.04 NaN ... 615.0 NaN 0.29 12.880 3.147 NaN NaN 7 NaN 12
1 1 24637 2016-05-18 2016-05-18 0 NaN NaN NaN 1.06 NaN ... 513.0 NaN 0.53 17.933 4.913 NaN NaN 33 NaN 14
2 0 24641 2016-05-18 2016-05-19 0 NaN NaN NaN 1.05 NaN ... 382.0 NaN 0.60 3.993 0.760 NaN NaN 10 NaN 19
3 0 24642 2016-05-18 2016-05-18 0 NaN NaN NaN 1.01 NaN ... 203.0 NaN 0.18 3.220 0.660 NaN NaN 8 NaN 16
4 1 24644 2016-05-18 2016-05-19 0 NaN NaN NaN 1.00 NaN ... 84.0 NaN NaN 0.013 NaN NaN NaN 1 NaN 21

5 rows × 51 columns

# Check the label distribution
df['label'].value_counts()
0    500588
1    189357
Name: label, dtype: int64
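With 189,357 churners out of 689,945 samples, roughly 27% of users churn, so the classes are imbalanced and plain accuracy can be misleading. A quick check of the positive rate:

# Share of churners (label == 1); about 0.2745 here
df['label'].mean()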
# View the last five rows
df.tail()
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
689940 1 2238419 2016-05-15 2016-05-17 1 19.0 NaN NaN 1.06 NaN ... 406.0 NaN 0.48 13.573 1.660 1034.0 1.0 5 119.0 18
689941 1 2238421 2016-05-15 2016-05-15 1 10.0 3.0 3.0 1.06 0.33 ... 199.0 713.0 0.51 2.880 0.513 179.0 2.0 15 1472.0 12
689942 0 2238422 2016-05-15 2016-05-17 0 NaN NaN NaN 1.07 NaN ... 544.0 NaN 0.45 15.293 2.067 0.0 NaN 8 107.0 0
689943 0 2238425 2016-05-15 2016-05-17 0 NaN NaN NaN 1.04 NaN ... 156.0 NaN 0.29 2.467 0.333 NaN NaN 4 NaN 0
689944 0 2238426 2016-05-15 2016-05-15 0 NaN NaN NaN 1.02 NaN ... 275.0 NaN NaN 12.600 2.653 NaN NaN 2 NaN 11

5 rows × 51 columns

# View five random rows
df.sample(5)
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
477013 1 820235 2016-05-21 2016-05-21 0 15.0 NaN 15.0 1.05 0.36 ... 582.0 18831.0 0.48 17.220 3.400 4242.0 1.33 446 906.0 9
426926 0 736598 2016-05-15 2016-05-15 0 1.0 NaN 39.0 1.05 0.16 ... 978.0 12199.0 0.13 5.113 0.847 642.0 1.36 732 2583.0 8
628554 0 1072402 2016-05-20 2016-05-20 0 NaN NaN 3.0 1.02 0.00 ... 147.0 55214.0 0.27 15.873 3.220 10002.0 1.11 186 905.0 19
248275 0 438633 2016-05-18 2016-06-09 0 19.0 2.0 28.0 1.02 0.78 ... NaN 3329.0 NaN 1.320 0.087 145.0 1.12 449 17397.0 11
198972 0 356550 2016-05-19 2016-05-19 0 7.0 NaN 2.0 1.04 0.50 ... 206.0 61467.0 0.32 20.480 5.153 13264.0 1.08 59 1522.0 20

5 rows × 51 columns

# Data shape
df.shape
(689945, 51)
# Check data types
df.dtypes
label                                 int64
sampleid                              int64
d                                    object
arrival                              object
iforderpv_24h                         int64
decisionhabit_user                  float64
historyvisit_7ordernum              float64
historyvisit_totalordernum          float64
hotelcr                             float64
ordercanceledprecent                float64
landhalfhours                       float64
ordercanncelednum                   float64
commentnums                         float64
starprefer                          float64
novoters                            float64
consuming_capacity                  float64
historyvisit_avghotelnum            float64
cancelrate                          float64
historyvisit_visit_detailpagenum    float64
delta_price1                        float64
price_sensitive                     float64
hoteluv                             float64
businessrate_pre                    float64
ordernum_oneyear                    float64
cr_pre                              float64
avgprice                            float64
lowestprice                         float64
firstorder_bu                       float64
customereval_pre2                   float64
delta_price2                        float64
commentnums_pre                     float64
customer_value_profit               float64
commentnums_pre2                    float64
cancelrate_pre                      float64
novoters_pre2                       float64
novoters_pre                        float64
ctrip_profits                       float64
deltaprice_pre2_t1                  float64
lowestprice_pre                     float64
uv_pre                              float64
uv_pre2                             float64
lowestprice_pre2                    float64
lasthtlordergap                     float64
businessrate_pre2                   float64
cityuvs                             float64
cityorders                          float64
lastpvgap                           float64
cr                                  float64
sid                                   int64
visitnum_oneyear                    float64
h                                     int64
dtype: object
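Only 'd' (visit date) and 'arrival' (check-in date) are stored as object; they are date strings. They get dropped before modeling below, but one could first derive a booking lead-time feature from them. A sketch, not part of the original pipeline:

# Hypothetical feature: days between the visit date and the arrival date
lead_days = (pd.to_datetime(df['arrival']) - pd.to_datetime(df['d'])).dt.days
lead_days.describe()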
# View basic info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689945 entries, 0 to 689944
Data columns (total 51 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   label                             689945 non-null  int64  
 1   sampleid                          689945 non-null  int64  
 2   d                                 689945 non-null  object 
 3   arrival                           689945 non-null  object 
 4   iforderpv_24h                     689945 non-null  int64  
 5   decisionhabit_user                385450 non-null  float64
 6   historyvisit_7ordernum            82915 non-null   float64
 7   historyvisit_totalordernum        386525 non-null  float64
 8   hotelcr                           689148 non-null  float64
 9   ordercanceledprecent              447831 non-null  float64
 10  landhalfhours                     661312 non-null  float64
 11  ordercanncelednum                 447831 non-null  float64
 12  commentnums                       622029 non-null  float64
 13  starprefer                        464892 non-null  float64
 14  novoters                          672918 non-null  float64
 15  consuming_capacity                463837 non-null  float64
 16  historyvisit_avghotelnum          387876 non-null  float64
 17  cancelrate                        678227 non-null  float64
 18  historyvisit_visit_detailpagenum  307234 non-null  float64
 19  delta_price1                      437146 non-null  float64
 20  price_sensitive                   463837 non-null  float64
 21  hoteluv                           689148 non-null  float64
 22  businessrate_pre                  483896 non-null  float64
 23  ordernum_oneyear                  447831 non-null  float64
 24  cr_pre                            660548 non-null  float64
 25  avgprice                          457261 non-null  float64
 26  lowestprice                       687931 non-null  float64
 27  firstorder_bu                     376993 non-null  float64
 28  customereval_pre2                 661312 non-null  float64
 29  delta_price2                      437750 non-null  float64
 30  commentnums_pre                   598368 non-null  float64
 31  customer_value_profit             439123 non-null  float64
 32  commentnums_pre2                  648457 non-null  float64
 33  cancelrate_pre                    653015 non-null  float64
 34  novoters_pre2                     657616 non-null  float64
 35  novoters_pre                      648956 non-null  float64
 36  ctrip_profits                     445187 non-null  float64
 37  deltaprice_pre2_t1                543180 non-null  float64
 38  lowestprice_pre                   659689 non-null  float64
 39  uv_pre                            660548 non-null  float64
 40  uv_pre2                           661189 non-null  float64
 41  lowestprice_pre2                  660664 non-null  float64
 42  lasthtlordergap                   447831 non-null  float64
 43  businessrate_pre2                 602960 non-null  float64
 44  cityuvs                           682274 non-null  float64
 45  cityorders                        651263 non-null  float64
 46  lastpvgap                         592818 non-null  float64
 47  cr                                457896 non-null  float64
 48  sid                               689945 non-null  int64  
 49  visitnum_oneyear                  592910 non-null  float64
 50  h                                 689945 non-null  int64  
dtypes: float64(44), int64(5), object(2)
memory usage: 268.5+ MB
# Descriptive statistics with extra percentiles
df.describe([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])
label sampleid iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent landhalfhours ordercanncelednum ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
count 689945.000000 6.899450e+05 689945.000000 385450.000000 82915.000000 386525.000000 689148.000000 447831.000000 661312.000000 447831.000000 ... 660664.000000 447831.000000 602960.000000 682274.000000 651263.000000 592818.000000 457896.000000 689945.000000 5.929100e+05 689945.000000
mean 0.274452 6.285402e+05 0.193737 5.317048 1.856094 11.710487 1.060996 0.342119 6.086366 154.179369 ... 318.541812 101830.919400 0.368237 10.648278 2.253250 12049.409382 1.137476 153.702414 1.855185e+04 14.462315
std 0.446238 4.146815e+05 0.395226 38.524483 2.103862 17.251429 0.045264 0.354210 12.413225 398.456986 ... 351.913035 122784.313864 0.219945 15.696682 3.538453 25601.374138 0.204789 277.807697 2.288603e+05 6.301575
min 0.000000 2.463600e+04 0.000000 0.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 ... 1.000000 0.000000 0.000000 0.007000 0.007000 0.000000 1.000000 0.000000 1.000000e+00 0.000000
1% 0.000000 3.620588e+04 0.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 ... 52.000000 244.000000 0.010000 0.013000 0.007000 0.000000 1.000000 1.000000 2.100000e+01 0.000000
10% 0.000000 1.398464e+05 0.000000 1.000000 1.000000 1.000000 1.010000 0.000000 0.000000 0.000000 ... 101.000000 3518.000000 0.050000 0.160000 0.033000 127.000000 1.000000 4.000000 1.610000e+02 6.000000
25% 0.000000 3.123200e+05 0.000000 2.000000 1.000000 2.000000 1.030000 0.000000 0.000000 0.000000 ... 145.000000 14999.000000 0.170000 0.827000 0.127000 551.000000 1.000000 17.000000 4.710000e+02 11.000000
50% 0.000000 5.996370e+05 0.000000 3.000000 1.000000 6.000000 1.050000 0.250000 0.000000 2.000000 ... 233.000000 46890.000000 0.400000 3.527000 0.627000 2848.000000 1.050000 62.000000 1.315000e+03 15.000000
75% 1.000000 8.874600e+05 0.000000 5.000000 2.000000 14.000000 1.090000 0.570000 4.000000 153.000000 ... 388.000000 138953.000000 0.550000 13.327000 2.747000 10726.000000 1.210000 180.000000 3.141000e+03 20.000000
90% 1.000000 1.059705e+06 1.000000 10.000000 3.000000 29.000000 1.120000 0.980000 27.000000 492.000000 ... 611.000000 311492.000000 0.650000 35.567000 7.547000 30384.900000 1.400000 392.000000 6.634000e+03 22.000000
99% 1.000000 2.226893e+06 1.000000 27.000000 7.000000 82.000000 1.190000 1.000000 48.000000 1752.000000 ... 1464.000000 484734.000000 0.780000 66.007000 14.453000 138722.000000 2.000000 1212.000000 2.625670e+05 23.000000
max 1.000000 2.238426e+06 1.000000 3167.000000 106.000000 711.000000 3.180000 1.000000 49.000000 13475.000000 ... 43700.000000 527026.000000 0.990000 67.140000 14.507000 194386.000000 11.000000 9956.000000 9.651192e+06 23.000000

12 rows × 49 columns

# Drop duplicate rows (the shape below is unchanged, so there were none)
df.drop_duplicates(inplace=True)
df.shape
(689945, 51)
# Sort features by their missing-value ratio
null = df.isnull().mean().reset_index().sort_values(0)
null_1 = null.rename(columns={'index':'feature', 0:'missing_rate'})
null_1
feature missing_rate
0 label 0.000000
48 sid 0.000000
4 iforderpv_24h 0.000000
50 h 0.000000
2 d 0.000000
1 sampleid 0.000000
3 arrival 0.000000
8 hotelcr 0.001155
21 hoteluv 0.001155
26 lowestprice 0.002919
44 cityuvs 0.011118
17 cancelrate 0.016984
14 novoters 0.024679
28 customereval_pre2 0.041500
10 landhalfhours 0.041500
40 uv_pre2 0.041679
41 lowestprice_pre2 0.042440
39 uv_pre 0.042608
24 cr_pre 0.042608
38 lowestprice_pre 0.043853
34 novoters_pre2 0.046857
33 cancelrate_pre 0.053526
45 cityorders 0.056065
35 novoters_pre 0.059409
32 commentnums_pre2 0.060132
12 commentnums 0.098437
43 businessrate_pre2 0.126075
30 commentnums_pre 0.132731
49 visitnum_oneyear 0.140642
46 lastpvgap 0.140775
37 deltaprice_pre2_t1 0.212720
22 businessrate_pre 0.298646
13 starprefer 0.326190
20 price_sensitive 0.327719
15 consuming_capacity 0.327719
47 cr 0.336330
25 avgprice 0.337250
23 ordernum_oneyear 0.350918
42 lasthtlordergap 0.350918
11 ordercanncelednum 0.350918
9 ordercanceledprecent 0.350918
36 ctrip_profits 0.354750
31 customer_value_profit 0.363539
29 delta_price2 0.365529
19 delta_price1 0.366405
16 historyvisit_avghotelnum 0.437816
7 historyvisit_totalordernum 0.439774
5 decisionhabit_user 0.441332
27 firstorder_bu 0.453590
18 historyvisit_visit_detailpagenum 0.554698
6 historyvisit_7ordernum 0.879824
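Rather than scanning the table by eye, drop candidates can be picked programmatically; a sketch, with the 80% cutoff as an assumption (only historyvisit_7ordernum exceeds it here):

# Features whose missing ratio exceeds 80%
null_1[null_1['missing_rate'] > 0.8]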
# Density plot of the missing ratios
plt.figure(figsize=(8,6))
sns.kdeplot(null_1['missing_rate'], fill=True)  # fill=True replaces the deprecated shade=True

[Figure 1: density plot of the per-feature missing ratios]

# Bar chart of the per-feature missing ratios
plt.figure(figsize=(8,6))
plt.bar(range(null_1.shape[0]), null_1['missing_rate'], label='missing rate')
plt.legend(loc='best')

[Figure 2: bar chart of the per-feature missing ratios]

# Drop the column with too many missing values (historyvisit_7ordernum is ~88% missing)
df = df.drop(['historyvisit_7ordernum'], axis=1)
df
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_totalordernum hotelcr ordercanceledprecent landhalfhours ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
0 0 24636 2016-05-18 2016-05-18 0 NaN NaN 1.04 NaN 22.0 ... 615.0 NaN 0.29 12.880 3.147 NaN NaN 7 NaN 12
1 1 24637 2016-05-18 2016-05-18 0 NaN NaN 1.06 NaN 0.0 ... 513.0 NaN 0.53 17.933 4.913 NaN NaN 33 NaN 14
2 0 24641 2016-05-18 2016-05-19 0 NaN NaN 1.05 NaN 3.0 ... 382.0 NaN 0.60 3.993 0.760 NaN NaN 10 NaN 19
3 0 24642 2016-05-18 2016-05-18 0 NaN NaN 1.01 NaN 2.0 ... 203.0 NaN 0.18 3.220 0.660 NaN NaN 8 NaN 16
4 1 24644 2016-05-18 2016-05-19 0 NaN NaN 1.00 NaN 0.0 ... 84.0 NaN NaN 0.013 NaN NaN NaN 1 NaN 21
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
689940 1 2238419 2016-05-15 2016-05-17 1 19.0 NaN 1.06 NaN 1.0 ... 406.0 NaN 0.48 13.573 1.660 1034.0 1.0 5 119.0 18
689941 1 2238421 2016-05-15 2016-05-15 1 10.0 3.0 1.06 0.33 49.0 ... 199.0 713.0 0.51 2.880 0.513 179.0 2.0 15 1472.0 12
689942 0 2238422 2016-05-15 2016-05-17 0 NaN NaN 1.07 NaN 0.0 ... 544.0 NaN 0.45 15.293 2.067 0.0 NaN 8 107.0 0
689943 0 2238425 2016-05-15 2016-05-17 0 NaN NaN 1.04 NaN 0.0 ... 156.0 NaN 0.29 2.467 0.333 NaN NaN 4 NaN 0
689944 0 2238426 2016-05-15 2016-05-15 0 NaN NaN 1.02 NaN 0.0 ... 275.0 NaN NaN 12.600 2.653 NaN NaN 2 NaN 11

689945 rows × 50 columns

# Scan for outliers using wide percentiles
df.describe([0.01, 0.25, 0.5, 0.75, 0.99]).T
count mean std min 1% 25% 50% 75% 99% max
label 689945.0 0.274452 0.446238 0.000 0.00000 0.000 0.000 1.000 1.000000e+00 1.000
sampleid 689945.0 628540.209625 414681.498697 24636.000 36205.88000 312320.000 599637.000 887460.000 2.226893e+06 2238426.000
iforderpv_24h 689945.0 0.193737 0.395226 0.000 0.00000 0.000 0.000 0.000 1.000000e+00 1.000
decisionhabit_user 385450.0 5.317048 38.524483 0.000 1.00000 2.000 3.000 5.000 2.700000e+01 3167.000
historyvisit_totalordernum 386525.0 11.710487 17.251429 1.000 1.00000 2.000 6.000 14.000 8.200000e+01 711.000
hotelcr 689148.0 1.060996 0.045264 1.000 1.00000 1.030 1.050 1.090 1.190000e+00 3.180
ordercanceledprecent 447831.0 0.342119 0.354210 0.000 0.00000 0.000 0.250 0.570 1.000000e+00 1.000
landhalfhours 661312.0 6.086366 12.413225 0.000 0.00000 0.000 0.000 4.000 4.800000e+01 49.000
ordercanncelednum 447831.0 154.179369 398.456986 0.000 0.00000 0.000 2.000 153.000 1.752000e+03 13475.000
commentnums 622029.0 1272.090888 2101.871601 0.000 1.00000 115.000 514.000 1670.000 8.796000e+03 34189.000
starprefer 464892.0 67.532304 19.175094 0.000 20.00000 53.300 69.400 80.300 1.000000e+02 100.000
novoters 672918.0 1706.247901 2811.690007 1.000 1.00000 157.000 692.000 2196.000 1.157600e+04 45455.000
consuming_capacity 463837.0 39.154140 23.240147 0.000 8.00000 22.000 33.000 51.000 1.000000e+02 100.000
historyvisit_avghotelnum 387876.0 6.510179 41.045261 0.000 1.00000 2.000 4.000 7.000 2.900000e+01 3167.000
cancelrate 678227.0 1051.604143 1509.066134 1.000 2.00000 137.000 503.000 1373.000 6.399000e+03 18930.000
historyvisit_visit_detailpagenum 307234.0 37.153603 73.402891 1.000 1.00000 6.000 18.000 44.000 2.620000e+02 6199.000
delta_price1 437146.0 79.067012 512.942824 -99879.000 -1227.55000 -31.000 81.000 226.000 1.081000e+03 5398.000
price_sensitive 463837.0 24.645863 26.685606 0.000 0.00000 5.000 16.000 33.000 1.000000e+02 100.000
hoteluv 689148.0 95.092708 169.981527 0.007 0.16700 10.427 36.180 107.747 9.641130e+02 1722.613
businessrate_pre 483896.0 0.372717 0.232791 0.000 0.01000 0.150 0.390 0.570 8.000000e-01 0.990
ordernum_oneyear 447831.0 11.642061 17.137209 1.000 1.00000 2.000 6.000 14.000 8.100000e+01 711.000
cr_pre 660548.0 1.062906 0.044588 1.000 1.00000 1.030 1.060 1.090 1.190000e+00 2.950
avgprice 457261.0 422.458701 290.853332 1.000 91.00000 232.000 350.000 524.000 1.491000e+03 6383.000
lowestprice 687931.0 318.806242 575.782415 -3.000 37.00000 116.000 200.000 380.000 1.823000e+03 100000.000
firstorder_bu 376993.0 11.697795 2.746821 1.000 3.00000 12.000 13.000 13.000 1.700000e+01 21.000
customereval_pre2 661312.0 3.048519 1.226635 0.000 0.00000 2.000 3.000 4.000 5.500000e+00 6.000
delta_price2 437750.0 77.277208 391.413839 -43344.000 -949.00000 -29.000 69.000 198.000 1.018000e+03 5114.000
commentnums_pre 598368.0 1415.159561 2329.418922 0.000 1.00000 137.000 592.000 1862.000 9.732000e+03 34189.000
customer_value_profit 439123.0 3.038409 6.625281 -24.075 -0.29678 0.269 0.991 3.138 2.845100e+01 598.064
commentnums_pre2 648457.0 1313.388737 1719.513354 0.000 3.00000 270.000 768.000 1780.000 7.457000e+03 34189.000
cancelrate_pre 653015.0 0.344422 0.179147 0.000 0.05000 0.230 0.320 0.420 1.000000e+00 1.000
novoters_pre2 657616.0 1787.197614 2316.712985 1.000 5.00000 391.000 1054.000 2413.000 1.001800e+04 45436.000
novoters_pre 648956.0 1890.698450 3116.120062 1.000 2.00000 187.000 783.000 2453.000 1.383900e+04 45436.000
ctrip_profits 445187.0 4.208495 9.314438 -44.313 -0.39300 0.340 1.347 4.320 4.075580e+01 600.820
deltaprice_pre2_t1 543180.0 3.283740 48.805880 -2296.000 -103.00000 -3.000 2.000 10.000 1.110000e+02 3324.000
lowestprice_pre 659689.0 315.954583 463.723643 1.000 38.00000 118.000 208.000 385.000 1.750000e+03 100000.000
uv_pre 660548.0 107.846076 186.731907 0.007 0.24000 12.533 42.500 124.707 1.047787e+03 1722.613
uv_pre2 661189.0 103.352990 157.117863 0.007 0.50000 17.563 51.287 126.200 8.567254e+02 1722.613
lowestprice_pre2 660664.0 318.541812 351.913035 1.000 52.00000 145.000 233.000 388.000 1.464000e+03 43700.000
lasthtlordergap 447831.0 101830.919400 122784.313864 0.000 244.00000 14999.000 46890.000 138953.000 4.847340e+05 527026.000
businessrate_pre2 602960.0 0.368237 0.219945 0.000 0.01000 0.170 0.400 0.550 7.800000e-01 0.990
cityuvs 682274.0 10.648278 15.696682 0.007 0.01300 0.827 3.527 13.327 6.600700e+01 67.140
cityorders 651263.0 2.253250 3.538453 0.007 0.00700 0.127 0.627 2.747 1.445300e+01 14.507
lastpvgap 592818.0 12049.409382 25601.374138 0.000 0.00000 551.000 2848.000 10726.000 1.387220e+05 194386.000
cr 457896.0 1.137476 0.204789 1.000 1.00000 1.000 1.050 1.210 2.000000e+00 11.000
sid 689945.0 153.702414 277.807697 0.000 1.00000 17.000 62.000 180.000 1.212000e+03 9956.000
visitnum_oneyear 592910.0 18551.846682 228860.311117 1.000 21.00000 471.000 1315.000 3141.000 2.625670e+05 9651192.000
h 689945.0 14.462315 6.301575 0.000 0.00000 11.000 15.000 20.000 2.300000e+01 23.000
# Inspect the outlier columns: a negative price and implausibly high prices appear here
df[['lowestprice_pre', 'lowestprice']].describe([0.01, 0.25, 0.5, 0.75, 0.99]).T
count mean std min 1% 25% 50% 75% 99% max
lowestprice_pre 659689.0 315.954583 463.723643 1.0 38.0 118.0 208.0 385.0 1750.0 100000.0
lowestprice 687931.0 318.806242 575.782415 -3.0 37.0 116.0 200.0 380.0 1823.0 100000.0
# Columns to cap
col_block = ['lowestprice_pre', 'lowestprice']
# Capping (winsorization) functions: clip values beyond the 1st/99th percentiles
def block_upper(x):
    upper = x.quantile(0.99)
    out = x.mask(x > upper, upper)
    return out

def block_lower(x):
    lower = x.quantile(0.01)
    out = x.mask(x < lower, lower)
    return out
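An equivalent one-liner uses Series.clip with both quantile bounds at once; a sketch of the alternative:

# Equivalent capping via clip: winsorize at the 1st and 99th percentiles
def block_both(x):
    return x.clip(lower=x.quantile(0.01), upper=x.quantile(0.99))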
# Apply the caps (note: the quantiles are computed on the full dataset, before the train/test split)
df[col_block] = df[col_block].apply(block_upper)

df[col_block] = df[col_block].apply(block_lower)

df[['lowestprice_pre', 'lowestprice']].describe([0.01, 0.25, 0.5, 0.75, 0.99]).T
count mean std min 1% 25% 50% 75% 99% max
lowestprice_pre 659689.0 304.439507 287.192512 38.0 38.0 118.0 208.0 385.0 1750.0 1750.0
lowestprice 687931.0 305.025771 297.382838 37.0 37.0 116.0 200.0 380.0 1823.0 1823.0
# Deep copy so later changes don't modify the original DataFrame
df_copy = df.copy(deep=True)

# Features: drop the label and sampleid columns
X = df_copy.iloc[:, 2:]

# Label column
y = df_copy.iloc[:, 0]
X.head(10)
d arrival iforderpv_24h decisionhabit_user historyvisit_totalordernum hotelcr ordercanceledprecent landhalfhours ordercanncelednum commentnums ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
0 2016-05-18 2016-05-18 0 NaN NaN 1.04 NaN 22.0 NaN 1089.0 ... 615.0 NaN 0.29 12.880 3.147 NaN NaN 7 NaN 12
1 2016-05-18 2016-05-18 0 NaN NaN 1.06 NaN 0.0 NaN 5612.0 ... 513.0 NaN 0.53 17.933 4.913 NaN NaN 33 NaN 14
2 2016-05-18 2016-05-19 0 NaN NaN 1.05 NaN 3.0 NaN 256.0 ... 382.0 NaN 0.60 3.993 0.760 NaN NaN 10 NaN 19
3 2016-05-18 2016-05-18 0 NaN NaN 1.01 NaN 2.0 NaN NaN ... 203.0 NaN 0.18 3.220 0.660 NaN NaN 8 NaN 16
4 2016-05-18 2016-05-19 0 NaN NaN 1.00 NaN 0.0 NaN NaN ... 84.0 NaN NaN 0.013 NaN NaN NaN 1 NaN 21
5 2016-05-18 2016-05-20 0 NaN NaN 1.02 NaN 0.0 NaN 15.0 ... 408.0 NaN NaN 2.880 0.427 NaN NaN 1 NaN 21
6 2016-05-18 2016-05-25 0 NaN NaN 1.12 NaN 0.0 NaN 2578.0 ... 145.0 NaN NaN 4.427 0.493 NaN NaN 1 NaN 22
7 2016-05-18 2016-05-20 0 3.0 21.0 1.11 0.79 0.0 395.0 NaN ... 204.0 10475.0 0.53 12.713 1.987 7566.0 1.5 23 1265.0 17
8 2016-05-18 2016-05-19 0 13.0 NaN 1.08 NaN 0.0 NaN 2572.0 ... 99.0 NaN 0.41 5.393 0.860 15.0 1.0 20 596.0 20
9 2016-05-18 2016-06-08 1 2.0 7.0 1.07 0.86 47.0 6.0 NaN ... 191.0 18873.0 0.52 3.093 0.287 288.0 1.0 31 21926.0 7

10 rows × 48 columns

# Train/test split (default 75/25)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Drop the date columns from both sets
col_date = ['d', 'arrival']
X_train = X_train.drop(col_date, axis=1)
X_test = X_test.drop(col_date, axis=1)
X_train.shape
(517458, 46)
# Group features by type so each group gets a suitable imputation strategy
col = X_train.columns.tolist()
col_no = ['sid', 'iforderpv_24h', 'h'] # features with no missing values (the two date features are already dropped)
col_clf = ['decisionhabit_user'] # categorical-like feature
col_neg = ['delta_price1', 'delta_price2', 'customer_value_profit', 'ctrip_profits', 'deltaprice_pre2_t1'] # features containing negative values
col_35 = ['ordernum_oneyear', 'lasthtlordergap', 'ordercanncelednum',
          'ordercanceledprecent', 'ctrip_profits', 'historyvisit_avghotelnum', 'historyvisit_totalordernum',  # features with more than 35% missing values
          'decisionhabit_user', 'firstorder_bu', 'historyvisit_visit_detailpagenum']
col_std = X_train.columns[X_train.describe(include='all').T['std'] > 100].to_list() # columns with standard deviation greater than 100
col_std.remove('sid')
col_std.remove('delta_price2')
col_std.remove('delta_price1')
col_std.remove('lasthtlordergap')
col_norm = list(set(col) - set(col_no + col_clf + col_neg + col_35))
# Impute the training set: mode for the categorical-like feature, median for the negative-valued
# features, -1 as a missing indicator for the heavily missing group, mean for the rest
X_train[col_clf] = X_train[col_clf].fillna(X_train[col_clf].mode().iloc[0])    # mode() returns a DataFrame, so take its first row

X_train[col_neg] = X_train[col_neg].fillna(X_train[col_neg].median())

X_train[col_35] = X_train[col_35].fillna(-1)

X_train[col_std] = X_train[col_std].fillna(X_train[col_std].median())

X_train[col_norm] = X_train[col_norm].fillna(X_train[col_norm].mean())
# Impute the test set the same way (its statistics are recomputed on the test set here; see the note below)
X_test[col_clf] = X_test[col_clf].fillna(X_test[col_clf].mode().iloc[0])

X_test[col_neg] = X_test[col_neg].fillna(X_test[col_neg].median())

X_test[col_35] = X_test[col_35].fillna(-1)

X_test[col_std] = X_test[col_std].fillna(X_test[col_std].median())

X_test[col_norm] = X_test[col_norm].fillna(X_test[col_norm].mean())
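Strictly speaking, the test set should be imputed with statistics computed on the training set only; recomputing medians and means on the test set, as above, lets a little test-distribution information leak in. A leak-free variant might look like this (a sketch, not what was run here):

# Fit fill values on the training set, then apply the same values to the test set
fill_means = X_train[col_norm].mean()
fill_medians = X_train[col_neg].median()
X_test[col_norm] = X_test[col_norm].fillna(fill_means)
X_test[col_neg] = X_test[col_neg].fillna(fill_medians)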
# Confirm no missing values remain
X_train.isnull().any().sum()
0
X_test.isnull().any().sum()
0
X_train.shape
(517458, 46)
X_test.shape
(172487, 46)
# Variance filter
selector = VarianceThreshold()

X_train_var = selector.fit_transform(X_train)
X_train_var.shape
(517458, 46)
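With the default threshold=0, VarianceThreshold only removes constant columns, and none are constant here (46 columns in, 46 out). A stricter filter would pass an explicit threshold; the value below is a hypothetical example:

selector_strict = VarianceThreshold(threshold=0.01)  # example threshold, not used in this run
X_train_var_strict = selector_strict.fit_transform(X_train)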
# ANOVA F-test of each feature against the label
f, p_values = f_classif(X_train, y_train)

(p_values > 0.01).sum()
6
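So six features have p-values above 0.01, meaning their relationship with the label is not statistically significant at that level. To see which ones are about to be dropped:

# Features whose F-test p-value exceeds 0.01
X_train.columns[p_values > 0.01].to_list()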
# Keep only the features with F-test p-value <= 0.01
col_f = X_train.columns[p_values <= 0.01]
X_train = X_train[col_f]
X_train.shape
(517458, 40)
X_test = X_test[col_f]
X_test.shape
(172487, 40)
# Reset indices
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
# Fit a random forest to obtain feature importances
rfc = RFC(n_estimators=10, random_state=42)
importances = rfc.fit(X_train, y_train).feature_importances_
importances
array([0.0116028 , 0.01896167, 0.01802851, 0.0150776 , 0.01591757,
       0.02015549, 0.02034934, 0.0203898 , 0.01943307, 0.0211841 ,
       0.01791408, 0.02056686, 0.02291317, 0.02291462, 0.02051174,
       0.02323111, 0.02106971, 0.01054197, 0.02306534, 0.01938803,
       0.0258305 , 0.02571515, 0.02508617, 0.02485002, 0.02534584,
       0.02850795, 0.02408976, 0.02625239, 0.0279376 , 0.02743348,
       0.02772216, 0.03058076, 0.02754623, 0.04068007, 0.03904556,
       0.03655198, 0.03553142, 0.03861745, 0.04116244, 0.03829647])
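The bare array is hard to read; pairing the importances with the column names makes the ranking explicit:

# Map importances back to feature names, highest first
pd.Series(importances, index=X_train.columns).sort_values(ascending=False).head(10)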
# Embedded selection with cross-validation: for each importance threshold, select features and score them
scores = []
thresholds = np.linspace(0, importances.max(), 20)
for i in thresholds:
    time0 = time()
    X_embedded = SelectFromModel(rfc, threshold=i).fit_transform(X_train, y_train)
    score = cross_val_score(rfc, X_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)
    # hack: format elapsed seconds as a timestamp; only valid for runs under an hour
    print(datetime.datetime.fromtimestamp(time() - time0).strftime('%M:%S:%f'))

plt.plot(thresholds, scores)
plt.show()
01:12:090613
01:13:526636
01:09:249811
01:08:676264
01:07:224143
01:06:510684
01:14:200829
01:13:976302
01:13:173232
01:07:879761
01:05:081422
01:03:703478
00:57:668075
00:55:913328
00:49:275084
00:47:250707
00:48:596243
00:53:333874
00:45:020932
00:53:343730

[Figure 3: cross-validated score vs. importance threshold]

# Best cross-validated score
max(scores)
0.9507844100831351
# Threshold corresponding to the best score
thresholds[scores.index(max(scores))]
0.028163774952387383
col_k = X_train.columns[importances > 0.028163774952387383].to_list()
X_train_embedded = X_train[col_k]
X_train_embedded.head()
ctrip_profits lasthtlordergap cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
0 1.347 -1.0 3.787 0.387 2850.0 1.137405 3 1314.0 6
1 1.347 -1.0 0.127 0.007 2850.0 1.137405 7 1314.0 13
2 0.767 -1.0 18.973 3.600 7272.0 1.137405 457 348.0 12
3 15.433 1986.0 1.507 0.287 47.0 1.160000 430 20273.0 8
4 1.347 -1.0 1.433 0.167 20.0 1.000000 85 83.0 13
X_test_embedded = X_test[col_k]
X_test_embedded.head()
ctrip_profits lasthtlordergap cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
0 3.940 7224.0 7.147 0.580000 539.0 1.150000 220 4542.0 4
1 1.347 40911.0 0.447 0.060000 3.0 1.137687 81 3156.0 3
2 0.887 -1.0 4.313 0.460000 6532.0 1.000000 81 1026.0 17
3 1.347 -1.0 0.460 0.053000 363.0 1.000000 27 349.0 22
4 1.540 82256.0 0.060 2.246314 41.0 1.170000 63 811.0 10
# Correlation heatmap of the selected features
plt.figure(figsize=(10,8))
sns.heatmap(X_train_embedded.corr(), annot=True, linewidths=1)

[Figure 4: correlation heatmap of the selected features]
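Instead of reading the heatmap visually, strongly correlated pairs can be listed directly; a sketch, with the 0.8 cutoff as an assumption:

# Feature pairs whose absolute correlation exceeds 0.8
corr = X_train_embedded.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep each pair once
high = upper.stack()
high[high > 0.8]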

# Drop one of the highly correlated features (cityuvs is strongly correlated with cityorders);
# reassigning instead of dropping inplace avoids a SettingWithCopyWarning on the slice
X_train_embedded = X_train_embedded.drop('cityuvs', axis=1)
X_test_embedded = X_test_embedded.drop('cityuvs', axis=1)
# Save the cleaned data
X_train_embedded.to_csv('X_train_embedded.csv')
X_test_embedded.to_csv('X_test_embedded.csv')
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')
# Reload the data
X_train_embedded = pd.read_csv('X_train_embedded.csv', index_col=0)
X_test_embedded = pd.read_csv('X_test_embedded.csv', index_col=0)
y_train = pd.read_csv('y_train.csv', index_col=0)
y_test = pd.read_csv('y_test.csv', index_col=0)
y_train = np.ravel(y_train)
y_train.shape
(517458,)
y_test = np.ravel(y_test)
y_test.shape
(172487,)
# Glossary of the remaining features:
# 'lasthtlordergap': minutes since the user's last order within the past year
# 'cityorders': app orders submitted yesterday for the current city and check-in date
# 'lastpvgap': time since the user's last visit within the past year
# 'cr': user conversion rate
# 'sid': session id (sid=1 can be treated as a first visit)
# 'visitnum_oneyear': number of visits within the past year
# 'h': hour of the visit
import scipy.stats    # chi2_contingency is used in the binning below

# Recombine features and label for WOE binning
woe_data = pd.concat([X_train_embedded, pd.Series(y_train, name='label')], axis=1)
woe_data
ctrip_profits lasthtlordergap cityorders lastpvgap cr sid visitnum_oneyear h label
0 1.347 -1.0 0.387 2850.0 1.137405 3 1314.0 6 0
1 1.347 -1.0 0.007 2850.0 1.137405 7 1314.0 13 0
2 0.767 -1.0 3.600 7272.0 1.137405 457 348.0 12 1
3 15.433 1986.0 0.287 47.0 1.160000 430 20273.0 8 1
4 1.347 -1.0 0.167 20.0 1.000000 85 83.0 13 0
... ... ... ... ... ... ... ... ... ...
517453 1.347 -1.0 0.113 4347.0 1.137405 8 278.0 21 0
517454 1.347 41045.0 0.520 2972.0 1.330000 25 1095.0 0 0
517455 1.347 113046.0 0.093 522.0 1.137405 120 6309.0 16 0
517456 -0.067 266544.0 0.600 28378.0 1.000000 22 100.0 9 0
517457 1.347 -1.0 0.420 2850.0 1.137405 5 1314.0 17 0

517458 rows × 9 columns

# Compute the WOE table from a list of bins
def get_woe(num_bins):
    columns = ['min', 'max', 'count_0', 'count_1']
    df = pd.DataFrame(num_bins, columns=columns)
    
    df['total'] = df['count_0'] + df['count_1']
    df['percentage'] = df['total'] / df['total'].sum()
    df['bad_rate'] = df['count_1'] / df['total']
    df['good%'] = df['count_0'] / df['count_0'].sum()
    df['bad%'] = df['count_1'] / df['count_1'].sum()
    df['good-bad'] = df['good%'] - df['bad%']
    df['woe'] = np.log(df['good%'] / df['bad%'])
    return df
# Compute and return the IV value from a WOE table
def get_iv(df):
    iv = np.sum(df['good-bad'] * df['woe'])
    return iv
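For reference, the quantities these helpers compute are, for each bin i: WOE_i = ln(good%_i / bad%_i), and IV = sum_i (good%_i - bad%_i) * WOE_i, where good% and bad% are the bin's share of all non-churners and churners respectively. A larger IV means the binned feature separates the two classes better; features with very small IV carry little information about the label.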
# Return the detailed WOE table for a feature split into q quantile bins
def get_bin(X, q):
    df = woe_data.copy()
    df['qcut'], updown = pd.qcut(df[X], retbins=True, q=q, duplicates='drop')
    count_0 = df[df['label']==0].groupby('qcut').count()['label']
    count_1 = df[df['label']==1].groupby('qcut').count()['label']
    num_bins = [*zip(updown,updown[1:],count_0,count_1)]
    woe_df = get_woe(num_bins)
    
    return woe_df
# Plot IV vs. the number of bins while merging adjacent bins by chi-square p-value (ChiMerge-style)
def get_graph(X, n=2, q=20):
    df = woe_data.copy()
    df['qcut'], updown = pd.qcut(df[X], retbins=True, q=q, duplicates='drop')
    count_0 = df[df['label']==0].groupby('qcut').count()['label']
    count_1 = df[df['label']==1].groupby('qcut').count()['label']
    num_bins = [*zip(updown,updown[1:],count_0,count_1)]

    IV = []
    axisx = []

    while len(num_bins) > n:
        pvs = []
        for i in range(len(num_bins)-1):
            x1 = num_bins[i][2:]
            x2 = num_bins[i+1][2:]
            pv = scipy.stats.chi2_contingency([x1,x2])[1]
            pvs.append(pv)
        i = pvs.index(max(pvs))
        num_bins[i:i+2] = [(num_bins[i][0], num_bins[i+1][1],
                           num_bins[i][2] + num_bins[i+1][2],
                           num_bins[i][3] + num_bins[i+1][3])]
        woe_df = get_woe(num_bins)
        axisx.append(len(num_bins))
        IV.append(get_iv(woe_df))

    plt.figure()
    plt.plot(axisx, IV)
    plt.xticks(axisx)
    plt.xlabel("number of box")
    plt.ylabel("IV")
    plt.show()
col_woe = ['ctrip_profits', 'lasthtlordergap', 'cityorders',
          'lastpvgap', 'cr', 'sid', 'visitnum_oneyear', 'h']
for i in col_woe:
    print(i)
    get_graph(i)
ctrip_profits
[Figure 5: IV vs. number of bins for ctrip_profits]

lasthtlordergap
[Figure 6: IV vs. number of bins for lasthtlordergap]

cityorders
[Figure 7: IV vs. number of bins for cityorders]

lastpvgap
[Figure 8: IV vs. number of bins for lastpvgap]

cr
[Figure 9: IV vs. number of bins for cr]

sid
[Figure 10: IV vs. number of bins for sid]

visitnum_oneyear
[Figure 11: IV vs. number of bins for visitnum_oneyear]

h
[Figure 12: IV vs. number of bins for h]

# Users whose last order was 2,356-29,219 minutes ago churn at a relatively high rate
get_bin('lasthtlordergap', 10)
min max count_0 count_1 total percentage bad_rate good% bad% good-bad woe
0 -1.0 2356.0 158912 48086 206998 0.400029 0.232302 0.423198 0.338741 0.084457 0.222603
1 2356.0 13291.0 30983 20754 51737 0.099983 0.401144 0.082511 0.146201 -0.063691 -0.572057
2 13291.0 29219.0 34175 17574 51749 0.100006 0.339601 0.091011 0.123800 -0.032789 -0.307683
3 29219.0 56455.0 35511 16229 51740 0.099989 0.313664 0.094569 0.114325 -0.019756 -0.189714
4 56455.0 110984.0 37245 14499 51744 0.099997 0.280206 0.099187 0.102138 -0.002951 -0.029318
5 110984.0 232020.0 39177 12572 51749 0.100006 0.242942 0.104332 0.088563 0.015769 0.163861
6 232020.0 527026.0 39500 12241 51741 0.099991 0.236582 0.105192 0.086232 0.018961 0.198753
# Users with a conversion rate below 1.12 are retained more often
get_bin('cr',10)
min max count_0 count_1 total percentage bad_rate good% bad% good-bad woe
0 1.000000 1.120000 166695 42471 209166 0.404218 0.203049 0.443925 0.299186 0.144738 0.394588
1 1.120000 1.137405 135099 47248 182347 0.352390 0.259110 0.359781 0.332838 0.026944 0.077841
2 1.137405 1.170000 15752 7000 22752 0.043969 0.307665 0.041949 0.049311 -0.007362 -0.161699
3 1.170000 1.330000 37674 24461 62135 0.120077 0.393675 0.100329 0.172315 -0.071986 -0.540866
4 1.330000 11.000000 20283 20775 41058 0.079346 0.505992 0.054016 0.146349 -0.092334 -0.996724
# Retention gradually falls as the city order volume grows, except for a clear rebound in the 1.4-2.26 bin
get_bin('cityorders',10)
min max count_0 count_1 total percentage bad_rate good% bad% good-bad woe
0 0.007000 0.033000 44555 11516 56071 0.108359 0.205382 0.118654 0.081124 0.037530 0.380231
1 0.033000 0.093000 39744 10584 50328 0.097260 0.210300 0.105842 0.074559 0.031283 0.350359
2 0.093000 0.200000 40294 11095 51389 0.099310 0.215902 0.107307 0.078159 0.029148 0.316952
3 0.200000 0.380000 37656 11761 49417 0.095500 0.237995 0.100281 0.082850 0.017431 0.190947
4 0.380000 0.753000 37170 14675 51845 0.100192 0.283055 0.098987 0.103378 -0.004391 -0.043400
5 0.753000 1.400000 35492 16268 51760 0.100027 0.314297 0.094519 0.114600 -0.020081 -0.192649
6 1.400000 2.255565 49463 16530 65993 0.127533 0.250481 0.131725 0.116445 0.015279 0.123292
7 2.255565 3.260000 25531 11738 37269 0.072023 0.314953 0.067991 0.082688 -0.014697 -0.195694
8 3.260000 6.633000 32872 18901 51773 0.100053 0.365074 0.087541 0.133148 -0.045607 -0.419350
9 6.633000 14.507000 32726 18887 51613 0.099743 0.365935 0.087152 0.133049 -0.045897 -0.423060
# Users who visit after 7 pm churn less; daytime visitors churn more
get_bin('h',10)
min max count_0 count_1 total percentage bad_rate good% bad% good-bad woe
0 0.0 6.0 42287 15678 57965 0.112019 0.270474 0.112614 0.110443 0.002171 0.019465
1 6.0 10.0 46850 22957 69807 0.134904 0.328864 0.124766 0.161720 -0.036954 -0.259428
2 10.0 12.0 34400 16455 50855 0.098279 0.323567 0.091610 0.115917 -0.024307 -0.235329
3 12.0 13.0 19815 8752 28567 0.055206 0.306367 0.052769 0.061653 -0.008884 -0.155599
4 13.0 15.0 38660 18137 56797 0.109762 0.319330 0.102955 0.127766 -0.024811 -0.215905
5 15.0 17.0 42537 18293 60830 0.117555 0.300723 0.113280 0.128865 -0.015585 -0.128901
6 17.0 19.0 40305 16122 56427 0.109047 0.285714 0.107336 0.113571 -0.006235 -0.056466
7 19.0 21.0 49680 15527 65207 0.126014 0.238119 0.132303 0.109380 0.022923 0.190266
8 21.0 22.0 31405 6631 38036 0.073505 0.174335 0.083634 0.046712 0.036922 0.582455
9 22.0 23.0 29564 3403 32967 0.063710 0.103224 0.078732 0.023972 0.054759 1.189144
# Higher customer value does not always mean lower churn; the 1.147-1.347 bin has the lowest churn rate
get_bin('ctrip_profits',10)
min max count_0 count_1 total percentage bad_rate good% bad% good-bad woe
0 -44.313 0.147 37921 13979 51900 0.100298 0.269345 0.100987 0.098475 0.002512 0.025192
1 0.147 0.500 37673 13923 51596 0.099711 0.269846 0.100327 0.098080 0.002246 0.022645
2 0.500 1.147 37713 14400 52113 0.100710 0.276323 0.100433 0.101441 -0.001007 -0.009980
3 1.147 1.347 150615 44404 195019 0.376879 0.227691 0.401102 0.312803 0.088299 0.248641
4 1.347 1.587 8296 3333 11629 0.022473 0.286611 0.022093 0.023479 -0.001386 -0.060856
5 1.587 3.220 36089 15701 51790 0.100085 0.303167 0.096108 0.110605 -0.014497 -0.140493
6 3.220 7.327 35310 16403 51713 0.099937 0.317193 0.094034 0.115551 -0.021517 -0.206054
7 7.327 600.820 31886 19812 51698 0.099908 0.383226 0.084915 0.139565 -0.054650 -0.496877
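The conclusion drawn next can be checked numerically by scoring every candidate feature at a fixed bin count; a sketch using the helpers above, with 10 bins as an assumption:

# IV of each feature at (up to) 10 quantile bins; low-IV features are drop candidates
for c in col_woe:
    print(c, round(get_iv(get_bin(c, 10)), 4))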
# lastpvgap, sid and visitnum_oneyear have IV values that are too small, so drop these three features
X_train_woe = X_train_embedded[['ctrip_profits', 'lasthtlordergap', 'cityorders', 'cr', 'h']]
X_test_woe = X_test_embedded[['ctrip_profits', 'lasthtlordergap', 'cityorders', 'cr', 'h']]
# Learning curve over max_depth
scores = []
time0 = time()
for i in np.arange(5,21,1):
    rfc = RFC(n_estimators=10, max_depth=i, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)

print('Elapsed: {}'.format(datetime.datetime.fromtimestamp(time()-time0).strftime('%M:%S:%f')))
print('Best score: {}, best max_depth: {}'.format(max(scores), np.arange(5,21,1)[np.argmax(scores)]))
plt.figure(figsize=(10,8))
plt.plot(np.arange(5,21,1), scores)
plt.show()
Elapsed: 03:27:049065
Best score: 0.8891446300224614, best max_depth: 20

[Figure 13: cross-validated score vs. max_depth]

# Learning curve over min_samples_split
scores = []
time0 = time()
for i in np.arange(2,10,1):
    rfc = RFC(n_estimators=10, max_depth=20, min_samples_split=i, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)

print('Elapsed: {}'.format(datetime.datetime.fromtimestamp(time()-time0).strftime('%M:%S:%f')))
print('Best score: {}, best min_samples_split: {}'.format(max(scores), np.arange(2,10,1)[np.argmax(scores)]))
plt.figure(figsize=(10,8))
plt.plot(np.arange(2,10,1), scores)
plt.show()
Elapsed: 02:16:546873
Best score: 0.8891446300224614, best min_samples_split: 2

[Figure 14: cross-validated score vs. min_samples_split]

# Learning curve over min_samples_leaf
scores = []
time0 = time()
for i in np.arange(1,10,1):
    rfc = RFC(n_estimators=10, max_depth=20, min_samples_split=2, min_samples_leaf=i, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)

print('Elapsed: {}'.format(datetime.datetime.fromtimestamp(time()-time0).strftime('%M:%S:%f')))
print('Best score: {}, best min_samples_leaf: {}'.format(max(scores), np.arange(1,10,1)[np.argmax(scores)]))
plt.figure(figsize=(10,8))
plt.plot(np.arange(1,10,1), scores)
plt.show()
Elapsed: 02:13:264602
Best score: 0.8891446300224614, best min_samples_leaf: 1

[Figure 15: cross-validated score vs. min_samples_leaf]

# Learning curve over n_estimators (cv=3 here to save time)
scores = []
time0 = time()
for i in np.arange(10,201,10):
    rfc = RFC(n_estimators=i, max_depth=20, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=3, n_jobs=-1).mean()
    scores.append(score)
    # time0 is set once before the loop, so these readings are cumulative; the %M:%S format wraps past one hour
    print('Elapsed: {}'.format(datetime.datetime.fromtimestamp(time()-time0).strftime('%M:%S:%f')))
print('Best score: {}, best n_estimators: {}'.format(max(scores), np.arange(10,201,10)[np.argmax(scores)]))
plt.figure(figsize=(10,8))
plt.plot(np.arange(10,201,10), scores)
plt.show()
Elapsed: 00:33:316156
Elapsed: 01:39:789986
Elapsed: 03:21:423289
Elapsed: 05:40:185622
Elapsed: 08:30:948403
Elapsed: 11:56:105026
Elapsed: 15:51:769696
Elapsed: 20:36:914026
Elapsed: 25:38:212320
Elapsed: 31:10:214501
Elapsed: 37:27:759795
Elapsed: 44:10:525391
Elapsed: 51:28:059915
Elapsed: 59:16:547367
Elapsed: 07:41:280818
Elapsed: 16:38:253700
Elapsed: 26:07:070954
Elapsed: 36:06:570602
Elapsed: 48:40:747174
Elapsed: 00:14:880475
Best score: 0.8988864801364768, best n_estimators: 100

[Figure 16: cross-validated score vs. n_estimators]
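Tuning one hyperparameter at a time, as above, ignores interactions between them; a joint grid search is the usual alternative, though considerably slower. A sketch with a hypothetical grid:

from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [50, 100, 150], 'max_depth': [15, 20, 25]}
grid = GridSearchCV(RFC(random_state=42), param_grid, cv=3, n_jobs=-1)
# grid.fit(X_train_embedded, y_train); grid.best_params_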

# Fit the final model with the chosen parameters
rfc = RFC(n_estimators=100, max_depth=20, random_state=42).fit(X_train_embedded, y_train)
# Scores on the training and test sets
print('Training score: {}'.format(rfc.score(X_train_embedded, y_train)))
print('Test score: {}'.format(rfc.score(X_test_embedded, y_test)))
Training score: 0.9144162424776504
Test score: 0.8858812548192038
# Feature importances of the final model
rfc.feature_importances_
array([0.12193391, 0.12869867, 0.14163503, 0.13799971, 0.10331983,
       0.12834216, 0.14547079, 0.09259991])
# Predicted probabilities on the test set (roc_curve below compares them against y_test)
y_scores = rfc.predict_proba(X_test_embedded)
y_scores
array([[0.42810513, 0.57189487],
       [0.90260411, 0.09739589],
       [0.87430575, 0.12569425],
       ...,
       [0.3633656 , 0.6366344 ],
       [0.79112542, 0.20887458],
       [0.14388709, 0.85611291]])
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:, 1])
roc_auc = auc(fpr, tpr)
roc_auc
0.9680887508287116
# Plot the ROC curve
def draw_roc(roc_auc, fpr, tpr):
    plt.subplots(figsize=(7,5.5))
    plt.plot(fpr, tpr, color='orange', label='ROC curve (area = {:.4f})'.format(roc_auc))
    plt.plot([0,1], [0,1], color='blue', linestyle='--')
    plt.xlabel('fpr')
    plt.ylabel('tpr')
    plt.xlim([0,1])
    plt.ylim([0,1.05])
    plt.title('ROC Curve')
    plt.legend(loc=4)
    plt.show()

draw_roc(roc_auc, fpr, tpr)

[Figure 17: ROC curve of the final model]
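Given the class imbalance, per-class precision and recall are worth checking alongside accuracy and AUC; a quick sketch:

from sklearn.metrics import classification_report

y_pred = rfc.predict(X_test_embedded)
print(classification_report(y_test, y_pred, target_names=['retained', 'churned']))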

# RFM analysis: select recency (R), frequency (F) and monetary (M) proxies
rfm = df[['sampleid','ordernum_oneyear','avgprice','lasthtlordergap']]
rfm.head()
sampleid ordernum_oneyear avgprice lasthtlordergap
0 24636 NaN NaN NaN
1 24637 NaN NaN NaN
2 24641 NaN NaN NaN
3 24642 NaN NaN NaN
4 24644 NaN NaN NaN
# Drop missing rows and rename the columns to F, M and R
rfm = rfm.dropna().reset_index(drop=True).rename(columns={'ordernum_oneyear':'F', 'avgprice':'M', 'lasthtlordergap':'R'})
rfm.head()
sampleid F M R
0 24650 21.0 363.0 10475.0
1 24653 7.0 307.0 18873.0
2 24655 1.0 343.0 32071.0
3 24658 33.0 1000.0 4616.0
4 24662 4.0 685.0 44830.0
# R turns out to be in minutes; convert it to days
rfm['R'] = round(rfm['R'] / 1440, 0)
rfm.head()
sampleid F M R
0 24650 21.0 363.0 7.0
1 24653 7.0 307.0 13.0
2 24655 1.0 343.0 22.0
3 24658 33.0 1000.0 3.0
4 24662 4.0 685.0 31.0
rfm.describe().T
count mean std min 25% 50% 75% max
sampleid 426425.0 629380.138599 414760.183032 24650.0 313549.0 600907.0 887813.0 2238403.0
F 426425.0 12.137916 17.405419 1.0 3.0 6.0 14.0 711.0
M 426425.0 421.604962 286.987700 1.0 233.0 351.0 523.0 6383.0
R 426425.0 70.742163 84.844780 0.0 11.0 33.0 97.0 366.0
# Bin R, F and M into five score levels based on the distributions and common practice
f_bins = [-1, 3, 5, 7, 10, 720]
m_bins = [-1, 200, 400, 600, 800, 7000]
r_bins = [-1, 3, 7, 30, 180, 370]

rfm['R_score'] = pd.cut(rfm['R'], bins=r_bins, labels=[5,4,3,2,1]).astype('int')
rfm['F_score'] = pd.cut(rfm['F'], bins=f_bins, labels=[1,2,3,4,5]).astype('int')
rfm['M_score'] = pd.cut(rfm['M'], bins=m_bins, labels=[1,2,3,4,5]).astype('int')

rfm
sampleid F M R R_score F_score M_score
0 24650 21.0 363.0 7.0 4 5 2
1 24653 7.0 307.0 13.0 3 3 2
2 24655 1.0 343.0 22.0 3 1 2
3 24658 33.0 1000.0 3.0 5 5 5
4 24662 4.0 685.0 31.0 2 2 4
... ... ... ... ... ... ... ...
426420 2238388 2.0 226.0 119.0 2 1 2
426421 2238389 4.0 461.0 0.0 5 2 3
426422 2238396 5.0 193.0 44.0 2 2 1
426423 2238397 1.0 258.0 87.0 2 1 2
426424 2238403 3.0 256.0 52.0 2 1 2

426425 rows × 7 columns

# Score above its mean is marked 1, otherwise 0
rfm['R_level'] = (rfm['R_score'] > rfm['R_score'].mean()) * 1
rfm['F_level'] = (rfm['F_score'] > rfm['F_score'].mean()) * 1
rfm['M_level'] = (rfm['M_score'] > rfm['M_score'].mean()) * 1

rfm
sampleid F M R R_score F_score M_score R_level F_level M_level
0 24650 21.0 363.0 7.0 4 5 2 1 1 0
1 24653 7.0 307.0 13.0 3 3 2 1 1 0
2 24655 1.0 343.0 22.0 3 1 2 1 0 0
3 24658 33.0 1000.0 3.0 5 5 5 1 1 1
4 24662 4.0 685.0 31.0 2 2 4 0 0 1
... ... ... ... ... ... ... ... ... ... ...
426420 2238388 2.0 226.0 119.0 2 1 2 0 0 0
426421 2238389 4.0 461.0 0.0 5 2 3 1 0 1
426422 2238396 5.0 193.0 44.0 2 2 1 0 0 0
426423 2238397 1.0 258.0 87.0 2 1 2 0 0 0
426424 2238403 3.0 256.0 52.0 2 1 2 0 0 0

426425 rows × 10 columns

# Combine the level flags into a 3-character RFM code, then map the codes to segment labels
rfm['RFM'] = rfm['R_level'].astype('str') + rfm['F_level'].astype('str') + rfm['M_level'].astype('str')
rfm['RFM'].replace(['111','101','011','001','110','100','010','000']
            , ['important value users','important development users','important retention users','important win-back users','general value users','general development users','general retention users','general win-back users'], inplace=True)
rfm
sampleid F M R R_score F_score M_score R_level F_level M_level RFM
0 24650 21.0 363.0 7.0 4 5 2 1 1 0 general value users
1 24653 7.0 307.0 13.0 3 3 2 1 1 0 general value users
2 24655 1.0 343.0 22.0 3 1 2 1 0 0 general development users
3 24658 33.0 1000.0 3.0 5 5 5 1 1 1 important value users
4 24662 4.0 685.0 31.0 2 2 4 0 0 1 important win-back users
... ... ... ... ... ... ... ... ... ... ... ...
426420 2238388 2.0 226.0 119.0 2 1 2 0 0 0 general win-back users
426421 2238389 4.0 461.0 0.0 5 2 3 1 0 1 important development users
426422 2238396 5.0 193.0 44.0 2 2 1 0 0 0 general win-back users
426423 2238397 1.0 258.0 87.0 2 1 2 0 0 0 general win-back users
426424 2238403 3.0 256.0 52.0 2 1 2 0 0 0 general win-back users

426425 rows × 11 columns

# Count the users of each segment type
rfm_new = pd.DataFrame(rfm.groupby('RFM', as_index=False)['sampleid'].agg('count'))
rfm_new
RFM sampleid
0 general value users 78592
1 general retention users 46850
2 general development users 42275
3 general win-back users 83394
4 important value users 63595
5 important retention users 38850
6 important development users 20235
7 important win-back users 52634
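Since the churn label is still available in df, one can also check how churn concentrates in each segment by joining the segments back to the labels; a sketch, not part of the original analysis:

# Churn rate per RFM segment
seg = rfm[['sampleid', 'RFM']].merge(df[['sampleid', 'label']], on='sampleid')
seg.groupby('RFM')['label'].mean().sort_values()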
# Pie chart of the segment shares
plt.figure(figsize=(12,6))
plt.pie((rfm_new['sampleid'] / rfm_new['sampleid'].sum()).to_list(), labels=rfm_new['RFM'].to_list(), autopct='%0.2f%%')
 [Text(0.9207056795449674, 0.6019144886556893, 'general value users'),
  Text(0.07432600254562635, 1.0974860570164833, 'general retention users'),
  Text(-0.6110720650087508, 0.9146534487804336, 'general development users'),
  Text(-1.0982775444537256, 0.061534017816936265, 'general win-back users'),
  Text(-0.5691837587192285, -0.9412915854347425, 'important value users'),
  Text(0.23025758705583027, -1.0756307189752563, 'important retention users'),
  Text(0.6623554620643879, -0.8782284679247601, 'important development users'),
  Text(1.0183302486279413, -0.4159368999371846, 'important win-back users')],
 [Text(0.5022030979336185, 0.3283169938121941, '18.43%'),
  Text(0.04054145593397801, 0.5986287583726272, '10.99%'),
  Text(-0.33331203545931853, 0.49890188115296374, '9.91%'),
  Text(-0.5990604787929412, 0.03356400971832887, '19.56%'),
  Text(-0.31046386839230644, -0.5134317738734958, '14.91%'),
  Text(0.1255950474849983, -0.5867076648955943, '9.11%'),
  Text(0.3612847974896661, -0.47903370977714177, '4.75%'),
  Text(0.5554528628879679, -0.22687467269300973, '12.34%')])

[Figure 18: pie chart of the RFM segment shares]

