import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Configure matplotlib once for the whole notebook: use the SimHei font family
# and turn off the Unicode minus sign for axis tick labels.
plt.rcParams.update({
    'font.family': ['SimHei'],
    'axes.unicode_minus': False,
})
# Sheets to load from the workbook: four sheets named by year plus the
# membership-level sheet.
sheet_names = ['2015', '2016', '2017', '2018', '会员等级']
# Passing the list of names parses the workbook once and returns a
# {sheet name: DataFrame} mapping, instead of re-reading the file per sheet.
workbook = pd.read_excel('sales.xlsx', sheet_name=sheet_names)
sheet_datas = [workbook[name] for name in sheet_names]
# Independent copies of the raw sheets, unaffected by later cleaning.
# (The original bound the method object `sheet_datas.copy` without calling it.)
sheet_datas_copy = [raw_sheet.copy() for raw_sheet in sheet_datas]
# Print a quick overview of every sheet: head, summary statistics, dtypes and
# per-column missing-value counts.
for sheet_name, sheet_data in zip(sheet_names, sheet_datas):
    print('{}\n{}'.format(sheet_name, sheet_data.head()))
    print('描述性统计\n{}'.format(sheet_data.describe()))
    print('基本属性')
    # info() writes its report itself and returns None; wrapping it in print()
    # only adds a stray "None" line to the output.
    sheet_data.info()
    print('特征缺失值数量\n{}'.format(sheet_data.isnull().sum()))
2015
会员ID 订单号 提交日期 订单金额 Unnamed: 4 Unnamed: 5 \
0 15278002468 3000304681 2015-01-01 499.0 NaN NaN
1 39236378972 3000305791 2015-01-01 2588.0 NaN NaN
2 38722039578 3000641787 2015-01-01 498.0 NaN NaN
3 11049640063 3000798913 2015-01-01 1572.0 NaN NaN
4 35038752292 3000821546 2015-01-01 10.1 NaN NaN
Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9
0 NaN NaN NaN NaN
1 NaN NaN NaN NaN
2 NaN NaN NaN NaN
3 NaN NaN NaN NaN
4 NaN NaN NaN NaN
描述性统计
会员ID 订单号 订单金额 Unnamed: 4 Unnamed: 5 \
count 3.077400e+04 3.077400e+04 30774.000000 0.0 0.0
mean 2.918779e+10 4.020414e+09 960.991161 NaN NaN
std 1.385333e+10 2.630510e+08 2068.107231 NaN NaN
min 2.670000e+02 3.000305e+09 0.500000 NaN NaN
25% 1.944122e+10 3.885510e+09 59.000000 NaN NaN
50% 3.746545e+10 4.117491e+09 139.000000 NaN NaN
75% 3.923593e+10 4.234882e+09 899.000000 NaN NaN
max 3.954613e+10 4.282025e+09 111750.000000 NaN NaN
Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9
count 0.0 0.0 0.0 0.0
mean NaN NaN NaN NaN
std NaN NaN NaN NaN
min NaN NaN NaN NaN
25% NaN NaN NaN NaN
50% NaN NaN NaN NaN
75% NaN NaN NaN NaN
max NaN NaN NaN NaN
基本属性
RangeIndex: 30774 entries, 0 to 30773
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 会员ID 30774 non-null int64
1 订单号 30774 non-null int64
2 提交日期 30774 non-null datetime64[ns]
3 订单金额 30774 non-null float64
4 Unnamed: 4 0 non-null float64
5 Unnamed: 5 0 non-null float64
6 Unnamed: 6 0 non-null float64
7 Unnamed: 7 0 non-null float64
8 Unnamed: 8 0 non-null float64
9 Unnamed: 9 0 non-null float64
dtypes: datetime64[ns](1), float64(7), int64(2)
memory usage: 2.3 MB
None
特征缺失值数量
会员ID 0
订单号 0
提交日期 0
订单金额 0
Unnamed: 4 30774
Unnamed: 5 30774
Unnamed: 6 30774
Unnamed: 7 30774
Unnamed: 8 30774
Unnamed: 9 30774
dtype: int64
2016
会员ID 订单号 提交日期 订单金额 Unnamed: 4 Unnamed: 5 \
0 39288120141 4282025766 2016-01-01 76.0 NaN NaN
1 39293812118 4282037929 2016-01-01 7599.0 NaN NaN
2 27596340905 4282038740 2016-01-01 802.0 NaN NaN
3 15111475509 4282043819 2016-01-01 65.0 NaN NaN
4 38896594001 4282051044 2016-01-01 95.0 NaN NaN
Unnamed: 6
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
描述性统计
会员ID 订单号 订单金额 Unnamed: 4 Unnamed: 5 \
count 4.127800e+04 4.127800e+04 41277.000000 0.0 0.0
mean 2.908415e+10 4.313583e+09 957.106694 NaN NaN
std 1.389468e+10 1.094572e+07 2478.560036 NaN NaN
min 8.100000e+01 4.282026e+09 0.100000 NaN NaN
25% 1.934990e+10 4.309457e+09 59.000000 NaN NaN
50% 3.730339e+10 4.317545e+09 147.000000 NaN NaN
75% 3.923182e+10 4.321132e+09 888.000000 NaN NaN
max 3.954554e+10 4.324911e+09 174900.000000 NaN NaN
Unnamed: 6
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN
基本属性
RangeIndex: 41278 entries, 0 to 41277
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 会员ID 41278 non-null int64
1 订单号 41278 non-null int64
2 提交日期 41278 non-null datetime64[ns]
3 订单金额 41277 non-null float64
4 Unnamed: 4 0 non-null float64
5 Unnamed: 5 0 non-null float64
6 Unnamed: 6 0 non-null float64
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 2.2 MB
None
特征缺失值数量
会员ID 0
订单号 0
提交日期 0
订单金额 1
Unnamed: 4 41278
Unnamed: 5 41278
Unnamed: 6 41278
dtype: int64
2017
会员ID 订单号 提交日期 订单金额 Unnamed: 4 Unnamed: 5 \
0 38765290840 4324911135 2017-01-01 1799.0 NaN NaN
1 39305832102 4324911213 2017-01-01 369.0 NaN NaN
2 34190994969 4324911251 2017-01-01 189.0 NaN NaN
3 38986333210 4324911283 2017-01-01 169.0 NaN NaN
4 4271359 4324911355 2017-01-01 78.0 NaN NaN
Unnamed: 6
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
描述性统计
会员ID 订单号 订单金额 Unnamed: 4 Unnamed: 5 \
count 5.083900e+04 5.083900e+04 50839.000000 0.0 0.0
mean 2.882368e+10 4.332466e+09 963.587872 NaN NaN
std 1.409416e+10 4.404350e+06 2178.727261 NaN NaN
min 2.780000e+02 4.324911e+09 0.300000 NaN NaN
25% 1.869274e+10 4.328415e+09 59.000000 NaN NaN
50% 3.688044e+10 4.331989e+09 149.000000 NaN NaN
75% 3.923020e+10 4.337515e+09 898.000000 NaN NaN
max 3.954554e+10 4.338764e+09 123609.000000 NaN NaN
Unnamed: 6
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN
基本属性
RangeIndex: 50839 entries, 0 to 50838
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 会员ID 50839 non-null int64
1 订单号 50839 non-null int64
2 提交日期 50839 non-null datetime64[ns]
3 订单金额 50839 non-null float64
4 Unnamed: 4 0 non-null float64
5 Unnamed: 5 0 non-null float64
6 Unnamed: 6 0 non-null float64
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 2.7 MB
None
特征缺失值数量
会员ID 0
订单号 0
提交日期 0
订单金额 0
Unnamed: 4 50839
Unnamed: 5 50839
Unnamed: 6 50839
dtype: int64
2018
会员ID 订单号 提交日期 订单金额 Unnamed: 4 Unnamed: 5 \
0 39229691808 4338764262 2018-01-01 3646.0 NaN NaN
1 39293668916 4338764363 2018-01-01 3999.0 NaN NaN
2 35059646224 4338764376 2018-01-01 10.1 NaN NaN
3 1084397 4338770013 2018-01-01 828.0 NaN NaN
4 3349915 4338770121 2018-01-01 3758.0 NaN NaN
Unnamed: 6
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
描述性统计
会员ID 订单号 订单金额 Unnamed: 4 Unnamed: 5 \
count 8.134900e+04 8.134900e+04 81348.000000 0.0 0.0
mean 2.902317e+10 4.348372e+09 966.582792 NaN NaN
std 1.404116e+10 4.183774e+06 2204.969534 NaN NaN
min 2.780000e+02 4.338764e+09 0.000000 NaN NaN
25% 1.902755e+10 4.345654e+09 60.000000 NaN NaN
50% 3.740121e+10 4.349448e+09 149.000000 NaN NaN
75% 3.923380e+10 4.351639e+09 899.000000 NaN NaN
max 3.954614e+10 4.354235e+09 174900.000000 NaN NaN
Unnamed: 6
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN
基本属性
RangeIndex: 81349 entries, 0 to 81348
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 会员ID 81349 non-null int64
1 订单号 81349 non-null int64
2 提交日期 81349 non-null datetime64[ns]
3 订单金额 81348 non-null float64
4 Unnamed: 4 0 non-null float64
5 Unnamed: 5 0 non-null float64
6 Unnamed: 6 0 non-null float64
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 4.3 MB
None
特征缺失值数量
会员ID 0
订单号 0
提交日期 0
订单金额 1
Unnamed: 4 81349
Unnamed: 5 81349
Unnamed: 6 81349
dtype: int64
会员等级
会员ID 会员等级 Unnamed: 2 Unnamed: 3
0 100090 3 NaN NaN
1 10012905801 1 NaN NaN
2 10012935109 1 NaN NaN
3 10013498043 1 NaN NaN
4 10014087899 4 NaN NaN
描述性统计
会员ID 会员等级 Unnamed: 2 Unnamed: 3
count 1.543850e+05 154385.000000 0.0 0.0
mean 2.980055e+10 2.259701 NaN NaN
std 1.365654e+10 1.346408 NaN NaN
min 8.100000e+01 1.000000 NaN NaN
25% 2.213894e+10 1.000000 NaN NaN
50% 3.833022e+10 2.000000 NaN NaN
75% 3.927932e+10 3.000000 NaN NaN
max 3.954614e+10 5.000000 NaN NaN
基本属性
RangeIndex: 154385 entries, 0 to 154384
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 会员ID 154385 non-null int64
1 会员等级 154385 non-null int64
2 Unnamed: 2 0 non-null float64
3 Unnamed: 3 0 non-null float64
dtypes: float64(2), int64(2)
memory usage: 4.7 MB
None
特征缺失值数量
会员ID 0
会员等级 0
Unnamed: 2 154385
Unnamed: 3 154385
dtype: int64
# Clean every sheet except the last one (the same sheets that are concatenated
# below) and tag each row with the latest submit date found in its own sheet.
for i in range(len(sheet_datas) - 1):
    # Drop columns with fewer than 2 non-null values (the empty "Unnamed"
    # columns), then drop any row that still contains a missing value.
    orders = sheet_datas[i].dropna(axis=1, thresh=2).dropna()
    # Keep orders whose amount is greater than 1. The explicit copy makes the
    # column assignment below act on an independent frame rather than on a
    # filtered slice (avoids SettingWithCopyWarning).
    orders = orders[orders['订单金额'] > 1].copy()
    orders['max_year_date'] = orders['提交日期'].max()
    sheet_datas[i] = orders
# Stack the cleaned sheets row-wise; each sheet keeps its own row labels.
data_merge = pd.concat(sheet_datas[:-1], axis=0)
data_merge
|
会员ID |
订单号 |
提交日期 |
订单金额 |
max_year_date |
0 |
15278002468 |
3000304681 |
2015-01-01 |
499.0 |
2015-12-31 |
1 |
39236378972 |
3000305791 |
2015-01-01 |
2588.0 |
2015-12-31 |
2 |
38722039578 |
3000641787 |
2015-01-01 |
498.0 |
2015-12-31 |
3 |
11049640063 |
3000798913 |
2015-01-01 |
1572.0 |
2015-12-31 |
4 |
35038752292 |
3000821546 |
2015-01-01 |
10.1 |
2015-12-31 |
... |
... |
... |
... |
... |
... |
81344 |
39229485704 |
4354225182 |
2018-12-31 |
249.0 |
2018-12-31 |
81345 |
39229021075 |
4354225188 |
2018-12-31 |
89.0 |
2018-12-31 |
81346 |
39288976750 |
4354230034 |
2018-12-31 |
48.5 |
2018-12-31 |
81347 |
26772630 |
4354230163 |
2018-12-31 |
3196.0 |
2018-12-31 |
81348 |
39455580335 |
4354235084 |
2018-12-31 |
2999.0 |
2018-12-31 |
202827 rows × 5 columns
# Days between each order and the latest order date of its sheet. The
# vectorised .dt.days accessor replaces a per-row Python lambda and yields the
# same integer day counts.
data_merge['日期间隔'] = (data_merge['max_year_date'] - data_merge['提交日期']).dt.days
# Calendar year of the order, used as a grouping key later on.
data_merge['年份'] = data_merge['提交日期'].dt.year
data_merge
|
会员ID |
订单号 |
提交日期 |
订单金额 |
max_year_date |
日期间隔 |
年份 |
0 |
15278002468 |
3000304681 |
2015-01-01 |
499.0 |
2015-12-31 |
364 |
2015 |
1 |
39236378972 |
3000305791 |
2015-01-01 |
2588.0 |
2015-12-31 |
364 |
2015 |
2 |
38722039578 |
3000641787 |
2015-01-01 |
498.0 |
2015-12-31 |
364 |
2015 |
3 |
11049640063 |
3000798913 |
2015-01-01 |
1572.0 |
2015-12-31 |
364 |
2015 |
4 |
35038752292 |
3000821546 |
2015-01-01 |
10.1 |
2015-12-31 |
364 |
2015 |
... |
... |
... |
... |
... |
... |
... |
... |
81344 |
39229485704 |
4354225182 |
2018-12-31 |
249.0 |
2018-12-31 |
0 |
2018 |
81345 |
39229021075 |
4354225188 |
2018-12-31 |
89.0 |
2018-12-31 |
0 |
2018 |
81346 |
39288976750 |
4354230034 |
2018-12-31 |
48.5 |
2018-12-31 |
0 |
2018 |
81347 |
26772630 |
4354230163 |
2018-12-31 |
3196.0 |
2018-12-31 |
0 |
2018 |
81348 |
39455580335 |
4354235084 |
2018-12-31 |
2999.0 |
2018-12-31 |
0 |
2018 |
202827 rows × 7 columns
# Aggregate the orders per (year, member) into the three RFM measures:
#   R - smallest day interval within the year,
#   F - number of orders (count of submit dates),
#   M - total order amount.
rfm_aggregations = {'日期间隔': 'min', '提交日期': 'count', '订单金额': 'sum'}
rfm_column_names = {'年份': 'year', '日期间隔': 'R', '提交日期': 'F', '订单金额': 'M'}
rfm_gb = (
    data_merge
    .groupby(['年份', '会员ID'], as_index=False)
    .agg(rfm_aggregations)
    .rename(columns=rfm_column_names)
)
rfm_gb
|
year |
会员ID |
R |
F |
M |
0 |
2015 |
267 |
197 |
2 |
105.0 |
1 |
2015 |
282 |
251 |
1 |
29.7 |
2 |
2015 |
283 |
340 |
1 |
5398.0 |
3 |
2015 |
343 |
300 |
1 |
118.0 |
4 |
2015 |
525 |
37 |
3 |
213.0 |
... |
... |
... |
... |
... |
... |
148586 |
2018 |
39538034299 |
272 |
1 |
49.0 |
148587 |
2018 |
39538034662 |
189 |
1 |
3558.0 |
148588 |
2018 |
39538035729 |
179 |
1 |
3699.0 |
148589 |
2018 |
39545237824 |
275 |
1 |
49.0 |
148590 |
2018 |
39546136285 |
163 |
1 |
19.9 |
148591 rows × 5 columns
# Names of the three RFM measure columns, reused by later cells.
rfm_col = list('RFM')
# Summary statistics with one row per measure (transposed describe()).
rfm_gb[rfm_col].describe().transpose()
|
count |
mean |
std |
min |
25% |
50% |
75% |
max |
R |
148591.0 |
165.524043 |
101.988472 |
0.0 |
79.0 |
156.0 |
255.0 |
365.0 |
F |
148591.0 |
1.365002 |
2.626953 |
1.0 |
1.0 |
1.0 |
1.0 |
130.0 |
M |
148591.0 |
1323.741329 |
3753.906883 |
1.5 |
69.0 |
189.0 |
1199.0 |
206251.8 |
# Draw one 20-bin histogram per RFM measure, each in its own figure.
# The loop body is indented so the statements actually run once per column.
for col in rfm_col:
    plt.figure(figsize=(6, 4))
    plt.hist(rfm_gb[col], bins=20)
    plt.title(col)
    plt.show()
# Attach each member's level to the RFM rows. Only the key and the level column
# are taken from the membership sheet: that sheet was never cleaned, so merging
# it whole drags its all-NaN "Unnamed" columns into the result.
member_levels = sheet_datas[-1][['会员ID', '会员等级']]
# Inner join: members without a recorded level are dropped.
rfm_merge = pd.merge(rfm_gb, member_levels, on='会员ID', how='inner')
rfm_merge.head()
|
year |
会员ID |
R |
F |
M |
会员等级 |
Unnamed: 2 |
Unnamed: 3 |
0 |
2015 |
267 |
197 |
2 |
105.0 |
1 |
NaN |
NaN |
1 |
2015 |
282 |
251 |
1 |
29.7 |
5 |
NaN |
NaN |
2 |
2017 |
282 |
314 |
2 |
12992.0 |
5 |
NaN |
NaN |
3 |
2018 |
282 |
19 |
5 |
30027.0 |
5 |
NaN |
NaN |
4 |
2015 |
283 |
340 |
1 |
5398.0 |
4 |
NaN |
NaN |
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest that predicts the member level from R, F and M. Its
# feature importances are printed here and reused later as per-measure weights.
rfm_features = rfm_merge[rfm_col]
member_level = rfm_merge['会员等级']
clf = RandomForestClassifier(random_state=42)
clf.fit(rfm_features, member_level)
print(clf.feature_importances_)
[0.39694201 0.00605384 0.59700415]
# Bin edges for the three measures. The R and M edges match the 25% / 75% / max
# values reported by describe() above; F uses hand-picked cut points up to its
# observed maximum.
R_bins = [-1, 79, 255, 365]
F_bins = [0, 2, 5, 130]
M_bins = [0, 69, 1199, 206252]
# R labels run 3 -> 1, so a smaller interval gets the higher score. F and M
# labels run 1 -> 3, so larger values score higher.
score_plan = (
    ('R', R_bins, [3, 2, 1]),
    ('F', F_bins, [1, 2, 3]),
    ('M', M_bins, [1, 2, 3]),
)
for measure, edges, score_labels in score_plan:
    rfm_gb[measure + '_score'] = pd.cut(rfm_gb[measure], bins=edges, labels=score_labels)
rfm_gb
|
year |
会员ID |
R |
F |
M |
R_score |
F_score |
M_score |
0 |
2015 |
267 |
197 |
2 |
105.0 |
2 |
1 |
2 |
1 |
2015 |
282 |
251 |
1 |
29.7 |
2 |
1 |
1 |
2 |
2015 |
283 |
340 |
1 |
5398.0 |
1 |
1 |
3 |
3 |
2015 |
343 |
300 |
1 |
118.0 |
1 |
1 |
2 |
4 |
2015 |
525 |
37 |
3 |
213.0 |
3 |
2 |
2 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
148586 |
2018 |
39538034299 |
272 |
1 |
49.0 |
1 |
1 |
1 |
148587 |
2018 |
39538034662 |
189 |
1 |
3558.0 |
2 |
1 |
3 |
148588 |
2018 |
39538035729 |
179 |
1 |
3699.0 |
2 |
1 |
3 |
148589 |
2018 |
39545237824 |
275 |
1 |
49.0 |
1 |
1 |
1 |
148590 |
2018 |
39546136285 |
163 |
1 |
19.9 |
2 |
1 |
1 |
148591 rows × 8 columns
# The scores produced by pd.cut are categorical and must become integers before
# they can be multiplied by the weights. Only those three columns are cast:
# casting the whole frame to int64 also truncated the monetary column M
# (e.g. 29.7 became 29).
score_cols = ['R_score', 'F_score', 'M_score']
rfm_gb = rfm_gb.astype({col: np.int64 for col in score_cols})
# Feature importances of the fitted classifier, in the order R, F, M (the
# column order it was trained on).
weight = clf.feature_importances_
# Weighted sum of the three scores.
rfm_gb['RFM_score'] = (
    rfm_gb['R_score'] * weight[0]
    + rfm_gb['F_score'] * weight[1]
    + rfm_gb['M_score'] * weight[2]
)
rfm_gb
|
year |
会员ID |
R |
F |
M |
R_score |
F_score |
M_score |
RFM_score |
0 |
2015 |
267 |
197 |
2 |
105 |
2 |
1 |
2 |
1.993946 |
1 |
2015 |
282 |
251 |
1 |
29 |
2 |
1 |
1 |
1.396942 |
2 |
2015 |
283 |
340 |
1 |
5398 |
1 |
1 |
3 |
2.194008 |
3 |
2015 |
343 |
300 |
1 |
118 |
1 |
1 |
2 |
1.597004 |
4 |
2015 |
525 |
37 |
3 |
213 |
3 |
2 |
2 |
2.396942 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
148586 |
2018 |
39538034299 |
272 |
1 |
49 |
1 |
1 |
1 |
1.000000 |
148587 |
2018 |
39538034662 |
189 |
1 |
3558 |
2 |
1 |
3 |
2.590950 |
148588 |
2018 |
39538035729 |
179 |
1 |
3699 |
2 |
1 |
3 |
2.590950 |
148589 |
2018 |
39545237824 |
275 |
1 |
49 |
1 |
1 |
1 |
1.000000 |
148590 |
2018 |
39546136285 |
163 |
1 |
19 |
2 |
1 |
1 |
1.396942 |
148591 rows × 9 columns
# Convert the three scores to text so they can be glued together into a single
# three-character group label such as "212" (R, F, M in that order).
for score_name in ('R_score', 'F_score', 'M_score'):
    rfm_gb[score_name] = rfm_gb[score_name].astype('str')
r_text = rfm_gb['R_score']
f_text = rfm_gb['F_score']
m_text = rfm_gb['M_score']
rfm_gb['RFM_label'] = r_text + f_text + m_text
rfm_gb
|
year |
会员ID |
R |
F |
M |
R_score |
F_score |
M_score |
RFM_score |
RFM_label |
0 |
2015 |
267 |
197 |
2 |
105 |
2 |
1 |
2 |
1.993946 |
212 |
1 |
2015 |
282 |
251 |
1 |
29 |
2 |
1 |
1 |
1.396942 |
211 |
2 |
2015 |
283 |
340 |
1 |
5398 |
1 |
1 |
3 |
2.194008 |
113 |
3 |
2015 |
343 |
300 |
1 |
118 |
1 |
1 |
2 |
1.597004 |
112 |
4 |
2015 |
525 |
37 |
3 |
213 |
3 |
2 |
2 |
2.396942 |
322 |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
... |
148586 |
2018 |
39538034299 |
272 |
1 |
49 |
1 |
1 |
1 |
1.000000 |
111 |
148587 |
2018 |
39538034662 |
189 |
1 |
3558 |
2 |
1 |
3 |
2.590950 |
213 |
148588 |
2018 |
39538035729 |
179 |
1 |
3699 |
2 |
1 |
3 |
2.590950 |
213 |
148589 |
2018 |
39545237824 |
275 |
1 |
49 |
1 |
1 |
1 |
1.000000 |
111 |
148590 |
2018 |
39546136285 |
163 |
1 |
19 |
2 |
1 |
1 |
1.396942 |
211 |
148591 rows × 10 columns
# Write the final RFM table to disk as tab-separated, UTF-16 encoded text,
# without the row index.
rfm_gb.to_csv(
    'rfm_vip.csv',
    sep='\t',
    encoding='utf-16',
    index=False,
)