DataWhale会员数据化运营项目(个人练习)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Use a font with CJK glyphs so Chinese labels render in figures, and keep
# the minus sign displayable with that font.
plt.rcParams['font.family'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# One entry per worksheet of the Excel workbook: four yearly order sheets
# followed by the member-level sheet.
sheet_names = ['2015', '2016', '2017', '2018', '会员等级']

# Read all worksheets in a single pass (a list of sheet names makes
# read_excel return a dict keyed by sheet name), then restore the order of
# sheet_names so positional indexing keeps working.
workbook = pd.read_excel('sales.xlsx', sheet_name=sheet_names)
sheet_datas = [workbook[name] for name in sheet_names]

# Keep an untouched backup of the raw sheets.  `.copy` has to be *called*
# (without parentheses only the bound method is stored), and each DataFrame
# is copied individually because copying the list alone would still share
# the same frames.
sheet_datas_copy = [sheet_data.copy() for sheet_data in sheet_datas]
# To restore the raw data later:
# sheet_datas = [sheet_data.copy() for sheet_data in sheet_datas_copy]

# Print a quick overview of every worksheet: head, summary statistics,
# dtypes and per-column missing-value counts.
for sheet_name, sheet_data in zip(sheet_names, sheet_datas):
    print(f'{sheet_name}\n{sheet_data.head()}')
    print(f'描述性统计\n{sheet_data.describe()}')
    print('基本属性')
    # info() prints its report itself and returns None, so this also emits
    # a trailing "None" line.
    print(sheet_data.info())
    print(f'特征缺失值数量\n{sheet_data.isnull().sum()}')
2015
          会员ID         订单号       提交日期    订单金额  Unnamed: 4  Unnamed: 5  \
0  15278002468  3000304681 2015-01-01   499.0         NaN         NaN   
1  39236378972  3000305791 2015-01-01  2588.0         NaN         NaN   
2  38722039578  3000641787 2015-01-01   498.0         NaN         NaN   
3  11049640063  3000798913 2015-01-01  1572.0         NaN         NaN   
4  35038752292  3000821546 2015-01-01    10.1         NaN         NaN   

   Unnamed: 6  Unnamed: 7  Unnamed: 8  Unnamed: 9  
0         NaN         NaN         NaN         NaN  
1         NaN         NaN         NaN         NaN  
2         NaN         NaN         NaN         NaN  
3         NaN         NaN         NaN         NaN  
4         NaN         NaN         NaN         NaN  
描述性统计
               会员ID           订单号           订单金额  Unnamed: 4  Unnamed: 5  \
count  3.077400e+04  3.077400e+04   30774.000000         0.0         0.0   
mean   2.918779e+10  4.020414e+09     960.991161         NaN         NaN   
std    1.385333e+10  2.630510e+08    2068.107231         NaN         NaN   
min    2.670000e+02  3.000305e+09       0.500000         NaN         NaN   
25%    1.944122e+10  3.885510e+09      59.000000         NaN         NaN   
50%    3.746545e+10  4.117491e+09     139.000000         NaN         NaN   
75%    3.923593e+10  4.234882e+09     899.000000         NaN         NaN   
max    3.954613e+10  4.282025e+09  111750.000000         NaN         NaN   

       Unnamed: 6  Unnamed: 7  Unnamed: 8  Unnamed: 9  
count         0.0         0.0         0.0         0.0  
mean          NaN         NaN         NaN         NaN  
std           NaN         NaN         NaN         NaN  
min           NaN         NaN         NaN         NaN  
25%           NaN         NaN         NaN         NaN  
50%           NaN         NaN         NaN         NaN  
75%           NaN         NaN         NaN         NaN  
max           NaN         NaN         NaN         NaN  
基本属性
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30774 entries, 0 to 30773
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   会员ID        30774 non-null  int64         
 1   订单号         30774 non-null  int64         
 2   提交日期        30774 non-null  datetime64[ns]
 3   订单金额        30774 non-null  float64       
 4   Unnamed: 4  0 non-null      float64       
 5   Unnamed: 5  0 non-null      float64       
 6   Unnamed: 6  0 non-null      float64       
 7   Unnamed: 7  0 non-null      float64       
 8   Unnamed: 8  0 non-null      float64       
 9   Unnamed: 9  0 non-null      float64       
dtypes: datetime64[ns](1), float64(7), int64(2)
memory usage: 2.3 MB
None
特征缺失值数量
会员ID              0
订单号               0
提交日期              0
订单金额              0
Unnamed: 4    30774
Unnamed: 5    30774
Unnamed: 6    30774
Unnamed: 7    30774
Unnamed: 8    30774
Unnamed: 9    30774
dtype: int64
2016
          会员ID         订单号       提交日期    订单金额  Unnamed: 4  Unnamed: 5  \
0  39288120141  4282025766 2016-01-01    76.0         NaN         NaN   
1  39293812118  4282037929 2016-01-01  7599.0         NaN         NaN   
2  27596340905  4282038740 2016-01-01   802.0         NaN         NaN   
3  15111475509  4282043819 2016-01-01    65.0         NaN         NaN   
4  38896594001  4282051044 2016-01-01    95.0         NaN         NaN   

   Unnamed: 6  
0         NaN  
1         NaN  
2         NaN  
3         NaN  
4         NaN  
描述性统计
               会员ID           订单号           订单金额  Unnamed: 4  Unnamed: 5  \
count  4.127800e+04  4.127800e+04   41277.000000         0.0         0.0   
mean   2.908415e+10  4.313583e+09     957.106694         NaN         NaN   
std    1.389468e+10  1.094572e+07    2478.560036         NaN         NaN   
min    8.100000e+01  4.282026e+09       0.100000         NaN         NaN   
25%    1.934990e+10  4.309457e+09      59.000000         NaN         NaN   
50%    3.730339e+10  4.317545e+09     147.000000         NaN         NaN   
75%    3.923182e+10  4.321132e+09     888.000000         NaN         NaN   
max    3.954554e+10  4.324911e+09  174900.000000         NaN         NaN   

       Unnamed: 6  
count         0.0  
mean          NaN  
std           NaN  
min           NaN  
25%           NaN  
50%           NaN  
75%           NaN  
max           NaN  
基本属性
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41278 entries, 0 to 41277
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   会员ID        41278 non-null  int64         
 1   订单号         41278 non-null  int64         
 2   提交日期        41278 non-null  datetime64[ns]
 3   订单金额        41277 non-null  float64       
 4   Unnamed: 4  0 non-null      float64       
 5   Unnamed: 5  0 non-null      float64       
 6   Unnamed: 6  0 non-null      float64       
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 2.2 MB
None
特征缺失值数量
会员ID              0
订单号               0
提交日期              0
订单金额              1
Unnamed: 4    41278
Unnamed: 5    41278
Unnamed: 6    41278
dtype: int64
2017
          会员ID         订单号       提交日期    订单金额  Unnamed: 4  Unnamed: 5  \
0  38765290840  4324911135 2017-01-01  1799.0         NaN         NaN   
1  39305832102  4324911213 2017-01-01   369.0         NaN         NaN   
2  34190994969  4324911251 2017-01-01   189.0         NaN         NaN   
3  38986333210  4324911283 2017-01-01   169.0         NaN         NaN   
4      4271359  4324911355 2017-01-01    78.0         NaN         NaN   

   Unnamed: 6  
0         NaN  
1         NaN  
2         NaN  
3         NaN  
4         NaN  
描述性统计
               会员ID           订单号           订单金额  Unnamed: 4  Unnamed: 5  \
count  5.083900e+04  5.083900e+04   50839.000000         0.0         0.0   
mean   2.882368e+10  4.332466e+09     963.587872         NaN         NaN   
std    1.409416e+10  4.404350e+06    2178.727261         NaN         NaN   
min    2.780000e+02  4.324911e+09       0.300000         NaN         NaN   
25%    1.869274e+10  4.328415e+09      59.000000         NaN         NaN   
50%    3.688044e+10  4.331989e+09     149.000000         NaN         NaN   
75%    3.923020e+10  4.337515e+09     898.000000         NaN         NaN   
max    3.954554e+10  4.338764e+09  123609.000000         NaN         NaN   

       Unnamed: 6  
count         0.0  
mean          NaN  
std           NaN  
min           NaN  
25%           NaN  
50%           NaN  
75%           NaN  
max           NaN  
基本属性
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50839 entries, 0 to 50838
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   会员ID        50839 non-null  int64         
 1   订单号         50839 non-null  int64         
 2   提交日期        50839 non-null  datetime64[ns]
 3   订单金额        50839 non-null  float64       
 4   Unnamed: 4  0 non-null      float64       
 5   Unnamed: 5  0 non-null      float64       
 6   Unnamed: 6  0 non-null      float64       
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 2.7 MB
None
特征缺失值数量
会员ID              0
订单号               0
提交日期              0
订单金额              0
Unnamed: 4    50839
Unnamed: 5    50839
Unnamed: 6    50839
dtype: int64
2018
          会员ID         订单号       提交日期    订单金额  Unnamed: 4  Unnamed: 5  \
0  39229691808  4338764262 2018-01-01  3646.0         NaN         NaN   
1  39293668916  4338764363 2018-01-01  3999.0         NaN         NaN   
2  35059646224  4338764376 2018-01-01    10.1         NaN         NaN   
3      1084397  4338770013 2018-01-01   828.0         NaN         NaN   
4      3349915  4338770121 2018-01-01  3758.0         NaN         NaN   

   Unnamed: 6  
0         NaN  
1         NaN  
2         NaN  
3         NaN  
4         NaN  
描述性统计
               会员ID           订单号           订单金额  Unnamed: 4  Unnamed: 5  \
count  8.134900e+04  8.134900e+04   81348.000000         0.0         0.0   
mean   2.902317e+10  4.348372e+09     966.582792         NaN         NaN   
std    1.404116e+10  4.183774e+06    2204.969534         NaN         NaN   
min    2.780000e+02  4.338764e+09       0.000000         NaN         NaN   
25%    1.902755e+10  4.345654e+09      60.000000         NaN         NaN   
50%    3.740121e+10  4.349448e+09     149.000000         NaN         NaN   
75%    3.923380e+10  4.351639e+09     899.000000         NaN         NaN   
max    3.954614e+10  4.354235e+09  174900.000000         NaN         NaN   

       Unnamed: 6  
count         0.0  
mean          NaN  
std           NaN  
min           NaN  
25%           NaN  
50%           NaN  
75%           NaN  
max           NaN  
基本属性
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81349 entries, 0 to 81348
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   会员ID        81349 non-null  int64         
 1   订单号         81349 non-null  int64         
 2   提交日期        81349 non-null  datetime64[ns]
 3   订单金额        81348 non-null  float64       
 4   Unnamed: 4  0 non-null      float64       
 5   Unnamed: 5  0 non-null      float64       
 6   Unnamed: 6  0 non-null      float64       
dtypes: datetime64[ns](1), float64(4), int64(2)
memory usage: 4.3 MB
None
特征缺失值数量
会员ID              0
订单号               0
提交日期              0
订单金额              1
Unnamed: 4    81349
Unnamed: 5    81349
Unnamed: 6    81349
dtype: int64
会员等级
          会员ID  会员等级  Unnamed: 2  Unnamed: 3
0       100090     3         NaN         NaN
1  10012905801     1         NaN         NaN
2  10012935109     1         NaN         NaN
3  10013498043     1         NaN         NaN
4  10014087899     4         NaN         NaN
描述性统计
               会员ID           会员等级  Unnamed: 2  Unnamed: 3
count  1.543850e+05  154385.000000         0.0         0.0
mean   2.980055e+10       2.259701         NaN         NaN
std    1.365654e+10       1.346408         NaN         NaN
min    8.100000e+01       1.000000         NaN         NaN
25%    2.213894e+10       1.000000         NaN         NaN
50%    3.833022e+10       2.000000         NaN         NaN
75%    3.927932e+10       3.000000         NaN         NaN
max    3.954614e+10       5.000000         NaN         NaN
基本属性
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154385 entries, 0 to 154384
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   会员ID        154385 non-null  int64  
 1   会员等级        154385 non-null  int64  
 2   Unnamed: 2  0 non-null       float64
 3   Unnamed: 3  0 non-null       float64
dtypes: float64(2), int64(2)
memory usage: 4.7 MB
None
特征缺失值数量
会员ID               0
会员等级               0
Unnamed: 2    154385
Unnamed: 3    154385
dtype: int64
# Clean every yearly order sheet, i.e. all sheets except the trailing
# member-level sheet (the same slice that is concatenated below).
for i, sheet_data in enumerate(sheet_datas[:-1]):
    # Drop columns holding fewer than two non-null values (the empty
    # "Unnamed" columns), then drop rows that still contain a null.
    cleaned = sheet_data.dropna(axis=1, thresh=2).dropna()
    # Keep only orders whose amount is strictly greater than 1.  The explicit
    # copy makes the filtered frame independent, so adding a column below
    # does not trigger SettingWithCopyWarning.
    cleaned = cleaned[cleaned['订单金额'] > 1].copy()
    # Latest submission date within this sheet, repeated on every row.
    cleaned['max_year_date'] = cleaned['提交日期'].max()
    sheet_datas[i] = cleaned

# Stack the cleaned yearly sheets; the original per-sheet row index is kept.
data_merge = pd.concat(sheet_datas[:-1], axis=0)
data_merge
会员ID 订单号 提交日期 订单金额 max_year_date
0 15278002468 3000304681 2015-01-01 499.0 2015-12-31
1 39236378972 3000305791 2015-01-01 2588.0 2015-12-31
2 38722039578 3000641787 2015-01-01 498.0 2015-12-31
3 11049640063 3000798913 2015-01-01 1572.0 2015-12-31
4 35038752292 3000821546 2015-01-01 10.1 2015-12-31
... ... ... ... ... ...
81344 39229485704 4354225182 2018-12-31 249.0 2018-12-31
81345 39229021075 4354225188 2018-12-31 89.0 2018-12-31
81346 39288976750 4354230034 2018-12-31 48.5 2018-12-31
81347 26772630 4354230163 2018-12-31 3196.0 2018-12-31
81348 39455580335 4354235084 2018-12-31 2999.0 2018-12-31

202827 rows × 5 columns

# Days between each order and the latest order date of its yearly sheet.
# The vectorised .dt.days accessor yields the same integer day counts as
# applying a Python lambda to every timedelta, without the per-row call.
data_merge['日期间隔'] = (data_merge['max_year_date'] - data_merge['提交日期']).dt.days
# Calendar year of the order, used as a grouping key later on.
data_merge['年份'] = data_merge['提交日期'].dt.year
data_merge
会员ID 订单号 提交日期 订单金额 max_year_date 日期间隔 年份
0 15278002468 3000304681 2015-01-01 499.0 2015-12-31 364 2015
1 39236378972 3000305791 2015-01-01 2588.0 2015-12-31 364 2015
2 38722039578 3000641787 2015-01-01 498.0 2015-12-31 364 2015
3 11049640063 3000798913 2015-01-01 1572.0 2015-12-31 364 2015
4 35038752292 3000821546 2015-01-01 10.1 2015-12-31 364 2015
... ... ... ... ... ... ... ...
81344 39229485704 4354225182 2018-12-31 249.0 2018-12-31 0 2018
81345 39229021075 4354225188 2018-12-31 89.0 2018-12-31 0 2018
81346 39288976750 4354230034 2018-12-31 48.5 2018-12-31 0 2018
81347 26772630 4354230163 2018-12-31 3196.0 2018-12-31 0 2018
81348 39455580335 4354235084 2018-12-31 2999.0 2018-12-31 0 2018

202827 rows × 7 columns

# Aggregate per (year, member):
#   R = smallest day gap to the year's last order date,
#   F = number of orders (count of submission dates),
#   M = total order amount.
orders_by_member = data_merge.groupby(['年份', '会员ID'], as_index=False)
rfm_gb = orders_by_member.agg(
    {'日期间隔': 'min', '提交日期': 'count', '订单金额': 'sum'}
)
# Rename by column name rather than by position; 会员ID keeps its name.
rfm_gb = rfm_gb.rename(
    columns={'年份': 'year', '日期间隔': 'R', '提交日期': 'F', '订单金额': 'M'}
)
rfm_gb
year 会员ID R F M
0 2015 267 197 2 105.0
1 2015 282 251 1 29.7
2 2015 283 340 1 5398.0
3 2015 343 300 1 118.0
4 2015 525 37 3 213.0
... ... ... ... ... ...
148586 2018 39538034299 272 1 49.0
148587 2018 39538034662 189 1 3558.0
148588 2018 39538035729 179 1 3699.0
148589 2018 39545237824 275 1 49.0
148590 2018 39546136285 163 1 19.9

148591 rows × 5 columns

# Summary statistics of the three RFM metrics, one metric per row.
rfm_col = ['R', 'F', 'M']
rfm_gb.loc[:, rfm_col].describe().transpose()
count mean std min 25% 50% 75% max
R 148591.0 165.524043 101.988472 0.0 79.0 156.0 255.0 365.0
F 148591.0 1.365002 2.626953 1.0 1.0 1.0 1.0 130.0
M 148591.0 1323.741329 3753.906883 1.5 69.0 189.0 1199.0 206251.8
# Draw one 20-bin histogram per RFM metric, each in its own figure.
# Author's observation: R is spread fairly evenly, while F and M are
# heavily skewed.
for i in rfm_col:
    values = rfm_gb[i]
    plt.figure(figsize=(6, 4))
    plt.hist(values, bins=20)
    plt.title(i)
    plt.show()

DataWhale会员数据化运营项目(个人练习)_第1张图片

DataWhale会员数据化运营项目(个人练习)_第2张图片

DataWhale会员数据化运营项目(个人练习)_第3张图片

# Attach each member's level to the RFM rows; the inner join keeps only
# members present in both tables.
# NOTE: the member-level sheet is used as loaded, so its empty "Unnamed"
# columns are carried into the merged frame.
member_levels = sheet_datas[-1]
rfm_merge = rfm_gb.merge(member_levels, how='inner', on='会员ID')
rfm_merge.head()
year 会员ID R F M 会员等级 Unnamed: 2 Unnamed: 3
0 2015 267 197 2 105.0 1 NaN NaN
1 2015 282 251 1 29.7 5 NaN NaN
2 2017 282 314 2 12992.0 5 NaN NaN
3 2018 282 19 5 30027.0 5 NaN NaN
4 2015 283 340 1 5398.0 4 NaN NaN
# Fit a random forest that predicts member level from R, F and M; its
# feature importances are used afterwards as the weights of the RFM score.
from sklearn.ensemble import RandomForestClassifier

features = rfm_merge[rfm_col]
target = rfm_merge['会员等级']
clf = RandomForestClassifier(random_state=42)  # fixed seed for repeatable fits
clf.fit(features, target)
print(clf.feature_importances_)
[0.39694201 0.00605384 0.59700415]
# Bin edges for pd.cut's default left-open, right-closed intervals.
R_bins = [-1, 79, 255, 365]  # first edge sits below the minimum so R == 0 is binned
F_bins = [0, 2, 5, 130]
M_bins = [0, 69, 1199, 206252]

# (metric, bin edges, labels from the lowest bin to the highest bin).
# R is labelled in reverse: a smaller day gap earns the higher score.
score_specs = (
    ('R', R_bins, [3, 2, 1]),
    ('F', F_bins, [1, 2, 3]),
    ('M', M_bins, [1, 2, 3]),
)
for metric, edges, grades in score_specs:
    rfm_gb[f'{metric}_score'] = pd.cut(rfm_gb[metric], bins=edges, labels=grades)
rfm_gb
year 会员ID R F M R_score F_score M_score
0 2015 267 197 2 105.0 2 1 2
1 2015 282 251 1 29.7 2 1 1
2 2015 283 340 1 5398.0 1 1 3
3 2015 343 300 1 118.0 1 1 2
4 2015 525 37 3 213.0 3 2 2
... ... ... ... ... ... ... ... ...
148586 2018 39538034299 272 1 49.0 1 1 1
148587 2018 39538034662 189 1 3558.0 2 1 3
148588 2018 39538035729 179 1 3699.0 2 1 3
148589 2018 39545237824 275 1 49.0 1 1 1
148590 2018 39546136285 163 1 19.9 2 1 1

148591 rows × 8 columns

# Convert only the categorical score labels to integers.  Casting the whole
# frame to int64 would also truncate the monetary column M (e.g. 29.7 -> 29),
# so the other columns are left untouched.
score_cols = ['R_score', 'F_score', 'M_score']
rfm_gb = rfm_gb.astype({col: np.int64 for col in score_cols})

# Weighted sum of the three scores; the weights are the classifier's
# feature importances, indexed here in R, F, M order.
weight = clf.feature_importances_
rfm_gb['RFM_score'] = (
    rfm_gb['R_score'] * weight[0]
    + rfm_gb['F_score'] * weight[1]
    + rfm_gb['M_score'] * weight[2]
)
rfm_gb
year 会员ID R F M R_score F_score M_score RFM_score
0 2015 267 197 2 105 2 1 2 1.993946
1 2015 282 251 1 29 2 1 1 1.396942
2 2015 283 340 1 5398 1 1 3 2.194008
3 2015 343 300 1 118 1 1 2 1.597004
4 2015 525 37 3 213 3 2 2 2.396942
... ... ... ... ... ... ... ... ... ...
148586 2018 39538034299 272 1 49 1 1 1 1.000000
148587 2018 39538034662 189 1 3558 2 1 3 2.590950
148588 2018 39538035729 179 1 3699 2 1 3 2.590950
148589 2018 39545237824 275 1 49 1 1 1 1.000000
148590 2018 39546136285 163 1 19 2 1 1 1.396942

148591 rows × 9 columns

# Turn the three scores into strings, then concatenate them in R, F, M order
# to form a combined label such as "212".
for score_col in ('R_score', 'F_score', 'M_score'):
    rfm_gb[score_col] = rfm_gb[score_col].astype('str')

r_part, f_part, m_part = rfm_gb['R_score'], rfm_gb['F_score'], rfm_gb['M_score']
rfm_gb['RFM_label'] = r_part + f_part + m_part
rfm_gb
year 会员ID R F M R_score F_score M_score RFM_score RFM_label
0 2015 267 197 2 105 2 1 2 1.993946 212
1 2015 282 251 1 29 2 1 1 1.396942 211
2 2015 283 340 1 5398 1 1 3 2.194008 113
3 2015 343 300 1 118 1 1 2 1.597004 112
4 2015 525 37 3 213 3 2 2 2.396942 322
... ... ... ... ... ... ... ... ... ... ...
148586 2018 39538034299 272 1 49 1 1 1 1.000000 111
148587 2018 39538034662 189 1 3558 2 1 3 2.590950 213
148588 2018 39538035729 179 1 3699 2 1 3 2.590950 213
148589 2018 39545237824 275 1 49 1 1 1 1.000000 111
148590 2018 39546136285 163 1 19 2 1 1 1.396942 211

148591 rows × 10 columns

# Write the scored table as a tab-separated, UTF-16 encoded file without
# the row index.
rfm_gb.to_csv(
    'rfm_vip.csv',
    sep='\t',
    encoding='utf-16',
    index=False,
)

你可能感兴趣的:(机器学习个人练习项目,python,pandas,数据分析)