模型介绍
RFM模型由三个指标组成:“R”——Recency(最近一次消费时间)、“F”——Frequency(一段时间内消费频次)、“M”——Monetary(一段时间内消费总额;本文代码中M取的是总支付金额除以F,即平均每次消费金额)。这三个指标可以将用户划分成不同的等级和层次,目的是衡量他们的用户价值,从而更准确地把成本和精力花在合适的用户层次上。一个典型的反例是:对一个明显无意愿的流失用户继续push自己的核心产品,费时、费力也费钱。
数据清洗
import pandas as pd
import numpy as np
import os
# Load the raw order export into a DataFrame and preview its first rows.
df = pd.read_excel('PYTHON-RFM实战数据.xlsx')
df.head()
|
品牌名称 |
买家昵称 |
付款日期 |
订单状态 |
实付金额 |
邮费 |
省份 |
城市 |
购买数量 |
0 |
数据不吹牛 |
叫我李2 |
2019-01-01 00:17:59 |
交易成功 |
186 |
6 |
上海 |
上海市 |
1 |
1 |
数据不吹牛 |
0cyb1992 |
2019-01-01 00:59:54 |
交易成功 |
145 |
0 |
广东省 |
广州市 |
1 |
2 |
数据不吹牛 |
萝污萌莉 |
2019-01-01 07:48:48 |
交易成功 |
194 |
8 |
山东省 |
东营市 |
1 |
3 |
数据不吹牛 |
atblovemyy |
2019-01-01 09:15:49 |
付款以后用户退款成功,交易自动关闭 |
84 |
0 |
江苏省 |
镇江市 |
1 |
4 |
数据不吹牛 |
小星期鱼 |
2019-01-01 09:59:33 |
付款以后用户退款成功,交易自动关闭 |
74 |
0 |
上海 |
上海市 |
1 |
# List the distinct order states present in the data.
df['订单状态'].unique()
array(['交易成功', '付款以后用户退款成功,交易自动关闭'], dtype=object)
# Show column dtypes and non-null counts.
df.info()
RangeIndex: 28833 entries, 0 to 28832
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 品牌名称 28833 non-null object
1 买家昵称 28833 non-null object
2 付款日期 28833 non-null datetime64[ns]
3 订单状态 28833 non-null object
4 实付金额 28833 non-null int64
5 邮费 28833 non-null int64
6 省份 28833 non-null object
7 城市 28832 non-null object
8 购买数量 28833 non-null int64
dtypes: datetime64[ns](1), int64(3), object(5)
memory usage: 2.0+ MB
# Keep only the orders whose state is "交易成功"; every other row is dropped.
is_success = df['订单状态'] == '交易成功'
df = df[is_success]
print('剔除退款后还剩:%d行' % df.shape[0])
剔除退款后还剩:27793行
# Only buyer, payment time and paid amount are needed to build R, F and M.
rfm_source_columns = ['买家昵称', '付款日期', '实付金额']
df = df[rfm_source_columns]
df.head()
|
买家昵称 |
付款日期 |
实付金额 |
0 |
叫我李2 |
2019-01-01 00:17:59 |
186 |
1 |
0cyb1992 |
2019-01-01 00:59:54 |
145 |
2 |
萝污萌莉 |
2019-01-01 07:48:48 |
194 |
5 |
重碎叠 |
2019-01-01 10:00:07 |
197 |
6 |
iho_jann |
2019-01-01 10:00:08 |
168 |
R值构造
# Latest payment timestamp of every buyer, one row per buyer.
r = df.groupby('买家昵称', as_index=False)['付款日期'].max()
r.head()
|
买家昵称 |
付款日期 |
0 |
.blue_ram |
2019-02-04 17:49:34.000 |
1 |
.christiny |
2019-01-29 14:17:15.000 |
2 |
.willn1 |
2019-01-11 03:46:18.000 |
3 |
.托托m |
2019-01-11 02:26:33.000 |
4 |
0000妮 |
2019-06-28 16:53:26.458 |
# R = whole days between the buyer's last payment and the snapshot date
# (.dt.days keeps only the day component of the time difference).
snapshot = pd.to_datetime('2019-7-1')
r = r.assign(R=(snapshot - r['付款日期']).dt.days)[['买家昵称', 'R']]
r.head()
|
买家昵称 |
R |
0 |
.blue_ram |
146 |
1 |
.christiny |
152 |
2 |
.willn1 |
170 |
3 |
.托托m |
170 |
4 |
0000妮 |
2 |
F值构造
# Tag every order with its calendar day (YYYY-MM-DD). assign() builds a new
# frame rather than writing into df, which is itself a filtered selection.
# Assumes 付款日期 has no missing values (df.info() above reports none).
df = df.assign(**{'日期标签': df['付款日期'].dt.strftime('%Y-%m-%d')})
# F = number of distinct days on which the buyer paid; several orders placed
# on the same day count once.
f = df.groupby('买家昵称')['日期标签'].nunique().reset_index()
f.columns = ['买家昵称','F']
f.head()
|
买家昵称 |
F |
0 |
.blue_ram |
1 |
1 |
.christiny |
1 |
2 |
.willn1 |
1 |
3 |
.托托m |
1 |
4 |
0000妮 |
1 |
M值构造
# Total amount paid by each buyer.
sum_m = df.groupby('买家昵称', as_index=False)['实付金额'].sum()
sum_m = sum_m.rename(columns={'实付金额': '总支付金额'})
# M is the average spend per purchase day: total paid divided by F.
com_m = sum_m.merge(f, on='买家昵称', how='inner')
com_m['M'] = com_m['总支付金额'].div(com_m['F'])
com_m.head()
|
买家昵称 |
总支付金额 |
F |
M |
0 |
.blue_ram |
49 |
1 |
49.0 |
1 |
.christiny |
183 |
1 |
183.0 |
2 |
.willn1 |
34 |
1 |
34.0 |
3 |
.托托m |
37 |
1 |
37.0 |
4 |
0000妮 |
164 |
1 |
164.0 |
# Join recency with frequency/monetary per buyer and keep only the three metrics.
rfm = r.merge(com_m, on='买家昵称', how='inner')
rfm = rfm[['买家昵称', 'R', 'F', 'M']]
rfm.head()
|
买家昵称 |
R |
F |
M |
0 |
.blue_ram |
146 |
1 |
49.0 |
1 |
.christiny |
152 |
1 |
183.0 |
2 |
.willn1 |
170 |
1 |
34.0 |
3 |
.托托m |
170 |
1 |
37.0 |
4 |
0000妮 |
2 |
1 |
164.0 |
STEP 3.维度确认(不涉及代码故省略)
STEP 4.分值计算
# Recency score: the more recent the last purchase, the higher the score.
# Bins are left-closed, so R in [0, 30) scores 5 and R >= 120 scores 1.
r_edges = [0, 30, 60, 90, 120, 1000000]
r_points = [5, 4, 3, 2, 1]
rfm['R-SCORE'] = pd.cut(rfm['R'], bins=r_edges, labels=r_points, right=False).astype(float)
rfm.head()
|
买家昵称 |
R |
F |
M |
R-SCORE |
0 |
.blue_ram |
146 |
1 |
49.0 |
1.0 |
1 |
.christiny |
152 |
1 |
183.0 |
1.0 |
2 |
.willn1 |
170 |
1 |
34.0 |
1.0 |
3 |
.托托m |
170 |
1 |
37.0 |
1.0 |
4 |
0000妮 |
2 |
1 |
164.0 |
5.0 |
F、M值计算
# Frequency and monetary scores run from 1 (lowest) to 5 (highest).
# Bins are left-closed: F in [1, 2) scores 1, M in [0, 50) scores 1, and so on.
score_points = [1, 2, 3, 4, 5]
f_edges = [1, 2, 3, 4, 5, 1000000]
m_edges = [0, 50, 100, 150, 200, 1000000]
rfm['F-SCORE'] = pd.cut(rfm['F'], bins=f_edges, labels=score_points, right=False).astype(float)
rfm['M-SCORE'] = pd.cut(rfm['M'], bins=m_edges, labels=score_points, right=False).astype(float)
rfm.head()
|
买家昵称 |
R |
F |
M |
R-SCORE |
F-SCORE |
M-SCORE |
0 |
.blue_ram |
146 |
1 |
49.0 |
1.0 |
1.0 |
1.0 |
1 |
.christiny |
152 |
1 |
183.0 |
1.0 |
1.0 |
4.0 |
2 |
.willn1 |
170 |
1 |
34.0 |
1.0 |
1.0 |
1.0 |
3 |
.托托m |
170 |
1 |
37.0 |
1.0 |
1.0 |
1.0 |
4 |
0000妮 |
2 |
1 |
164.0 |
5.0 |
1.0 |
4.0 |
# For each dimension, flag buyers whose score is strictly above the mean score
# of all buyers (1 = above, 0 = not above).
for dim in ('R', 'F', 'M'):
    dim_score = rfm[dim + '-SCORE']
    rfm[dim + '是否大于均值'] = (dim_score > dim_score.mean()) * 1
rfm.head()
|
买家昵称 |
R |
F |
M |
R-SCORE |
F-SCORE |
M-SCORE |
R是否大于均值 |
F是否大于均值 |
M是否大于均值 |
0 |
.blue_ram |
146 |
1 |
49.0 |
1.0 |
1.0 |
1.0 |
0 |
0 |
0 |
1 |
.christiny |
152 |
1 |
183.0 |
1.0 |
1.0 |
4.0 |
0 |
0 |
1 |
2 |
.willn1 |
170 |
1 |
34.0 |
1.0 |
1.0 |
1.0 |
0 |
0 |
0 |
3 |
.托托m |
170 |
1 |
37.0 |
1.0 |
1.0 |
1.0 |
0 |
0 |
0 |
4 |
0000妮 |
2 |
1 |
164.0 |
5.0 |
1.0 |
4.0 |
1 |
0 |
1 |
STEP 5.客户分层
构建合并指标
# Pack the three 0/1 flags into one number: hundreds digit = R, tens = F, units = M.
flag_weights = {'R是否大于均值': 100, 'F是否大于均值': 10, 'M是否大于均值': 1}
rfm['人群数值'] = sum(rfm[flag] * weight for flag, weight in flag_weights.items())
rfm.head()
|
买家昵称 |
R |
F |
M |
R-SCORE |
F-SCORE |
M-SCORE |
R是否大于均值 |
F是否大于均值 |
M是否大于均值 |
人群数值 |
0 |
.blue_ram |
146 |
1 |
49.0 |
1.0 |
1.0 |
1.0 |
0 |
0 |
0 |
0 |
1 |
.christiny |
152 |
1 |
183.0 |
1.0 |
1.0 |
4.0 |
0 |
0 |
1 |
1 |
2 |
.willn1 |
170 |
1 |
34.0 |
1.0 |
1.0 |
1.0 |
0 |
0 |
0 |
0 |
3 |
.托托m |
170 |
1 |
37.0 |
1.0 |
1.0 |
1.0 |
0 |
0 |
0 |
0 |
4 |
0000妮 |
2 |
1 |
164.0 |
5.0 |
1.0 |
4.0 |
1 |
0 |
1 |
101 |
# Segment name for every R/F/M flag code. The code packs three 0/1 flags:
# hundreds digit = R, tens digit = F, units digit = M.
_RFM_SEGMENT_LABELS = {
    111: '重要价值客户',
    110: '消费潜力客户',
    101: '频次深耕客户',
    100: '新客户',
    11: '重要价值流失预警客户',
    10: '一般客户',
    1: '高消费唤回客户',
    0: '流失客户',
}


def transform_label(x):
    """Return the customer-segment label for an RFM flag code.

    Args:
        x: One of 0, 1, 10, 11, 100, 101, 110, 111.

    Returns:
        The segment name as a string.

    Raises:
        ValueError: If ``x`` is not one of the eight known codes. The
            if/elif chain this replaces left ``label`` unassigned in that case.
    """
    try:
        return _RFM_SEGMENT_LABELS[x]
    except (KeyError, TypeError):
        raise ValueError('unknown RFM flag code: %r' % (x,)) from None
# Translate every buyer's flag code into its segment name.
rfm['人群类型'] = rfm['人群数值'].map(transform_label)
rfm.head()
|
买家昵称 |
R |
F |
M |
R-SCORE |
F-SCORE |
M-SCORE |
R是否大于均值 |
F是否大于均值 |
M是否大于均值 |
人群数值 |
人群类型 |
0 |
.blue_ram |
146 |
1 |
49.0 |
1.0 |
1.0 |
1.0 |
0 |
0 |
0 |
0 |
流失客户 |
1 |
.christiny |
152 |
1 |
183.0 |
1.0 |
1.0 |
4.0 |
0 |
0 |
1 |
1 |
高消费唤回客户 |
2 |
.willn1 |
170 |
1 |
34.0 |
1.0 |
1.0 |
1.0 |
0 |
0 |
0 |
0 |
流失客户 |
3 |
.托托m |
170 |
1 |
37.0 |
1.0 |
1.0 |
1.0 |
0 |
0 |
0 |
0 |
流失客户 |
4 |
0000妮 |
2 |
1 |
164.0 |
5.0 |
1.0 |
4.0 |
1 |
0 |
1 |
101 |
频次深耕客户 |
人数统计
# Head-count per segment (value_counts orders by size, largest first) and
# each segment's share of all buyers.
count = rfm['人群类型'].value_counts().rename_axis('客户类型').reset_index(name='人数')
count['人数占比'] = count['人数'].div(count['人数'].sum())
count
|
客户类型 |
人数 |
人数占比 |
0 |
高消费唤回客户 |
7338 |
0.288670 |
1 |
流失客户 |
6680 |
0.262785 |
2 |
频次深耕客户 |
5427 |
0.213493 |
3 |
新客户 |
4224 |
0.166168 |
4 |
重要价值客户 |
756 |
0.029740 |
5 |
消费潜力客户 |
450 |
0.017703 |
6 |
重要价值流失预警客户 |
360 |
0.014162 |
7 |
一般客户 |
185 |
0.007278 |
金额统计
# Total spend per buyer is rebuilt as F * M (M was computed as total / F),
# then summed per segment together with each segment's share of revenue.
rfm['购买总金额'] = rfm['F'].mul(rfm['M'])
mon = (
    rfm.groupby('人群类型', as_index=False)['购买总金额']
    .sum()
    .rename(columns={'人群类型': '客户类型', '购买总金额': '消费金额'})
)
mon['金额占比'] = mon['消费金额'].div(mon['消费金额'].sum())
mon
|
客户类型 |
消费金额 |
金额占比 |
0 |
一般客户 |
25803.0 |
0.007349 |
1 |
新客户 |
270869.0 |
0.077142 |
2 |
流失客户 |
444617.0 |
0.126624 |
3 |
消费潜力客户 |
64075.0 |
0.018248 |
4 |
重要价值客户 |
269230.0 |
0.076675 |
5 |
重要价值流失预警客户 |
116665.0 |
0.033226 |
6 |
频次深耕客户 |
981893.0 |
0.279638 |
7 |
高消费唤回客户 |
1338153.0 |
0.381098 |
def get_rfm(name='PYTHON-RFM实战数据.xlsx', ref_date='2019-7-1'):
    """Run the whole RFM segmentation on an order export and summarise it.

    Args:
        name: Path of the Excel workbook. It must contain the columns
            订单状态, 买家昵称, 付款日期 and 实付金额; 付款日期 has to be a
            datetime column because day differences are taken from it.
        ref_date: Date against which recency is measured. Defaults to the
            snapshot date used throughout this analysis.

    Returns:
        A DataFrame with one row per customer segment and the columns
        客户类型, 人数, 人数占比, 消费金额 and 金额占比.
    """
    df = pd.read_excel(name)

    # Only successfully completed orders are analysed.
    df = df.loc[df['订单状态'] == '交易成功', :]
    print('剔除退款后还剩:%d行' % len(df))
    # Explicit copy: a column is added to this selection below.
    df = df[['买家昵称', '付款日期', '实付金额']].copy()

    # R: whole days between the buyer's latest payment and the reference date.
    r = df.groupby('买家昵称')['付款日期'].max().reset_index()
    r['R'] = (pd.to_datetime(ref_date) - r['付款日期']).dt.days
    r = r[['买家昵称', 'R']]

    # F: number of distinct calendar days with a payment, so several orders
    # on the same day count once.
    df['日期标签'] = df['付款日期'].dt.strftime('%Y-%m-%d')
    f = df.groupby('买家昵称')['日期标签'].nunique().reset_index()
    f.columns = ['买家昵称', 'F']

    # M: average spend per purchase day (total paid / F).
    sum_m = df.groupby('买家昵称')['实付金额'].sum().reset_index()
    sum_m.columns = ['买家昵称', '总支付金额']
    com_m = pd.merge(sum_m, f, on='买家昵称', how='inner')
    com_m['M'] = com_m['总支付金额'] / com_m['F']

    rfm = pd.merge(r, com_m, on='买家昵称', how='inner')
    rfm = rfm[['买家昵称', 'R', 'F', 'M', '总支付金额']].copy()

    # Left-closed bins; the last bin is open-ended so that arbitrarily large
    # values still receive a score instead of NaN.
    top = float('inf')
    rfm['R-SCORE'] = pd.cut(
        rfm['R'], bins=[0, 30, 60, 90, 120, top],
        labels=[5, 4, 3, 2, 1], right=False,
    ).astype(float)
    rfm['F-SCORE'] = pd.cut(
        rfm['F'], bins=[1, 2, 3, 4, 5, top],
        labels=[1, 2, 3, 4, 5], right=False,
    ).astype(float)
    rfm['M-SCORE'] = pd.cut(
        rfm['M'], bins=[0, 50, 100, 150, 200, top],
        labels=[1, 2, 3, 4, 5], right=False,
    ).astype(float)

    # 1 when the buyer's score is strictly above the mean score, else 0.
    for dim in ('R', 'F', 'M'):
        dim_score = rfm[dim + '-SCORE']
        rfm[dim + '是否大于均值'] = (dim_score > dim_score.mean()) * 1

    # Pack the flags into one code: hundreds = R, tens = F, units = M.
    rfm['人群数值'] = (
        rfm['R是否大于均值'] * 100
        + rfm['F是否大于均值'] * 10
        + rfm['M是否大于均值'] * 1
    )
    rfm['人群类型'] = rfm['人群数值'].apply(transform_label)

    # Head-count per segment and its share of all buyers.
    count = rfm['人群类型'].value_counts().reset_index()
    count.columns = ['客户类型', '人数']
    count['人数占比'] = count['人数'] / count['人数'].sum()

    # Use the summed payments directly; rebuilding them as F * M would
    # reintroduce floating-point rounding from the earlier division.
    rfm['购买总金额'] = rfm['总支付金额'].astype(float)
    mon = rfm.groupby('人群类型')['购买总金额'].sum().reset_index()
    mon.columns = ['客户类型', '消费金额']
    mon['金额占比'] = mon['消费金额'] / mon['消费金额'].sum()

    return pd.merge(count, mon, on='客户类型')
# Segment name for every R/F/M flag code. The code packs three 0/1 flags:
# hundreds digit = R, tens digit = F, units digit = M.
_RFM_SEGMENT_LABELS = {
    111: '重要价值客户',
    110: '消费潜力客户',
    101: '频次深耕客户',
    100: '新客户',
    11: '重要价值流失预警客户',
    10: '一般客户',
    1: '高消费唤回客户',
    0: '流失客户',
}


def transform_label(x):
    """Return the customer-segment label for an RFM flag code.

    Args:
        x: One of 0, 1, 10, 11, 100, 101, 110, 111.

    Returns:
        The segment name as a string.

    Raises:
        ValueError: If ``x`` is not one of the eight known codes. The
            if/elif chain this replaces left ``label`` unassigned in that case.
    """
    try:
        return _RFM_SEGMENT_LABELS[x]
    except (KeyError, TypeError):
        raise ValueError('unknown RFM flag code: %r' % (x,)) from None
# Run the packaged pipeline on the sample workbook and display the summary it returns.
res = get_rfm(name = 'PYTHON-RFM实战数据.xlsx')
res
剔除退款后还剩:27793行
|
客户类型 |
人数 |
人数占比 |
消费金额 |
金额占比 |
0 |
高消费唤回客户 |
7338 |
0.288670 |
1338153.0 |
0.381098 |
1 |
流失客户 |
6680 |
0.262785 |
444617.0 |
0.126624 |
2 |
频次深耕客户 |
5427 |
0.213493 |
981893.0 |
0.279638 |
3 |
新客户 |
4224 |
0.166168 |
270869.0 |
0.077142 |
4 |
重要价值客户 |
756 |
0.029740 |
269230.0 |
0.076675 |
5 |
消费潜力客户 |
450 |
0.017703 |
64075.0 |
0.018248 |
6 |
重要价值流失预警客户 |
360 |
0.014162 |
116665.0 |
0.033226 |
7 |
一般客户 |
185 |
0.007278 |
25803.0 |
0.007349 |