写在前面
1、RFM模型是用户价值分析中最常见的模型,核心思想是根据用户的消费间隔(Recency,忠诚度)、消费金额(Monetary,付费情况)、消费频次(Frequency,活跃度)三个特征对用户进行分群(每个特征按高低分为两档,共2×2×2=8个类别),从而针对不同的群体使用不同的策略
2、K-Means算法是常用的聚类算法之一,可基于RFM特征实现用户分群,但可解释性稍差,且K值选取是否合适直接决定模型效果的好坏
3、数据集为线上零售数据集
一、数据导入及清洗
import pandas as pd
import numpy as np
import datetime as dt
# Load the online-retail transactions from the Excel workbook into a DataFrame.
data= pd.read_excel('Online_Retail.xlsx')
# Preview the first rows (rendered when run as a notebook cell).
data.head()
1.1查看数据分布
# Summary statistics for Quantity and UnitPrice, transposed so each column becomes a row.
data[['Quantity','UnitPrice']].describe().T
1.2数据清洗
剔除异常数据,将商品单价(UnitPrice)或购买数量(Quantity)为负的记录剔除
# Keep only rows whose Quantity and UnitPrice are both non-negative.
# Rows where either value is NaN are dropped too, because NaN >= 0 is False.
valid_rows = (data['Quantity'] >= 0) & (data['UnitPrice'] >= 0)
# .copy() yields an independent frame, so the column assignments that follow
# do not trigger pandas' SettingWithCopyWarning on a filtered slice.
data = data.loc[valid_rows].copy()
对时间数据进行处理,仅保留年月日数据
# Truncate every timestamp to midnight so that only the calendar date remains.
# pd.to_datetime first coerces the column to datetime64 (so string dates work
# as well), and .dt.normalize() is vectorised instead of calling a Python
# lambda once per row.
data['InvoiceDate'] = pd.to_datetime(data['InvoiceDate']).dt.normalize()
# Preview the result (rendered when run as a notebook cell).
data.head()
1.3数据计算
计算用户每一次的消费金额
# Amount spent on each line item: quantity multiplied by unit price.
data['TotalAmount'] = data['Quantity'].mul(data['UnitPrice'])
在计算消费间隔之前需要定位观测点作为统计日,将统计日 cal_date定为最大的消费时间后一天
# Snapshot date used as the reference point: one day after the latest invoice.
# Series.max() is vectorised and skips NaT, whereas the builtin max() iterates
# element by element and gives an order-dependent result when NaT is present.
cal_date = data['InvoiceDate'].max() + dt.timedelta(days=1)
计算时间间隔
def cal_frequency(date):
    """Return the number of days between the latest value in ``date`` and ``cal_date``.

    Despite its name, this computes a recency value (days since the most
    recent entry), not a frequency. It reads the module-level ``cal_date``.

    Args:
        date: Object exposing ``.max()`` whose result can be subtracted from
            ``cal_date`` (used below as a groupby aggregation on InvoiceDate).

    Returns:
        The ``.days`` attribute of ``cal_date - date.max()``.
    """
    most_recent = date.max()
    elapsed = cal_date - most_recent
    return elapsed.days
通过group by语法对每一个用户进行分组聚合,计算每个用户的RFM值
# Per-customer aggregation:
#   InvoiceDate -> days since the customer's latest invoice date (cal_frequency)
#   InvoiceNo   -> number of non-null InvoiceNo rows (line items, not distinct invoices)
#   TotalAmount -> total amount spent
rfm_aggregations = {
    'InvoiceDate': cal_frequency,
    'InvoiceNo': 'count',
    'TotalAmount': 'sum',
}
per_customer = data.groupby(['CustomerID']).agg(rfm_aggregations)
rfm = per_customer.sort_index()
字段重命名
# Give the aggregated columns their RFM names.
rfm_column_names = {
    'InvoiceDate': 'Recency',
    'InvoiceNo': 'Frequency',
    'TotalAmount': 'Monetary',
}
rfm = rfm.rename(columns=rfm_column_names)
1.4结果展示
# Preview the per-customer RFM table (rendered when run as a notebook cell).
rfm.head()
二、查看数据分布情况
2.1通过RFM指标的核密度曲线查看数据分布
import matplotlib.pyplot as plt
import seaborn as sns
查看数据分布的函数
def data_distribution(keyvalue,data):
    """Plot the distribution of each listed column side by side in one figure.

    Args:
        keyvalue: Iterable of column names of ``data``; one subplot per name.
        data: DataFrame (or mapping of columns) holding those columns.

    The subplot grid has one row and as many columns as there are names, so
    any number of columns can be plotted rather than exactly three.
    """
    columns = list(keyvalue)
    plt.figure(figsize=(18, 4), dpi=600)
    for position, column in enumerate(columns, start=1):
        plt.subplot(1, len(columns), position)
        # NOTE(review): seaborn has deprecated distplot in favour of
        # histplot/displot; kept here so the rendered plot stays unchanged.
        sns.distplot(data[column])
        plt.title(column, fontsize=15)
# Plot the distributions of the three raw RFM metrics.
keyvalue=['Recency', 'Frequency', 'Monetary']
data_distribution(keyvalue,rfm)
可以看出长尾用户的存在是普遍现象
三、对数据进行分箱处理,计算RFM分数特征实现用户分群
3.1按照各个数值的1/4、1/2、3/4分位数(四分位数)进行数据分类
创建三个新的Column,分别表示R、F、M的四分位(quantile)得分
# Quartile scores from 1 to 4. Recency uses the reversed labels, so the
# quartile with the smallest Recency values (most recent purchase) scores 4.
labels = [1, 2, 3, 4]
labels_reverse = labels[::-1]
Rquartiles = pd.qcut(rfm['Recency'], q=4, labels=labels_reverse)
Fquartiles = pd.qcut(rfm['Frequency'], q=4, labels=labels)
Mquartiles = pd.qcut(rfm['Monetary'], q=4, labels=labels)
# Attach the three score columns in R, F, M order.
rfm = rfm.assign(
    R=Rquartiles.values,
    F=Fquartiles.values,
    M=Mquartiles.values,
)
3.2计算用户RFM总分数
# R, F and M are produced by pd.qcut and are therefore categorical columns.
# Cast them to int first so the row-wise total is a plain numeric sum rather
# than depending on how pandas reduces categorical data.
rfm['RFM_Score'] = rfm[['R','F','M']].astype(int).sum(axis=1)
按照分数排名将用户打上标签
# Tier names for the four RFM_Score quartiles, lowest to highest.
# The second tier was misspelled 'sliver'; it is corrected to 'silver'.
labels=['general', 'silver', 'gold', 'diamond']
# Split customers into four quantile-based tiers by total RFM score.
RFM_Score=pd.qcut(rfm['RFM_Score'],4,labels=labels)
用户标签与原始数据映射
# Attach each customer's tier label, then keep the table ordered by its index.
rfm = rfm.assign(Category=RFM_Score.values)
rfm = rfm.sort_index()
3.3查看数据
# Preview the scored RFM table (rendered when run as a notebook cell).
rfm.head()
计算各类用户的数量
# Number of customers in each Category, sorted by category index.
rfm['Category'].value_counts().sort_index(ascending=True)
四、K-Means聚类
4.1 数据归一化处理
原因是不同用户的RFM特征差别较大,需将不同量级的数据转换到同一量级
在k-means聚类之前,先对数据进行一个归一化处理
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn import metrics
from mpl_toolkits.mplot3d import Axes3D
# Select the three raw RFM columns as the clustering features.
data_kmeans=rfm[['Recency','Frequency','Monetary']]
在正式处理数据之前保留一份原始数据,以便之后使用
# Keep an untransformed copy of the features for interpreting the clusters later.
original=data_kmeans.copy()
先通过log(x+1)变换对数据进行处理,以减弱长尾分布的影响
# log(x + 1) transform of the features. np.log1p is NumPy's purpose-built
# function for this and stays accurate for values close to zero.
data_kmeans = np.log1p(data_kmeans)
z-score标准化,特征值映射到N(0,1)的正态分布
# Z-score standardisation: fit the scaler on the three feature columns and
# overwrite them with the scaled values.
feature_columns = ['Recency','Frequency','Monetary']
Model = preprocessing.StandardScaler()
scaled_features = Model.fit_transform(data_kmeans[feature_columns])
data_kmeans[feature_columns] = scaled_features
写一个循环查看数据分布,除R外,用户的F和M均符合正态分布
# Plot the distributions of the transformed and standardised features.
keyvalue=['Recency', 'Frequency', 'Monetary']
data_distribution(keyvalue,data_kmeans)
4.2 聚类分析
确定聚类数k值常用三种方法:Calinski-Harabasz指数法、轮廓系数法、簇内平方和(肘部)法
# Fit K-Means for k = 2..10 and record three selection criteria per k:
# within-cluster sum of squares (inertia), Calinski-Harabasz index and
# silhouette coefficient.
inertia = []
ch_score = []
ss_score = []
x = np.array(data_kmeans[['Recency', 'Frequency', 'Monetary']])

# scikit-learn's public name is calinski_harabasz_score. The misspelled
# "calinski_harabaz_score" alias was deprecated and later removed, so prefer
# the correct name and fall back only on releases that lack it.
calinski_harabasz = getattr(metrics, 'calinski_harabasz_score', None)
if calinski_harabasz is None:
    calinski_harabasz = metrics.calinski_harabaz_score

for k in range(2, 11):
    model = KMeans(n_clusters=k, init='k-means++', max_iter=1000, random_state=123).fit(x)
    pre = model.predict(x)
    ch = calinski_harabasz(x, pre)
    ss = metrics.silhouette_score(x, pre)
    inertia.append(model.inertia_)
    ch_score.append(ch)
    ss_score.append(ss)
# Collect the three per-k score lists in one Series keyed by metric name and
# draw one line chart per metric against the number of clusters.
score = pd.Series([ch_score,ss_score,inertia],index = ['ch_score','ss_score','inertia'])
key = score.index.tolist()
cluster_counts = list(range(2,11))
plt.figure(figsize = (15,6),dpi=600)
for panel, metric_name in enumerate(key, start=1):
    plt.subplot(1,3,panel)
    plt.plot(cluster_counts,score[metric_name])
    plt.xlabel('n_cluster',fontsize = 13)
    plt.ylabel(f'{metric_name}',fontsize = 13)
    plt.title(f'{metric_name}',fontsize = 15)
# Widen the horizontal gap between the panels.
plt.subplots_adjust(wspace = 0.3)
根据上图指标可见,当k=4时指标有较为明显的变化趋势
# Final model with k = 4 on the standardised features.
model = KMeans(n_clusters=4,max_iter=1000,random_state=123).fit(x)
labels = pd.DataFrame({'Category': model.labels_})
# Cluster centres (in the transformed feature space) next to each cluster's size.
cluster_centers = pd.DataFrame(model.cluster_centers_)
cluster_sizes = labels['Category'].value_counts().sort_index()
kmeans_result = pd.concat([cluster_centers, cluster_sizes], axis = 1)
# NOTE: the last column, named 'Category', holds the number of points per cluster.
kmeans_result.columns = ['Recency', 'Frequency', 'Monetary','Category']
kmeans_result
4.3 评估聚类效果
将标准化后的数据与聚类标签进行关联
# Attach the cluster label to the standardised features and draw a 3-D
# scatter plot with one colour per cluster.
data_kmeans['Category'] = model.labels_
df = data_kmeans
plt.figure(figsize=(15,6))
ax = plt.subplot(121, projection='3d')
# One sub-frame per cluster label 0..3.
df_label0, df_label1, df_label2, df_label3 = [
    df.loc[df.Category == cluster,] for cluster in range(4)
]
cluster_frames = (df_label0, df_label1, df_label2, df_label3)
cluster_colours = ('y', 'r', 'g', 'b')
for cluster_frame, colour in zip(cluster_frames, cluster_colours):
    ax.scatter(
        cluster_frame[['Recency']],
        cluster_frame[['Frequency']],
        cluster_frame[['Monetary']],
        c=colour,
    )
4.4 聚类含义解释
R,F,M的中位数及对应cluster的人数
# Label the untransformed RFM values with their cluster, then summarise each
# cluster by the median of R, F and M together with its customer count.
original['Category'] = model.labels_
cluster_medians = original.groupby("Category")[['Recency', 'Frequency', 'Monetary']].median()
cluster_sizes = original['Category'].value_counts().sort_index()
original_final = pd.concat([cluster_medians, cluster_sizes], axis = 1)
original_final
# Turn the index back into a regular column for the per-customer counts below.
original_reset = original.reset_index()
消费金额大小不等,以100为间距对消费金额进行分箱处理
print('Min = {}, Max = {}'.format(min(original_reset.Monetary ), max(original_reset.Monetary)))
# Bin Monetary into fixed 100-wide intervals (0, 100], (100, 200], ... and
# label each bin with its upper edge. Explicit edges are used because passing
# a bin *count* to pd.cut creates equal-width bins over [min, max], which are
# only approximately 100 wide, and the upper bound is derived from the data
# instead of being hard-coded.
bin_width = 100
top_edge = max(
    bin_width,
    int(np.ceil(original_reset['Monetary'].max() / bin_width)) * bin_width,
)
x = range(bin_width, top_edge + bin_width, bin_width)  # bin labels (upper edges)
edges = [0, *x]
# include_lowest=True puts a Monetary value of exactly 0 into the first bin;
# negative values (if any) fall outside the edges and become NaN.
Monetary_1 = pd.cut(original_reset['Monetary'], bins=edges, labels=x, include_lowest=True)
original_reset= original_reset.assign(M1 = Monetary_1.values)
def _count_customers_by(frame, column, short_name):
    """Count customers per (column, Category) pair.

    Args:
        frame: DataFrame containing ``column``, ``Category`` and ``CustomerID``.
        column: Name of the column to group by together with ``Category``.
        short_name: Name given to ``column`` in the result.

    Returns:
        DataFrame with columns ``short_name``, ``Category`` and ``Count``.
    """
    counts = frame.groupby([column, "Category"]).agg({'CustomerID': 'count'}).reset_index()
    return counts.rename(columns={column: short_name, 'CustomerID': 'Count'})


# Customer counts per cluster along each RFM dimension. The Frequency table
# was previously written with a dangling "." at the end of a line, which is a
# SyntaxError; all three tables now go through the same helper.
data_r = _count_customers_by(original_reset, "Recency", 'R')
data_f = _count_customers_by(original_reset, "Frequency", 'F')
data_m = _count_customers_by(original_reset, "M1", 'M')
绘制R四个类型的可视化图形
# 3x4 figure; the first row shows, for each cluster, the number of customers
# at each Recency value.
plt.figure(figsize=(15,15),dpi=600)
for cluster, colour in enumerate(('y', 'r', 'g', 'b')):
    ax = plt.subplot(3,4,cluster + 1)
    in_cluster = data_r.Category == cluster
    ax.plot(data_r.loc[in_cluster,'R'],data_r.loc[in_cluster,'Count'],c=colour,label=str(cluster))
    ax.legend(fontsize='medium')
    plt.title("R Distribution")
绘制F四个类型的可视化图形
# Second row of the grid: customer count at each Frequency value, per cluster.
# Clusters 1-3 get a fixed x-range; cluster 0 keeps the automatic limits.
f_axis_limits = {1: (0,550), 2: (0,120), 3: (0,300)}
for cluster, colour in enumerate(('y', 'r', 'g', 'b')):
    ax = plt.subplot(3,4,5 + cluster)
    in_cluster = data_f.Category == cluster
    ax.plot(data_f.loc[in_cluster,'F'],data_f.loc[in_cluster,'Count'],c=colour,label=str(cluster))
    ax.legend(fontsize='medium')
    plt.title("F Distribution")
    if cluster in f_axis_limits:
        plt.xlim(*f_axis_limits[cluster])
绘制M四个类型的可视化图形
# Third row of the grid: customer count per Monetary bin, per cluster.
# Bin labels are cast to int for the x-axis; each cluster has its own x-range.
m_axis_limits = {0: (50,2000), 1: (50,20000), 2: (50,2000), 3: (50,6000)}
for cluster, colour in enumerate(('y', 'r', 'g', 'b')):
    ax = plt.subplot(3,4,9 + cluster)
    in_cluster = data_m.Category == cluster
    ax.plot(data_m.loc[in_cluster,'M'].astype("int"),data_m.loc[in_cluster,'Count'],c=colour,label=str(cluster))
    ax.legend(fontsize='medium')
    plt.title("M Distribution")
    plt.xlim(*m_axis_limits[cluster])
由上述的分析结果可以看到,Category=0组的用户是消费频次少,消费间隔长,付费金额低的用户,因此我们认为该类用户群体的价值较低,是较为普通的客户;
Category=1的用户是消费频次多,消费间隔短,付费金额高的用户,因此我们认为该类用户群体的价值最高,是优质的钻石客户;
Category=2的用户消费频次/付费金额略高于Category=0,消费间隔明显低于Category=0的用户,因此我们认为Category=2的用户是较优的银牌用户;
Category=3用户的消费水平/付费金额/消费间隔在Category=1和Category=2之间,因此我们认为Category=3的用户是优质的金牌用户。