描述型数据分析 收集各种数据, 计算各种业务指标 对当前的业务状态进行描述
诊断型 业务是处于什么状态
预测 机器学习算法
规范/方案 数据驱动运营 数据赋能业务
找到问题, 发现机会
新增用户指标 DNU
留存: 次日留存 3日留存 7日留存
应用的质量
渠道质量
转化率: 例如 400 个 UV 中有 20 个收藏, 则收藏转化率 = 20/400 = 5%
行为维度
PV UV
转化率, 转发率
产品
GMV: 电商
不扣除退货订单
不扣除未付款订单
只要拍下链接(下单)就计入, 运费、银行手续费等也都算进去
ARPU: 平均每用户收入 (总收入 / 用户数)
ARPPU: 平均每付费用户收入 (总收入 / 付费用户数)
CPM: 按千次展示收费 (Cost Per Mille)
CPC: 按点击收费 (Cost Per Click)
CPA: 按效果(行动)收费 (Cost Per Action)
# 导包/导入数据
from datetime import datetime, timedelta
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.offline as pyoff
import plotly.graph_objs as go
pyoff.init_notebook_mode()
# Load the two yearly sheets of the retail workbook
workbook_path = 'data2/online_retail_II.xlsx'
retail_data_1 = pd.read_excel(workbook_path, sheet_name='Year 2009-2010')
retail_data_2 = pd.read_excel(workbook_path, sheet_name='Year 2010-2011')
# Stack both sheets into a single frame with a fresh 0..n-1 index
yearly_frames = [retail_data_1, retail_data_2]
retail_data = pd.concat(yearly_frames, ignore_index=True)
# Print column dtypes and non-null counts
retail_data.info()
# Silence library warnings for the rest of the session
import warnings
warnings.filterwarnings('ignore')
# Make sure InvoiceDate is a datetime column
retail_data['InvoiceDate'] = pd.to_datetime(retail_data['InvoiceDate'])
retail_data['InvoiceDate'].describe()
# Rows with a negative Price are bad-debt adjustments (per the original note).
# Keep only rows whose Quantity AND Price are both strictly positive.
has_positive_quantity = retail_data['Quantity'] > 0
has_positive_price = retail_data['Price'] > 0
retail_data_clean = retail_data[has_positive_quantity & has_positive_price]
# Length of each StockCode; rows for which no length is available are given
# the value 5, so they match neither of the two filters below.
stockcode_length = retail_data_clean['StockCode'].str.len().fillna(5)
# Distinct stock codes shorter than 5 characters
retail_data_clean[stockcode_length < 5]['StockCode'].unique()
# Distinct stock codes longer than 6 characters
retail_data_clean[stockcode_length > 6]['StockCode'].unique()
# Special (non-product) SKUs found by the length checks above
special_sku = ['POST', 'DOT', 'M', 'C2', 'PADS', 'm', 'D', 'S', 'B','BANKCHARGES','AMAZONFEE','ADJUST2','TEST001','TEST002']
# Only these codes are removed; every other special SKU stays in the data.
excluded_sku = ['B', 'TEST001', 'TEST002']
# `~` negates the boolean mask, so rows whose StockCode is NOT excluded are kept
retail_data_clean = retail_data_clean[~retail_data_clean['StockCode'].isin(excluded_sku)]
# Add the transaction month as an integer YYYYMM (e.g. 201003).
# It is stored as an int rather than a '%Y%m' string so that it can be
# compared with integer literals and with the integer FirstBuyYearMonth
# column later in the analysis.
invoice_date = retail_data_clean['InvoiceDate']
retail_data_clean['BuyYearMonth'] = invoice_date.dt.year * 100 + invoice_date.dt.month
# Amount of each transaction row: unit price * quantity
retail_data_clean['Revenue'] = retail_data_clean['Quantity'] * retail_data_clean['Price']
# Total revenue per month (GMV)
retail_revenue = (
    retail_data_clean.groupby('BuyYearMonth')[['Revenue']]
    .sum()
    .reset_index()
)
# Plotly is an open-source Python visualisation library for interactive
# charts, similar in purpose to Pyecharts.
import plotly.offline as pyoff
import plotly.graph_objs as go
# One line trace: month on the x axis, monthly revenue on the y axis
revenue_trace = go.Scatter(
    x=retail_revenue['BuyYearMonth'],
    y=retail_revenue['Revenue'],
)
plot_data = [revenue_trace]
# Treat months as discrete categories instead of a numeric axis
plot_layout = go.Layout(xaxis={'type':'category'}, title='月交易额')
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Month-over-month relative change of revenue; the first month has no
# predecessor, so its growth value is NaN.
retail_revenue['MonthlyGrowth'] = retail_revenue['Revenue'].pct_change()
# Reusable line-chart helper
def draw_line(data_x, data_y, title):
    """Render an interactive Plotly line chart.

    Args:
        data_x: Values for the x axis; displayed as discrete categories.
        data_y: Values for the y axis, aligned with ``data_x``.
        title: Chart title.
    """
    plot_data = [go.Scatter(x=data_x, y=data_y)]
    plot_layout = go.Layout(xaxis={"type": "category"}, title=title)
    fig = go.Figure(data=plot_data, layout=plot_layout)
    pyoff.iplot(fig)
# Plot month-over-month revenue growth for every month before 201112.
# The month column is cast to int so the comparison works whether it holds
# YYYYMM strings or integers, and the filter is evaluated only once.
# NOTE(review): 201112 is presumably excluded because that month is
# incomplete in the data set - confirm.
growth_before_last_month = retail_revenue[retail_revenue['BuyYearMonth'].astype(int) < 201112]
draw_line(
    data_x=growth_before_last_month['BuyYearMonth'],
    data_y=growth_before_last_month['MonthlyGrowth'],
    title='月销售环比',
)
# Drop rows without a customer id. Reassigning (instead of inplace=True)
# avoids mutating a frame that was itself produced by filtering.
retail_data_clean = retail_data_clean.dropna(subset=['Customer ID'])
# Monthly active users: number of distinct customers per month
mau = (
    retail_data_clean.groupby('BuyYearMonth')[['Customer ID']]
    .nunique()
    .reset_index()
)
# Reusable bar-chart helper
def draw_bar(data_x, data_y, title):
    """Render an interactive Plotly bar chart.

    Args:
        data_x: Values for the x axis; displayed as discrete categories.
        data_y: Bar heights, aligned with ``data_x``.
        title: Chart title.
    """
    plot_data = [go.Bar(x=data_x, y=data_y)]
    plot_layout = go.Layout(xaxis={"type": "category"}, title=title)
    fig = go.Figure(data=plot_data, layout=plot_layout)
    pyoff.iplot(fig)
# Monthly active users as a bar chart
draw_bar(mau['BuyYearMonth'], mau['Customer ID'], '月活用户数')
# The same data as a line chart
draw_line(mau['BuyYearMonth'], mau['Customer ID'], '月活用户数')
# Total quantity of items sold per month
retail_monthly_sales = (
    retail_data_clean.groupby('BuyYearMonth')['Quantity']
    .sum()
    .reset_index()
)
retail_monthly_sales.head()
draw_bar(
    retail_monthly_sales['BuyYearMonth'],
    retail_monthly_sales['Quantity'],
    '月销商品数量',
)
# Mean Revenue of the transaction rows in each month
average_revenue = (
    retail_data_clean.groupby(['BuyYearMonth'])['Revenue']
    .mean()
    .reset_index()
)
draw_bar(
    data_x=average_revenue['BuyYearMonth'],
    data_y=average_revenue['Revenue'],
    title='月销商品件单价',
)
# Earliest invoice date of every customer
retail_first_buy = (
    retail_data_clean.groupby('Customer ID')['InvoiceDate']
    .min()
    .reset_index()
)
# Rename the columns
retail_first_buy.columns = ['Customer ID', 'FirstBuyDate']
# Month of the first purchase as an integer YYYYMM (100 * year + month)
first_buy_date = retail_first_buy['FirstBuyDate']
retail_first_buy['FirstBuyYearMonth'] = first_buy_date.dt.year * 100 + first_buy_date.dt.month
retail_first_buy
# Attach each customer's first-purchase info to every transaction row
retail_data_clean = pd.merge(retail_data_clean, retail_first_buy, on='Customer ID')
retail_data_clean.head()
# UserType separates new from existing customers: a row is 'New' unless the
# purchase month is later than the customer's first purchase month.
retail_data_clean['UserType'] = 'New'
# Cast to int so the comparison works whether BuyYearMonth holds YYYYMM
# strings or integers (FirstBuyYearMonth is an integer).
bought_after_first_month = (
    retail_data_clean['BuyYearMonth'].astype(int)
    > retail_data_clean['FirstBuyYearMonth']
)
retail_data_clean.loc[bought_after_first_month, 'UserType'] = 'Existing'
retail_data_clean.UserType.value_counts()
# Revenue per month, split by new vs. existing customers
new_exist_revenue = (
    retail_data_clean.groupby(['BuyYearMonth', 'UserType'])['Revenue']
    .sum()
    .reset_index()
)
new_exist_revenue.head()
# Drop the months 200912 and 201112. The month column is cast to int so the
# filter matches whether it holds YYYYMM strings or integers.
# NOTE(review): presumably excluded as the first month (no existing users
# yet) and an incomplete last month - confirm.
excluded_months = [200912, 201112]
new_exist_revenue = new_exist_revenue[
    ~new_exist_revenue['BuyYearMonth'].astype(int).isin(excluded_months)
]
# Revenue of each customer in each month (truncated to int); one row per
# (customer, month) pair in which the customer bought something
user_purchase = (
    retail_data_clean.groupby(['Customer ID', 'BuyYearMonth'])['Revenue']
    .sum()
    .astype(int)
    .reset_index()
)
# Customer x month table: 1 where the customer purchased in that month,
# 0 where they did not
user_retention = pd.crosstab(
    user_purchase['Customer ID'],
    user_purchase['BuyYearMonth'],
).reset_index()
user_retention
# Month columns used for retention. Column 0 is 'Customer ID' (from
# reset_index) and column 1 is the first month in the data, so the slice
# starts at the second month (201001 per the original note).
months = user_retention.columns[2:]
months
# Month-over-month retention: for each pair of consecutive months, the share
# of customers who bought in the previous month and bought again in the
# current month.
retention_list = []
for pre_mon, selected_mon in zip(months[:-1], months[1:]):
    # Customers with a purchase in both the previous and the current month
    bought_in_both = (user_retention[selected_mon] > 0) & (user_retention[pre_mon] > 0)
    retention_list.append({
        # Current month, as an integer YYYYMM
        'InvoiceYearMonth': int(selected_mon),
        # Number of customers who purchased in the previous month
        'AllUserCount': user_retention[pre_mon].sum(),
        # Number of those customers who also purchased in the current month
        'RetainedUserCount': user_retention.loc[bought_in_both, selected_mon].sum(),
    })
# Turn the per-month records into a DataFrame and compute the retention rate
monthly_retention = pd.DataFrame(retention_list)
monthly_retention['RetentionRate'] = (
    monthly_retention['RetainedUserCount'] / monthly_retention['AllUserCount']
)