参考：http://mp.weixin.qq.com/s?__biz=MzIxNjA2ODUzNg==&mid=2651440274&idx=1&sn=7c5b8062891110c191f2463c3f1060c8&chksm=8c73dc45bb04555346d33c0e91449b7603468d09d5b3b945140451e9c259a5995b55f070527d&mpshare=1&scene=24&srcid=040751THZgbr42wORjuPnmR0&sharer_sharetime=1586226101529&sharer_shareid=e7cab4ddb0b83013d3591b3744cb904b#rd
数据：https://tianchi.aliyun.com/dataset/dataDetail?dataId=46

1、数据准备与数据理解

> # Attach the data-wrangling packages through pacman, then read the behaviour
> # log. Columns are typed by position: everything is kept as text except the
> # third column (behavior_type), which is read as a factor.
> library(pacman)
> p_load(readr, dplyr, dtplyr)
> df <- read_csv(
+   "./tianchi_mobile_recommend_train_user.csv",
+   col_types = "ccfccc"
+ )
> str(df)

## tibble [802,757 x 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ user_id      : chr [1:802757] "98047837" "97726136" "98607707" "98662432" ...
##  $ item_id      : chr [1:802757] "232431562" "383583590" "64749712" "320593836" ...
##  $ behavior_type: Factor w/ 4 levels "1","3","2","4": 1 1 1 1 1 1 1 1 1 2 ...
##  $ user_geohash : chr [1:802757] NA NA NA "96nn52n" ...
##  $ item_category: chr [1:802757] "4245" "5894" "2883" "6562" ...
##  $ time         : chr [1:802757] "2014-12-06 02" "2014-12-09 20" "2014-12-18 11" "2014-12-06 10" ...
##  - attr(*, "problems")= tibble [1 x 5] (S3: tbl_df/tbl/data.frame)
##   ..$ row     : int 802757
##   ..$ col     : chr NA
##   ..$ expected: chr "6 columns"
##   ..$ actual  : chr "5 columns"
##   ..$ file    : chr "'./tianchi_mobile_recommend_train_user.csv'"
##  - attr(*, "spec")=
##   .. cols(
##   ..   user_id = col_character(),
##   ..   item_id = col_character(),
##   ..   behavior_type = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE),
##   ..   user_geohash = col_character(),
##   ..   item_category = col_character(),
##   ..   time = col_character()
##   .. )

字段名	解释
user_id：	用户ID
item_id：	商品ID
behavior_type：	行为类型,包括click、collect、add-to-cart和payment，相应的值分别为1、2、3和4
user_geohash：	行为发生时的用户位置
item_category：	商品所属的品类
time：	行为发生时间

查看数据集缺失值情况：

> naniar::miss_var_summary(df)  # per-column missing count (n_miss) and percentage (pct_miss)

## # A tibble: 6 x 3
##   variable      n_miss  pct_miss
##   <chr>          <int>     <dbl>
## 1 user_geohash  550130 68.5     
## 2 time               1  0.000125
## 3 user_id            0  0       
## 4 item_id            0  0       
## 5 behavior_type      0  0       
## 6 item_category      0  0

> naniar::gg_miss_var(df,show_pct = TRUE)  # plot missingness per variable, as percentages

各变量缺失值情况

变量time和user_geohash都包含缺失值，其中time包含1个缺失值，user_geohash包含550130个缺失值。
因为user_geohash地理信息在数据收集过程中做过加密转换，所以选择不对缺失值进行处理。

将time列分裂为date和hour列，并转换数据类型：

> # Split "YYYY-MM-DD HH" into a date string and an hour string at the space,
> # then keep the two leading digits of the hour and store them as integers.
> df <- df %>% tidyr::separate(time, into = c("date", "hour"), sep = " ")
> df$hour <- as.integer(stringr::str_extract(df$hour, "^[0-9]{2}"))
> summary(df)

##    user_id            item_id          behavior_type user_geohash      
##  Length:802757      Length:802757      1:756387      Length:802757     
##  Class :character   Class :character   3: 22510      Class :character  
##  Mode  :character   Mode  :character   2: 16013      Mode  :character  
##                                        4:  7847                        
##                                                                        
##                                                                        
##                                                                        
##  item_category          date                hour      
##  Length:802757      Length:802757      Min.   : 0.00  
##  Class :character   Class :character   1st Qu.:10.00  
##  Mode  :character   Mode  :character   Median :16.00  
##                                        Mean   :14.78  
##                                        3rd Qu.:20.00  
##                                        Max.   :23.00  
##                                        NA's   :1

2、用户行为分析

2.1 pv和uv分析

PV(访问量)：即Page View，指网站页面的浏览量或点击量，页面被刷新一次就计算一次。

UV(独立访客)：即Unique Visitor，访问网站的一台电脑客户端为一个访客。

1）日访问量分析

> p_load(ggplot2)
> # na.omit(<lazy_dt>) is a no-op: there is no dtplyr method and the default
> # method returns non-atomic objects as-is, so rows with a missing time must
> # be removed with an explicit filter on the column.
> daily <- df %>% lazy_dt() %>% filter(!is.na(date))
> hourly <- df %>% lazy_dt() %>% filter(!is.na(hour))
> 
> # Daily page views: number of behaviour records per date.
> pv.daily <- daily %>% 
+   group_by(date) %>% summarise(pv=n()) %>% 
+   as_tibble()
> 
> # Step chart of daily PV; the y axis is limited to 20000-50000.
> p1 <- ggplot(pv.daily,aes(as.Date(date),pv)) +
+   geom_step(size=1) +
+   ylim(c(20000,50000)) +
+   theme_bw() +
+   labs(x="")

2）日访客分析

> # Daily unique visitors: number of distinct user_id values per date.
> uv.daily <- daily %>%
+   group_by(date) %>%
+   summarise(uv = n_distinct(user_id)) %>%
+   as_tibble()
> 
> # Step chart of daily UV; the y axis is limited to 3000-5000.
> p2 <- ggplot(uv.daily, aes(x = as.Date(date), y = uv)) +
+   geom_step(size = 1) +
+   ylim(c(3000, 5000)) +
+   theme_bw() +
+   labs(x = "")
> 
> # Highest daily UV as a share of all distinct users in the data set.
> ratio <- max(uv.daily$uv) / n_distinct(df$user_id)
> print(scales::percent(ratio, accuracy = 0.1))

## [1] "67.5%"

> p_load(patchwork)  # provides `+` composition of ggplot objects and plot_layout()
> p1 + p2 + plot_layout(ncol = 1)  # PV chart and UV chart arranged in a single column

pv与uv分布

结果如图所示，pv和uv访问量在双十二期间达到峰值。使用最高uv除以总人数，可得出双十二期间淘宝用户的日活跃率最高为67.5%。

3）小时访问量分析

> # Page views per hour of day, split by behaviour type.
> pv.hour <- hourly %>%
+   group_by(hour, behavior_type) %>%
+   summarise(pv = n()) %>%
+   as_tibble()
> 
> # All four behaviour types on one chart.
> p3 <- ggplot(pv.hour, aes(x = hour, y = pv, colour = behavior_type)) +
+   geom_line(size = 1) +
+   geom_point(size = 2) +
+   scale_x_continuous(breaks = 0:23) +
+   theme_bw() +
+   theme(legend.position = "bottom") +
+   labs(x = "")
> 
> # Same chart with behaviour type 1 removed from both layers.
> p4 <- ggplot(pv.hour, aes(x = hour, y = pv, colour = behavior_type)) +
+   geom_line(data = subset(pv.hour, behavior_type != 1), size = 1) +
+   geom_point(data = subset(pv.hour, behavior_type != 1), size = 2) +
+   scale_x_continuous(breaks = 0:23) +
+   theme_bw() +
+   theme(legend.position = "bottom") +
+   labs(x = "")
> 
> p3 + p4 + plot_layout(ncol = 1)

小时访问量分析

1、2、3、4分别代表点击、收藏、加购物车和支付。从上图中可以看到点击的次数远高于其他三种行为，以至于其他三种行为的趋势看不出来，所以下图中去掉了行为1的曲线。
从整体上看，四种行为的波动情况基本一致，并且在晚上7点-10点间pv访问量最高。
同时，从下图中也可以看到，行为3（加购物车）的pv总量高于行为2（收藏），行为2又高于行为4（支付）。

2.2 用户消费行为分析

1）活跃用户购买次数情况分析

> # Number of purchase records (behavior_type 4) per user.
> buy.daily <- daily %>% 
+   filter(behavior_type==4) %>% group_by(user_id) %>%
+   summarise(n=n()) %>% as_tibble()
> 
> # geom_bar() counts rows per x value by default. It draws the same bars as
> # geom_histogram(stat = "count") without the warning about ignored binning
> # parameters.
> ggplot(buy.daily,aes(n)) +
+   geom_bar() +
+   theme_bw() +
+   labs(x="")

用户消费次数分布

淘宝用户消费次数普遍在10次以下，因此需要重点关注消费次数在10次以上的用户。

2）活跃用户每天人均消费次数

每天消费总次数 / 每天消费总人数

> # Purchase records per day.
> daily.totle <- daily %>% 
+   filter(behavior_type==4) %>% 
+   group_by(date) %>% summarise(totle=n()) %>% as_tibble()
> # Distinct buyers per day.
> daily.count <- daily %>%
+   filter(behavior_type==4) %>% 
+   group_by(date) %>% distinct(user_id) %>% 
+   summarise(count=n()) %>% as_tibble()
> 
> # Join on date explicitly rather than relying on the implicit natural join,
> # then plot purchases per buyer for each day.
> full_join(daily.totle,daily.count,by="date") %>% 
+   mutate(freq=totle/count) %>% 
+   ggplot(aes(as.Date(date),freq)) +
+   geom_line(size=1) +
+   theme_bw() +
+   labs(x="",y="")

每天人均消费次数

可以看到每天的平均消费次数一般都在1.2次左右，双十二期间达到最高值，约为1.4。

3）付费率

每日消费总人数 / 每日总活跃人数（每日有操作行为的人数）

> # Distinct active users per day (users with any recorded behaviour).
> daily.user <- daily %>% group_by(date) %>%
+   distinct(user_id) %>% 
+   summarise(s=n()) %>% as_tibble()
> 
> # Distinct paying users per day. This counts buyers, not transactions:
> # user_id is de-duplicated within each date before counting.
> daily.buy <- daily %>% filter(behavior_type==4) %>% 
+   group_by(date) %>% distinct(user_id) %>%  
+   summarise(b=n()) %>% as_tibble()
> 
> # Pay rate = paying users / active users, joined on date explicitly.
> full_join(daily.buy,daily.user,by="date") %>% 
+   mutate(prob=b / s) %>% 
+   ggplot(aes(as.Date(date),prob)) +
+   geom_line(size=1) +
+   theme_bw() +
+   labs(x="",y="")

付费率

用户付费率在6%左右，平均低于6%，双十二期间超过16%。

4）同一时间段用户消费次数分布

> # 各用户消费次数
> # Purchase records (behavior_type 4) per user across the whole data set.
> user.buy <- lazy_dt(df) %>%
+   filter(behavior_type == 4) %>%
+   group_by(user_id) %>%
+   summarise(n = n()) %>%
+   as_tibble()
> 
> summary(user.buy$n)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   2.359   3.000  82.000

> # Density of per-user purchase counts, drawn as a filled gray area.
> ggplot(user.buy, aes(x = n)) +
+   geom_density(
+     outline.type = "full",
+     fill = "gray",
+     colour = "gray",
+     size = 2
+   ) +
+   theme_bw() +
+   labs(x = "", y = "")

同时段消费次数分布

大多数用户消费次数为1次，平均消费次数为2.359，75%用户消费次数在3次以下。

3、复购情况分析

复购率 = 有复购行为的用户数 / 有购买行为的用户总数
复购：在两个或以上不同日期有购买行为；同一天内购买两次或以上只算一次购买行为

将数据按日期排序，可以看到数据集的时间跨度为从2014-11-18到2014-12-18之间，正好一个月的时间：

> head(daily %>% arrange(date) %>% as_tibble() %>% .[c(1,6,7)])  # earliest rows; columns 1, 6, 7 are user_id, date, hour

## # A tibble: 6 x 3
##   user_id   date        hour
##   <chr>     <chr>      <int>
## 1 104221274 2014-11-18    19
## 2 104221274 2014-11-18    20
## 3 103582986 2014-11-18    13
## 4 103802946 2014-11-18    15
## 5 100684618 2014-11-18    23
## 6 104221274 2014-11-18    19

> tail(daily %>% arrange(date) %>% as_tibble() %>% .[c(1,6,7)])  # latest rows; columns 1, 6, 7 are user_id, date, hour

## # A tibble: 6 x 3
##   user_id   date        hour
##   <chr>     <chr>      <int>
## 1 120873    2014-12-18    17
## 2 18883390  2014-12-18    11
## 3 106090167 2014-12-18    23
## 4 18883390  2014-12-18    16
## 5 18883390  2014-12-18    11
## 6 106090167 <NA>          NA

先按date和user_id去重，然后按user_id计数，大于1时即为复购行为：

> # Collapse purchases to one row per (date, user_id) so several purchases on
> # the same day count once, then count purchase days per user.
> daily.rebuy <- daily %>%
+   filter(behavior_type == 4) %>%
+   distinct(date, user_id) %>%
+   group_by(user_id) %>%
+   summarise(n = n()) %>%
+   as_tibble()
> 
> # Repurchase rate: share of buyers with more than one purchase day.
> rebuy.ratio <- sum(daily.rebuy$n > 1) / nrow(daily.rebuy)
> scales::percent(rebuy.ratio, accuracy = 0.1)

## [1] "46.7%"

> # Bar chart of users by number of purchase days, with x ticks at 1..30.
> ggplot(daily.rebuy, aes(x = n)) +
+   geom_bar() +
+   scale_x_continuous(breaks = 1:30) +
+   theme_bw() +
+   labs(x = "", y = "")

复购情况分布

一个月之内的复购率为46.7%，多数用户在一个月内的购买次数为1-5次。

4、漏斗流失分析

反映用户行为状态从起点（点击）到终点（支付）各阶段的转化率情况。
将数据按商品（item_id）和用户行为分组，然后统计每种行为对应的分组数（即发生过该行为的商品数）及其占比。

> # One row per (item_id, behavior_type) combination present in the data.
> df.category <- lazy_dt(df) %>%
+   group_by(item_id, behavior_type) %>%
+   summarise(n = n()) %>%
+   as_tibble()
> 
> # Proportion of those rows by behaviour type, drawn as labelled bars.
> # table() is called directly on the column so that as.data.frame() yields
> # the Var1 / Freq columns the plot maps to.
> table(df.category$behavior_type) %>%
+   prop.table() %>%
+   as.data.frame() %>%
+   ggplot(aes(x = Var1, y = Freq)) +
+   geom_bar(stat = "identity", fill = "gray60") +
+   geom_text(
+     aes(label = scales::percent(Freq, accuracy = 0.1)),
+     colour = "blue"
+   ) +
+   scale_x_discrete(
+     breaks = c(1, 3, 2, 4),
+     labels = c("点击", "加入购物车", "收藏", "支付")
+   ) +
+   theme_bw() +
+   labs(x = "", y = "")

漏斗流失分析

> scales::percent(3.7 / 92.2,accuracy = 0.001)  # hard-coded shares, presumably add-to-cart (3.7) over click (92.2) read off the chart

## [1] "4.013%"

> scales::percent(1.3 / 3.7,accuracy = 0.001)  # hard-coded shares, presumably payment (1.3) over add-to-cart (3.7) read off the chart

## [1] "35.135%"

用户点击后，大约有4.013%的概率会加入购物车，而加入购物车后大约有35.135%的概率会支付。

72-R语言淘宝用户行为分析案例