数据集链接:淘宝用户购物行为数据集_数据集-阿里云天池
因原数据集过大,电脑和MySQL跑不动,故截取前一百万条数据进行实战演练、
利用navicat软件直接导入下载好的Excel文件(较大数据集可用kettle导入)
use aori;
desc userbehavior;
select * from userbehavior limit 5;
--更改字段名
alter table userbehavior change f1 user_id int;
alter table userbehavior change f2 item_id int;
alter table userbehavior change f3 category_id int;
alter table userbehavior change f4 behavior_type varchar(5);
alter table userbehavior change f5 timestamps int;
--检查空值
select * from userbehavior where user_id is null;
select * from userbehavior where item_id is null;
select * from userbehavior where category_id is null;
select * from userbehavior where behavior_type is null;
select * from userbehavior where timestamps is null;
--检查重复值
select user_id,item_id,timestamps from userbehavior
group by user_id,item_id,timestamps
having count(*) > 1;
--去重
-设置主键
alter table userbehavior add id int first;
alter table userbehavior modify id int primary key auto_increment;
select * from userbehavior limit 5;
-去除重复值
delete userbehavior from
userbehavior,
(
select user_id,item_id,timestamps,min(id) id from userbehavior
group by user_id,item_id,timestamps
having count(*) > 1
) t2
where userbehavior.user_id = t2.user_id
and userbehavior.item_id = t2.item_id
and userbehavior.timestamps = t2.timestamps
and userbehavior.id > t2.id
--新增日期
- datetime
alter table userbehavior add datetimes TIMESTAMP(0);
update userbehavior set datetimes=FROM_UNIXTIME(timestamps);
select * from userbehavior limit 5;
-time
alter table userbehavior add dates char(10);
alter table userbehavior add times char(8);
alter table userbehavior add hours char(2);
update userbehavior set dates=substring(datetimes,1,10);
update userbehavior set times=substring(datetimes,12,8);
update userbehavior set hours=substring(datetimes,12,2);
select * from userbehavior limit 5;
-- 去异常
select max(datetimes),min(datetimes) from userbehavior;
delete from userbehavior
where datetimes < '2017-11-25 00:00:00'
or datetimes > '2017-12-03 23:59:59';
--数据概览
desc userbehavior;
select * from userbehavior limit 5;
select count(1) from userbehavior; # 999530条记录
-- 统计日期-小时行为
select dates,hours
,count(if(behavior_type='pv',behavior_type,null)) 'pv'
,count(if(behavior_type='cart',behavior_type,null)) 'cart'
,count(if(behavior_type='fav',behavior_type,null)) 'fav'
,count(if(behavior_type='buy',behavior_type,null)) 'buy'
from userbehavior
group by dates,hours
order by dates,hours
create table date_hour_behavior(
dates char(10),
hours char(2),
pv int,
cart int,
fav int,
buy int
);
insert into date_hour_behavior
select dates,hours
,count(if(behavior_type='pv',behavior_type,null)) 'pv'
,count(if(behavior_type='cart',behavior_type,null)) 'cart'
,count(if(behavior_type='fav',behavior_type,null)) 'fav'
,count(if(behavior_type='buy',behavior_type,null)) 'buy'
from userbehavior
group by dates,hours
order by dates,hours
select * from date_hour_behavior
--创建临时表
create table temp_behaviors like userbehavior
--截取
insert into temp_behaviors
select * from userbehavior limit 100000;
select * from temp_behaviors;
-- pv(Page View页面浏览量)
select dates
,count(behavior_type) 'pv'
from temp_behaviors
where behavior_type = 'pv'
group by dates;
-- UV(Uniqque Visitor独立访客数)
select dates
,count(distinct user_id) 'uv'
from temp_behaviors
where behavior_type = 'pv'
group by dates;
-- 一条语句
select dates
,count(behavior_type) 'pv'
,count(distinct user_id) 'uv'
,round(count(behavior_type)/count(distinct user_id),1) 'pv/uv'
from temp_behaviors
where behavior_type = 'pv'
group by dates;
-- 处理真实数据
create table pv_uv_puv(
dates char(10),
pv int(9),
uv int(9),
puv decimal(10,1)
);
insert into pv_uv_puv
select dates
,count(behavior_type) 'pv'
,count(distinct user_id) 'uv'
,round(count(behavior_type)/count(distinct user_id),1) 'pv/uv'
from userbehavior
where behavior_type = 'pv'
group by dates;
select * from pv_uv_puv;
①留存率计算
select user_id,dates
from userbehavior
group by user_id,dates;
-- 自关联
select * from
(
select user_id,dates
from userbehavior
group by user_id,dates
) a
,(
select user_id,dates
from userbehavior
group by user_id,dates
) b
where a.user_id = b.user_id
and a.dates <= b.dates
-- 留存数
select a.dates
,count(if(datediff(b.dates,a.dates)=0,b.user_id,null)) retention_0
,count(if(datediff(b.dates,a.dates)=1,b.user_id,null)) retention_1
,count(if(datediff(b.dates,a.dates)=3,b.user_id,null)) retention_3
,count(if(datediff(b.dates,a.dates)=1,b.user_id,null))/count(if(datediff(b.dates,a.dates)=0,b.user_id,null)) retention_rate_1
from
(
select user_id,dates
from userbehavior
group by user_id,dates
) a
,(
select user_id,dates
from userbehavior
group by user_id,dates
) b
where a.user_id = b.user_id
and a.dates <= b.dates
group by a.dates
-- 保存结果
create table retention_rate(
dates char(10),
retention_1 float
);
insert into retention_rate
select a.dates
,count(if(datediff(b.dates,a.dates)=1,b.user_id,null))/count(if(datediff(b.dates,a.dates)=0,b.user_id,null)) retention_rate_1
from
(
select user_id,dates
from userbehavior
group by user_id,dates
) a
,(
select user_id,dates
from userbehavior
group by user_id,dates
) b
where a.user_id = b.user_id
and a.dates <= b.dates
group by a.dates
select * from retention_rate;
②跳失用户计算
-- 跳失率
-- 跳失用户
select count(*)
from
(
select user_id from userbehavior
group by user_id
having count(behavior_type)=1
) a;
select sum(pv) from pv_uv_puv;
create view user_behavior_view as
select user_id,item_id
,count(if(behavior_type='pv',behavior_type,null)) 'pv'
,count(if(behavior_type='fav',behavior_type,null)) 'fav'
,count(if(behavior_type='cart',behavior_type,null)) 'cart'
,count(if(behavior_type='buy',behavior_type,null)) 'buy'
from userbehavior
group by user_id,item_id
-- 用户行为标准化
create view user_behavior_standard as
select user_id,item_id
,(case when pv>0 then 1 else 0 end) '浏览'
,(case when fav>0 then 1 else 0 end) '收藏'
,(case when cart>0 then 1 else 0 end) '加购'
,(case when buy>0 then 1 else 0 end) '购买'
from user_behavior_view
-- 路径类型
create view user_behavior_path as
select *,
concat(浏览,收藏,加购,购买) as path_type
from user_behavior_standard as a
where a.购买 > 0;
-- 统计各类型购买数量
create view path_count as
select path_type
,count(*) path_type_num
from user_behavior_path
group by path_type
order by path_type_num desc;
select * from path_count;
-- 改名表
create table change_name(
path_type char(4),
description varchar(40));
insert into change_name
values('0001','购买'),
('1001','浏览购买'),
('0011','加购购买'),
('1011','浏览加购购买'),
('0101','收藏购买'),
('1101','浏览收藏购买'),
('0111','收藏加购购买'),
('1111','浏览收藏加购购买');
select * from change_name;
create table path_result
(description varchar(40)
,path_type_num int);
insert into path_result
select description,
path_type_num
from path_count
join change_name
on path_count.path_type = change_name.path_type;
select * from path_result;
-- 统计各类行为用户数
select behavior_type
,count(DISTINCT user_id) user_num
from userbehavior
group by behavior_type
order by behavior_type desc
create table behavior_user_num(
behavior_type varchar(5),
user_num int
);
insert into behavior_user_num
select behavior_type
,count(DISTINCT user_id) user_num
from userbehavior
group by behavior_type
order by behavior_type desc;
select * from behavior_user_num;
-- 统计各类行为数量
select behavior_type
,count(*) behavior_count_num
from userbehavior
group by behavior_type
order by behavior_type desc
create table behavior_num(
behavior_type varchar(5),
behavior_count_num int
);
insert into behavior_num
select behavior_type
,count(*) behavior_count_num
from userbehavior
group by behavior_type
order by behavior_type desc;
select * from behavior_num;
-- 品类浏览量TOP10
create table popular_categories(
category_id int,
pv int
);
insert into popular_categories
select category_id
,count(if(behavior_type='pv',behavior_type,null)) '品类浏览量'
from userbehavior
group by category_id
order by 2 desc
limit 10;
select * from popular_categories;
-- 商品浏览量TOP10
create table popular_items(
item_id int,
pv int
);
insert into popular_items
select item_id
,count(if(behavior_type='pv',behavior_type,null)) '商品浏览量'
from userbehavior
group by item_id
order by 2 desc
limit 10;
select * from popular_items;
-- 各类别商品浏览量TOP10
create table popular_cateitems(
category_id int,
item_id int,
pv int
);
insert into popular_cateitems
select category_id,item_id,品类商品浏览量 from
(
select category_id
,item_id
,count(if(behavior_type='pv',behavior_type,null)) '品类商品浏览量'
,rank() over(partition by category_id order by count(if(behavior_type='pv',behavior_type,null)) desc) r
from userbehavior
group by category_id,item_id
order by 3 desc
) a
where a.r = 1
order by a.品类商品浏览量 desc
limit 10;
select * from popular_cateitems;
-- 最近购买时间
select user_id
,max(dates) '最近购买时间'
from userbehavior
where behavior_type = 'buy'
group by user_id
order by 2 desc;
-- 购买次数
select user_id
,count(user_id) '购买次数'
from userbehavior
where behavior_type = 'buy'
group by user_id
order by 2 desc;
-- 统一
select user_id
,max(dates) '最近购买时间'
,count(user_id) '购买次数'
from userbehavior
where behavior_type = 'buy'
group by user_id
order by 2 desc,3 desc;
-- 存储
drop table if exists rfm_model;
create table rfm_model(
user_id int,
recently char(10),
frequency int
);
insert into rfm_model
select user_id
,max(dates) '最近购买时间'
,count(user_id) '购买次数'
from userbehavior
where behavior_type = 'buy'
group by user_id
order by 2 desc,3 desc;
-- 根据购买次数对用户进行分层
alter table rfm_model add column fscore int;
update rfm_model
set fscore =
case when frequency >= 20 then 5
when frequency between 15 and 19 then 4
when frequency between 10 and 15 then 3
when frequency between 5 and 10 then 2
else 1
end;
-- 根据最近购买时间对用户进行分层
alter table rfm_model add column rscore int;
update rfm_model
set rscore =
case when recently = '2017-12-03' then 5
when recently in ('2017-12-01','2017-12-02') then 4
when recently in ('2017-11-30','2017-11-29') then 3
when recently in ('2017-11-28','2017-11-27') then 2
else 1
end;
-- 分层
- 设置变量
set @f_avg = null;
set @r_avg = null;
select avg(fscore) into @f_avg from rfm_model;
select avg(rscore) into @r_avg from rfm_model;
select *
,(case
when fscore > @f_avg and rscore > @r_avg then '价值用户'
when fscore > @f_avg and rscore < @r_avg then '保持用户'
when fscore < @f_avg and rscore > @r_avg then '发展用户'
when fscore < @f_avg and rscore < @r_avg then '挽留用户'
end) class
from rfm_model;
-- 插入
alter table rfm_model add column class varchar(40);
update rfm_model
set class=
case
when fscore > @f_avg and rscore > @r_avg then '价值用户'
when fscore > @f_avg and rscore < @r_avg then '保持用户'
when fscore < @f_avg and rscore > @r_avg then '发展用户'
when fscore < @f_avg and rscore < @r_avg then '挽留用户'
end;
select * from rfm_model;
-- 统计各分区用户数
select class
,count(user_id) class_num
from rfm_model
group by class;