实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)

数据集链接:淘宝用户购物行为数据集_数据集-阿里云天池

因原数据集过大,电脑和MySQL跑不动,故截取前一百万条数据进行实战演练、

1、导入数据

利用navicat软件直接导入下载好的Excel文件(较大数据集可用kettle导入)

实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)_第1张图片

 2、数据预处理

use aori;
desc userbehavior;
select * from userbehavior limit 5;

--更改字段名

alter table userbehavior change f1 user_id int;
alter table userbehavior change f2 item_id int;
alter table userbehavior change f3 category_id int;
alter table userbehavior change f4 behavior_type varchar(5);
alter table userbehavior change f5 timestamps int;


--检查空值

select * from userbehavior where user_id is null;
select * from userbehavior where item_id is null;
select * from userbehavior where category_id is null;
select * from userbehavior where behavior_type is null;
select * from userbehavior where timestamps is null;

--检查重复值
select user_id,item_id,timestamps from userbehavior
group by user_id,item_id,timestamps
having count(*) > 1;

--去重

-设置主键

alter table userbehavior add id int first;
alter table userbehavior modify id int primary key auto_increment;
select * from userbehavior limit 5;

-去除重复值

delete userbehavior from 
userbehavior,
(
select user_id,item_id,timestamps,min(id) id from userbehavior
group by user_id,item_id,timestamps
having count(*) > 1
) t2
where userbehavior.user_id = t2.user_id
and userbehavior.item_id = t2.item_id
and userbehavior.timestamps  = t2.timestamps
and userbehavior.id > t2.id

--新增日期
- datetime

alter table userbehavior add datetimes TIMESTAMP(0);
update userbehavior set datetimes=FROM_UNIXTIME(timestamps);
select * from userbehavior limit 5;

-time
alter table userbehavior add dates char(10);
alter table userbehavior add times char(8);
alter table userbehavior add hours char(2);

update userbehavior set dates=substring(datetimes,1,10);
update userbehavior set times=substring(datetimes,12,8);
update userbehavior set hours=substring(datetimes,12,2);
select * from userbehavior limit 5;

-- 去异常
select max(datetimes),min(datetimes) from userbehavior;

delete from userbehavior
where datetimes < '2017-11-25 00:00:00'
or datetimes > '2017-12-03 23:59:59';


--数据概览
desc userbehavior;
select * from userbehavior limit 5;
select count(1) from userbehavior; # 999530条记录



实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)_第2张图片

实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)_第3张图片

3、时间序列分析

-- 统计日期-小时行为
select dates,hours
,count(if(behavior_type='pv',behavior_type,null)) 'pv'
,count(if(behavior_type='cart',behavior_type,null)) 'cart'
,count(if(behavior_type='fav',behavior_type,null)) 'fav'
,count(if(behavior_type='buy',behavior_type,null)) 'buy'
from userbehavior
group by dates,hours
order by dates,hours


create table date_hour_behavior(
dates char(10),
hours char(2),
pv int,
cart int,
fav int,
buy int
);


insert into date_hour_behavior
select dates,hours
,count(if(behavior_type='pv',behavior_type,null)) 'pv'
,count(if(behavior_type='cart',behavior_type,null)) 'cart'
,count(if(behavior_type='fav',behavior_type,null)) 'fav'
,count(if(behavior_type='buy',behavior_type,null)) 'buy'
from userbehavior
group by dates,hours
order by dates,hours

select * from date_hour_behavior 

实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)_第4张图片

 4、获客情况

--创建临时表
create table temp_behaviors like userbehavior


--截取

insert into temp_behaviors
select * from userbehavior limit 100000;

select * from temp_behaviors;


-- pv(Page View页面浏览量)

select dates
,count(behavior_type) 'pv'
from temp_behaviors
where behavior_type = 'pv'
group by dates;


-- UV(Uniqque Visitor独立访客数)

select dates
,count(distinct user_id) 'uv'
from temp_behaviors
where behavior_type = 'pv'
group by dates;


-- 一条语句

select dates
,count(behavior_type) 'pv'
,count(distinct user_id) 'uv'
,round(count(behavior_type)/count(distinct user_id),1) 'pv/uv' 
from temp_behaviors
where behavior_type = 'pv'
group by dates;


-- 处理真实数据
create table pv_uv_puv(
dates char(10),
pv int(9),
uv int(9),
puv decimal(10,1)
);


insert into pv_uv_puv
select dates
,count(behavior_type) 'pv'
,count(distinct user_id) 'uv'
,round(count(behavior_type)/count(distinct user_id),1) 'pv/uv' 
from userbehavior
where behavior_type = 'pv'
group by dates;

select * from pv_uv_puv;



实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)_第5张图片

 5、留存情况

  ①留存率计算

select user_id,dates
from userbehavior 
group by user_id,dates;


-- 自关联

select * from 
(
select user_id,dates
from userbehavior 
group by user_id,dates
) a
,(
select user_id,dates
from userbehavior 
group by user_id,dates
) b
where a.user_id = b.user_id
and a.dates <= b.dates


-- 留存数
select a.dates
,count(if(datediff(b.dates,a.dates)=0,b.user_id,null)) retention_0
,count(if(datediff(b.dates,a.dates)=1,b.user_id,null)) retention_1
,count(if(datediff(b.dates,a.dates)=3,b.user_id,null)) retention_3
,count(if(datediff(b.dates,a.dates)=1,b.user_id,null))/count(if(datediff(b.dates,a.dates)=0,b.user_id,null)) retention_rate_1
from
(
select user_id,dates
from userbehavior 
group by user_id,dates
) a
,(
select user_id,dates
from userbehavior 
group by user_id,dates
) b
where a.user_id = b.user_id
and a.dates <= b.dates
group by a.dates


-- 保存结果
create table retention_rate(
dates char(10),
retention_1 float
);

insert into retention_rate
select a.dates
,count(if(datediff(b.dates,a.dates)=1,b.user_id,null))/count(if(datediff(b.dates,a.dates)=0,b.user_id,null)) retention_rate_1
from
(
select user_id,dates
from userbehavior 
group by user_id,dates
) a
,(
select user_id,dates
from userbehavior 
group by user_id,dates
) b
where a.user_id = b.user_id
and a.dates <= b.dates
group by a.dates


select * from retention_rate;

实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)_第6张图片

②跳失用户计算

-- 跳失率

-- 跳失用户

select count(*)
from
(
select user_id from userbehavior
group by user_id
having count(behavior_type)=1
) a;


select sum(pv) from pv_uv_puv;

 6、行为路径分析


create view user_behavior_view as 
select user_id,item_id
,count(if(behavior_type='pv',behavior_type,null)) 'pv'
,count(if(behavior_type='fav',behavior_type,null)) 'fav'
,count(if(behavior_type='cart',behavior_type,null)) 'cart'
,count(if(behavior_type='buy',behavior_type,null)) 'buy'
from userbehavior
group by user_id,item_id


-- 用户行为标准化

create view user_behavior_standard as 
select user_id,item_id
,(case when pv>0 then 1 else 0 end) '浏览'
,(case when fav>0 then 1 else 0 end) '收藏'
,(case when cart>0 then 1 else 0 end) '加购'
,(case when buy>0 then 1 else 0 end) '购买'
from user_behavior_view


-- 路径类型

create view user_behavior_path as 
select *,
concat(浏览,收藏,加购,购买) as path_type
from user_behavior_standard as a 
where a.购买 > 0;


-- 统计各类型购买数量

create view path_count as 
select path_type
,count(*) path_type_num
from user_behavior_path
group by path_type
order by path_type_num desc;


select * from path_count;

实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)_第7张图片

-- 改名表

create table change_name(
path_type char(4),
description varchar(40));


insert into change_name
values('0001','购买'),
('1001','浏览购买'),
('0011','加购购买'),
('1011','浏览加购购买'),
('0101','收藏购买'),
('1101','浏览收藏购买'),
('0111','收藏加购购买'),
('1111','浏览收藏加购购买');

select * from change_name;

create table path_result
(description varchar(40)
,path_type_num int);



insert into path_result
select description,
path_type_num
from path_count
join change_name
on path_count.path_type = change_name.path_type;

select * from path_result;

实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)_第8张图片

 7、用户转化率分析


-- 统计各类行为用户数

select behavior_type
,count(DISTINCT user_id) user_num
from userbehavior
group by behavior_type
order by behavior_type desc


create table behavior_user_num(
behavior_type varchar(5),
user_num int
);


insert into behavior_user_num
select behavior_type
,count(DISTINCT user_id) user_num
from userbehavior
group by behavior_type
order by behavior_type desc;


select * from behavior_user_num;

实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)_第9张图片

-- 统计各类行为数量


select behavior_type
,count(*) behavior_count_num
from userbehavior
group by behavior_type
order by behavior_type desc

create table behavior_num(
behavior_type varchar(5),
behavior_count_num int
);

insert into behavior_num
select behavior_type
,count(*) behavior_count_num
from userbehavior
group by behavior_type
order by behavior_type desc;

select * from behavior_num;

实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)_第10张图片

 8、TOP商品

-- 品类浏览量TOP10

create table popular_categories(
category_id int,
pv int
);


insert into popular_categories
select category_id
,count(if(behavior_type='pv',behavior_type,null))  '品类浏览量'
from userbehavior
group by category_id
order by 2 desc
limit 10;

select * from popular_categories;

实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)_第11张图片

-- 商品浏览量TOP10

create table popular_items(
item_id int,
pv int
);

insert into popular_items
select item_id
,count(if(behavior_type='pv',behavior_type,null))  '商品浏览量'
from userbehavior
group by item_id
order by 2 desc
limit 10;

select * from popular_items;

实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)_第12张图片

-- 各类别商品浏览量TOP10

create table popular_cateitems(
category_id int,
item_id int,
pv int
);

insert into popular_cateitems
select category_id,item_id,品类商品浏览量 from
(
select category_id
,item_id
,count(if(behavior_type='pv',behavior_type,null))  '品类商品浏览量'
,rank() over(partition by category_id order by count(if(behavior_type='pv',behavior_type,null)) desc) r
from userbehavior
group by category_id,item_id
order by 3 desc
) a 
where a.r = 1
order by a.品类商品浏览量 desc
limit 10;

select * from popular_cateitems;

实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)_第13张图片

9、RFM模型


-- 最近购买时间
select user_id
,max(dates) '最近购买时间'
from userbehavior
where behavior_type = 'buy'
group by user_id
order by 2 desc;


-- 购买次数
select user_id
,count(user_id) '购买次数'
from userbehavior
where behavior_type = 'buy'
group by user_id
order by 2 desc;


-- 统一
select user_id
,max(dates) '最近购买时间'
,count(user_id) '购买次数'
from userbehavior
where behavior_type = 'buy'
group by user_id
order by 2 desc,3 desc;

-- 存储

drop table if exists rfm_model;
create table rfm_model(
user_id int,
recently char(10),
frequency int
);


insert into rfm_model
select user_id
,max(dates) '最近购买时间'
,count(user_id) '购买次数'
from userbehavior
where behavior_type = 'buy'
group by user_id
order by 2 desc,3 desc;

-- 根据购买次数对用户进行分层

alter table rfm_model add column fscore int;

update rfm_model
set fscore = 
case when frequency >= 20 then 5
when frequency between 15 and 19 then 4
when frequency between 10 and 15 then 3
when frequency between 5 and 10 then 2
else 1
end;

-- 根据最近购买时间对用户进行分层

alter table rfm_model add column rscore int;

update rfm_model
set rscore = 
case when recently = '2017-12-03' then 5
when recently in ('2017-12-01','2017-12-02') then 4
when recently in ('2017-11-30','2017-11-29') then 3
when recently in ('2017-11-28','2017-11-27') then 2
else 1
end;

-- 分层

- 设置变量
set @f_avg = null;
set @r_avg = null;
select avg(fscore) into @f_avg from rfm_model;
select avg(rscore) into @r_avg from rfm_model;

select *
,(case
when fscore > @f_avg and rscore > @r_avg then '价值用户'
when fscore > @f_avg and rscore < @r_avg then '保持用户'
when fscore < @f_avg and rscore > @r_avg then '发展用户'
when fscore < @f_avg and rscore < @r_avg then '挽留用户'
end) class
from rfm_model;

-- 插入

alter table rfm_model add column class varchar(40);

update rfm_model
set class=
case
when fscore > @f_avg and rscore > @r_avg then '价值用户'
when fscore > @f_avg and rscore < @r_avg then '保持用户'
when fscore < @f_avg and rscore > @r_avg then '发展用户'
when fscore < @f_avg and rscore < @r_avg then '挽留用户'
end;

select * from rfm_model;



实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)_第14张图片

-- 统计各分区用户数
select class
,count(user_id) class_num
from rfm_model
group by class;

实战整理-阿里天池淘宝用户购物行为数据集实战(MySQL数据分析+Navicat)_第15张图片

 

你可能感兴趣的:(mysql)