所需数据:ORDER_INFO_UTF.CSV、USER_INFO_UTF.CSV
步骤:先在数据库中建表,再将两份csv文件的数据分别导入对应的表
这两份文件分别有10万和50万条数据,如果用数据库自带的导入工具会比较缓慢(例如WORKBENCH导入几千行数据就需要3分钟时间),这里可以用cmd命令行(load data infile)导入,或者用KETTLE进行抽取。
#1. Order detail table
-- One row per order. All columns are stored as text; the analysis queries in
-- this file rely on MySQL's implicit string-to-number / string-to-date
-- conversion when they call SUM(), MONTH(), DATE_FORMAT() and DATEDIFF().
-- The statement is terminated with ';' so it does not run into the next
-- CREATE TABLE when the file is executed as a script.
CREATE TABLE ORDERINFO (
ORDERID varchar(10) PRIMARY KEY,-- order ID, primary key
USERID varchar(10) NULL,-- user ID, can be joined to the user table
ISPAID varchar(10) NULL,-- payment status (the queries compare it with '已支付')
PRINCE varchar(10) NULL,-- order price (name looks like a typo for PRICE; kept because the queries reference it)
PAIDTIME varchar(100) NULL -- order payment time
);
#2. User table
-- One row per user; USERID is the key the order table joins on.
-- The statement is terminated with ';' so it does not run into the following
-- LOAD DATA statement when the file is executed as a script.
CREATE TABLE USERINFO (
USERID varchar(10) PRIMARY KEY,-- user ID, primary key
SEX varchar(10) NULL,-- gender
BIRTH varchar(100) NULL -- date of birth, stored as text
);
#3. Load the CSV files
-- Both tables have to be populated: the sex and age-group queries join
-- orderinfo to userinfo, so loading orderinfo alone leaves them empty.
-- Fields are comma-separated and lines end with CRLF (Windows export).
load data infile 'C:\\ProgramData\\MySQL\\MySQL Server 5.7\\Uploads\\order_info_utf.csv'
into table orderinfo
fields terminated by ','
lines terminated by '\r\n';

-- NOTE(review): assumes user_info_utf.csv sits in the same Uploads directory
-- and uses the same delimiter and line ending as the order file -- confirm.
load data infile 'C:\\ProgramData\\MySQL\\MySQL Server 5.7\\Uploads\\user_info_utf.csv'
into table userinfo
fields terminated by ','
lines terminated by '\r\n';
1-统计不同月份的下单人数
2-统计用户三月份的回购率和复购率
3-统计男女的消费频次是否有差异
4-统计多次消费的用户,第一次和最后一次消费时间的间隔
5-统计不同年龄段的用户消费金额是否有差异
6-统计消费的二八法则,消费的top20%用户,贡献了多少额度
1、统计不同月份的下单人数
思路:查看orderinfo发现支付时间字段都是同一年份的,所以直接使用month函数提取月份,这样更加简便(如果数据跨多个年份,则需要同时按年份和月份分组)。
-- Number of distinct paying users per month.
-- Buckets on the month number only, so rows from different years would share
-- a bucket; the accompanying notes state the data covers a single year.
SELECT
    month(paidTime),
    count(distinct userId)
FROM orderinfo
WHERE isPaid = '已支付'
GROUP BY month(paidTime);
2、统计用户三月份的回购率和复购率
(1)复购率:三月份消费两次及以上的用户数 / 三月份有消费的用户总数
-- March repurchase rate.
-- The derived table holds one row per user with that user's number of paid
-- orders in March (ct). The outer query then returns:
--   1st column: number of users in the derived table
--   2nd column: number of those users with more than one order (ct > 1)
--   ratio     : 2nd column divided by 1st column
SELECT
    count(ct),
    count(if(ct>1,1,null)),
    count(if(ct>1,1,null))/count(ct) AS ratio
FROM (
    SELECT userId, count(userId) AS ct
    FROM orderinfo
    WHERE month(paidTime) = 3
      AND isPaid = '已支付'
    GROUP BY userId
) march_orders;
(2)回购率:当月有消费的用户中,下个月仍有消费的用户所占的比例
-- Month-over-month return rate.
-- t1 and t2 are the same set: one row per (user, month) with at least one
-- paid order, the month being normalised to its first day ('YYYY-MM-01').
-- The LEFT JOIN pairs each user-month in t1 with the same user's row for the
-- FOLLOWING month in t2 (t1.m = t2.m - 1 month); t2.m is NULL when the user
-- did not buy again. Per month m the query returns:
--   count(t1.m) : users who bought in month m
--   count(t2.m) : those who also bought in month m + 1
--   ratio       : count(t2.m) / count(t1.m), i.e. the return rate itself
-- The ratio column is included so the query answers the "rate" question
-- directly instead of leaving the division to the reader.
select t1.m,
       count(t1.m),
       count(t2.m),
       count(t2.m)/count(t1.m) as ratio
from (
    select userId,date_format(paidTime,'%Y-%m-01') as m from orderinfo
    where isPaid = '已支付'
    group by userId,date_format(paidTime,'%Y-%m-01')) t1
left join (
    select userId,date_format(paidTime,'%Y-%m-01') as m from orderinfo
    where isPaid = '已支付'
    group by userId,date_format(paidTime,'%Y-%m-01')) t2
on t1.userId = t2.userId and t1.m = date_sub(t2.m,interval 1 month)
group by t1.m;
3、统计男女用户消费频次是否有差异
-- Average number of paid orders per user, split by sex.
-- Inner query: one row per user with a known sex and that user's paid-order
-- count (ct). Outer query: average of ct for each sex value.
-- Notes on the filters:
--   * sex <> '' is needed in addition to IS NOT NULL: LOAD DATA stores an
--     empty CSV field in a string column as '' rather than NULL, so blank
--     values would otherwise form their own "sex" group.
--   * only paid orders are counted, matching the other purchase queries in
--     this file; unpaid orders are not purchases.
--   * GROUP BY columns are qualified because both joined tables expose userId.
select sex,avg(ct) from (
    select o.userId,t.sex,count(1) as ct from orderinfo o
    inner join (
        select userId,sex from userinfo
        where sex is not null and sex <> '') t
    on o.userId = t.userId
    where o.isPaid = '已支付'
    group by o.userId,t.sex) t2
group by sex;
4、统计多次消费的用户,第一次和最后一次消费间隔是多少
-- For every user with more than one paid order: the latest payment time, the
-- earliest payment time, and the number of days between them.
-- NOTE(review): paidTime is declared varchar in this file's DDL, so max()/min()
-- compare strings; that matches chronological order only if the values use a
-- sortable format such as 'YYYY-MM-DD hh:mm:ss' -- confirm against the data.
SELECT
    userId,
    max(paidTime),
    min(paidTime),
    datediff(max(paidTime),min(paidTime))
FROM orderinfo
WHERE isPaid = '已支付'
GROUP BY userId
HAVING count(*) > 1;
5、统计不同年龄段用户消费频次是否有差异
-- Purchase behaviour by age group.
-- "age" is a decade bucket, not an age in years: ceil((current year - birth
-- year) / 10), so e.g. bucket 3 covers a year difference of 21 to 30.
-- Users whose birth value is not greater than '1901-00-00' (blank or
-- placeholder dates) are left out.
-- Inner query: one row per user with the user's paid-order count (ct) and
-- total paid amount (total). Outer query, per age bucket:
--   avg(ct)    : average number of paid orders per user (frequency)
--   avg(total) : average paid amount per user (spend)
-- Both measures are returned because the task list asks about spend while the
-- section heading asks about frequency.
-- Only paid orders are counted, matching the other purchase queries in this
-- file; unpaid orders are not purchases.
select age,avg(ct),avg(total) from (
    select o.userId,t.age,count(o.userId) as ct,sum(o.prince) as total
    from orderinfo o
    inner join (
        select userId,ceil((year(now()) - year(birth)) / 10) as age
        from userinfo
        where birth > '1901-00-00') t
    on o.userId = t.userId
    where o.isPaid = '已支付'
    group by o.userId,t.age) t2
group by age;
6、统计消费的二八法则,消费的top20%用户,贡献了多少额度
-- Step 1: size of the top-20% group = 20% of the distinct paying users, rounded.
-- Each statement ends with ';' so the two steps can be run together as a script.
select round(COUNT(distinct userid)*0.2) from orderinfo where ISPAID='已支付';
-- 17130
-- Step 2: total paid amount contributed by the highest-spending users.
-- The LIMIT value is the result of step 1 pasted in by hand (MySQL does not
-- accept a subquery in LIMIT), so it must be updated whenever the data changes.
select count(userId),sum(total) from (
    select userId,sum(prince) as total from orderinfo o
    where isPaid = '已支付'
    group by userId
    order by total desc
    limit 17130) t;
注意:MySQL 8.0 之前的版本(包括 5.7)没有窗口函数row_number() over(order by ...)
我们可以用用户变量模拟行号的方式实现:
-- Final SQL: top-20% contribution without a hand-copied LIMIT value.
-- t  : one row per paying user with the user's total paid amount, sorted
--      from highest to lowest.
-- t1 : resets the row counter @num to 0.
-- t2 : numbers the rows of t as 1, 2, 3, ... via @num := @num + 1.
-- The outer query keeps rows whose number is <= round(20% of distinct paying
-- users). The comparison is inclusive because numbering starts at 1: with a
-- strict '<' the result would contain one user fewer than the LIMIT-based
-- version of this query.
-- NOTE(review): this relies on the rows of t being numbered in the derived
-- table's ORDER BY order, which MySQL does not formally guarantee for user
-- variables; on MySQL 8.0+ prefer ROW_NUMBER() OVER (ORDER BY total DESC).
select COUNT(userid),SUM(total) from
(select @num := @num+1 num,userid,total from
(select userId,sum(prince) as total from orderinfo o
where isPaid = '已支付'
group by userId
order by total desc) t,(SELECT @num := 0) t1) t2
where t2.num <= (select round(COUNT(distinct userid)*0.2) from orderinfo where ISPAID='已支付');