第一题
表结构:uid,subject_id,score
求:找出所有科目成绩都大于某一学科平均成绩的学生
数据集如下
1001 01 90
1001 02 90
1001 03 90
1002 01 85
1002 02 85
1002 03 70
1003 01 70
1003 02 70
1003 03 85
1)建表语句
create table score(
uid string,
subject_id string,
score int)
row format delimited fields terminated by '\t';
2)求出每个学科平均成绩
select
uid,
score,
avg(score) over(partition by subject_id) avg_score
from
score;t1
3)根据是否大于平均成绩记录flag,大于则记为0否则记为1
select
uid,
if(score>avg_score,0,1) flag
from
t1;t2
4)根据学生id进行分组统计flag的和,和为0则是所有学科都大于平均成绩
select
uid
from
t2
group by
uid
having
sum(flag)=0;
5)最终SQL
select
uid
from
(select
uid,
if(score>avg_score,0,1) flag
from
(select
uid,
score,
avg(score) over(partition by subject_id) avg_score
from
score)t1)t2
group by
uid
having
sum(flag)=0;
第二题
我们有如下的用户访问数据
userId visitDate visitCount
u01 2017/1/21 5
u02 2017/1/23 6
u03 2017/1/22 8
u04 2017/1/20 3
u01 2017/1/23 6
u01 2017/2/21 8
U02 2017/1/23 6
U01 2017/2/22 4
要求使用SQL统计出每个用户的累积访问次数,如下表所示:
用户id 月份 小计 累积
u01 2017-01 11 11
u01 2017-02 12 23
u02 2017-01 12 12
u03 2017-01 8 8
u04 2017-01 3 3
数据集
u01,2017/1/21,5
u02,2017/1/23,6
u03,2017/1/22,8
u04,2017/1/20,3
u01,2017/1/23,6
u01,2017/2/21,8
u02,2017/1/23,6
u01,2017/2/22,4
1)创建表
create table action
(userId string,
visitDate string,
visitCount int)
row format delimited fields terminated by ",";
2)修改数据格式
select
userId,
date_format(regexp_replace(visitDate,'/','-'),'yyyy-MM') mn,
visitCount
from
action;t1
3)计算每人单月访问量
select
userId,
mn,
sum(visitCount) mn_count
from
t1
group by
userId,mn;t2
4)按月累计访问量
select
userId,
mn,
mn_count,
sum(mn_count) over(partition by userId order by mn)
from t2;
5)最终SQL
select
userId,
mn,
mn_count,
sum(mn_count) over(partition by userId order by mn)
from
(select
userId,
mn,
sum(visitCount) mn_count
from
(select
userId,
date_format(regexp_replace(visitDate,'/','-'),'yyyy-MM') mn,
visitCount
from
action) t1
group by userId,mn) t2;
显示结果:
OK
userid mn mn_count sum_window_0
u01 2017-01 11 11
u01 2017-02 12 23
u02 2017-01 12 12
u03 2017-01 8 8
u04 2017-01 3 3
Time taken: 72.886 seconds, Fetched: 5 row(s)
第三题
有50W个京东店铺,每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志,访问日志存储的表名为Visit,访客的用户id为user_id,被访问的店铺名称为shop,请统计:
1)每个店铺的UV(访客数)
2)每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数
数据集
u1 a
u2 b
u3 b
u1 a
u3 c
u4 b
u4 b
u1 a
u2 c
u5 b
u4 b
u6 c
u2 c
u1 b
u2 a
u2 a
u3 a
u5 a
u5 a
u5 a
u5 a
1.建表
create table visit(user_id string,shop string) row format delimited fields terminated by '\t';
2.每个店铺的UV(访客数)
select shop,count(distinct user_id) from visit group by shop;
3.每个店铺访问次数top3的访客信息。输出店铺名称、访客id、访问次数
(1)查询每个店铺被每个用户访问次数
select shop,user_id,count(*) ct
from visit
group by shop,user_id;t1
shop user_id ct
a u1 3
a u2 2
a u3 1
a u5 4
b u1 1
b u2 1
b u3 1
b u4 3
b u5 1
c u2 2
c u3 1
c u6 1
(2)计算每个店铺被用户访问次数排名
select shop,user_id,ct,rank() over(partition by shop order by ct) rk from t1;t2
shop user_id ct rk
a u5 4 1
a u1 3 2
a u2 2 3
a u3 1 4
b u4 3 1
b u5 1 2
b u3 1 2
b u2 1 2
b u1 1 2
c u2 2 1
c u6 1 2
c u3 1 2
(3)取每个店铺排名前3的
select shop,user_id,ct
from t2
where rk<=3;
(4)最终SQL
select
shop,
user_id,
ct
from
(select
shop,
user_id,
ct,
rank() over(partition by shop order by ct desc) rk
from
(select
shop,
user_id,
count(*) ct
from visit
group by
shop,
user_id) t1
) t2
where rk <= 3;
结果:
shop user_id ct
a u5 4
a u1 3
a u2 2
b u4 3
b u5 1
b u3 1
b u2 1
b u1 1
c u2 2
c u6 1
c u3 1
第四题
已知一个表STG.ORDER,有如下字段:Date,Order_id,User_id,amount。请给出sql进行统计:数据样例:2017-01-01,10029028,1000003251,33.57。
1)给出 2017年每个月的订单数、用户数、总成交金额。
2)给出2017年11月的新客数(指在11月才有第一笔订单)
建表:
create table order_tab
(dt string,order_id string,user_id string,amount decimal(10,2))
row format delimited fields terminated by '\t';
插入数据:
INSERT INTO TABLE order_tab VALUES ('2017-01-01','10029028','1000003251',33.57);
INSERT INTO TABLE order_tab VALUES ('2017-01-01','10029029','1000003251',38.57);
INSERT INTO TABLE order_tab VALUES ('2017-01-01','100290288','1000003252',33.57);
INSERT INTO TABLE order_tab VALUES ('2017-02-02','10029088','1000003251',39.57);
INSERT INTO TABLE order_tab VALUES ('2017-02-02','100290281','1000003251',33.57);
INSERT INTO TABLE order_tab VALUES ('2017-02-02','100290282','1000003253',33.57);
INSERT INTO TABLE order_tab VALUES ('2017-11-02','10290282','100003253',234);
INSERT INTO TABLE order_tab VALUES ('2018-11-02','10290284','100003243',234);
(1)给出 2017年每个月的订单数、用户数、总成交金额。
SELECT t1.mon,
count(t1.order_id) AS order_cnt,
count(DISTINCT t1.user_id) AS user_cnt,
sum(amount) AS total_amount
FROM
(SELECT order_id,
user_id,
amount,
date_format(dt,'yyyy-MM') mon
FROM order_tab
WHERE date_format(dt,'yyyy') = '2017') t1
GROUP BY t1.mon;
t1.mon order_cnt user_cnt total_amount
2017-01 3 2 105.71
2017-02 3 2 106.71
2017-11 1 1 234
(2)给出2017年11月的新客数(指在11月才有第一笔订单)
SELECT count(user_id) new_user
FROM order_tab
GROUP BY user_id
HAVING date_format(min(dt),'yyyy-MM')='2017-11';
new_user
1
第五题
2019-02-11,test_1,23
2019-02-12,test_1,23
2019-02-11,test_2,19
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-11,test_3,39
2019-02-11,test_1,23
2019-02-12,test_2,19
2019-02-13,test_1,23
2019-02-15,test_2,19
2019-02-16,test_2,19
INSERT INTO TABLE user_age VALUES ('2019-02-12','test_1',23);
1)建表
create table user_age(dt string,user_id string,age int)row format delimited fields terminated by ',';
2)按照日期以及用户分组,按照日期排序并给出排名
select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,user_id;t1
dt user_id age rk
2019-02-11 test_1 23 1
2019-02-12 test_1 23 2
2019-02-13 test_1 23 3
2019-02-11 test_2 19 1
2019-02-12 test_2 19 2
2019-02-15 test_2 19 3
2019-02-16 test_2 19 4
2019-02-11 test_3 39 1
3)计算日期及排名的差值
select
user_id,
age,
date_sub(dt,rk) flag
from (select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,user_id) t1;
user_id age flag
test_1 23 2019-02-10
test_1 23 2019-02-10
test_1 23 2019-02-10
test_2 19 2019-02-10
test_2 19 2019-02-10
test_2 19 2019-02-12
test_2 19 2019-02-12
test_3 39 2019-02-10
4)过滤出差值大于等于2的,即为连续两天活跃的用户
select
user_id,
min(age) age
from
(select
user_id,
age,
date_sub(dt,rk) flag
from (select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,user_id) t1) t2
group by
user_id,flag
having
count(*)>=2;t3
user_id age
test_1 23
test_2 19
test_2 19
5)对数据进行去重处理(一个用户可以在两个不同的时间点连续登录),例如:a用户在1月10号1月11号以及1月20号和1月21号4天登录。
select
user_id,
min(age) age
from
(select
user_id,
min(age) age
from
(select
user_id,
age,
date_sub(dt,rk) flag
from (select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,user_id) t1) t2
group by
user_id,flag
having
count(*)>=2) t3
group by
user_id; t4
user_id age
test_1 23
test_2 19
6)计算活跃用户(两天连续有访问)的人数以及平均年龄
select
count() ct,
cast(sum(age)/count() as decimal(10,2)) avg_age
from (select
user_id,
min(age) age
from
(select
user_id,
min(age) age
from
(select
user_id,
age,
date_sub(dt,rk) flag
from (select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,user_id) t1) t2
group by
user_id,flag
having
count(*)>=2) t3
group by
user_id) t4;
ct avg_age
2 21
------------表2-------------
7)对全量数据集进行按照用户去重
select
user_id,
min(age) age
from
user_age
group by
user_id;t5
user_id age
test_1 23
test_2 19
test_3 39
8)计算所有用户的数量以及平均年龄
select
count() user_count,
cast((sum(age)/count()) as decimal(10,1)) all_avg_age
from
(select
user_id,
min(age) age
from
user_age
group by
user_id) t5;
user_count all_avg_age
3 27
9)将第5步以及第7步两个数据集进行union all操作
select
0 user_total_count,
0 user_total_avg_age,
count() twice_count,
cast(sum(age)/count() as decimal(10,2)) twice_count_avg_age
from
(
select
user_id,
min(age) age
from
(select
user_id,
min(age) age
from
(
select
user_id,
age,
date_sub(dt,rk) flag
from
(
select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,user_id
)t1
)t2
group by
user_id,flag
having
count(*)>=2)t3
group by
user_id
)t4
union all
select
count() user_total_count,
cast((sum(age)/count()) as decimal(10,1)),
0 twice_count,
0 twice_count_avg_age
from
(
select
user_id,
min(age) age
from
user_age
group by
user_id
)t5;t6
_u1.user_total_count _u1.user_total_avg_age _u1.twice_count _u1.twice_count_avg_age
0 0 2 21
3 27 0 0
10)求和并拼接为最终SQL
select
sum(user_total_count) user_total_count,
sum(user_total_avg_age) user_total_avg_age,
sum(twice_count) twice_count,
sum(twice_count_avg_age) twice_count_avg_age
from
(select
0 user_total_count,
0 user_total_avg_age,
count() twice_count,
cast(sum(age)/count() as decimal(10,2)) twice_count_avg_age
from
(
select
user_id,
min(age) age
from
(select
user_id,
min(age) age
from
(
select
user_id,
age,
date_sub(dt,rk) flag
from
(
select
dt,
user_id,
min(age) age,
rank() over(partition by user_id order by dt) rk
from
user_age
group by
dt,user_id
)t1
)t2
group by
user_id,flag
having
count(*)>=2)t3
group by
user_id
)t4
union all
select
count() user_total_count,
cast((sum(age)/count()) as decimal(10,1)),
0 twice_count,
0 twice_count_avg_age
from
(
select
user_id,
min(age) age
from
user_age
group by
user_id
)t5)t6;
user_total_count user_total_avg_age twice_count twice_count_avg_age
3 27 2 21