问题一:
(1) 需求
找出所有科目成绩都大于对应学科平均成绩的学生。
(2) 建表
-- Rebuild the score table: one row per (student uid, subject_id) with the score obtained.
-- Rows are stored as tab-delimited text.
drop table if exists score;
create table score (
    uid        string,
    subject_id string,
    score      int
)
row format delimited
    fields terminated by '\t';
(3) 数据
1001 01 90
1001 02 90
1001 03 90
1002 01 85
1002 02 85
1002 03 70
1003 01 70
1003 02 70
1003 03 85
(4) 加载数据
-- Replace the contents of score with the rows of the local fixture file.
load data local inpath '/opt/hivesqltopic/data/data1.txt'
    overwrite into table score;
(5) 查看表数据
-- Dump every row of score to confirm the load.
select *
from score;
(6) 写 hql
A、思路
a、求出每个学科的平均成绩
-- Step a: attach the average score of each subject to every score row.
-- The result of this step is referred to as t1 by step b.
select
    uid,
    score,
    avg(score) over (partition by subject_id) as avg_score
from
    score;
b、根据成绩是否大于平均成绩生成标记 flag,大于记为 0,否则记为 1
-- Step b: flag each row with 0 when the score beats its subject average, 1 otherwise.
-- t1 stands for the result of step a; the result of this step is referred to as t2.
select
    uid,
    if(score > avg_score, 0, 1) as flag
from
    t1;
c、根据学生 id 进行分组统计 flag 的和,和为 0 则是所有学科成绩都大于平均成绩
-- Step c: a student whose flags sum to 0 beat the subject average in every subject.
-- t2 stands for the flagged rows produced by step b.
select t2.uid
from t2
group by t2.uid
having sum(t2.flag) = 0;
B、最终 hql
-- Final query: students whose score is above the subject average in every subject.
-- scored  : every score row together with the average of its subject.
-- flagged : 0 when the row beats the subject average, 1 otherwise.
-- A student is kept only when all of their flags are 0 (sum = 0).
with scored as (
    select
        uid,
        score,
        avg(score) over (partition by subject_id) as avg_score
    from score
),
flagged as (
    select
        uid,
        case when score > avg_score then 0 else 1 end as flag
    from scored
)
select uid
from flagged
group by uid
having sum(flag) = 0;
问题二:
(1) 需求
统计每个用户每个月的访问次数,以及截至该月的累计访问次数。
(2) 建表
-- Rebuild the action table: one row per (userId, visitDate) with a visit count.
-- Rows are stored as tab-delimited text.
drop table if exists action;
create table action (
    userId     string,
    visitDate  string,
    visitCount int
)
row format delimited
    fields terminated by "\t";
(3) 数据
u01 2017/1/21 5
u02 2017/1/23 6
u03 2017/1/22 8
u04 2017/1/20 3
u01 2017/1/23 6
u01 2017/2/21 8
u02 2017/1/23 6
u01 2017/2/22 4
(4) 加载数据
-- Replace the contents of action with the rows of the local fixture file.
load data local inpath '/opt/hivesqltopic/data/data2.txt'
    overwrite into table action;
(5) 查看表数据
-- Dump every row of action to confirm the load.
select *
from action;
(6) 写 hql
A、思路
a、修改数据格式
-- Step a: turn visitDate (slash-separated, e.g. 2017/1/21) into a yyyy-MM month key.
-- The result of this step is referred to as t1 by step b.
select
    userId,
    date_format(regexp_replace(visitDate, '/', '-'), 'yyyy-MM') as mn,
    visitCount
from
    action;
b、计算每人单月访问量
-- Step b: total visits per user and month.
-- t1 stands for the result of step a; the result of this step is referred to as t2.
select
    userId,
    mn,
    sum(visitCount) as mn_count
from
    t1
group by
    userId, mn;
c、按月累计访问量
-- Step c: running total of the monthly counts for each user, in month order.
-- t2 stands for the per-user monthly totals produced by step b.
-- The running total is named sum_count, the alias the final query uses.
select
    userId,
    mn,
    mn_count,
    sum(mn_count) over (partition by userId order by mn) as sum_count
from
    t2;
B、最终 hql
-- Final query: per-user monthly visit count plus the running total up to that month.
-- monthly_raw   : each action row with visitDate reduced to a yyyy-MM month key.
-- monthly_total : visits summed per (userid, mn), so mn is unique within a user.
-- The window sum ordered by mn therefore accumulates one month at a time.
with monthly_raw as (
    select
        userid,
        date_format(regexp_replace(visitDate, '/', '-'), 'yyyy-MM') as mn,
        visitCount
    from action
),
monthly_total as (
    select
        userid,
        mn,
        sum(visitCount) as mn_Count
    from monthly_raw
    group by userid, mn
)
select
    userid,
    mn,
    mn_Count,
    sum(mn_Count) over (partition by userid order by mn) as sum_Count
from monthly_total;
问题三:
(1) 需求
有 50W 个京东店铺,每个顾客访问任何一个店铺的任何一个商品时都会产生一条
访问日志,访问日志存储的表名为 visit,访客的用户 id 为 user_id,被访问的店
铺名称为 shop,请统计:
A、每个店铺的 UV (访客数)。
B、每个店铺访问次数 top3 的访客信息。输出店铺名称、访客 id、访问次数。
(2) 建表
-- Rebuild the visit log table: one row per visit of user_id to shop.
-- Rows are stored as tab-delimited text.
drop table if exists visit;
create table visit (
    user_id string,
    shop    string
)
row format delimited
    fields terminated by '\t';
(3) 数据
u1 a
u2 b
u1 b
u1 a
u3 c
u4 b
u1 a
u2 c
u5 b
u4 b
u6 c
u2 c
u1 b
u2 a
u2 a
u3 a
u5 a
u5 a
u5 a
(4) 加载数据
-- Replace the contents of visit with the rows of the local fixture file.
load data local inpath '/opt/hivesqltopic/data/data3.txt'
    overwrite into table visit;
(5) 查看表数据
-- Dump every row of visit to confirm the load.
select *
from visit;
(6) 写 hql
A、统计每个店铺的 UV(访客数)
-- UV per shop: the number of distinct visitors that appear in the shop's log rows.
select
    shop,
    count(distinct user_id) as UV
from
    visit
group by
    shop;
B、每个店铺访问次数 top3 的访客信息,输出店铺名称、访客 id、访问次数。
a、思路
查询每个店铺被每个用户访问次数
-- Step 1: number of visits of each user to each shop.
-- The result of this step is referred to as t1 by the next step.
select
    shop,
    user_id,
    count(*) as ct
from
    visit
group by
    shop, user_id;
计算每个店铺被用户访问次数排名
-- Step 2: rank the visitors of each shop by visit count, most frequent first.
-- The ordering must be descending: with an ascending order rk <= 3 would select
-- the three least frequent visitors instead of the top 3.
-- t1 stands for the per-(shop, user) counts of the previous step; the result of
-- this step is referred to as t2.
select
    shop,
    user_id,
    ct,
    rank() over (partition by shop order by ct desc) as rk
from
    t1;
每个店铺取排名前三的
-- Step 3: keep the rows ranked 1 to 3 within each shop.
-- t2 stands for the ranked rows produced by the previous step.
select
    t2.shop,
    t2.user_id,
    t2.ct
from t2
where t2.rk <= 3;
b、最终 hql
-- Final query: for each shop, the visitors with the 3 highest visit counts.
-- visit_counts : number of visits per (shop, user_id).
-- ranked       : rank within each shop, highest count first. The ordering is
--                descending; an ascending order would return the least
--                frequent visitors instead of the top 3.
-- rank() gives tied counts the same rank, so a shop can return more than
-- three rows when visitors tie.
with visit_counts as (
    select
        shop,
        user_id,
        count(*) as num
    from visit
    group by shop, user_id
),
ranked as (
    select
        shop,
        user_id,
        num,
        rank() over (partition by shop order by num desc) as rk
    from visit_counts
)
select
    shop,
    user_id,
    num
from ranked
where rk <= 3;
问题四:
(1) 需求
已知一个表 order_tab,有如下字段:Date,Order_id,User_id,amount。
请给出 hql 进行统计:数据样例:2017-01-01,10029028,1000003251,33.57。
A、给出 2017 年每个月的订单数、用户数、总成交金额。
B、给出 2017 年 11 月的新客数(指在 11 月才有第一笔订单)。
(2) 建表
-- Rebuild the order table: one row per order with its date, user and amount.
-- Rows are stored as comma-delimited text.
drop table if exists order_tab;
create table order_tab (
    dt       string,
    order_id string,
    user_id  string,
    amount   decimal(10,2)
)
row format delimited
    fields terminated by ',';
(3) 数据
2017-01-01,1,1,33.5
2017-05-20,1,3,45.6
2017-11-05,2,4,22
2017-02-06,2,1,43.2
(4) 加载数据
-- Replace the contents of order_tab with the rows of the local fixture file.
load data local inpath '/opt/hivesqltopic/data/data4.txt'
    overwrite into table order_tab;
(5) 查看表数据
-- Dump every row of order_tab to confirm the load.
select *
from order_tab;
-- A: for each month of 2017, the number of orders, the number of distinct
-- users and the total order amount.
-- Note: the output column named user_id holds the distinct-user count, not an id.
select
    date_format(dt, 'yyyy-MM') as mn,
    count(order_id) as order_num,
    count(distinct user_id) as user_id,
    sum(amount) as amount
from
    order_tab
where
    date_format(dt, 'yyyy') = '2017'
group by
    date_format(dt, 'yyyy-MM');
B、给出 2017 年 11 月的新客数(指在 11 月才有第一笔订单)。
-- B: number of new customers in 2017-11, i.e. users whose earliest order date
-- falls in November 2017.
-- The inner query keeps one row per such user; the outer count(*) turns those
-- rows into a single number. Counting inside the grouped query instead would
-- return one row per user holding that user's order count.
select
    count(*) as num
from (
    select
        user_id
    from
        order_tab
    group by
        user_id
    having
        date_format(min(dt), 'yyyy-MM') = '2017-11'
) new_customers;
问题五(hql 中的行列转换):
1、行转列(将多行数据透视为多列)
(1) 建表
-- Rebuild the long-format course table: one row per (name, course) with its grade.
-- Rows are stored as comma-delimited text.
drop table if exists tb_course;
create table tb_course (
    name   string,
    course string,
    grade  int
)
row format delimited
    fields terminated by ',';
(2) 数据
tom,JDBC,20
tom,Hibernate,50
tom,Spring,80
marry,JDBC,30
marry,Hibernate,60
marry,Spring,70
(3) 加载数据
-- Replace the contents of tb_course with the rows of the local fixture file.
load data local inpath '/opt/hivesqltopic/data/data9.txt'
    overwrite into table tb_course;
(4) 查询数据
-- Dump every row of tb_course to confirm the load.
select *
from tb_course;
-- Pivot tb_course: one output row per name with one grade column per course.
-- Each sum() only picks up the grade of its own course; rows of other courses
-- contribute 0, so a name with no row for a course gets 0 in that column.
select
    name,
    sum(if(course = 'JDBC', grade, 0)) as JDBC,
    sum(if(course = 'Hibernate', grade, 0)) as Hibernate,
    sum(if(course = 'Spring', grade, 0)) as Spring
from
    tb_course
group by
    name;
2、列转行(将多列数据展开为多行)
(1) 建表
-- Rebuild the wide-format course table: one row per name with one grade column per course.
-- Rows are stored as comma-delimited text.
drop table if exists tb_courses;
create table tb_courses (
    name      string,
    JDBC      int,
    Hibernate int,
    Spring    int
)
row format delimited
    fields terminated by ',';
(2) 数据
tom,20,50,80
marry,30,60,70
(3) 加载数据
-- Replace the contents of tb_courses with the rows of the local fixture file.
load data local inpath '/opt/hivesqltopic/data/data10.txt'
    overwrite into table tb_courses;
(4) 查询数据
-- Dump every row of tb_courses to confirm the load.
select *
from tb_courses;
-- Unpivot tb_courses: emit one (name, course, grade) row per grade column.
-- Each branch of the union labels its rows with a course name literal and
-- exposes that course's column as grade; the combined rows are sorted by name.
select
    name,
    course,
    grade
from (
    select name, 'JDBC' as course, jdbc as grade
    from tb_courses
    union all
    select name, 'Hibernate' as course, Hibernate as grade
    from tb_courses
    union all
    select name, 'Spring' as course, Spring as grade
    from tb_courses
) unpivoted
order by
    name;