本系列是本人对Hive学习内容的整理,主要包括以下内容:
1.HiveQL学习笔记(一):Hive安装及Hadoop,Hive原理简介
2.HiveQL学习笔记(二):Hive基础语法与常用函数
3.HiveQL学习笔记(三):Hive表连接
4.HiveQL学习笔记(四):Hive窗口函数
5.HiveQL学习笔记(五):Hive练习题
接下来对第五个内容进行介绍。
HiveQL学习笔记(二):Hive基础语法与常用函数
这里没有原版的数据,只有字段名,所以只能靠脑补……主要是锻炼思考问题的方法,也可以上网找一些MySQL的题,然后用HiveSQL去完成。
-- Up to 10 user names from user_info where city is 'beijing' and sex is 'female'.
select
    user_name
from user_info
where sex = 'female'
  and city = 'beijing'
limit 10;
-- user_name, piece and price of 'food' rows in user_trade whose dt is '2019-04-09'.
select
    user_name,
    piece,
    price
from user_trade
where goods_category = 'food'
  and dt = '2019-04-09';
-- Per goods category, for dt from 2019-01-01 through 2019-04-30 (inclusive):
-- number of distinct buyers and total amount paid.
-- Fix: the buyer figure was computed with sum(distinct user_name), which tries to
-- add up a name column; count(distinct ...) is the aggregate that counts users.
select
    goods_category,
    count(distinct user_name) as user_sum,
    sum(pay_amount) as total_amount
from user_trade
where dt between '2019-01-01' and '2019-04-30'
group by goods_category;
-- Users whose summed pay_amount for dt in April 2019 is above 50000, with that sum.
select
    user_name,
    sum(pay_amount) as total_amount
from user_trade
where dt >= '2019-04-01'
  and dt <= '2019-04-30'
group by user_name
having sum(pay_amount) > 50000;
-- The five users with the largest summed pay_amount for dt in April 2019.
select
    user_name,
    sum(pay_amount) as total_amount
from user_trade
where dt >= '2019-04-01'
  and dt <= '2019-04-30'
group by user_name
order by total_amount desc
limit 5;
-- Show pay_time (a unix timestamp) next to its formatted date-time, to the second.
-- Fix: the pattern used 'hh', the 12-hour clock field, so afternoon times were
-- indistinguishable from morning ones; 'HH' is the 24-hour field.
select
    pay_time,
    from_unixtime(pay_time, 'yyyy-MM-dd HH:mm:ss')
from user_trade
where dt = '2019-04-09';
****************************
-- Show pay_time (a unix timestamp) next to its formatted value, truncated to the hour.
-- Fix: 'hh' is the 12-hour clock field (13:00 prints as 01); 'HH' gives 00-23.
select
    pay_time,
    from_unixtime(pay_time, 'yyyy-MM-dd HH')
from user_trade
where dt = '2019-04-09';
****************************
-- Show pay_time (a unix timestamp) next to its formatted value, truncated to the minute.
-- Fix: 'hh' is the 12-hour clock field (13:05 prints as 01:05); 'HH' gives 00-23.
select
    pay_time,
    from_unixtime(pay_time, 'yyyy-MM-dd HH:mm')
from user_trade
where dt = '2019-04-09';
****************************
-- Show pay_time (a unix timestamp) next to its compact yyyyMMdd date form.
select
    pay_time,
    from_unixtime(pay_time, 'yyyyMMdd')
from user_trade
where dt = '2019-04-09';
-- For up to 10 users: number of days from the date part of firstactivetime to 2019-05-01.
select
    user_name,
    datediff('2019-05-01', to_date(firstactivetime))
from user_info
limit 10;
-- Distinct user count per age band (<20, 20-29, 30-39, everything else).
-- Rows with a NULL age fall through to the else branch.
-- Fix: the column alias was written as a single-quoted string literal, which Hive
-- does not accept as an identifier; non-ASCII identifiers must be back-quoted.
select
    case
        when age < 20 then '20岁以下'
        when age >= 20 and age < 30 then '20-30岁'
        when age >= 30 and age < 40 then '30-40岁'
        else '40岁以上'
    end as `年龄分组`,
    count(distinct user_id) as user_num
from user_info
group by
    case
        when age < 20 then '20岁以下'
        when age >= 20 and age < 30 then '20-30岁'
        when age >= 30 and age < 40 then '30-40岁'
        else '40岁以上'
    end;
-- Distinct user count per sex and level tier (level > 5 is labelled '高级', otherwise '低级').
select
    sex,
    if(level > 5, '高级', '低级'),
    count(distinct user_id) as user_num
from user_info
group by
    sex,
    if(level > 5, '高级', '低级');
-- Distinct user count per activation month, where the month is the first
-- seven characters of firstactivetime.
select
    substr(firstactivetime, 1, 7) as month,
    count(distinct user_id) as user_num
from user_info
group by substr(firstactivetime, 1, 7);
-- Distinct user count per phone brand, reading the brand from the
-- 'phonebrand' field of the extra1 JSON string.
select
    get_json_object(extra1, '$.phonebrand') as phone_brand,
    count(distinct user_id) as user_num
from user_info
group by get_json_object(extra1, '$.phonebrand');
************************************
-- Distinct user count per phone brand, reading the brand from the
-- 'phonebrand' key of the extra2 column.
select
    extra2['phonebrand'] as phone_brand,
    count(distinct user_id) as user_num
from user_info
group by extra2['phonebrand'];
-- For user 'ELLA' in 2018 (by dt): average pay_amount, and the number of days
-- between the earliest and latest pay_time dates.
select
    avg(pay_amount) as avg_mount,
    datediff(
        max(from_unixtime(pay_time, 'yyyy-MM-dd')),
        min(from_unixtime(pay_time, 'yyyy-MM-dd'))
    )
from user_trade
where user_name = 'ELLA'
  and year(dt) = 2018;
-- Number of users who bought more than two distinct goods categories in 2018.
-- Fixes:
--   * the statement had no terminating ';', so it ran into the next query;
--   * year() was applied directly to pay_time, which other queries in this file
--     pass to from_unixtime() as a unix timestamp; it is converted first here.
-- NOTE(review): the other queries filter the year via dt; confirm whether
-- year(dt) = 2018 is the preferred filter for this one as well.
select
    count(a.user_name)
from (
    select
        user_name,
        count(distinct goods_category) as num_goods
    from user_trade
    where year(from_unixtime(pay_time)) = 2018
    group by user_name
    having count(distinct goods_category) > 2
) as a;
-- Among users whose firstactivetime is in 2018 and who fall in the 20-29 or 30-39
-- age band: distinct user count per age band and marital label. marriage_status
-- is read from the extra1 JSON; a value equal to 1 is labelled '已婚', anything else '未婚'.
select
    a.age_type,
    if(a.marriage_status = 1, '已婚', '未婚'),
    count(distinct a.user_id)
from (
    select
        case
            when age >= 20 and age < 30 then '20-30岁'
            when age >= 30 and age < 40 then '30-40岁'
            else '40岁以上'
        end as age_type,
        get_json_object(extra1, '$.marriage_status') as marriage_status,
        user_id
    from user_info
    where year(firstactivetime) = 2018
) as a
where a.age_type in ('20-30岁', '30-40岁')  -- drops everything the else branch caught
group by
    a.age_type,
    if(a.marriage_status = 1, '已婚', '未婚');
-- Per sex: number of distinct users whose firstactivetime date is more than
-- 300 days before today.
-- Fixes:
--   * from_unixtime(...) was missing its closing parenthesis, so to_date(...)
--     became a third argument and datediff(...) was left unbalanced;
--   * the query grouped by sex without selecting it, leaving the counts unlabelled.
select
    sex,
    count(distinct user_id)
from user_info
where datediff(
          from_unixtime(unix_timestamp(), 'yyyy-MM-dd'),
          to_date(firstactivetime)
      ) > 300
group by sex;
-- Distinct user count per sex and education, with education read from the
-- 'education' key of extra2.
select
    sex,
    extra2['education'] as education,
    count(distinct user_id)
from user_info
group by
    sex,
    extra2['education'];
-- Total pay_amount per dt and goods category, for dt from 2019-01-01
-- through 2019-04-30 (inclusive).
select
    dt,
    goods_category,
    sum(pay_amount) as total_amount
from user_trade
where dt >= '2019-01-01'
  and dt <= '2019-04-30'
group by
    dt,
    goods_category;
-- Users that appear in both user_trade and user_refund for 2019.
select
    a.user_name
from (
    select distinct user_name
    from user_trade
    where year(dt) = 2019
) a
inner join (
    select distinct user_name
    from user_refund
    where year(dt) = 2019
) b
    on a.user_name = b.user_name;
-- Users that have user_trade rows in both 2017 and 2018.
select
    a.user_name
from (
    select distinct user_name
    from user_trade
    where year(dt) = 2017
) a
inner join (
    select distinct user_name
    from user_trade
    where year(dt) = 2018
) b
    on a.user_name = b.user_name;
-- Users that have user_trade rows in each of 2017, 2018 and 2019.
select
    a.user_name
from (
    select distinct user_name
    from user_trade
    where year(dt) = 2017
) a
inner join (
    select distinct user_name
    from user_trade
    where year(dt) = 2018
) b
    on a.user_name = b.user_name
inner join (
    select distinct user_name
    from user_trade
    where year(dt) = 2019
) c
    on b.user_name = c.user_name;
-- Rows of user_list_1 whose user_id has no match in user_list_2
-- (left join, then keep only the unmatched rows).
select
    a.user_id,
    a.user_name
from user_list_1 a
left join user_list_2 b
    on a.user_id = b.user_id
where b.user_id is null;
-- Users with user_trade rows in 2019 but no user_refund rows in 2019.
select
    a.user_name
from (
    select distinct user_name
    from user_trade
    where year(dt) = 2019
) a
left join (
    select distinct user_name
    from user_refund
    where year(dt) = 2019
) b
    on a.user_name = b.user_name
where b.user_name is null;  -- keep only traders without a refund match
-- Count of 2019 traders per education value; education comes from the
-- 'education' field of user_info.extra1 (JSON). Traders with no user_info
-- match are counted under a NULL education.
select
    b.education,
    count(a.user_name)
from (
    select distinct user_name
    from user_trade
    where year(dt) = 2019
) a
left join (
    select distinct
        user_name,
        get_json_object(extra1, '$.education') as education
    from user_info
) b
    on a.user_name = b.user_name
group by b.education;
-- Users present in both trade_2017 and trade_2018 but absent from trade_2019.
-- Fix: the left join to trade_2019 was missing its ON keyword, which made the
-- statement unparseable.
select
    a.user_name
from (
    select distinct user_name
    from trade_2017
) a
inner join (
    select distinct user_name
    from trade_2018
) b
    on a.user_name = b.user_name
left join (
    select distinct user_name
    from trade_2019
) c
    on b.user_name = c.user_name
where c.user_name is null;  -- keep only users with no 2019 match
-- Every user from either list: full outer join on user_id, taking the name
-- from user_list_1 when present and from user_list_2 otherwise.
select
    coalesce(a.user_name, b.user_name)
from user_list_1 as a
full outer join user_list_2 as b
    on a.user_id = b.user_id;
-- Number of distinct user names across trade_2017, trade_2018 and trade_2019.
-- The tables are stacked with union all; de-duplication happens in count(distinct).
select
    count(distinct a.user_name)
from (
    select user_name from trade_2017
    union all
    select user_name from trade_2018
    union all
    select user_name from trade_2019
) a;
-- Per user for 2019: total paid and total refunded, built by stacking the two
-- tables with union all and summing.
-- Fix: the explanatory notes were written with '#', which Hive does not treat
-- as a comment marker; they are now '--' comments.
select
    a.user_name,
    sum(a.pay_amount),      -- adds the 0 placeholders to the real values
    sum(a.refund_amount)
from (
    select
        user_name,
        sum(pay_amount) as pay_amount,
        0 as refund_amount  -- user_trade has no refund_amount here; pad with 0
    from user_trade
    where year(dt) = 2019
    group by user_name
    union all
    select
        user_name,
        0 as pay_amount,    -- pad the payment column with 0 on the refund side
        sum(refund_amount) as refund_amount
    from user_refund
    where year(dt) = 2019
    group by user_name
) a
group by a.user_name;
-- Per user with 2019 trades: total paid, and total refunded in 2019.
-- refund_amount is NULL for users without a matching refund row (left join).
-- Fixes:
--   * a trailing comma after the last select item of subquery a made the
--     statement unparseable;
--   * a leftover '#' note about zero-filling refund_amount was removed: '#' is
--     not a Hive comment marker and this query does no zero-filling.
select
    a.user_name,
    a.pay_amount,
    b.refund_amount
from (
    select
        user_name,
        sum(pay_amount) as pay_amount
    from user_trade
    where year(dt) = 2019
    group by user_name
) a
left join (
    select
        user_name,
        sum(refund_amount) as refund_amount
    from user_refund
    where year(dt) = 2019
    group by user_name
) b
    on a.user_name = b.user_name;
-- User count per age band for users whose firstactivetime is in 2017.
-- Fix: the trade subquery filtered with dt > 0, comparing the string dt with an
-- integer; it now uses the string comparison dt > '0', matching the other
-- queries in this file that apply the same filter.
-- NOTE(review): b is left-joined but never referenced afterwards, so trade
-- history does not affect the counts. If only users with trades should be
-- counted, this needs an inner join (or "where b.user_name is not null") — confirm.
select
    a.age_type,
    count(a.user_name)
from (
    select
        case
            when age < 20 then '20岁以下'
            when age >= 20 and age < 30 then '20-30岁'
            when age >= 30 and age < 40 then '30-40岁'
            else '40岁以上'
        end as age_type,
        user_name
    from user_info
    where year(firstactivetime) = 2017
) a
left join (
    select distinct user_name
    from user_trade
    where dt > '0'
) b
    on a.user_name = b.user_name
group by a.age_type;
-- Count of 2018/2019 traders per hour of their firstactivetime.
-- Fixes:
--   * the second union branch had an empty select list;
--   * "b.hour(firstactivetime)" is not a valid column reference; the hour is now
--     computed once in subquery b under an alias and referenced by that alias.
select
    b.active_hour,
    count(a.user_name)
from (
    -- union (not union all) so a user trading in both years is counted once
    select user_name
    from trade_2018
    union
    select user_name
    from trade_2019
) a
left join (
    select
        user_name,
        hour(firstactivetime) as active_hour
    from user_info
) b
    on a.user_name = b.user_name
group by b.active_hour;
-- Monthly payment total for 2019 plus the running total up to each month.
--   1. Subquery a sums pay_amount per month.
--   2. The outer query accumulates those monthly sums with a window ordered by month.
-- Fixes: a comma was missing between a.pay_amount and the window expression, and
-- the notes used '#', which is not a Hive comment marker.
select
    a.month,
    a.pay_amount,
    sum(a.pay_amount) over (order by a.month)
from (
    select
        month(dt) as month,
        sum(pay_amount) as pay_amount
    from user_trade
    where year(dt) = 2019
    group by month(dt)
) a;
按照季度:
-- Quarterly payment total for 2019 plus the running total up to each quarter.
-- Fixes:
--   * a comma was missing between a.pay_amount and the window expression;
--   * the Q3 list was written as (7,8.9), so August and September matched
--     nothing and fell into 'Q4'; it is now (7,8,9) in both CASE expressions.
select
    a.season,
    a.pay_amount,
    sum(a.pay_amount) over (order by a.season)
from (
    select
        case
            when month(dt) in (1, 2, 3) then 'Q1'
            when month(dt) in (4, 5, 6) then 'Q2'
            when month(dt) in (7, 8, 9) then 'Q3'
            else 'Q4'
        end as season,
        sum(pay_amount) as pay_amount
    from user_trade
    where year(dt) = 2019
    group by
        case
            when month(dt) in (1, 2, 3) then 'Q1'
            when month(dt) in (4, 5, 6) then 'Q2'
            when month(dt) in (7, 8, 9) then 'Q3'
            else 'Q4'
        end
) a;
-- Monthly payment totals for 2017 and 2018 with a running total that restarts
-- each year (window partitioned by year, ordered by month).
-- Fix: a comma was missing between a.pay_amount and the window expression.
select
    a.year,
    a.month,
    a.pay_amount,
    sum(a.pay_amount) over (partition by a.year order by a.month)
from (
    select
        sum(pay_amount) as pay_amount,
        month(dt) as month,
        year(dt) as year
    from user_trade
    where year(dt) in (2017, 2018)
    group by
        year(dt),
        month(dt)
) as a;
-- Monthly payment total for 2018 and its moving average over the current month
-- and the two months before it.
select
    a.month,
    a.pay_amount,
    avg(a.pay_amount) over (
        order by a.month
        rows between 2 preceding and current row
    )
from (
    select
        month(dt) as month,
        sum(pay_amount) as pay_amount
    from user_trade
    where year(dt) = 2018
    group by month(dt)
) as a;
-- For January 2019: distinct goods categories bought per user, ranked three
-- ways (row_number, rank, dense_rank) by that count, ascending.
-- Fixes:
--   * missing comma after count(distinct goods_category);
--   * "row_umber" corrected to row_number;
--   * trailing comma before FROM removed;
--   * the '#' note is now a '--' comment ('#' is not a Hive comment marker).
select
    user_name,
    count(distinct goods_category),
    row_number() over (order by count(distinct goods_category)),
    rank() over (order by count(distinct goods_category)),
    dense_rank() over (order by count(distinct goods_category))
from user_trade
where substr(dt, 1, 7) = '2019-01'  -- characters 1 through 7 of dt, i.e. yyyy-MM
group by user_name;
一次工作完不成时,考虑子表查询:先用where筛选年份,进行排名,然后子查询再用where选出符合的排名。
-- Users ranked 10th, 20th and 30th by total payment in 2019.
-- The inner query ranks users by summed pay_amount (descending); the outer
-- query filters on that rank, since a window result cannot be filtered in the
-- same query level that computes it.
-- Fix: the window ordered by sum(a.pay_amount), but alias a only exists in the
-- outer query; inside the subquery the column is plain pay_amount.
select
    a.user_name,
    a.pay_amount,
    a.rank
from (
    select
        user_name,
        sum(pay_amount) as pay_amount,
        rank() over (order by sum(pay_amount) desc) as rank
    from user_trade
    where substr(dt, 1, 4) = '2019'
    group by user_name
) as a
where a.rank in (10, 20, 30);
-- Split January 2019 payers into 5 equal-sized tiers by total payment, highest first.
-- Fix: the window ordered by pay_amount, which at that point is the raw
-- per-row column (the select alias is not visible inside the same select list)
-- and is not part of the GROUP BY; it must order by the aggregate sum(pay_amount).
select
    user_name,
    sum(pay_amount) as pay_amount,
    ntile(5) over (order by sum(pay_amount) desc) as level
from user_trade
where substr(dt, 1, 7) = '2019-01'
group by user_name;
取前 x% 的记录时,用 ntile(n) 并取第 1 组,其中 n = 100/x。例如取前 10% 时,n = 10。
-- Top 10% of users by total refund amount in 2019: ntile(10) splits users into
-- ten equal-sized groups by descending total, and group 1 is kept.
-- Fix: the window ordered by refund_amount, which at that point is the raw
-- per-row column (the select alias is not visible inside the same select list)
-- and is not part of the GROUP BY; it must order by sum(refund_amount).
select
    a.user_name,
    a.refund_amount,
    a.level
from (
    select
        user_name,
        sum(refund_amount) as refund_amount,
        ntile(10) over (order by sum(refund_amount) desc) as level
    from user_refund
    where year(dt) = 2019
    group by user_name
) as a
where a.level = 1;
-- Number of users with more than 100 days between two consecutive dt values.
-- lead(dt) fetches each row's next dt for the same user, ordered by dt.
-- Fixes:
--   * the subquery read "from user_name", which is a column name, not a table;
--     user_name and dt are the user_trade columns used throughout this file;
--   * the '#' notes are now '--' comments ('#' is not a Hive comment marker).
-- NOTE(review): user_refund also has user_name and dt; confirm user_trade is
-- the intended source.
select
    count(distinct user_name)
from (
    select
        user_name,
        dt,
        lead(dt) over (partition by user_name order by dt) as lead_dt
    from user_trade
    where dt > '0'  -- per the original note: drops rows with abnormal dt values
) as a
where a.lead_dt is not null  -- a user's last row has no next dt to compare with
  and datediff(a.lead_dt, a.dt) > 100;
-- Top 3 payers within every (city, sex) group.
--   1. a: total pay_amount per user for the filtered year; b: each user's city and sex.
--   2. c: rank users inside each city/sex partition by total payment, descending.
--   3. The outer query keeps ranks 1 to 3 (ties can return more than three rows).
-- NOTE(review): the original note said 2018 while the filter uses 2019 — confirm
-- the intended year.
-- Fixes:
--   * subquery a combined SELECT DISTINCT with GROUP BY, which Hive rejects;
--     GROUP BY user_name already yields one row per user, so DISTINCT is dropped;
--   * the '#' notes are now '--' comments ('#' is not a Hive comment marker).
select
    c.user_name,
    c.city,
    c.sex,
    c.pay_amount,
    c.rank
from (
    select
        a.user_name,
        b.city,
        b.sex,
        a.pay_amount,
        rank() over (partition by b.city, b.sex order by a.pay_amount desc) as rank
    from (
        select
            user_name,
            sum(pay_amount) as pay_amount
        from user_trade
        where year(dt) = 2019
        group by user_name
    ) a
    left join (
        select
            user_name,
            city,
            sex
        from user_info
    ) b
        on a.user_name = b.user_name
) c
where c.rank <= 3;
-- Top 25% of refunders within each phone brand.
--   1. a: total refund_amount per user.
--   2. c: join to user_info for the brand (extra2['phonebrand']) and split each
--      brand's users into 4 equal-sized groups by descending refund total.
--   3. The outer query keeps group 1.
-- Fixes:
--   * the '#' note is now a '--' comment ('#' is not a Hive comment marker);
--   * "select *" is replaced by the same four columns, listed explicitly;
--   * extra2 is qualified with b, the only joined source that provides it.
select
    c.user_name,
    c.phonebrand,
    c.refund_amount,
    c.level
from (
    select
        a.user_name,
        b.extra2['phonebrand'] as phonebrand,
        a.refund_amount,
        ntile(4) over (
            partition by b.extra2['phonebrand']
            order by a.refund_amount desc
        ) as level
    from (
        select
            user_name,
            sum(refund_amount) as refund_amount
        from user_refund
        where dt > '0'
        group by user_name
    ) a
    left join user_info b
        on a.user_name = b.user_name
) c
where c.level = 1;