练习1
1.数据准备
name cdate money
jack,2017-01-01,10
tony,2017-01-02,15
jack,2017-02-03,23
tony,2017-01-04,29
jack,2017-01-05,46
jack,2017-04-06,46
tony,2017-01-07,50
jack,2017-01-08,55
mart,2017-04-08,62
mart,2017-04-09,68
neil,2017-05-10,12
mart,2017-04-11,75
neil,2017-06-12,80
mart,2017-04-13,94
2.建表,上传数据
-- Exercise 1 source table: one row per order, read from a comma-separated text file.
-- "if not exists" lets the script be re-run without failing on an existing table.
create table if not exists t_orders
(
    name  string, -- customer name
    cdate string, -- order date kept as 'yyyy-MM-dd' text; later queries slice it with substr()
    money double  -- order amount; NOTE(review): consider decimal if fractional amounts ever appear
) row format delimited fields terminated by ',';

-- "overwrite" replaces the table contents, so re-running the load does not
-- append a second copy of every order.
load data local inpath '/root/orders.txt' overwrite into table t_orders;

-- Quick check that the load produced the expected rows.
select
    name,
    cdate,
    money
from t_orders;
3.
(1)查询每个用户总订单金额
-- Attach each customer's total order amount to every one of that customer's orders.
-- A window aggregate (not GROUP BY) is used, so the result keeps one row per order.
select
    name,
    cdate,
    money,
    sum(money) over (partition by name)
from t_orders;
(2)查询每个月的订单总数
-- Attach the number of orders placed in the same calendar month to every order row.
-- The month key is the 'yyyy-MM' prefix of the date string.
select
    name,
    cdate,
    money,
    substr(cdate, 1, 7) month,
    count(1) over (partition by substr(cdate, 1, 7))
from t_orders;
(3)查询所有用户的总订单金额
-- Attach the grand total of all orders to every order row.
-- An empty over() makes the whole table a single window.
select
    name,
    cdate,
    money,
    sum(money) over ()
from t_orders;
(4)查询每个用户的订单总金额
-- Per-customer order total, repeated on each of the customer's order rows.
select
    name,
    cdate,
    money,
    sum(money) over (partition by name)
from t_orders;
(5)查询每个用户的订单总金额 按日期排序 逐行累加
-- Running total of order amounts per customer, accumulated in date order.
-- The explicit ROWS frame adds exactly one order per step.
select
    name,
    cdate,
    money,
    sum(money) over (
        partition by name
        order by cdate
        rows between unbounded preceding and current row
    )
from t_orders;
(6)查询每个月的订单总金额 按日期累加
-- Running total of order amounts within each calendar month, accumulated by date.
-- No frame is given, so the default applies (range between unbounded preceding
-- and current row): orders sharing the same cdate get the same running total.
select
    name,
    cdate,
    money,
    sum(money) over (
        partition by substr(cdate, 1, 7)
        order by cdate
    )
from t_orders;
练习2
1.数据准备
uid login_date
001,2017-02-05 12:00:00
001,2017-02-05 14:00:00
001,2017-02-06 13:00:00
001,2017-02-07 12:00:00
001,2017-02-08 12:00:00
001,2017-02-10 14:00:00
002,2017-02-05 13:00:00
002,2017-02-06 12:00:00
002,2017-02-06 14:00:00
002,2017-02-08 12:00:00
002,2017-02-09 16:00:00
002,2017-02-10 12:00:00
003,2017-01-31 13:00:00
003,2017-01-31 12:00:00
003,2017-02-01 12:00:00
004,2017-02-02 12:00:00
004,2017-02-03 12:00:00
004,2017-02-10 12:00:00
004,2017-03-01 12:00:00
2.建表,数据导入
-- Exercise 2 source table: one row per login event, read from a comma-separated text file.
-- "if not exists" lets the script be re-run without failing on an existing table.
create table if not exists t_login_user(
    uid        string, -- user id
    login_date string  -- login time as 'yyyy-MM-dd HH:mm:ss' text; a user may log in several times a day
)row format delimited fields terminated by ",";

-- "overwrite" replaces any previously loaded rows, so reloading does not duplicate logins.
load data local inpath "/hive/login_user.txt" overwrite into table t_login_user;

-- Quick check that the load produced the expected rows.
select
    uid,
    login_date
from t_login_user;
3.
计算连续登录3天的用户
1.去重
-- Step 1: reduce the login events to one row per user per calendar day.
-- login_date carries a time of day, so it is truncated with to_date() before
-- grouping; grouping on the raw value would keep same-day logins as separate rows.
select
    uid,
    to_date(login_date) as login_date
from t_login_user
group by
    uid,
    to_date(login_date);
2.排号
-- Step 2: number each user's distinct login days in chronological order.
-- The inner query truncates login_date to the day first; otherwise two logins
-- on the same day would receive two different row numbers.
select
    uid,
    login_date,
    row_number() over (partition by uid order by login_date) as n
from (
    select
        uid,
        to_date(login_date) as login_date
    from t_login_user
    group by
        uid,
        to_date(login_date)
) t;
3.获得新的日期 旧日期-行号 相等的说明是连续登录
-- Step 3: subtract the row number (in days) from each login day.
-- Days that belong to one unbroken run all map to the same new_date, because
-- both the day and the row number advance by one per step.
-- This only holds when there is exactly one row per user per day, hence the
-- to_date() de-duplication in the innermost query.
select
    uid,
    login_date,
    n,
    date_sub(login_date, n) new_date
from (
    select
        uid,
        login_date,
        row_number() over (partition by uid order by login_date) as n
    from (
        select
            uid,
            to_date(login_date) as login_date
        from t_login_user
        group by
            uid,
            to_date(login_date)
    ) t
) t2;
4.count+过滤(按uid、new_date分组计数,保留次数>=3的分组)
-- Final step: users who logged in on at least 3 consecutive days.
-- Pipeline (innermost first):
--   t  : one row per user per calendar day (timestamp truncated with to_date());
--   t2 : chronological row number n per user;
--   t3 : new_date = login day minus n days, constant within an unbroken run.
-- Each (uid, new_date) group is one run; its row count is the run length in days.
-- A user with several qualifying runs appears once per run.
select
    uid,
    count(1) as streak_days
from (
    select
        uid,
        date_sub(login_date, n) as new_date
    from (
        select
            uid,
            login_date,
            row_number() over (partition by uid order by login_date) as n
        from (
            select
                uid,
                to_date(login_date) as login_date
            from t_login_user
            group by
                uid,
                to_date(login_date)
        ) t
    ) t2
) t3
group by
    uid,
    new_date
having count(1) >= 3;
练习3
1.数据准备
uid,hit,m
1,1,0
1,2,1
1,3,1
1,4,1
1,5,0
1,6,0
1,7,1
2,1,1
2,2,1
2,3,1
2,4,1
2,5,1
3,1,1
3,2,1
3,3,1
3,4,0
3,5,0
3,6,1
3,7,0
3,8,1
2.建表,数据导入
-- Exercise 3 source table: one row per whack-a-mole attempt, read from a comma-separated text file.
-- "if not exists" lets the script be re-run without failing on an existing table.
create table if not exists tb_ds(
uid int , -- user id
hit int , -- sequence number of the attempt for this user
m int -- whether the attempt hit: 1 = hit, 0 = miss
)
row format delimited fields terminated by ',' ;

-- "overwrite" replaces the table contents, so re-running the load does not
-- append a second copy of every attempt.
load data local inpath '/hive/ds.txt' overwrite into table tb_ds ;

-- Quick check that the load produced the expected rows.
select
    uid,
    hit,
    m
from tb_ds;
3.
-- Goal of this exercise: the longest run of consecutive hits for each user.
-- Step 1: keep only the attempts that were hits (m = 1).
select
    uid,
    hit,
    m
from tb_ds
where m = 1;
-- Step 2: number each user's hits in attempt order.
-- The m = 1 filter is applied before the window function is evaluated,
-- so only hits are numbered.
select
    uid,
    hit,
    m,
    row_number() over (partition by uid order by hit) flag
from tb_ds
where m = 1;
-- Step 3: for one uid, hits that share the same (hit - flag) value are consecutive:
-- within an unbroken run both the attempt number and the row number grow by one.
select
    uid,
    hit,
    m,
    flag,
    (hit - flag) n
from (
    select
        uid,
        hit,
        m,
        row_number() over (partition by uid order by hit) flag
    from tb_ds
    where m = 1
) numbered_hits;
-- Step 4: count the hits in every run; each (uid, n) group is one run of consecutive hits.
select
    uid,
    count(1) count
from (
    select
        uid,
        hit - row_number() over (partition by uid order by hit) as n
    from tb_ds
    where m = 1
) keyed_hits
group by
    uid,
    n;
-- Step 5: group by uid and keep the longest run of consecutive hits.
--   keyed_hits : hits only, each tagged with streak_key = hit - row number,
--                which is constant inside one unbroken run;
--   streaks    : one row per run with its length.
-- Users without any hit produce no row, because misses are filtered out first.
select
    uid,
    max(streak_len)
from (
    select
        uid,
        count(1) as streak_len
    from (
        select
            uid,
            hit - row_number() over (partition by uid order by hit) as streak_key
        from tb_ds
        where m = 1
    ) keyed_hits
    group by
        uid,
        streak_key
) streaks
group by uid;