Contents
Examples of complex HiveQL usage
1. Ways to save the results of a select query
2. Rows to column: merge one column's values into a single row per key
3. Column to rows: split one column's values into multiple rows
4. Wordcount in hive
5. Cumulative report via a cascading self-join (the clumsy way)
Items 6-12 are all hive window (analytic) functions
6. Window function sum() over(): running row-by-row totals within a window (the easy way)
7. Top-N per group via partitioned ranking
8. The various row-numbering functions
9. The LAG function
10. The LEAD function
11. The FIRST_VALUE function
12. The LAST_VALUE function
13. Hive local mode (for small data volumes or tests, submitting a local job is faster)
14. Dynamic partitions
15. Bucketing
16. User-defined functions
17. A transform example
18. Daily active and new users
19. Dimension statistics
-- 1. Save query results into a new hive table
create table t_tmp
as
select * from t_p;
-- 2. Save query results into an existing hive table
insert into table t_tmp
select * from t_p;
-- 3. Save query results into a directory (local or on hdfs)
insert overwrite local directory '/home/hadoop/test'
select * from t_p;
insert overwrite directory '/aaa/test'
select * from t_p;
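-- By default, writing to a directory uses ^A (\001) as the field delimiter. Since Hive 0.11
-- a row format can be given on the insert itself; a minimal sketch on the same table:
insert overwrite local directory '/home/hadoop/test'
row format delimited fields terminated by ','
select * from t_p;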
-- concat_ws(',',collect_set(column)): collect_set deduplicates, collect_list does not
/*stu_subject1.data:
1,zhangsan,化学
1,zhangsan,物理
1,zhangsan,数学
1,zhangsan,语文
2,lisi,化学
2,lisi,数学
2,lisi,生物
2,lisi,生理
2,lisi,卫生
3,wangwu,化学
3,wangwu,语文
3,wangwu,英语
3,wangwu,体育
3,wangwu,生物
*/
create table t_stu_subjects1(id int,name string,subject string)
row format delimited fields terminated by ',';
load data local inpath '/root/t_data/stu_subject1.data' into table t_stu_subjects1;
select id,name,concat_ws(',',collect_set(subject)) subjects from t_stu_subjects1
group by id,name;
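-- To see the deduplication difference side by side, collect_list can be swapped in
-- (a sketch on the same table; collect_list keeps duplicates and arrival order):
select id,name
,concat_ws(',',collect_set(subject)) subjects_dedup
,concat_ws(',',collect_list(subject)) subjects_all
from t_stu_subjects1
group by id,name;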
--lateral view explode():
/*stu_subject.data:
1,zhangsan,化学:物理:数学:语文
2,lisi,化学:数学:生物:生理:卫生
3,wangwu,化学:语文:英语:体育:生物
*/
create table t_stu_subjects(id int,name string,subjects array<string>)
row format delimited fields terminated by ','
collection items terminated by ':';
load data local inpath '/root/t_data/stu_subject.data' into table t_stu_subjects;
-- split the array column into rows
select explode(subjects) as subject from t_stu_subjects;
-- split, then deduplicate
select distinct subject from
(select explode(subjects) as subject from t_stu_subjects) tmp;
-- use the lateral view table-generating function to keep the other columns alongside the exploded rows
select id,name,sub_final
from t_stu_subjects lateral view explode(subjects) tmp as sub_final;
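-- 4. wordcount. The query below reads from t_wordcount, which these notes never create;
-- a minimal sketch (the schema and the load path are assumptions):
create table t_wordcount(line string);
load data local inpath '/root/t_data/wordcount.data' into table t_wordcount;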
select
word,count(1) cns
from (select explode(split(trim(line),' ')) word from t_wordcount) tmp
group by word order by cns desc;
-- For each user, compute the total count per month and the running total up to that month;
/*access_time.dat
A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
C,2015-01,10
C,2015-01,20
A,2015-02,4
A,2015-02,6
C,2015-02,30
C,2015-02,10
B,2015-02,10
B,2015-02,5
A,2015-03,14
A,2015-03,6
B,2015-03,20
B,2015-03,25
C,2015-03,10
C,2015-03,20
*/
create table t_access_time(uid string,month string,times int)
row format delimited fields terminated by ',';
load data local inpath '/root/t_data/access_time.dat' into table t_access_time;
-- aggregate by month to get each month's total (into a temp table)
create table access_temp as
select uid,month,sum(times) sum_times
from t_access_time group by uid,month order by uid,month;
-- self-join access_temp with itself, keep only rows where b.month <= a.month; then per uid/month, max(sum_times) is the month's own total and sum(b_sum_times) the running total
create table t_tmp as
select *
from
(select a.uid,a.month,a.sum_times,b.sum_times b_sum_times,b.month b_month
from access_temp a join access_temp b on a.uid=b.uid) temp where temp.month>=temp.b_month;
select uid,month,max(sum_times) times_month,sum(b_sum_times) times_all
from t_tmp group by uid,month;
-- On top of the monthly aggregate table:
select uid
,month
,sum_times
,sum(sum_times) over(partition by uid order by month rows between unbounded preceding and current row) sum_all
from access_temp;
-- Note: sum() over() requires an argument (the column to accumulate)
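-- The frame clause is flexible; e.g. a 3-month moving total (current month plus the two
-- preceding ones) only needs a different rows clause (a sketch on the same table):
select uid,month,sum_times
,sum(sum_times) over(partition by uid order by month rows between 2 preceding and current row) sum_3m
from access_temp;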
-- row_number() over(partition by sex order by age desc): top-N per group; t_rn is defined below
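-- t_rn is not defined elsewhere in these notes; a minimal sketch of a matching table:
create table t_rn(id int,name string,sex string,age int)
row format delimited fields terminated by ',';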
select *
from
(select *,row_number() over(partition by sex order by age desc) rn from t_rn) tmp
where tmp.rn<=2;
/*
Row-numbering functions; the sample output below shows how they differ:
row_number() over()  -- unique, gapless sequence
rank() over()        -- ties share a rank, later ranks skip
dense_rank() over()  -- ties share a rank, no gaps
ntile(3) over()      -- splits rows into 3 even groups, e.g. to find the top 1/3 of a salary ranking
score  row_number  rank  dense_rank  ntile
89     1           1     1           1
90     2           2     2           1
90     3           2     2           1
91     4           4     3           2
92     5           5     4           2
92     6           5     4           3
93     7           7     5           3
*/
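-- A query that would produce the columns above (a sketch; t_score(score int) is a
-- hypothetical table holding the seven scores):
select score
,row_number() over(order by score) rn
,rank() over(order by score) rk
,dense_rank() over(order by score) drk
,ntile(3) over(order by score) nt
from t_score;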
/*t_access data:
192.168.33.3 ,http://www.edu360.cn/stu,2017-08-04 15:30:20,20170804
192.168.33.3 ,http://www.edu360.cn/teach,2017-08-04 15:35:20,20170804
192.168.33.4 ,http://www.edu360.cn/stu,2017-08-04 15:30:20,20170804
192.168.33.4 ,http://www.edu360.cn/job,2017-08-04 16:30:20,20170804
192.168.33.5 ,http://www.edu360.cn/job,2017-08-04 15:40:20,20170804
192.168.33.3 ,http://www.edu360.cn/stu,2017-08-05 15:30:20,20170805
192.168.44.3 ,http://www.edu360.cn/teach,2017-08-05 15:35:20,20170805
192.168.33.44,http://www.edu360.cn/stu,2017-08-05 15:30:20,20170805
192.168.33.46,http://www.edu360.cn/job,2017-08-05 16:30:20,20170805
192.168.33.55,http://www.edu360.cn/job,2017-08-05 15:40:20,20170805
192.168.133.3,http://www.edu360.cn/register,2017-08-06 15:30:20,20170806
192.168.111.3,http://www.edu360.cn/register,2017-08-06 15:35:20,20170806
192.168.34.44,http://www.edu360.cn/pay,2017-08-06 15:30:20,20170806
192.168.33.46,http://www.edu360.cn/excersize,2017-08-06 16:30:20,20170806
192.168.33.55,http://www.edu360.cn/job,2017-08-06 15:40:20,20170806
192.168.33.46,http://www.edu360.cn/excersize,2017-08-06 16:30:20,20170806
192.168.33.25,http://www.edu360.cn/job,2017-08-06 15:40:20,20170806
192.168.33.36,http://www.edu360.cn/excersize,2017-08-06 16:30:20,20170806
192.168.33.55,http://www.edu360.cn/job,2017-08-06 15:40:20,20170806
*/
9. The LAG function
-- lag(access_time,num,default) over(): within each partition (here by ip, ordered by access_time), shifts access_time down num rows into a new column; the first num rows, which have no preceding row, are filled with the default (0 below)
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
lag(access_time,1,0) over(partition by ip order by access_time)as last_access_time
from t_access;
-- lead(access_time,num,default) over(): same partitioning, but shifts access_time up num rows; the last num rows, which have no following row, are filled with the default (0 below)
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
lead(access_time,1,0) over(partition by ip order by access_time)as last_access_time
from t_access;
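-- A typical use of lag: the gap in seconds between consecutive requests from the same ip
-- (a sketch; unix_timestamp parses the 'yyyy-MM-dd HH:mm:ss' strings, and the gap is NULL
-- on each ip's first row because the two-argument lag defaults to NULL):
select ip,url,access_time
,unix_timestamp(access_time)-unix_timestamp(lag(access_time,1) over(partition by ip order by access_time)) gap_seconds
from t_access;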
-- Example: the first page each user (ip) visited
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
first_value(url) over(partition by ip order by access_time rows between unbounded preceding and unbounded following) as first_url
from t_access;
-- Example: the last page each user (ip) visited
select ip,url,access_time,
row_number() over(partition by ip order by access_time) as rn,
last_value(url) over(partition by ip order by access_time rows between unbounded preceding and unbounded following) as last_url
from t_access;
-- 13. local mode: for small data volumes or tests, submitting a local job is faster
set hive.exec.mode.local.auto=true;
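-- Auto local mode only kicks in when the input stays below these thresholds (real
-- settings; the values shown are the usual defaults):
set hive.exec.mode.local.auto.inputbytes.max=134217728;
set hive.exec.mode.local.auto.input.files.max=4;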
/*
(Maps the trailing field(s) of the data directly to partitions; often used for second-level partitions.)
When static and dynamic partitions are mixed, the leading partition must be the static one
and the sub-partition the dynamic one. Hive compiles the static part (everything outside
the select) first and creates the static partition; only while executing the sql does it
derive the dynamic partitions from the selected columns. If the leading partition were
dynamic, its hdfs directory would not exist yet when the static partition is created,
and the statement errors out!
*/
-- (default false) enables dynamic partitioning
set hive.exec.dynamic.partition =true;
-- (default strict) nonstrict allows all partitions to be dynamic; strict requires at least one static partition column
set hive.exec.dynamic.partition.mode = nonstrict;
-- tuning parameters for dynamic partitions:
-- (default 100; usually raised, e.g. to 1000) max dynamic partitions each mapper or reducer may create; exceeding it errors out
set hive.exec.max.dynamic.partitions.pernode=100;
-- max dynamic partitions one statement may create in total; exceeding it errors out
set hive.exec.max.dynamic.partitions=1000; --(default)
-- max files that may be created globally; exceeding it errors out
set hive.exec.max.created.files=10000; --(default)
/* sample file part2:
1,zhangsan,2020-04-16
2,lisi,2020-04-16
3,wangwu,2020-04-15
4,zhaoliu,2020-04-15
*/
create table t_partition_test(id int,name string) partitioned by(city string,dt string)
row format delimited fields terminated by ',';
create table t_partition_test2(id int,name string) partitioned by(dt string)
row format delimited fields terminated by ',';
create table t_partition_test3(id int,name string) partitioned by(city string,dt string)
row format delimited fields terminated by ',';
-- load data ... partition(...) with a dynamic partition spec errors out (first statement below); no workaround found yet
load data local inpath '/root/t_data/part2' into table t_partition_test partition(city='beijing',dt);
load data local inpath '/root/t_data/part2' into table t_partition_test2 partition(dt='2020-04-14');
load data local inpath '/root/t_data/part2' into table t_partition_test2 partition(dt='2020-04-15');
-- insert ... select succeeds with dynamic partitions
insert into table t_partition_test2 partition(dt)
select id,name,dt from t_partition_test;
insert into table t_partition_test3 partition(city='beijing',dt)
select id,name,dt from t_partition_test2;
/*
Hive can further organize every table or partition into buckets, a finer-grained division
of the data. Bucketing is done on a chosen column: hive hashes the column value and takes
it modulo the number of buckets to decide which bucket a record lands in.
Two reasons to bucket a table (or partition):
(1) Faster query processing. Buckets give the table extra structure that hive can exploit
for some queries. In particular, joining two tables that are bucketed on the same (join)
column can be done as a map-side join: only buckets holding the same column values need to
be joined against each other, which greatly reduces the data volume of the JOIN.
(2) More efficient sampling. When developing and refining queries against large datasets,
being able to trial-run a query on a small slice of the data is very convenient.
*/
set hive.enforce.bucketing = true;
set mapreduce.job.reduces=4;
create table stu_buck(Sno int,Sname string,Sex string,Sage int,Sdept string)
clustered by(Sno)
sorted by(Sno DESC)
into 4 buckets
row format delimited
fields terminated by ',';
-- errors out: cluster by and sort by cannot be used together
insert overwrite table stu_buck
select * from student cluster by(Sno) sort by(Sage);
insert into table stu_buck
select Sno,Sname,Sex,Sage,Sdept from student distribute by(Sno) sort by(Sno asc);
insert overwrite table stu_buck
select * from student distribute by(Sno) sort by(Sno asc);
insert overwrite table stu_buck
select * from student cluster by(Sno);
drop table sc_buck;
create table sc_buck(Sno int,Cno int,Grade int)
clustered by(Sno)
sorted by(Sno DESC)
into 4 buckets
row format delimited
fields terminated by ',';
insert overwrite table sc_buck
select * from sc cluster by(Sno);
select a.*,b.* from stu_buck a join sc_buck b on a.sno=b.sno;
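-- For hive to actually exploit the buckets in the join above, the bucket map join
-- optimizations have to be enabled (real settings, off by default; the sorted-merge
-- variant additionally requires both tables to be sorted by the join key):
set hive.optimize.bucketmapjoin=true;
set hive.optimize.bucketmapjoin.sortedmerge=true;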
-- sampling example: bucket X out of Y; Y should be a factor or multiple of the bucket count
select * from stu_buck tablesample(bucket 1 out of 4 on Sno);
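-- Semantics: with 4 buckets, "bucket 1 out of 2" returns every 2nd bucket starting from
-- the 1st, i.e. buckets 1 and 3 (half the table); a sketch:
select * from stu_buck tablesample(bucket 1 out of 2 on Sno);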
/*
Given the following json data in rating.json:
{"movie":"1193","rate":"5","timeStamp":"978300760","uid":"1"}
{"movie":"661","rate":"3","timeStamp":"978302109","uid":"1"}
{"movie":"914","rate":"3","timeStamp":"978301968","uid":"1"}
{"movie":"3408","rate":"4","timeStamp":"978300275","uid":"1"}
which we need to load into hive for analysis.
*/
-- create a table mapping the raw data
create table t_ratingjson(json string);
load data local inpath '/root/hivetest/rating.json' into table t_ratingjson;
/*
We want to turn the raw data above into:
1193,5,978300760,1
661,3,978302109,1
914,3,978301968,1
3408,4,978300275,1
Idea: this becomes easy if we can define a json-parsing function.
*/
create table t_rate
as
select myjson(json,1) as movie,cast(myjson(json,2) as int) as rate,myjson(json,3) as ts,myjson(json,4) as uid from t_ratingjson;
/*Solution:
How to define your own function in hive:
1. Write a java class (extends UDF, overloading public C evaluate(A a, B b)) that implements the desired function (here: take a json string and an index, return one value)
2. Package the java program into a jar and upload it to the machine hive runs on
3. In the hive command line, add the jar to the classpath:
hive>add jar /root/hivetest/myjson.jar;
4. In hive, create a function named myjson and bind it to your java class:
hive> create temporary function myjson as 'cn.edu360.hive.udf.MyJsonParser';
*/
-- 1. Load the rating.json file into a raw hive table rat_json
create table rat_json(line string) row format delimited;
load data local inpath '/home/hadoop/rating.json' into table rat_json;
-- 2. Parse the json into four fields and insert into a new table t_rating
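-- t_rating itself is never created in these notes; a minimal sketch of a matching table:
create table t_rating(movieid string,rate string,timestring string,uid string);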
insert overwrite table t_rating
select get_json_object(line,'$.movie') as movieid
,get_json_object(line,'$.rate') as rate
,get_json_object(line,'$.timeStamp') as timestring
,get_json_object(line,'$.uid') as uid
from rat_json;
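-- Equivalent with the built-in json_tuple, which parses each line once instead of four
-- times (a sketch):
select t.movieid,t.rate,t.timestring,t.uid
from rat_json lateral view json_tuple(line,'movie','rate','timeStamp','uid') t as movieid,rate,timestring,uid;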
/*3. Use transform + python to convert unixtime into a weekday.
First write a python script:
######## python code ########
vi weekday_mapper.py
#!/bin/python
import sys
import datetime
for line in sys.stdin:
    line = line.strip()
    movieid, rating, unixtime, userid = line.split('\t')
    weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
    print '\t'.join([movieid, rating, str(weekday), userid])
Save the file, then add it to hive's classpath:
*/
hive>add FILE /home/hadoop/weekday_mapper.py;
hive>
create TABLE u_data_new as
SELECT
TRANSFORM (movieid, rate, timestring,uid)
USING 'python weekday_mapper.py'
AS (movieid, rate, weekday,uid)
FROM t_rating;
select distinct(weekday) from u_data_new limit 10;
-- Daily active and new user statistics from web access logs; data as follows:
/*
day10 data:
192.168.33.6,hunter,2020-04-10 10:30:20,/a
192.168.33.7,hunter,2020-04-10 10:30:26,/b
192.168.33.6,jack,2020-04-10 10:30:27,/a
192.168.33.8,tom,2020-04-10 10:30:28,/b
192.168.33.9,rose,2020-04-10 10:30:30,/b
192.168.33.10,julia,2020-04-10 10:30:40,/c
day11 data:
192.168.33.16,hunter,2020-04-11 10:30:20,/a
192.168.33.18,jerry,2020-04-11 10:30:30,/b
192.168.33.26,jack,2020-04-11 10:30:40,/a
192.168.33.18,polo,2020-04-11 10:30:50,/b
192.168.33.39,nissan,2020-04-11 10:30:53,/b
192.168.33.39,nissan,2020-04-11 10:30:55,/a
192.168.33.39,nissan,2020-04-11 10:30:58,/c
192.168.33.20,ford,2020-04-11 10:30:54,/c
day12 data:
192.168.33.46,hunter,2020-04-12 10:30:21,/a
192.168.43.18,jerry,2020-04-12 10:30:22,/b
192.168.43.26,tom,2020-04-12 10:30:23,/a
192.168.53.18,bmw,2020-04-12 10:30:24,/b
192.168.63.39,benz,2020-04-12 10:30:25,/b
192.168.33.25,haval,2020-04-12 10:30:30,/c
192.168.33.10,julia,2020-04-12 10:30:40,/c
*/
create table t_user(ip string,uname string,login_time timeStamp,url string)
partitioned by(dt date)
row format delimited fields terminated by ',';
load data local inpath '/root/t_data/day10' into table t_user partition(dt='2020-04-10');
load data local inpath '/root/t_data/day11' into table t_user partition(dt='2020-04-11');
load data local inpath '/root/t_data/day12' into table t_user partition(dt='2020-04-12');
-- active users of the day (each user's first access that day)
create table t_user_active like t_user;
insert into table t_user_active partition(dt='2020-04-10')
select ip,uname,login_time,url
from
(select ip,uname,login_time,url
,row_number() over(partition by uname order by login_time) rn
from t_user where dt='2020-04-10') tmp where tmp.rn = 1;
insert into table t_user_active partition(dt='2020-04-11')
select ip,uname,login_time,url
from
(select ip,uname,login_time,url
,row_number() over(partition by uname order by login_time) rn
from t_user where dt='2020-04-11') tmp where tmp.rn = 1;
insert into table t_user_active partition(dt='2020-04-12')
select ip,uname,login_time,url
from
(select ip,uname,login_time,url
,row_number() over(partition by uname order by login_time) rn
from t_user where dt='2020-04-12') tmp where tmp.rn = 1;
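-- The three inserts above can be collapsed into one using dynamic partitions
-- (a sketch; assumes the section-14 dynamic partition settings are enabled):
insert into table t_user_active partition(dt)
select ip,uname,login_time,url,dt
from (select ip,uname,login_time,url,dt
      ,row_number() over(partition by dt,uname order by login_time) rn
      from t_user) tmp
where tmp.rn = 1;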
-- new users of the day (active today, never active before)
create table t_user_new like t_user;
insert into table t_user_new partition(dt='2020-04-10')
select ip,uname,login_time,url from (
select a.*,b.uname b_uname from
(select *
from t_user_active where dt='2020-04-10') a
left join (select uname
from t_user_active where dt<'2020-04-10') b on a.uname = b.uname) tmp where tmp.b_uname is null;
insert into table t_user_new partition(dt='2020-04-11')
select ip,uname,login_time,url from (
select a.*,b.uname b_uname from
(select *
from t_user_active where dt='2020-04-11') a
left join (select uname
from t_user_active where dt<'2020-04-11') b on a.uname = b.uname) tmp where tmp.b_uname is null;
insert into table t_user_new partition(dt='2020-04-12')
select ip,uname,login_time,url from (
select a.*,b.uname b_uname from
(select *
from t_user_active where dt='2020-04-12') a
left join (select uname
from t_user_active where dt<'2020-04-12') b on a.uname = b.uname) tmp where tmp.b_uname is null;
-- create the dimension summary table dim_user_active_day
DROP TABLE dim_user_active_day;
CREATE TABLE dim_user_active_day (
os_name string
,city string
,release_channel string
,app_ver_name string
,cnts INT
) partitioned BY (
day string
,dim string
);
-- Use multi-insert syntax to compute the daily active user count for each dimension combination and write it into the corresponding partition of the summary table;
FROM etl_user_active_day
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2017-09-21'
,dim = '0000'
)
SELECT 'all'
,'all'
,'all'
,'all'
,count(1)
WHERE day = '2017-09-21'
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2017-09-21'
,dim = '1000'
)
SELECT os_name
,'all'
,'all'
,'all'
,count(1)
WHERE day = '2017-09-21'
GROUP BY (os_name)
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2017-09-21'
,dim = '0100'
)
SELECT 'all'
,city
,'all'
,'all'
,count(1)
WHERE day = '2017-09-21'
GROUP BY (city)
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2017-09-21'
,dim = '0010'
)
SELECT 'all'
,'all'
,release_channel
,'all'
,count(1)
WHERE day = '2017-09-21'
GROUP BY (release_channel)
INSERT INTO TABLE dim_user_active_day PARTITION (
day = '2017-09-21'
,dim = '0001'
)
SELECT 'all'
,'all'
,'all'
,app_ver_name
,count(1)
WHERE day = '2017-09-21'
GROUP BY (app_ver_name)
......
;