hiveSql

第十二单元 常用函数、窗口函数(开窗函数)

1、常用函数

1.1 字符串
-- concat: join the argument strings end to end
select concat('abc', 'def');
-- concat_ws: join the strings using the first argument as the separator
select concat_ws('-', 'abc', 'def');

-- length: number of characters in the string
select length(
    'jsdfijsdkfjkdsfjkdf'
);


1.2 日期
-- to_date: keep only the date part of a timestamp string
select to_date(
    '2019-09-11 16:55:11'
);

-- unix_timestamp(string, pattern): parse a string into a Unix timestamp (seconds)
select unix_timestamp(
    '2019-09-11 11:55:11',
    'yyyy-MM-dd HH:mm:ss'
);

-- unix_timestamp(): the current Unix timestamp
select unix_timestamp();

-- from_unixtime: format a Unix timestamp as a string using the given pattern
select from_unixtime(
    unix_timestamp(),
    'yyyy-MM-dd HH:mm:ss'
);

1.3 数学运算函数
-- Round to the nearest integer
select round(5.4);
-- Round to 2 decimal places
select round(5.1345, 2);
-- Round up (ceiling)
select ceil(5.3);
-- Round down (floor)
select floor(5.3);
-- Absolute value
select abs(-5.2);
-- Largest of the listed arguments (compares values within a single row)
select greatest(3, 4, 5, 6, 7);
-- Contrast: max(col) is an aggregate that returns the largest value of one column
-- across rows, e.g. "select max(col) from some_table"
-- Smallest of the listed arguments (compares values within a single row)
select least(3, 4, 5, 6, 7);
-- Contrast: min(col) is an aggregate that returns the smallest value of one column
-- across rows, e.g. "select min(col) from some_table"
-- Random number: rand() returns a double between 0 and 1
select rand();

2、窗口函数(开窗函数)

在开窗函数出现之前,存在着很多用 SQL 语句很难解决的问题,很多都要通过复杂的相关子查询或者存储过程来完成。为了解决这些问题,ISO SQL 标准在 2003 年加入了开窗函数,开窗函数的使用使得这些经典的难题可以被轻松地解决。目前 MSSQLServer、Oracle、DB2 等主流数据库都提供了对开窗函数的支持;MySQL 在 8.0 之前的版本不支持开窗函数,自 8.0 版本起也已提供支持。Hive 同样支持开窗函数。

与聚合函数一样,开窗函数也是对一组行(行集)进行计算;不同的是,普通聚合函数每组只返回一行结果,而开窗函数不会把组内的多行合并成一行,而是为组内的每一行都返回一个值,因为它是在与当前行相关的一组行(即"窗口")上进行计算的。

2.1 窗口---排序

1,18,a,male
2,19,b,male
3,19,c,male
4,22,d,female
5,22,e,female
6,16,f,female
7,30,g,male
8,26,h,female

建表:

-- Student table stored as comma-delimited text: id, age, name, sex.
create table t_student (
    id   int,
    age  int,
    name string,
    sex  string
)
row format delimited
    fields terminated by ',';

导入数据:

-- Load the local file /root/student.dat into t_student (no overwrite keyword, so rows are added).
load data local inpath '/root/student.dat'
into table t_student;

row_number() over:

-- Exercise: does this run?  select * from t_student group by sex
-- Exercise: does this run?  select sex,max(age) from t_student group by sex
-- Exercise: does this run?  select id,age,name,sex,max(age) from t_student group by sex
-- A plain group by may only select the grouping columns or aggregate functions;
-- returning several rows per group needs something more involved.

-- Number the students within each sex from oldest to youngest; rk restarts at 1 for every sex.
select
    id,
    age,
    name,
    sex,
    row_number() over (partition by sex order by age desc) as rk
from t_student;

+-----+------+-------+---------+-----+--+
| id  | age  | name  |   sex   | rk  |
+-----+------+-------+---------+-----+--+
| 8   | 26   | h     | female  | 1   |
| 5   | 22   | e     | female  | 2   |
| 4   | 22   | d     | female  | 3   |
| 6   | 16   | f     | female  | 4   |
| 7   | 30   | g     | male    | 1   |
| 3   | 19   | c     | male    | 2   |
| 2   | 19   | b     | male    | 3   |
| 1   | 18   | a     | male    | 4   |
+-----+------+-------+---------+-----+--+

分组求Top1、TopN:

-- Top 1 per group: number the students within each sex by age (oldest first) and keep rk = 1.
-- row_number() gives tied ages distinct numbers, so exactly one row per sex is returned;
-- the query has no tie-breaker, so which of the tied rows gets the lower number is not fixed.
with tmp as (
    select
        id,
        age,
        name,
        sex,
        row_number() over (partition by sex order by age desc) as rk
    from t_student
)
select tmp.*
from tmp
where tmp.rk = 1;

-- Top N per group (N = 3 here): same numbering as for Top 1, keeping rk 1 through 3 for each sex.
with tmp as (
    select
        id,
        age,
        name,
        sex,
        row_number() over (partition by sex order by age desc) as rk
    from t_student
)
select tmp.*
from tmp
where tmp.rk <= 3;

rank() over、dense_rank() over、ntile(n) over

-- rank(): tied ages share a rank and the following rank is skipped (1, 2, 2, 4).
select
    id,
    age,
    name,
    sex,
    rank() over (partition by sex order by age desc) as rk
from t_student;

+-----+------+-------+---------+-----+--+
| id  | age  | name  |   sex   | rk  |
+-----+------+-------+---------+-----+--+
| 8   | 26   | h     | female  | 1   |
| 5   | 22   | e     | female  | 2   |
| 4   | 22   | d     | female  | 2   |
| 6   | 16   | f     | female  | 4   |
| 7   | 30   | g     | male    | 1   |
| 3   | 19   | c     | male    | 2   |
| 2   | 19   | b     | male    | 2   |
| 1   | 18   | a     | male    | 4   |
+-----+------+-------+---------+-----+--+

-- dense_rank(): tied ages share a rank and no rank is skipped afterwards (1, 2, 2, 3).
select
    id,
    age,
    name,
    sex,
    dense_rank() over (partition by sex order by age desc) as rk
from t_student;

+-----+------+-------+---------+-----+--+
| id  | age  | name  |   sex   | rk  |
+-----+------+-------+---------+-----+--+
| 8   | 26   | h     | female  | 1   |
| 5   | 22   | e     | female  | 2   |
| 4   | 22   | d     | female  | 2   |
| 6   | 16   | f     | female  | 3   |
| 7   | 30   | g     | male    | 1   |
| 3   | 19   | c     | male    | 2   |
| 2   | 19   | b     | male    | 2   |
| 1   | 18   | a     | male    | 3   |
+-----+------+-------+---------+-----+--+

-- ntile(3): split each sex partition, ordered by age descending, into 3 buckets;
-- rk is the bucket number of the row.
select
    id,
    age,
    name,
    sex,
    ntile(3) over (partition by sex order by age desc) as rk
from t_student;

+-----+------+-------+---------+-----+--+
| id  | age  | name  |   sex   | rk  |
+-----+------+-------+---------+-----+--+
| 8   | 26   | h     | female  | 1   |
| 5   | 22   | e     | female  | 1   |
| 4   | 22   | d     | female  | 2   |
| 6   | 16   | f     | female  | 3   |
| 7   | 30   | g     | male    | 1   |
| 3   | 19   | c     | male    | 1   |
| 2   | 19   | b     | male    | 2   |
| 1   | 18   | a     | male    | 3   |
+-----+------+-------+---------+-----+--+
2.2 窗口---求和

A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
C,2015-01,10
C,2015-01,20
A,2015-02,4
A,2015-02,6
C,2015-02,30
C,2015-02,10
B,2015-02,10
B,2015-02,5
A,2015-03,14
A,2015-03,6
B,2015-03,20
B,2015-03,25
C,2015-03,10
C,2015-03,20

建表:

-- Sales records stored as comma-delimited text: salesperson name, month, amount.
create table t_saller (
    name   string,
    month  string,
    amount int
)
row format delimited
    fields terminated by ',';

导入数据:

-- Load the local file /root/saller.dat into t_saller; overwrite replaces the table's existing data.
load data local inpath '/root/saller.dat'
overwrite into table t_saller;

求每个销售员每个月的销售额和到当月为止的累计销售额:

-- Monthly sales total per salesperson: group by salesperson first, then by month.
select
    name,
    month,
    sum(amount) as samount
from t_saller
group by
    name,
    month;

+-------+----------+----------+--+
| name  |  month   | samount  |
+-------+----------+----------+--+
| A     | 2015-01  | 33       |
| A     | 2015-02  | 10       |
| A     | 2015-03  | 20       |
| B     | 2015-01  | 30       |
| B     | 2015-02  | 15       |
| B     | 2015-03  | 45       |
| C     | 2015-01  | 30       |
| C     | 2015-02  | 40       |
| C     | 2015-03  | 30       |
+-------+----------+----------+--+


-- To avoid nesting subqueries later, store the monthly sales totals in a table up front.
create table t_accumulate as
select
    name,
    month,
    sum(amount) as samount
from t_saller
group by
    name,
    month;

-- Running total: the frame runs from the first row of the partition to the current row.
-- Frame vocabulary:
--   preceding             rows before the current row
--   following             rows after the current row
--   current row           the current row itself
--   unbounded             no limit in the given direction
--   unbounded preceding   the first row of the partition
--   unbounded following   the last row of the partition
--   n preceding / n following   n rows before / after the current row

select
    name,
    month,
    samount,
    sum(samount) over (
        partition by name
        order by month
        rows between unbounded preceding and current row
    ) as accumlateAmount
from t_accumulate;
+-------+----------+----------+------------------+--+
| name  |  month   | samount  | accumlateamount  |
+-------+----------+----------+------------------+--+
| A     | 2015-01  | 33       | 33               |
| A     | 2015-02  | 10       | 43               |
| A     | 2015-03  | 20       | 63               |
| B     | 2015-01  | 30       | 30               |
| B     | 2015-02  | 15       | 45               |
| B     | 2015-03  | 45       | 90               |
| C     | 2015-01  | 30       | 30               |
| C     | 2015-02  | 40       | 70               |
| C     | 2015-03  | 30       | 100              |
+-------+----------+----------+------------------+--+

-- Another frame: from 2 rows before the current row to 1 row after it, within each salesperson.
select
    name,
    month,
    samount,
    sum(samount) over (
        partition by name
        order by month
        rows between 2 preceding and 1 following
    ) as accumlateAmount
from t_accumulate;

2.3 窗口---其他
min() over()、max() over()、avg() over():用法与 sum() over() 相同,分别在窗口内求最小值、最大值和平均值。

3、explode()

名词解释:explode 爆炸

1,zhangsan,化学:物理:数学:语文
2,lisi,化学:数学:生物:生理:卫生
3,wangwu,化学:语文:英语:体育:生物

建表:

-- Students and the subjects they take, stored as text:
-- fields are separated by ',' and the elements of the subjects array by ':'.
-- An array column must declare its element type; the subject names are strings.
create table t_stu_subject (
    id       int,
    name     string,
    subjects array<string>
)
row format delimited
    fields terminated by ','
    collection items terminated by ':';

导入数据:

-- Load the local file /root/stu_subject.dat into t_stu_subject (no overwrite keyword, so rows are added).
load data local inpath '/root/stu_subject.dat'
into table t_stu_subject;

需求:从学生学习课程分析这个学校都有哪些课程

-- Explode the subjects array: one output row per array element.
select explode(subjects)
from t_stu_subject;

-- De-duplicate the exploded subjects to get the list of distinct courses.
select distinct tmp.subs
from (
    select explode(subjects) as subs
    from t_stu_subject
) tmp;

lateral view:把 explode() 这类函数为每一行生成的多行结果,与原表中对应的那一行连接起来,这样就可以同时查询原表的其他字段

-- Pair every student with each of their subjects: lateral view joins each source row
-- to the rows that explode(subjects) produces for it, exposed as column sub.
select
    id,
    name,
    sub
from t_stu_subject
lateral view explode(subjects) tmp as sub;

+-----+-----------+------+--+
| id  |   name    | sub  |
+-----+-----------+------+--+
| 1   | zhangsan  | 化学   |
| 1   | zhangsan  | 物理   |
| 1   | zhangsan  | 数学   |
| 1   | zhangsan  | 语文   |
| 2   | lisi      | 化学   |
| 2   | lisi      | 数学   |
| 2   | lisi      | 生物   |
| 2   | lisi      | 生理   |
| 2   | lisi      | 卫生   |
| 3   | wangwu    | 化学   |
| 3   | wangwu    | 语文   |
| 3   | wangwu    | 英语   |
| 3   | wangwu    | 体育   |
| 3   | wangwu    | 生物   |
+-----+-----------+------+--+

-- Word count: split() cuts each line on single spaces, explode() emits one row per word,
-- then the words are counted and listed from most to least frequent.
-- NOTE(review): table words and its column line are not defined in this file;
-- presumably a one-column table of text lines -- confirm.
select
    tmp.word,
    count(1) as cnts
from (
    select explode(split(line, ' ')) as word
    from words
) tmp
group by tmp.word
order by cnts desc;

-- Explode a map: each entry becomes one row with two columns, aliased key and value.
-- NOTE(review): table t_people and its column family are not defined in this file;
-- family is presumably a map column -- confirm.
select
    id,
    name,
    key,
    value
from t_people
lateral view explode(family) tmp as key, value;

第十三单元 Hive 自定义函数,Hive API、Hive脚本

1月11号

第十四单元 其他

1月12号

你可能感兴趣的:(hiveSql)