第十二单元 常用函数、窗口函数(开窗函数)
1、常用函数
1.1 字符串
-- String concatenation: concat joins its arguments in order.
select concat('abc', 'def');
-- concat_ws takes the separator first, then the strings to join.
select concat_ws('-', 'abc', 'def');
-- length returns the length of the string.
select length('jsdfijsdkfjkdsfjkdf');
1.2 日期
-- to_date: keep only the date part of a timestamp string.
select to_date('2019-09-11 16:55:11');
-- unix_timestamp(string, pattern): parse a string into a Unix timestamp.
select unix_timestamp('2019-09-11 11:55:11', 'yyyy-MM-dd HH:mm:ss');
-- unix_timestamp() without arguments: the current Unix timestamp.
select unix_timestamp();
-- from_unixtime: format a Unix timestamp back into a string.
select from_unixtime(unix_timestamp(), 'yyyy-MM-dd HH:mm:ss');
1.3 数学运算函数
-- Math functions. Every statement ends with ';' so the block can be run as a script.
-- Round to the nearest integer.
select round(5.4);
-- Round to 2 decimal places.
select round(5.1345, 2);
-- Round up to the next integer.
select ceil(5.3);
-- Round down to the previous integer.
select floor(5.3);
-- Absolute value.
select abs(-5.2);
-- Largest of the listed values (compares the arguments within a single row).
select greatest(3, 4, 5, 6, 7);
-- Not to be confused with the aggregate max(col), which works across rows,
-- e.g. select max(age) from t_student;
-- Smallest of the listed values (compares the arguments within a single row).
select least(3, 4, 5, 6, 7);
-- Not to be confused with the aggregate min(col), which works across rows,
-- e.g. select min(age) from t_student;
-- Random number, uniformly distributed between 0 and 1.
select rand();
2、窗口函数(开窗函数)
在开窗函数出现之前存在着很多用 SQL 语句很难解决的问题,很多都要通过复杂的相关子查询或者存储过程来完成。为了解决这些问题,在2003年ISO SQL标准加入了开窗函数,开窗函数的使用使得这些经典的难题可以被轻松地解决。目前 MSSQLServer、Oracle、DB2 等主流数据库都提供了对开窗函数的支持;MySQL 在 8.0 之前不支持开窗函数,自 8.0 版本起也已提供支持。
与聚合函数一样,开窗函数也是对行集组进行聚合计算,但是它不像普通聚合函数那样每组只返回一个值,开窗函数可以为每组返回多个值,因为开窗函数所执行聚合计算的行集组是窗口。
2.1 窗口---排序
1,18,a,male
2,19,b,male
3,19,c,male
4,22,d,female
5,22,e,female
6,16,f,female
7,30,g,male
8,26,h,female
建表:
-- Student table; each comma-separated field of a data row maps to one column, in order.
create table t_student (
    id   int,
    age  int,
    name string,
    sex  string
)
row format delimited
fields terminated by ',';
导入数据:
-- Load the local file into t_student. Without `overwrite`, the rows are appended
-- to whatever the table already holds.
load data local inpath '/root/student.dat'
into table t_student;
row_number() over:
-- Exercise: which of the following statements can be executed?
--   select * from t_student group by sex;
--   select sex, max(age) from t_student group by sex;
--   select id, age, name, sex, max(age) from t_student group by sex;
-- A plain group by can only select the grouping columns or aggregate functions;
-- returning several rows per group needs something more elaborate.
-- row_number() numbers the rows inside each sex partition by descending age
-- while keeping every input row.
select
    id,
    age,
    name,
    sex,
    row_number() over (partition by sex order by age desc) as rk
from t_student;
+-----+------+-------+---------+-----+--+
| id | age | name | sex | rk |
+-----+------+-------+---------+-----+--+
| 8 | 26 | h | female | 1 |
| 5 | 22 | e | female | 2 |
| 4 | 22 | d | female | 3 |
| 6 | 16 | f | female | 4 |
| 7 | 30 | g | male | 1 |
| 3 | 19 | c | male | 2 |
| 2 | 19 | b | male | 3 |
| 1 | 18 | a | male | 4 |
+-----+------+-------+---------+-----+--+
分组求Top1、TopN:
-- Top 1 per group: the row numbered 1 in each sex partition (highest age first).
with tmp as (
    select
        id,
        age,
        name,
        sex,
        row_number() over (partition by sex order by age desc) as rk
    from t_student
)
select tmp.*
from tmp
where tmp.rk = 1;
-- Top N per group (here N = 3): same numbering, keep the first three rows of each partition.
with tmp as (
    select
        id,
        age,
        name,
        sex,
        row_number() over (partition by sex order by age desc) as rk
    from t_student
)
select tmp.*
from tmp
where tmp.rk <= 3;
rank() over、dense_rank() over、ntile(n) over
-- rank(): rows with equal age share a rank, and the following rank is skipped.
select
    id,
    age,
    name,
    sex,
    rank() over (partition by sex order by age desc) as rk
from t_student;
+-----+------+-------+---------+-----+--+
| id | age | name | sex | rk |
+-----+------+-------+---------+-----+--+
| 8 | 26 | h | female | 1 |
| 5 | 22 | e | female | 2 |
| 4 | 22 | d | female | 2 |
| 6 | 16 | f | female | 4 |
| 7 | 30 | g | male | 1 |
| 3 | 19 | c | male | 2 |
| 2 | 19 | b | male | 2 |
| 1 | 18 | a | male | 4 |
+-----+------+-------+---------+-----+--+
-- dense_rank(): rows with equal age share a rank, and no rank number is skipped afterwards.
select
    id,
    age,
    name,
    sex,
    dense_rank() over (partition by sex order by age desc) as rk
from t_student;
+-----+------+-------+---------+-----+--+
| id | age | name | sex | rk |
+-----+------+-------+---------+-----+--+
| 8 | 26 | h | female | 1 |
| 5 | 22 | e | female | 2 |
| 4 | 22 | d | female | 2 |
| 6 | 16 | f | female | 3 |
| 7 | 30 | g | male | 1 |
| 3 | 19 | c | male | 2 |
| 2 | 19 | b | male | 2 |
| 1 | 18 | a | male | 3 |
+-----+------+-------+---------+-----+--+
-- ntile(3): split the ordered rows of each sex partition into 3 buckets, numbered from 1.
select
    id,
    age,
    name,
    sex,
    ntile(3) over (partition by sex order by age desc) as rk
from t_student;
+-----+------+-------+---------+-----+--+
| id | age | name | sex | rk |
+-----+------+-------+---------+-----+--+
| 8 | 26 | h | female | 1 |
| 5 | 22 | e | female | 1 |
| 4 | 22 | d | female | 2 |
| 6 | 16 | f | female | 3 |
| 7 | 30 | g | male | 1 |
| 3 | 19 | c | male | 1 |
| 2 | 19 | b | male | 2 |
| 1 | 18 | a | male | 3 |
+-----+------+-------+---------+-----+--+
2.2 窗口---求和
A,2015-01,5
A,2015-01,15
B,2015-01,5
A,2015-01,8
B,2015-01,25
A,2015-01,5
C,2015-01,10
C,2015-01,20
A,2015-02,4
A,2015-02,6
C,2015-02,30
C,2015-02,10
B,2015-02,10
B,2015-02,5
A,2015-03,14
A,2015-03,6
B,2015-03,20
B,2015-03,25
C,2015-03,10
C,2015-03,20
建表:
-- Sales table: one row per sale, with fields separated by commas.
create table t_saller (
    name   string,
    month  string,
    amount int
)
row format delimited
fields terminated by ',';
导入数据:
-- Load the local file into t_saller; `overwrite` replaces any rows already in the table.
load data local inpath '/root/saller.dat'
overwrite into table t_saller;
求每个销售员每个月的销售额和到当月为止的累计销售额:
-- Monthly sales per seller: group by seller first, then by month.
select
    name,
    month,
    sum(amount) as samount
from t_saller
group by
    name,
    month;
+-------+----------+----------+--+
| name | month | samount |
+-------+----------+----------+--+
| A | 2015-01 | 33 |
| A | 2015-02 | 10 |
| A | 2015-03 | 20 |
| B | 2015-01 | 30 |
| B | 2015-02 | 15 |
| B | 2015-03 | 45 |
| C | 2015-01 | 30 |
| C | 2015-02 | 40 |
| C | 2015-03 | 30 |
+-------+----------+----------+--+
-- To avoid piling up subqueries, the monthly sales can be materialised in a table up front.
create table t_accumulate as
select
    name,
    month,
    sum(amount) as samount
from t_saller
group by
    name,
    month;
-- Running total: the frame runs from the first row of the partition to the current row.
-- Frame vocabulary:
--   preceding            rows before the current row
--   following            rows after the current row
--   current row          the current row itself
--   unbounded preceding  the first row of the partition
--   unbounded following  the last row of the partition
--   n preceding / n following  n rows before / after the current row
select
    name,
    month,
    samount,
    sum(samount) over (
        partition by name
        order by month
        rows between unbounded preceding and current row
    ) as accumlateAmount
from t_accumulate;
+-------+----------+----------+------------------+--+
| name | month | samount | accumlateamount |
+-------+----------+----------+------------------+--+
| A | 2015-01 | 33 | 33 |
| A | 2015-02 | 10 | 43 |
| A | 2015-03 | 20 | 63 |
| B | 2015-01 | 30 | 30 |
| B | 2015-02 | 15 | 45 |
| B | 2015-03 | 45 | 90 |
| C | 2015-01 | 30 | 30 |
| C | 2015-02 | 40 | 70 |
| C | 2015-03 | 30 | 100 |
+-------+----------+----------+------------------+--+
-- Another frame: a sliding window made of the 2 previous rows, the current row and the next row.
select
    name,
    month,
    samount,
    sum(samount) over (
        partition by name
        order by month
        rows between 2 preceding and 1 following
    ) as accumlateAmount
from t_accumulate;
2.3 窗口---其他
min() over() ,max() over() , avg() over()
3、explode()
名词解释:explode 爆炸
1,zhangsan,化学:物理:数学:语文
2,lisi,化学:数学:生物:生理:卫生
3,wangwu,化学:语文:英语:体育:生物
建表:
-- Student/subject table. Fields are separated by ','; the items of the subjects
-- array are separated by ':'.
-- An array column must declare its element type, so subjects is array<string>.
create table t_stu_subject (
    id       int,
    name     string,
    subjects array<string>
)
row format delimited
fields terminated by ','
collection items terminated by ':';
导入数据:
-- Load the local file into t_stu_subject. Without `overwrite`, the rows are appended
-- to whatever the table already holds.
load data local inpath '/root/stu_subject.dat'
into table t_stu_subject;
需求:从学生学习课程分析这个学校都有哪些课程
-- Explode the subjects array: one output row per array element.
select explode(subjects)
from t_stu_subject;
-- Deduplicate the exploded subjects to list every distinct course.
select distinct tmp.subs
from (
    select explode(subjects) as subs
    from t_stu_subject
) tmp;
lateral view 侧视图:把 explode 炸开的每一行与原表中对应的行关联起来
-- lateral view pairs every exploded subject with the id and name of the row it came from.
select
    id,
    name,
    sub
from t_stu_subject
lateral view explode(subjects) tmp as sub;
+-----+-----------+------+--+
| id | name | sub |
+-----+-----------+------+--+
| 1 | zhangsan | 化学 |
| 1 | zhangsan | 物理 |
| 1 | zhangsan | 数学 |
| 1 | zhangsan | 语文 |
| 2 | lisi | 化学 |
| 2 | lisi | 数学 |
| 2 | lisi | 生物 |
| 2 | lisi | 生理 |
| 2 | lisi | 卫生 |
| 3 | wangwu | 化学 |
| 3 | wangwu | 语文 |
| 3 | wangwu | 英语 |
| 3 | wangwu | 体育 |
| 3 | wangwu | 生物 |
+-----+-----------+------+--+
-- Word count: split() cuts each line on single spaces, explode() yields one word per row,
-- then the words are counted and listed from most to least frequent.
-- NOTE(review): the words table (with a string column `line`) is not created in these notes; confirm it exists.
select
    tmp.word,
    count(1) as cnts
from (
    select explode(split(line, ' ')) as word
    from words
) tmp
group by tmp.word
order by cnts desc;
-- Exploding a map: each entry becomes a row with two columns, aliased here as key and value.
-- NOTE(review): t_people and its `family` column are not defined in these notes; presumably family is a map, confirm.
select
    id,
    name,
    key,
    value
from t_people
lateral view explode(family) tmp as key, value;
第十三单元 Hive 自定义函数,Hive API、Hive脚本
1月11号
第十四单元 其他
1月12号