常用hive开窗函数示例

个人博客原文链接

简介

普通的聚合函数每组(group by)只返回一个值,而开窗函数则可以为每行都返回一个值。简而言之,相当于对查询的结果添加新的一列值,这一列可以是聚合值,也可以是排序号。开窗函数的关键字是over()。

准备测试数据

  1. 建表
    create table if not exists student_scores(
    id string,
    studentId string,
    language string,
    math string,
    english string,
    classId string,
    departmentId string
    );

  2. 插入数据
    insert into table student_scores values
    (‘1’,‘111’,‘68’,‘69’,‘90’,‘class1’,‘department1’),
    (‘2’,‘112’,‘73’,‘80’,‘96’,‘class1’,‘department1’),
    (‘3’,‘113’,‘90’,‘74’,‘75’,‘class1’,‘department1’),
    (‘4’,‘114’,‘89’,‘94’,‘93’,‘class1’,‘department2’),
    (‘5’,‘115’,‘99’,‘93’,‘89’,‘class1’,‘department1’),
    (‘6’,‘121’,‘96’,‘74’,‘79’,‘class2’,‘department1’),
    (‘7’,‘122’,‘89’,‘86’,‘85’,‘class2’,‘department1’),
    (‘8’,‘123’,‘70’,‘78’,‘61’,‘class2’,‘department1’),
    (‘9’,‘124’,‘76’,‘70’,‘76’,‘class2’,‘department1’),
    (‘10’,‘211’,‘89’,‘93’,‘60’,‘class1’,‘department2’),
    (‘11’,‘212’,‘76’,‘83’,‘75’,‘class1’,‘department2’),
    (‘12’,‘213’,‘71’,‘94’,‘90’,‘class2’,‘department2’),
    (‘13’,‘214’,‘94’,‘94’,‘66’,‘class1’,‘department2’),
    (‘14’,‘215’,‘84’,‘82’,‘73’,‘class1’,‘department2’),
    (‘15’,‘216’,‘85’,‘74’,‘93’,‘class1’,‘department2’),
    (‘16’,‘221’,‘77’,‘99’,‘61’,‘class2’,‘department2’),
    (‘17’,‘222’,‘80’,‘78’,‘96’,‘class2’,‘department2’),
    (‘18’,‘223’,‘79’,‘74’,‘96’,‘class2’,‘department2’),
    (‘19’,‘224’,‘75’,‘80’,‘78’,‘class2’,‘department2’),
    (‘20’,‘225’,‘82’,‘85’,‘63’,‘class2’,‘department2’);

count()

  1. 以符合条件的所有行为作为窗口
    select studentId,language,math,english,classId,departmentId,count(math) over() as count1
    from student_scores;
    结果:
    111 68 69 90 class1 department1 20
    112 73 80 96 class1 department1 20
    113 90 74 75 class1 department1 20
    114 89 94 93 class1 department2 20
    115 99 93 89 class1 department1 20
    121 96 74 79 class2 department1 20
    122 89 86 85 class2 department1 20
    123 70 78 61 class2 department1 20
    124 76 70 76 class2 department1 20
    211 89 93 60 class1 department2 20
    212 76 83 75 class1 department2 20
    213 71 94 90 class2 department2 20
    214 94 94 66 class1 department2 20
    215 84 82 73 class1 department2 20
    216 85 74 93 class1 department2 20
    221 77 99 61 class2 department2 20
    222 80 78 96 class2 department2 20
    223 79 74 96 class2 department2 20
    224 75 80 78 class2 department2 20
    225 82 85 63 class2 department2 20

  2. 以根据classId分组的所有行为作为窗口
    select studentId,language,math,english,classId,departmentId,count(math) over(partition by classId) as count2
    from student_scores;
    结果:
    121 96 74 79 class2 department1 10
    122 89 86 85 class2 department1 10
    123 70 78 61 class2 department1 10
    124 76 70 76 class2 department1 10
    213 71 94 90 class2 department2 10
    221 77 99 61 class2 department2 10
    222 80 78 96 class2 department2 10
    223 79 74 96 class2 department2 10
    224 75 80 78 class2 department2 10
    225 82 85 63 class2 department2 10
    111 68 69 90 class1 department1 10
    112 73 80 96 class1 department1 10
    113 90 74 75 class1 department1 10
    114 89 94 93 class1 department2 10
    115 99 93 89 class1 department1 10
    211 89 93 60 class1 department2 10
    212 76 83 75 class1 department2 10
    214 94 94 66 class1 department2 10
    215 84 82 73 class1 department2 10
    216 85 74 93 class1 department2 10

  3. 以根据classId分组、根据math倒序排序(即count大于等于当前行math的值的个数)的所有行为作为窗口
    select studentId,language,math,english,classId,departmentId,count(math) over(partition by classId order by math desc) as count3
    from student_scores;
    结果:
    221 77 99 61 class2 department2 1
    213 71 94 90 class2 department2 2
    122 89 86 85 class2 department1 3
    225 82 85 63 class2 department2 4
    224 75 80 78 class2 department2 5
    123 70 78 61 class2 department1 7
    222 80 78 96 class2 department2 7
    121 96 74 79 class2 department1 9
    223 79 74 96 class2 department2 9
    124 76 70 76 class2 department1 10
    114 89 94 93 class1 department2 2
    214 94 94 66 class1 department2 2
    115 99 93 89 class1 department1 4
    211 89 93 60 class1 department2 4
    212 76 83 75 class1 department2 5
    215 84 82 73 class1 department2 6
    112 73 80 96 class1 department1 7
    113 90 74 75 class1 department1 9
    216 85 74 93 class1 department2 9
    111 68 69 90 class1 department1 10

  4. 以根据classId分组、根据math排序、从当前行前1行到当前行后2行的所有行为作为窗口
    select studentId,language,math,english,classId,departmentId,count(math) over(partition by classId order by math desc rows between 1 preceding and 2 following) as count4
    from student_scores;
    结果:
    221 77 99 61 class2 department2 3
    213 71 94 90 class2 department2 4
    122 89 86 85 class2 department1 4
    225 82 85 63 class2 department2 4
    224 75 80 78 class2 department2 4
    123 70 78 61 class2 department1 4
    222 80 78 96 class2 department2 4
    121 96 74 79 class2 department1 4
    223 79 74 96 class2 department2 3
    124 76 70 76 class2 department1 2
    114 89 94 93 class1 department2 3
    214 94 94 66 class1 department2 4
    115 99 93 89 class1 department1 4
    211 89 93 60 class1 department2 4
    212 76 83 75 class1 department2 4
    215 84 82 73 class1 department2 4
    112 73 80 96 class1 department1 4
    113 90 74 75 class1 department1 4
    216 85 74 93 class1 department2 3
    111 68 69 90 class1 department1 2

  5. 以根据classId分组、根据math排序、从第一行到最后一行的所有行为作为窗口,默认就是所有行,相当于count3
    select studentId,language,math,english,classId,departmentId,count(math) over(partition by classId order by math desc rows between unbounded preceding and unbounded following) as count5
    from student_scores;

  6. 以根据classId分组、根据math排序、从当前行前1行到当前行的所有行为作为窗口
    select studentId,language,math,english,classId,departmentId,count(math) over(partition by classId order by math desc rows between 1 preceding and current row) as count6
    from student_scores;
    结果:
    221 77 99 61 class2 department2 1
    213 71 94 90 class2 department2 2
    122 89 86 85 class2 department1 2
    225 82 85 63 class2 department2 2
    224 75 80 78 class2 department2 2
    123 70 78 61 class2 department1 2
    222 80 78 96 class2 department2 2
    121 96 74 79 class2 department1 2
    223 79 74 96 class2 department2 2
    124 76 70 76 class2 department1 2
    114 89 94 93 class1 department2 1
    214 94 94 66 class1 department2 2
    115 99 93 89 class1 department1 2
    211 89 93 60 class1 department2 2
    212 76 83 75 class1 department2 2
    215 84 82 73 class1 department2 2
    112 73 80 96 class1 department1 2
    113 90 74 75 class1 department1 2
    216 85 74 93 class1 department2 2
    111 68 69 90 class1 department1 2

row_number()

以根据classId分组、根据math排序的所有行为作为窗口,从1开始对分区内的数据排序,相同的值无重复排名,即排名是连续的。
select studentId,language,math,english,classId,departmentId,row_number() over(partition by classId order by math desc) as count1
from student_scores;
结果:
221 77 99 61 class2 department2 1
213 71 94 90 class2 department2 2
122 89 86 85 class2 department1 3
225 82 85 63 class2 department2 4
224 75 80 78 class2 department2 5
123 70 78 61 class2 department1 6
222 80 78 96 class2 department2 7
121 96 74 79 class2 department1 8
223 79 74 96 class2 department2 9
124 76 70 76 class2 department1 10
114 89 94 93 class1 department2 1
214 94 94 66 class1 department2 2
115 99 93 89 class1 department1 3
211 89 93 60 class1 department2 4
212 76 83 75 class1 department2 5
215 84 82 73 class1 department2 6
112 73 80 96 class1 department1 7
113 90 74 75 class1 department1 8
216 85 74 93 class1 department2 9
111 68 69 90 class1 department1 10

rank()

  1. 以根据math排序的所有行为作为窗口,基于over子句中的orderby来确定一组值中一个值的排名,相同的值有重复排名,即排名可能不是连续的。
    select studentId,language,math,english,classId,departmentId,rank() over(order by math desc) as count1
    from student_scores;
    结果:
    221 77 99 61 class2 department2 1
    114 89 94 93 class1 department2 2
    213 71 94 90 class2 department2 2
    214 94 94 66 class1 department2 2
    115 99 93 89 class1 department1 5
    211 89 93 60 class1 department2 5
    122 89 86 85 class2 department1 7
    225 82 85 63 class2 department2 8
    212 76 83 75 class1 department2 9
    215 84 82 73 class1 department2 10
    112 73 80 96 class1 department1 11
    224 75 80 78 class2 department2 11
    123 70 78 61 class2 department1 13
    222 80 78 96 class2 department2 13
    113 90 74 75 class1 department1 15
    121 96 74 79 class2 department1 15
    216 85 74 93 class1 department2 15
    223 79 74 96 class2 department2 15
    124 76 70 76 class2 department1 19
    111 68 69 90 class1 department1 20

  2. 以根据classId分组、math排序的所有行为作为窗口,基于over子句中的orderby来确定一组值中一个值的排名,相同的值有重复排名,即排名可能不是连续的。
    select studentId,language,math,english,classId,departmentId,rank() over(partition by classId order by math desc) as count2
    from student_scores;
    结果:
    221 77 99 61 class2 department2 1
    213 71 94 90 class2 department2 2
    122 89 86 85 class2 department1 3
    225 82 85 63 class2 department2 4
    224 75 80 78 class2 department2 5
    123 70 78 61 class2 department1 6
    222 80 78 96 class2 department2 6
    121 96 74 79 class2 department1 8
    223 79 74 96 class2 department2 8
    124 76 70 76 class2 department1 10
    114 89 94 93 class1 department2 1
    214 94 94 66 class1 department2 1
    115 99 93 89 class1 department1 3
    211 89 93 60 class1 department2 3
    212 76 83 75 class1 department2 5
    215 84 82 73 class1 department2 6
    112 73 80 96 class1 department1 7
    113 90 74 75 class1 department1 8
    216 85 74 93 class1 department2 8
    111 68 69 90 class1 department1 10

dense_rank()

dense_rank与rank的区别在于,相同的值会有重复排名,且排名也是连续的。如两个行的排名为1,则下一个排名为2。
select studentId,language,math,english,classId,departmentId,dense_rank() over(order by math desc) as count1
from student_scores;
结果:
221 77 99 61 class2 department2 1
114 89 94 93 class1 department2 2
213 71 94 90 class2 department2 2
214 94 94 66 class1 department2 2
115 99 93 89 class1 department1 3
211 89 93 60 class1 department2 3
122 89 86 85 class2 department1 4
225 82 85 63 class2 department2 5
212 76 83 75 class1 department2 6
215 84 82 73 class1 department2 7
112 73 80 96 class1 department1 8
224 75 80 78 class2 department2 8
123 70 78 61 class2 department1 9
222 80 78 96 class2 department2 9
113 90 74 75 class1 department1 10
121 96 74 79 class2 department1 10
216 85 74 93 class1 department2 10
223 79 74 96 class2 department2 10
124 76 70 76 class2 department1 11
111 68 69 90 class1 department1 12

其他

简要介绍一下其它不太常用的

  1. max(取窗口内最大值)
    select studentId,language,math,english,classId,departmentId,max(math) over(partition by classId) as count1
    from student_scores;

  2. min(取窗口内最小值)
    select studentId,language,math,english,classId,departmentId,min(math) over(partition by classId) as count1
    from student_scores;

  3. sum(取窗口内的和)
    select studentId,language,math,english,classId,departmentId,sum(math) over(partition by classId) as count1
    from student_scores;

  4. avg(取窗口内的平均值)
    select studentId,language,math,english,classId,departmentId,avg(math) over(partition by classId) as count1
    from student_scores;

  5. first_value(取窗口内第一个值)
    select studentId,language,math,english,classId,departmentId,first_value(math) over(partition by classId) as count1
    from student_scores;

  6. last_value(取窗口内最后一个值)
    select studentId,language,math,english,classId,departmentId,last_value(math) over(partition by classId) as count1
    from student_scores;

  7. ntile(将分区中已排序的值根据大小分组,返回组的排名)
    select studentId,language,math,english,classId,departmentId,ntile(2) over(partition by classId order by math desc) as count1
    from student_scores;

  8. percent_rank(返回当前行的百分比排行)
    select studentId,language,math,english,classId,departmentId,percent_rank() over(partition by classId order by math desc) as count1
    from student_scores;

你可能感兴趣的:(大数据)