-- 当 with cube / grouping sets 的维度字段本身含有NULL值(或含有与汇总行填充值相同的值)时,结果中会出现看似重复的行,无法与汇总行区分。
-- 维度多于4个时,需要配置:set hive.new.job.grouping.set.cardinality=128;
-- 创建一个测试表
-- Recreate the demo table used by the CUBE examples in this file.
-- Dropping it first keeps the script re-runnable and guarantees the table
-- starts empty before the seed rows are inserted.
DROP TABLE IF EXISTS scot_dwd.search_event_test;

-- Every column is a plain STRING. The example queries group by search_mode,
-- ab_test and tid, and count distinct search_id / distinct_id values.
--
-- No ROW FORMAT DELIMITED clause is given on purpose: field, collection and
-- line delimiters describe text storage only. Parquet is a self-describing
-- binary format, so those delimiters were never applied, and some engines
-- (e.g. Spark SQL) reject the combination with STORED AS PARQUET.
CREATE TABLE IF NOT EXISTS scot_dwd.search_event_test (
    search_mode STRING COMMENT '搜索方式',  -- column comment text means "search mode"
    ab_test     STRING COMMENT '',
    search_id   STRING COMMENT '',
    distinct_id STRING COMMENT '',
    tid         STRING COMMENT ''
)
STORED AS PARQUET
TBLPROPERTIES (
    'parquet.compression' = 'SNAPPY',
    'parquet.column.index.access' = 'true'
)
;
-- 向表中插入数据
-- Seed the demo table with the four rows the example result sets are based on.
-- A single multi-row VALUES statement replaces four separate INSERT ... SELECT
-- statements, so the load is one write instead of four. The explicit column
-- list ties each literal to its column by name rather than by position.
INSERT INTO scot_dwd.search_event_test
    (search_mode, ab_test, search_id, distinct_id, tid)
VALUES
    ('hot', 'A', '1', '11', 'x'),
    ('hot', 'B', '2', '22', 'y'),
    ('yy',  'B', '1', '11', 'x'),
    ('hot', 'B', '1', '44', 'x');
-- with cube函数:除了返回group by子句指定的列外,还返回按组统计的行,返回的结果先按照分组的第一个条件排序显示,再按第二个列排序显示, 以此类推。统计行包括了group by子句指定列的各种组合的数据统计
-- Two-dimension CUBE example.
-- Counts distinct search_id (search_pv_count) and distinct distinct_id
-- (search_uv_count) for every combination of search_mode and ab_test:
-- both columns, each column alone, and the grand total.
-- grouping__id tags each output row with the grouping set it belongs to;
-- rows are returned in ascending grouping__id order.
SELECT
    search_mode,
    ab_test,
    COUNT(DISTINCT search_id)   AS search_pv_count,
    COUNT(DISTINCT distinct_id) AS search_uv_count,
    grouping__id
FROM scot_dwd.search_event_test
GROUP BY
    search_mode,
    ab_test
WITH CUBE
ORDER BY grouping__id
;
+--------------+----------+------------------+------------------+---------------+
| search_mode | ab_test | search_pv_count | search_uv_count | grouping__id |
+--------------+----------+------------------+------------------+---------------+
| yy | B | 1 | 1 | 0 |
| hot | A | 1 | 1 | 0 |
| hot | B | 2 | 2 | 0 |
| hot | NULL | 2 | 3 | 1 |此行表示仅仅以search_mode为hot值分组进行count(distinct search_id)和count(distinct distinct_id)计算
| yy | NULL | 1 | 1 | 1 |
| NULL | A | 1 | 1 | 2 |此行表示search_mode为所有,以ab_test为A值进行统计
| NULL | B | 2 | 3 | 2 |
| NULL | NULL | 2 | 3 | 3 |此行表示不分组进行count(distinct search_id)和count(distinct distinct_id)计算
+--------------+----------+------------------+------------------+---------------+
-- 结果中某个维度列显示为NULL时,说明该行没有按此列分组(也可理解为此列所有的值分为一个组)
-- Three-dimension CUBE example.
-- Same distinct counts as the two-dimension query, now computed for every
-- combination of search_mode, ab_test and tid (2^3 = 8 grouping sets).
-- The SELECT list follows the GROUP BY column order, and rows are returned
-- in ascending grouping__id order.
SELECT
    search_mode,
    ab_test,
    tid,
    COUNT(DISTINCT search_id)   AS search_pv_count,
    COUNT(DISTINCT distinct_id) AS search_uv_count,
    grouping__id
FROM scot_dwd.search_event_test
GROUP BY
    search_mode,
    ab_test,
    tid
WITH CUBE
ORDER BY grouping__id
;
+--------------+----------+-------+------------------+------------------+---------------+
| search_mode | ab_test | tid | search_pv_count | search_uv_count | grouping__id |
+--------------+----------+-------+------------------+------------------+---------------+
| hot | A | x | 1 | 1 | 0 |二进制111 取反-> 000
| hot | B | x | 1 | 1 | 0 |
| hot | B | y | 1 | 1 | 0 |
| yy | B | x | 1 | 1 | 0 |
| hot | A | NULL | 1 | 1 | 1 |以search_mode、ab_test进行分组统计 二进制110 位运算取反-> 001
| hot | B | NULL | 2 | 2 | 1 |
| yy | B | NULL | 1 | 1 | 1 |
| hot | NULL | y | 1 | 1 | 2 |以search_mode、tid进行分组统计 二进制101 位运算取反-> 010
| yy | NULL | x | 1 | 1 | 2 |
| hot | NULL | x | 1 | 2 | 2 |
| hot | NULL | NULL | 2 | 3 | 3 |以search_mode进行分组统计 二进制100 位运算取反-> 011
| yy | NULL | NULL | 1 | 1 | 3 |
| NULL | A | x | 1 | 1 | 4 |以ab_test、tid进行分组统计 二进制011 位运算取反-> 100
| NULL | B | x | 1 | 2 | 4 |
| NULL | B | y | 1 | 1 | 4 |
| NULL | A | NULL | 1 | 1 | 5 |以ab_test进行分组统计 二进制010 位运算取反-> 101
| NULL | B | NULL | 2 | 3 | 5 |
| NULL | NULL | x | 1 | 2 | 6 |以tid进行分组统计 二进制001 位运算取反-> 110
| NULL | NULL | y | 1 | 1 | 6 |
| NULL | NULL | NULL | 2 | 3 | 7 |不分组统计 二进制000 位运算取反-> 111
+--------------+----------+-------+------------------+------------------+---------------+
-- grouping__id的实现: 以group by的所有字段的排列顺序为基准(第一个字段对应最高位),对于每个字段,若该字段参与了当前行的分组(即该列不是因汇总而填充的NULL),则该字段的位置赋值为1,否则为0,组成二进制数据;之后,对二进制数据进行位运算取反,取反后得到的十进制数即为grouping__id的标号
-- Three-dimension CUBE example with a reordered SELECT list.
-- The SELECT list puts tid before ab_test, while GROUP BY keeps the order
-- search_mode, ab_test, tid. Both orders are deliberate and must not be
-- "aligned": the surrounding notes use this query to illustrate that
-- grouping__id is derived from the GROUP BY column order.
SELECT
    search_mode,
    tid,
    ab_test,
    COUNT(DISTINCT search_id)   AS search_pv_count,
    COUNT(DISTINCT distinct_id) AS search_uv_count,
    grouping__id
FROM scot_dwd.search_event_test
GROUP BY
    search_mode,
    ab_test,
    tid
WITH CUBE
ORDER BY grouping__id
;
+--------------+-------+----------+------------------+------------------+---------------+
| search_mode | tid | ab_test | search_pv_count | search_uv_count | grouping__id |
+--------------+-------+----------+------------------+------------------+---------------+
| hot | x | A | 1 | 1 | 0 |111 ->000
| hot | x | B | 1 | 1 | 0 |
| hot | y | B | 1 | 1 | 0 |
| yy | x | B | 1 | 1 | 0 |
| hot | NULL | A | 1 | 1 | 1 |110 -> 001
| hot | NULL | B | 2 | 2 | 1 |
| yy | NULL | B | 1 | 1 | 1 |
| hot | y | NULL | 1 | 1 | 2 |101 -> 010
| yy | x | NULL | 1 | 1 | 2 |
| hot | x | NULL | 1 | 2 | 2 |
| hot | NULL | NULL | 2 | 3 | 3 |100 -> 011
| yy | NULL | NULL | 1 | 1 | 3 |
| NULL | x | A | 1 | 1 | 4 |011 -> 100
| NULL | x | B | 1 | 2 | 4 |
| NULL | y | B | 1 | 1 | 4 |
| NULL | NULL | A | 1 | 1 | 5 |010 ->101
| NULL | NULL | B | 2 | 3 | 5 |
| NULL | x | NULL | 1 | 2 | 6 |001 -> 110
| NULL | y | NULL | 1 | 1 | 6 |
| NULL | NULL | NULL | 2 | 3 | 7 |000 ->111
+--------------+-------+----------+------------------+------------------+---------------+
-- grouping__id的实现: 以group by的所有字段的排列顺序为基准(与select中列的顺序无关,上面的查询调换了select中tid和ab_test的顺序,grouping__id不变),对于每个字段,若该字段出现(即不为NULL),则该字段的位置赋值为1,否则为0,组成二进制数据;之后,对二进制数据进行位运算取反,取反后得到的十进制数即为grouping__id的标号