很多的UDTF不能将结果与源表进行关联,使用lateral view ,可以将UDTF拆分的单个字段数据与原始表数据关联上
select …… from tabelA lateral view UDTF(xxx) 视图名 as a,b,c
-- 假设我们有一张表pageAds,它有两列数据
-- 第一列是pageid(string类型),第二列是adid_list(Array类型),即用逗号分隔的广告ID集合:
pageid adid_list
"front_page" [1, 2, 3]
"contact_page" [3, 4, 5]
-- 要统计所有广告ID在所有页面中出现的次数。
-- 首先分拆广告ID:
SELECT
pageid, adid
FROM pageAds .
LATERAL VIEW explode(adid_list) adTable AS adid;
-- 执行结果如下:pageid(string类型),adid(int类型)
pageid adid
"front_page" 1
"front_page" 2
"front_page" 3
"contact_page" 3
"contact_page" 4
"contact_page" 5
-- 接下来就是一个聚合的统计:
SELECT
adid, count(1)
FROM pageAds
LATERAL VIEW explode(adid_list) adTable AS adid
GROUP BY adid;
-- 执行结果如下:
adid count(1)
1 1
2 1
3 2
4 1
5 1
--准备数据:vim /export/datas/lateral.txt
1 http://facebook.com/path/p1.php?query=1
2 http://www.baidu.com/news/index.jsp?uuid=frank
3 http://www.jd.com/index?source=baidu
--创建表
create table tb_url(
id int,
url string
) row format delimited fields terminated by '\t';
--加载数据
load data local inpath '/export/datas/lateral.txt' into table tb_url;
--使用UDTF解析
SELECT parse_url_tuple(url, 'HOST') from tb_url;
--使用UDTF+lateral view
select
a.*,
b.host,
b.path
from
tb_url a
lateral view parse_url_tuple(url, 'HOST',"PATH") b as host,path;
--对比
SELECT id,parse_url_tuple(url, 'HOST') from tb_url;--失败,UDTF函数不能与字段连用
select id, a.* from tb_url lateral view parse_url_tuple(url, 'HOST','PATH') a as host,path;
一个FROM语句后可以跟多个lateral view语句,后面的lateral view语句能够引用它前面的所有表和列名。
-- 以下面的表为例:
--col1(Array类型) col2(Array类型)
col1 col2
[1, 2] [a", "b", "c"]
[3, 4] [d", "e", "f"]
SELECT
myCol1, col2
FROM baseTable
LATERAL VIEW explode(col1) myTable1 AS myCol1;
--执行结果为:mycol1(int类型) col2(Array类型)
mycol1 col2
1 [a", "b", "c"]
2 [a", "b", "c"]
3 [d", "e", "f"]
4 [d", "e", "f"]
--加上一个lateral view:
SELECT
myCol1, myCol2
FROM baseTable
LATERAL VIEW explode(col1) myTable1 AS myCol1
LATERAL VIEW explode(col2) myTable2 AS myCol2;
--它的执行结果为:mycol1(int类型) col2(string类型)
myCol1 myCol2
1 "a"
1 "b"
1 "c"
2 "a"
2 "b"
2 "c"
3 "d"
3 "e"
3 "f"
4 "d"
4 "e"
4 "f"
注意上面语句中,两个lateral view按照出现的次序被执行。
将array列表里的每个元素生成一行
每一对元素作为一行,key为一列,value为一列
1-实现wordcount【explode(array)】
select explode(split(word," ")) from words; --直接使用,word为array
2-将兴趣爱好展开【explode(map)】
select explode(hobby) from complex_map; --直接使用,hobby为map
3-与侧视图连用
select a.name,b.* from complex_map a lateral view explode(hobby) b as hobby,deep;
-- 与lateral view连用
-》创建数据:vim /export/datas/r2c1.txt
a c 1
a d 2
a e 3
b c 4
b d 5
b e 6
-》希望结果
+-------+----+----+----+--+
| col1 | c | d | e |
+-------+----+----+----+--+
| a | 1 | 2 | 3 |
| b | 4 | 5 | 6 |
+-------+----+----+----+--+
-》创建表
create table row2col1(
col1 string,
col2 string,
col3 int
)row format delimited fields terminated by '\t';
-》加载数据
load data local inpath '/export/datas/r2c1.txt' into table row2col1;
a c 1
a d 2
a e 3
case col2 when 'c' then col3 else 0 end =》 1 0 0 =》 1
case col2 when 'd' then col3 else 0 end =》 0 2 0 =》 2
case col2 when 'e' then col3 else 0 end =》 0 0 3 =》 3
select
col1 as col1,
max(case col2 when 'c' then col3 else 0 end) as c,
max(case col2 when 'd' then col3 else 0 end) as d,
max(case col2 when 'e' then col3 else 0 end) as e
from
row2col1
group by
col1;
案例:多行转单列
-》创建数据:vim /export/datas/r2c2.txt
a b 1
a b 2
a b 3
c d 4
c d 5
c d 6
-》希望结果:concat_ws(分隔符,col3)
+-------+-------+--------+--+
| col1 | col2 | col3 |
+-------+-------+--------+--+
| a | b | 1,2,3 |
| c | d | 4,5,6 |
+-------+-------+--------+--+
-》创建表
create table row2col2(
col1 string,
col2 string,
col3 int
)row format delimited fields terminated by '\t';
-》加载数据
load data local inpath '/export/datas/r2c2.txt' into table row2col2;
select
col1,
col2,
concat_ws(',', collect_set(cast(col3 as string))) as col3
from
row2col2
group by
col1, col2;
-》创建数据:vim /export/datas/c2r1.txt
a 1 2 3
b 4 5 6
-》希望结果
+-----------+-----------+-----------+--+
| _u1.col1 | _u1.col2 | _u1.col3 |
+-----------+-----------+-----------+--+
| a | c | 1 |
| b | c | 4 |
| a | d | 2 |
| b | d | 5 |
| a | e | 3 |
| b | e | 6 |
+-----------+-----------+-----------+--+
-》创建表
create table col2row1(
col1 string,
col2 int,
col3 int,
col4 int
)row format delimited fields terminated by '\t';
-》加载数据
load data local inpath '/export/datas/c2r1.txt' into table col2row1;
或者
drop table if exists row2col1;
create table row2col1 as
select 'a' as col1,1 as col2,2 as col3,3 as col4
union all
select 'b' as col1,4 as col2,5 as col3,6 as col4
;
select col1, 'c' as col2, col2 as col3 from col2row1
UNION ALL
select col1, 'd' as col2, col3 as col3 from col2row1
UNION ALL
select col1, 'e' as col2, col4 as col3 from col2row1;
select split(value_list,':')[0] as col1
,split(value_list,':')[1] as col2
,split(value_list,':')[2] as col3
from (select concat_ws(',',concat(col1,':c:',col2),concat(col1,':d:',col3),concat(col1,':e:',col4)) as value
from col2row1
)t
lateral view explode(split(value,','))v as value_list
;
select col1
-- ,tb1.a as tb1a
-- ,tb2.a as tb2a
-- ,tb3.a as tb3a
,col2
,col3
from(select array(col1,col1,col1) as col1_list
,array('c','d','e') as col2_list
,array(col2,col3,col4) as col3_list
from col2row1
)t
lateral view posexplode(col1_list) tb1 as a,col1
lateral view posexplode(col2_list) tb2 as a,col2
lateral view posexplode(col3_list) tb3 as a,col3
where tb1.a=tb2.a and tb2.a=tb3.a
;
posexplode 我经常会用来 做膨胀数据使用~供参考
select a.a
,a.b
,date_add('2023-08-01',pos) as dt
,pos+1 as n_day
from (select 1 as a ,2 as b
union all
select 3 as a ,4 as b
)a
lateral view posexplode(split(repeat('a,a',datediff('2023-08-31','2023-08-01')),',')) tab as pos,day
-- lateral view posexplode(split(repeat('a,a',30),',')) tab as pos,day
-- 这里如果直接写30会被炸开成31(n+1)
-- pos是从0开始,到30(repeat次数)
-- 为什么不是lateral view posexplode(split(repeat(',',datediff('2023-08-31','2023-08-01')),',')) tab as pos,day 呢???
-- 因为hive引擎下 和spark引擎下是不一样的,可以自行测试一下,hive 切出来不是你想要的。。。
-- select split(repeat(',',datediff('2023-08-31','2023-08-01')),',');
-- []
-- select split(repeat('a,a',datediff('2023-08-31','2023-08-01')),',');
-- ["a","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","aa","a"]
order by dt desc
;
多表查询中 Join:多表之间列合并 union:多表之间行的合并
案例:单列转多行
-》创建数据:vim /export/datas/c2r2.txt
a b 1,2,3
c d 4,5,6
-》希望结果
+-------+-------+-------+--+
| col1 | col2 | col3 |
+-------+-------+-------+--+
| a | b | 1 |
| a | b | 2 |
| a | b | 3 |
| c | d | 4 |
| c | d | 5 |
| c | d | 6 |
+-------+-------+-------+--+
-》创建表
create table col2row2(
col1 string,
col2 string,
col3 string
)row format delimited fields terminated by '\t';
-》加载数据
load data local inpath '/export/datas/c2r2.txt' into table col2row2;
select
col1,
col2,
lv.col3 as col3
from
col2row2
lateral view explode(split(col3, ',')) lv as col3;
SELECT
statisticLadder_json
,get_json_object(statisticLadder_json, '$.activityId') as activityId
,get_json_object(statisticLadder_json, '$.strategyId') as strategyId
FROM ( SELECT '[{"activityId":1,"strategyId":2,"status":0,"strategyMatch":{"dmp":true,"property":false,"scene":true},"propertyMatch":{"clientType":false}},{"activityId":2,"strategyId":3,"status":0,"strategyMatch":{"dmp":true,"property":false,"scene":true},"propertyMatch":{"clientType":false}}]' as str
) a
LATERAL view explode(split(regexp_replace(str,'^\\[|]$',''), ',(?!")')) b as statisticLadder_json
-- 结果
statisticLadder_json activityId strategyId
{"activityId":1,"strategyId":2,"status":0,"strategyMatch":{"dmp":true,"property":false,"scene":true},"propertyMatch":{"clientType":false}} 1 2
{"activityId":2,"strategyId":3,"status":0,"strategyMatch":{"dmp":true,"property":false,"scene":true},"propertyMatch":{"clientType":false}} 2 3