Hive sql面试
select store,count(distinct uid) as uv from Visit group by store;
select * from Users order by age desc, total;
考点:列转行
select stage_someone, count(distinct UID) from LifeStage lateral view explode(split(stage,',')) LifeStage_tmp as stage_someone group by stage_someone;
select UID,concat_ws(',',collect_set(stage)) as stages from LifeStage group by UID;
考点:行转列
explode与lateral view在关系型数据库中本身是不该出现的,因为他的出现本身就是在操作不满足第一范式的数据(每个属性都不可再分),
本身已经违背了数据库的设计原理(不论是业务系统还是数据仓库系统),不过大数据技术普及后,很多类似pv,uv的数据,在业务系统中是存贮在非关系型数据库中,
用json存储的概率比较大,直接导入hive为基础的数仓系统中,就需要经过ETL过程解析这类数据,explode与lateral view在这种场景下大显身手。
explode作用是处理map结构的字段,使用案例如下(hive自带map,struct,array字段类型,但是需要先定义好泛型,所以在此案例不使用):
建表语句:
drop table explode_lateral_view;
create table explode_lateral_view
(`area` string,
`goods_id` string,
`sale_info` string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
STORED AS textfile;
导入数据:
a:shandong,b:beijing,c:hebei|1,2,3,4,5,6,7,8,9|[{"source":"7fresh","monthSales":4900,"userCount":1900,"score":"9.9"},{"source":"jd","monthSales":2090,"userCount":78981,"score":"9.8"},{"source":"jdmart","monthSales":6987,"userCount":1600,"score":"9.0"}]
表内数据如下
explode的使用:
我们只拆解array字段,语句为select explode(split(goods_id,',')) as goods_id from explode_lateral_view;
结果如下
拆解map字段,语句为select explode(split(area,',')) as area from explode_lateral_view;
拆解json字段
这个时候要配合一下get_json_object
我们想获取所有的monthSales,第一步我们先把这个字段拆成list,并且拆成行展示:
select explode(split(regexp_replace(regexp_replace(sale_info,'\\[\\{',''),'}]',''),'},\\{')) as sale_info from explode_lateral_view;
然后我们想用get_json_object来获取key为monthSales的数据:
select get_json_object(explode(split(regexp_replace(regexp_replace(sale_info,'\\[\\{',''),'}]',''),'},\\{')),'$.monthSales') as sale_info from explode_lateral_view;
然后挂了FAILED: SemanticException [Error 10081]: UDTF's are not supported outside the SELECT clause, nor nested in expressions
UDTF explode不能写在别的函数内
如果你这么写,想查两个字段,select explode(split(area,',')) as area,good_id from explode_lateral_view;
会报错FAILED: SemanticException 1:40 Only a single expression in the SELECT clause is supported with UDTF's. Error encountered near token 'good_id'
使用UDTF的时候,只支持一个字段,这时候就需要LATERAL VIEW出场了
LATERAL VIEW的使用:
侧视图的意义是配合explode(或者其他的UDTF),一个语句生成把单行数据拆解成多行后的数据结果集。
select goods_id2,sale_info from explode_lateral_view LATERAL VIEW explode(split(goods_id,','))goods as goods_id2;
其中LATERAL VIEW explode(split(goods_id,','))goods相当于一个虚拟表,与原表explode_lateral_view笛卡尔积关联。
也可以多重使用
select goods_id2,sale_info,area2
from explode_lateral_view
LATERAL VIEW explode(split(goods_id,','))goods as goods_id2
LATERAL VIEW explode(split(area,','))area as area2;
也是三个表笛卡尔积的结果
现在我们解决一下上面的问题,从sale_info字段中找出所有的monthSales并且行展示
select get_json_object(concat('{',sale_info_r,'}'),'$.monthSales') as monthSales from explode_lateral_view
LATERAL VIEW explode(split(regexp_replace(regexp_replace(sale_info,'\\[\\{',''),'}]',''),'},\\{'))sale_info as sale_info_r;
最终,我们可以通过下面的句子,把这个json格式的一行数据,完全转换成二维表的方式展现
select get_json_object(concat('{',sale_info_1,'}'),'$.source') as source,
get_json_object(concat('{',sale_info_1,'}'),'$.monthSales') as monthSales,
get_json_object(concat('{',sale_info_1,'}'),'$.userCount') as monthSales,
get_json_object(concat('{',sale_info_1,'}'),'$.score') as monthSales
from explode_lateral_view
LATERAL VIEW explode(split(regexp_replace(regexp_replace(sale_info,'\\[\\{',''),'}]',''),'},\\{'))sale_info as sale_info_1;
6.这个面试题偏难,但毕竟有意思
USE tempdb
GO
IF OBJECT_ID('t') IS NOT NULL DROP TABLE t
GO
CREATE TABLE t(id INT PRIMARY KEY,[name] NVARCHAR(10),parentId INT)
INSERT INTO t
SELECT 1,'北京市',0
UNION ALL SELECT 2,'山东省',0
UNION ALL SELECT 3,'昌平区',1
UNION ALL SELECT 4,'海淀区',1
UNION ALL SELECT 5,'沙闸镇',3
UNION ALL SELECT 6,'马池口镇',3
UNION ALL SELECT 7,'中关村',4
UNION ALL SELECT 8,'上地',4
UNION ALL SELECT 9,'烟台市',2
UNION ALL SELECT 10,'青岛市',2
UNION ALL SELECT 11,'五通桥区',9
UNION ALL SELECT 12,'马边区',9
UNION ALL SELECT 13,'定文镇',10
UNION ALL SELECT 14,'罗成镇',10
;WITH cte AS (
SELECT id,NAME,parentId,1 AS level FROM t WHERE parentId=0
UNION ALL
SELECT t.id,t.NAME,t.parentId,cte.level+1 AS level FROM t
INNER JOIN cte ON t.parentId=cte.id
)
SELECT
t1.name AS [一级地名]
,t2.name AS [二级地名]
,t3.name AS [三级地名]
FROM
(SELECT * FROM cte WHERE LEVEL=1) AS t1
INNER JOIN
(SELECT * FROM cte WHERE LEVEL=2) AS t2 ON t1.id=t2.parentId
INNER JOIN
(SELECT * FROM cte WHERE LEVEL=3) AS t3 ON t2.id=t3.parentId
ORDER BY 1,2,3
或者用:
select faddr.city_name as '一级地区' ,saddr.city_name as '二级地区',taddr.city_name as '三级地区' from CITY a
join (SELECT City_Id,City_Name,City_ParentId from CITY) faddr on a.City_Id=faddr.City_Id
join (SELECT City_Id,City_Name,City_ParentId from CITY) saddr on faddr.City_Id=saddr.City_ParentId
join (SELECT City_Id,City_Name,City_ParentId from CITY) taddr on saddr.City_Id=taddr.City_ParentId