1) Original query SQL (multiple etypes over a long dt range; the data volume is too large and the query cannot finish)
select
dt, etype, count(distinct gazj)
from table_name
where dt between '20201101' and '20210228'
and etype in ('10410007','10410008')
and get_json_object(eparam,'$._pkg')='net.bat.store'
group by dt, etype;
2) Improved SQL (decompose the task: query one etype at a time / shrink the time window)
select
dt, '10410007' etype, count(distinct gazj)
from table_name
where dt between '20201101' and '20210228'
and etype='10410007'
and get_json_object(eparam,'$._pkg')='net.bat.store'
group by dt;
select
dt, '10410008' etype, count(distinct gazj)
from table_name
where dt between '20201101' and '20210228'
and etype='10410008'
and get_json_object(eparam,'$._pkg')='net.bat.store'
group by dt;
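If each decomposed query writes its output to a small staging table, the final result is just a UNION ALL over those tables once all sub-queries have succeeded. A minimal sketch, assuming hypothetical staging tables result_10410007 and result_10410008 with columns (dt, etype, cnt):
-- Sketch: merge per-etype staging results (hypothetical table names)
select dt, etype, cnt from result_10410007
union all
select dt, etype, cnt from result_10410008;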
1) Expand the array with an explicit schema (works when the format is uniform; more efficient)
with tmp as(
select
explode( from_json(get_json_object(eparam, '$._d'), 'array< struct<`_origin`: string, `_re`: int, `_se`: string, `_ue`: int > >')) element
from table_name
where dt='20210403' and etype='10257001' and ts%5=0
)
select
count(1)
from tmp;
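The exploded element is a struct, so its fields can be read with dot notation. A minimal sketch under the same assumptions (the hypothetical table_name and the schema above), counting rows per `_origin`:
-- Sketch: access struct fields of the exploded element via dot notation
with tmp as(
select
    explode( from_json(get_json_object(eparam, '$._d'), 'array< struct<`_origin`: string, `_re`: int, `_se`: string, `_ue`: int > >')) element
from table_name
where dt='20210403' and etype='10257001' and ts%5=0
)
select
    element.`_origin` origin, count(1)
from tmp
group by element.`_origin`;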
2) Expand the array by string splitting (use when the JSON string has no nesting; somewhat less efficient)
with tmp as (
select
explode(split(regexp_replace(regexp_replace(regexp_replace( get_json_object(eparam, '$._d'), '\\[', ''), '\\]', ''), '\\}\\,\\{', '\\}\\|\\|\\{'),'\\|\\|')) element
from table_name
where dt='20210403' and etype='10257001' and ts%5=0
)
select
count(1)
from tmp;
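Each element produced by the split is still a complete '{...}' JSON object string, so individual fields can then be pulled out with get_json_object. A minimal sketch under the same assumptions:
-- Sketch: parse fields out of each split element (still a valid JSON string)
with tmp as (
select
    explode(split(regexp_replace(regexp_replace(regexp_replace( get_json_object(eparam, '$._d'), '\\[', ''), '\\]', ''), '\\}\\,\\{', '\\}\\|\\|\\{'),'\\|\\|')) element
from table_name
where dt='20210403' and etype='10257001' and ts%5=0
)
select
    get_json_object(element, '$._origin') origin, count(1)
from tmp
group by get_json_object(element, '$._origin');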
1) rlike approach: 6.1 min
select
dt, '10410007' etype, count(distinct gazj)
from table_name
where dt like '202011%'
and etype='10410007'
and eparam rlike '"_pkg":"net.bat.store"'
group by dt;
2) like approach: 6 min
select
dt, '10410007' etype, count(distinct gazj)
from table_name
where dt like '202011%'
and etype='10410007'
and eparam like '%"_pkg":"net.bat.store"%'
group by dt;
3) instr approach: 8 min
select
dt, '10410007' etype, count(distinct gazj)
from table_name
where dt like '202011%'
and etype='10410007'
and instr(eparam, '"_pkg":"net.bat.store"')!=0
group by dt;
4) JSON-parsing approach: 15 min
select
dt, '10410007' etype, count(distinct gazj)
from table_name
where dt like '202011%'
and etype='10410007'
and get_json_object(eparam,'$._pkg')='net.bat.store'
group by dt;
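The string-match variants are fast but approximate: the pattern could also match inside a nested object or an unrelated value. One option, sketched below on the same hypothetical table, is to keep the cheap like prefilter and add the exact get_json_object check on top; whether this beats plain get_json_object depends on how selective the prefilter is and on predicate evaluation order, so treat it as something to benchmark rather than a guaranteed win.
-- Sketch: cheap string prefilter plus exact JSON check
select
dt, '10410007' etype, count(distinct gazj)
from table_name
where dt like '202011%'
and etype='10410007'
and eparam like '%"_pkg":"net.bat.store"%'             -- coarse, fast
and get_json_object(eparam,'$._pkg')='net.bat.store'   -- exact, slower
group by dt;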
1) Multiple get_json_object calls, one per field (4.4 min)
insert overwrite directory 's3://shy/test/shy/01/'
select /*+ coalesce(120) */
get_json_object(value, '$.gaid') gazj,
get_json_object(value, '$.vid') vid,
get_json_object(value, '$.tid') tid,
get_json_object(value, '$.event') event,
get_json_object(value, '$.ts') ts,
get_json_object(value, '$.tz') tz,
get_json_object(value, '$.eparam') eparam
from text.`s3://shy/online/athena_10019999/20200112/*/*`;
2) A single json_tuple call (1.9 min)
insert overwrite directory 's3://shy/test/shy/00/'
select /*+ coalesce(120) */
json_tuple(value, 'gaid', 'vid', 'tid', 'event', 'ts', 'tz', 'eparam') as (gazj, vid, tid, event, ts, tz, eparam)
from text.`s3://shy/online/athena_10019999/20200112/*/*`;
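json_tuple is a generator, so when its output has to be combined with other select-list items it is usually invoked through LATERAL VIEW. A minimal sketch on the same path, parsing only a subset of the fields:
-- Sketch: json_tuple via LATERAL VIEW
select /*+ coalesce(120) */
j.gazj, j.event, j.ts
from text.`s3://shy/online/athena_10019999/20200112/*/*`
lateral view json_tuple(value, 'gaid', 'event', 'ts') j as gazj, event, ts;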
1 Original SQL: 82 min
with t0 as (
select pkg
from trandw.dws_log_app_active_trace_dd
where dt='20210315'
and lower(pkg) in (
select lower(pkg)
from trandw.dim_pub_app
where type=2
)
group by pkg
having sum(case when substr(active_trace,-1)='1' then 1 else 0 end)>=10000
),
t1 as (
select pkg
from trandw.dws_log_app_active_trace_dd
where dt='20210315'
and pkg in (
select pkg
from tranadm.mid_dim_pub_ret_app_01
where dt='20210314'
)
group by pkg
having sum(case when substr(concat(lpad('',1,'0'),active_trace),-1,1)='1' then 1 else 0 end)>=10000
or sum(case when substr(concat(lpad('',2,'0'),active_trace),-2,1)='1' then 1 else 0 end)>=10000
)
insert overwrite table tranadm.mid_dim_pub_ret_app_01 partition(dt='20210315')
select pkg
from
(select pkg
from t0
union all
select pkg
from t1
)t
group by pkg
2 Optimized SQL: 18 min
with t1 as (
select /*+ broadcast(bb) */
aa.pkg
from trandw.dws_log_app_active_trace_dd aa
join trandw.dim_pub_app bb
on lower(aa.pkg)=lower(bb.pkg)
where aa.dt='20210315'
and aa.active_trace like '%1'
and bb.type=2
group by aa.pkg
having count(1)>=10000
),
t2 as (
select /*+ broadcast(bb) */
aa.pkg
from trandw.dws_log_app_active_trace_dd aa
join tranadm.mid_dim_pub_ret_app_01 bb
on aa.pkg=bb.pkg
where aa.dt='20210315' and aa.active_trace not like '%0000000' and bb.dt='20210314'
group by aa.pkg
having
sum( case when active_trace like '%1' then 1 else 0 end ) >=10000
or sum( case when active_trace like '%1_' then 1 else 0 end ) >=10000
)
insert overwrite table tranadm.mid_dim_pub_ret_app_01 partition(dt='20210315')
select /*+ coalesce(1) */
pkg
from (
select pkg from t1
union
select pkg from t2
) aa;
3 What was optimized
1) Added filter conditions (aa.active_trace like '%1', aa.active_trace not like '%0000000') to shrink the data set before the shuffle;
2) Replaced the expression with a cheaper one: substr(concat(lpad('',1,'0'),active_trace),-1,1)='1' --> active_trace like '%1' (the probe below checks the equivalence);
3) Whenever the trace table allows data to be filtered early, filter it early;
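A minimal probe of point 2), using inline VALUES instead of any real table, to confirm the like patterns agree with the original substr/lpad expressions on the last and second-to-last trace positions:
-- Sketch: compare old and new expressions on a few hand-made traces
select
active_trace,
substr(concat(lpad('',1,'0'),active_trace),-1,1)='1' as old_last,
active_trace like '%1'                               as new_last,
substr(concat(lpad('',2,'0'),active_trace),-2,1)='1' as old_second,
active_trace like '%1_'                              as new_second
from values ('0001'), ('0110'), ('1'), ('') as t(active_trace);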
1 Which source an id appeared in first
-- Approach: globally sort each id's records by ts and take the earliest source
-- Applicable when: the data volume is small, since a full sort is expensive
with tmp as (
select
id,
source,
ts,
row_number() over (partition by id order by ts) as rnk
from
source_table
where
dt='${dt}'
and dh='${dh}'
)
insert overwrite table target_table partition(dt='${dt}',dh='${dh}')
select /*+ coalesce(100) */
id,
source,
ts
from
tmp
where
rnk = 1
;
-- Approach (shrink the data before sorting): first take the earliest record within each source, then the earliest across sources
-- Applicable when: taking the earliest record within each source significantly reduces the data volume
with tmp as (
select
id,
source,
min(ts) as ts
from
source_table
where
dt='${dt}'
and dh='${dh}'
group by
id,
source
)
insert overwrite table target_table partition(dt='${dt}',dh='${dh}')
select /*+ coalesce(100) */
id,
source,
ts
from
(
select
id,
source,
ts,
row_number() over (partition by id order by ts) as rnk
from
tmp
) t
where
rnk = 1
;
-- Approach: treat ts+source as a single sort key and get the result in one aggregation
-- Applicable when: all ts values have the same number of digits, so string order matches numeric order (a padded variant follows the query)
with tmp as (
select
id,
min(concat(ts,'#',source)) as ts_source
from
source_table
where
dt='${dt}'
and dh='${dh}'
group by
id
)
insert overwrite table target_table partition(dt='${dt}',dh='${dh}')
select /*+ coalesce(100) */
id,
split(ts_source,'#')[1] as source,
split(ts_source,'#')[0] as ts
from
tmp
;
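If ts values can differ in length (say a mix of 10-digit second and 13-digit millisecond timestamps), plain string comparison breaks. A minimal variant that left-pads ts to a fixed width first, assuming every ts fits in 13 digits:
-- Sketch: pad ts so string order matches numeric order (assumes ts <= 13 digits)
with tmp as (
select
    id,
    min(concat(lpad(ts,13,'0'),'#',source)) as ts_source
from
    source_table
where
    dt='${dt}'
    and dh='${dh}'
group by
    id
)
insert overwrite table target_table partition(dt='${dt}',dh='${dh}')
select /*+ coalesce(100) */
    id,
    split(ts_source,'#')[1] as source,
    cast(split(ts_source,'#')[0] as bigint) as ts
from
    tmp
;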