spark-sql optimization

1 Task decomposition

1) Original query SQL (multiple etype values over a multi-month dt range; the data volume is too large for the query to complete)
    select 
        dt, etype, count(distinct gazj)  
    from table_name  
    where dt between '20201101' and '20210228'
    and etype in ('10410007','10410008') 
    and get_json_object(eparam,'$._pkg')='net.bat.store'
    group by dt, etype;
    
    
 2) Improved SQL (decompose the task: query a single etype at a time / narrow the time range)
    select
        dt, '10410007' etype, count(distinct gazj)
    from table_name
    where dt between '20201101' and '20210228'
    and etype='10410007'
    and get_json_object(eparam,'$._pkg')='net.bat.store'
    group by dt;
    
    select
        dt, '10410008' etype, count(distinct gazj)
    from table_name
    where dt between '20201101' and '20210228'
    and etype='10410008'
    and get_json_object(eparam,'$._pkg')='net.bat.store'
    group by dt;
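
    An alternative worth noting: if an exact distinct count is not strictly required, Spark's approx_count_distinct can usually finish the original multi-etype query in a single pass. A minimal sketch, assuming the same table and columns and an acceptable relative error of roughly 1%:

    select
        dt, etype, approx_count_distinct(gazj, 0.01) as gazj_cnt  -- approximate distinct count, relative error ~0.01
    from table_name
    where dt between '20201101' and '20210228'
    and etype in ('10410007','10410008')
    and get_json_object(eparam,'$._pkg')='net.bat.store'
    group by dt, etype;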
 


2 JSON arrays


 1) Explode the array with an explicit schema (usable when the element format is uniform; more efficient)
    with tmp as(
        select  
            explode( from_json(get_json_object(eparam, '$._d'), 'array< struct<`_origin`: string, `_re`: int, `_se`: string, `_ue`: int  > >')) element
        from table_name
        where dt='20210403' and etype='10257001' and ts%5=0
    )
    select 
    count(1)
    from tmp;
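
    Because the schema-based explode yields a typed struct, its fields can be referenced directly on the exploded column. A minimal sketch, assuming the same table and the field names from the schema above:

    with tmp as(
        select  
            explode( from_json(get_json_object(eparam, '$._d'), 'array< struct<`_origin`: string, `_re`: int, `_se`: string, `_ue`: int  > >')) element
        from table_name
        where dt='20210403' and etype='10257001' and ts%5=0
    )
    select 
        element.`_origin` as origin,   -- direct struct field access, no further JSON parsing needed
        count(1)
    from tmp
    group by element.`_origin`;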
    
2) Explode the array via string splitting (use when the JSON elements contain no nesting; somewhat less efficient)
    with tmp as (
        select  
            explode(split(regexp_replace(regexp_replace(regexp_replace( get_json_object(eparam, '$._d'), '\\[', ''), '\\]', ''), '\\}\\,\\{', '\\}\\|\\|\\{'),'\\|\\|')) element
        from table_name
        where dt='20210403' and etype='10257001' and ts%5=0
    )
    select 
    count(1)
    from tmp;
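
    Each element produced by the split is still a complete {...} JSON string, so individual fields can still be read from it with get_json_object. A minimal sketch, assuming the same table and the `_origin` field from the schema in 1):

    with tmp as (
        select  
            explode(split(regexp_replace(regexp_replace(regexp_replace( get_json_object(eparam, '$._d'), '\\[', ''), '\\]', ''), '\\}\\,\\{', '\\}\\|\\|\\{'),'\\|\\|')) element
        from table_name
        where dt='20210403' and etype='10257001' and ts%5=0
    )
    select 
        get_json_object(element, '$._origin') as origin,  -- each element string is parsed separately
        count(1)
    from tmp
    group by get_json_object(element, '$._origin');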
 


3 Checking whether a JSON string contains a value


   1) rlike approach: 6.1 min
    select
        dt, '10410007' etype, count(distinct gazj)
    from table_name
    where dt like '202011%'
    and etype='10410007'
    and eparam rlike '"_pkg":"net.bat.store"'
    group by dt;

   2) like approach: 6 min
    select
        dt, '10410007' etype, count(distinct gazj)
    from table_name
    where dt like '202011%'
    and etype='10410007'
    and eparam like '%"_pkg":"net.bat.store"%'
    group by dt;

   3) instr approach: 8 min
    select
       dt, '10410007' etype, count(distinct gazj)
    from table_name
    where dt like '202011%'
    and etype='10410007'
    and instr(eparam, '"_pkg":"net.bat.store"')!=0
    group by dt;

   4) Full JSON parsing approach: 15 min
    select
        dt, '10410007' etype, count(distinct gazj)
    from table_name
    where dt like '202011%'
    and etype='10410007'
    and get_json_object(eparam,'$._pkg')='net.bat.store'
    group by dt;
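
   The substring checks are fast but can match unintended rows (for example, a "_pkg" key inside a nested object). A minimal sketch that combines the cheap like pre-filter with the exact JSON check, assuming the same table and columns:

    select
        dt, '10410007' etype, count(distinct gazj)
    from table_name
    where dt like '202011%'
    and etype='10410007'
    and eparam like '%"_pkg":"net.bat.store"%'              -- cheap pre-filter prunes most rows
    and get_json_object(eparam,'$._pkg')='net.bat.store'    -- exact check removes accidental matches
    group by dt;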

4 Parsing multiple JSON fields

1) Repeated get_json_object calls (4.4 min)
insert overwrite directory 's3://shy/test/shy/01/'
select /*+ coalesce(120) */
    get_json_object(value, '$.gaid') gazj,
    get_json_object(value, '$.vid') vid,
    get_json_object(value, '$.tid') tid,
    get_json_object(value, '$.event') event,
    get_json_object(value, '$.ts') ts,
    get_json_object(value, '$.tz') tz,
    get_json_object(value, '$.eparam') eparam
from text.`s3://shy/online/athena_10019999/20200112/*/*`;

2) json_tuple parsing (1.9 min)
insert overwrite directory 's3://shy/test/shy/00/'
select /*+ coalesce(120) */
    json_tuple(value, 'gaid', 'vid', 'tid', 'event', 'ts', 'tz', 'eparam') as (gazj, vid, tid, event, ts, tz, eparam)
from text.`s3://shy/online/athena_10019999/20200112/*/*`;
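
Another single-parse option is from_json with an explicit schema, which returns a struct whose fields can then be selected individually. A minimal sketch; the output directory and the all-string field types below are illustrative assumptions:

-- hypothetical output path, for illustration only
insert overwrite directory 's3://shy/test/shy/02/'
select /*+ coalesce(120) */
    j.gaid gazj,
    j.vid,
    j.tid,
    j.event,
    j.ts,
    j.tz,
    j.eparam
from (
    -- parse the line once into a struct
    select from_json(value, 'struct<gaid: string, vid: string, tid: string, event: string, ts: string, tz: string, eparam: string>') j
    from text.`s3://shy/online/athena_10019999/20200112/*/*`
) t;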

5 Filtering before the shuffle and using more efficient operators

1 Original SQL: 82 min
    with t0 as (
    select pkg
    from   trandw.dws_log_app_active_trace_dd
    where  dt='20210315'
    and    lower(pkg) in ( 
       select lower(pkg)
       from   trandw.dim_pub_app 
       where  type=2
    )
    group by pkg
    having sum(case when substr(active_trace,-1)='1' then 1 else 0 end)>=10000
    ),
    
    t1 as (
    select pkg
    from   trandw.dws_log_app_active_trace_dd
    where  dt='20210315'
    and    pkg in (
       select pkg
       from   tranadm.mid_dim_pub_ret_app_01
       where  dt='20210314'
    )
    group by pkg 
    having sum(case when substr(concat(lpad('',1,'0'),active_trace),-1,1)='1' then 1 else 0 end)>=10000
    or     sum(case when substr(concat(lpad('',2,'0'),active_trace),-2,1)='1' then 1 else 0 end)>=10000

    )
    
    insert overwrite table tranadm.mid_dim_pub_ret_app_01 partition(dt='20210315')
    select pkg
    from 
    (select pkg
     from   t0
     union all
     select pkg
     from   t1
    )t
    group by pkg
    
    
2 Optimized SQL: 18 min
with t1 as (
   select /*+ broadcast(bb) */
        aa.pkg
   from trandw.dws_log_app_active_trace_dd aa
   join trandw.dim_pub_app bb
        on lower(aa.pkg)=lower(bb.pkg)
   where aa.dt='20210315' 
        and aa.active_trace like '%1' 
        and bb.type=2
   group by aa.pkg
   having count(1)>=10000
),
t2 as (
select /*+ broadcast(bb) */
        aa.pkg
    from trandw.dws_log_app_active_trace_dd aa
    join tranadm.mid_dim_pub_ret_app_01 bb
        on aa.pkg=bb.pkg
    where aa.dt='20210315' and aa.active_trace not like '%0000000' and bb.dt='20210314'
    group by aa.pkg
    having    
           sum( case when active_trace like '%1'       then 1 else 0 end ) >=10000 
        or sum( case when active_trace like '%1_'      then 1 else 0 end ) >=10000 
)
insert overwrite table tranadm.mid_dim_pub_ret_app_01 partition(dt='20210315')
select /*+ coalesce(1) */
    pkg
from (
    select pkg from t1
    union
    select pkg from t2
) aa;


3 What was optimized
1) Added filter conditions (aa.active_trace like '%1', aa.active_trace not like '%0000000') to shrink the dataset before the shuffle;

2) Replaced an expensive expression with a cheaper one: substr(concat(lpad('',1,'0'),active_trace),-1,1)='1' --> active_trace like '%1' (a verification sketch follows this list);

3) Whenever the trace table allows data to be filtered early, filter it early;
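
A quick way to confirm the rewrite in 2) is to evaluate both predicate forms side by side on a few sample trace strings. A minimal verification sketch using inline VALUES (the sample strings are made up):

    select
        trace,
        substr(concat(lpad('',1,'0'),trace),-1,1)='1' as old_last_day,        -- original expression, last day
        trace like '%1'                               as new_last_day,        -- rewritten predicate, last day
        substr(concat(lpad('',2,'0'),trace),-2,1)='1' as old_second_last_day, -- original expression, second-to-last day
        trace like '%1_'                              as new_second_last_day  -- rewritten predicate, second-to-last day
    from values ('0000001'), ('0000010'), ('1000000') as t(trace);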

6 Analytic functions

1 Which source an id first appeared in

-- Idea: sort each id's records globally and take the source where the id first appeared
-- When to use: only when the data volume is fairly small; the global sort is expensive
with tmp as (
    select
        id,
        source,
        ts,
        row_number() over (partition by id order by ts) as rnk
    from
        source_table
    where
        dt='${dt}'
        and dh='${dh}'
)
insert overwrite table target_table partition(dt='${dt}',dh='${dh}')
select /*+ coalesce(100) */
    id,
    source,
    ts
from
    tmp
where
    rnk = 1
;

-- Idea (shrink the data before sorting): first take the earliest record within each source, then take the earliest across sources
-- When to use: taking the earliest record within each source significantly reduces the data volume
with tmp as (
    select
        id,
        source,
        min(ts) as ts
    from
        source_table
    where
        dt='${dt}'
        and dh='${dh}'
    group by
        id,
        source
)
insert overwrite table target_table partition(dt='${dt}',dh='${dh}')
select /*+ coalesce(100) */
    id,
    source,
    ts
from
    (
        select 
            id,
            source,
            ts,
            row_number() over (partition by id order by ts) as rnk
        from
            tmp
    ) t
where
    rnk = 1
;

-- Idea: treat ts+source as a composite sort key and get the result in a single aggregation
-- When to use: all ts values have the same number of digits, so string ordering matches chronological ordering
with tmp as (
    select
        id,
        min(concat(ts,'#',source)) as ts_source
    from
        source_table
    where
        dt='${dt}'
        and dh='${dh}'
    group by
        id
)
insert overwrite table target_table partition(dt='${dt}',dh='${dh}')
select /*+ coalesce(100) */
    id,
    split(ts_source,'#')[1] as source,
    split(ts_source,'#')[0] as ts
from
    tmp
;
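
On Spark 3.0 and later (an assumption about the available runtime), min_by expresses the same "earliest source per id" logic without the concat/split round trip. A minimal sketch; note that ties on ts are broken arbitrarily here, whereas the concat approach breaks ties by source:

with tmp as (
    select
        id,
        min_by(source, ts) as source,  -- source of the row with the smallest ts (ties broken arbitrarily)
        min(ts) as ts
    from
        source_table
    where
        dt='${dt}'
        and dh='${dh}'
    group by
        id
)
insert overwrite table target_table partition(dt='${dt}',dh='${dh}')
select /*+ coalesce(100) */
    id,
    source,
    ts
from
    tmp
;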

