sparksql 优化,性能提高了3-4倍

这几天代码跑得特别慢,一个大sql跑4-6h,着实急人,大sql如下:

val bidDetailDf=ss.sql(
		s"""
select app,day,hour,adx,os,osv,country,impType, sum(cnt) as cnt,sum(request) as request, sum(response) as response, sum(bid) as bid, sum(timeout) as timeout
,sum(status) as status,sum(ccount) as ccount,sum(remarketing) as remarketing,sum(banner) as banner,sum(video) as video,sum(mediatype) as mediatype,sum(publisher) as publisher,sum(ifa) as ifa,sum(qps) as qps,sum(budget) as budget,
sum(cat) as cat,sum(error) as error,sum(bidfloor) as bidfloor,sum(seatbid) as seatbid,sum(admerror) as admerror
from
(

select app,day,hour,adx,os,osv,country,impType,count(1) as cnt,
0 as request,
0 as response,
0 as bid,
sum(case when filter='timeout' then 1 else 0 end) as timeout,
sum(case when filter='status' then 1 else 0 end) as status,
sum(case when filter='country' then 1 else 0 end) as ccount,
sum(case when filter='remarketing' then 1 else 0 end) as remarketing,
sum(case when filter='banner' then 1 else 0 end) as banner,
sum(case when filter='video' then 1 else 0 end) as video,
sum(case when filter='mediatype' then 1 else 0 end) as mediatype,
sum(case when filter='publisher' then 1 else 0 end) as publisher,
sum(case when filter='ifa' then 1 else 0 end) as ifa,
sum(case when filter='qps' then 1 else 0 end) as qps,
sum(case when filter='budget' then 1 else 0 end) as budget,
sum(case when filter='cat' then 1 else 0 end) as cat,
sum(case when filter='error' then 1 else 0 end) as error,
sum(case when filter='bidfloor' then 1 else 0 end) as bidfloor,
sum(case when filter='seatbid' then 1 else 0 end) as seatbid,
sum(case when filter='admerror' then 1 else 0 end) as admerror
from
(
select rr.app,day,hour,adx,os,osv,country,impType,rr.filter
from (
select day,hour,adx,os,osv,country,impType,split(regexp_replace(regexp_extract(allapps,'^\\\\[(.+)\\\\]',1),'\\\\}\\\\,\\\\{', '\\\\}\\\\|\\\\|\\\\{'),'\\\\|\\\\|') as str
from mediabuy_dsp.t_dsp_bid_detail_tbl
where day='%s') pp
lateral view explode(pp.str) ss as col
lateral view json_tuple(ss.col,'app','filter') rr as app, filter
) t where app<>'NULL' group by app,day,hour,adx,os,osv,country,impType

union all

select app,day,hour,adx,os,osv,country,impType, 0 as cnt, count(1) as request, 0 as response, 0 as bid, 0 as timeout,
0 as status,0 as ccount,0 as remarketing,0 as banner,0 as video,0 as mediatype,0 as publisher,0 as ifa,0 as qps,0 as budget,
0 as cat,0 as error,0 as bidfloor,0 as seatbid,0 as admerror
from
mediabuy_dsp.t_dsp_bid_detail_tbl
lateral view explode(split(regexp_replace(appsitemfinal,'\\\\[|\\\\]|"',''), ',')) aa as app
where day='%s' and app <>'NULL' and app is not null and app <> ''
group by app,day,hour,adx,os,osv,country,impType
union all

select appName as app,day,hour,adx,os,osv,country,impType,0 as cnt, 0 as request,  sum(case when httpStatus='200' then 1 else 0 end) as response, 0 as bid, 0 as timeout,
0 as status,0 as ccount,0 as remarketing,0 as banner,0 as video,0 as mediatype,0 as publisher,0 as ifa,0 as qps,0 as budget,
0 as cat,0 as error,0 as bidfloor,0 as seatbid,0 as admerror
from
(
select rr.appName,day,hour,adx,os,osv,country,impType, rr.httpStatus
from (
select day,hour,adx,os,osv,country,impType, split(regexp_replace(regexp_extract(appitemresponse,'\\\\[(.+)\\\\]',1),'\\\\}\\\\,\\\\{', '\\\\}\\\\|\\\\|\\\\{'),'\\\\|\\\\|') as str
from
mediabuy_dsp.t_dsp_bid_detail_tbl
where day='%s'
) pp lateral view explode(pp.str) ss as col
lateral view json_tuple(ss.col,'appName','httpStatus') rr as appName, httpStatus
) t   where appName<>'NULL' group by appName,day,hour,adx,os,osv,country,impType
union all

select appname as app,day,hour,adx,os,osv,country,impType, 0 as cnt ,0 as request, 0 as response, count(1) as bid, 0 as timeout,
0 as status,0 as ccount,0 as remarketing,0 as banner,0 as video,0 as mediatype,0 as publisher,0 as ifa,0 as qps,0 as budget,
0 as cat,0 as error,0 as bidfloor,0 as seatbid,0 as admerror
from
mediabuy_dsp.t_dsp_bid_detail_tbl
where day='%s' and trim(appname)<>''
group by appname,day,hour,adx,os,osv,country,impType
) t  where app<>'' group by app,day,hour,adx,os,osv,country,impType
  """.format(yesterDay,yesterDay,yesterDay,yesterDay)).persist(StorageLevel.MEMORY_AND_DISK_SER)

通过测试观察,每一个union的临时表都是跑得挺快,最慢的也就40min左右就跑完了,
原来是个特别大的sql 一张表要扫描4次,然后把结果 union到一块,这个sql的执行速度就特别慢,原来还好只要1.5个小时,最进不知道是哪里出现了问题一度跑4-6个小时,我就把这几个union 的 小sql单独跑,发现每个最慢也就40min就跑完了,因为这几个sql跑起来比较费时间,所以我每次跑一个sql 我都把结果insert到四个中间表中
然后把4张中间表union,再group by 这样时间就短了,而且就算万一数据跑失败,只要是再这四个表后面发生的错误,我可以改下程序让从四张中间表中取数据,然后就会节省很多时间,补数据现在只需要5分钟

改造完的代码如下:

 //删除结果表数据,并写入结果表
    //s3://baidu.taobao.tenent.com/hive_dataware/mediabuy_dsp/t_dsp_bid_middle_detail_tbl_1
    val CurDayFinalDirPath1 = s"s3://baidu.taobao.tenent.com/hive_dataware/mediabuy_dsp/t_dsp_bid_middle_detail_tbl_1/updatedate=$yesterDay"
    FileSystem.get(new URI("s3://baidu.taobao.tenent.com"), ss.sparkContext.hadoopConfiguration).
      delete(new Path(CurDayFinalDirPath1), true)
ss.sql(
		s"""
INSERT OVERWRITE TABLE mediabuy_dsp.t_dsp_bid_middle_detail_tbl_1
PARTITION (updatedate = '%s')
select app,day,hour,adx,os,osv,country,impType, sum(cnt) as cnt,sum(request) as request, sum(response) as response, sum(bid) as bid, sum(timeout) as timeout
,sum(status) as status,sum(ccount) as ccount,sum(remarketing) as remarketing,sum(banner) as banner,sum(video) as video,sum(mediatype) as mediatype,sum(publisher) as publisher,sum(ifa) as ifa,sum(qps) as qps,sum(budget) as budget,
sum(cat) as cat,sum(error) as error,sum(bidfloor) as bidfloor,sum(seatbid) as seatbid,sum(admerror) as admerror
from
(

select app,day,hour,adx,os,osv,country,impType,count(1) as cnt,
0 as request,
0 as response,
0 as bid,
sum(case when filter='timeout' then 1 else 0 end) as timeout,
sum(case when filter='status' then 1 else 0 end) as status,
sum(case when filter='country' then 1 else 0 end) as ccount,
sum(case when filter='remarketing' then 1 else 0 end) as remarketing,
sum(case when filter='banner' then 1 else 0 end) as banner,
sum(case when filter='video' then 1 else 0 end) as video,
sum(case when filter='mediatype' then 1 else 0 end) as mediatype,
sum(case when filter='publisher' then 1 else 0 end) as publisher,
sum(case when filter='ifa' then 1 else 0 end) as ifa,
sum(case when filter='qps' then 1 else 0 end) as qps,
sum(case when filter='budget' then 1 else 0 end) as budget,
sum(case when filter='cat' then 1 else 0 end) as cat,
sum(case when filter='error' then 1 else 0 end) as error,
sum(case when filter='bidfloor' then 1 else 0 end) as bidfloor,
sum(case when filter='seatbid' then 1 else 0 end) as seatbid,
sum(case when filter='admerror' then 1 else 0 end) as admerror
from
(
select rr.app,day,hour,adx,os,osv,country,impType,rr.filter
from (
select day,hour,adx,os,osv,country,impType,split(regexp_replace(regexp_extract(allapps,'^\\\\[(.+)\\\\]',1),'\\\\}\\\\,\\\\{', '\\\\}\\\\|\\\\|\\\\{'),'\\\\|\\\\|') as str
from mediabuy_dsp.t_dsp_bid_detail_tbl
where day='%s') pp
lateral view explode(pp.str) ss as col
lateral view json_tuple(ss.col,'app','filter') rr as app, filter
) t where app<>'NULL' and app<>'' group by app,day,hour,adx,os,osv,country,impType

) t  where app<>'' group by app,day,hour,adx,os,osv,country,impType
  """.format(yesterDay,yesterDay))

    //ssssssssssssss22222222222222222
    //删除结果表数据,并写入结果表
    //s3://baidu.taobao.tenent.com/hive_dataware/mediabuy_dsp/t_dsp_bid_middle_detail_tbl_1
    val CurDayFinalDirPath2 = s"s3://baidu.taobao.tenent.com/hive_dataware/mediabuy_dsp/t_dsp_bid_middle_detail_tbl_2/updatedate=$yesterDay"
    FileSystem.get(new URI("s3://baidu.taobao.tenent.com"), ss.sparkContext.hadoopConfiguration).
      delete(new Path(CurDayFinalDirPath2), true)
ss.sql(s"""
INSERT OVERWRITE TABLE mediabuy_dsp.t_dsp_bid_middle_detail_tbl_2
PARTITION (updatedate = '%s')
select app,day,hour,adx,os,osv,country,impType, sum(cnt) as cnt,sum(request) as request, sum(response) as response, sum(bid) as bid, sum(timeout) as timeout
,sum(status) as status,sum(ccount) as ccount,sum(remarketing) as remarketing,sum(banner) as banner,sum(video) as video,sum(mediatype) as mediatype,sum(publisher) as publisher,sum(ifa) as ifa,sum(qps) as qps,sum(budget) as budget,
sum(cat) as cat,sum(error) as error,sum(bidfloor) as bidfloor,sum(seatbid) as seatbid,sum(admerror) as admerror
from
(



select app,day,hour,adx,os,osv,country,impType, 0 as cnt, count(1) as request, 0 as response, 0 as bid, 0 as timeout,
0 as status,0 as ccount,0 as remarketing,0 as banner,0 as video,0 as mediatype,0 as publisher,0 as ifa,0 as qps,0 as budget,
0 as cat,0 as error,0 as bidfloor,0 as seatbid,0 as admerror
from
mediabuy_dsp.t_dsp_bid_detail_tbl
lateral view explode(split(regexp_replace(appsitemfinal,'\\\\[|\\\\]|"',''), ',')) aa as app
where day='%s' and app <>'NULL' and app is not null and app <> ''
group by app,day,hour,adx,os,osv,country,impType
) t  where app<>'' group by app,day,hour,adx,os,osv,country,impType
  """.format(yesterDay,yesterDay))

//    //ssssssssssssss3333333333333333333333
//删除结果表数据,并写入结果表
//s3://baidu.taobao.tenent.com/hive_dataware/mediabuy_dsp/t_dsp_bid_middle_detail_tbl_1
val CurDayFinalDirPath3 = s"s3://baidu.taobao.tenent.com/hive_dataware/mediabuy_dsp/t_dsp_bid_middle_detail_tbl_3/updatedate=$yesterDay"
    FileSystem.get(new URI("s3://baidu.taobao.tenent.com"), ss.sparkContext.hadoopConfiguration).
      delete(new Path(CurDayFinalDirPath3), true)

ss.sql(s"""
INSERT OVERWRITE TABLE mediabuy_dsp.t_dsp_bid_middle_detail_tbl_3
PARTITION (updatedate = '%s')
select app,day,hour,adx,os,osv,country,impType, sum(cnt) as cnt,sum(request) as request, sum(response) as response, sum(bid) as bid, sum(timeout) as timeout
,sum(status) as status,sum(ccount) as ccount,sum(remarketing) as remarketing,sum(banner) as banner,sum(video) as video,sum(mediatype) as mediatype,sum(publisher) as publisher,sum(ifa) as ifa,sum(qps) as qps,sum(budget) as budget,
sum(cat) as cat,sum(error) as error,sum(bidfloor) as bidfloor,sum(seatbid) as seatbid,sum(admerror) as admerror
from
(
select appName as app,day,hour,adx,os,osv,country,impType,0 as cnt, 0 as request,  sum(case when httpStatus='200' then 1 else 0 end) as response, 0 as bid, 0 as timeout,
0 as status,0 as ccount,0 as remarketing,0 as banner,0 as video,0 as mediatype,0 as publisher,0 as ifa,0 as qps,0 as budget,
0 as cat,0 as error,0 as bidfloor,0 as seatbid,0 as admerror
from
(
select rr.appName,day,hour,adx,os,osv,country,impType, rr.httpStatus
from (
select day,hour,adx,os,osv,country,impType, split(regexp_replace(regexp_extract(appitemresponse,'\\\\[(.+)\\\\]',1),'\\\\}\\\\,\\\\{', '\\\\}\\\\|\\\\|\\\\{'),'\\\\|\\\\|') as str
from
mediabuy_dsp.t_dsp_bid_detail_tbl
where day='%s'
) pp lateral view explode(pp.str) ss as col
lateral view json_tuple(ss.col,'appName','httpStatus') rr as appName, httpStatus
) t   where appName<>'NULL' group by appName,day,hour,adx,os,osv,country,impType
) t  where app<>'' group by app,day,hour,adx,os,osv,country,impType
  """.format(yesterDay,yesterDay))

    //ssssssssssssss44444444444444444444

    //删除结果表数据,并写入结果表
    //s3://baidu.taobao.tenent.com/hive_dataware/mediabuy_dsp/t_dsp_bid_middle_detail_tbl_1
    val CurDayFinalDirPath4 = s"s3://baidu.taobao.tenent.com/hive_dataware/mediabuy_dsp/t_dsp_bid_middle_detail_tbl_4/updatedate=$yesterDay"
    FileSystem.get(new URI("s3://baidu.taobao.tenent.com"), ss.sparkContext.hadoopConfiguration).
      delete(new Path(CurDayFinalDirPath4), true)
ss.sql( s"""
INSERT OVERWRITE TABLE mediabuy_dsp.t_dsp_bid_middle_detail_tbl_4
PARTITION (updatedate = '%s')
select app,day,hour,adx,os,osv,country,impType, sum(cnt) as cnt,sum(request) as request, sum(response) as response, sum(bid) as bid, sum(timeout) as timeout
,sum(status) as status,sum(ccount) as ccount,sum(remarketing) as remarketing,sum(banner) as banner,sum(video) as video,sum(mediatype) as mediatype,sum(publisher) as publisher,sum(ifa) as ifa,sum(qps) as qps,sum(budget) as budget,
sum(cat) as cat,sum(error) as error,sum(bidfloor) as bidfloor,sum(seatbid) as seatbid,sum(admerror) as admerror
from
(

select appname as app,day,hour,adx,os,osv,country,impType, 0 as cnt ,0 as request, 0 as response, count(1) as bid, 0 as timeout,
0 as status,0 as ccount,0 as remarketing,0 as banner,0 as video,0 as mediatype,0 as publisher,0 as ifa,0 as qps,0 as budget,
0 as cat,0 as error,0 as bidfloor,0 as seatbid,0 as admerror
from
mediabuy_dsp.t_dsp_bid_detail_tbl
where day='%s' and trim(appname)<>''
group by appname,day,hour,adx,os,osv,country,impType
) t  where app<>'' group by app,day,hour,adx,os,osv,country,impType
  """.format(yesterDay,yesterDay))

val bidDetailDf= ss.sql(
  s"""
     |select app,day,hour,adx,os,osv,country,impType, sum(cnt) as cnt,sum(request) as request, sum(response) as response, sum(bid) as bid, sum(timeout) as timeout
     |,sum(status) as status,sum(ccount) as ccount,sum(remarketing) as remarketing,sum(banner) as banner,sum(video) as video,sum(mediatype) as mediatype,sum(publisher) as publisher,sum(ifa) as ifa,sum(qps) as qps,sum(budget) as budget,
     |sum(cat) as cat,sum(error) as error,sum(bidfloor) as bidfloor,sum(seatbid) as seatbid,sum(admerror) as admerror
     |from
     |(
     |select app,day,hour,adx,os,osv,country,impType,cnt,request,response,bid,timeout,status,ccount,remarketing,banner,video,mediatype,publisher,ifa,qps,budget,cat,error,bidfloor,seatbid,admerror from mediabuy_dsp.t_dsp_bid_middle_detail_tbl_1 where updatedate='$yesterDay'
     |union all
     |select app,day,hour,adx,os,osv,country,impType,cnt,request,response,bid,timeout,status,ccount,remarketing,banner,video,mediatype,publisher,ifa,qps,budget,cat,error,bidfloor,seatbid,admerror from mediabuy_dsp.t_dsp_bid_middle_detail_tbl_2 where updatedate='$yesterDay'
     |union all
     |select app,day,hour,adx,os,osv,country,impType,cnt,request,response,bid,timeout,status,ccount,remarketing,banner,video,mediatype,publisher,ifa,qps,budget,cat,error,bidfloor,seatbid,admerror from mediabuy_dsp.t_dsp_bid_middle_detail_tbl_3 where updatedate='$yesterDay'
     |union all
     |select app,day,hour,adx,os,osv,country,impType,cnt,request,response,bid,timeout,status,ccount,remarketing,banner,video,mediatype,publisher,ifa,qps,budget,cat,error,bidfloor,seatbid,admerror from mediabuy_dsp.t_dsp_bid_middle_detail_tbl_4 where updatedate='$yesterDay'
     |)group by app,day,hour,adx,os,osv,country,impType
   """.stripMargin)


你可能感兴趣的:(spark)