Spark SQL: speeding up a job by splitting a large table with a hash (103 min → 80 min)

Total job runtime before optimization: 1 h 43 min = 103 min.

Total job runtime after optimization: 1 h 20 min = 80 min.
Time saved: 1 - 80/103 ≈ 22% of the original wall-clock time; put another way, the job now runs 103/80 ≈ 1.29× faster, roughly a 29% performance improvement.

With --executor-cores left unset, each executor got the default of 4 cores in this setup, so the 19 running executors claimed only 19 × 4 = 76 cores in total.
The submit command was:

spark-submit --deploy-mode cluster --master yarn \
  --driver-memory 10G --executor-memory 20G \
  --conf spark.executor.memoryOverhead=4096 \
  --conf spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2 \
  --conf spark.network.timeout=300s \
  --conf spark.executor.heartbeatInterval=100s \
  --conf spark.driver.maxResultSize=3G \
  --class com.yeahmobi.dmp.mediacode.InstallListMake \
  s3://taobao/leif/dmp_tags-1.0-SNAPSHOT.jar 20190821 0

Note that the cluster has 16 × 19 = 304 cores in total, so core utilization was only 76/304 = 25%: roughly three quarters of the cores sat idle, a huge waste of resources.
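A quick back-of-the-envelope check of those numbers (spark-shell style; the 19 executors with 4 cores each and the 16 × 19 cluster cores are taken straight from the text above):

// Back-of-the-envelope check of core utilization, using the numbers quoted above.
val totalCores  = 16 * 19                           // cores in the cluster = 304
val usedCores   = 19 * 4                            // cores held by the 19 executors = 76
val utilization = usedCores.toDouble / totalCores   // = 0.25, only a quarter of the cores are busy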
But simply raising executor-cores was not an option either: with more concurrent tasks per executor, a single task would no longer have enough memory.
To shrink the memory each task needs, the idea was to use a hash to split the big table into smaller pieces; with a smaller per-task footprint, all of the cores could be used. So let's do it.
The code before optimization:

def joinAndWriteData(ss: SparkSession, Day: String) : Unit = {
  val OneDayAgo = DateTime.parse(Day, DayFormater).minusDays(1).toString(DayFormater)
  val SevenDayAgo = DateTime.parse(Day, DayFormater).minusDays(7).toString(DayFormater)
  ss.sql("set spark.sql.shuffle.partitions=3800")
  val sql =
    """
           SELECT
               ifa, array_distinct(flatten(collect_list(bundle))) AS bundles ,array_distinct(flatten(collect_list(countrys))) AS countrys, max(updateday) as updateday
           FROM
           (
             SELECT  ifa, bundles AS bundle, countrys, updateday from dmp.t_dmp_idfa_bundle_country_array_middle_tbl  where day='%s'
             UNION ALL
             SELECT  ifa, media_id AS bundle, array() AS countrys, '%s'  as updateday FROM dmp.t_dmp_idfa_bundle_every_tbl
             UNION ALL
             SELECT  ifa, array() AS bundle, country  AS countrys,  '%s'  as updateday FROM dmp.t_dmp_idfa_country_every_tbl
           ) x  GROUP BY ifa HAVING size(bundles)<2000 and size(bundles)>0
 """.format(OneDayAgo, Day,Day)


  val result = ss.sql(sql).persist(StorageLevel.MEMORY_AND_DISK_SER)
  result.printSchema()

  println("start write  data to middle tbl")
  ss.sql(s"alter table dmp.t_dmp_idfa_bundle_country_array_middle_tbl drop if exists partition (day ='$Day')")
  val middleTable = s"s3://taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_middle_tbl/day=$Day"
  FileSystem.get(new URI("s3://taobao.com"), ss.sparkContext.hadoopConfiguration).delete(new Path(middleTable), true)
  result.write.format("orc").save(middleTable)
  ss.sql(s"alter table dmp.t_dmp_idfa_bundle_country_array_middle_tbl add if not exists partition (day ='$Day')")
  println("write  data to middle tbl success")

  //删除七天前的分天表数据含分区
  ss.sql(s"alter table dmp.t_dmp_idfa_bundle_country_array_middle_tbl drop if exists partition (day ='$SevenDayAgo')")
  val SevenDaysSgoMidDirPath = s"s3://taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_middle_tbl/day=$SevenDayAgo"
  FileSystem.get(new URI("s3://taobao.com"), ss.sparkContext.hadoopConfiguration).delete(new Path(SevenDaysSgoMidDirPath), true)
  println(s"delete day $SevenDayAgo t_dmp_idfa_bundle_country_array_middle_tbl data successed")


  println("start write t_dmp_idfa_bundle_country_array_tbl tbl")
  val resultTable = s"s3://taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_tbl"
  FileSystem.get(new URI("s3://taobao.com"), ss.sparkContext.hadoopConfiguration).delete(new Path(resultTable), true)
  result.write.format("orc").save(resultTable)
  println("write  idfa country array success")

}

The code after optimization:

def joinAndWriteData(ss: SparkSession, Day: String) : Unit = {
  val OneDayAgo = DateTime.parse(Day, DayFormater).minusDays(1).toString(DayFormater)
  val SevenDayAgo = DateTime.parse(Day, DayFormater).minusDays(7).toString(DayFormater)
  ss.sql("set spark.sql.shuffle.partitions=3800")
  // Use a hash UDF to split the data into two halves, so a single task needs only about half the memory
  for(routeCode <- 0 to 1){
    val hashSql =
      s"""
           SELECT
               ifa, array_distinct(flatten(collect_list(bundle))) AS bundles ,array_distinct(flatten(collect_list(countrys))) AS countrys, max(updateday) as updateday
           FROM
           (
             SELECT  ifa, bundles AS bundle, countrys, updateday from dmp.t_dmp_idfa_bundle_country_array_middle_tbl  where day='$OneDayAgo' and $routeCode=getRouteCode(ifa)
             UNION ALL
             SELECT  ifa, media_id AS bundle, array() AS countrys, '$Day'  as updateday FROM dmp.t_dmp_idfa_bundle_every_tbl where $routeCode=getRouteCode(ifa)
             UNION ALL
             SELECT  ifa, array() AS bundle, country  AS countrys,  '$Day'  as updateday FROM dmp.t_dmp_idfa_country_every_tbl where $routeCode=getRouteCode(ifa)
           ) x  GROUP BY ifa HAVING size(bundles)<2000 and size(bundles)>0
 """


    val result = ss.sql(hashSql).persist(StorageLevel.MEMORY_AND_DISK_SER)
    result.printSchema()

    println("start write  data to middle tbl")
    ss.sql(s"alter table dmp.t_dmp_idfa_bundle_country_array_middle_tbl_dylan_$routeCode drop if exists partition (day ='$Day')")
    val middleTable = s"s3://taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_middle_tbl_dylan_$routeCode/day=$Day"
    FileSystem.get(new URI("s3://taobao.com"), ss.sparkContext.hadoopConfiguration).delete(new Path(middleTable), true)
    result.write.format("orc").save(middleTable)
    ss.sql(s"alter table dmp.t_dmp_idfa_bundle_country_array_middle_tbl_dylan_$routeCode add if not exists partition (day ='$Day')")
    println(s"write  data to middle tbl_$routeCode success")

    //删除七天前的分天表数据含分区
    ss.sql(s"alter table dmp.t_dmp_idfa_bundle_country_array_middle_tbl_dylan_$routeCode drop if exists partition (day ='$SevenDayAgo')")
    val SevenDaysSgoMidDirPath = s"s3://taobao.com/hive_dataware/dmp/t_dmp_idfa_bundle_country_array_middle_tbl_dylan_$routeCode/day=$SevenDayAgo"
    FileSystem.get(new URI("s3://taobao.com"), ss.sparkContext.hadoopConfiguration).delete(new Path(SevenDaysSgoMidDirPath), true)
    println(s"delete day $SevenDayAgo t_dmp_idfa_bundle_country_array_middle_tbl_dylan_$routeCode data successed")
    result.unpersist(true)
  }
}
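
The optimized SQL filters every branch with getRouteCode(ifa), a UDF that maps each ifa to route code 0 or 1. Its registration is not shown in the snippet above; a minimal sketch, assuming the route code is just a non-negative hash of the ifa string modulo 2 (the hashing choice here is illustrative, not necessarily the original implementation):

// Hypothetical registration of the getRouteCode UDF referenced in hashSql above.
// Assumption: route code = non-negative hash of the ifa string, modulo 2.
ss.udf.register("getRouteCode", (ifa: String) => ((ifa.hashCode % 2) + 2) % 2)

It has to be registered on the SparkSession before hashSql runs; any deterministic function of ifa works, as long as it spreads the keys evenly across the two buckets.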

The optimized submit command adds --executor-cores 12, so the 19 executors now use 19 × 12 = 228 of the 304 cores (about 75% utilization):

spark-submit --deploy-mode cluster --master yarn \
  --driver-memory 10G --executor-memory 20G --executor-cores 12 \
  --conf spark.executor.memoryOverhead=4096 \
  --conf spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version=2 \
  --conf spark.network.timeout=300s \
  --conf spark.executor.heartbeatInterval=100s \
  --conf spark.driver.maxResultSize=3G \
  --class com.yeahmobi.dmp.mediacode.InstallListMake6 \
  s3://taobao/dmp_tags-1.0-SNAPSHOT.jar 20190822 0
