Spark SQL 处理 GROUP BY 的数据倾斜问题

通过 SQL 的两阶段聚合来处理 GROUP BY 产生的数据倾斜问题:先给分组键加随机前缀做局部聚合,再去掉前缀做全局聚合:

import java.util.Random

import org.apache.spark.sql.SparkSession

/**
 * Two-stage aggregation in Spark SQL to mitigate `GROUP BY` data skew.
 *
 * The grouping key is first salted with a random bucket prefix so that the rows of a
 * hot key are spread over several groups and partially aggregated; the prefix is then
 * removed and the partial results are aggregated again to obtain the final totals.
 */
object TestUDF {

  import java.util.concurrent.ThreadLocalRandom

  /** Separator placed between the random bucket number and the original key. */
  private val PrefixSeparator = "_"

  /**
   * Registers the two UDFs and runs the full two-stage aggregation (`sql4`)
   * against the Hive table `ggg.test`, printing the result with `show()`.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Stage 1: add a random prefix to the grouping key
    val sql1 =
      """
         |select
         |  random_prefix(name, 6) product,
         |  id
         |from
         |  ggg.test
       """.stripMargin

    // Stage 2: group by the salted key and sum (partial aggregation)
    val sql2 =
      """
         |select
         |  product,
         |  sum(id) click
         |from
         |  (
         |    select
         |      random_prefix(name, 6) product,
         |      id
         |    from
         |      ggg.test
         |  ) t1
         |group by
         |  product
       """.stripMargin

    // Stage 3: remove the random prefix from the partially aggregated rows
    val sql3 =
      """
         |select
         |  remove_random_prefix(product) product,
         |  click
         |from
         |  (
         |    select
         |      product,
         |      sum(id) click
         |    from
         |      (
         |        select
         |          random_prefix(name, 6) product,
         |          id
         |        from
         |          ggg.test
         |      ) t1
         |    group by
         |      product
         |  ) t2
         |
       """.stripMargin

    // Stage 4: group by the original key and sum the partial sums (final aggregation)
    val sql4 =
      """
         |select
         |  product,
         |  sum(click) click
         |from
         |  (
         |    select
         |      remove_random_prefix(product) product,
         |      click
         |    from
         |      (
         |        select
         |          product,
         |          sum(id) click
         |        from
         |          (
         |            select
         |              random_prefix(name, 6) product,
         |              id
         |            from
         |              ggg.test
         |          ) t1
         |        group by
         |          product
         |      ) t2
         |  ) t3
         |group by
         |  product
       """.stripMargin

    val spark =
      SparkSession.builder()
        .appName("TestUDF")
        .enableHiveSupport()
        .getOrCreate()

    try {
      // NOTE(review): `random_prefix` is declared with an Int key but is applied to the
      // column `name`; if that column is not numeric, Spark has to cast it first and
      // non-numeric values would presumably become NULL -- confirm the schema of ggg.test.
      // NOTE(review): this UDF is non-deterministic; consider registering it as such if
      // the Spark version in use supports it.
      spark.udf.register("random_prefix", (value: Int, num: Int) => randomPrefixUDF(value, num))
      spark.udf.register("remove_random_prefix", (value: String) => removeRandomPrefixUDF(value))

      // Intermediate stages, handy for debugging:
      //    spark.sql(sql1).show()
      //    spark.sql(sql2).show()
      //    spark.sql(sql3).show()
      spark.sql(sql4).show()
    } finally {
      // Always release the session, even when the query fails.
      spark.stop()
    }
  }

  /**
   * Salts a key with a random bucket number, producing `"<bucket>_<value>"`.
   *
   * @param value the original grouping key
   * @param num   number of buckets; the bucket is drawn uniformly from `[0, num)`
   * @return the salted key
   * @throws IllegalArgumentException if `num` is not positive
   */
  def randomPrefixUDF(value: Int, num: Int): String =
    // ThreadLocalRandom avoids allocating and seeding a new generator for every row.
    s"${ThreadLocalRandom.current().nextInt(num)}$PrefixSeparator$value"

  /**
   * Strips the random bucket prefix added by [[randomPrefixUDF]].
   *
   * Only the text up to and including the FIRST separator is removed, so keys that
   * themselves contain `_` are preserved intact. A value without a separator is
   * returned unchanged, and `null` (SQL NULL) is passed through as `null`.
   *
   * @param value a salted key such as `"3_42"`, possibly `null`
   * @return the key without its prefix
   */
  def removeRandomPrefixUDF(value: String): String =
    Option(value).map { salted =>
      val separatorAt = salted.indexOf(PrefixSeparator)
      if (separatorAt < 0) salted else salted.substring(separatorAt + PrefixSeparator.length)
    }.orNull
}

 

你可能感兴趣的:(大数据学习,Spark)