spark篇1:Spark和SparkSql UDF数据倾斜之随机数前缀

spark篇1:Spark和SparkSql UDF数据倾斜之随机数前缀
废话不多说,直接上干货

1 spark rdd
思路:先给 key 加随机前缀做一次聚合,再去掉前缀做二次聚合

package RDDDFDS

import RDDDFDS.初始化.ContextUtils
import org.apache.spark.rdd.RDD
import RDDDFDS.隐式转换.ImplicitAspect.rdd2RichRDD

/**
 * Word count that mitigates key skew with a two-stage ("salted") aggregation:
 * every key first gets a random numeric prefix so that a hot key is spread over
 * several reducers, the salted keys are aggregated, then the prefix is removed
 * and the partial counts are aggregated a second time.
 */
object sparkWc {
  /**
   * Reads the input text file, counts tokens with the salted two-stage
   * aggregation and prints the result through the project's `printInfo`
   * extension method.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sc = ContextUtils.getSC(this.getClass.getSimpleName)
    try {
      val readRdd: RDD[String] = sc.textFile("C:\\数据\\WC数据")
      // NOTE(review): split("") tokenizes each line into single characters (and a
      // blank line yields one empty token). If real words are wanted, a delimiter
      // such as " " is presumably intended -- confirm against the input format.
      val words: RDD[String] = readRdd.flatMap(_.split(""))

      // Stage 0: salt each key with a random prefix in [0, 9) to break up hot keys.
      val randomRDD: RDD[String] =
        words.map(word => s"${scala.util.Random.nextInt(9)}_$word")
      val tuples: RDD[(String, Int)] = randomRDD.map((_, 1))

      // Stage 1: partial aggregation on the salted keys.
      val sumed: RDD[(String, Int)] = tuples.reduceByKey(_ + _)

      // Strip the salt. Split with limit 2 so that only the first "_" (the one
      // added above) is treated as the separator: keys that are empty or that
      // contain "_" themselves are preserved instead of being truncated or
      // causing an ArrayIndexOutOfBoundsException.
      val value1: RDD[(String, Int)] = sumed.map { case (saltedKey, count) =>
        (saltedKey.split("_", 2)(1), count)
      }

      // Stage 2: final aggregation on the original keys.
      val value: RDD[(String, Int)] = value1.reduceByKey(_ + _)
      value.printInfo()
    } finally {
      // Always release the SparkContext, even when the job fails.
      sc.stop()
    }
  }
}

2 sparksql DataFrame udf
思路:先给 key 加随机前缀做一次聚合,再去掉前缀做二次聚合

package RDDDFDS

import java.util.Random

import RDDDFDS.初始化.ContextUtils
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SQLContext}

/**
 * Spark SQL variant of the salted two-stage aggregation: a UDF adds a random
 * prefix to the grouping column for the first `group by`, and a second UDF
 * removes it again for the final `group by`.
 */
object sparkMysqlWc {
  /**
   * Loads the `boytest` table over JDBC, registers the two salting UDFs, runs
   * the two-level aggregation query and shows the result.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    Logger.getRootLogger.setLevel(Level.ERROR)
    val sc: SparkContext = ContextUtils.getSC(this.getClass.getSimpleName)
    try {
      val sqlContext = new SQLContext(sc)
      // NOTE(review): credentials are hard-coded here; move them to configuration
      // before using this outside of a local demo.
      val source: DataFrame = sqlContext.read.format("jdbc")
        .option("url","jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=utf8&autoReconnect=true&rewriteBatchedStatements=TRUE&useSSL=false")
        .option("driver", "com.mysql.jdbc.Driver")
        .option("user", "root")
        .option("password","123456")
        .option("dbtable", "boytest").load()

      source.registerTempTable("boytest")
      sqlContext.cacheTable("boytest")
      try {
        // UDF that salts a key with a random prefix in [0, 10).
        // ThreadLocalRandom avoids allocating a new generator for every row.
        sqlContext.udf.register("random_prefix", (key: String) =>
          s"${java.util.concurrent.ThreadLocalRandom.current().nextInt(10)}_$key")

        // UDF that strips the salt again. Split with limit 2 so only the first
        // "_" (added by random_prefix) is the separator: names that are empty
        // or contain "_" are returned intact instead of being truncated
        // (which would merge distinct groups) or throwing.
        sqlContext.udf.register("remove_prefix", (key: String) =>
          key.split("_", 2)(1))

        // Inner query: partial counts per salted name.
        // Outer query: sum of the partial counts per original name.
        val datas = sqlContext.sql(
          """
         |select sum(ct),remove_prefix(name)
         |	  from(
         |        select
         |        count(age) as ct,
         |        random_prefix(name) as name
         |        from boytest
         |        group by random_prefix(name)
         |		)
         |		group by remove_prefix(name)
 """.stripMargin)

        datas.show()
      } finally {
        // Drop the cached table even when the query fails.
        sqlContext.uncacheTable("boytest")
      }
    } finally {
      // Always release the SparkContext.
      sc.stop()
    }
  }
}

你可能感兴趣的:(spark,sparksql,udf)