Spark实现列转行——求平均温度案例

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object MySparkUtil {
  /** Builds a local-mode SparkContext (all available cores) for the given app name. */
  def apply(appName: String): SparkContext =
    new SparkContext(
      new SparkConf()
        .setAppName(appName)
        .setMaster("local[*]")
    )
}
object TempDemo {
  /** Computes the average temperature per city from a few days of readings. */
  def main(args: Array[String]): Unit = {
    // In-memory sample data: (city, temperature) readings for three days.
    val d1 = Array(("bj", 28.1), ("sh", 28.7), ("gz", 32.0), ("sz", 33.1))
    val d2 = Array(("bj", 27.3), ("sh", 30.1), ("gz", 33.3))
    val d3 = Array(("bj", 28.2), ("sh", 29.1), ("gz", 32.0), ("sz", 30.5))

    // Local Spark context (see MySparkUtil).
    val sc = MySparkUtil.apply(getClass.getSimpleName)

    // Concatenate the daily readings into one dataset.
    val data = d1 ++ d2 ++ d3

    val rdd1: RDD[(String, Double)] = sc.makeRDD(data)

    // Aggregate each city's readings into (sum, count) with aggregateByKey
    // instead of building a List per key (mapValues(List(_)) + reduceByKey(_ ++ _)):
    // the (sum, count) pair combines map-side, so the shuffle moves two numbers
    // per key per partition rather than whole lists, and avoids the O(n) cost
    // of repeated list concatenation.
    val sumCount: RDD[(String, (Double, Int))] =
      rdd1.aggregateByKey((0.0, 0))(
        // fold one reading into a partition-local accumulator
        (acc, temp) => (acc._1 + temp, acc._2 + 1),
        // merge accumulators from different partitions
        (a, b) => (a._1 + b._1, a._2 + b._2)
      )

    // Average temperature per city, e.g. (bj, 27.866666666666667).
    val res: RDD[(String, Double)] = sumCount.mapValues { case (sum, cnt) => sum / cnt }

    res.foreach(println)

    sc.stop()
  }

}

 

你可能感兴趣的:(Spark)