import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object MySparkUtil {
  // Create a SparkContext that runs locally, using all available cores
  def apply(appName: String): SparkContext = {
    val conf = new SparkConf()
      .setAppName(appName)
      .setMaster("local[*]")
    new SparkContext(conf)
  }
}
object TempDemo {
  def main(args: Array[String]): Unit = {
    // Local sample data: (city, temperature) pairs
    val d1 = Array(("bj", 28.1), ("sh", 28.7), ("gz", 32.0), ("sz", 33.1))
    val d2 = Array(("bj", 27.3), ("sh", 30.1), ("gz", 33.3))
    val d3 = Array(("bj", 28.2), ("sh", 29.1), ("gz", 32.0), ("sz", 30.5))
    // Create the SparkContext
    val sc = MySparkUtil.apply(getClass.getSimpleName)
    // Concatenate the three arrays into one
    val data = d1 ++ d2 ++ d3
    // Turn the local collection into an RDD
    /** rdd1.foreach(println)
     *
     * (sh,28.7)
     * (bj,28.1)
     * (gz,32.0)
     * (gz,32.0)
     * (sz,30.5)
     * (sz,33.1)
     * (bj,27.3)
     * (bj,28.2)
     * (sh,29.1)
     * (sh,30.1)
     * (gz,33.3)
     */
    val rdd1: RDD[(String, Double)] = sc.makeRDD(data)
    // Wrap each value of rdd1 in a single-element List
    /** rdd2.foreach(println)
     *
     * (sz,List(33.1))
     * (bj,List(28.2))
     * (sh,List(29.1))
     * (sh,List(30.1))
     * (gz,List(33.3))
     * (sh,List(28.7))
     * (gz,List(32.0))
     * (sz,List(30.5))
     * (bj,List(28.1))
     * (gz,List(32.0))
     * (bj,List(27.3))
     */
    val rdd2: RDD[(String, List[Double])] = rdd1.mapValues(List(_))
    // Aggregate by key: concatenate the per-record lists of the same city into a single list
    /** rdd3.foreach(println)
     *
     * (bj,List(28.1, 27.3, 28.2))
     * (gz,List(32.0, 33.3, 32.0))
     * (sh,List(28.7, 30.1, 29.1))
     * (sz,List(33.1, 30.5))
     */
    val rdd3: RDD[(String, List[Double])] = rdd2.reduceByKey(_ ++ _)
    // Compute the average temperature for each city
    // Equivalent map-based version, kept for reference:
    // val res: RDD[(String, Double)] = rdd3.map(lis => {
    //   val avg: Double = lis._2.sum / lis._2.size
    //   (lis._1, avg)
    // })
    val res: RDD[(String, Double)] = rdd3.mapValues(lis => lis.sum / lis.size)
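    // Expected output, computed from the sample data above (order may vary,
    // values shown rounded; actual doubles may carry floating-point noise):
    // (bj,~27.87)  (sh,~29.3)  (gz,~32.43)  (sz,31.8)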
    res.foreach(println)
    sc.stop()
  }
}
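// A minimal alternative sketch (not part of the original code): the same per-city
// average can be computed without materializing a List per city, by keeping a
// running (sum, count) pair per key with aggregateByKey. The object name
// TempDemoAggregate is just an illustrative placeholder.
object TempDemoAggregate {
  def main(args: Array[String]): Unit = {
    val sc = MySparkUtil.apply(getClass.getSimpleName)
    val data = Array(
      ("bj", 28.1), ("sh", 28.7), ("gz", 32.0), ("sz", 33.1),
      ("bj", 27.3), ("sh", 30.1), ("gz", 33.3),
      ("bj", 28.2), ("sh", 29.1), ("gz", 32.0), ("sz", 30.5))
    val rdd: RDD[(String, Double)] = sc.makeRDD(data)
    // Accumulate a (sum, count) pair per city, then divide to get the average
    val avg: RDD[(String, Double)] = rdd
      .aggregateByKey((0.0, 0))(
        (acc, v) => (acc._1 + v, acc._2 + 1),
        (a, b) => (a._1 + b._1, a._2 + b._2))
      .mapValues { case (sum, cnt) => sum / cnt }
    avg.foreach(println)
    sc.stop()
  }
}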