数据准备
/**
 * Word-count exercise driver.
 *
 * Creates a local SparkContext, builds a small RDD of words split across two
 * partitions, and always stops the context when `main` finishes.
 */
object WordCountExer {
  /**
   * Entry point.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("file - RDD")
    val sc = new SparkContext(sparkConf)
    // try/finally guarantees the context is released even if a job below throws.
    try {
      val list = List("hello", "world", "hello", "scala", "hello", "spark", "spark")
      val rdd3: RDD[String] = sc.makeRDD(list, 2)
      // The word-count variants shown after this object use `sc`, `list` and `rdd3`,
      // so they are meant to be placed here, before the context is stopped.
    } finally {
      sc.stop()
    }
  }
}
groupBy
// Variant 1: group identical words together, then count the size of each group.
val res2: RDD[(String, Int)] = rdd3
  .groupBy(word => word)
  .map { case (word, occurrences) => (word, occurrences.size) }
println(res2.collect().mkString("1:", ",", ""))
groupByKey
// Variant 2: pair every word with 1, group the ones by word, then sum each group.
val res3: RDD[(String, Int)] = rdd3
  .map(word => (word, 1))
  .groupByKey()
  .map { case (word, ones) => (word, ones.sum) }
println(res3.collect().mkString("2:", ",", ""))
reduceByKey
// Variant 3: pair every word with 1 and add the ones up per word.
val res4: RDD[(String, Int)] = rdd3
  .map(word => (word, 1))
  .reduceByKey((left, right) => left + right)
println(res4.collect().mkString("3:", ",", ""))
aggregateByKey
// Variant 4: aggregateByKey starting from 0; the first function adds a value to the
// running count, the second adds two counts together.
val res5: RDD[(String, Int)] = rdd3
  .map(word => (word, 1))
  .aggregateByKey(0)(
    (count, one) => count + one,
    (left, right) => left + right
  )
println(res5.collect().mkString("4:", ",", ""))
foldByKey
// Variant 5: foldByKey starting from 0, using addition as the single merge function.
val res6: RDD[(String, Int)] = rdd3
  .map(word => (word, 1))
  .foldByKey(0)((left, right) => left + right)
println(res6.collect().mkString("5:", ",", ""))
combineByKey
// Variant 6: combineByKey. The first value seen for a key is used as the initial
// count, later values are added to it, and two counts are merged by addition.
val res7: RDD[(String, Int)] = rdd3
  .map(word => (word, 1))
  .combineByKey[Int](
    (first: Int) => first,
    (count: Int, one: Int) => count + one,
    (left: Int, right: Int) => left + right
  )
println(res7.collect().mkString("6:", ",", ""))
countByValue
// Variant 7: countByValue is called directly on the RDD of words; the result is a
// local Map from word to count, so no collect() is needed before printing.
val res8: collection.Map[String, Long] =
  rdd3.countByValue()
println(res8.mkString("7:", ",", ""))
cogroup
// Variant 8: cogroup. Both sides are (word, 1) pairs built from the same word list;
// the count for a word is the number of ones contributed by the left-hand RDD.
val rdd4: RDD[(String, Int)] = sc.makeRDD(list).map(word => (word, 1))
val rdd5: RDD[(String, (Iterable[Int], Iterable[Int]))] =
  rdd3.map(word => (word, 1)).cogroup(rdd4)
val res9: RDD[(String, Int)] = rdd5.map {
  case (word, (leftOnes, _)) => (word, leftOnes.size)
}
println(res9.collect().mkString("8:", ",", ""))
aggregate
// Variant 9: aggregate with a mutable word -> count map as the zero value.
val map: mutable.Map[String, Int] = mutable.Map.empty[String, Int]
val res10: mutable.Map[String, Int] = rdd3.aggregate(map)(
  // Add one word to the running counts and hand the same map back.
  (counts, word) => {
    counts.update(word, counts.getOrElse(word, 0) + 1)
    counts
  },
  // Merge the first map's counts into the second map and return the second map.
  (source, target) => {
    source.foreach { case (word, count) =>
      target.update(word, target.getOrElse(word, 0) + count)
    }
    target
  }
)
println(res10.mkString("9:", ",", ""))
fold
// Variant 10: turn every word into a single-entry map (word -> 1), then fold the
// maps together, starting from an empty map.
val mapZero: mutable.Map[String, Int] = mutable.Map.empty[String, Int]
val res11: mutable.Map[String, Int] = rdd3
  .map(word => mutable.Map(word -> 1))
  .fold(mapZero) { (merged: mutable.Map[String, Int], next: mutable.Map[String, Int]) =>
    // Add every count from `next` into `merged`, which is the map handed back.
    next.foreach { case (word, count) =>
      merged.update(word, merged.getOrElse(word, 0) + count)
    }
    merged
  }
println(res11.mkString("10:", ",", ""))
打印结果
1:(scala,1),(hello,3),(world,1),(spark,2)
2:(scala,1),(hello,3),(world,1),(spark,2)
3:(scala,1),(hello,3),(world,1),(spark,2)
4:(scala,1),(hello,3),(world,1),(spark,2)
5:(scala,1),(hello,3),(world,1),(spark,2)
6:(scala,1),(hello,3),(world,1),(spark,2)
7:scala -> 1,hello -> 3,world -> 1,spark -> 2
8:(spark,2),(scala,1),(world,1),(hello,3)
9:spark -> 2,scala -> 1,world -> 1,hello -> 3
10:spark -> 2,scala -> 1,world -> 1,hello -> 3