方法一:map + reduceByKey
package com.cw.bigdata.spark.wordcount
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Word count, approach 1: pair every word with `1`, then sum the pairs per word
 * with `reduceByKey`.
 */
object WordCount1 {

  /**
   * Reads the text input at the relative path `"in"`, counts how often each
   * space-separated token occurs, and prints one `(word, count)` tuple per line.
   *
   * NOTE: tokens come from `split(" ")`, so blank lines and runs of consecutive
   * spaces produce empty-string tokens that are counted like any other word.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val config: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount1")
    val sc: SparkContext = new SparkContext(config)
    try {
      val lines: RDD[String] = sc.textFile("in")
      lines
        .flatMap(_.split(" "))
        .map((_, 1))
        .reduceByKey(_ + _)
        .collect()
        .foreach(println)
    } finally {
      // Release the context even when the job throws; the original never stopped it.
      sc.stop()
    }
  }
}
方法二:使用countByValue代替map + reduceByKey
package com.cw.bigdata.spark.wordcount
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Word count, approach 2: let `countByValue` do the counting instead of
 * `map` + `reduceByKey`.
 */
object WordCount2 {

  /**
   * Reads the text input at the relative path `"in"`, counts how often each
   * space-separated token occurs via `countByValue`, and prints one
   * `(word, count)` entry per line.
   *
   * NOTE: tokens come from `split(" ")`, so blank lines and runs of consecutive
   * spaces produce empty-string tokens that are counted like any other word.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val config: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount2")
    val sc: SparkContext = new SparkContext(config)
    try {
      val lines: RDD[String] = sc.textFile("in")
      // countByValue is an action: its result is already a local map, so no collect() is needed.
      lines
        .flatMap(_.split(" "))
        .countByValue()
        .foreach(println)
    } finally {
      // Release the context even when the job throws; the original never stopped it.
      sc.stop()
    }
  }
}
方法三:aggregateByKey或者foldByKey
package com.cw.bigdata.spark.wordcount
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
/**
 * Word count, approach 3: sum per-word counts with `aggregateByKey` and,
 * as an alternative, with `foldByKey`.
 */
object WordCount3 {

  /**
   * Reads the text input at the relative path `"in"` and prints the
   * `(word, count)` tuples twice: first computed with `aggregateByKey`,
   * then with `foldByKey`.
   *
   * NOTE: tokens come from `split(" ")`, so blank lines and runs of consecutive
   * spaces produce empty-string tokens that are counted like any other word.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val config: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount3")
    val sc: SparkContext = new SparkContext(config)
    try {
      val lines: RDD[String] = sc.textFile("in")
      // Both variants start from the same (word, 1) pairs; define that pipeline once.
      val pairs: RDD[(String, Int)] = lines.flatMap(_.split(" ")).map((_, 1))

      // Zero value 0, with addition used both within and across partitions.
      pairs.aggregateByKey(0)(_ + _, _ + _).collect().foreach(println)

      // foldByKey: same zero value, one function used for both merge steps.
      pairs.foldByKey(0)(_ + _).collect().foreach(println)
    } finally {
      // Release the context even when the job throws; the original never stopped it.
      sc.stop()
    }
  }
}
方法四:groupByKey+map
package com.cw.bigdata.spark.wordcount
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
/**
 * Word count, approach 4: group the `(word, 1)` pairs with `groupByKey`,
 * then sum each group in a following `map`.
 */
object WordCount4 {

  /**
   * Reads the text input at the relative path `"in"`, groups the per-word
   * ones by word, sums each group, and prints one `(word, count)` tuple per line.
   *
   * NOTE: tokens come from `split(" ")`, so blank lines and runs of consecutive
   * spaces produce empty-string tokens that are counted like any other word.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val config: SparkConf = new SparkConf().setMaster("local[*]").setAppName("WordCount4")
    val sc: SparkContext = new SparkContext(config)
    try {
      val lines: RDD[String] = sc.textFile("in")
      val groupByKeyRDD: RDD[(String, Iterable[Int])] =
        lines.flatMap(_.split(" ")).map((_, 1)).groupByKey()

      // Each group holds one `1` per occurrence, so its sum is the word's count.
      groupByKeyRDD
        .map { case (word, ones) => (word, ones.sum) }
        .collect()
        .foreach(println)
    } finally {
      // Release the context even when the job throws; the original never stopped it.
      sc.stop()
    }
  }
}
方法五:Scala原生实现wordcount
package com.cw.bigdata.spark.wordcount
/**
 * Word count, approach 5: plain Scala collections only, no Spark.
 * Prints the intermediate result of every step.
 */
object WordCount5 {

  /**
   * Counts the words of four hard-coded sentences and prints each stage:
   * the flattened word list (built two equivalent ways), the `(word, 1)` pairs,
   * the pairs grouped by word, and finally the per-word counts.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sentences = List("cw is cool", "wc is beautiful", "andy is beautiful", "mike is cool")

    // Step 1: split into words, once via map + flatten and once via flatMap.
    val wordsViaFlatten = sentences.map(sentence => sentence.split(" ")).flatten
    val words = sentences.flatMap(sentence => sentence.split(" "))
    println("第一步结果")
    println(wordsViaFlatten)
    println(words)

    // Step 2: tag every word with an initial count of 1.
    val tagged = words.map(word => (word, 1))
    println("第二步结果")
    println(tagged)

    // Step 3: collect the pairs that share the same word.
    val grouped = tagged.groupBy { case (word, _) => word }
    println("第三步结果")
    println(grouped)

    // Step 4: the size of each group is that word's number of occurrences.
    val counts = grouped.mapValues(occurrences => occurrences.size)
    println("最后一步结果")
    println(counts.toBuffer)
  }
}
方法六:combineByKey
package com.cw.bigdata.spark.wordcount
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
/**
 * Word count, approach 6: sum per-word counts with `combineByKey`.
 */
object WordCount6 {

  /**
   * Reads the text input at the relative path `"in"`, combines the `(word, 1)`
   * pairs per word with `combineByKey`, and prints one `(word, count)` tuple
   * per line.
   *
   * NOTE: tokens come from `split(" ")`, so blank lines and runs of consecutive
   * spaces produce empty-string tokens that are counted like any other word.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val config: SparkConf = new SparkConf().setMaster("local[*]").setAppName("combineByKey")
    val sc: SparkContext = new SparkContext(config)
    try {
      val lines: RDD[String] = sc.textFile("in")
      val mapRDD: RDD[(String, Int)] = lines.flatMap(_.split(" ")).map((_, 1))

      // The three functions combineByKey expects, named for readability.
      val createCombiner: Int => Int = first => first                      // first value seen for a key
      val mergeValue: (Int, Int) => Int = (acc, value) => acc + value      // fold another value into a combiner
      val mergeCombiners: (Int, Int) => Int = (left, right) => left + right // merge two combiners

      mapRDD
        .combineByKey(createCombiner, mergeValue, mergeCombiners)
        .collect()
        .foreach(println)
    } finally {
      // Release the context even when the job throws; the original never stopped it.
      sc.stop()
    }
  }
}