Method 1: groupBy()
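All of the snippets below assume the standard Spark imports (and that each main method lives inside an enclosing object); they are listed once here rather than repeated in every example:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD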
def main(args: Array[String]): Unit = {
  // Create the Spark configuration object
  val sparkConf: SparkConf = new SparkConf().setAppName("Spark_RDD").setMaster("local[*]")
  // Create the SparkContext, the connection to the Spark environment
  val sc: SparkContext = new SparkContext(sparkConf)
  // Build the RDD and apply the RDD operations
  val words: RDD[String] = sc.makeRDD(List("Hello", "Hello", "Hello", "Spark"))
  // Group identical words together: (word, all occurrences of that word)
  val wordGroup: RDD[(String, Iterable[String])] = words.groupBy(word => word)
  // The count of each word is simply the size of its group
  val wordCount: Array[(String, Int)] = wordGroup.map(tup => (tup._1, tup._2.size)).collect()
  println(wordCount.mkString(" "))
  // Release the connection
  sc.stop()
}
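Run locally, this prints something like (Hello,3) (Spark,1); the order of the pairs is not guaranteed.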
Method 2: reduceByKey()
def main(args: Array[String]): Unit = {
  // Create the Spark configuration object
  val sparkConf: SparkConf = new SparkConf().setAppName("Spark_RDD").setMaster("local[*]")
  // Create the SparkContext, the connection to the Spark environment
  val sc: SparkContext = new SparkContext(sparkConf)
  // Build the RDD and apply the RDD operations
  val words: RDD[String] = sc.makeRDD(List("Hello", "Hello", "Hello", "Spark"))
  // Map each word to (word, 1), then sum the 1s per key
  val wordCount: RDD[(String, Int)] = words.map(word => (word, 1)).reduceByKey(_ + _)
  println(wordCount.collect().mkString(" "))
  // Release the connection
  sc.stop()
}
Method 3: groupByKey()
def main(args: Array[String]): Unit = {
  // Create the Spark configuration object
  val sparkConf: SparkConf = new SparkConf().setAppName("Spark_RDD").setMaster("local[*]")
  // Create the SparkContext, the connection to the Spark environment
  val sc: SparkContext = new SparkContext(sparkConf)
  // Build the RDD and apply the RDD operations
  val words: RDD[String] = sc.makeRDD(List("Hello", "Hello", "Hello", "Spark"))
  // Map each word to (word, 1) and group all the 1s per key
  val wordsGroup: RDD[(String, Iterable[Int])] = words.map(word => (word, 1)).groupByKey()
  // Summing the grouped 1s gives the count per word
  val wordCounts: RDD[(String, Int)] = wordsGroup.map(tup => (tup._1, tup._2.sum))
  println(wordCounts.collect().mkString(" "))
  // Release the connection
  sc.stop()
}
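Note that groupByKey shuffles every (word, 1) pair to the reducer before summing, while reduceByKey in Method 2 pre-aggregates within each partition before the shuffle; for an aggregation like word count, reduceByKey is therefore generally the better choice.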
Method 4: aggregateByKey()
def main(args: Array[String]): Unit = {
  // Create the Spark configuration object
  val sparkConf: SparkConf = new SparkConf().setAppName("Spark_RDD").setMaster("local[*]")
  // Create the SparkContext, the connection to the Spark environment
  val sc: SparkContext = new SparkContext(sparkConf)
  // Build the RDD and apply the RDD operations
  val words = sc.parallelize(List(("a", 3), ("a", 2), ("c", 4), ("b", 3), ("c", 6), ("c", 8)), 2)
  // aggregateByKey takes a zero value plus two functions:
  // the first merges values within a partition, the second merges partial results across partitions
  val result: RDD[(String, Int)] = words.aggregateByKey(0)(
    (x, y) => x + y, // within a partition
    (x, y) => x + y  // across partitions
  )
  // Note: saveAsTextFile fails if the "output" directory already exists
  result.saveAsTextFile("output")
  // Release the connection
  sc.stop()
}
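When both functions are the same, as above, aggregateByKey behaves just like foldByKey. A minimal sketch of a case where the two functions genuinely differ (keeping the maximum per key within each partition, then summing those maxima across partitions), assuming the same SparkContext sc and the same pair data as above:

// Take the max value per key inside each partition, then add the per-partition maxima together
val pairs: RDD[(String, Int)] = sc.parallelize(List(("a", 3), ("a", 2), ("c", 4), ("b", 3), ("c", 6), ("c", 8)), 2)
val maxThenSum: RDD[(String, Int)] = pairs.aggregateByKey(0)(
  (acc, v) => math.max(acc, v), // within a partition: keep the largest value per key
  (a, b) => a + b               // across partitions: sum the partition maxima
)
println(maxThenSum.collect().mkString(" "))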
Method 5: foldByKey()
def main(args: Array[String]): Unit = {
  // Create the Spark configuration object
  val sparkConf: SparkConf = new SparkConf().setAppName("Spark_RDD").setMaster("local[*]")
  // Create the SparkContext, the connection to the Spark environment
  val sc: SparkContext = new SparkContext(sparkConf)
  // Build the RDD and apply the RDD operations
  val words = sc.parallelize(List(("a", 3), ("a", 2), ("c", 4), ("b", 3), ("c", 6), ("c", 8)), 2)
  // foldByKey is aggregateByKey with a single function used both within and across partitions
  val result: RDD[(String, Int)] = words.foldByKey(0)(_ + _)
  // Note: saveAsTextFile fails if the "output" directory already exists
  result.saveAsTextFile("output")
  // Release the connection
  sc.stop()
}
Method 6: combineByKey()
def main(args: Array[String]): Unit = {
  // Create the Spark configuration object
  val sparkConf: SparkConf = new SparkConf().setAppName("Spark_RDD").setMaster("local[*]")
  // Create the SparkContext, the connection to the Spark environment
  val sc: SparkContext = new SparkContext(sparkConf)
  // Build the RDD and apply the RDD operations
  val words = sc.parallelize(List(("a", 3), ("a", 2), ("c", 4), ("b", 3), ("c", 6), ("c", 8)), 2)
  val wordCount: RDD[(String, Int)] = words.combineByKey(
    num => num,                                    // createCombiner: the first value of a key becomes the initial accumulator
    (count: Int, num: Int) => count + num,         // mergeValue: fold further values into the accumulator within a partition
    (count1: Int, count2: Int) => count1 + count2  // mergeCombiners: merge accumulators across partitions
  )
  // Note: saveAsTextFile fails if the "output" directory already exists
  wordCount.saveAsTextFile("output")
  // Release the connection
  sc.stop()
}
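For a plain sum, combineByKey adds little over reduceByKey, since the accumulator type equals the value type. A minimal sketch of the case it is really designed for, where createCombiner changes the value type, here computing a per-key average through a (sum, count) accumulator; it assumes the same SparkContext sc and pair RDD words as above:

// Accumulate (sum, count) per key, then divide to get the average value per key
val sumCount: RDD[(String, (Int, Int))] = words.combineByKey(
  (v: Int) => (v, 1),                                           // createCombiner: first value -> (value, 1)
  (acc: (Int, Int), v: Int) => (acc._1 + v, acc._2 + 1),        // mergeValue: add a value within a partition
  (a: (Int, Int), b: (Int, Int)) => (a._1 + b._1, a._2 + b._2)  // mergeCombiners: merge partition accumulators
)
val avg: RDD[(String, Double)] = sumCount.map { case (k, (sum, cnt)) => (k, sum.toDouble / cnt) }
println(avg.collect().mkString(" "))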
Method 7 + Method 8: countByKey() / countByValue()
def main(args: Array[String]): Unit = {
  // Create the Spark configuration object
  val sparkConf: SparkConf = new SparkConf().setAppName("Spark_RDD").setMaster("local[*]")
  // Create the SparkContext, the connection to the Spark environment
  val sc: SparkContext = new SparkContext(sparkConf)
  // Build the RDD and apply the RDD operations
  val rdd1 = sc.parallelize(List(("a", 1), ("a", 1), ("c", 1), ("b", 1), ("c", 1), ("c", 1)), 2)
  val rdd2 = sc.makeRDD(List("a", "a", "a", "a", "b", "b"))
  // countByKey counts how many pairs share each key; countByValue counts how often each element occurs.
  // Both are actions that return a local Map to the driver rather than an RDD.
  println(rdd1.countByKey().mkString(" "))
  println(rdd2.countByValue().mkString(" "))
  // Release the connection
  sc.stop()
}