8 Ways to Implement WordCount in Spark
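
All eight variants below share the same skeleton: build a SparkConf and a SparkContext, create an RDD of words (or of pre-paired (word, count) tuples), apply one pair-RDD operator, and stop the context. The snippets assume the standard Spark core imports; a minimal sketch of the shared setup:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD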

Method 1: groupBy()

def main(args: Array[String]): Unit = {
     
        // TODO Create the Spark configuration object
        val sparkConf: SparkConf = new SparkConf().setAppName("Spark_RDD").setMaster("local[*]")
        // TODO Create the Spark environment connection object
        val sc: SparkContext = new SparkContext(sparkConf)
        // TODO Build the RDD and apply RDD operations
        val words: RDD[String] = sc.makeRDD(List("Hello", "Hello", "Hello", "Spark"))
        val wordGroup: RDD[(String, Iterable[String])] = words.groupBy(word => word)
        val wordCount: Array[(String, Int)] = wordGroup.map(tup => (tup._1, tup._2.size)).collect()
        println(wordCount.mkString(" "))
        // TODO Release the connection
        sc.stop()
}
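
groupBy shuffles every occurrence of every word and keeps each group in memory as an Iterable before its size is taken. A slightly more idiomatic variant of the counting step uses mapValues, which transforms only the values and leaves the keys untouched; a minimal sketch reusing wordGroup from above:

        // Equivalent to mapping over the tuples, but only the values are transformed
        val wordCount2: Array[(String, Int)] = wordGroup.mapValues(_.size).collect()
        println(wordCount2.mkString(" "))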

Method 2: reduceByKey()

def main(args: Array[String]): Unit = {
     
        // TODO Create the Spark configuration object
        val sparkConf: SparkConf = new SparkConf().setAppName("Spark_RDD").setMaster("local[*]")
        // TODO Create the Spark environment connection object
        val sc: SparkContext = new SparkContext(sparkConf)
        // TODO Build the RDD and apply RDD operations
        val words: RDD[String] = sc.makeRDD(List("Hello", "Hello", "Hello", "Spark"))
        val wordCount: RDD[(String, Int)] = words.map(word=>(word, 1)).reduceByKey(_+_)
        println(wordCount.collect().mkString(" "))

        // TODO Release the connection
        sc.stop()
}

Method 3: groupByKey()

def main(args: Array[String]): Unit = {
     
        // TODO Create the Spark configuration object
        val sparkConf: SparkConf = new SparkConf().setAppName("Spark_RDD").setMaster("local[*]")
        // TODO Create the Spark environment connection object
        val sc: SparkContext = new SparkContext(sparkConf)
        // TODO Build the RDD and apply RDD operations
        val words: RDD[String] = sc.makeRDD(List("Hello", "Hello", "Hello", "Spark"))
        val wordsGroup: RDD[(String, Iterable[Int])] = words.map(word=>(word, 1)).groupByKey()
        val wordCounts: RDD[(String, Int)] = wordsGroup.map(tup=>(tup._1,tup._2.sum))
        println(wordCounts.collect().mkString(" "))

        // TODO Release the connection
        sc.stop()
}

Method 4: aggregateByKey()

def main(args: Array[String]): Unit = {
     
        // TODO Create the Spark configuration object
        val sparkConf: SparkConf = new SparkConf().setAppName("Spark_RDD").setMaster("local[*]")
        // TODO Create the Spark environment connection object
        val sc: SparkContext = new SparkContext(sparkConf)
        // TODO Build the RDD and apply RDD operations
        val words = sc.parallelize(List(("a",3),("a",2),("c",4),("b",3),("c",6),("c",8)),2)

        val result: RDD[(String, Int)] = words.aggregateByKey(0)(
            (x, y) => x + y, // seqOp: merge a value into the accumulator within a partition
            (x, y) => x + y  // combOp: merge accumulators across partitions
        )
        result.saveAsTextFile("output")

        // TODO Release the connection
        sc.stop()
}
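
The demo above starts from already-paired (key, value) data and writes the result with saveAsTextFile, which fails if the "output" directory already exists. Applied to raw words, the same operator produces a word count; a minimal sketch reusing the word list from methods 1-3:

        val rawWords: RDD[String] = sc.makeRDD(List("Hello", "Hello", "Hello", "Spark"))
        val counts: RDD[(String, Int)] = rawWords
            .map(word => (word, 1))
            // zero value 0; the same sum function serves as both seqOp and combOp
            .aggregateByKey(0)(_ + _, _ + _)
        println(counts.collect().mkString(" "))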

Method 5: foldByKey()

def main(args: Array[String]): Unit = {
     
        // TODO Create the Spark configuration object
        val sparkConf: SparkConf = new SparkConf().setAppName("Spark_RDD").setMaster("local[*]")
        // TODO Create the Spark environment connection object
        val sc: SparkContext = new SparkContext(sparkConf)
        // TODO Build the RDD and apply RDD operations
        val words = sc.parallelize(List(("a",3),("a",2),("c",4),("b",3),("c",6),("c",8)),2)

        val result: RDD[(String, Int)] = words.foldByKey(0)(_+_)
        result.saveAsTextFile("output")

        // TODO Release the connection
        sc.stop()
}
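
foldByKey behaves like reduceByKey with an explicit zero value (applied once per key in each partition); with a zero of 0 the two operators produce identical results. A minimal equivalence sketch on the same paired data:

        // Both yield the per-key sums (a,5), (b,3), (c,18) for the data above
        val viaFold: RDD[(String, Int)] = words.foldByKey(0)(_ + _)
        val viaReduce: RDD[(String, Int)] = words.reduceByKey(_ + _)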

Method 6: combineByKey()

def main(args: Array[String]): Unit = {
     
        // TODO Create the Spark configuration object
        val sparkConf: SparkConf = new SparkConf().setAppName("Spark_RDD").setMaster("local[*]")
        // TODO Create the Spark environment connection object
        val sc: SparkContext = new SparkContext(sparkConf)
        // TODO Build the RDD and apply RDD operations
        val words = sc.parallelize(List(("a", 3), ("a", 2), ("c", 4), ("b", 3), ("c", 6), ("c", 8)), 2)

        val wordCount: RDD[(String, Int)] = words.combineByKey(
            num => num,                                    // createCombiner: build the initial combiner from the first value seen for a key
            (count: Int, num: Int) => count + num,         // mergeValue: merge another value into the combiner within a partition
            (count1: Int, count2: Int) => count1 + count2  // mergeCombiners: merge combiners across partitions
        )

        wordCount.saveAsTextFile("output")

        // TODO Release the connection
        sc.stop()
}

Methods 7 & 8: countByKey() and countByValue()

def main(args: Array[String]): Unit = {
     
        // TODO Create the Spark configuration object
        val sparkConf: SparkConf = new SparkConf().setAppName("Spark_RDD").setMaster("local[*]")
        // TODO Create the Spark environment connection object
        val sc: SparkContext = new SparkContext(sparkConf)
        // TODO Build the RDD and apply RDD operations
        val rdd1 = sc.parallelize(List(("a", 1), ("a", 1), ("c", 1), ("b", 1), ("c", 1), ("c", 1)), 2)
        val rdd2 = sc.makeRDD(List("a","a","a","a","b","b"))
        println(rdd1.countByKey().mkString(" "))
        println(rdd2.countByValue().mkString(" "))

        // TODO Release the connection
        sc.stop()
}
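
countByKey counts how many records share each key (the values themselves are ignored), while countByValue counts how many times each complete element occurs; both are actions that return a Map to the driver. For the sample data above the expected contents are (entry order may vary):

        // rdd1.countByKey()   -> Map(a -> 2, b -> 1, c -> 3)
        // rdd2.countByValue() -> Map(a -> 4, b -> 2)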
