// 本文主要通过 Scala 演示 Spark 的基础 action 操作
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

/**
 * Small, self-contained demonstrations of basic Spark RDD operations
 * (`reduce`, `collect`, `count`, `take`, `countByKey`, plus `reduceByKey`).
 *
 * Every demo runs against a local-mode `SparkContext` that is created for the
 * demo and stopped again when the demo finishes.
 *
 * @author jhp
 */
object ActionOperation {

  /** Sample data shared by the numeric demos: the integers 1 through 10. */
  private val SampleNumbers: Array[Int] = (1 to 10).toArray

  /**
   * Runs the action demos one after another.
   *
   * `reduceByKey()` is not part of this sequence; call it directly if needed.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    reduce()
    collect()
    count()
    take()
    countByKey()
  }

  /**
   * Creates a local-mode `SparkContext`, lends it to `body`, and always stops
   * it afterwards.
   *
   * Spark permits only one active `SparkContext` per JVM, so each demo must
   * release its context before the next demo creates a new one; the `finally`
   * guarantees that even when `body` throws.
   *
   * @param appName application name registered with Spark
   * @param body    work to run with the live context
   * @tparam T      result type of `body`
   * @return whatever `body` returns
   */
  private def withLocalContext[T](appName: String)(body: SparkContext => T): T = {
    val conf = new SparkConf()
      .setAppName(appName)
      .setMaster("local")
    val sc = new SparkContext(conf)
    try body(sc)
    finally sc.stop()
  }

  /** Sums the sample numbers with `reduce` and prints the total. */
  def reduce(): Unit = withLocalContext("reduce") { sc =>
    val numbers = sc.parallelize(SampleNumbers, 1)
    val sum = numbers.reduce(_ + _)
    println(sum)
  }

  /**
   * Doubles every sample number, pulls the results back to the driver with
   * `collect`, and prints them one per line.
   */
  def collect(): Unit = withLocalContext("collect") { sc =>
    val numbers = sc.parallelize(SampleNumbers, 1)
    val doubled = numbers.map(_ * 2)
    doubled.collect().foreach(println)
  }

  /** Counts the elements of the sample RDD and prints the count. */
  def count(): Unit = withLocalContext("count") { sc =>
    val numbers = sc.parallelize(SampleNumbers, 1)
    val total = numbers.count()
    println(total)
  }

  /**
   * Fetches the first three elements of the sample RDD with `take(3)` and
   * prints them one per line.
   *
   * Note: `take` returns the leading elements in RDD order; it does not sort,
   * so this is "first three", not "largest three".
   */
  def take(): Unit = withLocalContext("take") { sc =>
    val numbers = sc.parallelize(SampleNumbers, 1)
    val firstThree = numbers.take(3)
    firstThree.foreach(println)
  }

  /**
   * Counts how many students belong to each class with `countByKey` and
   * prints the resulting map.
   */
  def countByKey(): Unit = withLocalContext("countByKey") { sc =>
    val studentList = Array(
      ("class1", "leo"),
      ("class2", "jack"),
      ("class1", "tom"),
      ("class2", "jen"),
      ("class2", "marry")
    )
    val students = sc.parallelize(studentList, 1)
    val studentCounts = students.countByKey()
    println(studentCounts)
  }

  /**
   * Adds up the scores of each class with `reduceByKey` and prints one
   * `class: total` line per class.
   */
  def reduceByKey(): Unit = withLocalContext("reduceByKey") { sc =>
    val scoreList = Array(
      ("class1", 80),
      ("class2", 75),
      ("class1", 90),
      ("class2", 60)
    )
    val scores = sc.parallelize(scoreList, 1)
    val totalScores = scores.reduceByKey(_ + _)
    totalScores.foreach { case (className, total) => println(s"$className: $total") }
  }
}