Spark setMaster source code
/**
 * The master URL to connect to, such as "local" to run locally with one thread, "local[4]" to
 * run locally with 4 cores, or "spark://master:7077" to run on a Spark standalone cluster.
 */
def setMaster(master: String): SparkConf = {
  set("spark.master", master)
}
That is: the master URL to connect to, such as "local" to run locally with a single thread, "local[4]" to run locally with 4 cores, or "spark://master:7077" to run on a Spark standalone cluster.
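For illustration, the sketch below shows how setMaster is typically chained with setAppName when building a SparkConf for each of the three kinds of master URLs mentioned above; the application name "demo" and the standalone master host are placeholder values, not taken from the post.

import org.apache.spark.{SparkConf, SparkContext}

object MasterUrlDemo {
  def main(args: Array[String]): Unit = {
    // "local": run locally with one thread
    val localConf = new SparkConf().setAppName("demo").setMaster("local")
    // "local[4]": run locally with 4 cores
    val local4Conf = new SparkConf().setAppName("demo").setMaster("local[4]")
    // "spark://master:7077": run on a Spark standalone cluster (placeholder host)
    val standaloneConf = new SparkConf().setAppName("demo").setMaster("spark://master:7077")

    // start a context with the local configuration and confirm the master that was set
    val sc = new SparkContext(localConf)
    println(sc.master) // prints "local"
    sc.stop()
  }
}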
package cn.rzlee.spark.scala

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

// an object is effectively static
object ScalaWordCount {
  def main(args: Array[String]): Unit = {
    // create the Spark configuration and set the application name
    val conf = new SparkConf().setAppName("wordCountApp")
    // create the Spark execution entry point
    val sc = new SparkContext(conf)
    // specify where to read the data from when creating the RDD (resilient distributed dataset)
    val lines: RDD[String] = sc.textFile("")
    // split and flatten
    val words: RDD[String] = lines.flatMap(_.split(" "))
    // pair each word with 1
    val wordAndOne: RDD[(String, Int)] = words.map((_, 1))
    // aggregate by key: the key stays the same, the values are summed
    val reduced: RDD[(String, Int)] = wordAndOne.reduceByKey(_ + _)
    // sort by count, descending
    val sorted = reduced.sortBy(_._2, false)
    // save the result to HDFS
    sorted.saveAsTextFile("")
    // release resources
    sc.stop()
  }
}
WordCount based on a sorting mechanism
Java version:
package cn.rzlee.spark.core;

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

/**
 * @Author ^_^
 * @Create 2018/11/3
 */
public class SortWordCount {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SortWordCount").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // create the lines RDD
        JavaRDD<String> lines = sc.textFile("C:\\Users\\txdyl\\Desktop\\log\\in\\data.txt", 1);
        // perform the word count
        JavaRDD<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterable<String> call(String s) throws Exception {
                return Arrays.asList(s.split("\t"));
            }
        });
        JavaPairRDD<String, Integer> pair = words.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<>(s, 1);
            }
        });
        JavaPairRDD<String, Integer> wordCounts = pair.reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        // swap key and value
        JavaPairRDD<Integer, String> countWords = wordCounts.mapToPair(new PairFunction<Tuple2<String, Integer>, Integer, String>() {
            @Override
            public Tuple2<Integer, String> call(Tuple2<String, Integer> t) throws Exception {
                return new Tuple2<>(t._2, t._1);
            }
        });
        // sort by key (the count), descending
        JavaPairRDD<Integer, String> sortedCountWords = countWords.sortByKey(false);
        // swap key and value back
        JavaPairRDD<String, Integer> sortedWordCounts = sortedCountWords.mapToPair(new PairFunction<Tuple2<Integer, String>, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(Tuple2<Integer, String> t) throws Exception {
                return new Tuple2<>(t._2, t._1);
            }
        });
        // print the result
        sortedWordCounts.foreach(new VoidFunction<Tuple2<String, Integer>>() {
            @Override
            public void call(Tuple2<String, Integer> t) throws Exception {
                System.out.println(t._1 + " appears " + t._2 + " times.");
            }
        });
        // close the JavaSparkContext
        sc.close();
    }
}
Scala version:
package cn.rzlee.spark.scala

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object SortWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("C:\\Users\\txdyl\\Desktop\\log\\in\\data.txt", 1)
    val words: RDD[String] = lines.flatMap(line => line.split("\t"))
    val pairs: RDD[(String, Int)] = words.map(word => (word, 1))
    val wordCounts: RDD[(String, Int)] = pairs.reduceByKey(_ + _)
    val countWords: RDD[(Int, String)] = wordCounts.map(wordCount => (wordCount._2, wordCount._1))
    val sortedCountWords = countWords.sortByKey(false)
    val sortedWordCounts: RDD[(String, Int)] = sortedCountWords.map(sortedCountWord => (sortedCountWord._2, sortedCountWord._1))
    sortedWordCounts.foreach(sortedWordCount => {
      println(sortedWordCount._1 + " appear " + sortedWordCount._2 + " times.")
    })
    sc.stop()
  }
}
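A note on the design choice: both sorted versions swap (word, count) into (count, word), call sortByKey(false), and swap back, because sortByKey can only sort by the key. The first Scala example already uses the shorter alternative, RDD.sortBy, which sorts by the count directly. The snippet below is a minimal sketch of that variant; it assumes a SparkContext sc built the same way as above and reuses the same input file.

    val lines = sc.textFile("C:\\Users\\txdyl\\Desktop\\log\\in\\data.txt", 1)
    val wordCounts = lines.flatMap(_.split("\t")).map((_, 1)).reduceByKey(_ + _)
    // sortBy orders by the count field directly, so no key/value swapping is needed
    val sortedWordCounts = wordCounts.sortBy(_._2, ascending = false)
    sortedWordCounts.foreach { case (word, count) => println(word + " appear " + count + " times.") }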