使用Spark写WordCount

框架搭建

使用IDEA搭建同时可以写Scala语言和Java语言的Maven项目,步骤:
File ---> New ---> Project
选择Maven项目,不要勾选任何选项,一直next到finish;
pom.xml如下:


    
    <?xml version="1.0" encoding="UTF-8"?>
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
        <modelVersion>4.0.0</modelVersion>
        <groupId>com.tao</groupId>
        <artifactId>spark</artifactId>
        <version>1.0-SNAPSHOT</version>

        <properties>
            <spark.version>2.1.0</spark.version>
            <scala.version>2.11.8</scala.version>
        </properties>

        <dependencies>
            <dependency>
                <groupId>org.scala-lang</groupId>
                <artifactId>scala-library</artifactId>
                <version>${scala.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.spark</groupId>
                <artifactId>spark-core_2.11</artifactId>
                <version>${spark.version}</version>
            </dependency>
            <dependency>
                <groupId>org.apache.hadoop</groupId>
                <artifactId>hadoop-client</artifactId>
                <version>2.6.5</version>
            </dependency>
        </dependencies>

        <build>
            <sourceDirectory>src/main/scala</sourceDirectory>
            <testSourceDirectory>src/test/scala</testSourceDirectory>
            <pluginManagement>
                <plugins>
                    <plugin>
                        <groupId>net.alchim31.maven</groupId>
                        <artifactId>scala-maven-plugin</artifactId>
                        <version>3.2.2</version>
                    </plugin>
                    <plugin>
                        <groupId>org.apache.maven.plugins</groupId>
                        <artifactId>maven-compiler-plugin</artifactId>
                        <version>3.7.0</version>
                    </plugin>
                </plugins>
            </pluginManagement>
            <plugins>
                <plugin>
                    <groupId>net.alchim31.maven</groupId>
                    <artifactId>scala-maven-plugin</artifactId>
                    <executions>
                        <execution>
                            <id>scala-compile-first</id>
                            <phase>process-resources</phase>
                            <goals>
                                <goal>add-source</goal>
                                <goal>compile</goal>
                            </goals>
                        </execution>
                        <execution>
                            <id>scala-test-compile</id>
                            <phase>process-test-resources</phase>
                            <goals>
                                <goal>testCompile</goal>
                            </goals>
                        </execution>
                    </executions>
                </plugin>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-shade-plugin</artifactId>
                    <version>2.4.3</version>
                    <executions>
                        <execution>
                            <phase>package</phase>
                            <goals>
                                <goal>shade</goal>
                            </goals>
                            <configuration>
                                <filters>
                                    <filter>
                                        <artifact>*:*</artifact>
                                        <excludes>
                                            <exclude>META-INF/*.SF</exclude>
                                            <exclude>META-INF/*.DSA</exclude>
                                            <exclude>META-INF/*.RSA</exclude>
                                        </excludes>
                                    </filter>
                                </filters>
                            </configuration>
                        </execution>
                    </executions>
                </plugin>
            </plugins>
        </build>
    </project>

使用该pom文件,Maven会自动下载所需的全部依赖;

Scala写WordCount

代码如下:

/**
  *  Taoyongpan
  *  Created in 12:47 2018/5/24
  */
/**
  * Word-count driver: reads text from args(0), counts each word's
  * occurrences, and writes the (word, count) pairs sorted by ascending
  * count to args(1).
  *
  * Author: Taoyongpan, created 12:47 2018/5/24
  */
object WordCount {
  def main(args: Array[String]): Unit = {

    // Run locally; the app name is what shows up in the Spark UI.
    val conf = new SparkConf().setAppName("ScalaWorkContext").setMaster("local")
    // SparkContext is the entry point of every Spark program.
    val sc = new SparkContext(conf)

    // One-liner equivalent of the whole pipeline below:
    // sc.textFile(args(0)).flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).saveAsTextFile(args(1))

    // Load the input path as an RDD of lines.
    val inputLines: RDD[String] = sc.textFile(args(0))
    // Split every line on single spaces and flatten into one RDD of words.
    val tokens: RDD[String] = inputLines.flatMap(line => line.split(" "))
    // Pair each word with an initial count of 1.
    val pairs: RDD[(String, Int)] = tokens.map(word => (word, 1))
    // Sum the 1s per distinct word.
    val counts: RDD[(String, Int)] = pairs.reduceByKey((a, b) => a + b)
    // Sort ascending by count.
    val ordered: RDD[(String, Int)] = counts.sortBy(pair => pair._2)
    // For descending order instead:
    // val ordered: RDD[(String, Int)] = counts.sortBy(pair => pair._2, false)
    // Write the result to the output path.
    ordered.saveAsTextFile(args(1))
    sc.stop()
  }
}

Java写 WordCount

代码如下:

/**
 * Author: Taoyongpan
 * Date: Created in 13:16 2018/5/24
 */
public class WordCount {
    public static void main(String[] args) {

        //
        SparkConf conf = new SparkConf();
        conf.setAppName("WordCount").setMaster("local");
        //创建程序执行的入口
        JavaSparkContext jsc = new JavaSparkContext(conf);
        //Spark程序
        //指定从哪里读取数据
        final JavaRDD lines = jsc.textFile(args[0]);
        //切分压平
        JavaRDD words = lines.flatMap(new FlatMapFunction() {
            public Iterator call(String line) throws Exception {
                String[] words = line.split(" ");
                return Arrays.asList(words).iterator();
            }
        });
        //将单词和1放在一起
        JavaPairRDD wordAndOne = words.mapToPair(new PairFunction() {
            public Tuple2 call(String word) throws Exception {
                return new Tuple2(word, 1);
            }
        });
        //聚合
        JavaPairRDD reduced = wordAndOne.reduceByKey(new Function2() {
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        //排序,Java的RDD只支持SortByKey,调换单词和次数的顺序
        JavaPairRDD swaped = reduced.mapToPair(new PairFunction, Integer, String>() {
            public Tuple2 call(Tuple2 tp) throws Exception {
                return tp.swap();
            }
        });
        //排序
        JavaPairRDD sorted = swaped.sortByKey();
        //再调换顺序
        JavaPairRDD res = sorted.mapToPair(new PairFunction, String, Integer>() {
            public Tuple2 call(Tuple2 tp) throws Exception {
                return tp.swap();
            }
        });
        //保存
        res.saveAsTextFile(args[1]);
        //释放资源
        jsc.stop();
    }
}

未完待续。。。

你可能感兴趣的:(使用Spark写WordCount)