Spark, Part 1: Three Ways to Implement WordCount

Method 1: Scala (in the spark-shell)

Step by step

scala> val rdd1 = sc.textFile("/home/centos/tmp/test.txt")
scala> val rdd2 = rdd1.flatMap(line => line.split(" "))
scala> val rdd3 = rdd2.map(word => (word, 1))
scala> val rdd4 = rdd3.reduceByKey(_ + _)
scala> rdd4.collect
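
As a quick sanity check, assume test.txt contains the two lines "hello world" and "hello spark" (a hypothetical input). Collecting the result would then print something like the following; the order of the tuples is not guaranteed:

scala> rdd4.collect
res0: Array[(String, Int)] = Array((hello,2), (world,1), (spark,1))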

All in one line

sc.textFile("/home/centos/test.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_ + _).collect

Filtering words

sc.textFile("/home/centos/test.txt").flatMap(_.split(" ")).filter(_.contains("wor")).map((_,1)).reduceByKey(_ + _).collect

Method 2: Spark as a local library (Scala program)

The pom.xml file

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com</groupId>
    <artifactId>SparkDemo</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.1.0</version>
        </dependency>
    </dependencies>
</project>
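
The pom.xml above only declares the spark-core dependency. If the project is compiled by Maven itself rather than by the IDE's Scala plugin, a Scala compiler plugin is usually needed as well. A minimal sketch, assuming the commonly used scala-maven-plugin, placed inside <project> next to <dependencies>:

<build>
    <plugins>
        <plugin>
            <groupId>net.alchim31.maven</groupId>
            <artifactId>scala-maven-plugin</artifactId>
            <version>3.2.2</version>
            <executions>
                <execution>
                    <goals>
                        <goal>compile</goal>
                        <goal>testCompile</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>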

Create a Scala object

import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
	def main(args: Array[String]): Unit = {
		// create the Spark configuration object
		val conf = new SparkConf()
		conf.setAppName("WordCountSpark")
		// set the master property (local mode)
		conf.setMaster("local")

		// create the SparkContext from the conf
		val sc = new SparkContext(conf)

		// load the text file
		val rdd1 = sc.textFile("d:/scala/test.txt")
		// flatten: split each line into words
		val rdd2 = rdd1.flatMap(line => line.split(" "))
		// map each word w => (w, 1)
		val rdd3 = rdd2.map((_, 1))
		// reduce by key: sum the counts for each word
		val rdd4 = rdd3.reduceByKey(_ + _)
		// collect to the driver and print
		val r = rdd4.collect()
		r.foreach(println)

		sc.stop()
	}
}
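
Because the program calls conf.setMaster("local"), it can be run directly from the IDE without a separate Spark installation. To run the Maven-built jar through spark-submit instead, a sketch like the following would be used (the jar path follows from the artifactId and version in the pom above; remove setMaster("local") first if you want the master chosen on the command line to take effect):

spark-submit --class WordCount target/SparkDemo-1.0-SNAPSHOT.jar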

Method 3: Java

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/**
 * Java version of WordCount
 */
public class WordCountJava2 {
    public static void main(String[] args) {
        // create the SparkConf object
        SparkConf conf = new SparkConf();
        conf.setAppName("WordCountJava2");
        conf.setMaster("local");

        // create the Java SparkContext
        JavaSparkContext sc = new JavaSparkContext(conf);
        // load the text file
        JavaRDD<String> rdd1 = sc.textFile("F:\\BigData\\Scala\\scalaWorkspace\\FileTmp\\test.txt");

        // flatten: split each line into words
        JavaRDD<String> rdd2 = rdd1.flatMap(new FlatMapFunction<String, String>() {
            public Iterator<String> call(String s) throws Exception {
                List<String> list = new ArrayList<String>();
                String[] arr = s.split(" ");
                for (String ss : arr) {
                    list.add(ss);
                }
                return list.iterator();
            }
        });

        // map: word -> (word, 1)
        JavaPairRDD<String, Integer> rdd3 = rdd2.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<String, Integer>(s, 1);
            }
        });

        // reduce by key: sum the counts for each word
        JavaPairRDD<String, Integer> rdd4 = rdd3.reduceByKey(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });

        // collect the results to the driver and print them
        List<Tuple2<String, Integer>> list = rdd4.collect();
        for (Tuple2<String, Integer> t : list) {
            System.out.println(t._1() + " : " + t._2());
        }

        sc.stop();
    }
}
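
On Java 8 and later, the anonymous classes above can be replaced with lambda expressions, since FlatMapFunction, PairFunction, and Function2 each have a single abstract method; for example, the reduce step can be written as rdd3.reduceByKey((v1, v2) -> v1 + v2).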

 
