WordCount Examples with Spark 2.x and Java 8

RDD + reduceByKey, without lambdas

  • import
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
  • main
public class WordCount {
    public static void main(String[] args) {
        // input file
        String wordFile = "/user/walker/input/wordcount/idea.txt";
        SparkSession spark = SparkSession.builder().appName("wordcount")
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
        JavaRDD<String> hdfstext = jsc.textFile(wordFile);
        // split each line into words (transformation)
        JavaRDD<String> words = hdfstext.flatMap(new FlatMapFunction<String, String>() {
            public Iterator<String> call(String x) {
                return Arrays.asList(x.split(" ")).iterator();
            }
        });
        // map each word to a count of 1 (transformation)
        JavaPairRDD<String, Integer> pairs = words.mapToPair(new PairFunction<String, String, Integer>() {
            public Tuple2<String, Integer> call(String word) {
                return new Tuple2<>(word, 1);
            }
        });
        // sum the 1s per word (transformation)
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
            public Integer call(Integer v1, Integer v2) {
                return v1 + v2;
            }
        }).repartition(1);
        // output directory
        String outDir = "/user/walker/output/wordcount";
        wordCounts.saveAsTextFile(outDir);
        jsc.close();
    }
}
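To try any of these jobs without a cluster, the session builder can also be given a master URL. A minimal sketch, assuming local mode and a placeholder local input path (neither appears in the original example):

// Hypothetical local-mode setup for quick testing; the path below is a placeholder.
SparkSession spark = SparkSession.builder()
        .appName("wordcount-local")
        .master("local[*]")                    // use all local cores instead of a cluster
        .getOrCreate();
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
JavaRDD<String> lines = jsc.textFile("data/idea.txt");   // assumed local path
System.out.println("lines read: " + lines.count());
jsc.close();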

RDD + reduceByKey, with lambdas

  • import
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.util.Arrays;
  • main
public class WordCount2 {
    public static void main(String[] args) {
        // input file
        String wordFile = "/user/walker/input/wordcount/idea.txt";
        SparkSession spark = SparkSession.builder().appName("wordcount")
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
        JavaRDD<String> hdfstext = jsc.textFile(wordFile);
        // split each line into words (transformation)
        JavaRDD<String> words = hdfstext.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        // map each word to a count of 1 (transformation)
        JavaPairRDD<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));
        // sum the 1s per word (transformation)
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey((v1, v2) -> v1 + v2)
                .repartition(1);
        // output directory
        String outDir = "/user/walker/output/wordcount2";
        wordCounts.saveAsTextFile(outDir);
        jsc.close();
    }
}
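If the result should also be ordered by frequency before saving, the pair RDD can be swapped, sorted, and swapped back. A sketch, assuming the wordCounts JavaPairRDD<String, Integer> built above; the sorted output path is an assumption:

// Sort by count descending: swap to (count, word), sortByKey, then swap back.
JavaPairRDD<String, Integer> sorted = wordCounts
        .mapToPair(t -> new Tuple2<>(t._2(), t._1()))   // (word, n) -> (n, word)
        .sortByKey(false)                               // false = descending
        .mapToPair(t -> new Tuple2<>(t._2(), t._1()));  // back to (word, n)
sorted.repartition(1).saveAsTextFile("/user/walker/output/wordcount2_sorted");  // assumed path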

RDD + countByValue

  • import
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
  • main
public class WordCount3 {
    public static void main(String[] args) {
        // input file
        String wordFile = "/user/walker/input/wordcount/idea.txt";
        SparkSession spark = SparkSession.builder().appName("wordcount")
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
        JavaRDD<String> hdfstext = jsc.textFile(wordFile);
        // split each line into words (transformation)
        JavaRDD<String> words = hdfstext.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        // count (action); the whole result is returned to the driver
        Map<String, Long> wordCounts = words.countByValue();
        // turn the Map back into an RDD
        List<String> lst = new LinkedList<>();
        wordCounts.forEach((k, v) -> lst.add(k + "\t" + v));
        JavaRDD<String> result = jsc.parallelize(lst).repartition(1);
        // save the result to HDFS
        String outDir = "/user/walker/output/wordcount3";    // output directory
        result.saveAsTextFile(outDir);
        jsc.close();
    }
}
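Note that countByValue() is an action: the whole Map<String, Long> is materialized in driver memory, so this variant only suits inputs with a modest number of distinct words. Once on the driver, the map can be post-processed with plain Java; a sketch, assuming a java.util.stream.Collectors import and printing instead of writing to HDFS:

// Driver-side post-processing of the countByValue() result (sketch).
List<String> topWords = wordCounts.entrySet().stream()
        .sorted(Map.Entry.<String, Long>comparingByValue().reversed())  // highest count first
        .limit(20)                                                      // keep the top 20
        .map(e -> e.getKey() + "\t" + e.getValue())
        .collect(Collectors.toList());
topWords.forEach(System.out::println);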

DataFrame/Dataset + SQL

  • import
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
  • main
public class WordCount4 {
    public static void main(String[] args) {
        // input file
        String wordFile = "/user/qhy/input/wordcount/idea.txt";
        SparkSession spark = SparkSession.builder().appName("wordcount")
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        Dataset<Row> dfRow = spark.read().text(wordFile);
        // row.toString() would wrap the line in [], so read the column with row.getString(0)
        Dataset<String> words = dfRow.flatMap((FlatMapFunction<Row, String>) row ->
                Arrays.asList(row.getString(0).split("\\s+")).iterator(), Encoders.STRING());
        Dataset<Row> dfWord = words.toDF("word");
        dfWord.createOrReplaceTempView("words");
        Dataset<Row> dfCnt = dfWord.sqlContext().sql("SELECT word, count(*) FROM words GROUP BY word");
        // save the result to HDFS
        String outDir = "/user/qhy/output/wordcount4";
        dfCnt.repartition(1).write().csv(outDir);
        spark.stop();
    }
}
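The unaliased count(*) column gets an auto-generated name, and the rows come out in no particular order. Aliasing the aggregate, adding ORDER BY, and writing a CSV header keep the output readable. A sketch against the same words view; the cnt alias, header option, and output path are assumptions:

// Hypothetical variant of the query above with an alias, ordering, and a CSV header row.
Dataset<Row> ranked = spark.sql(
        "SELECT word, count(*) AS cnt FROM words GROUP BY word ORDER BY cnt DESC");
ranked.repartition(1)
      .write()
      .option("header", "true")                    // write column names as the first line
      .csv("/user/qhy/output/wordcount4_sorted");  // assumed output directory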

Dataset/DataFrame + groupBy

  • import
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
  • main
public class WordCount5 {
    public static void main(String[] args) {
        // input file
        String wordFile = "/user/qhy/input/wordcount/idea.txt";
        SparkSession spark = SparkSession.builder().appName("wordcount")
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        Dataset<String> ds = spark.read().textFile(wordFile);
        Dataset<String> words = ds.flatMap((FlatMapFunction<String, String>) line ->
                Arrays.asList(line.split("\\s+")).iterator(), Encoders.STRING());
        // words.printSchema();   // value: string (nullable = true)
        Dataset<Row> df = words.groupBy("value").count();
        // save the result to HDFS
        String outDir = "/user/qhy/output/wordcount5";
        df.repartition(1).write().csv(outDir);
        spark.stop();
    }
}
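The grouping column keeps the default name value; renaming it and ordering by count stays within the DataFrame API. A sketch, assuming the words Dataset built above and a static import of org.apache.spark.sql.functions.col; the output path is a placeholder:

// Rename the column to "word" and rank by frequency (sketch).
Dataset<Row> ranked = words.toDF("word")
        .groupBy("word")
        .count()
        .orderBy(col("count").desc());             // most frequent words first
ranked.repartition(1).write().csv("/user/qhy/output/wordcount5_sorted");   // assumed path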

Further reading

  • Spark RDD transformations and actions
  • WordCount Using Apache Spark's DataFrame API
  • Data WordCount
  • Spark SQL, DataFrame and Dataset Programming Guide
This article is from walker snapshot
