RDD + reduceByKey, without lambdas
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.util.Arrays;
import java.util.Iterator;
public class WordCount {
    public static void main(String[] args) {
        // input file
        String wordFile = "/user/walker/input/wordcount/idea.txt";
        SparkSession spark = SparkSession.builder().appName("wordcount")
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
        JavaRDD<String> hdfstext = jsc.textFile(wordFile);
        // split each line into words (transformation)
        JavaRDD<String> words = hdfstext.flatMap(new FlatMapFunction<String, String>() {
            public Iterator<String> call(String x) {
                return Arrays.asList(x.split(" ")).iterator();
            }
        });
        // map each word to (word, 1) (transformation)
        JavaPairRDD<String, Integer> pairs = words.mapToPair(
                new PairFunction<String, String, Integer>() {
                    public Tuple2<String, Integer> call(String word) {
                        return new Tuple2<>(word, 1);
                    }
                });
        // sum the counts for each word (transformation)
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey(
                new Function2<Integer, Integer, Integer>() {
                    public Integer call(Integer v1, Integer v2) {
                        return v1 + v2;
                    }
                }).repartition(1);
        // output directory
        String outDir = "/user/walker/output/wordcount";
        wordCounts.saveAsTextFile(outDir);
        jsc.close();
    }
}
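To run any of these examples, the packaged jar can be submitted with spark-submit; the jar name and master below are placeholders, not values from the original post:

spark-submit --class WordCount --master yarn wordcount.jar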
RDD + reduceByKey, with lambdas
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
import java.util.Arrays;
public class WordCount2 {
    public static void main(String[] args) {
        // input file
        String wordFile = "/user/walker/input/wordcount/idea.txt";
        SparkSession spark = SparkSession.builder().appName("wordcount")
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
        JavaRDD<String> hdfstext = jsc.textFile(wordFile);
        // split each line into words (transformation)
        JavaRDD<String> words = hdfstext.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        // map each word to (word, 1) (transformation)
        JavaPairRDD<String, Integer> pairs = words.mapToPair(word -> new Tuple2<>(word, 1));
        // sum the counts for each word (transformation)
        JavaPairRDD<String, Integer> wordCounts = pairs.reduceByKey((v1, v2) -> v1 + v2)
                .repartition(1);
        // output directory
        String outDir = "/user/walker/output/wordcount2";
        wordCounts.saveAsTextFile(outDir);
        jsc.close();
    }
}
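saveAsTextFile on a JavaPairRDD writes each record as the tuple's toString(), i.e. lines like (word,count). If a tab-separated layout (as in the countByValue version below) is preferred, a minimal sketch, not part of the original code, is to map the pairs to strings before saving:

JavaRDD<String> lines = wordCounts.map(t -> t._1() + "\t" + t._2());
lines.saveAsTextFile(outDir);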
RDD + countByValue
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
public class WordCount3 {
    public static void main(String[] args) {
        // input file
        String wordFile = "/user/walker/input/wordcount/idea.txt";
        SparkSession spark = SparkSession.builder().appName("wordcount")
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
        JavaRDD<String> hdfstext = jsc.textFile(wordFile);
        // split each line into words (transformation)
        JavaRDD<String> words = hdfstext.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        // count occurrences of each word (action)
        Map<String, Long> wordCounts = words.countByValue();
        // convert the driver-side Map back into an RDD
        List<String> lst = new LinkedList<>();
        wordCounts.forEach((k, v) -> lst.add(k + "\t" + v));
        JavaRDD<String> result = jsc.parallelize(lst).repartition(1);
        // save the result to HDFS
        String outDir = "/user/walker/output/wordcount3"; // output directory
        result.saveAsTextFile(outDir);
        jsc.close();
    }
}
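Note that countByValue is an action: every distinct word and its count is pulled back to the driver as a java.util.Map, so this approach only suits vocabularies that fit in driver memory. As an optional variation (an assumption, not in the original code), the driver-side map could be sorted by count before being written out, using a stream instead of the LinkedList:

List<String> lst = wordCounts.entrySet().stream()
        .sorted(Map.Entry.<String, Long>comparingByValue().reversed())
        .map(e -> e.getKey() + "\t" + e.getValue())
        .collect(java.util.stream.Collectors.toList());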
DataFrame/Dataset + SQL
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
public class WordCount4 {
    public static void main(String[] args) {
        // input file
        String wordFile = "/user/qhy/input/wordcount/idea.txt";
        SparkSession spark = SparkSession.builder().appName("wordcount")
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        Dataset<Row> dfRow = spark.read().text(wordFile);
        // row.toString() wraps the value in [ ], so read the column with getString(0) instead
        Dataset<String> words = dfRow.flatMap((FlatMapFunction<Row, String>) row ->
                Arrays.asList(row.getString(0).split("\\s+")).iterator(), Encoders.STRING());
        Dataset<Row> dfWord = words.toDF("word");
        dfWord.createOrReplaceTempView("words");
        Dataset<Row> dfCnt = dfWord.sqlContext().sql("SELECT word, count(*) FROM words GROUP BY word");
        // save the result to HDFS
        String outDir = "/user/qhy/output/wordcount4";
        dfCnt.repartition(1).write().csv(outDir);
        spark.stop();
    }
}
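dfWord.sqlContext().sql(...) works, but since Spark 2.x the query can also go through the SparkSession itself; aliasing the aggregate column and ordering by it are optional tweaks shown here as a sketch, not part of the original code:

Dataset<Row> dfCnt = spark.sql(
        "SELECT word, count(*) AS cnt FROM words GROUP BY word ORDER BY cnt DESC");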
Dataset/DataFrame + groupBy
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import java.util.Arrays;
public class WordCount5 {
    public static void main(String[] args) {
        // input file
        String wordFile = "/user/qhy/input/wordcount/idea.txt";
        SparkSession spark = SparkSession.builder().appName("wordcount")
                .config("spark.hadoop.mapreduce.output.fileoutputformat.compress", false)
                .getOrCreate();
        Dataset<String> ds = spark.read().textFile(wordFile);
        // split each line into words
        Dataset<String> words = ds.flatMap((FlatMapFunction<String, String>) line ->
                Arrays.asList(line.split("\\s+")).iterator(), Encoders.STRING());
        // words.printSchema(); // value: string (nullable = true)
        Dataset<Row> df = words.groupBy("value").count();
        // save the result to HDFS
        String outDir = "/user/qhy/output/wordcount5";
        df.repartition(1).write().csv(outDir);
        spark.stop();
    }
}
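groupBy("value").count() keeps the default column names value and count. If the output should use word as the column name and be sorted by frequency, a small sketch (an assumption, not in the original code) is:

Dataset<Row> df = words.groupBy("value").count()
        .withColumnRenamed("value", "word")
        .orderBy(org.apache.spark.sql.functions.col("count").desc());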
Related reading
- Spark RDD transformations and actions
- WordCount Using Apache Spark's DataFrame API
- Data WordCount
- Spark SQL, DataFrame and Dataset Programming Guide
This article is from walker snapshot.