SparkRDD之combineByKey

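combineByKey 是 Spark 中一个比较核心的高阶函数,groupByKey、reduceByKey 等其他键值对聚合函数在底层都是基于它实现的。combineByKey 作用在键值对 RDD 上,按键对值进行合并:调用时需要提供三个函数——createCombiner(在某个分区内第一次遇到某个键时,用该值创建初始合并结果)、mergeValue(在同一分区内把该键后续的值并入已有的合并结果)和 mergeCombiners(把不同分区中同一个键的合并结果再合并到一起)。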

Java 示例:

package com.cb.spark.sparkrdd;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;

public class CombineByKeyExample {
	public static void main(String[] args) {
		SparkConf conf = new SparkConf().setAppName("CombineByKey").setMaster("local");
		JavaSparkContext jsc = new JavaSparkContext(conf);

		List l1 = new ArrayList<>();
		l1.add("dog");
		l1.add("cat");
		l1.add("gnu");
		l1.add("salmon");
		l1.add("rabbit");
		l1.add("turkey");
		l1.add("wolf");
		l1.add("bear");
		l1.add("bee");
		JavaRDD javaRDD = jsc.parallelize(l1, 3);
		JavaRDD javaRDD2 = jsc.parallelize(Arrays.asList(1, 1, 2, 2, 2, 1, 2, 2, 2), 3);
		JavaPairRDD javaPairRDD = javaRDD2.zip(javaRDD);
		JavaPairRDD> javaPairRDD2 = javaPairRDD
//输入string,返回List,也就是将每个partition的第一个元素(String类型)添加到list中,此时每个partition中的元素为List,string,string
				.combineByKey(new Function>() {
					private static final long serialVersionUID = 1L;

					@Override
					public List call(String arg0) throws Exception {
						List list = new ArrayList<>();
						list.add(arg0);
						return list;
					}
				}, new Function2, String, List>() {
					private static final long serialVersionUID = 1L;
//输入List和String,这里的List就是上一个函数作用的结果,这一步作用是把每个partition中剩余的String类型元素添加到List当中,最后返回一个List
					@Override
					public List call(List arg0, String arg1) throws Exception {
						arg0.add(arg1);
						return arg0;
					}
				}, new Function2, List, List>() {
					private static final long serialVersionUID = 1L;
//输入List,输出List,这一个函数作用是把各个partition中的List进行合并,返回最终的List
					@Override
					public List call(List arg0, List arg1) throws Exception {
						arg0.addAll(arg1);
						return arg0;
					}
				});
		// (1,[dog, cat, turkey])
		// (2,[gnu, salmon, rabbit, wolf, bear, bee])
		javaPairRDD2.foreach(x -> System.out.print(x + " "));

		jsc.stop();
	}
}

 

你可能感兴趣的:(大数据开发)