Here are several common Spark operators in the Java API, mainly covering the join, groupByKey, mapPartitions, mapPartitionsWithIndex, and sortBy operators.
Join example:
package com.liuze;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;

// Using join
public class JoinTest {
    public static void main(String[] args) {
        // join merges two pair collections on their keys.
        // For example, tuple collection A: (1,"Spark"), (2,"Tachyon"), (3,"Hadoop")
        // and tuple collection B: (1,100), (2,95), (3,65)
        // A join B yields (in no particular order): (1,("Spark",100)), (2,("Tachyon",95)), (3,("Hadoop",65))
        SparkConf conf = new SparkConf().setAppName("SparkRDD").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        List<Integer> data = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> rdd = sc.parallelize(data);
        // First pair RDD: key -> key squared
        JavaPairRDD<Integer, Integer> rdd1 = rdd.mapToPair(new PairFunction<Integer, Integer, Integer>() {
            @Override
            public Tuple2<Integer, Integer> call(Integer num) throws Exception {
                return new Tuple2<>(num, num * num);
            }
        });
        // Second pair RDD: key -> a letter derived from the squared value
        JavaPairRDD<Integer, String> rdd2 = rdd.mapToPair(new PairFunction<Integer, Integer, String>() {
            @Override
            public Tuple2<Integer, String> call(Integer num) throws Exception {
                return new Tuple2<>(num, String.valueOf((char) (64 + num * num)));
            }
        });
        // Inner join on the common keys
        JavaPairRDD<Integer, Tuple2<Integer, String>> joinRDD = rdd1.join(rdd2);
        // Format each joined record as <key,<value1,value2>>
        JavaRDD<String> res = joinRDD.map(new Function<Tuple2<Integer, Tuple2<Integer, String>>, String>() {
            @Override
            public String call(Tuple2<Integer, Tuple2<Integer, String>> tuple) throws Exception {
                int key = tuple._1();
                int value1 = tuple._2()._1();
                String value2 = tuple._2()._2();
                return "<" + key + ",<" + value1 + "," + value2 + ">>";
            }
        });
        List<String> resList = res.collect();
        for (String str : resList) {
            System.out.println(str);
        }
        sc.stop();
    }
}
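For comparison, the same join pipeline can be written more compactly with Java 8 lambdas. This is only a minimal sketch that reuses rdd from the listing above; the variable names squares and letters are illustrative:

JavaPairRDD<Integer, Integer> squares = rdd.mapToPair(num -> new Tuple2<>(num, num * num));
JavaPairRDD<Integer, String> letters =
        rdd.mapToPair(num -> new Tuple2<>(num, String.valueOf((char) (64 + num * num))));
// join, format each result tuple, and print
squares.join(letters)
       .map(t -> "<" + t._1() + ",<" + t._2()._1() + "," + t._2()._2() + ">>")
       .collect()
       .forEach(System.out::println);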
groupByKey operator:
package com.liuze;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;

// Using groupByKey
public class GroupByKeyTest {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("Spark_GroupByKey_Sample");
        sparkConf.setMaster("local");
        JavaSparkContext context = new JavaSparkContext(sparkConf);
        List<Integer> data = Arrays.asList(1, 1, 2, 2, 1);
        JavaRDD<Integer> distData = context.parallelize(data);
        // Build a pair RDD: element -> element squared
        JavaPairRDD<Integer, Integer> firstRDD = distData.mapToPair(new PairFunction<Integer, Integer, Integer>() {
            @Override
            public Tuple2<Integer, Integer> call(Integer integer) throws Exception {
                return new Tuple2<>(integer, integer * integer);
            }
        });
        // Group all values that share the same key
        JavaPairRDD<Integer, Iterable<Integer>> secondRDD = firstRDD.groupByKey();
        // Concatenate each key's values into a single space-separated string
        JavaRDD<Tuple2<Integer, String>> resultRDD = secondRDD.map(new Function<Tuple2<Integer, Iterable<Integer>>, Tuple2<Integer, String>>() {
            @Override
            public Tuple2<Integer, String> call(Tuple2<Integer, Iterable<Integer>> tuple) throws Exception {
                int key = tuple._1();
                StringBuilder sb = new StringBuilder();
                for (Integer integer : tuple._2()) {
                    sb.append(integer).append(" ");
                }
                return new Tuple2<>(key, sb.toString().trim());
            }
        });
        List<Tuple2<Integer, String>> resultList = resultRDD.collect();
        for (Tuple2<Integer, String> tuple : resultList) {
            System.out.println(tuple._1() + " -> (" + tuple._2() + ")");
        }
        context.stop();
    }
}
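Alternatively, the per-key formatting can be done with mapValues, which leaves the keys untouched. A minimal lambda-based sketch of the same logic, reusing firstRDD from the listing above (the name formatted is illustrative):

JavaPairRDD<Integer, String> formatted = firstRDD
        .groupByKey()
        .mapValues(values -> {
            // Join the grouped values with spaces, e.g. key 1 -> "1 1 1"
            StringBuilder sb = new StringBuilder();
            for (Integer v : values) {
                sb.append(v).append(" ");
            }
            return sb.toString().trim();
        });
formatted.collect().forEach(t -> System.out.println(t._1() + " -> (" + t._2() + ")"));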
mapPartitions operator:
package com.liuze;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

// Using mapPartitions
public class mapPartitionsTest {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("Spark_MapPartitions_Sample");
        sparkConf.setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        List<Integer> data = Arrays.asList(1, 2, 4, 3, 5, 6, 7);
        // The RDD has two partitions
        JavaRDD<Integer> javaRDD = javaSparkContext.parallelize(data, 2);
        // Sum the elements of each partition; the function is called once per partition
        JavaRDD<Integer> mapPartitionsRDD = javaRDD.mapPartitions(new FlatMapFunction<Iterator<Integer>, Integer>() {
            @Override
            public Iterator<Integer> call(Iterator<Integer> integerIterator) throws Exception {
                int isum = 0;
                LinkedList<Integer> linkedList = new LinkedList<Integer>();
                while (integerIterator.hasNext()) {
                    isum += integerIterator.next();
                }
                linkedList.add(isum);
                return linkedList.iterator();
            }
        });
        List<Integer> collect = mapPartitionsRDD.collect();
        for (Integer integer : collect) {
            System.out.println(integer);
        }
        javaSparkContext.stop();
    }
}
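A common reason to prefer mapPartitions over map is that per-partition setup runs only once per partition rather than once per element. A minimal, illustrative sketch under that assumption, reusing javaRDD and the imports from the listing above and creating one DecimalFormat per partition (formattedRDD is a hypothetical name):

JavaRDD<String> formattedRDD = javaRDD.mapPartitions(iter -> {
    // Created once per partition and reused for every element in that partition
    java.text.DecimalFormat fmt = new java.text.DecimalFormat("000");
    LinkedList<String> out = new LinkedList<>();
    while (iter.hasNext()) {
        out.add(fmt.format(iter.next()));
    }
    return out.iterator();
});
formattedRDD.collect().forEach(System.out::println);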
mapPartitionsWithIndex operator:
package com.liuze;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

// Using mapPartitionsWithIndex
public class mapPartitionsWithIndexTest {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("Spark_MapPartitionsWithIndex_Sample");
        sparkConf.setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        List<Integer> data = Arrays.asList(1, 2, 4, 3, 5, 6, 7);
        // The RDD has two partitions
        JavaRDD<Integer> javaRDD = javaSparkContext.parallelize(data, 2);
        // Emit "partition index | element value | element position within the partition"
        JavaRDD<String> stringJavaRDD = javaRDD.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<String>>() {
            @Override
            public Iterator<String> call(Integer partitionIndex, Iterator<Integer> elements) throws Exception {
                LinkedList<String> linkedList = new LinkedList<String>();
                int i = 0;
                while (elements.hasNext()) {
                    linkedList.add(partitionIndex + "|" + elements.next() + "|" + i++);
                }
                return linkedList.iterator();
            }
        }, false);
        List<String> collect = stringJavaRDD.collect();
        for (String str : collect) {
            System.out.println(str);
        }
        javaSparkContext.stop();
    }
}
sortBy operator:
package com.liuze;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.util.Arrays;
import java.util.List;
import java.util.Random;

// Custom sorting with sortBy
public class SortByTest {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("Spark_SortBy_Sample");
        sparkConf.setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        List<Integer> data = Arrays.asList(5, 1, 1, 4, 4, 2, 2);
        JavaRDD<Integer> javaRDD = javaSparkContext.parallelize(data);
        Random random = new Random(100);
        // Transform the RDD so that each element has two parts: "original_random"
        JavaRDD<String> javaRDD1 = javaRDD.map(new Function<Integer, String>() {
            @Override
            public String call(Integer v1) throws Exception {
                return v1.toString() + "_" + random.nextInt(100);
            }
        });
        // Sort by the second part of each element (compared as a string), in descending order, into 10 partitions
        JavaRDD<String> resultRDD = javaRDD1.sortBy(new Function<String, String>() {
            @Override
            public String call(String v1) throws Exception {
                return v1.split("_")[1];
            }
        }, false, 10);
        System.out.println("result--------------" + resultRDD.collect());
        javaSparkContext.stop();
    }
}
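Note that the key function above returns a String, so the keys are compared lexicographically (for example, "9" compares greater than "10"). If numeric order is wanted, the second part can be parsed to an Integer instead. A minimal sketch reusing javaRDD1 from the listing above (numericSorted is an illustrative name):

JavaRDD<String> numericSorted = javaRDD1.sortBy(
        s -> Integer.parseInt(s.split("_")[1]),  // numeric sort key
        false,                                   // descending order
        10);                                     // number of partitions
System.out.println("numeric result------" + numericSorted.collect());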