Some Common Spark Operators in the Java API

Here I list several common Spark operators written against the Java API: join, groupByKey, mapPartitions, mapPartitionsWithIndex, and sortBy.

Join example:

package com.liuze;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;

//Using join
public class JoinTest {
    public static void main(String[] args) {

        //join merges two pair collections by key;

        //e.g. tuple collection A: (1,"Spark"), (2,"Tachyon"), (3,"Hadoop")
        //tuple collection B: (1,100), (2,95), (3,65)
        //A join B yields: (1,("Spark",100)), (3,("Hadoop",65)), (2,("Tachyon",95))

        SparkConf conf = new SparkConf().setAppName("SparkRDD").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        List<Integer> data  = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> rdd = sc.parallelize(data);

        //pair each number with its square: (n, n*n)
        JavaPairRDD<Integer, Integer> rdd1 = rdd.mapToPair(new PairFunction<Integer, Integer, Integer>() {
            @Override
            public Tuple2<Integer, Integer> call(Integer num) throws Exception {
                return new Tuple2<>(num, num * num);
            }
        });

        //pair each number with a letter derived from its square: 65 -> 'A', 68 -> 'D', ...
        JavaPairRDD<Integer, String> rdd2 = rdd.mapToPair(new PairFunction<Integer, Integer, String>() {
            @Override
            public Tuple2<Integer, String> call(Integer num) throws Exception {
                return new Tuple2<>(num, String.valueOf((char) (64 + num * num)));
            }
        });

        JavaPairRDD<Integer, Tuple2<Integer, String>> joinRDD = rdd1.join(rdd2);
        //render each joined record as <key,<value1,value2>>
        JavaRDD<String> res = joinRDD.map(new Function<Tuple2<Integer, Tuple2<Integer, String>>, String>() {

            @Override
            public String call(Tuple2<Integer, Tuple2<Integer, String>> integerTuple2Tuple2) throws Exception {
                int key = integerTuple2Tuple2._1();
                int value1 = integerTuple2Tuple2._2._1();
                String value2 = integerTuple2Tuple2._2._2();
                return "<" + key + ",<" + value1 + "," + value2 + ">>";
            }
        });

        List<String> resList = res.collect();
        for (String str : resList) {
            System.out.println(str);
        }
        sc.stop();
    }
}
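
With Java 8 or later, the same pipeline can be written much more compactly with lambdas. A minimal sketch, assuming the same rdd as above:

//lambda version of the join pipeline (sketch, assumes Java 8+)
JavaPairRDD<Integer, Integer> squares = rdd.mapToPair(n -> new Tuple2<>(n, n * n));
JavaPairRDD<Integer, String> letters = rdd.mapToPair(n -> new Tuple2<>(n, String.valueOf((char) (64 + n * n))));
squares.join(letters)
        .map(t -> "<" + t._1() + ",<" + t._2()._1() + "," + t._2()._2() + ">>")
        .collect()
        .forEach(System.out::println);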

groupByKey operator:

package com.liuze;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;

//Using groupByKey
public class GroupByKeyTest {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("Spark_GroupByKey_Sample");
        sparkConf.setMaster("local");
        JavaSparkContext context = new JavaSparkContext(sparkConf);
        List<Integer> data = Arrays.asList(1, 1, 2, 2, 1);
        JavaRDD<Integer> distData = context.parallelize(data);
        JavaPairRDD<Integer, Integer> firstRDD = distData.mapToPair(new PairFunction<Integer, Integer, Integer>() {
            @Override
            public Tuple2<Integer, Integer> call(Integer integer) throws Exception {
                return new Tuple2<>(integer, integer * integer);
            }
        });
        //group the squared values by their original key
        JavaPairRDD<Integer, Iterable<Integer>> secondRDD = firstRDD.groupByKey();
        //concatenate each key's values into one space-separated string
        JavaRDD<Tuple2<Integer, String>> resultRDD = secondRDD.map(new Function<Tuple2<Integer, Iterable<Integer>>, Tuple2<Integer, String>>() {
            @Override
            public Tuple2<Integer, String> call(Tuple2<Integer, Iterable<Integer>> integerIterableTuple2) throws Exception {
                int key = integerIterableTuple2._1();
                StringBuilder sb = new StringBuilder();
                Iterable<Integer> iter = integerIterableTuple2._2();
                for (Integer integer : iter) {
                    sb.append(integer).append(" ");
                }

                return new Tuple2<>(key, sb.toString().trim());
            }
        });
        List<Tuple2<Integer, String>> resultList = resultRDD.collect();

        for (Tuple2<Integer, String> tuple : resultList) {
            System.out.println(tuple._1() + " -> (" + tuple._2() + ")");
        }
        context.stop();
    }
}
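
groupByKey ships every value for a key across the network during the shuffle. When all you need is a per-key aggregate (a sum, a max, and so on), reduceByKey is usually preferable because it combines values on the map side before shuffling. A minimal sketch using the same firstRDD, assuming Java 8+:

//map-side combining alternative: sum the squared values per key (sketch)
JavaPairRDD<Integer, Integer> sums = firstRDD.reduceByKey((a, b) -> a + b);
System.out.println(sums.collect());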

mapPartitions operator:

package com.liuze;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

//Using mapPartitions
public class mapPartitionsTest {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("Spark_MapPartitions_Sample");
        sparkConf.setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        List<Integer> data = Arrays.asList(1, 2, 4, 3, 5, 6, 7);
        //the RDD has two partitions
        JavaRDD<Integer> javaRDD = javaSparkContext.parallelize(data,2);
        //sum the elements of each partition, emitting one value per partition
        JavaRDD<Integer> mapPartitionsRDD = javaRDD.mapPartitions(new FlatMapFunction<Iterator<Integer>, Integer>() {
            @Override
            public Iterator<Integer> call(Iterator<Integer> integerIterator) throws Exception {
                int sum = 0;
                LinkedList<Integer> linkedList = new LinkedList<Integer>();
                while (integerIterator.hasNext()) {
                    sum += integerIterator.next();
                }
                linkedList.add(sum);
                return linkedList.iterator();
            }
        });

        List<Integer> collect = mapPartitionsRDD.collect();

        for (Integer integer : collect) {
            System.out.println(integer);
        }
        javaSparkContext.stop();
    }
}
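
Beyond per-partition aggregation, mapPartitions is the usual way to amortize expensive setup, because the function body runs once per partition rather than once per element. A rough sketch, where ExpensiveClient is a hypothetical stand-in for any costly resource such as a database connection:

//sketch: pay the setup cost once per partition (ExpensiveClient is hypothetical)
JavaRDD<String> enriched = javaRDD.mapPartitions(iter -> {
    ExpensiveClient client = new ExpensiveClient(); //hypothetical costly setup, once per partition
    LinkedList<String> out = new LinkedList<>();
    while (iter.hasNext()) {
        out.add(client.lookup(iter.next()));        //hypothetical per-element call
    }
    client.close();
    return out.iterator();
});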

mapPartitionsWithIndex operator:

package com.liuze;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

//Using mapPartitionsWithIndex
public class mapPartitionsWithIndexTest {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("Spark_MapPartitionsWithIndex_Sample");
        sparkConf.setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);
        List<Integer> data = Arrays.asList(1, 2, 4, 3, 5, 6, 7);
        //the RDD has two partitions
        JavaRDD<Integer> javaRDD = javaSparkContext.parallelize(data,2);
        //for each element, output the partition index, the element value, and its position within the partition
        JavaRDD<String> stringJavaRDD = javaRDD.mapPartitionsWithIndex(new Function2<Integer, Iterator<Integer>, Iterator<String>>() {
            @Override
            public Iterator<String> call(Integer v1, Iterator<Integer> v2) throws Exception {
                LinkedList<String> linkedList = new LinkedList<String>();
                int i = 0;
                while (v2.hasNext()) {
                    //format: partitionIndex|elementValue|positionWithinPartition
                    linkedList.add(v1 + "|" + v2.next() + "|" + i++);
                }
                return linkedList.iterator();
            }
        }, false); //the second argument is preservesPartitioning
        List<String> collect = stringJavaRDD.collect();
        for (String str : collect) {
            System.out.println(str);
        }
        javaSparkContext.stop();
    }
}
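
A common practical use of mapPartitionsWithIndex is to treat one partition specially, for example skipping the first record of partition 0 when a text file begins with a header row. A minimal sketch, assuming Java 8+ and a hypothetical lines RDD obtained from textFile:

//sketch: drop the header record at the start of partition 0 (lines is hypothetical)
JavaRDD<String> noHeader = lines.mapPartitionsWithIndex((idx, it) -> {
    if (idx == 0 && it.hasNext()) {
        it.next(); //consume and discard the header line
    }
    return it;
}, false);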

sortBy operator:

package com.liuze;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

import java.util.Arrays;
import java.util.List;
import java.util.Random;

//Custom sorting with sortBy
public class SortByTest {
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf();
        sparkConf.setAppName("Spark_SortBy_Sample");
        sparkConf.setMaster("local");
        JavaSparkContext javaSparkContext = new JavaSparkContext(sparkConf);

        List<Integer> data = Arrays.asList(5, 1, 1, 4, 4, 2, 2);
        JavaRDD<Integer> javaRDD = javaSparkContext.parallelize(data);
        Random random = new Random(100); //fixed seed, so runs are reproducible
        //transform the RDD: each element becomes "originalValue_randomNumber"
        JavaRDD<String> javaRDD1 = javaRDD.map(new Function<Integer, String>() {
            @Override
            public String call(Integer v1) throws Exception {
                return v1.toString() + "_" + random.nextInt(100);
            }
        });

        //sort by the second part of each element (keys are Strings, so the order is lexicographic)
        JavaRDD<String> resultRDD = javaRDD1.sortBy(new Function<String, String>() {

            @Override
            public String call(String v1) throws Exception {
                return v1.split("_")[1];
            }
        }, false, 10); //ascending = false (i.e. descending), 10 output partitions
        System.out.println("result--------------" + resultRDD.collect());
    }
}
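
One caveat about the example above: the key function returns a String, so the ordering is lexicographic (for instance, "9" compares greater than "10"). To order the second part numerically, parse it into an Integer. A minimal sketch, assuming Java 8+ and the same javaRDD1:

//sketch: numeric rather than lexicographic ordering of the second part
JavaRDD<String> numericSorted = javaRDD1.sortBy(
        s -> Integer.parseInt(s.split("_")[1]), //Integer keys compare numerically
        false, //ascending = false, i.e. descending
        10);   //number of output partitions
System.out.println(numericSorted.collect());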
