【菜鸟系列】Spark 常用算子总结(Java)——groupByKey、reduceByKey

https://blog.csdn.net/Java_Soldier/article/details/80582336
reduceByKey样例

// Word-frequency count using reduceByKey.
// Runs Spark in local mode under the application name "jiangtao_demo".
SparkConf conf = new SparkConf().setAppName("jiangtao_demo").setMaster("local");
JavaSparkContext jsc = new JavaSparkContext(conf);

// Build a JavaRDD<String> from an in-memory collection.
JavaRDD<String> lines = jsc.parallelize(Arrays.asList("pandas","numpy","pip","pip","pip"));

// Turn every word into a (word, 1) pair.
// PairFunction<T, K, V>: T is the input element, K/V the key and value of the emitted pair.
JavaPairRDD<String, Integer> mapToPairResult = lines.mapToPair(new PairFunction<String, String, Integer>() {
    @Override
    public Tuple2<String, Integer> call(String o) throws Exception {
        Tuple2<String, Integer> tuple2 = new Tuple2<String, Integer>(o, 1);
        // Debug aid: System.out.println(tuple2._1() + ":" + tuple2._2());
        return tuple2;
    }
});

// reduceByKey: add up the 1s that share the same word, giving the count per word.
JavaPairRDD<String, Integer> reduceByKeyResult = mapToPairResult.reduceByKey(new Function2<Integer, Integer, Integer>() {
    @Override
    public Integer call(Integer i1, Integer i2) throws Exception {
        return i1 + i2;
    }
});

groupByKey样例

// Word-frequency count using groupByKey followed by a manual sum.
// Runs Spark in local mode under the application name "jiangtao_demo".
SparkConf conf = new SparkConf().setAppName("jiangtao_demo").setMaster("local");
JavaSparkContext jsc = new JavaSparkContext(conf);

// Build a JavaRDD<String> from an in-memory collection.
JavaRDD<String> lines = jsc.parallelize(Arrays.asList("pandas","numpy","pip","pip","pip"));

// Turn every word into a (word, 1) pair.
JavaPairRDD<String, Integer> mapToPairResult = lines.mapToPair(new PairFunction<String, String, Integer>() {
    @Override
    public Tuple2<String, Integer> call(String o) throws Exception {
        Tuple2<String, Integer> tuple2 = new Tuple2<String, Integer>(o, 1);
        // Debug aid: System.out.println(tuple2._1() + ":" + tuple2._2());
        return tuple2;
    }
});

// groupByKey: collect all values that share a key.
// The result type is JavaPairRDD<String, Iterable<Integer>>, e.g.
// [(pip,[1, 1, 1]), (pandas,[1]), (numpy,[1])]
JavaPairRDD<String, Iterable<Integer>> groupByKeyResult = mapToPairResult.groupByKey();
System.out.println(groupByKeyResult.collect());

// Collapse each (word, [1, 1, ...]) group into (word, count) by summing the grouped values.
JavaPairRDD<String, Integer> gr = groupByKeyResult.mapToPair(
        new PairFunction<Tuple2<String, Iterable<Integer>>, String, Integer>() {
    @Override
    public Tuple2<String, Integer> call(Tuple2<String, Iterable<Integer>> tuple2) {
        int sum = 0;
        for (Integer count : tuple2._2()) {
            sum += count;
        }
        return new Tuple2<String, Integer>(tuple2._1(), sum);
    }
});
System.out.println(gr.collect());
// Prints [(pip,3), (pandas,1), (numpy,1)]

 

你可能感兴趣的:(spark)