A Worked Example of Fixing Data Skew [for reduceByKey]
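
When a few hot keys dominate a shuffle, the tasks that receive them run far longer than all the others. The classic fix for aggregations such as reduceByKey is two-phase ("salted") aggregation: attach a random prefix to every key and aggregate locally, then strip the prefix and aggregate again globally. The runnable example below walks through the four steps; a more compact lambda version is sketched after it.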

package com.imooc;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

public class HelloWorld {
    public static void main(String[] args) {


        SparkConf conf = new SparkConf();
        conf.setMaster("local[2]").setAppName("HelloWorld");
        JavaSparkContext sc = new JavaSparkContext(conf);
        // Deliberately skewed input: "spark" and "flume" occur far more
        // often than "hadoop" and "hive".
        List<String> list = new ArrayList<>();
        for (int i = 0; i < 14; i++) {
            list.add("spark flume spark");
            list.add("hadoop flume hive");
        }


        JavaRDD<String> rdd = sc.parallelize(list);

        JavaRDD<String> rdd2 = rdd.flatMap(line ->
                Arrays.asList(line.split(" ")).iterator());

        JavaPairRDD<String, Integer> counts = rdd2.mapToPair(word ->
                new Tuple2<>(word, 1));

        counts.foreach(o -> {
            System.out.println(o);
        });

        // Step 1: attach a random prefix to every key in the RDD.
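        // e.g. ("spark", 1) may become ("7_spark", 1): the hot key "spark"
        // is now spread across up to 10 distinct salted keys.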
        JavaPairRDD<String, Integer> randomPrefixRdd = counts.mapToPair(
                new PairFunction<Tuple2<String, Integer>, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
                        Random random = new Random();
                        int prefix = random.nextInt(10);
                        return new Tuple2<>(prefix + "_" + stringIntegerTuple2._1, stringIntegerTuple2._2);
                    }
                }
        );

        randomPrefixRdd.foreach(o -> {
            System.out.println(o);
        });


        // Step 2: locally aggregate the randomly prefixed keys.
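        // Each salted key is summed independently, so no single reduce task
        // has to handle every ("spark", 1) record on its own.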
        JavaPairRDD<String, Integer> localAggrRdd = randomPrefixRdd.reduceByKey(
                new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer v1, Integer v2) throws Exception {
                        return v1 + v2;
                    }
                }
        );

        localAggrRdd.foreach(o -> {
            System.out.println(o);
        });


        // Step 3: strip the random prefix from every key.
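        // e.g. ("7_spark", 42) becomes ("spark", 42).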
        JavaPairRDD<String, Integer> removedRandomPrefixRdd = localAggrRdd.mapToPair(
                new PairFunction<Tuple2<String, Integer>, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(Tuple2<String, Integer> stringIntegerTuple2) throws Exception {
                        // Take everything after the first "_" so that original
                        // keys containing "_" are not truncated.
                        String saltedKey = stringIntegerTuple2._1;
                        String originalKey = saltedKey.substring(saltedKey.indexOf("_") + 1);
                        return new Tuple2<>(originalKey, stringIntegerTuple2._2);
                    }
                }
        );

        removedRandomPrefixRdd.foreach(o -> {
            System.out.println(o);
        });

        // Step 4: globally aggregate the de-prefixed keys.
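        // At most 10 partial sums per original key reach this final shuffle,
        // so it stays cheap even for the hottest key.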
        JavaPairRDD<String, Integer> globalAggrRdd = removedRandomPrefixRdd.reduceByKey(
                new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer v1, Integer v2) throws Exception {
                        return v1 + v2;
                    }
                }
        );

        globalAggrRdd.foreach(o -> {
            System.out.println(o);
        });
        sc.stop();
    }
}
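
The same four steps can also be written much more compactly with Java 8 lambdas. Below is a minimal sketch under the same assumptions as the example above: counts is the JavaPairRDD<String, Integer> of (word, 1) pairs built earlier, while saltBuckets and result are illustrative names; it also needs one extra import, java.util.concurrent.ThreadLocalRandom.

        final int saltBuckets = 10;

        JavaPairRDD<String, Integer> result = counts
                // Step 1: salt each key with a random prefix in [0, saltBuckets).
                .mapToPair(t -> new Tuple2<>(
                        ThreadLocalRandom.current().nextInt(saltBuckets) + "_" + t._1, t._2))
                // Step 2: local aggregation per salted key.
                .reduceByKey(Integer::sum)
                // Step 3: strip the salt, keeping everything after the first "_".
                .mapToPair(t -> new Tuple2<>(
                        t._1.substring(t._1.indexOf('_') + 1), t._2))
                // Step 4: global aggregation per original key.
                .reduceByKey(Integer::sum);

        result.foreach(o -> System.out.println(o));

The number of salt buckets is a trade-off: more buckets spread a hot key across more tasks in the first aggregation, but also leave more partial sums for the second one.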

