Getting Started with Spark (16): Grouped TOP N Minimums

1. Grouped TOP N minimums

Group the records of a text file by key, compute the minimum value for each key, and output the TOP N smallest (key, minimum) pairs.
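To pin down the semantics before the Spark version, here is a minimal non-Spark sketch using java.util.stream on a slice of the sample data shown later (the class name TopNLocalSketch is illustrative):

import java.util.*;
import java.util.stream.*;

public class TopNLocalSketch {
    public static void main(String[] args) {
        // (key, value) records, as in groubByNumber.txt
        List<String[]> rows = Arrays.asList(
                new String[]{"A", "100"}, new String[]{"A", "24"},
                new String[]{"B", "43"}, new String[]{"B", "6"},
                new String[]{"H", "36"}, new String[]{"H", "67"});
        rows.stream()
                // minimum value per key
                .collect(Collectors.toMap(r -> r[0], r -> Integer.valueOf(r[1]), Math::min))
                .entrySet().stream()
                .sorted(Map.Entry.comparingByValue()) // ascending by per-key minimum
                .limit(3)                             // TOP N smallest, N = 3
                .forEach(e -> System.out.println(e.getKey() + ":" + e.getValue()));
        // prints B:6, A:24, H:36
    }
}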

 

2. Maven setup


 

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.mk</groupId>
  <artifactId>spark-test</artifactId>
  <version>1.0</version>

  <name>spark-test</name>
  <url>http://spark.mk.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <scala.version>2.11.1</scala.version>
    <spark.version>2.4.4</spark.version>
    <!-- property name assumed (it is not referenced below); only the value 2.6.0 is given -->
    <hadoop.version>2.6.0</hadoop.version>
  </properties>

  <dependencies>
    <!-- scala -->
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>

    <!-- spark -->
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>

    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement>
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>

 

3. Code

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

import java.io.Serializable;

public class TopNApp implements SparkConfInfo {


    public static class SortData implements Comparable<SortData>, Serializable {
        private String key;
        private Integer value;

        public SortData(String key, Integer value) {
            this.key = key;
            this.value = value;
        }

        public String getKey() {
            return key;
        }

        public void setKey(String key) {
            this.key = key;
        }

        public Integer getValue() {
            return value;
        }

        public void setValue(Integer value) {
            this.value = value;
        }

        @Override
        public int compareTo(SortData o) {

            if (o == null) {
                return 1;
            }

            // Compare by value first; Integer.compare avoids the overflow
            // that this.value - o.value could cause near Integer.MIN/MAX_VALUE.
            int diff = Integer.compare(this.value, o.value);
            if (diff != 0)
                return diff;

            // Tie-break on the key, treating null as the smallest key.
            if (key == null)
                return o.key == null ? 0 : -1;

            if (o.key == null)
                return 1;

            return this.key.compareTo(o.key);
        }
    }


    public static void main(String[] args) {

        String filePath = "E:\\spark\\groubByNumber.txt";
        SparkSession sparkSession = new TopNApp().getSparkConf("groubByNumber");
        JavaPairRDD<String, Integer> numbers = sparkSession.sparkContext()
                .textFile(filePath, 4)  // textFile already yields one record per line
                .toJavaRDD()
                .mapToPair(v -> {
                    // Each line is expected to be "key value", whitespace-separated.
                    String[] data = v.split("\\s+");
                    if (data.length != 2) {
                        return null;
                    }
                    // Accept only integers; Integer.valueOf would fail on decimals.
                    if (!data[1].matches("-?[0-9]+"))
                        return null;
                    return new Tuple2<>(data[0], Integer.valueOf(data[1]));
                })
                .filter(v -> v != null)
                .cache();

        // groupByKey materializes every value of a key in memory; with large
        // data volumes this can overflow memory, so it is left disabled here.
//        numbers.groupByKey()
//                .mapValues(v -> {
//                    Integer min = null;
//                    for (Integer val : v) {
//                        if (min == null || min > val) {
//                            min = val;
//                        }
//                    }
//                    return min;
//                })
//                .map(v -> new SortData(v._1, v._2))
//                .sortBy(v -> v, true, 3)
//                .take(3)
//                .forEach(v -> System.out.println(v.key + ":" + v.value));

        // Aggregate per key first (so only one value per key per partition is
        // kept), then sort the per-key minimums globally and take the TOP 3.
        numbers.combineByKey(
                min -> min,                // createCombiner: the first value seen in a partition is the initial minimum
                (min, val) -> {            // mergeValue: fold further values within a partition
                    if (min > val) {
                        min = val;
                    }
                    return min;
                },
                (a, b) -> Math.min(a, b))  // mergeCombiners: combine partial minimums across partitions
                .map(v -> new SortData(v._1, v._2))
                .sortBy(v -> v, true, 3)
                .take(3)
                .forEach(v -> System.out.println(v.key + ":" + v.value));

        sparkSession.stop();
    }
}
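Because the combiner here is just a minimum whose initial value is the first element itself, combineByKey can be collapsed into reduceByKey. A minimal equivalent sketch, reusing the numbers RDD and SortData from above:

        // Equivalent, shorter formulation: reduceByKey applies Math::min both
        // within and across partitions, then the global TOP 3 is taken as before.
        numbers.reduceByKey(Math::min)
                .map(v -> new SortData(v._1, v._2))
                .sortBy(v -> v, true, 3)
                .take(3)
                .forEach(v -> System.out.println(v.key + ":" + v.value));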

import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

public interface SparkConfInfo {

    default SparkSession getSparkConf(String appName) {
        SparkConf sparkConf = new SparkConf();
        if (System.getProperty("os.name").toLowerCase().contains("win")) {
            // Local development on Windows: simulate a cluster with 4 threads.
            sparkConf.setMaster("local[4]");
            System.out.println("Running Spark in local mode");
        } else {
            sparkConf.setMaster("spark://hadoop01:7077,hadoop02:7077,hadoop03:7077");
            // The driver host must be a local IP the cluster can reach, e.g. on the same LAN.
            sparkConf.set("spark.driver.host", "192.168.150.1");
            // Jar produced by the project build.
            sparkConf.setJars(new String[]{".\\out\\artifacts\\spark_test\\spark-test.jar"});
        }

        return SparkSession.builder().appName(appName).config(sparkConf).getOrCreate();
    }
}
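As an aside, instead of hard-coding setJars, the built jar is more commonly shipped to a standalone cluster with spark-submit. A hypothetical invocation for this project (master URL taken from the code above; substitute the fully qualified class name):

spark-submit --class <package>.TopNApp --master spark://hadoop01:7077 spark-test.jar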

Contents of groubByNumber.txt:

A 100
A 24
B 43
C 774
D 43
D 37
D 78
E 42
C 68
F 89
G 49
F 543
H 36
E 888
A 258
A 538
B 79
B 6
H 67
C 99

Output

B:6
A:24
H:36

The per-key minimums are A:24, B:6, C:68, D:37, E:42, F:89, G:49 and H:36; sorted ascending, the three smallest are B:6, A:24 and H:36.

 

4. The take method

List<T> take(int num);

An action that returns the first num elements of the RDD to the driver as a List.
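A minimal sketch of take on a small local RDD (variable names are illustrative; sparkSession is obtained from SparkConfInfo as above):

JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext());
List<Integer> firstTwo = jsc.parallelize(Arrays.asList(5, 1, 4, 2)).take(2);
System.out.println(firstTwo); // [5, 1]

Note that take returns elements in partition order without sorting, which is why the code above calls sortBy before take.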
