Spark Java 分组排序取TopN

1.输入(每行一条记录,格式为"分组键 分数",以单个空格分隔)

c1 85
c2 77
c3 88
c1 22
c1 66
c3 95
c3 54
c2 91
c2 66
c1 54
c1 65
c2 41
c4 65

2.代码实现

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

import java.util.Iterator;

/**
 * User:leen
 * Date:2017/10/18 0018
 * Time:17:41
 */
public class GroupTopN {
    public static void main(String[] args) {

        SparkConf conf = new SparkConf().setAppName("GroupTopN").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD lines = sc.textFile("C:\\Users\\leen\\Desktop\\group.txt");

        //拆分为JavaPairRDD
        JavaPairRDD cs = lines.mapToPair(new PairFunction() {
            public Tuple2 call(String s) throws Exception {
                return new Tuple2(s.split(" ")[0], Integer.valueOf(s.split(" ")[1]));
            }
        });

        //根据Key分组
        JavaPairRDD> csPairs = cs.groupByKey();
        //根据Key排序,升序
        JavaPairRDD> csPairs1 = csPairs.sortByKey();
        //遍历取出Top3
        csPairs1.foreach(new VoidFunction>>() {
            public void call(Tuple2> csPair) throws Exception {
                String name = csPair._1();
                Iterator ite = csPair._2().iterator();
                Integer[] res = new Integer[3];
                //排序,取出Top3
                while (ite.hasNext()) {
                    Integer score = ite.next();
                    for (int i = 0; i < 3; i++) {
                        if (res[i] == null) {
                            res[i] = score;
                            break;
                        } else if (res[i] < score) {
                            for (int j = 2; j > i; j--) {
                                res[j] = res[j - 1];
                            }
                            res[i] = score;
                            break;
                        }
                    }
                }
                System.out.print(name + ":");
                for (int i = 0; i < res.length; i++) {
                    System.out.print(res[i] + "\t");
                }
                System.out.println();
            }
        });

        sc.close();
    }
}

3.输出(每个分组按分数降序输出前3名;分组内不足3条记录时,空缺位置输出 null)

c1:85   66  65  
c2:91   77  66  
c3:95   88  54  
c4:65   null    null    

4.Scala版本参考地址:

Spark Scala 分组排序取TopN

你可能感兴趣的:(spark)