28、Spark核心编程之高级编程之二次排序

需求

  1. 按照文件中的第一列升序排序。
  2. 如果第一列相同,则按照第二列升序排序。

文件内容

2 5
3 6
2 4
1 3
1 5

Java实现

自定义的二次排序key

/**
 * Composite key for a two-column ("secondary") sort.
 *
 * <p>Keys are ordered by {@code first}; ties are broken by {@code second}.
 * The class implements Scala's {@code Ordered} so it can be used as the key
 * of {@code sortByKey}, and {@code Serializable} as required for a custom key.
 */
public class SecondarySortKey implements Ordered<SecondarySortKey>, Serializable {

    // The columns that take part in the ordering.
    private int first;
    private int second;

    public SecondarySortKey(int first, int second) {
        this.first = first;
        this.second = second;
    }

    /**
     * Compares this key with {@code that}: by {@code first}, then by {@code second}.
     *
     * <p>Uses {@link Integer#compare(int, int)} rather than subtraction:
     * {@code a - b} overflows when the operands are far apart (for example
     * {@code Integer.MIN_VALUE - 1}), which would invert the ordering.
     *
     * @param that the key to compare against
     * @return a negative value, zero, or a positive value if this key is
     *         less than, equal to, or greater than {@code that}
     */
    @Override
    public int compare(SecondarySortKey that) {
        int byFirst = Integer.compare(this.first, that.first);
        return byFirst != 0 ? byFirst : Integer.compare(this.second, that.second);
    }

    // The operator methods below are the Java-visible names of Scala's
    // <, >, <= and >=. They all delegate to compare() so the ordering is
    // defined in exactly one place.

    /** Scala {@code <}: true when this key sorts strictly before {@code that}. */
    @Override
    public boolean $less(SecondarySortKey that) {
        return compare(that) < 0;
    }

    /** Scala {@code >}: true when this key sorts strictly after {@code that}. */
    @Override
    public boolean $greater(SecondarySortKey that) {
        return compare(that) > 0;
    }

    /** Scala {@code <=}: true when this key sorts before or equal to {@code that}. */
    @Override
    public boolean $less$eq(SecondarySortKey that) {
        return compare(that) <= 0;
    }

    /** Scala {@code >=}: true when this key sorts after or equal to {@code that}. */
    @Override
    public boolean $greater$eq(SecondarySortKey that) {
        return compare(that) >= 0;
    }

    /** {@link Comparable} entry point; same ordering as {@link #compare(SecondarySortKey)}. */
    @Override
    public int compareTo(SecondarySortKey that) {
        return compare(that);
    }

    // Getters and setters for the sorted columns, plus hashCode and equals.

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    /** Two keys are equal when both columns are equal (consistent with {@code compare}). */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        SecondarySortKey that = (SecondarySortKey) o;
        return first == that.first &&
                second == that.second;
    }

    @Override
    public int hashCode() {
        return Objects.hash(first, second);
    }
}

二次排序

/**
 * 二次排序
 * 1、实现自定义的key,要实现Ordered接口和Serializable接口,在key中实现自己对多个列的排序算法
 * 2、将包含文本的RDD,映射成key为自定义key,value为文本的JavaPairRDD
 * 3、使用sortByKey算子按照自定义的key进行排序
 * 4、再次映射,剔除自定义的key,只保留文本行
 *
 */
public class SecondarySort {

    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("SecondarySortJava").setMaster("local");
        JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
        JavaRDD numsRDD = sparkContext.textFile("E:\\testdata\\wordcount\\input\\sort.txt");
        JavaPairRDD pairs = numsRDD.mapToPair(new PairFunction() {
            @Override
            public Tuple2 call(String s) throws Exception {
                return new Tuple2<>(new SecondarySortKey(Integer.parseInt(s.split(" ")[0]),
                        Integer.parseInt(s.split(" ")[1])), s);
            }
        });
        JavaPairRDD sortedPairs = pairs.sortByKey();
        JavaRDD result = sortedPairs.map(new Function, String>() {
            @Override
            public String call(Tuple2 secondarySortKeyStringTuple2) throws Exception {
                return secondarySortKeyStringTuple2._2;
            }
        });

        result.foreach(new VoidFunction() {
            @Override
            public void call(String s) throws Exception {
                System.out.println("s = " + s);
            }
        });

        sparkContext.close();
    }
}

Scala实现

SecondarySortKey

/**
 * Composite key for a two-column ("secondary") sort.
 *
 * Keys are ordered by `first`; ties are broken by `second`.
 *
 * @param first  primary sort column
 * @param second secondary sort column, used only when `first` is equal
 */
class SecondarySortKey(val first: Int, val second: Int) extends Ordered[SecondarySortKey] with Serializable {

  /**
   * Compares this key with `that`: by `first`, then by `second`.
   *
   * Uses `Integer.compare` instead of subtraction: `a - b` overflows when the
   * operands are far apart (e.g. `Int.MinValue - 1`), which would invert the
   * ordering.
   *
   * @return a negative value, zero, or a positive value if this key is less
   *         than, equal to, or greater than `that`
   */
  override def compare(that: SecondarySortKey): Int = {
    val byFirst = Integer.compare(this.first, that.first)
    if (byFirst != 0) byFirst else Integer.compare(this.second, that.second)
  }
}

SecondarySort

/**
 * Secondary sort driver: reads lines of two space-separated integers, sorts
 * them by the first column and then by the second, and prints the lines.
 */
object SecondarySort {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SecondarySortScala").setMaster("local")
    val sparkContext = new SparkContext(conf)

    // Stop the context even when the job throws (e.g. on a malformed line).
    try {
      val linesRDD = sparkContext.textFile("E:\\testdata\\wordcount\\input\\sort.txt")

      // Key every line by its two integer columns; split once and reuse both.
      val keyLineRDD = linesRDD.map { line =>
        val columns = line.split(" ")
        (new SecondarySortKey(columns(0).toInt, columns(1).toInt), line)
      }

      val sortedKeyLine = keyLineRDD.sortByKey()

      // Drop the key and keep only the original text line.
      val result = sortedKeyLine.map(keyLine => keyLine._2)
      result.foreach(line => println(line))
    } finally {
      sparkContext.stop()
    }
  }
}

你可能感兴趣的:(28、Spark核心编程之高级编程之二次排序)