Day 19: Implementing Secondary Sort

This article is compiled from Wang Jialin's Big Data Dream Factory: http://weibo.com/ilovepains

This post implements secondary sort in both Java and Scala.

Analysis:

1. Implement a custom sort key using Ordered and Serializable.
2. Load the file to be sorted into an RDD of (key, value) pairs keyed by the custom key.
3. Call sortByKey to sort by the custom key.
4. Discard the key, keeping only the sorted records.
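For example, given the input lines "4 3", "4 1", and "2 3", secondary sort orders by the first field and then by the second, producing "2 3", "4 1", "4 3".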

Implementing the custom key:

import java.io.Serializable;

import scala.math.Ordered;

/**
 * Custom secondary-sort key: orders by the first field, then by the second.
 * Created 2016-02-20.
 */
public class SecondSortByKey implements Ordered<SecondSortByKey> , Serializable {

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        SecondSortByKey that = (SecondSortByKey) o;

        if (first != that.first) return false;
        return second == that.second;

    }

    @Override
    public int hashCode() {
        int result = first;
        result = 31 * result + second;
        return result;
    }

    private int first, second;   // the two fields to sort on

    public SecondSortByKey(int first, int second) {
        this.first = first;
        this.second = second;
    }

    // Compare by the first field, then by the second. Integer.compare
    // avoids the overflow that plain subtraction can hit on extreme values.
    public int compare(SecondSortByKey other) {
        if (this.first != other.getFirst()) {
            return Integer.compare(this.first, other.getFirst());
        }
        return Integer.compare(this.second, other.getSecond());
    }

  
    public boolean $less(SecondSortByKey other) {
        if (this.first < other.getFirst() ){
            return  true;
        } else if (this.first == other.getFirst() && this.second < other.getSecond()){
            return true;
        }
       return  false;
    }

    
    public boolean $greater(SecondSortByKey other) {
        if (this.first > other.getFirst()){
            return  true;
        } else if (this.first == other.getFirst() && this.second > other.getSecond()) { // first equal, second greater
            return true;
        }
        return false;
    }

    
    public boolean $less$eq(SecondSortByKey other) {
        if (this.$less(other)){
            return true;
        } else if (this.first == other.getFirst() && this.second == other.getSecond()){
            return true;
        }
        return false;
    }

   
    public boolean $greater$eq(SecondSortByKey other) {   // greater, or equal on both fields
        if (this.$greater(other)){
            return true;
        } else if (this.first == other.getFirst() && this.second == other.getSecond()){
            return true;
        }
        return false;
    }

    
    // compareTo comes from java.lang.Comparable (via Ordered); sortByKey's
    // default natural ordering calls it.
    public int compareTo(SecondSortByKey other) {
        if (this.first != other.getFirst()) {
            return Integer.compare(this.first, other.getFirst());
        }
        return Integer.compare(this.second, other.getSecond());
    }

}
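Two details make this key work with Spark: scala.math.Ordered extends java.lang.Comparable, which is what sortByKey's default natural ordering calls into, and the key must be Serializable because it gets shuffled across the cluster during the sort. Note also that the comparisons above use Integer.compare rather than subtraction, since subtracting two ints can overflow when the operands have opposite signs.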

The driver class:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Driver class: loads the data, sorts it with the custom key,
 * and prints the result. Created 2016-02-20.
 */
public class SecondArraySort {

    // 1. Implement a custom sort key using Ordered and Serializable
    // 2. Load the input file into an RDD of (key, value) pairs keyed by it
    // 3. Sort by the custom key with sortByKey
    // 4. Discard the key, keeping only the sorted values
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("sort by spark").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<String> lines = sc.textFile("D://googledown//datas.txt");
        JavaPairRDD<SecondSortByKey, String> pairs = lines.mapToPair(new PairFunction<String, SecondSortByKey, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public Tuple2<SecondSortByKey, String> call(String line) throws Exception {
                String[] strs = line.split(" ");
                SecondSortByKey sortByKey = new SecondSortByKey(Integer.valueOf(strs[0]), Integer.valueOf(strs[1]));
                return new Tuple2<SecondSortByKey, String>(sortByKey, line);
            }
        });

        JavaPairRDD<SecondSortByKey, String> sorted = pairs.sortByKey();    // this performs the secondary sort
        // The key was built only for sorting; drop it and keep the original lines.
        JavaRDD<String> secondSorted = sorted.map(new Function<Tuple2<SecondSortByKey, String>, String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public String call(Tuple2<SecondSortByKey, String> sortedContext) throws Exception {
                return sortedContext._2();  // return the value, i.e. the original line
            }
        });
        secondSorted.foreach(new VoidFunction<String>() {
            private static final long serialVersionUID = 1L;

            @Override
            public void call(String sorted) throws Exception {
                System.out.println(sorted);
            }
        });
        sc.stop();
    }
}
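For the Scala version, no custom key class is needed at all: Scala supplies an implicit Ordering for tuples that compares element by element, so an (Int, Int) pair can serve as the sort key directly. A minimal sketch, assuming the same input path as the Java version (the object name SecondarySortScala is arbitrary):

import org.apache.spark.{SparkConf, SparkContext}

object SecondarySortScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("sort by spark").setMaster("local")
    val sc = new SparkContext(conf)
    val lines = sc.textFile("D://googledown//datas.txt")
    // Key each line by an (Int, Int) tuple; the implicit Ordering on
    // tuples compares the first element, then the second.
    val pairs = lines.map { line =>
      val fields = line.split(" ")
      ((fields(0).toInt, fields(1).toInt), line)
    }
    val sorted = pairs.sortByKey()        // secondary sort via tuple ordering
    sorted.map(_._2).foreach(println)     // drop the key, keep the lines
    sc.stop()
  }
}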

pom.xml dependencies:

    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.10</artifactId>
      <version>1.6.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.10</artifactId>
      <version>1.6.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-hive_2.10</artifactId>
      <version>1.6.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.10</artifactId>
      <version>1.6.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.6.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka_2.10</artifactId>
      <version>1.6.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-graphx_2.10</artifactId>
      <version>1.6.0</version>
    </dependency>
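Of these, only spark-core_2.10 (plus its transitive dependencies) is strictly required for this example; the SQL, Hive, streaming, Kafka, GraphX, and Hadoop client entries appear to be carried along for the other days of the series.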

Test data:

2 3
4 1
3 2
4 3
8 7
2 1
9 7
9 8
8 3

Run results:
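Running the program on the test data above should print the lines ordered by the first field, then the second:

2 1
2 3
3 2
4 1
4 3
8 3
8 7
9 7
9 8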

