This article is based on material from Wang Jialin's Big Data Dream Factory: http://weibo.com/ilovepains
Implementing secondary sort in both Java and Scala
Analysis:
1. Implement a custom sort key using Ordered and Serializable.
2. Load the file to be secondary-sorted and map it into an RDD of (key, value) pairs keyed by the custom key.
3. Sort with sortByKey, which uses the custom key's ordering.
4. Strip the key and keep only the sorted values.
Implement the custom key:
import java.io.Serializable;

import scala.math.Ordered;

/**
 * Custom key for secondary sort: ordered by first, then by second.
 * Created 2016-02-20.
 */
public class SecondSortByKey implements Ordered<SecondSortByKey>, Serializable {

    private int first, second;

    public SecondSortByKey(int first, int second) {
        this.first = first;
        this.second = second;
    }

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public int getSecond() {
        return second;
    }

    public void setSecond(int second) {
        this.second = second;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        SecondSortByKey that = (SecondSortByKey) o;
        if (first != that.first) return false;
        return second == that.second;
    }

    @Override
    public int hashCode() {
        int result = first;
        result = 31 * result + second;
        return result;
    }

    // Compare by first; fall back to second on a tie.
    // Integer.compare avoids the overflow risk of plain subtraction.
    public int compare(SecondSortByKey other) {
        if (this.first != other.getFirst()) {
            return Integer.compare(this.first, other.getFirst());
        }
        return Integer.compare(this.second, other.getSecond());
    }

    public boolean $less(SecondSortByKey other) {
        if (this.first < other.getFirst()) {
            return true;
        }
        // first is equal and second is smaller
        return this.first == other.getFirst() && this.second < other.getSecond();
    }

    public boolean $greater(SecondSortByKey other) {
        if (this.first > other.getFirst()) {
            return true;
        }
        // first is equal and second is bigger
        return this.first == other.getFirst() && this.second > other.getSecond();
    }

    public boolean $less$eq(SecondSortByKey other) {
        // less-than, or equal on both fields
        return this.$less(other)
                || (this.first == other.getFirst() && this.second == other.getSecond());
    }

    public boolean $greater$eq(SecondSortByKey other) {
        // greater-than, or equal on both fields
        return this.$greater(other)
                || (this.first == other.getFirst() && this.second == other.getSecond());
    }

    public int compareTo(SecondSortByKey other) {
        return this.compare(other);
    }
}
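Because the key implements scala.math.Ordered, Scala code gets the comparison operators for free. A quick sanity check of the ordering (a sketch, assuming the compiled class is on the classpath, e.g. in spark-shell):

// assumes SecondSortByKey from above is compiled and on the classpath
val a = new SecondSortByKey(2, 3)
val b = new SecondSortByKey(2, 1)
println(a > b)                          // true: the first fields tie, and 3 > 1
println(a < new SecondSortByKey(3, 0))  // true: 2 < 3, second field is ignored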
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

/**
 * Secondary sort driver. Created 2016-02-20.
 *
 * Steps:
 *  1. Implement a custom sort key via Ordered and Serializable.
 *  2. Load the input file into an RDD of (key, value) pairs.
 *  3. Sort with sortByKey using the custom key.
 *  4. Drop the key and keep only the sorted values.
 */
public class SecondArraySort {

    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("sort by spark").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);

        JavaRDD<String> lines = sc.textFile("D://googledown//datas.txt");

        // Wrap each line in a (SecondSortByKey, line) pair.
        JavaPairRDD<SecondSortByKey, String> pairs = lines.mapToPair(
                new PairFunction<String, SecondSortByKey, String>() {
                    private static final long serialVersionUID = 1L;

                    public Tuple2<SecondSortByKey, String> call(String line) throws Exception {
                        String[] strs = line.split(" ");
                        SecondSortByKey sortKey = new SecondSortByKey(
                                Integer.valueOf(strs[0]), Integer.valueOf(strs[1]));
                        return new Tuple2<SecondSortByKey, String>(sortKey, line);
                    }
                });

        // The secondary sort itself: sortByKey uses the custom key's ordering.
        JavaPairRDD<SecondSortByKey, String> sorted = pairs.sortByKey();

        // The key was only constructed for sorting; keep just the values.
        JavaRDD<String> secondSorted = sorted.map(
                new Function<Tuple2<SecondSortByKey, String>, String>() {
                    private static final long serialVersionUID = 1L;

                    public String call(Tuple2<SecondSortByKey, String> sortedContext) throws Exception {
                        return sortedContext._2(); // return the value
                    }
                });

        secondSorted.foreach(new VoidFunction<String>() {
            public void call(String sorted) throws Exception {
                System.out.println(sorted);
            }
        });

        sc.close();
    }
}
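The title also promises a Scala version, but none appears in the original post. Here is a minimal sketch of my own (class and object names are mine; the path and logic mirror the Java driver). It is much shorter because extending scala.math.Ordered supplies $less, $greater, and the other comparison methods automatically:

import org.apache.spark.{SparkConf, SparkContext}

// Scala counterpart of SecondSortByKey: Ordered supplies the comparison operators.
class SecondSortKey(val first: Int, val second: Int)
    extends Ordered[SecondSortKey] with Serializable {
  override def compare(other: SecondSortKey): Int =
    if (this.first != other.first) Integer.compare(this.first, other.first)
    else Integer.compare(this.second, other.second)
}

object SecondarySortScala {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("sort by spark").setMaster("local")
    val sc = new SparkContext(conf)

    val lines = sc.textFile("D://googledown//datas.txt")

    // Key each line with the custom key, sort, then drop the key.
    val pairs = lines.map { line =>
      val strs = line.split(" ")
      (new SecondSortKey(strs(0).toInt, strs(1).toInt), line)
    }
    pairs.sortByKey().map(_._2).foreach(println)

    sc.stop()
  }
}

sortByKey picks up the ordering through the implicit Ordering derived from Ordered, so no extra comparator needs to be passed.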
Maven dependencies (pom.xml):

<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>3.8.1</version>
    <scope>test</scope>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.10</artifactId>
    <version>1.6.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.10</artifactId>
    <version>1.6.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.10</artifactId>
    <version>1.6.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.10</artifactId>
    <version>1.6.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.6.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka_2.10</artifactId>
    <version>1.6.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-graphx_2.10</artifactId>
    <version>1.6.0</version>
</dependency>
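The POM pins Spark 1.6.0 built against Scala 2.10; only spark-core is actually needed for this example. If you build the Scala sketch with sbt instead, a minimal build.sbt along these lines should work (assumed, not from the original post):

name := "secondary-sort"

scalaVersion := "2.10.6"

libraryDependencies += "org.apache.spark" %% "spark-core" % "1.6.0"

Test input, the contents of datas.txt: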
2 3
4 1
3 2
4 3
8 7
2 1
9 7
9 8
8 3
Run result:
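The original post cuts off here, but the output is deterministic: applying the ordering above to the sample input should print the following (in local mode with a single partition, the lines come out in this order):

2 1
2 3
3 2
4 1
4 3
8 3
8 7
9 7
9 8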