大数据IMF传奇 第19课 spark 二次排序 使用JAVA自定义key 进行二次排序

scala> sc.textFile("/README.txt").flatMap(_.split(" ")).map((_,1)).reduceByKey(_+_).map(x =>(x._2,x._1)).sortByKey(false).map(x=>(x._2,x._1)).collect

res0: Array[(String, Int)] = Array(("",18), (the,8), (and,6), (of,5), (The,4), (this,3), (encryption,3), (for,3), (cryptographic,3), (Software,2), (which,2), (at:,2), (software,2), (re-export,2), (includes,2), (import,,2), (software.,2), (possession,,2), (our,2), (please,2), (distribution,2), (on,2), (using,2), (or,2), (use,,2), (information,2), (to,2), (software,,2), (more,2), (Export,2), (Hadoop,1), (Commodity,1), (For,1), (country,1), (under,1), (it,1), (Jetty,1), (Technology,1), (<http://www.wassenaar.org/>,1), (have,1), (http://wiki.apache.org/hadoop/,1), (BIS,1), (classified,1), (This,1), (following,1), (security,1), (See,1), (Number,1), (export,1), (reside,1), ((BIS),,1), (any,1), (makes,1), (algorithms.,1), (latest,1), (your,1), (SSL,1), (Administration,1), (provides,1), (Unrest...
scala>

 

[root@master IMF2016]#hadoop fs -put helloSpark.txt /
16/01/24 04:37:35 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[root@master IMF2016]#


[root@master IMF2016]#hadoop fs -put helloSpark.txt /

 

[root@master IMF2016]#cat helloSpark.txt
2 3
4 1
3 2
4 3
8 7
2 1

运行结果

2 1
2 3
3 2
4 1
4 3
8 7

 

大数据IMF传奇 第19课 spark 二次排序 使用JAVA自定义key 进行二次排序_第1张图片

 

源代码:

package com.dt.spark.SparkApps.cores;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;

import scala.Tuple2;

public class SecondaySortApp {

 public static void main(String[] args) {
  SparkConf conf = new SparkConf().setAppName("SecondaySortApp").setMaster("local");
  JavaSparkContext sc = new JavaSparkContext(conf); //其底层实际上就是Scala的SparkContext
  JavaRDD<String> lines = sc.textFile("G://IMFBigDataSpark2016//tesdata//helloSpark.txt");
  
  JavaPairRDD<SecondarySortKey, String> pairs = lines.mapToPair(new PairFunction<String, SecondarySortKey, String>() {

   private static final long serialVersionUID = 1L;

   @Override
   public Tuple2<SecondarySortKey, String> call(String line) throws Exception {
    String[] splited =  line.split(" ");
    SecondarySortKey key =new SecondarySortKey(Integer.valueOf(splited[0]),Integer.valueOf(splited[1]));
    return new Tuple2<SecondarySortKey,String>(key,line);
   }
  });
  
  JavaPairRDD<SecondarySortKey, String> sorted = pairs.sortByKey();
  
   JavaRDD<String> SecondaySorted=sorted.map(new Function<Tuple2<SecondarySortKey,String>, String>() {

   /**
    *
    */
   private static final long serialVersionUID = 1L;

   @Override
   public String call(Tuple2<SecondarySortKey, String> sortedContent) throws Exception {
    // TODO Auto-generated method stub
    return sortedContent._2;
   }
  });
   SecondaySorted.foreach(new VoidFunction<String>() {

   @Override
   public void call(String sorted) throws Exception {
     System.out.println(sorted);
   }
  });
  
 }

}

 

KEY的定义

package com.dt.spark.SparkApps.cores;

import java.io.Serializable;

import scala.math.Ordered;

public class SecondarySortKey implements Ordered<SecondarySortKey>,Serializable{

 private int first;
 private int second;
 
 
 public int getFirst() {
  return first;
 }

 


 public void setFirst(int first) {
  this.first = first;
 }

 


 public int getSeconde() {
  return second;
 }

 


 public void setSeconde(int seconde) {
  this.second = seconde;
 }

 


 public SecondarySortKey(int first, int second){
  this.first =first;
  this.second = second;
 }
 
 
 
 
 @Override
 public boolean $greater(SecondarySortKey other) {
        if(this.first > other.getFirst()){
         return true;
        } else if (this.first==other.getFirst()&& this.second > other.getSeconde()) {
         return true;
        }

  return false;
 }
 @Override
 public boolean $greater$eq(SecondarySortKey other) {
     if (this.$greater(other)){
      return true;
     } else if (this.first == other.getFirst() && this.second == other.getSeconde()) {
      return true;
     }
  return false;
 }
 @Override
 public boolean $less(SecondarySortKey other) {
  if(this.first < other.getFirst() ) {
   return true;
  } else if (this.first == other.getFirst() && this.second < other.getSeconde()){
      return true; 
  }
   return false;
 }
 @Override
 public boolean $less$eq(SecondarySortKey other) {
 if (this.$less(other)){
  return true;
 }  else if (this.first == other.getFirst() && this.second == other.getSeconde()) {
  return true;
 }
  return false;
 }
 @Override
 public int compare(SecondarySortKey other) {
  if(this.first - other.getFirst() != 0){
   return this.first - other.getFirst();
  }else {
   return this.second - other.getSeconde();
  }
  
 }
 @Override
 public int compareTo(SecondarySortKey other) {
  if(this.first - other.getFirst() != 0){
   return this.first - other.getFirst();
  }else {
   return this.second - other.getSeconde();
  }
 }

 


 @Override
 public int hashCode() {
  final int prime = 31;
  int result = 1;
  result = prime * result + first;
  result = prime * result + second;
  return result;
 }

 


 @Override
 public boolean equals(Object obj) {
  if (this == obj)
   return true;
  if (obj == null)
   return false;
  if (getClass() != obj.getClass())
   return false;
  SecondarySortKey other = (SecondarySortKey) obj;
  if (first != other.first)
   return false;
  if (second != other.second)
   return false;
  return true;
 }
 
 
}

 

 

 

你可能感兴趣的:(大数据IMF传奇 第19课 spark 二次排序 使用JAVA自定义key 进行二次排序)