Spark: sortBy, sortByKey, and Secondary Sort

Sample data

(exam room, class, student ID) -> exam room ascending, class ascending, student ID descending. For example, 1 1 16 sorts before 1 1 4 because the student ID is the descending tie-breaker.

1 1 3
1 1 4
1 2 8
1 3 7
3 2 9
3 5 11
1 4 13
1 5 12
2 1 14
2 1 10
2 4 1
2 3 5
2 4 6
3 5 2
3 2 15
1 1 16
2 2 17
3 3 18
2 2 19
3 3 20

sortBy

package com.spark.sort
import org.apache.spark.{SparkConf, SparkContext}
import scala.reflect.classTag
/**
  * Created by wqh on 2017/9/11.
  */
object TestsortBy extends App {

    val conf = new SparkConf()
    conf.setAppName("TestsortBy").setMaster("local[4]")
    val sc = new SparkContext(conf)
    val rdd1 = sc.textFile("/Users/wqh/Desktop/data/s.txt", 4)
    val rdd2 = rdd1.map(_.split(" ")).map(t => (t(0), t(1), t(2)))

    // Compare the fields numerically; a plain String compare would order "10" before "2".
    val mysortBy = new Ordering[(String, String, String)] {
        override def compare(x: (String, String, String), y: (String, String, String)): Int = {
            val r = x._1.toInt.compare(y._1.toInt)
            val r2 = x._2.toInt.compare(y._2.toInt)
            if (r == 0) {
                if (r2 == 0) y._3.toInt - x._3.toInt else r2 // third field descending
            } else r
        }
    }
    val rdd3 = rdd2.sortBy(x => x)(mysortBy, classTag[(String, String, String)])
    rdd3.collect().foreach(println)
}
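
sortBy takes its Ordering and ClassTag as implicit parameters, so the explicit classTag plumbing above can be avoided by marking the Ordering implicit. A minimal sketch of that variant (byExamFields and rddInt are illustrative names, not part of the original code), reusing rdd1 from the listing above and parsing the fields to Int up front:

// composite ordering from the standard library: first two fields ascending, third descending
implicit val byExamFields: Ordering[(Int, Int, Int)] =
    Ordering.Tuple3(Ordering.Int, Ordering.Int, Ordering.Int.reverse)

val rddInt = rdd1.map(_.split(" ")).map(t => (t(0).toInt, t(1).toInt, t(2).toInt))
val sorted = rddInt.sortBy(x => x) // the compiler supplies byExamFields and the ClassTag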

sortByKey

package com.spark.sort
import org.apache.spark.{SparkConf, SparkContext}
/**
  * Created by wqh on 2017/9/11.
  */
object TestsortByKey extends App {

    val conf = new SparkConf()
    conf.setAppName("TestsortBykey").setMaster("local[4]")
    val sc = new SparkContext(conf)
    val rdd1 = sc.textFile("/Users/wqh/Desktop/data/s.txt", 4)
    // Parse the fields to Int so the key type matches the implicit Ordering below;
    // with String keys the default tuple ordering would be picked up instead.
    val rdd2 = rdd1.map(_.split(" ")).map(t => ((t(0).toInt, t(1).toInt, t(2).toInt), 1))

    implicit val mysort = new Ordering[Tuple3[Int,Int,Int]]{
        override def compare(x: (Int, Int, Int), y: (Int, Int, Int)): Int = {
            val r = x._1.compare(y._1)
            val r2 = x._2.compare(y._2)
            if (r == 0) {
            if (r2 == 0) y._3 - x._3 else r2 // third field descending
            } else r
        }
    }
    val rdd3 = rdd2.sortByKey().collect()
    for ((k, v) <- rdd3) println(k)
    // rdd3.keys.foreach(println) does not compile here: after collect(), rdd3 is an Array, not a pair RDD
}
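
Note that keys is defined on pair RDDs, not on the collected Array, which is why the commented-out line fails. A minimal sketch of printing just the sorted keys, done on the RDD before collecting:

val sortedKeys = rdd2.sortByKey().keys // still an RDD[(Int, Int, Int)]
sortedKeys.collect().foreach(println)  // collect to the driver first, then print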

SecondarySort

Secondary sort via sortByKey: rebuild the fields to sort on into a composite key, and keep the whole line as the value.

package com.spark.sort
import org.apache.spark.{SparkConf, SparkContext}
/**
  * Created by wqh on 2017/9/11.
  */
object SecondarySort extends App {

    val conf = new SparkConf()
    conf.setAppName("TestsortBy").setMaster("local[4]")
    val sc = new SparkContext(conf)
    val rdd1 = sc.textFile("/Users/wqh/Desktop/data/s.txt", 4)
    val rdd2 = rdd1.map(line => {
        val r = line.split(" ")
        val key = new SecondarySortKey(r(0).toInt, r(1).toInt, r(2).toInt)
        (key, line)
    })
    val res = rdd2.sortByKey().map(t => t._2)
    res.collect().foreach(println)
}

The custom key class: SecondarySortKey

package com.spark.sort

/**
  * Created by wqh on 2017/9/12.
  */
class SecondarySortKey(val first: Int, val second: Int, val third: Int) extends Ordered[SecondarySortKey] with Serializable {
    override def compare(other: SecondarySortKey): Int = {
        val r = first.compare(other.first)
        val r2 = second.compare(other.second)
        if (r == 0) {
            if (r2 == 0) other.third - this.third else r2 // first two fields ascending, third descending
        } else r
    }
}
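
Since the key must be both comparable and serializable to survive the shuffle, a case class is a natural alternative here: it derives equals, hashCode, and Serializable automatically. A minimal sketch of that variant (ExamKey is an illustrative name, not part of the original code):

// case classes are Serializable by default, so no explicit mixin is needed
case class ExamKey(first: Int, second: Int, third: Int) extends Ordered[ExamKey] {
    override def compare(other: ExamKey): Int = {
        val r = first.compare(other.first)
        if (r != 0) r
        else {
            val r2 = second.compare(other.second)
            if (r2 != 0) r2 else other.third.compare(third) // third field descending
        }
    }
}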

Result

1 1 16
1 1 4
1 1 3
1 2 8
1 3 7
1 4 13
1 5 12
2 1 14
2 1 10
2 2 19
2 2 17
2 3 5
2 4 6
2 4 1
3 2 15
3 2 9
3 3 20
3 3 18
3 5 11
3 5 2
