More code is available at: https://github.com/xubo245/SparkLearning
1. Explanation:
Datasets often have many columns, so sorting by a single field as the key is sometimes not enough; we may need to sort by two fields at once. Spark's RDD provides the keyBy method for this: it builds a composite key from the chosen fields, and sortByKey then orders the records by that key (a minimal sketch follows the example below). An example use case:
init:
(ab,2,3)
(ac,4,100)
(bb,1,200)
(ac,1,1)
(bb,2,5)
sort:
(ab,2,3)
(ac,1,1)
(ac,4,100)
(bb,1,200)
(bb,2,5)
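As a minimal sketch of the core idea (assuming an existing SparkContext sc and triples like the ones above; the names here are illustrative): keyBy turns each record into a (key, value) pair whose key is the composite (field1, field2), sortByKey orders by that pair, and values drops the key again.

// Minimal sketch: sort triples by their first two fields via a composite key.
val triples = sc.parallelize(Array(("ab", 2, 3), ("bb", 2, 5), ("ac", 4, 100)))
val sorted = triples
  .keyBy(t => (t._1, t._2)) // key = (field1, field2), value = whole triple
  .sortByKey()              // lexicographic order on the tuple key
  .values                   // drop the key, keep the original triples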
2. Code:
package org.bdgenomics.adam.test

import org.apache.spark.{SparkConf, SparkContext}
import org.bdgenomics.adam.instrumentation.Timers._

/**
 * Created by xubo on 2016/5/19.
 */
object KeyBySuite {
  def main(args: Array[String]) {
    keybyTest()
  }

  // SortReads.time is an ADAM instrumentation timer; it only times the enclosed block.
  def keybyTest() = SortReads.time {
    val conf = new SparkConf()
      .setAppName(this.getClass().getSimpleName().filter(!_.equals('$')))
      .setMaster("local[4]")
    val sc = new SparkContext(conf)
    // val rdd1 = sc.parallelize(Array(("a", 2, 3), ("b", 2, 5), ("a", 4, 100), ("b", 1, 200), ("a", 1, 1)))
    val rdd1 = sc.parallelize(Array(("ab", 2, 3), ("bb", 2, 5), ("ac", 4, 100), ("bb", 1, 200), ("ac", 1, 1)))
    // keyBy builds a (key, value) pair per record; the key is the composite (field1, field2).
    val rdd2 = rdd1.keyBy(each => (each._1, each._2)).sortByKey()
    println("init:")
    rdd1.foreach(println)
    // foreach runs in parallel on the executors, so the printed order is not
    // guaranteed even though rdd2 itself is sorted.
    println("keyby not zipWithIndex:")
    rdd2.foreach(println)
    println("keyby value:")
    rdd2.map(_._2).foreach(println)
    println("zipWithIndex:")
    // collect() brings the sorted records to the driver as a local array,
    // so zipWithIndex and the printed order both respect the sort.
    val rdd3 = rdd1.keyBy(each => (each._1, each._2)).sortByKey().map(_._2).collect().zipWithIndex
    rdd3.foreach(println)
    println("sort:")
    rdd3.map(_._1).foreach(println)
    sc.stop()
  }
}
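As an aside (not part of the original program), the same two-field sort can be written without keyBy by using RDD.sortBy, which takes the key function directly; a minimal sketch, assuming the same rdd1 as above:

// Equivalent composite sort using sortBy instead of keyBy + sortByKey.
val sortedBySortBy = rdd1.sortBy(t => (t._1, t._2))
sortedBySortBy.collect().foreach(println) // collect first for ordered output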
3. Results:
init:
(ab,2,3)
(ac,4,100)
(bb,1,200)
(ac,1,1)
(bb,2,5)
keyby not zipWithIndex:
((ab,2),(ab,2,3))
((ac,4),(ac,4,100))
((bb,2),(bb,2,5))
((bb,1),(bb,1,200))
((ac,1),(ac,1,1))
keyby value:
(bb,2,5)
(ac,4,100)
(ab,2,3)
(ac,1,1)
(bb,1,200)
zipWithIndex:
((ab,2,3),0)
((ac,1,1),1)
((ac,4,100),2)
((bb,1,200),3)
((bb,2,5),4)
sort:
(ab,2,3)
(ac,1,1)
(ac,4,100)
(bb,1,200)
(bb,2,5)
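Note that the "keyby not zipWithIndex" and "keyby value" outputs above are not in key order even though rdd2 was produced by sortByKey: foreach runs concurrently on the executors, so the printed lines interleave. To print a sorted RDD in order, collect it to the driver first; a one-line sketch using the rdd2 defined above:

rdd2.collect().foreach(println) // driver-side iteration preserves the sorted order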