Source data format (each line is year,month,day,value)
cat source.txt
2000,12,04,10
2000,11,01,20
2000,12,02,-20
2000,11,07,30
2000,11,24,-40
2012,12,21,30
2012,12,22,-20
2012,12,23,60
2012,12,24,70
2012,12,25,10
2013,01,23,90
2013,01,24,70
2013,01,20,-10
After sorting: records are grouped by year and month, the groups are ordered by year-month in descending order, and the values within each group are also sorted in descending order
cat result
2013-01 90,70,-10
2012-12 70,60,30,10,-20
2000-12 10,-20
2000-11 30,20,-40
Implementation
spark-shell
val hf = sc.textFile("file:///data/spark/source.txt")
// x(3).toInt so the values sort numerically rather than as strings
val data = hf.map(line=>line.split(",")).map(x=>((x(0),x(1)),x(3).toInt)).groupByKey().sortByKey(false).map(x=>(x._1._1+"-"+x._1._2,x._2.toList.sortWith(_>_))).collect()
data.foreach(x=>{
println(x._1+" "+x._2.mkString(","))
})
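For comparison, the same grouping can be done without groupByKey, which has to hold every value of a group in memory at once. The sketch below is not part of the original session (it only reuses the same input path); it builds each month's list with aggregateByKey and then sorts keys and value lists descending. In spark-shell it is easiest to enter with :paste.
// Alternative sketch: collect each (year, month) group's values with aggregateByKey
val rows = sc.textFile("file:///data/spark/source.txt")
  .map(_.split(","))
  .map(f => ((f(0), f(1)), f(3).toInt))
val grouped = rows
  .aggregateByKey(List.empty[Int])((acc, v) => v :: acc, _ ++ _)  // per-partition lists, then merge
  .sortByKey(false)                                               // year-month descending
  .map { case ((y, m), vs) => (y + "-" + m, vs.sorted(Ordering[Int].reverse)) }
grouped.collect().foreach { case (k, vs) => println(k + " " + vs.mkString(",")) }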
/* SimpleApp.scala */
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.SparkConf

object SimpleApp {
  def main(args: Array[String]): Unit = {
    val logFile = "file:///data/spark/source.txt"
    val conf = new SparkConf().setAppName("SortData")
    val sc = new SparkContext(conf)
    val hf = sc.textFile(logFile, 2).cache()
    // Key each record by (year, month), keep the value as Int so it sorts numerically,
    // order the keys descending, then sort each group's values descending.
    val data = hf.map(line => line.split(","))
      .map(x => ((x(0), x(1)), x(3).toInt))
      .groupByKey()
      .sortByKey(false)
      .map(x => (x._1._1 + "-" + x._1._2, x._2.toList.sortWith(_ > _)))
      .collect()
    data.foreach(x => println(x._1 + " " + x._2.mkString(",")))
    sc.stop()
  }
}
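An equivalent DataFrame version is sketched below for comparison. It is not part of the original program; the column names (year, month, day, value) and the object name SortDataSQL are assumptions based on the data shown above, and it would additionally need libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.3.1" in simple.sbt.
/* SortDataSQL.scala -- hypothetical DataFrame variant, not the original program */
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

object SortDataSQL {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("SortDataSQL").getOrCreate()
    import spark.implicits._

    // Read the CSV and name the columns; the names are assumed, not taken from the file.
    val df = spark.read.csv("file:///data/spark/source.txt")
      .toDF("year", "month", "day", "value")

    val result = df
      .withColumn("ym", concat_ws("-", $"year", $"month"))
      .groupBy($"ym")
      .agg(sort_array(collect_list($"value".cast("int")), asc = false).as("values"))
      .orderBy(desc("ym"))

    result.collect().foreach { r =>
      println(r.getString(0) + " " + r.getSeq[Int](1).mkString(","))
    }
    spark.stop()
  }
}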
vim simple.sbt
name := "Simple Project"
version := "1.0"
scalaVersion := "2.11.8"
libraryDependencies += "org.apache.spark" %% "spark-core" % "2.3.1"
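sbt has to find SimpleApp.scala when packaging. A layout that works (shown as an assumption about this project, not copied from it) is either the flat one with SimpleApp.scala next to simple.sbt, or the standard one:
sort/
├── simple.sbt
└── src/
    └── main/
        └── scala/
            └── SimpleApp.scala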
Compile and package
sbt package
[info] Loading project definition from /data/spark/demo/sort/project
[info] Loading settings from simple.sbt ...
[info] Set current project to Simple Project (in build file:/data/spark/demo/sort/)
[info] Compiling 1 Scala source to /data/spark/demo/sort/target/scala-2.11/classes ...
[info] Done compiling.
[info] Packaging /data/spark/demo/sort/target/scala-2.11/simple-project_2.11-1.0.jar ...
[info] Done packaging.
[success] Total time: 11 s, completed 2018-7-4 14:00:27
Run
spark-submit --class "SimpleApp" target/scala-2.11/simple-project_2.11-1.0.jar >/data/spark/result
2013-01 90,70,-10
2012-12 70,60,30,10,-20
2000-12 10,-20
2000-11 30,20,-40
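Redirecting stdout works here because collect() brings everything back to the driver before println runs. A variant that writes the result from the job itself is sketched below; it is an assumption rather than the original code, and the output directory name result_dir is made up (saveAsTextFile writes a directory of part files).
// Inside main(), replacing the collect() + println part:
val lines = hf.map(line => line.split(","))
  .map(x => ((x(0), x(1)), x(3).toInt))
  .groupByKey()
  .sortByKey(false)
  .map { case ((y, m), vs) => y + "-" + m + " " + vs.toList.sortWith(_ > _).mkString(",") }
// coalesce(1) without shuffle keeps the sorted order and produces a single part-00000 file
lines.coalesce(1).saveAsTextFile("file:///data/spark/result_dir")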