Spark: sort and add row numbers, with an optional initial offset

Use Spark's sortBy to sort the data and zipWithIndex to attach row numbers. This pattern is commonly used for counting, adding an index column, or working out the rank of a value (e.g. which value is the Nth largest).

Case 1: Global sort, outputting the original value and its row-number index

Start spark-shell on YARN:
$ spark-shell --master yarn --executor-memory 4G
scala> val data = Array(1, 10,12,39,23456,8,2, 3,50,87, 4,1,7,3,10000002, 5);
data: Array[Int] = Array(1, 10, 12, 39, 23456, 8, 2, 3, 50, 87, 4, 1, 7, 3, 10000002, 5)

scala> var rdd = sc.parallelize(data,5);// turn the data array into an RDD with 5 partitions
rdd: org.apache.spark.rdd.RDD[Int] = ParallelCollectionRDD[57] at parallelize at <console>:26

scala> var sortrdd = rdd.sortBy(x=>x);// sort by value
sortrdd: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[62] at sortBy at <console>:28

scala> var seqrdd = sortrdd.zipWith
zipWithIndex   zipWithUniqueId

scala> var seqrdd = sortrdd.zipWithIndex();// add row numbers (0-based)
seqrdd: org.apache.spark.rdd.RDD[(Int, Long)] = ZippedWithIndexRDD[63] at zipWithIndex at <console>:30

scala> seqrdd.collect();
res25: Array[(Int, Long)] = Array((1,0), (1,1), (2,2), (3,3), (3,4), (4,5), (5,6), (7,7), (8,8), (10,9), (12,10), (39,11), (50,12), (87,13), (23456,14), (10000002,15))
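The tab completion above also lists zipWithUniqueId, which this post does not use. As a rough comparison (a sketch, not part of the original run): zipWithIndex assigns consecutive global indices but triggers an extra Spark job to count the elements per partition, while zipWithUniqueId assigns unique but possibly non-consecutive ids (items in the k-th of n partitions get k, n+k, 2n+k, ...) without that extra job, so it is not a drop-in replacement when true row numbers are needed.

var uniqrdd = sortrdd.zipWithUniqueId();// ids are unique but may have gaps
uniqrdd.collect();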

Case 2: Output the sorted value and a row number that starts from a specified initial value (offset)

# Building on Case 1, we only need to add a map operation:
~~scala> var seqrdd = sortrdd.zipWithIndex();// add row numbers~~ 
scala> seqrdd = sortrdd.zipWithIndex().map{case(x,y)=>(x,y+10)};// x is the original value, y is the row number; 10 is our initial row number (the default start is 0)
seqrdd: org.apache.spark.rdd.RDD[(Int, Long)] = MapPartitionsRDD[65] at map at <console>:32

scala> seqrdd.collect();
res26: Array[(Int, Long)] = Array((1,10), (1,11), (2,12), (3,13), (3,14), (4,15), (5,16), (7,17), (8,18), (10,19), (12,20), (39,21), (50,22), (87,23), (23456,24), (10000002,25))
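The introduction mentioned using this pattern to work out which rank a value holds (e.g. the Nth largest). A minimal sketch building on the same rdd; the descending sort and the variable names here are illustrative additions, not from the original post:

var rankrdd = rdd.sortBy(x => x, ascending = false).zipWithIndex().map{case(v,i)=>(v,i+1)};// 1-based rank, the largest value gets rank 1
rankrdd.filter{case(v,_)=>v==87}.collect();// look up the rank of the value 87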

Case 3: Output the sorted value and the row number, plus an extra row-number column with an initial offset

# Building on Case 2, we only need to make the map output one more column:
~~scala> var seqrdd = sortrdd.zipWithIndex();// add row numbers~~ 
~~scala> seqrdd = sortrdd.zipWithIndex().map{case(x,y)=>(x,y+10)};~~ 
scala> var seqrdd2 = sortrdd.zipWithIndex().map{case(x,y)=>(x,y,y+10)};// x is the original value, y is the row number, y+10 is the row number with an initial value of 10 (the default start is 0)
seqrdd2: org.apache.spark.rdd.RDD[(Int, Long, Long)] = MapPartitionsRDD[67] at map at <console>:30

scala> seqrdd2.collect();
res27: Array[(Int, Long, Long)] = Array((1,0,10), (1,1,11), (2,2,12), (3,3,13), (3,4,14), (4,5,15), (5,6,16), (7,7,17), (8,8,18), (10,9,19), (12,10,20), (39,11,21), (50,12,22), (87,13,23), (23456,14,24), (10000002,15,25))
Strip the parentheses from the output and save the result to HDFS. Note that the map result must be assigned (or the calls chained) before saving; calling saveAsTextFile on seqrdd directly would still write the tuples with parentheses:
scala> val strrdd = seqrdd.map(line => line._2 + "," + line._1);// format each record as "rowNumber,value"
strrdd: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[9] at map at <console>:31

scala> strrdd.saveAsTextFile("/user/prod_kylin/tmp/sparktest/kylin_intermediate_dm_pub_passenger_objec_o_spark_nglobbal_fac4f04e_60a0_3e3a_bf3a_488766f91446__group_by/");
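To verify the saved format, you can read a few lines back from the same path (a sketch; given the Case 2 data, each line should look like "10,1", "11,1", "12,2", ...):

sc.textFile("/user/prod_kylin/tmp/sparktest/kylin_intermediate_dm_pub_passenger_objec_o_spark_nglobbal_fac4f04e_60a0_3e3a_bf3a_488766f91446__group_by/").take(5);// preview a few saved lines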
