Grouped sorting means first grouping the data by key, then sorting the values within each group.
1. First, prepare the dataset. The dataset for this example is as follows.
Chinese,90
Math,93
English,84
Computer,89
Chinese,83
English,79
Math,89
Computer,88
Chinese,86
English,82
Math,94
Computer,81
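The code in both versions reads this file from dataset/groupTop.txt, so save the lines above to that path (relative to the working directory) before running.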
2. The code
Java version:
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;
public class GroupTop {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("GroupTop").setMaster("local[*]");
        SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
        JavaRDD<String> input = spark.read().textFile("dataset/groupTop.txt").javaRDD();
        // Parse each "subject,score" line into a (subject, score) pair
        JavaPairRDD<String, Integer> pairs = input.mapToPair(line -> {
            String[] arr = line.split(",");
            return new Tuple2<>(arr[0], Integer.valueOf(arr[1]));
        });
        // Collect all scores under their subject
        JavaPairRDD<String, Iterable<Integer>> group = pairs.groupByKey();
        // Copy each group's scores into a list and sort it in descending order
        JavaPairRDD<String, List<Integer>> top2score = group.mapToPair(tuple -> {
            List<Integer> list = new ArrayList<>();
            Iterator<Integer> it = tuple._2.iterator();
            while (it.hasNext()) {
                list.add(it.next());
            }
            Collections.sort(list, (v1, v2) -> v2.compareTo(v1));
            return new Tuple2<>(tuple._1, list);
        });
        // Collect to the driver and print; foreach on the RDD would print on the executors instead
        top2score.collect().forEach(System.out::println);
        spark.stop();
    }
}
Output:
(Computer,[89, 88, 81])
(Math,[94, 93, 89])
(Chinese,[90, 86, 83])
(English,[84, 82, 79])
Scala version:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
object GroupTopN {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("GroupTopN").setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val lines = spark.read.textFile("dataset/groupTop.txt").rdd
    // Parse each "subject,score" line into a (subject, score) pair
    val scores = lines.map { line =>
      val arr = line.split(",")
      (arr(0), arr(1).toInt)
    }
    // Group by subject, then sort each group's scores in descending order
    val groupSort = scores.groupByKey().map { case (subject, ss) =>
      (subject, ss.toList.sortWith(_ > _))
    }
    groupSort.foreach(println)
    spark.stop()
  }
}
Output:
(Computer,List(89, 88, 81))
(Math,List(94, 93, 89))
(Chinese,List(90, 86, 83))
(English,List(84, 82, 79))
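Despite the names GroupTop and top2score, both versions return every score in each group, merely sorted. Trimming each sorted list to the top 2 is a one-line change; a minimal sketch, reusing the scores RDD from the Scala version (top2 is a name introduced here for illustration):
// Keep only the two highest scores per subject
val top2 = scores.groupByKey().mapValues(ss => ss.toList.sortWith(_ > _).take(2))
top2.foreach(println) // e.g. (Math,List(94, 93))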
Environment:
Spark 2.2.0
JDK 1.8
As you can see, the Scala version is noticeably shorter than the Java one, and clearer as well.
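One design note on groupByKey: it shuffles every value for a key to a single executor, which can become a bottleneck when some keys carry many values. When only the top N per key is needed, aggregateByKey can keep a bounded list on each partition before the shuffle. A sketch under that assumption, with N = 2 and again reusing the scores RDD:
val topN = 2
val topAgg = scores.aggregateByKey(List.empty[Int])(
  // within a partition: add the value, keep only the N largest
  (acc, v) => (v :: acc).sorted(Ordering[Int].reverse).take(topN),
  // across partitions: merge two partial top-N lists
  (a, b) => (a ++ b).sorted(Ordering[Int].reverse).take(topN)
)
topAgg.foreach(println)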