分组取 Top N(按 key 分组后取每组最大的 N 个值)在业务场景中应用非常广泛。
案例数据格式为:每行一条记录,由分组 key 和整数分数组成,两者之间用一个空格分隔(例如 `a 86`)。
方式一:全排序
全排序的缺点在于:即使最终只需要前几名,每个分组的全部数据也都要先转成 list 并完整排序一遍。
package com.shsxt.java;
import org.apache.commons.collections.IteratorUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
/**
 * Group-wise top-N, approach 1: full sort.
 *
 * <p>Reads lines of the form {@code "<key> <score>"} (single space separator)
 * from {@code data/scores.txt}, groups the scores by key, sorts every group
 * completely and prints the {@link #TOP_N} highest scores of each group as
 * {@code key:score}, highest first.
 */
public class TopN {
    /** Number of highest scores printed per key. */
    private static final int TOP_N = 3;

    /**
     * Runs the job on a local Spark master.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // 1. Build the SparkContext entry point.
        SparkConf conf = new SparkConf();
        conf.setAppName("SecondSort").setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            // 2. Read the input data.
            JavaRDD<String> lineRDD = jsc.textFile("data/scores.txt");

            // 3. Turn every line into a (key, score) pair.
            JavaPairRDD<String, Integer> pairRDD = lineRDD.mapToPair(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) throws Exception {
                    String[] split = s.split(" ");
                    return new Tuple2<>(split[0], Integer.parseInt(split[1]));
                }
            });

            // 4. groupByKey: collect all scores of one key into an Iterable.
            JavaPairRDD<String, Iterable<Integer>> groupByKey = pairRDD.groupByKey();

            // 5. Sort each group and print its top N.
            groupByKey.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
                @Override
                public void call(Tuple2<String, Iterable<Integer>> stringIterableTuple2) throws Exception {
                    String key = stringIterableTuple2._1;
                    Iterator<Integer> iterator = stringIterableTuple2._2.iterator();

                    // IteratorUtils.toList returns a raw List; the elements come from an
                    // Iterator<Integer>, so the conversion to List<Integer> is safe.
                    @SuppressWarnings("unchecked")
                    List<Integer> list = IteratorUtils.toList(iterator);

                    // Full sort, ascending: the largest values end up at the tail.
                    Collections.sort(list);

                    // Walk backwards from the tail to print the top N, highest first.
                    // Groups with fewer than TOP_N scores print only what they have.
                    for (int i = 0; i < Math.min(TOP_N, list.size()); i++) {
                        System.out.println(key + ":" + list.get(list.size() - 1 - i));
                    }
                }
            });
        } finally {
            // Stop the context even when the job fails.
            jsc.stop();
        }
    }
}
e:98
e:47
e:46
d:84
d:41
d:23
a:86
a:84
a:58
b:98
m:48
b:78
b:51
c:89
c:73
c:47
f:48
f:32
f:18
方式二:定长数组
根据业务需求在内存中开辟一段固定长度的空间:取 top3 就定义一个长度为 3 的数组,遍历分组数据时只在该数组中维护当前最大的 3 个值。与方式一相比,无需把分组数据全部转成 list 再排序,会节省内存空间。
package com.shsxt.java;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;
import java.util.Iterator;
/**
 * Group-wise top-N, approach 2: fixed-size buffer.
 *
 * <p>Reads lines of the form {@code "<key> <score>"} (single space separator)
 * from {@code data/scores.txt}, groups the scores by key and, while iterating
 * over each group once, keeps only the {@link #TOP_N} highest scores in a
 * fixed-length array kept in descending order. Each slot is then printed as
 * {@code key:score}.
 */
public class TopN {
    /** Number of highest scores kept and printed per key. */
    private static final int TOP_N = 3;

    /**
     * Runs the job on a local Spark master.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // 1. Build the SparkContext entry point.
        SparkConf conf = new SparkConf();
        conf.setAppName("SecondSort").setMaster("local[*]");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        try {
            // 2. Read the input data.
            JavaRDD<String> lineRDD = jsc.textFile("data/scores.txt");

            // 3. Turn every line into a (key, score) pair.
            JavaPairRDD<String, Integer> pairRDD = lineRDD.mapToPair(new PairFunction<String, String, Integer>() {
                @Override
                public Tuple2<String, Integer> call(String s) throws Exception {
                    String[] split = s.split(" ");
                    return new Tuple2<>(split[0], Integer.parseInt(split[1]));
                }
            });

            // 4. groupByKey: collect all scores of one key into an Iterable.
            JavaPairRDD<String, Iterable<Integer>> groupByKey = pairRDD.groupByKey();

            // 5. Hand-rolled insertion into a fixed-size buffer to get the top N.
            groupByKey.foreach(new VoidFunction<Tuple2<String, Iterable<Integer>>>() {
                @Override
                public void call(Tuple2<String, Iterable<Integer>> tuple2) throws Exception {
                    String key = tuple2._1;
                    Iterator<Integer> iterator = tuple2._2.iterator();

                    // Fixed-length buffer sized by the business requirement;
                    // filled slots are kept in descending order, empty slots are null.
                    Integer[] top3 = new Integer[TOP_N];

                    // Compare each score against the buffer while iterating.
                    while (iterator.hasNext()) {
                        Integer score = iterator.next();
                        for (int i = 0; i < top3.length; i++) {
                            if (top3[i] == null) {
                                // First empty slot: everything before it is >= score.
                                top3[i] = score;
                                break;
                            } else if (score > top3[i]) {
                                // Shift slots i..length-2 one position to the right
                                // (dropping the last one) to make room at index i.
                                System.arraycopy(top3, i, top3, i + 1, top3.length - 1 - i);
                                top3[i] = score;
                                break;
                            }
                        }
                    }

                    // Print every slot. Slots never filled (groups with fewer than
                    // TOP_N scores) are still null and print as "key:null".
                    for (Integer score : top3) {
                        System.out.println(key + ":" + score);
                    }
                }
            });
        } finally {
            // Stop the context even when the job fails.
            jsc.stop();
        }
    }
}
运算结果为:
d:84
d:41
d:23
b:98
b:78
b:51
e:98
e:47
e:46
a:86
a:84
a:58
m:48
m:null
m:null
c:89
c:73
c:47
f:48
f:32
f:18