package cn.weida.Spark.TopNNonUnique;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import Util.SparkUtil;
import scala.Tuple2;
/**
 * Assumption: for all input (K,V) pairs, the keys K are NOT unique.
 * This class implements the Top N design pattern (for N > 0).
 * When duplicate keys occur, reduceByKey() first combines their values so that
 * every key becomes unique; a local top N is then selected per partition, and
 * the final top N is chosen from those local results.
 *
 * @author acm160920007
 *
 * 11:40:13 AM, August 9, 2018
 */
public class TopNNonUnique {
	public static void main(String[] args) throws Exception {
		if (args.length != 3) {
			System.out.println("Usage: TopNNonUnique <N> <top/bottom> <inputPath>");
			System.exit(1);
		}
		int topN = Integer.parseInt(args[0]);
		String direction = args[1];
		if (!(direction.equals("top") || direction.equals("bottom"))) {
			System.out.println("Usage: TopNNonUnique <N> <top/bottom> <inputPath>");
			System.exit(1);
		}
		String inputPath = args[2];
		System.out.println("inputPath = " + inputPath);
		JavaSparkContext ctx = SparkUtil.createJavaSparkContext();
		Broadcast<Integer> broadcastTopN = ctx.broadcast(topN);
		Broadcast<String> broadcastDirection = ctx.broadcast(direction);
		// input
		JavaRDD<String> lines = ctx.textFile(inputPath, 1);
		// repartition the RDD; note that coalesce() only ever reduces the
		// partition count, so it cannot grow a single-partition input to 9
		JavaRDD<String> rdd = lines.coalesce(9);
		// PairFunction<input, output key, output value>: String -> (String, Integer)
		JavaPairRDD<String, Integer> pairs = rdd.mapToPair(new PairFunction<String, String, Integer>() {
			public Tuple2<String, Integer> call(String s) {
				String[] tokens = s.split(",");
				return new Tuple2<String, Integer>(tokens[0], Integer.parseInt(tokens[1]));
			}
		});
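		// For example, an input line "A,5" is mapped to the pair ("A", 5).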
		// combine the values of duplicate keys so that every key becomes unique
		JavaPairRDD<String, Integer> uniqueKeys = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
			@Override
			public Integer call(Integer arg0, Integer arg1) throws Exception {
				return arg0 + arg1;
			}
		});
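		// Example: the duplicate pairs ("A",2) and ("A",3) are combined into ("A",5),
		// so each key now appears exactly once in uniqueKeys.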
		// create a local top N for each partition (from uniqueKeys, not pairs,
		// so that the combined counts are ranked)
		JavaRDD<SortedMap<Integer, String>> partitions = uniqueKeys
				.mapPartitions(new FlatMapFunction<Iterator<Tuple2<String, Integer>>, SortedMap<Integer, String>>() {
					@Override
					public Iterator<SortedMap<Integer, String>> call(Iterator<Tuple2<String, Integer>> iter) {
						SortedMap<Integer, String> topN = new TreeMap<Integer, String>(); // equivalent to setup()
						while (iter.hasNext()) { // equivalent to map()
							Tuple2<String, Integer> tuple = iter.next();
							// note: the TreeMap is keyed by count, so two keys with the
							// same count collide and the earlier one is overwritten
							topN.put(tuple._2, tuple._1);
							if (topN.size() > broadcastTopN.value()) {
								if (broadcastDirection.value().equals("top")) {
									topN.remove(topN.firstKey()); // evict the smallest count
								} else if (broadcastDirection.value().equals("bottom")) {
									topN.remove(topN.lastKey()); // evict the largest count
								}
							}
						}
						return Collections.singletonList(topN).iterator(); // equivalent to cleanup()
					}
				});
		// merge all local top N maps into the final top N
		SortedMap<Integer, String> finaltopN = new TreeMap<Integer, String>();
		List<SortedMap<Integer, String>> alltopN = partitions.collect();
		for (SortedMap<Integer, String> localtopN : alltopN) {
			for (Map.Entry<Integer, String> entry : localtopN.entrySet()) {
				finaltopN.put(entry.getKey(), entry.getValue());
				if (finaltopN.size() > broadcastTopN.value()) {
					if (broadcastDirection.value().equals("top")) {
						finaltopN.remove(finaltopN.firstKey());
					} else if (broadcastDirection.value().equals("bottom")) {
						finaltopN.remove(finaltopN.lastKey());
					}
				}
			}
		}
		// print the final result (the TreeMap iterates in ascending count order)
		for (Map.Entry<Integer, String> entry : finaltopN.entrySet()) {
			System.out.println(entry.getKey() + "--" + entry.getValue());
		}
		System.exit(0);
	}
}
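Both examples rely on a SparkUtil.createJavaSparkContext() helper that is not shown in this post. A minimal sketch of such a helper, assuming a standard SparkConf setup (the app name "TopN" below is illustrative; master and app name can also be supplied via spark-submit), might look like this:

package Util;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class SparkUtil {
	// Build a JavaSparkContext from the default SparkConf.
	public static JavaSparkContext createJavaSparkContext() {
		SparkConf conf = new SparkConf().setAppName("TopN");
		return new JavaSparkContext(conf);
	}
}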
Using a custom comparator
package cn.weida.Spark.TopNUsingTakeOrderd;
import java.io.Serializable;
import java.util.Comparator;
import java.util.List;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import Util.SparkUtil;
import scala.Tuple2;
/**
 * Calls RDD.takeOrdered(int n, java.util.Comparator comp) with a custom
 * comparator class that implements the Comparator interface.
 *
 * @author acm160920007
 *
 * 1:16:34 PM, August 9, 2018
 */
public class TopNUsingTakeOrdered implements Serializable {
	// custom comparator class
	static class MyTupleComparator implements Comparator<Tuple2<String, Integer>>, Serializable {
		final static MyTupleComparator INSTANCE = new MyTupleComparator();
		@Override
		public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
			return -o1._2.compareTo(o2._2); // descending: returns the top N
			//return o1._2.compareTo(o2._2); // ascending: returns the bottom N
		}
	}
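	// Example: under this comparator larger counts sort first, so
	// takeOrdered(n, INSTANCE) returns the n pairs with the highest counts.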
	public static void main(String[] args) throws Exception {
		if (args.length != 3) {
			System.out.println("Usage: TopNUsingTakeOrdered <N> <top/bottom> <inputPath>");
			System.exit(1);
		}
		int topN = Integer.parseInt(args[0]);
		String direction = args[1];
		if (!(direction.equals("top") || direction.equals("bottom"))) {
			System.out.println("Usage: TopNUsingTakeOrdered <N> <top/bottom> <inputPath>");
			System.exit(1);
		}
		String inputPath = args[2];
		System.out.println("inputPath = " + inputPath);
		JavaSparkContext ctx = SparkUtil.createJavaSparkContext();
		Broadcast<Integer> broadcastTopN = ctx.broadcast(topN);
		Broadcast<String> broadcastDirection = ctx.broadcast(direction);
		// input
		JavaRDD<String> lines = ctx.textFile(inputPath, 1);
		// repartition the RDD; note that coalesce() only ever reduces the
		// partition count, so it cannot grow a single-partition input to 9
		JavaRDD<String> rdd = lines.coalesce(9);
		// PairFunction<input, output key, output value>: String -> (String, Integer)
		JavaPairRDD<String, Integer> pairs = rdd.mapToPair(new PairFunction<String, String, Integer>() {
			public Tuple2<String, Integer> call(String s) {
				String[] tokens = s.split(",");
				return new Tuple2<String, Integer>(tokens[0], Integer.parseInt(tokens[1]));
			}
		});
		// combine the values of duplicate keys so that every key becomes unique
		JavaPairRDD<String, Integer> uniqueKeys = pairs.reduceByKey(new Function2<Integer, Integer, Integer>() {
			@Override
			public Integer call(Integer arg0, Integer arg1) throws Exception {
				return arg0 + arg1;
			}
		});
		// takeOrdered returns the first N elements according to MyTupleComparator
		List<Tuple2<String, Integer>> topNResult = uniqueKeys.takeOrdered(broadcastTopN.value(), MyTupleComparator.INSTANCE);
		for (Tuple2<String, Integer> entry : topNResult) {
			System.out.println(entry._2 + "--" + entry._1);
		}
		System.exit(0);
	}
}
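As a quick sanity check (illustrative sample data, not from the original post), suppose the input file contains:

A,2
A,3
B,6
C,4

Running either program with the arguments 2 top <inputPath>, reduceByKey() first produces A=5, B=6, C=4, and the top two totals are B (6) and A (5). TopNNonUnique prints them in ascending TreeMap order (5--A, then 6--B), while TopNUsingTakeOrdered prints them in descending comparator order (6--B, then 5--A).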