package cn.spark.study.core;
import java.io.Serializable;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
import scala.Tuple2;
import org.apache.spark.api.java.function.*;
//import org.dataalgorithms.util.SparkUtil;
/**
 * Spark driver program that prints a Top-N list built from comma-separated input lines.
 *
 * Visible flow of main: read the input path and N from the command line, turn each
 * line into a pair by splitting on a comma, add up the Integer values per key, keep a
 * size-bounded sorted map per partition, merge those maps on the driver into one
 * size-bounded sorted map, and print its entries.
 *
 * NOTE(review): many statements in this listing end abruptly (the two println calls
 * after the argument check and most declarations that mention a generic type), so the
 * class does not compile as shown. The missing text has to be restored from the
 * original source; the comments below describe only what the surviving code shows.
 */
public class TopN implements Serializable{
private static final long serialVersionUID = 1L;
/**
 * Program entry point.
 *
 * @param args args[0] is used as the input path; args[1] is parsed as the integer N.
 *             When fewer than two arguments are given, an error message is written
 *             to standard error and the JVM exits with status 1.
 */
public static void main(String[] args) {
// NOTE(review): the Spark context is created before the arguments are validated.
SparkConf conf = new SparkConf().setAppName("TopN");
JavaSparkContext sc = new JavaSparkContext(conf);
// Require at least two command-line arguments.
if(args.length < 2){
System.err.println("你没有输入参数");
System.exit(1);;
}
System.out.println("args[0]:
System.out.println("args[1]:
final String inputPath = args[0];
final int N = Integer.parseInt(args[1]);
// Broadcast topN to all nodes
// NOTE(review): the declarations that follow are truncated; topN is read back
// further down through topN.value().
final Broadcast
JavaRDD
//lines.saveAsTextFile("output/1");
// RDD partitioning: each executor uses (2 * num_executors * cores_per_executor) partitions
JavaRDD
// Convert to (key, value) form
JavaPairRDD
private static final long serialVersionUID = 1L;
public Tuple2
// Split the line on a comma; the constructor arguments of the returned tuple are
// not visible in this listing.
String[] tokens = s.split(",");
return new Tuple2
}
});
// Aggregate values that share the same key
JavaPairRDD
private static final long serialVersionUID = 1L;
// Combine two values for the same key by adding them.
public Integer call(Integer i1,Integer i2){
return i1 + i2;
}
});
// Local top N
JavaRDD
new FlatMapFunction
private static final long serialVersionUID = 1L;
@Override
public Iterable
throws Exception {
// Read N back from the broadcast variable.
final int N = topN.value();
SortedMap
while(iter.hasNext()){
Tuple2
// The map is keyed by tuple._2 with tuple._1 as the value.
// NOTE(review): two tuples with an equal _2 share one map entry, so the later put
// replaces the earlier value and one of them is lost — confirm this is intended.
localTop.put(tuple._2,tuple._1);
// Keep at most N entries by dropping the entry with the first (lowest) key.
// Presumably the map uses natural Integer ordering; its construction is not
// visible here — TODO confirm.
if(localTop.size()>N){
localTop.remove(localTop.firstKey());
}
}
// One sorted map is emitted per invocation.
return Collections.singletonList(localTop);
}
});
// Find the final top N
SortedMap
List
for(SortedMap
for(Map.Entry
// Same bounded-insert step as the local pass: insert, then trim to N entries.
// NOTE(review): entries with equal keys coming from different maps also overwrite
// each other here.
finalTopN.put(entry.getKey(), entry.getValue());
if(finalTopN.size()>N){
finalTopN.remove(finalTopN.firstKey());
}
}
}
// Print the result
System.out.println("---------Top-N List(java类实现)----------");
System.out.println("-----------------------------");
for(Map.Entry
// The map value is printed under the key label and the map key under the value
// label, which undoes the swap made when the maps were filled.
System.out.println("key = " + ent.getValue() + " value = " + ent.getKey());
}
//List
}
/**
 * Comparator over tuples that orders by the _2 field in descending order: it negates
 * the result of comparing t1._2 with t2._2.
 *
 * NOTE(review): no non-comment line visible in this class references this comparator
 * or its INSTANCE field.
 */
static class MyTupleComparator implements Comparator
private static final long serialVersionUID = 1L;
// Shared instance.
final static MyTupleComparator INSTANCE = new MyTupleComparator();
public int compare(Tuple2
return -t1._2.compareTo(t2._2);
}
}
}
脚本
# Submit the TopN job with spark-submit.
# INPUT is passed to the job as its first argument (the input path) and
# TOPN as its second argument (parsed by the job as an integer).
INPUT=hdfs://spark01:9000/TopN
TOPN=4
# Both arguments are double-quoted so that a value containing spaces or glob
# characters reaches the job as a single, unmodified argument.
/usr/local/spark1.5/bin/spark-submit \
--class cn.spark.study.core.TopN \
--num-executors 3 \
--driver-memory 100m \
--executor-memory 100m \
--executor-cores 3 \
/usr/local/spark-text/java/TopN/spark-TopN.jar "$INPUT" "$TOPN"
file1.txt
A,2
B,2
C,3
D,2
E,1
G,2
file2.txt
A,1
B,1
C,3
F,1
E,1
G,2
file3.txt
A,2
B,2
C,1
D,2
F,1
E,1
G,2