数据算法(TopN):Spark 实现与 Spark takeOrdered() 实现(键不唯一的情况)

package cn.weida.Spark.TopNNonUnique;

import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;

import Util.SparkUtil;
import scala.Tuple2;

/**
 * 假设:对于所有输入(K,V),K是不唯一的
 * 这个类实现了Top N设计模式(N>0)
 * 主要假设为所有输入(K,V) 对,K非唯一
 * 如果发现重复的K,使用reduceByKey()对值进行处理得到唯一Key
 * 先选出本地topN 再选出最终TopN
 * @author acm160920007
 * 
 * 上午11:40:13  2018年8月9日
 *
 */
public class TopNNonUnique {

	public static void main(String[] args) throws Exception {
		if (args.length != 3) {
			System.out.println("Usage:TopN N [top/bottom] ");
			System.exit(1);
		}
		int topN = Integer.parseInt(args[0]);
		String direction = args[1];
		if (!(direction.equals("top")||direction.equals("bottom"))) {
			System.out.println("Usage:TopN N [top/bottom] ");
			System.exit(1);
		}
		
		String inputPath = args[2];
		System.out.println("inputPath : =" + inputPath);
		
		
		JavaSparkContext ctx = SparkUtil.createJavaSparkContext();
		
		Broadcast broadcastTopN = ctx.broadcast(topN);
		Broadcast broadcastDirection = ctx.broadcast(direction);

		// 输入
		JavaRDD lines = ctx.textFile(inputPath, 1);

		//RDD分区
		JavaRDD rdd = lines.coalesce(9);
		// (String) -> (String,Integer) 输入 输出key 输出 value
		JavaPairRDD pairs = rdd.mapToPair(new PairFunction() {
			public Tuple2 call(String s) {
				String[] tokens = s.split(",");
				return new Tuple2(tokens[0], Integer.parseInt(tokens[1]));
			}
		});
		//归约重复键
		JavaPairRDD uniqueKeys = pairs.reduceByKey(new Function2() {
			
			@Override
			public Integer call(Integer arg0, Integer arg1) throws Exception {
				return arg0+arg1;
			}
		});

		//创建一个本地的topN
		JavaRDD> partitions = pairs
				.mapPartitions(new FlatMapFunction>, SortedMap>() {
					@Override
					public Iterator> call(Iterator> iter) {
						SortedMap topN = new TreeMap(); // 等价 setup()
						while (iter.hasNext()) { // 等价map()
							Tuple2 tuple = iter.next();
							topN.put(tuple._2, tuple._1);
							if (topN.size() > broadcastTopN.value()) {
								if (broadcastDirection.toString().equals("top")) {
									topN.remove(topN.firstKey());
								} else if (broadcastDirection.equals("bottom")) {
									topN.remove(topN.lastKey());
								}
 								
							}
						}
						return Collections.singletonList(topN).iterator(); // 等价clearup()
					}
				});
		
		
		//所有本地topN 创建最终TopN
		SortedMap finaltopN = new TreeMap();
		List> alltopN = partitions.collect();
		for (SortedMap localtopN :alltopN) {
			for (Map.Entry entry : localtopN.entrySet()) {
				finaltopN.put(entry.getKey(), entry.getValue());
				if (finaltopN.size()>broadcastTopN.value()) {
						if (broadcastDirection.toString().equals("top")) {
							finaltopN.remove(finaltopN.firstKey());
						} else if (broadcastDirection.equals("bottom")) {
							finaltopN.remove(finaltopN.lastKey());
						}
							
					}
				}
			}
		
		
		for (Map.Entry entry : finaltopN.entrySet()) {
	         System.out.println(entry.getKey() + "--" + entry.getValue());
	      }
		
		System.exit(0);
	}
	
	 
}

使用 takeOrdered() 和自定义比较器(Comparator)排序

package cn.weida.Spark.TopNUsingTakeOrderd;

import java.io.Serializable;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;

import Util.SparkUtil;
import scala.Tuple2;

/**
 * 调用RDD.takeOrdered(int n,java.util.Comparator comp)
 * 自定义比较类,实现Comparator接口
 * @author acm160920007
 * 
 * 下午1:16:34  2018年8月9日
 *
 */
public class TopNUsingTakeOrdered implements Serializable{
	
	//自定义比较器类
	static class MyTupleComparator implements Comparator>,Serializable{
		final static MyTupleComparator INSTANCE = new MyTupleComparator();
		@Override
		public int compare(Tuple2 o1, Tuple2 o2) {
			return -o1._2.compareTo(o2._2);   //返回TopN
			//return o1._2.compareTo(o2._2);   //返回bottom N
		}
		
	}

	public static void main(String[] args) throws Exception {
		if (args.length != 3) {
			System.out.println("Usage:TopN N [top/bottom] ");
			System.exit(1);
		}
		int topN = Integer.parseInt(args[0]);
		String direction = args[1];
		if (!(direction.equals("top")||direction.equals("bottom"))) {
			System.out.println("Usage:TopN N [top/bottom] ");
			System.exit(1);
		}
		
		String inputPath = args[2];
		System.out.println("inputPath : =" + inputPath);
		
		
		JavaSparkContext ctx = SparkUtil.createJavaSparkContext();
		
		Broadcast broadcastTopN = ctx.broadcast(topN);
		Broadcast broadcastDirection = ctx.broadcast(direction);

		// 输入
		JavaRDD lines = ctx.textFile(inputPath, 1);

		//RDD分区
		JavaRDD rdd = lines.coalesce(9);
		// (String) -> (String,Integer) 输入 输出key 输出 value
		JavaPairRDD pairs = rdd.mapToPair(new PairFunction() {
			public Tuple2 call(String s) {
				String[] tokens = s.split(",");
				return new Tuple2(tokens[0], Integer.parseInt(tokens[1]));
			}
		});
		//归约重复键
		JavaPairRDD uniqueKeys = pairs.reduceByKey(new Function2() {
			
			@Override
			public Integer call(Integer arg0, Integer arg1) throws Exception {
				return arg0+arg1;
			}
		});

		//创建一个本地的topN
		JavaRDD> partitions = pairs
				.mapPartitions(new FlatMapFunction>, SortedMap>() {
					@Override
					public Iterator> call(Iterator> iter) {
						SortedMap topN = new TreeMap(); // 等价 setup()
						while (iter.hasNext()) { // 等价map()
							Tuple2 tuple = iter.next();
							topN.put(tuple._2, tuple._1);
							if (topN.size() > broadcastTopN.value()) {
								if (broadcastDirection.toString().equals("top")) {
									topN.remove(topN.firstKey());
								} else if (broadcastDirection.equals("bottom")) {
									topN.remove(topN.lastKey());
								}
 								
							}
						}
						return Collections.singletonList(topN).iterator(); // 等价clearup()
					}
				});
		
		
		List> topNResult = uniqueKeys.takeOrdered(broadcastTopN.value(), MyTupleComparator.INSTANCE);
		
		
		for (Tuple2 entry : topNResult) {
	         System.out.println(entry._2+ "--" + entry._1);
	      }
		
		System.exit(0);
	}
}

 

你可能感兴趣的:(数据算法)