MapReduce实现
Driver类
package cn.weida.MapReduce.ToN;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.log4j.Logger;
import Util.HadoopUtil;
/**
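* A simple Top-N job: selects the top ten by default and does not handle duplicate keys.
* A TreeMap is used so that the candidates stay sorted automatically.
* Each map task first computes its local Top-N, then sends it to the (single) reducer, which selects the final Top-N.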
* @author acm160920007
*
* 下午1:40:22 2018年8月8日
*
*/
public class TopNDriver extends Configured implements Tool {
private static Logger THE_LOGGER = Logger.getLogger(TopNDriver.class);
/**
 * Configures and submits the Top-N MapReduce job and blocks until it finishes.
 *
 * @param arg0 three arguments, in order: N (how many records to keep), the input path
 *             and the output path
 * @return 0 when the job succeeds, 1 when the job fails, 2 when the arguments are unusable
 * @throws Exception if configuring or submitting the job fails
 */
@Override
public int run(String[] arg0) throws Exception {
    // Validate the arguments before touching the cluster so that a bad command line
    // produces a readable message instead of an index or parse exception.
    if (arg0 == null || arg0.length < 3) {
        THE_LOGGER.error("run(): expected 3 arguments: N, input path, output path");
        return 2;
    }
    final int N;
    try {
        N = Integer.parseInt(arg0[0].trim());
    } catch (NumberFormatException e) {
        THE_LOGGER.error("run(): N is not an integer: " + arg0[0]);
        return 2;
    }

    // Job.getInstance replaces the deprecated Job(Configuration) constructor.
    Job job = Job.getInstance(getConf(), "Top N");
    HadoopUtil.addJarsToDistributedCache(job, "/lib/");
    // Mapper and reducer read this value back in their setup() methods.
    job.getConfiguration().setInt("N", N);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // Map output (key, value) types.
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);

    // Final job output (key, value) types.
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);

    job.setJarByClass(TopNDriver.class);
    job.setMapperClass(TopN_Mapper.class);
    job.setReducerClass(TopN_Reducer.class);
    // A single reduce task, so one reducer sees every local Top-N list.
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(arg0[1]));
    FileOutputFormat.setOutputPath(job, new Path(arg0[2]));

    boolean status = job.waitForCompletion(true);
    THE_LOGGER.info("run(): status=" + status);
    return status ? 0 : 1;
}
public static void main(String[] args) throws Exception {
if (args.length != 3) {
THE_LOGGER.warn("usage TopNDriver
Mapper类
package cn.weida.MapReduce.ToN;
import java.io.IOException;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Mapper that keeps a local Top-N of the records seen by this map task and emits
 * them only once, from {@link #cleanup}, under a single {@link NullWritable} key.
 *
 * <p>Each input value is parsed as a comma-separated record whose second field is the
 * integer used for ranking (for example {@code url,frequency}). The record is emitted
 * unchanged. Records that share the same ranking value overwrite one another, because
 * the local map is keyed by that value.
 */
public class TopN_Mapper extends Mapper<LongWritable, Text, NullWritable, Text> {
    // Local Top-N candidates, sorted ascending by ranking value.
    private final SortedMap<Integer, String> localTopN = new TreeMap<Integer, String>();
    // Number of records to keep; defaults to 10 when "N" is not configured.
    private int N = 10;

    /**
     * Reads the configured N (key {@code "N"}, default 10).
     */
    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        this.N = conf.getInt("N", 10);
    }

    /**
     * Adds one record to the local Top-N and evicts the smallest entry when the
     * list grows beyond N.
     *
     * @throws NumberFormatException if the ranking field is not an integer
     * @throws ArrayIndexOutOfBoundsException if the record has fewer than two fields
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String record = value.toString();
        String[] tokens = record.trim().split(",");
        // The ranking value is the second field.
        int frequency = Integer.parseInt(tokens[1].trim());
        localTopN.put(frequency, record);
        if (localTopN.size() > N) {
            // Drop the smallest key to keep the Top N
            // (removing lastKey() instead would keep the Bottom N).
            localTopN.remove(localTopN.firstKey());
        }
    }

    /**
     * Emits the surviving local Top-N records after all input has been mapped.
     */
    @Override
    protected void cleanup(Context context)
            throws IOException, InterruptedException {
        for (String record : localTopN.values()) {
            context.write(NullWritable.get(), new Text(record));
        }
    }
}
Reducer类
package cn.weida.MapReduce.ToN;
import static org.mockito.Matchers.contains;
import static org.mockito.Mockito.inOrder;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * Reducer that merges the records it receives into the final Top-N.
 *
 * <p>Each incoming value is parsed as {@code url,frequency}. The output is written as
 * (frequency, url) pairs in descending frequency order. Records with the same
 * frequency overwrite one another, because the ranking map is keyed by frequency.
 */
public class TopN_Reducer extends Reducer<NullWritable, Text, IntWritable, Text> {
    // Number of records to keep; defaults to 10 when "N" is not configured.
    private int N = 10;
    // Final Top-N candidates: frequency -> url, sorted ascending by frequency.
    private final TreeMap<Integer, String> finalTopN = new TreeMap<Integer, String>();

    /**
     * Reads the configured N (key {@code "N"}, default 10).
     */
    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        Configuration conf = context.getConfiguration();
        this.N = conf.getInt("N", 10);
    }

    /**
     * Folds every received record into the Top-N map, then writes the survivors
     * from the highest frequency to the lowest.
     *
     * @throws NumberFormatException if the frequency field is not an integer
     * @throws ArrayIndexOutOfBoundsException if a record has fewer than two fields
     */
    @Override
    protected void reduce(NullWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text catRecord : values) {
            String[] tokens = catRecord.toString().trim().split(",");
            String url = tokens[0];
            int frequency = Integer.parseInt(tokens[1].trim());
            finalTopN.put(frequency, url);
            if (finalTopN.size() > N) {
                // Drop the smallest key to keep the Top N
                // (removing lastKey() instead would keep the Bottom N).
                finalTopN.remove(finalTopN.firstKey());
            }
        }
        // Emit in descending order without copying the key set into a list.
        for (Integer frequency : finalTopN.descendingKeySet()) {
            context.write(new IntWritable(frequency), new Text(finalTopN.get(frequency)));
        }
    }
}
Spark实现
package cn.weida.Spark.TopN;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.commons.httpclient.CircularRedirectException;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.broadcast.Broadcast;
import org.stringtemplate.v4.compiler.STParser.element_return;
import com.codahale.metrics.Timer.Context;
import scala.Tuple2;
/**
* 对于输入的键唯一的情况 也就是不会考虑 (A,5) 和 (A,10)同时出现的情况
*
* 要求输入N大于0
*
* Top-10 Design Pattern: “Top Ten” Structure
*
* class mapper :
* setup(): 创建本地topN
* map(key, record ):
* Insert record into top ten sorted list if length of array
* is greater than 10.
* Truncate list to a length of 10.
* cleanup() : 输出
*
* class reducer:
* setup(): initialize top ten sorted list
* reduce(key, records): sort records
* truncate records to top 10
*
* @author acm160920007
*
* 上午10:25:43 2018年8月9日
**/
public class TopN {
public static void main(String[] args) {
if (args.length != 3) {
System.out.println("Usage:TopN N [top/bottom] ");
System.exit(1);
}
int topN = Integer.parseInt(args[0]);
String direction = args[1];
if (!(direction.equals("top")||direction.equals("bottom"))) {
System.out.println("Usage:TopN N [top/bottom] ");
System.exit(1);
}
String inputPath = args[2];
System.out.println("inputPath : =" + inputPath);
JavaSparkContext ctx = new JavaSparkContext();
Broadcast broadcastTopN = ctx.broadcast(topN);
Broadcast broadcastDirection = ctx.broadcast(direction);
// 输入
JavaRDD line = ctx.textFile(inputPath, 1);
// (String) -> (String,Integer) 输入 输出key 输出 value
JavaPairRDD pairs = line.mapToPair(new PairFunction() {
public Tuple2 call(String s) {
String[] tokens = s.split(",");
return new Tuple2(tokens[0], Integer.parseInt(tokens[1]));
}
});
//创建一个本地的topN
JavaRDD> partitions = pairs
.mapPartitions(new FlatMapFunction>, SortedMap>() {
@Override
public Iterator> call(Iterator> iter) {
SortedMap topN = new TreeMap(); // 等价 setup()
while (iter.hasNext()) { // 等价map()
Tuple2 tuple = iter.next();
topN.put(tuple._2, tuple._1);
if (topN.size() > broadcastTopN.value()) {
if (broadcastDirection.toString().equals("top")) {
topN.remove(topN.firstKey());
} else if (broadcastDirection.equals("bottom")) {
topN.remove(topN.lastKey());
}
}
}
return Collections.singletonList(topN).iterator(); // 等价clearup()
}
});
//所有本地topN 创建最终TopN
SortedMap finaltopN = new TreeMap();
List> alltopN = partitions.collect();
for (SortedMap localtopN :alltopN) {
for (Map.Entry entry : localtopN.entrySet()) {
finaltopN.put(entry.getKey(), entry.getValue());
if (finaltopN.size()>broadcastTopN.value()) {
if (broadcastDirection.toString().equals("top")) {
finaltopN.remove(finaltopN.firstKey());
} else if (broadcastDirection.equals("bottom")) {
finaltopN.remove(finaltopN.lastKey());
}
}
}
}
for (Map.Entry entry : finaltopN.entrySet()) {
System.out.println(entry.getKey() + "--" + entry.getValue());
}
System.exit(0);
}
}