MapReduce前N个热度统计(TopN)

在Reduce阶段,先对每个Key(页面)的访问次数求和,再把结果封装成对象缓存到TreeMap中;TreeMap会按照对象自身的比较规则(compareTo)自动排序,最终在cleanup阶段输出前N个热门访问页面。

1. 自定义数据类型PageCount,并实现Comparable比较规则(按访问次数降序,次数相同时按页面名升序)

package com.gerry.bigdata.mapreduce.top3;

public class PageCount implements Comparable {
	
	private String page;
	private int count;
	
	public void set(String page, int count) {
		this.page = page;
		this.count = count;
	}


	public String getPage() {
		return page;
	}

	public void setPage(String page) {
		this.page = page;
	}

	public int getCount() {
		return count;
	}

	public void setCount(int count) {
		this.count = count;
	}


	@Override
	public int compareTo(PageCount o) {
		// TODO Auto-generated method stub
		return o.getCount()-this.getCount()==0?this.page.compareTo(o.getPage()):o.getCount()-this.getCount();
	}
	
	

}

2. Job提交客户端

package com.gerry.bigdata.mapreduce.top3;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Client entry point that configures the page Top-N MapReduce job and submits it.
 */
public class JobSubmiter {

	/**
	 * Builds the job, blocks until it finishes, and terminates the JVM with
	 * exit status 0 on success or 1 on failure.
	 *
	 * @param args command-line arguments; not read by the currently active configuration path
	 * @throws Exception if the job cannot be created, submitted, or monitored
	 */
	public static void main(String[] args) throws Exception {
		Configuration configuration = new Configuration();

		// Alternative ways of supplying the "top.n" parameter (kept for reference):
		//   Option 1 - from the command line:
		//       conf.setInt("top.n", Integer.parseInt(args[0]));
		//   Option 2 - hard-coded:
		//       conf.setInt("top.n", 3);
		//   Option 3 - from a properties file on the classpath:
		//       Properties props = new Properties();
		//       props.load(JobSubmiter.class.getClassLoader().getResourceAsStream("topn.properties"));
		//       conf.setInt("top.n", Integer.parseInt(props.getProperty("top.n")));
		//
		// Option 4 (active) - load a configuration file from the classpath;
		// presumably it defines "top.n" - confirm against the resource.
		configuration.addResource("oo.xml");

		Job topnJob = Job.getInstance(configuration);

		// Locate the job jar through this class.
		// An explicit jar could be set instead:
		//     job.setJar("/home/gerry/pyspark/Bigdata/jars/Topn.jar");
		topnJob.setJarByClass(JobSubmiter.class);

		topnJob.setMapperClass(PageTopnMapper.class);
		topnJob.setReducerClass(PageTopnReducer.class);

		// Intermediate (map output) key/value types.
		topnJob.setMapOutputKeyClass(Text.class);
		topnJob.setMapOutputValueClass(IntWritable.class);

		// Final (job output) key/value types.
		topnJob.setOutputKeyClass(Text.class);
		topnJob.setOutputValueClass(IntWritable.class);

		// To remove an existing output directory on HDFS before running:
		//     FileSystem fs = FileSystem.get(new URI("hdfs://172.16.0.2:9000/"), conf, "root");
		//     if (fs.exists(outPath)) {
		//         fs.delete(outPath, true);
		//     }
		Path requestsInput = new Path("/home/gerry/pyspark/Bigdata/data/requests/input");
		Path requestsOutput = new Path("/home/gerry/pyspark/Bigdata/data/requests//output");
		FileInputFormat.setInputPaths(topnJob, requestsInput);
		FileOutputFormat.setOutputPath(topnJob, requestsOutput);

		// The number of reduce tasks is left at the default; it could be set with
		//     job.setNumReduceTasks(3);
		boolean succeeded = topnJob.waitForCompletion(true);
		if (succeeded) {
			System.exit(0);
		} else {
			System.exit(1);
		}
	}

}

3. Mapper端

package com.gerry.bigdata.mapreduce.top3;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper that emits {@code (page, 1)} for every input line.
 *
 * <p>Each line is split on single spaces and the second field is used as the
 * page key. The type arguments are required so that {@link #map} actually
 * overrides the framework method.
 */
public class PageTopnMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

	/** Constant count emitted for every request; safe to share because it is never modified. */
	private static final IntWritable ONE = new IntWritable(1);

	/** Output key holder, reused across calls to avoid allocating one object per record. */
	private final Text page = new Text();

	/**
	 * Emits the second space-separated field of the line with a count of 1.
	 *
	 * @param key     the input key supplied by the input format (unused)
	 * @param value   one line of input
	 * @param context the task context used to emit output and update counters
	 * @throws IOException          if writing the output fails
	 * @throws InterruptedException if the task is interrupted while writing
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		String line = value.toString();
		String[] split = line.split(" ");
		if (split.length < 2) {
			// Blank or short lines have no second field; count them instead of
			// failing the whole task with ArrayIndexOutOfBoundsException.
			context.getCounter("PageTopn", "MALFORMED_LINES").increment(1);
			return;
		}
		page.set(split[1]);
		context.write(page, ONE);
	}
}

4. Reducer端:累加每个key(页面)的访问次数,用TreeMap缓存并排序,最后在cleanup中输出前N个

package com.gerry.bigdata.mapreduce.top3;

import java.io.IOException;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reducer that sums the counts of each page and emits the N pages with the
 * highest totals once all keys have been processed.
 *
 * <p>N is read from the {@code "top.n"} configuration property (default 5).
 * Totals are held in a sorted map keyed by {@link PageCount}, so the ordering
 * is whatever {@code PageCount.compareTo} defines. The map is private to this
 * reducer instance: with more than one reduce task, each task emits its own
 * top N rather than a global one.
 */
public class PageTopnReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

	/** Value used for {@code "top.n"} when the property is not configured. */
	private static final int DEFAULT_TOP_N = 5;

	/** Sorted candidates; only the keys carry information, the values are always null. */
	TreeMap<PageCount, Object> treeMap = new TreeMap<PageCount, Object>();

	/** Maximum number of entries to keep and emit; refreshed from the configuration in setup. */
	private int topn = DEFAULT_TOP_N;

	/**
	 * Reads {@code "top.n"} up front so that {@link #reduce} can bound the map size.
	 *
	 * @param context the task context providing the job configuration
	 */
	@Override
	protected void setup(Context context) throws IOException, InterruptedException {
		Configuration conf = context.getConfiguration();
		topn = conf.getInt("top.n", DEFAULT_TOP_N);
	}

	/**
	 * Sums all counts of one page and records the total as a Top-N candidate.
	 *
	 * @param Key     the page
	 * @param values  the partial counts emitted for that page
	 * @param context the task context (output is deferred to {@link #cleanup})
	 */
	@Override
	protected void reduce(Text Key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		int count = 0;
		for (IntWritable value : values) {
			count += value.get();
		}

		if (topn <= 0) {
			// Nothing will be emitted, so there is no point in keeping candidates.
			return;
		}

		PageCount pageCount = new PageCount();
		pageCount.set(Key.toString(), count);
		treeMap.put(pageCount, null);

		// Keep at most topn entries: drop the one that sorts last so memory
		// stays bounded regardless of how many distinct pages are seen.
		if (treeMap.size() > topn) {
			treeMap.pollLastEntry();
		}
	}

	/**
	 * Emits the retained entries in sorted order, at most {@code top.n} of them.
	 *
	 * @param context the task context used to write the final output
	 * @throws IOException          if writing the output fails
	 * @throws InterruptedException if the task is interrupted while writing
	 */
	@Override
	protected void cleanup(Context context) throws IOException, InterruptedException {
		Set<Entry<PageCount, Object>> entrySet = treeMap.entrySet();
		int i = 0;
		for (Entry<PageCount, Object> entry : entrySet) {
			// ">=" checked before writing also stops correctly for a non-positive
			// limit, which the former "i == topn" test after the write never did.
			if (i >= topn) {
				return;
			}
			context.write(new Text(entry.getKey().getPage()), new IntWritable(entry.getKey().getCount()));
			i++;
		}
	}

}

 

你可能感兴趣的:(Hadoop)