Full code available at: github-wttttt
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
// LogCount: count each IP address's visits
// map(): map each line to <IPAddress, 1>
// reduce(): sum the counts into <IPAddress, total_count>
// use a combiner to cut down the intermediate data shuffled to the reducers
public class LogCount {
    public static class xxMapper
            extends Mapper<Object, Text, Text, IntWritable> { // extends the Hadoop Mapper base class
        private final static IntWritable one = new IntWritable(1); // the constant 1
        private Text word = new Text();
        public void map(Object key, Text value, Context context
                        ) throws IOException, InterruptedException {
            // for each line, the IP address is the first space-separated field
            word.set(value.toString().split(" ")[0]);
            context.write(word, one);
        }
    }
    public static class xxReducer
            extends Reducer<Text, IntWritable, Text, IntWritable> { // extends the Hadoop Reducer base class
        private IntWritable result = new IntWritable();
        public void reduce(Text key, Iterable<IntWritable> values,
                           Context context) throws IOException, InterruptedException {
            int sum = 0;
            // for each key, sum up its counts
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // read the remaining options from the command line, including the input and output paths
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        // validate the arguments
        if (otherArgs.length < 2) {
            System.err.println("Usage: LogCount <in> [<in>...] <out>");
            System.exit(2);
        }
        Job job = new Job(conf, "Log Count");
        job.setJarByClass(LogCount.class);
        job.setMapperClass(xxMapper.class);
        // The combiner and the reducer can share one class when the combine logic
        // is identical to the reduce logic; otherwise write a separate combiner class,
        // usually also extending Reducer (see the sketch after this class).
        job.setCombinerClass(xxReducer.class); // combiner and reducer use the same class
        job.setReducerClass(xxReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        for (int i = 0; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
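For the case mentioned above where the combine logic differs from the reduce logic, a separate combiner class can be written. Below is a minimal sketch, meant to sit in the same file as LogCount (it reuses the imports above; the class name xxCombiner is made up for illustration):

    // Hypothetical standalone combiner: pre-aggregates counts on the map side.
    // Its output key/value types must match the map output types, because the
    // reducer will consume these pairs as its input, and Hadoop may run the
    // combiner zero or more times.
    public static class xxCombiner
            extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable partial = new IntWritable();
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get(); // partial sum; the reducer finishes the total
            }
            partial.set(sum);
            context.write(key, partial);
        }
    }

It would then be wired in with job.setCombinerClass(xxCombiner.class) instead of reusing xxReducer.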
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
/**
 TopK problem
 Log_Max_k: find the IP addresses with the top-k visit counts
 map(): get the top-k within each mapper
 * Use a TreeMap to store each mapper's local top-k
 * For each mapper:
   for each record, try to update the TreeMap, so that at the end it holds the top-k
 * The TreeMap keeps its keys sorted; evicting firstKey() after each insert
   makes it behave like a size-k min-heap.
 * Unlike the usual pattern (emit after each line), we emit only after the whole
   input split has been processed, via cleanup() (which runs after the map task).
 reduce(): compute the global top-k in a single reducer
 * we need exactly one reducer to guarantee a global top-k
 The value of k is passed to the Mapper & Reducer from outside (the command line),
 using conf.set() and conf.get().
**/
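To see the TreeMap-as-heap idea in isolation, here is a self-contained sketch (plain Java, with made-up counts and IPs):

import java.util.TreeMap;

public class TopKSketch {
    public static void main(String[] args) {
        int k = 3;
        int[] counts = {5, 12, 7, 3, 9};
        String[] ips = {"10.0.0.1", "10.0.0.2", "10.0.0.3", "10.0.0.4", "10.0.0.5"};
        // key = visit count; TreeMap keeps keys in ascending order
        TreeMap<Integer, String> tree = new TreeMap<Integer, String>();
        for (int i = 0; i < counts.length; i++) {
            tree.put(counts[i], ips[i]); // caveat: equal counts overwrite each other
            if (tree.size() > k) {
                tree.remove(tree.firstKey()); // evict the minimum, like a size-k min-heap
            }
        }
        System.out.println(tree); // {7=10.0.0.3, 9=10.0.0.5, 12=10.0.0.2}
    }
}

Note the caveat in the comment: because the count is the map key, two IPs with the same count collide; the same limitation applies to the MapReduce version below.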
public class Log_Max_k {
    public static class xxMap extends
            Mapper<LongWritable, Text, Text, IntWritable> {
        /**
         * the map function
         * input file: one record per line, formatted as: IPAddress\tVisitNum
         */
        // key = visit count, value = IP address; sorted ascending by count
        private TreeMap<Integer, Text> tree = new TreeMap<Integer, Text>();
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // read the parameter set via conf.set() through the Context object
            Configuration conf = context.getConfiguration();
            int K = conf.getInt("K_value", 10); // default = 10
            String[] values = value.toString().split("\t"); // Tab split
            Text txt = new Text();
            txt.set(values[0]); // values[0] = IP address, values[1] = visit count
            tree.put(Integer.parseInt(values[1]), txt);
            if (tree.size() > K) {
                tree.remove(tree.firstKey()); // evict the smallest to keep the top-k
            }
        }
        @Override
        protected void cleanup(Context context) throws IOException,
                InterruptedException {
            /**
             * emit only after the whole input split has been processed,
             * from the cleanup() hook
             */
            // iterate over the TreeMap with an Iterator
            Iterator<Map.Entry<Integer, Text>> iter = tree.entrySet().iterator();
            while (iter.hasNext()) {
                Map.Entry<Integer, Text> ent = iter.next();
                // write: IPAddress \t visit count
                context.write(ent.getValue(), new IntWritable(ent.getKey()));
            }
        }
    }
    public static class xxReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
        private TreeMap<Integer, Text> tree = new TreeMap<Integer, Text>();
        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            Configuration conf = context.getConfiguration();
            int K = conf.getInt("K_value", 10); // default = 10
            for (IntWritable visit_num : values) {
                // copy the key: Hadoop reuses the Text object between reduce() calls
                tree.put(visit_num.get(), new Text(key));
                if (tree.size() > K) {
                    tree.remove(tree.firstKey());
                }
            }
        }
        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // emit once, after all keys have been seen; emitting inside reduce()
            // would rewrite the partial top-k after every key
            Iterator<Map.Entry<Integer, Text>> iter = tree.entrySet().iterator();
            while (iter.hasNext()) {
                Map.Entry<Integer, Text> ent = iter.next();
                context.write(ent.getValue(), new IntWritable(ent.getKey()));
            }
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // read the remaining options from the command line, including the input and output paths
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        // validate the arguments
        if (otherArgs.length < 3) {
            System.err.println("Usage: Log_Max_k <k> <in> [<in>...] <out>");
            System.exit(2);
        }
        // set K before creating the Job: the Job copies the Configuration,
        // so values set on conf afterwards are not seen by the tasks
        conf.set("K_value", otherArgs[0]);
        Job job = new Job(conf, "TopKIP");
        job.setJarByClass(Log_Max_k.class);
        job.setMapperClass(xxMap.class);
        // job.setCombinerClass(...); // not used: the cleanup()-based reducer does not fit the combiner contract
        job.setReducerClass(xxReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(1); // a single reducer guarantees a global top-k
        for (int i = 1; i < otherArgs.length - 1; ++i) {
            FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
        }
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
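Assuming the classes are packaged into a jar named, say, logtopk.jar (the jar name and paths here are made up), the job could be launched with k = 10 as:

hadoop jar logtopk.jar Log_Max_k 10 /log/input /log/output_topk

The first remaining argument (10) becomes otherArgs[0] and is forwarded through conf.set("K_value", ...); the rest are the input path(s) and the output path.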
Appendix 3: detailed walkthrough of the TopK problem: