import java.io.IOException;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class wordcount1 extends Configured implements Tool {

    public static class mapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {

        @Override
        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter report)
                throws IOException {
            // aggregate counts locally, but only within this single input record
            Map<String, Integer> map = new HashMap<String, Integer>();
            String[] ss = value.toString().split(":");

            FileSplit fs = (FileSplit) report.getInputSplit();
            System.out.println(fs.getPath().toUri().toString());

            for (int i = 0; i < ss.length; i++) {
                if (!map.containsKey(ss[i])) {
                    map.put(ss[i], 1);
                } else {
                    int tmp = map.get(ss[i]) + 1;
                    map.put(ss[i], tmp);
                }
            }
            for (Map.Entry<String, Integer> m : map.entrySet()) {
                System.out.println(m.getKey() + "\t" + m.getValue());
                output.collect(new Text(m.getKey()), new IntWritable(m.getValue()));
            }
        }
    }

    public static class reducer extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        public void reduce(Text key, Iterator<IntWritable> value,
                OutputCollector<Text, IntWritable> output, Reporter report)
                throws IOException {
            int sum = 0;
            while (value.hasNext()) {
                sum += value.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    @Override
    public int run(String[] arg0) throws Exception {
        Configuration conf = new Configuration();
        JobConf job = new JobConf(conf, wordcount1.class);

        FileInputFormat.addInputPath(job, new Path(arg0[0]));
        FileOutputFormat.setOutputPath(job, new Path(arg0[1]));

        job.setJobName("test citation");
        job.setMapperClass(mapper.class);
        job.setReducerClass(reducer.class);

        /* 12/04/08 13:56:09 INFO mapred.JobClient: Reduce input groups=4
           12/04/08 13:56:09 INFO mapred.JobClient: Combine output records=4
           12/04/08 13:56:09 INFO mapred.JobClient: Map input records=4
           12/04/08 13:56:09 INFO mapred.JobClient: Reduce shuffle bytes=0
           12/04/08 13:56:09 INFO mapred.JobClient: Reduce output records=4
           12/04/08 13:56:09 INFO mapred.JobClient: Spilled Records=8
           12/04/08 13:56:09 INFO mapred.JobClient: Map output bytes=42
           12/04/08 13:56:09 INFO mapred.JobClient: Map input bytes=33
           12/04/08 13:56:09 INFO mapred.JobClient: Combine input records=5
           12/04/08 13:56:09 INFO mapred.JobClient: Map output records=5
           12/04/08 13:56:09 INFO mapred.JobClient: Reduce input records=4 */
        job.setCombinerClass(reducer.class);
        //job.setNumReduceTasks(2);

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        JobClient.runJob(job);
        return 0;
    }

    public static void main(String[] args) {
        try {
            System.exit(ToolRunner.run(new Configuration(), new wordcount1(), args));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

This example can only aggregate within a single map input key/value pair, i.e. within one line of input. For instance, if a value splits into the tokens huhu, xie, xie, the map emits huhu 1 and xie 2; without this aggregation it would emit huhu 1, xie 1, xie 1.
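For contrast, here is a minimal sketch (not part of the original program) of what the same map method looks like without the per-record aggregation, reusing wordcount1's old mapred API and ":" delimiter. Every occurrence goes to the shuffle as a separate (token, 1) pair.

    // Sketch only: the non-aggregating variant of wordcount1's map method,
    // reusing the same imports and class structure as wordcount1.
    public void map(LongWritable key, Text value,
            OutputCollector<Text, IntWritable> output, Reporter report)
            throws IOException {
        String[] ss = value.toString().split(":");
        for (int i = 0; i < ss.length; i++) {
            // each occurrence is emitted as (token, 1); duplicates within the
            // same line are not combined on the map side
            output.collect(new Text(ss[i]), new IntWritable(1));
        }
    }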
wordcount2 switches to the new mapreduce API and aggregates across all key/value pairs handled by one map task:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class wordcount2 {

    public static class mapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private Map<String, Integer> map;

        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            map = new HashMap<String, Integer>();
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] ss = value.toString().split(":");
            // does the combiner's work: accumulate counts across all records
            // handled by this map task
            for (int i = 0; i < ss.length; i++) {
                if (!map.containsKey(ss[i])) {
                    map.put(ss[i], 1);
                } else {
                    int tmp = map.get(ss[i]) + 1;
                    map.put(ss[i], tmp);
                }
            }
        }

        @Override
        protected void cleanup(Context context)
                throws IOException, InterruptedException {
            // emit the accumulated counts only after all input has been processed
            for (Map.Entry<String, Integer> m : map.entrySet()) {
                context.write(new Text(m.getKey()), new IntWritable(m.getValue()));
            }
        }
    }

    public static class reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> value, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : value) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) {
        try {
            Job job = new Job();
            job.setJarByClass(wordcount2.class);
            job.setJobName("wordcount2");

            FileInputFormat.addInputPath(job, new Path("input"));
            FileOutputFormat.setOutputPath(job, new Path("output"));

            job.setMapperClass(mapper.class);
            job.setReducerClass(reducer.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
}

This example can aggregate across multiple key/value pairs, including pairs coming from different files handled by the same map task. It plays the same role as a Combiner, but Hadoop treats the Combiner purely as an optimization and does not guarantee that it runs (or how many times), so correctness must not depend on it; in-mapper combining gives explicit control over when and how the aggregation happens. There is one problem: memory. Because the map output is only produced after all input has been processed, the in-memory map can grow until memory runs out. An effective remedy is to set a threshold N and emit (then clear) the accumulated counts whenever N is reached, instead of waiting until everything has been processed; wordcount3 below implements this.
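Before that, for comparison, here is a minimal sketch of the framework-Combiner route referred to above, written against the new API. The class name wordcountCombiner, the job name, and the reuse of the ":" delimiter are illustrative assumptions, not part of the original code; whether and how often the combiner actually runs is decided by the framework.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class wordcountCombiner {

    public static class TokenMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // plain word count: emit (token, 1) for every occurrence and let
            // the (optional) combiner merge them per map task
            for (String s : value.toString().split(":")) {
                word.set(s);
                context.write(word, ONE);
            }
        }
    }

    public static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = new Job(new Configuration(), "wordcount with combiner");
        job.setJarByClass(wordcountCombiner.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(TokenMapper.class);
        // summing is associative and commutative, so the reducer can double as
        // the combiner; Hadoop treats this purely as an optimization hint
        job.setCombinerClass(SumReducer.class);
        job.setReducerClass(SumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}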
wordcount3 implements the threshold idea, flushing after every N = 2 input records (a deliberately tiny value, for demonstration):

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class wordcount3 {

    public static class mapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private Map<String, Integer> map;
        private int N;

        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            map = new HashMap<String, Integer>();
            N = 0;
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] ss = value.toString().split(":");
            N++;
            // does the combiner's work: accumulate counts in memory
            for (int i = 0; i < ss.length; i++) {
                if (!map.containsKey(ss[i])) {
                    map.put(ss[i], 1);
                } else {
                    int tmp = map.get(ss[i]) + 1;
                    map.put(ss[i], tmp);
                }
            }
            // flush and reset once the threshold is reached
            if (N == 2) {
                for (Map.Entry<String, Integer> m : map.entrySet()) {
                    context.write(new Text(m.getKey()), new IntWritable(m.getValue()));
                }
                N = 0;
                map.clear();
                System.out.println("write two key/value");
            }
        }

        @Override
        protected void cleanup(Context context)
                throws IOException, InterruptedException {
            // write out whatever is left, accumulated from fewer than N records
            if (map.size() > 0) {
                for (Map.Entry<String, Integer> m : map.entrySet()) {
                    context.write(new Text(m.getKey()), new IntWritable(m.getValue()));
                }
                System.out.println("write last " + map.size() + " key/value");
            }
        }
    }

    public static class reducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> value, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : value) {
                sum += v.get();
            }
            context.write(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) {
        try {
            Job job = new Job();
            job.setJarByClass(wordcount3.class);
            job.setJobName("wordcount3");

            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));

            job.setMapperClass(mapper.class);
            job.setReducerClass(reducer.class);
            //job.setCombinerClass(reducer.class);
            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            System.exit(job.waitForCompletion(true) ? 0 : 1);
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        }
    }
}
If N is too large, the mapper can run out of memory; if N is too small, little aggregation happens and the benefit is lost. Choosing N well is therefore important.
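One way to soften the choice is sketched below (not from the original code): read the threshold from the job Configuration instead of hard-coding it, and flush on the number of distinct keys held in memory rather than the number of input records, which tracks memory use more directly. The property name wordcount.flush.threshold and its default of 10000 are made-up values for illustration; the class is a drop-in replacement for wordcount3's mapper and reuses its imports and driver.

    // Sketch only: configurable, size-bounded flushing for the in-mapper combiner.
    public static class mapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private Map<String, Integer> map;
        private int threshold;

        @Override
        protected void setup(Context context) {
            map = new HashMap<String, Integer>();
            // let the job configuration decide the threshold
            // ("wordcount.flush.threshold" is an assumed property name)
            threshold = context.getConfiguration()
                    .getInt("wordcount.flush.threshold", 10000);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String s : value.toString().split(":")) {
                Integer c = map.get(s);
                map.put(s, c == null ? 1 : c + 1);
            }
            // flush on the number of distinct keys held in memory
            if (map.size() >= threshold) {
                flush(context);
            }
        }

        @Override
        protected void cleanup(Context context)
                throws IOException, InterruptedException {
            flush(context);
        }

        private void flush(Context context)
                throws IOException, InterruptedException {
            for (Map.Entry<String, Integer> m : map.entrySet()) {
                context.write(new Text(m.getKey()), new IntWritable(m.getValue()));
            }
            map.clear();
        }
    }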