Hadoop in-mapper combining examples

 

import java.io.IOException;
import java.util.*;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;




public class wordcount1 extends Configured implements Tool{

	public static class mapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, IntWritable>{

		@Override
		public void map(LongWritable key, Text value,
				OutputCollector<Text, IntWritable> output, Reporter report)
				throws IOException {
			// Per-record buffer: word -> count within this single input value
			Map<String, Integer> map = new HashMap<String,Integer>();
			String[] ss = value.toString().split(":");
			
			// Log which input file this record came from
			FileSplit fs = (FileSplit)report.getInputSplit();
			System.out.println(fs.getPath().toUri().toString());
			
			for(int i=0;i<ss.length;i++){
				if(!map.containsKey(ss[i])){
					map.put(ss[i], 1);
				}else{
					int tmp = map.get(ss[i])+1;
					map.put(ss[i], tmp);
				}
			}
			
			for(Map.Entry<String, Integer> m : map.entrySet()){
				System.out.println(m.getKey()+"\t"+m.getValue());
				output.collect(new Text(m.getKey()), new IntWritable(m.getValue()));
			}
		}
		
	}
	
	public static class reducer extends MapReduceBase implements Reducer<Text, IntWritable, Text,IntWritable>{

		@Override
		public void reduce(Text key, Iterator<IntWritable> value,
				OutputCollector<Text, IntWritable> output, Reporter report)
				throws IOException {
			int sum = 0;
			while(value.hasNext()){
				sum += value.next().get();
			}
			output.collect(key, new IntWritable(sum));
		}
		
	}

	@Override
	public int run(String[] args) throws Exception {

		// As a Tool, reuse the Configuration that ToolRunner supplies via setConf()
		JobConf job = new JobConf(getConf(), wordcount1.class);

		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		
		job.setJobName("test citation");
		job.setMapperClass(mapper.class);
		job.setReducerClass(reducer.class);
		/* Sample JobClient counters from a run with the combiner enabled --
		   note how 5 map output records are combined down to 4 reduce input records:
		12/04/08 13:56:09 INFO mapred.JobClient:     Reduce input groups=4
		12/04/08 13:56:09 INFO mapred.JobClient:     Combine output records=4
		12/04/08 13:56:09 INFO mapred.JobClient:     Map input records=4
		12/04/08 13:56:09 INFO mapred.JobClient:     Reduce shuffle bytes=0
		12/04/08 13:56:09 INFO mapred.JobClient:     Reduce output records=4
		12/04/08 13:56:09 INFO mapred.JobClient:     Spilled Records=8
		12/04/08 13:56:09 INFO mapred.JobClient:     Map output bytes=42
		12/04/08 13:56:09 INFO mapred.JobClient:     Map input bytes=33
		12/04/08 13:56:09 INFO mapred.JobClient:     Combine input records=5
		12/04/08 13:56:09 INFO mapred.JobClient:     Map output records=5
		12/04/08 13:56:09 INFO mapred.JobClient:     Reduce input records=4
		 * */
		job.setCombinerClass(reducer.class); // the sum reducer doubles as a combiner
		//job.setNumReduceTasks(2);
		
		job.setInputFormat(TextInputFormat.class);
		job.setOutputFormat(TextOutputFormat.class);
		
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(IntWritable.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		JobClient.runJob(job);
		
		return 0;
	}
	
	public static void main(String[] args) {
		try {
			System.exit(ToolRunner.run(new Configuration(), new wordcount1(), args));
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

This example can only aggregate within a single map input key/value pair.
For instance, if the value is huhu:xie:xie (the mapper splits on ":"), the map emits huhu 1 and xie 2,
whereas without this aggregation the output would be huhu 1, xie 1, xie 1.
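
To see that per-record aggregation in isolation, here is a minimal standalone sketch (plain Java, no Hadoop; the class name and input string are made up for illustration) of what the map() body above does for one input value:

import java.util.HashMap;
import java.util.Map;

public class PerRecordAggregationDemo {

	public static void main(String[] args) {
		// Hypothetical single input value; the mapper splits on ":"
		String value = "huhu:xie:xie";
		Map<String, Integer> map = new HashMap<String, Integer>();
		for (String s : value.split(":")) {
			Integer tmp = map.get(s);
			map.put(s, tmp == null ? 1 : tmp + 1);
		}
		// Emits one pair per distinct word in the record
		for (Map.Entry<String, Integer> m : map.entrySet()) {
			System.out.println(m.getKey() + "\t" + m.getValue());
		}
	}
}

Running it prints huhu 1 and xie 2, matching the combined output described above.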


import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class wordcount2 {

	public static class mapper extends Mapper<LongWritable, Text, Text, IntWritable>{

		// Accumulates word counts across every record this map task sees
		private Map<String, Integer> map;
		
		@Override
		protected void setup(Context context) throws IOException,
				InterruptedException {
			map = new HashMap<String,Integer>();
		}	

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] ss = value.toString().split(":");
			// does the combiner's work inside the mapper
			for(int i=0;i<ss.length;i++){
				if(!map.containsKey(ss[i])){
					map.put(ss[i], 1);
				}else{
					int tmp = map.get(ss[i])+1;
					map.put(ss[i], tmp);
				}
			}
		}
		
		@Override
		protected void cleanup(Context context) throws IOException,
				InterruptedException {
			// Emit the aggregated counts once the whole split has been processed
			for(Map.Entry<String, Integer> m : map.entrySet()){
				context.write(new Text(m.getKey()), new IntWritable(m.getValue()));
			}
		}
	}
	
	public static class reducer extends Reducer<Text, IntWritable, Text, IntWritable>{

		@Override
		protected void reduce(Text key, Iterable<IntWritable> value,
				Context context)
				throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable v : value) {
				sum += v.get();
			}
			context.write(key, new IntWritable(sum));
		}
	}


	
	public static void main(String[] args) {
		
		try {
			Job job = new Job();
			job.setJarByClass(wordcount2.class);
			job.setJobName("wordcount2");
			
			FileInputFormat.addInputPath(job, new Path("input"));
			FileOutputFormat.setOutputPath(job, new Path("output"));
			
			job.setMapperClass(mapper.class);
			job.setReducerClass(reducer.class);
			
			job.setInputFormatClass(TextInputFormat.class);
			job.setOutputFormatClass(TextOutputFormat.class);
			
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(IntWritable.class);
			System.exit( job.waitForCompletion(true) ? 0 : 1 );
			
		} catch (IOException | InterruptedException | ClassNotFoundException e) {
			e.printStackTrace();
		}
		
	}
}

This example can aggregate across multiple key/value pairs, even across records that came from different files (as long as they reach the same map task). It plays the same role as a Combiner, but a Combiner is only a Hadoop optimization hint: the framework may run it zero, one, or several times, so the job's correctness must never depend on it. In-mapper combining, by contrast, gives you explicit and reliable control over when aggregation happens.
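
Why is the sum reducer safe to reuse as a Combiner at all? Because addition is associative and commutative, pre-aggregating any subset of the values cannot change the final total. A minimal plain-Java check of that property (no Hadoop; the class name and numbers are made up for illustration):

import java.util.Arrays;
import java.util.List;

public class CombinerSafetyDemo {

	static int sum(List<Integer> values) {
		int s = 0;
		for (int v : values) s += v;
		return s;
	}

	public static void main(String[] args) {
		List<Integer> counts = Arrays.asList(1, 1, 1, 1, 1);
		// Reducer sees all five values at once
		int direct = sum(counts);
		// Combiner pre-aggregates two chunks; reducer sums the partial sums
		int combined = sum(Arrays.asList(sum(counts.subList(0, 2)),
				sum(counts.subList(2, 5))));
		System.out.println(direct + " == " + combined); // prints 5 == 5
	}
}

If the reduce function lacked this property (say, computing a mean of the raw values), wiring it in as a Combiner would silently corrupt the result.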

There is one catch: memory. Because this approach produces map output only after the entire input has been processed, the in-memory map can grow without bound and exhaust the heap. An effective fix is to set a threshold N and flush the buffer whenever N is reached, rather than waiting until all the input has been consumed:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class wordcount3 {

	public static class mapper extends Mapper<LongWritable, Text, Text, IntWritable>{

		private Map<String, Integer> map;
		private int N; // input records buffered since the last flush
		
		@Override
		protected void setup(Context context) throws IOException,
				InterruptedException {
			map = new HashMap<String,Integer>();
			N = 0;
		}	

		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] ss = value.toString().split(":");
			N++;
			// does the combiner's work inside the mapper
			for(int i=0;i<ss.length;i++){
				if(!map.containsKey(ss[i])){
					map.put(ss[i], 1);
				}else{
					int tmp = map.get(ss[i])+1;
					map.put(ss[i], tmp);
				}
			}
			
			// Demo threshold: flush the buffered counts every 2 input records
			if(N == 2){
				for(Map.Entry<String, Integer> m : map.entrySet()){
					context.write(new Text(m.getKey()), new IntWritable(m.getValue()));
				}
				N = 0;
				map.clear();
				System.out.println("flushed buffer after 2 input records");
			}
		}
		
		@Override
		protected void cleanup(Context context) throws IOException,
				InterruptedException {
			// emit the final buffered key/value pairs (from at most N unflushed records)
			if(map.size()>0){
				for(Map.Entry<String, Integer> m : map.entrySet()){
					context.write(new Text(m.getKey()), new IntWritable(m.getValue()));
				}
				System.out.println("writable last "+ map.size()+ " key/value");
			}
		}
	}
	
	public static class reducer extends Reducer<Text, IntWritable, Text, IntWritable>{

		@Override
		protected void reduce(Text key, Iterable<IntWritable> value,
				Context context)
				throws IOException, InterruptedException {
			int sum = 0;
			for (IntWritable v : value) {
				sum += v.get();
			}
			context.write(key, new IntWritable(sum));
		}
	}


	
	public static void main(String[] args) {
		
		try {
			Job job = new Job();
			job.setJarByClass(wordcount3.class);
			job.setJobName("wordcount2");
			
			FileInputFormat.addInputPath(job, new Path(args[0]));
			FileOutputFormat.setOutputPath(job, new Path(args[1]));
			
			job.setMapperClass(mapper.class);
			job.setReducerClass(reducer.class);
			//job.setCombinerClass(reducer.class);
			
			job.setInputFormatClass(TextInputFormat.class);
			job.setOutputFormatClass(TextOutputFormat.class);
			
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(IntWritable.class);
			System.exit( job.waitForCompletion(true) ? 0 : 1 );
			
		} catch (IOException | InterruptedException | ClassNotFoundException e) {
			e.printStackTrace();
		}
		
	}
}
If N is too large, the task can run out of memory; if N is too small, aggregation becomes less effective. Choosing N well is important.
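
One refinement worth sketching: bound the buffer by the number of distinct keys it holds rather than by the input-record count, since distinct keys are what actually occupy the HashMap. The sketch below is my own variant, not from the original code; the MAX_KEYS value and the flush() helper are hypothetical:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SizeBoundedMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

	// Hypothetical threshold: flush once this many distinct words are buffered
	private static final int MAX_KEYS = 10000;
	private Map<String, Integer> map;

	@Override
	protected void setup(Context context) {
		map = new HashMap<String, Integer>();
	}

	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		for (String s : value.toString().split(":")) {
			Integer tmp = map.get(s);
			map.put(s, tmp == null ? 1 : tmp + 1);
		}
		// Bound memory by distinct-key count instead of input-record count
		if (map.size() >= MAX_KEYS) {
			flush(context);
		}
	}

	@Override
	protected void cleanup(Context context) throws IOException, InterruptedException {
		flush(context); // emit whatever is still buffered
	}

	private void flush(Context context) throws IOException, InterruptedException {
		for (Map.Entry<String, Integer> m : map.entrySet()) {
			context.write(new Text(m.getKey()), new IntWritable(m.getValue()));
		}
		map.clear();
	}
}

With a distinct-key bound, the worst-case memory footprint no longer depends on how repetitive the input is, which makes the threshold easier to size against the task's heap.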
