Hadoop MapReduce Example Code

Exercises from Chapter 4 of "Hadoop in Action":

0. MaxValue: output the largest CITED value in cite75_99.txt:

Key points:

    1. The Mapper outputs only the maximum value among the records it processes (override cleanup()).

    2. Set the number of Reducers to one with -D mapred.reduce.tasks=1; the single Reducer likewise outputs only the maximum value it receives (override cleanup()). A sample launch command is shown after this list.

    3. cleanup() runs once when the task finishes; see the API for details.
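
For reference, a launch command might look like the following (the jar name and the input/output paths are placeholders, not part of the original exercise):

    hadoop jar MaxValue.jar MaxValue -D mapred.reduce.tasks=1 cite75_99.txt max-out

Since ToolRunner hands the arguments to GenericOptionsParser, the -D option must come before the positional input and output arguments.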

The code is as follows:

/*
 * MaxValue
 * Purpose: output the largest CITED value in the patent citation data
 * Author: jokes000
 * Date: 2011-12-15
 */

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class MaxValue extends Configured implements Tool {

	public static class MapClass extends Mapper<LongWritable,Text,Text,Text> {
		
		int max = 0;	// largest CITED value seen by this mapper
		
		// Map Method
		@Override
		public void map(LongWritable key, Text value, Context context) {
			// CITED is the second column of cite75_99.txt ("CITING","CITED")
			String[] citation = value.toString().split(",", 2);
			try {
				int tmp = Integer.parseInt(citation[1]);
				if( tmp > max ) max = tmp;
			} catch(NumberFormatException e){
				// do nothing.
			}
		}
		
		@Override
		protected void cleanup(Context context) throws IOException, InterruptedException {
			context.write(new Text(max+""), new Text(max+""));
		}
	}
	
	public static class Reduce extends Reducer<Text,Text,Text,IntWritable> {
		
		int max = 0;
		// Reduce Method
		public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
			for(Text value : values) {
				try {
					int tmp = Integer.parseInt(value.toString());
					if( tmp > max ) max = tmp;
				} catch(NumberFormatException e) {
					// do nothing.
				}
			}
			//context.write(new Text("Max"), new IntWritable(max));
		}
		
		@Override
		protected void cleanup(Context context) throws IOException, InterruptedException {
			context.write(new Text("Max"), new IntWritable(max));
		}
	}
	
	@Override
	public int run(String[] arg0) throws Exception {
		
		// Build the job from getConf() so that options parsed by ToolRunner
		// (e.g. -D mapred.reduce.tasks=1) take effect.
		Job job = new Job(getConf());
		job.setJarByClass(MaxValue.class);
		
		FileInputFormat.addInputPath(job, new Path(arg0[0]));
		FileOutputFormat.setOutputPath(job, new Path(arg0[1]));
		
		job.setMapperClass(MapClass.class);
		job.setReducerClass(Reduce.class);
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		return job.waitForCompletion(true) ? 0 : 1;
	}
	
	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new Configuration(), new MaxValue(), args);
		System.exit(res);
	}

}


1. TopKValues: output the K largest values of the 9th column (CLAIMS) in apat63_99.txt:

Key points:

    1. The Mapper outputs only the K largest values among the records it processes (override cleanup()).

    2. Set the number of Reducers to 1 with -D mapred.reduce.tasks=1; the single Reducer sorts the values received from the Mappers and outputs the K largest (override cleanup()). A sample launch command is shown after this list.
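
For reference, a launch command might look like the following (the jar name and paths are placeholders; the third positional argument is K):

    hadoop jar TopKValues.jar TopKValues -D mapred.reduce.tasks=1 apat63_99.txt topk-out 5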

The code is as follows:

/*
 * TopKValues
 * Purpose: output the K largest CLAIMS values
 * Author: jokes000
 * Date: 2011-12-15
 */

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class TopKValues extends Configured implements Tool {

	public static class MapClass extends Mapper<LongWritable,Text,Text,IntWritable> {
		// Per-mapper state
		int len;	// K
		int[] top;	// array holding the current top K values (index 0 is a working slot)
		
		// Map Method
		@Override
		public void map(LongWritable key, Text value, Context context) {
			// CLAIMS is the 9th column (index 8); a negative limit keeps trailing
			// empty fields so the column indexes stay stable
			String[] fields = value.toString().split(",", -1);
			try {
				int claims = Integer.parseInt(fields[8]);
				add(claims);
			} catch(NumberFormatException e) {
				// do nothing..
			}
		}
		
		// Keep the K largest values seen so far: the new value overwrites slot 0
		// (the smallest element after the previous sort) and the array is re-sorted,
		// so top[1..len] always holds the K largest values in ascending order.
		private void add(int value) {
			top[0] = value;
			Arrays.sort(top);
		}
		
		@Override
		protected void setup(Context context) {
			// Read K from the job configuration (default 10 if it was not set)
			len = context.getConfiguration().getInt("K", 10);
			top = new int[len+1];	// index 0 is a working slot; indexes 1..len hold the top K
		}
		
		@Override
		protected void cleanup(Context context) throws IOException, InterruptedException {
			for( int i = 1; i <= len; ++ i ) {
				context.write(new Text(top[i]+""), new IntWritable(top[i]));
			}
		}
	}
	
	public static class Reduce extends Reducer<Text,IntWritable,Text,IntWritable> {
		int[] top;
		int len;
		
		@Override
		protected void setup(Context context) {
			len = context.getConfiguration().getInt("K", 10);
			top = new int[len+1];
		}
		
		// Same fixed-size array trick as in the Mapper's add()
		private void add(int value) {
			top[0] = value;
			Arrays.sort(top);
		}
		
		// Reduce Method
		public void reduce(Text key, Iterable<IntWritable> values, Context context) {
			for(IntWritable value : values) {
				add(value.get());
			}
		}
		
		@Override
		protected void cleanup(Context context) throws IOException, InterruptedException {
			for( int i = len; i > 0; -- i ) {
				context.write(new Text("No."+(len-i+1)), new IntWritable(top[i]));
			}
		}
	}
	
	@Override
	public int run(String[] arg0) throws Exception {
		
		// Set K on the configuration before the Job is created so that the
		// mappers and reducers can read it in setup().
		if (arg0.length > 2) {
			try {
				getConf().setInt("K", Integer.parseInt(arg0[2]));
			} catch(NumberFormatException e) {
				getConf().setInt("K", 20);
			}
		}
		
		// Build the job from getConf() so that the K value above and options parsed
		// by ToolRunner (e.g. -D mapred.reduce.tasks=1) take effect.
		Job job = new Job(getConf());
		job.setJarByClass(TopKValues.class);
		
		FileInputFormat.addInputPath(job, new Path(arg0[0]));
		FileOutputFormat.setOutputPath(job, new Path(arg0[1]));
		
		job.setMapperClass(MapClass.class);
		job.setReducerClass(Reduce.class);
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		
		return job.waitForCompletion(true) ? 0 : 1;
	}
	
	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new Configuration(), new TopKValues(), args);
		System.exit(res);
	}

}

