WordCount in MapReduce

package wordcount;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class WordCount {
	
	// The input key is the byte offset of the line within the file; Object here could also be declared as LongWritable
	public static class WMap extends Mapper<Object, Text, Text, IntWritable>
	{
		private Text word = new Text();
		private IntWritable one = new IntWritable(1);
		// The framework reads records from the input split and calls map once per (offset, line) pair.
		// Since map runs once per record, reusable objects such as word and one are better defined as fields above.
		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			// Split the line into words
			StringTokenizer st = new StringTokenizer(value.toString());
			while(st.hasMoreTokens())
			{
				// Get the next word
				word.set(st.nextToken());
				// Emit (word, 1), e.g. ("aaa", 1); after the shuffle these pairs are grouped by key and passed to reduce
				context.write(word, one);
			}
		}
	}
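	
	// Example data flow: for the input line "Hello MapReduce Hello Hadoop", the mapper
	// emits ("Hello",1), ("MapReduce",1), ("Hello",1), ("Hadoop",1). The shuffle groups
	// these pairs by key, so the reducer receives ("Hello",[1,1]), ("Hadoop",[1]) and
	// ("MapReduce",[1]).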
	
	public static class WReduce extends Reducer<Text, IntWritable, Text, IntWritable>
	{
		private IntWritable sum = new IntWritable();
		@Override // Receives (key, [1, 1, ...]) after the shuffle, one call per distinct key
		protected void reduce(Text key, Iterable<IntWritable> values,
				Context context) throws IOException, InterruptedException {
			int num = 0;
			// Sum the values for this key to get the total count
			for(IntWritable tmp : values)
			{
				num += tmp.get();
			}
			
			sum.set(num);
			
			// Write the (word, count) result; it ends up in the output files on HDFS
			context.write(key, sum);
			
		}
	}
	
	public static void main(String [] args) throws IOException, ClassNotFoundException, InterruptedException
	{
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "word count");
		job.setJarByClass(WordCount.class);
		job.setMapperClass(WMap.class);
		job.setReducerClass(WReduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		// Specify the input path
		FileInputFormat.addInputPath(job, new Path("hdfs://node01:9000/ld/in/"));
		// Specify the output path (it must not already exist, or the job will fail)
		FileOutputFormat.setOutputPath(job, new Path("hdfs://node01:9000/ld/out/wordcount"));
		// Submit the job and wait for completion; exit 0 on success
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

}
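
Because the reduce logic is plain integer addition (associative and commutative), the same WReduce class can also be registered as a combiner, so counts are pre-aggregated on the map side and less data crosses the shuffle. A minimal sketch, adding one line to main alongside the other job.set* calls:

		// Optional: run WReduce as a map-side combiner; safe here because
		// addition is associative and commutative, so the final counts are unchanged
		job.setCombinerClass(WReduce.class);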
The data under the in directory is the following line (the counts below imply it appears six times across the input files):
Hello MapReduce Hello Hadoop
The result is:
Hadoop 6
Hello 12
MapReduce 6
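
To run it, the class is typically packaged into a jar and launched on the cluster; assuming the jar is named wordcount.jar (name hypothetical), the invocation would look like:

		hadoop jar wordcount.jar wordcount.WordCount

Each reducer writes its output to a part-r-NNNNN file under /ld/out/wordcount on HDFS.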

