package wordcount;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    // With the default TextInputFormat, the input key is the line's byte
    // offset, so Object here could also be declared as LongWritable.
    public static class WMap extends Mapper<Object, Text, Text, IntWritable> {

        // map() is called once per input record, so reusable objects are
        // better declared here than allocated inside the method.
        private Text word = new Text();
        private IntWritable one = new IntWritable(1);

        // The framework reads each split as (offset, line) pairs and calls
        // map() once per pair.
        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split the line into words.
            StringTokenizer st = new StringTokenizer(value.toString());
            while (st.hasMoreTokens()) {
                word.set(st.nextToken());
                // Emit (word, 1), e.g. ("aaa", 1); after the shuffle these
                // pairs are grouped by word and handed to the reducer.
                context.write(word, one);
            }
        }
    }

    public static class WReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable sum = new IntWritable();

        // After the shuffle the reducer receives groups of the form
        // (key, [v1, v2, v3, ...]) and processes each group here.
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int num = 0;
            // Add up the 1s attached to this key to get the word's count.
            for (IntWritable tmp : values) {
                num += tmp.get();
            }
            sum.set(num);
            // Write (word, count); the output goes to HDFS.
            context.write(key, sum);
        }
    }

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Job.getInstance replaces the deprecated new Job(conf, name) constructor.
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WMap.class);
        job.setReducerClass(WReduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Specify the input path.
        FileInputFormat.addInputPath(job, new Path("hdfs://node01:9000/ld/in/"));
        // Specify the output path (it must not already exist).
        FileOutputFormat.setOutputPath(job, new Path("hdfs://node01:9000/ld/out/wordcount"));
        // Block until the job finishes.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
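One optional improvement, not in the original listing: because summing is associative and commutative, and WReduce's input and output types both are (Text, IntWritable), the reducer can also be registered as a map-side combiner. That pre-aggregates counts on each mapper before the shuffle and cuts network traffic. A minimal sketch, added in main() before waitForCompletion:

    // Optional: reuse the reducer as a combiner to shrink shuffle traffic.
    job.setCombinerClass(WReduce.class);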
The data in the in/ directory looks like this:
Hello MapReduce Hello Hadoop
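To trace one such line through the job: the mapper emits ("Hello", 1), ("MapReduce", 1), ("Hello", 1), ("Hadoop", 1); after the shuffle the reducer receives ("Hadoop", [1]), ("Hello", [1, 1]), ("MapReduce", [1]) and sums each list.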
The result is as follows:
Hadoop	6
Hello	12
MapReduce	6
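These counts are six times the per-line figures, which suggests the files under in/ together contain six copies of that sample line. Assuming default settings, the single reducer writes its output as a part file in the configured output directory, which can be inspected with:

    hdfs dfs -cat hdfs://node01:9000/ld/out/wordcount/part-r-00000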