Analyze the wordcount source code to study how a MapReduce job runs and how data flows through it.
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class wordcount {

    // Mapper: splits each input line into words and emits <word, 1>.
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    // Reducer (also used as the Combiner): sums the counts for each word.
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "word count");
        job.setJarByClass(wordcount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
Compile, package, and run:

javac -classpath hadoop-1.2.1/hadoop-core-1.2.1.jar:hadoop-1.2.1/lib/commons-cli-1.2.jar -d temp wordcount.java
jar -cvf word.jar -C temp .
hadoop jar word.jar wordcount input output
The input directory contains two files, file1 and file2.

Contents of file1:
hello world
bye world

Contents of file2:
hello hadoop
bye hadoop
The Object class is the root of the class hierarchy; every class in Java ultimately inherits from it. Object is the only class in Java that has no superclass.
StringTokenizer is a utility class for splitting a String into tokens.
public StringTokenizer(String str): constructs a tokenizer that uses the default delimiter set " \t\n\r\f" (space, tab, newline, carriage return, form feed).
boolean hasMoreTokens(): returns whether more tokens remain.
String nextToken(): returns the string from the current position up to the next delimiter, i.e. the next token.
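A quick illustration (a standalone sketch, not part of the wordcount code itself) of how these calls split one of the sample lines, exactly as TokenizerMapper does:

import java.util.StringTokenizer;

public class TokenizerDemo {
    public static void main(String[] args) {
        // Default delimiters, same call pattern as in TokenizerMapper.
        StringTokenizer itr = new StringTokenizer("hello world bye world");
        while (itr.hasMoreTokens()) {
            System.out.println(itr.nextToken());   // prints hello, world, bye, world
        }
    }
}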
A MapReduce job consists of two parts, a Map phase and a Reduce phase, and both phases sort the data; in that sense the MapReduce framework is essentially a Distributed Sort.
In the Map phase, each Map Task writes to local disk an output file sorted by key (quicksort is used); several intermediate files may be produced along the way, but they are eventually merged into one. In the Reduce phase, each Reduce Task sorts the data it receives, so the data is divided into groups by key, and the groups are then handed to reduce() one at a time.
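The spill-and-merge behavior on the map side can be tuned through the job Configuration. A brief sketch using the Hadoop 1.x property names, placed before the Job is constructed in main(); the values shown are just the defaults, for illustration:

import org.apache.hadoop.conf.Configuration;

Configuration conf = new Configuration();
// Size (in MB) of the in-memory buffer each map task uses to sort its output.
conf.setInt("io.sort.mb", 100);
// Fraction of that buffer that may fill before a background spill to disk begins.
conf.setFloat("io.sort.spill.percent", 0.80f);
// How many spill files are merged at a time when building the final map output file.
conf.setInt("io.sort.factor", 10);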
A common misconception is that the Map phase does not sort unless a Combiner is used. That is wrong: whether or not you use a Combiner, every Map Task sorts the data it produces (unless there are no Reduce Tasks, in which case no sorting happens; the map-side sort exists precisely to lighten the sorting load on the Reduce side).
The sorting in the Map and Reduce phases is performed automatically by MapReduce and is not under the user's control; it cannot be turned off in Hadoop 1.x, though Hadoop 2.x makes it possible to turn it off.
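One way to see the "no Reduce Task, no sort" case is a map-only job. The sketch below is a hypothetical variant that reuses TokenizerMapper from the wordcount class above and simply sets the number of reduce tasks to zero, so the map output is written directly without the sort/shuffle step:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class maponlycount {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "map only word tokens");
        job.setJarByClass(maponlycount.class);
        job.setMapperClass(wordcount.TokenizerMapper.class);
        // Zero reduce tasks: each map task writes its output directly,
        // unsorted, and the shuffle/sort stage is skipped entirely.
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}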
When a Job runs, data flows as follows:
1. The input data is divided into N splits, and N corresponding Map tasks are started to process them.
2. In each Map task, the map function is applied to every <key,value> pair in turn; the Map Task then sorts the resulting data.
3. The combine function merges the sorted <key,value> pairs into new <key,value> pairs.
4. In the Reduce phase, each Reduce Task sorts the data it receives, so the data falls into groups by key, and each group is handed to the reduce function for processing.
The only parts we can modify are the three functions map, combine, and reduce. Below I modify the wordcount program slightly so that it writes out its intermediate data, which lets us verify the flow described above.
An InputSplit is what hands input data to each individual Map. The InputSplit does not store the data itself; it stores a split length and an array recording where the data is located.
Hadoop predefines several ways of converting different kinds of input data into the <key,value> pairs that Map can process.
TextInputFormat is Hadoop's default input format: each file is treated separately as input to Map, every line of data then produces one record, and each record is represented as a <key,value> pair, where the key is the byte offset of the line within the file and the value is the line itself.
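Because TextInputFormat is already the default, the original code never sets it explicitly and declares its input key loosely as Object. A hypothetical variant that spells out the actual types TextInputFormat produces (a LongWritable offset key and a Text line value) would look like this:

import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Same logic as TokenizerMapper, but with the input key typed as what
// TextInputFormat really produces: the LongWritable byte offset of the line.
public class OffsetAwareMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // key   = byte offset of this line within the file (0, 12, 13 in this example)
        // value = the text of the line
        StringTokenizer itr = new StringTokenizer(value.toString());
        while (itr.hasMoreTokens()) {
            word.set(itr.nextToken());
            context.write(word, one);
        }
    }
}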
Make map, combine, and reduce each write out the <key,value> pairs they process.
The intermediate data is written to the file "/home/tom/log.txt", which must be created in advance.
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class wordcount {

    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            // Log the <key, value> pair handed to map: key is the byte offset, value is the line.
            writelog(key.toString());
            writelog("\t");
            writelog(value.toString());
            writelog("\n");
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                // Log each token emitted by map.
                writelog(word.toString());
                writelog(" ");
                context.write(word, one);
            }
            writelog("\n");
        }
    }

    // Used as both the Combiner and the Reducer, so its log lines appear
    // once for the combine pass and once more for the reduce pass.
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        public void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            // Log the <key, sum> pair produced for this key.
            writelog(key.toString());
            writelog(" ");
            writelog(result.toString());
            writelog("\n");
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "word count");
        job.setJarByClass(wordcount.class);
        job.setMapperClass(TokenizerMapper.class);
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    // Appends a string to the local log file (this assumes the tasks run where
    // that path exists, e.g. a single-node setup).
    public static void writelog(String content) {
        String file = "/home/tom/log.txt";
        BufferedWriter out = null;
        try {
            out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file, true)));
            out.write(content);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            try {
                if (out != null) {
                    out.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
Tracing the two input files through these stages gives the following expected data flow:

Input to map (key = byte offset in the file, value = the line), one split per file:
    split for file1: <0, hello world>  <12, bye world>
    split for file2: <0, hello hadoop>  <13, bye hadoop>

Map output, in emission order:
    file1: <hello,1> <world,1> <bye,1> <world,1>
    file2: <hello,1> <hadoop,1> <bye,1> <hadoop,1>

After the map-side sort:
    file1: <bye,1> <hello,1> <world,1> <world,1>
    file2: <bye,1> <hadoop,1> <hadoop,1> <hello,1>

After combine:
    file1: <bye,1> <hello,1> <world,2>
    file2: <bye,1> <hadoop,2> <hello,1>

Reduce input (after the shuffle, values grouped by key):
    <bye, [1,1]>  <hadoop, [2]>  <hello, [1,1]>  <world, [2]>

Reduce output:
    <bye,2> <hadoop,2> <hello,2> <world,2>
After the job finishes, /home/tom/log.txt contains (annotated here by stage):

map input and token output for file1's split:
0 hello world
hello world
12 bye world
bye world

combine output for file1's split:
bye 1
hello 1
world 2

map input and token output for file2's split:
0 hello hadoop
hello hadoop
13 bye hadoop
bye hadoop

combine output for file2's split:
bye 1
hadoop 2
hello 1

reduce output:
bye 2
hadoop 2
hello 2
world 2
This matches the analysis above exactly.