Hadoop version: hadoop-1.2.1
Eclipse version: eclipse-standard-kepler-SR2-win32-x86_64
WordCount.java is taken from hadoop-1.2.1\src\examples\org\apache\hadoop\examples\WordCount.java
/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.examples;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  public static class IntSumReducer
       extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: wordcount <in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
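To make the data flow concrete (a worked example, not part of the original post): for an input file whose only line is "hello world hello", TokenizerMapper emits (hello, 1), (world, 1), (hello, 1); IntSumReducer — which also serves as the combiner — sums the values per key, so the final output is (hello, 2) and (world, 1).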
In Eclipse, create a new Java project named WordCount.
In the project, create a new class named WordCount.
Then paste the code above over the generated WordCount.java in Eclipse,
and change the package declaration at the top to wordcount. The modified source is as follows:
package wordcount;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordCount {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  public static class IntSumReducer
       extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
      System.err.println("Usage: wordcount <in> <out>");
      System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
As the import list shows, the source pulls in a number of Hadoop classes that are not part of the JDK, so the jars providing them must be added to the project in Eclipse; otherwise the compiler cannot resolve these classes — it has to be told explicitly where they live.
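For intuition, the same requirement is visible outside the IDE. A minimal command-line sketch — the paths and the classes output directory are illustrative assumptions, not steps from the original post — would hand the compiler the Hadoop core jar explicitly:

# hadoop-core-1.2.1.jar (in the root of the hadoop-1.2.1 distribution)
# contains the org.apache.hadoop.* classes the source imports
javac -classpath hadoop-1.2.1/hadoop-core-1.2.1.jar -d classes WordCount.java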
Compiling and running at this point produces the following error:
Exception in thread "main" java.lang.Error: Unresolved compilation problems:
	The import org.apache.commons cannot be resolved
	The import org.apache.commons cannot be resolved
	The import org.codehaus cannot be resolved
	The import org.codehaus cannot be resolved
	Log cannot be resolved to a type
	LogFactory cannot be resolved
	Log cannot be resolved to a type
	... ("Log cannot be resolved to a type" repeats for every use of the type) ...
	JsonFactory cannot be resolved to a type
	JsonFactory cannot be resolved to a type
	JsonGenerator cannot be resolved to a type

	at org.apache.hadoop.conf.Configuration.<init>(Configuration.java:60)
	at wordcount.WordCount.main(WordCount.java:52)
The cause is missing dependency jars; adding the missing jars to the build path fixes it. Concretely, Log and LogFactory come from commons-logging, and JsonFactory/JsonGenerator from the Jackson (org.codehaus.jackson) jars — all of which ship with Hadoop under hadoop-1.2.1\lib.
Use Add External JARs (project Properties -> Java Build Path -> Libraries) to add all the jar files under hadoop-1.2.1\lib.
Compile and run again — this time it succeeds.
Finally, package the project into a jar file via File -> Export (choose Java -> JAR file).
Note that the jar does not have to be named after the class: WordCount.jar could just as well be CountWord.jar — it makes no difference. Then click Finish.
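For those who prefer the command line to the Export wizard, an equivalent sketch (assuming the compiled classes landed under classes/, as in the javac sketch above):

# pack everything under classes/ (i.e. wordcount/WordCount*.class) into the jar
jar cvf WordCount.jar -C classes .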
After that the job can be run on Hadoop. For a detailed walkthrough of running WordCount, see Hadoop集群(第6期)_WordCount运行详解.
hadoop jar WordCount.jar wordcount.WordCount input output
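Once the job completes, the counts can be inspected from HDFS (output here is the directory name passed above; part-r-00000 is the default name of the single reducer's output file):

hadoop fs -ls output
hadoop fs -cat output/part-r-00000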
Note that the code above no longer carries the original declaration

package org.apache.hadoop.examples;

but package wordcount instead. As soon as a package is used, the jar gains an internal directory hierarchy, so a bare class name as in hadoop jar WordCount.jar WordCount input output no longer works: the fully qualified name of the main class must be spelled out — hence wordcount.WordCount in the command above. Had we kept the original package, the command would have been:
hadoop jar WordCount.jar org.apache.hadoop.examples.WordCount input output
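Alternatively — an option the original post does not use — the Eclipse export wizard can record a Main-Class entry in the jar's manifest, and hadoop jar honors it, so the class name can then be omitted:

hadoop jar WordCount.jar input output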
More detail on this packaging step can be found in [hadoop]命令行编译并运行hadoop例子WordCount.
Reposted from: 林羽飞扬