首先保证JDK、Hadoop安装设置成功
可以参考《[linux]ubuntu下安装hadoop》和《[linux]ubuntu12.04 下安装jdk1.7》
使用hadoop版本为1.2.1,jdk为1.7
在hadoop-1.2.1/src/examples/org/apache/hadoop/examples目录下找到WordCount.java
源码如下:
1 /** 2 * Licensed under the Apache License, Version 2.0 (the "License"); 3 * you may not use this file except in compliance with the License. 4 * You may obtain a copy of the License at 5 * 6 * http://www.apache.org/licenses/LICENSE-2.0 7 * 8 * Unless required by applicable law or agreed to in writing, software 9 * distributed under the License is distributed on an "AS IS" BASIS, 10 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 * See the License for the specific language governing permissions and 12 * limitations under the License. 13 */ 14 15 16 package org.apache.hadoop.examples; 17 18 import java.io.IOException; 19 import java.util.StringTokenizer; 20 21 import org.apache.hadoop.conf.Configuration; 22 import org.apache.hadoop.fs.Path; 23 import org.apache.hadoop.io.IntWritable; 24 import org.apache.hadoop.io.Text; 25 import org.apache.hadoop.mapreduce.Job; 26 import org.apache.hadoop.mapreduce.Mapper; 27 import org.apache.hadoop.mapreduce.Reducer; 28 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 29 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 30 import org.apache.hadoop.util.GenericOptionsParser; 31 32 public class WordCount { 33 34 public static class TokenizerMapper 35 extends Mapper<Object, Text, Text, IntWritable>{ 36 37 private final static IntWritable one = new IntWritable(1); 38 private Text word = new Text(); 39 40 public void map(Object key, Text value, Context context 41 ) throws IOException, InterruptedException { 42 StringTokenizer itr = new StringTokenizer(value.toString()); 43 while (itr.hasMoreTokens()) { 44 word.set(itr.nextToken()); 45 context.write(word, one); 46 } 47 } 48 } 49 50 public static class IntSumReducer 51 extends Reducer<Text,IntWritable,Text,IntWritable> { 52 private IntWritable result = new IntWritable(); 53 54 public void reduce(Text key, Iterable<IntWritable> values, 55 Context context 56 ) throws IOException, InterruptedException { 57 int sum = 0; 58 for 
(IntWritable val : values) { 59 sum += val.get(); 60 } 61 result.set(sum); 62 context.write(key, result); 63 } 64 } 65 66 public static void main(String[] args) throws Exception { 67 Configuration conf = new Configuration(); 68 String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 69 if (otherArgs.length != 2) { 70 System.err.println("Usage: wordcount <in> <out>"); 71 System.exit(2); 72 } 73 Job job = new Job(conf, "word count"); 74 job.setJarByClass(WordCount.class); 75 job.setMapperClass(TokenizerMapper.class); 76 job.setCombinerClass(IntSumReducer.class); 77 job.setReducerClass(IntSumReducer.class); 78 job.setOutputKeyClass(Text.class); 79 job.setOutputValueClass(IntWritable.class); 80 FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 81 FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 82 System.exit(job.waitForCompletion(true) ? 0 : 1); 83 } 84 }
在hadoop主目录下新建classes用于存放编译后的.class文件
mkdir hadoop-1.2.1/classes
把WordCount.java放入classes文件夹中
我们直接在classes编译一下WordCount.java
javac WordCount.java -d .
出现如下错误
1 WordCount.java:21: error: package org.apache.hadoop.conf does not exist 2 import org.apache.hadoop.conf.Configuration; 3 ^ 4 WordCount.java:22: error: package org.apache.hadoop.fs does not exist 5 import org.apache.hadoop.fs.Path; 6 ^ 7 WordCount.java:23: error: package org.apache.hadoop.io does not exist 8 import org.apache.hadoop.io.IntWritable; 9 ^ 10 WordCount.java:24: error: package org.apache.hadoop.io does not exist 11 import org.apache.hadoop.io.Text; 12 ^ 13 WordCount.java:25: error: package org.apache.hadoop.mapreduce does not exist 14 import org.apache.hadoop.mapreduce.Job; 15 ^ 16 WordCount.java:26: error: package org.apache.hadoop.mapreduce does not exist 17 import org.apache.hadoop.mapreduce.Mapper; 18 ^ 19 WordCount.java:27: error: package org.apache.hadoop.mapreduce does not exist 20 import org.apache.hadoop.mapreduce.Reducer; 21 ^ 22 WordCount.java:28: error: package org.apache.hadoop.mapreduce.lib.input does not exist 23 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 24 ^ 25 WordCount.java:29: error: package org.apache.hadoop.mapreduce.lib.output does not exist 26 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 27 ^ 28 WordCount.java:30: error: package org.apache.hadoop.util does not exist 29 import org.apache.hadoop.util.GenericOptionsParser; 30 ^ 31 WordCount.java:35: error: cannot find symbol 32 extends Mapper<Object, Text, Text, IntWritable>{ 33 ^ 34 symbol: class Mapper 35 location: class WordCount 36 WordCount.java:35: error: cannot find symbol 37 extends Mapper<Object, Text, Text, IntWritable>{ 38 ^ 39 symbol: class Text 40 location: class WordCount 41 WordCount.java:35: error: cannot find symbol 42 extends Mapper<Object, Text, Text, IntWritable>{ 43 ^ 44 symbol: class Text 45 location: class WordCount 46 WordCount.java:35: error: cannot find symbol 47 extends Mapper<Object, Text, Text, IntWritable>{ 48 ^ 49 symbol: class IntWritable 50 location: class WordCount 51 WordCount.java:37: error: cannot find 
symbol 52 private final static IntWritable one = new IntWritable(1); 53 ^ 54 symbol: class IntWritable 55 location: class TokenizerMapper 56 WordCount.java:38: error: cannot find symbol 57 private Text word = new Text(); 58 ^ 59 symbol: class Text 60 location: class TokenizerMapper 61 WordCount.java:40: error: cannot find symbol 62 public void map(Object key, Text value, Context context 63 ^ 64 symbol: class Text 65 location: class TokenizerMapper 66 WordCount.java:40: error: cannot find symbol 67 public void map(Object key, Text value, Context context 68 ^ 69 symbol: class Context 70 location: class TokenizerMapper 71 WordCount.java:51: error: cannot find symbol 72 extends Reducer<Text,IntWritable,Text,IntWritable> { 73 ^ 74 symbol: class Reducer 75 location: class WordCount 76 WordCount.java:51: error: cannot find symbol 77 extends Reducer<Text,IntWritable,Text,IntWritable> { 78 ^ 79 symbol: class Text 80 location: class WordCount 81 WordCount.java:51: error: cannot find symbol 82 extends Reducer<Text,IntWritable,Text,IntWritable> { 83 ^ 84 symbol: class IntWritable 85 location: class WordCount 86 WordCount.java:51: error: cannot find symbol 87 extends Reducer<Text,IntWritable,Text,IntWritable> { 88 ^ 89 symbol: class Text 90 location: class WordCount 91 WordCount.java:51: error: cannot find symbol 92 extends Reducer<Text,IntWritable,Text,IntWritable> { 93 ^ 94 symbol: class IntWritable 95 location: class WordCount 96 WordCount.java:52: error: cannot find symbol 97 private IntWritable result = new IntWritable(); 98 ^ 99 symbol: class IntWritable 100 location: class IntSumReducer 101 WordCount.java:54: error: cannot find symbol 102 public void reduce(Text key, Iterable<IntWritable> values, 103 ^ 104 symbol: class Text 105 location: class IntSumReducer 106 WordCount.java:54: error: cannot find symbol 107 public void reduce(Text key, Iterable<IntWritable> values, 108 ^ 109 symbol: class IntWritable 110 location: class IntSumReducer 111 WordCount.java:55: error: 
cannot find symbol 112 Context context 113 ^ 114 symbol: class Context 115 location: class IntSumReducer 116 WordCount.java:37: error: cannot find symbol 117 private final static IntWritable one = new IntWritable(1); 118 ^ 119 symbol: class IntWritable 120 location: class TokenizerMapper 121 WordCount.java:38: error: cannot find symbol 122 private Text word = new Text(); 123 ^ 124 symbol: class Text 125 location: class TokenizerMapper 126 WordCount.java:52: error: cannot find symbol 127 private IntWritable result = new IntWritable(); 128 ^ 129 symbol: class IntWritable 130 location: class IntSumReducer 131 WordCount.java:58: error: cannot find symbol 132 for (IntWritable val : values) { 133 ^ 134 symbol: class IntWritable 135 location: class IntSumReducer 136 WordCount.java:67: error: cannot find symbol 137 Configuration conf = new Configuration(); 138 ^ 139 symbol: class Configuration 140 location: class WordCount 141 WordCount.java:67: error: cannot find symbol 142 Configuration conf = new Configuration(); 143 ^ 144 symbol: class Configuration 145 location: class WordCount 146 WordCount.java:68: error: cannot find symbol 147 String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); 148 ^ 149 symbol: class GenericOptionsParser 150 location: class WordCount 151 WordCount.java:73: error: cannot find symbol 152 Job job = new Job(conf, "word count"); 153 ^ 154 symbol: class Job 155 location: class WordCount 156 WordCount.java:73: error: cannot find symbol 157 Job job = new Job(conf, "word count"); 158 ^ 159 symbol: class Job 160 location: class WordCount 161 WordCount.java:78: error: cannot find symbol 162 job.setOutputKeyClass(Text.class); 163 ^ 164 symbol: class Text 165 location: class WordCount 166 WordCount.java:79: error: cannot find symbol 167 job.setOutputValueClass(IntWritable.class); 168 ^ 169 symbol: class IntWritable 170 location: class WordCount 171 WordCount.java:80: error: cannot find symbol 172 FileInputFormat.addInputPath(job, new 
Path(otherArgs[0])); 173 ^ 174 symbol: class Path 175 location: class WordCount 176 WordCount.java:80: error: cannot find symbol 177 FileInputFormat.addInputPath(job, new Path(otherArgs[0])); 178 ^ 179 symbol: variable FileInputFormat 180 location: class WordCount 181 WordCount.java:81: error: cannot find symbol 182 FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 183 ^ 184 symbol: class Path 185 location: class WordCount 186 WordCount.java:81: error: cannot find symbol 187 FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); 188 ^ 189 symbol: variable FileOutputFormat 190 location: class WordCount 191 42 errors
原因是缺少依赖包
因为源码import了好几个hadoop自定义的类,它们不是JDK自带的类,所以必须明确告诉编译器这些类所在的位置(使用eclipse时是把依赖包导入工程,命令行下用javac编译时则是设置classpath),不然编译器找不到这些类。
而hadoop的依赖包就是hadoop-1.2.1目录下的几个jar文件,以及hadoop-1.2.1/lib下的jar文件。
有时候不知道源代码使用了哪一个依赖包,所以干脆把全部依赖包都告诉编译器。我使用的方法是在~/.bashrc中设置一个hadoop_CLASSPATH变量(最好不要使用HADOOP_CLASSPATH这个变量名,因为hadoop-1.2.1/conf/hadoop-env.sh中已经用到了它)
hadoop_CLASSPATH如下产生。
hadoop_HOME=/home/hadoop/hadoop-1.2.1 #HADOOP_HOME不能占用,因为hadoop-env.sh中有使用
for f in $hadoop_HOME/hadoop-*.jar; do
    hadoop_CLASSPATH=${hadoop_CLASSPATH}:$f
done
for f in $hadoop_HOME/lib/*.jar; do
    hadoop_CLASSPATH=${hadoop_CLASSPATH}:$f
done
看一下javac命令的用法
1 Usage: javac <options> <source files> 2 where possible options include: 3 -g Generate all debugging info 4 -g:none Generate no debugging info 5 -g:{lines,vars,source} Generate only some debugging info 6 -nowarn Generate no warnings 7 -verbose Output messages about what the compiler is doing 8 -deprecation Output source locations where deprecated APIs are used 9 -classpath <path> Specify where to find user class files and annotation processors 10 -cp <path> Specify where to find user class files and annotation processors 11 -sourcepath <path> Specify where to find input source files 12 -bootclasspath <path> Override location of bootstrap class files 13 -extdirs <dirs> Override location of installed extensions 14 -endorseddirs <dirs> Override location of endorsed standards path 15 -proc:{none,only} Control whether annotation processing and/or compilation is done. 16 -processor <class1>[,<class2>,<class3>...] Names of the annotation processors to run; bypasses default discovery process 17 -processorpath <path> Specify where to find annotation processors 18 -d <directory> Specify where to place generated class files 19 -s <directory> Specify where to place generated source files 20 -implicit:{none,class} Specify whether or not to generate class files for implicitly referenced files 21 -encoding <encoding> Specify character encoding used by source files 22 -source <release> Provide source compatibility with specified release 23 -target <release> Generate class files for specific VM version 24 -version Version information 25 -help Print a synopsis of standard options 26 -Akey[=value] Options to pass to annotation processors 27 -X Print a synopsis of nonstandard options 28 -J<flag> Pass <flag> directly to the runtime system 29 -Werror Terminate compilation if warnings occur 30 @<filename> Read options and filenames from file
参数-classpath与-cp作用相同,都用于指定依赖包(类文件)所在的路径
1 -classpath <path> Specify where to find user class files and annotation processors 2 -cp <path> Specify where to find user class files and annotation processors
可以如下编译
javac -cp $hadoop_CLASSPATH WordCount.java -d .
编译成功,classes文件夹出现了一个org的文件夹,点击进去可以发现文件夹的层次是org/apache/hadoop/examples 然后在examples文件夹中看到三个.class
hadoop@Mint ~/hadoop-1.2.1/classes/org/apache/hadoop/examples $ pwd /home/hadoop/hadoop-1.2.1/classes/org/apache/hadoop/examples hadoop@Mint ~/hadoop-1.2.1/classes/org/apache/hadoop/examples $ ls WordCount.class WordCount$IntSumReducer.class WordCount$TokenizerMapper.class
层次结构出现的原因是源代码开始是有一个package org.apache.hadoop.examples;
如果没有package声明,三个.class文件会直接出现在classes中。
然后返回classes目录,打包为jar文件
jar -cvf WordCount.jar org
然后当前文件夹就会出现WordCount.jar文件,可以使用jar -tvf WordCount.jar看一下这个包的层次结构
jar -tvf WordCount.jar 0 Fri Aug 15 19:58:32 CST 2014 META-INF/ 68 Fri Aug 15 19:58:32 CST 2014 META-INF/MANIFEST.MF 0 Fri Aug 15 19:53:28 CST 2014 org/ 0 Fri Aug 15 19:53:28 CST 2014 org/apache/ 0 Fri Aug 15 19:53:28 CST 2014 org/apache/hadoop/ 0 Fri Aug 15 19:53:28 CST 2014 org/apache/hadoop/examples/ 1911 Fri Aug 15 19:53:28 CST 2014 org/apache/hadoop/examples/WordCount.class 1790 Fri Aug 15 19:53:28 CST 2014 org/apache/hadoop/examples/WordCount$TokenizerMapper.class 1793 Fri Aug 15 19:53:28 CST 2014 org/apache/hadoop/examples/WordCount$IntSumReducer.class
WordCount.jar打包成功,运行WordCount详解可以参考Hadoop集群(第6期)_WordCount运行详解
hadoop jar WordCount.jar org.apache.hadoop.examples.WordCount input output
org.apache.hadoop.examples.WordCount是程序主类的全限定名(包名加类名),对应WordCount.class,这里不需要加.class后缀。如果源码没有package声明(即没有层次结构),就直接是
hadoop jar WordCount.jar WordCount input output
jar包的名字不一定要与主类相同,可以是CountWord.jar或者其他名字,但是上述命令的格式是:hadoop jar 包名 程序主类名字 输入文件夹 输出文件夹
其中程序主类名字不能换成其他名字,必须是主类的名字(有package时要写全限定名),不然不能运行。
本文基于知识共享署名-非商业性使用 3.0 许可协议进行许可。欢迎转载、演绎,但是必须保留本文的署名林羽飞扬,若需咨询,请给我发信