持之以恒,但求对MapReduce有所觉悟
理论学习:
http://hadooptutorial.wikispaces.com
http://developer.yahoo.com/hadoop/tutorial/module4.html
实践学习:
执行倒排索引程序:
本段代码取自 Yahoo! Hadoop Tutorial 的 Module 4(MapReduce)末尾给出的示例代码
1、从Eclipse中导出Jar包LineIndexer.jar
2、将所有处理的文件上传到HDFS
root@ubuntu:/# hadoop dfs -put *.txt /user/root/input
root@ubuntu:/# hadoop dfs -ls /user/root/input
Found 3 items
-rw-r--r-- 1 root supergroup 569218 2012-01-15 19:46 /user/root/input/All's Well That Ends Well.txt
-rw-r--r-- 1 root supergroup 569218 2012-01-15 19:46 /user/root/input/As You Like It.txt
-rw-r--r-- 1 root supergroup 569218 2012-01-15 19:46 /user/root/input/The Comedy of Errors.txt
3、执行jar包
root@ubuntu:/usr/hadoop-0.20.2/chenwq# hadoop jar LineIndexer.jar /user/root/input /user/root/output
4、查看Hadoop状态
http://localhost:50030/ - Hadoop 管理界面
http://localhost:50060/ - Hadoop Task Tracker 状态
http://localhost:50070/ - Hadoop DFS 状态
5、输出结果
12/01/16 04:53:14 WARN mapred.JobClient: Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same. 12/01/16 04:53:14 INFO mapred.FileInputFormat: Total input paths to process : 14 12/01/16 04:53:15 INFO mapred.JobClient: Running job: job_201201150129_0001 12/01/16 04:53:16 INFO mapred.JobClient: map 0% reduce 0% 12/01/16 04:53:38 INFO mapred.JobClient: map 1% reduce 0% 12/01/16 04:53:44 INFO mapred.JobClient: map 2% reduce 0% 12/01/16 04:53:50 INFO mapred.JobClient: map 3% reduce 0% 12/01/16 04:53:56 INFO mapred.JobClient: map 4% reduce 0% 12/01/16 04:54:08 INFO mapred.JobClient: map 5% reduce 0% 12/01/16 04:54:11 INFO mapred.JobClient: map 6% reduce 0% 12/01/16 04:54:58 INFO mapred.JobClient: map 7% reduce 0% 12/01/16 04:55:06 INFO mapred.JobClient: map 7% reduce 1% 12/01/16 04:55:12 INFO mapred.JobClient: map 7% reduce 2% 12/01/16 04:55:15 INFO mapred.JobClient: map 8% reduce 2% 12/01/16 04:55:21 INFO mapred.JobClient: map 9% reduce 2% 12/01/16 04:55:27 INFO mapred.JobClient: map 10% reduce 2% 12/01/16 04:55:33 INFO mapred.JobClient: map 11% reduce 2% 12/01/16 04:55:42 INFO mapred.JobClient: map 12% reduce 2% 12/01/16 04:55:48 INFO mapred.JobClient: map 13% reduce 2% 12/01/16 04:56:23 INFO mapred.JobClient: map 13% reduce 3% 12/01/16 04:56:26 INFO mapred.JobClient: map 13% reduce 4% 12/01/16 04:56:38 INFO mapred.JobClient: map 14% reduce 4% 12/01/16 04:56:41 INFO mapred.JobClient: map 15% reduce 4% 12/01/16 04:56:47 INFO mapred.JobClient: map 16% reduce 4% 12/01/16 04:56:53 INFO mapred.JobClient: map 17% reduce 4% 12/01/16 04:56:59 INFO mapred.JobClient: map 18% reduce 4% 12/01/16 04:57:05 INFO mapred.JobClient: map 19% reduce 4% 12/01/16 04:57:11 INFO mapred.JobClient: map 20% reduce 4% 12/01/16 04:57:48 INFO mapred.JobClient: map 20% reduce 5% 12/01/16 04:57:51 INFO mapred.JobClient: map 20% reduce 6% 12/01/16 04:57:54 INFO mapred.JobClient: map 21% reduce 6% 12/01/16 04:58:00 INFO mapred.JobClient: map 22% reduce 6% 
12/01/16 04:58:06 INFO mapred.JobClient: map 23% reduce 6% 12/01/16 04:58:12 INFO mapred.JobClient: map 24% reduce 6% 12/01/16 04:58:18 INFO mapred.JobClient: map 25% reduce 6% 12/01/16 04:58:24 INFO mapred.JobClient: map 26% reduce 6% 12/01/16 04:59:05 INFO mapred.JobClient: map 26% reduce 7% 12/01/16 04:59:12 INFO mapred.JobClient: map 26% reduce 8% 12/01/16 04:59:15 INFO mapred.JobClient: map 27% reduce 8% 12/01/16 04:59:21 INFO mapred.JobClient: map 28% reduce 8% 12/01/16 04:59:27 INFO mapred.JobClient: map 29% reduce 8% 12/01/16 04:59:33 INFO mapred.JobClient: map 30% reduce 8% 12/01/16 04:59:36 INFO mapred.JobClient: map 31% reduce 8% 12/01/16 04:59:42 INFO mapred.JobClient: map 32% reduce 8% 12/01/16 04:59:48 INFO mapred.JobClient: map 33% reduce 8% 12/01/16 05:00:30 INFO mapred.JobClient: map 33% reduce 10% 12/01/16 05:00:33 INFO mapred.JobClient: map 34% reduce 10% 12/01/16 05:00:36 INFO mapred.JobClient: map 34% reduce 11% 12/01/16 05:00:42 INFO mapred.JobClient: map 35% reduce 11% 12/01/16 05:00:48 INFO mapred.JobClient: map 36% reduce 11% 12/01/16 05:00:54 INFO mapred.JobClient: map 37% reduce 11% 12/01/16 05:01:00 INFO mapred.JobClient: map 38% reduce 11% 12/01/16 05:01:06 INFO mapred.JobClient: map 39% reduce 11% 12/01/16 05:01:12 INFO mapred.JobClient: map 40% reduce 11% 12/01/16 05:01:55 INFO mapred.JobClient: map 40% reduce 12% 12/01/16 05:01:58 INFO mapred.JobClient: map 41% reduce 13% 12/01/16 05:02:04 INFO mapred.JobClient: map 42% reduce 13% 12/01/16 05:02:10 INFO mapred.JobClient: map 43% reduce 13% 12/01/16 05:02:16 INFO mapred.JobClient: map 44% reduce 13% 12/01/16 05:02:22 INFO mapred.JobClient: map 45% reduce 13% 12/01/16 05:02:28 INFO mapred.JobClient: map 46% reduce 13% 12/01/16 05:03:15 INFO mapred.JobClient: map 46% reduce 14% 12/01/16 05:03:19 INFO mapred.JobClient: map 46% reduce 15% 12/01/16 05:03:22 INFO mapred.JobClient: map 47% reduce 15% 12/01/16 05:03:30 INFO mapred.JobClient: map 48% reduce 15% 12/01/16 05:03:36 INFO 
mapred.JobClient: map 49% reduce 15% 12/01/16 05:03:42 INFO mapred.JobClient: map 50% reduce 15% 12/01/16 05:03:48 INFO mapred.JobClient: map 51% reduce 15% 12/01/16 05:03:54 INFO mapred.JobClient: map 52% reduce 15% 12/01/16 05:04:00 INFO mapred.JobClient: map 53% reduce 15% 12/01/16 05:04:37 INFO mapred.JobClient: map 56% reduce 15% 12/01/16 05:04:40 INFO mapred.JobClient: map 56% reduce 16% 12/01/16 05:04:43 INFO mapred.JobClient: map 57% reduce 17% 12/01/16 05:04:46 INFO mapred.JobClient: map 60% reduce 17% 12/01/16 05:04:49 INFO mapred.JobClient: map 61% reduce 17% 12/01/16 05:04:52 INFO mapred.JobClient: map 64% reduce 20% 12/01/16 05:04:55 INFO mapred.JobClient: map 65% reduce 20% 12/01/16 05:04:58 INFO mapred.JobClient: map 68% reduce 20% 12/01/16 05:05:01 INFO mapred.JobClient: map 69% reduce 21% 12/01/16 05:05:04 INFO mapred.JobClient: map 73% reduce 21% 12/01/16 05:05:07 INFO mapred.JobClient: map 73% reduce 22% 12/01/16 05:05:10 INFO mapred.JobClient: map 76% reduce 22% 12/01/16 05:05:16 INFO mapred.JobClient: map 80% reduce 23% 12/01/16 05:05:22 INFO mapred.JobClient: map 83% reduce 25% 12/01/16 05:05:25 INFO mapred.JobClient: map 90% reduce 25% 12/01/16 05:05:31 INFO mapred.JobClient: map 96% reduce 27% 12/01/16 05:05:34 INFO mapred.JobClient: map 100% reduce 30% 12/01/16 05:05:44 INFO mapred.JobClient: map 100% reduce 33% 12/01/16 05:06:45 INFO mapred.JobClient: map 100% reduce 66% 12/01/16 05:06:48 INFO mapred.JobClient: map 100% reduce 67% 12/01/16 05:06:51 INFO mapred.JobClient: map 100% reduce 68% 12/01/16 05:06:57 INFO mapred.JobClient: map 100% reduce 69%
import java.io.IOException; import java.util.Iterator; import java.util.StringTokenizer; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.FileInputFormat; import org.apache.hadoop.mapred.FileOutputFormat; import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapred.JobClient; import org.apache.hadoop.mapred.JobConf; import org.apache.hadoop.mapred.MapReduceBase; import org.apache.hadoop.mapred.Mapper; import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; public class LineIndexer { public static class LineIndexMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> { private final static Text word = new Text(); private final static Text location = new Text(); public void map(LongWritable key, Text val, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { FileSplit fileSplit = (FileSplit)reporter.getInputSplit(); String fileName = fileSplit.getPath().getName(); location.set(fileName); String line = val.toString(); StringTokenizer itr = new StringTokenizer(line.toLowerCase()); while (itr.hasMoreTokens()) { word.set(itr.nextToken()); output.collect(word, location); } } } public static class LineIndexReducer extends MapReduceBase implements Reducer<Text, Text, Text, Text> { public void reduce(Text key, Iterator<Text> values, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { boolean first = true; StringBuilder toReturn = new StringBuilder(); while (values.hasNext()){ if (!first) toReturn.append(", "); first=false; toReturn.append(values.next().toString()); } output.collect(key, new Text(toReturn.toString())); } } /** * The actual main() method for our program; this is the * "driver" for the MapReduce job. 
*/ public static void main(String[] args) { JobClient client = new JobClient(); JobConf conf = new JobConf(LineIndexer.class); conf.setJobName("LineIndexer"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); FileInputFormat.addInputPath(conf, new Path(args[0])); FileOutputFormat.setOutputPath(conf, new Path(args[1])); conf.setMapperClass(LineIndexMapper.class); conf.setReducerClass(LineIndexReducer.class); client.setConf(conf); try { JobClient.runJob(conf); } catch (Exception e) { e.printStackTrace(); } } }