Hadoop Map Reduce 实现倒排索引的程序

先创建3个文件并填入对应的内容:

input/T0.txt MapReduce is simple
input/T1.txt MapReduce is powerful is simple
input/T3.txt Hello MapReduce bye MapReduce

上传到 Hadoop 分布式文件系统(HDFS)上:  hadoop fs -put *.txt input

程序代码如下(运行时需要传入两个参数:输入目录 input 和输出目录 output):

package org.apache.hadoop.examples.yao;

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * 倒排索引的程序
 * @author yaokj
 *
 */
public class InvertedIndex {

	// <pianyi,linecontent>-----> <word+filename,count>
	public static class InvertedIndexMapper extends Mapper<Object, Text, Text, Text> {
		private Text keyInfo = new Text();
		private Text valueInfo = new Text();
		private FileSplit fs = null;

		protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
			fs = (FileSplit) context.getInputSplit();
			String fileName = fs.getPath().getName();

			StringTokenizer tokenizer = new StringTokenizer(value.toString());
			while (tokenizer.hasMoreTokens()) {
				keyInfo.set(tokenizer.nextToken() + ":" + fileName);
				valueInfo.set("1");
				context.write(keyInfo, valueInfo);
			}
		}
	}

	// <word+filename,list(count)>-------> <word,filename+result>
	// result=the sum of list(count)
	public static class InvertedIndexCombiner extends Reducer<Text, Text, Text, Text> {
		private Text info = new Text();

		protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
			int splitIndex = key.toString().indexOf(":");
			int sum = 0;
			String word = key.toString().substring(0, splitIndex);
			String fileName = key.toString().substring(splitIndex + 1);
			for (Text value : values) {
				sum += Integer.parseInt(value.toString());
			}
			info.set(fileName + ":" + sum);
			context.write(new Text(word), info);
		}
	}

	// <word,filename+result>------> <word,list(filename+result)>
	public static class InvertedIndexReducer extends Reducer<Text, Text, Text, Text> {

		private Text result = new Text();

		protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
			String sum = new String();
			for (Text value : values) {
				sum += value.toString() + ";";
			}
			result.set(sum);
			context.write(key, result);
		}
	}

	// inputdir:input
	// outputdir:output
	public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
		Configuration conf = new Configuration();
		conf.addResource(new Path("/home/yaokj/hadoop-0.20.203.0/conf/hdfs-site.xml"));//配置文件上的位置
		conf.addResource(new Path("/home/yaokj/hadoop-0.20.203.0/conf/core-site.xml"));
		String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage : invertedindex ");
			System.exit(2);
		}
		Job job = new Job(conf, "invertedindex");
		job.setJarByClass(InvertedIndex.class);

		job.setMapperClass(InvertedIndexMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		job.setCombinerClass(InvertedIndexCombiner.class);
		job.setReducerClass(InvertedIndexReducer.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

		System.exit(job.waitForCompletion(true) ? 0 : 1);

	}
}
下面是运行后的结果:
Hello	T3.txt:1;
MapReduce	T3.txt:2;T0.txt:1;T1.txt:1;
bye	T3.txt:1;
is	T0.txt:1;T1.txt:2;
powerful	T1.txt:1;
simple	T1.txt:1;T0.txt:1;




你可能感兴趣的:(hadoop,倒排索引)