Hadoop study notes: inverted index

package cn.yws;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

//Inverted index example. Before running, place two test files named file1 and file2 under the index_in directory on HDFS.
public class MyInvertedIndex {

	public static class Map extends Mapper<Object, Text, Text, Text>
	{
		private Text keyinfo=new Text();
		private Text valueinfo=new Text();
		private FileSplit split;
		// Map phase: emit <"word:filename", "1"> for every token in the input line
		@Override
		protected void map(Object key, Text value,
				Context context)
				throws IOException, InterruptedException {
			
			// Determine which file this record came from, then tokenize the line
			split=(FileSplit) context.getInputSplit();
			StringTokenizer tokenizer=new StringTokenizer(value.toString());
			while(tokenizer.hasMoreTokens())
			{
				// Assumes the file name starts with "file" and nothing earlier in the path contains "file"
				int splitindex=split.getPath().toString().indexOf("file");
				// Key is "word:filename", value is a count of 1, e.g. <"MapReduce:file1", "1">
				keyinfo.set(tokenizer.nextToken()+":"+split.getPath().toString().substring(splitindex));
				valueinfo.set("1");
				context.write(keyinfo, valueinfo);
			}		
			
		}
	}
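	// Hypothetical trace (the contents of file1 are an assumption, not part of the original note):
	// for the line "MapReduce is simple" read from file1, the mapper above would emit
	// <"MapReduce:file1", "1">, <"is:file1", "1"> and <"simple:file1", "1">.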
	public static class Combine extends Reducer<Text, Text, Text, Text>
	{
		private Text infoText=new Text();
		
		@Override
		protected void reduce(Text key, Iterable<Text> values,
				Context context)
				throws IOException, InterruptedException {
			
			// Sum the occurrences of this word within a single file
			int sum=0;
			for(Text value:values)
			{
				sum+=Integer.parseInt(value.toString());
			}
			int splitindex=key.toString().indexOf(":");
			
			// Rewrite <"word:filename", count> as <"word", "filename:count">
			infoText.set(key.toString().substring(splitindex+1)+":"+sum);
			
			key.set(key.toString().substring(0,splitindex));
			context.write(key, infoText);
			
		}
	}
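	// Hypothetical trace: the combiner receives <"MapReduce:file1", ["1", "1"]> and emits
	// <"MapReduce", "file1:2">. Note that this job relies on the combiner actually running
	// (Hadoop treats combiners as an optional optimization) and on the default single
	// reducer, since the rewritten key would not necessarily partition like the original one.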
	public static class Reduce extends Reducer<Text, Text, Text, Text>
	{
		private Text result=new Text();
		
		@Override
		protected void reduce(Text key, Iterable<Text> values,
				Context context)
				throws IOException, InterruptedException {
			
			// Concatenate the "filename:count" entries for this word into one document list
			String filelist="";
			for(Text value:values)
			{
				filelist+=value.toString()+";";
			}
			result.set(filelist);
			context.write(key, result);			
			
		}
	}
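	// Hypothetical trace: the reducer receives <"MapReduce", ["file1:2", "file2:1"]> and
	// writes the line "MapReduce\tfile1:2;file2:1;" (TextOutputFormat separates the key
	// and value with a tab).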
	public static void main(String[] args) {
		try {
			Configuration configuration=new Configuration();
			// Point the job at the JobTracker of the target cluster (Hadoop 1.x); change the address to match your cluster
			configuration.set("mapred.job.tracker", "192.168.1.15:9001");
			// Default input/output directories, used when no command-line arguments are given
			String[] ioargs=new String[]{"index_in","index_out3"};
			if(args.length==2)
			{
				ioargs=args;
			}			
			String[] otherArgs=new GenericOptionsParser(configuration,ioargs).getRemainingArgs();
			if(otherArgs.length!=2)
			{
				System.err.println("Usage:inverted "+MyInvertedIndex.class.getSimpleName()+" <in> <out>");
				System.exit(2);
			}
			// Configure and submit the MapReduce job
			Job job=new Job(configuration, MyInvertedIndex.class.getSimpleName());
			
			job.setJarByClass(MyInvertedIndex.class);
			// Mapper
			job.setMapperClass(Map.class);
			// Combiner (pre-aggregates on the map side)
			job.setCombinerClass(Combine.class);
			// Reducer
			job.setReducerClass(Reduce.class);
			
			// Map output key/value types
			job.setMapOutputKeyClass(Text.class);
			job.setMapOutputValueClass(Text.class);
			
			// Final (reduce) output key/value types
			job.setOutputKeyClass(Text.class);
			job.setOutputValueClass(Text.class);
			
			// Input and output directories
			FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
			FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
			
			System.exit(job.waitForCompletion(true)?0:1);
			
		} catch (Exception e) {
			
			e.printStackTrace();
		}
		
	}
}
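
A rough sketch of how this might be run (the jar name and the contents of file1/file2 are assumptions, not part of the original note):

    hadoop fs -mkdir index_in
    hadoop fs -put file1 file2 index_in
    hadoop jar invertedindex.jar cn.yws.MyInvertedIndex index_in index_out3
    hadoop fs -cat index_out3/part-r-00000

If file1 contained "MapReduce is simple" and file2 contained "MapReduce is powerful", the output would look roughly like this (the order of files within each list is not guaranteed):

    MapReduce	file1:1;file2:1;
    is	file1:1;file2:1;
    powerful	file2:1;
    simple	file1:1;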
