Data Deduplication

Sample input:

file1:
a
a
b
b
c

file2:
a
b
d
d

Sample output:

a
b
c
d

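The map phase writes each input line out as a key with an empty value; MapReduce groups identical keys during the shuffle, and the reduce phase writes each key exactly once, so duplicates disappear and the output comes back in sorted key order. As a conceptual illustration only (not part of the MapReduce job; the class name DedupSketch is made up here), the plain-Java sketch below shows the same dedup-and-sort effect with a TreeSet:

import java.util.Arrays;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;

// Conceptual sketch, not part of the MapReduce job: a TreeSet removes duplicates
// and keeps its elements sorted, which mirrors what the shuffle + reduce phases do here.
public class DedupSketch {
	public static void main(String[] args) {
		List<String> lines = Arrays.asList("a", "a", "b", "b", "c",   // file1
		                                   "a", "b", "d", "d");       // file2
		Set<String> distinct = new TreeSet<>(lines);
		distinct.forEach(System.out::println);  // prints a, b, c, d (one per line), matching the sample output
	}
}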
package mapreduce.test;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Dedup {
	// Map: copy each input line (the value) to the output key and emit it directly
	public static class Map extends Mapper<Object, Text, Text, Text>{
		private Text line = new Text();

		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			// copy the contents rather than keeping a reference, since Hadoop reuses the value object
			line.set(value);
			context.write(line, new Text(""));
		}
	}
	
	// Reduce: copy each input key to the output key and emit it once, discarding the values,
	// so every distinct line appears exactly once in the output
	public static class Reduce extends Reducer<Text, Text, Text, Text>{

		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			context.write(key, new Text(""));
		}

	}
	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "Data Deduplication");
		job.setJarByClass(Dedup.class);
		job.setMapperClass(Map.class);
		job.setCombinerClass(Reduce.class);
		job.setReducerClass(Reduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
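A possible refinement, not part of the original code: the empty Text values can be replaced with Hadoop's NullWritable, so no empty strings are serialized between map and reduce. A minimal sketch under that assumption (the class name DedupNullValue is hypothetical; the driver is otherwise the same as above):

package mapreduce.test;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DedupNullValue {
	// Map: emit each input line as the key with a NullWritable placeholder value
	public static class Map extends Mapper<Object, Text, Text, NullWritable> {
		private Text line = new Text();

		@Override
		protected void map(Object key, Text value, Context context)
				throws IOException, InterruptedException {
			line.set(value);
			context.write(line, NullWritable.get());
		}
	}

	// Reduce: emit each distinct key once, still with a NullWritable value
	public static class Reduce extends Reducer<Text, NullWritable, Text, NullWritable> {
		@Override
		protected void reduce(Text key, Iterable<NullWritable> values, Context context)
				throws IOException, InterruptedException {
			context.write(key, NullWritable.get());
		}
	}

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "Data Deduplication (NullWritable)");
		job.setJarByClass(DedupNullValue.class);
		job.setMapperClass(Map.class);
		job.setCombinerClass(Reduce.class);
		job.setReducerClass(Reduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);
		FileInputFormat.addInputPath(job, new Path(args[0]));
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}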


