package com.mr.distinct; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.WritableComparable; /** * @author luobao * */ public class Person implements WritableComparable<Person> { private Text name; private Text sex; private Text age; private Text remark; public Person() { this.name = new Text(); this.sex = new Text(); this.age = new Text(); this.remark = new Text(); } @Override public void readFields(DataInput paramDataInput) throws IOException { this.name.readFields(paramDataInput); this.sex.readFields(paramDataInput); this.age.readFields(paramDataInput); this.remark.readFields(paramDataInput); } @Override public void write(DataOutput paramDataOutput) throws IOException { this.name.write(paramDataOutput); this.sex.write(paramDataOutput); this.age.write(paramDataOutput); this.remark.write(paramDataOutput); } @Override public int compareTo(Person person) { // 这里定制distinct哪些字段,现在是忽略remark字段的定制 Text t = new Text(this.name.toString() + this.sex.toString() + this.age.toString()); return t.compareTo(new Text(person.getName().toString() + person.getSex().toString() + person.getAge().toString())); } public Text getName() { return name; } public void setName(Text name) { this.name = name; } public Text getSex() { return sex; } public void setSex(Text sex) { this.sex = sex; } public Text getAge() { return age; } public void setAge(Text age) { this.age = age; } public Text getRemark() { return remark; } public void setRemark(Text remark) { this.remark = remark; } @Override public String toString() { return "Person [name=" + name + ", sex=" + sex + ", age=" + age + ", remark=" + remark + "]"; } }
package com.mr.distinct; import java.io.IOException; import java.net.URI; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; /** * @author luobao * */ public class Distinct { public static class DistinctUserMapper extends Mapper<Object, Text, Person, NullWritable> { public void map(Object key, Text value, Context context) throws IOException, InterruptedException { String[] personInfo = value.toString().split(" "); Person person = new Person(); person.setName(new Text(personInfo[0])); person.setSex(new Text(personInfo[1])); person.setAge(new Text(personInfo[2])); person.setRemark(new Text(personInfo[3])); context.write(person, NullWritable.get()); } } public static class DistinctUserReducer extends Reducer<Person, NullWritable, Person, NullWritable> { public void reduce(Person key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException { context.write(key, NullWritable.get()); } } public static void main(String[] args) { try { Configuration conf = new Configuration(); conf.addResource(new Path("log4j.properties.xml")); String ipPre = "hdfs://192.168.40.191:9000/"; removeOutput(conf, ipPre); Job job = Job.getInstance(conf); job.setJarByClass(Distinct.class); job.setMapperClass(DistinctUserMapper.class); job.setReducerClass(DistinctUserReducer.class); job.setMapOutputKeyClass(Person.class);// map阶段的输出的key job.setMapOutputValueClass(NullWritable.class);// map阶段的输出的value job.setOutputKeyClass(Person.class);// reduce阶段的输出的key job.setOutputValueClass(NullWritable.class);// reduce阶段的输出的value FileInputFormat.addInputPath(job, new Path(ipPre + "input/distinct")); FileOutputFormat.setOutputPath(job, new Path(ipPre + "output")); System.exit(job.waitForCompletion(true) ? 0 : 1); } catch (Exception e) { e.printStackTrace(); } } private static void removeOutput(Configuration conf, String ipPre) throws IOException { String outputPath = ipPre + "output"; FileSystem fs = FileSystem.get(URI.create(outputPath), conf); Path path = new Path(outputPath); if (fs.exists(path)) { fs.deleteOnExit(path); } fs.close(); } }
Person [name=张三, sex=女, age=20, remark=备注1]
Person [name=张三, sex=男, age=20, remark=备注2]
Person [name=李四, sex=男, age=21, remark=备注3]
Person [name=王五, sex=女, age=20, remark=备注]