Hadoop二次排序<转>

Hadoop二次排序:

  1. import java.io.IOException;  
  2.   
  3. import org.apache.Hadoop.conf.Configuration;  
  4. import org.apache.Hadoop.fs.Path;  
  5. import org.apache.Hadoop.io.IntWritable;  
  6. import org.apache.Hadoop.io.LongWritable;  
  7. import org.apache.Hadoop.io.Text;  
  8. import org.apache.Hadoop.io.WritableComparable;  
  9. import org.apache.Hadoop.io.WritableComparator;  
  10. import org.apache.Hadoop.mapreduce.Job;  
  11. import org.apache.Hadoop.mapreduce.Mapper;  
  12. import org.apache.Hadoop.mapreduce.Reducer;  
  13. import org.apache.Hadoop.mapreduce.lib.input.FileInputFormat;  
  14. import org.apache.Hadoop.mapreduce.lib.input.TextInputFormat;  
  15. import org.apache.Hadoop.mapreduce.lib.output.FileOutputFormat;  
  16. import org.apache.Hadoop.mapreduce.lib.output.TextOutputFormat;  
  17. import org.apache.Hadoop.mapreduce.lib.partition.HashPartitioner;  
  18.   
  19. /** 
  20.  * @author 吕桂强 
  21.  * @email [email protected] 
  22.  * @version 创建时间:2012-5-21 下午5:06:57 
  23.  */  
  24. public class SecondarySort {  
  25.     // map阶段的最后会对整个map的List进行分区,每个分区映射到一个reducer   
  26.     public static class FirstPartitioner extends HashPartitioner<Text, IntWritable> {  
  27.         @Override  
  28.         public int getPartition(Text key, IntWritable value, int numPartitions) {  
  29.             return (key.toString().split(":")[0].hashCode() & Integer.MAX_VALUE) % numPartitions;  
  30.         }  
  31.     }  
  32.   
  33.     // 每个分区内又调用job.setSortComparatorClass或者key的比较函数进行排序   
  34.     public static class SortComparator extends WritableComparator {  
  35.         protected SortComparator() {  
  36.             super(Text.classtrue);  
  37.         }  
  38.   
  39.         @SuppressWarnings("rawtypes")  
  40.         @Override  
  41.         public int compare(WritableComparable w1, WritableComparable w2) {  
  42.             return -w1.toString().split(":")[0].compareTo(w2.toString().split(":")[0]);  
  43.         }  
  44.     }  
  45.   
  46.     // 只要这个比较器比较的两个key相同,他们就属于同一个组.   
  47.     // 它们的value放在一个value迭代器,而这个迭代器的key使用属于同一个组的所有key的第一个key   
  48.     public static class GroupingComparator extends WritableComparator {  
  49.         protected GroupingComparator() {  
  50.             super(Text.classtrue);  
  51.         }  
  52.         @SuppressWarnings("rawtypes")  
  53.         @Override  
  54.         public int compare(WritableComparable w1, WritableComparable w2) {  
  55.             return w1.toString().split(":")[0].compareTo(w2.toString().split(":")[0]);  
  56.         }  
  57.     }  
  58.   
  59.     // 自定义map   
  60.     public static class Map extends Mapper<LongWritable, Text, Text, IntWritable> {  
  61.         private final IntWritable intvalue = new IntWritable();  
  62.   
  63.         public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {  
  64.             context.write(value, intvalue);  
  65.         }  
  66.     }  
  67.   
  68.     // 自定义reduce   
  69.     public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {  
  70.         public void setup(Context context) {  
  71.             context.getConfiguration();  
  72.             System.out.println("reduce");  
  73.         }  
  74.   
  75.         public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {  
  76.             context.write(new Text("-------------------------"), new IntWritable(1));  
  77.             for (IntWritable val : values) {  
  78.                 // 虽然分在同一个组里,但是循环里每次输出的key都不相同(key看上去是个Text但实际也是一个list)   
  79.                 context.write(key, val);  
  80.             }  
  81.         }  
  82.     }  
  83.   
  84.     public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {  
  85.         Configuration conf = new Configuration();  
  86.         Job job = new Job(conf, "secondarysort");  
  87.         job.setJarByClass(SecondarySort.class);  
  88.         job.setMapperClass(Map.class);  
  89.         // job.setCombinerClass(Reduce.class);   
  90.         job.setReducerClass(Reduce.class);  
  91.         // 分区函数   
  92.         job.setPartitionerClass(FirstPartitioner.class);  
  93.         job.setSortComparatorClass(SortComparator.class);  
  94.         // 分组函数   
  95.         job.setGroupingComparatorClass(GroupingComparator.class);  
  96.   
  97.         job.setMapOutputKeyClass(Text.class);  
  98.         job.setMapOutputValueClass(IntWritable.class);  
  99.         job.setOutputKeyClass(Text.class);  
  100.         job.setOutputValueClass(IntWritable.class);  
  101.   
  102.         job.setInputFormatClass(TextInputFormat.class);  
  103.         job.setOutputFormatClass(TextOutputFormat.class);  
  104.   
  105.         FileInputFormat.setInputPaths(job, new Path("/larry/wc/input"));  
  106.         FileOutputFormat.setOutputPath(job, new Path("/larry/wc/output"));  
  107.   
  108.         job.setNumReduceTasks(1);  
  109.         System.exit(job.waitForCompletion(true) ? 0 : 1);  
  110.     }  
  111. }  

输入:

1:3
1:2
1:1
2:1
2:2
2:3
3:1
3:2
3:3


输出:(Text类型的key每输出一次都会改变,所以其实也是个Iterable)

------------------------- 1
3:1 0
3:2 0
3:3 0
------------------------- 1
2:1 0
2:2 0
2:3 0
------------------------- 1
1:3 0
1:2 0
1:1 0

你可能感兴趣的:(Hadoop二次排序<转>)