MapReduce SecondarySort

package wjj;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
 
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;

public class SecondarySort {
	
	
	/*
	 * The new type IntPair defined here wraps two ints, which hold the values
	 * for the two levels of sorting. Hadoop requires a key type to implement
	 * Writable and Comparable: the former to support serialization and
	 * deserialization, the latter to enable comparison-based sorting.
	 * Note that compareTo() sorts by first in ascending order, and only then
	 * by second.
	 * We build a composite class IntPair with two fields: partitioning sorts
	 * on the first field, and the comparison within each partition then sorts
	 * on the second field.
	 * Every custom key should implement the WritableComparable interface,
	 * since keys must be serializable and comparable.
	 * */
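	/*
	 * For example, under this ordering (1,2) < (1,5) < (3,1): keys are first
	 * compared on the first field, and the second field only breaks ties
	 * when the first fields are equal.
	 * */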
	
	public static class IntPair implements WritableComparable<IntPair>{

		int first;
		int second;
		
		public void set(int left,int right)
		{
			first=left;
			second=right;
		}
		
		
		
		public int getFirst() {
			return first;
		}



		public void setFirst(int first) {
			this.first = first;
		}



		public int getSecond() {
			return second;
		}



		public void setSecond(int second) {
			this.second = second;
		}


		// Deserialization: rebuild the IntPair from the binary data in the stream
		@Override
		public void readFields(DataInput in) throws IOException {
			first=in.readInt();
			second=in.readInt();
		}

		// Serialization: turn the IntPair into binary data that can be sent over the stream
		@Override
		public void write(DataOutput out) throws IOException {
			out.writeInt(first);
			out.writeInt(second);//some articles use write() here, but in my tests that threw an exception
		}

		@Override
		public int compareTo(IntPair o) {
			// sort by first in ascending order; when first is equal, sort by second
			if(first!=o.first)
				return first<o.first?-1:1;
			else if(second!=o.second)
				return second<o.second?-1:1;
			else
				return 0;
		}
	}

	/*
	 * The partitioner decides which reduce task a key is sent to. Partitioning
	 * on the first field only ensures that all keys with the same first value
	 * end up in the same reducer.
	 * */
	public static class FirstPartitioner extends Partitioner<IntPair,IntWritable>{

		@Override
		public int getPartition(IntPair key, IntWritable value, int num) {
			return Math.abs(key.getFirst()*127)%num;
		}

	}
	/*
	 * Among the keys that share the same first value we only need the first
	 * one; the remaining keys can be ignored. That requires the data to be
	 * merged by the key's first field before reduce runs, i.e. grouping:
	 * keys with the same first value form one group, and the first key of
	 * the group together with all of the values is passed to a single
	 * reduce() call. The reducer then simply writes the key out. To get this
	 * grouping behaviour we need a custom grouping comparator
	 * (ValueGroupingComparator).
	 * */
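	/*
	 * A quick illustration with hypothetical sample data: if the map emits
	 * keys (1,4), (1,9), (3,2) with values 4, 9, 2, the sort comparator
	 * orders them (1,4), (1,9), (3,2); the grouping comparator then treats
	 * (1,4) and (1,9) as one group, so reduce() is called once with key
	 * (1,4) and values [4, 9], and once with key (3,2) and value [2].
	 * */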
	
	/*//First approach: implement the RawComparator<IntPair> interface
    public static class GroupingComparator implements RawComparator<IntPair> { 
        @Override 
        public int compare(IntPair o1, IntPair o2) { 
            int l = o1.getFirst(); 
            int r = o2.getFirst(); 
            return l == r ? 0 : (l < r ? -1 : 1); 
        } 
        @Override 
        //Compare byte by byte until a differing byte is found; that byte decides the ordering of the two byte streams. 
        public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2){ 
             return WritableComparator.compareBytes(b1, s1, Integer.SIZE/8,  
                     b2, s2, Integer.SIZE/8); 
        } 
    }*/  
	 public static class GroupingComparator extends WritableComparator
	    {
	        protected GroupingComparator()
	        {
	            super(IntPair.class, true);
	        }
	        @Override
	        //Compare two WritableComparables.
	        public int compare(WritableComparable w1, WritableComparable w2)
	        {
	            IntPair ip1 = (IntPair) w1;
	            IntPair ip2 = (IntPair) w2;
	            int l = ip1.getFirst();
	            int r = ip2.getFirst();
	            return l == r ? 0 : (l < r ? -1 : 1);
	        }
	    }

	
	
	public static class Map extends Mapper<LongWritable,Text,IntPair,IntWritable>
	{
		private final IntPair intkey=new IntPair();
		private final IntWritable intvalue=new IntWritable();
		
		public void map(LongWritable key,Text value,Context context) throws IOException, InterruptedException
		{
			String line=value.toString();
			StringTokenizer tokenizer=new StringTokenizer(line);
			int left=0;
			int right=0;
			if(tokenizer.hasMoreTokens())
			{
				left=Integer.parseInt(tokenizer.nextToken());
				if(tokenizer.hasMoreTokens())
				{
					right=Integer.parseInt(tokenizer.nextToken());
				}
				intkey.set(left, right);
				intvalue.set(right);
				context.write(intkey, intvalue);
			}
		}
	}
	
	
	public static class Reduce extends Reducer<IntPair,IntWritable,Text,IntWritable>
	{
		private final Text left=new Text();
		private static final Text SEPARATOR=new Text("--------------------");
		public void reduce(IntPair key,Iterable<IntWritable> values,Context context) throws IOException, InterruptedException
		{
			context.write(SEPARATOR, null);
			left.set(Integer.toString(key.getFirst()));
			for(IntWritable val:values)
			{
				context.write(left, val);
			}
		}
	}
	
	public static void main(String[] args) throws Exception {
	   Configuration conf=new Configuration();
	   @SuppressWarnings("deprecation")
	   // create a job instance
	   Job job=new Job(conf,"secondarysort");
	   job.setJarByClass(SecondarySort.class);
	   
	   //Mapper
	   job.setMapperClass(Map.class);
	   //Reducer
	   // a Combiner is no longer needed: the Combiner's output types would not match the Reducer's input types
       //job.setCombinerClass(Reduce.class);  
	   job.setReducerClass(Reduce.class);
	   
	   job.setPartitionerClass(FirstPartitioner.class);
	   job.setGroupingComparatorClass(GroupingComparator.class);
	   
	   job.setMapOutputKeyClass(IntPair.class);
	   job.setOutputKeyClass(Text.class);
	   
	   job.setOutputValueClass(IntWritable.class);
	   
	   job.setInputFormatClass(TextInputFormat.class);
	   job.setOutputFormatClass(TextOutputFormat.class);
	   
	   FileInputFormat.setInputPaths(job, new Path(args[0]));
	   FileOutputFormat.setOutputPath(job, new Path(args[1]));
	   System.exit(job.waitForCompletion(true)?0:1);
	}
	
}
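
To try the job out, the sketch below shows one way to package and run it. The jar name, the input/output paths and the sample data are only illustrative assumptions, not part of the code above.

hadoop jar secondarysort.jar wjj.SecondarySort /user/wjj/secondarysort/input /user/wjj/secondarysort/output

Assume the input file contains two integers per line:

3 7
1 9
3 2
1 4

With a single reduce task, the output is grouped by the first field and sorted by the second field inside each group, roughly:

--------------------
1	4
1	9
--------------------
3	2
3	7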

