2-1、二次排序代码



1、输入数据:
[hadoop@hadoop ~]$ hdfs dfs -text /user/hadoop/secondarysort.txt
3       5
5       89
7       63
5       56
3       9
3       1
7       26
7       45
7       4
5       18
5       23
7       63
3       24
[hadoop@hadoop ~]$




2、代码:
package secondarySort;


import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;


public class SecondarySort_Demo {
	//自己定义的key类应该实现WritableComparable接口  
	public static class IntPair implements WritableComparable<IntPair>{
		
		int first;
		int second;
		public void set(int left,int right){
			first=left;
			second=right;
		}
		public int getFirst(){
			return first;
		}
		public int getSecond(){
			return second;
		}
		//序列化,将IntPair转化成使用流传送的二进制  
		public void write(DataOutput out) throws IOException {
			out.writeInt(first);
			out.writeInt(second);
		}
		//反序列化,从流中的二进制转换成IntPair
		public void readFields(DataInput in) throws IOException {
			first=in.readInt();
			second=in.readInt();
		}
		//key的比较  
		public int compareTo(IntPair o) {
			if(first!=o.first){
				return first<o.first ? -1:1;
			}else if(second!=o.second){
				return second<o.second ? -1:1;
			}else{
				return 0;
			}
		
		}
		//新定义类应该重写的两个方法  
		public int hashCode(){
			return first*157+second;
		}
		
		public boolean equals(Object right){
			if(right==null)
				return false;
			if(this==right)
				return true;
			if(right instanceof IntPair){
				IntPair r=(IntPair) right;
				return r.first==first&&r.second==second;
			}else {
				return false;
			}
		}
	}
	 /**
	  * Partitioner that picks the partition from the key's first component only,
	  * so all keys sharing the same first value land in the same partition.
	  */
	public static class FirstPartitioner extends Partitioner<IntPair, IntWritable>{
		/**
		 * @param key           map output key; only {@code getFirst()} is used
		 * @param value         map output value (ignored)
		 * @param numPartitions number of partitions to spread keys over
		 * @return a partition index in {@code [0, numPartitions)}
		 */
		@Override
		public int getPartition(IntPair key, IntWritable value,
				int numPartitions) {
			// Multiply in long: in int arithmetic first*127 can overflow to
			// Integer.MIN_VALUE, whose Math.abs is still negative and would
			// yield a negative (illegal) partition index.
			long scaled=Math.abs((long) key.getFirst()*127L);
			return (int) (scaled % numPartitions);
		}
	}
	/** 
     * 分组函数类。只要first相同就属于同一个组。 
     */  
	/*//第一种方法,实现接口RawComparator
	public static class GroupingCpmparator implements RawComparator<IntPair>{


		public int compare(IntPair o1, IntPair o2) {
			int l=o1.getFirst();
			int r=o2.getFirst();
			return l == r ? 0:(l<r ? -1:1);
		}
		 //一个字节一个字节的比,直到找到一个不相同的字节,然后比这个字节的大小作为两个字节流的大小比较结果。
		public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
			return WritableComparator.compareBytes(b1, s1, Integer.SIZE/8, b2, s2, Integer.SIZE/8);
		}
	}*/
	//第二种方法,继承WritableComparator
	public static class GroupingComparator extends WritableComparator{
		protected GroupingComparator(){
			super(IntPair.class,true);
		}
		
		@SuppressWarnings("rawtypes")
		//Compare two WritableComparables. 
		public int compare(WritableComparable w1,WritableComparable w2){
			IntPair ip1=(IntPair) w1;
			IntPair ip2=(IntPair) w2;
			int l=ip1.getFirst();
			int r=ip2.getFirst();
			return l==r?1:(l<r?-1:1);
		}
	}
	/**
	 * Mapper that turns each input line holding two integers into an
	 * ({@link IntPair}(first, second), second) record.
	 */
	public static class MyMap extends Mapper<LongWritable, Text, IntPair, IntWritable>{
		// Output objects are reused across map() calls.
		private final IntPair intkey=new IntPair();
		private final IntWritable intvalue=new IntWritable();

		/**
		 * Parses one line and emits a single record for it.
		 * Blank lines are skipped. A non-blank line with fewer than two fields
		 * or with non-integer fields still fails with an unchecked exception.
		 *
		 * @param key     input key supplied by the input format (unused)
		 * @param value   one line of input text
		 * @param context sink for the emitted record
		 */
		@Override
		protected void map(LongWritable key, Text value,Context context)
				throws IOException, InterruptedException {
			String line=value.toString().trim();
			if(line.isEmpty()){
				// e.g. a trailing empty line in the input file
				return;
			}
			// Accept a tab or any other run of whitespace between the two fields.
			String[] fields=line.split("\\s+");
			int first=Integer.parseInt(fields[0]);
			int second=Integer.parseInt(fields[1]);
			intkey.set(first, second);
			intvalue.set(second);
			context.write(intkey, intvalue);
		}
	}
	/**
	 * Reducer that writes one output record per incoming value: the key's
	 * first component as text, paired with the value.
	 */
	public static class MyReduce extends Reducer<IntPair, IntWritable, Text, IntWritable>{
		// Output key object, reused across reduce() calls.
		private final Text outKey =new Text();

		/**
		 * @param k2      key of the current group; only {@code getFirst()} is read
		 * @param v2s     values of the current group
		 * @param context sink for the emitted records
		 */
		@Override
		protected void reduce(IntPair k2, Iterable<IntWritable> v2s,Context context)
				throws IOException, InterruptedException {
			outKey.set(String.valueOf(k2.getFirst()));
			for (IntWritable second : v2s) {
				context.write(outKey, second);
			}
		}
	}
	
	/**
	 * Configures and runs the secondary-sort job, then exits with the job status.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
	 * @throws Exception if job setup or execution fails
	 */
	public static void main(String[] args) throws Exception{
		if(args.length<2){
			System.err.println("Usage: " + SecondarySort_Demo.class.getSimpleName()
					+ " <input path> <output path>");
			System.exit(2);
		}

		Configuration conf = new Configuration();
		// Create the job, named after this class.
		Job job=Job.getInstance(conf, SecondarySort_Demo.class.getSimpleName());
		job.setJarByClass(SecondarySort_Demo.class);

		// Mapper and reducer. No combiner is set: the reducer's output type
		// <Text, IntWritable> does not match the reduce input type <IntPair, IntWritable>.
		job.setMapperClass(MyMap.class);
		job.setReducerClass(MyReduce.class);

		// Partition and group by the first component of the key.
		job.setPartitionerClass(FirstPartitioner.class);
		job.setGroupingComparatorClass(GroupingComparator.class);

		// Map output key/value types.
		job.setMapOutputKeyClass(IntPair.class);
		job.setMapOutputValueClass(IntWritable.class);
		// Reduce output key/value types.
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);

		// Text input and text output.
		job.setInputFormatClass(TextInputFormat.class);
		job.setOutputFormatClass(TextOutputFormat.class);

		// HDFS input and output paths.
		FileInputFormat.setInputPaths(job, args[0]);
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		// Submit the job, wait for it, and report failure through the exit code.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
	
}



3、打包执行命令:
 
hadoop jar secondarysort.jar /user/hadoop/secondarysort.txt /user/hadoop/output



4、结果输出文件目录:
[hadoop@hadoop ~]$ hdfs dfs -ls /user/hadoop/output
Found 77 items
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/_SUCCESS
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00000
-rw-r--r--   3 hadoop supergroup         17 2015-08-30 17:22 /user/hadoop/output/part-r-00001
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00002
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00003
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00004
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00005
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00006
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00007
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00008
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00009
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00010
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00011
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00012
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00013
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00014
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00015
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00016
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00017
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00018
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00019
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00020
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00021
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00022
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00023
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00024
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00025
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00026
-rw-r--r--   3 hadoop supergroup         20 2015-08-30 17:22 /user/hadoop/output/part-r-00027
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00028
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00029
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00030
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00031
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00032
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00033
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00034
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00035
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00036
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00037
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00038
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00039
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00040
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00041
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00042
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00043
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00044
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00045
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00046
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00047
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00048
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00049
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00050
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00051
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00052
-rw-r--r--   3 hadoop supergroup         24 2015-08-30 17:22 /user/hadoop/output/part-r-00053
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00054
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00055
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00056
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00057
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00058
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00059
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00060
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00061
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00062
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00063
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00064
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00065
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00066
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00067
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00068
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00069
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00070
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00071
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00072
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00073
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00074
-rw-r--r--   3 hadoop supergroup          0 2015-08-30 17:22 /user/hadoop/output/part-r-00075



5、排序后的输出数据:

[hadoop@hadoop ~]$ hdfs dfs -text /user/hadoop/output/part-r-00001
3       1
3       5
3       9
3       24
[hadoop@hadoop ~]$ hdfs dfs -text /user/hadoop/output/part-r-00027
5       18
5       23
5       56
5       89
[hadoop@hadoop ~]$ hdfs dfs -text /user/hadoop/output/part-r-00053
7       4
7       26
7       45
7       63
7       63
[hadoop@hadoop ~]$



你可能感兴趣的:(2-1二次排序代码)