hadoop之最大气温

hadoop权威指南第8章第2小节介绍了通过MR求最大气温的例子。
我做了简单修改后代码如下:
输入
1995 10
1996 10
1995 5
1999 20
1999 10
1996 3

期望输出:
1995 10
1996 10
1999 20
//Custom composite key class InitPair (year + temperature).
	
	public class InitPair implements WritableComparable<InitPair> {
	private int year; // year
	private int tmp;  // temperature

	/** No-arg constructor required by Hadoop for deserialization. */
	public InitPair() {
		super();
	}

	public InitPair(int year, int tmp) {
		super();
		this.year = year;
		this.tmp = tmp;
	}

	public int getYear() {
		return year;
	}
	public void setYear(int year) {
		this.year = year;
	}
	public int getTmp() {
		return tmp;
	}
	public void setTmp(int tmp) {
		this.tmp = tmp;
	}

	/** Serializes the key: year first, then temperature (order must match readFields). */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeInt(year);
		out.writeInt(tmp);
	}

	/** Deserializes in the exact field order used by write. */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.year = in.readInt();
		this.tmp = in.readInt();
	}

	/**
	 * Compares by year ONLY. This is intentional: the job's GroupComparator
	 * delegates here so that all records of the same year land in one reduce
	 * group; the full year-asc / temperature-desc ordering is supplied
	 * separately by KeyComparator.
	 */
	@Override
	public int compareTo(InitPair o) {
		return Integer.compare(this.year, o.year);
	}

	/** Equal only when both year and temperature match. */
	@Override
	public boolean equals(Object obj) {
		if (this == obj) {
			return true;
		}
		if (!(obj instanceof InitPair)) {
			return false;
		}
		InitPair o = (InitPair) obj;
		return this.year == o.year && this.tmp == o.tmp;
	}

	/** Consistent with equals — was missing in the original, breaking the equals/hashCode contract. */
	@Override
	public int hashCode() {
		return 31 * year + tmp;
	}

	@Override
	public String toString() {
		return this.year + "    " + this.tmp;
	}
}


   //Secondary sort: emits the maximum temperature per year.
	public class MaxTempByTwoSort {
	static class maxTempMap extends Mapper<LongWritable, Text, InitPair, NullWritable> {
		/**
		 * Parses one "year temperature" line into a composite key.
		 * Splits on any whitespace so both tab- and space-separated input
		 * (the sample input uses spaces) is accepted; the original split
		 * only on "\\t" and silently dropped space-separated lines.
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			String[] line = value.toString().trim().split("\\s+");
			if (line.length == 2) {
				// Key is the custom composite class; value is unused, so emit NullWritable.
				context.write(new InitPair(Integer.parseInt(line[0]), Integer.parseInt(line[1])), NullWritable.get());
			}
		}
	}
	static class maxTempReduce extends Reducer<InitPair, NullWritable, InitPair, NullWritable> {
		/**
		 * Each group holds one year; the sort comparator put the highest
		 * temperature first, so the group key IS the per-year maximum.
		 */
		@Override
		protected void reduce(InitPair key, Iterable<NullWritable> values, Context context)
				throws IOException, InterruptedException {
			context.write(key, NullWritable.get());
		}
	}
	// Partition by year only, so every record of a year reaches the same reducer.
	static class FirstPartition extends Partitioner<InitPair, NullWritable> {
		@Override
		public int getPartition(InitPair key, NullWritable value, int numPartitions) {
			// Mask the sign bit instead of Math.abs: abs(Integer.MIN_VALUE) is still negative.
			return (key.getYear() * 127 & Integer.MAX_VALUE) % numPartitions;
		}
	}

	// Full sort order: year ascending, then temperature descending.
	static class KeyComparator extends WritableComparator {
		protected KeyComparator() {
			super(InitPair.class, true);
		}

		@Override
		public int compare(WritableComparable a, WritableComparable b) {
			InitPair ip1 = (InitPair) a;
			InitPair ip2 = (InitPair) b;
			int cmp = ip1.compareTo(ip2); // year ascending
			if (cmp != 0) {
				return cmp;
			}
			// Same year: higher temperature sorts first. Returns 0 for equal
			// temperatures — the original returned 1 there and never 0,
			// violating the comparator contract.
			return Integer.compare(ip2.getTmp(), ip1.getTmp());
		}
	}

	// Grouping rule: year only — same year, different temperature is still one group.
	static class GroupComparator extends WritableComparator {
		protected GroupComparator() {
			super(InitPair.class, true);
		}

		@Override
		public int compare(WritableComparable a, WritableComparable b) {
			InitPair ip1 = (InitPair) a;
			InitPair ip2 = (InitPair) b;
			// InitPair.compareTo intentionally compares the year only.
			return ip1.compareTo(ip2);
		}
	}
	public static void main(String[] arg) throws IOException, ClassNotFoundException, InterruptedException {
		Job job = Job.getInstance(new Configuration());
		job.setJarByClass(MaxTempByTwoSort.class);
		//map
		job.setMapperClass(maxTempMap.class);
		job.setPartitionerClass(FirstPartition.class);
		job.setSortComparatorClass(KeyComparator.class);
		job.setGroupingComparatorClass(GroupComparator.class);
		// Declare map output types explicitly rather than relying on the
		// job-output defaults.
		job.setMapOutputKeyClass(InitPair.class);
		job.setMapOutputValueClass(NullWritable.class);
		//reduce
		job.setReducerClass(maxTempReduce.class);
		job.setOutputKeyClass(InitPair.class);
		job.setOutputValueClass(NullWritable.class);

		FileInputFormat.addInputPath(job, new Path(arg[0]));
		FileOutputFormat.setOutputPath(job, new Path(arg[1]));
		// Propagate success/failure to the shell.
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

输出:
1995 10
1996 10
1999 20

总结:刚开始看这个实例的时候比较迷惑的地方在于:分组后再经过reduce输出便能得到每年最大的气温。
 当时对GroupComparator没有很好的理解,比如:<<1995,20>,null> <<1995,10>,null>  按照年份分组后只剩下
 <<1995,20>,null>,因为我们的分组策略中只包含了年份,两个年份相同时被认为是同一组。
 自然reduce输出的就是<<1995,20>,null>中的<1995,20>。

你可能感兴趣的:(hadoop)