hadoop案例java代码实现

今天手写一个hadoop的案例:

需求:找出每个月气温最高的2天

hadoop案例java代码实现_第1张图片

思路:

记录特点
每年
每个月
最高
2天
1天多条记录?
进一步思考
年月分组
温度降序(最高温排在最前)
key中要包含时间和温度!
MR原语:相同的key分到一组
通过GroupingComparator设置分组规则
自定义数据类型Weather
包含时间
包含温度
自定义排序比较规则
自定义分组比较
年月相同被视为相同的key
那么reduce迭代时,相同年月的记录有可能是同一天的,reduce中需要判断是否同一天
注意OOM
数据量很大
全量数据可以切分成最少按一个月份的数据量进行判断
这种业务场景可以设置多个reduce
通过实现partition

代码实现:

MainClass:

/**
 * Configures and submits the "weather" MapReduce job, then exits with a
 * status reflecting whether the job succeeded.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
 * @throws Exception if the job cannot be configured or submitted
 */
public static void main(String[] args) throws Exception {
		// Fail fast with a usage message instead of an ArrayIndexOutOfBoundsException
		if (args.length < 2) {
			System.err.println("Usage: MainClass <input path> <output path>");
			System.exit(2);
		}

		// Load the configuration (loadDefaults = true); custom settings override the defaults
		Configuration conf = new Configuration(true);

		// Instantiate the job from the configuration
		Job job = Job.getInstance(conf);

		// Class used to locate the jar that contains the job
		job.setJarByClass(MainClass.class);
		// Human-readable job name
		job.setJobName("weather");
		// Input path
		FileInputFormat.addInputPath(job, new Path(args[0]));
		// Output path
		FileOutputFormat.setOutputPath(job, new Path(args[1]));
		// Number of reduce tasks (the default is 1)
		job.setNumReduceTasks(2);

		// Key type emitted by the mapper
		job.setMapOutputKeyClass(Weather.class);
		// Value type emitted by the mapper
		job.setMapOutputValueClass(Text.class);

		// Custom partitioner, intended to spread records across reducers and limit skew
		job.setPartitionerClass(WeatherPartitioner.class);

		// Sort comparator for map output keys; without it the key type's own ordering is used
		job.setSortComparatorClass(WeatherComparator.class);

		// Grouping comparator: records of the same year and month form one reduce group.
		// Sorting and grouping use different rules, which is what makes this a secondary sort.
		job.setGroupingComparatorClass(WeatherGroupingComparator.class);

		// Mapper implementation
		job.setMapperClass(WeatherMapper.class);
		// Reducer implementation
		job.setReducerClass(WeatherReducer.class);

		// Propagate job success or failure through the process exit code
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}

Weather:


public class Weather implements WritableComparable{
	
	private String year;
	private String month;
	private String day;
	private Integer wenDu;
	
	/** @return the year component of this record, as a string */
	public String getYear() {
		return year;
	}

	/** @param year the year component to store */
	public void setYear(String year) {
		this.year = year;
	}

	/** @return the month component of this record, as a string */
	public String getMonth() {
		return month;
	}

	/** @param month the month component to store */
	public void setMonth(String month) {
		this.month = month;
	}

	/** @return the day-of-month component of this record, as a string */
	public String getDay() {
		return day;
	}

	/** @param day the day-of-month component to store */
	public void setDay(String day) {
		this.day = day;
	}

	/** @return the temperature ("wenDu") of this record */
	public Integer getWenDu() {
		return wenDu;
	}

	/** @param wenDu the temperature to store */
	public void setWenDu(Integer wenDu) {
		this.wenDu = wenDu;
	}

	/**
	 * Serializes this record: year, month and day as UTF strings, followed by
	 * the temperature as an int.
	 *
	 * @param out sink that receives the serialized fields
	 * @throws IOException if writing to {@code out} fails
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		// The field order here defines the wire format that readFields consumes
		final String[] dateParts = {this.year, this.month, this.day};
		for (String part : dateParts) {
			out.writeUTF(part);
		}
		out.writeInt(this.wenDu);
	}

	/**
	 * Restores this record from its serialized form.
	 *
	 * @param in source of the serialized fields
	 * @throws IOException if reading from {@code in} fails
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		// Fields must be consumed in exactly the order write(...) produced them
		this.year = in.readUTF();
		this.month = in.readUTF();
		this.day = in.readUTF();
		this.wenDu = in.readInt();
	}

	@Override
	public int compareTo(Weather that) {
		int result=0;
		//1 表示this>that  0 表示this=that -1 表示this

WeatherComparator:

/**
 * Sort comparator for {@link Weather} keys: year ascending, then month
 * ascending, then temperature descending, so the hottest record of each
 * year/month comes first. Year and month are compared as strings.
 */
public class WeatherComparator extends WritableComparator {

	public WeatherComparator() {
		// Deserialize keys as Weather; `true` asks the base class to create the key instances
		super(Weather.class, true);
	}

	/**
	 * @param a first key, expected to be a {@link Weather}
	 * @param b second key, expected to be a {@link Weather}
	 * @return negative, zero or positive following the ordering described on the class
	 */
	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		Weather left = (Weather) a;
		Weather right = (Weather) b;

		int byYear = left.getYear().compareTo(right.getYear());
		if (byYear != 0) {
			return byYear;
		}

		int byMonth = left.getMonth().compareTo(right.getMonth());
		if (byMonth != 0) {
			return byMonth;
		}

		// Operands swapped on purpose: higher temperatures sort first
		return right.getWenDu().compareTo(left.getWenDu());
	}
}

WeatherGroupingComparator:

/**
 * Grouping comparator for {@link Weather} keys: two keys compare as equal
 * when their year and month match, regardless of day or temperature.
 */
public class WeatherGroupingComparator extends WritableComparator {

	public WeatherGroupingComparator() {
		super(Weather.class, true);
	}

	/**
	 * @param a first key, expected to be a {@link Weather}
	 * @param b second key, expected to be a {@link Weather}
	 * @return the string comparison of the years, or of the months when the years are equal
	 */
	@Override
	public int compare(WritableComparable a, WritableComparable b) {
		Weather left = (Weather) a;
		Weather right = (Weather) b;

		int byYear = left.getYear().compareTo(right.getYear());
		// Only the year and month take part in grouping
		return byYear != 0 ? byYear : left.getMonth().compareTo(right.getMonth());
	}
}

WeatherPartitioner:

/**
 * Assigns each map output record to a partition based on the hash of the
 * key's month string, so all records of one month land in the same partition.
 */
public class WeatherPartitioner extends Partitioner<Weather, Text> {

	/**
	 * @param key map output key; only its month is used
	 * @param value map output value (unused)
	 * @param numPartitions total number of partitions
	 * @return a partition index in {@code [0, numPartitions)}
	 */
	@Override
	public int getPartition(Weather key, Text value, int numPartitions) {
		String month = key.getMonth();
		// Mask off the sign bit so a negative hashCode cannot yield a negative index
		return (month.hashCode() & Integer.MAX_VALUE) % numPartitions;
	}

}

WeatherMapper:

/**
 * Parses tab-separated input lines of the form {@code <date>\t<temperature><unit char>}
 * and emits a {@link Weather} key (year, month, day, temperature) with the
 * original line as the value. Lines that cannot be parsed are skipped and
 * counted under the "Weather" / "MalformedRecords" counter.
 */
public class WeatherMapper extends Mapper<LongWritable, Text, Weather, Text> {
	/** Output key object, reused across records instead of allocating one per line. */
	private final Weather weather = new Weather();
	/** Per-instance rather than static: SimpleDateFormat is not thread-safe. */
	private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
	/** Reused calendar; setTime(...) overwrites its state for every record. */
	private final Calendar calendar = Calendar.getInstance();

	/**
	 * @param key input key supplied by the input format (unused)
	 * @param value one input line
	 * @param context used to emit the (Weather, line) pair and to update counters
	 * @throws IOException if emitting the record fails
	 * @throws InterruptedException if emitting the record is interrupted
	 */
	@Override
	protected void map(LongWritable key, Text value, Context context)
			throws IOException, InterruptedException {
		String[] fields = value.toString().split("\t");
		if (fields.length < 2 || fields[1].isEmpty()) {
			countMalformed(context);
			return;
		}

		int wenDu;
		Date date;
		try {
			// Drop the trailing unit character before parsing the temperature
			wenDu = Integer.parseInt(fields[1].substring(0, fields[1].length() - 1));
			date = dateFormat.parse(fields[0]);
		} catch (NumberFormatException | ParseException e) {
			// One bad line should not fail the whole task; skip it and record the fact
			countMalformed(context);
			return;
		}

		calendar.setTime(date);
		int year = calendar.get(Calendar.YEAR);
		// Calendar.MONTH is zero-based (January == 0); store the calendar month 1..12
		int month = calendar.get(Calendar.MONTH) + 1;
		int day = calendar.get(Calendar.DAY_OF_MONTH);

		weather.setYear(String.valueOf(year));
		weather.setMonth(String.valueOf(month));
		weather.setDay(String.valueOf(day));
		weather.setWenDu(wenDu);

		context.write(weather, value);
	}

	/** Increments the counter that tracks skipped, unparseable input lines. */
	private void countMalformed(Context context) {
		context.getCounter("Weather", "MalformedRecords").increment(1);
	}
}

WeatherReducer:

/**
 * For each reduce group, writes the first value and then the first later
 * value whose key has a different day, i.e. at most two lines from two
 * distinct days. With keys sorted by descending temperature inside a
 * year/month group, these are the two hottest days of that month.
 */
public class WeatherReducer extends Reducer<Weather, Text, Text, NullWritable> {

	/**
	 * @param key key of the current group; its day is re-read for every value
	 * @param values original input lines belonging to this group
	 * @param context used to emit the selected lines
	 * @throws IOException if emitting a record fails
	 * @throws InterruptedException if emitting a record is interrupted
	 */
	@Override
	protected void reduce(Weather key, Iterable<Text> values, Context context)
			throws IOException, InterruptedException {
		// Day of the record already written; null until the first value is seen
		String firstDay = null;

		for (Text line : values) {
			// Relies on Hadoop updating `key` in place as the values iterator advances,
			// so key.getDay() reflects the record that `line` came from.
			String currentDay = key.getDay();

			if (firstDay == null) {
				// First value of the group
				context.write(line, NullWritable.get());
				firstDay = currentDay;
			} else if (!firstDay.equals(currentDay)) {
				// First value from a different day; nothing more is needed from this group
				context.write(line, NullWritable.get());
				break;
			}
		}
	}
}

运行结果图就不放了(o°ω°o)

你可能感兴趣的:(hadoop)