今天手写一个hadoop的案例:
记录特点
每年
每个月
最高
2天
1天多条记录?
进一步思考
年月分组
温度降序
key中要包含时间和温度!
MR原语:相同的key分到一组
通过GroupingComparator设置分组规则
自定义数据类型Weather
包含时间
包含温度
自定义排序比较规则
自定义分组比较
年月相同被视为相同的key
那么reduce迭代时,相同年月的记录有可能是同一天的,reduce中需要判断是否同一天
注意OOM
数据量很大
全量数据可以切分成最少按一个月份的数据量进行判断
这种业务场景可以设置多个reduce
通过实现Partitioner
MainClass:
/**
 * Configures and submits the "weather" MapReduce job.
 *
 * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
 * @throws Exception if the job cannot be configured or submitted
 */
public static void main(String[] args) throws Exception {
    // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
        System.err.println("Usage: MainClass <input path> <output path>");
        System.exit(2);
    }

    // Load the configuration files; site-specific settings override the defaults.
    Configuration conf = new Configuration(true);
    // Create the job from the configuration.
    Job job = Job.getInstance(conf);
    // Class used to locate the jar that contains the job.
    job.setJarByClass(MainClass.class);
    job.setJobName("weather");

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Number of reduce tasks (the default is 1).
    job.setNumReduceTasks(2);

    // Key/value types emitted by the mapper.
    job.setMapOutputKeyClass(Weather.class);
    job.setMapOutputValueClass(Text.class);

    // Partitioner that spreads the keys over the reduce tasks.
    job.setPartitionerClass(WeatherPartitioner.class);
    // Sort comparator for map output keys; without it the key type's own ordering is used.
    job.setSortComparatorClass(WeatherComparator.class);
    // Grouping comparator: keys with the same year and month form one reduce group.
    // The sort rule differs from the grouping rule, which is what makes this a secondary sort.
    job.setGroupingComparatorClass(WeatherGroupingComparator.class);

    job.setMapperClass(WeatherMapper.class);
    job.setReducerClass(WeatherReducer.class);

    // Propagate job success/failure to the caller through the process exit code.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
Weather:
public class Weather implements WritableComparable{
private String year;
private String month;
private String day;
private Integer wenDu;
/** Returns the year component of this key, as a string. */
public String getYear() {
return year;
}
/** Sets the year component of this key. */
public void setYear(String year) {
this.year = year;
}
/** Returns the month component of this key, as a string. */
public String getMonth() {
return month;
}
/** Sets the month component of this key. */
public void setMonth(String month) {
this.month = month;
}
/** Returns the day component of this key, as a string. */
public String getDay() {
return day;
}
/** Sets the day component of this key. */
public void setDay(String day) {
this.day = day;
}
/** Returns the temperature ("wenDu") stored in this key. */
public Integer getWenDu() {
return wenDu;
}
/** Sets the temperature ("wenDu") stored in this key. */
public void setWenDu(Integer wenDu) {
this.wenDu = wenDu;
}
/**
 * Serializes this object to {@code out}: year, month and day as UTF strings,
 * followed by the temperature as an int.
 *
 * @param out destination of the serialized fields
 * @throws IOException if writing to {@code out} fails
 */
@Override
public void write(DataOutput out) throws IOException {
// Write out the state held by this object; readFields must read in this same order.
out.writeUTF(year);
out.writeUTF(month);
out.writeUTF(day);
out.writeInt(wenDu);
}
/**
 * Restores this object's state from {@code in}.
 *
 * @param in source of the serialized fields
 * @throws IOException if reading from {@code in} fails
 */
@Override
public void readFields(DataInput in) throws IOException {
    // Field order mirrors write(): year, month, day, then temperature.
    this.year = in.readUTF();
    this.month = in.readUTF();
    this.day = in.readUTF();
    this.wenDu = in.readInt();
}
@Override
public int compareTo(Weather that) {
int result=0;
//1 表示this>that 0 表示this=that -1 表示this
WeatherComparator:
/**
 * Sort comparator for {@code Weather} keys: orders by year, then by month
 * (both compared as strings), then by temperature from highest to lowest.
 */
public class WeatherComparator extends WritableComparator {

    /**
     * Registers {@code Weather} as the key type; {@code true} asks the parent
     * to instantiate {@code Weather} objects for the keys it reads.
     */
    public WeatherComparator() {
        super(Weather.class, true);
    }

    /**
     * Compares two {@code Weather} keys.
     *
     * @return negative, zero or positive as {@code a} sorts before, with, or after {@code b}
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Weather left = (Weather) a;
        Weather right = (Weather) b;

        int byYear = left.getYear().compareTo(right.getYear());
        if (byYear != 0) {
            return byYear;
        }

        int byMonth = left.getMonth().compareTo(right.getMonth());
        if (byMonth != 0) {
            return byMonth;
        }

        // Operands are swapped on purpose: higher temperatures sort first.
        return right.getWenDu().compareTo(left.getWenDu());
    }
}
WeatherGroupingComparator:
/**
 * Grouping comparator for {@code Weather} keys: two keys compare as equal
 * when their year and month match, regardless of day and temperature.
 */
public class WeatherGroupingComparator extends WritableComparator {

    /** Registers {@code Weather} as the key type and lets the parent create instances. */
    public WeatherGroupingComparator() {
        super(Weather.class, true);
    }

    /**
     * Compares two keys by year and then by month (both as strings).
     *
     * @return zero when both keys share the same year and month
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        Weather left = (Weather) a;
        Weather right = (Weather) b;

        int byYear = left.getYear().compareTo(right.getYear());
        // A group must only contain records of the same year and month.
        return byYear != 0 ? byYear : left.getMonth().compareTo(right.getMonth());
    }
}
WeatherPartitioner:
/**
 * Assigns each {@code Weather} key to a partition based on its month, so
 * that all keys with the same month are routed to the same partition.
 *
 * <p>The type arguments are required: against the raw {@code Partitioner}
 * type, {@code getPartition(Weather, Text, int)} does not override anything
 * and the {@code @Override} annotation fails to compile.
 */
public class WeatherPartitioner extends Partitioner<Weather, Text> {

    /**
     * @param key           map output key whose month selects the partition
     * @param value         map output value (not used)
     * @param numPartitions total number of partitions
     * @return a partition index in {@code [0, numPartitions)}
     */
    @Override
    public int getPartition(Weather key, Text value, int numPartitions) {
        // Mask off the sign bit so a negative hash code cannot yield a negative index.
        return (key.getMonth().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
WeatherMapper:
/**
 * Parses tab-separated lines of the form {@code <date>\t<temperature><unit char>}
 * and emits a {@code Weather} key (year, month, day, temperature) together
 * with the original line as the value.
 *
 * <p>Lines that cannot be parsed are skipped and counted in the
 * {@code WeatherMapper} counter group instead of failing the task.
 */
public class WeatherMapper extends Mapper<LongWritable, Text, Weather, Text> {

    private static final String COUNTER_GROUP = "WeatherMapper";

    /** Reused output key; its fields are overwritten for every record. */
    private final Weather weather = new Weather();

    /** Per-instance formatter: SimpleDateFormat is not thread-safe, so it is not shared statically. */
    private final SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");

    /** Reused calendar; setTime() replaces its state for every record. */
    private final Calendar calendar = Calendar.getInstance();

    /**
     * @param key     input key supplied by the input format (not used)
     * @param value   one input line
     * @param context used to emit the (Weather, line) pair and to update counters
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t");
        // Blank or truncated lines have no temperature column to read.
        if (fields.length < 2 || fields[1].isEmpty()) {
            context.getCounter(COUNTER_GROUP, "MALFORMED_LINE").increment(1);
            return;
        }

        int wenDu;
        try {
            // Drop the trailing character of the temperature field before parsing the number.
            wenDu = Integer.parseInt(fields[1].substring(0, fields[1].length() - 1));
        } catch (NumberFormatException e) {
            context.getCounter(COUNTER_GROUP, "MALFORMED_TEMPERATURE").increment(1);
            return;
        }

        Date date;
        try {
            date = dateFormat.parse(fields[0]);
        } catch (ParseException e) {
            context.getCounter(COUNTER_GROUP, "MALFORMED_DATE").increment(1);
            return;
        }

        calendar.setTime(date);
        weather.setYear(String.valueOf(calendar.get(Calendar.YEAR)));
        // Calendar.MONTH is zero-based (January == 0); store the calendar month 1..12.
        weather.setMonth(String.valueOf(calendar.get(Calendar.MONTH) + 1));
        weather.setDay(String.valueOf(calendar.get(Calendar.DAY_OF_MONTH)));
        weather.setWenDu(wenDu);
        context.write(weather, value);
    }
}
WeatherReducer:
/**
 * For each reduce group, writes the first record and then the first later
 * record whose day differs from it, i.e. at most two records from two
 * distinct days.
 *
 * <p>The type arguments are required: with the raw {@code Reducer} and
 * {@code Iterable} types the {@code @Override} and the assignment of the
 * iterated element to a {@code Text} do not compile.
 */
public class WeatherReducer extends Reducer<Weather, Text, Text, NullWritable> {

    /**
     * @param key     key of the record currently being iterated
     * @param values  original input lines belonging to this group
     * @param context used to emit the selected lines
     */
    @Override
    protected void reduce(Weather key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        String firstDay = null;
        for (Text line : values) {
            // Read the day inside the loop: this logic relies on the framework
            // updating `key` to match the value currently being iterated.
            String currentDay = key.getDay();
            if (firstDay == null) {
                // First record of the group.
                context.write(line, NullWritable.get());
                firstDay = currentDay; // remember which day has already been emitted
            } else if (!firstDay.equals(currentDay)) {
                // First record from a different day; nothing more to emit after it.
                context.write(line, NullWritable.get());
                break;
            }
        }
    }
}
运行结果图就不放了(o°ω°o)