The data set consists of temperature readings taken every 5 seconds; the task is to find the three highest temperatures for each month.
Data format: one reading per line, a timestamp followed by a tab and the temperature, e.g. 1925-11-23 15:23:33	23c.
The solution needs a custom key, a custom partitioner, a custom sort comparator, and a custom grouping comparator.
As analyzed above, the key contains year, month, and T. Both the map output and the reduce input are spilled to disk, and data may cross the network from map to reduce, so the key must be serializable.
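A few illustrative input lines (hypothetical values, following the format above):

1949-10-01 14:21:02	34c
1949-10-02 03:34:11	38c
1950-01-21 16:35:27	32c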
package com.chb.myWeather;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Custom key containing year, month, and T.
 * The map output and the reduce input are both spilled to disk, and
 * data crosses the network from map to reduce, so the key must be
 * serializable: hence WritableComparable.
 */
public class MyKey implements WritableComparable<MyKey> {
    private int year;
    private int month;
    private double t;

    // getters and setters omitted

    // Deserialization: read the fields in the same order they were written
    @Override
    public void readFields(DataInput in) throws IOException {
        this.year = in.readInt();
        this.month = in.readInt();
        this.t = in.readDouble();
    }

    // Serialization
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(year);
        out.writeInt(month);
        out.writeDouble(t);
    }

    /**
     * Orders keys by year, then month, then temperature.
     * This class is used as the map output key.
     */
    @Override
    public int compareTo(MyKey o) {
        int c1 = Integer.compare(this.getYear(), o.getYear());
        if (c1 != 0) {
            return c1;
        }
        int c2 = Integer.compare(this.getMonth(), o.getMonth());
        if (c2 != 0) {
            return c2;
        }
        return Double.compare(this.getT(), o.getT());
    }
}
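A quick way to see the write()/readFields() contract in action is a round trip through a byte buffer: the fields must be read back in exactly the order they were written. A standalone sketch (the class name is hypothetical, not part of the job):

package com.chb.myWeather;
import java.io.*;

public class SerDemo {
    public static void main(String[] args) throws IOException {
        MyKey k = new MyKey();
        k.setYear(1949);
        k.setMonth(10);
        k.setT(34.0);
        // Serialize the key into an in-memory buffer
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        k.write(new DataOutputStream(bos));
        // Deserialize into a fresh instance; fields come back in write order
        MyKey copy = new MyKey();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));
        System.out.println(copy.getYear() + "-" + copy.getMonth() + " " + copy.getT());
    }
}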
package com.chb.myWeather;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * Input:
 * By default a mapper receives one line at a time, with the byte offset
 * of the line as the key (LongWritable) and the line as the value (Text),
 * and we would have to split each line ourselves. This job uses
 * KeyValueTextInputFormat, which splits each line on the tab, so the key
 * is the timestamp and the value is the temperature string:
 * 1925-11-23 15:23:33	23c  (time as yyyy-MM-dd HH:mm:ss, tab, temperature)
 * Output:
 * The mapper emits the custom key MyKey, with the temperature as the value.
 */
public class MyMapper extends Mapper<Text, Text, MyKey, DoubleWritable> {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        try {
            Date date = sdf.parse(key.toString());
            Calendar c = Calendar.getInstance();
            c.setTime(date);
            int year = c.get(Calendar.YEAR);
            // Calendar.MONTH is zero-based, so add 1 to get the calendar month
            int month = c.get(Calendar.MONTH) + 1;
            // Strip the trailing unit character ('c') and parse the temperature
            double t = Double.parseDouble(value.toString().substring(0, value.toString().length() - 1));
            MyKey myKey = new MyKey();
            myKey.setYear(year);
            myKey.setMonth(month);
            myKey.setT(t);
            // Emit (MyKey, temperature)
            context.write(myKey, new DoubleWritable(t));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
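For a (hypothetical) input line "1949-10-01 14:21:02	34c", KeyValueTextInputFormat delivers the key "1949-10-01 14:21:02" and the value "34c", and the mapper emits MyKey(year=1949, month=10, t=34.0) with a DoubleWritable value of 34.0.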
The default partitioner assigns a record to a reducer by key.hashCode() modulo the number of reduce tasks. Here we want all of one year's data handled by a single reducer, so we partition by year instead; the reduce tasks for different years then run in parallel, which improves performance.
package com.chb.myWeather;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class MyPartitioner extends HashPartitioner<MyKey, DoubleWritable> {
    @Override
    public int getPartition(MyKey key, DoubleWritable value, int numReduceTasks) {
        // Partition by year; 1949 is the first year in the data set,
        // so consecutive years go to consecutive reducers.
        // The value type must match the map output value class (DoubleWritable).
        return (key.getYear() - 1949) % numReduceTasks;
    }
}
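As a quick sanity check on the formula: with 3 reduce tasks, as set in the driver below, consecutive years land on distinct reducers. A standalone sketch (the class name is hypothetical, not part of the job):

package com.chb.myWeather;
import org.apache.hadoop.io.DoubleWritable;

public class PartitionDemo {
    public static void main(String[] args) {
        MyPartitioner p = new MyPartitioner();
        for (int year = 1949; year <= 1952; year++) {
            MyKey k = new MyKey();
            k.setYear(year);
            // Prints: 1949 -> 0, 1950 -> 1, 1951 -> 2, 1952 -> 0
            System.out.println(year + " -> " + p.getPartition(k, new DoubleWritable(0), 3));
        }
    }
}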
By default, the sort phase of the shuffle orders records by the key's natural ordering. Since we need the top three temperatures of each month, the data has to be sorted by temperature at some point anyway; by defining a custom sort comparator that orders by temperature in descending order (within a year and month), the values arriving at the reducer are already sorted, and the reducer simply takes the first three of each group.
package com.chb.myWeather;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class MySort extends WritableComparator {
    // Register the custom key class so the comparator can instantiate it
    public MySort() {
        super(MyKey.class, true);
    }

    /**
     * Compare by year, then month, then temperature in descending order.
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MyKey k1 = (MyKey) a;
        MyKey k2 = (MyKey) b;
        int r1 = Integer.compare(k1.getYear(), k2.getYear());
        if (r1 != 0) {
            return r1;
        }
        int r2 = Integer.compare(k1.getMonth(), k2.getMonth());
        if (r2 != 0) {
            return r2;
        }
        // Negate the result for descending order by temperature
        return -Double.compare(k1.getT(), k2.getT());
    }
}
By default, records are grouped into one reduce call only when their keys compare as equal, which with our key would make every distinct (year, month, T) combination its own group. That is not what we want; we group by year and month instead, so that each reduce call sees one month's data and can simply output the first three values.
package com.chb.myWeather;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Custom grouping comparator.
 * The default grouping treats keys as equal only when the whole key matches;
 * here we group by year and month and ignore the temperature.
 */
public class MyGroup extends WritableComparator {
    public MyGroup() {
        super(MyKey.class, true);
    }

    /**
     * Grouping is by month (within a year), so only year and month are
     * compared; the temperature is deliberately ignored.
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        MyKey k1 = (MyKey) a;
        MyKey k2 = (MyKey) b;
        int r1 = Integer.compare(k1.getYear(), k2.getYear());
        if (r1 != 0) {
            return r1;
        }
        return Integer.compare(k1.getMonth(), k2.getMonth());
    }
}
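Putting the sort and the grouping together, a reducer for one year might see keys in the following order (hypothetical values); keys that agree on year and month fall into a single reduce() call, already sorted by descending temperature:

(1949, 10, 41.0)  \
(1949, 10, 38.5)   } one reduce() call for group (1949, 10)
(1949, 10, 34.2)  /
(1949, 11, 40.3)  \
(1949, 11, 39.1)  /  next reduce() call for group (1949, 11)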
package com.chb.myWeather;
import java.io.IOException;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class MyReducer extends Reducer<MyKey, DoubleWritable, Text, NullWritable> {
    @Override
    protected void reduce(MyKey key, Iterable<DoubleWritable> values, Context context)
            throws IOException, InterruptedException {
        // Grouping is by month and the custom sort orders temperatures in
        // descending order, so each group arriving here is one month's data,
        // highest temperature first: the first three values are the answer.
        int num = 0;
        for (DoubleWritable dw : values) {
            String msg = key.getYear() + "-" + key.getMonth() + "\t" + dw.toString();
            context.write(new Text(msg), NullWritable.get());
            num++;
            if (num == 3) {
                break;
            }
        }
    }
}
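Each reducer's part file would then contain lines like the following (hypothetical values):

1949-10	41.0
1949-10	38.5
1949-10	34.2
1949-11	40.3
...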
Driver code:
package com.chb.myWeather;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RunJob {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        try {
            FileSystem fs = FileSystem.get(conf);
            Job job = Job.getInstance(conf);
            job.setJarByClass(RunJob.class);
            // Plug in the custom classes
            job.setMapperClass(MyMapper.class);
            job.setPartitionerClass(MyPartitioner.class);
            job.setSortComparatorClass(MySort.class);
            job.setGroupingComparatorClass(MyGroup.class);
            job.setReducerClass(MyReducer.class);
            // Map output key/value types
            job.setMapOutputKeyClass(MyKey.class);
            job.setMapOutputValueClass(DoubleWritable.class);
            // Final (reducer) output key/value types
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);
            // Number of reduce tasks (one per year here)
            job.setNumReduceTasks(3);
            // Input format: splits each line on the tab into key and value
            job.setInputFormatClass(KeyValueTextInputFormat.class);
            // Input path
            FileInputFormat.addInputPath(job, new Path("/user/chb/input/weather"));
            // Output path; remove it first if it already exists
            Path out = new Path("/user/chb/output/weather");
            if (fs.exists(out)) {
                fs.delete(out, true);
            }
            FileOutputFormat.setOutputPath(job, out);
            boolean f = job.waitForCompletion(true);
            if (f) {
                System.out.println("Job completed.");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
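Assuming the classes are packaged into a jar (the jar name below is hypothetical), the job can be submitted with the standard hadoop jar command:

hadoop jar myWeather.jar com.chb.myWeather.RunJob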