import java.io.*;
import org.apache.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
public class MaxTemperature{
public static void main(String[]args) throws IOException{
//args=new String[2];
//args[0]="/home/yukjin/Downsload/1901";
//args[1]="output";
if(args.length!=2){
System.err.println("Usage:MaxTemperature
System.exit(-1);
}
JobConf conf=new JobConf(MaxTemperature.class);//JobConf指定作业执行规范,可以使用它控制整个作业的运行
conf.setJobName("Max temperature");
FileInputFormat.addInputPath(conf,new Path(args[0]));//指定文件输入路径,路径既可以是单个文件也可以是某个目录,也可多次调用实现多路径输入
FileOutputFormat.setOutputPath(conf,new Path(args[1]));//指定文件输出路径,执行前该路径不能存在,负责hadoop拒绝运行该任务
conf.setMapperClass(MaxTemperatureMapper.class);
conf.setReducerClass(MaxTemperatureReducer.class);
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(IntWritable.class);
JobClient.runJob(conf);
}
}
import java.io.*;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
public class MaxTemperatureMapper extends MapReduceBase implements Mapper{
private static final int MISSING=9999;
public void map(LongWritable key,Text value,OutputCollectoroutput,Reporter reporter)throws IOException{
String line=value.toString();//Text类型转换为String类型
String year=line.substring(15,19);//截取年份
int airtemperature;
if(line.charAt(87)=='+'){
airtemperature=Integer.parseInt(line.substring(88,92));
}
else{
airtemperature=Integer.parseInt(line.substring(87,92));
}
String quality=line.substring(92,93);
if(airtemperature!=MISSING&&quality.matches("[01459]")){
output.collect(new Text(year),new IntWritable(airtemperature));
}
}
}
Mapper接口是一个泛型类型,需要指定4个类型参数,分别指定map函数的输入键,输入值,输出键和输出值的类型。此例中输入键为LongWritable(长整型偏移量),输入值为Text(一行文本),输出键为Text(年份),输出值为IntWritable(气温)。
Hadoop自身提供了一套可优化网络序列化传输的基本类型,而不直接使用Java的基本类型,这些类型可以在org.apache.hadoop.io包中找到。
import java.io.*;
import java.util.Iterator;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
public class MaxTemperatureReducer extends MapReduceBase implements Reducer{
public void reduce(Text key,Iteratorvalues,OutputCollectoroutput,Reporter reporter)throws IOException{
int maxValue=Integer.MIN_VALUE;
while(values.hasNext()){
maxValue=Math.max(maxValue,values.next().get());
}
output.collect(key,new IntWritable(maxValue));
}
}
Reducer接口同样也是泛型类型,需要指定4个类型参数,分别指定reduce函数的输入键,输入值,输出键以及输出值的类型。reduce函数的输入键值类型必须与map函数的输出键值类型匹配。