Reposted from: http://my.oschina.net/itblog/blog/275294
Overview: compute the highest temperature for each year. In every record, the first eight digits are the date and the last two digits are the temperature (a small parsing sketch follows the data listing below).
Data source:
[zhoulx]$ cat input.txt
2014010114
2014010216
2014010317
2014010410
2014010506
2012010609
2012010732
2012010812
2012010919
2012011023
2001010116
2001010212
2001010310
2001010411
2001010529
2013010619
2013010722
2013010812
2013010929
2013011023
2008010105
2008010216
2008010337
2008010414
2008010516
2007010619
2007010712
2007010812
2007010999
2007011023
2010010114
2010010216
2010010317
2010010410
2010010506
2015010649
2015010722
2015010812
2015010999
2015011023
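For example, the record 2014010114 splits into the year 2014 and the temperature 14. A minimal plain-Java sketch of that parsing (the class name is made up for this illustration):

public class RecordParseDemo {
    public static void main(String[] args) {
        String record = "2014010114"; // first 8 digits: date, last 2 digits: temperature
        String year = record.substring(0, 4); // "2014"
        int temperature = Integer.parseInt(record.substring(8)); // 14
        System.out.println(year + " -> " + temperature); // prints: 2014 -> 14
    }
}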
Output directory after the job has run:
[zhoulx]$ hadoop dfs -ls hdfs://hadoop-namenode/tmp/zhoulx/output
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.
-rw-r--r-- 3 sdp_dp hadoop 0 2016-01-19 14:59 hdfs://hadoop-namenode/tmp/zhoulx/output/_SUCCESS
-rw-r--r-- 3 sdp_dp hadoop 64 2016-01-19 14:59 hdfs://hadoop-namenode/tmp/zhoulx/output/part-r-00000
Result:
[zhoulx]$ hadoop dfs -cat hdfs://hadoop-namenode/tmp/zhoulx/output/part-r-00000
DEPRECATED: Use of this script to execute hdfs command is deprecated.
Instead use the hdfs command for it.
2001 29
2007 99
2008 37
2010 17
2012 32
2013 29
2014 17
2015 99
package test;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class Temperature {
    /**
     * The four generic type parameters are:
     * KeyIn    key of the Mapper's input: the byte offset of each line (0, 11, ...)
     * ValueIn  value of the Mapper's input: the text of the line
     * KeyOut   key of the Mapper's output: the "year" taken from the line
     * ValueOut value of the Mapper's output: the "temperature" taken from the line
     */
    static class TempMapper extends
            Mapper<LongWritable, Text, Text, IntWritable> {
        @Override
        public void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Sample output: Before Mapper: 0, 2000010115
            System.out.print("Before Mapper: " + key + ", " + value);
            String line = value.toString();
            String year = line.substring(0, 4);
            int temperature = Integer.parseInt(line.substring(8));
            context.write(new Text(year), new IntWritable(temperature));
            // Sample output: After Mapper: 2000, 15
            System.out.println(
                "======" +
                "After Mapper:" + new Text(year) + ", " + new IntWritable(temperature));
        }
    }

    /**
     * The four generic type parameters are:
     * KeyIn    key of the Reducer's input: the "year" emitted by the Mapper
     * ValueIn  value of the Reducer's input: a "temperature" emitted by the Mapper
     * KeyOut   key of the Reducer's output: each distinct "year"
     * ValueOut value of the Reducer's output: that year's maximum temperature
     */
    static class TempReducer extends
            Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> values,
                Context context) throws IOException, InterruptedException {
            int maxValue = Integer.MIN_VALUE;
            StringBuffer sb = new StringBuffer();
            // take the maximum of all temperatures for this year
            for (IntWritable value : values) {
                maxValue = Math.max(maxValue, value.get());
                sb.append(value).append(", ");
            }
            // Sample output: Before Reduce: 2000, 15, 23, 99, 12, 22,
            System.out.print("Before Reduce: " + key + ", " + sb.toString());
            context.write(key, new IntWritable(maxValue));
            // Sample output: After Reduce: 2000, 99
            System.out.println(
                "======" +
                "After Reduce: " + key + ", " + maxValue);
        }
    }

    public static void main(String[] args) throws Exception {
        // input path
        String dst = "/tmp/zhoulx/input.txt";
        // output path; it must not exist yet -- even an empty directory will fail the job
        String dstOut = "/tmp/zhoulx/output";
        Configuration hadoopConfig = new Configuration();
        // explicitly bind the hdfs:// and file:// schemes to their FileSystem implementations
        hadoopConfig.set("fs.hdfs.impl",
            org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()
        );
        hadoopConfig.set("fs.file.impl",
            org.apache.hadoop.fs.LocalFileSystem.class.getName()
        );
        Job job = new Job(hadoopConfig);
        // required when the job is packaged as a jar and run with 'hadoop jar'
        job.setJarByClass(Temperature.class);
        // input and output paths for the job
        FileInputFormat.addInputPath(job, new Path(dst));
        FileOutputFormat.setOutputPath(job, new Path(dstOut));
        // use the custom Mapper and Reducer for the two processing phases
        job.setMapperClass(TempMapper.class);
        job.setReducerClass(TempReducer.class);
        // key and value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // run the job and wait until it completes
        job.waitForCompletion(true);
        System.out.println("Finished");
    }
}
To run the job as a jar, this line is required:
job.setJarByClass(Temperature.class);
The project must be exported as a runnable JAR file.
It depends on these four jars:
commons-configuration-1.6.jar
commons-lang-2.5.jar
commons-logging-1.1.1.jar
hadoop-core-1.0.3.jar
The MANIFEST.MF contents are as follows:
Manifest-Version: 1.0
Class-Path: lib/commons-configuration-1.6.jar lib/commons-lang-2.5.jar lib/commons-logging-1.1.1.jar lib/hadoop-core-1.0.3.jar
Main-Class: test.Temperature
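Assuming the compiled classes live under a classes/ directory and the four dependency jars under a lib/ directory next to the final jar (both directory names are illustrative), the runnable jar could be built like this:

jar cfm testr.jar MANIFEST.MF -C classes .

The Class-Path entries are resolved relative to testr.jar, so the lib/ directory has to sit beside the jar when it is run.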
Both paths are HDFS paths, so the hdfs://hadoop-namenode:9000/ prefix can be omitted:
// input path
String dst = "/tmp/zhoulx/input.txt";
// output path; it must not exist yet -- even an empty directory will fail the job
String dstOut = "/tmp/zhoulx/output";
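Because the job aborts when the output path already exists, a common pattern is to delete it before submitting the job. Here is a minimal sketch using the standard FileSystem API (the helper class name is made up; it is not part of the original program):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OutputCleaner {
    // Delete the output directory if it exists, so the job can be re-run.
    public static void deleteIfExists(Configuration conf, String dir) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        Path out = new Path(dir);
        if (fs.exists(out)) {
            fs.delete(out, true); // true = delete recursively
        }
    }
}

Calling OutputCleaner.deleteIfExists(hadoopConfig, dstOut) in main() before FileOutputFormat.setOutputPath(...) would clear the previous run's result.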
To run, upload the input file, submit the jar, then view the result:
hadoop dfs -put input.txt hdfs://hadoop-namenode/tmp/zhoulx/
hadoop jar testr.jar
hadoop dfs -cat hdfs://hadoop-namenode/tmp/zhoulx/output/part-r-00000