Source code download: http://download.csdn.net/detail/huhui_bj/5645641
opencsv download: http://download.csdn.net/detail/huhui_bj/5645661
Earthquake data download: http://download.csdn.net/detail/huhui_bj/5645685
A few sample records from the earthquake CSV file (the columns are: date, time, latitude (°), longitude (°), depth (km), magnitude type, magnitude, event type, reference place name):

日期,时间,纬度(°),经度(°),深度(km),震级类型,震级值,事件类型,参考地名
2013-06-25,06:04:13.0,10.70,-42.60,10,Ms,6.5,eq,中大西洋海岭北部
2013-06-24,14:34:48.7,44.33,84.10,6,Ms,4.1,eq,新疆维吾尔自治区塔城地区乌苏市
2013-06-24,13:02:01.9,44.31,84.17,8,Ms,4.3,eq,新疆维吾尔自治区塔城地区乌苏市
2013-06-24,11:44:20.8,39.42,95.50,6,Ms,3.4,eq,甘肃省酒泉市肃北蒙古族自治县
/**
 * Test reading one line of earthquake data from the CSV file.
 */
package com.eq.test;

import java.io.IOException;

import au.com.bytecode.opencsv.CSVParser;

public class CSVProcessingTest {

    // One line of data copied from the CSV file
    private final String LINE = "2013-06-23,22:31:30.3,24.70,99.21,5,ML,1.4,eq,云南施甸";

    public void testReadingOneLine() {
        String[] lines = null;
        try {
            // Parse the line with opencsv
            lines = new CSVParser().parseLine(LINE);
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Print the parsed fields
        for (String line : lines) {
            System.out.println(line);
        }
    }

    public static void main(String[] args) {
        CSVProcessingTest csvProcessingTest = new CSVProcessingTest();
        csvProcessingTest.testReadingOneLine();
    }
}

opencsv makes handling comma-separated values very simple: the parser just returns an array of Strings, one element per column.
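opencsv also handles quoted fields correctly, which a naive String.split(",") would break on. A minimal sketch of that (the quoted place name below is invented for illustration and does not come from the real data file):

package com.eq.test;

import au.com.bytecode.opencsv.CSVParser;

public class CSVQuotedFieldTest {
    public static void main(String[] args) throws Exception {
        // A field wrapped in double quotes may itself contain commas;
        // CSVParser keeps it as a single field instead of splitting it.
        String quoted = "2013-06-25,06:04:13.0,10.70,-42.60,10,Ms,6.5,eq,\"中大西洋海岭, 北部\"";
        String[] fields = new CSVParser().parseLine(quoted);
        System.out.println(fields.length); // prints 9
        System.out.println(fields[8]);     // prints: 中大西洋海岭, 北部
    }
}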
/**
 * Mapper for counting earthquakes per region.
 */
package com.eq.map;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import au.com.bytecode.opencsv.CSVParser;

public class EarthQuakeLocationMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        if (key.get() > 0) {
            // Parse the CSV record and emit (place name, 1)
            String[] lines = new CSVParser().parseLine(value.toString());
            context.write(new Text(lines[8]), new IntWritable(1));
        }
    }
}

The map function is very simple. First we check that the byte offset (the key object) is greater than 0, which skips the header line of the CSV file. Then we take the place name from the record and use it as the output key. In other words, we emit a count of 1 for each place name; when the reduce implementation below is invoked, it receives a key together with a list of values. In this example the keys are place names and their values look like this:
"四川汶川":[1,1,1,1,1,1,1,1] "甘肃天祝":[1,1,1,1] "广西平果":[1,1,1,1,1,1]注意:context.write(new Text(lines[8]), new IntWritable(1))构建了如上面所示的逻辑关系集合。context是一个保存各种信息的hadoop的数据结构。context将被传递到reduce实现,reduce获取这些值为1的值然后叠加起来,算出总数。因此,一个reduce的输出视图将是这样的:
"四川汶川":[8] "甘肃天祝":[4] "广西平果":[6]
package com.eq.reduce;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class EarthQuakeLocationReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        // Every value is 1, so counting the elements gives the total
        for (IntWritable value : values) {
            count++;
        }
        // Only emit places with 10 or more earthquakes
        if (count >= 10) {
            context.write(key, new IntWritable(count));
        }
    }
}

The reduce implementation is also very simple. What is passed into reduce is effectively a collection of values; all we do is add them up and then write out a new key/value pair representing the place and the count (in this version only places with 10 or more earthquakes are written out).
"四川汶川":[1,1,1,1,1,1,1,1] --> "四川汶川":8
/**
 * Defines a Hadoop job that counts earthquakes per region.
 */
package com.eq.job;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.eq.map.EarthQuakeLocationMapper;
import com.eq.reduce.EarthQuakeLocationReducer;

public class EarthQuakesLocationJob {

    public static void main(String[] args) throws Throwable {
        Job job = new Job();
        job.setJarByClass(EarthQuakesLocationJob.class);
        // Path of the CSV input file and of the output directory on HDFS
        FileInputFormat.addInputPath(job, new Path("hdfs://localhost:9000/input/earthquake_data.csv"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/output"));

        job.setMapperClass(EarthQuakeLocationMapper.class);
        job.setReducerClass(EarthQuakeLocationReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
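One practical note: Hadoop refuses to start a job whose output directory already exists, so hdfs://localhost:9000/output has to be removed between runs (both jobs in this article write to the same path). A minimal sketch of doing that programmatically; the class and method names here are my own and are not part of the downloadable source:

package com.eq.job;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsOutputCleaner {

    /** Deletes the given HDFS directory if it exists, so the job can be re-run. */
    public static void clean(Configuration conf, String dir) throws Exception {
        Path out = new Path(dir);
        FileSystem fs = out.getFileSystem(conf);
        if (fs.exists(out)) {
            fs.delete(out, true); // true = delete recursively
        }
    }
}

A call such as HdfsOutputCleaner.clean(job.getConfiguration(), "hdfs://localhost:9000/output") placed before FileOutputFormat.setOutputPath(...) would make the jobs rerunnable without manual cleanup.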
/**
 * Implementation of the map function: counts earthquakes per date.
 */
package com.eq.map;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import au.com.bytecode.opencsv.CSVParser;

public class EarthQuakesPerDateMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException {
        if (key.get() > 0) {
            try {
                // Parse the CSV record
                CSVParser parser = new CSVParser();
                String[] lines = parser.parseLine(value.toString());
                // The first column is the date; emit (date, 1)
                String dtstr = lines[0];
                context.write(new Text(dtstr), new IntWritable(1));
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
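This mapper emits the raw yyyy-MM-dd date string from column 0, so the counts come out per day. As a hypothetical variation (an assumption of mine, not part of the source), the key could be normalized with SimpleDateFormat to count per month instead:

/**
 * Hypothetical variation: counts earthquakes per month instead of per day.
 * The yyyy-MM-dd input format is assumed from the sample data above.
 */
package com.eq.map;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import au.com.bytecode.opencsv.CSVParser;

public class EarthQuakesPerMonthMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException {
        if (key.get() > 0) { // skip the CSV header line
            try {
                String[] lines = new CSVParser().parseLine(value.toString());
                // Parse the date column and re-format it as yyyy-MM so that
                // all records from the same month share one key.
                Date date = new SimpleDateFormat("yyyy-MM-dd").parse(lines[0]);
                String month = new SimpleDateFormat("yyyy-MM").format(date);
                context.write(new Text(month), new IntWritable(1));
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}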
package com.eq.reduce;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class EarthQuakesPerDateReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        // Each value is 1, so the number of elements is the number of earthquakes on that date
        for (IntWritable value : values) {
            count++;
        }
        context.write(key, new IntWritable(count));
    }
}
/**
 * Defines a Hadoop job that counts earthquakes per date.
 */
package com.eq.job;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.eq.map.EarthQuakesPerDateMapper;
import com.eq.reduce.EarthQuakesPerDateReducer;

public class EarthQuakesPerDayJob {

    public static void main(String[] args) throws Throwable {
        Job job = new Job();
        job.setJarByClass(EarthQuakesPerDayJob.class);
        // Path of the CSV input file and of the output directory on HDFS
        FileInputFormat.addInputPath(job, new Path("hdfs://localhost:9000/input/all_month.csv"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://localhost:9000/output"));

        job.setMapperClass(EarthQuakesPerDateMapper.class);
        job.setReducerClass(EarthQuakesPerDateReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

This code is very similar to the previous job, so it is not explained again here.