MultipleInputs lets a single job define multiple input sources and assign each source its own InputFormat and Mapper, so one Reduce phase can operate over several input files. The complete example:
package org.cy.pack;

import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Date;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class ReduceJoin {

    // Tags each sales record with "Sales", keyed by account ID (column 0).
    public static class SalesRecordMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] parts = value.toString().split("\t");
            context.write(new Text(parts[0]), new Text("Sales\t" + parts[1]));
        }
    }

    // Tags each account record with "Accounts", keyed by account ID (column 0).
    public static class AccountRecordMapper extends Mapper<Object, Text, Text, Text> {
        @Override
        public void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] parts = value.toString().split("\t");
            context.write(new Text(parts[0]), new Text("Accounts\t" + parts[1]));
        }
    }

    // Joins the two tagged streams: counts sales and sums amounts per account,
    // and picks up the account holder's name from the Accounts record.
    public static class ReduceJoinReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        public void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            String name = "";
            double total = 0.0;
            int count = 0;
            for (Text val : values) {
                String[] parts = val.toString().split("\t");
                if (parts[0].equals("Sales")) {
                    count++;
                    total += Float.parseFloat(parts[1]); // float parse; see the note on output precision below
                } else if (parts[0].equals("Accounts")) {
                    name = parts[1];
                }
            }
            context.write(new Text(name), new Text(String.format("%d\t%f", count, total)));
        }
    }

    @SuppressWarnings("deprecation")
    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
        Date startTime = new Date();
        System.out.println("Job started: " + startTime);

        Configuration conf = new Configuration();
        Job job = new Job(conf, "ReduceJoin");
        job.setJarByClass(ReduceJoin.class);
        job.setReducerClass(ReduceJoinReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        Path in1 = new Path(args[0]);
        Path in2 = new Path(args[1]);
        Path out = new Path(args[2]);

        // One InputFormat/Mapper pair per input path.
        MultipleInputs.addInputPath(job, in1, TextInputFormat.class, SalesRecordMapper.class);
        MultipleInputs.addInputPath(job, in2, TextInputFormat.class, AccountRecordMapper.class);
        FileOutputFormat.setOutputPath(job, out);

        // Remove any previous output directory so the job can be rerun.
        out.getFileSystem(conf).delete(out, true);

        // Run exactly once; the original called waitForCompletion twice,
        // and a Job cannot be submitted a second time.
        int flag = job.waitForCompletion(true) ? 0 : 1;

        Date endTime = new Date();
        System.out.println("Job ended: " + endTime);
        System.out.println("The job took " + (endTime.getTime() - startTime.getTime()) + " ms.");
        System.exit(flag);
    }
}
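Each mapper prefixes its output value with a source tag ("Sales" or "Accounts") so that the reducer, which receives all values for one account ID together, can tell which file each value came from. This is the standard reduce-side join pattern: the shuffle brings matching keys from both inputs to the same reduce call.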
Fields within each input line are separated by a tab character.
sales.txt fields: account ID, sale amount, timestamp
sales.txt:
001 35.99 time
002 12.49 time
004 13.42 time
003 499.99 time
001 78.95 time
002 21.99 time
002 93.45 time
001 9.99 time
Accounts.txt fields: account ID, account holder name, timestamp
Accounts.txt:
001 J time
002 AB time
003 AP time
004 NA time
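With this data, the reducer sees all tagged values for one account ID grouped together after the shuffle; for account 001, for example (value order is not guaranteed):

001 -> [ Sales\t35.99, Sales\t78.95, Sales\t9.99, Accounts\tJ ]

from which it emits the name J, the count 3, and the summed total.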
caiyong@caiyong:/opt/hadoop$ bin/hadoop fs -copyFromLocal /home/caiyong/桌面/sales.txt /
caiyong@caiyong:/opt/hadoop$ bin/hadoop fs -copyFromLocal /home/caiyong/桌面/Accounts.txt /
Arguments (sales input, accounts input, output directory):
hdfs://127.0.0.1:8020/sales.txt
hdfs://127.0.0.1:8020/Accounts.txt
hdfs://127.0.0.1:8020/ReduceJoinRes
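Packaged into a jar (the jar name below is assumed for illustration), the job can be launched along these lines:

caiyong@caiyong:/opt/hadoop$ bin/hadoop jar ReduceJoin.jar org.cy.pack.ReduceJoin hdfs://127.0.0.1:8020/sales.txt hdfs://127.0.0.1:8020/Accounts.txt hdfs://127.0.0.1:8020/ReduceJoinRes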
caiyong@caiyong:/opt/hadoop$ bin/hadoop fs -cat /ReduceJoinRes/*
J 3 124.929998
AB 3 127.929996
AP 1 499.989990
NA 1 13.420000
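The trailing digits in the totals (124.929998 rather than 124.93) come from the reducer's Float.parseFloat: each amount is rounded to float precision before being accumulated into the double total. A minimal standalone sketch (not part of the job) reproducing the effect for account 001:

public class FloatSumDemo {
    public static void main(String[] args) {
        double total = 0.0;
        for (String amount : new String[] { "35.99", "78.95", "9.99" }) {
            total += Float.parseFloat(amount); // each value rounded to float precision first
        }
        System.out.println(String.format("%f", total)); // prints 124.929998
    }
}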
Reference: Hadoop Beginner's Guide, by Garry Turkington.