本例实现问题2:外连接、左外连接、右外连接
问题描述:
1、 任意多个数据源的内连接
输入有两个文件,一个名为factory的输入文件包含描述工厂名和其对应地址ID的表,另一个名为address的输入文件包含描述地址名和其ID的表格。请编写一个程序输出工厂名和其对应地址的名字。
输入:输入有两个文件,第一个描述了工厂名和对应地址的ID,第二个输入文件描述了地址名和其ID。
输出:输出是一个包含工厂名和其对应地名的文件。
【数据样例】 输入:
①factory.txt:
factoryname addressID
Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Bank of Beijing 1
Nanchang Univ 5
Shanghai Bank 10
②address.txt:
addressID addressname
1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian
11 Chengdu
全外连接输出
Bank of Beijing 1 Beijing
Beijing Rising 1 Beijing
Beijing Red Star 1 Beijing
Shanghai Bank 10 NULL
NULL NULL Chengdu
Guangzhou Development Bank 2 Guangzhou
Guangzhou Honda 2 Guangzhou
Tencent 3 Shenzhen
Shenzhen Thunder 3 Shenzhen
NULL NULL Xian
Nanchang Univ 5 NULL
左外连接输出
Bank of Beijing 1 Beijing
Beijing Rising 1 Beijing
Beijing Red Star 1 Beijing
Shanghai Bank 10 NULL
Guangzhou Development Bank 2 Guangzhou
Guangzhou Honda 2 Guangzhou
Tencent 3 Shenzhen
Shenzhen Thunder 3 Shenzhen
Nanchang Univ 5 NULL
右外连接输出
Bank of Beijing 1 Beijing
Beijing Rising 1 Beijing
Beijing Red Star 1 Beijing
NULL NULL Chengdu
Guangzhou Development Bank 2 Guangzhou
Guangzhou Honda 2 Guangzhou
Tencent 3 Shenzhen
Shenzhen Thunder 3 Shenzhen
NULL NULL Xian
要求:输出文件的第一行必须是“factoryname addressID addressname”
2、选做题,上述数据如果改为左外(右外)或外连接,程序应该怎么修改
3、如果上述两个表格数据量很大,尝试改进程序(可以自己模拟数据测试)
说明: 数据连接实验可以使用基本MapReduce或者使用Hadoop DataJoin工具包来写。
Bean
/**
 * Record exchanged between the mappers and the join reducer.
 *
 * <p>One instance carries either a factory row (factory name + address ID) or an
 * address row (address ID + address name); {@code type} tells the two apart
 * (the mappers in this file set it to {@code "factory"} or {@code "address"}).
 */
public class MyBean implements Writable {
    private String facName;
    private int addID;
    private String addName;
    /** Tag naming the input table this record came from. */
    private String type;

    /** No-arg constructor; all String fields start as {@code null}. */
    public MyBean() {
    }

    /**
     * @param facName factory name
     * @param addID   address ID, the join key
     * @param addName address name
     * @param type    tag naming the source table
     */
    public MyBean(String facName, int addID, String addName, String type) {
        this.facName = facName;
        this.addID = addID;
        this.addName = addName;
        this.type = type;
    }

    /** Returns {@code facName}, {@code addID} and {@code addName} separated by tabs. */
    @Override
    public String toString() {
        return facName + "\t" + addID + "\t" + addName;
    }

    /**
     * Serialises the fields in the order facName, addID, addName, type.
     *
     * <p>{@link DataOutput#writeUTF} throws {@code NullPointerException} for a null
     * string, so unset String fields are written as the empty string instead of
     * aborting the task.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(nullToEmpty(facName));
        out.writeInt(addID);
        out.writeUTF(nullToEmpty(addName));
        out.writeUTF(nullToEmpty(type));
    }

    /** Restores the fields in exactly the order {@link #write} emitted them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.facName = in.readUTF();
        this.addID = in.readInt();
        this.addName = in.readUTF();
        this.type = in.readUTF();
    }

    /** Maps {@code null} to {@code ""} so the value can be passed to writeUTF. */
    private static String nullToEmpty(String s) {
        return s == null ? "" : s;
    }

    public String getFacName() {
        return facName;
    }

    public void setFacName(String facName) {
        this.facName = facName;
    }

    public int getAddID() {
        return addID;
    }

    public void setAddID(int addID) {
        this.addID = addID;
    }

    public String getAddName() {
        return addName;
    }

    public void setAddName(String addName) {
        this.addName = addName;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }
}
MyAnyJoin类
// Structural outline of the job class: the empty bodies below are placeholders
// for the full FactoryMapper, AddressMapper, AnyJoinReducer and main definitions
// listed later in this document.
public class MyAnyJoin {
// Mapper for the factory input; emits (Text key, MyBean value) pairs.
public static class FactoryMapper
extends Mapper<LongWritable, Text, Text, MyBean> {}
// Mapper for the address input; emits (Text key, MyBean value) pairs.
public static class AddressMapper
extends Mapper<LongWritable, Text, Text, MyBean> {}
// Reducer that consumes the MyBean values per key and writes Text lines
// with a NullWritable value.
public static class AnyJoinReducer
extends Reducer<Text, MyBean, Text, NullWritable> {}
// Job driver entry point.
public static void main(String[] args) {}
}
Driver
设置连接类型(外连接、左外连接、右外连接)
/**
 * Configures and runs the join job.
 *
 * @param args {@code args[0]} factory input path, {@code args[1]} address input
 *             path, {@code args[2]} output directory, and optionally
 *             {@code args[3]} the join type: {@code leftOuterJoin},
 *             {@code rightOuterJoin} or {@code allOuterJoin} (the default)
 */
public static void main(String[] args) throws
        ClassNotFoundException, IOException, InterruptedException {
    // The join type used to be switched by editing commented-out lines; it is now
    // an optional 4th argument that defaults to the previously hard-coded value.
    String joinType = args.length > 3 ? args[3] : "allOuterJoin";
    boolean knownJoinType = "leftOuterJoin".equals(joinType)
            || "rightOuterJoin".equals(joinType)
            || "allOuterJoin".equals(joinType);
    if (args.length < 3 || !knownJoinType) {
        System.err.println("Usage: MyAnyJoin <factory input> <address input> <output dir>"
                + " [leftOuterJoin|rightOuterJoin|allOuterJoin]");
        System.exit(2);
        return;
    }

    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    // Lets Hadoop locate the jar holding the mapper/reducer classes when the job
    // runs on a cluster instead of the local runner.
    job.setJarByClass(AnyJoinReducer.class);
    job.getConfiguration().set("joinType", joinType);

    // One mapper per input: FactoryMapper reads the factory file, AddressMapper the address file.
    MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, FactoryMapper.class);
    MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, AddressMapper.class);

    job.setReducerClass(AnyJoinReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(MyBean.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    FileOutputFormat.setOutputPath(job, new Path(args[2]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
FactoryMapper
/**
 * Parses lines of the factory file ({@code <factory name> <addressID>}) and emits
 * the address ID as key with a {@code "factory"}-tagged {@link MyBean} as value.
 *
 * <p>The address ID is the last whitespace-separated token; everything before it
 * is the factory name, which may itself contain spaces. Lines whose last token is
 * not an integer (the header row, blank lines, malformed rows) are skipped and
 * counted instead of failing the task. Detecting the header this way also avoids
 * dropping real factories whose name happens to start with "factory".
 */
public static class FactoryMapper
        extends Mapper<LongWritable, Text, Text, MyBean> {
    // Output key/value objects, reused across map() calls.
    private final Text k1 = new Text();
    private final MyBean myBean = new MyBean();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString().trim();
        int split = lastWhitespaceIndex(line);
        if (split < 0) {
            // Blank line or a single token: no "<name> <id>" pair to parse.
            context.getCounter("MyAnyJoin", "FACTORY_LINES_SKIPPED").increment(1);
            return;
        }

        String idToken = line.substring(split + 1);
        int addId;
        try {
            addId = Integer.parseInt(idToken);
        } catch (NumberFormatException ignored) {
            // Header row ("factoryname addressID") or a malformed record.
            context.getCounter("MyAnyJoin", "FACTORY_LINES_SKIPPED").increment(1);
            return;
        }

        k1.set(idToken);
        myBean.setFacName(line.substring(0, split).trim()); // e.g. "Beijing Red Star"
        myBean.setAddID(addId);
        myBean.setAddName("");
        myBean.setType("factory");
        context.write(k1, myBean);
    }

    /** Returns the index of the last whitespace character in {@code s}, or -1. */
    private static int lastWhitespaceIndex(String s) {
        for (int i = s.length() - 1; i >= 0; i--) {
            if (Character.isWhitespace(s.charAt(i))) {
                return i;
            }
        }
        return -1;
    }
}
AddressMapper
/**
 * Parses lines of the address file ({@code <addressID> <address name>}) and emits
 * the address ID as key with an {@code "address"}-tagged {@link MyBean} as value.
 *
 * <p>The address ID is the first whitespace-separated token and the rest of the
 * line is the address name, so multi-word names are kept whole. Lines whose first
 * token is not an integer (the header row, blank lines, malformed rows) or that
 * have no name are skipped and counted instead of failing the task.
 */
public static class AddressMapper
        extends Mapper<LongWritable, Text, Text, MyBean> {
    // Output key/value objects, reused across map() calls.
    private final Text k1 = new Text();
    private final MyBean myBean = new MyBean();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString().trim();
        int split = firstWhitespaceIndex(line);
        if (split < 0) {
            // Blank line or an ID with no name: nothing to join on.
            context.getCounter("MyAnyJoin", "ADDRESS_LINES_SKIPPED").increment(1);
            return;
        }

        String idToken = line.substring(0, split);
        int addId;
        try {
            addId = Integer.parseInt(idToken);
        } catch (NumberFormatException ignored) {
            // Header row ("addressID addressname") or a malformed record.
            context.getCounter("MyAnyJoin", "ADDRESS_LINES_SKIPPED").increment(1);
            return;
        }

        k1.set(idToken);
        myBean.setFacName("");
        myBean.setAddID(addId);
        myBean.setAddName(line.substring(split + 1).trim());
        myBean.setType("address");
        context.write(k1, myBean);
    }

    /** Returns the index of the first whitespace character in {@code s}, or -1. */
    private static int firstWhitespaceIndex(String s) {
        for (int i = 0; i < s.length(); i++) {
            if (Character.isWhitespace(s.charAt(i))) {
                return i;
            }
        }
        return -1;
    }
}
Reducer
// reducer
/**
 * Joins the factory and address records that share one address ID.
 *
 * <p>The job configuration key {@code joinType} selects the behaviour:
 * <ul>
 *   <li>{@code leftOuterJoin}: every factory row; address name is {@code NULL}
 *       when no address matches.</li>
 *   <li>{@code rightOuterJoin}: every address row; factory name and ID are
 *       {@code NULL} when no factory matches.</li>
 *   <li>{@code allOuterJoin}: both of the above.</li>
 * </ul>
 * Each output line is {@code <factory name> <addressID> <address name>} separated
 * by single spaces, written as the key with a {@link NullWritable} value.
 *
 * <p>NOTE(review): the task statement in this document asks for a header line
 * "factoryname addressID addressname" as the first output line; this reducer does
 * not emit one.
 */
public static class AnyJoinReducer
        extends Reducer<Text, MyBean, Text, NullWritable> {
    private static final String LEFT_OUTER = "leftOuterJoin";
    private static final String RIGHT_OUTER = "rightOuterJoin";
    private static final String FULL_OUTER = "allOuterJoin";
    private static final String NULL_FIELD = "NULL";

    // Reused output key; context.write is called right after each set().
    private final Text outKey = new Text();
    private String joinType;

    /**
     * Reads the join type from the job configuration.
     *
     * @throws IllegalArgumentException if {@code joinType} is missing or not one of
     *         the three supported values; previously a missing value caused a
     *         NullPointerException in reduce() and an unknown value silently
     *         produced an empty output
     */
    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        this.joinType = context.getConfiguration().get("joinType");
        System.out.println("- joinType:\t" + joinType);
        if (!LEFT_OUTER.equals(joinType)
                && !RIGHT_OUTER.equals(joinType)
                && !FULL_OUTER.equals(joinType)) {
            throw new IllegalArgumentException("Unsupported joinType: " + joinType
                    + " (expected " + LEFT_OUTER + ", " + RIGHT_OUTER + " or " + FULL_OUTER + ")");
        }
    }

    /**
     * Splits the values for one address ID into factory and address records and
     * writes the joined rows for the configured join type.
     */
    @Override
    protected void reduce(Text key, Iterable<MyBean> values, Context context)
            throws IOException, InterruptedException {
        ArrayList<MyBean> factoryBeans = new ArrayList<>();
        ArrayList<MyBean> addressBeans = new ArrayList<>();
        for (MyBean value : values) {
            // Hadoop reuses the value object while iterating, so keep a copy.
            // A direct field copy cannot fail half-way the way the reflective
            // BeanUtils copy could (its exception used to be swallowed, leaving a
            // blank bean that surfaced as "null" in the output).
            MyBean copy = new MyBean(value.getFacName(), value.getAddID(),
                    value.getAddName(), value.getType());
            if ("factory".equals(copy.getType())) {
                factoryBeans.add(copy);
            } else {
                addressBeans.add(copy);
            }
        }

        if (!factoryBeans.isEmpty() && !addressBeans.isEmpty()) {
            // Matched key: every join type emits the cross product. The right join
            // iterates addresses in the outer loop to keep the original row order.
            if (RIGHT_OUTER.equals(joinType)) {
                for (MyBean addressBean : addressBeans) {
                    for (MyBean factoryBean : factoryBeans) {
                        emit(context, factoryBean.getFacName(),
                                String.valueOf(factoryBean.getAddID()), addressBean.getAddName());
                    }
                }
            } else {
                for (MyBean factoryBean : factoryBeans) {
                    for (MyBean addressBean : addressBeans) {
                        emit(context, factoryBean.getFacName(),
                                String.valueOf(factoryBean.getAddID()), addressBean.getAddName());
                    }
                }
            }
        } else if (addressBeans.isEmpty()) {
            // Factories without an address: kept by the left and full outer joins.
            if (!RIGHT_OUTER.equals(joinType)) {
                for (MyBean factoryBean : factoryBeans) {
                    emit(context, factoryBean.getFacName(),
                            String.valueOf(factoryBean.getAddID()), NULL_FIELD);
                }
            }
        } else {
            // Addresses without a factory: kept by the right and full outer joins.
            if (!LEFT_OUTER.equals(joinType)) {
                for (MyBean addressBean : addressBeans) {
                    emit(context, NULL_FIELD, NULL_FIELD, addressBean.getAddName());
                }
            }
        }
    }

    /** Writes one space-separated output row. */
    private void emit(Context context, String facName, String addId, String addName)
            throws IOException, InterruptedException {
        outKey.set(facName + " " + addId + " " + addName);
        context.write(outKey, NullWritable.get());
    }
}