mapreduce连接_外连接、左外连接、右外连接

外连接、左外连接、右外连接


本例实现问题2:外连接、左外连接、右外连接

问题描述:
1、 任意多个数据源的内连接
输入有两个文件,一个名为factory的输入文件包含描述工厂名和其对应地址ID的表,另一个名为address的输入文件包含描述地址名和其ID的表格。请编写一个程序输出工厂名和其对应地址的名字。

输入:输入有两个文件,第一个描述了工厂名和对应地址的ID,第二个输入文件描述了地址名和其ID。

输出:输出是一个包含工厂名和其对应地名的文件。

【数据样例】 输入:

①factory.txt:
factoryname addressID
Beijing Red Star 1
Shenzhen Thunder 3
Guangzhou Honda 2
Beijing Rising 1
Guangzhou Development Bank 2
Tencent 3
Bank of Beijing 1
Nanchang Univ 5
Shanghai Bank 10

②address.txt:
addressID addressname
1 Beijing
2 Guangzhou
3 Shenzhen
4 Xian
11 Chengdu

全外连接输出
Bank of Beijing 1 Beijing
Beijing Rising 1 Beijing
Beijing Red Star 1 Beijing
Shanghai Bank 10 NULL
NULL NULL Chengdu
Guangzhou Development Bank 2 Guangzhou
Guangzhou Honda 2 Guangzhou
Tencent 3 Shenzhen
Shenzhen Thunder 3 Shenzhen
NULL NULL Xian
Nanchang Univ 5 NULL

左外连接输出
Bank of Beijing 1 Beijing
Beijing Rising 1 Beijing
Beijing Red Star 1 Beijing
Shanghai Bank 10 NULL
Guangzhou Development Bank 2 Guangzhou
Guangzhou Honda 2 Guangzhou
Tencent 3 Shenzhen
Shenzhen Thunder 3 Shenzhen
Nanchang Univ 5 NULL

右外连接输出
Bank of Beijing 1 Beijing
Beijing Rising 1 Beijing
Beijing Red Star 1 Beijing
NULL NULL Chengdu
Guangzhou Development Bank 2 Guangzhou
Guangzhou Honda 2 Guangzhou
Tencent 3 Shenzhen
Shenzhen Thunder 3 Shenzhen
NULL NULL Xian

要求:输出文件的第一行必须是“factoryname addressID addressname”

2、选做题,上述数据如果改为左外(右外)或外连接,程序应该怎么修改
3、如果上述两个表格数据量很大,尝试改进程序(可以自己模拟数据测试)
说明: 数据连接实验可以使用基本MapReduce或者使用Hadoop DataJoin工具包来写。


Bean

/**
 * Map-output value carrying one row of either input table through the shuffle.
 *
 * <p>A row from the factory table fills {@code facName} and {@code addID};
 * a row from the address table fills {@code addID} and {@code addName}.
 * {@code type} records which table the row came from so the reducer can
 * separate the two sides of the join.
 *
 * <p>The getters/setters follow JavaBean conventions so the bean can be copied
 * by property-based utilities.
 */
public class MyBean implements Writable {

	/** Factory name. */
	private String facName;
	/** Address ID, i.e. the join key. */
	private int addID;
	/** Address name. */
	private String addName;
	/** Tag naming the source table of this row. */
	private String type;

	/** No-arg constructor, needed so the framework can instantiate the bean before {@link #readFields}. */
	public MyBean() {
	}

	/**
	 * Creates a fully populated bean.
	 *
	 * @param facName factory name
	 * @param addID   address ID (join key)
	 * @param addName address name
	 * @param type    tag naming the source table
	 */
	public MyBean(String facName, int addID, String addName, String type) {
		this.facName = facName;
		this.addID = addID;
		this.addName = addName;
		this.type = type;
	}

	/** @return the three data columns separated by tabs; {@code type} is not included */
	@Override
	public String toString() {
		return facName + "\t" + addID + "\t" + addName;
	}

	/**
	 * Serializes the fields in the order facName, addID, addName, type.
	 *
	 * <p>{@code DataOutput.writeUTF} throws {@code NullPointerException} for a
	 * {@code null} string, so unset string fields are written as the empty string.
	 *
	 * @param out sink to write to
	 * @throws IOException if the underlying stream fails
	 */
	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(nullToEmpty(facName));
		out.writeInt(addID);
		out.writeUTF(nullToEmpty(addName));
		out.writeUTF(nullToEmpty(type));
	}

	/**
	 * Restores the fields; must read in exactly the order used by {@link #write}.
	 *
	 * @param in source to read from
	 * @throws IOException if the underlying stream fails
	 */
	@Override
	public void readFields(DataInput in) throws IOException {
		this.facName = in.readUTF();
		this.addID = in.readInt();
		this.addName = in.readUTF();
		this.type = in.readUTF();
	}

	/** Maps {@code null} to {@code ""} so the value is safe to pass to {@code writeUTF}. */
	private static String nullToEmpty(String s) {
		return s == null ? "" : s;
	}

	// Getters and setters
	public String getFacName() {
		return facName;
	}
	public void setFacName(String facName) {
		this.facName = facName;
	}
	public int getAddID() {
		return addID;
	}
	public void setAddID(int addID) {
		this.addID = addID;
	}
	public String getAddName() {
		return addName;
	}
	public void setAddName(String addName) {
		this.addName = addName;
	}
	public String getType() {
		return type;
	}
	public void setType(String type) {
		this.type = type;
	}
}

MyAnyJoin类

// Outline of the job class: every body is left empty here; the members are
// written out one by one in the sections that follow.
public class MyAnyJoin {
	
	// Mapper for the factory input; emits (Text, MyBean) pairs.
	public static class FactoryMapper 
		extends Mapper<LongWritable, Text, Text, MyBean> {}
	
	// Mapper for the address input; emits (Text, MyBean) pairs.
	public static class AddressMapper 
		extends Mapper<LongWritable, Text, Text, MyBean> {}

	// Reducer that consumes (Text, MyBean) groups and writes (Text, NullWritable).
	public static class AnyJoinReducer 
		extends Reducer<Text, MyBean, Text, NullWritable> {}
	
	// Driver entry point.
	public static void main(String[] args) {}
}

Driver
设置连接类型(外连接、左外连接、右外连接)

/**
 * Driver: configures and submits the join job.
 *
 * @param args {@code args[0]} factory input path, {@code args[1]} address input path,
 *             {@code args[2]} output directory, optional {@code args[3]} join type
 *             ({@code leftOuterJoin}, {@code rightOuterJoin} or {@code allOuterJoin};
 *             defaults to {@code allOuterJoin})
 * @throws ClassNotFoundException propagated from job submission
 * @throws IOException            propagated from job setup or submission
 * @throws InterruptedException   if waiting for the job is interrupted
 */
public static void main(String[] args) throws 
		ClassNotFoundException, IOException, InterruptedException {

		if (args.length < 3) {
			System.err.println(
					"Usage: MyAnyJoin <factory input> <address input> <output dir> [joinType]");
			System.exit(2);
		}

		// The join type is read by the reducer from the job configuration.
		String joinType = args.length > 3 ? args[3] : "allOuterJoin";
		if (!joinType.equals("leftOuterJoin") && !joinType.equals("rightOuterJoin")
				&& !joinType.equals("allOuterJoin")) {
			System.err.println("Unknown joinType: " + joinType
					+ " (expected leftOuterJoin, rightOuterJoin or allOuterJoin)");
			System.exit(2);
		}

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);
		// Without this the framework cannot locate the job jar when run on a cluster.
		job.setJarByClass(FactoryMapper.class);

		job.getConfiguration().set("joinType", joinType);

		// One mapper per input file: factory.txt and address.txt.
		MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, FactoryMapper.class);
		MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, AddressMapper.class);
		job.setReducerClass(AnyJoinReducer.class);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(MyBean.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(NullWritable.class);

		FileOutputFormat.setOutputPath(job, new Path(args[2]));

		System.exit(job.waitForCompletion(true) ? 0 : 1);

	}

FactoryMapper

/**
 * Mapper for the factory table. Each data line is "factory name ... addressID":
 * the last whitespace-separated token is the address ID, everything before it
 * is the (possibly multi-word) factory name.
 *
 * <p>Emits (addressID token, bean tagged "factory"). Blank lines and the header
 * line are skipped; lines that cannot be parsed are skipped and counted instead
 * of failing the task.
 */
public static class FactoryMapper 
	extends Mapper<LongWritable, Text, Text, MyBean> {

		// Output key/value objects, reused across map() calls.
		private final Text k1 = new Text();
		private final MyBean myBean = new MyBean();

		/**
		 * Parses one line of the factory file and emits it keyed by address ID.
		 *
		 * @param key     input key supplied by the input format (unused)
		 * @param value   one line of the factory file
		 * @param context task context used to emit the record and update counters
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context) 
			throws IOException, InterruptedException {

			// trim() also removes a trailing '\r' left by Windows line endings.
			String line = value.toString().trim();
			if (line.isEmpty() || line.startsWith("factory")) {
				return; // blank line or header row
			}

			// Locate the last whitespace: it separates the name from the ID.
			int sep = line.length() - 1;
			while (sep >= 0 && !Character.isWhitespace(line.charAt(sep))) {
				sep--;
			}
			if (sep < 0) {
				// Only one token on the line: there is no factory name to join.
				context.getCounter("MyAnyJoin", "MALFORMED_FACTORY_LINES").increment(1);
				return;
			}

			String idToken = line.substring(sep + 1);
			String facName = line.substring(0, sep).trim(); // e.g. "Beijing Red Star"
			int addId;
			try {
				addId = Integer.parseInt(idToken);
			} catch (NumberFormatException e) {
				context.getCounter("MyAnyJoin", "MALFORMED_FACTORY_LINES").increment(1);
				return;
			}

			// The raw ID token is the join key, so it groups with the address side.
			k1.set(idToken);
			myBean.setFacName(facName);
			myBean.setAddID(addId);
			myBean.setAddName("");
			myBean.setType("factory");
			context.write(k1, myBean);
		}

	}

AddressMapper

/**
 * Mapper for the address table. Each data line is "addressID address name ...":
 * the first whitespace-separated token is the ID, the remainder of the line is
 * the (possibly multi-word) address name.
 *
 * <p>Emits (addressID token, bean tagged "address"). Blank lines and the header
 * line are skipped; lines that cannot be parsed are skipped and counted instead
 * of failing the task.
 */
public static class AddressMapper 
	extends Mapper<LongWritable, Text, Text, MyBean> {

		// Output key/value objects, reused across map() calls.
		private final Text k1 = new Text();
		private final MyBean myBean = new MyBean();

		/**
		 * Parses one line of the address file and emits it keyed by address ID.
		 *
		 * @param key     input key supplied by the input format (unused)
		 * @param value   one line of the address file
		 * @param context task context used to emit the record and update counters
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context) 
			throws IOException, InterruptedException {

			// trim() also removes a trailing '\r' left by Windows line endings.
			String line = value.toString().trim();
			if (line.isEmpty() || line.startsWith("addressID")) {
				return; // blank line or header row
			}

			// Locate the first whitespace: it separates the ID from the name.
			int sep = 0;
			while (sep < line.length() && !Character.isWhitespace(line.charAt(sep))) {
				sep++;
			}
			if (sep == line.length()) {
				// Only an ID on the line: there is no address name to join.
				context.getCounter("MyAnyJoin", "MALFORMED_ADDRESS_LINES").increment(1);
				return;
			}

			String idToken = line.substring(0, sep);
			// Keep the whole remainder so names such as "New York" are not truncated.
			String addName = line.substring(sep + 1).trim();
			int addId;
			try {
				addId = Integer.parseInt(idToken);
			} catch (NumberFormatException e) {
				context.getCounter("MyAnyJoin", "MALFORMED_ADDRESS_LINES").increment(1);
				return;
			}

			// The raw ID token is the join key, so it groups with the factory side.
			k1.set(idToken);
			myBean.setFacName("");
			myBean.setAddID(addId);
			myBean.setAddName(addName);
			myBean.setType("address");
			context.write(k1, myBean);
		}

	}

Reducer

// reducer
	/**
	 * Reduce-side join of factory rows and address rows sharing one address ID.
	 *
	 * <p>The join flavour is read from the {@code joinType} configuration entry:
	 * <ul>
	 *   <li>{@code leftOuterJoin}  - keep factories that have no address</li>
	 *   <li>{@code rightOuterJoin} - keep addresses that have no factory</li>
	 *   <li>{@code allOuterJoin}   - keep both</li>
	 * </ul>
	 * Matched rows are written as "facName addID addName"; a missing side is
	 * written as the literal {@code NULL}.
	 *
	 * <p>NOTE(review): the task statement asks for a header line
	 * "factoryname addressID addressname" as the first output line; this reducer
	 * does not emit one.
	 */
	public static class AnyJoinReducer 
		extends Reducer<Text, MyBean, Text, NullWritable> {

		private String joinType;
		// Derived from joinType once in setup() so reduce() needs no string compares.
		private boolean keepUnmatchedFactories;
		private boolean keepUnmatchedAddresses;
		// Reused output key; avoids allocating a Text per emitted row.
		private final Text outKey = new Text();

		/**
		 * Reads and validates the join type.
		 *
		 * @throws IllegalArgumentException if {@code joinType} is missing or not one
		 *         of the three supported values
		 */
		@Override
		protected void setup(Context context) 
			throws IOException, InterruptedException {

			this.joinType = context.getConfiguration().get("joinType");
			System.out.println("- joinType:\t" + joinType);

			if ("leftOuterJoin".equals(joinType)) {
				keepUnmatchedFactories = true;
				keepUnmatchedAddresses = false;
			} else if ("rightOuterJoin".equals(joinType)) {
				keepUnmatchedFactories = false;
				keepUnmatchedAddresses = true;
			} else if ("allOuterJoin".equals(joinType)) {
				keepUnmatchedFactories = true;
				keepUnmatchedAddresses = true;
			} else {
				// Fail fast: otherwise the job would finish "successfully" with no output.
				throw new IllegalArgumentException("Unsupported joinType: " + joinType
						+ " (expected leftOuterJoin, rightOuterJoin or allOuterJoin)");
			}
		}

		/**
		 * Joins all rows that share one address ID.
		 *
		 * @param key     the address ID token
		 * @param values  factory and address rows for that ID, in no particular order
		 * @param context task context used to emit the joined rows
		 */
		@Override
		protected void reduce(Text key, Iterable<MyBean> values, Context context)
			throws IOException, InterruptedException {

			ArrayList<MyBean> factoryBeans = new ArrayList<>();
			ArrayList<MyBean> addressBeans = new ArrayList<>();

			for (MyBean value : values) {
				// The framework reuses the value object between iterations, so each
				// row must be copied before it is stored.
				MyBean copy = new MyBean(value.getFacName(), value.getAddID(),
						value.getAddName(), value.getType());
				if ("factory".equals(copy.getType())) {
					factoryBeans.add(copy);
				} else {
					addressBeans.add(copy);
				}
			}

			if (!factoryBeans.isEmpty() && !addressBeans.isEmpty()) {
				// Both sides present: every join type emits the cross product.
				// Row order within a key is not significant (value order is not
				// guaranteed by the framework), so one loop nesting serves all types.
				for (MyBean factoryBean : factoryBeans) {
					for (MyBean addressBean : addressBeans) {
						emit(context, factoryBean.getFacName() + " " + factoryBean.getAddID() + " "
								+ addressBean.getAddName());
					}
				}
			} else if (addressBeans.isEmpty()) {
				// Factory rows without an address: kept by left and full outer joins.
				if (keepUnmatchedFactories) {
					for (MyBean factoryBean : factoryBeans) {
						emit(context, factoryBean.getFacName() + " " + factoryBean.getAddID() + " " + "NULL");
					}
				}
			} else if (keepUnmatchedAddresses) {
				// Address rows without a factory: kept by right and full outer joins.
				for (MyBean addressBean : addressBeans) {
					emit(context, "NULL NULL " + addressBean.getAddName());
				}
			}

		}

		/** Writes one output line using the shared key object. */
		private void emit(Context context, String line)
			throws IOException, InterruptedException {
			outKey.set(line);
			context.write(outKey, NullWritable.get());
		}
	}

你可能感兴趣的:(Hadoop,hadoop,mapreduce,外连接)