本文以订单和商品演示如何实现left join。
订单数据表t_order:
| id | date | pid | amount |
| --- | --- | --- | --- |
| 1001 | 20150710 | P0001 | 2 |
| 1002 | 20150710 | P0001 | 3 |
| 1002 | 20150710 | P0002 | 3 |
商品信息表t_product
| id | pname | category_id | price |
| --- | --- | --- | --- |
| P0001 | 小米5 | 1000 | 2000 |
| P0002 | 锤子T1 | 1000 | 3000 |
实现类似如下sql的left join效果:
select a.id,a.date,b.pname,b.category_id,b.price from t_order a left join t_product b on a.pid = b.id
将订单表的pid和商品表的id作为map阶段的k2;为了得到上面sql的输出效果,v2使用我们自定义的OrderJoinBean,它封装了sql输出所需的字段,也就是同时封装了两张表的字段。
在map阶段,通过上下文对象context获取当前map读取的文件名,并以文件名作为判断条件,将订单表或商品表的信息封装到OrderJoinBean中作为v2输出。在reduce阶段,相同key的数据会进入同一个reduceTask并被合并,因此相同k2对应的v2是一个OrderJoinBean的集合,其中既有订单bean也有商品bean;遍历该集合,把订单和商品的字段组合成完整的OrderJoinBean后输出。
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/**
 * Map side of a reduce-side join between order records and product records.
 *
 * <p>Both inputs are comma-separated text lines. The name of the file backing the current
 * input split decides how a line is interpreted: lines from {@code orders.txt} are read as
 * orders ({@code id,date,pid,amount}); lines from any other file are read as products
 * ({@code id,pname,category_id,price}). Every record is emitted under the product id, so an
 * order and the product it refers to share the same output key.
 */
public class OrderJoinMapper extends Mapper<LongWritable, Text, Text, OrderJoinBean> {
    /** Name of the input file that holds order records; any other file is product data. */
    private static final String ORDER_FILE_NAME = "orders.txt";

    /** Number of comma-separated fields both record layouts are indexed up to. */
    private static final int FIELD_COUNT = 4;

    // Reused for every record to avoid a per-record allocation. The split is a FileSplit,
    // i.e. a single file, so only one of the two field groups is ever populated here.
    private final OrderJoinBean orderJoinBean = new OrderJoinBean();
    private final Text outKey = new Text();

    /** True when the split being read comes from the order file. */
    private boolean orderInput;

    /**
     * Resolves once per task which kind of file is being read, instead of repeating the
     * lookup (and the log line) for every record.
     *
     * @param context task context providing the current input split
     * @throws ClassCastException if the input split is not a {@link FileSplit}
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        FileSplit inputSplit = (FileSplit) context.getInputSplit();
        String fileName = inputSplit.getPath().getName();
        System.out.println("当前指定的文件名称是:" + fileName);
        orderInput = ORDER_FILE_NAME.equals(fileName);
    }

    /**
     * Parses one input line and emits it keyed by product id.
     *
     * @param key byte offset of the line within the file (unused)
     * @param value one comma-separated record
     * @param context sink for the {@code (product id, bean)} pair
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split(",");
        if (fields.length < FIELD_COUNT) {
            // A blank or truncated line would otherwise fail the whole task with an
            // ArrayIndexOutOfBoundsException; skip it and make the skip visible in counters.
            context.getCounter("OrderJoin", "MALFORMED_LINES").increment(1);
            return;
        }

        if (orderInput) {
            // Order record: id,date,pid,amount -> keyed by pid.
            orderJoinBean.setId(fields[0]);
            orderJoinBean.setDate(fields[1]);
            orderJoinBean.setPid(fields[2]);
            orderJoinBean.setAmount(fields[3]);
            outKey.set(fields[2]);
        } else {
            // Product record: id,pname,category_id,price -> keyed by its own id.
            // The bean's id is deliberately left unset for product records.
            orderJoinBean.setName(fields[1]);
            orderJoinBean.setCategoryId(fields[2]);
            orderJoinBean.setPrice(fields[3]);
            outKey.set(fields[0]);
        }
        context.write(outKey, orderJoinBean);
    }
}
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Reduce side of the order/product join.
 *
 * <p>All beans that share one product id arrive in a single {@code reduce} call. They are
 * separated into order beans and product beans, then every order is written out once per
 * matching product with the product's name, category id and price filled in. An order with
 * no matching product is still written once, with the product fields left unset (left-join
 * behaviour). Products without any order produce no output.
 */
public class OrderJoinReduce extends Reducer<Text, OrderJoinBean, OrderJoinBean, NullWritable> {

    /**
     * Joins the orders and products grouped under one product id.
     *
     * @param key the product id shared by all {@code values}
     * @param values order beans and product beans emitted for this product id
     * @param context sink for the joined beans; the output value is always {@link NullWritable}
     */
    @Override
    protected void reduce(Text key, Iterable<OrderJoinBean> values, Context context)
            throws IOException, InterruptedException {
        List<OrderJoinBean> orders = new ArrayList<>();
        List<OrderJoinBean> products = new ArrayList<>();

        for (OrderJoinBean value : values) {
            // Do not store `value` itself: Hadoop may hand back the same instance on every
            // iteration, so each record is copied field by field into a fresh bean.
            OrderJoinBean copy = new OrderJoinBean();
            // Only order records carry an id. The literal "null" is also treated as missing;
            // presumably that is how an unset id comes back from the bean's serialization
            // (verify against OrderJoinBean).
            if (value.getId() != null && !"null".equals(value.getId())) {
                copy.setId(value.getId());
                copy.setDate(value.getDate());
                copy.setPid(value.getPid());
                copy.setAmount(value.getAmount());
                orders.add(copy);
            } else {
                copy.setName(value.getName());
                copy.setCategoryId(value.getCategoryId());
                copy.setPrice(value.getPrice());
                products.add(copy);
            }
        }

        for (OrderJoinBean order : orders) {
            if (products.isEmpty()) {
                // Left join: keep the order even though no product matched.
                context.write(order, NullWritable.get());
                continue;
            }
            for (OrderJoinBean product : products) {
                // The order bean is overwritten and written once per matching product.
                order.setName(product.getName());
                order.setCategoryId(product.getCategoryId());
                order.setPrice(product.getPrice());
                context.write(order, NullWritable.get());
            }
        }
    }
}
// Tell the two input files apart by the name of the file being read.
FileSplit inputSplit = (FileSplit) context.getInputSplit();// the input split of the current task
String fileName = inputSplit.getPath().getName();
System.out.println("当前指定的文件名称是:"+fileName);