Joining two datasets, data1 and data2, on a key is a very common requirement.
If the data volume is small, the join can be done entirely in memory; with large volumes, however, an in-memory join runs into OOM (OutOfMemory) problems. In that case we can use MapReduce to perform the join over big data.
Source data
Product table (product), sample:
Header: pid	pname
Order table (order), sample:
Header: id	pid	amount
Expected result
Merge the product information into the order data, matching on the product id (pid).
Header: id	pid	amount	pname
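For illustration only (the sample values below are assumed; they match the examples quoted in the reducer comments later in this section):
product:
01	小米
order:
1001	01	1
1004	01	4
expected joined output:
1001	01	1	小米
1004	01	4	小米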
Implementing the join on the Reducer side
1) How the Reduce-side join works:
Map side: tag each key/value pair according to which table (file) it came from, so that records from different sources can be told apart. Then emit the join field as the key and the remaining fields plus the tag as the value.
Reduce side: records arriving at the reducer are already grouped by the join field (the key). Within each group we only need to separate the records that came from different files (using the tag added in the map phase) and then merge them. A small worked illustration follows point 2).
2) Drawbacks of the Reduce-side join:
The Reduce Join exists because the input data is split up: each Map Task processes only part of the data and cannot see all the join fields it needs, i.e. records with the same key may end up in different map tasks. We therefore lean on the MapReduce framework itself: it partitions by the join key, so all records sharing a join key are brought together and processed in one place. That is exactly what the Reduce Join does.
The drawback is equally obvious: a large amount of data has to be moved in the shuffle phase between Map and Reduce, which is inefficient. And because the actual merging happens in the Reduce phase, the reducers carry most of the load while the map nodes do comparatively little work, so resource utilization is poor and the Reduce phase is very prone to data skew.
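To make the flow in 1) concrete: with the sample rows above, each mapper tags its records by source and keys them by pid, and the shuffle then delivers all records for one pid to the same reduce call (values shown schematically):
map output (key = pid, value = tagged record):
(01, O: orderId=1001, amount=1)
(01, O: orderId=1004, amount=4)
(01, P: pname=小米)
reduce input group for key 01, merged into the joined output:
1001	01	1	小米
1004	01	4	小米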
Bean class for the merged product/order record, example code:
package com.oracle.mrexample.f.reducejoin;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class ComboBean implements Writable{
//dataType marks the record source: "P" = product table, "O" = order table
private String orderId = "";
private String amount = "";
private String productId = "";
private String productName = "";
private String dataType = "";
public String getOrderId() {
return orderId;
}
public void setOrderId(String orderId) {
this.orderId = orderId;
}
public String getAmount() {
return amount;
}
public void setAmount(String amount) {
this.amount = amount;
}
public String getProductId() {
return productId;
}
public void setProductId(String productId) {
this.productId = productId;
}
public String getProductName() {
return productName;
}
public void setProductName(String productName) {
this.productName = productName;
}
public String getDataType() {
return dataType;
}
public void setDataType(String dataType) {
this.dataType = dataType;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(this.orderId);
out.writeUTF(this.amount);
out.writeUTF(this.productId);
out.writeUTF(this.productName);
out.writeUTF(this.dataType);
}
@Override
public void readFields(DataInput in) throws IOException {
this.orderId = in.readUTF();
this.amount = in.readUTF();
this.productId = in.readUTF();
this.productName = in.readUTF();
this.dataType = in.readUTF();
}
@Override
public String toString() {
return orderId+"\t"+productId+"\t"+amount+"\t"+productName;
}
}
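The only real contract here is that readFields() must consume exactly what write() produced, in the same order. As a minimal sanity check, the bean can be round-tripped in memory; the small harness below is only an illustrative sketch (it is not part of the original example and assumes it lives in the same package as ComboBean):
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
public class ComboBeanRoundTrip {
public static void main(String[] args) throws IOException {
ComboBean in = new ComboBean();
in.setOrderId("1001");
in.setProductId("01");
in.setAmount("1");
in.setDataType("O");
//write() serializes the five fields in a fixed order...
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
in.write(new DataOutputStream(buffer));
//...and readFields() must read them back in exactly the same order
ComboBean out = new ComboBean();
out.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
System.out.println(out);//prints: 1001	01	1	(empty product name)
}
}
Note that every field is written with writeUTF, which does not accept null; that is why the fields are initialized to empty strings.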
Main program code:
package com.oracle.mrexample.f.reducejoin;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class ReduceJoinApp {
public static class ReduceJoinMapper extends Mapper<LongWritable, Text, Text, ComboBean>{
private Text pId = new Text();
private ComboBean bean = new ComboBean();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String line = value.toString();
String[] strs = line.split("\t");
if(strs.length==2) {
//product record: pid	pname
bean.setProductId(strs[0]);
bean.setProductName(strs[1]);
//the bean instance is reused for every input line, so clear the fields a product record does not carry
bean.setOrderId("");
bean.setAmount("");
bean.setDataType("P");
pId.set(strs[0]);
}else {
//order record: id	pid	amount
bean.setOrderId(strs[0]);
bean.setProductId(strs[1]);
bean.setAmount(strs[2]);
bean.setProductName("");
bean.setDataType("O");
pId.set(strs[1]);
}
context.write(pId, bean);
}
}
public static class ReduceJoinReducer extends Reducer<Text, ComboBean, NullWritable, ComboBean>{
//sample records grouped under key "01":
//order: 1001	01	1
//order: 1004	01	4
//product: 01	小米
@Override
protected void reduce(Text key, Iterable<ComboBean> values, Context context)
throws IOException, InterruptedException {
//holds the single product record for this key
ComboBean product = new ComboBean();
//holds all order records for this key
List<ComboBean> orders = new ArrayList<>();
try {
for(ComboBean value : values) {
if("P".equals(value.getDataType())) {
//wrong: product = value; (the framework reuses one object for all values in the iterator, so the contents must be copied out instead)
BeanUtils.copyProperties(product, value);
}else if("O".equals(value.getDataType())) {
ComboBean o = new ComboBean();
BeanUtils.copyProperties(o, value);
orders.add(o);
}
}
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
//attach the product name to each buffered order and emit the joined record
for(ComboBean order : orders) {
order.setProductName(product.getProductName());
context.write(NullWritable.get(), order);
}
}
}
public static void main(String[] args) throws Exception{
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(ComboBean.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(ComboBean.class);
job.setJarByClass(ReduceJoinApp.class);
job.setMapperClass(ReduceJoinMapper.class);
job.setReducerClass(ReduceJoinReducer.class);
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
Path outPath = new Path(args[1]);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(outPath)) {
fs.delete(outPath, true);
}
FileOutputFormat.setOutputPath(job, outPath);
job.waitForCompletion(true);
}
}
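In ReduceJoinMapper above, the record type is inferred from the number of tab-separated columns. A more robust variant tags records by the name of the file the current split comes from. The sketch below is not part of the original example and assumes the product data sits in a file whose name contains "product"; it would be added as another nested class in ReduceJoinApp, plus an import of org.apache.hadoop.mapreduce.lib.input.FileSplit:
public static class FileNameJoinMapper extends Mapper<LongWritable, Text, Text, ComboBean> {
private Text pId = new Text();
private ComboBean bean = new ComboBean();
private boolean isProductFile;
@Override
protected void setup(Context context) throws IOException, InterruptedException {
//each map task reads exactly one split, so the source file is fixed for the whole task
String fileName = ((FileSplit) context.getInputSplit()).getPath().getName();
//assumption: the product table is stored in a file whose name contains "product"
isProductFile = fileName.contains("product");
}
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] strs = value.toString().split("\t");
if (isProductFile) {
bean.setProductId(strs[0]);
bean.setProductName(strs[1]);
bean.setOrderId("");
bean.setAmount("");
bean.setDataType("P");
pId.set(strs[0]);
} else {
bean.setOrderId(strs[0]);
bean.setProductId(strs[1]);
bean.setAmount(strs[2]);
bean.setProductName("");
bean.setDataType("O");
pId.set(strs[1]);
}
context.write(pId, bean);
}
}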
Implementing the join on the Mapper side
A map-side join applies when one of the tables is small enough to fit in memory: the small table (here the product file) is shipped to every map task through the distributed cache and loaded into a HashMap in setup(); each order record is then joined directly in map(), so the job needs no reducer and no shuffle.
Main program code:
package com.oracle.mrexample.g.mapjoin;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class MapJoinApp implements Tool {
private Configuration conf;
@Override
public void setConf(Configuration conf) {
this.conf = conf;
}
@Override
public Configuration getConf() {
return conf;
}
@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(this.conf);
//map-only job: declare the final output key/value classes
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(Text.class);
job.setJarByClass(MapJoinApp.class);
job.setMapperClass(MapJoinMapper.class);
job.setNumReduceTasks(0);
//register the file to stage in the distributed cache
job.addCacheFile(new URI("file:///F:/datas/join/cache/pd.txt"));
FileInputFormat.setInputPaths(job, new Path(args[0]));
Path outPath = new Path(args[1]);
FileSystem fs = FileSystem.get(conf);
if (fs.exists(outPath)) {
fs.delete(outPath, true);
}
FileOutputFormat.setOutputPath(job, outPath);
boolean result = job.waitForCompletion(true);
return result ? 0 : 1;
}
public static void main(String[] args) {
if (args == null || args.length < 2) {
System.out.println("Parmas is not valid!");
return;
}
try {
ToolRunner.run(new MapJoinApp(), args);
} catch (Exception e) {
e.printStackTrace();
}
}
public static class MapJoinMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
private Map<String, String> productMap = new HashMap<>();
private Text outValue = new Text();
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
//On a cluster, the cached file can be opened directly by its staged name:
//BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream("pd.txt"),"UTF-8"));
//In a local Windows environment, use the approach below instead:
//the file is not in a real HDFS file system and may not be found otherwise, so the older API can be tried
Path[] cacheFiles = context.getLocalCacheFiles();
Path cacheFile = cacheFiles[0];
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(cacheFile.toUri().getPath()), StandardCharsets.UTF_8));
String input;
while ((input = reader.readLine()) != null) {
String[] strs = input.split("\t");
//cache the product table as pid -> pname
productMap.put(strs[0], strs[1]);
}
reader.close();
}
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
//look up the product name in the cached map and append it to the order line
//e.g. 1001	01	1	小米
String line = value.toString();
String[] strs = line.split("\t");
outValue.set(line + "\t" + productMap.get(strs[1]));
context.write(NullWritable.get(), outValue);
}
}
}
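Context.getLocalCacheFiles() is deprecated in newer Hadoop releases. A possible alternative setup() (a sketch assuming Hadoop 2.x or later, not part of the original example) resolves the cached file through context.getCacheFiles() and opens it via the FileSystem API, which works both locally and on a cluster; all needed imports are already present above:
@Override
protected void setup(Context context) throws IOException, InterruptedException {
//resolve the first cache file registered with job.addCacheFile(...)
URI cacheUri = context.getCacheFiles()[0];
FileSystem fs = FileSystem.get(cacheUri, context.getConfiguration());
try (BufferedReader reader = new BufferedReader(
new InputStreamReader(fs.open(new Path(cacheUri)), StandardCharsets.UTF_8))) {
String input;
while ((input = reader.readLine()) != null) {
String[] strs = input.split("\t");
productMap.put(strs[0], strs[1]);
}
}
}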