MapReduce中join业务

对两份数据data1和data2进行关键词连接(Join)是一个很通用的问题。
如果数据量比较小,数据连接(Join)的操作可以在内存中完成,但如果数据量比较大,在内存中进行数据连接操作就会存在OOM(OutOfMemory)问题。针对这种情况,我们也可以考虑利用MapReduce解决大数据的连接(Join)问题。

源数据

商品信息product示例

表头pid pname


pd.txt
订单数据order示例:

表头id pid amount


order.txt

预期结果

将商品信息表中数据根据商品pid合并到订单数据表中。

表头id pid amount pname


只进行maptask得到的结果文件

利用Reducer实现join业务

1)Reducer实现Join操作的原理分析:
Map端的主要工作:为来自不同表(文件)的key/value对打标签以区别不同来源的记录。然后用连接字段作为key,其余部分和新加的标志作为value,最后进行输出。
Reduce端的主要工作:在reduce端以连接字段作为key的分组已经完成,我们只需要在每一个分组当中将那些来源于不同文件的记录(在map阶段已经打标志)分开,最后进行合并就可以了。
2)Reducer实现Join操作的缺点分析
之所以会存在Reduce Join这种方式,是因为整体数据被分割了,每个Map Task只处理一部分数据而不能够获取到所有需要的Join字段,即:同一个key相应的字段可能位于不同map中。因此我们可以充分利用MapReduce框架的特性,让它按照Join Key进行分区,将所有Join key相同的记录集中起来进行处理,所以Reduce Join这种方式就出现了。
但这种方式的缺点也很明显,就是会造成Map和Reduce流程对接过程中也就是Shuffle阶段出现大量的数据传输,效率很低。并且这种方式中,合并的操作是在Reduce阶段完成,Reduce端的处理压力太大,Map节点的运算负载则很低,资源利用率不高,且在Reduce阶段极易产生数据倾斜。

创建商品和订单合并后的bean类程序示例
package com.oracle.mrexample.f.reducejoin;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * Hadoop {@link Writable} that can hold an order record, a product record,
 * or the joined combination of both.
 *
 * <p>All fields are strings and are never {@code null}: they default to the
 * empty string and every setter converts {@code null} to {@code ""}. This
 * matters because {@link DataOutput#writeUTF(String)} throws a
 * {@link NullPointerException} when given {@code null}, which would otherwise
 * make {@link #write(DataOutput)} fail for a partially populated bean.
 *
 * <p>The {@code dataType} field is a tag describing where the record came
 * from; the reduce-join mapper in this package uses {@code "P"} for product
 * records and {@code "O"} for order records.
 */
public class ComboBean implements Writable{
    // Order-side fields.
    private String orderId = "";
    private String amount = "";

    // Product-side fields; productId is also the join key.
    private String productId = "";
    private String productName = "";

    // Source tag of the record (see class comment).
    private String dataType = "";

    /** Converts {@code null} to the empty string so serialization never sees null. */
    private static String nullToEmpty(String value) {
        return value == null ? "" : value;
    }

    /** @return the order id, or {@code ""} if unset */
    public String getOrderId() {
        return orderId;
    }

    /** @param orderId the order id; {@code null} is stored as {@code ""} */
    public void setOrderId(String orderId) {
        this.orderId = nullToEmpty(orderId);
    }

    /** @return the ordered amount, or {@code ""} if unset */
    public String getAmount() {
        return amount;
    }

    /** @param amount the ordered amount; {@code null} is stored as {@code ""} */
    public void setAmount(String amount) {
        this.amount = nullToEmpty(amount);
    }

    /** @return the product id, or {@code ""} if unset */
    public String getProductId() {
        return productId;
    }

    /** @param productId the product id; {@code null} is stored as {@code ""} */
    public void setProductId(String productId) {
        this.productId = nullToEmpty(productId);
    }

    /** @return the product name, or {@code ""} if unset */
    public String getProductName() {
        return productName;
    }

    /** @param productName the product name; {@code null} is stored as {@code ""} */
    public void setProductName(String productName) {
        this.productName = nullToEmpty(productName);
    }

    /** @return the source tag of this record, or {@code ""} if unset */
    public String getDataType() {
        return dataType;
    }

    /** @param dataType the source tag; {@code null} is stored as {@code ""} */
    public void setDataType(String dataType) {
        this.dataType = nullToEmpty(dataType);
    }

    /**
     * Serializes the five fields as modified-UTF-8 strings.
     * The field order must match {@link #readFields(DataInput)}.
     *
     * @param out the sink to write to
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(this.orderId);
        out.writeUTF(this.amount);
        out.writeUTF(this.productId);
        out.writeUTF(this.productName);
        out.writeUTF(this.dataType);
    }

    /**
     * Restores the five fields in the same order {@link #write(DataOutput)}
     * emitted them.
     *
     * @param in the source to read from
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.orderId = in.readUTF();
        this.amount = in.readUTF();
        this.productId = in.readUTF();
        this.productName = in.readUTF();
        this.dataType = in.readUTF();
    }

    /**
     * Formats the joined record as tab-separated
     * {@code orderId, productId, amount, productName}; the source tag is
     * intentionally not part of the output.
     */
    @Override
    public String toString() {
        return orderId+"\t"+productId+"\t"+amount+"\t"+productName;
    }

}
主程序代码
package com.oracle.mrexample.f.reducejoin;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Reduce-side join: attaches the product name to every order record that
 * shares the same product id.
 *
 * <p>Usage: {@code ReduceJoinApp <input path> <output path>}. The input path
 * is expected to contain both the order file and the product file; an
 * existing output path is deleted before the job runs.
 */
public class ReduceJoinApp {

    /**
     * Tags each input line as a product ({@code "P"}) or an order
     * ({@code "O"}) record and emits it keyed by product id.
     *
     * <p>Records are told apart by their number of tab-separated fields:
     * exactly two fields is a product line ({@code pid, pname}), three or
     * more is an order line ({@code id, pid, amount}).
     */
    public static class ReduceJoinMapper extends Mapper<LongWritable, Text, Text, ComboBean> {
        // Reused across map() calls to avoid allocating per record.
        private final Text pId = new Text();
        private final ComboBean bean = new ComboBean();

        /**
         * @param key     byte offset of the line (unused)
         * @param value   one tab-separated input line
         * @param context sink for the tagged (product id, bean) pair
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] strs = line.split("\t");
            if (strs.length == 2) {
                // Product line. Clear order-side fields left over from a
                // previous record, because the bean instance is reused.
                bean.setOrderId("");
                bean.setAmount("");
                bean.setProductId(strs[0]);
                bean.setProductName(strs[1]);

                bean.setDataType("P");

                pId.set(strs[0]);
            } else if (strs.length >= 3) {
                // Order line. The product name is filled in by the reducer;
                // clear any stale value from a previous product record.
                bean.setOrderId(strs[0]);
                bean.setProductId(strs[1]);
                bean.setAmount(strs[2]);
                bean.setProductName("");

                bean.setDataType("O");

                pId.set(strs[1]);
            } else {
                // Blank or malformed line: nothing to join on, so skip it
                // instead of failing with ArrayIndexOutOfBoundsException.
                return;
            }

            context.write(pId, bean);
        }
    }

    /**
     * Receives all records sharing one product id and writes every order
     * enriched with that product's name.
     *
     * <p>Example group for key {@code 01}:
     * <pre>
     * 1001  01  1
     * 1004  01  4
     * 01    小米
     * </pre>
     */
    public static class ReduceJoinReducer extends Reducer<Text, ComboBean, NullWritable, ComboBean> {

        /**
         * @param key     the product id shared by all {@code values}
         * @param values  tagged product and order records for this id
         * @param context sink for the joined order records
         * @throws IOException if a record cannot be copied or written
         */
        @Override
        protected void reduce(Text key, Iterable<ComboBean> values, Context context)
                throws IOException, InterruptedException {
            // Holds the single product record of this group.
            ComboBean product = new ComboBean();
            // Holds every order record of this group.
            List<ComboBean> orders = new ArrayList<>();

            try {
                for (ComboBean value : values) {
                    if ("P".equals(value.getDataType())) {
                        // Do NOT write "product = value": the framework reuses
                        // the value object while iterating, so it must be copied.
                        BeanUtils.copyProperties(product, value);
                    } else if ("O".equals(value.getDataType())) {
                        ComboBean o = new ComboBean();
                        BeanUtils.copyProperties(o, value);
                        orders.add(o);
                    }
                }
            } catch (IllegalAccessException | InvocationTargetException e) {
                // Fail the task instead of silently emitting an incomplete join.
                throw new IOException("Failed to copy join record for product id " + key, e);
            }

            for (ComboBean order : orders) {
                order.setProductName(product.getProductName());
                context.write(NullWritable.get(), order);
            }
        }
    }

    /**
     * Configures and runs the reduce-join job, then exits with status 0 on
     * success and 1 on failure so that callers can detect a failed job.
     *
     * @param args {@code args[0]} input path, {@code args[1]} output path
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception{
        if (args == null || args.length < 2) {
            System.err.println("Usage: ReduceJoinApp <input path> <output path>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(ComboBean.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(ComboBean.class);

        job.setJarByClass(ReduceJoinApp.class);
        job.setMapperClass(ReduceJoinMapper.class);
        job.setReducerClass(ReduceJoinReducer.class);

        job.setInputFormatClass(TextInputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        Path outPath = new Path(args[1]);
        // Resolve the file system from the path itself so an output path on a
        // non-default file system is handled correctly.
        FileSystem fs = outPath.getFileSystem(conf);
        if (fs.exists(outPath)) {
            // The job refuses to start if the output directory already exists.
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}

利用Mapper实现join业务

主程序代码
package com.oracle.mrexample.g.mapjoin;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Map-side join: the product table is shipped to every map task through the
 * distributed cache and loaded into memory, so each order line can be joined
 * inside {@code map()} without a reduce phase.
 *
 * <p>Usage: {@code MapJoinApp <input path> <output path> [product file URI]}.
 * When the optional third argument is omitted, {@link #DEFAULT_CACHE_FILE_URI}
 * is used.
 */
public class MapJoinApp implements Tool {

    /** Product file added to the distributed cache when no URI is passed on the command line. */
    private static final String DEFAULT_CACHE_FILE_URI = "file:///F:/datas/join/cache/pd.txt";

    private Configuration conf;

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    /**
     * Configures and runs the map-only join job.
     *
     * @param args {@code args[0]} input path, {@code args[1]} output path,
     *             optional {@code args[2]} URI of the product file to cache
     * @return 0 if the job succeeded, 1 otherwise
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(this.conf);

        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setJarByClass(MapJoinApp.class);
        job.setMapperClass(MapJoinMapper.class);

        // Map-only job: the join is completed in the mapper.
        job.setNumReduceTasks(0);

        // Register the product file with the distributed cache.
        String cacheFileUri = args.length > 2 ? args[2] : DEFAULT_CACHE_FILE_URI;
        job.addCacheFile(new URI(cacheFileUri));

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        Path outPath = new Path(args[1]);
        // Resolve the file system from the path itself so an output path on a
        // non-default file system is handled correctly.
        FileSystem fs = outPath.getFileSystem(conf);
        if (fs.exists(outPath)) {
            // The job refuses to start if the output directory already exists.
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        boolean result = job.waitForCompletion(true);
        return result ? 0 : 1;
    }

    /**
     * Command-line entry point. Exits with the status returned by
     * {@link #run(String[])}, or 1 if running the tool threw an exception.
     *
     * @param args see {@link #run(String[])}
     */
    public static void main(String[] args) {
        if (args == null || args.length < 2) {
            System.out.println("Parmas is not valid!");
            return;
        }
        int exitCode;
        try {
            exitCode = ToolRunner.run(new MapJoinApp(), args);
        } catch (Exception e) {
            e.printStackTrace();
            exitCode = 1;
        }
        // Propagate the job result so scripts can detect a failed run.
        System.exit(exitCode);
    }

    /**
     * Loads the cached product table once per task and appends the matching
     * product name to every order line.
     */
    public static class MapJoinMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
        // product id -> product name, filled in setup().
        private final Map<String, String> productMap = new HashMap<>();
        // Reused across map() calls to avoid allocating per record.
        private final Text outValue = new Text();

        /**
         * Reads the first distributed-cache file as UTF-8, tab-separated
         * {@code pid, pname} lines into {@link #productMap}.
         *
         * @throws IOException if no cache file is available or it cannot be read
         */
        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            // Original author's note: on a real cluster the cached file should be
            // opened by its bare name, e.g. new FileInputStream("pd.txt").
            // In a local Windows environment the file is not really in HDFS and may
            // not be found that way, hence the older getLocalCacheFiles() API here.
            Path[] cacheFiles = context.getLocalCacheFiles();
            if (cacheFiles == null || cacheFiles.length == 0) {
                throw new IOException("No distributed cache file available for the product table");
            }
            Path cacheFile = cacheFiles[0];
            // try-with-resources closes the reader even if reading fails midway.
            try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                    new FileInputStream(cacheFile.toUri().getPath()), StandardCharsets.UTF_8))) {
                String input;
                while ((input = reader.readLine()) != null) {
                    String[] strs = input.split("\t");
                    if (strs.length < 2) {
                        // Blank or malformed product line: nothing to index.
                        continue;
                    }
                    productMap.put(strs[0], strs[1]);
                }
            }
        }

        /**
         * Emits the order line followed by a tab and the product name looked
         * up by the line's second field, e.g. {@code 1001 01 1 小米}.
         *
         * <p>If the product id is not in the cached table, the literal text
         * {@code null} is appended, because the lookup result is concatenated
         * as-is.
         *
         * @param key     byte offset of the line (unused)
         * @param value   one tab-separated order line ({@code id, pid, amount})
         * @param context sink for the joined line
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] strs = line.split("\t");
            if (strs.length < 2) {
                // Blank or malformed order line: no product id to join on.
                return;
            }
            outValue.set(line + "\t" + productMap.get(strs[1]));
            context.write(NullWritable.get(), outValue);
        }
    }
}

你可能感兴趣的:(MapReduce中join业务)