ReduceJoin and MapJoin in MapReduce

In SQL, a join connects two or more tables. An analogous operation exists in MapReduce, except that it joins files rather than tables. Consider the following requirement:

We need to merge the following two tables:
Table a:
	#id	  #pid	#amount
	1001	01	1
	1002	02	2
	1003	03	3
	1004	01	4
	1005	02	5
	1006	03	6
Table b:
	#pid	 #pname
	01	小米
	02	华为
	03	格力

The task: merge the two tables, replacing each pid in table a with the corresponding pname from table b.
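
Based on the sample data, the joined result would conceptually look like this (the exact column layout and ordering depend on the implementation):
	1001	小米	1
	1004	小米	4
	1002	华为	2
	1005	华为	5
	1003	格力	3
	1006	格力	6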

In SQL this is a simple join query, but MapReduce processes files, so we need another way to express it.
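
For reference, the SQL equivalent would be something like the following (table and column names taken from the sample above):

	SELECT a.id, b.pname, a.amount
	FROM a JOIN b ON a.pid = b.pid;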

ReduceJoin: as the name suggests, the two tables are merged in the Reduce phase.

The approach: wrap each record in a bean class and emit that bean as the key of the Map task, then define a custom grouping rule so that rows sharing the same pid fall into one group, sorted so that the row carrying a pname comes first. The reducer then stamps the first row's pname onto every other row in the group.
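
For example, after the shuffle the reduce call for pid 01 would (conceptually) see the group below, with the table-b row first because pName sorts in descending order; the reducer reads 小米 from that row and stamps it onto the rest:

	(pid=01, pname=小米)           <- from table b, sorted to the head of the group
	(id=1001, pid=01, amount=1)    -> emitted with pName=小米
	(id=1004, pid=01, amount=4)    -> emitted with pName=小米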

RJBean:

package com.jee.reducejoin;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class RJBean implements WritableComparable<RJBean> {

    private String id;
    private String pId;
    private int amount;
    private String pName;

    public RJBean() {
    }

    public RJBean(String id, String pId, int amount, String pName) {
        this.id = id;
        this.pId = pId;
        this.amount = amount;
        this.pName = pName;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getpId() {
        return pId;
    }

    public void setpId(String pId) {
        this.pId = pId;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    public String getpName() {
        return pName;
    }

    public void setpName(String pName) {
        this.pName = pName;
    }

    @Override
    public String toString() {
        return "RJBean{" +
                "id='" + id + '\'' +
                ", pId='" + pId + '\'' +
                ", amount=" + amount +
                ", pName='" + pName + '\'' +
                '}';
    }

    //Sort by pId first; within the same pId, sort pName in descending order
    //so that the row carrying the pname (from table b) comes first in its group
    @Override
    public int compareTo(RJBean o) {
        int compare = this.pId.compareTo(o.getpId());
        if(compare == 0){
            return o.getpName().compareTo(this.getpName());
        }else{
            return compare;
        }
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(id);
        dataOutput.writeUTF(pId);
        dataOutput.writeInt(amount);
        dataOutput.writeUTF(pName);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        id = dataInput.readUTF();
        pId = dataInput.readUTF();
        amount = dataInput.readInt();
        pName = dataInput.readUTF();
    }
}

The Mapper class:

package com.jee.reducejoin;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class RJMapper extends Mapper<LongWritable, Text, RJBean, NullWritable> {

    private String fileName;
    private RJBean bean = new RJBean();
    //setup() runs once per map task, before the first call to map()
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        //There are two input files with different line formats, so the mapper
        //must act differently depending on which file the current split comes from.
        //Get the current input split
        FileSplit split = (FileSplit)context.getInputSplit();
        //Get the name of the file backing this split
        fileName = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //Parse the line differently depending on the source file
        String line = value.toString();
        String[] items = line.split("\t");
        //Assumes the order file's name contains "First" (e.g. RJFirst.txt) and the
        //product file's does not (e.g. RJLast.txt), matching the MapJoin driver below
        if(fileName.contains("First")){
            bean.setId(items[0]);
            bean.setpId(items[1]);
            bean.setAmount(Integer.parseInt(items[2]));
            bean.setpName("");
        }else{
            bean.setpId(items[0]);
            bean.setpName(items[1]);
            bean.setId("");
            bean.setAmount(0);
        }
        context.write(bean,NullWritable.get());
    }
}

The custom grouping comparator (without it, beans that differ in any field would each form their own reduce group, since the whole bean is the key):

package com.jee.reducejoin;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class RjComparator extends WritableComparator{

    //Register the key class with the parent; the second argument (createInstances)
    //must be true, otherwise comparisons throw a NullPointerException
    protected RjComparator() {
        super(RJBean.class,true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        RJBean bean1 = (RJBean) a;
        RJBean bean2 = (RJBean) b;
        //Custom grouping rule: beans with the same pId belong to the same group
        return bean1.getpId().compareTo(bean2.getpId());
    }
}

The Reducer class:

package com.jee.reducejoin;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

public class RJReducer extends Reducer<RJBean, NullWritable, RJBean, NullWritable> {

    @Override
    protected void reduce(RJBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        Iterator<NullWritable> iterator = values.iterator();
        //Advance to the first record of the group. Hadoop reuses the key object:
        //as the iterator advances, the key's fields are overwritten with the
        //current record's, so after this call the key holds the table-b row
        iterator.next();
        //Grab the pName carried by that first row
        String pName = key.getpName();
        while(iterator.hasNext()){
            iterator.next();
            //Stamp the saved pName onto each remaining row and emit it
            key.setpName(pName);
            context.write(key,NullWritable.get());
        }
    }
}

The Driver class:

package com.jee.reducejoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class RJDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Job job = Job.getInstance(new Configuration());

        job.setJarByClass(RJDriver.class);

        job.setMapperClass(RJMapper.class);
        job.setReducerClass(RJReducer.class);

        job.setMapOutputKeyClass(RJBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(RJBean.class);
        job.setOutputValueClass(NullWritable.class);

        job.setGroupingComparatorClass(RjComparator.class);

        FileInputFormat.setInputPaths(job,new Path("d:/Hadoop/input"));
        FileOutputFormat.setOutputPath(job,new Path("d:/Hadoop/output"));

        boolean b = job.waitForCompletion(true);

        System.exit(b ? 0 : 1);
    }
}

ReduceJoin merges the two files in the Reduce phase; MapJoin merges them in the Map phase instead, skipping the shuffle and the reduce phase entirely.

MapJoin places the smaller of the files to be joined into the distributed cache. Each map task reads the cached file back in setup() and performs the join directly in map(). This works only when the cached file is small enough to fit in memory, but in return MapJoin avoids the shuffle (and the data skew a hot join key can cause there) and needs no Reducer class.
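
The pairing that makes this work, pulled out of the full code below (a minimal sketch; the path is the one used in the driver):

	// Driver side: register the small file with the distributed cache
	job.addCacheFile(URI.create("file:///d:/Hadoop/input/RJLast.txt"));

	// Mapper side, in setup(): retrieve the cached file paths and load the file into a HashMap
	URI[] cacheFiles = context.getCacheFiles();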

The Mapper class:

package com.jee.mapjoin;

import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.*;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class MjMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    //Holds the pid -> pname pairs read from the cached file
    private Map<String, String> map = new HashMap<>();

    private Text text = new Text();
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        //Read the cached file once, at the start of the map task
        URI[] cacheFiles = context.getCacheFiles();
        //Get the path of the file that was placed in the cache
        String path = cacheFiles[0].getPath().toString();
        //Open an input stream on the cached file
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path)));
        String line;
        while((line = reader.readLine()) != null){
            String[] items = line.split("\t");
            //Store each line of the cached file as a pid -> pname entry
            map.put(items[0],items[1]);
        }
        IOUtils.closeStream(reader);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] items = line.split("\t");
        //Map-side join: look up the pname for this row's pid in the cached table
        String pName = map.get(items[1]);
        text.set(items[0] + "\t" + pName + "\t" + items[2]);
        context.write(text,NullWritable.get());
    }
}

The Driver class:

package com.jee.mapjoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;

public class MJDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        Job job = Job.getInstance(new Configuration());

        job.setJarByClass(MJDriver.class);

        job.setMapperClass(MjMapper.class);

        //Set the number of reduce tasks to 0 so that no reduce phase runs
        job.setNumReduceTasks(0);

        //Register the file to distribute via the cache.
        // file:/// is the local-file scheme: the file lives on the local disk, not on HDFS;
        // files on HDFS are addressed as hdfs://host:port/path
        job.addCacheFile(URI.create("file:///d:/Hadoop/input/RJLast.txt"));

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job,new Path("d:/Hadoop/input/RJFirst.txt"));
        FileOutputFormat.setOutputPath(job,new Path("d:/Hadoop/output"));

        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
