In SQL, a join connects two or more tables. MapReduce has the same kind of join, except that what gets joined are files. Consider, for example, the following requirement:
We need to merge the following two tables:
Table a:
#id #pid #amount
1001 01 1
1002 02 2
1003 03 3
1004 01 4
1005 02 5
1006 03 6
Table b:
#pid #pname
01 小米
02 华为
03 格力
The goal is to merge the two tables, replacing the pid in the first table with the corresponding pname from the second table.
In SQL this would be a simple join query, but MapReduce works on files, so we need a different way to express it.
ReduceJoin: as the name suggests, the two tables are merged in the Reduce phase.
We wrap the table structure in a bean class and use it as the key emitted by the map task, then define a custom grouping rule: rows with the same pid form one group, the row that carries a pname is sorted to the front of the group, and its pname is then filled into the remaining rows of the group.
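For reference, joining the sample data above should give one line per table-a record with the pid resolved to its pname (the order of the lines may vary):
#id #pname #amount
1001 小米 1
1004 小米 4
1002 华为 2
1005 华为 5
1003 格力 3
1006 格力 6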
RJBean:
package com.jee.reducejoin;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class RJBean implements WritableComparable<RJBean> {

    private String id;
    private String pId;
    private int amount;
    private String pName;

    public RJBean() {
    }

    public RJBean(String id, String pId, int amount, String pName) {
        this.id = id;
        this.pId = pId;
        this.amount = amount;
        this.pName = pName;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public String getpId() {
        return pId;
    }

    public void setpId(String pId) {
        this.pId = pId;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    public String getpName() {
        return pName;
    }

    public void setpName(String pName) {
        this.pName = pName;
    }

    @Override
    public String toString() {
        return "RJBean{" +
                "id='" + id + '\'' +
                ", pId='" + pId + '\'' +
                ", amount=" + amount +
                ", pName='" + pName + '\'' +
                '}';
    }

    @Override
    public int compareTo(RJBean o) {
        // Primary sort: by pId. Secondary sort: by pName in descending order,
        // so within one pId group the record that carries a pName (the table-b row) comes first.
        int compare = this.pId.compareTo(o.getpId());
        if (compare == 0) {
            return o.getpName().compareTo(this.getpName());
        } else {
            return compare;
        }
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(id);
        dataOutput.writeUTF(pId);
        dataOutput.writeInt(amount);
        dataOutput.writeUTF(pName);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        // Deserialize the fields in exactly the same order as write()
        id = dataInput.readUTF();
        pId = dataInput.readUTF();
        amount = dataInput.readInt();
        pName = dataInput.readUTF();
    }
}
Mapper class:
package com.jee.reducejoin;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class RJMapper extends Mapper<LongWritable, Text, RJBean, NullWritable> {

    private String fileName;
    private RJBean bean = new RJBean();

    // setup() runs once, at the very start of each map task
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // There are two input files and their line formats differ,
        // so we need to know which file the current split comes from.
        // Get the current split
        FileSplit split = (FileSplit) context.getInputSplit();
        // Get the name of the file backing this split
        fileName = split.getPath().getName();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Parse the line differently depending on which file it came from
        String line = value.toString();
        String[] items = line.split("\t");
        if (fileName.contains("First")) {
            // Line from table a: id, pid, amount
            bean.setId(items[0]);
            bean.setpId(items[1]);
            bean.setAmount(Integer.parseInt(items[2]));
            bean.setpName("");
        } else {
            // Line from table b: pid, pname
            bean.setpId(items[0]);
            bean.setpName(items[1]);
            bean.setId("");
            bean.setAmount(0);
        }
        context.write(bean, NullWritable.get());
    }
}
Custom grouping comparator class:
package com.jee.reducejoin;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class RjComparator extends WritableComparator {

    // Pass the key class being compared to the superclass; the second argument (true)
    // tells it to create key instances, otherwise compare() throws a NullPointerException.
    protected RjComparator() {
        super(RJBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        RJBean bean1 = (RJBean) a;
        RJBean bean2 = (RJBean) b;
        // Custom grouping rule: records with the same pId belong to the same group
        return bean1.getpId().compareTo(bean2.getpId());
    }
}
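To see how RJBean.compareTo and RjComparator work together, here is a sketch of the reduce group for pId 01 built from the sample data (the relative order of the two table-a rows is not guaranteed, since they compare as equal):
RJBean{id='', pId='01', amount=0, pName='小米'}   <- the table-b row sorts first because its pName is non-empty
RJBean{id='1001', pId='01', amount=1, pName=''}
RJBean{id='1004', pId='01', amount=4, pName=''}
The reducer only has to remember the pName of the first record and copy it into the rest of the group.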
Reducer class:
package com.jee.reducejoin;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

public class RJReducer extends Reducer<RJBean, NullWritable, RJBean, NullWritable> {

    @Override
    protected void reduce(RJBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        Iterator<NullWritable> iterator = values.iterator();
        // Advance to the first record of the group; thanks to the sort order it is
        // the table-b record, so the key object now holds the pName of this group.
        iterator.next();
        String pName = key.getpName();
        // Hadoop reuses the key object: each call to iterator.next() re-fills it with
        // the fields of the current record, so the remaining records are the table-a rows.
        while (iterator.hasNext()) {
            iterator.next();
            key.setpName(pName);
            context.write(key, NullWritable.get());
        }
    }
}
Driver class:
package com.jee.reducejoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class RJDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());

        job.setJarByClass(RJDriver.class);
        job.setMapperClass(RJMapper.class);
        job.setReducerClass(RJReducer.class);

        job.setMapOutputKeyClass(RJBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(RJBean.class);
        job.setOutputValueClass(NullWritable.class);

        // Group map output keys by pId using the custom comparator
        job.setGroupingComparatorClass(RjComparator.class);

        FileInputFormat.setInputPaths(job, new Path("d:/Hadoop/input"));
        FileOutputFormat.setOutputPath(job, new Path("d:/Hadoop/output"));

        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
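With the sample data, every line the job writes is simply the key's toString(); for example, the table-a record 1001 should come out looking something like:
RJBean{id='1001', pId='01', amount=1, pName='小米'}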
A ReduceJoin merges the two files in the Reduce phase, whereas a MapJoin merges them already in the Map phase, skipping the Shuffle and Reduce phases entirely.
A MapJoin puts one or more of the files to be joined (the smallest ones) into the distributed cache, reads the cached data back at the start of each map task, and performs the join directly inside the map method.
So a MapJoin needs neither Shuffle nor a Reducer class.
Mapper class:
package com.jee.mapjoin;

import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.*;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;

public class MjMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    // Holds the pid -> pname pairs loaded from the cached file
    private Map<String, String> map = new HashMap<>();
    private Text text = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Read the cached (small) file once, at the start of the map task
        URI[] cacheFiles = context.getCacheFiles();
        // Path of the file that was added to the cache
        String path = cacheFiles[0].getPath().toString();
        // Open a reader on the cached file
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(path)));
        String line;
        while ((line = reader.readLine()) != null) {
            String[] items = line.split("\t");
            // Store each line of the cached file as pid -> pname
            map.put(items[0], items[1]);
        }
        IOUtils.closeStream(reader);
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Each input line comes from table a: id, pid, amount
        String line = value.toString();
        String[] items = line.split("\t");
        // Look up the pname for this pid and emit the joined line
        String pName = map.get(items[1]);
        text.set(items[0] + "\t" + pName + "\t" + items[2]);
        context.write(text, NullWritable.get());
    }
}
Driver class:
package com.jee.mapjoin;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.net.URI;

public class MJDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());

        job.setJarByClass(MJDriver.class);
        job.setMapperClass(MjMapper.class);

        // With zero reduce tasks, no reduce phase (and no shuffle) is run
        job.setNumReduceTasks(0);

        // Add the small file to the distributed cache.
        // file:/// means the file lives on the local file system rather than in HDFS;
        // an HDFS file would be addressed as hdfs://namenode:port/path.
        job.addCacheFile(URI.create("file:///d:/Hadoop/input/RJLast.txt"));

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path("d:/Hadoop/input/RJFirst.txt"));
        FileOutputFormat.setOutputPath(job, new Path("d:/Hadoop/output"));

        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
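Assuming RJFirst.txt holds the table-a data and RJLast.txt the table-b data shown above, the MapJoin output should be one id, pname, amount line (tab-separated) per table-a record, in input order:
1001	小米	1
1002	华为	2
1003	格力	3
1004	小米	4
1005	华为	5
1006	格力	6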