There are two tables, a customer table and an order table, joined on a common cid column.
If the customer table is small, it can be loaded into memory during the mapper's setup() phase, and the join can then be done directly inside the mapper.
If both the customer table and the order table are large, a mapper and a reducer are needed.
First, the approach for the case where the customer table is small.
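As the parsing code below implies, both tables are plain comma-separated text: each customer line starts with the cid (the cid followed by the rest of the customer fields), and each order line ends with the cid (the order fields first, with the cid as the last field). The actual sample files are shown in the screenshots at the end of this post.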
Mapper
package com.huawei.join;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
public class JoinMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
// cid -> full customer record, loaded once per map task in setup()
Map<String, String> customsMap = new HashMap<String, String>();
@Override
protected void setup(Context context) throws IOException {
Configuration conf= context.getConfiguration();
FileSystem fs= FileSystem.get(conf);
// load the small customer table into memory (hard-coded local path, so this only works in local mode)
FSDataInputStream fis=fs.open(new Path("/Users/simmucheng/tmp/join_test/customer.txt"));
BufferedReader br=new BufferedReader(new InputStreamReader(fis));
String line=null;
while((line=br.readLine())!=null){
String cid =line.substring(0,line.indexOf(","));
customsMap.put(cid,line);
}
br.close();
}
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// each order line ends with the customer id; split it into the id and the rest of the order info
String str=value.toString();
String id=str.substring(str.lastIndexOf(",")+1);
String productInfo=str.substring(0,str.lastIndexOf(','));
// look up the cached customer record for this id and emit the joined line directly from the mapper
String customerInfo=customsMap.get(id);
context.write(new Text(customerInfo+','+productInfo),NullWritable.get());
}
}
Driver code
package com.huawei.join;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* The customer table is the small table, so it is loaded into memory in the mapper's setup() phase
* and the join is done entirely on the map side, with no extra reducer needed.
*/
public class JoinApp {
public static void main(String[] args) throws Exception {
Configuration conf=new Configuration();
conf.set("fs.defaultFS","file:///");
FileSystem fs=FileSystem.get(conf);
// remove the output directory if it already exists so the job can be rerun
if(fs.exists(new Path(args[1]))){
fs.delete(new Path(args[1]),true);
}
Job job =Job.getInstance(conf);
job.setJarByClass(JoinApp.class);
job.setMapperClass(JoinMapper.class);
job.setOutputKeyClass(Text.class);
job.setJobName("JoinTest");
job.setNumReduceTasks(0);
FileInputFormat.addInputPath(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
job.waitForCompletion(true);
}
}
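The setup() method above opens the customer file from a hard-coded local path, which only works because the job runs in local mode (fs.defaultFS is file:/// here). For a cluster run, a common alternative is to ship the small table through the distributed cache. The following is only a sketch, assuming the file is registered in the driver and is made available in the task's working directory under its own file name (the usual YARN behaviour); the /data/join_test/customer.txt path is a placeholder, not from the original code.

// In the driver, before submitting the job (placeholder path):
// job.addCacheFile(new Path("/data/join_test/customer.txt").toUri());
// In JoinMapper, a setup() that reads the localized copy (needs java.io.FileReader in addition to the imports above):
@Override
protected void setup(Context context) throws IOException, InterruptedException {
java.net.URI[] cacheFiles=context.getCacheFiles();
// the cached file is available in the task's working directory under its own file name
String localName=new Path(cacheFiles[0].getPath()).getName();
BufferedReader br=new BufferedReader(new java.io.FileReader(localName));
String line=null;
while((line=br.readLine())!=null){
customsMap.put(line.substring(0,line.indexOf(",")),line);
}
br.close();
}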
If both the customer table and the order table are large (that is, too big to fit in memory), the join has to be done on the reduce side. The approach is as follows.
First, define a custom data class.
JoinBean
package com.huawei.JoinBoth;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class JoinBean implements WritableComparable<JoinBean> {
private int type;
private int cid;
private int pid;
private String customerInfo="";
private String productorInfo="";
/*
type: 0 = customer record, 1 = order record
*/
public int getType() {
return type;
}
public void setType(int type) {
this.type = type;
}
public int getCid() {
return cid;
}
public void setCid(int cid) {
this.cid = cid;
}
public int getPid() {
return pid;
}
public void setPid(int pid) {
this.pid = pid;
}
public String getCustomerInfo() {
return customerInfo;
}
public void setCustomerInfo(String customerInfo) {
this.customerInfo = customerInfo;
}
public String getProductorInfo() {
return productorInfo;
}
public void setProductorInfo(String productorInfo) {
this.productorInfo = productorInfo;
}
public int compareTo(JoinBean jb) {
// order by cid first; within the same cid the customer record (type 0) comes before
// order records (type 1); orders of the same customer are sorted by pid descending
if(cid==jb.getCid()){
if(type!=jb.getType()){
return type-jb.getType();
}
else return -(pid-jb.pid);
}
else {
return (cid-jb.getCid());
}
}
public void write(DataOutput out) throws IOException {
out.writeInt(type);
out.writeInt(cid);
out.writeInt(pid);
out.writeUTF(customerInfo);
out.writeUTF(productorInfo);
}
public void readFields(DataInput in) throws IOException {
this.type=in.readInt();
this.cid=in.readInt();
this.pid=in.readInt();
this.customerInfo=in.readUTF();
this.productorInfo=in.readUTF();
}
}
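To make the ordering that compareTo() produces concrete, here is a small standalone check (not part of the job itself; it assumes JoinBean implements WritableComparable<JoinBean> as above, so the list can be sorted with Collections.sort):

package com.huawei.JoinBoth;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class JoinBeanOrderCheck {
static JoinBean bean(int type,int cid,int pid){
JoinBean b=new JoinBean();
b.setType(type);
b.setCid(cid);
b.setPid(pid);
return b;
}
public static void main(String[] args) {
List<JoinBean> beans=new ArrayList<JoinBean>();
beans.add(bean(1,1,7));   // order 7 of customer 1
beans.add(bean(0,2,0));   // customer 2
beans.add(bean(1,1,9));   // order 9 of customer 1
beans.add(bean(0,1,0));   // customer 1
Collections.sort(beans);
// expected order: customer 1, order 9, order 7, customer 2
for(JoinBean b:beans){
System.out.println("cid="+b.getCid()+" type="+b.getType()+" pid="+b.getPid());
}
}
}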
Custom partitioner
package com.huawei.JoinBoth;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;
public class JoinPartitioner extends Partitioner<JoinBean, NullWritable> {
/**
* Partition by customer id so that all records with the same cid go to the same reduce task.
* @param joinBean
* @param nullWritable
* @param numPartitions
* @return
*/
public int getPartition(JoinBean joinBean, NullWritable nullWritable, int numPartitions) {
return (joinBean.getCid()%numPartitions);
}
}
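One detail worth noting: getPartition() must return a value in [0, numPartitions), so if cid could ever be negative, the plain modulo above would produce a negative partition number and the job would fail. A defensive variant (just a sketch, not in the original code) masks the sign bit first:

public int getPartition(JoinBean joinBean, NullWritable nullWritable, int numPartitions) {
// masking the sign bit keeps the result in [0, numPartitions) even for negative ids
return (joinBean.getCid() & Integer.MAX_VALUE) % numPartitions;
}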
Custom grouping comparator for the reduce side
package com.huawei.JoinBoth;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class JoinCompareGroup extends WritableComparator{
public JoinCompareGroup() {
super(JoinBean.class,true);
}
/**
* Group records by cid only, so that a customer record and all of its orders arrive in the same reduce() call.
* Both the grouping comparator and the sort comparator must override compare(WritableComparable, WritableComparable).
* @param a
* @param b
* @return
*/
@Override
public int compare(WritableComparable a, WritableComparable b) {
JoinBean ja=(JoinBean)a;
JoinBean jb=(JoinBean)b;
return (ja.getCid()-jb.getCid());
}
}
Custom sort comparator
package com.huawei.JoinBoth;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
public class JoinComparator extends WritableComparator{
public JoinComparator() {
super(JoinBean.class,true);
}
@Override
public int compare(WritableComparable a, WritableComparable b) {
JoinBean ja=(JoinBean)a;
JoinBean jb=(JoinBean)b;
return ja.compareTo(jb);
}
}
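Putting the pieces together: the map output is sorted by cid, then by type, then by pid (either through JoinBean.compareTo() or the explicit JoinComparator), so within each cid the single customer record comes first, followed by that customer's orders. The grouping comparator then compares only the cid, so that entire run of records is handed to a single reduce() call, which is what lets the reducer read the customer info once and pair it with every order.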
Mapper
package com.huawei.JoinBoth;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
public class JoinMapper extends Mapper<LongWritable, Text, JoinBean, NullWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String str=value.toString();
// the path of the input split tells us whether this line comes from the customer file or the order file
FileSplit split=(FileSplit) context.getInputSplit();
String path=split.getPath().toString();
JoinBean joinBean=new JoinBean();
if(path.contains("customer")){
// customer record: cid is the first field; keep the whole line as the customer info
joinBean.setType(0);
String cid=str.substring(0,str.indexOf(","));
String cusinfo=str;
joinBean.setCid(Integer.parseInt(cid));
joinBean.setCustomerInfo(cusinfo);
}
else {
// order record: pid is the first field, cid is the last field,
// and everything before the last comma is kept as the order info
joinBean.setType(1);
String cid=str.substring(str.lastIndexOf(',')+1);
String pid=str.substring(0,str.indexOf(','));
String proinfo=str.substring(0,str.lastIndexOf(','));
joinBean.setPid(Integer.parseInt(pid));
joinBean.setProductorInfo(proinfo);
joinBean.setCid(Integer.parseInt(cid));
}
context.write(joinBean,NullWritable.get());
}
}
Reducer
package com.huawei.JoinBoth;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;
public class JoinReducer extends Reducer<JoinBean, NullWritable, Text, NullWritable> {
@Override
protected void reduce(JoinBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
// Thanks to the grouping comparator, one reduce() call receives the customer record
// followed by all of that customer's orders (the sort order puts type 0 before type 1).
// Hadoop reuses the key object while the values are iterated, so the fields of "key"
// change each time the iterator advances.
Iterator<NullWritable> it=values.iterator();
// consume the first record of the group: the customer record
it.next();
String custominfo=key.getCustomerInfo();
while(it.hasNext()){
// advance to the next order record; "key" now holds that order's fields
it.next();
String proinfo=key.getProductorInfo();
context.write(new Text(custominfo+","+proinfo),NullWritable.get());
}
}
}
Driver class
package com.huawei.JoinBoth;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class JoinTest {
public static void main(String[] args) throws Exception {
Configuration conf=new Configuration();
conf.set("fs.defaultFS","file:///");
Job job=Job.getInstance(conf);
FileSystem fs=FileSystem.get(conf);
// remove the output directory if it already exists so the job can be rerun
if(fs.exists(new Path(args[1]))){
fs.delete(new Path(args[1]),true);
}
job.setJarByClass(JoinTest.class);
job.setMapperClass(JoinMapper.class);
job.setReducerClass(JoinReducer.class);
job.setInputFormatClass(TextInputFormat.class);
job.setMapOutputKeyClass(JoinBean.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
job.setNumReduceTasks(1);
job.setGroupingComparatorClass(JoinCompareGroup.class);
// JoinBean.compareTo() already defines the sort order, so an explicit sort comparator is optional
//job.setSortComparatorClass(JoinComparator.class);
job.setPartitionerClass(JoinPartitioner.class);
FileInputFormat.addInputPath(job,new Path(args[0]));
FileOutputFormat.setOutputPath(job,new Path(args[1]));
job.waitForCompletion(true);
}
}
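Note that args[0] is a single input directory that should contain both the customer file and the order file; the mapper tells them apart by checking whether the split's path contains "customer". args[1] is the output directory, which the driver deletes first if it already exists.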
The customer table and the order table look like this:
![屏幕快照 2018-03-25 上午11.09.59.png](https://upload-images.jianshu.io/upload_images/2111066-fd4f3937ef0e309a.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
![屏幕快照 2018-03-25 上午11.10.08.png](https://upload-images.jianshu.io/upload_images/2111066-653df3ce9afb65aa.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)
The join result:
![屏幕快照 2018-03-25 上午11.10.16.png](https://upload-images.jianshu.io/upload_images/2111066-4745975cd51bb15f.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)