Contents of the first table:
login:
uid sexid logindate
1 1 2017-04-17 08:16:20
2 2 2017-04-15 06:18:20
3 1 2017-04-16 05:16:24
4 2 2017-04-14 03:18:20
5 1 2017-04-13 02:16:25
6 2 2017-04-13 01:15:20
7 1 2017-04-12 08:16:34
8 2 2017-04-11 09:16:20
9 0 2017-04-10 05:16:50
Contents of the second table:
sex:
0 不知道 (unknown)
1 男 (male)
2 女 (female)
Contents of the third table:
user:
uid uname
1 小红
2 小行
3 小通
4 小闪
5 小镇
6 小振
7 小秀
8 小微
9 小懂
10 小明
11 小刚
12 小举
13 小黑
14 小白
15 小鹏
16 小习
Expected final output:
uid sex uname logindate
1 男 小红 2017-04-17 08:16:20
2 女 小行 2017-04-15 06:18:20
3 男 小通 2017-04-16 05:16:24
4 女 小闪 2017-04-14 03:18:20
5 男 小镇 2017-04-13 02:16:25
6 女 小振 2017-04-13 01:15:20
7 男 小秀 2017-04-12 08:16:34
8 女 小微 2017-04-11 09:16:20
9 不知道 小懂 2017-04-10 05:16:50
Approach:
Map-side join:
Core idea: cache the small table files in the distributed cache, then perform the join inside the map tasks.
Applicable scenario: one or more small tables joined against one or more large files.
Advantages: the small tables are held in memory on the map side, so lookups are fast; the volume of data transferred from the map side to the reduce side shrinks dramatically; the time spent in the shuffle drops accordingly (a map-only job avoids the shuffle entirely).
Drawback: the problem must involve at least one table small enough to fit in memory.
Semi join:
A semi join works around that drawback: when only large files are present, but the effective data extracted from one of them (for example, its distinct join keys) is small, extract that data into its own small file, place it in the distributed cache, and then run a map-side join as above; a sketch of the extraction step follows.
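A minimal sketch of that extraction step is shown below, assuming the large file is tab-separated with the join key in its first field; the class name JoinKeyExtractor and its two path arguments are hypothetical additions, not part of the original code:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JoinKeyExtractor {
    // emit only the join key from each record of the large file
    static class KeyMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] fields = value.toString().split("\t");
            if (fields.length > 0) {
                context.write(new Text(fields[0]), NullWritable.get());
            }
        }
    }
    // the shuffle groups identical keys, so one line is written per distinct key
    static class DedupReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
        @Override
        protected void reduce(Text key, Iterable<NullWritable> values, Context context)
                throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "extract-join-keys");
        job.setJarByClass(JoinKeyExtractor.class);
        job.setMapperClass(KeyMapper.class);
        // deduplication is idempotent, so the reducer doubles as a combiner
        job.setCombinerClass(DedupReducer.class);
        job.setReducerClass(DedupReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The combiner shrinks the shuffle even before the semi join proper begins; the resulting key file can then be cached exactly like the sex and user tables below.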
Define a custom Writable class, User:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

/**
 * User info bean: carries one joined output record.
 * @author lyd
 */
public class User implements Writable {
    public String uid;
    public String uname;
    public String gender;
    public String ldt;

    public User() {
    }

    public User(String uid, String uname, String gender, String ldt) {
        this.uid = uid;
        this.uname = uname;
        this.gender = gender;
        this.ldt = ldt;
    }

    // serialize the fields in a fixed order...
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(uid);
        out.writeUTF(uname);
        out.writeUTF(gender);
        out.writeUTF(ldt);
    }

    // ...and deserialize them in exactly the same order
    @Override
    public void readFields(DataInput in) throws IOException {
        this.uid = in.readUTF();
        this.uname = in.readUTF();
        this.gender = in.readUTF();
        this.ldt = in.readUTF();
    }
    /**
     * @return the uid
     */
    public String getUid() {
        return uid;
    }
    /**
     * @param uid the uid to set
     */
    public void setUid(String uid) {
        this.uid = uid;
    }
    /**
     * @return the uname
     */
    public String getUname() {
        return uname;
    }
    /**
     * @param uname the uname to set
     */
    public void setUname(String uname) {
        this.uname = uname;
    }
    /**
     * @return the gender
     */
    public String getGender() {
        return gender;
    }
    /**
     * @param gender the gender to set
     */
    public void setGender(String gender) {
        this.gender = gender;
    }
    /**
     * @return the ldt
     */
    public String getLdt() {
        return ldt;
    }
    /**
     * @param ldt the ldt to set
     */
    public void setLdt(String ldt) {
        this.ldt = ldt;
    }
    /* (non-Javadoc)
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        // column order matches the expected output: uid, sex, uname, logindate
        return uid + "\t" + gender + "\t" + uname + "\t" + ldt;
    }
}
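A quick way to sanity-check this class is a local round trip through a byte stream, since Hadoop requires that readFields() restore exactly what write() produced. This test class is a hypothetical addition, not part of the job:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class UserRoundTripTest {
    public static void main(String[] args) throws Exception {
        User original = new User("1", "小红", "男", "2017-04-17 08:16:20");
        // serialize exactly as Hadoop would between tasks
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));
        // deserialize into a fresh instance and compare
        User copy = new User();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
        System.out.println(original);
        System.out.println(copy); // should print the same line
    }
}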
The MapReduce class, MultipleTableJoin:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class MultipleTableJoin extends Configured implements Tool {
    /**
     * Custom MyMapper: performs the join for each login record.
     * @author lyd
     */
    static class MyMapper extends Mapper<LongWritable, Text, User, NullWritable> {
        // lookup tables loaded from the distributed cache; each map task is
        // single-threaded, so a plain HashMap is sufficient
        Map<String, String> sexMap = new HashMap<String, String>();
        Map<String, String> userMap = new HashMap<String, String>();

        // read the cached small-table files once per task
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            // deprecated in Hadoop 2, but still pairs correctly with job.addCacheFile()
            Path[] paths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
            if (paths == null) {
                return;
            }
            for (Path p : paths) {
                String fileName = p.getName();
                if (fileName.equals("sex")) { // read the "sex" table
                    BufferedReader sb = new BufferedReader(new FileReader(new File(p.toString())));
                    String str = null;
                    while ((str = sb.readLine()) != null) {
                        String[] strs = str.split("\t");
                        sexMap.put(strs[0], strs[1]);
                    }
                    sb.close();
                } else if (fileName.equals("user")) { // read the "user" table
                    BufferedReader sb = new BufferedReader(new FileReader(new File(p.toString())));
                    String str = null;
                    while ((str = sb.readLine()) != null) {
                        String[] strs = str.split("\t");
                        userMap.put(strs[0], strs[1]);
                    }
                    sb.close();
                }
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] lines = line.split("\t");
            if (lines.length < 3) { // skip malformed records
                return;
            }
            String uid = lines[0];
            String sexid = lines[1];
            String logindate = lines[2];
            // the join itself: an inner join via two in-memory lookups
            if (sexMap.containsKey(sexid) && userMap.containsKey(uid)) {
                String uname = userMap.get(uid);
                String gender = sexMap.get(sexid);
                User user = new User(uid, uname, gender, logindate);
                context.write(user, NullWritable.get());
            }
        }
    }
    /**
     * Driver method
     */
    @Override
    public int run(String[] args) throws Exception {
        // 1. get the conf object supplied by ToolRunner (via Configured)
        Configuration conf = getConf();
        conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
        // 2. create the job
        Job job = Job.getInstance(conf, "model01");
        // 3. set the class that carries this job
        job.setJarByClass(MultipleTableJoin.class);
        // 4. set the map-side properties
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(User.class);
        job.setMapOutputValueClass(NullWritable.class);
        // map-only job: the join finishes on the map side, so there is no
        // reducer, no shuffle, and User does not need to be WritableComparable
        job.setNumReduceTasks(0);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // 5. register the two small tables as cache files
        job.addCacheFile(new URI(args[2]));
        job.addCacheFile(new URI(args[3]));
        // 6. delete the output directory if it already exists
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(new Path(args[1]))) {
            fs.delete(new Path(args[1]), true);
        }
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // 7. submit the job and wait for completion
        return job.waitForCompletion(true) ? 0 : 1;
    }
    /**
     * Main entry point of the job
     * @param args input path, output path, and the URIs of the two cache files
     */
    public static void main(String[] args) {
        try {
            // ToolRunner parses the generic options itself before calling run()
            System.exit(ToolRunner.run(new MultipleTableJoin(), args));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
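To run the job, package it into a jar and pass four arguments: the login input path, the output path, and the URIs of the two cached small tables. The jar name and the HDFS paths below are hypothetical:

hadoop jar join.jar MultipleTableJoin /input/login /output hdfs://hadoop01:9000/cache/sex hdfs://hadoop01:9000/cache/user

Note that the cached files must be named exactly sex and user, because setup() dispatches on the file name returned by Path.getName().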