MapReduce 的 Join 操作主要分两类:MapJoin 和 ReduceJoin
ReduceJoin:
1、 map 阶段,两份数据 data1 和 data2 会被 map 分别读入,解析成以链接字段为 key 以查询字段为 value 的 key-value 对,并标明数据来源是 data1 还是 data2。
2、 reduce 阶段,reducetask 会接收来自 data1 和 data2 的相同 key 的数据,在 reduce 端做笛卡尔积式的连接。这种方式最直接的问题是 reduce 端需要缓存同一 key 的全部数据,非常消耗内存,数据倾斜或数据量大时容易导致 OOM
MapJoin:
MapJoin 适用于有一份数据较小的连接情况。做法是直接把该小份数据直接全部加载到内存当中,按链接关键字建立索引。然后大份数据就作为 MapTask 的输入,对 map()方法的每次输入都去内存当中直接去匹配连接。然后把连接结果按 key 输出,这种方法要使用 hadoop中的DistributedCache 把小份数据分布到各个计算节点,每个 maptask 执行任务的节点都需要加载该数据到内存,并且按连接关键字建立索引。
代码实现实例
/**
* 需求:
* 现有两份数据 movies.dat 和 ratings.dat
*数据样式分别为:
*
*Movies.dat
*1::Toy Story (1995)::Animation|Children's|Comedy
*2::Jumanji (1995)::Adventure|Children's|Fantasy
*3::Grumpier Old Men (1995)::Comedy|Romance
*字段含义:movieid, moviename, movietype
*
*Ratings.dat
*1::1193::5::978300760
*1::661::3::978302109
*1::914::3::978301968
*字段含义:userid, movieid, rate, timestamp
*
*Select * from movies a join ratings b on a.movieid = b.movieid
*现要求对两表进行连接,要求输出最终的结果有以上六个字段:
*movieid, userid, rate, moviename, movietype, timestamp
*/
/**
*第一步:封装 MovieRate,方便数据的排序和序列化
*@author lv_hulk
*/
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Join-result record for one rating joined with its movie's name and type:
 * (movieid, userid, rate, movieName, movieType, ts).
 *
 * <p>Used as the map-output key, so it must be serializable (Writable) and
 * comparable. Fixed: implements the parameterized
 * {@code WritableComparable<MovieRate>} instead of the raw type, so
 * {@link #compareTo(MovieRate)} is a proper override and callers get type
 * safety without unchecked casts.
 *
 * <p>Sort order is DESCENDING by movieid, then descending by userid —
 * compareTo deliberately compares the argument's fields against {@code this}
 * (preserved from the original behavior).
 */
public class MovieRate implements WritableComparable<MovieRate> {

    private String movieid;
    private String userid;
    private int rate;
    private String movieName;
    private String movieType;
    private long ts; // rating timestamp (seconds since epoch in ratings.dat)

    /** No-arg constructor required by Hadoop's Writable reflection machinery. */
    public MovieRate() {
    }

    public MovieRate(String movieid, String userid, int rate, String movieName,
            String movieType, long ts) {
        this.movieid = movieid;
        this.userid = userid;
        this.rate = rate;
        this.movieName = movieName;
        this.movieType = movieType;
        this.ts = ts;
    }

    public String getMovieid() {
        return movieid;
    }

    public void setMovieid(String movieid) {
        this.movieid = movieid;
    }

    public String getUserid() {
        return userid;
    }

    public void setUserid(String userid) {
        this.userid = userid;
    }

    public int getRate() {
        return rate;
    }

    public void setRate(int rate) {
        this.rate = rate;
    }

    public String getMovieName() {
        return movieName;
    }

    public void setMovieName(String movieName) {
        this.movieName = movieName;
    }

    public String getMovieType() {
        return movieType;
    }

    public void setMovieType(String movieType) {
        this.movieType = movieType;
    }

    public long getTs() {
        return ts;
    }

    public void setTs(long ts) {
        this.ts = ts;
    }

    /** Tab-separated output line: movieid, userid, rate, moviename, movietype, timestamp. */
    @Override
    public String toString() {
        return movieid + "\t" + userid + "\t" + rate + "\t" + movieName
                + "\t" + movieType + "\t" + ts;
    }

    /** Serializes all six fields; order must match {@link #readFields(DataInput)}. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(movieid);
        out.writeUTF(userid);
        out.writeInt(rate);
        out.writeUTF(movieName);
        out.writeUTF(movieType);
        out.writeLong(ts);
    }

    /** Deserializes in the exact order written by {@link #write(DataOutput)}. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.movieid = in.readUTF();
        this.userid = in.readUTF();
        this.rate = in.readInt();
        this.movieName = in.readUTF();
        this.movieType = in.readUTF();
        this.ts = in.readLong();
    }

    /**
     * Descending order: the argument's fields are compared against this
     * object's, first by movieid, then by userid as tie-breaker.
     */
    @Override
    public int compareTo(MovieRate mr) {
        int it = mr.getMovieid().compareTo(this.movieid);
        if (it == 0) {
            return mr.getUserid().compareTo(this.userid);
        }
        return it;
    }
}
/**
*第二步:编写 MapReduce 程序
*@author lv_hulk
*/
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Map-side join of movies.dat (small side, shipped to every task via the
 * distributed cache) with ratings.dat (large side, streamed through the
 * mapper). Map-only job — the join is completed entirely in the mapper, so
 * zero reduce tasks are configured.
 *
 * <p>Fixes over the original: parameterized {@code Mapper} and {@code Map}
 * types instead of raw types; the cache-file reader is closed via
 * try-with-resources; ratings whose movieid has no entry in movies.dat are
 * skipped instead of throwing {@code NullPointerException} (inner-join
 * semantics); the movie lookup is done once per record instead of twice;
 * basic argument validation in {@code main}.
 */
public class MovieRatingMapJoinMR {

    /**
     * Job driver.
     *
     * @param args args[0] = small input (movies.dat, goes to the cache),
     *             args[1] = large input (ratings.dat, mapper input),
     *             args[2] = output directory (deleted first if it exists)
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.err.println("Usage: MovieRatingMapJoinMR <movies> <ratings> <output>");
            System.exit(2);
        }
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hadoop02:9000");
        System.setProperty("HADOOP_USER_NAME", "hadoop");

        Job job = Job.getInstance(conf);
        job.setJar("/home/hadoop/mrmr.jar");
        job.setMapperClass(MovieRatingMapJoinMRMapper.class);
        job.setMapOutputKeyClass(MovieRate.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Map-only job: the join happens in the mapper, no reduce phase needed.
        job.setNumReduceTasks(0);

        String minInput = args[0]; // small side -> distributed cache
        String maxInput = args[1]; // large side -> mapper input
        String output = args[2];

        FileInputFormat.setInputPaths(job, new Path(maxInput));
        Path outputPath = new Path(output);
        FileSystem fs = FileSystem.get(conf);
        // Remove a stale output directory so the job does not fail on startup.
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        // Ship the small file to every map task's local disk.
        URI uri = new Path(minInput).toUri();
        job.addCacheFile(uri);

        boolean status = job.waitForCompletion(true);
        System.exit(status ? 0 : 1);
    }

    static class MovieRatingMapJoinMRMapper
            extends Mapper<LongWritable, Text, MovieRate, NullWritable> {

        // movieid -> Movie, built once per task in setup() from the cached movies.dat.
        private final Map<String, Movie> movieMap = new HashMap<>();

        /** Loads the cached movies.dat into {@link #movieMap} before any map() call. */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            Path[] localCacheFilePaths =
                    DistributedCache.getLocalCacheFiles(context.getConfiguration());
            String myfilePath = localCacheFilePaths[0].toString();
            // try-with-resources guarantees the reader is closed even if parsing throws
            try (BufferedReader br = new BufferedReader(new FileReader(myfilePath))) {
                String line;
                while ((line = br.readLine()) != null) {
                    // movies.dat line: movieid::moviename::movietype
                    String[] splits = line.split("::");
                    if (splits.length >= 3) {
                        movieMap.put(splits[0], new Movie(splits[0], splits[1], splits[2]));
                    }
                }
            }
        }

        /**
         * Joins one ratings.dat line (userid::movieid::rate::timestamp) with its
         * movie and emits the combined record as the key. Ratings whose movieid
         * is not present in movies.dat are skipped (inner-join semantics) instead
         * of crashing the task with an NPE.
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String[] splits = value.toString().split("::");
            String userid = splits[0];
            String movieid = splits[1];
            int rate = Integer.parseInt(splits[2]);
            long ts = Long.parseLong(splits[3]);

            Movie movie = movieMap.get(movieid); // single lookup, reused for both fields
            if (movie == null) {
                return; // no matching movie: drop the rating
            }
            MovieRate mr = new MovieRate(movieid, userid, rate,
                    movie.getMovieName(), movie.getMovieType(), ts);
            context.write(mr, NullWritable.get());
        }
    }
}