MapJoin is suited to joins where one of the two datasets is small. The approach is to load the small dataset entirely into memory and index it by the join key. The large dataset is then used as the input of the map tasks, and every call to map() probes the in-memory index directly to perform the join, emitting the joined result by key. This requires Hadoop's DistributedCache to distribute the small dataset to all compute nodes: every node that runs a map task loads the data into memory and builds the index on the join key.
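The example that follows assumes MovieLens-style input files whose fields are separated by "::": the small table holds movie records of the form movieid::movieName::movieType, and the large table holds rating records of the form userid::movieid::rate::ts. Illustrative sample lines (assumed for this write-up, not actual project data) might look like:
1::Toy Story (1995)::Animation|Children's|Comedy    (movies, the small table)
1::1193::5::978300760    (ratings, the large table)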
Wrap the joined record in a MovieRate class so the data can be conveniently sorted and serialized:
package mapreduce.join.mapJoin;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
public class MovieRate implements WritableComparable<MovieRate>{
// new MovieRate(userid, movieid, rate, ts, movieName, movieType);
private String userid;
private String movieid;
private int rate;
private Long ts;
private String movieName;
private String movieType;
public String getUserid() {
return userid;
}
public void setUserid(String userid) {
this.userid = userid;
}
public String getMovieid() {
return movieid;
}
public void setMovieid(String movieid) {
this.movieid = movieid;
}
public int getRate() {
return rate;
}
public void setRate(int rate) {
this.rate = rate;
}
public Long getTs() {
return ts;
}
public void setTs(Long ts) {
this.ts = ts;
}
public String getMovieName() {
return movieName;
}
public void setMovieName(String movieName) {
this.movieName = movieName;
}
public String getMovieType() {
return movieType;
}
public void setMovieType(String movieType) {
this.movieType = movieType;
}
public MovieRate(String userid, String movieid, int rate, Long ts, String movieName, String movieType) {
super();
this.userid = userid;
this.movieid = movieid;
this.rate = rate;
this.ts = ts;
this.movieName = movieName;
this.movieType = movieType;
}
public MovieRate() {
super();
}
@Override
public String toString() {
return userid + "\t" + movieid + "\t" + rate + "\t" + ts + "\t" + movieName + "\t" + movieType ;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeUTF(userid);
out.writeUTF(movieid);
out.writeInt(rate);
out.writeLong(ts);
out.writeUTF(movieName);
out.writeUTF(movieType);
}
@Override
public void readFields(DataInput in) throws IOException {
this.userid = in.readUTF();
this.movieid = in.readUTF();
this.rate = in.readInt();
this.ts = in.readLong();
this.movieName = in.readUTF();
this.movieType = in.readUTF();
}
@Override
public int compareTo(MovieRate mr) {
// sort in reverse (descending) order: by movieid first, then by userid
int it = mr.getMovieid().compareTo(this.movieid);
if(it == 0){
return mr.getUserid().compareTo(this.userid);
}else{
return it;
}
}
}
Write the MapReduce program:
package mapreduce.join.mapJoin;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.filecache.DistributedCache;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.io.IOUtils;
public class MovieRateMapJoinMR {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
System.setProperty("HADOOP_USER_NAME", "hadoop");
// conf.set("mapreduce.framework.name", "yarn");
Job job = Job.getInstance(conf);
job.setJarByClass(MovieRateMapJoinMR.class);
job.setMapperClass(MovieRateMapJoinMRMapper.class);
// map-only job: the mapper emits the joined record as a Text key with a NullWritable value
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(NullWritable.class);
job.setNumReduceTasks(0);
Path minPath = new Path(args[0]); // small table (movies)
// Path maxPath = new Path(args[0]);
// Path outPath = new Path(args[1]);
Path maxPath = new Path(args[1]); // large table (ratings)
// URI uri = new URI("C:/Users/Administrator/Desktop/mapjoin/minpath");
// DistributedCache.addCacheFile(uri, conf);
Path outPath = new Path(args[2]);
// distribute the small table to every map task through the distributed cache
URI uri = minPath.toUri();
job.addCacheFile(uri);
FileSystem fs = FileSystem.get(conf);
if(fs.exists(outPath)){
fs.delete(outPath, true);
}
FileInputFormat.setInputPaths(job, maxPath);
FileOutputFormat.setOutputPath(job, outPath);
boolean status = job.waitForCompletion(true);
System.exit(status ? 0 : 1);
}
private static class MovieRateMapJoinMRMapper extends Mapper<LongWritable, Text, Text, NullWritable>{
// in-memory index of the small table: movieid -> "movieName::movieType"
private static Map<String, String> movieMap = new HashMap<>();
@Override
protected void setup(Context context)
throws IOException, InterruptedException {
URI[] cacheFiles = context.getCacheFiles();
// String path = cacheFiles[0].getPath();
// Path[] localCacheFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
// Obtain all cache files registered on the job, along with their path information, through the context
// Path[] localCacheFiles = context.getLocalCacheFiles();
// String strPath = localCacheFiles[0].toUri().toString();
Path cacheFile = new Path(cacheFiles[0]);
String strPath = cacheFile.toUri().toString();
// Read the cached movie file with our own logic and build the in-memory index used for the join
BufferedReader br = new BufferedReader(new FileReader(strPath));
String readLine = null;
while(null != (readLine = br.readLine())){
System.out.println(readLine);
String[] movieFields = readLine.split("::");
String movieid = movieFields[0];
String movieName = movieFields[1];
String movieType = movieFields[2];
movieMap.put(movieid, movieName + "::" + movieType);
}
IOUtils.closeStream(br);
}
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String[] splits = value.toString().split("::");
String userid = splits[0];
String movieid = splits[1];
int rate = Integer.parseInt(splits[2]);
long ts = Long.parseLong(splits[3]);
// Probe the in-memory small-table index with the join key (movieid)
String movieNameAndType = movieMap.get(movieid);
if (movieNameAndType == null) {
// no matching movie in the small table: skip this rating record
return;
}
String movieName = movieNameAndType.split("::")[0];
String movieType = movieNameAndType.split("::")[1];
MovieRate mr = new MovieRate(userid, movieid, rate, ts, movieName, movieType);
context.write(new Text(mr.toString()), NullWritable.get());
}
}
}
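A run of the finished job would look roughly like the following (the jar name and HDFS paths are illustrative assumptions; the argument order is the one consumed in main(): small table, large table, output directory):
hadoop jar mapjoin.jar mapreduce.join.mapJoin.MovieRateMapJoinMR /mapjoin/movies.dat /mapjoin/ratings.dat /mapjoin/output
Each output line is the joined record produced by MovieRate.toString(): userid, movieid, rate, ts, movieName and movieType separated by tabs.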