Hadoop应用——Map端Join操作

联接
使用案例
Table EMP:

Name    Sex Age DepNo
zhang   male    20  1
li  female  25  2
wang    female  30  3
zhou    male    35  2

Table DEP:

DepNo   DepName
1   Sales
2   Dev
3   Mgt

Map端联接是指在数据到达map处理函数之前就完成合并的联接方式。
基本思路
1)需要join的两个文件,一个存储在HDFS中作为作业的输入,另一个通过DistributedCache.addCacheFile()加入到所有Map任务的缓存里
2)在Map函数里读取该文件,进行join
3)将结果输出到HDFS上
4)DistributedCache.addCacheFile()需要在作业提交前设置

Hadoop DistributedCache原理分析
DistributedCache是Hadoop为方便用户进行应用程序开发而设计的文件分发工具。它能够将只读的外部文件自动分发到各个节点上进行本地缓存,以便Task运行时加载使用。它的大体工作流程如下:
1、用户提交作业后,Hadoop将由-files和-archives选项指定的文件复制到JobTracker的文件系统(一般为HDFS)中;
2、之后,当某个TaskTracker收到该作业的第一个Task后,该任务将负责从JobTracker文件系统中将文件下载到本地磁盘进行缓存,这样后续的Task就可以直接在本地访问这些文件了。
EMP_DEP.java

package com.join;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * One row of the EMP table joined with its DEP row, serializable as a Hadoop
 * {@code Writable}.
 *
 * <p>String fields are never {@code null}: a {@code null} passed to a setter or
 * copied from another instance is stored as the empty string, because
 * {@link DataOutput#writeUTF(String)} throws {@link NullPointerException} on
 * {@code null}.
 *
 * <p>Ordering is by {@code depNo}, then {@code name}, {@code age}, {@code sex},
 * {@code depName} and {@code table}; {@link #equals(Object)} and
 * {@link #hashCode()} are consistent with it.
 */
public class EMP_DEP implements WritableComparable<EMP_DEP> {
    private String name = "";
    private String sex = "";
    private int age = 0;
    private int depNo = 0;
    private String depName = "";
    /** Free-form table tag; this class stores and serializes it but never interprets it. */
    private String table = "";

    /** Creates an empty record; required so Hadoop can instantiate it before {@link #readFields}. */
    public EMP_DEP() {
    }

    /**
     * Copy constructor.
     *
     * @param emp_dep the record to copy; must not be {@code null}
     */
    public EMP_DEP(EMP_DEP emp_dep) {
        this.name = nullToEmpty(emp_dep.getName());
        this.sex = nullToEmpty(emp_dep.getSex());
        this.age = emp_dep.getAge();
        this.depNo = emp_dep.getDepNo();
        this.depName = nullToEmpty(emp_dep.getDepName());
        this.table = nullToEmpty(emp_dep.getTable());
    }

    /** Maps {@code null} to the empty string so every String field stays serializable. */
    private static String nullToEmpty(String s) {
        return s == null ? "" : s;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = nullToEmpty(name);
    }

    public String getSex() {
        return sex;
    }

    public void setSex(String sex) {
        this.sex = nullToEmpty(sex);
    }

    public int getAge() {
        return age;
    }

    public void setAge(int age) {
        this.age = age;
    }

    public int getDepNo() {
        return depNo;
    }

    public void setDepNo(int depNo) {
        this.depNo = depNo;
    }

    public String getDepName() {
        return depName;
    }

    public void setDepName(String depName) {
        this.depName = nullToEmpty(depName);
    }

    public String getTable() {
        return table;
    }

    public void setTable(String table) {
        this.table = nullToEmpty(table);
    }

    /**
     * Reads the fields in exactly the order {@link #write(DataOutput)} emits them.
     *
     * @param in the stream to read from
     * @throws IOException if the stream cannot be read
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.sex = in.readUTF();
        this.age = in.readInt();
        this.depNo = in.readInt();
        this.depName = in.readUTF();
        this.table = in.readUTF();
    }

    /**
     * Writes name, sex, age, depNo, depName and table, in that order.
     *
     * @param out the stream to write to
     * @throws IOException if the stream cannot be written
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeUTF(sex);
        out.writeInt(age);
        out.writeInt(depNo);
        out.writeUTF(depName);
        out.writeUTF(table);
    }

    /**
     * Orders records by department number first, then by the remaining fields,
     * so that distinct records never compare as equal.
     *
     * @param other the record to compare with; must not be {@code null}
     * @return a negative, zero or positive value per the {@link Comparable} contract
     */
    @Override
    public int compareTo(EMP_DEP other) {
        int result = Integer.compare(depNo, other.depNo);
        if (result != 0) {
            return result;
        }
        result = name.compareTo(other.name);
        if (result != 0) {
            return result;
        }
        result = Integer.compare(age, other.age);
        if (result != 0) {
            return result;
        }
        result = sex.compareTo(other.sex);
        if (result != 0) {
            return result;
        }
        result = depName.compareTo(other.depName);
        if (result != 0) {
            return result;
        }
        return table.compareTo(other.table);
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof EMP_DEP)) {
            return false;
        }
        EMP_DEP other = (EMP_DEP) o;
        return age == other.age
                && depNo == other.depNo
                && name.equals(other.name)
                && sex.equals(other.sex)
                && depName.equals(other.depName)
                && table.equals(other.table);
    }

    @Override
    public int hashCode() {
        int result = name.hashCode();
        result = 31 * result + sex.hashCode();
        result = 31 * result + age;
        result = 31 * result + depNo;
        result = 31 * result + depName.hashCode();
        result = 31 * result + table.hashCode();
        return result;
    }

    /** Returns {@code "name sex age depName"}, the text form of the joined row. */
    @Override
    public String toString() {
        return name + " " + sex + " " + age + " " + depName;
    }
}

MapSideMapper.java

package com.join;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Map-side join of EMP records (the job input) against the DEP table, which is
 * loaded from the distributed cache into memory once per task.
 *
 * <p>Each input line is expected to hold {@code Name Sex Age DepNo} separated by
 * whitespace; each DEP line is expected to hold {@code DepNo DepName}.
 */
public class MapSideMapper extends Mapper<LongWritable, Text, NullWritable, EMP_DEP> {

    /** DepNo to DepName lookup built in {@link #setup}. */
    private final Map<Integer, String> joinData = new HashMap<>();

    /**
     * Parses one EMP line, looks up its department name and emits the joined record.
     * Blank lines are skipped. The department name passed to the record is
     * {@code null} when the DepNo has no entry in the DEP table.
     *
     * @param key     the input key supplied by the input format (unused)
     * @param value   one line of the EMP table
     * @param context the task context used to emit the joined record
     * @throws IOException          if the record cannot be written
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Trim first: a leading blank would otherwise produce an empty first token.
        String line = value.toString().trim();
        if (line.isEmpty()) {
            return;
        }
        // Process one EMP row.
        String[] values = line.split("\\s+");
        EMP_DEP emp_dep = new EMP_DEP();
        emp_dep.setName(values[0]);
        emp_dep.setSex(values[1]);
        emp_dep.setAge(Integer.parseInt(values[2]));
        int depNo = Integer.parseInt(values[3]);
        String depName = joinData.get(depNo);
        emp_dep.setDepNo(depNo);
        emp_dep.setDepName(depName);
        context.write(NullWritable.get(), emp_dep);
    }

    /**
     * Loads the DEP table from the first localized cache file into {@link #joinData}.
     *
     * @param context the task context whose configuration lists the cache files
     * @throws IOException          if no cache file is registered or it cannot be read
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(context.getConfiguration());
        if (cacheFiles == null || cacheFiles.length == 0) {
            throw new IOException("DEP table not found in the distributed cache");
        }
        // Only the DEP file is cached; it has already been copied to the local file system.
        try (BufferedReader reader =
                new BufferedReader(new FileReader(cacheFiles[0].toString()))) {
            String row;
            while ((row = reader.readLine()) != null) {
                row = row.trim();
                if (row.isEmpty()) {
                    continue;
                }
                String[] fields = row.split("\\s+");
                joinData.put(Integer.valueOf(fields[0]), fields[1]);
            }
        }
    }

}

在Mapper或者Reducer类中使用缓存文件时,由于在Mapper或者Reducer开始运行前,这些文件已经下载到本地的工作目录中,因此直接调用文件读写API即可获取文件内容。本例在map函数执行之前,通过setup方法读取DEP文件完成初始化。

TestMapSideJoin.java

package com.join;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

/**
 * Driver for the map-side join job.
 *
 * <p>Expects three arguments after the generic Hadoop options: the EMP input
 * path, the DEP file to place in the distributed cache, and the output path.
 */
public class TestMapSideJoin {
    /**
     * Configures and runs the map-only join job, then exits with 0 on success,
     * 1 on job failure, or 2 on wrong usage.
     *
     * @param args generic Hadoop options followed by the three paths
     * @throws Exception if job setup or execution fails
     */
    public static void main(String args[]) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 3) {
            System.err.println(
                    "Usage: TestMapSideJoin <emp input> <dep cache file> <output>");
            System.exit(2);
        }

        // Register the DEP file on conf before the Job is built from it.
        DistributedCache.addCacheFile(new Path(otherArgs[1]).toUri(), conf);

        Job job = new Job(conf, "Map side join");
        job.setJarByClass(TestMapSideJoin.class);
        job.setMapperClass(MapSideMapper.class);
        // Map-only job: the join is finished in the mapper, so no reduce phase is needed.
        job.setNumReduceTasks(0);

        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(EMP_DEP.class);
        // With zero reducers the mapper output is the final output, so declare the same types.
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(EMP_DEP.class);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

DistributedCache添加使用的外部文件只需要一行代码

DistributedCache.addCacheFile(new Path(otherArgs[1]).toUri(), conf) ;

你可能感兴趣的:(大数据,hadoop)