A MapReduce join algorithm programming example

The map-side join algorithm

1. The principle

This approach suits joins where one of the tables involved is small. The small table can be shipped to every map node, so each map task joins the slice of the large table it reads locally and emits the final result directly. Because no shuffle to reducers is needed, this greatly increases the parallelism of the join and speeds up processing.

2. Example:

Data for the two tables:

Product table data:
p0001,小米5,1000,2000
p0002,锤子T1,1000,3000
Order table data:
1001,20150710,p0001,2
1002,20150710,p0002,3
1002,20150710,p0003,3
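
With these inputs, the map-side join below produces output along the following lines (TextOutputFormat writes the key, a tab, then the value; order 1002's p0003 row has no matching product and is skipped by the mapper's null check):

1001	20150710	小米5	1000	2000	2
1002	20150710	锤子T1	1000	3000	3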

Write the Mapper class

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.HashMap;

public class joinMap extends Mapper<LongWritable, Text, Text, Text> {
    //product id -> product details, loaded from the distributed cache in setup()
    HashMap<String, String> map = new HashMap<String, String>();
    String line = null;

    /**
     * In the mapper's setup() method, read the cached (small) product file
     * once and load it into an in-memory HashMap keyed by product id.
     *
     * @param context
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        //get the URIs of all cache files registered for this job
        URI[] cacheFiles = DistributedCache.getCacheFiles(context.getConfiguration());
        //get the file system the cache file lives on, then open the file
        FileSystem fileSystem = FileSystem.get(cacheFiles[0], context.getConfiguration());
        FSDataInputStream open = fileSystem.open(new Path(cacheFiles[0]));
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(open));
        //each product line (p0001,小米5,1000,2000) becomes key p0001 -> "小米5\t1000\t2000"
        while ((line = bufferedReader.readLine()) != null) {
            String[] split = line.split(",");
            map.put(split[0], split[1] + "\t" + split[2] + "\t" + split[3]);
        }
        //close the reader before the file system it reads from
        IOUtils.closeStream(bufferedReader);
        fileSystem.close();
    }

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        //value is one line of the order-table split this map task is processing (on HDFS)
        String[] fields = value.toString().split(",");
        String orderId = fields[0];
        String date = fields[1];
        String pdId = fields[2];
        String amount = fields[3];
        //look up the product details cached in setup()
        String productInfo = map.get(pdId);
        //skip orders with no matching product row (e.g. p0003 in the sample data)
        if (productInfo == null) {
            return;
        }
        context.write(new Text(orderId), new Text(date + "\t" + productInfo + "\t" + amount));
    }
}
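
Note that the DistributedCache class used above has been deprecated since Hadoop 2.x. Here is a minimal sketch of the same setup() logic using the newer cache-file API; it assumes the driver registers the file via job.addCacheFile (a matching driver sketch follows the driver code below):

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        //URIs of the files registered in the driver via job.addCacheFile(...)
        URI[] cacheFiles = context.getCacheFiles();
        FileSystem fileSystem = FileSystem.get(cacheFiles[0], context.getConfiguration());
        //try-with-resources closes the reader (and the underlying stream) automatically
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fileSystem.open(new Path(cacheFiles[0]))))) {
            String cachedLine;
            while ((cachedLine = reader.readLine()) != null) {
                String[] split = cachedLine.split(",");
                map.put(split[0], split[1] + "\t" + split[2] + "\t" + split[3]);
            }
        }
    }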

Write the driver (main) class

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import java.net.URI;

public class joinJobMain extends Configured implements Tool {
    @Override
    public int run(String[] strings) throws Exception {
        Configuration conf = super.getConf();
        //Note: the cache file must live on HDFS; a file on the local file system cannot be loaded this way
        DistributedCache.addCacheFile(new URI("hdfs://node01:8020/cachefile/pdts.txt"), conf);
        Job job = Job.getInstance(conf, joinJobMain.class.getSimpleName());
        job.setJarByClass(joinJobMain.class);
        job.setInputFormatClass(TextInputFormat.class);
        TextInputFormat.addInputPath(job, new Path("file:///d:\\map端join\\map_join_input"));
        job.setMapperClass(joinMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        //map-side join: the mapper emits the final result, so no reduce phase is needed
        job.setNumReduceTasks(0);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("file:///d:\\map端join\\map_join_output"));
        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        //propagate the job's exit status to the shell
        System.exit(ToolRunner.run(configuration, new joinJobMain(), args));
    }
}
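
If the mapper uses the newer setup() sketch above, the driver registers the cache file on the Job object instead of through DistributedCache. A minimal sketch (same HDFS path, Hadoop 2.x or later):

        Job job = Job.getInstance(conf, joinJobMain.class.getSimpleName());
        //newer replacement for DistributedCache.addCacheFile: register the file on the Job itself
        job.addCacheFile(new URI("hdfs://node01:8020/cachefile/pdts.txt"));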

The drawback of the reduce-side join algorithm:

Drawback: in a reduce-side join, the join itself is performed in the reduce phase, so the reducers bear the bulk of the processing load while the map nodes do very little work, leaving cluster resources underused. Worse, the reduce phase is highly prone to data skew when some join keys are much hotter than others.
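
For contrast, here is a minimal sketch of a reduce-side join for the same data. The class names are illustrative, and telling the two tables apart by inspecting the first field is a shortcut that only works for this sample data; a real job would usually check the input file path via context.getInputSplit():

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

//mapper: tag each record with its source table and emit it keyed by product id,
//so rows from both tables are shuffled to the reducers on the join key
class ReduceJoinMapper extends Mapper<LongWritable, Text, Text, Text> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split(",");
        if (fields[0].startsWith("p")) {
            //product row: p0001,小米5,1000,2000 -- the join key is the first field
            context.write(new Text(fields[0]), new Text("P#" + value));
        } else {
            //order row: 1001,20150710,p0001,2 -- the join key is the third field
            context.write(new Text(fields[2]), new Text("O#" + value));
        }
    }
}

//reducer: every record sharing one product id lands here, so the join work --
//and any skew on hot keys -- is concentrated in the reduce phase
class ReduceJoinReducer extends Reducer<Text, Text, Text, Text> {
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        String product = null;
        List<String> orders = new ArrayList<String>();
        for (Text v : values) {
            String s = v.toString();
            if (s.startsWith("P#")) {
                product = s.substring(2);
            } else {
                orders.add(s.substring(2));
            }
        }
        for (String order : orders) {
            context.write(key, new Text(product + "\t" + order));
        }
    }
}

Every record sharing a join key must travel through the shuffle and be buffered inside a single reduce() call, which is exactly where the load concentration and skew described above come from.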
