Hadoop multi-directory input, join, and the data flow into reduce

Preface

When implementing features we often have to join data coming from multiple input directories, i.e. multiple dimensions. This post analyzes how the data actually flows.

1. Multi-directory input

Use MultipleInputs.addInputPath() to specify the input format and the Mapper class for each input directory.
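For reference, a minimal sketch of the calls against the standard org.apache.hadoop.mapreduce.lib.input.MultipleInputs class (the full job setup is in the Code section below; the code there imports com.sina.hadoop.MultipleInputs, which appears to be an internal copy exposing the same addInputPath signature, and the paths used here are placeholders):

import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

// Each input directory gets its own InputFormat and Mapper class;
// all the mappers must agree on the map output key/value types.
MultipleInputs.addInputPath(job, new Path("input/a/"), TextInputFormat.class, AMapper.class);
MultipleInputs.addInputPath(job, new Path("input/b/"), TextInputFormat.class, BMapper.class);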

2. Data flow analysis

The map reads the input line by line; records from different input directories must be tagged with different markers (this technique is also called a reduce-side join). The map output is then partitioned and sorted by key, and sent to the reducers for processing.
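In the code below, the tag travels in the second field of a composite key (the TextPair class); the partitioner hashes only the first field and the grouping comparator also compares only the first field, so every record with the same join id lands in a single reduce call, sorted by tag. A mapper emit therefore looks roughly like this (id and payload are placeholder names):

    // tag "1" marks records from the a/ directory; b/ and c/ use "2" and "3"
    context.write(new TextPair(id, "1"), new Text(payload));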


Example

Three input files:
a.txt:
500
501

b.txt:
500	501
600	505

c.txt:
501	500
700	800
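Assuming TextPair's sort comparator orders keys by the first field and then by the source tag (see the sketch after the code), and with the fields tab-separated, the single reducer in the code below would produce output roughly like:

500	0
500	501
501	0
501	500
600	505
700	800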




Code

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import util.TextPair;

import com.sina.hadoop.MultipleInputs;

public class Main extends Configured implements Tool
{

    public static void main(String[] args) throws Exception
    {
        int exitcode = ToolRunner.run(new Main(), args);
        System.exit(exitcode);
    }

    /**
     * Partitioner: partition by the first field of the composite key (the join id)
     */
    static class TextPairKeyPartitioner extends Partitioner<TextPair, Text>
    {
        public int getPartition(TextPair key, Text value, int numPartitions)
        {
            return (key.getFirst().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }

    }

    public int run(String[] arg0) throws Exception
    {
        int exitcode = 0;
        if (exitcode == 0)
        {
            Job job1 = new Job(getConf());
            job1.setJobName("testMultipleInputs");
            job1.setJarByClass(Main.class);

            MultipleInputs.addInputPath(job1, new Path("xx/testMultipleInputs/input/a/"),
                    TextInputFormat.class, AMapper.class);
            MultipleInputs.addInputPath(job1, new Path("xx/testMultipleInputs/input/b/"),
                    TextInputFormat.class, BMapper.class);
            MultipleInputs.addInputPath(job1, new Path("xx/testMultipleInputs/input/c/"),
                    TextInputFormat.class, CMapper.class);

            job1.setReducerClass(TestReducer.class);
            FileOutputFormat.setOutputPath(job1, new Path("xx/testMultipleInputs/output/"));
            job1.setOutputKeyClass(Text.class);
            job1.setOutputValueClass(Text.class);
            job1.setPartitionerClass(TextPairKeyPartitioner.class);
            job1.setGroupingComparatorClass(TextPair.FirstComparator.class);
            job1.setMapOutputKeyClass(TextPair.class);
            job1.setMapOutputValueClass(Text.class);

            job1.setNumReduceTasks(1);
            exitcode = job1.waitForCompletion(true) ? 0 : 1;
        }

        return exitcode;
    }

    // Mapper for directory a/: key = (id, tag "1"), value = "0" as a presence marker.
    // Must be static so Hadoop can instantiate it by reflection.
    public static class AMapper extends Mapper<LongWritable, Text, TextPair, Text>
    {
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
        {
            String[] data = value.toString().split("\t", -1);
            String id = "";
            if (data.length >= 1)
            {
                id = data[0];
                if (!"".equals(id))
                {
                    context.write(new TextPair(id, "1"), new Text("0"));
                }
            }
        }
    }

    // Mapper for directory b/: key = (id1, tag "2"), value = id2.
    public static class BMapper extends Mapper<LongWritable, Text, TextPair, Text>
    {
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
        {
            String[] data = value.toString().split("\t", -1);
            String id1 = "";
            String id2 = "";
            if (data.length >= 2)
            {
                id1 = data[0];
                id2 = data[1];
                if (!"".equals(id1))
                {
                    context.write(new TextPair(id1, "2"), new Text(id2));
                }
            }
        }
    }

    // Mapper for directory c/: key = (id1, tag "3"), value = id2.
    public static class CMapper extends Mapper<LongWritable, Text, TextPair, Text>
    {
        public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException
        {
            String[] data = value.toString().split("\t", -1);
            String id1 = "";
            String id2 = "";
            if (data.length >= 2)
            {
                id1 = data[0];
                id2 = data[1];
                if (!"".equals(id1))
                {
                    context.write(new TextPair(id1, "3"), new Text(id2));
                }
            }
        }
    }

    // With grouping on the first field, one reduce call receives the values from
    // all three sources for a given id, ordered by tag; each value is written out.
    public static class TestReducer extends Reducer<TextPair, Text, Text, Text>
    {
        public void reduce(TextPair key, Iterable<Text> values, Context context) throws IOException, InterruptedException
        {
            String data = "";
            Iterator<Text> i = values.iterator();
            while (i.hasNext())
            {
                data = i.next().toString();
                context.write(key.getFirst(), new Text(data));
            }
        }

    }

}
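The TextPair class imported as util.TextPair is not shown in the post. A minimal sketch, modeled on the composite-key WritableComparable from Hadoop: The Definitive Guide (first field = join id, second field = source tag), could look like this:

package util;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class TextPair implements WritableComparable<TextPair>
{
    private Text first;
    private Text second;

    public TextPair()
    {
        this.first = new Text();
        this.second = new Text();
    }

    public TextPair(String first, String second)
    {
        this.first = new Text(first);
        this.second = new Text(second);
    }

    public Text getFirst()
    {
        return first;
    }

    public Text getSecond()
    {
        return second;
    }

    public void write(DataOutput out) throws IOException
    {
        first.write(out);
        second.write(out);
    }

    public void readFields(DataInput in) throws IOException
    {
        first.readFields(in);
        second.readFields(in);
    }

    // Sort comparator: order by the join id first, then by the source tag,
    // so that for each id the records from a/, b/ and c/ arrive in tag order.
    public int compareTo(TextPair tp)
    {
        int cmp = first.compareTo(tp.first);
        return cmp != 0 ? cmp : second.compareTo(tp.second);
    }

    public int hashCode()
    {
        return first.hashCode() * 163 + second.hashCode();
    }

    public boolean equals(Object o)
    {
        if (o instanceof TextPair)
        {
            TextPair tp = (TextPair) o;
            return first.equals(tp.first) && second.equals(tp.second);
        }
        return false;
    }

    // Grouping comparator: compare only the first field, so all tags for the
    // same join id are handed to a single reduce() call.
    public static class FirstComparator extends WritableComparator
    {
        protected FirstComparator()
        {
            super(TextPair.class, true);
        }

        @SuppressWarnings("rawtypes")
        public int compare(WritableComparable a, WritableComparable b)
        {
            return ((TextPair) a).getFirst().compareTo(((TextPair) b).getFirst());
        }
    }
}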


