MapReduce2-3.1.1 Distributed Computing Experiment Example (4): Join, Taking the Intersection of Two Result Sets

Hello everyone, I'm Iggi.

Today I'd like to share a Join experiment based on MapReduce2-3.1.1.

For a short written introduction to MapReduce, please refer to my earlier experiment example: MapReduce2-3.1.1 Experiment Example: Word Count (1).

Now, down to business: this post shows how to use Java with the MapReduce2 component to join two result sets.

First, create a Maven project in your IDE. Nothing special is required during project creation; just follow the wizard and click Finish. The important step is adding the dependency packages to pom.xml, roughly as sketched below:

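The original screenshot of the pom.xml is not reproduced here. Below is a minimal sketch of what the dependency section might look like; the exact artifact list is an assumption on my part (hadoop-client pulls in the HDFS and MapReduce client APIs), so adjust the artifacts and the 3.1.1 version to match your cluster.

<dependencies>
    <!-- Hadoop client APIs (HDFS + MapReduce); version assumed to match the 3.1.1 cluster -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>3.1.1</version>
    </dependency>
    <!-- MapReduce core classes (Job, Mapper, Reducer, ...) -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-mapreduce-client-core</artifactId>
        <version>3.1.1</version>
    </dependency>
    <!-- Only needed if you uncomment BasicConfigurator.configure() in the driver -->
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <version>1.2.17</version>
    </dependency>
</dependencies>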

Once Maven has downloaded the dependency jars, you can start writing the program.

The driver code for the experiment:

package linose.mapreduce.join;

import java.io.IOException;
import java.io.OutputStreamWriter;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
//import org.apache.log4j.BasicConfigurator;

public class AppJoin {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        
        /**
         * Give this MapReduce example permission to operate on HDFS (run as the "hdfs" user).
         */
        System.setProperty("HADOOP_USER_NAME", "hdfs");
        
        /**
         * To keep the output easy to read, cluster debug logging is left at its default (off).
         * Uncomment the line below if you want to see the cluster debug output.
         */
        //BasicConfigurator.configure();
        
        /**
         * MapReduce experiment preparation:
         * define the HDFS file paths.
         */
        String defaultFS = "hdfs://master2.linose.cloud.beijing.com:8020";
        String inputPath1 = defaultFS + "/index.dirs/data1.txt";
        String inputPath2 = defaultFS + "/index.dirs/data2.txt";
        String outputPath = defaultFS + "/index.dirs/joinresult.txt";
        
        /**
         * Define the input and output paths.
         */
        Path inputHdfsPath1 = new Path(inputPath1);
        Path inputHdfsPath2 = new Path(inputPath2);
        Path outputHdfsPath = new Path(outputPath);
        
        /**
         * Create the configuration and obtain the HDFS FileSystem object.
         */
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", defaultFS);
        FileSystem system = FileSystem.get(conf);
        
        /**
         * Recreate the experiment data files (any existing copies are deleted first).
         */
        system.delete(inputHdfsPath1, false);
        if (!system.exists(inputHdfsPath1)) {
            FSDataOutputStream outputStream = system.create(inputHdfsPath1);
            OutputStreamWriter file = new OutputStreamWriter(outputStream);
            file.write("2019-05-13 13:00\t001\tIggi\tbeijing\t010\n");
            file.write("2019-05-13 14:00\t002\tchap\tbeijing\t010\n");
            file.write("2019-05-13 15:00\t003\ttxingsg\tbeijing\t010\n");
            file.write("2019-05-13 16:00\t004\tthedd\tbeijing\t010\n");
            file.write("2019-05-13 17:00\t005\ttlix\tbeijing\t010\n");
            file.close();
            outputStream.close();
        }
        system.delete(inputHdfsPath2, false);
        if (!system.exists(inputHdfsPath2)) {
            FSDataOutputStream outputStream = system.create(inputHdfsPath2);
            OutputStreamWriter file = new OutputStreamWriter(outputStream);
            file.write("002\tjava\n");
            file.write("003\tjava\n");
            file.write("004\tproduct\n");
            file.write("005\tmanager\n");
            file.close();
            outputStream.close();
        }
        
        /**
         * If the result directory already exists, delete its files and then the directory itself.
         */
        if (system.exists(outputHdfsPath)) {
            RemoteIterator<LocatedFileStatus> fsIterator = system.listFiles(outputHdfsPath, true);
            LocatedFileStatus fileStatus;
            while (fsIterator.hasNext()) {
                fileStatus = fsIterator.next();
                system.delete(fileStatus.getPath(), false);
            }
            system.delete(outputHdfsPath, false);
        }
        
        /**
         * Create the MapReduce job and set its name.
         */
        Job job = Job.getInstance(conf, "Join Data Set");
        job.setJarByClass(JoinMapper.class);

        /**
         * Set the input files and the output directory.
         */
        FileInputFormat.addInputPath(job, inputHdfsPath1);
        FileInputFormat.addInputPath(job, inputHdfsPath2);
        FileOutputFormat.setOutputPath(job, outputHdfsPath);
        
        /**
         * Specify the key and value types emitted by the Reducer.
         */
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        
        /**
         * Specify the key and value types emitted by the Mapper.
         */
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);
        
        /**
         * Specify the custom Mapper, Reducer, Partitioner, and grouping Comparator classes.
         */
        job.setMapperClass(JoinMapper.mrJoinMapper.class);
        job.setReducerClass(JoinMapper.mrJoinReduce.class);
        job.setPartitionerClass(JoinMapper.mrJoinPartitioner.class);
        job.setGroupingComparatorClass(JoinMapper.mrJoinComparator.class);
        
        /**
         * Submit the job without blocking, then poll its progress until it completes.
         * (waitForCompletion(true) would block here, which would make the polling loop pointless.)
         */
        job.submit();
        
        float progress = 0.0f;
        do {
            progress = (job.mapProgress() + job.reduceProgress()) / 2;
            System.out.println("Join Data Set progress: " + progress * 100 + "%");
            Thread.sleep(1000);
        } while (!job.isComplete());
        
        /**
         * If the job succeeded, print the contents of the output files.
         */
        if (job.isSuccessful()) {
            RemoteIterator<LocatedFileStatus> fsIterator = system.listFiles(outputHdfsPath, true);
            LocatedFileStatus fileStatus;
            while (fsIterator.hasNext()) {
                fileStatus = fsIterator.next();
                FSDataInputStream inputStream = system.open(fileStatus.getPath());
                IOUtils.copyBytes(inputStream, System.out, conf, false);
                inputStream.close();
                System.out.println("--------------------------------------------");
            }
        }
    }
}

The TextPair class, the composite map output key used for the Join:

package linose.mapreduce.join;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

public class TextPair implements WritableComparable<TextPair> {
    
    /**
     * pairKey is the join key; specificKey carries the "0"/"1" source tag used only for sorting.
     */
    private Text pairKey;
    private Text specificKey;
    
    public TextPair() {
        set(new Text(), new Text());
    }
    
    public TextPair(Text pairKey, Text specificKey) {
        set(pairKey, specificKey);
    }
    
    public TextPair(String pairKey, String specificKey) {
        set(new Text(pairKey), new Text(specificKey));
    }
    
    public void set(Text pairKey, Text specificKey) {
        this.pairKey = pairKey;
        this.specificKey = specificKey;
    }
    
    public Text getPairKey() {
        return pairKey;
    }
    
    public Text getSpecificKey() {
        return specificKey;
    }
    
    // Deserialize the two fields in the same order they were written
    public void readFields(DataInput input) throws IOException {
        pairKey.readFields(input);
        specificKey.readFields(input);
    }
    // Serialize the two Text fields back to back
    public void write(DataOutput output) throws IOException {
        pairKey.write(output);
        specificKey.write(output);
    }
    // Sort by the join key first, then by the tag, so "0" (data2.txt) records come before "1" (data1.txt)
    public int compareTo(TextPair o) {
        int compare = pairKey.compareTo(o.pairKey);
        if (0 != compare) {
            return compare;
        }
        return specificKey.compareTo(o.specificKey);
    }
    
    public int hashCode() {
        return pairKey.hashCode()*163 + specificKey.hashCode();
    }
    
    public boolean equals(Object o) {
        if (o instanceof TextPair) {
            TextPair pair = (TextPair)o;
            return pairKey.equals(pair.pairKey) && specificKey.equals(pair.specificKey);
        }
        return false;
    }

    public String toString() {
        return pairKey + "\t" + specificKey;
    }

}

The Join classes. This is a reduce-side join: the mapper tags every record with its source file, the partitioner and grouping comparator route all records that share a join key (the id column) to the same reduce call, and the reducer pairs each data1.txt record with its matching data2.txt record:

package linose.mapreduce.join;

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class JoinMapper {
    
    public static class mrJoinMapper extends Mapper<LongWritable, Text, TextPair, Text> {
        
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            FileSplit file = (FileSplit) context.getInputSplit();
            String path = file.getPath().toString();
            
            if (path.contains("data1.txt")) {
                String values[] = value.toString().split("\t");
                if (values.length < 3) {
                    return;
                } 
                
                // values[0] is the timestamp, values[1] is the id (the join key)
                String specificKey = values[0];
                String pairKey = values[1];
                
                StringBuffer buffer = new StringBuffer();
                buffer.append(specificKey);
                buffer.append("\t");
                for (int i = 2; i < values.length; ++i) {
                    buffer.append(values[i]);
                    buffer.append("\t");
                }
                
                // Tag data1.txt records with "1" so they sort after the matching data2.txt record
                TextPair outputKey = new TextPair(pairKey, "1");
                Text outputValue = new Text(buffer.toString());
                context.write(outputKey, outputValue);
            }
            
            if (path.contains("data2.txt")) {
                String values[] = value.toString().split("\t");
                if (values.length < 2) {
                    return;
                } 
                
                String pairKey = values[0];
                
                StringBuffer buffer = new StringBuffer();
                
                for (int i = 1; i < values.length; ++i) {
                    buffer.append(values[i]);
                    buffer.append("\t");
                }
                
                // Tag data2.txt records with "0" so they arrive first within each join-key group
                TextPair outputKey = new TextPair(pairKey, "0");
                Text outputValue = new Text(buffer.toString());
                context.write(outputKey, outputValue);
            }
        }
    }
    
    public static class mrJoinReduce extends Reducer<TextPair, Text, Text, Text> {
        
        protected void reduce(TextPair key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            Text pairKey = key.getPairKey();
            // Thanks to the sort order ("0" before "1") and the grouping comparator, the first
            // value in the group comes from data2.txt; all remaining values come from data1.txt.
            Iterator<Text> iterator = values.iterator();
            String val = iterator.next().toString();
            while (iterator.hasNext()) {
                Text pairValue = new Text(iterator.next().toString() + "\t" + val);
                context.write(pairKey, pairValue);
            }
        }
    }
    
    public static class mrJoinPartitioner extends Partitioner<TextPair, Text> {

        @Override
        public int getPartition(TextPair key, Text value, int numPartitions) {
            // Partition on the join key only, so matching records from both files meet in the same reducer
            return Math.abs(key.getPairKey().hashCode() * 127) % numPartitions;
        }
        
    }
    
    public static class mrJoinComparator extends WritableComparator {
        
        public mrJoinComparator() {
            super(TextPair.class, true);
        }
        
        // Group reduce input by the join key only, ignoring the "0"/"1" source tag
        public int compare(@SuppressWarnings("rawtypes") WritableComparable data1,  @SuppressWarnings("rawtypes") WritableComparable data2) {
            TextPair d1 = (TextPair)data1;
            TextPair d2 = (TextPair)data2;
            return d1.getPairKey().compareTo(d2.getPairKey());
            
        }
    }
}
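Before looking at the result, here is a small optional check that is not part of the original post (the class name TextPairOrderCheck is just an illustrative choice). It shows why this join works: within one join key, the "0"-tagged record from data2.txt sorts ahead of the "1"-tagged records from data1.txt, while the grouping comparator treats both as the same group.

package linose.mapreduce.join;

public class TextPairOrderCheck {

    public static void main(String[] args) {
        TextPair fromData2 = new TextPair("002", "0");   // key emitted by the data2.txt branch of the mapper
        TextPair fromData1 = new TextPair("002", "1");   // key emitted by the data1.txt branch of the mapper

        // Prints a negative number: the data2.txt key sorts first, so the reducer reads its value first.
        System.out.println("sort order: " + fromData2.compareTo(fromData1));

        // Prints 0: the grouping comparator compares only the join key, so both keys share one reduce call.
        System.out.println("same group: " + new JoinMapper.mrJoinComparator().compare(fromData2, fromData1));
    }
}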

The test result (the output printed by the driver at the end of the run):


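The original screenshot of the result is not reproduced here. Based on the sample records written by the driver, the joined output should look roughly like the sketch below: ids 002 through 005 appear in both files, while id 001 has no match in data2.txt and is dropped, which is exactly the intersection behaviour. The separators are the tab characters appended by the mapper, so the exact spacing in your run may differ.

002    2019-05-13 14:00    chap    beijing    010        java
003    2019-05-13 15:00    txingsg    beijing    010        java
004    2019-05-13 16:00    thedd    beijing    010        product
005    2019-05-13 17:00    tlix    beijing    010        manager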

This concludes the MapReduce2-3.1.1 Join experiment example.
