Hadoop 实例15 MultipleInputs实战2:多种自定义文件格式的文件输入处理

MultipleInputs 可以让MR支持多种输入格式。

比如我们有两种文件格式,那么我们就需要有两套 Record Class, RecordReader和InputFormat。

MultipleInputs需要不同的InputFormat, 一种InputFormat使用一种RecordReader来读取文件并返回一种Record格式的值,这就是这三个类型的关系,也是map过程中涉及的几个步骤的工具和产物。

1、数据准备

a文件(字段之间以制表符分隔,下面用 \t 表示)

1\t80
2\t90
3\t100
4\t50
5\t73

b文件(字段之间以制表符分隔,下面用 \t 表示)

1\tlili\t3
2\txiaoming\t3
3\tfeifei\t3
4\tzhangsan\t3
5\tlisi\t3

2、要求自定义实现inputFormat,输出 key、value格式数据

1   FirstClass:80; SecondClass:lili,ClassNum:3; 
2   SecondClass:xiaoming,ClassNum:3; FirstClass:90; 
3   FirstClass:100; SecondClass:feifei,ClassNum:3; 
4   SecondClass:zhangsan,ClassNum:3; FirstClass:50; 
5   FirstClass:73; SecondClass:lisi,ClassNum:3; 

3、程序实现:

package cn.edu.bjut.multitwo;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class FirstClass implements Writable {

    private String value;

    public FirstClass() {}

    public FirstClass(String value) {
        this.value = value;
    }


    public void write(DataOutput out) throws IOException {
        out.writeUTF(value);
    }

    public void readFields(DataInput in) throws IOException {
        this.value = in.readUTF();
    }

    @Override
    public String toString() {
        return "FirstClass:"+value;
    }

    public String getValue() {
        return value;
    }

    public void setValue(String value) {
        this.value = value;
    }
}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

/**
 * Record reader that turns tab-separated {@code key\tvalue} lines into a {@link Text} key and a
 * {@link FirstClass} value.
 *
 * <p>Line splitting is delegated to a wrapped {@link LineRecordReader}. Lines that do not contain
 * exactly two tab-separated fields (after trimming surrounding whitespace) are skipped.
 */
public class FirstClassReader extends RecordReader<Text, FirstClass> {

    /** Delimiter between the fields of an input line. */
    private static final String FIELD_SEPARATOR = "\t";

    /** Number of fields a well-formed line must contain. */
    private static final int FIELD_COUNT = 2;

    private LineRecordReader lineRecordReader = null;
    private Text key = null;
    private FirstClass firstClass = null;

    /**
     * Releases any previously opened reader and opens a fresh line reader on the given split.
     *
     * @param split   the split to read
     * @param context the task attempt context passed on to the line reader
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        close();
        lineRecordReader = new LineRecordReader();
        lineRecordReader.initialize(split, context);
    }

    /**
     * Advances to the next well-formed line.
     *
     * @return {@code true} if a record was parsed and is available through
     *         {@link #getCurrentKey()} / {@link #getCurrentValue()}; {@code false} once the
     *         underlying line reader is exhausted (current key and value are then {@code null})
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        while (lineRecordReader.nextKeyValue()) {
            String line = lineRecordReader.getCurrentValue().toString().trim();
            String[] fields = line.split(FIELD_SEPARATOR);
            if (FIELD_COUNT == fields.length) {
                key = new Text(fields[0]);
                firstClass = new FirstClass(fields[1]);
                return true;
            }
            // Malformed line: keep reading instead of reporting success with a stale
            // (or null) key/value pair left over from a previous call.
        }
        key = null;
        firstClass = null;
        return false;
    }

    /** @return the key of the current record, or {@code null} if there is none */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    /** @return the value of the current record, or {@code null} if there is none */
    @Override
    public FirstClass getCurrentValue() throws IOException,
            InterruptedException {
        return firstClass;
    }

    /** @return the progress reported by the wrapped line reader */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return lineRecordReader.getProgress();
    }

    /** Closes the wrapped line reader, if any, and clears the current record. */
    @Override
    public void close() throws IOException {
        if (null != lineRecordReader) {
            lineRecordReader.close();
            lineRecordReader = null;
        }
        key = null;
        firstClass = null;
    }

}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

/**
 * File input format whose records are {@link Text} keys with {@link FirstClass} values, read by
 * {@link FirstClassReader}.
 */
public class FirstInputFormat extends FileInputFormat<Text, FirstClass> {

    /**
     * Creates the reader for a split. The returned reader is not yet initialized; neither
     * argument is used here.
     *
     * @param split   the split that will be read
     * @param context the task attempt context
     * @return a new {@link FirstClassReader}
     */
    @Override
    public RecordReader<Text, FirstClass> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException,
            InterruptedException {
        return new FirstClassReader();
    }

}
package cn.edu.bjut.multitwo;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class SecondClass implements Writable {
    private String value;
    private int classNum;

    public SecondClass() {}

    public SecondClass(String value, int classNum) {
        super();
        this.value = value;
        this.classNum = classNum;
    }

    public void write(DataOutput out) throws IOException {
        out.writeUTF(value);
        out.writeInt(classNum);
    }

    public void readFields(DataInput in) throws IOException {
        this.value = in.readUTF();
        this.classNum = in.readInt();
    }

    @Override
    public String toString() {
        return "SecondClass:"+value+",ClassNum:"+classNum;
    }

    public String getValue() {
        return value;
    }

    public void setValue(String value) {
        this.value = value;
    }

    public int getClassNum() {
        return classNum;
    }

    public void setClassNum(int classNum) {
        this.classNum = classNum;
    }
}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

/**
 * Record reader that turns tab-separated {@code key\tname\tclassNum} lines into a {@link Text}
 * key and a {@link SecondClass} value.
 *
 * <p>Line splitting is delegated to a wrapped {@link LineRecordReader}. Lines that do not contain
 * exactly three tab-separated fields (after trimming surrounding whitespace) are skipped.
 */
public class SecondClassReader extends RecordReader<Text, SecondClass> {

    /** Delimiter between the fields of an input line. */
    private static final String FIELD_SEPARATOR = "\t";

    /** Number of fields a well-formed line must contain. */
    private static final int FIELD_COUNT = 3;

    private LineRecordReader lineRecordReader = null;
    private Text key = null;
    private SecondClass secondClass = null;

    /**
     * Releases any previously opened reader and opens a fresh line reader on the given split.
     *
     * @param split   the split to read
     * @param context the task attempt context passed on to the line reader
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        close();
        lineRecordReader = new LineRecordReader();
        lineRecordReader.initialize(split, context);
    }

    /**
     * Advances to the next well-formed line.
     *
     * @return {@code true} if a record was parsed and is available through
     *         {@link #getCurrentKey()} / {@link #getCurrentValue()}; {@code false} once the
     *         underlying line reader is exhausted (current key and value are then {@code null})
     * @throws NumberFormatException if the third field of a three-field line is not an integer
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        while (lineRecordReader.nextKeyValue()) {
            String line = lineRecordReader.getCurrentValue().toString().trim();
            String[] fields = line.split(FIELD_SEPARATOR);
            if (FIELD_COUNT == fields.length) {
                key = new Text(fields[0]);
                secondClass = new SecondClass(fields[1], Integer.parseInt(fields[2]));
                return true;
            }
            // Malformed line: keep reading instead of reporting success with a stale
            // (or null) key/value pair left over from a previous call.
        }
        key = null;
        secondClass = null;
        return false;
    }

    /** @return the key of the current record, or {@code null} if there is none */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    /** @return the value of the current record, or {@code null} if there is none */
    @Override
    public SecondClass getCurrentValue() throws IOException,
            InterruptedException {
        return secondClass;
    }

    /** @return the progress reported by the wrapped line reader */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return lineRecordReader.getProgress();
    }

    /** Closes the wrapped line reader, if any, and clears the current record. */
    @Override
    public void close() throws IOException {
        if (null != lineRecordReader) {
            lineRecordReader.close();
            lineRecordReader = null;
        }
        key = null;
        secondClass = null;
    }

}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;


/**
 * File input format whose records are {@link Text} keys with {@link SecondClass} values, read by
 * {@link SecondClassReader}.
 */
public class SecondInputFormat extends FileInputFormat<Text, SecondClass> {

    /**
     * Creates the reader for a split. The returned reader is not yet initialized; neither
     * argument is used here.
     *
     * @param split   the split that will be read
     * @param context the task attempt context
     * @return a new {@link SecondClassReader}
     */
    @Override
    public RecordReader<Text, SecondClass> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException,
            InterruptedException {
        return new SecondClassReader();
    }

}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper for {@link FirstClass} records: forwards the incoming key unchanged and emits the
 * record's string form as a {@link Text} value.
 */
public class MultiMapper1 extends Mapper<Text, FirstClass, Text, Text> {

    /**
     * Emits {@code (key, value.toString())}.
     *
     * @param key     the record key, written through as-is
     * @param value   the record whose text form becomes the output value
     * @param context the sink the pair is written to
     */
    @Override
    protected void map(Text key, FirstClass value, Context context)
            throws IOException, InterruptedException {
        final String rendered = value.toString();
        final Text outValue = new Text(rendered);
        context.write(key, outValue);
    }

}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper for {@link SecondClass} records: forwards the incoming key unchanged and emits the
 * record's string form as a {@link Text} value.
 */
public class MultiMapper2 extends Mapper<Text, SecondClass, Text, Text> {

    /**
     * Emits {@code (key, value.toString())}.
     *
     * @param key     the record key, written through as-is
     * @param value   the record whose text form becomes the output value
     * @param context the sink the pair is written to
     */
    @Override
    protected void map(Text key, SecondClass value, Context context)
            throws IOException, InterruptedException {
        final String rendered = value.toString();
        final Text outValue = new Text(rendered);
        context.write(key, outValue);
    }

}
package cn.edu.bjut.multitwo;

import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reducer that concatenates all values of a key into one {@link Text}, each value followed by
 * {@code "; "} (so the result ends with a trailing {@code "; "}).
 */
public class MultiReducer extends Reducer<Text, Text, Text, Text> {

    /**
     * Joins the values in the order they are iterated and writes a single pair for the key.
     *
     * @param key     the grouping key, written through as-is
     * @param values  the values collected for the key
     * @param context the sink the joined pair is written to
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Method-local buffer, so the unsynchronized StringBuilder is sufficient.
        StringBuilder joined = new StringBuilder();
        for (Text text : values) {
            joined.append(text.toString()).append("; ");
        }
        context.write(key, new Text(joined.toString()));
    }

}
package cn.edu.bjut.multitwo;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver that wires two input paths, each with its own input format and mapper, into a single
 * job reduced by {@link MultiReducer}.
 */
public class MainJob {

    /**
     * Configures and runs the job, then exits with status 0 on success or 1 on failure.
     *
     * @param args {@code args[0]}: path read with {@link FirstInputFormat};
     *             {@code args[1]}: path read with {@link SecondInputFormat};
     *             {@code args[2]}: output path, deleted recursively first if it already exists
     * @throws Exception if job setup, output cleanup, or job execution fails
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.err.println("Usage: MainJob <firstInput> <secondInput> <output>");
            System.exit(2);
        }

        Configuration conf = new Configuration();
        Job job = new Job(conf, "multi");
        job.setJarByClass(MainJob.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MultiReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Each input path gets its own input format and mapper.
        MultipleInputs.addInputPath(job, new Path(args[0]), FirstInputFormat.class, MultiMapper1.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), SecondInputFormat.class, MultiMapper2.class);

        Path outPath = new Path(args[2]);
        // Resolve the file system from the path itself so a fully qualified output URI is
        // honored even when it differs from the configured default file system.
        FileSystem fs = outPath.getFileSystem(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // Propagate the job result to the caller through the process exit status.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

你可能感兴趣的:(Hadoop 实例15 MultipleInputs实战2:多种自定义文件格式的文件输入处理)