MultipleInputs lets a single MapReduce job consume more than one input format.
For example, if we have two different file formats, we need two sets of classes: a record class (a Writable), a RecordReader, and an InputFormat for each format.
MultipleInputs assigns a different InputFormat to each input; each InputFormat creates its own RecordReader, and that RecordReader parses the file and returns values of the corresponding record class. That is how the three types relate to each other, and it is also the sequence of steps on the map side of the job.
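Concretely, the driver binds each input path to its own InputFormat and Mapper class; the two calls below are exactly what the MainJob at the end of this post does:
MultipleInputs.addInputPath(job, new Path(args[0]), FirstInputFormat.class, MultiMapper1.class);
MultipleInputs.addInputPath(job, new Path(args[1]), SecondInputFormat.class, MultiMapper2.class);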
1. Data preparation
File a (two tab-separated fields per line):
1	80
2	90
3	100
4	50
5	73
File b (three tab-separated fields per line):
1	lili	3
2	xiaoming	3
3	feifei	3
4	zhangsan	3
5	lisi	3
2. Requirement: implement custom InputFormats and produce key/value output in the following format:
1 FirstClass:80; SecondClass:lili,ClassNum:3;
2 SecondClass:xiaoming,ClassNum:3; FirstClass:90;
3 FirstClass:100; SecondClass:feifei,ClassNum:3;
4 SecondClass:zhangsan,ClassNum:3; FirstClass:50;
5 FirstClass:73; SecondClass:lisi,ClassNum:3;
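Note that the order of the two fragments on each output line is not fixed: the reducer simply concatenates the values in the order they arrive, and values coming from two different mappers are not guaranteed any particular ordering, which is why some lines above start with FirstClass and others with SecondClass.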
3. Implementation:
package cn.edu.bjut.multitwo;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class FirstClass implements Writable {

    private String value;

    public FirstClass() {}

    public FirstClass(String value) {
        this.value = value;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(value);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.value = in.readUTF();
    }

    @Override
    public String toString() {
        return "FirstClass:" + value;
    }

    public String getValue() {
        return value;
    }

    public void setValue(String value) {
        this.value = value;
    }
}
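As a quick local sanity check of the Writable (a minimal sketch, not part of the job; the class name FirstClassRoundTrip is made up here, and it only assumes hadoop-common on the classpath), the object can be round-tripped through Hadoop's DataOutputBuffer/DataInputBuffer:
package cn.edu.bjut.multitwo;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
// Hypothetical helper: round-trips a FirstClass through write()/readFields()
// to verify the serialization logic before running a full job.
public class FirstClassRoundTrip {
    public static void main(String[] args) throws Exception {
        DataOutputBuffer out = new DataOutputBuffer();
        new FirstClass("80").write(out);              // serialize

        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());     // wrap the raw bytes

        FirstClass copy = new FirstClass();
        copy.readFields(in);                          // deserialize
        System.out.println(copy);                     // expected: FirstClass:80
    }
}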
package cn.edu.bjut.multitwo;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
// Turns each line of file a into a (Text id, FirstClass) pair by delegating
// line reading to a LineRecordReader and parsing the tab-separated fields.
public class FirstClassReader extends RecordReader<Text, FirstClass> {

    private LineRecordReader lineRecordReader = null;
    private Text key = null;
    private FirstClass firstClass = null;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        close();
        lineRecordReader = new LineRecordReader();
        lineRecordReader.initialize(split, context);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // LineRecordReader's own key is the byte offset of the line; we ignore
        // it and re-parse the line text to build our own key/value pair.
        while (lineRecordReader.nextKeyValue()) {
            String line = lineRecordReader.getCurrentValue().toString().trim();
            String[] arr = line.split("\t");
            if (2 == arr.length) {
                key = new Text(arr[0]);
                firstClass = new FirstClass(arr[1]);
                return true;
            }
            // Malformed line: skip it rather than re-emitting the previous record.
        }
        key = null;
        firstClass = null;
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public FirstClass getCurrentValue() throws IOException, InterruptedException {
        return firstClass;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return lineRecordReader.getProgress();
    }

    @Override
    public void close() throws IOException {
        if (null != lineRecordReader) {
            lineRecordReader.close();
            lineRecordReader = null;
        }
        key = null;
        firstClass = null;
    }
}
package cn.edu.bjut.multitwo;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
// Binds the FirstClassReader to the input path that holds file a.
public class FirstInputFormat extends FileInputFormat<Text, FirstClass> {

    @Override
    public RecordReader<Text, FirstClass> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new FirstClassReader();
    }
}
package cn.edu.bjut.multitwo;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class SecondClass implements Writable {

    private String value;
    private int classNum;

    public SecondClass() {}

    public SecondClass(String value, int classNum) {
        this.value = value;
        this.classNum = classNum;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(value);
        out.writeInt(classNum);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.value = in.readUTF();
        this.classNum = in.readInt();
    }

    @Override
    public String toString() {
        return "SecondClass:" + value + ",ClassNum:" + classNum;
    }

    public String getValue() {
        return value;
    }

    public void setValue(String value) {
        this.value = value;
    }

    public int getClassNum() {
        return classNum;
    }

    public void setClassNum(int classNum) {
        this.classNum = classNum;
    }
}
package cn.edu.bjut.multitwo;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
// Turns each line of file b into a (Text id, SecondClass) pair by delegating
// line reading to a LineRecordReader and parsing the tab-separated fields.
public class SecondClassReader extends RecordReader<Text, SecondClass> {

    private LineRecordReader lineRecordReader = null;
    private Text key = null;
    private SecondClass secondClass = null;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        close();
        lineRecordReader = new LineRecordReader();
        lineRecordReader.initialize(split, context);
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // As in FirstClassReader, the byte-offset key from LineRecordReader is
        // ignored; the line itself is parsed into our own key/value pair.
        while (lineRecordReader.nextKeyValue()) {
            String line = lineRecordReader.getCurrentValue().toString().trim();
            String[] arr = line.split("\t");
            if (3 == arr.length) {
                key = new Text(arr[0]);
                secondClass = new SecondClass(arr[1], Integer.parseInt(arr[2]));
                return true;
            }
            // Malformed line: skip it rather than re-emitting the previous record.
        }
        key = null;
        secondClass = null;
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public SecondClass getCurrentValue() throws IOException, InterruptedException {
        return secondClass;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return lineRecordReader.getProgress();
    }

    @Override
    public void close() throws IOException {
        if (null != lineRecordReader) {
            lineRecordReader.close();
            lineRecordReader = null;
        }
        key = null;
        secondClass = null;
    }
}
package cn.edu.bjut.multitwo;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
// Binds the SecondClassReader to the input path that holds file b.
public class SecondInputFormat extends FileInputFormat<Text, SecondClass> {

    @Override
    public RecordReader<Text, SecondClass> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        return new SecondClassReader();
    }
}
package cn.edu.bjut.multitwo;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MultiMapper1 extends Mapper<Text, FirstClass, Text, Text> {

    @Override
    protected void map(Text key, FirstClass value, Context context)
            throws IOException, InterruptedException {
        context.write(key, new Text(value.toString()));
    }
}
package cn.edu.bjut.multitwo;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MultiMapper2 extends Mapper<Text, SecondClass, Text, Text> {

    @Override
    protected void map(Text key, SecondClass value, Context context)
            throws IOException, InterruptedException {
        context.write(key, new Text(value.toString()));
    }
}
package cn.edu.bjut.multitwo;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class MultiReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Concatenate the fragments contributed by the two mappers for this key.
        StringBuilder builder = new StringBuilder();
        for (Text text : values) {
            builder.append(text.toString()).append("; ");
        }
        context.write(key, new Text(builder.toString()));
    }
}
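Both mappers turn their record objects into Text via toString(), so the reducer only has to deal with a single value type. For key 1, for example, the reduce call receives the two values FirstClass:80 and SecondClass:lili,ClassNum:3 and writes them out joined by "; ", which yields the first line of the expected output above.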
package cn.edu.bjut.multitwo;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class MainJob {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "multi");
        job.setJarByClass(MainJob.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(MultiReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Each input path gets its own InputFormat and Mapper.
        MultipleInputs.addInputPath(job, new Path(args[0]), FirstInputFormat.class, MultiMapper1.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), SecondInputFormat.class, MultiMapper2.class);

        // Remove a stale output directory so the job can be rerun.
        Path outPath = new Path(args[2]);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
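Assuming the classes are packaged into a jar named multi.jar (the jar name and paths here are placeholders), the job can be submitted with something like:
hadoop jar multi.jar cn.edu.bjut.multitwo.MainJob /input/a /input/b /output
where the first two arguments are the paths of file a and file b and the third is the output directory.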