在看老版API时,发现旧的KeyValueTextInputFormat基本上是作者自己动手实现算法;Hadoop源码中很多地方都不直接使用现成的API,而是自行定义,这样做对性能的可控性很强。这也折射出国外程序员与国内程序员的差异:国内提倡拿来主义,国外可能更强调创新精神吧。
而我属于前者:拿来主义者
自定义的KeyValueInputFormat:
package cn.edu.xmu.dm.mpdemo.ioformat;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

/**
 * desc: custom KeyValueInputFormat
 * <code>DMKeyValueInputFormat</code>
 *
 * <p>Reads text files line by line and hands each line to the mapper as a
 * (key, value) pair of {@link Text}. The line is split at the first
 * occurrence of the separator configured under
 * <code>key.value.separator.in.input.line</code> (default: tab).
 *
 * @author chenwq ([email protected])
 * @version 1.0 2012/05/19
 */
public class DMKeyValueInputFormat extends FileInputFormat<Text, Text> {

    /**
     * Record reader that reads one line per record and splits it into key
     * and value around the configured separator.
     */
    protected static class KeyVRecordReader extends RecordReader<Text, Text> {
        private static final Log LOG = LogFactory
                .getLog(KeyVRecordReader.class);

        private CompressionCodecFactory compressionCodecs = null;
        /** Byte offset at which this reader starts producing records. */
        private long start;
        /** Current byte offset, advanced by the size of every line read. */
        private long pos;
        /** Byte offset at which this reader stops starting new lines. */
        private long end;
        private LineReader in;
        /** Lines of this many bytes or more are skipped. */
        private int maxLineLength;
        private Text key = null;
        private Text value = null;
        /** Literal (non-regex) separator between key and value. */
        private String separator = "\t";

        /**
         * Opens the file behind the split and positions the reader on the
         * first line that belongs to it.
         *
         * <p>Reads <code>mapred.linerecordreader.maxlength</code> (default
         * {@link Integer#MAX_VALUE}) and
         * <code>key.value.separator.in.input.line</code> (default tab) from
         * the job configuration.
         *
         * @param genericSplit the split to read; cast to {@link FileSplit}
         * @param context      task context supplying the configuration
         * @throws IOException if the file cannot be opened or read
         */
        @Override
        public void initialize(InputSplit genericSplit,
                TaskAttemptContext context) throws IOException,
                InterruptedException {
            FileSplit split = (FileSplit) genericSplit;
            Configuration job = context.getConfiguration();
            this.maxLineLength = job.getInt(
                    "mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
            this.separator = job.get("key.value.separator.in.input.line",
                    "\t");
            start = split.getStart();
            end = start + split.getLength();
            final Path file = split.getPath();
            compressionCodecs = new CompressionCodecFactory(job);
            final CompressionCodec codec = compressionCodecs.getCodec(file);

            FileSystem fs = file.getFileSystem(job);
            FSDataInputStream fileIn = fs.open(file);
            boolean skipFirstLine = false;
            if (codec != null) {
                // Compressed input is read through the codec until the
                // stream ends, so the end offset is lifted.
                in = new LineReader(codec.createInputStream(fileIn), job);
                end = Long.MAX_VALUE;
            } else {
                if (start != 0) {
                    // Step back one byte and discard the line found there,
                    // so reading resumes at the next line boundary.
                    skipFirstLine = true;
                    --start;
                    fileIn.seek(start);
                }
                in = new LineReader(fileIn, job);
            }
            if (skipFirstLine) {
                start += in.readLine(new Text(), 0,
                        (int) Math.min((long) Integer.MAX_VALUE, end - start));
            }
            this.pos = start;
        }

        /** Closes the underlying line reader, if one was opened. */
        @Override
        public synchronized void close() throws IOException {
            if (in != null) {
                in.close();
            }
        }

        /** @return the key of the current record, or null after the last one */
        @Override
        public Text getCurrentKey() throws IOException, InterruptedException {
            return key;
        }

        /** @return the value of the current record, or null after the last one */
        @Override
        public Text getCurrentValue() throws IOException,
                InterruptedException {
            return value;
        }

        /**
         * @return fraction of the split consumed so far, in [0, 1]; 0 when
         *         the split is empty
         */
        @Override
        public float getProgress() throws IOException, InterruptedException {
            if (start == end) {
                return 0.0f;
            } else {
                return Math.min(1.0f, (pos - start) / (float) (end - start));
            }
        }

        /**
         * Reads the next line and splits it into the current key and value.
         * Lines whose size reaches <code>maxLineLength</code> are logged and
         * skipped while more input remains in the split.
         *
         * @return true if a record was read; false at the end of the input,
         *         in which case key and value are reset to null
         * @throws IOException if reading from the underlying stream fails
         */
        @Override
        public boolean nextKeyValue() throws IOException,
                InterruptedException {
            Text line = new Text();
            if (key == null) {
                key = new Text();
            }
            if (value == null) {
                value = new Text();
            }
            int newSize = 0;
            while (pos < end) {
                newSize = in.readLine(line, maxLineLength, Math.max(
                        (int) Math.min(Integer.MAX_VALUE, end - pos),
                        maxLineLength));
                if (newSize == 0) {
                    // Nothing more to read.
                    break;
                }
                pos += newSize;
                if (newSize < maxLineLength) {
                    break;
                }
                LOG.info("Skipped line of size " + newSize + " at pos "
                        + (pos - newSize));
            }
            if (newSize == 0) {
                key = null;
                value = null;
                return false;
            }
            // Extra key/value handling is added here; everything else is the
            // same as TextInputFormat. Parsing happens once, on the last
            // line read, and never on an empty end-of-stream read.
            splitIntoKeyValue(line.toString());
            return true;
        }

        /**
         * Splits a line at the first literal occurrence of the separator.
         * Text before it becomes the key, text after it the value (possibly
         * empty, possibly containing further separators). Without a
         * separator the whole line is the key and the value is empty.
         */
        private void splitIntoKeyValue(String text) {
            // An empty separator would match at index 0; treat it as absent.
            int sepIdx = separator.isEmpty() ? -1 : text.indexOf(separator);
            if (sepIdx < 0) {
                LOG.info("Line has no separator; using the whole line as key");
                key.set(text);
                value.set("");
            } else {
                key.set(text.substring(0, sepIdx));
                value.set(text.substring(sepIdx + separator.length()));
            }
        }
    }

    /** Creates a new, uninitialized {@link KeyVRecordReader}. */
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException,
            InterruptedException {
        return new KeyVRecordReader();
    }

    /**
     * @return true only when no compression codec is registered for the
     *         file, i.e. compressed files are never split
     */
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        CompressionCodec codec = new CompressionCodecFactory(
                context.getConfiguration()).getCodec(file);
        return codec == null;
    }
}
测试自定义的KeyValueInputFormat:
package cn.edu.xmu.dm.mpdemo.ioformat;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * desc: Test custom KeyValueTextInputFormat
 * <code>KeyValueTextInputFormatDemo</code>
 *
 * <p>Map-only job that reads its input through
 * {@link DMKeyValueInputFormat} and writes every (key, value) pair back out
 * unchanged.
 *
 * @author chenwq ([email protected])
 * @version 1.0 2012/05/19
 */
public class KeyValueTextInputFormatDemo extends Configured implements Tool {

    /** Input path used when no argument is given. */
    private static final String DEFAULT_INPUT = "input";
    /** Output path used when fewer than two arguments are given. */
    private static final String DEFAULT_OUTPUT = "output";

    /** Identity mapper that also echoes every key and value it receives. */
    public static class KVMapper extends Mapper<Text, Text, Text, Text> {
        private final static Logger LOG = LoggerFactory
                .getLogger(KVMapper.class);

        /**
         * Prints and logs the pair, then emits it unchanged.
         *
         * @param key     key produced by the input format
         * @param value   value produced by the input format
         * @param context mapper context the pair is written to
         */
        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            System.out.println(key);
            System.out.println(value);
            LOG.info(key.toString());
            LOG.info(value.toString());
            context.write(key, value);
        }
    }

    /**
     * Configures and runs the demo job.
     *
     * @param args optional <code>[input [output]]</code> paths; missing
     *             entries fall back to "input" and "output"
     * @return 0 if the job completed successfully, -1 otherwise
     * @throws Exception if the job cannot be set up or submitted
     */
    @Override
    public int run(String[] args) throws Exception {
        String input = (args != null && args.length > 0)
                ? args[0] : DEFAULT_INPUT;
        String output = (args != null && args.length > 1)
                ? args[1] : DEFAULT_OUTPUT;

        Path inputDir = new Path(input);
        Path outputDir = new Path(output);

        // Use the configuration set on this tool so options applied by the
        // caller (e.g. through ToolRunner) reach the job; fall back to a
        // fresh one when run() is called without a configuration.
        Configuration conf = getConf();
        if (conf == null) {
            conf = new Configuration();
        }

        Job job = new Job(conf, this.getClass().getSimpleName());
        job.setJarByClass(this.getClass());
        job.setMapperClass(KVMapper.class);
        job.setNumReduceTasks(0);
        job.setInputFormatClass(DMKeyValueInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, inputDir);
        FileOutputFormat.setOutputPath(job, outputDir);

        return job.waitForCompletion(true) ? 0 : -1;
    }

    /**
     * Runs the demo through {@link ToolRunner} and exits with the status
     * returned by {@link #run(String[])}.
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new KeyValueTextInputFormatDemo(), args));
    }
}