KeyValueTextInputFormat 重写

hadoop的版本发展中出现了mapred(老版)和mapreduce(新版)两种体系,hadoop鼓励使用新版的特性,而mapreduce不如 mapred 对格式的支持多。

其实只要借鉴mapred系列的写法,就可以自己重写实现。以下为代码:

package kvMap;

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

public class KVTextInputFormat extends FileInputFormat<IntWritable, MetaInfo> {

 @Override
 public RecordReader<IntWritable, MetaInfo> createRecordReader(
   InputSplit split, TaskAttemptContext context) {
  return new LineRecordReader();
 }

 @Override
 protected boolean isSplitable(JobContext context, Path file) {
  CompressionCodec codec = new CompressionCodecFactory(
    context.getConfiguration()).getCodec(file);
  return codec == null;
 }

 public static class LineRecordReader extends
   RecordReader<IntWritable, MetaInfo> {
  private static final Log LOG = LogFactory
    .getLog(LineRecordReader.class);

  private CompressionCodecFactory compressionCodecs = null;
  private long start;
  private long pos;
  private long end;
  private LineReader in;
  private int maxLineLength;
  private IntWritable key = null;
  private MetaInfo value = null;
  private static int i = 0;

  public void initialize(InputSplit genericSplit,
    TaskAttemptContext context) throws IOException {
   FileSplit split = (FileSplit) genericSplit;
   Configuration job = context.getConfiguration();
   this.maxLineLength = job.getInt(
     "mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
   start = split.getStart();
   end = start + split.getLength();
   final Path file = split.getPath();
   compressionCodecs = new CompressionCodecFactory(job);
   final CompressionCodec codec = compressionCodecs.getCodec(file);

   // open the file and seek to the start of the split
   FileSystem fs = file.getFileSystem(job);
   FSDataInputStream fileIn = fs.open(split.getPath());
   boolean skipFirstLine = false;
   if (codec != null) {
    in = new LineReader(codec.createInputStream(fileIn), job);
    end = Long.MAX_VALUE;
   } else {
    if (start != 0) {
     skipFirstLine = true;
     --start;
     fileIn.seek(start);
    }
    in = new LineReader(fileIn, job);
   }
   if (skipFirstLine) { // skip first line and re-establish "start".
    start += in.readLine(new Text(), 0,
      (int) Math.min((long) Integer.MAX_VALUE, end - start));
   }
   this.pos = start;
  }

  public boolean nextKeyValue() throws IOException {
   Text line = new Text();
   if (key == null) {
    key = new IntWritable();
   }
   if (value == null) {
    value = new MetaInfo();
   }
   int newSize = 0;
   while (pos < end) {
    i++;
    newSize = in.readLine(line, maxLineLength, Math.max(
      (int) Math.min(Integer.MAX_VALUE, end - pos),
      maxLineLength));
    
    // used to parse the text into "key-value" model
    if (null != line) {

String kv[]=line.split("\t");
     key.set(kv[0]);
     value.set(kv[1]);
    } else {
     LOG.info("Skipped line has no separator");
    }
    
    if (newSize == 0) {
     break;
    }
    pos += newSize;
    if (newSize < maxLineLength) {
     break;
    }

    // line too long. try again
    LOG.info("Skipped line of size " + newSize + " at pos "
      + (pos - newSize));
   }
   if (newSize == 0) {
    key = null;
    value = null;
    return false;
   } else {
    return true;
   }
  }

  @Override
  public IntWritable getCurrentKey() {
   return key;
  }

  @Override
  public MetaInfo getCurrentValue() {
   return value;
  }

  /**
   * Get the progress within the split
   */
  public float getProgress() {
   if (start == end) {
    return 0.0f;
   } else {
    return Math.min(1.0f, (pos - start) / (float) (end - start));
   }
  }

  public synchronized void close() throws IOException {
   if (in != null) {
    in.close();
   }
  }
 }

}

 

你可能感兴趣的:(KeyValueTextInputFormat 重写)