Hadoop InputFormat源码分析


Hadoop InputFormat源码分析_第1张图片




/**
 * Describes one slice of a job's input.
 *
 * <p>Concrete subclasses report how large the slice is and where its data
 * lives.
 */
public abstract class InputSplit {

  /**
   * Returns the size of this split.
   *
   * @return the split length
   * @throws IOException if the length cannot be determined
   * @throws InterruptedException if the calling thread is interrupted
   */
  public abstract long getLength() throws IOException, InterruptedException;

  /**
   * Returns the locations associated with this split's data.
   *
   * @return the location names; implementations in this file return an empty
   *         array rather than {@code null} when nothing is known
   * @throws IOException if the locations cannot be determined
   * @throws InterruptedException if the calling thread is interrupted
   */
  public abstract String[] getLocations()
      throws IOException, InterruptedException;
}


public class FileSplit extends InputSplit implements Writable {
  private Path file;
  private long start;
  private long length;
  private String[] hosts;

  FileSplit() {}

  /** Constructs a split with host information
   * @param file the file name
   * @param start the position of the first byte in the file to process
   * @param length the number of bytes in the file to process
   * @param hosts the list of hosts containing the block, possibly null
  public FileSplit(Path file, long start, long length, String[] hosts) {
    this.file = file;
    this.start = start;
    this.length = length;
    this.hosts = hosts;
  /** The file containing this split's data. */
  public Path getPath() { return file; }
  /** The position of the first byte in the file to process. */
  public long getStart() { return start; }
  /** The number of bytes in the file to process. */
  public long getLength() { return length; }

  public String toString() { return file + ":" + start + "+" + length; }

  // 序列化和反序列化

  public void write(DataOutput out) throws IOException {
    Text.writeString(out, file.toString());

  public void readFields(DataInput in) throws IOException {
    file = new Path(Text.readString(in));
    start = in.readLong();
    length = in.readLong();
    hosts = null;

  public String[] getLocations() throws IOException {
    if (this.hosts == null) {
      return new String[]{};
    } else {
      return this.hosts;


// A split that groups several files (or file sections) into one InputSplit.
// paths, startoffset and lengths are indexed together: getPath(i), getOffset(i)
// and getLength(i) describe the i-th entry.
// NOTE(review): this excerpt appears truncated -- closing braces, comment
// delimiters and some statement lines are absent; confirm against the full source.
public class CombineFileSplit extends InputSplit implements Writable {

  private Path[] paths;
  private long[] startoffset;
  private long[] lengths;
  private String[] locations;
  // Sum of all entries of lengths; computed in initSplit and returned by getLength().
  private long totLength;

   * default constructor
  public CombineFileSplit() {}
  // Builds a split from caller-supplied paths, start offsets, lengths and locations.
  public CombineFileSplit(Path[] files, long[] start, 
                          long[] lengths, String[] locations) {
    initSplit(files, start, lengths, locations);

  // Builds a split of whole files: every start offset is 0 and every
  // location is the empty string.
  public CombineFileSplit(Path[] files, long[] lengths) {
    long[] startoffset = new long[files.length];
    for (int i = 0; i < startoffset.length; i++) {
      startoffset[i] = 0;
    String[] locations = new String[files.length];
    for (int i = 0; i < locations.length; i++) {
      locations[i] = "";
    initSplit(files, startoffset, lengths, locations);
  // Stores the given arrays by reference (no copies are made) and sums
  // lengths into totLength.
  private void initSplit(Path[] files, long[] start, 
                         long[] lengths, String[] locations) {
    this.startoffset = start;
    this.lengths = lengths;
    this.paths = files;
    this.totLength = 0;
    this.locations = locations;
    for(long length : lengths) {
      totLength += length;

   * Copy constructor
  // Passes old's arrays straight to the four-argument constructor, so the
  // new split shares them with old.
  public CombineFileSplit(CombineFileSplit old) throws IOException {
    this(old.getPaths(), old.getStartOffsets(),
         old.getLengths(), old.getLocations());

  // Total length of the split: the sum computed in initSplit, or the value
  // read by readFields.
  public long getLength() {
    return totLength;

  /** Returns an array containing the start offsets of the files in the split*/ 
  public long[] getStartOffsets() {
    return startoffset;
  /** Returns an array containing the lengths of the files in the split*/ 
  public long[] getLengths() {
    return lengths;

  /** Returns the start offset of the i<sup>th</sup> Path */
  public long getOffset(int i) {
    return startoffset[i];
  /** Returns the length of the i<sup>th</sup> Path */
  public long getLength(int i) {
    return lengths[i];
  /** Returns the number of Paths in the split */
  public int getNumPaths() {
    return paths.length;

  /** Returns the i<sup>th</sup> Path */
  public Path getPath(int i) {
    return paths[i];
  /** Returns all the Paths in the split */
  public Path[] getPaths() {
    return paths;

  /** Returns all the Paths where this input-split resides */
  public String[] getLocations() throws IOException {
    return locations;

  // Reads, in order: totLength; a count followed by that many lengths; a count
  // followed by that many path strings; a count followed by that many start
  // offsets. locations is not assigned here.
  public void readFields(DataInput in) throws IOException {
    totLength = in.readLong();
    int arrLength = in.readInt();
    lengths = new long[arrLength];
    for(int i=0; i<arrLength;i++) {
      lengths[i] = in.readLong();
    int filesLength = in.readInt();
    paths = new Path[filesLength];
    for(int i=0; i<filesLength;i++) {
      paths[i] = new Path(Text.readString(in));
    arrLength = in.readInt();
    startoffset = new long[arrLength];
    for(int i=0; i<arrLength;i++) {
      startoffset[i] = in.readLong();

  // NOTE(review): only the loop headers and the path write are visible here.
  // To match readFields this presumably writes totLength and a count before
  // each array, and each long inside the loops -- confirm against the full source.
  public void write(DataOutput out) throws IOException {
    for(long length : lengths) {
    for(Path p : paths) {
      Text.writeString(out, p.toString());
    for(long length : startoffset) {
  // Builds a text form with one "<path>:<offset>+<length>" entry per file and,
  // when locations is non-null, a " Locations:" section whose entries are each
  // followed by ":".
  // NOTE(review): the bodies of the (i == 0) and (i < paths.length -1) branches
  // are missing from this excerpt.
 public String toString() {
    StringBuffer sb = new StringBuffer();
    for (int i = 0; i < paths.length; i++) {
      if (i == 0 ) {
      sb.append(paths[i].toUri().getPath() + ":" + startoffset[i] +
                "+" + lengths[i]);
      if (i < paths.length -1) {
    if (locations != null) {
      String locs = "";
      StringBuffer locsb = new StringBuffer();
      for (int i = 0; i < locations.length; i++) {
        locsb.append(locations[i] + ":");
      locs = locsb.toString();
      sb.append(" Locations:" + locs + "; ");
    return sb.toString();
与FileSplit类似,CombineFileSplit同样包含文件路径、分片起始位置、分片大小和存储分片数据的host列表。不同之处在于CombineFileSplit是针对小文件设计的:它把很多小文件打包在一个InputSplit中,这样一个Mapper就可以处理很多小文件。前面的FileSplit只对应一个输入文件,也就是说,如果用FileSplit对应的FileInputFormat作为输入格式,那么即使文件特别小,也会被单独计算成一个分片来处理。当输入由大量小文件组成时,就会产生同样大量的InputSplit,从而需要同样大量的Mapper来处理,这将很慢——想想有一大堆Map Task要运行(运行一个新的Map Task可是要启动虚拟机的),这不符合Hadoop的设计理念。所以使用CombineFileSplit可以优化Hadoop处理众多小文件的场景。


// Wraps another InputSplit together with the InputFormat class and Mapper class
// given for it. getLength() and getLocations() delegate to the wrapped split.
// NOTE(review): this excerpt appears truncated -- closing braces, comment
// delimiters and some statement lines are absent; confirm against the full source.
class TaggedInputSplit extends InputSplit implements Configurable, Writable {

  // Runtime class of the wrapped split; written by write() and used by
  // readFields() to instantiate and deserialize the split.
  private Class<? extends InputSplit> inputSplitClass;

  private InputSplit inputSplit;

  private Class<? extends InputFormat> inputFormatClass;

  private Class<? extends Mapper> mapperClass;

  // Used to resolve class names and to build the SerializationFactory.
  private Configuration conf;

  public TaggedInputSplit() {
    // Default constructor.

   * Creates a new TaggedInputSplit.
   * @param inputSplit The InputSplit to be tagged
   * @param conf The configuration to use
   * @param inputFormatClass The InputFormat class to use for this job
   * @param mapperClass The Mapper class to use for this job
  public TaggedInputSplit(InputSplit inputSplit, Configuration conf,
      Class<? extends InputFormat> inputFormatClass,
      Class<? extends Mapper> mapperClass) {
    this.inputSplitClass = inputSplit.getClass();
    this.inputSplit = inputSplit;
    this.conf = conf;
    this.inputFormatClass = inputFormatClass;
    this.mapperClass = mapperClass;

   * Retrieves the original InputSplit.
   * @return The InputSplit that was tagged
  public InputSplit getInputSplit() {
    return inputSplit;

   * Retrieves the InputFormat class to use for this split.
   * @return The InputFormat class to use
  public Class<? extends InputFormat> getInputFormatClass() {
    return inputFormatClass;

   * Retrieves the Mapper class to use for this split.
   * @return The Mapper class to use
  public Class<? extends Mapper> getMapperClass() {
    return mapperClass;

  // Delegates to the wrapped split.
  public long getLength() throws IOException, InterruptedException {
    return inputSplit.getLength();

  // Delegates to the wrapped split.
  public String[] getLocations() throws IOException, InterruptedException {
    return inputSplit.getLocations();

  // Reads three class names (split, input format, mapper) in the order write()
  // emits them, instantiates the split class reflectively, then fills it through
  // a Deserializer obtained from a SerializationFactory.
  // NOTE(review): no call that attaches the deserializer to `in` is visible in
  // this excerpt -- confirm against the full source.
  public void readFields(DataInput in) throws IOException {
    inputSplitClass = (Class<? extends InputSplit>) readClass(in);
    inputFormatClass = (Class<? extends InputFormat<?, ?>>) readClass(in);
    mapperClass = (Class<? extends Mapper<?, ?, ?, ?>>) readClass(in);
    inputSplit = (InputSplit) ReflectionUtils
       .newInstance(inputSplitClass, conf);
    SerializationFactory factory = new SerializationFactory(conf);
    Deserializer deserializer = factory.getDeserializer(inputSplitClass);
    inputSplit = (InputSplit)deserializer.deserialize(inputSplit);

  // Reads a class name string and resolves it through conf; a missing class is
  // rethrown as a RuntimeException that keeps the original cause.
  private Class<?> readClass(DataInput in) throws IOException {
    String className = Text.readString(in);
    try {
      return conf.getClassByName(className);
    } catch (ClassNotFoundException e) {
      throw new RuntimeException("readObject can't find class", e);

  // Writes the three class names, then starts to set up a Serializer.
  // NOTE(review): the statement beginning "Serializer serializer =" is cut off
  // and the code that serializes inputSplit is not visible in this excerpt.
  public void write(DataOutput out) throws IOException {
    Text.writeString(out, inputSplitClass.getName());
    Text.writeString(out, inputFormatClass.getName());
    Text.writeString(out, mapperClass.getName());
    SerializationFactory factory = new SerializationFactory(conf);
    Serializer serializer = 

  public Configuration getConf() {
    return conf;

  public void setConf(Configuration conf) {
    this.conf = conf;







public abstract class InputFormat<K, V> {

  public abstract 
    List<InputSplit> getSplits(JobContext context
                               ) throws IOException, InterruptedException;
  public abstract 
    RecordReader<K,V> createRecordReader(InputSplit split,
                                         TaskAttemptContext context
                                        ) throws IOException, 




public interface PathFilter {
   * Tests whether or not the specified abstract pathname should be
   * included in a pathname list.
   * @param  path  The abstract pathname to be tested
   * @return  <code>true</code> if and only if <code>pathname</code>
   *          should be included
  boolean accept(Path path);

 private static class MultiPathFilter implements PathFilter {
    private List<PathFilter> filters;

    public MultiPathFilter() {
      this.filters = new ArrayList<PathFilter>();

    public MultiPathFilter(List<PathFilter> filters) {
      this.filters = filters;

    public void add(PathFilter one) {

    public boolean accept(Path path) {
      for (PathFilter filter : filters) {
        if (filter.accept(path)) {
          return true;
      return false;

    public String toString() {
      StringBuffer buf = new StringBuffer();
      for (PathFilter f: filters) {
      return buf.toString();


 public List<InputSplit> getSplits(JobContext job
                                    ) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus>files = listStatus(job);
    for (FileStatus file: files) {
      Path path = file.getPath();
      FileSystem fs = path.getFileSystem(job.getConfiguration());
      long length = file.getLen();
      BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
      if ((length != 0) && isSplitable(job, path)) { 
        long blockSize = file.getBlockSize();
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);

        long bytesRemaining = length;
        while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
          int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
          splits.add(new FileSplit(path, length-bytesRemaining, splitSize, 
          bytesRemaining -= splitSize;
        if (bytesRemaining != 0) {
          splits.add(new FileSplit(path, length-bytesRemaining, bytesRemaining, 
      } else if (length != 0) {
        splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
      } else { 
        //Create empty hosts array for zero length files
        splits.add(new FileSplit(path, 0, length, new String[0]));
    // Save the number of input files in the job-conf
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());

    LOG.debug("Total # of splits: " + splits.size());
    return splits;






// Record reader that turns a FileSplit into (LongWritable key, Text value)
// records, reading lines through a LineReader.
// NOTE(review): this excerpt appears truncated -- closing braces, comment
// delimiters and several statement lines are absent; confirm against the full source.
public class LineRecordReader extends RecordReader<LongWritable, Text> {
  private static final Log LOG = LogFactory.getLog(LineRecordReader.class);

  private CompressionCodecFactory compressionCodecs = null;
  // start/end delimit the split in the file; pos is advanced by the size
  // returned from each readLine call.
  private long start;
  private long pos;
  private long end;
  private LineReader in;
  private int maxLineLength;
  private LongWritable key = null;
  private Text value = null;
  // Stream queried for the current position when the input is compressed.
  private Seekable filePosition;
  // Non-null only when a codec matches the file (see isCompressedInput).
  private CompressionCodec codec;
  private Decompressor decompressor;

  // Opens the split's file, picks a LineReader according to whether a codec
  // applies (and whether that codec is splittable), and, unless the split
  // starts at offset 0, discards the first line.
  // NOTE(review): the getInt default, the createInputStream call for the
  // splittable case and any seek to `start` are not visible in this excerpt.
  public void initialize(InputSplit genericSplit,
                         TaskAttemptContext context) throws IOException {
    FileSplit split = (FileSplit) genericSplit;
    Configuration job = context.getConfiguration();
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());

    if (isCompressedInput()) {
      decompressor = CodecPool.getDecompressor(codec);
      if (codec instanceof SplittableCompressionCodec) {
        final SplitCompressionInputStream cIn =
            fileIn, decompressor, start, end,
        in = new LineReader(cIn, job);
        // The compressed stream reports adjusted split boundaries; use those.
        start = cIn.getAdjustedStart();
        end = cIn.getAdjustedEnd();
        filePosition = cIn;
      } else {
        in = new LineReader(codec.createInputStream(fileIn, decompressor),
        filePosition = fileIn;
    } else {
      in = new LineReader(fileIn, job);
      filePosition = fileIn;
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
      start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    this.pos = start;
  // True when a codec was found for the file.
  private boolean isCompressedInput() {
    return (codec != null);

  // Upper bound on bytes a single readLine may consume: unlimited for
  // compressed input, otherwise the bytes left before `end` (capped at
  // Integer.MAX_VALUE).
  private int maxBytesToConsume(long pos) {
    return isCompressedInput()
      ? Integer.MAX_VALUE
      : (int) Math.min(Integer.MAX_VALUE, end - pos);

  // Position compared against `end`: the stream's own position for compressed
  // input (when filePosition is set), otherwise pos.
  private long getFilePosition() throws IOException {
    long retVal;
    if (isCompressedInput() && null != filePosition) {
      retVal = filePosition.getPos();
    } else {
      retVal = pos;
    return retVal;

  // Reads the next line into `value`, lazily creating key and value. Returns
  // false, and nulls key and value, when the last read returned 0 bytes.
  // NOTE(review): no statement assigning the key and no loop exits (the bodies
  // of the two inner ifs) are visible in this excerpt.
  public boolean nextKeyValue() throws IOException {
    if (key == null) {
      key = new LongWritable();
    if (value == null) {
      value = new Text();
    int newSize = 0;
    // We always read one extra line, which lies outside the upper
    // split limit i.e. (end - 1)
    while (getFilePosition() <= end) {
      newSize = in.readLine(value, maxLineLength,
          Math.max(maxBytesToConsume(pos), maxLineLength));
      if (newSize == 0) {
      pos += newSize;
      if (newSize < maxLineLength) {

      // line too long. try again
      LOG.info("Skipped line of size " + newSize + " at pos " + 
               (pos - newSize));
    if (newSize == 0) {
      key = null;
      value = null;
      return false;
    } else {
      return true;

  public LongWritable getCurrentKey() {
    return key;

  public Text getCurrentValue() {
    return value;

   * Get the progress within the split
  // Returns 0 for an empty split; otherwise the fraction of [start, end)
  // consumed so far, capped at 1.
  public float getProgress() throws IOException {
    if (start == end) {
      return 0.0f;
    } else {
      return Math.min(1.0f,
        (getFilePosition() - start) / (float)(end - start));

  // NOTE(review): the bodies of both null checks are missing from this
  // excerpt; presumably the reader is closed and the decompressor released --
  // confirm against the full source.
  public synchronized void close() throws IOException {
    try {
      if (in != null) {
    } finally {
      if (decompressor != null) {



public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {

  public class Context 
    public Context(Configuration conf, TaskAttemptID taskid,
                   RecordReader<KEYIN,VALUEIN> reader,
                   RecordWriter<KEYOUT,VALUEOUT> writer,
                   OutputCommitter committer,
                   StatusReporter reporter,
                   InputSplit split) throws IOException, InterruptedException {
      super(conf, taskid, reader, writer, committer, reporter, split);
   * Called once at the beginning of the task.
  protected void setup(Context context
                       ) throws IOException, InterruptedException {
    // NOTHING

   * Called once for each key/value pair in the input split. Most applications
   * should override this, but the default is the identity function.
  protected void map(KEYIN key, VALUEIN value, 
                     Context context) throws IOException, InterruptedException {
    context.write((KEYOUT) key, (VALUEOUT) value);

   * Called once at the end of the task.
  protected void cleanup(Context context
                         ) throws IOException, InterruptedException {
    // NOTHING
   * Expert users can override this method for more complete control over the
   * execution of the Mapper.
   * @param context
   * @throws IOException
  public void run(Context context) throws IOException, InterruptedException {
    while (context.nextKeyValue()) {
      map(context.getCurrentKey(), context.getCurrentValue(), context);



public class MapContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> extends TaskInputOutputContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
	private RecordReader<KEYIN, VALUEIN> reader;
	private InputSplit split;

	public MapContext(Configuration conf, TaskAttemptID taskid, RecordReader<KEYIN, VALUEIN> reader, RecordWriter<KEYOUT, VALUEOUT> writer,
			OutputCommitter committer, StatusReporter reporter, InputSplit split) {
		super(conf, taskid, writer, committer, reporter);
		this.reader = reader;
		this.split = split;

	 * Get the input split for this map.
	public InputSplit getInputSplit() {
		return split;

	public KEYIN getCurrentKey() throws IOException, InterruptedException {
		return reader.getCurrentKey();

	public VALUEIN getCurrentValue() throws IOException, InterruptedException {
		return reader.getCurrentValue();

	public boolean nextKeyValue() throws IOException, InterruptedException {
		return reader.nextKeyValue();






public class TextInputFormat extends FileInputFormat<LongWritable, Text> {

  public RecordReader<LongWritable, Text> 
    createRecordReader(InputSplit split,
                       TaskAttemptContext context) {
    return new LineRecordReader();

  protected boolean isSplitable(JobContext context, Path file) {
    CompressionCodec codec = 
      new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    if (null == codec) {
      return true;
    return codec instanceof SplittableCompressionCodec;

我们还看到isSplitable()方法:当文件没有压缩,或者所用的压缩编码实现了SplittableCompressionCodec时,文件可以被分割;当文件使用不支持分割的压缩格式时,这个文件就不可分割,否则就会读到不正确的数据。这在某种程度上会影响分片的计算。有时我们希望一个文件只被一个Mapper处理,这时就可以重写isSplitable()方法,告诉MapReduce框架哪些文件可以分割,哪些文件不能分割而只能作为一个分片。



// Input format that builds splits by counting lines rather than bytes, using
// the per-split line count configured under LINES_PER_MAP (default 1).
// NOTE(review): this excerpt appears truncated -- closing braces, comment
// delimiters and several statement lines are absent; confirm against the full source.
public class NLineInputFormat extends FileInputFormat<LongWritable, Text> { 
  // Configuration key for the number of lines per split.
  // NOTE(review): the constant's value is missing from this excerpt.
  public static final String LINES_PER_MAP = 

  // Returns a new LineRecordReader.
  public RecordReader<LongWritable, Text> createRecordReader(
      InputSplit genericSplit, TaskAttemptContext context) 
      throws IOException {
    return new LineRecordReader();

   * Logically splits the set of input files for the job, splits N lines
   * of the input as one split.
   * @see FileInputFormat#getSplits(JobContext)
  // NOTE(review): the start of the statement inside the loop is missing; it
  // presumably adds the result of getSplitsForFile for each status -- confirm.
  public List<InputSplit> getSplits(JobContext job)
  throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    int numLinesPerSplit = getNumLinesPerSplit(job);
    for (FileStatus status : listStatus(job)) {
        job.getConfiguration(), numLinesPerSplit));
    return splits;
  // Reads the file line by line, accumulating the byte count returned by
  // readLine, and emits a split each time numLines reaches numLinesPerSplit;
  // lines left over at the end form a final split. Directories are rejected
  // with an IOException.
  // NOTE(review): no statement incrementing numLines and no close of the
  // reader are visible in this excerpt.
  public static List<FileSplit> getSplitsForFile(FileStatus status,
      Configuration conf, int numLinesPerSplit) throws IOException {
    List<FileSplit> splits = new ArrayList<FileSplit> ();
    Path fileName = status.getPath();
    if (status.isDir()) {
      throw new IOException("Not a file: " + fileName);
    FileSystem  fs = fileName.getFileSystem(conf);
    LineReader lr = null;
    try {
      FSDataInputStream in  = fs.open(fileName);
      lr = new LineReader(in, conf);
      Text line = new Text();
      int numLines = 0;
      // begin: byte offset where the current group of lines starts;
      // length: bytes accumulated for the current group.
      long begin = 0;
      long length = 0;
      int num = -1;
      while ((num = lr.readLine(line)) > 0) {
        length += num;
        if (numLines == numLinesPerSplit) {
          splits.add(createFileSplit(fileName, begin, length));
          begin += length;
          length = 0;
          numLines = 0;
      if (numLines != 0) {
        splits.add(createFileSplit(fileName, begin, length));
    } finally {
      if (lr != null) {
    return splits; 

   * NLineInputFormat uses LineRecordReader, which always reads
   * (and consumes) at least one character out of its upper split
   * boundary. So to make sure that each mapper gets N lines, we
   * move back the upper split limits of each split 
   * by one character here.
   * @param fileName  Path of file
   * @param begin  the position of the first byte in the file to process
   * @param length  number of bytes in InputSplit
   * @return  FileSplit
  // For the first split (begin == 0) the length is shortened by one; for later
  // splits the start is moved back by one and the length kept, so in both
  // cases the split's end lands one byte earlier. Hosts are always empty.
  protected static FileSplit createFileSplit(Path fileName, long begin, long length) {
    return (begin == 0) 
    ? new FileSplit(fileName, begin, length - 1, new String[] {})
    : new FileSplit(fileName, begin - 1, length, new String[] {});
   * Set the number of lines per split
   * @param job the job to modify
   * @param numLines the number of lines per split
  public static void setNumLinesPerSplit(Job job, int numLines) {
    job.getConfiguration().setInt(LINES_PER_MAP, numLines);

   * Get the number of lines per split
   * @param job the job
   * @return the number of lines per split
  // Falls back to 1 line per split when LINES_PER_MAP is not set.
  public static int getNumLinesPerSplit(JobContext job) {
    return job.getConfiguration().getInt(LINES_PER_MAP, 1);
