Hadoop 2.2 Programming: A Custom InputFormat for Splitting Hadoop Map/Reduce Input Files

Hadoop splits the raw input files and hands each split to a mapper for processing. FileInputFormat is the base class for all InputFormat implementations that use files as their data source; it keeps track of all the files that make up a job's input and implements the logic for computing splits over those files. How individual records are read from a split is left to its subclasses.

So how does FileInputFormat divide files into splits? It only splits files that are larger than an HDFS block; a file smaller than a block is never split. This is one reason why Hadoop processes a few large files far more efficiently than a large number of small ones.

Hadoop's default InputFormat is TextInputFormat, which overrides the createRecordReader and isSplitable methods of FileInputFormat. The reader it uses is LineRecordReader, which treats a carriage return (CR = 13) or a line feed (LF = 10) as the record separator.

In many cases, however, CR or LF is not an acceptable record separator, because users may well type carriage returns or line feeds into the data itself. A common workaround is to use a non-printable character (one a user cannot type) as the record separator, and that requires writing a new InputFormat.

Alternatively, the record separator may not be a single character but a string, which is more awkward to handle. Another case is when the input file is already sorted by its primary key and all Hadoop needs to do is treat every run of records with the same key as one logical block. That is trickier still: it effectively skips the mapper stage and goes straight to reduce-style grouping, so the InputFormat logic becomes considerably more complex, but it is still doable.

1. Changing the record separator: instead of the default carriage return or line feed, use a custom character, or even a string, as the record separator.
1) Define a custom InputFormat that extends FileInputFormat and overrides createRecordReader. If you do not want the input split at all, or want to change how it is split, also override isSplitable. The code is as follows:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class FileInputFormatB extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        // use '\b' (backspace, 0x08) as the record separator
        return new IsearchRecordReader("\b");
    }

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // do not split the input files
        return false;
    }
}
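
To use this InputFormat, the job driver only has to register it with job.setInputFormatClass. The driver below is a minimal sketch, not part of the original post: it runs a map-only job (zero reducers) with Hadoop's default identity Mapper, so each (offset, record) pair produced by the custom reader is written straight to the output. The class name CustomSplitJob and the path arguments are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CustomSplitJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "custom-separator-job");
        job.setJarByClass(CustomSplitJob.class);

        // use the custom InputFormat instead of the default TextInputFormat
        job.setInputFormatClass(FileInputFormatB.class);

        // map-only identity job: each record read by IsearchRecordReader is
        // written out as "offset <TAB> record"
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}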

 

2) The key is to define a new record reader, IsearchRecordReader, that extends RecordReader and supports a custom record separator (i.e. the delimiter between two records). The separator-related fields and the readLine handling are where it differs from Hadoop's default LineRecordReader.

import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class IsearchRecordReader extends RecordReader<LongWritable, Text> {
    private static final Log LOG = LogFactory.getLog(IsearchRecordReader.class);

    private CompressionCodecFactory compressionCodecs = null;
    private long start;
    private long pos;
    private long end;
    private LineReader in;
    private int maxLineLength;
    private LongWritable key = null;
    private Text value = null;
    // the record separator, i.e. the delimiter between two records
    private byte[] separator = {'\b'};
    private int sepLength = 1;

    public IsearchRecordReader() {
    }

    public IsearchRecordReader(String seps) {
        this.separator = seps.getBytes();
        this.sepLength = separator.length;
    }

    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
        FileSplit split = (FileSplit) genericSplit;
        Configuration job = context.getConfiguration();
        this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);

        this.start = split.getStart();
        this.end = this.start + split.getLength();
        Path file = split.getPath();
        this.compressionCodecs = new CompressionCodecFactory(job);
        CompressionCodec codec = this.compressionCodecs.getCodec(file);

        // open the file and seek to the start of the split
        FileSystem fs = file.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(split.getPath());
        boolean skipFirstLine = false;
        if (codec != null) {
            this.in = new LineReader(codec.createInputStream(fileIn), job);
            this.end = Long.MAX_VALUE;
        } else {
            if (this.start != 0L) {
                skipFirstLine = true;
                this.start -= sepLength;
                fileIn.seek(this.start);
            }
            this.in = new LineReader(fileIn, job);
        }
        if (skipFirstLine) { // skip the first (partial) record and re-establish "start"
            int newSize = in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
            if (newSize > 0) {
                start += newSize;
            }
        }
        this.pos = this.start;
    }

    public boolean nextKeyValue() throws IOException {
        if (this.key == null) {
            this.key = new LongWritable();
        }
        this.key.set(this.pos);
        if (this.value == null) {
            this.value = new Text();
        }
        int newSize = 0;
        while (this.pos < this.end) {
            newSize = this.in.readLine(this.value, this.maxLineLength,
                    Math.max((int) Math.min(Integer.MAX_VALUE, this.end - this.pos), this.maxLineLength));
            if (newSize == 0) {
                break;
            }
            this.pos += newSize;
            if (newSize < this.maxLineLength) {
                break;
            }
            LOG.info("Skipped line of size " + newSize + " at pos " + (this.pos - newSize));
        }

        if (newSize == 0) {
            // nothing was read: move on to the next buffer / split
            this.key = null;
            this.value = null;
            return false;
        }
        // keep reading the next record from the same buffer
        return true;
    }

    public LongWritable getCurrentKey() {
        return this.key;
    }

    public Text getCurrentValue() {
        return this.value;
    }

    public float getProgress() {
        if (this.start == this.end) {
            return 0.0F;
        }
        return Math.min(1.0F, (float) (this.pos - this.start) / (float) (this.end - this.start));
    }

    public synchronized void close() throws IOException {
        if (this.in != null) {
            this.in.close();
        }
    }
}
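
With this reader in place, each call to a mapper's map method receives the byte offset of a record as the key and one complete '\b'-delimited record (with the separator stripped) as the value. The mapper below is a hypothetical sketch, not part of the original post: it simply counts the records delivered by IsearchRecordReader and passes them through unchanged.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class RecordPassThroughMapper extends Mapper<LongWritable, Text, LongWritable, Text> {

    @Override
    protected void map(LongWritable offset, Text record, Context context)
            throws IOException, InterruptedException {
        // count the records handed over by the custom reader, then emit them unchanged
        context.getCounter("custom", "records").increment(1);
        context.write(offset, record);
    }
}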

 

3) Rewrite the LineReader that IsearchRecordReader depends on; it can also be made an inner class of IsearchRecordReader. The crucial point is that the file is read buffer by buffer, with a fixed buffer size, so sooner or later a complete record will be cut in two across buffer boundaries; and if the separator is longer than one character, the separator itself can be cut in two as well. Both cases must be stitched back together.

import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;

public class LineReader {
    // carriage return (Hadoop's default separator)
    //private static final byte CR = 13;
    // line feed (Hadoop's default separator)
    //private static final byte LF = 10;

    // the file is read one buffer at a time
    private static final int DEFAULT_BUFFER_SIZE = 32 * 1024 * 1024;
    private int bufferSize = DEFAULT_BUFFER_SIZE;
    private InputStream in;
    private byte[] buffer;
    private int bufferLength = 0;
    private int bufferPosn = 0;

    // record separator; if LineReader is made an inner class of IsearchRecordReader,
    // these two fields can instead be taken from the enclosing reader
    private byte[] separator = {'\b'};
    private int sepLength = 1;

    LineReader(InputStream in, int bufferSize) {
        this.bufferLength = 0;
        this.bufferPosn = 0;

        this.in = in;
        this.bufferSize = bufferSize;
        this.buffer = new byte[this.bufferSize];
    }

    public LineReader(InputStream in, Configuration conf) throws IOException {
        this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE));
    }

    public void close() throws IOException {
        in.close();
    }

    public int readLine(Text str, int maxLineLength) throws IOException {
        return readLine(str, maxLineLength, Integer.MAX_VALUE);
    }

    public int readLine(Text str) throws IOException {
        return readLine(str, Integer.MAX_VALUE, Integer.MAX_VALUE);
    }

    // ---- rewritten part starts here: the core code ----

    public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
        str.clear();
        Text record = new Text();
        int txtLength = 0;
        long bytesConsumed = 0L;
        boolean newline = false;
        int sepPosn = 0;

        do {
            // the current buffer is exhausted; read the next buffer
            if (this.bufferPosn >= this.bufferLength) {
                bufferPosn = 0;
                bufferLength = in.read(buffer);

                // end of file reached; stop and move on to the next file
                if (bufferLength <= 0) {
                    break;
                }
            }

            int startPosn = this.bufferPosn;
            for (; bufferPosn < bufferLength; bufferPosn++) {
                // handle a separator that was cut in half at the end of the previous buffer
                // (this can misbehave if the separator contains many repeated characters)
                if (sepPosn > 0 && buffer[bufferPosn] != separator[sepPosn]) {
                    sepPosn = 0;
                }

                // hit the first character of the record separator
                if (buffer[bufferPosn] == separator[sepPosn]) {
                    bufferPosn++;
                    int i = 0;

                    // check whether the following characters are also part of the separator
                    for (++sepPosn; sepPosn < sepLength; i++, sepPosn++) {
                        // the buffer ends right in the middle of the separator:
                        // the separator itself has been cut in two
                        if (bufferPosn + i >= bufferLength) {
                            bufferPosn += i - 1;
                            break;
                        }

                        // as soon as one character differs, this is not the separator
                        if (this.buffer[this.bufferPosn + i] != separator[sepPosn]) {
                            sepPosn = 0;
                            break;
                        }
                    }

                    // a full record separator was indeed found
                    if (sepPosn == sepLength) {
                        bufferPosn += i;
                        newline = true;
                        sepPosn = 0;
                        break;
                    }
                }
            }

            int readLength = this.bufferPosn - startPosn;

            bytesConsumed += readLength;
            // the record separator is not stored in the record
            //int appendLength = readLength - newlineLength;
            if (readLength > maxLineLength - txtLength) {
                readLength = maxLineLength - txtLength;
            }
            if (readLength > 0) {
                record.append(this.buffer, startPosn, readLength);
                txtLength += readLength;

                // strip the separator off the end of the record
                if (newline) {
                    str.set(record.getBytes(), 0, record.getLength() - sepLength);
                }
            }

        } while (!newline && (bytesConsumed < maxBytesToConsume));

        if (bytesConsumed > (long) Integer.MAX_VALUE) {
            throw new IOException("Too many bytes before newline: " + bytesConsumed);
        }

        return (int) bytesConsumed;
    }

    // ---- rewritten part ends here ----

    /*
     * For comparison, the original readLine from the hadoop-core LineReader
     * (the CR/LF-based version replaced above). It is kept in a comment block
     * because it has the same signature as the custom readLine and uses the
     * CR/LF constants commented out at the top of the class.
     *
    public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
        str.clear();
        int txtLength = 0;
        int newlineLength = 0;
        boolean prevCharCR = false;
        long bytesConsumed = 0L;
        do {
            int startPosn = this.bufferPosn;
            if (this.bufferPosn >= this.bufferLength) {
                startPosn = this.bufferPosn = 0;
                if (prevCharCR)  bytesConsumed++;
                this.bufferLength = this.in.read(this.buffer);
                if (this.bufferLength <= 0)  break;
            }
            for (; this.bufferPosn < this.bufferLength; this.bufferPosn++) {
                if (this.buffer[this.bufferPosn] == LF) {
                    newlineLength = (prevCharCR) ? 2 : 1;
                    this.bufferPosn++;
                    break;
                }
                if (prevCharCR) {
                    newlineLength = 1;
                    break;
                }
                prevCharCR = this.buffer[this.bufferPosn] == CR;
            }
            int readLength = this.bufferPosn - startPosn;
            if ((prevCharCR) && (newlineLength == 0))
                --readLength;
            bytesConsumed += readLength;
            int appendLength = readLength - newlineLength;
            if (appendLength > maxLineLength - txtLength) {
                appendLength = maxLineLength - txtLength;
            }
            if (appendLength > 0) {
                str.append(this.buffer, startPosn, appendLength);
                txtLength += appendLength;
            }
        } while ((newlineLength == 0) && (bytesConsumed < maxBytesToConsume));

        if (bytesConsumed > (long) Integer.MAX_VALUE) throw new IOException("Too many bytes before newline: " + bytesConsumed);
        return (int) bytesConsumed;
    }
    */
}
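
A quick way to convince yourself that the stitching works is to feed the reader a small byte stream through a deliberately tiny buffer, so that every record gets cut across buffer boundaries. The smoke test below is a hypothetical sketch, not part of the original post; it uses the package-private constructor, so it assumes it lives in the same package as LineReader, and it relies on the default '\b' separator declared above.

import java.io.ByteArrayInputStream;
import java.io.IOException;

import org.apache.hadoop.io.Text;

public class LineReaderSmokeTest {
    public static void main(String[] args) throws IOException {
        // three records, each terminated by '\b', read through a 4-byte buffer
        // so that every record spans several buffers
        byte[] data = "first record\bsecond\bthird record\b".getBytes();
        LineReader reader = new LineReader(new ByteArrayInputStream(data), 4);

        Text record = new Text();
        int consumed;
        while ((consumed = reader.readLine(record, Integer.MAX_VALUE, Integer.MAX_VALUE)) > 0) {
            // expected output: "first record", "second", "third record",
            // with 13, 7 and 13 bytes consumed respectively
            System.out.println(consumed + " bytes -> " + record);
        }
        reader.close();
    }
}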

 

2. The input is already sorted by primary key, and records with the same key are guaranteed to be adjacent. Suppose the first field of each record is the primary key. If you keep the LineReader above, then inside the core readLine method you need an equals check between the key of the current record and the key of the previous one: only split when the keys differ, and keep reading the next record while they are the same. That code is not reproduced here, but the same caveat applies: where two buffers meet, it is very likely that a record is cut in half, with one part at the end of the previous buffer and the rest at the start of the next.
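
As an illustration of the idea only (the original post does not include this code), the sketch below shows the grouping logic in simplified form, working on already-split records rather than raw byte buffers: consecutive records are accumulated as long as their first field stays the same, and a group is closed as soon as the key changes. The field delimiter '\t' is an assumption; a real implementation would fold this check into readLine and still handle records cut across buffer boundaries.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

// Hypothetical illustration: group consecutive records that share the same leading key field.
// Records are assumed to be pre-sorted by key, with the key being everything before the first '\t'.
public class KeyGrouper {

    public static List<List<String>> group(Iterator<String> sortedRecords) {
        List<List<String>> groups = new ArrayList<>();
        List<String> current = new ArrayList<>();
        String currentKey = null;

        while (sortedRecords.hasNext()) {
            String record = sortedRecords.next();
            String key = record.split("\t", 2)[0];

            // key changed: close the current group and start a new one
            if (currentKey != null && !currentKey.equals(key)) {
                groups.add(current);
                current = new ArrayList<>();
            }
            currentKey = key;
            current.add(record);
        }
        if (!current.isEmpty()) {
            groups.add(current);
        }
        return groups;
    }
}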

The advantage of this approach is that it avoids the reduce step, which improves efficiency considerably; the mapper phase is usually quite fast, and it is the reduce phase that takes most of the time.
