So how does FileInputFormat divide input files into splits? FileInputFormat only splits files that are larger than an HDFS block; a file smaller than a block becomes a single split and is not divided further. This is also one reason Hadoop processes a few large files more efficiently than many small ones.
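For reference, the split size FileInputFormat aims for is essentially the HDFS block size clamped by the configured minimum and maximum split sizes. A minimal sketch of that heuristic (mirroring computeSplitSize in the mapreduce-API FileInputFormat; shown for illustration only):

    // Sketch of the split-size heuristic used by FileInputFormat:
    // the block size, clamped between the configured min and max split sizes.
    // A file is cut into chunks of roughly this size, so a file smaller than
    // one block yields exactly one split.
    static long computeSplitSize(long blockSize, long minSize, long maxSize) {
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }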
Hadoop's default InputFormat is TextInputFormat, which overrides the createRecordReader and isSplitable methods of FileInputFormat. The reader it uses is LineRecordReader, which treats a carriage return (CR = 13) or a line feed (LF = 10) as the record delimiter.
In many cases, however, CR or LF is not an acceptable record delimiter, because users may well type carriage returns or line feeds into the data itself. A common approach is therefore to use a non-printable character (one a user cannot type) as the record delimiter, and that requires writing a new InputFormat.
Another case is when the record delimiter is not a single character but a string, which is somewhat more troublesome. A third case is when the input is already sorted by its primary key and all Hadoop needs to do is treat the records sharing one key as a single logical block. This is the trickiest case: it is almost as if the mapper stage has nothing left to do and the data can go straight into reduce-style processing, so the InputFormat logic becomes fairly involved, but it is still achievable.
1. Changing the record delimiter: instead of the default CR/LF, use a custom delimiter, which can even be a multi-character string.
1) Define a custom InputFormat that extends FileInputFormat and overrides createRecordReader. If you do not want the input split, or you want to change how it is split, also override isSplitable. The code is as follows:
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class FileInputFormatB extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        // use '\b' (backspace, a character users cannot normally type) as the record separator
        return new SearchRecordReader("\b");
    }

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // do not split the input file; each file is read as a single split
        return false;
    }
}
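To use this InputFormat, it has to be wired into the job in the driver. A minimal driver sketch (the class name CustomDelimiterDriver, the job name and MyMapper are illustrative placeholders, not part of the original code):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

    public class CustomDelimiterDriver {
        public static void main(String[] args) throws Exception {
            Job job = Job.getInstance(new Configuration(), "custom-delimiter-demo");
            job.setJarByClass(CustomDelimiterDriver.class);
            job.setInputFormatClass(FileInputFormatB.class);   // plug in the custom InputFormat
            job.setMapperClass(MyMapper.class);                 // MyMapper stands in for your own mapper
            FileInputFormat.addInputPath(job, new Path(args[0]));
            FileOutputFormat.setOutputPath(job, new Path(args[1]));
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }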
2) The key is to define a new SearchRecordReader that extends RecordReader and supports a custom record separator. The parts that differ from Hadoop's default LineRecordReader are the separator-related members (separator, sepLength) and the constructor that accepts the separator as a string.
import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class SearchRecordReader extends RecordReader<LongWritable, Text> {
    private static final Log LOG = LogFactory.getLog(SearchRecordReader.class);

    private CompressionCodecFactory compressionCodecs = null;
    private long start;
    private long pos;
    private long end;
    // LineReader here is the custom reader defined in step 3), not org.apache.hadoop.util.LineReader
    private LineReader in;
    private int maxLineLength;
    private LongWritable key = null;
    private Text value = null;
    // the record separator, i.e. what marks the end of one record
    private byte[] separator = {'\b'};
    private int sepLength = 1;

    public SearchRecordReader() {
    }

    public SearchRecordReader(String seps) {
        this.separator = seps.getBytes();
        sepLength = separator.length;
    }

    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
        FileSplit split = (FileSplit) genericSplit;
        Configuration job = context.getConfiguration();
        this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);

        this.start = split.getStart();
        this.end = (this.start + split.getLength());
        Path file = split.getPath();
        this.compressionCodecs = new CompressionCodecFactory(job);
        CompressionCodec codec = this.compressionCodecs.getCodec(file);

        // open the file and seek to the start of the split
        FileSystem fs = file.getFileSystem(job);
        FSDataInputStream fileIn = fs.open(split.getPath());
        boolean skipFirstLine = false;
        if (codec != null) {
            this.in = new LineReader(codec.createInputStream(fileIn), job);
            this.end = Long.MAX_VALUE;
        } else {
            if (this.start != 0L) {
                skipFirstLine = true;
                this.start -= sepLength;
                fileIn.seek(this.start);
            }
            this.in = new LineReader(fileIn, job);
        }
        if (skipFirstLine) { // skip the first (partial) record and re-establish "start"
            int newSize = in.readLine(new Text(), 0,
                    (int) Math.min((long) Integer.MAX_VALUE, end - start));
            if (newSize > 0) {
                start += newSize;
            }
        }
        this.pos = this.start;
    }

    public boolean nextKeyValue() throws IOException {
        if (this.key == null) {
            this.key = new LongWritable();
        }
        this.key.set(this.pos);
        if (this.value == null) {
            this.value = new Text();
        }
        int newSize = 0;
        while (this.pos < this.end) {
            newSize = this.in.readLine(this.value, this.maxLineLength, Math.max(
                    (int) Math.min(Integer.MAX_VALUE, this.end - this.pos), this.maxLineLength));

            if (newSize == 0) {
                break;
            }
            this.pos += newSize;
            if (newSize < this.maxLineLength) {
                break;
            }

            LOG.info("Skipped line of size " + newSize + " at pos " + (this.pos - newSize));
        }

        if (newSize == 0) {
            // nothing left to read in this split
            this.key = null;
            this.value = null;
            return false;
        }
        // a record was read; the next call continues from the same buffer
        return true;
    }

    public LongWritable getCurrentKey() {
        return this.key;
    }

    public Text getCurrentValue() {
        return this.value;
    }

    public float getProgress() {
        if (this.start == this.end) {
            return 0.0F;
        }
        return Math.min(1.0F, (float) (this.pos - this.start) / (float) (this.end - this.start));
    }

    public synchronized void close() throws IOException {
        if (this.in != null)
            this.in.close();
    }
}
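For orientation, this is roughly the contract the framework relies on: after initialize, it repeatedly calls nextKeyValue and fetches the current key/value until false is returned. A small illustrative helper (RecordReaderDriver and drain are hypothetical names, not Hadoop API) that drains an already-initialized reader the same way:

    import java.io.IOException;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.RecordReader;

    public final class RecordReaderDriver {
        // reads every record from an initialized RecordReader, the way the framework does,
        // and returns how many records were produced
        public static long drain(RecordReader<LongWritable, Text> reader)
                throws IOException, InterruptedException {
            long count = 0;
            while (reader.nextKeyValue()) {
                System.out.println(reader.getCurrentKey()     // byte offset of the record
                        + "\t" + reader.getCurrentValue());   // record body, separator stripped
                count++;
            }
            reader.close();
            return count;
        }
    }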
3) Rewrite the LineReader that SearchRecordReader needs; it can be made an inner class of SearchRecordReader. The point that needs special care: the file is read one fixed-size buffer at a time, so a complete record will inevitably sometimes be cut in two across buffer boundaries, and if the separator is more than one character long, even the separator itself can be cut in half; both cases must be stitched back together. (A small standalone demonstration of this boundary case follows the code.)
public class LineReader {
    // carriage return (Hadoop default)
    // private static final byte CR = 13;
    // line feed (Hadoop default)
    // private static final byte LF = 10;

    // the file is read one buffer at a time
    private static final int DEFAULT_BUFFER_SIZE = 32 * 1024 * 1024;
    private int bufferSize = DEFAULT_BUFFER_SIZE;
    private InputStream in;
    private byte[] buffer;
    private int bufferLength = 0;
    private int bufferPosn = 0;

    LineReader(InputStream in, int bufferSize) {
        this.bufferLength = 0;
        this.bufferPosn = 0;

        this.in = in;
        this.bufferSize = bufferSize;
        this.buffer = new byte[this.bufferSize];
    }

    public LineReader(InputStream in, Configuration conf) throws IOException {
        this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE));
    }

    public void close() throws IOException {
        in.close();
    }

    public int readLine(Text str, int maxLineLength) throws IOException {
        return readLine(str, maxLineLength, Integer.MAX_VALUE);
    }

    public int readLine(Text str) throws IOException {
        return readLine(str, Integer.MAX_VALUE, Integer.MAX_VALUE);
    }

    // ---------- rewritten part starts here: the core code ----------
    // Note: separator and sepLength are the fields of the enclosing SearchRecordReader,
    // which is why this class is declared as its inner class.

    public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
        str.clear();
        Text record = new Text();
        int txtLength = 0;
        long bytesConsumed = 0L;
        boolean newline = false;
        int sepPosn = 0;

        do {
            // the current buffer is exhausted; read the next buffer
            if (this.bufferPosn >= this.bufferLength) {
                bufferPosn = 0;
                bufferLength = in.read(buffer);

                // end of file reached; break out and move on to the next file
                if (bufferLength <= 0) {
                    break;
                }
            }

            int startPosn = this.bufferPosn;
            for (; bufferPosn < bufferLength; bufferPosn++) {
                // handle a separator whose tail was cut off at the end of the previous buffer
                // (this is fragile if the separator contains many repeated characters)
                if (sepPosn > 0 && buffer[bufferPosn] != separator[sepPosn]) {
                    sepPosn = 0;
                }

                // found the first character of the record separator
                if (buffer[bufferPosn] == separator[sepPosn]) {
                    bufferPosn++;
                    int i = 0;

                    // check whether the following characters are also part of the separator
                    for (++sepPosn; sepPosn < sepLength; i++, sepPosn++) {

                        // the buffer ends in the middle of the separator: it has been cut in two
                        if (bufferPosn + i >= bufferLength) {
                            bufferPosn += i - 1;
                            break;
                        }

                        // as soon as one character differs, this is not the separator
                        if (this.buffer[this.bufferPosn + i] != separator[sepPosn]) {
                            sepPosn = 0;
                            break;
                        }
                    }

                    // a complete record separator has indeed been found
                    if (sepPosn == sepLength) {
                        bufferPosn += i;
                        newline = true;
                        sepPosn = 0;
                        break;
                    }
                }
            }

            int readLength = this.bufferPosn - startPosn;

            bytesConsumed += readLength;
            // the record separator itself is not placed into the record
            // int appendLength = readLength - newlineLength;
            if (readLength > maxLineLength - txtLength) {
                readLength = maxLineLength - txtLength;
            }
            if (readLength > 0) {
                record.append(this.buffer, startPosn, readLength);
                txtLength += readLength;

                // strip the separator off the end of the record
                if (newline) {
                    str.set(record.getBytes(), 0, record.getLength() - sepLength);
                }
            }

        } while (!newline && (bytesConsumed < maxBytesToConsume));

        if (bytesConsumed > (long) Integer.MAX_VALUE) {
            throw new IOException("Too many bytes before newline: " + bytesConsumed);
        }

        return (int) bytesConsumed;
    }

    // ---------- rewritten part ends here ----------

    // ---------- original readLine from hadoop-core's LineReader, for comparison ----------
    // (kept only for reference; remove or rename it before compiling, since its signature
    // duplicates the method above and it relies on the CR/LF constants commented out at the top)

    public int readLine(Text str, int maxLineLength, int maxBytesToConsume) throws IOException {
        str.clear();
        int txtLength = 0;
        int newlineLength = 0;
        boolean prevCharCR = false;
        long bytesConsumed = 0L;
        do {
            int startPosn = this.bufferPosn;
            if (this.bufferPosn >= this.bufferLength) {
                startPosn = this.bufferPosn = 0;
                if (prevCharCR)
                    bytesConsumed++;
                this.bufferLength = this.in.read(this.buffer);
                if (this.bufferLength <= 0) break;
            }
            for (; this.bufferPosn < this.bufferLength; this.bufferPosn++) {
                if (this.buffer[this.bufferPosn] == LF) {
                    newlineLength = (prevCharCR) ? 2 : 1;
                    this.bufferPosn++;
                    break;
                }
                if (prevCharCR) {
                    newlineLength = 1;
                    break;
                }
                prevCharCR = this.buffer[this.bufferPosn] == CR;
            }
            int readLength = this.bufferPosn - startPosn;
            if ((prevCharCR) && (newlineLength == 0))
                --readLength;
            bytesConsumed += readLength;
            int appendLength = readLength - newlineLength;
            if (appendLength > maxLineLength - txtLength) {
                appendLength = maxLineLength - txtLength;
            }
            if (appendLength > 0) {
                str.append(this.buffer, startPosn, appendLength);
                txtLength += appendLength;
            }
        } while ((newlineLength == 0) && (bytesConsumed < maxBytesToConsume));

        if (bytesConsumed > (long) Integer.MAX_VALUE)
            throw new IOException("Too many bytes before newline: " + bytesConsumed);
        return (int) bytesConsumed;
    }

    // ---------- end of original hadoop-core readLine ----------
}
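To see the buffer-boundary stitching at work outside of MapReduce, the reader can be fed a tiny buffer so that both a record and the separator straddle buffer boundaries. The sketch below assumes the LineReader above has been extracted into a standalone class (hypothetically called StandaloneLineReader) whose constructor also takes the separator bytes; with the logic as written, it should print "abc" and then "defgh":

    import java.io.ByteArrayInputStream;
    import org.apache.hadoop.io.Text;

    // Illustrative check of the boundary handling. StandaloneLineReader is a hypothetical
    // refactor of the LineReader above: same readLine logic, but the separator is passed
    // to the constructor instead of coming from the enclosing class.
    public class SeparatorBoundaryDemo {
        public static void main(String[] args) throws Exception {
            // two records delimited by "##"; a 4-byte buffer forces both a record and the
            // separator itself to be split across buffer reads ("abc#" / "#def" / "gh##")
            byte[] data = "abc##defgh##".getBytes();
            StandaloneLineReader reader =
                    new StandaloneLineReader(new ByteArrayInputStream(data), 4, "##".getBytes());
            Text record = new Text();
            while (reader.readLine(record, Integer.MAX_VALUE, Integer.MAX_VALUE) > 0) {
                System.out.println(record);   // should print "abc", then "defgh"
            }
            reader.close();
        }
    }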
2. The input is already sorted by its primary key, and records with the same key are guaranteed to be adjacent; assume the first field of each record is the key. If we keep using the LineReader above, the core readLine method needs to compare the keys of consecutive records: only when they differ do we split, and while they are equal we keep reading the next record into the same group. That code is not reproduced here, but the point to watch is again the hand-off between two buffers, where a record is very likely to be cut in half, one part in the previous buffer and the other in the next; a simplified sketch of the grouping idea follows.
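A minimal sketch of that grouping (not the code the author describes): instead of rewriting readLine itself, the same effect can be layered on top of it with a one-record look-ahead. These methods would live inside the custom LineReader above; the names pending, pendingBytes, keyOf and readGroup are illustrative, and the key is assumed to be the first field, terminated by a tab.

    private Text pending = new Text();     // look-ahead record read but not yet emitted
    private int pendingBytes = 0;          // bytes consumed by the pending record
    private boolean pendingValid = false;  // whether 'pending' currently holds a record

    // extracts the leading key field of a record (everything before the first tab)
    private static String keyOf(Text record) {
        String s = record.toString();
        int tab = s.indexOf('\t');
        return tab < 0 ? s : s.substring(0, tab);
    }

    // reads all consecutive records sharing one key into 'group' and returns the bytes consumed;
    // the records are simply concatenated, so re-insert a delimiter here if the consumer needs one
    public int readGroup(Text group, int maxLineLength, int maxBytesToConsume) throws IOException {
        group.clear();
        int consumed = 0;
        String groupKey = null;
        while (true) {
            if (!pendingValid) {
                pendingBytes = readLine(pending, maxLineLength, maxBytesToConsume);
                if (pendingBytes == 0) {
                    break;                               // end of input
                }
                pendingValid = true;
            }
            String k = keyOf(pending);
            if (groupKey == null) {
                groupKey = k;                            // first record of the group fixes the key
            } else if (!groupKey.equals(k)) {
                break;                                   // a new key starts a new group; keep it pending
            }
            group.append(pending.getBytes(), 0, pending.getLength());
            consumed += pendingBytes;
            pendingValid = false;
        }
        return consumed;
    }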
The advantage of this approach is that it removes the need for a reduce step, which improves efficiency considerably: the mapper phase is usually quite fast, and it is the reduce that tends to be the time-consuming part.
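Since the per-key grouping is now done while reading the input, the job can be declared map-only in the driver (for example, the one sketched after step 1), which skips the shuffle and reduce entirely and writes each mapper's output straight to HDFS:

    job.setNumReduceTasks(0);   // map-only job: no shuffle, no reduce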