code version: hadoop-0.19.1
首先说pread。pread会明确地把要读取的长度(size)传给datanode(在new BlockReader的时候)。
/** * Read bytes starting from the specified position. * * @param position start read from this position * @param buffer read buffer * @param offset offset into buffer * @param length number of bytes to read * * @return actual number of bytes read */ @Override public int read(long position, byte[] buffer, int offset, int length) throws IOException { // sanity checks checkOpen(); if (closed) { throw new IOException("Stream closed"); } long filelen = getFileLength(); if ((position < 0) || (position >= filelen)) { return -1; } int realLen = length; if ((position + length) > filelen) { realLen = (int)(filelen - position); } // determine the block and byte range within the block // corresponding to position and realLen List<LocatedBlock> blockRange = getBlockRange(position, realLen); int remaining = realLen; for (LocatedBlock blk : blockRange) { long targetStart = position - blk.getStartOffset(); long bytesToRead = Math.min(remaining, blk.getBlockSize() - targetStart); fetchBlockByteRange(blk, targetStart, targetStart + bytesToRead - 1, buffer, offset); remaining -= bytesToRead; position += bytesToRead; offset += bytesToRead; } assert remaining == 0 : "Wrong number of bytes read."; if (stats != null) { stats.incrementBytesRead(realLen); } return realLen; }
private void fetchBlockByteRange(LocatedBlock block, long start, long end, byte[] buf, int offset) throws IOException { // // Connect to best DataNode for desired Block, with potential offset // Socket dn = null; int numAttempts = block.getLocations().length; IOException ioe = null; while (dn == null && numAttempts-- > 0 ) { long prepareRealReadStart = System.currentTimeMillis(); DNAddrPair retval = chooseDataNode(block); DatanodeInfo chosenNode = retval.info; InetSocketAddress targetAddr = retval.addr; BlockReader reader = null; try { dn = socketFactory.createSocket(); NetUtils.connect(dn, targetAddr, socketTimeout); dn.setSoTimeout(socketTimeout); int len = (int) (end - start + 1); reader = BlockReader.newBlockReader(dn, src, block.getBlock().getBlockId(), block.getBlock().getGenerationStamp(), start, len, buffersize, verifyChecksum, clientName); int nread = reader.readAll(buf, offset, len); if (nread != len) { throw new IOException("truncated return from reader.read(): " + "excpected " + len + ", got " + nread); } return; } catch (ChecksumException e) { ioe = e; LOG.warn("fetchBlockByteRange(). Got a checksum exception for " + src + " at " + block.getBlock() + ":" + e.getPos() + " from " + chosenNode.getName()); reportChecksumFailure(src, block.getBlock(), chosenNode); } catch (IOException e) { ioe = e; LOG.warn("Failed to connect to " + targetAddr + " for file " + src + " for block " + block.getBlock().getBlockId() + ":" + StringUtils.stringifyException(e)); } finally { IOUtils.closeStream(reader); IOUtils.closeSocket(dn); dn = null; } // Put chosen node into dead list, continue addToDeadNodes(chosenNode); } throw (ioe == null) ? new IOException("Could not read data") : ioe; }
pread 的过程:
根据要读的数据的offset和readLen,计算出要读的blockRange,即有哪些block在要读的范围内。具体的getBlockRange()这个函数中要判断要读的blocks是否在维护的locatedBlocks这个block cache中,如果不在,要问namenode查询,然后再放入到cache中。
然后再针对获得的blockRange中的每个block读取数据:选取datanode,创建连接,并且对每个block都要重新生成一个BlockReader(这种实现比较废柴啊!)。
然后看seek+read。read会把当前位置到block结束的长度传给datanode(也是在new BlockReader的时候),这样DataNode就可以read ahead;再加上TCP窗口的缓冲作用(hadoop code里面的TCP_WINDOW_SIZE是128K),可以加快连续读的性能。
/** * Seek to a new arbitrary location */ @Override public synchronized void seek(long targetPos) throws IOException { if (targetPos > getFileLength()) { throw new IOException("Cannot seek after EOF"); } boolean done = false; if (pos <= targetPos && targetPos <= blockEnd) { // // If this seek is to a positive position in the current // block, and this piece of data might already be lying in // the TCP buffer, then just eat up the intervening data. // int diff = (int)(targetPos - pos); if (diff <= TCP_WINDOW_SIZE) { try { pos += blockReader.skip(diff); if (pos == targetPos) { done = true; } } catch (IOException e) {//make following read to retry LOG.debug("Exception while seek to " + targetPos + " from " + currentBlock +" of " + src + " from " + currentNode + ": " + StringUtils.stringifyException(e)); } } } if (!done) { pos = targetPos; blockEnd = -1; } }
这个seek其实基本不做什么事情(我的测试中做上万次seek,平均耗时约为0)。它主要是移动pos这个游标:如果目标位置在当前block中、位于当前pos之后,并且距离不超过TCP_WINDOW_SIZE,就通过blockReader.skip()直接跳到目标位置;否则,就把pos设成目标位置,并把blockEnd置成-1。这样其实最终的seek任务是在后面的read里面实现的。
看read的code:
/** * Read the entire buffer. */ @Override public synchronized int read(byte buf[], int off, int len) throws IOException { checkOpen(); if (closed) { throw new IOException("Stream closed"); } if (pos < getFileLength()) { int retries = 2; while (retries > 0) { try { if (pos > blockEnd) { currentNode = blockSeekTo(pos); } int realLen = Math.min(len, (int) (blockEnd - pos + 1)); int result = readBuffer(buf, off, realLen); if (result >= 0) { pos += result; } else { // got a EOS from reader though we expect more data on it. throw new IOException("Unexpected EOS from the reader"); } if (stats != null && result != -1) { stats.incrementBytesRead(result); } return result; } catch (ChecksumException ce) { throw ce; } catch (IOException e) { if (retries == 1) { LOG.warn("DFS Read: " + StringUtils.stringifyException(e)); } blockEnd = -1; if (currentNode != null) { addToDeadNodes(currentNode); } if (--retries == 0) { throw e; } } } } return -1; }
如果上面seek的时候目标位置已经通过skip在当前block内就位(blockEnd没有被重置),现在就只需直接读好了。否则,刚才知道blockEnd会被置成-1,于是pos > blockEnd成立,现在就要做一次真正的seek操作,函数blockSeekTo()实现这一功能。
看blockSeekTo()干了哪些事情:
/** * Open a DataInputStream to a DataNode so that it can be read from. * We get block ID and the IDs of the destinations at startup, from the namenode. */ private synchronized DatanodeInfo blockSeekTo(long target) throws IOException { if (target >= getFileLength()) { throw new IOException("Attempted to read past end of file"); } if ( blockReader != null ) { blockReader.close(); blockReader = null; } if (s != null) { s.close(); s = null; } // // Compute desired block // LocatedBlock targetBlock = getBlockAt(target); assert (target==this.pos) : "Wrong postion " + pos + " expect " + target; long offsetIntoBlock = target - targetBlock.getStartOffset(); // // Connect to best DataNode for desired Block, with potential offset // DatanodeInfo chosenNode = null; while (s == null) { DNAddrPair retval = chooseDataNode(targetBlock); chosenNode = retval.info; InetSocketAddress targetAddr = retval.addr; try { s = socketFactory.createSocket(); NetUtils.connect(s, targetAddr, socketTimeout); s.setSoTimeout(socketTimeout); Block blk = targetBlock.getBlock(); blockReader = BlockReader.newBlockReader(s, src, blk.getBlockId(), blk.getGenerationStamp(), offsetIntoBlock, blk.getNumBytes() - offsetIntoBlock, buffersize, verifyChecksum, clientName); return chosenNode; } catch (IOException ex) { // Put chosen node into dead list, continue LOG.debug("Failed to connect to " + targetAddr + ":" + StringUtils.stringifyException(ex)); addToDeadNodes(chosenNode); if (s != null) { try { s.close(); } catch (IOException iex) { } } s = null; } } return chosenNode; }
发现这个函数跟上面的fetchBlockByteRange一样废柴:先把先前的blockReader close掉,然后再创建到目的datanode的新连接。主要不同的地方在哪里呢?在于getBlockRange和getBlockAt的区别:上面的pread模式提供了第二个参数readLen,找目标blocks的时候找这个范围内的就可以了;而seek+read这种模式假设seek的时候并不知道后面要读的长度,所以用了一个缺省的prefetchSize,缺省是10个block size大小。
那综合考虑两种read,作为random read的实现:第一种pread无论何时都要重新创建连接;第二种当要读的数据在当前block内(并且是向前seek、距离不超过TCP_WINDOW_SIZE)的时候可以重用上次的连接,理论上应该第二种效率高些。