HDFS源代码分析之DataNode BlockScanner实现

BlockScanner实现

每个DataNode都会有一个BlockScanner,周期性地验证该DataNode上存储的所有数据块的正确性,并把损坏的数据块报告给NameNode。
VolumeScanner是专门针对每个存储目录做块扫描的服务,由于DataNode可以使用多目录,所以BlockScanner会持有多个VolumeScanner。

public class BlockScanner {
  ...
  /** Registered volume scanners, keyed by the storage ID of their volume. */
  private final TreeMap<String, VolumeScanner> scanners =
      new TreeMap<String, VolumeScanner>();
  ...

  /**
   * Creates, starts and registers a {@link VolumeScanner} for a volume.
   *
   * <p>Nothing is registered when the block scanner is disabled or when a
   * scanner is already registered under the volume's storage ID. Whenever
   * this method finishes without registering a new scanner (including when
   * an exception propagates), the supplied reference is handed to
   * {@code IOUtils.cleanup}; otherwise the new scanner keeps using it.
   *
   * @param ref reference to the volume that should be scanned
   */
  public synchronized void addVolumeScanner(FsVolumeReference ref) {
    boolean registered = false;
    try {
      final FsVolumeSpi volume = ref.getVolume();
      if (!isEnabled()) {
        LOG.debug("Not adding volume scanner for {}, because the block " +
            "scanner is disabled.", volume.getBasePath());
        return;
      }
      // Refuse to register a second scanner under the same storage ID.
      if (scanners.get(volume.getStorageID()) != null) {
        LOG.error("Already have a scanner for volume {}.",
            volume.getBasePath());
        return;
      }
      LOG.debug("Adding scanner for volume {} (StorageID {})",
          volume.getBasePath(), volume.getStorageID());
      // Build the scanner, start its thread, then record it in the map.
      final VolumeScanner created = new VolumeScanner(conf, datanode, ref);
      created.start();
      scanners.put(volume.getStorageID(), created);
      registered = true;
    } finally {
      if (!registered) {
        // No new VolumeScanner took ownership of the volume reference,
        // so it is not needed any more.
        IOUtils.cleanup(null, ref);
      }
    }
  }

}
VolumeScanner实现
/**
 * Thread that scans the blocks stored on a single volume.
 *
 * <p>This is an excerpt: members elided with "..." are not shown here.
 */
public class VolumeScanner extends Thread {
...
  // Suspect blocks queued for rescanning; the scanner gives these priority
  // over regular blocks.
  private final LinkedHashSet<ExtendedBlock> suspectBlocks =
      new LinkedHashSet<ExtendedBlock>();
...

  /**
   * Main loop of the scanner thread.
   *
   * <p>Repeatedly picks the next suspect block (if any) and calls
   * {@code runLoop}, sleeping between iterations for the number of
   * milliseconds {@code runLoop} returned. The loop ends when
   * {@code stopping} is set, when the thread is interrupted, or when any
   * other {@code Throwable} escapes. On exit, the position of every block
   * iterator is saved, the iterators are closed, and the volume reference
   * is released.
   */
  public void run() {
    // Record the minute on which the scanner started
    // (monotonic clock, converted from milliseconds to whole minutes).
    this.startMinute =
        TimeUnit.MINUTES.convert(Time.monotonicNow(), TimeUnit.MILLISECONDS);
    this.curMinute = startMinute;
    try {
      LOG.trace("{}: thread starting.", this);
      resultHandler.setup(this);
      try {
        // Delay (ms) requested by the previous runLoop call; 0 means
        // continue immediately.
        long timeout = 0;
        while (true) {
          ExtendedBlock suspectBlock = null;
          synchronized (this) {
            if (stopping) {
              break;
            }
            if (timeout > 0) {
              LOG.debug("{}: wait for {} milliseconds", this, timeout);
              // wait() can return early when notify() is called on this
              // object, e.g. when a new suspect block is queued.
              wait(timeout);
              // Re-check: the stop flag may have been set while waiting.
              if (stopping) {
                break;
              }
            }
            // Fetch the next suspect block, if there is one.
            suspectBlock = popNextSuspectBlock();
          }
          // Run one scan step outside the lock; its return value is the
          // delay before the next iteration.
          timeout = runLoop(suspectBlock);
        }
      } catch (InterruptedException e) {
        // We are exiting because of an InterruptedException,
        // probably sent by VolumeScanner#shutdown.
        LOG.trace("{} exiting because of InterruptedException.", this);
      } catch (Throwable e) {
        LOG.error("{} exiting because of exception ", this, e);
      }
      LOG.info("{} exiting.", this);
      // Save the current position of all block iterators and close them.
      for (BlockIterator iter : blockIters) {
        saveBlockIterator(iter);
        IOUtils.cleanup(null, iter);
      }
    } finally {
      // When the VolumeScanner exits, release the reference we were holding
      // on the volume.  This will allow the volume to be removed later.
      IOUtils.cleanup(null, ref);
    }
  }
}

suspectBlocks是VolumeScanner所维护的可疑块列表。markSuspectBlock负责将可疑块加入suspectBlocks列表中。

/**
 * Queues a block for rescanning and wakes the scanner thread.
 *
 * <p>The request is ignored (with a debug message) when the scanner is
 * stopping, when the block has an entry in {@code recentSuspectBlocks},
 * or when the block is already queued in {@code suspectBlocks}.
 *
 * @param block the block to mark as suspect
 */
public synchronized void markSuspectBlock(ExtendedBlock block) {
    if (stopping) {
      LOG.debug("{}: Not scheduling suspect block {} for " +
          "rescanning, because this volume scanner is stopping.", this, block);
      return;
    }
    if (recentSuspectBlocks.getIfPresent(block) != null) {
      LOG.debug("{}: Not scheduling suspect block {} for " +
          "rescanning, because we rescanned it recently.", this, block);
      return;
    }
    // add() returns false and leaves the set untouched when the block is
    // already queued.
    final boolean newlyQueued = suspectBlocks.add(block);
    if (!newlyQueued) {
      LOG.debug("{}: suspect block {} is already queued for " +
          "rescanning.", this, block);
      return;
    }
    recentSuspectBlocks.put(block, true);
    LOG.debug("{}: Scheduling suspect block {} for rescanning.", this, block);
    // Wake the scanner thread if it is waiting on this object.
    notify();
  }

markSuspectBlock在BlockSender的sendPacket方法中被调用。当发送数据时发生IO异常,且该异常不是SocketTimeoutException、异常信息也不是以"Broken pipe"或"Connection reset"开头时,才会把该块标记为可疑块。

/**
 * Sends one packet of block data to {@code out}.
 *
 * <p>This is an excerpt: the sections elided with "..." (buffer setup and
 * the return value) are not shown here.
 *
 * <p>If writing fails with an {@code IOException} that is neither a
 * {@code SocketTimeoutException} nor one whose message starts with
 * "Broken pipe" or "Connection reset", the error is logged and the block
 * is marked as suspect so the block scanner rescans it. The exception is
 * always rethrown, converted by {@code ioeToSocketException}.
 *
 * @param pkt buffer holding the packet
 * @param maxChunks maximum number of chunks to send
 * @param out stream to write to; must be a {@code SocketOutputStream}
 *            when {@code transferTo} is true
 * @param transferTo whether to copy file data with transferToFully
 *                   instead of writing it from the buffer
 * @param throttler throttler for the transfer
 * @throws IOException if writing the packet fails
 */
private int sendPacket(ByteBuffer pkt, int maxChunks, OutputStream out,
      boolean transferTo, DataTransferThrottler throttler) throws IOException {
    ...
    try {
      if (transferTo) {
        SocketOutputStream sockOut = (SocketOutputStream)out;
        // Write the header and checksums from the buffer first.
        sockOut.write(buf, headerOff, dataOff - headerOff);
        // Then copy the data straight from the block file's channel.
        FileChannel fileCh = ((FileInputStream)blockIn).getChannel();
        LongWritable waitTime = new LongWritable();
        LongWritable transferTime = new LongWritable();
        sockOut.transferToFully(fileCh, blockInPosition, dataLen, 
            waitTime, transferTime);
        datanode.metrics.addSendDataPacketBlockedOnNetworkNanos(waitTime.get());
        datanode.metrics.addSendDataPacketTransferNanos(transferTime.get());
        blockInPosition += dataLen;
      } else {
        // Header, checksums and data are all written from the buffer.
        out.write(buf, headerOff, dataOff + dataLen - headerOff);
      }
    } catch (IOException e) {
      // Socket timeouts are not taken as a sign of a bad block.
      if (!(e instanceof SocketTimeoutException)) {
        // getMessage() may return null. Without the null check, startsWith
        // would throw a NullPointerException here and hide the original
        // IOException from the caller.
        String ioem = e.getMessage();
        if (ioem != null
            && !ioem.startsWith("Broken pipe")
            && !ioem.startsWith("Connection reset")) {
          LOG.error("BlockSender.sendChunks() exception: ", e);
          // Mark the block as suspect so the scanner checks it soon.
          datanode.getBlockScanner().markSuspectBlock(
              volumeRef.getVolume().getStorageID(),
              block);
        }
      }
      throw ioeToSocketException(e);
    }
    ...
  }

可疑块被取出后,会作为参数传入runLoop方法;runLoop优先扫描传入的可疑块,没有可疑块时才从当前block pool的迭代器中取下一个待扫描的块。

/**
 * Runs a single step of the scan loop: picks one block and scans it.
 *
 * <p>The block to scan is {@code suspectBlock} when it is non-null;
 * otherwise it is the next block of the current block pool iterator. The
 * scanner statistics are refreshed on every call, whichever path returns.
 *
 * @param suspectBlock a suspect block to scan first, or null to continue
 *                     with the regular iteration
 * @return the number of milliseconds the caller should wait before the
 *         next call; 0 means call again immediately
 */
private long runLoop(ExtendedBlock suspectBlock) {
    long bytesScanned = -1;
    boolean scanError = false;
    ExtendedBlock block = null;
    try {
      long monotonicMs = Time.monotonicNow();
      expireOldScannedBytesRecords(monotonicMs);

      if (!calculateShouldScan(volume.getStorageID(), conf.targetBytesPerSec,
          scannedBytesSum, startMinute, curMinute)) {
        // If neededBytesPerSec is too low, then wait few seconds for some old
        // scannedBytes records to expire.
        return 30000L;
      }

      // Find a usable block pool to scan.
      if (suspectBlock != null) {
        block = suspectBlock;
      } else {
        if ((curBlockIter == null) || curBlockIter.atEnd()) {
          long timeout = findNextUsableBlockIter();
          if (timeout > 0) {
            LOG.trace("{}: no block pools are ready to scan yet.  Waiting " +
                "{} ms.", this, timeout);
            synchronized (stats) {
              stats.nextBlockPoolScanStartMs = Time.monotonicNow() + timeout;
            }
            return timeout;
          }
          // A new scan of a block pool starts: reset the per-period counters.
          synchronized (stats) {
            stats.scansSinceRestart++;
            stats.blocksScannedInCurrentPeriod = 0;
            stats.nextBlockPoolScanStartMs = -1;
          }
          return 0L;
        }
        try {
          // Take the next block to scan from the current block pool.
          block = curBlockIter.nextBlock();
        } catch (IOException e) {
          // Pass the exception to the logger so its cause and stack trace
          // are not lost.
          LOG.warn("{}: nextBlock error on {}", this, curBlockIter, e);
          return 0L;
        }
        if (block == null) {
          LOG.info("{}: finished scanning block pool {}",
              this, curBlockIter.getBlockPoolId());
          saveBlockIterator(curBlockIter);
          return 0L;
        }
      }
      // Save the iterator position once at least cursorSaveMs has passed
      // since the last save.
      if (curBlockIter != null) {
        long saveDelta = monotonicMs - curBlockIter.getLastSavedMs();
        if (saveDelta >= conf.cursorSaveMs) {
          LOG.debug("{}: saving block iterator {} after {} ms.",
              this, curBlockIter, saveDelta);
          saveBlockIterator(curBlockIter);
        }
      }
      // Scan the selected block; a negative result signals a scan error.
      bytesScanned = scanBlock(block, conf.targetBytesPerSec);
      if (bytesScanned >= 0) {
        scannedBytesSum += bytesScanned;
        scannedBytes[(int)(curMinute % MINUTES_PER_HOUR)] += bytesScanned;
      } else {
        scanError = true;
      }
      return 0L;
    } finally {
      // Runs on every return path above to publish the latest statistics.
      synchronized (stats) {
        stats.bytesScannedInPastHour = scannedBytesSum;
        if (bytesScanned > 0) {
          stats.blocksScannedInCurrentPeriod++;
          stats.blocksScannedSinceRestart++;
        }
        if (scanError) {
          stats.scanErrorsSinceRestart++;
        }
        if (block != null) {
          stats.lastBlockScanned = block;
        }
        if (curBlockIter == null) {
          stats.eof = true;
          stats.blockPoolPeriodEndsMs = -1;
        } else {
          stats.eof = curBlockIter.atEnd();
          stats.blockPoolPeriodEndsMs =
              curBlockIter.getIterStartMs() + conf.scanPeriodMs;
        }
      }
    }
  }

对于给定的块使用scanBlock方法处理。

/**
 * Scans a single block by reading it through a BlockSender.
 *
 * <p>This is an excerpt: the section elided with "..." is not shown. It
 * presumably declares {@code block} and {@code blockSender} and derives
 * {@code block} from {@code cblock} (TODO confirm against the full source).
 *
 * @param cblock the block to scan
 * @param bytesPerSec bandwidth limit applied to the throttler
 * @return the value returned by {@code sendBlock} on success, or -1 when
 *         an IOException occurred
 */
private long scanBlock(ExtendedBlock cblock, long bytesPerSec) {
	...
    try {
      // Create the BlockSender used to read the block.
      blockSender = new BlockSender(block, 0, -1,
          false, true, true, datanode, null,
          CachingStrategy.newDropBehind());
      // Throttle the read to the requested bandwidth.
      throttler.setBandwidth(bytesPerSec);
      // nullStream is presumably a stream that discards the data -- verify.
      long bytesRead = blockSender.sendBlock(nullStream, null, throttler);
      // No exception: report a successful scan to the result handler.
      resultHandler.handle(block, null);
      metrics.incrBlocksVerified();
      return bytesRead;
    } catch (IOException e) {
      // Let the result handler decide what the failure means.
      resultHandler.handle(block, e);
    } finally {
      IOUtils.cleanup(null, blockSender);
    }
    // Only reached after an IOException: count the failed verification.
    metrics.incrBlockVerificationFailures();
    return -1;
  }

resultHandler的handle方法中包含了最终的处理逻辑。

/**
 * Handles the outcome of scanning one block.
 *
 * <p>A null exception means the scan succeeded. A failure is not treated
 * as corruption when the block is no longer in the volume's dataset or
 * when the exception is a {@code FileNotFoundException}. Every other
 * failure is logged and the block is reported as bad through the
 * DataNode's {@code reportBadBlocks}.
 *
 * @param block the block that was scanned
 * @param e the exception raised while scanning, or null on success
 */
public void handle(ExtendedBlock block, IOException e) {
      final FsVolumeSpi vol = scanner.volume;
      if (e == null) {
        // No exception: the block was read successfully.
        LOG.trace("Successfully scanned {} on {}", block, vol.getBasePath());
      } else if (!vol.getDataset().contains(block)) {
        // If the block does not exist anymore, then it's not an error.
        LOG.debug("Volume {}: block {} is no longer in the dataset.",
            vol.getBasePath(), block);
      } else if (e instanceof FileNotFoundException) {
        LOG.info("Volume {}: verification failed for {} because of " +
                "FileNotFoundException.  This may be due to a race with write.",
            vol.getBasePath(), block);
      } else {
        // Any other exception is treated as a bad block.
        LOG.warn("Reporting bad " + block + " with volume "
            + vol.getBasePath(), e);
        try {
          // Report the bad block (to the NameNode).
          scanner.datanode.reportBadBlocks(block, vol);
        } catch (IOException reportFailure) {
          LOG.warn("Cannot report bad block " + block, reportFailure);
        }
      }
    }
  }

handle方法的逻辑是:根据BlockSender读块时是否抛出IO异常来判断是否为坏块。如果该块已不在数据集中,或者异常是FileNotFoundException(可能是与写操作竞争导致),则不视为坏块;其余情况认定为坏块,并向NameNode汇报。

你可能感兴趣的:(大数据,--,hadoop源代码分析)