每个DataNode都会有一个BlockScanner，周期性地验证DataNode上存储的所有数据块的正确性，并把损坏的数据块报告给NameNode。
VolumeScanner是专门针对每个存储目录做块扫描的服务,由于DataNode可以使用多目录,所以BlockScanner会持有多个VolumeScanner。
/**
 * Keeps track of the {@link VolumeScanner} instances of a DataNode, one per
 * storage volume.
 */
public class BlockScanner {
  ...
  // Registered volume scanners, keyed by the storage ID of their volume.
  private final TreeMap<String, VolumeScanner> scanners =
      new TreeMap<String, VolumeScanner>();
  ...
  /**
   * Creates, starts and registers a scanner for the volume behind {@code ref}.
   *
   * Nothing is registered when the block scanner is disabled or when a
   * scanner already exists for the volume's storage ID. Whenever no scanner
   * ends up registered (including when an exception is thrown), the volume
   * reference is released here; otherwise it is handed to the new scanner.
   *
   * @param ref reference to the volume that should be scanned.
   */
  public synchronized void addVolumeScanner(FsVolumeReference ref) {
    boolean registered = false;
    try {
      final FsVolumeSpi volume = ref.getVolume();
      if (!isEnabled()) {
        LOG.debug("Not adding volume scanner for {}, "
            + "because the block scanner is disabled.", volume.getBasePath());
      } else if (scanners.get(volume.getStorageID()) != null) {
        // A scanner is already registered under this storage ID.
        LOG.error("Already have a scanner for volume {}.",
            volume.getBasePath());
      } else {
        LOG.debug("Adding scanner for volume {} (StorageID {})",
            volume.getBasePath(), volume.getStorageID());
        // Start the scanner thread first, then publish it in the map.
        final VolumeScanner created = new VolumeScanner(conf, datanode, ref);
        created.start();
        scanners.put(volume.getStorageID(), created);
        registered = true;
      }
    } finally {
      if (!registered) {
        // No scanner took ownership of the reference, so give it back.
        IOUtils.cleanup(null, ref);
      }
    }
  }
}
/**
 * Thread that scans the blocks of a single volume.
 */
public class VolumeScanner extends Thread {
  ...
  // Suspect blocks. run() pops one entry per loop iteration and hands it to
  // runLoop, so queued suspects are scanned before regular blocks.
  private final LinkedHashSet<ExtendedBlock> suspectBlocks =
      new LinkedHashSet<ExtendedBlock>();
  ...
  /**
   * Main loop of the scanner thread.
   *
   * Repeatedly waits for the delay returned by the previous runLoop call
   * (if any), pops the next suspect block and calls runLoop with it. The
   * loop ends when {@code stopping} is set, when the thread is interrupted,
   * or when any other Throwable escapes. On exit the position of every block
   * iterator is saved, the iterators are closed, and the volume reference is
   * released.
   */
  @Override
  public void run() {
    // Record the minute on which the scanner started.
    this.startMinute =
        TimeUnit.MINUTES.convert(Time.monotonicNow(), TimeUnit.MILLISECONDS);
    this.curMinute = startMinute;
    try {
      LOG.trace("{}: thread starting.", this);
      resultHandler.setup(this);
      try {
        long timeout = 0;
        while (true) {
          ExtendedBlock suspectBlock = null;
          // The monitor is held only while checking the stop flag, waiting
          // and popping; the scan itself runs outside the lock.
          synchronized (this) {
            if (stopping) {
              break;
            }
            if (timeout > 0) {
              LOG.debug("{}: wait for {} milliseconds", this, timeout);
              wait(timeout);
              // Re-check after waking up: stopping may have been set while
              // this thread was waiting.
              if (stopping) {
                break;
              }
            }
            // Fetch the next suspect block.
            suspectBlock = popNextSuspectBlock();
          }
          // Run one scan iteration; the result is the delay before the next.
          timeout = runLoop(suspectBlock);
        }
      } catch (InterruptedException e) {
        // We are exiting because of an InterruptedException,
        // probably sent by VolumeScanner#shutdown.
        LOG.trace("{} exiting because of InterruptedException.", this);
      } catch (Throwable e) {
        LOG.error("{} exiting because of exception ", this, e);
      }
      LOG.info("{} exiting.", this);
      // Save the current position of all block iterators and close them.
      for (BlockIterator iter : blockIters) {
        saveBlockIterator(iter);
        IOUtils.cleanup(null, iter);
      }
    } finally {
      // When the VolumeScanner exits, release the reference we were holding
      // on the volume. This will allow the volume to be removed later.
      IOUtils.cleanup(null, ref);
    }
  }
}
suspectBlocks是VolumeScanner所维护的可疑块列表。markSuspectBlock负责将可疑块加入suspectBlocks列表中。
/**
 * Queues a block for rescanning and wakes the scanner thread.
 *
 * The request is ignored when the scanner is stopping, when the block has an
 * entry in {@code recentSuspectBlocks}, or when it is already queued.
 *
 * @param block the block to schedule for rescanning.
 */
public synchronized void markSuspectBlock(ExtendedBlock block) {
  if (stopping) {
    LOG.debug("{}: Not scheduling suspect block {} for rescanning, "
        + "because this volume scanner is stopping.", this, block);
    return;
  }
  final boolean rescannedRecently =
      recentSuspectBlocks.getIfPresent(block) != null;
  if (rescannedRecently) {
    LOG.debug("{}: Not scheduling suspect block {} for rescanning, "
        + "because we rescanned it recently.", this, block);
    return;
  }
  // add() returns false, leaving the set untouched, when the block is
  // already queued.
  final boolean newlyQueued = suspectBlocks.add(block);
  if (!newlyQueued) {
    LOG.debug("{}: suspect block {} is already queued for rescanning.",
        this, block);
    return;
  }
  recentSuspectBlocks.put(block, true);
  LOG.debug("{}: Scheduling suspect block {} for rescanning.", this, block);
  // Wake the scanner thread.
  notify();
}
markSuspectBlock在BlockSender的sendPacket方法中被调用。当发送数据发生IO异常时（SocketTimeoutException以及消息以"Broken pipe"或"Connection reset"开头的连接断开类异常除外），会将该块标记为可疑块。
/**
 * Sends one packet to {@code out}. Only the write and its error handling are
 * shown here; the rest of the method is elided.
 *
 * When {@code transferTo} is true, {@code out} is cast to a
 * SocketOutputStream, the header is written from {@code buf}, and the data
 * is transferred directly from the block file channel. Otherwise header and
 * data are written from {@code buf} in a single call.
 *
 * An IOException from the write marks the block as suspect unless it is a
 * SocketTimeoutException or its message starts with "Broken pipe" or
 * "Connection reset". In every case the exception is then rethrown through
 * ioeToSocketException.
 *
 * @param transferTo whether to use the channel transfer path.
 * @param out the stream the packet is written to.
 * @throws IOException the write failure, converted by ioeToSocketException.
 */
private int sendPacket(ByteBuffer pkt, int maxChunks, OutputStream out,
    boolean transferTo, DataTransferThrottler throttler) throws IOException {
  ...
  try {
    if (transferTo) {
      SocketOutputStream sockOut = (SocketOutputStream)out;
      // Header first, then the data straight from the block file.
      sockOut.write(buf, headerOff, dataOff - headerOff);
      FileChannel fileCh = ((FileInputStream)blockIn).getChannel();
      LongWritable waitTime = new LongWritable();
      LongWritable transferTime = new LongWritable();
      sockOut.transferToFully(fileCh, blockInPosition, dataLen,
          waitTime, transferTime);
      datanode.metrics.addSendDataPacketBlockedOnNetworkNanos(waitTime.get());
      datanode.metrics.addSendDataPacketTransferNanos(transferTime.get());
      blockInPosition += dataLen;
    } else {
      out.write(buf, headerOff, dataOff + dataLen - headerOff);
    }
  } catch (IOException e) {
    // A SocketTimeoutException is neither logged nor marked as suspect here.
    if (!(e instanceof SocketTimeoutException)) {
      // getMessage() may return null. A null message cannot be recognised as
      // a peer disconnect, so it is handled like any other unexpected error
      // rather than raising a NullPointerException that would hide e.
      String ioem = e.getMessage();
      boolean peerDisconnected = ioem != null
          && (ioem.startsWith("Broken pipe")
              || ioem.startsWith("Connection reset"));
      if (!peerDisconnected) {
        LOG.error("BlockSender.sendChunks() exception: ", e);
        // Mark the block as suspect so the volume scanner rescans it.
        datanode.getBlockScanner().markSuspectBlock(
            volumeRef.getVolume().getStorageID(),
            block);
      }
    }
    throw ioeToSocketException(e);
  }
  ...
}
可疑块从suspectBlocks中取出后，会作为参数传给runLoop方法进行扫描；如果没有可疑块，runLoop则从当前块池的迭代器中取下一个待扫描的块。
/**
 * Runs one iteration of the scan loop: picks a block, scans it, and updates
 * the scan statistics.
 *
 * @param suspectBlock a suspect block to scan in this iteration, or null to
 *                     take the next block from the current block iterator.
 * @return the number of milliseconds to wait before the next iteration;
 *         0 means the next iteration can start immediately.
 */
private long runLoop(ExtendedBlock suspectBlock) {
  long bytesScanned = -1;
  boolean scanError = false;
  ExtendedBlock block = null;
  try {
    long monotonicMs = Time.monotonicNow();
    expireOldScannedBytesRecords(monotonicMs);
    if (!calculateShouldScan(volume.getStorageID(), conf.targetBytesPerSec,
        scannedBytesSum, startMinute, curMinute)) {
      // If neededBytesPerSec is too low, then wait few seconds for some old
      // scannedBytes records to expire.
      return 30000L;
    }
    // Pick the block to scan: a suspect block takes precedence; otherwise
    // find a usable block pool and take its next block.
    if (suspectBlock != null) {
      block = suspectBlock;
    } else {
      if ((curBlockIter == null) || curBlockIter.atEnd()) {
        long timeout = findNextUsableBlockIter();
        if (timeout > 0) {
          LOG.trace("{}: no block pools are ready to scan yet. Waiting " +
              "{} ms.", this, timeout);
          synchronized (stats) {
            stats.nextBlockPoolScanStartMs = Time.monotonicNow() + timeout;
          }
          return timeout;
        }
        // An iterator is available: reset the per-period statistics and let
        // the next iteration start reading from it.
        synchronized (stats) {
          stats.scansSinceRestart++;
          stats.blocksScannedInCurrentPeriod = 0;
          stats.nextBlockPoolScanStartMs = -1;
        }
        return 0L;
      }
      try {
        // Fetch the next block to scan from the current block pool.
        block = curBlockIter.nextBlock();
      } catch (IOException e) {
        // Pass the exception as the last argument so its stack trace is
        // logged instead of being dropped.
        LOG.warn("{}: nextBlock error on {}", this, curBlockIter, e);
        return 0L;
      }
      if (block == null) {
        LOG.info("{}: finished scanning block pool {}",
            this, curBlockIter.getBlockPoolId());
        saveBlockIterator(curBlockIter);
        return 0L;
      }
    }
    if (curBlockIter != null) {
      // Persist the iterator position once at least conf.cursorSaveMs have
      // passed since the last save.
      long saveDelta = monotonicMs - curBlockIter.getLastSavedMs();
      if (saveDelta >= conf.cursorSaveMs) {
        LOG.debug("{}: saving block iterator {} after {} ms.",
            this, curBlockIter, saveDelta);
        saveBlockIterator(curBlockIter);
      }
    }
    // Scan the selected block; a negative result signals a scan error.
    bytesScanned = scanBlock(block, conf.targetBytesPerSec);
    if (bytesScanned >= 0) {
      scannedBytesSum += bytesScanned;
      scannedBytes[(int)(curMinute % MINUTES_PER_HOUR)] += bytesScanned;
    } else {
      scanError = true;
    }
    return 0L;
  } finally {
    // Publish the statistics on every exit path.
    synchronized (stats) {
      stats.bytesScannedInPastHour = scannedBytesSum;
      if (bytesScanned > 0) {
        stats.blocksScannedInCurrentPeriod++;
        stats.blocksScannedSinceRestart++;
      }
      if (scanError) {
        stats.scanErrorsSinceRestart++;
      }
      if (block != null) {
        stats.lastBlockScanned = block;
      }
      if (curBlockIter == null) {
        stats.eof = true;
        stats.blockPoolPeriodEndsMs = -1;
      } else {
        stats.eof = curBlockIter.atEnd();
        stats.blockPoolPeriodEndsMs =
            curBlockIter.getIterStartMs() + conf.scanPeriodMs;
      }
    }
  }
}
对于给定的块使用scanBlock方法处理。
/**
 * Scans a single block by reading it through a BlockSender and passes the
 * outcome to resultHandler.
 *
 * NOTE(review): the locals block, blockSender, throttler and nullStream are
 * set up in the elided part of this method; presumably block is derived from
 * cblock -- confirm against the full source.
 *
 * @param cblock the block to scan.
 * @param bytesPerSec bandwidth handed to the throttler for this read.
 * @return the value returned by BlockSender#sendBlock on success, or -1 when
 *         an IOException occurred.
 */
private long scanBlock(ExtendedBlock cblock, long bytesPerSec) {
...
try {
// Create the BlockSender used to read the block.
blockSender = new BlockSender(block, 0, -1,
false, true, true, datanode, null,
CachingStrategy.newDropBehind());
// Throttle the BlockSender to the requested bandwidth.
throttler.setBandwidth(bytesPerSec);
long bytesRead = blockSender.sendBlock(nullStream, null, throttler);
// The read completed without an IOException: report success (null error).
resultHandler.handle(block, null);
metrics.incrBlocksVerified();
return bytesRead;
} catch (IOException e) {
// Hand the failure to the result handler, which decides what to do with it.
resultHandler.handle(block, e);
} finally {
IOUtils.cleanup(null, blockSender);
}
// Reached only after an IOException: count the failure and signal it with -1.
metrics.incrBlockVerificationFailures();
return -1;
}
resultHandler的handle方法中包含了最终的处理逻辑。
/**
 * Handles the outcome of scanning one block.
 *
 * A null exception means the scan succeeded. A failure is not reported when
 * the block is no longer in the dataset or when the exception is a
 * FileNotFoundException. Every other failure is reported as a bad block
 * through the DataNode.
 *
 * @param block the block that was scanned.
 * @param e the exception raised by the scan, or null on success.
 */
public void handle(ExtendedBlock block, IOException e) {
  final FsVolumeSpi volume = scanner.volume;
  if (e == null) {
    // No exception: the block is healthy.
    LOG.trace("Successfully scanned {} on {}", block, volume.getBasePath());
  } else if (!volume.getDataset().contains(block)) {
    // If the block does not exist anymore, then it's not an error.
    LOG.debug("Volume {}: block {} is no longer in the dataset.",
        volume.getBasePath(), block);
  } else if (e instanceof FileNotFoundException) {
    LOG.info("Volume {}: verification failed for {} because of "
        + "FileNotFoundException. This may be due to a race with write.",
        volume.getBasePath(), block);
  } else {
    // Any other exception is treated as a bad block.
    LOG.warn("Reporting bad " + block + " with volume "
        + volume.getBasePath(), e);
    try {
      // Report the bad block to the NameNode.
      scanner.datanode.reportBadBlocks(block, volume);
    } catch (IOException reportFailure) {
      LOG.warn("Cannot report bad block " + block, reportFailure);
    }
  }
}
}
handle方法根据BlockSender读块时是否抛出IO异常来判断坏块：没有异常则视为正常块；如果块已不在数据集中，或者异常是FileNotFoundException（可能是与写操作竞争所致），则不视为坏块；其余情况都认为是坏块，并向NameNode汇报。