以hadoop fs -chown data_sum:data_sum /test/input为例,讲解EditLog的执行过程。
当客户端执行以上命令时,通过RPC调用,服务器端执行NameNodeRpcServer.setOwner方法,代码如下:
@Override // ClientProtocol public void setOwner(String src, String username, String groupname) throws IOException { namesystem.setOwner(src, username, groupname); }
/**
 * Changes the owner and/or group of {@code src} by delegating to
 * setOwnerInt. When the delegate rejects the request with an
 * AccessControlException, an audit event is logged before the exception is
 * rethrown unchanged.
 *
 * @param src path whose ownership is changed
 * @param username new owner name
 * @param group new group name
 */
void setOwner(String src, String username, String group)
    throws AccessControlException, FileNotFoundException, SafeModeException,
    UnresolvedLinkException, IOException {
  try {
    setOwnerInt(src, username, group);
  } catch (AccessControlException denied) {
    // Record the rejected attempt, then surface the original exception.
    logAuditEvent(false, "setOwner", src);
    throw denied;
  }
}
setOwnerInt相关代码如下:
private void setOwnerInt(final String srcArg, String username, String group) throws AccessControlException, FileNotFoundException, SafeModeException, UnresolvedLinkException, IOException { String src = srcArg; HdfsFileStatus resultingStat = null; FSPermissionChecker pc = getPermissionChecker(); checkOperation(OperationCategory.WRITE); byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src); writeLock(); try { checkOperation(OperationCategory.WRITE); checkNameNodeSafeMode("Cannot set owner for " + src); src = resolvePath(src, pathComponents); checkOwner(pc, src); if (!pc.isSuperUser()) { if (username != null && !pc.getUser().equals(username)) { throw new AccessControlException("Non-super user cannot change owner"); } if (group != null && !pc.containsGroup(group)) { throw new AccessControlException("User does not belong to " + group); } } dir.setOwner(src, username, group); <span style="color:#FF0000;"> getEditLog().logSetOwner(src, username, group);</span> resultingStat = getAuditFileInfo(src, false); } finally { writeUnlock(); } <span style="color:#FF0000;"> getEditLog().logSync();</span> logAuditEvent(true, "setOwner", srcArg, null, resultingStat); }
以上有两个地方重要:一个是getEditLog().logSetOwner(src, username, group);,用于把日志记入缓冲区;另一个是getEditLog().logSync();,用于把日志刷到持久设备上。输出日志和持久化日志的分离(getEditLog().logSync()不在writeLock()/writeUnlock()保护的临界区内,即释放写锁之后,其它线程可以再次获得写锁,而不必等logSync完成),是Hadoop日志系统的设计核心,也是高效的日志设计方法:并不是每个操作都要单独进行一次logSync(),一次sync可能会同步一批操作的日志,从而提高了效率。但是最后的getEditLog().logSync();保证了日志同步之后,操作才会返回。
logSetOwner的方法内容:
/**
 * Records an ownership change in the edit log: takes the calling thread's
 * cached SetOwnerOp, fills in its fields and hands it to logEdit.
 *
 * @param src path whose ownership changed
 * @param username new owner name
 * @param groupname new group name
 */
void logSetOwner(String src, String username, String groupname) {
  final OpInstanceCache threadCache = cache.get();
  final SetOwnerOp op = SetOwnerOp.getInstance(threadCache);
  // Each setter returns the same instance, so separate calls equal a chained call.
  op.setSource(src);
  op.setUser(username);
  op.setGroup(groupname);
  logEdit(op);
}
/**
 * Thread-local holder of an {@link OpInstanceCache}. The first access from a
 * thread creates a fresh OpInstanceCache for that thread via initialValue().
 */
private final ThreadLocal<OpInstanceCache> cache =
    new ThreadLocal<OpInstanceCache>() {
  @Override
  protected OpInstanceCache initialValue() {
    return new OpInstanceCache();
  }
};
/**
 * Holds one pre-built {@link FSEditLogOp} instance per op code. All instances
 * are created eagerly in the constructor and looked up by op code afterwards.
 */
public static final class OpInstanceCache {
  /** Op code to pre-built op instance. */
  private final EnumMap<FSEditLogOpCodes, FSEditLogOp> opsByCode =
      new EnumMap<FSEditLogOpCodes, FSEditLogOp>(FSEditLogOpCodes.class);

  /** Registers one instance for every op code listed below. */
  public OpInstanceCache() {
    opsByCode.put(OP_ADD, new AddOp());
    opsByCode.put(OP_CLOSE, new CloseOp());
    opsByCode.put(OP_SET_REPLICATION, new SetReplicationOp());
    opsByCode.put(OP_CONCAT_DELETE, new ConcatDeleteOp());
    opsByCode.put(OP_RENAME_OLD, new RenameOldOp());
    opsByCode.put(OP_DELETE, new DeleteOp());
    opsByCode.put(OP_MKDIR, new MkdirOp());
    opsByCode.put(OP_SET_GENSTAMP_V1, new SetGenstampV1Op());
    opsByCode.put(OP_SET_PERMISSIONS, new SetPermissionsOp());
    opsByCode.put(OP_SET_OWNER, new SetOwnerOp());
    opsByCode.put(OP_SET_NS_QUOTA, new SetNSQuotaOp());
    opsByCode.put(OP_CLEAR_NS_QUOTA, new ClearNSQuotaOp());
    opsByCode.put(OP_SET_QUOTA, new SetQuotaOp());
    opsByCode.put(OP_TIMES, new TimesOp());
    opsByCode.put(OP_SYMLINK, new SymlinkOp());
    opsByCode.put(OP_RENAME, new RenameOp());
    opsByCode.put(OP_REASSIGN_LEASE, new ReassignLeaseOp());
    opsByCode.put(OP_GET_DELEGATION_TOKEN, new GetDelegationTokenOp());
    opsByCode.put(OP_RENEW_DELEGATION_TOKEN, new RenewDelegationTokenOp());
    opsByCode.put(OP_CANCEL_DELEGATION_TOKEN, new CancelDelegationTokenOp());
    opsByCode.put(OP_UPDATE_MASTER_KEY, new UpdateMasterKeyOp());
    opsByCode.put(OP_START_LOG_SEGMENT, new LogSegmentOp(OP_START_LOG_SEGMENT));
    opsByCode.put(OP_END_LOG_SEGMENT, new LogSegmentOp(OP_END_LOG_SEGMENT));
    opsByCode.put(OP_UPDATE_BLOCKS, new UpdateBlocksOp());
    opsByCode.put(OP_ALLOW_SNAPSHOT, new AllowSnapshotOp());
    opsByCode.put(OP_DISALLOW_SNAPSHOT, new DisallowSnapshotOp());
    opsByCode.put(OP_CREATE_SNAPSHOT, new CreateSnapshotOp());
    opsByCode.put(OP_DELETE_SNAPSHOT, new DeleteSnapshotOp());
    opsByCode.put(OP_RENAME_SNAPSHOT, new RenameSnapshotOp());
    opsByCode.put(OP_SET_GENSTAMP_V2, new SetGenstampV2Op());
    opsByCode.put(OP_ALLOCATE_BLOCK_ID, new AllocateBlockIdOp());
    opsByCode.put(OP_ADD_BLOCK, new AddBlockOp());
    opsByCode.put(OP_ADD_CACHE_DIRECTIVE, new AddCacheDirectiveInfoOp());
    opsByCode.put(OP_MODIFY_CACHE_DIRECTIVE, new ModifyCacheDirectiveInfoOp());
    opsByCode.put(OP_REMOVE_CACHE_DIRECTIVE, new RemoveCacheDirectiveInfoOp());
    opsByCode.put(OP_ADD_CACHE_POOL, new AddCachePoolOp());
    opsByCode.put(OP_MODIFY_CACHE_POOL, new ModifyCachePoolOp());
    opsByCode.put(OP_REMOVE_CACHE_POOL, new RemoveCachePoolOp());
    opsByCode.put(OP_SET_ACL, new SetAclOp());
    opsByCode.put(OP_ROLLING_UPGRADE_START,
        new RollingUpgradeOp(OP_ROLLING_UPGRADE_START, "start"));
    opsByCode.put(OP_ROLLING_UPGRADE_FINALIZE,
        new RollingUpgradeOp(OP_ROLLING_UPGRADE_FINALIZE, "finalize"));
    opsByCode.put(OP_SET_XATTR, new SetXAttrOp());
    opsByCode.put(OP_REMOVE_XATTR, new RemoveXAttrOp());
    opsByCode.put(OP_SET_STORAGE_POLICY, new SetStoragePolicyOp());
  }

  /**
   * @param opcode op code to look up
   * @return the instance registered for {@code opcode}, or null if none was
   *         registered
   */
  public FSEditLogOp get(FSEditLogOpCodes opcode) {
    return opsByCode.get(opcode);
  }
}
FSEditLogOpCodes是一个Enum对象,定义了所有可能的日志类型:
@InterfaceAudience.Private @InterfaceStability.Unstable public enum FSEditLogOpCodes { // last op code in file OP_ADD ((byte) 0), OP_RENAME_OLD ((byte) 1), // deprecated operation OP_DELETE ((byte) 2), OP_MKDIR ((byte) 3), OP_SET_REPLICATION ((byte) 4), @Deprecated OP_DATANODE_ADD ((byte) 5), // obsolete @Deprecated OP_DATANODE_REMOVE((byte) 6), // obsolete OP_SET_PERMISSIONS ((byte) 7), OP_SET_OWNER ((byte) 8), OP_CLOSE ((byte) 9), OP_SET_GENSTAMP_V1 ((byte) 10), OP_SET_NS_QUOTA ((byte) 11), // obsolete OP_CLEAR_NS_QUOTA ((byte) 12), // obsolete OP_TIMES ((byte) 13), // set atime, mtime OP_SET_QUOTA ((byte) 14), OP_RENAME ((byte) 15), // filecontext rename OP_CONCAT_DELETE ((byte) 16), // concat files OP_SYMLINK ((byte) 17), OP_GET_DELEGATION_TOKEN ((byte) 18), OP_RENEW_DELEGATION_TOKEN ((byte) 19), OP_CANCEL_DELEGATION_TOKEN ((byte) 20), OP_UPDATE_MASTER_KEY ((byte) 21), OP_REASSIGN_LEASE ((byte) 22), OP_END_LOG_SEGMENT ((byte) 23), OP_START_LOG_SEGMENT ((byte) 24), OP_UPDATE_BLOCKS ((byte) 25), OP_CREATE_SNAPSHOT ((byte) 26), OP_DELETE_SNAPSHOT ((byte) 27), OP_RENAME_SNAPSHOT ((byte) 28), OP_ALLOW_SNAPSHOT ((byte) 29), OP_DISALLOW_SNAPSHOT ((byte) 30), OP_SET_GENSTAMP_V2 ((byte) 31), OP_ALLOCATE_BLOCK_ID ((byte) 32), OP_ADD_BLOCK ((byte) 33), OP_ADD_CACHE_DIRECTIVE ((byte) 34), OP_REMOVE_CACHE_DIRECTIVE ((byte) 35), OP_ADD_CACHE_POOL ((byte) 36), OP_MODIFY_CACHE_POOL ((byte) 37), OP_REMOVE_CACHE_POOL ((byte) 38), OP_MODIFY_CACHE_DIRECTIVE ((byte) 39), OP_SET_ACL ((byte) 40), OP_ROLLING_UPGRADE_START ((byte) 41), OP_ROLLING_UPGRADE_FINALIZE ((byte) 42), OP_SET_XATTR ((byte) 43), OP_REMOVE_XATTR ((byte) 44), OP_SET_STORAGE_POLICY ((byte) 45), // Note that the current range of the valid OP code is 0~127 OP_INVALID ((byte) -1); private final byte opCode; /** * Constructor * * @param opCode byte value of constructed enum */ FSEditLogOpCodes(byte opCode) { this.opCode = opCode; } /** * return the byte value of the enum * * @return the byte value of the enum */ public 
byte getOpCode() { return opCode; } private static final FSEditLogOpCodes[] VALUES; static { byte max = 0; for (FSEditLogOpCodes code : FSEditLogOpCodes.values()) { if (code.getOpCode() > max) { max = code.getOpCode(); } } VALUES = new FSEditLogOpCodes[max + 1]; for (FSEditLogOpCodes code : FSEditLogOpCodes.values()) { if (code.getOpCode() >= 0) { VALUES[code.getOpCode()] = code; } } } /** * Converts byte to FSEditLogOpCodes enum value * * @param opCode get enum for this opCode * @return enum with byte value of opCode */ public static FSEditLogOpCodes fromByte(byte opCode) { if (opCode >= 0 && opCode < VALUES.length) { return VALUES[opCode]; } return opCode == -1 ? OP_INVALID : null; } }
FSEditLogOp是一个抽象类,部分代码如下:
public abstract class FSEditLogOp { public final FSEditLogOpCodes opCode; long txid = HdfsConstants.INVALID_TXID; byte[] rpcClientId = RpcConstants.DUMMY_CLIENT_ID; int rpcCallId = RpcConstants.INVALID_CALL_ID;
static class SetOwnerOp extends FSEditLogOp { String src; String username; String groupname; private SetOwnerOp() { super(OP_SET_OWNER); } static SetOwnerOp getInstance(OpInstanceCache cache) { return (SetOwnerOp)cache.get(OP_SET_OWNER); } SetOwnerOp setSource(String src) { this.src = src; return this; } SetOwnerOp setUser(String username) { this.username = username; return this; } SetOwnerOp setGroup(String groupname) { this.groupname = groupname; return this; } @Override public void writeFields(DataOutputStream out) throws IOException { FSImageSerialization.writeString(src, out); FSImageSerialization.writeString(username == null ? "" : username, out); FSImageSerialization.writeString(groupname == null ? "" : groupname, out); } @Override void readFields(DataInputStream in, int logVersion) throws IOException { this.src = FSImageSerialization.readString(in); this.username = FSImageSerialization.readString_EmptyAsNull(in); this.groupname = FSImageSerialization.readString_EmptyAsNull(in); } @Override public String toString() { StringBuilder builder = new StringBuilder(); builder.append("SetOwnerOp [src="); builder.append(src); builder.append(", username="); builder.append(username); builder.append(", groupname="); builder.append(groupname); builder.append(", opCode="); builder.append(opCode); builder.append(", txid="); builder.append(txid); builder.append("]"); return builder.toString(); } @Override protected void toXml(ContentHandler contentHandler) throws SAXException { XMLUtils.addSaxString(contentHandler, "SRC", src); if (username != null) { XMLUtils.addSaxString(contentHandler, "USERNAME", username); } if (groupname != null) { XMLUtils.addSaxString(contentHandler, "GROUPNAME", groupname); } } @Override void fromXml(Stanza st) throws InvalidXmlException { this.src = st.getValue("SRC"); this.username = (st.hasChildren("USERNAME")) ? st.getValue("USERNAME") : null; this.groupname = (st.hasChildren("GROUPNAME")) ? st.getValue("GROUPNAME") : null; } }
回到logSetOwner方法:它先从线程本地缓存(cache,即OpInstanceCache)里取得FSEditLogOp实例:
void logSetOwner(String src, String username, String groupname) { SetOwnerOp op = SetOwnerOp.getInstance(cache.get()) .setSource(src) .setUser(username) .setGroup(groupname); logEdit(op); }
SetOwnerOp.getInstance方法,把FSEditLogOp转为SetOwnerOp对象,代码如下:
/**
 * Looks up the op stored in {@code cache} under OP_SET_OWNER and casts it to
 * SetOwnerOp. No new op object is created by this method.
 *
 * @param cache cache to read the op instance from
 * @return the cached op, cast to SetOwnerOp
 */
static SetOwnerOp getInstance(OpInstanceCache cache) {
  return (SetOwnerOp)cache.get(OP_SET_OWNER);
}
void logEdit(final FSEditLogOp op) { synchronized (this) { assert isOpenForWrite() : "bad state: " + state; // wait if an automatic sync is scheduled waitIfAutoSyncScheduled(); long start = beginTransaction(); op.setTransactionId(txid); try { editLogStream.write(op); } catch (IOException ex) { // All journals failed, it is handled in logSync. } endTransaction(start); // check if it is time to schedule an automatic sync if (!shouldForceSync()) { return; } isAutoSyncScheduled = true; } // sync buffered edit log entries to persistent store logSync(); }
isOpenForWrite判断当前状态是否在write模式。
/**
 * Reports whether the log is in write mode. Both "inside a segment" and
 * "between segments" count, so an open segment is not required.
 *
 * @return true if the log is currently open in write mode
 */
synchronized boolean isOpenForWrite() {
  if (state == State.IN_SEGMENT) {
    return true;
  }
  return state == State.BETWEEN_LOG_SEGMENTS;
}
waitIfAutoSyncScheduled代码如下:
/**
 * Wait if an automatic sync is scheduled.
 *
 * Blocks on this object's monitor while isAutoSyncScheduled is true,
 * re-checking the flag after each wait(1000) returns. If the thread is
 * interrupted while waiting, the loop is abandoned and the method returns
 * without reporting the interruption.
 */
synchronized void waitIfAutoSyncScheduled() {
  try {
    while (isAutoSyncScheduled) {
      this.wait(1000);
    }
  } catch (InterruptedException e) {
    // Deliberately empty in the quoted source: interruption just ends the wait.
  }
}
beginTransaction,用于生成txid,代码如下:
private long beginTransaction() { assert Thread.holdsLock(this); // get a new transactionId txid++; // // record the transactionId when new data was written to the edits log // TransactionId id = myTransactionId.get(); id.txid = txid; return now(); }
/**
 * Per-thread TransactionId holder. A thread's first access yields a
 * TransactionId constructed with Long.MAX_VALUE.
 */
private static final ThreadLocal<TransactionId> myTransactionId =
    new ThreadLocal<TransactionId>() {
  @Override
  protected synchronized TransactionId initialValue() {
    return new TransactionId(Long.MAX_VALUE);
  }
};
write的方法如下:
/**
 * Writes {@code op} to the current stream of every active journal. Failures
 * are handled by mapJournalsAndReportErrors.
 *
 * @param op the operation to write
 * @throws IOException propagated from mapJournalsAndReportErrors
 */
@Override
public void write(final FSEditLogOp op) throws IOException {
  final JournalClosure writeToActiveStream = new JournalClosure() {
    @Override
    public void apply(JournalAndStream jas) throws IOException {
      if (!jas.isActive()) {
        return;
      }
      jas.getCurrentStream().write(op);
    }
  };
  mapJournalsAndReportErrors(writeToActiveStream, "write op");
}
/**
 * Callback that encapsulates one operation to be applied to each journal in
 * turn. See {@link JournalSet#mapJournalsAndReportErrors} for the driver.
 */
private interface JournalClosure {
  /**
   * Performs the operation on a single journal.
   *
   * @param jas journal (and its stream) to operate on
   * @throws IOException if the operation fails for this journal
   */
  void apply(JournalAndStream jas) throws IOException;
}
/** * Apply the given operation across all of the journal managers, disabling * any for which the closure throws an IOException. * @param closure {@link JournalClosure} object encapsulating the operation. * @param status message used for logging errors (e.g. "opening journal") * @throws IOException If the operation fails on all the journals. */ private void mapJournalsAndReportErrors( JournalClosure closure, String status) throws IOException{ List<JournalAndStream> badJAS = Lists.newLinkedList(); for (JournalAndStream jas : journals) { try { closure.apply(jas); } catch (Throwable t) { if (jas.isRequired()) { final String msg = "Error: " + status + " failed for required journal (" + jas + ")"; LOG.fatal(msg, t); // If we fail on *any* of the required journals, then we must not // continue on any of the other journals. Abort them to ensure that // retry behavior doesn't allow them to keep going in any way. abortAllJournals(); // the current policy is to shutdown the NN on errors to shared edits // dir. There are many code paths to shared edits failures - syncs, // roll of edits etc. All of them go through this common function // where the isRequired() check is made. Applying exit policy here // to catch all code paths. terminate(1, msg); } else { LOG.error("Error: " + status + " failed for (journal " + jas + ")", t); badJAS.add(jas); } } } disableAndReportErrorOnJournals(badJAS); if (!NameNodeResourcePolicy.areResourcesAvailable(journals, minimumRedundantJournals)) { String message = status + " failed for too many journals"; LOG.error("Error: " + message); throw new IOException(message); } }最终用调回JournalCl osure.apply方法:
/**
 * Writes the captured {@code op} to the journal's current stream; inactive
 * journals are skipped.
 */
@Override
public void apply(JournalAndStream jas) throws IOException {
  if (!jas.isActive()) {
    return;
  }
  jas.getCurrentStream().write(op);
}
/**
 * Hands the op to this stream's double buffer (doubleBuf.writeOp).
 *
 * @param op the operation to write
 * @throws IOException propagated from doubleBuf.writeOp
 */
@Override
public void write(FSEditLogOp op) throws IOException {
  doubleBuf.writeOp(op);
}
doubleBuf是EditsDoubleBuffer的实例。EditsDoubleBuffer类的代码如下:
/** * A double-buffer for edits. New edits are written into the first buffer * while the second is available to be flushed. Each time the double-buffer * is flushed, the two internal buffers are swapped. This allows edits * to progress concurrently to flushes without allocating new buffers each * time. */ @InterfaceAudience.Private public class EditsDoubleBuffer { private TxnBuffer bufCurrent; // current buffer for writing private TxnBuffer bufReady; // buffer ready for flushing private final int initBufferSize; public EditsDoubleBuffer(int defaultBufferSize) { initBufferSize = defaultBufferSize; bufCurrent = new TxnBuffer(initBufferSize); bufReady = new TxnBuffer(initBufferSize); } public void writeOp(FSEditLogOp op) throws IOException { bufCurrent.writeOp(op); } public void writeRaw(byte[] bytes, int offset, int length) throws IOException { bufCurrent.write(bytes, offset, length); } public void close() throws IOException { Preconditions.checkNotNull(bufCurrent); Preconditions.checkNotNull(bufReady); int bufSize = bufCurrent.size(); if (bufSize != 0) { throw new IOException("FSEditStream has " + bufSize + " bytes still to be flushed and cannot be closed."); } IOUtils.cleanup(null, bufCurrent, bufReady); bufCurrent = bufReady = null; } public void setReadyToFlush() { assert isFlushed() : "previous data not flushed yet"; TxnBuffer tmp = bufReady; bufReady = bufCurrent; bufCurrent = tmp; } /** * Writes the content of the "ready" buffer to the given output stream, * and resets it. Does not swap any buffers. 
*/ public void flushTo(OutputStream out) throws IOException { bufReady.writeTo(out); // write data to file bufReady.reset(); // erase all data in the buffer } public boolean shouldForceSync() { return bufCurrent.size() >= initBufferSize; } DataOutputBuffer getReadyBuf() { return bufReady; } DataOutputBuffer getCurrentBuf() { return bufCurrent; } public boolean isFlushed() { return bufReady.size() == 0; } public int countBufferedBytes() { return bufReady.size() + bufCurrent.size(); } /** * @return the transaction ID of the first transaction ready to be flushed */ public long getFirstReadyTxId() { assert bufReady.firstTxId > 0; return bufReady.firstTxId; } /** * @return the number of transactions that are ready to be flushed */ public int countReadyTxns() { return bufReady.numTxns; } /** * @return the number of bytes that are ready to be flushed */ public int countReadyBytes() { return bufReady.size(); } private static class TxnBuffer extends DataOutputBuffer { long firstTxId; int numTxns; private final Writer writer; public TxnBuffer(int initBufferSize) { super(initBufferSize); writer = new FSEditLogOp.Writer(this); reset(); } public void writeOp(FSEditLogOp op) throws IOException { if (firstTxId == HdfsConstants.INVALID_TXID) { firstTxId = op.txid; } else { assert op.txid > firstTxId; } writer.writeOp(op); numTxns++; } @Override public DataOutputBuffer reset() { super.reset(); firstTxId = HdfsConstants.INVALID_TXID; numTxns = 0; return this; } } }
/**
 * Serializes one op into this buffer. The first op written after a reset
 * fixes firstTxId; every later op is asserted to have a larger txid.
 *
 * @param op the operation to buffer
 * @throws IOException propagated from the writer
 */
public void writeOp(FSEditLogOp op) throws IOException {
  final boolean alreadyHasFirst = firstTxId != HdfsConstants.INVALID_TXID;
  if (alreadyHasFirst) {
    assert op.txid > firstTxId;
  } else {
    firstTxId = op.txid;
  }
  writer.writeOp(op);
  numTxns++;
}
/**
 * Write an operation to the output stream.
 *
 * Layout written, in order: 1-byte op code, 4-byte length, 8-byte txid, the
 * op's own fields, then a 4-byte checksum computed over everything from the
 * op code up to the end of the fields.
 *
 * @param op The operation to write
 * @throws IOException if an error occurs during writing.
 */
public void writeOp(FSEditLogOp op) throws IOException {
  int start = buf.getLength();
  // write the op code first to make padding and terminator verification
  // work
  buf.writeByte(op.opCode.getOpCode());
  buf.writeInt(0); // write 0 for the length first
  buf.writeLong(op.txid);
  op.writeFields(buf);
  int end = buf.getLength();

  // write the length back: content of the op + 4 bytes checksum - op_code
  int length = end - start - 1;
  buf.writeInt(length, start + 1);

  checksum.reset();
  checksum.update(buf.getData(), start, end-start);
  int sum = (int)checksum.getValue();
  buf.writeInt(sum);
}
} // closes the enclosing class, whose header is not part of this excerpt
/**
 * Serializes src, username and groupname, in that order. A null username or
 * groupname is written as the empty string.
 *
 * @param out stream to write to
 * @throws IOException propagated from FSImageSerialization.writeString
 */
@Override
public void writeFields(DataOutputStream out) throws IOException {
  final String userOrEmpty = (username != null) ? username : "";
  final String groupOrEmpty = (groupname != null) ? groupname : "";
  FSImageSerialization.writeString(src, out);
  FSImageSerialization.writeString(userOrEmpty, out);
  FSImageSerialization.writeString(groupOrEmpty, out);
}
FSImageSerialization.writeString代码如下:
/**
 * Writes {@code str} to {@code out} using the calling thread's reusable
 * DeprecatedUTF8 instance instead of allocating a new one.
 *
 * @param str string to write
 * @param out destination
 * @throws IOException propagated from DeprecatedUTF8.write
 */
@SuppressWarnings("deprecation")
public static void writeString(String str, DataOutput out) throws IOException {
  final TLData threadLocals = TL_DATA.get();
  final DeprecatedUTF8 scratch = threadLocals.U_STR;
  scratch.set(str);
  scratch.write(out);
}
/**
 * Thread-local scratch objects. They are reused to reduce allocation, and
 * kept per thread because image-saving is multithreaded and the methods of
 * this class must therefore be thread-safe.
 */
private static final ThreadLocal<TLData> TL_DATA =
    new ThreadLocal<TLData>() {
      @Override
      protected TLData initialValue() {
        return new TLData();
      }
    };
TLData对象的定义如下:
/**
 * Plain holder ("struct") bundling the reusable objects that are stored per
 * thread. Every field is created once, when the holder is constructed.
 */
private static final class TLData {
  final DeprecatedUTF8 U_STR = new DeprecatedUTF8();
  final ShortWritable U_SHORT = new ShortWritable();
  final IntWritable U_INT = new IntWritable();
  final LongWritable U_LONG = new LongWritable();
  final FsPermission FILE_PERM = new FsPermission((short) 0);
  final BooleanWritable U_BOOLEAN = new BooleanWritable();
}
/**
 * Thin subclass of {@link org.apache.hadoop.io.UTF8}, to be used only where
 * UTF8 is unavoidable. Callers of this class do not need their own
 * "@SuppressWarnings" annotation to silence javac; the deprecation is implied
 * by the class name instead.
 *
 * Treat this as package-private to HDFS.
 */
@InterfaceAudience.Private
@SuppressWarnings("deprecation")
public class DeprecatedUTF8 extends org.apache.hadoop.io.UTF8 {

  /** Creates an instance through the superclass no-argument constructor. */
  public DeprecatedUTF8() {
  }

  /** Construct from a given string. */
  public DeprecatedUTF8(String string) {
    super(string);
  }

  /** Construct from another DeprecatedUTF8 instance. */
  public DeprecatedUTF8(DeprecatedUTF8 utf8) {
    super(utf8);
  }

  /* The two static helpers below are the most commonly used methods; they are
   * wrapped here so that editors do not complain about the deprecation.
   */

  /** Delegates to {@link org.apache.hadoop.io.UTF8#readString}. */
  public static String readString(DataInput in) throws IOException {
    return org.apache.hadoop.io.UTF8.readString(in);
  }

  /** Delegates to {@link org.apache.hadoop.io.UTF8#writeString}. */
  public static int writeString(DataOutput out, String s) throws IOException {
    return org.apache.hadoop.io.UTF8.writeString(out, s);
  }
}
void logEdit(final FSEditLogOp op) { synchronized (this) { assert isOpenForWrite() : "bad state: " + state; // wait if an automatic sync is scheduled waitIfAutoSyncScheduled(); long start = beginTransaction(); op.setTransactionId(txid); try { editLogStream.write(op); } catch (IOException ex) { // All journals failed, it is handled in logSync. } endTransaction(start); // check if it is time to schedule an automatic sync if (!shouldForceSync()) { return; } isAutoSyncScheduled = true; } // sync buffered edit log entries to persistent store logSync(); }
endTransaction主要用于统计信息,代码如下: private void endTransaction(long start) { assert Thread.holdsLock(this); // update statistics long end = now(); numTransactions++; totalTimeTransactions += (end-start); if (metrics != null) // Metrics is non-null only when used inside name node metrics.addTransaction(end-start); }
/**
 * Asks the edit log stream whether a sync should be forced now.
 *
 * @return whatever editLogStream.shouldForceSync() reports
 */
private boolean shouldForceSync() {
  return editLogStream.shouldForceSync();
}
然后调用JournalSet.shouldForceSync()
/**
 * @return true as soon as any active journal's current stream reports that a
 *         sync should be forced; false if none does
 */
@Override
public boolean shouldForceSync() {
  for (JournalAndStream js : journals) {
    if (!js.isActive()) {
      continue;
    }
    if (js.getCurrentStream().shouldForceSync()) {
      return true;
    }
  }
  return false;
}
js.getCurrentStream().shouldForceSync方法如下:
/** * @return true if the number of buffered data exceeds the initial buffer size */ @Override public boolean shouldForceSync() { return doubleBuf.shouldForceSync(); }
EditsDoubleBuffer.shouldForceSync代码如下:
/**
 * @return true once the current (writing) buffer holds at least
 *         initBufferSize bytes
 */
public boolean shouldForceSync() {
  return bufCurrent.size() >= initBufferSize;
}
private void setOwnerInt(final String srcArg, String username, String group) throws AccessControlException, FileNotFoundException, SafeModeException, UnresolvedLinkException, IOException { String src = srcArg; HdfsFileStatus resultingStat = null; FSPermissionChecker pc = getPermissionChecker(); checkOperation(OperationCategory.WRITE); byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src); writeLock(); try { checkOperation(OperationCategory.WRITE); checkNameNodeSafeMode("Cannot set owner for " + src); src = resolvePath(src, pathComponents); checkOwner(pc, src); if (!pc.isSuperUser()) { if (username != null && !pc.getUser().equals(username)) { throw new AccessControlException("Non-super user cannot change owner"); } if (group != null && !pc.containsGroup(group)) { throw new AccessControlException("User does not belong to " + group); } } dir.setOwner(src, username, group); getEditLog().logSetOwner(src, username, group); resultingStat = getAuditFileInfo(src, false); } finally { writeUnlock(); } <span style="color:#FF0000;"> getEditLog().logSync();</span> logAuditEvent(true, "setOwner", srcArg, null, resultingStat); }
回到logEdit:如果shouldForceSync()返回true,则设置isAutoSyncScheduled = true,并在退出synchronized块之后直接调用logSync。设置了isAutoSyncScheduled = true之后,其它线程调用logEdit方法时,会在waitIfAutoSyncScheduled中等待;logSync代码如下:
/** * Sync all modifications done by this thread. * * The internal concurrency design of this class is as follows: * - Log items are written synchronized into an in-memory buffer, * and each assigned a transaction ID. * - When a thread (client) would like to sync all of its edits, logSync() * uses a ThreadLocal transaction ID to determine what edit number must * be synced to. * - The isSyncRunning volatile boolean tracks whether a sync is currently * under progress. * * The data is double-buffered within each edit log implementation so that * in-memory writing can occur in parallel with the on-disk writing. * * Each sync occurs in three steps: * 1. synchronized, it swaps the double buffer and sets the isSyncRunning * flag. * 2. unsynchronized, it flushes the data to storage * 3. synchronized, it resets the flag and notifies anyone waiting on the * sync. * * The lack of synchronization on step 2 allows other threads to continue * to write into the memory buffer while the sync is in progress. * Because this step is unsynchronized, actions that need to avoid * concurrency with sync() should be synchronized and also call * waitForSyncToFinish() before assuming they are running alone. */ public void logSync() { long syncStart = 0; // Fetch the transactionId of this thread. 
long mytxid = myTransactionId.get().txid; boolean sync = false; try { EditLogOutputStream logStream = null; synchronized (this) { try { printStatistics(false); // if somebody is already syncing, then wait,解释为什么出现这种现象,因为第一个线程进入此方法之后,设置isSyncRunning = true; while (mytxid > synctxid && isSyncRunning) { try { wait(1000); } catch (InterruptedException ie) { } } // // If this transaction was already flushed, then nothing to do // if (mytxid <= synctxid) { numTransactionsBatchedInSync++; if (metrics != null) { // Metrics is non-null only when used inside name node metrics.incrTransactionsBatchedInSync(); } return; } // now, this thread will do the sync syncStart = txid; isSyncRunning = true; sync = true; // swap buffers try { if (journalSet.isEmpty()) { throw new IOException("No journals available to flush"); } editLogStream.setReadyToFlush(); } catch (IOException e) { final String msg = "Could not sync enough journals to persistent storage " + "due to " + e.getMessage() + ". " + "Unsynced transactions: " + (txid - synctxid); LOG.fatal(msg, new Exception()); synchronized(journalSetLock) { IOUtils.cleanup(LOG, journalSet); } terminate(1, msg); } } finally { // Prevent RuntimeException from blocking other log edit write doneWithAutoSyncScheduling(); } //editLogStream may become null, //so store a local variable for flush. logStream = editLogStream; } // do the sync long start = now(); try { if (logStream != null) { logStream.flush(); } } catch (IOException ex) { synchronized (this) { final String msg = "Could not sync enough journals to persistent storage. 
" + "Unsynced transactions: " + (txid - synctxid); LOG.fatal(msg, new Exception()); synchronized(journalSetLock) { IOUtils.cleanup(LOG, journalSet); } terminate(1, msg); } } long elapsed = now() - start; if (metrics != null) { // Metrics non-null only when used inside name node metrics.addSync(elapsed); } } finally { // Prevent RuntimeException from blocking other log edit sync synchronized (this) { if (sync) { synctxid = syncStart; isSyncRunning = false; } this.notifyAll(); } } }
下面依次分析这个方法:
从synchronized (this) {开始,printStatistics(false);是打印统计信息,代码如下:
/**
 * Logs a one-line summary of the edit log counters at INFO level. Unless
 * {@code force} is set, nothing is logged if the previous summary was printed
 * less than 60000 ms ago.
 *
 * @param force print regardless of when the last summary was printed
 */
private void printStatistics(boolean force) {
  long now = now();
  final boolean printedRecently = lastPrintTime + 60000 > now;
  if (printedRecently && !force) {
    return;
  }
  lastPrintTime = now;

  StringBuilder buf = new StringBuilder();
  buf.append("Number of transactions: ").append(numTransactions);
  buf.append(" Total time for transactions(ms): ").append(totalTimeTransactions);
  buf.append(" Number of transactions batched in Syncs: ")
      .append(numTransactionsBatchedInSync);
  buf.append(" Number of syncs: ").append(editLogStream.getNumSync());
  buf.append(" SyncTimes(ms): ").append(journalSet.getSyncTimes());
  LOG.info(buf);
}
// if somebody is already syncing, then wait while (mytxid > synctxid && isSyncRunning) { try { wait(1000); } catch (InterruptedException ie) { } }
下面的代码也是一个优化,如果这个事务已经被flushed,则返回。
// // If this transaction was already flushed, then nothing to do // if (mytxid <= synctxid) { numTransactionsBatchedInSync++; if (metrics != null) { // Metrics is non-null only when used inside name node metrics.incrTransactionsBatchedInSync(); } return; }
syncStart = txid; isSyncRunning = true; sync = true;
try { if (journalSet.isEmpty()) { throw new IOException("No journals available to flush"); } editLogStream.setReadyToFlush(); } catch (IOException e) { final String msg = "Could not sync enough journals to persistent storage " + "due to " + e.getMessage() + ". " + "Unsynced transactions: " + (txid - synctxid); LOG.fatal(msg, new Exception()); synchronized(journalSetLock) { IOUtils.cleanup(LOG, journalSet); } terminate(1, msg); } } finally { // Prevent RuntimeException from blocking other log edit write doneWithAutoSyncScheduling(); }上边的try块中,setReadyToFlush的 代码如下,依次在各Journal上调用闭包类的apply方法。
/**
 * Calls setReadyToFlush() on the current stream of every active journal.
 * Failures are handled by mapJournalsAndReportErrors.
 *
 * @throws IOException propagated from mapJournalsAndReportErrors
 */
@Override
public void setReadyToFlush() throws IOException {
  final JournalClosure markReady = new JournalClosure() {
    @Override
    public void apply(JournalAndStream jas) throws IOException {
      if (!jas.isActive()) {
        return;
      }
      jas.getCurrentStream().setReadyToFlush();
    }
  };
  mapJournalsAndReportErrors(markReady, "setReadyToFlush");
}
mapJournalsAndReportErrors的方法如下:
private void mapJournalsAndReportErrors( JournalClosure closure, String status) throws IOException{ List<JournalAndStream> badJAS = Lists.newLinkedList(); for (JournalAndStream jas : journals) { try { closure.apply(jas); } catch (Throwable t) { if (jas.isRequired()) { final String msg = "Error: " + status + " failed for required journal (" + jas + ")"; LOG.fatal(msg, t); // If we fail on *any* of the required journals, then we must not // continue on any of the other journals. Abort them to ensure that // retry behavior doesn't allow them to keep going in any way. abortAllJournals(); // the current policy is to shutdown the NN on errors to shared edits // dir. There are many code paths to shared edits failures - syncs, // roll of edits etc. All of them go through this common function // where the isRequired() check is made. Applying exit policy here // to catch all code paths. terminate(1, msg); } else { LOG.error("Error: " + status + " failed for (journal " + jas + ")", t); badJAS.add(jas); } } } disableAndReportErrorOnJournals(badJAS); if (!NameNodeResourcePolicy.areResourcesAvailable(journals, minimumRedundantJournals)) { String message = status + " failed for too many journals"; LOG.error("Error: " + message); throw new IOException(message); } }//disableAndReportErrorOnJournals方法如下:
private void disableAndReportErrorOnJournals(List<JournalAndStream> badJournals) { if (badJournals == null || badJournals.isEmpty()) { return; // nothing to do } for (JournalAndStream j : badJournals) { LOG.error("Disabling journal " + j); j.abort(); j.setDisabled(true); } }
/** * All data that has been written to the stream so far will be flushed. New * data can be still written to the stream while flushing is performed. */ @Override public void setReadyToFlush() throws IOException { doubleBuf.setReadyToFlush(); }
doubleBuf的setReadyToFlush()方法如下:
/**
 * Exchanges the two buffers: the buffer that was being written becomes the
 * ready buffer, and the old ready buffer becomes the new writing buffer.
 * Asserts that the old ready buffer has already been flushed.
 */
public void setReadyToFlush() {
  assert isFlushed() : "previous data not flushed yet";
  final TxnBuffer filled = bufCurrent;
  bufCurrent = bufReady;
  bufReady = filled;
}