Taking hadoop fs -chown data_sum:data_sum /test/input as an example, this section walks through how an edit is written to the EditLog.
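For reference, the same operation can be issued programmatically through the public FileSystem API. The sketch below is illustrative (the class name is ours):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

// Equivalent of: hadoop fs -chown data_sum:data_sum /test/input
public class ChownExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration(); // picks up core-site.xml / hdfs-site.xml
    FileSystem fs = FileSystem.get(conf);
    fs.setOwner(new Path("/test/input"), "data_sum", "data_sum");
    fs.close();
  }
}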
When a client executes this command, an RPC call reaches the server side, where NameNodeRpcServer.setOwner runs:
@Override // ClientProtocol
public void setOwner(String src, String username, String groupname)
throws IOException {
namesystem.setOwner(src, username, groupname);
}
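namesystem is the FSNamesystem instance held by the NameNode; its setOwner wraps setOwnerInt with audit logging on failure: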
void setOwner(String src, String username, String group)
throws AccessControlException, FileNotFoundException, SafeModeException,
UnresolvedLinkException, IOException {
try {
setOwnerInt(src, username, group);
} catch (AccessControlException e) {
logAuditEvent(false, "setOwner", src);
throw e;
}
}
The code of setOwnerInt:
private void setOwnerInt(final String srcArg, String username, String group)
throws AccessControlException, FileNotFoundException, SafeModeException,
UnresolvedLinkException, IOException {
String src = srcArg;
HdfsFileStatus resultingStat = null;
FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
writeLock();
try {
checkOperation(OperationCategory.WRITE);
checkNameNodeSafeMode("Cannot set owner for " + src);
src = resolvePath(src, pathComponents);
checkOwner(pc, src);
if (!pc.isSuperUser()) {
if (username != null && !pc.getUser().equals(username)) {
throw new AccessControlException("Non-super user cannot change owner");
}
if (group != null && !pc.containsGroup(group)) {
throw new AccessControlException("User does not belong to " + group);
}
}
dir.setOwner(src, username, group);
getEditLog().logSetOwner(src, username, group);
resultingStat = getAuditFileInfo(src, false);
} finally {
writeUnlock();
}
getEditLog().logSync();
logAuditEvent(true, "setOwner", srcArg, null, resultingStat);
}
Two lines here matter most. getEditLog().logSetOwner(src, username, group) records the edit into an in-memory buffer, while getEditLog().logSync() flushes that buffer to persistent storage. Separating the two is the core of Hadoop's edit-log design: logSync() is called outside the write lock, so once writeUnlock() runs, other threads can acquire the lock and append their own edits without waiting for the flush to finish. Not every operation triggers a physical sync; a single sync can persist a whole batch of buffered edits, which is what makes the design efficient. The trailing getEditLog().logSync() still guarantees that the operation does not return to the client until its edit is durable.
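To make this concrete, here is a minimal stand-alone sketch of the same batching pattern. It is illustrative code, not Hadoop's (all names are ours): edits are appended under the lock, while the expensive flush runs outside it, so a single flush can make many threads' edits durable at once.
import java.io.IOException;

public class BatchedLog {
  private final StringBuilder buffer = new StringBuilder(); // stand-in for the edit buffer
  private long txid = 0;       // last transaction appended to the buffer
  private long synctxid = 0;   // last transaction known to be durable
  private boolean syncRunning = false;

  // Analogous to logEdit: cheap, entirely inside the lock.
  public synchronized long logEdit(String op) {
    buffer.append(op).append('\n');
    return ++txid;
  }

  // Analogous to logSync: one thread flushes, the rest piggyback on its work.
  public void logSync(long mytxid) throws IOException {
    String toFlush;
    long flushUpTo;
    synchronized (this) {
      while (mytxid > synctxid && syncRunning) {
        try { wait(1000); } catch (InterruptedException ignored) { }
      }
      if (mytxid <= synctxid) {
        return; // an earlier thread's flush already covered this transaction
      }
      syncRunning = true;
      flushUpTo = txid;            // this flush will cover every buffered edit
      toFlush = buffer.toString(); // "swap" the buffer while holding the lock
      buffer.setLength(0);
    }
    flushToDisk(toFlush);          // the slow part, deliberately outside the lock
    synchronized (this) {
      synctxid = flushUpTo;
      syncRunning = false;
      notifyAll();                 // waiters re-check: their txids may now be durable
    }
  }

  private void flushToDisk(String data) throws IOException {
    // a real implementation would write the data to a file and fsync it
  }
}
The while/notifyAll handshake here mirrors the isSyncRunning logic we will see in logSync below.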
The body of logSetOwner:
void logSetOwner(String src, String username, String groupname) {
SetOwnerOp op = SetOwnerOp.getInstance(cache.get())
.setSource(src)
.setUser(username)
.setGroup(groupname);
logEdit(op);
}
private final ThreadLocal<OpInstanceCache> cache =
new ThreadLocal<OpInstanceCache>() {
@Override
protected OpInstanceCache initialValue() {
return new OpInstanceCache();
}
};
final public static class OpInstanceCache {
private final EnumMap<FSEditLogOpCodes, FSEditLogOp> inst =
new EnumMap<FSEditLogOpCodes, FSEditLogOp>(FSEditLogOpCodes.class);
public OpInstanceCache() {
inst.put(OP_ADD, new AddOp());
inst.put(OP_CLOSE, new CloseOp());
inst.put(OP_SET_REPLICATION, new SetReplicationOp());
inst.put(OP_CONCAT_DELETE, new ConcatDeleteOp());
inst.put(OP_RENAME_OLD, new RenameOldOp());
inst.put(OP_DELETE, new DeleteOp());
inst.put(OP_MKDIR, new MkdirOp());
inst.put(OP_SET_GENSTAMP_V1, new SetGenstampV1Op());
inst.put(OP_SET_PERMISSIONS, new SetPermissionsOp());
inst.put(OP_SET_OWNER, new SetOwnerOp());
inst.put(OP_SET_NS_QUOTA, new SetNSQuotaOp());
inst.put(OP_CLEAR_NS_QUOTA, new ClearNSQuotaOp());
inst.put(OP_SET_QUOTA, new SetQuotaOp());
inst.put(OP_TIMES, new TimesOp());
inst.put(OP_SYMLINK, new SymlinkOp());
inst.put(OP_RENAME, new RenameOp());
inst.put(OP_REASSIGN_LEASE, new ReassignLeaseOp());
inst.put(OP_GET_DELEGATION_TOKEN, new GetDelegationTokenOp());
inst.put(OP_RENEW_DELEGATION_TOKEN, new RenewDelegationTokenOp());
inst.put(OP_CANCEL_DELEGATION_TOKEN, new CancelDelegationTokenOp());
inst.put(OP_UPDATE_MASTER_KEY, new UpdateMasterKeyOp());
inst.put(OP_START_LOG_SEGMENT, new LogSegmentOp(OP_START_LOG_SEGMENT));
inst.put(OP_END_LOG_SEGMENT, new LogSegmentOp(OP_END_LOG_SEGMENT));
inst.put(OP_UPDATE_BLOCKS, new UpdateBlocksOp());
inst.put(OP_ALLOW_SNAPSHOT, new AllowSnapshotOp());
inst.put(OP_DISALLOW_SNAPSHOT, new DisallowSnapshotOp());
inst.put(OP_CREATE_SNAPSHOT, new CreateSnapshotOp());
inst.put(OP_DELETE_SNAPSHOT, new DeleteSnapshotOp());
inst.put(OP_RENAME_SNAPSHOT, new RenameSnapshotOp());
inst.put(OP_SET_GENSTAMP_V2, new SetGenstampV2Op());
inst.put(OP_ALLOCATE_BLOCK_ID, new AllocateBlockIdOp());
inst.put(OP_ADD_BLOCK, new AddBlockOp());
inst.put(OP_ADD_CACHE_DIRECTIVE,
new AddCacheDirectiveInfoOp());
inst.put(OP_MODIFY_CACHE_DIRECTIVE,
new ModifyCacheDirectiveInfoOp());
inst.put(OP_REMOVE_CACHE_DIRECTIVE,
new RemoveCacheDirectiveInfoOp());
inst.put(OP_ADD_CACHE_POOL, new AddCachePoolOp());
inst.put(OP_MODIFY_CACHE_POOL, new ModifyCachePoolOp());
inst.put(OP_REMOVE_CACHE_POOL, new RemoveCachePoolOp());
inst.put(OP_SET_ACL, new SetAclOp());
inst.put(OP_ROLLING_UPGRADE_START, new RollingUpgradeOp(
OP_ROLLING_UPGRADE_START, "start"));
inst.put(OP_ROLLING_UPGRADE_FINALIZE, new RollingUpgradeOp(
OP_ROLLING_UPGRADE_FINALIZE, "finalize"));
inst.put(OP_SET_XATTR, new SetXAttrOp());
inst.put(OP_REMOVE_XATTR, new RemoveXAttrOp());
inst.put(OP_SET_STORAGE_POLICY, new SetStoragePolicyOp());
}
public FSEditLogOp get(FSEditLogOpCodes opcode) {
return inst.get(opcode);
}
}
FSEditLogOpCodes is an enum defining every possible edit-log operation type:
@InterfaceAudience.Private
@InterfaceStability.Unstable
public enum FSEditLogOpCodes {
// last op code in file
OP_ADD ((byte) 0),
OP_RENAME_OLD ((byte) 1), // deprecated operation
OP_DELETE ((byte) 2),
OP_MKDIR ((byte) 3),
OP_SET_REPLICATION ((byte) 4),
@Deprecated OP_DATANODE_ADD ((byte) 5), // obsolete
@Deprecated OP_DATANODE_REMOVE((byte) 6), // obsolete
OP_SET_PERMISSIONS ((byte) 7),
OP_SET_OWNER ((byte) 8),
OP_CLOSE ((byte) 9),
OP_SET_GENSTAMP_V1 ((byte) 10),
OP_SET_NS_QUOTA ((byte) 11), // obsolete
OP_CLEAR_NS_QUOTA ((byte) 12), // obsolete
OP_TIMES ((byte) 13), // set atime, mtime
OP_SET_QUOTA ((byte) 14),
OP_RENAME ((byte) 15), // filecontext rename
OP_CONCAT_DELETE ((byte) 16), // concat files
OP_SYMLINK ((byte) 17),
OP_GET_DELEGATION_TOKEN ((byte) 18),
OP_RENEW_DELEGATION_TOKEN ((byte) 19),
OP_CANCEL_DELEGATION_TOKEN ((byte) 20),
OP_UPDATE_MASTER_KEY ((byte) 21),
OP_REASSIGN_LEASE ((byte) 22),
OP_END_LOG_SEGMENT ((byte) 23),
OP_START_LOG_SEGMENT ((byte) 24),
OP_UPDATE_BLOCKS ((byte) 25),
OP_CREATE_SNAPSHOT ((byte) 26),
OP_DELETE_SNAPSHOT ((byte) 27),
OP_RENAME_SNAPSHOT ((byte) 28),
OP_ALLOW_SNAPSHOT ((byte) 29),
OP_DISALLOW_SNAPSHOT ((byte) 30),
OP_SET_GENSTAMP_V2 ((byte) 31),
OP_ALLOCATE_BLOCK_ID ((byte) 32),
OP_ADD_BLOCK ((byte) 33),
OP_ADD_CACHE_DIRECTIVE ((byte) 34),
OP_REMOVE_CACHE_DIRECTIVE ((byte) 35),
OP_ADD_CACHE_POOL ((byte) 36),
OP_MODIFY_CACHE_POOL ((byte) 37),
OP_REMOVE_CACHE_POOL ((byte) 38),
OP_MODIFY_CACHE_DIRECTIVE ((byte) 39),
OP_SET_ACL ((byte) 40),
OP_ROLLING_UPGRADE_START ((byte) 41),
OP_ROLLING_UPGRADE_FINALIZE ((byte) 42),
OP_SET_XATTR ((byte) 43),
OP_REMOVE_XATTR ((byte) 44),
OP_SET_STORAGE_POLICY ((byte) 45),
// Note that the current range of the valid OP code is 0~127
OP_INVALID ((byte) -1);
private final byte opCode;
/**
* Constructor
*
* @param opCode byte value of constructed enum
*/
FSEditLogOpCodes(byte opCode) {
this.opCode = opCode;
}
/**
* return the byte value of the enum
*
* @return the byte value of the enum
*/
public byte getOpCode() {
return opCode;
}
private static final FSEditLogOpCodes[] VALUES;
static {
byte max = 0;
for (FSEditLogOpCodes code : FSEditLogOpCodes.values()) {
if (code.getOpCode() > max) {
max = code.getOpCode();
}
}
VALUES = new FSEditLogOpCodes[max + 1];
for (FSEditLogOpCodes code : FSEditLogOpCodes.values()) {
if (code.getOpCode() >= 0) {
VALUES[code.getOpCode()] = code;
}
}
}
/**
* Converts byte to FSEditLogOpCodes enum value
*
* @param opCode get enum for this opCode
* @return enum with byte value of opCode
*/
public static FSEditLogOpCodes fromByte(byte opCode) {
if (opCode >= 0 && opCode < VALUES.length) {
return VALUES[opCode];
}
return opCode == -1 ? OP_INVALID : null;
}
}
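As a quick illustration of the byte encoding (a hypothetical driver class, assuming the enum above is on the classpath):
public class OpCodeRoundTrip {
  public static void main(String[] args) {
    byte b = FSEditLogOpCodes.OP_SET_OWNER.getOpCode();        // 8
    System.out.println(FSEditLogOpCodes.fromByte(b));          // OP_SET_OWNER
    System.out.println(FSEditLogOpCodes.fromByte((byte) -1));  // OP_INVALID
    System.out.println(FSEditLogOpCodes.fromByte((byte) 100)); // null: not a known op code
  }
}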
FSEditLogOp is an abstract class; the parts relevant here, including the SetOwnerOp subclass, are:
public abstract class FSEditLogOp {
public final FSEditLogOpCodes opCode;
long txid = HdfsConstants.INVALID_TXID;
byte[] rpcClientId = RpcConstants.DUMMY_CLIENT_ID;
int rpcCallId = RpcConstants.INVALID_CALL_ID;
static class SetOwnerOp extends FSEditLogOp {
String src;
String username;
String groupname;
private SetOwnerOp() {
super(OP_SET_OWNER);
}
static SetOwnerOp getInstance(OpInstanceCache cache) {
return (SetOwnerOp)cache.get(OP_SET_OWNER);
}
SetOwnerOp setSource(String src) {
this.src = src;
return this;
}
SetOwnerOp setUser(String username) {
this.username = username;
return this;
}
SetOwnerOp setGroup(String groupname) {
this.groupname = groupname;
return this;
}
@Override
public
void writeFields(DataOutputStream out) throws IOException {
FSImageSerialization.writeString(src, out);
FSImageSerialization.writeString(username == null ? "" : username, out);
FSImageSerialization.writeString(groupname == null ? "" : groupname, out);
}
@Override
void readFields(DataInputStream in, int logVersion)
throws IOException {
this.src = FSImageSerialization.readString(in);
this.username = FSImageSerialization.readString_EmptyAsNull(in);
this.groupname = FSImageSerialization.readString_EmptyAsNull(in);
}
@Override
public String toString() {
StringBuilder builder = new StringBuilder();
builder.append("SetOwnerOp [src=");
builder.append(src);
builder.append(", username=");
builder.append(username);
builder.append(", groupname=");
builder.append(groupname);
builder.append(", opCode=");
builder.append(opCode);
builder.append(", txid=");
builder.append(txid);
builder.append("]");
return builder.toString();
}
@Override
protected void toXml(ContentHandler contentHandler) throws SAXException {
XMLUtils.addSaxString(contentHandler, "SRC", src);
if (username != null) {
XMLUtils.addSaxString(contentHandler, "USERNAME", username);
}
if (groupname != null) {
XMLUtils.addSaxString(contentHandler, "GROUPNAME", groupname);
}
}
@Override void fromXml(Stanza st) throws InvalidXmlException {
this.src = st.getValue("SRC");
this.username = (st.hasChildren("USERNAME")) ?
st.getValue("USERNAME") : null;
this.groupname = (st.hasChildren("GROUPNAME")) ?
st.getValue("GROUPNAME") : null;
}
}
Back to logSetOwner: it obtains a reusable FSEditLogOp instance from the thread-local cache:
void logSetOwner(String src, String username, String groupname) {
SetOwnerOp op = SetOwnerOp.getInstance(cache.get())
.setSource(src)
.setUser(username)
.setGroup(groupname);
logEdit(op);
}
SetOwnerOp.getInstance looks up the cached op and casts the FSEditLogOp to a SetOwnerOp:
static SetOwnerOp getInstance(OpInstanceCache cache) {
return (SetOwnerOp)cache.get(OP_SET_OWNER);
}
void logEdit(final FSEditLogOp op) {
synchronized (this) {
assert isOpenForWrite() :
"bad state: " + state;
// wait if an automatic sync is scheduled
waitIfAutoSyncScheduled();
long start = beginTransaction();
op.setTransactionId(txid);
try {
editLogStream.write(op);
} catch (IOException ex) {
// All journals failed, it is handled in logSync.
}
endTransaction(start);
// check if it is time to schedule an automatic sync
if (!shouldForceSync()) {
return;
}
isAutoSyncScheduled = true;
}
// sync buffered edit log entries to persistent store
logSync();
}
isOpenForWrite checks that the log is currently open in write mode:
/**
* @return true if the log is currently open in write mode, regardless
* of whether it actually has an open segment.
*/
synchronized boolean isOpenForWrite() {
return state == State.IN_SEGMENT ||
state == State.BETWEEN_LOG_SEGMENTS;
}
The code of waitIfAutoSyncScheduled:
/**
* Wait if an automatic sync is scheduled
*/
synchronized void waitIfAutoSyncScheduled() {
try {
while (isAutoSyncScheduled) {
this.wait(1000);
}
} catch (InterruptedException e) {
}
}
beginTransaction generates a new txid and records it in a per-thread TransactionId:
private long beginTransaction() {
assert Thread.holdsLock(this);
// get a new transactionId
txid++;
//
// record the transactionId when new data was written to the edits log
//
TransactionId id = myTransactionId.get();
id.txid = txid;
return now();
}
private static final ThreadLocal<TransactionId> myTransactionId = new ThreadLocal<TransactionId>() {
@Override
protected synchronized TransactionId initialValue() {
return new TransactionId(Long.MAX_VALUE);
}
};
The stream's write method, implemented by JournalSet's internal output stream, applies the op to every active journal:
@Override
public void write(final FSEditLogOp op)
throws IOException {
mapJournalsAndReportErrors(new JournalClosure() {
@Override
public void apply(JournalAndStream jas) throws IOException {
if (jas.isActive()) {
jas.getCurrentStream().write(op);
}
}
}, "write op");
}
/**
* Implementations of this interface encapsulate operations that can be
* iteratively applied on all the journals. For example see
* {@link JournalSet#mapJournalsAndReportErrors}.
*/
private interface JournalClosure {
/**
* The operation on JournalAndStream.
* @param jas Object on which operations are performed.
* @throws IOException
*/
public void apply(JournalAndStream jas) throws IOException;
}
/**
* Apply the given operation across all of the journal managers, disabling
* any for which the closure throws an IOException.
* @param closure {@link JournalClosure} object encapsulating the operation.
* @param status message used for logging errors (e.g. "opening journal")
* @throws IOException If the operation fails on all the journals.
*/
private void mapJournalsAndReportErrors(
JournalClosure closure, String status) throws IOException{
List<JournalAndStream> badJAS = Lists.newLinkedList();
for (JournalAndStream jas : journals) {
try {
closure.apply(jas);
} catch (Throwable t) {
if (jas.isRequired()) {
final String msg = "Error: " + status + " failed for required journal ("
+ jas + ")";
LOG.fatal(msg, t);
// If we fail on *any* of the required journals, then we must not
// continue on any of the other journals. Abort them to ensure that
// retry behavior doesn't allow them to keep going in any way.
abortAllJournals();
// the current policy is to shutdown the NN on errors to shared edits
// dir. There are many code paths to shared edits failures - syncs,
// roll of edits etc. All of them go through this common function
// where the isRequired() check is made. Applying exit policy here
// to catch all code paths.
terminate(1, msg);
} else {
LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);
badJAS.add(jas);
}
}
}
disableAndReportErrorOnJournals(badJAS);
if (!NameNodeResourcePolicy.areResourcesAvailable(journals,
minimumRedundantJournals)) {
String message = status + " failed for too many journals";
LOG.error("Error: " + message);
throw new IOException(message);
}
}
Control eventually comes back to the JournalClosure.apply method, which writes the op to each active journal's current stream:
@Override
public void apply(JournalAndStream jas) throws IOException {
if (jas.isActive()) {
jas.getCurrentStream().write(op);
}
}
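For a file-based journal, jas.getCurrentStream() returns an EditLogFileOutputStream, whose write simply buffers the op: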
@Override
public void write(FSEditLogOp op) throws IOException {
doubleBuf.writeOp(op);
}
doubleBuf is an instance of EditsDoubleBuffer. The EditsDoubleBuffer class:
/**
* A double-buffer for edits. New edits are written into the first buffer
* while the second is available to be flushed. Each time the double-buffer
* is flushed, the two internal buffers are swapped. This allows edits
* to progress concurrently to flushes without allocating new buffers each
* time.
*/
@InterfaceAudience.Private
public class EditsDoubleBuffer {
private TxnBuffer bufCurrent; // current buffer for writing
private TxnBuffer bufReady; // buffer ready for flushing
private final int initBufferSize;
public EditsDoubleBuffer(int defaultBufferSize) {
initBufferSize = defaultBufferSize;
bufCurrent = new TxnBuffer(initBufferSize);
bufReady = new TxnBuffer(initBufferSize);
}
public void writeOp(FSEditLogOp op) throws IOException {
bufCurrent.writeOp(op);
}
public void writeRaw(byte[] bytes, int offset, int length) throws IOException {
bufCurrent.write(bytes, offset, length);
}
public void close() throws IOException {
Preconditions.checkNotNull(bufCurrent);
Preconditions.checkNotNull(bufReady);
int bufSize = bufCurrent.size();
if (bufSize != 0) {
throw new IOException("FSEditStream has " + bufSize
+ " bytes still to be flushed and cannot be closed.");
}
IOUtils.cleanup(null, bufCurrent, bufReady);
bufCurrent = bufReady = null;
}
public void setReadyToFlush() {
assert isFlushed() : "previous data not flushed yet";
TxnBuffer tmp = bufReady;
bufReady = bufCurrent;
bufCurrent = tmp;
}
/**
* Writes the content of the "ready" buffer to the given output stream,
* and resets it. Does not swap any buffers.
*/
public void flushTo(OutputStream out) throws IOException {
bufReady.writeTo(out); // write data to file
bufReady.reset(); // erase all data in the buffer
}
public boolean shouldForceSync() {
return bufCurrent.size() >= initBufferSize;
}
DataOutputBuffer getReadyBuf() {
return bufReady;
}
DataOutputBuffer getCurrentBuf() {
return bufCurrent;
}
public boolean isFlushed() {
return bufReady.size() == 0;
}
public int countBufferedBytes() {
return bufReady.size() + bufCurrent.size();
}
/**
* @return the transaction ID of the first transaction ready to be flushed
*/
public long getFirstReadyTxId() {
assert bufReady.firstTxId > 0;
return bufReady.firstTxId;
}
/**
* @return the number of transactions that are ready to be flushed
*/
public int countReadyTxns() {
return bufReady.numTxns;
}
/**
* @return the number of bytes that are ready to be flushed
*/
public int countReadyBytes() {
return bufReady.size();
}
private static class TxnBuffer extends DataOutputBuffer {
long firstTxId;
int numTxns;
private final Writer writer;
public TxnBuffer(int initBufferSize) {
super(initBufferSize);
writer = new FSEditLogOp.Writer(this);
reset();
}
public void writeOp(FSEditLogOp op) throws IOException {
if (firstTxId == HdfsConstants.INVALID_TXID) {
firstTxId = op.txid;
} else {
assert op.txid > firstTxId;
}
writer.writeOp(op);
numTxns++;
}
@Override
public DataOutputBuffer reset() {
super.reset();
firstTxId = HdfsConstants.INVALID_TXID;
numTxns = 0;
return this;
}
}
}
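The swap trick is easier to see in a stripped-down sketch. The following is illustrative code, not Hadoop's, but it preserves the key invariant: a swap is only legal after the ready buffer has been fully drained.
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;

public class TinyDoubleBuffer {
  private ByteArrayOutputStream current = new ByteArrayOutputStream(); // writers append here
  private ByteArrayOutputStream ready = new ByteArrayOutputStream();   // flusher drains this

  public synchronized void write(byte[] data) throws IOException {
    current.write(data);
  }

  // Called under the log's lock, and only after the previous flush completed.
  public synchronized void setReadyToFlush() {
    assert ready.size() == 0 : "previous data not flushed yet";
    ByteArrayOutputStream tmp = ready; // the drained (empty) buffer...
    ready = current;                   // ...the full buffer now waits to be flushed,
    current = tmp;                     // while the empty one becomes the write target
  }

  // Runs outside the writers' lock; the caller guarantees no concurrent swap.
  public void flushTo(OutputStream out) throws IOException {
    ready.writeTo(out);
    ready.reset();
  }
}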
TxnBuffer.writeOp records the txid of the first op buffered and then hands the op to an FSEditLogOp.Writer, which serializes it:
/**
* Write an operation to the output stream
*
* @param op The operation to write
* @throws IOException if an error occurs during writing.
*/
public void writeOp(FSEditLogOp op) throws IOException {
int start = buf.getLength();
// write the op code first to make padding and terminator verification
// work
buf.writeByte(op.opCode.getOpCode());
buf.writeInt(0); // write 0 for the length first
buf.writeLong(op.txid);
op.writeFields(buf);
int end = buf.getLength();
// write the length back: content of the op + 4 bytes checksum - op_code
int length = end - start - 1;
buf.writeInt(length, start + 1);
checksum.reset();
checksum.update(buf.getData(), start, end-start);
int sum = (int)checksum.getValue();
buf.writeInt(sum);
}
}
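Given this framing: opcode (1 byte), length (4 bytes), txid (8 bytes), op-specific fields, checksum (4 bytes), where length counts everything after the length field itself, a matching reader would look like this sketch (ours, not Hadoop's; it assumes a well-formed stream):
import java.io.DataInputStream;
import java.io.IOException;

public class EditRecordReader {
  public static void readOne(DataInputStream in) throws IOException {
    byte opCode = in.readByte();
    int length = in.readInt();                // covers txid + fields + checksum
    long txid = in.readLong();
    byte[] fields = new byte[length - 8 - 4]; // length minus txid and checksum
    in.readFully(fields);
    int checksum = in.readInt();              // writer's CRC over opcode, length, txid and fields
    System.out.printf("op=%d txid=%d payload=%d bytes checksum=0x%08x%n",
        opCode, txid, fields.length, checksum);
  }
}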
writeOp calls op.writeFields to serialize the op-specific payload; for SetOwnerOp that is the implementation we saw earlier:
@Override
public
void writeFields(DataOutputStream out) throws IOException {
FSImageSerialization.writeString(src, out);
FSImageSerialization.writeString(username == null ? "" : username, out);
FSImageSerialization.writeString(groupname == null ? "" : groupname, out);
}
The code of FSImageSerialization.writeString:
@SuppressWarnings("deprecation")
public static void writeString(String str, DataOutput out) throws IOException {
DeprecatedUTF8 ustr = TL_DATA.get().U_STR;
ustr.set(str);
ustr.write(out);
}
/**
* In order to reduce allocation, we reuse some static objects. However, the methods
* in this class should be thread-safe since image-saving is multithreaded, so
* we need to keep the static objects in a thread-local.
*/
static private final ThreadLocal<TLData> TL_DATA =
new ThreadLocal<TLData>() {
@Override
protected TLData initialValue() {
return new TLData();
}
};
TLData is defined as follows:
/**
* Simple container "struct" for threadlocal data.
*/
static private final class TLData {
final DeprecatedUTF8 U_STR = new DeprecatedUTF8();
final ShortWritable U_SHORT = new ShortWritable();
final IntWritable U_INT = new IntWritable();
final LongWritable U_LONG = new LongWritable();
final FsPermission FILE_PERM = new FsPermission((short) 0);
final BooleanWritable U_BOOLEAN = new BooleanWritable();
}
/**
* A simple wrapper around {@link org.apache.hadoop.io.UTF8}.
* This class should be used only when it is absolutely necessary
* to use {@link org.apache.hadoop.io.UTF8}. The only difference is that
* using this class does not require "@SuppressWarning" annotation to avoid
* javac warning. Instead the deprecation is implied in the class name.
*
* This should be treated as package private class to HDFS.
*/
@InterfaceAudience.Private
@SuppressWarnings("deprecation")
public class DeprecatedUTF8 extends org.apache.hadoop.io.UTF8 {
public DeprecatedUTF8() {
super();
}
/** Construct from a given string. */
public DeprecatedUTF8(String string) {
super(string);
}
/** Construct from a given string. */
public DeprecatedUTF8(DeprecatedUTF8 utf8) {
super(utf8);
}
/* The following two are the mostly commonly used methods.
* wrapping them so that editors do not complain about the deprecation.
*/
public static String readString(DataInput in) throws IOException {
return org.apache.hadoop.io.UTF8.readString(in);
}
public static int writeString(DataOutput out, String s) throws IOException {
return org.apache.hadoop.io.UTF8.writeString(out, s);
}
}
Returning to logEdit: after editLogStream.write(op) buffers the edit, logEdit calls endTransaction and then checks whether an automatic sync should be scheduled.
endTransaction mainly updates statistics:
private void endTransaction(long start) {
assert Thread.holdsLock(this);
// update statistics
long end = now();
numTransactions++;
totalTimeTransactions += (end-start);
if (metrics != null) // Metrics is non-null only when used inside name node
metrics.addTransaction(end-start);
}
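Back in logEdit, the auto-sync check goes through FSEditLog.shouldForceSync, which asks the current edit log stream: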
private boolean shouldForceSync() {
return editLogStream.shouldForceSync();
}
This in turn calls JournalSet.shouldForceSync(), which returns true if any active journal wants a sync:
@Override
public boolean shouldForceSync() {
for (JournalAndStream js : journals) {
if (js.isActive() && js.getCurrentStream().shouldForceSync()) {
return true;
}
}
return false;
}
js.getCurrentStream().shouldForceSync(), on EditLogFileOutputStream, is:
/**
* @return true if the amount of buffered data exceeds the initial buffer size
*/
@Override
public boolean shouldForceSync() {
return doubleBuf.shouldForceSync();
}
EditsDoubleBuffer.shouldForceSync compares the bytes buffered so far with the initial buffer size:
public boolean shouldForceSync() {
return bufCurrent.size() >= initBufferSize;
}
If shouldForceSync() returns true, logEdit sets isAutoSyncScheduled = true and calls logSync directly. While isAutoSyncScheduled is true, any other thread entering logEdit blocks in waitIfAutoSyncScheduled. The code of logSync:
/**
* Sync all modifications done by this thread.
*
* The internal concurrency design of this class is as follows:
* - Log items are written synchronized into an in-memory buffer,
* and each assigned a transaction ID.
* - When a thread (client) would like to sync all of its edits, logSync()
* uses a ThreadLocal transaction ID to determine what edit number must
* be synced to.
* - The isSyncRunning volatile boolean tracks whether a sync is currently
* under progress.
*
* The data is double-buffered within each edit log implementation so that
* in-memory writing can occur in parallel with the on-disk writing.
*
* Each sync occurs in three steps:
* 1. synchronized, it swaps the double buffer and sets the isSyncRunning
* flag.
* 2. unsynchronized, it flushes the data to storage
* 3. synchronized, it resets the flag and notifies anyone waiting on the
* sync.
*
* The lack of synchronization on step 2 allows other threads to continue
* to write into the memory buffer while the sync is in progress.
* Because this step is unsynchronized, actions that need to avoid
* concurrency with sync() should be synchronized and also call
* waitForSyncToFinish() before assuming they are running alone.
*/
public void logSync() {
long syncStart = 0;
// Fetch the transactionId of this thread.
long mytxid = myTransactionId.get().txid;
boolean sync = false;
try {
EditLogOutputStream logStream = null;
synchronized (this) {
try {
printStatistics(false);
// if somebody is already syncing, then wait (the first thread into this
// method sets isSyncRunning = true, so later threads block here)
while (mytxid > synctxid && isSyncRunning) {
try {
wait(1000);
} catch (InterruptedException ie) {
}
}
//
// If this transaction was already flushed, then nothing to do
//
if (mytxid <= synctxid) {
numTransactionsBatchedInSync++;
if (metrics != null) {
// Metrics is non-null only when used inside name node
metrics.incrTransactionsBatchedInSync();
}
return;
}
// now, this thread will do the sync
syncStart = txid;
isSyncRunning = true;
sync = true;
// swap buffers
try {
if (journalSet.isEmpty()) {
throw new IOException("No journals available to flush");
}
editLogStream.setReadyToFlush();
} catch (IOException e) {
final String msg =
"Could not sync enough journals to persistent storage " +
"due to " + e.getMessage() + ". " +
"Unsynced transactions: " + (txid - synctxid);
LOG.fatal(msg, new Exception());
synchronized(journalSetLock) {
IOUtils.cleanup(LOG, journalSet);
}
terminate(1, msg);
}
} finally {
// Prevent RuntimeException from blocking other log edit write
doneWithAutoSyncScheduling();
}
//editLogStream may become null,
//so store a local variable for flush.
logStream = editLogStream;
}
// do the sync
long start = now();
try {
if (logStream != null) {
logStream.flush();
}
} catch (IOException ex) {
synchronized (this) {
final String msg =
"Could not sync enough journals to persistent storage. "
+ "Unsynced transactions: " + (txid - synctxid);
LOG.fatal(msg, new Exception());
synchronized(journalSetLock) {
IOUtils.cleanup(LOG, journalSet);
}
terminate(1, msg);
}
}
long elapsed = now() - start;
if (metrics != null) { // Metrics non-null only when used inside name node
metrics.addSync(elapsed);
}
} finally {
// Prevent RuntimeException from blocking other log edit sync
synchronized (this) {
if (sync) {
synctxid = syncStart;
isSyncRunning = false;
}
this.notifyAll();
}
}
}
Let's walk through this method step by step. Inside synchronized (this), printStatistics(false) logs edit-log statistics (at most once a minute unless forced):
private void printStatistics(boolean force) {
long now = now();
if (lastPrintTime + 60000 > now && !force) {
return;
}
lastPrintTime = now;
StringBuilder buf = new StringBuilder();
buf.append("Number of transactions: ");
buf.append(numTransactions);
buf.append(" Total time for transactions(ms): ");
buf.append(totalTimeTransactions);
buf.append(" Number of transactions batched in Syncs: ");
buf.append(numTransactionsBatchedInSync);
buf.append(" Number of syncs: ");
buf.append(editLogStream.getNumSync());
buf.append(" SyncTimes(ms): ");
buf.append(journalSet.getSyncTimes());
LOG.info(buf);
}
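Next comes the wait loop: if another thread is already syncing and this thread's transaction has not been persisted yet, it waits: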
// if somebody is already syncing, then wait
while (mytxid > synctxid && isSyncRunning) {
try {
wait(1000);
} catch (InterruptedException ie) {
}
}
The next check is another optimization: if this thread's transaction has already been flushed, there is nothing left to do. For example, if thread A logged txid 5 and thread B logged txid 6, and B's sync flushed everything through txid 6, then A finds mytxid <= synctxid and returns immediately; its edit was batched into B's sync.
//
// If this transaction was already flushed, then nothing to do
//
if (mytxid <= synctxid) {
numTransactionsBatchedInSync++;
if (metrics != null) {
// Metrics is non-null only when used inside name node
metrics.incrTransactionsBatchedInSync();
}
return;
}
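Otherwise this thread takes over the sync: it records syncStart = txid, sets isSyncRunning, and swaps the double buffer via setReadyToFlush: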
syncStart = txid;
isSyncRunning = true;
sync = true;
try {
if (journalSet.isEmpty()) {
throw new IOException("No journals available to flush");
}
editLogStream.setReadyToFlush();
} catch (IOException e) {
final String msg =
"Could not sync enough journals to persistent storage " +
"due to " + e.getMessage() + ". " +
"Unsynced transactions: " + (txid - synctxid);
LOG.fatal(msg, new Exception());
synchronized(journalSetLock) {
IOUtils.cleanup(LOG, journalSet);
}
terminate(1, msg);
}
} finally {
// Prevent RuntimeException from blocking other log edit write
doneWithAutoSyncScheduling();
}
In the try block above, setReadyToFlush (on the JournalSet stream) invokes the closure's apply method on each journal in turn:
@Override
public void setReadyToFlush() throws IOException {
mapJournalsAndReportErrors(new JournalClosure() {
@Override
public void apply(JournalAndStream jas) throws IOException {
if (jas.isActive()) {
jas.getCurrentStream().setReadyToFlush();
}
}
}, "setReadyToFlush");
}
mapJournalsAndReportErrors is the same method shown earlier when writing the op: it applies the closure to every journal, terminates the NameNode if a required journal fails, and collects failing optional journals into badJAS.
disableAndReportErrorOnJournals then disables those failed journals:
private void disableAndReportErrorOnJournals(List<JournalAndStream> badJournals) {
if (badJournals == null || badJournals.isEmpty()) {
return; // nothing to do
}
for (JournalAndStream j : badJournals) {
LOG.error("Disabling journal " + j);
j.abort();
j.setDisabled(true);
}
}
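Back in the sync path: on a file journal, EditLogFileOutputStream.setReadyToFlush delegates to the double buffer: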
/**
* All data that has been written to the stream so far will be flushed. New
* data can be still written to the stream while flushing is performed.
*/
@Override
public void setReadyToFlush() throws IOException {
doubleBuf.setReadyToFlush();
}
doubleBuf's setReadyToFlush() swaps the two buffers:
public void setReadyToFlush() {
assert isFlushed() : "previous data not flushed yet";
TxnBuffer tmp = bufReady;
bufReady = bufCurrent;
bufCurrent = tmp;
}