Hadoop 2.6 HDFS Edit Log Analysis

  Taking hadoop fs -chown data_sum:data_sum /test/input as an example, this article walks through how an EditLog entry is produced and persisted.

  When the client executes the command above, an RPC call reaches the NameNode, which runs NameNodeRpcServer.setOwner:

  @Override // ClientProtocol
  public void setOwner(String src, String username, String groupname)
      throws IOException {
    namesystem.setOwner(src, username, groupname);
  }

The relevant setOwner code in FSNamesystem:
  void setOwner(String src, String username, String group)
      throws AccessControlException, FileNotFoundException, SafeModeException,
      UnresolvedLinkException, IOException {
    try {
      setOwnerInt(src, username, group);
    } catch (AccessControlException e) {
      logAuditEvent(false, "setOwner", src);
      throw e;
    } 
  }


setOwnerInt is as follows:

private void setOwnerInt(final String srcArg, String username, String group)
      throws AccessControlException, FileNotFoundException, SafeModeException,
      UnresolvedLinkException, IOException {
    String src = srcArg;
    HdfsFileStatus resultingStat = null;
    FSPermissionChecker pc = getPermissionChecker();
    checkOperation(OperationCategory.WRITE);
    byte[][] pathComponents = FSDirectory.getPathComponentsForReservedPath(src);
    writeLock();
    try {
      checkOperation(OperationCategory.WRITE);
      checkNameNodeSafeMode("Cannot set owner for " + src);
      src = resolvePath(src, pathComponents);
      checkOwner(pc, src);
      if (!pc.isSuperUser()) {
        if (username != null && !pc.getUser().equals(username)) {
          throw new AccessControlException("Non-super user cannot change owner");
        }
        if (group != null && !pc.containsGroup(group)) {
          throw new AccessControlException("User does not belong to " + group);
        }
      }
      dir.setOwner(src, username, group);
      getEditLog().logSetOwner(src, username, group);
      resultingStat = getAuditFileInfo(src, false);
    } finally {
      writeUnlock();
    }
    getEditLog().logSync();
    logAuditEvent(true, "setOwner", srcArg, null, resultingStat);
  }


 Two lines here matter. getEditLog().logSetOwner(src, username, group) appends the edit to an in-memory buffer, while getEditLog().logSync() flushes buffered edits to persistent storage. Separating logging from persisting (note that logSync() sits outside the writeLock()/writeUnlock() block, so other threads can reacquire the write lock without waiting for the sync) is the core of Hadoop's edit-log design, and it is what makes it efficient: not every operation triggers its own sync, and a single sync can persist a whole batch of buffered edits. The trailing getEditLog().logSync() still guarantees that the operation does not return until its edit is durable.
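To make the pattern concrete, here is a minimal, self-contained sketch of this batched group commit (simplified stand-in code, not Hadoop's): edits are appended to an in-memory buffer while holding the lock, and the expensive flush runs outside it, so a single sync can cover edits from many threads.

import java.util.ArrayList;
import java.util.List;

public class GroupCommitSketch {
  private List<String> current = new ArrayList<String>(); // in-memory edit buffer
  private long txid = 0;        // last assigned transaction id
  private long synctxid = 0;    // highest durably synced transaction id
  private boolean syncRunning = false;

  public void logEdit(String op) throws InterruptedException {
    long mytxid;
    synchronized (this) {       // fast path: append to the buffer
      mytxid = ++txid;
      current.add(mytxid + ":" + op);
    }
    logSync(mytxid);            // durability point, outside the append lock
  }

  private void logSync(long mytxid) throws InterruptedException {
    List<String> toFlush;
    long syncStart;
    synchronized (this) {
      while (mytxid > synctxid && syncRunning) {
        wait();                 // another thread's sync may cover our edit
      }
      if (mytxid <= synctxid) {
        return;                 // already made durable by an earlier sync
      }
      syncStart = txid;
      syncRunning = true;
      toFlush = current;        // "swap buffers"
      current = new ArrayList<String>();
    }
    flushToDisk(toFlush);       // expensive I/O with no lock held
    synchronized (this) {
      synctxid = syncStart;
      syncRunning = false;
      notifyAll();              // wake waiters whose edits are now durable
    }
  }

  private void flushToDisk(List<String> edits) {
    // stand-in for writing the records to the journals and syncing them
  }
}

The real FSEditLog.logSync analyzed below has exactly this shape: a per-thread transaction id, an isSyncRunning flag, and a buffer swap so new edits keep flowing while the flush is in progress.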


The body of logSetOwner:

  void logSetOwner(String src, String username, String groupname) {
    SetOwnerOp op = SetOwnerOp.getInstance(cache.get())
      .setSource(src)
      .setUser(username)
      .setGroup(groupname);
    logEdit(op);
  }

Let's look at cache first. It is a ThreadLocal, scoped to the current thread:

  private final ThreadLocal<OpInstanceCache> cache =
      new ThreadLocal<OpInstanceCache>() {
    @Override
    protected OpInstanceCache initialValue() {
      return new OpInstanceCache();
    }
  };

OpInstanceCache is as follows:

final public static class OpInstanceCache {
    private final EnumMap<FSEditLogOpCodes, FSEditLogOp> inst =
        new EnumMap<FSEditLogOpCodes, FSEditLogOp>(FSEditLogOpCodes.class);
    
    public OpInstanceCache() {
      inst.put(OP_ADD, new AddOp());
      inst.put(OP_CLOSE, new CloseOp());
      inst.put(OP_SET_REPLICATION, new SetReplicationOp());
      inst.put(OP_CONCAT_DELETE, new ConcatDeleteOp());
      inst.put(OP_RENAME_OLD, new RenameOldOp());
      inst.put(OP_DELETE, new DeleteOp());
      inst.put(OP_MKDIR, new MkdirOp());
      inst.put(OP_SET_GENSTAMP_V1, new SetGenstampV1Op());
      inst.put(OP_SET_PERMISSIONS, new SetPermissionsOp());
      inst.put(OP_SET_OWNER, new SetOwnerOp());
      inst.put(OP_SET_NS_QUOTA, new SetNSQuotaOp());
      inst.put(OP_CLEAR_NS_QUOTA, new ClearNSQuotaOp());
      inst.put(OP_SET_QUOTA, new SetQuotaOp());
      inst.put(OP_TIMES, new TimesOp());
      inst.put(OP_SYMLINK, new SymlinkOp());
      inst.put(OP_RENAME, new RenameOp());
      inst.put(OP_REASSIGN_LEASE, new ReassignLeaseOp());
      inst.put(OP_GET_DELEGATION_TOKEN, new GetDelegationTokenOp());
      inst.put(OP_RENEW_DELEGATION_TOKEN, new RenewDelegationTokenOp());
      inst.put(OP_CANCEL_DELEGATION_TOKEN, new CancelDelegationTokenOp());
      inst.put(OP_UPDATE_MASTER_KEY, new UpdateMasterKeyOp());
      inst.put(OP_START_LOG_SEGMENT, new LogSegmentOp(OP_START_LOG_SEGMENT));
      inst.put(OP_END_LOG_SEGMENT, new LogSegmentOp(OP_END_LOG_SEGMENT));
      inst.put(OP_UPDATE_BLOCKS, new UpdateBlocksOp());

      inst.put(OP_ALLOW_SNAPSHOT, new AllowSnapshotOp());
      inst.put(OP_DISALLOW_SNAPSHOT, new DisallowSnapshotOp());
      inst.put(OP_CREATE_SNAPSHOT, new CreateSnapshotOp());
      inst.put(OP_DELETE_SNAPSHOT, new DeleteSnapshotOp());
      inst.put(OP_RENAME_SNAPSHOT, new RenameSnapshotOp());
      inst.put(OP_SET_GENSTAMP_V2, new SetGenstampV2Op());
      inst.put(OP_ALLOCATE_BLOCK_ID, new AllocateBlockIdOp());
      inst.put(OP_ADD_BLOCK, new AddBlockOp());
      inst.put(OP_ADD_CACHE_DIRECTIVE,
          new AddCacheDirectiveInfoOp());
      inst.put(OP_MODIFY_CACHE_DIRECTIVE,
          new ModifyCacheDirectiveInfoOp());
      inst.put(OP_REMOVE_CACHE_DIRECTIVE,
          new RemoveCacheDirectiveInfoOp());
      inst.put(OP_ADD_CACHE_POOL, new AddCachePoolOp());
      inst.put(OP_MODIFY_CACHE_POOL, new ModifyCachePoolOp());
      inst.put(OP_REMOVE_CACHE_POOL, new RemoveCachePoolOp());

      inst.put(OP_SET_ACL, new SetAclOp());
      inst.put(OP_ROLLING_UPGRADE_START, new RollingUpgradeOp(
          OP_ROLLING_UPGRADE_START, "start"));
      inst.put(OP_ROLLING_UPGRADE_FINALIZE, new RollingUpgradeOp(
          OP_ROLLING_UPGRADE_FINALIZE, "finalize"));
      inst.put(OP_SET_XATTR, new SetXAttrOp());
      inst.put(OP_REMOVE_XATTR, new RemoveXAttrOp());
      inst.put(OP_SET_STORAGE_POLICY, new SetStoragePolicyOp());
    }
    
    public FSEditLogOp get(FSEditLogOpCodes opcode) {
      return inst.get(opcode);
    }
  }


FSEditLogOpCodes is an enum defining every possible edit-log operation type:

@InterfaceAudience.Private
@InterfaceStability.Unstable
public enum FSEditLogOpCodes {
  // last op code in file
  OP_ADD                        ((byte)  0),
  OP_RENAME_OLD                 ((byte)  1), // deprecated operation
  OP_DELETE                     ((byte)  2),
  OP_MKDIR                      ((byte)  3),
  OP_SET_REPLICATION            ((byte)  4),
  @Deprecated OP_DATANODE_ADD   ((byte)  5), // obsolete
  @Deprecated OP_DATANODE_REMOVE((byte)  6), // obsolete
  OP_SET_PERMISSIONS            ((byte)  7),
  OP_SET_OWNER                  ((byte)  8),
  OP_CLOSE                      ((byte)  9),
  OP_SET_GENSTAMP_V1            ((byte) 10),
  OP_SET_NS_QUOTA               ((byte) 11), // obsolete
  OP_CLEAR_NS_QUOTA             ((byte) 12), // obsolete
  OP_TIMES                      ((byte) 13), // set atime, mtime
  OP_SET_QUOTA                  ((byte) 14),
  OP_RENAME                     ((byte) 15), // filecontext rename
  OP_CONCAT_DELETE              ((byte) 16), // concat files
  OP_SYMLINK                    ((byte) 17),
  OP_GET_DELEGATION_TOKEN       ((byte) 18),
  OP_RENEW_DELEGATION_TOKEN     ((byte) 19),
  OP_CANCEL_DELEGATION_TOKEN    ((byte) 20),
  OP_UPDATE_MASTER_KEY          ((byte) 21),
  OP_REASSIGN_LEASE             ((byte) 22),
  OP_END_LOG_SEGMENT            ((byte) 23),
  OP_START_LOG_SEGMENT          ((byte) 24),
  OP_UPDATE_BLOCKS              ((byte) 25),
  OP_CREATE_SNAPSHOT            ((byte) 26),
  OP_DELETE_SNAPSHOT            ((byte) 27),
  OP_RENAME_SNAPSHOT            ((byte) 28),
  OP_ALLOW_SNAPSHOT             ((byte) 29),
  OP_DISALLOW_SNAPSHOT          ((byte) 30),
  OP_SET_GENSTAMP_V2            ((byte) 31),
  OP_ALLOCATE_BLOCK_ID          ((byte) 32),
  OP_ADD_BLOCK                  ((byte) 33),
  OP_ADD_CACHE_DIRECTIVE       ((byte) 34),
  OP_REMOVE_CACHE_DIRECTIVE    ((byte) 35),
  OP_ADD_CACHE_POOL                       ((byte) 36),
  OP_MODIFY_CACHE_POOL                    ((byte) 37),
  OP_REMOVE_CACHE_POOL                    ((byte) 38),
  OP_MODIFY_CACHE_DIRECTIVE     ((byte) 39),
  OP_SET_ACL                    ((byte) 40),
  OP_ROLLING_UPGRADE_START      ((byte) 41),
  OP_ROLLING_UPGRADE_FINALIZE   ((byte) 42),
  OP_SET_XATTR                  ((byte) 43),
  OP_REMOVE_XATTR               ((byte) 44),
  OP_SET_STORAGE_POLICY         ((byte) 45),

  // Note that the current range of the valid OP code is 0~127
  OP_INVALID                    ((byte) -1);

  private final byte opCode;

  /**
   * Constructor
   *
   * @param opCode byte value of constructed enum
   */
  FSEditLogOpCodes(byte opCode) {
    this.opCode = opCode;
  }

  /**
   * return the byte value of the enum
   *
   * @return the byte value of the enum
   */
  public byte getOpCode() {
    return opCode;
  }

  private static final FSEditLogOpCodes[] VALUES;
  
  static {
    byte max = 0;
    for (FSEditLogOpCodes code : FSEditLogOpCodes.values()) {
      if (code.getOpCode() > max) {
        max = code.getOpCode();
      }
    }
    VALUES = new FSEditLogOpCodes[max + 1];
    for (FSEditLogOpCodes code : FSEditLogOpCodes.values()) {
      if (code.getOpCode() >= 0) {
        VALUES[code.getOpCode()] = code;
      }
    }
  }

  /**
   * Converts byte to FSEditLogOpCodes enum value
   *
   * @param opCode get enum for this opCode
   * @return enum with byte value of opCode
   */
  public static FSEditLogOpCodes fromByte(byte opCode) {
    if (opCode >= 0 && opCode < VALUES.length) {
      return VALUES[opCode];
    }
    return opCode == -1 ? OP_INVALID : null;
  }
}
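A quick usage sketch of the opcode lookup (the expected values follow directly from the enum above):

  FSEditLogOpCodes op = FSEditLogOpCodes.fromByte((byte) 8);
  assert op == FSEditLogOpCodes.OP_SET_OWNER;            // opcode 8 is SET_OWNER
  assert FSEditLogOpCodes.fromByte((byte) -1) == FSEditLogOpCodes.OP_INVALID;
  assert FSEditLogOpCodes.fromByte((byte) 100) == null;  // unknown opcode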


FSEditLogOp is an abstract class; part of it is shown here:

public abstract class FSEditLogOp {
  public final FSEditLogOpCodes opCode;
  long txid = HdfsConstants.INVALID_TXID;
  byte[] rpcClientId = RpcConstants.DUMMY_CLIENT_ID;
  int rpcCallId = RpcConstants.INVALID_CALL_ID;

SetOwnerOp is a static inner class of FSEditLogOp, implemented as follows:

static class SetOwnerOp extends FSEditLogOp {
    String src;
    String username;
    String groupname;

    private SetOwnerOp() {
      super(OP_SET_OWNER);
    }

    static SetOwnerOp getInstance(OpInstanceCache cache) {
      return (SetOwnerOp)cache.get(OP_SET_OWNER);
    }

    SetOwnerOp setSource(String src) {
      this.src = src;
      return this;
    }

    SetOwnerOp setUser(String username) {
      this.username = username;
      return this;
    }

    SetOwnerOp setGroup(String groupname) {
      this.groupname = groupname;
      return this;
    }

    @Override
    public 
    void writeFields(DataOutputStream out) throws IOException {
      FSImageSerialization.writeString(src, out);
      FSImageSerialization.writeString(username == null ? "" : username, out);
      FSImageSerialization.writeString(groupname == null ? "" : groupname, out);
    }

    @Override
    void readFields(DataInputStream in, int logVersion)
        throws IOException {
      this.src = FSImageSerialization.readString(in);
      this.username = FSImageSerialization.readString_EmptyAsNull(in);
      this.groupname = FSImageSerialization.readString_EmptyAsNull(in);
    }

    @Override
    public String toString() {
      StringBuilder builder = new StringBuilder();
      builder.append("SetOwnerOp [src=");
      builder.append(src);
      builder.append(", username=");
      builder.append(username);
      builder.append(", groupname=");
      builder.append(groupname);
      builder.append(", opCode=");
      builder.append(opCode);
      builder.append(", txid=");
      builder.append(txid);
      builder.append("]");
      return builder.toString();
    }
    
    @Override
    protected void toXml(ContentHandler contentHandler) throws SAXException {
      XMLUtils.addSaxString(contentHandler, "SRC", src);
      if (username != null) {
        XMLUtils.addSaxString(contentHandler, "USERNAME", username);
      }
      if (groupname != null) {
        XMLUtils.addSaxString(contentHandler, "GROUPNAME", groupname);
      }
    }
    
    @Override void fromXml(Stanza st) throws InvalidXmlException {
      this.src = st.getValue("SRC");
      this.username = (st.hasChildren("USERNAME")) ? 
          st.getValue("USERNAME") : null;
      this.groupname = (st.hasChildren("GROUPNAME")) ? 
          st.getValue("GROUPNAME") : null;
    }
  }
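As an aside, toXml/fromXml are what the offline edits viewer (hdfs oev) uses when converting edit logs to and from XML. For our chown command, the record in the XML output should look roughly like the following (the TXID value here is made up for illustration):

<RECORD>
  <OPCODE>OP_SET_OWNER</OPCODE>
  <DATA>
    <TXID>42</TXID>
    <SRC>/test/input</SRC>
    <USERNAME>data_sum</USERNAME>
    <GROUPNAME>data_sum</GROUPNAME>
  </DATA>
</RECORD>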

From the code above we can conclude: every thread owns an OpInstanceCache, which is an EnumMap from log operation type to FSEditLogOp instance. The benefit is that an operation does not allocate a new FSEditLogOp each time; each thread allocates at most one instance per operation type.
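The pattern is easy to reproduce in isolation. Below is a self-contained sketch with simplified stand-in types (not Hadoop code):

import java.util.EnumMap;

public class PerThreadOpCache {
  enum OpCode { SET_OWNER, MKDIR }

  static class Op { OpCode code; String src; }  // stand-in for FSEditLogOp

  private static final ThreadLocal<EnumMap<OpCode, Op>> CACHE =
      new ThreadLocal<EnumMap<OpCode, Op>>() {
    @Override
    protected EnumMap<OpCode, Op> initialValue() {
      EnumMap<OpCode, Op> m = new EnumMap<OpCode, Op>(OpCode.class);
      for (OpCode c : OpCode.values()) {
        Op op = new Op();
        op.code = c;
        m.put(c, op);
      }
      return m;
    }
  };

  public static Op get(OpCode code) {
    return CACHE.get().get(code);   // same instance per thread per op type
  }

  public static void main(String[] args) {
    Op a = get(OpCode.SET_OWNER);
    a.src = "/test/input";          // fields are simply overwritten on reuse
    Op b = get(OpCode.SET_OWNER);
    System.out.println(a == b);     // true: no allocation per operation
  }
}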

 Returning to logSetOwner: it obtains the FSEditLogOp from the per-thread cache:

  void logSetOwner(String src, String username, String groupname) {
    SetOwnerOp op = SetOwnerOp.getInstance(cache.get())
      .setSource(src)
      .setUser(username)
      .setGroup(groupname);
    logEdit(op);
  }
SetOwnerOp.getInstance casts the cached FSEditLogOp to a SetOwnerOp:

  static SetOwnerOp getInstance(OpInstanceCache cache) {
    return (SetOwnerOp)cache.get(OP_SET_OWNER);
  }

logEdit first takes synchronized(this), where this is the FSEditLog object representing the entire edit log. The source:


void logEdit(final FSEditLogOp op) {
    synchronized (this) {
      assert isOpenForWrite() :
        "bad state: " + state;
      
      // wait if an automatic sync is scheduled
      waitIfAutoSyncScheduled();
      
      long start = beginTransaction();
      op.setTransactionId(txid);

      try {
        editLogStream.write(op);
      } catch (IOException ex) {
        // All journals failed, it is handled in logSync.
      }

      endTransaction(start);
      
      // check if it is time to schedule an automatic sync
      if (!shouldForceSync()) {
        return;
      }
      isAutoSyncScheduled = true;
    }
    
    // sync buffered edit log entries to persistent store
    logSync();
  }

isOpenForWrite checks whether the log is currently in write mode:

 /**
   * @return true if the log is currently open in write mode, regardless
   * of whether it actually has an open segment.
   */
  synchronized boolean isOpenForWrite() {
    return state == State.IN_SEGMENT ||
      state == State.BETWEEN_LOG_SEGMENTS;
  }
  waitIfAutoSyncScheduled is shown below:
  /**
   * Wait if an automatic sync is scheduled
   */
  synchronized void waitIfAutoSyncScheduled() {
    try {
      while (isAutoSyncScheduled) {
        this.wait(1000);
      }
    } catch (InterruptedException e) {
    }
  }
 beginTransaction generates a new txid:
private long beginTransaction() {
    assert Thread.holdsLock(this);
    // get a new transactionId
    txid++;

    //
    // record the transactionId when new data was written to the edits log
    //
    TransactionId id = myTransactionId.get();
    id.txid = txid;
    return now();
  }

 myTransactionId is a thread-local variable; a thread handles only one transaction at a time.
  private static final ThreadLocal<TransactionId> myTransactionId = new ThreadLocal<TransactionId>() {
    @Override
    protected synchronized TransactionId initialValue() {
      return new TransactionId(Long.MAX_VALUE);
    }
  };

Because txid is a member variable of FSEditLog and is incremented sequentially inside the synchronized(this) block, op.setTransactionId(txid) is thread-safe.

  editLogStream.write is implemented by JournalSet's output stream, which applies the op to every journal:

 @Override
    public void write(final FSEditLogOp op)
        throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          if (jas.isActive()) {
            jas.getCurrentStream().write(op);
          }
        }
      }, "write op");
    }

JournalClosure is an interface with a single method, apply. As its javadoc makes clear, the operation wrapped by an implementation is applied to each journal in turn.

  /**
   * Implementations of this interface encapsulate operations that can be
   * iteratively applied on all the journals. For example see
   * {@link JournalSet#mapJournalsAndReportErrors}.
   */
  private interface JournalClosure {
    /**
     * The operation on JournalAndStream.
     * @param jas Object on which operations are performed.
     * @throws IOException
     */
    public void apply(JournalAndStream jas) throws IOException;
  }

mapJournalsAndReportErrors is as follows:

/**
   * Apply the given operation across all of the journal managers, disabling
   * any for which the closure throws an IOException.
   * @param closure {@link JournalClosure} object encapsulating the operation.
   * @param status message used for logging errors (e.g. "opening journal")
   * @throws IOException If the operation fails on all the journals.
   */
  private void mapJournalsAndReportErrors(
      JournalClosure closure, String status) throws IOException{

    List<JournalAndStream> badJAS = Lists.newLinkedList();
    for (JournalAndStream jas : journals) {
      try {
        closure.apply(jas);
      } catch (Throwable t) {
        if (jas.isRequired()) {
          final String msg = "Error: " + status + " failed for required journal ("
            + jas + ")";
          LOG.fatal(msg, t);
          // If we fail on *any* of the required journals, then we must not
          // continue on any of the other journals. Abort them to ensure that
          // retry behavior doesn't allow them to keep going in any way.
          abortAllJournals();
          // the current policy is to shutdown the NN on errors to shared edits
          // dir. There are many code paths to shared edits failures - syncs,
          // roll of edits etc. All of them go through this common function 
          // where the isRequired() check is made. Applying exit policy here 
          // to catch all code paths.
          terminate(1, msg);
        } else {
          LOG.error("Error: " + status + " failed for (journal " + jas + ")", t);
          badJAS.add(jas);          
        }
      }
    }
    disableAndReportErrorOnJournals(badJAS);
    if (!NameNodeResourcePolicy.areResourcesAvailable(journals,
        minimumRedundantJournals)) {
      String message = status + " failed for too many journals";
      LOG.error("Error: " + message);
      throw new IOException(message);
    }
  }
Eventually control calls back into JournalClosure.apply:

        @Override
        public void apply(JournalAndStream jas) throws IOException {
          if (jas.isActive()) {
            jas.getCurrentStream().write(op);
          }
        }
    

The per-journal stream's write method (EditLogFileOutputStream, for file-based journals) is:

  @Override
  public void write(FSEditLogOp op) throws IOException {
    doubleBuf.writeOp(op);
  }

doubleBuf is an instance of EditsDoubleBuffer. The EditsDoubleBuffer class:

/**
 * A double-buffer for edits. New edits are written into the first buffer
 * while the second is available to be flushed. Each time the double-buffer
 * is flushed, the two internal buffers are swapped. This allows edits
 * to progress concurrently to flushes without allocating new buffers each
 * time.
 */
@InterfaceAudience.Private
public class EditsDoubleBuffer {

  private TxnBuffer bufCurrent; // current buffer for writing
  private TxnBuffer bufReady; // buffer ready for flushing
  private final int initBufferSize;

  public EditsDoubleBuffer(int defaultBufferSize) {
    initBufferSize = defaultBufferSize;
    bufCurrent = new TxnBuffer(initBufferSize);
    bufReady = new TxnBuffer(initBufferSize);

  }
    
  public void writeOp(FSEditLogOp op) throws IOException {
    bufCurrent.writeOp(op);
  }

  public void writeRaw(byte[] bytes, int offset, int length) throws IOException {
    bufCurrent.write(bytes, offset, length);
  }
  
  public void close() throws IOException {
    Preconditions.checkNotNull(bufCurrent);
    Preconditions.checkNotNull(bufReady);

    int bufSize = bufCurrent.size();
    if (bufSize != 0) {
      throw new IOException("FSEditStream has " + bufSize
          + " bytes still to be flushed and cannot be closed.");
    }

    IOUtils.cleanup(null, bufCurrent, bufReady);
    bufCurrent = bufReady = null;
  }
  
  public void setReadyToFlush() {
    assert isFlushed() : "previous data not flushed yet";
    TxnBuffer tmp = bufReady;
    bufReady = bufCurrent;
    bufCurrent = tmp;
  }
  
  /**
   * Writes the content of the "ready" buffer to the given output stream,
   * and resets it. Does not swap any buffers.
   */
  public void flushTo(OutputStream out) throws IOException {
    bufReady.writeTo(out); // write data to file
    bufReady.reset(); // erase all data in the buffer
  }
  
  public boolean shouldForceSync() {
    return bufCurrent.size() >= initBufferSize;
  }

  DataOutputBuffer getReadyBuf() {
    return bufReady;
  }
  
  DataOutputBuffer getCurrentBuf() {
    return bufCurrent;
  }

  public boolean isFlushed() {
    return bufReady.size() == 0;
  }

  public int countBufferedBytes() {
    return bufReady.size() + bufCurrent.size();
  }

  /**
   * @return the transaction ID of the first transaction ready to be flushed 
   */
  public long getFirstReadyTxId() {
    assert bufReady.firstTxId > 0;
    return bufReady.firstTxId;
  }

  /**
   * @return the number of transactions that are ready to be flushed
   */
  public int countReadyTxns() {
    return bufReady.numTxns;
  }

  /**
   * @return the number of bytes that are ready to be flushed
   */
  public int countReadyBytes() {
    return bufReady.size();
  }
  
  private static class TxnBuffer extends DataOutputBuffer {
    long firstTxId;
    int numTxns;
    private final Writer writer;
    
    public TxnBuffer(int initBufferSize) {
      super(initBufferSize);
      writer = new FSEditLogOp.Writer(this);
      reset();
    }

    public void writeOp(FSEditLogOp op) throws IOException {
      if (firstTxId == HdfsConstants.INVALID_TXID) {
        firstTxId = op.txid;
      } else {
        assert op.txid > firstTxId;
      }
      writer.writeOp(op);
      numTxns++;
    }
    
    @Override
    public DataOutputBuffer reset() {
      super.reset();
      firstTxId = HdfsConstants.INVALID_TXID;
      numTxns = 0;
      return this;
    }
  }

}
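A quick illustration of the swap-and-flush cycle (hypothetical usage; op1, op2 and fileStream are placeholders):

  EditsDoubleBuffer buf = new EditsDoubleBuffer(512 * 1024);
  buf.writeOp(op1);          // lands in bufCurrent
  buf.setReadyToFlush();     // swap: op1 is now in bufReady
  buf.writeOp(op2);          // new edits keep landing in the fresh bufCurrent...
  buf.flushTo(fileStream);   // ...while bufReady (holding op1) is written out

This is what lets the flush step of logSync run without holding the lock: writers touch only bufCurrent, the flusher touches only bufReady.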

doubleBuf.writeOp in turn lands in TxnBuffer.writeOp:

   public void writeOp(FSEditLogOp op) throws IOException {
      if (firstTxId == HdfsConstants.INVALID_TXID) {
        firstTxId = op.txid;
      } else {
        assert op.txid > firstTxId;
      }
      writer.writeOp(op);
      numTxns++;
    }

writer.writeOp is shown below:

    /**
     * Write an operation to the output stream
     * 
     * @param op The operation to write
     * @throws IOException if an error occurs during writing.
     */
    public void writeOp(FSEditLogOp op) throws IOException {
      int start = buf.getLength();
      // write the op code first to make padding and terminator verification
      // work
      buf.writeByte(op.opCode.getOpCode());
      buf.writeInt(0); // write 0 for the length first
      buf.writeLong(op.txid);
      op.writeFields(buf);
      int end = buf.getLength();
      
      // write the length back: content of the op + 4 bytes checksum - op_code
      int length = end - start - 1;
      buf.writeInt(length, start + 1);

      checksum.reset();
      checksum.update(buf.getData(), start, end-start);
      int sum = (int)checksum.getValue();
      buf.writeInt(sum);
    }
  }
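So the on-disk framing, as derived from the code above, is: opcode (1 byte), length (4 bytes), txid (8 bytes), the op-specific fields, then a 4-byte checksum computed over everything before it; the length field counts everything after the opcode byte except the checksum. A self-contained sketch of the same framing (java.util.zip.CRC32 stands in for Hadoop's checksum; not Hadoop code):

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.zip.CRC32;

public class EditRecordSketch {
  // opcode(1) | length(4) | txid(8) | op fields | checksum(4)
  public static byte[] encode(byte opCode, long txid, byte[] opFields)
      throws IOException {
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bos);
    out.writeByte(opCode);
    out.writeInt(4 + 8 + opFields.length);  // length field + txid + fields
    out.writeLong(txid);
    out.write(opFields);
    out.flush();
    byte[] head = bos.toByteArray();        // opcode .. fields
    CRC32 crc = new CRC32();
    crc.update(head, 0, head.length);
    out.writeInt((int) crc.getValue());     // trailing 4-byte checksum
    out.flush();
    return bos.toByteArray();
  }
}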

op.writeFields(buf), for SetOwnerOp, is:

  @Override
    public 
    void writeFields(DataOutputStream out) throws IOException {
      FSImageSerialization.writeString(src, out);
      FSImageSerialization.writeString(username == null ? "" : username, out);
      FSImageSerialization.writeString(groupname == null ? "" : groupname, out);
    }

FSImageSerialization.writeString is:

  @SuppressWarnings("deprecation")
  public static void writeString(String str, DataOutput out) throws IOException {
    DeprecatedUTF8 ustr = TL_DATA.get().U_STR;
    ustr.set(str);
    ustr.write(out);
  }

TL_DATA is a thread-local object whose purpose is to cut down object allocation while writing the log. Otherwise, taking String as an example, writing a String to the output stream means first building a UTF8 object, writing the length of the encoded bytes, then writing the bytes themselves; writing 10,000 Strings would allocate 10,000 UTF8 objects. With a thread-local, each thread allocates just one reusable UTF8 object.

/**
   * In order to reduce allocation, we reuse some static objects. However, the methods
   * in this class should be thread-safe since image-saving is multithreaded, so 
   * we need to keep the static objects in a thread-local.
   */
  static private final ThreadLocal<TLData> TL_DATA =
    new ThreadLocal<TLData>() {
    @Override
    protected TLData initialValue() {
      return new TLData();
    }
  };


 The TLData container is defined as:

  /**
   * Simple container "struct" for threadlocal data.
   */
  static private final class TLData {
    final DeprecatedUTF8 U_STR = new DeprecatedUTF8();
    final ShortWritable U_SHORT = new ShortWritable();
    final IntWritable U_INT = new IntWritable();
    final LongWritable U_LONG = new LongWritable();
    final FsPermission FILE_PERM = new FsPermission((short) 0);
    final BooleanWritable U_BOOLEAN = new BooleanWritable();
  }

DeprecatedUTF8 is a thin wrapper around UTF8:

/**
 * A simple wrapper around {@link org.apache.hadoop.io.UTF8}.
 * This class should be used only when it is absolutely necessary
 * to use {@link org.apache.hadoop.io.UTF8}. The only difference is that 
 * using this class does not require "@SuppressWarning" annotation to avoid 
 * javac warning. Instead the deprecation is implied in the class name.
 * 
 * This should be treated as package private class to HDFS.
 */
@InterfaceAudience.Private
@SuppressWarnings("deprecation")
public class DeprecatedUTF8 extends org.apache.hadoop.io.UTF8 {
  
  public DeprecatedUTF8() {
    super();
  }

  /** Construct from a given string. */
  public DeprecatedUTF8(String string) {
    super(string);
  }

  /** Construct from a given string. */
  public DeprecatedUTF8(DeprecatedUTF8 utf8) {
    super(utf8);
  }
  
  /* The following two are the mostly commonly used methods.
   * wrapping them so that editors do not complain about the deprecation.
   */
  
  public static String readString(DataInput in) throws IOException {
    return org.apache.hadoop.io.UTF8.readString(in);
  }
  
  public static int writeString(DataOutput out, String s) throws IOException {
    return org.apache.hadoop.io.UTF8.writeString(out, s);
  }
}
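A note on sizes: as far as I can tell, UTF8's write emits a 2-byte length prefix followed by the encoded bytes (an assumption worth verifying against your Hadoop version). Under that assumption, "data_sum" costs 2 + 8 = 10 bytes, "/test/input" costs 2 + 11 = 13 bytes, the SetOwnerOp fields total 13 + 10 + 10 = 33 bytes, and the whole framed record is 1 + 4 + 8 + 33 + 4 = 50 bytes. A tiny sketch of the assumed encoding:

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class Utf8Sketch {
  public static byte[] write(String s) throws IOException {
    byte[] bytes = s.getBytes("UTF-8");
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    DataOutputStream out = new DataOutputStream(bos);
    out.writeShort(bytes.length);  // 2-byte length prefix
    out.write(bytes);
    return bos.toByteArray();
  }

  public static void main(String[] args) throws IOException {
    System.out.println(write("data_sum").length);     // 10
    System.out.println(write("/test/input").length);  // 13
  }
}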

At this point, the analysis of editLogStream.write(op) is complete.

Back in logEdit, two steps remain after the write: endTransaction and the shouldForceSync check.
 endTransaction is mainly for statistics:

  private void endTransaction(long start) {
    assert Thread.holdsLock(this);
    
    // update statistics
    long end = now();
    numTransactions++;
    totalTimeTransactions += (end-start);
    if (metrics != null) // Metrics is non-null only when used inside name node
      metrics.addTransaction(end-start);
  }

FSEditLog.shouldForceSync simply delegates to the edit log stream:

private boolean shouldForceSync() {
    return editLogStream.shouldForceSync();
  }


This in turn calls JournalSet.shouldForceSync():

    @Override
    public boolean shouldForceSync() {
      for (JournalAndStream js : journals) {
        if (js.isActive() && js.getCurrentStream().shouldForceSync()) {
          return true;
        }
      }
      return false;
    }

js.getCurrentStream().shouldForceSync() is:

 /**
   * @return true if the number of buffered data exceeds the intial buffer size
   */
  @Override
  public boolean shouldForceSync() {
    return doubleBuf.shouldForceSync();
  }
EditsDoubleBuffer.shouldForceSync is:
 public boolean shouldForceSync() {
    return bufCurrent.size() >= initBufferSize;
  }

initBufferSize is 524288 (512 KB). If shouldForceSync returns false, logEdit returns immediately and control goes back to the caller, which then invokes logSync (as in setOwnerInt, shown at the start of this article); in the meantime, other threads are free to enter logEdit. At roughly 50 bytes per SetOwnerOp record (see the worked size example above), that is on the order of ten thousand such edits before a sync is forced.


 If shouldForceSync returns true, isAutoSyncScheduled is set to true and logSync is called directly. Once isAutoSyncScheduled is set, other threads calling logEdit will block in waitIfAutoSyncScheduled. logSync is shown below:

 /**
   * Sync all modifications done by this thread.
   *
   * The internal concurrency design of this class is as follows:
   *   - Log items are written synchronized into an in-memory buffer,
   *     and each assigned a transaction ID.
   *   - When a thread (client) would like to sync all of its edits, logSync()
   *     uses a ThreadLocal transaction ID to determine what edit number must
   *     be synced to.
   *   - The isSyncRunning volatile boolean tracks whether a sync is currently
   *     under progress.
   *
   * The data is double-buffered within each edit log implementation so that
   * in-memory writing can occur in parallel with the on-disk writing.
   *
   * Each sync occurs in three steps:
   *   1. synchronized, it swaps the double buffer and sets the isSyncRunning
   *      flag.
   *   2. unsynchronized, it flushes the data to storage
   *   3. synchronized, it resets the flag and notifies anyone waiting on the
   *      sync.
   *
   * The lack of synchronization on step 2 allows other threads to continue
   * to write into the memory buffer while the sync is in progress.
   * Because this step is unsynchronized, actions that need to avoid
   * concurrency with sync() should be synchronized and also call
   * waitForSyncToFinish() before assuming they are running alone.
   */
  public void logSync() {
    long syncStart = 0;

    // Fetch the transactionId of this thread. 
    long mytxid = myTransactionId.get().txid;
    
    boolean sync = false;
    try {
      EditLogOutputStream logStream = null;
      synchronized (this) {
        try {
          printStatistics(false);

          // if somebody is already syncing, then wait (this can happen
          // because the first thread into this method sets isSyncRunning = true)
          while (mytxid > synctxid && isSyncRunning) {
            try {
              wait(1000);
            } catch (InterruptedException ie) {
            }
          }
  
          //
          // If this transaction was already flushed, then nothing to do
          //
          if (mytxid <= synctxid) {
            numTransactionsBatchedInSync++;
            if (metrics != null) {
              // Metrics is non-null only when used inside name node
              metrics.incrTransactionsBatchedInSync();
            }
            return;
          }
     
          // now, this thread will do the sync
          syncStart = txid;
          isSyncRunning = true;
          sync = true;
  
          // swap buffers
          try {
            if (journalSet.isEmpty()) {
              throw new IOException("No journals available to flush");
            }
            editLogStream.setReadyToFlush();
          } catch (IOException e) {
            final String msg =
                "Could not sync enough journals to persistent storage " +
                "due to " + e.getMessage() + ". " +
                "Unsynced transactions: " + (txid - synctxid);
            LOG.fatal(msg, new Exception());
            synchronized(journalSetLock) {
              IOUtils.cleanup(LOG, journalSet);
            }
            terminate(1, msg);
          }
        } finally {
          // Prevent RuntimeException from blocking other log edit write 
          doneWithAutoSyncScheduling();
        }
        //editLogStream may become null,
        //so store a local variable for flush.
        logStream = editLogStream;
      }
      
      // do the sync
      long start = now();
      try {
        if (logStream != null) {
          logStream.flush();
        }
      } catch (IOException ex) {
        synchronized (this) {
          final String msg =
              "Could not sync enough journals to persistent storage. "
              + "Unsynced transactions: " + (txid - synctxid);
          LOG.fatal(msg, new Exception());
          synchronized(journalSetLock) {
            IOUtils.cleanup(LOG, journalSet);
          }
          terminate(1, msg);
        }
      }
      long elapsed = now() - start;
  
      if (metrics != null) { // Metrics non-null only when used inside name node
        metrics.addSync(elapsed);
      }
      
    } finally {
      // Prevent RuntimeException from blocking other log edit sync 
      synchronized (this) {
        if (sync) {
          synctxid = syncStart;
          isSyncRunning = false;
        }
        this.notifyAll();
     }
    }
  }

Now let's walk through this method piece by piece:

Starting from synchronized (this): printStatistics(false) prints statistics:

  private void printStatistics(boolean force) {
    long now = now();
    if (lastPrintTime + 60000 > now && !force) {
      return;
    }
    lastPrintTime = now;
    StringBuilder buf = new StringBuilder();
    buf.append("Number of transactions: ");
    buf.append(numTransactions);
    buf.append(" Total time for transactions(ms): ");
    buf.append(totalTimeTransactions);
    buf.append(" Number of transactions batched in Syncs: ");
    buf.append(numTransactionsBatchedInSync);
    buf.append(" Number of syncs: ");
    buf.append(editLogStream.getNumSync());
    buf.append(" SyncTimes(ms): ");
    buf.append(journalSet.getSyncTimes());
    LOG.info(buf);
  }

Next comes the wait loop below. Although isSyncRunning is only set further down, both places are inside the same synchronized block, so only one thread can be mid-sync at a time. This is also an optimization: wait() releases the monitor, so other threads can still enter logSync's synchronized block while a sync is in progress:

 

  // if somebody is already syncing, then wait
          while (mytxid > synctxid && isSyncRunning) {
            try {
              wait(1000);
            } catch (InterruptedException ie) {
            }
          }

The next block is another optimization: if this thread's transaction has already been flushed, there is nothing to do and it returns.

     //
          // If this transaction was already flushed, then nothing to do
          //
          if (mytxid <= synctxid) {
            numTransactionsBatchedInSync++;
            if (metrics != null) {
              // Metrics is non-null only when used inside name node
              metrics.incrTransactionsBatchedInSync();
            }
            return;
          }

In the next three lines, txid is the FSEditLog object's txid. Since the system has effectively a single FSEditLog instance (it behaves like a singleton), txid is the id of the most recent transaction in the entire system.

          syncStart = txid;
          isSyncRunning = true;
          sync = true;


 try {
            if (journalSet.isEmpty()) {
              throw new IOException("No journals available to flush");
            }
            editLogStream.setReadyToFlush();
          } catch (IOException e) {
            final String msg =
                "Could not sync enough journals to persistent storage " +
                "due to " + e.getMessage() + ". " +
                "Unsynced transactions: " + (txid - synctxid);
            LOG.fatal(msg, new Exception());
            synchronized(journalSetLock) {
              IOUtils.cleanup(LOG, journalSet);
            }
            terminate(1, msg);
          }
        } finally {
          // Prevent RuntimeException from blocking other log edit write 
          doneWithAutoSyncScheduling();
        }
In the try block above, setReadyToFlush applies the closure's apply method to each journal in turn:
 
 @Override
    public void setReadyToFlush() throws IOException {
      mapJournalsAndReportErrors(new JournalClosure() {
        @Override
        public void apply(JournalAndStream jas) throws IOException {
          if (jas.isActive()) {
            jas.getCurrentStream().setReadyToFlush();
          }
        }
      }, "setReadyToFlush");
    }

mapJournalsAndReportErrors was shown earlier. Journals that fail but are not required are collected into badJAS and then disabled by disableAndReportErrorOnJournals:
  private void disableAndReportErrorOnJournals(List<JournalAndStream> badJournals) {
    if (badJournals == null || badJournals.isEmpty()) {
      return; // nothing to do
    }
 
    for (JournalAndStream j : badJournals) {
      LOG.error("Disabling journal " + j);
      j.abort();
      j.setDisabled(true);
    }
  }


In EditLogFileOutputStream, setReadyToFlush is:

/**
   * All data that has been written to the stream so far will be flushed. New
   * data can be still written to the stream while flushing is performed.
   */
  @Override
  public void setReadyToFlush() throws IOException {
    doubleBuf.setReadyToFlush();
  }
doubleBuf's setReadyToFlush() just swaps the two buffers:
  public void setReadyToFlush() {
    assert isFlushed() : "previous data not flushed yet";
    TxnBuffer tmp = bufReady;
    bufReady = bufCurrent;
    bufCurrent = tmp;
  }

