[HBase]Write Path

HBase的批量put操作主要步骤

1.同个region的put视为同一批操作

2.对批量操作按rowkey进行字节排序

// Sort this region's batched actions in place using their natural ordering
// (the article states this ordering is by row key bytes).
Collections.sort(actionsForRegion);

 3.检查region server的全局内存是否超过阈值,如超过则唤醒flush线程进行flush操作

 

/**
 * Checks global memstore usage against the high and low water marks and wakes
 * the flush thread when either is exceeded. Above the high mark the calling
 * thread is held in a loop until usage drops or the server stops; above only
 * the low mark the flush thread is woken and the caller returns immediately.
 */
public void reclaimMemStoreMemory() {
    // Above the high water mark (per the original note, 0.4 of the heap by
    // default): block the RPC thread until memory shrinks back as expected.
    if (isAboveHighWaterMark()) {
      lock.lock();
      try {
        boolean blocked = false;
        long startTime = 0;
        while (isAboveHighWaterMark() && !server.isStopped()) {
          .....
          // Hand the flush thread a task.
          wakeupFlushThread();
          try {
            // we should be able to wait forever, but we've seen a bug where
            // we miss a notify, so put a 5 second bound on it at least.
            flushOccurred.await(5, TimeUnit.SECONDS);
          } catch (InterruptedException ie) {
            // NOTE(review): the interrupt flag is restored and the loop
            // continues; the next await() would then throw immediately, so an
            // interrupted thread may spin here until the high-water check
            // clears -- confirm against the elided code.
            Thread.currentThread().interrupt();
          }
        }
        ....
      } finally {
        lock.unlock();
      }
    } 
    // Above only the low water mark (per the original note, 0.35 of the heap
    // by default): hand the flush thread a task without blocking the caller.
    else if (isAboveLowWaterMark()) {
      wakeupFlushThread();
    }
  }
 

 

4.检查这个region的memstore内存大小是否超过限制(blockingMemStoreSize),超过则唤醒flush线程对该region进行flush;flush本身是异步执行的,但当前写入线程会循环等待,直到memstore大小降到限制以下

 

  /**
   * Holds the calling thread while this region's memstore is larger than
   * blockingMemStoreSize, requesting a flush on every pass of the loop.
   *
   * @throws RegionTooBusyException declared by the method; the throwing code is
   *         in the elided portion of this excerpt
   * @throws InterruptedIOException if the thread is interrupted while waiting
   */
  private void checkResources()
      throws RegionTooBusyException, InterruptedIOException {

   .....
    boolean blocked = false;
    long startTime = 0;
    // Loop while the region's memstore exceeds blockingMemStoreSize (per the
    // original note: 2x memstoreFlushSize by default, and memstoreFlushSize
    // defaults to 128M).
    while (this.memstoreSize.get() > this.blockingMemStoreSize) {
      // Send a flush request to the flush thread.
      requestFlush();
     。。。。。
      blocked = true;
      // Wait for at most min(timeToWait, threadWakeFrequency) before
      // re-checking (the original note cites 10s).
      synchronized(this) {
        try {
          wait(Math.min(timeToWait, threadWakeFrequency));
        } catch (InterruptedException ie) {
          // Account for the time spent blocked, then surface the interruption
          // to the caller as an InterruptedIOException carrying the cause.
          final long totalTime = EnvironmentEdgeManager.currentTimeMillis() - startTime;
          if (totalTime > 0) {
            this.updatesBlockedMs.add(totalTime);
          }
          LOG.info("Interrupted while waiting to unblock updates for region "
            + this + " '" + Thread.currentThread().getName() + "'");
          InterruptedIOException iie = new InterruptedIOException();
          iie.initCause(ie);
          throw iie;
        }
      }
    }
 ......
  }

 

5.拿行锁,如果拿不到锁,则不处理

 

  /**
   * Acquires the lock for {@code row} and returns an id that identifies the
   * held lock.
   *
   * <p>The lock is a one-shot {@link CountDownLatch} registered in
   * {@code lockedRows}; whoever manages to insert its latch owns the row. A
   * contender waits on the owner's latch and retries once it is released.
   *
   * @param row the row key to lock
   * @param waitForLock if {@code false}, give up immediately when the row is
   *        already locked
   * @return the id of the acquired lock, or {@code null} when the row is
   *         already locked and {@code waitForLock} is {@code false}
   * @throws IOException if waiting for the current owner exceeds
   *         {@code rowLockWaitDuration} milliseconds
   * @throws java.io.InterruptedIOException if the thread is interrupted while
   *         waiting for the current owner
   */
  private Integer internalObtainRowLock(final byte[] row, boolean waitForLock)
      throws IOException {
    // Per the original note, verifies that the row falls inside this region.
    checkRow(row, "row lock");
    startRegionOperation();
    try {
      HashedBytes rowKey = new HashedBytes(row);
      // The row lock is a latch: releasing counts it down by one, which wakes
      // every thread waiting on it.
      CountDownLatch rowLatch = new CountDownLatch(1);

      // loop until we acquire the row lock (unless !waitForLock)
      while (true) {
        CountDownLatch existingLatch = lockedRows.putIfAbsent(rowKey, rowLatch);
        if (existingLatch == null) {
          // No latch was registered for this row, so ours is now the lock.
          break;
        }
        // row already locked
        if (!waitForLock) {
          return null;
        }
        try {
          if (!existingLatch.await(this.rowLockWaitDuration,
                          TimeUnit.MILLISECONDS)) {
            throw new IOException("Timed out on getting lock for row="
                + Bytes.toStringBinary(row));
          }
        } catch (InterruptedException ie) {
          // Do not swallow the interrupt: silently retrying would restart the
          // full timeout and hide the interruption from the caller. Surface it
          // as an InterruptedIOException carrying the original cause.
          java.io.InterruptedIOException iie = new java.io.InterruptedIOException(
              "Interrupted while waiting for lock on row="
                  + Bytes.toStringBinary(row));
          iie.initCause(ie);
          throw iie;
        }
      }

      // loop until we generate an unused lock id
      // (ids come from an atomically incremented integer generator)
      while (true) {
        Integer lockId = lockIdGenerator.incrementAndGet();
        HashedBytes existingRowKey = lockIds.putIfAbsent(lockId, rowKey);
        if (existingRowKey == null) {
          return lockId;
        } else {
          // lockId already in use, jump generator to a new spot
          lockIdGenerator.set(rand.nextInt());
        }
      }
    } finally {
      closeRegionOperation();
    }
  }
 6.修改KeyValue的timestamp为当前时间

 

7.拿mvcc的写事务id

    /**
     * Opens a new memstore write transaction.
     *
     * <p>While holding the {@code writeQueue} monitor, bumps the write counter,
     * wraps the new number in a {@code WriteEntry} and appends that entry to
     * the queue.
     *
     * @return the queued entry carrying the newly assigned write number
     */
    public WriteEntry beginMemstoreInsert() {
      synchronized (writeQueue) {
        // Pre-increment: the entry is created with the already-bumped value.
        final long assignedNumber = ++memstoreWrite;
        // The entry tracks this transaction's state (completed or not).
        final WriteEntry entry = new WriteEntry(assignedNumber);
        writeQueue.add(entry);
        return entry;
      }
    }
 8.写入memstore的内存kv列表

 

 

    /**
     * Adds {@code toAdd} to the in-memory kv set and updates the bookkeeping
     * that goes with it.
     *
     * @param toAdd the KeyValue to insert
     * @return the heap size change reported by {@code heapSizeChange} for this
     *         insert
     */
    private long internalAdd(final KeyValue toAdd) {
      // How much the heap footprint changed as a result of the insert.
      final long heapDelta = heapSizeChange(toAdd, this.kvset.add(toAdd));
      timeRangeTracker.includeTimestamp(toAdd);
      this.size.addAndGet(heapDelta);
      return heapDelta;
    }
 9.写HLog,但不flush到HDFS,此时log仍在内存(pendingWrites缓存)中
    /**
     * Appends {@code edits} to the log under {@code updateLock} and, depending
     * on {@code doSync} and the table's deferred-log-flush setting, syncs the
     * resulting transaction before returning.
     *
     * @return the transaction id assigned to this append (the value of
     *         {@code unflushedEntries} after incrementing)
     * @throws IOException declared by the method; presumably raised by the
     *         write or sync calls -- not visible in this excerpt
     */
    private long append(HRegionInfo info, byte [] tableName, WALEdit edits, UUID clusterId,
      final long now, HTableDescriptor htd, boolean doSync)
    throws IOException {
      ......
      long txid = 0;
      synchronized (this.updateLock) {
        // Sequence number for this log entry.
        long seqNum = obtainSeqNum();
        // The 'lastSeqWritten' map holds the sequence number of the oldest
        // write for each region (i.e. the first edit added to the particular
        // memstore). . When the cache is flushed, the entry for the
        // region being flushed is removed if the sequence number of the flush
        // is greater than or equal to the value in lastSeqWritten.
        // Use encoded name.  Its shorter, guaranteed unique and a subset of
        // actual  name.
        byte [] encodedRegionName = info.getEncodedNameAsBytes();
        // putIfAbsent keeps only the first sequence number recorded for the
        // region. Per the original note, a flush writes out every entry whose
        // number is greater than or equal to this value.
        this.lastSeqWritten.putIfAbsent(encodedRegionName, seqNum);
        HLogKey logKey = makeKey(encodedRegionName, tableName, seqNum, now, clusterId);
        doWrite(info, logKey, edits, htd);
        this.numEntries.incrementAndGet();
        // Transaction id: which log entry this is, by running count.
        txid = this.unflushedEntries.incrementAndGet();
        if (htd.isDeferredLogFlush()) {
          lastDeferredTxid = txid;
        }
      }
      // Sync if catalog region, and if not then check if that table supports
      // deferred log flushing
      if (doSync && 
          (info.isMetaRegion() ||
          !htd.isDeferredLogFlush())) {
        // sync txn to file system
        this.sync(txid);
      }
      return txid;
    }
 
    写log的cache
    // Queues a new write in pendingWrites instead of writing it out directly.
    // It is better to keep it in our own queue rather than writing it to the
    // HDFS output stream because HDFSOutputStream.writeChunk is not
    // lightweight at all.
    // The method is synchronized, so concurrent callers add one at a time.
    synchronized void append(Entry e) throws IOException {
      pendingWrites.add(e);
    }
 10.释放行锁
     /**
      * Releases the row lock identified by {@code lockId}.
      *
      * <p>The id-to-row mapping is removed first, then the row's latch is
      * removed and counted down so that threads waiting on it wake up. An
      * unknown id or a row without a latch is logged and otherwise ignored.
      *
      * @param lockId the id of the lock to release; {@code null} is a no-op
      */
     public void releaseRowLock(final Integer lockId) {
       if (lockId == null) {
         // null lock id, do nothing
         return;
       }

       // Step 1: forget the lock id.
       final HashedBytes lockedRow = lockIds.remove(lockId);
       if (lockedRow == null) {
         LOG.warn("Release unknown lockId: " + lockId);
         return;
       }

       // Step 2: remove the lock itself and, if it was there, open the latch.
       final CountDownLatch latch = lockedRows.remove(lockedRow);
       if (latch != null) {
         latch.countDown();
         return;
       }
       LOG.error("Releases row not locked, lockId: " + lockId + " row: "
           + lockedRow);
     }
 11.flush Hlog到HDFS

 

 

  // sync all transactions upto the specified txid
  //
  // Returns early when the log is closed or when txid is already covered by
  // syncedTillHere. Otherwise flushes the pending writes to the writer under
  // flushLock, syncs the writer, and advances syncedTillHere. An IOException
  // is logged, triggers requestLogRoll(), and is rethrown.
  private void syncer(long txid) throws IOException {
    Writer tempWriter;
    synchronized (this.updateLock) {
      if (this.closed) return;
      tempWriter = this.writer; // guaranteed non-null
    }
    // if the transaction that we are interested in is already 
    // synced, then return immediately.
    // (Another RPC thread may already have synced up to or past this txid.)
    if (txid <= this.syncedTillHere) {
      return;
    }
    try {
      long doneUpto;
      long now = System.currentTimeMillis();
      // First flush all the pending writes to HDFS. Then 
      // issue the sync to HDFS. If sync is successful, then update
      // syncedTillHere to indicate that transactions till this
      // number has been successfully synced.
      synchronized (flushLock) {
        if (txid <= this.syncedTillHere) {
          return;
        }
        doneUpto = this.unflushedEntries.get();
        // All log entries currently held in the cache.
        List<Entry> pending = logSyncerThread.getPendingWrites();
        try {
          // Write the entries out; the sync itself happens further down.
          logSyncerThread.hlogFlush(tempWriter, pending);
        } catch(IOException io) {
          synchronized (this.updateLock) {
            // HBASE-4387, HBASE-5623, retry with updateLock held
            tempWriter = this.writer;
            logSyncerThread.hlogFlush(tempWriter, pending);
          }
        }
      }
      // another thread might have sync'ed avoid double-sync'ing
      if (txid <= this.syncedTillHere) {
        return;
      }
      try {
        // Sync the writer; on failure retry once with updateLock held.
        tempWriter.sync();
      } catch(IOException io) {
        synchronized (this.updateLock) {
          // HBASE-4387, HBASE-5623, retry with updateLock held
          tempWriter = this.writer;
          tempWriter.sync();
        }
      }
      // Record how far the log is now known to be synced.
      this.syncedTillHere = Math.max(this.syncedTillHere, doneUpto);

     ......
    } catch (IOException e) {
      LOG.fatal("Could not sync. Requesting close of hlog", e);
      // Request a log roll (the original note called this a rollback, but the
      // call is requestLogRoll()), then propagate the failure.
      requestLogRoll();
      throw e;
    }
  }

 

 /**
  * Writes one log entry through the underlying writer.
  *
  * <p>The entry is first given this writer's compression context, then its key
  * and edit are appended. Per the original note, the underlying writer is a
  * SequenceFile writer.
  *
  * @param entry the log entry to write
  * @throws IOException if the underlying append fails, or wrapping a
  *         {@link NullPointerException} raised during the append
  */
 @Override
  public void append(HLog.Entry entry) throws IOException {
    entry.setCompressionContext(compressionContext);
    try {
      this.writer.append(entry.getKey(), entry.getEdit());
    } catch (NullPointerException concurrentClose) {
      // Concurrent close...
      throw new IOException(concurrentClose);
    }
  }

 12.修改mvcc的读事务id

  /**
   * Finishes the write transaction represented by {@code e}: marks it done and
   * advances the read point via {@code advanceMemstore}, then waits via
   * {@code waitForRead} until the read point has reached this entry's write
   * number.
   */
  public void completeMemstoreInsert(WriteEntry e) {
    // Advance the read transaction id.
    advanceMemstore(e);
    // Wait until all earlier requests have completed.
    waitForRead(e);
  }

 

   /**
    * Marks {@code e} as completed and moves the read point forward over every
    * completed entry at the head of {@code writeQueue}.
    *
    * <p>Entries are removed from the head of the queue for as long as they are
    * completed. If an earlier entry is still pending, the scan stops there and
    * the read point may end up below this entry's write number.
    *
    * @param e the write entry whose transaction has finished
    * @return {@code true} if the read point is at or past the write number of
    *         {@code e} once the scan is done, {@code false} otherwise
    * @throws RuntimeException if the queue is empty on entry, or if consecutive
    *         head entries do not carry consecutive write numbers
    */
   boolean advanceMemstore(WriteEntry e) {
     synchronized (writeQueue) {
       // This transaction is finished.
       e.markCompleted();

       long latestCompleted = -1;
       boolean inspectedHead = false;
       // Walk the queue from the front, collecting the newest completed number.
       while (!writeQueue.isEmpty()) {
         inspectedHead = true;
         final WriteEntry head = writeQueue.getFirst();

         final boolean haveCandidate = latestCompleted > 0;
         if (haveCandidate && latestCompleted + 1 != head.getWriteNumber()) {
           throw new RuntimeException("invariant in completeMemstoreInsert violated, prev: "
               + latestCompleted + " next: " + head.getWriteNumber());
         }

         if (!head.isCompleted()) {
           // An earlier write is still in flight; the read point stops here.
           break;
         }
         latestCompleted = head.getWriteNumber();
         writeQueue.removeFirst();
       }

       if (!inspectedHead) {
         throw new RuntimeException("never was a first");
       }

       // Publish the new read point: everything up to it is complete and
       // visible to readers. Waiters are woken to re-check.
       if (latestCompleted > 0) {
         synchronized (readWaiters) {
           memstoreRead = latestCompleted;
           readWaiters.notifyAll();
         }
       }
       return memstoreRead >= e.getWriteNumber();
     }
   }

 

  /**
   * Blocks until the global read point has advanced up to the write number of
   * the given entry.
   *
   * <p>An interrupt does not cut the wait short: it is remembered, the wait
   * continues, and the thread's interrupt flag is set again on the way out.
   *
   * @param e the write entry whose write number the read point must reach
   */
  public void waitForRead(WriteEntry e) {
    boolean sawInterrupt = false;
    synchronized (readWaiters) {
      // Earlier requests are still outstanding while the read point lags
      // behind this entry; keep waiting until it catches up.
      while (memstoreRead < e.getWriteNumber()) {
        try {
          readWaiters.wait(0);
        } catch (InterruptedException ie) {
          // Finish the loop first; the flag is restored after leaving it.
          sawInterrupt = true;
        }
      }
    }
    if (sawInterrupt) {
      Thread.currentThread().interrupt();
    }
  }

 13.检查memstore的内存大小是否超过memstoreFlushSize,是则请求flush,异步

14.返回结果,如果put操作没拿到行锁,则结果是null

你可能感兴趣的:(hbase,put,write)