Main steps of HBase's batch put operation
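For context, here is what produces such a batch on the client side: a minimal example against the 0.9x-era client API that matches the server code quoted below (the table and column names are made up).

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;

public class BatchPutClient {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        HTable table = new HTable(conf, "test_table");   // hypothetical table name
        try {
            List<Put> puts = new ArrayList<Put>();
            for (int i = 0; i < 100; i++) {
                Put p = new Put(Bytes.toBytes("row-" + i));
                p.add(Bytes.toBytes("cf"), Bytes.toBytes("q"), Bytes.toBytes("value-" + i));
                puts.add(p);
            }
            // One batch: the region server groups and sorts these in the steps below.
            table.put(puts);
        } finally {
            table.close();
        }
    }
}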
1. Puts that target the same region are treated as one batch.
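An illustrative sketch of this grouping, not the region server's actual code; regionFor() and the region names are stand-ins for the real row-to-region lookup.

import java.util.*;

public class GroupByRegionSketch {
    // Pretend rows that sort before "m" belong to region-1, the rest to region-2.
    static String regionFor(String row) {
        return row.compareTo("m") < 0 ? "region-1" : "region-2";
    }

    public static void main(String[] args) {
        // Bucket each action under its region so every region gets its own sub-batch.
        Map<String, List<String>> actionsByRegion = new TreeMap<>();
        for (String row : Arrays.asList("apple", "zebra", "mango")) {
            actionsByRegion.computeIfAbsent(regionFor(row), r -> new ArrayList<>()).add(row);
        }
        System.out.println(actionsByRegion); // {region-1=[apple], region-2=[zebra, mango]}
    }
}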
2. Sort the batched operations by the byte order of their rowkeys.
Collections.sort(actionsForRegion);
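"Byte order" here means unsigned lexicographic comparison of the rowkey bytes, which is what Bytes.compareTo implements. Below is a self-contained sketch of that ordering; it is illustrative and not the comparator class the sort above actually uses.

import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

public class RowKeyOrderSketch {
    static final Comparator<byte[]> ROW_ORDER = (a, b) -> {
        for (int i = 0, n = Math.min(a.length, b.length); i < n; i++) {
            int d = (a[i] & 0xff) - (b[i] & 0xff);   // compare each byte as unsigned
            if (d != 0) return d;
        }
        return a.length - b.length;                  // shorter key sorts first
    };

    public static void main(String[] args) {
        List<byte[]> rows = Arrays.asList(new byte[] {(byte) 0xFF}, new byte[] {0x01});
        rows.sort(ROW_ORDER);
        // Unsigned order puts 0x01 before 0xFF; a signed byte comparison would reverse them.
        System.out.println(Integer.toHexString(rows.get(0)[0] & 0xff)); // prints 1
    }
}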
3. Check whether the region server's global memstore usage exceeds the threshold; if it does, wake up the flush thread to perform a flush.
public void reclaimMemStoreMemory() {
  // Above the high water mark (default 0.4 of the heap): block the RPC thread
  // until memory drops back to the expected level.
  if (isAboveHighWaterMark()) {
    lock.lock();
    try {
      boolean blocked = false;
      long startTime = 0;
      while (isAboveHighWaterMark() && !server.isStopped()) {
        .....
        // Submit a task to the flush thread.
        wakeupFlushThread();
        try {
          // we should be able to wait forever, but we've seen a bug where
          // we miss a notify, so put a 5 second bound on it at least.
          flushOccurred.await(5, TimeUnit.SECONDS);
        } catch (InterruptedException ie) {
          Thread.currentThread().interrupt();
        }
      }
      ....
    } finally {
      lock.unlock();
    }
  }
  // Above the low water mark (default 0.35 of the heap): submit a task to the
  // flush thread without blocking.
  else if (isAboveLowWaterMark()) {
    wakeupFlushThread();
  }
}
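For reference, the arithmetic behind the two watermarks (the 0.4 and 0.35 defaults mentioned above, tunable via hbase.regionserver.global.memstore.upperLimit / lowerLimit in this HBase generation); the class itself is illustrative, not HBase code.

public class GlobalMemstoreLimits {
    public static void main(String[] args) {
        long maxHeap = Runtime.getRuntime().maxMemory();
        long highWaterMark = (long) (maxHeap * 0.40); // above this: block writers until flushed below
        long lowWaterMark  = (long) (maxHeap * 0.35); // above this: just queue a flush, don't block
        System.out.printf("heap=%dMB high=%dMB low=%dMB%n",
            maxHeap >> 20, highWaterMark >> 20, lowWaterMark >> 20);
    }
}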
4. Check whether this region's memstore size exceeds its limit; if so, wake up the flush thread to flush this region (an asynchronous operation).
private void checkResources()
    throws RegionTooBusyException, InterruptedIOException {
  .....
  boolean blocked = false;
  long startTime = 0;
  // Block while this region's memstore exceeds blockingMemStoreSize, which
  // defaults to 2 * memstoreFlushSize; memstoreFlushSize defaults to 128MB.
  while (this.memstoreSize.get() > this.blockingMemStoreSize) {
    // Send a request to the flush thread.
    requestFlush();
    .....
    blocked = true;
    // Wait for a while (up to 10s).
    synchronized(this) {
      try {
        wait(Math.min(timeToWait, threadWakeFrequency));
      } catch (InterruptedException ie) {
        final long totalTime = EnvironmentEdgeManager.currentTimeMillis() - startTime;
        if (totalTime > 0) {
          this.updatesBlockedMs.add(totalTime);
        }
        LOG.info("Interrupted while waiting to unblock updates for region "
          + this + " '" + Thread.currentThread().getName() + "'");
        InterruptedIOException iie = new InterruptedIOException();
        iie.initCause(ie);
        throw iie;
      }
    }
  }
  ......
}
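The numbers behind blockingMemStoreSize, as a small illustrative snippet; hbase.hregion.memstore.flush.size and hbase.hregion.memstore.block.multiplier are the corresponding config knobs, with the defaults described above.

public class RegionBlockingThreshold {
    public static void main(String[] args) {
        long memstoreFlushSize = 128L * 1024 * 1024;          // default 128MB
        int blockMultiplier = 2;                              // default multiplier
        long blockingMemStoreSize = memstoreFlushSize * blockMultiplier;
        System.out.println("updates block above "
            + (blockingMemStoreSize >> 20) + "MB per region"); // 256MB
    }
}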
5. Take the row lock; if it cannot be acquired, the put is not processed.
private Integer internalObtainRowLock(final byte[] row, boolean waitForLock)
    throws IOException {
  // Check that the row falls within this region's range.
  checkRow(row, "row lock");
  startRegionOperation();
  try {
    HashedBytes rowKey = new HashedBytes(row);
    // The row lock is a latch; releasing it counts the latch down by one,
    // which wakes up any waiting threads.
    CountDownLatch rowLatch = new CountDownLatch(1);

    // loop until we acquire the row lock (unless !waitForLock)
    while (true) {
      // Try to install our latch.
      CountDownLatch existingLatch = lockedRows.putIfAbsent(rowKey, rowLatch);
      // If no lock was present, we now own the lock.
      if (existingLatch == null) {
        break;
      }
      // The row is already locked: wait for release or time out.
      else {
        // row already locked
        if (!waitForLock) {
          return null;
        }
        try {
          if (!existingLatch.await(this.rowLockWaitDuration, TimeUnit.MILLISECONDS)) {
            throw new IOException("Timed out on getting lock for row="
              + Bytes.toStringBinary(row));
          }
        } catch (InterruptedException ie) {
          // Empty
        }
      }
    }

    // loop until we generate an unused lock id
    // The lock id is an atomically incremented integer.
    while (true) {
      Integer lockId = lockIdGenerator.incrementAndGet();
      HashedBytes existingRowKey = lockIds.putIfAbsent(lockId, rowKey);
      if (existingRowKey == null) {
        return lockId;
      } else {
        // lockId already in use, jump generator to a new spot
        lockIdGenerator.set(rand.nextInt());
      }
    }
  } finally {
    closeRegionOperation();
  }
}

6. Set each KeyValue's timestamp to the current time.
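A sketch of the idea with stand-in types, not org.apache.hadoop.hbase.KeyValue: cells whose timestamp was left at the "latest timestamp" placeholder get stamped with the region server's current time, so ordering is decided server-side. The placeholder detail is an assumption of this sketch rather than something stated above.

public class TimestampSketch {
    static final long LATEST_TIMESTAMP = Long.MAX_VALUE;

    static class Cell {
        final byte[] row;
        long timestamp = LATEST_TIMESTAMP;
        Cell(byte[] row) { this.row = row; }
    }

    static void stamp(Iterable<Cell> cells, long now) {
        for (Cell c : cells) {
            if (c.timestamp == LATEST_TIMESTAMP) {  // only placeholder timestamps are rewritten
                c.timestamp = now;
            }
        }
    }

    public static void main(String[] args) {
        Cell c = new Cell("row1".getBytes());
        stamp(java.util.Collections.singletonList(c), System.currentTimeMillis());
        System.out.println(c.timestamp);            // the server-assigned write time
    }
}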
7. Obtain an MVCC write transaction id.
public WriteEntry beginMemstoreInsert() {
  synchronized (writeQueue) {
    // The transaction id is a monotonically increasing long, incremented under the lock.
    long nextWriteNumber = ++memstoreWrite;
    // The entry records this transaction's state, i.e. whether it has completed.
    WriteEntry e = new WriteEntry(nextWriteNumber);
    writeQueue.add(e);
    return e;
  }
}

8. Write the KeyValues into the memstore's in-memory kv list.
private long internalAdd(final KeyValue toAdd) {
  // How much heap size this add contributed.
  long s = heapSizeChange(toAdd, this.kvset.add(toAdd));
  timeRangeTracker.includeTimestamp(toAdd);
  this.size.addAndGet(s);
  return s;
}

9. Write the edit to the HLog (WAL), but do not flush it yet; it stays in memory.
private long append(HRegionInfo info, byte [] tableName, WALEdit edits, UUID clusterId,
    final long now, HTableDescriptor htd, boolean doSync)
    throws IOException {
  ......
  long txid = 0;
  synchronized (this.updateLock) {
    // The log sequence number.
    long seqNum = obtainSeqNum();
    // The 'lastSeqWritten' map holds the sequence number of the oldest
    // write for each region (i.e. the first edit added to the particular
    // memstore). When the cache is flushed, the entry for the
    // region being flushed is removed if the sequence number of the flush
    // is greater than or equal to the value in lastSeqWritten.
    // Use encoded name. Its shorter, guaranteed unique and a subset of
    // actual name.
    byte [] encodedRegionName = info.getEncodedNameAsBytes();
    // Sequence number of this region's oldest unflushed edit; at flush time, all
    // entries with a sequence number greater than or equal to it are written out.
    this.lastSeqWritten.putIfAbsent(encodedRegionName, seqNum);
    HLogKey logKey = makeKey(encodedRegionName, tableName, seqNum, now, clusterId);
    doWrite(info, logKey, edits, htd);
    this.numEntries.incrementAndGet();
    // The transaction id, i.e. the ordinal of this log entry.
    txid = this.unflushedEntries.incrementAndGet();
    if (htd.isDeferredLogFlush()) {
      lastDeferredTxid = txid;
    }
  }
  // Sync if catalog region, and if not then check if that table supports
  // deferred log flushing
  if (doSync && (info.isMetaRegion() || !htd.isDeferredLogFlush())) {
    // sync txn to file system
    this.sync(txid);
  }
  return txid;
}
The log writes first go into a cache (pendingWrites):

// appends new writes to the pendingWrites. It is better to keep it in
// our own queue rather than writing it to the HDFS output stream because
// HDFSOutputStream.writeChunk is not lightweight at all.
synchronized void append(Entry e) throws IOException {
  pendingWrites.add(e);
}

10. Release the row lock.
public void releaseRowLock(final Integer lockId) {
  if (lockId == null) return; // null lock id, do nothing
  // Remove the lock id first.
  HashedBytes rowKey = lockIds.remove(lockId);
  if (rowKey == null) {
    LOG.warn("Release unknown lockId: " + lockId);
    return;
  }
  // Then remove the lock itself.
  CountDownLatch rowLatch = lockedRows.remove(rowKey);
  if (rowLatch == null) {
    LOG.error("Releases row not locked, lockId: " + lockId + " row: " + rowKey);
    return;
  }
  // Release the lock by counting the latch down.
  rowLatch.countDown();
}

11. Flush (sync) the HLog to HDFS.
// sync all transactions upto the specified txid
private void syncer(long txid) throws IOException {
  Writer tempWriter;
  synchronized (this.updateLock) {
    if (this.closed) return;
    tempWriter = this.writer; // guaranteed non-null
  }
  // if the transaction that we are interested in is already
  // synced, then return immediately.
  // The log may already have been synced up to this txid by another RPC thread.
  if (txid <= this.syncedTillHere) {
    return;
  }
  try {
    long doneUpto;
    long now = System.currentTimeMillis();
    // First flush all the pending writes to HDFS. Then
    // issue the sync to HDFS. If sync is successful, then update
    // syncedTillHere to indicate that transactions till this
    // number has been successfully synced.
    synchronized (flushLock) {
      if (txid <= this.syncedTillHere) {
        return;
      }
      doneUpto = this.unflushedEntries.get();
      // All currently cached (pending) log entries.
      List<Entry> pending = logSyncerThread.getPendingWrites();
      try {
        // Write them to the writer, but do not sync to HDFS yet.
        logSyncerThread.hlogFlush(tempWriter, pending);
      } catch(IOException io) {
        synchronized (this.updateLock) {
          // HBASE-4387, HBASE-5623, retry with updateLock held
          tempWriter = this.writer;
          logSyncerThread.hlogFlush(tempWriter, pending);
        }
      }
    }
    // another thread might have sync'ed avoid double-sync'ing
    if (txid <= this.syncedTillHere) {
      return;
    }
    try {
      // Sync to HDFS; retry once on failure.
      tempWriter.sync();
    } catch(IOException io) {
      synchronized (this.updateLock) {
        // HBASE-4387, HBASE-5623, retry with updateLock held
        tempWriter = this.writer;
        tempWriter.sync();
      }
    }
    // Record the highest txid synced so far.
    this.syncedTillHere = Math.max(this.syncedTillHere, doneUpto);
    ......
  } catch (IOException e) {
    LOG.fatal("Could not sync. Requesting close of hlog", e);
    // Request a log roll.
    requestLogRoll();
    throw e;
  }
}
@Override
public void append(HLog.Entry entry) throws IOException {
  entry.setCompressionContext(compressionContext);
  try {
    // Append to the underlying SequenceFile.
    this.writer.append(entry.getKey(), entry.getEdit());
  } catch (NullPointerException npe) {
    // Concurrent close...
    throw new IOException(npe);
  }
}
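A condensed, self-contained sketch of the group-commit pattern in syncer() above (stand-in types, no HDFS involved): writes pile up in a pending list with increasing txids, one sync() call covers everything appended so far, and any caller whose txid is already covered returns without syncing again. Field names mirror the source, but the class is illustrative.

import java.util.ArrayList;
import java.util.List;

public class GroupCommitSketch {
    private final List<String> pendingWrites = new ArrayList<>();
    private long unflushedEntries = 0;   // txid of the newest appended edit
    private long syncedTillHere = 0;     // txid covered by the last successful sync

    synchronized long append(String edit) {
        pendingWrites.add(edit);
        return ++unflushedEntries;       // the caller remembers this txid
    }

    synchronized void sync(long txid) {
        if (txid <= syncedTillHere) return;   // someone else already synced past us
        long doneUpto = unflushedEntries;
        pendingWrites.clear();                // stand-in for hlogFlush + writer.sync()
        syncedTillHere = Math.max(syncedTillHere, doneUpto);
    }

    public static void main(String[] args) {
        GroupCommitSketch log = new GroupCommitSketch();
        long t1 = log.append("edit-1");
        long t2 = log.append("edit-2");
        log.sync(t2);   // one sync covers both edits
        log.sync(t1);   // returns immediately: t1 <= syncedTillHere
        System.out.println("synced up to txid " + t2);
    }
}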
12. Advance the MVCC read transaction id (the read point).
public void completeMemstoreInsert(WriteEntry e) {
  // Advance the read transaction id.
  advanceMemstore(e);
  // Wait for all earlier writes to complete.
  waitForRead(e);
}
boolean advanceMemstore(WriteEntry e) {
  synchronized (writeQueue) {
    // Mark this transaction as completed.
    e.markCompleted();

    long nextReadValue = -1;
    boolean ranOnce=false;
    // Walk the queue for the latest fully-completed transaction id; if an earlier
    // write is still pending, the id found may be smaller than this transaction's.
    while (!writeQueue.isEmpty()) {
      ranOnce=true;
      WriteEntry queueFirst = writeQueue.getFirst();

      if (nextReadValue > 0) {
        if (nextReadValue+1 != queueFirst.getWriteNumber()) {
          throw new RuntimeException("invariant in completeMemstoreInsert violated, prev: "
            + nextReadValue + " next: " + queueFirst.getWriteNumber());
        }
      }

      if (queueFirst.isCompleted()) {
        nextReadValue = queueFirst.getWriteNumber();
        writeQueue.removeFirst();
      } else {
        break;
      }
    }

    if (!ranOnce) {
      throw new RuntimeException("never was a first");
    }

    // Advance the read transaction id: every transaction with an id up to this
    // value has completed and is visible to reads.
    if (nextReadValue > 0) {
      synchronized (readWaiters) {
        memstoreRead = nextReadValue;
        readWaiters.notifyAll();
      }
    }
    if (memstoreRead >= e.getWriteNumber()) {
      return true;
    }
    return false;
  }
}
/**
 * Wait for the global readPoint to advance upto
 * the specified transaction number.
 */
public void waitForRead(WriteEntry e) {
  boolean interrupted = false;
  synchronized (readWaiters) {
    // If earlier writes have not finished yet, wait for them to complete.
    while (memstoreRead < e.getWriteNumber()) {
      try {
        readWaiters.wait(0);
      } catch (InterruptedException ie) {
        // We were interrupted... finish the loop -- i.e. cleanup --and then
        // on our way out, reset the interrupt flag.
        interrupted = true;
      }
    }
  }
  if (interrupted) Thread.currentThread().interrupt();
}
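A minimal, runnable illustration (not HBase code) of the read-point rule enforced by advanceMemstore(): the read point only advances past a write number once every earlier write has completed, so a write that finishes out of order never becomes visible early.

import java.util.LinkedList;

public class ReadPointSketch {
    static class Entry {
        final long writeNumber;
        boolean completed;
        Entry(long writeNumber) { this.writeNumber = writeNumber; }
    }

    private final LinkedList<Entry> queue = new LinkedList<>();
    private long memstoreWrite = 0;   // last assigned write number
    private long memstoreRead = 0;    // highest write number visible to readers

    synchronized Entry begin() {
        Entry e = new Entry(++memstoreWrite);
        queue.add(e);
        return e;
    }

    synchronized void complete(Entry e) {
        e.completed = true;
        // Pop the completed prefix; stop at the first unfinished write.
        while (!queue.isEmpty() && queue.getFirst().completed) {
            memstoreRead = queue.removeFirst().writeNumber;
        }
    }

    public static void main(String[] args) {
        ReadPointSketch mvcc = new ReadPointSketch();
        Entry a = mvcc.begin();  // write #1
        Entry b = mvcc.begin();  // write #2
        mvcc.complete(b);        // #2 finishes first...
        System.out.println(mvcc.memstoreRead); // 0: still hidden, #1 is pending
        mvcc.complete(a);        // now the whole prefix is done
        System.out.println(mvcc.memstoreRead); // 2: both writes become visible
    }
}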
13. Check whether the memstore size now exceeds memstoreFlushSize; if so, request a flush (asynchronous).
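A rough sketch of what this step amounts to, with made-up names (memstoreSize, maybeRequestFlush, and an executor standing in for HBase's flush thread): crossing memstoreFlushSize only queues a flush request, and the put itself returns without waiting.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicLong;

public class FlushCheckSketch {
    static final long MEMSTORE_FLUSH_SIZE = 128L * 1024 * 1024;  // default 128MB
    static final AtomicLong memstoreSize = new AtomicLong();
    static final ExecutorService flushThread = Executors.newSingleThreadExecutor();

    static void maybeRequestFlush() {
        if (memstoreSize.get() > MEMSTORE_FLUSH_SIZE) {
            flushThread.submit(() -> {
                long flushed = memstoreSize.getAndSet(0);    // stand-in for a real flush
                System.out.println("flushed " + flushed + " bytes");
            });
        }
    }

    public static void main(String[] args) {
        memstoreSize.addAndGet(200L * 1024 * 1024);  // pretend a batch landed 200MB
        maybeRequestFlush();                         // queued; the caller does not block
        flushThread.shutdown();
    }
}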
14. Return the results; for any put that did not obtain its row lock, the result is null.