Region的架构
HRegionServer:
配置:
hbase.client.retries.number (默认10) 客户端的重试次数
hbase.regionserver.msginterval (默认3*1000) RegionServer向Master汇报状态(心跳消息)的时间间隔,单位毫秒
hbase.regionserver.checksum.verify(默认false) 是否启用checksum
hbase.server.thread.wakefrequency(默认10*1000) 线程检查频率
hbase.regionserver.numregionstoreport(默认10) 每次向Master汇报时上报的region个数(待确认)
hbase.regionserver.handler.count(默认10) handler处理线程个数
hbase.regionserver.metahandler.count(默认10) 处理meta和root的线程个数
hbase.rpc.verbose(默认false)
hbase.regionserver.nbreservationblocks(默认4)
hbase.regionserver.compactionChecker.majorCompactPriority(默认Integer.MAX_VALUE)
HRegionServer的主要操作:
包含的类有
HRegion集合
Leases(租借时间检查)
HMasterRegionInterface(管理hbase)
HServerLoad(hbase负载)
CompactSplitThread(用于合并处理)
MemStoreFlusher(用于刷新memstore)
HLog(WAL相关)
LogRoller(日志回滚)
ZooKeeperWatcher(zk监听)
SplitLogWorker(用于切分日志)
ExecutorService(用于启动open,close HRegion的线程池)
ReplicationSourceService和ReplicationSinkService(replication相关)
HealthCheckChore(健康检查)
一些监听类
MasterAddressTracker
CatalogTracker
ClusterStatusTracker
一些函数
postOpenDeployTasks() 此函数用于更新root表或meta表
各种CRUD,scanner,increment操作
multi操作(对于delete和put)
对HRegion的flush,close,open(提交到线程池去做)
split,compact操作,这些最终由一个具体的HRegion去完成
启动的线程
hbase.regionserver.executor.openregion.threads 3
hbase.regionserver.executor.openroot.threads 1
hbase.regionserver.executor.openmeta.threads 1
hbase.regionserver.executor.closeregion.threads 3
hbase.regionserver.executor.closeroot.threads 1
hbase.regionserver.executor.closemeta.threads 1
hlog roller
cache flusher
compact
health check
lease
WEB UI
replication
rpc server
split worker
HRegion
配置:
HRegion的主要操作:
1.CRUD和increment操作
2.doMiniBatchMutation操作(用于delete和put)
3.对region的open,delete,init,close,以及addRegionToMeta等操作
4.snapshot
5.bulkload
6.split
7.compact(major,minor)
8.lock
包含的内部类
WriteState(在flush,close,compact时会根据这个类加锁)
RegionScannerImpl(scan的region级别操作)
coprocessor的处理原理
-
- coprocessorHost = new RegionCoprocessorHost(this, rsServices, conf);
-
-
- protected SortedSet coprocessors = new SortedCopyOnWriteSet(new EnvironmentPriorityComparator());
- public RegionCoprocessorHost类中() {
-
- loadSystemCoprocessors(conf,"hbase.coprocessor.region.classes");
-
-
- if (!HTableDescriptor.isMetaTable(region.getRegionInfo().getTableName())) {
- loadSystemCoprocessors(conf,"hbase.coprocessor.user.region.classes");
- }
-
-
- loadTableCoprocessors(conf);
- }
-
- public void load相关函数() {
-
-
- }
-
-
-
- HRegion#flush() {
-
-
-
- }
服务端接收处理过程
HBaseServer$Listener的run()函数和doAccept()函数简化如下 这是一个独立的listener线程
- while (running) {
- SelectionKey key = null;
- selector.select();
- Iterator iter = selector.selectedKeys().iterator();
- while (iter.hasNext()) {
- key = iter.next();
- iter.remove();
- if (key.isValid()) {
- if (key.isAcceptable())
- doAccept(key);
- }
- }
- }
- }
-
- void doAccept(SelectionKey key) {
- ServerSocketChannel server = (ServerSocketChannel) key.channel();
- currentReader = (currentReader + 1) % readers.length;
- Reader reader = readers[currentReader];
- readSelector.wakeup();
- SelectionKey readKey = reader.registerChannel(channel);
- c = getConnection(channel, System.currentTimeMillis());
- readKey.attach(c);
- }
HBaseServer$Listener$Reader的run()函数简化如下 这是一个独立的select线程
- while (running) {
- SelectionKey key = null;
- readSelector.select();
- while (adding) {
- this.wait(1000);
- }
- Iterator iter = readSelector.selectedKeys().iterator();
- while (iter.hasNext()) {
- key = iter.next();
- iter.remove();
- if (key.isValid()) {
- if (key.isReadable()) {
- doRead(key);
- }
- }
- }
- }
-
-
-
-
- void process() {
- int id = dis.readInt();
- param = ReflectionUtils.newInstance(paramClass, conf);
- param.readFields(dis);
- Call call = new Call(id, param, this, responder, callSize);
-
- if (priorityCallQueue != null && getQosLevel(param) > highPriorityLevel) {
- priorityCallQueue.put(call);
- } else if (replicationQueue != null && getQosLevel(param) == HConstants.REPLICATION_QOS) {
- replicationQueue.put(call);
- } else {
- callQueue.put(call);
- }
- }
HBaseServer$Handler的run()函数简化如下
- public void run() {
-
- Call call = myCallQueue.take();
- Invocation call = (Invocation)param;
- Method method = protocol.getMethod(call.getMethodName(),
- call.getParameterClasses());
- Object[] params = call.getParameters();
- Object value = method.invoke(impl, params);
-
-
-
- String name = HRegionInfo.encodeRegionName(regionName)
- onlineRegions.get(name);
- Result r = region.getClosestRowBefore(row, family);
- return r;
- }
flush的过程
服务端是收到了客户端发来的flushRegion请求,具体过程参见 客户端请求过程一文
客户端如果是flush全表,先是获取这个表的所有region名字,然后做一次批量的flushRegion请求(多个请求),但是所有的请求都是在一个线程中执行的
和flush相关的类函数简化如下,1-4是调用顺序
1.HRegion#flushcache()
2.HRegion#internalFlushcache()
3.Store#internalFlushCache()
4.StoreFile$Writer#append()
-
- HRegion#flushcache() {
- try {
- lock.readLock().lock();
- internalFlushcache(status);
- } finally {
- lock.readLock().unlock();
- }
- }
-
-
-
-
- HRegion#internalFlushcache() {
- try {
- this.updatesLock.writeLock().lock();
- List storeFlushers = new ArrayList(stores.size());
- for (Store s : stores.values()) {
- storeFlushers.add(s.getStoreFlusher(completeSequenceId));
- }
- } finally {
- this.updatesLock.writeLock().unlock();
- }
- for (StoreFlusher flusher : storeFlushers) {
- flusher.flushCache(status);
- }
- }
-
-
-
- Store#internalFlushCache() {
- InternalScanner scanner = null;
- KeyValueScanner memstoreScanner = new CollectionBackedScanner(set, this.comparator);
-
- Scan scan = new Scan();
- scan.setMaxVersions(scanInfo.getMaxVersions());
- scanner = new StoreScanner(this, scanInfo, scan,
- Collections.singletonList(memstoreScanner), ScanType.MINOR_COMPACT,
- this.region.getSmallestReadPoint(), HConstants.OLDEST_TIMESTAMP);
-
- try {
- flushLock.lock();
- StoreFile.Writer writer = createWriterInTmp(set.size());
- List kvs = new ArrayList();
- boolean hasMore;
- do {
- hasMore = scanner.next(kvs, compactionKVMax);
- for (KeyValue kv : kvs) {
- Writer.append(kv);
- flushed += this.memstore.heapSizeChange(kv, true);
- }
- kvs.clear();
- }while(hasMore);
- } finally {
- flushLock.unlock();
- }
- }
-
-
-
- StoreFile$Writer#append(final KeyValue kv) {
- appendGeneralBloomfilter(kv);
- appendDeleteFamilyBloomFilter(kv);
- HFileWriterV2#append(kv);
- trackTimestamps(kv);
- }
单个多个put和多个delete的过程
最终是将KeyValue存到KeyValueSkipListSet中,这个类内部是采用ConcurrentSkipListMap实现的
服务端是接收到客户端发来的multi请求
注意只有put操作(单个put和批量put操作)以及批量的delete操作才会执行上面的调用逻辑
incr和单个delete采用了不同的处理逻辑
简化的核心处理函数如下:
-
- HRegion#doMiniBatchMutation() {
-
-
- lock(this.updatesLock.readLock(), numReadyToWrite);
-
-
- long addedSize = 0;
- for (int i = firstIndex; i < lastIndexExclusive; i++) {
- addedSize += applyFamilyMapToMemstore(familyMaps[i], w);
- }
-
-
- addFamilyMapToWALEdit(familyMaps[i], walEdit);
-
-
- HLog.appendNoSync(regionInfo, this.htableDescriptor.getName(),
- walEdit, first.getClusterId(), now, this.htableDescriptor);
-
-
- this.updatesLock.readLock().unlock();
-
-
- mvcc.completeMemstoreInsert(w);
-
- }
这里没有memstore满了的判断逻辑,而是由单独的一个线程(cacheFlusher)处理的
写入到memstore的判断逻辑图
incr的过程
核心处理逻辑如下
- HRegion#increment() {
- Map> tempMemstore = new HashMap>();
- try {
- Integer lid = getLock(lockid, row, true);
- lock(this.updatesLock.readLock());
- byte [] row = increment.getRow();
- Get get = new Get(row);
- List results = get(get, false);
- for(KeyValue kv : results) {
- KeyValue kv = results.get();
- if(kv.getValueLength() == Bytes.SIZEOF_LONG) {
- amount += Bytes.toLong(kv.getBuffer(), kv.getValueOffset(), Bytes.SIZEOF_LONG);
- } else {
- throw new DoNotRetryIOException("Attempted to increment field that isn't 64 bits wide");
- }
- }
-
- if (writeToWAL) {
- walEdits.add(newKV);
- }
- tempMemstore.put(store, kvs);
-
-
- size = this.addAndGetGlobalMemstoreSize(size);
- flush = isFlushSize(size);
- if (flush) {
- requestFlush();
- }
- } finally {
- this.updatesLock.readLock().unlock();
- releaseRowLock(lid);
- }
- }
可以看到increment的执行流程是先根据row创建Get对象,然后获取这个值,再对这个值做++操作
并将结果放到临时缓存中,如果缓存已满就做刷新
从获取数据,到做++操作,最后写入缓存(可能还要做刷新处理),这么一段过程都是需要加锁处理的,加的锁只是一个行锁
单个delete的过程
主要处理简化逻辑如下
- HRegion#delete(){
- try {
- lid = getLock(lockid, row, true);
- internalDelete()
- } finally {
- releaseRowLock(lid);
- }
- }
-
- HRegion#internalDelete() {
- try {
- updatesLock.readLock().lock();
-
- for(family : 获取delete关联的所有family) {
- Store store = getStore(family);
- for (KeyValue kv: edits) {
- kv.setMemstoreTS(localizedWriteEntry.getWriteNumber());
- addedSize += store.add(kv);
- }
- }
- flush = isFlushSize(this.addAndGetGlobalMemstoreSize(addedSize));
- if (flush) {
- requestFlush();
- }
- } finally {
- updatesLock.readLock().unlock();
- }
- }
delete是将所有的column family都遍历一遍,然后删除和这个key相关的所有family,并写入缓存中,如果缓存满了就做刷新处理,同时在删除的时候会有更新锁。
get的过程
下面是核心处理逻辑,可以看到get最后是通过scan来处理的,也就是简单的将scan包装了一下
- HRegion#get() {
- List results = new ArrayList();
- Scan scan = new Scan(get);
- RegionScanner scanner = getScanner(scan);
- List list = scanner.next(results, SchemaMetrics.METRIC_GETSIZE);
- return Result(list);
- }
scan过程
scan是最复杂的操作,其中包含了getClosestRowBefore,openScanner,next三个操作
第一个是用于对META和ROOT表操作的,第二个用于创建一个scan对象,第三个用于做遍历操作
首先看第一个closestRowBefore的时序图
这里简单来说有这么几步操作
1.通过Store调用HFileReaderV2,这里主要用于打开一个HFile文件,然后定位到指定的key前面或者后面。
这步操作是用于在ROOT表中获取特定的KeyValue,info:server这个KeyValue,然后将这个值封装成
Get对象再去查询META表
2.调用get函数对数据进行获取,get内部又是调用scan函数的,所以实际会创建一个StoreScanner对象
3.StoreScanner也就是对底层的HFileScanner的简单封装
4.之后调用next()获取一段数据,这里还会有嵌入了filter的执行逻辑
5.最后返回给用户的是Result结果,这里就是META表中的一条记录
getClosestRowBefore的调用栈如下
scan操作的类图如下
Store是核心的类,这个类中包含了若干个StoreFile,每个StoreFile类中又有一个Reader和Writer内部类。
通过Reader内部类可以返回一个StoreFileScanner对象
而最终上层在做scan的时候,是通过RegionScannerImpl去做的,这里就包含了filter的过滤逻辑。
执行逻辑如下
-
-
-
-
- HRegion#getClosestRowBefore() {
- startRegionOperation();
- Store store = getStore(family);
- KeyValue key = store.getRowKeyAtOrBefore(row);
- if (key != null) {
- Get get = new Get(key.getRow());
- get.addFamily(family);
- result = get(get, null);
- }
- }
-
-
-
-
-
-
-
- Store#getRowKeyAtOrBefore() {
- this.memstore.getRowKeyAtOrBefore(state);
- for (StoreFile sf : Lists.reverse(storefiles)) {
- rowAtOrBeforeFromStoreFile(sf, state);
- }
- }
-
-
- Store#rowAtOrBeforeFromStoreFile() {
- HFileScanner scanner = r.getScanner(true, true, false);
- if (!seekToScanner(scanner, firstOnRow, firstKV)) return;
- if (walkForwardInSingleRow(scanner, firstOnRow, state)) return;
- while (scanner.seekBefore(firstOnRow.getBuffer(), firstOnRow.getKeyOffset(),firstOnRow.getKeyLength())) {
- KeyValue kv = scanner.getKeyValue();
- if (!state.isTargetTable(kv)) break;
- if (!state.isBetterCandidate(kv)) break;
-
- firstOnRow = new KeyValue(kv.getRow(), HConstants.LATEST_TIMESTAMP);
-
- if (!seekToScanner(scanner, firstOnRow, firstKV)) break;
-
- if (walkForwardInSingleRow(scanner, firstOnRow, state)) break;
- }
- }
-
-
-
-
-
- HFileReaderV2#readBlock() {
- BlockCacheKey cacheKey = new BlockCacheKey(name, dataBlockOffset,
- dataBlockEncoder.getEffectiveEncodingInCache(isCompaction),
- expectedBlockType);
-
- HFileBlock cachedBlock = (HFileBlock)cacheConf.getBlockCache().
- getBlock(cacheKey, cacheBlock, useLock);
- if (cachedBlock != null) {
- return cachedBlock;
- }
- HFileBlock hfileBlock = fsBlockReader.readBlockData(dataBlockOffset,onDiskBlockSize, -1, pread);
- cacheConf.getBlockCache().cacheBlock(cacheKey, hfileBlock,cacheConf.isInMemory());
- }
-
-
-
-
-
- HRegion$RegionScannerImpl#nextRaw() {
- if (outResults.isEmpty()) {
-
-
- returnResult = nextInternal(outResults, limit, metric);
- } else {
- List tmpList = new ArrayList();
- returnResult = nextInternal(tmpList, limit, metric);
- outResults.addAll(tmpList);
- }
- }
-
-
-
-
-
-
-
-
-
-
-
- HRegion$RegionScannerImpl#nextInternal() {
-
- KeyValue current = this.storeHeap.peek();
-
- }
openScanner的执行过程
执行逻辑如下
-
-
-
-
- HRegionServer#openScanner() {
- HRegion r = getRegion(regionName);
- RegionScanner s = r.getScanner(scan);
- return addScanner(s);
- }
-
-
-
- HRegion#instantiateRegionScanner() {
-
- return new RegionScannerImpl(scan, additionalScanners, this);
- }
-
-
-
-
-
- RegionScannerImpl#init() {
- for (Map.Entry<byte[], NavigableSet<byte[]>> entry :scan.getFamilyMap().entrySet()) {
- Store store = stores.get(entry.getKey());
-
- KeyValueScanner scanner = store.getScanner(scan, entry.getValue());
- scanners.add(scanner);
- }
- }
-
-
- StoreFileScanner#seek() {
-
- seekAtOrAfter()
- }
-
-
-
-
- HRegionServer#addScanner() {
- scannerId = rand.nextLong();
- String scannerName = String.valueOf(scannerId);
- scanners.put(scannerName, s);
- this.leases.createLease(scannerName, new ScannerListener(scannerName));
- }
next的执行过程
执行逻辑如下
-
-
-
- HRegionServer#next() {
- RegionScanner s = this.scanners.get(scannID);
- this.leases.cancelLease(scannID);
- HRegion region = getRegion(s.getRegionInfo().getRegionName());
- List results = new ArrayList(nbRows);
- boolean moreRows = s.nextRaw(values, SchemaMetrics.METRIC_NEXTSIZE);
- results.add(new Result(values));
- this.leases.addLease(lease);
-
- }
-
-
-
- HRegion$RegionScannerImpl#nextRaw() {
- if (outResults.isEmpty()) {
-
-
- returnResult = nextInternal(outResults, limit, metric);
- } else {
- List tmpList = new ArrayList();
- returnResult = nextInternal(tmpList, limit, metric);
- outResults.addAll(tmpList);
- }
- }
-
-
-
- HRegion$RegionScannerImpl#nextInternal() {
- boolean stopRow = isStopRow(currentRow, offset, length);
- KeyValue nextKv = populateResult(results, this.storeHeap, limit, currentRow, offset,
- length, metric);
-
- }
-
-
-
- KeyValueHeap#next() {
- InternalScanner currentAsInternal = (InternalScanner)this.current;
- boolean mayContainMoreRows = currentAsInternal.next(result, limit, metric);
- KeyValue pee = this.current.peek();
-
- }
-
-
-
-
- StoreScanner#next() {
- switch(code) {
- case SEEK_NEXT_ROW: {
- reseek(matcher.getKeyForNextRow(kv)); break;
- }
- case SEEK_NEXT_COL: {
- reseek(matcher.getKeyForNextColumn(kv)); break;
- }
- case SKIP: {
- KeyValueHeap.next();
- }
-
- }
- }
-
-
-
- StoreFileScanner#reseek() {
- if (!reseekAtOrAfter(hfs, key)) {
- close();
- return false;
- }
- cur = HFileReaderV2$ScannerV2.getKeyValue();
- }