HBase的table是按region切分的,client操作一个row的时候,如何知道这个row对应的region是在哪台Region server上呢?这里有个region location过程。主要涉及到2张系统表:-ROOT-和.META.。其结构见图
在zookeeper的/hbase/root-region-server节点中存着-ROOT-表所在的Region Server地址。
-ROOT-表的一个row代表着META的一个region信息,其key的结构是META表名,META表Region的startkey,RegionId。其value主要保存regioninfo和server信息。ROOT表不能split。
.META.表的一个row代表着用户表的一个region信息,其key其实就是用户表的regionName,结构是用户表名,startKey,RegionId。其value同样保存着regioninfo和server信息。META表可以split,但是一个region默认有128M,可以存上亿个用户表的region信息,所以一般不会split。
其查找过程如下:
1.通过zk getData拿-ROOT-表的location
2.RPC -ROOT-表的rs,getClosestRowBefore,拿row对应的meta表的region location
3.RPC .META.表的某一个region,拿该row在真实table所在的region location
4.RPC对应region
region location需要3次网络IO,为了提升性能,client会cache数据。
LocationCache是一个2级Map,第一级的key是tableName的hash值,第二级的key是startRow,用SoftValueSortedMap包装了TreeMap实现,用软引用实现cache,内存不够时才会回收。Cache里存着META表和用户表的region location信息。
其代码实现如下,0.94版本:
HConnectionManager locateRegion入口
/**
 * Entry point for region location: resolves which region server hosts the
 * region that contains {@code row} of {@code tableName}.
 *
 * Dispatches on the table being looked up:
 * -ROOT-  : location is read from ZooKeeper via the root region tracker;
 * .META.  : resolved through locateRegionInMeta with -ROOT- as the parent table;
 * other   : resolved through locateRegionInMeta with .META. as the parent table.
 *
 * @param tableName table whose region is being located
 * @param row row key to locate (for .META. lookups this is an assembled region name)
 * @param useCache passed through to locateRegionInMeta; not consulted on the -ROOT- path
 * @return the region location, or null on the -ROOT- path when no server name
 *         was obtained or the waiting thread was interrupted
 * @throws IOException propagated from the catalog lookups
 */
private HRegionLocation locateRegion(final byte [] tableName,
final byte [] row, boolean useCache)
throws IOException {
// (code elided in the original excerpt)
.......
// Make sure the corresponding ZooKeeper trackers have been started.
ensureZookeeperTrackers();
// For the -ROOT- table, obtain its location from the ZooKeeper node
// /hbase/root-region-server.
if (Bytes.equals(tableName, HConstants.ROOT_TABLE_NAME)) {
try {
// Reads the node data through ZooKeeper getData; this blocks until the
// node data is available or the rpcTimeout elapses.
ServerName servername = this.rootRegionTracker.waitRootRegionLocation(this.rpcTimeout);
LOG.debug("Looked up root region location, connection=" + this +
"; serverName=" + ((servername == null)? "": servername.toString()));
if (servername == null) return null;
// Return a hand-assembled HRegionLocation: -ROOT- has exactly one region
// and never splits, so the fixed ROOT_REGIONINFO can be used.
return new HRegionLocation(HRegionInfo.ROOT_REGIONINFO,
servername.getHostname(), servername.getPort());
} catch (InterruptedException e) {
// Restore the interrupt flag for callers and report "not found".
Thread.currentThread().interrupt();
return null;
}
}
// For the .META. table: the row here is actually a region name assembled from
// the requested row, e.g. test,key1,99999999999999.
// On a cache miss, -ROOT- is queried for the location of the .META. region
// that covers this row.
else if (Bytes.equals(tableName, HConstants.META_TABLE_NAME)) {
return locateRegionInMeta(HConstants.ROOT_TABLE_NAME, tableName, row,
useCache, metaRegionLock);
}
// For a user table: the row here is the plain row key, e.g. key1.
// On a cache miss, .META. is queried for the region location of this row.
else {
// Region not in the cache - have to go to the meta RS
return locateRegionInMeta(HConstants.META_TABLE_NAME, tableName, row,
useCache, userRegionLock);
}
}
locateRegionInMeta方法
/**
 * Looks up the region of {@code tableName} containing {@code row} by querying
 * the catalog table {@code parentTable}, retrying with back-off on failure.
 *
 * Per attempt: locate the parent table's region, open a connection to its
 * server, (under the lock) optionally prefetch, re-check or clear the cache,
 * call getClosestRowBefore, validate the returned region info, then cache and
 * return the resulting location.
 *
 * @param parentTable catalog table to query (callers in this file pass -ROOT- or .META.)
 * @param tableName table whose region is being located
 * @param row row key to locate
 * @param useCache when true, a cached location is returned if present; when
 *        false, the cached location for this row is deleted before querying
 * @param regionLockObject monitor held around the prefetch / cache check / catalog query
 * @return the located (and now cached) region location
 * @throws TableNotFoundException if the catalog returns no row, or a row that
 *         belongs to a different table
 * @throws NoServerForRegionException once tries reaches numRetries
 * @throws IOException on the last failed attempt, or if the thread is
 *         interrupted during back-off
 */
private HRegionLocation locateRegionInMeta(final byte [] parentTable,
final byte [] tableName, final byte [] row, boolean useCache,
Object regionLockObject)
throws IOException {
HRegionLocation location;
// If we are supposed to be using the cache, look in the cache to see if
// we already have the region.
// Read the cache first; only go up to the parent table on a miss.
// Note: if the region location of the row key has changed, the RPC will fail,
// and the client's retry is made with useCache == false.
if (useCache) {
location = getCachedLocation(tableName, row);
if (location != null) {
return location;
}
}
// build the key of the meta region we should be looking for.
// the extra 9's on the end are necessary to allow "exact" matches
// without knowing the precise region names.
// Assemble the key to search for, e.g. test,key1,99999999999999
byte [] metaKey = HRegionInfo.createRegionName(tableName, row,
HConstants.NINES, false);
// Retry loop bounded by numRetries (10 by default, per the original author's note).
for (int tries = 0; true; tries++) {
// Retries exhausted: the region could not be found.
if (tries >= numRetries) {
throw new NoServerForRegionException("Unable to find region for "
+ Bytes.toStringBinary(row) + " after " + numRetries + " tries.");
}
HRegionLocation metaLocation = null;
try {
// locate the root or meta region
// Look up the parent table's region; this goes back through locateRegion,
// so the lookup is recursive (user table -> .META. -> -ROOT-).
// NOTE(review): this two-argument overload is not shown in this excerpt.
metaLocation = locateRegion(parentTable, metaKey);
// If null still, go around again.
// (continue skips the back-off sleep at the bottom of the loop body.)
if (metaLocation == null) continue;
// The region server address is known, so an RPC can now be issued.
// First obtain an RPC proxy object for that server.
HRegionInterface server =
getHRegionConnection(metaLocation.getHostname(), metaLocation.getPort());
Result regionInfoRow = null;
// This block guards against two threads trying to load the meta
// region at the same time. The first will load the meta region and
// the second will use the value that the first one found.
synchronized (regionLockObject) {
// If the parent table is META, we may want to pre-fetch some
// region info into the global region cache for this table.
// When parentTable is .META., prefetch some .META. rows ahead of time
// (10 rows by default, per the original author's note).
if (Bytes.equals(parentTable, HConstants.META_TABLE_NAME) &&
(getRegionCachePrefetch(tableName)) ) {
prefetchRegionCache(tableName, row);
}
// Check the cache again for a hit in case some other thread made the
// same query while we were waiting on the lock. If not supposed to
// be using the cache, delete any existing cached location so it won't
// interfere.
if (useCache) {
location = getCachedLocation(tableName, row);
if (location != null) {
return location;
}
}
// Not using the cache: evict the stale entry. E.g. the region for this row
// has split, an RPC against the old location threw, and the caller is now
// re-locating with useCache == false, so the old cached entry is removed.
else {
deleteCachedLocation(tableName, row);
}
// Query the root or meta region for the location of the meta region
// Issue the RPC: fetch the closest catalog row whose key is <= metaKey.
regionInfoRow = server.getClosestRowBefore(
metaLocation.getRegionInfo().getRegionName(), metaKey,
HConstants.CATALOG_FAMILY);
}
if (regionInfoRow == null) {
throw new TableNotFoundException(Bytes.toString(tableName));
}
// Extract the serialized region info and validate it; the region may be in
// a transitional state.
byte [] value = regionInfoRow.getValue(HConstants.CATALOG_FAMILY,
HConstants.REGIONINFO_QUALIFIER);
if (value == null || value.length == 0) {
throw new IOException("HRegionInfo was null or empty in " +
Bytes.toString(parentTable) + ", row=" + regionInfoRow);
}
// convert the row result into the HRegionLocation we need!
// Deserialize the bytes into an HRegionInfo.
HRegionInfo regionInfo = (HRegionInfo) Writables.getWritable(
value, new HRegionInfo());
// possible we got a region of a different table...
// Sanity checks: wrong table, split parent, or offline region.
if (!Bytes.equals(regionInfo.getTableName(), tableName)) {
throw new TableNotFoundException(
"Table '" + Bytes.toString(tableName) + "' was not found, got: " +
Bytes.toString(regionInfo.getTableName()) + ".");
}
if (regionInfo.isSplit()) {
throw new RegionOfflineException("the only available region for" +
" the required row is a split parent," +
" the daughters should be online soon: " +
regionInfo.getRegionNameAsString());
}
if (regionInfo.isOffline()) {
throw new RegionOfflineException("the region is offline, could" +
" be caused by a disable table call: " +
regionInfo.getRegionNameAsString());
}
// Read the server location (host:port) recorded for this region.
value = regionInfoRow.getValue(HConstants.CATALOG_FAMILY,
HConstants.SERVER_QUALIFIER);
String hostAndPort = "";
if (value != null) {
hostAndPort = Bytes.toString(value);
}
// (code elided in the original excerpt)
......
// Instantiate the location
String hostname = Addressing.parseHostname(hostAndPort);
int port = Addressing.parsePort(hostAndPort);
location = new HRegionLocation(regionInfo, hostname, port);
// Cache the freshly resolved location before returning it.
cacheLocation(tableName, location);
return location;
} catch (TableNotFoundException e) {
// if we got this error, probably means the table just plain doesn't
// exist. rethrow the error immediately. this should always be coming
// from the HTable constructor.
throw e;
} catch (IOException e) {
// Unwrap a server-side exception so the instanceof checks below see the real type.
if (e instanceof RemoteException) {
e = RemoteExceptionHandler.decodeRemoteException((RemoteException) e);
}
// Rethrow only on the final attempt; earlier attempts fall through to retry.
if (tries < numRetries - 1) {
// (code elided in the original excerpt)
.......
} else {
throw e;
}
// Only relocate the parent region if necessary
// For failures other than "region offline" / "no server for region"
// (e.g. network problems), look the parent region up again.
if(!(e instanceof RegionOfflineException ||
e instanceof NoServerForRegionException)) {
relocateRegion(parentTable, metaKey);
}
}
// Back off before the next attempt: the pause is computed from the retry
// count (longer with more retries, per the original note); an interrupt
// aborts the retry loop.
try{
Thread.sleep(ConnectionUtils.getPauseTime(this.pause, tries));
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new IOException("Giving up trying to location region in " +
"meta: thread is interrupted.");
}
}
}