Load balancing is primarily about eliminating hotspots by spreading requests more evenly across application servers. HBase is a typical master-slave architecture: the HMaster is responsible for cluster-wide resource scheduling and task assignment, while data I/O happens on the HRegionServers. Each HRegionServer hosts N HRegions, each holding a slice of some table's data, and how HRegions are distributed across HRegionServers directly shapes the read/write performance of the whole HBase cluster.
HRegion load scheduling is the HMaster's job, so the balancing implementation lives in the HMaster class. Let's start with the balancing-related fields used in HMaster:
/** Chore that periodically triggers the balancer */
private BalancerChore balancerChore;
// Tracker for the load balancer's on/off state (a ZooKeeper node tracker)
LoadBalancerTracker loadBalancerTracker;
// The load balancer implementation (this article walks through StochasticLoadBalancer)
LoadBalancer balancer;
The classes that carry out HRegion migration between HRegionServers are initialized in the HMaster constructor:
public HMaster(final Configuration conf, CoordinatedStateManager csm)
throws IOException, KeeperException, InterruptedException {
super(conf, csm);
this.rsFatals = new MemoryBoundedLogMessageBuffer(
conf.getLong("hbase.master.buffer.for.rs.fatals", 1*1024*1024));
LOG.info("hbase.rootdir=" + FSUtils.getRootDir(this.conf) +
", hbase.cluster.distributed=" + this.conf.getBoolean(HConstants.CLUSTER_DISTRIBUTED, false));
// Disable usage of meta replicas in the master
this.conf.setBoolean(HConstants.USE_META_REPLICAS, false);
Replication.decorateMasterConfiguration(this.conf);
// Hack! Maps DFSClient => Master for logs. HDFS made this
// config param for task trackers, but we can piggyback off of it.
if (this.conf.get("mapreduce.task.attempt.id") == null) {
this.conf.set("mapreduce.task.attempt.id", "hb_m_" + this.serverName.toString());
}
// should we check the compression codec type at master side, default true, HBASE-6370
this.masterCheckCompression = conf.getBoolean("hbase.master.check.compression", true);
// should we check encryption settings at master side, default true
this.masterCheckEncryption = conf.getBoolean("hbase.master.check.encryption", true);
this.metricsMaster = new MetricsMaster(new MetricsMasterWrapperImpl(this));
// preload table descriptor at startup
this.preLoadTableDescriptors = conf.getBoolean("hbase.master.preload.tabledescriptors", true);
// Do we publish the status?
boolean shouldPublish = conf.getBoolean(HConstants.STATUS_PUBLISHED,
HConstants.STATUS_PUBLISHED_DEFAULT);
Class<? extends ClusterStatusPublisher.Publisher> publisherClass =
conf.getClass(ClusterStatusPublisher.STATUS_PUBLISHER_CLASS,
ClusterStatusPublisher.DEFAULT_STATUS_PUBLISHER_CLASS,
ClusterStatusPublisher.Publisher.class);
if (shouldPublish) {
if (publisherClass == null) {
LOG.warn(HConstants.STATUS_PUBLISHED + " is true, but " +
ClusterStatusPublisher.DEFAULT_STATUS_PUBLISHER_CLASS +
" is not set - not publishing status");
} else {
clusterStatusPublisherChore = new ClusterStatusPublisher(this, conf, publisherClass);
getChoreService().scheduleChore(clusterStatusPublisherChore);
}
}
// Some unit tests don't need a cluster, so no zookeeper at all
if (!conf.getBoolean("hbase.testing.nocluster", false)) {
activeMasterManager = new ActiveMasterManager(zooKeeper, this.serverName, this);
int infoPort = putUpJettyServer();
/** Start the various managers of the master that becomes active */
startActiveMasterManager(infoPort);
} else {
activeMasterManager = null;
}
}
Startup then reaches finishActiveMasterInitialization, and the step that matters most here is the initialization of the background chore that triggers balancing (there are plenty of other branches; we only follow the main path):
this.balancerChore = new BalancerChore(this);
BalancerChore extends ScheduledChore; ScheduledChore's run method calls chore(), which in turn reaches the real balance method. In other words, the master periodically checks whether HRegions need to be migrated between HRegionServers.
public class BalancerChore extends ScheduledChore {
private static final Log LOG = LogFactory.getLog(BalancerChore.class);
private final HMaster master;
// By default the chore fires every 5 minutes, calling balance() to decide whether the cluster needs to rebalance HRegions
public BalancerChore(HMaster master) {
super(master.getServerName() + "-BalancerChore", master, master.getConfiguration().getInt(
"hbase.balancer.period", 300000));
this.master = master;
}
@Override
protected void chore() {
try {
master.balance();
} catch (IOException e) {
LOG.error("Failed to balance.", e);
}
}
}
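The chore period comes from hbase.balancer.period (300000 ms, i.e. 5 minutes, by default). As a minimal sketch, assuming you build the Configuration yourself (for example in a test harness; BalancerPeriodExample is just an illustrative name), the interval can be overridden programmatically:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

public class BalancerPeriodExample {
  public static void main(String[] args) {
    Configuration conf = HBaseConfiguration.create();
    // BalancerChore reads "hbase.balancer.period"; check every minute instead.
    conf.setInt("hbase.balancer.period", 60000);
    System.out.println("balancer period = "
        + conf.getInt("hbase.balancer.period", 300000) + " ms");
  }
}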
The balance method of HMaster:
public boolean balance() throws IOException {
// if master not initialized, don't run balancer.
if (!isInitialized()) {
LOG.debug("Master has not been initialized, don't run balancer.");
return false;
}
// Do this call outside of synchronized block.
int maximumBalanceTime = getBalancerCutoffTime();
synchronized (this.balancer) {
// If the balancer switch is off, don't run the balancer.
/** Note also the enclosing synchronized(this.balancer) block: balance() is invoked every 5 minutes, so the previous run may still be in progress (i.e. the last round of HRegion moves overran the chore period), and a new round has to wait for it before deciding whether to balance again */
if (!this.loadBalancerTracker.isBalancerOn()) return false;
// Only allow one balance run at a time.
/** If any HRegion is in transition (for example splitting; such HRegions are marked with znodes in ZooKeeper), skip balancing as well */
if (this.assignmentManager.getRegionStates().isRegionsInTransition()) {
Map<String, RegionState> regionsInTransition =
this.assignmentManager.getRegionStates().getRegionsInTransition();
LOG.debug("Not running balancer because " + regionsInTransition.size() +
" region(s) in transition: " + org.apache.commons.lang.StringUtils.
abbreviate(regionsInTransition.toString(), 256));
return false;
}
/** If dead HRegionServers are still being processed, skip balancing too */
if (this.serverManager.areDeadServersInProgress()) {
LOG.debug("Not running balancer because processing dead regionserver(s): " +
this.serverManager.getDeadServers());
return false;
}
if (this.cpHost != null) {
try {
if (this.cpHost.preBalance()) {
LOG.debug("Coprocessor bypassing balancer request");
return false;
}
} catch (IOException ioe) {
LOG.error("Error invoking master coprocessor preBalance()", ioe);
return false;
}
}
/** Build the table name -> server name -> HRegion-list mapping (which also shows that balancing is computed per table) */
Map<TableName, Map<ServerName, List<HRegionInfo>>> assignmentsByTable =
this.assignmentManager.getRegionStates().getAssignmentsByTable();
List<RegionPlan> plans = new ArrayList<RegionPlan>();
//Give the balancer the current cluster state.
this.balancer.setClusterStatus(getClusterStatus());
for (Map<ServerName, List<HRegionInfo>> assignments : assignmentsByTable.values()) {
/** Each value in this loop maps an HRegionServer to all the HRegions it hosts; this call is where the balancer really decides whether HRegions need to move */
List<RegionPlan> partialPlans = this.balancer.balanceCluster(assignments);
if (partialPlans != null) plans.addAll(partialPlans);
}
long cutoffTime = System.currentTimeMillis() + maximumBalanceTime;
int rpCount = 0; // number of RegionPlans balanced so far
long totalRegPlanExecTime = 0;
if (plans != null && !plans.isEmpty()) {
/** Execute the migration plans one by one */
for (RegionPlan plan: plans) {
LOG.info("balance " + plan);
long balStartTime = System.currentTimeMillis();
//TODO: bulk assign
// Execute the HRegion move: roughly, the HRegion is first closed on the source HRegionServer and then opened anew on the destination HRegionServer
this.assignmentManager.balance(plan);
totalRegPlanExecTime += System.currentTimeMillis()-balStartTime;
rpCount++;
if (rpCount < plans.size() &&
// if performing next balance exceeds cutoff time, exit the loop
(System.currentTimeMillis() + (totalRegPlanExecTime / rpCount)) > cutoffTime) {
//TODO: After balance, there should not be a cutoff time (keeping it as a security net for now)
LOG.debug("No more balancing till next balance run; maximumBalanceTime=" +
maximumBalanceTime);
break;
}
}
}
if (this.cpHost != null) {
try {
this.cpHost.postBalance(rpCount < plans.size() ? plans.subList(0, rpCount) : plans);
} catch (IOException ioe) {
// balancing already succeeded so don't change the result
LOG.error("Error invoking master coprocessor postBalance()", ioe);
}
}
}
// If LoadBalancer did not generate any plans, it means the cluster is already balanced.
// Return true indicating a success.
return true;
}
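Note the cutoff check inside the plan-execution loop: after each executed plan, the average plan-execution time so far is used to estimate whether one more move would overshoot cutoffTime. A small worked sketch of just that arithmetic (the numbers are invented for illustration):
public class CutoffCheckExample {
  public static void main(String[] args) {
    long maximumBalanceTime = 5000;    // suppose the balance budget is 5s
    long cutoffTime = System.currentTimeMillis() + maximumBalanceTime;
    long totalRegPlanExecTime = 24000; // 3 plans took 24s in total so far
    int rpCount = 3;                   // so the average plan takes 8s
    // Same estimate as HMaster.balance(): now + average plan time > cutoff?
    boolean stop = (System.currentTimeMillis()
        + (totalRegPlanExecTime / rpCount)) > cutoffTime;
    System.out.println("skip remaining plans this round: " + stop); // true
  }
}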
The balanceCluster call above ends up in the method below (StochasticLoadBalancer):
@Override
public synchronized List<RegionPlan> balanceCluster(Map<ServerName, List<HRegionInfo>> clusterState) {
List<RegionPlan> plans = balanceMasterRegions(clusterState);
if (plans != null || clusterState == null || clusterState.size() <= 1) {
return plans;
}
if (masterServerName != null && clusterState.containsKey(masterServerName)) {
if (clusterState.size() <= 2) {
return null;
}
clusterState = new HashMap<ServerName, List<HRegionInfo>>(clusterState);
clusterState.remove(masterServerName);
}
// On clusters with lots of HFileLinks or lots of reference files,
// instantiating the storefile infos can be quite expensive.
// Allow turning this feature off if the locality cost is not going to
// be used in any computations.
RegionLocationFinder finder = null;
if (this.localityCost != null && this.localityCost.getMultiplier() > 0) {
finder = this.regionFinder;
}
//The clusterState that is given to this method contains the state
//of all the regions in the table(s) (that's true today)
// Keep track of servers to iterate through them.
Cluster cluster = new Cluster(clusterState, loads, finder, rackManager);
/** Decide whether the cluster needs balancing at all */
if (!needsBalance(cluster)) {
return null;
}
/** The code below does the real work of finding an HRegion migration plan; the rough idea is to score migration cost along several dimensions */
long startTime = EnvironmentEdgeManager.currentTime();
initCosts(cluster);
/** Compute the cost of the current cluster layout */
double currentCost = computeCost(cluster, Double.MAX_VALUE);
double initCost = currentCost;
double newCost = currentCost;
/** Upper bound on the number of steps used to search for a better layout */
long computedMaxSteps = Math.min(this.maxSteps,
((long)cluster.numRegions * (long)this.stepsPerRegion * (long)cluster.numServers));
// Perform a stochastic walk to see if we can get a good fit.
long step;
/** Note that every iteration picks a migration strategy at random and then evaluates the cost of the proposed change */
for (step = 0; step < computedMaxSteps; step++) {
/** Randomly draw a 'selector' index, i.e. randomly choose among the migration strategies */
int generatorIdx = RANDOM.nextInt(candidateGenerators.length);
/** Pick a migration strategy from the candidateGenerators array; this HBase version ships four by default, which we will analyze separately later */
CandidateGenerator p = candidateGenerators[generatorIdx];
/** Call the chosen strategy's generate(). Before the real change is applied, it decides what to do between two HRegionServers: move an HRegion from server A to server B, from B to A, or swap HRegions between the two. In short, it tells HBase which operation to attempt: a move or a swap */
/** Jump ahead to RandomCandidateGenerator below; we use the random strategy as the worked example */
Cluster.Action action = p.generate(cluster);
if (action.type == Type.NULL) {
continue;
}
/** Apply the chosen action to the in-memory cluster model (the real HRegion moves happen later, once the RegionPlans are executed) */
cluster.doAction(action);
/** After the action is applied, fold its cost impact back into the cluster bookkeeping */
updateCostsWithAction(cluster, action);
// After the (simulated) move or swap, compute the new cost to see whether to keep it
newCost = computeCost(cluster, currentCost);
// Should this be kept?
/** If the new cost is lower than the cost before the move, the change is an improvement, so keep it */
if (newCost < currentCost) {
currentCost = newCost;
} else {
// The move did not help: put things back the way they were before.
// TODO: undo by remembering old values
Action undoAction = action.undoAction();
cluster.doAction(undoAction);
updateCostsWithAction(cluster, undoAction);
}
if (EnvironmentEdgeManager.currentTime() - startTime >
maxRunningTime) {
break;
}
}
long endTime = EnvironmentEdgeManager.currentTime();
metricsBalancer.balanceCluster(endTime - startTime);
/** The final cost is lower than the initial one, so the walk found a better layout */
if (initCost > currentCost) {
/** Build the list of migration plans (RegionPlans) from the model */
plans = createRegionPlans(cluster);
if (LOG.isDebugEnabled()) {
LOG.debug("Finished computing new load balance plan. Computation took "
+ (endTime - startTime) + "ms to try " + step
+ " different iterations. Found a solution that moves "
+ plans.size() + " regions; Going from a computed cost of "
+ initCost + " to a new cost of " + currentCost);
}
return plans;
}
if (LOG.isDebugEnabled()) {
LOG.debug("Could not find a better load balance plan. Tried "
+ step + " different configurations in " + (endTime - startTime)
+ "ms, and did not find anything with a computed cost less than " + initCost);
}
return null;
}
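The loop above is a randomized greedy descent: propose a random change, keep it if the total cost drops, roll it back otherwise. Stripped of the HBase types, the shape of the algorithm looks like this (a schematic sketch of the technique, not the balancer's code; State, Action, and the abstract methods are placeholders):
// Schematic form of the stochastic walk in balanceCluster(): propose a
// random change, keep it if the cost drops, roll it back otherwise.
abstract class StochasticWalk<State, Action> {
  abstract Action propose(State s);       // plays the role of CandidateGenerator.generate
  abstract void apply(State s, Action a); // plays the role of cluster.doAction
  abstract Action undoOf(Action a);       // plays the role of action.undoAction()
  abstract double cost(State s);          // plays the role of computeCost

  double run(State s, long maxSteps, long maxRunningMs) {
    long start = System.currentTimeMillis();
    double current = cost(s);
    for (long step = 0; step < maxSteps; step++) {
      Action a = propose(s);
      apply(s, a);
      double next = cost(s);
      if (next < current) {
        current = next;          // improvement: keep the move
      } else {
        apply(s, undoOf(a));     // no improvement: roll back
      }
      if (System.currentTimeMillis() - start > maxRunningMs) {
        break;                   // same time guard as maxRunningTime
      }
    }
    return current;
  }
}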
needsBalance: decides whether load migration is needed at all
protected boolean needsBalance(Cluster c) {
ClusterLoadState cs = new ClusterLoadState(c.clusterState);
// If fewer than 2 (MIN_SERVER_BALANCE) HRegionServers are alive, skip HRegion migration (with fewer than two servers there is nowhere to migrate to)
if (cs.getNumServers() < MIN_SERVER_BALANCE) {
if (LOG.isDebugEnabled()) {
LOG.debug("Not running balancer because only " + cs.getNumServers()
+ " active regionserver(s)");
}
return false;
}
if(areSomeRegionReplicasColocated(c)) return true;
// Check if we even need to do any load balancing
// HBASE-3681 check sloppiness first
/** Average number of HRegions per HRegionServer in the current cluster */
float average = cs.getLoadAverage(); // for logging
/** The lowest per-server HRegion count the cluster will tolerate */
int floor = (int) Math.floor(average * (1 - slop));
/** The highest per-server HRegion count the cluster will tolerate */
int ceiling = (int) Math.ceil(average * (1 + slop));
/** If the server holding the most HRegions exceeds the ceiling, or the server holding the fewest falls below the floor, the HRegion distribution is uneven and migration is needed */
if (!(cs.getMaxLoad() > ceiling || cs.getMinLoad() < floor)) {
NavigableMap<ServerAndLoad, List<HRegionInfo>> serversByLoad = cs.getServersByLoad();
if (LOG.isTraceEnabled()) {
// If nothing to balance, then don't say anything unless trace-level logging.
LOG.trace("Skipping load balancing because balanced cluster; " +
"servers=" + cs.getNumServers() +
" regions=" + cs.getNumRegions() + " average=" + average +
" mostloaded=" + serversByLoad.lastKey().getLoad() +
" leastloaded=" + serversByLoad.firstKey().getLoad());
}
return false;
}
return true;
}
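The slop check is easiest to see with numbers. Assuming the hbase.regions.slop default of 0.2: with 100 regions on 10 servers the average is 10, so floor = floor(10 * 0.8) = 8 and ceiling = ceil(10 * 1.2) = 12, and the balancer only runs if some server holds more than 12 regions or fewer than 8. A minimal sketch of that computation (the load numbers are hypothetical):
public class SlopExample {
  public static void main(String[] args) {
    float average = 10f;  // regions per server
    float slop = 0.2f;    // hbase.regions.slop default
    int floor = (int) Math.floor(average * (1 - slop));  // 8
    int ceiling = (int) Math.ceil(average * (1 + slop)); // 12
    int maxLoad = 13, minLoad = 9; // hypothetical most/least loaded servers
    boolean needsBalance = maxLoad > ceiling || minLoad < floor;
    System.out.println("needsBalance = " + needsBalance); // true, since 13 > 12
  }
}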
computeCost: computing the cost
protected double computeCost(Cluster cluster, double previousCost) {
double total = 0;
/** costFunctions holds the cost functions prepared earlier (see initCosts) */
for (CostFunction c:costFunctions) {
if (c.getMultiplier() <= 0) {
continue;
}
/** weight (multiplier) * the cost score of this dimension */
total += c.getMultiplier() * c.cost();
if (total > previousCost) {
return total;
}
}
return total;
}
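Each cost function contributes multiplier * cost() to the total, and because the functions are summed incrementally, a candidate layout that is already worse than previousCost can be rejected without evaluating the remaining dimensions. A hedged sketch of this weighted-sum-with-early-exit pattern (SimpleCostFunction and WeightedCost are illustrative stand-ins, not HBase classes):
// Illustrative stand-in for StochasticLoadBalancer's cost functions
// (region count, move cost, locality, table skew, ...).
interface SimpleCostFunction {
  float getMultiplier(); // the weight of this dimension
  double cost();         // a score, normalized so dimensions are comparable
}

class WeightedCost {
  static double compute(SimpleCostFunction[] fns, double previousCost) {
    double total = 0;
    for (SimpleCostFunction c : fns) {
      if (c.getMultiplier() <= 0) continue;   // dimension disabled
      total += c.getMultiplier() * c.cost();
      if (total > previousCost) return total; // early exit: already worse
    }
    return total;
  }
}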
This article uses the random strategy as the example:
static class RandomCandidateGenerator extends CandidateGenerator {
@Override
Cluster.Action generate(Cluster cluster) {
/** Randomly pick one HRegionServer in the cluster */
int thisServer = pickRandomServer(cluster);
// Pick the other server
/** Randomly pick another HRegionServer, different from the first */
int otherServer = pickOtherRandomServer(cluster, thisServer);
return pickRandomRegions(cluster, thisServer, otherServer);
}
}
The pickRandomRegions method:
protected Cluster.Action pickRandomRegions(Cluster cluster,
int thisServer,
int otherServer) {
if (thisServer < 0 || otherServer < 0) {
return Cluster.NullAction;
}
// Decide who is most likely to need another region
/** Number of HRegions on the first chosen server */
int thisRegionCount = cluster.getNumRegions(thisServer);
/** Number of HRegions on the second chosen server */
int otherRegionCount = cluster.getNumRegions(otherServer);
// Assign the chances based on the counts above: this biases which operation (move A->B, move B->A, or a swap) is more likely to be chosen
double thisChance = (thisRegionCount > otherRegionCount) ? 0 : 0.5;
double otherChance = (thisRegionCount <= otherRegionCount) ? 0 : 0.5;
/** Randomly pick the index of some HRegion on the first server (or -1 for none) */
int thisRegion = pickRandomRegion(cluster, thisServer, thisChance);
/** Randomly pick the index of some HRegion on the second server (or -1 for none) */
int otherRegion = pickRandomRegion(cluster, otherServer, otherChance);
return getAction(thisServer, thisRegion, otherServer, otherRegion);
}
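The two chance values bias the picks so that HRegions tend to flow from the busier server to the quieter one. A small worked sketch of the assignment (the counts are invented; ChanceExample is illustrative):
public class ChanceExample {
  public static void main(String[] args) {
    int thisRegionCount = 30, otherRegionCount = 10;
    // Same assignments as pickRandomRegions():
    double thisChance = (thisRegionCount > otherRegionCount) ? 0 : 0.5;
    double otherChance = (thisRegionCount <= otherRegionCount) ? 0 : 0.5;
    // thisChance = 0: always contribute an HRegion from the heavy server.
    // otherChance = 0.5: half the time contribute none from the light server,
    // which turns the candidate action from a swap into a one-way move.
    System.out.println("thisChance=" + thisChance + ", otherChance=" + otherChance);
  }
}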
The getAction method:
protected Cluster.Action getAction (int fromServer, int fromRegion,
int toServer, int toRegion) {
if (fromServer < 0 || toServer < 0) {
return Cluster.NullAction;
}
/** Both servers contributed an HRegion: swap the chosen HRegions of server A and server B */
if (fromRegion > 0 && toRegion > 0) {
return new Cluster.SwapRegionsAction(fromServer, fromRegion,
toServer, toRegion);
/** Only server A contributed an HRegion: move it from A to B */
} else if (fromRegion > 0) {
return new Cluster.MoveRegionAction(fromRegion, fromServer, toServer);
/** Only server B contributed an HRegion: move it from B to A */
} else if (toRegion > 0) {
return new Cluster.MoveRegionAction(toRegion, toServer, fromServer);
} else {
return Cluster.NullAction;
}
}
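Putting the two picks together (recall that pickRandomRegion returns -1 for "no region"), the four outcomes are easy to enumerate. A tiny sketch mirroring the branch structure (the returned strings stand in for the Cluster.Action subclasses):
public class GetActionOutcomes {
  // Mirrors the branches of getAction(); -1 means "no region was picked".
  static String outcome(int fromRegion, int toRegion) {
    if (fromRegion > 0 && toRegion > 0) return "SwapRegionsAction";
    if (fromRegion > 0) return "MoveRegionAction from->to";
    if (toRegion > 0) return "MoveRegionAction to->from";
    return "NullAction";
  }

  public static void main(String[] args) {
    System.out.println(outcome(5, 7));   // both picked: swap the two HRegions
    System.out.println(outcome(5, -1));  // one-way move from -> to
    System.out.println(outcome(-1, 7));  // one-way move to -> from
    System.out.println(outcome(-1, -1)); // nothing to do this step
  }
}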
The pickRandomRegion method:
protected int pickRandomRegion(Cluster cluster, int server, double chanceOfNoSwap) {
// Check to see if this is just a move.
if (cluster.regionsPerServer[server].length == 0 || RANDOM.nextFloat() < chanceOfNoSwap) {
// signal a move only.
return -1;
}
int rand = RANDOM.nextInt(cluster.regionsPerServer[server].length);
/** regionsPerServer is a two-dimensional array (int[][] regionsPerServer; // serverIndex -> region list): the first index is the HRegionServer, the second indexes the HRegions on that server */
return cluster.regionsPerServer[server][rand];
}
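To make the indexing concrete, here is a hedged sketch of the regionsPerServer layout for a toy cluster of three servers (the region indices are arbitrary):
import java.util.Arrays;

public class RegionsPerServerExample {
  public static void main(String[] args) {
    // serverIndex -> array of region indices hosted by that server,
    // mirroring Cluster's int[][] regionsPerServer bookkeeping.
    int[][] regionsPerServer = {
        {0, 3, 4},    // server 0 hosts regions 0, 3, 4
        {1, 5},       // server 1 hosts regions 1, 5
        {2, 6, 7, 8}  // server 2 hosts regions 2, 6, 7, 8
    };
    int server = 2;
    int rand = 1; // the real code uses RANDOM.nextInt(length)
    System.out.println("picked region index: " + regionsPerServer[server][rand]); // 6
    System.out.println(Arrays.toString(regionsPerServer[server]));
  }
}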
The doAction method applies the chosen swap or move to the balancer's in-memory cluster model (the actual migration is carried out later, when the generated RegionPlans are executed):
public void doAction(Action action) {
switch (action.type) {
case NULL: break;
case ASSIGN_REGION:
// FindBugs: Having the assert quietens FB BC_UNCONFIRMED_CAST warnings
assert action instanceof AssignRegionAction: action.getClass();
AssignRegionAction ar = (AssignRegionAction) action;
regionsPerServer[ar.server] = addRegion(regionsPerServer[ar.server], ar.region);
regionMoved(ar.region, -1, ar.server);
break;
case MOVE_REGION:
assert action instanceof MoveRegionAction: action.getClass();
MoveRegionAction mra = (MoveRegionAction) action;
regionsPerServer[mra.fromServer] = removeRegion(regionsPerServer[mra.fromServer], mra.region);
regionsPerServer[mra.toServer] = addRegion(regionsPerServer[mra.toServer], mra.region);
regionMoved(mra.region, mra.fromServer, mra.toServer);
break;
case SWAP_REGIONS:
assert action instanceof SwapRegionsAction: action.getClass();
SwapRegionsAction a = (SwapRegionsAction) action;
regionsPerServer[a.fromServer] = replaceRegion(regionsPerServer[a.fromServer], a.fromRegion, a.toRegion);
regionsPerServer[a.toServer] = replaceRegion(regionsPerServer[a.toServer], a.toRegion, a.fromRegion);
regionMoved(a.fromRegion, a.fromServer, a.toServer);
regionMoved(a.toRegion, a.toServer, a.fromServer);
break;
default:
throw new RuntimeException("Unknown action:" + action.type);
}
}
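doAction mutates the model by rebuilding the per-server index arrays. The addRegion/removeRegion/replaceRegion helpers are simple array-copy utilities; a minimal sketch of what add and remove amount to (my own reimplementation for illustration, not the HBase code; removeRegion assumes the region is present):
import java.util.Arrays;

public class RegionArrayOps {
  // Append a region index to a server's region array (a new array each time).
  static int[] addRegion(int[] regions, int region) {
    int[] out = Arrays.copyOf(regions, regions.length + 1);
    out[out.length - 1] = region;
    return out;
  }

  // Drop one region index from a server's region array (assumes it is present).
  static int[] removeRegion(int[] regions, int region) {
    int[] out = new int[regions.length - 1];
    for (int i = 0, j = 0; i < regions.length; i++) {
      if (regions[i] != region && j < out.length) out[j++] = regions[i];
    }
    return out;
  }

  public static void main(String[] args) {
    int[] server = {0, 3, 4};
    server = removeRegion(server, 3); // region 3 moves away -> {0, 4}
    server = addRegion(server, 7);    // region 7 moves in   -> {0, 4, 7}
    System.out.println(Arrays.toString(server)); // [0, 4, 7]
  }
}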
When the HMaster initializes, it creates a background chore that triggers HRegion load migration (where "migration" covers not only moving an HRegion from server A to server B but also swapping HRegions between two servers). By default this chore runs the balance method every 5 minutes to decide whether a balance round is needed. If so, it first computes the cost of the current cluster layout; by default the cost spans several dimensions, including region count, region-move cost, data locality (HDFS underneath), and per-table load. Then, on each iteration of the stochastic walk, it randomly picks one of the candidate strategies (random selection, data locality, per-HRegionServer region count, and so on), applies the proposed action, and recomputes the cluster cost. If the post-move cost is lower than the pre-move cost, the move is kept and the new cost becomes the baseline for the next iteration; otherwise the move is rolled back. Finally a list of migration plans is produced and executed to actually migrate the HRegions: roughly, the HRegion on the source HRegionServer is marked as closing (recorded as a znode in ZooKeeper), the destination HRegionServer is sent an Open command to open the HRegion, and once that succeeds the closing-state znode is removed.
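For completeness: the same code path can also be driven by hand. The HBase shell's balancer command, or Admin#balancer() from client code, ends up in the HMaster.balance() method analyzed above, and Admin#setBalancerRunning(...) flips the switch that loadBalancerTracker checks. A short sketch against the HBase 1.x client API (connection settings are assumed to come from the hbase-site.xml on the classpath):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class ManualBalanceExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    try (Connection conn = ConnectionFactory.createConnection(conf);
         Admin admin = conn.getAdmin()) {
      // Turn the balancer switch on (this is what loadBalancerTracker observes)...
      admin.setBalancerRunning(true, true);
      // ...and ask the active master to run one balance round right now.
      boolean ran = admin.balancer();
      System.out.println("balance round triggered: " + ran);
    }
  }
}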