HBase1.2.3版本HRegion的负载均衡实现源码分析

一、HRegion的负载均衡

      负载均衡主要用来解决热点问题,使请求更均匀地分发到不同的服务器上。HBase是一个典型的主从架构:HMaster负责整个集群的资源调度和任务分配,而数据的IO操作发生在HRegionServer上。每个HRegionServer上会有N个HRegion,每个HRegion保存某张表的一部分数据,因此HRegion在各个HRegionServer之间的分布是否均衡,直接影响整个HBase集群的读写性能。

二、负载均衡实现

    HRegion的负载调度由HMaster负责,所以负载的实现类在HMaster类里面。先看HMaster里面用到的涉及负载的几个变量

 /** Chore (background task) that periodically triggers the balancer. */
private BalancerChore balancerChore;
// Tracker for load balancer state
LoadBalancerTracker loadBalancerTracker;
// The load balancer implementation used to compute region plans
LoadBalancer balancer;

     用来实现HRegion在各个HRegionServer之间迁移的相关操作类,在HMaster的构造函数中进行了相关的初始化工作。

 /**
  * Builds the master: adjusts the configuration, reads the master-side check flags, optionally
  * schedules the cluster status publisher, and (unless running without a cluster) starts the
  * active master manager.
  *
  * @param conf configuration the master reads its settings from
  * @param csm coordinated state manager, passed straight to the superclass constructor
  */
 public HMaster(final Configuration conf, CoordinatedStateManager csm)
      throws IOException, KeeperException, InterruptedException {
    super(conf, csm);
    this.rsFatals = new MemoryBoundedLogMessageBuffer(
      conf.getLong("hbase.master.buffer.for.rs.fatals", 1*1024*1024));

    LOG.info("hbase.rootdir=" + FSUtils.getRootDir(this.conf) +
      ", hbase.cluster.distributed=" + this.conf.getBoolean(HConstants.CLUSTER_DISTRIBUTED, false));
    // Disable usage of meta replicas in the master
    this.conf.setBoolean(HConstants.USE_META_REPLICAS, false);
    Replication.decorateMasterConfiguration(this.conf);
    // Hack! Maps DFSClient => Master for logs.  HDFS made this
    // config param for task trackers, but we can piggyback off of it.
    if (this.conf.get("mapreduce.task.attempt.id") == null) {
      this.conf.set("mapreduce.task.attempt.id", "hb_m_" + this.serverName.toString());
    }
    // should we check the compression codec type at master side, default true, HBASE-6370
    this.masterCheckCompression = conf.getBoolean("hbase.master.check.compression", true);
    // should we check encryption settings at master side, default true
    this.masterCheckEncryption = conf.getBoolean("hbase.master.check.encryption", true);
    this.metricsMaster = new MetricsMaster(new MetricsMasterWrapperImpl(this));
    // preload table descriptor at startup
    this.preLoadTableDescriptors = conf.getBoolean("hbase.master.preload.tabledescriptors", true);
    // Do we publish the status?
    boolean shouldPublish = conf.getBoolean(HConstants.STATUS_PUBLISHED,
        HConstants.STATUS_PUBLISHED_DEFAULT);
    // NOTE(review): raw Class here; the type argument was presumably lost when the snippet
    // was copied -- confirm against the real source.
    Class publisherClass =
        conf.getClass(ClusterStatusPublisher.STATUS_PUBLISHER_CLASS,
            ClusterStatusPublisher.DEFAULT_STATUS_PUBLISHER_CLASS,
            ClusterStatusPublisher.Publisher.class);

    if (shouldPublish) {
      if (publisherClass == null) {
        LOG.warn(HConstants.STATUS_PUBLISHED + " is true, but " +
            ClusterStatusPublisher.DEFAULT_STATUS_PUBLISHER_CLASS +
            " is not set - not publishing status");
      } else {
        clusterStatusPublisherChore = new ClusterStatusPublisher(this, conf, publisherClass);
        getChoreService().scheduleChore(clusterStatusPublisherChore);
      }
    }
    // Some unit tests don't need a cluster, so no zookeeper at all
    if (!conf.getBoolean("hbase.testing.nocluster", false)) {
      activeMasterManager = new ActiveMasterManager(zooKeeper, this.serverName, this);
      int infoPort = putUpJettyServer();
      // Start the active master manager, handing it the Jetty info port.
      startActiveMasterManager(infoPort);
    } else {
      activeMasterManager = null;
    }
  }

    然后会调到finishActiveMasterInitialization,这里面最重要的是对后台负责触发负载均衡的线程的初始化(其它分支挺多,我们只看主干):

 // Create the chore that periodically asks this master to run the balancer.
 this.balancerChore = new BalancerChore(this);

    BalancerChore继承ScheduledChore,ScheduledChore类里面的run方法调用了chore(),然后会调到真正的balance方法,也就是说会周期性地去判断是否需要执行HRegion在HRegionServer间的负载迁移操作。

  

/**
 * Chore that asks the master to run the region balancer on a fixed period.
 */
public class BalancerChore extends ScheduledChore {
  private static final Log LOG = LogFactory.getLog(BalancerChore.class);

  /** Configuration key holding the interval between balancer runs. */
  private static final String PERIOD_KEY = "hbase.balancer.period";

  /** Interval used when the key is not configured: 300000 (five minutes in milliseconds). */
  private static final int DEFAULT_PERIOD = 300000;

  private final HMaster master;

  /**
   * @param master the master whose {@code balance()} is invoked on every run; it also supplies
   *        the chore name prefix and the configured period
   */
  public BalancerChore(HMaster master) {
    super(choreName(master), master, configuredPeriod(master));
    this.master = master;
  }

  /** Builds the chore name from the master's server name. */
  private static String choreName(HMaster master) {
    return master.getServerName() + "-BalancerChore";
  }

  /** Reads the balancer period from the master's configuration. */
  private static int configuredPeriod(HMaster master) {
    return master.getConfiguration().getInt(PERIOD_KEY, DEFAULT_PERIOD);
  }

  /** Triggers one balancer pass; an I/O failure is logged and does not stop the chore. */
  @Override
  protected void chore() {
    try {
      master.balance();
    } catch (IOException e) {
      LOG.error("Failed to balance.", e);
    }
  }
}

      HMaster类的balance方法


public boolean balance() throws IOException {
    // if master not initialized, don't run balancer.
    /** HMaster如果还没有初始化,不执行负载均衡*/
    if (!isInitialized()) {
      LOG.debug("Master has not been initialized, don't run balancer.");
      return false;
    }
    // Do this call outside of synchronized block.
    int maximumBalanceTime = getBalancerCutoffTime();
    synchronized (this.balancer) {
      // If balance not true, don't run balancer.
      /** 如果当前已经在执行HRegion的负载,这个地方主要是考虑到5分钟调一次balance,有可能上一次balance还没执行完,需要需等上一次执行完后再判断是否需要执行负载(也就是上次执行HRegion迁移的时候超过了这个定时周期)*/
      if (!this.loadBalancerTracker.isBalancerOn()) return false;
      // Only allow one balance run at at time.
      /**如果当前有HRegion处于spilt状态也不进行负载(处于spilt的HRegion会在zk上有节点标识) */
      if (this.assignmentManager.getRegionStates().isRegionsInTransition()) {
        Map regionsInTransition =
          this.assignmentManager.getRegionStates().getRegionsInTransition();
        LOG.debug("Not running balancer because " + regionsInTransition.size() +
          " region(s) in transition: " + org.apache.commons.lang.StringUtils.
            abbreviate(regionsInTransition.toString(), 256));
        return false;
      } 
      /**如果当前集群有HRegionServer挂了也不执行负载 */
      if (this.serverManager.areDeadServersInProgress()) {
        LOG.debug("Not running balancer because processing dead regionserver(s): " +
          this.serverManager.getDeadServers());
        return false;
      }
      if (this.cpHost != null) {
        try {
          if (this.cpHost.preBalance()) {
            LOG.debug("Coprocessor bypassing balancer request");
            return false;
          }
        } catch (IOException ioe) {
          LOG.error("Error invoking master coprocessor preBalance()", ioe);
          return false;
        }
      }
      /** 找到表名--服务名--HRegion列表的映射关系(这里也可以看到,它是以表为单位的)*/
      Map>> assignmentsByTable =
        this.assignmentManager.getRegionStates().getAssignmentsByTable();
      List plans = new ArrayList();
      //Give the balancer the current cluster state.
     
      this.balancer.setClusterStatus(getClusterStatus());
      for (Map> assignments : assignmentsByTable.values()) {
        /**循环的value为每个HRegionServer和该HRegionServer上所有的HRegion的映射关系,这一步才是去真正判断是否需要执行HRegion的负载 */
        List partialPlans = this.balancer.balanceCluster(assignments);
        if (partialPlans != null) plans.addAll(partialPlans);
      }
      long cutoffTime = System.currentTimeMillis() + maximumBalanceTime;
      int rpCount = 0;  // number of RegionPlans balanced so far
      long totalRegPlanExecTime = 0;
      if (plans != null && !plans.isEmpty()) {
       /** 循环执行迁移计划*/
        for (RegionPlan plan: plans) {
          LOG.info("balance " + plan);
          long balStartTime = System.currentTimeMillis();
          //TODO: bulk assign
         //执行HRegion的迁移,大概过程是首先先卸载原HRegionServer上的这个HRegion,然后目的HRegionServer上打开一个新的HRegion
          this.assignmentManager.balance(plan);
          totalRegPlanExecTime += System.currentTimeMillis()-balStartTime;
          rpCount++;
          if (rpCount < plans.size() &&
              // if performing next balance exceeds cutoff time, exit the loop
              (System.currentTimeMillis() + (totalRegPlanExecTime / rpCount)) > cutoffTime) {
            //TODO: After balance, there should not be a cutoff time (keeping it as a security net for now)
            LOG.debug("No more balancing till next balance run; maximumBalanceTime=" +
              maximumBalanceTime);
            break;
          }
        }
      }
      if (this.cpHost != null) {
        try {
          this.cpHost.postBalance(rpCount < plans.size() ? plans.subList(0, rpCount) : plans);
        } catch (IOException ioe) {
          // balancing already succeeded so don't change the result
          LOG.error("Error invoking master coprocessor postBalance()", ioe);
        }
      }
    }
    // If LoadBalancer did not generate any plans, it means the cluster is already balanced.
    // Return true indicating a success.
    return true;
  }

     上面的balanceCluster会调到负载均衡器(默认是StochasticLoadBalancer)里的下面这些方法:


 @Override
  public synchronized List balanceCluster(Map> clusterState) {
    List plans = balanceMasterRegions(clusterState);
    if (plans != null || clusterState == null || clusterState.size() <= 1) {
      return plans;
    }
    if (masterServerName != null && clusterState.containsKey(masterServerName)) {
      if (clusterState.size() <= 2) {
        return null;
      }
      clusterState = new HashMap>(clusterState);
      clusterState.remove(masterServerName);
    }

    // On clusters with lots of HFileLinks or lots of reference files,
    // instantiating the storefile infos can be quite expensive.
    // Allow turning this feature off if the locality cost is not going to
    // be used in any computations.
    RegionLocationFinder finder = null;
    if (this.localityCost != null && this.localityCost.getMultiplier() > 0) {
      finder = this.regionFinder;
    }

    //The clusterState that is given to this method contains the state
    //of all the regions in the table(s) (that's true today)
    // Keep track of servers to iterate through them.
  Cluster cluster = new Cluster(clusterState, loads, finder, rackManager);
  /**判断当前集群是否需要进行负载 */
    if (!needsBalance(cluster)) {
      return null;
    }

  /**下面的处理是真正执行HRegion的负载迁移,大概思路就是根据不同的维度来计算迁移开销 */
    long startTime = EnvironmentEdgeManager.currentTime();
      initCosts(cluster);
    /**计算当前集群的开销*/
    double currentCost = computeCost(cluster, Double.MAX_VALUE);
    double initCost = currentCost;
    double newCost = currentCost;
   /** 完成HRegion迁移的最大的步骤数*/
    long computedMaxSteps = Math.min(this.maxSteps,
        ((long)cluster.numRegions * (long)this.stepsPerRegion * (long)cluster.numServers));
    // Perform a stochastic walk to see if we can get a good fit.
    long step;
/** 可以看到,每次循环,选择的迁移策略都不一样,都是随机的然后再计算迁移成本*/
for (step = 0; step < computedMaxSteps; step++) {
  /** 随机选择一个’选号器’,类似于不同的迁移策略随机使用*/
      int generatorIdx = RANDOM.nextInt(candidateGenerators.length);
/**从candidateGenerators数组里面选择一个迁移策略,HBase在这个版本默认有好几种,后面我们单独分析
      有4种

HBase1.2.3版本HRegion的负载均衡实现源码分析_第1张图片

  

  CandidateGenerator p = candidateGenerators[generatorIdx];
/**执行选择出来的迁移策略的generate,在执行正在的迁移之前,做一些判断操作,比如2个HRegionServer之间的HRegion是将A-Server里的HRegion迁移到B-Server还是B-A,还是二者交互HRegion等操作,说白了就是告诉HBase我要执行什么操作,是HRegion的迁移?交换? */
/**移步到下面的RandomCandidateGenerator,我们以随机策略为例子进行讲解 */
      Cluster.Action action = p.generate(cluster); 
      if (action.type == Type.NULL) {
        continue;
      }
      /**执行真正的HRegion的负载操作 */
      cluster.doAction(action);
       /**待HRegion负载后,将当前操作所产生的开销更新到集群 */
      updateCostsWithAction(cluster, action);
//移动或者交换完之后,看看新的开销是否要继续
      newCost = computeCost(cluster, currentCost);

      // Should this be kept?
/**如果新的开销 < 移动之前的开销,说明还不错,可以这样执行HRegion的迁移  */
      if (newCost < currentCost) {
        currentCost = newCost;
     /** 回退刚刚的移动操作*/
      } else {
        // Put things back the way they were before.
        // TODO: undo by remembering old values
        Action undoAction = action.undoAction();
        cluster.doAction(undoAction);
        updateCostsWithAction(cluster, undoAction);
      }

      if (EnvironmentEdgeManager.currentTime() - startTime >
          maxRunningTime) {
        break;
      }
    }
    long endTime = EnvironmentEdgeManager.currentTime();

    metricsBalancer.balanceCluster(endTime - startTime);
   /** 迁移后开销比一开始的小*/
if (initCost > currentCost) {
  /** 构建一系列的迁移计划*/
      plans = createRegionPlans(cluster);
      if (LOG.isDebugEnabled()) {
        LOG.debug("Finished computing new load balance plan.  Computation took "
            + (endTime - startTime) + "ms to try " + step
            + " different iterations.  Found a solution that moves "
            + plans.size() + " regions; Going from a computed cost of "
            + initCost + " to a new cost of " + currentCost);
      }
      return plans;
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug("Could not find a better load balance plan.  Tried "
          + step + " different configurations in " + (endTime - startTime)
          + "ms, and did not find anything with a computed cost less than " + initCost);
    }
    return null;
  }

      needsBalance:判断是否需要进行负载迁移

   

/**
 * Decides whether the given cluster state is worth balancing.
 *
 * @param c cluster model whose {@code clusterState} is inspected
 * @return {@code false} when there are fewer than {@code MIN_SERVER_BALANCE} servers, or when
 *         every server's load lies within the slop band around the average; {@code true} when
 *         some region replicas are colocated or a server falls outside that band
 */
protected boolean needsBalance(Cluster c) {
    ClusterLoadState cs = new ClusterLoadState(c.clusterState);
    // Too few servers to move regions between.
    if (cs.getNumServers() < MIN_SERVER_BALANCE) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Not running balancer because only " + cs.getNumServers()
            + " active regionserver(s)");
      }
      return false;
    }
    if (areSomeRegionReplicasColocated(c)) {
      return true;
    }
    // Check if we even need to do any load balancing
    // HBASE-3681 check sloppiness first
    float average = cs.getLoadAverage(); // for logging
    // Lowest and highest per-server load still considered balanced.
    int floor = (int) Math.floor(average * (1 - slop));
    int ceiling = (int) Math.ceil(average * (1 + slop));
    // Balanced when neither the most loaded nor the least loaded server leaves the band.
    if (!(cs.getMaxLoad() > ceiling || cs.getMinLoad() < floor)) {
      if (LOG.isTraceEnabled()) {
        // If nothing to balance, then don't say anything unless trace-level logging.
        // The sorted view is only needed for this message, so fetch it here.
        NavigableMap<ServerAndLoad, List<HRegionInfo>> serversByLoad = cs.getServersByLoad();
        LOG.trace("Skipping load balancing because balanced cluster; " +
          "servers=" + cs.getNumServers() +
          " regions=" + cs.getNumRegions() + " average=" + average +
          " mostloaded=" + serversByLoad.lastKey().getLoad() +
          " leastloaded=" + serversByLoad.firstKey().getLoad());
      }
      return false;
    }
    return true;
  }

      computeCost:计算开销

/**
 * Sums the weighted cost of every enabled cost function.
 *
 * @param cluster the cluster model being scored (not read directly in this method)
 * @param previousCost cost to beat; summation stops as soon as the running total exceeds it
 * @return the total weighted cost, or the partial total at the point it exceeded
 *         {@code previousCost}
 */
protected double computeCost(Cluster cluster, double previousCost) {
    double runningTotal = 0;
    // The article notes that costFunctions is populated by the cost initialization step.
    for (CostFunction function : costFunctions) {
      // Functions with a non-positive weight are switched off.
      if (function.getMultiplier() <= 0) {
        continue;
      }
      // weight * score for this dimension
      double weighted = function.getMultiplier() * function.cost();
      runningTotal += weighted;

      // Already worse than the cost to beat: no need to evaluate the remaining functions.
      if (runningTotal > previousCost) {
        return runningTotal;
      }
    }
    return runningTotal;
  }

      本文以随机策略举例


/**
 * Candidate generator that picks two servers at random and proposes a random move or swap
 * between them.
 */
static class RandomCandidateGenerator extends CandidateGenerator {

    @Override
    Cluster.Action generate(Cluster cluster) {
      // First server: any server of the cluster, chosen at random.
      final int firstServer = pickRandomServer(cluster);
      // Second server: chosen relative to the first (presumably always a different one).
      final int secondServer = pickOtherRandomServer(cluster, firstServer);

      return pickRandomRegions(cluster, firstServer, secondServer);
    }
  }

     pickRandomRegions方法:

   

/**
 * Picks at most one random region from each of the two servers and turns the pair into an action.
 *
 * @param cluster cluster model to pick regions from
 * @param thisServer index of the first server; negative yields {@code Cluster.NullAction}
 * @param otherServer index of the second server; negative yields {@code Cluster.NullAction}
 * @return the action built by {@code getAction} from the picked regions
 */
protected Cluster.Action pickRandomRegions(Cluster cluster,
                                                       int thisServer,
                                                       int otherServer) {
      if (thisServer < 0 || otherServer < 0) {
        return Cluster.NullAction;
      }
      // Decide who is most likely to need another region
      final int regionsOnThis = cluster.getNumRegions(thisServer);
      final int regionsOnOther = cluster.getNumRegions(otherServer);

      // Each chance is the probability that the server contributes NO region. The server holding
      // more regions gets 0, the other gets 0.5; on a tie the first server gets 0.5.
      final double thisChance;
      final double otherChance;
      if (regionsOnThis > regionsOnOther) {
        thisChance = 0;
        otherChance = 0.5;
      } else {
        thisChance = 0.5;
        otherChance = 0;
      }

      // A picked value of -1 means that server contributes no region.
      final int thisRegion = pickRandomRegion(cluster, thisServer, thisChance);
      final int otherRegion = pickRandomRegion(cluster, otherServer, otherChance);
      return getAction(thisServer, thisRegion, otherServer, otherRegion);
}

     getAction方法

 

/**
 * Turns a pair of picked regions into a balancer action.
 *
 * <p>Region arguments are region indexes, with a negative value meaning that no region was
 * picked on that server ({@code pickRandomRegion} returns -1 for that). Index 0 is a valid
 * region, so the checks use {@code >= 0}; with {@code > 0} that region could never be moved
 * or swapped.
 *
 * @param fromServer index of the first server; negative yields {@code Cluster.NullAction}
 * @param fromRegion region index picked on {@code fromServer}, or negative for none
 * @param toServer index of the second server; negative yields {@code Cluster.NullAction}
 * @param toRegion region index picked on {@code toServer}, or negative for none
 * @return a swap when both regions are present, a move when only one is, otherwise
 *         {@code Cluster.NullAction}
 */
protected Cluster.Action getAction (int fromServer, int fromRegion,
        int toServer, int toRegion) {
      if (fromServer < 0 || toServer < 0) {
        return Cluster.NullAction;
      }
      if (fromRegion >= 0 && toRegion >= 0) {
        // Both servers contributed a region: exchange them.
        return new Cluster.SwapRegionsAction(fromServer, fromRegion,
          toServer, toRegion);
      } else if (fromRegion >= 0) {
        // Only the first server contributed: move its region to the second server.
        return new Cluster.MoveRegionAction(fromRegion, fromServer, toServer);
      } else if (toRegion >= 0) {
        // Only the second server contributed: move its region to the first server.
        return new Cluster.MoveRegionAction(toRegion, toServer, fromServer);
      } else {
        return Cluster.NullAction;
      }
    }

     pickRandomRegion方法

    

/**
 * Picks a random region hosted by the given server.
 *
 * @param cluster cluster model; {@code regionsPerServer} maps a server index to the region
 *        indexes it hosts
 * @param server index of the server to pick from
 * @param chanceOfNoSwap probability of deliberately picking nothing
 * @return a region index taken from the server's list, or -1 when the server hosts no region
 *         or the random draw fell below {@code chanceOfNoSwap}
 */
protected int pickRandomRegion(Cluster cluster, int server, double chanceOfNoSwap) {
      final int[] hosted = cluster.regionsPerServer[server];
      // Nothing to pick from; no random number is drawn in this case.
      if (hosted.length == 0) {
        return -1;
      }
      // Check to see if this is just a move: signal "no region" with -1.
      if (RANDOM.nextFloat() < chanceOfNoSwap) {
        return -1;
      }
      final int position = RANDOM.nextInt(hosted.length);
      return hosted[position];

    }
     doAction方法:在内存中的集群模型(Cluster)上执行HRegion的分配、迁移或者交换操作,此时还没有真正移动HRegion

 /**
  * Applies an action to this cluster model by updating {@code regionsPerServer} and notifying
  * {@code regionMoved}.
  *
  * @param action the assign, move or swap to apply; a NULL action is a no-op
  * @throws RuntimeException when the action type is none of the handled ones
  */
 public void doAction(Action action) {
      switch (action.type) {
      case NULL:
        break;
      case ASSIGN_REGION: {
        // FindBugs: Having the assert quietens FB BC_UNCONFIRMED_CAST warnings
        assert action instanceof AssignRegionAction: action.getClass();
        AssignRegionAction assign = (AssignRegionAction) action;
        regionsPerServer[assign.server] =
            addRegion(regionsPerServer[assign.server], assign.region);
        // -1 as the old server: the region had no previous host in this action.
        regionMoved(assign.region, -1, assign.server);
        break;
      }
      case MOVE_REGION: {
        assert action instanceof MoveRegionAction: action.getClass();
        MoveRegionAction move = (MoveRegionAction) action;
        // Take the region off the source list first, then append it to the destination list.
        regionsPerServer[move.fromServer] =
            removeRegion(regionsPerServer[move.fromServer], move.region);
        regionsPerServer[move.toServer] =
            addRegion(regionsPerServer[move.toServer], move.region);
        regionMoved(move.region, move.fromServer, move.toServer);
        break;
      }
      case SWAP_REGIONS: {
        assert action instanceof SwapRegionsAction: action.getClass();
        SwapRegionsAction swap = (SwapRegionsAction) action;
        // Each server's list gets its own region replaced by the other server's region.
        regionsPerServer[swap.fromServer] =
            replaceRegion(regionsPerServer[swap.fromServer], swap.fromRegion, swap.toRegion);
        regionsPerServer[swap.toServer] =
            replaceRegion(regionsPerServer[swap.toServer], swap.toRegion, swap.fromRegion);
        regionMoved(swap.fromRegion, swap.fromServer, swap.toServer);
        regionMoved(swap.toRegion, swap.toServer, swap.fromServer);
        break;
      }
      default:
        throw new RuntimeException("Uknown action:" + action.type);
      }
    }

 三、总结

      HMaster在初始化的时候会创建一个用来触发HRegion执行负载迁移的工作线程(迁移的概念不仅仅是把HRegion从A-Server移到B-Server,也包含了两个Server之间HRegion的交换操作)。默认情况下,这个线程每隔5分钟执行一次balance方法,判断是否需要执行balance操作。如果需要,则首先计算当前cluster的cost花销,cost花销默认有几个维度,包括Region的数量、Region move的花销、数据的本地性(底层是HDFS)、表的负载等。然后在每次循环中,从不同的迁移策略(随机选择策略、数据本地性策略、按HRegionServer上HRegion数量选择的策略等)里随机选择一个,在内存中的集群模型上执行迁移操作后,再计算当前集群状态对应的cost:如果迁移后的cost < 迁移前的cost,说明迁移效果很好,保留当前的cost留做下次判断的依据;否则回退到迁移之前的状态。最后生成一个迁移计划列表,供执行真正的HRegion负载迁移使用。在迁移的时候,大概的过程是:首先将原HRegionServer上需要迁移的HRegion标记为closing(zk上记录节点),然后给目的HRegionServer发送Open指令打开这个HRegion,当操作成功后,删除zk上处于Closing状态的HRegion节点。

你可能感兴趣的:(hbase)