HBase RegionServer挂掉后的源码分析

首先,HMaster通过ZooKeeper(ZK)发现某个RegionServer(RS)挂掉了。HMaster使用成员变量 private ServerManager serverManager 来处理RS的信息,入口是expireServer方法:
  public synchronized void expireServer(final HServerInfo hsi) {
    // 首先从onlineServers中获取HServerInfo
    // <hostname> , <port> , <startcode>
    String serverName = hsi.getServerName();
    HServerInfo info = this.onlineServers .get(serverName);
    if (info == null) {
      LOG.warn( "Received expiration of " + hsi.getServerName() +
        " but server is not currently online" );
    if (this. deadservers.contains(serverName)) {
      // TODO : Can this happen?  It shouldn't be online in this case?
      LOG.warn( "Received expiration of " + hsi.getServerName() +
          " but server shutdown is already in progress" );
    // 已挂列表中add,在线列表中remove
    this. deadservers.add(serverName);
    this. onlineServers.remove(serverName);
    this. serverConnections.remove(serverName);
    //如果集群正在关闭,则不处理, 直接return了
    if (this. clusterShutdown) {
      LOG.info( "Cluster shutdown set; " + hsi.getServerName() +
        " expired; onlineServers=" + this .onlineServers .size());
      if (this. onlineServers.isEmpty()) {
        master.stop( "Cluster shutdown set; onlineServer=0" );

    CatalogTracker ct = this.master .getCatalogTracker();
    // Was this server carrying root?
    boolean carryingRoot;
    try {
      HServerAddress address = ct.getRootLocation();
      carryingRoot = address != null && hsi.getServerAddress().equals(address);
    } catch (InterruptedException e) {
      LOG.info( "Interrupted");
    HServerAddress address = ct.getMetaLocation();
    boolean carryingMeta = address != null && hsi.getServerAddress().equals(address);
    // MetaServerShutdownHandler是继承ServerShutdownHandler, 前者在恢复时需要assign root或者 assign meta
    if (carryingRoot || carryingMeta) {
      this. services. getExecutorService().submit(new MetaServerShutdownHandler(this .master ,
        this.services , this.deadservers, info, carryingRoot, carryingMeta));
    } else {
      this. services. getExecutorService().submit(new ServerShutdownHandler(this .master ,
        this.services , this.deadservers, info));
    LOG.debug( "Added=" + serverName +
      " to dead servers, submitted shutdown handler to be executed, root=" +
        carryingRoot + ", meta=" + carryingMeta);

接下来进入ServerShutdownHandler。它继承自EventHandler,最终会执行到process方法:
  /**
   * Processes the shutdown of a dead region server: splits its logs, handles
   * the ROOT/META cases, reads the user regions the server was carrying and
   * reassigns them.
   *
   * <p>NOTE(review): this is an abridged excerpt. Closing braces and some
   * statements are not shown, so it does not compile as-is.
   *
   * @throws IOException if interrupted while waiting for META
   */
  public void process() throws IOException {
    final String serverName = this.hsi .getServerName();

    LOG.info( "Splitting logs for " + serverName);
    try {
      // Split the dead server's logs; the split output follows each region.
      // Per the original note, log splitting differs a lot between versions
      // (0.92 introduced a distributed approach); see the excerpts further below.
      this. services.getMasterFileSystem().splitLog(serverName);
      // In the end this just returns the regions in transition; per the original
      // note it also touches regionPlans, deadRegions and the like, all related
      // to the hsi passed in.
      List<RegionState> regionsInTransition =
        this.services .getAssignmentManager().processServerShutdown(this. hsi);
      // If root needs recovering, assign root first; per the original note the
      // root location is checked before assigning.
      // NOTE(review): only the log call is shown here; the assignment call
      // itself is omitted from this excerpt.
      if (isCarryingRoot()) { // -ROOT-
        LOG.info("Server " + serverName + " was carrying ROOT. Trying to assign.");
      // If meta needs recovering, assign meta first (per the original note, to a
      // randomly chosen region server).
      if (isCarryingMeta()) {
        LOG.info("Server " + serverName + " was carrying META. Trying to assign.");
        this.services .getAssignmentManager().assignMeta();
      // Wait until META is available before continuing with the work below.
      NavigableMap<HRegionInfo, Result> hris = null;
      while (! this.server .isStopped()) {
        try {
          this.server .getCatalogTracker().waitForMeta();
          hris = MetaReader. getServerUserRegions(this. server.getCatalogTracker(),
              this.hsi );
          // NOTE(review): no loop exit is shown in this excerpt; presumably the
          // loop is left after a successful read — confirm against the source.
        } catch (InterruptedException e) {
          Thread. currentThread().interrupt();
          throw new IOException("Interrupted", e);
        } catch (IOException ioe) {
          LOG.info("Received exception accessing META during server shutdown of " +
              serverName + ", retrying META read", ioe);
      // Skip regions that were in transition unless CLOSING or PENDING_CLOSE
      for (RegionState rit : regionsInTransition) {
        if (!rit.isClosing() && !rit.isPendingClose()) {
          LOG.debug("Removed " + rit.getRegion().getRegionNameAsString() +
            " from list of regions to assign because in RIT" );
          // NOTE(review): the removal from hris that this message describes is
          // not shown in this excerpt.
      LOG.info( "Reassigning " + (hris == null? 0: hris.size()) +
        " region(s) that " + serverName +
        " was carrying (skipping " + regionsInTransition.size() +
        " regions(s) that are already in transition)" );
      // Start assigning regions. Per the original note, processDeadRegion first
      // checks whether the region's table is being disabled or the cluster is
      // shutting down, and the region state is then forced to offline; for the
      // final assign, see the analysis of that method further below.
      for (Map.Entry<HRegionInfo, Result> e: hris.entrySet()) {
        if (processDeadRegion(e.getKey(), e.getValue(),
            this.services .getAssignmentManager(),
            this.server .getCatalogTracker())) {
          this.services .getAssignmentManager().assign(e.getKey(), true);
    } finally {
      // Mark processing of this server as finished in the dead-servers list.
      this. deadServers.finish(serverName);
    LOG.info( "Finished processing of shutdown of " + serverName);

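split log的相关代码(依次为MasterFileSystem.splitLog(serverName)、HLogSplitter.splitLog()、HLogSplitter内部的splitLog(logfiles),最后是分配region时用到的assign方法):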
 public void splitLog(final String serverName ) {
    // 首先获取锁
    this. splitLogLock.lock();
    long splitTime = 0, splitLogSize = 0;
    // 在hdfs中获取该rs的WAL位置 一般在/hbase/.logs文件夹中
    Path logDir = new Path( this.rootdir , HLog.getHLogDirectoryName (serverName));
    try {
      HLogSplitter splitter = HLogSplitter. createLogSplitter(
        conf, rootdir, logDir, oldLogDir, this .fs );
      try {
        FSUtils. waitOnSafeMode(conf,
          conf.getInt(HConstants. THREAD_WAKE_FREQUENCY, 1000)); 
      } catch (OrphanHLogAfterSplitException e) {
        LOG.warn("Retrying splitting because of:" , e);
        splitter = HLogSplitter. createLogSplitter(conf, rootdir, logDir,  oldLogDir, this .fs );
      splitTime = splitter.getTime();
      splitLogSize = splitter.getSize();
    } catch (IOException e) {
      LOG.error( "Failed splitting " + logDir.toString(), e);
    } finally {
      this. splitLogLock.unlock();
    if (this. metrics != null) {
      this. metrics.addSplit(splitTime, splitLogSize);

 public List<Path> splitLog() throws IOException {
    Preconditions.checkState(!hasSplit ,
        "An HLogSplitter instance may only be used once" );
    hasSplit = true;

    long startTime = System. currentTimeMillis();
    List<Path> splits = null;
    if (! fs.exists( srcDir)) {
      // Nothing to do
      return splits;
    FileStatus[] logfiles = fs.listStatus( srcDir);

    if (logfiles == null || logfiles.length == 0) {
      // Nothing to do
      return splits;
    LOG.info( "Splitting " + logfiles.length + " hlog(s) in "
        + srcDir.toString());

    splits = splitLog(logfiles);
    splitTime = System. currentTimeMillis() - startTime;
    LOG.info( "hlog file splitting completed in " + splitTime + " ms for " + srcDir .toString());
    return splits;

 /**
  * Splits the given hlog files: starts the writer threads, parses each log
  * into {@code entryBuffers}, checks for orphan logs, archives the logs and
  * finally closes the output sink.
  *
  * <p>NOTE(review): this is an abridged excerpt. Closing braces and some
  * statements are not shown, so it does not compile as-is.
  *
  * @param logfiles the log files to split
  * @return whatever {@code outputSink.finishWritingAndClose()} returns
  * @throws IOException on a parse failure when error skipping is disabled, or
  *         when an orphan hlog is discovered after the split
  */
 private List<Path> splitLog(final FileStatus[] logfiles) throws IOException {
    List<Path> processedLogs = new ArrayList<Path>();
    List<Path> corruptedLogs = new ArrayList<Path>();
    List<Path> splits = null;
    // If set to false, an IOException hit while splitting (other than the cases
    // handled below) is rethrown and the whole split stops immediately.
    boolean skipErrors = conf.getBoolean( "hbase.hlog.split.skip.errors" , true);

    splitSize = 0;

    outputSink.startWriterThreads( entryBuffers);
    try {
      int i = 0;
      for (FileStatus log : logfiles) {
       Path logPath = log.getPath();
        long logLength = log.getLen();
        splitSize += logLength;
        LOG.debug("Splitting hlog " + (i++ + 1) + " of " + logfiles. length
            + ": " + logPath + ", length=" + logLength);
        // NOTE(review): nothing in this excerpt adds logPath to processedLogs or
        // corruptedLogs; that bookkeeping is presumably omitted — confirm.
        try {
          recoverFileLease(fs, logPath, conf);
          // Per the original note: there is also an Entry class wrapping the
          // WALEdit and HLogKey; entries eventually end up in a map structure.
          parseHLog(log, entryBuffers, fs , conf );
        } catch (EOFException eof) {
          // truncated files are expected if a RS crashes (see HBASE-2643)
          LOG.info("EOF from hlog " + logPath + ". Continuing");
        } catch (FileNotFoundException fnfe) {
          // A file may be missing if the region server was able to archive it
          // before shutting down. This means the edits were persisted already
          LOG.info("A log was missing " + logPath +
              ", probably because it was moved by the" +
              " now dead region server. Continuing" );
        } catch (IOException e) {
          // If the IOE resulted from bad file format,
          // then this problem is idempotent and retrying won't help
          if (e.getCause() instanceof ParseException) {
            LOG.warn("Parse exception from hlog " + logPath + ".  continuing", e);
          } else {
            if (skipErrors) {
              LOG.info("Got while parsing hlog " + logPath +
                ". Marking as corrupted", e);
            } else {
              throw e;
      // More files in srcDir than were handled means a log appeared after the
      // split started.
      if (fs.listStatus( srcDir). length > processedLogs.size()
          + corruptedLogs.size()) {
        throw new OrphanHLogAfterSplitException(
            "Discovered orphan hlog after split. Maybe the "
            + "HRegionServer was not dead when we started" );
      archiveLogs(srcDir , corruptedLogs, processedLogs, oldLogDir, fs, conf);     
    } finally {
      // Always finish and close the output sink, even on failure.
      splits = outputSink.finishWritingAndClose();
    return splits;

 /**
  * Tries, up to {@code maximumAssignmentAttempts} times, to assign the region
  * described by {@code state}: obtains a plan, marks the region PENDING_OPEN
  * and asks the destination server to open it. On failure the state goes back
  * to OFFLINE and a new plan is requested.
  *
  * <p>NOTE(review): this is an abridged excerpt. Closing braces, some call
  * heads and the end of the method are not shown, so it does not compile as-is.
  *
  * @param state region state to assign
  * @param setOfflineInZK when true, setOfflineInZooKeeper(state) is called
  *        first and the method returns if that fails
  * @param forceNewPlan passed through to getRegionPlan
  */
 private void assign(final RegionState state, final boolean setOfflineInZK, final boolean forceNewPlan) {
    for ( int i = 0; i < this.maximumAssignmentAttempts; i++) {
      if (setOfflineInZK && !setOfflineInZooKeeper(state)) return;
      if (this. master.isStopped()) {
        LOG.debug("Server stopped; skipping assign of " + state);
      // Get the RegionPlan. Per the original note this goes through
      // LoadBalancer.randomAssignment(servers), i.e. the region is assigned to a
      // randomly chosen server.
      RegionPlan plan = getRegionPlan(state, forceNewPlan);
      // If no plan could be obtained, give up.
      if (plan == null) {
        // NOTE(review): the head of this logging call is missing from the excerpt.
            "Unable to determine a plan to assign " + state);
        return; // Should get reassigned later when RIT times out.
      try {
        // NOTE(review): the head of this logging call is missing from the excerpt.
          "Assigning region " + state.getRegion().getRegionNameAsString() +
          " to " + plan.getDestination().getServerName());
        // Set the region state to PENDING_OPEN
        state.update(RegionState.State. PENDING_OPEN);
        // Send a request to the region server asking it to open the region
        serverManager.sendRegionOpen(plan.getDestination(), state.getRegion());
      } catch (Throwable t) {
        LOG.warn("Failed assignment of " +
          state.getRegion().getRegionNameAsString() + " to " +
          plan.getDestination() + ", trying to assign elsewhere instead; " +
          "retry=" + i, t);
        // Clean out plan we failed execute and one that doesn't look like it'll
        // succeed anyways; we need a new plan!
        // Transition back to OFFLINE
        state.update(RegionState.State. OFFLINE);
        // Force a new plan and reassign.  Will return null if no servers.
        if (getRegionPlan(state, plan.getDestination(), true) == null) {
          // NOTE(review): the excerpt is cut off in the middle of this statement.
          LOG.warn("Unable to find a viable location to assign region " +

最后,RegionServer会通过OpenRegionHandler来打开这个region,这是一个异步的过程。其中还涉及region在ZK中的状态变化,以及打开region之前的HLog恢复:恢复过程和正常向HBase写数据一样,都会先写到memstore中;最后还需要更新meta表。
  @QosPriority(priority= HIGH_QOS )
  public void openRegion(HRegionInfo region)
  throws IOException {
    if ( this. regionsInTransitionInRS .contains(region.getEncodedNameAsBytes())) {
      throw new RegionAlreadyInTransitionException("open" , region.getEncodedName());
    LOG.info( "Received request to open region: " +
    if ( this. stopped) throw new RegionServerStoppedException();
    this. regionsInTransitionInRS .add(region.getEncodedNameAsBytes());
    if (region.isRootRegion()) {
      this. service.submit( new OpenRootHandler( this, this, region));
    } else if (region.isMetaRegion()) {
      this. service.submit( new OpenMetaHandler( this, this, region));
    } else {
      this. service.submit( new OpenRegionHandler( this , this , region));
