jraft日志复制

  • jraft的日志复制是指从leader往follower复制logEntry的过程。
  • 日志复制从节点成为leader开始。在nodeImpl的becomeLeader中
/**
 * Turns this node from a candidate into the leader of its group.
 *
 * <p>The node must currently be in {@code STATE_CANDIDATE}. The method stops the vote timer,
 * records itself as leader, asks the replicator group to add a replicator for every other
 * peer in the current configuration, resets the ballot box's pending index, flushes the
 * configuration context and finally starts the step-down timer.
 *
 * @throws IllegalStateException if the configuration context is busy at this point
 */
private void becomeLeader() {
        final boolean isCandidate = this.state == State.STATE_CANDIDATE;
        Requires.requireTrue(isCandidate, "Illegal state: " + this.state);
        LOG.info("Node {} term {} become leader of group {} {}", this.getNodeId(), this.currTerm, this.conf.getConf(),
            this.conf.getOldConf());
        // The election is over for this node, so the candidate vote timer is cancelled.
        stopVoteTimer();
        this.state = State.STATE_LEADER;
        this.leaderId = this.serverId.copy();
        this.replicatorGroup.resetTerm(this.currTerm);
        // Add one replicator per remote peer; the local node itself is skipped.
        for (final PeerId peer : this.conf.listPeers()) {
            if (!peer.equals(this.serverId)) {
                LOG.debug("Node {} term {} add replicator {}", this.getNodeId(), this.currTerm, peer);
                final boolean added = this.replicatorGroup.addReplicator(peer);
                if (!added) {
                    LOG.error("Fail to add replicator for {}", peer);
                }
            }
        }
        // Init commit manager: pending indexes start right after the last entry in the local log.
        final long firstPendingIndex = this.logManager.getLastLogIndex() + 1;
        this.ballotBox.resetPendingIndex(firstPendingIndex);
        // Register the configuration context so that configuration changes are rejected
        // before the first log of this term is committed.
        if (this.confCtx.isBusy()) {
            throw new IllegalStateException();
        }
        this.confCtx.flush(this.conf.getConf(), this.conf.getOldConf());
        this.stepDownTimer.start();
    }
  • 在replicator的start方法中
/**
 * Creates a replicator for the peer described by {@code opts} and starts it.
 *
 * <p>After connecting to the peer, the replicator's metric set is registered (if a metric
 * registry is available and the name is not taken), the heartbeat timer is started and an
 * empty-entries probe request is sent.
 *
 * @param opts        replicator options; log manager, ballot box and node must be non-null
 * @param raftOptions raft options handed to the new replicator
 * @return the replicator's {@code id} field as it is after the probe was sent, or
 *         {@code null} if the connection to the peer could not be established
 * @throws IllegalArgumentException if {@code opts} lacks the log manager, ballot box or node
 */
public static ThreadId start(ReplicatorOptions opts, RaftOptions raftOptions) {
        final boolean optionsIncomplete = opts.getLogManager() == null || opts.getBallotBox() == null
                                          || opts.getNode() == null;
        if (optionsIncomplete) {
            throw new IllegalArgumentException("Invalid ReplicatorOptions.");
        }
        final Replicator replicator = new Replicator(opts, raftOptions);
        final boolean connected = replicator.rpcService.connect(opts.getPeerId().getEndpoint());
        if (!connected) {
            LOG.error("Fail to init sending channel to {}", opts.getPeerId());
            // Give up for now; starting the replicator is retried later.
            return null;
        }

        // Register the replicator metric set unless one with the same name already exists.
        final MetricRegistry registry = opts.getNode().getNodeMetrics().getMetricRegistry();
        if (registry != null) {
            try {
                final String metricName = getReplicatorMetricName(opts);
                final boolean alreadyRegistered = registry.getNames().contains(metricName);
                if (!alreadyRegistered) {
                    registry.register(metricName, new ReplicatorMetricSet(opts, replicator));
                }
            } catch (final IllegalArgumentException e) {
                // Deliberately ignored: a failed metric registration must not stop replication.
            }
        }

        // Start replication.
        final ThreadId replicatorId = new ThreadId(replicator, replicator);
        replicator.id = replicatorId;
        replicatorId.lock();
        LOG.info("Replicator={}@{} is started", replicatorId, replicator.options.getPeerId());
        replicator.catchUpClosure = null;
        final long startedAtMs = Utils.nowMs();
        replicator.lastRpcSendTimestamp = startedAtMs;
        // Keep sending heartbeats periodically from now on.
        replicator.startHeartbeatTimer(startedAtMs);
        // Send a probe (empty entries) to learn the follower's log position.
        // The id locked above is unlocked inside sendEmptyEntries.
        replicator.sendEmptyEntries(false);
        // Read the field again instead of the local: sendEmptyEntries is free to change it.
        return replicator.id;
    }
  • 在startHeartbeatTimer中,每隔一段时间产生一个心跳超时事件,通过disruptor消费这个事件来发送一次heartbeat。这里比较绕,不清楚为什么不直接在schedulerExecutor中发送heartbeat。
  • 在sendEmptyEntries方法中,先填充一些基础信息,
/**
 * Fills the fields shared by every AppendEntries request (term, group, server, peer,
 * previous log index/term and committed index) into {@code rb}.
 *
 * @param rb           builder of the request being assembled
 * @param prevLogIndex index of the log entry immediately preceding the entries to send
 * @param isHeartbeat  whether the request is a heartbeat
 * @return {@code false} if the term at {@code prevLogIndex} is unavailable (the entry was
 *         compacted) and the request is not a heartbeat; {@code true} once the fields are set
 */
private boolean fillCommonFields(AppendEntriesRequest.Builder rb, long prevLogIndex, boolean isHeartbeat) {
        final long prevLogTerm = options.getLogManager().getTerm(prevLogIndex);
        long prevLogIndexToSend = prevLogIndex;
        // A zero term for a non-zero index means the entry at prevLogIndex is no longer in the log.
        final boolean prevEntryMissing = prevLogTerm == 0 && prevLogIndex != 0;
        if (prevEntryMissing) {
            if (isHeartbeat) {
                // The log at prevLogIndex has been compacted, which indicates we are installing,
                // or are about to install, a snapshot on the follower. Send both prevLogIndex and
                // prevLogTerm as 0 in the heartbeat so that the follower does nothing besides
                // updating its leader timestamp.
                prevLogIndexToSend = 0;
            } else {
                Requires.requireTrue(prevLogIndex < options.getLogManager().getFirstLogIndex());
                LOG.debug("logIndex={} was compacted", prevLogIndex);
                return false;
            }
        }
        rb.setTerm(options.getTerm());
        rb.setGroupId(options.getGroupId());
        rb.setServerId(options.getServerId().toString());
        rb.setPeerId(options.getPeerId().toString());
        // Index and term of the entry that precedes the entries carried by this request.
        rb.setPrevLogIndex(prevLogIndexToSend);
        rb.setPrevLogTerm(prevLogTerm);
        rb.setCommittedIndex(options.getBallotBox().getLastCommittedIndex());
        return true;
    }
  • 发送完rpc消息后,会执行
/**
 * Records a request that has just been sent and is still awaiting its response.
 *
 * <p>The new record becomes {@code rpcInFly} (the most recently sent request) and is also
 * appended to {@code inflights}; the resulting list size is reported as a metric.
 *
 * @param reqType    type of the request
 * @param startIndex start index stored in the in-flight record
 * @param count      count stored in the in-flight record
 * @param size       size stored in the in-flight record
 * @param seq        sequence number assigned to the request
 * @param rpcInfly   future of the outstanding RPC
 */
private void addInflight(RequestType reqType, long startIndex, int count, int size, int seq,
                             Future rpcInfly) {
        final Inflight latest = new Inflight(reqType, startIndex, count, size, seq, rpcInfly);
        this.rpcInFly = latest;
        this.inflights.add(latest);
        final int pending = this.inflights.size();
        nodeMetrics.recordSize("replicate-inflights-count", pending);
    }
  • 这里维护了两个数据结构:rpcInFly表示最新发送的一次rpc请求;inflights这个list保存所有已经发送出去但还没有接收到返回的request。
  • 当这个探针消息返回时,进入到onRpcReturned方法中。
  • 这里要保证处理response的顺序和发送request的顺序一致,所以用了一个优先级队列:每个rpcResponse携带发送时对应request的seq,按照seq从小到大排列。
if (queuedPipelinedResponse.seq != r.requiredNextSeq) {
                    if (processed > 0) {
                        if (isLogDebugEnabled) {
                            sb.append("has processed ").append(processed).append(" responses,");
                        }
                        break;
                    } else {
                        //Do not processed any responses, UNLOCK id and return.
                        continueSendEntries = false;
                        id.unlock();
                        return;
                    }
                }
  • 这里判断期望的response顺序和收到的response顺序是否一致。
  • 如果顺序一致,即先发送的request先收到response,就进入onAppendEntriesReturned逻辑。这一段是raft日志复制算法的实现。
/**
 * Handles the response to an AppendEntries request (probe or real entries) for one replicator.
 *
 * <p>Failure paths, all returning {@code false}:
 * <ul>
 *   <li>the in-flight record does not line up with the request: reset and probe again;</li>
 *   <li>the RPC status is not OK: reset and block the replicator for a while;</li>
 *   <li>the response is unsuccessful with a higher term: destroy the replicator and raise the node's term;</li>
 *   <li>the response is unsuccessful otherwise: adjust {@code nextIndex} and probe again;</li>
 *   <li>the response is successful but its term differs from the replicator's term: reset and unlock.</li>
 * </ul>
 * On success, replicated entries are reported to the ballot box and {@code nextIndex} is advanced.
 *
 * @param id          id of the replicator; unlocked directly here only on the term-mismatch path
 * @param inflight    the in-flight record matched with this response
 * @param status      status of the RPC
 * @param request     the request that was sent
 * @param response    the response received from the peer
 * @param rpcSendTime time the RPC was sent; used for the latency metric and lastRpcSendTimestamp
 * @param startTimeMs start time passed on to {@code block} when the RPC failed
 * @param r           the replicator the response belongs to
 * @return {@code true} if the response was successful and the replicator state was advanced,
 *         {@code false} on every failure path
 */
private static boolean onAppendEntriesReturned(ThreadId id, Inflight inflight, Status status,
                                                   AppendEntriesRequest request, AppendEntriesResponse response,
                                                   long rpcSendTime, final long startTimeMs, Replicator r) {
        if (inflight.startIndex != request.getPrevLogIndex() + 1) {
            // The in-flight record's startIndex does not equal the request's prevLogIndex + 1:
            // drop all in-flight state, fall back to Probe and send a new probe.
            LOG.warn(
                "Replicator {} received invalid AppendEntriesResponse, in-flight startIndex={}, requset prevLogIndex={}, reset the replicator state and probe again.",
                r, inflight.startIndex, request.getPrevLogIndex());
            r.resetInflights();
            r.state = State.Probe;
            // id is expected to be unlocked inside sendEmptyEntries (not visible here)
            r.sendEmptyEntries(false);
            return false;
        }
        // Record metrics, only for requests that actually carried entries.
        if (request.getEntriesCount() > 0) {
            r.nodeMetrics.recordLatency("replicate-entries", Utils.nowMs() - rpcSendTime);
            r.nodeMetrics.recordSize("replicate-entries-count", request.getEntriesCount());
            r.nodeMetrics.recordSize("replicate-entries-bytes",
                request.getData() != null ? request.getData().size() : 0);
        }
        // Build the debug message only when debug logging is enabled, to avoid the StringBuilder cost.

        final boolean isLogDebugEnabled = LOG.isDebugEnabled();
        StringBuilder sb = null;
        if (isLogDebugEnabled) {
            sb = new StringBuilder("Node "). //
                    append(r.options.getGroupId()).append(":").append(r.options.getServerId()). //
                    append(" received AppendEntriesResponse from "). //
                    append(r.options.getPeerId()). //
                    append(" prevLogIndex=").append(request.getPrevLogIndex()). //
                    append(" prevLogTerm=").append(request.getPrevLogTerm()). //
                    append(" count=").append(request.getEntriesCount());
        }
        if (!status.isOk()) {
            // If the follower crashes, any RPC to the follower fails immediately,
            // so we need to block the follower for a while instead of looping until
            // it comes back or is removed.
            // id is expected to be unlocked inside block (not visible here)
            if (isLogDebugEnabled) {
                sb.append(" fail, sleep.");
                LOG.debug(sb.toString());
            }
            r.state = State.Probe;
            // Warn only on every 10th consecutive error to limit log volume.
            if (++r.consecutiveErrorTimes % 10 == 0) {
                LOG.warn("Fail to issue RPC to {}, consecutiveErrorTimes={}, error={}", r.options.getPeerId(),
                    r.consecutiveErrorTimes, status);
            }
            r.resetInflights();
            // id is expected to be unlocked inside block (not visible here)
            r.block(startTimeMs, status.getCode());
            return false;
        }
        r.consecutiveErrorTimes = 0;
        if (!response.getSuccess()) {
            // The peer answered with a higher term than this replicator's term: notify catch-up
            // waiters with EPERM, destroy this replicator and raise the node's term.
            if (response.getTerm() > r.options.getTerm()) {
                if (isLogDebugEnabled) {
                    sb.append(" fail, greater term ").append(response.getTerm()).append(" expect term ")
                    .append(r.options.getTerm());
                    LOG.debug(sb.toString());
                }
                final NodeImpl node = r.options.getNode();
                r.notifyOnCaughtUp(RaftError.EPERM.getNumber(), true);
                r.destroy();
                node.increaseTermTo(response.getTerm(), new Status(RaftError.EHIGHERTERMRESPONSE,
                    "Leader receives higher term hearbeat_response from peer:%s", r.options.getPeerId()));
                return false;
            }
            if (isLogDebugEnabled) {
                sb.append(" fail, find nextIndex remote lastLogIndex ").append(response.getLastLogIndex())
                .append(" local nextIndex ").append(r.nextIndex);
                LOG.debug(sb.toString());
            }
            if (rpcSendTime > r.lastRpcSendTimestamp) {
                r.lastRpcSendTimestamp = rpcSendTime;
            }
            // Fail, reset the state to try again from nextIndex.
            r.resetInflights();
            // prevLogIndex and prevLogTerm do not match on the peer.
            if (response.getLastLogIndex() + 1 < r.nextIndex) {
                // The follower's log ends before the nextIndex tracked here, i.e. it is behind.
                LOG.debug("LastLogIndex at peer={} is {}", r.options.getPeerId(), response.getLastLogIndex());
                // The peer contains fewer logs than the leader: continue right after its last entry.
                r.nextIndex = response.getLastLogIndex() + 1;
            } else {
                // The peer contains logs from an old term which should be truncated;
                // decrease nextIndex by one to test for the right index to keep.
                // The follower's log reaches nextIndex or beyond but does not match, so the
                // leader steps back one index per probe until the two logs agree.
                if (r.nextIndex > 1) {
                    LOG.debug("logIndex={} dismatch", r.nextIndex);
                    r.nextIndex--;
                } else {
                    LOG.error("Peer={} declares that log at index=0 doesn't match, which is not supposed to happen",
                        r.options.getPeerId());
                }
            }
            // id is expected to be unlocked inside sendEmptyEntries (not visible here)
            r.sendEmptyEntries(false);
            return false;
        }
        if (isLogDebugEnabled) {
            sb.append(", success");
            LOG.debug(sb.toString());
        }
        // Success, but the response term must still equal this replicator's term.
        if (response.getTerm() != r.options.getTerm()) {
            r.resetInflights();
            r.state = State.Probe;
            LOG.error("Fail, response term {} dismatch, expect term {}", response.getTerm(), r.options.getTerm());
            id.unlock();
            return false;
        }
        if (rpcSendTime > r.lastRpcSendTimestamp) {
            r.lastRpcSendTimestamp = rpcSendTime;
        }
        final int entriesSize = request.getEntriesCount();
        if (entriesSize > 0) {
            // Tell the ballot box that this peer has stored [nextIndex, nextIndex + entriesSize - 1].
            // NOTE(review): presumably the ballot box commits and applies to the state machine
            // once a majority has acknowledged; that logic is not visible here.
            r.options.getBallotBox().commitAt(r.nextIndex, r.nextIndex + entriesSize - 1, r.options.getPeerId());
            if (LOG.isDebugEnabled()) {
                LOG.debug("Replicated logs in [{}, {}] to peer {}", r.nextIndex, r.nextIndex + entriesSize - 1,
                    r.options.getPeerId());
            }
        } else {
            // The request is a probe request, change the state into Replicate.
            r.state = State.Replicate;
        }
        // For a probe, entriesSize is 0, so nextIndex stays where it is.
        // Otherwise the entriesSize entries of this request are now on the follower, so skip past them.
        r.nextIndex += entriesSize;
        r.hasSucceeded = true;
        r.notifyOnCaughtUp(0, false);
        // NOTE(review): id is not unlocked on this path; presumably the caller releases it when
        // it goes on to send more entries — confirm against onRpcReturned.
        if (r.timeoutNowIndex > 0 && r.timeoutNowIndex < r.nextIndex) {
            r.sendTimeoutNow(false, false);
        }
        return true;
    }
  • 当收到follower的返回时,如果:
    • 任何异常,队列里最前面的response出问题了,所有已发送未返回的信息全部丢弃,重发。
    • response返回的follower的最新日志index + 1 >= leader记录的nextIndex,说明follower上可能有旧任期遗留的、需要被截断删除的日志:leader把nextIndex减1后重新发送探针,直到找到和follower匹配的位置。
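    • response返回的follower的最新日志index + 1 < leader记录的nextIndex,说明follower的日志落后于leader:leader把nextIndex设置为follower的最新日志index + 1,然后重新发送探针。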

你可能感兴趣的:(jraft日志复制)