- jraft的日志复制是指从leader往follower复制logEntry的过程。
- 日志复制从节点成为leader开始。在nodeImpl的becomeLeader中
private void becomeLeader() {
Requires.requireTrue(this.state == State.STATE_CANDIDATE, "Illegal state: " + this.state);
LOG.info("Node {} term {} become leader of group {} {}", this.getNodeId(), this.currTerm, this.conf.getConf(),
this.conf.getOldConf());
// cancel candidate vote timer
stopVoteTimer();
this.state = State.STATE_LEADER;
this.leaderId = this.serverId.copy();
this.replicatorGroup.resetTerm(this.currTerm);
for (final PeerId peer : this.conf.listPeers()) {
if (peer.equals(this.serverId)) {
continue;
}
LOG.debug("Node {} term {} add replicator {}", this.getNodeId(), this.currTerm, peer);
//这里为每个follower节点创建一个replicator开始复制。
if (!this.replicatorGroup.addReplicator(peer)) {
LOG.error("Fail to add replicator for {}", peer);
}
}
// init commit manager
this.ballotBox.resetPendingIndex(this.logManager.getLastLogIndex() + 1);
// Register _conf_ctx to reject configuration changing before the first log
// is committed.
if (this.confCtx.isBusy()) {
throw new IllegalStateException();
}
this.confCtx.flush(this.conf.getConf(), this.conf.getOldConf());
this.stepDownTimer.start();
}
public static ThreadId start(ReplicatorOptions opts, RaftOptions raftOptions) {
if (opts.getLogManager() == null || opts.getBallotBox() == null || opts.getNode() == null) {
throw new IllegalArgumentException("Invalid ReplicatorOptions.");
}
final Replicator r = new Replicator(opts, raftOptions);
if (!r.rpcService.connect(opts.getPeerId().getEndpoint())) {
LOG.error("Fail to init sending channel to {}", opts.getPeerId());
//Return and it will be retried later.
return null;
}
//Register replicator metric set.
final MetricRegistry metricRegistry = opts.getNode().getNodeMetrics().getMetricRegistry();
if (metricRegistry != null) {
try {
final String replicatorMetricName = getReplicatorMetricName(opts);
if (!metricRegistry.getNames().contains(replicatorMetricName)) {
metricRegistry.register(replicatorMetricName, new ReplicatorMetricSet(opts, r));
}
} catch (final IllegalArgumentException e) {
//ignore
}
}
//Start replication
r.id = new ThreadId(r, r);
r.id.lock();
LOG.info("Replicator={}@{} is started", r.id, r.options.getPeerId());
r.catchUpClosure = null;
final long now = Utils.nowMs();
r.lastRpcSendTimestamp = now;
// 不断hearterBeat
r.startHeartbeatTimer(now);
//id.unlock in sendEmptyEntries
///发送探针消息,获取当前follower上的日志index。
r.sendEmptyEntries(false);
return r.id;
}
- 在startHeartbeatTimer中,每隔一段时间,产出一个心跳超时的事件,通过distruptor消费这个事件来发送一次heartbeat,这里很绕,不懂为啥不直接在schedulerExecutor中发送heartbeat事件。
- 在sendEmptyEntries方法中,先填充一些基础信息,
private boolean fillCommonFields(AppendEntriesRequest.Builder rb, long prevLogIndex, boolean isHeartbeat) {
final long prevLogTerm = options.getLogManager().getTerm(prevLogIndex);
//说明当前是第一任期,节点刚启动或者刚从故障中恢复,试图从snapshot中恢复数据
if (prevLogTerm == 0 && prevLogIndex != 0) {
if (!isHeartbeat) {
Requires.requireTrue(prevLogIndex < options.getLogManager().getFirstLogIndex());
LOG.debug("logIndex={} was compacted", prevLogIndex);
return false;
} else {
// The log at prev_log_index has been compacted, which indicates
// we is or is going to install snapshot to the follower. So we let
// both prev_log_index and prev_log_term be 0 in the heartbeat
// request so that follower would do nothing besides updating its
// leader timestamp.
prevLogIndex = 0;
}
}
rb.setTerm(options.getTerm());
rb.setGroupId(options.getGroupId());
rb.setServerId(options.getServerId().toString());
rb.setPeerId(options.getPeerId().toString());
//当前leader的最新一条日志index。
rb.setPrevLogIndex(prevLogIndex);
rb.setPrevLogTerm(prevLogTerm);
rb.setCommittedIndex(options.getBallotBox().getLastCommittedIndex());
return true;
}
private void addInflight(RequestType reqType, long startIndex, int count, int size, int seq,
Future rpcInfly) {
this.rpcInFly = new Inflight(reqType, startIndex, count, size, seq, rpcInfly);
this.inflights.add(this.rpcInFly);
nodeMetrics.recordSize("replicate-inflights-count", this.inflights.size());
}
- 这里维护了两个数据结构,一个rpcInfly,表示最新发送的rpc消息。inflight这个list表示所有发送出去但还没有接受到返回的request集合。
- 当这个探针消息返回时,进入到onRpcReturned方法中。
- 这里要保证收到请求时的处理顺序和发送请求的顺序是一致的,这里用了一个优先级队列,每个rpcResponse携带一个发送时的request seq。按照seq从小到大排列。
if (queuedPipelinedResponse.seq != r.requiredNextSeq) {
if (processed > 0) {
if (isLogDebugEnabled) {
sb.append("has processed ").append(processed).append(" responses,");
}
break;
} else {
//Do not processed any responses, UNLOCK id and return.
continueSendEntries = false;
id.unlock();
return;
}
}
- 这里判断期望的response顺序和收到的response顺序是否一致。
- 如果顺序都一致,先发送的先收到response的话。进入到onAppendEntriesReturned逻辑。这一段时raft日志复制算法的实现。
private static boolean onAppendEntriesReturned(ThreadId id, Inflight inflight, Status status,
AppendEntriesRequest request, AppendEntriesResponse response,
long rpcSendTime, final long startTimeMs, Replicator r) {
if (inflight.startIndex != request.getPrevLogIndex() + 1) {
//inflight中记录对request信息和resonse中记录的不一致,直接重新发送probe,重新发送logEntry。
LOG.warn(
"Replicator {} received invalid AppendEntriesResponse, in-flight startIndex={}, requset prevLogIndex={}, reset the replicator state and probe again.",
r, inflight.startIndex, request.getPrevLogIndex());
r.resetInflights();
r.state = State.Probe;
//unlock id in sendEmptyEntries
r.sendEmptyEntries(false);
return false;
}
//record metrics
if (request.getEntriesCount() > 0) {
r.nodeMetrics.recordLatency("replicate-entries", Utils.nowMs() - rpcSendTime);
r.nodeMetrics.recordSize("replicate-entries-count", request.getEntriesCount());
r.nodeMetrics.recordSize("replicate-entries-bytes",
request.getData() != null ? request.getData().size() : 0);
}
//如果不开启debug,就不用耗费stringBuilder去拼接
final boolean isLogDebugEnabled = LOG.isDebugEnabled();
StringBuilder sb = null;
if (isLogDebugEnabled) {
sb = new StringBuilder("Node "). //
append(r.options.getGroupId()).append(":").append(r.options.getServerId()). //
append(" received AppendEntriesResponse from "). //
append(r.options.getPeerId()). //
append(" prevLogIndex=").append(request.getPrevLogIndex()). //
append(" prevLogTerm=").append(request.getPrevLogTerm()). //
append(" count=").append(request.getEntriesCount());
}
if (!status.isOk()) {
// If the follower crashes, any RPC to the follower fails immediately,
// so we need to block the follower for a while instead of looping until
// it comes back or be removed
// dummy_id is unlock in block
if (isLogDebugEnabled) {
sb.append(" fail, sleep.");
LOG.debug(sb.toString());
}
r.state = State.Probe;
if (++r.consecutiveErrorTimes % 10 == 0) {
LOG.warn("Fail to issue RPC to {}, consecutiveErrorTimes={}, error={}", r.options.getPeerId(),
r.consecutiveErrorTimes, status);
}
r.resetInflights();
//unlock in in block
r.block(startTimeMs, status.getCode());
return false;
}
r.consecutiveErrorTimes = 0;
if (!response.getSuccess()) {
// 本节点已经不是leader了,follower节点已经收到了更高term leader的信息。
if (response.getTerm() > r.options.getTerm()) {
if (isLogDebugEnabled) {
sb.append(" fail, greater term ").append(response.getTerm()).append(" expect term ")
.append(r.options.getTerm());
LOG.debug(sb.toString());
}
final NodeImpl node = r.options.getNode();
r.notifyOnCaughtUp(RaftError.EPERM.getNumber(), true);
r.destroy();
node.increaseTermTo(response.getTerm(), new Status(RaftError.EHIGHERTERMRESPONSE,
"Leader receives higher term hearbeat_response from peer:%s", r.options.getPeerId()));
return false;
}
if (isLogDebugEnabled) {
sb.append(" fail, find nextIndex remote lastLogIndex ").append(response.getLastLogIndex())
.append(" local nextIndex ").append(r.nextIndex);
LOG.debug(sb.toString());
}
if (rpcSendTime > r.lastRpcSendTimestamp) {
r.lastRpcSendTimestamp = rpcSendTime;
}
//Fail, reset the state to try again from nextIndex.
r.resetInflights();
// prev_log_index and prev_log_term doesn't match
if (response.getLastLogIndex() + 1 < r.nextIndex) {
// 这里说明,follower节点还落后于当前leader节点的日志状态。
LOG.debug("LastLogIndex at peer={} is {}", r.options.getPeerId(), response.getLastLogIndex());
// The peer contains less logs than leader
r.nextIndex = response.getLastLogIndex() + 1;
} else {
// The peer contains logs from old term which should be truncated,
// decrease _last_log_at_peer by one to test the right index to keep
//这里说明follower节点的日志超前于leader节点,leader 下标回退,直到和follower匹配
if (r.nextIndex > 1) {
LOG.debug("logIndex={} dismatch", r.nextIndex);
r.nextIndex--;
} else {
LOG.error("Peer={} declares that log at index=0 doesn't match, which is not supposed to happen",
r.options.getPeerId());
}
}
// dummy_id is unlock in _send_heartbeat
r.sendEmptyEntries(false);
return false;
}
if (isLogDebugEnabled) {
sb.append(", success");
LOG.debug(sb.toString());
}
// success
if (response.getTerm() != r.options.getTerm()) {
r.resetInflights();
r.state = State.Probe;
LOG.error("Fail, response term {} dismatch, expect term {}", response.getTerm(), r.options.getTerm());
id.unlock();
return false;
}
if (rpcSendTime > r.lastRpcSendTimestamp) {
r.lastRpcSendTimestamp = rpcSendTime;
}
final int entriesSize = request.getEntriesCount();
if (entriesSize > 0) {
//记录对应的follower已经成功写入logEntry,超过半数时,应用到stateMachine
r.options.getBallotBox().commitAt(r.nextIndex, r.nextIndex + entriesSize - 1, r.options.getPeerId());
if (LOG.isDebugEnabled()) {
LOG.debug("Replicated logs in [{}, {}] to peer {}", r.nextIndex, r.nextIndex + entriesSize - 1,
r.options.getPeerId());
}
} else {
//The request is probe request, change the state into Replicate.
r.state = State.Replicate;
}
//对于探针信息,entriesSize为0,下一次要发送的日志下标为nextIndex。
//如果不是探针信息,说明这次发送的entriesSize个logEntry已经写入到follower的日志里了。
r.nextIndex += entriesSize;
r.hasSucceeded = true;
r.notifyOnCaughtUp(0, false);
// dummy_id is unlock in _send_entries
if (r.timeoutNowIndex > 0 && r.timeoutNowIndex < r.nextIndex) {
r.sendTimeoutNow(false, false);
}
return true;
}
- 当收到follower的返回时,如果:
- 任何异常,队列里最前面的response出问题了,所有已发送未返回的信息全部丢弃,重发。
- reponse返回的follower的最新的日志index>leader当前最新的日志index,则follower需要删除
- reponse返回的follower的最新的日志index