在EtcdRaft源码分析(日志复制)篇章里面提了下snapshot复制的部分,但没有展开,这里我们专门针对这部分了解下来龙去脉。
Leader
// maybeSendAppend builds a message for peer `to` from that peer's progress and
// sends it. It returns false without sending when the progress is paused, when
// there are no entries and sendIfEmpty is false, when a snapshot would be
// needed but the peer is not recently active, or when the snapshot is
// temporarily unavailable; otherwise it sends the message and returns true.
//
// NOTE: this excerpt elides the regular append branch ("...").
func (r *raft) maybeSendAppend(to uint64, sendIfEmpty bool) bool {
pr := r.getProgress(to)
if pr.IsPaused() {
return false
}
m := pb.Message{}
m.To = to
// Fetch the term of the entry just before pr.Next and the entries starting
// at pr.Next (r.maxMsgSize is passed as the limit). Errors are checked below.
term, errt := r.raftLog.term(pr.Next - 1)
ents, erre := r.raftLog.entries(pr.Next, r.maxMsgSize)
if len(ents) == 0 && !sendIfEmpty {
return false
}
if errt != nil || erre != nil { // send snapshot if we failed to get term or entries
// Do not ship a snapshot to a peer that has not been recently active.
if !pr.RecentActive {
r.logger.Debugf("ignore sending snapshot to %x since it is not recently active", to)
return false
}
m.Type = pb.MsgSnap
snapshot, err := r.raftLog.snapshot()
if err != nil {
// A temporarily unavailable snapshot is not fatal: skip this round.
if err == ErrSnapshotTemporarilyUnavailable {
r.logger.Debugf("%x failed to send snapshot to %x because snapshot is temporarily unavailable", r.id, to)
return false
}
panic(err) // TODO(bdarnell)
}
if IsEmptySnap(snapshot) {
panic("need non-empty snapshot")
}
m.Snapshot = snapshot
sindex, sterm := snapshot.Metadata.Index, snapshot.Metadata.Term
r.logger.Debugf("%x [firstindex: %d, commit: %d] sent snapshot[index: %d, term: %d] to %x [%s]",
r.id, r.raftLog.firstIndex(), r.raftLog.committed, sindex, sterm, to, pr)
// Move the peer's progress into the snapshot state, with sindex as the
// pending snapshot index.
pr.becomeSnapshot(sindex)
r.logger.Debugf("%x paused sending replication messages to %x [%s]", r.id, to, pr)
} else {
...
}
}
r.send(m)
return true
}
- 我们看到Leader在很多场景下都会将自己最新的实时状态跟成员分享,过程也是根据对方的进度来量身打造,如果说在获取Next-1的任期或entries[Next,max]的过程中失败,说明现有的unstable+storage的组合获取不到想要的日志。在Raft里面,已经达成一致的日志是不可能丢的,那么在这个场景下只可能在snapshot里面,也就是快照中。
- 取出当前的snapshot,unstable没有的话,去storage里面取
- 将对方进度状态设为ProgressStateSnapshot,PendingSnapshot设为快照的index
- 给对方发MsgSnap消息,附上快照
Follower
// On a snapshot message: reset the election timer, record the sender as the
// leader, then hand the message to handleSnapshot.
case pb.MsgSnap:
r.electionElapsed = 0
r.lead = m.From
r.handleSnapshot(m)
只要有Leader发来消息,Follower就要把选举超时计数清零,以表忠心,发誓说我不会发起选举。
handleSnapshot
// handleSnapshot tries to restore the snapshot carried by m and always answers
// the sender with a MsgAppResp. When the restore succeeds the response carries
// the log's last index; when the snapshot is ignored it carries the current
// committed index instead.
func (r *raft) handleSnapshot(m pb.Message) {
	snapIndex, snapTerm := m.Snapshot.Metadata.Index, m.Snapshot.Metadata.Term

	if !r.restore(m.Snapshot) {
		// Snapshot was not applied: report how far we have committed.
		r.logger.Infof("%x [commit: %d] ignored snapshot [index: %d, term: %d]",
			r.id, r.raftLog.committed, snapIndex, snapTerm)
		r.send(pb.Message{To: m.From, Type: pb.MsgAppResp, Index: r.raftLog.committed})
		return
	}

	// Snapshot was applied: report the new last index.
	r.logger.Infof("%x [commit: %d] restored snapshot [index: %d, term: %d]",
		r.id, r.raftLog.committed, snapIndex, snapTerm)
	r.send(pb.Message{To: m.From, Type: pb.MsgAppResp, Index: r.raftLog.lastIndex()})
}
- 拿到快照的index和任期
- 尝试去复原快照,如果复原成功,就把自己的lastIndex(此时即快照的index)反馈给Leader
- 否则,反馈给Leader自己的committed
restore
// restore attempts to replace the local log and membership progress with the
// contents of snapshot s. It returns true only when the snapshot was actually
// installed. It returns false when:
//   - the snapshot index is not beyond the local committed index;
//   - the local log already has a matching entry at the snapshot index/term
//     (in which case only the commit index is advanced to the snapshot index);
//   - this node is not a learner but the snapshot lists it as one.
func (r *raft) restore(s pb.Snapshot) bool {
	snapIndex, snapTerm := s.Metadata.Index, s.Metadata.Term

	// Stale snapshot: nothing to do.
	if snapIndex <= r.raftLog.committed {
		return false
	}

	// The log already contains the snapshot's last entry, so only the commit
	// index needs to move forward.
	if r.raftLog.matchTerm(snapIndex, snapTerm) {
		r.logger.Infof("%x [commit: %d, lastindex: %d, lastterm: %d] fast-forwarded commit to snapshot [index: %d, term: %d]",
			r.id, r.raftLog.committed, r.raftLog.lastIndex(), r.raftLog.lastTerm(), snapIndex, snapTerm)
		r.raftLog.commitTo(snapIndex)
		return false
	}

	// The normal peer can't become learner.
	if !r.isLearner {
		for _, learnerID := range s.Metadata.ConfState.Learners {
			if learnerID != r.id {
				continue
			}
			r.logger.Errorf("%x can't become learner when restores snapshot [index: %d, term: %d]", r.id, snapIndex, snapTerm)
			return false
		}
	}

	r.logger.Infof("%x [commit: %d, lastindex: %d, lastterm: %d] starts to restore snapshot [index: %d, term: %d]",
		r.id, r.raftLog.committed, r.raftLog.lastIndex(), r.raftLog.lastTerm(), snapIndex, snapTerm)

	// Install the snapshot into the log, then rebuild both progress maps from
	// the membership recorded in the snapshot metadata.
	r.raftLog.restore(s)
	r.prs = make(map[uint64]*Progress)
	r.learnerPrs = make(map[uint64]*Progress)
	r.restoreNode(s.Metadata.ConfState.Nodes, false)
	r.restoreNode(s.Metadata.ConfState.Learners, true)
	return true
}
首先快照要比当前节点的committedindex大,这个好理解
然后就是比对任期了,看下快照的index位置的任期是否跟本地一致
- 如果一致,说明快照有的我都有了,我也不用保存下来了,发来的这次快照的唯一作用是更新我的committedindex到快照的index;此时restore返回false,Follower回复给Leader的是更新后的committed
将snapshot灌入本地,这里需要注意
- 首先,你都到了收快照的地步了,就不要矫情。乖乖的把committed设为快照的index
- 存到unstable的快照里面,并且将entries清空,将offset设为快照的index+1
本地的成员进度全部清空
按照快照Metadata中的成员状态复原进度
- 既然是restore,当然Match都清零
- next还是一样,初始设定为自己的lastIndex+1(即最后一条日志的下一位),以便不一致的时候从后往前回退
- 当然了自己本身的match要设定为最后一位
func (r *raft) restoreNode(nodes []uint64, isLearner bool) { for _, n := range nodes { match, next := uint64(0), r.raftLog.lastIndex()+1 if n == r.id { match = next - 1 r.isLearner = isLearner } r.setProgress(n, match, next, isLearner) r.logger.Infof("%x restored progress of %x [%s]", r.id, n, r.getProgress(n)) } }
Leader
// The peer is in snapshot state but its Match has already reached the pending
// snapshot index, so the snapshot no longer needs to be delivered.
case pr.State == ProgressStateSnapshot && pr.needSnapshotAbort():
r.logger.Debugf("%x snapshot aborted, resumed sending replication messages to %x [%s]", r.id, m.From, pr)
// Transition back to replicating state via probing state
// (which takes the snapshot into account). If we didn't
// move to replicating state, that would only happen with
// the next round of appends (but there may not be a next
// round for a while, exposing an inconsistent RaftStatus).
pr.becomeProbe()
pr.becomeReplicate()
还记得么,Leader在发起快照复制的时候,把对方进度设为了ProgressStateSnapshot,还记录了PendingSnapshot为snap的index
那么这里会判断对方是否已经到达了这个index,如果达到,说明没有必要再发快照给对方,让他走普通日志复制的流程就好了。
// needSnapshotAbort reports whether a pending snapshot has become unnecessary:
// the progress is in the snapshot state and Match has already reached or
// passed PendingSnapshot.
func (pr *Progress) needSnapshotAbort() bool {
	if pr.State != ProgressStateSnapshot {
		return false
	}
	return pr.Match >= pr.PendingSnapshot
}