EtcdRaft源码分析（快照复制）

snapshot.jpg

在EtcdRaft源码分析（日志复制）篇章里面提了下snapshot复制的部分，但没有展开，这里我们专门针对这部分了解下来龙去脉。

Leader

// maybeSendAppend prepares a message for peer `to` based on that peer's
// progress and passes it to r.send. It returns false when nothing is sent:
// the peer's progress is paused, there are no entries and sendIfEmpty is
// false, or a snapshot is required but cannot be sent right now.
//
// NOTE: this is an excerpt; the regular append branch is elided ("...").
func (r *raft) maybeSendAppend(to uint64, sendIfEmpty bool) bool {
   pr := r.getProgress(to)
   if pr.IsPaused() {
      return false
   }
   m := pb.Message{}
   m.To = to

   // Fetch the term of the entry just before pr.Next and the entries starting
   // at pr.Next (r.maxMsgSize is passed as the second argument, presumably a
   // size limit). Either lookup may fail; the errors are inspected below.
   term, errt := r.raftLog.term(pr.Next - 1)
   ents, erre := r.raftLog.entries(pr.Next, r.maxMsgSize)
   // Nothing to replicate and the caller did not request an empty message.
   if len(ents) == 0 && !sendIfEmpty {
      return false
   }

   if errt != nil || erre != nil { // send snapshot if we failed to get term or entries
      // Do not ship a snapshot to a peer that has not been active recently.
      if !pr.RecentActive {
         r.logger.Debugf("ignore sending snapshot to %x since it is not recently active", to)
         return false
      }

      m.Type = pb.MsgSnap
      snapshot, err := r.raftLog.snapshot()
      if err != nil {
         // A temporarily unavailable snapshot is not fatal: skip this round.
         if err == ErrSnapshotTemporarilyUnavailable {
            r.logger.Debugf("%x failed to send snapshot to %x because snapshot is temporarily unavailable", r.id, to)
            return false
         }
         panic(err) // TODO(bdarnell)
      }
      // An empty snapshot at this point is treated as a broken invariant.
      if IsEmptySnap(snapshot) {
         panic("need non-empty snapshot")
      }
      m.Snapshot = snapshot
      sindex, sterm := snapshot.Metadata.Index, snapshot.Metadata.Term
      r.logger.Debugf("%x [firstindex: %d, commit: %d] sent snapshot[index: %d, term: %d] to %x [%s]",
         r.id, r.raftLog.firstIndex(), r.raftLog.committed, sindex, sterm, to, pr)
      // Switch the peer's progress to the snapshot state for index sindex
      // (per the log message below, this pauses replication to that peer).
      pr.becomeSnapshot(sindex)
      r.logger.Debugf("%x paused sending replication messages to %x [%s]", r.id, to, pr)
   } else {
      // Regular append path, elided in this excerpt.
      ...
      }
   }
   r.send(m)
   return true
}

我们看到Leader在很多场景下都会将自己最新的实时状态跟成员分享，过程也是根据对方的进度来量身打造，如果说在获取Next-1的任期或entries[Next,max]的过程中失败，说明现有的unstable+storage的组合获取不到想要的日志。在Raft里面，已经达成一致的日志是不可能丢的，那么在这个场景下只可能在snapshot里面，也就是快照中。

取出当前的snapshot，unstable没有的话，去storage里面取

将对方进度状态设为ProgressStateSnapshot，PendingSnapshot设为快照的index

给对方发MsgSnap消息，附上快照

Follower

// On a snapshot message: reset the election timer, record the sender as the
// current leader, then hand the message to handleSnapshot.
case pb.MsgSnap:
   r.electionElapsed = 0
   r.lead = m.From
   r.handleSnapshot(m)

只要有Leader发来问候，Follower就要把选举超时计数清零，以表忠心：我不会在此期间发起选举。

handleSnapshot

// handleSnapshot processes a snapshot message m and always answers the sender
// with a MsgAppResp. If r.restore accepts the snapshot, the reply carries the
// local last index; otherwise it carries the local committed index.
func (r *raft) handleSnapshot(m pb.Message) {
   // Capture the snapshot coordinates before attempting the restore so the
   // log lines below report what was received.
   snapIndex := m.Snapshot.Metadata.Index
   snapTerm := m.Snapshot.Metadata.Term

   if !r.restore(m.Snapshot) {
      r.logger.Infof("%x [commit: %d] ignored snapshot [index: %d, term: %d]",
         r.id, r.raftLog.committed, snapIndex, snapTerm)
      reply := pb.Message{To: m.From, Type: pb.MsgAppResp, Index: r.raftLog.committed}
      r.send(reply)
      return
   }

   r.logger.Infof("%x [commit: %d] restored snapshot [index: %d, term: %d]",
      r.id, r.raftLog.committed, snapIndex, snapTerm)
   reply := pb.Message{To: m.From, Type: pb.MsgAppResp, Index: r.raftLog.lastIndex()}
   r.send(reply)
}

拿到快照的index和任期

尝试去复原快照，如果能复原成功，那么把自己的lastIndex反馈给Leader（复原之后它就等于快照的index）

否则，反馈给Leader自己的committed

restore

// restore attempts to install snapshot s and reports whether it was installed.
//
// It returns false without installing when:
//   - the snapshot index is not beyond the local committed index;
//   - the local log already matches the snapshot's (index, term), in which
//     case only the commit index is advanced to the snapshot index;
//   - this node is not a learner but the snapshot lists it as one.
//
// Otherwise the log is restored from s, both progress maps are rebuilt from
// the snapshot's ConfState, and true is returned.
func (r *raft) restore(s pb.Snapshot) bool {
   snapIndex, snapTerm := s.Metadata.Index, s.Metadata.Term

   switch {
   case snapIndex <= r.raftLog.committed:
      return false
   case r.raftLog.matchTerm(snapIndex, snapTerm):
      r.logger.Infof("%x [commit: %d, lastindex: %d, lastterm: %d] fast-forwarded commit to snapshot [index: %d, term: %d]",
         r.id, r.raftLog.committed, r.raftLog.lastIndex(), r.raftLog.lastTerm(), snapIndex, snapTerm)
      r.raftLog.commitTo(snapIndex)
      return false
   }

   // The normal peer can't become learner.
   if !r.isLearner {
      for _, learnerID := range s.Metadata.ConfState.Learners {
         if learnerID != r.id {
            continue
         }
         r.logger.Errorf("%x can't become learner when restores snapshot [index: %d, term: %d]", r.id, snapIndex, snapTerm)
         return false
      }
   }

   r.logger.Infof("%x [commit: %d, lastindex: %d, lastterm: %d] starts to restore snapshot [index: %d, term: %d]",
      r.id, r.raftLog.committed, r.raftLog.lastIndex(), r.raftLog.lastTerm(), snapIndex, snapTerm)

   r.raftLog.restore(s)

   // Drop all existing progress and rebuild it from the snapshot membership.
   r.prs, r.learnerPrs = make(map[uint64]*Progress), make(map[uint64]*Progress)
   r.restoreNode(s.Metadata.ConfState.Nodes, false)
   r.restoreNode(s.Metadata.ConfState.Learners, true)
   return true
}

首先快照要比当前节点的committedindex大，这个好理解

然后就是比对任期了，看下快照的index位置的任期是否跟本地一致

如果一致，说明快照有的我都有了，我也不用保存下来了，发来的这次快照的唯一作用是更新我的committedindex到快照的index

将snapshot灌入本地，这里需要注意

首先，你都到了收快照的地步了，就不要矫情。乖乖的把committed设为快照的index

存到unstable的快照里面，并且将entries清空，将offset设为快照的index+1

本地的成员进度全部清空
按照快照Metadata中的成员状态复原进度

既然是restore，当然Match都清零

next还是一样，初始都设定为自己的lastIndex+1（即最后一位的下一位），以便不一致的时候从后往前回退

当然了自己本身的match要设定为最后一位
// restoreNode registers a progress entry for every id in nodes, flagged with
// isLearner. Each entry starts with next = lastIndex+1 and match = 0, except
// the entry for this node itself, whose match is set to lastIndex; for that
// entry r.isLearner is also updated to isLearner.
func (r *raft) restoreNode(nodes []uint64, isLearner bool) {
   for i := range nodes {
      peer := nodes[i]
      next := r.raftLog.lastIndex() + 1

      var match uint64
      if peer == r.id {
         // The local node trivially matches its own log.
         match = next - 1
         r.isLearner = isLearner
      }

      r.setProgress(peer, match, next, isLearner)
      r.logger.Infof("%x restored progress of %x [%s]", r.id, peer, r.getProgress(peer))
   }
}

Leader

// The peer was in the snapshot state and needSnapshotAbort reports that the
// pending snapshot is no longer needed: resume normal replication.
case pr.State == ProgressStateSnapshot && pr.needSnapshotAbort():
   r.logger.Debugf("%x snapshot aborted, resumed sending replication messages to %x [%s]", r.id, m.From, pr)
   // Transition back to replicating state via probing state
   // (which takes the snapshot into account). If we didn't
   // move to replicating state, that would only happen with
   // the next round of appends (but there may not be a next
   // round for a while, exposing an inconsistent RaftStatus).
   pr.becomeProbe()
   pr.becomeReplicate()

还记得么，Leader在发起快照复制的时候，把对方进度设为了ProgressStateSnapshot，还记录了PendingSnapshot为snap的index
那么这里会判断对方是否已经到达了这个index，如果达到，说明没有必要再发快照给对方，让他走普通日志复制的流程就好了。
// needSnapshotAbort reports whether the progress is in the snapshot state and
// its Match index has already reached (or passed) PendingSnapshot.
func (pr *Progress) needSnapshotAbort() bool {
   if pr.State != ProgressStateSnapshot {
      return false
   }
   return pr.Match >= pr.PendingSnapshot
}

EtcdRaft源码分析（快照复制）

Leader

Follower

handleSnapshot

restore

Leader

你可能感兴趣的:(EtcdRaft源码分析（快照复制))