EtcdRaft源码分析(快照复制)

snapshot.jpg

在EtcdRaft源码分析(日志复制)篇章里面提了下snapshot复制的部分,但没有展开,这里我们专门针对这部分了解下来龙去脉。

Leader

// maybeSendAppend sends a message to peer `to` based on that peer's progress.
// In this excerpt only the snapshot path is shown: when the term at
// pr.Next-1 or the entries starting at pr.Next cannot be read from the log,
// a MsgSnap carrying the current snapshot is sent instead.
//
// It returns false when nothing was sent (peer paused, nothing to send and
// sendIfEmpty is false, peer not recently active, or the snapshot is
// temporarily unavailable) and true after the message is handed to r.send.
func (r *raft) maybeSendAppend(to uint64, sendIfEmpty bool) bool {
   pr := r.getProgress(to)
   // A paused peer gets nothing this round.
   if pr.IsPaused() {
      return false
   }
   m := pb.Message{}
   m.To = to

   // Both lookups are attempted up front; their errors are inspected below
   // to decide between the snapshot path and the regular path.
   term, errt := r.raftLog.term(pr.Next - 1)
   ents, erre := r.raftLog.entries(pr.Next, r.maxMsgSize)
   // No entries and the caller did not ask for an empty message: skip.
   if len(ents) == 0 && !sendIfEmpty {
      return false
   }

   if errt != nil || erre != nil { // send snapshot if we failed to get term or entries
      // Do not send a snapshot to a peer that has not been active recently.
      if !pr.RecentActive {
         r.logger.Debugf("ignore sending snapshot to %x since it is not recently active", to)
         return false
      }

      m.Type = pb.MsgSnap
      snapshot, err := r.raftLog.snapshot()
      if err != nil {
         // A temporarily unavailable snapshot is not fatal: give up for now.
         if err == ErrSnapshotTemporarilyUnavailable {
            r.logger.Debugf("%x failed to send snapshot to %x because snapshot is temporarily unavailable", r.id, to)
            return false
         }
         // Any other snapshot error is treated as unrecoverable.
         panic(err) // TODO(bdarnell)
      }
      // An empty snapshot here is treated as a broken invariant.
      if IsEmptySnap(snapshot) {
         panic("need non-empty snapshot")
      }
      m.Snapshot = snapshot
      sindex, sterm := snapshot.Metadata.Index, snapshot.Metadata.Term
      r.logger.Debugf("%x [firstindex: %d, commit: %d] sent snapshot[index: %d, term: %d] to %x [%s]",
         r.id, r.raftLog.firstIndex(), r.raftLog.committed, sindex, sterm, to, pr)
      // NOTE(review): becomeSnapshot is defined elsewhere; presumably it moves
      // the progress into ProgressStateSnapshot and records sindex as
      // PendingSnapshot - confirm against Progress.
      pr.becomeSnapshot(sindex)
      r.logger.Debugf("%x paused sending replication messages to %x [%s]", r.id, to, pr)
   } else {
      // The regular (non-snapshot) path is elided in this excerpt.
      ...
      }
   }
   r.send(m)
   return true
}
  • 我们看到Leader在很多场景下都会将自己最新的实时状态跟成员分享,过程也是根据对方的进度来量身打造,如果说在获取Next-1的任期或entries[Next,max]的过程中失败,说明现有的unstable+storage的组合获取不到想要的日志。在Raft里面,已经达成一致的日志是不可能丢的,那么在这个场景下只可能在snapshot里面,也就是快照中。
  • 取出当前的snapshot,unstable没有的话,去storage里面取
  • 将对方进度状态设为ProgressStateSnapshot,PendingSnapshot设为快照的index
  • 给对方发MsgSnap消息,附上快照

Follower

// A snapshot message arrived: treat the sender as the leader and apply it.
case pb.MsgSnap:
   // Zero the elapsed-election counter since we just heard from the sender.
   r.electionElapsed = 0
   // Record the sender as the current leader.
   r.lead = m.From
   r.handleSnapshot(m)

只要收到Leader发来的消息,Follower就要把选举超时计时清零,以表忠心:我不会发起选举。

handleSnapshot

// handleSnapshot tries to restore the snapshot carried by m and answers the
// sender with a MsgAppResp. When the restore succeeds the reply carries the
// log's last index; when the snapshot is ignored the reply carries the
// current committed index.
func (r *raft) handleSnapshot(m pb.Message) {
   snapIndex := m.Snapshot.Metadata.Index
   snapTerm := m.Snapshot.Metadata.Term
   reply := pb.Message{To: m.From, Type: pb.MsgAppResp}

   if !r.restore(m.Snapshot) {
      // Snapshot rejected: tell the sender how far we have committed.
      r.logger.Infof("%x [commit: %d] ignored snapshot [index: %d, term: %d]",
         r.id, r.raftLog.committed, snapIndex, snapTerm)
      reply.Index = r.raftLog.committed
      r.send(reply)
      return
   }

   // Snapshot applied: report the new last index of the log.
   r.logger.Infof("%x [commit: %d] restored snapshot [index: %d, term: %d]",
      r.id, r.raftLog.committed, snapIndex, snapTerm)
   reply.Index = r.raftLog.lastIndex()
   r.send(reply)
}
  • 拿到快照的index和任期
  • 尝试去复原快照,如果复原成功,那么把自己的lastIndex(此时也就是快照的index)反馈给Leader
  • 否则,反馈给Leader自己的committed

restore

// restore attempts to replace the local log state with snapshot s.
//
// It returns false, leaving the log untouched apart from a possible commit
// advance, when:
//   - the snapshot index is not beyond the local committed index;
//   - the local log already has an entry matching the snapshot's index and
//     term (the commit index is fast-forwarded to the snapshot index);
//   - this node is a normal peer but the snapshot lists it as a learner.
//
// Otherwise the log is restored from s, both progress maps are rebuilt from
// the snapshot's ConfState, and true is returned.
func (r *raft) restore(s pb.Snapshot) bool {
   snapIndex, snapTerm := s.Metadata.Index, s.Metadata.Term

   switch {
   case snapIndex <= r.raftLog.committed:
      // Stale snapshot: nothing new for us.
      return false
   case r.raftLog.matchTerm(snapIndex, snapTerm):
      // We already hold the entry at the snapshot point; only commit moves.
      r.logger.Infof("%x [commit: %d, lastindex: %d, lastterm: %d] fast-forwarded commit to snapshot [index: %d, term: %d]",
         r.id, r.raftLog.committed, r.raftLog.lastIndex(), r.raftLog.lastTerm(), snapIndex, snapTerm)
      r.raftLog.commitTo(snapIndex)
      return false
   }

   // The normal peer can't become learner.
   if !r.isLearner {
      for _, learnerID := range s.Metadata.ConfState.Learners {
         if learnerID != r.id {
            continue
         }
         r.logger.Errorf("%x can't become learner when restores snapshot [index: %d, term: %d]", r.id, snapIndex, snapTerm)
         return false
      }
   }

   r.logger.Infof("%x [commit: %d, lastindex: %d, lastterm: %d] starts to restore snapshot [index: %d, term: %d]",
      r.id, r.raftLog.committed, r.raftLog.lastIndex(), r.raftLog.lastTerm(), snapIndex, snapTerm)

   r.raftLog.restore(s)

   // Drop all existing progress tracking and rebuild it from the snapshot's
   // membership: voters first, then learners.
   r.prs = make(map[uint64]*Progress)
   r.learnerPrs = make(map[uint64]*Progress)
   conf := s.Metadata.ConfState
   r.restoreNode(conf.Nodes, false)
   r.restoreNode(conf.Learners, true)
   return true
}
  • 首先快照的index要比当前节点的committed index大,否则说明快照已经过时,直接忽略,这个好理解

  • 然后就是比对任期了,看下快照的index位置的任期是否跟本地一致

    • 如果一致,说明快照有的我都有了,我也不用保存下来了,发来的这次快照的唯一作用是更新我的committedindex到快照的index
  • 将snapshot灌入本地,这里需要注意

    • 首先,你都到了收快照的地步了,就不要矫情。乖乖的把committed设为快照的index
    • 存到unstable的快照里面,并且将entries清空,将offset设为快照的index+1
  • 本地的成员进度全部清空

  • 按照快照Metadata中的成员状态复原进度

    • 既然是restore,当然Match都清零
    • next还是一样,初始设定为自己的lastIndex+1(即最后一条日志的下一位),以便不一致的时候从后往前回退
    • 当然了自己本身的match要设定为最后一位
    // restoreNode registers a fresh progress entry for every id in nodes.
    // Each peer starts with match 0 and next set to lastIndex+1; the entry for
    // this node itself gets match = lastIndex, and r.isLearner is updated to
    // the given isLearner flag when this node appears in the list.
    func (r *raft) restoreNode(nodes []uint64, isLearner bool) {
       for i := range nodes {
          peer := nodes[i]
          nextIdx := r.raftLog.lastIndex() + 1

          var matchIdx uint64
          if peer == r.id {
             // Our own log is, by definition, matched up to its last index.
             matchIdx = nextIdx - 1
             r.isLearner = isLearner
          }

          r.setProgress(peer, matchIdx, nextIdx, isLearner)
          r.logger.Infof("%x restored progress of %x [%s]", r.id, peer, r.getProgress(peer))
       }
    }
    

Leader

// The peer is in snapshot state but needSnapshotAbort reports that its Match
// has already reached PendingSnapshot, so the pending snapshot is abandoned.
case pr.State == ProgressStateSnapshot && pr.needSnapshotAbort():
   r.logger.Debugf("%x snapshot aborted, resumed sending replication messages to %x [%s]", r.id, m.From, pr)
   // Transition back to replicating state via probing state
   // (which takes the snapshot into account). If we didn't
   // move to replicating state, that would only happen with
   // the next round of appends (but there may not be a next
   // round for a while, exposing an inconsistent RaftStatus).
   pr.becomeProbe()
   pr.becomeReplicate()
  • 还记得么,Leader在发起快照复制的时候,把对方进度设为了ProgressStateSnapshot,还记录了PendingSnapshot为snap的index

  • 那么这里会判断对方是否已经到达了这个index,如果达到,说明没有必要再发快照给对方,让他走普通日志复制的流程就好了。

    // needSnapshotAbort reports whether a pending snapshot has become
    // unnecessary: the progress is in ProgressStateSnapshot and Match has
    // already reached (or passed) PendingSnapshot.
    func (pr *Progress) needSnapshotAbort() bool {
       if pr.State != ProgressStateSnapshot {
          return false
       }
       return pr.Match >= pr.PendingSnapshot
    }
    

你可能感兴趣的:(EtcdRaft源码分析(快照复制))