To finish the log replication part, I need to modify AppendEntries and RequestVote.
RequestVote grants a vote only when two conditions hold: 1. the candidate's log is at least as up-to-date as the voter's, and 2. the candidate's term is not smaller than the voter's (and the voter has not already voted for someone else in that term). One known weakness: if a partitioned machine with a larger term rejoins, it first bumps every machine's term and only then does a new election run, which is destabilizing. Hence the PreVote optimization: after a partition heals, a candidate first contacts the other machines to check whether it could possibly win; if not, it does not increment its term, so it cannot inflate its term on its own while partitioned (a rough PreVote sketch is given right after the RequestVote code). Here is the RequestVote part:
The reasoning behind the election restriction (which ensures log entries only ever flow from leader to follower): every committed entry must be present on the new leader, because both committing and winning RequestVote require a majority, so the two majorities overlap in at least one server; therefore a leader holding all committed entries can always be elected.
func (rf *Raft) RequestVote(args *RequestVoteArgs, reply *RequestVoteReply) {
    // Your code here (2A, 2B).
    log.Println("Rpc requestVote start ", rf.me)
    rf.mu.Lock()
    log.Println("Rpc get lock", rf.me)
    defer rf.mu.Unlock()
    reply.Vote_Granted = false
    // Reject if the candidate's term is stale, or we already voted for someone else in this term.
    if rf.CurrentTerm > args.Candidate_Curr_Term || (rf.CurrentTerm == args.Candidate_Curr_Term && rf.VoteFor != -1 && rf.VoteFor != args.Candidate_Id) {
        reply.Current_Term = rf.CurrentTerm
        reply.Vote_Granted = false
        log.Println(args.Candidate_Id, " vote grant fail", rf.me)
        return
    }
    if args.Candidate_Curr_Term > rf.CurrentTerm {
        log.Println("request term is higher!!!!")
        rf.ToFollower()
        rf.VoteFor = -1
        rf.CurrentTerm = args.Candidate_Curr_Term
    }
    // Election restriction: reject candidates whose log is not at least as up-to-date as ours.
    if args.Last_Log_Term < rf.Log_Array[len(rf.Log_Array)-1].Log_Term {
        reply.Current_Term = rf.CurrentTerm
        reply.Vote_Granted = false
        log.Println(args.Candidate_Id, " vote grant fail, the log term is not newer", rf.me)
        return
    }
    if args.Last_Log_Term == rf.Log_Array[len(rf.Log_Array)-1].Log_Term && args.Last_Log_Index < len(rf.Log_Array)-1 {
        reply.Current_Term = rf.CurrentTerm
        reply.Vote_Granted = false
        log.Println(args.Candidate_Id, " vote grant fail, the log length is smaller", rf.me)
        return
    }
    // Both conditions are satisfied: grant the vote.
    rf.VoteFor = args.Candidate_Id
    reply.Current_Term = rf.CurrentTerm
    log.Println(args.Candidate_Id, " vote granted ", rf.me)
    rf.ResetElection()
    reply.Vote_Granted = true
    // TODO: term is used for the candidate to update itself
}
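As mentioned above, here is a rough sketch of what a PreVote round could look like. This is not part of the lab code: sendPreVote and the blocking, sequential fan-out are placeholders, and a real implementation would use a separate RPC that never persists VoteFor. The point is only that CurrentTerm is not incremented until a majority says the candidate could win.

// Hypothetical PreVote sketch (not implemented in this lab).
// The candidate probes peers with its *proposed* term; only if a majority
// would grant the vote does it start a real election and bump CurrentTerm.
func (rf *Raft) preVote() bool {
    rf.mu.Lock()
    args := RequestVoteArgs{
        Candidate_Curr_Term: rf.CurrentTerm + 1, // proposed term; CurrentTerm itself is untouched
        Candidate_Id:        rf.me,
        Last_Log_Index:      len(rf.Log_Array) - 1,
        Last_Log_Term:       rf.Log_Array[len(rf.Log_Array)-1].Log_Term,
    }
    rf.mu.Unlock()

    votes := 1 // the candidate counts itself
    for peer := range rf.peers {
        if peer == rf.me {
            continue
        }
        reply := RequestVoteReply{}
        // sendPreVote is assumed to be a PreVote RPC that applies the same two
        // checks as RequestVote but never records VoteFor on the receiver.
        if rf.sendPreVote(peer, &args, &reply) && reply.Vote_Granted {
            votes++
        }
    }
    // A partitioned candidate cannot reach a majority, so it never inflates its term.
    return votes > len(rf.peers)/2
}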
For empty heartbeats versus heartbeats that carry log entries, I followed 谭神's blog and used a dedicated replicator goroutine per peer to handle log replication. It really cuts down the mental overhead; a brilliant touch!
for peer := range rf.peers {
    rf.Next_Idx[peer] = 1
    rf.Match_Idx[peer] = 0
    if peer != rf.me {
        rf.ReplicatorCond[peer] = sync.NewCond(&sync.Mutex{})
        go rf.replicator(peer)
    }
}
func (rf *Raft) replicator(peer int) {
    rf.ReplicatorCond[peer].L.Lock()
    defer rf.ReplicatorCond[peer].L.Unlock()
    for !rf.killed() {
        // if there is no need to replicate entries for this peer, just release CPU and wait other goroutine's signal if service adds new Command
        // if this peer needs replicating entries, this goroutine will call replicateOneRound(peer) multiple times until this peer catches up, and then wait
        for !rf.NeedReplicating(peer) {
            rf.ReplicatorCond[peer].Wait()
        }
        rf.replicateOneRound(peer)
    }
}
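NeedReplicating isn't listed here; assuming the role is stored in something like rf.state, a minimal version would just check whether this peer is still behind the leader's last log entry:

// Sketch: replicate only while we are leader and this peer's matched index
// lags behind our last log entry (rf.state and Leader are assumptions about naming).
func (rf *Raft) NeedReplicating(peer int) bool {
    rf.mu.Lock()
    defer rf.mu.Unlock()
    return rf.state == Leader && rf.Match_Idx[peer] < len(rf.Log_Array)-1
}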
For a log heartbeat, the leader only sends a signal to that peer's replicator, which then runs rounds of replication; because the replicator keeps going until the peer catches up, this neatly solves the problem of a single round not being enough.
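The sending side is not shown above; here is a sketch of what it might look like, assuming a BroadcastHeartbeat(isHeartBeat bool) helper (the name is mine; only ReplicatorCond and replicateOneRound come from the code above):

// Sketch of the signal path. An empty heartbeat goes out to every peer right away
// to maintain leadership; a log heartbeat only wakes each peer's replicator,
// which then calls replicateOneRound until that peer has caught up.
func (rf *Raft) BroadcastHeartbeat(isHeartBeat bool) {
    for peer := range rf.peers {
        if peer == rf.me {
            continue
        }
        if isHeartBeat {
            go rf.replicateOneRound(peer) // empty heartbeat: send immediately
        } else {
            rf.ReplicatorCond[peer].Signal() // log heartbeat: just wake the replicator
        }
    }
}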
Next comes the AppendEntries handler, where the core logic of log replication lives.
func (rf *Raft) AppendEntries(args *AppendArgs, reply *AppendReply) {
    rf.mu.Lock()
    reply.PrevLogIndex = args.PrevLogIndex
    log.Println("append ", rf.me)
    defer rf.mu.Unlock()
    if args.Leader_Term < rf.CurrentTerm {
        reply.Term = rf.CurrentTerm
        reply.Success = false
        return
    }
    if args.Leader_Term > rf.CurrentTerm {
        rf.CurrentTerm, rf.VoteFor = args.Leader_Term, -1
    }
    rf.ToFollower()
    // log.Println("append success", rf.me, " ", time.Now())
    rf.ResetElection()
    // Logs don't match: delete the log from PrevLogIndex onward and return false.
    if args.PrevLogIndex != 0 {
        if args.PrevLogIndex >= len(rf.Log_Array) || rf.Log_Array[args.PrevLogIndex].Log_Term != args.PrevLogTerm {
            reply.Success = false
            reply.Term = args.Leader_Term
            // DPrintf("the follower{%d} log from {%v} to {%v} ", rf.me, rf.Log_Array, rf.Log_Array[:args.PrevLogIndex])
            if args.PrevLogIndex < len(rf.Log_Array) {
                rf.Log_Array = rf.Log_Array[:args.PrevLogIndex]
            }
            return
        }
    } else {
        DPrintf("prevlog_idx = 0")
    }
    // Append entries starting at PrevLogIndex+1, i.e. next_idx.
    // Check whether a truncate-and-append is really needed, so that an out-of-order (stale) RPC cannot truncate newer entries.
    if !rf.checkMyLog(args.PrevLogIndex, args.Entries) {
        rf.Log_Array = rf.Log_Array[:args.PrevLogIndex+1]
        rf.Log_Array = append(rf.Log_Array, args.Entries...)
    } else {
        DPrintf("out-of-order append would have truncated the log, skip the truncate-and-append branch")
    }
    DPrintf("the follower{%d} log is success log{%v}............", rf.me, rf.Log_Array)
    // Set the local commit index to the smaller of Leader_Commit and the last log index.
    if rf.Committed_Idx < args.Leader_Commit {
        rf.Committed_Idx = int(math.Min(float64(args.Leader_Commit), float64(len(rf.Log_Array)-1)))
        rf.ApplyCond.Signal()
        DPrintf("Node{%v} commit_idx{%v} change and notify", rf.me, rf.Committed_Idx)
    }
    // Heartbeat bookkeeping.
    rf.CurrentTerm = args.Leader_Term
    reply.Success = true
    // TODO: 2,3,4,5 in the paper
}
If the logs don't match, the follower deletes the entries from the conflict point onward, and the leader's next AppendEntries backs its PrevLogIndex up to the last entry of the previous term, repeating until PrevLogIndex reaches 0. If they do match, the follower must not blindly truncate, because RPCs can arrive out of order: if the RPC carrying the longer log arrives first and the one carrying the shorter log arrives later, truncating would throw away entries that were already appended, hence the extra checkMyLog step.
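checkMyLog is not listed above; its job, as described, is to report whether the local log already contains every entry the leader sent (same slot, same term), in which case this RPC is stale or duplicated and must not trigger a truncation. A sketch under that assumption:

// Sketch: return true if each entry in `entries` already sits at
// prevLogIndex+1+i in the local log with a matching term, i.e. this
// AppendEntries carries nothing new and truncating would only lose data.
func (rf *Raft) checkMyLog(prevLogIndex int, entries []LogEntry) bool {
    for i, entry := range entries {
        idx := prevLogIndex + 1 + i
        if idx >= len(rf.Log_Array) || rf.Log_Array[idx].Log_Term != entry.Log_Term {
            return false
        }
    }
    return true
}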
The handling of AppendEntries replies also has to tolerate out-of-order RPCs; PrevLogIndex serves as a kind of sequence number. The median of the match indexes is used as commit_idx, and when it grows the applier goroutine is notified. The while + wait pattern turns out to be a great fit for background goroutines that handle one specific job; both the Applier and the replicator use it, which is my biggest takeaway here.
func (rf *Raft) processAppendReply(peer int, args AppendArgs, reply AppendReply) {
    if reply.PrevLogIndex != rf.Next_Idx[peer]-1 {
        log.Printf("rpc order delay")
        return
    }
    if !reply.Success && reply.Term == args.Leader_Term {
        // Log inconsistency: back off next_idx and retry.
        rf.Next_Idx[peer] = rf.GetIdxPreTerm(rf.Next_Idx[peer] - 1)
        log.Printf("Node{%v}'s next_idx become{%v}", rf.me, rf.Next_Idx[peer])
        return
    }
    // A peer with a larger term exists: step down to follower.
    if !reply.Success && reply.Term > args.Leader_Term {
        log.Println("found a larger term and change state to follower")
        rf.ToFollower()
        rf.CurrentTerm, rf.VoteFor = reply.Term, -1
        return
    }
    if !reply.Success && reply.Term < args.Leader_Term {
        log.Fatalf("reply term smaller")
    }
    if reply.Success {
        log.Printf("Leader Node{%v} receive the Node{%v} append success next_id{%v} log_len{%v}, add{%v}", rf.me, peer, rf.Next_Idx[peer], len(rf.Log_Array), len(args.Entries))
        // Update next_idx and match_idx.
        // Guards against two RPCs arriving concurrently or out of order.
        rf.Next_Idx[peer] += len(args.Entries)
        rf.Match_Idx[peer] = rf.Next_Idx[peer] - 1
        // Take the median of match_idx as commit_idx: at least half of the peers have already replicated up to it.
        DPrintf("match_array{%v}}", rf.Match_Idx)
        matchIdx := make([]int, 0)
        for i := 0; i < len(rf.peers); i++ {
            if rf.me != i {
                matchIdx = append(matchIdx, rf.Match_Idx[i])
            }
        }
        matchIdx = append(matchIdx, len(rf.Log_Array)-1)
        sort.Ints(matchIdx)
        commit_idx := matchIdx[(len(matchIdx))/2]
        DPrintf("match_array{%v} and commit_idx{%v}", rf.Match_Idx, commit_idx)
        if commit_idx > rf.Committed_Idx {
            DPrintf("Leader Node{%v} commit increase from{%v} to {%v} and signal", rf.me, rf.Committed_Idx, commit_idx)
            rf.Committed_Idx = commit_idx
            // Notify the applier goroutine.
            rf.ApplyCond.Signal()
        }
    }
}
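GetIdxPreTerm, used above to back off Next_Idx, is also not listed. Following the description in the mismatch handling, it should return the first index of the term found at the given position, so that the next PrevLogIndex lands on the last entry of the previous term and a whole conflicting term is skipped per retry. A sketch:

// Sketch: walk back over all entries that share the term at `idx` and return the
// first index of that term; the caller uses it as the new Next_Idx, so the next
// PrevLogIndex is the last entry of the preceding term. Never goes below 1.
func (rf *Raft) GetIdxPreTerm(idx int) int {
    if idx < 1 {
        return 1
    }
    term := rf.Log_Array[idx].Log_Term
    for idx > 1 && rf.Log_Array[idx-1].Log_Term == term {
        idx--
    }
    return idx
}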
And finally the applier goroutine:
func (rf *Raft) Applier() {
    for !rf.killed() {
        rf.mu.Lock()
        for rf.Last_Applied_Idx >= rf.Committed_Idx || len(rf.Log_Array) <= rf.Committed_Idx {
            rf.ApplyCond.Wait()
            DPrintf("Node{%v}, last_applied{%v}, committed_idx{%v}", rf.me, rf.Last_Applied_Idx, rf.Committed_Idx)
            rf.Committed_Idx = int(math.Min(float64(rf.Committed_Idx), float64(len(rf.Log_Array)-1)))
            DPrintf("{log len{%d}, commit_id{%d}}", len(rf.Log_Array), rf.Committed_Idx)
        }
        DPrintf("Node{%d}commit_idx{%d} last_applied_idx{%d} log{%v}, logtoapply{%v}", rf.me, rf.Committed_Idx, rf.Last_Applied_Idx, rf.Log_Array, rf.Log_Array[rf.Last_Applied_Idx+1:rf.Committed_Idx+1])
        entries := make([]LogEntry, rf.Committed_Idx-rf.Last_Applied_Idx)
        copy(entries, rf.Log_Array[rf.Last_Applied_Idx+1:rf.Committed_Idx+1])
        DPrintf("Node{%v} entries{%v}", rf.me, entries)
        // rf.mu.Unlock()
        for _, entry := range entries {
            rf.ApplyChan <- ApplyMsg{
                CommandValid: true,
                Command:      entry.Command,
                CommandIndex: entry.Index,
            }
        }
        // rf.mu.Lock()
        DPrintf("{Node %v} applies entries %v-%v in term %v", rf.me, rf.Last_Applied_Idx, rf.Committed_Idx, rf.CurrentTerm)
        rf.Last_Applied_Idx = int(math.Max(float64(rf.Last_Applied_Idx), float64(rf.Committed_Idx)))
        rf.mu.Unlock()
    }
}
The applier goroutine had a bug here: unlocking and re-locking around the channel sends exposes intermediate state and breaks consistency. Committed_Idx can be advanced by the next round before Last_Applied_Idx catches up, and then the next step gets stuck. That is why the Unlock/Lock pair above is left commented out.
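One common way to get both (releasing the lock during the potentially slow channel sends without exposing inconsistent state) is to snapshot Committed_Idx and the entries while holding the lock, unlock only for the sends, and then advance Last_Applied_Idx from the snapshotted value instead of re-reading rf.Committed_Idx. A sketch of one apply round in that style, using the same fields as above and taking the lock inside the loop rather than around the whole body:

// Sketch of a "safe unlock" apply round: copy everything under the lock,
// send on the channel without the lock, then move Last_Applied_Idx forward
// only up to the snapshotted commitIdx (not the live rf.Committed_Idx).
rf.mu.Lock()
commitIdx, lastApplied := rf.Committed_Idx, rf.Last_Applied_Idx
entries := make([]LogEntry, commitIdx-lastApplied)
copy(entries, rf.Log_Array[lastApplied+1:commitIdx+1])
rf.mu.Unlock()

for _, entry := range entries {
    rf.ApplyChan <- ApplyMsg{
        CommandValid: true,
        Command:      entry.Command,
        CommandIndex: entry.Index,
    }
}

rf.mu.Lock()
if commitIdx > rf.Last_Applied_Idx {
    rf.Last_Applied_Idx = commitIdx
}
rf.mu.Unlock()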
I also picked up this logging style, Node{%v}: with the braces it is much easier to see the values at a glance.
Ran the tests 500 times; it should be good.