// Annotated study copy; accompanying notes: http://note.youdao.com/noteshare?id=9ef348e59b2ba24c5ad6323ed7ea9adb
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.zookeeper.server.quorum;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.zookeeper.common.Time;
import org.apache.zookeeper.jmx.MBeanRegistry;
import org.apache.zookeeper.server.ZooKeeperThread;
import org.apache.zookeeper.server.quorum.QuorumCnxManager.Message;
import org.apache.zookeeper.server.quorum.QuorumPeer.LearnerType;
import org.apache.zookeeper.server.quorum.QuorumPeer.QuorumServer;
import org.apache.zookeeper.server.quorum.QuorumPeer.ServerState;
import org.apache.zookeeper.server.util.ZxidUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Implementation of leader election using TCP. It uses an object of the class
 * QuorumCnxManager to manage connections (that is, the connections to the
 * other servers). Apart from that, the algorithm is push-based as with the
 * other UDP implementations.
 *
 * There are a few parameters that can be tuned to change its behavior. First,
 * finalizeWait (a constant declared in this class) determines the amount of
 * time to wait until deciding upon a leader. This is part of the leader
 * election algorithm.
 */
public class FastLeaderElection implements Election {
private static final Logger LOG = LoggerFactory.getLogger(FastLeaderElection.class);
/**
* Determine how much time a process has to wait
* once it believes that it has reached the end of
* leader election.
* The value is used as a timeout in milliseconds when polling the
* receive queue in lookForLeader(), and is also the initial value of
* the notification timeout there.
*/
final static int finalizeWait = 200;
/**
* Upper bound on the amount of time between two consecutive
* notification checks. This impacts the amount of time to get
* the system up again after long partitions. Currently 60 seconds.
*
* In lookForLeader() the notification timeout starts at finalizeWait,
* is doubled after every poll that returns nothing, and is capped at
* this value.
*/
final static int maxNotificationInterval = 60000;
/**
* Connection manager. Fast leader election uses TCP for
* communication between peers, and QuorumCnxManager manages
* such connections.
*/
QuorumCnxManager manager;
/**
 * Notifications are messages that let other peers know that a given peer
 * has changed its vote, either because it has joined leader election (a new
 * round starts with a vote for itself) or because it learned of another peer
 * with a higher zxid, or the same zxid and a higher server id.
 */
static public class Notification {
    /** Format version, introduced in 3.4.6. */
    public final static int CURRENTVERSION = 0x1;

    /** Message format version carried by this notification. */
    int version;

    /** Proposed leader: server id of the leader this notification votes for. */
    long leader;

    /** zxid of the proposed leader. */
    long zxid;

    /** Election epoch (logical clock) of the round this notification belongs to. */
    long electionEpoch;

    /** Current state of the sender. */
    QuorumPeer.ServerState state;

    /** Server id of the sender. */
    long sid;

    /** Epoch of the proposed leader. */
    long peerEpoch;

    /**
     * Renders every field with a short label; zxid and both epochs are
     * printed in hexadecimal.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append(Long.toHexString(version)).append(" (message format version), ");
        text.append(leader).append(" (n.leader), 0x");
        text.append(Long.toHexString(zxid)).append(" (n.zxid), 0x");
        text.append(Long.toHexString(electionEpoch)).append(" (n.round), ");
        text.append(state).append(" (n.state), ");
        text.append(sid).append(" (n.sid), 0x");
        text.append(Long.toHexString(peerEpoch)).append(" (n.peerEpoch) ");
        return text.toString();
    }
}
/**
 * Builds the notification packet that is sent to another peer.
 *
 * The packet is 40 bytes: a 4-byte state, four 8-byte longs (leader, zxid,
 * election epoch, peer epoch) and the 4-byte format version. The returned
 * buffer is not flipped: its position is left right after the last field.
 *
 * @param state         numeric code of the sender's state
 * @param leader        id of the proposed leader
 * @param zxid          zxid of the proposed leader
 * @param electionEpoch election epoch (logical clock) of the sender
 * @param epoch         epoch of the proposed leader
 * @return a heap buffer backed by a 40-byte array holding the packet
 */
static ByteBuffer buildMsg(int state,
        long leader,
        long zxid,
        long electionEpoch,
        long epoch) {
    final ByteBuffer packet = ByteBuffer.allocate(40);
    packet.putInt(state)
            .putLong(leader)
            .putLong(zxid)
            .putLong(electionEpoch)
            .putLong(epoch)
            .putInt(Notification.CURRENTVERSION);
    return packet;
}
/**
 * Messages that a peer wants to send to other peers.
 * These messages can be both Notifications and Acks
 * of reception of notification.
 */
static public class ToSend {
    static enum mType {crequest, challenge, notification, ack}

    /** Proposed leader in the case of notification. */
    long leader;

    /** zxid of the proposed leader. */
    long zxid;

    /** Election epoch. */
    long electionEpoch;

    /** State of the sending peer. */
    QuorumPeer.ServerState state;

    /** Server id of the recipient. */
    long sid;

    /** Epoch of the proposed leader. */
    long peerEpoch;

    /**
     * Creates a message addressed to {@code sid}.
     *
     * @param type          message type; accepted but not stored by this constructor
     * @param leader        proposed leader
     * @param zxid          zxid of the proposed leader
     * @param electionEpoch election epoch
     * @param state         state of the sending peer
     * @param sid           server id of the recipient
     * @param peerEpoch     epoch of the proposed leader
     */
    ToSend(mType type,
            long leader,
            long zxid,
            long electionEpoch,
            ServerState state,
            long sid,
            long peerEpoch) {
        this.leader = leader;
        this.zxid = zxid;
        this.electionEpoch = electionEpoch;
        this.state = state;
        this.sid = sid;
        this.peerEpoch = peerEpoch;
    }
}
/** Outgoing messages; WorkerSender polls this queue and hands each message to the connection manager. */
LinkedBlockingQueue<ToSend> sendqueue;
/** Incoming notifications; filled by WorkerReceiver and polled by lookForLeader(). */
LinkedBlockingQueue<Notification> recvqueue;
/**
* Multi-threaded implementation of message handler. Messenger
* defines two inner classes: WorkerReceiver and WorkerSender. The
* functionality of each is obvious from the name. Each of these
* is run on its own daemon thread.
*/
protected class Messenger {
/**
* Receives messages from instance of QuorumCnxManager on
* method run(), and processes such messages.
*/
class WorkerReceiver extends ZooKeeperThread {
volatile boolean stop;
QuorumCnxManager manager;
WorkerReceiver(QuorumCnxManager manager) {
super("WorkerReceiver");
this.stop = false;
this.manager = manager;
}
public void run() {
Message response;
while (!stop) {
// Sleeps on receive
try{
response = manager.pollRecvQueue(3000, TimeUnit.MILLISECONDS);
if(response == null) continue;
/*
* If it is from an observer, respond right away.
* Note that the following predicate assumes that
* if a server is not a follower, then it must be
* an observer. If we ever have any other type of
* learner in the future, we'll have to change the
* way we check for observers.
*/
if(!validVoter(response.sid)){
Vote current = self.getCurrentVote();
ToSend notmsg = new ToSend(ToSend.mType.notification,
current.getId(),
current.getZxid(),
logicalclock.get(),
self.getPeerState(),
response.sid,
current.getPeerEpoch());
sendqueue.offer(notmsg);
} else {
// Receive new message
if (LOG.isDebugEnabled()) {
LOG.debug("Receive new notification message. My id = "
+ self.getId());
}
/*
* We check for 28 bytes for backward compatibility
*/
if (response.buffer.capacity() < 28) {
LOG.error("Got a short response: "
+ response.buffer.capacity());
continue;
}
boolean backCompatibility = (response.buffer.capacity() == 28);
response.buffer.clear();
// Instantiate Notification and set its attributes
Notification n = new Notification();
// State of peer that sent this message
QuorumPeer.ServerState ackstate = QuorumPeer.ServerState.LOOKING;
switch (response.buffer.getInt()) {
case 0:
ackstate = QuorumPeer.ServerState.LOOKING;
break;
case 1:
ackstate = QuorumPeer.ServerState.FOLLOWING;
break;
case 2:
ackstate = QuorumPeer.ServerState.LEADING;
break;
case 3:
ackstate = QuorumPeer.ServerState.OBSERVING;
break;
default:
continue;
}
n.leader = response.buffer.getLong();
n.zxid = response.buffer.getLong();
n.electionEpoch = response.buffer.getLong();
n.state = ackstate;
n.sid = response.sid;
if(!backCompatibility){
n.peerEpoch = response.buffer.getLong();
} else {
if(LOG.isInfoEnabled()){
LOG.info("Backward compatibility mode, server id=" + n.sid);
}
n.peerEpoch = ZxidUtils.getEpochFromZxid(n.zxid);
}
/*
* Version added in 3.4.6
*/
n.version = (response.buffer.remaining() >= 4) ?
response.buffer.getInt() : 0x0;
/*
* Print notification info
*/
if(LOG.isInfoEnabled()){
printNotification(n);
}
/*
* If this server is looking, then send proposed leader
*/
if(self.getPeerState() == QuorumPeer.ServerState.LOOKING){
recvqueue.offer(n);
/*
* Send a notification back if the peer that sent this
* message is also looking and its logical clock is
* lagging behind.
*/
if((ackstate == QuorumPeer.ServerState.LOOKING)
&& (n.electionEpoch < logicalclock.get())){
Vote v = getVote();
ToSend notmsg = new ToSend(ToSend.mType.notification,
v.getId(),
v.getZxid(),
logicalclock.get(),
self.getPeerState(),
response.sid,
v.getPeerEpoch());
sendqueue.offer(notmsg);
}
} else {
/*
* If this server is not looking, but the one that sent the ack
* is looking, then send back what it believes to be the leader.
*/
Vote current = self.getCurrentVote();
if(ackstate == QuorumPeer.ServerState.LOOKING){
if(LOG.isDebugEnabled()){
LOG.debug("Sending new notification. My id = " +
self.getId() + " recipient=" +
response.sid + " zxid=0x" +
Long.toHexString(current.getZxid()) +
" leader=" + current.getId());
}
ToSend notmsg;
if(n.version > 0x0) {
notmsg = new ToSend(
ToSend.mType.notification,
current.getId(),
current.getZxid(),
current.getElectionEpoch(),
self.getPeerState(),
response.sid,
current.getPeerEpoch());
} else {
Vote bcVote = self.getBCVote();
notmsg = new ToSend(
ToSend.mType.notification,
bcVote.getId(),
bcVote.getZxid(),
bcVote.getElectionEpoch(),
self.getPeerState(),
response.sid,
bcVote.getPeerEpoch());
}
sendqueue.offer(notmsg);
}
}
}
} catch (InterruptedException e) {
System.out.println("Interrupted Exception while waiting for new message" +
e.toString());
}
}
LOG.info("WorkerReceiver is down");
}
}
/**
 * This worker simply dequeues a message to send and
 * queues it on the manager's queue.
 */
class WorkerSender extends ZooKeeperThread {
    /** Set to true by Messenger.halt(); run() exits once it observes it. */
    volatile boolean stop;
    QuorumCnxManager manager;

    WorkerSender(QuorumCnxManager manager) {
        super("WorkerSender");
        this.stop = false;
        this.manager = manager;
    }

    /**
     * Polls the send queue (waiting up to 3 seconds per attempt) and
     * processes each message until stopped or interrupted.
     */
    public void run() {
        try {
            while (!stop) {
                ToSend pending = sendqueue.poll(3000, TimeUnit.MILLISECONDS);
                if (pending != null) {
                    process(pending);
                }
            }
        } catch (InterruptedException e) {
            // An interrupt ends the sender loop.
        }
        LOG.info("WorkerSender is down");
    }

    /**
     * Called by run() once there is a new message to send.
     *
     * @param m message to send
     */
    void process(ToSend m) {
        final int stateCode = m.state.ordinal();
        ByteBuffer packet = buildMsg(stateCode,
                m.leader,
                m.zxid,
                m.electionEpoch,
                m.peerEpoch);
        manager.toSend(m.sid, packet);
    }
}
WorkerSender ws;
WorkerReceiver wr;

/**
 * Constructor of class Messenger. Creates the sender and the receiver
 * worker and starts each of them on its own daemon thread.
 *
 * @param manager Connection manager
 */
Messenger(QuorumCnxManager manager) {
    this.ws = new WorkerSender(manager);
    startDaemon(this.ws, "WorkerSender[myid=" + self.getId() + "]");

    this.wr = new WorkerReceiver(manager);
    startDaemon(this.wr, "WorkerReceiver[myid=" + self.getId() + "]");
}

/** Wraps the worker in a new daemon thread with the given name and starts it. */
private void startDaemon(Runnable worker, String threadName) {
    Thread runner = new Thread(worker, threadName);
    runner.setDaemon(true);
    runner.start();
}

/**
 * Stops instances of WorkerSender and WorkerReceiver by raising their
 * stop flags.
 */
void halt() {
    ws.stop = true;
    wr.stop = true;
}
}
// The peer (server) on whose behalf this election runs
QuorumPeer self;
Messenger messenger;
// Logical clock: incremented at the start of every lookForLeader() call
AtomicLong logicalclock = new AtomicLong(); /* Election instance */
// This server's current proposal: proposed leader id, its zxid and its epoch
long proposedLeader;
long proposedZxid;
long proposedEpoch;
/**
 * Returns the current value of the logical clock counter.
 *
 * @return the value currently held by the logical clock
 */
public long getLogicalClock() {
    final long currentClock = logicalclock.get();
    return currentClock;
}
/**
 * Constructor of FastLeaderElection. It takes two parameters, one
 * is the QuorumPeer object that instantiated this object, and the other
 * is the connection manager. Such an object should be created only once
 * by each peer during an instance of the ZooKeeper service.
 *
 * @param self QuorumPeer that created this object
 * @param manager Connection manager
 */
public FastLeaderElection(QuorumPeer self, QuorumCnxManager manager) {
    // Both fields are set before starter() creates the Messenger.
    this.manager = manager;
    this.stop = false;
    starter(self, manager);
}
/**
 * This method is invoked by the constructor. Because it is a
 * part of the starting procedure of the object that must be on
 * any constructor of this class, it is probably best to keep as
 * a separate method. As we have a single constructor currently,
 * it is not strictly necessary to have it separate.
 *
 * It records the owning peer, resets the proposed leader and zxid to -1,
 * creates the send and receive queues and finally creates the Messenger,
 * whose constructor starts the worker threads. The queues are therefore
 * created before the Messenger.
 *
 * @param self QuorumPeer that created this object
 * @param manager Connection manager
 */
private void starter(QuorumPeer self, QuorumCnxManager manager) {
    this.self = self;
    proposedLeader = -1;
    proposedZxid = -1;

    // Typed instantiation instead of raw LinkedBlockingQueue.
    sendqueue = new LinkedBlockingQueue<ToSend>();
    recvqueue = new LinkedBlockingQueue<Notification>();
    this.messenger = new Messenger(manager);
}
/**
 * Called when this peer leaves the current election instance: logs the
 * final vote at debug level and discards any notifications still queued.
 *
 * @param v the vote this peer ends the election with
 */
private void leaveInstance(Vote v) {
    if (LOG.isDebugEnabled()) {
        StringBuilder summary = new StringBuilder("About to leave FLE instance: leader=");
        summary.append(v.getId());
        summary.append(", zxid=0x").append(Long.toHexString(v.getZxid()));
        summary.append(", my id=").append(self.getId());
        summary.append(", my state=").append(self.getPeerState());
        LOG.debug(summary.toString());
    }
    recvqueue.clear();
}
/**
 * Returns the connection manager used by this election instance.
 *
 * @return the QuorumCnxManager passed to the constructor
 */
public QuorumCnxManager getCnxManager() {
    return this.manager;
}
/** Raised by shutdown(); checked by the election loop in lookForLeader(). */
volatile boolean stop;

/**
 * Shuts this election instance down: raises the stop flag, then halts the
 * connection manager and finally the messenger.
 */
public void shutdown() {
    this.stop = true;

    LOG.debug("Shutting down connection manager");
    this.manager.halt();

    LOG.debug("Shutting down messenger");
    this.messenger.halt();

    LOG.debug("FLE is down");
}
/**
 * Send notifications to all peers upon a change in our vote.
 *
 * One notification carrying the current proposal, the current logical
 * clock and the LOOKING state is queued for every server in the voting view.
 */
private void sendNotifications() {
    for (QuorumServer voter : self.getVotingView().values()) {
        final long recipient = voter.id;

        ToSend outgoing = new ToSend(ToSend.mType.notification,
                proposedLeader,
                proposedZxid,
                logicalclock.get(),
                QuorumPeer.ServerState.LOOKING,
                recipient, // addressee of this notification
                proposedEpoch);

        if (LOG.isDebugEnabled()) {
            LOG.debug("Sending Notification: " + proposedLeader + " (n.leader), 0x"
                    + Long.toHexString(proposedZxid) + " (n.zxid), 0x"
                    + Long.toHexString(logicalclock.get()) + " (n.round), "
                    + recipient + " (recipient), " + self.getId()
                    + " (myid), 0x" + Long.toHexString(proposedEpoch) + " (n.peerEpoch)");
        }
        sendqueue.offer(outgoing);
    }
}
/**
 * Logs a received notification at info level, followed by this peer's
 * current state.
 *
 * @param n the notification to log
 */
private void printNotification(Notification n) {
    final String rendered = n.toString();
    LOG.info("Notification: " + rendered + self.getPeerState() + " (my state)");
}
/**
 * Check if a vote (server id, zxid, epoch) succeeds our current vote.
 *
 * @param newId    server id proposed by the candidate vote
 * @param newZxid  zxid of the candidate vote
 * @param newEpoch epoch of the candidate vote
 * @param curId    server id of the vote being compared against
 * @param curZxid  zxid of the vote being compared against
 * @param curEpoch epoch of the vote being compared against
 * @return true if the candidate vote wins the comparison
 */
protected boolean totalOrderPredicate(long newId, long newZxid, long newEpoch, long curId, long curZxid, long curEpoch) {
    LOG.debug("id: " + newId + ", proposed id: " + curId + ", zxid: 0x" +
            Long.toHexString(newZxid) + ", proposed zxid: 0x" + Long.toHexString(curZxid));

    // A candidate with zero weight never wins.
    // NOTE(review): presumably this is how observers are excluded - confirm
    // against the QuorumVerifier in use.
    if (self.getQuorumVerifier().getWeight(newId) == 0) {
        return false;
    }

    /*
     * The candidate wins if one of the following three cases holds:
     * 1- New epoch is higher
     * 2- New epoch is the same as current epoch, but new zxid is higher
     * 3- New epoch is the same as current epoch, new zxid is the same
     *    as current zxid, but server id is higher.
     */
    if (newEpoch != curEpoch) {
        return newEpoch > curEpoch;
    }
    if (newZxid != curZxid) {
        return newZxid > curZxid;
    }
    return newId > curId;
}
/**
 * Termination predicate. Given a set of votes, determines if we
 * have sufficient to declare the end of the election round.
 *
 * @param votes map from voter server id to the vote received from it
 * @param vote  the vote to check support for
 * @return true if the servers whose recorded vote equals {@code vote}
 *         form a quorum according to the peer's quorum verifier
 */
protected boolean termPredicate(
        HashMap<Long, Vote> votes,
        Vote vote) {
    HashSet<Long> supporters = new HashSet<Long>();

    /*
     * First make the views consistent. Sometimes peers will have
     * different zxids for a server depending on timing.
     */
    // Collect the ids of all servers whose ballot matches the given vote.
    for (Map.Entry<Long, Vote> entry : votes.entrySet()) {
        if (vote.equals(entry.getValue())) {
            supporters.add(entry.getKey());
        }
    }

    return self.getQuorumVerifier().containsQuorum(supporters);
}
/**
 * In the case there is a leader elected, and a quorum supporting
 * this leader, we have to check if the leader has voted and acked
 * that it is leading. We need this check to avoid that peers keep
 * electing over and over a peer that has crashed and it is no
 * longer leading.
 *
 * @param votes map from server id to the vote received from that server
 * @param leader leader id
 * @param electionEpoch epoch id
 * @return true if the leader is acceptable: either it is another server
 *         whose recorded vote reports state LEADING, or it is this server
 *         and {@code electionEpoch} equals the current logical clock
 */
protected boolean checkLeader(
        HashMap<Long, Vote> votes,
        long leader,
        long electionEpoch) {
    /*
     * If everyone else thinks I'm the leader, I must be the leader.
     * The other two checks are just for the case in which I'm not the
     * leader. If I'm not the leader and I haven't received a message
     * from leader stating that it is leading, then predicate is false.
     */
    if (leader != self.getId()) {
        Vote leaderVote = votes.get(leader);
        return leaderVote != null
                && leaderVote.getState() == ServerState.LEADING;
    }
    return logicalclock.get() == electionEpoch;
}
/**
 * This predicate checks that a leader has been elected. It doesn't
 * make a lot of sense without context (check lookForLeader) and it
 * has been separated for testing purposes.
 *
 * @param recv map of received votes
 * @param ooe map containing out of election votes (LEADING or FOLLOWING)
 * @param n Notification
 * @return true if the vote carried by {@code n} is backed by a quorum in
 *         {@code recv} (termPredicate) and the leader it names passes
 *         checkLeader against {@code ooe}
 */
protected boolean ooePredicate(HashMap<Long, Vote> recv,
        HashMap<Long, Vote> ooe,
        Notification n) {
    Vote candidate = new Vote(n.version,
            n.leader,
            n.zxid,
            n.electionEpoch,
            n.peerEpoch,
            n.state);
    return termPredicate(recv, candidate)
            && checkLeader(ooe, n.leader, n.electionEpoch);
}
/**
 * Replaces this server's current proposal.
 *
 * @param leader id of the newly proposed leader
 * @param zxid   zxid of the newly proposed leader
 * @param epoch  epoch of the newly proposed leader
 */
synchronized void updateProposal(long leader, long zxid, long epoch) {
    if (LOG.isDebugEnabled()) {
        // Logged before the fields change so the old values are still visible.
        final String change = "Updating proposal: " + leader + " (newleader), 0x"
                + Long.toHexString(zxid) + " (newzxid), " + proposedLeader
                + " (oldleader), 0x" + Long.toHexString(proposedZxid) + " (oldzxid)";
        LOG.debug(change);
    }
    this.proposedLeader = leader;
    this.proposedZxid = zxid;
    this.proposedEpoch = epoch;
}
/**
 * Returns this server's current proposal as a Vote.
 *
 * @return a new Vote built from the proposed leader, zxid and epoch
 */
synchronized Vote getVote() {
    Vote currentProposal = new Vote(proposedLeader, proposedZxid, proposedEpoch);
    return currentProposal;
}
/**
 * A learning state can be either FOLLOWING or OBSERVING.
 * This method simply decides which one depending on the
 * role of the server.
 *
 * @return FOLLOWING for a participant, OBSERVING otherwise
 */
private ServerState learningState() {
    final boolean participant = self.getLearnerType() == LearnerType.PARTICIPANT;
    if (!participant) {
        LOG.debug("I'm an observer: " + self.getId());
        return ServerState.OBSERVING;
    }
    LOG.debug("I'm a participant: " + self.getId());
    return ServerState.FOLLOWING;
}
/**
 * Returns the initial vote value of server identifier.
 *
 * @return this server's id for a participant, Long.MIN_VALUE otherwise
 */
private long getInitId() {
    final boolean participant = self.getLearnerType() == LearnerType.PARTICIPANT;
    return participant ? self.getId() : Long.MIN_VALUE;
}
/**
 * Returns initial last logged zxid.
 *
 * @return the peer's last logged zxid for a participant,
 *         Long.MIN_VALUE otherwise
 */
private long getInitLastLoggedZxid() {
    final boolean participant = self.getLearnerType() == LearnerType.PARTICIPANT;
    return participant ? self.getLastLoggedZxid() : Long.MIN_VALUE;
}
/**
 * Returns the initial vote value of the peer epoch.
 *
 * @return the peer's current epoch for a participant,
 *         Long.MIN_VALUE otherwise
 * @throws RuntimeException if reading the current epoch fails with an
 *         IOException; the IOException is attached as the cause
 */
private long getPeerEpoch() {
    if (self.getLearnerType() != LearnerType.PARTICIPANT) {
        return Long.MIN_VALUE;
    }
    try {
        return self.getCurrentEpoch();
    } catch (IOException e) {
        // Keep the same message, but chain the original exception rather
        // than copying only its stack trace.
        throw new RuntimeException(e.getMessage(), e);
    }
}
/**
* Starts a new round of leader election. Whenever our QuorumPeer
* changes its state to LOOKING, this method is invoked, and it
* sends notifications to all other peers.
*
* @return the vote for the elected leader, or null if the loop ends
* (the peer is no longer LOOKING, or shutdown() raised the stop flag)
* before this method settles on a leader
* @throws InterruptedException if interrupted while waiting on the
* receive queue
*/
public Vote lookForLeader() throws InterruptedException {
// ------------------- 1. Set up the election: JMX bean and bookkeeping --------------
try {
// Register a LeaderElectionBean with JMX (Java Management Extensions) for monitoring
self.jmxLeaderElectionBean = new LeaderElectionBean();
MBeanRegistry.getInstance().register(
self.jmxLeaderElectionBean, self.jmxLocalPeerBean);
} catch (Exception e) {
LOG.warn("Failed to register with JMX", e);
self.jmxLeaderElectionBean = null;
}
if (self.start_fle == 0) {
self.start_fle = Time.currentElapsedTime();
}
try {
// recvset ("receive set") is the ballot box: it stores the votes
// received from other servers in the current round.
// key: server id of the voter
// value: the vote it cast
// Each entry therefore represents one ballot.
HashMap recvset = new HashMap();
// outofelection ("out of election") stores the votes received from
// servers that are not LOOKING (i.e. FOLLOWING or LEADING).
HashMap outofelection = new HashMap();
// notTimeout: the notification timeout, in milliseconds
int notTimeout = finalizeWait;
// ------------------- 2. Vote for ourselves as the initial leader --------------
synchronized(this){
// Advance the logical clock by one
logicalclock.incrementAndGet();
// Reset our proposal:
// getInitId(): this server's id (Long.MIN_VALUE for a non-participant)
// getInitLastLoggedZxid(): this server's last logged zxid
// getPeerEpoch(): this server's current epoch
updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
}
LOG.info("New election. My id = " + self.getId() +
", proposed zxid=0x" + Long.toHexString(proposedZxid));
// Broadcast the updated proposal
sendNotifications();
// ------------------- 3. Compare our vote with the votes of the other servers --------------
/*
* Loop in which we exchange notifications until we find a leader
*/
while ((self.getPeerState() == ServerState.LOOKING) &&
(!stop)){
/*
* Remove next notification from queue, times out after 2 times
* the termination time
*/
// recvqueue ("receive queue") holds the notifications that the
// WorkerReceiver accepted from other servers.
Notification n = recvqueue.poll(notTimeout,
TimeUnit.MILLISECONDS);
/*
* Sends more notifications if haven't received enough.
* Otherwise processes new notification.
*/
if(n == null){
// NOTE(review): presumably haveDelivered() reports that the connection
// manager got its queued messages out - confirm. If so, re-broadcast our proposal.
if(manager.haveDelivered()){
sendNotifications();
} else {
// Otherwise only (re)connect to all servers; nothing is re-sent here.
// Annotator's rationale: the other servers will not have collected
// enough notifications either, so they will send again and we will
// receive those once connected.
manager.connectAll();
}
/*
* Exponential backoff
*/
int tmpTimeOut = notTimeout*2;
notTimeout = (tmpTimeOut < maxNotificationInterval?
tmpTimeOut : maxNotificationInterval);
LOG.info("Notification time out: " + notTimeout);
}
else if(validVoter(n.sid) && validVoter(n.leader)) {
/*
* Only proceed if the vote comes from a replica in the
* voting view for a replica in the voting view.
*/
switch (n.state) {
case LOOKING:
// If notification > current, replace and send messages out
if (n.electionEpoch > logicalclock.get()) {
// Catch up with the sender's logical clock
logicalclock.set(n.electionEpoch);
// Empty the ballot box: its votes belong to an older round
recvset.clear();
// Compare the notification against our own initial vote; whichever
// wins becomes our proposal, which is then broadcast.
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
updateProposal(n.leader, n.zxid, n.peerEpoch);
} else {
updateProposal(getInitId(),
getInitLastLoggedZxid(),
getPeerEpoch());
}
// Broadcast the update
sendNotifications();
} else if (n.electionEpoch < logicalclock.get()) {
if(LOG.isDebugEnabled()){
LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x"
+ Long.toHexString(n.electionEpoch)
+ ", logicalclock=0x" + Long.toHexString(logicalclock.get()));
}
break;
// Same election epoch as ours: adopt the notification's vote only if it beats our current proposal
} else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)) {
updateProposal(n.leader, n.zxid, n.peerEpoch);
sendNotifications();
}
if(LOG.isDebugEnabled()){
LOG.debug("Adding vote: from=" + n.sid +
", proposed leader=" + n.leader +
", proposed zxid=0x" + Long.toHexString(n.zxid) +
", proposed election epoch=0x" + Long.toHexString(n.electionEpoch));
}
// Put the vote carried by the notification into the ballot box
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
// ------------------- 4. Decide whether this round can end --------------
// The round can end once our proposal is backed by a quorum of the ballots
if (termPredicate(recvset,
new Vote(proposedLeader, proposedZxid,
logicalclock.get(), proposedEpoch))) {
// Verify if there is any change in the proposed leader
// A quorum is only the minimum requirement: keep draining the queue
// to see whether a better candidate shows up.
// The loop below has two exits:
// 1) the loop condition: poll() returned null, i.e. no notification
// arrived within finalizeWait and none seen so far beat our proposal;
// 2) the break: a notification that beats our proposal was found.
while((n = recvqueue.poll(finalizeWait,TimeUnit.MILLISECONDS)) != null){
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)){
// Put the notification back so the outer loop processes it
recvqueue.put(n);
break;
}
}
/*
* This predicate is true once we don't read any new
* relevant message from the reception queue
*/
// n == null means the loop above left through its condition, so our
// proposal stands as the elected leader and we can wrap up.
if (n == null) {
// Set our state: LEADING if we are the elected leader,
// otherwise the learning state (FOLLOWING or OBSERVING)
self.setPeerState((proposedLeader == self.getId()) ?
ServerState.LEADING: learningState());
// Build the final vote (the election result)
Vote endVote = new Vote(proposedLeader,
proposedZxid,
logicalclock.get(),
proposedEpoch);
// Log and clear the receive queue
leaveInstance(endVote);
return endVote;
}
}
break;
// ------------------- 5. Notifications from servers that are not electing --------------
// NOTE(review): the annotator found this case puzzling - a sender outside
// the voting view is already rejected by the validVoter(n.sid) check above,
// so it is unclear when an OBSERVING notification reaches here. Confirm.
case OBSERVING:
LOG.debug("Notification from observer: " + n.sid);
break;
// Annotator's notes on when a LOOKING server sees FOLLOWING/LEADING notifications:
// Scenario 1: a server joins an ensemble that is already running. It starts
// in LOOKING and sends notifications; the existing leader and followers
// reply, so the replies carry LEADING/FOLLOWING states.
// Scenario 2: the other servers have already elected a leader in this round
// but this server has not finished yet, so it is still LOOKING while the
// notifications it receives already carry LEADING or FOLLOWING states.
case FOLLOWING:
case LEADING:
/*
* Consider all notifications from the same epoch
* together.
*/
// Handles scenario 2 (same election epoch as ours)
if(n.electionEpoch == logicalclock.get()){
// Same round: the vote goes into the ballot box even though
// its sender is not LOOKING
recvset.put(n.sid, new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch));
// Accept the announced leader only if both hold:
// 1) it is backed by a quorum in our ballot box (termPredicate), and
// 2) it passes checkLeader against the out-of-election votes
if(ooePredicate(recvset, outofelection, n)) {
// Wrap up the election
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
Vote endVote = new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
}
/*
* Before joining an established ensemble, verify
* a majority is following the same leader.
*/
outofelection.put(n.sid, new Vote(n.version,
n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch,
n.state));
// Handles scenario 1. Accept the announced leader only if both hold:
// 1) it is backed by a quorum among the out-of-election votes, and
// 2) it passes checkLeader against those same votes
if(ooePredicate(outofelection, outofelection, n)) {
// Wrap up the election, adopting the ensemble's election epoch
synchronized(this){
logicalclock.set(n.electionEpoch);
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
}
Vote endVote = new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
break;
default:
LOG.warn("Notification state unrecognized: {} (n.state), {} (n.sid)",
n.state, n.sid);
break;
}
} else {
if (!validVoter(n.leader)) {
LOG.warn("Ignoring notification for non-cluster member sid {} from sid {}", n.leader, n.sid);
}
if (!validVoter(n.sid)) {
LOG.warn("Ignoring notification for sid {} from non-quorum member sid {}", n.leader, n.sid);
}
}
}
return null;
} finally {
// Always unregister the JMX bean registered at the top of this method
try {
if(self.jmxLeaderElectionBean != null){
MBeanRegistry.getInstance().unregister(
self.jmxLeaderElectionBean);
}
} catch (Exception e) {
LOG.warn("Failed to unregister with JMX", e);
}
self.jmxLeaderElectionBean = null;
LOG.debug("Number of connection processing threads: {}",
manager.getConnectionThreadCount());
}
}
/**
 * Check if a given sid is represented in the peer's voting view.
 *
 * @param sid Server identifier
 * @return true if the voting view contains an entry for {@code sid}
 */
private boolean validVoter(long sid) {
    final boolean inVotingView = self.getVotingView().containsKey(sid);
    return inVotingView;
}
}