public Vote lookForLeader() throws InterruptedException {
try {
self.jmxLeaderElectionBean = new LeaderElectionBean();
MBeanRegistry.getInstance().register(
self.jmxLeaderElectionBean, self.jmxLocalPeerBean);
} catch (Exception e) {
LOG.warn("Failed to register with JMX", e);
self.jmxLeaderElectionBean = null;
}
if (self.start_fle == 0) {
self.start_fle = Time.currentElapsedTime();
}
try {
HashMap recvset = new HashMap();
HashMap outofelection = new HashMap();
int notTimeout = finalizeWait;
synchronized(this){
logicalclock.incrementAndGet();
updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
}
LOG.info("New election. My id = " + self.getId() +
", proposed zxid=0x" + Long.toHexString(proposedZxid));
sendNotifications();
/*
* Loop in which we exchange notifications until we find a leader
*/
while ((self.getPeerState() == ServerState.LOOKING) &&
(!stop)){
/*
* Remove next notification from queue, times out after 2 times
* the termination time
*/
Notification n = recvqueue.poll(notTimeout,
TimeUnit.MILLISECONDS);
/*
* Sends more notifications if haven't received enough.
* Otherwise processes new notification.
*/
if(n == null){
if(manager.haveDelivered()){
sendNotifications();
} else {
manager.connectAll();
}
/*
* Exponential backoff
*/
int tmpTimeOut = notTimeout*2;
notTimeout = (tmpTimeOut < maxNotificationInterval?
tmpTimeOut : maxNotificationInterval);
LOG.info("Notification time out: " + notTimeout);
}
else if(validVoter(n.sid) && validVoter(n.leader)) {
/*
* Only proceed if the vote comes from a replica in the
* voting view for a replica in the voting view.
*/
switch (n.state) {
case LOOKING:
// If notification > current, replace and send messages out
if (n.electionEpoch > logicalclock.get()) {
logicalclock.set(n.electionEpoch);
recvset.clear();
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
updateProposal(n.leader, n.zxid, n.peerEpoch);
} else {
updateProposal(getInitId(),
getInitLastLoggedZxid(),
getPeerEpoch());
}
sendNotifications();
} else if (n.electionEpoch < logicalclock.get()) {
if(LOG.isDebugEnabled()){
LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x"
+ Long.toHexString(n.electionEpoch)
+ ", logicalclock=0x" + Long.toHexString(logicalclock.get()));
}
break;
} else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)) {
updateProposal(n.leader, n.zxid, n.peerEpoch);
sendNotifications();
}
if(LOG.isDebugEnabled()){
LOG.debug("Adding vote: from=" + n.sid +
", proposed leader=" + n.leader +
", proposed zxid=0x" + Long.toHexString(n.zxid) +
", proposed election epoch=0x" + Long.toHexString(n.electionEpoch));
}
recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
if (termPredicate(recvset,
new Vote(proposedLeader, proposedZxid,
logicalclock.get(), proposedEpoch))) {
// Verify if there is any change in the proposed leader
while((n = recvqueue.poll(finalizeWait,
TimeUnit.MILLISECONDS)) != null){
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)){
recvqueue.put(n);
break;
}
}
/*
* This predicate is true once we don't read any new
* relevant message from the reception queue
*/
if (n == null) {
self.setPeerState((proposedLeader == self.getId()) ?
ServerState.LEADING: learningState());
Vote endVote = new Vote(proposedLeader,
proposedZxid,
logicalclock.get(),
proposedEpoch);
leaveInstance(endVote);
return endVote;
}
}
break;
case OBSERVING:
LOG.debug("Notification from observer: " + n.sid);
break;
case FOLLOWING:
case LEADING:
/*
* Consider all notifications from the same epoch
* together.
*/
if(n.electionEpoch == logicalclock.get()){
recvset.put(n.sid, new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch));
if(ooePredicate(recvset, outofelection, n)) {
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
Vote endVote = new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
}
/*
* Before joining an established ensemble, verify
* a majority is following the same leader.
*/
outofelection.put(n.sid, new Vote(n.version,
n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch,
n.state));
if(ooePredicate(outofelection, outofelection, n)) {
synchronized(this){
logicalclock.set(n.electionEpoch);
self.setPeerState((n.leader == self.getId()) ?
ServerState.LEADING: learningState());
}
Vote endVote = new Vote(n.leader,
n.zxid,
n.electionEpoch,
n.peerEpoch);
leaveInstance(endVote);
return endVote;
}
break;
default:
LOG.warn("Notification state unrecognized: {} (n.state), {} (n.sid)",
n.state, n.sid);
break;
}
} else {
if (!validVoter(n.leader)) {
LOG.warn("Ignoring notification for non-cluster member sid {} from sid {}", n.leader, n.sid);
}
if (!validVoter(n.sid)) {
LOG.warn("Ignoring notification for sid {} from non-quorum member sid {}", n.leader, n.sid);
}
}
}
return null;
} finally {
try {
if(self.jmxLeaderElectionBean != null){
MBeanRegistry.getInstance().unregister(
self.jmxLeaderElectionBean);
}
} catch (Exception e) {
LOG.warn("Failed to unregister with JMX", e);
}
self.jmxLeaderElectionBean = null;
LOG.debug("Number of connection processing threads: {}",
manager.getConnectionThreadCount());
}
}
选举过程:
proposedLeader(选中为leader的节点id);
proposedZxid(选中为leader的lastZxid);
proposedEpoch(选中为leader的当前年代);
1.所有节点首先将选票投给自己
2.调用sendNotifications方法,将自己当前选票发送给其他节点
private void sendNotifications() {
for (QuorumServer server : self.getVotingView().values()) {
long sid = server.id;
ToSend notmsg = new ToSend(ToSend.mType.notification,
proposedLeader,
proposedZxid,
logicalclock.get(),
QuorumPeer.ServerState.LOOKING,
sid,
proposedEpoch);
if(LOG.isDebugEnabled()){
LOG.debug("Sending Notification: " + proposedLeader + " (n.leader), 0x" +
Long.toHexString(proposedZxid) + " (n.zxid), 0x" + Long.toHexString(logicalclock.get()) +
" (n.round), " + sid + " (recipient), " + self.getId() +
" (myid), 0x" + Long.toHexString(proposedEpoch) + " (n.peerEpoch)");
}
sendqueue.offer(notmsg);
}
}
这里可以看出其实就是将当前选票封装成ToSend结构体,然后放到sendqueue这个阻塞队列里,等待Sender线程(后面讲消息流转时讲解)从队列中拉取要发送的消息,然后发送给其他节点.
3.然后所有节点进入到while循环中,直到选举完成,也就是PeerState!=Looking,我们从while循环中可以看出首先是从recvqueue这个阻塞队列中取拉取其他节点发个过来的投票信息,recvqueue队列中存储的结构体为Notification:
属性 | 描述 |
---|---|
version | 单元 2 |
leader | 选举的leaderId |
zxid | 选举的leader的zxid |
electionEpoch | 当前选举轮次 |
state | 发送者的节点状态 |
sid | 发送者id |
peerEpoch | 选举的leader节点epoch |
此时需要对Notification内容有效性做一些校验,并做一些处理:
1)Notification为空,出现这种情况需要判断其他节点的发送队列中的消息是否某一个已经全部发送出去,如果存在这种情况,则需要将自身的选票再次发送给所有其他节点。
2)判断发送者的sid和选举leader的sid是否在有效视图中,如果是无效的sid,则直接丢弃这张选票。
当前选票确定是有效时,需要对发送的节点状态进行一些单独的处理:
①发送者节点状态也为Looking时:
1)当electionEpoch>logicallock(当前选票的选举轮次大于当前节点的选举轮次):
则当前节点清空recvset(投票箱)中的所有选票,并判断当前选票是否优于自身,如果优于自身,则更新自身选票为当前选票的投票内容,否则就投票给自身.
如何判断选票优于当前选票:
protected boolean totalOrderPredicate(long newId, long newZxid, long newEpoch, long curId, long curZxid, long curEpoch) {
LOG.debug("id: " + newId + ", proposed id: " + curId + ", zxid: 0x" +
Long.toHexString(newZxid) + ", proposed zxid: 0x" + Long.toHexString(curZxid));
if(self.getQuorumVerifier().getWeight(newId) == 0){
return false;
}
return ((newEpoch > curEpoch) ||
((newEpoch == curEpoch) &&
((newZxid > curZxid) || ((newZxid == curZxid) && (newId > curId)))));
}
- 首先时判断新选票的选举的leader的epoch是否大于当前节点选票的epoch (epoch越大说明经历的选举次数越大,状态更新)
- 如果epoch相同,则判断新选票的选举的leader的zxid是否大于当前节点选票的zxid(zxid越大,说明节点的事务数据存储越多)
- 如果zxid相同,则判断当前新选票选举的leader的sid是否大于当前节点选票的sid(sid判断策略则是作为兜底策略)
2)当electionEpoch出现这种情况,说明当前选票已经不属于此次选举轮次中的选票,则直接忽略掉此选票
3)当electionEpoch=logicallock(当前选票的选举轮次等于当前节点的选举轮次)
此时说明,新选票和自身选票处于一个投票轮次中,则直接判断新选票是否优于当前选票,如果优于当前选票,则更新当前选票内容为新选票内容,并将自身新选举内容发送给其他节点.
最后将新选票放入到recvset投票箱中,并判断投票箱中的投票是否有超过一半已经和自身的选票内容一致,如果未超过一半则再次重新进行上面选举流程,如果已经达到一半,则进行最后的判断,把recvqueue中的投票信息全部取出来进行判断,判断是否还存在优与当前自身选票的投票消息,如果有的话,则将当前选票重新放入recvqueue中,重新进行选举流程,没有的话则直接结束选举.
//判断投票箱中,是否已经超过一半节点的投票内容和当前节点投票内容一致
if (termPredicate(recvset,
new Vote(proposedLeader, proposedZxid,
logicalclock.get(), proposedEpoch))) {
// Verify if there is any change in the proposed leader
//最后判断其他节点发送过来的投票信息是否还存在优于当前节点投票信息的消息
while((n = recvqueue.poll(finalizeWait,
TimeUnit.MILLISECONDS)) != null){
if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
proposedLeader, proposedZxid, proposedEpoch)){
recvqueue.put(n);
break;
}
}
/*
* This predicate is true once we don't read any new
* relevant message from the reception queue
*/
//说明队列消息为空或队列中的投票消息已经不会对当前节点的投票内容产生改变
if (n == null) {
//变更当前节点状态,如果选举的leader时自身,则状态变更未Leading,否则未Folloing或Observing
self.setPeerState((proposedLeader == self.getId()) ?
ServerState.LEADING: learningState());
//保存最终投票内容
Vote endVote = new Vote(proposedLeader,
proposedZxid,
logicalclock.get(),
proposedEpoch);
leaveInstance(endVote);
return endVote;
}
}
②发送者节点状态也为Observing时:
Observer角色不参与选举过程,则来自Observer的选票直接忽略掉.
③发送者节点状态也为Following和Leading时:
投票消息流转过程:
在FastLeaderElection的构造方法中,会构造出来一个Messenger实例,该实例中会新开启2个线程,并一直轮询从FastLeaderElection#sendQuene中和QuorumCnxManager#recvQueue中拉取接受到的消息和需要发送的消息.
发送端:
FastLeaderElection发送选票(ToSend)
->存储在FastLeaderElection的sendQueue队列中
->FastLeaderElection.Messenger.WorkerSender#run轮询从sendQueue拉取消息并转换成ByteBuffer
->QuorumCnxManager#toSend方法
1)如果要发送的sid为当前节点,则直接放入QuorumCnxManager#recvQueue接受队列中
2)放入QuorumCnxManager#queueSendMap()队列中,ConcurrentHashMap
->QuorumCnxManager.SendWorker#run方法从queueSendMap中拉取要发送的消息,通过Socket发送给其他节点,上一节,讲诉了和其他每个节点都维护了一个SendWorker线程
接受端:
->QuorumCnxManager.RecvWorker#run方法通过Socket阻塞读取其他节点发送的消息,转换成Message消息体,并放入QuorumCnxManager#recvQueue阻塞队列中,上一节,讲诉了和其他每个节点都维护了一个RecvWorker线程
->FastLeaderElection.Messenger.WorkerReceiver轮询通过QuorumCnxManager#pollRecvQueue方法从QuorumCnxManager#recvQueue队列中拉取消息并转换成Notification,并Notification放入FastLeaderElection#recvqueue队列中.
->FastLeaderElection#lookForLeader方法从FastLeaderElection#recvqueue拉取选票信息并处理.