ZooKeeper源码浅析(二)之Leader选举

1.入口函数:QuorumPeerMain主线程启动
QuorumPeerMain.runFromConfig()代码
  // Builds a QuorumPeer from the parsed configuration, starts it, and blocks (join) until it terminates.
  1. public void runFromConfig(QuorumPeerConfig config) throws IOException {
  2.       ......
  3.       
  4.       LOG.info("Starting quorum peer");
  5.       try {
  6.         // Connection factory that serves client reads/writes (the client port; the original note says typically 2181)
  7.           ServerCnxnFactory cnxnFactory = ServerCnxnFactory.createFactory();
  8.           cnxnFactory.configure(config.getClientPortAddress(),
  9.                                 config.getMaxClientCnxns());
  10.         // QuorumPeer is ZooKeeper's main logic thread (election, voting, ...)
  11.           quorumPeer = new QuorumPeer();
  12.           quorumPeer.setClientPortAddress(config.getClientPortAddress());
  13.           quorumPeer.setTxnFactory(new FileTxnSnapLog(
  14.                       new File(config.getDataLogDir()),
  15.                       new File(config.getDataDir())));
  16.         // Addresses of the ensemble members
  17.           quorumPeer.setQuorumPeers(config.getServers());
  18.           quorumPeer.setElectionType(config.getElectionAlg());
  19.         // This server's id within the ensemble
  20.           quorumPeer.setMyid(config.getServerId());
  21.           quorumPeer.setTickTime(config.getTickTime());
  22.           quorumPeer.setMinSessionTimeout(config.getMinSessionTimeout());
  23.           quorumPeer.setMaxSessionTimeout(config.getMaxSessionTimeout());
  24.           quorumPeer.setInitLimit(config.getInitLimit());
  25.           quorumPeer.setSyncLimit(config.getSyncLimit());
  26.         // Strategy that decides when a quorum is reached (original note: by default a simple majority -- verify in QuorumPeerConfig)
  27.           quorumPeer.setQuorumVerifier(config.getQuorumVerifier());
  28.           quorumPeer.setCnxnFactory(cnxnFactory);
  29.           quorumPeer.setZKDatabase(new ZKDatabase(quorumPeer.getTxnFactory()));
  30.           quorumPeer.setLearnerType(config.getPeerType());
  31.         // Start the QuorumPeer thread, then wait for it to finish
  32.           quorumPeer.start();
  33.           quorumPeer.join();
  34.       } catch (InterruptedException e) {
  35.           // warn, but generally this is ok
  36.           LOG.warn("Quorum Peer interrupted", e);
  37.       }
  38.     }

 2.QuorumPeer重写了Thread.start()方法,在真正启动线程之前先完成初始化

QuorumPeer.start()代码
  // Overrides start(): loads the database, starts client I/O and election setup, then starts the thread itself.
  1.  @Override
  2.    public synchronized void start() {
  3. // Restore the database; the epoch values are recovered as part of this step
  4.        loadDataBase();
  5. // Start the I/O threads that serve clients
  6.        cnxnFactory.start();
  7. // Initialise leader election (the election type comes from configuration)
  8.        startLeaderElection();
  9. // Finally start this thread via the superclass
  10.        super.start();
  11.    }

  3.通过QuorumPeer.loadDataBase()加载数据,初始化zkDb、currentEpoch、acceptedEpoch。

QuorumPeer.loadDataBase()代码
  // Loads the on-disk database, then restores currentEpoch and acceptedEpoch from their files and sanity-checks them.
  1. private void loadDataBase() {
  2.         try {
  3.             // Restore the database from local files
  4.             zkDb.loadDataBase();
  5.   
  6.             // load the epochs
  7.             // Derive the epoch from the latest zxid. Per ZooKeeper docs a zxid is 64 bits: high 32 bits = epoch, low 32 bits = counter (layout not shown in this excerpt)
  8.             long lastProcessedZxid = zkDb.getDataTree().lastProcessedZxid;
  9.             long epochOfZxid = ZxidUtils.getEpochFromZxid(lastProcessedZxid);
  10.             try {
  11.                 currentEpoch = readLongFromFile(CURRENT_EPOCH_FILENAME);
  12.             } catch(FileNotFoundException e) {
  13.                 .....
  14.             }
  15.             if (epochOfZxid > currentEpoch) {
  16.                 throw new IOException("The current epoch, " + ZxidUtils.zxidToString(currentEpoch) + ", is older than the last zxid, " + lastProcessedZxid);
  17.             }
  18.             try {
  19.                 acceptedEpoch = readLongFromFile(ACCEPTED_EPOCH_FILENAME);
  20.             } catch(FileNotFoundException e) {
  21.                 .....
  22.             }
  // NOTE(review): this fires when acceptedEpoch < currentEpoch, yet the message below states the opposite relation.
  23.             if (acceptedEpoch < currentEpoch) {
  24.                 throw new IOException("The current epoch, " + ZxidUtils.zxidToString(currentEpoch) + " is less than the accepted epoch, " + ZxidUtils.zxidToString(acceptedEpoch));
  25.             }
  26.         } catch(IOException ie) {
  27.            .....
  28.         }
  29.     }

 4.通过QuorumPeer.startLeaderElection()初始化electionAlg、currentVote。

QuorumPeer.startLeaderElection()代码
  // Casts the initial vote for this server, looks up its own address in the view, and creates the election algorithm.
  1. synchronized public void startLeaderElection() {
  2.         try {
  3.                 // Vote for ourselves first
  4.             currentVote = new Vote(myid, getLastLoggedZxid(), getCurrentEpoch());
  5.         } catch(IOException e) {
  // The IOException is rethrown unchecked, carrying over its message and stack trace.
  6.             RuntimeException re = new RuntimeException(e.getMessage());
  7.             re.setStackTrace(e.getStackTrace());
  8.             throw re;
  9.         }
  10.         // Find our own entry in the configured view and remember its address (p.addr)
  11.         for (QuorumServer p : getView().values()) {
  12.             if (p.id == myid) {
  13.                 myQuorumAddr = p.addr;
  14.                 break;
  15.             }
  16.         }
  17.         .....
  18.         this.electionAlg = createElectionAlgorithm(electionType);
  19.     }

 5.获取选举算法,默认为FastLeaderElection算法(electionAlg=3)。从3.4.0版本开始,ZooKeeper废弃了0、1、2三种算法。

QuorumPeer.createElectionAlgorithm()代码
  // Maps the numeric election type to an Election implementation.
  // Returns null if the listener is null in case 3, or for an unknown type when assertions are disabled.
  1. protected Election createElectionAlgorithm(int electionAlgorithm){
  2.         Election le=null;
  3.                   
  4.         //TODO: use a factory rather than a switch
  5.         switch (electionAlgorithm) {
  6.         case 0:
  7.             le = new LeaderElection(this);
  8.             break;
  9.         case 1:
  10.             le = new AuthFastLeaderElection(this);
  11.             break;
  12.         case 2:
  13.             le = new AuthFastLeaderElection(this, true);
  14.             break;
  15.         case 3:
  16.             // QuorumCnxManager handles the network I/O for leader election
  17.             qcm = new QuorumCnxManager(this);
  18.             QuorumCnxManager.Listener listener = qcm.listener;
  19.             // Start the listener thread on the configured election port and wait for the other servers to connect
  20.             // e.g. with server.1=hadoop1:2888:3888, server.1's election port is 3888; 2888 is the port used for leader/follower traffic
  21.             // See QuorumPeerConfig.parseProperties() for how the configuration is parsed
  22.             if(listener != null){
  23.                 listener.start();
  24.                 // TCP-based election algorithm
  25.                 le = new FastLeaderElection(this, qcm);
  26.             } else {
  27.                 LOG.error("Null listener when initializing cnx manager");
  28.             }
  29.             break;
  30.         default:
  31.             assert false;
  32.         }
  33.         return le;
  34.     }
 6.FastLeaderElection初始化
FastLeaderElection.starter()代码
  // Initialises FastLeaderElection state: resets the proposal, creates the send/receive queues and the Messenger.
  1. private void starter(QuorumPeer self, QuorumCnxManager manager) {
  2.        this.self = self;
  3.        proposedLeader = -1;
  4.        proposedZxid = -1;
  5.        // Outgoing queue of the election layer; holds ToSend messages
  6.        sendqueue = new LinkedBlockingQueue();
  7.     // Incoming queue of the election layer; holds Notification objects
  8.        recvqueue = new LinkedBlockingQueue();
  9.     // Per the original author's notes, Messenger owns the WorkerSender and WorkerReceiver threads:
  10.     // WorkerSender takes messages from sendqueue and hands them to the I/O class QuorumCnxManager
  11.     // WorkerReceiver takes messages from QuorumCnxManager (WorkerReceiver is not shown in this excerpt -- confirm)
  12.        this.messenger = new Messenger(manager);
  13.    }
 7.QuorumPeer线程启动
QuorumPeer.run()代码
  // Main loop of the QuorumPeer thread: dispatches on the peer state (LOOKING / OBSERVING / FOLLOWING / LEADING).
  // Parts of the method are elided ("....."); the "} else {" in the LOOKING case pairs with an elided "if".
  1. @Override
  2.     public void run() {
  3.         .....
  4.         try {
  5.             /*  
  6.              * Main loop  
  7.              */  
  8.             while (running) {
  9.                 switch (getPeerState()) {
  10.                 // While LOOKING, run leader election
  11.                 case LOOKING:
  12.                     LOG.info("LOOKING");
  13.                         .....
  14.                         try {
  15.                             roZkMgr.start();
  16.                             // Run the election algorithm and store the vote it returns
  17.                             setCurrentVote(makeLEStrategy().lookForLeader());
  18.                         } catch (Exception e) {
  19.                             LOG.warn("Unexpected exception",e);
  20.                             setPeerState(ServerState.LOOKING);
  21.                         } finally {
  22.                             // If the thread is in the the grace period, interrupt
  23.                             // to come out of waiting.
  24.                             roZkMgr.interrupt();
  25.                             roZk.shutdown();
  26.                         }
  27.                     } else {
  28.                         try {
  29.                             setCurrentVote(makeLEStrategy().lookForLeader());
  30.                         } catch (Exception e) {
  31.                             LOG.warn("Unexpected exception", e);
  32.                             setPeerState(ServerState.LOOKING);
  33.                         }
  34.                     }
  35.                     break;
  36.                 // Once election completes the peer state changes and the matching role object is created.
  // Each role below falls back to LOOKING in its finally block when the role method returns or throws.
  37.                 case OBSERVING:
  38.                     try {
  39.                         LOG.info("OBSERVING");
  40.                         setObserver(makeObserver(logFactory));
  41.                         observer.observeLeader();
  42.                     } catch (Exception e) {
  43.                         LOG.warn("Unexpected exception",e );
  44.                     } finally {
  45.                         observer.shutdown();
  46.                         setObserver(null);
  47.                         setPeerState(ServerState.LOOKING);
  48.                     }
  49.                     break;
  50.                 case FOLLOWING:
  51.                     try {
  52.                         LOG.info("FOLLOWING");
  53.                         setFollower(makeFollower(logFactory));
  54.                         follower.followLeader();
  55.                     } catch (Exception e) {
  56.                         LOG.warn("Unexpected exception",e);
  57.                     } finally {
  58.                         follower.shutdown();
  59.                         setFollower(null);
  60.                         setPeerState(ServerState.LOOKING);
  61.                     }
  62.                     break;
  63.                 case LEADING:
  64.                     LOG.info("LEADING");
  65.                     try {
  66.                         setLeader(makeLeader(logFactory));
  67.                         leader.lead();
  68.                         setLeader(null);
  69.                     } catch (Exception e) {
  70.                         LOG.warn("Unexpected exception",e);
  71.                     } finally {
  // leader is still non-null here only if lead() threw before setLeader(null) ran.
  72.                         if (leader != null) {
  73.                             leader.shutdown("Forcing shutdown");
  74.                             setLeader(null);
  75.                         }
  76.                         setPeerState(ServerState.LOOKING);
  77.                     }
  78.                     break;
  79.                 }
  80.             }
  81.         } finally {
  82.           .....
  83.         }
  84.     }

 8.FastLeaderElection的选举流程

FastLeaderElection.lookForLeader()代码
  // Runs one leader election: proposes this server, exchanges notifications with peers, and returns the
  // final Vote once a quorum agrees. Returns null if the loop exits without a decision.
  1. public Vote lookForLeader() throws InterruptedException {
  2.        .....
  3.        try {
  4.         // Votes received for the current election, keyed by sender sid
  5.            HashMap recvset = new HashMap();
  6.   
  7.            HashMap outofelection = new HashMap();  // votes reported by peers that are already FOLLOWING/LEADING
  8.   
  9.            int notTimeout = finalizeWait;
  10.   
  11.            synchronized(this){
  12.                logicalclock++;
  13.             // Start by proposing ourselves
  14.                updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
  15.            }
  16.   
  17.            LOG.info("New election. My id =  " + self.getId() +
  18.                    ", proposed zxid=0x" + Long.toHexString(proposedZxid));
  19.         // Send our vote to every voting member, ourselves included
  20.            sendNotifications();
  21.   
  22.            /*  
  23.             * Loop in which we exchange notifications until we find a leader  
  24.             */  
  25.            // Main loop: runs until a leader has been chosen (or stop is set)
  26.            while ((self.getPeerState() == ServerState.LOOKING) &&
  27.                    (!stop)){
  28.                /*  
  29.                 * Remove next notification from queue, times out after 2 times  
  30.                 * the termination time  
  31.                 */  
  32.             // Take the next notification from the receive queue; our own looped-back vote is handled here too
  33.                Notification n = recvqueue.poll(notTimeout,
  34.                        TimeUnit.MILLISECONDS);
  35.   
  36.                /*  
  37.                 * Sends more notifications if haven't received enough.  
  38.                 * Otherwise processes new notification.  
  39.                 */  
  40.             // Nothing arrived within the timeout
  41.                if(n == null){
  42.                 // Everything queued was delivered: send our vote again (repeats until a leader is chosen)
  43.                    if(manager.haveDelivered()){
  44.                        sendNotifications();
  45.                    } else {
  46.                     // Messages are still pending; other servers may not be up yet, so try to connect to them
  47.                        manager.connectAll();
  48.                    }
  49.   
  50.                    /*  
  51.                     * Exponential backoff  
  52.                     */  
  53.                  // Double the poll timeout, capped at maxNotificationInterval
  54.                    int tmpTimeOut = notTimeout*2;
  55.                    notTimeout = (tmpTimeOut < maxNotificationInterval?
  56.                            tmpTimeOut : maxNotificationInterval);
  57.                    LOG.info("Notification time out: " + notTimeout);
  58.                }
  59.             // A notification arrived from a member of the voting view
  60.                else if(self.getVotingView().containsKey(n.sid)) {
  61.                    /*  
  62.                     * Only proceed if the vote comes from a replica in the  
  63.                     * voting view.  
  64.                     */  
  65.                    switch (n.state) {
  66.                 // The sender is also LOOKING
  67.                    case LOOKING:
  68.                     // Check whether the received vote beats ours; per the original note totalOrderPredicate compares epoch, then zxid, then server id (helper not shown here)
  69.                        // If notification > current, replace and send messages out
  70.                        if (n.electionEpoch > logicalclock) {
  71.                            logicalclock = n.electionEpoch;
  72.                            recvset.clear();
  73.                            if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
  74.                                    getInitId(), getInitLastLoggedZxid(), getPeerEpoch())) {
  75.                                updateProposal(n.leader, n.zxid, n.peerEpoch);
  76.                            } else {
  77.                                updateProposal(getInitId(),
  78.                                        getInitLastLoggedZxid(),
  79.                                        getPeerEpoch());
  80.                            }
  81.                            sendNotifications();
  82.                        } else if (n.electionEpoch < logicalclock) {
  83.                            if(LOG.isDebugEnabled()){
  84.                                LOG.debug("Notification election epoch is smaller than logicalclock. n.electionEpoch = 0x"
  85.                                        + Long.toHexString(n.electionEpoch)
  86.                                        + ", logicalclock=0x" + Long.toHexString(logicalclock));
  87.                            }
  88.                            break;
  89.                        } else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
  90.                                proposedLeader, proposedZxid, proposedEpoch)) {
  91.                         // The received vote wins: adopt it as our own proposal and broadcast it
  92.                            updateProposal(n.leader, n.zxid, n.peerEpoch);
  93.                            sendNotifications();
  94.                        }
  95.                        .....
  96.                     // Record the vote locally; recvset is what the termination check looks at
  97.                        recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
  98.                     // Has the election finished? (original note: by default, more than half of the servers must agree)
  99.                        if (termPredicate(recvset,
  100.                                new Vote(proposedLeader, proposedZxid,
  101.                                        logicalclock, proposedEpoch))) {
  102.   
  103.                            // Verify if there is any change in the proposed leader
  104.                            while((n = recvqueue.poll(finalizeWait,
  105.                                    TimeUnit.MILLISECONDS)) != null){
  106.                                if(totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
  107.                                        proposedLeader, proposedZxid, proposedEpoch)){
  108.                                    recvqueue.put(n);
  109.                                    break;
  110.                                }
  111.                            }
  112.   
  113.                            /*  
  114.                             * This predicate is true once we don't read any new  
  115.                             * relevant message from the reception queue  
  116.                             */  
  117.                            if (n == null) {
  118.                             // Switch state: LEADING if we are the proposed leader, otherwise learningState()
  119.                                self.setPeerState((proposedLeader == self.getId()) ?
  120.                                        ServerState.LEADING: learningState());
  121.                             // Return the final vote
  122.                                Vote endVote = new Vote(proposedLeader,
  123.                                        proposedZxid, proposedEpoch);
  124.                                leaveInstance(endVote);
  125.                                return endVote;
  126.                            }
  127.                        }
  128.                        break;
  129.                 // Notifications from observers are only logged; observers do not take part in the election
  130.                    case OBSERVING:
  131.                        LOG.debug("Notification from observer: " + n.sid);
  132.                        break;
  133.                 // Notifications from FOLLOWING/LEADING peers are taken into account
  134.                    case FOLLOWING:
  135.                    case LEADING:
  136.                        /*  
  137.                         * Consider all notifications from the same epoch  
  138.                         * together.  
  139.                         */  
  140.                        if(n.electionEpoch == logicalclock){
  141.                          // These votes are also added to the local vote set
  142.                            recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
  143.                          // Check whether the election has finished and, if so, whether the leader passes checkLeader
  144.                          // When both hold, update our own state and return the result
  145.                            if(termPredicate(recvset, new Vote(n.leader,
  146.                                            n.zxid, n.electionEpoch, n.peerEpoch, n.state))
  147.                                            && checkLeader(outofelection, n.leader, n.electionEpoch)) {
  148.                                self.setPeerState((n.leader == self.getId()) ?
  149.                                        ServerState.LEADING: learningState());
  150.   
  151.                                Vote endVote = new Vote(n.leader, n.zxid, n.peerEpoch);
  152.                                leaveInstance(endVote);
  153.                                return endVote;
  154.                            }
  155.                        }
  156.   
  157.                        /**  
  158.                         * Before joining an established ensemble, verify that  
  159.                         * a majority are following the same leader.  
  160.                         */  
  161.                        outofelection.put(n.sid, new Vote(n.leader, n.zxid,
  162.                                n.electionEpoch, n.peerEpoch, n.state));
  163.                        if (termPredicate(outofelection, new Vote(n.leader,
  164.                                n.zxid, n.electionEpoch, n.peerEpoch, n.state))
  165.                                && checkLeader(outofelection, n.leader, n.electionEpoch)) {
  166.                            synchronized(this){
  167.                                logicalclock = n.electionEpoch;
  168.                                self.setPeerState((n.leader == self.getId()) ?
  169.                                        ServerState.LEADING: learningState());
  170.                            }
  171.                            Vote endVote = new Vote(n.leader, n.zxid, n.peerEpoch);
  172.                            leaveInstance(endVote);
  173.                            return endVote;
  174.                        }
  175.                        break;
  176.                    default:
  177.                        LOG.warn("Notification state unrecoginized: " + n.state
  178.                              + " (n.state), " + n.sid + " (n.sid)");
  179.                        break;
  180.                    }
  181.                } else {
  182.                    LOG.warn("Ignoring notification from non-cluster member " + n.sid);
  183.                }
  184.            }
  185.            return null;
  186.        } finally {
  // Always unregister the election JMX bean, whether we return a vote or leave via an exception.
  187.            try {
  188.                if(self.jmxLeaderElectionBean != null){
  189.                    MBeanRegistry.getInstance().unregister(
  190.                            self.jmxLeaderElectionBean);
  191.                }
  192.            } catch (Exception e) {
  193.                LOG.warn("Failed to unregister with JMX", e);
  194.            }
  195.            self.jmxLeaderElectionBean = null;
  196.        }
  197.    }

 9.选举消息发送

FastLeaderElection.sendNotifications()代码
  // Queues one notification carrying the current proposal for every server in the voting view.
  1. private void sendNotifications() {
  2.        // One message per voting member
  3.        for (QuorumServer server : self.getVotingView().values()) {
  4.            long sid = server.id;
  5.            // The message: current proposal, logical clock, and LOOKING as the sender state
  6.            ToSend notmsg = new ToSend(ToSend.mType.notification,
  7.                    proposedLeader,
  8.                    proposedZxid,
  9.                    logicalclock,
  10.                    QuorumPeer.ServerState.LOOKING,
  11.                    sid,
  12.                    proposedEpoch);
  13.            .....
  14.            // Put it on the election-layer send queue, which WorkerSender consumes
  15.            sendqueue.offer(notmsg);
  16.        }
  17.    }

 10.WorkerSender消费发送队列

FastLeaderElection.Messenger.WorkerSender.run()代码
  // WorkerSender loop: polls sendqueue (3000 ms timeout) and processes each message until stopped or interrupted.
  1. public void run() {
  2.                while (!stop) {
  3.                    try {
  4.                        ToSend m = sendqueue.poll(3000, TimeUnit.MILLISECONDS);
  // A null result means the poll timed out; loop again so the stop flag is re-checked.
  5.                        if(m == null) continue;
  6.   
  7.                        process(m);
  8.                    } catch (InterruptedException e) {
  9.                        break;
  10.                    }
  11.                }
  12.                LOG.info("WorkerSender is down");
  13.            }
FastLeaderElection.Messenger.WorkerSender.process()代码
  // Serialises a ToSend into a 36-byte buffer (4-byte state ordinal + four 8-byte longs) and passes it to the manager.
  1. private void process(ToSend m) {
  2.                byte requestBytes[] = new byte[36];
  3.                ByteBuffer requestBuffer = ByteBuffer.wrap(requestBytes);
  4.   
  5.                /*  
  6.                 * Building notification packet to send  
  7.                 */  
  8.   
  9.                requestBuffer.clear();
  10.                requestBuffer.putInt(m.state.ordinal());
  11.                requestBuffer.putLong(m.leader);
  12.                requestBuffer.putLong(m.zxid);
  13.                requestBuffer.putLong(m.electionEpoch);
  14.                requestBuffer.putLong(m.peerEpoch);
  15.                // Hand the buffer to the connection manager (the I/O class), addressed to m.sid
  16.                manager.toSend(m.sid, requestBuffer);
  17.            }

 11.QuorumCnxManager具体发送

QuorumCnxManager.toSend()代码
  // Routes a message to server sid: loops it back locally when sid is this server, otherwise queues it and tries to connect.
  1. public void toSend(Long sid, ByteBuffer b) {
  2.        /*  
  3.         * If sending message to myself, then simply enqueue it (loopback).  
  4.         */  
  5.      // Addressed to ourselves: skip the network and put it straight on the local receive queue
  6.        if (self.getId() == sid) {
  7.             b.position(0);
  8.             addToRecvQueue(new Message(b.duplicate(), sid));
  9.            /*  
  10.             * Otherwise send to the corresponding thread to send.  
  11.             */  
  12.     // Otherwise enqueue on that server's send queue first, then try to connect (a successful connection starts send/receive workers for that server)
  13.        } else {
  14.             /*  
  15.              * Start a new connection if doesn't have one already.  
  16.              */  
  17.             if (!queueSendMap.containsKey(sid)) {
  18.                 ArrayBlockingQueue bq = new ArrayBlockingQueue(
  19.                         SEND_CAPACITY);
  20.                 queueSendMap.put(sid, bq);
  21.                 addToSendQueue(bq, b);
  22.   
  23.             } else {
  24.                 ArrayBlockingQueue bq = queueSendMap.get(sid);
  25.                 if(bq != null){
  26.                     addToSendQueue(bq, b);
  27.                 } else {
  28.                     LOG.error("No queue for server " + sid);
  29.                 }
  30.             }
  31.             connectOne(sid);
  32.                  
  33.        }
  34.    }

 11.尝试连接过程

QuorumCnxManager.connectOne()代码
  // Opens a connection to server sid unless a SendWorker for it already exists; unknown sids are logged and ignored.
  1. synchronized void connectOne(long sid){
  2.        if (senderWorkerMap.get(sid) == null){
  3.            InetSocketAddress electionAddr;
  4.            if (self.quorumPeers.containsKey(sid)) {
  5.             // Election address of the remote server
  // NOTE(review): the visible connect call below looks the address up again via getView() instead of using this local -- the elided lines may use it; confirm.
  6.                electionAddr = self.quorumPeers.get(sid).electionAddr;
  7.            } else {
  8.                LOG.warn("Invalid server id: " + sid);
  9.                return;
  10.            }
  11.            try {
  12.             .....
  13.             // Blocking socket connect, with cnxTO as the connect timeout
  14.                Socket sock = new Socket();
  15.                setSockOpts(sock);
  16.                sock.connect(self.getView().get(sid).electionAddr, cnxTO);
  17.                if (LOG.isDebugEnabled()) {
  18.                    LOG.debug("Connected to server " + sid);
  19.                }
  20.             // Connected: let initiateConnection set up the I/O workers
  21.                initiateConnection(sock, sid);
  22.            } catch (UnresolvedAddressException e) {
  23.              .....
  24.            }
  25.        } else {
  26.            LOG.debug("There is a connection already for server " + sid);
  27.        }
  28.    }
QuorumCnxManager.initiateConnection()代码
  // Completes an outgoing connection: sends our id, then either drops the socket (remote sid is larger)
  // or starts the SendWorker/RecvWorker pair. Returns true only when the workers were started.
  1. public boolean initiateConnection(Socket sock, Long sid) {
  2.         DataOutputStream dout = null;
  3.         try {
  4.             // Sending id and challenge
  5.             dout = new DataOutputStream(sock.getOutputStream());
  6.             dout.writeLong(self.getId());
  7.             dout.flush();
  8.         } catch (IOException e) {
  9.             LOG.warn("Ignoring exception reading or writing challenge: ", e);
  10.             closeSocket(sock);
  11.             return false;
  12.         }
  13.           
  14.         // If lost the challenge, then drop the new connection
  15.         // Only the server with the larger sid keeps a connection it initiated towards a smaller sid
  16.         if (sid > self.getId()) {
  17.             LOG.info("Have smaller server identifier, so dropping the " +
  18.                      "connection: (" + sid + ", " + self.getId() + ")");
  19.             closeSocket(sock);
  20.             // Otherwise proceed with the connection
  21.         } else {
  22.             SendWorker sw = new SendWorker(sock, sid);
  23.             RecvWorker rw = new RecvWorker(sock, sid, sw);
  24.             sw.setRecv(rw);
  25.   
  26.             SendWorker vsw = senderWorkerMap.get(sid);
  27.               
  // Finish any previous sender for this sid before registering the new one.
  28.             if(vsw != null)
  29.                 vsw.finish();
  30.               
  31.             senderWorkerMap.put(sid, sw);
  32.             if (!queueSendMap.containsKey(sid)) {
  33.                 queueSendMap.put(sid, new ArrayBlockingQueue(
  34.                         SEND_CAPACITY));
  35.             }
  36.               
  37.             sw.start();
  38.             rw.start();
  39.               
  40.             return true;
  41.               
  42.         }
  43.         return false;
  44.     }

 12.QuorumCnxManager.Listener主循环接收连接

QuorumCnxManager.Listener.run()代码
  // Listener loop: binds a ServerSocket to this server's election port and passes each accepted socket to receiveConnection.
  // The outer loop runs while numRetries < 3; the retry bookkeeping in the catch block is elided -- confirm against the full source.
  1. @Override
  2.        public void run() {
  3.            int numRetries = 0;
  4.            while((!shutdown) && (numRetries < 3)){
  5.                try {
  6.                    ss = new ServerSocket();
  7.                    ss.setReuseAddress(true);
  8.                    int port = self.quorumPeers.get(self.getId()).electionAddr
  9.                            .getPort();
  // Only the port is used to build the bind address.
  10.                    InetSocketAddress addr = new InetSocketAddress(port);
  11.                    LOG.info("My election bind port: " + addr.toString());
  12.                    setName(self.quorumPeers.get(self.getId()).electionAddr
  13.                            .toString());
  14.                    ss.bind(addr);
  15.                    while (!shutdown) {
  16.                        Socket client = ss.accept();
  17.                        setSockOpts(client);
  18.                        LOG.info("Received connection request "
  19.                                + client.getRemoteSocketAddress());
  20.                        receiveConnection(client);
  // A connection that was handled resets the retry counter.
  21.                        numRetries = 0;
  22.                    }
  23.                } catch (IOException e) {
  24.                   .....
  25.                }
  26.            }
  27.           .....
  28.        }
QuorumCnxManager.receiveConnection()代码
  // Handles an accepted connection: reads the remote sid, then either closes the socket and connects back
  // (remote sid is smaller) or starts the SendWorker/RecvWorker pair. Returns true only when workers were started.
  1. public boolean receiveConnection(Socket sock) {
  2.         Long sid = null;
  3.           
  4.         try {
  5.             // Read server id
  6.             // The remote server id is the first long on the stream
  7.             DataInputStream din = new DataInputStream(sock.getInputStream());
  8.             sid = din.readLong();
  9.             .....
  10.         } catch (IOException e) {
  11.             .....
  12.         }
  13.           
  14.         //If wins the challenge, then close the new connection.
  15.         // Remote sid is smaller than ours: close this socket; only the larger sid may connect to the smaller one
  16.         if (sid < self.getId()) {
  17.             /*  
  18.              * This replica might still believe that the connection to sid is  
  19.              * up, so we have to shut down the workers before trying to open a  
  20.              * new connection.  
  21.              */  
  22.             SendWorker sw = senderWorkerMap.get(sid);
  23.             if (sw != null) {
  24.                 sw.finish();
  25.             }
  26.   
  27.             /*  
  28.              * Now we start a new connection  
  29.              */  
  30.             LOG.debug("Create new connection to server: " + sid);
  31.             closeSocket(sock);
  32.             connectOne(sid);
  33.   
  34.             // Otherwise start worker threads to receive data.
  35.         // Remote sid is not smaller than ours: keep the connection and start dedicated I/O threads for it
  36.         } else {
  37.             SendWorker sw = new SendWorker(sock, sid);
  38.             RecvWorker rw = new RecvWorker(sock, sid, sw);
  39.             sw.setRecv(rw);
  40.   
  41.             SendWorker vsw = senderWorkerMap.get(sid);
  42.               
  43.             if(vsw != null)
  44.                 vsw.finish();
  45.               
  46.             senderWorkerMap.put(sid, sw);
  47.               
  48.             if (!queueSendMap.containsKey(sid)) {
  49.                 queueSendMap.put(sid, new ArrayBlockingQueue(
  50.                         SEND_CAPACITY));
  51.             }
  52.               
  53.             sw.start();
  54.             rw.start();
  55.               
  56.             return true;
  57.         }
  58.         return false;
  59.     }

 13.IO发送线程SendWorker启动,开始发送选举消息

Quorumcnxmanager.sendworker.run()代码
  1. @Override  
  2.         public void run() {  
  3.             .....  
  4.             try {  
  5.                 while (running && !shutdown && sock != null) {  
  6.   
  7.                     ByteBuffer b = null;  
  8.                     try {  
  9.                         // Each server has its own send queue, looked up by sid  
  10.                         ArrayBlockingQueue bq = queueSendMap  
  11.                                 .get(sid);  
  12.                         if (bq != null) {  
  13.                             // Take the next message from the queue (poll timeout of 1000 ms)  
  14.                             b = pollSendQueue(bq, 1000, TimeUnit.MILLISECONDS);  
  15.                         } else {  
  16.                             LOG.error("No queue of incoming messages for " +  
  17.                                       "server " + sid);  
  18.                             break;  
  19.                         }  
  20.   
  21.                         if(b != null){  
  22.                             // Record it as the last message sent to sid, then send it  
  23.                             lastMessageSent.put(sid, b);  
  24.                             send(b);  
  25.                         }  
  26.                     } catch (InterruptedException e) {  
  27.                        .....  
  28.                     }  
  29.                 }  
  30.             } catch (Exception e) {  
  31.                 .....  
  32.             }  
  33.             this.finish();  
  34.             LOG.warn("Send worker leaving thread");  
  35.         }  
  36.     }  

 14. IO接收线程RecvWorker启动,开始接收选举消息

Quorumcnxmanager.recvworker.run()代码
  1. @Override  
  2.        public void run() {  
  3.            threadCnt.incrementAndGet();  
  4.            try {  
  5.                while (running && !shutdown && sock != null) {  
  6.                    /**  
  7.                     * Reads the first int to determine the length of the  
  8.                     * message  
  9.                     */  
  10.                  // Length of the packet; values <= 0 or above PACKETMAXSIZE are rejected below  
  11.                    int length = din.readInt();  
  12.                    if (length <= 0 || length > PACKETMAXSIZE) {  
  13.                        throw new IOException(  
  14.                                "Received packet with invalid packet: "  
  15.                                        + length);  
  16.                    }  
  17.                    /**  
  18.                     * Allocates a new ByteBuffer to receive the message  
  19.                     */  
  20.                  // Read the whole payload into memory  
  21.                    byte[] msgArray = new byte[length];  
  22.                    din.readFully(msgArray, 0, length);  
  23.                    ByteBuffer message = ByteBuffer.wrap(msgArray);  
  24.                 // Add it to the receive queue; the business-layer receiver thread WorkerReceiver later takes messages from it  
  25.                    addToRecvQueue(new Message(message.duplicate(), sid));  
  26.                }  
  27.            } catch (Exception e) {  
  28.               .....  
  29.            } finally {  
  30.                LOG.warn("Interrupting SendWorker");  
  31.                sw.finish();  
  32.                if (sock != null) {  
  33.                    closeSocket(sock);  
  34.                }  
  35.            }  
  36.        }  

 15.业务层的接收线程WorkerReceiver取得消息

Fastleaderelection.messenger.workerreceiver.run()代码
  1. public void run() {  
  2.   
  3.                 Message response;  
  4.                 while (!stop) {  
  5.                     // Sleeps on receive  
  6.                     try{  
  7.                         // Fetch the next message from the QuorumCnxManager (poll timeout of 3000 ms)  
  8.                         response = manager.pollRecvQueue(3000, TimeUnit.MILLISECONDS);  
  9.                         if(response == null) continue;  
  10.   
  11.                         /*  
  12.                          * If it is from an observer, respond right away.  
  13.                          * Note that the following predicate assumes that  
  14.                          * if a server is not a follower, then it must be  
  15.                          * an observer. If we ever have any other type of  
  16.                          * learner in the future, we'll have to change the  
  17.                          * way we check for observers.  
  18.                          */  
  19.                         // Sender is not in the voting view (treated as an Observer): reply with the current vote  
  20.                         if(!self.getVotingView().containsKey(response.sid)){  
  21.                             Vote current = self.getCurrentVote();  
  22.                             ToSend notmsg = new ToSend(ToSend.mType.notification,  
  23.                                     current.getId(),  
  24.                                     current.getZxid(),  
  25.                                     logicalclock,  
  26.                                     self.getPeerState(),  
  27.                                     response.sid,  
  28.                                     current.getPeerEpoch());  
  29.   
  30.                             sendqueue.offer(notmsg);  
  31.                         } else {  
  32.                             // Receive new message  
  33.                             .....  
  34.   
  35.                             /*  
  36.                              * We check for 28 bytes for backward compatibility  
  37.                              */  
  38.                             if (response.buffer.capacity() < 28) {  
  39.                                 LOG.error("Got a short response: "  
  40.                                         + response.buffer.capacity());  
  41.                                 continue;  
  42.                             }  
  43.                             boolean backCompatibility = (response.buffer.capacity() == 28);  
  44.                             response.buffer.clear();  
  45.   
  46.                             // State of peer that sent this message  
  47.                             // Decoded from the first int of the buffer; defaults to LOOKING for unknown values  
  48.                             QuorumPeer.ServerState ackstate = QuorumPeer.ServerState.LOOKING;  
  49.                             switch (response.buffer.getInt()) {  
  50.                             case 0:  
  51.                                 ackstate = QuorumPeer.ServerState.LOOKING;  
  52.                                 break;  
  53.                             case 1:  
  54.                                 ackstate = QuorumPeer.ServerState.FOLLOWING;  
  55.                                 break;  
  56.                             case 2:  
  57.                                 ackstate = QuorumPeer.ServerState.LEADING;  
  58.                                 break;  
  59.                             case 3:  
  60.                                 ackstate = QuorumPeer.ServerState.OBSERVING;  
  61.                                 break;  
  62.                             }  
  63.   
  64.                             // Instantiate Notification and set its attributes  
  65.                             // Fields are read from the buffer in order: leader, zxid, electionEpoch, then peerEpoch when present  
  66.                             Notification n = new Notification();  
  67.                             n.leader = response.buffer.getLong();  
  68.                             n.zxid = response.buffer.getLong();  
  69.                             n.electionEpoch = response.buffer.getLong();  
  70.                             n.state = ackstate;  
  71.                             n.sid = response.sid;  
  72.                             if(!backCompatibility){  
  73.                                 n.peerEpoch = response.buffer.getLong();  
  74.                             } else {  
  75.                                 if(LOG.isInfoEnabled()){  
  76.                                     LOG.info("Backward compatibility mode, server id=" + n.sid);  
  77.                                 }  
  78.                                 n.peerEpoch = ZxidUtils.getEpochFromZxid(n.zxid);  
  79.                             }  
  80.                             .....  
  81.                             /*  
  82.                              * If this server is looking, then send proposed leader  
  83.                              */  
  84.                             // If we are LOOKING as well, put the notification on the business receive queue; the main election thread consumes it  
  85.                             if(self.getPeerState() == QuorumPeer.ServerState.LOOKING){  
  86.                                 recvqueue.offer(n);  
  87.   
  88.                                 /*  
  89.                                  * Send a notification back if the peer that sent this  
  90.                                  * message is also looking and its logical clock is  
  91.                                  * lagging behind.  
  92.                                  */  
  93.                                 if((ackstate == QuorumPeer.ServerState.LOOKING)  
  94.                                         && (n.electionEpoch < logicalclock)){  
  95.                                     Vote v = getVote();  
  96.                                     ToSend notmsg = new ToSend(ToSend.mType.notification,  
  97.                                             v.getId(),  
  98.                                             v.getZxid(),  
  99.                                             logicalclock,  
  100.                                             self.getPeerState(),  
  101.                                             response.sid,  
  102.                                             v.getPeerEpoch());  
  103.                                     sendqueue.offer(notmsg);  
  104.                                 }  
  105.                             } else {  
  106.                                 /*  
  107.                                  * If this server is not looking, but the one that sent the ack  
  108.                                  * is looking, then send back what it believes to be the leader.  
  109.                                  */  
  110.                                  // If we are not in an election but the sending server is LOOKING, send it the current election result; useful when a server joins an ensemble  
  111.                                 Vote current = self.getCurrentVote();  
  112.                                 if(ackstate == QuorumPeer.ServerState.LOOKING){  
  113.                                     if(LOG.isDebugEnabled()){  
  114.                                         LOG.debug("Sending new notification. My id =  " +  
  115.                                                 self.getId() + " recipient=" +  
  116.                                                 response.sid + " zxid=0x" +  
  117.                                                 Long.toHexString(current.getZxid()) +  
  118.                                                 " leader=" + current.getId());  
  119.                                     }  
  120.                                     ToSend notmsg = new ToSend(  
  121.                                             ToSend.mType.notification,  
  122.                                             current.getId(),  
  123.                                             current.getZxid(),  
  124.                                             logicalclock,  
  125.                                             self.getPeerState(),  
  126.                                             response.sid,  
  127.                                             current.getPeerEpoch());  
  128.                                     sendqueue.offer(notmsg);  
  129.                                 }  
  130.                             }  
  131.                         }  
  132.                     } catch (InterruptedException e) {  
  133.                        .....  
  134.                     }  
  135.                 }  
  136.                 LOG.info("WorkerReceiver is down");  
  137.             }  
  138.         }  

   

Leader选举小结
 1.server启动时默认选举自己,并向整个集群广播
 2.收到消息时,通过3层判断:选举轮数,zxid,server id大小判断是否同意对方,如果同意,则修改自己的选票,并向集群广播
 3.QuorumCnxManager负责IO处理,每2个server建立一个连接,只允许id大的server连id小的server,每个server启动单独的读写线程处理,使用阻塞IO
 4.默认超过半数机器同意时,则选举成功,修改自身状态为LEADING或FOLLOWING
 5.Observer机器不参与选举

你可能感兴趣的:(zookeeper)