深入浅出Zookeeper之六 Leader/Follower初始化

前一篇介绍了Leader选举,这一篇介绍选举成功之后Leader和Follower之间的初始化。

先看Leader端操作

 

Java代码   收藏代码
  1. case LEADING:  
  2.                     LOG.info("LEADING");  
  3.                     try {  
  4.             //初始化Leader对象  
  5.                         setLeader(makeLeader(logFactory));  
  6.             //lead,线程在这里阻塞  
  7.                         leader.lead();  
  8.                         setLeader(null);  
  9.                     } catch (Exception e) {  
  10.                         LOG.warn("Unexpected exception",e);  
  11.                     } finally {  
  12.                         if (leader != null) {  
  13.                             leader.shutdown("Forcing shutdown");  
  14.                             setLeader(null);  
  15.                         }  
  16.                         setPeerState(ServerState.LOOKING);  
  17.                     }  
  18.                     break;  
  19.                 }  

 Leader初始化

 

 

Java代码   收藏代码
  1. Leader(QuorumPeer self,LeaderZooKeeperServer zk) throws IOException {  
  2.     this.self = self;  
  3.     try {  
  4.     //打开lead端口,这里是2888  
  5.         ss = new ServerSocket();  
  6.         ss.setReuseAddress(true);  
  7.         ss.bind(new InetSocketAddress(self.getQuorumAddress().getPort()));  
  8.     } catch (BindException e) {  
  9.         LOG.error("Couldn't bind to port "  
  10.                 + self.getQuorumAddress().getPort(), e);  
  11.         throw e;  
  12.     }  
  13.     this.zk=zk;  
  14. }  

 具体lead过程

 

 

Java代码   收藏代码
  1. self.tick = 0;  
  2.     //从本地文件恢复数据  
  3.            zk.loadData();  
  4.            //leader的状态信息  
  5.            leaderStateSummary = new StateSummary(self.getCurrentEpoch(), zk.getLastProcessedZxid());  
  6.   
  7.            // Start thread that waits for connection requests from   
  8.            // new followers.  
  9.     //启动lead端口的监听线程,专门用来监听新的follower  
  10.            cnxAcceptor = new LearnerCnxAcceptor();  
  11.            cnxAcceptor.start();  
  12.              
  13.            readyToStart = true;  
  14.     //等待足够多的follower进来,代表自己确实是leader,此处lead线程可能会等待  
  15.            long epoch = getEpochToPropose(self.getId(), self.getAcceptedEpoch());  
  16. .......  

 等待follower连接

 

 

Java代码   收藏代码
  1. public long getEpochToPropose(long sid, long lastAcceptedEpoch) throws InterruptedException, IOException {  
  2.         synchronized(connectingFollowers) {  
  3.             if (!waitingForNewEpoch) {  
  4.                 return epoch;  
  5.             }  
  6.             if (lastAcceptedEpoch >= epoch) {  
  7.                 epoch = lastAcceptedEpoch+1;  
  8.             }  
  9.         //将自己加入连接队伍中,方便后续判断lead是否有效  
  10.             connectingFollowers.add(sid);  
  11.             QuorumVerifier verifier = self.getQuorumVerifier();  
  12.         //如果足够多的follower进入,选举有效,则无需等待,并通知其他的等待线程,类似于Barrier  
  13.             if (connectingFollowers.contains(self.getId()) &&   
  14.                                             verifier.containsQuorum(connectingFollowers)) {  
  15.                 waitingForNewEpoch = false;  
  16.                 self.setAcceptedEpoch(epoch);  
  17.                 connectingFollowers.notifyAll();  
  18.             }   
  19.         //如果进入的follower不够,则进入等待,超时即为initLimit时间,  
  20.         else {  
  21.                 long start = System.currentTimeMillis();  
  22.                 long cur = start;  
  23.                 long end = start + self.getInitLimit()*self.getTickTime();  
  24.                 while(waitingForNewEpoch && cur < end) {  
  25.                     connectingFollowers.wait(end - cur);  
  26.                     cur = System.currentTimeMillis();  
  27.                 }  
  28.         //超时了,退出lead过程,重新发起选举  
  29.                 if (waitingForNewEpoch) {  
  30.                     throw new InterruptedException("Timeout while waiting for epoch from quorum");          
  31.                 }  
  32.             }  
  33.             return epoch;  
  34.         }  
  35.     }  

 好的,这个时候我们假设其他follower还没连接进来,那Leader就会在此等待。再来看Follower的初始化过程

 

Java代码   收藏代码
  1. case FOLLOWING:  
  2.                     try {  
  3.                         LOG.info("FOLLOWING");  
  4.             //初始化Follower对象  
  5.                         setFollower(makeFollower(logFactory));  
  6.             //follow动作,线程在此等待  
  7.                         follower.followLeader();  
  8.                     } catch (Exception e) {  
  9.                         LOG.warn("Unexpected exception",e);  
  10.                     } finally {  
  11.                         follower.shutdown();  
  12.                         setFollower(null);  
  13.                         setPeerState(ServerState.LOOKING);  
  14.                     }  
  15.                     break;  

 具体follow过程

 

 

Java代码   收藏代码
  1. void followLeader() throws InterruptedException {  
  2.         .......  
  3.         try {  
  4.         //根据sid找到对应leader,拿到lead连接信息  
  5.             InetSocketAddress addr = findLeader();              
  6.             try {  
  7.         //连接leader  
  8.                 connectToLeader(addr);  
  9.         //注册follower,根据Leader和follower协议,主要是同步选举轮数  
  10.                 long newEpochZxid = registerWithLeader(Leader.FOLLOWERINFO);  
  11.   
  12.                 //check to see if the leader zxid is lower than ours  
  13.                 //this should never happen but is just a safety check  
  14.                 long newEpoch = ZxidUtils.getEpochFromZxid(newEpochZxid);  
  15.                 if (newEpoch < self.getAcceptedEpoch()) {  
  16.                     LOG.error("Proposed leader epoch " + ZxidUtils.zxidToString(newEpochZxid)  
  17.                             + " is less than our accepted epoch " + ZxidUtils.zxidToString(self.getAcceptedEpoch()));  
  18.                     throw new IOException("Error: Epoch of leader is lower");  
  19.                 }  
  20.         //同步数据  
  21.                 syncWithLeader(newEpochZxid);                  
  22.                 QuorumPacket qp = new QuorumPacket();  
  23.         //接受Leader消息,执行并反馈给leader,线程在此自旋  
  24.                 while (self.isRunning()) {  
  25.                     readPacket(qp);  
  26.                     processPacket(qp);  
  27.                 }  
  28.            ......  
  29.     }  

 连接leader

 

 

Java代码   收藏代码
  1. protected void connectToLeader(InetSocketAddress addr)   
  2.     throws IOException, ConnectException, InterruptedException {  
  3.         sock = new Socket();  
  4.     //设置读超时时间为initLimit对应时间  
  5.         sock.setSoTimeout(self.tickTime * self.initLimit);  
  6.     //重试5次,失败后退出follower角色,重新选举  
  7.         for (int tries = 0; tries < 5; tries++) {  
  8.             try {  
  9.         //连接超时  
  10.                 sock.connect(addr, self.tickTime * self.syncLimit);  
  11.                 sock.setTcpNoDelay(nodelay);  
  12.                 break;  
  13.             } catch (IOException e) {  
  14.                 if (tries == 4) {  
  15.                     LOG.error("Unexpected exception",e);  
  16.                     throw e;  
  17.                 } else {  
  18.                     LOG.warn("Unexpected exception, tries="+tries+  
  19.                             ", connecting to " + addr,e);  
  20.                     sock = new Socket();  
  21.                     sock.setSoTimeout(self.tickTime * self.initLimit);  
  22.                 }  
  23.             }  
  24.             Thread.sleep(1000);  
  25.         }  
  26.         leaderIs = BinaryInputArchive.getArchive(new BufferedInputStream(  
  27.                 sock.getInputStream()));  
  28.         bufferedOutput = new BufferedOutputStream(sock.getOutputStream());  
  29.         leaderOs = BinaryOutputArchive.getArchive(bufferedOutput);  
  30.     }     

 假设这里follower顺利连上了leader,此时leader端会为每个follower启动单独IO线程,请看LearnerCnxAcceptor代码

 

 

Java代码   收藏代码
  1. public void run() {  
  2.            try {  
  3.                while (!stop) {  
  4.                    try{  
  5.         //线程在此等待连接  
  6.                        Socket s = ss.accept();  
  7.                        // start with the initLimit, once the ack is processed  
  8.                        // in LearnerHandler switch to the syncLimit  
  9.         //读超时设为initLimit时间  
  10.                        s.setSoTimeout(self.tickTime * self.initLimit);  
  11.                        s.setTcpNoDelay(nodelay);  
  12.         //为每个follower启动单独线程,处理IO  
  13.                        LearnerHandler fh = new LearnerHandler(s, Leader.this);  
  14.                        fh.start();  
  15.                    } catch (SocketException e) {  
  16. .....  
  17.        }  

 leader端为follower建立IO线程,其处理过程和follower自身的主线程根据协议相互交互,以下将通过数据交换场景式分析这个过程,leader端IO线程LearnerHandler启动

 

 

Java代码   收藏代码
  1. ia = BinaryInputArchive.getArchive(new BufferedInputStream(sock  
  2.                    .getInputStream()));  
  3.            bufferedOutput = new BufferedOutputStream(sock.getOutputStream());  
  4.            oa = BinaryOutputArchive.getArchive(bufferedOutput);  
  5.     //IO线程等待follower发送包  
  6.            QuorumPacket qp = new QuorumPacket();  
  7.            ia.readRecord(qp, "packet");  

 follower端进入registerWithLeader处理

 

 

Java代码   收藏代码
  1. long lastLoggedZxid = self.getLastLoggedZxid();  
  2.         QuorumPacket qp = new QuorumPacket();    
  3.     //type为Leader.FOLLOWERINFO  
  4.         qp.setType(pktType);  
  5.         qp.setZxid(ZxidUtils.makeZxid(self.getAcceptedEpoch(), 0));  
  6.           
  7.         /* 
  8.          * Add sid to payload 
  9.          */  
  10.         LearnerInfo li = new LearnerInfo(self.getId(), 0x10000);  
  11.         ByteArrayOutputStream bsid = new ByteArrayOutputStream();  
  12.         BinaryOutputArchive boa = BinaryOutputArchive.getArchive(bsid);  
  13.         boa.writeRecord(li, "LearnerInfo");  
  14.         qp.setData(bsid.toByteArray());  
  15.         //发送LearnerInfo包  
  16.         writePacket(qp, true);  
  17.     //等待leader响应  
  18.         readPacket(qp);   

 leader端收到包处理

 

 

Java代码   收藏代码
  1. byte learnerInfoData[] = qp.getData();  
  2.             if (learnerInfoData != null) {  
  3.                 if (learnerInfoData.length == 8) {  
  4.                     ByteBuffer bbsid = ByteBuffer.wrap(learnerInfoData);  
  5.                     this.sid = bbsid.getLong();  
  6.                 } else {  
  7.             //反序列化LearnerInfo  
  8.                     LearnerInfo li = new LearnerInfo();  
  9.                     ByteBufferInputStream.byteBuffer2Record(ByteBuffer.wrap(learnerInfoData), li);  
  10.                     this.sid = li.getServerid();  
  11.                     this.version = li.getProtocolVersion();  
  12.                 }  
  13.             } else {  
  14.                 this.sid = leader.followerCounter.getAndDecrement();  
  15.             }  
  16.   
  17.     ......   
  18.             //follower的选举轮数  
  19.             long lastAcceptedEpoch = ZxidUtils.getEpochFromZxid(qp.getZxid());  
  20.               
  21.             long peerLastZxid;  
  22.             StateSummary ss = null;  
  23.             long zxid = qp.getZxid();  
  24.         //将follower加入到connectingFollowers中,因为满足半数机器的条件,此时在此等待的leader主线程会退出等待,继续往下处理  
  25.             long newEpoch = leader.getEpochToPropose(this.getSid(), lastAcceptedEpoch);  
  26.               
  27.           ......  
  28.         //发一个Leader.LEADERINFO包,带上新的epoch id  
  29.                 byte ver[] = new byte[4];  
  30.                 ByteBuffer.wrap(ver).putInt(0x10000);  
  31.                 QuorumPacket newEpochPacket = new QuorumPacket(Leader.LEADERINFO, ZxidUtils.makeZxid(newEpoch, 0), ver, null);  
  32.                 oa.writeRecord(newEpochPacket, "packet");  
  33.                 bufferedOutput.flush();  
  34.                 QuorumPacket ackEpochPacket = new QuorumPacket();  
  35.         //等待follower响应  
  36.                 ia.readRecord(ackEpochPacket, "packet");  
  37.                 if (ackEpochPacket.getType() != Leader.ACKEPOCH) {  
  38.                     LOG.error(ackEpochPacket.toString()  
  39.                             + " is not ACKEPOCH");  
  40.                     return;  
  41.                 }  
  42.                 ByteBuffer bbepoch = ByteBuffer.wrap(ackEpochPacket.getData());  
  43.                 ss = new StateSummary(bbepoch.getInt(), ackEpochPacket.getZxid());  
  44.                 leader.waitForEpochAck(this.getSid(), ss);  
  45.             }  

 此时follower收到LEADERINFO包处理:

 

 

Java代码   收藏代码
  1. final long newEpoch = ZxidUtils.getEpochFromZxid(qp.getZxid());  
  2.         if (qp.getType() == Leader.LEADERINFO) {  
  3.             // we are connected to a 1.0 server so accept the new epoch and read the next packet  
  4.             leaderProtocolVersion = ByteBuffer.wrap(qp.getData()).getInt();  
  5.             byte epochBytes[] = new byte[4];  
  6.             final ByteBuffer wrappedEpochBytes = ByteBuffer.wrap(epochBytes);  
  7.         //将自己的epoch发给leader  
  8.             if (newEpoch > self.getAcceptedEpoch()) {  
  9.                 wrappedEpochBytes.putInt((int)self.getCurrentEpoch());  
  10.                 self.setAcceptedEpoch(newEpoch);  
  11.             }   
  12.         ......  
  13.         //发送一个Leader.ACKEPOCH包,带上自己的最大zxid  
  14.             QuorumPacket ackNewEpoch = new QuorumPacket(Leader.ACKEPOCH, lastLoggedZxid, epochBytes, null);  
  15.             writePacket(ackNewEpoch, true);  
  16.             return ZxidUtils.makeZxid(newEpoch, 0);  
  17.         }   

 leader收到Leader.ACKEPOCH后进入waitForEpochAck处理

 

 

Java代码   收藏代码
  1. public void waitForEpochAck(long id, StateSummary ss) throws IOException, InterruptedException {  
  2.       synchronized(electingFollowers) {  
  3.           if (electionFinished) {  
  4.               return;  
  5.           }  
  6.           if (ss.getCurrentEpoch() != -1) {  
  7.               ......  
  8. //将follower添加到等待集合  
  9.               electingFollowers.add(id);  
  10.           }  
  11.           QuorumVerifier verifier = self.getQuorumVerifier();  
  12.    //判断是否满足选举条件,如果不满足进入等待,满足则通知其他等待线程,类似于Barrier  
  13.           if (electingFollowers.contains(self.getId()) && verifier.containsQuorum(electingFollowers)) {  
  14.               electionFinished = true;  
  15.               electingFollowers.notifyAll();  
  16.           }  
  17.    //follower还不够,等等吧  
  18.    else {                  
  19.               long start = System.currentTimeMillis();  
  20.               long cur = start;  
  21.               long end = start + self.getInitLimit()*self.getTickTime();  
  22.               while(!electionFinished && cur < end) {  
  23.                   electingFollowers.wait(end - cur);  
  24.                   cur = System.currentTimeMillis();  
  25.               }  
  26.               if (!electionFinished) {  
  27.                   throw new InterruptedException("Timeout while waiting for epoch to be acked by quorum");  
  28.               }  
  29.           }  
  30.       }  
  31.   }  

 假设IO线程在此等待,此时leader主线程在getEpochToPropose恢复后继续执行

 

 

Java代码   收藏代码
  1. long epoch = getEpochToPropose(self.getId(), self.getAcceptedEpoch());  
  2.              
  3.            zk.setZxid(ZxidUtils.makeZxid(epoch, 0));  
  4.              
  5.            synchronized(this){  
  6.                lastProposed = zk.getZxid();  
  7.            }  
  8.            //发起一个NEWLEADER投票  
  9.            newLeaderProposal.packet = new QuorumPacket(NEWLEADER, zk.getZxid(),  
  10.                    null, null);  
  11.     ......  
  12.     //投票箱  
  13.            outstandingProposals.put(newLeaderProposal.packet.getZxid(), newLeaderProposal);  
  14.     //自己默认同意  
  15.            newLeaderProposal.ackSet.add(self.getId());  
  16.     //等待follower进来  
  17.            waitForEpochAck(self.getId(), leaderStateSummary);  

 由于之前已经有follower进来,满足选举条件,则IO线程和leader主线程都继续往下执行,先看leader主线程

 

 

Java代码   收藏代码
  1. //当前选票轮数  
  2. self.setCurrentEpoch(epoch);  
  3.   
  4.             // We have to get at least a majority of servers in sync with  
  5.             // us. We do this by waiting for the NEWLEADER packet to get  
  6.             // acknowledged  
  7.         //等待确认NEWLEADER包的follower足够多,那自己真的是leader了  
  8.             while (!self.getQuorumVerifier().containsQuorum(newLeaderProposal.ackSet)){  
  9.             //while (newLeaderProposal.ackCount <= self.quorumPeers.size() / 2) {  
  10.         //如果超过初始化时间initlimit,则退出lead过程,重新选举,有可能是follower同步数据比较慢  
  11.                 if (self.tick > self.initLimit) {  
  12.                     // Followers aren't syncing fast enough,  
  13.                     // renounce leadership!  
  14.                     StringBuilder ackToString = new StringBuilder();  
  15.                     for(Long id : newLeaderProposal.ackSet)  
  16.                         ackToString.append(id + ": ");  
  17.                       
  18.                     shutdown("Waiting for a quorum of followers, only synced with: " + ackToString);  
  19.                     HashSet<Long> followerSet = new HashSet<Long>();  
  20.   
  21.                     for(LearnerHandler f : getLearners()) {  
  22.                         followerSet.add(f.getSid());  
  23.                     }  
  24.   
  25.                     if (self.getQuorumVerifier().containsQuorum(followerSet)) {  
  26.                     //if (followers.size() >= self.quorumPeers.size() / 2) {  
  27.                         LOG.warn("Enough followers present. "+  
  28.                                 "Perhaps the initTicks need to be increased.");  
  29.                     }  
  30.                     return;  
  31.                 }  
  32.                 Thread.sleep(self.tickTime);  
  33.                 self.tick++;  
  34.             }  

 这个时候IO线程继续执行

 

 

 

Java代码   收藏代码
  1.          /* the default to send to the follower */  
  2.   //默认发送一个SNAP包,要求follower同步数据  
  3.           int packetToSend = Leader.SNAP;  
  4.           long zxidToSend = 0;  
  5.           long leaderLastZxid = 0;  
  6.           /** the packets that the follower needs to get updates from **/  
  7.           long updates = peerLastZxid;  
  8.             
  9.           /* we are sending the diff check if we have proposals in memory to be able to  
  10.            * send a diff to the  
  11.            */   
  12.           ReentrantReadWriteLock lock = leader.zk.getZKDatabase().getLogLock();  
  13.           ReadLock rl = lock.readLock();  
  14.           try {  
  15.               rl.lock();          
  16.               final long maxCommittedLog = leader.zk.getZKDatabase().getmaxCommittedLog();  
  17.               final long minCommittedLog = leader.zk.getZKDatabase().getminCommittedLog();  
  18.               LOG.info("Synchronizing with Follower sid: " + sid  
  19.                       +" maxCommittedLog=0x"+Long.toHexString(maxCommittedLog)  
  20.                       +" minCommittedLog=0x"+Long.toHexString(minCommittedLog)  
  21.                       +" peerLastZxid=0x"+Long.toHexString(peerLastZxid));  
  22.   
  23. //看看是否还有需要投的票  
  24.               LinkedList<Proposal> proposals = leader.zk.getZKDatabase().getCommittedLog();  
  25. //如果有,则处理这些投票  
  26.               if (proposals.size() != 0) {  
  27.     //如果follower还没处理这个分布式事务,有可能down掉了又恢复,则继续处理这个事务  
  28.                  if ((maxCommittedLog >= peerLastZxid)  
  29.                           && (minCommittedLog <= peerLastZxid)) {  
  30.                      .......  
  31.                       // If we are here, we can use committedLog to sync with  
  32.                       // follower. Then we only need to decide whether to  
  33.                       // send trunc or not  
  34.                       packetToSend = Leader.DIFF;  
  35.                       zxidToSend = maxCommittedLog;  
  36.   
  37.                       for (Proposal propose: proposals) {  
  38.                           // skip the proposals the peer already has  
  39.         //这个已经被处理过了,无视  
  40.                           if (propose.packet.getZxid() <= peerLastZxid) {  
  41.                               prevProposalZxid = propose.packet.getZxid();  
  42.                               continue;  
  43.                           } else {  
  44.                               // If we are sending the first packet, figure out whether to trunc  
  45.                               // in case the follower has some proposals that the leader doesn't  
  46.         //发第一个事务之前先确认follower是否比leader超前  
  47.                               if (firstPacket) {  
  48.                                   firstPacket = false;  
  49.                                   // Does the peer have some proposals that the leader hasn't seen yet  
  50.             //follower处理事务比leader多,则发送TRUNC包,让follower回滚到和leader一致  
  51.                                   if (prevProposalZxid < peerLastZxid) {  
  52.                                       // send a trunc message before sending the diff  
  53.                                       packetToSend = Leader.TRUNC;                                          
  54.                                       zxidToSend = prevProposalZxid;  
  55.                                       updates = zxidToSend;  
  56.                                   }  
  57.                               }  
  58.         //将事务发送到队列  
  59.                               queuePacket(propose.packet);  
  60.         //立马接一个COMMIT包  
  61.                               QuorumPacket qcommit = new QuorumPacket(Leader.COMMIT, propose.packet.getZxid(),  
  62.                                       null, null);  
  63.                               queuePacket(qcommit);  
  64.                           }  
  65.                       }  
  66.                   }   
  67.     //如果follower超前了,则发送TRUNC包,让其和leader同步  
  68.     else if (peerLastZxid > maxCommittedLog) {  
  69.                       LOG.debug("Sending TRUNC to follower zxidToSend=0x{} updates=0x{}",  
  70.                               Long.toHexString(maxCommittedLog),  
  71.                               Long.toHexString(updates));  
  72.   
  73.                       packetToSend = Leader.TRUNC;  
  74.                       zxidToSend = maxCommittedLog;  
  75.                       updates = zxidToSend;  
  76.                   } else {  
  77.                       LOG.warn("Unhandled proposal scenario");  
  78.                   }  
  79.               }   
  80. //如果follower和leader同步,则发送DIFF包,而不需要follower拉数据  
  81. else if (peerLastZxid == leader.zk.getZKDatabase().getDataTreeLastProcessedZxid()) {  
  82.                   .....  
  83.                   packetToSend = Leader.DIFF;  
  84.                   zxidToSend = peerLastZxid;  
  85.             .......  
  86. //NEWLEADER包添加到发送队列  
  87.            QuorumPacket newLeaderQP = new QuorumPacket(Leader.NEWLEADER,  
  88.                   ZxidUtils.makeZxid(newEpoch, 0), null, null);  
  89.            if (getVersion() < 0x10000) {  
  90.               oa.writeRecord(newLeaderQP, "packet");  
  91.           } else {  
  92.               queuedPackets.add(newLeaderQP);  
  93.           }  
  94.           bufferedOutput.flush();  
  95.           //Need to set the zxidToSend to the latest zxid  
  96.           if (packetToSend == Leader.SNAP) {  
  97.               zxidToSend = leader.zk.getZKDatabase().getDataTreeLastProcessedZxid();  
  98.           }  
  99.    //发送一个DIFF或SNAP包  
  100.           oa.writeRecord(new QuorumPacket(packetToSend, zxidToSend, null, null), "packet");  
  101.           bufferedOutput.flush();  
  102.    ......  
  103.    // Start sending packets  
  104.    //启动一个异步发送线程  
  105.           new Thread() {  
  106.               public void run() {  
  107.                   Thread.currentThread().setName(  
  108.                           "Sender-" + sock.getRemoteSocketAddress());  
  109.                   try {  
  110.                       sendPackets();  
  111.                   } catch (InterruptedException e) {  
  112.                       LOG.warn("Unexpected interruption",e);  
  113.                   }  
  114.               }  
  115.           }.start();  
  116.             
  117.           /* 
  118.            * Have to wait for the first ACK, wait until  
  119.            * the leader is ready, and only then we can 
  120.            * start processing messages. 
  121.            */  
  122.     //等待follower确认  
  123.           qp = new QuorumPacket();  
  124.           ia.readRecord(qp, "packet");  

 在我们这个集群里,由于是刚启动的,所以leader会直接发送DIFF包,然后再发送一个NEWLEADER包

 

接着follower收到包处理,在syncWithLeader中

 

Java代码   收藏代码
  1.      QuorumPacket ack = new QuorumPacket(Leader.ACK, 0, null, null);  
  2.         QuorumPacket qp = new QuorumPacket();  
  3.         long newEpoch = ZxidUtils.getEpochFromZxid(newLeaderZxid);  
  4.           
  5.         readPacket(qp);     
  6.         LinkedList<Long> packetsCommitted = new LinkedList<Long>();  
  7.         LinkedList<PacketInFlight> packetsNotCommitted = new LinkedList<PacketInFlight>();  
  8.         synchronized (zk) {  
  9.         //DIFF包  
  10.             if (qp.getType() == Leader.DIFF) {  
  11.                 LOG.info("Getting a diff from the leader 0x" + Long.toHexString(qp.getZxid()));                  
  12.             }  
  13.         //如果是SNAP包,则从leader复制一份镜像数据到本地内存  
  14.             else if (qp.getType() == Leader.SNAP) {  
  15.                 LOG.info("Getting a snapshot from leader");  
  16.                 // The leader is going to dump the database  
  17.                 // clear our own database and read  
  18.                 zk.getZKDatabase().clear();  
  19.                 zk.getZKDatabase().deserializeSnapshot(leaderIs);  
  20.                 String signature = leaderIs.readString("signature");  
  21.                 if (!signature.equals("BenWasHere")) {  
  22.                     LOG.error("Missing signature. Got " + signature);  
  23.                     throw new IOException("Missing signature");                     
  24.                 }  
  25.             }   
  26.         //TRUNC包,回滚到对应事务  
  27.         else if (qp.getType() == Leader.TRUNC) {  
  28.                 //we need to truncate the log to the lastzxid of the leader  
  29.                 LOG.warn("Truncating log to get in sync with the leader 0x"  
  30.                         + Long.toHexString(qp.getZxid()));  
  31.                 boolean truncated=zk.getZKDatabase().truncateLog(qp.getZxid());  
  32. ......  
  33.         //最新的事务id  
  34.             zk.getZKDatabase().setlastProcessedZxid(qp.getZxid());  
  35.         //启动过期session检查  
  36.             zk.createSessionTracker();              
  37.               
  38.             long lastQueued = 0;  
  39.   
  40.             // in V1.0 we take a snapshot when we get the NEWLEADER message, but in pre V1.0  
  41.             // we take the snapshot at the UPDATE, since V1.0 also gets the UPDATE (after the NEWLEADER)  
  42.             // we need to make sure that we don't take the snapshot twice.  
  43.             boolean snapshotTaken = false;  
  44.             // we are now going to start getting transactions to apply followed by an UPTODATE  
  45.             outerLoop:  
  46.         //同步完数据后,准备执行投票  
  47.             while (self.isRunning()) {  
  48.                 readPacket(qp);  
  49.                 switch(qp.getType()) {  
  50.         //将投票添加到待处理列表  
  51.                 case Leader.PROPOSAL:  
  52.                     PacketInFlight pif = new PacketInFlight();  
  53.                     pif.hdr = new TxnHeader();  
  54.                     pif.rec = SerializeUtils.deserializeTxn(qp.getData(), pif.hdr);  
  55.                     if (pif.hdr.getZxid() != lastQueued + 1) {  
  56.                     LOG.warn("Got zxid 0x"  
  57.                             + Long.toHexString(pif.hdr.getZxid())  
  58.                             + " expected 0x"  
  59.                             + Long.toHexString(lastQueued + 1));  
  60.                     }  
  61.                     lastQueued = pif.hdr.getZxid();  
  62.                     packetsNotCommitted.add(pif);  
  63.                     break;  
  64.         //COMMIT则将事务交给Server处理掉  
  65.                 case Leader.COMMIT:  
  66.                     if (!snapshotTaken) {  
  67.                         pif = packetsNotCommitted.peekFirst();  
  68.                         if (pif.hdr.getZxid() != qp.getZxid()) {  
  69.                             LOG.warn("Committing " + qp.getZxid() + ", but next proposal is " + pif.hdr.getZxid());  
  70.                         } else {  
  71.                             zk.processTxn(pif.hdr, pif.rec);  
  72.                             packetsNotCommitted.remove();  
  73.                         }  
  74.                     } else {  
  75.                         packetsCommitted.add(qp.getZxid());  
  76.                     }  
  77.                     break;  
  78.                 case Leader.INFORM:  
  79.                     TxnHeader hdr = new TxnHeader();  
  80.                     Record txn = SerializeUtils.deserializeTxn(qp.getData(), hdr);  
  81.                     zk.processTxn(hdr, txn);  
  82.                     break;  
  83.             //UPTODATE包,说明同步成功,退出循环  
  84.                 case Leader.UPTODATE:  
  85.                     if (!snapshotTaken) { // true for the pre v1.0 case  
  86.                         zk.takeSnapshot();  
  87.                         self.setCurrentEpoch(newEpoch);  
  88.                     }  
  89.                     self.cnxnFactory.setZooKeeperServer(zk);                  
  90.                     break outerLoop;  
  91.             //NEWLEADER包,说明之前残留的投票已经处理完了,则将内存中数据写文件,并发送ACK包  
  92.                 case Leader.NEWLEADER: // it will be NEWLEADER in v1.0  
  93.                     zk.takeSnapshot();  
  94.                     self.setCurrentEpoch(newEpoch);  
  95.                     snapshotTaken = true;  
  96.                     writePacket(new QuorumPacket(Leader.ACK, newLeaderZxid, null, null), true);  
  97.                     break;  
  98.                 }  
  99.             }  
  100.         }  

 follower在这里同步leader数据,在拿到NEWLEADER包之后将数据序列化到文件,并发送ACK包;随后由leader端的IO线程(LearnerHandler)处理该ACK包

 

 

Java代码   收藏代码
  1. qp = new QuorumPacket();  
  2.             ia.readRecord(qp, "packet");  
  3.             if(qp.getType() != Leader.ACK){  
  4.                 LOG.error("Next packet was supposed to be an ACK");  
  5.                 return;  
  6.             }  
  7.         //ACK包处理,如果follower数据同步成功,则将它添加到NEWLEADER这个投票的结果中,这样leader主线程就会恢复执行  
  8.             leader.processAck(this.sid, qp.getZxid(), sock.getLocalSocketAddress());  
  9.               
  10.             // now that the ack has been processed expect the syncLimit  
  11.             sock.setSoTimeout(leader.self.tickTime * leader.self.syncLimit);  
  12.   
  13.             /* 
  14.              * Wait until leader starts up 
  15.              */  
  16.          //等待leader的server启动  
  17.             synchronized(leader.zk){  
  18.                 while(!leader.zk.isRunning() && !this.isInterrupted()){  
  19.                     leader.zk.wait(20);  
  20.                 }  
  21.             }  
  22.             // Mutation packets will be queued during the serialize,  
  23.             // so we need to mark when the peer can actually start  
  24.             // using the data  
  25.             //  
  26.         //leader server启动后,发送一个UPTODATE包  
  27.             queuedPackets.add(new QuorumPacket(Leader.UPTODATE, -1, null, null));  

 具体的ACK包处理

 

 

Java代码   收藏代码
  1. synchronized public void processAck(long sid, long zxid, SocketAddress followerAddr) {  
  2.   
  3.        ......  
  4.        Proposal p = outstandingProposals.get(zxid);  
  5.        ......  
  6.        //将follower添加到结果列表  
  7.        p.ackSet.add(sid);  
  8.        ......  
  9. //票数够了,则启动leader的server  
  10.        if (self.getQuorumVerifier().containsQuorum(p.ackSet)){               
  11.            .......  
  12.   
  13.            } else {  
  14.                lastCommitted = zxid;  
  15.                LOG.info("Have quorum of supporters; starting up and setting last processed zxid: 0x{}",  
  16.                        Long.toHexString(zk.getZxid()));  
  17.     //启动leader的zookeeper server  
  18.                zk.startup();  
  19.                zk.getZKDatabase().setlastProcessedZxid(zk.getZxid());  
  20.            }  
  21.        }  
  22.    }  

 由于follower进来已经满足投票条件,则leader 的server启动,如下

 

 

Java代码   收藏代码
  1. public void startup() {          
  2.        if (sessionTracker == null) {  
  3.            createSessionTracker();  
  4.        }  
  5. //session检查  
  6.        startSessionTracker();  
  7. //处理链  
  8.        setupRequestProcessors();  
  9.   
  10.        registerJMX();  
  11.   
  12.        synchronized (this) {  
  13.            running = true;  
  14.            notifyAll();  
  15.        }  
  16.    }  

 

Java代码   收藏代码
  1. protected void setupRequestProcessors() {  
  2. //final处理器  
  3.     RequestProcessor finalProcessor = new FinalRequestProcessor(this);  
  4.     RequestProcessor toBeAppliedProcessor = new Leader.ToBeAppliedRequestProcessor(  
  5.             finalProcessor, getLeader().toBeApplied);  
  6. //投票结果确认  
  7.     commitProcessor = new CommitProcessor(toBeAppliedProcessor,  
  8.             Long.toString(getServerId()), false);  
  9.     commitProcessor.start();  
  10. //投票发起  
  11.     ProposalRequestProcessor proposalProcessor = new ProposalRequestProcessor(this,  
  12.             commitProcessor);  
  13.     proposalProcessor.initialize();  
  14. //事务预处理  
  15.     firstProcessor = new PrepRequestProcessor(this, proposalProcessor);  
  16.     ((PrepRequestProcessor)firstProcessor).start();  
  17. }  

 leader启动后,发送一个UPTODATE包,follower处理

 

 

Java代码   收藏代码
  1. //退出同步数据循环  
  2. case Leader.UPTODATE:  
  3.                     if (!snapshotTaken) { // true for the pre v1.0 case  
  4.                         zk.takeSnapshot();  
  5.                         self.setCurrentEpoch(newEpoch);  
  6.                     }  
  7.                     self.cnxnFactory.setZooKeeperServer(zk);                  
  8.                     break outerLoop;  
  9.             ......  
  10. //再发ACK包  
  11. ack.setZxid(ZxidUtils.makeZxid(newEpoch, 0));  
  12.         writePacket(ack, true);  

 leader的IO线程LearnerHandler进入主循环,收到ACK包处理

Java代码   收藏代码
  1. while (true) {  
  2.                qp = new QuorumPacket();  
  3.                ia.readRecord(qp, "packet");  
  4.   
  5.                ......  
  6.                tickOfLastAck = leader.self.tick;  
  7.   
  8.   
  9.                ByteBuffer bb;  
  10.                long sessionId;  
  11.                int cxid;  
  12.                int type;  
  13.   
  14.                switch (qp.getType()) {  
  15.     //ACK包,看看之前的投票是否结束  
  16.                case Leader.ACK:  
  17.                    ......  
  18.                    leader.processAck(this.sid, qp.getZxid(), sock.getLocalSocketAddress());  
  19.                    break;  
  20.     //PING包更新下session的超时时间,往前推  
  21.                case Leader.PING:  
  22.                    // Process the touches  
  23.                    ByteArrayInputStream bis = new ByteArrayInputStream(qp  
  24.                            .getData());  
  25.                    DataInputStream dis = new DataInputStream(bis);  
  26.                    while (dis.available() > 0) {  
  27.                        long sess = dis.readLong();  
  28.                        int to = dis.readInt();  
  29.                        leader.zk.touch(sess, to);  
  30.                    }  
  31.                    break;  
  32.     //REVALIDATE包,检查session是否还有效  
  33.                case Leader.REVALIDATE:  
  34.                    bis = new ByteArrayInputStream(qp.getData());  
  35.                    dis = new DataInputStream(bis);  
  36.                    long id = dis.readLong();  
  37.                    int to = dis.readInt();  
  38.                    ByteArrayOutputStream bos = new ByteArrayOutputStream();  
  39.                    DataOutputStream dos = new DataOutputStream(bos);  
  40.                    dos.writeLong(id);  
  41.                    boolean valid = leader.zk.touch(id, to);  
  42.                    if (valid) {  
  43.                        try {  
  44.                            //set the session owner  
  45.                            // as the follower that  
  46.                            // owns the session  
  47.                            leader.zk.setOwner(id, this);  
  48.                        } catch (SessionExpiredException e) {  
  49.                            LOG.error("Somehow session " + Long.toHexString(id) + " expired right after being renewed! (impossible)", e);  
  50.                        }  
  51.                    }  
  52.                    if (LOG.isTraceEnabled()) {  
  53.                        ZooTrace.logTraceMessage(LOG,  
  54.                                                 ZooTrace.SESSION_TRACE_MASK,  
  55.                                                 "Session 0x" + Long.toHexString(id)  
  56.                                                 + " is valid: "+ valid);  
  57.                    }  
  58.                    dos.writeBoolean(valid);  
  59.                    qp.setData(bos.toByteArray());  
  60.                    queuedPackets.add(qp);  
  61.                    break;  
  62.     //REQUEST包,事务请求,follower会将事务请求转发给leader处理  
  63.                case Leader.REQUEST:                      
  64.                    bb = ByteBuffer.wrap(qp.getData());  
  65.                    sessionId = bb.getLong();  
  66.                    cxid = bb.getInt();  
  67.                    type = bb.getInt();  
  68.                    bb = bb.slice();  
  69.                    Request si;  
  70.                    if(type == OpCode.sync){  
  71.                        si = new LearnerSyncRequest(this, sessionId, cxid, type, bb, qp.getAuthinfo());  
  72.                    } else {  
  73.                        si = new Request(null, sessionId, cxid, type, bb, qp.getAuthinfo());  
  74.                    }  
  75.                    si.setOwner(this);  
  76.                    leader.zk.submitRequest(si);  
  77.                    break;  
  78.                default:  
  79.                }  
  80.            }  

 这个时候LearnerHandler线程已经启动完成,follower发完ACK包后

Java代码   收藏代码
  1. writePacket(ack, true);  
  2.     //读超时为syncLimit时间  
  3.         sock.setSoTimeout(self.tickTime * self.syncLimit);  
  4.     //启动follower的zookeeper server  
  5.         zk.startup();  
  6.         // We need to log the stuff that came in between the snapshot and the uptodate  
  7.         if (zk instanceof FollowerZooKeeperServer) {  
  8.             FollowerZooKeeperServer fzk = (FollowerZooKeeperServer)zk;  
  9.             for(PacketInFlight p: packetsNotCommitted) {  
  10.                 fzk.logRequest(p.hdr, p.rec);  
  11.             }  
  12.             for(Long zxid: packetsCommitted) {  
  13.                 fzk.commit(zxid);  
  14.             }  
  15.         }  

 Follower的zookeeper server启动

Java代码   收藏代码
  1. @Override  
  2. protected void setupRequestProcessors() {  
  3.     RequestProcessor finalProcessor = new FinalRequestProcessor(this);  
  4.     commitProcessor = new CommitProcessor(finalProcessor,  
  5.             Long.toString(getServerId()), true);  
  6.     commitProcessor.start();  
  7.     firstProcessor = new FollowerRequestProcessor(this, commitProcessor);  
  8.     ((FollowerRequestProcessor) firstProcessor).start();  
  9.     syncProcessor = new SyncRequestProcessor(this,  
  10.             new SendAckRequestProcessor((Learner)getFollower()));  
  11.     syncProcessor.start();  
  12. }  

 Follower进入主处理

Java代码   收藏代码
  1. QuorumPacket qp = new QuorumPacket();  
  2.                 while (self.isRunning()) {  
  3.                     readPacket(qp);  
  4.                     processPacket(qp);  
  5.                 }  
  6.    protected void processPacket(QuorumPacket qp) throws IOException{  
  7.         switch (qp.getType()) {  
  8.     //PING包,写回session数据   
  9.         case Leader.PING:              
  10.             ping(qp);              
  11.             break;  
  12.     //PROPOSAL包,投票处理  
  13.         case Leader.PROPOSAL:              
  14.             TxnHeader hdr = new TxnHeader();  
  15.             Record txn = SerializeUtils.deserializeTxn(qp.getData(), hdr);  
  16.             if (hdr.getZxid() != lastQueued + 1) {  
  17.                 LOG.warn("Got zxid 0x"  
  18.                         + Long.toHexString(hdr.getZxid())  
  19.                         + " expected 0x"  
  20.                         + Long.toHexString(lastQueued + 1));  
  21.             }  
  22.             lastQueued = hdr.getZxid();  
  23.             fzk.logRequest(hdr, txn);  
  24.             break;  
  25.     //COMMIT包,提交事务  
  26.         case Leader.COMMIT:  
  27.             fzk.commit(qp.getZxid());  
  28.             break;  
  29.         case Leader.UPTODATE:  
  30.             LOG.error("Received an UPTODATE message after Follower started");  
  31.             break;  
  32.         case Leader.REVALIDATE:  
  33.             revalidate(qp);  
  34.             break;  
  35.         case Leader.SYNC:  
  36.             fzk.sync();  
  37.             break;  
  38.         }  
  39.     }  

 这个时候Follower也初始化完成。再看Leader主线程:它之前一直在等待Follower同步结束,同步结束之后进入主循环,周期性地检查Follower是否down掉

Java代码   收藏代码
  1.     while (true) {  
  2.               Thread.sleep(self.tickTime / 2);  
  3.               if (!tickSkip) {  
  4.                   self.tick++;  
  5.               }  
  6.               HashSet<Long> syncedSet = new HashSet<Long>();  
  7.   
  8.               // lock on the followers when we use it.  
  9.               syncedSet.add(self.getId());  
  10. //检查每个follower是否还活着  
  11.               for (LearnerHandler f : getLearners()) {  
  12.                   // Synced set is used to check we have a supporting quorum, so only  
  13.                   // PARTICIPANT, not OBSERVER, learners should be used  
  14.                   if (f.synced() && f.getLearnerType() == LearnerType.PARTICIPANT) {  
  15.                       syncedSet.add(f.getSid());  
  16.                   }  
  17.                   f.ping();  
  18.               }  
  19. //如果有follower挂掉导致投票不通过,则退出lead流程,重新选举  
  20.             if (!tickSkip && !self.getQuorumVerifier().containsQuorum(syncedSet)) {  
  21.               //if (!tickSkip && syncedCount < self.quorumPeers.size() / 2) {  
  22.                   // Lost quorum, shutdown  
  23.                 // TODO: message is wrong unless majority quorums used  
  24.                   shutdown("Only " + syncedSet.size() + " followers, need "  
  25.                           + (self.getVotingView().size() / 2));  
  26.                   // make sure the order is the same!  
  27.                   // the leader goes to looking  
  28.                   return;  
  29.             }   
  30.             tickSkip = !tickSkip;  
  31.           }  

http://iwinit.iteye.com/blog/1775439

你可能感兴趣的:(zookeeper)