深入浅出Zookeeper之六 Leader/Follower初始化

前一篇介绍了Leader选举,这一篇介绍选举成功之后Leader和Follower之间的初始化。

先看Leader端操作

 

case LEADING:
                    LOG.info("LEADING");
                    try {
			//初始化Leader对象
                        setLeader(makeLeader(logFactory));
			//lead,线程在这里阻塞
                        leader.lead();
                        setLeader(null);
                    } catch (Exception e) {
                        LOG.warn("Unexpected exception",e);
                    } finally {
                        if (leader != null) {
                            leader.shutdown("Forcing shutdown");
                            setLeader(null);
                        }
                        setPeerState(ServerState.LOOKING);
                    }
                    break;
                }

 Leader初始化

 

 

    Leader(QuorumPeer self,LeaderZooKeeperServer zk) throws IOException {
        this.self = self;
        try {
		//打开lead端口,这里是2888
            ss = new ServerSocket();
            ss.setReuseAddress(true);
            ss.bind(new InetSocketAddress(self.getQuorumAddress().getPort()));
        } catch (BindException e) {
            LOG.error("Couldn't bind to port "
                    + self.getQuorumAddress().getPort(), e);
            throw e;
        }
        this.zk=zk;
    }

 具体lead过程

 

 

	self.tick = 0;
		//从本地文件恢复数据
            zk.loadData();
            //leader的状态信息
            leaderStateSummary = new StateSummary(self.getCurrentEpoch(), zk.getLastProcessedZxid());

            // Start thread that waits for connection requests from 
            // new followers.
		//启动lead端口的监听线程,专门用来监听新的follower
            cnxAcceptor = new LearnerCnxAcceptor();
            cnxAcceptor.start();
            
            readyToStart = true;
	    //等待足够多的follower进来,代表自己确实是leader,此处lead线程可能会等待
            long epoch = getEpochToPropose(self.getId(), self.getAcceptedEpoch());
	.......

 等待follower连接

 

 

public long getEpochToPropose(long sid, long lastAcceptedEpoch) throws InterruptedException, IOException {
        synchronized(connectingFollowers) {
            if (!waitingForNewEpoch) {
                return epoch;
            }
            if (lastAcceptedEpoch >= epoch) {
                epoch = lastAcceptedEpoch+1;
            }
	    //将自己加入连接队伍中,方便后续判断lead是否有效
            connectingFollowers.add(sid);
            QuorumVerifier verifier = self.getQuorumVerifier();
	    //如果足够多的follower进入,选举有效,则无需等待,并通知其他的等待线程,类似于Barrier
            if (connectingFollowers.contains(self.getId()) && 
                                            verifier.containsQuorum(connectingFollowers)) {
                waitingForNewEpoch = false;
                self.setAcceptedEpoch(epoch);
                connectingFollowers.notifyAll();
            } 
	    //如果进入的follower不够,则进入等待,超时即为initLimit时间,
	    else {
                long start = System.currentTimeMillis();
                long cur = start;
                long end = start + self.getInitLimit()*self.getTickTime();
                while(waitingForNewEpoch && cur < end) {
                    connectingFollowers.wait(end - cur);
                    cur = System.currentTimeMillis();
                }
		//超时了,退出lead过程,重新发起选举
                if (waitingForNewEpoch) {
                    throw new InterruptedException("Timeout while waiting for epoch from quorum");        
                }
            }
            return epoch;
        }
    }

 好的,这个时候我们假设其他follower还没连接进来,那Leader就会在此等待。再来看Follower的初始化过程

 

case FOLLOWING:
                    try {
                        LOG.info("FOLLOWING");
			//初始化Follower对象
                        setFollower(makeFollower(logFactory));
			//follow动作,线程在此等待
                        follower.followLeader();
                    } catch (Exception e) {
                        LOG.warn("Unexpected exception",e);
                    } finally {
                        follower.shutdown();
                        setFollower(null);
                        setPeerState(ServerState.LOOKING);
                    }
                    break;

 具体follow过程

 

 

void followLeader() throws InterruptedException {
        .......
        try {
		//根据sid找到对应leader,拿到lead连接信息
            InetSocketAddress addr = findLeader();            
            try {
		//连接leader
                connectToLeader(addr);
		//注册follower,根据Leader和follower协议,主要是同步选举轮数
                long newEpochZxid = registerWithLeader(Leader.FOLLOWERINFO);

                //check to see if the leader zxid is lower than ours
                //this should never happen but is just a safety check
                long newEpoch = ZxidUtils.getEpochFromZxid(newEpochZxid);
                if (newEpoch < self.getAcceptedEpoch()) {
                    LOG.error("Proposed leader epoch " + ZxidUtils.zxidToString(newEpochZxid)
                            + " is less than our accepted epoch " + ZxidUtils.zxidToString(self.getAcceptedEpoch()));
                    throw new IOException("Error: Epoch of leader is lower");
                }
		//同步数据
                syncWithLeader(newEpochZxid);                
                QuorumPacket qp = new QuorumPacket();
		//接受Leader消息,执行并反馈给leader,线程在此自旋
                while (self.isRunning()) {
                    readPacket(qp);
                    processPacket(qp);
                }
           ......
    }

 连接leader

 

 

protected void connectToLeader(InetSocketAddress addr) 
    throws IOException, ConnectException, InterruptedException {
        sock = new Socket();
	//设置读超时时间为initLimit对应时间
        sock.setSoTimeout(self.tickTime * self.initLimit);
	//重试5次,失败后退出follower角色,重新选举
        for (int tries = 0; tries < 5; tries++) {
            try {
		//连接超时
                sock.connect(addr, self.tickTime * self.syncLimit);
                sock.setTcpNoDelay(nodelay);
                break;
            } catch (IOException e) {
                if (tries == 4) {
                    LOG.error("Unexpected exception",e);
                    throw e;
                } else {
                    LOG.warn("Unexpected exception, tries="+tries+
                            ", connecting to " + addr,e);
                    sock = new Socket();
                    sock.setSoTimeout(self.tickTime * self.initLimit);
                }
            }
            Thread.sleep(1000);
        }
        leaderIs = BinaryInputArchive.getArchive(new BufferedInputStream(
                sock.getInputStream()));
        bufferedOutput = new BufferedOutputStream(sock.getOutputStream());
        leaderOs = BinaryOutputArchive.getArchive(bufferedOutput);
    }   

 假设这里follower顺利连上了leader,此时leader端会为每个follower启动单独IO线程,请看LearnerCnxAcceptor代码

 

 

 public void run() {
            try {
                while (!stop) {
                    try{
			//线程在此等待连接
                        Socket s = ss.accept();
                        // start with the initLimit, once the ack is processed
                        // in LearnerHandler switch to the syncLimit
			//读超时设为initLimit时间
                        s.setSoTimeout(self.tickTime * self.initLimit);
                        s.setTcpNoDelay(nodelay);
			//为每个follower启动单独线程,处理IO
                        LearnerHandler fh = new LearnerHandler(s, Leader.this);
                        fh.start();
                    } catch (SocketException e) {
......
        }

 leader端为follower建立IO线程,其处理过程和follower自身的主线程根据协议相互交互,以下将通过数据交换场景式分析这个过程,leader端IO线程LearnerHandler启动

 

 

 ia = BinaryInputArchive.getArchive(new BufferedInputStream(sock
                    .getInputStream()));
            bufferedOutput = new BufferedOutputStream(sock.getOutputStream());
            oa = BinaryOutputArchive.getArchive(bufferedOutput);
		//IO线程等待follower发送包
            QuorumPacket qp = new QuorumPacket();
            ia.readRecord(qp, "packet");

 follower端进入registerWithLeader处理

 

 

long lastLoggedZxid = self.getLastLoggedZxid();
        QuorumPacket qp = new QuorumPacket();  
	//type为Leader.FOLLOWERINFO
        qp.setType(pktType);
        qp.setZxid(ZxidUtils.makeZxid(self.getAcceptedEpoch(), 0));
        
        /*
         * Add sid to payload
         */
        LearnerInfo li = new LearnerInfo(self.getId(), 0x10000);
        ByteArrayOutputStream bsid = new ByteArrayOutputStream();
        BinaryOutputArchive boa = BinaryOutputArchive.getArchive(bsid);
        boa.writeRecord(li, "LearnerInfo");
        qp.setData(bsid.toByteArray());
        //发送LearnerInfo包
        writePacket(qp, true);
	//等待leader响应
        readPacket(qp); 

 leader端收到包处理

 

 

byte learnerInfoData[] = qp.getData();
            if (learnerInfoData != null) {
            	if (learnerInfoData.length == 8) {
            		ByteBuffer bbsid = ByteBuffer.wrap(learnerInfoData);
            		this.sid = bbsid.getLong();
            	} else {
			//反序列化LearnerInfo
            		LearnerInfo li = new LearnerInfo();
            		ByteBufferInputStream.byteBuffer2Record(ByteBuffer.wrap(learnerInfoData), li);
            		this.sid = li.getServerid();
            		this.version = li.getProtocolVersion();
            	}
            } else {
            	this.sid = leader.followerCounter.getAndDecrement();
            }

	...... 
            //follower的选举轮数
            long lastAcceptedEpoch = ZxidUtils.getEpochFromZxid(qp.getZxid());
            
            long peerLastZxid;
            StateSummary ss = null;
            long zxid = qp.getZxid();
	    //将follower加入到connectingFollowers中,因为满足半数机器的条件,此时在此等待的leader主线程会退出等待,继续往下处理
            long newEpoch = leader.getEpochToPropose(this.getSid(), lastAcceptedEpoch);
            
          ......
		//发一个Leader.LEADERINFO包,带上新的epoch id
                byte ver[] = new byte[4];
                ByteBuffer.wrap(ver).putInt(0x10000);
                QuorumPacket newEpochPacket = new QuorumPacket(Leader.LEADERINFO, ZxidUtils.makeZxid(newEpoch, 0), ver, null);
                oa.writeRecord(newEpochPacket, "packet");
                bufferedOutput.flush();
                QuorumPacket ackEpochPacket = new QuorumPacket();
		//等待follower响应
                ia.readRecord(ackEpochPacket, "packet");
                if (ackEpochPacket.getType() != Leader.ACKEPOCH) {
                    LOG.error(ackEpochPacket.toString()
                            + " is not ACKEPOCH");
                    return;
				}
                ByteBuffer bbepoch = ByteBuffer.wrap(ackEpochPacket.getData());
                ss = new StateSummary(bbepoch.getInt(), ackEpochPacket.getZxid());
                leader.waitForEpochAck(this.getSid(), ss);
            }

 此时follower收到LEADERINFO包处理:

 

 

final long newEpoch = ZxidUtils.getEpochFromZxid(qp.getZxid());
		if (qp.getType() == Leader.LEADERINFO) {
        	// we are connected to a 1.0 server so accept the new epoch and read the next packet
        	leaderProtocolVersion = ByteBuffer.wrap(qp.getData()).getInt();
        	byte epochBytes[] = new byte[4];
        	final ByteBuffer wrappedEpochBytes = ByteBuffer.wrap(epochBytes);
		//将自己的epoch发给leader
        	if (newEpoch > self.getAcceptedEpoch()) {
        		wrappedEpochBytes.putInt((int)self.getCurrentEpoch());
        		self.setAcceptedEpoch(newEpoch);
        	} 
		......
		//发送一个Leader.ACKEPOCH包,带上自己的最大zxid
        	QuorumPacket ackNewEpoch = new QuorumPacket(Leader.ACKEPOCH, lastLoggedZxid, epochBytes, null);
        	writePacket(ackNewEpoch, true);
            return ZxidUtils.makeZxid(newEpoch, 0);
        } 

 leader收到Leader.ACKEPOCH后进入waitForEpochAck处理

 

 

  public void waitForEpochAck(long id, StateSummary ss) throws IOException, InterruptedException {
        synchronized(electingFollowers) {
            if (electionFinished) {
                return;
            }
            if (ss.getCurrentEpoch() != -1) {
                ......
		//将follower添加到等待集合
                electingFollowers.add(id);
            }
            QuorumVerifier verifier = self.getQuorumVerifier();
	    //判断是否满足选举条件,如果不满足进入等待,满足则通知其他等待线程,类似于Barrier
            if (electingFollowers.contains(self.getId()) && verifier.containsQuorum(electingFollowers)) {
                electionFinished = true;
                electingFollowers.notifyAll();
            }
	    //follower还不够,等等吧
	    else {                
                long start = System.currentTimeMillis();
                long cur = start;
                long end = start + self.getInitLimit()*self.getTickTime();
                while(!electionFinished && cur < end) {
                    electingFollowers.wait(end - cur);
                    cur = System.currentTimeMillis();
                }
                if (!electionFinished) {
                    throw new InterruptedException("Timeout while waiting for epoch to be acked by quorum");
                }
            }
        }
    }

 假设IO线程在此等待,此时leader主线程在getEpochToPropose恢复后继续执行

 

 

 long epoch = getEpochToPropose(self.getId(), self.getAcceptedEpoch());
            
            zk.setZxid(ZxidUtils.makeZxid(epoch, 0));
            
            synchronized(this){
                lastProposed = zk.getZxid();
            }
            //发起一个NEWLEADER投票
            newLeaderProposal.packet = new QuorumPacket(NEWLEADER, zk.getZxid(),
                    null, null);
		......
		//投票箱
            outstandingProposals.put(newLeaderProposal.packet.getZxid(), newLeaderProposal);
		//自己默认同意
            newLeaderProposal.ackSet.add(self.getId());
		//等待follower进来
            waitForEpochAck(self.getId(), leaderStateSummary);

 由于之前已经有follower进来,满足选举条件,则IO线程和leader主线程都继续往下执行,先看leader主线程

 

 

//当前选票轮数
self.setCurrentEpoch(epoch);

            // We have to get at least a majority of servers in sync with
            // us. We do this by waiting for the NEWLEADER packet to get
            // acknowledged
	    //等待确认NEWLEADER包的follower足够多,那自己真的是leader了
            while (!self.getQuorumVerifier().containsQuorum(newLeaderProposal.ackSet)){
            //while (newLeaderProposal.ackCount <= self.quorumPeers.size() / 2) {
		//如果超过初始化时间initlimit,则退出lead过程,重新选举,有可能是follower同步数据比较慢
                if (self.tick > self.initLimit) {
                    // Followers aren't syncing fast enough,
                    // renounce leadership!
                    StringBuilder ackToString = new StringBuilder();
                    for(Long id : newLeaderProposal.ackSet)
                        ackToString.append(id + ": ");
                    
                    shutdown("Waiting for a quorum of followers, only synced with: " + ackToString);
                    HashSet<Long> followerSet = new HashSet<Long>();

                    for(LearnerHandler f : getLearners()) {
                        followerSet.add(f.getSid());
                    }

                    if (self.getQuorumVerifier().containsQuorum(followerSet)) {
                    //if (followers.size() >= self.quorumPeers.size() / 2) {
                        LOG.warn("Enough followers present. "+
                                "Perhaps the initTicks need to be increased.");
                    }
                    return;
                }
                Thread.sleep(self.tickTime);
                self.tick++;
            }

 这个时候IO线程继续执行

 

 

 

           /* the default to send to the follower */
	   //默认发送一个SNAP包,要求follower同步数据
            int packetToSend = Leader.SNAP;
            long zxidToSend = 0;
            long leaderLastZxid = 0;
            /** the packets that the follower needs to get updates from **/
            long updates = peerLastZxid;
            
            /* we are sending the diff check if we have proposals in memory to be able to 
             * send a diff to the 
             */ 
            ReentrantReadWriteLock lock = leader.zk.getZKDatabase().getLogLock();
            ReadLock rl = lock.readLock();
            try {
                rl.lock();        
                final long maxCommittedLog = leader.zk.getZKDatabase().getmaxCommittedLog();
                final long minCommittedLog = leader.zk.getZKDatabase().getminCommittedLog();
                LOG.info("Synchronizing with Follower sid: " + sid
                        +" maxCommittedLog=0x"+Long.toHexString(maxCommittedLog)
                        +" minCommittedLog=0x"+Long.toHexString(minCommittedLog)
                        +" peerLastZxid=0x"+Long.toHexString(peerLastZxid));

		//看看是否还有需要投的票
                LinkedList<Proposal> proposals = leader.zk.getZKDatabase().getCommittedLog();
		//如果有,则处理这些投票
                if (proposals.size() != 0) {
			//如果follower还没处理这个分布式事务,有可能down掉了又恢复,则继续处理这个事务
                   if ((maxCommittedLog >= peerLastZxid)
                            && (minCommittedLog <= peerLastZxid)) {
                       .......
                        // If we are here, we can use committedLog to sync with
                        // follower. Then we only need to decide whether to
                        // send trunc or not
                        packetToSend = Leader.DIFF;
                        zxidToSend = maxCommittedLog;

                        for (Proposal propose: proposals) {
                            // skip the proposals the peer already has
			    //这个已经被处理过了,无视
                            if (propose.packet.getZxid() <= peerLastZxid) {
                                prevProposalZxid = propose.packet.getZxid();
                                continue;
                            } else {
                                // If we are sending the first packet, figure out whether to trunc
                                // in case the follower has some proposals that the leader doesn't
				//发第一个事务之前先确认folloer是否比leader超前
                                if (firstPacket) {
                                    firstPacket = false;
                                    // Does the peer have some proposals that the leader hasn't seen yet
				    //follower处理事务比leader多,则发送TRUNC包,让follower回滚到和leader一致
                                    if (prevProposalZxid < peerLastZxid) {
                                        // send a trunc message before sending the diff
                                        packetToSend = Leader.TRUNC;                                        
                                        zxidToSend = prevProposalZxid;
                                        updates = zxidToSend;
                                    }
                                }
				//将事务发送到队列
                                queuePacket(propose.packet);
				//立马接一个COMMIT包
                                QuorumPacket qcommit = new QuorumPacket(Leader.COMMIT, propose.packet.getZxid(),
                                        null, null);
                                queuePacket(qcommit);
                            }
                        }
                    } 
		    //如果follower超前了,则发送TRUNC包,让其和leader同步
		    else if (peerLastZxid > maxCommittedLog) {
                        LOG.debug("Sending TRUNC to follower zxidToSend=0x{} updates=0x{}",
                                Long.toHexString(maxCommittedLog),
                                Long.toHexString(updates));

                        packetToSend = Leader.TRUNC;
                        zxidToSend = maxCommittedLog;
                        updates = zxidToSend;
                    } else {
                        LOG.warn("Unhandled proposal scenario");
                    }
                } 
		//如果follower和leader同步,则发送DIFF包,而不需要follower拉数据
		else if (peerLastZxid == leader.zk.getZKDatabase().getDataTreeLastProcessedZxid()) {
                    .....
                    packetToSend = Leader.DIFF;
                    zxidToSend = peerLastZxid;
              .......
		//NEWLEADER包添加到发送队列
             QuorumPacket newLeaderQP = new QuorumPacket(Leader.NEWLEADER,
                    ZxidUtils.makeZxid(newEpoch, 0), null, null);
             if (getVersion() < 0x10000) {
                oa.writeRecord(newLeaderQP, "packet");
            } else {
                queuedPackets.add(newLeaderQP);
            }
            bufferedOutput.flush();
            //Need to set the zxidToSend to the latest zxid
            if (packetToSend == Leader.SNAP) {
                zxidToSend = leader.zk.getZKDatabase().getDataTreeLastProcessedZxid();
            }
	    //发送一个DIFF或SNAP包
            oa.writeRecord(new QuorumPacket(packetToSend, zxidToSend, null, null), "packet");
            bufferedOutput.flush();
	    ......
	    // Start sending packets
	    //启动一个异步发送线程
            new Thread() {
                public void run() {
                    Thread.currentThread().setName(
                            "Sender-" + sock.getRemoteSocketAddress());
                    try {
                        sendPackets();
                    } catch (InterruptedException e) {
                        LOG.warn("Unexpected interruption",e);
                    }
                }
            }.start();
            
            /*
             * Have to wait for the first ACK, wait until 
             * the leader is ready, and only then we can
             * start processing messages.
             */
	     //等待follower确认
            qp = new QuorumPacket();
            ia.readRecord(qp, "packet");

 在我们这个集群里。由于是刚启动的,所以leader会直接发送DIFF包,然后再发送一个NEWLEADER包

 

接着follower收到包处理,在syncWithLeader中

 

     QuorumPacket ack = new QuorumPacket(Leader.ACK, 0, null, null);
        QuorumPacket qp = new QuorumPacket();
        long newEpoch = ZxidUtils.getEpochFromZxid(newLeaderZxid);
        
        readPacket(qp);   
        LinkedList<Long> packetsCommitted = new LinkedList<Long>();
        LinkedList<PacketInFlight> packetsNotCommitted = new LinkedList<PacketInFlight>();
        synchronized (zk) {
		//DIFF包
            if (qp.getType() == Leader.DIFF) {
                LOG.info("Getting a diff from the leader 0x" + Long.toHexString(qp.getZxid()));                
            }
	    //如果是SNAP包,则从leader复制一份镜像数据到本地内存
            else if (qp.getType() == Leader.SNAP) {
                LOG.info("Getting a snapshot from leader");
                // The leader is going to dump the database
                // clear our own database and read
                zk.getZKDatabase().clear();
                zk.getZKDatabase().deserializeSnapshot(leaderIs);
                String signature = leaderIs.readString("signature");
                if (!signature.equals("BenWasHere")) {
                    LOG.error("Missing signature. Got " + signature);
                    throw new IOException("Missing signature");                   
                }
            } 
	    //TRUNC包,回滚到对应事务
	    else if (qp.getType() == Leader.TRUNC) {
                //we need to truncate the log to the lastzxid of the leader
                LOG.warn("Truncating log to get in sync with the leader 0x"
                        + Long.toHexString(qp.getZxid()));
                boolean truncated=zk.getZKDatabase().truncateLog(qp.getZxid());
......
		//最新的事务id
            zk.getZKDatabase().setlastProcessedZxid(qp.getZxid());
	    //启动过期session检查
            zk.createSessionTracker();            
            
            long lastQueued = 0;

            // in V1.0 we take a snapshot when we get the NEWLEADER message, but in pre V1.0
            // we take the snapshot at the UPDATE, since V1.0 also gets the UPDATE (after the NEWLEADER)
            // we need to make sure that we don't take the snapshot twice.
            boolean snapshotTaken = false;
            // we are now going to start getting transactions to apply followed by an UPTODATE
            outerLoop:
	    //同步完数据后,准备执行投票
            while (self.isRunning()) {
                readPacket(qp);
                switch(qp.getType()) {
		//将投票添加到待处理列表
                case Leader.PROPOSAL:
                    PacketInFlight pif = new PacketInFlight();
                    pif.hdr = new TxnHeader();
                    pif.rec = SerializeUtils.deserializeTxn(qp.getData(), pif.hdr);
                    if (pif.hdr.getZxid() != lastQueued + 1) {
                    LOG.warn("Got zxid 0x"
                            + Long.toHexString(pif.hdr.getZxid())
                            + " expected 0x"
                            + Long.toHexString(lastQueued + 1));
                    }
                    lastQueued = pif.hdr.getZxid();
                    packetsNotCommitted.add(pif);
                    break;
		//COMMIT则将事务交给Server处理掉
                case Leader.COMMIT:
                    if (!snapshotTaken) {
                        pif = packetsNotCommitted.peekFirst();
                        if (pif.hdr.getZxid() != qp.getZxid()) {
                            LOG.warn("Committing " + qp.getZxid() + ", but next proposal is " + pif.hdr.getZxid());
                        } else {
                            zk.processTxn(pif.hdr, pif.rec);
                            packetsNotCommitted.remove();
                        }
                    } else {
                        packetsCommitted.add(qp.getZxid());
                    }
                    break;
                case Leader.INFORM:
                    TxnHeader hdr = new TxnHeader();
                    Record txn = SerializeUtils.deserializeTxn(qp.getData(), hdr);
                    zk.processTxn(hdr, txn);
                    break;
		    //UPTODATE包,说明同步成功,退出循环
                case Leader.UPTODATE:
                    if (!snapshotTaken) { // true for the pre v1.0 case
                        zk.takeSnapshot();
                        self.setCurrentEpoch(newEpoch);
                    }
                    self.cnxnFactory.setZooKeeperServer(zk);                
                    break outerLoop;
		    //NEWLEADER包,说明之前残留的投票已经处理完了,则将内存中数据写文件,并发送ACK包
                case Leader.NEWLEADER: // it will be NEWLEADER in v1.0
                    zk.takeSnapshot();
                    self.setCurrentEpoch(newEpoch);
                    snapshotTaken = true;
                    writePacket(new QuorumPacket(Leader.ACK, newLeaderZxid, null, null), true);
                    break;
                }
            }
        }

 follower在这里同步leader数据,在拿到NEWLEADER包之后序列化到文件,发送ACK包,leaderIO线程处理

 

 

qp = new QuorumPacket();
            ia.readRecord(qp, "packet");
            if(qp.getType() != Leader.ACK){
                LOG.error("Next packet was supposed to be an ACK");
                return;
            }
		//ACK包处理,如果follower数据同步成功,则将它添加到NEWLEADER这个投票的结果中,这样leader主线程就会恢复执行
            leader.processAck(this.sid, qp.getZxid(), sock.getLocalSocketAddress());
            
            // now that the ack has been processed expect the syncLimit
            sock.setSoTimeout(leader.self.tickTime * leader.self.syncLimit);

            /*
             * Wait until leader starts up
             */
	     //等待leader的server启动
            synchronized(leader.zk){
                while(!leader.zk.isRunning() && !this.isInterrupted()){
                    leader.zk.wait(20);
                }
            }
            // Mutation packets will be queued during the serialize,
            // so we need to mark when the peer can actually start
            // using the data
            //
	    //leader server启动后,发送一个UPTODATE包
            queuedPackets.add(new QuorumPacket(Leader.UPTODATE, -1, null, null));

 具体的ACK包处理

 

 

 synchronized public void processAck(long sid, long zxid, SocketAddress followerAddr) {

        ......
        Proposal p = outstandingProposals.get(zxid);
        ......
        //将follower添加到结果列表
        p.ackSet.add(sid);
        ......
	//票数够了,则启动leader的server
        if (self.getQuorumVerifier().containsQuorum(p.ackSet)){             
            .......

            } else {
                lastCommitted = zxid;
                LOG.info("Have quorum of supporters; starting up and setting last processed zxid: 0x{}",
                        Long.toHexString(zk.getZxid()));
		//启动leader的zookeeper server
                zk.startup();
                zk.getZKDatabase().setlastProcessedZxid(zk.getZxid());
            }
        }
    }

 由于follower进来已经满足投票条件,则leader 的server启动,如下

 

 

 public void startup() {        
        if (sessionTracker == null) {
            createSessionTracker();
        }
	//session检查
        startSessionTracker();
	//处理链
        setupRequestProcessors();

        registerJMX();

        synchronized (this) {
            running = true;
            notifyAll();
        }
    }

 

    protected void setupRequestProcessors() {
	//最后final处理器
        RequestProcessor finalProcessor = new FinalRequestProcessor(this);
        RequestProcessor toBeAppliedProcessor = new Leader.ToBeAppliedRequestProcessor(
                finalProcessor, getLeader().toBeApplied);
	//投票结果确认
        commitProcessor = new CommitProcessor(toBeAppliedProcessor,
                Long.toString(getServerId()), false);
        commitProcessor.start();
	//投票发起
        ProposalRequestProcessor proposalProcessor = new ProposalRequestProcessor(this,
                commitProcessor);
        proposalProcessor.initialize();
	//事务预处理
        firstProcessor = new PrepRequestProcessor(this, proposalProcessor);
        ((PrepRequestProcessor)firstProcessor).start();
    }

 leader启动后,发送一个UPTODATE包,follower处理

 

 

//退出同步数据循环
case Leader.UPTODATE:
                    if (!snapshotTaken) { // true for the pre v1.0 case
                        zk.takeSnapshot();
                        self.setCurrentEpoch(newEpoch);
                    }
                    self.cnxnFactory.setZooKeeperServer(zk);                
                    break outerLoop;
		    ......
//再发ACK包
ack.setZxid(ZxidUtils.makeZxid(newEpoch, 0));
        writePacket(ack, true);

 leader的IO线程LearnerHandler进入主循环,收到ACK包处理

 while (true) {
                qp = new QuorumPacket();
                ia.readRecord(qp, "packet");

                ......
                tickOfLastAck = leader.self.tick;


                ByteBuffer bb;
                long sessionId;
                int cxid;
                int type;

                switch (qp.getType()) {
		//ACK包,看看之前的投票是否结束
                case Leader.ACK:
                    ......
                    leader.processAck(this.sid, qp.getZxid(), sock.getLocalSocketAddress());
                    break;
		//PING包更新下session的超时时间,往前推
                case Leader.PING:
                    // Process the touches
                    ByteArrayInputStream bis = new ByteArrayInputStream(qp
                            .getData());
                    DataInputStream dis = new DataInputStream(bis);
                    while (dis.available() > 0) {
                        long sess = dis.readLong();
                        int to = dis.readInt();
                        leader.zk.touch(sess, to);
                    }
                    break;
		//REVALIDATE包,检查session是否还有效
                case Leader.REVALIDATE:
                    bis = new ByteArrayInputStream(qp.getData());
                    dis = new DataInputStream(bis);
                    long id = dis.readLong();
                    int to = dis.readInt();
                    ByteArrayOutputStream bos = new ByteArrayOutputStream();
                    DataOutputStream dos = new DataOutputStream(bos);
                    dos.writeLong(id);
                    boolean valid = leader.zk.touch(id, to);
                    if (valid) {
                        try {
                            //set the session owner
                            // as the follower that
                            // owns the session
                            leader.zk.setOwner(id, this);
                        } catch (SessionExpiredException e) {
                            LOG.error("Somehow session " + Long.toHexString(id) + " expired right after being renewed! (impossible)", e);
                        }
                    }
                    if (LOG.isTraceEnabled()) {
                        ZooTrace.logTraceMessage(LOG,
                                                 ZooTrace.SESSION_TRACE_MASK,
                                                 "Session 0x" + Long.toHexString(id)
                                                 + " is valid: "+ valid);
                    }
                    dos.writeBoolean(valid);
                    qp.setData(bos.toByteArray());
                    queuedPackets.add(qp);
                    break;
		//REQUEST包,事务请求,follower会将事务请求转发给leader处理
                case Leader.REQUEST:                    
                    bb = ByteBuffer.wrap(qp.getData());
                    sessionId = bb.getLong();
                    cxid = bb.getInt();
                    type = bb.getInt();
                    bb = bb.slice();
                    Request si;
                    if(type == OpCode.sync){
                        si = new LearnerSyncRequest(this, sessionId, cxid, type, bb, qp.getAuthinfo());
                    } else {
                        si = new Request(null, sessionId, cxid, type, bb, qp.getAuthinfo());
                    }
                    si.setOwner(this);
                    leader.zk.submitRequest(si);
                    break;
                default:
                }
            }

 这个时候LearnerHandler线程已经启动完成,follower发完ACK包后

writePacket(ack, true);
	//读超时为syncLimit时间
        sock.setSoTimeout(self.tickTime * self.syncLimit);
	//启动follower的zookeeper server
        zk.startup();
        // We need to log the stuff that came in between the snapshot and the uptodate
        if (zk instanceof FollowerZooKeeperServer) {
            FollowerZooKeeperServer fzk = (FollowerZooKeeperServer)zk;
            for(PacketInFlight p: packetsNotCommitted) {
                fzk.logRequest(p.hdr, p.rec);
            }
            for(Long zxid: packetsCommitted) {
                fzk.commit(zxid);
            }
        }

 Follower的zookeeper server启动

    @Override
    protected void setupRequestProcessors() {
        RequestProcessor finalProcessor = new FinalRequestProcessor(this);
        commitProcessor = new CommitProcessor(finalProcessor,
                Long.toString(getServerId()), true);
        commitProcessor.start();
        firstProcessor = new FollowerRequestProcessor(this, commitProcessor);
        ((FollowerRequestProcessor) firstProcessor).start();
        syncProcessor = new SyncRequestProcessor(this,
                new SendAckRequestProcessor((Learner)getFollower()));
        syncProcessor.start();
    }

 Follower进入主处理

QuorumPacket qp = new QuorumPacket();
                while (self.isRunning()) {
                    readPacket(qp);
                    processPacket(qp);
                }
   protected void processPacket(QuorumPacket qp) throws IOException{
        switch (qp.getType()) {
	//PING包,写回session数据 
        case Leader.PING:            
            ping(qp);            
            break;
	//PROPOSAL包,投票处理
        case Leader.PROPOSAL:            
            TxnHeader hdr = new TxnHeader();
            Record txn = SerializeUtils.deserializeTxn(qp.getData(), hdr);
            if (hdr.getZxid() != lastQueued + 1) {
                LOG.warn("Got zxid 0x"
                        + Long.toHexString(hdr.getZxid())
                        + " expected 0x"
                        + Long.toHexString(lastQueued + 1));
            }
            lastQueued = hdr.getZxid();
            fzk.logRequest(hdr, txn);
            break;
	//COMMIT包,提交事务
        case Leader.COMMIT:
            fzk.commit(qp.getZxid());
            break;
        case Leader.UPTODATE:
            LOG.error("Received an UPTODATE message after Follower started");
            break;
        case Leader.REVALIDATE:
            revalidate(qp);
            break;
        case Leader.SYNC:
            fzk.sync();
            break;
        }
    }

 这个时候Follower也初始化完成,再看leader主线程,Leader主线程之前在等待follower同步结束,结束之后,leader主线程进入主循环,检查follower是否down掉

      while (true) {
                Thread.sleep(self.tickTime / 2);
                if (!tickSkip) {
                    self.tick++;
                }
                HashSet<Long> syncedSet = new HashSet<Long>();

                // lock on the followers when we use it.
                syncedSet.add(self.getId());
		//检查每个follower是否还活着
                for (LearnerHandler f : getLearners()) {
                    // Synced set is used to check we have a supporting quorum, so only
                    // PARTICIPANT, not OBSERVER, learners should be used
                    if (f.synced() && f.getLearnerType() == LearnerType.PARTICIPANT) {
                        syncedSet.add(f.getSid());
                    }
                    f.ping();
                }
		//如果有follower挂掉导致投票不通过,则退出lead流程,重新选举
              if (!tickSkip && !self.getQuorumVerifier().containsQuorum(syncedSet)) {
                //if (!tickSkip && syncedCount < self.quorumPeers.size() / 2) {
                    // Lost quorum, shutdown
                  // TODO: message is wrong unless majority quorums used
                    shutdown("Only " + syncedSet.size() + " followers, need "
                            + (self.getVotingView().size() / 2));
                    // make sure the order is the same!
                    // the leader goes to looking
                    return;
              } 
              tickSkip = !tickSkip;
            }

 

 

你可能感兴趣的:(zookeeper,leader,follower)