前一篇介绍了Leader选举,这一篇介绍选举成功之后Leader和Follower之间的初始化。
先看Leader端操作
case LEADING:
LOG.info("LEADING");
try {
//初始化Leader对象
setLeader(makeLeader(logFactory));
//lead,线程在这里阻塞
leader.lead();
setLeader(null);
} catch (Exception e) {
LOG.warn("Unexpected exception",e);
} finally {
if (leader != null) {
leader.shutdown("Forcing shutdown");
setLeader(null);
}
setPeerState(ServerState.LOOKING);
}
break;
}
Leader初始化
Leader(QuorumPeer self,LeaderZooKeeperServer zk) throws IOException {
this.self = self;
try {
//打开lead端口,这里是2888
ss = new ServerSocket();
ss.setReuseAddress(true);
ss.bind(new InetSocketAddress(self.getQuorumAddress().getPort()));
} catch (BindException e) {
LOG.error("Couldn't bind to port "
+ self.getQuorumAddress().getPort(), e);
throw e;
}
this.zk=zk;
}
具体lead过程
self.tick = 0;
//从本地文件恢复数据
zk.loadData();
//leader的状态信息
leaderStateSummary = new StateSummary(self.getCurrentEpoch(), zk.getLastProcessedZxid());
// Start thread that waits for connection requests from
// new followers.
//启动lead端口的监听线程,专门用来监听新的follower
cnxAcceptor = new LearnerCnxAcceptor();
cnxAcceptor.start();
readyToStart = true;
//等待足够多的follower进来,代表自己确实是leader,此处lead线程可能会等待
long epoch = getEpochToPropose(self.getId(), self.getAcceptedEpoch());
.......
等待follower连接
/**
 * Barrier on which the leader thread and the learner handler threads agree
 * on the epoch to propose.
 * <p>
 * Every caller registers its {@code sid} in {@code connectingFollowers}. Once
 * that set contains this server's own id and forms a quorum, the epoch is
 * fixed, stored as the accepted epoch, and all waiting callers are released.
 * Callers arriving before that point wait for at most
 * {@code initLimit * tickTime} milliseconds.
 *
 * @param sid               server id of the caller being registered
 * @param lastAcceptedEpoch last epoch accepted by that server; the proposed
 *                          epoch is raised to be strictly greater than it
 * @return the epoch to propose
 * @throws InterruptedException if the wait is interrupted, or if the quorum
 *                              has not formed when the wait times out
 * @throws IOException          declared by the signature
 */
public long getEpochToPropose(long sid, long lastAcceptedEpoch) throws InterruptedException, IOException {
    synchronized (connectingFollowers) {
        // The epoch has already been decided: nothing left to negotiate.
        if (!waitingForNewEpoch) {
            return epoch;
        }
        if (lastAcceptedEpoch >= epoch) {
            epoch = lastAcceptedEpoch + 1;
        }
        // Register the caller so the quorum check below can count it.
        connectingFollowers.add(sid);
        QuorumVerifier verifier = self.getQuorumVerifier();
        boolean quorumFormed = connectingFollowers.contains(self.getId())
                && verifier.containsQuorum(connectingFollowers);
        if (quorumFormed) {
            // Enough participants (including this server): fix the epoch
            // and release every thread blocked in the wait loop below.
            waitingForNewEpoch = false;
            self.setAcceptedEpoch(epoch);
            connectingFollowers.notifyAll();
            return epoch;
        }
        // Not enough participants yet: wait, bounded by initLimit ticks.
        final long startedAt = System.currentTimeMillis();
        final long deadline = startedAt + self.getInitLimit() * self.getTickTime();
        for (long now = startedAt;
                waitingForNewEpoch && now < deadline;
                now = System.currentTimeMillis()) {
            connectingFollowers.wait(deadline - now);
        }
        // Still undecided after the deadline: give up so the caller can
        // abandon this round.
        if (waitingForNewEpoch) {
            throw new InterruptedException("Timeout while waiting for epoch from quorum");
        }
        return epoch;
    }
}
好的,这个时候我们假设其他follower还没连接进来,那Leader就会在此等待。再来看Follower的初始化过程
case FOLLOWING:
try {
LOG.info("FOLLOWING");
//初始化Follower对象
setFollower(makeFollower(logFactory));
//follow动作,线程在此等待
follower.followLeader();
} catch (Exception e) {
LOG.warn("Unexpected exception",e);
} finally {
follower.shutdown();
setFollower(null);
setPeerState(ServerState.LOOKING);
}
break;
具体follow过程
void followLeader() throws InterruptedException {
.......
try {
//根据sid找到对应leader,拿到lead连接信息
InetSocketAddress addr = findLeader();
try {
//连接leader
connectToLeader(addr);
//注册follower,根据Leader和follower协议,主要是同步选举轮数
long newEpochZxid = registerWithLeader(Leader.FOLLOWERINFO);
//check to see if the leader zxid is lower than ours
//this should never happen but is just a safety check
long newEpoch = ZxidUtils.getEpochFromZxid(newEpochZxid);
if (newEpoch < self.getAcceptedEpoch()) {
LOG.error("Proposed leader epoch " + ZxidUtils.zxidToString(newEpochZxid)
+ " is less than our accepted epoch " + ZxidUtils.zxidToString(self.getAcceptedEpoch()));
throw new IOException("Error: Epoch of leader is lower");
}
//同步数据
syncWithLeader(newEpochZxid);
QuorumPacket qp = new QuorumPacket();
//接受Leader消息,执行并反馈给leader,线程在此自旋
while (self.isRunning()) {
readPacket(qp);
processPacket(qp);
}
......
}
连接leader
/**
 * Opens the TCP connection to the leader and sets up the archive streams
 * used to exchange packets with it.
 * <p>
 * Up to five connection attempts are made, sleeping one second after each
 * failed attempt. The read timeout is {@code tickTime * initLimit}; the
 * connect timeout is {@code tickTime * syncLimit}.
 *
 * @param addr address of the leader to connect to
 * @throws IOException          if the fifth attempt fails, or if the
 *                              socket streams cannot be obtained
 * @throws ConnectException     declared by the signature
 * @throws InterruptedException if the sleep between attempts is interrupted
 */
protected void connectToLeader(InetSocketAddress addr)
        throws IOException, ConnectException, InterruptedException {
    final int maxAttempts = 5;
    sock = new Socket();
    // Read timeout while initializing: initLimit ticks.
    sock.setSoTimeout(self.tickTime * self.initLimit);
    for (int attempt = 0; attempt < maxAttempts; attempt++) {
        try {
            // Connect timeout: syncLimit ticks.
            sock.connect(addr, self.tickTime * self.syncLimit);
            sock.setTcpNoDelay(nodelay);
            break;
        } catch (IOException connectFailure) {
            final boolean lastAttempt = (attempt == maxAttempts - 1);
            if (lastAttempt) {
                // Out of retries: propagate the failure to the caller.
                LOG.error("Unexpected exception",connectFailure);
                throw connectFailure;
            }
            LOG.warn("Unexpected exception, tries="+attempt+
                    ", connecting to " + addr,connectFailure);
            // Start the next attempt with a fresh socket.
            sock = new Socket();
            sock.setSoTimeout(self.tickTime * self.initLimit);
        }
        Thread.sleep(1000);
    }
    BufferedInputStream bufferedInput = new BufferedInputStream(sock.getInputStream());
    leaderIs = BinaryInputArchive.getArchive(bufferedInput);
    bufferedOutput = new BufferedOutputStream(sock.getOutputStream());
    leaderOs = BinaryOutputArchive.getArchive(bufferedOutput);
}
假设这里follower顺利连上了leader,此时leader端会为每个follower启动单独IO线程,请看LearnerCnxAcceptor代码
public void run() {
try {
while (!stop) {
try{
//线程在此等待连接
Socket s = ss.accept();
// start with the initLimit, once the ack is processed
// in LearnerHandler switch to the syncLimit
//读超时设为initLimit时间
s.setSoTimeout(self.tickTime * self.initLimit);
s.setTcpNoDelay(nodelay);
//为每个follower启动单独线程,处理IO
LearnerHandler fh = new LearnerHandler(s, Leader.this);
fh.start();
} catch (SocketException e) {
......
}
leader端为follower建立IO线程,其处理过程和follower自身的主线程根据协议相互交互,以下将通过数据交换场景式分析这个过程,leader端IO线程LearnerHandler启动
ia = BinaryInputArchive.getArchive(new BufferedInputStream(sock
.getInputStream()));
bufferedOutput = new BufferedOutputStream(sock.getOutputStream());
oa = BinaryOutputArchive.getArchive(bufferedOutput);
//IO线程等待follower发送包
QuorumPacket qp = new QuorumPacket();
ia.readRecord(qp, "packet");
follower端进入registerWithLeader处理
long lastLoggedZxid = self.getLastLoggedZxid();
QuorumPacket qp = new QuorumPacket();
//type为Leader.FOLLOWERINFO
qp.setType(pktType);
qp.setZxid(ZxidUtils.makeZxid(self.getAcceptedEpoch(), 0));
/*
* Add sid to payload
*/
LearnerInfo li = new LearnerInfo(self.getId(), 0x10000);
ByteArrayOutputStream bsid = new ByteArrayOutputStream();
BinaryOutputArchive boa = BinaryOutputArchive.getArchive(bsid);
boa.writeRecord(li, "LearnerInfo");
qp.setData(bsid.toByteArray());
//发送LearnerInfo包
writePacket(qp, true);
//等待leader响应
readPacket(qp);
leader端收到包处理
byte learnerInfoData[] = qp.getData();
if (learnerInfoData != null) {
if (learnerInfoData.length == 8) {
ByteBuffer bbsid = ByteBuffer.wrap(learnerInfoData);
this.sid = bbsid.getLong();
} else {
//反序列化LearnerInfo
LearnerInfo li = new LearnerInfo();
ByteBufferInputStream.byteBuffer2Record(ByteBuffer.wrap(learnerInfoData), li);
this.sid = li.getServerid();
this.version = li.getProtocolVersion();
}
} else {
this.sid = leader.followerCounter.getAndDecrement();
}
......
//follower的选举轮数
long lastAcceptedEpoch = ZxidUtils.getEpochFromZxid(qp.getZxid());
long peerLastZxid;
StateSummary ss = null;
long zxid = qp.getZxid();
//将follower加入到connectingFollowers中,因为满足半数机器的条件,此时在此等待的leader主线程会退出等待,继续往下处理
long newEpoch = leader.getEpochToPropose(this.getSid(), lastAcceptedEpoch);
......
//发一个Leader.LEADERINFO包,带上新的epoch id
byte ver[] = new byte[4];
ByteBuffer.wrap(ver).putInt(0x10000);
QuorumPacket newEpochPacket = new QuorumPacket(Leader.LEADERINFO, ZxidUtils.makeZxid(newEpoch, 0), ver, null);
oa.writeRecord(newEpochPacket, "packet");
bufferedOutput.flush();
QuorumPacket ackEpochPacket = new QuorumPacket();
//等待follower响应
ia.readRecord(ackEpochPacket, "packet");
if (ackEpochPacket.getType() != Leader.ACKEPOCH) {
LOG.error(ackEpochPacket.toString()
+ " is not ACKEPOCH");
return;
}
ByteBuffer bbepoch = ByteBuffer.wrap(ackEpochPacket.getData());
ss = new StateSummary(bbepoch.getInt(), ackEpochPacket.getZxid());
leader.waitForEpochAck(this.getSid(), ss);
}
此时follower收到LEADERINFO包处理:
final long newEpoch = ZxidUtils.getEpochFromZxid(qp.getZxid());
if (qp.getType() == Leader.LEADERINFO) {
// we are connected to a 1.0 server so accept the new epoch and read the next packet
leaderProtocolVersion = ByteBuffer.wrap(qp.getData()).getInt();
byte epochBytes[] = new byte[4];
final ByteBuffer wrappedEpochBytes = ByteBuffer.wrap(epochBytes);
//将自己的epoch发给leader
if (newEpoch > self.getAcceptedEpoch()) {
wrappedEpochBytes.putInt((int)self.getCurrentEpoch());
self.setAcceptedEpoch(newEpoch);
}
......
//发送一个Leader.ACKEPOCH包,带上自己的最大zxid
QuorumPacket ackNewEpoch = new QuorumPacket(Leader.ACKEPOCH, lastLoggedZxid, epochBytes, null);
writePacket(ackNewEpoch, true);
return ZxidUtils.makeZxid(newEpoch, 0);
}
leader收到Leader.ACKEPOCH后进入waitForEpochAck处理
/**
 * Barrier on which callers wait until a quorum has acknowledged the new epoch.
 * <p>
 * Returns immediately once {@code electionFinished} is set. Otherwise the
 * caller's {@code id} is recorded in {@code electingFollowers} (only when
 * {@code ss.getCurrentEpoch() != -1}); if the set then contains this server's
 * own id and forms a quorum, the barrier is opened and all waiters are notified.
 * Failing that, the caller waits for at most {@code initLimit * tickTime}
 * milliseconds.
 *
 * @param id server id of the caller
 * @param ss state summary reported by that server
 * @throws IOException declared by the signature (the elided section may throw it -- not visible here)
 * @throws InterruptedException if the wait is interrupted or times out before the barrier opens
 */
public void waitForEpochAck(long id, StateSummary ss) throws IOException, InterruptedException {
synchronized(electingFollowers) {
// Barrier already opened: nothing to wait for.
if (electionFinished) {
return;
}
if (ss.getCurrentEpoch() != -1) {
......
// Record this server in the set of servers that have acknowledged.
electingFollowers.add(id);
}
QuorumVerifier verifier = self.getQuorumVerifier();
// Open the barrier once this server itself plus a quorum have acknowledged,
// and wake every thread blocked in the wait loop below.
if (electingFollowers.contains(self.getId()) && verifier.containsQuorum(electingFollowers)) {
electionFinished = true;
electingFollowers.notifyAll();
}
// Not enough acknowledgements yet: wait, bounded by initLimit ticks.
else {
long start = System.currentTimeMillis();
long cur = start;
long end = start + self.getInitLimit()*self.getTickTime();
while(!electionFinished && cur < end) {
electingFollowers.wait(end - cur);
cur = System.currentTimeMillis();
}
// Deadline passed without the barrier opening: signal the timeout to the caller.
if (!electionFinished) {
throw new InterruptedException("Timeout while waiting for epoch to be acked by quorum");
}
}
}
}
假设IO线程在此等待,此时leader主线程在getEpochToPropose恢复后继续执行
long epoch = getEpochToPropose(self.getId(), self.getAcceptedEpoch());
zk.setZxid(ZxidUtils.makeZxid(epoch, 0));
synchronized(this){
lastProposed = zk.getZxid();
}
//发起一个NEWLEADER投票
newLeaderProposal.packet = new QuorumPacket(NEWLEADER, zk.getZxid(),
null, null);
......
//投票箱
outstandingProposals.put(newLeaderProposal.packet.getZxid(), newLeaderProposal);
//自己默认同意
newLeaderProposal.ackSet.add(self.getId());
//等待follower进来
waitForEpochAck(self.getId(), leaderStateSummary);
由于之前已经有follower进来,满足选举条件,则IO线程和leader主线程都继续往下执行,先看leader主线程
//当前选票轮数
self.setCurrentEpoch(epoch);
// We have to get at least a majority of servers in sync with
// us. We do this by waiting for the NEWLEADER packet to get
// acknowledged
//等待确认NEWLEADER包的follower足够多,那自己真的是leader了
while (!self.getQuorumVerifier().containsQuorum(newLeaderProposal.ackSet)){
//while (newLeaderProposal.ackCount <= self.quorumPeers.size() / 2) {
//如果超过初始化时间initlimit,则退出lead过程,重新选举,有可能是follower同步数据比较慢
if (self.tick > self.initLimit) {
// Followers aren't syncing fast enough,
// renounce leadership!
StringBuilder ackToString = new StringBuilder();
for(Long id : newLeaderProposal.ackSet)
ackToString.append(id + ": ");
shutdown("Waiting for a quorum of followers, only synced with: " + ackToString);
HashSet followerSet = new HashSet();
for(LearnerHandler f : getLearners()) {
followerSet.add(f.getSid());
}
if (self.getQuorumVerifier().containsQuorum(followerSet)) {
//if (followers.size() >= self.quorumPeers.size() / 2) {
LOG.warn("Enough followers present. "+
"Perhaps the initTicks need to be increased.");
}
return;
}
Thread.sleep(self.tickTime);
self.tick++;
}
这个时候IO线程继续执行
/* the default to send to the follower */
//默认发送一个SNAP包,要求follower同步数据
int packetToSend = Leader.SNAP;
long zxidToSend = 0;
long leaderLastZxid = 0;
/** the packets that the follower needs to get updates from **/
long updates = peerLastZxid;
/* we are sending the diff check if we have proposals in memory to be able to
* send a diff to the
*/
ReentrantReadWriteLock lock = leader.zk.getZKDatabase().getLogLock();
ReadLock rl = lock.readLock();
try {
rl.lock();
final long maxCommittedLog = leader.zk.getZKDatabase().getmaxCommittedLog();
final long minCommittedLog = leader.zk.getZKDatabase().getminCommittedLog();
LOG.info("Synchronizing with Follower sid: " + sid
+" maxCommittedLog=0x"+Long.toHexString(maxCommittedLog)
+" minCommittedLog=0x"+Long.toHexString(minCommittedLog)
+" peerLastZxid=0x"+Long.toHexString(peerLastZxid));
//看看是否还有需要投的票
LinkedList proposals = leader.zk.getZKDatabase().getCommittedLog();
//如果有,则处理这些投票
if (proposals.size() != 0) {
//如果follower还没处理这个分布式事务,有可能down掉了又恢复,则继续处理这个事务
if ((maxCommittedLog >= peerLastZxid)
&& (minCommittedLog <= peerLastZxid)) {
.......
// If we are here, we can use committedLog to sync with
// follower. Then we only need to decide whether to
// send trunc or not
packetToSend = Leader.DIFF;
zxidToSend = maxCommittedLog;
for (Proposal propose: proposals) {
// skip the proposals the peer already has
//这个已经被处理过了,无视
if (propose.packet.getZxid() <= peerLastZxid) {
prevProposalZxid = propose.packet.getZxid();
continue;
} else {
// If we are sending the first packet, figure out whether to trunc
// in case the follower has some proposals that the leader doesn't
//发第一个事务之前先确认follower是否比leader超前
if (firstPacket) {
firstPacket = false;
// Does the peer have some proposals that the leader hasn't seen yet
//follower处理事务比leader多,则发送TRUNC包,让follower回滚到和leader一致
if (prevProposalZxid < peerLastZxid) {
// send a trunc message before sending the diff
packetToSend = Leader.TRUNC;
zxidToSend = prevProposalZxid;
updates = zxidToSend;
}
}
//将事务发送到队列
queuePacket(propose.packet);
//立马接一个COMMIT包
QuorumPacket qcommit = new QuorumPacket(Leader.COMMIT, propose.packet.getZxid(),
null, null);
queuePacket(qcommit);
}
}
}
//如果follower超前了,则发送TRUNC包,让其和leader同步
else if (peerLastZxid > maxCommittedLog) {
LOG.debug("Sending TRUNC to follower zxidToSend=0x{} updates=0x{}",
Long.toHexString(maxCommittedLog),
Long.toHexString(updates));
packetToSend = Leader.TRUNC;
zxidToSend = maxCommittedLog;
updates = zxidToSend;
} else {
LOG.warn("Unhandled proposal scenario");
}
}
//如果follower和leader同步,则发送DIFF包,而不需要follower拉数据
else if (peerLastZxid == leader.zk.getZKDatabase().getDataTreeLastProcessedZxid()) {
.....
packetToSend = Leader.DIFF;
zxidToSend = peerLastZxid;
.......
//NEWLEADER包添加到发送队列
QuorumPacket newLeaderQP = new QuorumPacket(Leader.NEWLEADER,
ZxidUtils.makeZxid(newEpoch, 0), null, null);
if (getVersion() < 0x10000) {
oa.writeRecord(newLeaderQP, "packet");
} else {
queuedPackets.add(newLeaderQP);
}
bufferedOutput.flush();
//Need to set the zxidToSend to the latest zxid
if (packetToSend == Leader.SNAP) {
zxidToSend = leader.zk.getZKDatabase().getDataTreeLastProcessedZxid();
}
//发送一个DIFF或SNAP包
oa.writeRecord(new QuorumPacket(packetToSend, zxidToSend, null, null), "packet");
bufferedOutput.flush();
......
// Start sending packets
//启动一个异步发送线程
new Thread() {
public void run() {
Thread.currentThread().setName(
"Sender-" + sock.getRemoteSocketAddress());
try {
sendPackets();
} catch (InterruptedException e) {
LOG.warn("Unexpected interruption",e);
}
}
}.start();
/*
* Have to wait for the first ACK, wait until
* the leader is ready, and only then we can
* start processing messages.
*/
//等待follower确认
qp = new QuorumPacket();
ia.readRecord(qp, "packet");
在我们这个集群里,由于是刚启动的,follower和leader的数据是一致的,所以leader会直接发送DIFF包,然后再发送一个NEWLEADER包。
接着follower收到包处理,在syncWithLeader中
QuorumPacket ack = new QuorumPacket(Leader.ACK, 0, null, null);
QuorumPacket qp = new QuorumPacket();
long newEpoch = ZxidUtils.getEpochFromZxid(newLeaderZxid);
readPacket(qp);
LinkedList packetsCommitted = new LinkedList();
LinkedList packetsNotCommitted = new LinkedList();
synchronized (zk) {
//DIFF包
if (qp.getType() == Leader.DIFF) {
LOG.info("Getting a diff from the leader 0x" + Long.toHexString(qp.getZxid()));
}
//如果是SNAP包,则从leader复制一份镜像数据到本地内存
else if (qp.getType() == Leader.SNAP) {
LOG.info("Getting a snapshot from leader");
// The leader is going to dump the database
// clear our own database and read
zk.getZKDatabase().clear();
zk.getZKDatabase().deserializeSnapshot(leaderIs);
String signature = leaderIs.readString("signature");
if (!signature.equals("BenWasHere")) {
LOG.error("Missing signature. Got " + signature);
throw new IOException("Missing signature");
}
}
//TRUNC包,回滚到对应事务
else if (qp.getType() == Leader.TRUNC) {
//we need to truncate the log to the lastzxid of the leader
LOG.warn("Truncating log to get in sync with the leader 0x"
+ Long.toHexString(qp.getZxid()));
boolean truncated=zk.getZKDatabase().truncateLog(qp.getZxid());
......
//最新的事务id
zk.getZKDatabase().setlastProcessedZxid(qp.getZxid());
//启动过期session检查
zk.createSessionTracker();
long lastQueued = 0;
// in V1.0 we take a snapshot when we get the NEWLEADER message, but in pre V1.0
// we take the snapshot at the UPDATE, since V1.0 also gets the UPDATE (after the NEWLEADER)
// we need to make sure that we don't take the snapshot twice.
boolean snapshotTaken = false;
// we are now going to start getting transactions to apply followed by an UPTODATE
outerLoop:
//同步完数据后,准备执行投票
while (self.isRunning()) {
readPacket(qp);
switch(qp.getType()) {
//将投票添加到待处理列表
case Leader.PROPOSAL:
PacketInFlight pif = new PacketInFlight();
pif.hdr = new TxnHeader();
pif.rec = SerializeUtils.deserializeTxn(qp.getData(), pif.hdr);
if (pif.hdr.getZxid() != lastQueued + 1) {
LOG.warn("Got zxid 0x"
+ Long.toHexString(pif.hdr.getZxid())
+ " expected 0x"
+ Long.toHexString(lastQueued + 1));
}
lastQueued = pif.hdr.getZxid();
packetsNotCommitted.add(pif);
break;
//COMMIT则将事务交给Server处理掉
case Leader.COMMIT:
if (!snapshotTaken) {
pif = packetsNotCommitted.peekFirst();
if (pif.hdr.getZxid() != qp.getZxid()) {
LOG.warn("Committing " + qp.getZxid() + ", but next proposal is " + pif.hdr.getZxid());
} else {
zk.processTxn(pif.hdr, pif.rec);
packetsNotCommitted.remove();
}
} else {
packetsCommitted.add(qp.getZxid());
}
break;
case Leader.INFORM:
TxnHeader hdr = new TxnHeader();
Record txn = SerializeUtils.deserializeTxn(qp.getData(), hdr);
zk.processTxn(hdr, txn);
break;
//UPTODATE包,说明同步成功,退出循环
case Leader.UPTODATE:
if (!snapshotTaken) { // true for the pre v1.0 case
zk.takeSnapshot();
self.setCurrentEpoch(newEpoch);
}
self.cnxnFactory.setZooKeeperServer(zk);
break outerLoop;
//NEWLEADER包,说明之前残留的投票已经处理完了,则将内存中数据写文件,并发送ACK包
case Leader.NEWLEADER: // it will be NEWLEADER in v1.0
zk.takeSnapshot();
self.setCurrentEpoch(newEpoch);
snapshotTaken = true;
writePacket(new QuorumPacket(Leader.ACK, newLeaderZxid, null, null), true);
break;
}
}
}
follower在这里同步leader数据,在拿到NEWLEADER包之后把内存数据做快照写入文件,并发送ACK包。leader端的IO线程(LearnerHandler)收到后处理如下:
qp = new QuorumPacket();
ia.readRecord(qp, "packet");
if(qp.getType() != Leader.ACK){
LOG.error("Next packet was supposed to be an ACK");
return;
}
//ACK包处理,如果follower数据同步成功,则将它添加到NEWLEADER这个投票的结果中,这样leader主线程就会恢复执行
leader.processAck(this.sid, qp.getZxid(), sock.getLocalSocketAddress());
// now that the ack has been processed expect the syncLimit
sock.setSoTimeout(leader.self.tickTime * leader.self.syncLimit);
/*
* Wait until leader starts up
*/
//等待leader的server启动
synchronized(leader.zk){
while(!leader.zk.isRunning() && !this.isInterrupted()){
leader.zk.wait(20);
}
}
// Mutation packets will be queued during the serialize,
// so we need to mark when the peer can actually start
// using the data
//
//leader server启动后,发送一个UPTODATE包
queuedPackets.add(new QuorumPacket(Leader.UPTODATE, -1, null, null));
具体的ACK包处理
synchronized public void processAck(long sid, long zxid, SocketAddress followerAddr) {
......
Proposal p = outstandingProposals.get(zxid);
......
//将follower添加到结果列表
p.ackSet.add(sid);
......
//票数够了,则启动leader的server
if (self.getQuorumVerifier().containsQuorum(p.ackSet)){
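//(此处省略了一层内部判断 if ((zxid & 0xffffffffL) != 0) { 普通事务提案:提交后直接return },下面的else与这个被省略的内层if配对,即NEWLEADER提案的情况)
.......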
} else {
lastCommitted = zxid;
LOG.info("Have quorum of supporters; starting up and setting last processed zxid: 0x{}",
Long.toHexString(zk.getZxid()));
//启动leader的zookeeper server
zk.startup();
zk.getZKDatabase().setlastProcessedZxid(zk.getZxid());
}
}
}
由于follower进来已经满足投票条件,则leader 的server启动,如下
/**
 * Brings this server into the running state.
 * <p>
 * Creates the session tracker if none exists yet, starts it, builds the
 * request-processor chain, calls {@code registerJMX()}, and finally sets
 * {@code running} to true and wakes every thread waiting on this object's monitor.
 */
public void startup() {
// Create a tracker only when one has not been set up already.
if (sessionTracker == null) {
createSessionTracker();
}
// Start session tracking (session checking).
startSessionTracker();
// Build the request-processor chain.
setupRequestProcessors();
registerJMX();
// Set the flag and notify while holding the monitor, so that threads
// wait()-ing on this object observe running == true when they wake up.
synchronized (this) {
running = true;
notifyAll();
}
}
/**
 * Builds the leader's request-processor pipeline, wiring it from tail to head.
 * <p>
 * Each processor is constructed with the previously created one as an argument
 * (presumably its downstream "next" processor -- confirm against the constructors),
 * which gives the order:
 * PrepRequestProcessor -> ProposalRequestProcessor -> CommitProcessor
 * -> Leader.ToBeAppliedRequestProcessor -> FinalRequestProcessor.
 * The commit and prep processors are started here; the proposal processor is
 * initialized via {@code initialize()}.
 */
protected void setupRequestProcessors() {
// Tail of the chain: the final processor.
RequestProcessor finalProcessor = new FinalRequestProcessor(this);
RequestProcessor toBeAppliedProcessor = new Leader.ToBeAppliedRequestProcessor(
finalProcessor, getLeader().toBeApplied);
// Commit stage: confirms the outcome of proposals.
commitProcessor = new CommitProcessor(toBeAppliedProcessor,
Long.toString(getServerId()), false);
commitProcessor.start();
// Proposal stage: initiates proposals.
ProposalRequestProcessor proposalProcessor = new ProposalRequestProcessor(this,
commitProcessor);
proposalProcessor.initialize();
// Head of the chain: transaction pre-processing.
firstProcessor = new PrepRequestProcessor(this, proposalProcessor);
((PrepRequestProcessor)firstProcessor).start();
}
leader启动后,发送一个UPTODATE包,follower处理
//退出同步数据循环
case Leader.UPTODATE:
if (!snapshotTaken) { // true for the pre v1.0 case
zk.takeSnapshot();
self.setCurrentEpoch(newEpoch);
}
self.cnxnFactory.setZooKeeperServer(zk);
break outerLoop;
......
//再发ACK包
ack.setZxid(ZxidUtils.makeZxid(newEpoch, 0));
writePacket(ack, true);
leader的IO线程LearnerHandler进入主循环,收到ACK包处理
while (true) {
qp = new QuorumPacket();
ia.readRecord(qp, "packet");
......
tickOfLastAck = leader.self.tick;
ByteBuffer bb;
long sessionId;
int cxid;
int type;
switch (qp.getType()) {
//ACK包,看看之前的投票是否结束
case Leader.ACK:
......
leader.processAck(this.sid, qp.getZxid(), sock.getLocalSocketAddress());
break;
//PING包更新下session的超时时间,往前推
case Leader.PING:
// Process the touches
ByteArrayInputStream bis = new ByteArrayInputStream(qp
.getData());
DataInputStream dis = new DataInputStream(bis);
while (dis.available() > 0) {
long sess = dis.readLong();
int to = dis.readInt();
leader.zk.touch(sess, to);
}
break;
//REVALIDATE包,检查session是否还有效
case Leader.REVALIDATE:
bis = new ByteArrayInputStream(qp.getData());
dis = new DataInputStream(bis);
long id = dis.readLong();
int to = dis.readInt();
ByteArrayOutputStream bos = new ByteArrayOutputStream();
DataOutputStream dos = new DataOutputStream(bos);
dos.writeLong(id);
boolean valid = leader.zk.touch(id, to);
if (valid) {
try {
//set the session owner
// as the follower that
// owns the session
leader.zk.setOwner(id, this);
} catch (SessionExpiredException e) {
LOG.error("Somehow session " + Long.toHexString(id) + " expired right after being renewed! (impossible)", e);
}
}
if (LOG.isTraceEnabled()) {
ZooTrace.logTraceMessage(LOG,
ZooTrace.SESSION_TRACE_MASK,
"Session 0x" + Long.toHexString(id)
+ " is valid: "+ valid);
}
dos.writeBoolean(valid);
qp.setData(bos.toByteArray());
queuedPackets.add(qp);
break;
//REQUEST包,事务请求,follower会将事务请求转发给leader处理
case Leader.REQUEST:
bb = ByteBuffer.wrap(qp.getData());
sessionId = bb.getLong();
cxid = bb.getInt();
type = bb.getInt();
bb = bb.slice();
Request si;
if(type == OpCode.sync){
si = new LearnerSyncRequest(this, sessionId, cxid, type, bb, qp.getAuthinfo());
} else {
si = new Request(null, sessionId, cxid, type, bb, qp.getAuthinfo());
}
si.setOwner(this);
leader.zk.submitRequest(si);
break;
default:
}
}
这个时候LearnerHandler线程已经启动完成,follower发完ACK包后
writePacket(ack, true);
//读超时为syncLimit时间
sock.setSoTimeout(self.tickTime * self.syncLimit);
//启动follower的zookeeper server
zk.startup();
// We need to log the stuff that came in between the snapshot and the uptodate
if (zk instanceof FollowerZooKeeperServer) {
FollowerZooKeeperServer fzk = (FollowerZooKeeperServer)zk;
for(PacketInFlight p: packetsNotCommitted) {
fzk.logRequest(p.hdr, p.rec);
}
for(Long zxid: packetsCommitted) {
fzk.commit(zxid);
}
}
Follower的zookeeper server启动
/**
 * Builds the follower's request processors.
 * <p>
 * Two separately wired groups are created and started:
 * FollowerRequestProcessor -> CommitProcessor -> FinalRequestProcessor
 * (reachable through {@code firstProcessor}), and
 * SyncRequestProcessor -> SendAckRequestProcessor (held in {@code syncProcessor}).
 * The arrows follow constructor arguments; presumably each argument is the
 * downstream "next" processor -- confirm against the constructors.
 */
@Override
protected void setupRequestProcessors() {
// Tail of the main chain.
RequestProcessor finalProcessor = new FinalRequestProcessor(this);
commitProcessor = new CommitProcessor(finalProcessor,
Long.toString(getServerId()), true);
commitProcessor.start();
// Head of the main chain.
firstProcessor = new FollowerRequestProcessor(this, commitProcessor);
((FollowerRequestProcessor) firstProcessor).start();
// Separate sync processor, constructed around a SendAckRequestProcessor
// that is given this server's follower (cast to Learner).
syncProcessor = new SyncRequestProcessor(this,
new SendAckRequestProcessor((Learner)getFollower()));
syncProcessor.start();
}
Follower进入主处理
QuorumPacket qp = new QuorumPacket();
while (self.isRunning()) {
readPacket(qp);
processPacket(qp);
}
/**
 * Dispatches one packet received from the leader according to its type.
 * <p>
 * PING, PROPOSAL, COMMIT, UPTODATE, REVALIDATE and SYNC are handled; any
 * other packet type is silently ignored.
 *
 * @param qp the packet read from the leader
 * @throws IOException if handling the packet fails
 */
protected void processPacket(QuorumPacket qp) throws IOException{
    final int packetType = qp.getType();
    if (packetType == Leader.PING) {
        // Answer the leader's ping.
        ping(qp);
    } else if (packetType == Leader.PROPOSAL) {
        // A proposed transaction: decode it and hand it to the server for logging.
        TxnHeader header = new TxnHeader();
        Record txnRecord = SerializeUtils.deserializeTxn(qp.getData(), header);
        // Proposals are expected with consecutive zxids; a gap is only logged.
        if (header.getZxid() != lastQueued + 1) {
            LOG.warn("Got zxid 0x"
                    + Long.toHexString(header.getZxid())
                    + " expected 0x"
                    + Long.toHexString(lastQueued + 1));
        }
        lastQueued = header.getZxid();
        fzk.logRequest(header, txnRecord);
    } else if (packetType == Leader.COMMIT) {
        // Commit the transaction with the given zxid.
        fzk.commit(qp.getZxid());
    } else if (packetType == Leader.UPTODATE) {
        // UPTODATE is not expected once the follower is up; log it only.
        LOG.error("Received an UPTODATE message after Follower started");
    } else if (packetType == Leader.REVALIDATE) {
        revalidate(qp);
    } else if (packetType == Leader.SYNC) {
        fzk.sync();
    }
}
这个时候Follower也初始化完成,再看leader主线程,Leader主线程之前在等待follower同步结束,结束之后,leader主线程进入主循环,检查follower是否down掉
while (true) {
Thread.sleep(self.tickTime / 2);
if (!tickSkip) {
self.tick++;
}
HashSet syncedSet = new HashSet();
// lock on the followers when we use it.
syncedSet.add(self.getId());
//检查每个follower是否还活着
for (LearnerHandler f : getLearners()) {
// Synced set is used to check we have a supporting quorum, so only
// PARTICIPANT, not OBSERVER, learners should be used
if (f.synced() && f.getLearnerType() == LearnerType.PARTICIPANT) {
syncedSet.add(f.getSid());
}
f.ping();
}
//如果有follower挂掉导致投票不通过,则退出lead流程,重新选举
if (!tickSkip && !self.getQuorumVerifier().containsQuorum(syncedSet)) {
//if (!tickSkip && syncedCount < self.quorumPeers.size() / 2) {
// Lost quorum, shutdown
// TODO: message is wrong unless majority quorums used
shutdown("Only " + syncedSet.size() + " followers, need "
+ (self.getVotingView().size() / 2));
// make sure the order is the same!
// the leader goes to looking
return;
}
tickSkip = !tickSkip;
}