前面几篇文章简单介绍了zookeeper的单机server client处理。接下来几篇文章会介绍分布式部署下zookeeper的实现原理。我们假设有3台server的集群,zoo.cfg配置如下
- tickTime=2000
- dataDir=/home/admin/zk-data
- clientPort=2181
- #Learner初始化连接到Leader的超时时间
- initLimit=10
- #Learner和Leader之间消息发送,响应的超时时间
- syncLimit=5
- #集群配置,3台机器,2888为Leader服务端口,3888为选举时所用的端口
- server.1=master:2888:3888
- server.2=slave1:2888:3888
- server.3=slave2:2888:3888
在server.1的$dataDir下
- echo '1'>myid
启动server.1
- ./zkServer.sh start
分析之前先看看选举相关的类图
入口函数QuorumPeerMain主线程启动
- public void runFromConfig(QuorumPeerConfig config) throws IOException {
- ......
- LOG.info("Starting quorum peer");
- try {
- //对client提供读写的server,一般是2181端口
- ServerCnxnFactory cnxnFactory = ServerCnxnFactory.createFactory();
- cnxnFactory.configure(config.getClientPortAddress(),
- config.getMaxClientCnxns());
- //zk的逻辑主线程,负责选举,投票等
- quorumPeer = new QuorumPeer();
- quorumPeer.setClientPortAddress(config.getClientPortAddress());
- quorumPeer.setTxnFactory(new FileTxnSnapLog(
- new File(config.getDataLogDir()),
- new File(config.getDataDir())));
- //集群机器地址
- quorumPeer.setQuorumPeers(config.getServers());
- quorumPeer.setElectionType(config.getElectionAlg());
- //本机的集群编号
- quorumPeer.setMyid(config.getServerId());
- quorumPeer.setTickTime(config.getTickTime());
- quorumPeer.setMinSessionTimeout(config.getMinSessionTimeout());
- quorumPeer.setMaxSessionTimeout(config.getMaxSessionTimeout());
- quorumPeer.setInitLimit(config.getInitLimit());
- quorumPeer.setSyncLimit(config.getSyncLimit());
- //投票决定方式,默认超过半数就通过
- quorumPeer.setQuorumVerifier(config.getQuorumVerifier());
- quorumPeer.setCnxnFactory(cnxnFactory);
- quorumPeer.setZKDatabase(new ZKDatabase(quorumPeer.getTxnFactory()));
- quorumPeer.setLearnerType(config.getPeerType());
- //启动主线程
- quorumPeer.start();
- quorumPeer.join();
- } catch (InterruptedException e) {
- // warn, but generally this is ok
- LOG.warn("Quorum Peer interrupted", e);
- }
- }
- @Override
- public synchronized void start() {
- //恢复DB,从zxid中恢复epoch变量,代表投票轮数
- loadDataBase();
- //启动针对client的IO线程
- cnxnFactory.start();
- //选举初始化,主要是从配置获取选举类型
- startLeaderElection();
- //启动
- super.start();
- }
- private void loadDataBase() {
- try {
- //从本地文件恢复db
- zkDb.loadDataBase();
- // load the epochs
- //从最新的zxid恢复epoch变量,zxid共64位,高32位是epoch值,低32位是事务计数器
- long lastProcessedZxid = zkDb.getDataTree().lastProcessedZxid;
- long epochOfZxid = ZxidUtils.getEpochFromZxid(lastProcessedZxid);
- try {
- currentEpoch = readLongFromFile(CURRENT_EPOCH_FILENAME);
- } catch(FileNotFoundException e) {
- // pick a reasonable epoch number
- // this should only happen once when moving to a
- // new code version
- currentEpoch = epochOfZxid;
- LOG.info(CURRENT_EPOCH_FILENAME
- + " not found! Creating with a reasonable default of {}. This should only happen when you are upgrading your installation",
- currentEpoch);
- writeLongToFile(CURRENT_EPOCH_FILENAME, currentEpoch);
- }
- if (epochOfZxid > currentEpoch) {
- throw new IOException("The current epoch, " + ZxidUtils.zxidToString(currentEpoch) + ", is older than the last zxid, " + lastProcessedZxid);
- }
- .......
- }
- synchronized public void startLeaderElection() {
- try {
- //先投自己
- currentVote = new Vote(myid, getLastLoggedZxid(), getCurrentEpoch());
- } catch(IOException e) {
- RuntimeException re = new RuntimeException(e.getMessage());
- re.setStackTrace(e.getStackTrace());
- throw re;
- }
- //从配置中拿自己的选举地址
- for (QuorumServer p : getView().values()) {
- if (p.id == myid) {
- myQuorumAddr = p.addr;
- break;
- }
- }
- ......
- //根据配置,获取选举算法
- this.electionAlg = createElectionAlgorithm(electionType);
- }
- protected Election createElectionAlgorithm(int electionAlgorithm){
- Election le=null;
- //TODO: use a factory rather than a switch
- switch (electionAlgorithm) {
- case 0:
- le = new LeaderElection(this);
- break;
- case 1:
- le = new AuthFastLeaderElection(this);
- break;
- case 2:
- le = new AuthFastLeaderElection(this, true);
- break;
- case 3:
- //leader选举IO负责类
- qcm = new QuorumCnxManager(this);
- QuorumCnxManager.Listener listener = qcm.listener;
- //启动已绑定3888端口的选举线程,等待集群其他机器连接
- if(listener != null){
- listener.start();
- //基于TCP的选举算法
- le = new FastLeaderElection(this, qcm);
- } else {
- LOG.error("Null listener when initializing cnx manager");
- }
- break;
- default:
- assert false;
- }
- return le;
- }
- private void starter(QuorumPeer self, QuorumCnxManager manager) {
- this.self = self;
- proposedLeader = -1;
- proposedZxid = -1;
- //业务层发送队列,业务对象ToSend
- sendqueue = new LinkedBlockingQueue<ToSend>();
- //业务层接收队列,业务对象Notification
- recvqueue = new LinkedBlockingQueue<Notification>();
- //
- this.messenger = new Messenger(manager);
- }
- Messenger(QuorumCnxManager manager) {
- //启动业务层发送线程,将消息发给IO负责类QuorumCnxManager
- this.ws = new WorkerSender(manager);
- Thread t = new Thread(this.ws,
- "WorkerSender[myid=" + self.getId() + "]");
- t.setDaemon(true);
- t.start();
- //启动业务层接收线程,从IO负责类QuorumCnxManager接收消息
- this.wr = new WorkerReceiver(manager);
- t = new Thread(this.wr,
- "WorkerReceiver[myid=" + self.getId() + "]");
- t.setDaemon(true);
- t.start();
- }
- run(){
- .......
- try {
- /*
- * Main loop
- */
- while (running) {
- switch (getPeerState()) {
- //如果状态是LOOKING,则进入选举流程
- case LOOKING:
- LOG.info("LOOKING");
- ......
- try {
- //选举算法开始选举,主线程可能在这里耗比较长时间
- setCurrentVote(makeLEStrategy().lookForLeader());
- } catch (Exception e) {
- LOG.warn("Unexpected exception", e);
- setPeerState(ServerState.LOOKING);
- }
- }
- break;
- //其他流程处理
- case OBSERVING:
- try {
- LOG.info("OBSERVING");
- setObserver(makeObserver(logFactory));
- observer.observeLeader();
- } catch (Exception e) {
- LOG.warn("Unexpected exception",e );
- } finally {
- observer.shutdown();
- setObserver(null);
- setPeerState(ServerState.LOOKING);
- }
- break;
- case FOLLOWING:
- try {
- LOG.info("FOLLOWING");
- setFollower(makeFollower(logFactory));
- follower.followLeader();
- } catch (Exception e) {
- LOG.warn("Unexpected exception",e);
- } finally {
- follower.shutdown();
- setFollower(null);
- setPeerState(ServerState.LOOKING);
- }
- break;
- case LEADING:
- LOG.info("LEADING");
- try {
- setLeader(makeLeader(logFactory));
- leader.lead();
- setLeader(null);
- } catch (Exception e) {
- LOG.warn("Unexpected exception",e);
- } finally {
- if (leader != null) {
- leader.shutdown("Forcing shutdown");
- setLeader(null);
- }
- setPeerState(ServerState.LOOKING);
- }
- break;
- }
- .......
- }
- public Vote lookForLeader() throws InterruptedException {
- ......
- try {
- //收到的投票
- HashMap<Long, Vote> recvset = new HashMap<Long, Vote>();
- HashMap<Long, Vote> outofelection = new HashMap<Long, Vote>();
- int notTimeout = finalizeWait;
- synchronized(this){
- logicalclock++;
- //先投给自己
- updateProposal(getInitId(), getInitLastLoggedZxid(), getPeerEpoch());
- }
- LOG.info("New election. My id = " + self.getId() +
- ", proposed zxid=0x" + Long.toHexString(proposedZxid));
- //发送投票,包括发给自己
- sendNotifications();
- /*
- * Loop in which we exchange notifications until we find a leader
- */
- //主循环,直到选出leader
- while ((self.getPeerState() == ServerState.LOOKING) &&
- (!stop)){
- /*
- * Remove next notification from queue, times out after 2 times
- * the termination time
- */
- //从IO线程里拿到投票消息,自己的投票也在这里处理
- Notification n = recvqueue.poll(notTimeout,
- TimeUnit.MILLISECONDS);
- /*
- * Sends more notifications if haven't received enough.
- * Otherwise processes new notification.
- */
- //如果空闲
- if(n == null){
- //消息发完了,继续发送,一直到选出leader为止
- if(manager.haveDelivered()){
- sendNotifications();
- } else {
- //消息还在,可能其他server还没启动,尝试连接
- manager.connectAll();
- }
- /*
- * Exponential backoff
- */
- //延长超时时间
- int tmpTimeOut = notTimeout*2;
- notTimeout = (tmpTimeOut < maxNotificationInterval?
- tmpTimeOut : maxNotificationInterval);
- LOG.info("Notification time out: " + notTimeout);
- }
- //收到了投票消息
- else if(self.getVotingView().containsKey(n.sid)) {
- /*
- * Only proceed if the vote comes from a replica in the
- * voting view.
- */
- switch (n.state) {
- //LOOKING消息,则
- case LOOKING:
- ......
- //检查下收到的这张选票是否可以胜出,依次比较选举轮数epoch,事务zxid,服务器编号server id
- } else if (totalOrderPredicate(n.leader, n.zxid, n.peerEpoch,
- proposedLeader, proposedZxid, proposedEpoch)) {
- //胜出了,就把自己的投票修改为对方的,然后广播消息
- updateProposal(n.leader, n.zxid, n.peerEpoch);
- sendNotifications();
- }
- ......
- //添加到本机投票集合,用来做选举终结判断
- recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
- //选举是否结束,默认算法是超过半数server同意
- if (termPredicate(recvset,
- new Vote(proposedLeader, proposedZxid,
- logicalclock, proposedEpoch))) {
- ......
- //修改状态,LEADING or FOLLOWING
- self.setPeerState((proposedLeader == self.getId()) ?
- ServerState.LEADING: learningState());
- //返回最终的选票结果
- Vote endVote = new Vote(proposedLeader,
- proposedZxid, proposedEpoch);
- leaveInstance(endVote);
- return endVote;
- }
- }
- break;
- //如果收到的选票状态不是LOOKING,比如这台机器刚加入一个已经服务的zk集群时
- //OBSERVING机器不参与选举
- case OBSERVING:
- LOG.debug("Notification from observer: " + n.sid);
- break;
- //这2种需要参与选举
- case FOLLOWING:
- case LEADING:
- /*
- * Consider all notifications from the same epoch
- * together.
- */
- if(n.electionEpoch == logicalclock){
- //同样需要加入到本机的投票集合
- recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
- //投票是否结束,如果结束,再确认LEADER是否有效
- //如果结束,修改自己的状态并返回投票结果
- if(termPredicate(recvset, new Vote(n.leader,
- n.zxid, n.electionEpoch, n.peerEpoch, n.state))
- && checkLeader(outofelection, n.leader, n.electionEpoch)) {
- self.setPeerState((n.leader == self.getId()) ?
- ServerState.LEADING: learningState());
- Vote endVote = new Vote(n.leader, n.zxid, n.peerEpoch);
- leaveInstance(endVote);
- return endVote;
- }
- }
- /**
- * Before joining an established ensemble, verify that
- * a majority are following the same leader.
- */
- outofelection.put(n.sid, new Vote(n.leader, n.zxid,
- n.electionEpoch, n.peerEpoch, n.state));
- ......
- break;
- default:
- private void sendNotifications() {
- //循环发送
- for (QuorumServer server : self.getVotingView().values()) {
- long sid = server.id;
- //消息实体
- ToSend notmsg = new ToSend(ToSend.mType.notification,
- proposedLeader,
- proposedZxid,
- logicalclock,
- QuorumPeer.ServerState.LOOKING,
- sid,
- proposedEpoch);
- ......
- //添加到业务的发送队列,该队列会被WorkerSender消费
- sendqueue.offer(notmsg);
- }
- }
- public void run() {
- while (!stop) {
- try {
- ToSend m = sendqueue.poll(3000, TimeUnit.MILLISECONDS);
- if(m == null) continue;
- process(m);
- } catch (InterruptedException e) {
- break;
- }
- }
- LOG.info("WorkerSender is down");
- }
- private void process(ToSend m) {
- byte requestBytes[] = new byte[36];
- ByteBuffer requestBuffer = ByteBuffer.wrap(requestBytes);
- /*
- * Building notification packet to send
- */
- requestBuffer.clear();
- requestBuffer.putInt(m.state.ordinal());
- requestBuffer.putLong(m.leader);
- requestBuffer.putLong(m.zxid);
- requestBuffer.putLong(m.electionEpoch);
- requestBuffer.putLong(m.peerEpoch);
- //通过QuorumCnxManager这个IO负责类发送消息
- manager.toSend(m.sid, requestBuffer);
- }
- public void toSend(Long sid, ByteBuffer b) {
- /*
- * If sending message to myself, then simply enqueue it (loopback).
- */
- //如果是自己,不走网络,直接添加到本地接收队列
- if (self.getId() == sid) {
- b.position(0);
- addToRecvQueue(new Message(b.duplicate(), sid));
- /*
- * Otherwise send to the corresponding thread to send.
- */
- //否则,先添加到发送队列,然后尝试连接,连接成功则给每台server启动发送和接收线程
- } else {
- /*
- * Start a new connection if doesn't have one already.
- */
- if (!queueSendMap.containsKey(sid)) {
- ArrayBlockingQueue<ByteBuffer> bq = new ArrayBlockingQueue<ByteBuffer>(
- SEND_CAPACITY);
- queueSendMap.put(sid, bq);
- addToSendQueue(bq, b);
- } else {
- ArrayBlockingQueue<ByteBuffer> bq = queueSendMap.get(sid);
- if(bq != null){
- addToSendQueue(bq, b);
- } else {
- LOG.error("No queue for server " + sid);
- }
- }
- connectOne(sid);
- }
- }
- synchronized void connectOne(long sid){
- if (senderWorkerMap.get(sid) == null){
- .......
- //对方的选举地址,3888端口
- electionAddr = self.quorumPeers.get(sid).electionAddr;
- .......
- //同步IO连接
- Socket sock = new Socket();
- setSockOpts(sock);
- sock.connect(self.getView().get(sid).electionAddr, cnxTO);
- if (LOG.isDebugEnabled()) {
- LOG.debug("Connected to server " + sid);
- }
- //连上了,初始化IO线程
- initiateConnection(sock, sid);
- ......
- }
这个时候被连接的server的Listener选举线程会收到新连接
Listener主循环,接受连接
- while (!shutdown) {
- Socket client = ss.accept();
- setSockOpts(client);
- LOG.info("Received connection request "
- + client.getRemoteSocketAddress());
- receiveConnection(client);
- numRetries = 0;
- }
- public boolean receiveConnection(Socket sock) {
- Long sid = null;
- try {
- // Read server id
- //读server id
- DataInputStream din = new DataInputStream(sock.getInputStream());
- sid = din.readLong();
- ......
- //If wins the challenge, then close the new connection.
- //如果对方id比我小,则关闭该连接,并由自己反向连接对方(connectOne),只允许大id的server连接小id的server
- if (sid < self.getId()) {
- /*
- * This replica might still believe that the connection to sid is
- * up, so we have to shut down the workers before trying to open a
- * new connection.
- */
- SendWorker sw = senderWorkerMap.get(sid);
- if (sw != null) {
- sw.finish();
- }
- /*
- * Now we start a new connection
- */
- LOG.debug("Create new connection to server: " + sid);
- closeSocket(sock);
- connectOne(sid);
- // Otherwise start worker threads to receive data.
- }
- //如果对方id比我大,允许连接,并初始化单独的IO线程
- else {
- SendWorker sw = new SendWorker(sock, sid);
- RecvWorker rw = new RecvWorker(sock, sid, sw);
- sw.setRecv(rw);
- SendWorker vsw = senderWorkerMap.get(sid);
- if(vsw != null)
- vsw.finish();
- senderWorkerMap.put(sid, sw);
- if (!queueSendMap.containsKey(sid)) {
- queueSendMap.put(sid, new ArrayBlockingQueue<ByteBuffer>(
- SEND_CAPACITY));
- }
- sw.start();
- rw.start();
- return true;
- }
- return false;
- }
- public boolean initiateConnection(Socket sock, Long sid) {
- DataOutputStream dout = null;
- try {
- // Sending id and challenge
- //先发一个server id
- dout = new DataOutputStream(sock.getOutputStream());
- dout.writeLong(self.getId());
- dout.flush();
- }
- ......
- // If lost the challenge, then drop the new connection
- //如果对方id比自己大,则关闭连接,这样导致的结果就是大id的server才会去连接小id的server,避免连接浪费
- if (sid > self.getId()) {
- LOG.info("Have smaller server identifier, so dropping the " +
- "connection: (" + sid + ", " + self.getId() + ")");
- closeSocket(sock);
- // Otherwise proceed with the connection
- }
- //如果对方id比自己小,则保持连接,并初始化单独的发送和接收线程
- else {
- SendWorker sw = new SendWorker(sock, sid);
- RecvWorker rw = new RecvWorker(sock, sid, sw);
- sw.setRecv(rw);
- SendWorker vsw = senderWorkerMap.get(sid);
- if(vsw != null)
- vsw.finish();
- senderWorkerMap.put(sid, sw);
- if (!queueSendMap.containsKey(sid)) {
- queueSendMap.put(sid, new ArrayBlockingQueue<ByteBuffer>(
- SEND_CAPACITY));
- }
- sw.start();
- rw.start();
- return true;
- }
- return false;
- }
IO发送线程SendWorker启动,开始发送选举消息
- try {
- while (running && !shutdown && sock != null) {
- ByteBuffer b = null;
- try {
- //每个server一个发送队列
- ArrayBlockingQueue<ByteBuffer> bq = queueSendMap
- .get(sid);
- if (bq != null) {
- //拿消息
- b = pollSendQueue(bq, 1000, TimeUnit.MILLISECONDS);
- } else {
- LOG.error("No queue of incoming messages for " +
- "server " + sid);
- break;
- }
- if(b != null){
- //发消息
- lastMessageSent.put(sid, b);
- send(b);
- }
- } catch (InterruptedException e) {
- LOG.warn("Interrupted while waiting for message on queue",
- e);
- }
- }
- } catch (Exception e) {
- LOG.warn("Exception when using channel: for id " + sid + " my id = " +
- self.getId() + " error = " + e);
- }
- this.finish();
- ......
- public void run() {
- threadCnt.incrementAndGet();
- try {
- while (running && !shutdown && sock != null) {
- /**
- * Reads the first int to determine the length of the
- * message
- */
- //包的长度
- int length = din.readInt();
- if (length <= 0 || length > PACKETMAXSIZE) {
- throw new IOException(
- "Received packet with invalid packet: "
- + length);
- }
- /**
- * Allocates a new ByteBuffer to receive the message
- */
- //读到内存
- byte[] msgArray = new byte[length];
- din.readFully(msgArray, 0, length);
- ByteBuffer message = ByteBuffer.wrap(msgArray);
- //添加到接收队列,后续业务层的接收线程WorkerReceiver会来拿消息
- addToRecvQueue(new Message(message.duplicate(), sid));
- }
- ......
- }
- public void run() {
- Message response;
- while (!stop) {
- // Sleeps on receive
- try{
- //从IO线程拿数据
- response = manager.pollRecvQueue(3000, TimeUnit.MILLISECONDS);
- if(response == null) continue;
- /*
- * If it is from an observer, respond right away.
- * Note that the following predicate assumes that
- * if a server is not a follower, then it must be
- * an observer. If we ever have any other type of
- * learner in the future, we'll have to change the
- * way we check for observers.
- */
- //如果是Observer,则返回当前选举结果
- if(!self.getVotingView().containsKey(response.sid)){
- Vote current = self.getCurrentVote();
- ToSend notmsg = new ToSend(ToSend.mType.notification,
- current.getId(),
- current.getZxid(),
- logicalclock,
- self.getPeerState(),
- response.sid,
- current.getPeerEpoch());
- sendqueue.offer(notmsg);
- }
- else {
- // Receive new message
- .......
- // State of peer that sent this message
- //对方节点状态
- QuorumPeer.ServerState ackstate = QuorumPeer.ServerState.LOOKING;
- switch (response.buffer.getInt()) {
- case 0:
- ackstate = QuorumPeer.ServerState.LOOKING;
- break;
- case 1:
- ackstate = QuorumPeer.ServerState.FOLLOWING;
- break;
- case 2:
- ackstate = QuorumPeer.ServerState.LEADING;
- break;
- case 3:
- ackstate = QuorumPeer.ServerState.OBSERVING;
- break;
- }
- // Instantiate Notification and set its attributes
- //初始化Notification对象
- Notification n = new Notification();
- n.leader = response.buffer.getLong();
- n.zxid = response.buffer.getLong();
- n.electionEpoch = response.buffer.getLong();
- n.state = ackstate;
- n.sid = response.sid;
- ......
- /*
- * If this server is looking, then send proposed leader
- */
- //如果自己也在LOOKING,则放入业务接收队列,选举主线程会消费该消息
- if(self.getPeerState() == QuorumPeer.ServerState.LOOKING){
- recvqueue.offer(n);
- ......
- }
- //如果自己不在选举中,而对方server在LOOKING中,则向其发送当前的选举结果,当有server加入一个ensemble时有用
- else {
- /*
- * If this server is not looking, but the one that sent the ack
- * is looking, then send back what it believes to be the leader.
- */
- Vote current = self.getCurrentVote();
- if(ackstate == QuorumPeer.ServerState.LOOKING){
- if(LOG.isDebugEnabled()){
- LOG.debug("Sending new notification. My id = " +
- self.getId() + " recipient=" +
- response.sid + " zxid=0x" +
- Long.toHexString(current.getZxid()) +
- " leader=" + current.getId());
- }
- ToSend notmsg = new ToSend(
- ToSend.mType.notification,
- current.getId(),
- current.getZxid(),
- logicalclock,
- self.getPeerState(),
- response.sid,
- current.getPeerEpoch());
- sendqueue.offer(notmsg);
- }
- }
- .......
- }