For network programming, the three fundamental elements are IO, the protocol (encoding/decoding), and the server-side threading model. This post looks at how ZooKeeper implements a high-performance network server.
The IO model
ZooKeeper ships with two network IO implementations: one built on Java's native NIO and one built on Netty. Start with the abstract class ServerCnxn, which represents a network connection from a client to the server. ServerCnxn implements Stats, the server-side statistics interface, and the Watcher interface; Watcher defines the KeeperState and EventType enums.
ServerCnxn has two default implementations: NIOServerCnxn, based on the JDK's native NIO, and NettyServerCnxn, based on Netty.
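Which of the two is used is decided at startup. As a rough sketch of the mechanism (simplified from ServerCnxnFactory.createFactory() in the 3.4.x code base; details may differ), the zookeeper.serverCnxnFactory system property names the factory class, and the NIO implementation is the default:

import org.apache.zookeeper.server.NIOServerCnxnFactory;
import org.apache.zookeeper.server.ServerCnxnFactory;

public class FactorySelection {
    // Simplified: the zookeeper.serverCnxnFactory system property picks the
    // ServerCnxnFactory implementation, defaulting to the NIO one.
    static ServerCnxnFactory create() throws Exception {
        String name = System.getProperty("zookeeper.serverCnxnFactory");
        if (name == null) {
            name = NIOServerCnxnFactory.class.getName();
        }
        return (ServerCnxnFactory) Class.forName(name).newInstance();
    }
}

Starting the server with -Dzookeeper.serverCnxnFactory=org.apache.zookeeper.server.NettyServerCnxnFactory switches it to the Netty implementation.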
// ServerCnxn's fields: per-connection statistics and counters
public abstract class ServerCnxn implements Stats, Watcher {
    protected abstract ServerStats serverStats();

    protected final Date established = new Date();

    protected final AtomicLong packetsReceived = new AtomicLong();
    protected final AtomicLong packetsSent = new AtomicLong();

    protected long minLatency;
    protected long maxLatency;
    protected String lastOp;
    protected long lastCxid;
    protected long lastZxid;
    protected long lastResponseTime;
    protected long lastLatency;

    protected long count;
    protected long totalLatency;
}
public NIOServerCnxn(ZooKeeperServer zk, SocketChannel sock,
        SelectionKey sk, NIOServerCnxnFactory factory) throws IOException {
    this.zkServer = zk;
    this.sock = sock;
    this.sk = sk;
    this.factory = factory;
    if (this.factory.login != null) {
        this.zooKeeperSaslServer = new ZooKeeperSaslServer(factory.login);
    }
    if (zk != null) {
        outstandingLimit = zk.getGlobalOutstandingLimit();
    }
    sock.socket().setTcpNoDelay(true);
    /* set socket linger to false, so that socket close does not block */
    sock.socket().setSoLinger(false, -1);
    InetAddress addr = ((InetSocketAddress) sock.socket()
            .getRemoteSocketAddress()).getAddress();
    authInfo.add(new Id("ip", addr.getHostAddress()));
    sk.interestOps(SelectionKey.OP_READ);
}
The core method of NIOServerCnxn is doIO: it implements how the SocketChannel reads and writes once its SelectionKey has been picked by the Selector.
Reading data from the client through the SocketChannel works as follows:
1. NIOServerCnxn maintains two ByteBuffers for reading. lenBuffer = ByteBuffer.allocate(4) is a 4-byte buffer used to decide whether the incoming bytes are a 4-letter command such as ruok or conf (what, for example, echo ruok | nc localhost 2181 sends). incomingBuffer is the buffer the data is actually read into; initially incomingBuffer points at lenBuffer.
2. The SocketChannel reads data into incomingBuffer; if the byte count returned is negative, an exception is thrown. If the read succeeds and incomingBuffer is full while it still points at lenBuffer, the 4 bytes just read are a length prefix (or a 4-letter command).
3. The readLength method decides which: it first calls checkFourLetterWord to test whether the 4 bytes spell a 4-letter command.
4. In checkFourLetterWord, if it is a 4-letter command, the matching CommandThread is started and the command executes on its own thread; how the response is written back is covered below.
If it is not a 4-letter command, incomingBuffer is reallocated to the decoded length, incomingBuffer = ByteBuffer.allocate(len), and no longer points at lenBuffer.
5. If it is not a 4-letter command, control moves on to readPayload. readPayload checks whether incomingBuffer holds the full packet; if not, it tries one more read from the SocketChannel. Once the packet is complete, flip switches the buffer into read mode; the first request on a connection goes to readConnectRequest, every later one to readRequest. Finally incomingBuffer = lenBuffer points the buffer back at lenBuffer, ready for the next request (a minimal sketch of this framing pattern follows, then the actual code).
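To isolate the buffer-swapping trick, here is a minimal, self-contained sketch of the same length-prefixed framing (a hypothetical LengthPrefixedReader, not ZooKeeper code): the 4-byte length buffer doubles as the initial read target, and the payload buffer is allocated once the length is known.

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.SocketChannel;

/** Sketch of NIOServerCnxn's framing: read a 4-byte length, then the payload. */
class LengthPrefixedReader {
    private final ByteBuffer lenBuffer = ByteBuffer.allocate(4);
    private ByteBuffer incomingBuffer = lenBuffer; // initially points at lenBuffer

    /** Call whenever the channel is readable; returns a complete frame or null. */
    byte[] readFrame(SocketChannel ch) throws IOException {
        if (ch.read(incomingBuffer) < 0) {
            throw new IOException("client closed the connection");
        }
        if (incomingBuffer.remaining() > 0) {
            return null;                               // frame still incomplete
        }
        if (incomingBuffer == lenBuffer) {             // the 4 length bytes are in
            lenBuffer.flip();
            int len = lenBuffer.getInt();              // a real server validates len
            lenBuffer.clear();
            incomingBuffer = ByteBuffer.allocate(len); // no longer points at lenBuffer
            return null;                               // payload arrives on later reads
        }
        incomingBuffer.flip();                         // full payload: read mode
        byte[] frame = new byte[incomingBuffer.remaining()];
        incomingBuffer.get(frame);
        incomingBuffer = lenBuffer;                    // back to reading the next length
        return frame;
    }
}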
void doIO(SelectionKey k) throws InterruptedException {
    try {
        if (isSocketOpen() == false) {
            LOG.warn("trying to do i/o on a null socket for session:0x"
                     + Long.toHexString(sessionId));
            return;
        }
        if (k.isReadable()) {
            int rc = sock.read(incomingBuffer);
            if (rc < 0) {
                throw new EndOfStreamException(
                        "Unable to read additional data from client sessionid 0x"
                        + Long.toHexString(sessionId)
                        + ", likely client has closed socket");
            }
            if (incomingBuffer.remaining() == 0) {
                boolean isPayload;
                if (incomingBuffer == lenBuffer) { // start of next request
                    incomingBuffer.flip();
                    isPayload = readLength(k);
                    incomingBuffer.clear();
                } else {
                    // continuation
                    isPayload = true;
                }
                if (isPayload) { // not the case for 4letterword
                    readPayload();
                } else {
                    // four letter words take care
                    // need not do anything else
                    return;
                }
            }
        }
        ............
}

private boolean readLength(SelectionKey k) throws IOException {
    // Read the length, now get the buffer
    int len = lenBuffer.getInt();
    if (!initialized && checkFourLetterWord(sk, len)) {
        return false;
    }
    if (len < 0 || len > BinaryInputArchive.maxBuffer) {
        throw new IOException("Len error " + len);
    }
    if (zkServer == null) {
        throw new IOException("ZooKeeperServer not running");
    }
    incomingBuffer = ByteBuffer.allocate(len);
    return true;
}

private boolean checkFourLetterWord(final SelectionKey k, final int len)
        throws IOException {
    // We take advantage of the limited size of the length to look
    // for cmds. They are all 4-bytes which fits inside of an int
    String cmd = cmd2String.get(len);
    if (cmd == null) {
        return false;
    }
    LOG.info("Processing " + cmd + " command from "
             + sock.socket().getRemoteSocketAddress());
    packetReceived();

    /** cancel the selection key to remove the socket handling
     * from selector. This is to prevent netcat problem wherein
     * netcat immediately closes the sending side after sending the
     * commands and still keeps the receiving channel open.
     * The idea is to remove the selectionkey from the selector
     * so that the selector does not notice the closed read on the
     * socket channel and keep the socket alive to write the data to
     * and makes sure to close the socket after its done writing the data */
    if (k != null) {
        try {
            k.cancel();
        } catch (Exception e) {
            LOG.error("Error cancelling command selection key ", e);
        }
    }

    final PrintWriter pwriter = new PrintWriter(
            new BufferedWriter(new SendBufferWriter()));
    if (len == ruokCmd) {
        RuokCommand ruok = new RuokCommand(pwriter);
        ruok.start();
        return true;
    } else if (len == getTraceMaskCmd) {
        TraceMaskCommand tmask = new TraceMaskCommand(pwriter);
        tmask.start();
        return true;
    } else if (len == setTraceMaskCmd) {
        int rc = sock.read(incomingBuffer);
        if (rc < 0) {
            throw new IOException("Read error");
        }
        incomingBuffer.flip();
        long traceMask = incomingBuffer.getLong();
        ZooTrace.setTextTraceLevel(traceMask);
        SetTraceMaskCommand setMask = new SetTraceMaskCommand(pwriter, traceMask);
        setMask.start();
        return true;
    } else if (len == enviCmd) {
        EnvCommand env = new EnvCommand(pwriter);
        env.start();
        return true;
    } else if (len == confCmd) {
        ConfCommand ccmd = new ConfCommand(pwriter);
        ccmd.start();
        return true;
    } else if (len == srstCmd) {
        StatResetCommand strst = new StatResetCommand(pwriter);
        strst.start();
        return true;
    } else if (len == crstCmd) {
        CnxnStatResetCommand crst = new CnxnStatResetCommand(pwriter);
        crst.start();
        return true;
    } else if (len == dumpCmd) {
        DumpCommand dump = new DumpCommand(pwriter);
        dump.start();
        return true;
    } else if (len == statCmd || len == srvrCmd) {
        StatCommand stat = new StatCommand(pwriter, len);
        stat.start();
        return true;
    } else if (len == consCmd) {
        ConsCommand cons = new ConsCommand(pwriter);
        cons.start();
        return true;
    } else if (len == wchpCmd || len == wchcCmd || len == wchsCmd) {
        WatchCommand wcmd = new WatchCommand(pwriter, len);
        wcmd.start();
        return true;
    } else if (len == mntrCmd) {
        MonitorCommand mntr = new MonitorCommand(pwriter);
        mntr.start();
        return true;
    } else if (len == isroCmd) {
        IsroCommand isro = new IsroCommand(pwriter);
        isro.start();
        return true;
    }
    return false;
}

private void readPayload() throws IOException, InterruptedException {
    if (incomingBuffer.remaining() != 0) { // have we read length bytes?
        int rc = sock.read(incomingBuffer); // sock is non-blocking, so ok
        if (rc < 0) {
            throw new EndOfStreamException(
                    "Unable to read additional data from client sessionid 0x"
                    + Long.toHexString(sessionId)
                    + ", likely client has closed socket");
        }
    }

    if (incomingBuffer.remaining() == 0) { // have we read length bytes?
        packetReceived();
        incomingBuffer.flip();
        if (!initialized) {
            readConnectRequest();
        } else {
            readRequest();
        }
        lenBuffer.clear();
        incomingBuffer = lenBuffer;
    }
}
NIOServerCnxn writes data as follows:
1. A LinkedBlockingQueue<ByteBuffer> called outgoingBuffers batches writes, so several ByteBuffers can be written in one pass.
2. If the SelectionKey was selected because the channel is writable, and outgoingBuffers is non-empty, the data of the ByteBuffers in outgoingBuffers is copied into factory.directBuffer, a shared direct-memory buffer. Once directBuffer is full, or everything in outgoingBuffers has been copied, flip switches it into read mode and its contents are written to the SocketChannel.
Every write therefore goes from directBuffer to the SocketChannel; direct memory is used to optimize the write path.
After the write, outgoingBuffers is pruned: every ByteBuffer that was fully sent is removed.
3. Once outgoingBuffers is empty, OP_WRITE is cleared from the interest set so the channel goes back to listening only for reads; if data is left over, the channel keeps listening for writability.
void doIO(SelectionKey k) throws InterruptedException {
    try {
        if (isSocketOpen() == false) {
            LOG.warn("trying to do i/o on a null socket for session:0x"
                     + Long.toHexString(sessionId));
            return;
        }
        .......
        if (k.isWritable()) {
            if (outgoingBuffers.size() > 0) {
                ByteBuffer directBuffer = factory.directBuffer;
                directBuffer.clear();

                for (ByteBuffer b : outgoingBuffers) {
                    if (directBuffer.remaining() < b.remaining()) {
                        b = (ByteBuffer) b.slice().limit(
                                directBuffer.remaining());
                    }
                    int p = b.position();
                    directBuffer.put(b);
                    b.position(p);
                    if (directBuffer.remaining() == 0) {
                        break;
                    }
                }
                directBuffer.flip();

                int sent = sock.write(directBuffer);
                ByteBuffer bb;

                // Remove the buffers that we have sent
                while (outgoingBuffers.size() > 0) {
                    bb = outgoingBuffers.peek();
                    if (bb == ServerCnxnFactory.closeConn) {
                        throw new CloseRequestException("close requested");
                    }
                    int left = bb.remaining() - sent;
                    if (left > 0) {
                        bb.position(bb.position() + sent);
                        break;
                    }
                    packetSent();
                    sent -= bb.remaining();
                    outgoingBuffers.remove();
                }
                // ZooLog.logTraceMessage(LOG,
                // ZooLog.CLIENT_DATA_PACKET_TRACE_MASK, "after send,
                // outgoingBuffers.size() = " + outgoingBuffers.size());
            }

            synchronized (this.factory) {
                if (outgoingBuffers.size() == 0) {
                    if (!initialized
                            && (sk.interestOps() & SelectionKey.OP_READ) == 0) {
                        throw new CloseRequestException("responded to info probe");
                    }
                    sk.interestOps(sk.interestOps() & (~SelectionKey.OP_WRITE));
                } else {
                    sk.interestOps(sk.interestOps() | SelectionKey.OP_WRITE);
                }
            }
        }
    } catch (CancelledKeyException e) {
        LOG.warn("Exception causing close of session 0x"
                 + Long.toHexString(sessionId) + " due to " + e);
        if (LOG.isDebugEnabled()) {
            LOG.debug("CancelledKeyException stack trace", e);
        }
        close();
    } catch (CloseRequestException e) {
        // expecting close to log session closure
        close();
    } catch (EndOfStreamException e) {
        LOG.warn("caught end of stream exception", e); // tell user why
        // expecting close to log session closure
        close();
    } catch (IOException e) {
        LOG.warn("Exception causing close of session 0x"
                 + Long.toHexString(sessionId) + " due to " + e);
        if (LOG.isDebugEnabled()) {
            LOG.debug("IOException stack trace", e);
        }
        close();
    }
}
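The queue-plus-direct-buffer pattern above can be distilled into a small sketch (a hypothetical BatchedWriter, not ZooKeeper code): copy as many queued heap buffers as fit into one shared direct buffer, write it with a single call, then retire the fully sent buffers and advance the partially sent one.

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.SocketChannel;
import java.util.Queue;
import java.util.concurrent.LinkedBlockingQueue;

/** Sketch of draining a queue of responses through one shared direct buffer. */
class BatchedWriter {
    // one direct buffer per selector thread, as with factory.directBuffer
    private final ByteBuffer directBuffer = ByteBuffer.allocateDirect(64 * 1024);
    final Queue<ByteBuffer> outgoing = new LinkedBlockingQueue<ByteBuffer>();

    /** Call when the channel is writable; returns true once everything is flushed. */
    boolean flush(SocketChannel ch) throws IOException {
        directBuffer.clear();
        for (ByteBuffer b : outgoing) {        // copy without consuming the sources
            ByteBuffer src = b.slice();        // position/limit of b stay untouched
            if (src.remaining() > directBuffer.remaining()) {
                src.limit(directBuffer.remaining());
            }
            directBuffer.put(src);
            if (directBuffer.remaining() == 0) {
                break;
            }
        }
        directBuffer.flip();
        int sent = ch.write(directBuffer);     // one syscall for the whole batch

        // retire fully sent buffers; advance the partially sent head
        while (!outgoing.isEmpty() && sent > 0) {
            ByteBuffer head = outgoing.peek();
            if (sent < head.remaining()) {
                head.position(head.position() + sent);
                return false;                  // keep OP_WRITE registered
            }
            sent -= head.remaining();
            outgoing.remove();
        }
        return outgoing.isEmpty();             // true: caller may clear OP_WRITE
    }
}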
1. The synchronous sendBufferSync method simply switches the SocketChannel into blocking mode and writes straight to the socket. Responding to the 4-letter commands mentioned above uses exactly this direct, synchronous path.
2. sendBuffer is the NIO path and relies on the outgoingBuffers queue to batch writes (as the code shows, it first attempts an immediate write when no write interest is registered). The buffer is appended to outgoingBuffers and the SelectionKey's write flag is set, so the actual write happens the next time the Selector runs select.
void sendBufferSync(ByteBuffer bb) {
    try {
        /* configure socket to be blocking
         * so that we dont have to do write in
         * a tight while loop */
        sock.configureBlocking(true);
        if (bb != ServerCnxnFactory.closeConn) {
            if (sock.isOpen()) {
                sock.write(bb);
            }
            packetSent();
        }
    } catch (IOException ie) {
        LOG.error("Error sending data synchronously ", ie);
    }
}

public void sendBuffer(ByteBuffer bb) {
    try {
        if (bb != ServerCnxnFactory.closeConn) {
            // We check if write interest here because if it is NOT set,
            // nothing is queued, so we can try to send the buffer right
            // away without waking up the selector
            if ((sk.interestOps() & SelectionKey.OP_WRITE) == 0) {
                try {
                    sock.write(bb);
                } catch (IOException e) {
                    // we are just doing best effort right now
                }
            }
            // if there is nothing left to send, we are done
            if (bb.remaining() == 0) {
                packetSent();
                return;
            }
        }

        synchronized (this.factory) {
            sk.selector().wakeup();
            if (LOG.isTraceEnabled()) {
                LOG.trace("Add a buffer to outgoingBuffers, sk " + sk
                          + " is valid: " + sk.isValid());
            }
            outgoingBuffers.add(bb);
            if (sk.isValid()) {
                sk.interestOps(sk.interestOps() | SelectionKey.OP_WRITE);
            }
        }
    } catch (Exception e) {
        LOG.error("Unexpected Exception: ", e);
    }
}
The protocol (encoding and decoding)
ZooKeeper uses Apache Jute to serialize and deserialize Java objects, turning them into binary data that can travel across the network. Apache Jute was already introduced in the previous post, 从ZooKeeper源代码看如何实现分布式系统(二)数据的高可用存储, so it is not repeated here. Instead, here is a brief look at how ZooKeeperServer handles an incoming packet, which shows how a binary request is deserialized into Java objects.
1. incomingBuffer is wrapped into a stream with ByteBufferInputStream, and the Jute API reads a RequestHeader object out of it; RequestHeader implements Jute's Record interface.
2. RequestHeader has only two fields: xid, the id the client assigned to the request, and type, the request type.
3. For an auth request, the payload is read from incomingBuffer, deserialized into an AuthPacket, and handed to the matching AuthenticationProvider for authentication.
4. For a sasl request, the corresponding SASL code runs.
5. Every other request is wrapped in a Request object and handed to submitRequest for execution.
// ZooKeeperServer
public void processPacket(ServerCnxn cnxn, ByteBuffer incomingBuffer)
        throws IOException {
    // We have the request, now process and setup for next
    InputStream bais = new ByteBufferInputStream(incomingBuffer);
    BinaryInputArchive bia = BinaryInputArchive.getArchive(bais);
    RequestHeader h = new RequestHeader();
    h.deserialize(bia, "header");
    // Through the magic of byte buffers, txn will not be
    // pointing to the start of the txn
    incomingBuffer = incomingBuffer.slice();
    if (h.getType() == OpCode.auth) {
        LOG.info("got auth packet " + cnxn.getRemoteSocketAddress());
        AuthPacket authPacket = new AuthPacket();
        ByteBufferInputStream.byteBuffer2Record(incomingBuffer, authPacket);
        String scheme = authPacket.getScheme();
        AuthenticationProvider ap = ProviderRegistry.getProvider(scheme);
        Code authReturn = KeeperException.Code.AUTHFAILED;
        if (ap != null) {
            try {
                authReturn = ap.handleAuthentication(cnxn, authPacket.getAuth());
            } catch (RuntimeException e) {
                LOG.warn("Caught runtime exception from AuthenticationProvider: "
                         + scheme + " due to " + e);
                authReturn = KeeperException.Code.AUTHFAILED;
            }
        }
        if (authReturn != KeeperException.Code.OK) {
            if (ap == null) {
                LOG.warn("No authentication provider for scheme: " + scheme
                         + " has " + ProviderRegistry.listProviders());
            } else {
                LOG.warn("Authentication failed for scheme: " + scheme);
            }
            // send a response...
            ReplyHeader rh = new ReplyHeader(h.getXid(), 0,
                    KeeperException.Code.AUTHFAILED.intValue());
            cnxn.sendResponse(rh, null, null);
            // ... and close connection
            cnxn.sendBuffer(ServerCnxnFactory.closeConn);
            cnxn.disableRecv();
        } else {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Authentication succeeded for scheme: " + scheme);
            }
            LOG.info("auth success " + cnxn.getRemoteSocketAddress());
            ReplyHeader rh = new ReplyHeader(h.getXid(), 0,
                    KeeperException.Code.OK.intValue());
            cnxn.sendResponse(rh, null, null);
        }
        return;
    } else {
        if (h.getType() == OpCode.sasl) {
            Record rsp = processSasl(incomingBuffer, cnxn);
            ReplyHeader rh = new ReplyHeader(h.getXid(), 0,
                    KeeperException.Code.OK.intValue());
            cnxn.sendResponse(rh, rsp, "response"); // not sure about 3rd arg..what is it?
        } else {
            Request si = new Request(cnxn, cnxn.getSessionId(), h.getXid(),
                    h.getType(), incomingBuffer, cnxn.getAuthInfo());
            si.setOwner(ServerCnxn.me);
            submitRequest(si);
        }
    }
    cnxn.incrOutstandingRequests(h);
}
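As a quick illustration of the Jute API used above, here is a minimal round trip, a sketch assuming the standard org.apache.jute and org.apache.zookeeper.proto classes are on the classpath: a RequestHeader (a Jute Record) is serialized with BinaryOutputArchive and read back with BinaryInputArchive, mirroring h.deserialize(bia, "header") in processPacket.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

import org.apache.jute.BinaryInputArchive;
import org.apache.jute.BinaryOutputArchive;
import org.apache.zookeeper.ZooDefs.OpCode;
import org.apache.zookeeper.proto.RequestHeader;

public class JuteRoundTrip {
    public static void main(String[] args) throws IOException {
        // serialize a RequestHeader into its binary wire form
        RequestHeader out = new RequestHeader(1, OpCode.ping);
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        out.serialize(BinaryOutputArchive.getArchive(baos), "header");

        // deserialize it back, as processPacket does for incoming packets
        RequestHeader in = new RequestHeader();
        in.deserialize(BinaryInputArchive.getArchive(
                new ByteArrayInputStream(baos.toByteArray())), "header");

        System.out.println("xid=" + in.getXid() + " type=" + in.getType());
    }
}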
The threading model
ZooKeeper offers two server-side threading models: a reactor built on native NIO and one built on Netty. Let's look at the NIO-based reactor.
NIOServerCnxnFactory wraps the Selector that does the event dispatching. It implements Runnable and keeps a dedicated Thread, so that it runs as a single standalone thread.
1. The configure method creates a daemon thread, opens the ServerSocketChannel, and registers it with the Selector for ACCEPT events.
2. A HashMap maps each client IP to the set of NIOServerCnxn connections coming from that IP.
3. The start method starts the thread, which begins listening on the port and serving client requests.
4. The run method is the reactor's EventLoop: the Selector runs select with a one-second timeout to pick up IO events and dispatch them to the matching SocketChannel. Note that no new thread is created while dispatching.
So NIOServerCnxnFactory is the simplest possible single-threaded reactor: a single thread dispatches the IO events and also performs all the reads and writes (a distilled sketch follows the code below).
public class NIOServerCnxnFactory extends ServerCnxnFactory implements Runnable {
    ServerSocketChannel ss;

    final Selector selector = Selector.open();

    Thread thread;

    public void configure(InetSocketAddress addr, int maxcc) throws IOException {
        configureSaslLogin();

        thread = new Thread(this, "NIOServerCxn.Factory:" + addr);
        thread.setDaemon(true);
        maxClientCnxns = maxcc;
        this.ss = ServerSocketChannel.open();
        ss.socket().setReuseAddress(true);
        LOG.info("binding to port " + addr);
        ss.socket().bind(addr);
        ss.configureBlocking(false);
        ss.register(selector, SelectionKey.OP_ACCEPT);
    }

    final HashMap<InetAddress, Set<NIOServerCnxn>> ipMap =
        new HashMap<InetAddress, Set<NIOServerCnxn>>();

    public void start() {
        // ensure thread is started once and only once
        if (thread.getState() == Thread.State.NEW) {
            thread.start();
        }
    }

    private void addCnxn(NIOServerCnxn cnxn) {
        synchronized (cnxns) {
            cnxns.add(cnxn);
            synchronized (ipMap) {
                InetAddress addr = cnxn.sock.socket().getInetAddress();
                Set<NIOServerCnxn> s = ipMap.get(addr);
                if (s == null) {
                    // in general we will see 1 connection from each
                    // host, setting the initial cap to 2 allows us
                    // to minimize mem usage in the common case
                    // of 1 entry -- we need to set the initial cap
                    // to 2 to avoid rehash when the first entry is added
                    s = new HashSet<NIOServerCnxn>(2);
                    s.add(cnxn);
                    ipMap.put(addr, s);
                } else {
                    s.add(cnxn);
                }
            }
        }
    }

    public void run() {
        while (!ss.socket().isClosed()) {
            try {
                selector.select(1000);
                Set<SelectionKey> selected;
                synchronized (this) {
                    selected = selector.selectedKeys();
                }
                ArrayList<SelectionKey> selectedList =
                    new ArrayList<SelectionKey>(selected);
                Collections.shuffle(selectedList);
                for (SelectionKey k : selectedList) {
                    if ((k.readyOps() & SelectionKey.OP_ACCEPT) != 0) {
                        SocketChannel sc = ((ServerSocketChannel) k
                                .channel()).accept();
                        InetAddress ia = sc.socket().getInetAddress();
                        int cnxncount = getClientCnxnCount(ia);
                        if (maxClientCnxns > 0 && cnxncount >= maxClientCnxns) {
                            LOG.warn("Too many connections from " + ia
                                     + " - max is " + maxClientCnxns);
                            sc.close();
                        } else {
                            LOG.info("Accepted socket connection from "
                                     + sc.socket().getRemoteSocketAddress());
                            sc.configureBlocking(false);
                            SelectionKey sk = sc.register(selector,
                                    SelectionKey.OP_READ);
                            NIOServerCnxn cnxn = createConnection(sc, sk);
                            sk.attach(cnxn);
                            addCnxn(cnxn);
                        }
                    } else if ((k.readyOps()
                            & (SelectionKey.OP_READ | SelectionKey.OP_WRITE)) != 0) {
                        NIOServerCnxn c = (NIOServerCnxn) k.attachment();
                        c.doIO(k);
                    } else {
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Unexpected ops in select " + k.readyOps());
                        }
                    }
                }
                selected.clear();
            } catch (RuntimeException e) {
                LOG.warn("Ignoring unexpected runtime exception", e);
            } catch (Exception e) {
                LOG.warn("Ignoring exception", e);
            }
        }
        closeAll();
        LOG.info("NIOServerCnxn factory exited run method");
    }
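To see the pattern with everything else stripped away, here is a minimal single-threaded reactor, a self-contained echo-server sketch (hypothetical EchoReactor on an arbitrary port, not ZooKeeper code): one thread owns the Selector, accepts connections, and performs every read and write itself, exactly like the run loop above.

import java.io.IOException;
import java.net.InetSocketAddress;
import java.nio.ByteBuffer;
import java.nio.channels.SelectionKey;
import java.nio.channels.Selector;
import java.nio.channels.ServerSocketChannel;
import java.nio.channels.SocketChannel;
import java.util.ArrayList;
import java.util.List;

/** Minimal single-threaded reactor: one thread accepts, reads, and writes. */
public class EchoReactor {
    public static void main(String[] args) throws IOException {
        Selector selector = Selector.open();
        ServerSocketChannel ss = ServerSocketChannel.open();
        ss.socket().setReuseAddress(true);
        ss.socket().bind(new InetSocketAddress(9000)); // hypothetical port
        ss.configureBlocking(false);
        ss.register(selector, SelectionKey.OP_ACCEPT);

        while (true) {
            selector.select(1000);                     // the event loop
            List<SelectionKey> ready =
                    new ArrayList<SelectionKey>(selector.selectedKeys());
            selector.selectedKeys().clear();
            for (SelectionKey k : ready) {
                if (k.isAcceptable()) {
                    SocketChannel sc =
                            ((ServerSocketChannel) k.channel()).accept();
                    sc.configureBlocking(false);
                    // attach a per-connection buffer, like sk.attach(cnxn)
                    sc.register(selector, SelectionKey.OP_READ,
                                ByteBuffer.allocate(1024));
                } else if (k.isReadable()) {
                    SocketChannel sc = (SocketChannel) k.channel();
                    ByteBuffer buf = (ByteBuffer) k.attachment();
                    if (sc.read(buf) < 0) {            // client closed
                        k.cancel();
                        sc.close();
                        continue;
                    }
                    buf.flip();
                    sc.write(buf);                     // echo on the same thread
                    buf.compact();                     // keep unsent bytes, if any
                }
            }
        }
    }
}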