client 端日志如下:就是不停地重连,但是连不上,看日志像是服务端挂了,但是其他 client 确好好地,说明服务端没挂
2017-10-07 23:59:44.466 [DEBUG] [org.apache.zookeeper.ClientCnxn:717] Got ping response for sessionid: 0x35e60a652b00001 after 0ms
2017-10-07 23:59:47.802 [DEBUG] [org.apache.zookeeper.ClientCnxn:717] Got ping response for sessionid: 0x35e60a652b00001 after 0ms
2017-10-07 23:59:55.104 [INFO] [org.apache.zookeeper.ClientCnxn:1096] Client session timed out, have not heard from server in 7302ms for sessionid 0x35e60a652b00001, closing socket connection and attempting reconnect
2017-10-07 23:59:55.219 [INFO] [org.apache.curator.framework.state.ConnectionStateManager:228] State change: SUSPENDED
2017-10-07 23:59:55.225 [INFO] [org.apache.zookeeper.ClientCnxn:975] Opening socket connection to server 10.200.151.145/10.200.151.145:2181. Will not attempt to authenticate using SASL (unknown error)
2017-10-07 23:59:55.226 [INFO] [org.apache.zookeeper.ClientCnxn:852] Socket connection established to 10.200.151.145/10.200.151.145:2181, initiating session
2017-10-07 23:59:55.227 [DEBUG] [org.apache.zookeeper.ClientCnxn:892] Session establishment request sent on 10.200.151.145/10.200.151.145:2181
2017-10-07 23:59:55.229 [INFO] [org.apache.zookeeper.ClientCnxn:1235] Session establishment complete on server 10.200.151.145/10.200.151.145:2181, sessionid = 0x35e60a652b00001, negotiated timeout = 10000
2017-10-07 23:59:55.237 [INFO] [org.apache.curator.framework.state.ConnectionStateManager:228] State change: RECONNECTED
2017-10-07 23:59:55.248 [DEBUG] [org.apache.zookeeper.ClientCnxn:733] Got auth sessionid:0x35e60a652b00001
2017-10-07 23:59:55.249 [INFO] [org.apache.zookeeper.ClientCnxn:1098] Unable to read additional data from server sessionid 0x35e60a652b00001, likely server has closed socket, closing socket connection and attempting reconnect
2017-10-07 23:59:55.832 [DEBUG] [org.apache.curator.RetryLoop:187] Retry-able exception received
org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode = ConnectionLoss for /root/project
at org.apache.zookeeper.KeeperException.create(KeeperException.java:99) ~[zookeeper-3.4.6.jar:3.4.6-1569965]
at org.apache.zookeeper.KeeperException.create(KeeperException.java:51) ~[zookeeper-3.4.6.jar:3.4.6-1569965]
at org.apache.zookeeper.ZooKeeper.exists(ZooKeeper.java:1045) ~[zookeeper-3.4.6.jar:3.4.6-1569965]
at org.apache.curator.framework.imps.ExistsBuilderImpl$3.call(ExistsBuilderImpl.java:226) ~[curator-framework-2.10.0.jar:na]
at org.apache.curator.framework.imps.ExistsBuilderImpl$3.call(ExistsBuilderImpl.java:215) ~[curator-framework-2.10.0.jar:na]
at org.apache.curator.RetryLoop.callWithRetry(RetryLoop.java:108) ~[curator-client-2.10.0.jar:na]
at org.apache.curator.framework.imps.ExistsBuilderImpl.pathInForegroundStandard(ExistsBuilderImpl.java:212) [curator-framework-2.10.0.jar:na]
at org.apache.curator.framework.imps.ExistsBuilderImpl.pathInForeground(ExistsBuilderImpl.java:205) [curator-framework-2.10.0.jar:na]
at org.apache.curator.framework.imps.ExistsBuilderImpl.forPath(ExistsBuilderImpl.java:168) [curator-framework-2.10.0.jar:na]
at org.apache.curator.framework.imps.ExistsBuilderImpl.forPath(ExistsBuilderImpl.java:39) [curator-framework-2.10.0.jar:na]
...
at org.apache.curator.framework.state.ConnectionStateManager$2.apply(ConnectionStateManager.java:259) [curator-framework-2.10.0.jar:na]
at org.apache.curator.framework.state.ConnectionStateManager$2.apply(ConnectionStateManager.java:255) [curator-framework-2.10.0.jar:na]
at org.apache.curator.framework.listen.ListenerContainer$1.run(ListenerContainer.java:93) [curator-framework-2.10.0.jar:na]
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) [na:1.7.0_75]
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) [na:1.7.0_75]
at java.lang.Thread.run(Thread.java:745) [na:1.7.0_75]
2017-10-07 23:59:55.846 [INFO] [org.apache.curator.framework.state.ConnectionStateManager:228] State change: SUSPENDED
2017-10-07 23:59:55.870 [INFO] [org.apache.zookeeper.ClientCnxn:975] Opening socket connection to server 10.200.151.144/10.200.151.144:2181. Will not attempt to authenticate using SASL (unknown error)
2017-10-07 23:59:55.871 [INFO] [org.apache.zookeeper.ClientCnxn:852] Socket connection established to 10.200.151.144/10.200.151.144:2181, initiating session
2017-10-07 23:59:55.872 [DEBUG] [org.apache.zookeeper.ClientCnxn:892] Session establishment request sent on 10.200.151.144/10.200.151.144:2181
2017-10-07 23:59:55.875 [INFO] [org.apache.zookeeper.ClientCnxn:1235] Session establishment complete on server 10.200.151.144/10.200.151.144:2181, sessionid = 0x35e60a652b00001, negotiated timeout = 10000
2017-10-07 23:59:55.886 [INFO] [org.apache.curator.framework.state.ConnectionStateManager:228] State change: RECONNECTED
2017-10-07 23:59:55.895 [DEBUG] [org.apache.zookeeper.ClientCnxn:733] Got auth sessionid:0x35e60a652b00001
2017-10-07 23:59:55.896 [INFO] [org.apache.zookeeper.ClientCnxn:1098] Unable to read additional data from server sessionid 0x35e60a652b00001, likely server has closed socket, closing socket connection and attempting reconnect
2017-10-07 23:59:55.997 [DEBUG] [org.apache.curator.RetryLoop:187] Retry-able exception received
org.apache.zookeeper.KeeperException$ConnectionLossException: KeeperErrorCode = ConnectionLoss for /root/project
at org.apache.zookeeper.KeeperException.create(KeeperException.java:99) ~[zookeeper-3.4.6.jar:3.4.6-1569965]
at org.apache.zookeeper.KeeperException.create(KeeperException.java:51) ~[zookeeper-3.4.6.jar:3.4.6-1569965]
at org.apache.zookeeper.ZooKeeper.exists(ZooKeeper.java:1045) ~[zookeeper-3.4.6.jar:3.4.6-1569965]
at org.apache.curator.framework.imps.ExistsBuilderImpl$3.call(ExistsBuilderImpl.java:226) ~[curator-framework-2.10.0.jar:na]
at org.apache.curator.framework.imps.ExistsBuilderImpl$3.call(ExistsBuilderImpl.java:215) ~[curator-framework-2.10.0.jar:na]
at org.apache.curator.RetryLoop.callWithRetry(RetryLoop.java:108) ~[curator-client-2.10.0.jar:na]
at org.apache.curator.framework.imps.ExistsBuilderImpl.pathInForegroundStandard(ExistsBuilderImpl.java:212) [curator-framework-2.10.0.jar:na]
at org.apache.curator.framework.imps.ExistsBuilderImpl.pathInForeground(ExistsBuilderImpl.java:205) [curator-framework-2.10.0.jar:na]
server 端日志:服务端日志看起来有点像客户端自己断开,但是加上其他日志判断,关键词 Len error,成为了问题的突破口
2017-10-07 23:59:55,071 [myid:3] - WARN [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxn@357] - caught end of stream exception
EndOfStreamException: Unable to read additional data from client sessionid 0x35e60a652b00001, likely client has closed socket
at org.apache.zookeeper.server.NIOServerCnxn.doIO(NIOServerCnxn.java:228)
at org.apache.zookeeper.server.NIOServerCnxnFactory.run(NIOServerCnxnFactory.java:208)
at java.lang.Thread.run(Thread.java:745)
2017-10-08 00:00:02,219 [myid:1] - WARN [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxn@362] - Exception causing close of session 0x35e60a652b00001 due to java.io.IOException: Len error 1400708
2017-10-08 00:00:04,785 [myid:1] - WARN [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxn@362] - Exception causing close of session 0x35e60a652b00001 due to java.io.IOException: Len error 1400708
2017-10-08 00:00:05,794 [myid:1] - WARN [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxn@362] - Exception causing close of session 0x35e60a652b00001 due to java.io.IOException: Len error 1400708
2017-10-08 00:00:06,557 [myid:1] - WARN [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxn@362] - Exception causing close of session 0x35e60a652b00001 due to java.io.IOException: Len error 1400708
2017-10-08 00:00:07,885 [myid:1] - WARN [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxn@362] - Exception causing close of session 0x35e60a652b00001 due to java.io.IOException: Len error 1400708
2017-10-08 00:00:09,713 [myid:1] - WARN [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxn@362] - Exception causing close of session 0x35e60a652b00001 due to java.io.IOException: Len error 1400708
2017-10-08 00:00:11,881 [myid:1] - WARN [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxn@362] - Exception causing close of session 0x35e60a652b00001 due to java.io.IOException: Len error 1400708
2017-10-08 00:00:14,777 [myid:1] - WARN [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxn@362] - Exception causing close of session 0x35e60a652b00001 due to java.io.IOException: Len error 1400708
2017-10-08 00:00:16,773 [myid:1] - WARN [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxn@362] - Exception causing close of session 0x35e60a652b00001 due to java.io.IOException: Len error 1400708
2017-10-08 00:00:18,365 [myid:1] - WARN [NIOServerCxn.Factory:0.0.0.0/0.0.0.0:2181:NIOServerCnxn@362] - Exception causing close of session 0x35e60a652b00001 due to java.io.IOException: Len error 1400708
private boolean readLength(SelectionKey k) throws IOException {
// Read the length, now get the buffer
int len = lenBuffer.getInt();
if (!initialized && checkFourLetterWord(sk, len)) {
return false;
}
if (len < 0 || len > BinaryInputArchive.maxBuffer) {
throw new IOException("Len error " + len);
}
if (!isZKServerRunning()) {
throw new IOException("ZooKeeperServer not running");
}
incomingBuffer = ByteBuffer.allocate(len);
return true;
}
static public final int maxBuffer = Integer.getInteger("jute.maxbuffer", 0xfffff);
if (!disableAutoWatchReset) {
List dataWatches = zooKeeper.getDataWatches();
List existWatches = zooKeeper.getExistWatches();
List childWatches = zooKeeper.getChildWatches();
if (!dataWatches.isEmpty()
|| !existWatches.isEmpty() || !childWatches.isEmpty()) {
SetWatches sw = new SetWatches(lastZxid,
prependChroot(dataWatches),
prependChroot(existWatches),
prependChroot(childWatches));
RequestHeader h = new RequestHeader();
h.setType(ZooDefs.OpCode.setWatches);
h.setXid(-8);
Packet packet = new Packet(h, new ReplyHeader(), sw, null, null);
outgoingQueue.addFirst(packet);
}
}
if (!dataWatches.isEmpty()
|| !existWatches.isEmpty() || !childWatches.isEmpty()) {
Iterator dataWatchesIter = prependChroot(dataWatches).iterator();
Iterator existWatchesIter = prependChroot(existWatches).iterator();
Iterator childWatchesIter = prependChroot(childWatches).iterator();
long setWatchesLastZxid = lastZxid;
while (dataWatchesIter.hasNext()
|| existWatchesIter.hasNext() || childWatchesIter.hasNext()) {
List dataWatchesBatch = new ArrayList();
List existWatchesBatch = new ArrayList();
List childWatchesBatch = new ArrayList();
int batchLength = 0;
// Note, we may exceed our max length by a bit when we add the last
// watch in the batch. This isn't ideal, but it makes the code simpler.
while (batchLength < SET_WATCHES_MAX_LENGTH) {
final String watch;
if (dataWatchesIter.hasNext()) {
watch = dataWatchesIter.next();
dataWatchesBatch.add(watch);
} else if (existWatchesIter.hasNext()) {
watch = existWatchesIter.next();
existWatchesBatch.add(watch);
} else if (childWatchesIter.hasNext()) {
watch = childWatchesIter.next();
childWatchesBatch.add(watch);
} else {
break;
}
batchLength += watch.length();
}
SetWatches sw = new SetWatches(setWatchesLastZxid,
dataWatchesBatch,
existWatchesBatch,
childWatchesBatch);
RequestHeader header = new RequestHeader(-8, OpCode.setWatches);
Packet packet = new Packet(header, new ReplyHeader(), sw, null, null);
outgoingQueue.addFirst(packet);
}
private String PATH = "/dsc/task";
static CuratorFramework client = null;
@Test
public void testLargeWatchBug() {
try {
CuratorFrameworkFactory.Builder builder = CuratorFrameworkFactory.builder().connectString("192.168.243.128:2181")
.retryPolicy(new RetryOneTime(1000))
.connectionTimeoutMs(10000)
.sessionTimeoutMs(10000)
.namespace("yanxuan_dsc");
builder.authorization("digest", "yanxuan_dsc".getBytes("utf-8"))
.aclProvider(new ACLProvider() {
@Override
public List getDefaultAcl() {
return ZooDefs.Ids.CREATOR_ALL_ACL;
}
@Override
public List getAclForPath(final String path) {
return ZooDefs.Ids.CREATOR_ALL_ACL;
}
});
client = builder.build();
client.start();
prepareData();
startWatch();
} catch (Exception e) {
e.printStackTrace();
}
}
private void prepareData() throws Exception {
int childNum = 1024;
char[] chars = new char[1024];
Arrays.fill(chars, 'x');
String nameSuffix = new String(chars);
// client.create().forPath(PATH);
for (int i = 0; i < childNum; i++) {
String childPath = PATH + "/" + i + "-" /*+ nameSuffix*/;
client.create().forPath(childPath);
}
}
private void startWatch() throws Exception {
PathChildrenCache watcher = new PathChildrenCache(client, PATH, true);
watcher.getListenable().addListener(new PathChildrenCacheListener() {
@Override
public void childEvent(CuratorFramework client, PathChildrenCacheEvent event) throws Exception {
ChildData data = event.getData();
if (data == null) {
Constant.INIT_LOGGER.info("No data in event[" + event + "]");
} else {
Constant.INIT_LOGGER.info("Receive event: "
+ "type=[" + event.getType() + "]"
+ ", path=[" + data.getPath() + "]"
+ ", data=[" + new String(data.getData()) + "]"
+ ", stat=[" + data.getStat().toString().trim() + "]");
}
}
}
);
watcher.start(PathChildrenCache.StartMode.BUILD_INITIAL_CACHE);
}