源码基于6.7.2
在节点启动的过程中会去调用NodeConnectionsService.start()来建立与其他节点的连接
// Starts the node (ES 6.7.2): brings up plugin lifecycle components, the
// NodeConnectionsService, the transport service, discovery and the cluster
// service, then optionally waits for an initial cluster state (an elected
// master) before opening the HTTP layer.
public Node start() throws NodeValidationException {
// Lifecycle guard: only the first caller moves the node to STARTED.
if (!lifecycle.moveToStarted()) {
return this;
}
logger.info("starting ...");
pluginLifecycleComponents.forEach(LifecycleComponent::start);
...
...
...
final NodeConnectionsService nodeConnectionsService = injector.getInstance(NodeConnectionsService.class);
// Start the NodeConnectionsService, which establishes connections to discovered nodes.
nodeConnectionsService.start();
...
...
...
// Start the transport service now so that the publish address is added to the
// local DiscoveryNode held by the ClusterService.
TransportService transportService = injector.getInstance(TransportService.class);
transportService.getTaskManager().setTaskResultsService(injector.getInstance(TaskResultsService.class));
transportService.start();
assert localNodeFactory.getNode() != null;
assert transportService.getLocalNode().equals(localNodeFactory.getNode())
: "transportService has a different local node than the factory provided";
final MetaData onDiskMetadata;
try {
// we load the global state here (the persistent part of the cluster state stored on disk) to
// pass it to the bootstrap checks to allow plugins to enforce certain preconditions based on the recovered state.
if (DiscoveryNode.isMasterNode(settings) || DiscoveryNode.isDataNode(settings)) {
onDiskMetadata = injector.getInstance(GatewayMetaState.class).loadMetaState();
} else {
// Coordinating-only nodes keep no persistent metadata on disk.
onDiskMetadata = MetaData.EMPTY_META_DATA;
}
assert onDiskMetadata != null : "metadata is null but shouldn't"; // this is never null
} catch (IOException e) {
throw new UncheckedIOException(e);
}
checkIfClusterNameInDataPaths(clusterService.getClusterName(), environment.dataFiles());
validateNodeBeforeAcceptingRequests(new BootstrapContext(environment, onDiskMetadata), transportService.boundAddress(),
pluginsService.filterPlugins(Plugin.class).stream()
.flatMap(p -> p.getBootstrapChecks().stream()).collect(Collectors.toList()));
clusterService.addStateApplier(transportService.getTaskManager());
// start after transport service so the local disco is known
discovery.start(); // start before cluster service so that it can set initial state on ClusterApplierService
clusterService.start();
assert clusterService.localNode().equals(localNodeFactory.getNode())
: "clusterService has a different local node than the factory provided";
transportService.acceptIncomingRequests();
discovery.startInitialJoin();
// tribe nodes don't have a master so we shouldn't register an observer
final TimeValue initialStateTimeout = DiscoverySettings.INITIAL_STATE_TIMEOUT_SETTING.get(settings);
if (initialStateTimeout.millis() > 0) {
final ThreadPool thread = injector.getInstance(ThreadPool.class);
ClusterState clusterState = clusterService.state();
ClusterStateObserver observer = new ClusterStateObserver(clusterState, clusterService, null, logger, thread.getThreadContext());
// No master elected yet: block (up to initialStateTimeout) until one appears.
if (clusterState.nodes().getMasterNodeId() == null) {
logger.debug("waiting to join the cluster. timeout [{}]", initialStateTimeout);
final CountDownLatch latch = new CountDownLatch(1);
observer.waitForNextChange(new ClusterStateObserver.Listener() {
@Override
public void onNewClusterState(ClusterState state) { latch.countDown(); }
@Override
public void onClusterServiceClose() {
latch.countDown();
}
@Override
public void onTimeout(TimeValue timeout) {
// Not fatal: the node continues starting and can still join later.
logger.warn("timed out while waiting for initial discovery state - timeout: {}",
initialStateTimeout);
latch.countDown();
}
}, state -> state.nodes().getMasterNodeId() != null, initialStateTimeout);
try {
latch.await();
} catch (InterruptedException e) {
throw new ElasticsearchTimeoutException("Interrupted while waiting for initial discovery state");
}
}
}
if (NetworkModule.HTTP_ENABLED.get(settings)) {
injector.getInstance(HttpServerTransport.class).start();
}
if (WRITE_PORTS_FILE_SETTING.get(settings)) {
if (NetworkModule.HTTP_ENABLED.get(settings)) {
HttpServerTransport http = injector.getInstance(HttpServerTransport.class);
writePortsFile("http", http.boundAddress());
}
TransportService transport = injector.getInstance(TransportService.class);
writePortsFile("transport", transport.boundAddress());
}
logger.info("started");
pluginsService.filterPlugins(ClusterPlugin.class).forEach(ClusterPlugin::onNodeStarted);
return this;
}
NodeConnectionsService的doStart方法,通过generic线程池每隔一段时间进行重连检查。reconnectInterval默认值10s。
@Override
protected void doStart() {
    // Schedule the first reconnect check on the generic thread pool after
    // reconnectInterval, keeping the handle so the task can be cancelled later.
    final ConnectionChecker checker = new ConnectionChecker();
    backgroundCancellable = threadPool.schedule(checker, reconnectInterval, ThreadPool.Names.GENERIC);
}
在ConnectionChecker的doRun方法中,会去调用validateAndConnectIfNeeded对每个DiscoveryNode进行连接验证。如果DiscoveryNode是节点本身,则跳过。否则通过ConnectionManager.connectToNode(…)去连接其他节点,如果connectedNodes中包含了这个DiscoveryNode的连接,则跳过,否则去通过TcpTransport的openConnection去开启连接。
// Opens a connection to {@code node} using the given profile, validating it via
// {@code connectionValidator} before publishing it in {@code connectedNodes}.
// Idempotent: if a connection for the node already exists this returns early.
// Throws ConnectTransportException on any connection failure.
public void connectToNode(DiscoveryNode node, ConnectionProfile connectionProfile,
CheckedBiConsumer<Transport.Connection, ConnectionProfile, IOException> connectionValidator)
throws ConnectTransportException {
// Merge the caller-supplied profile with this manager's default profile.
ConnectionProfile resolvedProfile = ConnectionProfile.resolveConnectionProfile(connectionProfile, defaultProfile);
if (node == null) {
throw new ConnectTransportException(null, "can't connect to a null node");
}
closeLock.readLock().lock(); // ensure we don't open connections while we are closing
try {
ensureOpen();
// Per-node lock: at most one thread connects to a given node at a time.
try (Releasable ignored = connectionLock.acquire(node.getId())) {
// Skip if connectedNodes already holds a connection for this DiscoveryNode.
Transport.Connection connection = connectedNodes.get(node);
if (connection != null) {
return;
}
boolean success = false;
try {
// Open the connection via TcpTransport's openConnection.
connection = internalOpenConnection(node, resolvedProfile);
connectionValidator.accept(connection, resolvedProfile);
// we acquire a connection lock, so no way there is an existing connection
connectedNodes.put(node, connection);
if (logger.isDebugEnabled()) {
logger.debug("connected to node [{}]", node);
}
try {
connectionListener.onNodeConnected(node);
} finally {
// Register cleanup so the map entry is removed when the connection
// closes, even if onNodeConnected threw.
final Transport.Connection finalConnection = connection;
connection.addCloseListener(ActionListener.wrap(() -> {
connectedNodes.remove(node, finalConnection);
connectionListener.onNodeDisconnected(node);
}));
}
// The connection may have closed concurrently before the close listener
// was registered; detect that and fail the connect attempt.
if (connection.isClosed()) {
throw new NodeNotConnectedException(node, "connection concurrently closed");
}
success = true;
} catch (ConnectTransportException e) {
throw e;
} catch (Exception e) {
throw new ConnectTransportException(node, "general node connection failure", e);
} finally {
if (success == false) { // close the connection if there is a failure
logger.trace(() -> new ParameterizedMessage("failed to connect to [{}], cleaning dangling connections", node));
IOUtils.closeWhileHandlingException(connection);
}
}
}
} finally {
closeLock.readLock().unlock();
}
}
openConnection方法会先将传入的ConnectionProfile与默认配置合并(resolve),再去调用initiateConnection进行连接初始化。可以发现,只要有一个连接建立失败,就会关闭此次已打开的所有连接。
// Opens the configured number of raw TCP channels to {@code node}. If any
// single channel fails to open, every channel opened so far is closed and the
// failure is reported through {@code listener}. Otherwise a shared listener is
// attached to all channels and a timeout callback is scheduled.
private List<TcpChannel> initiateConnection(DiscoveryNode node, ConnectionProfile connectionProfile,
                                            ActionListener<Transport.Connection> listener) {
    // Number of channels this connection profile requires.
    final int numConnections = connectionProfile.getNumConnections();
    assert numConnections > 0 : "A connection profile must be configured with at least one connection";
    final List<TcpChannel> channels = new ArrayList<>(numConnections);
    for (int i = 0; i < numConnections; ++i) {
        try {
            final TcpChannel channel = initiateChannel(node);
            logger.trace(() -> new ParameterizedMessage("Tcp transport client channel opened: {}", channel));
            channels.add(channel);
        } catch (Exception e) {
            // One failed channel aborts the whole attempt: close everything opened so far.
            CloseableChannel.closeChannels(channels, false);
            listener.onFailure(e instanceof ConnectTransportException
                ? (ConnectTransportException) e
                : new ConnectTransportException(node, "general node connection failure", e));
            return channels;
        }
    }
    // A single listener that fires once every channel has reported its connect result.
    final ChannelsConnectedListener channelsConnectedListener =
        new ChannelsConnectedListener(node, connectionProfile, channels, listener);
    for (TcpChannel channel : channels) {
        channel.addConnectListener(channelsConnectedListener);
    }
    final TimeValue connectTimeout = connectionProfile.getConnectTimeout();
    // Schedule the timeout callback on the generic pool in case connects never complete.
    threadPool.schedule(channelsConnectedListener::onTimeout, connectTimeout, ThreadPool.Names.GENERIC);
    return channels;
}
在ChannelsConnectedListener的onResponse中会执行一次握手(handshake),执行结束后将连接保存在NodeChannels中。
// Invoked once per channel connect; when the last channel of the batch has
// connected successfully, performs the version handshake on the first channel
// and, on success, wraps all channels into a NodeChannels connection.
@Override
public void onResponse(Void v) {
// Returns true if all connections have completed successfully
if (countDown.countDown()) {
final TcpChannel handshakeChannel = channels.get(0);
try {
// Perform a handshake to negotiate the protocol version with the remote node.
executeHandshake(node, handshakeChannel, connectionProfile, new ActionListener<Version>() {
@Override
public void onResponse(Version version) {
NodeChannels nodeChannels = new NodeChannels(node, channels, connectionProfile, version);
long relativeMillisTime = threadPool.relativeTimeInMillis();
nodeChannels.channels.forEach(ch -> {
// Mark the channel init time
ch.getChannelStats().markAccessed(relativeMillisTime);
// Closing any single channel tears down the whole NodeChannels connection.
ch.addCloseListener(ActionListener.wrap(nodeChannels::close));
});
// Register the channels with the keep-alive service, which schedules and sends
// keep-alive pings: client channels ping, server channels respond, and a ping
// is only sent when a channel has neither sent nor received messages since the
// previous scheduled ping.
keepAlive.registerNodeConnection(nodeChannels.channels, connectionProfile);
listener.onResponse(nodeChannels);
}
@Override
public void onFailure(Exception e) {
...
});
} catch (Exception ex) {
// Handshake setup failed: close every channel and propagate the error.
CloseableChannel.closeChannels(channels, false);
listener.onFailure(ex);
}
}
}
至此,与其他节点的连接创建完成。
以节点启动,加入集群为例。
在Node的start方法中,会执行以下语句,主要调用的是ZenDiscovery的startInitialJoin方法
discovery.startInitialJoin();
ZenDiscovery有一个私有的内部类JoinThreadControl,主要作用是加入线程的所有控制都应该发生在集群状态更新任务线程下。 这对于确保后台加入过程始终与任何集群状态更新同步非常重要,例如主服务器丢失、加入失败、加入时收到的集群状态等。
当没有JoinThread存活的时候,会在generic线程池中创建一个JoinThread线程。innerJoinCluster是JoinThread的主要方法,主要保证加入集群,或在失败时生成新的加入线程。
// Main loop of the JoinThread: either waits to be elected as master (when this
// node is the master candidate) or sends a join request to the elected master.
private void innerJoinCluster() {
...
...
...
// If this node is the elected master candidate, wait until a quorum of
// master-eligible nodes has sent join requests before becoming master.
if (transportService.getLocalNode().equals(masterNode)) {
final int requiredJoins = Math.max(0, electMaster.minimumMasterNodes() - 1); // we count as one
logger.debug("elected as master, waiting for incoming joins ([{}] needed)", requiredJoins);
nodeJoinController.waitToBeElectedAsMaster(requiredJoins, masterElectionWaitForJoinsTimeout,
new NodeJoinController.ElectionCallback() {
@Override
public void onElectedAsMaster(ClusterState state) {
synchronized (stateMutex) {
joinThreadControl.markThreadAsDone(currentThread);
}
}
@Override
public void onFailure(Throwable t) {
// Election failed: retire this join thread and spawn a new one to retry.
logger.trace("failed while waiting for nodes to join, rejoining", t);
synchronized (stateMutex) {
joinThreadControl.markThreadAsDoneAndStartNew(currentThread);
}
}
}
);
} else {
// process any incoming joins (they will fail because we are not the master)
nodeJoinController.stopElectionContext(masterNode + " elected");
// This node is not the master: send a join request to the elected master.
final boolean success = joinElectedMaster(masterNode);
...
...
...
}
}
当发现此节点不是master时,向master发送join请求。joinElectedMaster在发送请求之前,先要确定与master的连接是否正常。
// Joins the elected master: first verifies connectivity to it, then sends a
// blocking join request. Returns false when the connection attempt fails so
// the caller can retry.
private boolean joinElectedMaster(DiscoveryNode masterNode) {
try {
// first, make sure we can connect to the master
transportService.connectToNode(masterNode);
} catch (Exception e) {
logger.warn(() -> new ParameterizedMessage("failed to connect to master [{}], retrying...", masterNode), e);
return false;
}
...
...
...
// Send the join request to the master, blocking until it responds or times out.
membership.sendJoinRequestBlocking(masterNode, transportService.getLocalNode(), joinTimeout);
...
...
...
}
在调用transportService.submitRequest的方法是传入了EmptyTransportResponseHandler作为响应处理。
// Sends a join request for {@code node} to the elected master and blocks the
// calling thread until the (empty) response arrives or the timeout elapses.
public void sendJoinRequestBlocking(DiscoveryNode masterNode, DiscoveryNode node, TimeValue timeout) {
    final JoinRequest joinRequest = new JoinRequest(node);
    transportService.submitRequest(masterNode, DISCOVERY_JOIN_ACTION_NAME, joinRequest,
            EmptyTransportResponseHandler.INSTANCE_SAME)
        .txGet(timeout.millis(), TimeUnit.MILLISECONDS);
}
进入TransportService后发现,该类提供了多个submitRequest和sendRequest方法进行请求的发送。它通过调用asyncSender.sendRequest发起网络请求,最终由NodeChannels.sendRequest发送数据。在实际发送之前会根据请求选项获取对应的连接类型,再以轮询的方式选取其中一个Channel进行使用。
// Sends a request over this node connection, choosing a channel that matches
// the request's traffic type. Fails fast when the connection is closing.
@Override
public void sendRequest(long requestId, String action, TransportRequest request, TransportRequestOptions options)
        throws IOException, TransportException {
    // Refuse to send on a connection that is already shutting down.
    if (isClosing.get()) {
        throw new NodeNotConnectedException(node, "connection already closed");
    }
    // Pick a channel for the request's connection type; channels of the same
    // type are selected in a round-robin fashion.
    final TcpChannel target = channel(options.type());
    sendRequestToChannel(this.node, target, requestId, action, request, options, getVersion(), compress);
}
NodeChannels的channel方法
// Returns a channel suitable for the given request type, delegating the actual
// selection to the type's ConnectionTypeHandle.
public TcpChannel channel(TransportRequestOptions.Type type) {
    // Resolve the handle that owns the slice of channels serving this type.
    final ConnectionProfile.ConnectionTypeHandle handle = typeMapping.get(type);
    if (handle == null) {
        throw new IllegalArgumentException("no type channel for [" + type + "]");
    }
    return handle.getChannel(channels);
}
ConnectionTypeHandle的getChannel方法
// Selects the next channel from the [offset, offset + length) slice of
// {@code channels} owned by this type handle, rotating round-robin.
<T> T getChannel(List<T> channels) {
    if (length == 0) {
        throw new IllegalStateException("can't select channel size is 0 for types: " + types);
    }
    assert channels.size() >= offset + length : "illegal size: " + channels.size() + " expected >= " + (offset + length);
    // floorMod keeps the index non-negative even after the counter wraps to negative values.
    final int slot = Math.floorMod(counter.incrementAndGet(), length);
    return channels.get(offset + slot);
}
在上述的请求中EmptyTransportResponseHandler作为响应的处理类,EmptyTransportResponseHandler实现了TransportResponseHandler接口,这是一个对请求结果响应处理的最原始的定义。
// Minimal contract for handling the outcome of an outbound transport request.
public interface TransportResponseHandler<T extends TransportResponse> extends Writeable.Reader<T> {
// Invoked with the deserialized response when the remote node replies successfully.
void handleResponse(T response);
// Invoked when the remote execution (or the transport layer itself) fails.
void handleException(TransportException exp);
// Name of the thread pool on which the handler callbacks should be executed.
String executor();
}
节点之间的通信是RPC请求,节点在接收到其他节点的请求之后的执行路径。
根据上述的加入集群请求来解释请求的处理。在MembershipAction的构造函数中,使用transportService.registerRequestHandler方法注册了Action与处理器的映射,
// Wires up the membership RPC endpoints. Registers handlers for the join,
// join-validate and leave actions, all executed on the generic thread pool.
public MembershipAction(TransportService transportService, MembershipListener listener,
                        Collection<BiConsumer<DiscoveryNode,ClusterState>> joinValidators) {
    this.transportService = transportService;
    this.listener = listener;
    // DISCOVERY_JOIN_ACTION_NAME: incoming join requests from nodes wanting to join.
    transportService.registerRequestHandler(DISCOVERY_JOIN_ACTION_NAME, JoinRequest::new,
        ThreadPool.Names.GENERIC, new JoinRequestRequestHandler());
    // DISCOVERY_JOIN_VALIDATE_ACTION_NAME: validation round-trip issued by the master.
    transportService.registerRequestHandler(DISCOVERY_JOIN_VALIDATE_ACTION_NAME,
        ValidateJoinRequest::new, ThreadPool.Names.GENERIC,
        new ValidateJoinRequestRequestHandler(transportService::getLocalNode, joinValidators));
    // DISCOVERY_LEAVE_ACTION_NAME: notifications from nodes leaving the cluster.
    transportService.registerRequestHandler(DISCOVERY_LEAVE_ACTION_NAME, LeaveRequest::new,
        ThreadPool.Names.GENERIC, new LeaveRequestRequestHandler());
}
DISCOVERY_JOIN_ACTION_NAME的处理类是JoinRequestRequestHandler,在messageReceived中开始执行join:监听器MembershipListener的onJoin(DiscoveryNode node, MembershipAction.JoinCallback callback)被调用,进而执行ZenDiscovery.handleJoinRequest;在完成处理后,通过回调将结果返回给发起请求的节点。
// JoinRequestRequestHandler.messageReceived方法
....
listener.onJoin(request.getNode(), new JoinCallback() {
@Override
public void onSuccess() {
try {
channel.sendResponse(TransportResponse.Empty.INSTANCE);
} catch (Exception e) {
onFailure(e);
}
}
@Override
public void onFailure(Exception e) {
try {
channel.sendResponse(e);
} catch (Exception inner) {
inner.addSuppressed(e);
logger.warn("failed to send back failure on join request", inner);
}
}
});
....
// Handles an incoming join request from {@code node}: runs the local join
// validators, checks the version barrier, ensures connectivity back to the
// joining node, asks it to validate the current cluster state, and finally
// hands off to the NodeJoinController which fires {@code callback} when done.
void handleJoinRequest(final DiscoveryNode node, final ClusterState state, final MembershipAction.JoinCallback callback) {
    // Guard clause (was an if/else): a null nodeJoinController means the
    // discovery module never finished starting.
    if (nodeJoinController == null) {
        throw new IllegalStateException("discovery module is not yet started");
    }
    // we do this in a couple of places including the cluster update thread. This one here is really just best effort
    // to ensure we fail as fast as possible. (direct forEach — the stream() wrapper was redundant)
    onJoinValidators.forEach(a -> a.accept(node, state));
    if (state.getBlocks().hasGlobalBlock(STATE_NOT_RECOVERED_BLOCK) == false) {
        MembershipAction.ensureMajorVersionBarrier(node.getVersion(), state.getNodes().getMinNodeVersion());
    }
    // try and connect to the node, if it fails, we can raise an exception back to the client...
    transportService.connectToNode(node);
    // Ask the joining node to validate the join; a failure here is reported back
    // to the requesting node via the callback rather than thrown.
    try {
        membership.sendValidateJoinRequestBlocking(node, state, joinTimeout);
    } catch (Exception e) {
        logger.warn(() -> new ParameterizedMessage("failed to validate incoming join request from node [{}]", node),
            e);
        callback.onFailure(new IllegalStateException("failure when sending a validation request to node", e));
        return;
    }
    // Let the NodeJoinController process the join; it invokes the callback when finished.
    nodeJoinController.handleJoinRequest(node, callback);
}
至此被请求节点处理逻辑结束。