Erlang Node Interconnection: net_kernel
net_kernel is a child of the net_sup supervisor, started after its siblings erl_epmd and auth. Its main job is to create a listening socket, register it with epmd, and keep a periodic heartbeat (ticker) going with the other nodes. The diagram below shows the process topology around net_kernel:
             +-- TCP accept proc (inet_tcp_dist:accept_loop/3, prim_inet:accept0/3)
net_kernel --+-- ticker (net_kernel:ticker/2, net_kernel:ticker_loop/2)
             +-- connection 1 (inet_tcp_dist:do_accept/7, dist_util:con_loop/2)
             +-- connection 2 (inet_tcp_dist:do_accept/7, dist_util:con_loop/2)
             +-- ...... (initial_call, current_function)
1. inet_dist_listen_min, inet_dist_listen_max: restricting the node's listening port
First, a quick look at epmd; the documentation is at http://erlang.org/doc/man/epmd.html:
"The job of the epmd daemon is to keep track of which node name listens on which address."
epmd maintains the mapping from node name to listening port. Given that mapping, any two nodes can connect to each other by host and port, and an rpc then boils down to sending a message over that TCP connection.
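For example, once two nodes are connected, a remote call is just one round trip over that connection (the node name here is hypothetical):

1> rpc:call('abc@host1', erlang, node, []).
'abc@host1'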
Basic usage of epmd:
$ epmd -names
epmd: up and running on port 4369 with data:
name abc at port 30508
# a node named abc was started with: erl -name abc@<host>
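The same list can be obtained from a running node: net_adm:names/0 queries the local epmd, mirroring epmd -names (the output below matches the node started above):

1> net_adm:names().
{ok,[{"abc",30508}]}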
The relevant implementation is as follows:
% inet_tcp_dist.erl
% Driver = inet_tcp
do_listen(Driver, Options) ->
    % read the two configuration parameters first
    {First, Last} = case application:get_env(kernel, inet_dist_listen_min) of
                        {ok, N} when is_integer(N) ->
                            case application:get_env(kernel, inet_dist_listen_max) of
                                {ok, M} when is_integer(M) ->
                                    {N, M};
                                _ ->
                                    {N, N}
                            end;
                        _ ->
                            {0, 0}
                    end,
    do_listen(Driver, First, Last, listen_options([{backlog, 128} | Options])).

do_listen(_Driver, First, Last, _) when First > Last ->
    {error, eaddrinuse};
do_listen(Driver, First, Last, Options) ->
    % if inet_dist_listen_min/max are unset, First = 0 and the OS picks
    % a random port; otherwise take the lowest free port in [min, max]
    case Driver:listen(First, Options) of
        {error, eaddrinuse} ->
            do_listen(Driver, First + 1, Last, Options);
        Other ->
            Other
    end.
At first glance these two parameters do not look very useful, since machines on most internal networks can reach each other freely, with no firewall in between. But when services are deployed in Docker containers, instances on different hosts cannot reach each other unless they share a network; a fixed, known port range that can be published on the host is exactly what these two settings provide.
2. net_ticktime: the network heartbeat time
% erl_distribution.erl
% fetch the configured network tick time
ticktime() ->
    case catch application:get_env(net_ticktime) of
        {ok, Value} when is_integer(Value), Value > 0 ->
            [Value * 250]; %% i.e. 1000 / 4 = 250 ms.
        _ ->
            []
    end.
% net_kernel.erl
init({Name, LongOrShortNames, TickT, CleanHalt}) ->
    process_flag(trap_exit, true),
    case init_node(Name, LongOrShortNames, CleanHalt) of
        {ok, Node, Listeners} ->
            % runs at the highest priority (the ticker proc below does the same)
            process_flag(priority, max),
            Ticktime = to_integer(TickT),
            Ticker = spawn_link(net_kernel, ticker, [self(), Ticktime]),
            .....
        Error ->
            {stop, Error}
    end.
% ===== the heartbeat proc
ticker(Kernel, Tick) when is_integer(Tick) ->
    process_flag(priority, max),
    ?tckr_dbg(ticker_started),
    ticker_loop(Kernel, Tick).

ticker_loop(Kernel, Tick) ->
    receive
        {new_ticktime, NewTick} ->
            ?tckr_dbg({ticker_changed_time, Tick, NewTick}),
            ?MODULE:ticker_loop(Kernel, NewTick)
    after Tick ->
        % in the normal case, send a tick message to net_kernel every Tick ms
        Kernel ! tick,
        % fully qualified call, so the loop migrates to new code after a hot upgrade
        ?MODULE:ticker_loop(Kernel, Tick)
    end.
handle_info(tick, State) ->
    % on a tick from the heartbeat proc, forward it to every connection we own
    lists:foreach(fun({Pid, _Node}) -> Pid ! {self(), tick} end,
                  State#state.conn_owners),
    {noreply, State};
% dist_util.erl
send_tick(DHandle, Socket, Tick, Type, MFTick, MFGetstat) ->
    #tick{tick = T0, read = Read, write = Write, ticked = Ticked0} = Tick,
    T = T0 + 1,
    T1 = T rem 4,
    case getstat(DHandle, Socket, MFGetstat) of
        {ok, Read, _, _} when Ticked0 =:= T ->
            % nothing has been read from the peer for a full tick cycle:
            % the connection is considered broken
            {error, not_responding};
        ......
        Error ->
            Error
    end.
- The heartbeat procs run at the highest priority.
- Every pair of connected nodes in the cluster maintains such a heartbeat (a fully meshed cluster has N*(N-1)/2 connections), so tune the tick time with care; a configuration sketch follows below.
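net_ticktime is given in seconds and should be the same across the cluster. It can be set at boot and adjusted at run time via net_kernel:set_net_ticktime/1; a minimal sketch, using 120 as an example value:

% sys.config
[{kernel, [{net_ticktime, 120}]}].

% or, at run time, on every node:
1> net_kernel:get_net_ticktime().
60
2> net_kernel:set_net_ticktime(120).
change_initiated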
3. net_kernel:monitor_nodes(...)
monitor_nodes(Flag) ->
    case catch process_flag(monitor_nodes, Flag) of
        N when is_integer(N) -> ok;
        _ -> mk_monitor_nodes_error(Flag, [])
    end.

monitor_nodes(Flag, Opts) ->
    case catch process_flag({monitor_nodes, Opts}, Flag) of
        N when is_integer(N) -> ok;
        _ -> mk_monitor_nodes_error(Flag, Opts)
    end.
monitor_nodes is really just a thin wrapper around a BIF: it delegates to process_flag(monitor_nodes, ...), whose actual implementation lives in the emulator's C code, which this author has not studied in detail.
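A process that has called monitor_nodes(true) receives {nodeup, Node} and {nodedown, Node} messages. A minimal sketch (the module name node_watcher is mine):

-module(node_watcher).
-export([start/0]).

start() ->
    spawn(fun() ->
              % subscribe this proc to node up/down events
              ok = net_kernel:monitor_nodes(true),
              loop()
          end).

loop() ->
    receive
        {nodeup, Node}   -> io:format("node up: ~p~n", [Node]), loop();
        {nodedown, Node} -> io:format("node down: ~p~n", [Node]), loop()
    end.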
4. How connections are managed
4.1 Listening for connections (listen)
% net_kernel.erl
start_protos(Name, [Proto | Ps], Node, Ls, CleanHalt) ->
    % Proto comes from the -proto_dist flag, "inet_tcp" by default,
    % so Mod = inet_tcp_dist here
    Mod = list_to_atom(Proto ++ "_dist"),
    % open a new listening socket and register it with epmd
    case catch Mod:listen(Name) of
        {ok, {Socket, Address, Creation}} ->
            case set_node(Node, Creation) of
                ok ->
                    % spawn a new accept proc
                    AcceptPid = Mod:accept(Socket),
                    auth:sync_cookie(),
                    L = #listen{listen = Socket,
                                address = Address,
                                accept = AcceptPid,
                                module = Mod},
                    start_protos(Name, Ps, Node, [L | Ls], CleanHalt);
                _ ->
                    Mod:close(Socket),
                    S = "invalid node name: " ++ atom_to_list(Node),
                    proto_error(CleanHalt, Proto, S),
                    start_protos(Name, Ps, Node, Ls, CleanHalt)
            end;
        ......
    end;
start_protos(_, [], _Node, Ls, _CleanHalt) ->
    Ls.
% now let's look at the accept function mentioned above
% inet_tcp_dist.erl
accept(Listen) ->
    gen_accept(inet_tcp, Listen).

gen_accept(Driver, Listen) ->
    % spawn a dedicated proc to accept incoming sockets
    spawn_opt(?MODULE, accept_loop, [Driver, self(), Listen],
              [link, {priority, max}]).

accept_loop(Driver, Kernel, Listen) ->
    case Driver:accept(Listen) of
        {ok, Socket} ->
            Kernel ! {accept, self(), Socket, Driver:family(), tcp},
            _ = controller(Driver, Kernel, Socket),
            accept_loop(Driver, Kernel, Listen);
        Error ->
            exit(Error)
    end.
% net_kernel.erl
%% accept a new incoming connection
handle_info({accept, AcceptPid, Socket, Family, Proto}, State) ->
    MyNode = State#state.node,
    case get_proto_mod(Family, Proto, State#state.listen) of
        {ok, Mod} ->
            Pid = Mod:accept_connection(AcceptPid,
                                        Socket,
                                        MyNode,
                                        State#state.allowed,
                                        State#state.connecttime),
            % hand control of the socket over to Pid
            AcceptPid ! {self(), controller, Pid},
            {noreply, State};
        _ ->
            AcceptPid ! {self(), unsupported_protocol},
            {noreply, State}
    end;
% inet_tcp_dist.erl
% take over the socket and hand it to the proc that will own the connection
controller(Driver, Kernel, Socket) ->
    receive
        {Kernel, controller, Pid} ->
            flush_controller(Pid, Socket),
            % Driver = inet_tcp
            % on net_kernel's instruction, transfer socket ownership to Pid
            Driver:controlling_process(Socket, Pid),
            flush_controller(Pid, Socket),
            Pid ! {self(), controller};
        {Kernel, unsupported_protocol} ->
            exit(unsupported_protocol)
    end.
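This handover is the standard gen_tcp ownership pattern; a self-contained sketch of the same idea outside of net_kernel:

{ok, L} = gen_tcp:listen(0, [binary, {active, true}]),
{ok, S} = gen_tcp:accept(L),
Worker = spawn(fun Loop() ->
                   receive
                       {tcp, _Sock, Data} -> io:format("got ~p~n", [Data]), Loop();
                       {tcp_closed, _Sock} -> ok
                   end
               end),
% from here on, S's {tcp, ...} messages are delivered to Worker
ok = gen_tcp:controlling_process(S, Worker).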
do_accept(Driver, Kernel, AcceptPid, Socket, MyNode, Allowed, SetupTime) ->
    receive
        {AcceptPid, controller} ->
            Timer = dist_util:start_timer(SetupTime),
            case check_ip(Driver, Socket) of
                true ->
                    HSData = .......
                    %% the real entry into the connection loop
                    dist_util:handshake_other_started(HSData);
                {false, IP} ->
                    error_msg("** Connection attempt from "
                              "disallowed IP ~w ** ~n", [IP]),
                    ?shutdown(no_node)
            end
    end.
% dist_util.erl
% this is the loop that maintains the connection between a pair of nodes
con_loop({Kernel, Node, Socket, Type, DHandle, MFTick, MFGetstat,
          MFSetOpts, MFGetOpts} = ConData,
         Tick) ->
    receive
        {tcp_closed, Socket} ->
            ?shutdown2(Node, connection_closed);
        {Kernel, disconnect} ->
            ?shutdown2(Node, disconnected);
        % in the two cases above, once this proc exits, net_kernel is
        % notified and removes the connection
        {Kernel, tick} ->
            % time to send a heartbeat
            case send_tick(DHandle, Socket, Tick, Type, MFTick, MFGetstat) of
                {ok, NewTick} ->
                    con_loop(ConData, NewTick);
                {error, not_responding} ->
                    error_msg("** Node ~p not responding **~n"
                              "** Removing (timedout) connection **~n",
                              [Node]),
                    ?shutdown2(Node, net_tick_timeout);
                _Other ->
                    ?shutdown2(Node, send_net_tick_failed)
            end;
        ......
    end.
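The {Kernel, disconnect} branch is what runs when a connection is torn down explicitly, for example (node name hypothetical):

1> erlang:disconnect_node('abc@host1').
true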
This completes the walkthrough of the accepting side.
4.2 Establishing a connection (connect)
Every connection has an initiating side and an accepting side; the accepting side was covered above, so let's now turn to the initiating side.
% net_kernel.erl
handle_call({connect, Type, Node}, From, State) ->
    verbose({connect, Type, Node}, 1, State),
    ConnLookup = ets:lookup(sys_dist, Node),
    % actively initiate a connection
    R = try erts_internal:new_connection(Node) of
            ConnId ->
                R1 = do_explicit_connect(ConnLookup, Type, Node, ConnId,
                                         From, State),
                case R1 of
                    {reply, true, _S} -> %% already connected
                        ok;
                    {noreply, _S} -> %% connection pending
                        ok;
                    {reply, false, _S} -> %% connection refused
                        erts_internal:abort_connection(Node, ConnId)
                end,
                R1
        catch
            _:_ ->
                ......
                {reply, false, State}
        end,
    return_call(R, From);
% when this is a brand-new connection
do_explicit_connect(_ConnLookup, Type, Node, ConnId, From, State) ->
    case setup(Node, ConnId, Type, From, State) of
        {ok, SetupPid} ->
            Owners = [{SetupPid, Node} | State#state.conn_owners],
            {noreply, State#state{conn_owners = Owners}};
        _Error ->
            ?connect_failure(Node, {setup_call, failed, _Error}),
            {reply, false, State}
    end.
setup(Node, ConnId, Type, From, State) ->
    case setup_check(Node, State) of
        {ok, L} ->
            Mod = L#listen.module,
            LAddr = L#listen.address,
            MyNode = State#state.node,
            % Mod = inet_tcp_dist
            % spawn the proc that sets up the connection
            Pid = Mod:setup(Node, Type, MyNode, State#state.type,
                            State#state.connecttime),
            Addr = LAddr#net_address{address = undefined, host = undefined},
            Waiting = ...
            % record this connection
            ets:insert(sys_dist, ....),
            {ok, Pid};
        Error ->
            Error
    end.
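sys_dist is the ets table in which net_kernel tracks its connections. It can be inspected from a shell; the exact record layout varies between OTP releases, so take the shape below as illustrative only:

1> ets:tab2list(sys_dist).
[{connection, ...}]   % roughly one record per known node; fields elided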
% inet_tcp_dist.erl
setup(Node, Type, MyNode, LongOrShortNames, SetupTime) ->
    gen_setup(inet_tcp, Node, Type, MyNode, LongOrShortNames, SetupTime).

gen_setup(Driver, Node, Type, MyNode, LongOrShortNames, SetupTime) ->
    spawn_opt(?MODULE, do_setup,
              [Driver, self(), Node, Type, MyNode, LongOrShortNames, SetupTime],
              [link, {priority, max}]).

do_setup(Driver, Kernel, Node, Type, MyNode, LongOrShortNames, SetupTime) ->
    ......
    case ARMod:ARFun(Name, Address, AddressFamily) of
        {ok, Ip, TcpPort, Version} ->
            ......
            do_setup_connect(Driver, Kernel, Node, Address, AddressFamily,
                             Ip, TcpPort, Version, Type, MyNode, Timer);
        {ok, Ip} ->
            % the port is looked up via epmd first
            ......
            do_setup_connect(Driver, Kernel, Node, Address, AddressFamily, ......);
        _Other ->
            ......
    end.
do_setup_connect(Driver, Kernel, Node, Address, AddressFamily,
                 Ip, TcpPort, Version, Type, MyNode, Timer) ->
    ......
    % Driver = inet_tcp
    % start establishing a TCP connection to the target node
    case Driver:connect(Ip, TcpPort,
                        connect_options([{active, false}, {packet, 2}])) of
        {ok, Socket} ->
            % once connected, take over the socket and maintain the
            % connection just as the accepting side does
            HSData = ...
            dist_util:handshake_we_started(HSData);
        _ ->
            ...
            ?shutdown(Node)
    end.
With that, the initiating side is covered as well.
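From a shell, the whole initiating path above is exercised by an explicit connect (node name hypothetical):

1> net_kernel:connect_node('abc@host1').
true
2> nodes().
['abc@host1']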
5. Summary
Understanding how net_kernel works and configuring net_ticktime sensibly helps you avoid heartbeat storms on the network. Likewise, a well-chosen listening-port range makes it much easier to interconnect Erlang nodes running in Docker containers.
6. References
- https://github.com/erlang/otp/blob/master/lib/kernel/src/net_kernel.erl
- https://github.com/erlang/otp/blob/master/lib/kernel/src/inet_tcp_dist.erl
- https://github.com/erlang/otp/blob/master/lib/kernel/src/dist_util.erl
- https://github.com/erlang/otp/blob/master/lib/kernel/src/erl_distribution.erl