Erlang node interconnection: net_kernel

Under the net_sup supervisor, net_kernel is started after erl_epmd and auth.

Its main job is to open a listening socket, register it with epmd, and keep a periodic heartbeat (ticker) going with the other connected nodes. The diagram below shows net_kernel's process topology:

net_kernel --- | --- TCP accept process (inet_tcp_dist:accept_loop/3,  prim_inet:accept0/3)
               | --- ticker             (net_kernel:ticker/2,          net_kernel:ticker_loop/2)
               | --- connection 1       (inet_tcp_dist:do_accept/7,    dist_util:con_loop/2)
               | --- connection 2       (inet_tcp_dist:do_accept/7,    dist_util:con_loop/2)
               | --- ......             (initial_call, current_function)
               ......
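
On a live node, the processes in this diagram can be verified by listing what net_kernel is linked to; a minimal shell sketch (the output shape is illustrative, port identifiers are filtered out):

% run in the shell of a distributed node
{links, Linked} = erlang:process_info(whereis(net_kernel), links),
[{P, erlang:process_info(P, initial_call), erlang:process_info(P, current_function)}
 || P <- Linked, is_pid(P)].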

1. inet_dist_listen_min / inet_dist_listen_max: restricting the node's listen port

Let's first look at how epmd is used. Documentation: http://erlang.org/doc/man/epmd.html

The job of the epmd daemon is to keep track of which node name listens on which address

epmd maintains the mapping from node name to listen port. With that mapping, any two nodes can connect to each other given a host and the listen port, and an rpc is then just a message sent over that TCP connection.

A simple epmd example:

$ epmd -names
epmd: up and running on port 4369 with data:
name abc at port 30508 
# a node [email protected] was started here with: erl -name [email protected]
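
The same registry can be queried from inside Erlang through net_adm:names/0,1; a small sketch (host name and output are illustrative):

% ask the local epmd which names it knows
{ok, Names} = net_adm:names().     % e.g. {ok, [{"abc", 30508}]}
% or ask the epmd running on another host
net_adm:names("db.pipi.com").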

The relevant implementation code:

% inet_tcp_dist.erl
% Driver = inet_tcp
do_listen(Driver, Options) ->
  {First, Last} = case application:get_env(kernel, inet_dist_listen_min) of
                    {ok, N} when is_integer(N) ->
                      case application:get_env(kernel, inet_dist_listen_max) of
                        {ok, M} when is_integer(M) ->
                          {N, M};
                        _ ->
                          {N, N}
                      end;
                    _ ->
                      {0, 0}
                  end,
  % first read the two configuration parameters
  do_listen(Driver, First, Last, listen_options([{backlog, 128} | Options])).

do_listen(_Driver, First, Last, _) when First > Last ->
  {error, eaddrinuse};
do_listen(Driver, First, Last, Options) ->
  % if inet_dist_listen_min/inet_dist_listen_max are not set, First = 0 and the listen port is random (ephemeral)
  case Driver:listen(First, Options) of
    {error, eaddrinuse} ->
      % otherwise, pick the smallest free port within [min, max]
      do_listen(Driver, First + 1, Last, Options);
    Other ->
      Other
  end.

At first glance these two parameters do not look very useful, since machines on the same internal network can usually reach each other without firewall restrictions. But when services are deployed in Docker containers, instances on different hosts cannot reach each other unless they are placed on the same network; that is where these two settings come in handy, because a fixed port range can be exposed and mapped explicitly.
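
A sketch of setting the two parameters; the port range below is arbitrary and must match whatever the container actually exposes:

% sys.config
[{kernel, [{inet_dist_listen_min, 9100},
           {inet_dist_listen_max, 9105}]}].

% or on the command line:
%   erl -name [email protected] -kernel inet_dist_listen_min 9100 -kernel inet_dist_listen_max 9105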

2. net_ticktime: the network heartbeat time

% erl_distribution.erl
% read the configured network heartbeat time
ticktime() ->
  case catch application:get_env(net_ticktime) of
    {ok, Value} when is_integer(Value), Value > 0 ->
      [Value * 250]; %% i.e. 1000 / 4 = 250 ms.
    _ ->
      []
  end.

% net_kernel.erl
init({Name, LongOrShortNames, TickT, CleanHalt}) ->
  process_flag(trap_exit, true),
  case init_node(Name, LongOrShortNames, CleanHalt) of
    {ok, Node, Listeners} ->
      % net_kernel runs at max priority (the ticker spawned below does as well)
      process_flag(priority, max),
      Ticktime = to_integer(TickT),
      Ticker = spawn_link(net_kernel, ticker, [self(), Ticktime]),
      .....
    Error ->
      {stop, Error}
  end.
  % ===== the heartbeat (ticker) process
 ticker(Kernel, Tick) when is_integer(Tick) ->
  process_flag(priority, max),
  ?tckr_dbg(ticker_started),
  ticker_loop(Kernel, Tick).

ticker_loop(Kernel, Tick) ->
  receive
    {new_ticktime, NewTick} ->
      ?tckr_dbg({ticker_changed_time, Tick, NewTick}),
      ?MODULE:ticker_loop(Kernel, NewTick)
  after Tick ->
    % under normal conditions, send a tick message to net_kernel every Tick milliseconds
    Kernel ! tick,
    ?MODULE:ticker_loop(Kernel, Tick)
  end.
  
 handle_info(tick, State) ->
 % handle the message from the ticker: forward a tick to every connection owner
  lists:foreach(fun({Pid, _Node}) -> Pid ! {self(), tick} end, State#state.conn_owners),
  {noreply, State};
  
 % dist_util.erl
 send_tick(DHandle, Socket, Tick, Type, MFTick, MFGetstat) ->
  #tick{tick = T0,read = Read,write = Write,ticked = Ticked0} = Tick,
  T = T0 + 1,
  T1 = T rem 4,
  case getstat(DHandle, Socket, MFGetstat) of
    {ok, Read, _, _} when Ticked0 =:= T ->
      % the peer has not responded for a full tick cycle: the connection is considered broken
      {error, not_responding};
    ......

    Error ->
      Error
  end.
Two things worth noting:

  1. The heartbeat (ticker) process runs at the highest priority.
  2. Every pair of connected nodes in the cluster keeps its own heartbeat going, so adjust the tick time with care.
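
For reference, a sketch of configuring net_ticktime and adjusting it on a running node (values are illustrative):

% sys.config: the ticker fires every net_ticktime/4 seconds (here 60/4 = 15 s)
[{kernel, [{net_ticktime, 60}]}].

% on a running node:
net_kernel:get_net_ticktime().     % -> 60
net_kernel:set_net_ticktime(120).  % the change is propagated to connected nodes gradually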

3. net_kernel:monitor_nodes(...)

monitor_nodes(Flag) ->
  case catch process_flag(monitor_nodes, Flag) of
    N when is_integer(N) -> ok;
    _ -> mk_monitor_nodes_error(Flag, [])
  end.
monitor_nodes(Flag, Opts) ->
  case catch process_flag({monitor_nodes, Opts}, Flag) of
    N when is_integer(N) -> ok;
    _ -> mk_monitor_nodes_error(Flag, Opts)
  end.

monitor_nodes is really just a thin Erlang wrapper: the actual work is done by process_flag/2 with the monitor_nodes flag, which is implemented in the emulator's C code; the author has not looked into that part in detail.
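
A minimal usage sketch: after subscribing, the calling process receives nodeup/nodedown messages for nodes joining or leaving the cluster:

% typically called from a long-lived process, e.g. a gen_server's init/1
ok = net_kernel:monitor_nodes(true),
receive
    {nodeup, Node}   -> io:format("node up: ~p~n", [Node]);
    {nodedown, Node} -> io:format("node down: ~p~n", [Node])
end.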

4. How connections are managed

4.1 Listening for connections (listen)
% net_kernel.erl
start_protos(Name, [Proto | Ps], Node, Ls, CleanHalt) ->
  Mod = list_to_atom(Proto ++ "_dist"),
  % Mod = inet_tcp_dist
  % open a new listening socket and register it with epmd
  case catch Mod:listen(Name) of
    {ok, {Socket, Address, Creation}} ->
      case set_node(Node, Creation) of
        ok ->
          % spawn a dedicated accept process
          AcceptPid = Mod:accept(Socket),
          auth:sync_cookie(),
          L = #listen{
            listen = Socket,
            address = Address,
            accept = AcceptPid,
            module = Mod},
          start_protos(Name, Ps, Node, [L | Ls], CleanHalt);
        _ ->
          Mod:close(Socket),
          S = "invalid node name: " ++ atom_to_list(Node),
          proto_error(CleanHalt, Proto, S),
          start_protos(Name, Ps, Node, Ls, CleanHalt)
      end;
    ......
  end;
start_protos(_, [], _Node, Ls, _CleanHalt) ->
  Ls.



% now let's look at the accept function mentioned above
% inet_tcp_dist.erl
accept(Listen) ->
  gen_accept(inet_tcp, Listen).

gen_accept(Driver, Listen) ->
  % spawn a dedicated process that accepts incoming sockets
  spawn_opt(?MODULE, accept_loop, [Driver, self(), Listen], [link, {priority, max}]).

accept_loop(Driver, Kernel, Listen) ->
  case Driver:accept(Listen) of
    {ok, Socket} ->
      Kernel ! {accept, self(), Socket, Driver:family(), tcp},
      
      _ = controller(Driver, Kernel, Socket),
      accept_loop(Driver, Kernel, Listen);
    Error ->
      exit(Error)
  end.
 
 
 
 
 % net_kernel.erl
 %% a new connection has been accepted
 handle_info({accept, AcceptPid, Socket, Family, Proto}, State) ->
  MyNode = State#state.node,
  case get_proto_mod(Family, Proto, State#state.listen) of
    {ok, Mod} ->
      Pid = Mod:accept_connection(AcceptPid,
        Socket,
        MyNode,
        State#state.allowed,
        State#state.connecttime),
       % hand control of the socket over to Pid
      AcceptPid ! {self(), controller, Pid},
      {noreply, State};
    _ ->
      AcceptPid ! {self(), unsupported_protocol},
      {noreply, State}
  end;
  
  
 % inet_tcp_dist.erl
 % take over the socket and hand it to the process that owns this connection
 controller(Driver, Kernel, Socket) ->
  receive
    {Kernel, controller, Pid} ->
      
      flush_controller(Pid, Socket),
      % Driver = inet_tcp
      % on net_kernel's instruction, transfer socket ownership to Pid
      Driver:controlling_process(Socket, Pid),
      flush_controller(Pid, Socket),
      Pid ! {self(), controller};
    {Kernel, unsupported_protocol} ->
      exit(unsupported_protocol)
  end.
  
 do_accept(Driver, Kernel, AcceptPid, Socket, MyNode, Allowed, SetupTime) ->
  receive
    {AcceptPid, controller} ->
      Timer = dist_util:start_timer(SetupTime),
      case check_ip(Driver, Socket) of
        true ->
          HSData = .......
          %% the real entry point of the connection loop
          dist_util:handshake_other_started(HSData);
        {false, IP} ->
          error_msg("** Connection attempt from "
          "disallowed IP ~w ** ~n", [IP]),
          ?shutdown(no_node)
      end
  end.
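
The handoff above (flush_controller plus controlling_process) is the standard gen_tcp ownership-transfer pattern. A self-contained sketch of the same idea, not net_kernel's actual code:

% the acceptor owns the socket first, then hands it to a dedicated owner process
{ok, LSock} = gen_tcp:listen(0, [binary, {active, false}]),
{ok, Sock}  = gen_tcp:accept(LSock),            % blocks until a peer connects
Owner = spawn(fun() ->
                receive {go, S} -> inet:setopts(S, [{active, true}]) end
              end),
ok = gen_tcp:controlling_process(Sock, Owner),  % Owner now receives the socket's messages
Owner ! {go, Sock}.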
  
  
  
  % dist_util.erl
  % the loop that maintains the connection between a pair of nodes
 con_loop({Kernel, Node, Socket, Type, DHandle, MFTick, MFGetstat, MFSetOpts, MFGetOpts} = ConData,
    Tick) ->
  receive
    {tcp_closed, Socket} ->
      ?shutdown2(Node, connection_closed);
    {Kernel, disconnect} ->
      ?shutdown2(Node, disconnected);
    {Kernel, tick} ->
      % in the two cases above this process exits; net_kernel is then notified and removes the connection
      % time to send a heartbeat
      case send_tick(DHandle, Socket, Tick, Type, MFTick, MFGetstat) of
        {ok, NewTick} ->
          con_loop(ConData, NewTick);
        {error, not_responding} ->
          error_msg("** Node ~p not responding **~n"
          "** Removing (timedout) connection **~n",
            [Node]),
          ?shutdown2(Node, net_tick_timeout);
        _Other ->
          ?shutdown2(Node, send_net_tick_failed)
      end;
    ......
  end.

This completes the analysis of the accepting (listening) side.

4.2 Establishing a connection (connect)

Every connection has an initiating side and an accepting side. The accepting side was analyzed above; now let's look at the initiating side.
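
In practice the connection is set up either by an explicit call or as a side effect of the first operation that touches the remote node; both paths end up in the {connect, ...} handle_call below (the node name is illustrative):

% explicit connect
true = net_kernel:connect_node('[email protected]'),
% or implicitly, e.g. via a ping / rpc / message send
pong = net_adm:ping('[email protected]').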

% net_kernel.erl
handle_call({connect, Type, Node}, From, State) ->
  verbose({connect, Type, Node}, 1, State),
  ConnLookup = ets:lookup(sys_dist, Node),
  % actively initiate a connection to Node
  R = try erts_internal:new_connection(Node) of
        ConnId ->
          R1 = do_explicit_connect(ConnLookup, Type, Node, ConnId, From, State),
          case R1 of
            {reply, true, _S} -> %% already connected
              ok;
            {noreply, _S} -> %% connection pending
              ok;
            {reply, false, _S} -> %% connection refused
              erts_internal:abort_connection(Node, ConnId)
          end,
          R1
      catch
        _:_ ->
          ......
          {reply, false, State}
      end,
  return_call(R, From);
 
 % when this is a brand-new connection
do_explicit_connect(_ConnLookup, Type, Node, ConnId, From, State) ->
  case setup(Node, ConnId, Type, From, State) of
    {ok, SetupPid} ->
      Owners = [{SetupPid, Node} | State#state.conn_owners],
      {noreply, State#state{conn_owners = Owners}};
    _Error ->
      ?connect_failure(Node, {setup_call, failed, _Error}),
      {reply, false, State}
  end.

setup(Node, ConnId, Type, From, State) ->
  case setup_check(Node, State) of
    {ok, L} ->
      Mod = L#listen.module,
      LAddr = L#listen.address,
      MyNode = State#state.node,
      % Mod = inet_tcp_dist
      % set up the connection (spawns a setup process)
      Pid = Mod:setup(Node,Type,MyNode, State#state.type,State#state.connecttime),
      Addr = LAddr#net_address{address = undefined,host = undefined},
      Waiting = ...
      % record the connection in the sys_dist table
      ets:insert(sys_dist, ....),
      {ok, Pid};
    Error ->
      Error
  end.
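
sys_dist here is net_kernel's internal (named, protected) ets table of known connections; for debugging it can be peeked at on a running node, though the record layout is internal and may change between OTP versions:

% dump net_kernel's connection table (internal format, debugging only)
ets:tab2list(sys_dist).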
  
  
% inet_tcp_dist.erl
setup(Node, Type, MyNode, LongOrShortNames, SetupTime) ->
  gen_setup(inet_tcp, Node, Type, MyNode, LongOrShortNames, SetupTime).

gen_setup(Driver, Node, Type, MyNode, LongOrShortNames, SetupTime) ->
  spawn_opt(?MODULE, do_setup,
    [Driver, self(), Node, Type, MyNode, LongOrShortNames, SetupTime],
    [link, {priority, max}]).

do_setup(Driver, Kernel, Node, Type, MyNode, LongOrShortNames, SetupTime) ->
  ......
  case ARMod:ARFun(Name, Address, AddressFamily) of
    {ok, Ip, TcpPort, Version} ->
      ......
      do_setup_connect(Driver, Kernel, Node, Address, AddressFamily, Ip, TcpPort, Version, Type, MyNode, Timer);
    {ok, Ip} ->
        ......
       do_setup_connect(Driver, Kernel, Node, Address, AddressFamily, ......);
    _Other ->
     ......
  end.
  
 do_setup_connect(Driver, Kernel, Node, Address, AddressFamily,Ip, TcpPort, Version, Type, MyNode, Timer) ->
  ......
  % Driver = inet_tcp
  % establish a TCP connection to the target node
  case
    Driver:connect( Ip, TcpPort, connect_options([{active, false}, {packet, 2}]))
  of
    {ok, Socket} ->
      % once connected, take over the socket and maintain the connection just like the accepting side does
      HSData = ...
      dist_util:handshake_we_started(HSData);
    _ ->
      ...
      ?shutdown(Node)
  end.

At this point the initiating side has been analyzed as well.

5. Summary

Understanding how net_kernel works and choosing a sensible net_ticktime helps avoid a network heartbeat storm, since every pair of connected nodes keeps its own heartbeat going.

Restricting the distribution listen ports (inet_dist_listen_min/max) makes it much easier to interconnect nodes running in Docker containers.

6. References

  • https://github.com/erlang/otp/blob/master/lib/kernel/src/net_kernel.erl
  • https://github.com/erlang/otp/blob/master/lib/kernel/src/inet_tcp_dist.erl
  • https://github.com/erlang/otp/blob/master/lib/kernel/src/dist_util.erl
  • https://github.com/erlang/otp/blob/master/lib/kernel/src/erl_distribution.erl
