supervisor是一种很常用的erlang pattern,各种资料比较多,但是网上各种例子和说明都主要是针对one_for_one的restart strategies,simple_one_for_one的资料比较少。本文通过分析supervisor源码,说明simple_one_for_one的使用方法与运行机制。
what is the supervisor ?
引用
A behaviour module for implementing a supervisor, a process which supervises other processes called child processes.
简单的说,实现了supervisor的进程可以负责监控其他子进程,supervisor的子进程有4种重启策略,
引用
A supervisor can have one of the following restart strategies:
* one_for_one - if one child process terminates and should be
restarted, only that child process is affected.
* one_for_all - if one child process terminates and should be
restarted, all other child processes are terminated and then all
child processes are restarted.
* rest_for_one - if one child process terminates and should be
restarted, the 'rest' of the child processes -- i.e. the child pro-
cesses after the terminated child process in the start order -- are
terminated. Then the terminated child process and all child pro-
cesses after it are restarted.
* simple_one_for_one - a simplified one_for_one supervisor, where all
child processes are dynamically added instances of the same process
type, i.e. running the same code.
The functions terminate_child/2, delete_child/2 and restart_child/2
are invalid for simple_one_for_one supervisors and will return
{error, simple_one_for_one} if the specified supervisor uses this
restart strategy.
其中simple_one_for_one用来创建一组动态子进程,一个使用了simple_one_for_one 策略的supervisor可以类比为factory pattern,最常见的例子就是socket 服务端接受新的连接请求以后,需要创建新的socket连接进程。
假设需要通过supervisor在运行时动态创建container进程,每个container都实现gen_server behaviour,并作为一个独立的进程存在。
container.erl代码:
-module(container).
-behaviour(gen_server).
-export([start_link/1]).
-export([stop/1]).
-export([setup/0]).
-export([init/1, handle_call/3, handle_cast/2, handle_info/2,
terminate/2, code_change/3]).
-record(state, {name}).
%% @doc Starts the four demo containers "container1" .. "container4" as
%% children of the locally registered `container_sup' supervisor.
%% The results of the first three `supervisor:start_child/2' calls are
%% discarded; the result of the last call is returned.
setup() ->
    Earlier = ["container1", "container2", "container3"],
    lists:foreach(
        fun(Name) -> supervisor:start_child(container_sup, [Name]) end,
        Earlier
    ),
    supervisor:start_child(container_sup, ["container4"]).
%% @doc Synchronously sends a `{stop}' call to the container registered under
%% the atom form of `ContainerName' (a string) and returns the reply.
%% The `{stop}' clause of handle_call/3 in this module replies `ok'.
stop(ContainerName) ->
    RegisteredName = list_to_atom(ContainerName),
    gen_server:call(RegisteredName, {stop}).
%% @doc Starts a container gen_server linked to the caller.
%% `Name' must be a string; the process is registered locally under the atom
%% built from it, and `Name' itself is passed on to init/1.
start_link(Name) when is_list(Name) ->
    ServerName = {local, list_to_atom(Name)},
    gen_server:start_link(ServerName, ?MODULE, Name, []).
%% @doc gen_server callback: builds the initial state, remembering the
%% container's name as given to start_link/1.
init(Name) ->
    InitialState = #state{name = Name},
    {ok, InitialState}.
%% @doc gen_server callback for synchronous requests.
%% `{stop}' replies `ok' to the caller and stops the server with reason
%% `normal'; every other request is answered with `ok' and leaves the state
%% unchanged.
%% The caller reference is not needed in either clause, so it is bound to
%% `_From' (the original `From' triggered an unused-variable warning).
handle_call({stop}, _From, State) ->
    {stop, normal, ok, State};
handle_call(_Request, _From, State) ->
    {reply, ok, State}.
%% @doc gen_server callback for asynchronous requests.
%% No casts are defined for a container: every message is ignored and the
%% state is returned unchanged.
handle_cast(_Request, State) ->
    {noreply, State}.
%% @doc gen_server callback for out-of-band messages.
%% Every message is dropped and the state is returned unchanged. The message
%% is bound to `_Info' because it is never inspected (the original `Info'
%% triggered an unused-variable warning).
handle_info(_Info, State) ->
    {noreply, State}.
%% @doc gen_server callback invoked on shutdown.
%% A container holds no resources that need releasing, so this only
%% returns `ok'.
terminate(_Why, _FinalState) ->
    ok.
%% @doc gen_server callback for hot code upgrade.
%% The state needs no conversion and is passed through as is.
code_change(_FromVsn, State, _UpgradeData) ->
    {ok, State}.
container_sup.erl代码:
-module(container_sup).
-behaviour(supervisor).
-export([start_link/0]).
-export([init/1]).
-define(SERVER, ?MODULE).
%% @doc Starts the container supervisor linked to the caller and registers it
%% locally under the name given by the `?SERVER' macro.
start_link() ->
    SupName = {local, ?SERVER},
    supervisor:start_link(SupName, ?MODULE, []).
%% @doc supervisor callback: returns the supervisor flags and the single
%% child specification used as the template for every dynamically started
%% container.
%% The start function is `container:start_link' with an empty base argument
%% list; the arguments given to `supervisor:start_child/2' are appended to it.
%% Children are `transient' workers with a shutdown value of 2000.
init([]) ->
    StartFunc = {container, start_link, []},
    ContainerSpec = {container, StartFunc, transient, 2000, worker, [container]},
    SupFlags = {simple_one_for_one, 10, 100},
    {ok, {SupFlags, [ContainerSpec]}}.
1 how to start the container_sup ?
1.1 start supervisor
supervisor:start_link({local, ?MODULE}, ?MODULE, []).
1.2 supervisor:start_link调用gen_server:start_link
%% Excerpt from OTP's supervisor module.
%% The supervisor process is started as a gen_server whose callback module is
%% `supervisor'; the user's callback module and its arguments are handed on
%% inside the {SupName, Mod, Args} tuple given as the init argument.
start_link(SupName, Mod, Args) ->
gen_server:start_link(SupName, supervisor, {SupName, Mod, Args}, []).
1.3 gen_server:start_link最终回调supervisor:init(具体请参考
http://uniseraph.iteye.com/blog/372838)
1.4 supervisor:init
%% Excerpt from OTP's supervisor module: gen_server init callback of the
%% supervisor process.
%% Turns on exit trapping, asks the user's module for its flags and start
%% specification via Mod:init(Args), builds the internal state and then
%% initialises the children.
%% Returns `ignore' when Mod:init/1 does, and a `{stop, ...}' tuple when
%% Mod:init/1 returns anything unexpected or init_state/4 fails.
init({SupName, Mod, Args}) ->
process_flag(trap_exit, true),
case Mod:init(Args) of
{ok, {SupFlags, StartSpec}} ->
case init_state(SupName, SupFlags, Mod, Args) of
%% ?is_simple is defined outside this excerpt; presumably it tests for the
%% simple_one_for_one strategy.
{ok, State} when ?is_simple(State) ->
init_dynamic(State, StartSpec);
{ok, State} ->
init_children(State, StartSpec);
Error ->
{stop, {supervisor_data, Error}}
end;
ignore ->
ignore;
Error ->
{stop, {bad_return, {Mod, init, Error}}}
end.
1.4.1 设置进程标志trap_exit,这样当子进程退出时supervisor会收到一个{'EXIT', Pid, Reason}消息
process_flag(trap_exit, true),
1.4.2 回调container_sup:init获得container_sup的子进程规约
1.4.3 根据不同的restart strategies调用相应的init_xxx方法
case init_state(SupName, SupFlags, Mod, Args) of
{ok, State} when ?is_simple(State) ->
init_dynamic(State, StartSpec);
{ok, State} ->
init_children(State, StartSpec);
Error ->
{stop, {supervisor_data, Error}}
end;
1.4.4 init_dynamic检查子进程规约的合法性,并不真正启动子进程
%% Excerpt from OTP's supervisor module.
%% Accepts exactly one start specification: it is validated with
%% check_startspec/1 and the result is stored in the state's `children'
%% field. No child start function is called in this function.
%% Any other shape of start specification list is rejected with
%% `bad_start_spec'.
init_dynamic(State, [StartSpec]) ->
case check_startspec([StartSpec]) of
{ok, Children} ->
{ok, State#state{children = Children}};
Error ->
{stop, {start_spec, Error}}
end;
init_dynamic(_State, StartSpec) ->
{stop, {bad_start_spec, StartSpec}}.
2 how to add a container process ?
2.1
supervisor:start_child(container_sup,["container1"]),
2.2 supervisor:start_child触发一个start_child事件,在supervisor中处理
%% Excerpt from OTP's supervisor module: the start_child clause taken when
%% the ?is_simple(State) guard holds.
%% The first entry in `children' supplies the start function; the caller's
%% extra arguments EArgs are appended to its base argument list A.
%% On success the new Pid is recorded in `dynamics' together with the full
%% argument list; any other result is replied as is and the state is kept.
handle_call({start_child, EArgs}, _From, State) when ?is_simple(State) ->
#child{mfa = {M, F, A}} = hd(State#state.children),
Args = A ++ EArgs,
case do_start_child_i(M, F, Args) of
{ok, Pid} ->
NState = State#state{dynamics =
?DICT:store(Pid, Args, State#state.dynamics)},
{reply, {ok, Pid}, NState};
{ok, Pid, Extra} ->
NState = State#state{dynamics =
?DICT:store(Pid, Args, State#state.dynamics)},
{reply, {ok, Pid, Extra}, NState};
What ->
{reply, What, State}
end;
2.3 do_start_child_i根据子进程配置规约回调container:start_link方法,并传入参数"container1"
2.4 记录子进程Pid
3 how to stop a container process ?
引用
The functions terminate_child/2, delete_child/2 and restart_child/2
are invalid for simple_one_for_one supervisors and will return
{error, simple_one_for_one} if the specified supervisor uses this
restart strategy.
3.1 container进程处理一个{stop}消息
handle_call( {stop } , From, State) ->
{ stop , normal , ok , State };
3.2 container_sup收到一个{'EXIT', Pid, Reason}消息,在handle_info中处理
%% Excerpt from OTP's supervisor module: handles the exit signal of a
%% process, delivered as a message because the supervisor traps exits.
%% The decision is delegated to restart_child/3; if it answers `shutdown'
%% the supervisor itself stops with reason `shutdown'.
handle_info({'EXIT', Pid, Reason}, State) ->
case restart_child(Pid, Reason, State) of
{ok, State1} ->
{noreply, State1};
{shutdown, State1} ->
{stop, shutdown, State1}
end;
3.3 restart_child根据pid在state中查找并restart这个进程
%% Excerpt from OTP's supervisor module: the restart_child clause taken when
%% the ?is_simple(State) guard holds.
%% Looks the Pid up in `dynamics' to recover the argument list it was stored
%% with, builds a child record from the single template child with that Pid
%% and those arguments, and hands it to do_restart/4.
%% A Pid that is not in `dynamics' leaves the state unchanged.
restart_child(Pid, Reason, State) when ?is_simple(State) ->
case ?DICT:find(Pid, State#state.dynamics) of
{ok, Args} ->
[Child] = State#state.children,
RestartType = Child#child.restart_type,
{M, F, _} = Child#child.mfa,
NChild = Child#child{pid = Pid, mfa = {M, F, Args}},
do_restart(RestartType, Reason, NChild, State);
error ->
{ok, State}
end;
3.4 因为Reason=normal并且container的restart type是transient,所以匹配do_restart(_, normal, Child, State)子句,这个container退出后不再重启。
%% Excerpt from OTP's supervisor module: decides what happens to a
%% terminated child, based on its restart type and exit reason.
%% Clause order matters:
%%   * permanent: always reported and restarted, whatever the reason;
%%   * any other type exiting with `normal' or `shutdown': removed from the
%%     state without a report and without a restart;
%%   * transient with any other reason: reported and restarted;
%%   * temporary with any other reason: reported and removed.
do_restart(permanent, Reason, Child, State) ->
report_error(child_terminated, Reason, Child, State#state.name),
restart(Child, State);
do_restart(_, normal, Child, State) ->
NState = state_del_child(Child, State),
{ok, NState};
do_restart(_, shutdown, Child, State) ->
NState = state_del_child(Child, State),
{ok, NState};
do_restart(transient, Reason, Child, State) ->
report_error(child_terminated, Reason, Child, State#state.name),
restart(Child, State);
do_restart(temporary, Reason, Child, State) ->
report_error(child_terminated, Reason, Child, State#state.name),
NState = state_del_child(Child, State),
{ok, NState}.