1、现象
游戏服务器与多个客户端建立连接,socket设置参数如下:[binary, {packet, 4}, {active, 0}, {reuseaddr, true}, {nodelay, false}, {packet_size, 4096}, {delay_send, true}, {send_timeout, 5000}, {keepalive, true}, {exit_on_close, true}]。其中绝大多数socket可以正常通信,只有一两个通信异常,表现为服务端无法发送消息。此时查看socket宿主进程的状态,发现进程的消息队列堆积,调用栈停在prim_inet:close_pend_loop/2处,陷入死循环。
2、查找过程
从进程的堆栈来看,进程执行到了gen_tcp:close/1,进入关闭流程,gen_tcp:close/1调用inet:tcp_close/1,最后执行prim_inet:close/1,其中prim模块部分代码如下
%% Close socket port S, first giving queued output a chance to be sent.
%% Calls subscribe(S, [subs_empty_out_q]); when the reply carries a count
%% N > 0 the close is deferred to close_pend_loop/2, and on any other reply
%% the port is closed immediately through close_port/1.
%% NOTE(review): N is presumably the amount of output still queued in the
%% driver -- confirm against the inet driver.
close(S) when is_port(S) ->
case subscribe(S, [subs_empty_out_q]) of
{ok, [{subs_empty_out_q,N}]} when N > 0 ->
close_pend_loop(S, N); %% wait for pending output to be sent
_ ->
close_port(S)
end.
%% Wait for the {empty_out_q, S} message before closing port S.
%% N is the pending-output figure obtained at the previous check.
%% Each time ?INET_CLOSE_TIMEOUT elapses without that message, send_pend is
%% re-read with getstat/2: an unchanged value (N1 =:= N), or any other
%% getstat reply, closes the port; a changed value restarts the wait with N1.
%% There is no cap on the number of rounds, so the calling process stays in
%% this function for as long as send_pend keeps changing between timeouts.
close_pend_loop(S, N) ->
receive
{empty_out_q,S} ->
close_port(S)
after ?INET_CLOSE_TIMEOUT ->
case getstat(S, [send_pend]) of
{ok, [{send_pend,N1}]} ->
if
N1 =:= N ->
%% send_pend did not change since the previous check: stop waiting
close_port(S);
true ->
%% send_pend changed: wait for another timeout period
close_pend_loop(S, N1)
end;
_ ->
close_port(S)
end
end.
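从代码可以看出,socket的关闭过程如下:先订阅subs_empty_out_q并取得输出队列中待发送的数据量N,也就是发送缓冲区中尚未发出的数据量;如果缓冲区内容已经全部发送完成,直接关闭socket,否则执行close_pend_loop/2。close_pend_loop/2如果在5秒(?INET_CLOSE_TIMEOUT)内没有等到{empty_out_q, S}消息,就比较send_pend字节数与上一次相比是否发生变化:没有变化则直接关闭socket;有变化则带着新的数值继续递归调用close_pend_loop/2。因此只要send_pend一直在变化,进程就会一直停留在close_pend_loop/2中,正好符合观测到的现象。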
3、现象解释和推测
socket设置了delay_send后,erts的inet_drv会先把要发送的消息暂存在发送缓冲区。close一个socket的时候,要保证:该socket缓冲区已无数据,或者socket已经无法发送数据。当服务端判定一个socket已经无效(可能通过心跳时间判断),准备关闭socket时,如果该socket还在发送数据,就会出现上述现象。至于为什么两端对socket是否有效的判定会出现差异,可能的原因有:1、客户端socket写出错但是还能读,所以心跳不能发送给服务端,服务端判定该socket失效需要关闭;2、客户端宿主进程退出,但是并没有清除socket,socket能接收不能发送。
4、解决方案
close一个socket如果在一定时间内不能得到准确的关闭结果,则关闭该socket的宿主进程,宿主进程关闭,虚拟机自动回收该socket。
5、试验论证
这里采用大牛的验证代码
$ cat test.erl
-module(test).
-compile(export_all).
%% Start the reproduction server on the default port (1234).
start() ->
    DefaultPort = 1234,
    start(DefaultPort).
%% Register the calling process under the module name, start a linked
%% acceptor process listening on Port, then block until any message
%% arrives and print it.
start(Port) ->
    register(?MODULE, self()),
    Acceptor = fun() -> accept(listen(Port)) end,
    spawn_link(Acceptor),
    %% to stop: test!stop.
    receive
        Msg ->
            io:format("~p~n", [Msg])
    end.
%% Open a passive, binary, raw-packet listening socket on Port and return it.
%% Crashes with badmatch if gen_tcp:listen/2 does not return {ok, Socket}.
listen(Port) ->
    ListenOpts = [
        {active, false},
        binary,
        {backlog, 256},
        {packet, raw},
        {reuseaddr, true}
    ],
    {ok, ListenSocket} = gen_tcp:listen(Port, ListenOpts),
    ListenSocket.
%% Accept connections on listening socket S forever.
%% Every accepted socket gets exit_on_close and delay_send enabled and is
%% handed to a new process running entry/1; any other accept result is
%% raised as an error.
accept(S) ->
    case gen_tcp:accept(S) of
        {ok, Conn} ->
            ConnOpts = [{exit_on_close, true}, {delay_send, true}],
            inet:setopts(Conn, ConnOpts),
            spawn_opt(?MODULE, entry, [Conn], []);
        Failure ->
            erlang:error(Failure)
    end,
    accept(S).
%% Per-connection process body: run the receive loop on socket S, then wait
%% up to one second for a stray message before printing a farewell line.
%% Returns ok (the result of the final io:format/2).
entry(S) ->
    loop(S),
    PostLoopWaitMs = 1000,
    check_empty_out_q_msg(PostLoopWaitMs),
    io:format("bye socket ~p~n", [S]).
%% Take the next message from the mailbox, waiting at most Timeout ms.
%% A received message is printed and returned; on timeout the atom `cont`
%% is returned.
check_empty_out_q_msg(Timeout) ->
    receive
        Msg ->
            io:format("bingo, got message ~p~n", [Msg]),
            Msg
    after Timeout ->
        cont
    end.
%% Receive loop for one accepted socket S.
%%  * Data starting with "start": send 1024*1024 "A" characters, pause for
%%    up to one second (any incoming message ends the pause early), repeat.
%%  * Any other data: read send_pend, shut down the write side, read
%%    send_pend again, print both values, repeat.
%%  * Any other recv result: print it and return it, ending the loop.
%% The mailbox is polled (without waiting) before every recv.
loop(S) ->
    check_empty_out_q_msg(0),
    case gen_tcp:recv(S, 0) of
        {ok, <<"start", _/binary>>} ->
            io:format("start to reproduce {empty_out_q, Port} message ~n"),
            Payload = lists:duplicate(1024 * 1024, "A"),
            gen_tcp:send(S, Payload),
            io:format("sent 1M bytes ~n"),
            io:format("sleep 1s ~n"),
            receive
                Interrupt -> Interrupt
            after 1000 ->
                cont
            end,
            loop(S);
        {ok, _Data} ->
            io:format("shutdown(write) ~n"),
            {ok, [{send_pend, PendBefore}]} = inet:getstat(S, [send_pend]),
            gen_tcp:shutdown(S, write),
            {ok, [{send_pend, PendAfter}]} = inet:getstat(S, [send_pend]),
            io:format("5s send_pend ~w/~w ~n", [PendBefore, PendAfter]),
            loop(S);
        Error ->
            io:format("tcp ~p~n", [Error]),
            Error
    end.
$ cat client.erl
-module(client).
-export([start/0]).
%% Client side of the reproduction: connect to localhost:1234, send "start",
%% read 1024 bytes of the reply, send "bang", pause for up to ten seconds
%% (any incoming message ends the pause early), then shut down the write
%% side of the socket. Returns ok.
start() ->
    ConnectOpts = [{active, false}],
    {ok, Sock} = gen_tcp:connect("localhost", 1234, ConnectOpts),
    gen_tcp:send(Sock, "start"),
    io:format("send start~n"),
    gen_tcp:recv(Sock, 1024),
    io:format("drain 1024 bytes~n"),
    gen_tcp:send(Sock, "bang"),
    io:format("send bang~n"),
    io:format("sleep 10s~n"),
    receive
        Msg -> Msg
    after 10000 ->
        cont
    end,
    gen_tcp:shutdown(Sock, write),
    io:format("end~n"),
    ok.
$ erlc test.erl client.erl
相关内容 http://blog.yufeng.info/archives/1489
http://wqtn22.iteye.com/blog/1765741