在R15B01上,遇到一个由gen_tcp/gen_udp:controlling_process(Port, self())导致的port泄露问题,下列链接详细地说明了产生问题的步骤:
https://github.com/erlang/otp/commit/944a57a11a79c5a9bb2f554c921e2e00e7d56c91
该问题在R15B03得到了修复,此处分析这个问题如下:
1> {ok,Port} = gen_udp:open(9000, [binary]).
{ok,#Port<0.581>}
2> i(0,31,0).
[{current_function,{c,pinfo,1}},
{initial_call,{erlang,apply,2}},
{status,running},
{message_queue_len,0},
{messages,[]},
{links,[<0.25.0>,#Port<0.581>]},
{dictionary,[]},
{trap_exit,false},
{error_handler,error_handler},
{priority,normal},
{group_leader,<0.24.0>},
{total_heap_size,5168},
{heap_size,2584},
{stack_size,27},
{reductions,3814},
{garbage_collection,[{min_bin_vheap_size,46368},
{min_heap_size,233},
{fullsweep_after,65535},
{minor_gcs,1}]},
{suspending,[]}]
此时,这个新建的port已经关联到shell进程<0.31.0>上。
3> gen_udp:controlling_process(Port, self()).
ok
4> i(0,31,0).
[{current_function,{c,pinfo,1}},
{initial_call,{erlang,apply,2}},
{status,running},
{message_queue_len,0},
{messages,[]},
{links,[<0.25.0>]},
{dictionary,[]},
{trap_exit,false},
{error_handler,error_handler},
{priority,normal},
{group_leader,<0.24.0>},
{total_heap_size,5168},
{heap_size,2584},
{stack_size,27},
{reductions,8889},
{garbage_collection,[{min_bin_vheap_size,46368},
{min_heap_size,233},
{fullsweep_after,65535},
{minor_gcs,7}]},
{suspending,[]}]
controlling_process到自身后,到port的link消失了。
分析这个过程的代码,发现如下问题:
gen_udp.erl
%% Hands the UDP socket S over to the process NewOwner.
%% Pure delegation: the result of inet:udp_controlling_process/2 is returned
%% unchanged.
controlling_process(S, NewOwner) ->
inet:udp_controlling_process(S, NewOwner).
inet.erl
%% Hands the UDP port S over to the process NewOwner (R15B01 version).
%%
%% Returns:
%%   {error, not_owner} - the port reports a connected process other than the
%%                        caller;
%%   ok                 - erlang:port_connect/2 returned true and the hand-over
%%                        steps below were carried out;
%%   {error, Reason}    - erlang:port_connect/2 raised an error exception.
%%
%% NOTE(review): there is no special case for NewOwner =:= self(). In that
%% case the unlink/1 call below still removes the caller's link to the port.
udp_controlling_process(S, NewOwner) when is_port(S), is_pid(NewOwner) ->
case erlang:port_info(S, connected) of
{connected, Pid} when Pid =/= self() ->
%% Only the currently connected process may give the port away.
{error, not_owner};
_ ->
%% Save the current active mode, then switch it off for the hand-over.
{ok, A0} = prim_inet:getopt(S, active),
prim_inet:setopt(S, active, false),
%% udp_sync_input/2 is defined elsewhere; presumably it passes input that
%% already reached the caller on to NewOwner -- TODO confirm.
udp_sync_input(S, NewOwner),
try erlang:port_connect(S, NewOwner) of
true ->
%% Drop the caller's own link to the port now that NewOwner is connected.
unlink(S),
%% Restore the active mode saved in A0.
prim_inet:setopt(S, active, A0),
ok
catch
error:Reason ->
{error, Reason}
end
end.
一次controlling_process包括两个动作:首先在新进程上port_connect这个port,接着在原进程上unlink这个port。port_connect与unlink均为bif函数。
/*
 * BIF implementing port_connect/2: makes the process given as BIF_ARG_2 the
 * connected process of the port given as BIF_ARG_1.
 *
 * Fails with BADARG when BIF_ARG_2 is not an internal pid, when the port
 * lookup fails, or when the process lookup fails. Otherwise returns am_true.
 */
BIF_RETTYPE port_connect_2(BIF_ALIST_2)
{
Port* prt;
Process* rp;
Eterm pid = BIF_ARG_2;
if (is_not_internal_pid(pid)) {
error:
BIF_ERROR(BIF_P, BADARG);
}
prt = id_or_name2port(BIF_P, BIF_ARG_1);
if (!prt) {
goto error;
}
/* Look up the target process, requesting ERTS_PROC_LOCK_LINK on it. */
rp = erts_pid2proc(BIF_P, ERTS_PROC_LOCK_MAIN,
pid, ERTS_PROC_LOCK_LINK);
if (!rp) {
/* The port was obtained above, so release it before reporting BADARG. */
erts_smp_port_unlock(prt);
ERTS_SMP_ASSERT_IS_NOT_EXITING(BIF_P);
goto error;
}
/*
 * Add both halves of the link: port id into the process's link tree and pid
 * into the port's link tree. The return values are not checked, so a -1 from
 * erts_add_link (an entry with an equal key already exists) goes unnoticed.
 */
erts_add_link(&(rp->nlinks), LINK_PID, prt->id);
erts_add_link(&(prt->nlinks), LINK_PID, pid);
erts_smp_proc_unlock(rp, ERTS_PROC_LOCK_LINK);
prt->connected = pid; /* internal pid */
erts_smp_port_unlock(prt);
#ifdef USE_VM_PROBES
/* NOTE(review): prt is read here after erts_smp_port_unlock(prt), and
 * prt->connected already holds the new pid at this point -- confirm that
 * process_str is meant to describe the new owner. */
if (DTRACE_ENABLED(port_connect)) {
DTRACE_CHARBUF(process_str, DTRACE_TERM_BUF_SIZE);
DTRACE_CHARBUF(port_str, DTRACE_TERM_BUF_SIZE);
DTRACE_CHARBUF(newprocess_str, DTRACE_TERM_BUF_SIZE);
dtrace_pid_str(prt->connected, process_str);
erts_snprintf(port_str, sizeof(port_str), "%T", prt->id);
dtrace_proc_str(rp, newprocess_str);
DTRACE4(port_connect, process_str, port_str, prt->name, newprocess_str);
}
#endif
BIF_RET(am_true);
}
port_connect将在port与新进程上均添加一个link进行关联,但是存在一个问题。
/*
 * Inserts a link entry of the given type, keyed by pid, into the tree rooted
 * at *root.
 *
 * Returns 0 after a new node has been created and inserted.
 * Returns -1, without inserting anything, when the tree already holds an
 * entry whose key compares equal to pid.
 */
int erts_add_link(ErtsLink **root, Uint type, Eterm pid)
{
/* tstack/dstack record the visited slots and the direction taken at each
 * step; both are handed to insertion_rotation() after the insert. */
void *tstack[STACK_NEED];
int tpos = 0;
int dstack[STACK_NEED+1];
int dpos = 1;
int state = 0;
ErtsLink **this = root;
Sint c;
/* Sentinel marking the bottom of the direction stack. */
dstack[0] = DIR_END;
for (;;) {
if (!*this) { /* Found our place */
state = 1;
*this = create_link(type,pid);
break;
} else if ((c = CMP(pid,(*this)->pid)) < 0) {
/* go left */
dstack[dpos++] = DIR_LEFT;
tstack[tpos++] = this;
this = &((*this)->left);
} else if (c > 0) { /* go right */
dstack[dpos++] = DIR_RIGHT;
tstack[tpos++] = this;
this = &((*this)->right);
} else { /* Equal key is an error for monitors */
/* The existing entry is left untouched; callers must check for -1. */
return -1;
}
}
insertion_rotation(dstack, dpos, tstack, tpos, state);
return 0;
}
在erts_add_link中,会遍历进程/port的link表,如果发现link节点已经在自身的link表中,则返回-1,但是上层调用者port_connect_2并未检查这个返回值,而是继续执行并照常返回true。对于controlling_process到自身的场景,port_connect返回后会立即调用unlink。
/*
 * BIF implementing unlink/1. Only the branches that deal with a port argument
 * are shown; the remainder of the function is elided below.
 *
 * For an internal port the link is removed from both sides (the calling
 * process's link tree and, when the port lookup succeeds, the port's link
 * tree) and am_true is returned.
 */
BIF_RETTYPE unlink_1(BIF_ALIST_1)
{
Process *rp;
DistEntry *dep;
ErtsLink *l = NULL, *rl = NULL;
/*
* SMP specific note concerning incoming exit signals:
* We have to have at least the status lock during removal of
* the link half on current process, and check for and handle
* a present pending exit while the status lock is held. This
* in order to ensure that we wont be exited by a link after
* it has been removed.
*
* (We also have to have the link lock, of course, in order to
* be allowed to remove the link...)
*/
if (IS_TRACED_FL(BIF_P, F_TRACE_PROCS)) {
trace_proc(BIF_P, BIF_P, am_unlink, BIF_ARG_1);
}
if (is_internal_port(BIF_ARG_1)) {
/* pt may be NULL (see the ASSERT and the if (pt) checks below);
 * presumably that is the case for a port in one of the
 * ERTS_PORT_SFLGS_DEAD states -- TODO confirm. */
Port *pt = erts_id2port_sflgs(BIF_ARG_1,
BIF_P,
ERTS_PROC_LOCK_MAIN,
ERTS_PORT_SFLGS_DEAD);
erts_smp_proc_lock(BIF_P, ERTS_PROC_LOCK_LINK|ERTS_PROC_LOCK_STATUS);
#ifdef ERTS_SMP
if (ERTS_PROC_PENDING_EXIT(BIF_P)) {
if (pt)
erts_smp_port_unlock(pt);
goto handle_pending_exit;
}
#endif
/* Remove the port from the caller's link tree ... */
l = erts_remove_link(&BIF_P->nlinks, BIF_ARG_1);
ASSERT(pt || !l);
if (pt) {
/* ... and the caller from the port's link tree. */
rl = erts_remove_link(&pt->nlinks, BIF_P->id);
erts_smp_port_unlock(pt);
if (rl)
erts_destroy_link(rl);
}
erts_smp_proc_unlock(BIF_P, ERTS_PROC_LOCK_LINK|ERTS_PROC_LOCK_STATUS);
/* erts_remove_link only detaches the node; it is freed here. */
if (l)
erts_destroy_link(l);
BIF_RET(am_true);
}
/* An external port whose dist entry is this node's own entry: return true
 * without touching any link tree. */
else if (is_external_port(BIF_ARG_1)
&& external_port_dist_entry(BIF_ARG_1) == erts_this_dist_entry) {
BIF_RET(am_true);
}
if (is_not_pid(BIF_ARG_1))
BIF_ERROR(BIF_P, BADARG);
…
}
代码篇幅较大,这里仅列出port相关的部分,其中对erts_remove_link的调用是关键部分,它将移除原先通过erts_add_link加入的link节点。
/*
 * Detaches the entry whose key compares equal to pid from the tree rooted at
 * *root and returns it. Returns NULL when no such entry exists.
 *
 * The returned node is not freed here; the caller is expected to dispose of
 * it (unlink_1 passes it to erts_destroy_link).
 */
ErtsLink *erts_remove_link(ErtsLink **root, Eterm pid)
{
/* tstack/dstack record the slots visited and the direction taken at each
 * step so the path can be walked back for rebalancing. */
ErtsLink **tstack[STACK_NEED];
int tpos = 0;
int dstack[STACK_NEED+1];
int dpos = 1;
int state = 0;
ErtsLink **this = root;
Sint c;
int dir;
ErtsLink *q = NULL;
/* Sentinel that stops the rebalancing loop at the root. */
dstack[0] = DIR_END;
for (;;) {
if (!*this) { /* Failure */
return NULL;
} else if ((c = CMP(pid,(*this)->pid)) < 0) {
dstack[dpos++] = DIR_LEFT;
tstack[tpos++] = this;
this = &((*this)->left);
} else if (c > 0) { /* go right */
dstack[dpos++] = DIR_RIGHT;
tstack[tpos++] = this;
this = &((*this)->right);
} else { /* Equal key, found the one to delete */
q = (*this);
/* With at most one child, splice that child into the node's slot. */
if (q->right == NULL) {
(*this) = q->left;
state = 1;
} else if (q->left == NULL) {
(*this) = q->right;
state = 1;
} else {
/* Two children: delsub() handles the replacement and reports via
 * its return value whether rebalancing must continue upwards. */
dstack[dpos++] = DIR_LEFT;
tstack[tpos++] = this;
state = delsub((ErtsMonitorOrLink **) this);
}
break;
}
}
/* Walk back up the recorded path while the balance helpers report (through
 * a non-zero state) that more work is needed. */
while (state && ( dir = dstack[--dpos] ) != DIR_END) {
this = tstack[--tpos];
if (dir == DIR_LEFT) {
state = balance_left((ErtsMonitorOrLink **) this);
} else {
state = balance_right((ErtsMonitorOrLink **) this);
}
}
return q;
}
可以看到,在这个场景中,由于原先进程与port是link在一起的,此处必然能将它们unlink。
回到udp_controlling_process:
%% Hands the UDP port S over to the process NewOwner (R15B01 version, shown
%% again for the summary of the defect).
%%
%% Returns {error, not_owner} when the port's connected process is not the
%% caller, ok after erlang:port_connect/2 returned true, and {error, Reason}
%% when erlang:port_connect/2 raised an error exception.
%%
%% NOTE(review): when NewOwner =:= self(), port_connect/2 adds no new link
%% (one already exists), yet unlink/1 below still removes the existing one,
%% leaving the port without a linked owner.
udp_controlling_process(S, NewOwner) when is_port(S), is_pid(NewOwner) ->
case erlang:port_info(S, connected) of
{connected, Pid} when Pid =/= self() ->
{error, not_owner};
_ ->
%% Save the active mode and disable it while ownership changes.
{ok, A0} = prim_inet:getopt(S, active),
prim_inet:setopt(S, active, false),
udp_sync_input(S, NewOwner),
try erlang:port_connect(S, NewOwner) of
true ->
%% Executed unconditionally, even when the caller is also NewOwner.
unlink(S),
%% Restore the active mode saved in A0.
prim_inet:setopt(S, active, A0),
ok
catch
error:Reason ->
{error, Reason}
end
end.
综上所述,在controlling_process到自身的场景中,erlang:port_connect不能将port再次加入原进程的link列表,而unlink却会正常地将port与原进程断开,此后,port变为无主port。
此时,若需要关闭这个port,可以通过erlang:port_close关闭。
5> lists:last(erlang:ports()).
#Port<0.581>
6> P = lists:last(erlang:ports()).
#Port<0.581>
7> erlang:port_close(P).
true
8> inet:i().
ok
修复这个bug的代码,作了一项额外检查,若发现新进程与原进程相同,则直接返回ok。
%% Hands the UDP port S over to the process NewOwner (fixed version).
%%
%% Returns:
%%   ok                 - NewOwner already is the port's connected process
%%                        (nothing is changed), or the hand-over succeeded;
%%   {error, not_owner} - the port's connected process is neither NewOwner
%%                        nor the caller;
%%   {error, Reason}    - erlang:port_connect/2 raised an error exception.
udp_controlling_process(S, NewOwner) when is_port(S), is_pid(NewOwner) ->
case erlang:port_info(S, connected) of
%% NewOwner is already bound, so this clause matches only when the port is
%% connected to NewOwner itself. Returning early skips the
%% port_connect/unlink sequence below.
{connected, NewOwner} ->
ok;
{connected, Pid} when Pid =/= self() ->
{error, not_owner};
_ ->
%% Save the active mode and disable it while ownership changes.
{ok, A0} = prim_inet:getopt(S, active),
prim_inet:setopt(S, active, false),
udp_sync_input(S, NewOwner),
try erlang:port_connect(S, NewOwner) of
true ->
%% Reached only when NewOwner differs from the connected process, so
%% this removes the previous owner's link, not the new owner's.
unlink(S),
%% Restore the active mode saved in A0.
prim_inet:setopt(S, active, A0),
ok
catch
error:Reason ->
{error, Reason}
end
end.