[root@gfsd1 ~]# [root@gfsd1 ~]# ps aux|grep rabbit root 11813 0.0 0.0 106368 1004 pts/2 S 19:21 0:00 /bin/sh /etc/init.d/rabbitmq start root 11815 0.0 0.0 106100 1200 pts/2 S 19:21 0:00 /bin/bash -c ulimit -S -c 0 >/dev/null 2>&1 ; /usr/local/rabbitmq/sbin/rabbitmq-server root 11817 46.1 0.3 2806172 53252 pts/2 Sl 19:21 0:07 /usr/local/rabbitmq/erts-6.0/bin/beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/rabbitmq -progname erl -- -home /root -- -boot /usr/local/rabbitmq/releases/3.3.5/start_clean -pa /usr/local/rabbitmq/sbin/../ebin -noshell -noinput -s rabbit boot -sname rabbitmq@rmq_1 -boot start_sasl -config /usr/local/rabbitmq/sbin/../etc/rabbitmq/rabbitmq -kernel inet_default_connect_options [{nodelay,true}] -sasl errlog_type error -sasl sasl_error_logger false -rabbit error_logger {file,"/usr/local/rabbitmq/sbin/../var/log/rabbitmq/rabbitmq@rmq_1.log"} -rabbit sasl_error_logger {file,"/usr/local/rabbitmq/sbin/../var/log/rabbitmq/rabbitmq@rmq_1-sasl.log"} -rabbit enabled_plugins_file "/usr/local/rabbitmq/sbin/../etc/rabbitmq/enabled_plugins" -rabbit plugins_dir "/usr/local/rabbitmq/sbin/../plugins" -rabbit plugins_expand_dir "/usr/local/rabbitmq/sbin/../var/lib/rabbitmq/mnesia/rabbitmq@rmq_1-plugins-expand" -os_mon start_cpu_sup false -os_mon start_disksup false -os_mon start_memsup false -mnesia dir "/usr/local/rabbitmq/sbin/../var/lib/rabbitmq/mnesia/rabbitmq@rmq_1" root 12534 0.0 0.0 103252 836 pts/2 S+ 19:21 0:00 grep rabbit [root@gfsd1 ~]# [root@gfsd1 ~]# [root@gfsd1 ~]# cat /var/run/rabbitmq/pid 11817 [root@gfsd1 ~]#通过 init 脚本 stop 服务
[root@gfsd1 ~]# /etc/init.d/rabbitmq stop Stopping rabbitmq-server: /var/run/rabbitmq/pid Stopping and halting node rabbitmq@rmq_1 ... (卡住)查看日志,发现 RabbitMQ 服务已经执行了停止动作。
[root@gfsd1 ~]# tail -f /usr/local/rabbitmq/sbin/../var/log/rabbitmq/rabbitmq\@rmq_1.log ... =INFO REPORT==== 14-Mar-2016::19:21:38 === Stopping RabbitMQ =INFO REPORT==== 14-Mar-2016::19:21:38 === stopped TCP Listener on [::]:6672 =ERROR REPORT==== 14-Mar-2016::19:21:38 === AMQP connection <0.424.0> (running), channel 0 - error: {amqp_error,connection_forced, "broker forced connection closure with reason 'shutdown'",none} =INFO REPORT==== 14-Mar-2016::19:21:38 === Halting Erlang VM对应代码在 rabbit.erl 中,如下所示
[root@gfsd1 ~]# [root@gfsd1 ~]# ps ajxf|grep rabbit 27541 13082 13082 27541 pts/2 13082 S+ 0 0:00 | | \_ /bin/sh /etc/init.d/rabbitmq stop 13082 13127 13082 27541 pts/2 13082 Sl+ 0 0:00 | | \_ /usr/local/rabbitmq/erts-6.0/bin/beam.smp -- -root /usr/local/rabbitmq -progname erl -- -home /root -- -boot /usr/local/rabbitmq/releases/3.3.5/start_clean -pa /usr/local/rabbitmq/sbin/../ebin -noshell -noinput -hidden -sname rabbitmqctl13127 -boot start_clean -s rabbit_control_main -nodename rabbitmq@rmq_1 -extra stop /var/run/rabbitmq/pid 12431 13687 13686 12431 pts/5 13686 S+ 0 0:00 | \_ grep rabbit [root@gfsd1 ~]#通过 Ctrl+C 停止调用后,可以看到 rabbitmqctl stop <pidfile> 调用进程才停止。
[root@gfsd1 ~]# /etc/init.d/rabbitmq stop Stopping rabbitmq-server: /var/run/rabbitmq/pid Stopping and halting node rabbitmq@rmq_1 ... (卡住) ^C BREAK: (a)bort (c)ontinue (p)roc info (i)nfo (l)oaded (v)ersion (k)ill (D)b-tables (d)istribution remove pid file and dir rabbitmq-server. [root@gfsd1 ~]# ^C [root@gfsd1 ~]# [root@gfsd1 ~]# [root@gfsd1 ~]# ps ajxf|grep rabbit 27541 1658 1657 27541 pts/2 1657 S+ 0 0:00 | | \_ grep rabbit [root@gfsd1 ~]# [root@gfsd1 ~]# [root@gfsd1 ~]# ll /var/run/rabbitmq/pid ls: cannot access /var/run/rabbitmq/pid: No such file or directory [root@gfsd1 ~]#
... exec ${ERL_DIR}erl \ -pa "${RABBITMQ_HOME}/ebin" \ -noinput \ -hidden \ ${RABBITMQ_CTL_ERL_ARGS} \ -sname rabbitmqctl$$ \ -boot "${CLEAN_BOOT_FILE}" \ -s rabbit_control_main \ # 入口函数 -nodename $RABBITMQ_NODENAME \ -extra "$@"在 rabbit_control_main.erl 中
... -define(EXTERNAL_CHECK_INTERVAL, 1000). ... start() -> ... {Command, Opts, Args} = case parse_arguments(init:get_plain_arguments(), NodeStr) of {ok, Res} -> Res; no_command -> print_error("could not recognise command", []), usage() end, ... %% The reason we don't use a try/catch here is that rpc:call turns %% thrown errors into normal return values case catch action(Command, Node, Args, Opts, Inform) of ok -> case Quiet of true -> ok; false -> io:format("...done.~n") end, rabbit_misc:quit(0); {ok, Info} -> case Quiet of true -> ok; false -> io:format("...done (~p).~n", [Info]) end, rabbit_misc:quit(0); ... end.对应到实际情况,我们的 Command 为 stop ,所以会调用
... action(stop, Node, Args, _Opts, Inform) -> Inform("Stopping and halting node ~p", [Node]), Res = call(Node, {rabbit, stop_and_halt, []}), case {Res, Args} of {ok, [PidFile]} -> wait_for_process_death( %% 等待指定 pid 的进程退出 read_pid_file(PidFile, false)); %% 读取 pid 文件的内容 {ok, [_, _| _]} -> exit({badarg, Args}); _ -> ok end, Res; ... wait_for_process_death(Pid) -> case process_up(Pid) of true -> timer:sleep(?EXTERNAL_CHECK_INTERVAL), %% 只要认为指定 pid 的进程未退出,则每隔 1s 检查一次 wait_for_process_death(Pid); false -> ok end. ... % Test using some OS clunkiness since we shouldn't trust % rpc:call(os, getpid, []) at this point process_up(Pid) -> with_os([{unix, fun () -> run_ps(Pid) =:= 0 %% Linux 中会执行这里。从上面的判定知道,只有不等于 0 时,才认为目标进程已经退出 end}, {win32, fun () -> Cmd = "tasklist /nh /fi \"pid eq " ++ Pid ++ "\" ", Res = rabbit_misc:os_cmd(Cmd ++ "2>&1"), case re:run(Res, "erl\\.exe", [{capture, none}]) of match -> true; _ -> false end end}]). with_os(Handlers) -> {OsFamily, _} = os:type(), case proplists:get_value(OsFamily, Handlers) of undefined -> throw({unsupported_os, OsFamily}); Handler -> Handler() end. run_ps(Pid) -> Port = erlang:open_port({spawn, "ps -p " ++ Pid}, %% 这里就是整个问题的关键了 [exit_status, {line, 16384}, use_stdio, stderr_to_stdout]), exit_loop(Port). exit_loop(Port) -> receive {Port, {exit_status, Rc}} -> Rc; %% 对应上面,只有 Rc 不等于 0 时,才会认为目标进程已退出 {Port, _} -> exit_loop(Port) end.知道了代码执行流程,简单添加一些关键打印,重新测试如下
[root@gfsd1 ~]# [root@gfsd1 ~]# ps aux|grep rabbit root 11813 0.0 0.0 106368 1004 pts/2 S 19:21 0:00 /bin/sh /etc/init.d/rabbitmq start root 11815 0.0 0.0 106100 1200 pts/2 S 19:21 0:00 /bin/bash -c ulimit -S -c 0 >/dev/null 2>&1 ; /usr/local/rabbitmq/sbin/rabbitmq-server root 11817 46.1 0.3 2806172 53252 pts/2 Sl 19:21 0:07 /usr/local/rabbitmq/erts-6.0/bin/beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/rabbitmq -progname erl -- -home /root -- -boot /usr/local/rabbitmq/releases/3.3.5/start_clean -pa /usr/local/rabbitmq/sbin/../ebin -noshell -noinput -s rabbit boot -sname rabbitmq@rmq_1 -boot start_sasl -config /usr/local/rabbitmq/sbin/../etc/rabbitmq/rabbitmq -kernel inet_default_connect_options [{nodelay,true}] -sasl errlog_type error -sasl sasl_error_logger false -rabbit error_logger {file,"/usr/local/rabbitmq/sbin/../var/log/rabbitmq/rabbitmq@rmq_1.log"} -rabbit sasl_error_logger {file,"/usr/local/rabbitmq/sbin/../var/log/rabbitmq/rabbitmq@rmq_1-sasl.log"} -rabbit enabled_plugins_file "/usr/local/rabbitmq/sbin/../etc/rabbitmq/enabled_plugins" -rabbit plugins_dir "/usr/local/rabbitmq/sbin/../plugins" -rabbit plugins_expand_dir "/usr/local/rabbitmq/sbin/../var/lib/rabbitmq/mnesia/rabbitmq@rmq_1-plugins-expand" -os_mon start_cpu_sup false -os_mon start_disksup false -os_mon start_memsup false -mnesia dir "/usr/local/rabbitmq/sbin/../var/lib/rabbitmq/mnesia/rabbitmq@rmq_1" root 12534 0.0 0.0 103252 836 pts/2 S+ 19:21 0:00 grep rabbit [root@gfsd1 ~]# [root@gfsd1 ~]# [root@gfsd1 ~]# cat /var/run/rabbitmq/pid 11817 [root@gfsd1 ~]# [root@gfsd1 ~]# [root@gfsd1 ~]# /etc/init.d/rabbitmq stop Stopping rabbitmq-server: /var/run/rabbitmq/pid Stopping and halting node rabbitmq@rmq_1 ... Args = ["/var/run/rabbitmq/pid"] ... action_stop => Res = ok Args = ["/var/run/rabbitmq/pid"] ... read_pid_file => PidFile : "/var/run/rabbitmq/pid" Wait = false wait_for_process_death Pid = "11817" run_ps => erlang:open_port -> ps -p "11817" exit_loop => Data = {data,{eol," PID TTY TIME CMD"}} exit_loop => Data = {data,{eol,"11817 pts/2 00:00:08 beam.smp"}} exit_loop => Exit = {exit_status,0} wait_for_process_death Pid = "11817" run_ps => erlang:open_port -> ps -p "11817" exit_loop => Data = {data,{eol," PID TTY TIME CMD"}} exit_loop => Data = {data,{eol,"11817 pts/2 00:00:08 beam.smp"}} exit_loop => Exit = {exit_status,0} wait_for_process_death Pid = "11817" run_ps => erlang:open_port -> ps -p "11817" exit_loop => Data = {data,{eol," PID TTY TIME CMD"}} exit_loop => Exit = {exit_status,0} wait_for_process_death Pid = "11817" run_ps => erlang:open_port -> ps -p "11817" exit_loop => Data = {data,{eol," PID TTY TIME CMD"}} exit_loop => Exit = {exit_status,0} wait_for_process_death Pid = "11817" run_ps => erlang:open_port -> ps -p "11817" exit_loop => Data = {data,{eol," PID TTY TIME CMD"}} exit_loop => Exit = {exit_status,0} wait_for_process_death Pid = "11817" run_ps => erlang:open_port -> ps -p "11817" exit_loop => Data = {data,{eol," PID TTY TIME CMD"}} exit_loop => Exit = {exit_status,0} ... (永远停不下来) ... ^C BREAK: (a)bort (c)ontinue (p)roc info (i)nfo (l)oaded (v)ersion (k)ill (D)b-tables (d)istribution ^Cremove pid file and dir rabbitmq-server. [root@gfsd1 ~]# ^C [root@gfsd1 ~]#可以看到无论执行多少次循环,永远返回 {exit_status,0} ,所以导致 rabbitmqctl stop <pidfile> 无法停止。
[root@Betty ~]# [root@Betty ~]# /etc/init.d/rabbitmq start Starting rabbitmq-server: SUCCESS rabbitmq-server. [root@Betty ~]# [root@Betty ~]# [root@Betty ~]# ps aux|grep rabbit root 2431 0.0 0.0 106456 1004 pts/3 S 14:40 0:00 /bin/sh /etc/init.d/rabbitmq start root 2434 0.0 0.0 106092 1172 pts/3 S 14:40 0:00 /bin/bash -c ulimit -S -c 0 >/dev/null 2>&1 ; /usr/sbin/rabbitmq-server root 2436 42.8 1.4 2296732 56332 pts/3 Sl 14:40 0:02 /usr/local/lib/erlang/erts-6.0/bin/beam.smp -W w -K true -A30 -P 1048576 -- -root /usr/local/lib/erlang -progname erl -- -home /root -- -pa /usr/lib/rabbitmq/sbin/../ebin -noshell -noinput -s rabbit boot -sname rmq_betty -boot start_sasl -config /etc/rabbitmq/rabbitmq -kernel inet_default_connect_options [{nodelay,true}] -sasl errlog_type error -sasl sasl_error_logger false -rabbit error_logger {file,"/var/log/rabbitmq/rmq_betty.log"} -rabbit sasl_error_logger {file,"/var/log/rabbitmq/rmq_betty-sasl.log"} -rabbit enabled_plugins_file "/etc/rabbitmq/enabled_plugins" -rabbit plugins_dir "/usr/lib/rabbitmq/sbin/../plugins" -rabbit plugins_expand_dir "/var/lib/rabbitmq/mnesia/rmq_betty-plugins-expand" -os_mon start_cpu_sup false -os_mon start_disksup false -os_mon start_memsup false -mnesia dir "/var/lib/rabbitmq/mnesia/rmq_betty" -kernel inet_dist_listen_min 25672 -kernel inet_dist_listen_max 25672 root 2567 0.0 0.0 103256 856 pts/3 S+ 14:40 0:00 grep rabbit [root@Betty ~]# [root@Betty ~]# [root@Betty ~]# cat /var/run/rabbitmq/pid 2436 [root@Betty ~]# [root@Betty ~]# [root@Betty ~]# ps -p 2436 PID TTY TIME CMD 2436 pts/3 00:00:02 beam.smp [root@Betty ~]# [root@Betty ~]# [root@Betty ~]# /etc/init.d/rabbitmq stop Stopping rabbitmq-server: Stopping and halting node rmq_betty@Betty ... Args = ["/var/run/rabbitmq/pid"] ... read_pid_file => PidFile : "/var/run/rabbitmq/pid" Wait = false wait_for_process_death Pid = "2436" run_ps => erlang:open_port -> ps -p "2436" exit_loop => Data = {data,{eol," PID TTY TIME CMD"}} exit_loop => Data = {data,{eol," 2436 pts/3 00:00:02 beam.smp"}} exit_loop => Exit = {exit_status,0} wait_for_process_death Pid = "2436" run_ps => erlang:open_port -> ps -p "2436" exit_loop => Data = {data,{eol," PID TTY TIME CMD"}} exit_loop => Data = {data,{eol," 2436 pts/3 00:00:02 beam.smp"}} exit_loop => Exit = {exit_status,0} wait_for_process_death Pid = "2436" run_ps => erlang:open_port -> ps -p "2436" exit_loop => Data = {data,{eol," PID TTY TIME CMD"}} exit_loop => Exit = {exit_status,1} ...done. rabbitmq-server. [root@Betty ~]#可以看到,循环检测 3 次后就返回了 {exit_status,1} ,从而 rabbitmqctl stop <pidfile> 能够正常退出。
1.修改 init 脚本,将调用 rabbitmqctl stop <pidfile> 的地方,改成 rabbitmqctl stop(需要相应调整针对 pidfile 的各种条件判定)
2.不要使用 VMware 虚拟机,因为在 KVM 虚拟机上没有发现此问题(VMware 粉不要喷我~)
[root@gfsd1 ~]# [root@gfsd1 ~]# ps -p 123 PID TTY TIME CMD 123 ? 00:00:00 kthrotld/1 [root@gfsd1 ~]# [root@gfsd1 ~]# echo $? 0 [root@gfsd1 ~]# [root@gfsd1 ~]# ps -p 1234 PID TTY TIME CMD [root@gfsd1 ~]# [root@gfsd1 ~]# echo $? 0 -- 正常的 ps 程序,此处应该返回 1 [root@gfsd1 ~]#确认问题 ps 程序有哪些特征
[root@gfsd1 ~]# uname -a Linux gfsd1 2.6.32-431.el6.x86_64 #1 SMP Fri Nov 22 03:15:09 UTC 2013 x86_64 x86_64 x86_64 GNU/Linux [root@gfsd1 ~]# ps 文件大小和正常机器上不同 [root@gfsd1 ~]# ll /bin/ps -rwxr-xr-x 1 root root 1223123 Mar 18 10:42 /bin/ps [root@gfsd1 ~]# [root@gfsd1 ~]# 64bit 机器上出现了 32bit 的 ps 文件 [root@gfsd1 ~]# file /bin/ps /bin/ps: ELF 32-bit LSB executable, Intel 80386, version 1 (SYSV), statically linked, for GNU/Linux 2.2.5, not stripped [root@gfsd1 ~]# 正常机器上 ps 是动态链接的 [root@gfsd1 ~]# ldd /bin/ps not a dynamic executable [root@gfsd1 ~]#跟踪问题 ps 的执行
[root@gfsd1 ~]# [root@gfsd1 ~]# strace -f ps -p 1234 execve("/bin/ps", ["ps", "-p", "1234"], [/* 32 vars */]) = 0 -- 调用问题 ps 程序 [ Process PID=14740 runs in 32 bit mode. ] ... set_thread_area(0xffc3db64) = 0 set_tid_address(0xa12c878) = 14740 ... getrlimit(RLIMIT_STACK, {rlim_cur=-4284481536, rlim_max=564187122106368*1024}) = 0 _sysctl({0x2081076ec, -3941488, (nil), (nil), (nil), 18429817727855099905}) = 0 ... close(3) = -1 EBADF (Bad file descriptor) ... close(1023) = -1 EBADF (Bad file descriptor) readlink("/proc/14740/exe", "/bin/ps", 1024) = 7 stat64("/bin/ps", {st_mode=S_IFREG|0755, st_size=1223123, ...}) = 0 getppid() = 14734 readlink("/proc/14734/exe", "/usr/bin/strace", 255) = 15 readlink("/proc/14740/exe", "/bin/ps", 1024) = 7 readlink("/proc/14740/exe", "/bin/ps", 1024) = 7 readlink("/proc/14740/exe", "/bin/ps", 1024) = 7 access("/usr/bin/dpkgd/ps", F_OK) = 0 pipe([3, 4]) = 0 clone(Process 14845 attached child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID|SIGCHLD, child_tidptr=0) = 14845 -- 创建了一个线程(正常 ps 没有该行为) ... [pid 14845] dup2(4, 1 <unfinished ...> [pid 14740] <... fstat64 resumed> {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0 [pid 14845] <... dup2 resumed> ) = 1 [pid 14740] mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0 <unfinished ...> [pid 14845] close(4) = 0 ... [pid 14845] execve("/bin/sh", ["sh", "-c", "/usr/bin/dpkgd/ps -p 1234"], [/* 40 vars */] <unfinished ...> -- 在新建线程中调用的是 /usr/bin/dpkgd/ 目录下的 ps 文件!!! [pid 14740] read(3, <unfinished ...> [pid 14845] <... execve resumed> ) = 0 [ Process PID=14845 runs in 64 bit mode. ] ... [pid 14845] getdents(4, /* 281 entries */, 32768) = 7352 [pid 14845] stat("/proc/1", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0 [pid 14845] open("/proc/1/stat", O_RDONLY) = 5 [pid 14845] read(5, "1 (init) S 0 1 1 0 -1 4202752 30"..., 1023) = 260 [pid 14845] close(5) = 0 [pid 14845] open("/proc/1/status", O_RDONLY) = 5 [pid 14845] read(5, "Name:\tinit\nState:\tS (sleeping)\nT"..., 1023) = 870 [pid 14845] close(5) = 0 [pid 14845] stat("/proc/2", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0 [pid 14845] open("/proc/2/stat", O_RDONLY) = 5 [pid 14845] read(5, "2 (kthreadd) S 0 0 0 0 -1 214961"..., 1023) = 155 [pid 14845] close(5) = 0 [pid 14845] open("/proc/2/status", O_RDONLY) = 5 [pid 14845] read(5, "Name:\tkthreadd\nState:\tS (sleepin"..., 1023) = 656 [pid 14845] close(5) = 0 ... [pid 14845] stat("/proc/28465", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0 [pid 14845] open("/proc/28465/stat", O_RDONLY) = 5 [pid 14845] read(5, "28465 (inet_gethost) S 28451 284"..., 1023) = 237 [pid 14845] close(5) = 0 [pid 14845] open("/proc/28465/status", O_RDONLY) = 5 [pid 14845] read(5, "Name:\tinet_gethost\nState:\tS (sle"..., 1023) = 884 [pid 14845] close(5) = 0 [pid 14845] getdents(4, /* 0 entries */, 32768) = 0 [pid 14845] close(4) = 0 [pid 14845] fstat(1, {st_mode=S_IFIFO|0600, st_size=0, ...}) = 0 [pid 14845] mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f4528922000 [pid 14845] write(1, " PID TTY TIME CMD\n", 28) = 28 [ Process PID=14740 runs in 32 bit mode. ] [pid 14740] <... read resumed> " PID TTY TIME CMD\n", 4096) = 28 [ Process PID=14845 runs in 64 bit mode. ] [pid 14845] exit_group(1) = ? -- 线程中的 ps 退出码为 1 Process 14845 detached [ Process PID=14740 runs in 32 bit mode. ] fstat64(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 1), ...}) = 0 mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xfffffffff778a000 --- SIGCHLD (Child exited) @ 0 (0) --- write(1, " PID TTY TIME CMD\n", 28 PID TTY TIME CMD ) = 28 read(3, "", 4096) = 0 close(3) = 0 waitpid(14845, [{WIFEXITED(s) && WEXITSTATUS(s) == 1}], 0) = 14845 -- 可以看到,问题 ps 成功拿到了正确 ps 的退出码 munmap(0xf778b000, 4096) = 0 munmap(0xf778a000, 4096) = 0 exit_group(0) = ? -- 但问题 ps 最终以退出码 0 退出 [root@gfsd1 ~]#由上面的信息,可以知道正确的 ps 程序被搬移到了下面的位置
[root@gfsd1 ~]# ll /usr/bin/dpkgd/ total 436 -rwxr-xr-x 1 root root 145872 Jan 28 05:00 lsof -rwxr-xr-x 1 root root 128192 Jan 28 05:00 netstat -rwxr-xr-x 1 root root 87088 Jan 28 05:00 ps -rwxr-xr-x 1 root root 74840 Jan 28 05:00 ss [root@gfsd1 ~]# [root@gfsd1 ~]# file /usr/bin/dpkgd/* /usr/bin/dpkgd/lsof: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.18, stripped /usr/bin/dpkgd/netstat: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.18, stripped /usr/bin/dpkgd/ps: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.18, stripped /usr/bin/dpkgd/ss: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.18, stripped [root@gfsd1 ~]# [root@gfsd1 ~]# [root@gfsd1 ~]# ldd /usr/bin/dpkgd/* /usr/bin/dpkgd/lsof: linux-vdso.so.1 => (0x00007fff86bff000) libselinux.so.1 => /lib64/libselinux.so.1 (0x0000003522000000) libc.so.6 => /lib64/libc.so.6 (0x0000003ef8200000) libdl.so.2 => /lib64/libdl.so.2 (0x0000003ef7e00000) /lib64/ld-linux-x86-64.so.2 (0x0000003ef7a00000) /usr/bin/dpkgd/netstat: linux-vdso.so.1 => (0x00007fff3ddc2000) libselinux.so.1 => /lib64/libselinux.so.1 (0x0000003522000000) libc.so.6 => /lib64/libc.so.6 (0x0000003ef8200000) libdl.so.2 => /lib64/libdl.so.2 (0x0000003ef7e00000) /lib64/ld-linux-x86-64.so.2 (0x0000003ef7a00000) /usr/bin/dpkgd/ps: linux-vdso.so.1 => (0x00007fff961b6000) libselinux.so.1 => /lib64/libselinux.so.1 (0x0000003522000000) libproc-3.2.8.so => /lib64/libproc-3.2.8.so (0x0000003ef8600000) libc.so.6 => /lib64/libc.so.6 (0x0000003ef8200000) libdl.so.2 => /lib64/libdl.so.2 (0x0000003ef7e00000) /lib64/ld-linux-x86-64.so.2 (0x0000003ef7a00000) /usr/bin/dpkgd/ss: linux-vdso.so.1 => (0x00007fffb51e1000) libresolv.so.2 => /lib64/libresolv.so.2 (0x0000003ef9e00000) libc.so.6 => /lib64/libc.so.6 (0x0000003ef8200000) /lib64/ld-linux-x86-64.so.2 (0x0000003ef7a00000) [root@gfsd1 ~]#可以看到,被攻击的可执行程序不止 ps 一个...
[root@YOYO ~]# uname -a Linux YOYO 2.6.32-358.el6.x86_64 #1 SMP Fri Feb 22 00:31:26 UTC 2013 x86_64 x86_64 x86_64 GNU/Linux [root@YOYO ~]# [root@YOYO ~]# ll /bin/ps -rwxr-xr-x. 1 root root 87088 11月 15 2012 /bin/ps [root@YOYO ~]# [root@YOYO ~]# file /bin/ps /bin/ps: ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked (uses shared libs), for GNU/Linux 2.6.18, stripped [root@YOYO ~]# [root@YOYO ~]# ldd /bin/ps linux-vdso.so.1 => (0x00007fffb6d24000) libselinux.so.1 => /lib64/libselinux.so.1 (0x00000034e7000000) libproc-3.2.8.so => /lib64/libproc-3.2.8.so (0x0000003f38400000) libc.so.6 => /lib64/libc.so.6 (0x0000003f38000000) libdl.so.2 => /lib64/libdl.so.2 (0x0000003f37c00000) /lib64/ld-linux-x86-64.so.2 (0x0000003f37800000) [root@YOYO ~]#跟踪正常 ps 程序的执行
[root@YOYO ~]# strace -f ps -p 1234 execve("/bin/ps", ["ps", "-p", "1234"], [/* 28 vars */]) = 0 ... getdents(4, /* 241 entries */, 32768) = 6192 stat("/proc/1", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0 open("/proc/1/stat", O_RDONLY) = 5 read(5, "1 (init) S 0 1 1 0 -1 4202752 29"..., 1023) = 256 close(5) = 0 open("/proc/1/status", O_RDONLY) = 5 read(5, "Name:\tinit\nState:\tS (sleeping)\nT"..., 1023) = 866 close(5) = 0 stat("/proc/2", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0 open("/proc/2/stat", O_RDONLY) = 5 read(5, "2 (kthreadd) S 0 0 0 0 -1 214961"..., 1023) = 156 close(5) = 0 open("/proc/2/status", O_RDONLY) = 5 read(5, "Name:\tkthreadd\nState:\tS (sleepin"..., 1023) = 654 close(5) = 0 ... stat("/proc/25701", {st_mode=S_IFDIR|0555, st_size=0, ...}) = 0 open("/proc/25701/stat", O_RDONLY) = 5 read(5, "25701 (bash) S 25697 25701 25701"..., 1023) = 257 close(5) = 0 open("/proc/25701/status", O_RDONLY) = 5 read(5, "Name:\tbash\nState:\tS (sleeping)\nT"..., 1023) = 879 close(5) = 0 getdents(4, /* 0 entries */, 32768) = 0 close(4) = 0 fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 2), ...}) = 0 mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f4d2da3c000 write(1, " PID TTY TIME CMD\n", 28 PID TTY TIME CMD ) = 28 exit_group(1) = ? [root@YOYO ~]#可以看到正常情况下 ps -p xxx 是以退出码 1 退出的。
与此木马程序相关的文章:这里 。
系统漏洞相关文档:《NTRG_ElasticBotnetReport_06102015》
漏洞号:CVE-2015-1427
问题 ps 中的符号片段
... 08048b30 T CAttackIe::UpdateCurVariant(unsigned int) 08048b36 T CTcpAttack::UpdateCurVariant(unsigned int) 08048be4 T CAttackIcmp::MakePacket(unsigned int) 08048c5a T CAttackSyn::MakePacket(unsigned int) 08048d02 T CAttackUdp::MakePacket(unsigned int) 08048dce T CPacketAttack::SendPacket() 08048eac T CPacketAttack::Create() 08048f7e T CAttackAmp::Create() 08049070 T CPacketAttack::UpdateCurVariant(unsigned int) 08049306 T CTcpAttack::Do(unsigned int) 08049546 T CAttackPrx::MakePacket(unsigned int) 080497d6 T CAttackAmp::MakePacket(unsigned int) 0804996c T CAttackDns::MakePacket(unsigned int) 08049bf8 T CAttackCc::Destroy() 08049cc8 T CAttackCc::Create() 08049e8c T CAttackTns::Destroy() 08049f5c T CAttackTns::Create() 0804a05a T CAttackPrx::Create() 0804a2a6 T CAttackDns::Create() 0804a4f2 T CAttackCompress::CreatePackets() 0804a6ec T CAttackCompress::MakePacket(unsigned int) 0804a8be T CAttackCompress::Create() ...