tombstone的抓取与debuggerd有关。debuggerd是一个守护进程,用来检测程序的崩溃,将程序崩溃前进程的状态记录下来,保存在/data/tombstones文件夹下,最多10个;本质上是对程序崩溃时某些信号的拦截
相关流程
客户端流程
首先,Android程序的入口有一个linker的操作,大致流程如下:
bionic/linker/arch/arm64/begin.S
31ENTRY(_start)
32 mov x0, sp
33 bl __linker_init
34
35 /* linker init returns the _entry address in the main image */
36 br x0
37END(_start)
bionic/linker/linker.cpp
4442/*
4443 * This is the entry point for the linker, called from begin.S. This
4444 * method is responsible for fixing the linker's own relocations, and
4445 * then calling __linker_init_post_relocation().
4446 *
4447 * Because this method is called before the linker has fixed it's own
4448 * relocations, any attempt to reference an extern variable, extern
4449 * function, or other GOT reference will generate a segfault.
4450 */
4451extern "C" ElfW(Addr) __linker_init(void* raw_args) {
...
4522 // We have successfully fixed our own relocations. It's safe to run
4523 // the main part of the linker now.
4524 args.abort_message_ptr = &g_abort_message;
4525 ElfW(Addr) start_address = __linker_init_post_relocation(args, linker_addr);
4526
4527 INFO("[ Jumping to _start (%p)... ]", reinterpret_cast<void*>(start_address));
4528
4529 // Return the address that the calling assembly stub should jump to.
4530 return start_address;
4531}
4195/*
4196 * This code is called after the linker has linked itself and
4197 * fixed it's own GOT. It is safe to make references to externs
4198 * and other non-local data at this point.
4199 */
4200static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args, ElfW(Addr) linker_base) {
4201#if TIMING
4202 struct timeval t0, t1;
4203 gettimeofday(&t0, 0);
4204#endif
4205
4206 // Sanitize the environment.
4207 __libc_init_AT_SECURE(args);
4208
4209 // Initialize system properties
4210 __system_properties_init(); // may use 'environ'
4211
4212 debuggerd_init();
4213
4214 // Get a few environment variables.
4215 const char* LD_DEBUG = getenv("LD_DEBUG");
4216 if (LD_DEBUG != nullptr) {
4217 g_ld_debug_verbosity = atoi(LD_DEBUG);
4218 }
...
4412}
bionic/linker/debugger.cpp
302__LIBC_HIDDEN__ void debuggerd_init() {
303 struct sigaction action;
304 memset(&action, 0, sizeof(action));
305 sigemptyset(&action.sa_mask);
306 action.sa_sigaction = debuggerd_signal_handler;
307 action.sa_flags = SA_RESTART | SA_SIGINFO;
308
309 // Use the alternate signal stack if available so we can catch stack overflows.
310 action.sa_flags |= SA_ONSTACK;
311
312 sigaction(SIGABRT, &action, nullptr);
313 sigaction(SIGBUS, &action, nullptr);
314 sigaction(SIGFPE, &action, nullptr);
315 sigaction(SIGILL, &action, nullptr);
316 sigaction(SIGSEGV, &action, nullptr);
317#if defined(SIGSTKFLT)
318 sigaction(SIGSTKFLT, &action, nullptr);
319#endif
320 sigaction(SIGTRAP, &action, nullptr);
321}
为上面这几个信号注册信号处理函数,也就是说只有这几个信号会生成tombstone
SIGILL(非法指令异常)
SIGABRT(abort退出异常)
SIGBUS(硬件访问异常)
SIGFPE(浮点运算异常)
SIGSEGV(内存访问异常)
SIGSTKFLT(协处理器栈异常)
SIGTRAP(跟踪/断点陷阱,一般由断点或调试陷阱指令触发,平时不常见)
信号处理函数为:
258/*
259 * Catches fatal signals so we can ask debuggerd to ptrace us before
260 * we crash.
261 */
262static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void*) {
263 // It's possible somebody cleared the SA_SIGINFO flag, which would mean
264 // our "info" arg holds an undefined value.
265 if (!have_siginfo(signal_number)) {
266 info = nullptr;
267 }
268
269 log_signal_summary(signal_number, info);
270
271 send_debuggerd_packet(info); //发送请求:第一次接收到信号时向debuggerd服务端发送请求,等待回应,收到回应表示已经连接上了
272
273 // We need to return from the signal handler so that debuggerd can dump the
274 // thread that crashed, but returning here does not guarantee that the signal
275 // will be thrown again, even for SIGSEGV and friends, since the signal could
276 // have been sent manually. Resend the signal with rt_tgsigqueueinfo(2) to
277 // preserve the SA_SIGINFO contents.
278 signal(signal_number, SIG_DFL); //将该信号的处理函数恢复为默认处理(SIG_DFL)
279
280 struct siginfo si;
281 if (!info) {
282 memset(&si, 0, sizeof(si));
283 si.si_code = SI_USER;
284 si.si_pid = getpid();
285 si.si_uid = getuid();
286 info = &si;
287 } else if (info->si_code >= 0 || info->si_code == SI_TKILL) {
288 // rt_tgsigqueueinfo(2)'s documentation appears to be incorrect on kernels
289 // that contain commit 66dd34a (3.9+). The manpage claims to only allow
290 // negative si_code values that are not SI_TKILL, but 66dd34a changed the
291 // check to allow all si_code values in calls coming from inside the house.
292 }
293
294 int rc = syscall(SYS_rt_tgsigqueueinfo, getpid(), gettid(), signal_number, info); //给自己的相关线程再发送一次信号
295 if (rc != 0) {
296 __libc_format_log(ANDROID_LOG_FATAL, "libc", "failed to resend signal during crash: %s",
297 strerror(errno));
298 _exit(0);
299 }
300}
客户端向debuggerd发送信息,并等待回应,通过socket的write & read
208static void send_debuggerd_packet(siginfo_t* info) {
209 // Mutex to prevent multiple crashing threads from trying to talk
210 // to debuggerd at the same time.
211 static pthread_mutex_t crash_mutex = PTHREAD_MUTEX_INITIALIZER;
212 int ret = pthread_mutex_trylock(&crash_mutex);
213 if (ret != 0) {
214 if (ret == EBUSY) {
215 __libc_format_log(ANDROID_LOG_INFO, "libc",
216 "Another thread contacted debuggerd first; not contacting debuggerd.");
217 // This will never complete since the lock is never released.
218 pthread_mutex_lock(&crash_mutex);
219 } else {
220 __libc_format_log(ANDROID_LOG_INFO, "libc",
221 "pthread_mutex_trylock failed: %s", strerror(ret));
222 }
223 return;
224 }
225
226 int s = socket_abstract_client(DEBUGGER_SOCKET_NAME, SOCK_STREAM | SOCK_CLOEXEC);
227 if (s == -1) {
228 __libc_format_log(ANDROID_LOG_FATAL, "libc", "Unable to open connection to debuggerd: %s",
229 strerror(errno));
230 return;
231 }
232
233 // debuggerd knows our pid from the credentials on the
234 // local socket but we need to tell it the tid of the crashing thread.
235 // debuggerd will be paranoid and verify that we sent a tid
236 // that's actually in our process.
237 debugger_msg_t msg;
238 msg.action = DEBUGGER_ACTION_CRASH;
239 msg.tid = gettid();
240 msg.abort_msg_address = reinterpret_cast<uintptr_t>(g_abort_message);
241 msg.original_si_code = (info != nullptr) ? info->si_code : 0;
242 ret = TEMP_FAILURE_RETRY(write(s, &msg, sizeof(msg)));
243 if (ret == sizeof(msg)) {
244 char debuggerd_ack;
245 ret = TEMP_FAILURE_RETRY(read(s, &debuggerd_ack, 1));
246 int saved_errno = errno;
247 notify_gdb_of_libraries();
248 errno = saved_errno;
249 } else {
250 // read or write failed -- broken connection?
251 __libc_format_log(ANDROID_LOG_FATAL, "libc", "Failed while talking to debuggerd: %s",
252 strerror(errno));
253 }
254
255 close(s);
256}
debuggerd服务端启动,dump流程
debuggerd守护进程如何启动?带参数运行(例如 debuggerd -b <tid>)时执行的是显式dump,我们暂且不去说它,这里只说不带参数时正常的服务端启动模式
941int main(int argc, char** argv) {
942 union selinux_callback cb;
943 if (argc == 1) {
944 cb.func_audit = audit_callback;
945 selinux_set_callback(SELINUX_CB_AUDIT, cb);
946 cb.func_log = selinux_log_callback;
947 selinux_set_callback(SELINUX_CB_LOG, cb);
948 return do_server();
949 }
950
951 bool dump_backtrace = false;
952 bool have_tid = false;
953 pid_t tid = 0;
954 for (int i = 1; i < argc; i++) {
955 if (!strcmp(argv[i], "-b")) {
956 dump_backtrace = true;
957 } else if (!have_tid) {
958 tid = atoi(argv[i]);
959 have_tid = true;
960 } else {
961 usage();
962 return 1;
963 }
964 }
965 if (!have_tid) {
966 usage();
967 return 1;
968 }
969 return do_explicit_dump(tid, dump_backtrace);
970}
启动一个debuggerd服务端
849static int do_server() {
850 // debuggerd crashes can't be reported to debuggerd.
851 // Reset all of the crash handlers.
852 signal(SIGABRT, SIG_DFL);
853 signal(SIGBUS, SIG_DFL);
854 signal(SIGFPE, SIG_DFL);
855 signal(SIGILL, SIG_DFL);
856 signal(SIGSEGV, SIG_DFL);
857#ifdef SIGSTKFLT
858 signal(SIGSTKFLT, SIG_DFL);
859#endif
860 signal(SIGTRAP, SIG_DFL);
861
862 // Ignore failed writes to closed sockets
863 signal(SIGPIPE, SIG_IGN); //忽略SIGPIPE,避免向已关闭的socket写数据失败时影响debuggerd(debuggerd自身crash信号的处理函数已在上面恢复为默认)
864
865 // Block SIGCHLD so we can sigtimedwait for it.
866 sigset_t sigchld;
867 sigemptyset(&sigchld);
868 sigaddset(&sigchld, SIGCHLD);
869 sigprocmask(SIG_SETMASK, &sigchld, nullptr);
870
871 int s = socket_local_server(SOCKET_NAME, ANDROID_SOCKET_NAMESPACE_ABSTRACT,
872 SOCK_STREAM | SOCK_CLOEXEC); //创建一个服务端,等待客户端连接
873 if (s == -1) return 1;
874
875 typedef void (*NativeDebugInit)(void);
876 static NativeDebugInit s_func_ptr = NULL;
877 if(!s_func_ptr) {
878 void* handle = dlopen("libmiuindbg.so",RTLD_NOW);
879 if(handle) {
880 s_func_ptr = (NativeDebugInit)dlsym(handle,"hook_context_do_hook");
881 }
882 }
883
884 if(s_func_ptr) {
885 s_func_ptr();
886 }
887
888 // Fork a process that stays root, and listens on a pipe to pause and resume the target.
889 if (!start_signal_sender()) {
890 ALOGE("debuggerd: failed to fork signal sender");
891 return 1;
892 }
893
894 ALOGI("debuggerd: starting\n");
895
896 for (;;) {
897 sockaddr_storage ss;
898 sockaddr* addrp = reinterpret_cast<sockaddr*>(&ss);
899 socklen_t alen = sizeof(ss);
900
901 ALOGV("waiting for connection\n");
902 int fd = accept4(s, addrp, &alen, SOCK_CLOEXEC);
903 if (fd == -1) {
904 ALOGE("accept failed: %s\n", strerror(errno));
905 continue;
906 }
907
908 handle_request(fd); //处理客户端的请求
909 }
910 return 0;
911}
处理客户端发来的请求
808static void handle_request(int fd) {
809 ALOGV("handle_request(%d)\n", fd);
810
811 ScopedFd closer(fd);
812 debugger_request_t request;
813 memset(&request, 0, sizeof(request));
814 int status = read_request(fd, &request); //读取客户端的请求
815 if (status != 0) {
816 return;
817 }
818
819 ALOGW("debuggerd: handling request: pid=%d uid=%d gid=%d tid=%d\n", request.pid, request.uid,
820 request.gid, request.tid);
821
822#if defined(__LP64__)
823 // On 64 bit systems, requests to dump 32 bit and 64 bit tids come
824 // to the 64 bit debuggerd. If the process is a 32 bit executable,
825 // redirect the request to the 32 bit debuggerd.
826 if (is32bit(request.tid)) {
827 // Only dump backtrace and dump tombstone requests can be redirected.
828 if (request.action == DEBUGGER_ACTION_DUMP_BACKTRACE ||
829 request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) {
830 redirect_to_32(fd, &request);
831 } else {
832 ALOGE("debuggerd: Not allowed to redirect action %d to 32 bit debuggerd\n", request.action);
833 }
834 return;
835 }
836#endif
837
838 // Fork a child to handle the rest of the request.
839 pid_t fork_pid = fork();
840 if (fork_pid == -1) {
841 ALOGE("debuggerd: failed to fork: %s\n", strerror(errno));
842 } else if (fork_pid == 0) {
843 worker_process(fd, request); //处理request
844 } else {
845 monitor_worker_process(fork_pid, request);
846 }
847}
read客户端发来的信息
197static int read_request(int fd, debugger_request_t* out_request) {
198 ucred cr;
199 socklen_t len = sizeof(cr);
200 int status = getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &len);
201 if (status != 0) {
202 ALOGE("cannot get credentials");
203 return -1;
204 }
205
206 ALOGV("reading tid");
207 fcntl(fd, F_SETFL, O_NONBLOCK);
208
209 pollfd pollfds[1];
210 pollfds[0].fd = fd;
211 pollfds[0].events = POLLIN;
212 pollfds[0].revents = 0;
213 status = TEMP_FAILURE_RETRY(poll(pollfds, 1, 3000)); //轮询fd句柄,最多等待3000毫秒
214 if (status != 1) {
215 ALOGE("timed out reading tid (from pid=%d uid=%d)\n", cr.pid, cr.uid);
216 return -1;
217 }
218
219 debugger_msg_t msg;
220 memset(&msg, 0, sizeof(msg));
221 status = TEMP_FAILURE_RETRY(read(fd, &msg, sizeof(msg))); //读取客户端信息
222 if (status < 0) {
223 ALOGE("read failure? %s (pid=%d uid=%d)\n", strerror(errno), cr.pid, cr.uid);
224 return -1;
225 }
226 if (status != sizeof(debugger_msg_t)) {
227 ALOGE("invalid crash request of size %d (from pid=%d uid=%d)\n", status, cr.pid, cr.uid);
228 return -1;
229 }
230
231 out_request->action = static_cast<debugger_action_t>(msg.action);
232 out_request->tid = msg.tid;
233 out_request->pid = cr.pid;
234 out_request->uid = cr.uid;
235 out_request->gid = cr.gid;
236 out_request->abort_msg_address = msg.abort_msg_address;
237 out_request->original_si_code = msg.original_si_code;
238
239 if (msg.action == DEBUGGER_ACTION_CRASH) {
240 // Ensure that the tid reported by the crashing process is valid.
241 // This check needs to happen again after ptracing the requested thread to prevent a race.
242 if (!pid_contains_tid(out_request->pid, out_request->tid)) {
243 ALOGE("tid %d does not exist in pid %d. ignoring debug request\n", out_request->tid,
244 out_request->pid);
245 return -1;
246 }
247 } else if (cr.uid == 0 || (cr.uid == AID_SYSTEM && msg.action == DEBUGGER_ACTION_DUMP_BACKTRACE)) {
248 // Only root or system can ask us to attach to any process and dump it explicitly.
249 // However, system is only allowed to collect backtraces but cannot dump tombstones.
250 status = get_process_info(out_request->tid, &out_request->pid,
251 &out_request->uid, &out_request->gid);
252 if (status < 0) {
253 ALOGE("tid %d does not exist. ignoring explicit dump request\n", out_request->tid);
254 return -1;
255 }
256
257 if (!selinux_action_allowed(fd, out_request))
258 return -1;
259 } else {
260 // No one else is allowed to dump arbitrary processes.
261 return -1;
262 }
263 return 0;
264}
整体的dump流程
566static void worker_process(int fd, debugger_request_t& request) {
567 // Open the tombstone file if we need it.
568 std::string tombstone_path;
569 int tombstone_fd = -1;
570 switch (request.action) {
571 case DEBUGGER_ACTION_DUMP_TOMBSTONE:
572 case DEBUGGER_ACTION_CRASH:
573 tombstone_fd = open_tombstone(&tombstone_path);
574 if (tombstone_fd == -1) {
575 ALOGE("debuggerd: failed to open tombstone file: %s\n", strerror(errno));
576 exit(1);
577 }
578 break;
579
580 case DEBUGGER_ACTION_DUMP_BACKTRACE:
581 break;
582
583 default:
584 ALOGE("debuggerd: unexpected request action: %d", request.action);
585 exit(1);
586 }
587
588 // At this point, the thread that made the request is blocked in
589 // a read() call. If the thread has crashed, then this gives us
590 // time to PTRACE_ATTACH to it before it has a chance to really fault.
591 //
592 // The PTRACE_ATTACH sends a SIGSTOP to the target process, but it
593 // won't necessarily have stopped by the time ptrace() returns. (We
594 // currently assume it does.) We write to the file descriptor to
595 // ensure that it can run as soon as we call PTRACE_CONT below.
596 // See details in bionic/libc/linker/debugger.c, in function
597 // debugger_signal_handler().
598
599 // Attach to the target process.
//通过ptrace attach到要crash的应用进程,此时debuggerd成为它的tracer(相当于临时的父进程),attach时会向应用进程发送SIGSTOP;之后应用进程接收到的signal会先通知到debuggerd
600 if (!ptrace_attach_thread(request.pid, request.tid)) {
601 ALOGE("debuggerd: ptrace attach failed: %s", strerror(errno));
602 exit(1);
603 }
604
605 // DEBUGGER_ACTION_CRASH requests can come from arbitrary processes and the tid field in the
606 // request is sent from the other side. If an attacker can cause a process to be spawned with the
607 // pid of their process, they could trick debuggerd into dumping that process by exiting after
608 // sending the request. Validate the trusted request.uid/gid to defend against this.
609 if (request.action == DEBUGGER_ACTION_CRASH) {
610 pid_t pid;
611 uid_t uid;
612 gid_t gid;
613 if (get_process_info(request.tid, &pid, &uid, &gid) != 0) {
614 ALOGE("debuggerd: failed to get process info for tid '%d'", request.tid);
615 exit(1);
616 }
617
618 if (pid != request.pid || uid != request.uid || gid != request.gid) {
619 ALOGE(
620 "debuggerd: attached task %d does not match request: "
621 "expected pid=%d,uid=%d,gid=%d, actual pid=%d,uid=%d,gid=%d",
622 request.tid, request.pid, request.uid, request.gid, pid, uid, gid);
623 exit(1);
624 }
625 }
626
627 // Don't attach to the sibling threads if we want to attach gdb.
628 // Supposedly, it makes the process less reliable.
629 bool attach_gdb = should_attach_gdb(request);
630 if (attach_gdb) {
631 // Open all of the input devices we need to listen for VOLUMEDOWN before dropping privileges.
632 if (init_getevent() != 0) {
633 ALOGE("debuggerd: failed to initialize input device, not waiting for gdb");
634 attach_gdb = false;
635 }
636
637 }
638
639 std::set<pid_t> siblings;
640 if (!attach_gdb) {
641 ptrace_siblings(request.pid, request.tid, siblings);
642 }
643
644 // Generate the backtrace map before dropping privileges.
645 std::unique_ptr<BacktraceMap> backtrace_map(BacktraceMap::Create(request.pid));
646
647 int amfd = -1;
648 std::unique_ptr<std::string> amfd_data;
649 if (request.action == DEBUGGER_ACTION_CRASH) {
650 // Connect to the activity manager before dropping privileges.
651 amfd = activity_manager_connect();
652 amfd_data.reset(new std::string);
653 }
654
655 // Collect the list of open files.
656 OpenFilesList open_files;
657 populate_open_files_list(request.pid, &open_files);
658
659 bool succeeded = false;
660
661 // Now that we've done everything that requires privileges, we can drop them.
662 if (!drop_privileges()) {
663 ALOGE("debuggerd: failed to drop privileges, exiting");
664 _exit(1);
665 }
666
667 int crash_signal = SIGKILL;
668 succeeded = perform_dump(request, fd, tombstone_fd, backtrace_map.get(), siblings,
669 &crash_signal, &open_files, amfd_data.get());
670 if (succeeded) {
671 if (request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) {
672 if (!tombstone_path.empty()) {
673 android::base::WriteFully(fd, tombstone_path.c_str(), tombstone_path.length()); //将生成的tombstone文件路径通过socket写回给请求方
674 }
675 }
676 }
677
678 if (attach_gdb || request.action == DEBUGGER_ACTION_CRASH) {
679 // Before detach we must send SIGSTOP to the target.
680 // Tell the signal process to send SIGSTOP to the target.
681 if (!send_signal(request.pid, 0, SIGSTOP)) {
682 ALOGE("debuggerd: failed to stop process for gdb attach: %s", strerror(errno));
683 attach_gdb = false;
684 }
685 }
686
687 if (!attach_gdb) {
688 // Tell the Activity Manager about the crashing process. If we are
689 // waiting for gdb to attach, do not send this or Activity Manager
690 // might kill the process before anyone can attach.
691 activity_manager_write(request.pid, crash_signal, amfd, *amfd_data.get());
692 }
693
694 if (ptrace(PTRACE_DETACH, request.tid, 0, 0) != 0) { //detach客户端
695 ALOGE("debuggerd: ptrace detach from %d failed: %s", request.tid, strerror(errno));
696 }
697
698 for (pid_t sibling : siblings) {
699 ptrace(PTRACE_DETACH, sibling, 0, 0);
700 }
701
702 // Send the signal back to the process if it crashed and we're not waiting for gdb.
703 if (!attach_gdb && request.action == DEBUGGER_ACTION_CRASH) {
704 if (!send_signal(request.pid, request.tid, crash_signal)) {
705 ALOGE("debuggerd: failed to kill process %d: %s", request.pid, strerror(errno));
706 }
707 }
708
709 // Wait for gdb, if requested.
710 if (attach_gdb) {
711 wait_for_user_action(request);
712
713 // Now tell the activity manager about this process.
714 activity_manager_write(request.pid, crash_signal, amfd, *amfd_data.get());
715
716 // Tell the signal process to send SIGCONT to the target.
717 if (!send_signal(request.pid, 0, SIGCONT)) {
718 ALOGE("debuggerd: failed to resume process %d: %s", request.pid, strerror(errno));
719 }
720
721 uninit_getevent();
722 }
723
724 close(amfd);
725
726 exit(!succeeded);
727}
perform_dump:进行dump的过程
484static bool perform_dump(const debugger_request_t& request, int fd, int tombstone_fd,
485 BacktraceMap* backtrace_map, const std::set<pid_t>& siblings,
486 int* crash_signal, OpenFilesList* open_files, std::string* amfd_data) {
487 if (TEMP_FAILURE_RETRY(write(fd, "\0", 1)) != 1) { //向应用进程(客户端)返回一个值,表示连上了,可以开始dump了
488 ALOGE("debuggerd: failed to respond to client: %s\n", strerror(errno));
489 return false;
490 }
491
492 int total_sleep_time_usec = 0;
493 while (true) {
494 int signal = wait_for_signal(request.tid, &total_sleep_time_usec); //因为此时已经被ptrace_attach了,所以第二次客户端发给自己的信号会在这里被接收
495 switch (signal) {
496 case -1:
497 ALOGE("debuggerd: timed out waiting for signal");
498 return false;
499
500 case SIGSTOP: //这里是attach时向客户端发送的sigstop信号
501 if (request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) {
502 ALOGV("debuggerd: stopped -- dumping to tombstone");
503 engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, siblings, signal,
504 request.original_si_code, request.abort_msg_address, open_files, amfd_data);
505 } else if (request.action == DEBUGGER_ACTION_DUMP_BACKTRACE) {
506 ALOGV("debuggerd: stopped -- dumping to fd");
507 dump_backtrace(fd, backtrace_map, request.pid, request.tid, siblings, nullptr);
508 } else {
509 ALOGV("debuggerd: stopped -- continuing");
//此时通过debuggerd用PTRACE_CONT命令让应用继续执行,
// 这样应用的read系统调用就可以返回到用户态,继续执行debuggerd_signal_handler()
// 此时,debuggerd进入下一次循环,block在wait_for_signal,继续等待应用的下一个信号
510 if (ptrace(PTRACE_CONT, request.tid, 0, 0) != 0) {
511 ALOGE("debuggerd: ptrace continue failed: %s", strerror(errno));
512 return false;
513 }
514 continue; // loop again //注意,这里是继续循环,等待客户端的第二次信号
515 }
516 break;
517
518 case SIGABRT:
519 case SIGBUS:
520 case SIGFPE:
521 case SIGILL:
522 case SIGSEGV:
523#ifdef SIGSTKFLT
524 case SIGSTKFLT:
525#endif
526 case SIGSYS:
527 case SIGTRAP:
528 ALOGV("stopped -- fatal signal\n");
529 *crash_signal = signal;
530 engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, siblings, signal,
531 request.original_si_code, request.abort_msg_address, open_files, amfd_data); //客户端发的第二次信号被debuggerd接收,开始dump
532 break; //dump完之后跳出循环,执行下面的操作
533
534 default:
535 ALOGE("debuggerd: process stopped due to unexpected signal %d\n", signal);
536 break;
537 }
538 break;
539 }
540
541 return true;
542}
本质上有两次通信;
第一次通信是进程的signal handler通过socket与已启动的debuggerd服务端进行通信:客户端向debuggerd写request,服务端获取request后attach到客户端(作为其tracer,相当于父进程),并返回一个值表示收到;attach时会向客户端发送一个SIGSTOP信号,debuggerd在wait_for_signal中收到这个SIGSTOP后,用PTRACE_CONT命令让应用继续执行,这样应用的read系统调用就可以返回到用户态,继续执行debuggerd_signal_handler;debuggerd则进入下一次循环,block在wait_for_signal,继续等待应用的下一个信号
客户端收到答复之后,将注册的信号处理函数去掉,(这样再接收到信号就可以正常的走kernel流程了),然后再次发送一个信号
这里就是第二次通信,信号被父进程debuggerd拦截,开始dump操作,dump操作完后进行detach操作,不再作为客户端的父进程
此时客户端会进入到默认的信号处理逻辑中
2173int get_signal(struct ksignal *ksig)
2174{
2175 struct sighand_struct *sighand = current->sighand;
2176 struct signal_struct *signal = current->signal;
2177 int signr;
2178
2179 if (unlikely(current->task_works))
2180 task_work_run();
2181
2182 if (unlikely(uprobe_deny_signal()))
2183 return 0;
2184
2185 /*
2186 * Do this once, we can't return to user-mode if freezing() == T.
2187 * do_signal_stop() and ptrace_stop() do freezable_schedule() and
2188 * thus do not need another check after return.
2189 */
2190 try_to_freeze();
2191
2192relock:
2193 spin_lock_irq(&sighand->siglock);
2194 /*
2195 * Every stopped thread goes here after wakeup. Check to see if
2196 * we should notify the parent, prepare_signal(SIGCONT) encodes
2197 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
2198 */
2199 if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
2200 int why;
2201
2202 if (signal->flags & SIGNAL_CLD_CONTINUED)
2203 why = CLD_CONTINUED;
2204 else
2205 why = CLD_STOPPED;
2206
2207 signal->flags &= ~SIGNAL_CLD_MASK;
2208
2209 spin_unlock_irq(&sighand->siglock);
2210
2211 /*
2212 * Notify the parent that we're continuing. This event is
2213 * always per-process and doesn't make whole lot of sense
2214 * for ptracers, who shouldn't consume the state via
2215 * wait(2) either, but, for backward compatibility, notify
2216 * the ptracer of the group leader too unless it's gonna be
2217 * a duplicate.
2218 */
2219 read_lock(&tasklist_lock);
2220 do_notify_parent_cldstop(current, false, why);
2221
2222 if (ptrace_reparented(current->group_leader))
2223 do_notify_parent_cldstop(current->group_leader,
2224 true, why);
2225 read_unlock(&tasklist_lock);
2226
2227 goto relock;
2228 }
2229
2230 for (;;) {
2231 struct k_sigaction *ka;
2232
2233 if (unlikely(current->jobctl & JOBCTL_STOP_PENDING) &&
2234 do_signal_stop(0))
2235 goto relock;
2236
2237 if (unlikely(current->jobctl & JOBCTL_TRAP_MASK)) {
2238 do_jobctl_trap();
2239 spin_unlock_irq(&sighand->siglock);
2240 goto relock;
2241 }
2242
2243 signr = dequeue_signal(current, &current->blocked, &ksig->info);
2244
2245 if (!signr)
2246 break; /* will return 0 */
2247
2248 if (unlikely(current->ptrace) && signr != SIGKILL) {
2249 signr = ptrace_signal(signr, &ksig->info);
2250 if (!signr)
2251 continue;
2252 }
2253
2254 ka = &sighand->action[signr-1];
2255
2256 /* Trace actually delivered signals. */
2257 trace_signal_deliver(signr, &ksig->info, ka);
2258
2259 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
2260 continue;
2261 if (ka->sa.sa_handler != SIG_DFL) {
2262 /* Run the handler. */
2263 ksig->ka = *ka;
2264
2265 if (ka->sa.sa_flags & SA_ONESHOT)
2266 ka->sa.sa_handler = SIG_DFL;
2267
2268 break; /* will return non-zero "signr" value */
2269 }
2270
2271 /*
2272 * Now we are doing the default action for this signal.
2273 */
2274 if (sig_kernel_ignore(signr)) /* Default is nothing. */
2275 continue;
2276
2277 /*
2278 * Global init gets no signals it doesn't want.
2279 * Container-init gets no signals it doesn't want from same
2280 * container.
2281 *
2282 * Note that if global/container-init sees a sig_kernel_only()
2283 * signal here, the signal must have been generated internally
2284 * or must have come from an ancestor namespace. In either
2285 * case, the signal cannot be dropped.
2286 */
2287 if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
2288 !sig_kernel_only(signr))
2289 continue;
2290
2291 if (sig_kernel_stop(signr)) {
2292 /*
2293 * The default action is to stop all threads in
2294 * the thread group. The job control signals
2295 * do nothing in an orphaned pgrp, but SIGSTOP
2296 * always works. Note that siglock needs to be
2297 * dropped during the call to is_orphaned_pgrp()
2298 * because of lock ordering with tasklist_lock.
2299 * This allows an intervening SIGCONT to be posted.
2300 * We need to check for that and bail out if necessary.
2301 */
2302 if (signr != SIGSTOP) {
2303 spin_unlock_irq(&sighand->siglock);
2304
2305 /* signals can be posted during this window */
2306
2307 if (is_current_pgrp_orphaned())
2308 goto relock;
2309
2310 spin_lock_irq(&sighand->siglock);
2311 }
2312
2313 if (likely(do_signal_stop(ksig->info.si_signo))) {
2314 /* It released the siglock. */
2315 goto relock;
2316 }
2317
2318 /*
2319 * We didn't actually stop, due to a race
2320 * with SIGCONT or something like that.
2321 */
2322 continue;
2323 }
2324
2325 spin_unlock_irq(&sighand->siglock);
2326
2327 /*
2328 * Anything else is fatal, maybe with a core dump.
2329 */
2330 current->flags |= PF_SIGNALED;
2331
2332 if (sig_kernel_coredump(signr)) {
2333 if (print_fatal_signals)
2334 print_fatal_signal(ksig->info.si_signo);
2335 proc_coredump_connector(current);
2336 /*
2337 * If it was able to dump core, this kills all
2338 * other threads in the group and synchronizes with
2339 * their demise. If we lost the race with another
2340 * thread getting here, it set group_exit_code
2341 * first and our do_group_exit call below will use
2342 * that value and ignore the one we pass it.
2343 */
2344 do_coredump(&ksig->info);
2345 }
2346
2347 /*
2348 * Death signals, no core dump.
2349 */
2350 do_group_exit(ksig->info.si_signo);
2351 /* NOTREACHED */
2352 }
2353 spin_unlock_irq(&sighand->siglock);
2354
2355 ksig->sig = signr;
2356 return ksig->sig > 0;
2357}
412#define sig_kernel_coredump(sig) \
413 (((sig) < SIGRTMIN) && siginmask(sig, SIG_KERNEL_COREDUMP_MASK))
398#define SIG_KERNEL_COREDUMP_MASK (\
399 rt_sigmask(SIGQUIT) | rt_sigmask(SIGILL) | \
400 rt_sigmask(SIGTRAP) | rt_sigmask(SIGABRT) | \
401 rt_sigmask(SIGFPE) | rt_sigmask(SIGSEGV) | \
402 rt_sigmask(SIGBUS) | rt_sigmask(SIGSYS) | \
403 rt_sigmask(SIGXCPU) | rt_sigmask(SIGXFSZ) | \
404 SIGEMT_MASK )
可见会触发coredump的信号比tombstone处理的信号多,tombstone响应的信号基本上是coredump信号的子集(SIGSTKFLT例外:它会生成tombstone,但默认动作是terminate而不是coredump)。各信号的默认动作参考下面的default action列表:
* +--------------------+------------------+
* | POSIX signal | default action |
* +--------------------+------------------+
* | SIGHUP | terminate |
* | SIGINT | terminate |
* | SIGQUIT | coredump |
* | SIGILL | coredump |
* | SIGTRAP | coredump |
* | SIGABRT/SIGIOT | coredump |
* | SIGBUS | coredump |
* | SIGFPE | coredump |
* | SIGKILL | terminate(+) |
* | SIGUSR1 | terminate |
* | SIGSEGV | coredump |
* | SIGUSR2 | terminate |
* | SIGPIPE | terminate |
* | SIGALRM | terminate |
* | SIGTERM | terminate |
* | SIGCHLD | ignore |
* | SIGCONT | ignore(*) |
* | SIGSTOP | stop(*)(+) |
* | SIGTSTP | stop(*) |
* | SIGTTIN | stop(*) |
* | SIGTTOU | stop(*) |
* | SIGURG | ignore |
* | SIGXCPU | coredump |
* | SIGXFSZ | coredump |
* | SIGVTALRM | terminate |
* | SIGPROF | terminate |
* | SIGPOLL/SIGIO | terminate |
* | SIGSYS/SIGUNUSED | coredump |
* | SIGSTKFLT | terminate |
* | SIGWINCH | ignore |
* | SIGPWR | terminate |
* | SIGRTMIN-SIGRTMAX | terminate |
* +--------------------+------------------+
* | non-POSIX signal | default action |
* +--------------------+------------------+
* | SIGEMT | coredump |
* +--------------------+------------------+
那么如何为tombstone添加一个信号呢?从上面的流程看,需要在debuggerd_init中为该信号注册debuggerd_signal_handler,并在perform_dump的switch中为该信号增加对应的case。
拓展
debuggerd_init打不出log?
原因:
bionic/linker/linker_main.cpp
/*
211 * This code is called after the linker has linked itself and
212 * fixed it's own GOT. It is safe to make references to externs
213 * and other non-local data at this point.
214 */
215static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args) {
216 ProtectedDataGuard guard;
217
218#if TIMING
219 struct timeval t0, t1;
220 gettimeofday(&t0, 0);
221#endif
222
223 // Sanitize the environment.
224 __libc_init_AT_SECURE(args);
225
226 // Initialize system properties
227 __system_properties_init(); // may use 'environ'
228
229 // Register the debuggerd signal handler.
230#ifdef __ANDROID__
231 debuggerd_callbacks_t callbacks = {
232 .get_abort_message = []() {
233 return g_abort_message;
234 },
235 .post_dump = &notify_gdb_of_libraries,
236 };
237 debuggerd_init(&callbacks); //此时还没有读取LD_DEBUG,g_ld_debug_verbosity尚未被赋值
238#endif
239
240 g_linker_logger.ResetState();
241
242 // Get a few environment variables.
243 const char* LD_DEBUG = getenv("LD_DEBUG");
244 if (LD_DEBUG != nullptr) {
245 g_ld_debug_verbosity = atoi(LD_DEBUG);
246 }
bionic/linker/linker_debug.h
63#if LINKER_DEBUG_TO_LOG
64#define _PRINTVF(v, x...) \
65 do { \
66 if (g_ld_debug_verbosity > (v)) async_safe_format_log(5-(v), "linker", x); \
67 } while (0)
68#else /* !LINKER_DEBUG_TO_LOG */
69#define _PRINTVF(v, x...) \
70 do { \
71 if (g_ld_debug_verbosity > (v)) { async_safe_format_fd(1, x); write(1, "\n", 1); } \
72 } while (0)
73#endif /* !LINKER_DEBUG_TO_LOG */
74
75#define PRINT(x...) _PRINTVF(-1, x)
76#define INFO(x...) _PRINTVF(0, x)
77#define TRACE(x...) _PRINTVF(1, x)
所以在debuggerd_init阶段用INFO等宏打印时,g_ld_debug_verbosity还没有根据LD_DEBUG赋值,宏里的级别判断不通过,log打不出来;可以直接用async_safe_format_log进行打印,它不经过这个级别判断,就一定能打出来