自己创建一个异常
Native Exception,简称NE,是发生于C/C++ code里面最常见的一种异常,
我们在写代码的时候一些常见的操作都会导致NE,比如空指针赋值,数组越界访问等,现在我在代码里面人为的添加一个简单的exception:
test.c:
1 #include <stdio.h>
2
3 void func4()
4 {
5 char *p = NULL;
6 *p = 0x5;//异常产生的地方
7 }
8
9 void func3()
10 {
11 int var4 = 4;
12 }
13
14
15 void func2()
16 {
17 int var3 = 3;
18 func3();
19 func4();
20 }
21
22 void func1()
23 {
24 int var1,var2;
25 var1 = 2;
26 }
27
28 void main()
29 {
30 int var0 = 1;
31 func1();
32 func2();
33 return;
34 }
Android.mk
1 LOCAL_PATH := $(call my-dir)
2 include $(CLEAR_VARS)
3
4 LOCAL_CFLAGS += -g3 -O0
5
6 LOCAL_SRC_FILES := test.c
7
8 LOCAL_MODULE := test
9
10 LOCAL_MULTILIB := 32
11
12 include $(BUILD_EXECUTABLE)
我把它放入pls/vendor/mediatek/proprietary/external/libtest/目录下面,我们对它进行编译并push到手机里面:
mmm vendor/mediatek/proprietary/external/libtest/
out/target/product/xxx/system/bin/test
adb push out/target/product/xxx/system/bin/test system/bin/
gdb-server调试程序
启动gdbserver:
$ adb shell ./system/bin/gdbserver :1234 system/bin/test
Process system/bin/test created; pid = 4130
Listening on port 1234
$ adb forward tcp:1234 tcp:1234
gdb 调试这个bin文件:
$ ./prebuilts/gcc/linux-x86/arm/cit-arm-linux-androideabi-4.8/bin/arm-linux-androideabi-gdb out/target/product/xxx/symbols/system/bin/test
Reading symbols from out/target/product/xxx/symbols/system/bin/test...done.
(gdb) set solib-search-path out/target/product/xxx/symbols/system/lib/
(gdb) set solib-absolute-prefix out/target/product/xxx/symbols/
(gdb) target remote:1234
Remote debugging using :1234
Reading symbols from out/target/product/xxx/symbols/system/bin/linker...done.
Loaded symbols for out/target/product/xxx/symbols/system/bin/linker
__dl__start () at bionic/linker/arch/arm/begin.S:32
32 mov r0, sp
(gdb) list
现在test程序加载 成功了:
(gdb) b main //设置断点
Breakpoint 1 at 0xaaaaa772: file vendor/mediatek/proprietary/external/libtest/test.c, line 30.
(gdb) n //单步执行
33 bl __linker_init
当运行到func4函数里面就出现异常:
(gdb) n
Program received signal SIGSEGV, Segmentation fault.
0xaaaaa740 in func4 () at vendor/mediatek/proprietary/external/libtest/test.c:6
6 *p = 0x5;
可以很清楚地知道,问题出现在libtest的test.c里面(func4,第6行)。AEE是MTK平台自己的一套处理异常的工具,代码是封装好的;当应用app发生了异常,它会收集异常信息并压缩到DB文件里面,我们需要用GAT工具才能打开这个文件。在main_log里面,我们可以搜索到如下信息:
01-02 04:16:18.768 4180 4180 I AEE_AED : Build fingerprint: 'xxx:7.0/NRD90M/v6H5E-2:eng/test-keys'
01-02 04:16:18.768 4180 4180 I AEE_AED : Revision: '0'
01-02 04:16:18.768 4180 4180 I AEE_AED : ABI: 'arm'
01-02 04:16:18.768 4180 4180 I AEE_AED : pid: 4142, tid: 4142, name: test >>> system/bin/test <<<
01-02 04:16:18.769 4180 4180 I AEE_AED : signal 11 (SIGSEGV), code 1 (SEGV_MAPERR), fault addr 0x0
01-02 04:16:18.770 4180 4180 I AEE_AED : r0 00000000 r1 00000005 r2 fffefa2c r3 00000000
01-02 04:16:18.770 4180 4180 I AEE_AED : r4 aaaaa76f r5 fffefa24 r6 00000001 r7 fffefa2c
01-02 04:16:18.771 4180 4180 I AEE_AED : r8 00000000 r9 00000000 sl 00000000 fp fffefa00
01-02 04:16:18.771 4180 4180 I AEE_AED : ip f750085c sp fffef9cc lr aaaaa761 pc aaaaa740 cpsr 00070030
01-02 04:16:18.796 4180 4180 I AEE_AED :
01-02 04:16:18.796 4180 4180 I AEE_AED : backtrace:
01-02 04:16:18.799 4180 4180 I AEE_AED : #00 pc 00000740 /system/bin/test
01-02 04:16:18.800 4180 4180 I AEE_AED : #01 pc 0000075d /system/bin/test
01-02 04:16:18.800 4180 4180 I AEE_AED : #02 pc 0000077b /system/bin/test
01-02 04:16:18.800 4180 4180 I AEE_AED : #03 pc 0001708c /system/lib/libc.so (__libc_init+84)
01-02 04:16:18.800 4180 4180 I AEE_AED : #04 pc 00000660 /system/bin/test
01-02 04:16:18.819 290 290 I wmt_launcher: fw log ctrl flag has been set
当native层程序发生异常的时候,kernel会进入异常处理流程,并发送一个signal给用户空间中出问题的进程;处理这个异常signal的就是android的debuggerd进程,在log当中会找到类似如下的信息:
libc : Fatal signal 11 (SIGSEGV), code 1, fault addr 0x14 in tid 9765 (Capture@CmdQue)
此进程可以侦测到程序崩溃,并将崩溃时的进程状态信息输出到文件和串口中,以供开发人员分析调试使用。debuggerd的数据被保存在/data/tombstones/目录下。Linux kernel有自己的一套signal机制,在应用程序崩溃时,通常系统内核都会发送signal到出问题的进程,以通知进程出现了什么异常,这些进程可以捕获这些signal并对其做相应的处理。
debuggerd创建一个名为“android:debuggerd”的socket,作为server端等待其他client端进程的连接;它接收client端进程发送来的tid和action信息,并将由tid指定的那个进程的运行信息,按照由action指定的动作dump到文件;
c/c++程序client端
下面就将简单介绍debuggerd进程的处理过程:
在应用程序入口地址__start后,__linker_init中调用debuggerd_init()函数来注册异常信号处理handler,以实现拦截系统异常的几个signal:SIGABRT、SIGBUS、SIGFPE、SIGILL、SIGSEGV、SIGSTKFLT和SIGTRAP:
bionic/linker/linker.cpp:
4172static ElfW(Addr) __linker_init_post_relocation(KernelArgumentBlock& args, ElfW(Addr) linker_base) {
4173#if TIMING
4174 struct timeval t0, t1;
4175 gettimeofday(&t0, 0);
4176#endif
4179 __libc_init_AT_SECURE(args);
4180
4184 debuggerd_init();
bionic/linker/debugger.cpp:
302__LIBC_HIDDEN__ void debuggerd_init() {
303 struct sigaction action;
304 memset(&action, 0, sizeof(action));
305 sigemptyset(&action.sa_mask);
306 action.sa_sigaction = debuggerd_signal_handler;//异常处理函数;
307 action.sa_flags = SA_RESTART | SA_SIGINFO;
308
309 // Use the alternate signal stack if available so we can catch stack overflows.
310 action.sa_flags |= SA_ONSTACK;
311
312 sigaction(SIGABRT, &action, nullptr);
313 sigaction(SIGBUS, &action, nullptr);
314 sigaction(SIGFPE, &action, nullptr);
315 sigaction(SIGILL, &action, nullptr);
316 sigaction(SIGSEGV, &action, nullptr);
317#if defined(SIGSTKFLT)
318 sigaction(SIGSTKFLT, &action, nullptr);
319#endif
320 sigaction(SIGTRAP, &action, nullptr);
321}
bionic库中的链接器会对以下七种信号设置Handler(debuggerd_signal_handler):
SIGILL(非法指令异常)//后文tombstone示例中,内核发送给进程cameraserver的就是这个信号
SIGABRT(abort退出异常)
SIGBUS(硬件访问异常)
SIGFPE(浮点运算异常)
SIGSEGV(内存访问异常)//前面对空指针赋值,内核发送的就是这个信号
SIGSTKFLT(协处理器栈异常)
SIGTRAP(断点/跟踪异常)
262static void debuggerd_signal_handler(int signal_number, siginfo_t* info, void*) {
263 // It's possible somebody cleared the SA_SIGINFO flag, which would mean
264 // our "info" arg holds an undefined value.
265 if (!have_siginfo(signal_number)) {
266 info = nullptr;
267 }
268
269 log_signal_summary(signal_number, info);//打印出现问题进程信息;
270
271 send_debuggerd_packet(info);//现在处于client端,通过socket跟server端进行connect,
//然后通过write(s, &msg, sizeof(msg))把info发给debuggerd,DEBUGGER_ACTION_CRASH为采取的行为;
272
273 // We need to return from the signal handler so that debuggerd can dump the
274 // thread that crashed, but returning here does not guarantee that the signal
275 // will be thrown again, even for SIGSEGV and friends, since the signal could
276 // have been sent manually. Resend the signal with rt_tgsigqueueinfo(2) to
277 // preserve the SA_SIGINFO contents.
278 signal(signal_number, SIG_DFL);//设置该信号关联的动作,SIG_DFL表示默认操作,恢复到默认;
279
280 struct siginfo si;
281 if (!info) {
282 memset(&si, 0, sizeof(si));
283 si.si_code = SI_USER;
284 si.si_pid = getpid();
285 si.si_uid = getuid();
286 info = &si;
287 } else if (info->si_code >= 0 || info->si_code == SI_TKILL) {
288 // rt_tgsigqueueinfo(2)'s documentation appears to be incorrect on kernels
289 // that contain commit 66dd34a (3.9+). The manpage claims to only allow
290 // negative si_code values that are not SI_TKILL, but 66dd34a changed the
291 // check to allow all si_code values in calls coming from inside the house.
292 }
293
294 int rc = syscall(SYS_rt_tgsigqueueinfo, getpid(), gettid(), signal_number, info);
//系统调用tgsigqueueinfo:信号将被传递给线程组的任意成员;
295 if (rc != 0) {
296 __libc_format_log(ANDROID_LOG_FATAL, "libc", "failed to resend signal during crash: %s",
297 strerror(errno));
298 _exit(0);
299 }
300}
debuggerd进程server端:
system/core/debuggerd/debuggerd.cpp
871int main(int argc, char** argv) {
872 union selinux_callback cb;
873 if (argc == 1) {
874 cb.func_audit = audit_callback;
875 selinux_set_callback(SELINUX_CB_AUDIT, cb);
876 cb.func_log = selinux_log_callback;
877 selinux_set_callback(SELINUX_CB_LOG, cb);
878 return do_server();//没有-b参数就调用这个流程
879 }
880
895 if (!have_tid) {
896 usage();
897 return 1;
898 }
899 return do_explicit_dump(tid, dump_backtrace);//手动导出 debuggerd -b tid
900}
当启动debuggerd进程时不带任何额外参数(即argc为1),debuggerd将作为一个后台服务进程,专门接收应用程序异常退出消息而产生tombstone:
792static int do_server() {
793 // debuggerd crashes can't be reported to debuggerd.
794 // Reset all of the crash handlers.
//忽略debuggerd自身crash的处理;
795 signal(SIGABRT, SIG_DFL);
796 signal(SIGBUS, SIG_DFL);
797 signal(SIGFPE, SIG_DFL);
798 signal(SIGILL, SIG_DFL);
799 signal(SIGSEGV, SIG_DFL);
800#ifdef SIGSTKFLT
801 signal(SIGSTKFLT, SIG_DFL);
802#endif
803 signal(SIGTRAP, SIG_DFL);
804
805 // Ignore failed writes to closed sockets
806 signal(SIGPIPE, SIG_IGN);
807
808 // Block SIGCHLD so we can sigtimedwait for it.
809 sigset_t sigchld;
810 sigemptyset(&sigchld);
811 sigaddset(&sigchld, SIGCHLD);
812 sigprocmask(SIG_SETMASK, &sigchld, nullptr);
813 //建立socket通信的server端;
814 int s = socket_local_server(SOCKET_NAME, ANDROID_SOCKET_NAMESPACE_ABSTRACT,
815 SOCK_STREAM | SOCK_CLOEXEC);
816 if (s == -1) return 1;
817
818 // Fork a process that stays root, and listens on a pipe to pause and resume the target.
819 if (!start_signal_sender()) {
820 ALOGE("debuggerd: failed to fork signal sender");
821 return 1;
822 }
823
824 ALOGI("debuggerd: starting\n");
825
826 for (;;) {
827 sockaddr_storage ss;
828 sockaddr* addrp = reinterpret_cast<sockaddr*>(&ss);
829 socklen_t alen = sizeof(ss);
830
831 ALOGV("waiting for connection\n");
832 int fd = accept4(s, addrp, &alen, SOCK_CLOEXEC);
833 if (fd == -1) {
834 ALOGE("accept failed: %s\n", strerror(errno));
835 continue;
836 }
837
838 handle_request(fd);//handle_request 处理请求;
839 }
840 return 0;
841}
system/core/debuggerd/debuggerd.cpp:
751static void handle_request(int fd) {
752 ALOGV("handle_request(%d)\n", fd);
753
754 ScopedFd closer(fd);
755 debugger_request_t request;
756 memset(&request, 0, sizeof(request));
757 int status = read_request(fd, &request);
//读取client端进程发送来的数据,socket上读取debugger_msg_t结构体;
758 if (status != 0) {
759 return;
760 }
781 // Fork a child to handle the rest of the request.
782 pid_t fork_pid = fork();
783 if (fork_pid == -1) {
784 ALOGE("debuggerd: failed to fork: %s\n", strerror(errno));
785 } else if (fork_pid == 0) {
786 worker_process(fd, request);//创建一个子进程去处理dump的工作;
787 } else {
788 monitor_worker_process(fork_pid, request);//父进程监控子进程操作,结束后就会杀掉子进程;
789 }
790}
先看子进程的操作:system/core/debuggerd/debuggerd.cpp:
537static void worker_process(int fd, debugger_request_t& request) {
538 // Open the tombstone file if we need it.
539 std::string tombstone_path;
540 int tombstone_fd = -1;
541 switch (request.action) {
542 case DEBUGGER_ACTION_DUMP_TOMBSTONE:
543 case DEBUGGER_ACTION_CRASH:
544 tombstone_fd = open_tombstone(&tombstone_path);
553 //打开一个tombstone文件,限制最多10个,超过了就会被覆盖掉;
554 default:
555 ALOGE("debuggerd: unexpected request action: %d", request.action);
556 exit(1);
557 }
569
570 // Attach to the target process.
571 if (ptrace(PTRACE_ATTACH, request.tid, 0, 0) != 0) {
//跟踪指定进程,成为它的父进程,并停止该进程,debuggerd可以拦截发送给这个thread的信号(除了
//SIGKILL),所以现在kernel那边发送过来的信号将被debuggerd拦截;
//ATTACH之后,会让kernel那边发送SIGSTOP信号给原来问题进程,这个信号将被debuggerd拦截;
572 ALOGE("debuggerd: ptrace attach failed: %s", strerror(errno));
573 exit(1);
574 }
575
576 // Don't attach to the sibling threads if we want to attach gdb.
577 // Supposedly, it makes the process less reliable.
578 bool attach_gdb = should_attach_gdb(request);
587 //是否调用gdb调试,是就会终止正常的crash
588 std::set<pid_t> siblings;
589 if (!attach_gdb) {
590 ptrace_siblings(request.pid, request.tid, siblings);
//同时跟踪问题thread相关联的thread;
591 }
592
593 // Generate the backtrace map before dropping privileges.
594 std::unique_ptr<BacktraceMap> backtrace_map(BacktraceMap::Create(request.pid));
595 //生成backtrace map;
596 int amfd = -1;
597 std::unique_ptr<std::string> amfd_data;
598 if (request.action == DEBUGGER_ACTION_CRASH) {
599 // Connect to the activity manager before dropping privileges.
600 amfd = activity_manager_connect();
601 amfd_data.reset(new std::string);
602 }
603
604 bool succeeded = false;
605
606 // Now that we've done everything that requires privileges, we can drop them.
607 if (!drop_privileges()) {
608 ALOGE("debuggerd: failed to drop privileges, exiting");
609 _exit(1);
610 }
611
612 int crash_signal = SIGKILL;
613 succeeded = perform_dump(request, fd, tombstone_fd, backtrace_map.get(), siblings,
614 &crash_signal, amfd_data.get());
//根据signal信号类型然后通过engrave_tombstone把信息写到tombstone;
615 if (succeeded) {
616 if (request.action == DEBUGGER_ACTION_DUMP_TOMBSTONE) {
617 if (!tombstone_path.empty()) {
618 android::base::WriteFully(fd, tombstone_path.c_str(), tombstone_path.length());
619 }
620 }
621 }
631 if (!attach_gdb) {
632 // Tell the Activity Manager about the crashing process. If we are
633 // waiting for gdb to attach, do not send this or Activity Manager
634 // might kill the process before anyone can attach.
635 activity_manager_write(request.pid, crash_signal, amfd, *amfd_data.get());
636 }
637 //解除对问题thread的跟踪;
638 if (ptrace(PTRACE_DETACH, request.tid, 0, 0) != 0) {
639 ALOGE("debuggerd: ptrace detach from %d failed: %s", request.tid, strerror(errno));
640 }
641 //解除对问题相关联thread的跟踪;
642 for (pid_t sibling : siblings) {
643 ptrace(PTRACE_DETACH, sibling, 0, 0);
644 }
645
646 // Send the signal back to the process if it crashed and we're not waiting for gdb.
647 if (!attach_gdb && request.action == DEBUGGER_ACTION_CRASH) {
648 if (!send_signal(request.pid, request.tid, crash_signal)) {
649 ALOGE("debuggerd: failed to kill process %d: %s", request.pid, strerror(errno));
650 }
651 }
667
668 close(amfd);
669
670 exit(!succeeded);
671}
455static bool perform_dump(const debugger_request_t& request, int fd, int tombstone_fd,
456 BacktraceMap* backtrace_map, const std::set<pid_t>& siblings,
457 int* crash_signal, std::string* amfd_data) {
458 if (TEMP_FAILURE_RETRY(write(fd, "\0", 1)) != 1) {
459 ALOGE("debuggerd: failed to respond to client: %s\n", strerror(errno));
460 return false;
461 }
462
463 int total_sleep_time_usec = 0;
464 while (true) {
465 int signal = wait_for_signal(request.tid, &total_sleep_time_usec);
//第一次发送等到的是stop信号,第二次才是出现问题类型的真正信号;
466 switch (signal) {
467 case -1:
468 ALOGE("debuggerd: timed out waiting for signal");
469 return false;
470
471 case SIGSTOP:
480 ALOGV("debuggerd: stopped -- continuing");
481 if (ptrace(PTRACE_CONT, request.tid, 0, 0) != 0) {//将目标问题进程切换为出现问题时刻的上下文状态;
482 ALOGE("debuggerd: ptrace continue failed: %s", strerror(errno));
483 return false;
484 }
485 continue; // loop again
486 }
487 break;
488
489 case SIGABRT:
490 case SIGBUS:
491 case SIGFPE:
492 case SIGILL:
493 case SIGSEGV:
494#ifdef SIGSTKFLT
495 case SIGSTKFLT:
496#endif
497 case SIGSYS:
498 case SIGTRAP:
499 ALOGV("stopped -- fatal signal\n");
500 *crash_signal = signal;//当再一次信号过来,就会通过下面的函数导出此刻问题进程的信息;
501 engrave_tombstone(tombstone_fd, backtrace_map, request.pid, request.tid, siblings, signal,
502 request.original_si_code, request.abort_msg_address, amfd_data);
503 break;
504
505 default:
506 ALOGE("debuggerd: process stopped due to unexpected signal %d\n", signal);
507 break;
508 }
509 break;
510 }
511
512 return true;
513}
如下就是从tombstone导出来的信息:
*** *** *** *** *** *** *** *** *** *** *** *** *** *** *** ***
Build fingerprint: 'xxx/5049S/xxx:7.0/NRD90M/xxx:eng/test-keys'
Revision: '0'
ABI: 'arm'
pid: 505, tid: 1046, name: Binder:505_1 >>> /system/bin/cameraserver <<<
signal 4 (SIGILL), code 1 (ILL_ILLOPC), fault addr 0xe09a4988
r0 e8d1a2c8 r1 e09a8004 r2 00000001 r3 00000002
r4 e1106200 r5 00000001 r6 de83b891 r7 e8d438d0
r8 00000416 r9 e8d39990 sl e930bf6d fp e127f910
ip de855c1c sp e127f678 lr de83aba1 pc e09a4988 cpsr 200f0030
backtrace:
#00 pc 00000988 /system/vendor/lib/libcancer.so (_ZN7android6Cancer15destroyInstanceEv+39)
#01 pc 0001db9d /system/vendor/lib/libcam.client.so (_ZN7android15NSDisplayClient13DisplayClient4initEv+60)
#02 pc 0000d145 /system/vendor/lib/libcam.device1.so (_ZN7android14Cam1DeviceBase17initDisplayClientEP18preview_stream_ops+684)
整个tombstone包含的信息有:
(1). 创建1个tombstone文件。
最多10个,如果已存在10个,则覆盖最旧的文件。
(2). 版本信息
主要是fingerprint,可以看出异常版本是eng还是user。
(3). 寄存器信息
主要查看是哪个进程崩溃,信号是什么。寄存器信息需要配合下面的调用栈信息及数据信息结合GNU的工具(objdump -S反汇编)分析。
(4). 调用栈信息
这个是最直接可以看出异常的信息。
(5). 其他线程信息
如果异常线程和其他线程有逻辑关系的话,可以查看对应线程的信息。
(6). main log信息