Linux:系统调用追踪原理简析

1. 前言

限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。

2. 分析背景

本文分析基于 linux-4.14 内核代码。
运行环境为:Ubuntu 16.04.4 LTS + QEMU Arm vexpress-a9
rootfs 基于 ubuntu-base-16.04-core-armhf.tar.gz 制作。

3. 测试demo

strace 是众所周知的系统调用追踪工具,我们建立一个类似于 strace 的测试程序来分析 系统调用追踪的工作原理。该测试程序可以用来追踪 ARM32 架构下的程序的系统调用。测试程序 syscall_trace 的代码如下:

/*
 * syscall_trace.c
 */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <sys/user.h>

 
/*
 * System call numbers differ between architectures; this is the system call
 * table for the ARM32 architecture.
 * The set of supported system calls also differs between kernel versions, so
 * adjust the table to match the kernel actually in use.
 *
 * Each entry maps a system call number to its printable name. The table is
 * terminated by an entry whose number is negative.
 */
static struct syscall {
	int  no;    /* system call number */
	char *name; /* system call name */
} syscall_table[] = {
	/* architecture-independent system calls */
	{0, "restart_syscall"}, 
	{1, "exit"}, 
	{2, "fork"}, 
	{3, "read"}, 
	{4, "write"}, 
	{5, "open"}, 
	{6, "close"}, 
	{8, "creat"}, 
	{9, "link"}, 
	{10, "unlink"}, 
	{11, "execve"}, 
	{12, "chdir"}, 
	{14, "mknod"}, 
	{15, "chmod"}, 
	{16, "lchown"}, 
	{19, "lseek"}, 
	{20, "getpid"}, 
	{21, "mount"}, 
	{23, "setuid"}, 
	{24, "getuid"}, 
	{26, "ptrace"}, 
	{29, "pause"}, 
	{33, "access"}, 
	{34, "nice"}, 
	{36, "sync"}, 
	{37, "kill"}, 
	{38, "rename"}, 
	{39, "mkdir"}, 
	{40, "rmdir"}, 
	{41, "dup"}, 
	{42, "pipe"}, 
	{43, "times"}, 
	{45, "brk"}, 
	{46, "setgid"}, 
	{47, "getgid"}, 
	{49, "geteuid"}, 
	{50, "getegid"}, 
	{51, "acct"}, 
	{52, "umount2"}, 
	{54, "ioctl"}, 
	{55, "fcntl"}, 
	{57, "setpgid"}, 
	{60, "umask"}, 
	{61, "chroot"}, 
	{62, "ustat"}, 
	{63, "dup2"}, 
	{64, "getppid"}, 
	{65, "getpgrp"}, 
	{66, "setsid"}, 
	{67, "sigaction"}, 
	{70, "setreuid"}, 
	{71, "setregid"}, 
	{72, "sigsuspend"}, 
	{73, "sigpending"}, 
	{74, "sethostname"}, 
	{75, "setrlimit"}, 
	{77, "getrusage"}, 
	{78, "gettimeofday"}, 
	{79, "settimeofday"}, 
	{80, "getgroups"}, 
	{81, "setgroups"}, 
	{83, "symlink"}, 
	{85, "readlink"}, 
	{86, "uselib"}, 
	{87, "swapon"}, 
	{88, "reboot"}, 
	{91, "munmap"}, 
	{92, "truncate"}, 
	{93, "ftruncate"}, 
	{94, "fchmod"}, 
	{95, "fchown"}, 
	{96, "getpriority"}, 
	{97, "setpriority"}, 
	{99, "statfs"}, 
	{100, "fstatfs"}, 
	{103, "syslog"}, 
	{104, "setitimer"}, 
	{105, "getitimer"}, 
	{106, "stat"}, 
	{107, "lstat"}, 
	{108, "fstat"}, 
	{111, "vhangup"}, 
	{114, "wait4"}, 
	{115, "swapoff"}, 
	{116, "sysinfo"}, 
	{118, "fsync"}, 
	{119, "sigreturn"}, 
	{120, "clone"}, 
	{121, "setdomainname"}, 
	{122, "uname"}, 
	{124, "adjtimex"}, 
	{125, "mprotect"}, 
	{126, "sigprocmask"}, 
	{128, "init_module"}, 
	{129, "delete_module"}, 
	{131, "quotactl"}, 
	{132, "getpgid"}, 
	{133, "fchdir"}, 
	{134, "bdflush"}, 
	{135, "sysfs"}, 
	{136, "personality"}, 
	{138, "setfsuid"}, 
	{139, "setfsgid"}, 
	{140, "_llseek"}, 
	{141, "getdents"}, 
	{142, "_newselect"}, 
	{143, "flock"}, 
	{144, "msync"}, 
	{145, "readv"}, 
	{146, "writev"}, 
	{147, "getsid"}, 
	{148, "fdatasync"}, 
	{149, "_sysctl"}, 
	{150, "mlock"}, 
	{151, "munlock"}, 
	{152, "mlockall"}, 
	{153, "munlockall"}, 
	{154, "sched_setparam"}, 
	{155, "sched_getparam"}, 
	{156, "sched_setscheduler"}, 
	{157, "sched_getscheduler"}, 
	{158, "sched_yield"}, 
	{159, "sched_get_priority_max"}, 
	{160, "sched_get_priority_min"}, 
	{161, "sched_rr_get_interval"}, 
	{162, "nanosleep"}, 
	{163, "mremap"}, 
	{164, "setresuid"}, 
	{165, "getresuid"}, 
	{168, "poll"}, 
	{169, "nfsservctl"}, 
	{170, "setresgid"}, 
	{171, "getresgid"}, 
	{172, "prctl"}, 
	{173, "rt_sigreturn"}, 
	{174, "rt_sigaction"}, 
	{175, "rt_sigprocmask"}, 
	{176, "rt_sigpending"}, 
	{177, "rt_sigtimedwait"}, 
	{178, "rt_sigqueueinfo"}, 
	{179, "rt_sigsuspend"}, 
	{180, "pread64"}, 
	{181, "pwrite64"}, 
	{182, "chown"}, 
	{183, "getcwd"}, 
	{184, "capget"}, 
	{185, "capset"}, 
	{186, "sigaltstack"}, 
	{187, "sendfile"}, 
	{190, "vfork"}, 
	{191, "ugetrlimit"}, 
	{192, "mmap2"}, 
	{193, "truncate64"}, 
	{194, "ftruncate64"}, 
	{195, "stat64"}, 
	{196, "lstat64"}, 
	{197, "fstat64"}, 
	{198, "lchown32"}, 
	{199, "getuid32"}, 
	{200, "getgid32"}, 
	{201, "geteuid32"}, 
	{202, "getegid32"}, 
	{203, "setreuid32"}, 
	{204, "setregid32"}, 
	{205, "getgroups32"}, 
	{206, "setgroups32"}, 
	{207, "fchown32"}, 
	{208, "setresuid32"}, 
	{209, "getresuid32"}, 
	{210, "setresgid32"}, 
	{211, "getresgid32"}, 
	{212, "chown32"}, 
	{213, "setuid32"}, 
	{214, "setgid32"}, 
	{215, "setfsuid32"}, 
	{216, "setfsgid32"}, 
	{217, "getdents64"}, 
	{218, "pivot_root"}, 
	{219, "mincore"}, 
	{220, "madvise"}, 
	{221, "fcntl64"}, 
	{224, "gettid"}, 
	{225, "readahead"}, 
	{226, "setxattr"}, 
	{227, "lsetxattr"}, 
	{228, "fsetxattr"}, 
	{229, "getxattr"}, 
	{230, "lgetxattr"}, 
	{231, "fgetxattr"}, 
	{232, "listxattr"}, 
	{233, "llistxattr"}, 
	{234, "flistxattr"}, 
	{235, "removexattr"}, 
	{236, "lremovexattr"}, 
	{237, "fremovexattr"}, 
	{238, "tkill"}, 
	{239, "sendfile64"}, 
	{240, "futex"}, 
	{241, "sched_setaffinity"}, 
	{242, "sched_getaffinity"}, 
	{243, "io_setup"}, 
	{244, "io_destroy"}, 
	{245, "io_getevents"}, 
	{246, "io_submit"}, 
	{247, "io_cancel"}, 
	{248, "exit_group"}, 
	{249, "lookup_dcookie"}, 
	{250, "epoll_create"}, 
	{251, "epoll_ctl"}, 
	{252, "epoll_wait"}, 
	{253, "remap_file_pages"}, 
	{256, "set_tid_address"}, 
	{257, "timer_create"}, 
	{258, "timer_settime"}, 
	{259, "timer_gettime"}, 
	{260, "timer_getoverrun"}, 
	{261, "timer_delete"}, 
	{262, "clock_settime"}, 
	{263, "clock_gettime"}, 
	{264, "clock_getres"}, 
	{265, "clock_nanosleep"}, 
	{266, "statfs64"}, 
	{267, "fstatfs64"}, 
	{268, "tgkill"}, 
	{269, "utimes"}, 
	{270, "arm_fadvise64_64"}, 
	{271, "pciconfig_iobase"}, 
	{272, "pciconfig_read"}, 
	{273, "pciconfig_write"}, 
	{274, "mq_open"}, 
	{275, "mq_unlink"}, 
	{276, "mq_timedsend"}, 
	{277, "mq_timedreceive"}, 
	{278, "mq_notify"}, 
	{279, "mq_getsetattr"}, 
	{280, "waitid"}, 
	{281, "socket"}, 
	{282, "bind"}, 
	{283, "connect"}, 
	{284, "listen"}, 
	{285, "accept"}, 
	{286, "getsockname"}, 
	{287, "getpeername"}, 
	{288, "socketpair"}, 
	{289, "send"}, 
	{290, "sendto"}, 
	{291, "recv"}, 
	{292, "recvfrom"}, 
	{293, "shutdown"}, 
	{294, "setsockopt"}, 
	{295, "getsockopt"}, 
	{296, "sendmsg"}, 
	{297, "recvmsg"}, 
	{298, "semop"}, 
	{299, "semget"}, 
	{300, "semctl"}, 
	{301, "msgsnd"}, 
	{302, "msgrcv"}, 
	{303, "msgget"}, 
	{304, "msgctl"}, 
	{305, "shmat"}, 
	{306, "shmdt"}, 
	{307, "shmget"}, 
	{308, "shmctl"}, 
	{309, "add_key"}, 
	{310, "request_key"}, 
	{311, "keyctl"}, 
	{312, "semtimedop"}, 
	{313, "vserver"}, 
	{314, "ioprio_set"}, 
	{315, "ioprio_get"}, 
	{316, "inotify_init"}, 
	{317, "inotify_add_watch"}, 
	{318, "inotify_rm_watch"}, 
	{319, "mbind"}, 
	{320, "get_mempolicy"}, 
	{321, "set_mempolicy"}, 
	{322, "openat"}, 
	{323, "mkdirat"}, 
	{324, "mknodat"}, 
	{325, "fchownat"}, 
	{326, "futimesat"}, 
	{327, "fstatat64"}, 
	{328, "unlinkat"}, 
	{329, "renameat"}, 
	{330, "linkat"}, 
	{331, "symlinkat"}, 
	{332, "readlinkat"}, 
	{333, "fchmodat"}, 
	{334, "faccessat"}, 
	{335, "pselect6"}, 
	{336, "ppoll"}, 
	{337, "unshare"}, 
	{338, "set_robust_list"}, 
	{339, "get_robust_list"}, 
	{340, "splice"}, 
	{341, "arm_sync_file_range"}, 
	{342, "tee"}, 
	{343, "vmsplice"}, 
	{344, "move_pages"}, 
	{345, "getcpu"}, 
	{346, "epoll_pwait"}, 
	{347, "kexec_load"}, 
	{348, "utimensat"}, 
	{349, "signalfd"}, 
	{350, "timerfd_create"}, 
	{351, "eventfd"}, 
	{352, "fallocate"}, 
	{353, "timerfd_settime"}, 
	{354, "timerfd_gettime"}, 
	{355, "signalfd4"}, 
	{356, "eventfd2"}, 
	{357, "epoll_create1"}, 
	{358, "dup3"}, 
	{359, "pipe2"}, 
	{360, "inotify_init1"}, 
	{361, "preadv"}, 
	{362, "pwritev"}, 
	{363, "rt_tgsigqueueinfo"}, 
	{364, "perf_event_open"}, 
	{365, "recvmmsg"}, 
	{366, "accept4"}, 
	{367, "fanotify_init"}, 
	{368, "fanotify_mark"}, 
	{369, "prlimit64"}, 
	{370, "name_to_handle_at"}, 
	{371, "open_by_handle_at"}, 
	{372, "clock_adjtime"}, 
	{373, "syncfs"}, 
	{374, "sendmmsg"}, 
	{375, "setns"}, 
	{376, "process_vm_readv"}, 
	{377, "process_vm_writev"}, 
	{378, "kcmp"}, 
	{379, "finit_module"}, 
	{380, "sched_setattr"}, 
	{381, "sched_getattr"}, 
	{382, "renameat2"}, 
	{383, "seccomp"}, 
	{384, "getrandom"}, 
	{385, "memfd_create"}, 
	{386, "bpf"}, 
	{387, "execveat"}, 
	{388, "userfaultfd"}, 
	{389, "membarrier"}, 
	{390, "mlock2"}, 
	{391, "copy_file_range"}, 
	{392, "preadv2"}, 
	{393, "pwritev2"}, 
	{394, "pkey_mprotect"}, 
	{395, "pkey_alloc"}, 
	{396, "pkey_free"}, 
	{397, "statx"}, 
	/* architecture-specific system calls */
#define __ARM_NR_BASE 			0x0f0000
#define __ARM_NR_breakpoint		(__ARM_NR_BASE+1)
#define __ARM_NR_cacheflush		(__ARM_NR_BASE+2)
#define __ARM_NR_usr26			(__ARM_NR_BASE+3)
#define __ARM_NR_usr32			(__ARM_NR_BASE+4)
#define __ARM_NR_set_tls		(__ARM_NR_BASE+5)
	{__ARM_NR_breakpoint, "breakpoint"},
	{__ARM_NR_cacheflush, "cacheflush"},
	{__ARM_NR_usr26, "usr26"},
	{__ARM_NR_usr32, "usr32"},
	{__ARM_NR_set_tls, "set_tls"},

	/* end-of-table marker: lookups stop at the first negative number */
	{-1, NULL}
};

/*
 * Look up the printable name of system call number @scno in syscall_table[].
 *
 * The scan walks the table from its first entry and ends at the first entry
 * whose number is negative.
 *
 * Returns the name of the matching entry, or NULL when @scno is not listed.
 */
static char *find_syscall_name(int scno)
{
	struct syscall *entry = syscall_table;

	while (entry->no >= 0) {
		if (entry->no == scno)
			return entry->name;
		entry++;
	}

	return NULL;
}

/*
 * Resume the stopped tracee @child with PTRACE_SYSCALL and wait for its next
 * state change, storing the wait status in @status.
 *
 * PTRACE_SYSCALL lets the tracee run until it enters or leaves a system call,
 * where it stops again and the waiting tracer is woken up.
 *
 * Returns 1 when the tracee has stopped again, 0 when it has exited or was
 * killed by a signal, and -1 when ptrace() or wait() failed (the error is
 * reported on stderr).
 */
static int resume_to_next_stop(pid_t child, int *status)
{
	if (ptrace(PTRACE_SYSCALL, child, NULL, NULL) < 0) {
		perror("ptrace(PTRACE_SYSCALL)");
		return -1;
	}

	if (wait(status) < 0) {
		perror("wait");
		return -1;
	}

	/* A tracee that is gone (normal exit or fatal signal) ends the trace. */
	if (WIFEXITED(*status) || WIFSIGNALED(*status))
		return 0;

	return 1;
}

/*
 * Fork a child that runs /bin/hello under ptrace and print every system call
 * the child completes as "name() = return-value".
 *
 * Returns 0 when the child ran to completion, EXIT_FAILURE when the child
 * could not be started or a ptrace()/wait() call failed.
 */
int main(int argc, char *argv[])
{
	pid_t child;
	int status;
	int rc;
	struct user_regs regs; /* architecture-specific register layout */
	unsigned long int scret, scno;
	const char *scname;

	(void)argc;
	(void)argv;

	child = fork();
	if (child < 0) {
		perror("fork");
		return EXIT_FAILURE;
	}

	if (child == 0) { /* child */
		/* Mark this process as traced by its parent. */
		if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) < 0) {
			perror("ptrace(PTRACE_TRACEME)");
			_exit(EXIT_FAILURE);
		}

		/*
		 * Before any code of /bin/hello runs, the exec makes the child send
		 * SIGTRAP to itself. While handling that SIGTRAP the child notifies
		 * the parent with SIGCHLD and stops; it does not run again until the
		 * parent resumes it with ptrace(PTRACE_SYSCALL, child, NULL, NULL).
		 * This synchronises both processes, so the child cannot run or exit
		 * before the parent is ready to inspect it.
		 */
		execl("/bin/hello", "/bin/hello", (char *)NULL);

		/* Only reached when the exec itself failed. */
		perror("execl");
		_exit(127);
	}

	/* parent */

	/* Wait for the stop the child enters because of its exec. */
	if (wait(&status) < 0) {
		perror("wait");
		return EXIT_FAILURE;
	}
	if (!WIFSTOPPED(status)) {
		/* e.g. PTRACE_TRACEME or the exec failed and the child is gone */
		fprintf(stderr, "child terminated before tracing could start\n");
		return EXIT_FAILURE;
	}

	for (;;) {
		/* Run the child up to its next system-call entry. */
		rc = resume_to_next_stop(child, &status);
		if (rc <= 0)
			break;

		/*
		 * To inspect the registers at system-call entry, call
		 * ptrace(PTRACE_GETREGS) here.
		 */

		/* Run the child up to the matching system-call exit. */
		rc = resume_to_next_stop(child, &status);
		if (rc <= 0)
			break;

		/* Fetch the register state at system-call exit. */
		if (ptrace(PTRACE_GETREGS, child, 0, &regs) < 0) {
			perror("ptrace(PTRACE_GETREGS)");
			rc = -1;
			break;
		}
		scret = regs.uregs[0]; /* r0 holds the system call return value */
		scno = regs.uregs[7];  /* r7 holds the system call number */

		/* Numbers missing from the table must not reach "%s" as NULL. */
		scname = find_syscall_name((int)scno);
		if (scname != NULL)
			printf("%s() = %d\n", scname, (int)scret);
		else
			printf("syscall_%lu() = %d\n", scno, (int)scret);
	}

	return rc < 0 ? EXIT_FAILURE : 0;
}

被追踪程序 hello 的代码如下:

#include <stdio.h>

/* Tracee used by the demo: writes one greeting line to stdout and returns 0. */
int main(void)
{
	fputs("Hello, World!\n", stdout);

	return 0;
}

编译:

arm-linux-gnueabihf-gcc -o syscall_trace syscall_trace.c
arm-linux-gnueabihf-gcc -o hello hello.c

然后将程序 syscall_trace 和 hello 均放入根文件系统的 /bin 目录下,用 QEMU 加载系统,系统运行起来后,运行 syscall_trace 程序,执行结果如下:

$ /bin/syscall_trace
brk() = 135168
uname() = 0
access() = -2
mmap2() = 1995878400
access() = -2
open() = 3
fstat64() = 0
mmap2() = 1995866112
close() = 0
access() = -2
open() = 3
read() = 512
lseek() = 894132
read() = 2960
lseek() = 888324
read() = 51
fstat64() = 0
mmap2() = 1994760192
mprotect() = 0
mmap2() = 1995702272
mmap2() = 1995714560
close() = 0
mmap2() = 1995862016
set_tls() = 0
mprotect() = 0
mprotect() = 0
munmap() = 0
clock_gettime() = 0
fstat64() = 0
ioctl() = 0
brk() = 135168
brk() = 274432
Hello, World!
write() = 33

4. 工作原理分析

我们以测试程序 syscall_trace 的代码 syscall_trace.c 为起点,逐步分析系统调用追踪的原理。

4.1 父子进程追踪与被追踪关系的建立

main()
	...
	if ((child = fork()) == 0) { /* 子进程建立,这里不展开具体细节,因为这不是我们关注的重点 */
		/* 标记子进程为被 trace 状态,加入到父进程追踪进程列表 */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL)
        	/* 以下是内核空间细节 */
        	sys_ptrace(PTRACE_TRACEME, 0, NULL, NULL)
        		if (request == PTRACE_TRACEME) {
        			ret = ptrace_traceme();
        				if (!current->ptrace) { /* 进程当前没有被追踪 */
        					ret = security_ptrace_traceme(current->parent);
        					if (!ret && !(current->real_parent->flags & PF_EXITING)) {
								current->ptrace = PT_PTRACED; /* 标记进程处于被追踪状态 */
								ptrace_link(current, current->real_parent); /* 将当前进程的父进程设置为追踪进程 */
									__ptrace_link(child, new_parent, __task_cred(new_parent))
										list_add(&child->ptrace_entry, &new_parent->ptraced); /* 将子进程添加到父进程的追踪进程列表 */
										child->parent = new_parent; /* 设置子进程的 task_struct::parent 为追踪进程 */
							}
        				}
        			...
        			goto out;
        		}
        		...
        	 out:	
				return ret;
	}

此时父进程与被追踪子进程的关系图如下:
Linux:系统调用追踪原理简析_第1张图片

4.2 子进程暂停自身执行并通知父进程

main()
	...
	if ((child = fork()) == 0) { /* 子进程 */
		/* 标记子进程为被 trace 状态,加入到父进程追踪进程列表 */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);

		/*
		 * 在程序 /bin/hello 代码开始执行之前,子进程的 execl() 发送 SIGTRAP 信号给自身;
		 * 然后子进程在处理 SIGTRAP 信号时,给父进程发送 SIGCHLD 信号,同时暂停自身执行,
		 * 告知父进程自身的状态变化。
		 * 子进程在父进程将其唤醒之前,不会再继续运行。父进程后续通过 ptrace() 调用:
		 * ptrace(PTRACE_SYSCALL, child, NULL, NULL);
		 * 将子进程唤醒继续执行。
		 * 通过这样的方式进行父子进程同步,可以防止父进程在对子进程进行信息查询前,子进
		 * 程就已经运行或退出。
		 */
		execl("/bin/hello", "/bin/hello", NULL)
		
		/* 以下是内核空间细节 */
		/* @arch/arm/kernel/entry-common.S */	
		vector_swi:
			...
			adr	tbl, sys_call_table /* r8 = 系统调用表 sys_call_table[] 的地址 */
			get_thread_info tsk /* r9 = 进程的 thread_info */
			...
			/* 调用系统调用接口 */
			invoke_syscall tbl, scno, r10, __ret_fast_syscall
				sys_execve() /* @fs/exec.c */
					...
					do_execveat_common(AT_FDCWD, filename, argv, envp, 0)
						/* 我们只关注 SIGTRAP 信号相关细节 */
						...
						retval = exec_binprm(bprm)
							/* 搜寻合适的 binary handler 并用它加载程序 */
							ret = search_binary_handler(bprm)
								retval = load_elf_binary();
								...
								return retval;
							if (ret >= 0) {
								...
								/* 进程给自身发送 SIGTRAP 信号 */
								ptrace_event(PTRACE_EVENT_EXEC, old_vpid)
									...
									if ((current->ptrace & (PT_PTRACED|PT_SEIZED)) == PT_PTRACED)
										send_sig(SIGTRAP, current, 0)
									...
								...
							}
						...
						return retval;
	}

此时,子进程已经有挂起的信号 SIGTRAP 待处理,接下来就是对该信号的处理了。子进程从系统调用 sys_execve() 返回用户空间时处理 SIGTRAP 信号,我们看一下子进程是如何处理 SIGTRAP 信号的:

do_work_pending()
	...
	if (thread_flags & _TIF_SIGPENDING) {
		do_signal(regs, syscall)
			...
			get_signal(&ksig)
				for (;;) {
					struct k_sigaction *ka;
					
					...
					signr = dequeue_synchronous_signal(&ksig->info);
					if (!signr)
						signr = dequeue_signal(current, &current->blocked, &ksig->info);
			
					if (!signr)
						break; /* will return 0 */
					
					/* 处理被追踪进程的 (SIGTRAP) 信号 */
					if (unlikely(current->ptrace) && signr != SIGKILL) {
						signr = ptrace_signal(signr, &ksig->info)
							...
							ptrace_stop(signr, CLD_TRAPPED, 0, info)
								set_special_state(TASK_TRACED)
									current->state = TASK_TRACED;
								...
								current->last_siginfo = info;
								current->exit_code = exit_code; /* exit_code == SIGTRAP */
								...
								if (may_ptrace_stop()) {
									/* 给父进程发送 SIGCHLD 信号 */
									do_notify_parent_cldstop(current, true, why)
										struct siginfo info;
										
										...
										info.si_signo = SIGCHLD;
										...
										info.si_code = why; /* why == CLD_TRAPPED */
										switch (why) {
										...
										case CLD_TRAPPED:
									 		info.si_status = tsk->exit_code & 0x7f;
									 		break;
									 	...
										}
										
										sighand = parent->sighand; /* 父进程的信号处理接口 */
										/*
										 * 不给忽略了 SIGCHLD 的父进程发送该信号。
										 * 默认情况下进程的 SIGCHLD 处理接口为 SIG_DFL 。
										 */
										if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
										    !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
											__group_send_sig_info(SIGCHLD, &info, parent); /* 给父进程所在线程组发送 SIGCHLD 信号 */
										__wake_up_parent(tsk, parent) /* 唤醒调用 wait() 等待子进程状态变化的父进程处理 SIGCHLD 信号 */
											__wake_up_sync_key(&parent->signal->wait_chldexit,
																TASK_INTERRUPTIBLE, 1, p)
									...
									/*
									 * 将子进程停下来,直到父进程将其唤醒继续执行。
									 * 在我们的场景下,父进程通过 ptrace(PTRACE_SYSCALL) 将子进程加入调度继续执行。
									 */
									freezable_schedule();
								}
								...
								recalc_sigpending_tsk(current);
							...
					}
					...
				}
	}
	...

我们看到,被追踪子进程对 SIGTRAP 信号的处理是这样的:给父进程发送 SIGCHLD 信号,然后暂停自身执行,等待被唤醒。
上面的分析涉及到 系统调用 和 信号处理 的细节,可以分别参考博文 Linux系统调用实现简析 和 Linux信号处理简析 进行了解。

4.3 父进程追踪子进程的系统调用

4.3.1 父进程等待子进程execl()调用期间发送的SIGCHLD信号

章节 4.2 讲到被追踪的子进程向父进程发送 SIGCHLD 信号,同时暂停执行。我们来看父进程等待【子进程 execl() 调用期间发送的SIGCHLD 信号】的具体流程:

if ((child = fork()) == 0) { /* 子进程 */
	/* 标记子进程为被 trace 状态,加入到父进程追踪进程列表 */
	ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		
	/*
	 * 在程序 /bin/hello 代码开始执行之前,子进程的 execl() 发送 SIGTRAP 信号给自身;
	 * 然后子进程在处理 SIGTRAP 信号时,给父进程发送 SIGCHLD 信号,同时暂停自身执行,
	 * 告知父进程自身的状态变化。
	 * 子进程在父进程将其唤醒之前,不会再继续运行。父进程后续通过 ptrace() 调用:
	 * ptrace(PTRACE_SYSCALL, child, NULL, NULL);
	 * 将子进程唤醒继续执行。
	 * 通过这样的方式进行父子进程同步,可以防止父进程在对子进程进行信息查询前,子进
	 * 程就已经运行或退出。
	 */
	execl("/bin/hello", "/bin/hello", NULL);

	/* 永远不应该运行到这里 */
	exit(0);
} else { /* 父进程 */
	/*
	 * 子进程 execl() 给自身发送 SIGTRAP ,然后处理该信号时给父进程发送 
	 * SIGCHLD 信号,然后暂停自身执行,等待父进程唤醒。 
	 * 这里父进程等待的是子进程因 execl() 发送的 SIGCHLD 信号。
	 */
	wait(&status)
		/* 下面是 glibc 和 内核空间的细节 */
		__waitpid(WAIT_ANY, stat_loc, 0) /* WAIT_ANY == -1 */
       		__wait4(-1, stat_loc, 0, NULL)
       			/* 内核空间的细节 */
       			kernel_wait4(-1, NULL, 0, NULL)
       				...
       				if (upid == -1) /* 等待任一子进程 */
						type = PIDTYPE_MAX;
					...
					wo.wo_type	= type;
					wo.wo_pid	= pid;
					wo.wo_flags	= options | WEXITED;
					...
					ret = do_wait(&wo)
						/*
						 * 子进程处理 SIGTRAP 时给父进程(即当前进程),发送 SIGCHLD 
						 * 信号,然后通过 @wo->child_wait 唤醒父进程处理 SIGCHLD 信号,
						 * child_wait_callback() 也会被触发,它继续完成唤醒父进程的整个过程:
						 * child_wait_callback() -> default_wake_function() 。 
						 */
						init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
						wo->child_wait.private = current;
						add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);

					repeat:	
						...
						set_current_state(TASK_INTERRUPTIBLE);
						tsk = current;
						do { /* 遍历线程组中的所有进程,包括 @current */
							retval = do_wait_thread(wo, tsk); /* 等待 【进程 @tsk 所有没被追踪的子进程】 的状态变更 */
								...									
								/* 遍历 @tsk 的所有子进程 */
								list_for_each_entry(p, &tsk->children, sibling) {
									int ret = wait_consider_task(wo, 0, p);
										...
										ret = wait_task_stopped(wo, ptrace, p);
											...
											pid = task_pid_vnr(p);
											...
											/* 设置子进程的状态,从 wait() 返回 */
											if (likely(!(wo->wo_flags & WNOWAIT)))
												wo->wo_stat = (exit_code << 8) | 0x7f; /* (SIGTRAP << 8) | 0x7f */
											...
											return pid;
										...
									if (ret)
										return ret;
								}
							
								return 0;
							if (retval)
								goto end;
					
							retval = ptrace_do_wait(wo, tsk); /* 等待 【进程 @tsk 所有被追踪的子进程】 的状态变更 */
								/* 类似于 do_wait_thread() */
							if (retval)
								goto end;
						} while_each_thread(current, tsk);
						
						...
					notask:
						retval = wo->notask_error;
						if (!retval && !(wo->wo_flags & WNOHANG)) {
							...
							if (!signal_pending(current)) {
								/* 没有等到子进程的状态变化,父进程进入可中断睡眠继续等待 */
								schedule(); 
								goto repeat;
							}
						}
						
						/* 等待到子进程的状态变化,发起 wait() 等待的父进程返回用户空间 */
						__set_current_state(TASK_RUNNING);
						remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
						return retval;
}
...

wait_task_stopped() 中等待到了子进程因 SIGTRAP 信号而发送的 SIGCHLD 信号。父进程发起 wait() 调用后,父子进程数据关系如下图所示:
Linux:系统调用追踪原理简析_第2张图片

4.3.2 父进程唤醒子进程继续执行

现在子进程处在 execl() 发送给自身的 SIGTRAP 而导致的暂停状态,要跟踪子进程的系统调用,得让它继续运行。父进程通过 ptrace(PTRACE_SYSCALL) 调用将子进程重新纳入调度执行,并通过对子进程设置 TIF_SYSCALL_TRACE 标记,指示开始对子进程系统调用的追踪,具体流程如下:

main()
	if ((child = fork()) == 0) { /* 子进程 */
		...
	} else {
		/*
		 * 子进程 execl() 给自身发送 SIGTRAP ,然后处理该信号时给父进程发送 
		 * SIGCHLD 信号,然后暂停自身执行,等待父进程唤醒。 
		 * 这里父进程等待的是子进程因 execl() 发送的 SIGCHLD 信号。
		 */
		wait(&status);
		
		/* 
 		 * 发送 PTRACE_SYSCALL 命令给被跟踪子进程,将唤醒子进程继续执行。 
		 * 同时 PTRACE_SYSCALL 命令会导致子进程在 系统调用进入或退出时,
		 * 给自身发送 SIGTRAP 信号,然后子进程在处理 SIGTRAP 信号时给父
		 * 进程发送 SIGCHLD 信号,然后暂停自身执行,等待父进程唤醒后继续
		 * 执行。
		 * 父进程通过 ptrace(PTRACE_SYSCALL) 唤醒子进程继续执行。
		 */
		ptrace(PTRACE_SYSCALL, child, NULL, NULL)
        	/* 以下是内核空间细节 */
        	sys_ptrace(PTRACE_SYSCALL, child, NULL, NULL)
        		ret = ptrace_check_attach(child, request == PTRACE_KILL ||
				  							request == PTRACE_INTERRUPT)
				  	/* 子进程被 ptrace 了 && 当前是其父进程对其发起了 PTRACE_SYSCALL */
					if (child->ptrace && child->parent == current) {
						/*
						 * ptrace_freeze_traced() 确保子进程 @child 
						 * 只能被父进程(即当前进程)唤醒: 将其状态设置为 __TASK_TRACED 。
						 */
						if (ignore_state || ptrace_freeze_traced(child))
							ret = 0;
					}
						
					if (!ret && !ignore_state) {
						/* 等待子进程停下来 (从 CPU 的运行队列中移出) */
						wait_task_inactive(child, __TASK_TRACED)
					}
					
					return ret;
				
				ret = arch_ptrace(child, request, addr, data);
					switch (request) {
					...
					default:
						ret = ptrace_request(child, request, addr, data);
							switch (request) {
							...
							case PTRACE_SYSCALL:
							case PTRACE_CONT:
								return ptrace_resume(child, request, data);
									if (request == PTRACE_SYSCALL)
										/*
										 * 启用子进程的系统调用追踪: 
										 * 在系统调用出入口暂停子进程,方便父进程获取
										 * 调用信息,之后需要父进程来唤醒子进程。
										 */
										set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
									else
										...
									...
									child->exit_code = data;
									wake_up_state(child, __TASK_TRACED); /* 唤醒子进程继续执行 */
									...
									return 0;
							}
					}
					return ret;

		for (;;) {
			...
		}
	}

4.3.3 提取子进程系统调用进入退出时的信息

上小节中设置子进程的 TIF_SYSCALL_TRACE 标记后(对进程的 thread_info::flags 设置),每当子进程进入或退出系统调用,子进程都会通过给自己发送 SIGTRAP 信号暂停自身执行,同时给父进程发送 SIGCHLD 信号;而父进程则通过 wait() 等到了子进程发送的 SIGCHLD 信号,并在子进程暂停期间提取系统调用信息,然后通过 ptrace(PTRACE_SYSCALL) 调用重启子进程。

4.3.3.1 子进程进入系统调用暂停执行的流程
/* @arch/arm/kernel/entry-common.S */
ENTRY(vector_swi)
	adr	tbl, sys_call_table /* r8 = 系统调用表 sys_call_table[] 的地址 */
	get_thread_info tsk /* r9 = 进程的 thread_info */
	...
local_restart:
	ldr	r10, [tsk, #TI_FLAGS] /* r10 = 进程的 thread_info::flags */
	...
	tst	r10, #_TIF_SYSCALL_WORK /* 检查系统调用追踪标记 */
	bne	__sys_trace /* 系统调用进入前追踪 */
	...
__sys_trace:
	bl	syscall_trace_enter /* arch/arm/kernel/ptrace.c */
		if (test_thread_flag(TIF_SYSCALL_TRACE))
			tracehook_report_syscall(regs, PTRACE_SYSCALL_ENTER)
				tracehook_report_syscall_entry(regs)
					ptrace_report_syscall(regs)
						/*
						 * 被追踪子进程给自身发送 SIGTRAP 信号: 
						 * 子进程处理该信号时,发送信号 SIGCHLD 唤醒 wait() 
						 * 等待的父进程,然后暂停自身,直到父进程通过 
						 * ptrace(PTRACE_SYSCALL) 调用唤醒它。
						 */
						ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0));
	...
	/* 调用系统调用 */
	invoke_syscall tbl, scno, r10, __sys_trace_return, reload=1
	...
4.3.3.2 子进程退出系统调用暂停执行的流程
/* @arch/arm/kernel/entry-common.S */
ENTRY(vector_swi)
	ldr	r10, [tsk, #TI_FLAGS] /* r10 = 进程的 thread_info::flags */
	...
	tst	r10, #_TIF_SYSCALL_WORK /* 检查系统调用追踪标记 */
	bne	__sys_trace /* 进行系统调用进入前追踪 */
	...
__sys_trace:
	/* 系统调用进入前追踪 */
	bl	syscall_trace_enter /* arch/arm/kernel/ptrace.c */
	...
	/* 调用系统调用,返回到 __sys_trace_return 标号处 */
	invoke_syscall tbl, scno, r10, __sys_trace_return, reload=1
	...
__sys_trace_return:
	bl	syscall_trace_exit /* arch/arm/kernel/ptrace.c */
		...
		if (test_thread_flag(TIF_SYSCALL_TRACE))
			tracehook_report_syscall(regs, PTRACE_SYSCALL_EXIT)
				tracehook_report_syscall_exit(regs, 0)
					ptrace_report_syscall(regs)
						/*
						 * 被 ptraced 子进程给自身发送 SIGTRAP 信号: 
						 * 子进程处理该信号时,发送信号 SIGCHLD 唤醒 wait() 等待的父进程,
						 * 然后暂停自身,直到父进程通过 ptrace(PTRACE_SYSCALL) 调用唤醒它。
						 */
						ptrace_notify(SIGTRAP | ((ptrace & PT_TRACESYSGOOD) ? 0x80 : 0))
	
	/* 返回用户空间前,处理挂起的信号 */
	b	ret_slow_syscall
4.3.3.3 子进程在系统调用进出口暂停后的重启流程

子进程在系统调用进出口暂停执行后的重启流程,参看章节 4.3.2

4.4 子进程退出与父进程的追踪终止

子进程退出时,最后的系统调用是 sys_exit_group() ,我们简略的看一下退出时的系统调用 sys_exit_group() 的追踪流程,同前面的有些不同,因为进程自身即将退出,它没法再去处理 SIGTRAP 信号了。

sys_exit_group(error_code)
	do_group_exit((error_code & 0xff) << 8)
		do_exit(exit_code)
			...
			tsk->exit_code = code; /* 设置进程退出码 */
			exit_notify(tsk, group_dead)
				...
				if (unlikely(tsk->ptrace)) { /* 如果进程 @tsk 处于被追踪状态 */
					/* 子进程直接给父进程发送 SIGCHLD ,不再通过 SIGTRAP 绕一圈:
					 * 进程正退出,没机会了! */
					int sig = thread_group_leader(tsk) &&
								thread_group_empty(tsk) &&
								!ptrace_reparented(tsk) ? tsk->exit_signal : SIGCHLD;
					autoreap = do_notify_parent(tsk, sig)
						struct siginfo info;
						
						...
						info.si_signo = sig;
						...
						psig = tsk->parent->sighand; /* @tsk 父进程的信号处理数据 */
						...
						if (valid_signal(sig) && sig)
							__group_send_sig_info(sig, &info, tsk->parent); /* 将信号 @sig 发送给 @tsk 的父进程 */
						__wake_up_parent(tsk, tsk->parent); /* 唤醒 @tsk 的父进程处理信号 */			
				}
			...
			/*
			 * 设置进程状态为最终态: task_struct::state = TASK_DEAD ,
			 * 进程彻底终结放弃 CPU 。
			 */
			do_task_dead();

4.5 小结

上面测试程序的工作方式,类似于 strace /bin/hello ,都是在追踪程序进程内,通过 fork() + exec*() 发起被追踪子进程的方式。

5. 另一种场景下的实现

另外一种追踪系统调用的场景,是将追踪程序挂接到已运行程序上,如 strace -p <pid> 。我们来简单探讨这种方式的实现和原理。我们还是从一份测试代码开始:

/*
 * syscall_trace.c
 *
 * This is a system call trace demo program for the ARM32 architecture.
 *
 * Copyright (c) 2019 Leng Xujun 
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <sys/user.h>


/*
 * System call numbers differ between architectures; this is the system call
 * table for the ARM32 architecture.
 * The set of supported system calls also differs between kernel versions, so
 * adjust the table to match the kernel actually in use.
 *
 * Each entry maps a system call number to its printable name. The table is
 * terminated by an entry whose number is negative.
 */
static struct syscall {
	int  no;    /* system call number */
	char *name; /* system call name */
} syscall_table[] = {
	/* architecture-independent system calls */
	{0, "restart_syscall"}, 
	{1, "exit"}, 
	{2, "fork"}, 
	{3, "read"}, 
	{4, "write"}, 
	{5, "open"}, 
	{6, "close"}, 
	{8, "creat"}, 
	{9, "link"}, 
	{10, "unlink"}, 
	{11, "execve"}, 
	{12, "chdir"}, 
	{14, "mknod"}, 
	{15, "chmod"}, 
	{16, "lchown"}, 
	{19, "lseek"}, 
	{20, "getpid"}, 
	{21, "mount"}, 
	{23, "setuid"}, 
	{24, "getuid"}, 
	{26, "ptrace"}, 
	{29, "pause"}, 
	{33, "access"}, 
	{34, "nice"}, 
	{36, "sync"}, 
	{37, "kill"}, 
	{38, "rename"}, 
	{39, "mkdir"}, 
	{40, "rmdir"}, 
	{41, "dup"}, 
	{42, "pipe"}, 
	{43, "times"}, 
	{45, "brk"}, 
	{46, "setgid"}, 
	{47, "getgid"}, 
	{49, "geteuid"}, 
	{50, "getegid"}, 
	{51, "acct"}, 
	{52, "umount2"}, 
	{54, "ioctl"}, 
	{55, "fcntl"}, 
	{57, "setpgid"}, 
	{60, "umask"}, 
	{61, "chroot"}, 
	{62, "ustat"}, 
	{63, "dup2"}, 
	{64, "getppid"}, 
	{65, "getpgrp"}, 
	{66, "setsid"}, 
	{67, "sigaction"}, 
	{70, "setreuid"}, 
	{71, "setregid"}, 
	{72, "sigsuspend"}, 
	{73, "sigpending"}, 
	{74, "sethostname"}, 
	{75, "setrlimit"}, 
	{77, "getrusage"}, 
	{78, "gettimeofday"}, 
	{79, "settimeofday"}, 
	{80, "getgroups"}, 
	{81, "setgroups"}, 
	{83, "symlink"}, 
	{85, "readlink"}, 
	{86, "uselib"}, 
	{87, "swapon"}, 
	{88, "reboot"}, 
	{91, "munmap"}, 
	{92, "truncate"}, 
	{93, "ftruncate"}, 
	{94, "fchmod"}, 
	{95, "fchown"}, 
	{96, "getpriority"}, 
	{97, "setpriority"}, 
	{99, "statfs"}, 
	{100, "fstatfs"}, 
	{103, "syslog"}, 
	{104, "setitimer"}, 
	{105, "getitimer"}, 
	{106, "stat"}, 
	{107, "lstat"}, 
	{108, "fstat"}, 
	{111, "vhangup"}, 
	{114, "wait4"}, 
	{115, "swapoff"}, 
	{116, "sysinfo"}, 
	{118, "fsync"}, 
	{119, "sigreturn"}, 
	{120, "clone"}, 
	{121, "setdomainname"}, 
	{122, "uname"}, 
	{124, "adjtimex"}, 
	{125, "mprotect"}, 
	{126, "sigprocmask"}, 
	{128, "init_module"}, 
	{129, "delete_module"}, 
	{131, "quotactl"}, 
	{132, "getpgid"}, 
	{133, "fchdir"}, 
	{134, "bdflush"}, 
	{135, "sysfs"}, 
	{136, "personality"}, 
	{138, "setfsuid"}, 
	{139, "setfsgid"}, 
	{140, "_llseek"}, 
	{141, "getdents"}, 
	{142, "_newselect"}, 
	{143, "flock"}, 
	{144, "msync"}, 
	{145, "readv"}, 
	{146, "writev"}, 
	{147, "getsid"}, 
	{148, "fdatasync"}, 
	{149, "_sysctl"}, 
	{150, "mlock"}, 
	{151, "munlock"}, 
	{152, "mlockall"}, 
	{153, "munlockall"}, 
	{154, "sched_setparam"}, 
	{155, "sched_getparam"}, 
	{156, "sched_setscheduler"}, 
	{157, "sched_getscheduler"}, 
	{158, "sched_yield"}, 
	{159, "sched_get_priority_max"}, 
	{160, "sched_get_priority_min"}, 
	{161, "sched_rr_get_interval"}, 
	{162, "nanosleep"}, 
	{163, "mremap"}, 
	{164, "setresuid"}, 
	{165, "getresuid"}, 
	{168, "poll"}, 
	{169, "nfsservctl"}, 
	{170, "setresgid"}, 
	{171, "getresgid"}, 
	{172, "prctl"}, 
	{173, "rt_sigreturn"}, 
	{174, "rt_sigaction"}, 
	{175, "rt_sigprocmask"}, 
	{176, "rt_sigpending"}, 
	{177, "rt_sigtimedwait"}, 
	{178, "rt_sigqueueinfo"}, 
	{179, "rt_sigsuspend"}, 
	{180, "pread64"}, 
	{181, "pwrite64"}, 
	{182, "chown"}, 
	{183, "getcwd"}, 
	{184, "capget"}, 
	{185, "capset"}, 
	{186, "sigaltstack"}, 
	{187, "sendfile"}, 
	{190, "vfork"}, 
	{191, "ugetrlimit"}, 
	{192, "mmap2"}, 
	{193, "truncate64"}, 
	{194, "ftruncate64"}, 
	{195, "stat64"}, 
	{196, "lstat64"}, 
	{197, "fstat64"}, 
	{198, "lchown32"}, 
	{199, "getuid32"}, 
	{200, "getgid32"}, 
	{201, "geteuid32"}, 
	{202, "getegid32"}, 
	{203, "setreuid32"}, 
	{204, "setregid32"}, 
	{205, "getgroups32"}, 
	{206, "setgroups32"}, 
	{207, "fchown32"}, 
	{208, "setresuid32"}, 
	{209, "getresuid32"}, 
	{210, "setresgid32"}, 
	{211, "getresgid32"}, 
	{212, "chown32"}, 
	{213, "setuid32"}, 
	{214, "setgid32"}, 
	{215, "setfsuid32"}, 
	{216, "setfsgid32"}, 
	{217, "getdents64"}, 
	{218, "pivot_root"}, 
	{219, "mincore"}, 
	{220, "madvise"}, 
	{221, "fcntl64"}, 
	{224, "gettid"}, 
	{225, "readahead"}, 
	{226, "setxattr"}, 
	{227, "lsetxattr"}, 
	{228, "fsetxattr"}, 
	{229, "getxattr"}, 
	{230, "lgetxattr"}, 
	{231, "fgetxattr"}, 
	{232, "listxattr"}, 
	{233, "llistxattr"}, 
	{234, "flistxattr"}, 
	{235, "removexattr"}, 
	{236, "lremovexattr"}, 
	{237, "fremovexattr"}, 
	{238, "tkill"}, 
	{239, "sendfile64"}, 
	{240, "futex"}, 
	{241, "sched_setaffinity"}, 
	{242, "sched_getaffinity"}, 
	{243, "io_setup"}, 
	{244, "io_destroy"}, 
	{245, "io_getevents"}, 
	{246, "io_submit"}, 
	{247, "io_cancel"}, 
	{248, "exit_group"}, 
	{249, "lookup_dcookie"}, 
	{250, "epoll_create"}, 
	{251, "epoll_ctl"}, 
	{252, "epoll_wait"}, 
	{253, "remap_file_pages"}, 
	{256, "set_tid_address"}, 
	{257, "timer_create"}, 
	{258, "timer_settime"}, 
	{259, "timer_gettime"}, 
	{260, "timer_getoverrun"}, 
	{261, "timer_delete"}, 
	{262, "clock_settime"}, 
	{263, "clock_gettime"}, 
	{264, "clock_getres"}, 
	{265, "clock_nanosleep"}, 
	{266, "statfs64"}, 
	{267, "fstatfs64"}, 
	{268, "tgkill"}, 
	{269, "utimes"}, 
	{270, "arm_fadvise64_64"}, 
	{271, "pciconfig_iobase"}, 
	{272, "pciconfig_read"}, 
	{273, "pciconfig_write"}, 
	{274, "mq_open"}, 
	{275, "mq_unlink"}, 
	{276, "mq_timedsend"}, 
	{277, "mq_timedreceive"}, 
	{278, "mq_notify"}, 
	{279, "mq_getsetattr"}, 
	{280, "waitid"}, 
	{281, "socket"}, 
	{282, "bind"}, 
	{283, "connect"}, 
	{284, "listen"}, 
	{285, "accept"}, 
	{286, "getsockname"}, 
	{287, "getpeername"}, 
	{288, "socketpair"}, 
	{289, "send"}, 
	{290, "sendto"}, 
	{291, "recv"}, 
	{292, "recvfrom"}, 
	{293, "shutdown"}, 
	{294, "setsockopt"}, 
	{295, "getsockopt"}, 
	{296, "sendmsg"}, 
	{297, "recvmsg"}, 
	{298, "semop"}, 
	{299, "semget"}, 
	{300, "semctl"}, 
	{301, "msgsnd"}, 
	{302, "msgrcv"}, 
	{303, "msgget"}, 
	{304, "msgctl"}, 
	{305, "shmat"}, 
	{306, "shmdt"}, 
	{307, "shmget"}, 
	{308, "shmctl"}, 
	{309, "add_key"}, 
	{310, "request_key"}, 
	{311, "keyctl"}, 
	{312, "semtimedop"}, 
	{313, "vserver"}, 
	{314, "ioprio_set"}, 
	{315, "ioprio_get"}, 
	{316, "inotify_init"}, 
	{317, "inotify_add_watch"}, 
	{318, "inotify_rm_watch"}, 
	{319, "mbind"}, 
	{320, "get_mempolicy"}, 
	{321, "set_mempolicy"}, 
	{322, "openat"}, 
	{323, "mkdirat"}, 
	{324, "mknodat"}, 
	{325, "fchownat"}, 
	{326, "futimesat"}, 
	{327, "fstatat64"}, 
	{328, "unlinkat"}, 
	{329, "renameat"}, 
	{330, "linkat"}, 
	{331, "symlinkat"}, 
	{332, "readlinkat"}, 
	{333, "fchmodat"}, 
	{334, "faccessat"}, 
	{335, "pselect6"}, 
	{336, "ppoll"}, 
	{337, "unshare"}, 
	{338, "set_robust_list"}, 
	{339, "get_robust_list"}, 
	{340, "splice"}, 
	{341, "arm_sync_file_range"}, 
	{342, "tee"}, 
	{343, "vmsplice"}, 
	{344, "move_pages"}, 
	{345, "getcpu"}, 
	{346, "epoll_pwait"}, 
	{347, "kexec_load"}, 
	{348, "utimensat"}, 
	{349, "signalfd"}, 
	{350, "timerfd_create"}, 
	{351, "eventfd"}, 
	{352, "fallocate"}, 
	{353, "timerfd_settime"}, 
	{354, "timerfd_gettime"}, 
	{355, "signalfd4"}, 
	{356, "eventfd2"}, 
	{357, "epoll_create1"}, 
	{358, "dup3"}, 
	{359, "pipe2"}, 
	{360, "inotify_init1"}, 
	{361, "preadv"}, 
	{362, "pwritev"}, 
	{363, "rt_tgsigqueueinfo"}, 
	{364, "perf_event_open"}, 
	{365, "recvmmsg"}, 
	{366, "accept4"}, 
	{367, "fanotify_init"}, 
	{368, "fanotify_mark"}, 
	{369, "prlimit64"}, 
	{370, "name_to_handle_at"}, 
	{371, "open_by_handle_at"}, 
	{372, "clock_adjtime"}, 
	{373, "syncfs"}, 
	{374, "sendmmsg"}, 
	{375, "setns"}, 
	{376, "process_vm_readv"}, 
	{377, "process_vm_writev"}, 
	{378, "kcmp"}, 
	{379, "finit_module"}, 
	{380, "sched_setattr"}, 
	{381, "sched_getattr"}, 
	{382, "renameat2"}, 
	{383, "seccomp"}, 
	{384, "getrandom"}, 
	{385, "memfd_create"}, 
	{386, "bpf"}, 
	{387, "execveat"}, 
	{388, "userfaultfd"}, 
	{389, "membarrier"}, 
	{390, "mlock2"}, 
	{391, "copy_file_range"}, 
	{392, "preadv2"}, 
	{393, "pwritev2"}, 
	{394, "pkey_mprotect"}, 
	{395, "pkey_alloc"}, 
	{396, "pkey_free"}, 
	{397, "statx"}, 
	/* architecture-specific system calls */
#define __ARM_NR_BASE 			0x0f0000
#define __ARM_NR_breakpoint		(__ARM_NR_BASE+1)
#define __ARM_NR_cacheflush		(__ARM_NR_BASE+2)
#define __ARM_NR_usr26			(__ARM_NR_BASE+3)
#define __ARM_NR_usr32			(__ARM_NR_BASE+4)
#define __ARM_NR_set_tls		(__ARM_NR_BASE+5)
	{__ARM_NR_breakpoint, "breakpoint"},
	{__ARM_NR_cacheflush, "cacheflush"},
	{__ARM_NR_usr26, "usr26"},
	{__ARM_NR_usr32, "usr32"},
	{__ARM_NR_set_tls, "set_tls"},

	/* end-of-table marker: lookups stop at the first negative number */
	{-1, NULL}
};

/*
 * Translate a system call number into its name.
 *
 * Scans syscall_table from the start until the terminating entry (the
 * first one whose number is negative) is reached.
 *
 * @scno: system call number to look up.
 *
 * Returns the name stored in the matching table entry, or NULL when no
 * entry carries that number.
 */
static char *find_syscall_name(int scno)
{
	struct syscall *entry = syscall_table;

	while (entry->no >= 0) {
		if (entry->no == scno)
			return entry->name;
		entry++;
	}

	return NULL;
}

/*
 * Set to 1 by signal_handler() to ask the trace loop to finish.
 * sig_atomic_t is the only object type the C standard guarantees can be
 * written safely from an asynchronous signal handler.
 */
static volatile sig_atomic_t stopped;

/*
 * Handler for termination requests.
 *
 * @signo: number of the delivered signal.
 *
 * Only SIGINT and SIGTERM raise the stop flag; any other signal number
 * leaves it untouched.
 */
static void signal_handler(int signo)
{
	if (signo == SIGINT || signo == SIGTERM)
		stopped = 1;
}

/*
 * Block until the tracee changes state.
 *
 * Returns 1 when the tracee is stopped and may be inspected or resumed
 * with ptrace(), 0 when it has exited, was killed by a signal, or wait()
 * itself failed. A wait() failure is reported with perror() unless a stop
 * was already requested (in which case an interrupted wait is expected).
 */
static int wait_for_tracee_stop(void)
{
	int status;

	if (wait(&status) < 0) {
		if (!stopped)
			perror("wait");
		return 0;
	}

	return !(WIFEXITED(status) || WIFSIGNALED(status));
}

/*
 * Attach to the process whose PID is given in argv[1] and print each
 * system call it completes as "name() = return-value", until SIGINT or
 * SIGTERM is received or the tracee terminates.
 *
 * Returns 0 after tracing ends; exits with EXIT_FAILURE on bad usage, an
 * invalid PID, a failed signal()/ptrace(PTRACE_ATTACH), or when the tracee
 * disappears before tracing starts.
 *
 * NOTE(review): every stop is assumed to be a syscall-entry or
 * syscall-exit stop; a signal-delivery stop in the tracee would shift the
 * entry/exit pairing. Using PTRACE_O_TRACESYSGOOD would let the two be
 * told apart.
 */
int main(int argc, char *argv[])
{
	pid_t child;
	long pid;
	char *end;
	char *name;
	int tracee_stopped = 1;	/* cleared once the tracee can no longer be detached */
	struct user_regs regs;
	unsigned long int scret, scno;

	if (argc != 2) {
		printf("Usage: %s <pid>\n", argv[0]);
		exit(EXIT_FAILURE);
	}

	if (signal(SIGINT, signal_handler) == SIG_ERR ||
		signal(SIGTERM, signal_handler) == SIG_ERR) {
		perror("signal");
		exit(EXIT_FAILURE);
	}

	/* strtol() instead of atoi() so that non-numeric input is rejected. */
	pid = strtol(argv[1], &end, 10);
	if (end == argv[1] || *end != '\0' || pid <= 0) {
		fprintf(stderr, "invalid pid: %s\n", argv[1]);
		exit(EXIT_FAILURE);
	}
	child = (pid_t)pid;

	if (ptrace(PTRACE_ATTACH, child, NULL, NULL) < 0) {
		perror("ptrace");
		exit(EXIT_FAILURE);
	}

	/* Wait for the stop that results from attaching. */
	if (!wait_for_tracee_stop()) {
		fprintf(stderr, "tracee %d is gone\n", (int)child);
		exit(EXIT_FAILURE);
	}

	if (ptrace(PTRACE_SYSCALL, child, NULL, NULL) < 0) {
		perror("ptrace");
		ptrace(PTRACE_DETACH, child, NULL, NULL);
		exit(EXIT_FAILURE);
	}

	for (;;) {
		/* Syscall-entry stop. */
		if (!wait_for_tracee_stop()) {
			tracee_stopped = 0;
			break;
		}

		/*
		 * Check the stop request only while the tracee is stopped:
		 * PTRACE_DETACH requires a stopped tracee.
		 */
		if (stopped)
			break;

		/*
		 * If the register state on syscall entry is of interest,
		 * call ptrace(PTRACE_GETREGS) here to fetch it.
		 */

		if (ptrace(PTRACE_SYSCALL, child, NULL, NULL) < 0) {
			perror("ptrace");
			break;
		}

		/* Syscall-exit stop. */
		if (!wait_for_tracee_stop()) {
			tracee_stopped = 0;
			break;
		}

		if (ptrace(PTRACE_GETREGS, child, 0, &regs) < 0) {
			perror("ptrace");
			break;
		}
		scret = regs.uregs[0]; /* register r0 holds the syscall return value */
		scno = regs.uregs[7]; /* register r7 holds the syscall number */

		/* Never hand NULL to "%s": fall back to the raw number. */
		name = find_syscall_name((int)scno);
		if (name)
			printf("%s() = %d\n", name, (int)scret);
		else
			printf("syscall_%lu() = %d\n", scno, (int)scret);

		if (stopped)
			break;

		if (ptrace(PTRACE_SYSCALL, child, NULL, NULL) < 0) {
			perror("ptrace");
			break;
		}
	}

	/* Detaching only makes sense while the tracee still exists and is stopped. */
	if (tracee_stopped)
		ptrace(PTRACE_DETACH, child, NULL, NULL);

	return 0;
}

被追踪程序的代码:

/*
 * tracee2.c
 */
 
#include 
#include 

/*
 * Tracee demo program: once per second, forever, write a single '@'
 * character to the file descriptor behind stdout and then sleep.
 * The loop never terminates, so the final return is not reached.
 */
int main(void)
{
	static const char mark[] = "@";

	while (1) {
		write(fileno(stdout), mark, sizeof(mark) - 1);
		sleep(1);
	}

	return 0;
}

编译:

arm-linux-gnueabihf-gcc -o syscall_trace syscall_trace.c # 编译 tracer 程序
arm-linux-gnueabihf-gcc -o tracee2 tracee2.c # 编译 tracee 程序

将 syscall_trace 和 tracee2 程序放到根文件系统的 /bin 目录,然后用 QEMU 启动系统,登录系统后运行测试程序:

$ tracee2 &
[1] 927
$ syscall_trace 927
nanosleep() = 0
@write() = 1
nanosleep() = 0
@write() = 1
nanosleep() = 0
@write() = 1
nanosleep() = 0
@write() = 1
nanosleep() = 0
@write() = 1
nanosleep() = 0
@write() = 1
nanosleep() = 0
@write() = 1
nanosleep() = 0
@write() = 1
^Cnanosleep() = 0

其中 @ 是被追踪程序 tracee2 的输出内容。
程序 syscall_trace 中的 ptrace(PTRACE_ATTACH) 调用将向程序 tracee2 发送 SIGSTOP 信号;tracee2 处理 SIGSTOP 信号时,向 syscall_trace 发送 SIGCHLD 信号,然后暂停执行,等待 syscall_trace 将其唤醒。syscall_trace 后续通过 ptrace(PTRACE_SYSCALL) 唤醒 tracee2 ,并开始对 tracee2 程序中系统调用的追踪。
接下来,我们来看 ptrace(PTRACE_ATTACH) 和 ptrace(PTRACE_DETACH) 的细节。

sys_ptrace(PTRACE_ATTACH, child, NULL, NULL)
	...
	child = ptrace_get_task_struct(pid); /* 被追踪进程 tracee2 的 task_struct */
	if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
		ret = ptrace_attach(child, request, addr, data)
			task->ptrace = PT_PTRACED;
			ptrace_link(task, current)
				__ptrace_link(child, new_parent, __task_cred(new_parent))
					list_add(&child->ptrace_entry, &new_parent->ptraced); /* 添加到 tracer 的 tracee 列表 */
					child->parent = new_parent; /* 重置父进程为 tracer */
			/* 给 tracee 进程 @task 发送 SIGSTOP 信号,让它暂停执行 */
			if (!seize)
				send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
		if (!ret)
			arch_ptrace_attach(child);
		goto out_put_task_struct;
	}
	
	...
	
out_put_task_struct:
	put_task_struct(child);
 out:
	return ret;	
	}

被追踪程序 tracee2 处理 SIGSTOP 信号:

do_work_pending()
	if (thread_flags & _TIF_SIGPENDING) {
		do_signal(regs, syscall)
			get_signal(&ksig)
				...
				for (;;) {
					...
					signr = dequeue_synchronous_signal(&ksig->info);
					if (!signr)
						signr = dequeue_signal(current, &current->blocked, &ksig->info);
			
					if (!signr)
						break; /* will return 0 */
					
					if (unlikely(current->ptrace) && signr != SIGKILL) {
						signr = ptrace_signal(signr, &ksig->info)
							ptrace_stop(signr, CLD_TRAPPED, 0, info)
								...
								/* 设置进程状态为 TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) */
								set_special_state(TASK_TRACED);
								...
								if (may_ptrace_stop()) {
									/* 给 tracer 程序 syscall_trace 发送 SIGCHLD 信号并唤醒它:syscall_trace 在 wait() 处等待 */
									do_notify_parent_cldstop(current, true, why);
									...
									/*
									 * 将子进程停下来,直到父进程将其唤醒继续执行。
									 * 在我们的场景下,父进程通过 ptrace(PTRACE_SYSCALL) 将子进程加入调度继续执行。
									 */
									 freezable_schedule();
								}
					}
				}
	}

ptrace(PTRACE_DETACH) 停止 syscall_trace 对 tracee2 的追踪:

sys_ptrace(PTRACE_DETACH, child, NULL, NULL)
	...
	child = ptrace_get_task_struct(pid); /* 被追踪进程 tracee2 的 task_struct */
	...
	ret = arch_ptrace(child, request, addr, data);
		...
		ret = ptrace_request(child, request, addr, data);
			switch (request) {
			...
			case PTRACE_DETACH:	 /* detach a process that was attached. */
				ret = ptrace_detach(child, data);
					...
					child->exit_code = data;
					__ptrace_detach(current, child);
						__ptrace_unlink(p);
							clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); /* 清除系统调用追踪状态 */
							child->parent = child->real_parent; /* 恢复为真正的父进程(通常是 bash),而不是 tracer 进程 */
							list_del_init(&child->ptrace_entry);
							...
							child->ptrace = 0; /* 移除被追踪状态 */
							...
						...
						do_notify_parent(p, p->exit_signal); /* 给真正的父进程发送信号 ,告知自己的状态变化:不再被追踪了 */
				break;
			...
			}
	...
out_put_task_struct:
	put_task_struct(child);
 out:
	return ret;

我们用下图来看 ptrace(PTRACE_ATTACH) 和 ptrace(PTRACE_DETACH) 对追踪程序 syscall_trace 和 被追踪进程 tracee2 进程关系的影响:
Linux:系统调用追踪原理简析_第3张图片

6. 参考资料

man ptrace()
man wait()

你可能感兴趣的:(#,追踪,&,调试,&,性能,linux)