实验内容
- 找一个系统调用,系统调用号为学号最后2位相同的系统调用
- 通过汇编指令触发该系统调用
- 通过gdb跟踪该系统调用的内核处理过程
- 重点阅读分析系统调用入口的保存现场、恢复现场和系统调用返回,以及重点关注系统调用过程中内核堆栈状态的变化
准备环境
利用上次实验下载的linux内核,重新编译,再制作根文件系统,并准备好触发系统调用的程序。
首先配置内核编译选项
make defconfig # Default configuration is based on 'x86_64_defconfig' make menuconfig # 打开debug相关选项 Kernel hacking ---> Compile-time checks and compiler options ---> [*] Compile the kernel with debug info [*] Provide GDB scripts for kernel debugging [*] Kernel debugging # 关闭KASLR,否则会导致打断点失败 Processor type and features ----> [] Randomize the address of the kernel image (KASLR)
编译内核
make -j$(nproc) # nproc gives the number of CPU cores/threads available # 测试⼀下内核能不能正常加载运⾏,因为没有⽂件系统终会kernel panic qemu-system-x86_64 -kernel arch/x86/boot/bzImage # 此时应该不能正常运行
制作根文件系统
#下载 axel -n 20 https://busybox.net/downloads/busybox-1.31.1.tar.bz2 tar -jxvf busybox-1.31.1.tar.bz2 cd busybox-1.31.1 #制作根文件系统 make menuconfig #记得要编译成静态链接,不⽤动态链接库。 Settings ---> [*] Build static binary (no shared libs) #然后编译安装,默认会安装到源码⽬录下的 _install ⽬录中。 make -j$(nproc) && make install # 制作内存根文件系统镜像 mkdir rootfs cd rootfs cp ../busybox-1.31.1/_install/* ./ -rf mkdir dev proc sys home sudo cp -a /dev/{null,console,tty,tty1,tty2,tty3,tty4} dev/
准备init脚本文件放在根文件系统跟目录下(rootfs/init),添加如下内容到init文件。
复制代码 #!/bin/sh mount -t proc none /proc mount -t sysfs none /sys echo "Wellcome TestOS!" echo "--------------------" cd home /bin/sh #给init脚本添加可执行权限 chmod +x init
我的学号后两位是17,对应的系统调用是pread函数,pread函数用于从打开文件的指定位置开始读取数据,函数原型如下:
#includessize_t pread(int filedes, void *buf, size_t nbytes, off_t offset);
返回值:若读取成功则返回实际读到的字节数,若已到文件结尾则返回0,若出错则返回-1。
参数:
1、filedes文件标识符;
2、*buf存放读出数据的缓冲区;
3、nbytes要读取的字节数;
4、offset文件指针。
#include#include #include int main() { int fd,i,count; char buf[20]; fd=open("/home/world",O_RDWR); printf("use pread\n"); // pread(fd,buf,10,0); asm volatile( "mov $0x0, %%rcx\n\t" //参数4 "mov $0x0a, %%rdx\n\t" //参数3 "mov %2, %%rsi\n\t" //参数2 "mov %1, %%rdi\n\t" //参数1 "mov $0x11, %%eax\n\t" //传递系统调用号 "syscall\n\t" //系统调用 "mov %%rax, %0\n\t" //结果存到count中 :"=m"(count) //输出 :"m"(fd),"p"(&buf)//输入 ); buf[20]='\0'; printf("%d",count); close(fd); return 0; }
打包根文件系统
#打包成内存根文件系统镜像 find . -print0 | cpio --null -ov --format=newc | gzip -9 > ../rootfs.cpio.gz #测试挂载根文件系统,看内核启动完成后是否执行init脚本 qemu-system-x86_64 -kernel linux-5.4.34/arch/x86/boot/bzImage -initrd rootfs.cpio.gz
到此,环境准备完毕。
GDB跟踪系统调用的内核处理过程
可以看到现在停在了fs/read_write.c:646处,具体代码如下
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, size_t, count, loff_t, pos) { return ksys_pread64(fd, buf, count, pos); }
调用了ksys_pread64,继续向下看
ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, loff_t pos) { struct fd f; ssize_t ret = -EBADF; if (pos < 0) return -EINVAL; f = fdget(fd); if (f.file) { ret = -ESPIPE; if (f.file->f_mode & FMODE_PREAD) ret = vfs_read(f.file, buf, count, &pos); fdput(f); } return ret; }
内核执行过程
执行完ksys_pread64()函数后,意味着内核调用基本结束,接下来就是要转到用户态了,返回到之前系统调用的do_syscall_64()中,执行syscall_return_slowpath。
do_syscall_64(),位于arch/x86/entry/common,部分代码如下
__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs) { struct thread_info *ti; enter_from_user_mode(); local_irq_enable(); ti = current_thread_info(); if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) nr = syscall_trace_enter(regs); if (likely(nr < NR_syscalls)) { nr = array_index_nospec(nr, NR_syscalls); regs->ax = sys_call_table[nr](regs);
...
syscall_return_slowpath(regs);
}
可以看到regs->ax利用系统调用表拿到系统调用号,最后调用了syscall_return_slowpath(), 这个函数是为切换到用户态做准备。接着返回到entry_SYSCALL_64(), entry_SYSCALL_64()是进入64位系统调用的唯一入口点,它完成了保存现场,调用对应的内核处理函数、恢复现场、系统调用返回等工作。如下是代码,
ENTRY(entry_SYSCALL_64) UNWIND_HINT_EMPTY /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, * it is too small to ever cause noticeable irq latency. */ swapgs /* tss.sp2 is scratch space. */ movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */ pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ GLOBAL(entry_SYSCALL_64_after_hwframe) pushq %rax /* pt_regs->orig_ax */ PUSH_AND_CLEAR_REGS rax=$-ENOSYS TRACE_IRQS_OFF /* IRQs are off. */ movq %rax, %rdi movq %rsp, %rsi call do_syscall_64 /* returns with IRQs disabled */ TRACE_IRQS_IRETQ /* we're about to change IF */ /* * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context. If we're not, * go to the slow exit path. */ movq RCX(%rsp), %rcx movq RIP(%rsp), %r11 cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ jne swapgs_restore_regs_and_return_to_usermode /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP * in kernel space. This essentially lets the user take over * the kernel, since userspace controls RSP. * * If width of "canonical tail" ever becomes variable, this will need * to be updated to remain correct on both old and new CPUs. * * Change top bits to match most significant bit (47th or 56th bit * depending on paging mode) in the address. */ #ifdef CONFIG_X86_5LEVEL ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \ "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57 #else shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx #endif /* If this changed %rcx, it was not canonical */ cmpq %rcx, %r11 jne swapgs_restore_regs_and_return_to_usermode cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ jne swapgs_restore_regs_and_return_to_usermode movq R11(%rsp), %r11 cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ jne swapgs_restore_regs_and_return_to_usermode /* * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot * restore RF properly. If the slowpath sets it for whatever reason, we * need to restore it correctly. * * SYSRET can restore TF, but unlike IRET, restoring TF results in a * trap from userspace immediately after SYSRET. This would cause an * infinite loop whenever #DB happens with register state that satisfies * the opportunistic SYSRET conditions. For example, single-stepping * this user code: * * movq $stuck_here, %rcx * pushfq * popq %r11 * stuck_here: * * would never get past 'stuck_here'. */ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 jnz swapgs_restore_regs_and_return_to_usermode /* nothing to check for RSP */ cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ jne swapgs_restore_regs_and_return_to_usermode /* * We win! This label is here just for ease of understanding * perf profiles. Nothing jumps here. */ syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ UNWIND_HINT_EMPTY POP_REGS pop_rdi=0 skip_r11rcx=1 /* * Now all regs are restored except RSP and RDI. * Save old stack pointer and switch to trampoline stack. */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp pushq RSP-RDI(%rdi) /* RSP */ pushq (%rdi) /* RDI */ /* * We are on the trampoline stack. All regs except RDI are live. * We can do future final exit work right here. */ STACKLEAK_ERASE_NOCLOBBER SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi popq %rdi popq %rsp USERGS_SYSRET64 END(entry_SYSCALL_64)
流程大概就是在 entry_SYSCALL_64()中,先用swapgs保存现场,再调用了call do_syscall_64,而我们打的断点就在这个do_syscall_64中,找到read_write.c文件,执行ksys_pread64(),执行完后返回到do_syscall_64中,接着执行syscall_return_slowpath函数,为返回用户态做准备,执行完后再返回到enter_SYSCALL_64中,接着利用swapgs恢复现场,最后进入用户态。