注:本文分析基于3.10.0-693.el7内核版本,即CentOS 7.4,glibc版本为glibc-2.17-196.el7_4.2
linux内核其实可以看做是一个大型应用,其提供的接口就称之为系统调用。而我们平时经常用的open、close、read、write等函数,其实并不是linux提供的系统调用,而是glibc提供的接口,它封装了linux的相关系统调用,以一个更为实用的方式呈现给用户。当然,我们可以不使用glibc封装的接口直接调用原生系统调用。
以64位系统为例,我们可以通过系统上的/usr/include/asm/unistd_64.h文件查看当前版本内核提供的系统调用信息,
#define __NR_read 0
#define __NR_write 1
#define __NR_open 2
#define __NR_close 3
...
#define __NR_copy_file_range 326
可见,CentOS 7.4系统目前提供了327个系统调用。同时这里也定义了每个系统调用的编号。
接下来我们以2号系统调用open为例,看看它是如何从用户态到glibc,再到内核的。
使用sourceinsight,在glibc源码中有以下定义,
# define open(name, flags) open_not_cancel_2 (name, flags)
#define open_not_cancel_2(name, flags) \
INLINE_SYSCALL (open, 2, (const char *) (name), (flags))
我们以x86_64架构为例,可以查到如下宏定义,
# define INLINE_SYSCALL(name, nr, args...) \
({ \
unsigned long int resultvar = INTERNAL_SYSCALL (name, , nr, args); \
if (__builtin_expect (INTERNAL_SYSCALL_ERROR_P (resultvar, ), 0)) \
{ \
__set_errno (INTERNAL_SYSCALL_ERRNO (resultvar, )); \
resultvar = (unsigned long int) -1; \
} \
(long int) resultvar; })
# define INTERNAL_SYSCALL(name, err, nr, args...) \
INTERNAL_SYSCALL_NCS (__NR_##name, err, nr, ##args)
所以,到达这里后,open就被展开为以下形式,
INTERNAL_SYSCALL_NCS (__NR_##open, err, 2, ##args)
其中,__NR_##name中的##是预处理器的记号粘贴运算符,表示将##左右两边的记号拼接在一起,即变为__NR_open。然后接着往下找定义,
# define INTERNAL_SYSCALL_NCS(name, err, nr, args...) \
({ \
unsigned long int resultvar; \
LOAD_ARGS_##nr (args) \
LOAD_REGS_##nr \
asm volatile ( \
"syscall\n\t" \
: "=a" (resultvar) \
: "0" (name) ASM_ARGS_##nr : "memory", "cc", "r11", "cx"); \
(long int) resultvar; })
由上可知,对于open而言nr为2,对应的是LOAD_ARGS_2和LOAD_REGS_2;为了完整展示参数的传递过程(包括第四个参数),下面以nr为4的情况为例,
LOAD_ARGS_##nr (args)即为LOAD_ARGS_4 (args),
LOAD_REGS_##nr即为LOAD_REGS_4。
其实这两个也都是宏定义,先看LOAD_ARGS_4,
# define LOAD_ARGS_4(a1, a2, a3, a4) \
LOAD_ARGS_TYPES_4 (long int, a1, long int, a2, long int, a3, \
long int, a4)
# define LOAD_ARGS_TYPES_4(t1, a1, t2, a2, t3, a3, t4, a4) \
t4 __arg4 = (t4) (a4); \
LOAD_ARGS_TYPES_3 (t1, a1, t2, a2, t3, a3)
# define LOAD_ARGS_TYPES_3(t1, a1, t2, a2, t3, a3) \
t3 __arg3 = (t3) (a3); \
LOAD_ARGS_TYPES_2 (t1, a1, t2, a2)
# define LOAD_ARGS_TYPES_2(t1, a1, t2, a2) \
t2 __arg2 = (t2) (a2); \
LOAD_ARGS_TYPES_1 (t1, a1)
# define LOAD_ARGS_TYPES_1(t1, a1) \
t1 __arg1 = (t1) (a1); \
LOAD_ARGS_0 ()
# define LOAD_ARGS_0()
经过逐层展开,我们发现,其实它就是定义了4个long int的变量而已。
回过头再看LOAD_REGS_4,
# define LOAD_REGS_4 \
LOAD_REGS_TYPES_4 (long int, a1, long int, a2, long int, a3, \
long int, a4)
# define LOAD_REGS_TYPES_4(t1, a1, t2, a2, t3, a3, t4, a4) \
register t4 _a4 asm ("r10") = __arg4; \
LOAD_REGS_TYPES_3(t1, a2, t2, a2, t3, a3)
# define LOAD_REGS_TYPES_3(t1, a1, t2, a2, t3, a3) \
register t3 _a3 asm ("rdx") = __arg3; \
LOAD_REGS_TYPES_2(t1, a1, t2, a2)
# define LOAD_REGS_TYPES_2(t1, a1, t2, a2) \
register t2 _a2 asm ("rsi") = __arg2; \
LOAD_REGS_TYPES_1(t1, a1)
# define LOAD_REGS_TYPES_1(t1, a1) \
register t1 _a1 asm ("rdi") = __arg1; \
LOAD_REGS_0
# define LOAD_REGS_0
还是一样的套路,定义了4个long int的变量,不同的是出现了几个寄存器,并且将变量赋值给对应寄存器,
__arg1 -> rdi
__arg2 -> rsi
__arg3 -> rdx
__arg4 -> r10
到这里我就不太明白了,为什么第四个参数是传给r10寄存器,说好的rcx呢?原因是syscall指令本身会把返回地址保存到rcx中(见下文Intel手册的描述),rcx原有的值会被覆盖,所以第四个参数只能先暂存在r10中,进入内核后再拷贝至rcx,我们后续还会再说。
我们再回到INTERNAL_SYSCALL_NCS宏,继续往下看,
asm volatile ( \
"syscall\n\t" \
: "=a" (resultvar) \
: "0" (name) ASM_ARGS_##nr : "memory", "cc", "r11", "cx");
这里出现了一条重要核心指令,syscall,此时进程将进入内核态。
这时候就该转入内核源码了。我们来看看系统调用的初始化代码,在linux源码树arch/x86/kernel/cpu/common.c文件中,
void syscall_init(void)
{
/*
* LSTAR and STAR live in a bit strange symbiosis.
* They both write to the same internal register. STAR allows to
* set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
*/
wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
wrmsrl(MSR_LSTAR, system_call);
wrmsrl(MSR_CSTAR, ignore_sysret);
#ifdef CONFIG_IA32_EMULATION
syscall32_cpu_init();
#endif
/* Flags to clear on syscall */
wrmsrl(MSR_SYSCALL_MASK,
X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
X86_EFLAGS_IOPL|X86_EFLAGS_AC);
}
由上可知,MSR_LSTAR的值被设置为system_call,根据Intel的开发手册,
SYSCALL invokes an OS system-call handler at privilege level 0. It does so by loading RIP from the IA32_LSTAR
MSR (after saving the address of the instruction following SYSCALL into RCX). (The WRMSR instruction ensures
that the IA32_LSTAR MSR always contain a canonical address.)
也就是说,执行syscall指令时,会跳转到system_call处执行。该定义在内核arch/x86/kernel/entry_64.S文件中(3.10内核中该文件位于arch/x86/kernel/目录,arch/x86/entry/目录是4.x内核才引入的),
/*
* System call entry. Up to 6 arguments in registers are supported.
* Register setup:
* rax system call number
* rdi arg0
* rcx return address for syscall/sysret, C arg3
* rsi arg1
* rdx arg2
* r10 arg3 (--> moved to rcx for C)
* r8 arg4
* r9 arg5
* r11 eflags for syscall/sysret, temporary for C
* r12-r15,rbp,rbx saved by C code, not touched.
*/
ENTRY(system_call)
CFI_STARTPROC simple
CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
SWAPGS_UNSAFE_STACK
/*
* A hypervisor implementation might want to use a label
* after the swapgs, so that it can do the swapgs
* for the guest and jump here on syscall.
*/
GLOBAL(system_call_after_swapgs)
//保存用户栈
movq %rsp,PER_CPU_VAR(old_rsp)
//加载内核栈
movq PER_CPU_VAR(kernel_stack),%rsp
/*
* No need to follow this irqs off/on section - it's straight
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_ARGS 8,0
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
CFI_REL_OFFSET rip,RIP-ARGOFFSET
testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
jnz tracesys
system_call_fastpath:
//64位系统__SYSCALL_MASK值为~0
#if __SYSCALL_MASK == ~0
//检查系统调用号是否超出最大值
cmpq $__NR_syscall_max,%rax
#else
andl $__SYSCALL_MASK,%eax
cmpl $__NR_syscall_max,%eax
#endif
ja badsys
//将r10寄存器的值传给rcx,也就是系统调用的第四个参数
movq %r10,%rcx
//根据rax中的系统调用号,调用系统调用表中对应的内核函数
call *sys_call_table(,%rax,8) # XXX: rip relative
movq %rax,RAX-ARGOFFSET(%rsp)
这里我们就看到r10寄存器的值会拷贝到RCX寄存器,因此我们通过gdb查看汇编代码时看第四个参数也是rcx。
这里的重点是调用sys_call_table,也就是根据系统调用号进入对应函数。而sys_call_table是在内核编译过程中生成的,我们先看下它的定义(arch/x86/kernel/syscall_64.c),
const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
/*
* Smells like a compiler bug -- it doesn't work
* when the & below is removed.
*/
[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};
可见,它由syscalls_64.h头文件定义,这个头文件是由arch/x86/syscalls/syscall_64.tbl通过arch/x86/syscalls/syscalltbl.sh脚本生成。我们先看下这个脚本内容,
#!/bin/sh
in="$1"
out="$2"
grep '^[0-9]' "$in" | sort -n | (
while read nr abi name entry compat; do
abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
if [ -n "$compat" ]; then
echo "__SYSCALL_${abi}($nr, $entry, $compat)"
elif [ -n "$entry" ]; then
echo "__SYSCALL_${abi}($nr, $entry, $entry)"
fi
done
) > "$out"
脚本内容很简单,即将输入文件按照固定格式输出。具体由arch/x86/syscalls/Makefile执行,
out := $(obj)/../include/generated/asm
...
syscall64 := $(srctree)/$(src)/syscall_64.tbl
...
systbl := $(srctree)/$(src)/syscalltbl.sh
...
$(out)/syscalls_64.h: $(syscall64) $(systbl)
$(call if_changed,systbl)
所以最终会生成include/generated/asm/syscalls_64.h这个头文件,从而定义sys_call_table。
我们手动执行这个脚本,看下输出的情况,
[root@CentOS-7-4 /usr/src/3.10/arch/x86/syscalls]# ls
. .. Makefile syscall_32.tbl syscall_64.tbl syscallhdr.sh syscalltbl.sh
[root@CentOS-7-4 /usr/src/3.10/arch/x86/syscalls]# sh syscalltbl.sh syscall_64.tbl syscalls_64.h
[root@CentOS-7-4 /usr/src/3.10/arch/x86/syscalls]# ls
. .. Makefile syscall_32.tbl syscall_64.tbl syscallhdr.sh syscalls_64.h syscalltbl.sh
[root@CentOS-7-4 /usr/src/3.10/arch/x86/syscalls]# head -n 11 syscall_64.tbl
#
# 64-bit system call numbers and entry vectors
#
# The format is:
# <number> <abi> <name> <entry point>
#
# The abi is "common", "64" or "x32" for this file.
#
0 common read sys_read
1 common write sys_write
2 common open sys_open
[root@CentOS-7-4 /usr/src/3.10/arch/x86/syscalls]# head syscalls_64.h
__SYSCALL_COMMON(0, sys_read, sys_read)
__SYSCALL_COMMON(1, sys_write, sys_write)
__SYSCALL_COMMON(2, sys_open, sys_open)
__SYSCALL_COMMON(3, sys_close, sys_close)
__SYSCALL_COMMON(4, sys_newstat, sys_newstat)
__SYSCALL_COMMON(5, sys_newfstat, sys_newfstat)
__SYSCALL_COMMON(6, sys_newlstat, sys_newlstat)
__SYSCALL_COMMON(7, sys_poll, sys_poll)
__SYSCALL_COMMON(8, sys_lseek, sys_lseek)
__SYSCALL_COMMON(9, sys_mmap, sys_mmap)
因此,对于open函数而言,执行以下指令后,
call *sys_call_table(,%rax,8)
将调用sys_call_table中下标为2的表项所指向的函数,而该表项正是由__SYSCALL_COMMON(2, sys_open, sys_open)定义的。再看下这个宏定义展开是什么。
#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
因此,__SYSCALL_COMMON(2, sys_open, sys_open)最终展开后就成了“[2] = sys_open,”,即sys_call_table[2]被初始化为sys_open。
但是我们还是无法在源码中直接搜索到sys_open的定义,因为这个函数本身也是通过宏定义生成的,而这个宏定义就是SYSCALL_DEFINEx。
内核中open实际的实现如下,
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
if (force_o_largefile())
flags |= O_LARGEFILE;
return do_sys_open(AT_FDCWD, filename, flags, mode);
}
因此,我们看看这个SYSCALL_DEFINE3是怎么一步一步到sys_open的。先看下宏定义的展开,
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINEx(x, sname, ...) \
SYSCALL_METADATA(sname, x, __VA_ARGS__) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
} \
SYSCALL_ALIAS(sys##name, SyS##name); \
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
我们用open代入,看下最后会出现啥,
SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
==> SYSCALL_DEFINEx(3, _open, const char __user *, filename, int, flags, umode_t, mode)
==> SYSCALL_METADATA(_open, 3, const char __user *, filename, int, flags, umode_t, mode)//和ftrace相关,暂不关注
__SYSCALL_DEFINEx(3, _open, const char __user *, filename, int, flags, umode_t, mode)
==> asmlinkage long sys_open(__MAP(3,__SC_DECL, const char __user *, filename, int, flags, umode_t, mode)); \
static inline long SYSC_open(__MAP(3,__SC_DECL, const char __user *, filename, int, flags, umode_t, mode)); \
asmlinkage long SyS_open(__MAP(3,__SC_LONG, const char __user *, filename, int, flags, umode_t, mode)) \
{ \
long ret = SYSC_open(__MAP(3,__SC_CAST, const char __user *, filename, int, flags, umode_t, mode)); \
__MAP(3,__SC_TEST, const char __user *, filename, int, flags, umode_t, mode); \
__PROTECT(3, ret,__MAP(3,__SC_ARGS, const char __user *, filename, int, flags, umode_t, mode)); \
return ret; \
} \
SYSCALL_ALIAS(sys_open, SyS_open); \
static inline long SYSC_open(__MAP(3,__SC_DECL, const char __user *, filename, int, flags, umode_t, mode))
该宏定义将sys_open与SyS_open等效关联,而SyS_open经过参数校验后调用SYSC_open,SYSC_open才是最后open系统调用的内核具体实现。
到这里open就从用户态完成了到内核态的调用,调用完毕后返回值保存到rax寄存器中返回。
最后,我们概要的梳理一下整个流程,
用户调用glibc接口open
==> 系统调用号存入rax寄存器,调用syscall指令进入内核态
==> 根据系统调用号在sys_call_table表中查找open对应的内核函数sys_open
==> sys_open调用SyS_open(其实只是alias而已)
==> SyS_open调用SYSC_open
==> SYSC_open为实际函数实现,执行实际操作
==> 返回值存放到rax寄存器,执行sysretq指令返回用户态