Linux系统调用流程——open函数从用户态接口到内核

注:本文分析基于3.10.0-693.el7内核版本,即CentOS 7.4,glibc版本为glibc-2.17-196.el7_4.2

1、关于系统调用

linux内核其实可以看做是一个大型应用,其提供的接口就称之为系统调用。而我们平时经常用的open、close、read、write等函数,其实并不是linux提供的系统调用,而是glibc提供的接口,它封装了linux的相关系统调用,以一个更为实用的方式呈现给用户。当然,我们可以不使用glibc封装的接口直接调用原生系统调用。

2、系统调用数量

以64位系统为例,我们可以通过系统上的/usr/include/asm/unistd_64.h文件查看当前版本内核提供的系统调用信息,

#define __NR_read 0
#define __NR_write 1
#define __NR_open 2
#define __NR_close 3
...
#define __NR_copy_file_range 326

可见,CentOS 7.4系统目前提供了327个系统调用。同时这里也定义了每个系统调用的编号。

3、open函数调用流程

接下来我们以2号系统调用open为例,看看它是如何从用户态到glibc,再到内核的。

使用sourceinsight,在glibc源码中有以下定义,

# define open(name, flags)	open_not_cancel_2 (name, flags)

#define open_not_cancel_2(name, flags) \
   INLINE_SYSCALL (open, 2, (const char *) (name), (flags))

我们以x86_64架构为例,可以查到如下宏定义,

# define INLINE_SYSCALL(name, nr, args...) \
  ({									      \
    unsigned long int resultvar = INTERNAL_SYSCALL (name, , nr, args);	      \
    if (__builtin_expect (INTERNAL_SYSCALL_ERROR_P (resultvar, ), 0))	      \
      {									      \
	__set_errno (INTERNAL_SYSCALL_ERRNO (resultvar, ));		      \
	resultvar = (unsigned long int) -1;				      \
      }									      \
    (long int) resultvar; })
    
# define INTERNAL_SYSCALL(name, err, nr, args...) \
  INTERNAL_SYSCALL_NCS (__NR_##name, err, nr, ##args)

所以,到达这里后,open就被展开为以下形式,

INTERNAL_SYSCALL_NCS (__NR_##open, err, 2, ##args)

其中,__NR_##name里的##是预处理器的记号拼接运算符,表示将##左右两边的记号组合在一起,即变为__NR_open。然后接着往下找定义,

# define INTERNAL_SYSCALL_NCS(name, err, nr, args...) \
  ({									      \
    unsigned long int resultvar;					      \
    LOAD_ARGS_##nr (args)						      \
    LOAD_REGS_##nr							      \
    asm volatile (							      \
    "syscall\n\t"							      \
    : "=a" (resultvar)							      \
    : "0" (name) ASM_ARGS_##nr : "memory", "cc", "r11", "cx");		      \
    (long int) resultvar; })

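由上可知,这两个宏会按参数个数nr展开,
LOAD_ARGS_##nr (args)即为LOAD_ARGS_nr (args),
LOAD_REGS_##nr即为LOAD_REGS_nr。

对于本例中的open,nr为2,实际展开的是LOAD_ARGS_2和LOAD_REGS_2。不过各个参数个数的版本结构完全一样,为了同时说明第三、第四个参数的传递方式,下面以4个参数的版本LOAD_ARGS_4和LOAD_REGS_4为例。其实这两个也都是宏定义,先看LOAD_ARGS_4,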

# define LOAD_ARGS_4(a1, a2, a3, a4)					   \
  LOAD_ARGS_TYPES_4 (long int, a1, long int, a2, long int, a3,		   \
		     long int, a4)

# define LOAD_ARGS_TYPES_4(t1, a1, t2, a2, t3, a3, t4, a4)		   \
  t4 __arg4 = (t4) (a4);						   \
  LOAD_ARGS_TYPES_3 (t1, a1, t2, a2, t3, a3)

# define LOAD_ARGS_TYPES_3(t1, a1, t2, a2, t3, a3)			   \
  t3 __arg3 = (t3) (a3);						   \
  LOAD_ARGS_TYPES_2 (t1, a1, t2, a2)

# define LOAD_ARGS_TYPES_2(t1, a1, t2, a2)				   \
  t2 __arg2 = (t2) (a2);						   \
  LOAD_ARGS_TYPES_1 (t1, a1)

# define LOAD_ARGS_TYPES_1(t1, a1)					   \
  t1 __arg1 = (t1) (a1);						   \
  LOAD_ARGS_0 ()

# define LOAD_ARGS_0()

经过逐层调用,我们发现,其实它就是定义了4个long int的变量而已。

回过头再看LOAD_REGS_4,

# define LOAD_REGS_4							   \
  LOAD_REGS_TYPES_4 (long int, a1, long int, a2, long int, a3,		   \
		     long int, a4)

# define LOAD_REGS_TYPES_4(t1, a1, t2, a2, t3, a3, t4, a4)		   \
  register t4 _a4 asm ("r10") = __arg4;					   \
  LOAD_REGS_TYPES_3(t1, a2, t2, a2, t3, a3)

# define LOAD_REGS_TYPES_3(t1, a1, t2, a2, t3, a3)			   \
  register t3 _a3 asm ("rdx") = __arg3;					   \
  LOAD_REGS_TYPES_2(t1, a1, t2, a2)

# define LOAD_REGS_TYPES_2(t1, a1, t2, a2)				   \
  register t2 _a2 asm ("rsi") = __arg2;					   \
  LOAD_REGS_TYPES_1(t1, a1)

# define LOAD_REGS_TYPES_1(t1, a1)					   \
  register t1 _a1 asm ("rdi") = __arg1;					   \
  LOAD_REGS_0

# define LOAD_REGS_0

还是一样的套路,定义了4个long int的变量,不同的是出现了几个寄存器,并且将变量赋值给对应寄存器,

__arg1 -> rdi
__arg2 -> rsi
__arg3 -> rdx
__arg4 -> r10

到这里我就不太明白了,为什么第四个参数是传给r10寄存器,说好的rcx呢?原因在于syscall指令本身会把返回地址保存到rcx中(见后文引用的Intel手册),rcx的值会被覆盖,无法用来传参,所以用户态先用r10暂存第四个参数,进入内核后再拷贝至rcx,我们后续还会再说。

我们再回到INTERNAL_SYSCALL_NCS宏,继续往下看,

asm volatile (							      \
    "syscall\n\t"							      \
    : "=a" (resultvar)							      \
    : "0" (name) ASM_ARGS_##nr : "memory", "cc", "r11", "cx");

这里出现了一条重要核心指令,syscall,此时进程将进入内核态。

这时候就该转入内核源码了。我们来看看系统调用的初始化代码,在linux源码树arch/x86/kernel/cpu/common.c文件中,

void syscall_init(void)
{
	/*
	 * LSTAR and STAR live in a bit strange symbiosis.
	 * They both write to the same internal register. STAR allows to
	 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
	 */
	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
	wrmsrl(MSR_LSTAR, system_call);
	wrmsrl(MSR_CSTAR, ignore_sysret);

#ifdef CONFIG_IA32_EMULATION
	syscall32_cpu_init();
#endif

	/* Flags to clear on syscall */
	wrmsrl(MSR_SYSCALL_MASK,
	       X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
	       X86_EFLAGS_IOPL|X86_EFLAGS_AC);
}

由上可知,MSR_LSTAR的值被设置为system_call,根据Intel的开发手册,

SYSCALL invokes an OS system-call handler at privilege level 0. It does so by loading RIP from the IA32_LSTAR
MSR (after saving the address of the instruction following SYSCALL into RCX). (The WRMSR instruction ensures
that the IA32_LSTAR MSR always contain a canonical address.)

也就是说,执行syscall指令时,会跳转到system_call处执行。该定义在内核arch/x86/kernel/entry_64.S文件中(3.10内核中尚无arch/x86/entry目录),

/*
 * System call entry. Up to 6 arguments in registers are supported.
 * Register setup:
 * rax  system call number
 * rdi  arg0
 * rcx  return address for syscall/sysret, C arg3
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 	(--> moved to rcx for C)
 * r8   arg4
 * r9   arg5
 * r11  eflags for syscall/sysret, temporary for C
 * r12-r15,rbp,rbx saved by C code, not touched.
 */
ENTRY(system_call)
	CFI_STARTPROC	simple
	CFI_SIGNAL_FRAME
	CFI_DEF_CFA	rsp,KERNEL_STACK_OFFSET
	CFI_REGISTER	rip,rcx
	/*CFI_REGISTER	rflags,r11*/
	SWAPGS_UNSAFE_STACK
	/*
	 * A hypervisor implementation might want to use a label
	 * after the swapgs, so that it can do the swapgs
	 * for the guest and jump here on syscall.
	 */
GLOBAL(system_call_after_swapgs)
	//保存用户栈
	movq	%rsp,PER_CPU_VAR(old_rsp)
	//加载内核栈
	movq	PER_CPU_VAR(kernel_stack),%rsp
	/*
	 * No need to follow this irqs off/on section - it's straight
	 * and short:
	 */
	ENABLE_INTERRUPTS(CLBR_NONE)
	SAVE_ARGS 8,0
	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
	movq  %rcx,RIP-ARGOFFSET(%rsp)
	CFI_REL_OFFSET rip,RIP-ARGOFFSET
	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
	jnz tracesys
system_call_fastpath:
//64位系统__SYSCALL_MASK值为~0
#if __SYSCALL_MASK == ~0
	//检查系统调用号是否超出最大值
	cmpq $__NR_syscall_max,%rax
#else
	andl $__SYSCALL_MASK,%eax
	cmpl $__NR_syscall_max,%eax
#endif
	ja badsys
	//将r10寄存器的值传给rcx,也就是系统调用的第四个参数
	movq %r10,%rcx
	//根据rax中的系统调用号,调用系统调用表中对应的内核函数
	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
	movq %rax,RAX-ARGOFFSET(%rsp)

这里我们就看到r10寄存器的值会拷贝到RCX寄存器,因此我们通过gdb查看汇编代码时看第四个参数也是rcx。

这里的重点是调用sys_call_table,也就是根据系统调用号进入对应函数。而sys_call_table是在内核编译过程中生成的,我们先看下它的定义(arch/x86/kernel/syscall_64.c),

const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
	/*
	 * Smells like a compiler bug -- it doesn't work
	 * when the & below is removed.
	 */
	[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};

可见,它由syscalls_64.h头文件定义,这个头文件是由arch/x86/syscalls/syscall_64.tbl通过arch/x86/syscalls/syscalltbl.sh脚本生成。我们先看下这个脚本内容,

#!/bin/sh

in="$1"
out="$2"

grep '^[0-9]' "$in" | sort -n | (
    while read nr abi name entry compat; do
	abi=`echo "$abi" | tr '[a-z]' '[A-Z]'`
	if [ -n "$compat" ]; then
	    echo "__SYSCALL_${abi}($nr, $entry, $compat)"
	elif [ -n "$entry" ]; then
	    echo "__SYSCALL_${abi}($nr, $entry, $entry)"
	fi
    done
) > "$out"

脚本内容很简单,即将输入文件按照固定格式输出。具体由arch/x86/syscalls/Makefile执行,

out := $(obj)/../include/generated/asm
...
syscall64 := $(srctree)/$(src)/syscall_64.tbl
...
systbl := $(srctree)/$(src)/syscalltbl.sh
...
$(out)/syscalls_64.h: $(syscall64) $(systbl)
        $(call if_changed,systbl)

所以最终会生成include/generated/asm/syscalls_64.h这个头文件,从而定义sys_call_table。

我们手动执行这个脚本,看下输出的情况,

[root@CentOS-7-4 /usr/src/3.10/arch/x86/syscalls]# ls
.  ..  Makefile  syscall_32.tbl  syscall_64.tbl  syscallhdr.sh  syscalltbl.sh
[root@CentOS-7-4 /usr/src/3.10/arch/x86/syscalls]# sh syscalltbl.sh syscall_64.tbl syscalls_64.h
[root@CentOS-7-4 /usr/src/3.10/arch/x86/syscalls]# ls
.  ..  Makefile  syscall_32.tbl  syscall_64.tbl  syscallhdr.sh  syscalls_64.h  syscalltbl.sh
[root@CentOS-7-4 /usr/src/3.10/arch/x86/syscalls]# head -n 11 syscall_64.tbl 
#
# 64-bit system call numbers and entry vectors
#
# The format is:
# <number> <abi> <name> <entry point>
#
# The abi is "common", "64" or "x32" for this file.
#
0	common	read			sys_read
1	common	write			sys_write
2	common	open			sys_open
[root@CentOS-7-4 /usr/src/3.10/arch/x86/syscalls]# head syscalls_64.h
__SYSCALL_COMMON(0, sys_read, sys_read)
__SYSCALL_COMMON(1, sys_write, sys_write)
__SYSCALL_COMMON(2, sys_open, sys_open)
__SYSCALL_COMMON(3, sys_close, sys_close)
__SYSCALL_COMMON(4, sys_newstat, sys_newstat)
__SYSCALL_COMMON(5, sys_newfstat, sys_newfstat)
__SYSCALL_COMMON(6, sys_newlstat, sys_newlstat)
__SYSCALL_COMMON(7, sys_poll, sys_poll)
__SYSCALL_COMMON(8, sys_lseek, sys_lseek)
__SYSCALL_COMMON(9, sys_mmap, sys_mmap)

因此,对于open函数而言,执行以下指令后,

call *sys_call_table(,%rax,8) 

将调用sys_call_table[2]中保存的函数指针,而这一项正是由__SYSCALL_COMMON(2, sys_open, sys_open)生成的。再看下这个宏定义展开是什么。

#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat)
#include <asm/syscalls_64.h>
#undef __SYSCALL_64

#define __SYSCALL_64(nr, sym, compat) [nr] = sym,

因此,__SYSCALL_COMMON(2, sys_open, sys_open)最终展开后就成了“[2] = sys_open,”,也就是把sys_call_table的第2项初始化为sys_open。

但是我们还是无法在源码中直接搜索到sys_open的函数定义,因为它的定义也是通过宏生成的,而这个宏就是SYSCALL_DEFINEx。

内核中open实际的实现如下,

SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)
{
	if (force_o_largefile())
		flags |= O_LARGEFILE;

	return do_sys_open(AT_FDCWD, filename, flags, mode);
}

因此,我们看看这个SYSCALL_DEFINE3是怎么一步一步到sys_open的。先看下宏定义的展开,

#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)

#define SYSCALL_DEFINEx(x, sname, ...)				\
	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
	{								\
		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
		__MAP(x,__SC_TEST,__VA_ARGS__);				\
		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
		return ret;						\
	}								\
	SYSCALL_ALIAS(sys##name, SyS##name);				\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))

我们用open代入,看下最后会出现啥,

    SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode)

==> SYSCALL_DEFINEx(3, _open, const char __user *, filename, int, flags, umode_t, mode)

==> SYSCALL_METADATA(_open, 3, const char __user *, filename, int, flags, umode_t, mode)//和ftrace相关,暂不关注
	__SYSCALL_DEFINEx(3, _open, const char __user *, filename, int, flags, umode_t, mode)

==> asmlinkage long sys_open(__MAP(3,__SC_DECL, const char __user *, filename, int, flags, umode_t, mode));	\
	static inline long SYSC_open(__MAP(3,__SC_DECL, const char __user *, filename, int, flags, umode_t, mode));	\
	asmlinkage long SyS_open(__MAP(3,__SC_LONG, const char __user *, filename, int, flags, umode_t, mode))	\
	{								\
		long ret = SYSC_open(__MAP(3,__SC_CAST, const char __user *, filename, int, flags, umode_t, mode));	\
		__MAP(3,__SC_TEST, const char __user *, filename, int, flags, umode_t, mode);				\
		__PROTECT(3, ret,__MAP(3,__SC_ARGS, const char __user *, filename, int, flags, umode_t, mode));	\
		return ret;						\
	}								\
	SYSCALL_ALIAS(sys_open, SyS_open);				\
	static inline long SYSC_open(__MAP(3,__SC_DECL, const char __user *, filename, int, flags, umode_t, mode))

该宏定义将sys_open与SyS_open等效关联,而SyS_open经过参数校验后调用SYSC_open,SYSC_open才是最后open系统调用的内核具体实现。

到这里open就从用户态完成了到内核态的调用,调用完毕后返回值保存到rax寄存器中返回。

最后,我们概要的梳理一下整个流程,

	用户调用glibc接口open
==> 系统调用号存入rax寄存器,调用syscall指令进入内核态
==> 根据系统调用号在sys_call_table表中查找open对应的内核函数sys_open
==> sys_open调用SyS_open(其实只是alias而已)
==> SyS_open调用SYSC_open
==> SYSC_open为实际函数实现,执行实际操作
==> 返回值存放到rax寄存器,通常通过sysretq指令(快速路径)返回用户态

你可能感兴趣的:(Linux,系统调用流程,SYSCALL_DEFINE,open系统调用)