旧式x86平台上的系统调用由int 0x80中断实现,后来对于新式CPU,Linux使用了sysenter方式。
在ARM平台上,使用了swi中断来实现系统调用的跳转。
swi指令用于产生软件中断,从而实现从用户模式切换到管理模式:CPSR(Current Program Status Register,程序状态寄存器,包含了条件标志位、中断禁止位、当前处理器模式标志以及其他的一些控制和状态位)被保存到管理模式的SPSR(Saved Program Status Register,程序状态保存寄存器,用于保存CPSR的状态,以便异常返回后恢复异常发生时的工作状态)中,然后执行转移到SWI向量。在其他模式下也可使用SWI指令,处理器同样会切换到管理模式。
指令格式如下:
SWI {cond} immed_24
其中:
immed_24 24位立即数,值为0~16777215之间的整数。
使用SWI指令时,通常使用以下两种方法进行参数传递,SWI异常处理程序可以提供相关的服务,这两种方法均是用户软件协定。SWI异常中断处理程序要通过读取引起软件中断的SWI指令,以取得24位立即数。
1)指令中24位的立即数指定了用户请求的服务类型,参数通过通用寄存器传递。如:
MOV R0,#34
SWI 12
2)指令中的24位立即数被忽略,用户请求的服务类型由寄存器R0的值决定,参数通过其他的通用寄存器传递。如:
MOV R0, #12
MOV R1, #34
SWI 0
在SWI异常处理程序中,取出SWI立即数的步骤为:首先确定引起软中断的SWI指令是ARM指令还是Thumb指令,这可通过对SPSR访问得到;然后取得该SWI指令的地址,这可通过访问LR寄存器得到;接着读出指令,分解出立即数(ARM指令中为低24位)。
在Linux内核中,每个系统调用都具有唯一的一个系统调用功能号,这些功能号定义在arch/arm/include/asm目录下的unistd.h文件中,在该文件中可以看到很多类似这样的定义:
#define __NR_write (__NR_SYSCALL_BASE+ 4)
这是系统调用write的定义,功能号是__NR_SYSCALL_BASE +4,定义为符号__NR_write。
由于采用了不同的二进制接口,所以__NR_SYSCALL_BASE的定义会有所不同,在文件中可以找到定义:
#ifndef __ASM_ARM_UNISTD_H
#define __ASM_ARM_UNISTD_H

#define __NR_OABI_SYSCALL_BASE 0x900000

#if defined(__thumb__) || defined(__ARM_EABI__)
#define __NR_SYSCALL_BASE 0
#else
#define __NR_SYSCALL_BASE __NR_OABI_SYSCALL_BASE
#endif
注意那个EABI,EABI是什么东西呢?ABI即Application Binary Interface,应用二进制接口;EABI(Embedded ABI)是面向嵌入式系统的较新的ABI规范。在较新的EABI规范中,系统调用号被放入寄存器r7中,swi指令的立即数为0;而在老的OABI中则是执行swi 中断号的方式,也就是说原来的调用方式(Old ABI)是通过跟随在swi指令中的调用号来进行的。
/* 0 */ CALL(sys_restart_syscall) CALL(sys_exit) CALL(sys_fork_wrapper) CALL(sys_read) CALL(sys_write) /* 5 */ CALL(sys_open) CALL(sys_close) CALL(sys_ni_syscall) /* was sys_waitpid */ CALL(sys_creat) CALL(sys_link) /*………省略……….*/在源码中,我们可以找到诸如sys_write的函数声明:
#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__) #define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__) #define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) #define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)这里DEFINEn表示的是参数的个数,有参数的系统调用最终都是指向这么一个宏:
#define SYSCALL_DEFINEx(x, sname, ...) __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)翻看__SYSCALL_DEFINEx的定义:
#define __SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))翻查代码,可以找到:
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count)展开此宏,便可以得到“ asmlinkage long sys_write (unsigned int fd, const char __user *buf,size_t count);”的声明形式。
#define __SC_DECL1(t1, a1) t1 a1 #define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__) #define __SC_DECL3(t3, a3, ...) t3 a3, __SC_DECL2(__VA_ARGS__) #define __SC_DECL4(t4, a4, ...) t4 a4, __SC_DECL3(__VA_ARGS__) #define __SC_DECL5(t5, a5, ...) t5 a5, __SC_DECL4(__VA_ARGS__) #define __SC_DECL6(t6, a6, ...) t6 a6, __SC_DECL5(__VA_ARGS__)那么系统是如何找到该函数的。
/* * This is the syscall table declaration for native ABI syscalls. * With EABI a couple syscalls are obsolete and defined as sys_ni_syscall. */ #define ABI(native, compat) native #ifdef CONFIG_AEABI #define OBSOLETE(syscall) sys_ni_syscall #else #define OBSOLETE(syscall) syscall #endif .type sys_call_table, #object ENTRY(sys_call_table) #include "calls.S" #undef ABI #undef OBSOLETE还有
/* * Let's declare a second syscall table for old ABI binaries * using the compatibility syscall entries. */ #define ABI(native, compat) compat #define OBSOLETE(syscall) syscall .type sys_oabi_call_table, #object ENTRY(sys_oabi_call_table) #include "calls.S" #undef ABI #undef OBSOLETEsys_call_table 在内核中是个跳转表,这个表中存储的是一系列的函数指针,这些指针就是系统调用函数的指针,如(sys_open)。内核是根据一个系统调用号(对于EABI来说为系统调用表的索引)找到实际该调用内核哪个函数,然后通过运行该函数完成系统调用的。
系统会根据ABI的不同而将相应的系统调用表的基地址加载进tbl寄存器。
接下来看查找的过程。
ARM-Linux内核启动时,通过start_kernel(/init/main.c)->setup_arch(/arch/arm/kernel/setup.c)->paging_init(/arch/arm/mm/nommu.c)->early_trap_init(/arch/arm/kernel/traps.c),初始化中断异常向量表:
void __init early_trap_init(void *vectors_base) { unsigned long vectors = (unsigned long)vectors_base; extern char __stubs_start[], __stubs_end[]; extern char __vectors_start[], __vectors_end[]; extern char __kuser_helper_start[], __kuser_helper_end[]; int kuser_sz = __kuser_helper_end - __kuser_helper_start; vectors_page = vectors_base; /* * Copy the vectors, stubs and kuser helpers (in entry-armv.S) * into the vector page, mapped at 0xffff0000, and ensure these * are visible to the instruction stream. */ memcpy((void *)vectors, __vectors_start, __vectors_end - __vectors_start); memcpy((void *)vectors + 0x200, __stubs_start, __stubs_end - __stubs_start); memcpy((void *)vectors + 0x1000 - kuser_sz, __kuser_helper_start, kuser_sz); /* * Do processor specific fixups for the kuser helpers */ kuser_get_tls_init(vectors); /* * Copy signal return handlers into the vector page, and * set sigreturn to be a pointer to these. */ memcpy((void *)(vectors + KERN_SIGRETURN_CODE - CONFIG_VECTORS_BASE), sigreturn_codes, sizeof(sigreturn_codes)); memcpy((void *)(vectors + KERN_RESTART_CODE - CONFIG_VECTORS_BASE), syscall_restart_code, sizeof(syscall_restart_code)); flush_icache_range(vectors, vectors + PAGE_SIZE); modify_domain(DOMAIN_USER, DOMAIN_CLIENT); }paging_init函数中调用传递的参数是:early_trap_init((void *)CONFIG_VECTORS_BASE);
__vectors_start: ARM( swi SYS_ERROR0 ) THUMB( svc #0 ) THUMB( nop ) W(b) vector_und + stubs_offset W(ldr) pc, .LCvswi + stubs_offset W(b) vector_pabt + stubs_offset W(b) vector_dabt + stubs_offset W(b) vector_addrexcptn + stubs_offset W(b) vector_irq + stubs_offset W(b) vector_fiq + stubs_offset .globl __vectors_end __vectors_end:填充后,向量表如下:
.LCvswi: .word vector_swi最终会执行例程vector_swi来完成对系统调用的处理,翻看/arch/arm/kernel/entry-common.S下vector_swi的定义。
ENTRY(vector_swi) sub sp, sp, #S_FRAME_SIZE stmia sp, {r0 - r12} @ Calling r0 - r12 ARM( add r8, sp, #S_PC ) ARM( stmdb r8, {sp, lr}^ ) @ Calling sp, lr THUMB( mov r8, sp ) THUMB( store_user_sp_lr r8, r10, S_SP ) @ calling sp, lr mrs r8, spsr @ called from non-FIQ mode, so ok. str lr, [sp, #S_PC] @ Save calling PC str r8, [sp, #S_PSR] @ Save CPSR str r0, [sp, #S_OLD_R0] @ Save OLD_R0 zero_fp /* * Get the system call number. */ #if defined(CONFIG_OABI_COMPAT) /* * If we have CONFIG_OABI_COMPAT then we need to look at the swi * value to determine if it is an EABI or an old ABI call. */ #ifdef CONFIG_ARM_THUMB tst r8, #PSR_T_BIT movne r10, #0 @ no thumb OABI emulation ldreq r10, [lr, #-4] @ get SWI instruction #else ldr r10, [lr, #-4] @ get SWI instruction A710( and ip, r10, #0x0f000000 @ check for SWI ) A710( teq ip, #0x0f000000 ) A710( bne .Larm710bug ) #endif //endif "CONFIG_ARM_THUMB" #ifdef CONFIG_CPU_ENDIAN_BE8 rev r10, r10 @ little endian instruction #endif //endif "CONFIG_CPU_ENDIAN_BE8" #elif defined(CONFIG_AEABI) /* * Pure EABI user space always put syscall number into scno (r7). */ A710( ldr ip, [lr, #-4] @ get SWI instruction ) A710( and ip, ip, #0x0f000000 @ check for SWI ) A710( teq ip, #0x0f000000 ) A710( bne .Larm710bug ) #elif defined(CONFIG_ARM_THUMB) /* Legacy ABI only, possibly thumb mode. */ tst r8, #PSR_T_BIT @ this is SPSR from save_user_regs addne scno, r7, #__NR_SYSCALL_BASE @ put OS number in ldreq scno, [lr, #-4] #else /* Legacy ABI only. 
*/ ldr scno, [lr, #-4] @ get SWI instruction A710( and ip, scno, #0x0f000000 @ check for SWI ) A710( teq ip, #0x0f000000 ) A710( bne .Larm710bug ) #endif //endif "CONFIG_OABI_COMPAT" #ifdef CONFIG_ALIGNMENT_TRAP ldr ip, __cr_alignment ldr ip, [ip] mcr p15, 0, ip, c1, c0 @ update control register #endif enable_irq get_thread_info tsk //tbl是r8寄存器的别名,在arch/arm/kernel/entry-header.S中定义: // tbl .req r8 @syscall table pointer, // 用来存放系统调用表的指针,系统调用表在后面调用 adr tbl, sys_call_table @ load syscall table pointer #if defined(CONFIG_OABI_COMPAT) /* * If the swi argument is zero, this is an EABI call and we do nothing. * * If this is an old ABI call, get the syscall number into scno and * get the old ABI syscall table address. */ bics r10, r10, #0xff000000 eorne scno, r10, #__NR_OABI_SYSCALL_BASE ldrne tbl, =sys_oabi_call_table #elif !defined(CONFIG_AEABI) // scno是寄存器r7的别名 bic scno, scno, #0xff000000 @ mask off SWI op-code eor scno, scno, #__NR_SYSCALL_BASE @ check OS number #endif ldr r10, [tsk, #TI_FLAGS] @ check for syscall tracing stmdb sp!, {r4, r5} @ push fifth and sixth args #ifdef CONFIG_SECCOMP tst r10, #_TIF_SECCOMP beq 1f mov r0, scno bl __secure_computing add r0, sp, #S_R0 + S_OFF @ pointer to regs ldmia r0, {r0 - r3} @ have to reload r0 - r3 1: #endif tst r10, #_TIF_SYSCALL_WORK @ are we tracing syscalls? bne __sys_trace cmp scno, #NR_syscalls @ check upper syscall limit adr lr, BSYM(ret_fast_syscall) @ return address //转入到实现函数 ldrcc pc, [tbl, scno, lsl #2] @ call sys_* routine add r1, sp, #S_OFF // why是r8寄存器的别名 2: mov why, #0 @ no longer a real syscall cmp scno, #(__ARM_NR_BASE - __NR_SYSCALL_BASE) eor r0, scno, #__NR_SYSCALL_BASE @ put OS number back bcs arm_syscall b sys_ni_syscall @ not private func ENDPROC(vector_swi)
然后转入到函数入口,执行系统调用。
参考总结于:http://blog.csdn.net/xiyangfan/article/details/5701673
http://blog.csdn.net/hongjiujing/article/details/6831192
http://blog.chinaunix.net/uid-26316047-id-3402198.html