linux系统调用代码

系统调用有3条调用流程(flow):
Flow1:

/*=============================================================================
 * SWI handler
 *-----------------------------------------------------------------------------
 * vector_swi is the entry point taken for the SWI (system call) exception.
 *
 * Overview of what the code below does:
 *   1. Unless CONFIG_CPU_V7M is set, reserve S_FRAME_SIZE bytes on the stack
 *      and save r0-r12, the calling sp/lr, the calling PC (lr), the SPSR and
 *      the original r0 into that frame.
 *   2. Work out the system call number in scno:
 *        - pure EABI: user space already placed it in scno (r7);
 *        - OABI / legacy ABI: it is decoded from the SWI instruction that
 *          is loaded from [lr, #-4].
 *   3. Point tbl at sys_call_table, or at sys_oabi_call_table for an old
 *      ABI call when CONFIG_OABI_COMPAT is enabled.
 *   4. If any _TIF_SYSCALL_WORK flag is set in the thread flags, branch to
 *      __sys_trace.
 *   5. If scno is below NR_syscalls, jump through the table entry with lr
 *      set to ret_fast_syscall.
 *   6. Otherwise hand numbers at or above (__ARM_NR_BASE - __NR_SYSCALL_BASE)
 *      to arm_syscall and everything else to sys_ni_syscall.
 */

    .align  5
ENTRY(vector_swi)
#ifdef CONFIG_CPU_V7M
    v7m_exception_entry
#else
    sub sp, sp, #S_FRAME_SIZE
    stmia   sp, {r0 - r12}          @ Calling r0 - r12
 ARM(   add r8, sp, #S_PC       )
 ARM(   stmdb   r8, {sp, lr}^       )   @ Calling sp, lr
 THUMB( mov r8, sp          )
 THUMB( store_user_sp_lr r8, r10, S_SP  )   @ calling sp, lr
    mrs r8, spsr            @ called from non-FIQ mode, so ok.
    str lr, [sp, #S_PC]         @ Save calling PC
    str r8, [sp, #S_PSR]        @ Save CPSR
    str r0, [sp, #S_OLD_R0]     @ Save OLD_R0
#endif
    zero_fp
    alignment_trap r10, ip, __cr_alignment
    enable_irq
    ct_user_exit
    get_thread_info tsk

    /*
     * Get the system call number.
     */

#if defined(CONFIG_OABI_COMPAT)

    /*
     * If we have CONFIG_OABI_COMPAT then we need to look at the swi
     * value to determine if it is an EABI or an old ABI call.
     */
#ifdef CONFIG_ARM_THUMB
    /* r8 still holds the SPSR read above; the T bit selects the Thumb case. */
    tst r8, #PSR_T_BIT
    movne   r10, #0             @ no thumb OABI emulation
 USER(  ldreq   r10, [lr, #-4]      )   @ get SWI instruction
#else
 USER(  ldr r10, [lr, #-4]      )   @ get SWI instruction
#endif
 ARM_BE8(rev    r10, r10)           @ little endian instruction

#elif defined(CONFIG_AEABI)

    /*
     * Pure EABI user space always put syscall number into scno (r7).
     */
#elif defined(CONFIG_ARM_THUMB)
    /* Legacy ABI only, possibly thumb mode. */
    tst r8, #PSR_T_BIT          @ this is SPSR from save_user_regs
    addne   scno, r7, #__NR_SYSCALL_BASE    @ put OS number in
 USER(  ldreq   scno, [lr, #-4]     )

#else
    /* Legacy ABI only. */
 USER(  ldr scno, [lr, #-4]     )   @ get SWI instruction
#endif

    adr tbl, sys_call_table     @ load syscall table pointer

#if defined(CONFIG_OABI_COMPAT)
    /*
     * If the swi argument is zero, this is an EABI call and we do nothing.
     *
     * If this is an old ABI call, get the syscall number into scno and
     * get the old ABI syscall table address.
     */
    /* bics clears the top 8 bits of r10 and sets Z when the rest is zero. */
    bics    r10, r10, #0xff000000
    eorne   scno, r10, #__NR_OABI_SYSCALL_BASE
    ldrne   tbl, =sys_oabi_call_table
#elif !defined(CONFIG_AEABI)
    bic scno, scno, #0xff000000     @ mask off SWI op-code
    eor scno, scno, #__NR_SYSCALL_BASE  @ check OS number
#endif

local_restart:
    ldr r10, [tsk, #TI_FLAGS]       @ check for syscall tracing
    stmdb   sp!, {r4, r5}           @ push fifth and sixth args

    tst r10, #_TIF_SYSCALL_WORK     @ are we tracing syscalls?
    bne __sys_trace

    /*
     * Fast path: the cmp leaves carry clear only when scno is (unsigned)
     * below NR_syscalls, so ldrcc jumps through the table only for
     * in-range numbers. lr is loaded first so the handler returns to
     * ret_fast_syscall.
     */
    cmp scno, #NR_syscalls      @ check upper syscall limit
    adr lr, BSYM(ret_fast_syscall)  @ return address
    ldrcc   pc, [tbl, scno, lsl #2]     @ call sys_* routine

    /*
     * Out-of-range number. r1 = sp + S_OFF and r0 = scno with
     * __NR_SYSCALL_BASE put back are set up before the branch to
     * arm_syscall, which is taken when scno is at or above
     * (__ARM_NR_BASE - __NR_SYSCALL_BASE). Anything else goes to
     * sys_ni_syscall.
     * NOTE(review): S_OFF presumably skips the two words pushed at
     * local_restart so r1 points at the saved register frame - confirm
     * against the definition of S_OFF.
     */
    add r1, sp, #S_OFF
2:  cmp scno, #(__ARM_NR_BASE - __NR_SYSCALL_BASE)
    eor r0, scno, #__NR_SYSCALL_BASE    @ put OS number back
    bcs arm_syscall
    mov why, #0             @ no longer a real syscall
    b   sys_ni_syscall          @ not private func

#if defined(CONFIG_OABI_COMPAT) || !defined(CONFIG_AEABI)
    /*
     * We failed to handle a fault trying to access the page
     * containing the swi instruction, but we're not really in a
     * position to return -EFAULT. Instead, return back to the
     * instruction and re-enter the user fault handling path trying
     * to page it in. This will likely result in sending SEGV to the
     * current task.
     */
    /*
     * NOTE(review): 9001 is presumably the fixup target of the USER()
     * loads above - confirm against the USER() macro definition.
     * The saved PC is rewound by 4 so the SWI instruction is re-executed.
     */
9001:
    sub lr, lr, #4
    str lr, [sp, #S_PC]
    b   ret_fast_syscall
#endif
ENDPROC(vector_swi)

Flow2:

/*============================================================================
 * Special system call wrappers
 */
/*
 * sys_syscall - indirect system call: dispatch to the syscall whose number
 * is passed in r0, with the remaining arguments shifted down by one.
 *
 * The number is reduced to a table index by clearing the
 * __NR_OABI_SYSCALL_BASE bits. The two chained compares leave the "lo"
 * condition true only when the index is not sys_syscall's own index (an
 * equal compare sets carry, so "lo" fails) and is below NR_syscalls.
 * In that case r5/r6 are stored into the stack argument slots at sp,
 * r1-r4 move down into r0-r3, and control jumps through the table.
 * Otherwise execution falls through to sys_ni_syscall.
 */
@ r0 = syscall number
@ r8 = syscall table
sys_syscall:
        bic scno, r0, #__NR_OABI_SYSCALL_BASE
        cmp scno, #__NR_syscall - __NR_SYSCALL_BASE
        cmpne   scno, #NR_syscalls  @ check range
        stmloia sp, {r5, r6}        @ shuffle args
        movlo   r0, r1
        movlo   r1, r2
        movlo   r2, r3
        movlo   r3, r4
        ldrlo   pc, [tbl, scno, lsl #2]
        b   sys_ni_syscall
ENDPROC(sys_syscall)

Flow3:有些系统调用出于兼容性考虑,会先经过一个包装函数(wrapper)中转,再跳转到真正的实现:

/*
 * sys_sigreturn_wrapper - trampoline placed in front of sys_sigreturn.
 *
 * Clears 'why' so that no syscall restart handling takes place, loads
 * r0 with sp + S_OFF and then tail-branches to sys_sigreturn (a plain
 * 'b', so lr is not modified here).
 */
sys_sigreturn_wrapper:
        mov why, #0             @ suppress syscall restart handling
        add r0, sp, #S_OFF      @ r0 = sp + S_OFF, first argument
        b   sys_sigreturn       @ tail-branch to the real handler
ENDPROC(sys_sigreturn_wrapper)

/*
 * sys_rt_sigreturn_wrapper - trampoline placed in front of sys_rt_sigreturn.
 *
 * Clears 'why' so that no syscall restart handling takes place, loads
 * r0 with sp + S_OFF and then tail-branches to sys_rt_sigreturn (a plain
 * 'b', so lr is not modified here).
 */
sys_rt_sigreturn_wrapper:
        mov why, #0             @ suppress syscall restart handling
        add r0, sp, #S_OFF      @ r0 = sp + S_OFF, first argument
        b   sys_rt_sigreturn    @ tail-branch to the real handler
ENDPROC(sys_rt_sigreturn_wrapper)

/*
 * sys_statfs64_wrapper - adjust the second argument before sys_statfs64.
 *
 * If r1 equals 88 it is replaced by 84; any other value is passed through
 * unchanged. Control then tail-branches to sys_statfs64.
 * NOTE(review): r1 is presumably the structure size supplied by user space
 * (88 with padding vs. 84 without) - confirm against sys_statfs64.
 */
sys_statfs64_wrapper:
        teq r1, #88
        moveq   r1, #84
        b   sys_statfs64
ENDPROC(sys_statfs64_wrapper)

/*
 * sys_fstatfs64_wrapper - adjust the second argument before sys_fstatfs64.
 *
 * If r1 equals 88 it is replaced by 84; any other value is passed through
 * unchanged. Control then tail-branches to sys_fstatfs64.
 * NOTE(review): r1 is presumably the structure size supplied by user space
 * (88 with padding vs. 84 without) - confirm against sys_fstatfs64.
 */
sys_fstatfs64_wrapper:
        teq r1, #88
        moveq   r1, #84
        b   sys_fstatfs64
ENDPROC(sys_fstatfs64_wrapper)

/*
 * Note: off_4k (r5) is always units of 4K.  If we can't do the requested
 * offset, we return EINVAL.
 *
 * When PAGE_SHIFT > 12 the offset is accepted only if none of the
 * PGOFF_MASK bits are set in r5; it is then shifted right by
 * (PAGE_SHIFT - 12) to convert it from 4K units to page units, written
 * to the stack slot at [sp, #4] and passed on to sys_mmap_pgoff.
 * Otherwise r0 is set to -EINVAL and the wrapper returns.
 * When PAGE_SHIFT is 12 or less, r5 is stored unchanged.
 */
sys_mmap2:
#if PAGE_SHIFT > 12
        tst r5, #PGOFF_MASK
        moveq   r5, r5, lsr #PAGE_SHIFT - 12
        streq   r5, [sp, #4]
        beq sys_mmap_pgoff
        mov r0, #-EINVAL
        ret lr
#else
        str r5, [sp, #4]
        b   sys_mmap_pgoff
#endif
ENDPROC(sys_mmap2)

#ifdef CONFIG_OABI_COMPAT

/*
 * These are syscalls with argument register differences
 */

/*
 * sys_oabi_pread64 - old-ABI entry for pread64.
 * Stores r3 and r4 into the two stack argument words at sp, then
 * tail-branches to sys_pread64.
 */
sys_oabi_pread64:
        stmia   sp, {r3, r4}
        b   sys_pread64
ENDPROC(sys_oabi_pread64)

/*
 * sys_oabi_pwrite64 - old-ABI entry for pwrite64.
 * Stores r3 and r4 into the two stack argument words at sp, then
 * tail-branches to sys_pwrite64.
 */
sys_oabi_pwrite64:
        stmia   sp, {r3, r4}
        b   sys_pwrite64
ENDPROC(sys_oabi_pwrite64)

/*
 * sys_oabi_truncate64 - old-ABI entry for truncate64.
 * Moves the r1/r2 pair up into r2/r3 (r3 is written first so r2 is not
 * overwritten before it is copied), then tail-branches to sys_truncate64.
 */
sys_oabi_truncate64:
        mov r3, r2
        mov r2, r1
        b   sys_truncate64
ENDPROC(sys_oabi_truncate64)

/*
 * sys_oabi_ftruncate64 - old-ABI entry for ftruncate64.
 * Moves the r1/r2 pair up into r2/r3 (r3 is written first so r2 is not
 * overwritten before it is copied), then tail-branches to sys_ftruncate64.
 */
sys_oabi_ftruncate64:
        mov r3, r2
        mov r2, r1
        b   sys_ftruncate64
ENDPROC(sys_oabi_ftruncate64)

/*
 * sys_oabi_readahead - old-ABI entry for readahead.
 * Spills r3 to the first stack argument word at [sp], moves the r1/r2
 * pair up into r2/r3, then tail-branches to sys_readahead.
 */
sys_oabi_readahead:
        str r3, [sp]
        mov r3, r2
        mov r2, r1
        b   sys_readahead
ENDPROC(sys_oabi_readahead)

系统调用宏的定义如下(include/linux/syscalls.h):

/*
 * __MAP - apply a macro to syscall arguments
 * __MAP(n, m, t1, a1, t2, a2, ..., tn, an) will expand to
 *    m(t1, a1), m(t2, a2), ..., m(tn, an)
 * The first argument must be equal to the amount of type/name
 * pairs given.  Note that this list of pairs (i.e. the arguments
 * of __MAP starting at the third one) is in the same format as
 * for SYSCALL_DEFINE/COMPAT_SYSCALL_DEFINE
 *
 * Each __MAPk peels off one (type, name) pair, applies m to it and
 * recurses into __MAP(k-1) with the remaining arguments; __MAP0 expands
 * to nothing. __MAP itself selects the right __MAPk by pasting n onto
 * the macro name.
 *
 * Example: __MAP(2, __SC_DECL, int, fd, char *, buf)
 *          expands to: int fd, char * buf
 */
#define __MAP0(m,...)
#define __MAP1(m,t,a) m(t,a)
#define __MAP2(m,t,a,...) m(t,a), __MAP1(m,__VA_ARGS__)
#define __MAP3(m,t,a,...) m(t,a), __MAP2(m,__VA_ARGS__)
#define __MAP4(m,t,a,...) m(t,a), __MAP3(m,__VA_ARGS__)
#define __MAP5(m,t,a,...) m(t,a), __MAP4(m,__VA_ARGS__)
#define __MAP6(m,t,a,...) m(t,a), __MAP5(m,__VA_ARGS__)
#define __MAP(n,...) __MAP##n(__VA_ARGS__)

#define __SC_DECL(t, a) t a
#define __TYPE_IS_L(t)  (__same_type((t)0, 0L))
#define __TYPE_IS_UL(t) (__same_type((t)0, 0UL))
#define __TYPE_IS_LL(t) (__same_type((t)0, 0LL) || __same_type((t)0, 0ULL))
#define __SC_LONG(t, a) __typeof(__builtin_choose_expr(__TYPE_IS_LL(t), 0LL, 0L)) a
#define __SC_CAST(t, a) (t) a
#define __SC_ARGS(t, a) a
#define __SC_TEST(t, a) (void)BUILD_BUG_ON_ZERO(!__TYPE_IS_LL(t) && sizeof(t) > sizeof(long))

/*
 * Syscall tracing metadata, only built when CONFIG_FTRACE_SYSCALLS is set.
 */
#ifdef CONFIG_FTRACE_SYSCALLS
/* __MAP helpers that stringify the argument name / the argument type. */
#define __SC_STR_ADECL(t, a)    #a
#define __SC_STR_TDECL(t, a)    #t

/* Shared event classes and print callbacks referenced by the per-syscall events below. */
extern struct ftrace_event_class event_class_syscall_enter;
extern struct ftrace_event_class event_class_syscall_exit;
extern struct trace_event_functions enter_syscall_print_funcs;
extern struct trace_event_functions exit_syscall_print_funcs;

/*
 * SYSCALL_TRACE_ENTER_EVENT(sname) - emit the "enter" trace event for one
 * syscall.
 *
 * Forward-declares __syscall_meta_<sname>, defines the static
 * ftrace_event_call event_enter_<sname> (class event_class_syscall_enter,
 * name "sys_enter<sname>", data pointing at the metadata) and places a
 * pointer to it, __event_enter_<sname>, in the "_ftrace_events" section.
 */
#define SYSCALL_TRACE_ENTER_EVENT(sname)                \
    static struct syscall_metadata __syscall_meta_##sname;      \
    static struct ftrace_event_call __used              \
      event_enter_##sname = {                   \
        .class          = &event_class_syscall_enter,   \
        {                           \
            .name                   = "sys_enter"#sname,    \
        },                          \
        .event.funcs            = &enter_syscall_print_funcs,   \
        .data           = (void *)&__syscall_meta_##sname,\
        .flags                  = TRACE_EVENT_FL_CAP_ANY,   \
    };                              \
    static struct ftrace_event_call __used              \
      __attribute__((section("_ftrace_events")))            \
     *__event_enter_##sname = &event_enter_##sname;

/*
 * SYSCALL_TRACE_EXIT_EVENT(sname) - emit the "exit" trace event for one
 * syscall.
 *
 * Forward-declares __syscall_meta_<sname>, defines the static
 * ftrace_event_call event_exit_<sname> (class event_class_syscall_exit,
 * name "sys_exit<sname>", data pointing at the metadata) and places a
 * pointer to it, __event_exit_<sname>, in the "_ftrace_events" section.
 */
#define SYSCALL_TRACE_EXIT_EVENT(sname)                 \
    static struct syscall_metadata __syscall_meta_##sname;      \
    static struct ftrace_event_call __used              \
      event_exit_##sname = {                    \
        .class          = &event_class_syscall_exit,    \
        {                           \
            .name                   = "sys_exit"#sname, \
        },                          \
        .event.funcs        = &exit_syscall_print_funcs,    \
        .data           = (void *)&__syscall_meta_##sname,\
        .flags                  = TRACE_EVENT_FL_CAP_ANY,   \
    };                              \
    static struct ftrace_event_call __used              \
      __attribute__((section("_ftrace_events")))            \
    *__event_exit_##sname = &event_exit_##sname;

/*
 * SYSCALL_METADATA(sname, nb, type/name pairs...) - emit the tracing
 * metadata for one syscall with nb arguments.
 *
 * Builds two string arrays, types_<sname> and args_<sname>, by
 * stringifying each argument's type and name through __MAP; emits the
 * enter and exit trace events; defines __syscall_meta_<sname> with the
 * name "sys<sname>", the argument count and pointers to those arrays
 * (NULL when nb is 0); and places a pointer to it in the
 * "__syscalls_metadata" section. syscall_nr starts at -1.
 *
 * Without CONFIG_FTRACE_SYSCALLS the macro expands to nothing.
 */
#define SYSCALL_METADATA(sname, nb, ...)            \
    static const char *types_##sname[] = {          \
        __MAP(nb,__SC_STR_TDECL,__VA_ARGS__)        \
    };                          \
    static const char *args_##sname[] = {           \
        __MAP(nb,__SC_STR_ADECL,__VA_ARGS__)        \
    };                          \
    SYSCALL_TRACE_ENTER_EVENT(sname);           \
    SYSCALL_TRACE_EXIT_EVENT(sname);            \
    static struct syscall_metadata __used           \
      __syscall_meta_##sname = {                \
        .name       = "sys"#sname,          \
        .syscall_nr = -1,   /* Filled in at boot */ \
        .nb_args    = nb,               \
        .types      = nb ? types_##sname : NULL,    \
        .args       = nb ? args_##sname : NULL, \
        .enter_event    = &event_enter_##sname,     \
        .exit_event = &event_exit_##sname,      \
        .enter_fields   = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
    };                          \
    static struct syscall_metadata __used           \
      __attribute__((section("__syscalls_metadata")))   \
     *__p_syscall_meta_##sname = &__syscall_meta_##sname;
#else
#define SYSCALL_METADATA(sname, nb, ...)
#endif

/*
 * SYSCALL_DEFINE0(sname) - define a syscall that takes no arguments.
 * Emits the metadata for "_<sname>" with zero arguments followed by the
 * function header "asmlinkage long sys_<sname>(void)"; the body is
 * written by the user of the macro.
 */
#define SYSCALL_DEFINE0(sname)                  \
    SYSCALL_METADATA(_##sname, 0);              \
    asmlinkage long sys_##sname(void)

/*
 * SYSCALL_DEFINE1..6(name, type1, arg1, ...) - define a syscall taking
 * 1 to 6 arguments. Each forwards to SYSCALL_DEFINEx with the argument
 * count and the name prefixed by an underscore.
 */
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

/*
 * SYSCALL_DEFINEx(x, sname, ...) - emit the tracing metadata, then the
 * actual function definitions via __SYSCALL_DEFINEx.
 */
#define SYSCALL_DEFINEx(x, sname, ...)              \
    SYSCALL_METADATA(sname, x, __VA_ARGS__)         \
    __SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

/* __PROTECT: thin forwarder to asmlinkage_protect(). */
#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
/*
 * __SYSCALL_DEFINEx(x, name, type/name pairs...) - generate the three
 * functions that make up one syscall:
 *
 *   sys<name>   declared with the real argument types, as an alias of
 *               SyS<name>.
 *   SyS<name>   takes every argument as long (or long long, see
 *               __SC_LONG), casts each one back to its declared type and
 *               calls SYSC<name>; it also expands the __SC_TEST size
 *               checks and __PROTECT before returning the result.
 *   SYSC<name>  static inline, with the real argument types. The macro
 *               ends on its function header, so the body written after
 *               the macro invocation becomes the body of SYSC<name>.
 */
#define __SYSCALL_DEFINEx(x, name, ...)                 \
    asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))   \
        __attribute__((alias(__stringify(SyS##name))));     \
    static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));  \
    asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));  \
    asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))   \
    {                               \
        long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));  \
        __MAP(x,__SC_TEST,__VA_ARGS__);             \
        __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));   \
        return ret;                     \
    }                               \
    static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))

对于arm平台,所有的系统调用都定义在calls.S (arch/arm/kernel/calls.S):

/*
 * Excerpt of the syscall list (the dotted lines mark elided entries).
 * Every entry is wrapped in CALL(); the meaning of CALL(), ABI() and
 * OBSOLETE() is chosen by whoever includes this file. The numbered
 * comments give the table index of the entry that follows them.
 */
/* 0 */     CALL(sys_restart_syscall)
        CALL(sys_exit)
        CALL(sys_fork)
        CALL(sys_read)
        CALL(sys_write)
/* 5 */     CALL(sys_open)
        CALL(sys_close)
        CALL(sys_ni_syscall)        /* was sys_waitpid */
        CALL(sys_creat)
        CALL(sys_link)
...........
        CALL(OBSOLETE(sys_time))    /* used by libc4 */
...........
/* 180 */   CALL(ABI(sys_pread64, sys_oabi_pread64))
        CALL(ABI(sys_pwrite64, sys_oabi_pwrite64))
..........
/* 385 */   CALL(sys_memfd_create)
        CALL(sys_bpf)
        CALL(sys_execveat)
/*
 * Pad the list with sys_ni_syscall entries. syscalls_padding is the
 * number of entries needed to round NR_syscalls up to a multiple of 4;
 * the syscalls_counted guard makes sure it is computed only the first
 * time the file is included.
 */
#ifndef syscalls_counted
.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
#define syscalls_counted
#endif
.rept syscalls_padding
        CALL(sys_ni_syscall)
.endr

系统调用表的定义如下:

/*
 * Counting pass: with CALL(x) defined to bump NR_syscalls by one,
 * including calls.S emits no data and leaves NR_syscalls holding the
 * number of CALL() entries.
 */
    .equ NR_syscalls,0
#define CALL(x) .equ NR_syscalls,NR_syscalls+1
#include "calls.S"

/*
 * Ensure that the system call table is equal to __NR_syscalls,
 * which is the value the rest of the system sees
 */
.ifne NR_syscalls - __NR_syscalls
.error "__NR_syscalls is not equal to the size of the syscall table"
.endif

/*
 * Emission pass: CALL(x) now emits one 32-bit word holding the address
 * of x, so each inclusion of calls.S below lays down a table of pointers.
 */
#undef CALL
#define CALL(x) .long x

/*
 * This is the syscall table declaration for native ABI syscalls.
 * With EABI a couple syscalls are obsolete and defined as sys_ni_syscall.
 *
 * ABI(native, compat) picks the native handler here, and OBSOLETE(x)
 * becomes sys_ni_syscall when CONFIG_AEABI is set.
 */
#define ABI(native, compat) native
#ifdef CONFIG_AEABI
#define OBSOLETE(syscall) sys_ni_syscall
#else
#define OBSOLETE(syscall) syscall
#endif

    .type   sys_call_table, #object
ENTRY(sys_call_table)
#include "calls.S"
#undef ABI
#undef OBSOLETE

/*
 * Let's declare a second syscall table for old ABI binaries
 * using the compatibility syscall entries.
 *
 * ABI(native, compat) picks the compat handler here, and OBSOLETE(x)
 * keeps the original handler x.
 */
#define ABI(native, compat) compat
#define OBSOLETE(syscall) syscall

    .type   sys_oabi_call_table, #object
ENTRY(sys_oabi_call_table)
#include "calls.S"
#undef ABI
#undef OBSOLETE

你可能感兴趣的:(linux系统调用代码)