刘文学+原创作品转载请注明出处http://blog.csdn.net/wdxz6547/article/details/50938472 《Linux内核分析》MOOC课程http://mooc.study.163.com/course/USTC-1000029000
先看 linux 系统支持系统调用表
0 common read sys_read
1 common write sys_write
2 common open sys_open
3 common close sys_close
4 common stat sys_newstat
5 common fstat sys_newfstat
6 common lstat sys_newlstat
7 common poll sys_poll
8 common lseek sys_lseek
9 common mmap sys_mmap
10 common mprotect sys_mprotect
11 common munmap sys_munmap
12 common brk sys_brk
13 64 rt_sigaction sys_rt_sigaction
14 common rt_sigprocmask sys_rt_sigprocmask
15 64 rt_sigreturn sys_rt_sigreturn/ptregs
16 64 ioctl sys_ioctl
17 common pread64 sys_pread64
18 common pwrite64 sys_pwrite64
19 64 readv sys_readv
20 64 writev sys_writev
21 common access sys_access
22 common pipe sys_pipe
23 common select sys_select
24 common sched_yield sys_sched_yield
25 common mremap sys_mremap
26 common msync sys_msync
27 common mincore sys_mincore
28 common madvise sys_madvise
29 common shmget sys_shmget
30 common shmat sys_shmat
31 common shmctl sys_shmctl
32 common dup sys_dup
33 common dup2 sys_dup2
34 common pause sys_pause
35 common nanosleep sys_nanosleep
36 common getitimer sys_getitimer
37 common alarm sys_alarm
38 common setitimer sys_setitimer
39 common getpid sys_getpid
40 common sendfile sys_sendfile64
41 common socket sys_socket
42 common connect sys_connect
43 common accept sys_accept
44 common sendto sys_sendto
45 64 recvfrom sys_recvfrom
46 64 sendmsg sys_sendmsg
47 64 recvmsg sys_recvmsg
48 common shutdown sys_shutdown
49 common bind sys_bind
50 common listen sys_listen
51 common getsockname sys_getsockname
52 common getpeername sys_getpeername
53 common socketpair sys_socketpair
54 64 setsockopt sys_setsockopt
55 64 getsockopt sys_getsockopt
56 common clone sys_clone/ptregs
57 common fork sys_fork/ptregs
58 common vfork sys_vfork/ptregs
59 64 execve sys_execve/ptregs
60 common exit sys_exit
61 common wait4 sys_wait4
62 common kill sys_kill
63 common uname sys_newuname
64 common semget sys_semget
65 common semop sys_semop
66 common semctl sys_semctl
67 common shmdt sys_shmdt
68 common msgget sys_msgget
69 common msgsnd sys_msgsnd
70 common msgrcv sys_msgrcv
71 common msgctl sys_msgctl
72 common fcntl sys_fcntl
73 common flock sys_flock
74 common fsync sys_fsync
75 common fdatasync sys_fdatasync
76 common truncate sys_truncate
77 common ftruncate sys_ftruncate
78 common getdents sys_getdents
79 common getcwd sys_getcwd
80 common chdir sys_chdir
81 common fchdir sys_fchdir
82 common rename sys_rename
83 common mkdir sys_mkdir
84 common rmdir sys_rmdir
85 common creat sys_creat
86 common link sys_link
87 common unlink sys_unlink
88 common symlink sys_symlink
89 common readlink sys_readlink
90 common chmod sys_chmod
91 common fchmod sys_fchmod
92 common chown sys_chown
93 common fchown sys_fchown
94 common lchown sys_lchown
95 common umask sys_umask
96 common gettimeofday sys_gettimeofday
97 common getrlimit sys_getrlimit
98 common getrusage sys_getrusage
99 common sysinfo sys_sysinfo
100 common times sys_times
101 64 ptrace sys_ptrace
102 common getuid sys_getuid
103 common syslog sys_syslog
104 common getgid sys_getgid
105 common setuid sys_setuid
106 common setgid sys_setgid
107 common geteuid sys_geteuid
108 common getegid sys_getegid
109 common setpgid sys_setpgid
110 common getppid sys_getppid
111 common getpgrp sys_getpgrp
112 common setsid sys_setsid
113 common setreuid sys_setreuid
114 common setregid sys_setregid
115 common getgroups sys_getgroups
116 common setgroups sys_setgroups
117 common setresuid sys_setresuid
118 common getresuid sys_getresuid
119 common setresgid sys_setresgid
120 common getresgid sys_getresgid
121 common getpgid sys_getpgid
122 common setfsuid sys_setfsuid
123 common setfsgid sys_setfsgid
124 common getsid sys_getsid
125 common capget sys_capget
126 common capset sys_capset
127 64 rt_sigpending sys_rt_sigpending
128 64 rt_sigtimedwait sys_rt_sigtimedwait
129 64 rt_sigqueueinfo sys_rt_sigqueueinfo
130 common rt_sigsuspend sys_rt_sigsuspend
131 64 sigaltstack sys_sigaltstack
132 common utime sys_utime
133 common mknod sys_mknod
134 64 uselib
135 common personality sys_personality
136 common ustat sys_ustat
137 common statfs sys_statfs
138 common fstatfs sys_fstatfs
139 common sysfs sys_sysfs
140 common getpriority sys_getpriority
141 common setpriority sys_setpriority
142 common sched_setparam sys_sched_setparam
143 common sched_getparam sys_sched_getparam
144 common sched_setscheduler sys_sched_setscheduler
145 common sched_getscheduler sys_sched_getscheduler
146 common sched_get_priority_max sys_sched_get_priority_max
147 common sched_get_priority_min sys_sched_get_priority_min
148 common sched_rr_get_interval sys_sched_rr_get_interval
149 common mlock sys_mlock
150 common munlock sys_munlock
151 common mlockall sys_mlockall
152 common munlockall sys_munlockall
153 common vhangup sys_vhangup
154 common modify_ldt sys_modify_ldt
155 common pivot_root sys_pivot_root
156 64 _sysctl sys_sysctl
157 common prctl sys_prctl
158 common arch_prctl sys_arch_prctl
159 common adjtimex sys_adjtimex
160 common setrlimit sys_setrlimit
161 common chroot sys_chroot
162 common sync sys_sync
163 common acct sys_acct
164 common settimeofday sys_settimeofday
165 common mount sys_mount
166 common umount2 sys_umount
167 common swapon sys_swapon
168 common swapoff sys_swapoff
169 common reboot sys_reboot
170 common sethostname sys_sethostname
171 common setdomainname sys_setdomainname
172 common iopl sys_iopl/ptregs
173 common ioperm sys_ioperm
174 64 create_module
175 common init_module sys_init_module
176 common delete_module sys_delete_module
177 64 get_kernel_syms
178 64 query_module
179 common quotactl sys_quotactl
180 64 nfsservctl
181 common getpmsg
182 common putpmsg
183 common afs_syscall
184 common tuxcall
185 common security
186 common gettid sys_gettid
187 common readahead sys_readahead
188 common setxattr sys_setxattr
189 common lsetxattr sys_lsetxattr
190 common fsetxattr sys_fsetxattr
191 common getxattr sys_getxattr
192 common lgetxattr sys_lgetxattr
193 common fgetxattr sys_fgetxattr
194 common listxattr sys_listxattr
195 common llistxattr sys_llistxattr
196 common flistxattr sys_flistxattr
197 common removexattr sys_removexattr
198 common lremovexattr sys_lremovexattr
199 common fremovexattr sys_fremovexattr
200 common tkill sys_tkill
201 common time sys_time
202 common futex sys_futex
203 common sched_setaffinity sys_sched_setaffinity
204 common sched_getaffinity sys_sched_getaffinity
205 64 set_thread_area
206 64 io_setup sys_io_setup
207 common io_destroy sys_io_destroy
208 common io_getevents sys_io_getevents
209 64 io_submit sys_io_submit
210 common io_cancel sys_io_cancel
211 64 get_thread_area
212 common lookup_dcookie sys_lookup_dcookie
213 common epoll_create sys_epoll_create
214 64 epoll_ctl_old
215 64 epoll_wait_old
216 common remap_file_pages sys_remap_file_pages
217 common getdents64 sys_getdents64
218 common set_tid_address sys_set_tid_address
219 common restart_syscall sys_restart_syscall
220 common semtimedop sys_semtimedop
221 common fadvise64 sys_fadvise64
222 64 timer_create sys_timer_create
223 common timer_settime sys_timer_settime
224 common timer_gettime sys_timer_gettime
225 common timer_getoverrun sys_timer_getoverrun
226 common timer_delete sys_timer_delete
227 common clock_settime sys_clock_settime
228 common clock_gettime sys_clock_gettime
229 common clock_getres sys_clock_getres
230 common clock_nanosleep sys_clock_nanosleep
231 common exit_group sys_exit_group
232 common epoll_wait sys_epoll_wait
233 common epoll_ctl sys_epoll_ctl
234 common tgkill sys_tgkill
235 common utimes sys_utimes
236 64 vserver
237 common mbind sys_mbind
238 common set_mempolicy sys_set_mempolicy
239 common get_mempolicy sys_get_mempolicy
240 common mq_open sys_mq_open
241 common mq_unlink sys_mq_unlink
242 common mq_timedsend sys_mq_timedsend
243 common mq_timedreceive sys_mq_timedreceive
244 64 mq_notify sys_mq_notify
245 common mq_getsetattr sys_mq_getsetattr
246 64 kexec_load sys_kexec_load
247 64 waitid sys_waitid
248 common add_key sys_add_key
249 common request_key sys_request_key
250 common keyctl sys_keyctl
251 common ioprio_set sys_ioprio_set
252 common ioprio_get sys_ioprio_get
253 common inotify_init sys_inotify_init
254 common inotify_add_watch sys_inotify_add_watch
255 common inotify_rm_watch sys_inotify_rm_watch
256 common migrate_pages sys_migrate_pages
257 common openat sys_openat
258 common mkdirat sys_mkdirat
259 common mknodat sys_mknodat
260 common fchownat sys_fchownat
261 common futimesat sys_futimesat
262 common newfstatat sys_newfstatat
263 common unlinkat sys_unlinkat
264 common renameat sys_renameat
265 common linkat sys_linkat
266 common symlinkat sys_symlinkat
267 common readlinkat sys_readlinkat
268 common fchmodat sys_fchmodat
269 common faccessat sys_faccessat
270 common pselect6 sys_pselect6
271 common ppoll sys_ppoll
272 common unshare sys_unshare
273 64 set_robust_list sys_set_robust_list
274 64 get_robust_list sys_get_robust_list
275 common splice sys_splice
276 common tee sys_tee
277 common sync_file_range sys_sync_file_range
278 64 vmsplice sys_vmsplice
279 64 move_pages sys_move_pages
280 common utimensat sys_utimensat
281 common epoll_pwait sys_epoll_pwait
282 common signalfd sys_signalfd
283 common timerfd_create sys_timerfd_create
284 common eventfd sys_eventfd
285 common fallocate sys_fallocate
286 common timerfd_settime sys_timerfd_settime
287 common timerfd_gettime sys_timerfd_gettime
288 common accept4 sys_accept4
289 common signalfd4 sys_signalfd4
290 common eventfd2 sys_eventfd2
291 common epoll_create1 sys_epoll_create1
292 common dup3 sys_dup3
293 common pipe2 sys_pipe2
294 common inotify_init1 sys_inotify_init1
295 64 preadv sys_preadv
296 64 pwritev sys_pwritev
297 64 rt_tgsigqueueinfo sys_rt_tgsigqueueinfo
298 common perf_event_open sys_perf_event_open
299 64 recvmmsg sys_recvmmsg
300 common fanotify_init sys_fanotify_init
301 common fanotify_mark sys_fanotify_mark
302 common prlimit64 sys_prlimit64
303 common name_to_handle_at sys_name_to_handle_at
304 common open_by_handle_at sys_open_by_handle_at
305 common clock_adjtime sys_clock_adjtime
306 common syncfs sys_syncfs
307 64 sendmmsg sys_sendmmsg
308 common setns sys_setns
309 common getcpu sys_getcpu
310 64 process_vm_readv sys_process_vm_readv
311 64 process_vm_writev sys_process_vm_writev
312 common kcmp sys_kcmp
313 common finit_module sys_finit_module
314 common sched_setattr sys_sched_setattr
315 common sched_getattr sys_sched_getattr
316 common renameat2 sys_renameat2
317 common seccomp sys_seccomp
318 common getrandom sys_getrandom
319 common memfd_create sys_memfd_create
320 common kexec_file_load sys_kexec_file_load
321 common bpf sys_bpf
322 64 execveat sys_execveat/ptregs
323 common userfaultfd sys_userfaultfd
324 common membarrier sys_membarrier
325 common mlock2 sys_mlock2
326 common copy_file_range sys_copy_file_range
#
# x32-specific system call numbers start at 512 to avoid cache impact
# for native 64-bit operation.
#
512 x32 rt_sigaction compat_sys_rt_sigaction
513 x32 rt_sigreturn sys32_x32_rt_sigreturn
514 x32 ioctl compat_sys_ioctl
515 x32 readv compat_sys_readv
516 x32 writev compat_sys_writev
517 x32 recvfrom compat_sys_recvfrom
518 x32 sendmsg compat_sys_sendmsg
519 x32 recvmsg compat_sys_recvmsg
520 x32 execve compat_sys_execve/ptregs
521 x32 ptrace compat_sys_ptrace
522 x32 rt_sigpending compat_sys_rt_sigpending
523 x32 rt_sigtimedwait compat_sys_rt_sigtimedwait
524 x32 rt_sigqueueinfo compat_sys_rt_sigqueueinfo
525 x32 sigaltstack compat_sys_sigaltstack
526 x32 timer_create compat_sys_timer_create
527 x32 mq_notify compat_sys_mq_notify
528 x32 kexec_load compat_sys_kexec_load
529 x32 waitid compat_sys_waitid
530 x32 set_robust_list compat_sys_set_robust_list
531 x32 get_robust_list compat_sys_get_robust_list
532 x32 vmsplice compat_sys_vmsplice
533 x32 move_pages compat_sys_move_pages
534 x32 preadv compat_sys_preadv64
535 x32 pwritev compat_sys_pwritev64
536 x32 rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo
537 x32 recvmmsg compat_sys_recvmmsg
538 x32 sendmmsg compat_sys_sendmmsg
539 x32 process_vm_readv compat_sys_process_vm_readv
540 x32 process_vm_writev compat_sys_process_vm_writev
541 x32 setsockopt compat_sys_setsockopt
542 x32 getsockopt compat_sys_getsockopt
543 x32 io_setup compat_sys_io_setup
544 x32 io_submit compat_sys_io_submit
545 x32 execveat compat_sys_execveat/ptregs
32 位系统调用表
上面的系统调用并不需要完全掌握, 可以慢慢来, 当需要记住的 64 位系统有 326 个系统调用.
load
execute
end, abort
create process (for example, fork on Unix-like systems, or NtCreateProcess in the Windows NT Native API)
terminate process
get/set process attributes
wait for time, wait event, signal event
allocate, free memory
create file, delete file
open, close
read, write, reposition
get/set file attributes
request device, release device
read, write, reposition
get/set device attributes
logically attach or detach devices
get/set time or date
get/set system data
get/set process, file, or device attributes
create, delete communication connection
send, receive messages
transfer status information
attach or detach remote devices
syscall 的初始化 syscall_init 在 cpu_init 中.
void syscall_init(void)
{
/*
* LSTAR and STAR live in a bit strange symbiosis.
* They both write to the same internal register. STAR allows to
* set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
*/
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
* This does not cause SYSENTER to jump to the wrong location, because
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
#else
wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif
/* Flags to clear on syscall */
wrmsrl(MSR_SYSCALL_MASK,
X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}
分析 TODO
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
* 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
* then loads new ss, cs, and rip from previously programmed MSRs.
* rflags gets masked by a value from another MSR (so CLD and CLAC
* are not needed). SYSCALL does not save anything on the stack
* and does not change rsp.
*
* Registers on entry:
* rax system call number
* rcx return address
* r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
* rdi arg0
* rsi arg1
* rdx arg2
* r10 arg3 (needs to be moved to rcx to conform to C ABI)
* r8 arg4
* r9 arg5
* (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
*
* Only called from user space.
*
* When user can change pt_regs->foo always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
*/
也许你很好奇上面寄存器的分配为什么是这样, 其实就是约定. 具体参考
这里
我们就以 write 为例
https://github.com/torvalds/linux/blob/master/fs/read_write.c
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
loff_t pos = file_pos_read(f.file);
ret = vfs_read(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput_pos(f);
}
return ret;
}
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
loff_t pos = file_pos_read(f.file);
ret = vfs_write(f.file, buf, count, &pos);
if (ret >= 0)
file_pos_write(f.file, pos);
fdput_pos(f);
}
return ret;
}
http://code.woboq.org/linux/linux/include/linux/syscalls.h.html
/* * __MAP - apply a macro to syscall arguments * __MAP(n, m, t1, a1, t2, a2, ..., tn, an) will expand to * m(t1, a1), m(t2, a2), ..., m(tn, an) * The first argument must be equal to the amount of type/name * pairs given. Note that this list of pairs (i.e. the arguments * of __MAP starting at the third one) is in the same format as * for SYSCALL_DEFINE<n>/COMPAT_SYSCALL_DEFINE<n> */
#define __MAP0(m,...)
#define __MAP1(m,t,a) m(t,a)
#define __MAP2(m,t,a,...) m(t,a), __MAP1(m,__VA_ARGS__)
#define __MAP3(m,t,a,...) m(t,a), __MAP2(m,__VA_ARGS__)
#define __MAP4(m,t,a,...) m(t,a), __MAP3(m,__VA_ARGS__)
#define __MAP5(m,t,a,...) m(t,a), __MAP4(m,__VA_ARGS__)
#define __MAP6(m,t,a,...) m(t,a), __MAP5(m,__VA_ARGS__)
#define __MAP(n,...) __MAP##n(__VA_ARGS__)
#define __SC_DECL(t, a) t a
#define __TYPE_IS_L(t) (__same_type((t)0, 0L))
#define __TYPE_IS_UL(t) (__same_type((t)0, 0UL))
#define __TYPE_IS_LL(t) (__same_type((t)0, 0LL) || __same_type((t)0, 0ULL))
#define __SC_LONG(t, a) __typeof(__builtin_choose_expr(__TYPE_IS_LL(t), 0LL, 0L)) a
#define __SC_CAST(t, a) (t) a
#define __SC_ARGS(t, a) a
#define __SC_TEST(t, a) (void)BUILD_BUG_ON_ZERO(!__TYPE_IS_LL(t) && sizeof(t) > sizeof(long))
#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
#define SYSCALL_DEFINEx(x, sname, ...) \
SYSCALL_METADATA(sname, x, __VA_ARGS__) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
#define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
__attribute__((alias(__stringify(SyS##name)))); \
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
} \
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
#ifdef CONFIG_FTRACE_SYSCALLS //跟踪系统调用
#define SYSCALL_METADATA(sname, nb, ...) \
static const char *types_##sname[] = { \
__MAP(nb,__SC_STR_TDECL,__VA_ARGS__) \
}; \
static const char *args_##sname[] = { \
__MAP(nb,__SC_STR_ADECL,__VA_ARGS__) \
}; \
SYSCALL_TRACE_ENTER_EVENT(sname); \
SYSCALL_TRACE_EXIT_EVENT(sname); \
static struct syscall_metadata __used \
__syscall_meta_##sname = { \
.name = "sys"#sname, \
.syscall_nr = -1, /* Filled in at boot */ \
.nb_args = nb, \
.types = nb ? types_##sname : NULL, \
.args = nb ? args_##sname : NULL, \
.enter_event = &event_enter_##sname, \
.exit_event = &event_exit_##sname, \
.enter_fields = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
}; \
static struct syscall_metadata __used \
__attribute__((section("__syscalls_metadata"))) \
*__p_syscall_meta_##sname = &__syscall_meta_##sname;
#else
#define SYSCALL_METADATA(sname, nb, ...)
#endif
http://code.woboq.org/linux/linux/include/linux/compiler.h.html
# define __user __attribute__((noderef, address_space(1))) //主要供 sparse 工具使用
由于 SYSCALL_DEFINEx 调用了 SYSCALL_METADATA 和 __SYSCALL_DEFINEx
由于 SYSCALL_METADATA 是跟踪系统调用的, 因此我们注意关注 __SYSCALL_DEFINEx
对于 __MAP 宏变化过程
其中 __MAP(3,__SC_DECL, unsigned int, fd, const char __user *, buf, size_t, count) 翻译为
unsigned int fd, const char __user * buf, size_t, count
其中 __MAP(3,__SC_LONG, unsigned int, fd, const char __user *, buf, size_t, count) 翻译为
__typeof(__builtin_choose_expr(__TYPE_IS_LL(unsigned int), 0LL, 0L)) fd \
__builtin_choose_expr(__TYPE_IS_LL(const char __user *), 0LL, 0L)) buf \
__builtin_choose_expr(__TYPE_IS_LL(size_t), 0LL, 0L)) count
因此, wirte 系统调用编译之后变为:
asmlinkage long sys_write(unsigned int fd, const char __user * buf, size_t, count)
__attribute__((alias(__stringify(SyS_write))));
static inline long SYSC_write(unsigned int fd, const char __user * buf, size_t, count);
//SyS_write 函数定义
asmlinkage long SyS_write(
__typeof(__builtin_choose_expr((__same_type((unsigned int)0, 0LL)
|| __same_type((unsigned int)0, 0ULL)),
0LL, 0L))
fd,
__typeof(__builtin_choose_expr((__same_type((const char __user *)0, 0LL)
|| __same_type((const char __user *)0, 0ULL)),
0LL, 0L))
buf,
__typeof(__builtin_choose_expr((__same_type((size_t)0, 0LL)
|| __same_type((size_t)0, 0ULL)),
0LL, 0L))
count
);
//SyS_write 函数定义
asmlinkage long SyS_write(
__typeof(__builtin_choose_expr((__same_type((unsigned int)0, 0LL)
|| __same_type((unsigned int)0, 0ULL)),
0LL, 0L))
fd,
__typeof(__builtin_choose_expr((__same_type((const char __user *)0, 0LL)
|| __same_type((const char __user *)0, 0ULL)),
0LL, 0L))
buf,
__typeof(__builtin_choose_expr((__same_type((size_t)0, 0LL)
|| __same_type((size_t)0, 0ULL)),
0LL, 0L))
count
)
{
long ret = SYSC_write(
(unsigned int) fd,
(const char __user *) buf,
(size_t) count
);
(void)BUILD_BUG_ON_ZERO(
!(__same_type((unsigned int)0, 0LL) || __same_type((unsigned int)0, 0ULL))
&& sizeof(unsigned int) > sizeof(long)),
(void)BUILD_BUG_ON_ZERO(
!__same_type((const char __user *)0, 0LL) || __same_type((const char __user *)0, 0ULL))
&& sizeof(const char __user *) > sizeof(long)),
(void)BUILD_BUG_ON_ZERO(
!__same_type((size_t)0, 0LL) || __same_type((size_t)0, 0ULL))
&& sizeof(size_t) > sizeof(long)),
asmlinkage_protect(3, ret, fd, buf, count)
return ret;
}
static inline long SYSC_write(unsigned int fd, const char __user * buf, size_t, count)
//之前是函数头的解析后的结果, 下面是函数体
{
/* * struct files_struct files; * v = (unsigned long) files ->fdt->fd[fd] * struct fd f = (struct fd){(struct file *)v &~3, v&3 } * * struct fd { * struct file *file; * unsigned int flags; * }; */
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
//获取当前文件的位置 : pos = file->f_pos;
loff_t pos = file_pos_read(f.file);
//将 buf 内容 count 个 byte 从 pos 开始写入 f.file. 详细待后续文件系统部分分析.
ret = vfs_write(f.file, buf, count, &pos);
//如果写入文件字节数大于等于 0
if (ret >= 0)
//更改当前文件的 pos: file->f_pos = pos;
file_pos_write(f.file, pos);
//解锁, 实际调用 fput, 待后续文件系统部分分析.
fdput_pos(f);
}
return ret;
}
至此, write 系统调用分析完成, 细节见注释.
当然你也可以根据自己的偏好分析其中一个系统调用的具体实现, 相信付出是高回报的.
注: 当一个函数多余 6 个参数, 其余参数将被放在栈
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
int main(int argc, char **argv)
{
const char *str = "hello systemcall";
size_t len = strlen(str) + 1;
ssize_t ret = 1;
int fd = open("tmp1", O_WRONLY | O_CREAT | O_APPEND);
printf("write %d return %lu\n", fd, ret);
//write(fd, str, len);
asm volatile (
"mov %1,%%edi\n\t" //fd 传递给第一个参数
"movq %2,%%rsi\n\t" //str 地址传递给第二个参数
"movq %3,%%rdx\n\t" //len 传递给第三个参数
"movq $1,%%rax\n\t" //系统调用号 1
"syscall\n\t"
"movq %%rax, %0\n\t" //将write 返回结果给 ret
:"=p"(ret)
:"b"(fd), "p"(str), "d"(len)
);
printf("write return %lu\n", ret);
return 0;
}
$ gcc -o test test.c
$ ./test
write 3 return 4195536
write return 17
$ ltrace ./test
__libc_start_main(0x4005bd, 1, 0x7fff40ba1148, 0x400650 <unfinished ...>
strlen("hello systemcall") = 16
open("tmp", 1024, 04200000) = 3
+++ exited (status 0) +++
$ ltrace ./test
...
__libc_start_main(0x4005bd, 1, 0x7fff15fa4358, 0x400650 <unfinished ...>
strlen("hello systemcall") = 16
open("tmp", 1024, 04200000 <unfinished ...>
SYS_open("tmp", 1024, 04200000) = 3
<... open resumed> ) = 3
SYS_write(3, "hello systemcall", 17 <no return ...>
+++ exited (status 0) +++
$ strace ./test
open("tmp1", O_WRONLY|O_CREAT|O_APPEND, 0200000020400000) = 3
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 10), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f34c0174000
write(1, "write 3 return 4195536\n", 23write 3 return 4195536
) = 23
write(3, "hello systemcall\n", 17) = 17
write(1, "write return 17\n", 16write return 17
) = 16
exit_group(0) = ?
+++ exited with 0 +++
strace 是查看系统调用的利器, 既可以查看正在调用的进程的系统调用, 也可以看正在运行的程序
正在执行的系统调用.
ltrace−Sparam//查看即将执行的程序param涉及的系统调用 ltrace -p -S param //查看即将执行的程序 param 涉及的系统调用
straceparam//查看即将执行的程序param涉及的系统调用 strace -p PID //查看进程号 PID 正在执行的系统调用
更多参考 man strace man lstrace
系统调用在系统中无处不在, 我们知道在 /proc 文件系统保持了进程信息.
比如 /proc/1 代表进程号 1 的进程信息.
$ cat /proc/1/comm
init
$ sudo cat /proc/1/syscall
23 0x1d 0x7fff4930fa40 0x7fff4930fac0 0x7fff4930fb40 0x0 0x7fbeb3e03420 0x7fff4930f7e8 0x7fbeb2093d83
其中 23 就是系统调用号. 想知道其他进程正在执行的系统调用, 是不是很容易了:)
操作系统为用户态进程与硬件设备进行交互提供了一组接口——系统调用
应用编程接口 (application program interface, API) 和系统调用是不同的
Libc库定义的一些API引用了封装例程(wrapper routine,唯一目的就是发布系统调用)
不是每个API都对应一个特定的系统调用
返回值
此外, 系统调用必须尽量快, 因此必须尽量小, 而标准库负责参数校验等工作.
当用户态进程调用一个系统调用时,CPU切换到内核态并开始执行一个内核函数。
在Linux中是通过执行int $0x80来执行系统调用的, 这条汇编指令产生向量为
128的编程异常
https://0xax.gitbooks.io/linux-insides/content/SysCall/syscall-1.html
https://en.wikipedia.org/wiki/System_call