1. 比如,如果希望了解在执行pwd命令时都调用了哪些系统调用,可以使用下面的命令:
$strace pwd
chenliang@test:~$ strace pwd
execve("/bin/pwd", ["pwd"], [/* 21 vars */]) = 0
brk(0) = 0x8733000
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
mmap2(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb77b6000
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=97914, ...}) = 0
mmap2(NULL, 97914, PROT_READ, MAP_PRIVATE, 3, 0) = 0xb779e000
close(3) = 0
access("/etc/ld.so.nohwcap", F_OK) = -1 ENOENT (No such file or directory)
open("/lib/i386-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\340\233\1\0004\0\0\0"..., 512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=1758972, ...}) = 0
mmap2(NULL, 1763964, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0xb75ef000
mmap2(0xb7798000, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1a9000) = 0xb7798000
mmap2(0xb779b000, 10876, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0xb779b000
close(3) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb75ee000
set_thread_area({entry_number:-1 -> 6, base_addr:0xb75ee940, limit:1048575, seg_32bit:1, contents:0, read_exec_only:0, limit_in_pages:1, seg_not_present:0, useable:1}) = 0
mprotect(0xb7798000, 8192, PROT_READ) = 0
mprotect(0x804e000, 4096, PROT_READ) = 0
mprotect(0xb77d9000, 4096, PROT_READ) = 0
munmap(0xb779e000, 97914) = 0
brk(0) = 0x8733000
brk(0x8754000) = 0x8754000
open("/usr/lib/locale/locale-archive", O_RDONLY|O_LARGEFILE|O_CLOEXEC) = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=8752496, ...}) = 0
mmap2(NULL, 2097152, PROT_READ, MAP_PRIVATE, 3, 0) = 0xb73ee000
close(3) = 0
getcwd("/home/chenliang", 4096) = 16
fstat64(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 15), ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb77b5000
write(1, "/home/chenliang\n", 16/home/chenliang
) = 16
close(1) = 0
munmap(0xb77b5000, 4096) = 0
close(2) = 0
exit_group(0) = ?
2. 对于export出来的内核函数,可以使用ksyms命令或通过/proc/ksyms文件查看(在2.6及之后的内核中,该文件已被/proc/kallsyms取代,下面的例子使用的就是/proc/kallsyms)。
[root@szclou /]#cat /proc/kallsyms
c0008400 T asm_do_IRQ
c0008400 T _stext
c0008400 T __exception_text_start
c0008414 T do_undefinstr
c00085d4 T do_PrefetchAbort
c0008674 T do_DataAbort
c0008714 T __exception_text_end
c0008718 t __do_fixup_smp_on_up
c000872c T fixup_smp
c0008744 t __fixup_a_pv_table
c0008768 T fixup_pv_table
c000878c T do_one_initcall
c00088dc T name_to_dev_t
c0008c78 t match_dev_by_uuid
c0008cac T SetRoundingMode
c0008ce8 T SetRoundingPrecision
c0008cfc T EmulateAll
c0008d54 T nwfpe_init_fpa
c0008db4 T EmulateCPDO
c0008f54 T PerformLDF
c000908c T PerformLFM
c0009220 T PerformSFM
c000938c T PerformSTF
c000953c T EmulateCPDT
......
[root@szclou /]#cat /proc/kallsyms
c0008400 T asm_do_IRQ
c0008400 T _stext
c0008400 T __exception_text_start
c0008414 T do_undefinstr
c00085d4 T do_PrefetchAbort
c0008674 T do_DataAbort
c0008714 T __exception_text_end
c0008718 t __do_fixup_smp_on_up
c000872c T fixup_smp
c0008744 t __fixup_a_pv_table
c0008768 T fixup_pv_table
c000878c T do_one_initcall
c00088dc T name_to_dev_t
c0008c78 t match_dev_by_uuid
c0008cac T SetRoundingMode
c0008ce8 T SetRoundingPrecision
c0008cfc T EmulateAll
c0008d54 T nwfpe_init_fpa
c0008db4 T EmulateCPDO
c0008f54 T PerformLDF
c000908c T PerformLFM
c0009220 T PerformSFM
c000938c T PerformSTF
c000953c T EmulateCPDT
......
3. 系统调用表sys_call_table存储了所有系统调用对应的服务例程的函数地址,在arch/i386/kernel/syscall_table.S文件中被定义
其他体系结构也有各自的系统调用表,例如avr32体系结构的系统调用表(arch/avr32/kernel/syscall_table.S)如下:
/*
* AVR32 system call table
*
* Copyright (C) 2004-2006 Atmel Corporation
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
.section .rodata,"a",@progbits
.type sys_call_table,@object
.global sys_call_table
.align 2
sys_call_table:
.long sys_restart_syscall
.long sys_exit
.long sys_fork
.long sys_read
.long sys_write
.long sys_open /* 5 */
.long sys_close
.long sys_umask
.long sys_creat
.long sys_link
.long sys_unlink /* 10 */
.long sys_execve
.long sys_chdir
.long sys_time
.long sys_mknod
.long sys_chmod /* 15 */
.long sys_chown
.long sys_lchown
.long sys_lseek
.long sys_llseek
.long sys_getpid /* 20 */
.long sys_mount
.long sys_umount
.long sys_setuid
.long sys_getuid
.long sys_stime /* 25 */
.long sys_ptrace
.long sys_alarm
.long sys_pause
.long sys_utime
.long sys_newstat /* 30 */
.long sys_newfstat
.long sys_newlstat
.long sys_access
.long sys_chroot
.long sys_sync /* 35 */
.long sys_fsync
.long sys_kill
.long sys_rename
.long sys_mkdir
.long sys_rmdir /* 40 */
.long sys_dup
.long sys_pipe
.long sys_times
.long sys_clone
.long sys_brk /* 45 */
.long sys_setgid
.long sys_getgid
.long sys_getcwd
.long sys_geteuid
.long sys_getegid /* 50 */
.long sys_acct
.long sys_setfsuid
.long sys_setfsgid
.long sys_ioctl
.long sys_fcntl /* 55 */
.long sys_setpgid
.long sys_mremap
.long sys_setresuid
.long sys_getresuid
.long sys_setreuid /* 60 */
.long sys_setregid
.long sys_ustat
.long sys_dup2
.long sys_getppid
.long sys_getpgrp /* 65 */
.long sys_setsid
.long sys_rt_sigaction
.long __sys_rt_sigreturn
.long sys_rt_sigprocmask
.long sys_rt_sigpending /* 70 */
.long sys_rt_sigtimedwait
.long sys_rt_sigqueueinfo
.long __sys_rt_sigsuspend
.long sys_sethostname
.long sys_setrlimit /* 75 */
.long sys_getrlimit
.long sys_getrusage
.long sys_gettimeofday
.long sys_settimeofday
.long sys_getgroups /* 80 */
.long sys_setgroups
.long sys_select
.long sys_symlink
.long sys_fchdir
.long sys_readlink /* 85 */
.long sys_pread64
.long sys_pwrite64
.long sys_swapon
.long sys_reboot
.long __sys_mmap2 /* 90 */
.long sys_munmap
.long sys_truncate
.long sys_ftruncate
.long sys_fchmod
.long sys_fchown /* 95 */
.long sys_getpriority
.long sys_setpriority
.long sys_wait4
.long sys_statfs
.long sys_fstatfs /* 100 */
.long sys_vhangup
.long sys_sigaltstack
.long sys_syslog
.long sys_setitimer
.long sys_getitimer /* 105 */
.long sys_swapoff
.long sys_sysinfo
.long sys_ni_syscall /* was sys_ipc briefly */
.long sys_sendfile
.long sys_setdomainname /* 110 */
.long sys_newuname
.long sys_adjtimex
.long sys_mprotect
.long sys_vfork
.long sys_init_module /* 115 */
.long sys_delete_module
.long sys_quotactl
.long sys_getpgid
.long sys_bdflush
.long sys_sysfs /* 120 */
.long sys_personality
.long sys_ni_syscall /* reserved for afs_syscall */
.long sys_getdents
.long sys_flock
.long sys_msync /* 125 */
.long sys_readv
.long sys_writev
.long sys_getsid
.long sys_fdatasync
.long sys_sysctl /* 130 */
.long sys_mlock
.long sys_munlock
.long sys_mlockall
.long sys_munlockall
.long sys_sched_setparam /* 135 */
.long sys_sched_getparam
.long sys_sched_setscheduler
.long sys_sched_getscheduler
.long sys_sched_yield
.long sys_sched_get_priority_max /* 140 */
.long sys_sched_get_priority_min
.long sys_sched_rr_get_interval
.long sys_nanosleep
.long sys_poll
.long sys_ni_syscall /* 145 was nfsservctl */
.long sys_setresgid
.long sys_getresgid
.long sys_prctl
.long sys_socket
.long sys_bind /* 150 */
.long sys_connect
.long sys_listen
.long sys_accept
.long sys_getsockname
.long sys_getpeername /* 155 */
.long sys_socketpair
.long sys_send
.long sys_recv
.long __sys_sendto
.long __sys_recvfrom /* 160 */
.long sys_shutdown
.long sys_setsockopt
.long sys_getsockopt
.long sys_sendmsg
.long sys_recvmsg /* 165 */
.long sys_truncate64
.long sys_ftruncate64
.long sys_stat64
.long sys_lstat64
.long sys_fstat64 /* 170 */
.long sys_pivot_root
.long sys_mincore
.long sys_madvise
.long sys_getdents64
.long sys_fcntl64 /* 175 */
.long sys_gettid
.long sys_readahead
.long sys_setxattr
.long sys_lsetxattr
.long sys_fsetxattr /* 180 */
.long sys_getxattr
.long sys_lgetxattr
.long sys_fgetxattr
.long sys_listxattr
.long sys_llistxattr /* 185 */
.long sys_flistxattr
.long sys_removexattr
.long sys_lremovexattr
.long sys_fremovexattr
.long sys_tkill /* 190 */
.long sys_sendfile64
.long sys_futex
.long sys_sched_setaffinity
.long sys_sched_getaffinity
.long sys_capget /* 195 */
.long sys_capset
.long sys_io_setup
.long sys_io_destroy
.long sys_io_getevents
.long sys_io_submit /* 200 */
.long sys_io_cancel
.long sys_fadvise64
.long sys_exit_group
.long sys_lookup_dcookie
.long sys_epoll_create /* 205 */
.long sys_epoll_ctl
.long sys_epoll_wait
.long sys_remap_file_pages
.long sys_set_tid_address
.long sys_timer_create /* 210 */
.long sys_timer_settime
.long sys_timer_gettime
.long sys_timer_getoverrun
.long sys_timer_delete
.long sys_clock_settime /* 215 */
.long sys_clock_gettime
.long sys_clock_getres
.long sys_clock_nanosleep
.long sys_statfs64
.long sys_fstatfs64 /* 220 */
.long sys_tgkill
.long sys_ni_syscall /* reserved for TUX */
.long sys_utimes
.long sys_fadvise64_64
.long sys_cacheflush /* 225 */
.long sys_ni_syscall /* sys_vserver */
.long sys_mq_open
.long sys_mq_unlink
.long sys_mq_timedsend
.long sys_mq_timedreceive /* 230 */
.long sys_mq_notify
.long sys_mq_getsetattr
.long sys_kexec_load
.long sys_waitid
.long sys_add_key /* 235 */
.long sys_request_key
.long sys_keyctl
.long sys_ioprio_set
.long sys_ioprio_get
.long sys_inotify_init /* 240 */
.long sys_inotify_add_watch
.long sys_inotify_rm_watch
.long sys_openat
.long sys_mkdirat
.long sys_mknodat /* 245 */
.long sys_fchownat
.long sys_futimesat
.long sys_fstatat64
.long sys_unlinkat
.long sys_renameat /* 250 */
.long sys_linkat
.long sys_symlinkat
.long sys_readlinkat
.long sys_fchmodat
.long sys_faccessat /* 255 */
.long __sys_pselect6
.long sys_ppoll
.long sys_unshare
.long sys_set_robust_list
.long sys_get_robust_list /* 260 */
.long __sys_splice
.long __sys_sync_file_range
.long sys_tee
.long sys_vmsplice
.long __sys_epoll_pwait /* 265 */
.long sys_msgget
.long sys_msgsnd
.long sys_msgrcv
.long sys_msgctl
.long sys_semget /* 270 */
.long sys_semop
.long sys_semctl
.long sys_semtimedop
.long sys_shmat
.long sys_shmget /* 275 */
.long sys_shmdt
.long sys_shmctl
.long sys_utimensat
.long sys_signalfd
.long sys_ni_syscall /* 280, was sys_timerfd */
.long sys_eventfd
.long sys_recvmmsg
.long sys_setns
.long sys_ni_syscall /* r8 is saturated at nr_syscalls */
......
4. 内核提供的系统调用数目非常有限,到2.6.23版本的内核也不过325个,使用“man 2 syscalls”命令即可浏览所有系统调用。
chenliang@test:/proc$ ^C
chenliang@test:/proc$ man 2 syscalls
SYSCALLS(2) Linux Programmer's Manual SYSCALLS(2)
NAME
syscalls - Linux system calls
SYNOPSIS
Linux system calls.
DESCRIPTION
The system call is the fundamental interface between an application and the Linux kernel.
System calls and library wrapper functions
System calls are generally not invoked directly, but rather via wrapper functions in glibc (or perhaps some other library). For details of direct invocation of a
system call, see intro(2). Often, but not always, the name of the wrapper function is the same as the name of the system call that it invokes. For example, glibc
contains a function truncate() which invokes the underlying "truncate" system call.
Often the glibc wrapper function is quite thin, doing little work other than copying arguments to the right registers before invoking the system call, and then set‐
ting errno appropriately after the system call has returned. (These are the same steps that are performed by syscall(2), which can be used to invoke system calls
for which no wrapper function is provided.) Note: system calls indicate a failure by returning a negative error number to the caller; when this happens, the wrapper
function negates the returned error number (to make it positive), copies it to errno, and returns -1 to the caller of the wrapper.
Sometimes, however, the wrapper function does some extra work before invoking the system call. For example, nowadays there are (for reasons described below) two
related system calls, truncate(2) and truncate64(2), and the glibc truncate() wrapper function checks which of those system calls are provided by the kernel and
determines which should be employed.
System call list
Below is a list of the Linux system calls. In the list, the Kernel column indicates the kernel version for those system calls that were new in Linux 2.2, or have
appeared since that kernel version. Note the following points:
* Where no kernel version is indicated, the system call appeared in kernel 1.0 or earlier.
* Where a system call is marked "1.2" this means the system call probably appeared in a 1.1.x kernel version, and first appeared in a stable kernel with 1.2.
(Development of the 1.2 kernel was initiated from a branch of kernel 1.0.6 via the 1.1.x unstable kernel series.)
* Where a system call is marked "2.0" this means the system call probably appeared in a 1.3.x kernel version, and first appeared in a stable kernel with 2.0.
(Development of the 2.0 kernel was initiated from a branch of kernel 1.2.x, somewhere around 1.2.10, via the 1.3.x unstable kernel series.)
* Where a system call is marked "2.2" this means the system call probably appeared in a 2.1.x kernel version, and first appeared in a stable kernel with 2.2.0.
(Development of the 2.2 kernel was initiated from a branch of kernel 2.0.21 via the 2.1.x unstable kernel series.)
* Where a system call is marked "2.4" this means the system call probably appeared in a 2.3.x kernel version, and first appeared in a stable kernel with 2.4.0.
(Development of the 2.4 kernel was initiated from a branch of kernel 2.2.8 via the 2.3.x unstable kernel series.)
* Where a system call is marked "2.6" this means the system call probably appeared in a 2.5.x kernel version, and first appeared in a stable kernel with 2.6.0.
(Development of kernel 2.6 was initiated from a branch of kernel 2.4.15 via the 2.5.x unstable kernel series.)
* Starting with kernel 2.6.0, the development model changed, and new system calls may appear in each 2.6.x release. In this case, the exact version number where
the system call appeared is shown. This convention continues with the 3.x kernel series, which followed on from kernel 2.6.39.
* In some cases, a system call was added to a stable kernel series after it branched from the previous stable kernel series, and then backported into the earlier
stable kernel series. For example some system calls that appeared in 2.6.x were also backported into a 2.4.x release after 2.4.15. When this is so, the version
where the system call appeared in both of the major kernel series is listed.
The list of system calls that are available as at kernel 3.9 (or in a few cases only on older kernels) is as follows:
System call Kernel Notes
────────────────────────────────────────────────────────────────────────────
_llseek(2) 1.2
_newselect(2) 2.0
_sysctl(2) 2.0
accept(2) 2.0 See notes on socketcall(2)
accept4(2) 2.6.28
access(2) 1.0
acct(2) 1.0
add_key(2) 2.6.11
adjtimex(2) 1.0
alarm(2) 1.0
alloc_hugepages(2) 2.5.36 Removed in 2.5.44
bdflush(2) 1.2 Deprecated (does nothing)
since 2.6
bind(2) 2.0 See notes on socketcall(2)
brk(2) 1.0
cacheflush(2) 1.2 Not on x86
capget(2) 2.2
capset(2) 2.2
chdir(2) 1.0
chmod(2) 1.0
chown(2) 2.2 See chown(2) for
version details
chown32(2) 2.4
chroot(2) 1.0
clock_adjtime(2) 2.6.39
clock_getres(2) 2.6
clock_gettime(2) 2.6
clock_nanosleep(2) 2.6
clock_settime(2) 2.6
clone(2) 1.0
close(2) 1.0
connect(2) 2.0 See notes on socketcall(2)
creat(2) 1.0
create_module(2) Removed in 2.6
delete_module(2) 1.0
dup(2) 1.0
dup2(2) 1.0
dup3(2) 2.6.27
epoll_create(2) 2.6
epoll_create1(2) 2.6.27
epoll_ctl(2) 2.6
epoll_pwait(2) 2.6.19
epoll_wait(2) 2.6
eventfd(2) 2.6.22
eventfd2(2) 2.6.27
execve(2) 1.0
exit(2) 1.0
exit_group(2) 2.6
faccessat(2) 2.6.16
fadvise64(2) 2.6
fadvise64_64(2) 2.6
fallocate(2) 2.6.23
fanotify_init(2) 2.6.37
fanotify_mark(2) 2.6.37
fchdir(2) 1.0
fchmod(2) 1.0
fchmodat(2) 2.6.16
fchown(2) 1.0
fchown32(2) 2.4
fchownat(2) 2.6.16
fcntl(2) 1.0
fcntl64(2) 2.4
fdatasync(2) 2.0
fgetxattr(2) 2.6; 2.4.18
finit_module(2) 3.8
flistxattr(2) 2.6; 2.4.18
flock(2) 2.0
fork(2) 1.0
free_hugepages(2) 2.5.36 Removed in 2.5.44
fremovexattr(2) 2.6; 2.4.18
fsetxattr(2) 2.6; 2.4.18
fstat(2) 1.0
fstat64(2) 2.4
fstatat64(2) 2.6.16
...
5. 既然系统调用表集中存放了所有系统调用服务例程的地址,那么系统调用在内核中的执行就可以转化为从该表获取对应的服务例程并执行的过程。这个过程中一个很重要的环节就是系统调用号。每个系统调用都拥有一个独一无二的系统调用号,用户应用通过它,而不是系统调用的名称,来指明要执行哪个系统调用。
系统调用号的定义在include/asm-i386/unistd.h文件。
008 #define __NR_restart_syscall 0
009 #define __NR_exit 1
010 #define __NR_fork 2
011 #define __NR_read 3
6. 系统调用最终由系统调用服务例程完成明确的操作。所有的系统调用服务例程集中声明在include/linux/syscalls.h文件,但分散定义在很多不同的文件。比如getpid系统调用用于获取当前进程的PID,它的服务例程sys_getpid在kernel/timer.c文件中定义为:
954 asmlinkage long sys_getpid(void)
955 {
956 return current->tgid;
957 }
除了都具有“sys_”前缀之外,从上面的例子还可以看到,服务例程的定义使用了asmlinkage标记,并且返回值类型为long。
7. 如图5.2所示,用户应用可以通过两种方式使用系统调用。第一种方式是通过C库函数,包括系统调用在C库中的封装函数和其他普通函数。
[插图]
图5.2 使用系统调用的两种方式
第二种方式是使用_syscall宏。2.6.18版本之前的内核,在include/asm-i386/unistd.h文件中定义有7个_syscall宏,分别是:
_syscall0(type,name)
_syscall1(type,name,type1,arg1)
_syscall2(type,name,type1,arg1,type2,arg2)
_syscall3(type,name,type1,arg1,type2,arg2,type3,arg3)
_syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4)
_syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5)
_syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4,type5,arg5,type6,arg6)
其中,type表示所生成系统调用的返回值类型,name表示该系统调用的名称,typeN、argN分别表示第N个参数的类型和名称,它们的数目和_syscall后面的数字一样大。这些宏的作用是创建名为name的函数,_syscall后面跟的数字指明了该函数的参数的个数。
比如sysinfo系统调用用于获取系统总体统计信息,使用_syscall宏定义为:
_syscall1(int, sysinfo, struct sysinfo *, info);
展开后的形式为:
int sysinfo(struct sysinfo * info)
{
long __res;
__asm__ volatile("int $0x80" : "=a" (__res) : "0" (116),"b" ((long)(info)));
do {
if ((unsigned long)(__res) >= (unsigned long)(-(128 + 1))) {
errno = -(__res);
__res = -1;
}
return (int) (__res);
} while (0);
}
8. 但是自2.6.19版本开始,_syscall宏被废除,我们需要使用syscall函数,通过指定系统调用号和一组参数来调用系统调用。
syscall函数原型为:
int syscall(int number, ...);
其中number是系统调用号,number后面应顺序接上该系统调用的所有参数。下面是gettid系统调用的调用实例。
代码清单5.2 gettid系统调用使用实例
00 #include <sys/types.h>
01 #include <sys/syscall.h>
02 #include <unistd.h>
03
04 #define __NR_gettid 224
05
06 int main(int argc, char *argv[])
07 {
08 pid_t tid;
09
10 tid = syscall(__NR_gettid);
11 }
大部分系统调用都包括了一个SYS_符号常量来指定自己到系统调用号的映射,因此上面第10行可重写为:
tid = syscall(SYS_gettid);
9. 当我们在shell终端下输入“echo hello”命令时,将会要求shell进程执行一个可执行文件echo,参数为“hello”。当shell进程接收到命令之后,先在/bin目录下找到echo文件(我们可以使用which命令获得命令所在的位置),然后创建一个子进程去执行/bin/echo,并将参数传递给它,而这个子进程从shell进程继承了3个标准输入/输出文件,即stdin、stdout和stderr,文件号分别为0、1、2。它的工作很简单,就是将参数“hello”写到stdout文件中,通常都是我们的屏幕上。
但是如果我们将命令改成“echo hello > txt”,则在执行时输出将会被重定向到磁盘文件txt中。假定之前该shell进程只有上述3个标准文件打开,则该命令将按如下序列执行。
(1)打开或创建文件txt,如果txt中原来有内容,则清除原来的内容,其文件号为3。
(2)通过dup系统调用复制文件stdout的相关数据结构到文件号4。
(3)关闭stdout,但是由于4号文件也同时引用stdout,所以stdout文件并未真正关闭,只是腾出1号文件号位置。
(4)通过dup系统调用,复制3号文件(即文件txt),由于1号文件关闭,其位置空缺,故3号文件被复制到1号,即进程中原来指向stdout的指针指向了txt。
(5)通过系统调用fork和exec创建子进程并执行echo,子进程在执行echo前关闭3号和4号文件,只留下0、1、2三个文件,请注意,这时的1号文件已经不是stdout而是文件txt了。当echo想向stdout文件写入“hello”时自然就写入到了txt中。
(6)回到shell进程后,关闭指向txt的1号与3号文件,再用dup和close系统调用将4号文件复制回1号位置并关闭4号文件,从而恢复stdout,这样shell就恢复了0、1、2三个标准输入/输出文件。
10. sys_reboot
Linux下有关关机与重启的命令主要有shutdown、reboot、halt、poweroff、telinit和init。它们都可以达到关机或重启的目的,但是每个命令的工作流程并不一样。
这些命令并不都是互相独立的,比如,poweroff、reboot即是halt的符号链接,但是它们最终都是通过reboot系统调用来完成关机或重启操作。
reboot系统调用的服务例程为sys_reboot。
11. 如何实现一个新的系统调用
为Linux添加新的系统调用是件相对容易的事情,主要包括有4个步骤:编写系统调用服务例程;添加系统调用号;修改系统调用表;重新编译内核并测试新添加的系统调用。
下面以一个并无实际用处的hello系统调用为例,来演示上述几个步骤。
(1)编写系统调用服务例程。
遵循前面所述的几个原则,hello系统调用的服务例程实现为:
01 asmlinkage long sys_hello(void)
02 {
03 printk("Hello!\n");
04 return 0;
05 }
通常,应该为新的系统调用服务例程创建一个新的文件进行存放,但也可以将其定义在其他文件之中并加上注释做必要说明。同时,还要在include/linux/syscalls.h文件中添加原型声明:
asmlinkage long sys_hello(void);
sys_hello函数非常简单,仅仅打印一条语句,并没有使用任何参数。如果我们希望hello系统调用不仅能打印“hello!”欢迎信息,还能够打印出我们传递过去的名称,或者其他的一些描述信息,则sys_hello函数可以实现为:
01 asmlinkage long sys_hello(const char __user *_name)
02 {
03 char *name;
04 long ret;
05
06 name = strndup_user(_name, PAGE_SIZE);
07 if (IS_ERR(name)) {
08 ret = PTR_ERR(name);
09 goto error;
10 }
11
12 printk("Hello, %s!\n", name);
13 return 0;
14 error:
15 return ret;
16 }
第二个sys_hello函数使用了一个参数,在这种有参数传递发生的情况下,编写系统调用服务例程时必须仔细检查所有的参数是否合法有效。因为系统调用在内核空间执行,如果不加限制任由用户应用传递输入进入内核,则系统的安全与稳定将受到影响。
参数检查中最重要的一项就是检查用户应用提供的用户空间指针是否有效。比如上述sys_hello函数参数为char类型指针,并且使用了__user标记进行修饰。__user标记表示所修饰的指针为用户空间指针,不能在内核空间直接引用,原因主要如下。
● 用户空间指针在内核空间可能是无效的。
● 用户空间的内存是分页的,可能引起页错误。
● 如果直接引用能够成功,就相当于用户空间可以直接访问内核空间,产生安全问题。
因此,为了能够完成必须的检查,以及在用户空间和内核空间之间安全地传送数据,就需要使用内核提供的函数。比如在sys_hello函数的第6行,就使用了内核提供的strndup_user函数(在mm/util.c文件中定义)从用户空间复制字符串name的内容。
(2)添加系统调用号。
每个系统调用都会拥有一个独一无二的系统调用号,所以接下来需要更新include/asm-i386/unistd.h文件,为hello系统调用添加一个系统调用号。
328 #define __NR_utimensat 320
329 #define __NR_signalfd 321
330 #define __NR_timerfd 322
331 #define __NR_eventfd 323
332 #define __NR_fallocate 324
333 #define __NR_hello 325 /* 分配hello系统调用号为325 */
334
335 #ifdef __KERNEL__
336
337 #define NR_syscalls 326 /* 将系统调用数目加1修改为326 */
(3)修改系统调用表。
为了让系统调用处理程序system_call函数能够找到hello系统调用,我们还需要修改系统调用表sys_call_table,放入服务例程sys_hello函数的地址。
322 .long sys_utimensat /* 320 */
323 .long sys_signalfd
324 .long sys_timerfd
325 .long sys_eventfd
326 .long sys_fallocate
327 .long sys_hello /* hello系统调用服务例程 */
新的系统调用hello的服务例程被添加到了sys_call_table的末尾。我们可以注意到,sys_call_table每隔5个表项就会有一个注释,表明该项的系统调用号,这个好习惯可以在查找系统调用对应的系统调用号时提供方便。
(4)重新编译内核并测试。
为了能够使用新添加的系统调用,需要重新编译内核,并使用新内核重新引导系统。然后,我们还需要编写测试程序对新的系统调用进行测试。针对hello系统调用的测试程序如下:
00 #include <sys/types.h>
01 #include <sys/syscall.h>
02 #include <unistd.h>
03
04 #define __NR_hello 325
05
06 int main(int argc, char *argv[])
07 {
08 syscall(__NR_hello);
09 return 0;
10 }
然后使用gcc编译并执行:
$gcc -o hello hello.c
$./hello
Hello!
由执行结果可见,系统调用添加成功。需要注意的是,printk的输出写入内核日志,如果终端上没有直接显示“Hello!”,可以使用dmesg命令查看。