1.asm.s
1 /*
2 * linux/kernel/asm.s
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 /*
8 * asm.s contains the low-level code for most hardware faults.
9 * page_exception is handled by the mm, so that isn't here. This
10 * file also handles (hopefully) fpu-exceptions due to TS-bit, as
11 * the fpu must be properly saved/resored. This hasn't been tested.
12 */
14 .globl _divide_error,_debug,_nmi,_int3,_overflow,_bounds,_invalid_op
15 .globl _double_fault,_coprocessor_segment_overrun
16 .globl _invalid_TSS,_segment_not_present,_stack_segment
17 .globl _general_protection,_coprocessor_error,_irq13,_reserved
21 no_error_code:
22 xchgl %eax,(%esp)
23 pushl %ebx
24 pushl %ecx
25 pushl %edx
26 pushl %edi
27 pushl %esi
28 pushl %ebp
29 push %ds
30 push %es
31 push %fs
32 pushl $0 # "error code"
33 lea 44(%esp),%edx
34 pushl %edx
35 movl $0x10,%edx
36 mov %dx,%ds
37 mov %dx,%es
38 mov %dx,%fs
39 call *%eax
40 addl $8,%esp
41 pop %fs
42 pop %es
43 pop %ds
44 popl %ebp
45 popl %esi
46 popl %edi
47 popl %edx
48 popl %ecx
49 popl %ebx
50 popl %eax
51 iret
。。。
65 _overflow:
66 pushl $_do_overflow
67 jmp no_error_code
。。。
99 error_code:
100 xchgl %eax,4(%esp) # error code <-> %eax
101 xchgl %ebx,(%esp) # &function <-> %ebx
102 pushl %ecx
103 pushl %edx
104 pushl %edi
105 pushl %esi
106 pushl %ebp
107 push %ds
108 push %es
109 push %fs
110 pushl %eax # error code
111 lea 44(%esp),%eax # offset
112 pushl %eax
113 movl $0x10,%eax
114 mov %ax,%ds
115 mov %ax,%es
116 mov %ax,%fs
117 call *%ebx
118 addl $8,%esp
119 pop %fs
120 pop %es
121 pop %ds
122 popl %ebp
123 popl %esi
124 popl %edi
125 popl %edx
126 popl %ecx
127 popl %ebx
128 popl %eax
129 iret
其他内容不再贴出,因为格式都是一样的。
调用过程是通过注册中断处理函数来实现的。
2.trap.c
1 /*
2 * linux/kernel/traps.c
3 *
4 * (C) 1991 Linus Torvalds
5 */
6
7 /*
8 * 'Traps.c' handles hardware traps and faults after we have saved some
9 * state in 'asm.s'. Currently mostly a debugging-aid, will be extended
10 * to mainly kill the offending process (probably by giving it a signal,
11 * but possibly by killing it outright if necessary).
12 */
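13 #include <string.h>
15 #include <linux/head.h>
16 #include <linux/sched.h>
17 #include <linux/kernel.h>
18 #include <asm/system.h>
19 #include <asm/segment.h>
20 #include <asm/io.h>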
22 #define get_seg_byte(seg,addr) ({ \
23 register char __res; \
24 __asm__("push %%fs;mov %%ax,%%fs;movb %%fs:%2,%%al;pop %%fs" \
25 :"=a" (__res):"0" (seg),"m" (*(addr))); \
26 __res;})
这段函数的作用是取段seg,addr处的一个字节。
28 #define get_seg_long(seg,addr) ({ \
29 register unsigned long __res; \
30 __asm__("push %%fs;mov %%ax,%%fs;movl %%fs:%2,%%eax;pop %%fs" \
31 :"=a" (__res):"0" (seg),"m" (*(addr))); \
32 __res;})
这个宏的作用是取seg段addr处的一个长字(4字节)。
34 #define _fs() ({ \
35 register unsigned short __res; \
36 __asm__("mov %%fs,%%ax":"=a" (__res):); \
37 __res;})
这个是取fs段选择符。
39 int do_exit(long code);
40
41 void page_exception(void);
42
43 void divide_error(void);
44 void debug(void);
45 void nmi(void);
46 void int3(void);
47 void overflow(void);
48 void bounds(void);
49 void invalid_op(void);
50 void device_not_available(void);
51 void double_fault(void);
52 void coprocessor_segment_overrun(void);
53 void invalid_TSS(void);
54 void segment_not_present(void);
55 void stack_segment(void);
56 void general_protection(void);
57 void page_fault(void);
58 void coprocessor_error(void);
59 void reserved(void);
60 void parallel_interrupt(void);
61 void irq13(void);
63 static void die(char * str,long esp_ptr,long nr)
64 {
65 long * esp = (long *) esp_ptr;
66 int i;
67
68 printk("%s: %04x\n\r",str,nr&0xffff);
69 printk("EIP:\t%04x:%p\nEFLAGS:\t%p\nESP:\t%04x:%p\n",
70 esp[1],esp[0],esp[2],esp[4],esp[3]);
这里esp[0]就是上面的图中esp0的位置,也就是被中断程序的eip,因此esp[1]就是cs,esp[2]就是eflags,esp[3]就是原esp,esp[4]就是原ss。
71 printk("fs: %04x\n",_fs());
72 printk("base: %p, limit: %p\n",get_base(current->ldt[1]),get_limit(0x17));
73 if (esp[4] == 0x17) {
74 printk("Stack: ");
75 for (i=0;i<4;i++)
76 printk("%p ",get_seg_long(0x17,i+(long *)esp[3]));
77 printk("\n");
78 }
79 str(i);
80 printk("Pid: %d, process nr: %d\n\r",current->pid,0xffff & i);
81 for(i=0;i<10;i++)
82 printk("%02x ",0xff & get_seg_byte(esp[1],(i+(char *)esp[0])));
83 printk("\n\r");
84 do_exit(11); /* play segment exception */
85 }
这里就是打印一些信息,注意之前我们在asm.s中已经保存了一些信息,这里会把感兴趣的取出来。
87 void do_double_fault(long esp, long error_code)
88 {
89 die("double fault",esp,error_code);
90 }
91
92 void do_general_protection(long esp, long error_code)
93 {
94 die("general protection",esp,error_code);
95 }
96
97 void do_divide_error(long esp, long error_code)
98 {
99 die("divide error",esp,error_code);
100 }
102 void do_int3(long * esp, long error_code,
103 long fs,long es,long ds,
104 long ebp,long esi,long edi,
105 long edx,long ecx,long ebx,long eax)
106 {
107 int tr;
108
109 __asm__("str %%ax":"=a" (tr):"0" (0));
110 printk("eax\t\tebx\t\tecx\t\tedx\n\r%8x\t%8x\t%8x\t%8x\n\r",
111 eax,ebx,ecx,edx);
112 printk("esi\t\tedi\t\tebp\t\tesp\n\r%8x\t%8x\t%8x\t%8x\n\r",
113 esi,edi,ebp,(long) esp);
114 printk("\n\rds\tes\tfs\ttr\n\r%4x\t%4x\t%4x\t%4x\n\r",
115 ds,es,fs,tr);
116 printk("EIP: %8x CS: %4x EFLAGS: %8x\n\r",esp[0],esp[1],esp[2]);
117 }
119 void do_nmi(long esp, long error_code)
120 {
121 die("nmi",esp,error_code);
122 }
123
124 void do_debug(long esp, long error_code)
125 {
126 die("debug",esp,error_code);
127 }
128
129 void do_overflow(long esp, long error_code)
130 {
131 die("overflow",esp,error_code);
132 }
134 void do_bounds(long esp, long error_code)
135 {
136 die("bounds",esp,error_code);
137 }
138
139 void do_invalid_op(long esp, long error_code)
140 {
141 die("invalid operand",esp,error_code);
142 }
143
144 void do_device_not_available(long esp, long error_code)
145 {
146 die("device not available",esp,error_code);
147 }
148
149 void do_coprocessor_segment_overrun(long esp, long error_code)
150 {
151 die("coprocessor segment overrun",esp,error_code);
152 }
153
154 void do_invalid_TSS(long esp,long error_code)
155 {
156 die("invalid TSS",esp,error_code);
157 }
159 void do_segment_not_present(long esp,long error_code)
160 {
161 die("segment not present",esp,error_code);
162 }
163
164 void do_stack_segment(long esp,long error_code)
165 {
166 die("stack segment",esp,error_code);
167 }
168
169 void do_coprocessor_error(long esp, long error_code)
170 {
171 if (last_task_used_math != current)
172 return;
173 die("coprocessor error",esp,error_code);
174 }
175
176 void do_reserved(long esp, long error_code)
177 {
178 die("reserved (15,17-47) error",esp,error_code);
179 }
181 void trap_init(void)
182 {
。。。。。
这个已经分析过了,这里不再说明。可以看到出错之后的处理方案差不多都是打印一些信息后退出。
3.system_call.s
7 /*
8 * system_call.s contains the system-call low-level handling routines.
9 * This also contains the timer-interrupt handler, as some of the code is
10 * the same. The hd- and flopppy-interrupts are also here.
11 *
12 * NOTE: This code handles signal-recognition, which happens every time
13 * after a timer-interrupt and after each system call. Ordinary interrupts
14 * don't handle signal-recognition, as that would clutter(杂乱,混乱) them up totally
15 * unnecessarily.
这里说的是每次时钟中断或者系统调用处理完成后会处理信号识别。普通中断则不会处理信号识别,因为这会造成混乱。
16 *下面是'ret_from_system_call的堆栈布局
17 * Stack layout in 'ret_from_system_call':
18 *
19 * 0(%esp) - %eax
20 * 4(%esp) - %ebx
21 * 8(%esp) - %ecx
22 * C(%esp) - %edx
23 * 10(%esp) - %fs
24 * 14(%esp) - %es
25 * 18(%esp) - %ds
26 * 1C(%esp) - %eip
27 * 20(%esp) - %cs
28 * 24(%esp) - %eflags
29 * 28(%esp) - %oldesp
30 * 2C(%esp) - %oldss
31 */
33 SIG_CHLD = 17
34
35 EAX = 0x00
36 EBX = 0x04
37 ECX = 0x08
38 EDX = 0x0C
39 FS = 0x10
40 ES = 0x14
41 DS = 0x18
42 EIP = 0x1C
43 CS = 0x20
44 EFLAGS = 0x24
45 OLDESP = 0x28
46 OLDSS = 0x2C
这些是对应上面的stack layout设置的。
48 state = 0 # these are offsets into the task-struct.
49 counter = 4 //剩余时间片
50 priority = 8
51 signal = 12
52 sigaction = 16 # MUST be 16 (=len of sigaction)
53 blocked = (33*16) //受阻信号
对应task-struct中的偏移值。
55 # offsets within sigaction
56 sa_handler = 0
57 sa_mask = 4
58 sa_flags = 8
59 sa_restorer = 12
这是对应sigaction中的偏移
61 nr_system_calls = 72
67 .globl _system_call,_sys_fork,_timer_interrupt,_sys_execve
68 .globl _hd_interrupt,_floppy_interrupt,_parallel_interrupt
69 .globl _device_not_available, _coprocessor_error
71 .align 2
72 bad_sys_call:
73 movl $-1,%eax
74 iret
75 .align 2
76 reschedule:
77 pushl $ret_from_sys_call
78 jmp _schedule
schedule返回时就从ret_from_sys_call继续执行。
79 .align 2
80 _system_call:
81 cmpl $nr_system_calls-1,%eax
82 ja bad_sys_call
83 push %ds
84 push %es
85 push %fs
86 pushl %edx
87 pushl %ecx # push %ebx,%ecx,%edx as parameters
88 pushl %ebx # to the system call
89 movl $0x10,%edx # set up ds,es to kernel space
90 mov %dx,%ds
91 mov %dx,%es
92 movl $0x17,%edx # fs points to local data space
93 mov %dx,%fs
fs指向局部数据段(局部表中数据段描述符),就是执行本次系统调用的用户程序的数据段。
94 call _sys_call_table(,%eax,4)
95 pushl %eax
96 movl _current,%eax
97 cmpl $0,state(%eax) # state
98 jne reschedule //
99 cmpl $0,counter(%eax) # counter
100 je reschedule //时间片用完?重新调度
101 ret_from_sys_call:
102 movl _current,%eax # task[0] cannot have signals
103 cmpl _task,%eax //这里_task对应c语言中的task数组,这里就是task[0]
104 je 3f //是不是task0
105 cmpw $0x0f,CS(%esp) # was old code segment supervisor ?
106 jne 3f
0x0f这个选择符代表RPL=3,TI=1(局部描述符表),索引1,也就是用户代码段。如果不是,那说明被中断的是内核代码段,不处理信号(不可抢占)。
107 cmpw $0x17,OLDSS(%esp) # was stack segment = 0x17 ?
108 jne 3f
0x17这个选择符代表RPL=3,TI=1(局部描述符表),索引2,也就是用户数据段(堆栈段)。如果不是,说明原堆栈不在用户数据段中,不处理信号(不可抢占)。
109 movl signal(%eax),%ebx
110 movl blocked(%eax),%ecx
111 notl %ecx
112 andl %ebx,%ecx
113 bsfl %ecx,%ecx
114 je 3f
bsfl扫描操作数op1,找到第1个非0bit位,把该位的索引下标(从0计算)存入op2。扫描方向是从低位到高位(也就是从右->左)。如果op1为0,则ZF置位。
没有信号(ZF置位,je成立),直接退出。
115 btrl %ecx,%ebx //清除ebx中由bsfl找到的那一位(下标在ecx中),该位原来的值被存入CF
116 movl %ebx,signal(%eax)
117 incl %ecx
118 pushl %ecx//参数,第几个位?
119 call _do_signal
120 popl %eax
121 3: popl %eax
122 popl %ebx
123 popl %ecx
124 popl %edx
125 pop %fs
126 pop %es
127 pop %ds
128 iret
130 .align 2
131 _coprocessor_error:
132 push %ds
133 push %es
134 push %fs
135 pushl %edx
136 pushl %ecx
137 pushl %ebx
138 pushl %eax
139 movl $0x10,%eax
140 mov %ax,%ds
141 mov %ax,%es
142 movl $0x17,%eax
143 mov %ax,%fs
144 pushl $ret_from_sys_call
145 jmp _math_error
147 .align 2
148 _device_not_available:
149 push %ds
150 push %es
151 push %fs
152 pushl %edx
153 pushl %ecx
154 pushl %ebx
155 pushl %eax
156 movl $0x10,%eax
157 mov %ax,%ds
158 mov %ax,%es
ds,es 内核数据段
159 movl $0x17,%eax
160 mov %ax,%fs
fs用户数据段
161 pushl $ret_from_sys_call
162 clts # clear TS so that we can use math
163 movl %cr0,%eax
164 testl $0x4,%eax # EM (math emulation bit)
165 je _math_state_restore
166 pushl %ebp
167 pushl %esi
168 pushl %edi
169 call _math_emulate
170 popl %edi
171 popl %esi
172 popl %ebp
173 ret
175 .align 2
176 _timer_interrupt:
177 push %ds # save ds,es and put kernel data space
178 push %es # into them. %fs is used by _system_call
179 push %fs
180 pushl %edx # we save %eax,%ecx,%edx as gcc doesn't
181 pushl %ecx # save those across function calls. %ebx
182 pushl %ebx # is saved as we use that in ret_sys_call
183 pushl %eax
184 movl $0x10,%eax
185 mov %ax,%ds
186 mov %ax,%es
ds,es内核数据段
187 movl $0x17,%eax
188 mov %ax,%fs
fs用户数据段
189 incl _jiffies
190 movb $0x20,%al # EOI to interrupt controller #1
191 outb %al,$0x20
发送结束命令给8259A
192 movl CS(%esp),%eax
193 andl $3,%eax # %eax is CPL (0 or 3, 0=supervisor)
194 pushl %eax
195 call _do_timer # 'do_timer(long CPL)' does everything from
以当前CPL(被中断代码的特权级,取自栈中保存的cs的低2位)为参数调用do_timer
196 addl $4,%esp # task switching to accounting ...
197 jmp ret_from_sys_call
198
199 .align 2
200 _sys_execve:
201 lea EIP(%esp),%eax
202 pushl %eax
这里把栈中保存调用者eip(触发该系统调用的程序的代码指针)的那个位置的地址作为参数传给do_execve,注意传的是地址,而不是eip的值本身。
203 call _do_execve
204 addl $4,%esp
205 ret
207 .align 2
208 _sys_fork:
209 call _find_empty_process
210 testl %eax,%eax
211 js 1f
212 push %gs
213 pushl %esi
214 pushl %edi
215 pushl %ebp
216 pushl %eax
217 call _copy_process
218 addl $20,%esp
219 1: ret
221 _hd_interrupt:
222 pushl %eax
223 pushl %ecx
224 pushl %edx
225 push %ds
226 push %es
227 push %fs
228 movl $0x10,%eax
229 mov %ax,%ds
230 mov %ax,%es
231 movl $0x17,%eax
232 mov %ax,%fs
233 movb $0x20,%al
234 outb %al,$0xA0 # EOI to interrupt controller #1
235 jmp 1f # give port chance to breathe
236 1: jmp 1f
237 1: xorl %edx,%edx
238 xchgl _do_hd,%edx
239 testl %edx,%edx
240 jne 1f
241 movl $_unexpected_hd_interrupt,%edx
242 1: outb %al,$0x20
243 call *%edx # "interesting" way of handling intr.
244 pop %fs
245 pop %es
246 pop %ds
247 popl %edx
248 popl %ecx
249 popl %eax
250 iret
252 _floppy_interrupt:
253 pushl %eax
254 pushl %ecx
255 pushl %edx
256 push %ds
257 push %es
258 push %fs
259 movl $0x10,%eax
260 mov %ax,%ds
261 mov %ax,%es
262 movl $0x17,%eax
263 mov %ax,%fs
264 movb $0x20,%al
265 outb %al,$0x20 # EOI to interrupt controller #1
266 xorl %eax,%eax
267 xchgl _do_floppy,%eax
268 testl %eax,%eax
269 jne 1f
270 movl $_unexpected_floppy_interrupt,%eax
271 1: call *%eax # "interesting" way of handling intr.
272 pop %fs
273 pop %es
274 pop %ds
275 popl %edx
276 popl %ecx
277 popl %eax
278 iret
279
280 _parallel_interrupt:
281 pushl %eax
282 movb $0x20,%al
283 outb %al,$0x20
284 popl %eax
285 iret
include/unistd.h
13 #define STDIN_FILENO 0
14 #define STDOUT_FILENO 1
15 #define STDERR_FILENO 2
17 #ifndef NULL
18 #define NULL ((void *)0)
19 #endif
21 /* access */
22 #define F_OK 0
23 #define X_OK 1
24 #define W_OK 2
25 #define R_OK 4
27 /* lseek */
28 #define SEEK_SET 0
29 #define SEEK_CUR 1
30 #define SEEK_END 2
31
32 /* _SC stands for System Configuration. We don't use them much */
...........
42 /* more (possibly) configurable things - now pathnames */
43 #define _PC_LINK_MAX 1
........
53 #include <sys/stat.h>
54 #include <sys/times.h>
55 #include <sys/utsname.h>
56 #include <utime.h>
57
58 #ifdef __LIBRARY__
59
60 #define __NR_setup 0 /* used only by init, to get system going */
............................................
130 #define __NR_setreuid 70
131 #define __NR_setregid 71
132
133 #define _syscall0(type,name) \
134 type name(void) \
135 { \
136 long __res; \
137 __asm__ volatile ("int $0x80" \
138 : "=a" (__res) \
139 : "0" (__NR_##name)); \
140 if (__res >= 0) \
141 return (type) __res; \
142 errno = -__res; \
143 return -1; \
144 }
146 #define _syscall1(type,name,atype,a) \
147 type name(atype a) \
148 { \
149 long __res; \
150 __asm__ volatile ("int $0x80" \
151 : "=a" (__res) \
152 : "0" (__NR_##name),"b" ((long)(a))); \
153 if (__res >= 0) \
154 return (type) __res; \
155 errno = -__res; \
156 return -1; \
157 }
159 #define _syscall2(type,name,atype,a,btype,b) \
160 type name(atype a,btype b) \
161 { \
162 long __res; \
163 __asm__ volatile ("int $0x80" \
164 : "=a" (__res) \
165 : "0" (__NR_##name),"b" ((long)(a)),"c" ((long)(b))); \
166 if (__res >= 0) \
167 return (type) __res; \
168 errno = -__res; \
169 return -1; \
170 }
172 #define _syscall3(type,name,atype,a,btype,b,ctype,c) \
173 type name(atype a,btype b,ctype c) \
174 { \
175 long __res; \
176 __asm__ volatile ("int $0x80" \
177 : "=a" (__res) \
178 : "0" (__NR_##name),"b" ((long)(a)),"c" ((long)(b)),"d" ((long)(c))); \
179 if (__res>=0) \
180 return (type) __res; \
181 errno=-__res; \
182 return -1; \
183 }
184
185 #endif /* __LIBRARY__ */
186
187 extern int errno;
189 int access(const char * filename, mode_t mode);
190 int acct(const char * filename);
191 int alarm(int sec);
.....
200 int dup(int fildes);
201 int execve(const char * filename, char ** argv, char ** envp);
202 int execv(const char * pathname, char ** argv);
203 int execvp(const char * file, char ** argv);
204 int execl(const char * pathname, char * arg0, ...);
205 int execlp(const char * file, char * arg0, ...);
206 int execle(const char * pathname, char * arg0, ...);
207 volatile void exit(int status);
.......................
249 int getppid(void);
250 pid_t getpgrp(void);
251 pid_t setsid(void);
252
253 #endif
可以看到unistd.h中主要是定义了一些系统调用对应的宏,这个是方便通过名称来找到系统调用号的。然后是对应的系统调用函数声明。
我们来考察一下系统调用从C语言到内核的整个流程。首先C语言中的系统调用会在库中被转换为int 0x80指令,并在eax中附带一个调用号。这时内核会处理中断。我们看一下中断处理入口。
在kernel/sched.c中(sched_init),我们看到有下面一句
411 set_system_gate(0x80,&system_call);
也就是注册了中断处理函数。那么我们来看一下这里的system_call。它对应system_call.s中的标号_system_call
我们来看一下:
kernel/system_call.s
79 .align 2
80 _system_call:
81 cmpl $nr_system_calls-1,%eax
82 ja bad_sys_call
83 push %ds
84 push %es
85 push %fs
原来的ds,es,fs入栈。
86 pushl %edx
87 pushl %ecx # push %ebx,%ecx,%edx as parameters
88 pushl %ebx # to the system call
89 movl $0x10,%edx # set up ds,es to kernel space
90 mov %dx,%ds
91 mov %dx,%es
92 movl $0x17,%edx # fs points to local data space
93 mov %dx,%fs
上面ebx,ecx,edx作为参数;接下来设置ds,es为内核数据段,fs为用户数据段。
94 call _sys_call_table(,%eax,4)
95 pushl %eax
96 movl _current,%eax
97 cmpl $0,state(%eax) # state
98 jne reschedule
99 cmpl $0,counter(%eax) # counter
100 je reschedule
这里也比较重要,系统调用完成后,要看一下是不是需要重新调度。
接下来我们看一下94行:
include/linux/sys.h
1 extern int sys_setup();
2 extern int sys_exit();
3 extern int sys_fork();
。。。。。。。。
70 extern int sys_ssetmask();
71 extern int sys_setreuid();
72 extern int sys_setregid();
74 fn_ptr sys_call_table[] = { sys_setup, sys_exit, sys_fork, sys_read,
75 sys_write, sys_open, sys_close, sys_waitpid, sys_creat, sys_link,
76 sys_unlink, sys_execve, sys_chdir, sys_time, sys_mknod, sys_chmod,
77 sys_chown, sys_break, sys_stat, sys_lseek, sys_getpid, sys_mount,
78 sys_umount, sys_setuid, sys_getuid, sys_stime, sys_ptrace, sys_alarm,
79 sys_fstat, sys_pause, sys_utime, sys_stty, sys_gtty, sys_access,
80 sys_nice, sys_ftime, sys_sync, sys_kill, sys_rename, sys_mkdir,
81 sys_rmdir, sys_dup, sys_pipe, sys_times, sys_prof, sys_brk, sys_setgid,
82 sys_getgid, sys_signal, sys_geteuid, sys_getegid, sys_acct, sys_phys,
83 sys_lock, sys_ioctl, sys_fcntl, sys_mpx, sys_setpgid, sys_ulimit,
84 sys_uname, sys_umask, sys_chroot, sys_ustat, sys_dup2, sys_getppid,
85 sys_getpgrp, sys_setsid, sys_sigaction, sys_sgetmask, sys_ssetmask,
86 sys_setreuid,sys_setregid };
可见整个文件就是声明了相应的系统调用,并把它们保存到数组中,以供调用。
我们继续看一下相应的函数在哪定义的:
kernel/system_call.s:208:_sys_fork
207 .align 2
208 _sys_fork:
209 call _find_empty_process
210 testl %eax,%eax
211 js 1f
212 push %gs
213 pushl %esi
214 pushl %edi
215 pushl %ebp
216 pushl %eax
217 call _copy_process
218 addl $20,%esp
219 1: ret
接下来就是调用具体的C语言函数来实现具体功能了。
我们先总结一下系统调用的整个流程:
C语言中的系统调用最终会通过int 0x80并在eax中存放调用号来触发中断。内核因此会进行中断处理。之前在init/main.c的main函数中task0进行了一系列的初始化工作,并在sched_init中注册了系统调用中断(system_call)。
现在系统调用中断发生了,首先根据中断号查询idt,找到已经注册过的处理函数来进行处理。system_call在kernel/system_call.s文件中,相应_system_call标号处,在这里面我们首先保存之前的ds,es,fs段寄存器,然后把edx,ecx,ebx寄存器压栈作为参数,再执行call _sys_call_table(,%eax,4), 这里以调用号×4作为偏移。_sys_call_table定义在include/linux/sys.h中,它是一个函数指针数组,现在我们根据索引就找到了对应的处理函数,在sys.h中这个函数是以extern声明的。以sys_fork为例,它对应的是system_call.s中相应的标号。在这里会进一步调用C语言函数,它们实现了具体的功能,调用完成后还会检查是不是需要重新调度。这就是系统调用的整个流程。
我们继续以timer中断为例来继续熟悉一下调用流程。timer中断是由硬件周期性触发的。我们来看一下它的处理函数。
kernel/sched.c:409: set_intr_gate(0x20,&timer_interrupt);
我们来看一下timer_interrupt:
kernel/system_call.s:176:_timer_interrupt
175 .align 2
176 _timer_interrupt:
177 push %ds # save ds,es and put kernel data space
178 push %es # into them. %fs is used by _system_call
179 push %fs
180 pushl %edx # we save %eax,%ecx,%edx as gcc doesn't
181 pushl %ecx # save those across function calls. %ebx
182 pushl %ebx # is saved as we use that in ret_sys_call
183 pushl %eax
184 movl $0x10,%eax
185 mov %ax,%ds
186 mov %ax,%es
187 movl $0x17,%eax
188 mov %ax,%fs
189 incl _jiffies
190 movb $0x20,%al # EOI to interrupt controller #1
191 outb %al,$0x20
192 movl CS(%esp),%eax
193 andl $3,%eax # %eax is CPL (0 or 3, 0=supervisor)
194 pushl %eax
195 call _do_timer # 'do_timer(long CPL)' does everything from
这里以CPL为参数来调用do_timer。不过这里我们关心的是192行。
196 addl $4,%esp # task switching to accounting ...
197 jmp ret_from_sys_call
上面系统调用的过程中,我们发现它只是保存了es,ds,fs段寄存器,和ebx,ecx,edx等传递的参数以及eax保存的调用号,并没有保存其他的寄存器值。也就是没有用到kernel/asm.s文件。我们再来看一下asm.s的开始部分
14 .globl _divide_error,_debug,_nmi,_int3,_overflow,_bounds,_invalid_op
15 .globl _double_fault,_coprocessor_segment_overrun
16 .globl _invalid_TSS,_segment_not_present,_stack_segment
17 .globl _general_protection,_coprocessor_error,_irq13,_reserved
可以看到它主要是处理一些比较底层的中断。那么我们以_divide_error为例,来看一下它是如何被使用的。
kernel/traps.c:185: set_trap_gate(0,&divide_error);
假设现在发生的divide_error,内核根据idt找到对应的中断处理函数处理,也就是这里的divide_error。
kernel/asm.s:19:_divide_error:
19 _divide_error:
20 pushl $_do_divide_error
这里把do_divide_error函数的地址压入栈中。它并不是返回地址,而是在下面第22行被交换到eax中,然后在第39行通过call *%eax被调用。
21 no_error_code:
22 xchgl %eax,(%esp) //C函数地址,因为20行压入了函数地址
23 pushl %ebx
24 pushl %ecx
25 pushl %edx
26 pushl %edi
27 pushl %esi
28 pushl %ebp
29 push %ds
30 push %es
31 push %fs
32 pushl $0 # "error code"
33 lea 44(%esp),%edx //取44(%esp)这个栈位置的地址(下一行入栈作为参数),该位置保存的是由CPU硬件自动压入栈的返回地址(eip)
34 pushl %edx
35 movl $0x10,%edx
36 mov %dx,%ds
37 mov %dx,%es
38 mov %dx,%fs
ds,es,fs都设为内核数据段。
39 call *%eax //(%esp)
40 addl $8,%esp
把参数丢弃掉。
41 pop %fs
42 pop %es
43 pop %ds
44 popl %ebp
45 popl %esi
46 popl %edi
47 popl %edx
48 popl %ecx
49 popl %ebx
50 popl %eax
恢复寄存器值。
51 iret
返回到原来的下一条指令处运行。这里iret具体行为可以参考另一篇笔记《init/main.c源码分析》。
我们再来看一下这里的函数_do_divide_error:
kernel/traps.c:97:void do_divide_error(long esp, long error_code)
97 void do_divide_error(long esp, long error_code)
98 {
99 die("divide error",esp,error_code);
100 }
到这里整个流程就比较明白了。
总结:
这里我们最终总结一下与中断处理有关的知识点。中断发生时,首先由CPU硬件自动保存一些寄存器,这里涉及到两种情况:如果不涉及到特权级的改变,就只需要保存eflags,cs和eip;如果涉及到特权级的改变,除了上面的内容外还需要保存原ss和esp。这是因为二者使用的堆栈不同。当涉及到特权级改变时使用的是内核堆栈。接下来,内核会根据中断号来查询idt表,对于一些比较底层的中断,它所对应的处理函数一般是在asm.s文件中的,那么这里就涉及到error_code和no_error_code两种情况,主要涉及到保存寄存器的一些操作,在asm.s中又会调用traps.c中具体的C函数,这就实现了中断处理;如果不是底层的中断,那么注册的处理函数一般是在system_call.s文件中定义的,例如系统调用会在这里面执行call _sys_call_table(,%eax,4),sys_call_table是在include/linux/sys.h中定义的,通过索引找到具体的处理函数,这些处理函数分布在不同的模块中。