Linux内核分析之五——分析系统调用(system_call)的执行机制

作者:姚开健

原创作品转载请注明出处

《Linux内核分析》MOOC课程http://mooc.study.163.com/course/USTC-1000029000

当用户态程序发起系统调用时,会通过int 0x80指令陷入内核,并跳转到system_call这个入口。这个入口地址是在系统初始化时,由trap_init()设置好的;此后每当发生系统调用,都会跳转到system_call处执行。system_call这段汇编代码位于arch/x86/kernel/entry_32.S这个文件里面(以32位x86为例),我们来看看它的部分代码:

/*
 * system_call - 32-bit system call entry path.
 *
 * On entry %eax holds the system call number: it is pushed as orig_eax,
 * range-checked against NR_syscalls and used to index sys_call_table.
 * The handler's return value is written to the saved-%eax slot (PT_EAX)
 * on the stack.  Before returning, TI_flags is sampled with interrupts
 * disabled; any bit in _TIF_ALLWORK_MASK diverts to syscall_exit_work,
 * otherwise the saved registers are restored and INTERRUPT_RETURN is
 * executed at irq_return.
 */
ENTRY(system_call)
491	RING0_INT_FRAME			# can't unwind into user space anyway
492	ASM_CLAC
493	pushl_cfi %eax			# save orig_eax
494	SAVE_ALL
	/* %ebp is used from here on as the base for TI_flags accesses */
495	GET_THREAD_INFO(%ebp)
496					# system call tracing in operation / emulation
497	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
498	jnz syscall_trace_entry
	/* unsigned compare: any number >= NR_syscalls goes to syscall_badsys */
499	cmpl $(NR_syscalls), %eax
500	jae syscall_badsys
501syscall_call:
	/* indirect call through the table: 4-byte entries indexed by %eax */
502	call *sys_call_table(,%eax,4)
503syscall_after_call:
504	movl %eax,PT_EAX(%esp)		# store the return value
505syscall_exit:
506	LOCKDEP_SYS_EXIT
507	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
508					# setting need_resched or sigpending
509					# between sampling and the iret
510	TRACE_IRQS_OFF
	/* %ecx keeps the sampled flags; syscall_exit_work tests it again */
511	movl TI_flags(%ebp), %ecx
512	testl $_TIF_ALLWORK_MASK, %ecx	# current->work
513	jne syscall_exit_work
514
515restore_all:
516	TRACE_IRQS_IRET
517restore_all_notrace:
518#ifdef CONFIG_X86_ESPFIX32
	/*
	 * Combine the saved EFLAGS with the saved SS (into %ah) and CS
	 * (into %al), keep only the VM flag, the SS table-indicator bit and
	 * the CS RPL bits, and branch to ldt_ss only when the result is
	 * exactly "LDT stack segment, user RPL, VM flag clear".
	 */
519	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS, SS and CS
520	# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
521	# are returning to the kernel.
522	# See comments in process.c:copy_thread() for details.
523	movb PT_OLDSS(%esp), %ah
524	movb PT_CS(%esp), %al
525	andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
526	cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
527	CFI_REMEMBER_STATE
528	je ldt_ss			# returning to user-space with LDT SS
529#endif
530restore_nocheck:
531	RESTORE_REGS 4			# skip orig_eax/error_code
532irq_return:
533	INTERRUPT_RETURN
	/*
	 * Fixup path: the exception-table entry below pairs irq_return with
	 * iret_exc, which pushes a zero error code and the do_iret_error
	 * handler address and then joins the common error_code path.
	 */
534.section .fixup,"ax"
535ENTRY(iret_exc)
536	pushl $0			# no error code
537	pushl $do_iret_error
538	jmp error_code
539.previous
540	_ASM_EXTABLE(irq_return,iret_exc)
541
542#ifdef CONFIG_X86_ESPFIX32
543	CFI_RESTORE_STATE
544ldt_ss:
545#ifdef CONFIG_PARAVIRT
546	/*
547	 * The kernel can't run on a non-flat stack if paravirt mode
548	 * is active.  Rather than try to fixup the high bits of
549	 * ESP, bypass this code entirely.  This may break DOSemu
550	 * and/or Wine support in a paravirt VM, although the option
551	 * is still available to implement the setting of the high
552	 * 16-bits in the INTERRUPT_RETURN paravirt-op.
553	 */
554	cmpl $0, pv_info+PARAVIRT_enabled
555	jne restore_nocheck
556#endif
557
558/*
559 * Setup and switch to ESPFIX stack
560 *
561 * We're returning to userspace with a 16 bit stack. The CPU will not
562 * restore the high word of ESP for us on executing iret... This is an
563 * "official" bug of all the x86-compatible CPUs, which we can work
564 * around to make dosemu and wine happy. We do this by preloading the
565 * high word of ESP with the high word of the userspace ESP while
566 * compensating for the offset by changing to the ESPFIX segment with
567 * a base address that matches for the difference.
568 */
569#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
570	mov %esp, %edx			/* load kernel esp */
571	mov PT_OLDESP(%esp), %eax	/* load userspace esp */
572	mov %dx, %ax			/* eax: new kernel esp */
573	sub %eax, %edx			/* offset (low word is 0) */
574	shr $16, %edx
	/* patch the two high base-address bytes of the ESPFIX descriptor */
575	mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
576	mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
	/* build the SS:ESP pair on the stack that lss below will load */
577	pushl_cfi $__ESPFIX_SS
578	pushl_cfi %eax			/* new kernel esp */
579	/* Disable interrupts, but do not irqtrace this section: we
580	 * will soon execute iret and the tracer was already set to
581	 * the irqstate after the iret */
582	DISABLE_INTERRUPTS(CLBR_EAX)
583	lss (%esp), %esp		/* switch to espfix segment */
584	CFI_ADJUST_CFA_OFFSET -8
585	jmp restore_nocheck
586#endif
587	CFI_ENDPROC
588ENDPROC(system_call)
589
/*
 * work_pending - handle pending work found on the way out of the kernel.
 *
 * Reached from syscall_exit_work with the sampled TI_flags value in
 * %ecx and the thread-info base in %ebp.  If _TIF_NEED_RESCHED is set,
 * schedule() is called and the flags are re-sampled with interrupts
 * disabled, looping until the bit is clear; if no _TIF_WORK_MASK bit
 * remains the code goes straight to restore_all.  Otherwise control
 * falls into work_notifysig, which calls do_notify_resume() unless the
 * saved CS shows a privilege level below USER_RPL.
 */
590	# perform work that needs to be done immediately before resumption
591	ALIGN
592	RING0_PTREGS_FRAME		# can't unwind into user space anyway
593work_pending:
	/* only the low byte of the flags is examined here */
594	testb $_TIF_NEED_RESCHED, %cl
595	jz work_notifysig
596work_resched:
597	call schedule
598	LOCKDEP_SYS_EXIT
599	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
600					# setting need_resched or sigpending
601					# between sampling and the iret
602	TRACE_IRQS_OFF
	/* re-read the flags: they may have changed while schedule() ran */
603	movl TI_flags(%ebp), %ecx
604	andl $_TIF_WORK_MASK, %ecx	# is there any work to be done other
605					# than syscall tracing?
606	jz restore_all
607	testb $_TIF_NEED_RESCHED, %cl
608	jnz work_resched
609
610work_notifysig:				# deal with pending signals and
611					# notify-resume requests
612#ifdef CONFIG_VM86
	/*
	 * The movl between testl and jne does not modify EFLAGS, so the
	 * branch still acts on the VM-flag test.
	 */
613	testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
614	movl %esp, %eax
615	jne work_notifysig_v86		# returning to kernel-space or
616					# vm86-space
6171:
618#else
619	movl %esp, %eax
620#endif
621	TRACE_IRQS_ON
622	ENABLE_INTERRUPTS(CLBR_NONE)
	/* extract the privilege level of the saved CS */
623	movb PT_CS(%esp), %bl
624	andb $SEGMENT_RPL_MASK, %bl
625	cmpb $USER_RPL, %bl
626	jb resume_kernel
	/*
	 * %eax = stack/pt_regs pointer, %ecx = ti_flags, %edx = 0.
	 * NOTE(review): presumably these are the register-passed arguments
	 * of do_notify_resume - confirm against its prototype.
	 */
627	xorl %edx, %edx
628	call do_notify_resume
629	jmp resume_userspace
630
631#ifdef CONFIG_VM86
632	ALIGN
633work_notifysig_v86:
634	pushl_cfi %ecx			# save ti_flags for do_notify_resume
635	call save_v86_state		# %eax contains pt_regs pointer
636	popl_cfi %ecx
	/* continue on the stack pointer returned by save_v86_state */
637	movl %eax, %esp
638	jmp 1b
639#endif
640END(work_pending)
641
/*
 * syscall_trace_entry - taken from system_call when a
 * _TIF_WORK_SYSCALL_ENTRY flag is set.
 *
 * Presets the saved-%eax slot to -ENOSYS, passes the stack pointer in
 * %eax to syscall_trace_enter(), and treats the value it returns as the
 * system call number to run.  A number below NR_syscalls (unsigned
 * compare) resumes at syscall_call; anything else skips the call and
 * goes to syscall_exit, leaving -ENOSYS in the saved-%eax slot.
 */
642	# perform syscall entry tracing
643	ALIGN
644syscall_trace_entry:
645	movl $-ENOSYS,PT_EAX(%esp)
646	movl %esp, %eax
647	call syscall_trace_enter
648	/* What it returned is what we'll actually use.  */
649	cmpl $(NR_syscalls), %eax
650	jnae syscall_call
651	jmp syscall_exit
652END(syscall_trace_entry)
653
/*
 * syscall_exit_work - taken from syscall_exit when any _TIF_ALLWORK_MASK
 * bit is set; %ecx holds the TI_flags value sampled there.
 *
 * If none of the _TIF_WORK_SYSCALL_EXIT bits is set the work is left to
 * work_pending.  Otherwise interrupts are re-enabled, the stack pointer
 * is passed in %eax to syscall_trace_leave(), and control continues at
 * resume_userspace.
 */
654	# perform syscall exit tracing
655	ALIGN
656syscall_exit_work:
657	testl $_TIF_WORK_SYSCALL_EXIT, %ecx
658	jz work_pending
659	TRACE_IRQS_ON
660	ENABLE_INTERRUPTS(CLBR_ANY)	# could let syscall_trace_leave() call
661					# schedule() instead
662	movl %esp, %eax
663	call syscall_trace_leave
664	jmp resume_userspace
665END(syscall_exit_work)
代码比较多,在此精简了其主要执行过程,总结成一个流程图:

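可以看到,进入系统调用入口(system_call)后,会首先保存现场,执行SAVE_ALL宏,然后在syscall_call处根据%eax中的系统调用号,通过sys_call_table调用对应的系统调用服务程序,并把返回值保存到栈上的PT_EAX位置;接着执行syscall_exit,当执行到这里准备退出时,会检查当前线程的TI_flags,判断是否还有待处理的工作(例如需要重新调度,或者有信号等待处理),如果没有则直接进行restore_all恢复现场,并在irq_return处执行INTERRUPT_RETURN,正式返回到发起系统调用的地方;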

如果还有待处理的工作,则需要执行syscall_exit_work,再进入work_pending,看看是要执行work_resched还是work_notifysig:前者是看当前进程需不需要调度,如果需要就执行call schedule;后者是看有没有信号需要响应,如果有则进入work_notifysig进行处理。处理完之后最终再回到restore_all接着执行并退出。这个就是系统调用时汇编代码级别的大致执行过程。

当我们知道上述系统调用处理过程之后,就可以结合之前学过的进程调度上下文切换和中断处理上下文切换,总结出一个一般性的过程:当系统需要跳转去调度进程、处理中断或执行系统调用时,通常先要保护现场,接着跳转至要执行的进程、中断服务程序或系统调用服务程序,执行完该程序后再恢复现场,并且返回至之前发生调用的地方。如果在服务程序执行过程中还需要进行其他中断、调度或系统调用,则重复刚才的过程,直至返回至最初发生调用的地方。这就是系统调用处理程序的原理。

你可能感兴趣的:(Linux内核分析之五——分析系统调用(system_call)的执行机制)