中断的响应和服务

      在IDT和中断响应队列初始化完毕之后,我们来看如何响应一个中断。

      CPU从中断控制器获得中断向量,然后从IDT表中找出对应表项,实际上是一个中断门。根据中断门中的地址信息,我们找到了特定中断的服务入口地址。首先,我们需要问的是,这个入口地址是在哪里设置的呢?我们回到IDT表的初始化一节中,回顾interrupt数组的初始化,如下:

/*
 * Build the entry stubs and pointer table with some assembler magic.
 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
 * single cache line on all modern x86 implementations.
 *
 * Each stub (label 1) pushes its own (~vector+0x80) and then reaches the
 * chunk's shared "2: jmp common_interrupt": the first six stubs of a chunk
 * jump to it, the seventh falls through. The address of every stub is also
 * emitted with ".long 1b" into the section that holds the interrupt table.
 */
.section .init.rodata,"a"
ENTRY(interrupt)                            //start of the interrupt pointer table
.section .entry.text, "ax"
	.p2align 5
	.p2align CONFIG_X86_L1_CACHE_SHIFT
ENTRY(irq_entries_start) 
	RING0_INT_FRAME
vector=FIRST_EXTERNAL_VECTOR
.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
	.balign 32
  .rept	7
    .if vector < NR_VECTORS
      .if vector <> FIRST_EXTERNAL_VECTOR
	CFI_ADJUST_CFA_OFFSET -4
      .endif
1:	pushl_cfi $(~vector+0x80)	/* Note: always in signed byte range */
      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
	jmp 2f
      .endif
      .previous
	.long 1b
      .section .entry.text, "ax"
vector=vector+1
    .endif
  .endr
2:	jmp common_interrupt          //shared tail of the chunk: enter the common handler
.endr
END(irq_entries_start)

.previous
END(interrupt)
.previous

interrupt数组中存储的内容就是标号为1的代码的地址,每个外部中断对应一个这样的地址,并将其存储在中断门的地址字段,中断门的段选择子存储的是__KERNEL_CS。因此,中断服务程序的入口,也就是执行的第一条指令就是pushl_cfi $(~vector + 0x80),在这之后就jmp common_interrupt,进入公共服务程序common_interrupt。

/*
 * the CPU automatically disables interrupts when executing an IRQ vector,
 * so IRQ-flags tracing has to follow that:
 *
 * Common tail of all interrupt entry stubs: fix up the pushed vector value,
 * save the registers, call do_IRQ, then leave through ret_from_intr.
 */
	.p2align CONFIG_X86_L1_CACHE_SHIFT
common_interrupt:
	addl $-0x80,(%esp)	/* Adjust vector into the [-256,-1] range */
	SAVE_ALL
	TRACE_IRQS_OFF
	movl %esp,%eax  //passed as the argument: eax holds the current top of the kernel stack, which is the pt_regs pointer do_IRQ expects
	call do_IRQ
	jmp ret_from_intr
ENDPROC(common_interrupt)
	CFI_ENDPROC

我们来看一下,这期间发生了什么?

1:	pushl_cfi $(~vector+0x80)	/* Note: always in signed byte range */
此行代码将处理过的中断调用号(~vector+0x80)压入系统栈

在公共跳转目标common_interrupt中:

	addl $-0x80,(%esp)	/* Adjust vector into the [-256,-1] range */
将压入栈顶的值加上-0x80,那么现在栈顶的值就变成了~vector,也就是说最终是将~vector压入栈顶的。为什么不直接将vector压栈呢?因为系统堆栈的这个位置在因系统调用而进入内核时要用来存放系统调用号,而系统调用又与中断服务程序共享一部分程序,这样可以通过这种手段加以区分。

我们关键看common_interrupt的实现:

在将栈顶调整为~vector之后,执行SAVE_ALL,就是所谓的“保存现场”:

/*
 * SAVE_ALL: push the segment and general-purpose registers on the stack
 * (gs slot first, ebx last), then load %ds/%es with __USER_DS, %fs with
 * __KERNEL_PERCPU and let SET_KERNEL_GS set up %gs.
 * %edx is used as scratch after its original value has been pushed.
 */
.macro SAVE_ALL
	cld
	PUSH_GS
	pushl_cfi %fs
	/*CFI_REL_OFFSET fs, 0;*/
	pushl_cfi %es
	/*CFI_REL_OFFSET es, 0;*/
	pushl_cfi %ds
	/*CFI_REL_OFFSET ds, 0;*/
	pushl_cfi %eax
	CFI_REL_OFFSET eax, 0
	pushl_cfi %ebp
	CFI_REL_OFFSET ebp, 0
	pushl_cfi %edi
	CFI_REL_OFFSET edi, 0
	pushl_cfi %esi
	CFI_REL_OFFSET esi, 0
	pushl_cfi %edx
	CFI_REL_OFFSET edx, 0
	pushl_cfi %ecx
	CFI_REL_OFFSET ecx, 0
	pushl_cfi %ebx
	CFI_REL_OFFSET ebx, 0
	movl $(__USER_DS), %edx  //NOTE(article): these segments are said to all map a space starting at address 0
	movl %edx, %ds
	movl %edx, %es
	movl $(__KERNEL_PERCPU), %edx
	movl %edx, %fs
	SET_KERNEL_GS %edx
.endm

而对于PUSH_GS:

/*
 * User gs save/restore
 *
 * %gs is used for userland TLS and kernel only uses it for stack
 * canary which is required to be at %gs:20 by gcc.  Read the comment
 * at the top of stackprotector.h for more info.
 *
 * Local labels 98 and 99 are used.
 *
 * Two variants of the same macro set follow, selected by
 * CONFIG_X86_32_LAZY_GS.
 */
#ifdef CONFIG_X86_32_LAZY_GS

 /* unfortunately push/pop can't be no-op */
 /* lazy variant: a zero placeholder is pushed instead of %gs */
.macro PUSH_GS
	pushl_cfi $0
.endm
 /* lazy variant: drop the placeholder slot plus \pop extra bytes */
.macro POP_GS pop=0
	addl $(4 + \pop), %esp
	CFI_ADJUST_CFA_OFFSET -(4 + \pop)
.endm
.macro POP_GS_EX
.endm

 /* all the rest are no-op */
.macro PTGS_TO_GS
.endm
.macro PTGS_TO_GS_EX
.endm
.macro GS_TO_REG reg
.endm
.macro REG_TO_PTGS reg
.endm
.macro SET_KERNEL_GS reg
.endm

#else	/* CONFIG_X86_32_LAZY_GS */

 /* non-lazy variant: %gs itself is saved on the stack */
.macro PUSH_GS
	pushl_cfi %gs
	/*CFI_REL_OFFSET gs, 0*/
.endm

 /* pop %gs at local label 98, then skip \pop further bytes if requested */
.macro POP_GS pop=0
98:	popl_cfi %gs
	/*CFI_RESTORE gs*/
  .if \pop <> 0
	add $\pop, %esp
	CFI_ADJUST_CFA_OFFSET -\pop
  .endif
.endm
 /*
  * Companion of POP_GS: emits fixup code at label 99 (zero the stack slot,
  * jump back to 98) and an __ex_table entry pairing 98 with 99.
  */
.macro POP_GS_EX
.pushsection .fixup, "ax"
99:	movl $0, (%esp)
	jmp 98b
.section __ex_table, "a"
	.align 4
	.long 98b, 99b
.popsection
.endm

 /* load %gs from the PT_GS slot of the saved frame (label 98) */
.macro PTGS_TO_GS
98:	mov PT_GS(%esp), %gs
.endm
 /* same fixup scheme as POP_GS_EX, but the slot zeroed is PT_GS(%esp) */
.macro PTGS_TO_GS_EX
.pushsection .fixup, "ax"
99:	movl $0, PT_GS(%esp)
	jmp 98b
.section __ex_table, "a"
	.align 4
	.long 98b, 99b
.popsection
.endm

.macro GS_TO_REG reg
	movl %gs, \reg
	/*CFI_REGISTER gs, \reg*/
.endm
.macro REG_TO_PTGS reg
	movl \reg, PT_GS(%esp)
	/*CFI_REL_OFFSET gs, PT_GS*/
.endm
 /* load %gs with __KERNEL_STACK_CANARY, using \reg as scratch */
.macro SET_KERNEL_GS reg
	movl $(__KERNEL_STACK_CANARY), \reg
	movl \reg, %gs
.endm

#endif	/* CONFIG_X86_32_LAZY_GS */

关于fs和gs寄存器,stackprotector.h有这样的注释解释:

/*
 * GCC stack protector support.
 *
 * Stack protector works by putting predefined pattern at the start of
 * the stack frame and verifying that it hasn't been overwritten when
 * returning from the function.  The pattern is called stack canary
 * and unfortunately gcc requires it to be at a fixed offset from %gs.
 * On x86_64, the offset is 40 bytes and on x86_32 20 bytes.  x86_64
 * and x86_32 use segment registers differently and thus handles this
 * requirement differently.
 *
 * On x86_64, %gs is shared by percpu area and stack canary.  All
 * percpu symbols are zero based and %gs points to the base of percpu
 * area.  The first occupant of the percpu area is always
 * irq_stack_union which contains stack_canary at offset 40.  Userland
 * %gs is always saved and restored on kernel entry and exit using
 * swapgs, so stack protector doesn't add any complexity there.
 *
 * On x86_32, it's slightly more complicated.  As in x86_64, %gs is
 * used for userland TLS.  Unfortunately, some processors are much
 * slower at loading segment registers with different value when
 * entering and leaving the kernel, so the kernel uses %fs for percpu
 * area and manages %gs lazily so that %gs is switched only when
 * necessary, usually during task switch.
 *
 * As gcc requires the stack canary at %gs:20, %gs can't be managed
 * lazily if stack protector is enabled, so the kernel saves and
 * restores userland %gs on kernel entry and exit.  This behavior is
 * controlled by CONFIG_X86_32_LAZY_GS and accessors are defined in
 * system.h to hide the details.
 */
fs和gs两个寄存器在2.4内核中是没有压栈操作的。

注意,这里我们并没有将标志寄存器EFLAGS、返回地址以及用户堆栈的情况压栈,为什么呢?因为CPU在进入中断服务时已经把EFLAGS连同返回地址(CS和EIP)一起压栈了;如果中断发生在用户态,CPU在此之前还会先压入用户堆栈的SS和ESP。这些都是由硬件完成的,不需要操作系统来维护。

到现在,我们得到了进入中断服务程序时系统堆栈,可以参考《情景分析》P211的图。

现在,我们可以调用do_IRQ:

/*
 * do_IRQ handles all normal device IRQ's (the special
 * SMP cross-CPU interrupts have their own specific
 * handlers).
 *
 * @regs: saved register frame; the entry stub stored ~vector in orig_ax.
 * Always returns 1.
 */
unsigned int __irq_entry do_IRQ(struct pt_regs *regs) //regs is the value that was moved into eax before do_IRQ was called
{
	struct pt_regs *old_regs = set_irq_regs(regs); //install regs as the current irq regs; the previous pointer is restored before returning

	/* high bit used in ret_from_ code  */
	unsigned vector = ~regs->orig_ax; //recover the interrupt vector number
	unsigned irq;

	exit_idle();  //NOTE(article): said to be a no-op on 32-bit machines
	irq_enter();  //enter interrupt context (detailed later in the article)

        //the vector-to-irq mapping is assigned by the system; the table is the per-cpu variable vector_irq
	irq = __this_cpu_read(vector_irq[vector]); 

	if (!handle_irq(irq, regs)) {  //the core operation
		ack_APIC_irq();

		if (printk_ratelimit())
			pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
				__func__, smp_processor_id(), vector, irq);
	}

	irq_exit();

	set_irq_regs(old_regs);
	return 1;
}

先来看一下do_IRQ的参数的数据结构的原型:
/*
 * Register frame found on the kernel stack at interrupt entry.
 * bx..gs are listed in the reverse of the order in which SAVE_ALL pushes
 * them; orig_ax is the slot the entry stub fills with the adjusted vector
 * (do_IRQ reads it as ~regs->orig_ax).
 * NOTE(article): ip, cs, flags, sp and ss are described as pushed by the
 * CPU itself on interrupt entry.
 */
struct pt_regs {
	unsigned long bx;
	unsigned long cx;
	unsigned long dx;
	unsigned long si;
	unsigned long di;
	unsigned long bp;
	unsigned long ax;
	unsigned long ds;
	unsigned long es;
	unsigned long fs;
	unsigned long gs;
	unsigned long orig_ax;
	unsigned long ip;
	unsigned long cs;
	unsigned long flags;
	unsigned long sp;
	unsigned long ss;
};
对照这之前的SAVE_ALL,会发现它们是一一对应的。可以知道,CPU和入口代码在进入中断时所做的,实际上都是在为do_IRQ建立一个模拟的子程序调用环境,使得在do_IRQ中可以方便地知道进入中断前夕各个寄存器的内容,在执行结束后又可以返回到“jmp ret_from_intr”,以执行中断返回。《情景分析》 P213。

在获得了系统栈中保存的寄存器的状态之后,接下来就是进入中断上下文:

/*
 * Enter an interrupt context.
 *
 * If this CPU is idle and we are not already inside an interrupt,
 * tick_check_idle() is called with bottom halves disabled; after that
 * __irq_enter() is invoked.
 */
void irq_enter(void)
{
	int cpu = smp_processor_id();

	rcu_irq_enter();
	if (idle_cpu(cpu) && !in_interrupt()) {
		/*
		 * Prevent raise_softirq from needlessly waking up ksoftirqd
		 * here, as softirq will be serviced on return from interrupt.
		 */
		local_bh_disable();
		tick_check_idle(cpu);
		_local_bh_enable();
	}

	__irq_enter();
}

/*
 * It is safe to do non-atomic ops on ->hardirq_context,
 * because NMI handlers may not preempt and the ops are
 * always balanced, so the interrupted value of ->hardirq_context
 * will always be restored.
 *
 * Steps performed by the macro:
 *  - account_system_vtime(current): update the virtual time of current
 *  - add_preempt_count(HARDIRQ_OFFSET): raise preempt_count, which marks
 *    the context as non-preemptible (per the article's annotation)
 *  - trace_hardirq_enter()
 *
 * Keep annotations out of the macro body: each continued line must end
 * with the backslash itself, so a trailing "//" comment after the
 * backslash breaks the definition.
 */
#define __irq_enter()					\
	do {						\
		account_system_vtime(current);		\
		add_preempt_count(HARDIRQ_OFFSET);	\
		trace_hardirq_enter();			\
	} while (0)


现在,我们进入do_IRQ的核心操作handle_irq来看一下:

/*
 * Dispatch irq to its descriptor's flow handler.
 * Returns false when no descriptor exists for irq, true otherwise.
 */
bool handle_irq(unsigned irq, struct pt_regs *regs)
{
	struct irq_desc *desc;
	int overflow;

	overflow = check_stack_overflow(); //check whether the kernel stack is overflowing

	desc = irq_to_desc(irq); //    return (irq < NR_IRQS) ? irq_desc + irq : NULL;

	if (unlikely(!desc))
		return false;

	if (!execute_on_irq_stack(overflow, desc, irq)) { //try to run the handler on the irq stack; 0 means it was not run there
		if (unlikely(overflow))                   //handler runs on the current stack: report the overflow here
			print_stack_overflow();
		desc->handle_irq(irq, desc);
	}

	return true;
}
来看一下其中一个函数
execute_on_irq_stack
它涉及到中断请求栈的概念,为什么会有这个概念呢?如果配置的内核的thread_union是4K大小,那么内核栈实际上是有三个的,其中一个就是专门用来处理硬件中断请求的栈。(具体的请参考ULK——P164或者网站http://blog.csdn.net/maray/article/details/5770889)。但是,我们并不需要过分的关心这个,因为大多数情况下,thread_union是8K 的,也就是说下面这段代码会直接返回0的。
/*
 * Run desc->handle_irq(irq, desc) on this CPU's hardirq stack.
 * Returns 1 when the handler was run there, 0 when the caller has to run
 * it on the current stack because that already is the hardirq stack.
 */
static inline int
execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
{
	union irq_ctx *curctx, *irqctx;
	u32 *isp, arg1, arg2;

	curctx = (union irq_ctx *) current_thread_info();
	irqctx = __this_cpu_read(hardirq_ctx);

	/*
	 * this is where we switch to the IRQ stack. However, if we are
	 * already using the IRQ stack (because we interrupted a hardirq
	 * handler) we can't do that and just have to keep using the
	 * current stack (which is the irq stack already after all)
	 */
	if (unlikely(curctx == irqctx)) //the current context is the hardirq context: already on the hardirq stack, so do not switch
		return 0;

	/*
	 * build the stack frame on the IRQ stack: isp is set to the address
	 * just past the end of irqctx, and the irq stack's thread_info takes
	 * over the current task and records the stack pointer to come back to.
	 */
	isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));
	irqctx->tinfo.task = curctx->tinfo.task;
	irqctx->tinfo.previous_esp = current_stack_pointer;

	/*
	 * Copy the softirq bits in preempt_count so that the
	 * softirq checks work in the hardirq context.
	 */
	irqctx->tinfo.preempt_count =
		(irqctx->tinfo.preempt_count & ~SOFTIRQ_MASK) |
		(curctx->tinfo.preempt_count & SOFTIRQ_MASK);

	if (unlikely(overflow))
		call_on_stack(print_stack_overflow, isp);

	/*
	 * Swap %esp with isp (held in %ebx), call desc->handle_irq through
	 * %edi with irq in %eax and desc in %edx, then restore %esp from %ebx.
	 */
	asm volatile("xchgl	%%ebx,%%esp	\n"
		     "call	*%%edi		\n"
		     "movl	%%ebx,%%esp	\n"
		     : "=a" (arg1), "=d" (arg2), "=b" (isp)
		     :  "0" (irq),   "1" (desc),  "2" (isp),
			"D" (desc->handle_irq)
		     : "memory", "cc", "ecx");
	return 1;
}

回到handle_irq中。

接下来我们要看的,就是真正用来处理中断事件的操作了:

		desc->handle_irq(irq, desc);
不要忘了,我们在中断队列初始化中做过的事情:
        for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)  
            irq_set_chip_and_handler_name(i, chip, handle_level_irq, name);  
也就是,handle_irq被初始化为handle_level_irq(电平触发中断)了。当然,还有一些公共通道是handle_edge_irq(边缘触发中断)(如有关时钟的等),但是,它们最终都是要调用handle_irq_event的,这才是我们关心的重点:
/**
 *	handle_level_irq - Level type irq handler
 *	@irq:	the interrupt number
 *	@desc:	the interrupt description structure for this irq
 *
 *	Level type interrupts are active as long as the hardware line has
 *	the active level. This may require to mask the interrupt and unmask
 *	it after the associated handler has acknowledged the device, so the
 *	interrupt line is back to inactive.
 *
 *	The whole body runs under desc->lock.
 */
void
handle_level_irq(unsigned int irq, struct irq_desc *desc)
{
	raw_spin_lock(&desc->lock);
	mask_ack_irq(desc);

	if (unlikely(irqd_irq_inprogress(&desc->irq_data))) //this irq is already marked as being handled: something may have gone wrong
		if (!irq_check_poll(desc))
			goto out_unlock;

	desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); //clear the REPLAY and WAITING state bits of desc
	kstat_incr_irqs_this_cpu(irq, desc);

	/*
	 * If its disabled or no action available
	 * keep it masked and get out of here
	 */
	if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
		goto out_unlock;

	handle_irq_event(desc);

	/* unmask again unless the irq got disabled or is marked ONESHOT */
	if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT))
		unmask_irq(desc);
out_unlock:
	raw_spin_unlock(&desc->lock);
}

首先来看一下CPU是如何给出响应信号的,也就是操作mask_ack_irq():它先通过chip的回调屏蔽并应答该中断,然后在irq_data的state_use_accessors中置上IRQD_IRQ_MASKED标志位,表示该中断已被屏蔽:
/*
 * Mask and acknowledge the interrupt through the chip callbacks: use the
 * combined irq_mask_ack if the chip provides it, otherwise irq_mask
 * followed by irq_ack (when present). Finally record the masked state.
 */
static inline void mask_ack_irq(struct irq_desc *desc)
{
	if (desc->irq_data.chip->irq_mask_ack)
		desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
	else {
		desc->irq_data.chip->irq_mask(&desc->irq_data);
		if (desc->irq_data.chip->irq_ack)
			desc->irq_data.chip->irq_ack(&desc->irq_data);
	}
	irq_state_set_masked(desc); //    d->state_use_accessors |= IRQD_IRQ_MASKED;

}
这里,要记得在中断队列初始化的时候,我们曾经跟踪过chip,结果chip是i8259A_chip。

现在,我们来看一下是如何进行中断服务的,也就是handle_irq_event()函数所完成的操作:

/*
 * Run the action chain of desc. desc->lock is dropped while
 * handle_irq_event_percpu() runs and taken again afterwards;
 * IRQD_IRQ_INPROGRESS is set for that whole period.
 */
irqreturn_t handle_irq_event(struct irq_desc *desc)
{
	struct irqaction *action = desc->action;
	irqreturn_t ret;

	desc->istate &= ~IRQS_PENDING;  //clear the PENDING bit. NOTE(article): PENDING is described as the means of preventing nesting on one interrupt source;
                                        //nesting on the same interrupt line should be avoided, see the referenced book, p.215.
	irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
	raw_spin_unlock(&desc->lock);

	ret = handle_irq_event_percpu(desc, action);

	raw_spin_lock(&desc->lock);
	irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); //clear the INPROGRESS flag once the service routines have finished
	return ret;
}

好了,最终我们来到了handle_irq_event_percpu(),也就是最终我们用来查找服务程序并进行服务的函数。一路走下来发现,这要比2.4麻烦得多!

/*
 * Call every handler on the action list of desc, starting at action, and
 * return the OR of their results.
 */
irqreturn_t
handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
{ 
	irqreturn_t retval = IRQ_NONE; //interrupt was not from this device.
	unsigned int random = 0, irq = desc->irq_data.irq;

	do {
		irqreturn_t res;
		trace_irq_handler_entry(irq, action);
		res = action->handler(irq, action->dev_id); //the service routine itself; it belongs to the device driver and is not examined in this article
		trace_irq_handler_exit(irq, action, res);

		/* a handler must not return with interrupts enabled; warn once and disable them again */
		if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
			      irq, action->handler))
			local_irq_disable();

		switch (res) {
		case IRQ_WAKE_THREAD:   /*handler requests to wake the handler thread*/
			/*
			 * Catch drivers which return WAKE_THREAD but
			 * did not set up a thread function
			 */
			if (unlikely(!action->thread_fn)) {
				warn_no_thread(irq, action);
				break;
			}

			irq_wake_thread(desc, action);

			/* Fall through to add to randomness */
		case IRQ_HANDLED:    /*interrupt was handled by this device*/
			random |= action->flags;
			break;

		default:
			break;
		}

		retval |= res;
		action = action->next;
	} while (action);

	if (random & IRQF_SAMPLE_RANDOM)  //if a handler on the list asked to contribute randomness to the system, call the function below
		add_interrupt_randomness(irq);

	if (!noirqdebug)
		note_interrupt(irq, desc, retval);
	return retval;
}
我们看到这个函数的主体是一个循环,在irq对应的desc指向的action服务队列中依次调用各个服务程序。那么,这里有个问题:如果action队列中有多个服务程序,那么每次有来自这个通道的中断请求时就要把队列依次遍历一遍,这样岂不是很浪费么?确实,这样不是最高效的方式,但是还好。首先,每个具体的服务程序都应该一开始就检查各自的中断源,一般都是读相应设备的中断状态寄存器,看是否是来自该设备的中断请求,如果不是就马上返回了,这个过程只需要几条指令;其次,每个队列中服务程序的数量一般不会太大。所以,不会有显著的影响。《情景分析》P218


好像我们已经完成了中断服务程序的过程。。。

没有,还没有!确切地说,只是从逻辑的角度来看完成了中断请求的服务。因为Linux内核将整个服务程序分割成了两个部分:第一部分必须立即执行,一般是在关中断的条件下执行的,并且必须是对每次请求都单独执行的;而另一部分,则可以稍后在开中断的条件下执行,并且可以将若干次中断服务中剩下来的部分合并起来执行。这些操作往往是很费时的,因此在关中断条件下执行是不合适的,会影响到其他的中断服务。《情景分析》

为了找到后一部分的操作,我们回到do_IRQ中,do_IRQ的最后,与最开始的irq_enter对应的会调用irq_exit,我们来看一下,这个函数做了什么:

/*
 * Exit an interrupt context. Process softirqs if needed and possible:
 */
void irq_exit(void)
{
	account_system_vtime(current); //account the vtime of the current task once more
	trace_hardirq_exit();
	sub_preempt_count(IRQ_EXIT_OFFSET); //take this interrupt's contribution back out of preempt_count
	if (!in_interrupt() && local_softirq_pending()) //no longer handling an interrupt and softirqs are pending
		invoke_softirq();                       //then run the softirqs: this is the deferred second half of interrupt handling

	rcu_irq_exit();
#ifdef CONFIG_NO_HZ
	/* Make sure that timer wheel updates are propagated */
	if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
		tick_nohz_stop_sched_tick(0);
#endif
	preempt_enable_no_resched();
}

关于中断服务的第二部分的实现,我们会在另外的专题中介绍。


回到common_interrupt中,看到在完成了中断服务程序以后,也就是调用do_IRQ结束以后,程序会跳转到ret_from_intr,我们来看一下:

ret_from_exception:                    # entry point when returning from an exception
	preempt_stop(CLBR_ANY)         
ret_from_intr:                         # entry point when returning from an interrupt
	GET_THREAD_INFO(%ebp)
check_userspace:
	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS and CS
	movb PT_CS(%esp), %al           # the low two bits of CS hold the privilege level
	andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax  # mask off everything except the VM flag and the privilege-level bits
	cmpl $USER_RPL, %eax            # did the interrupt happen in user space?
	jb resume_kernel		# not returning to v8086 or userspace: the interrupt happened in kernel space, go to resume_kernel
ENTRY(resume_userspace)                 # otherwise return to user space
        LOCKDEP_SYS_EXIT
        DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
                                        # setting need_resched or sigpending
                                        # between sampling and the iret
        TRACE_IRQS_OFF
        movl TI_flags(%ebp), %ecx
        andl $_TIF_WORK_MASK, %ecx      # is there any work to be done on
                                        # int/exception return?
        jne work_pending
        jmp restore_all
END(ret_from_exception)

 

我们先来看resume_kernel,在entry_32.S中有两个地方定义了它:

(1)如果不是抢占式的内核,那么

#define resume_kernel		restore_all
来看一下关键的restore_all:

restore_all:
	TRACE_IRQS_IRET
restore_all_notrace:
	movl PT_EFLAGS(%esp), %eax	# mix EFLAGS, SS and CS
	# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
	# are returning to the kernel.
	# See comments in process.c:copy_thread() for details.
	movb PT_OLDSS(%esp), %ah
	movb PT_CS(%esp), %al
	andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
	cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax  # was the task interrupted in user space, and does its SS come from its own LDT (local descriptor table)?
	CFI_REMEMBER_STATE
	je ldt_ss			# returning to user-space with LDT SS; NOTE(article): said to be rarely relevant, as the LDT is claimed to be unused outside VM mode
restore_nocheck:
	RESTORE_REGS 4			# skip orig_eax/error_code
irq_return:
	INTERRUPT_RETURN

我们将RESTORE_REGS展开:

/*
 * RESTORE_INT_REGS: pop the general-purpose registers, ebx first and eax
 * last, i.e. in the reverse of the order in which SAVE_ALL pushes them.
 */
.macro RESTORE_INT_REGS
	popl_cfi %ebx
	CFI_RESTORE ebx
	popl_cfi %ecx
	CFI_RESTORE ecx
	popl_cfi %edx
	CFI_RESTORE edx
	popl_cfi %esi
	CFI_RESTORE esi
	popl_cfi %edi
	CFI_RESTORE edi
	popl_cfi %ebp
	CFI_RESTORE ebp
	popl_cfi %eax
	CFI_RESTORE eax
.endm

/*
 * RESTORE_REGS: restore the general-purpose registers, then %ds, %es and
 * %fs (labels 1-3), then the gs slot via POP_GS, which also skips \pop
 * extra bytes. Each segment pop has a fixup (labels 4-6) registered in
 * __ex_table that zeroes the stack slot and retries the pop.
 */
.macro RESTORE_REGS pop=0
	RESTORE_INT_REGS
1:	popl_cfi %ds
	/*CFI_RESTORE ds;*/
2:	popl_cfi %es
	/*CFI_RESTORE es;*/
3:	popl_cfi %fs
	/*CFI_RESTORE fs;*/
	POP_GS \pop                  #    addl $(4 + \pop), %esp  skips the ~vector slot, which is no longer needed
.pushsection .fixup, "ax"
4:	movl $0, (%esp)
	jmp 1b
5:	movl $0, (%esp)
	jmp 2b
6:	movl $0, (%esp)
	jmp 3b
.section __ex_table, "a"
	.align 4
	.long 1b, 4b
	.long 2b, 5b
	.long 3b, 6b
.popsection
	POP_GS_EX
.endm
我们看到,这段程序的作用就是恢复中断前各寄存器的状态。

在此之后,将执行

	INTERRUPT_RETURN
也就是执行iret指令,返回到哪里呢?在执行iret的时候,系统堆栈已经恢复到刚进入中断门时的状态,而iret则使CPU从中断返回。跟进入中断时相对应,如果是从系统态返回到用户态,就会将当前堆栈切换到用户堆栈。

(2)如果是抢占式的内核,那么

#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
	DISABLE_INTERRUPTS(CLBR_ANY)
	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ? zero means preemption is allowed, anything else forbids it
	jnz restore_all                 # preemption not allowed: go to restore_all and prepare to return. (Article's open question: when exactly may the kernel be preempted, and when not?)
                                        # (Article's open question: the interrupt return is only a few instructions away, so why preempt here?)
need_resched:                           
	movl TI_flags(%ebp), %ecx	# need_resched set ?
	testb $_TIF_NEED_RESCHED, %cl
	jz restore_all                  # need_resched is not set: prepare to return from the interrupt
	testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)	# interrupts off (exception path) ?
	jz restore_all                  # interrupts were off in the interrupted context: prepare to return
	call preempt_schedule_irq       # kernel preemption: reschedule
	jmp need_resched                
END(resume_kernel)
#endif

preempt_schedule_irq的实现还是很简单的:
/*
 * this is the entry point to schedule() from kernel preemption
 * off of irq context.
 * Note, that this is called and return with irqs disabled. This will
 * protect us against recursive calling from irq.
 */
asmlinkage void __sched preempt_schedule_irq(void)
{
	struct thread_info *ti = current_thread_info();

	/* Catch callers which need to be fixed */
	BUG_ON(ti->preempt_count || !irqs_disabled());

	do {
		/* irqs are enabled only around schedule(); PREEMPT_ACTIVE is held in preempt_count meanwhile */
		add_preempt_count(PREEMPT_ACTIVE);
		local_irq_enable();
		schedule();
		local_irq_disable();
		sub_preempt_count(PREEMPT_ACTIVE);

		/*
		 * Check again in case we missed a preemption opportunity
		 * between schedule and now.
		 */
		barrier();
	} while (need_resched());
}

以上我们讲了中断发生在内核态的情况,而如果中断是发生在用户态的,就要执行如下代码了:

ENTRY(resume_userspace)                 # otherwise return to user space
        LOCKDEP_SYS_EXIT
        DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
                                        # setting need_resched or sigpending
                                        # between sampling and the iret
        TRACE_IRQS_OFF
        movl TI_flags(%ebp), %ecx
        andl $_TIF_WORK_MASK, %ecx      # is there any work to be done on
                                        # int/exception return? (anything left to do before going back to user space?)
        jne work_pending                # there is work left: go to work_pending
        jmp restore_all                 # otherwise return
END(ret_from_exception) 
我们来看一下work_pending:

work_pending:
	testb $_TIF_NEED_RESCHED, %cl
	jz work_notifysig              # no reschedule needed: go to work_notifysig
work_resched:
	call schedule
	LOCKDEP_SYS_EXIT
	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
					# setting need_resched or sigpending
					# between sampling and the iret
	TRACE_IRQS_OFF
	movl TI_flags(%ebp), %ecx
	andl $_TIF_WORK_MASK, %ecx	# is there any work to be done other
					# than syscall tracing?
	jz restore_all
	testb $_TIF_NEED_RESCHED, %cl
	jnz work_resched

work_notifysig:				# deal with pending signals and
					# notify-resume requests
#ifdef CONFIG_VM86                      # VM86 handling; not examined in this article
	testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
	movl %esp, %eax
	jne work_notifysig_v86		# returning to kernel-space or
					# vm86-space
	xorl %edx, %edx
	call do_notify_resume
	jmp resume_userspace_sig

	ALIGN
work_notifysig_v86:
	pushl_cfi %ecx			# save ti_flags for do_notify_resume
	call save_v86_state		# %eax contains pt_regs pointer
	popl_cfi %ecx
	movl %eax, %esp
#else
	movl %esp, %eax
#endif
	xorl %edx, %edx
	call do_notify_resume
	jmp resume_userspace_sig
END(work_pending)
看一下do_notify_resume函数:

/*
 * notification of userspace execution resumption
 * - triggered by the TIF_WORK_MASK flags
 *
 * Each pending-work bit in thread_info_flags is checked in turn and the
 * matching handler is called; the second parameter is not used.
 */
void
do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
{
#ifdef CONFIG_X86_MCE
	/* notify userspace of pending MCEs */
	if (thread_info_flags & _TIF_MCE_NOTIFY)
		mce_notify_process();
#endif /* CONFIG_X86_MCE */

	/* deal with pending signal delivery */
	if (thread_info_flags & _TIF_SIGPENDING)
		do_signal(regs);

	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
		clear_thread_flag(TIF_NOTIFY_RESUME);
		tracehook_notify_resume(regs);
		if (current->replacement_session_keyring)
			key_replace_session_keyring();
	}
	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
		fire_user_return_notifiers();

#ifdef CONFIG_X86_32
	clear_thread_flag(TIF_IRET);
#endif /* CONFIG_X86_32 */
}
到这里,在中断返回之前,我们还要处理相关的信号。如果thread_info_flags中的_TIF_SIGPENDING标志位被设置了,那么说明该进程有“信号”等待处理,要先处理了这些待处理的信号(调用do_signal)才会返回。当然这属于进程间通信的内容,我们以后才会讲到。

处理完要处理的信号之后,最终还是要跳转到restore_all,我们已经在前面细细地分析过了,就不赘述了。


到目前为止,我们终于完成了中断的处理,并返回到了被中断之前的状态,被中断的程序继续运行了(如果没有被重新调度的话)!


你可能感兴趣的:(thread,exception,vector,struct,action,returning)