After the next process to run has been picked, and if the selected process is not the one currently running, a context switch is performed so that the selected process can execute:
        context_switch(rq, prev, next); /* unlocks the rq */
        /*
         * The context switch have flipped the stack from under us
         * and restored the local variables which were saved when
         * this task called schedule() in the past. prev == current
         * is still correct, but it can be moved to another cpu/rq.
         */

Clearly, context_switch() is the crucial function here.
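For context, here is an abridged, paraphrased sketch of the scheduler logic that surrounds this call (not verbatim kernel source; the exact code differs between kernel versions). context_switch() is reached only when the picked task differs from the current one, and afterwards the local cpu/rq variables are re-derived because the task may resume on another CPU:

/* Abridged, paraphrased sketch of the caller in schedule()/__schedule();
 * the names match the real code, but this is not a verbatim quote. */
        put_prev_task(rq, prev);
        next = pick_next_task(rq);

        if (likely(prev != next)) {
                rq->nr_switches++;
                rq->curr = next;
                ++*switch_count;

                context_switch(rq, prev, next); /* unlocks the rq */

                /* we may have been migrated; recompute cpu and rq */
                cpu = smp_processor_id();
                rq = cpu_rq(cpu);
        } else
                raw_spin_unlock_irq(&rq->lock);

With that in mind, here is context_switch() itself: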
/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next)
{
        struct mm_struct *mm, *oldmm;

        prepare_task_switch(rq, prev, next);
        /**
         * prepare_task_switch - prepare to switch tasks
         * @rq: the runqueue preparing to switch
         * @prev: the current task that is being switched out
         * @next: the task we are going to switch to.
         *
         * This is called with the rq lock held and interrupts off. It must
         * be paired with a subsequent finish_task_switch after the context
         * switch.
         *
         * prepare_task_switch sets up locking and calls architecture specific
         * hooks.
         */

        mm = next->mm;
        oldmm = prev->active_mm;
        /*
         * For paravirt, this is coupled with an exit in switch_to to
         * combine the page table reload and the switch backend into
         * one hypercall.
         */
        arch_start_context_switch(prev);

        if (!mm) {                       /* kernel thread: no user address space of its own */
                next->active_mm = oldmm; /* the kernel thread borrows the previous task's active_mm */
                atomic_inc(&oldmm->mm_count);
                enter_lazy_tlb(oldmm, next); /* tell the architecture that the user part of the
                                              * address space need not be switched; this
                                              * optimization is known as lazy TLB (see
                                              * Professional Linux Kernel Architecture) */
        } else
                switch_mm(oldmm, mm, next);  /* switch the mm (key operation 1) */

        if (!prev->mm) {                 /* the task being switched out is a kernel thread */
                prev->active_mm = NULL;
                rq->prev_mm = oldmm;     /* hand the borrowed oldmm back */
        }
        /*
         * Since the runqueue lock will be released by the next
         * task (which is an invalid locking op but in the case
         * of the scheduler it's an obvious special-case), so we
         * do an early lockdep release here:
         */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
        spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

        /* Here we just switch the register state and the stack. */
        switch_to(prev, next, prev);     /* switch stack and registers (key operation 2) */

        barrier();                       /* compiler barrier */
        /*
         * this_rq must be evaluated again because prev may have moved
         * CPUs since it called schedule(), thus the 'rq' on its stack
         * frame will be invalid.
         */
        finish_task_switch(this_rq(), prev);
}
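Note how the kernel-thread case pairs up: context_switch() takes an extra reference on the borrowed mm with atomic_inc(&oldmm->mm_count) and parks it in rq->prev_mm, and it is finish_task_switch(), running on the next task right after the switch, that drops that reference again. The following is an abridged, paraphrased sketch of finish_task_switch() from the same kernel era (not verbatim; several details are omitted):

/* Abridged sketch of finish_task_switch(); paraphrased, not verbatim. */
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
{
        struct mm_struct *mm = rq->prev_mm;  /* the mm borrowed by a kernel thread, if any */
        long prev_state;

        rq->prev_mm = NULL;
        prev_state = prev->state;

        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);        /* releases the rq lock taken before the switch */

        if (mm)
                mmdrop(mm);                  /* give back the reference taken in context_switch() */

        if (unlikely(prev_state == TASK_DEAD))
                put_task_struct(prev);       /* prev has exited; its task_struct can be freed
                                              * now that we no longer run on its stack */
}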
First, let's look at the first key operation in the context switch, switch_mm():

static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
                             struct task_struct *tsk)
{
        unsigned cpu = smp_processor_id();

        if (likely(prev != next)) {
#ifdef CONFIG_SMP
                percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
                percpu_write(cpu_tlbstate.active_mm, next);
#endif
                cpumask_set_cpu(cpu, mm_cpumask(next));

                /* Re-load page tables */
                load_cr3(next->pgd);   /* load the next process's page directory pointer into CR3 */

                /* stop flush ipis for the previous mm */
                cpumask_clear_cpu(cpu, mm_cpumask(prev));

                /*
                 * load the LDT, if the LDT is different:
                 * (Linux rarely uses the LDT, so we will not discuss it)
                 */
                if (unlikely(prev->context.ldt != next->context.ldt))
                        load_LDT_nolock(&next->context);
        }
#ifdef CONFIG_SMP
        else {
                percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
                BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);

                if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
                        /* We were in lazy tlb mode and leave_mm disabled
                         * tlb flush IPI delivery. We must reload CR3
                         * to make sure to use no freed page tables.
                         */
                        load_cr3(next->pgd);
                        load_LDT_nolock(&next->context);
                }
        }
#endif
}
Although this function is important, what it does looks quite simple: it merely loads next's page directory pointer into the CR3 register, and the mm switch is done. That raises a question: the process itself has not been switched yet, but the page tables in use have already changed; can this cause problems? It cannot, because the switch is performed while executing in kernel space, and the kernel part of the address space is shared: every process's page tables map kernel addresses identically, so the kernel code and the kernel stack we are currently running on look exactly the same before and after load_cr3().
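Why is the kernel part identical in every process? Because when a process's page directory is created, the kernel entries are simply copied from the reference page table swapper_pg_dir. A heavily abridged paraphrase of the x86 pgd_ctor() (details vary with the paging mode and kernel version) makes the point:

/* Abridged paraphrase of pgd_ctor() in arch/x86/mm/pgtable.c; not verbatim. */
static void pgd_ctor(pgd_t *pgd)
{
        /*
         * Copy the kernel-space entries from the reference page table
         * (swapper_pg_dir), so every process shares the same mappings
         * above PAGE_OFFSET.
         */
        clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
                        swapper_pg_dir + KERNEL_PGD_BOUNDARY,
                        KERNEL_PGD_PTRS);
        /* ... PAE and bookkeeping details omitted ... */
}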
Next comes the other core operation, switch_to(), which switches the kernel stacks of the two processes:
/*
 * Saving eflags is important. It switches not only IOPL between tasks,
 * it also protects other tasks from NT leaking through sysenter etc.
 */
#define switch_to(prev, next, last)                                     \
do {                                                                    \
        /*                                                              \
         * Context-switching clobbers all registers, so we clobber     \
         * them explicitly, via unused output variables.                \
         * (EAX and EBP is not listed because EBP is saved/restored    \
         * explicitly for wchan access and EAX is the return value of  \
         * __switch_to())                                               \
         */                                                             \
        unsigned long ebx, ecx, edx, esi, edi;                          \
                                                                        \
        asm volatile("pushfl\n\t"               /* save    flags */     \
                     "pushl %%ebp\n\t"          /* save    EBP   */     \
                     "movl %%esp,%[prev_sp]\n\t" /* save    ESP   */    \
                     "movl %[next_sp],%%esp\n\t" /* restore ESP   */    \
                     "movl $1f,%[prev_ip]\n\t"  /* save    EIP   */     \
                     "pushl %[next_ip]\n\t"     /* restore EIP   */     \
                     __switch_canary                                    \
                     "jmp __switch_to\n"        /* regparm call  */     \
                     "1:\t"                                             \
                     "popl %%ebp\n\t"           /* restore EBP   */     \
                     "popfl\n"                  /* restore flags */     \
                                                                        \
                     /* output parameters */                            \
                     : [prev_sp] "=m" (prev->thread.sp),                \
                       [prev_ip] "=m" (prev->thread.ip),                \
                       "=a" (last),                                     \
                                                                        \
                       /* clobbered output registers: */                \
                       "=b" (ebx), "=c" (ecx), "=d" (edx),              \
                       "=S" (esi), "=D" (edi)                           \
                                                                        \
                       __switch_canary_oparam                           \
                                                                        \
                       /* input parameters: */                          \
                     : [next_sp]  "m" (next->thread.sp),                \
                       [next_ip]  "m" (next->thread.ip),                \
                                                                        \
                       /* regparm parameters for __switch_to(): */      \
                       [prev]     "a" (prev),                           \
                       [next]     "d" (next)                            \
                                                                        \
                       __switch_canary_iparam                           \
                                                                        \
                     : /* reloaded segment registers */                 \
                       "memory");                                       \
} while (0)
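Before going through the assembly, it may help to see the same idea in user space. The sketch below uses the POSIX ucontext API (getcontext/makecontext/swapcontext) to build a second "task" with its own stack and hop between the two; the kernel's switch_to() does essentially the same saving and restoring of the stack pointer and resume address, only by hand in inline assembly. This is purely illustrative and is not kernel code:

/* A minimal user-space analogue of switch_to(): each "task" owns its own
 * stack, and swapcontext() saves the current SP/IP and restores another's. */
#include <stdio.h>
#include <stdlib.h>
#include <ucontext.h>

static ucontext_t main_ctx, task_ctx;

static void task_func(void)
{
        printf("running on the task's own stack\n");
        swapcontext(&task_ctx, &main_ctx);   /* "schedule" back to main */
        printf("resumed where we left off\n");
}

int main(void)
{
        char *stack = malloc(64 * 1024);

        getcontext(&task_ctx);
        task_ctx.uc_stack.ss_sp = stack;
        task_ctx.uc_stack.ss_size = 64 * 1024;
        task_ctx.uc_link = &main_ctx;        /* return here when task_func ends */
        makecontext(&task_ctx, task_func, 0);

        swapcontext(&main_ctx, &task_ctx);   /* first "context switch" */
        printf("back in main, switching the task in again\n");
        swapcontext(&main_ctx, &task_ctx);   /* task resumes after its swapcontext */

        free(stack);
        return 0;
}

Each swapcontext() call saves the current stack pointer and resume address into one context and loads them from the other, which is exactly the prev->thread.sp / next->thread.sp and prev->thread.ip / next->thread.ip dance the macro above performs.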
A detailed, line-by-line explanation of this macro can be found in 《情景分析》, p. 374. Here we only point out the key instruction:
"movl %[next_sp],%%esp\n\t" /* restore ESP */ \
completes the stack switch: ESP is loaded from next->thread.sp (the old value having just been saved into prev->thread.sp), so from this point on we are already running on the next process's kernel stack. Now let's see what the __switch_to() function does.
/*
 *      switch_to(x,y) should switch tasks from x to y.
 *
 *      We fsave/fwait so that an exception goes off at the right time
 *      (as a call from the fsave or fwait in effect) rather than to
 *      the wrong process. Lazy FP saving no longer makes any sense
 *      with modern CPU's, and this simplifies a lot of things (SMP
 *      and UP become the same).
 *
 *      NOTE! We used to use the x86 hardware context switching. The
 *      reason for not using it any more becomes apparent when you
 *      try to recover gracefully from saved state that is no longer
 *      valid (stale segment register values in particular). With the
 *      hardware task-switch, there is no way to fix up bad state in
 *      a reasonable manner.
 *
 *      The fact that Intel documents the hardware task-switching to
 *      be slow is a fairly red herring - this code is not noticeably
 *      faster. However, there _is_ some room for improvement here,
 *      so the performance issues may eventually be a valid point.
 *      More important, however, is the fact that this allows us much
 *      more flexibility.
 *
 *      The return value (in %ax) will be the "prev" task after
 *      the task-switch, and shows up in ret_from_fork in entry.S,
 *      for example.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
        struct thread_struct *prev = &prev_p->thread,
                             *next = &next_p->thread;
        int cpu = smp_processor_id();
        struct tss_struct *tss = &per_cpu(init_tss, cpu);
        bool preload_fpu;

        /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

        /*
         * If the task has used fpu the last 5 timeslices, just do a full
         * restore of the math state immediately to avoid the trap; the
         * chances of needing FPU soon are obviously high now
         */
        preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

        __unlazy_fpu(prev_p);

        /* we're going to use this soon, after a few expensive things */
        if (preload_fpu)
                prefetch(next->fpu.state);

        /*
         * Reload esp0.
         */
        load_sp0(tss, next);

        /*
         * Save away %gs. No need to save %fs, as it was saved on the
         * stack on entry. No need to save %es and %ds, as those are
         * always kernel segments while inside the kernel. Doing this
         * before setting the new TLS descriptors avoids the situation
         * where we temporarily have non-reloadable segments in %fs
         * and %gs. This could be an issue if the NMI handler ever
         * used %fs or %gs (it does not today), or if the kernel is
         * running inside of a hypervisor layer.
         */
        lazy_save_gs(prev->gs);

        /*
         * Load the per-thread Thread-Local Storage descriptor.
         */
        load_TLS(next, cpu);

        /*
         * Restore IOPL if needed. In normal use, the flags restore
         * in the switch assembly will handle this. But if the kernel
         * is running virtualized at a non-zero CPL, the popf will
         * not restore flags, so it must be done in a separate step.
         */
        if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
                set_iopl_mask(next->iopl);

        /*
         * Now maybe handle debug registers and/or IO bitmaps
         */
        if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
                     task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
                __switch_to_xtra(prev_p, next_p, tss);

        /* If we're going to preload the fpu context, make sure clts
           is run while we're batching the cpu state updates. */
        if (preload_fpu)
                clts();

        /*
         * Leave lazy mode, flushing any hypercalls made here.
         * This must be done before restoring TLS segments so
         * the GDT and LDT are properly updated, and must be
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
        arch_end_context_switch(next_p);

        if (preload_fpu)
                __math_state_restore();

        /*
         * Restore %gs if needed (which is common)
         */
        if (prev->gs | next->gs)
                lazy_load_gs(next->gs);

        percpu_write(current_task, next_p);

        return prev_p;
}
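One last detail, hinted at by the comment about ret_from_fork above: how does a brand-new task get through this path the first time? When a task is created, copy_thread() initializes its thread.sp and thread.ip, so the very first switch_to()/__switch_to() into it "returns" into ret_from_fork rather than into the middle of schedule(). The following is a heavily abridged paraphrase of the 32-bit copy_thread() (not verbatim; fields and details differ across kernel versions):

/* Abridged paraphrase of copy_thread() in arch/x86/kernel/process_32.c. */
int copy_thread(unsigned long clone_flags, unsigned long sp,
                unsigned long unused,
                struct task_struct *p, struct pt_regs *regs)
{
        struct pt_regs *childregs = task_pt_regs(p);

        *childregs = *regs;       /* child starts with a copy of the parent's user registers */
        childregs->ax = 0;        /* which is why fork() returns 0 in the child */
        childregs->sp = sp;

        p->thread.sp = (unsigned long) childregs;      /* initial kernel stack pointer */
        p->thread.ip = (unsigned long) ret_from_fork;  /* where the first switch_to() will "resume" */

        /* ... TLS, io bitmap and error handling omitted ... */
        return 0;
}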