In the Linux scheduler, schedule() both picks the next process to run and switches to it. Inside schedule(), the switch itself is carried out mainly by two functions:
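For orientation, the tail of schedule() in this generation of kernels (the code below matches roughly 2.6.32) calls the two in this order; this is an abridged excerpt, with the surrounding locking and tracing lines left out:

    if (likely(prev != next)) {
        sched_info_switch(prev, next);

        rq->nr_switches++;
        rq->curr = next;
        ++*switch_count;

        context_switch(rq, prev, next); /* unlocks the rq */
    } else
        spin_unlock_irq(&rq->lock);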
sched_info_switch(prev, next) updates the scheduling statistics of the outgoing and incoming processes and of their runqueue (rq). It does its work by calling __sched_info_switch():
/*
 * Called when tasks are switched involuntarily due, typically, to expiring
 * their time slice. (This may also be called when switching to or from
 * the idle task.) We are only called when prev != next.
 */
static inline void
__sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
    struct rq *rq = task_rq(prev);

    /*
     * prev now departs the cpu. It's not interesting to record
     * stats about how efficient we were at scheduling the idle
     * process, however.
     */
    if (prev != rq->idle)           /* the outgoing task is not the idle task */
        sched_info_depart(prev);    /* update prev's stats and those of its rq */

    if (next != rq->idle)           /* the incoming task is not the idle task */
        sched_info_arrive(next);    /* update next's stats and those of its rq */
}
/*
 * Called when a process ceases being the active-running process, either
 * voluntarily or involuntarily. Now we can calculate how long we ran.
 * Also, if the process is still in the TASK_RUNNING state, call
 * sched_info_queued() to mark that it has now again started waiting on
 * the runqueue.
 */
static inline void sched_info_depart(struct task_struct *t)
{
    /* how long the task has run since it last arrived on the cpu */
    unsigned long long delta = task_rq(t)->clock -
                               t->sched_info.last_arrival;

    /* add the cpu time this task received to the rq's running total */
    rq_sched_info_depart(task_rq(t), delta);

    /*
     * If the departing task is still runnable, record the rq clock in
     * sched_info.last_queued: last_queued is the time it last started
     * waiting on a runqueue, and it starts waiting again right now.
     */
    if (t->state == TASK_RUNNING)
        sched_info_queued(t);
}
/*
 * Called when a task finally hits the cpu. We can now calculate how
 * long it was waiting to run. We also note when it began so that we
 * can keep stats on how long its timeslice is.
 */
static void sched_info_arrive(struct task_struct *t)
{
    unsigned long long now = task_rq(t)->clock, delta = 0;

    if (t->sched_info.last_queued)      /* it was queued on a runqueue before being switched in */
        delta = now - t->sched_info.last_queued;    /* how long it waited in the queue */
    sched_info_reset_dequeued(t);       /* about to run, so clear last_queued */
    t->sched_info.run_delay += delta;   /* total time spent waiting on runqueues */
    t->sched_info.last_arrival = now;   /* timestamp of this arrival on the cpu */
    t->sched_info.pcount++;             /* one more timeslice on the cpu */

    /* update the matching fields in the rq's rq_sched_info */
    rq_sched_info_arrive(task_rq(t), delta);
}
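These per-task counters are visible from userspace: on kernels with schedstats enabled, /proc/<pid>/schedstat prints the cumulative time spent on the cpu, sched_info.run_delay and sched_info.pcount, i.e. exactly the fields maintained above. A minimal reader (plain userspace C, not kernel code):

#include <stdio.h>

int main(void)
{
    unsigned long long on_cpu, run_delay, pcount;
    FILE *f = fopen("/proc/self/schedstat", "r");

    /* three fields: time on cpu, time waiting on a runqueue
     * (sched_info.run_delay), timeslices run (sched_info.pcount) */
    if (!f || fscanf(f, "%llu %llu %llu",
                     &on_cpu, &run_delay, &pcount) != 3) {
        perror("/proc/self/schedstat");
        return 1;
    }
    printf("on_cpu=%llu run_delay=%llu pcount=%llu\n",
           on_cpu, run_delay, pcount);
    fclose(f);
    return 0;
}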
context_switch() performs the actual switch: the address space, the hardware state and the registers.
/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next)
{
    struct mm_struct *mm, *oldmm;

    prepare_task_switch(rq, prev, next);
    trace_sched_switch(rq, prev, next);
    mm = next->mm;
    oldmm = prev->active_mm;
    /*
     * For paravirt, this is coupled with an exit in switch_to to
     * combine the page table reload and the switch backend into
     * one hypercall.
     */
    arch_start_context_switch(prev);

    if (unlikely(!mm)) {            /* the incoming task has no mm of its own */
        next->active_mm = oldmm;    /* borrow the outgoing task's active_mm */
        atomic_inc(&oldmm->mm_count);   /* one more user, bump the refcount */
        /* mark the per-cpu cpu_tlbstate as LAZY */
        enter_lazy_tlb(oldmm, next);
    } else                          /* mm is not NULL: switch address spaces */
        switch_mm(oldmm, mm, next);

    if (unlikely(!prev->mm)) {
        /*
         * The outgoing task has no mm of its own: as seen above, its
         * active_mm was borrowed from a previously switched-out task,
         * so clear the pointer here and stash the borrowed mm in
         * rq->prev_mm (the reference is dropped later, in
         * finish_task_switch()).
         */
        prev->active_mm = NULL;
        rq->prev_mm = oldmm;
    }
    /*
     * Since the runqueue lock will be released by the next
     * task (which is an invalid locking op but in the case
     * of the scheduler it's an obvious special-case), so we
     * do an early lockdep release here:
     */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
    spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

    /* Here we just switch the register state and the stack. */
    switch_to(prev, next, prev);

    barrier();
    /*
     * this_rq must be evaluated again because prev may have moved
     * CPUs since it called schedule(), thus the 'rq' on its stack
     * frame will be invalid.
     */
    finish_task_switch(this_rq(), prev);
}
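The !mm branch is worth dwelling on: kernel threads have no address space of their own (mm == NULL), so they borrow the outgoing task's active_mm and take a reference on it. A minimal userspace model of that borrowing logic (the struct names here are simplified stand-ins, not the real kernel types):

#include <stdio.h>

struct mm   { int mm_count; const char *name; };
struct task { const char *comm; struct mm *mm; struct mm *active_mm; };

static void switch_mm_model(struct task *prev, struct task *next)
{
    struct mm *oldmm = prev->active_mm;

    if (!next->mm) {                /* kernel thread: borrow oldmm */
        next->active_mm = oldmm;
        oldmm->mm_count++;          /* models atomic_inc(&oldmm->mm_count) */
        printf("%s borrows '%s' (mm_count=%d)\n",
               next->comm, oldmm->name, oldmm->mm_count);
    } else {                        /* user task: real address-space switch */
        next->active_mm = next->mm;
        printf("switch_mm: '%s' -> '%s'\n", oldmm->name, next->mm->name);
    }
}

int main(void)
{
    struct mm   bash_mm = { 1, "bash_mm" };
    struct task bash    = { "bash",   &bash_mm, &bash_mm };
    struct task kswapd  = { "kswapd", NULL,     NULL     };

    switch_mm_model(&bash, &kswapd);    /* kernel thread borrows bash's mm */
    return 0;
}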
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
                             struct task_struct *tsk)
{
    unsigned cpu = smp_processor_id();

    if (likely(prev != next)) {
        /* stop flush ipis for the previous mm */
        /* clear this cpu's bit in the outgoing mm's cpu mask */
        cpumask_clear_cpu(cpu, mm_cpumask(prev));
#ifdef CONFIG_SMP
        /* update the per-cpu tlb state */
        percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
        percpu_write(cpu_tlbstate.active_mm, next);
#endif
        /* set this cpu's bit in the incoming mm's cpu mask */
        cpumask_set_cpu(cpu, mm_cpumask(next));

        /* Re-load page tables */
        load_cr3(next->pgd);    /* load the incoming mm's pgd into cr3 */

        /*
         * load the LDT, if the LDT is different:
         */
        if (unlikely(prev->context.ldt != next->context.ldt))
            load_LDT_nolock(&next->context);
    }
#ifdef CONFIG_SMP
    else {  /* both tasks use the same mm */
        percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
        BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);

        if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
            /* We were in lazy tlb mode and leave_mm disabled
             * tlb flush IPI delivery. We must reload CR3
             * to make sure to use no freed page tables.
             */
            load_cr3(next->pgd);
            load_LDT_nolock(&next->context);
        }
    }
#endif
}
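The else branch looks redundant at first glance: why reload CR3 when the mm did not change? The kernel comment hints at the answer; spelling the scenario out (my reading of the lazy-TLB protocol, as a comment sketch):

/*
 * 1. This cpu runs task T, then switches to a kernel thread, which
 *    borrows T's mm; the cpu enters TLBSTATE_LAZY.
 * 2. Another cpu modifies T's page tables and sends a flush IPI;
 *    the lazy cpu responds with leave_mm(), clearing itself from
 *    mm_cpumask(T->mm), so it receives no further flush IPIs.
 * 3. The kernel thread later switches back to T: prev == next, but
 *    the TLB may still cache entries for page tables freed in step 2,
 *    so CR3 must be reloaded to flush them.
 */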
The register-level switch itself is done by switch_to(): inline assembly saves and restores the registers and calls the C function __switch_to(); the actual stack and instruction-pointer switch happens in the assembly:
/*
 * Saving eflags is important. It switches not only IOPL between tasks,
 * it also protects other tasks from NT leaking through sysenter etc.
 */
#define switch_to(prev, next, last) \
do { \
    /* \
     * Context-switching clobbers all registers, so we clobber \
     * them explicitly, via unused output variables. \
     * (EAX and EBP is not listed because EBP is saved/restored \
     * explicitly for wchan access and EAX is the return value of \
     * __switch_to()) \
     */ \
    unsigned long ebx, ecx, edx, esi, edi; \
 \
    asm volatile("pushfl\n\t"            /* save    flags */ \
             "pushl %%ebp\n\t"           /* save    EBP   */ \
             "movl %%esp,%[prev_sp]\n\t" /* save    ESP   */ \
             "movl %[next_sp],%%esp\n\t" /* restore ESP   */ \
             "movl $1f,%[prev_ip]\n\t"   /* save    EIP   */ \
             /* push next_ip, then jmp below: when __switch_to() \
              * returns, it "returns" to next_ip -- for a task \
              * resuming here that is label 1, which completes \
              * the switch */ \
             "pushl %[next_ip]\n\t"      /* restore EIP   */ \
             __switch_canary \
             "jmp __switch_to\n"         /* regparm call  */ \
             "1:\t" \
             /* first instructions run as the new task */ \
             "popl %%ebp\n\t"            /* restore EBP   */ \
             "popfl\n"                   /* restore flags */ \
 \
             /* output parameters */ \
             : [prev_sp] "=m" (prev->thread.sp), \
               [prev_ip] "=m" (prev->thread.ip), \
               "=a" (last), \
 \
               /* clobbered output registers: */ \
               "=b" (ebx), "=c" (ecx), "=d" (edx), \
               "=S" (esi), "=D" (edi) \
 \
               __switch_canary_oparam \
 \
             /* input parameters: */ \
             : [next_sp]  "m" (next->thread.sp), \
               [next_ip]  "m" (next->thread.ip), \
 \
               /* regparm parameters for __switch_to(): */ \
               [prev]     "a" (prev), \
               [next]     "d" (next) \
 \
               __switch_canary_iparam \
 \
             : /* reloaded segment registers */ \
               "memory"); \
} while (0)
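The third macro parameter deserves a note: by the time a task is switched back in, the prev/next locals frozen on its kernel stack can be stale, which is why __switch_to() passes the real previous task back in %eax and the "=a" (last) constraint stores it. A timeline sketch (illustrative comment, not kernel code):

/*
 *   task A: switch_to(A, B, last)  -- A's stack freezes with prev == A
 *   ... B runs, later C runs, then C picks A ...
 *   task C: switch_to(C, A, last)  -- execution resumes inside A's
 *                                     frozen switch_to()
 *
 * A's locals still say "A -> B", but the task that actually ran
 * just before A is C. __switch_to() returns that task in %eax, so
 * after the macro, last == C and finish_task_switch() operates on
 * the right task.
 */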
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * We fsave/fwait so that an exception goes off at the right time
 * (as a call from the fsave or fwait in effect) rather than to
 * the wrong process. Lazy FP saving no longer makes any sense
 * with modern CPU's, and this simplifies a lot of things (SMP
 * and UP become the same).
 *
 * NOTE! We used to use the x86 hardware context switching. The
 * reason for not using it any more becomes apparent when you
 * try to recover gracefully from saved state that is no longer
 * valid (stale segment register values in particular). With the
 * hardware task-switch, there is no way to fix up bad state in
 * a reasonable manner.
 *
 * The fact that Intel documents the hardware task-switching to
 * be slow is a fairly red herring - this code is not noticeably
 * faster. However, there _is_ some room for improvement here,
 * so the performance issues may eventually be a valid point.
 * More important, however, is the fact that this allows us much
 * more flexibility.
 *
 * The return value (in %ax) will be the "prev" task after
 * the task-switch, and shows up in ret_from_fork in entry.S,
 * for example.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
    struct thread_struct *prev = &prev_p->thread,
                         *next = &next_p->thread;
    int cpu = smp_processor_id();
    struct tss_struct *tss = &per_cpu(init_tss, cpu);   /* init_tss is a per-cpu variable */
    bool preload_fpu;

    /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

    /*
     * If the task has used fpu the last 5 timeslices, just do a full
     * restore of the math state immediately to avoid the trap; the
     * chances of needing FPU soon are obviously high now
     */
    preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

    /* save the outgoing task's FPU registers */
    __unlazy_fpu(prev_p);

    /* we're going to use this soon, after a few expensive things */
    if (preload_fpu)
        prefetch(next->xstate);

    /*
     * Reload esp0: store next_p's kernel stack pointer in the esp0
     * field of this cpu's TSS; any user-to-kernel privilege
     * transition caused by the sysenter instruction copies this
     * address into the esp register.
     */
    load_sp0(tss, next);

    /*
     * Save away %gs. No need to save %fs, as it was saved on the
     * stack on entry. No need to save %es and %ds, as those are
     * always kernel segments while inside the kernel. Doing this
     * before setting the new TLS descriptors avoids the situation
     * where we temporarily have non-reloadable segments in %fs
     * and %gs. This could be an issue if the NMI handler ever
     * used %fs or %gs (it does not today), or if the kernel is
     * running inside of a hypervisor layer.
     */
    lazy_save_gs(prev->gs);

    /*
     * Load the incoming task's thread-local storage (TLS) descriptors
     * into this cpu's GDT; the three segment selectors are kept in
     * the tls_array field of the task descriptor.
     */
    load_TLS(next, cpu);

    /*
     * Restore IOPL if needed. In normal use, the flags restore
     * in the switch assembly will handle this. But if the kernel
     * is running virtualized at a non-zero CPL, the popf will
     * not restore flags, so it must be done in a separate step.
     */
    if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
        set_iopl_mask(next->iopl);

    /*
     * Now maybe handle debug registers and/or IO bitmaps
     */
    if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
                 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
        __switch_to_xtra(prev_p, next_p, tss);

    /* If we're going to preload the fpu context, make sure clts
       is run while we're batching the cpu state updates. */
    if (preload_fpu)
        clts();

    /*
     * Leave lazy mode, flushing any hypercalls made here.
     * This must be done before restoring TLS segments so
     * the GDT and LDT are properly updated, and must be
     * done before math_state_restore, so the TS bit is up
     * to date.
     */
    arch_end_context_switch(next_p);

    if (preload_fpu)
        __math_state_restore();     /* reload the FPU registers */

    /*
     * Restore %gs if needed (which is common)
     */
    if (prev->gs | next->gs)
        lazy_load_gs(next->gs);

    percpu_write(current_task, next_p);

    return prev_p;
}
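Every completed switch is charged to the outgoing task: schedule() increments its nvcsw or nivcsw counter depending on whether it blocked or was preempted, and the totals are visible from userspace via getrusage(). A small demo that forces a few voluntary switches by blocking:

#include <stdio.h>
#include <time.h>
#include <sys/resource.h>

int main(void)
{
    struct timespec ts = { 0, 1000000 };    /* 1 ms */
    struct rusage ru;

    /* each nanosleep() blocks, so prev->state != TASK_RUNNING in
     * schedule() and the switch is counted as voluntary (nvcsw) */
    for (int i = 0; i < 10; i++)
        nanosleep(&ts, NULL);

    getrusage(RUSAGE_SELF, &ru);
    printf("voluntary: %ld  involuntary: %ld\n",
           ru.ru_nvcsw, ru.ru_nivcsw);
    return 0;
}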
static inline void __unlazy_fpu(struct task_struct *tsk)
{
    /*
     * TS_USEDFPU lives in the status field of the thread_info
     * descriptor; it tells whether the task has used the
     * FPU/MMX/XMM registers during its current run on the cpu.
     */
    if (task_thread_info(tsk)->status & TS_USEDFPU) {
        /*
         * tsk executed FPU/MMX/SSE or SSE2 instructions during this
         * run, so the kernel must save the corresponding hardware
         * context.
         */
        __save_init_fpu(tsk);
        stts();
    } else
        tsk->fpu_counter = 0;
}
static inline void __save_init_fpu(struct task_struct *tsk)
{
    /* use xsave if the cpu supports it, otherwise fxsave */
    if (task_thread_info(tsk)->status & TS_XSAVE)
        xsave(tsk);
    else
        fxsave(tsk);

    clear_fpu_state(tsk);
    task_thread_info(tsk)->status &= ~TS_USEDFPU;   /* clear the TS_USEDFPU flag */
}
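Putting the last two functions together: the point of stts() is that after the FPU state is saved, CR0.TS is set, so the next FPU instruction raises a device-not-available fault and only then is the state actually restored (math_state_restore). A toy userspace model of that protocol (plain C, simulating the TS bit with a flag):

#include <stdbool.h>
#include <stdio.h>

static bool cr0_ts;         /* models the CR0.TS bit */

static void stts(void) { cr0_ts = true;  }  /* set TS after saving state  */
static void clts(void) { cr0_ts = false; }  /* clear TS once state loaded */

/* models a task executing an FPU instruction */
static void fpu_insn(int task)
{
    if (cr0_ts) {
        /* device-not-available fault -> math_state_restore() */
        printf("trap: restoring FPU state of task %d\n", task);
        clts();
    }
    printf("task %d executes an FPU instruction\n", task);
}

/* models __unlazy_fpu() for a task being switched out */
static void switch_away(int task, bool used_fpu)
{
    if (used_fpu) {
        printf("saving FPU state of task %d\n", task); /* __save_init_fpu() */
        stts();     /* the next FPU user will trap and reload */
    }
}

int main(void)
{
    fpu_insn(1);            /* task 1 uses the FPU (no trap: TS clear) */
    switch_away(1, true);   /* switch out task 1, save its FPU state   */
    fpu_insn(2);            /* task 2 traps first, then runs           */
    return 0;
}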