linux进程调度之进程切换

 linux调度中,在schedule函数中完成选择下一个进行、进程间切换进程的切换在schedule函数中主要由两个函数完成:

 sched_info_switch(prev, next);主要是更新切换出去和进来进程以及对应rq的相关变量。该函数主要调用__sched_info_switch函数来实现。

/*
 * Called when tasks are switched involuntarily due, typically, to expiring
 * their time slice.  (This may also be called when switching to or from
 * the idle task.)  We are only called when prev != next.
 */
static inline void
__sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
	struct rq *rq = task_rq(prev);

	/*
	 * prev now departs the cpu.  It's not interesting to record
	 * stats about how efficient we were at scheduling the idle
	 * process, however.
	 */
	if (prev != rq->idle)/*如果被切换出去的进程不是idle进程*/
		sched_info_depart(prev);/*更新prev进程和他对应rq的相关变量*/

	if (next != rq->idle)/*如果切换进来的进程不是idle进程*/
		sched_info_arrive(next);/*更新next进程和对应队列的相关变量*/
}
/*
 * Called when a process ceases being the active-running process, either
 * voluntarily or involuntarily.  Now we can calculate how long we ran.
 * Also, if the process is still in the TASK_RUNNING state, call
 * sched_info_queued() to mark that it has now again started waiting on
 * the runqueue.
 */
static inline void sched_info_depart(struct task_struct *t)
{
	/*计算在进程在rq中运行的时间长度*/
	unsigned long long delta = task_rq(t)->clock -
					t->sched_info.last_arrival;
	/*更新RunQueue中的Task所得到CPU執行
	時間的累加值.*/
	rq_sched_info_depart(task_rq(t), delta);
	
	/*如果被切换出去进程的状态是运行状态
	那么将进程sched_info.last_queued设置为rq的clock
	last_queued为最后一次排队等待运行的时间*/
	if (t->state == TASK_RUNNING)
		sched_info_queued(t);
}
/*
 * Called when a task finally hits the cpu.  We can now calculate how
 * long it was waiting to run.  We also note when it began so that we
 * can keep stats on how long its timeslice is.
 */
static void sched_info_arrive(struct task_struct *t)
{
	unsigned long long now = task_rq(t)->clock, delta = 0;

	if (t->sched_info.last_queued)/*如果被切换进来前在运行进程中排队*/
		delta = now - t->sched_info.last_queued;/*计算排队等待的时间长度*/
	sched_info_reset_dequeued(t);/*因为进程将被切换进来运行,设定last_queued为0*/
	t->sched_info.run_delay += delta;/*更新进程在运行队列里面等待的时间*/
	t->sched_info.last_arrival = now;/*更新最后一次运行的时间*/
	t->sched_info.pcount++;/*cpu上运行的次数加一*/
	/*更新rq中rq_sched_info中的对应的变量*/
	rq_sched_info_arrive(task_rq(t), delta);
}

context_switch函数完成主要的硬件、寄存器等实际的切换工作。

/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);
	trace_sched_switch(rq, prev, next);
	mm = next->mm;
	oldmm = prev->active_mm;
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	if (unlikely(!mm)) {/*如果被切换进来的进程的mm为空*/
		next->active_mm = oldmm;/*将共享切换出去进程的active_mm*/
		atomic_inc(&oldmm->mm_count);/*有一个进程共享,所有引用计数加一*/
		/*将per cpu变量cpu_tlbstate状态设为LAZY*/
		enter_lazy_tlb(oldmm, next);
	} else/*如果mm不会空,那么进行mm切换*/
		switch_mm(oldmm, mm, next);

	if (unlikely(!prev->mm)) {/*如果切换出去的mm为空,从上面
	可以看出本进程的active_mm为共享先前切换出去的进程
	的active_mm,所有需要在这里置空*/
		prev->active_mm = NULL;
		rq->prev_mm = oldmm; /*更新rq的前一个mm结构*/
	}
	/*
	 * Since the runqueue lock will be released by the next
	 * task (which is an invalid locking op but in the case
	 * of the scheduler it's an obvious special-case), so we
	 * do an early lockdep release here:
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);

	barrier();
	/*
	 * this_rq must be evaluated again because prev may have moved
	 * CPUs since it called schedule(), thus the 'rq' on its stack
	 * frame will be invalid.
	 */
	finish_task_switch(this_rq(), prev);
}
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
			     struct task_struct *tsk)
{
	unsigned cpu = smp_processor_id();

	if (likely(prev != next)) {
		/* stop flush ipis for the previous mm */
		/*将被替换进程使用的内存描述结构的CPU
		掩码中当前处理器号对应的位码清0*/
		cpumask_clear_cpu(cpu, mm_cpumask(prev));
#ifdef CONFIG_SMP
		/*设置per cpu变量tlb*/
		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		percpu_write(cpu_tlbstate.active_mm, next);
#endif
		/*将要被调度运行进程拥有的内存描述结构
		的CPU掩码中当前处理器号对应的位码设置为1*/
		cpumask_set_cpu(cpu, mm_cpumask(next));

		/* Re-load page tables */
		load_cr3(next->pgd);/*将切换进来进程的pgd load到cr3寄存器*/

		/*
		 * load the LDT, if the LDT is different:
		 */
		if (unlikely(prev->context.ldt != next->context.ldt))
			load_LDT_nolock(&next->context);
	}
#ifdef CONFIG_SMP
	else {/*如果切换的两个进程相同*/
		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);

		if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
			/* We were in lazy tlb mode and leave_mm disabled
			 * tlb flush IPI delivery. We must reload CR3
			 * to make sure to use no freed page tables.
			 */
			load_cr3(next->pgd);
			load_LDT_nolock(&next->context);
		}
	}
#endif
}


具体寄存器相关的切换由函数switch_to完成,改函数用汇编代码保持各种寄存器的值,然后调用c函数__switch_to,

汇编中实现了具体的切换:

/*
 * Saving eflags is important. It switches not only IOPL between tasks,
 * it also protects other tasks from NT leaking through sysenter etc.
 */
#define switch_to(prev, next, last)					\
do {									\
	/*								\
	 * Context-switching clobbers all registers, so we clobber	\
	 * them explicitly, via unused output variables.		\
	 * (EAX and EBP is not listed because EBP is saved/restored	\
	 * explicitly for wchan access and EAX is the return value of	\
	 * __switch_to())						\
	 */								\
	unsigned long ebx, ecx, edx, esi, edi;				\
									\
	asm volatile("pushfl\n\t"		/* save    flags */	\
		     "pushl %%ebp\n\t"		/* save    EBP   */	\
		     "movl %%esp,%[prev_sp]\n\t"	/* save    ESP   */ \
		     "movl %[next_sp],%%esp\n\t"	/* restore ESP   */ \
		     "movl $1f,%[prev_ip]\n\t"	/* save    EIP   */	\
		/*将next_ip入栈,下面用jmp跳转,这样
		返回到标号1时就切换过来了*/
		     "pushl %[next_ip]\n\t"	/* restore EIP   */	\
		     __switch_canary					\
		     "jmp __switch_to\n"	/* regparm call  */	\
		     "1:\t"						\
		     /*切换到新进程的第一条指令*/
		     "popl %%ebp\n\t"		/* restore EBP   */	\
		     "popfl\n"			/* restore flags */	\
									\
		     /* output parameters */				\
		     : [prev_sp] "=m" (prev->thread.sp),		\
		       [prev_ip] "=m" (prev->thread.ip),		\
		       "=a" (last),					\
									\
		       /* clobbered output registers: */		\
		       "=b" (ebx), "=c" (ecx), "=d" (edx),		\
		       "=S" (esi), "=D" (edi)				\
		       							\
		       __switch_canary_oparam				\
									\
		       /* input parameters: */				\
		     : [next_sp]  "m" (next->thread.sp),		\
		       [next_ip]  "m" (next->thread.ip),		\
		       							\
		       /* regparm parameters for __switch_to(): */	\
		       [prev]     "a" (prev),				\
		       [next]     "d" (next)				\
									\
		       __switch_canary_iparam				\
									\
		     : /* reloaded segment registers */			\
			"memory");					\
} while (0)

 

/*
 *	switch_to(x,yn) should switch tasks from x to y.
 *
 * We fsave/fwait so that an exception goes off at the right time
 * (as a call from the fsave or fwait in effect) rather than to
 * the wrong process. Lazy FP saving no longer makes any sense
 * with modern CPU's, and this simplifies a lot of things (SMP
 * and UP become the same).
 *
 * NOTE! We used to use the x86 hardware context switching. The
 * reason for not using it any more becomes apparent when you
 * try to recover gracefully from saved state that is no longer
 * valid (stale segment register values in particular). With the
 * hardware task-switch, there is no way to fix up bad state in
 * a reasonable manner.
 *
 * The fact that Intel documents the hardware task-switching to
 * be slow is a fairly red herring - this code is not noticeably
 * faster. However, there _is_ some room for improvement here,
 * so the performance issues may eventually be a valid point.
 * More important, however, is the fact that this allows us much
 * more flexibility.
 *
 * The return value (in %ax) will be the "prev" task after
 * the task-switch, and shows up in ret_from_fork in entry.S,
 * for example.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);/*init_tss为一个per cpu变量*/
	bool preload_fpu;

	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

	/*
	 * If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;
	/*保存FPU寄存器*/
	__unlazy_fpu(prev_p);

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->xstate);

	/*
	 * Reload esp0.
	 */
	 /*吧next_p->thread.esp0装入对应于本地cpu的tss的esp0
	字段;任何由sysenter汇编指令产生的从用户态
	到内核态的特权级转换将把这个地址拷贝到
	esp寄存器中*/
	load_sp0(tss, next);

	/*
	 * Save away %gs. No need to save %fs, as it was saved on the
	 * stack on entry.  No need to save %es and %ds, as those are
	 * always kernel segments while inside the kernel.  Doing this
	 * before setting the new TLS descriptors avoids the situation
	 * where we temporarily have non-reloadable segments in %fs
	 * and %gs.  This could be an issue if the NMI handler ever
	 * used %fs or %gs (it does not today), or if the kernel is
	 * running inside of a hypervisor layer.
	 */
	lazy_save_gs(prev->gs);

	/*
	 * Load the per-thread Thread-Local Storage descriptor.
	 */
	 /*把next进程使用的县城局部存储(TLS)段装入本地CPU
	 的全局描述符表;三个段选择符保存在进程描述符
	 内的tls_array数组中*/
	load_TLS(next, cpu);

	/*
	 * Restore IOPL if needed.  In normal use, the flags restore
	 * in the switch assembly will handle this.  But if the kernel
	 * is running virtualized at a non-zero CPL, the popf will
	 * not restore flags, so it must be done in a separate step.
	 */
	if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
		set_iopl_mask(next->iopl);

	/*
	 * Now maybe handle debug registers and/or IO bitmaps
	 */
	if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
		     task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If we're going to preload the fpu context, make sure clts
	   is run while we're batching the cpu state updates. */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	if (preload_fpu)
		__math_state_restore();/*装载FPU寄存器*/

	/*
	 * Restore %gs if needed (which is common)
	 */
	if (prev->gs | next->gs)
		lazy_load_gs(next->gs);

	percpu_write(current_task, next_p);

	return prev_p;
}
static inline void __unlazy_fpu(struct task_struct *tsk)
{	/*包含在thread_info描述符的status字段中的
	TS_USEDFPU标志。他表示进程在当前执行的过程中
	是否使用过FPU/MMU/XMM寄存器*/
	if (task_thread_info(tsk)->status & TS_USEDFPU) {
		/*由于tsk在这次执行中使用了FPU/MMX/SSE或
		SSE2指令;因此内核必须保存相关的硬件
		上下文*/
		__save_init_fpu(tsk);
		stts();
	} else
		tsk->fpu_counter = 0;
}


static inline void __save_init_fpu(struct task_struct *tsk)
{	/*如果CPU使用SSE/SSE2扩展,则*/
	if (task_thread_info(tsk)->status & TS_XSAVE)
		xsave(tsk);
	else
		fxsave(tsk);

	clear_fpu_state(tsk);
	task_thread_info(tsk)->status &= ~TS_USEDFPU;/*重置TS_USEDFPU标志*/
}



你可能感兴趣的:(linux进程调度之进程切换)