After the scheduler has picked the next runnable process, it cannot simply start executing it right away; a number of multitasking-related chores have to be handled first, and together these chores make up the context switch.
The scheduling function schedule(), whose main job is to pick the next runnable process from the runqueue, contains the following fragment:
asmlinkage void __sched schedule(void)
{
    struct rq *rq;    /* the runqueue */
    ......
    if (likely(prev != next)) {
        rq->nr_switches++;
        rq->curr = next;
        ++*switch_count;

        context_switch(rq, prev, next); /* unlocks the rq */
    } else
        spin_unlock_irq(&rq->lock);
    ......
}
Here prev is the process that was just running and next is the process about to be scheduled in. Only when the two differ does context_switch() run, the function that performs the actual context switch. In other words, if the process that just gave up the CPU is immediately picked again, no context needs to be switched at all.
Before diving into the source, this post first sketches what a context switch actually has to accomplish. What follows is my own understanding from reading the code, so corrections and criticism are welcome.
1. The code
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
               struct task_struct *next)
{
    struct mm_struct *mm, *oldmm;

    prepare_task_switch(rq, prev, next);
    mm = next->mm;
    oldmm = prev->active_mm;
    /*
     * For paravirt, this is coupled with an exit in switch_to to
     * combine the page table reload and the switch backend into
     * one hypercall.
     */
    arch_enter_lazy_cpu_mode();

    if (unlikely(!mm)) {
        next->active_mm = oldmm;
        atomic_inc(&oldmm->mm_count);
        enter_lazy_tlb(oldmm, next);
    } else
        switch_mm(oldmm, mm, next);

    if (unlikely(!prev->mm)) {
        prev->active_mm = NULL;
        rq->prev_mm = oldmm;
    }
    /*
     * Since the runqueue lock will be released by the next
     * task (which is an invalid locking op but in the case
     * of the scheduler it's an obvious special-case), so we
     * do an early lockdep release here:
     */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
    spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

    /* Here we just switch the register state and the stack. */
    switch_to(prev, next, prev);

    barrier();
    /*
     * this_rq must be evaluated again because prev may have moved
     * CPUs since it called schedule(), thus the 'rq' on its stack
     * frame will be invalid.
     */
    finish_task_switch(this_rq(), prev);
}

The work done here falls into two parts: switching the memory-management context, and switching the processor state and the kernel stack. These two parts are discussed below.
2. Switching the memory-management context
This work is architecture-specific. It mainly consists of loading the page tables, flushing the translation lookaside buffer (TLB), and supplying the memory-management unit with the new information.
    if (unlikely(!mm)) {
        next->active_mm = oldmm;
        atomic_inc(&oldmm->mm_count);
        enter_lazy_tlb(oldmm, next);
    } else
        switch_mm(oldmm, mm, next);
For a kernel thread, the mm field of task_struct is NULL: it has no memory context of its own because it never accesses user space. So when a kernel thread is scheduled in, the kernel deliberately leaves the memory-management state alone, since the incoming thread will never use it; and if the next process to run afterwards turns out to be the previous one, all of that state is still valid. Instead, the kernel thread's active_mm is pointed at the previous task's active_mm, and it simply runs on that borrowed address space.
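A minimal sketch of that rule, purely for illustration (is_kernel_thread is my own name, not something in the kernel source):

/* Illustration only: per the rule above, a task with a NULL mm is a kernel
 * thread and runs on whatever mm it has borrowed into active_mm. */
static inline int is_kernel_thread(const struct task_struct *tsk)
{
    return tsk->mm == NULL;    /* user processes always own an mm */
}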
The enter_lazy_tlb() function notifies the architecture-specific layer that the user-space part of the virtual address space does not need to be switched, and lazy-TLB handling takes over instead. It is implemented as follows:
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
#ifdef CONFIG_SMP
    unsigned cpu = smp_processor_id();

    if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
        per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
#endif
}

To see what this function does, we first need the cpu_tlbstate variable, which is a per-CPU variable; let's look at how it is built.
cpu_tlbstate is defined as a per-CPU variable:
DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate) ____cacheline_aligned = { &init_mm, 0, };

#define DEFINE_PER_CPU(type, name) \
    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name

This defines a variable struct tlb_state per_cpu__cpu_tlbstate and, at compile time, places it in the .data.percpu section.
Next, the per_cpu macro:
#define per_cpu(var, cpu) (*({                          \
    extern int simple_identifier_##var(void);           \
    RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]); }))

#define RELOC_HIDE(ptr, off)                    \
({  unsigned long __ptr;                        \
    __ptr = (unsigned long) (ptr);              \
    (typeof(ptr)) (__ptr + (off)); })

__per_cpu_offset[cpu] supplies an offset, so indexing by the CPU number (smp_processor_id()) locates that CPU's own copy of the variable, a struct tlb_state instance with the following layout:
struct tlb_state {
    struct mm_struct *active_mm;
    int state;
    char __cacheline_padding[L1_CACHE_BYTES-8];
};

enter_lazy_tlb() then simply sets this CPU's state to TLBSTATE_LAZY.
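As an aside, any per-CPU variable is defined and accessed the same way. The fragment below is purely my own illustration (my_counter and touch_counter are made-up names, not kernel code), showing the DEFINE_PER_CPU / per_cpu pairing that cpu_tlbstate relies on:

/* Illustration only: a made-up per-CPU counter, accessed the same way
 * enter_lazy_tlb() accesses cpu_tlbstate. */
DEFINE_PER_CPU(int, my_counter);

static void touch_counter(void)
{
    unsigned cpu = smp_processor_id();

    /* per_cpu() resolves to this CPU's private copy via __per_cpu_offset[cpu] */
    per_cpu(my_counter, cpu)++;
}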
As noted above, a running kernel thread has no user address space of its own; it runs on whatever process's address space happens to be loaded, i.e. it borrows it. Once it has finished running, the borrowed state has to be handed back:
    if (unlikely(!prev->mm)) {
        prev->active_mm = NULL;
        rq->prev_mm = oldmm;
    }
If the mm field is not NULL (the incoming task is a normal user process), switch_mm() is executed instead. This function descends into CPU-specific code, but its main work is the following:
static inline void switch_mm(struct mm_struct *prev,
                             struct mm_struct *next,
                             struct task_struct *tsk)
{
    int cpu = smp_processor_id();

    if (likely(prev != next)) {
        /* stop flush ipis for the previous mm */
        cpu_clear(cpu, prev->cpu_vm_mask);
#ifdef CONFIG_SMP
        per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
        per_cpu(cpu_tlbstate, cpu).active_mm = next;
#endif
        cpu_set(cpu, next->cpu_vm_mask);

        /* Re-load page tables */
        load_cr3(next->pgd);

        /*
         * load the LDT, if the LDT is different:
         */
        if (unlikely(prev->context.ldt != next->context.ldt))
            load_LDT_nolock(&next->context);
    }
#ifdef CONFIG_SMP
    else {
        per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
        BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);

        if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
            /* We were in lazy tlb mode and leave_mm disabled
             * tlb flush IPI delivery. We must reload %cr3.
             */
            load_cr3(next->pgd);
            load_LDT_nolock(&next->context);
        }
    }
#endif
}
This part is fairly simple; what matters is knowing what it has to accomplish, since the actual implementation is architecture-specific. It boils down to loading the new page tables and flushing the TLB (on x86, reloading CR3 implicitly flushes the non-global TLB entries), plus a few smaller details such as reloading the LDT.
3. Switching the processor state
The main code for this part is:
    /* Here we just switch the register state and the stack. */
    switch_to(prev, next, prev);

    barrier();
    /*
     * this_rq must be evaluated again because prev may have moved
     * CPUs since it called schedule(), thus the 'rq' on its stack
     * frame will be invalid.
     */
    finish_task_switch(this_rq(), prev);

First note the structure of this code: the call to switch_to() is where the process switch actually happens. Once it runs, the register and stack environment belongs to the new process, which means the code placed after switch_to() will only execute the next time the current process is selected to run again.
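The fact that the code after the switch only runs once we are switched back is easier to see with a user-space analogy. The sketch below is not kernel code; it uses the standard <ucontext.h> API (getcontext/makecontext/swapcontext) to perform the same kind of save-my-registers-and-stack, load-yours handoff that switch_to does:

#include <stdio.h>
#include <ucontext.h>

static ucontext_t ctx_main, ctx_task;
static char task_stack[64 * 1024];

/* Runs on its own stack; everything after swapcontext() only executes
 * once main() switches back to us, just like the code after switch_to(). */
static void task(void)
{
    printf("task: first run\n");
    swapcontext(&ctx_task, &ctx_main);  /* "schedule out" back to main */
    printf("task: resumed\n");
}

int main(void)
{
    getcontext(&ctx_task);
    ctx_task.uc_stack.ss_sp = task_stack;
    ctx_task.uc_stack.ss_size = sizeof(task_stack);
    ctx_task.uc_link = &ctx_main;       /* where to go when task() returns */
    makecontext(&ctx_task, task, 0);

    swapcontext(&ctx_main, &ctx_task);  /* run task() until it switches back */
    printf("main: back from task\n");
    swapcontext(&ctx_main, &ctx_task);  /* resume task() after its swapcontext() */
    printf("main: task finished\n");
    return 0;
}

Running it prints "task: first run", "main: back from task", "task: resumed", "main: task finished": each side resumes exactly at the statement after its own swapcontext() call, just as prev resumes right after switch_to() when it is scheduled in again.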
barrier() is a compiler directive: a primitive that acts as an optimization barrier for memory accesses.
#define barrier() __asm__ __volatile__("": : :"memory")

This tells the compiler that every memory value it has cached in CPU registers before barrier() must be treated as invalid afterwards; past the barrier, no read or write may rely on memory contents assumed from before it. Here it also guarantees that the ordering of switch_to() and finish_task_switch() is not optimized, because the compiler knows it may not reorder memory accesses across this point.
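For a quick illustration (my own example, not taken from the kernel; publish, data and ready are made-up names), a compiler barrier stops the compiler from caching values in registers or moving loads and stores across it:

/* Same definition as shown above. */
#define barrier() __asm__ __volatile__("" : : : "memory")

static int data;
static int ready;

void publish(int value)
{
    data = value;
    barrier();   /* the compiler may not reorder the two stores or keep
                  * `data` cached in a register past this point */
    ready = 1;   /* note: this is only a compiler barrier, not a CPU memory
                  * barrier, so it constrains what the compiler emits, not
                  * what another CPU observes */
}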
switch_to has three formal parameters, yet only two distinct values are passed in (prev appears twice). Let's see why; it is defined as a macro:
#define switch_to(prev,next,last) do {                              \
    unsigned long esi,edi;                                          \
    asm volatile("pushfl\n\t"          /* Save flags */             \
                 "pushl %%ebp\n\t"                                  \
                 "movl %%esp,%0\n\t"   /* save ESP */               \
                 "movl %5,%%esp\n\t"   /* restore ESP */            \
                 "movl $1f,%1\n\t"     /* save EIP */               \
                 "pushl %6\n\t"        /* restore EIP */            \
                 "jmp __switch_to\n"                                \
                 "1:\t"                                             \
                 "popl %%ebp\n\t"                                   \
                 "popfl"                                            \
                 :"=m" (prev->thread.esp),"=m" (prev->thread.eip),  \
                  "=a" (last),"=S" (esi),"=D" (edi)                 \
                 :"m" (next->thread.esp),"m" (next->thread.eip),    \
                  "2" (prev), "d" (next));                          \
} while (0)

The third parameter exists because of what happens when a task is resumed. Suppose process A switches to B here: the instructions after label 1: will not run again until A is scheduled back in, possibly much later and possibly from some completely different process C. At that point the prev variable saved on A's kernel stack still refers to A itself, which tells it nothing useful; what A needs to know is which task ran right before it, so that task can be cleaned up in finish_task_switch(). The macro therefore feeds prev in through %eax ("2" (prev)) and writes %eax back out into the third argument ("=a" (last)); __switch_to() returns its prev_p argument in %eax, so when a task resumes at label 1: its local prev/last variable holds the task it just replaced.
Before analysing this code, take a look at the thread field inside task_struct:
struct task_struct {
    ......
    struct thread_struct thread;
    ......
};

struct thread_struct {
    /* cached TLS descriptors. */
    struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
    unsigned long    esp0;
    unsigned long    sysenter_cs;
    unsigned long    eip;
    unsigned long    esp;
    unsigned long    fs;
    unsigned long    gs;
    /* Hardware debugging registers */
    unsigned long    debugreg[8];  /* %%db0-7 debug registers */
    /* fault info */
    unsigned long    cr2, trap_no, error_code;
    /* floating point info */
    union i387_union i387;
    /* virtual 86 mode info */
    struct vm86_struct __user *vm86_info;
    unsigned long    screen_bitmap;
    unsigned long    v86flags, v86mask, saved_esp0;
    unsigned int     saved_fs, saved_gs;
    /* IO permissions */
    unsigned long    *io_bitmap_ptr;
    unsigned long    iopl;
    /* max allowed port in the bitmap, in bytes: */
    unsigned long    io_bitmap_max;
};

Of the fields in thread_struct, the two we care about below are eip, the instruction pointer (with segmentation enabled it holds the offset of the instruction to execute), and esp, the stack pointer, i.e. the top of the kernel-mode stack.
The jmp __switch_to in the macro lands in the following function, which does the remaining, C-level part of the switch:

struct task_struct fastcall * __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
    struct thread_struct *prev = &prev_p->thread,
                         *next = &next_p->thread;
    int cpu = smp_processor_id();
    struct tss_struct *tss = &per_cpu(init_tss, cpu);

    /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

    __unlazy_fpu(prev_p);

    /* we're going to use this soon, after a few expensive things */
    if (next_p->fpu_counter > 5)
        prefetch(&next->i387.fxsave);

    /*
     * Reload esp0.
     */
    load_esp0(tss, next);

    /*
     * Save away %gs. No need to save %fs, as it was saved on the
     * stack on entry. No need to save %es and %ds, as those are
     * always kernel segments while inside the kernel. Doing this
     * before setting the new TLS descriptors avoids the situation
     * where we temporarily have non-reloadable segments in %fs
     * and %gs. This could be an issue if the NMI handler ever
     * used %fs or %gs (it does not today), or if the kernel is
     * running inside of a hypervisor layer.
     */
    savesegment(gs, prev->gs);

    /*
     * Load the per-thread Thread-Local Storage descriptor.
     */
    load_TLS(next, cpu);

    /*
     * Restore IOPL if needed. In normal use, the flags restore
     * in the switch assembly will handle this. But if the kernel
     * is running virtualized at a non-zero CPL, the popf will
     * not restore flags, so it must be done in a separate step.
     */
    if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
        set_iopl_mask(next->iopl);

    /*
     * Now maybe handle debug registers and/or IO bitmaps
     */
    if (unlikely(task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV ||
                 task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT))
        __switch_to_xtra(prev_p, next_p, tss);

    /*
     * Leave lazy mode, flushing any hypercalls made here.
     * This must be done before restoring TLS segments so
     * the GDT and LDT are properly updated, and must be
     * done before math_state_restore, so the TS bit is up
     * to date.
     */
    arch_leave_lazy_cpu_mode();

    /* If the task has used fpu the last 5 timeslices, just do a full
     * restore of the math state immediately to avoid the trap; the
     * chances of needing FPU soon are obviously high now
     */
    if (next_p->fpu_counter > 5)
        math_state_restore();

    /*
     * Restore %gs if needed (which is common)
     */
    if (prev->gs | next->gs)
        loadsegment(gs, next->gs);

    x86_write_percpu(current_task, next_p);

    return prev_p;
}
This function completes the switch from prev_p to next_p; a few of the helpers it relies on are shown below.
__unlazy_fpu() saves the outgoing task's FPU state via __save_init_fpu() if the task has actually used the FPU; otherwise it just resets fpu_counter:

#define __unlazy_fpu( tsk ) do {                            \
    if (task_thread_info(tsk)->status & TS_USEDFPU) {       \
        __save_init_fpu(tsk);                               \
        stts();                                             \
    } else                                                  \
        tsk->fpu_counter = 0;                               \
} while (0)

static inline void __save_init_fpu( struct task_struct *tsk )
{
    /* Use more nops than strictly needed in case the compiler varies code */
    alternative_input(
        "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
        "fxsave %[fx]\n"
        "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
        X86_FEATURE_FXSR,
        [fx] "m" (tsk->thread.i387.fxsave),
        [fsw] "m" (tsk->thread.i387.fxsave.swd) : "memory");
    /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
       is pending.  Clear the x87 state here by setting it to fixed
       values. safe_address is a random variable that should be in L1 */
    alternative_input(
        GENERIC_NOP8 GENERIC_NOP2,
        "emms\n\t"          /* clear stack tags */
        "fildl %[addr]",    /* set F?P to defined value */
        X86_FEATURE_FXSAVE_LEAK,
        [addr] "m" (safe_address));
    task_thread_info(tsk)->status &= ~TS_USEDFPU;
}
load_TLS() copies the incoming task's thread-local-storage descriptors into this CPU's GDT:

#define load_TLS(t, cpu) native_load_tls(t, cpu)

static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
{
    unsigned int i;
    struct desc_struct *gdt = get_cpu_gdt_table(cpu);

    for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
        gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
}
savesegment() stores the current value of a segment register into a variable:

#define savesegment(seg, value) \
    asm volatile("mov %%" #seg ",%0":"=rm" (value))
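Expanded by hand for illustration (read_gs is my own helper, not a kernel function), savesegment(gs, value) is simply an asm statement that moves %gs into a C variable:

/* Illustration only: what savesegment(gs, value) boils down to on 32-bit x86. */
static inline unsigned short read_gs(void)
{
    unsigned short value;

    asm volatile("mov %%gs,%0" : "=rm" (value));
    return value;
}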
The hardware describes a task's context with a TSS (task-state segment). Note, however, that Linux keeps one TSS per CPU (the init_tss used in __switch_to() above) rather than one per process; __switch_to() only refreshes the fields the hardware really needs, such as esp0 via load_esp0(). The TSS looks like this:
/* This is the TSS defined by the hardware. */
struct i386_hw_tss {
    unsigned short   back_link,__blh;
    unsigned long    esp0;
    unsigned short   ss0,__ss0h;
    unsigned long    esp1;
    unsigned short   ss1,__ss1h;   /* ss1 is used to cache MSR_IA32_SYSENTER_CS */
    unsigned long    esp2;
    unsigned short   ss2,__ss2h;
    unsigned long    __cr3;
    unsigned long    eip;
    unsigned long    eflags;
    unsigned long    eax,ecx,edx,ebx;
    unsigned long    esp;
    unsigned long    ebp;
    unsigned long    esi;
    unsigned long    edi;
    unsigned short   es, __esh;
    unsigned short   cs, __csh;
    unsigned short   ss, __ssh;
    unsigned short   ds, __dsh;
    unsigned short   fs, __fsh;
    unsigned short   gs, __gsh;
    unsigned short   ldt, __ldth;
    unsigned short   trace, io_bitmap_base;
} __attribute__((packed));

struct tss_struct {
    struct i386_hw_tss x86_tss;

    /*
     * The extra 1 is there because the CPU will access an
     * additional byte beyond the end of the IO permission
     * bitmap. The extra byte must be all 1 bits, and must
     * be within the limit.
     */
    unsigned long    io_bitmap[IO_BITMAP_LONGS + 1];
    /*
     * Cache the current maximum and the last task that used the bitmap:
     */
    unsigned long io_bitmap_max;
    struct thread_struct *io_bitmap_owner;
    /*
     * pads the TSS to be cacheline-aligned (size is 0x100)
     */
    unsigned long __cacheline_filler[35];
    /*
     * .. and then another 0x100 bytes for emergency kernel stack
     */
    unsigned long stack[64];
} __attribute__((packed));