linux进程调度、进程切换原理详解

版权声明:

转载时请以超链接形式标明文章原始出处http://wanderer-zjhit.blogbus.com/logs/156738683.html和作者信息及本声明

思考:
      在cfs就绪队列中,进程是否切换不是按照时间片到期来考虑,而是根据当前就绪进程数给出一个延迟周期,再根据当前进程的权重算出它在该周期内应得的运行时间,从而确定当前进程是否已经执行了过长时间。
      抢占问题:调用try_to_wake_up或wake_up_new_task唤醒新进程执行时,内核会根据新进程的优先级确定是否需要抢占当前执行进程
      新进程插入问题:cfs类新建进程的插入对应的函数为:task_fork_fair函数,/proc/sys/kernel/sched_child_runs_first参数可以确定是否让子进程在父进程前运行,如果是,则使子进程的vruntime变为父子中最小的vruntime,然后激活延迟调度
     SCHED_FIFO进程可以运行任意长时间,而且必须使用yield系统调用(sched_yield)显式地将控制权传给另一个进程,使用 
     sched_setscheduler(struct task_struct * p, int policy, struct sched_param * param)可以修改进程调度类
1 目前有两类调度器--调用schedule()的两种形式
    周期性调度器:在低分辨率定时器的每次时钟中断完成全局统计量更新后,每个cpu在软中断中执行以下操作:更新该cpu上当前进程内核态、用户态使用时间;调用该cpu上的定时器函数;启动周期性定时器(scheduler_tick)完成该cpu上任务的周期性调度工作;在支持动态定时器的系统中,可以关闭该调度器,从而进入深度睡眠过程;scheduler_tick查看当前进程是否运行太长时间,如果是,将进程的TIF_NEED_RESCHED置位,然后在中断返回时,调用schedule,延迟调度,进行进程切换操作
    直接调度器:当前进程主动放弃cpu时,修改进程状态,在当前进程中直接调用schedule函数,从而完成系统切换。-----直接调用schedule(),如等待资源时,主动让渡。
2 scheduler_tick 函数执行
  在cpu执行完全局统计后,每个cpu执行scheduler_tick,该函数实质上对rq->clock等进行更新后,主要对当前进程进行切换相关的考虑工作:----- 针对进程类型有以下两种调度策略:
如果当前进程是完全公平队列中的进程:则首先根据当前就绪队列中的进程数算出一个延迟时间间隔,大概每个进程分配2ms时间,然后按照该进程的权重在队列总权重中所占的比例,算出它该执行的时间X,如果该进程执行物理时间超过了X,则激发延迟调度;如果没有超过X,但是红黑树就绪队列中下一个进程优先级更高,即curr->vruntime-leftmost->vruntime > X,也将延迟调度
如果当前进程是实时调度类中的进程:则如果该进程是SCHED_RR,则递减时间片[为HZ/10],时间片到期后重新填充时间片,将进程插入到队列尾部,并激发延迟调度;如果是SCHED_FIFO,则周期性调度器什么也不做,该进程一直执行,直到它执行完成或主动放弃cpu
进程调度的真正调度和切换工作是在schedule中实现,会按照调度类顺序和优先级挑选出一个最高优先级的进程执行


从scheduler_tick()开始的runqueue执行时间计数和调度策略具体操作如下.......

/*
 * This function gets called by the timer code, with HZ frequency.
 * We call it with interrupts disabled.
 *
 * It also gets called by the fork code, when changing the parent's
 * timeslices.
 */
void scheduler_tick(void)
{
 int cpu = smp_processor_id();
 struct rq *rq = cpu_rq(cpu);  获得该cpu上的就绪队列
 struct task_struct *curr = rq->curr; 获得当前执行的进程

 sched_clock_tick();更新每cpu上的sched_clock_data

 raw_spin_lock(&rq->lock);
 update_rq_clock(rq); 更新rq->clock时钟,一招sched_clock_data值
 update_cpu_load(rq);
 curr->sched_class->task_tick(rq, curr, 0);调用该进程所在类特定的调度策略
 raw_spin_unlock(&rq->lock);

 perf_event_task_tick(curr);

#ifdef CONFIG_SMP
 rq->idle_at_tick = idle_cpu(cpu);
 trigger_load_balance(rq, cpu);
#endif
}

两种调度策略介绍
2.3.1 完全公平进程如何判断是否要重新调度
/*
 * scheduler tick hitting a task of our scheduling class:
 */
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued=0)
{
 struct cfs_rq *cfs_rq;
 struct sched_entity *se = &curr->se;

 for_each_sched_entity(se) { //组调度使用
    cfs_rq = cfs_rq_of(se);
    entity_tick(cfs_rq, se, queued);
 }
}
cfs就绪队列中看是否需要调度该进程:
static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued=0)
{
 /*
  * Update run-time statistics of the 'current'.
  */
 update_curr(cfs_rq);

更新当前进程的sum_exec_runtime、虚拟运行时钟vruntime、统计计数、exec_start
更新整个cfs就绪队列的虚拟时钟min_vruntime=max_vruntime(min_runtime,
min_runtime(curr->vruntime,就绪队列中最左节点的vruntime))

#ifdef CONFIG_SCHED_HRTICK
 /*
  * queued ticks are scheduled to match the slice, so don't bother
  * validating it and just reschedule.
  */
 if (queued) {
  resched_task(rq_of(cfs_rq)->curr);
  return;
 }
 /*
  * don't let the period tick interfere with the hrtick preemption
  */
 if (!sched_feat(DOUBLE_TICK) &&
   hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
  return;
#endif

 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
  check_preempt_tick(cfs_rq, curr);此时,该进程已经执行时间得到更新,看是否需要切换到其他进程中
}
cfs调度主函数:
/*
 * Preempt the current task with a newly woken task if needed:
 */
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
 unsigned long ideal_runtime, delta_exec;

 ideal_runtime = sched_slice(cfs_rq, curr); //根据cfs就绪队列中进程数确定一个最长时间间隔,然后看在该时间间隔内当前进程按照权重比例应该执行的物理时间间隔
 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;//进程子上次调度时已经执行了这么长时间
 if (delta_exec > ideal_runtime) {   如果执行时间过长,则切换
  resched_task(rq_of(cfs_rq)->curr); 将tsk->thread_info->flag |= TIF_NEED_RESCHED置位
  /*
   * The current task ran long enough, ensure it doesn't get
   * re-elected due to buddy favours.
   */
  clear_buddies(cfs_rq, curr);
  return;
 }

 /*
  * Ensure that a task that missed wakeup preemption by a
  * narrow margin doesn't have to wait for a full slice.
  * This also mitigates buddy induced latencies under load.
  */
 if (!sched_feat(WAKEUP_PREEMPT))
  return;

 if (delta_exec < sysctl_sched_min_granularity) 进程执行时间过短,退出
  return;

 if (cfs_rq->nr_running > 1) {  虽然该进程当前应该执行的物理时间长度没有到期,但是该进程比下个即将执行的进程没有优势,也将切换
  struct sched_entity *se = __pick_next_entity(cfs_rq);
  s64 delta = curr->vruntime - se->vruntime;

  if (delta > ideal_runtime)
   resched_task(rq_of(cfs_rq)->curr);
 }
}
2.3.2 实时进程如何判断是否要重新调度执行
/*
 * Tick handler of the real-time class.
 *
 * Only SCHED_RR tasks are handled past the policy check: each tick
 * decrements the time slice, and when it reaches zero the slice is
 * refilled and, if other entries are queued alongside, the task is
 * requeued and marked for rescheduling.
 *
 * @rq:     runqueue the tick fired on
 * @p:      the running real-time task
 * @queued: not used by this function
 */
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
 /*
  * Article note: accounts the elapsed runtime, i.e.
  * sum_exec_runtime += rq->clock - curr->se.exec_start
  */
 update_curr_rt(rq);

 watchdog(rq, p);

 /*
  * RR tasks need a special form of timeslice management.
  * FIFO tasks have no timeslices.
  */
 if (p->policy != SCHED_RR)   /* e.g. a FIFO task: nothing to do */
  return;

 if (--p->rt.time_slice)      /* RR task whose slice is not used up yet */
  return;

 /* RR slice used up: refill it (article note: DEF_TIMESLICE is 100*HZ/1000) */
 p->rt.time_slice = DEF_TIMESLICE;

 /*
  * Requeue to the end of queue if we are not the only element
  * on the queue:
  */
 if (p->rt.run_list.prev != p->rt.run_list.next) {
  requeue_task_rt(rq, p, 0);   /* article note: moves the task to the queue tail */
  set_tsk_need_resched(p);     /* request a reschedule */
 }
}


3 执行过程
   schedule函数首先进行切换前的rq和current数据更新操作;然后在rq就绪队列中选择将要执行的进程;然后调用context_switch进行虚拟空间、内核栈、cpu寄存器和内核空间的切换;恢复时,进行进程的恢复操作,这些操作都是局限于特定的调度类的
/*
 * schedule() is the main scheduler function.
 *
 * Sequence visible below: take this CPU's runqueue lock, clear the
 * resched flag of the outgoing task, dequeue it if it is going to
 * sleep, pick the next task, and context-switch to it when it differs
 * from the outgoing one. The function loops while need_resched() holds.
 */
asmlinkage void __sched schedule(void)
{
 struct task_struct *prev, *next;
 unsigned long *switch_count;
 struct rq *rq;
 int cpu;

need_resched:
 preempt_disable();
 cpu = smp_processor_id();
 rq = cpu_rq(cpu);
 rcu_note_context_switch(cpu);
 prev = rq->curr;
 switch_count = &prev->nivcsw;

 release_kernel_lock(prev);
need_resched_nonpreemptible:

 schedule_debug(prev);

 if (sched_feat(HRTICK))
  hrtick_clear(rq);

 raw_spin_lock_irq(&rq->lock);
 clear_tsk_need_resched(prev);   /* clear TIF_NEED_RESCHED */

 /*
  * prev->state is non-zero (i.e. prev is no longer TASK_RUNNING) and
  * PREEMPT_ACTIVE is not set in preempt_count().
  */
 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
  /* a signal is pending for prev: make it runnable again */
  if (unlikely(signal_pending_state(prev->state, prev)))
   prev->state = TASK_RUNNING;
  else
   deactivate_task(rq, prev, DEQUEUE_SLEEP);   /* remove prev from the runqueue */
  switch_count = &prev->nvcsw;   /* count the switch in nvcsw instead of nivcsw */
 }

 /* pre-switch hook; article note: calls prev->sched_class->pre_schedule() */
 pre_schedule(rq, prev);

 /* no runnable task on this CPU: article note: pull tasks over from other CPUs */
 if (unlikely(!rq->nr_running))
  idle_balance(cpu, rq);

 put_prev_task(rq, prev);
 next = pick_next_task(rq);   /* choose the task to run next */

 if (likely(prev != next)) {
  sched_info_switch(prev, next);
  perf_event_task_sched_out(prev, next);

  rq->nr_switches++;
  rq->curr = next;
  ++*switch_count;

  context_switch(rq, prev, next); /* unlocks the rq */
  /*
   * the context switch might have flipped the stack from under
   * us, hence refresh the local variables.
   *
   * When this task is scheduled in again it may be running on a
   * different CPU.
   */
  cpu = smp_processor_id();
  rq = cpu_rq(cpu);
 } else
  raw_spin_unlock_irq(&rq->lock);

 post_schedule(rq);   /* post-switch work once the task runs again */

 if (unlikely(reacquire_kernel_lock(current) < 0)) {
  prev = rq->curr;
  switch_count = &prev->nivcsw;
  goto need_resched_nonpreemptible;
 }

 preempt_enable_no_resched();
 if (need_resched())
  goto need_resched;
}
EXPORT_SYMBOL(schedule);
2.2 挑选下一个进程,就绪队列中不同的调度策略对应不同的选择方式
static inline struct task_struct *pick_next_task(struct rq *rq)
{
 const struct sched_class *class;
 struct task_struct *p;

 /*
  * Optimization: we know that if all tasks are in
  * the fair class we can call that function directly:
  */
 if (likely(rq->nr_running == rq->cfs.nr_running)) { 简单情况,rt_sched_class就绪队列中没有就绪进程
  p = fair_sched_class.pick_next_task(rq);
  if (likely(p))
   return p;
 }

 class = sched_class_highest; 遍历3种不同的调度队列对应的就绪队列
 for ( ; ; ) {
  p = class->pick_next_task(rq);
  if (p)
   return p;
  /*
   * Will never be NULL as the idle class always
   * returns a non-NULL p:
   */
  class = class->next;
 }
}
2.2.1 在完全公平调度类中从红黑树就绪队列中选择下一个即将执行的进程
 static struct task_struct *pick_next_task_fair(struct rq *rq)
{
 struct task_struct *p;
 struct cfs_rq *cfs_rq = &rq->cfs; 选中就绪队列中的完全公平就绪队列
 struct sched_entity *se;

 if (!cfs_rq->nr_running)
  return NULL;

 do {
  se = pick_next_entity(cfs_rq);选择下一个调度实体
  set_next_entity(cfs_rq, se);对该调度实体进行修改,将进程从调度队列中删除,修改其start_exec等值
  cfs_rq = group_cfs_rq(se);
 } while (cfs_rq);  如果是按照组调度,则每次调度该组的所有进程

 p = task_of(se);
 hrtick_start_fair(rq, p); 
 return p;
}
从cfs红黑树中获取一个调度进程
/*
 * Choose the entity to run from @cfs_rq: start from the entity returned
 * by __pick_next_entity(), but prefer the 'next' and then the 'last'
 * buddy whenever wakeup_preempt_entity(buddy, leftmost) is below 1.
 */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
 /* article note: this is the entity at cfs_rq->rb_leftmost */
 struct sched_entity *se = __pick_next_entity(cfs_rq);
 struct sched_entity *left = se;

 /* article note: checks whether the leftmost entity may preempt cfs_rq->next */
 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
  se = cfs_rq->next;

 /*
  * Prefer last buddy, try to return the CPU to a preempted task.
  */
 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
  se = cfs_rq->last;

 clear_buddies(cfs_rq, se);

 return se;
}
对该进程进行修改:
static void set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
 /* 'current' is not kept within the tree. */
 if (se->on_rq) {
  /*
   * Any task has to be enqueued before it get to execute on
   * a CPU. So account for the time it spent waiting on the
   * runqueue.
   */
  update_stats_wait_end(cfs_rq, se);更新se->statistics值
  __dequeue_entity(cfs_rq, se);将该调度实体从红黑树中删除
 }

 update_stats_curr_start(cfs_rq, se); se->exec_start = rq(cfs_rq)->clock,进程调度时间
 cfs_rq->curr = se;
#ifdef CONFIG_SCHEDSTATS
 /*
  * Track our maximum slice length, if the CPU's load is at
  * least twice that of our own weight (i.e. dont track it
  * when there are only lesser-weight tasks around):
  */
 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
  se->statistics.slice_max = max(se->statistics.slice_max,
   se->sum_exec_runtime - se->prev_sum_exec_runtime);
 }
#endif
 se->prev_sum_exec_runtime = se->sum_exec_runtime; 保存该cpu总的执行时间
}
2.2.2 在实时调度类中从数组链表就绪队列中选择下一个即将执行的进程
static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
 struct sched_rt_entity *rt_se;
 struct task_struct *p;
 struct rt_rq *rt_rq;

 rt_rq = &rq->rt; 获得实时调度类就绪队列

 if (unlikely(!rt_rq->rt_nr_running))
  return NULL;

 if (rt_rq_throttled(rt_rq))
  return NULL;

 do {
  rt_se = pick_next_rt_entity(rq, rt_rq);
  BUG_ON(!rt_se);
  rt_rq = group_rt_rq(rt_se);
 } while (rt_rq);

 p = rt_task_of(rt_se);
 p->se.exec_start = rq->clock;设置该进程开始执行时间

 return p;
}
在rt就绪队列中选择下一个执行进程
static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
         struct rt_rq *rt_rq)
{
 struct rt_prio_array *array = &rt_rq->active;
 struct sched_rt_entity *next = NULL;
 struct list_head *queue;
 int idx;

 idx = sched_find_first_bit(array->bitmap); 在该位图中找第一个不为0为的位,值越小,优先级越高
由于共100个数组项,所以需要4个unsigned long。然后看第一个unsigned long值是否为0,设第二个unsigned long 的第2个单位为1,则 return 32 + __off(b[1]) = 32+2=32
算法非常简单:
num=0,且32位里多于一个1
if(word && 0xffff ==0)
    {num + =16;word >>=16}  低16位为0,则数字高16位移到低16位,保证16位里有1
if ( word&&0xff == 0)
  {num+=8;word>>=8}
if (word &&0xf ==0)
  {num+=4;word>>=4}
if (word && ox3 ==0)
  {num+=2;word>=2}
if (word && ox1==0)
 {num+=1}  return Num
 BUG_ON(idx >= MAX_RT_PRIO);

 queue = array->queue + idx; 找着该就绪队列
 next = list_entry(queue->next, struct sched_rt_entity, run_list);从该据需队列中选择第一个进程

 return next;
}
注:两种选择方法都很简单,但是从rt就绪队列中选择完进程后,该进程不从就绪队列中删除,但是在cfs就绪队列中,选择进程会从红黑树中脱链
2.3 进程上下文切换 context_switch执行概览
进程四要素:执行的程序、内核栈、身份结构、虚存区间
A 切换操作首先将进程虚存进行切换,主要是cpu中cr3寄存器的修改,表示之后使用的虚存是新进程虚存
B 然后是进程cpu寄存器保存在内核栈中,并且内核栈切换到新进程内核栈[和task_struct连在一起]。
C 并从新进程内核栈中加载新进程cpu寄存器,执行新进程操作,包括返回到用户态等等
D 执行新进程时,将调用进程的信息回收

/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */
static inline void context_switch(struct rq *rq, struct task_struct *prev,struct task_struct *next)
{
 struct mm_struct *mm, *oldmm;

 prepare_task_switch(rq, prev, next);会调用每个体系结构相关的函数,为切换做好准备
 trace_sched_switch(rq, prev, next);
 mm = next->mm; 即将执行进程的虚存结构
 oldmm = prev->active_mm;旧进程的虚存结构,不管就进程是否为内核线程,该值都有效
 /*
  * For paravirt, this is coupled with an exit in switch_to to
  * combine the page table reload and the switch backend into
  * one hypercall.
  */
 arch_start_context_switch(prev);
注:内核线程的current->mm=NULL,使用active_mm来暂借进程的虚存地址空间

 if (unlikely(!mm)) { 如果调度的是内核线程
  next->active_mm = oldmm;内核线程没有虚存,为了完整性,借用上一进程的虚存
  atomic_inc(&oldmm->mm_count);增加虚存使用计数
  enter_lazy_tlb(oldmm, next);不需要切换虚拟地址空间的用户空间部分,称为惰性TLB
 } else
  switch_mm(oldmm, mm, next);进程虚存地址空间切换

 if (unlikely(!prev->mm)) { 前一进程也是内核线程,且该线程被切换出去,断开时为何不递减active_mm计数
  prev->active_mm = NULL;断开其与借用的地址空间的关系
  rq->prev_mm = oldmm;//就绪队列中保存的是实际存在的上一个进程的虚存地址
 }
 /*
  * Since the runqueue lock will be released by the next
  * task (which is an invalid locking op but in the case
  * of the scheduler it's an obvious special-case), so we
  * do an early lockdep release here:
  */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

 /* Here we just switch the register state and the stack. */
 switch_to(prev, next, prev);切换到next进程中

 barrier();内存屏障,保证前面语句总是在后面语句前执行完毕
 /*
  * this_rq must be evaluated again because prev may have moved
  * CPUs since it called schedule(), thus the 'rq' on its stack
  * frame will be invalid.
  */
 finish_task_switch(this_rq(), prev);
}
2.3.1 虚存切换,不同体系结构定义不同 
在arch/x86/include/asm/mmu_context.h中定义如下:
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
        struct task_struct *tsk)
{
 unsigned cpu = smp_processor_id();

 if (likely(prev != next)) { 不是同一个线程组
  /* stop flush ipis for the previous mm */
  cpumask_clear_cpu(cpu, mm_cpumask(prev));//清除prev的cpu_vm_mask,表示prev放弃使用cpu
#ifdef CONFIG_SMP
  percpu_write(cpu_tlbstate.state, TLBSTATE_OK); 刷新cpu地址转换后备缓冲器TLB
  percpu_write(cpu_tlbstate.active_mm, next);
#endif
  cpumask_set_cpu(cpu, mm_cpumask(next)); 设置当前进程的mm->cpu_vm_mask表示其占用cpu

  /* Re-load page tables */
  load_cr3(next->pgd);将新进程的pgd页目录表填写到cpu的cr3寄存器中

  /*
   * load the LDT, if the LDT is different:
   */
  if (unlikely(prev->context.ldt != next->context.ldt))
   load_LDT_nolock(&next->context);提供LDT值
 }
#ifdef CONFIG_SMP
 else {
  percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
  BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);

  if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) {
   /* We were in lazy tlb mode and leave_mm disabled
    * tlb flush IPI delivery. We must reload CR3
    * to make sure to use no freed page tables.
    */
   load_cr3(next->pgd);
   load_LDT_nolock(&next->context);
  }
 }
#endif
}
2.3.2 进程切换,不同体系结构定义不同 
进程切换主要涉及cpu寄存器保存和内核栈的切换
注:
context_switch:
 -->switch_mm已经对进程的虚拟内存进行切换,此后用户态已经变为新进程的用户态了,然后
 -->switch_to中先将flags寄存器和ebp压入旧进程内核栈,并确定旧进程恢复执行的下一条指令地址,再将旧进程的ip、esp保存到task_struct->thread(thread_struct结构)中,这样旧进程保存完毕;然后用新进程的thread.sp恢复新进程的内核堆栈,用thread.ip恢复新进程的执行地址。
关键点:内核寄存器[eflags、ebp保存到内核栈;内核栈esp地址、ip地址保存到task_struct->thread中;task_struct在进程生命期中始终存在,所以肯定能根据该结构恢复出其所有执行场景来]
在/arch/x86/include/asm/system.h文件中,该定义如下:
/*
 * Saving eflags is important. It switches not only IOPL between tasks,
 * it also protects other tasks from NT leaking through sysenter etc.
 *
 * switch_to(prev, next, last): 32-bit x86 register/stack switch.
 * Saves prev's flags and EBP on its kernel stack and its ESP/EIP in
 * prev->thread, loads next's ESP, pushes next's saved EIP and jumps to
 * __switch_to(). 'last' receives the value left in EAX.
 */
#define switch_to(prev, next, last)     \
do {         \
 /*        \
  * Context-switching clobbers all registers, so we clobber \
  * them explicitly, via unused output variables.  \
  * (EAX and EBP is not listed because EBP is saved/restored \
  * explicitly for wchan access and EAX is the return value of \
  * __switch_to())      \
  */        \
 unsigned long ebx, ecx, edx, esi, edi;    \
         \
 asm volatile("pushfl\n\t"   /* save    flags on prev's kernel stack */ \
       "pushl %%ebp\n\t"   /* save    EBP   on prev's kernel stack */ \
       "movl %%esp,%[prev_sp]\n\t" /* save    ESP   into prev->thread.sp */ \
       "movl %[next_sp],%%esp\n\t" /* restore ESP   from next->thread.sp; the kernel stack is now next's */ \
       "movl $1f,%[prev_ip]\n\t" /* save    EIP   label 1 is where prev resumes later */ \
       "pushl %[next_ip]\n\t" /* restore EIP   push next->thread.ip onto next's kernel stack */ \
       __switch_canary     \
       "jmp __switch_to\n" /* regparm call  */ \
       "1:\t"      \
       "popl %%ebp\n\t"  /* restore EBP   when this task runs again */ \
       "popfl\n"   /* restore flags */ \
         \
       /* output parameters */    \
       : [prev_sp] "=m" (prev->thread.sp),  \
         [prev_ip] "=m" (prev->thread.ip),  \
         "=a" (last),     \
         \
         /* clobbered output registers: */  \
         "=b" (ebx), "=c" (ecx), "=d" (edx),  \
         "=S" (esi), "=D" (edi)    \
                \
         __switch_canary_oparam    \
         \
         /* input parameters: */    \
       : [next_sp]  "m" (next->thread.sp),  \
         [next_ip]  "m" (next->thread.ip),  \
                \
         /* regparm parameters for __switch_to(): */ \
         [prev]     "a" (prev),    \
         [next]     "d" (next)    \
         \
         __switch_canary_iparam    \
         \
       : /* reloaded segment registers */   \
   "memory");     \
} while (0)
注:下一个进程是如何执行的,实际上当进程切换出去时,它的esp和恢复执行的ip已经保存在task_struct->thread中,再次被调度时switch_to从中恢复即可。
thread_struct结构在各个体系结构中不同,arch/x86/include/asm下定义为:
struct thread_struct {
 /* Cached TLS descriptors: */
 struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
 unsigned long  sp0;
 unsigned long  sp; 当前进程的内核栈当前位置,ebp显然在线程顶端2个页面对其处
#ifdef CONFIG_X86_32
 unsigned long  sysenter_cs;
#else
 unsigned long  usersp; /* Copy from PDA */
 unsigned short  es;
 unsigned short  ds;
 unsigned short  fsindex;
 unsigned short  gsindex;
#endif
#ifdef CONFIG_X86_32
 unsigned long  ip;  当前执行指令ip地址
#endif
#ifdef CONFIG_X86_64
 unsigned long  fs;
#endif
 unsigned long  gs;
 /* Save middle states of ptrace breakpoints */
 struct perf_event *ptrace_bps[HBP_NUM];
 /* Debug status used for traps, single steps, etc... */
 unsigned long           debugreg6;
 /* Keep track of the exact dr7 value set by the user */
 unsigned long           ptrace_dr7;
 /* Fault info: */
 unsigned long  cr2;
 unsigned long  trap_no;
 unsigned long  error_code;
 /* floating point and extended processor state */
 struct fpu  fpu;
#ifdef CONFIG_X86_32
 /* Virtual 86 mode info */
 struct vm86_struct __user *vm86_info;
 unsigned long  screen_bitmap;
 unsigned long  v86flags;
 unsigned long  v86mask;
 unsigned long  saved_sp0;
 unsigned int  saved_fs;
 unsigned int  saved_gs;
#endif
 /* IO permissions: */
 unsigned long  *io_bitmap_ptr;
 unsigned long  iopl;
 /* Max allowed port in the bitmap, in bytes: */
 unsigned  io_bitmap_max;
};

2.3.3 当进程重新执行时,对切换到它的前一个进程进行事后清理工作
注:A进程切换到B,B然后切换到C,C最后切换到A,此时A执行,但是返回的prev指向C,在A调度时候需要把调用A的进程的信息清除掉
由于从C切换到A时候,A内核栈中保存的实际上是A切换出时的状态信息,即prev=A,next=B,但是在A执行时,其位于context_switch上下文中,该函数的参数prev应该是切换到A的进程C,A负责对C进程信息进行切换后处理,比如,如果切换到A后,A发现C进程已经处于TASK_DEAD状态,则将释放C进程的TASK_STRUCT结构。
/**
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 __releases(rq->lock)
{
 /* mm parked by context_switch() in rq->prev_mm, if any */
 struct mm_struct *mm = rq->prev_mm;
 long prev_state;

 rq->prev_mm = NULL;

 /*
  * A task struct has one reference for the use as "current".
  * If a task dies, then it sets TASK_DEAD in tsk->state and calls
  * schedule one last time. The schedule call will never return, and
  * the scheduled task must drop that reference.
  * The test for TASK_DEAD must occur while the runqueue locks are
  * still held, otherwise prev could be scheduled on another cpu, die
  * there before we look at prev->state, and then the reference would
  * be dropped twice.
  *  Manfred Spraul <manfred@colorfullife.com>
  */
 prev_state = prev->state;
 finish_arch_switch(prev);
 perf_event_task_sched_in(current, cpu_of(rq));
 finish_lock_switch(rq, prev);

 fire_sched_in_preempt_notifiers(current);
 if (mm)
  mmdrop(mm);
 /* the previous task has terminated: release its task_struct reference */
 if (unlikely(prev_state == TASK_DEAD)) {
  /*
   * Remove function-return probe instances associated with this
   * task and put them back on the free list.
   */
  kprobe_flush_task(prev);
  /* article note: this eventually ends in kfree() of the task_struct */
  put_task_struct(prev);
 }
}

你可能感兴趣的:(linux进程调度、进程切换原理详解)