Linux scheduling: schedule()
schedule() removes the current process from its runqueue, updates its scheduling entity and related per-process bookkeeping, and re-inserts it into the runqueue: CFS inserts according to virtual runtime, while real-time scheduling inserts at the tail of the corresponding priority queue. It then selects a runnable process from the runqueue and performs a process context switch.

The job of the schedule function is to replace the currently executing process with another one.
asmlinkage void __sched schedule(void)
{
	struct task_struct *tsk = current;

	/*
	 * If we are going to sleep and we have plugged IO queued,
	 * make sure to submit it to avoid deadlocks.
	 */
	sched_submit_work(tsk);
	__schedule();
}
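schedule() delegates to __schedule(), whose pieces are examined fragment by fragment below. As orientation, here is a condensed sketch of __schedule()'s overall flow; this is a sketch only, with locking details, debug checks and statistics elided, not the verbatim source:

/*
 * Condensed sketch of __schedule() (locking, debug checks and
 * statistics elided; see the fragments below for the real code).
 */
static void __sched __schedule(void)
{
	struct task_struct *prev, *next;
	struct rq *rq;
	int cpu;

need_resched:
	preempt_disable();
	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	prev = rq->curr;

	raw_spin_lock_irq(&rq->lock);

	/* 1. dequeue prev if it is no longer runnable (see below)    */
	/* 2. pre_schedule(); idle_balance() if the rq ran empty      */
	/* 3. put_prev_task(rq, prev); next = pick_next_task(rq);     */
	/* 4. if (prev != next) context_switch(rq, prev, next);       */
	/*    otherwise raw_spin_unlock_irq(&rq->lock);               */
	/* 5. post_schedule(rq);                                      */

	sched_preempt_enable_no_resched();
	if (need_resched())	/* a reschedule was requested meanwhile */
		goto need_resched;
}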
Check prev->state: if the process is no longer runnable and was not preempted while in kernel mode, remove it from the runqueue; however, if it is TASK_INTERRUPTIBLE and has a pending (non-blocked) signal, set it back to TASK_RUNNING and leave it on the runqueue.
	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
		if (unlikely(signal_pending_state(prev->state, prev))) {
			prev->state = TASK_RUNNING;
		} else {
			deactivate_task(rq, prev, DEQUEUE_SLEEP);
			prev->on_rq = 0;

			/*
			 * If a worker went to sleep, notify and ask workqueue
			 * whether it wants to wake up a task to maintain
			 * concurrency.
			 */
			if (prev->flags & PF_WQ_WORKER) {
				struct task_struct *to_wakeup;

				to_wakeup = wq_worker_sleeping(prev, cpu);
				if (to_wakeup)
					try_to_wake_up_local(to_wakeup);
			}
		}
		switch_count = &prev->nvcsw;
	}
	/* currently only useful for real-time scheduling classes */
	pre_schedule(rq, prev);

	if (unlikely(!rq->nr_running))
		idle_balance(cpu, rq);

	/*
	 * Put the previously running process (the one being switched
	 * out) back into its runqueue's bookkeeping: for CFS it is
	 * re-inserted at the appropriate position, for real-time
	 * scheduling at the tail of the list for its priority.
	 */
	put_prev_task(rq, prev);

	/* pick the next process to run from the per-class runqueues */
	next = pick_next_task(rq);

	/* clear prev's TIF_NEED_RESCHED flag */
	clear_tsk_need_resched(prev);
	rq->skip_clock_update = 0;

	if (likely(prev != next)) {	/* prev and next differ: a real switch */
		rq->nr_switches++;	/* per-runqueue switch count */
		rq->curr = next;
		++*switch_count;	/* per-process switch count */

		context_switch(rq, prev, next); /* unlocks the rq */
		/*
		 * The context switch have flipped the stack from under us
		 * and restored the local variables which were saved when
		 * this task called schedule() in the past. prev == current
		 * is still correct, but it can be moved to another cpu/rq.
		 */
		cpu = smp_processor_id();
		rq = cpu_rq(cpu);
	} else
		raw_spin_unlock_irq(&rq->lock);

	/* used by real-time scheduling classes */
	post_schedule(rq);

	/* re-enable preemption without triggering a reschedule */
	sched_preempt_enable_no_resched();
	if (need_resched())
		goto need_resched;
}
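The prev->state check at the top of __schedule() is what makes the canonical kernel sleep idiom work. A minimal sketch of that idiom follows; `condition` is a placeholder for whatever event the caller is waiting on, not a real kernel symbol:

	set_current_state(TASK_INTERRUPTIBLE);	/* mark ourselves not runnable */
	while (!condition) {
		schedule();		/* __schedule() sees the non-running state and
					   dequeues us -- unless a signal is pending,
					   in which case we are set back to
					   TASK_RUNNING and return promptly */
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);	/* event arrived; runnable again */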
The cpu_rq() and related macros:

/*
 * The rq is obtained by adding a per-cpu offset: runqueues is a
 * per-cpu array of struct rq, indexed by cpu.
 */
#define cpu_rq(cpu)	(&per_cpu(runqueues, (cpu)))
#define this_rq()	(&__get_cpu_var(runqueues))
#define task_rq(p)	cpu_rq(task_cpu(p))
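To make the indexing concrete, here is a plain user-space model of the same idea; this is hypothetical illustration code, not kernel source. The real kernel uses the per-cpu infrastructure rather than a plain array, but the access pattern is the same: one struct rq per CPU, with cpu_rq() resolving to the entry for a given CPU.

#include <stdio.h>

#define NR_CPUS 4

struct rq {
	unsigned long nr_running;
	unsigned long nr_switches;
};

/* models the per-cpu runqueues: one rq per CPU, indexed by CPU number */
static struct rq runqueues[NR_CPUS];

#define cpu_rq(cpu)	(&runqueues[(cpu)])

int main(void)
{
	struct rq *rq = cpu_rq(2);	/* runqueue of CPU 2 */

	rq->nr_running = 1;
	printf("cpu2 nr_running=%lu\n", cpu_rq(2)->nr_running);
	return 0;
}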
Removing a task from its runqueue: deactivate_task()

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);		/* update the runqueue clock */
	sched_info_dequeued(p);		/* update the task's sched_info statistics */
	p->sched_class->dequeue_task(rq, p, flags);	/* let the task's scheduling
							   class remove it from its
							   own runqueue */
}
/*
 * Pick up the highest-prio task:
 */
static inline struct task_struct *
pick_next_task(struct rq *rq)
{
	const struct sched_class *class;
	struct task_struct *p;

	/*
	 * Optimization: we know that if all tasks are in
	 * the fair class we can call that function directly:
	 */
	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
		/* nr_running == cfs.h_nr_running means there are no
		   RT tasks on this rq */
		p = fair_sched_class.pick_next_task(rq);	/* pick the process to
								   switch in from the
								   CFS queue */
		if (likely(p))
			return p;
	}

	/*
	 * #define sched_class_highest (&stop_sched_class)
	 * #define for_each_class(class) \
	 *	for (class = sched_class_highest; class; class = class->next)
	 */
	for_each_class(class) {
		p = class->pick_next_task(rq);
		if (p)
			return p;
	}

	BUG(); /* the idle class will always have a runnable task */
}
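The for_each_class() walk works because the scheduling classes form a priority-ordered singly linked list of function-pointer tables. Below is a self-contained user-space model of that mechanism; the names are hypothetical and the chain is shortened (on this kernel the real order is stop -> rt -> fair -> idle):

#include <stdio.h>

struct task { const char *name; };

struct sched_class {
	const char *name;
	const struct sched_class *next;		/* next class, in priority order */
	struct task *(*pick_next_task)(void);
};

static struct task rt_task = { "rt" };

static struct task *rt_pick(void)   { return &rt_task; }	/* RT queue has a task */
static struct task *fair_pick(void) { return NULL; }		/* CFS queue is empty */
static struct task *idle_pick(void)
{
	static struct task idle = { "idle" };
	return &idle;						/* idle never fails */
}

static const struct sched_class idle_class = { "idle", NULL,        idle_pick };
static const struct sched_class fair_class = { "fair", &idle_class, fair_pick };
static const struct sched_class rt_class   = { "rt",   &fair_class, rt_pick };

#define sched_class_highest (&rt_class)
#define for_each_class(class) \
	for (class = sched_class_highest; class; class = class->next)

int main(void)
{
	const struct sched_class *class;
	struct task *p;

	for_each_class(class) {		/* walk classes in priority order */
		p = class->pick_next_task();
		if (p) {
			printf("picked %s from %s class\n", p->name, class->name);
			break;
		}
	}
	return 0;
}

Because the RT class sits ahead of the fair class and its pick succeeds, the model prints "picked rt from rt class"; the idle class at the tail guarantees the loop always terminates with some task, which is exactly why the kernel version ends in BUG().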
context_switch(): switching between processes:
/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */
static inline void
context_switch(struct rq *rq, struct task_struct *prev,
	       struct task_struct *next)
{
	struct mm_struct *mm, *oldmm;

	prepare_task_switch(rq, prev, next);

	mm = next->mm;			/* memory descriptor owned by next; for an
					   ordinary process mm and active_mm hold the
					   same address, but a kernel thread has no
					   address space of its own (mm == NULL) */
	oldmm = prev->active_mm;	/* memory descriptor prev is actually using */
	/*
	 * For paravirt, this is coupled with an exit in switch_to to
	 * combine the page table reload and the switch backend into
	 * one hypercall.
	 */
	arch_start_context_switch(prev);

	if (!mm) {			/* next is a kernel thread: borrow prev's
					   address space */
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else				/* otherwise replace prev's address space
					   with next's */
		switch_mm(oldmm, mm, next);

	if (!prev->mm) {		/* prev is a kernel thread (or an exiting
					   process): save its memory descriptor in
					   the runqueue's prev_mm and clear
					   prev->active_mm */
		prev->active_mm = NULL;
		rq->prev_mm = oldmm;
	}
	/*
	 * Since the runqueue lock will be released by the next
	 * task (which is an invalid locking op but in the case
	 * of the scheduler it's an obvious special-case), so we
	 * do an early lockdep release here:
	 */
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
#endif

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);	/* after switch_to returns, prev points to
					   the task we switched back from */

	barrier();
	/*
	 * this_rq must be evaluated again because prev may have moved
	 * CPUs since it called schedule(), thus the 'rq' on its stack
	 * frame will be invalid.
	 */
	finish_task_switch(this_rq(), prev);
}
/**
 * finish_task_switch - clean up after a task-switch
 * @rq: runqueue associated with task-switch
 * @prev: the thread we just switched away from.
 *
 * finish_task_switch must be called after the context switch, paired
 * with a prepare_task_switch call before the context switch.
 * finish_task_switch will reconcile locking set up by prepare_task_switch,
 * and do any other architecture-specific cleanup actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock. (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */
static void finish_task_switch(struct rq *rq, struct task_struct *prev)
	__releases(rq->lock)
{
	struct mm_struct *mm = rq->prev_mm;
	long prev_state;

	rq->prev_mm = NULL;

	/*
	 * A task struct has one reference for the use as "current".
	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
	 * schedule one last time. The schedule call will never return, and
	 * the scheduled task must drop that reference.
	 * The test for TASK_DEAD must occur while the runqueue locks are
	 * still held, otherwise prev could be scheduled on another cpu, die
	 * there before we look at prev->state, and then the reference would
	 * be dropped twice.
	 * Manfred Spraul <[email protected]>
	 */
	prev_state = prev->state;
	finish_arch_switch(prev);
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_disable();
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
	perf_event_task_sched_in(prev, current);
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
	finish_lock_switch(rq, prev);
	finish_arch_post_lock_switch();

	fire_sched_in_preempt_notifiers(current);
	if (mm)
		mmdrop(mm);	/* drop the reference taken in context_switch() */
	if (unlikely(prev_state == TASK_DEAD)) {
		/*
		 * Remove function-return probe instances associated with this
		 * task and put them back on the free list.
		 */
		kprobe_flush_task(prev);
		put_task_struct(prev);
	}
}
This path also releases the page-table related descriptors and virtual memory areas, drops the runqueue spinlock, and re-enables local interrupts; if prev is a zombie task being removed from the system, put_task_struct() is called to drop the reference count on its process descriptor (see the notes on process deletion).
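The mm_count handling in context_switch() and finish_task_switch() pairs up: a kernel thread borrows the previous task's active_mm and takes a reference, and that reference is dropped after the next switch. Below is a self-contained user-space model of that borrow/release protocol, with hypothetical, illustrative names only:

#include <stdio.h>

/* Toy model of the mm refcounting around a context switch. */
struct mm {
	int mm_count;		/* reference count, like mm_struct.mm_count */
};

struct task {
	const char *name;
	struct mm *mm;		/* NULL for a kernel thread */
	struct mm *active_mm;	/* address space actually in use */
};

static struct mm *prev_mm;	/* models rq->prev_mm */

/* models the mm handling in context_switch() */
static void switch_mm_refs(struct task *prev, struct task *next)
{
	struct mm *oldmm = prev->active_mm;

	if (!next->mm) {			/* next is a kernel thread: */
		next->active_mm = oldmm;	/* borrow prev's address space */
		oldmm->mm_count++;		/* ... and pin it */
	} else {
		next->active_mm = next->mm;	/* normal process: use its own mm */
	}

	if (!prev->mm) {			/* prev was a kernel thread: */
		prev->active_mm = NULL;		/* stop borrowing */
		prev_mm = oldmm;		/* defer the drop, like rq->prev_mm */
	}
}

/* models the mmdrop() in finish_task_switch() */
static void finish_switch_refs(void)
{
	if (prev_mm) {
		if (--prev_mm->mm_count == 0)
			printf("mm freed\n");
		prev_mm = NULL;
	}
}

int main(void)
{
	struct mm user_mm = { .mm_count = 1 };
	struct task user = { "user", &user_mm, &user_mm };
	struct task kthread = { "kthread", NULL, NULL };

	switch_mm_refs(&user, &kthread);	/* user -> kthread: borrow */
	finish_switch_refs();			/* nothing deferred yet */
	printf("mm_count after borrow: %d\n", user_mm.mm_count);	/* 2 */

	switch_mm_refs(&kthread, &user);	/* kthread -> user: defer the drop */
	finish_switch_refs();			/* drops the borrowed reference */
	printf("mm_count after release: %d\n", user_mm.mm_count);	/* 1 */
	return 0;
}

The drop is deferred to finish_task_switch() because, as the kernel comment above explains, calling it with the runqueue lock held could deadlock.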
As for prepare_task_switch(rq, prev, next), the relevant call chain is:

prepare_task_switch() --> sched_info_switch() --> __sched_info_switch()
static inline void
__sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
	struct rq *rq = task_rq(prev);

	/*
	 * prev now departs the cpu. It's not interesting to record
	 * stats about how efficient we were at scheduling the idle
	 * process, however.
	 */
	if (prev != rq->idle)		/* if the task switched out is not the
					   idle task */
		sched_info_depart(prev);	/* update prev's and its rq's
						   statistics */

	if (next != rq->idle)
		sched_info_arrive(next);
}
/*
 * Called when a process ceases being the active-running process, either
 * voluntarily or involuntarily. Now we can calculate how long we ran.
 * Also, if the process is still in the TASK_RUNNING state, call
 * sched_info_queued() to mark that it has now again started waiting on
 * the runqueue.
 */
static inline void sched_info_depart(struct task_struct *t)
{
	unsigned long long delta = task_rq(t)->clock -
					t->sched_info.last_arrival;

	/* accumulate the cpu time that this rq's tasks have received */
	rq_sched_info_depart(task_rq(t), delta);

	/*
	 * If the departing task is still TASK_RUNNING, record
	 * sched_info.last_queued = rq->clock as the moment it last
	 * started waiting on the runqueue.
	 */
	if (t->state == TASK_RUNNING)
		sched_info_queued(t);
}

/*
 * This function is only called from enqueue_task(), but also only updates
 * the timestamp if it is already not set. It's assumed that
 * sched_info_dequeued() will clear that stamp when appropriate.
 */
static inline void sched_info_queued(struct task_struct *t)
{
	if (unlikely(sched_info_on()))
		if (!t->sched_info.last_queued)
			t->sched_info.last_queued = task_rq(t)->clock;
}
/*
 * Called when a task finally hits the cpu. We can now calculate how
 * long it was waiting to run. We also note when it began so that we
 * can keep stats on how long its timeslice is.
 */
static void sched_info_arrive(struct task_struct *t)
{
	unsigned long long now = task_rq(t)->clock, delta = 0;

	if (t->sched_info.last_queued)	/* was the task queued before being
					   switched in? */
		delta = now - t->sched_info.last_queued;	/* time spent waiting
								   on the runqueue */
	sched_info_reset_dequeued(t);	/* about to run: reset last_queued to 0 */
	t->sched_info.run_delay += delta;	/* total time spent waiting on
						   runqueues */
	t->sched_info.last_arrival = now;	/* timestamp of this run */
	t->sched_info.pcount++;			/* one more time on a cpu */

	/* update the corresponding counters in the rq's rq_sched_info */
	rq_sched_info_arrive(task_rq(t), delta);
}
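A small worked example of this bookkeeping, as a user-space model with hypothetical clock values:

#include <stdio.h>

/* Toy model of the sched_info fields updated above. */
struct sched_info {
	unsigned long long last_queued;		/* when the task last started waiting */
	unsigned long long last_arrival;	/* when the task last got the cpu */
	unsigned long long run_delay;		/* total runqueue wait time */
	unsigned long pcount;			/* times scheduled onto a cpu */
};

static void arrive(struct sched_info *si, unsigned long long clock)
{
	unsigned long long delta = 0;

	if (si->last_queued)		/* queued before being switched in? */
		delta = clock - si->last_queued;
	si->last_queued = 0;		/* models sched_info_reset_dequeued() */
	si->run_delay += delta;
	si->last_arrival = clock;
	si->pcount++;
}

int main(void)
{
	struct sched_info si = { 0 };

	si.last_queued = 1000;	/* enqueued at clock 1000 (hypothetical value) */
	arrive(&si, 1500);	/* gets the cpu at clock 1500 */

	/* waited 1500 - 1000 = 500 on the runqueue */
	printf("run_delay=%llu last_arrival=%llu pcount=%lu\n",
	       si.run_delay, si.last_arrival, si.pcount);
	return 0;
}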
As this shows, a task does not start running the moment it is enqueued: it first waits on the runqueue, and run_delay records how long.
The figure below is taken from http://www.cnblogs.com/tianchi/archive/2012/08/07/2626013.html