We have mentioned several times that a sleeping process does not consume CPU time, so its vruntime stops advancing; the longer it sleeps, the further its vruntime falls behind the rest of the run queue. To keep a freshly woken process from monopolizing the CPU and starving everyone else, the kernel adjusts its vruntime on wakeup so that it still receives some preferential treatment without starving the other processes.
Also, since we have so far only analyzed the CFS-related code, whenever a concrete scheduler is involved in this article we use CFS as the example. We start from the sleep side, taking msleep() as the entry point.
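To make the walkthrough concrete, here is a minimal, hypothetical kernel-thread sketch (the module and the name poll_thread are made up purely for illustration) that periodically sleeps via msleep(); while it sleeps it consumes no CPU, so its vruntime stops advancing:

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *poll_task;

static int poll_thread(void *data)
{
        while (!kthread_should_stop()) {
                /* do some periodic work here, then sleep ~100ms;
                 * during msleep() the thread is TASK_UNINTERRUPTIBLE */
                msleep(100);
        }
        return 0;
}

static int __init poll_init(void)
{
        poll_task = kthread_run(poll_thread, NULL, "poll_thread");
        return IS_ERR(poll_task) ? PTR_ERR(poll_task) : 0;
}

static void __exit poll_exit(void)
{
        kthread_stop(poll_task);
}

module_init(poll_init);
module_exit(poll_exit);
MODULE_LICENSE("GPL");

msleep() itself is implemented on top of schedule_timeout():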
void msleep(unsigned int msecs)
{
        unsigned long timeout = msecs_to_jiffies(msecs) + 1;

        while (timeout)
                timeout = schedule_timeout_uninterruptible(timeout);
}
signed long __sched schedule_timeout_uninterruptible(signed long timeout)
{
        __set_current_state(TASK_UNINTERRUPTIBLE);
        return schedule_timeout(timeout);
}
signed long __sched schedule_timeout(signed long timeout)
{
        struct timer_list timer;
        unsigned long expire;

        switch (timeout)
        {
        case MAX_SCHEDULE_TIMEOUT:
                /*
                 * These two special cases are useful to be comfortable
                 * in the caller. Nothing more. We could take
                 * MAX_SCHEDULE_TIMEOUT from one of the negative value
                 * but I' d like to return a valid offset (>=0) to allow
                 * the caller to do everything it want with the retval.
                 */
                schedule();
                goto out;
        default:
                /*
                 * Another bit of PARANOID. Note that the retval will be
                 * 0 since no piece of kernel is supposed to do a check
                 * for a negative retval of schedule_timeout() (since it
                 * should never happens anyway). You just have the printk()
                 * that will tell you if something is gone wrong and where.
                 */
                if (timeout < 0) {
                        printk(KERN_ERR "schedule_timeout: wrong timeout "
                                "value %lx\n", timeout);
                        dump_stack();
                        current->state = TASK_RUNNING;
                        goto out;
                }
        }

        expire = timeout + jiffies;

        setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
        __mod_timer(&timer, expire, false);
        schedule();
        del_singleshot_timer_sync(&timer);

        /* Remove the timer from the object tracker */
        destroy_timer_on_stack(&timer);

        timeout = expire - jiffies;

out:
        return timeout < 0 ? 0 : timeout;
}
After schedule() is called, the current process is removed from the run queue and its state is no longer TASK_RUNNING, so it will not be picked by the scheduler again. This is what we call process sleep. The sleeping process can only be woken up by process_timeout() once the timeout expires; after being woken, it resumes execution right after the schedule() call, at which point the timer is no longer needed and can be deleted.
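For completeness, process_timeout() (kernel/time/timer.c in this kernel generation) simply wakes the task whose pointer was stashed in the timer above:

static void process_timeout(unsigned long __data)
{
        wake_up_process((struct task_struct *)__data);
}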
Above we said that schedule() removes the current process from the run queue. That is not entirely accurate: if the process went to sleep in TASK_INTERRUPTIBLE state and still has a pending signal, it stays on the run queue. The reason will be analyzed in a dedicated article.
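To make that claim checkable, here is a simplified, paraphrased excerpt of the relevant part of __schedule() (kernel/sched/core.c; the exact code varies slightly across versions): the task is dequeued only when it gave up the CPU with a non-running state and is not an interruptible sleeper with a signal pending.

        /* Paraphrased from __schedule(): */
        if (!preempt && prev->state) {
                if (unlikely(signal_pending_state(prev->state, prev))) {
                        /* TASK_INTERRUPTIBLE sleep + pending signal:
                         * stay TASK_RUNNING, stay on the run queue */
                        prev->state = TASK_RUNNING;
                } else {
                        /* really going to sleep: leave the run queue */
                        deactivate_task(rq, prev, DEQUEUE_SLEEP);
                        prev->on_rq = 0;
                }
        }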
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
        unsigned long flags;
        int cpu, success = 0;

        smp_mb__before_spinlock();
        raw_spin_lock_irqsave(&p->pi_lock, flags);
        if (!(p->state & state))
                goto out;

        trace_sched_waking(p);

        success = 1; /* we're going to change ->state */
        cpu = task_cpu(p);

        smp_rmb();
        if (p->on_rq && ttwu_remote(p, wake_flags))
                goto stat;

#ifdef CONFIG_SMP
        smp_rmb();

        smp_cond_load_acquire(&p->on_cpu, !VAL);

        p->sched_contributes_to_load = !!task_contributes_to_load(p);
        p->state = TASK_WAKING;

        cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
        if (task_cpu(p) != cpu) {
                wake_flags |= WF_MIGRATED;
                set_task_cpu(p, cpu);
        }
#endif /* CONFIG_SMP */

        ttwu_queue(p, cpu, wake_flags);
stat:
        ttwu_stat(p, cpu, wake_flags);
out:
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);

        return success;
}
Here we only focus on the core wakeup logic and will not dig into the locking or memory-barrier (rmb) details. The state argument passed in from wake_up_process() is TASK_NORMAL:

#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

In other words, processes in either TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE state can be woken up. We saw earlier that msleep() puts the process into TASK_UNINTERRUPTIBLE state, so a process sleeping in msleep() can indeed be woken this way.
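For reference, wake_up_process() (kernel/sched/core.c), which process_timeout() calls above, is just a thin wrapper:

int wake_up_process(struct task_struct *p)
{
        return try_to_wake_up(p, TASK_NORMAL, 0);
}
EXPORT_SYMBOL(wake_up_process);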
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
                 struct pin_cookie cookie)
{
        int en_flags = ENQUEUE_WAKEUP;

        lockdep_assert_held(&rq->lock);

#ifdef CONFIG_SMP
        if (p->sched_contributes_to_load)
                rq->nr_uninterruptible--;

        if (wake_flags & WF_MIGRATED)
                en_flags |= ENQUEUE_MIGRATED;
#endif

        ttwu_activate(rq, p, en_flags);
        ttwu_do_wakeup(rq, p, wake_flags, cookie);
}
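ttwu_activate() reaches the scheduling class through activate_task() and enqueue_task(); the latter ends up calling p->sched_class->enqueue_task(), which for a CFS task is enqueue_task_fair(), shown next. A simplified sketch of that dispatch (paraphrased; the exact code varies across kernel versions):

/* Paraphrased call chain: ttwu_activate() -> activate_task() -> enqueue_task() */
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
        update_rq_clock(rq);
        /* dispatch to the task's scheduling class;
         * for CFS this is enqueue_task_fair() */
        p->sched_class->enqueue_task(rq, p, flags);
}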
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;

        /*
         * If in_iowait is set, the code below may not trigger any cpufreq
         * utilization updates, so do it here explicitly with the IOWAIT flag
         * passed.
         */
        if (p->in_iowait)
                cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);

        for_each_sched_entity(se) {
                if (se->on_rq)
                        break;
                cfs_rq = cfs_rq_of(se);
                enqueue_entity(cfs_rq, se, flags);

                /*
                 * end evaluation on encountering a throttled cfs_rq
                 *
                 * note: in the case of encountering a throttled cfs_rq we will
                 * post the final h_nr_running increment below.
                 */
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running++;

                flags = ENQUEUE_WAKEUP;
        }

        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running++;

                if (cfs_rq_throttled(cfs_rq))
                        break;

                update_load_avg(se, 1);
                update_cfs_shares(cfs_rq);
        }

        if (!se)
                add_nr_running(rq, 1);

        hrtick_update(rq);
}
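In the first loop above, enqueue_entity() is where a waking entity gets re-placed on the virtual timeline. A simplified excerpt (the full function also updates load tracking and statistics):

        /* inside enqueue_entity(): wakeup placement of the entity */
        if (flags & ENQUEUE_WAKEUP)
                place_entity(cfs_rq, se, 0);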
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
        u64 vruntime = cfs_rq->min_vruntime;

        /*
         * The 'current' period is already promised to the current tasks,
         * however the extra weight of the new task will slow them down a
         * little, place the new task so that it fits in the slot that
         * stays open at the end.
         */
        if (initial && sched_feat(START_DEBIT))
                vruntime += sched_vslice(cfs_rq, se);

        /* sleeps up to a single latency don't count. */
        if (!initial) {
                unsigned long thresh = sysctl_sched_latency;

                /*
                 * Halve their sleep time's effect, to allow
                 * for a gentler effect of sleepers:
                 */
                if (sched_feat(GENTLE_FAIR_SLEEPERS))
                        thresh >>= 1;

                vruntime -= thresh;
        }

        /* ensure we never gain time by being placed backwards. */
        se->vruntime = max_vruntime(se->vruntime, vruntime);
}
This function is quite simple: a waking sleeper's vruntime is set to min_vruntime minus a scheduling-latency threshold, sysctl_sched_latency (halved when GENTLE_FAIR_SLEEPERS is enabled, which is the default). That is the preferential treatment a sleeper receives. Of course, if the process only slept for a short time, its own vruntime may still be larger than min_vruntime minus the threshold; in that case there is no need to compensate it, and max_vruntime() simply keeps its original value.
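As a purely illustrative, user-space sketch of that arithmetic (all numbers are made up; units are nanoseconds, and sysctl_sched_latency is assumed to be 24ms with GENTLE_FAIR_SLEEPERS on):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

static uint64_t max_vruntime(uint64_t a, uint64_t b)
{
        return a > b ? a : b;
}

int main(void)
{
        uint64_t min_vruntime  = 1000000000ULL;  /* cfs_rq->min_vruntime: 1s   */
        uint64_t sched_latency = 24000000ULL;    /* sysctl_sched_latency: 24ms */
        uint64_t thresh = sched_latency >> 1;    /* GENTLE_FAIR_SLEEPERS halves it */
        uint64_t target = min_vruntime - thresh; /* placement for a waking sleeper */

        uint64_t long_sleeper  = 200000000ULL;   /* fell far behind while asleep  */
        uint64_t short_sleeper = 999000000ULL;   /* barely behind min_vruntime    */

        /* long sleeper is pulled up to target: it gets at most ~12ms of bonus */
        printf("long sleeper : %" PRIu64 " -> %" PRIu64 "\n",
               long_sleeper, max_vruntime(long_sleeper, target));
        /* short sleeper keeps its own (larger) vruntime: no compensation */
        printf("short sleeper: %" PRIu64 " -> %" PRIu64 "\n",
               short_sleeper, max_vruntime(short_sleeper, target));
        return 0;
}

The long sleeper thus re-enters only about half a latency period behind min_vruntime: enough to be picked soon, but not enough to let it hog the CPU while it catches up.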