Linux CFS调度器之msleep

前言


我们之前多次提到过,睡眠的进程由于没有占用CPU资源,所以其vruntime会比较小,睡眠时间越长,vruntime相对越小。为了防止睡眠进程醒来后独占CPU导致其他进程饿死,进程醒来后会重新调整其vruntime值,使其既能够受到CPU的优待,又不至于导致其他进程饥饿。

另外,由于之前只分析了CFS调度器相关的代码,所以本文涉及到具体调度器的时候,全部以CFS调度器为例。

进程睡眠msleep函数


msleep

/*
 * msleep - put the calling task to sleep for @msecs milliseconds.
 * @msecs: requested sleep duration in milliseconds.
 *
 * The duration is converted to jiffies with one extra jiffy added. The task
 * then goes back to sleep repeatedly for as long as
 * schedule_timeout_uninterruptible() reports a non-zero remainder.
 */
void msleep(unsigned int msecs)
{
	unsigned long remaining = msecs_to_jiffies(msecs) + 1;

	while (remaining != 0) {
		remaining = schedule_timeout_uninterruptible(remaining);
	}
}
  • 以jiffies为单位计算超时时间
  • 调用schedule_timeout_uninterruptible进行下一步处理。
    因为这里是sleep,说睡眠多久就睡眠多久,不能被信号提前打断,所以这里的睡眠是不可中断的,因此调用schedule_timeout_uninterruptible接口;即使进程被提前唤醒,只要返回的剩余时间不为0,while循环也会让它继续睡眠。

msleep->schedule_timeout_uninterruptible

/*
 * Mark the current task TASK_UNINTERRUPTIBLE, then sleep via
 * schedule_timeout().
 *
 * @timeout: sleep length in jiffies, handed to schedule_timeout() unchanged.
 *
 * Returns whatever schedule_timeout() returns.
 */
signed long __sched schedule_timeout_uninterruptible(signed long timeout)
{
	signed long remaining;

	/* The state must be set before the task is scheduled out. */
	__set_current_state(TASK_UNINTERRUPTIBLE);
	remaining = schedule_timeout(timeout);

	return remaining;
}
  • 进程状态设置为TASK_UNINTERRUPTIBLE状态,也就是说睡眠期间不会被信号提前唤醒。
  • 调用 schedule_timeout接口进行下一步处理。

msleep->schedule_timeout_uninterruptible->schedule_timeout

/*
 * Schedule the current task out, with an on-stack timer armed to fire after
 * @timeout jiffies.
 *
 * @timeout: number of jiffies to sleep. MAX_SCHEDULE_TIMEOUT skips the timer
 *           entirely and just calls schedule(). A negative value is reported
 *           as an error and the function returns without sleeping.
 *
 * This function does not put the task into a sleeping state itself;
 * schedule_timeout_uninterruptible() does that before calling it.
 *
 * Returns the jiffies remaining until the computed expiry, clamped so that a
 * negative value becomes 0. On the MAX_SCHEDULE_TIMEOUT path @timeout itself
 * is what reaches the return statement.
 */
signed long __sched schedule_timeout(signed long timeout)
{
	struct timer_list timer;
	unsigned long expire;

	switch (timeout)
	{
	case MAX_SCHEDULE_TIMEOUT:
		/*
		 * These two special cases are useful to be comfortable
		 * in the caller. Nothing more. We could take
		 * MAX_SCHEDULE_TIMEOUT from one of the negative value
		 * but I'd like to return a valid offset (>=0) to allow
		 * the caller to do everything it want with the retval.
		 */
		schedule();
		goto out;
	default:
		/*
		 * Another bit of PARANOID. Note that the retval will be
		 * 0 since no piece of kernel is supposed to do a check
		 * for a negative retval of schedule_timeout() (since it
		 * should never happens anyway). You just have the printk()
		 * that will tell you if something is gone wrong and where.
		 */
		if (timeout < 0) {
			printk(KERN_ERR "schedule_timeout: wrong timeout "
				"value %lx\n", timeout);
			dump_stack();
			/* Undo the sleeping state, since we return without scheduling. */
			current->state = TASK_RUNNING;
			goto out;
		}
	}

	/* Absolute expiry time in jiffies. */
	expire = timeout + jiffies;

	/*
	 * The timer is a local variable on this stack frame. Its callback is
	 * process_timeout with the current task as its argument (presumably
	 * the callback wakes that task -- confirm against process_timeout).
	 */
	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
	__mod_timer(&timer, expire, false);
	schedule();
	/* Execution resumes here after wakeup; the timer is no longer needed. */
	del_singleshot_timer_sync(&timer);

	/* Remove the timer from the object tracker */
	destroy_timer_on_stack(&timer);

	/* Negative if we resumed after the expiry; clamped to 0 below. */
	timeout = expire - jiffies;

 out:
	return timeout < 0 ? 0 : timeout;
}
  • 如果传入的timeout是MAX_SCHEDULE_TIMEOUT,那么不会设置定时器,该进程直接调用schedule调度出去,schedule函数会将当前进程移出CFS调度队列;由于没有超时定时器,除非有其他代码显式唤醒它,否则该进程会一直睡眠下去。
  • 设置定时器,一旦达到睡眠时间会调用process_timeout函数,唤醒睡眠的进程。
  • 调用schedule将当前进程调度出去。
  • 删除定时器。

调用schedule之后,当前进程被移出调度队列,并且进程状态不再是TASK_RUNNING,所以该进程得不到调度,这也就是所谓的进程睡眠。进程睡眠后,要等到超时时间到达时由process_timeout函数唤醒;一旦被唤醒,会接着schedule之后的代码继续执行,此时定时器已经没有用了,所以可以删除。

上面我们说schedule会将当前进程移出调度队列,这里其实是不准确的:如果当前进程以TASK_INTERRUPTIBLE状态休眠,并且还有待处理的signal,进程依然会留在调度队列上,至于原因会有专门的文章分析。

唤醒睡眠进程


process_timeout->wake_up_process->try_to_wake_up

/*
 * Wake up task @p if its current state is one of those selected by @state.
 *
 * @p:          task to wake.
 * @state:      mask of task states that are eligible for this wakeup.
 * @wake_flags: wakeup flags forwarded to the helpers; WF_MIGRATED is OR-ed in
 *              when a different CPU is selected for the task.
 *
 * p->pi_lock is held with interrupts saved from the state check until just
 * before returning.
 *
 * Returns 1 if @p's state matched @state, 0 if it did not and nothing was
 * done.
 */
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
	unsigned long flags;
	int cpu, success = 0;
	smp_mb__before_spinlock();
	raw_spin_lock_irqsave(&p->pi_lock, flags);
	/* Task is not in any of the requested states: nothing to wake. */
	if (!(p->state & state))
		goto out;

	trace_sched_waking(p);

	success = 1; /* we're going to change ->state */
	cpu = task_cpu(p);
	smp_rmb();
	/*
	 * Task is still on a runqueue and ttwu_remote() handled the wakeup:
	 * skip the enqueue path and only record statistics.
	 */
	if (p->on_rq && ttwu_remote(p, wake_flags))
		goto stat;

#ifdef CONFIG_SMP
	smp_rmb();
	/* Presumably waits until p->on_cpu reads as zero -- confirm. */
	smp_cond_load_acquire(&p->on_cpu, !VAL);

	p->sched_contributes_to_load = !!task_contributes_to_load(p);
	p->state = TASK_WAKING;

	/* Pick the CPU to run on; flag a migration if it differs. */
	cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
	if (task_cpu(p) != cpu) {
		wake_flags |= WF_MIGRATED;
		set_task_cpu(p, cpu);
	}
#endif /* CONFIG_SMP */

	/* Queue the task on the chosen CPU. */
	ttwu_queue(p, cpu, wake_flags);
stat:
	ttwu_stat(p, cpu, wake_flags);
out:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	return success;
}

我们这里只关注wakeup的核心逻辑,不去分析锁以及rmb相关的逻辑。

  • 入口参数state指明可以唤醒处于什么状态的进程,wake_up_process传入的是TASK_NORMAL,其定义为:
/* State mask combining TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE. */
#define TASK_NORMAL		(TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

也就是说处于TASK_INTERRUPTIBLE和TASK_UNINTERRUPTIBLE状态的进程可以被唤醒,前面我们分析过msleep将进程置于TASK_UNINTERRUPTIBLE状态,所以调用msleep休眠的进程可以被唤醒。

  • 如果进程依旧在cfs调度队列上,调用ttwu_remote唤醒进程。
    我们前面提到过,如果进程有待处理的signal,它不会被移出调度队列,这里只要调用ttwu_remote简单地将其状态设置为TASK_RUNNING就可以了。
  • 在SMP系统上,会调用select_task_rq函数选择一个合适的CPU,将需要唤醒的进程放到该CPU的调度队列来执行,这就是负载均衡的概念,这里可以看出,进程在某个CPU上休眠的,醒来后很可能会在另外一个CPU上运行。
  • 调用ttwu_queue函数将进程重新加入CFS调度队列,并将其状态设置为TASK_RUNNING。

process_timeout->wake_up_process->try_to_wake_up->ttwu_queue->ttwu_do_activate

/*
 * Enqueue woken task @p on runqueue @rq and complete the wakeup.
 *
 * @rq:         runqueue to enqueue on; rq->lock must already be held.
 * @p:          task being woken.
 * @wake_flags: wakeup flags; WF_MIGRATED is translated to ENQUEUE_MIGRATED.
 * @cookie:     pin cookie passed through to ttwu_do_wakeup().
 */
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
		 struct pin_cookie cookie)
{
	int enqueue_flags = ENQUEUE_WAKEUP;

	lockdep_assert_held(&rq->lock);

#ifdef CONFIG_SMP
	/* Tell the enqueue path when the task changed CPU during wakeup. */
	if (wake_flags & WF_MIGRATED)
		enqueue_flags |= ENQUEUE_MIGRATED;

	/* Drop the task from rq's uninterruptible count if it was counted. */
	if (p->sched_contributes_to_load)
		rq->nr_uninterruptible--;
#endif

	ttwu_activate(rq, p, enqueue_flags);
	ttwu_do_wakeup(rq, p, wake_flags, cookie);
}
  • 调用函数ttwu_activate将进程加入CFS调度队列
  • 调用ttwu_do_wakeup函数将进程状态设置为TASK_RUNNING
    process_timeout->wake_up_process->try_to_wake_up->ttwu_queue->ttwu_do_activate->ttwu_activate->activate_task->enqueue_task->enqueue_task_fair
/*
 * Enqueue task @p's scheduling entity on its CFS runqueue(s).
 *
 * @rq:    runqueue the task is being added to.
 * @p:     task to enqueue; its entity is &p->se.
 * @flags: enqueue flags for the first enqueue_entity() call; later
 *         iterations of the first loop use ENQUEUE_WAKEUP.
 *
 * for_each_sched_entity() presumably walks from the task's entity up through
 * its parent group entities -- confirm against the macro definition.
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;

	/*
	 * If in_iowait is set, the code below may not trigger any cpufreq
	 * utilization updates, so do it here explicitly with the IOWAIT flag
	 * passed.
	 */
	if (p->in_iowait)
		cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);

	/* First pass: enqueue each entity that is not yet on a runqueue. */
	for_each_sched_entity(se) {
		/* Already queued: stop enqueueing here. */
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, flags);

		/*
		 * end evaluation on encountering a throttled cfs_rq
		 *
		 * note: in the case of encountering a throttled cfs_rq we will
		 * post the final h_nr_running increment below.
		 */
		if (cfs_rq_throttled(cfs_rq))
			break;
		cfs_rq->h_nr_running++;

		flags = ENQUEUE_WAKEUP;
	}

	/*
	 * Second pass: continues from wherever the first loop stopped (se is
	 * shared), bumping h_nr_running and refreshing load/shares until a
	 * throttled cfs_rq is met.
	 */
	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		cfs_rq->h_nr_running++;

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_load_avg(se, 1);
		update_cfs_shares(cfs_rq);
	}

	/* se is NULL only if neither loop broke out on a throttled cfs_rq. */
	if (!se)
		add_nr_running(rq, 1);

	hrtick_update(rq);
}
  • 如果进程本身就在调度队列上,那么不需要再次执行入队操作。
  • 如果进程不在调度队列,调用函数enqueue_entity入队,enqueue_entity先调用place_entity函数给要唤醒的进程一个合适的vruntime值,然后调用__enqueue_entity函数将睡眠进程加入红黑树。
    process_timeout->wake_up_process->try_to_wake_up->ttwu_queue->ttwu_do_activate->ttwu_activate->activate_task->enqueue_task->enqueue_task_fair->enqueue_entity->place_entity
/*
 * Choose the vruntime at which @se is placed on @cfs_rq.
 *
 * @cfs_rq:  runqueue whose min_vruntime is the starting point.
 * @se:      entity being placed; only se->vruntime is written.
 * @initial: non-zero for a newly created entity, zero for one waking up.
 *
 * The entity's vruntime is never moved backwards: the final value is the
 * larger of its current vruntime and the computed target.
 */
static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
	u64 target = cfs_rq->min_vruntime;

	if (initial) {
		/*
		 * The current period is already promised to the running
		 * tasks, so with START_DEBIT a new entity is pushed back by
		 * one vslice to land in the slot left open at the end.
		 */
		if (sched_feat(START_DEBIT))
			target += sched_vslice(cfs_rq, se);
	} else {
		/* A sleeper is credited with up to one scheduling latency. */
		unsigned long credit = sysctl_sched_latency;

		/* GENTLE_FAIR_SLEEPERS halves that credit. */
		if (sched_feat(GENTLE_FAIR_SLEEPERS))
			credit >>= 1;

		target -= credit;
	}

	/* Keep whichever is later so placement never gains the entity time. */
	se->vruntime = max_vruntime(se->vruntime, target);
}

这个函数非常简单,睡眠进程的vruntime就是用min_vruntime减去一个调度延时sysctl_sched_latency的时间(如果开启了GENTLE_FAIR_SLEEPERS特性,则只减去sysctl_sched_latency的一半),这就是对休眠进程所谓的优待。当然,如果休眠进程睡眠的时间比较短,其本身的vruntime比这个计算出来的值要大,这种情况下,没有必要对休眠进程补偿,保持其原有的vruntime即可。

你可能感兴趣的:(进程管理学习记录)