进程调度 ---CFS调度

完全公平算法CFS调度分析方法:

是依据进程就绪队列中等待的时间长短来进行调度选择,即在就绪队列中等待时间越长的进程得到调度的机会就越大,否则,机会就越小;CFS在所有可运行进程总数基础上计算出一个进程应该运行多久,而不是依靠nice值来计算时间片;nice值在CFS中被作为进程获得的处理器运行比的权重:越高的nice值(越低的优先级)进程获得更低的处理器使用权重,这是相对默认nice值进程的进程而言的;相反,更低的nice值(越高的优先级)的进程获得更高的处理器使用权重。

在初始化时:

__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);   定时器为调度的前提

#ifdef CONFIG_NO_HZ
	nohz.next_balance = jiffies;
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
	cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */

}

在start_kernel()--------------->调用sched_init();

                                             --------------------->init_cfs_rq(&rq->cfs);
                            ----------------------->init_rt_rq(&rq->rt, rq);

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;/*运行负载*/  
	unsigned int nr_running, h_nr_running;

	u64 exec_clock;
	u64 min_vruntime;/*保存的最小运行时间*/ 
#ifndef CONFIG_64BIT
	u64 min_vruntime_copy;
#endif

	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr, *next, *last, *skip;

#ifdef	CONFIG_SCHED_DEBUG
	unsigned int nr_spread_over;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	int on_list;
	struct list_head leaf_cfs_rq_list;
	struct task_group *tg;	/* group that "owns" this runqueue */

#ifdef CONFIG_SMP
	/*
	 *   h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	 */
	unsigned long h_load;

	/*
	 * Maintaining per-cpu shares distribution for group scheduling
	 *
	 * load_stamp is the last time we updated the load average
	 * load_last is the last time we updated the load average and saw load
	 * load_unacc_exec_time is currently unaccounted execution time
	 */
	u64 load_avg;
	u64 load_period;
	u64 load_stamp, load_last, load_unacc_exec_time;

	unsigned long load_contribution;
#endif /* CONFIG_SMP */
#ifdef CONFIG_CFS_BANDWIDTH
	int runtime_enabled;
	u64 runtime_expires;
	s64 runtime_remaining;

	u64 throttled_timestamp;
	int throttled, throttle_count;
	struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};

调度实体:

struct sched_entity {
	struct load_weight	load;		/* for load-balancing */
	struct rb_node		run_node;
	struct list_head	group_node;
	unsigned int		on_rq;//该se是否在rq上  

	u64			exec_start;//当前cfs_rq的时间,用于计算时间差
	u64			sum_exec_runtime;  //进程总共运行的时间,real-run time  
	u64			vruntime;/*存放进程的虚拟运行时间,用于调度器的选择*/
	u64			prev_sum_exec_runtime;//进程在醒来的时间  

	u64			nr_migrations;

#ifdef CONFIG_SCHEDSTATS
	struct sched_statistics statistics;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct sched_entity	*parent;
	/* rq on which this entity is (to be) queued: */
	struct cfs_rq		*cfs_rq;
	/* rq "owned" by this entity/group: */
	struct cfs_rq		*my_q;
#endif
};

对于CFS调度:可知,调度器的实体作为一个se的成员变量潜入在进程描述符task_struct内;

CFS调度具体class为fair_sched_class 

/*
 * All the scheduling class methods:
 */
const struct sched_class fair_sched_class = {
	.next			= &idle_sched_class,  下一个为idle进程;
	.enqueue_task		= enqueue_task_fair,
	.dequeue_task		= dequeue_task_fair,
	.yield_task		= yield_task_fair,
	.yield_to_task		= yield_to_task_fair,

	.check_preempt_curr	= check_preempt_wakeup,

	.pick_next_task		= pick_next_task_fair,
	.put_prev_task		= put_prev_task_fair,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_fair,

	.rq_online		= rq_online_fair,
	.rq_offline		= rq_offline_fair,

	.task_waking		= task_waking_fair,
#endif

	.set_curr_task          = set_curr_task_fair,
	.task_tick		= task_tick_fair,
	.task_fork		= task_fork_fair,

	.prio_changed		= prio_changed_fair,
	.switched_from		= switched_from_fair,
	.switched_to		= switched_to_fair,

	.get_rr_interval	= get_rr_interval_fair,

#ifdef CONFIG_FAIR_GROUP_SCHED
	.task_move_group	= task_move_group_fair,
#endif
};
由上一章可知:

在进程切换时:deactivate_task---->deactivate_task------>

                         p->sched_class->dequeue_task(rq, p, flags); 如果是CFS调度就会调度 fair_sched_class 中的dequeue_task_fair;

static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se; 根据task_struct中的调度类 调用调度函数,最后调度task_struct的sched_entity;
	int task_sleep = flags & DEQUEUE_SLEEP;

	for_each_sched_entity(se) {//考虑组调度,
		cfs_rq = cfs_rq_of(se); //获取se对应的运行队列
		dequeue_entity(cfs_rq, se, flags);

		/*
		 * end evaluation on encountering a throttled cfs_rq
		 *
		 * note: in the case of encountering a throttled cfs_rq we will
		 * post the final h_nr_running decrement below.
		*/
		if (cfs_rq_throttled(cfs_rq))
			break;
		cfs_rq->h_nr_running--;

		/* Don't dequeue parent if it has other entities besides us */
		if (cfs_rq->load.weight) {
			/*
			 * Bias pick_next to pick a task from this cfs_rq, as
			 * p is sleeping when it is within its sched_slice.
			 */
			if (task_sleep && parent_entity(se))
				set_next_buddy(parent_entity(se));

			/* avoid re-evaluating load for this entity */
			se = parent_entity(se);
			break;
		}
		flags |= DEQUEUE_SLEEP;
	}

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		cfs_rq->h_nr_running--;

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_cfs_load(cfs_rq, 0);
		update_cfs_shares(cfs_rq);
	}

	if (!se)
		dec_nr_running(rq);
	hrtick_update(rq);
}
删除进程:------------>dequeue_entity

static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);

	update_stats_dequeue(cfs_rq, se);
	if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
		if (entity_is_task(se)) {
			struct task_struct *tsk = task_of(se);

			if (tsk->state & TASK_INTERRUPTIBLE)
				se->statistics.sleep_start = rq_of(cfs_rq)->clock;
			if (tsk->state & TASK_UNINTERRUPTIBLE)
				se->statistics.block_start = rq_of(cfs_rq)->clock;
		}
#endif
	}

	clear_buddies(cfs_rq, se);

	if (se != cfs_rq->curr)
		__dequeue_entity(cfs_rq, se);
	se->on_rq = 0;
	update_cfs_load(cfs_rq, 0);
	account_entity_dequeue(cfs_rq, se);

	/*
	 * Normalize the entity after updating the min_vruntime because the
	 * update can refer to the ->curr item and we need to reflect this
	 * movement in our normalized position.
	 */
	if (!(flags & DEQUEUE_SLEEP))
		se->vruntime -= cfs_rq->min_vruntime;

	/* return excess runtime on last dequeue */
	return_cfs_rq_runtime(cfs_rq);

	update_min_vruntime(cfs_rq);
	update_cfs_shares(cfs_rq);
}

/

/*实现记账功能,由系统定时器周期调用*/
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock;/*now计时器*/
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	 /*获得从最后一次修改负载后当前任务所占用的运行总时间*/
	/*即计算当前进程的执行时间*/
	delta_exec = (unsigned long)(now - curr->exec_start);
	if (!delta_exec)/*如果本次没有执行过,不用重新更新了*/
		return;
	/*根据当前可运行进程总数对运行时间进行加权计算*/
	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;/*将exec_start属性置为now*/

	if (entity_is_task(curr)) {/*下面为关于组调度的,暂时不分析了*/
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}
}
</strong></span>

static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
	/*总运行时间更新,真实运行时间  */
	curr->sum_exec_runtime += delta_exec;
	/*更新cfs_rq的exec_clock*/
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	/*用优先级和delta_exec来计算weighted以用于更细vruntime*/
       /* calc_delta_fair用来将真实时间转化为虚拟时间。 
        进程的优先级不同,它在系统中的地位(也就是权重)也不同 
        进程的优先级越高,它的虚拟时间走的越慢。*/
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
	/*vruntime可以准确地测量给定进程的运行时间
	而且可知道谁应该是下一个被运行的进程*/

	/*更新进程的虚拟运行时间vruntime*/
	curr->vruntime += delta_exec_weighted;
	update_min_vruntime(cfs_rq);
}
每个进程在其产生(fork)的时候,都会根据其父亲的优先级产生它的优先级和权重(sched_fork函数)在copy_process函数中调用。
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
 	/*NICE_0_LOAD: 优先级0 的weight*/ /如果该进程拥有nice为0的权重,这是他的虚拟时钟和真实时钟是一样速度的   	
 /* 如果不是优先级0,就要调用calc_delta_mine计算delta的weight值*/ 
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);

	return delta;
}


 /*在这里不打算详细分析calc_delta_mine (delta_exec,weight,lw),它的执行过程约为delta *= weight / lw. 
从这个函数中可以看到,如果进程的优先级为0,那么就是返回delta.
如果不为0,就会调用calc_delta_mine()对delta值进行修正.对上面对calc_delta_mine()的说明来看
,有如下关系: Delta = delta* NICE_0_LOAD/ se->load 
Se->load值是怎么来的呢? 可以跟踪sys_nice(),就可以发现se->load
其它就是表示nice对应的load值,nice越低,值越大. 
据此,就可以得到一个结论.在执行相同时间的条件下(delta相同),
高优先的进程计算出来的delta值会比低优先级的进程计算出来
的低.应此,高优先的进程就会位于rb_tree的左边,在下次调度的
时候就会优先调度.
从注释来看calc_delta_mine的计算公式为delta *= weight / lw,也就是说进程的权重越大,时钟走的越慢,而且是线性的。 
*/
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	if (!lw->inv_weight) {
		if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
			lw->inv_weight = 1;
		else
			lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
				/ (lw->weight+1);
	}

	tmp = (u64)delta_exec * weight;
	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
			WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}

min_vruntime是cfs的rq中的一个成员,是cfs时间的基准,在cfs中起这至关重要的作用。  
自cfs产生以来,这部分的代码改动也是很频繁的。  
static void update_min_vruntime(struct cfs_rq *cfs_rq)  
{  
    u64 vruntime = cfs_rq->min_vruntime;  
  
    /* 
        由于当前运行的进程是不在红黑树上的,所以关于cfs_rq->min_vruntime的更新 
        必须要考虑当前的进程,以免产生不公平,这是以前的调度器所疏忽的。 
        如果有当前进程,就以当前进程作为基准计算 
    */  
    if (cfs_rq->curr)  
        vruntime = cfs_rq->curr->vruntime;  
  
    if (cfs_rq->rb_leftmost) {  
        struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,  
                           struct sched_entity,  
                           run_node);  
  
        if (!cfs_rq->curr)  
            /* 
                如果没有当前进程,这个在什么时候出现? 
                其他策略的进程在运行时? 
                就不用考虑它了,就是最小的运行时间 
            */  
            vruntime = se->vruntime;  
        else  
            /* 
                如果有当前进程,还要考虑这个最小的运行时间 
            */  
            vruntime = min_vruntime(vruntime, se->vruntime);  
    }  
  
    //最后,更新cfs_rq->min_vruntime,这个值是单调增加的。  
    cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);  
}  


static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*cfs_rq->load更新*/
	update_load_sub(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
	if (entity_is_task(se)) {/*组调度相关*/
		add_cfs_task_weight(cfs_rq, -se->load.weight);
		list_del_init(&se->group_node);
	}
	/*运行个数减一*/
	cfs_rq->nr_running--;
	se->on_rq = 0;/*表示不再运行队列中*/
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);    从红黑树里面删除
		cfs_rq->rb_leftmost = next_node;    
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

对于向运行队列中添加task:可知最后会调用enqueue_task_fair;

<span style="font-size: 24px;">*
 * The enqueue_task method is called before nr_running is
 * increased. Here we update the fair scheduling stats and
 * then put the task into the rbtree:
 */
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
	struct cfs_rq *cfs_rq;
	struct sched_entity *se = &p->se;
 /*对于主调度,会对一个组中的所有进程进行操作*/  
	for_each_sched_entity(se) {
		if (se->on_rq)
			break;
		cfs_rq = cfs_rq_of(se);
		enqueue_entity(cfs_rq, se, flags);

		/*
		 * end evaluation on encountering a throttled cfs_rq
		 *
		 * note: in the case of encountering a throttled cfs_rq we will
		 * post the final h_nr_running increment below.
		*/
		if (cfs_rq_throttled(cfs_rq))
			break;
		cfs_rq->h_nr_running++;

		flags = ENQUEUE_WAKEUP;
	}

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);
		cfs_rq->h_nr_running++;

		if (cfs_rq_throttled(cfs_rq))
			break;

		update_cfs_load(cfs_rq, 0);
		update_cfs_shares(cfs_rq);
	}

	if (!se)
		inc_nr_running(rq);
	hrtick_update(rq);
}
</span>
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)   更新相关调度信息后最终会调用下面函数插入运行进程的红黑树
{
	/*
	 * Update the normalized vruntime before updating min_vruntime
	 * through callig update_curr().
	 */
	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
		se->vruntime += cfs_rq->min_vruntime;

	/*
	 * Update run-time statistics of the 'current'.
	 */
	update_curr(cfs_rq);
	update_cfs_load(cfs_rq, 0);
	account_entity_enqueue(cfs_rq, se);
	update_cfs_shares(cfs_rq);

	if (flags & ENQUEUE_WAKEUP) {
		place_entity(cfs_rq, se, 0);//进程唤醒时的操作(try_to_wake_up会提到
		enqueue_sleeper(cfs_rq, se);
	}

	update_stats_enqueue(cfs_rq, se);
	check_spread(cfs_rq, se);
	if (se != cfs_rq->curr)
		__enqueue_entity(cfs_rq, se);
	se->on_rq = 1;

	if (cfs_rq->nr_running == 1) {
		list_add_leaf_cfs_rq(cfs_rq);
		check_enqueue_throttle(cfs_rq);
	}
}


/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We dont care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {/*key为被插入进程的vruntime*/ 
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
CFS的运行队列布局是放在红黑树里面的,而这颗红黑树的排序方式是按照运行实体的vruntime来的
CFS调度算法的核心是选择具有最小vruntine的任务。运行队列采用红黑树方式存放,其中节点的键值便是可运行进程的虚拟运行时间。CFS调度器选取待运行的下一个进程,是所有进程中vruntime最小的那个,他对应的便是在树中最左侧的叶子节点。

实现选择的函数为pick_next_task_fair:

static struct task_struct *pick_next_task_fair(struct rq *rq)
{
	struct task_struct *p;
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;

	if (!cfs_rq->nr_running)
		return NULL;

	do {
		se = pick_next_entity(cfs_rq);
		set_next_entity(cfs_rq, se);/*设置为当前运行进程*/ 
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	p = task_of(se);
	if (hrtick_enabled(rq))
		hrtick_start_fair(rq, p);

	return p;
}
</span>

/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se = __pick_first_entity(cfs_rq);
	struct sched_entity *left = se;

	/*
	 * Avoid running the skip buddy, if running something else can
	 * be done without getting too unfair.
	 */
	if (cfs_rq->skip == se) {
		struct sched_entity *second = __pick_next_entity(se);
		if (second && wakeup_preempt_entity(second, left) < 1)
			se = second;
	}

	/*
	 * Prefer last buddy, try to return the CPU to a preempted task.
	 */
	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
		se = cfs_rq->last;

	/*
	 * Someone really wants this to run. If it's not unfair, run it.
	 */
	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
		se = cfs_rq->next;

	clear_buddies(cfs_rq, se);

	return se;
static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return rb_entry(next, struct sched_entity, run_node);
}



你可能感兴趣的:(进程调度 ---CFS调度)