完全公平算法CFS调度分析方法:
CFS依据进程在就绪队列中等待时间的长短来进行调度选择:在就绪队列中等待时间越长的进程,得到调度的机会就越大;反之,机会就越小。CFS在所有可运行进程总数的基础上计算出一个进程应该运行多久,而不是依靠nice值来计算时间片。nice值在CFS中被用作进程获得处理器运行比例的权重:相对于具有默认nice值的进程而言,nice值越高(优先级越低)的进程获得的处理器使用权重越低;相反,nice值越低(优先级越高)的进程获得的处理器使用权重越高。
在初始化时:
/*
 * Boot-time setup for the fair scheduling class.
 *
 * On CONFIG_SMP builds this registers run_rebalance_domains() as the handler
 * for SCHED_SOFTIRQ.  When CONFIG_NO_HZ is set as well, it additionally
 * stores the current jiffies value in nohz.next_balance, allocates
 * nohz.idle_cpus_mask through zalloc_cpumask_var() and registers
 * sched_ilb_notifier as a CPU notifier.  Without CONFIG_SMP the body is empty.
 */
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	/* Original annotation: the timer is the precondition for scheduling. */
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ
	nohz.next_balance = jiffies;
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
	cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */
}
--------------------->init_cfs_rq(&rq->cfs);
----------------------->init_rt_rq(&rq->rt, rq);
/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;	/* load carried by this runqueue */
	unsigned int nr_running, h_nr_running;

	u64 exec_clock;
	u64 min_vruntime;		/* smallest virtual runtime recorded for this queue */
#ifndef CONFIG_64BIT
	u64 min_vruntime_copy;
#endif

	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr, *next, *last, *skip;

#ifdef CONFIG_SCHED_DEBUG
	unsigned int nr_spread_over;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	int on_list;
	struct list_head leaf_cfs_rq_list;
	struct task_group *tg;	/* group that "owns" this runqueue */

#ifdef CONFIG_SMP
	/*
	 * h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	 */
	unsigned long h_load;

	/*
	 * Maintaining per-cpu shares distribution for group scheduling
	 *
	 * load_stamp is the last time we updated the load average
	 * load_last is the last time we updated the load average and saw load
	 * load_unacc_exec_time is currently unaccounted execution time
	 */
	u64 load_avg;
	u64 load_period;
	u64 load_stamp, load_last, load_unacc_exec_time;

	unsigned long load_contribution;
#endif /* CONFIG_SMP */
#ifdef CONFIG_CFS_BANDWIDTH
	int runtime_enabled;
	u64 runtime_expires;
	s64 runtime_remaining;

	u64 throttled_timestamp;
	int throttled, throttle_count;
	struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};
struct sched_entity { struct load_weight load; /* for load-balancing */ struct rb_node run_node; struct list_head group_node; unsigned int on_rq;//该se是否在rq上 u64 exec_start;//当前cfs_rq的时间,用于计算时间差 u64 sum_exec_runtime; //进程总共运行的时间,real-run time u64 vruntime;/*存放进程的虚拟运行时间,用于调度器的选择*/ u64 prev_sum_exec_runtime;//进程在醒来的时间 u64 nr_migrations; #ifdef CONFIG_SCHEDSTATS struct sched_statistics statistics; #endif #ifdef CONFIG_FAIR_GROUP_SCHED struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; #endif };
对于CFS调度:可知,调度实体作为一个名为se的成员变量嵌入在进程描述符task_struct内;
CFS调度具体class为fair_sched_class
/* * All the scheduling class methods: */ const struct sched_class fair_sched_class = { .next = &idle_sched_class, 下一个为idle进程; .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, .yield_to_task = yield_to_task_fair, .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, .task_waking = task_waking_fair, #endif .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, .switched_from = switched_from_fair, .switched_to = switched_to_fair, .get_rr_interval = get_rr_interval_fair, #ifdef CONFIG_FAIR_GROUP_SCHED .task_move_group = task_move_group_fair, #endif };由上一章可知:
在进程切换时:deactivate_task---->dequeue_task------>
p->sched_class->dequeue_task(rq, p, flags); 如果是CFS调度,就会调用 fair_sched_class 中的 dequeue_task_fair;
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; 根据task_struct中的调度类 调用调度函数,最后调度task_struct的sched_entity; int task_sleep = flags & DEQUEUE_SLEEP; for_each_sched_entity(se) {//考虑组调度, cfs_rq = cfs_rq_of(se); //获取se对应的运行队列 dequeue_entity(cfs_rq, se, flags); /* * end evaluation on encountering a throttled cfs_rq * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running decrement below. */ if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running--; /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { /* * Bias pick_next to pick a task from this cfs_rq, as * p is sleeping when it is within its sched_slice. */ if (task_sleep && parent_entity(se)) set_next_buddy(parent_entity(se)); /* avoid re-evaluating load for this entity */ se = parent_entity(se); break; } flags |= DEQUEUE_SLEEP; } for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running--; if (cfs_rq_throttled(cfs_rq)) break; update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } if (!se) dec_nr_running(rq); hrtick_update(rq); }删除进程:------------>dequeue_entity
static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); update_stats_dequeue(cfs_rq, se); if (flags & DEQUEUE_SLEEP) { #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) se->statistics.sleep_start = rq_of(cfs_rq)->clock; if (tsk->state & TASK_UNINTERRUPTIBLE) se->statistics.block_start = rq_of(cfs_rq)->clock; } #endif } clear_buddies(cfs_rq, se); if (se != cfs_rq->curr) __dequeue_entity(cfs_rq, se); se->on_rq = 0; update_cfs_load(cfs_rq, 0); account_entity_dequeue(cfs_rq, se); /* * Normalize the entity after updating the min_vruntime because the * update can refer to the ->curr item and we need to reflect this * movement in our normalized position. */ if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq); }
/
/*
 * Runtime accounting for the entity currently running on @cfs_rq.
 *
 * Measures how long the current entity has run since its exec_start stamp,
 * passes that delta to __update_curr(), and restarts the measurement window.
 * Returns without doing anything when there is no current entity or when no
 * time has elapsed.
 *
 * Original note: invoked periodically from the system timer.  Within this
 * file it is also called from enqueue_entity() and dequeue_entity().
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock;	/* current runqueue clock */
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);
	if (!delta_exec)	/* nothing ran since the last update */
		return;

	/* Charge the elapsed time to the entity. */
	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;	/* start a new measurement window */

	/* Tracepoint, cpuacct and group runtime charging apply to tasks only. */
	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}
}
static inline void __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { unsigned long delta_exec_weighted; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); /*总运行时间更新,真实运行时间 */ curr->sum_exec_runtime += delta_exec; /*更新cfs_rq的exec_clock*/ schedstat_add(cfs_rq, exec_clock, delta_exec); /*用优先级和delta_exec来计算weighted以用于更细vruntime*/ /* calc_delta_fair用来将真实时间转化为虚拟时间。 进程的优先级不同,它在系统中的地位(也就是权重)也不同 进程的优先级越高,它的虚拟时间走的越慢。*/ delta_exec_weighted = calc_delta_fair(delta_exec, curr); /*vruntime可以准确地测量给定进程的运行时间 而且可知道谁应该是下一个被运行的进程*/ /*更新进程的虚拟运行时间vruntime*/ curr->vruntime += delta_exec_weighted; update_min_vruntime(cfs_rq); }每个进程在其产生(fork)的时候,都会根据其父亲的优先级产生它的优先级和权重(sched_fork函数)在copy_process函数中调用。
static inline unsigned long calc_delta_fair(unsigned long delta, struct sched_entity *se) { /*NICE_0_LOAD: 优先级0 的weight*/ /如果该进程拥有nice为0的权重,这是他的虚拟时钟和真实时钟是一样速度的 /* 如果不是优先级0,就要调用calc_delta_mine计算delta的weight值*/ if (unlikely(se->load.weight != NICE_0_LOAD)) delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); return delta; }
/*
 * Author's notes (translated):
 *
 * calc_delta_mine(delta_exec, weight, lw) is not analysed in detail here;
 * its effect is roughly
 *
 *	delta *= weight / lw
 *
 * Looking at calc_delta_fair(): for a nice-0 entity delta is returned as
 * is, otherwise calc_delta_mine() corrects it, which gives
 *
 *	delta = delta * NICE_0_LOAD / se->load
 *
 * Where does se->load come from?  According to the author, tracing
 * sys_nice() shows that it is the load value matching the nice level: the
 * lower the nice value, the larger the load.  So for the same execution
 * time (same delta) a high-priority entity gets a smaller result than a
 * low-priority one, ends up further left in the rb-tree and is picked
 * sooner at the next scheduling decision.  In other words, the larger the
 * weight the slower the virtual clock runs, and the relation is linear.
 *
 * Implementation: lw->inv_weight is filled in on first use, then
 * delta_exec * weight is multiplied by it and shifted back through SRR().
 * The result is capped at LONG_MAX.
 */
static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 tmp;

	/* Compute and cache the inverse weight if it has not been set yet. */
	if (!lw->inv_weight) {
		if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
			lw->inv_weight = 1;
		else
			lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) / (lw->weight+1);
	}

	tmp = (u64)delta_exec * weight;
	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
min_vruntime是cfs的rq中的一个成员,是cfs时间的基准,在cfs中起着至关重要的作用。 自cfs产生以来,这部分的代码改动也是很频繁的。
/*
 * Recompute @cfs_rq->min_vruntime from the running entity and the leftmost
 * entity in the rb-tree.
 *
 * The candidate starts from the running entity's vruntime if there is one.
 * If the tree is non-empty, the leftmost entity either replaces the
 * candidate (no running entity) or is combined with it via min_vruntime().
 * The stored value is then set to max_vruntime(old value, candidate).
 */
static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	/*
	 * enqueue_entity() does not insert the running entity into the
	 * rb-tree, so it has to be considered explicitly here.  Original
	 * note: ignoring it would be unfair, which earlier versions of the
	 * scheduler overlooked.
	 */
	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			/*
			 * No running entity (original note: perhaps while a
			 * task of another policy is running?), so the
			 * leftmost entity alone supplies the candidate.
			 */
			vruntime = se->vruntime;
		else
			/* Otherwise weigh the leftmost entity against curr. */
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	/*
	 * Original note: this value only ever increases.
	 * NOTE(review): that relies on max_vruntime() returning the later of
	 * its two arguments; its definition is not visible here.
	 */
	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
}
/*
 * Undo the load and count accounting of @se on @cfs_rq when it is dequeued.
 */
static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/* Take the entity's weight out of the queue's load. */
	update_load_sub(&cfs_rq->load, se->load.weight);

	/* Entities without a parent are also charged to the CPU load. */
	if (!parent_entity(se))
		dec_cpu_load(rq_of(cfs_rq), se->load.weight);

	/* Task entities additionally leave the task weight and group list. */
	if (entity_is_task(se)) {
		add_cfs_task_weight(cfs_rq, -se->load.weight);
		list_del_init(&se->group_node);
	}

	/* One fewer runnable entity on this queue. */
	cfs_rq->nr_running -= 1;

	/* The entity is no longer on a runqueue. */
	se->on_rq = 0;
}
/*
 * Erase @se from the rb-tree of @cfs_rq.
 *
 * If @se is the cached leftmost node, the cache is first moved on to its
 * in-order successor (NULL if there is none).
 */
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	/* Remove the node from the red-black tree. */
	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
<span style="font-size: 24px;">* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and * then put the task into the rbtree: */ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; /*对于主调度,会对一个组中的所有进程进行操作*/ for_each_sched_entity(se) { if (se->on_rq) break; cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, flags); /* * end evaluation on encountering a throttled cfs_rq * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running increment below. */ if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; flags = ENQUEUE_WAKEUP; } for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; if (cfs_rq_throttled(cfs_rq)) break; update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq); } if (!se) inc_nr_running(rq); hrtick_update(rq); } </span>
static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 更新相关调度信息后最终会调用下面函数插入运行进程的红黑树 { /* * Update the normalized vruntime before updating min_vruntime * through callig update_curr(). */ if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) se->vruntime += cfs_rq->min_vruntime; /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); update_cfs_load(cfs_rq, 0); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0);//进程唤醒时的操作(try_to_wake_up会提到 enqueue_sleeper(cfs_rq, se); } update_stats_enqueue(cfs_rq, se); check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; if (cfs_rq->nr_running == 1) { list_add_leaf_cfs_rq(cfs_rq); check_enqueue_throttle(cfs_rq); } }
/* * Enqueue an entity into the rb-tree: */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; int leftmost = 1; /* * Find the right place in the rbtree: */ while (*link) { parent = *link; entry = rb_entry(parent, struct sched_entity, run_node); /* * We dont care about collisions. Nodes with * the same key stay together. */ if (entity_before(se, entry)) {/*key为被插入进程的vruntime*/ link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = 0; } } /* * Maintain a cache of leftmost tree entries (it is frequently * used): */ if (leftmost) cfs_rq->rb_leftmost = &se->run_node; rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); }CFS的运行队列布局是放在红黑树里面的,而这颗红黑树的排序方式是按照运行实体的vruntime来的
实现选择的函数为pick_next_task_fair:
/*
 * Choose the next task to run from the fair class on @rq.
 *
 * Returns NULL when the top-level cfs_rq has no runnable entities.
 * Otherwise it descends from rq->cfs: at each level it picks an entity,
 * installs it via set_next_entity(), and continues into the cfs_rq returned
 * by group_cfs_rq() until that is NULL.  The task owning the final entity
 * is returned, after arming the hrtick for it if hrtick is enabled.
 */
static struct task_struct *pick_next_task_fair(struct rq *rq)
{
	struct task_struct *p;
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;

	if (!cfs_rq->nr_running)
		return NULL;

	do {
		se = pick_next_entity(cfs_rq);
		/* Make the picked entity the running one on this queue. */
		set_next_entity(cfs_rq, se);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	p = task_of(se);
	if (hrtick_enabled(rq))
		hrtick_start_fair(rq, p);

	return p;
}
/*
 * Pick the next process, keeping these things in mind, in this order:
 * 1) keep things fair between processes/task groups
 * 2) pick the "next" process, since someone really wants that to run
 * 3) pick the "last" process, for cache locality
 * 4) do not run the "skip" process, if something else is available
 *
 * The starting candidate is the first entity returned by
 * __pick_first_entity().  Each buddy override is taken only when
 * wakeup_preempt_entity(buddy, left) is below 1.  Later checks overwrite
 * earlier ones, so "next" wins over "last", which wins over the skip
 * replacement.
 */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct sched_entity *se = __pick_first_entity(cfs_rq);
	struct sched_entity *left = se;

	/*
	 * Avoid running the skip buddy, if running something else can
	 * be done without getting too unfair.
	 */
	if (cfs_rq->skip == se) {
		struct sched_entity *second = __pick_next_entity(se);

		if (second && wakeup_preempt_entity(second, left) < 1)
			se = second;
	}

	/*
	 * Prefer last buddy, try to return the CPU to a preempted task.
	 */
	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
		se = cfs_rq->last;

	/*
	 * Someone really wants this to run. If it's not unfair, run it.
	 */
	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
		se = cfs_rq->next;

	clear_buddies(cfs_rq, se);

	return se;
}
/*
 * Return the entity whose rb-tree node is the in-order successor of @se's
 * node, or NULL when @se has no successor.
 */
static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *succ = rb_next(&se->run_node);

	return succ ? rb_entry(succ, struct sched_entity, run_node) : NULL;
}