在CFS中,当产生时钟tick中断时,sched.c中scheduler_tick()函数会被时钟中断(定时器timer的代码)直接调用,我们调用它则是在禁用中断时。注意在fork的代码中,当修改父进程的时间片时,也会导致sched_tick的调用。sched_tick函数首先更新调度信息,然后调整当前进程在红黑树中的位置。调整完成后如果发现当前进程不再是最左边的叶子,就标记need_resched标志,中断返回时就会调用scheduler()完成进程切换,否则当前进程继续占用CPU。注意这与以前的调度器不同,以前是tick中断导致时间片递减,当时间片被用完时才触发优先级调整并重新调度。sched_tick函数的代码如下:
void scheduler_tick(void) { int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; sched_clock_tick(); spin_lock(&rq->lock); update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); perf_event_task_tick(curr, cpu); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif }它先获取目前CPU上的运行队列中的当前运行进程,更新runqueue级变量clock,然后通过sched_class中的接口名task_tick,调用CFS的tick处理函数task_tick_fair(),以处理时钟中断。我们看kernel/sched_fair.c中的CFS算法实现。具体的调度类如下:
static const struct sched_class fair_sched_class = { .next = &idle_sched_class, .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, .check_preempt_curr = check_preempt_wakeup, .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_fair, .load_balance = load_balance_fair, .move_one_task = move_one_task_fair, .rq_online = rq_online_fair, .rq_offline = rq_offline_fair, .task_waking = task_waking_fair, #endif .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_fork = task_fork_fair, .prio_changed = prio_changed_fair, .switched_to = switched_to_fair, .get_rr_interval = get_rr_interval_fair, #ifdef CONFIG_FAIR_GROUP_SCHED .task_move_group = task_move_group_fair, #endif };task_tick_fair函数用于轮询调度类的中一个进程。实现如下:
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; for_each_sched_entity(se) { /* 考虑了组调度 */ cfs_rq = cfs_rq_of(se); entity_tick(cfs_rq, se, queued); } }该函数获取各层的调度实体,对每个调度实体获取CFS运行队列,调用entity_tick进程进行处理。kernel/sched_fair.c中的函数entity_tick源代码如下:
static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) { /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); #ifdef CONFIG_SCHED_HRTICK /* * queued ticks are scheduled to match the slice, so don't bother * validating it and just reschedule. */ if (queued) { resched_task(rq_of(cfs_rq)->curr); return; } /* * don't let the period tick interfere with the hrtick preemption */ if (!sched_feat(DOUBLE_TICK) && hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) return; #endif if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) check_preempt_tick(cfs_rq, curr); }该函数用kernel/sched_fair.c:update_curr()更新当前进程的运行时统计信息,然后调用kernel/sched_fair.c:check_preempt_tick(),检测是否需要重新调度,用下一个进程来抢占当前进程。update_curr()实现记账功能,由系统定时器周期调用,实现如下:
static inline void __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { unsigned long delta_exec_weighted; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); curr->sum_exec_runtime += delta_exec; /* 总运行时间更新 */ schedstat_add(cfs_rq, exec_clock, delta_exec); /* 更新cfs_rq的exec_clock */ /* 用优先级和delta_exec来计算weighted,以用于更新vruntime */ delta_exec_weighted = calc_delta_fair(delta_exec, curr); curr->vruntime += delta_exec_weighted; /* 更新当前进程的vruntime */ update_min_vruntime(cfs_rq); } static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; u64 now = rq_of(cfs_rq)->clock_task; /* now计时器 */ unsigned long delta_exec; if (unlikely(!curr)) return; /* * 获取从最后一次修改负载后当前进程所占用的运行总时间, * 即计算当前进程的执行时间 */ delta_exec = (unsigned long)(now - curr->exec_start); if (!delta_exec) /* 如果本次没有执行过,不用重新更新了 */ return; /* 根据当前可运行进程总数对运行时间进行加权计算 */ __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; /* 将exec_start属性置为now */ if (entity_is_task(curr)) { /* 下面为关于组调度的 */ struct task_struct *curtask = task_of(curr); trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); cpuacct_charge(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } }这里delta_exec获得从最后一次修改负载后当前进程所占用的运行总时间,即计算当前进程的执行时间。然后调用__update_curr()更新进程的vruntime。更新前需要计算weighted,这由sched_fair.c:calc_delta_fair()实现,如下:
static inline unsigned long calc_delta_fair(unsigned long delta, struct sched_entity *se) { if (unlikely(se->load.weight != NICE_0_LOAD)) delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); return delta; }在calc_delta_fair中,如果进程的优先级为0,那么就是返回delta,如果不为0,就要调用kernel/sched.c中的calc_delta_mine对delta值进行修正,如下:
#if BITS_PER_LONG == 32 # define WMULT_CONST (~0UL) #else # define WMULT_CONST (1UL << 32) #endif #define WMULT_SHIFT 32 /* * Shift right and round: */ #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) /* * delta *= weight / lw */ static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, struct load_weight *lw) { u64 tmp; if (!lw->inv_weight) { if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) lw->inv_weight = 1; else lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) / (lw->weight+1); } tmp = (u64)delta_exec * weight; /* * Check whether we'd overflow the 64-bit multiplication: */ if (unlikely(tmp > WMULT_CONST)) tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, WMULT_SHIFT/2); else tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); }CFS允许每个进程运行一段时间、循环轮转、选择运行最少的进程作为下一个运行进程,而不再采用分配给每个进程时间片的做法了,CFS在所有可运行进程总数基础上计算出一个进程应该运行多久,而不是依靠nice值来计算时间片。nice值在CFS中被作为进程获得的处理器运行比的权重,越高的nice值(越低的优先级)进程获得更低的处理器使用权重,这是相对默认nice值进程的进程而言的;相反,更低的nice值(越高的优先级)的进程获得更高的处理器使用权重。
static void check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { unsigned long ideal_runtime, delta_exec; ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { resched_task(rq_of(cfs_rq)->curr); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. */ clear_buddies(cfs_rq, curr); return; } /* * Ensure that a task that missed wakeup preemption by a * narrow margin doesn't have to wait for a full slice. * This also mitigates buddy induced latencies under load. */ if (!sched_feat(WAKEUP_PREEMPT)) return; if (delta_exec < sysctl_sched_min_granularity) return; if (cfs_rq->nr_running > 1) { /* 用于组调度 */ struct sched_entity *se = __pick_next_entity(cfs_rq); s64 delta = curr->vruntime - se->vruntime; if (delta > ideal_runtime) resched_task(rq_of(cfs_rq)->curr); } }该函数先获取当前进程的理想运行时间,如果当前执行时间超过理想时间,调用kernel/sched.c:resched_task()设置need_resched标志,完成设置的函数为resched_task()--->set_tsk_need_resched(p),表示需要重新调度进程。
(1)进程插入enqueue_task_fair:更新调度信息,调用enqueue_entity()--->__enqueue_entity()将调度实体插入到红黑树中。它会在nr_running递增之前被调用。插入时,会找到右边的空间并进行插入,然后缓存最左边的节点。对于组调度,会对组中的所有进程进行操作。如下:
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; struct rb_node *parent = NULL; struct sched_entity *entry; s64 key = entity_key(cfs_rq, se); /* key为被插入进程的vruntime */ int leftmost = 1; /* * Find the right place in the rbtree: */ while (*link) { parent = *link; entry = rb_entry(parent, struct sched_entity, run_node); /* * We dont care about collisions. Nodes with * the same key stay together. */ if (key < entity_key(cfs_rq, entry)) { link = &parent->rb_left; } else { link = &parent->rb_right; leftmost = 0; } } /* * Maintain a cache of leftmost tree entries (it is frequently * used): */ if (leftmost) cfs_rq->rb_leftmost = &se->run_node; rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); }可见CFS的运行队列布局是放在红黑树里面的,而这颗红黑树的排序方式是按照运行实体的vruntime来的。vruntime的计算方式在上面已经做了分析。在前面“Linux进程管理”的几节介绍中,我们可以看到fork()在创建子进程时最后就会调用enqueue_task_fair(),将新创建的进程插入到红黑树中。
static struct task_struct *pick_next_task_fair(struct rq *rq) { struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; if (unlikely(!cfs_rq->nr_running)) return NULL; do { /* 此循环为了考虑组调度 */ se = pick_next_entity(cfs_rq); set_next_entity(cfs_rq, se); /* 设置为当前运行进程 */ cfs_rq = group_cfs_rq(se); } while (cfs_rq); p = task_of(se); hrtick_start_fair(rq, p); return p; }
该函数调用pick_next_entity()--->__pick_next_entity()完成获取下一个进程的工作,这个函数如下:
static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) { struct rb_node *left = cfs_rq->rb_leftmost; if (!left) return NULL; return rb_entry(left, struct sched_entity, run_node); }该函数并不会遍历红黑树来找到最左叶子节点(是所有进程中vruntime最小的那个),因为该值已经缓存在rb_leftmost字段中。它通过rb_entry函数返回这个缓存的节点进程。完成实质工作的调用为include/linux/rbtree.h:rb_entry()--->include/linux/kernel.h:container_of(),这是一个宏定义。
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->rb_leftmost == &se->run_node) { struct rb_node *next_node; next_node = rb_next(&se->run_node); cfs_rq->rb_leftmost = next_node; } rb_erase(&se->run_node, &cfs_rq->tasks_timeline); }该函数会删除当前进程,并从红黑树中选出下一个具体最小vruntime值的节点,作为新的最左边节点缓存起来。