In the earlier articles on how newly created tasks and tasks woken out of idle via wake_up_process() get scheduled, two points were not analysed clearly, namely how the scheduler chooses a cpu on which to run the scheduling entity during that process. Here we pull that question out and analyse it in detail.
A simplified flow chart is as follows:
For now we only look at the second part: how the scheduler places the task when EAS is not enabled. The source code to be analysed is the following:
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags, int sibling_count_hint)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
int new_cpu = prev_cpu;
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
#ifdef CONFIG_64BIT_ONLY_CPU
struct cpumask tmpmask;
if (find_packing_cpu(p, &new_cpu))
return new_cpu;
cpumask_andnot(&tmpmask, cpu_present_mask, &b64_only_cpu_mask);
if (cpumask_test_cpu(cpu, &tmpmask)) {
if (weighted_cpuload_32bit(cpu) >
sysctl_sched_32bit_load_threshold &&
!test_tsk_thread_flag(p, TIF_32BIT))
return min_load_64bit_only_cpu();
}
#endif
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
want_affine = !wake_wide(p, sibling_count_hint) &&
!wake_cap(p, cpu, prev_cpu) &&
cpumask_test_cpu(cpu, &p->cpus_allowed);
}
if (energy_aware())
return select_energy_cpu_brute(p, prev_cpu, sync);
/* The code below is what this section analyses. */
rcu_read_lock();
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
break;
/*
* If both cpu and prev_cpu are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
*/
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
affine_sd = tmp;
break;
}
if (tmp->flags & sd_flag)
sd = tmp;
else if (!want_affine)
break;
}
if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu;
}
if (sd && !(sd_flag & SD_BALANCE_FORK)) {
/*
* We're going to need the task's util for capacity_spare_wake
* in find_idlest_group. Sync it up to prev_cpu's
* last_update_time.
*/
sync_entity_load_avg(&p->se);
}
if (!sd) {
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
} else {
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
rcu_read_unlock();
return new_cpu;
}
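Before walking through the details, it helps to see the decision structure of the non-EAS tail in one place. The following is a minimal user-space sketch, not kernel code: the helpers are stand-ins for the kernel functions of the same name, and have_sd is a deliberate simplification of the domain loop. It shows the three possible outcomes: keep prev_cpu, pull to the waker's cpu and refine with select_idle_sibling(), or run the find_idlest_cpu() slow path for fork/exec.

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the kernel helpers of the same name; return values are made up. */
static bool wake_affine_says_pull(void) { return true; }
static int select_idle_sibling_stub(int prev, int target) { (void)prev; return target; }
static int find_idlest_cpu_stub(int cpu) { return cpu; }

/* Simplified restatement of the non-EAS tail of select_task_rq_fair(). */
static int choose_cpu(int cpu, int prev_cpu, bool want_affine, bool balance_wake)
{
    int new_cpu = prev_cpu;
    /* With the sd_init() flags listed later, only fork/exec keep sd set. */
    bool have_sd = !balance_wake;
    bool have_affine_sd = want_affine && balance_wake;

    if (have_affine_sd && cpu != prev_cpu && wake_affine_says_pull())
        new_cpu = cpu;                          /* pull towards the waker */

    if (!have_sd) {
        if (balance_wake)
            new_cpu = select_idle_sibling_stub(prev_cpu, new_cpu);
    } else {
        new_cpu = find_idlest_cpu_stub(cpu);    /* fork/exec slow path */
    }
    return new_cpu;
}

int main(void)
{
    printf("wakeup on cpu1, prev_cpu 3 -> cpu%d\n", choose_cpu(1, 3, true, true));
    printf("fork   on cpu1             -> cpu%d\n", choose_cpu(1, 1, false, false));
    return 0;
}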
Based on this code, the remaining parts are analysed in the three sections below.
We already know the following fact:
want_affine is set to 1 only when all three conditions in the SD_BALANCE_WAKE branch above hold, i.e. the wakeup is not a "wide" waker/wakee pattern, the waking cpu's capacity is sufficient for p, and p is allowed to run on the waking cpu.
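Restated as a compilable fragment (the one-line comments are my reading of what each kernel helper checks; the stub functions are placeholders, not the real implementations):

#include <stdbool.h>
#include <stdio.h>

/* Placeholder stubs for the three checks used above. */
static bool wake_wide_stub(void)  { return false; } /* wake_wide(): is this an N:1 "wide" waker/wakee pattern? */
static bool wake_cap_stub(void)   { return false; } /* wake_cap(): is the waking cpu's capacity too small for p? */
static bool allowed_on_cpu(void)  { return true;  } /* cpumask_test_cpu(cpu, &p->cpus_allowed) */

/* want_affine is 1 only when all three checks pass. */
static int compute_want_affine(void)
{
    return !wake_wide_stub() && !wake_cap_stub() && allowed_on_cpu();
}

int main(void)
{
    printf("want_affine = %d\n", compute_want_affine());
    return 0;
}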
Now let's analyse this part of the code:
for_each_domain(cpu, tmp) {
/* This if never triggers: SD_LOAD_BALANCE is always set when the sched domain is
initialised in sd_init(), along with the balance flags for the various task states. */
if (!(tmp->flags & SD_LOAD_BALANCE))
break;
/*
* If both cpu and prev_cpu are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
*/
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
affine_sd = tmp;
break;
}
if (tmp->flags & sd_flag)
sd = tmp;
else if (!want_affine)
break;
}
When sd_init() runs, the flags are set as follows:
.flags = 1*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
| 1*SD_BALANCE_EXEC
| 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
| 1*SD_WAKE_AFFINE
| 0*SD_SHARE_CPUCAPACITY
| 0*SD_SHARE_PKG_RESOURCES
| 0*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
| 0*SD_NUMA
#ifdef CONFIG_INTEL_DWS
| 0*SD_INTEL_DWS
#endif
| sd_flags
,
The for_each_domain(cpu, tmp) traversal is explained as follows:
/* Walk the domains from the lowest level (MC) up to the highest level (DIE). */
for_each_domain(cpu, tmp) {
/* This if never triggers: SD_LOAD_BALANCE is always set when the sched domain is
initialised in sd_init(), along with the balance flags for the various task states. */
if (!(tmp->flags & SD_LOAD_BALANCE))
break;
/*
* If both cpu and prev_cpu are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
*/
/* sched_domain_span(tmp) is the set of cpus covered by the domain being visited.
At the lowest SDTL level (MC on ARM) a domain spans one cluster; at the top level
(DIE) it spans all cpus. SD_WAKE_AFFINE is set in sd_init(), so when the whole
condition holds, the current domain is recorded in the affine_sd variable. */
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
affine_sd = tmp;
break;
}
/* The domain's balance flags match the sd_flag passed by the caller of select_task_rq(). */
if (tmp->flags & sd_flag)
sd = tmp;
else if (!want_affine)
break;
}
We know that when a task is woken up (the wake_up_process() path discussed earlier) sd_flag is SD_BALANCE_WAKE, but sd_init() does not set SD_BALANCE_WAKE in the domain flags, so the case sd == NULL (and possibly affine_sd == NULL as well) can arise.
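A quick way to see this is to evaluate tmp->flags & sd_flag against the sd_init() mask listed earlier. Below is a minimal user-space check; the bit values are arbitrary placeholders (only their distinctness matters), not the kernel's real definitions:

#include <stdio.h>

/* Placeholder bit values standing in for the kernel's SD_* flags. */
#define SD_LOAD_BALANCE    0x01
#define SD_BALANCE_NEWIDLE 0x02
#define SD_BALANCE_EXEC    0x04
#define SD_BALANCE_FORK    0x08
#define SD_BALANCE_WAKE    0x10
#define SD_WAKE_AFFINE     0x20

int main(void)
{
    /* Flags as composed in sd_init(): SD_BALANCE_WAKE is *not* included. */
    unsigned int flags = SD_LOAD_BALANCE | SD_BALANCE_NEWIDLE |
                         SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;

    printf("wakeup: flags & SD_BALANCE_WAKE -> %s\n",
           (flags & SD_BALANCE_WAKE) ? "sd is set" : "sd stays NULL");
    printf("fork:   flags & SD_BALANCE_FORK -> %s\n",
           (flags & SD_BALANCE_FORK) ? "sd is set" : "sd stays NULL");
    return 0;
}

With that in mind, we continue with the affine_sd handling: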
/* If affine_sd is non-NULL, update new_cpu according to the conditions below. */
if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
new_cpu = cpu;
}
Before analysing the wake_affine() source, we need to know how three kinds of load are computed: source_load(), target_load() and effective_load(). We look at each in turn.
The source is as follows:
/*
* Return a low guess at the load of a migration-source cpu weighted
* according to the scheduling class and "nice" value.
*
* We want to under-estimate the load of migration sources, to
* balance conservatively.
*/
static unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
/* Get the load of this cpu's cfs_rq. */
unsigned long total = weighted_cpuload(cpu);
/* type = sd->wake_idx = 0 by default and sched_feat(LB_BIAS) is true, so this condition holds (type == 0) and total is returned directly. */
if (type == 0 || !sched_feat(LB_BIAS))
return total;
return min(rq->cpu_load[type-1], total);
}
/* Used instead of source_load when we know the type == 0 */
static unsigned long weighted_cpuload(const int cpu)
{ /* Get the load of this cpu's cfs_rq; a cpu's overall load is the sum of its cfs_rq and rt_rq loads. */
return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
}
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
{ /* Return the cfs_rq's runnable load average, refreshed every time an entity's load is updated. */
return cfs_rq->runnable_load_avg;
}
/*
* Return a high guess at the load of a migration-target cpu weighted
* according to the scheduling class and "nice" value.
*/
static unsigned long target_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
unsigned long total = weighted_cpuload(cpu);
if (type == 0 || !sched_feat(LB_BIAS))
return total;
return max(rq->cpu_load[type-1], total);
}
This function mirrors source_load(): when type is non-zero, source_load() picks the smaller of the two values while target_load() picks the larger.
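The asymmetry is intentional: the migration source is under-estimated (min) and the migration target over-estimated (max), which biases the balancer towards doing nothing in borderline cases. A toy user-space illustration with invented numbers:

#include <stdio.h>

/* Toy versions of the min/max bias; hist stands for rq->cpu_load[type-1], now for weighted_cpuload(). */
static unsigned long toy_source_load(unsigned long hist, unsigned long now)
{
    return now < hist ? now : hist;    /* low guess */
}

static unsigned long toy_target_load(unsigned long hist, unsigned long now)
{
    return now > hist ? now : hist;    /* high guess */
}

int main(void)
{
    unsigned long hist = 800, now = 600;    /* invented load samples */

    printf("source_load = %lu, target_load = %lu\n",
           toy_source_load(hist, now), toy_target_load(hist, now));
    /* 600 vs 800: moving a task away looks less attractive than it really is. */
    return 0;
}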
Next comes effective_load(); its source is as follows:
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
* effective_load() calculates the load change as seen from the root_task_group
*
* Adding load to a group doesn't make a group heavier, but can cause movement
* of group shares between cpus. Assuming the shares were perfectly aligned one
* can calculate the shift in shares.
*
* Calculate the effective load difference if @wl is added (subtracted) to @tg
* on this @cpu and results in a total addition (subtraction) of @wg to the
* total group weight.
*
* Given a runqueue weight distribution (rw_i) we can compute a shares
* distribution (s_i) using:
*
* s_i = rw_i / \Sum rw_j (1)
*
* Suppose we have 4 CPUs and our @tg is a direct child of the root group and
* has 7 equal weight tasks, distributed as below (rw_i), with the resulting
* shares distribution (s_i):
*
* rw_i = { 2, 4, 1, 0 }
* s_i = { 2/7, 4/7, 1/7, 0 }
*
* As per wake_affine() we're interested in the load of two CPUs (the CPU the
* task used to run on and the CPU the waker is running on), we need to
* compute the effect of waking a task on either CPU and, in case of a sync
* wakeup, compute the effect of the current task going to sleep.
*
* So for a change of @wl to the local @cpu with an overall group weight change
* of @wl we can compute the new shares distribution (s'_i) using:
*
* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
*
* Suppose we're interested in CPUs 0 and 1, and want to compute the load
* differences in waking a task to CPU 0. The additional task changes the
* weight and shares distributions like:
*
* rw'_i = { 3, 4, 1, 0 }
* s'_i = { 3/8, 4/8, 1/8, 0 }
*
* We can then compute the difference in effective weight by using:
*
* dw_i = S * (s'_i - s_i) (3)
*
* Where 'S' is the group weight as seen by its parent.
*
* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
* 4/7) times the weight of the group.
*/
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
struct sched_entity *se = tg->se[cpu];
if (!tg->parent) /* the trivial, non-cgroup case */
return wl;
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = se->my_q;
long W, w = cfs_rq_load_avg(cfs_rq);
tg = cfs_rq->tg;
/*
* W = @wg + \Sum rw_j
*/
W = wg + atomic_long_read(&tg->load_avg);
/* Ensure \Sum rw_j >= rw_i */
W -= cfs_rq->tg_load_avg_contrib;
W += w;
/*
* w = rw_i + @wl
*/
w += wl;
/*
* wl = S * s'_i; see (2)
*/
/* tg->shares may be changed by sched_group_set_shares(), but
root_task_group.shares = ROOT_TASK_GROUP_LOAD is set in sched_init(), when the
scheduler initialises, to the nice-0 weight of 1024. */
if (W > 0 && w < W)
wl = (w * (long)tg->shares) / W;
else
wl = tg->shares;
/*
* Per the above, wl is the new se->load.weight value; since
* those are clipped to [MIN_SHARES, ...) do so now. See
* calc_cfs_shares().
*/
if (wl < MIN_SHARES)
wl = MIN_SHARES;
/*
* wl = dw_i = S * (s'_i - s_i); see (3)
*/
wl -= se->avg.load_avg;
/*
* Recursively apply this logic to all parent groups to compute
* the final effective load change on the root group. Since
* only the @tg group gets extra weight, all parent groups can
* only redistribute existing shares. @wl is the shift in shares
* resulting from this level per the above.
*/
wg = 0;
}
return wl;
}
#else
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
return wl;
}
#endif
The comment block above the function already explains clearly how effective_load() is computed. Concretely:
for_each_sched_entity() walks upward from the lowest-level se all the way to the root of its hierarchy, root_task_group, recomputing the share distribution at each level.
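To make formula (3) above concrete, here is the numeric example from the comment worked out in user-space code: rw_i = {2, 4, 1, 0} equal-weight tasks, one extra task's weight added on CPU 0, and an assumed group weight S (tg->shares) of 1024:

#include <stdio.h>

/* Worked example of dw_i = S * (s'_i - s_i); the inputs come from the comment above. */
int main(void)
{
    double rw[4] = { 2, 4, 1, 0 };          /* per-cpu runqueue weight of the group, in task units */
    double sum = 2 + 4 + 1 + 0;             /* \Sum rw_j = 7 */
    double S = 1024;                        /* group weight as seen by its parent (tg->shares) */
    double wl = 1;                          /* one extra task's weight added on cpu 0 */

    double s0  = rw[0] / sum;               /* 2/7 */
    double s0n = (rw[0] + wl) / (sum + wl); /* 3/8 */
    double s1  = rw[1] / sum;               /* 4/7 */
    double s1n = rw[1] / (sum + wl);        /* 4/8 */

    printf("dw_0 = %.2f (= 5/56 * S)\n", S * (s0n - s0));   /* ~ +91.4 */
    printf("dw_1 = %.2f (= -4/56 * S)\n", S * (s1n - s1));  /* ~ -73.1 */
    return 0;
}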
Now to the wake_affine() source analysis:
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
s64 this_load, load;
s64 this_eff_load, prev_eff_load;
int idx, this_cpu;
struct task_group *tg;
unsigned long weight;
int balanced;
/* wake_idx is one entry of the table allocated by sd_alloc_ctl_domain_table().
It defaults to 0 (set in sd_init()) and can be changed at run time through the
sched-domain sysctl interface. */
idx = sd->wake_idx;
this_cpu = smp_processor_id();/* the cpu currently executing this code */
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
* of the current CPU:
*/ /* sync = wake_flags & WF_SYNC = 0 & 0x01 = 0; so far every observed caller
passes wake_flags = 0, so this branch is normally not taken. */
if (sync) {
tg = task_group(current);
weight = current->se.avg.load_avg;
this_load += effective_load(tg, this_cpu, -weight, -weight);
load += effective_load(tg, prev_cpu, 0, -weight);
}
tg = task_group(p);
weight = p->se.avg.load_avg;
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
* due to the sync cause above having dropped this_load to 0, we'll
* always have an imbalance, but there's really nothing you can do
* about that, so that's good too.
*
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/ /* What follows weighs the load impact of placing task p on this_cpu versus
keeping it on prev_cpu. */
this_eff_load = 100;
this_eff_load *= capacity_of(prev_cpu);
prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= capacity_of(this_cpu);
if (this_load > 0) {
this_eff_load *= this_load +
effective_load(tg, this_cpu, weight, weight);
prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
}
/* Decide whether to migrate. Note why effective_load()'s third argument is 0 when
computing prev_eff_load: prev_cpu is where p last ran, so no extra weight is added
there; weight is only added for the cpu the task would be moved to. */
balanced = this_eff_load <= prev_eff_load;
schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
if (!balanced)
return 0;
schedstat_inc(sd, ttwu_move_affine);
schedstat_inc(p, se.statistics.nr_wakeups_affine);
return 1;
}
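Ignoring the effective_load() corrections, the decision reduces to a capacity-weighted comparison: pull the task to this_cpu only if 100 * capacity_of(prev_cpu) * this_load <= (100 + (imbalance_pct - 100)/2) * capacity_of(this_cpu) * prev_load. A toy calculation with invented values:

#include <stdio.h>

/* Toy version of the wake_affine() comparison; every number is invented. */
int main(void)
{
    unsigned long this_load = 300, prev_load = 500;  /* target_load(this_cpu), source_load(prev_cpu) */
    unsigned long this_cap = 1024, prev_cap = 1024;  /* capacity_of() */
    unsigned long imbalance_pct = 125;               /* example sd->imbalance_pct */

    unsigned long this_eff = 100 * prev_cap * this_load;
    unsigned long prev_eff = (100 + (imbalance_pct - 100) / 2) * this_cap * prev_load;
    int balanced = this_eff <= prev_eff;

    printf("balanced = %d -> %s\n", balanced,
           balanced ? "pull p to the waker's cpu" : "keep prev_cpu");
    return 0;
}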
Only when the current cpu differs from prev_cpu and wake_affine() returns 1 is new_cpu set to the current cpu, i.e. task p may then be migrated to new_cpu. Continuing with the next block:
/* Runs only when sd is non-NULL and the balance request is not SD_BALANCE_FORK. */
if (sd && !(sd_flag & SD_BALANCE_FORK)) {
/*
* We're going to need the task's util for capacity_spare_wake
* in find_idlest_group. Sync it up to prev_cpu's
* last_update_time.
* (p's per-entity load is synced here using the PELT machinery.)
*/
sync_entity_load_avg(&p->se);
}
We know that whenever affine_sd != NULL, sd is set to NULL; so for wakeups the code path that picks new_cpu goes through select_idle_sibling():
if (!sd) {
/* If affine_sd was non-NULL, sd has been cleared; SD_BALANCE_WAKE is set for wakeups. */
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
/*
* Try and locate an idle CPU in the sched_domain.
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
struct sched_group *sg;
int best_idle_cpu = -1;
int best_idle_cstate = INT_MAX;
unsigned long best_idle_capacity = ULONG_MAX;
schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts);
schedstat_inc(this_rq(), eas_stats.sis_attempts);
/* Is c-state-aware scheduling enabled, i.e. do we factor in how deep each cpu's idle state is? */
if (!sysctl_sched_cstate_aware) {
/* If not, simply check whether the target cpu is idle, ignoring whether a cpu in a
shallower idle state exists elsewhere; if it is idle, return it right away. */
if (idle_cpu(target)) {
schedstat_inc(p, se.statistics.nr_wakeups_sis_idle);
schedstat_inc(this_rq(), eas_stats.sis_idle);
return target;
}
/* Intel platforms only */
#ifdef CONFIG_INTEL_DWS
if (sched_feat(INTEL_DWS)) {
/*
* For either waker or wakee CPU, if it is idle, then select it, but
* if not, we lower down the bar to use a threshold of runnable avg
* to determine whether it is capable of handling the wakee task
*/
if (sysctl_sched_wakeup_threshold && cpu_more_runnable(target))
return target;
if (prev != target) {
/*
* If the prevous cpu is cache affine and idle, don't be stupid.
*/
if (cpus_share_cache(prev, target) && idle_cpu(prev))
return prev;
if (sysctl_sched_wakeup_threshold && cpu_more_runnable(prev))
return prev;
}
} else
#endif
/*
* If the prevous cpu is cache affine and idle, don't be stupid.
*/ /* If p's previous cpu (prev) is not the target, shares cache with the target,
and is idle, pick prev as the target cpu. */
if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {
schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine);
schedstat_inc(this_rq(), eas_stats.sis_cache_affine);
return prev;
}
}
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
sd = rcu_dereference(per_cpu(sd_llc, target));
/* Look for a cpu in the shallowest idle state with the smallest capacity on which to place task p. */
for_each_lower_domain(sd) {
sg = sd->groups;
do {
int i;
/* If p's allowed cpus have no overlap with this sched group's span, move on to the next sg. */
if (!cpumask_intersects(sched_group_cpus(sg),
tsk_cpus_allowed(p)))
goto next;
if (sysctl_sched_cstate_aware) {
for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { /* get cpu i's idle-state index */
int idle_idx = idle_get_state_idx(cpu_rq(i));
unsigned long new_usage = boosted_task_util(p);
unsigned long capacity_orig = capacity_orig_of(i);
if (new_usage > capacity_orig || !idle_cpu(i))
goto next;
/* If the cpu being visited is the target and p's util fits within the target's
current capacity, return the target directly. */
if (i == target && new_usage <= capacity_curr_of(target)) {
schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap);
schedstat_inc(this_rq(), eas_stats.sis_suff_cap);
schedstat_inc(sd, eas_stats.sis_suff_cap);
return target;
}
/* Track the cpu in the shallowest idle state with the smallest capacity. */
if (idle_idx < best_idle_cstate &&
capacity_orig <= best_idle_capacity) {
best_idle_cpu = i; /* remember this candidate */
best_idle_cstate = idle_idx;
best_idle_capacity = capacity_orig;
}
}
} else { /* Without c-state awareness, look for a group (not containing the target) in which every cpu is idle. */
for_each_cpu(i, sched_group_cpus(sg)) {
if (i == target || !idle_cpu(i))
goto next;
}
/* Every cpu in this group is idle: pick the first allowed cpu in it. */
target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu);
schedstat_inc(this_rq(), eas_stats.sis_idle_cpu);
schedstat_inc(sd, eas_stats.sis_idle_cpu);
goto done;
}
next:
sg = sg->next;
} while (sg != sd->groups);
}
if (best_idle_cpu >= 0)
target = best_idle_cpu;
done:
schedstat_inc(p, se.statistics.nr_wakeups_sis_count);
schedstat_inc(this_rq(), eas_stats.sis_count);
return target;
}
The logic is fairly simple.
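To see what the cstate-aware inner loop selects, here is a small user-space rendering of its ranking rule, shallowest idle state first and breaking towards smaller original capacity, with invented per-cpu data:

#include <stdio.h>
#include <limits.h>

struct cpu_state {
    int idle_idx;                 /* idle_get_state_idx(): smaller = shallower idle */
    unsigned long capacity_orig;  /* capacity_orig_of() */
};

int main(void)
{
    struct cpu_state cpus[4] = { {2, 1024}, {1, 512}, {1, 1024}, {3, 512} };  /* invented */
    int best_idle_cpu = -1, best_idle_cstate = INT_MAX;
    unsigned long best_idle_capacity = ULONG_MAX;

    for (int i = 0; i < 4; i++) {
        /* Same condition as in the kernel loop above. */
        if (cpus[i].idle_idx < best_idle_cstate &&
            cpus[i].capacity_orig <= best_idle_capacity) {
            best_idle_cpu = i;
            best_idle_cstate = cpus[i].idle_idx;
            best_idle_capacity = cpus[i].capacity_orig;
        }
    }
    printf("best idle cpu = %d\n", best_idle_cpu);  /* cpu1: shallow idle and small capacity */
    return 0;
}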
To analyse the other core path, find_idlest_cpu(), we first need to understand two core subfunctions: find_idlest_group() and find_idlest_group_cpu().
They work in sequence: once the idlest group has been found, the idlest cpu is looked up within that group, which finally determines the target cpu id. We analyse each in turn.
Since we are looking at the ARM platform, we follow the functions called on ARM:
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
*
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
struct sched_group *most_spare_sg = NULL;
unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
/* Choose load_idx from sd_flag (wake_idx is used only for woken-up tasks). */
if (sd_flag & SD_BALANCE_WAKE)
load_idx = sd->wake_idx;
/* Iterate over all sched groups within this domain. */
do {
unsigned long load, avg_load, spare_cap, max_spare_cap;
int local_group;
int i;
/* The group's cpus do not intersect p's allowed cpus; move straight on to the next group. */
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_cpus(group),
tsk_cpus_allowed(p)))
continue;
/* When sd is the lowest MC SDTL, cpumask_weight(sched_group_cpus(group)) = 1.
local_group records whether this_cpu belongs to the group being visited. */
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
/*
* Tally up the load of all CPUs in the group and find
* the group containing the CPU with most spare capacity.
*/
avg_load = 0;
max_spare_cap = 0;
/* Accumulate this group's load and find its largest unused capacity. */
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = source_load(i, load_idx);
else
load = target_load(i, load_idx);
/* Accumulate the load; it is normalised to a relative load below. */
avg_load += load;
/* cpu i's spare capacity, i.e. capacity_of() - cpu_util(). */
spare_cap = capacity_spare_wake(i, p);
/* Track the largest spare capacity seen across this group. */
if (spare_cap > max_spare_cap)
max_spare_cap = spare_cap;
}
/* Normalise to a relative load. */
/* Adjust by relative CPU capacity of the group */
avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
/* Update the decision variables depending on whether this is the local group. */
if (local_group) {
this_load = avg_load;
this_spare = max_spare_cap;
} else {
/* Remember the least-loaded group. */
if (avg_load < min_load) {
min_load = avg_load;
idlest = group;
}
/* Remember the group with the most spare capacity. */
if (most_spare < max_spare_cap) {
most_spare = max_spare_cap;
most_spare_sg = group;
}
}
} while (group = group->next, group != sd->groups);
/*
* The cross-over point between using spare capacity or least load
* is too conservative for high utilization tasks on partially
* utilized systems if we require spare_capacity > task_util(p),
* so we allow for some task stuffing by using
* spare_capacity > task_util(p)/2.
*
* Spare capacity can't be used for fork because the utilization has
* not been set yet, we must first select a rq to compute the initial
* utilization.
*/ /* Now decide, according to policy, which group counts as the "idlest" group:
1. For a freshly forked task (SD_BALANCE_FORK), skip the spare-capacity checks and use the least-loaded group.
2. If the local group's spare capacity exceeds half of p's util and, scaled by imbalance, beats the best remote spare capacity, return NULL (stay local).
3. Otherwise, if the best remote spare capacity exceeds half of p's util, return that group.
4. Finally fall back to the least-loaded group, unless the local load is already close enough (then return NULL). */
if (sd_flag & SD_BALANCE_FORK)
goto skip_spare;
if (this_spare > task_util(p) / 2 &&
imbalance*this_spare > 100*most_spare)
return NULL;
else if (most_spare > task_util(p) / 2)
return most_spare_sg;
skip_spare:
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
return idlest;
}
This is fairly easy to follow.
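With invented numbers, the spare-capacity-versus-least-load decision at the end of the function plays out as follows (in the kernel, returning NULL means "no better group; keep descending towards the local cpu"):

#include <stdio.h>
#include <stdbool.h>

/* Toy rendering of the decision tail of find_idlest_group(); all values are invented. */
int main(void)
{
    unsigned long task_util = 200;
    unsigned long this_spare = 150, most_spare = 80;  /* spare capacity: local group vs best remote group */
    unsigned long this_load = 900, min_load = 400;    /* relative load: local group vs least-loaded group */
    unsigned long imbalance = 100 + (125 - 100) / 2;  /* 100 + (imbalance_pct - 100) / 2 */
    bool is_fork = false;

    if (!is_fork) {
        if (this_spare > task_util / 2 && imbalance * this_spare > 100 * most_spare) {
            printf("return NULL: enough spare capacity locally\n");
            return 0;
        }
        if (most_spare > task_util / 2) {
            printf("return most_spare_sg\n");
            return 0;
        }
    }
    if (100 * this_load < imbalance * min_load)
        printf("return NULL: remote group not enough better\n");
    else
        printf("return idlest (least-loaded) group\n");
    return 0;
}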
Having selected the idlest group, we now pick the idlest cpu within it:
/*
* find_idlest_group_cpu - find the idlest cpu among the cpus in group.
*/
static int
find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
u64 latest_idle_timestamp = 0;
int least_loaded_cpu = this_cpu;
int shallowest_idle_cpu = -1;
int i;
/* Check if we have any choice: */
if (group->group_weight == 1)
return cpumask_first(sched_group_cpus(group));
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
if (idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
/* An idle cpu whose exit latency is the smallest is in the shallowest idle state;
the deeper the idle state, the larger the exit latency. */
if (idle && idle->exit_latency < min_exit_latency) {
/*
* We give priority to a CPU whose idle state
* has the smallest exit latency irrespective
* of any idle timestamp.
*/
min_exit_latency = idle->exit_latency;
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
/* If the exit latency ties with the minimum (or there is no active idle state) and
this cpu went idle more recently, prefer it; the comment below explains why. */
} else if ((!idle || idle->exit_latency == min_exit_latency) &&
rq->idle_stamp > latest_idle_timestamp) {
/*
* If equal or no active idle state, then
* the most recently idled CPU might have
* a warmer cache.
*/
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
/* If no cpu is idle, fall back to the least-loaded cpu. */
} else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(i);
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
least_loaded_cpu = i;
}
}
}
/* Return the shallowest-idle cpu if one exists, otherwise the least-loaded cpu. */
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
The principle here is also quite simple.
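As a compact restatement with invented per-cpu data, the priority order is: the idle cpu with the smallest exit latency, then (on a latency tie or with no idle-state info) the most recently idled cpu, and only if no cpu is idle at all, the least loaded one:

#include <stdio.h>
#include <limits.h>

struct cpu_snapshot {
    int idle;                      /* idle_cpu(i) */
    unsigned int exit_latency;     /* idle-state exit latency; smaller = shallower */
    unsigned long long idle_stamp; /* when the cpu last went idle */
    unsigned long load;            /* weighted_cpuload(i) */
};

int main(void)
{
    struct cpu_snapshot c[3] = { {1, 400, 100, 0}, {1, 50, 90, 0}, {0, 0, 0, 300} };  /* invented */
    unsigned int min_exit_latency = UINT_MAX;
    unsigned long long latest_idle = 0;
    unsigned long min_load = ULONG_MAX;
    int shallowest_idle_cpu = -1, least_loaded_cpu = 0;

    for (int i = 0; i < 3; i++) {
        if (c[i].idle) {
            if (c[i].exit_latency < min_exit_latency) {
                min_exit_latency = c[i].exit_latency;
                latest_idle = c[i].idle_stamp;
                shallowest_idle_cpu = i;
            } else if (c[i].exit_latency == min_exit_latency &&
                       c[i].idle_stamp > latest_idle) {
                latest_idle = c[i].idle_stamp;
                shallowest_idle_cpu = i;
            }
        } else if (shallowest_idle_cpu == -1 && c[i].load < min_load) {
            min_load = c[i].load;
            least_loaded_cpu = i;
        }
    }
    printf("chosen cpu = %d\n",
           shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu);  /* -> cpu1 */
    return 0;
}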
With both core subfunctions covered, we can now look at find_idlest_cpu() itself:
static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, int cpu, int prev_cpu, int sd_flag)
{
int new_cpu = cpu;
int wu = sd_flag & SD_BALANCE_WAKE;
int cas_cpu = -1;
if (wu) {
schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts);
schedstat_inc(this_rq(), eas_stats.cas_attempts);
}
/* Does p's allowed-cpu mask intersect this sched domain's span at all? */
if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
return prev_cpu;
/* Within this domain, find the idlest group, then the idlest cpu inside it. */
while (sd) {
struct sched_group *group = NULL;
struct sched_domain *tmp;
int weight;
if (wu)
schedstat_inc(sd, eas_stats.cas_attempts);
if (!(sd->flags & sd_flag)) {
sd = sd->child;
continue;
}
#ifdef CONFIG_INTEL_DWS
if (sched_feat(INTEL_DWS)) {
if (sd->flags & SD_INTEL_DWS)
group = dws_find_group(sd, p, cpu);
if (!group)
group = find_idlest_group(sd, p, cpu, sd_flag);
} else
#endif /* find the group with the least load / most spare capacity */
group = find_idlest_group(sd, p, cpu, sd_flag);
if (!group) {
sd = sd->child;
continue;
}
/* find the shallowest-idle / least-loaded cpu within that group */
new_cpu = find_idlest_group_cpu(group, p, cpu);
if (new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
sd = sd->child;
continue;
}
/* Now try balancing at a lower domain level of new_cpu */
cpu = cas_cpu = new_cpu;
weight = sd->span_weight;
sd = NULL; /* loop-exit condition: the while ends unless a lower-level domain is found below */
for_each_domain(cpu, tmp) {
/* If tmp is at a different level from the sd just searched, its span weight differs:
with two SDTLs (MC as child, DIE as parent) on an 8-core, two-cluster system, DIE's
span weight is larger than MC's. This loop re-walks new_cpu's domains and keeps the
highest level (with the requested balance flag) whose span is still smaller than the
one just searched, so balancing continues at a lower domain level of new_cpu. */
if (weight <= tmp->span_weight)
break;
if (tmp->flags & sd_flag)
sd = tmp;
}
/* while loop will break here if sd == NULL */
}
if (wu && (cas_cpu >= 0)) {
schedstat_inc(p, se.statistics.nr_wakeups_cas_count);
schedstat_inc(this_rq(), eas_stats.cas_count);
}
return new_cpu;
}
The logic here is also straightforward.
The key point is still to understand the relationship between scheduling domains and scheduling groups; an illustrative layout is sketched below.
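For reference, here is how the two levels relate on a hypothetical 8-core, two-cluster ARM system (an illustrative layout, not dumped from real hardware):

DIE level: each cpu has a sched_domain spanning cpu0-7; its sched_groups are the two clusters:
    group0 = {cpu0, cpu1, cpu2, cpu3}   (little cluster)
    group1 = {cpu4, cpu5, cpu6, cpu7}   (big cluster)
MC level: each cpu has a sched_domain spanning only its own cluster; its sched_groups are single cpus:
    on cpu0-3: span = {cpu0..cpu3}, groups = {cpu0} {cpu1} {cpu2} {cpu3}
    on cpu4-7: span = {cpu4..cpu7}, groups = {cpu4} {cpu5} {cpu6} {cpu7}

find_idlest_cpu() walks down this hierarchy: pick the idlest group at one level, then repeat inside that group's span at the level below until a single cpu remains.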