Let's now look at the actual code:
/*
* Scan every sched_domain level for this CPU, from the lowest level upward.
* If a domain's balance interval has expired, run load_balance() on it.
* Each domain level has its own interval, and the higher the level the longer
* the interval, because migrating tasks across it is increasingly expensive.
*/
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
int balance = 1;
struct rq *rq = cpu_rq(cpu);
unsigned long interval;
struct sched_domain *sd;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
int need_serialize;
update_blocked_averages(cpu);
rcu_read_lock();
// a task always runs on a single CPU, the smallest scheduling unit
for_each_domain(cpu, sd) {
// walk the domains bottom-up and check whether each one should be balanced
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
// each domain level has its own balance interval
interval = sd->balance_interval;
if (idle != CPU_IDLE)
interval *= sd->busy_factor;
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
interval = clamp(interval, 1UL, max_load_balance_interval);
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
if (!spin_trylock(&balancing))
goto out;
}
// once jiffies >= sd->last_balance + interval, it is time to balance this domain again
if (time_after_eq(jiffies, sd->last_balance + interval)) {
if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
* The LBF_SOME_PINNED logic could have changed
* env->dst_cpu, so we can't know our idle
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
}
// record when this domain was last balanced
sd->last_balance = jiffies;
}
if (need_serialize)
spin_unlock(&balancing);
out:
if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
update_next_balance = 1;
}
/*
* Stop the load balance at this level. There is another
* CPU in our sched group which is doing load balancing more
* actively.
*/
if (!balance)
break;
}
rcu_read_unlock();
/*
* next_balance will be updated only when there is a need.
* When the cpu is attached to null domain for ex, it will not be
* updated.
*/
if (likely(update_next_balance))
rq->next_balance = next_balance;
}
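To make the interval scaling concrete, here is a small user-space sketch of the arithmetic done in the loop above. The numbers (an 8 ms base interval, a busy_factor of 32, HZ = 250, and a clamp ceiling of HZ/10 jiffies standing in for max_load_balance_interval) are assumptions chosen for illustration, not values taken from any particular kernel configuration.
#include <stdio.h>

#define HZ 250UL
#define MAX_LB_INTERVAL (HZ / 10)     /* assumed stand-in for max_load_balance_interval */

static unsigned long ms_to_jiffies(unsigned long ms)
{
    return ms * HZ / 1000;            /* same scaling msecs_to_jiffies() performs */
}

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
    return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
    unsigned long interval = 8;       /* assumed sd->balance_interval, in ms */
    unsigned long busy_factor = 32;   /* assumed sd->busy_factor */

    /* A busy CPU stretches the interval: 8 ms * 32 = 256 ms. */
    interval *= busy_factor;

    /* 256 ms -> 64 jiffies at HZ = 250, then clamped to [1, 25]. */
    interval = clamp_ul(ms_to_jiffies(interval), 1UL, MAX_LB_INTERVAL);

    printf("effective balance interval: %lu jiffies\n", interval);
    return 0;
}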
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
/*
* This function balances load among the scheduling groups of one sched_domain.
* It works in three steps:
* 1. Find the busiest group in the domain.
* 2. Find the busiest CPU within that group.
* 3. Migrate tasks from that CPU to this CPU and return how many were moved.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *balance)
{
int ld_moved, cur_ld_moved, active_balance = 0;
struct sched_group *group;
struct rq *busiest = NULL;
unsigned long flags;
struct cpumask *cpus = __get_cpu_var(load_balance_mask);
struct lb_env env = {
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
.dst_grpmask = sched_group_cpus(sd->groups), //group cpu mask
.idle = idle,
.busiest_nr_running = 0,
.busiest_grp_capacity = 0,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.flags = 0,
.loop = 0,
};
/*
* For NEWLY_IDLE load_balancing, we don't need to consider
* other cpus in our group
*/
// this CPU has just gone idle, so the load of the other CPUs in its group can be ignored
if (idle == CPU_NEWLY_IDLE)
env.dst_grpmask = NULL;
cpumask_copy(cpus, cpu_active_mask);
per_cpu(dbs_boost_load_moved, this_cpu) = 0;
schedstat_inc(sd, lb_count[idle]);
redo:
// find the busiest group in the domain
group = find_busiest_group(&env, balance);
if (*balance == 0)
goto out_balanced;
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
}
// find the busiest runqueue, and thus the busiest CPU, within that group
busiest = find_busiest_queue(&env, group);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == env.dst_rq);
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
ld_moved = 0;
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); // cap on how many tasks one pass may migrate
update_h_load(env.src_cpu);
more_balance:
local_irq_save(flags);
double_rq_lock(env.dst_rq, busiest);
/* The world might have changed. Validate assumptions */
// if the busiest rq now has at most one runnable task there is nothing to pull; tasks may have stopped since we picked it
if (busiest->nr_running <= 1) {
double_rq_unlock(env.dst_rq, busiest);
local_irq_restore(flags);
env.flags &= ~LBF_ALL_PINNED;
goto no_move;
}
/*
* cur_ld_moved - load moved in current iteration (tasks moved in this pass)
* ld_moved - cumulative load moved across iterations (total tasks moved)
*/
// migrate the tasks
cur_ld_moved = move_tasks(&env);
ld_moved += cur_ld_moved;
double_rq_unlock(env.dst_rq, busiest);
local_irq_restore(flags);
/*
* some other cpu did the load balance for us.
*/
if (cur_ld_moved && env.dst_cpu != smp_processor_id())
resched_cpu(env.dst_cpu);
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
goto more_balance;
}
/*
* Revisit (affine) tasks on src_cpu that couldn't be moved to
* us and move them to an alternate dst_cpu in our sched_group
* where they can run. The upper limit on how many times we
* iterate on same src_cpu is dependent on number of cpus in our
* sched_group.
*
* This changes load balance semantics a bit on who can move
* load to a given_cpu. In addition to the given_cpu itself
* (or a ilb_cpu acting on its behalf where given_cpu is
* nohz-idle), we now have balance_cpu in a position to move
* load to given_cpu. In rare situations, this may cause
* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
* _independently_ and at _same_ time to move some load to
* given_cpu) causing excess load to be moved to given_cpu.
* This however should not happen so much in practice and
* moreover subsequent load balance cycles should correct the
* excess load moved.
*/
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_SOME_PINNED;
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
/* Prevent to re-select dst_cpu via env's cpus */
cpumask_clear_cpu(env.dst_cpu, env.cpus);
/*
* Go back to "more_balance" rather than "redo" since we
* need to continue with same src_cpu.
*/
goto more_balance;
}
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
if (!cpumask_empty(cpus)) {
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
goto redo;
}
goto out_balanced;
}
}
no_move:
if (!ld_moved) {
if (!(env.flags & LBF_HMP_ACTIVE_BALANCE))
schedstat_inc(sd, lb_failed[idle]);
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE &&
!(env.flags & LBF_HMP_ACTIVE_BALANCE))
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
* if the curr task on busiest cpu can't be
* moved to this_cpu
*/
if (!cpumask_test_cpu(this_cpu,
tsk_cpus_allowed(busiest->curr))) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
env.flags |= LBF_ALL_PINNED;
goto out_one_pinned;
}
/*
* ->active_balance synchronizes accesses to
* ->active_balance_work. Once set, it's cleared
* only after active load balance is finished.
*/
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
active_balance = 1;
}
raw_spin_unlock_irqrestore(&busiest->lock, flags);
if (active_balance) {
stop_one_cpu_nowait(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work);
ld_moved++;
}
/*
* We've kicked active balancing, reset the failure
* counter.
*/
sd->nr_balance_failed =
sd->cache_nice_tries +
NEED_ACTIVE_BALANCE_THRESHOLD - 1;
}
} else {
sd->nr_balance_failed = 0;
if (per_cpu(dbs_boost_needed, this_cpu)) {
struct migration_notify_data mnd;
mnd.src_cpu = cpu_of(busiest);
mnd.dest_cpu = this_cpu;
mnd.load = per_cpu(dbs_boost_load_moved, this_cpu);
if (mnd.load > 100)
mnd.load = 100;
atomic_notifier_call_chain(&migration_notifier_head,
0, (void *)&mnd);
per_cpu(dbs_boost_needed, this_cpu) = false;
per_cpu(dbs_boost_load_moved, this_cpu) = 0;
}
/* Assumes one 'busiest' cpu that we pulled tasks from */
if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
check_for_freq_change(this_rq);
check_for_freq_change(busiest);
}
}
if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
} else {
/*
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
* move_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
}
goto out;
out_balanced:
schedstat_inc(sd, lb_balanced[idle]);
sd->nr_balance_failed = 0;
out_one_pinned:
/* tune up the balancing interval */
if (((env.flags & LBF_ALL_PINNED) &&
sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
ld_moved = 0;
out:
trace_sched_load_balance(this_cpu, idle, *balance,
group ? group->cpumask[0] : 0,
busiest ? busiest->nr_running : 0,
env.imbalance, env.flags, ld_moved,
sd->balance_interval);
return ld_moved;
}
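As a mental model of the three steps listed in the comment before load_balance(), here is a deliberately tiny, self-contained toy that balances fake per-CPU loads: it picks the busiest of two groups, picks the busiest CPU inside it, and pulls load until the imbalance is gone. Every number and name in it is invented for illustration; it only mirrors the shape of the real function, not its locking, retries, pinning logic or statistics.
#include <stdio.h>

#define CPUS_PER_GROUP 2
#define NR_GROUPS      2

struct toy_cpu { int id; int load; };        /* 'load' stands in for weighted_cpuload() */
struct toy_group { struct toy_cpu cpu[CPUS_PER_GROUP]; };

static int group_load(const struct toy_group *g)
{
    int sum = 0;
    for (int i = 0; i < CPUS_PER_GROUP; i++)
        sum += g->cpu[i].load;
    return sum;
}

int main(void)
{
    struct toy_group groups[NR_GROUPS] = {
        { { { .id = 0, .load = 10 }, { .id = 1, .load = 20 } } },   /* local group  */
        { { { .id = 2, .load = 90 }, { .id = 3, .load = 40 } } },   /* remote group */
    };

    /* Step 1: find the busiest group (here: the remote one). */
    struct toy_group *busiest_grp = &groups[0];
    for (int i = 1; i < NR_GROUPS; i++)
        if (group_load(&groups[i]) > group_load(busiest_grp))
            busiest_grp = &groups[i];

    /* Step 2: find the busiest CPU inside that group. */
    struct toy_cpu *busiest = &busiest_grp->cpu[0];
    for (int i = 1; i < CPUS_PER_GROUP; i++)
        if (busiest_grp->cpu[i].load > busiest->load)
            busiest = &busiest_grp->cpu[i];

    /* Step 3: pull load to a local CPU (here simply cpu 0) until balanced. */
    struct toy_cpu *dst = &groups[0].cpu[0];
    int imbalance = (group_load(busiest_grp) - group_load(&groups[0])) / 2;
    while (imbalance > 0 && busiest->load > dst->load) {
        busiest->load -= 10;          /* pretend we migrated one task of load 10 */
        dst->load += 10;
        imbalance -= 10;
    }

    printf("busiest cpu %d now %d, dst cpu %d now %d\n",
           busiest->id, busiest->load, dst->id, dst->load);
    return 0;
}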
static struct sched_group *
find_busiest_group(struct lb_env *env, int *balance)
{
struct sd_lb_stats sds;
memset(&sds, 0, sizeof(sds));
/*
* Compute the various statistics relevant for load balancing at
* this level.
*/
// update the sched_domain/sched_group statistics and pick the busiest group in the domain;
// the checks below then decide whether that group really warrants pulling from
update_sd_lb_stats(env, balance, &sds);
/*
* this_cpu is not the appropriate cpu to perform load balancing at
* this level.
*/
if (!(*balance))
goto ret;
if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
check_asym_packing(env, &sds))
return sds.busiest;
/* There is no busy sibling group to pull tasks from */
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
if (env->flags & LBF_HMP_ACTIVE_BALANCE)
goto force_balance;
// bail out (no balancing needed) unless the busiest group has spilled over its load/task-count limits
if (bail_inter_cluster_balance(env, &sds))
goto out_balanced;
sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
/*
* If the busiest group is imbalanced the below checks don't
* work because they assume all things are equal, which typically
* isn't true due to cpus_allowed constraints and the like.
*/
if (sds.group_imb)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
!sds.busiest_has_capacity)
goto force_balance;
/*
* If the local group is more busy than the selected busiest group
* don't try and pull any tasks.
*/
// the local group is carrying the heavier load
if (sds.this_load >= sds.max_load)
goto out_balanced;
/*
* Don't pull any tasks if this group is already above the domain
* average load.
*/
if (sds.this_load >= sds.avg_load)
goto out_balanced;
if (env->idle == CPU_IDLE) {
/*
* This cpu is idle. If the busiest group load doesn't
* have more tasks than the number of available cpu's and
* there is no imbalance between this and busiest group
* wrt to idle cpu's, it is balanced.
*/
// if the local group does not have noticeably more idle CPUs than the busiest group,
// and the busiest group is not running more tasks than it has CPUs, treat this as balanced
if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
sds.busiest_nr_running <= sds.busiest_group_weight)
goto out_balanced;
} else {
/*
* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
* imbalance_pct to be conservative.
*/
if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
goto out_balanced;
}
force_balance:
/* Looks like there is an imbalance. Compute it */
// compute how much imbalance exists among the groups in this domain
calculate_imbalance(env, &sds);
return sds.busiest;
out_balanced:
ret:
env->imbalance = 0;
return NULL;
}
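One of the last filters above is the imbalance_pct check: when this CPU is not idle, the busiest group must exceed the local group's load by a configurable percentage before it is worth pulling. A short sketch of that arithmetic, using an assumed imbalance_pct of 125 (per-domain values in the 110-125 range are typical, but treat the number here purely as an example):
#include <stdio.h>

/* Returns 1 when the busiest group is "busy enough" to justify pulling. */
static int busier_than_pct(unsigned long max_load, unsigned long this_load,
                           unsigned int imbalance_pct)
{
    /* mirrors: 100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load */
    return 100 * max_load > imbalance_pct * this_load;
}

int main(void)
{
    unsigned int imbalance_pct = 125;   /* assumed; a per-domain tunable in the kernel */

    /* 1200 vs 1000: only 20% busier, below the 25% threshold -> stay balanced. */
    printf("%d\n", busier_than_pct(1200, 1000, imbalance_pct));   /* prints 0 */

    /* 1300 vs 1000: 30% busier -> worth computing an imbalance. */
    printf("%d\n", busier_than_pct(1300, 1000, imbalance_pct));   /* prints 1 */
    return 0;
}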
// First, the relevant data structures:
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *this; /* Local group in this sd */
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_pwr; /* Total power of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
/** Statistics of this group */
unsigned long this_load; // load of the local group
unsigned long this_load_per_task; // per-task load in the local group
unsigned long this_nr_running; // number of running tasks in the local group
unsigned long this_has_capacity;
unsigned long this_group_capacity;
unsigned int this_idle_cpus;
/* Statistics of the busiest group */
unsigned int busiest_idle_cpus;
unsigned long max_load; // load of the busiest group
unsigned long busiest_load_per_task; // average per-task load in the busiest group
unsigned long busiest_nr_running; // number of running tasks in the busiest group
#ifdef CONFIG_SCHED_HMP
unsigned long busiest_nr_small_tasks;
unsigned long busiest_nr_big_tasks;
u64 busiest_scaled_load;
#endif
unsigned long busiest_group_capacity; // number of tasks the busiest group can accommodate
unsigned long busiest_has_capacity;
unsigned int busiest_group_weight;
int group_imb; /* Is there imbalance in this sd */
};
/**
* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
* @env: The load balancing environment.
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
*/
//domain->group->cpu
static inline void update_sd_lb_stats(struct lb_env *env,
int *balance, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats sgs;
int load_idx, prefer_sibling = 0;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
load_idx = get_sd_load_idx(env->sd, env->idle);
do {
int local_group;
// is dst_cpu part of this group?
local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
memset(&sgs, 0, sizeof(sgs));
update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
// *balance is cleared by update_sg_lb_stats() when this CPU is not the one that should balance here
if (local_group && !(*balance))
return;
sds->total_load += sgs.group_load;
sds->total_pwr += sg->sgp->power;
/*
* In case the child domain prefers tasks go to siblings
* first, lower the sg capacity to one so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
* these excess tasks, i.e. nr_running < group_capacity. The
* extra check prevents the case where you always pull from the
* heaviest group when it is already under-utilized (possible
* with a large weight task outweighs the tasks on the system).
*/
if (prefer_sibling && !local_group && sds->this_has_capacity)
sgs.group_capacity = min(sgs.group_capacity, 1UL);
// if this is the local group (the one containing dst_cpu), record its stats
if (local_group) {
sds->this_load = sgs.avg_load;
sds->this = sg;
sds->this_nr_running = sgs.sum_nr_running;
sds->this_load_per_task = sgs.sum_weighted_load;
sds->this_has_capacity = sgs.group_has_capacity;
sds->this_idle_cpus = sgs.idle_cpus;
sds->this_group_capacity = sgs.group_capacity;
} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { // otherwise keep a running maximum: remember the busiest remote group seen so far
sds->max_load = sgs.avg_load;
sds->busiest = sg;
env->busiest_nr_running = sds->busiest_nr_running
= sgs.sum_nr_running;
sds->busiest_idle_cpus = sgs.idle_cpus;
env->busiest_grp_capacity = sds->busiest_group_capacity
= sgs.group_capacity;
sds->busiest_load_per_task = sgs.sum_weighted_load;
sds->busiest_has_capacity = sgs.group_has_capacity;
sds->busiest_group_weight = sgs.group_weight;
sds->group_imb = sgs.group_imb;
#ifdef CONFIG_SCHED_HMP
sds->busiest_nr_small_tasks = sgs.sum_nr_small_tasks;
sds->busiest_nr_big_tasks = sgs.sum_nr_big_tasks;
sds->busiest_scaled_load = sgs.group_cpu_load;
#endif
}
sg = sg->next;
} while (sg != env->sd->groups); // loop over every group in the domain
}
static int
bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
{
int nr_cpus, local_cpu, busiest_cpu;
local_cpu = group_first_cpu(sds->this); // first CPU of the domain's local group
busiest_cpu = group_first_cpu(sds->busiest); // first CPU of the domain's busiest group
// CPU capacity differs between clusters in a heterogeneous topology
if (cpu_capacity(local_cpu) <= cpu_capacity(busiest_cpu))
return 0;
if (sds->busiest_nr_big_tasks)
return 0;
// number of CPUs in the busiest group
nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
// spill thresholds: per-CPU load and task-count limits
if ((sds->busiest_scaled_load < nr_cpus * sched_spill_load) &&
(sds->busiest_nr_running <
nr_cpus * sysctl_sched_spill_nr_run)) {
return 1;
}
return 0;
}
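bail_inter_cluster_balance() belongs to the HMP extensions: a higher-capacity cluster declines to pull from a lower-capacity one unless the latter has spilled over its limits. A minimal sketch of the two thresholds follows, with made-up per-CPU limits standing in for the real sched_spill_load and sysctl_sched_spill_nr_run tunables:
#include <stdio.h>

/* Assumed example tunables; the real values are platform-specific HMP sysctls. */
#define SPILL_LOAD_PER_CPU   90     /* stand-in for sched_spill_load           */
#define SPILL_NR_RUN_PER_CPU 4      /* stand-in for sysctl_sched_spill_nr_run  */

/* Returns 1 ("bail") when the lower-capacity group has not spilled over. */
static int should_bail(unsigned long scaled_load, unsigned long nr_running,
                       int nr_cpus)
{
    return scaled_load < (unsigned long)nr_cpus * SPILL_LOAD_PER_CPU &&
           nr_running  < (unsigned long)nr_cpus * SPILL_NR_RUN_PER_CPU;
}

int main(void)
{
    /* 4 little CPUs with moderate load and 6 runnable tasks: leave them alone. */
    printf("bail=%d\n", should_bail(300, 6, 4));    /* bail=1 */

    /* Same CPUs but 20 runnable tasks: they have spilled over, so balance. */
    printf("bail=%d\n", should_bail(300, 20, 4));   /* bail=0 */
    return 0;
}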
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
* @env: load balance environment
* @sds: statistics of the sched_domain whose imbalance is to be calculated.
*/
// work out the degree of imbalance among the groups in this domain
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long max_pull, load_above_capacity = ~0UL;
sds->busiest_load_per_task /= sds->busiest_nr_running;
if (sds->group_imb) {
sds->busiest_load_per_task =
min(sds->busiest_load_per_task, sds->avg_load);
}
/*
* In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below
* its cpu_power, while calculating max_load..)
*/
// if the busiest group's load is below the average, treat it as balanced and let fix_small_imbalance() decide
if (sds->max_load < sds->avg_load) {
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}
if (!sds->group_imb) {
/*
* Don't want to pull so many tasks that a group would go idle.
*/
// the number of tasks beyond what the busiest group can accommodate
load_above_capacity = (sds->busiest_nr_running -
sds->busiest_group_capacity);
load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
load_above_capacity /= sds->busiest->sgp->power;
}
/*
* We're trying to get all the cpus to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
* reduce the max loaded cpu below the average load. At the same time,
* we also don't want to reduce the group load below the group capacity
* (so that we can implement power-savings policies etc). Thus we look
* for the minimum possible imbalance.
* Be careful of negative numbers as they'll appear as very large values
* with unsigned longs.
*/
max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
env->imbalance = min(max_pull * sds->busiest->sgp->power,
(sds->avg_load - sds->this_load) * sds->this->sgp->power)
/ SCHED_POWER_SCALE;
/*
* if *imbalance is less than the average load per runnable task
* there is no guarantee that any tasks will be moved so we'll have
* a think about bumping its value to force at least one task to be
* moved
*/
if (env->imbalance < sds->busiest_load_per_task)
return fix_small_imbalance(env, sds);
}
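The heart of calculate_imbalance() is the final min(): pull enough load to bring the busiest group down toward the domain average, but never so much that the local group would be pushed above that average. A numeric sketch with invented, already-normalized figures (SCHED_POWER_SCALE is 1024 in this kernel generation; every other value below is assumed):
#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

int main(void)
{
    /* Assumed per-group statistics, in the same units as sd_lb_stats. */
    unsigned long avg_load  = 700;          /* sds->avg_load             */
    unsigned long max_load  = 1000;         /* sds->max_load (busiest)   */
    unsigned long this_load = 500;          /* sds->this_load (local)    */
    unsigned long busiest_power = 1024;     /* sds->busiest->sgp->power  */
    unsigned long this_power    = 1024;     /* sds->this->sgp->power     */
    unsigned long load_above_capacity = ~0UL;   /* pretend no capacity cap */

    /* max_pull = min(max_load - avg_load, load_above_capacity) = 300 */
    unsigned long max_pull = max_load - avg_load;
    if (load_above_capacity < max_pull)
        max_pull = load_above_capacity;

    /* imbalance = min(300 * 1024, (700 - 500) * 1024) / 1024 = 200,
     * i.e. limited by how much the local group can absorb without
     * itself overshooting the average. */
    unsigned long a = max_pull * busiest_power;
    unsigned long b = (avg_load - this_load) * this_power;
    unsigned long imbalance = (a < b ? a : b) / SCHED_POWER_SCALE;

    printf("imbalance = %lu\n", imbalance);   /* 200 */
    return 0;
}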
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
// scan the CPUs of the group, keeping track of the busiest runqueue
static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
unsigned long max_load = 0;
int i;
if (sched_enable_hmp)
return find_busiest_queue_hmp(env, group);
// walk every CPU in the group and track the busiest one
for_each_cpu(i, sched_group_cpus(group)) {
// rq->cpu_power reflects the compute capacity of the CPU; sched_init() initializes it to SCHED_LOAD_SCALE
unsigned long power = power_of(i);//cpu power
unsigned long capacity = DIV_ROUND_CLOSEST(power,
SCHED_POWER_SCALE);
unsigned long wl;
if (!capacity)
capacity = fix_small_capacity(env->sd, group);
if (!cpumask_test_cpu(i, env->cpus))
continue;
rq = cpu_rq(i);
wl = weighted_cpuload(i); // load on this CPU's runqueue
/*
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the cpu power.
*/
if (capacity && rq->nr_running == 1 && wl > env->imbalance)
continue;
/*
* For the load comparisons with the other cpu's, consider
* the weighted_cpuload() scaled with the cpu power, so that
* the load can be moved away from the cpu that is potentially
* running at a lower capacity.
*/
wl = (wl * SCHED_POWER_SCALE) / power;
if (wl > max_load) {
max_load = wl;
busiest = rq;
}
}
return busiest;
}
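The scaling step wl = (wl * SCHED_POWER_SCALE) / power is what makes a lower-capacity CPU look busier at the same raw runqueue load, so it gets picked as the queue to pull from. A tiny sketch with assumed power values (1024 for a full-capacity CPU, 512 for a half-capacity one):
#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

static unsigned long scaled_load(unsigned long wl, unsigned long power)
{
    return wl * SCHED_POWER_SCALE / power;   /* same scaling as find_busiest_queue() */
}

int main(void)
{
    /* Two CPUs with identical raw runqueue load... */
    unsigned long wl = 600;

    /* ...but the lower-power CPU ends up looking twice as loaded, */
    /* so it is the one we prefer to pull from.                    */
    printf("full capacity (power=1024): %lu\n", scaled_load(wl, 1024));  /* 600  */
    printf("half capacity (power=512):  %lu\n", scaled_load(wl, 512));   /* 1200 */
    return 0;
}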
/*
* Loop over the source cfs_rq, moving at most env->loop_max tasks
* to the destination CPU.
*/
static int move_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
struct task_struct *p;
unsigned long load;
int pulled = 0;
int orig_loop = env->loop;
if (env->imbalance <= 0)
return 0;
env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu))
env->flags |= LBF_IGNORE_SMALL_TASKS;
else if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu) &&
!sched_boost())
env->flags |= LBF_IGNORE_BIG_TASKS;
redo:
while (!list_empty(tasks)) {
p = list_first_entry(tasks, struct task_struct, se.group_node);
env->loop++;
/* We've more or less seen every task there is, call it quits */
if (env->loop > env->loop_max)
break;
/* take a breather every nr_migrate tasks */
if (env->loop > env->loop_break) {
env->loop_break += sched_nr_migrate_break;
env->flags |= LBF_NEED_BREAK;
break;
}
// check whether p may be migrated; if not, move it to the tail of the cfs_rq task list
if (!can_migrate_task(p, env))
goto next;
load = task_h_load(p);
if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
goto next;
if ((load / 2) > env->imbalance)
goto next;
move_task(p, env);
pulled++;
env->imbalance -= load;
per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p);
#ifdef CONFIG_PREEMPT
/*
* NEWIDLE balancing is a source of latency, so preemptible
* kernels will stop after the first task is pulled to minimize
* the critical section.
*/
if (env->idle == CPU_NEWLY_IDLE)
break;
#endif
/*
* We only want to steal up to the prescribed amount of
* weighted load.
*/
if (env->imbalance <= 0)
break;
continue;
next:
list_move_tail(&p->se.group_node, tasks);
}
if (env->flags & (LBF_IGNORE_SMALL_TASKS | LBF_IGNORE_BIG_TASKS |
LBF_IGNORE_PREFERRED_CLUSTER_TASKS)
&& !pulled) {
tasks = &env->src_rq->cfs_tasks;
env->flags &= ~(LBF_IGNORE_SMALL_TASKS | LBF_IGNORE_BIG_TASKS |
LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
env->loop = orig_loop;
goto redo;
}
/*
* Right now, this is one of only two places move_task() is called,
* so we can safely collect move_task() stats here rather than
* inside move_task().
*/
schedstat_add(env->sd, lb_gained[env->idle], pulled);
return pulled;
}
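Two checks in the loop above govern how aggressively tasks are pulled: a task is skipped when half of its load already exceeds the remaining imbalance, and the loop stops once the imbalance has been driven to zero or below. A worked example with invented task loads:
#include <stdio.h>

int main(void)
{
    /* Assumed task_h_load() values for the candidate tasks, in queue order. */
    unsigned long loads[] = { 700, 400, 150 };
    long imbalance = 300;                 /* env->imbalance to cancel out    */
    int pulled = 0;

    for (unsigned i = 0; i < sizeof(loads) / sizeof(loads[0]); i++) {
        if (imbalance <= 0)
            break;
        /* mirrors: if ((load / 2) > env->imbalance) goto next; */
        if ((long)(loads[i] / 2) > imbalance) {
            printf("skip task with load %lu (would overshoot)\n", loads[i]);
            continue;
        }
        pulled++;
        imbalance -= loads[i];
        printf("pull task with load %lu, imbalance now %ld\n", loads[i], imbalance);
    }
    printf("pulled %d task(s)\n", pulled);
    return 0;
}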
/*
* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
*/
// Check whether task p can be migrated to this CPU; four cases forbid the migration.
// Returns 0 if it cannot be migrated, 1 if it can.
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot = 0;
int twf;
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) running (obviously), or
* 4) are cache-hot on their current CPU.
*/
// is the task group throttled on either the source or the destination CPU?
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
// when pulling to a higher-capacity CPU that still has big tasks queued, prefer the big tasks and skip this one
if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu) &&
nr_big_tasks(env->src_rq) && !is_big_task(p))
return 0;
// skip small tasks when they are being ignored for this pass
if (env->flags & LBF_IGNORE_SMALL_TASKS && is_small_task(p))
return 0;
twf = task_will_fit(p, env->dst_cpu);
/*
* Attempt to not pull tasks that don't fit. We may get lucky and find
* one that actually fits.
*/
// when big tasks are being ignored, skip tasks that do not fit on dst_cpu
if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
return 0;
if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
!preferred_cluster(cpu_rq(env->dst_cpu)->cluster, p))
return 0;
/*
* Group imbalance can sometimes cause work to be pulled across groups
* even though the group could have managed the imbalance on its own.
* Prevent inter-cluster migrations for big tasks when the number of
* tasks is lower than the capacity of the group.
*/
if (!twf && env->busiest_nr_running <= env->busiest_grp_capacity)
return 0;
// check whether dst_cpu is in p's allowed cpumask; if not, p cannot be pulled here
if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
int cpu;
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
/*
* Remember if this task can be migrated to any other cpu in
* our sched_group. We may want to revisit it if we couldn't
* meet load balance goals by pulling other tasks on src_cpu.
*
* Also avoid computing new_dst_cpu if we have already computed
* one in current iteration.
*/
if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
return 0;
/* Prevent to re-select dst_cpu via env's cpus */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
env->flags |= LBF_SOME_PINNED;
env->new_dst_cpu = cpu;
break;
}
}
return 0;
}
/* Record that we found at least one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
if (task_running(env->src_rq, p)) {
schedstat_inc(p, se.statistics.nr_failed_migrations_running);
return 0;
}
/*
* Aggressive migration if:
* 1) IDLE or NEWLY_IDLE balance.
* 2) task is cache cold, or
* 3) too many balance attempts have failed.
*/
tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
if (env->idle != CPU_NOT_IDLE || !tsk_cache_hot ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations);
}
return 1;
}
schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
return 0;
}
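The last hurdle in can_migrate_task() is cache hotness: a cache-hot task is normally left where it is, but an idle destination CPU or repeated balance failures (sd->nr_balance_failed exceeding sd->cache_nice_tries) override that and force the migration. A compact sketch of just this decision, with assumed inputs:
#include <stdio.h>

enum idle_type { NOT_IDLE, IDLE, NEWLY_IDLE };   /* stand-ins for cpu_idle_type */

/* Mirrors the final check in can_migrate_task(): 1 = migrate, 0 = keep. */
static int allow_migration(enum idle_type idle, int cache_hot,
                           unsigned int nr_balance_failed,
                           unsigned int cache_nice_tries)
{
    return idle != NOT_IDLE || !cache_hot ||
           nr_balance_failed > cache_nice_tries;
}

int main(void)
{
    /* Busy CPU, cache-hot task, no failures yet: leave the task where it is. */
    printf("%d\n", allow_migration(NOT_IDLE, 1, 0, 1));      /* 0 */

    /* Same task, but balancing has already failed twice: migrate anyway. */
    printf("%d\n", allow_migration(NOT_IDLE, 1, 2, 1));      /* 1 */

    /* Idle destination CPU: migrate even a cache-hot task. */
    printf("%d\n", allow_migration(IDLE, 1, 0, 1));          /* 1 */
    return 0;
}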