The Linux kernel uses the sched_domain_topology_level (SDTL) structure to organize the CPU hierarchy:
struct sched_domain_topology_level {
sched_domain_mask_f mask; // function pointer that returns the cpumask bitmap of this SDTL level
sched_domain_flags_f sd_flags; // function pointer that returns the flag bits of this SDTL level
int flags;
int numa_level;
struct sd_data data;
#ifdef CONFIG_SCHED_DEBUG
char *name;
#endif
};
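For reference, the default topology table in kernel/sched/topology.c looks roughly like this (which levels exist depends on CONFIG_SCHED_SMT/CONFIG_SCHED_MC and on the kernel version; architectures can override it via set_sched_topology()):
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};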
The flag bits:
#define SD_BALANCE_NEWIDLE 0x0001 /* Balance when about to become idle */
#define SD_BALANCE_EXEC 0x0002 /* Balance on exec */
#define SD_BALANCE_FORK 0x0004 /* Balance on fork, clone */
#define SD_BALANCE_WAKE 0x0008 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0010 /* Wake task to waking CPU */
#define SD_ASYM_CPUCAPACITY 0x0020 /* Domain members have different CPU capacities */
#define SD_SHARE_CPUCAPACITY 0x0040 /* Domain members share CPU capacity */
#define SD_SHARE_POWERDOMAIN 0x0080 /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES 0x0100 /* Domain members share CPU pkg resources */
#define SD_SERIALIZE 0x0200 /* Only a single load balancing instance */
#define SD_ASYM_PACKING 0x0400 /* Place busy groups earlier in the domain */
#define SD_PREFER_SIBLING 0x0800 /* Prefer to place tasks in a sibling domain */
#define SD_OVERLAP 0x1000 /* sched_domains of this level overlap */
#define SD_NUMA 0x2000 /* cross-node balancing */
Flag | Description
---|---
SD_BALANCE_NEWIDLE | Balance when the CPU is about to become idle
SD_BALANCE_EXEC | On exec() the task re-selects the most suitable CPU to run on; see sched_exec()
SD_BALANCE_FORK | A newly forked task selects the most suitable CPU; see wake_up_new_task()
SD_BALANCE_WAKE | Balance on wakeup; see wake_up_process()
SD_WAKE_AFFINE | Supports the wake-affine feature (wake the task close to the waking CPU)
SD_ASYM_CPUCAPACITY | The domain contains CPUs of different capacities, e.g. big/little cores
SD_SHARE_CPUCAPACITY | CPUs in the domain share CPU capacity; describes the SMT level
SD_SHARE_POWERDOMAIN | CPUs in the domain share a power domain
SD_SHARE_PKG_RESOURCES | CPUs in the domain share package resources (caches)
SD_ASYM_PACKING | Asymmetric packing, an SMT-level special case; place busy groups earlier in the domain
SD_NUMA | Describes a NUMA level (cross-node balancing)
SD_SERIALIZE | Only a single load-balancing instance runs at a time
SD_OVERLAP | sched_domains at this level overlap
SD_PREFER_SIBLING | Prefer to place tasks in a sibling domain
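Each SDTL level gets its flags from its sd_flags callback. For example (from include/linux/sched/topology.h; exact contents depend on the kernel version), the SMT and MC levels use:
#ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void)
{
	return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
}
#endif

#ifdef CONFIG_SCHED_MC
static inline int cpu_core_flags(void)
{
	return SD_SHARE_PKG_RESOURCES;
}
#endif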
A scheduling group (sched_group) is the smallest unit over which load balancing operates.
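For context, here is an abridged sketch of struct sched_group from kernel/sched/sched.h (fields vary across versions): the groups of one domain are linked in a circular list, and each group covers a cpumask and carries a capacity.
struct sched_group {
	struct sched_group	*next;			/* Must be a circular list */
	atomic_t		ref;

	unsigned int		group_weight;
	struct sched_group_capacity *sgc;
	int			asym_prefer_cpu;	/* CPU of highest priority in group */

	/* The CPUs this group covers. */
	unsigned long		cpumask[];
};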
The load-balance logic is fairly involved. The context of a single balance attempt is carried in struct lb_env:
struct lb_env {
struct sched_domain *sd;
struct rq *src_rq;
int src_cpu;
int dst_cpu;
struct rq *dst_rq;
struct cpumask *dst_grpmask;
int new_dst_cpu;
enum cpu_idle_type idle;
long imbalance;
/* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus;
unsigned int flags;
unsigned int loop;
unsigned int loop_break;
unsigned int loop_max;
enum fbq_type fbq_type;
enum migration_type migration_type;
struct list_head tasks;
};
Type | Member | Description
---|---|---
struct sched_domain * | sd | The sched domain currently being balanced
int | dst_cpu | The current CPU (tasks may later be migrated onto it)
struct rq * | dst_rq | The runqueue of the current CPU
struct cpumask * | dst_grpmask | cpumask of the first sched group of the current sched domain
unsigned int | loop_break | Migrate at most this many tasks per batch; sched_nr_migrate_break defaults to 32
struct cpumask * | cpus | The load_balance_mask bitmap (the CPUs considered for balancing)
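At the top of load_balance(), the lb_env is filled in roughly as follows (abridged from kernel/sched/fair.c; the exact field list varies by version):
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *continue_balancing)
{
	/* ... */
	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);

	struct lb_env env = {
		.sd		= sd,
		.dst_cpu	= this_cpu,
		.dst_rq		= this_rq,
		.dst_grpmask	= sched_group_span(sd->groups),
		.idle		= idle,
		.loop_break	= sched_nr_migrate_break,
		.cpus		= cpus,
		.fbq_type	= all,
		.tasks		= LIST_HEAD_INIT(env.tasks),
	};
	/* ... */
}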
Two data structures come first: sd_lb_stats describes the statistics of a sched domain during load balancing, and sg_lb_stats describes the statistics of a sched group.
/*
* sd_lb_stats - Structure to store the statistics of a sched_domain
* during load balancing.
*/
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
unsigned int prefer_sibling; /* tasks should go to sibling first */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
struct sg_lb_stats local_stat; /* Statistics of the local group */
};
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
struct sg_lb_stats {
unsigned long avg_load; /*Avg load across the CPUs of the group */
unsigned long group_load; /* Total load over the CPUs of the group */
unsigned long group_capacity;
unsigned long group_util; /* Total utilization over the CPUs of the group */
unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
unsigned int sum_nr_running; /* Nr of tasks running in the group */
unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
unsigned int idle_cpus;
unsigned int group_weight;
enum group_type group_type;
unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
#endif
};
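These two structures cooperate inside find_busiest_group(): update_sd_lb_stats() walks every sched_group of the domain, fills an sg_lb_stats for it, remembers the local group and the busiest group, and accumulates the totals into sd_lb_stats. An abridged sketch of that loop (simplified from kernel/sched/fair.c; details differ across versions):
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
	struct sched_group *sg = env->sd->groups;
	struct sg_lb_stats *local = &sds->local_stat;
	struct sg_lb_stats tmp_sgs;
	int sg_status = 0;

	do {
		struct sg_lb_stats *sgs = &tmp_sgs;
		int local_group;

		/* the group containing dst_cpu is the "local" group */
		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
		if (local_group) {
			sds->local = sg;
			sgs = local;
		}

		/* accumulate load/util/nr_running over the CPUs of this group */
		update_sg_lb_stats(env, sg, sgs, &sg_status);

		/* remember the busiest non-local group seen so far */
		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
			sds->busiest = sg;
			sds->busiest_stat = *sgs;
		}

		sds->total_load += sgs->group_load;
		sds->total_capacity += sgs->group_capacity;

		sg = sg->next;
	} while (sg != env->sd->groups);
	/* ... */
}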
The load-balancing machinery starts with the registration of a softirq:
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
#ifdef CONFIG_NO_HZ_COMMON
nohz.next_balance = jiffies;
nohz.next_blocked = jiffies;
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */
}
void open_softirq(int nr, void (*action)(struct softirq_action *))
{
softirq_vec[nr].action = action;
}
This registers the SCHED_SOFTIRQ softirq; its handler is run_rebalance_domains.
On every scheduler tick, scheduler_tick() checks whether load balancing is due by calling trigger_load_balance().
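The relevant excerpt of scheduler_tick() (kernel/sched/core.c, abridged):
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	/* ... update the rq clock, run curr->sched_class->task_tick(), etc. ... */
#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq);
#endif
}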
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
void trigger_load_balance(struct rq *rq)
{
/* Don't need to rebalance while attached to NULL domain */
if (unlikely(on_null_domain(rq)))
return;
if (time_after_eq(jiffies, rq->next_balance)) // is it time for the next periodic balance?
raise_softirq(SCHED_SOFTIRQ);
nohz_balancer_kick(rq); // what is this for? see below
}
nohz_balancer_kick() is what kicks off nohz idle balance, which the next two sections describe in detail. At first glance the comment above trigger_load_balance() looks wrong, because the function triggers not only periodic balancing but also nohz idle balance. In essence, though, nohz idle balance is just another form of periodic load balancing: once a CPU goes idle it stops producing ticks, so a busy CPU that still gets ticks is asked to trigger the balance on its behalf. Both tick balance and nohz idle balance are ultimately handled through the SCHED_SOFTIRQ softirq and end up executing run_rebalance_domains().
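The softirq handler first attempts nohz idle balance on behalf of the tick-stopped CPUs; if that did the work it returns, otherwise it falls through to the normal rebalance_domains() path (abridged from kernel/sched/fair.c):
static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
{
	struct rq *this_rq = this_rq();
	enum cpu_idle_type idle = this_rq->idle_balance ?
						CPU_IDLE : CPU_NOT_IDLE;

	/*
	 * If this CPU has a pending nohz_balance_kick, then do the
	 * balancing on behalf of the other idle CPUs whose ticks are stopped.
	 */
	if (nohz_idle_balance(this_rq, idle))
		return;

	/* normal load balance */
	update_blocked_averages(this_rq->cpu);
	rebalance_domains(this_rq, idle);
}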
/*
* sched_exec - execve() is a valuable balancing opportunity, because at
* this point the task has the smallest effective memory and cache footprint.
*/
void sched_exec(void)
{
struct task_struct *p = current;
unsigned long flags;
int dest_cpu;
raw_spin_lock_irqsave(&p->pi_lock, flags); // take p->pi_lock; migration_cpu_stop() below is also concerned with this lock
dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); // pick the destination CPU
if (dest_cpu == smp_processor_id()) // if the chosen CPU is the current one, just unlock and return
goto unlock;
if (likely(cpu_active(dest_cpu))) {
struct migration_arg arg = { p, dest_cpu };
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); // migrate the task to the destination CPU
return;
}
unlock:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
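The argument handed to the stopper callback is simply the task plus its destination CPU (struct migration_arg in kernel/sched/core.c; newer kernels add more fields):
struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};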
What exactly does the call stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); do?
stop_one_cpu() stops a given CPU in order to run a function on it; the implementation differs between SMP and non-SMP kernels. On non-SMP, stop_one_cpu() simply disables preemption, runs the function, and re-enables preemption, as shown below.
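The UP variant (include/linux/stop_machine.h) is essentially:
static inline int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
	int ret = -ENOENT;
	preempt_disable();
	if (cpu == smp_processor_id())
		ret = fn(arg);
	preempt_enable();
	return ret;
}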
On SMP, stop_one_cpu() is implemented as follows. It returns only after fn has finished executing, but it does not guarantee that the CPU stays online while fn runs.
int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
struct cpu_stop_done done;
struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
cpu_stop_init_done(&done, 1);
if (!cpu_stop_queue_work(cpu, &work))
return -ENOENT;
/*
* In case @cpu == smp_proccessor_id() we can avoid a sleep+wakeup
* cycle by doing a preemption:
*/
cond_resched();
wait_for_completion(&done.completion);
return done.ret;
}
The interesting part of migration_cpu_stop() is the two lock acquisitions, p->pi_lock and rq->lock, and the task_rq(p) == rq check that relies on them; why the check has to be done that way is not obvious (see the question raised in the comment below). One way to read it: the stopper was queued on whatever task_cpu(p) was at sched_exec() time, and by the time it actually runs the task may already have moved to another CPU; since only this CPU's rq->lock is held, it would not be safe to manipulate some other runqueue, so in that case the stopper simply does nothing.
/*
* migration_cpu_stop - this will be executed by a highprio stopper thread
* and performs thread migration by bumping thread off CPU then
* 'pushing' onto another runqueue.
*/
static int migration_cpu_stop(void *data)
{
struct migration_arg *arg = data;
struct task_struct *p = arg->task;
struct rq *rq = this_rq();
struct rq_flags rf;
/*
* The original target CPU might have gone down and we might
* be on another CPU but it doesn't matter.
*/
local_irq_disable();
/*
* We need to explicitly wake pending tasks before running
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
flush_smp_call_function_from_idle();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
/* Question: why does task_rq(p) have to equal rq before we migrate? rq is the
 * runqueue of the CPU currently running this stopper and we hold its rq->lock;
 * why would task_rq(p) != rq conflict with that? Perhaps the migration needs the
 * lock of the rq p sits on as well as the lock of the target rq, so if
 * task_rq(p) != rq, could we not simply take both rq->locks?
 *
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
*/
if (task_rq(p) == rq) { // p can only be migrated if its rq is this CPU's rq
if (task_on_rq_queued(p)) // and only if p is actually queued on that rq
rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
else
p->wake_cpu = arg->dest_cpu;
}
rq_unlock(rq, &rf);
raw_spin_unlock(&p->pi_lock);
local_irq_enable();
return 0;
}
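If the check passes and the task is queued, __migrate_task() performs the actual move by calling move_queued_task(), which dequeues the task from this rq, updates its CPU, and enqueues it on the destination rq (abridged from kernel/sched/core.c):
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
				 struct task_struct *p, int dest_cpu)
{
	/* Affinity changed (again). */
	if (!is_cpu_allowed(p, dest_cpu))
		return rq;

	update_rq_clock(rq);
	rq = move_queued_task(rq, rf, p, dest_cpu);

	return rq;
}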
/*
* wake_up_new_task - wake up a newly created task for the first time.
*
* This function will do some initial scheduler statistics housekeeping
* that must be done for every newly created context, then puts the task
* on the runqueue and wakes it.
*/
void wake_up_new_task(struct task_struct *p)
{
struct rq_flags rf;
struct rq *rq;
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_ptr can change in the fork path
* - any previously selected CPU might disappear through hotplug
*
* Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
* as we're not fully set-up yet.
*/
p->recent_used_cpu = task_cpu(p);
rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
post_init_entity_util_avg(p);
activate_task(rq, p, ENQUEUE_NOCLOCK);
trace_sched_wakeup_new(p);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
/*
* Nothing relies on rq->lock after this, so its fine to
* drop it.
*/
rq_unpin_lock(rq, &rf);
p->sched_class->task_woken(rq, p);
rq_repin_lock(rq, &rf);
}
#endif
task_rq_unlock(rq, p, &rf);
}
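Both paths, sched_exec() with SD_BALANCE_EXEC and wake_up_new_task() with SD_BALANCE_FORK, funnel through the same select_task_rq() helper, which delegates to the scheduling class whenever the task is allowed on more than one CPU (abridged from kernel/sched/core.c; the signature changes across kernel versions):
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
	lockdep_assert_held(&p->pi_lock);

	if (p->nr_cpus_allowed > 1)
		cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
	else
		cpu = cpumask_any(p->cpus_ptr);

	/*
	 * If the chosen CPU is not allowed or not online, fall back to a
	 * valid CPU via select_fallback_rq().
	 */
	if (unlikely(!is_cpu_allowed(p, cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);

	return cpu;
}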