Linux进程管理之SMP负载平衡


------------------------------------------
本文系本站原创,欢迎转载!
转载请注明出处:http://ericxiao.cublog.cn/
------------------------------------------
一:前言
之前在分析Cpuset的时候提起过cpu负载平衡(load balance),当时因为分析的对象是cpuset,所以忽略了负载平衡的实现,在本节的分析中,我们来深入分析这一个过程.鉴于篇幅问题,这里只分析SMP系统的负载平衡,至于超线程和NUMA的情况,这里就不做介绍了.另外,在本文的分析中还会涉及到之前对CFS以及CFS组调度的一些知识.如果有对这些不太熟悉的,建议先阅读本站有关CFS调度的两篇文章.有关调度域,调度组等基本概念,请参阅linux-2.6.29-rc5/Documentation/scheduler/sched-domains.txt.
本文分析的源代码是基于linux kernel 2.6.29-rc5.分析的代码基本上位于kernel/sched.c中.
 
二:SMP中负载平衡的初始化
Load balance的初始化在下面的过程中:
kernel_init() -> sched_init_smp() -> arch_init_sched_domains():
/*
 * Set up the initial scheduler domains for the CPUs in cpu_map.
 *
 * Starts with a single domain set (ndoms_cur = 1) holding every CPU of
 * cpu_map that is not in cpu_isolated_map, builds the domains for it and
 * registers the sched-domain sysctl entries.
 *
 * Returns the result of build_sched_domains().
 */
static int arch_init_sched_domains(const struct cpumask *cpu_map)
{
    int err;
 
    /* Empty on the x86 platform (according to the article). */
    arch_update_cpu_topology();
    /* Number of entries in doms_cur: at init time all CPUs share one domain set. */
    ndoms_cur = 1;
    /* Allocate the cpu bitmap; if that fails, use fallback_doms directly. */
    doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
    if (!doms_cur)
        doms_cur = fallback_doms;
    /*
     * Every CPU in cpu_map except the isolated ones, i.e. those named with
     * the "isolcpus=" boot parameter.
     */
    cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
    dattr_cur = NULL;
    err = build_sched_domains(doms_cur);
    register_sched_domain_sysctl();
 
    return err;
}
里面的核心操作为build_sched_domains().代码如下:
build_sched_domains() -> __build_sched_domains():
/*
 * Build the scheduling domains and groups for every CPU in cpu_map and
 * attach them to the per-CPU run queues.
 *
 * As quoted here (the CONFIG_NUMA, CONFIG_SCHED_SMT and CONFIG_SCHED_MC parts
 * are removed) the function makes four passes: initialise each CPU's
 * phys_domains entry, build the physical groups, initialise the groups'
 * cpu power, and attach each domain together with the root domain rd.
 *
 * Returns 0 on success; err keeps its initial -ENOMEM when a cpumask or the
 * root domain cannot be allocated.
 */
static int __build_sched_domains(const struct cpumask *cpu_map,
                 struct sched_domain_attr *attr)
{
    int i, err = -ENOMEM;
    struct root_domain *rd;
    cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
        tmpmask;
 
    /*
     * Allocate the temporary masks; each failure jumps to the label that
     * frees only what was allocated before it.
     */
    if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
        goto free_notcovered;
    if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
        goto free_nodemask;
    if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
        goto free_this_sibling_map;
    if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
        goto free_this_core_map;
    if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
        goto free_send_covered;
 
 
    rd = alloc_rootdomain();
    if (!rd) {
        printk(KERN_WARNING "Cannot alloc root domain\n");
        goto free_sched_groups;
    }
 
    /*
     * Set up domains for cpus specified by the cpu_map.
     */
    for_each_cpu(i, cpu_map) {
        struct sched_domain *sd = NULL, *p;
 
        cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
        /* sd is still NULL here, so p (the parent) is NULL in this excerpt. */
        p = sd;
        sd = &per_cpu(phys_domains, i).sd;
        SD_INIT(sd, CPU);
        set_domain_attribute(sd, attr);
        cpumask_copy(sched_domain_span(sd), nodemask);
        sd->parent = p;
        if (p)
            p->child = sd;
        cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
    }
 
    /* Set up physical groups */
    for (i = 0; i < nr_node_ids; i++) {
        cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
        if (cpumask_empty(nodemask))
            continue;
 
        init_sched_build_groups(nodemask, cpu_map,
                    &cpu_to_phys_group,
                    send_covered, tmpmask);
    }
 
    /* Initialise the cpu power of each CPU's groups. */
    for_each_cpu(i, cpu_map) {
        struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
 
        init_sched_groups_power(i, sd);
    }
 
 
    /* Attach the domains */
    for_each_cpu(i, cpu_map) {
        struct sched_domain *sd;
        sd = &per_cpu(phys_domains, i).sd;
        cpu_attach_domain(sd, rd, i);
    }
 
    err = 0;
 
    /* The success path falls through the same cleanup chain as the error paths. */
free_tmpmask:
    free_cpumask_var(tmpmask);
free_send_covered:
    free_cpumask_var(send_covered);
free_this_core_map:
    free_cpumask_var(this_core_map);
free_this_sibling_map:
    free_cpumask_var(this_sibling_map);
free_nodemask:
    free_cpumask_var(nodemask);
free_notcovered:
    return err;
 
    /* Root domain allocation failed: all temporary masks must be released. */
free_sched_groups:
    goto free_tmpmask;
}
因为篇幅原因,上面所列出的代码除去了CONFIG_NUMA, CONFIG_SCHED_SMT, CONFIG_SCHED_MC的选择编译部份.
其实,上面的代码不外乎做了四件事:初始化调度域,初始化调度组,调度组cpu power的初始化以及调度域和运行队列的关联.下面来依次分析这四个过程:
 
2.1:调度域的初始化
调度域的初始化代码如下:
/*遍历cpu_map中的所有cpu*/
for_each_cpu(i, cpu_map) {
        struct sched_domain *sd = NULL, *p;
        /*在没有配置NUMA的情况下,cpumask_of_node()返回的cpu_online_mask
*所以这里返回的依然是cpu_map
*/
        cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
        /*sd==NULL,p=NULL*/
        p = sd;
        /*每个cpu都有一个sched domain*/
        sd = &per_cpu(phys_domains, i).sd;
        /*初始化sd*/
        SD_INIT(sd, CPU);
        set_domain_attribute(sd, attr);
        /*该sd中包括的cpu,即为cpu_map.即包含了所有的cpu.也就是说在
*在所有cpu中都进行load balance
*/
        cpumask_copy(sched_domain_span(sd), nodemask);
        /*因为p==NULL,所以,sd->parent=NULL*/
        sd->parent = p;
        if (p)
            p->child = sd;
        /*初始化sd中的调度组*/
        cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
    }
在这个代码片段中,我们先来看一下phys_domains的定义:
static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
它是一个per-cpu变量,即每个cpu都含有这个的一个变量,它的类型为: struct static_sched_domain,定义如下:
/*
 * Statically allocated scheduling domain: the sched_domain itself followed by
 * a CONFIG_NR_CPUS-bit bitmap named span.
 * NOTE(review): presumably span is the storage behind sched_domain_span(sd) --
 * confirm against the kernel source.
 */
struct static_sched_domain {
    struct sched_domain sd;
    DECLARE_BITMAP(span, CONFIG_NR_CPUS);
};
其实,这个per-cpu变量中的sd成员就是该cpu的基本调度域.它是静态定义的一个结构.
 
SD_INIT(sd, CPU)的定义如下:
#define SD_INIT(sd, type)   sd_init_##type(sd)
所以,SD_INIT(sd, CPU)就是sd_init_CPU(sd).如下示:
/*
 * Generates the function sd_init_<type>(): it zeroes *sd, copies in the
 * SD_<type>_INIT template, records the level SD_LV_<type> and lets
 * SD_INIT_NAME handle the domain name.
 */
#define SD_INIT_FUNC(type)  \
static noinline void sd_init_##type(struct sched_domain *sd)    \
{                               \
    memset(sd, 0, sizeof(*sd));             \
    *sd = SD_##type##_INIT;                 \
    sd->level = SD_LV_##type;               \
    SD_INIT_NAME(sd, type);                 \
}
 
/* Instantiate sd_init_CPU(), the initialiser for CPU-level domains. */
SD_INIT_FUNC(CPU)
从此可以得知:
sd_init_CPU(sd)就等价于:
{
    memset(sd, 0, sizeof(*sd));
    *sd = SD_CPU_INIT;
    sd->level = SD_LV_CPU;
    SD_INIT_NAME(sd, CPU);
}
SD_CPU_INIT如下:
/*
 * Compound-literal template holding the initial field values of a CPU-level
 * sched_domain (balancing intervals, busy factor, imbalance percentage, load
 * indexes and the balancing flags). last_balance starts at the current
 * jiffies value.
 */
#define SD_CPU_INIT (struct sched_domain) {     \
    .min_interval       = 1,            \
    .max_interval       = 4,            \
    .busy_factor        = 64,           \
    .imbalance_pct      = 125,          \
    .cache_nice_tries   = 1,            \
    .busy_idx       = 2,            \
    .idle_idx       = 1,            \
    .newidle_idx        = 2,            \
    .wake_idx       = 1,            \
    .forkexec_idx       = 1,            \
    .flags          = SD_LOAD_BALANCE   \
                | SD_BALANCE_EXEC   \
                | SD_BALANCE_FORK   \
                | SD_WAKE_AFFINE    \
                | SD_WAKE_BALANCE   \
                | sd_balance_for_package_power()\
                | sd_power_saving_flags(),\
    .last_balance       = jiffies,      \
    .balance_interval   = 1,            \
}
它对调度域的各成员进行了初始化.这些初始化的值就先放到这里,等以后要用的时候再来对照.
/*
 * With CONFIG_SCHED_DEBUG the domain records the stringified type as its
 * name; otherwise SD_INIT_NAME expands to an empty statement.
 */
#ifdef CONFIG_SCHED_DEBUG
# define SD_INIT_NAME(sd, type)     sd->name = #type
#else
# define SD_INIT_NAME(sd, type)     do { } while (0)
#endif
在定义了CONFIG_SCHED_DEBUG的情况下,对sd的名称进行赋值,在这里即为"CPU".
 
返回到上面的代码片段中,还剩下一个子函数,即: cpu_to_phys_group().代码如下:
/*
 * Map a CPU to its physical scheduling group.
 *
 * In this excerpt the group number is simply the CPU number. When sg is
 * non-NULL, *sg is set to that CPU's per-cpu sched_group_phys entry.
 * cpu_map and mask are not used by the lines shown.
 *
 * Returns the group number.
 */
static int
cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
          struct sched_group **sg, struct cpumask *mask)
{
    int group;
 
    group = cpu;
    if (sg)
        *sg = &per_cpu(sched_group_phys, group).sg;
    return group;
}
这个函数也很简单,即初始化cpu基本调度域的调度组为&per_cpu(sched_group_phys, group).sg;
sched_group_phys定义如下:
static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
/*
 * Statically allocated scheduling group: the sched_group itself followed by a
 * CONFIG_NR_CPUS-bit bitmap named cpus.
 * NOTE(review): presumably cpus is the storage behind sched_group_cpus(sg) --
 * confirm against the kernel source.
 */
struct static_sched_group {
    struct sched_group sg;
    DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
};
其实sg就是表示该cpu基本调度域的第一个调度组.
 
2.2:调度组的初始化:
调度组的初始化如下面代码片段如示:
    for (i = 0; i < nr_node_ids; i++) {
        cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
        if (cpumask_empty(nodemask))
            continue;
 
        init_sched_build_groups(nodemask, cpu_map,
                    &cpu_to_phys_group,
                    send_covered, tmpmask);
    }
在没有配置NUMA的情况下,nr_node_ids为1,也就是说该循环只会执行一次,核心操作为init_sched_build_groups().代码如下:
/*
 * Partition the CPUs of span into scheduling groups and link the groups into
 * a circular list.
 *
 * span:     CPUs to distribute over groups
 * cpu_map:  passed through to group_fn
 * group_fn: returns the group number of a CPU and, when its sg argument is
 *           non-NULL, stores that group's sched_group there
 * covered:  scratch mask (cleared here) of CPUs already placed in a group
 * tmpmask:  scratch mask passed through to group_fn
 */
static void
init_sched_build_groups(const struct cpumask *span,
            const struct cpumask *cpu_map,
            int (*group_fn)(int cpu, const struct cpumask *cpu_map,
                    struct sched_group **sg,
                    struct cpumask *tmpmask),
            struct cpumask *covered, struct cpumask *tmpmask)
{
    struct sched_group *first = NULL, *last = NULL;
    int i;
 
    /* Start with an empty covered mask. */
    cpumask_clear(covered);
 
    /* Walk every CPU in span. */
    for_each_cpu(i, span) {
        struct sched_group *sg;
        /*
         * With cpu_to_phys_group as group_fn this yields i itself and the
         * sched_group of cpu i.
         */
        int group = group_fn(i, cpu_map, &sg, tmpmask);
        int j;
 
        /*
         * If covered already contains cpu i it has been handled as part of
         * an earlier group; move on to the next CPU.
         */
        if (cpumask_test_cpu(i, covered))
            continue;
 
        /* Reset the cpumask of sg. */
        cpumask_clear(sched_group_cpus(sg));
        /* Initialise sg->__cpu_power to 0. */
        sg->__cpu_power = 0;
 
        for_each_cpu(j, span) {
            /*
             * Keep only CPUs that map to the same group; with
             * cpu_to_phys_group that is the case for j == i only.
             */
            if (group_fn(j, cpu_map, NULL, tmpmask) != group)
                continue;
 
            /* Mark cpu j as covered. */
            cpumask_set_cpu(j, covered);
            /* Add cpu j to the group's cpumask. */
            cpumask_set_cpu(j, sched_group_cpus(sg));
        }
        /* Chain the groups together in the order they are created. */
        if (!first)
            first = sg;
        if (last)
            last->next = sg;
        last = sg;
    }
    /* Point the tail back at the head so the list forms a closed ring. */
    last->next = first;
}
注意在SMP的情况下,调用该函数的group_fn参数为cpu_to_phys_group.这个函数已经在前面分析过了.
这个函数比较简单,对照添加在代码中的注释应该很容易看懂,所以就不再详细分析了.
2.3:调度组cpu power的初始化
Cpu power的初始化如下面代码片段如示:
for_each_cpu(i, cpu_map) {
        struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
 
        init_sched_groups_power(i, sd);
}
即对每个cpu的基本调度域都执行init_sched_groups_power().代码如下:
tatic void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
    struct sched_domain *child;
    struct sched_group *group;
 
    WARN_ON(!sd || !sd->groups);
 
    /*每个cpu的sd初始化一次*/
    if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
        return;
 
    child = sd->child;
 
    sd->groups->__cpu_power = 0;
 
    /*
     * For perf policy, if the groups in child domain share resources
     * (for example cores sharing some portions of the cache hierarchy
     * or SMT), then set this domain groups cpu_power such that each group
     * can handle only one task, when there are other idle groups in the
     * same sched domain.
     */
     /*对于SMP而言,sd->child是为空的*/
    if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
               (child->flags &
            (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
        sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
        return;
    }
 
    /*
     * add cpu_power of each child group to this groups cpu_power
     */
    group = child->groups;
    do {
        sg_inc_cpu_power(sd->groups, group->__cpu_power);
        group = group->next;
    } while (group != child->groups);
}
在这里,应该来讲讲什么叫调度组的cpu power.cpu power就是调度组的负载能力,例如在超线程的cpu中,主核与超线程的虚拟核它们的负载能力是不一样的,在load balance的时候,会根据各组的cpu power来决定应该分配给该组多少的负载才合适.在SMP系统中,各CPU是完全对称的,也就是说,它们的处理能力都是一样的,因此,在这里,每个cpu对应的调度组的cpu power都由sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE)设置为SCHED_LOAD_SCALE.
 
2.4:运行队列和调度域的关联
运行队列和调度域的关联在以下代码片段中完成:
for_each_cpu(i, cpu_map) {
    struct sched_domain *sd;
    sd = &per_cpu(phys_domains, i).sd;
cpu_attach_domain(sd, rd, i);
}
cpu_attach_domain()如下:
/*
 * Attach scheduling domain sd and root domain rd to the run queue of cpu.
 *
 * Degenerate domains are first removed from the parent chain, then the run
 * queue is attached to rd and rq->sd is published with rcu_assign_pointer().
 */
static void
cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
    struct rq *rq = cpu_rq(cpu);
    struct sched_domain *tmp;
 
    /* Remove the sched domains which do not contribute to scheduling. */
    /* On plain SMP the parent is NULL, so the loop exits right after entry. */
    for (tmp = sd; tmp; ) {
        struct sched_domain *parent = tmp->parent;
        if (!parent)
            break;
 
        if (sd_parent_degenerate(tmp, parent)) {
            tmp->parent = parent->parent;
            if (parent->parent)
                parent->parent->child = tmp;
        } else
            tmp = tmp->parent;
    }
 
    /*
     * If the base domain itself is degenerate, drop it in favour of its
     * parent. The article notes this has no effect on plain SMP because
     * sd->parent does not exist there.
     */
    if (sd && sd_degenerate(sd)) {
        sd = sd->parent;
        if (sd)
            sd->child = NULL;
    }
 
    sched_domain_debug(sd, cpu);
 
    /* Attach the run queue to the root domain (the article: sets rq->rd = rd). */
    rq_attach_root(rq, rd);
   
    /* Publish rq->sd = sd: the cpu's run queue now points at its base domain. */
    rcu_assign_pointer(rq->sd, sd);
}
该函数也比较简单,不做详细分析了.
 
上面的初始化过程可能让人看得目不暇接,画了个图,以方便理解,如下:
 
三:SMP负载平衡操作
 
3.1:tick中断中的load balance
在sched_init()中,有如下操作:
    open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
即为load balance注册了一个软中断处理.什么时候才会将这个软中断的执行打开呢,我们来看一下tick中断时的处理.
在tick中断处理中,会调用scheduler_tick().其中和SMP有关的代码片段如下:
void scheduler_tick(void)
{
    ......
    rq->idle_at_tick = idle_cpu(cpu);
    trigger_load_balance(rq, cpu);
idle_cpu()定义如下:
/*
 * Report whether cpu is idle.
 *
 * Returns non-zero when the task currently running on cpu is the idle task
 * of that cpu's run queue, 0 otherwise.
 */
int idle_cpu(int cpu)
{
    return cpu_curr(cpu) == cpu_rq(cpu)->idle;
}
即如果当前cpu是空闲的,那么rq->idle_at_tick会被设置为1.
trigger_load_balance()代码如下:
/*
 * Raise SCHED_SOFTIRQ once jiffies has reached rq->next_balance.
 * The CONFIG_NO_HZ part is elided in this excerpt; cpu is not used by the
 * lines shown.
 */
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
    ………
#endif
    if (time_after_eq(jiffies, rq->next_balance))
        raise_softirq(SCHED_SOFTIRQ);
}
忽略掉选择编译的CONFIG_NO_HZ.
从上面的代码中可以看出.如果jiffies大于或者等于rq->next_balance的时候,就会调用raise_softirq()将load balance的软中断打开.
rq->next_balance表示下次load balance操作的时间戳.这样是为了避免进行load balance的操作过于频繁,影响系统性能.
 
该软中断对应的处理函数为run_rebalance_domains().代码如下:
/*
 * Softirq handler for load balancing.
 *
 * Rebalances the domains of the current CPU, passing CPU_IDLE when the run
 * queue recorded idle_at_tick and CPU_NOT_IDLE otherwise. The CONFIG_NO_HZ
 * part is elided in this excerpt; h is not used by the lines shown.
 */
static void run_rebalance_domains(struct softirq_action *h)
{
    int this_cpu = smp_processor_id();
    struct rq *this_rq = cpu_rq(this_cpu);
    enum cpu_idle_type idle = this_rq->idle_at_tick ?
                        CPU_IDLE : CPU_NOT_IDLE;
 
    rebalance_domains(this_cpu, idle);
#ifdef CONFIG_NO_HZ
    ......
    ......
#endif
}
This_rq->idle_at_tick表示当前的运行队列是否空闲,如果空闲,idle为CPU_IDLE,否则为CPU_NOT_IDLE.
 
rebalance_domains()代码如下:
/*
 * Run load balancing on every domain of cpu whose balancing interval has
 * expired, and compute when the run queue has to be balanced next.
 *
 * cpu:  CPU whose domains are walked
 * idle: idle state of that CPU; a non-idle CPU balances less often because
 *       its interval is multiplied by sd->busy_factor
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
    int balance = 1;
    struct rq *rq = cpu_rq(cpu);
    unsigned long interval;
    struct sched_domain *sd;
    /* Earliest time when we have to do rebalance again */
    unsigned long next_balance = jiffies + 60*HZ;
    int update_next_balance = 0;
    int need_serialize;
    cpumask_var_t tmp;
 
    /* Fails alloc?  Rebalancing probably not a priority right now. */
    if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
        return;
 
    /* Walk the domains bottom-up, starting from the base domain. */
    for_each_domain(cpu, sd) {
 
        /* Domains without SD_LOAD_BALANCE are not load balanced. */
        if (!(sd->flags & SD_LOAD_BALANCE))
            continue;
        /* Work out the balancing interval. */
        interval = sd->balance_interval;
        /* A CPU that is not idle uses a longer interval. */
        if (idle != CPU_IDLE)
            interval *= sd->busy_factor;
 
        /* scale ms to jiffies */
        interval = msecs_to_jiffies(interval);
        /* Clamp the interval to a sane range. */
        if (unlikely(!interval))
            interval = 1;
        if (interval > HZ*NR_CPUS/10)
            interval = HZ*NR_CPUS/10;
 
        /* SD_SERIALIZE asks for load balancing of this domain to be serialized. */
        need_serialize = sd->flags & SD_SERIALIZE;
        /*
         * A serialized domain is only balanced while holding the balancing
         * spinlock; if the lock is taken, skip the attempt for this domain.
         */
        if (need_serialize) {
            if (!spin_trylock(&balancing))
                goto out;
        }
 
        /*
         * Balance when the interval has elapsed:
         * time_after_eq(a, b) is true when a is at or after b.
         */
        if (time_after_eq(jiffies, sd->last_balance + interval)) {
            if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
                /*
                 * We've pulled tasks over so either we're no
                 * longer idle, or one of our SMT siblings is
                 * not idle.
                 */
                idle = CPU_NOT_IDLE;
            }
            sd->last_balance = jiffies;
        }
 
        /* Release the balancing lock if it was taken above. */
        if (need_serialize)
            spin_unlock(&balancing);
out:
        /* Track the earliest next balance time over all domains. */
        if (time_after(next_balance, sd->last_balance + interval)) {
            next_balance = sd->last_balance + interval;
            update_next_balance = 1;
        }
 
        /*
         * Stop the load balance at this level. There is another
         * CPU in our sched group which is doing load balancing more
         * actively.
         */
        if (!balance)
            break;
    }
 
    /*
     * next_balance will be updated only when there is a need.
     * When the cpu is attached to null domain for ex, it will not be
     * updated.
     */
    if (likely(update_next_balance))
        rq->next_balance = next_balance;
 
    free_cpumask_var(tmp);
}
从上面的代码可以看出,在当前CPU忙和空闲的情况下,执行load balance的频率是不一样的,在空闲情况下,会以很高的频率执行load balance.
Load balance的核心操作在load_balance()中进行.
代码如下:
/*
 * Balance domain sd on behalf of this_cpu by pulling tasks from the busiest
 * run queue of the busiest group.
 *
 * this_cpu / this_rq: destination CPU and its run queue
 * sd:                 domain to balance
 * idle:               idle state of this_cpu
 * balance:            filled in by find_busiest_group(); 0 means this CPU
 *                     should not balance this domain
 * cpus:               scratch mask of CPUs still eligible as a source
 *
 * Returns ld_moved: the result of move_tasks(), 0 when nothing was moved or
 * the domain is balanced, or -1 in the SD_SHARE_CPUPOWER cases below.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
            struct sched_domain *sd, enum cpu_idle_type idle,
            int *balance, struct cpumask *cpus)
{
    int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
    struct sched_group *group;
    unsigned long imbalance;
    struct rq *busiest;
    unsigned long flags;
 
    /* Start with every CPU set in cpus. */
    cpumask_setall(cpus);
 
    /*
     * When power savings policy is enabled for the parent domain, idle
     * sibling can pick up load irrespective of busy siblings. In this case,
     * let the state of idle sibling percolate up as CPU_IDLE, instead of
     * portraying it as CPU_NOT_IDLE.
     */
    /* On plain SMP the domain does not carry SD_SHARE_CPUPOWER (per the article). */
    if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        sd_idle = 1;
 
    schedstat_inc(sd, lb_count[idle]);
 
redo:
    /* Refresh the shares of the task groups in this domain to keep them balanced. */
    update_shares(sd);
    /* Find the busiest scheduling group. */
    group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                   cpus, balance);
 
    /* *balance == 0: this CPU should not do the load balancing. */
    if (*balance == 0)
        goto out_balanced;
 
    /* No busiest group other than the local one was found. */
    if (!group) {
        schedstat_inc(sd, lb_nobusyg[idle]);
        goto out_balanced;
    }
 
    /* Find the busiest run queue inside that group. */
    busiest = find_busiest_queue(group, idle, imbalance, cpus);
   
    /* None found. */
    if (!busiest) {
        schedstat_inc(sd, lb_nobusyq[idle]);
        goto out_balanced;
    }
 
    /* The busiest queue can never be the local one; treat that as a bug. */
    BUG_ON(busiest == this_rq);
 
    schedstat_add(sd, lb_imbalance[idle], imbalance);
 
    ld_moved = 0;
   
    /*
     * Only try to pull tasks when the busiest queue runs more than one task.
     * With nr_running <= 1 there is nothing to move from it, although its
     * group is still considered busy.
     */
    if (busiest->nr_running > 1) {
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        local_irq_save(flags);
        double_rq_lock(this_rq, busiest);
        ld_moved = move_tasks(this_rq, this_cpu, busiest,
                      imbalance, sd, idle, &all_pinned);
        double_rq_unlock(this_rq, busiest);
        local_irq_restore(flags);
 
        /*
         * some other cpu did the load balance for us.
         */
        /* Tasks were moved, so this_cpu has to reschedule. */
        if (ld_moved && this_cpu != smp_processor_id())
            resched_cpu(this_cpu);
 
        /* All tasks on this runqueue were pinned by CPU affinity */
        /*
         * Drop the busiest CPU from the set of candidate CPUs and run the
         * balancing again.
         */
        if (unlikely(all_pinned)) {
            cpumask_clear_cpu(cpu_of(busiest), cpus);
            /* With no candidate CPU left there is nothing more to balance. */
            if (!cpumask_empty(cpus))
                goto redo;
            goto out_balanced;
        }
    }
 
    /* Nothing was moved: either move_tasks failed or it was never attempted. */
    if (!ld_moved) {
        schedstat_inc(sd, lb_failed[idle]);
        /* Count the failed attempt. */
        sd->nr_balance_failed++;
 
        /* Once the failure count exceeds cache_nice_tries + 2, start active balancing. */
        if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
 
            spin_lock_irqsave(&busiest->lock, flags);
 
            /* don't kick the migration_thread, if the curr
             * task on busiest cpu can't be moved to this_cpu
             */
            if (!cpumask_test_cpu(this_cpu,
                          &busiest->curr->cpus_allowed)) {
                spin_unlock_irqrestore(&busiest->lock, flags);
                all_pinned = 1;
                goto out_one_pinned;
            }
 
            if (!busiest->active_balance) {
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;
                active_balance = 1;
            }
            spin_unlock_irqrestore(&busiest->lock, flags);
            if (active_balance)
                wake_up_process(busiest->migration_thread);
 
            /*
             * We've kicked active balancing, reset the failure
             * counter.
             */
            /*
             * The counter is left just below the trigger threshold rather
             * than at 0, so two further failed attempts re-enter this
             * active-balance path.
             */
            sd->nr_balance_failed = sd->cache_nice_tries+1;
        }
    } else
        /* Tasks were moved successfully: clear the failure count. */
        sd->nr_balance_failed = 0;
 
    /*
     * active_balance is 0 when this call did not kick the migration thread.
     * The domain was found unbalanced, so go back to the shortest balancing
     * interval.
     */
    if (likely(!active_balance)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval;
    }
    /*
     * Otherwise the migration thread has just been woken up by this call and
     * is taking care of the load, so balancing can back off to a longer
     * interval.
     */
    else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * move_tasks).
         */
        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2;
    }
 
    if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;
 
    goto out;
 
/*
 * No load balancing is needed; nr_balance_failed is cleared in this case
 * as well.
 */
out_balanced:
    schedstat_inc(sd, lb_balanced[idle]);
   
    sd->nr_balance_failed = 0;
 
out_one_pinned:
    /* tune up the balancing interval */
    if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
            (sd->balance_interval < sd->max_interval))
        sd->balance_interval *= 2;
 
    if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
        !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
        ld_moved = -1;
    else
        ld_moved = 0;
out:
    /* If ld_moved is non-zero, refresh the task-group shares of the domain again. */
    if (ld_moved)
        update_shares(sd);
    return ld_moved;
}
该函数在调度域sd上进行load balance操作,将较重的负载迁移到本地cpu(this_cpu).然后返回迁移的负载量.另外,如果不需要在this_cpu上做load balance的动作,* balance会返回0.
 
对照代码中的注释可以对该函数的运行有个大致的了解,下面就来详细分析这个函数.
其实,这个函数不外乎做了这几样事情:首先更新调度域中进程组的shares值,找到任务最繁重的调度组,再从调度组中找到负载最重的cpu,然后再将该cpu上的进程迁移到本地cpu上.下面一一对它们进行分析.
 
3.1.1:更新调度域中进程组的shares
这个过程是在update_shares()函数中完成的,代码如下:
/*
 * Recompute the task-group shares for domain sd, rate-limited by
 * sysctl_sched_shares_ratelimit.
 *
 * When enough time has passed since sd->last_update, the whole task-group
 * tree is walked with tg_nop on the way down and tg_shares_up on the way up.
 */
static void update_shares(struct sched_domain *sd)
{
    u64 now = cpu_clock(raw_smp_processor_id());
    s64 elapsed = now - sd->last_update;
 
    /*
     * A threshold avoids updating too often.
     * NOTE(review): the article states a default of 0.25 s; the kernel
     * default is believed to be 0.25 ms (250000 ns) -- confirm against
     * kernel/sched.c.
     */
    if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
        sd->last_update = now;
        walk_tg_tree(tg_nop, tg_shares_up, sd);
    }
}
sd->last_update是上次更新的时间戳,只有当距上次更新的时间超过sysctl_sched_shares_ratelimit才会进行update shares的动作,这样是为了避免过于频繁的更新,影响系统性能.顺便说句题外话,sysctl_XXX都是一些调度器参数.可以在用户空间使用sysctl进行查询和修改.
流程转入到walk_tg_tree(),代码如下:
/*
 * Iterate the full task-group tree starting at root_task_group, calling down
 * when a group is first entered and up when it is left for the last time.
 *
 * The walk is iterative and runs under rcu_read_lock(). A non-zero return
 * value from either visitor stops the walk and is returned; otherwise the
 * result is 0.
 */
static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
{
    struct task_group *parent, *child;
    int ret;
 
    rcu_read_lock();
    parent = &root_task_group;
down:
    /* First visit of this group. */
    ret = (*down)(parent, data);
    if (ret)
        goto out_unlock;
    list_for_each_entry_rcu(child, &parent->children, siblings) {
        /* Descend into the child: it becomes the current group. */
        parent = child;
        goto down;
 
        /*
         * Re-entry point after a child subtree is finished: child and parent
         * were restored below, so the iteration continues with the next
         * sibling.
         */
up:
        continue;
    }
    /* All children are done: leave this group. */
    ret = (*up)(parent, data);
    if (ret)
        goto out_unlock;
 
    /* Step back to the parent and resume its child iteration. */
    child = parent;
    parent = parent->parent;
    if (parent)
        goto up;
out_unlock:
    rcu_read_unlock();
 
    return ret;
}
这个函数看起来有点乱,先看注释:
/*
 * Iterate the full tree, calling @down when first entering a node and @up when
 * leaving it for the final time.
 */
其实它是遍历整个进程组,第一次遍历到该进程组的时候调用down函数,然后离开该组的时候调用up函数.
分析这个函数需要了解一些组调度的东西,如果有不熟悉的,请查阅本站有关组调度的文档.
组调度的根结点是保存在root_task_group中的,它以此为出发点,遍历整个调度组,为了便于分析,将函数的执行用图表示,如下:
 
 
图中的红色线条表示遍历的顺序,从图中可以很清楚的看出每个节点的遍历状况.
其实,它类似于我们之前分析有关PCI设备驱动时的pci设备的遍历操作.这里只是一种非递归的写法而已,将它改写成递归形式,如下示:
/*
 * Recursive depth-first walk of the task-group subtree rooted at tg.
 *
 * down is called when a group is entered, up after all of its children have
 * been walked. The first non-zero value returned by a visitor stops the walk
 * and is passed back to the caller; a NULL tg yields 0.
 */
static int do_walk_tg_tree(struct task_group *tg, tg_visitor down, tg_visitor up, void * data)
{
    struct task_group *kid;
    int rc;

    /* Nothing to visit. */
    if (!tg)
        return 0;

    /* Pre-order callback for this group. */
    rc = (*down)(tg, data);
    if (rc)
        return rc;

    /* Walk every child subtree, bailing out on the first failure. */
    list_for_each_entry_rcu(kid, &tg->children, siblings) {
        rc = do_walk_tg_tree(kid, down, up, data);
        if (rc)
            return rc;
    }

    /* Post-order callback; its result is the result of the walk. */
    return (*up)(tg, data);
}
 
/*
 * Recursive form of walk_tg_tree(): walk the whole task-group tree from
 * root_task_group, calling down on entry to and up on exit from each group.
 *
 * The children lists are traversed with list_for_each_entry_rcu(), so the
 * walk has to run inside an RCU read-side critical section, just as the
 * iterative version does.
 *
 * Returns 0, or the first non-zero value returned by a visitor.
 */
static int walk_tg_tree(tg_visitor down, tg_visitor up, void * data)
{
    int ret;

    rcu_read_lock();
    ret = do_walk_tg_tree(&root_task_group, down, up, data);
    rcu_read_unlock();

    return ret;
}
从这里也可以看出,它是一种深度优先遍历.下面来看一下它的两个处理函数,即down和up:
Down函数为:
/*
 * Visitor that does nothing: it ignores both arguments and returns 0, so a
 * tree walk using it always continues.
 */
static int tg_nop(struct task_group *tg, void *data)
{
    return 0;
}
从此看出,它是一个空函数,没有任何操作.
Up函数为:
/*
 * "Up" visitor of the task-group walk: redistribute the shares of task group
 * tg over the CPUs of the scheduling domain passed in data.
 *
 * Sums the per-cpu cfs_rq weights and shares over the domain span, decides
 * how many shares the domain may hand out and calls
 * update_group_shares_cpu() for every CPU of the span. Always returns 0.
 */
static int tg_shares_up(struct task_group *tg, void *data)
{
    unsigned long weight, rq_weight = 0;
    unsigned long shares = 0;
    struct sched_domain *sd = data;
    int i;
 
    /* Walk the CPUs of sd. */
    for_each_cpu(i, sched_domain_span(sd)) {
        /*
         * If there are currently no tasks on the cpu pretend there
         * is one of average load so that when a new task gets to
         * run here it will not get delayed by group starvation.
         */
        weight = tg->cfs_rq[i]->load.weight;
        if (!weight)
            weight = NICE_0_LOAD;
        /* Remember the weight used for this cpu (the current load.weight). */
        tg->cfs_rq[i]->rq_weight = weight;
        rq_weight += weight;
        shares += tg->cfs_rq[i]->shares;
    }
    /*
     * After the loop:
     * rq_weight: sum of the weights of tg's cfs_rq on the CPUs of sd
     * shares:    sum of the shares of tg's cfs_rq on the CPUs of sd
     */
 
    if ((!shares && rq_weight) || shares > tg->shares)
        shares = tg->shares;
 
    /*
     * In the plain SMP setup analysed here sd->parent is expected to be
     * NULL (the article poses this as a question).
     */
    if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
        shares = tg->shares;
 
    for_each_cpu(i, sched_domain_span(sd))
        update_group_shares_cpu(tg, i, shares, rq_weight);
 
    return 0;
}
这个函数就有点复杂了,我们结合之前分析SMP系统初始化的部份来分析,在SMP中,基本调度域的父层是为空的,根据下面代码片段:
if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
        shares = tg->shares;
 
因此上面函数中的shares值就会固定为 shares = tg->shares;
从上面的代码中,很容易就可看出: rq_weight是该进程组在调度域cpu上的负载之和.在这里要特别注意:
    tg->cfs_rq[i]->rq_weight= weight = tg->cfs_rq[i]->load.weight;
也就是说,rq_weight是当前进程组在cpu i上的负载值.
流程转入到update_group_shares_cpu()中,如下示:
/*
 * Give task group tg its portion of the domain's shares on one CPU.
 *
 * tg:           task group being updated
 * cpu:          CPU whose group entity is adjusted
 * sd_shares:    shares available to the group in the whole domain
 * sd_rq_weight: sum of the group's rq_weight over the domain
 *
 * The new value is proportional to this CPU's part of the total weight,
 * clamped to [MIN_SHARES, MAX_SHARES]. It is applied under the run-queue
 * lock only when it differs from the entity's current load.weight by more
 * than sysctl_sched_shares_thresh.
 */
static void
update_group_shares_cpu(struct task_group *tg, int cpu,
            unsigned long sd_shares, unsigned long sd_rq_weight)
{
    unsigned long shares;
    unsigned long rq_weight;
 
    /* No group entity on this cpu: nothing to update. */
    if (!tg->se[cpu])
        return;
 
    rq_weight = tg->cfs_rq[cpu]->rq_weight;
 
    /*
     *           \Sum shares * rq_weight
     * shares =  -----------------------
     *               \Sum rq_weight
     *
     */
    shares = (sd_shares * rq_weight) / sd_rq_weight;
    shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
    /* Skip small changes so the shares are not rewritten constantly. */
    if (abs(shares - tg->se[cpu]->load.weight) >
            sysctl_sched_shares_thresh) {
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags;
 
        spin_lock_irqsave(&rq->lock, flags);
        tg->cfs_rq[cpu]->shares = shares;
 
        __set_se_shares(tg->se[cpu], shares);
        spin_unlock_irqrestore(&rq->lock, flags);
    }
}
调整之后的shares为:
Shares = shares*(当前cpu上的负载)/在调度域上的总负载
到这里,需要我们回想一下有关组调度的一些东西了,进程组的shares值其实就是进程组的se的load值,也就是它的优先级.所以: tg->se[cpu]->load.weight也就是该组旧的shares值,上面计算出来的shares是调整后的shares.如果要调整的shares与旧的shares差值小于sysctl_sched_shares_thresh.为避免引起系统颠簸,也不会对进程组的shares进行更新.
后面的__set_se_shares()在组调度的时候已经分析过了.这里就不再赘述了.
 
分析完了操作之后,我们来考虑一下:为什么在load balance的时候,要对进程组的shares进行设置呢?
假设有一个进程组,它在cpu1,cpu2,cpu3的负载值分别为50,100,150
 在默认情况或者是在cgroup中设置了该进程组的shares,那么,该进程组在cpu1,cpu2,cpu3的se的优先级都是一样的.那么势必会造成,cpu2的负载一直要比cpu1大,cpu3的负载一直要比cpu2大.这样,在进程组内就造成了cpu负载的不均衡.
所以需要根据cpu上的负载来适当调整它的优先级,cpu上负载高的,优先级也设置的大一点.
还是用上面的例子,假设该组的shares值是600.那么调整之后:
        CPU1    CPU2    CPU3
CPU负载:50      100     150
组优先级:100     200     300
这样,在进程组内就尽量的实现load balance.
那优先级为什么要调整为:shares*(当前cpu上的负载)/在调度域上的总负载 呢?为什么不按其它的比例对它进行调整呢?
思考一下组调度里的shares值的含义,它表示的是进程组占用CPU的百分比.
假设有两个进程组,G1,G2.G2占用CPU时间是G1的三倍,那么必定有,G2的shares值是G1的三倍.
假设在CPU1,CPU2,CPU3上,它们的shares值分别为S1和3S1如下:
        CPU1    CPU2    CPU3
G1.se:  S1      S1      S1
G2.se:  3S1     3S1     3S1
 
那调整之后:
        CPU1    CPU2    CPU3
G1.se:  S1a     S1b     S1c     S1a+ S1b+ S1c = S1
G2.se:  S2a     S2b     S2c     S2a+ S2b+ S2c = 3S1
G1的shares被分散为s1a,s1b,s1c.但它们的总和为S1.
同理G2的总和也为之前的shares,即3S1  
这样,在调整之后,G1,G2还能保持这样的比例关系.也就是说,调整之后,G1和G2占用CPU的比例关系依然不变.
 
3.1.2:找到最繁重的调度组
这是在find_busiest_group()中完成的.代码如下:
/*
 * find_busiest_group - scan every group of sched domain @sd and pick the
 * most loaded group other than the one that contains @this_cpu.
 *
 * On the normal path the chosen group is returned and the amount of
 * weighted load to pull is stored in *imbalance.  When no move is wanted
 * the function returns NULL with *imbalance = 0; on the out_balanced path
 * with power-savings balancing configured it may instead return group_min
 * with *imbalance = min_load_per_task.
 *
 * *sd_idle is cleared as soon as a scanned cpu has runnable tasks.
 * *balance (when the pointer is non-NULL) is set to 0 if this_cpu is not
 * the cpu elected to balance on behalf of its own group.
 * Only cpus present in @cpus are considered.
 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
           unsigned long *imbalance, enum cpu_idle_type idle,
           int *sd_idle, const struct cpumask *cpus, int *balance)
{
    struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
    unsigned long max_load, avg_load, total_load, this_load, total_pwr;
    unsigned long max_pull;
    unsigned long busiest_load_per_task, busiest_nr_running;
    unsigned long this_load_per_task, this_nr_running;
    int load_idx, group_imb = 0;
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
    int power_savings_balance = 1;
    unsigned long leader_nr_running = 0, min_load_per_task = 0;
    unsigned long min_nr_running = ULONG_MAX;
    struct sched_group *group_min = NULL, *group_leader = NULL;
#endif
 
    max_load = this_load = total_load = total_pwr = 0;
    busiest_load_per_task = busiest_nr_running = 0;
    this_load_per_task = this_nr_running = 0;
 
    /* Pick the load index that matches the idle state of the caller */
    if (idle == CPU_NOT_IDLE)
        load_idx = sd->busy_idx;
    else if (idle == CPU_NEWLY_IDLE)
        load_idx = sd->newidle_idx;
    else
        load_idx = sd->idle_idx;
 
    /* One iteration per group of the domain (circular list) */
    do {
        unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
        int local_group;
        int i;
        int __group_imb = 0;
        unsigned int balance_cpu = -1, first_idle_cpu = 0;
        unsigned long sum_nr_running, sum_weighted_load;
        unsigned long sum_avg_load_per_task;
        unsigned long avg_load_per_task;
 
        /* Does this group contain this_cpu? */
        local_group = cpumask_test_cpu(this_cpu,
                           sched_group_cpus(group));
        /* Default balancer of the local group: its first cpu */
        if (local_group)
            balance_cpu = cpumask_first(sched_group_cpus(group));
 
        /* Tally up the load of all CPUs in the group */
        sum_weighted_load = sum_nr_running = avg_load = 0;
        sum_avg_load_per_task = avg_load_per_task = 0;
 
        max_cpu_load = 0;
        min_cpu_load = ~0UL;
 
        /* Walk every cpu of the group that is also present in cpus */
        for_each_cpu_and(i, sched_group_cpus(group), cpus) {
            struct rq *rq = cpu_rq(i);
 
            if (*sd_idle && rq->nr_running)
                *sd_idle = 0;
 
            /* Bias balancing toward cpus of our domain */
            /* The local group's load is estimated on the high side */
            if (local_group) {
                /* Remember the first idle cpu of the local group */
                if (idle_cpu(i) && !first_idle_cpu) {
                    first_idle_cpu = 1;
                    balance_cpu = i;
                }
                /* A destination cpu is measured with target_load() */
                load = target_load(i, load_idx);
            } else {
                /* A source cpu is measured with source_load() */
                load = source_load(i, load_idx);
                /* Track the largest and smallest cpu load in this group */
                if (load > max_cpu_load)
                    max_cpu_load = load;
                if (min_cpu_load > load)
                    min_cpu_load = load;
            }
            /* Running sum of the computed cpu loads of this group */
            avg_load += load;
            /* Number of running tasks in this group */
            sum_nr_running += rq->nr_running;
            /* Sum of weighted_cpuload() over the group */
            sum_weighted_load += weighted_cpuload(i);
            /* Sum of the per-cpu average task load */
            sum_avg_load_per_task += cpu_avg_load_per_task(i);
        }
 
        /*
         * First idle cpu or the first cpu(busiest) in this sched group
         * is eligible for doing load balancing at this and above
         * domains. In the newly idle case, we will allow all the cpu's
         * to do the newly idle load balance.
         */
 
        /*
         * Only the elected cpu of the local group (its first idle cpu,
         * or else its first cpu) goes on; any other cpu reports
         * *balance = 0 and leaves.  The article gives avoiding
         * ping-pong as the reason.
         */
        if (idle != CPU_NEWLY_IDLE && local_group &&
            balance_cpu != this_cpu && balance) {
            *balance = 0;
            goto ret;
        }
 
        /* Accumulate the domain-wide cpu load */
        total_load += avg_load;
        /* Accumulate the domain-wide __cpu_power */
        total_pwr += group->__cpu_power;
 
        /* Adjust by relative CPU power of the group */
        /*
         * When __cpu_power equals SCHED_LOAD_SCALE (the plain SMP case
         * described in the article) avg_load is left unchanged.
         */
        avg_load = sg_div_cpu_power(group,
                avg_load * SCHED_LOAD_SCALE);
 
 
        /*
         * Consider the group unbalanced when the imbalance is larger
         * than the average weight of two tasks.
         *
         * APZ: with cgroup the avg task weight can vary wildly and
         *      might not be a suitable number - should we keep a
         *      normalized nr_running number somewhere that negates
         *      the hierarchy?
         */
        /* Same scaling; on plain SMP this equals sum_avg_load_per_task */
        avg_load_per_task = sg_div_cpu_power(group,
                sum_avg_load_per_task * SCHED_LOAD_SCALE);
 
        /*
         * A spread between the most and least loaded cpu of the group
         * larger than twice the average task load marks the group as
         * internally imbalanced.
         */
        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
            __group_imb = 1;
 
        /* Evaluates to 1 when __cpu_power == SCHED_LOAD_SCALE */
        group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 
        /* Record the statistics of the local group */
        if (local_group) {
            this_load = avg_load;
            this = group;
            this_nr_running = sum_nr_running;
            this_load_per_task = sum_weighted_load;
        }
        /* Otherwise this group may become the new busiest candidate */
        else if (avg_load > max_load &&
               (sum_nr_running > group_capacity || __group_imb)) {
            max_load = avg_load;
            busiest = group;
            busiest_nr_running = sum_nr_running;
            busiest_load_per_task = sum_weighted_load;
            group_imb = __group_imb;
        }
 
        group = group->next;
    } while (group != sd->groups);
 
    /*
     * Nothing to do when:
     * 1: no busiest group was found,
     * 2: the local group is at least as loaded as the busiest one,
     * 3: the busiest group has no running task.
     */
    if (!busiest || this_load >= max_load || busiest_nr_running == 0)
        goto out_balanced;
 
    /* avg_load now becomes the average load over the whole domain */
    avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
 
    /*
     * Also balanced when the local group is at or above the domain
     * average, or when the busiest/local load ratio does not exceed
     * the domain's imbalance_pct.
     */
    if (this_load >= avg_load ||
            100*max_load <= sd->imbalance_pct*this_load)
        goto out_balanced;
 
    /* Average task load inside the busiest group */
    busiest_load_per_task /= busiest_nr_running;
 
    /*
     * If the busiest group is internally imbalanced, cap
     * busiest_load_per_task at the domain average.
     */
    if (group_imb)
        busiest_load_per_task = min(busiest_load_per_task, avg_load);
 
    /*
     * We're trying to get all the cpus to the average_load, so we don't
     * want to push ourselves above the average load, nor do we wish to
     * reduce the max loaded cpu below the average load, as either of these
     * actions would just result in more rebalancing later, and ping-pong
     * tasks around. Thus we look for the minimum possible imbalance.
     * Negative imbalances (*we* are more loaded than anyone else) will
     * be counted as no imbalance for these purposes -- we can't fix that
     * by pulling tasks to us. Be careful of negative numbers as they'll
     * appear as very large values with unsigned longs.
     */
    /*
     * No balancing when the busiest group's load does not exceed its
     * average task load.  Note that max_load comes from source_load(),
     * which is an estimate rather than the instantaneous load.
     */
 
    if (max_load <= busiest_load_per_task)
        goto out_balanced;
 
    /*
     * In the presence of smp nice balancing, certain scenarios can have
     * max load less than avg load(as we skip the groups at or below
     * its cpu_power, while calculating max_load..)
     */
    /*
     * The domain average includes the local group, so max_load can be
     * below it; in that case only the small-imbalance path is tried.
     */
    if (max_load < avg_load) {
        *imbalance = 0;
        goto small_imbalance;
    }
 
    /* Don't want to pull so many tasks that a group would go idle */
    /* Take the smaller of the two distances so less load is pulled */
    max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
 
    /* How much load to actually move to equalise the imbalance */
    *imbalance = min(max_pull * busiest->__cpu_power,
                (avg_load - this_load) * this->__cpu_power)
            / SCHED_LOAD_SCALE;
 
    /*
     * if *imbalance is less than the average load per runnable task
     * there is no gaurantee that any tasks will be moved so we'll have
     * a think about bumping its value to force at least one task to be
     * moved
     */
    /*
     * The amount is smaller than one average task of the busiest group:
     * decide below whether to raise it to exactly one task's worth.
     */
    if (*imbalance < busiest_load_per_task) {
        unsigned long tmp, pwr_now, pwr_move;
        unsigned int imbn;
 
small_imbalance:
        pwr_move = pwr_now = 0;
        imbn = 2;
        if (this_nr_running) {
            this_load_per_task /= this_nr_running;
            if (busiest_load_per_task > this_load_per_task)
                imbn = 1;
        } else
            this_load_per_task = cpu_avg_load_per_task(this_cpu);
 
        if (max_load - this_load + busiest_load_per_task >=
                    busiest_load_per_task * imbn) {
            *imbalance = busiest_load_per_task;
            return busiest;
        }
 
        /*
         * OK, we don't have enough imbalance to justify moving tasks,
         * however we may be able to increase total CPU power used by
         * moving them.
         */
 
        pwr_now += busiest->__cpu_power *
                min(busiest_load_per_task, max_load);
        pwr_now += this->__cpu_power *
                min(this_load_per_task, this_load);
        pwr_now /= SCHED_LOAD_SCALE;
 
        /* Amount of load we'd subtract */
        tmp = sg_div_cpu_power(busiest,
                busiest_load_per_task * SCHED_LOAD_SCALE);
        if (max_load > tmp)
            pwr_move += busiest->__cpu_power *
                min(busiest_load_per_task, max_load - tmp);
 
        /* Amount of load we'd add */
        if (max_load * busiest->__cpu_power <
                busiest_load_per_task * SCHED_LOAD_SCALE)
            tmp = sg_div_cpu_power(this,
                    max_load * busiest->__cpu_power);
        else
            tmp = sg_div_cpu_power(this,
                busiest_load_per_task * SCHED_LOAD_SCALE);
        pwr_move += this->__cpu_power *
                min(this_load_per_task, this_load + tmp);
        pwr_move /= SCHED_LOAD_SCALE;
 
        /* Move if we gain throughput */
        if (pwr_move > pwr_now)
            *imbalance = busiest_load_per_task;
    }
 
    /* Hand back the busiest group that was found */
    return busiest;
 
out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
    /* Power-savings balancing only applies to idle callers with the flag set */
    if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
        goto ret;
 
    if (this == group_leader && group_leader != group_min) {
        *imbalance = min_load_per_task;
        if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
            cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
                cpumask_first(sched_group_cpus(group_leader));
        }
        return group_min;
    }
#endif
ret:
    *imbalance = 0;
    return NULL;
}
在阅读上面的代码的时候,首先必须要认识到以下几点:
1:该函数是找到一个最繁忙的组.然后将这个组中的进程迁移到本地cpu中.而调度组内的cpu则不会进行进程迁移的.也就是说,load balance是在相同调度域中的组与组之间发生,而不会发生在组内部.认识到这一点很重要.因此,在寻找到最繁忙的调度组时,不会找到目的cpu所在的调度组.
 
2:调度组中如果有多个cpu,那么只有组中的第一个cpu(调度组中无空闲cpu时)或者是第一个空闲的cpu(调度组中有空闲cpu)才能迁移进程.这样做是为了防止在软中断中执行load balance时做重复的工作,影响系统性能.当然,在后面的分析中我们可以看到,SMP的调度组中永远都只有一个cpu.另外,在schedule()调度时,发现本地cpu为空闲,会主动进行load balance.这种情况在后面会进行详细的分析.
 
3:关于sg_div_cpu_power()函数
它其实是一个将除法转换为乘法的操作. sg_div_cpu_power(const struct sched_group *sg, u32 load)相当于 load/sg->__cpu_power.
综合前面对SMP load balance的初始化分析可得到,调度组的__cpu_power都被初始化为SCHED_LOAD_SCALE.
 
4:进程迁移是一件很影响系统性能的操作,不仅耗时还会引起Cache失效.因此,只有在负载不平衡的情况下才会进行.在上面的代码中,计算本地组的负载和计算其它组的负载是不相同的.它们分别在target_load(),source_load()中实现.前者计算出来的值比后者计算出来的值要稍大一些.(在没有定义__SCHED_FEAT_LB_BIAS位的时候,它们算出来的值都是cpu上的实际负载).本地调度组的负载和计算出来的最忙调度组的负载还要经过一系列比较之后才会进行load balance.
 
这个函数就不详细分析了.请自行对照代码中的注释进行分析.
 
3.1.3:找到最繁重的cpu
找到了最忙调度组之后,我们需要找到这个组内最忙的cpu,这个过程是在find_busiest_queue()中完成的.代码如下:
/*
 * find_busiest_queue - among the cpus of @group that are also in @cpus,
 * return the runqueue with the largest weighted_cpuload().
 *
 * A cpu running exactly one task whose load exceeds @imbalance is
 * skipped.  Returns NULL when no cpu qualifies.  @idle is not read here.
 */
static struct rq *
find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
           unsigned long imbalance, const struct cpumask *cpus)
{
    struct rq *busiest = NULL, *rq;
    unsigned long max_load = 0;
    int i;
 
    /* Walk every cpu of the group */
    for_each_cpu(i, sched_group_cpus(group)) {
        unsigned long wl;
 
        /* Skip cpus outside the subset allowed for this balance pass */
        if (!cpumask_test_cpu(i, cpus))
            continue;
 
        /* Load currently carried by this cpu */
        rq = cpu_rq(i);
        wl = weighted_cpuload(i);
 
        /*
         * A single running task heavier than imbalance cannot supply
         * a move of imbalance worth of load, so skip this cpu.
         */
        if (rq->nr_running == 1 && wl > imbalance)
            continue;
 
        /* New maximum: remember this runqueue */
        if (wl > max_load) {
            max_load = wl;
            busiest = rq;
        }
    }
 
    /* Busiest runqueue found, or NULL */
    return busiest;
}
这个函数没有阅读困难的地方,需要注意的是,rq->nr_running==1和rq空闲是两回事.因为cpu上的idle进程是不会计算在rq->nr_running中的.
 
3.1.4:迁移进程
找到需要迁移进程的cpu了,接下来需要将它上面的进程迁移到本地cpu上,这是在move_tasks()中完成的.代码如下:
/*
 * move_tasks - pull up to @max_load_move of weighted load from @busiest
 * to @this_rq by calling each scheduling class's load_balance() hook in
 * turn, starting at sched_class_highest.
 *
 * Returns 1 if any load was moved, 0 otherwise.  @all_pinned and the
 * local best-priority value are passed through to the class hooks.
 */
static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
              unsigned long max_load_move,
              struct sched_domain *sd, enum cpu_idle_type idle,
              int *all_pinned)
{
    const struct sched_class *class = sched_class_highest;
    unsigned long total_load_moved = 0;
    int this_best_prio = this_rq->curr->prio;
 
    /* Ask each class to move what is still outstanding */
    do {
        total_load_moved +=
            class->load_balance(this_rq, this_cpu, busiest,
                max_load_move - total_load_moved,
                sd, idle, all_pinned, &this_best_prio);
        class = class->next;
 
        /* A newly idle cpu stops as soon as it has something to run */
        if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
            break;
 
    } while (class && max_load_move > total_load_moved);
 
    /* 1 if any load was moved, 0 otherwise */
    return total_load_moved > 0;
}
在这里我们遇到了CPU_NEWLY_IDLE形式的load balance.这种情况在上面我们分析过,它是在schedule()中发现cpu队列为空时,主动发起的load balance.这种情况下的负载平衡只需要完成很少的工作.即只需要保证cpu有进程运行就可以了.
 
从上面的代码可以看到,它会调用调度类的load_balance()接口,直到无调度类或者是移动了规定的负载.它是从sched_class_highest开始的.先来看一下它的定义:
/* Walks over the scheduling classes start at the real-time class. */
#define sched_class_highest (&rt_sched_class)
即指向的是实时调度类.我们就从实时类的load balance开始.对应的接口如下:
rt_sched_class. load_balance = load_balance_rt.
代码如下:
/*
 * load_balance_rt - load_balance() hook of the real-time class.
 *
 * Ignores all of its arguments and always reports that no load was moved.
 */
static unsigned long
load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
        unsigned long max_load_move,
        struct sched_domain *sd, enum cpu_idle_type idle,
        int *all_pinned, int *this_best_prio)
{
    /* don't touch RT tasks */
    return 0;
}
即实时进程是不能在CPU之间移动的.
 
实时类的next指针是指向CFS了,来分析一下CFS中的load balance.CFS中接口为load_balance_fair().代码如下:
/*
 * load_balance_fair - load_balance() hook of the CFS class.
 *
 * Walks every task group and tries to pull tasks from that group's cfs_rq
 * on the busiest cpu.  The outstanding amount is kept in top-level load
 * units (rem_load_move); for each group it is converted into that group's
 * own units with load.weight / (h_load + 1), and whatever was moved is
 * converted back with h_load / (load.weight + 1).
 *
 * Returns the amount of load moved, in the same units as @max_load_move.
 */
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
          unsigned long max_load_move,
          struct sched_domain *sd, enum cpu_idle_type idle,
          int *all_pinned, int *this_best_prio)
{
    long rem_load_move = max_load_move;
    int busiest_cpu = cpu_of(busiest);
    struct task_group *tg;
 
    rcu_read_lock();
    /* Refresh the hierarchical load factor (h_load) of every group */
    update_h_load(busiest_cpu);
 
    /* Visit each task group */
    list_for_each_entry_rcu(tg, &task_groups, list) {
        struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
        unsigned long busiest_h_load = busiest_cfs_rq->h_load;
        unsigned long busiest_weight = busiest_cfs_rq->load.weight;
        u64 rem_load, moved_load;
 
        /*
         * empty group
         */
        if (!busiest_cfs_rq->task_weight)
            continue;
 
        /* Upper bound on what this group may move, in its own units */
        rem_load = (u64)rem_load_move * busiest_weight;
        /* The +1 keeps the divisor non-zero */
        rem_load = div_u64(rem_load, busiest_h_load + 1);
 
        /* Move tasks inside this group; result is in group units */
        moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
                rem_load, sd, idle, all_pinned, this_best_prio,
                tg->cfs_rq[busiest_cpu]);
 
        if (!moved_load)
            continue;
 
        /* Convert what was moved back into top-level units */
        moved_load *= busiest_h_load;
        moved_load = div_u64(moved_load, busiest_weight + 1);
 
        /* Deduct it from the outstanding amount */
        rem_load_move -= moved_load;
        if (rem_load_move < 0)
            break;
    }
    rcu_read_unlock();
 
    return max_load_move - rem_load_move;
}
这个函数比较抽象,其中涉及到了全局移动负载量和本层的移动负载量.下面详细分析这一过程.我们先来看update_h_load().代码如下:
/*
 * update_h_load - walk the task-group tree with tg_load_down as the first
 * callback and tg_nop as the second, passing @cpu along as the opaque
 * data argument.
 */
static void update_h_load(long cpu)
{
    walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
walk_tg_tree()这个函数我们很熟悉了.tg_nop()是一个空函数.它跟在前面分析的update_shares()原理是一样的,不过, update_shares()是从底部往上更新,而update_h_load()是从上往下更新.具体的,对调度组层次的每一层从上往下依次调用tg_load_down(),代码如下:
/*
 * tg_load_down - compute and store the hierarchical load (h_load) of
 * @tg's cfs_rq on the cpu encoded in @data.
 *
 * The root group takes the runqueue's load.weight.  A child takes
 * parent_h_load * own_shares / (parent_load_weight + 1).
 * Always returns 0.
 */
static int tg_load_down(struct task_group *tg, void *data)
{
    unsigned long load;
    long cpu = (long)data;
 
    /* Root task group: h_load is simply the runqueue weight */
    if (!tg->parent) {
        load = cpu_rq(cpu)->load.weight;
    }
    /* Child: parent's h_load scaled by this group's share of the parent */
    else {
        load = tg->parent->cfs_rq[cpu]->h_load;
        load *= tg->cfs_rq[cpu]->shares;
        /* The +1 keeps the divisor non-zero */
        load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
    }
 
    /* Store the result for this level */
    tg->cfs_rq[cpu]->h_load = load;
 
    return 0;
}
理解这个函数是理解load_balance_fair()的关键,而该函数的关键是理解h_load值,举例说明如下:
 
 
如上图所示,一个进程组的根结点下面有Group1,Group2两个进程组以及Task1,Task2两个进程,假设在cpu i上要迁移进程,迁移进程的总负载为Move load,那么在Root task group,Group1,Group2分别需要移动多少负载呢?
这是按照进程组在该层中所占的负载比重来计算的,例如,Group1在Root task group中占的负载比率为30%,那么,Group1中就需要移动Move load的百分之三十.
其实,h_load就是指,在当前的负载情况下,本组应该移动的进程负载量.
对于root task group来说,这很好理解,要移动的进程负载量首先要通过根结点,根结点获得其全部的负载量,也就是cpu_rq(cpu)->load.weight.
对于子层,就需要计算它在父层中的负载比率了.父层的总负载为tg->parent->cfs_rq[cpu]->load.weight,而本层的负载为tg-> se[cpu]->load.weight.那就是本层在上层中所占的负载比率为:
tg-> se[cpu]->load.weight/tg->parent->cfs_rq[cpu]->load.weight
回忆在进入load balance的时候调用了update_shares().在该函数中有以下代码片段:
update_shares() -> tg_shares_up() -> update_group_shares_cpu():
static void
update_group_shares_cpu(struct task_group *tg, int cpu,
            unsigned long sd_shares, unsigned long sd_rq_weight)
{
     ......
     ......
    tg->cfs_rq[cpu]->shares = shares;
 
    __set_se_shares(tg->se[cpu], shares);
    .....
    ......
也就是说,tg->cfs_rq[cpu]->shares也就表示负载平衡时,调整之后的tg->se[cpu].load值.
结合上面所分析的,本层在上层中所占的负载比重为:
tg-> cfs_rq[cpu]->shares/tg->parent->cfs_rq[cpu]->load.weight
然后,上层所分得的进程负载量是load = tg->parent->cfs_rq[cpu]->h_load;那本层所分得的进程负载量是:
(tg->cfs_rq[cpu]->shares/tg->parent->cfs_rq[cpu]->load.weight)* tg->parent->cfs_rq[cpu]->h_load;
但我们在代码中看到的计算为:
(tg->cfs_rq[cpu]->shares/(tg->parent->cfs_rq[cpu]->load.weight+1))* tg->parent->cfs_rq[cpu]->h_load;
这是为了避免父结点负载为0的情况,即避免除数为0,因为除数+1对整体结果是影响不大的.
 
考虑下面两个问题:
1:假设现在本层以及本层的子层移动的负载总量为rem_load,那么本层应该移动多少?
2:假设本层移动进程负载为move_load,那么它在调度组层次上移动了多少呢?
对于问题1:
在负载为tg->cfs_rq[cpu i]->load.weight时,它可以移动tg->cfs_rq[cpu i]->h_load
那么现在要移动rem_load,对应本层的负载应该为:
tg->cfs_rq[cpu i]->load.weight * rem_load / (tg->cfs_rq[cpu i]->h_load)
同理,为避免除数为0,除数+1,变为:
tg->cfs_rq[cpu i]->load.weight * rem_load / (tg->cfs_rq[cpu i]->h_load + 1)
 
对于问题2: 在负载为tg->cfs_rq[cpu i]->load.weight时,它可以移动tg->cfs_rq[cpu i]->h_load,现在本层移动的负载为move_load,那么,它在调度层次上的值为:
move_load * tg->cfs_rq[cpu i]->h_load / (tg->cfs_rq[cpu i]->load.weight)
同理,调整除数,变为:
move_load * tg->cfs_rq[cpu i]->h_load / (tg->cfs_rq[cpu i]->load.weight + 1)
 
其实,上面两个问题就是一个比例关系.
 
结合上面的分析,理解load_balance_fair()应该很容易了,在这里就不再赘述了.
接着看它里面的子函数__load_balance_fair().代码如下:
/*
 * __load_balance_fair - pull tasks out of one cfs_rq of the busiest cpu.
 *
 * Builds an rq_iterator over @cfs_rq (start/next callbacks plus the
 * cfs_rq itself as the opaque argument) and hands it to balance_tasks(),
 * forwarding every other parameter unchanged.
 *
 * Returns whatever balance_tasks() returns, i.e. the amount of load moved.
 *
 * The "static unsigned long" declaration line was missing from this
 * listing; without it the definition relies on implicit int, which is not
 * valid C99/C11 and does not match the unsigned long that balance_tasks()
 * returns.
 */
static unsigned long
__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
        unsigned long max_load_move, struct sched_domain *sd,
        enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
        struct cfs_rq *cfs_rq)
{
    struct rq_iterator cfs_rq_iterator;

    /* Iterate over the tasks linked on this cfs_rq */
    cfs_rq_iterator.start = load_balance_start_fair;
    cfs_rq_iterator.next = load_balance_next_fair;
    cfs_rq_iterator.arg = cfs_rq;

    return balance_tasks(this_rq, this_cpu, busiest,
            max_load_move, sd, idle, all_pinned,
            this_best_prio, &cfs_rq_iterator);
}
 
/*
 * balance_tasks - pull tasks supplied by @iterator from @busiest to
 * @this_rq until @max_load_move worth of weight has been moved, the
 * iterator is exhausted, or the examination limit is hit.
 *
 * A task is skipped when half of its weight exceeds the remaining amount
 * or when can_migrate_task() rejects it.  *all_pinned (when non-NULL)
 * receives the local pinned flag, which starts at 1 once any work is
 * attempted and is passed to can_migrate_task() for clearing.
 *
 * Returns the weight actually moved.
 */
static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
          unsigned long max_load_move, struct sched_domain *sd,
          enum cpu_idle_type idle, int *all_pinned,
          int *this_best_prio, struct rq_iterator *iterator)
{
    int loops = 0, pulled = 0, pinned = 0;
    struct task_struct *p;
    long rem_load_move = max_load_move;
 
    /* Nothing requested: leave with pinned still 0 */
    if (max_load_move == 0)
        goto out;
 
    pinned = 1;
 
    /*
     * Start the load-balancing iterator:
     */
    p = iterator->start(iterator->arg);
next:
    /*
     * Stop when the iterator is exhausted or when more than
     * sysctl_sched_nr_migrate candidates have been examined (loops is
     * bumped on every pass through this label, moved or not).
     */
    if (!p || loops++ > sysctl_sched_nr_migrate)
        goto out;
 
    /*
     * Skip the task if half of its weight exceeds the remaining amount
     * or if it may not be migrated.  A task of up to twice the
     * remaining amount can therefore still be moved.
     */
    if ((p->se.load.weight >> 1) > rem_load_move ||
        !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
        p = iterator->next(iterator->arg);
        goto next;
    }
 
    /* Move this task */
    pull_task(busiest, p, this_rq, this_cpu);
    pulled++;
    rem_load_move -= p->se.load.weight;
 
    /*
     * We only want to steal up to the prescribed amount of weighted load.
     */
    /*
     * Load still outstanding: go on with the next task.  On this path
     * *this_best_prio is lowered to the smallest prio value moved.
     */
    if (rem_load_move > 0) {
        if (p->prio < *this_best_prio)
            *this_best_prio = p->prio;
        p = iterator->next(iterator->arg);
        goto next;
    }
out:
    /*
     * Right now, this is one of only two places pull_task() is called,
     * so we can safely collect pull_task() stats here rather than
     * inside pull_task().
     */
    /* Migration for this iterator is finished */
    schedstat_add(sd, lb_gained[idle], pulled);
 
    if (all_pinned)
        *all_pinned = pinned;
 
    return max_load_move - rem_load_move;
}
在这里,我们又遇到了一个迭代器.先来思考一下怎么样遍历该进程组的进程呢?
在进程入列时,会调用account_entity_enqueue().在该函数中有如下代码片段:
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    ......
    ......
    if (entity_is_task(se)) {
        add_cfs_task_weight(cfs_rq, se->load.weight);
        list_add(&se->group_node, &cfs_rq->tasks);
    }
    ......
从此可以看到,进程组中的所有进程都链接在cfs_rq->tasks.所以,我们只需要遍历该链表就可以了.
该迭代器以cfs_rq_iterator.start开始,以cfs_rq_iterator.next获取下一个对象.先分析start过程,即load_balance_start_fair().代码如下:
/*
 * load_balance_start_fair - iterator "start" callback: begin at the first
 * entry of the cfs_rq's tasks list.  @arg is the struct cfs_rq to walk.
 */
static struct task_struct *load_balance_start_fair(void *arg)
{
    struct cfs_rq *cfs_rq = arg;
 
    return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next);
}
流程转入__load_balance_iterator():
/*
 * __load_balance_iterator - return the task whose group_node is @next and
 * store the following list position in cfs_rq->balance_iterator.
 *
 * Returns NULL when @next is the list head itself, i.e. the walk is over.
 */
static struct task_struct *
__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
{
    struct task_struct *p = NULL;
    struct sched_entity *se;
 
    /* Back at the list head: nothing left */
    if (next == &cfs_rq->tasks)
        return NULL;
 
    se = list_entry(next, struct sched_entity, group_node);
    p = task_of(se);
    /* Remember where the next call has to continue */
    cfs_rq->balance_iterator = next->next;
 
    return p;
}
即cfs_rq->balance_iterator指向链表中的下一个对象,并返回已取得对象.
 
Next操作为
/*
 * load_balance_next_fair - iterator "next" callback: continue from the
 * position saved in cfs_rq->balance_iterator.  @arg is the struct cfs_rq.
 */
static struct task_struct *load_balance_next_fair(void *arg)
{
    struct cfs_rq *cfs_rq = arg;
 
    return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
}
跟start的操作类似,只是操作链表变为了cfs_rq->balance_iterator,因为在start的时候,已经将cfs_rq->balance_iterator指向了task链的下一个对象.
 
继续来分析balance_tasks()函数,结合代码中的注释,理解这段代码应该很容易,在这里主要分析它的两个重要的子函数,即can_migrate_task()和pull_task().
先来看can_migrate_task().该函数用来判断当前进程是否能够迁移到目标cpu上,代码如下:
/*
 * can_migrate_task - decide whether task @p on runqueue @rq may be moved
 * to @this_cpu.
 *
 * Returns 1 if the task may be moved, 0 otherwise.  *all_pinned is
 * cleared once the affinity check passes; it is left untouched when the
 * task is not allowed on @this_cpu.
 */
static
int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
             struct sched_domain *sd, enum cpu_idle_type idle,
             int *all_pinned)
{
    /*
     * We do not migrate tasks that are:
     * 1) running (obviously), or
     * 2) cannot be migrated to this CPU due to cpus_allowed, or
     * 3) are cache-hot on their current CPU.
     */
    /* The task is not allowed to run on this_cpu */
    if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
        schedstat_inc(p, se.nr_failed_migrations_affine);
        return 0;
    }
    *all_pinned = 0;
 
    /* The task is currently running */
    if (task_running(rq, p)) {
        schedstat_inc(p, se.nr_failed_migrations_running);
        return 0;
    }
 
    /*
     * Aggressive migration if:
     * 1) task is cache cold, or
     * 2) too many balance attempts have failed.
     */
    /* Cache-cold task, or the domain has failed to balance too often */
    if (!task_hot(p, rq->clock, sd) ||
            sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
        /* Account a migration that was forced despite a hot cache */
        if (task_hot(p, rq->clock, sd)) {
            schedstat_inc(sd, lb_hot_gained[idle]);
            schedstat_inc(p, se.nr_forced_migrations);
        }
#endif
        return 1;
    }
 
    /* Cache-hot task: refuse */
    if (task_hot(p, rq->clock, sd)) {
        schedstat_inc(p, se.nr_failed_migrations_hot);
        return 0;
    }
    return 1;
}
特别注意一下,如果是进程不能在目标CPU上运行,将不会更新*all_pinned的值.在该函数中,代码中对task_hot()调用了两次,显然是值得优化的.
 
对task的Cache是否为hot是在task_hot()中判断的,代码如下:
/*
 * task_hot - report whether task @p is considered cache-hot at time @now.
 *
 * Returns 1 for hot, 0 for cold.  @sd is not read in this function.
 */
static int
task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
{
    s64 delta;
 
    /*
     * Buddy candidates are cache hot:
     */
    /* With CACHE_HOT_BUDDY set, the cfs_rq's next/last entity counts as hot */
    if (sched_feat(CACHE_HOT_BUDDY) &&
            (&p->se == cfs_rq_of(&p->se)->next ||
             &p->se == cfs_rq_of(&p->se)->last))
        return 1;
 
    /* Tasks outside the CFS class are treated as cold */
    if (p->sched_class != &fair_sched_class)
        return 0;
 
    /*
     * sysctl_sched_migration_cost == -1: always hot;
     * sysctl_sched_migration_cost == 0: always cold.
     */
    if (sysctl_sched_migration_cost == -1)
        return 1;
    if (sysctl_sched_migration_cost == 0)
        return 0;
 
    delta = now - p->se.exec_start;
    /* Hot when less than the migration cost has passed since exec_start */
    return delta < (s64)sysctl_sched_migration_cost;
}
就不对这个过程做详细分析了,注释中已经说的很清楚了.
 
pull_task()用来完成进程的迁移动作,代码如下:
/*
 * pull_task - move task @p from runqueue @src_rq to @this_rq on @this_cpu
 * and then run the preemption check on the destination runqueue.
 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
              struct rq *this_rq, int this_cpu)
{
    /* Take the task off the source runqueue */
    deactivate_task(src_rq, p, 0);
    /* Re-home the task to this_cpu */
    set_task_cpu(p, this_cpu);
    /* Put the task on the destination runqueue */
    activate_task(this_rq, p, 0);
    /*
     * Note that idle threads have a prio of MAX_PRIO, for this test
     * to be always true for them.
     */
    /* Check whether the destination cpu should now be preempted */
    check_preempt_curr(this_rq, p, 0);
}
由于更新了目标CPU上的进程,所以要检查一下目标CPU上是否需要抢占.
 
3.2:cpu空闲时的load balance
在cpu空闲时,也会主动进行load balance的操作.如下代码片段所示:
asmlinkage void __sched schedule(void)
{
    ......
    ......
    if (unlikely(!rq->nr_running))
        idle_balance(cpu, rq);
    ......
    ......
   
在schedule()中,如果运行队列为空,会调用idle_balance().
关于idle_balance()的操作,在这里就不再重复讲述了,实际上,在之前的分析中,对CPU_NEWLY_IDLE类型的load balance关键地方都有指出.
对于CPU_NEWLY_IDLE与其它类型的load balance的差别主要有以下几点:
1:CPU_NEWLY_IDLE只要发现CPU空闲就会调用,而无调整时间间隔,并且在CPU_NEWLY_IDLE的load balance处理中,会将下次在tick中断中进行load balance的时间戳设为一个较小值,以便在tick中断中较快速的发现这个不平衡状态.
 
2: CPU_NEWLY_IDLE类型的load balance操作中移动较小量的进程,只需保证CPU上有进程运行即可.
3: CPU_NEWLY_IDLE是将其它CPU上的任务”拉”到本地CPU上.
 
四: migration线程
在load_balance()中,我们还看到,如果失败次数大于sd->cache_nice_tries+2时,就会唤醒CPU的migration线程,我们来看一下该线程的运行.
先来看以下代码:
/*
 * migration_init - early initcall that drives migration_call() by hand
 * for the current cpu (CPU_UP_PREPARE, then CPU_ONLINE) and afterwards
 * registers migration_notifier as a cpu notifier.
 *
 * Returns the result of the CPU_UP_PREPARE call; a NOTIFY_BAD result
 * triggers BUG_ON.
 */
static int __init migration_init(void)
{
    void *cpu = (void *)(long)smp_processor_id();
    int err;
 
    /* Start one for the boot CPU: */
    err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
    BUG_ON(err == NOTIFY_BAD);
    migration_call(&migration_notifier, CPU_ONLINE, cpu);
    register_cpu_notifier(&migration_notifier);
 
    return err;
}
early_initcall(migration_init);
在系统初始化时,migration_init()得到调用,并在该函数中注册了一个cpu notifier链,因此就可以捕捉hotplug cpu信息,在该链的处理函数中,如以下代码片段:
static int __cpuinit
migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    ......
    ......
    switch (action) {
 
    case CPU_UP_PREPARE:
    case CPU_UP_PREPARE_FROZEN:
        p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
        if (IS_ERR(p))
            return NOTIFY_BAD;
        kthread_bind(p, cpu);
        /* Must be high prio: stop_machine expects to yield to it. */
        rq = task_rq_lock(p, &flags);
        __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
        task_rq_unlock(rq, &flags);
        cpu_rq(cpu)->migration_thread = p;
        break;
    ......
    ......
从此可以看到,每个cpu UP时,都会为其创建并绑定一个migration线程,并将其设置为了SCHED_FIFO的实时进程,具有较高的优先级.
该线程的处理函数为migration_thread().代码如下:
/*
 * migration_thread - body of the per-cpu migration kernel thread.
 * @data carries the cpu number.
 *
 * Each pass, under rq->lock: bail out to wait_to_die if the cpu is
 * offline; run active_load_balance() if rq->active_balance is set; then
 * either sleep when rq->migration_queue is empty or dequeue one
 * migration_req, migrate its task and complete the request.
 *
 * Returns 0 once kthread_should_stop() is true.
 */
static int migration_thread(void *data)
{
    int cpu = (long)data;
    struct rq *rq;
 
    rq = cpu_rq(cpu);
    BUG_ON(rq->migration_thread != current);
 
    set_current_state(TASK_INTERRUPTIBLE);
    while (!kthread_should_stop()) {
        struct migration_req *req;
        struct list_head *head;
 
        spin_lock_irq(&rq->lock);
 
        /* The cpu went offline: wait at wait_to_die to be stopped */
        if (cpu_is_offline(cpu)) {
            spin_unlock_irq(&rq->lock);
            goto wait_to_die;
        }
 
        /* active_balance set: an earlier load balance asked for a push */
        if (rq->active_balance) {
            active_load_balance(rq, cpu);
            rq->active_balance = 0;
        }
 
        head = &rq->migration_queue;
 
        /* Empty rq->migration_queue: sleep until woken */
        if (list_empty(head)) {
            spin_unlock_irq(&rq->lock);
            schedule();
            set_current_state(TASK_INTERRUPTIBLE);
            continue;
        }
 
        /*
         * Take the first request off migration_queue and migrate its
         * task.  Per the article, requests are typically queued on
         * execve or when a task's allowed cpus are changed.
         */
        req = list_entry(head->next, struct migration_req, list);
        list_del_init(head->next);
 
        /* Lock dropped here; interrupts are re-enabled after the migrate */
        spin_unlock(&rq->lock);
        __migrate_task(req->task, cpu, req->dest_cpu);
        local_irq_enable();
 
        /* Done: wake whoever is waiting on this request */
        complete(&req->done);
    }
    __set_current_state(TASK_RUNNING);
    return 0;
 
wait_to_die:
    /* Wait for kthread_stop */
    set_current_state(TASK_INTERRUPTIBLE);
    while (!kthread_should_stop()) {
        schedule();
        set_current_state(TASK_INTERRUPTIBLE);
    }
    __set_current_state(TASK_RUNNING);
    return 0;
}
 
4.1:active_load_balance()
先来看这个函数的第一个操作,即active_load_balance().该函数是处理load balance失败的情况(在load_balance()中),代码如下:
/*
 * active_load_balance - push one task from @busiest_rq (the runqueue of
 * @busiest_cpu) to the cpu recorded in busiest_rq->push_cpu.
 *
 * Does nothing when the busy runqueue has at most one runnable task.
 * Both runqueues are locked via double_lock_balance() for the move.
 */
static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
{
    int target_cpu = busiest_rq->push_cpu;
    struct sched_domain *sd;
    struct rq *target_rq;
 
    /* Is there any task to move? */
    /* At most one runnable task left: nothing to balance */
    if (busiest_rq->nr_running <= 1)
        return;
 
    target_rq = cpu_rq(target_cpu);
 
    /*
     * This condition is "impossible", if it occurs
     * we need to fix it. Originally reported by
     * Bjorn Helgaas on a 128-cpu setup.
     */
    /*
     * Per the article: the busiest group and queue chosen during load
     * balance are never the local ones, so source and target differ.
     */
    BUG_ON(busiest_rq == target_rq);
 
    /* move a task from busiest_rq to target_rq */
    double_lock_balance(busiest_rq, target_rq);
    update_rq_clock(busiest_rq);
    update_rq_clock(target_rq);
 
    /* Search for an sd spanning us and the target CPU. */
    /* Per the article, plain SMP has a single base domain */
    for_each_domain(target_cpu, sd) {
        if ((sd->flags & SD_LOAD_BALANCE) &&
            cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
                break;
    }
 
    /* A suitable domain was found */
    if (likely(sd)) {
        schedstat_inc(sd, alb_count);
 
        /* Move a single task from the busy runqueue to the target cpu */
        if (move_one_task(target_rq, target_cpu, busiest_rq,
                  sd, CPU_IDLE))
            schedstat_inc(sd, alb_pushed);
        else
            schedstat_inc(sd, alb_failed);
    }
    double_unlock_balance(busiest_rq, target_rq);
}
从此可以看到,当load balance失败的时候,只会从繁忙队列中移动一个进程到目标cpu上.来看一下具体的迁移过程,即move_one_task(),该函数是以CPU_IDLE参数进行调用的.代码如下:
/*
 * move_one_task - try each scheduling class in turn, starting at
 * sched_class_highest, until one class's move_one_task() hook succeeds.
 *
 * Returns 1 as soon as a hook returns non-zero, 0 if none does.
 */
static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
             struct sched_domain *sd, enum cpu_idle_type idle)
{
    const struct sched_class *class;
 
    for (class = sched_class_highest; class; class = class->next)
        if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
            return 1;
 
    return 0;
}
从此即可以看出,直接调用调度类的move_one_task().在CFS中,该函数为move_one_task_fair().代码如下:
/*
 * move_one_task_fair - move_one_task() hook of the CFS class.
 *
 * Visits every leaf cfs_rq of @busiest and lets iter_move_one_task() try
 * to move a single task out of it.  Returns 1 on the first success,
 * 0 if no cfs_rq yields a movable task.
 */
static int
move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
           struct sched_domain *sd, enum cpu_idle_type idle)
{
    struct cfs_rq *busy_cfs_rq;
    struct rq_iterator cfs_rq_iterator;
 
    cfs_rq_iterator.start = load_balance_start_fair;
    cfs_rq_iterator.next = load_balance_next_fair;
 
    for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
        /*
         * pass busy_cfs_rq argument into
         * load_balance_[start|next]_fair iterators
         */
        cfs_rq_iterator.arg = busy_cfs_rq;
        if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
                       &cfs_rq_iterator))
            return 1;
    }
 
    return 0;
}
在分析CFS组调度的时候,曾经分析过,CPU上的进程组都是挂在该cpu运行队列的leaf_cfs_rq_list队列上的,因此只需要遍历该链表就可以遍历该CPU上的进程组.
在后面用的迭代器是在之前已经分析过了的,这里不再赘述,流程转入到iter_move_one_task():
static int
iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
           struct sched_domain *sd, enum cpu_idle_type idle,
           struct rq_iterator *iterator)
{
    struct task_struct *p = iterator->start(iterator->arg);
    int pinned = 0;
 
    while (p) {
        if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
            pull_task(busiest, p, this_rq, this_cpu);
            /*
             * Right now, this is only the second place pull_task()
             * is called, so we can safely collect pull_task()
             * stats here rather than inside pull_task().
             */
            schedstat_inc(sd, lb_gained[idle]);
 
            return 1;
        }
        p = iterator->next(iterator->arg);
    }
 
    return 0;
}
只要该进程是可以与目标CPU关联的,那么就调用pull_task()与之关联,并且马上返回.该函数中涉及到的子函数在前面都已经分析过了,这里就不做详细分析了.
 
4.2: rq->migration_queue
接下来分析一下挂在rq->migration_queue中的对象的处理.首先我们得要知道是在什么情况下将对象挂到该链表上的.搜索kernel的代码可发现,是在migrate_task()函数中,代码如下:
/*
 * migrate_task - prepare the move of task p to dest_cpu.
 *
 * If p is on a runqueue or currently running, a migration request is filled
 * in and added to the migration_queue of p's runqueue, and 1 is returned.
 *
 * Otherwise only p's cpu field needs updating; that is done directly via
 * set_task_cpu() and 0 is returned.
 */
static int
migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
{
    struct rq *src_rq = task_rq(p);

    if (p->se.on_rq || task_running(src_rq, p)) {
        /* Fill in the request and link it onto the source runqueue. */
        init_completion(&req->done);
        req->task = p;
        req->dest_cpu = dest_cpu;
        list_add(&req->list, &src_rq->migration_queue);

        return 1;
    }

    /*
     * Not queued and not running: it is sufficient to simply update
     * the task's cpu field.
     */
    set_task_cpu(p, dest_cpu);

    return 0;
}
该函数是将进程p移动到dest_cpu上.
 
同时,搜索kernel源代码,发现有两种情况下会调用migrate_task().如下所示:
1:在更改进程所属cpu时:
这种情况下,将进程迁移到新的CPU集上是理所当然的.如下代码片段所示:
/*
 * set_cpus_allowed_ptr (excerpt) - the "......" lines stand for code that is
 * not reproduced here; only the migration path is shown.
 *
 * A destination CPU is chosen with cpumask_any_and(cpu_online_mask, new_mask)
 * and handed to migrate_task(). When migrate_task() returns non-zero the
 * runqueue lock is dropped, the runqueue's migration thread is woken and the
 * caller blocks until req.done is completed, then returns 0.
 */
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
    ......
    ......
    if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
        /* Need help from migration thread: drop lock and wait. */
        task_rq_unlock(rq, &flags);
        wake_up_process(rq->migration_thread);
        /* Block until the request has been completed. */
        wait_for_completion(&req.done);
        tlb_migrate_finish(p->mm);
        return 0;
    }
    ......
    ......
}
如上所示,new_mask表示进程p的新CPU集, cpumask_any_and(cpu_online_mask, new_mask)是指从cpu_online_mask和new_mask的交集中任选一个cpu(一般是序号最小的).
它调用migrate_task()将请求链入到migration_queue链表.然后唤醒该cpu上的migration线程,并且等待操作的完成.
 
2:在execve()时:
在下面的代码片段中:
do_execve() à sched_exec():
/*
 * sched_exec - pick a CPU for the current task and move there if it differs.
 *
 * Asks sched_balance_self() (with SD_BALANCE_EXEC) which CPU to use; per the
 * accompanying text this is the least-loaded CPU of the current scheduling
 * domain. If the answer is not the CPU we are on, the current task is handed
 * to sched_migrate_task().
 */
void sched_exec(void)
{
    int here = get_cpu();
    int target = sched_balance_self(here, SD_BALANCE_EXEC);

    put_cpu();

    /* Nothing to do when the chosen CPU is the one we are already on. */
    if (target == here)
        return;

    sched_migrate_task(current, target);
}
为什么要在execve()的时候调整所在的CPU呢?事实上这时候调整CPU是最合适的,因为它此时占用的内存以及Cache损失是最小的.
sched_balance_self()就是找到当前cpu所在调度域中的负载最轻的CPU.该函数跟我们之前分析的find_busiest_group()的逻辑差不多.这里不做分析了.
流程转入到sched_migrate_task().代码如下:
/*
 * sched_migrate_task - request that task p be moved to dest_cpu.
 *
 * Takes p's runqueue lock, checks that dest_cpu is in p->cpus_allowed and is
 * active, then calls migrate_task(). When migrate_task() returns non-zero the
 * lock is dropped, the runqueue's migration thread is woken and this function
 * blocks until req.done is completed. On every other path it only drops the
 * lock.
 */
static void sched_migrate_task(struct task_struct *p, int dest_cpu)
{
    struct migration_req req;
    unsigned long flags;
    struct rq *rq;
 
    rq = task_rq_lock(p, &flags);
    /* Bail out if dest_cpu is not allowed for p or is not active. */
    if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
        || unlikely(!cpu_active(dest_cpu)))
        goto out;
 
    /* force the process onto the specified CPU */
    /* Build the request and queue it for the migration thread. */
    if (migrate_task(p, dest_cpu, &req)) {
        /* Need to wait for migration thread (might exit: take ref). */
        struct task_struct *mt = rq->migration_thread;
 
        /* Hold a reference across the unlock so mt stays valid for the wakeup. */
        get_task_struct(mt);
        task_rq_unlock(rq, &flags);
        wake_up_process(mt);
        put_task_struct(mt);
        /* Block until the request has been completed. */
        wait_for_completion(&req.done);
 
        return;
    }
out:
    task_rq_unlock(rq, &flags);
}
这个过程跟set_cpus_allowed_ptr()中的处理差不多,请自行结合代码中的注释进行分析.
 
接下来,我们来分析一下,到底migration线程怎么去处理这些请求.处理代码如下:
migration_thread() à __migrate_task():
/*
 * __migrate_task - move task p from src_cpu's runqueue to dest_cpu's.
 *
 * Returns 0 if dest_cpu is not active or is not in p->cpus_allowed.
 * Returns 1 if the task was moved, or if it was no longer on src_cpu when
 * checked under the runqueue locks.
 */
static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
{
    struct rq *rq_dest, *rq_src;
    int ret = 0, on_rq;
 
    /* Checked before any lock is taken; ret is still 0 here. */
    if (unlikely(!cpu_active(dest_cpu)))
        return ret;
 
    rq_src = cpu_rq(src_cpu);
    rq_dest = cpu_rq(dest_cpu);
 
    double_rq_lock(rq_src, rq_dest);
    /* Already moved. */
    /* The task is no longer on src_cpu: treat as done (returns 1). */
    if (task_cpu(p) != src_cpu)
        goto done;
    /* Affinity changed (again). */
    /* dest_cpu is not in the task's allowed mask: give up (returns 0). */
    if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
        goto fail;
 
    /* Move the task: dequeue from the source if queued, retarget, re-enqueue. */
    on_rq = p->se.on_rq;
    if (on_rq)
        deactivate_task(rq_src, p, 0);
 
    set_task_cpu(p, dest_cpu);
    if (on_rq) {
        activate_task(rq_dest, p, 0);
        check_preempt_curr(rq_dest, p, 0);
    }
done:
    ret = 1;
    /* "done" falls through into "fail" so both paths drop the locks. */
fail:
    double_rq_unlock(rq_src, rq_dest);
    return ret;
}
这个过程很简单,就是进程的迁移.请对照代码自行分析,这里就不再赘述了.
 
五:cpuset中遗留的调度域问题
在分析cpuset子系统的时候,遇到了一个与调度域相关的接口partition_sched_domains().在本节中,来对它进行一个详细的分析.代码如下:
/*
 * partition_sched_domains - replace the current set of sched domains.
 *
 * @ndoms_new: number of entries in doms_new (and dattr_new, if non-NULL).
 * @doms_new:  array of cpumasks, one per new domain; NULL selects the
 *             fallback of a single domain built from
 *             cpu_online_mask minus cpu_isolated_map.
 * @dattr_new: per-domain attributes, or NULL.
 *
 * Runs under sched_domains_mutex. Current domains with no equal counterpart
 * in the new set are destroyed, new domains with no equal counterpart in the
 * current set are built, and the globals doms_cur, dattr_cur and ndoms_cur
 * are then updated to the new values. If arch_update_cpu_topology() returns
 * non-zero, nothing is treated as matching, so everything is torn down and
 * rebuilt.
 */
void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                 struct sched_domain_attr *dattr_new)
{
    int i, j, n;
    int new_topology;
 
    mutex_lock(&sched_domains_mutex);
 
    /* always unregister in case we don't destroy any domains */
    unregister_sched_domain_sysctl();
 
    /* Let architecture update cpu core mappings. */
    new_topology = arch_update_cpu_topology();
 
    /* With doms_new == NULL there is nothing to match against below. */
    n = doms_new ? ndoms_new : 0;
 
    /* Destroy deleted domains */
    /*
     * A current domain that also appears, with equal attributes, in the new
     * set is kept as is and does not need to be set up again.
     */
    for (i = 0; i < ndoms_cur; i++) {
        /* Found an equal entry: keep this domain and move on. */
        for (j = 0; j < n && !new_topology; j++) {
            if (cpumask_equal(&doms_cur[i], &doms_new[j])
                && dattrs_equal(dattr_cur, i, dattr_new, j))
                goto match1;
        }
        /* no match - a current sched domain not in new doms_new[] */
        /* No equal entry: release the old domain. */
        detach_destroy_domains(doms_cur + i);
match1:
        ;
    }
 
    /*
     * NOTE(review): the original annotation says doms_new == NULL implies
     * ndoms_new == 1 - confirm against callers; only doms_new[0] is
     * initialised here.
     */
    /*
     * Fallback: put every online CPU that is not isolated into one domain.
     * ndoms_cur is zeroed so the build loop below finds no match.
     */
    if (doms_new == NULL) {
        ndoms_cur = 0;
        doms_new = fallback_doms;
        cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
        WARN_ON_ONCE(dattr_new);
    }
 
    /* Build new domains */
    /* As above, entries that already exist unchanged are not rebuilt. */
    for (i = 0; i < ndoms_new; i++) {
        for (j = 0; j < ndoms_cur && !new_topology; j++) {
            if (cpumask_equal(&doms_new[i], &doms_cur[j])
                && dattrs_equal(dattr_new, i, dattr_cur, j))
                goto match2;
        }
        /* no match - add a new doms_new */
        __build_sched_domains(doms_new + i,
                    dattr_new ? dattr_new + i : NULL);
match2:
        ;
    }
 
    /* Remember the new sched domains */
    /* Free the old arrays and update doms_cur, dattr_cur and ndoms_cur. */
    if (doms_cur != fallback_doms)
        kfree(doms_cur);
    kfree(dattr_cur);   /* kfree(NULL) is safe */
    doms_cur = doms_new;
    dattr_cur = dattr_new;
    ndoms_cur = ndoms_new;
 
    register_sched_domain_sysctl();
 
    mutex_unlock(&sched_domains_mutex);
}
在这个函数中,会传入三个参数:ndoms_new表示调度域的个数;doms_new表示每个调度域中的cpu成员,它是一个struct cpumask数组,有ndoms_new项;dattr_new是每个调度域的属性.关于调度域属性在分析Cpuset的时候分析过了,这里就不再重复了.
在这里,有几个全局量:
ndoms_cur:表示当前系统中的调度域个数
doms_cur:是当前各调度域中的CPU位图
dattr_cur:是当前各调度域中的属性
该接口的逻辑很清晰,而且里面核心的子函数__build_sched_domains()已经在前面详细分析过了,所以这里就不再对这个函数做过多的讲解了.
 
六:小结
SMP负载平衡的过程有的地方还是很晦涩,比如shares值与h_load的调整过程.进程负载的计算过程以及对负载平衡条件的判断也是一个理解的难点.不过,较2.6.9来说,逻辑还是清晰了不少.

你可能感兴趣的:(linux内核)