Linux Scheduler: Load Balancing (Part 2)

4. Scheduling Domains

The SDTL structure

The Linux kernel uses the sched_domain_topology_level (SDTL) structure to organize the hierarchical relationships between CPUs:

struct sched_domain_topology_level {
    sched_domain_mask_f mask;         // function pointer: returns the cpumask bitmap for this SDTL level
    sched_domain_flags_f sd_flags;    // function pointer: returns the scheduling-domain flags for this SDTL level
    int            flags;            
    int            numa_level;
    struct sd_data      data;
#ifdef CONFIG_SCHED_DEBUG
    char                *name;
#endif
};
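
To see how SDTLs are used in practice, here is the kernel's default topology table from kernel/sched/topology.c (shown roughly as it appears in v5.x kernels; the exact entries depend on config options):

/*
 * Default topology table: one SDTL entry per hardware level,
 * ordered from the smallest span (SMT) to the largest (DIE).
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
    { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
    { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
    { cpu_cpu_mask, SD_INIT_NAME(DIE) },
    { NULL, },
};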

Flags

#define SD_BALANCE_NEWIDLE    0x0001    /* Balance when about to become idle */
#define SD_BALANCE_EXEC        0x0002    /* Balance on exec */
#define SD_BALANCE_FORK        0x0004    /* Balance on fork, clone */
#define SD_BALANCE_WAKE        0x0008  /* Balance on wakeup */
#define SD_WAKE_AFFINE        0x0010    /* Wake task to waking CPU */
#define SD_ASYM_CPUCAPACITY    0x0020  /* Domain members have different CPU capacities */
#define SD_SHARE_CPUCAPACITY    0x0040    /* Domain members share CPU capacity */
#define SD_SHARE_POWERDOMAIN    0x0080    /* Domain members share power domain */
#define SD_SHARE_PKG_RESOURCES    0x0100    /* Domain members share CPU pkg resources */
#define SD_SERIALIZE        0x0200    /* Only a single load balancing instance */
#define SD_ASYM_PACKING        0x0400  /* Place busy groups earlier in the domain */
#define SD_PREFER_SIBLING    0x0800    /* Prefer to place tasks in a sibling domain */
#define SD_OVERLAP        0x1000    /* sched_domains of this level overlap */
#define SD_NUMA            0x2000    /* cross-node balancing */
SD_BALANCE_NEWIDLE    perform load balancing when the CPU is about to become idle
SD_BALANCE_EXEC       when a task calls exec(), reselect the best CPU to run it on; see sched_exec()
SD_BALANCE_FORK       select the best CPU for a newly forked task; see wake_up_new_task()
SD_BALANCE_WAKE       perform load balancing on task wakeup; see wake_up_process()
SD_WAKE_AFFINE        the domain supports the wake-affine feature
SD_ASYM_CPUCAPACITY   the domain contains CPUs of different capacities, e.g. big/little cores
SD_SHARE_CPUCAPACITY  CPUs in the domain share execution capacity; describes the SMT level (see the cpu_smt_flags() sketch after this list)
SD_SHARE_POWERDOMAIN  CPUs in the domain share a power domain
SD_SHARE_PKG_RESOURCES CPUs in the domain share caches (package resources)
SD_ASYM_PACKING       describes some SMT-related exceptions: place busy groups earlier in the domain
SD_NUMA               describes the NUMA scheduling level (cross-node balancing)
SD_SERIALIZE          only a single load-balancing instance may run in this domain at a time (used at the NUMA level)
SD_OVERLAP            sched_domains at this level may overlap
SD_PREFER_SIBLING     prefer to place tasks in a sibling domain
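
As an example of how an SDTL's sd_flags function pointer supplies these flags, the SMT level's cpu_smt_flags() (from include/linux/sched/topology.h, shown here as a reference sketch) returns the flags that characterize hyper-threaded siblings:

#ifdef CONFIG_SCHED_SMT
static inline int cpu_smt_flags(void)
{
    /* SMT siblings share both execution capacity and package caches */
    return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
}
#endif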

The scheduling group (struct sched_group) is the smallest unit of load balancing.
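
For reference, the structure itself looks roughly like this (abridged from kernel/sched/sched.h, around v5.10):

struct sched_group {
    struct sched_group  *next;          /* Must be a circular list */
    atomic_t            ref;

    unsigned int        group_weight;
    struct sched_group_capacity *sgc;
    int                 asym_prefer_cpu; /* CPU of highest priority in group */

    /* The CPUs this group covers (variable length, sized at boot) */
    unsigned long       cpumask[];
};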

5. CPU Scheduling-Domain Topology

The CPU scheduling-domain topology is best illustrated by three diagrams (images from the internet):
[Figures 1-3: CPU scheduling-domain topology diagrams; not reproduced here]

6. Load Balancing

The load-balance logic is fairly involved; I have organized the overall flow into the framework diagram below:
[Figure 4: overall load-balance logic framework; image not reproduced here]

The lb_env structure
struct lb_env {
    struct sched_domain    *sd;

    struct rq        *src_rq;
    int            src_cpu;

    int            dst_cpu;
    struct rq        *dst_rq;

    struct cpumask        *dst_grpmask;
    int            new_dst_cpu;
    enum cpu_idle_type    idle;
    long            imbalance;
    /* The set of CPUs under consideration for load-balancing */
    struct cpumask        *cpus;

    unsigned int        flags;

    unsigned int        loop;
    unsigned int        loop_break;
    unsigned int        loop_max;

    enum fbq_type        fbq_type;
    enum migration_type    migration_type;
    struct list_head    tasks;
};
Type                   Member       Purpose
struct sched_domain *  sd           the scheduling domain currently being balanced
int                    dst_cpu      the current CPU (tasks may later be migrated onto it)
struct rq *            dst_rq       the runqueue of the current CPU
struct cpumask *       dst_grpmask  CPU bitmap of the first scheduling group in the current domain
unsigned int           loop_break   migrate at most 32 tasks per batch (sched_nr_migrate_break defaults to 32)
struct cpumask *       cpus         the load_balance_mask bitmap
The find_busiest_group() function

Two data structures come first:

sd_lb_stats describes the statistics of a scheduling domain, and sg_lb_stats describes the statistics of a scheduling group.

/*
 * sd_lb_stats - Structure to store the statistics of a sched_domain
 *         during load balancing.
 */
struct sd_lb_stats {
    struct sched_group *busiest;    /* Busiest group in this sd */
    struct sched_group *local;    /* Local group in this sd */
    unsigned long total_load;    /* Total load of all groups in sd */
    unsigned long total_capacity;    /* Total capacity of all groups in sd */
    unsigned long avg_load;    /* Average load across all groups in sd */
    unsigned int prefer_sibling; /* tasks should go to sibling first */
    struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
    struct sg_lb_stats local_stat;    /* Statistics of the local group */
};

/*
 * sg_lb_stats - stats of a sched_group required for load_balancing
 */
struct sg_lb_stats {
    unsigned long avg_load; /* Avg load across the CPUs of the group */
    unsigned long group_load; /* Total load over the CPUs of the group */
    unsigned long group_capacity;
    unsigned long group_util; /* Total utilization over the CPUs of the group */
    unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
    unsigned int sum_nr_running; /* Nr of tasks running in the group */
    unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
    unsigned int idle_cpus;
    unsigned int group_weight;
    enum group_type group_type;
    unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
    unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
    unsigned int nr_numa_running;
    unsigned int nr_preferred_running;
#endif
};
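
To make the role of these two structures concrete, here is a self-contained toy program (plain C, not kernel code; the names, the data, and the half-the-difference heuristic are all illustrative assumptions) that mimics the core idea of find_busiest_group(): normalize each group's load by its capacity, find the busiest non-local group, and derive an imbalance to pull:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024   /* the kernel's fixed-point "one CPU" */

struct toy_group {
    const char   *name;
    unsigned long group_load;       /* total load of the CPUs in the group */
    unsigned long group_capacity;   /* total capacity of those CPUs */
    int           is_local;         /* group containing the balancing CPU */
};

/* Pick the group with the highest capacity-normalized average load. */
static struct toy_group *toy_find_busiest(struct toy_group *g, int n,
                                          unsigned long *imbalance)
{
    struct toy_group *busiest = NULL, *local = NULL;
    unsigned long busiest_avg = 0, local_avg = 0;

    for (int i = 0; i < n; i++) {
        /* avg_load = group_load * SCHED_CAPACITY_SCALE / group_capacity */
        unsigned long avg = g[i].group_load * SCHED_CAPACITY_SCALE /
                            g[i].group_capacity;
        if (g[i].is_local) {
            local = &g[i];
            local_avg = avg;
        } else if (avg > busiest_avg) {
            busiest = &g[i];
            busiest_avg = avg;
        }
    }

    /* Balanced: no candidate, or we are at least as loaded as it is. */
    if (!busiest || !local || busiest_avg <= local_avg) {
        *imbalance = 0;
        return NULL;
    }

    /* Toy heuristic: aim to pull half of the normalized difference. */
    *imbalance = (busiest_avg - local_avg) / 2;
    return busiest;
}

int main(void)
{
    struct toy_group groups[] = {
        { "local",  2048, 2 * SCHED_CAPACITY_SCALE, 1 },
        { "remote", 6144, 2 * SCHED_CAPACITY_SCALE, 0 },
    };
    unsigned long imbalance;
    struct toy_group *b = toy_find_busiest(groups, 2, &imbalance);

    if (b)
        printf("busiest=%s imbalance=%lu\n", b->name, imbalance);
    else
        printf("already balanced\n");
    return 0;
}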

6.1 How load balancing is triggered

The load-balancing machinery begins with the registration of a softirq.

6.1.1 Softirq registration
start_kernel
    sched_init
        init_sched_fair_class
            open_softirq    // registers SCHED_SOFTIRQ, handler: run_rebalance_domains
    rest_init
        kernel_init         // PID 1
            kernel_init_freeable
                smp_init
                    idle_init
                    "Bringing up secondary CPUs..."
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
    open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

#ifdef CONFIG_NO_HZ_COMMON
    nohz.next_balance = jiffies;
    nohz.next_blocked = jiffies;
    zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
#endif
#endif /* SMP */
}

void open_softirq(int nr, void (*action)(struct softirq_action *))
{
    softirq_vec[nr].action = action;
}

This registers the SCHED_SOFTIRQ softirq; its handler is run_rebalance_domains.
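
For reference, the handler itself looks like this in recent kernels (around v5.10; comments abridged). It first handles any pending nohz kick, then does the normal periodic balance:

static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
{
    struct rq *this_rq = this_rq();
    enum cpu_idle_type idle = this_rq->idle_balance ?
                        CPU_IDLE : CPU_NOT_IDLE;

    /*
     * If this CPU has a pending NOHZ_BALANCE_KICK, do balancing on
     * behalf of the idle CPUs whose ticks are stopped, *before*
     * rebalancing our own domains.
     */
    if (nohz_idle_balance(this_rq, idle))
        return;

    /* normal load balance */
    update_blocked_averages(this_rq->cpu);
    rebalance_domains(this_rq, idle);
}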

6.1.2 Raising the softirq

scheduler_tick() checks on every timer tick whether load balancing is needed.

scheduler_tick
    trigger_load_balance
        raise_softirq
        nohz_balancer_kick
/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 */
void trigger_load_balance(struct rq *rq)
{
    /* Don't need to rebalance while attached to NULL domain */
    if (unlikely(on_null_domain(rq)))
        return;

    if (time_after_eq(jiffies, rq->next_balance)) // check whether the next periodic-balance time has arrived
        raise_softirq(SCHED_SOFTIRQ);

    nohz_balancer_kick(rq);    // what does this function do? (answered below)
}

nohz_balancer_kick() triggers nohz idle balancing, which the next two sections describe in detail. The comment above may look wrong at first glance, since trigger_load_balance() kicks off not only periodic balancing but nohz idle balancing as well. In essence, though, nohz idle balance is just another form of periodic load balancing: once a CPU goes idle it no longer receives ticks, so a busy CPU that still gets ticks triggers the tick balance on its behalf. In practice, both tick balance and nohz idle balance are handled through the SCHED_SOFTIRQ softirq and ultimately execute run_rebalance_domains().

7. Balancing on exec

core.c: sched_exec()
    p->sched_class->select_task_rq()    // per-class implementations in fair.c, rt.c, deadline.c, stop_task.c
/*
 * sched_exec - execve() is a valuable balancing opportunity, because at
 * this point the task has the smallest effective memory and cache footprint.
 */
void sched_exec(void)
{
    struct task_struct *p = current;
    unsigned long flags;
    int dest_cpu;

    raw_spin_lock_irqsave(&p->pi_lock, flags); // take p->pi_lock; migration_cpu_stop below also deals with this lock
    dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); // pick the destination CPU
    if (dest_cpu == smp_processor_id())    // if dest_cpu is the current CPU, just unlock and return
        goto unlock;

    if (likely(cpu_active(dest_cpu))) {
        struct migration_arg arg = { p, dest_cpu };

        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
        stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);    // migrate the task to the destination CPU
        return;
    }
unlock:
    raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}

What exactly does the call stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg) do?

stop_one_cpu() stops a given CPU to run a function on it; the non-SMP and SMP implementations differ. On non-SMP kernels, stop_one_cpu() boils down to: disable preemption, run the function, re-enable preemption.
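
The !CONFIG_SMP stub in include/linux/stop_machine.h (shown roughly as it appears in v5.x kernels) makes that explicit:

static inline int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
    int ret = -ENOENT;

    preempt_disable();
    if (cpu == smp_processor_id())
        ret = fn(arg);      /* on UP, "that CPU" can only be this CPU */
    preempt_enable();
    return ret;
}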

On SMP, stop_one_cpu() is implemented as shown below. It returns only after fn has finished executing, but there is no guarantee that the CPU stays online the whole time fn runs.

int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
    struct cpu_stop_done done;
    struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };

    cpu_stop_init_done(&done, 1);
    if (!cpu_stop_queue_work(cpu, &work))
        return -ENOENT;
    /*
     * In case @cpu == smp_processor_id() we can avoid a sleep+wakeup
     * cycle by doing a preemption:
     */
    cond_resched();
    wait_for_completion(&done.completion);
    return done.ret;
}

The migration_cpu_stop() function

The interesting part is the two lock acquisitions, p->pi_lock and rq->lock, and the check they guard; why exactly that check is needed is not obvious at first (see the comment in the code below).

/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
    struct migration_arg *arg = data;
    struct task_struct *p = arg->task;
    struct rq *rq = this_rq();
    struct rq_flags rf;

    /*
     * The original target CPU might have gone down and we might
     * be on another CPU but it doesn't matter.
     */
    local_irq_disable();
    /*
     * We need to explicitly wake pending tasks before running
     * __migrate_task() such that we will not miss enforcing cpus_ptr
     * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
     */
    flush_smp_call_function_from_idle();

    raw_spin_lock(&p->pi_lock);
    rq_lock(rq, &rf);
    /*
     * Why must task_rq(p) == rq for the migration to go ahead? rq is the
     * runqueue of the CPU currently executing this stopper, and its
     * rq->lock is held; if task_rq(p) != rq, couldn't we just take both
     * rq->locks and migrate anyway? A plausible answer: the stopper only
     * holds this CPU's rq->lock, so if the task has meanwhile moved to
     * another CPU, touching it here without that CPU's rq->lock would be
     * unsafe, and since the task has already moved, the migration can
     * simply be skipped.
     *
     * If task_rq(p) != rq, it cannot be migrated here, because we're
     * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
     * we're holding p->pi_lock.
     */
    if (task_rq(p) == rq) { // p's rq must be this CPU's rq for the migration to proceed
        if (task_on_rq_queued(p)) // and p must be queued on that rq
            rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
        else
            p->wake_cpu = arg->dest_cpu;
    }
    rq_unlock(rq, &rf);
    raw_spin_unlock(&p->pi_lock);

    local_irq_enable();
    return 0;
}
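
The actual move is done by __migrate_task() (abridged from kernel/sched/core.c, around v5.10), which re-checks affinity and then moves the task onto the destination runqueue:

static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
                                 struct task_struct *p, int dest_cpu)
{
    /* Affinity may have changed (again) since dest_cpu was chosen. */
    if (!is_cpu_allowed(p, dest_cpu))
        return rq;

    update_rq_clock(rq);
    /* Dequeue from the source rq, switch rq locks, enqueue on dest. */
    rq = move_queued_task(rq, rf, p, dest_cpu);

    return rq;
}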

8. Balancing on fork

_do_fork
    wake_up_new_task
        select_task_rq
/*
 * wake_up_new_task - wake up a newly created task for the first time.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
void wake_up_new_task(struct task_struct *p)
{
    struct rq_flags rf;
    struct rq *rq;

    raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
    p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
    /*
     * Fork balancing, do it here and not earlier because:
     *  - cpus_ptr can change in the fork path
     *  - any previously selected CPU might disappear through hotplug
     *
     * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
     * as we're not fully set-up yet.
     */
    p->recent_used_cpu = task_cpu(p);
    rseq_migrate(p);
    __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
    rq = __task_rq_lock(p, &rf);
    update_rq_clock(rq);
    post_init_entity_util_avg(p);

    activate_task(rq, p, ENQUEUE_NOCLOCK);
    trace_sched_wakeup_new(p);
    check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
    if (p->sched_class->task_woken) {
        /*
         * Nothing relies on rq->lock after this, so its fine to
         * drop it.
         */
        rq_unpin_lock(rq, &rf);
        p->sched_class->task_woken(rq, p);
        rq_repin_lock(rq, &rf);
    }
#endif
    task_rq_unlock(rq, p, &rf);
}
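
Both the fork and exec paths funnel into select_task_rq() in core.c (shown roughly as in v5.10 kernels), which dispatches to the task's scheduling class and falls back if the chosen CPU is not allowed:

/*
 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
 */
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
    lockdep_assert_held(&p->pi_lock);

    if (p->nr_cpus_allowed > 1)
        cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
    else
        cpu = cpumask_any(p->cpus_ptr);

    /*
     * In order not to call set_task_cpu() on a blocking task we need
     * to rely on ttwu() to place the task on a valid ->cpus_ptr CPU.
     */
    if (unlikely(!is_cpu_allowed(p, cpu)))
        cpu = select_fallback_rq(task_cpu(p), p);

    return cpu;
}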
