Modern machines have multi-core processors, and the operating system spreads runnable tasks across those cores. But how does it distribute the work evenly, and how and when does it adjust that placement?
I do not understand most of this material yet; this pass is just to get the overall picture, and I will analyze it in detail when I have time.
In Linux, periodic rebalancing of load across processors is driven by a softirq:
__init void init_sched_fair_class(void)
{
#ifdef CONFIG_SMP
	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);	/* register the softirq bottom half */

#ifdef CONFIG_NO_HZ
	nohz.next_balance = jiffies;
	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
	cpu_notifier(sched_ilb_notifier, 0);
#endif
#endif /* SMP */
}
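The snippet above only registers the handler; something still has to raise the softirq. As far as I can tell (a rough sketch of the same-era kernel code, simplified by me and not quoted from the sources pasted here), scheduler_tick() calls trigger_load_balance(), which raises SCHED_SOFTIRQ once the runqueue's next_balance deadline has passed:

/* Sketch: how SCHED_SOFTIRQ gets raised (approximate 3.x-era code, simplified). */
void scheduler_tick(void)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);

	/* ... clock update, curr->sched_class->task_tick(), ... */
#ifdef CONFIG_SMP
	rq->idle_balance = idle_cpu(cpu);
	trigger_load_balance(rq, cpu);		/* may raise SCHED_SOFTIRQ */
#endif
}

void trigger_load_balance(struct rq *rq, int cpu)
{
	/* Rebalance only when the per-rq deadline has passed and the cpu
	 * is attached to a real sched domain. (The NO_HZ "kick" of an idle
	 * balancer cpu is handled here as well; omitted for brevity.) */
	if (time_after_eq(jiffies, rq->next_balance) &&
	    likely(!on_null_domain(cpu)))
		raise_softirq(SCHED_SOFTIRQ);
}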
/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
 */
static void run_rebalance_domains(struct softirq_action *h)
{
	int this_cpu = smp_processor_id();
	struct rq *this_rq = cpu_rq(this_cpu);
	enum cpu_idle_type idle = this_rq->idle_balance ?
						CPU_IDLE : CPU_NOT_IDLE;

	rebalance_domains(this_cpu, idle);

	/*
	 * If this cpu has a pending nohz_balance_kick, then do the
	 * balancing on behalf of the other idle cpus whose ticks are
	 * stopped.
	 */
	nohz_idle_balance(this_cpu, idle);
}

rebalance_domains() is what eventually calls load_balance():
/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in arch_init_sched_domains.
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
	int balance = 1;
	struct rq *rq = cpu_rq(cpu);
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize;

	update_shares(cpu);

	rcu_read_lock();
	for_each_domain(cpu, sd) {	/* walk this cpu's domain and its parent domains */
		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;	/* this domain has opted out of load balancing */

		interval = sd->balance_interval;	/* the domain's balancing period */
		if (idle != CPU_IDLE)
			interval *= sd->busy_factor;	/* stretch the period when this cpu is busy */

		/* scale ms to jiffies */
		interval = msecs_to_jiffies(interval);
		interval = clamp(interval, 1UL, max_load_balance_interval);

		need_serialize = sd->flags & SD_SERIALIZE;
		if (need_serialize) {
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			/* this domain is due for balancing */
			if (load_balance(cpu, rq, sd, idle, &balance)) {
				/*
				 * We've pulled tasks over so either we're no
				 * longer idle.
				 */
				idle = CPU_NOT_IDLE;
			}
			sd->last_balance = jiffies;
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}

		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!balance)
			break;
	}
	rcu_read_unlock();

	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;
}

I still do not fully understand how the per-cpu load is analyzed here and how the rebalancing decision is actually made; this code needs a closer look.
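To make the interval arithmetic above concrete, here is a small standalone sketch I wrote (user-space, compilable; HZ, balance_interval, busy_factor and the clamp ceiling are made-up illustrative values, not taken from the kernel sources):

/* Standalone sketch of the rebalance_domains() interval math.
 * All numbers below are illustrative assumptions. */
#include <stdio.h>

#define HZ 250					/* assumed tick rate */
#define MSECS_TO_JIFFIES(ms) ((ms) * HZ / 1000)

int main(void)
{
	unsigned long balance_interval = 64;	/* ms, per-domain base period */
	unsigned long busy_factor = 32;		/* stretch factor on a busy cpu */
	unsigned long max_load_balance_interval = HZ / 10 * 4;	/* e.g. 4 online cpus */
	int cpu_is_idle = 0;

	unsigned long interval = balance_interval;
	if (!cpu_is_idle)
		interval *= busy_factor;	/* busy cpus rebalance less often */

	interval = MSECS_TO_JIFFIES(interval);	/* convert ms to jiffies */
	if (interval < 1)
		interval = 1;
	if (interval > max_load_balance_interval)
		interval = max_load_balance_interval;	/* the clamp() in the kernel */

	printf("balance every %lu jiffies (~%lu ms)\n",
	       interval, interval * 1000 / HZ);
	return 0;
}

With these numbers the raw period would be 64 ms * 32 = 2048 ms, but the clamp caps it at 100 jiffies (about 400 ms), which shows why a busy cpu ends up balancing far less often than an idle one.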
/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *balance)
{
	int ld_moved, cur_ld_moved, active_balance = 0;
	int lb_iterations, max_lb_iterations;
	struct sched_group *group;
	struct rq *busiest;
	unsigned long flags;
	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

	struct lb_env env = {
		.sd		= sd,
		.dst_cpu	= this_cpu,
		.dst_rq		= this_rq,
		.dst_grpmask	= sched_group_cpus(sd->groups),
		.idle		= idle,
		.loop_break	= sched_nr_migrate_break,
		.cpus		= cpus,
	};

	cpumask_copy(cpus, cpu_active_mask);
	max_lb_iterations = cpumask_weight(env.dst_grpmask);

	schedstat_inc(sd, lb_count[idle]);

redo:
	group = find_busiest_group(&env, balance);

	if (*balance == 0)
		goto out_balanced;

	if (!group) {
		schedstat_inc(sd, lb_nobusyg[idle]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(&env, group);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[idle]);
		goto out_balanced;
	}

	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[idle], env.imbalance);

	ld_moved = 0;
	lb_iterations = 1;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		env.flags |= LBF_ALL_PINNED;
		env.src_cpu   = busiest->cpu;
		env.src_rq    = busiest;
		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);

		update_h_load(env.src_cpu);
more_balance:
		local_irq_save(flags);
		double_rq_lock(this_rq, busiest);

		/*
		 * cur_ld_moved - load moved in current iteration
		 * ld_moved     - cumulative load moved across iterations
		 */
		cur_ld_moved = move_tasks(&env);
		ld_moved += cur_ld_moved;
		double_rq_unlock(this_rq, busiest);
		local_irq_restore(flags);

		if (env.flags & LBF_NEED_BREAK) {
			env.flags &= ~LBF_NEED_BREAK;
			goto more_balance;
		}

		/*
		 * some other cpu did the load balance for us.
		 */
		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
			resched_cpu(env.dst_cpu);

		/*
		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
		 * us and move them to an alternate dst_cpu in our sched_group
		 * where they can run. The upper limit on how many times we
		 * iterate on same src_cpu is dependent on number of cpus in our
		 * sched_group.
		 *
		 * This changes load balance semantics a bit on who can move
		 * load to a given_cpu. In addition to the given_cpu itself
		 * (or a ilb_cpu acting on its behalf where given_cpu is
		 * nohz-idle), we now have balance_cpu in a position to move
		 * load to given_cpu. In rare situations, this may cause
		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
		 * _independently_ and at _same_ time to move some load to
		 * given_cpu) causing excess load to be moved to given_cpu.
		 * This however should not happen so much in practice and
		 * moreover subsequent load balance cycles should correct the
		 * excess load moved.
		 */
		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
				lb_iterations++ < max_lb_iterations) {

			this_rq		 = cpu_rq(env.new_dst_cpu);
			env.dst_rq	 = this_rq;
			env.dst_cpu	 = env.new_dst_cpu;
			env.flags	&= ~LBF_SOME_PINNED;
			env.loop	 = 0;
			env.loop_break	 = sched_nr_migrate_break;
			/*
			 * Go back to "more_balance" rather than "redo" since we
			 * need to continue with same src_cpu.
			 */
			goto more_balance;
		}

		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(env.flags & LBF_ALL_PINNED)) {
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			if (!cpumask_empty(cpus)) {
				env.loop = 0;
				env.loop_break = sched_nr_migrate_break;
				goto redo;
			}
			goto out_balanced;
		}
	}

	if (!ld_moved) {
		schedstat_inc(sd, lb_failed[idle]);
		/*
		 * Increment the failure counter only on periodic balance.
		 * We do not want newidle balance, which can be very
		 * frequent, pollute the failure counter causing
		 * excessive cache_hot migrations and active balances.
		 */
		if (idle != CPU_NEWLY_IDLE)
			sd->nr_balance_failed++;

		if (need_active_balance(&env)) {
			raw_spin_lock_irqsave(&busiest->lock, flags);

			/* don't kick the active_load_balance_cpu_stop,
			 * if the curr task on busiest cpu can't be
			 * moved to this_cpu
			 */
			if (!cpumask_test_cpu(this_cpu,
					tsk_cpus_allowed(busiest->curr))) {
				raw_spin_unlock_irqrestore(&busiest->lock,
							    flags);
				env.flags |= LBF_ALL_PINNED;
				goto out_one_pinned;
			}

			/*
			 * ->active_balance synchronizes accesses to
			 * ->active_balance_work. Once set, it's cleared
			 * only after active load balance is finished.
			 */
			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			raw_spin_unlock_irqrestore(&busiest->lock, flags);

			if (active_balance) {
				stop_one_cpu_nowait(cpu_of(busiest),
					active_load_balance_cpu_stop, busiest,
					&busiest->active_balance_work);
			}

			/*
			 * We've kicked active balancing, reset the failure
			 * counter.
			 */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
		}
	} else
		sd->nr_balance_failed = 0;

	if (likely(!active_balance)) {
		/* We were unbalanced, so reset the balancing interval */
		sd->balance_interval = sd->min_interval;
	} else {
		/*
		 * If we've begun active balancing, start to back off. This
		 * case may not be covered by the all_pinned logic if there
		 * is only 1 task on the busy runqueue (because we don't call
		 * move_tasks).
		 */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}

	goto out;

out_balanced:
	schedstat_inc(sd, lb_balanced[idle]);

	sd->nr_balance_failed = 0;

out_one_pinned:
	/* tune up the balancing interval */
	if (((env.flags & LBF_ALL_PINNED) &&
			sd->balance_interval < MAX_PINNED_INTERVAL) ||
			(sd->balance_interval < sd->max_interval))
		sd->balance_interval *= 2;

	ld_moved = 0;
out:
	return ld_moved;
}
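As a reading aid, here is a toy user-space model I wrote of the pull idea behind load_balance(): find the busiest runqueue, compute the imbalance relative to the local one, and pull roughly half the difference. It ignores everything the real code actually weighs (weighted load, sched groups and domains, task affinity, locking), so treat it only as a mental model:

/* Toy model of pull-based balancing; all numbers are made up. */
#include <stdio.h>

#define NR_CPUS 4

int main(void)
{
	int nr_running[NR_CPUS] = { 6, 1, 2, 1 };	/* tasks per cpu */
	int this_cpu = 1;				/* the cpu doing the balancing */

	/* find_busiest_queue(), toy version: the cpu with the most tasks */
	int busiest = 0;
	for (int cpu = 1; cpu < NR_CPUS; cpu++)
		if (nr_running[cpu] > nr_running[busiest])
			busiest = cpu;

	/* "imbalance": how much we would like to pull over to this_cpu */
	int imbalance = (nr_running[busiest] - nr_running[this_cpu]) / 2;

	/* move_tasks(), toy version: migrate that many tasks */
	if (busiest != this_cpu && imbalance > 0) {
		nr_running[busiest]  -= imbalance;
		nr_running[this_cpu] += imbalance;
		printf("pulled %d tasks from cpu%d to cpu%d\n",
		       imbalance, busiest, this_cpu);
	}

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d: %d tasks\n", cpu, nr_running[cpu]);
	return 0;
}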
Balancing is not only periodic. Whenever a process calls sleep, usleep, poll, epoll, pause, or any other operation that may block, the kernel ends up calling schedule(); and when try_to_wake_up() wakes a thread or process it has to choose which CPU it will run on, which also involves load balancing.
/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */
void idle_balance(int this_cpu, struct rq *this_rq)
{
	struct sched_domain *sd;
	int pulled_task = 0;
	unsigned long next_balance = jiffies + HZ;

	this_rq->idle_stamp = this_rq->clock;

	if (this_rq->avg_idle < sysctl_sched_migration_cost)
		return;

	/*
	 * Drop the rq->lock, but keep IRQ/preempt disabled.
	 */
	raw_spin_unlock(&this_rq->lock);

	update_shares(this_cpu);
	rcu_read_lock();
	for_each_domain(this_cpu, sd) {
		unsigned long interval;
		int balance = 1;

		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		if (sd->flags & SD_BALANCE_NEWIDLE) {
			/* If we've pulled tasks over stop searching: */
			pulled_task = load_balance(this_cpu, this_rq,
						   sd, CPU_NEWLY_IDLE, &balance);
		}

		interval = msecs_to_jiffies(sd->balance_interval);
		if (time_after(next_balance, sd->last_balance + interval))
			next_balance = sd->last_balance + interval;
		if (pulled_task) {
			this_rq->idle_stamp = 0;
			break;
		}
	}
	rcu_read_unlock();

	raw_spin_lock(&this_rq->lock);

	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
		/*
		 * We are going idle. next_balance may be set based on
		 * a busy processor. So reset next_balance.
		 */
		this_rq->next_balance = next_balance;
	}
}
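For context, the call site sits in __schedule(): roughly (my simplified paraphrase of the same-era kernel, most details omitted), when the runqueue is about to run out of runnable tasks, idle_balance() gets a chance to pull work over before the idle task is picked:

/* Sketch of the call site in __schedule() (approximate, heavily simplified). */
static void __sched __schedule(void)
{
	struct task_struct *prev, *next;
	struct rq *rq;
	int cpu;

	cpu = smp_processor_id();
	rq = cpu_rq(cpu);
	prev = rq->curr;

	/* ... deactivate prev if it is going to sleep ... */

	pre_schedule(rq, prev);

	if (unlikely(!rq->nr_running))
		idle_balance(cpu, rq);		/* try to pull tasks before going idle */

	put_prev_task(rq, prev);
	next = pick_next_task(rq);

	/* ... context_switch(rq, prev, next) ... */
}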
CPU load balancing also happens in try_to_wake_up():
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
	...
	cpu = task_cpu(p);
	...
#ifdef CONFIG_SMP
	/*
	 * If the owning (remote) cpu is still in the middle of schedule() with
	 * this task as prev, wait until its done referencing the task.
	 */
	while (p->on_cpu) {
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
		/*
		 * In case the architecture enables interrupts in
		 * context_switch(), we cannot busy wait, since that
		 * would lead to deadlocks when an interrupt hits and
		 * tries to wake up @prev. So bail and do a complete
		 * remote wakeup.
		 */
		if (ttwu_activate_remote(p, wake_flags))
			goto stat;
#else
		cpu_relax();
#endif
	}
	...
	p->sched_contributes_to_load = !!task_contributes_to_load(p);
	p->state = TASK_WAKING;

	if (p->sched_class->task_waking)
		p->sched_class->task_waking(p);

	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
	if (task_cpu(p) != cpu) {
		wake_flags |= WF_MIGRATED;
		set_task_cpu(p, cpu);
	}
#endif /* CONFIG_SMP */

	ttwu_queue(p, cpu);
stat:
	ttwu_stat(p, cpu, wake_flags);
out:
	raw_spin_unlock_irqrestore(&p->pi_lock, flags);

	return success;
}

If the CPU chosen by select_task_rq() differs from task_cpu(p), wake_flags gets WF_MIGRATED and set_task_cpu(p, cpu) migrates the task to the new CPU before it is queued.
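select_task_rq() is where the per-class CPU choice is made. Roughly (again a simplified paraphrase of the same-era kernel, not quoted from this post), it delegates to the scheduling class — for CFS that is select_task_rq_fair(), which weighs waking the task close to the waker against placing it on an idler CPU — and then falls back if the chosen CPU is not allowed or offline:

/* Sketch of select_task_rq() (approximate, simplified). */
static inline
int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
{
	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);

	/* Fall back to an allowed, online cpu if the class picked a cpu
	 * this task may not run on. */
	if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
		     !cpu_online(cpu)))
		cpu = select_fallback_rq(task_cpu(p), p);

	return cpu;
}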
My understanding so far: cpu (from task_cpu(p)) is the CPU the process was last running on before it slept, while this_cpu would be the target CPU, i.e. the CPU actually executing this function. Is that the right way to read it?