Linux CFS load balancing

The previous article covered load balancing for the RT scheduler. Load balancing in the CFS scheduler is more complex; below we walk through it step by step for normal (fair-class) tasks.


Load balancing for normal tasks is triggered in the following situations:
1. The current task leaves the TASK_RUNNING state (goes to sleep or exits) and the corresponding run_queue has no runnable task left. Load balancing is then triggered to try to pull a task over from another run_queue. Call path: idle_balance -> load_balance.
2. On every timer tick, a balancing pass is started to detect and resolve imbalance in the system.
Call path: scheduler_tick -> trigger_load_balance -> raise_softirq(SCHED_SOFTIRQ) -> run_rebalance_domains -> rebalance_domains -> load_balance.


On each timer tick, the kernel checks whether the current CPU's run queue is due for its next balance pass; if so, it raises the SCHED_SOFTIRQ softirq, whose handler ends up calling rebalance_domains.
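
A minimal sketch of that tick-side check, modeled on the v3.10-era trigger_load_balance (the nohz idle-balance kick is omitted here):

/*
 * Called from scheduler_tick. If this rq's next balance time has passed
 * and the cpu is attached to a real sched domain, raise the softirq
 * whose handler is run_rebalance_domains.
 */
void trigger_load_balance(struct rq *rq, int cpu)
{
	if (time_after_eq(jiffies, rq->next_balance) &&
	    likely(!on_null_domain(cpu)))
		raise_softirq(SCHED_SOFTIRQ);
}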


Before diving into the code, let's pin down a few concepts: scheduling domains, scheduling groups, and multi-core topology. Two earlier articles cover them:


http://blog.csdn.net/u014089131/article/details/54581332


http://blog.csdn.net/u014089131/article/details/54023726


Because of CPU caches, task migration should happen in the lowest-level domain possible: the higher the domain level, the higher the migration cost, and that cost can cancel out the benefit of load balancing.
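
This bottom-up walk is exactly what the for_each_domain iterator used below expresses; roughly, as in v3.10-era kernels (RCU annotations trimmed):

/* walk from the cpu's lowest-level domain up through its parents */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)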


Now let's look at the code itself:

/*
Scan each sched_domain level from the bottom up.
When a domain's balance interval has elapsed, run load_balance on it.
Each level has its own interval, and the higher the level, the longer the
interval, because migrating tasks across it costs more and more.
*/
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
	int balance = 1;
	struct rq *rq = cpu_rq(cpu);
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize;

	update_blocked_averages(cpu);

	rcu_read_lock();
	//tasks run on the lowest-level cpu unit, so walk the domains bottom-up
	for_each_domain(cpu, sd) {
		//checked bottom-up: skip domains that opted out of load balancing
		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;
		//each domain level has its own balance interval
		interval = sd->balance_interval;
		if (idle != CPU_IDLE)
			interval *= sd->busy_factor;

		/* scale ms to jiffies */
		interval = msecs_to_jiffies(interval);
		interval = clamp(interval, 1UL, max_load_balance_interval);

		need_serialize = sd->flags & SD_SERIALIZE;

		if (need_serialize) {
			if (!spin_trylock(&balancing))
				goto out;
		}
		//jiffies >= sd->last_balance + interval: time to balance again
		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &balance)) {
				/*
				 * The LBF_SOME_PINNED logic could have changed
				 * env->dst_cpu, so we can't know our idle
				 * state even if we migrated tasks. Update it.
				 */
				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
			}
			//update last_balance
			sd->last_balance = jiffies;
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}

		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!balance)
			break;
	}
	rcu_read_unlock();

	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;
}

/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
/*
This function balances load among the scheduling groups of one scheduling
domain. It proceeds as follows:
1. Find the busiest group.
2. Find the busiest cpu within that group.
3. Migrate tasks from that busiest cpu to this cpu, returning the number of
   tasks moved.
*/
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *balance)

{
	int ld_moved, cur_ld_moved, active_balance = 0;
	struct sched_group *group;
	struct rq *busiest = NULL;
	unsigned long flags;
	struct cpumask *cpus = __get_cpu_var(load_balance_mask);

	struct lb_env env = {
		.sd			= sd,
		.dst_cpu		= this_cpu,
		.dst_rq			= this_rq,
		.dst_grpmask    	= sched_group_cpus(sd->groups), //group cpu mask
		.idle			= idle,
		.busiest_nr_running 	= 0,
		.busiest_grp_capacity 	= 0,
		.loop_break		= sched_nr_migrate_break,
		.cpus			= cpus,
		.flags			= 0,
		.loop			= 0,
	};

	/*
	 * For NEWLY_IDLE load_balancing, we don't need to consider
	 * other cpus in our group
	 */
	//the current cpu just became idle; no need to consider the load of other cpus in its group
	if (idle == CPU_NEWLY_IDLE)
		env.dst_grpmask = NULL;

	cpumask_copy(cpus, cpu_active_mask);

	per_cpu(dbs_boost_load_moved, this_cpu) = 0;
	schedstat_inc(sd, lb_count[idle]);

redo:
	//find the busiest group in the domain
	group = find_busiest_group(&env, balance);

	if (*balance == 0)
		goto out_balanced;

	if (!group) {
		schedstat_inc(sd, lb_nobusyg[idle]);
		goto out_balanced;
	}
	//find the busiest rq in the group, which identifies the busiest cpu
	busiest = find_busiest_queue(&env, group);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[idle]);
		goto out_balanced;
	}

	BUG_ON(busiest == env.dst_rq);

	schedstat_add(sd, lb_imbalance[idle], env.imbalance);

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		env.flags |= LBF_ALL_PINNED;
		env.src_cpu   = busiest->cpu;
		env.src_rq    = busiest;
		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);//max number of tasks to migrate

		update_h_load(env.src_cpu);
more_balance:
		local_irq_save(flags);
		double_rq_lock(env.dst_rq, busiest);

		/* The world might have changed. Validate assumptions */
		//if the busiest rq has at most one runnable task there is nothing to move; tasks may have stopped since the earlier check
		if (busiest->nr_running <= 1) {
			double_rq_unlock(env.dst_rq, busiest);
			local_irq_restore(flags);
			env.flags &= ~LBF_ALL_PINNED;
			goto no_move;
		}

		/*
		 * cur_ld_moved - load moved in the current iteration
		 * ld_moved     - cumulative load moved across iterations
		 */
		//migrate the tasks
		cur_ld_moved = move_tasks(&env);
		ld_moved += cur_ld_moved;
		double_rq_unlock(env.dst_rq, busiest);
		local_irq_restore(flags);

		/*
		 * some other cpu did the load balance for us.
		 */
		if (cur_ld_moved && env.dst_cpu != smp_processor_id())
			resched_cpu(env.dst_cpu);

		if (env.flags & LBF_NEED_BREAK) {
			env.flags &= ~LBF_NEED_BREAK;
			goto more_balance;
		}

		/*
		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
		 * us and move them to an alternate dst_cpu in our sched_group
		 * where they can run. The upper limit on how many times we
		 * iterate on same src_cpu is dependent on number of cpus in our
		 * sched_group.
		 *
		 * This changes load balance semantics a bit on who can move
		 * load to a given_cpu. In addition to the given_cpu itself
		 * (or a ilb_cpu acting on its behalf where given_cpu is
		 * nohz-idle), we now have balance_cpu in a position to move
		 * load to given_cpu. In rare situations, this may cause
		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
		 * _independently_ and at _same_ time to move some load to
		 * given_cpu) causing excess load to be moved to given_cpu.
		 * This however should not happen so much in practice and
		 * moreover subsequent load balance cycles should correct the
		 * excess load moved.
		 */
		if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {

			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
			env.dst_cpu	 = env.new_dst_cpu;
			env.flags	&= ~LBF_SOME_PINNED;
			env.loop	 = 0;
			env.loop_break	 = sched_nr_migrate_break;

			/* Prevent to re-select dst_cpu via env's cpus */
			cpumask_clear_cpu(env.dst_cpu, env.cpus);

			/*
			 * Go back to "more_balance" rather than "redo" since we
			 * need to continue with same src_cpu.
			 */
			goto more_balance;
		}

		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(env.flags & LBF_ALL_PINNED)) {
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			if (!cpumask_empty(cpus)) {
				env.loop = 0;
				env.loop_break = sched_nr_migrate_break;
				goto redo;
			}
			goto out_balanced;
		}
	}

no_move:
	if (!ld_moved) {
		if (!(env.flags & LBF_HMP_ACTIVE_BALANCE))
			schedstat_inc(sd, lb_failed[idle]);

		/*
		 * Increment the failure counter only on periodic balance.
		 * We do not want newidle balance, which can be very
		 * frequent, pollute the failure counter causing
		 * excessive cache_hot migrations and active balances.
		 */
		if (idle != CPU_NEWLY_IDLE &&
		    !(env.flags & LBF_HMP_ACTIVE_BALANCE))
			sd->nr_balance_failed++;

		if (need_active_balance(&env)) {
			raw_spin_lock_irqsave(&busiest->lock, flags);

			/* don't kick the active_load_balance_cpu_stop,
			 * if the curr task on busiest cpu can't be
			 * moved to this_cpu
			 */
			if (!cpumask_test_cpu(this_cpu,
					tsk_cpus_allowed(busiest->curr))) {
				raw_spin_unlock_irqrestore(&busiest->lock,
							    flags);
				env.flags |= LBF_ALL_PINNED;
				goto out_one_pinned;
			}

			/*
			 * ->active_balance synchronizes accesses to
			 * ->active_balance_work.  Once set, it's cleared
			 * only after active load balance is finished.
			 */
			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			raw_spin_unlock_irqrestore(&busiest->lock, flags);

			if (active_balance) {
				stop_one_cpu_nowait(cpu_of(busiest),
					active_load_balance_cpu_stop, busiest,
					&busiest->active_balance_work);
				ld_moved++;
			}

			/*
			 * We've kicked active balancing, reset the failure
			 * counter.
			 */
			sd->nr_balance_failed =
			    sd->cache_nice_tries +
			    NEED_ACTIVE_BALANCE_THRESHOLD - 1;
		}
	} else {
		sd->nr_balance_failed = 0;
		if (per_cpu(dbs_boost_needed, this_cpu)) {
			struct migration_notify_data mnd;

			mnd.src_cpu = cpu_of(busiest);
			mnd.dest_cpu = this_cpu;
			mnd.load = per_cpu(dbs_boost_load_moved, this_cpu);
			if (mnd.load > 100)
				mnd.load = 100;
			atomic_notifier_call_chain(&migration_notifier_head,
						   0, (void *)&mnd);
			per_cpu(dbs_boost_needed, this_cpu) = false;
			per_cpu(dbs_boost_load_moved, this_cpu) = 0;

		}

		/* Assumes one 'busiest' cpu that we pulled tasks from */
		if (!same_freq_domain(this_cpu, cpu_of(busiest))) {
			check_for_freq_change(this_rq);
			check_for_freq_change(busiest);
		}
	}
	if (likely(!active_balance)) {
		/* We were unbalanced, so reset the balancing interval */
		sd->balance_interval = sd->min_interval;
	} else {
		/*
		 * If we've begun active balancing, start to back off. This
		 * case may not be covered by the all_pinned logic if there
		 * is only 1 task on the busy runqueue (because we don't call
		 * move_tasks).
		 */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}

	goto out;

out_balanced:
	schedstat_inc(sd, lb_balanced[idle]);

	sd->nr_balance_failed = 0;

out_one_pinned:
	/* tune up the balancing interval */
	if (((env.flags & LBF_ALL_PINNED) &&
			sd->balance_interval < MAX_PINNED_INTERVAL) ||
			(sd->balance_interval < sd->max_interval))
		sd->balance_interval *= 2;

	ld_moved = 0;
out:
	trace_sched_load_balance(this_cpu, idle, *balance,
				 group ? group->cpumask[0] : 0,
				 busiest ? busiest->nr_running : 0,
				 env.imbalance, env.flags, ld_moved,
				 sd->balance_interval);
	return ld_moved;
}
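
When no tasks could be pulled, the code above kicks a stopper thread on the busiest cpu via active_load_balance_cpu_stop. A heavily simplified sketch of that callback, modeled on the mainline version (revalidation and the HMP details are elided):

/*
 * Runs as a stopper task on the busiest cpu, so that cpu's current task
 * is preempted and can itself be pushed toward push_cpu.
 */
static int active_load_balance_cpu_stop(void *data)
{
	struct rq *busiest_rq = data;
	int target_cpu = busiest_rq->push_cpu;
	struct rq *target_rq = cpu_rq(target_cpu);

	raw_spin_lock_irq(&busiest_rq->lock);
	/* ... bail out if the request is stale or nr_running <= 1 ... */
	BUG_ON(busiest_rq == target_rq);
	double_lock_balance(busiest_rq, target_rq);
	/* ask the fair class to move one task toward target_cpu */
	/* ... move_one_task()-style walk over busiest_rq->cfs_tasks ... */
	double_unlock_balance(busiest_rq, target_rq);
	busiest_rq->active_balance = 0;
	raw_spin_unlock_irq(&busiest_rq->lock);
	return 0;
}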

static struct sched_group *
find_busiest_group(struct lb_env *env, int *balance)
{
	struct sd_lb_stats sds;

	memset(&sds, 0, sizeof(sds));

	/*
	 * Compute the various statistics relevant for load balancing at
	 * this level.
	 */
	 //update the sched_domain/sched_group statistics and find the busiest
	 //group in the domain; the checks below decide whether that group
	 //really warrants balancing
	update_sd_lb_stats(env, balance, &sds);

	/*
	 * this_cpu is not the appropriate cpu to perform load balancing at
	 * this level.
	 */
	if (!(*balance))
		goto ret;

	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
	    check_asym_packing(env, &sds))
		return sds.busiest;

	/* There is no busy sibling group to pull tasks from */
	if (!sds.busiest || sds.busiest_nr_running == 0)
		goto out_balanced;

	if (env->flags & LBF_HMP_ACTIVE_BALANCE)
		goto force_balance;

	//if the group's task count exceeds the spill limit, keep balancing; otherwise report that no balancing is needed
	if (bail_inter_cluster_balance(env, &sds))
		goto out_balanced;

	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;

	/*
	 * If the busiest group is imbalanced the below checks don't
	 * work because they assumes all things are equal, which typically
	 * isn't true due to cpus_allowed constraints and the like.
	 */
	if (sds.group_imb)
		goto force_balance;

	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
			!sds.busiest_has_capacity)
		goto force_balance;

	/*
	 * If the local group is more busy than the selected busiest group
	 * don't try and pull any tasks.
	 */
	//the local group carries the heavier load
	if (sds.this_load >= sds.max_load)
		goto out_balanced;

	/*
	 * Don't pull any tasks if this group is already above the domain
	 * average load.
	 */
	if (sds.this_load >= sds.avg_load)
		goto out_balanced;

	if (env->idle == CPU_IDLE) {
		/*
		 * This cpu is idle. If the busiest group load doesn't
		 * have more tasks than the number of available cpu's and
		 * there is no imbalance between this and busiest group
		 * wrt to idle cpu's, it is balanced.
		 */
		//if this group has no more idle cpus than the busiest group
		//(within one) and the busiest group is not running more tasks
		//than it has cpus, consider the domain balanced
		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
		    sds.busiest_nr_running <= sds.busiest_group_weight)
			goto out_balanced;
	} else {
		/*
		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
		 * imbalance_pct to be conservative.
		 */
		if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
			goto out_balanced;
	}

force_balance:
	/* Looks like there is an imbalance. Compute it */
	 //compute how imbalanced the domain's groups are
	calculate_imbalance(env, &sds);
	return sds.busiest;

out_balanced:
ret:
	env->imbalance = 0;
	return NULL;
}

//First, the relevant data structure:
struct sd_lb_stats {
	struct sched_group *busiest; /* Busiest group in this sd */
	struct sched_group *this;  /* Local group in this sd */
	unsigned long total_load;  /* Total load of all groups in sd */
	unsigned long total_pwr;   /*	Total power of all groups in sd */
	unsigned long avg_load;	   /* Average load across all groups in sd */

	/** Statistics of this group */
	unsigned long this_load; //load of the local group
	unsigned long this_load_per_task; //per-task load in the local group
	unsigned long this_nr_running; //number of running tasks in the local group
	unsigned long this_has_capacity;
	unsigned long this_group_capacity;
	unsigned int  this_idle_cpus;

	/* Statistics of the busiest group */
	unsigned int  busiest_idle_cpus;
	unsigned long max_load; //load of the busiest group
	unsigned long busiest_load_per_task; //average per-task load in the busiest group
	unsigned long busiest_nr_running; //number of running tasks in the busiest group
#ifdef CONFIG_SCHED_HMP
	unsigned long busiest_nr_small_tasks;
	unsigned long busiest_nr_big_tasks;
	u64 busiest_scaled_load;
#endif
	unsigned long busiest_group_capacity; //number of tasks the group can hold
	unsigned long busiest_has_capacity;
	unsigned int  busiest_group_weight;

	int group_imb; /* Is there imbalance in this sd */
};
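
update_sd_lb_stats below fills a per-group companion structure, sg_lb_stats, for each group it visits. Its field list, reconstructed from the v3.10-era code (the HMP build adds per-size task counters mirroring the ones above), looks roughly like:

struct sg_lb_stats {
	unsigned long avg_load;		/* group load per unit of cpu power */
	unsigned long group_load;	/* total load of the group */
	unsigned long sum_nr_running;	/* number of runnable tasks */
	unsigned long sum_weighted_load; /* weighted load of the group's tasks */
	unsigned long group_capacity;	/* number of tasks the group can hold */
	unsigned long idle_cpus;	/* idle cpus in the group */
	unsigned long group_weight;	/* number of cpus in the group */
	int group_imb;			/* is there an imbalance in the group? */
	int group_has_capacity;		/* does the group have spare capacity? */
};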
/**
 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
 * @env: The load balancing environment.
 * @balance: Should we balance.
 * @sds: variable to hold the statistics for this sched_domain.
 */
 //domain->group->cpu
static inline void update_sd_lb_stats(struct lb_env *env,
					int *balance, struct sd_lb_stats *sds)
{
	struct sched_domain *child = env->sd->child;
	struct sched_group *sg = env->sd->groups;
	struct sg_lb_stats sgs;
	int load_idx, prefer_sibling = 0;

	if (child && child->flags & SD_PREFER_SIBLING)
		prefer_sibling = 1;

	load_idx = get_sd_load_idx(env->sd, env->idle);

	do {
		int local_group;

		//is dst_cpu part of this group?
		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
		memset(&sgs, 0, sizeof(sgs));
		update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
		//bail out if this cpu is not the one that should balance at this level
		if (local_group && !(*balance))
			return;

		sds->total_load += sgs.group_load;
		sds->total_pwr += sg->sgp->power;

		/*
		 * In case the child domain prefers tasks go to siblings
		 * first, lower the sg capacity to one so that we'll try
		 * and move all the excess tasks away. We lower the capacity
		 * of a group only if the local group has the capacity to fit
		 * these excess tasks, i.e. nr_running < group_capacity. The
		 * extra check prevents the case where you always pull from the
		 * heaviest group when it is already under-utilized (possible
		 * with a large weight task outweighs the tasks on the system).
		 */
		if (prefer_sibling && !local_group && sds->this_has_capacity)
			sgs.group_capacity = min(sgs.group_capacity, 1UL);
		//if this is the local group (the one containing dst_cpu):
		if (local_group) {
			sds->this_load = sgs.avg_load;
			sds->this = sg;
			sds->this_nr_running = sgs.sum_nr_running;
			sds->this_load_per_task = sgs.sum_weighted_load;
			sds->this_has_capacity = sgs.group_has_capacity;
			sds->this_idle_cpus = sgs.idle_cpus;
			sds->this_group_capacity = sgs.group_capacity;
		} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {//other groups: keep a running max to track the busiest one
			sds->max_load = sgs.avg_load;
			sds->busiest = sg;
			env->busiest_nr_running = sds->busiest_nr_running
							= sgs.sum_nr_running;
			sds->busiest_idle_cpus = sgs.idle_cpus;
			env->busiest_grp_capacity = sds->busiest_group_capacity
							= sgs.group_capacity;
			sds->busiest_load_per_task = sgs.sum_weighted_load;
			sds->busiest_has_capacity = sgs.group_has_capacity;
			sds->busiest_group_weight = sgs.group_weight;
			sds->group_imb = sgs.group_imb;
#ifdef CONFIG_SCHED_HMP
			sds->busiest_nr_small_tasks = sgs.sum_nr_small_tasks;
			sds->busiest_nr_big_tasks = sgs.sum_nr_big_tasks;
			sds->busiest_scaled_load = sgs.group_cpu_load;
#endif
		}

		sg = sg->next;
	} while (sg != env->sd->groups);//loop over every group in the domain
}
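
The "running max" above is implemented by update_sd_pick_busiest; the mainline v3.10 version is roughly the following (the HMP variant layers extra checks on top):

static bool update_sd_pick_busiest(struct lb_env *env,
				   struct sd_lb_stats *sds,
				   struct sched_group *sg,
				   struct sg_lb_stats *sgs)
{
	if (sgs->avg_load <= sds->max_load)
		return false;	/* not busier than the current candidate */

	if (sgs->sum_nr_running > sgs->group_capacity)
		return true;	/* running above capacity: busiest so far */

	if (sgs->group_imb)
		return true;

	/*
	 * ASYM_PACKING prefers packing work onto lower-numbered cpus,
	 * so also pick groups that sit "above" dst_cpu.
	 */
	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
	    env->dst_cpu < group_first_cpu(sg)) {
		if (!sds->busiest)
			return true;
		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
			return true;
	}

	return false;
}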

static int
bail_inter_cluster_balance(struct lb_env *env, struct sd_lb_stats *sds)
{
	int nr_cpus, local_cpu, busiest_cpu;

	local_cpu = group_first_cpu(sds->this);//first cpu of the domain's local group
	busiest_cpu = group_first_cpu(sds->busiest);//first cpu of the domain's busiest group
	//cpu capacity differs across the topology (e.g. big vs little clusters)
	if (cpu_capacity(local_cpu) <= cpu_capacity(busiest_cpu))
		return 0;

	if (sds->busiest_nr_big_tasks)
		return 0;

	//number of cpus in the busiest group
	nr_cpus = cpumask_weight(sched_group_cpus(sds->busiest));
	//spill limits on scaled load and task count
	if ((sds->busiest_scaled_load < nr_cpus * sched_spill_load) &&
		(sds->busiest_nr_running <
			nr_cpus * sysctl_sched_spill_nr_run)) {
			return 1;
	}

	return 0;
}

/**
 * calculate_imbalance - Calculate the amount of imbalance present within the
 *			 groups of a given sched_domain during load balance.
 * @env: load balance environment
 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 */
 //compute how imbalanced the domain's groups are
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
	unsigned long max_pull, load_above_capacity = ~0UL;

	sds->busiest_load_per_task /= sds->busiest_nr_running;
	if (sds->group_imb) {
		sds->busiest_load_per_task =
			min(sds->busiest_load_per_task, sds->avg_load);
	}

	/*
	 * In the presence of smp nice balancing, certain scenarios can have
	 * max load less than avg load(as we skip the groups at or below
	 * its cpu_power, while calculating max_load..)
	 */
	//if the busiest group's load is below the average, treat this as a small imbalance
	if (sds->max_load < sds->avg_load) {
		env->imbalance = 0;
		return fix_small_imbalance(env, sds);
	}

	if (!sds->group_imb) {
		/*
		 * Don't want to pull so many tasks that a group would go idle.
		 */
		//number of tasks above the group's capacity
		load_above_capacity = (sds->busiest_nr_running -
						sds->busiest_group_capacity);

		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);

		load_above_capacity /= sds->busiest->sgp->power;
	}

	/*
	 * We're trying to get all the cpus to the average_load, so we don't
	 * want to push ourselves above the average load, nor do we wish to
	 * reduce the max loaded cpu below the average load. At the same time,
	 * we also don't want to reduce the group load below the group capacity
	 * (so that we can implement power-savings policies etc). Thus we look
	 * for the minimum possible imbalance.
	 * Be careful of negative numbers as they'll appear as very large values
	 * with unsigned longs.
	 */
	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);

	/* How much load to actually move to equalise the imbalance */
	env->imbalance = min(max_pull * sds->busiest->sgp->power,
		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
			/ SCHED_POWER_SCALE;

	/*
	 * if *imbalance is less than the average load per runnable task
	 * there is no guarantee that any tasks will be moved so we'll have
	 * a think about bumping its value to force at least one task to be
	 * moved
	 */
	if (env->imbalance < sds->busiest_load_per_task)
		return fix_small_imbalance(env, sds);

}
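
To make the formula concrete, here is a toy calculation with made-up numbers (purely hypothetical, not taken from any real system); it compiles as a standalone program:

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL

int main(void)
{
	/* hypothetical domain statistics */
	unsigned long max_load = 1536, avg_load = 1024, this_load = 512;
	unsigned long busiest_power = 1024, this_power = 1024;
	unsigned long load_above_capacity = ~0UL;	/* no capacity overflow */

	/* don't pull the busiest group below the average ... */
	unsigned long max_pull = max_load - avg_load;		/* 512 */
	/* ... and don't pull more than the capacity overflow */
	if (load_above_capacity < max_pull)
		max_pull = load_above_capacity;

	/* ... nor push the local group above the average */
	unsigned long a = max_pull * busiest_power;
	unsigned long b = (avg_load - this_load) * this_power;
	unsigned long imbalance = (a < b ? a : b) / SCHED_POWER_SCALE;

	printf("imbalance = %lu\n", imbalance);			/* prints 512 */
	return 0;
}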

/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
//linear scan to find the busiest rq in the group
static struct rq *find_busiest_queue(struct lb_env *env,
				     struct sched_group *group)
{
	struct rq *busiest = NULL, *rq;
	unsigned long max_load = 0;
	int i;

	if (sched_enable_hmp)
		return find_busiest_queue_hmp(env, group);
	
	//scan the cpus in the group, keeping track of the busiest one
	for_each_cpu(i, sched_group_cpus(group)) {
		//rq->cpu_power reflects the cpu's compute capacity; sched_init initializes it to SCHED_LOAD_SCALE
		unsigned long power = power_of(i);//cpu power
		unsigned long capacity = DIV_ROUND_CLOSEST(power,
							SCHED_POWER_SCALE);
		unsigned long wl;

		if (!capacity)
			capacity = fix_small_capacity(env->sd, group);

		if (!cpumask_test_cpu(i, env->cpus))
			continue;

		rq = cpu_rq(i);
		wl = weighted_cpuload(i);//the cpu rq's load

		/*
		 * When comparing with imbalance, use weighted_cpuload()
		 * which is not scaled with the cpu power.
		 */
		if (capacity && rq->nr_running == 1 && wl > env->imbalance)
			continue;

		/*
		 * For the load comparisons with the other cpu's, consider
		 * the weighted_cpuload() scaled with the cpu power, so that
		 * the load can be moved away from the cpu that is potentially
		 * running at a lower capacity.
		 */
		wl = (wl * SCHED_POWER_SCALE) / power;

		if (wl > max_load) {
			max_load = wl;
			busiest = rq;
		}
	}

	return busiest;
}
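
The two helpers used above are simple accessors; in v3.10-era kernels they are essentially:

/* unscaled runnable load weight of the cpu's runqueue */
static unsigned long weighted_cpuload(const int cpu)
{
	return cpu_rq(cpu)->load.weight;
}

/* the cpu's compute capacity as tracked by the scheduler */
static unsigned long power_of(int cpu)
{
	return cpu_rq(cpu)->cpu_power;
}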

/*
Loop over the cfs rq's task list, moving at most env->loop_max tasks to the dst cpu.
*/
static int move_tasks(struct lb_env *env)
{
	struct list_head *tasks = &env->src_rq->cfs_tasks;
	struct task_struct *p;
	unsigned long load;
	int pulled = 0;
	int orig_loop = env->loop;

	if (env->imbalance <= 0)
		return 0;

	env->flags |= LBF_IGNORE_PREFERRED_CLUSTER_TASKS;
	if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu))
		env->flags |= LBF_IGNORE_SMALL_TASKS;
	else if (cpu_capacity(env->dst_cpu) < cpu_capacity(env->src_cpu) &&
							!sched_boost())
		env->flags |= LBF_IGNORE_BIG_TASKS;

redo:
	while (!list_empty(tasks)) {
		p = list_first_entry(tasks, struct task_struct, se.group_node);

		env->loop++;
		/* We've more or less seen every task there is, call it quits */
		if (env->loop > env->loop_max)
			break;

		/* take a breather every nr_migrate tasks */
		if (env->loop > env->loop_break) {
			env->loop_break += sched_nr_migrate_break;
			env->flags |= LBF_NEED_BREAK;
			break;
		}
		//if p cannot be migrated, move it to the tail of the cfs task list
		if (!can_migrate_task(p, env))
			goto next;

		load = task_h_load(p);

		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
			goto next;

		if ((load / 2) > env->imbalance)
			goto next;

		move_task(p, env);
		pulled++;
		env->imbalance -= load;
		per_cpu(dbs_boost_load_moved, env->dst_cpu) += pct_task_load(p);

#ifdef CONFIG_PREEMPT
		/*
		 * NEWIDLE balancing is a source of latency, so preemptible
		 * kernels will stop after the first task is pulled to minimize
		 * the critical section.
		 */
		if (env->idle == CPU_NEWLY_IDLE)
			break;
#endif

		/*
		 * We only want to steal up to the prescribed amount of
		 * weighted load.
		 */
		if (env->imbalance <= 0)
			break;

		continue;
next:
		list_move_tail(&p->se.group_node, tasks);
	}

	if (env->flags & (LBF_IGNORE_SMALL_TASKS | LBF_IGNORE_BIG_TASKS |
LBF_IGNORE_PREFERRED_CLUSTER_TASKS)
							     && !pulled) {
		tasks = &env->src_rq->cfs_tasks;
		env->flags &= ~(LBF_IGNORE_SMALL_TASKS | LBF_IGNORE_BIG_TASKS |
LBF_IGNORE_PREFERRED_CLUSTER_TASKS);
		env->loop = orig_loop;
		goto redo;
	}

	/*
	 * Right now, this is one of only two places move_task() is called,
	 * so we can safely collect move_task() stats here rather than
	 * inside move_task().
	 */
	schedstat_add(env->sd, lb_gained[env->idle], pulled);

	return pulled;
}
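
move_task itself is a short helper; in the v3.10 source it is essentially:

static void move_task(struct task_struct *p, struct lb_env *env)
{
	deactivate_task(env->src_rq, p, 0);	/* dequeue from the source rq */
	set_task_cpu(p, env->dst_cpu);		/* re-home the task */
	activate_task(env->dst_rq, p, 0);	/* enqueue on the destination rq */
	check_preempt_curr(env->dst_rq, p, 0);	/* may preempt dst's current task */
}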

/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
 //check whether task p can be migrated to the current cpu;
 //four kinds of tasks are skipped. Returns 0 if p cannot migrate, 1 if it can.
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
	int tsk_cache_hot = 0;
	int twf;
	/*
	 * We do not migrate tasks that are:
	 * 1) throttled_lb_pair, or
	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
	 * 3) running (obviously), or
	 * 4) are cache-hot on their current CPU.
	 */
	 //is the src/dst cpu pair throttled?
	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
		return 0;
	//a higher-capacity dst cpu prefers to take big tasks
	if (cpu_capacity(env->dst_cpu) > cpu_capacity(env->src_cpu) &&
		nr_big_tasks(env->src_rq) && !is_big_task(p))
		return 0;
	
	//skip small tasks when asked to
	if (env->flags & LBF_IGNORE_SMALL_TASKS && is_small_task(p))
		return 0;

	twf = task_will_fit(p, env->dst_cpu);

	/*
	 * Attempt to not pull tasks that don't fit. We may get lucky and find
	 * one that actually fits.
	 */
	 //skip big tasks that would not fit on the dst cpu
	if (env->flags & LBF_IGNORE_BIG_TASKS && !twf)
		return 0;

	if (env->flags & LBF_IGNORE_PREFERRED_CLUSTER_TASKS &&
			!preferred_cluster(cpu_rq(env->dst_cpu)->cluster, p))
		return 0;

	/*
	 * Group imbalance can sometimes cause work to be pulled across groups
	 * even though the group could have managed the imbalance on its own.
	 * Prevent inter-cluster migrations for big tasks when the number of
	 * tasks is lower than the capacity of the group.
	 */
	if (!twf && env->busiest_nr_running <= env->busiest_grp_capacity)
		return 0;

	//is dst_cpu in p's allowed cpumask? 0 means it is not
	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
		int cpu;

		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);

		/*
		 * Remember if this task can be migrated to any other cpu in
		 * our sched_group. We may want to revisit it if we couldn't
		 * meet load balance goals by pulling other tasks on src_cpu.
		 *
		 * Also avoid computing new_dst_cpu if we have already computed
		 * one in current iteration.
		 */
		if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
			return 0;

		/* Prevent to re-select dst_cpu via env's cpus */
		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
				env->flags |= LBF_SOME_PINNED;
				env->new_dst_cpu = cpu;
				break;
			}
		}

		return 0;
	}

	/* Record that we found at least one task that could run on dst_cpu */
	env->flags &= ~LBF_ALL_PINNED;

	if (task_running(env->src_rq, p)) {
		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
		return 0;
	}

	/*
	 * Aggressive migration if:
	 * 1) IDLE or NEWLY_IDLE balance.
	 * 2) task is cache cold, or
	 * 3) too many balance attempts have failed.
	 */

	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
	if (env->idle != CPU_NOT_IDLE || !tsk_cache_hot ||
		env->sd->nr_balance_failed > env->sd->cache_nice_tries) {

		if (tsk_cache_hot) {
			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
			schedstat_inc(p, se.statistics.nr_forced_migrations);
		}

		return 1;
	}

	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
	return 0;
}
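
The cache-hot test in case 4 is task_hot; in v3.10 it is roughly the following, driven by the sysctl_sched_migration_cost knob (0.5ms by default):

static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
{
	s64 delta;

	if (p->sched_class != &fair_sched_class)
		return 0;

	if (unlikely(p->policy == SCHED_IDLE))
		return 0;

	/* buddy candidates (the next/last picks) are treated as cache hot */
	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
	    (&p->se == cfs_rq_of(&p->se)->next ||
	     &p->se == cfs_rq_of(&p->se)->last))
		return 1;

	if (sysctl_sched_migration_cost == -1)
		return 1;
	if (sysctl_sched_migration_cost == 0)
		return 0;

	delta = now - p->se.exec_start;

	return delta < (s64)sysctl_sched_migration_cost;
}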

