Let's start with the comment in the source:
/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using stop_one_cpu().
 * 2) stopper starts to run (implicitly forcing the migrated thread off the CPU;
 *    once the stopper thread runs, nothing else on that CPU does)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes it and puts it into the right queue.
 * 5) stopper completes and stop_one_cpu() returns and the migration is done.
 */
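Before the code, a word on the mechanism: stop_one_cpu() queues a callback for the per-CPU stopper thread ("migration/N", which runs in the highest-priority stop scheduling class) and blocks until that callback has finished on the chosen CPU. A minimal illustrative sketch of its use (hypothetical kernel-module code, not part of the scheduler; say_hello_on_cpu and demo are made-up names):

#include <linux/stop_machine.h>
#include <linux/smp.h>
#include <linux/printk.h>

/* runs on the chosen CPU with every other task there forced off */
static int say_hello_on_cpu(void *arg)
{
    pr_info("stopper on CPU%d, arg=%ld\n", smp_processor_id(), (long)arg);
    return 0;
}

static void demo(void)
{
    /* blocks until CPU 1 has executed say_hello_on_cpu() */
    stop_one_cpu(1, say_hello_on_cpu, (void *)42L);
}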
migrate_task_to
int migrate_task_to(struct task_struct *p, int target_cpu)
{
    struct migration_arg arg = { p, target_cpu };
    int curr_cpu = task_cpu(p); // "curr" is the CPU the task is on, not necessarily the CPU running this function

    if (curr_cpu == target_cpu) // already on the target CPU, nothing to migrate
        return 0;

    if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) // check CPU affinity: may the task run on the target CPU at all?
        return -EINVAL;

    /* TODO: This is not properly updating schedstats */

    trace_sched_move_numa(p, curr_cpu, target_cpu); // tracepoint
    return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); // stop the task's current CPU and run migration_cpu_stop() there
}
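The migration_arg bundle handed to the stopper is tiny; in kernel/sched/core.c of this era it is simply:

struct migration_arg {
    struct task_struct *task;
    int dest_cpu;
};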
migration_cpu_stop
static int migration_cpu_stop(void *data)
{
    struct migration_arg *arg = data;
    struct task_struct *p = arg->task;
    struct rq *rq = this_rq();

    local_irq_disable();
    sched_ttwu_pending();

    raw_spin_lock(&p->pi_lock);
    raw_spin_lock(&rq->lock);
    if (task_rq(p) == rq) { // only act if the task is still on this CPU's rq; it may have moved in the meantime
        if (task_on_rq_queued(p)) // task is runnable and queued on the rq: migrate it right now
            rq = __migrate_task(rq, p, arg->dest_cpu);
        else // task is not queued (e.g. sleeping): just record the CPU it should wake up on
            p->wake_cpu = arg->dest_cpu;
    }
    raw_spin_unlock(&rq->lock);
    raw_spin_unlock(&p->pi_lock);

    local_irq_enable();
    return 0;
}
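The else branch only records where the task should go; the next wakeup is what actually uses it. In try_to_wake_up() of this era, CPU selection starts from p->wake_cpu, roughly like this (abridged sketch):

    cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
    if (task_cpu(p) != cpu) {
        wake_flags |= WF_MIGRATED;
        set_task_cpu(p, cpu);
    }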
__migrate_task
static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
{
    if (unlikely(!cpu_active(dest_cpu))) // destination CPU is not active (e.g. being hot-unplugged): give up
        return rq;

    /* Affinity changed (again). */
    if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) // re-check CPU affinity; it may have changed since migrate_task_to()
        return rq;

    rq = move_queued_task(rq, p, dest_cpu);
    return rq;
}
move_queued_task
static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
{
    lockdep_assert_held(&rq->lock);

    p->on_rq = TASK_ON_RQ_MIGRATING; // mark the task (not the CPU) as in flight between runqueues
    dequeue_task(rq, p, 0); // dispatches to the scheduling class's dequeue callback
    set_task_cpu(p, new_cpu); // record the new CPU in the task
    raw_spin_unlock(&rq->lock);

    rq = cpu_rq(new_cpu); // switch to the destination CPU's rq

    raw_spin_lock(&rq->lock);
    BUG_ON(task_cpu(p) != new_cpu);
    enqueue_task(rq, p, 0); // insert the task into the destination rq
    p->on_rq = TASK_ON_RQ_QUEUED; // back to the normal queued state
    check_preempt_curr(rq, p, 0);

    return rq;
}
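Note that p->on_rq is a small state machine rather than a plain boolean. The constants come from the scheduler's internal header (kernel/sched/sched.h in kernels of this era):

/* task_struct::on_rq states (0 means not on any runqueue) */
#define TASK_ON_RQ_QUEUED       1   /* runnable and queued on some rq */
#define TASK_ON_RQ_MIGRATING    2   /* temporarily detached while being moved between rqs */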
dequeue_task (fair-class implementation: dequeue_task_fair)
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se;
    int task_sleep = flags & DEQUEUE_SLEEP;

    for_each_sched_entity(se) { // walks up the group-scheduling hierarchy; see the macro sketch after this function
        cfs_rq = cfs_rq_of(se);
        dequeue_entity(cfs_rq, se, flags); // remove this se from its cfs_rq

        /*
         * end evaluation on encountering a throttled cfs_rq
         *
         * note: in the case of encountering a throttled cfs_rq we will
         * post the final h_nr_running decrement below.
         */
        if (cfs_rq_throttled(cfs_rq)) // CFS bandwidth control
            break;
        cfs_rq->h_nr_running--;

        /* Don't dequeue parent if it has other entities besides us */
        if (cfs_rq->load.weight) {
            /* Avoid re-evaluating load for this entity: */
            se = parent_entity(se);
            /*
             * Bias pick_next to pick a task from this cfs_rq, as
             * p is sleeping when it is within its sched_slice.
             */
            if (task_sleep && se && !throttled_hierarchy(cfs_rq))
                set_next_buddy(se);
            break;
        }
        flags |= DEQUEUE_SLEEP;
    }

    for_each_sched_entity(se) { // without group scheduling se is NULL here, so this loop never runs
        cfs_rq = cfs_rq_of(se);
        cfs_rq->h_nr_running--;

        if (cfs_rq_throttled(cfs_rq))
            break;

        update_load_avg(se, 1);
        update_cfs_shares(cfs_rq);
    }

    if (!se)
        sub_nr_running(rq, 1);

    hrtick_update(rq);
}
enqueue_task (fair-class implementation: enqueue_task_fair)
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se;

    /*
     * If in_iowait is set, the code below may not trigger any cpufreq
     * utilization updates, so do it here explicitly with the IOWAIT flag
     * passed.
     */
    if (p->in_iowait)
        cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);

    for_each_sched_entity(se) { // walks up the group-scheduling hierarchy, as in dequeue_task_fair()
        if (se->on_rq)
            break;
        cfs_rq = cfs_rq_of(se);
        enqueue_entity(cfs_rq, se, flags); // put this se on its cfs_rq

        /*
         * end evaluation on encountering a throttled cfs_rq
         *
         * note: in the case of encountering a throttled cfs_rq we will
         * post the final h_nr_running increment below.
         */
        if (cfs_rq_throttled(cfs_rq)) // CFS bandwidth control
            break;
        cfs_rq->h_nr_running++;

        flags = ENQUEUE_WAKEUP;
    }

    for_each_sched_entity(se) { // again only reached with group scheduling
        cfs_rq = cfs_rq_of(se);
        cfs_rq->h_nr_running++;

        if (cfs_rq_throttled(cfs_rq))
            break;

        update_load_avg(se, 1);
        update_cfs_shares(cfs_rq);
    }

    if (!se)
        add_nr_running(rq, 1);

    hrtick_update(rq);
}
dequeue_entity
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    /*
     * Update run-time statistics of the 'current'.
     */
    update_curr(cfs_rq);
    dequeue_entity_load_avg(cfs_rq, se); // PELT load-average accounting, consumed by load balancing

    update_stats_dequeue(cfs_rq, se, flags);

    clear_buddies(cfs_rq, se); // drop any next/last/skip buddy hints pointing at this se

    if (se != cfs_rq->curr) // the currently running entity is not kept in the rbtree, so only non-curr entities are erased
        __dequeue_entity(cfs_rq, se); // remove se from the rbtree
    se->on_rq = 0; // the entity is no longer on a runqueue
    account_entity_dequeue(cfs_rq, se); // subtract the entity's weight from the cfs_rq totals

    /*
     * Normalize after update_curr(); which will also have moved
     * min_vruntime if @se is the one holding it back. But before doing
     * update_min_vruntime() again, which will discount @se's position and
     * can move min_vruntime forward still more.
     */
    if (!(flags & DEQUEUE_SLEEP))
        se->vruntime -= cfs_rq->min_vruntime; // make vruntime relative so it can be re-based on the destination cfs_rq

    /* return excess runtime on last dequeue */
    return_cfs_rq_runtime(cfs_rq);

    update_cfs_shares(cfs_rq);

    /*
     * Now advance min_vruntime if @se was the entity holding it back,
     * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
     * put back on, and if we advance min_vruntime, we'll be placed back
     * further than we started -- ie. we'll be penalized.
     */
    if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
        update_min_vruntime(cfs_rq); // the entity is gone; min_vruntime of this cfs_rq may now advance
}
enqueue_entity
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
    bool curr = cfs_rq->curr == se;

    /*
     * If we're the current task, we must renormalise before calling
     * update_curr().
     */
    if (renorm && curr)
        se->vruntime += cfs_rq->min_vruntime; // re-base the relative vruntime onto this cfs_rq

    update_curr(cfs_rq); // update the current entity's runtime statistics

    /*
     * Otherwise, renormalise after, such that we're placed at the current
     * moment in time, instead of some random moment in the past. Being
     * placed in the past could significantly boost this task to the
     * fairness detriment of existing tasks.
     */
    if (renorm && !curr)
        se->vruntime += cfs_rq->min_vruntime;

    enqueue_entity_load_avg(cfs_rq, se); // PELT load-average accounting
    account_entity_enqueue(cfs_rq, se); // add the entity's weight to the cfs_rq totals
    update_cfs_shares(cfs_rq);

    if (flags & ENQUEUE_WAKEUP)
        place_entity(cfs_rq, se, 0); // give a freshly woken task a bounded vruntime credit (see the sketch after this function)

    check_schedstat_required();
    update_stats_enqueue(cfs_rq, se, flags);
    check_spread(cfs_rq, se);
    if (!curr) // the currently running entity stays out of the rbtree
        __enqueue_entity(cfs_rq, se); // insert se into the rbtree
    se->on_rq = 1; // the entity is on a runqueue again

    if (cfs_rq->nr_running == 1) {
        list_add_leaf_cfs_rq(cfs_rq);
        check_enqueue_throttle(cfs_rq);
    }
}
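The "compensation" mentioned at the place_entity() call is the sleeper credit: a woken task may restart at most about half a scheduling latency behind min_vruntime, and never ahead of where it already was. In kernels of this era the function looks roughly like this (lightly abridged sketch):

static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
    u64 vruntime = cfs_rq->min_vruntime;

    if (initial && sched_feat(START_DEBIT))     /* new tasks start one slice behind */
        vruntime += sched_vslice(cfs_rq, se);

    if (!initial) {                             /* sleepers: bounded credit */
        unsigned long thresh = sysctl_sched_latency;

        if (sched_feat(GENTLE_FAIR_SLEEPERS))   /* halve the credit by default */
            thresh >>= 1;

        vruntime -= thresh;
    }

    /* never let the entity gain time by being placed backwards */
    se->vruntime = max_vruntime(se->vruntime, vruntime);
}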
__dequeue_entity
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    if (cfs_rq->rb_leftmost == &se->run_node) {
        struct rb_node *next_node;

        // keep the cached leftmost (smallest-vruntime) node up to date
        next_node = rb_next(&se->run_node);
        cfs_rq->rb_leftmost = next_node;
    }

    rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
__enqueue_entity
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
    struct rb_node *parent = NULL;
    struct sched_entity *entry;
    int leftmost = 1;

    /*
     * Find the right place in the rbtree:
     */
    while (*link) {
        parent = *link;
        entry = rb_entry(parent, struct sched_entity, run_node);
        /*
         * We don't care about collisions. Nodes with
         * the same key stay together.
         */
        if (entity_before(se, entry)) {
            link = &parent->rb_left;
        } else {
            link = &parent->rb_right;
            leftmost = 0;
        }
    }

    /*
     * Maintain a cache of leftmost tree entries (it is frequently
     * used):
     */
    if (leftmost)
        cfs_rq->rb_leftmost = &se->run_node;

    rb_link_node(&se->run_node, parent, link);
    rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
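The ordering key used above is vruntime; entity_before() is just a signed, wraparound-safe comparison:

static inline int entity_before(struct sched_entity *a, struct sched_entity *b)
{
    return (s64)(a->vruntime - b->vruntime) < 0;
}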
To summarize the migration path:
- migrate_task_to() does not necessarily run on either the source or the destination CPU, so it asks the source CPU to run a stop-class callback (the stopper thread) to do the actual move.
- By the time the stopper runs on the source CPU, the task being migrated has already been forced off the CPU: it is no longer rq->curr, though it normally stays queued (on_rq == TASK_ON_RQ_QUEUED) until the stopper dequeues it.
- The actual move is done with interrupts disabled from start to finish; they are re-enabled only after the migration completes.
- On the CFS side, migrating boils down to fixing up the task_group hierarchy, the vruntime, the load averages and the bandwidth (throttling) accounting, plus some rbtree surgery; the dequeue and enqueue paths are essentially mirror images of each other.
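For context (an assumption worth checking against your exact tree): the main in-tree caller of migrate_task_to() is NUMA balancing. task_numa_migrate() in kernel/sched/fair.c ends roughly like this (abridged sketch):

    /* no suitable task to swap with: simply move p to the best CPU found */
    if (env.best_task == NULL) {
        ret = migrate_task_to(p, env.best_cpu);
        return ret;
    }

    /* otherwise trade places with the chosen task on the remote node */
    ret = migrate_swap(p, env.best_task);
    put_task_struct(env.best_task);
    return ret;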