Let's start with the comment in the source:
/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using stop_one_cpu().
 * 2) stopper starts to run (implicitly forcing the migrated thread off the CPU;
 *    once the stopper thread runs, nothing else on that CPU does)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes it and puts it into the right queue.
 * 5) stopper completes and stop_one_cpu() returns and the migration is done.
 */
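Before the code, a word on the mechanism: stop_one_cpu() queues a callback for the per-CPU stopper thread ("migration/N", which runs in the highest-priority stop scheduling class) and blocks until that callback has finished on the chosen CPU. A minimal illustrative sketch of its use (hypothetical kernel-module code, not part of the scheduler; say_hello_on_cpu and demo are made-up names):

#include <linux/stop_machine.h>
#include <linux/smp.h>
#include <linux/printk.h>

/* runs on the chosen CPU with every other task there forced off */
static int say_hello_on_cpu(void *arg)
{
    pr_info("stopper on CPU%d, arg=%ld\n", smp_processor_id(), (long)arg);
    return 0;
}

static void demo(void)
{
    /* blocks until CPU 1 has executed say_hello_on_cpu() */
    stop_one_cpu(1, say_hello_on_cpu, (void *)42L);
}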
migrate_task_to
int migrate_task_to(struct task_struct *p, int target_cpu)
{
    struct migration_arg arg = { p, target_cpu };
    int curr_cpu = task_cpu(p); // "curr" is the CPU the task is on, not necessarily the CPU running this function

    if (curr_cpu == target_cpu) // already on the target CPU, nothing to migrate
        return 0;

    if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p))) // check CPU affinity: may the task run on the target CPU at all?
        return -EINVAL;

    /* TODO: This is not properly updating schedstats */

    trace_sched_move_numa(p, curr_cpu, target_cpu); // tracepoint
    return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg); // stop the task's current CPU and run migration_cpu_stop() there
}
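The migration_arg bundle handed to the stopper is tiny; in kernel/sched/core.c of this era it is simply:

struct migration_arg {
    struct task_struct *task;
    int dest_cpu;
};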
migration_cpu_stop
static int migration_cpu_stop(void *data)
{
    struct migration_arg *arg = data;
    struct task_struct *p = arg->task;
    struct rq *rq = this_rq();

    local_irq_disable();
    sched_ttwu_pending();

    raw_spin_lock(&p->pi_lock);
    raw_spin_lock(&rq->lock);
    if (task_rq(p) == rq) { // only act if the task is still on this CPU's rq; it may have moved in the meantime
        if (task_on_rq_queued(p)) // task is runnable and queued on the rq: migrate it right now
            rq = __migrate_task(rq, p, arg->dest_cpu);
        else // task is not queued (e.g. sleeping): just record the CPU it should wake up on
            p->wake_cpu = arg->dest_cpu;
    }
    raw_spin_unlock(&rq->lock);
    raw_spin_unlock(&p->pi_lock);

    local_irq_enable();
    return 0;
}
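The else branch only records where the task should go; the next wakeup is what actually uses it. In try_to_wake_up() of this era, CPU selection starts from p->wake_cpu, roughly like this (abridged sketch):

    cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
    if (task_cpu(p) != cpu) {
        wake_flags |= WF_MIGRATED;
        set_task_cpu(p, cpu);
    }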
__migrate_task
static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
{
    if (unlikely(!cpu_active(dest_cpu))) // destination CPU is not active (e.g. being hot-unplugged): give up
        return rq;

    /* Affinity changed (again). */
    if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) // re-check CPU affinity; it may have changed since migrate_task_to()
        return rq;

    rq = move_queued_task(rq, p, dest_cpu);
    return rq;
}
move_queued_task
static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
{
    lockdep_assert_held(&rq->lock);

    p->on_rq = TASK_ON_RQ_MIGRATING; // mark the task (not the CPU) as in flight between runqueues
    dequeue_task(rq, p, 0); // dispatches to the scheduling class's dequeue callback
    set_task_cpu(p, new_cpu); // record the new CPU in the task
    raw_spin_unlock(&rq->lock);

    rq = cpu_rq(new_cpu); // switch to the destination CPU's rq

    raw_spin_lock(&rq->lock);
    BUG_ON(task_cpu(p) != new_cpu);
    enqueue_task(rq, p, 0); // insert the task into the destination rq
    p->on_rq = TASK_ON_RQ_QUEUED; // back to the normal queued state
    check_preempt_curr(rq, p, 0);

    return rq;
}
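Note that p->on_rq is a small state machine rather than a plain boolean. The constants come from the scheduler's internal header (kernel/sched/sched.h in kernels of this era):

/* task_struct::on_rq states (0 means not on any runqueue) */
#define TASK_ON_RQ_QUEUED       1   /* runnable and queued on some rq */
#define TASK_ON_RQ_MIGRATING    2   /* temporarily detached while being moved between rqs */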
dequeue_task (fair-class implementation: dequeue_task_fair)
static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se;
    int task_sleep = flags & DEQUEUE_SLEEP;

    for_each_sched_entity(se) { // walks up the group-scheduling hierarchy; see the macro sketch after this function
        cfs_rq = cfs_rq_of(se);
        dequeue_entity(cfs_rq, se, flags); // remove this se from its cfs_rq

        /*
         * end evaluation on encountering a throttled cfs_rq
         *
         * note: in the case of encountering a throttled cfs_rq we will
         * post the final h_nr_running decrement below.
         */
        if (cfs_rq_throttled(cfs_rq)) // CFS bandwidth control
            break;
        cfs_rq->h_nr_running--;

        /* Don't dequeue parent if it has other entities besides us */
        if (cfs_rq->load.weight) {
            /* Avoid re-evaluating load for this entity: */
            se = parent_entity(se);
            /*
             * Bias pick_next to pick a task from this cfs_rq, as
             * p is sleeping when it is within its sched_slice.
             */
            if (task_sleep && se && !throttled_hierarchy(cfs_rq))
                set_next_buddy(se);
            break;
        }
        flags |= DEQUEUE_SLEEP;
    }

    for_each_sched_entity(se) { // without group scheduling se is NULL here, so this loop never runs
        cfs_rq = cfs_rq_of(se);
        cfs_rq->h_nr_running--;

        if (cfs_rq_throttled(cfs_rq))
            break;

        update_load_avg(se, 1);
        update_cfs_shares(cfs_rq);
    }

    if (!se)
        sub_nr_running(rq, 1);

    hrtick_update(rq);
}
enqueue_task (fair-class implementation: enqueue_task_fair)
static void
enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
    struct cfs_rq *cfs_rq;
    struct sched_entity *se = &p->se;

    /*
     * If in_iowait is set, the code below may not trigger any cpufreq
     * utilization updates, so do it here explicitly with the IOWAIT flag
     * passed.
     */
    if (p->in_iowait)
        cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);

    for_each_sched_entity(se) { // walks up the group-scheduling hierarchy, as in dequeue_task_fair()
        if (se->on_rq)
            break;
        cfs_rq = cfs_rq_of(se);
        enqueue_entity(cfs_rq, se, flags); // put this se on its cfs_rq

        /*
         * end evaluation on encountering a throttled cfs_rq
         *
         * note: in the case of encountering a throttled cfs_rq we will
         * post the final h_nr_running increment below.
         */
        if (cfs_rq_throttled(cfs_rq)) // CFS bandwidth control
            break;
        cfs_rq->h_nr_running++;

        flags = ENQUEUE_WAKEUP;
    }

    for_each_sched_entity(se) { // again only reached with group scheduling
        cfs_rq = cfs_rq_of(se);
        cfs_rq->h_nr_running++;

        if (cfs_rq_throttled(cfs_rq))
            break;

        update_load_avg(se, 1);
        update_cfs_shares(cfs_rq);
    }

    if (!se)
        add_nr_running(rq, 1);

    hrtick_update(rq);
}
dequeue_entity
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    /*
     * Update run-time statistics of the 'current'.
     */
    update_curr(cfs_rq);
    dequeue_entity_load_avg(cfs_rq, se); // PELT load-average accounting, consumed by load balancing

    update_stats_dequeue(cfs_rq, se, flags);

    clear_buddies(cfs_rq, se); // drop any next/last/skip buddy hints pointing at this se

    if (se != cfs_rq->curr) // the currently running entity is not kept in the rbtree, so only non-curr entities are erased
        __dequeue_entity(cfs_rq, se); // remove se from the rbtree
    se->on_rq = 0; // the entity is no longer on a runqueue
    account_entity_dequeue(cfs_rq, se); // subtract the entity's weight from the cfs_rq totals

    /*
     * Normalize after update_curr(); which will also have moved
     * min_vruntime if @se is the one holding it back. But before doing
     * update_min_vruntime() again, which will discount @se's position and
     * can move min_vruntime forward still more.
     */
    if (!(flags & DEQUEUE_SLEEP))
        se->vruntime -= cfs_rq->min_vruntime; // make vruntime relative so it can be re-based on the destination cfs_rq

    /* return excess runtime on last dequeue */
    return_cfs_rq_runtime(cfs_rq);

    update_cfs_shares(cfs_rq);

    /*
     * Now advance min_vruntime if @se was the entity holding it back,
     * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
     * put back on, and if we advance min_vruntime, we'll be placed back
     * further than we started -- ie. we'll be penalized.
     */
    if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
        update_min_vruntime(cfs_rq); // the entity is gone; min_vruntime of this cfs_rq may now advance
}
enqueue_entity
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
    bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
    bool curr = cfs_rq->curr == se;

    /*
     * If we're the current task, we must renormalise before calling
     * update_curr().
     */
    if (renorm && curr)
        se->vruntime += cfs_rq->min_vruntime; // re-base the relative vruntime onto this cfs_rq

    update_curr(cfs_rq); // update the current entity's runtime statistics

    /*
     * Otherwise, renormalise after, such that we're placed at the current
     * moment in time, instead of some random moment in the past. Being
     * placed in the past could significantly boost this task to the
     * fairness detriment of existing tasks.
     */
    if (renorm && !curr)
        se->vruntime += cfs_rq->min_vruntime;

    enqueue_entity_load_avg(cfs_rq, se); // PELT load-average accounting
    account_entity_enqueue(cfs_rq, se); // add the entity's weight to the cfs_rq totals
    update_cfs_shares(cfs_rq);

    if (flags & ENQUEUE_WAKEUP)
        place_entity(cfs_rq, se, 0); // give a freshly woken task a bounded vruntime credit (see the sketch after this function)

    check_schedstat_required();
    update_stats_enqueue(cfs_rq, se, flags);
    check_spread(cfs_rq, se);
    if (!curr) // the currently running entity stays out of the rbtree
        __enqueue_entity(cfs_rq, se); // insert se into the rbtree
    se->on_rq = 1; // the entity is on a runqueue again

    if (cfs_rq->nr_running == 1) {
        list_add_leaf_cfs_rq(cfs_rq);
        check_enqueue_throttle(cfs_rq);
    }
}
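The "compensation" mentioned at the place_entity() call is the sleeper credit: a woken task may restart at most about half a scheduling latency behind min_vruntime, and never ahead of where it already was. In kernels of this era the function looks roughly like this (lightly abridged sketch):

static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
    u64 vruntime = cfs_rq->min_vruntime;

    if (initial && sched_feat(START_DEBIT))     /* new tasks start one slice behind */
        vruntime += sched_vslice(cfs_rq, se);

    if (!initial) {                             /* sleepers: bounded credit */
        unsigned long thresh = sysctl_sched_latency;

        if (sched_feat(GENTLE_FAIR_SLEEPERS))   /* halve the credit by default */
            thresh >>= 1;

        vruntime -= thresh;
    }

    /* never let the entity gain time by being placed backwards */
    se->vruntime = max_vruntime(se->vruntime, vruntime);
}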
__dequeue_entity
static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    if (cfs_rq->rb_leftmost == &se->run_node) {
        struct rb_node *next_node;

        // keep the cached leftmost (smallest-vruntime) node up to date
        next_node = rb_next(&se->run_node);
        cfs_rq->rb_leftmost = next_node;
    }

    rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
__enqueue_entity
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
    struct rb_node *parent = NULL;
    struct sched_entity *entry;
    int leftmost = 1;

    /*
     * Find the right place in the rbtree:
     */
    while (*link) {
        parent = *link;
        entry = rb_entry(parent, struct sched_entity, run_node);
        /*
         * We don't care about collisions. Nodes with
         * the same key stay together.
         */
        if (entity_before(se, entry)) {
            link = &parent->rb_left;
        } else {
            link = &parent->rb_right;
            leftmost = 0;
        }
    }

    /*
     * Maintain a cache of leftmost tree entries (it is frequently
     * used):
     */
    if (leftmost)
        cfs_rq->rb_leftmost = &se->run_node;

    rb_link_node(&se->run_node, parent, link);
    rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}
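The ordering key used above is vruntime; entity_before() is just a signed, wraparound-safe comparison:

static inline int entity_before(struct sched_entity *a, struct sched_entity *b)
{
    return (s64)(a->vruntime - b->vruntime) < 0;
}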
To summarize the migration path:
- migrate_task_to() does not necessarily run on either the source or the destination CPU, so it asks the source CPU to run a stop-class callback (the stopper thread) to do the actual move.
- By the time the stopper runs on the source CPU, the task being migrated has already been forced off the CPU: it is no longer rq->curr, though it normally stays queued (on_rq == TASK_ON_RQ_QUEUED) until the stopper dequeues it.
- The actual move is done with interrupts disabled from start to finish; they are re-enabled only after the migration completes.
- On the CFS side, migrating boils down to fixing up the task_group hierarchy, the vruntime, the load averages and the bandwidth (throttling) accounting, plus some rbtree surgery; the dequeue and enqueue paths are essentially mirror images of each other.
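For context (an assumption worth checking against your exact tree): the main in-tree caller of migrate_task_to() is NUMA balancing. task_numa_migrate() in kernel/sched/fair.c ends roughly like this (abridged sketch):

    /* no suitable task to swap with: simply move p to the best CPU found */
    if (env.best_task == NULL) {
        ret = migrate_task_to(p, env.best_cpu);
        return ret;
    }

    /* otherwise trade places with the chosen task on the remote node */
    ret = migrate_swap(p, env.best_task);
    put_task_struct(env.best_task);
    return ret;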