The overall framework of Linux process scheduling (schedule) is shown in the figure above.
The code analysis in this article is based on Linux kernel 4.4.22; the best way to learn is still to "RTFSC".
rq is the run queue (runnable queue), i.e., the queue of all runnable tasks on this CPU. Each CPU has exactly one rq of each class (cfs/rt); an rq contains many runnable tasks, but only one of them is the currently running task (current running task).
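For reference, here is a heavily trimmed sketch of struct rq from 4.4 (kernel/sched/sched.h); most fields and the config guards are omitted, only the members relevant to this discussion are kept:
struct rq {
    raw_spinlock_t lock;          /* protects this runqueue */
    unsigned int nr_running;      /* number of runnable tasks on this rq */

    struct cfs_rq cfs;            /* the per-cpu CFS runqueue */
    struct rt_rq rt;              /* the per-cpu RT runqueue */
    struct dl_rq dl;              /* the per-cpu deadline runqueue */

    struct task_struct *curr;     /* the one task currently running on this cpu */
    struct task_struct *idle;     /* the idle task of this cpu */

    u64 clock;                    /* rq clock, updated by update_rq_clock() */
    int cpu;                      /* which cpu this rq belongs to */
    /* ... many more fields ... */
};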
Since rq is the center of everything, the following points are the key paths:
We will answer these questions one by one below. Once you understand these key paths, you will have a clear picture of the Linux process-scheduling framework.
A task is pushed into the rq (enqueue) only when it is newly created or when it is woken up (wakeup) from the blocked state. The scheduling-related steps are:
1. Enqueue the task into the rq and set task->state to TASK_RUNNING;
2. Evaluate the rq load after the new task has been enqueued and decide whether the current task needs to be scheduled out; if so, set the TIF_NEED_RESCHED bit in the current task's thread_info->flags.
The key point here: even when the condition for rescheduling the current process holds, only the TIF_NEED_RESCHED flag is set; schedule() is not called immediately. The real scheduling happens on return from an interrupt/exception: the kernel checks whether TIF_NEED_RESCHED is set for the current process and, if so, calls schedule().
Why is the scheduling implied by a wakeup not executed immediately? Why only set TIF_NEED_RESCHED and wait until the interrupt/exception return?
My understanding: (1) wakeups are often performed in interrupt context, where calling schedule() directly is not allowed; (2) it preserves the tradition inherited from the non-preemptive kernel of not interrupting a process's logic unless it gives up the CPU voluntarily; (3) in ordinary process context, calling schedule() right after the wakeup is perfectly possible, and some special functions do exactly that (the callers of smp_send_reschedule() and resched_curr()).
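To illustrate the flag-setting side, this is roughly what resched_curr() looks like in 4.4 (kernel/sched/core.c), slightly trimmed (the lockdep assertion and the idle-polling tracepoint are dropped):
void resched_curr(struct rq *rq)
{
    struct task_struct *curr = rq->curr;
    int cpu = cpu_of(rq);

    if (test_tsk_need_resched(curr))    /* already marked, nothing to do */
        return;

    if (cpu == smp_processor_id()) {
        /* rq belongs to the local cpu: only set TIF_NEED_RESCHED,
         * the actual schedule() runs later on interrupt/exception return */
        set_tsk_need_resched(curr);
        set_preempt_need_resched();
        return;
    }

    /* rq belongs to another cpu: set the flag there and kick that cpu
     * with an IPI so its interrupt-return path sees TIF_NEED_RESCHED */
    if (set_nr_and_not_polling(curr))
        smp_send_reschedule(cpu);
}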
When the current process calls a system function and enters a blocked state, the task leaves the rq (dequeue). The concrete steps are:
1. The current process sets task->state to TASK_INTERRUPTIBLE/TASK_UNINTERRUPTIBLE;
2. It immediately calls schedule() to reschedule;
This is the biggest difference between block on one side and wakeup/scheduler_tick on the other: block calls schedule() right away, whereas wakeup and scheduler_tick only set the TIF_NEED_RESCHED flag and wait for the interrupt/exception return to perform the real schedule() operation.
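A minimal sketch of the caller side of the block path (wq and condition are hypothetical; real 4.4 code usually goes through the wait_event*() helpers, which wrap the same two steps):
static DECLARE_WAIT_QUEUE_HEAD(wq);     /* hypothetical wait queue */
static bool condition;                  /* hypothetical wakeup condition */

static void wait_for_condition(void)
{
    DEFINE_WAIT(wait);

    for (;;) {
        /* step 1: queue ourselves and set task->state = TASK_UNINTERRUPTIBLE */
        prepare_to_wait(&wq, &wait, TASK_UNINTERRUPTIBLE);
        if (condition)
            break;
        /* step 2: call schedule() immediately; because state is no longer
         * TASK_RUNNING, __schedule() dequeues (deactivates) this task */
        schedule();
    }
    finish_wait(&wq, &wait);            /* back to TASK_RUNNING, off the wait queue */
}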
As said above, the rq load is evaluated at enqueue/dequeue time to decide which runnable task becomes the current running task. Besides enqueue/dequeue, the system also evaluates the rq load periodically and reschedules, so that all the processes sharing one CPU get serviced. The specific steps are as follows:
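Roughly, in 4.4 the periodic path is: the timer interrupt calls scheduler_tick(), which passes the tick to the current task's scheduling class; for CFS this ends up in check_preempt_tick(), which calls resched_curr() once the current task has used up its timeslice. A trimmed excerpt (clock and load-balance bookkeeping omitted):
void scheduler_tick(void)
{
    int cpu = smp_processor_id();
    struct rq *rq = cpu_rq(cpu);
    struct task_struct *curr = rq->curr;

    raw_spin_lock(&rq->lock);
    update_rq_clock(rq);
    /* per-class tick: for CFS this is task_tick_fair() -> entity_tick()
     * -> check_preempt_tick() -> resched_curr(), i.e. here too only
     * TIF_NEED_RESCHED is set, schedule() is not called */
    curr->sched_class->task_tick(rq, curr, 0);
    update_cpu_load_active(rq);
    raw_spin_unlock(&rq->lock);

    /* ... perf events, idle balance, etc. omitted ... */
}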
An important point from the previous sections: after a wakeup or a scheduler_tick, if rescheduling is needed only TIF_NEED_RESCHED is set, and the real schedule() operation is performed when returning from an interrupt/exception.
So on which interrupt/exception return paths is schedule() actually executed?
Let's analyze "arch/arm64/kernel/entry.S". On the ARMv8 architecture, user space runs at EL0 and kernel space runs at EL1.
.align 6
el1_sync:
kernel_entry 1
mov x0, sp
get_thread_info x20 // top of stack
ldr w4, [x20, #TI_CPU_EXCP]
add w4, w4, #0x1
str w4, [x20, #TI_CPU_EXCP]
cmp w4, #0x1
b.ne el1_sync_nest
str x0, [x20, #TI_REGS_ON_EXCP]
el1_sync_nest:
mrs x1, esr_el1 // read the syndrome register
lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class
cmp x24, #ESR_ELx_EC_DABT_CUR // data abort in EL1
b.ne el1_sync_nest_skip_dec
sub w4, w4, #0x1
str w4, [x20, #TI_CPU_EXCP]
el1_sync_nest_skip_dec:
cmp w4, #0x2
b.lt el1_sync_nest_skip
bl aee_stop_nested_panic
el1_sync_nest_skip:
mrs x1, esr_el1 // read the syndrome register
lsr x24, x1, #ESR_ELx_EC_SHIFT // exception class
cmp x24, #ESR_ELx_EC_DABT_CUR // data abort in EL1
b.eq el1_da
cmp x24, #ESR_ELx_EC_IABT_CUR // instruction abort in EL1
b.eq el1_ia
cmp x24, #ESR_ELx_EC_SYS64 // configurable trap
b.eq el1_undef
cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception
b.eq el1_sp_pc
cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception
b.eq el1_sp_pc
cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL1
b.eq el1_undef
cmp x24, #ESR_ELx_EC_BREAKPT_CUR // debug exception in EL1
b.ge el1_dbg
b el1_inv
el1_ia:
/*
* Fall through to the Data abort case
*/
el1_da:
/*
* Data abort handling
*/
mrs x0, far_el1
enable_dbg
// re-enable interrupts if they were enabled in the aborted context
tbnz x23, #7, 1f // PSR_I_BIT
enable_irq
1:
mov x2, sp // struct pt_regs
bl do_mem_abort
cmp x24, #ESR_ELx_EC_DABT_CUR // data abort in EL1
b.eq el1_da_nest_skip_dec
mov x5, sp
get_thread_info x20 // top of stack
ldr w4, [x20, #TI_CPU_EXCP]
sub w4, w4, #0x1
str w4, [x20, #TI_CPU_EXCP]
el1_da_nest_skip_dec:
// disable interrupts before pulling preserved data off the stack
disable_irq
kernel_exit 1
el1_sp_pc:
/*
* Stack or PC alignment exception handling
*/
mrs x0, far_el1
enable_dbg
mov x2, sp
b do_sp_pc_abort
el1_undef:
/*
* Undefined instruction
*/
enable_dbg
mov x0, sp
bl do_undefinstr
el1_dbg:
/*
* Debug exception handling
*/
cmp x24, #ESR_ELx_EC_BRK64 // if BRK64
cinc x24, x24, eq // set bit '0'
tbz x24, #0, el1_inv // EL1 only
mrs x0, far_el1
mov x2, sp // struct pt_regs
bl do_debug_exception
mov x5, sp
get_thread_info x20 // top of stack
ldr w4, [x20, #TI_CPU_EXCP]
sub w4, w4, #0x1
str w4, [x20, #TI_CPU_EXCP]
kernel_exit 1
el1_inv:
// TODO: add support for undefined instructions in kernel mode
enable_dbg
mov x0, sp
mov x2, x1
mov x1, #BAD_SYNC
b bad_mode
ENDPROC(el1_sync)
Most kernel-mode synchronous exceptions are unrecoverable: the kernel eventually calls panic() and resets, so it never comes back to check the TIF_NEED_RESCHED flag. The remaining paths that can return simply call kernel_exit, without checking TIF_NEED_RESCHED either.
.align 6
el1_irq:
kernel_entry 1
enable_dbg
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_off
#endif
irq_handler
#ifdef CONFIG_PREEMPT
ldr w24, [tsk, #TI_PREEMPT] // get preempt count
cbnz w24, 1f // preempt count != 0
// (1) if preempt count > 0, preemption is disabled; return directly
ldr x0, [tsk, #TI_FLAGS] // get flags
tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling?
bl el1_preempt // (2) if preempt count == 0 and TIF_NEED_RESCHED is set,
// go on to call el1_preempt() -> preempt_schedule_irq() -> __schedule()
1:
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_on
#endif
kernel_exit 1
ENDPROC(el1_irq)
↓
#ifdef CONFIG_PREEMPT
el1_preempt:
mov x24, lr
1: bl preempt_schedule_irq // irq en/disable is done inside
ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS
tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling?
ret x24
#endif
↓
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
    enum ctx_state prev_state;

    /* Catch callers which need to be fixed */
    BUG_ON(preempt_count() || !irqs_disabled());

    prev_state = exception_enter();

    do {
        preempt_disable();
        local_irq_enable();
        __schedule(true);
        local_irq_disable();
        sched_preempt_enable_no_resched();
    } while (need_resched());

    exception_exit(prev_state);
}
So on return from a kernel-mode interrupt: the kernel first checks the current process's thread_info->preempt_count; if it is greater than 0, preemption is disabled and nothing is done (it just returns); if it is 0 and TIF_NEED_RESCHED is set in thread_info->flags, preempt_schedule_irq() is called to reschedule.
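A minimal sketch of how preempt_count gates this path (my_lock is hypothetical): any region between preempt_disable() and preempt_enable(), including spinlock-held regions, raises preempt_count, so an interrupt landing there sees a non-zero count and skips el1_preempt on return:
static DEFINE_SPINLOCK(my_lock);        /* hypothetical lock */

static void critical_section(void)
{
    spin_lock(&my_lock);    /* implies preempt_disable(): preempt_count++ */

    /*
     * If an interrupt fires here and sets TIF_NEED_RESCHED, el1_irq
     * reads TI_PREEMPT != 0 and returns without calling el1_preempt.
     */

    spin_unlock(&my_lock);  /* implies preempt_enable(): count drops back to 0;
                             * with CONFIG_PREEMPT, a pending TIF_NEED_RESCHED
                             * is honoured right here via preempt_schedule() */
}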
.align 6
el0_sync:
kernel_entry 0
mrs x25, esr_el1 // read the syndrome register
lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class
cmp x24, #ESR_ELx_EC_SVC64 // SVC in 64-bit state
b.eq el0_svc // (1) system-call class of exception
cmp x24, #ESR_ELx_EC_DABT_LOW // data abort in EL0
b.eq el0_da
cmp x24, #ESR_ELx_EC_IABT_LOW // instruction abort in EL0
b.eq el0_ia
cmp x24, #ESR_ELx_EC_FP_ASIMD // FP/ASIMD access
b.eq el0_fpsimd_acc
cmp x24, #ESR_ELx_EC_FP_EXC64 // FP/ASIMD exception
b.eq el0_fpsimd_exc
cmp x24, #ESR_ELx_EC_SYS64 // configurable trap
b.eq el0_undef
cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception
b.eq el0_sp_pc
cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception
b.eq el0_sp_pc
cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL0
b.eq el0_undef
cmp x24, #ESR_ELx_EC_BREAKPT_LOW // debug exception in EL0
b.ge el0_dbg
b el0_inv
↓
.align 6
el0_svc:
adrp stbl, sys_call_table // load syscall table pointer
uxtw scno, w8 // syscall number in w8
mov sc_nr, #__NR_syscalls
el0_svc_naked: // compat entry point
stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number
enable_dbg_and_irq
ct_user_exit 1
ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks
tst x16, #_TIF_SYSCALL_WORK
b.ne __sys_trace
cmp scno, sc_nr // check upper syscall limit
b.hs ni_sys
ldr x16, [stbl, scno, lsl #3] // address in the syscall table
blr x16 // call sys_* routine
// (1.1) execution of the system call
b ret_fast_syscall // (1.2) return from the system-call exception
ni_sys:
mov x0, sp
bl do_ni_syscall
b ret_fast_syscall
ENDPROC(el0_svc)
↓
/*
* This is the fast syscall return path. We do as little as possible here,
* and this includes saving x0 back into the kernel stack.
*/
ret_fast_syscall:
disable_irq // disable interrupts
str x0, [sp, #S_X0] // returned x0
ldr x1, [tsk, #TI_FLAGS] // re-check for syscall tracing
and x2, x1, #_TIF_SYSCALL_WORK // (1.2.1) check whether _TIF_SYSCALL_WORK is set in thread_info->flags
// _TIF_WORK_MASK = (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE)
// _TIF_NEED_RESCHED: the current process needs rescheduling
// _TIF_SIGPENDING: the current process has pending signals to handle
cbnz x2, ret_fast_syscall_trace
and x2, x1, #_TIF_WORK_MASK
cbnz x2, work_pending // (1.2.2) if there is work to handle, branch to work_pending
enable_step_tsk x1, x2
kernel_exit 0
ret_fast_syscall_trace:
enable_irq // enable interrupts
b __sys_trace_return_skipped // we already saved x0
/*
* Ok, we need to do extra processing, enter the slow path.
*/
work_pending:
tbnz x1, #TIF_NEED_RESCHED, work_resched
/* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
mov x0, sp // 'regs'
enable_irq // enable interrupts for do_notify_resume()
bl do_notify_resume // (1.2.2.1) if there is signal/resume work to handle,
// call do_notify_resume()
b ret_to_user
work_resched:
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_off // the IRQs are off here, inform the tracing code
#endif
bl schedule // (1.2.2.2) if TIF_NEED_RESCHED is set, call schedule() to reschedule
/*
* "slow" syscall return path.
*/
ret_to_user:
disable_irq // disable interrupts
ldr x1, [tsk, #TI_FLAGS]
and x2, x1, #_TIF_WORK_MASK
cbnz x2, work_pending
enable_step_tsk x1, x2
kernel_exit 0
ENDPROC(ret_to_user)
One big class of user-mode exceptions is the system call, where user space deliberately executes the svc instruction to trap into kernel mode and run the system call.
When returning to user mode, the kernel checks whether the TIF_NEED_RESCHED bit in thread_info->flags is set and, if so, calls schedule(); it also checks _TIF_SIGPENDING and, if set, handles the pending signals via do_signal().
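For reference, the work_pending -> do_notify_resume() side is plain C; in 4.4 (arch/arm64/kernel/signal.c) it looks roughly like this. Note that TIF_NEED_RESCHED is not handled here, because work_pending branches to schedule() before ever reaching do_notify_resume(), as the assembly above shows:
asmlinkage void do_notify_resume(struct pt_regs *regs,
                                 unsigned int thread_flags)
{
    if (thread_flags & _TIF_SIGPENDING)         /* deliver pending signals */
        do_signal(regs);

    if (thread_flags & _TIF_NOTIFY_RESUME) {
        clear_thread_flag(TIF_NOTIFY_RESUME);
        tracehook_notify_resume(regs);
    }

    if (thread_flags & _TIF_FOREIGN_FPSTATE)    /* reload FP/SIMD state */
        fpsimd_restore_current_state();
}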
.align 6
el0_sync:
kernel_entry 0
mrs x25, esr_el1 // read the syndrome register
lsr x24, x25, #ESR_ELx_EC_SHIFT // exception class
cmp x24, #ESR_ELx_EC_SVC64 // SVC in 64-bit state
b.eq el0_svc
cmp x24, #ESR_ELx_EC_DABT_LOW // data abort in EL0
b.eq el0_da // (1) the other types of exceptions
cmp x24, #ESR_ELx_EC_IABT_LOW // instruction abort in EL0
b.eq el0_ia
cmp x24, #ESR_ELx_EC_FP_ASIMD // FP/ASIMD access
b.eq el0_fpsimd_acc
cmp x24, #ESR_ELx_EC_FP_EXC64 // FP/ASIMD exception
b.eq el0_fpsimd_exc
cmp x24, #ESR_ELx_EC_SYS64 // configurable trap
b.eq el0_undef
cmp x24, #ESR_ELx_EC_SP_ALIGN // stack alignment exception
b.eq el0_sp_pc
cmp x24, #ESR_ELx_EC_PC_ALIGN // pc alignment exception
b.eq el0_sp_pc
cmp x24, #ESR_ELx_EC_UNKNOWN // unknown exception in EL0
b.eq el0_undef
cmp x24, #ESR_ELx_EC_BREAKPT_LOW // debug exception in EL0
b.ge el0_dbg
b el0_inv
↓
el0_da:
/*
* Data abort handling
*/
mrs x26, far_el1
// enable interrupts before calling the main handler
enable_dbg_and_irq
ct_user_exit
bic x0, x26, #(0xff << 56)
mov x1, x25
mov x2, sp
bl do_mem_abort // (1.1) call the exception handler
b ret_to_user // (1.2) when done, return via ret_to_user
el0_ia:
/*
* Instruction abort handling
*/
mrs x26, far_el1
// enable interrupts before calling the main handler
enable_dbg_and_irq
ct_user_exit
mov x0, x26
mov x1, x25
mov x2, sp
bl do_mem_abort
b ret_to_user
↓
/*
* Ok, we need to do extra processing, enter the slow path.
*/
work_pending:
tbnz x1, #TIF_NEED_RESCHED, work_resched
/* TIF_SIGPENDING, TIF_NOTIFY_RESUME or TIF_FOREIGN_FPSTATE case */
mov x0, sp // 'regs'
enable_irq // enable interrupts for do_notify_resume()
bl do_notify_resume // (1.2.2.1) if there is signal/resume work to handle,
// call do_notify_resume()
b ret_to_user
work_resched:
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_off // the IRQs are off here, inform the tracing code
#endif
bl schedule // (1.2.2.2) if TIF_NEED_RESCHED is set, call schedule() to reschedule
/*
* "slow" syscall return path.
*/
ret_to_user:
disable_irq // disable interrupts
ldr x1, [tsk, #TI_FLAGS]
and x2, x1, #_TIF_WORK_MASK
cbnz x2, work_pending // (1.2.2) if there is work to handle, branch to work_pending
enable_step_tsk x1, x2
kernel_exit 0
ENDPROC(ret_to_user)
Apart from system calls, the remaining user-mode exceptions are fault-type exceptions, such as data abort, instruction abort and other errors.
When returning to user mode, the kernel checks whether the TIF_NEED_RESCHED bit in thread_info->flags is set and, if so, calls schedule(); it also checks _TIF_SIGPENDING and, if set, handles the pending signals via do_signal().
.align 6
el0_irq:
kernel_entry 0
el0_irq_naked:
enable_dbg
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_off
#endif
ct_user_exit
irq_handler // (1) call the irq handler
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_on
#endif
b ret_to_user // (2) finally it also returns via ret_to_user,
// which checks TIF_NEED_RESCHED and _TIF_SIGPENDING
ENDPROC(el0_irq)
User-mode interrupt handling is just like the other user-mode exceptions: it finally returns to user space via ret_to_user.
When returning to user mode, the kernel checks whether the TIF_NEED_RESCHED bit in thread_info->flags is set and, if so, calls schedule(); it also checks _TIF_SIGPENDING and, if set, handles the pending signals via do_signal().
From the analysis in the previous section, there are five classes of interrupt/exception return paths in total:
We can see that whether kernel preemption is enabled only affects one of them: the return from a kernel-mode interrupt.
Earlier kernels had a PREEMPT_ACTIVE flag, which existed to avoid the problem of being preempted in code like the following:
for (;;) {
1:  prepare_to_wait(&wq, &__wait, TASK_UNINTERRUPTIBLE);
2:  if (condition)
3:      break;          // if preemption happens here
4:  schedule();
}
finish_wait(&wq, &__wait);
Assume the following scenario:
To avoid the error above, earlier kernels introduced the PREEMPT_ACTIVE flag: when preemption occurs, PREEMPT_ACTIVE is set first and schedule() is called afterwards; when schedule() sees PREEMPT_ACTIVE it skips the dequeue/deactivate operation.
asmlinkage void __sched preempt_schedule_irq(void)
{
    add_preempt_count(PREEMPT_ACTIVE);  // (1) set the PREEMPT_ACTIVE flag before the preemptive schedule
    local_irq_enable();
    schedule();                         // (2) call schedule() to do the actual scheduling
    local_irq_disable();
    sub_preempt_count(PREEMPT_ACTIVE);
}
↓
asmlinkage void __sched schedule(void)
{
    /* (2.1) If the process state is not TASK_RUNNING && the PREEMPT_ACTIVE flag is not set,
       the following code deactivates (dequeues) such a process */
    if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
        switch_count = &prev->nvcsw;
        if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
                     unlikely(signal_pending(prev))))
            prev->state = TASK_RUNNING;
        else {
            if (prev->state == TASK_UNINTERRUPTIBLE)
                rq->nr_uninterruptible++;
            deactivate_task(prev, rq);
        }
    }
}
In the current 4.4 kernel the PREEMPT_ACTIVE flag has been removed and replaced by a function parameter, __schedule(bool preempt):
asmlinkage __visible void __sched preempt_schedule_irq(void)
{
    do {
        preempt_disable();
        local_irq_enable();
        __schedule(true);               // (1) call __schedule() with preempt = true
        local_irq_disable();
        sched_preempt_enable_no_resched();
    } while (need_resched());
}
↓
static void __sched notrace __schedule(bool preempt)
{
    // (1.1) the preempt parameter replaces the old PREEMPT_ACTIVE flag
    if (!preempt && prev->state) {
        if (unlikely(signal_pending_state(prev->state, prev))) {
            prev->state = TASK_RUNNING;
        } else {
            deactivate_task(rq, prev, DEQUEUE_SLEEP);
            prev->on_rq = 0;

            /*
             * If a worker went to sleep, notify and ask workqueue
             * whether it wants to wake up a task to maintain
             * concurrency.
             */
            if (prev->flags & PF_WQ_WORKER) {
                struct task_struct *to_wakeup;

                to_wakeup = wq_worker_sleeping(prev, cpu);
                if (to_wakeup)
                    try_to_wake_up_local(to_wakeup);
            }
        }
        switch_count = &prev->nvcsw;
    }
}
The sections above walked through the key points of scheduling, so to understand scheduling you can start from the following functions: