限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。
本文基于 ARMv7 架构 + linux-4.14.132内核代码
进行分析。对涉及的 ATF(Arm Trusted Firmware) 以及 ARMv7 CPU HYP 模式
知识不做展开,读者可自行阅读相关资料进行了解。
ARMv7
架构下,SoC
的一般启动流程大概如下:
上电 --> SoC Boot ROM --> SPL --> U-BOOT --> Linux 内核
在进入 Linux 内核
之前,通常只启动了一个 BOOT CPU
(通常是 CPU 0
),而其它的 CPU 核处于待机状态。我们的分析,直接从 Linux 内核
入口开始,我们也不会讨论 Linux 内核
的解压过程。
从内核链接脚本 arch/arm/kernel/vmlinux.lds.S
的如下片段:
/* include/asm-generic/vmlinux.lds.h */
/* Section used for early init (in .S files) */
#define HEAD_TEXT *(.head.text)
/* arch/arm/kernel/vmlinux.lds.S */
...
OUTPUT_ARCH(arm)
ENTRY(stext) /* 内核入口 */
...
SECTIONS
{
...
. = PAGE_OFFSET + TEXT_OFFSET;
.head.text : {
_text = .;
HEAD_TEXT
}
...
}
可以了解到,内核的入口在 arch/arm/kernel/head.S
中:
/* include/linux/init.h */
/* For assembly routines */
#define __HEAD .section ".head.text","ax"
/* arch/arm/kernel/head.S */
/*
* Kernel startup entry point.
* ---------------------------
*
* This is normally called from the decompressor code. The requirements
* are: MMU = off, D-cache = off, I-cache = dont care, r0 = 0,
* r1 = machine nr, r2 = atags or dtb pointer.
*
* This code is mostly position independent, so if you link the kernel at
* 0xc0008000, you call this at __pa(0xc0008000).
*
* See linux/arch/arm/tools/mach-types for the complete list of machine
* numbers for r1.
*
* We're trying to keep crap to a minimum; DO NOT add any machine specific
* crap here - that's what the boot loader (or in extreme, well justified
* circumstances, zImage) is for.
*/
.arm
__HEAD
ENTRY(stext)
ARM_BE8(setend be ) @ ensure we are in BE8 mode
...
#ifdef CONFIG_ARM_VIRT_EXT
bl __hyp_stub_install
#endif
@ ensure svc mode and all interrupts masked
safe_svcmode_maskall r9
mrc p15, 0, r9, c0, c0 @ get processor id
bl __lookup_processor_type @ r5=procinfo r9=cpuid
movs r10, r5 @ invalid processor (r5=0)?
...
#ifdef CONFIG_ARM_LPAE
mrc p15, 0, r3, c0, c1, 4 @ read ID_MMFR0
and r3, r3, #0xf @ extract VMSA support
cmp r3, #5 @ long-descriptor translation table format?
...
#endif
ldr r8, =PLAT_PHYS_OFFSET @ always constant in this case
/*
* r1 = machine no, r2 = atags or dtb,
* r8 = phys_offset, r9 = cpuid, r10 = procinfo
*/
bl __vet_atags
#ifdef CONFIG_SMP_ON_UP
bl __fixup_smp
#endif
#ifdef CONFIG_ARM_PATCH_PHYS_VIRT
bl __fixup_pv_table
#endif
bl __create_page_tables /* 建立内核页表 */
ldr r13, =__mmap_switched @ address to jump to after
@ mmu has been enabled
badr lr, 1f @ return (PIC) address
#ifdef CONFIG_ARM_LPAE
mov r5, #0 @ high TTBR0
mov r8, r4, lsr #12 @ TTBR1 is swapper_pg_dir pfn
#else
mov r8, r4 @ set TTBR1 to swapper_pg_dir
#endif
ldr r12, [r10, #PROCINFO_INITFUNC]
add r12, r12, r10
ret r12
/*
* The following fragment of code is executed with the MMU on in MMU mode,
* and uses absolute addresses; this is not position independent.
*
* r0 = cp#15 control register
* r1 = machine ID
* r2 = atags/dtb pointer
* r9 = processor ID
*/
__INIT
__mmap_switched:
adr r3, __mmap_switched_data
ldmia r3!, {r4, r5, r6, r7}
cmp r4, r5 @ Copy data segment if needed
1: cmpne r5, r6
ldrne fp, [r4], #4
strne fp, [r5], #4
bne 1b
mov fp, #0 @ Clear BSS (and zero fp)
1: cmp r6, r7
strcc fp, [r6],#4
bcc 1b
ARM( ldmia r3, {r4, r5, r6, r7, sp})
THUMB( ldmia r3, {r4, r5, r6, r7} )
THUMB( ldr sp, [r3, #16] )
str r9, [r4] @ Save processor ID
str r1, [r5] @ Save machine type
str r2, [r6] @ Save atags pointer
cmp r7, #0
strne r0, [r7] @ Save control register values
b start_kernel /* start_kernel() */
ENDPROC(__mmap_switched)
内核流程从汇编代码进入了 C 入口 start_kernel()
:
/* init/main.c */
asmlinkage __visible void __init start_kernel(void)
{
...
pr_notice("%s", linux_banner);
setup_arch(&command_line);
...
sched_init();
...
/* Do the rest non-__init'ed, we're now alive */
rest_init();
}
在 setup_arch()
中,会解析 CPU 的 DTS 配置,并完成 PSCI(Power State Coordination Interface)
初始化。先看 CPU 相关的 DTS 配置:
/ {
cpus {
#address-cells = <1>;
#size-cells = <0>;
cpu0: cpu@0 {
compatible = "arm,cortex-a7";
device_type = "cpu";
reg = <0>;
clocks = <&ccu CLK_CPUX>;
clock-latency = <244144>; /* 8 32k periods */
clock-frequency = <1200000000>;
};
cpu@1 {
compatible = "arm,cortex-a7";
device_type = "cpu";
reg = <1>;
clock-frequency = <1200000000>;
};
cpu@2 {
compatible = "arm,cortex-a7";
device_type = "cpu";
reg = <2>;
clock-frequency = <1200000000>;
};
cpu@3 {
compatible = "arm,cortex-a7";
device_type = "cpu";
reg = <3>;
clock-frequency = <1200000000>;
};
};
...
};
/* arch/arm/kernel/psci_smp.c */
const struct smp_operations psci_smp_ops __initconst = {
.smp_boot_secondary = psci_boot_secondary,
#ifdef CONFIG_HOTPLUG_CPU
.cpu_disable = psci_cpu_disable,
.cpu_die = psci_cpu_die,
.cpu_kill = psci_cpu_kill,
#endif
};
/* arch/arm/kernel/setup.c */
void __init setup_arch(char **cmdline_p)
{
...
/* 解析 "cpus" DTS 配置 */
arm_dt_init_cpu_maps();
/* ARM PSCI(Power State Coordination Interface) 初始化 */
psci_dt_init();
#ifdef CONFIG_SMP
if (is_smp()) {
if (!mdesc->smp_init || !mdesc->smp_init()) {
if (psci_smp_available()) /* 如果 PSCI(Power State Coordination Interface) 可用 */
smp_set_ops(&psci_smp_ops); /* 使用 PSCI 的 smp_operations */
else if (mdesc->smp)
...
}
smp_init_cpus();
smp_build_mpidr_hash();
}
#endif
...
}
ARM PSCI
用来管理 CPU 的启动、关闭、休眠、重启等工作。我们先看 PSCI
相关的 DTS 配置:
/{
cpus {
...
};
...
/* PSCI 配置 */
psci {
compatible = "arm,psci-1.0";
method = "smc";
};
};
有时候,PSCI
DTS 配置可能是由 U-BOOT
动态插入的,所以你无法在内核的 DTS 中找到它。继续看 PSCI
的初始化:
/* drivers/firmware/psci.c */
static const struct of_device_id psci_of_match[] __initconst = {
{ .compatible = "arm,psci", .data = psci_0_1_init},
{ .compatible = "arm,psci-0.2", .data = psci_0_2_init},
{ .compatible = "arm,psci-1.0", .data = psci_0_2_init},
{},
};
int __init psci_dt_init(void)
{
struct device_node *np;
np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np);
/* 没有配置 "psci" 节点 */
if (!np || !of_device_is_available(np))
return -ENODEV;
init_fn = (psci_initcall_t)matched_np->data;
return init_fn(np); /* 由匹配到的 compatible 决定:psci_0_1_init() 或 psci_0_2_init(),下面以 psci_0_1_init() 为例 */
}
static int __init psci_0_1_init(struct device_node *np)
{
int err;
err = get_set_conduit_method(np); /* 设置 发起 PSCI 功能接口 请求方式 (SMC) */
...
pr_info("Using PSCI v0.1 Function IDs from DT\n");
if (!of_property_read_u32(np, "cpu_suspend", &id)) {
psci_function_id[PSCI_FN_CPU_SUSPEND] = id;
psci_ops.cpu_suspend = psci_cpu_suspend;
}
if (!of_property_read_u32(np, "cpu_off", &id)) {
psci_function_id[PSCI_FN_CPU_OFF] = id;
psci_ops.cpu_off = psci_cpu_off;
}
if (!of_property_read_u32(np, "cpu_on", &id)) {
psci_function_id[PSCI_FN_CPU_ON] = id;
psci_ops.cpu_on = psci_cpu_on;
}
if (!of_property_read_u32(np, "migrate", &id)) {
psci_function_id[PSCI_FN_MIGRATE] = id;
psci_ops.migrate = psci_migrate;
}
...
return err;
}
static int get_set_conduit_method(struct device_node *np)
{
const char *method;
pr_info("probing for conduit method from DT.\n");
if (of_property_read_string(np, "method", &method)) {
pr_warn("missing \"method\" property\n");
return -ENXIO;
}
if (!strcmp("hvc", method)) {
set_conduit(PSCI_CONDUIT_HVC);
} else if (!strcmp("smc", method)) { /* 我们的 DTS 配置通过 SMC 指令发起 PSCI 功能请求 */
set_conduit(PSCI_CONDUIT_SMC);
} else {
pr_warn("invalid \"method\" property: %s\n", method);
return -EINVAL;
}
return 0;
}
static void set_conduit(enum psci_conduit conduit)
{
switch (conduit) {
...
case PSCI_CONDUIT_SMC:
invoke_psci_fn = __invoke_psci_fn_smc;
break;
...
}
psci_ops.conduit = conduit;
}
接下来看每 CPU idle
进程的初始化工作:
/* kernel/sched/core.c */
void __init sched_init(void)
{
...
/*
* Make us the idle thread. Technically, schedule() should not be
* called from this thread, however somewhere below it might be,
* but because we are the idle thread, we just pick up running again
* when this runqueue becomes "idle".
*/
/* 初始化当前 CPU 的 idle 进程 */
init_idle(current, smp_processor_id());
...
}
/* 初始化 @cpu 的 idle 进程 */
void init_idle(struct task_struct *idle, int cpu)
{
struct rq *rq = cpu_rq(cpu); /* @cpu 的运行队列 */
...
__sched_fork(0, idle);
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
idle->flags |= PF_IDLE;
#ifdef CONFIG_SMP
set_cpus_allowed_common(idle, cpumask_of(cpu)); /* 限定 idle 进程到 @cpu 上运行 */
#endif
...
__set_task_cpu(idle, cpu);
...
rq->curr = rq->idle = idle; /* 设置 @cpu 运行队列当前进程为 idle */
idle->on_rq = TASK_ON_RQ_QUEUED;
#ifdef CONFIG_SMP
idle->on_cpu = 1;
#endif
...
init_idle_preempt_count(idle, cpu); /* 开启 @cpu 的抢占 */
idle->sched_class = &idle_sched_class;
...
#ifdef CONFIG_SMP
/* 设置 idle 进程名为 "swapper/%d" */
sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
#endif
}
BOOT CPU
的启动过程接近尾声了,接下来就是其它 非 BOOT CPU
启动的前期准备工作:
/* init/main.c */
static noinline void __ref rest_init(void)
{
/* 做内核剩余初始化工作的内核线程:其它非 BOOT CPU 将从 kernel_init() 拉起 */
pid = kernel_thread(kernel_init, NULL, CLONE_FS);
...
/* init 进程(上面创建的 kernel_init 线程)的 CPU 亲和性设置:暂时限定到 BOOT CPU 上运行 */
rcu_read_lock();
tsk = find_task_by_pid_ns(pid, &init_pid_ns);
set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
rcu_read_unlock();
/* 创建并唤醒【用于创建内核线程的内核线程 kthreadd 】 */
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
...
system_state = SYSTEM_SCHEDULING;
complete(&kthreadd_done);
schedule_preempt_disabled();
cpu_startup_entry(CPUHP_ONLINE); /* BOOT CPU 进入其 idle 进程 */
}
/* kernel/sched/idle.c */
/* 在 BOOT CPU 上启动其 idle 进程 */
void cpu_startup_entry(enum cpuhp_state state)
{
...
while (1)
do_idle();
}
到此,BOOT CPU
已经启动完毕,进入了其 idle
进程。
前面我们看到,从 BOOT CPU
启动了一个入口为 kernel_init()
的内核线程,它负责完成内核中剩余的初始化工作,其中就包括 非 BOOT CPU
的启动工作。我们来看 非 BOOT CPU
启动的细节。
kernel_init()
kernel_init_freeable()
smp_init() /* 启动其它 非 BOOT CPU */
/* kernel/smp.c */
/* Called by boot processor to activate the rest. */
void __init smp_init(void)
{
idle_threads_init(); /* 为系统中的所有 CPU 创建每 CPU 的 idle 线程数据(task_struct等) 并初始化 */
cpuhp_threads_init(); /* 注册每 CPU 的 热插拔管理 内核线程:此时只为已在线的 BOOT CPU 创建线程 */
pr_info("Bringing up secondary CPUs ...\n");
/* 启动所有非 BOOT CPU ,逐个按顺序启动 */
for_each_present_cpu(cpu) {
if (num_online_cpus() >= setup_max_cpus)
break;
if (!cpu_online(cpu)) /* CPU 尚未启动 */
cpu_up(cpu); /* 启动 CPU @cpu */
}
...
}
用 cpu_up()
启动一个 CPU:
/* kernel/cpu.c */
int cpu_up(unsigned int cpu)
{
return do_cpu_up(cpu, CPUHP_ONLINE);
}
static int do_cpu_up(unsigned int cpu, enum cpuhp_state target)
{
int err = 0;
...
err = _cpu_up(cpu, 0, target);
...
return err;
}
static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target)
{
int ret = 0;
struct task_struct *idle;
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
...
if (st->state == CPUHP_OFFLINE) { /* 如果 @cpu 处于离线关闭状态 */
idle = idle_thread_get(cpu);
...
}
cpuhp_set_state(st, target); /* 标记 @cpu 为目标状态 @target: st->target = CPUHP_ONLINE */
target = min((int)target, CPUHP_BRINGUP_CPU);
ret = cpuhp_up_callbacks(cpu, st, target);
...
return ret;
}
/* 设置 CPU 目标状态,返回 CPU 的当前状态 */
static inline enum cpuhp_state
cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
st->rollback = false;
st->last = NULL;
st->target = target; /* 设置 CPU 目标状态为 @target */
st->single = false;
/*
* 如果 CPU 的 【目标状态 @target > 当前状态 @st->state】,设为 true ,表示是 CPU 启动正向过程;
* 如果 CPU 的 【目标状态 @target <= 当前状态 @st->state】,设为 false ,表示是 CPU 关闭反向过程。
*/
st->bringup = st->state < target;
return prev_state; /* 返回 CPU 的当前状态 */
}
static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
enum cpuhp_state target)
{
enum cpuhp_state prev_state = st->state;
int ret = 0;
/* 逐个调用状态区间 (当前状态, CPUHP_BRINGUP_CPU] 内所有热插拔状态的回调(先递增状态,再调用回调) */
while (st->state < target) {
st->state++;
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
...
}
return ret;
}
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
bool bringup, struct hlist_node *node,
struct hlist_node **lastp)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
struct cpuhp_step *step = cpuhp_get_step(state); /* 获取 @state 的回调接口 */
if (!step->multi_instance) {
...
/* 我们只关心和分析相关的状态回调 bringup_cpu() */
ret = cb(cpu); /* ..., bringup_cpu() */
...
return ret;
}
}
/* Boot processor state steps */
/*
* 我们只关注状态 CPUHP_BRINGUP_CPU 的回调,其它的状态回调对我们
* 的分析没有本质影响。
*/
static struct cpuhp_step cpuhp_bp_states[] = {
...
#ifdef CONFIG_SMP
...
/* Kicks the plugged cpu into life */
[CPUHP_BRINGUP_CPU] = {
.name = "cpu:bringup",
.startup.single = bringup_cpu,
.teardown.single = NULL,
.cant_stop = true,
},
...
#else
...
#endif
};
static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
{
struct cpuhp_step *sp;
sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states;
return sp + state;
}
static bool cpuhp_is_ap_state(enum cpuhp_state state)
{
/*
* The extra check for CPUHP_TEARDOWN_CPU is only for documentation
* purposes as that state is handled explicitly in cpu_down.
*/
return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU;
}
看 CPU 热插拔状态 CPUHP_BRINGUP_CPU
回调 bringup_cpu()
:
/* kernel/cpu.c */
static int bringup_cpu(unsigned int cpu)
{
struct task_struct *idle = idle_thread_get(cpu);
int ret;
...
/* Arch-specific enabling code. */
ret = __cpu_up(cpu, idle); /* 进入 CPU 启动架构相关的流程 */
...
/*
* 非 BOOT CPU 启动的最后阶段,CPU 进入 CPUHP_AP_ONLINE_IDLE 状态时,此处的等待被唤醒:
* secondary_start_kernel()
* cpu_startup_entry(CPUHP_AP_ONLINE_IDLE)
* cpuhp_online_idle(state)
* st->state = CPUHP_AP_ONLINE_IDLE;
* complete_ap_thread(st, true);
* while (1)
* do_idle();
*/
return bringup_wait_for_ap(cpu); /* 等待 CPU 启动完成(进入 CPUHP_AP_ONLINE_IDLE 态) */
}
CPU 启动 ARM 架构相关的流程:
/* arch/arm/kernel/smp.c */
int __cpu_up(unsigned int cpu, struct task_struct *idle)
{
/* 配置 @cpu 的内核栈空间 */
secondary_data.stack = task_stack_page(idle) + THREAD_START_SP;
...
#ifdef CONFIG_MMU
/* 配置 @cpu 的页表 */
secondary_data.pgdir = virt_to_phys(idmap_pgd);
secondary_data.swapper_pg_dir = get_arch_pgd(swapper_pg_dir);
#endif
sync_cache_w(&secondary_data); /* cache 同步 */
ret = smp_ops.smp_boot_secondary(cpu, idle); /* psci_boot_secondary() */
if (ret == 0) {
/*
* CPU was successfully started, wait for it
* to come online or time out.
*/
/* 等待 CPU 成功启动:
* secondary_start_kernel() -> complete(&cpu_running)
*/
wait_for_completion_timeout(&cpu_running,
msecs_to_jiffies(1000));
if (!cpu_online(cpu)) { /* CPU 应该已经处于在线状态 */
pr_crit("CPU%u: failed to come online\n", cpu);
ret = -EIO;
}
} else {
...
}
/* secondary_data 数据是所有非 BOOT CPU 共享的,每个 CPU 启动时都要重新设置 */
memset(&secondary_data, 0, sizeof(secondary_data));
return ret;
}
从这里开始,通过 PSCI
接口 psci_boot_secondary()
来启动 CPU :
/* arch/arm/kernel/psci_smp.c */
static int psci_boot_secondary(unsigned int cpu, struct task_struct *idle)
{
if (psci_ops.cpu_on) /* psci_cpu_on() */
return psci_ops.cpu_on(cpu_logical_map(cpu),
virt_to_idmap(&secondary_startup));
return -ENODEV;
}
/* drivers/firmware/psci.c */
static int psci_cpu_on(unsigned long cpuid, unsigned long entry_point)
{
int err;
u32 fn; /* PSCI 请求功能号 */
fn = psci_function_id[PSCI_FN_CPU_ON]; /* 启用 CPU 的 PSCI 功能号 */
// 中转调用请求给实现了 PSCI 接口的固件 ATF 。
// ATF 处理 @fn 请求时,会让目标 CPU @cpuid 从 @entry_point
// (即 secondary_startup)开始执行;发起请求的 CPU 则从 SMC
// 指令返回此处继续执行。
// 通过 SMC 指令发起 PSCI 功能请求。
err = invoke_psci_fn(fn, cpuid, entry_point, 0); /* __invoke_psci_fn_smc() */
return psci_to_linux_errno(err);
}
进入 ATF 固件中的 PSCI 启动 CPU 功能接口
后,该接口会让被启动的 CPU 从 secondary_startup
开始执行:
/* arch/arm/kernel/head.S */
ENTRY(secondary_startup)
#ifdef CONFIG_ARM_VIRT_EXT
bl __hyp_stub_install_secondary
#endif
safe_svcmode_maskall r9
mrc p15, 0, r9, c0, c0 @ get processor id
bl __lookup_processor_type
movs r10, r5 @ invalid processor?
moveq r0, #'p' @ yes, error 'p'
beq __error_p
/*
* Use the page tables supplied from __cpu_up.
*/
adr r4, __secondary_data
ldmia r4, {r5, r7, r12} @ address to jump to after
sub lr, r4, r5 @ mmu has been enabled
add r3, r7, lr
ldrd r4, [r3, #0] @ get secondary_data.pgdir
ARM_BE8(eor r4, r4, r5) @ Swap r5 and r4 in BE:
ARM_BE8(eor r5, r4, r5) @ it can be done in 3 steps
ARM_BE8(eor r4, r4, r5) @ without using a temp reg.
ldr r8, [r3, #8] @ get secondary_data.swapper_pg_dir
badr lr, __enable_mmu @ return address
mov r13, r12 @ __secondary_switched address
ldr r12, [r10, #PROCINFO_INITFUNC]
add r12, r12, r10 @ initialise processor
@ (return control reg)
ret r12 /* 跳转到处理器初始化函数(procinfo 的 initfunc),之后经 lr 返回到 __enable_mmu,再经 r13 进入 __secondary_switched,最终到达 secondary_start_kernel() */
ENDPROC(secondary_startup)
/*
* r6 = &secondary_data
*/
ENTRY(__secondary_switched)
ldr sp, [r7, #12] @ get secondary_data.stack
mov fp, #0
b secondary_start_kernel
ENDPROC(__secondary_switched)
...
/*
* r6 = &secondary_data
*/
ENTRY(__secondary_switched)
ldr sp, [r7, #12] @ get secondary_data.stack
mov fp, #0
b secondary_start_kernel
ENDPROC(__secondary_switched)
.align
.type __secondary_data, %object
__secondary_data:
.long .
.long secondary_data
.long __secondary_switched
/* arch/arm/kernel/smp.c */
asmlinkage void secondary_start_kernel(void)
{
struct mm_struct *mm = &init_mm;
unsigned int cpu;
secondary_biglittle_init();
cpu_switch_mm(mm->pgd, mm);
local_flush_bp_all();
enter_lazy_tlb(mm, current);
local_flush_tlb_all();
cpu = smp_processor_id();
mmgrab(mm);
current->active_mm = mm;
cpumask_set_cpu(cpu, mm_cpumask(mm));
cpu_init();
preempt_disable(); /* 禁用当前CPU抢占 */
/*
* CPU 热插拔上线前的所有准备工作:
* 触发所有状态 CPUHP_AP_ONLINE 之前的 cpu 热插拔回调
* (CPUHP_BRINGUP_CPU + 1 -> CPUHP_AP_ONLINE)
*/
notify_cpu_starting(cpu);
...
set_cpu_online(cpu, true);
// __cpu_up() -> wait_for_completion_timeout(&cpu_running, ...)
complete(&cpu_running);
/*
* OK, it's off to the idle thread for us
*/
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); /* 进入当前 CPU 的idle 进程 */
}
/* kernel/cpu.c */
void notify_cpu_starting(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE);
int ret;
st->booted_once = true;
/* 完成 @cpu 进入 CPUHP_AP_ONLINE 状态前的所有热插拔状态工作:
* 依次触发状态区间 [CPUHP_BRINGUP_CPU + 1, CPUHP_AP_ONLINE] 内
* 各状态的回调。
*/
while (st->state < target) {
st->state++;
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); /* 如 gic_starting_cpu(), ... */
/*
* STARTING must not fail!
*/
WARN_ON_ONCE(ret);
}
}
CPU 已启动完毕,然后最终进入 idle 状态:
void cpu_startup_entry(enum cpuhp_state state)
{
...
cpuhp_online_idle(state);
while (1)
do_idle();
}
void cpuhp_online_idle(enum cpuhp_state state)
{
...
st->state = CPUHP_AP_ONLINE_IDLE;
// bringup_cpu() -> bringup_wait_for_ap(cpu)
complete_ap_thread(st, true);
}
我们可以看到,系统 BOOT 阶段,非 BOOT CPU 是逐个、按严格的先后顺序启动的:只有前一 CPU 进入 idle 循环后,后一个 CPU 的启动工作,才会开始。
在不支持或没有实现 PSCI
固件功能的 ARMv7
架构平台上,各 CPU 的启动流程稍有不同,下面我们以全志 sun8i SoC
为例,来说明 CPU 的启动流程。
start_kernel()
...
setup_arch(&command_line)
...
arm_dt_init_cpu_maps()
#ifdef CONFIG_SMP
if (is_smp()) {
if (!mdesc->smp_init || !mdesc->smp_init()) {
if (psci_smp_available())
...
else if (mdesc->smp)
smp_set_ops(mdesc->smp); /* sun8i_smp_ops */
}
}
...
#endif
...
sched_init()
...
rest_init()
pid = kernel_thread(kernel_init, NULL, CLONE_FS);
...
cpu_startup_entry(CPUHP_ONLINE);
kernel_init()
kernel_init_freeable()
smp_init()
cpu_up(cpu)
do_cpu_up(cpu, CPUHP_ONLINE)
_cpu_up(cpu, 0, target)
...
bringup_cpu()
__cpu_up(cpu, idle)
sun8i_smp_boot_secondary()
static int sun8i_smp_boot_secondary(unsigned int cpu,
struct task_struct *idle)
{
u32 reg;
if (!(prcm_membase && cpucfg_membase))
return -EFAULT;
spin_lock(&cpu_lock);
/* Set CPU boot address */
/* 设置非 BOOT CPU 的启动地址为 secondary_startup */
writel(__pa_symbol(secondary_startup),
cpucfg_membase + CPUCFG_PRIVATE0_REG);
/* Assert the CPU core in reset */
writel(0, cpucfg_membase + CPUCFG_CPU_RST_CTRL_REG(cpu));
/* Assert the L1 cache in reset */
reg = readl(cpucfg_membase + CPUCFG_GEN_CTRL_REG);
writel(reg & ~BIT(cpu), cpucfg_membase + CPUCFG_GEN_CTRL_REG);
/* Clear CPU power-off gating */
reg = readl(prcm_membase + PRCM_CPU_PWROFF_REG);
writel(reg & ~BIT(cpu), prcm_membase + PRCM_CPU_PWROFF_REG);
mdelay(1);
/* Deassert the CPU core reset */
writel(3, cpucfg_membase + CPUCFG_CPU_RST_CTRL_REG(cpu));
spin_unlock(&cpu_lock);
return 0;
}
看看 sun8i_smp_boot_secondary()
的逻辑:先把 非 BOOT CPU
的启动地址设置为 secondary_startup
,再依次复位 CPU 核、清除其掉电门控、解除复位,使该 CPU 从 secondary_startup
开始执行,后面的流程就和 PSCI 一样了:
secondary_startup
secondary_start_kernel()
...
notify_cpu_starting(cpu)
...
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE)
start_kernel()
rest_init()
pid = kernel_thread(kernel_init, NULL, CLONE_FS)
kernel_init()
kernel_init_freeable()
smp_init() /* 启动所有 CPU */
...
do_basic_setup()
...
do_initcalls()
for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++)
do_initcall_level(level)
for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)
do_one_initcall(*fn)
ret = fn() /* topology_init() */
/* include/linux/cpu.h */
struct cpu { /* CPU 设备抽象*/
int node_id; /* The node which contains the CPU */
/* CPU 是否可以热插拔: BOOT CPU 不允许热插拔 */
int hotpluggable; /* creates sysfs control file if hotpluggable */
struct device dev;
};
/* arch/arm/include/asm/cpu.h */
struct cpuinfo_arm { /* ARM CPU 设备抽象*/
struct cpu cpu;
u32 cpuid;
#ifdef CONFIG_SMP
unsigned int loops_per_jiffy;
#endif
}
/* arch/arm/kernel/setup.c */
DEFINE_PER_CPU(struct cpuinfo_arm, cpu_data); /* ARM 平台每 CPU 的信息数据 */
static int __init topology_init(void)
{
int cpu;
for_each_possible_cpu(cpu) {
struct cpuinfo_arm *cpuinfo = &per_cpu(cpu_data, cpu);
cpuinfo->cpu.hotpluggable = platform_can_hotplug_cpu(cpu);
register_cpu(&cpuinfo->cpu, cpu);
}
return 0;
}
/* drivers/base/cpu.c */
static DEFINE_PER_CPU(struct device *, cpu_sys_devices);
struct bus_type cpu_subsys = {
.name = "cpu",
.dev_name = "cpu",
.match = cpu_subsys_match,
#ifdef CONFIG_HOTPLUG_CPU
/* 用来处理 CPU 热插拔。热插拔细节在章节 4 展开 */
.online = cpu_subsys_online,
.offline = cpu_subsys_offline,
#endif
};
int register_cpu(struct cpu *cpu, int num)
{
int error;
cpu->node_id = cpu_to_node(num);
memset(&cpu->dev, 0x00, sizeof(struct device));
cpu->dev.id = num; /* Linux CPU 编号 */
cpu->dev.bus = &cpu_subsys;
...
error = device_register(&cpu->dev); /* 注册 CPU 设备到 driver core */
...
per_cpu(cpu_sys_devices, num) = &cpu->dev;
register_cpu_under_node(num, cpu_to_node(num));
...
return 0;
}
每个 CPU 都有一个热插拔处理线程,前面的流程中,我们没有仔细分析它们,现在来看一下:
kernel_init()
kernel_init_freeable()
smp_init()
...
cpuhp_threads_init(); /* 注册每 CPU 的 热插拔管理 内核线程:此时只为已在线的 BOOT CPU 创建线程 */
...
for_each_present_cpu(cpu) {
...
if (!cpu_online(cpu)) /* CPU 尚未启动 */
cpu_up(cpu); /* 启动 CPU @cpu */
}
/* kernel/cpu.c */
/* 每 CPU 的热插拔[状态、内核线程等]数据 */
static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = {
.fail = CPUHP_INVALID,
};
static struct smp_hotplug_thread cpuhp_threads = {
/*
* store 指向每cpu的热插拔管理数据 cpuhp_state 的 thread:
* smpboot_register_percpu_thread(&cpuhp_threads) 调用过程中,
* 会设定到创建的热插拔线程对应的 task_struct
*/
.store = &cpuhp_state.thread,
.create = &cpuhp_create,
.thread_should_run = cpuhp_should_run,
.thread_fn = cpuhp_thread_fun,
.thread_comm = "cpuhp/%u",
.selfparking = true,
};
void __init cpuhp_threads_init(void)
{
BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads)); /* 注册每 CPU 的热插拔处理内核线程:此时只为在线的 CPU(即 BOOT CPU)创建线程 */
kthread_unpark(this_cpu_read(cpuhp_state.thread)); /* 启动当前 CPU 的热插拔处理内核线程 */
}
/* kernel/smpboot.c */
int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
const struct cpumask *cpumask)
{
unsigned int cpu;
int ret = 0;
if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
return -ENOMEM;
cpumask_copy(plug_thread->cpumask, cpumask); /* 设置所有要创建热插拔管理线程的 CPU 掩码 */
for_each_online_cpu(cpu) { /* 为当前在线的 CPU 创建内核线程:当前只有 BOOT CPU 在线 */
/* 创建 @cpu 的热插拔管理内核线程:创建但不启动它 */
ret = __smpboot_create_thread(plug_thread, cpu);
...
if (cpumask_test_cpu(cpu, cpumask)) /* 如果 @cpu 在 @cpumask 中 */
smpboot_unpark_thread(plug_thread, cpu); /* 则启动 @cpu 的内核线程 */
}
list_add(&plug_thread->list, &hotplug_threads);
...
return ret;
}
因为当前只有 BOOT CPU
在线,所以只为 BOOT CPU
创建了1个热插拔内核线程。
在 非 BOOT CPU
启动过程中,在进入 CPUHP_BRINGUP_CPU
状态拉起 CPU 之前,会经过 CPUHP_CREATE_THREADS
状态 ,此时会触发回调 smpboot_create_threads()
,建立当前启动 CPU 的热插拔管理线程:
/* kernel/smpboot.c */
int smpboot_create_threads(unsigned int cpu)
{
struct smp_hotplug_thread *cur;
int ret = 0;
mutex_lock(&smpboot_threads_lock);
list_for_each_entry(cur, &hotplug_threads, list) {
/* 创建 @cpu 的各内核线程(包括 @cpu 的热插拔管理内核线程) */
ret = __smpboot_create_thread(cur, cpu);
if (ret)
break;
}
mutex_unlock(&smpboot_threads_lock);
return ret;
}
系统启动后,我们可以查看到各 CPU 热插拔管理内核线程:
root@qemu-ubuntu:~# ps -ef | grep cpuhp | grep -v grep
root 13 2 0 03:00 ? 00:00:00 [cpuhp/0]
root 14 2 0 03:00 ? 00:00:00 [cpuhp/1]
root 20 2 0 03:00 ? 00:00:00 [cpuhp/2]
root 26 2 0 03:00 ? 00:00:00 [cpuhp/3]
本小节给出 CPU 热插拔过程的概述,由于涉及的细节太多,限于篇幅,将不做深入展开。
以一条 shell 命令发起 CPU offline 过程:
# echo 0 > /sys/devices/system/cpu/cpuN/online
这将触发接口 cpu_subsys_offline()
:
device_offline()
dev->bus->offline(dev) = cpu_subsys_offline(dev)
cpu_down(dev->id)
do_cpu_down(cpu, CPUHP_OFFLINE)
cpu_down_maps_locked(cpu, target)
_cpu_down(cpu, 0, target)
BOOT CPU 是不支持 offline 的,我们查看 BOOT CPU 的 sysfs 接口:
root@qemu-ubuntu:~# ls -l /sys/devices/system/cpu/cpu0
total 0
-rw-r--r-- 1 root root 4096 Mar 26 06:04 cpu_capacity
-r-------- 1 root root 4096 Mar 26 06:04 crash_notes
-r-------- 1 root root 4096 Mar 26 06:04 crash_notes_size
drwxr-xr-x 2 root root 0 Mar 26 06:04 hotplug
lrwxrwxrwx 1 root root 0 Mar 26 06:04 of_node -> ../../../../firmware/devicetree/base/cpus/cpu@0
drwxr-xr-x 2 root root 0 Mar 26 06:04 power
lrwxrwxrwx 1 root root 0 Mar 26 06:04 subsystem -> ../../../../bus/cpu
drwxr-xr-x 2 root root 0 Mar 26 06:04 topology
-rw-r--r-- 1 root root 4096 Mar 26 06:04 uevent
我们看到,cpu0
没有 online
属性导出,自然也就不支持 offline
和 online
操作。
以一条 shell 命令发起 CPU online 过程:
# echo 1 > /sys/devices/system/cpu/cpuN/online
这将触发接口 cpu_subsys_online()
:
device_online()
dev->bus->online(dev) = cpu_subsys_online(dev)
cpu_up(cpuid)
do_cpu_up(cpu, CPUHP_ONLINE)
从上两小节的描述,我们看不出 CPU 热插拔(offline/online
)和其热插拔管理线程有什么关系,我们在这里以 offline
过程为例,展开其细节:
device_offline()
...
do_cpu_down(cpu, CPUHP_OFFLINE)
...
_cpu_down(cpu, 0, target)
static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
enum cpuhp_state target)
{
...
if (st->state > CPUHP_TEARDOWN_CPU) {
st->target = max((int)target, CPUHP_TEARDOWN_CPU);
/* 唤醒 CPU 热插拔管理线程,处理 CPU offline 过程前期部分 */
ret = cpuhp_kick_ap_work(cpu);
...
if (st->state > CPUHP_TEARDOWN_CPU)
goto out;
st->target = target;
}
/* 处理剩余的 CPU offline 过程: 调用状态区间 [..., CPUHP_TEARDOWN_CPU] 各回调 */
ret = cpuhp_down_callbacks(cpu, st, target);
...
out:
...
return ret;
}
随着 CPU 的热插拔管理线程被唤醒,将处理 CPU offline
过程前期部分:
/* kernel/smpboot.c */
static int smpboot_thread_fn(void *data)
{
struct smpboot_thread_data *td = data;
struct smp_hotplug_thread *ht = td->ht;
while (1) {
...
if (!ht->thread_should_run(td->cpu)) {
...
} else {
__set_current_state(TASK_RUNNING);
preempt_enable();
ht->thread_fn(td->cpu); /* cpuhp_thread_fun() */
}
}
}
/* kernel/cpu.c */
static void cpuhp_thread_fun(unsigned int cpu)
{
...
if (st->single) {
...
} else {
if (bringup) {
...
} else {
state = st->state;
st->state--; /* 更新状态 */
st->should_run = (st->state > st->target);
}
}
...
/* CPU offline 状态回调:每次调用 1 个状态回调,直到达到目标状态 @st->target 为止 */
st->result = cpuhp_invoke_callback(cpu, state, bringup, st->node, &st->last);
...
}
CPU online
过程中,唤醒热插拔管理线程的过程类似,在此不再赘述,感兴趣的童鞋可自行阅读代码分析。
《DEN0013D_cortex_a_series_PG.pdf》
《learn_the_architecture_-_trustzone_for_aarch64_102418_0101_01_en.pdf》
https://www.kernel.org/doc/Documentation/arm/Booting
https://lwn.net/Articles/557132/
https://github.com/ARM-software/arm-trusted-firmware