The previous article covered the assembly part of Zircon kernel startup, which is mostly CPU initialization.
Now the prime CPU has entered the C world at lk_main(),
while the other CPUs have arrived at arm64_secondary_entry().
lk_main() is the gateway into the Zircon kernel world, knocked open by the prime CPU. At a glance you can see that every step in lk_main has a very clear purpose:
Hardware initialization follows the order CPU architecture -> target platform -> target device
Early thread initialization
Calling the global constructors
Early CPU architecture initialization
Early platform initialization
Early target-device initialization (currently only a hook)
VM pre-heap initialization
Kernel heap initialization
Virtual memory initialization
Kernel initialization
Second-stage initialization (performed by the newly created bootstrap2 thread)
bootstrap2 becomes the prime CPU's idle thread, and Zircon initialization is complete
Each time the prime CPU finishes a stage of initialization, it calls lk_primary_cpu_init_level() to advance the init level, which also tells the other CPUs how far initialization has progressed.
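Each of these stages corresponds to an LK init level, and subsystems register hooks against a level; lk_primary_cpu_init_level() runs every hook registered within the given range. As a minimal, hedged sketch (assuming the LK_INIT_HOOK macro and hook signature from Zircon's lk/init.h; the hook name and body here are hypothetical):

#include <lk/init.h>

// Hypothetical hook: runs once on the primary CPU when initialization
// reaches LK_INIT_LEVEL_PLATFORM_EARLY.
static void example_hook(uint level) {
    // one-time setup appropriate for this level
}

LK_INIT_HOOK(example_hook, example_hook, LK_INIT_LEVEL_PLATFORM_EARLY)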
The lk_main code:
// Kernel initialization
// called from arch code
void lk_main() {
// serial prints to console based on compile time switch
dlog_bypass_init_early();
// get us into some sort of thread context
// Initialize the thread list *
thread_init_early();
// deal with any static constructors
// Call the global constructors
// A global constructor is a function marked with __attribute__((constructor)); the compiler emits a pointer to it in the .init_array section
call_constructors();
// early arch stuff
lk_primary_cpu_init_level(LK_INIT_LEVEL_EARLIEST, LK_INIT_LEVEL_ARCH_EARLY - 1);
// Early CPU architecture initialization *
arch_early_init();
// do any super early platform initialization
lk_primary_cpu_init_level(LK_INIT_LEVEL_ARCH_EARLY, LK_INIT_LEVEL_PLATFORM_EARLY - 1);
// Early platform initialization *
platform_early_init();
// do any super early target initialization
lk_primary_cpu_init_level(LK_INIT_LEVEL_PLATFORM_EARLY, LK_INIT_LEVEL_TARGET_EARLY - 1);
// Target-device initialization *
// Not actually implemented yet
target_early_init();
dprintf(INFO, "\nwelcome to Zircon\n\n");
dprintf(INFO, "KASLR: .text section at %p\n", __code_start);
lk_primary_cpu_init_level(LK_INIT_LEVEL_TARGET_EARLY, LK_INIT_LEVEL_VM_PREHEAP - 1);
dprintf(SPEW, "initializing vm pre-heap\n");
// VM initialization before the kernel heap exists *
// Mainly sets up the page tables and virtual memory
vm_init_preheap();
// bring up the kernel heap
lk_primary_cpu_init_level(LK_INIT_LEVEL_VM_PREHEAP, LK_INIT_LEVEL_HEAP - 1);
dprintf(SPEW, "initializing heap\n");
// Kernel heap initialization *
heap_init();
lk_primary_cpu_init_level(LK_INIT_LEVEL_HEAP, LK_INIT_LEVEL_VM - 1);
dprintf(SPEW, "initializing vm\n");
// Virtual memory initialization *
vm_init();
// initialize the kernel
lk_primary_cpu_init_level(LK_INIT_LEVEL_VM, LK_INIT_LEVEL_KERNEL - 1);
dprintf(SPEW, "initializing kernel\n");
// Kernel initialization *
kernel_init();
lk_primary_cpu_init_level(LK_INIT_LEVEL_KERNEL, LK_INIT_LEVEL_THREADING - 1);
// create a thread to complete system initialization
dprintf(SPEW, "creating bootstrap completion thread\n");
// Create the bootstrap2 thread
// bootstrap2 completes the remaining initialization work
// Although this work nominally runs in the bootstrap2 thread, task scheduling has not been enabled yet,
// so execution effectively stays on this CPU and the code below runs sequentially
thread_t* t = thread_create("bootstrap2", &bootstrap2, NULL, DEFAULT_PRIORITY);
thread_set_cpu_affinity(t, cpu_num_to_mask(0));
thread_detach(t);
thread_resume(t);
// become the idle thread and enable interrupts to start the scheduler
// All work here is done; this thread simply becomes the prime CPU's idle thread
thread_become_idle();
}
thread_init_early():
The key part is initializing the per-priority run queues:
sched_init_early():
The data structure is similar to what Linux uses
void sched_init_early() {
// initialize the run queues
// one set of queues per CPU
for (unsigned int cpu = 0; cpu < SMP_MAX_CPUS; cpu++)
for (unsigned int i = 0; i < NUM_PRIORITIES; i++) {
// one linked list per priority level
list_initialize(&percpu[cpu].run_queue[i]);
}
}
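For context, here is a hypothetical sketch of how a ready thread would later be appended to one of these per-CPU, per-priority queues. The percpu and run_queue names come from the code above; the thread field names and the bitmap update are assumptions for illustration, not the actual scheduler code:

// Hypothetical sketch only: enqueue a ready thread at its effective priority.
static void enqueue_thread_sketch(thread_t* t, cpu_num_t cpu) {
    list_add_tail(&percpu[cpu].run_queue[t->effec_priority], &t->queue_node);
    // A bitmap of non-empty queues lets the scheduler find the highest
    // runnable priority quickly.
    percpu[cpu].run_queue_bitmap |= (1u << t->effec_priority);
}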
extern void (*const __init_array_start[])();
extern void (*const __init_array_end[])();
static void call_constructors() {
for (void (*const* a)() = __init_array_start; a != __init_array_end; a++)
(*a)();
}
A function marked with __attribute__((constructor)) is a global constructor; the compiler emits a pointer to it in the .init_array section,
and __init_array_start and __init_array_end mark the beginning and end of that section.
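For example, a function declared like the following (the name is hypothetical) ends up as an entry in .init_array and is therefore invoked by call_constructors():

// Hypothetical example: the compiler places a pointer to this function in
// .init_array, so call_constructors() calls it during lk_main().
__attribute__((constructor)) static void example_ctor() {
    // one-time setup that must happen before lk_main() continues
}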
Every CPU core passes through arm64_cpu_early_init(); the non-prime CPUs reach it via arm64_secondary_entry().
This is mainly initialization related to CPU features.
// Initialize each CPU core
static void arm64_cpu_early_init() {
// Make sure the per cpu pointer is set up.
// Set up the per-CPU pointer
arm64_init_percpu_early();
// Set the EL1 exception vector table
// Set the vector base.
ARM64_WRITE_SYSREG(VBAR_EL1, (uint64_t)&arm64_el1_exception_base);
// The System Control Register (SCTLR) controls standard memory and system facilities and provides status information for functions implemented in the core.
// https://www.jianshu.com/p/885913b7201c
// Set some control bits in sctlr.
uint64_t sctlr = ARM64_READ_SYSREG(sctlr_el1);
sctlr |= SCTLR_EL1_UCI | SCTLR_EL1_UCT | SCTLR_EL1_DZE | SCTLR_EL1_SA0 | SCTLR_EL1_SA;
sctlr &= ~SCTLR_EL1_AC; // Disable alignment checking for EL1, EL0.
ARM64_WRITE_SYSREG(sctlr_el1, sctlr);
// Save all of the features of the cpu.
// Collect the features this CPU supports *
arm64_feature_init();
// Enable the cycle counter; reading PMCCNTR_EL0 then tells you how many cycles this CPU has executed.
// Enable cycle counter.
ARM64_WRITE_SYSREG(pmcr_el0, PMCR_EL0_ENABLE_BIT | PMCR_EL0_LONG_COUNTER_BIT);
ARM64_WRITE_SYSREG(pmcntenset_el0, PMCNTENSET_EL0_ENABLE);
// Allow user space to read the cycle counter
// Enable user space access to cycle counter.
ARM64_WRITE_SYSREG(pmuserenr_el0, PMUSERENR_EL0_ENABLE);
// Enable Debug Exceptions by Disabling the OS Lock. The OSLAR_EL1 is a WO
// register with only the low bit defined as OSLK. Write 0 to disable.
ARM64_WRITE_SYSREG(oslar_el1, 0x0);
// Enable user space access to virtual counter (CNTVCT_EL0).
// Allow user space to read the virtual counter
ARM64_WRITE_SYSREG(cntkctl_el1, CNTKCTL_EL1_ENABLE_VIRTUAL_COUNTER);
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.100048_0002_05_en/jfa1406793259505.html
// Enable local kernel debugging:
// monitor debug events and enable breakpoint/watchpoint/vector-catch debugging
ARM64_WRITE_SYSREG(mdscr_el1, MSDCR_EL1_INITIAL_VALUE);
// Enable FIQs (fast interrupts)
arch_enable_fiqs();
}
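The ARM64_READ_SYSREG / ARM64_WRITE_SYSREG macros used above wrap the mrs/msr instructions. Roughly, they expand to something like the following sketch (the exact Zircon definitions may differ, hence the _SKETCH names):

// Rough sketch of the system-register accessors (not the exact Zircon macros).
#define ARM64_READ_SYSREG_SKETCH(reg)                       \
    ({                                                      \
        uint64_t _val;                                      \
        __asm__ volatile("mrs %0, " #reg : "=r"(_val));     \
        _val;                                               \
    })

#define ARM64_WRITE_SYSREG_SKETCH(reg, val)                 \
    do {                                                    \
        uint64_t _val = (uint64_t)(val);                    \
        __asm__ volatile("msr " #reg ", %0" :: "r"(_val));  \
    } while (0)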
arm64_feature_init():
It checks the following features:
// call on every cpu to save features
void arm64_feature_init() {
// set up some global constants based on the boot cpu
cpu_num_t cpu = arch_curr_cpu_num();
if (cpu == 0) {
// read the block size of DC ZVA
uint64_t dczid = ARM64_READ_SYSREG(dczid_el0);
uint32_t arm64_zva_shift = 0;
if (BIT(dczid, 4) == 0) {
arm64_zva_shift = (uint32_t)(ARM64_READ_SYSREG(dczid_el0) & 0xf) + 2;
}
ASSERT(arm64_zva_shift != 0); // for now, fail if DC ZVA is unavailable
arm64_zva_size = (1u << arm64_zva_shift);
// read the dcache and icache line size
uint64_t ctr = ARM64_READ_SYSREG(ctr_el0);
uint32_t arm64_dcache_shift = (uint32_t)BITS_SHIFT(ctr, 19, 16) + 2;
arm64_dcache_size = (1u << arm64_dcache_shift);
uint32_t arm64_icache_shift = (uint32_t)BITS(ctr, 3, 0) + 2;
arm64_icache_size = (1u << arm64_icache_shift);
// parse the ISA feature bits
// Collect the cryptographic instruction-set extensions this CPU supports
// arm64_feature_init() runs once on every core
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.100048_0002_05_en/jfa1406793234300.html
arm64_features |= ZX_HAS_CPU_FEATURES;
uint64_t isar0 = ARM64_READ_SYSREG(id_aa64isar0_el1);
if (BITS_SHIFT(isar0, 7, 4) >= 1) {
arm64_features |= ZX_ARM64_FEATURE_ISA_AES;
}
if (BITS_SHIFT(isar0, 7, 4) >= 2) {
arm64_features |= ZX_ARM64_FEATURE_ISA_PMULL;
}
if (BITS_SHIFT(isar0, 11, 8) >= 1) {
arm64_features |= ZX_ARM64_FEATURE_ISA_SHA1;
}
if (BITS_SHIFT(isar0, 15, 12) >= 1) {
arm64_features |= ZX_ARM64_FEATURE_ISA_SHA2;
}
if (BITS_SHIFT(isar0, 19, 16) >= 1) {
arm64_features |= ZX_ARM64_FEATURE_ISA_CRC32;
}
if (BITS_SHIFT(isar0, 23, 20) >= 1) {
arm64_features |= ZX_ARM64_FEATURE_ISA_ATOMICS;
}
if (BITS_SHIFT(isar0, 31, 28) >= 1) {
arm64_features |= ZX_ARM64_FEATURE_ISA_RDM;
}
if (BITS_SHIFT(isar0, 35, 32) >= 1) {
arm64_features |= ZX_ARM64_FEATURE_ISA_SHA3;
}
if (BITS_SHIFT(isar0, 39, 36) >= 1) {
arm64_features |= ZX_ARM64_FEATURE_ISA_SM3;
}
if (BITS_SHIFT(isar0, 43, 40) >= 1) {
arm64_features |= ZX_ARM64_FEATURE_ISA_SM4;
}
if (BITS_SHIFT(isar0, 47, 44) >= 1) {
arm64_features |= ZX_ARM64_FEATURE_ISA_DP;
}
// DPB: whether the data cache clean to point of persistence (DC CVAP) instruction is supported
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.100048_0002_05_en/jfa1406793234300.html
uint64_t isar1 = ARM64_READ_SYSREG(id_aa64isar1_el1);
if (BITS_SHIFT(isar1, 3, 0) >= 1) {
arm64_features |= ZX_ARM64_FEATURE_ISA_DPB;
}
// Whether floating point is supported
// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.100048_0002_05_en/jfa1406793234300.html
uint64_t pfr0 = ARM64_READ_SYSREG(id_aa64pfr0_el1);
if (BITS_SHIFT(pfr0, 19, 16) < 0b1111) {
arm64_features |= ZX_ARM64_FEATURE_ISA_FP;
}
// Whether Advanced SIMD is supported, i.e. single-instruction-multiple-data (vector) operations
if (BITS_SHIFT(pfr0, 23, 20) < 0b1111) {
arm64_features |= ZX_ARM64_FEATURE_ISA_ASIMD;
}
}
// read the cache info for each cpu
arm64_get_cache_info(&(cache_info[cpu]));
// check to make sure implementation supports 16 bit asids
// Whether 16-bit address space IDs (ASIDs) are supported
uint64_t mmfr0 = ARM64_READ_SYSREG(ID_AA64MMFR0_EL1);
ASSERT((mmfr0 & ARM64_MMFR0_ASIDBITS_MASK) == ARM64_MMFR0_ASIDBITS_16);
}
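The ID-register parsing above relies on small bit-field helpers. Rough equivalents, for illustration only (the real Zircon macros may differ in detail, hence the _SKETCH names):

// Rough equivalents of the bit-field helpers used above (illustrative only).
#define BIT_SKETCH(x, bit)              (((x) >> (bit)) & 1ull)
#define BITS_SHIFT_SKETCH(x, high, low) (((x) >> (low)) & ((1ull << ((high) - (low) + 1)) - 1))

// Example: BITS_SHIFT(isar0, 7, 4) extracts the AES field of ID_AA64ISAR0_EL1;
// a value >= 1 means the AES instructions are implemented.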
platform_early_init() mainly deals with physical memory and with the ZBI (Zircon Boot Image).
// ARM64 platform initialization *
void platform_early_init(void) {
// if the zbi_paddr variable is -1, it was not set
// in start.S, so we are in a bad place.
// Physical address of the kernel boot image, passed in as a kernel boot parameter
if (zbi_paddr == -1UL) {
panic("no zbi_paddr!\n");
}
// Get the virtual address of the kernel boot image
void* zbi_vaddr = paddr_to_physmap(zbi_paddr);
// initialize the boot memory reservation system
// Add the memory occupied by the kernel image to the reserved-range list of the PMM (physical memory manager)
boot_reserve_init();
// Base address of the ramdisk
if (zbi_vaddr && is_zbi_container(zbi_vaddr)) {
zbi_header_t* header = (zbi_header_t*)zbi_vaddr;
ramdisk_base = header;
ramdisk_size = ROUNDUP(header->length + sizeof(*header), PAGE_SIZE);
} else {
panic("no bootdata!\n");
}
if (!ramdisk_base || !ramdisk_size) {
panic("no ramdisk!\n");
}
zbi_header_t* zbi = reinterpret_cast<zbi_header_t*>(ramdisk_base);
// Process the other ZBI (Zircon Boot Image) items *
// In the code, boot artifacts are modeled as individual ZBIs: not just the kernel image, but also kernel drivers, the ramdisk, and so on
// walk the zbi structure and process all the items
process_zbi(zbi);
// is the cmdline option to bypass dlog set ?
dlog_bypass_init();
// bring up kernel drivers after we have mapped our peripheral ranges
// Initialize the kernel device drivers *
pdev_init(zbi);
// Serial port should be active now
// Read the kernel boot parameter halt-on-panic
// Read cmdline after processing zbi, which may contain cmdline data.
halt_on_panic = cmdline_get_bool("kernel.halt-on-panic", false);
// Check if serial should be enabled
// Whether the serial port is enabled
const char* serial_mode = cmdline_get("kernel.serial");
uart_disabled = (serial_mode != NULL && !strcmp(serial_mode, "none"));
// add the ramdisk to the boot reserve memory list
// Add the memory occupied by the ramdisk image to the PMM reserved-range list
paddr_t ramdisk_start_phys = physmap_to_paddr(ramdisk_base);
paddr_t ramdisk_end_phys = ramdisk_start_phys + ramdisk_size;
dprintf(INFO, "reserving ramdisk phys range [%#" PRIx64 ", %#" PRIx64 "]\n",
ramdisk_start_phys, ramdisk_end_phys - 1);
boot_reserve_add_range(ramdisk_start_phys, ramdisk_size);
// If a memory limit is configured, initialize it
// check if a memory limit was passed in via kernel.memory-limit-mb and
// find memory ranges to use if one is found.
zx_status_t status = memory_limit_init();
if (status == ZX_OK) {
// Figure out and add arenas based on the memory limit and our range of DRAM
memory_limit_add_range(mem_arena.base, mem_arena.size, mem_arena);
status = memory_limit_add_arenas(mem_arena);
}
// If no memory limit was found, or adding arenas from the range failed, then add
// the existing global arena.
if (status != ZX_OK) {
dprintf(INFO, "memory limit lib returned an error (%d), falling back to default arena\n",
status);
pmm_add_arena(&mem_arena);
}
// tell the boot allocator to mark ranges we've reserved as off limits
boot_reserve_wire();
}
First, a quick preview of two concepts; they will be analyzed in detail in later articles:
ZBI (Zircon Boot Image): besides the traditional Linux-style boot.img and ramdisk.img, the Zircon kernel build produces a number of additional binary outputs. At boot time the bootloader loads them into memory together with kernel.bin; these binaries are the ZBIs.
A ZBI can be a container, a kernel image, a ramdisk, a command line, a kernel driver, and so on, as enumerated by the following macro:
#define ZBI_ALL_TYPES(macro) \
macro(ZBI_TYPE_CONTAINER, "CONTAINER", ".bin") \
macro(ZBI_TYPE_KERNEL_X64, "KERNEL_X64", ".bin") \
macro(ZBI_TYPE_KERNEL_ARM64, "KERNEL_ARM64", ".bin") \
macro(ZBI_TYPE_DISCARD, "DISCARD", ".bin") \
macro(ZBI_TYPE_STORAGE_RAMDISK, "RAMDISK", ".bin") \
macro(ZBI_TYPE_STORAGE_BOOTFS, "BOOTFS", ".bin") \
macro(ZBI_TYPE_CMDLINE, "CMDLINE", ".txt") \
macro(ZBI_TYPE_CRASHLOG, "CRASHLOG", ".bin") \
macro(ZBI_TYPE_NVRAM, "NVRAM", ".bin") \
macro(ZBI_TYPE_PLATFORM_ID, "PLATFORM_ID", ".bin") \
macro(ZBI_TYPE_CPU_CONFIG, "CPU_CONFIG", ".bin") \
macro(ZBI_TYPE_MEM_CONFIG, "MEM_CONFIG", ".bin") \
macro(ZBI_TYPE_KERNEL_DRIVER, "KERNEL_DRIVER", ".bin") \
macro(ZBI_TYPE_ACPI_RSDP, "ACPI_RSDP", ".bin") \
macro(ZBI_TYPE_SMBIOS, "SMBIOS", ".bin") \
macro(ZBI_TYPE_EFI_MEMORY_MAP, "EFI_MEMORY_MAP", ".bin") \
macro(ZBI_TYPE_EFI_SYSTEM_TABLE, "EFI_SYSTEM_TABLE", ".bin") \
macro(ZBI_TYPE_E820_TABLE, "E820_TABLE", ".bin") \
macro(ZBI_TYPE_DEBUG_UART, "DEBUG_UART", ".bin") \
macro(ZBI_TYPE_FRAMEBUFFER, "FRAMEBUFFER", ".bin") \
macro(ZBI_TYPE_DRV_MAC_ADDRESS, "DRV_MAC_ADDRESS", ".bin") \
macro(ZBI_TYPE_DRV_PARTITION_MAP, "DRV_PARTITION_MAP", ".bin") \
macro(ZBI_TYPE_BOOT_CONFIG, "BOOT_CONFIG", ".bin") \
macro(ZBI_TYPE_BOOT_VERSION, "BOOT_VERSION", ".bin")
The ramdisk is the root ZBI.
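process_zbi() walks this container item by item. Below is a hedged sketch of that iteration, assuming the zbi_header_t layout and the ZBI_ALIGN macro from <zircon/boot/image.h>; the dispatch body is omitted:

// Sketch: a ZBI container header is followed by items, each consisting of its
// own zbi_header_t plus a payload padded to ZBI_ALIGNMENT.
static void walk_zbi_sketch(zbi_header_t* container) {
    uint8_t* p = (uint8_t*)(container + 1);
    uint32_t remaining = container->length;
    while (remaining >= sizeof(zbi_header_t)) {
        zbi_header_t* item = (zbi_header_t*)p;
        // dispatch on item->type here (CMDLINE, KERNEL_DRIVER, MEM_CONFIG, ...)
        uint32_t step = (uint32_t)sizeof(zbi_header_t) + ZBI_ALIGN(item->length);
        if (step > remaining) {
            break;  // malformed item; stop walking
        }
        p += step;
        remaining -= step;
    }
}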
PMM (Physical Memory Manager): the mechanism Zircon uses to manage physical memory in a unified way.
To scale beyond what fits on a single die, modern machines adopted the NUMA architecture: in short, multiple CPU sockets, with memory attached to each CPU's own memory controller. Each CPU therefore has memory that it manages locally, and memory as a whole is split into separate regions.
The PMM represents each such region with a PmmNode. The memory behind a CPU's memory controller is rarely fully populated, so there are inevitably gaps in the address space; the PMM abstracts each contiguous range of memory as a PmmArena. Thus:
the PMM manages multiple PmmNodes, and each PmmNode in turn manages multiple PmmArenas.
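A simplified, hypothetical illustration of that hierarchy (the real classes live under kernel/vm and are considerably richer):

// Hypothetical illustration only, not the actual Zircon definitions.
struct PmmArenaSketch {        // one contiguous range of physical memory
    paddr_t base;
    size_t size;
    struct PmmArenaSketch* next;
};

struct PmmNodeSketch {         // one node: owns its arenas and free-page lists
    struct PmmArenaSketch* arenas;
    // free page list, page counters, lock, ...
};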
pdev_init() -> pdev_run_hooks(LK_INIT_LEVEL_PLATFORM_EARLY) -> iterate over the driver ZBI items -> lk_pdev_init_struct.hook()
typedef void (*lk_pdev_init_hook)(const void* driver_data, uint32_t length);
// for registering platform drivers
// Kernel driver descriptor
struct lk_pdev_init_struct {
uint32_t type; // driver type, as defined in
lk_pdev_init_hook hook; // hook for driver init
uint level; // init level for the hook
const char* name;
};
// Driver registration macro
#define LK_PDEV_INIT(_name, _type, _hook, _level) \
__ALIGNED(sizeof(void*)) \
__USED __SECTION(".data.rel.ro.lk_pdev_init") static const struct lk_pdev_init_struct _dev_init_struct_##_name = { \
.type = _type, \
.hook = _hook, \
.level = _level, \
.name = #_name, \
};
static void pdev_init_driver(uint32_t type, const void* driver_data, uint32_t length, uint level) {
const struct lk_pdev_init_struct* ptr;
for (ptr = __start_lk_pdev_init; ptr != __stop_lk_pdev_init; ptr++) {
if (ptr->type == type && ptr->level == level) {
ptr->hook(driver_data, length);
return;
}
}
}
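A hedged example of how a kernel driver would register itself with this mechanism; the driver name, the ZBI driver-type constant, and the hook body below are hypothetical:

// Hypothetical driver registration: when pdev_run_hooks() reaches
// LK_INIT_LEVEL_PLATFORM_EARLY and finds a matching KERNEL_DRIVER ZBI item,
// it calls this hook with the item's payload.
static void example_uart_init(const void* driver_data, uint32_t length) {
    // parse driver_data and bring up the hardware
}

LK_PDEV_INIT(example_uart, KDRV_EXAMPLE_UART, example_uart_init, LK_INIT_LEVEL_PLATFORM_EARLY)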
Before the kernel heap can be initialized, the kernel address-space page tables and virtual memory must be set up first, because the memory managed by the heap lives inside that address space.
VmAspace is Zircon's abstraction of a virtual address space; it is fairly involved and will be discussed separately.
void vm_init_preheap() {
// allow the vmm a shot at initializing some of its data structures
// Construct the VmAspace object that represents the kernel address space
VmAspace::KernelAspaceInitPreHeap();
// Mark these physical pages as already in use
// mark the physical pages used by the boot time allocator
if (boot_alloc_end != boot_alloc_start) {
dprintf(INFO, "VM: marking boot alloc used range [%#" PRIxPTR ", %#" PRIxPTR ")\n", boot_alloc_start,
boot_alloc_end);
MarkPagesInUsePhys(boot_alloc_start, boot_alloc_end - boot_alloc_start);
}
zx_status_t status;
// KASLR: randomize the kernel layout to hide the kernel's location in memory from malicious code
// Skipped here for now
#if !DISABLE_KASLR // Disable random memory padding for KASLR
...
#endif
// grab a page and mark it as the zero page
// Allocate the zero page
status = pmm_alloc_page(0, &zero_page, &zero_page_paddr);
DEBUG_ASSERT(status == ZX_OK);
void* ptr = paddr_to_physmap(zero_page_paddr);
DEBUG_ASSERT(ptr);
arch_zero_page(ptr);
}
void VmAspace::KernelAspaceInitPreHeap() TA_NO_THREAD_SAFETY_ANALYSIS {
// the singleton kernel address space
// Construct the kernel address-space singleton; this function is only called once at boot, so the object is a singleton
// VmAspace = Virtual Memory Address Space, the abstraction of a virtual address space
static VmAspace _kernel_aspace(KERNEL_ASPACE_BASE, KERNEL_ASPACE_SIZE, VmAspace::TYPE_KERNEL, "kernel");
// the singleton dummy root vmar (used to break a reference cycle in
// Destroy())
static VmAddressRegionDummy dummy_vmar;
#if LK_DEBUGLEVEL > 1
_kernel_aspace.Adopt();
dummy_vmar.Adopt();
#endif
dummy_root_vmar = &dummy_vmar;
static VmAddressRegion _kernel_root_vmar(_kernel_aspace);
_kernel_aspace.root_vmar_ = fbl::AdoptRef(&_kernel_root_vmar);
// Initialize it
auto err = _kernel_aspace.Init();
ASSERT(err >= 0);
// save a pointer to the singleton kernel address space
VmAspace::kernel_aspace_ = &_kernel_aspace;
aspaces.push_front(kernel_aspace_);
}
Zircon's kernel heap is implemented by the internal cmpctmalloc allocator.
The implementation details are not examined closely here.
// cmpct heap initialization
void cmpct_init(void) {
LTRACE_ENTRY;
// Initialize the global mutex
// Create a mutex.
mutex_init(&theheap.lock);
// Initialize the free lists
// Initialize the free list.
for (int i = 0; i < NUMBER_OF_BUCKETS; i++) {
theheap.free_lists[i] = NULL;
}
for (int i = 0; i < BUCKET_WORDS; i++) {
theheap.free_list_bits[i] = 0;
}
size_t initial_alloc = HEAP_GROW_SIZE - 2 * sizeof(header_t);
theheap.remaining = 0;
heap_grow(initial_alloc);
}
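cmpctmalloc keeps free chunks in size-class buckets and uses free_list_bits as a bitmap of non-empty buckets. A generic, hypothetical illustration of that lookup idea (not the real cmpctmalloc code; the bit ordering is an assumption):

// Generic illustration only: find the first non-empty bucket at or above
// the one large enough for the request.
static int find_nonempty_bucket_sketch(size_t start_bucket) {
    for (size_t i = start_bucket; i < NUMBER_OF_BUCKETS; i++) {
        if (theheap.free_list_bits[i >> 5] & (1u << (i & 31))) {
            return (int)i;
        }
    }
    return -1;  // nothing big enough is free; the heap must grow
}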
void vm_init() {
LTRACE_ENTRY;
VmAspace* aspace = VmAspace::kernel_aspace();
// The kernel image's sections and their access permissions
// we expect the kernel to be in a temporary mapping, define permanent
// regions for those now
struct temp_region {
const char* name;
vaddr_t base;
size_t size;
uint arch_mmu_flags;
} regions[] = {
{
.name = "kernel_code",
.base = (vaddr_t)__code_start,
.size = ROUNDUP((uintptr_t)__code_end - (uintptr_t)__code_start, PAGE_SIZE),
.arch_mmu_flags = ARCH_MMU_FLAG_PERM_READ | ARCH_MMU_FLAG_PERM_EXECUTE,
},
{
.name = "kernel_rodata",
.base = (vaddr_t)__rodata_start,
.size = ROUNDUP((uintptr_t)__rodata_end - (uintptr_t)__rodata_start, PAGE_SIZE),
.arch_mmu_flags = ARCH_MMU_FLAG_PERM_READ,
},
{
.name = "kernel_data",
.base = (vaddr_t)__data_start,
.size = ROUNDUP((uintptr_t)__data_end - (uintptr_t)__data_start, PAGE_SIZE),
.arch_mmu_flags = ARCH_MMU_FLAG_PERM_READ | ARCH_MMU_FLAG_PERM_WRITE,
},
{
.name = "kernel_bss",
.base = (vaddr_t)__bss_start,
.size = ROUNDUP((uintptr_t)_end - (uintptr_t)__bss_start, PAGE_SIZE),
.arch_mmu_flags = ARCH_MMU_FLAG_PERM_READ | ARCH_MMU_FLAG_PERM_WRITE,
},
};
// Walk the sections above and apply their permissions
for (uint i = 0; i < fbl::count_of(regions); ++i) {
temp_region* region = &regions[i];
ASSERT(IS_PAGE_ALIGNED(region->base));
dprintf(INFO, "VM: reserving kernel region [%#" PRIxPTR ", %#" PRIxPTR ") flags %#x name '%s'\n",
region->base, region->base + region->size, region->arch_mmu_flags, region->name);
// Reserve a region of virtual memory in the VMM; the region is represented by a VmRegion-style object with its own underlying MMU configuration
zx_status_t status = aspace->ReserveSpace(region->name, region->size, region->base);
ASSERT(status == ZX_OK);
// Set the memory-protection parameters for the virtual memory backing this region
status = ProtectRegion(aspace, region->base, region->arch_mmu_flags);
ASSERT(status == ZX_OK);
}
// Reserve the physmap region
// reserve the kernel aspace where the physmap is
aspace->ReserveSpace("physmap", PHYSMAP_SIZE, PHYSMAP_BASE);
// Randomize the kernel layout (KASLR)
#if !DISABLE_KASLR // Disable random memory padding for KASLR
...
#endif
}
The logic here is straightforward:
void kernel_init(void) {
dprintf(SPEW, "initializing mp\n");
// Multi-core (SMP) initialization *
mp_init();
dprintf(SPEW, "initializing timers\n");
// Timer queue initialization *
timer_queue_init();
}
// Multi-core initialization
void mp_init(void) {
// CPU hotplug lock
mutex_init(&mp.hotplug_lock);
// Initialize the inter-processor interrupt (IPI) task lists
mp.ipi_task_lock = SPIN_LOCK_INITIAL_VALUE;
for (uint i = 0; i < fbl::count_of(mp.ipi_task_list); ++i) {
list_initialize(&mp.ipi_task_list[i]);
}
}
// Initialize each CPU's timer queue
void timer_queue_init(void) {
for (uint i = 0; i < SMP_MAX_CPUS; i++) {
list_initialize(&percpu[i].timer_queue);
percpu[i].preempt_timer_deadline = ZX_TIME_INFINITE;
percpu[i].next_timer_deadline = ZX_TIME_INFINITE;
}
}
Note again: although this nominally runs in the bootstrap2 thread, task scheduling has not been enabled at this point, so it still executes on the same CPU and the code below runs sequentially.
// bootstrap2 thread entry function
static int bootstrap2(void*) {
dprintf(SPEW, "top of bootstrap2()\n");
lk_primary_cpu_init_level(LK_INIT_LEVEL_THREADING, LK_INIT_LEVEL_ARCH - 1);
// CPU architecture initialization *
arch_init();
// initialize the rest of the platform
dprintf(SPEW, "initializing platform\n");
lk_primary_cpu_init_level(LK_INIT_LEVEL_ARCH, LK_INIT_LEVEL_PLATFORM - 1);
// Platform initialization *
platform_init();
// initialize the target
dprintf(SPEW, "initializing target\n");
lk_primary_cpu_init_level(LK_INIT_LEVEL_PLATFORM, LK_INIT_LEVEL_TARGET - 1);
// Target-device initialization
// The hook is not implemented yet
target_init();
dprintf(SPEW, "moving to last init level\n");
lk_primary_cpu_init_level(LK_INIT_LEVEL_TARGET, LK_INIT_LEVEL_LAST);
return 0;
}
// Architecture initialization, performed by the bootstrap2 thread
void arch_init() TA_NO_THREAD_SAFETY_ANALYSIS {
// The main task is initializing each CPU's interrupts
arch_mp_init_percpu();
dprintf(INFO, "ARM boot EL%lu\n", arm64_get_boot_el());
arm64_feature_debug(true);
// Read the number of CPUs configured via the boot parameters
uint32_t max_cpus = arch_max_num_cpus();
uint32_t cmdline_max_cpus = cmdline_get_uint32("kernel.smp.maxcpus", max_cpus);
if (cmdline_max_cpus > max_cpus || cmdline_max_cpus <= 0) {
printf("invalid kernel.smp.maxcpus value, defaulting to %u\n", max_cpus);
cmdline_max_cpus = max_cpus;
}
secondaries_to_init = cmdline_max_cpus - 1;
// Initialize the non-prime CPUs *
// The main task is creating an idle thread for each non-prime CPU
lk_init_secondary_cpus(secondaries_to_init);
LTRACEF("releasing %d secondary cpus\n", secondaries_to_init);
// Release the boot lock
// The non-prime CPUs that were waiting in the assembly code can now continue
// Release the secondary cpus.
spin_unlock(&arm_boot_cpu_lock);
// Flush the cache so the change hits memory immediately and is visible to the other CPUs
// Flush the release of the lock, since the secondary cpus are running without cache on.
arch_clean_cache_range((addr_t)&arm_boot_cpu_lock, sizeof(arm_boot_cpu_lock));
}
void lk_init_secondary_cpus(uint secondary_cpu_count) {
if (secondary_cpu_count >= SMP_MAX_CPUS) {
dprintf(CRITICAL, "Invalid secondary_cpu_count %u, SMP_MAX_CPUS %d\n",
secondary_cpu_count, SMP_MAX_CPUS);
secondary_cpu_count = SMP_MAX_CPUS - 1;
}
// Create an idle thread for each secondary CPU
for (uint i = 0; i < secondary_cpu_count; i++) {
thread_t* t = thread_create_idle_thread(i + 1);
if (!t) {
dprintf(CRITICAL, "could not allocate idle thread %u\n", i + 1);
secondary_idle_thread_count = i;
break;
}
}
secondary_idle_thread_count = secondary_cpu_count;
}
This part is mainly related to CPU hotplug.
platform_init() -> platform_cpu_init()
// Bring up the platform's other CPUs (CPU hotplug?)
static void platform_cpu_init(void) {
// Iterate over all clusters
for (uint cluster = 0; cluster < cpu_cluster_count; cluster++) {
// Iterate over all CPU cores in the cluster
for (uint cpu = 0; cpu < cpu_cluster_cpus[cluster]; cpu++) {
// Start each additional CPU
if (cluster != 0 || cpu != 0) {
// create a stack for the cpu we're about to start
zx_status_t status = arm64_create_secondary_stack(cluster, cpu);
DEBUG_ASSERT(status == ZX_OK);
// start the cpu
status = platform_start_cpu(cluster, cpu);
if (status != ZX_OK) {
// TODO(maniscalco): Is continuing really the right thing to do here?
// start failed, free the stack
zx_status_t status = arm64_free_secondary_stack(cluster, cpu);
DEBUG_ASSERT(status == ZX_OK);
continue;
}
// the cpu booted
//
// bootstrap thread is now responsible for freeing its stack
}
}
}
}
Initialization code for the non-prime CPUs.
Most of it has already been covered above.
// called from assembly.
extern "C" void arm64_secondary_entry() {
arm64_cpu_early_init();
spin_lock(&arm_boot_cpu_lock);
spin_unlock(&arm_boot_cpu_lock);
uint cpu = arch_curr_cpu_num();
thread_secondary_cpu_init_early(&_init_thread[cpu - 1]);
// Run early secondary cpu init routines up to the threading level.
lk_init_level(LK_INIT_FLAG_SECONDARY_CPUS, LK_INIT_LEVEL_EARLIEST, LK_INIT_LEVEL_THREADING - 1);
arch_mp_init_percpu();
arm64_feature_debug(false);
lk_secondary_cpu_entry();
}
thread_secondary_cpu_init_early() creates a placeholder (empty) thread for this CPU.
lk_secondary_cpu_entry -> thread_secondary_cpu_entry: