void __init __no_sanitize_address setup_arch(char **cmdline_p)
{
//init_mm is the memory descriptor of the init process (PID 0)
//record the start/end of the kernel text, the end of data and the brk in init_mm
init_mm.start_code = (unsigned long) _text;
init_mm.end_code = (unsigned long) _etext;
init_mm.end_data = (unsigned long) _edata;
init_mm.brk = (unsigned long) _end;
*cmdline_p = boot_command_line;//point cmdline_p at the boot command line
/*
* If we know now that we are going to need KPTI then use non-global
* mappings from the start, avoiding the cost of rewriting
* everything later.
*/
//use non-global (nG) mappings from the start if KPTI (kernel page table isolation) will be needed
arm64_use_ng_mappings = kaslr_requires_kpti();
early_fixmap_init();//early fixmap init: links the bm_pud/bm_pmd/bm_pte skeleton, PTE entries are not filled in yet
early_ioremap_init();//early I/O remap init
setup_machine_fdt(__fdt_pointer);//parse the memory described in the FDT and add it to memblock
/*
* Initialise the static keys early as they may be enabled by the
* cpufeature code and early parameters.
*/
jump_label_init();//initialize the jump label (static key) subsystem
parse_early_param();//parse the "early" parameters on the boot command line
/*
* Unmask asynchronous aborts and fiq after bringing up possible
* earlycon. (Report possible System Errors once we can report this
* occurred).
*/
local_daif_restore(DAIF_PROCCTX_NOIRQ);//unmask SError and FIQ while keeping IRQs masked
/*
* TTBR0 is only used for the identity mapping at this stage. Make it
* point to zero page to avoid speculatively fetching new entries.
*/
cpu_uninstall_idmap();//stop using the identity map: point TTBR0_EL1 at the reserved zero page table
xen_early_init();//Xen is not used here, skip
efi_init();
if (!efi_enabled(EFI_BOOT) && ((u64)_text % MIN_KIMG_ALIGN) != 0)
pr_warn(FW_BUG "Kernel image misaligned at boot, please fix your bootloader!");
arm64_memblock_init();//initialize memblock; after this, memory that must not be used is in memblock.reserved
paging_init();//switch to the fine-grained mappings and free the coarse boot page tables; memblock setup is complete
acpi_table_upgrade();//upgrade ACPI tables with the ones supplied in the initrd (acpi_initrd_files)
/* Parse the ACPI tables for possible boot-time configuration */
acpi_boot_table_init();//parse the ACPI tables for possible boot-time configuration
if (acpi_disabled)
unflatten_device_tree();//convert the flat device tree into a tree of device_node structures
bootmem_init();//initialize the basic memory data structures: pg_data_t, zone and struct page
kasan_init();//KASAN init (not used in this configuration)
request_standard_resources();
early_ioremap_reset();//set after_paging_init to 1
if (acpi_disabled)
psci_dt_init();//PSCI init from the device tree
else
psci_acpi_init();//PSCI init from ACPI
init_bootcpu_ops();//record the boot CPU's enable-method operations in cpu_ops[0]
smp_init_cpus();//enumerate and set up the secondary CPUs
smp_build_mpidr_hash();//build the MPIDR-to-index hash used by the suspend/resume code
/* Init percpu seeds for random tags after cpus are set up. */
kasan_init_tags();//KASAN tag-based mode init (not used in this configuration)
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
/*
* Make sure init_thread_info.ttbr0 always generates translation
* faults in case uaccess_enable() is inadvertently called by the init
* thread.
*/
init_task.thread_info.ttbr0 = phys_to_ttbr(__pa_symbol(reserved_pg_dir));
#endif
if (boot_args[1] || boot_args[2] || boot_args[3]) {
pr_err("WARNING: x1-x3 nonzero in violation of boot protocol:\n"
"\tx1: %016llx\n\tx2: %016llx\n\tx3: %016llx\n"
"This indicates a broken bootloader or old kernel\n",
boot_args[1], boot_args[2], boot_args[3]);
}
}
setup_arch handles the architecture-specific part of early boot. On arm64 it mainly: records the kernel image layout in init_mm, publishes the boot command line, decides whether non-global mappings (KPTI) are needed, sets up the early fixmap and early ioremap, parses the FDT, initializes memblock and switches to the final kernel page tables, parses ACPI or unflattens the device tree, initializes the core memory data structures, and prepares PSCI, the CPU boot operations and the secondary CPUs.
bool kaslr_requires_kpti(void)
{
//without CONFIG_RANDOMIZE_BASE there is no KASLR, so KPTI is not required for this reason
if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE))
return false;
/*
* E0PD does a similar job to KPTI so can be used instead
* where available.
*/
if (IS_ENABLED(CONFIG_ARM64_E0PD)) {
//read the ID_AA64MMFR2_EL1 register
u64 mmfr2 = read_sysreg_s(SYS_ID_AA64MMFR2_EL1);
//if the CPU implements E0PD, KPTI is not needed: the hardware already hides kernel mappings from EL0
if (cpuid_feature_extract_unsigned_field(mmfr2,
ID_AA64MMFR2_E0PD_SHIFT))
return false;
}
/*
* Systems affected by Cavium erratum 27456 are incompatible
* with KPTI.
*/
//CPUs affected by Cavium erratum 27456 cannot use KPTI (the two are incompatible)
if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) {
extern const struct midr_range cavium_erratum_27456_cpus[];
if (is_midr_in_range_list(read_cpuid_id(),
cavium_erratum_27456_cpus))
return false;
}
return kaslr_offset() > 0;
}
KPTI is a security technique that splits the kernel and user page tables so that user space cannot see (let alone access) kernel mappings, which also mitigates CPU vulnerabilities such as Meltdown.
kaslr_requires_kpti decides whether the running system needs KPTI purely because KASLR is enabled: it returns false if KASLR is not configured, if the CPU implements E0PD (which provides equivalent protection in hardware), or if the CPU is affected by Cavium erratum 27456, and otherwise returns true when a non-zero KASLR offset is in use.
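The arm64_use_ng_mappings flag computed in setup_arch feeds straight into the page-table attribute macros. Roughly, as a sketch modelled on arch/arm64/include/asm/pgtable-prot.h of this kernel generation (not an exact quote), it decides whether the nG (non-global) bit is ORed into the default protection values:
/* Sketch: how arm64_use_ng_mappings selects non-global mappings
 * (modelled on arch/arm64/include/asm/pgtable-prot.h, simplified). */
extern bool arm64_use_ng_mappings;

#define PTE_MAYBE_NG        (arm64_use_ng_mappings ? PTE_NG : 0)
#define PMD_MAYBE_NG        (arm64_use_ng_mappings ? PMD_SECT_NG : 0)

/* every kernel mapping created later inherits the nG bit through these */
#define PROT_DEFAULT        (_PROT_DEFAULT | PTE_MAYBE_NG)
#define PROT_SECT_DEFAULT   (_PROT_SECT_DEFAULT | PMD_MAYBE_NG)
Choosing non-global mappings from the very start is what lets the cpufeature code enable KPTI later without rewriting every page-table entry that has already been created.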
void __init early_fixmap_init(void)
{
pgd_t *pgdp;
p4d_t *p4dp, p4d;
pud_t *pudp;
pmd_t *pmdp;
unsigned long addr = FIXADDR_START;
pgdp = pgd_offset_k(addr);//locate the kernel pgd entry covering the fixmap start address
p4dp = p4d_offset(pgdp, addr);//locate the corresponding p4d entry (same entry when p4d is folded)
p4d = READ_ONCE(*p4dp);//read the p4d entry
//more than 3 page-table levels and the p4d entry is already populated (with something other than bm_pud)
if (CONFIG_PGTABLE_LEVELS > 3 &&
!(p4d_none(p4d) || p4d_page_paddr(p4d) == __pa_symbol(bm_pud))) {
/*
* We only end up here if the kernel mapping and the fixmap
* share the top level pgd entry, which should only happen on
* 16k/4 levels configurations.
*/
BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
pudp = pud_offset_kimg(p4dp, addr);//locate the pud entry for the fixmap start address via the kernel-image mapping
} else {
if (p4d_none(p4d))//the p4d entry is empty
//write the physical address of bm_pud into the p4d (top-level) entry
__p4d_populate(p4dp, __pa_symbol(bm_pud), PUD_TYPE_TABLE);
pudp = fixmap_pud(addr);
}
if (pud_none(READ_ONCE(*pudp)))//the pud entry is empty
//write the physical address of bm_pmd into the pud entry
__pud_populate(pudp, __pa_symbol(bm_pmd), PMD_TYPE_TABLE);
pmdp = fixmap_pmd(addr);//locate the pmd entry for the fixmap start address
//write the physical address of bm_pte into the pmd entry
__pmd_populate(pmdp, __pa_symbol(bm_pte), PMD_TYPE_TABLE);
/*
* The boot-ioremap range spans multiple pmds, for which
* we are not prepared:
*/
BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
!= (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
if ((pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)))
|| pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) {
WARN_ON(1);
pr_warn("pmdp %p != %p, %p\n",
pmdp, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)),
fixmap_pmd(fix_to_virt(FIX_BTMAP_END)));
pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n",
fix_to_virt(FIX_BTMAP_BEGIN));
pr_warn("fix_to_virt(FIX_BTMAP_END): %08lx\n",
fix_to_virt(FIX_BTMAP_END));
pr_warn("FIX_BTMAP_END: %d\n", FIX_BTMAP_END);
pr_warn("FIX_BTMAP_BEGIN: %d\n", FIX_BTMAP_BEGIN);
}
}
early_fixmap_init sets up the early fixmap: it links the statically allocated bm_pud/bm_pmd/bm_pte tables into the kernel page tables underneath the fixmap virtual range, so that set_fixmap() can create mappings before any page-table memory can be allocated. Only the table levels are wired up here; the actual PTEs are filled in later by users such as early_ioremap and the FDT mapping code.
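Each fixmap slot is a compile-time enum fixed_addresses index that maps to a fixed virtual address just below FIXADDR_TOP; the bm_* tables wired up above are what back those addresses. A minimal sketch, modelled on the generic helpers in include/asm-generic/fixmap.h, plus an illustrative early user (the FIX_EARLYCON_MEM_BASE slot is only used as an example here):
/* Sketch of the generic fixmap address helpers (include/asm-generic/fixmap.h):
 * slot n lives one page below slot n-1, counting down from FIXADDR_TOP. */
#define __fix_to_virt(x)	(FIXADDR_TOP - ((x) << PAGE_SHIFT))
#define __virt_to_fix(x)	((FIXADDR_TOP - ((x) & PAGE_MASK)) >> PAGE_SHIFT)

/* Illustrative early user: map one physical page into a known slot.
 * set_fixmap() only needs to write a PTE into bm_pte, because
 * early_fixmap_init() already linked bm_pud/bm_pmd/bm_pte together. */
static void *map_one_early_page(phys_addr_t phys)
{
	set_fixmap(FIX_EARLYCON_MEM_BASE, phys & PAGE_MASK);
	return (void *)(fix_to_virt(FIX_EARLYCON_MEM_BASE) + (phys & ~PAGE_MASK));
}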
void __init early_ioremap_init(void)
{
early_ioremap_setup();
}
void __init early_ioremap_setup(void)
{
int i;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
if (WARN_ON(prev_map[i]))
break;
for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
slot_virt[i] = __fix_to_virt(FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*i);
}
static void __init setup_machine_fdt(phys_addr_t dt_phys)
{
int size;
//map the FDT's physical address into the fixmap region (fills in the PTEs) and return its virtual address
void *dt_virt = fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL);
const char *name;
if (dt_virt)
memblock_reserve(dt_phys, size);//add the device tree blob to memblock.reserved
//scan the device tree's memory nodes and add the memory to memblock
if (!dt_virt || !early_init_dt_scan(dt_virt)) {
pr_crit("\n"
"Error: invalid device tree blob at physical address %pa (virtual address 0x%p)\n"
"The dtb must be 8-byte aligned and must not exceed 2 MB in size\n"
"\nPlease check your bootloader.",
&dt_phys, dt_virt);
while (true)
cpu_relax();
}
//remap the FDT read-only
fixmap_remap_fdt(dt_phys, &size, PAGE_KERNEL_RO);
name = of_flat_dt_get_machine_name();
if (!name)
return;
pr_info("Machine model: %s\n", name);
dump_stack_set_arch_desc("%s (DT)", name);
}
setup_machine_fdt mainly does the following: maps the FDT through the fixmap, reserves the blob in memblock, scans it with early_init_dt_scan to add the memory nodes to memblock and pick up the boot command line, remaps the blob read-only, and finally reads and prints the machine model name.
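The path from a /memory node in the FDT into memblock is short. A simplified sketch of the scan, modelled on early_init_dt_scan_memory() in drivers/of/fdt.c (hotplug and linux,usable-memory-range handling are omitted); the real code calls early_init_dt_add_memory_arch(), whose default implementation is essentially memblock_add():
/* Simplified sketch of how early_init_dt_scan() hands memory to memblock
 * (modelled on early_init_dt_scan_memory() in drivers/of/fdt.c). */
static int __init scan_memory_node_sketch(unsigned long node, const char *uname,
					  int depth, void *data)
{
	const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
	const __be32 *reg, *endp;
	int len;

	if (!type || strcmp(type, "memory") != 0)
		return 0;		/* only memory nodes are interesting */

	reg = of_get_flat_dt_prop(node, "reg", &len);
	if (!reg)
		return 0;
	endp = reg + (len / sizeof(__be32));

	while ((endp - reg) >= (dt_root_addr_cells + dt_root_size_cells)) {
		u64 base = dt_mem_next_cell(dt_root_addr_cells, &reg);
		u64 size = dt_mem_next_cell(dt_root_size_cells, &reg);

		if (size)
			memblock_add(base, size);	/* the hand-off to memblock */
	}
	return 0;
}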
void __init parse_early_param(void)
{
static int done __initdata;
static char tmp_cmdline[COMMAND_LINE_SIZE] __initdata;
if (done)
return;
/* All fall through to do_early_param. */
strlcpy(tmp_cmdline, boot_command_line, COMMAND_LINE_SIZE);//copy the boot command line into tmp_cmdline
parse_early_options(tmp_cmdline);//parse the command line and run the handlers registered as early parameters
done = 1;
}
void __init parse_early_options(char *cmdline)
{
parse_args("early options", cmdline, NULL, 0, 0, 0, NULL,
do_early_param);
}
parse_early_param first copies the boot command line into tmp_cmdline with strlcpy and then calls parse_early_options, which simply hands the string to parse_args with do_early_param as the callback; do_early_param runs the setup functions that were registered as early parameters.
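do_early_param only runs handlers registered with the early_param() macro (or __setup() entries marked early). A minimal hypothetical example; the parameter name "mydbg" and the variable are made up purely for illustration:
/* Hypothetical early parameter "mydbg=<n>"; name and variable are
 * illustrative only. The handler runs from parse_early_param(), long
 * before regular __setup()/module parameters are processed. */
static int mydbg_level __initdata;

static int __init mydbg_setup(char *buf)
{
	if (!buf)
		return -EINVAL;
	return kstrtoint(buf, 0, &mydbg_level);
}
early_param("mydbg", mydbg_setup);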
static inline void cpu_uninstall_idmap(void)
{
struct mm_struct *mm = current->active_mm;
cpu_set_reserved_ttbr0();//point TTBR0_EL1 at the physical address of reserved_pg_dir
local_flush_tlb_all();//flush the local TLB
cpu_set_default_tcr_t0sz();//set TCR_EL1.T0SZ according to the configured virtual address width
if (mm != &init_mm && !system_uses_ttbr0_pan())
cpu_switch_mm(mm->pgd, mm);
}
cpu_uninstall_idmap mainly does the following: points TTBR0_EL1 at the empty reserved_pg_dir so the identity mapping can no longer be walked speculatively, flushes the local TLB, restores the default TCR_EL1.T0SZ, and switches back to the current mm's page tables if a user mm is active.
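The first step is what makes the identity map unreachable: TTBR0_EL1 is pointed at a page table that contains no valid entries. A sketch of cpu_set_reserved_ttbr0 as it appears in kernels that already use reserved_pg_dir (modelled on arch/arm64/include/asm/mmu_context.h):
/* Point TTBR0_EL1 at an empty page table so no user/idmap translations
 * can be fetched (sketch modelled on arch/arm64/include/asm/mmu_context.h). */
static inline void cpu_set_reserved_ttbr0(void)
{
	unsigned long ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir));

	write_sysreg(ttbr, ttbr0_el1);
	isb();		/* make the new TTBR0 visible before continuing */
}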
void __init arm64_memblock_init(void)
{
const s64 linear_region_size = BIT(vabits_actual - 1);
/* Handle linux,usable-memory-range property */
fdt_enforce_memory_region();//honour the linux,usable-memory-range property
//remove memory above the supported physical address range
memblock_remove(1ULL << PHYS_MASK_SHIFT, ULLONG_MAX);
//pick a suitable, aligned base for physical memory
memstart_addr = round_down(memblock_start_of_DRAM(),
ARM64_MEMSTART_ALIGN);
/*
* Remove the memory that we will not be able to cover with the
* linear mapping. Take care not to clip the kernel which may be
* high in memory.
*/
//remove memory that the linear mapping cannot cover, taking care not to clip the kernel image
memblock_remove(max_t(u64, memstart_addr + linear_region_size,
__pa_symbol(_end)), ULLONG_MAX);
if (memstart_addr + linear_region_size < memblock_end_of_DRAM()) {
/* ensure that memstart_addr remains sufficiently aligned */
memstart_addr = round_up(memblock_end_of_DRAM() - linear_region_size,
ARM64_MEMSTART_ALIGN);
memblock_remove(0, memstart_addr);
}
/*
* If we are running with a 52-bit kernel VA config on a system that
* does not support it, we have to place the available physical
* memory in the 48-bit addressable part of the linear region, i.e.,
* we have to move it upward. Since memstart_addr represents the
* physical address of PAGE_OFFSET, we have to *subtract* from it.
*/
if (IS_ENABLED(CONFIG_ARM64_VA_BITS_52) && (vabits_actual != 52))
memstart_addr -= _PAGE_OFFSET(48) - _PAGE_OFFSET(52);
/*
* Apply the memory limit if it was set. Since the kernel may be loaded
* high up in memory, add back the kernel region that must be accessible
* via the linear mapping.
*/
if (memory_limit != PHYS_ADDR_MAX) {
memblock_mem_limit_remove_map(memory_limit);
memblock_add(__pa_symbol(_text), (u64)(_end - _text));
}
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
/*
* Add back the memory we just removed if it results in the
* initrd to become inaccessible via the linear mapping.
* Otherwise, this is a no-op
*/
u64 base = phys_initrd_start & PAGE_MASK;
u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base;
/*
* We can only add back the initrd memory if we don't end up
* with more memory than we can address via the linear mapping.
* It is up to the bootloader to position the kernel and the
* initrd reasonably close to each other (i.e., within 32 GB of
* each other) so that all granule/#levels combinations can
* always access both.
*/
if (WARN(base < memblock_start_of_DRAM() ||
base + size > memblock_start_of_DRAM() +
linear_region_size,
"initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) {
phys_initrd_size = 0;
} else {
//remove [base, base + size) and add it back to clear any MEMBLOCK_ flags,
//then mark the range reserved
memblock_remove(base, size); /* clear MEMBLOCK_ flags */
memblock_add(base, size);
memblock_reserve(base, size);
}
}
if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) {
extern u16 memstart_offset_seed;
u64 range = linear_region_size -
(memblock_end_of_DRAM() - memblock_start_of_DRAM());
/*
* If the size of the linear region exceeds, by a sufficient
* margin, the size of the region that the available physical
* memory spans, randomize the linear region as well.
*/
if (memstart_offset_seed > 0 && range >= ARM64_MEMSTART_ALIGN) {
range /= ARM64_MEMSTART_ALIGN;
memstart_addr -= ARM64_MEMSTART_ALIGN *
((range * memstart_offset_seed) >> 16);
}
}
/*
* Register the kernel text, kernel data, initrd, and initial
* pagetables with memblock.
*/
//reserve the kernel image (_text .. _end)
memblock_reserve(__pa_symbol(_text), _end - _text);
if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) {
/* the generic initrd code expects virtual addresses */
initrd_start = __phys_to_virt(phys_initrd_start);
initrd_end = initrd_start + phys_initrd_size;
}
//reserve the regions described as reserved in the device tree
early_init_fdt_scan_reserved_mem();
reserve_elfcorehdr();//reserve memory for the ELF core header (kdump)
if (!IS_ENABLED(CONFIG_ZONE_DMA) && !IS_ENABLED(CONFIG_ZONE_DMA32))
reserve_crashkernel();//parse crashkernel= and reserve memory for the crash kernel
high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
}
arm64_memblock_init mainly does the following: trims memblock to the physically addressable and linearly mappable range and picks memstart_addr, applies the mem= limit while keeping the kernel image and initrd reachable through the linear map, randomizes the linear-map base when KASLR is enabled, and reserves the kernel image, the initrd, the regions described by the device tree, the ELF core header and (when no DMA zones are configured) the crash kernel memory.
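From this point on, memblock is the authoritative map of physical memory, and the rest of early boot consumes it through a small API. An illustrative sketch of typical use (the iteration and allocation calls are the real memblock API; the loop body is only an example):
/* Illustrative consumers of the memblock state built above. */
static void __init memblock_usage_example(void)
{
	phys_addr_t start, end;
	void *buf;
	u64 i;

	/* walk every range that is in memblock.memory but not in .reserved */
	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL)
		pr_info("free: [%pa - %pa]\n", &start, &end);

	/* until the buddy allocator is up, early allocations come from memblock */
	buf = memblock_alloc(SZ_4K, SZ_4K);
	if (!buf)
		panic("early allocation failed\n");
}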
void __init paging_init(void)
{
//map swapper_pg_dir through the fixmap so it can be populated
pgd_t *pgdp = pgd_set_fixmap(__pa_symbol(swapper_pg_dir));
map_kernel(pgdp);//create the fine-grained mappings for the kernel image
map_mem(pgdp);//create linear mappings for all memblock memory
pgd_clear_fixmap();//tear down the temporary fixmap mapping
//switch TTBR1_EL1 from init_pg_dir to the newly populated swapper_pg_dir
cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
init_mm.pgd = swapper_pg_dir;//record the kernel pgd in init_mm
//free the coarse-grained boot page tables (init_pg_dir)
memblock_free(__pa_symbol(init_pg_dir),
__pa_symbol(init_pg_end) - __pa_symbol(init_pg_dir));
memblock_allow_resize();//set memblock_can_resize to 1
}
paging_init mainly does the following: maps swapper_pg_dir through the fixmap, builds the fine-grained kernel image mappings (map_kernel) and the linear mapping of all memblock memory (map_mem), switches TTBR1_EL1 over to swapper_pg_dir, records it in init_mm.pgd, frees the coarse boot page tables in init_pg_dir back to memblock, and finally allows memblock to resize its arrays.
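map_mem is where the linear mapping is actually created: it walks every memblock region and maps it at its PAGE_OFFSET-based virtual address. A heavily simplified sketch (the real code in arch/arm64/mm/mmu.c also temporarily marks the kernel image and crash kernel NOMAP and handles debug_pagealloc/rodata flags):
/* Heavily simplified sketch of map_mem(): map every memblock region
 * into the linear region with PAGE_KERNEL attributes. */
static void __init map_mem_sketch(pgd_t *pgdp)
{
	phys_addr_t start, end;
	u64 i;

	for_each_mem_range(i, &start, &end) {
		if (start >= end)
			break;
		/* __map_memblock() boils down to:
		 *   __create_pgd_mapping(pgdp, start, __phys_to_virt(start),
		 *                        end - start, PAGE_KERNEL, ...);
		 */
		__map_memblock(pgdp, start, end, PAGE_KERNEL, 0);
	}
}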
void __init unflatten_device_tree(void)
{
//parse the FDT and build the tree of device_node structures rooted at of_root
__unflatten_device_tree(initial_boot_params, NULL, &of_root,
early_init_dt_alloc_memory_arch, false);
/* Get pointer to "/chosen" and "/aliases" nodes for use everywhere */
//scan all properties of the "aliases" node
of_alias_scan(early_init_dt_alloc_memory_arch);
unittest_unflatten_overlay_base();//build the base tree for the overlay unit tests (a no-op unless they are enabled)
}
unflatten_device_tree mainly does the following: converts the flat device tree blob into the tree of device_node structures rooted at of_root, scans the /aliases node so that the alias lookups work, and builds the overlay unit-test base tree when that option is enabled.
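Once of_root exists, the usual device-tree accessors work anywhere in the kernel. A small illustrative consumer; the node path "/soc/uart@1000" and the "clock-frequency" property are hypothetical, the of_* calls are the standard API:
/* Illustrative use of the unflattened tree. The path and property
 * names are hypothetical. */
static int __init show_uart_clock(void)
{
	struct device_node *np;
	u32 freq;

	np = of_find_node_by_path("/soc/uart@1000");
	if (!np)
		return -ENODEV;

	if (!of_property_read_u32(np, "clock-frequency", &freq))
		pr_info("%pOF: clock-frequency = %u\n", np, freq);

	of_node_put(np);	/* drop the reference taken by the lookup */
	return 0;
}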
void __init bootmem_init(void)
{
unsigned long min, max;
min = PFN_UP(memblock_start_of_DRAM());
max = PFN_DOWN(memblock_end_of_DRAM());
//boot-time memory read/write test (memtest= parameter)
early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT);
max_pfn = max_low_pfn = max;
min_low_pfn = min;
arm64_numa_init();//NUMA-related initialization
/*
* must be done after arm64_numa_init() which calls numa_init() to
* initialize node_online_map that gets used in hugetlb_cma_reserve()
* while allocating required CMA size across online nodes.
*/
#if defined(CONFIG_HUGETLB_PAGE) && defined(CONFIG_CMA)
arm64_hugetlb_cma_reserve();//reserve CMA for gigantic hugetlb pages
#endif
//if cma_pernuma= was given on the command line, reserve a CMA area on each NUMA node
dma_pernuma_cma_reserve();
/*
* sparse_init() tries to allocate memory from memblock, so must be
* done after the fixed reservations
*/
//create the struct page arrays for every memory section, map them into the
//vmemmap region, and initialize every online mem_section
sparse_init();
zone_sizes_init(min, max);//initialize each node, its zones, and the struct page for every pfn
//reserve the default CMA area below arm64_dma_phys_limit (set up in zone_sizes_init)
dma_contiguous_reserve(arm64_dma_phys_limit);
/*
* request_standard_resources() depends on crashkernel's memory being
* reserved, so do it here.
*/
if (IS_ENABLED(CONFIG_ZONE_DMA) || IS_ENABLED(CONFIG_ZONE_DMA32))
reserve_crashkernel();//reserve memory for the crash kernel
memblock_dump_all();//dump the memblock state when memblock debugging is enabled
}
bootmem_init mainly does the following: runs the optional early memtest, records the minimum and maximum PFNs, initializes NUMA, reserves the hugetlb and per-node CMA areas, builds the sparse memory model (sparse_init), sets up the nodes, zones and struct pages (zone_sizes_init), reserves the default CMA area, reserves the crash kernel memory when DMA zones are configured, and dumps the memblock state for debugging.
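zone_sizes_init is the hand-off to the core mm: it records the highest PFN of each zone and lets free_area_init build the node/zone/struct page hierarchy. A simplified sketch modelled on arch/arm64/mm/init.c (the calculation of the DMA/DMA32 limits is elided; dma32_phys_limit stands in for the 32-bit addressable limit computed there):
/* Simplified sketch of zone_sizes_init(); the DMA/DMA32 limit
 * computations are elided. */
static void __init zone_sizes_init_sketch(unsigned long min, unsigned long max)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES] = { 0 };

#ifdef CONFIG_ZONE_DMA
	max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit);
#endif
#ifdef CONFIG_ZONE_DMA32
	max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit);
#endif
	max_zone_pfns[ZONE_NORMAL] = max;

	free_area_init(max_zone_pfns);	/* builds pg_data_t / zone / struct page */
}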
int __init psci_dt_init(void)
{
struct device_node *np;
const struct of_device_id *matched_np;
psci_initcall_t init_fn;
int ret;
//find the psci node in the device tree
np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np);
if (!np || !of_device_is_available(np))//check that the node exists and is enabled
return -ENODEV;
init_fn = (psci_initcall_t)matched_np->data;//e.g. psci_0_2_init() for "arm,psci-0.2"
ret = init_fn(np);
of_node_put(np);//drop the reference on the node (kobject refcount minus one)
return ret;
}
psci_dt_init mainly does the following: finds the psci node in the device tree, checks that it is available, calls the version-specific init function recorded in the match table (for example psci_0_2_init), which fills in psci_ops with the firmware call implementations, and then drops the node reference.
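Once psci_ops is populated, booting a secondary core through the "psci" enable-method is essentially a single firmware call. Roughly, as a sketch modelled on the arm64 PSCI glue (cpu_psci_cpu_boot):
/* Roughly what the "psci" enable-method's cpu_boot hook does
 * (sketch modelled on cpu_psci_cpu_boot()). */
static int cpu_psci_cpu_boot_sketch(unsigned int cpu)
{
	/* ask the firmware to power the core on and enter the kernel at
	 * secondary_entry, identifying the core by its MPIDR */
	int err = psci_ops.cpu_on(cpu_logical_map(cpu),
				  __pa_symbol(secondary_entry));
	if (err)
		pr_err("failed to boot CPU%d (%d)\n", cpu, err);

	return err;
}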
static inline void __init init_bootcpu_ops(void)
{
init_cpu_ops(0);//read the boot CPU's enable-method and record the matching operations in cpu_ops[0]
}
int __init init_cpu_ops(int cpu)
{
const char *enable_method = cpu_read_enable_method(cpu);
if (!enable_method)
return -ENODEV;
cpu_ops[cpu] = cpu_get_ops(enable_method);//e.g. cpu_psci_ops for "psci"
if (!cpu_ops[cpu]) {
pr_warn("Unsupported enable-method: %s\n", enable_method);
return -EOPNOTSUPP;
}
return 0;
}
init_bootcpu_ops simply calls init_cpu_ops(0) for the boot CPU: it reads the CPU's enable-method string (from the device tree cpu node or the ACPI MADT), looks up the matching struct cpu_operations (for example cpu_psci_ops for "psci" or smp_spin_table_ops for "spin-table"), and stores it in the cpu_ops[] array. These operations are what the kernel later uses to boot and manage each CPU.
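For reference, the operations looked up here are a small table of function pointers. An abridged sketch of the structure (see arch/arm64/include/asm/cpu_ops.h for the full definition, which also has hotplug and suspend hooks):
/* Abridged sketch of struct cpu_operations. */
struct cpu_operations {
	const char *name;			/* matches the "enable-method" string */
	int  (*cpu_init)(unsigned int cpu);	/* parse method-specific DT/ACPI data */
	int  (*cpu_prepare)(unsigned int cpu);	/* preparation before starting the CPU */
	int  (*cpu_boot)(unsigned int cpu);	/* actually start the CPU */
	void (*cpu_postboot)(void);		/* runs on the new CPU once it is up */
	/* ... hotplug and suspend hooks follow in the real definition ... */
};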
void __init smp_init_cpus(void)
{
int i;
if (acpi_disabled)
of_parse_and_init_cpus();//enumerate the CPUs described in the device tree
else
acpi_parse_and_init_cpus();//enumerate the CPUs described by ACPI
if (cpu_count > nr_cpu_ids)
pr_warn("Number of cores (%d) exceeds configured maximum of %u - clipping\n",
cpu_count, nr_cpu_ids);
if (!bootcpu_valid) {
pr_err("missing boot CPU MPIDR, not enabling secondaries\n");
return;
}
/*
* We need to set the cpu_logical_map entries before enabling
* the cpus so that cpu processor description entries (DT cpu nodes
* and ACPI MADT entries) can be retrieved by matching the cpu hwid
* with entries in cpu_logical_map while initializing the cpus.
* If the cpu set-up fails, invalidate the cpu_logical_map entry.
*/
//walk every secondary CPU slot
for (i = 1; i < nr_cpu_ids; i++) {
if (cpu_logical_map(i) != INVALID_HWID) {//the slot holds a valid hwid
if (smp_cpu_setup(i))//if setting up the CPU fails...
set_cpu_logical_map(i, INVALID_HWID);//...invalidate its cpu_logical_map entry
}
}
}
smp_init_cpus mainly does the following: enumerates the CPUs from the device tree or ACPI, warns if more cores were found than the kernel is configured for, checks that the boot CPU's MPIDR was found, and then runs smp_cpu_setup on every discovered secondary CPU, invalidating the cpu_logical_map entry of any CPU that fails to set up.
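smp_cpu_setup itself is small: it resolves the CPU's enable-method and lets that method parse its per-CPU data, and only then marks the CPU possible. Roughly, as a sketch modelled on arch/arm64/kernel/smp.c of this kernel generation:
/* Rough sketch of smp_cpu_setup(): resolve the enable-method, let it
 * parse its data, then mark the CPU possible so it can be brought up. */
static int __init smp_cpu_setup_sketch(int cpu)
{
	const struct cpu_operations *ops;

	if (init_cpu_ops(cpu))		/* same lookup as for the boot CPU */
		return -ENODEV;

	ops = get_cpu_ops(cpu);
	if (ops->cpu_init(cpu))		/* e.g. cpu_psci_cpu_init() */
		return -ENODEV;

	set_cpu_possible(cpu, true);
	return 0;
}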
smp_build_mpidr_hash builds a collision-free hash that maps a CPU's MPIDR_EL1 value onto a small, dense index. It XORs the MPIDR of every possible CPU against that of the boot CPU to discover which affinity bits actually vary across the system, works out how many bits each affinity level really needs, and stores the resulting mask and per-level shifts in the global mpidr_hash structure. Code that only knows the hardware MPIDR, most notably the cpu_suspend/cpu_resume path, then uses this hash to compress an MPIDR into an index into per-CPU context arrays without having to search cpu_logical_map.
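With the mask and shifts stored in mpidr_hash, turning an MPIDR into an index is a handful of ANDs, shifts and ORs. A hedged C rendering of that computation (the suspend/resume path performs the equivalent in assembly; affinity levels 0-2 sit at MPIDR bits 0/8/16 and level 3 at bits 32-39):
/* Hedged C rendering of how an MPIDR is folded into a dense index using
 * the mask/shift_aff values computed by smp_build_mpidr_hash(). */
static u32 mpidr_to_hash_index(u64 mpidr)
{
	u64 m = mpidr & mpidr_hash.mask;	/* keep only the bits that vary */

	return ((m & (0xffULL <<  0)) >> mpidr_hash.shift_aff[0]) |
	       ((m & (0xffULL <<  8)) >> mpidr_hash.shift_aff[1]) |
	       ((m & (0xffULL << 16)) >> mpidr_hash.shift_aff[2]) |
	       ((m & (0xffULL << 32)) >> mpidr_hash.shift_aff[3]);
}
The resulting index is always smaller than 1 << mpidr_hash.bits, which is why the per-CPU context arrays consulted on resume are sized with mpidr_hash_size().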