/*
 * Early 64-bit C entry point as shown in this excerpt: clears .bss, removes
 * the mapping covering virtual address 0, points every exception vector at
 * early_idt_handler, loads the IDT and then continues in
 * x86_64_start_reservations().
 *
 * real_mode_data is not interpreted here; it is passed through unchanged to
 * x86_64_start_reservations().
 */
void __init x86_64_start_kernel(char * real_mode_data)
{
int i;
/* Sanity checks on the kernel image and module area mappings (elided in this excerpt) */
...
/* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
/* Make NULL pointers segfault */
zap_identity_mappings();
/* KERNEL_IMAGE_START = 0xffffffff80000000UL, KERNEL_IMAGE_SIZE = 512M,
 * PAGE_OFFSET = 0xffff880000000000UL, PAGE_SHIFT = 12
 */
max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) // NUM_EXCEPTION_VECTORS is defined as 32 in segment.h
set_intr_gate(i, early_idt_handler); // write a gate descriptor into idt_table[i] whose handler is early_idt_handler
/* struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table };
 * The lidt instruction initializes the IDTR register from idt_descr; NR_VECTORS = 256 */
load_idt((const struct desc_ptr *)&idt_descr);
x86_64_start_reservations(real_mode_data);
}
/*
 * Clear the kernel top-level page-table entry that covers virtual address 0
 * and then flush the TLB.  The caller uses this to make NULL pointer
 * accesses fault.
 */
static void __init zap_identity_mappings(void)
{
	/* In effect: *(init_mm.pgd + pgd_index(0)) = (pgd_t)0 */
	pgd_clear(pgd_offset_k(0UL));

	/* Read-Modify-Write to CR4 */
	__flush_tlb_all();
}
/*
 * Zero the region between the __bss_start and __bss_stop symbols.
 */
static void __init clear_bss(void)
{
	/* Length in bytes, computed from the two boundary symbols. */
	unsigned long bss_bytes = (unsigned long) __bss_stop - (unsigned long) __bss_start;

	memset(__bss_start, 0, bss_bytes);
}
/*
 * Copy the boot parameter block at real_mode_data into boot_params and, when
 * boot_params.hdr.cmd_line_ptr is non-zero, copy COMMAND_LINE_SIZE bytes from
 * the address it designates (translated with __va) into boot_command_line.
 *
 * real_mode_data: address the boot parameters are read from; sizeof
 *                 boot_params bytes are copied.
 */
static void __init copy_bootdata(char *real_mode_data)
{
	memcpy(&boot_params, real_mode_data, sizeof(boot_params));

	/* No command line pointer was supplied: leave boot_command_line alone. */
	if (!boot_params.hdr.cmd_line_ptr)
		return;

	memcpy(boot_command_line, __va(boot_params.hdr.cmd_line_ptr), COMMAND_LINE_SIZE);
}
/*
 * Copy the boot data, initialise memblock, reserve the ranges that must not
 * be handed out (kernel image from _text to __bss_stop, the initrd when
 * CONFIG_BLK_DEV_INITRD is set, and the EBDA region) and then enter
 * start_kernel().
 *
 * real_mode_data: address of the boot parameters; it is translated with
 *                 __va() before being given to copy_bootdata().
 */
void __init x86_64_start_reservations(char *real_mode_data)
{
	copy_bootdata(__va(real_mode_data));

	memblock_init();

	/* Reserve the kernel image itself. */
	memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");

#ifdef CONFIG_BLK_DEV_INITRD
	/* Reserve the initrd when a loader type and a ramdisk address are both set. */
	if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
		/* Only the end is assumed to be possibly unaligned, so only it is rounded up. */
		unsigned long rd_base = boot_params.hdr.ramdisk_image;
		unsigned long rd_len = boot_params.hdr.ramdisk_size;
		unsigned long rd_limit = PAGE_ALIGN(rd_base + rd_len);

		memblock_x86_reserve_range(rd_base, rd_limit, "RAMDISK");
	}
#endif

	reserve_ebda_region();

	/*
	 * From here on, anything still needed from the boot loader, the BIOS
	 * or the kernel text should already be early reserved or marked as
	 * not RAM in e820.  All other memory is free game.
	 */
	start_kernel();
}
/*
 * Architecture-independent kernel initialization entry point.
 *
 * Runs the subsystem initialization calls below in a fixed order. The first
 * part runs with interrupts disabled (local_irq_disable() near the top);
 * interrupts are enabled at local_irq_enable() after the IRQ, timer and
 * timekeeping setup. The function ends by calling rest_init().
 *
 * The per-call notes below are the original author's annotations, translated
 * to English.
 */
asmlinkage void __init start_kernel(void)
{
char * command_line;
extern const struct kernel_param __start___param[], __stop___param[];
/* Does nothing when there is only one CPU; on SMP it identifies the CPU the kernel is booting on */
smp_setup_processor_id();
/* Need to run as early as possible, to initialize the lockdep hash: */
lockdep_init();
debug_objects_early_init();
/* Set up the initial canary ASAP: */
boot_init_stack_canary();
cgroup_init_early(); /* Boot-time cgroup initialization: sets up the subsystems that require early init */
local_irq_disable();
early_boot_irqs_disabled = true;
/* Interrupts are still disabled. Do necessary setups, then enable them */
tick_init();
boot_cpu_init();
/* Initialize page address handling, linking the entries together in lists */
page_address_init();
printk(KERN_NOTICE "%s", linux_banner);
/* Architecture-specific function; the implementation is selected by the ARCH variable in the top-level Makefile of the source tree */
setup_arch(&command_line);
mm_init_owner(&init_mm, &init_task); /* init_mm.owner = &init_task */
mm_init_cpumask(&init_mm);
setup_command_line(command_line);
setup_nr_cpu_ids(); /* nr_cpu_ids = NR_CPUS */
setup_per_cpu_areas();
smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
build_all_zonelists(NULL);
page_alloc_init();
printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
/* The kernel options are parsed in two passes */
parse_early_param();
parse_args("Booting kernel", static_command_line, __start___param,
__stop___param - __start___param, &unknown_bootoption);
/* These use large bootmem allocations and must precede kmem_cache_init() */
setup_log_buf(0);
/* Initialize the hash table used to get a process descriptor pointer from a PID */
pidhash_init();
/* Virtual filesystem initialization, covering dcache, inode, files, mnt, bdev_cache and chrdev_init() */
vfs_caches_init_early();
sort_main_extable();
/* trap_init() initializes the system-reserved interrupt vectors (exceptions,
 * non-maskable interrupts and system calls);
 * init_IRQ() initializes the remaining interrupt vectors.
 */
trap_init();
mm_init();
/* Set up the scheduler prior starting any interrupts (such as the
 * timer interrupt). Full topology setup happens at smp_init()
 * time - but meanwhile we still have a functioning scheduler.
 */
sched_init();
/*
 * Disable preemption - early bootup scheduling is extremely
 * fragile until we cpu_idle() for the first time.
 */
preempt_disable();
/* Interrupts must still be off here; warn and turn them off again if not */
if (!irqs_disabled()) {
printk(KERN_WARNING "start_kernel(): bug: interrupts were "
"enabled *very* early, fixing it\n");
local_irq_disable();
}
idr_init_cache();
/* NOTE: */
perf_event_init();
rcu_init();
radix_tree_init();
/* init some links before init_ISA_irqs() */
early_irq_init();
init_IRQ();
prio_tree_init();
/* Initialize timer-related data structures */
init_timers();
/* Initialize the high-resolution timers */
hrtimers_init();
softirq_init();
timekeeping_init();
/* Initialize the system clock source */
time_init();
/* Initialize the kernel profiling facility (a kernel performance debugging tool) */
profile_init();
call_function_init();
/* Last check before interrupts are deliberately enabled below */
if (!irqs_disabled())
printk(KERN_CRIT "start_kernel(): bug: interrupts were "
"enabled early\n");
early_boot_irqs_disabled = false;
local_irq_enable();
/* Interrupts are enabled now so all GFP allocations are safe. */
gfp_allowed_mask = __GFP_BITS_MASK;
/* Slab initialization */
kmem_cache_init_late();
/*
 * HACK ALERT! This is early. We're enabling the console before
 * we've done PCI setups etc, and console_init() must be aware of
 * this. But we do want output early, in case something goes wrong.
 */
/* Initialize the console so printk output is displayed; printk calls made before this point only stored their data in a buffer */
console_init();
if (panic_later)
panic(panic_later, panic_param);
/* Prints lock dependency information if CONFIG_LOCKDEP is defined; otherwise does nothing */
lockdep_info();
/*
 * Need to run this when irqs are enabled, because it wants
 * to self-test [hard/soft]-irqs on/off lock inversion bugs
 * too:
 */
locking_selftest();
#ifdef CONFIG_BLK_DEV_INITRD
/* Disable the initrd when its first page frame lies below min_low_pfn */
if (initrd_start && !initrd_below_start_ok &&
page_to_pfn(virt_to_page((void *)initrd_start)) < min_low_pfn) {
printk(KERN_CRIT "initrd overwritten (0x%08lx < 0x%08lx) - "
"disabling it.\n",
page_to_pfn(virt_to_page((void *)initrd_start)),
min_low_pfn);
initrd_start = 0;
}
#endif
page_cgroup_init();
enable_debug_pagealloc();
debug_objects_mem_init();
kmemleak_init();
setup_per_cpu_pageset();
numa_policy_init();
/* late_time_init is an optional hook; call it only when it has been set */
if (late_time_init)
late_time_init();
sched_clock_init();
/* Derive the BogoMIPS value from how many very short loops the CPU executes in one second */
calibrate_delay();
pidmap_init();
anon_vma_init();
#ifdef CONFIG_X86
if (efi_enabled)
efi_enter_virtual_mode();
#endif
thread_info_cache_init();
cred_init();
/* Compute the number of processes allowed to be created from the amount of physical memory */
fork_init(totalram_pages);
proc_caches_init();
buffer_init();
key_init();
security_init();
dbg_late_init();
vfs_caches_init(totalram_pages);
signals_init();
/* rootfs populating might need page-writeback */
page_writeback_init();
#ifdef CONFIG_PROC_FS
proc_root_init();
#endif
cgroup_init(); /* Registers the cgroup filesystem, creates the /proc/cgroup file, and initializes every subsystem not already initialized in cgroup_init_early */
cpuset_init();
taskstats_init_early();
delayacct_init();
/* Test the CPU for various bugs and record those detected so that the rest of the kernel can work around them.
 * check_bugs => identify_boot_cpu => identify_cpu does a lot of work, including select_idle_routine and intel_init_thermal
 */
check_bugs();
acpi_early_init(); /* before LAPIC and SMP init */
sfi_init_late();
ftrace_init();
/* Do the rest non-__init'ed, we're now alive */
/* Create the init process */
rest_init();
}