源码基于:Linux5.4
现代软件工程中,一个大的工程通常都会由多个源文件组成,其中包括高级计算机语言编写的源文件,以及汇编语言编写的汇编文件。在编译构建过程中会分别对这些源文件进行汇编、编译生成目标文件,这些目标文件包含:代码段、数据段、符号表等内容。链接器主要任务是将符号引用解析到符号定义上,将多个目标文件和库文件合并成为一个可执行文件或者动态链接库,生成符号表,并对程序代码做最后的检查和优化。
本文主要针对 ARM64架构的 链接脚本 vmlinux.lds.S 进行剖析。
因为 vmlinux.lds.S 的内容比较多,本文将其拆分后分析。
arch/arm64/kernel/vmlinux.lds.S
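#include <asm-generic/vmlinux.lds.h>
#include <asm/cache.h>
#include <asm/kernel-pgtable.h>
#include <asm/thread_info.h>
#include <asm/memory.h>
#include <asm/page.h>
#include <asm/pgtable.h>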
#include "image.h"
/* .exit.text needed in case of alternative patching */
#define ARM_EXIT_KEEP(x) x
#define ARM_EXIT_DISCARD(x)
OUTPUT_ARCH(aarch64)
ENTRY(_text)
jiffies = jiffies_64;
#define HYPERVISOR_EXTABLE \
. = ALIGN(SZ_8); \
__start___kvm_ex_table = .; \
*(__kvm_ex_table) \
__stop___kvm_ex_table = .;
#define HYPERVISOR_TEXT \
/* \
* Align to 4 KB so that \
* a) the HYP vector table is at its minimum \
* alignment of 2048 bytes \
* b) the HYP init code will not cross a page \
* boundary if its size does not exceed \
* 4 KB (see related ASSERT() below) \
*/ \
. = ALIGN(SZ_4K); \
__hyp_idmap_text_start = .; \
*(.hyp.idmap.text) \
__hyp_idmap_text_end = .; \
__hyp_text_start = .; \
*(.hyp.text) \
HYPERVISOR_EXTABLE \
__hyp_text_end = .;
#define IDMAP_TEXT \
. = ALIGN(SZ_4K); \
__idmap_text_start = .; \
*(.idmap.text) \
__idmap_text_end = .;
#ifdef CONFIG_HIBERNATION
#define HIBERNATE_TEXT \
. = ALIGN(SZ_4K); \
__hibernate_exit_text_start = .; \
*(.hibernate_exit.text) \
__hibernate_exit_text_end = .;
#else
#define HIBERNATE_TEXT
#endif
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
#define TRAMP_TEXT \
. = ALIGN(PAGE_SIZE); \
__entry_tramp_text_start = .; \
*(.entry.tramp.text) \
. = ALIGN(PAGE_SIZE); \
__entry_tramp_text_end = .;
#else
#define TRAMP_TEXT
#endif
/*
* The size of the PE/COFF section that covers the kernel image, which
* runs from stext to _edata, must be a round multiple of the PE/COFF
* FileAlignment, which we set to its minimum value of 0x200. 'stext'
* itself is 4 KB aligned, so padding out _edata to a 0x200 aligned
* boundary should be sufficient.
*/
PECOFF_FILE_ALIGNMENT = 0x200;
#ifdef CONFIG_EFI
#define PECOFF_EDATA_PADDING \
.pecoff_edata_padding : { BYTE(0); . = ALIGN(PECOFF_FILE_ALIGNMENT); }
#else
#define PECOFF_EDATA_PADDING
#endif
SECTIONS
{
/*
* XXX: The linker does not define how output sections are
* assigned to input sections when there are multiple statements
* matching the same input section name. There is no documented
* order of matching.
*/
/DISCARD/ : {
ARM_EXIT_DISCARD(EXIT_TEXT)
ARM_EXIT_DISCARD(EXIT_DATA)
EXIT_CALL
*(.discard)
*(.discard.*)
*(.interp .dynamic)
*(.dynsym .dynstr .hash .gnu.hash)
*(.eh_frame)
}
. = KIMAGE_VADDR + TEXT_OFFSET;
.head.text : {
_text = .;
HEAD_TEXT
}
.text : { /* Real text segment */
_stext = .; /* Text and read-only data */
__exception_text_start = .;
*(.exception.text)
__exception_text_end = .;
IRQENTRY_TEXT
SOFTIRQENTRY_TEXT
ENTRY_TEXT
TEXT_TEXT
SCHED_TEXT
CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
HYPERVISOR_TEXT
IDMAP_TEXT
HIBERNATE_TEXT
TRAMP_TEXT
*(.fixup)
*(.gnu.warning)
. = ALIGN(16);
*(.got) /* Global offset table */
}
. = ALIGN(SEGMENT_ALIGN);
_etext = .; /* End of text section */
RO_DATA(PAGE_SIZE) /* everything from this point to */
EXCEPTION_TABLE(8) /* __init_begin will be marked RO NX */
NOTES
. = ALIGN(PAGE_SIZE);
idmap_pg_dir = .;
. += IDMAP_DIR_SIZE;
idmap_pg_end = .;
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
tramp_pg_dir = .;
. += PAGE_SIZE;
#endif
reserved_pg_dir = .;
. += PAGE_SIZE;
swapper_pg_dir = .;
. += PAGE_SIZE;
. = ALIGN(SEGMENT_ALIGN);
__init_begin = .;
__inittext_begin = .;
INIT_TEXT_SECTION(8)
__exittext_begin = .;
.exit.text : {
ARM_EXIT_KEEP(EXIT_TEXT)
}
__exittext_end = .;
. = ALIGN(4);
.altinstructions : {
__alt_instructions = .;
*(.altinstructions)
__alt_instructions_end = .;
}
.altinstr_replacement : {
*(.altinstr_replacement)
}
. = ALIGN(PAGE_SIZE);
__inittext_end = .;
__initdata_begin = .;
.init.data : {
INIT_DATA
INIT_SETUP(16)
INIT_CALLS
CON_INITCALL
INIT_RAM_FS
*(.init.rodata.* .init.bss) /* from the EFI stub */
}
.exit.data : {
ARM_EXIT_KEEP(EXIT_DATA)
}
PERCPU_SECTION(L1_CACHE_BYTES)
.rela.dyn : ALIGN(8) {
*(.rela .rela*)
}
__rela_offset = ABSOLUTE(ADDR(.rela.dyn) - KIMAGE_VADDR);
__rela_size = SIZEOF(.rela.dyn);
#ifdef CONFIG_RELR
.relr.dyn : ALIGN(8) {
*(.relr.dyn)
}
__relr_offset = ABSOLUTE(ADDR(.relr.dyn) - KIMAGE_VADDR);
__relr_size = SIZEOF(.relr.dyn);
#endif
. = ALIGN(SEGMENT_ALIGN);
__initdata_end = .;
__init_end = .;
_data = .;
_sdata = .;
RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
/*
* Data written with the MMU off but read with the MMU on requires
* cache lines to be invalidated, discarding up to a Cache Writeback
* Granule (CWG) of data from the cache. Keep the section that
* requires this type of maintenance to be in its own Cache Writeback
* Granule (CWG) area so the cache maintenance operations don't
* interfere with adjacent data.
*/
.mmuoff.data.write : ALIGN(SZ_2K) {
__mmuoff_data_start = .;
*(.mmuoff.data.write)
}
. = ALIGN(SZ_2K);
.mmuoff.data.read : {
*(.mmuoff.data.read)
__mmuoff_data_end = .;
}
PECOFF_EDATA_PADDING
__pecoff_data_rawsize = ABSOLUTE(. - __initdata_begin);
_edata = .;
BSS_SECTION(0, 0, 0)
. = ALIGN(PAGE_SIZE);
init_pg_dir = .;
. += INIT_DIR_SIZE;
init_pg_end = .;
__pecoff_data_size = ABSOLUTE(. - __initdata_begin);
_end = .;
STABS_DEBUG
HEAD_SYMBOLS
}
#include "image-vars.h"
/*
* The HYP init code and ID map text can't be longer than a page each,
* and should not cross a page boundary.
*/
ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
"HYP init code too big or misaligned")
ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K,
"ID map text too big or misaligned")
#ifdef CONFIG_HIBERNATION
ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1))
<= SZ_4K, "Hibernate exit text too big or misaligned")
#endif
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) <= 3*PAGE_SIZE,
"Entry trampoline text too big")
#endif
/*
* If padding is applied before .head.text, virt<->phys conversions will fail.
*/
ASSERT(_text == (KIMAGE_VADDR + TEXT_OFFSET), "HEAD is misaligned")
通过 vmlinux.lds.S 可以将内核的 section,大体分如下几个区间:
当然还有其他字段,详细看下文具体定义。
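#include <asm-generic/vmlinux.lds.h>
#include <asm/cache.h>
#include <asm/kernel-pgtable.h>
#include <asm/thread_info.h>
#include <asm/memory.h>
#include <asm/page.h>
#include <asm/pgtable.h>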
#include "image.h"
上面的头文件,大多会使用宏定义的方式编写特定段的描述内容,用于在 vmlinux.lds.S 文件中引用。
//指定了链接之后输出文件的体系结构是 aarch64
OUTPUT_ARCH(aarch64)
//指定程序的入口地址为 _text
ENTRY(_text)
//Linux内核中定义了jiffies变量来记录从系统启动到当前时刻系统时钟所产生的tick数
//通常都设置为 jiffies_64
jiffies = jiffies_64;
ENTRY() 用以指定程序的入口地址,这里指定是 _text,其他方式:
SECTIONS { } 是链接脚本语法中的关键命令,用以描述输出文件的内存布局。
SECTIONS 命令告诉链接文件如何把输入文件的段映射到输出文件的各个段中,如何将输入段整合为输出段,如何把输出段放入程序地址空间和进程地址空间中。
/DISCARD/ 是一个特殊的输出段,被该段引用的任何输入段,将不会出现在输出文件中。
/DISCARD/ : {
ARM_EXIT_DISCARD(EXIT_TEXT)
ARM_EXIT_DISCARD(EXIT_DATA)
EXIT_CALL
*(.discard)
*(.discard.*)
*(.interp .dynamic)
*(.dynsym .dynstr .hash .gnu.hash)
*(.eh_frame)
}
. = KIMAGE_VADDR + TEXT_OFFSET;
.head.text : {
_text = .;
HEAD_TEXT
}
'.' 号是链接脚本中一个特殊的符号,用以表示当前位置计数器。
最开始将 KIMAGE_VADDR + TEXT_OFFSET 赋值给 '.' 意思是把代码段的地址设置给当前位置;
KIMAGE_VADDR 在 memory.h 中定义:
arch/arm64/include/asm/memory.h
#define KIMAGE_VADDR (MODULES_END)
TEXT_OFFSET 表示内核在RAM 中的起始位置相对于RAM 起始位置的偏移,在Makefile 中定义:
arch/arm64/Makefile
TEXT_OFFSET := 0x00080000
".head.text" 表示输出段名称,对应的输入段为 HEAD_TEXT:
include/asm-generic/vmlinux.lds.h
#define HEAD_TEXT KEEP(*(.head.text))
意思是将所有目标文件中的 .head.text 都放入 .head.text 输出段中。
其中 _text = .; 用以将符号 _text 定义为当前位置,这里也是内核镜像的起始位置。
从《fixmap详解》一文中,我们看到 KIMAGE_VADDR 是在0xFFFF FFC0 0000 0000 基础上偏移 256M,即KIMAGE_VADDR 的地址为 0xFFFF FFC0 1000 0000。如果加上 TEXT_OFFSET 之后就会得到 .head.text 所在段的地址 0xFFFF FFC0 1008 0000,我们通过 readelf -S vmlinux 来看下:
There are 52 section headers, starting at offset 0x1c82de30:
Section Headers:
[Nr] Name Type Address Offset
Size EntSize Flags Link Info Align
[ 0] NULL 0000000000000000 00000000
0000000000000000 0000000000000000 0 0 0
[ 1] .head.text PROGBITS ffffffc010080000 00010000
0000000000001000 0000000000000000 AX 0 0 4096
[ 2] .text PROGBITS ffffffc010081000 00011000
00000000011ae294 0000000000000000 WAX 0 0 4096
[ 3] .rodata PROGBITS ffffffc011230000 011c0000
0000000000c11d3d 0000000000000000 WAMS 0 0 4096
从vmlinux 的字段信息,可以清晰看到:
.text : { /* Real text segment */
_stext = .; /* Text and read-only data */
__exception_text_start = .;
*(.exception.text)
__exception_text_end = .;
IRQENTRY_TEXT
SOFTIRQENTRY_TEXT
ENTRY_TEXT
TEXT_TEXT
SCHED_TEXT
CPUIDLE_TEXT
LOCK_TEXT
KPROBES_TEXT
HYPERVISOR_TEXT
IDMAP_TEXT
HIBERNATE_TEXT
TRAMP_TEXT
*(.fixup)
*(.gnu.warning)
. = ALIGN(16);
*(.got) /* Global offset table */
}
. = ALIGN(SEGMENT_ALIGN);
_etext = .; /* End of text section */
定义文本段内容,区间为 [ _stext, _etext )
开始的时候,将当前位置存放在 _stext 和 __exception_text_start 中;
接着加入所有输入目标文件的 .exception.text 段到 .text 段中;
添加完 .exception.text 后,将当前位置存入 __exception_text_end 中;
接着是宏定义 IRQENTRY_TEXT,定义在 vmlinux.lds.h 中:
include/asm-generic/vmlinux.lds.h
#define IRQENTRY_TEXT \
ALIGN_FUNCTION(); \
__irqentry_text_start = .; \
*(.irqentry.text) \
__irqentry_text_end = .;
以此类推,最终 .text 段中依次输入:
最后将当前位置存入 _etext 中。
RO_DATA(PAGE_SIZE) /* everything from this point to */
该宏定义是在 vmlinux.lds.h:
include/asm-generic/vmlinux.lds.h
#define RO_DATA(align) RO_DATA_SECTION(align)
#define RO_DATA_SECTION(align) \
. = ALIGN((align)); \
.rodata : AT(ADDR(.rodata) - LOAD_OFFSET) { \
__start_rodata = .; \
*(.rodata) *(.rodata.*) \
RO_AFTER_INIT_DATA /* Read only after init */ \
. = ALIGN(8); \
__start___tracepoints_ptrs = .; \
KEEP(*(__tracepoints_ptrs)) /* Tracepoints: pointer array */ \
__stop___tracepoints_ptrs = .; \
*(__tracepoints_strings)/* Tracepoints: strings */ \
} \
\
.rodata1 : AT(ADDR(.rodata1) - LOAD_OFFSET) { \
*(.rodata1) \
} \
\
/* PCI quirks */ \
.pci_fixup : AT(ADDR(.pci_fixup) - LOAD_OFFSET) { \
__start_pci_fixups_early = .; \
KEEP(*(.pci_fixup_early)) \
__end_pci_fixups_early = .; \
__start_pci_fixups_header = .; \
KEEP(*(.pci_fixup_header)) \
__end_pci_fixups_header = .; \
__start_pci_fixups_final = .; \
KEEP(*(.pci_fixup_final)) \
__end_pci_fixups_final = .; \
__start_pci_fixups_enable = .; \
KEEP(*(.pci_fixup_enable)) \
__end_pci_fixups_enable = .; \
__start_pci_fixups_resume = .; \
KEEP(*(.pci_fixup_resume)) \
__end_pci_fixups_resume = .; \
__start_pci_fixups_resume_early = .; \
KEEP(*(.pci_fixup_resume_early)) \
__end_pci_fixups_resume_early = .; \
__start_pci_fixups_suspend = .; \
KEEP(*(.pci_fixup_suspend)) \
__end_pci_fixups_suspend = .; \
__start_pci_fixups_suspend_late = .; \
KEEP(*(.pci_fixup_suspend_late)) \
__end_pci_fixups_suspend_late = .; \
} \
\
/* Built-in firmware blobs */ \
.builtin_fw : AT(ADDR(.builtin_fw) - LOAD_OFFSET) ALIGN(8) { \
__start_builtin_fw = .; \
KEEP(*(.builtin_fw)) \
__end_builtin_fw = .; \
} \
\
TRACEDATA \
\
/* Kernel symbol table: Normal symbols */ \
__ksymtab : AT(ADDR(__ksymtab) - LOAD_OFFSET) { \
__start___ksymtab = .; \
KEEP(*(SORT(___ksymtab+*))) \
__stop___ksymtab = .; \
} \
\
/* Kernel symbol table: GPL-only symbols */ \
__ksymtab_gpl : AT(ADDR(__ksymtab_gpl) - LOAD_OFFSET) { \
__start___ksymtab_gpl = .; \
KEEP(*(SORT(___ksymtab_gpl+*))) \
__stop___ksymtab_gpl = .; \
} \
\
/* Kernel symbol table: Normal unused symbols */ \
__ksymtab_unused : AT(ADDR(__ksymtab_unused) - LOAD_OFFSET) { \
__start___ksymtab_unused = .; \
KEEP(*(SORT(___ksymtab_unused+*))) \
__stop___ksymtab_unused = .; \
} \
\
/* Kernel symbol table: GPL-only unused symbols */ \
__ksymtab_unused_gpl : AT(ADDR(__ksymtab_unused_gpl) - LOAD_OFFSET) { \
__start___ksymtab_unused_gpl = .; \
KEEP(*(SORT(___ksymtab_unused_gpl+*))) \
__stop___ksymtab_unused_gpl = .; \
} \
\
/* Kernel symbol table: GPL-future-only symbols */ \
__ksymtab_gpl_future : AT(ADDR(__ksymtab_gpl_future) - LOAD_OFFSET) { \
__start___ksymtab_gpl_future = .; \
KEEP(*(SORT(___ksymtab_gpl_future+*))) \
__stop___ksymtab_gpl_future = .; \
} \
\
/* Kernel symbol table: Normal symbols */ \
__kcrctab : AT(ADDR(__kcrctab) - LOAD_OFFSET) { \
__start___kcrctab = .; \
KEEP(*(SORT(___kcrctab+*))) \
__stop___kcrctab = .; \
} \
\
/* Kernel symbol table: GPL-only symbols */ \
__kcrctab_gpl : AT(ADDR(__kcrctab_gpl) - LOAD_OFFSET) { \
__start___kcrctab_gpl = .; \
KEEP(*(SORT(___kcrctab_gpl+*))) \
__stop___kcrctab_gpl = .; \
} \
\
/* Kernel symbol table: Normal unused symbols */ \
__kcrctab_unused : AT(ADDR(__kcrctab_unused) - LOAD_OFFSET) { \
__start___kcrctab_unused = .; \
KEEP(*(SORT(___kcrctab_unused+*))) \
__stop___kcrctab_unused = .; \
} \
\
/* Kernel symbol table: GPL-only unused symbols */ \
__kcrctab_unused_gpl : AT(ADDR(__kcrctab_unused_gpl) - LOAD_OFFSET) { \
__start___kcrctab_unused_gpl = .; \
KEEP(*(SORT(___kcrctab_unused_gpl+*))) \
__stop___kcrctab_unused_gpl = .; \
} \
\
/* Kernel symbol table: GPL-future-only symbols */ \
__kcrctab_gpl_future : AT(ADDR(__kcrctab_gpl_future) - LOAD_OFFSET) { \
__start___kcrctab_gpl_future = .; \
KEEP(*(SORT(___kcrctab_gpl_future+*))) \
__stop___kcrctab_gpl_future = .; \
} \
\
/* Kernel symbol table: strings */ \
__ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) { \
*(__ksymtab_strings) \
} \
\
/* __*init sections */ \
__init_rodata : AT(ADDR(__init_rodata) - LOAD_OFFSET) { \
*(.ref.rodata) \
MEM_KEEP(init.rodata) \
MEM_KEEP(exit.rodata) \
} \
\
/* Built-in module parameters. */ \
__param : AT(ADDR(__param) - LOAD_OFFSET) { \
__start___param = .; \
KEEP(*(__param)) \
__stop___param = .; \
} \
\
/* Built-in module versions. */ \
__modver : AT(ADDR(__modver) - LOAD_OFFSET) { \
__start___modver = .; \
KEEP(*(__modver)) \
__stop___modver = .; \
} \
\
BTF \
\
. = ALIGN((align)); \
__end_rodata = .;
定义只读数据段内容,区间为[ __start_rodata, __end_rodata );
EXCEPTION_TABLE(8) /* __init_begin will be marked RO NX */
该宏定义也是在 vmlinux.lds.h:
include/asm-generic/vmlinux.lds.h
#define EXCEPTION_TABLE(align) \
. = ALIGN(align); \
__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { \
__start___ex_table = .; \
KEEP(*(__ex_table)) \
__stop___ex_table = .; \
}
定义异常表(exception table)段内容,区间为 [ __start___ex_table, __stop___ex_table );
NOTES
该宏定义也是在 vmlinux.lds.h:
include/asm-generic/vmlinux.lds.h
#define NOTES \
/DISCARD/ : { *(.note.GNU-stack) } \
.notes : AT(ADDR(.notes) - LOAD_OFFSET) { \
__start_notes = .; \
KEEP(*(.note.*)) \
__stop_notes = .; \
}
定义notes 段,区间为 [ __start_notes, __stop_notes );
. = ALIGN(PAGE_SIZE);
idmap_pg_dir = .;
. += IDMAP_DIR_SIZE;
idmap_pg_end = .;
#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
tramp_pg_dir = .;
. += PAGE_SIZE;
#endif
reserved_pg_dir = .;
. += PAGE_SIZE;
swapper_pg_dir = .;
. += PAGE_SIZE;
在 .init.text 段之前,会预留一部分的空间给一些页表初始化使用。
例如,idmap_pg_dir、tramp_pg_dir、reserved_pg_dir、swapper_pg_dir 等。
idmap_pg_dir 是 identity mapping 用到的页表。
init 段包括 inittext 段 和 initdata 段。
. = ALIGN(SEGMENT_ALIGN);
__init_begin = .;
...
__init_end = .;
初始化段范围是 [ __init_begin, __init_end ),包括了这里的 inittext 段,以及后面 initdata 段,在 kernel_init() 中内核初始化完成,会将这部分内存释放掉,详细看 free_initmem() 函数。
下面通过第 2.9 节和第 2.10 节,详细剖析 inittext 段 和 initdata 段。
. = ALIGN(SEGMENT_ALIGN);
__init_begin = .;
__inittext_begin = .;
INIT_TEXT_SECTION(8)
__exittext_begin = .;
.exit.text : {
ARM_EXIT_KEEP(EXIT_TEXT)
}
__exittext_end = .;
. = ALIGN(4);
.altinstructions : {
__alt_instructions = .;
*(.altinstructions)
__alt_instructions_end = .;
}
.altinstr_replacement : {
*(.altinstr_replacement)
}
. = ALIGN(PAGE_SIZE);
__inittext_end = .;
本节剖析的是 inittext 段,而不仅仅是 .init.text,inittext 区间为:
[ __inittext_begin, __inittext_end )
其中包括:
.init.text 段主要通过宏 INIT_TEXT_SECTION() 进行定义,下面来看下这个宏:
include/asm-generic/vmlinux.lds.h
#define INIT_TEXT_SECTION(inittext_align) \
. = ALIGN(inittext_align); \
.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { \
_sinittext = .; \
INIT_TEXT \
_einittext = .; \
}
#define INIT_TEXT \
*(.init.text .init.text.*) \
*(.text.startup) \
MEM_DISCARD(init.text*)
__exittext_begin = .;
.exit.text : {
ARM_EXIT_KEEP(EXIT_TEXT)
}
__exittext_end = .;
主要是通过宏 EXIT_TEXT 来定义 .exit.text 的输入section描述:
include/asm-generic/vmlinux.lds.h
#define EXIT_TEXT \
*(.exit.text) \
*(.text.exit) \
MEM_DISCARD(exit.text)
.init.data : {
INIT_DATA
INIT_SETUP(16)
INIT_CALLS
CON_INITCALL
INIT_RAM_FS
*(.init.rodata.* .init.bss) /* from the EFI stub */
}
.exit.data : {
ARM_EXIT_KEEP(EXIT_DATA)
}
PERCPU_SECTION(L1_CACHE_BYTES)
.rela.dyn : ALIGN(8) {
*(.rela .rela*)
}
__rela_offset = ABSOLUTE(ADDR(.rela.dyn) - KIMAGE_VADDR);
__rela_size = SIZEOF(.rela.dyn);
#ifdef CONFIG_RELR
.relr.dyn : ALIGN(8) {
*(.relr.dyn)
}
__relr_offset = ABSOLUTE(ADDR(.relr.dyn) - KIMAGE_VADDR);
__relr_size = SIZEOF(.relr.dyn);
#endif
. = ALIGN(SEGMENT_ALIGN);
__initdata_end = .;
__init_end = .;
本节剖析的是 initdata 段,而不仅仅是 .init.data,initdata 区间为:
[ __initdata_begin,__initdata_end)
这其中包括:
.init.data 又包括:
下面笔者将会逐个剖析这些 .init.data 段的宏定义。当然,initdata 中还有个重要的 .data..percpu 段,通过 PERCPU_SECTION 进行定义,本节也会剖析该段。
include/asm-generic/vmlinux.lds.h
#define INIT_DATA \
KEEP(*(SORT(___kentry+*))) \
*(.init.data init.data.*) \
MEM_DISCARD(init.data*) \
KERNEL_CTORS() \
MCOUNT_REC() \
*(.init.rodata .init.rodata.*) \
FTRACE_EVENTS() \
TRACE_SYSCALLS() \
KPROBE_BLACKLIST() \
ERROR_INJECT_WHITELIST() \
MEM_DISCARD(init.rodata) \
CLK_OF_TABLES() \
RESERVEDMEM_OF_TABLES() \
TIMER_OF_TABLES() \
CPU_METHOD_OF_TABLES() \
CPUIDLE_METHOD_OF_TABLES() \
KERNEL_DTB() \
IRQCHIP_OF_MATCH_TABLE() \
ACPI_PROBE_TABLE(irqchip) \
ACPI_PROBE_TABLE(timer) \
THERMAL_TABLE(governor) \
EARLYCON_TABLE() \
LSM_TABLE() \
EARLY_LSM_TABLE()
include/asm-generic/vmlinux.lds.h
#define INIT_SETUP(initsetup_align) \
. = ALIGN(initsetup_align); \
__setup_start = .; \
KEEP(*(.init.setup)) \
__setup_end = .;
include/asm-generic/vmlinux.lds.h
#define INIT_CALLS \
__initcall_start = .; \
KEEP(*(.initcallearly.init)) \
INIT_CALLS_LEVEL(0) \
INIT_CALLS_LEVEL(1) \
INIT_CALLS_LEVEL(2) \
INIT_CALLS_LEVEL(3) \
INIT_CALLS_LEVEL(4) \
INIT_CALLS_LEVEL(5) \
INIT_CALLS_LEVEL(rootfs) \
INIT_CALLS_LEVEL(6) \
INIT_CALLS_LEVEL(7) \
__initcall_end = .;
在《Linux 中 initcall 机制详解》一文中 initcall 函数指针就是定义在这里。
INIT_CALLS 总的来说,做了四件事情:
下面来看下 INIT_CALLS_LEVEL() 做了什么:
#define INIT_CALLS_LEVEL(level) \
__initcall##level##_start = .; \
KEEP(*(.initcall##level##.init)) \
KEEP(*(.initcall##level##s.init)) \
首先,定义每个 level 的起始位置 __initcall##level##_start,《initcall 机制详解》一文中会连续调用每个 level 下所有 initcall 函数,而这个 level 下第一个 initcall 函数指针,也就是这个 level 函数指针的起始位置定义在 initcall_levels 这个函数指针数组中:
init/main.c
static initcall_entry_t *initcall_levels[] __initdata = {
__initcall0_start,
__initcall1_start,
__initcall2_start,
__initcall3_start,
__initcall4_start,
__initcall5_start,
__initcall6_start,
__initcall7_start,
__initcall_end,
};
回到 INIT_CALLS_LEVEL(),接下来会将所有目标文件中 .initcall##level##.init 和 .initcall##level##s.init 段加入到 .init.data 段中;
这样,在 System.map 中我们清晰看到 initcall 的各个段:
...
ffffffc012032ee0 d __initcall_223_42_trace_init_flags_sys_enterearly
ffffffc012032ee0 D __initcall_start
ffffffc012032ee0 D __setup_end
ffffffc012032ee8 d __initcall_224_66_trace_init_flags_sys_exitearly
ffffffc012032ef0 d __initcall_163_146_cpu_suspend_initearly
ffffffc012032ef8 d __initcall_151_267_asids_initearly
ffffffc012032f00 d __initcall_167_688_spawn_ksoftirqdearly
ffffffc012032f08 d __initcall_343_6656_migration_initearly
...
ffffffc012032f90 d __initcall_312_768_initialize_ptr_randomearly
ffffffc012032f98 D __initcall0_start
ffffffc012032f98 d __initcall_241_771_bpf_jit_charge_init0
ffffffc012032fa0 d __initcall_141_53_init_mmap_min_addr0
ffffffc012032fa8 d __initcall_209_6528_pci_realloc_setup_params0
ffffffc012032fb0 d __initcall_339_1143_net_ns_init0
ffffffc012032fb8 D __initcall1_start
ffffffc012032fb8 d __initcall_160_1437_fpsimd_init1
ffffffc012032fc0 d __initcall_181_669_tagged_addr_init1
...
ffffffc012033178 d __initcall_347_1788_init_default_flow_dissectors1
ffffffc012033180 d __initcall_360_2821_netlink_proto_init1
ffffffc012033188 D __initcall2_start
ffffffc012033188 d __initcall_165_139_debug_monitors_init2
ffffffc012033190 d __initcall_141_333_irq_sysfs_init2
...
ffffffc0120332b8 d __initcall_304_814_kobject_uevent_init2
ffffffc0120332c0 d __initcall_184_1686_msm_rpm_driver_init2s
ffffffc0120332c8 D __initcall3_start
ffffffc0120332c8 d __initcall_173_390_debug_traps_init3
ffffffc0120332d0 d __initcall_161_275_reserve_memblock_reserved_regions3
...
ffffffc012033370 d __initcall_132_5273_gsi_init3
ffffffc012033378 d __initcall_149_547_of_platform_default_populate_init3s
ffffffc012033380 D __initcall4_start
...
ffffffc012033878 D __initcall5_start
...
ffffffc0120339d8 d __initcall_317_1188_xsk_init5
ffffffc0120339e0 d __initcall_211_194_pci_apply_final_quirks5s
ffffffc0120339e8 d __initcall_168_680_populate_rootfsrootfs
ffffffc0120339e8 D __initcallrootfs_start
ffffffc0120339f0 D __initcall6_start
...
ffffffc012034b30 D __initcall7_start
...
ffffffc012034c88 d __initcall_150_554_of_platform_sync_state_init7s
ffffffc012034c90 d __initcall_123_29_alsa_sound_last_init7s
ffffffc012034c98 D __con_initcall_start
ffffffc012034c98 d __initcall_151_246_hvc_console_initcon
ffffffc012034c98 D __initcall_end
ffffffc012034ca0 D __con_initcall_end
include/asm-generic/vmlinux.lds.h
#define CON_INITCALL \
__con_initcall_start = .; \
KEEP(*(.con_initcall.init)) \
__con_initcall_end = .;
console initcall 不同于其他的 initcall 函数,这里会单独确定其起始地址 __con_initcall_start 和结尾地址 __con_initcall_end。
include/asm-generic/vmlinux.lds.h
#ifdef CONFIG_BLK_DEV_INITRD
#define INIT_RAM_FS \
. = ALIGN(4); \
__initramfs_start = .; \
KEEP(*(.init.ramfs)) \
. = ALIGN(8); \
KEEP(*(.init.ramfs.info))
#else
#define INIT_RAM_FS
#endif
include/asm-generic/vmlinux.lds.h
#define PERCPU_SECTION(cacheline) \
. = ALIGN(PAGE_SIZE); \
.data..percpu : AT(ADDR(.data..percpu) - LOAD_OFFSET) { \
__per_cpu_load = .; \
PERCPU_INPUT(cacheline) \
}
首先是进行当前位置的地址对齐,需要 PAGE_SIZE 对齐;
接着,记录 percpu 段的起始位置 __per_cpu_load;
接下来是 percpu 的输入描述:
include/asm-generic/vmlinux.lds.h
#define PERCPU_INPUT(cacheline) \
__per_cpu_start = .; \
*(.data..percpu..first) \
. = ALIGN(PAGE_SIZE); \
*(.data..percpu..page_aligned) \
. = ALIGN(cacheline); \
*(.data..percpu..read_mostly) \
. = ALIGN(cacheline); \
*(.data..percpu) \
*(.data..percpu..shared_aligned) \
PERCPU_DECRYPTED_SECTION \
__per_cpu_end = .;
.data..percpu 段包括:
_data = .;
_sdata = .;
RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
/*
* Data written with the MMU off but read with the MMU on requires
* cache lines to be invalidated, discarding up to a Cache Writeback
* Granule (CWG) of data from the cache. Keep the section that
* requires this type of maintenance to be in its own Cache Writeback
* Granule (CWG) area so the cache maintenance operations don't
* interfere with adjacent data.
*/
.mmuoff.data.write : ALIGN(SZ_2K) {
__mmuoff_data_start = .;
*(.mmuoff.data.write)
}
. = ALIGN(SZ_2K);
.mmuoff.data.read : {
*(.mmuoff.data.read)
__mmuoff_data_end = .;
}
PECOFF_EDATA_PADDING
__pecoff_data_rawsize = ABSOLUTE(. - __initdata_begin);
_edata = .;
[ _data, _edata ) 代表数据相关段。
下面来看下 RW_DATA_SECTION:
include/asm-generic/vmlinux.lds.h
/*
* Writeable data.
* All sections are combined in a single .data section.
* The sections following CONSTRUCTORS are arranged so their
* typical alignment matches.
* A cacheline is typical/always less than a PAGE_SIZE so
* the sections that has this restriction (or similar)
* is located before the ones requiring PAGE_SIZE alignment.
* NOSAVE_DATA starts and ends with a PAGE_SIZE alignment which
* matches the requirement of PAGE_ALIGNED_DATA.
*
* use 0 as page_align if page_aligned data is not used */
#define RW_DATA_SECTION(cacheline, pagealigned, inittask) \
. = ALIGN(PAGE_SIZE); \
.data : AT(ADDR(.data) - LOAD_OFFSET) { \
INIT_TASK_DATA(inittask) \
NOSAVE_DATA \
PAGE_ALIGNED_DATA(pagealigned) \
CACHELINE_ALIGNED_DATA(cacheline) \
READ_MOSTLY_DATA(cacheline) \
DATA_DATA \
CONSTRUCTORS \
} \
BUG_TABLE \
BSS_SECTION(0, 0, 0)
该宏定义是在 vmlinux.lds.h:
#define BSS_SECTION(sbss_align, bss_align, stop_align) \
. = ALIGN(sbss_align); \
__bss_start = .; \
SBSS(sbss_align) \
BSS(bss_align) \
. = ALIGN(stop_align); \
__bss_stop = .;
定义 bss 段内容,区间为 [ __bss_start, __bss_stop );
. = ALIGN(PAGE_SIZE);
init_pg_dir = .;
. += INIT_DIR_SIZE;
init_pg_end = .;
__pecoff_data_size = ABSOLUTE(. - __initdata_begin);
_end = .;
定义了 init_pg_dir,这是内核启动早期使用的临时初始化页表,对于三级页表,会占用 3 个 pages 空间。
在页表映射完成之后,这部分的内存会被释放掉,变成普通内存供 buddy 使用。详细可以查看《paging_init 详解》一文第 10.1 节。
#define STABS_DEBUG \
.stab 0 : { *(.stab) } \
.stabstr 0 : { *(.stabstr) } \
.stab.excl 0 : { *(.stab.excl) } \
.stab.exclstr 0 : { *(.stab.exclstr) } \
.stab.index 0 : { *(.stab.index) } \
.stab.indexstr 0 : { *(.stab.indexstr) } \
.comment 0 : { *(.comment) }
arch/arm64/kernel/image.h
/*
* These will output as part of the Image header, which should be little-endian
* regardless of the endianness of the kernel. While constant values could be
* endian swapped in head.S, all are done here for consistency.
*/
#define HEAD_SYMBOLS \
DEFINE_IMAGE_LE64(_kernel_size_le, _end - _text); \
DEFINE_IMAGE_LE64(_kernel_offset_le, TEXT_OFFSET); \
DEFINE_IMAGE_LE64(_kernel_flags_le, __HEAD_FLAGS);
#define DEFINE_IMAGE_LE64(sym, data) \
sym##_lo32 = DATA_LE32((data) & 0xffffffff); \
sym##_hi32 = DATA_LE32((data) >> 32)
定义头符号,分别是:
在内核镜像生成过程中,这几个符号标志代表的值会作为镜像头的一部分输出。
There are 52 section headers, starting at offset 0x1c82de30:
Section Headers:
[Nr] Name Type Address Offset
Size EntSize Flags Link Info Align
[ 0] NULL 0000000000000000 00000000
0000000000000000 0000000000000000 0 0 0
[ 1] .head.text PROGBITS ffffffc010080000 00010000
0000000000001000 0000000000000000 AX 0 0 4096
[ 2] .text PROGBITS ffffffc010081000 00011000
00000000011ae294 0000000000000000 WAX 0 0 4096
[ 3] .rodata PROGBITS ffffffc011230000 011c0000
0000000000c11d3d 0000000000000000 WAMS 0 0 4096
[ 4] ".mmuoff.data.wri PROGBITS ffffffc011e41d40 01dd1d40
0000000000000008 0000000000000000 WA 0 0 8
[ 5] .rodata1 PROGBITS ffffffc011e41d48 01dd1d48
0000000000000000 0000000000000000 WA 0 0 1
[ 6] .pci_fixup PROGBITS ffffffc011e41d48 01dd1d48
0000000000003438 0000000000000000 A 0 0 8
[ 7] .builtin_fw PROGBITS ffffffc011e45180 01dd5180
0000000000000000 0000000000000000 A 0 0 8
[ 8] __ksymtab PROGBITS ffffffc011e45180 01dd5180
000000000002a018 0000000000000000 A 0 0 8
[ 9] __ksymtab_gpl PROGBITS ffffffc011e6f198 01dff198
0000000000023dd8 0000000000000000 A 0 0 8
[10] __ksymtab_unused PROGBITS ffffffc011e92f70 01e22f70
0000000000000000 0000000000000000 A 0 0 1
[11] __ksymtab_unused_ PROGBITS ffffffc011e92f70 01e22f70
0000000000000000 0000000000000000 A 0 0 1
[12] __ksymtab_gpl_fut PROGBITS ffffffc011e92f70 01e22f70
0000000000000000 0000000000000000 A 0 0 1
[13] __kcrctab PROGBITS ffffffc011e92f70 01e22f70
0000000000007004 0000000000000000 A 0 0 4
[14] __kcrctab_gpl PROGBITS ffffffc011e99f74 01e29f74
0000000000005fa4 0000000000000000 A 0 0 1
[15] __kcrctab_unused PROGBITS ffffffc011e9ff18 01e2ff18
0000000000000000 0000000000000000 A 0 0 1
[16] __kcrctab_unused_ PROGBITS ffffffc011e9ff18 01e2ff18
0000000000000000 0000000000000000 A 0 0 1
[17] __kcrctab_gpl_fut PROGBITS ffffffc011e9ff18 01e2ff18
0000000000000000 0000000000000000 A 0 0 1
[18] __ksymtab_strings PROGBITS ffffffc011e9ff18 01e2ff18
0000000000043abc 0000000000000000 A 0 0 1
[19] __init_rodata PROGBITS ffffffc011ee39d4 01e739d4
0000000000000000 0000000000000000 A 0 0 1
[20] __param PROGBITS ffffffc011ee39d8 01e739d8
00000000000034d0 0000000000000000 A 0 0 8
[21] __modver PROGBITS ffffffc011ee6ea8 01e76ea8
00000000000000e8 0000000000000000 A 0 0 8
[22] __ex_table PROGBITS ffffffc011ee7000 01e77000
0000000000002f98 0000000000000000 A 0 0 8
[23] .modinfo PROGBITS ffffffc011ee9f98 01e79f98
000000000001eda2 0000000000000000 A 0 0 1
[24] .notes NOTE ffffffc011f08d3c 01e98d3c
0000000000000030 0000000000000000 A 0 0 4
[25] .init.text PROGBITS ffffffc011f10000 01ea0000
00000000000802a0 0000000000000000 AX 0 0 4
[26] .exit.text PROGBITS ffffffc011f902a0 01f202a0
0000000000007c98 0000000000000000 AX 0 0 4
[27] .altinstructions PROGBITS ffffffc011f97f38 01f27f38
000000000005c8c8 0000000000000000 A 0 0 1
[28] .altinstr_replace PROGBITS ffffffc011ff4800 01f84800
0000000000027048 0000000000000000 AX 0 0 4
[29] .init.data PROGBITS ffffffc01201c000 01fac000
0000000000018ea8 0000000000000000 WAMS 0 0 256
[30] .data..percpu PROGBITS ffffffc012035000 01fc5000
0000000000016098 0000000000000000 WA 0 0 64
[31] .rela.dyn RELA ffffffc01204b098 01fdb098
0000000000000000 0000000000000018 A 0 0 8
[32] .relr.dyn LOOS+fffff00 ffffffc01204b098 01fdb098
000000000000d1c0 0000000000000008 A 0 0 8
[33] .data PROGBITS ffffffc012060000 01ff0000
00000000001dbe90 0000000000000000 WA 0 0 4096
[34] __bug_table PROGBITS ffffffc01223be90 021cbe90
00000000000217e0 0000000000000000 WA 0 0 4
[35] .mmuoff.data.writ PROGBITS ffffffc01225d800 021ed800
0000000000000010 0000000000000000 WA 0 0 2048
[36] .mmuoff.data.read PROGBITS ffffffc01225e000 021ee000
0000000000000008 0000000000000000 WA 0 0 8
[37] .pecoff_edata_pad PROGBITS ffffffc01225e008 021ee008
00000000000001f8 0000000000000000 WA 0 0 1
[38] .sbss PROGBITS ffffffc01225e200 021ee200
0000000000000000 0000000000000000 WA 0 0 1
[39] .bss NOBITS ffffffc01225f000 021ef000
00000000003243d8 0000000000000000 WA 0 0 4096
[40] .comment PROGBITS 0000000000000000 021ef000
000000000000018e 0000000000000001 MS 0 0 1
[41] .debug_line PROGBITS 0000000000000000 021ef18e
000000000263488f 0000000000000000 0 0 1
[42] .debug_info PROGBITS 0000000000000000 04823a1d
0000000010379e33 0000000000000000 0 0 1
[43] .debug_abbrev PROGBITS 0000000000000000 14b9d850
000000000032fd07 0000000000000000 0 0 1
[44] .debug_aranges PROGBITS 0000000000000000 14ecd560
0000000000000780 0000000000000000 0 0 16
[45] .debug_ranges PROGBITS 0000000000000000 14ecdce0
00000000009b4c00 0000000000000000 0 0 16
[46] .debug_loc PROGBITS 0000000000000000 158828e0
00000000057c6925 0000000000000000 0 0 1
[47] .debug_str PROGBITS 0000000000000000 1b049205
00000000004d9367 0000000000000001 MS 0 0 1
[48] .debug_frame PROGBITS 0000000000000000 1b522570
0000000000201460 0000000000000000 0 0 8
[49] .symtab SYMTAB 0000000000000000 1b7239d0
0000000000b3df60 0000000000000018 51 449199 8
[50] .shstrtab STRTAB 0000000000000000 1c261930
000000000000028c 0000000000000000 0 0 1
[51] .strtab STRTAB 0000000000000000 1c261bbc
00000000005cc26f 0000000000000000 0 0 1
Key to Flags:
W (write), A (alloc), X (execute), M (merge), S (strings)
I (info), L (link order), G (group), T (TLS), E (exclude), x (unknown)
O (extra OS processing required) o (OS specific), p (processor specific)