MMU除了管理虚拟地址到物理地址的映射外,还管理的访问权限。Linux内核分了很多段,如代码段,只读数据段,数据段等,不同的段访问权限肯定是不同的。代码段一般来说可读可执行,并且只能在特权模式下执行,但是不可写;只读数据段只能读,不能写也不能执行;数据段可以读写,不能执行。本文主要就是来介绍下linux内核各个段的权限设置问题,关于用户进程各个段的权限设置会在另外一篇文章中介绍。
我们首先要知道linux分了哪些段,以ARM64为例,我们看下编译生成的连接脚本vmlinux.lds文件。
. = ((((0xffffffffffffffff << (48)) + (0)) + (0x08000000))) + 0x00080000;
.head.text : {
_text = .;
*(.head.text)
}
.text : { /* Real text segment */
_stext = .; /* Text and read-only data */
__exception_text_start = .;
*(.exception.text)
__exception_text_end = .;
. = ALIGN(8); __entry_text_start = .; *(.entry.text) __entry_text_end = .;
. = ALIGN(8); *(.text.hot .text .text.fixup .text.unlikely) *(.ref.text)
. = ALIGN(8); __sched_text_start = .; *(.sched.text) __sched_text_end = .;
. = ALIGN(8); __cpuidle_text_start = .; *(.cpuidle.text) __cpuidle_text_end = .;
. = ALIGN(8); __lock_text_start = .; *(.spinlock.text) __lock_text_end = .;
. = ALIGN(8); __kprobes_text_start = .; *(.kprobes.text) __kprobes_text_end = .;
. = ALIGN(0x00001000); __hyp_idmap_text_start = .; *(.hyp.idmap.text) __hyp_idmap_text_end = .; __hyp_text_start = .; *(.hyp.text) __hyp_text_end = .;
. = ALIGN(0x00001000); __idmap_text_start = .; *(.idmap.text) __idmap_text_end = .;
*(.fixup)
*(.gnu.warning)
. = ALIGN(16);
*(.got) /* Global offset table */
}
. = ALIGN(0x00010000);
_etext = .; /* End of text section */
. = ALIGN(((1 << 12))); .rodata : AT(ADDR(.rodata) - 0) { __start_rodata = .; *(.rodata) *(.rodata.*) __start_data_ro_after_init = .; *(.data..ro_after_init) __end_data_ro_after_init = .; *(__vermagic) . = ALIGN(8); __start___tracepoints_ptrs = .; *(__tracepoints_ptrs) __stop___tracepoints_ptrs = .; *(__tracepoints_strings) } .rodata1 : AT(ADDR(.rodata1) - 0) { *(.rodata1) } . = ALIGN(8); __bug_table : AT(ADDR(__bug_table) - 0) { __start___bug_table = .; *(__bug_table) __stop___bug_table = .; } .pci_fixup : AT(ADDR(.pci_fixup) - 0) { __start_pci_fixups_early = .; *(.pci_fixup_early) __end_pci_fixups_early = .; __start_pci_fixups_header = .; *(.pci_fixup_header) __end_pci_fixups_header = .; __start_pci_fixups_final = .; *(.pci_fixup_final) __end_pci_fixups_final = .; __start_pci_fixups_enable = .; *(.pci_fixup_enable) __end_pci_fixups_enable = .; __start_pci_fixups_resume = .; *(.pci_fixup_resume) __end_pci_fixups_resume = .; __start_pci_fixups_resume_early = .; *(.pci_fixup_resume_early) __end_pci_fixups_resume_early = .; __start_pci_fixups_suspend = .; *(.pci_fixup_suspend) __end_pci_fixups_suspend = .; __start_pci_fixups_suspend_late = .; *(.pci_fixup_suspend_late) __end_pci_fixups_suspend_late = .; } .builtin_fw : AT(ADDR(.builtin_fw) - 0) { __start_builtin_fw = .; *(.builtin_fw) __end_builtin_fw = .; } __ksymtab : AT(ADDR(__ksymtab) - 0) { __start___ksymtab = .; KEEP(*(SORT(___ksymtab+*))) __stop___ksymtab = .; } __ksymtab_gpl : AT(ADDR(__ksymtab_gpl) - 0) { __start___ksymtab_gpl = .; KEEP(*(SORT(___ksymtab_gpl+*))) __stop___ksymtab_gpl = .; } __ksymtab_unused : AT(ADDR(__ksymtab_unused) - 0) { __start___ksymtab_unused = .; KEEP(*(SORT(___ksymtab_unused+*))) __stop___ksymtab_unused = .; } __ksymtab_unused_gpl : AT(ADDR(__ksymtab_unused_gpl) - 0) { __start___ksymtab_unused_gpl = .; KEEP(*(SORT(___ksymtab_unused_gpl+*))) __stop___ksymtab_unused_gpl = .; } __ksymtab_gpl_future : AT(ADDR(__ksymtab_gpl_future) - 0) { __start___ksymtab_gpl_future = .; KEEP(*(SORT(___ksymtab_gpl_future+*))) __stop___ksymtab_gpl_future = .; } __kcrctab : AT(ADDR(__kcrctab) - 0) { __start___kcrctab = .; KEEP(*(SORT(___kcrctab+*))) __stop___kcrctab = .; } __kcrctab_gpl : AT(ADDR(__kcrctab_gpl) - 0) { __start___kcrctab_gpl = .; KEEP(*(SORT(___kcrctab_gpl+*))) __stop___kcrctab_gpl = .; } __kcrctab_unused : AT(ADDR(__kcrctab_unused) - 0) { __start___kcrctab_unused = .; KEEP(*(SORT(___kcrctab_unused+*))) __stop___kcrctab_unused = .; } __kcrctab_unused_gpl : AT(ADDR(__kcrctab_unused_gpl) - 0) { __start___kcrctab_unused_gpl = .; KEEP(*(SORT(___kcrctab_unused_gpl+*))) __stop___kcrctab_unused_gpl = .; } __kcrctab_gpl_future : AT(ADDR(__kcrctab_gpl_future) - 0) { __start___kcrctab_gpl_future = .; KEEP(*(SORT(___kcrctab_gpl_future+*))) __stop___kcrctab_gpl_future = .; } __ksymtab_strings : AT(ADDR(__ksymtab_strings) - 0) { KEEP(*(__ksymtab_strings)) } __init_rodata : AT(ADDR(__init_rodata) - 0) { *(.ref.rodata) } __param : AT(ADDR(__param) - 0) { __start___param = .; *(__param) __stop___param = .; } __modver : AT(ADDR(__modver) - 0) { __start___modver = .; *(__modver) __stop___modver = .; . = ALIGN(((1 << 12))); __end_rodata = .; } . = ALIGN(((1 << 12))); /* everything from this point to */
. = ALIGN(8); __ex_table : AT(ADDR(__ex_table) - 0) { __start___ex_table = .; *(__ex_table) __stop___ex_table = .; } /* __init_begin will be marked RO NX */
.notes : AT(ADDR(.notes) - 0) { __start_notes = .; *(.note.*) __stop_notes = .; }
. = ALIGN(0x00010000);
__init_begin = .;
. = ALIGN(8); .init.text : AT(ADDR(.init.text) - 0) { _sinittext = .; *(.init.text) *(.text.startup) *(.meminit.text) _einittext = .; }
.exit.text : {
*(.exit.text) *(.text.exit) *(.memexit.text)
}
.init.data : {
KEEP(*(SORT(___kentry+*))) *(.init.data) *(.meminit.data) *(.init.rodata) *(.meminit.rodata) . = ALIGN(8); __clk_of_table = .; *(__clk_of_table) *(__clk_of_table_end) . = ALIGN(8); __reservedmem_of_table = .; *(__reservedmem_of_table) *(__reservedmem_of_table_end) . = ALIGN(8); __clksrc_of_table = .; *(__clksrc_of_table) *(__clksrc_of_table_end) . = ALIGN(8); __iommu_of_table = .; *(__iommu_of_table) *(__iommu_of_table_end) . = ALIGN(8); __cpu_method_of_table = .; *(__cpu_method_of_table) *(__cpu_method_of_table_end) . = ALIGN(8); __cpuidle_method_of_table = .; *(__cpuidle_method_of_table) *(__cpuidle_method_of_table_end) . = ALIGN(32); __dtb_start = .; *(.dtb.init.rodata) __dtb_end = .; . = ALIGN(8); __irqchip_of_table = .; *(__irqchip_of_table) *(__irqchip_of_table_end) . = ALIGN(8); __irqchip_acpi_probe_table = .; *(__irqchip_acpi_probe_table) __irqchip_acpi_probe_table_end = .; . = ALIGN(8); __clksrc_acpi_probe_table = .; *(__clksrc_acpi_probe_table) __clksrc_acpi_probe_table_end = .; . = ALIGN(32); __earlycon_table = .; *(__earlycon_table) __earlycon_table_end = .;
. = ALIGN(16); __setup_start = .; *(.init.setup) __setup_end = .;
__initcall_start = .; KEEP(*(.initcallearly.init)) __initcall0_start = .; KEEP(*(.initcall0.init)) KEEP(*(.initcall0s.init)) __initcall1_start = .; KEEP(*(.initcall1.init)) KEEP(*(.initcall1s.init)) __initcall2_start = .; KEEP(*(.initcall2.init)) KEEP(*(.initcall2s.init)) __initcall3_start = .; KEEP(*(.initcall3.init)) KEEP(*(.initcall3s.init)) __initcall4_start = .; KEEP(*(.initcall4.init)) KEEP(*(.initcall4s.init)) __initcall5_start = .; KEEP(*(.initcall5.init)) KEEP(*(.initcall5s.init)) __initcallrootfs_start = .; KEEP(*(.initcallrootfs.init)) KEEP(*(.initcallrootfss.init)) __initcall6_start = .; KEEP(*(.initcall6.init)) KEEP(*(.initcall6s.init)) __initcall7_start = .; KEEP(*(.initcall7.init)) KEEP(*(.initcall7s.init)) __initcall_end = .;
__con_initcall_start = .; KEEP(*(.con_initcall.init)) __con_initcall_end = .;
__security_initcall_start = .; KEEP(*(.security_initcall.init)) __security_initcall_end = .;
. = ALIGN(4); __initramfs_start = .; KEEP(*(.init.ramfs)) . = ALIGN(8); KEEP(*(.init.ramfs.info))
*(.init.rodata.* .init.bss) /* from the EFI stub */
}
.exit.data : {
*(.exit.data) *(.fini_array) *(.dtors) *(.memexit.data) *(.memexit.rodata)
}
. = ALIGN((1 << 12)); .data..percpu : AT(ADDR(.data..percpu) - 0) { __per_cpu_load = .; __per_cpu_start = .; *(.data..percpu..first) . = ALIGN((1 << 12)); *(.data..percpu..page_aligned) . = ALIGN((1 << 7)); *(.data..percpu..read_mostly) . = ALIGN((1 << 7)); *(.data..percpu) *(.data..percpu..shared_aligned) __per_cpu_end = .; }
. = ALIGN(4);
.altinstructions : {
__alt_instructions = .;
*(.altinstructions)
__alt_instructions_end = .;
}
.altinstr_replacement : {
*(.altinstr_replacement)
}
.rela : ALIGN(8) {
*(.rela .rela*)
}
__rela_offset = ABSOLUTE(ADDR(.rela) - ((((0xffffffffffffffff << (48)) + (0)) + (0x08000000))));
__rela_size = SIZEOF(.rela);
. = ALIGN(0x00010000);
__init_end = .;
_data = .;
_sdata = .;
. = ALIGN((1 << 12)); .data : AT(ADDR(.data) - 0) { . = ALIGN(16384); __start_init_task = .; *(.data..init_task) __end_init_task = .; . = ALIGN((1 << 12)); __nosave_begin = .; *(.data..nosave) . = ALIGN((1 << 12)); __nosave_end = .; . = ALIGN((1 << 12)); *(.data..page_aligned) . = ALIGN((1 << 7)); *(.data..cacheline_aligned) . = ALIGN((1 << 7)); *(.data..read_mostly) . = ALIGN((1 << 7)); *(.data .data.[0-9a-zA-Z_]*) *(.ref.data) *(.data..shared_aligned) *(.data.unlikely) . = ALIGN(32); *(__tracepoints) . = ALIGN(8); __start___jump_table = .; *(__jump_table) __stop___jump_table = .; . = ALIGN(8); __start___verbose = .; *(__verbose) __stop___verbose = .; CONSTRUCTORS }
/*
* Data written with the MMU off but read with the MMU on requires
* cache lines to be invalidated, discarding up to a Cache Writeback
* Granule (CWG) of data from the cache. Keep the section that
* requires this type of maintenance to be in its own Cache Writeback
* Granule (CWG) area so the cache maintenance operations don't
* interfere with adjacent data.
*/
.mmuoff.data.write : ALIGN(0x00000800) {
__mmuoff_data_start = .;
*(.mmuoff.data.write)
}
. = ALIGN(0x00000800);
.mmuoff.data.read : {
*(.mmuoff.data.read)
__mmuoff_data_end = .;
}
.pecoff_edata_padding : { BYTE(0); . = ALIGN(PECOFF_FILE_ALIGNMENT); }
_edata = .;
. = ALIGN(0); __bss_start = .; . = ALIGN(0); .sbss : AT(ADDR(.sbss) - 0) { *(.sbss) *(.scommon) } . = ALIGN(0); .bss : AT(ADDR(.bss) - 0) { *(.bss..page_aligned) *(.dynbss) *(.bss .bss.[0-9a-zA-Z_]*) *(COMMON) } . = ALIGN(0); __bss_stop = .;
. = ALIGN((1 << 12));
idmap_pg_dir = .;
. += ((((((48)) - 4) / (12 - 3)) - 1) * (1 << 12));
swapper_pg_dir = .;
. += ((4 - 1) * (1 << 12));
_end = .;
.stab 0 : { *(.stab) } .stabstr 0 : { *(.stabstr) } .stab.excl 0 : { *(.stab.excl) } .stab.exclstr 0 : { *(.stab.exclstr) } .stab.index 0 : { *(.stab.index) } .stab.indexstr 0 : { *(.stab.indexstr) } .comment 0 : { *(.comment) }
_kernel_size_le_lo32 = (((_end - _text) & 0xffffffff) & 0xffffffff); _kernel_size_le_hi32 = (((_end - _text) >> 32) & 0xffffffff); _kernel_offset_le_lo32 = (((0x00080000) & 0xffffffff) & 0xffffffff); _kernel_offset_le_hi32 = (((0x00080000) >> 32) & 0xffffffff); _kernel_flags_le_lo32 = (((((0 << 0) | (((12 - 10) / 2) << 1) | (1 << 3))) & 0xffffffff) & 0xffffffff); _kernel_flags_le_hi32 = (((((0 << 0) | (((12 - 10) / 2) << 1) | (1 << 3))) >> 32) & 0xffffffff);
}
/*
* The HYP init code and ID map text can't be longer than a page each,
* and should not cross a page boundary.
*/
ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(0x00001000 - 1)) <= 0x00001000,
"HYP init code too big or misaligned")
ASSERT(__idmap_text_end - (__idmap_text_start & ~(0x00001000 - 1)) <= 0x00001000,
"ID map text too big or misaligned")
/*
* If padding is applied before .head.text, virt<->phys conversions will fail.
*/
ASSERT(_text == (((((0xffffffffffffffff << (48)) + (0)) + (0x08000000))) + 0x00080000), "HEAD is misaligned")
以ARM64为例,内核各个段权限的设置在arch/arm64/mm/mmu.c的map_kernel中定义。
static void __init map_kernel(pgd_t *pgd)
{
static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data;
map_kernel_segment(pgd, _text, _etext, PAGE_KERNEL_EXEC, &vmlinux_text);
map_kernel_segment(pgd, __start_rodata, __init_begin, PAGE_KERNEL, &vmlinux_rodata);
map_kernel_segment(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC,
&vmlinux_init);
map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data);
if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) {
/*
* The fixmap falls in a separate pgd to the kernel, and doesn't
* live in the carveout for the swapper_pg_dir. We can simply
* re-use the existing dir for the fixmap.
*/
set_pgd(pgd_offset_raw(pgd, FIXADDR_START),
*pgd_offset_k(FIXADDR_START));
} else if (CONFIG_PGTABLE_LEVELS > 3) {
/*
* The fixmap shares its top level pgd entry with the kernel
* mapping. This can really only occur when we are running
* with 16k/4 levels, so we can simply reuse the pud level
* entry instead.
*/
BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES));
set_pud(pud_set_fixmap_offset(pgd, FIXADDR_START),
__pud(__pa(bm_pmd) | PUD_TYPE_TABLE));
pud_clear_fixmap();
} else {
BUG();
}
kasan_copy_shadow(pgd);
}
#define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE)
PTE_UXN:指明非特权模式下不可执行,没有指明PTE_RDONLY,所以代码段是可以写的,这样岂不是很危险?
#define PAGE_KERNEL __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE)
PTE_PXN指明特权模式下不可执行,PTE_UXN指明非特权模式下不可执行,但是。。。没有指明PTE_RDONLY,只读数据段也可以写?什么鬼?
#define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE)
初始化段同时包括数据和代码,这里设置为跟代码段权限一致,但是依然是可写的,不过初始化完成后这个段会被释放,我们姑且认为安全隐患比较小。
#define PAGE_KERNEL __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE)
可读写,不可执行,这个看起来比较正常。
看到这里可以肯定的是,事情没有那么简单,权限设定绝不可能仅仅在此处。代码段必须是只读的,否则存放很大的安全漏洞,如果内核代码被恶意程序修改了,那么CPU将很容易被黑客控制,所以代码段应该被设置为PAGE_KERNEL_ROX属性,只读,可特权模式下执行,而只读数据段应该被设置为PAGE_KERNEL_RO只读。果不其然,全局搜索一下这两个宏,果然发现在mark_readonly函数中,会将代码段和只读数据段的权限分别设置为PAGE_KERNEL_ROX和PAGE_KERNEL_RO。
那么为什么不一开始就直接设置为只读呢?这里其实是内核给用户提供了一个便利,如果启动内核的时候通过bootargs设置rodata=false,则不会将只读数据段和代码段设置为只读,这应该是方便调试,或者用于一些特殊的场景,如果没有设置,则默认将代码段和只读数据段设置为只读模式。