实验平台:
intel x86_64
centos 7:3.10.0
// linux-3.10.1/Documentation/x86/x86_64/mm.txt
<previous description obsolete, deleted>
Virtual memory map with 4 level page tables:
0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
hole caused by [48:63] sign extension
ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
... unused hole ...
ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0
ffffffffa0000000 - ffffffffff5fffff (=1525 MB) module mapping space
ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
The direct mapping covers all memory in the system up to the highest
memory address (this means in some cases it can also include PCI memory
holes).
vmalloc space is lazily synchronized into the different PML4 pages of
the processes using the page fault handler, with init_level4_pgt as
reference.
Current X86-64 implementations only support 40 bits of address space,
but we support up to 46 bits. This expands into MBZ space in the page tables.
x86_64虚拟地址空间布局如下(x86_64物理内存空间zone区域划分没有高端内存区域):
主要是分析其中的:
这两块内核虚拟地址空间都是直接映射区,地址连续,和物理地址空间是简单的线性映射关系。
......
ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
......
ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0
......
(32位系统下这两个区域是合并在一起的地址空间连续的直接映射区)
从 __START_KERNEL_map(0xffffffff80000000)开始的 512M 用于存放内核代码段、全局变量、BSS 等。这里对应到物理内存开始的位置,减去 __START_KERNEL_map 就能得到物理内存的地址。
对于目前我的centos7.6 ,3.10.0,内核通常安装在物理地址0x1000000处,从第16MB开始,因此内核代码段的起始地址是:0xffffffff81000000。
也就是说,内核代码段 _text 的起始虚拟地址是:0xffffffff81000000。
// linux-3.10.1/arch/x86/include/asm/page_64_types.h
#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \
(CONFIG_PHYSICAL_ALIGN - 1)) & \
~(CONFIG_PHYSICAL_ALIGN - 1))
#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
// linux-3.10.1/Documentation/x86/x86_64/mm.txt
ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0
__START_KERNEL_map是内核映射的起始虚拟地址,从物理地址phys_base映射到虚拟地址__START_KERNEL_map。
物理地址phys_base:64bit下为了支持KASLR(kernel address space layout randomization),内核映像在物理内存中的加载地址可以是一个随机地址phys_base;centos7.6 默认没有开启KASLR,物理地址phys_base默认是0。
32位系统下也都是默认从物理地址0开始建立映射关系。
(3.10.0没有加入KASLR,请参考:https://www.phoronix.com/news/KASLR-Default-Linux-4.12)
// linux-3.10.1/arch/x86/kernel/head_64.S
.text
__HEAD
.code64
.globl startup_64
startup_64:
......
/*
* Compute the delta between the address I am compiled to run at and the
* address I am actually running at.
*/
leaq _text(%rip), %rbp
subq $_text - __START_KERNEL_map, %rbp
......
/* Fixup phys_base */
addq %rbp, phys_base(%rip)
ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
KASLR:将kernel随机的加载到不同的物理地址运行,内核在自引导及decompressed后,会通过判断kaslr命令行参数是否enable来确定是否对加载内核的物理地址和内核运行的虚拟地址进行随机化操作。
__PHYSICAL_START宏是内核代码段在物理内存中的起始地址,即:0x1000000。
__START_KERNEL宏是内核代码段映射的起始虚拟地址,即:_text内核虚拟地址0xffffffff81000000。
_text 的物理地址(__PHYSICAL_START宏) = 0xffffffff81000000 - 0xffffffff80000000 = 0x1000000
内核代码段的起始物理地址_text就是 0x1000000。
也可以通过看内核的配置的文件查看内核代码段的起始物理地址_text:
vim /boot/config-3.10.0-693.el7.x86_64
通过 /proc/iomem 查看系统的物理地址空间,可以看到内核代码段的起始物理地址是0x1000000,与上述相等。
// linux-3.10.1/arch/x86/kernel/vmlinux.lds.S
#define LOAD_OFFSET __START_KERNEL_map
OUTPUT_ARCH(i386:x86-64)
ENTRY(phys_startup_64)
jiffies_64 = jiffies;
SECTIONS
{
......
. = __START_KERNEL;
phys_startup_64 = startup_64 - LOAD_OFFSET;
......
}
phys_startup_64 是内核代码段的物理起始地址, startup_64是内核代码段的虚拟起始地址(和_text是同一个地址,内核代码段的起始函数就是startup_64)。
// linux-3.10.1/arch/x86/kernel/head_64.S
.text
__HEAD
.code64
.globl startup_64
startup_64:
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded an identity mapped page table
* for us. These identity mapped page tables map all of the
* kernel pages and possibly all of memory.
*
* %rsi holds a physical pointer to real_mode_data.
*
* We come here either directly from a 64bit bootloader, or from
* arch/x86_64/boot/compressed/head.S.
*
* We only come here initially at boot nothing else comes here.
*
* Since we may be loaded at an address different from what we were
* compiled to run at we first fixup the physical addresses in our page
* tables and then reload them.
*/
/*
* Compute the delta between the address I am compiled to run at and the
* address I am actually running at.
*/
leaq _text(%rip), %rbp
subq $_text - __START_KERNEL_map, %rbp
......
所以对于内核代码段:内核代码段物理地址 + __START_KERNEL_map = 内核代码段虚拟地址。
从 0xffff800000000000 开始就是内核的部分,开始有 8T 的空档区域。从 __PAGE_OFFSET(0xffff880000000000) 开始的 64T 的虚拟地址空间是直接映射区域,也就是说,该区域内的虚拟地址减去 PAGE_OFFSET 就是物理地址。
这块区域把所有物理内存线性映射到以PAGE_OFFSET为起点的虚拟地址。在本文分析的3.10内核中,PAGE_OFFSET是固定值0xffff880000000000;在较新的内核中,固定基址改为0xffff888000000000,并且在KASLR(CONFIG_RANDOMIZE_MEMORY)使能后PAGE_OFFSET是随机地址page_offset_base。
// linux-3.10.1/arch/x86/include/asm/page_64_types.h
#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
比如:
packet套接字的 struct sock 结构体的内核虚拟地址是 0xffff88025ff24800。
// linux-3.10.1/Documentation/x86/x86_64/mm.txt
ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
packet套接字的 struct sock的物理地址 = 0xffff88025ff24800 - 0xffff880000000000 = 0x25ff24800
packet套接字的 struct sock的物理地址就在System RAM:100000000-26dffffff 范围中。
小结:/dev/mem是物理地址空间,而操作系统操作的任何内存都基于虚拟地址。
(1)x86_64可以直接映射64T的物理内存(direct mapping of all phys. memory),足以一一映射当前常见的任意物理内存。
(2)Linux内核对所有物理内存建立一一映射(direct mapping),并对内核映像另外建立一份映射(kernel text mapping)。在这两种映射中,物理地址和虚拟地址之间都是固定偏移。
对于上述的两块内存映射区:
......
ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
......
ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0
......
对于落在这两块映射区域内的内核虚拟地址,可以直接借助 __pa(x) 宏转换为物理地址,无需通过页表转换获得。
反过来,direct mapping 区域对应的物理地址可以直接借助 __va(x) 宏转换为内核虚拟地址。
// linux-3.10.1/arch/x86/include/asm/page.h
#define __pa(x) __phys_addr((unsigned long)(x))
// linux-3.10.1/arch/x86/include/asm/page_64.h
#define __phys_addr(x) __phys_addr_nodebug(x)
extern unsigned long phys_base;
/*
 * Convert a kernel virtual address to a physical address without any
 * sanity checking.
 *
 * Two linear regions are handled with a single subtraction and compare:
 *   - x >= __START_KERNEL_map (kernel text mapping):
 *         result = x - __START_KERNEL_map + phys_base
 *   - x <  __START_KERNEL_map (treated as a direct-mapping address):
 *         result = x - PAGE_OFFSET
 *
 * x: kernel virtual address to translate.
 * Returns the computed physical address; no validation is performed on x.
 */
static inline unsigned long __phys_addr_nodebug(unsigned long x)
{
/* Unsigned subtraction: wraps around when x is below __START_KERNEL_map. */
unsigned long y = x - __START_KERNEL_map;
/* use the carry flag to determine if x was < __START_KERNEL_map */
/*
 * (x > y) is true only when the subtraction did not wrap, i.e. x lies in
 * the kernel text mapping, so only phys_base has to be added.  Otherwise
 * adding (__START_KERNEL_map - PAGE_OFFSET) cancels the earlier
 * subtraction and leaves x - PAGE_OFFSET.
 */
x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET));
return x;
}
// linux-3.10.1/arch/x86/include/asm/page_types.h
#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
// linux-3.10.1/arch/x86/include/asm/page_64_types.h
#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
对于 direct mapping 区域内的虚拟地址:__pa(x) = x - PAGE_OFFSET
相对应有个__va(x)函数:
#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
// linux-3.10.1/arch/x86/include/asm/page_64_types.h
#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
对于 kernel text mapping 区域内的虚拟地址:__pa(x) = x - __START_KERNEL_map + phys_base
如果没有开启KASLR
__pa(x) = x - __START_KERNEL_map
上述两个宏只适用于 direct mapping 和 kernel text mapping(其中 __va(x) 得到的总是 direct mapping 区域内的地址)。其他区域的内核虚拟地址不能使用这两个宏进行转换。
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/page.h>
/*
 * Module entry point: translate one address from the kernel text mapping
 * and one from the direct mapping with __pa() and log both results.
 *
 * Always returns -1.  NOTE(review): a non-zero init result is presumably
 * intentional here so that the load fails after printing and the demo
 * module never stays resident -- confirm before changing.
 */
static int __init pa_va_init(void)
{
	/* Address in the kernel text mapping (0xffffffff80000000 + 16 MB). */
	const unsigned long text_va = 0xffffffff81000000UL;
	/* Sample address in the direct mapping (above 0xffff880000000000). */
	const unsigned long linear_va = 0xffff88006cadc000UL;
	unsigned long pa;

	pa = __pa(text_va);
	printk(" kernel_text_start_phys_address = 0x%lx\n", pa);

	pa = __pa(linear_va);
	printk(" direct_phys_address = 0x%lx\n", pa);

	return -1;
}
/*
 * Module exit hook.  pa_va_init() acquires no resources, so there is
 * nothing to release and the body is intentionally empty.
 */
static void __exit pa_va_exit(void)
{
}
/* Register pa_va_init() as the entry point invoked when the module loads. */
module_init(pa_va_init);
/* Register pa_va_exit() as the handler invoked when the module unloads. */
module_exit(pa_va_exit);
/* Declare the module's license as GPL. */
MODULE_LICENSE("GPL");
Linux 3.10.1
极客时间:趣谈Linux操作系统
https://blog.csdn.net/pwl999/article/details/112055498
https://zhuanlan.zhihu.com/p/99557658
https://fanlv.wiki/2021/07/25/linux-mem/
https://blog.csdn.net/richardysteven/article/details/52629731
https://blog.csdn.net/dog250/article/details/102745181
https://mp.weixin.qq.com/s/TJ8ttDAZfZeUK-fSfRsJ8g