在前一篇笔记中,已经对节点的概念进行了讲解。而节点又被分为多个管理区(zone),zone用于表示内存中的某个范围。管理区被分为多个类型。
ZONE_DMA:内存首部16MB,即低端范围的物理内存,某些工业标准体系结构(ISA)设备需要用到ZONE_DMA;
ZONE_DMA32:标记了使用32位地址字可寻址, 适合DMA的内存域。
ZONE_NORMAL:16MB~896MB,该部分的内存由内核直接映射到线性地址空间的较高部分;
ZONE_HIGHMEM:896MB~末尾,即高端内存,超出内核线性映射的范围,不能被内核直接映射,内核需要动态建立页表映射后才能访问。
ZONE_MOVABLE:内核定义的一个伪内存域,在防止物理内存碎片的内存迁移(memory migration)机制中使用,供防止物理内存碎片的机制使用。
ZONE_DEVICE:为支持热插拔设备而分配的Non Volatile Memory非易失性内存。
__MAX_NR_ZONES:充当结束标记, 在内核中想要迭代系统中所有内存域, 会用到该常量。
许多内核操作只有通过ZONE_NORMAL才能完成,因此ZONE_NORMAL是影响系统性能最为重要的管理区。管理区的类型定义为枚举类型 zone_type,位于 include/linux/mmzone.h 中:
enum zone_type {
#ifdef CONFIG_ZONE_DMA
/*
* ZONE_DMA is used when there are devices that are not able
* to do DMA to all of addressable memory (ZONE_NORMAL). Then we
* carve out the portion of memory that is needed for these devices.
* The range is arch specific.
*
* Some examples
*
* Architecture Limit
* ---------------------------
* parisc, ia64, sparc <4G
* s390 <2G
* arm Various
* alpha Unlimited or 0-16MB.
*
* i386, x86_64 and multiple other arches
* <16M.
*/
ZONE_DMA,
#endif
#ifdef CONFIG_ZONE_DMA32
/*
* x86_64 needs two ZONE_DMAs because it supports devices that are
* only able to do DMA to the lower 16M but also 32 bit devices that
* can only do DMA areas below 4G.
*/
ZONE_DMA32,
#endif
/*
* Normal addressable memory is in ZONE_NORMAL. DMA operations can be
* performed on pages in ZONE_NORMAL if the DMA devices support
* transfers to all addressable memory.
*/
ZONE_NORMAL,
#ifdef CONFIG_HIGHMEM
/*
* A memory area that is only addressable by the kernel through
* mapping portions into its own address space. This is for example
* used by i386 to allow the kernel to address the memory beyond
* 900MB. The kernel will set up special mappings (page
* table entries on i386) for each page that the kernel needs to
* access.
*/
ZONE_HIGHMEM,
#endif
/*
* Pseudo zone used by the memory-migration / anti-fragmentation
* machinery; it has no fixed architectural address range.
*/
ZONE_MOVABLE,
#ifdef CONFIG_ZONE_DEVICE
/* Device (e.g. non-volatile / hot-pluggable) memory. */
ZONE_DEVICE,
#endif
/* End marker: also the number of zone types compiled in. */
__MAX_NR_ZONES
};
管理区描述符定义为结构体 struct zone,位于 include/linux/mmzone.h 中:
/*
 * Per-zone descriptor. ZONE_PADDING() separates the hottest
 * lock-protected sections onto different cache lines (see the
 * zone_padding definition below).
 */
struct zone {
/* allocator watermarks, indexed by enum zone_watermarks (WMARK_MIN/LOW/HIGH) */
unsigned long watermark[NR_WMARK];
/* per-zone-index page reserves — NOTE(review): presumably protects lower
 * zones from higher-zone allocation pressure; confirm against mmzone.h */
long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_NUMA
/* NUMA node this zone belongs to (set in free_area_init_core()) */
int node;
#endif
/* NOTE(review): inactive/active list ratio — exact semantics not visible here */
unsigned int inactive_ratio;
/* back-pointer to the owning node's pglist_data */
struct pglist_data *zone_pgdat;
/* per-CPU page caches, set up by zone_pcp_init() */
struct per_cpu_pageset __percpu *pageset;
unsigned long dirty_balance_reserve;
#ifndef CONFIG_SPARSEMEM
/* pageblock flags bitmap when sparsemem does not provide one */
unsigned long *pageblock_flags;
#endif
#ifdef CONFIG_NUMA
/* zone-reclaim thresholds, derived from sysctls in free_area_init_core() */
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
#endif
/* first page frame number of the zone */
unsigned long zone_start_pfn;
/* pages handed to the buddy allocator (see free_area_init_core()) */
unsigned long managed_pages;
/* total pages spanned by the zone, including holes */
unsigned long spanned_pages;
/* pages physically present (spanned minus holes) */
unsigned long present_pages;
/* zone name, taken from zone_names[] */
const char *name;
int nr_migrate_reserve_block;
#ifdef CONFIG_MEMORY_ISOLATION
/* count of isolated pageblocks */
unsigned long nr_isolate_pageblock;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/* protects the span fields against concurrent hotplug resizing */
seqlock_t span_seqlock;
#endif
/* hash table of wait queues for tasks waiting on pages in this zone */
wait_queue_head_t *wait_table;
unsigned long wait_table_hash_nr_entries;
unsigned long wait_table_bits;
ZONE_PADDING(_pad1_)
/* buddy-system free lists, one per allocation order */
struct free_area free_area[MAX_ORDER];
/* zone state bits, see enum zone_flags */
unsigned long flags;
/* primary zone lock (kept on its own cache line, away from lru_lock) */
spinlock_t lock;
ZONE_PADDING(_pad2_)
/* protects the LRU state below */
spinlock_t lru_lock;
struct lruvec lruvec;
atomic_long_t inactive_age;
unsigned long percpu_drift_mark;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
/* cached PFNs where the compaction scanners last stopped */
unsigned long compact_cached_free_pfn;
unsigned long compact_cached_migrate_pfn[2];
#endif
#ifdef CONFIG_COMPACTION
/* compaction deferral bookkeeping */
unsigned int compact_considered;
unsigned int compact_defer_shift;
int compact_order_failed;
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
bool compact_blockskip_flush;
#endif
ZONE_PADDING(_pad3_)
/* per-zone VM statistics counters */
atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
下面对struct zone结构体的主要字段进行简要分析:
enum zone_watermarks {
// free memory has reached the minimum limit
WMARK_MIN,
// free memory is getting scarce but not yet critical; avoid taking more
WMARK_LOW,
// plenty of free memory remains, allocations are unconstrained
WMARK_HIGH,
// number of watermarks; sizes zone->watermark[]
NR_WMARK
};
内核对 struct zone 像对 struct page 一样关心数据结构的大小,因此这里的 ZONE_PADDING() 可以理解为用空间换取时间(性能)。在内存管理开发过程中,内核开发者逐渐发现有一些自旋锁竞争会非常厉害,很难获取。像 zone->lock 和 zone->lru_lock 这两个锁有时需要同时获取。因此保证它们位于不同的 cache line 是内核常用的一种优化技巧。ZONE_PADDING 的定义如下:/*
* zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
* So add a wild amount of padding here to ensure that they fall into separate
* cachelines. There are very few zone structures in the machine, so space
* consumption is not a concern here.
*/
#if defined(CONFIG_SMP)
/* Zero-size, cache-line-aligned member: forces the next field of the
 * enclosing struct onto a fresh (internode-aligned) cache line. */
struct zone_padding {
char x[0];
} ____cacheline_internodealigned_in_smp;
#define ZONE_PADDING(name) struct zone_padding name;
#else
/* UP build: no cross-CPU false sharing, padding compiles away entirely. */
#define ZONE_PADDING(name)
#endif
enum zone_flags {
/* Prevents concurrent reclaim: on SMP systems several CPUs might try
 * to reclaim the same zone at the same time. The CPU reclaiming a
 * zone sets this flag, which keeps the other CPUs from reclaiming
 * that zone too.
 */
ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */
/* The zone is on the OOM killer (Out Of Memory killer) zonelist.
 * The OOM killer watches processes whose memory usage is excessive,
 * especially ones growing quickly, and kills them to keep the system
 * from running out of memory.
 */
ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */
/* The zone currently holds many dirty pages. */
ZONE_CONGESTED, /* zone has many dirty pages backed by
 * a congested BDI
 */
/* The most recent page scan found many dirty pages at the LRU tail. */
ZONE_DIRTY, /* reclaim scanning has recently found
 * many dirty file pages at the tail
 * of the LRU.
 */
/* Recent reclaim scanning found many pages under writeback. */
ZONE_WRITEBACK, /* reclaim scanning has recently found
 * many pages under writeback
 */
ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */
};
本文基于X86对管理区初始化进行分析。
管理区初始化时,需要用到最大物理页面帧号 max_pfn,低端内存最大页面数 max_low_pfn两个全局变量
通过 start_kernel() -> setup_arch() -> e820_end_of_ram_pfn() 初始化max_pfn,如下所示:
/* Excerpt from x86 setup_arch(): establishes max_pfn early in boot. */
void __init setup_arch(char **cmdline_p)
{
...
/*
* partially used pages are not usable - thus
* we are rounding upwards:
*/
/* Walk e820.map and find the highest RAM PFN in the system;
the result is capped at MAX_ARCH_PFN (4G without PAE on 32-bit) */
max_pfn = e820_end_of_ram_pfn();
...
}
/* MAX_ARCH_PFN: highest page frame number the architecture can address. */
#ifdef CONFIG_X86_32
/* PAE extends 32-bit x86 physical addressing to 36 bits. */
# ifdef CONFIG_X86_PAE
# define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT))
# else
/* Plain 32-bit physical addresses: PFNs up to 4GiB. */
# define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT))
# endif
#else /* CONFIG_X86_32 */
/* NOTE(review): expansion is unparenthesized; fine in current uses but
 * fragile if combined with other operators at the use site. */
# define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT
#endif
/*
 * Return the highest available page frame number below limit_pfn.
 *
 * Walks the BIOS-provided e820 map (e820.nr_map entries) and tracks the
 * largest end PFN among usable entries, clamped to both limit_pfn and
 * the architecture maximum MAX_ARCH_PFN.
 */
static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
{
	unsigned long highest = 0;
	unsigned long arch_limit = MAX_ARCH_PFN;
	int idx;

	for (idx = 0; idx < e820.nr_map; idx++) {
		struct e820entry *entry = &e820.map[idx];
		unsigned long first, past_end;

		/*
		 * Persistent memory is accounted as ram for purposes of
		 * establishing max_pfn and mem_map.
		 */
		if (entry->type != E820_RAM && entry->type != E820_PRAM)
			continue;

		/* PFN of the first page of this region */
		first = entry->addr >> PAGE_SHIFT;
		/* PFN one past the last page of this region */
		past_end = (entry->addr + entry->size) >> PAGE_SHIFT;

		if (first >= limit_pfn)
			continue;	/* region lies entirely above the limit */
		if (past_end > limit_pfn) {
			/* region straddles the limit: clamp and stop */
			highest = limit_pfn;
			break;
		}
		if (past_end > highest)
			highest = past_end;
	}

	/* never report more than the architecture can address */
	if (highest > arch_limit)
		highest = arch_limit;

	printk(KERN_INFO "e820: last_pfn = %#lx max_arch_pfn = %#lx\n",
	       highest, arch_limit);
	return highest;
}
/* Highest RAM PFN, bounded only by the architecture maximum. */
unsigned long __init e820_end_of_ram_pfn(void)
{
	const unsigned long arch_max = MAX_ARCH_PFN;

	return e820_end_pfn(arch_max);
}
/* Highest RAM PFN within the low 4GiB of physical address space. */
unsigned long __init e820_end_of_low_ram_pfn(void)
{
	const unsigned long pfn_4g = 1UL << (32-PAGE_SHIFT);

	return e820_end_pfn(pfn_4g);
}
通过 start_kernel() -> setup_arch() 初始化 max_low_pfn:32位下经由 find_low_pfn_range(),64位下经由 e820_end_of_low_ram_pfn(),如下所示:
/* Excerpt from x86 setup_arch(): establishes max_low_pfn (and, on 64-bit,
 * high_memory). */
void __init setup_arch(char **cmdline_p)
{
...
#ifdef CONFIG_X86_32
/* max_low_pfn get updated here */
find_low_pfn_range();
#else
check_x2apic();
/* How many end-of-memory variables you have, grandma! */
/* need this before calling reserve_initrd */
/* 64-bit: if RAM extends above 4GiB, low memory ends at the highest
 * RAM PFN below 4GiB; otherwise all RAM counts as low memory. */
if (max_pfn > (1UL<<(32 - PAGE_SHIFT)))
max_low_pfn = e820_end_of_low_ram_pfn();
else
max_low_pfn = max_pfn;
/* first byte past the end of directly-mapped physical memory */
high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
#endif
...
}
/*
* All of RAM fits into lowmem - but if user wants highmem
* artificially via the highmem=x boot parameter then create
* it:
*/
/*
 * All of RAM fits into lowmem - but if user wants highmem
 * artificially via the highmem=x boot parameter then create
 * it:
 */
static void __init lowmem_pfn_init(void)
{
/* max_low_pfn is 0, we already have early_res support */
max_low_pfn = max_pfn;
/* highmem_pages is -1 when "highmem=" was not given at boot;
 * default to no highmem. */
if (highmem_pages == -1)
highmem_pages = 0;
#ifdef CONFIG_HIGHMEM
/* If the requested highmem page count reaches the total page count,
 * print an error and fall back to no highmem. */
if (highmem_pages >= max_pfn) {
printk(KERN_ERR MSG_HIGHMEM_TOO_BIG,
pages_to_mb(highmem_pages), pages_to_mb(max_pfn));
highmem_pages = 0;
}
/* A valid highmem request carves pages out of lowmem. */
if (highmem_pages) {
/* Refuse if lowmem would drop below 64MB. */
if (max_low_pfn - highmem_pages < 64*1024*1024/PAGE_SIZE) {
printk(KERN_ERR MSG_LOWMEM_TOO_SMALL,
pages_to_mb(highmem_pages));
highmem_pages = 0;
}
/* max_low_pfn was initialized to max_pfn above; subtract the
 * pages handed over to highmem. */
max_low_pfn -= highmem_pages;
}
#else
if (highmem_pages)
printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n");
#endif
}
....
/*
* We have more RAM than fits into lowmem - we try to put it into
* highmem, also taking the highmem=x boot parameter into account:
*/
/*
 * We have more RAM than fits into lowmem - we try to put it into
 * highmem, also taking the highmem=x boot parameter into account:
 */
static void __init highmem_pfn_init(void)
{
/* Lowmem is capped at its architectural maximum. */
max_low_pfn = MAXMEM_PFN;
/* If "highmem=" was not set at boot, everything above lowmem
 * becomes highmem. */
if (highmem_pages == -1)
highmem_pages = max_pfn - MAXMEM_PFN;
/* If highmem_pages was set on the command line it may disagree with
 * the detected memory size, so reconcile the two:
 * - if MAXMEM_PFN + highmem_pages is below max_pfn, shrink max_pfn
 *   to the requested total;
 * - if it exceeds max_pfn, the request is too large: warn and drop
 *   the highmem request.
 */
if (highmem_pages + MAXMEM_PFN < max_pfn)
max_pfn = MAXMEM_PFN + highmem_pages;
if (highmem_pages + MAXMEM_PFN > max_pfn) {
printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
pages_to_mb(max_pfn - MAXMEM_PFN),
pages_to_mb(highmem_pages));
highmem_pages = 0;
}
#ifndef CONFIG_HIGHMEM
/* Maximum memory usable is what is directly addressable */
printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
if (max_pfn > MAX_NONPAE_PFN)
printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
else
printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
max_pfn = MAXMEM_PFN;
#else /* !CONFIG_HIGHMEM */
#ifndef CONFIG_HIGHMEM64G
/* Without HIGHMEM64G (PAE), memory above 4G cannot be addressed. */
if (max_pfn > MAX_NONPAE_PFN) {
max_pfn = MAX_NONPAE_PFN;
printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
}
#endif /* !CONFIG_HIGHMEM64G */
#endif /* !CONFIG_HIGHMEM */
}
/*
 * Determine low and high memory ranges.
 *
 * If all physical memory fits below MAXMEM_PFN there is no highmem and
 * lowmem_pfn_init() handles the split; otherwise highmem exists and
 * highmem_pfn_init() is used. Either path may update max_pfn.
 */
void __init find_low_pfn_range(void)
{
	if (max_pfn > MAXMEM_PFN)
		highmem_pfn_init();	/* some RAM must become highmem */
	else
		lowmem_pfn_init();	/* everything fits in lowmem */
}
通过 start_kernel()->setup_arch()->paging_init()->zone_sizes_init() 初始化管理区zone。
/* Fill in the per-zone maximum PFNs and hand them to free_area_init_nodes(). */
void __init zone_sizes_init(void)
{
unsigned long max_zone_pfns[MAX_NR_ZONES];
/* zones not filled in below remain 0, i.e. empty */
memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
#ifdef CONFIG_ZONE_DMA
/* DMA zone ends at MAX_DMA_PFN (16MB on x86), or earlier if RAM is smaller */
max_zone_pfns[ZONE_DMA] = min(MAX_DMA_PFN, max_low_pfn);
#endif
#ifdef CONFIG_ZONE_DMA32
/* DMA32 zone ends at MAX_DMA32_PFN, or earlier if there is less lowmem */
max_zone_pfns[ZONE_DMA32] = min(MAX_DMA32_PFN, max_low_pfn);
#endif
/* directly-mapped (normal) memory ends at max_low_pfn */
max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
/* everything between lowmem and max_pfn is highmem */
max_zone_pfns[ZONE_HIGHMEM] = max_pfn;
#endif
free_area_init_nodes(max_zone_pfns);
}
/**
* free_area_init_nodes - Initialise all pg_data_t and zone data
* @max_zone_pfn: an array of max PFNs for each zone
*
* This will call free_area_init_node() for each active node in the system.
* Using the page ranges provided by memblock_set_node(), the size of each
* zone in each node and their holes is calculated. If the maximum PFN
* between two adjacent zones match, it is assumed that the zone is empty.
* For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
* that arch_max_dma32_pfn has no pages. It is also assumed that a zone
* starts where the previous one ended. For example, ZONE_DMA32 starts
* at arch_max_dma_pfn.
*/
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
unsigned long start_pfn, end_pfn;
int i, nid;
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
sizeof(arch_zone_lowest_possible_pfn));
memset(arch_zone_highest_possible_pfn, 0,
sizeof(arch_zone_highest_possible_pfn));
/* Zone 0 starts at the lowest PFN that has memory. */
arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
/* Each subsequent zone starts where the previous one ended; the max()
 * keeps an empty zone's range well-formed (lowest == highest). */
for (i = 1; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
arch_zone_lowest_possible_pfn[i] =
arch_zone_highest_possible_pfn[i-1];
arch_zone_highest_possible_pfn[i] =
max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
}
/* ZONE_MOVABLE has no fixed architectural range; its per-node start
 * PFNs are computed below. */
arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
find_zone_movable_pfns_for_nodes();
/* Print out the zone ranges */
pr_info("Zone ranges:\n");
for (i = 0; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
pr_info(" %-8s ", zone_names[i]);
if (arch_zone_lowest_possible_pfn[i] ==
arch_zone_highest_possible_pfn[i])
pr_cont("empty\n");
else
pr_cont("[mem %#018Lx-%#018Lx]\n",
(u64)arch_zone_lowest_possible_pfn[i]
<< PAGE_SHIFT,
((u64)arch_zone_highest_possible_pfn[i]
<< PAGE_SHIFT) - 1);
}
/* Print out the PFNs ZONE_MOVABLE begins at in each node */
pr_info("Movable zone start for each node\n");
for (i = 0; i < MAX_NUMNODES; i++) {
if (zone_movable_pfn[i])
pr_info(" Node %d: %#018Lx\n", i,
(u64)zone_movable_pfn[i] << PAGE_SHIFT);
}
/* Print out the early node map */
pr_info("Early memory node ranges\n");
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
((u64)end_pfn << PAGE_SHIFT) - 1);
/* Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
/* Per-node init; free_area_init_core() fills in the zones. */
free_area_init_node(nid, NULL,
find_min_pfn_for_node(nid), NULL);
/* Any memory on that node */
if (pgdat->node_present_pages)
node_set_state(nid, N_MEMORY);
check_for_memory(pgdat, nid);
}
}
free_area_init_nodes()中初始化zone的调用顺序为:free_area_init_nodes()->free_area_init_node()->free_area_init_core()。
free_area_init_node()用于节点的初始化,节点的初始化已经在上文【Linux内核学习笔记一】内存管理-节点 中进行了讲解。
free_area_init_core()用于向每个zone填充相关信息,以下是free_area_init_core()的代码:
/*
* Set up the zone data structures:
* - mark all pages reserved
* - mark all memory queues empty
* - clear the memory bitmaps
*
* NOTE: pgdat should get zeroed by caller.
*/
static void __paginginit free_area_init_core(struct pglist_data *pgdat)
{
enum zone_type j;
int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
// Initialize the node's resize lock and related management state.
pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
spin_lock_init(&pgdat->numabalancing_migrate_lock);
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies;
#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_ext_init(pgdat);
// Walk every zone in this node and fill in its fields.
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
// size: pages spanned (incl. holes); realsize: pages actually present.
size = zone->spanned_pages;
realsize = freesize = zone->present_pages;
/*
* Adjust freesize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
memmap_pages = calc_memmap_size(size, realsize);
if (!is_highmem_idx(j)) {
if (freesize >= memmap_pages) {
freesize -= memmap_pages;
if (memmap_pages)
printk(KERN_DEBUG
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
printk(KERN_WARNING
" %s zone: %lu pages exceeds freesize %lu\n",
zone_names[j], memmap_pages, freesize);
}
/* Account for reserved pages */
// j == 0 is the first (DMA) zone: subtract the DMA reserve from it.
if (j == 0 && freesize > dma_reserve) {
freesize -= dma_reserve;
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
zone_names[0], dma_reserve);
}
if (!is_highmem_idx(j))
nr_kernel_pages += freesize;
/* Charge for highmem memmap if there are enough kernel pages */
else if (nr_kernel_pages > memmap_pages * 2)
nr_kernel_pages -= memmap_pages;
nr_all_pages += freesize;
/*
* Set an approximate value for lowmem here, it will be adjusted
* when the bootmem allocator frees pages into the buddy system.
* And all highmem pages will be managed by the buddy system.
*/
zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
// Zone-reclaim thresholds derived from the corresponding sysctls.
zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
/ 100;
zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
#endif
// Basic zone bookkeeping: name, locks, back-pointer to the node.
zone->name = zone_names[j];
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
// Initialize the zone's per-CPU page caches.
zone_pcp_init(zone);
/* For bootup, initialized properly in watermark setup */
mod_zone_page_state(zone, NR_ALLOC_BATCH, zone->managed_pages);
lruvec_init(&zone->lruvec);
// Nothing more to do for an empty zone.
if (!size)
continue;
set_pageblock_order();
setup_usemap(pgdat, zone, zone_start_pfn, size);
// Initialize the zone's free_area (buddy) structures.
ret = init_currently_empty_zone(zone, zone_start_pfn,
size, MEMMAP_EARLY);
BUG_ON(ret);
// Set the PG_reserved bit on every page frame in the zone.
memmap_init(size, nid, j, zone_start_pfn);
// The next zone starts where this one ends.
zone_start_pfn += size;
}
}