This article is written for learning and sharing. It draws on Understanding the Linux Kernel, Understanding the Linux Virtual Memory Manager, and related blog posts on memory management.
In a NUMA architecture, memory is divided into multiple regions (banks, also called "clusters"), and the time needed to access a cluster varies with its "distance" from the processor. For example, one cluster of memory may be assigned to each processor, or a cluster that sits close to a device card, and is therefore well suited for DMA, may be assigned to that device. For each CPU, the kernel tries to minimize the number of accesses to the more costly (more distant) clusters.
Each cluster is treated as a node. In Linux this concept is embodied by struct pglist_data, even on UMA architectures (where there is just a single node).
Every node in memory is described by a pg_data_t, which is a typedef for struct pglist_data. When allocating a page, Linux follows a node-local allocation policy and allocates from the node closest to the running CPU. struct pglist_data is defined in include/linux/mmzone.h:
typedef struct pglist_data {
    struct zone node_zones[MAX_NR_ZONES];           /* the zones of this node */
    struct zonelist node_zonelists[MAX_ZONELISTS];  /* fallback zone lists, see below */
    int nr_zones;                                   /* number of populated zones */
    struct page *node_mem_map;                      /* page descriptors of this node */
    struct bootmem_data *bdata;                     /* boot-time allocator state */
    unsigned long node_start_pfn;                   /* first page frame number of the node */
    unsigned long node_present_pages; /* total number of physical pages */
    unsigned long node_spanned_pages; /* total size of physical page range, including holes */
    int node_id;                                    /* node identifier (nid) */
    wait_queue_head_t kswapd_wait;                  /* kswapd sleeps on this queue */
    wait_queue_head_t pfmemalloc_wait;              /* throttled direct reclaimers wait here */
    struct task_struct *kswapd;     /* Protected by mem_hotplug_begin/end() */
    int kswapd_max_order;                           /* largest order kswapd reclaims for */
    enum zone_type classzone_idx;                   /* highest zone kswapd balances */
} pg_data_t;
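Before examining the individual fields, here is a minimal sketch of how a pg_data_t is typically consumed. It uses the real for_each_online_node()/NODE_DATA() helpers; dump_node_ranges() itself is a name invented for this example:
#include <linux/mmzone.h>
#include <linux/nodemask.h>
#include <linux/printk.h>

/* Print each online node's id, PFN range and usable page count. */
static void dump_node_ranges(void)
{
    int nid;

    for_each_online_node(nid) {
        pg_data_t *pgdat = NODE_DATA(nid);

        pr_info("node %d: pfn [%lu, %lu), present %lu pages\n",
            pgdat->node_id, pgdat->node_start_pfn,
            pgdat->node_start_pfn + pgdat->node_spanned_pages,
            pgdat->node_present_pages);
    }
}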
The key fields are described briefly below.
zonelist is the list of zones consulted for memory allocation:
struct zonelist {
    /* NULL, or a pointer to the zlcache below (NUMA only) */
    struct zonelist_cache *zlcache_ptr;         // NULL or &zlcache
    /* all candidate zones in fallback order: when the preferred zone has
     * no free memory, allocation falls back to the later entries */
    struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
#ifdef CONFIG_NUMA
    /* optimization: a bitmap recording which zones currently have no
     * memory available */
    struct zonelist_cache zlcache;              // optional ...
#endif
};
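To show how this fallback order is consumed, here is a rough sketch built on the real for_each_zone_zonelist() iterator from include/linux/mmzone.h; first_usable_zone() is a name invented for this example:
#include <linux/mmzone.h>

/* Return the first zone, in fallback order, whose type does not exceed
 * high_zoneidx. The real allocator additionally applies watermark and
 * cpuset checks before accepting a zone; this sketch takes the first. */
static struct zone *first_usable_zone(struct zonelist *zonelist,
                      enum zone_type high_zoneidx)
{
    struct zoneref *z;
    struct zone *zone;

    for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
        return zone;

    return NULL;
}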
zonelist_cache caches per-zone state for a node's zonelist and exists purely as a performance optimization:
struct zonelist_cache {
    /* node id that each zone in _zonerefs[] belongs to */
    unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];      /* zone->nid */
    /* one bit per zone, set if the zone was recently found full */
    DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);  /* zone full? */
    /* when the fullzones bitmap was last cleared (in jiffies) */
    unsigned long last_full_zap;                        /* when last zap'd (jiffies) */
};
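A rough sketch of how the allocator consults this cache during a zonelist scan, loosely modeled on zlc_zone_worth_trying() in mm/page_alloc.c (the function name below is illustrative):
#include <linux/bitops.h>

/* Decide whether zone number i of the zonelist is worth scanning: a set
 * bit in fullzones means the zone was recently found full. The real code
 * re-zeroes the bitmap about once per second, tracked by last_full_zap,
 * so stale "full" verdicts expire. */
static bool zone_worth_trying(struct zonelist_cache *zlc, int i)
{
    return !test_bit(i, zlc->fullzones);
}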
zoneref is a reference to a single zone:
struct zoneref {
    /* pointer to the actual zone */
    struct zone *zone;      /* Pointer to actual zone */
    /* index of that zone within its node's node_zones[],
     * i.e. zone_idx(zoneref->zone) */
    int zone_idx;           /* zone_idx(zoneref->zone) */
};
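The kernel reads a zoneref through two small inline helpers; in the kernel versions this walkthrough follows, include/linux/mmzone.h defines them essentially as:
static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
    return zoneref->zone;
}

static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
    return zoneref->zone_idx;
}
Caching zone_idx in the zoneref lets the zonelist iterator compare zone types without dereferencing the zone pointer, which keeps the scanning loop cheap.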
/*
* node_bootmem_map is a map pointer - the bits represent all physical
* memory pages (including holes) on the node.
*/
typedef struct bootmem_data {
    /* first (lowest) page frame number of the node */
    unsigned long node_min_pfn;
    /* end of the low memory directly managed by the boot allocator */
    unsigned long node_low_pfn;
    /* bitmap with one bit per page; a set bit marks the page reserved */
    void *node_bootmem_map;
    /* offset of the end of the last allocation within its page, used to
     * merge small adjacent allocations */
    unsigned long last_end_off;
    /* bitmap index hint at which to start the next search */
    unsigned long hint_idx;
    /* entry in the global list of bootmem_data, sorted by node_min_pfn */
    struct list_head list;
} bootmem_data_t;
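To make the bitmap semantics concrete, here is a minimal sketch of reserving a PFN range, assuming one bit per page with a set bit meaning "reserved", as node_bootmem_map uses; mark_reserved() is a name invented for this example (the real logic lives in mm/bootmem.c):
#include <linux/bitops.h>

/* Set the bit for every page in [start_pfn, end_pfn); bit
 * (pfn - node_min_pfn) of the bitmap covers page frame pfn. */
static void mark_reserved(unsigned long *map, unsigned long node_min_pfn,
              unsigned long start_pfn, unsigned long end_pfn)
{
    unsigned long pfn;

    for (pfn = start_pfn; pfn < end_pfn; pfn++) {
        unsigned long idx = pfn - node_min_pfn;

        map[idx / BITS_PER_LONG] |= 1UL << (idx % BITS_PER_LONG);
    }
}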
This article walks through node initialization on x86. The call chain is:
setup_arch()->paging_init()->zone_sizes_init()->free_area_init_nodes(). free_area_init_nodes() then calls free_area_init_node() to initialize each node; the code is as follows:
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
        unsigned long node_start_pfn, unsigned long *zholes_size)
{
    pg_data_t *pgdat = NODE_DATA(nid);
    unsigned long start_pfn = 0;
    unsigned long end_pfn = 0;

    /* pg_data_t should be reset to zero when it's allocated */
    WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);

    reset_deferred_meminit(pgdat);
    /* record the node id and the node's starting pfn */
    pgdat->node_id = nid;
    pgdat->node_start_pfn = node_start_pfn;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
    get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
    pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
        (u64)start_pfn << PAGE_SHIFT,
        end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
#endif
    /*
     * compute the node's total and per-zone page counts,
     * including the holes
     */
    calculate_node_totalpages(pgdat, start_pfn, end_pfn,
                  zones_size, zholes_size);
    /* set up the node's node_mem_map */
    alloc_node_mem_map(pgdat);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
    printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
        nid, (unsigned long)pgdat,
        (unsigned long)pgdat->node_mem_map);
#endif
    /* initialize the zones: set up each zone's free_area lists and mark
     * all pages as reserved (not yet usable) */
    free_area_init_core(pgdat);
}
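calculate_node_totalpages() fills in the two counters seen in pg_data_t earlier. Conceptually it amounts to the following simplified restatement (not the actual function body; absent_pages_in_range() is the real helper that counts holes via memblock, and the real function also computes per-zone counts):
/* Conceptual sketch: spanned covers the whole PFN range including
 * holes, present excludes them. */
static void node_totalpages_sketch(pg_data_t *pgdat,
                   unsigned long start_pfn, unsigned long end_pfn)
{
    pgdat->node_spanned_pages = end_pfn - start_pfn;
    pgdat->node_present_pages = pgdat->node_spanned_pages -
        absent_pages_in_range(start_pfn, end_pfn);
}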
Initializing a node initializes both mem_map and every zone of the node (zone initialization is covered in [Linux Kernel Study Notes II] Memory Management - Zones).
mem_map is initialized along two paths: free_area_init_node()->alloc_node_mem_map() and free_area_init_node()->free_area_init_core()->memmap_init().
alloc_node_mem_map() sets up the node-local page descriptor array, pg_data_t->node_mem_map. On a kernel built without multiple-node support (UMA), the global mem_map is simply node 0's node_mem_map; on NUMA kernels each node keeps its own node_mem_map, and the node's first page descriptor sits at some offset inside the map allocated for that node.
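This offset bookkeeping is exactly what the flat memory model's pfn_to_page() relies on; under CONFIG_FLATMEM, include/asm-generic/memory_model.h defines the conversions essentially as:
#define __pfn_to_page(pfn)    (mem_map + ((pfn) - ARCH_PFN_OFFSET))
#define __page_to_pfn(page)    ((unsigned long)((page) - mem_map) + \
                 ARCH_PFN_OFFSET)
alloc_node_mem_map() itself is shown below: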
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
{
    /* Skip empty nodes */
    if (!pgdat->node_spanned_pages)
        return;
#ifdef CONFIG_FLAT_NODE_MEM_MAP
    /* ia64 gets its own node_mem_map, before this, without bootmem */
    if (!pgdat->node_mem_map) {
        unsigned long size, start, end;
        struct page *map;

        /*
         * The zone's endpoints aren't required to be MAX_ORDER
         * aligned but the node_mem_map endpoints must be in order
         * for the buddy allocator to function correctly.
         */
        start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
        end = pgdat_end_pfn(pgdat);
        end = ALIGN(end, MAX_ORDER_NR_PAGES);
        size = (end - start) * sizeof(struct page);
        map = alloc_remap(pgdat->node_id, size);
        if (!map)
            map = memblock_virt_alloc_node_nopanic(size,
                                pgdat->node_id);
        /* point node_mem_map at the descriptor of the node's first
         * actual page (node_start_pfn may exceed the aligned start) */
        pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
    }
#ifndef CONFIG_NEED_MULTIPLE_NODES
    /*
     * With no DISCONTIG, the global mem_map is just set as node 0's
     */
    /* on single-node builds the global mem_map is node 0's map */
    if (pgdat == NODE_DATA(0)) {
        mem_map = NODE_DATA(0)->node_mem_map;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
        if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
            mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
    }
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
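The MAX_ORDER rounding of start and end guarantees that every page of a buddy block has a page descriptor. A quick worked example, assuming the default MAX_ORDER of 11, so MAX_ORDER_NR_PAGES == 1024 == 0x400:
/* node_start_pfn = 0x1234:  start = 0x1234 & ~0x3ff      = 0x1000
 * pgdat_end_pfn  = 0x5678:  end   = ALIGN(0x5678, 0x400) = 0x5800
 * map covers PFNs [0x1000, 0x5800), and
 * node_mem_map = map + (0x1234 - 0x1000)
 * points at the descriptor of the node's first real page. */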
memmap_init() is defined as follows:
#define memmap_init(size, nid, zone, start_pfn) \
    memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
memmap_init_zone() uses pfn_to_page() to look up the page descriptor (struct page) for each page frame number and then initializes that descriptor. The code is as follows:
/*
 * Initially all pages are reserved - free ones are freed
 * up by free_all_bootmem() once the early boot process is
 * done. Non-atomic initialization, single-pass.
 */
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
        unsigned long start_pfn, enum memmap_context context)
{
    struct page *page;
    unsigned long end_pfn = start_pfn + size;
    unsigned long pfn;
    struct zone *z;

    if (highest_memmap_pfn < end_pfn - 1)
        highest_memmap_pfn = end_pfn - 1;

    z = &NODE_DATA(nid)->node_zones[zone];
    for (pfn = start_pfn; pfn < end_pfn; pfn++) {
        /*
         * There can be holes in boot-time mem_map[]s
         * handed to this function. They do not
         * exist on hotplugged memory.
         */
        if (context == MEMMAP_EARLY) {
            if (!early_pfn_valid(pfn))
                continue;
            if (!early_pfn_in_nid(pfn, nid))
                continue;
        }
        page = pfn_to_page(pfn);
        set_page_links(page, zone, nid, pfn);
        mminit_verify_page_links(page, zone, nid, pfn);
        init_page_count(page);
        page_mapcount_reset(page);
        page_cpupid_reset_last(page);
        SetPageReserved(page);
        /*
         * Mark the block movable so that blocks are reserved for
         * movable at startup. This will force kernel allocations
         * to reserve their blocks rather than leaking throughout
         * the address space during boot when many long-lived
         * kernel allocations are made. Later some blocks near
         * the start are marked MIGRATE_RESERVE by
         * setup_zone_migrate_reserve()
         *
         * bitmap is created for zone's valid pfn range. but memmap
         * can be created for invalid pages (for alignment)
         * check here not to call set_pageblock_migratetype() against
         * pfn out of zone.
         */
        if ((z->zone_start_pfn <= pfn)
            && (pfn < zone_end_pfn(z))
            && !(pfn & (pageblock_nr_pages - 1)))
            set_pageblock_migratetype(page, MIGRATE_MOVABLE);

        INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
        /* The shift won't overflow because ZONE_NORMAL is below 4G. */
        if (!is_highmem_idx(zone))
            set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
    }
}
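Among the per-page steps above, set_page_links() is the one that records which zone and node a page belongs to, by packing both into the high bits of page->flags. In this era of the kernel, include/linux/mm.h implements it essentially as follows (the section index is stored only when SECTION_IN_PAGE_FLAGS is defined):
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
    page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
    page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}

static inline void set_page_node(struct page *page, unsigned long node)
{
    page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
    page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}

static inline void set_page_links(struct page *page, enum zone_type zone,
    unsigned long node, unsigned long pfn)
{
    set_page_zone(page, zone);
    set_page_node(page, node);
#ifdef SECTION_IN_PAGE_FLAGS
    set_page_section(page, pfn_to_section_nr(pfn));
#endif
}
This is also why page_zone() and page_to_nid() later cost only a shift and a mask rather than a table lookup.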