本文以Linux内核4.9来做介绍。
内核中的节点是使用一个结构体struct pglist_data来进行管理的,它的组成如下所示,本文只会列出几个关键成员,其余成员待遇到时在做解释:
typedef struct pglist_data {
struct zone node_zones[MAX_NR_ZONES];
struct zonelist node_zonelists[MAX_ZONELISTS];
int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
struct page *node_mem_map;
#ifdef CONFIG_PAGE_EXTENSION
struct page_ext *node_page_ext;
#endif
#endif
#ifndef CONFIG_NO_BOOTMEM
struct bootmem_data *bdata;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Must be held any time you expect node_start_pfn, node_present_pages
* or node_spanned_pages stay constant. Holding this will also
* guarantee that any pfn_valid() stays that way.
*
* pgdat_resize_lock() and pgdat_resize_unlock() are provided to
* manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG.
*
* Nests above zone->lock and zone->span_seqlock
*/
spinlock_t node_size_lock;
#endif
unsigned long node_start_pfn;
unsigned long node_present_pages; /* total number of physical pages */
unsigned long node_spanned_pages; /* total size of physical page
range, including holes */
int node_id;
wait_queue_head_t kswapd_wait;
wait_queue_head_t pfmemalloc_wait;
struct task_struct *kswapd; /* Protected by
mem_hotplug_begin/end() */
int kswapd_order;
enum zone_type kswapd_classzone_idx;
int kswapd_failures; /* Number of 'reclaimed == 0' runs */
#ifdef CONFIG_COMPACTION
int kcompactd_max_order;
enum zone_type kcompactd_classzone_idx;
wait_queue_head_t kcompactd_wait;
struct task_struct *kcompactd;
#endif
#ifdef CONFIG_NUMA_BALANCING
/* Lock serializing the migrate rate limiting window */
spinlock_t numabalancing_migrate_lock;
/* Rate limiting time interval */
unsigned long numabalancing_migrate_next_window;
/* Number of pages migrated during the rate limiting time interval */
unsigned long numabalancing_migrate_nr_pages;
#endif
/*
* This is a per-node reserve of pages that are not available
* to userspace allocations.
*/
unsigned long totalreserve_pages;
#ifdef CONFIG_NUMA
/*
* zone reclaim becomes active if more unmapped pages exist.
*/
unsigned long min_unmapped_pages;
unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */
/* Write-intensive fields used by page reclaim */
ZONE_PADDING(_pad1_)
spinlock_t lru_lock;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* If memory initialisation on large machines is deferred then this
* is the first PFN that needs to be initialised.
*/
unsigned long first_deferred_pfn;
/* Number of non-deferred pages */
unsigned long static_init_pgcnt;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
spinlock_t split_queue_lock;
struct list_head split_queue;
unsigned long split_queue_len;
#endif
/* Fields commonly accessed by the page reclaim scanner */
struct lruvec lruvec;
/*
* The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
* this node's LRU. Maintained by the pageout code.
*/
unsigned int inactive_ratio;
unsigned long flags;
ZONE_PADDING(_pad2_)
/* Per-node vmstats */
struct per_cpu_nodestat __percpu *per_cpu_nodestats;
atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS];
} pg_data_t;
enum {
ZONELIST_FALLBACK, /* zonelist with fallback */
#ifdef CONFIG_NUMA
/*
* The NUMA zonelists are doubled because we need zonelists that
* restrict the allocations to a single node for __GFP_THISNODE.
*/
ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */
#endif
MAX_ZONELISTS
};
如上所示支持的分配方式有两种,ZONELIST_FALLBACK和ZONELIST_NOFALLBACK,那么根据不同的分配方式,对于ZONE的优先顺序可能是不同的,这个数组可以记录下来不同策略对应的优先级,所以这就是该成员存在的意义。
我们以ARM64平台为例,它对系统中所有Node的定义:
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);
static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
这里定义了一个结构体指针数组,并作为全局变量来使用,这个数组的内容是需要系统启动时进行初始化和填充的:
/**
* Initialize NODE_DATA for a node on the local memory
*/
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
u64 nd_pa;
void *nd;
int tnid;
if (start_pfn < end_pfn)
pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
else
pr_info("Initmem setup node %d []\n", nid);
nd_pa = memblock_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
nd = __va(nd_pa);
/* report and initialize */
pr_info("NODE_DATA [mem %#010Lx-%#010Lx]\n",
nd_pa, nd_pa + nd_size - 1);
tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
if (tnid != nid)
pr_info("NODE_DATA(%d) on node %d\n", nid, tnid);
node_data[nid] = nd;
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
NODE_DATA(nid)->node_id = nid;
NODE_DATA(nid)->node_start_pfn = start_pfn;
NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
}
setup_node_data是在如下函数中被调用的:
static int __init numa_register_nodes(void)
{
int nid;
struct memblock_region *mblk;
/* Check that valid nid is set to memblks */
for_each_memblock(memory, mblk)
if (mblk->nid == NUMA_NO_NODE || mblk->nid >= MAX_NUMNODES) {
pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
mblk->nid, mblk->base,
mblk->base + mblk->size - 1);
return -EINVAL;
}
/* Finally register nodes. */
for_each_node_mask(nid, numa_nodes_parsed) {
unsigned long start_pfn, end_pfn;
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
setup_node_data(nid, start_pfn, end_pfn);
node_set_online(nid);
}
/* Setup online nodes to actual nodes*/
node_possible_map = numa_nodes_parsed;
return 0;
}
进一步跟下去会发现它是在void __init arm64_numa_init(void)里面进行层层调用下来的。具体我们不做分析了。
对于单一节点的系统来说,系统中只有一个node描述符,定义如下:
#ifndef CONFIG_NEED_MULTIPLE_NODES
struct pglist_data __refdata contig_page_data = {
.bdata = &bootmem_node_data[0]
};
EXPORT_SYMBOL(contig_page_data);
#endif