深入浅出内存管理--内存节点(Node)

本文以Linux内核4.9来做介绍。

Node 结构体

内核中的节点是使用一个结构体struct pglist_data来进行管理的,它的组成如下所示。本文只会解释其中几个关键成员,其余成员待遇到时再做解释:

 /*
  * Per-node memory descriptor, also available as pg_data_t. Many members
  * are compiled in only when the config option guarding them is set.
  */
 typedef struct pglist_data {
     /* Descriptors of all zones (ZONE) that belong to this node. */
     struct zone node_zones[MAX_NR_ZONES];
     /*
      * Zone scan orders used by the page allocator; one zonelist per
      * allocation policy, MAX_ZONELISTS in total.
      */
     struct zonelist node_zonelists[MAX_ZONELISTS];
     /* Number of zones present in this node, at most MAX_NR_ZONES. */
     int nr_zones;
 #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
     /* Array of page descriptors for this node. */
     struct page *node_mem_map;
 #ifdef CONFIG_PAGE_EXTENSION
     struct page_ext *node_page_ext;
 #endif
 #endif
 #ifndef CONFIG_NO_BOOTMEM
     struct bootmem_data *bdata;
 #endif
 #ifdef CONFIG_MEMORY_HOTPLUG
     /*
      * Must be held any time you expect node_start_pfn, node_present_pages
      * or node_spanned_pages stay constant.  Holding this will also
      * guarantee that any pfn_valid() stays that way.
      *
      * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
      * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG.
      *
      * Nests above zone->lock and zone->span_seqlock
      */
     spinlock_t node_size_lock;
 #endif
     /* Starting page frame of the node's physical pages. */
     unsigned long node_start_pfn;
     unsigned long node_present_pages; /* total number of physical pages */
     unsigned long node_spanned_pages; /* total size of physical page
                          range, including holes */
     /* Identifier of this node. */
     int node_id;
     wait_queue_head_t kswapd_wait;
     wait_queue_head_t pfmemalloc_wait;
     struct task_struct *kswapd; /* Protected by
                        mem_hotplug_begin/end() */
     int kswapd_order;
     enum zone_type kswapd_classzone_idx;
 
     int kswapd_failures;        /* Number of 'reclaimed == 0' runs */
 
 #ifdef CONFIG_COMPACTION
     int kcompactd_max_order;
     enum zone_type kcompactd_classzone_idx;
     wait_queue_head_t kcompactd_wait;
     struct task_struct *kcompactd;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
     /* Lock serializing the migrate rate limiting window */
     spinlock_t numabalancing_migrate_lock;
 
     /* Rate limiting time interval */
     unsigned long numabalancing_migrate_next_window;
 
     /* Number of pages migrated during the rate limiting time interval */
     unsigned long numabalancing_migrate_nr_pages;
 #endif
     /*
      * This is a per-node reserve of pages that are not available
      * to userspace allocations.
      */
     unsigned long       totalreserve_pages;
 
 #ifdef CONFIG_NUMA
     /*
      * zone reclaim becomes active if more unmapped pages exist.
      */
     unsigned long       min_unmapped_pages;
     unsigned long       min_slab_pages;
 #endif /* CONFIG_NUMA */
 
     /* Write-intensive fields used by page reclaim */
     ZONE_PADDING(_pad1_)
     spinlock_t      lru_lock;
 
 #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
     /*
      * If memory initialisation on large machines is deferred then this
      * is the first PFN that needs to be initialised.
      */
     unsigned long first_deferred_pfn;
     /* Number of non-deferred pages */
     unsigned long static_init_pgcnt;
 #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
     spinlock_t split_queue_lock;
     struct list_head split_queue;
     unsigned long split_queue_len;
 #endif
 
     /* Fields commonly accessed by the page reclaim scanner */
     struct lruvec       lruvec;
     /*
      * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
      * this node's LRU.  Maintained by the pageout code.
      */
     unsigned int inactive_ratio;
 
     unsigned long       flags;
 
     ZONE_PADDING(_pad2_)
 
     /* Per-node vmstats */
     struct per_cpu_nodestat __percpu *per_cpu_nodestats;
     atomic_long_t       vm_stat[NR_VM_NODE_STAT_ITEMS];
 } pg_data_t;

  • node_zones[MAX_NR_ZONES];
    该节点中所有管理区(ZONE)的描述符数组。
  • node_zonelists[MAX_ZONELISTS];
    页分配器使用的结构体数组,页分配器会根据不同的GFP申请标志来按照不同的顺序扫描对应节点中的ZONE,而该结构体就是用于定制不同的顺序。
 /*
  * Kinds of zonelist kept per node. MAX_ZONELISTS is the number of kinds
  * and is the size of node_zonelists[] in struct pglist_data.
  */
 enum {
    ZONELIST_FALLBACK,  /* zonelist with fallback */
#ifdef CONFIG_NUMA
    /*
     * The NUMA zonelists are doubled because we need zonelists that
     * restrict the allocations to a single node for __GFP_THISNODE.
     */
    ZONELIST_NOFALLBACK,    /* zonelist without fallback (__GFP_THISNODE) */
#endif
    MAX_ZONELISTS
};

如上所示,支持的分配方式有两种:ZONELIST_FALLBACK和ZONELIST_NOFALLBACK(后者仅在开启CONFIG_NUMA时才存在)。不同的分配方式下,ZONE的优先顺序可能不同,这个数组为每种策略记录下对应的优先顺序,这就是该成员存在的意义。

  • nr_zones
    节点中存在的管理区数目,最大为MAX_NR_ZONES。
  • node_mem_map
    节点中页描述符数组。
  • node_id
    节点的id。
  • node_start_pfn
    节点中物理页的起始页框。

系统中Node的定义

我们以ARM64平台为例,它对系统中所有Node的定义:

 /*
  * Pointers to the node descriptors, one slot per possible node id.
  * The slots are filled in during boot by setup_node_data().
  */
 struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
 EXPORT_SYMBOL(node_data);
 
 
 /*
  * One entry per CPU, every entry initialised to NUMA_NO_NODE.
  * NOTE(review): presumably maps a CPU index to its node id - confirm
  * against the code that fills it in.
  */
 static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
 

这里定义了一个结构体指针数组,并作为全局变量来使用,这个数组的内容是需要系统启动时进行初始化和填充的:

/**
 * setup_node_data - allocate and fill in the pg_data_t of one node
 * @nid: id of the node whose descriptor is being set up
 * @start_pfn: first page frame number of the node
 * @end_pfn: page frame number just past the end of the node
 *
 * Allocates a cache-line-rounded pg_data_t through
 * memblock_alloc_try_nid(), publishes it in node_data[], zeroes it and
 * records the node id, start pfn and spanned page count.
 */
static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
{
    const size_t pgdat_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
    u64 pgdat_pa;
    void *pgdat;
    int home_nid;

    /* A node without any page frames is logged with an empty range. */
    if (start_pfn >= end_pfn) {
        pr_info("Initmem setup node %d []\n", nid);
    } else {
        pr_info("Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
            start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
    }

    pgdat_pa = memblock_alloc_try_nid(pgdat_size, SMP_CACHE_BYTES, nid);
    pgdat = __va(pgdat_pa);

    pr_info("NODE_DATA [mem %#010Lx-%#010Lx]\n",
        pgdat_pa, pgdat_pa + pgdat_size - 1);

    /* Report when the descriptor's memory sits on a different node. */
    home_nid = early_pfn_to_nid(pgdat_pa >> PAGE_SHIFT);
    if (home_nid != nid) {
        pr_info("NODE_DATA(%d) on node %d\n", nid, home_nid);
    }

    /* Publish the descriptor first; NODE_DATA(nid) is used from here on. */
    node_data[nid] = pgdat;
    memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
    NODE_DATA(nid)->node_id = nid;
    NODE_DATA(nid)->node_start_pfn = start_pfn;
    NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;
}

setup_node_data是在如下函数中被调用的:

/*
 * Check that every memblock memory region carries a valid node id, then
 * set up the descriptor of each node in numa_nodes_parsed and mark it
 * online.
 *
 * Returns 0 on success, or -EINVAL when a region has an invalid node id.
 */
static int __init numa_register_nodes(void)
{
    struct memblock_region *region;
    int nid;

    /* Reject the configuration as soon as one region has a bad nid. */
    for_each_memblock(memory, region) {
        if (region->nid != NUMA_NO_NODE && region->nid < MAX_NUMNODES)
            continue;

        pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
            region->nid, region->base,
            region->base + region->size - 1);
        return -EINVAL;
    }

    /* Build the descriptor of every parsed node and bring it online. */
    for_each_node_mask(nid, numa_nodes_parsed) {
        unsigned long range_start, range_end;

        get_pfn_range_for_nid(nid, &range_start, &range_end);
        setup_node_data(nid, range_start, range_end);
        node_set_online(nid);
    }

    /* The parsed nodes become the set of possible nodes. */
    node_possible_map = numa_nodes_parsed;

    return 0;
}

进一步跟下去会发现它是在void __init arm64_numa_init(void)里面进行层层调用下来的。具体我们不做分析了。

单一节点

对于单一节点的系统来说,系统中只有一个node描述符,定义如下:

#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
 * The one node descriptor defined when CONFIG_NEED_MULTIPLE_NODES is not
 * set. Its bdata member points at the first bootmem_node_data entry.
 */
struct pglist_data __refdata contig_page_data = {
    .bdata = &bootmem_node_data[0]
};
EXPORT_SYMBOL(contig_page_data);
#endif

你可能感兴趣的:(内核笔记,深入浅出内存管理)