pg_data_t数据结构2.6.37

这个数据结构用于非一致内存访问(NUMA)机器,表示比zone更高层次的内存区。

关于NUMA的文章

http://www.ibm.com/developerworks/cn/linux/l-numa/

http://blog.chinaunix.net/uid-7295895-id-3076420.html

NUMA机器有多个节点:在同一个节点内,访问内存的时间是相同的;而访问不同节点的内存时,所需的时间可能不同。

在NUMA机器上,每个NUMA 节点(node)有一个pg_data_t来描述它的内存布局。

对于UMA机器,只有一个节点,这个节点由名为contig_page_data的静态pg_data_t结构来描述。

/*
 * Single-node (UMA) definition: the nid argument is ignored, so every
 * lookup yields the address of the one static contig_page_data descriptor.
 */
#define NODE_DATA(nid) (&contig_page_data)


每个节点(node)又可以分成若干个区(zone),如ZONE_DMA、ZONE_NORMAL、ZONE_HIGHMEM。

每个区有自己的特殊用途,如ZONE_DMA是低端的物理内存,如ISA设备需要用到它;

ZONE_NORMAL的地址可以直接映射到线性地址空间中。

不过对于ARM来说,ZONE_DMA和ZONE_NORMAL应该没有啥区别。


/*

 * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM

 * (mostly NUMA machines?) to denote a higher-level memory zone than the

 * zone denotes.

 *

 * On NUMA machines, each NUMA node would have a pg_data_t to describe

 * its memory layout.

 *

 * Memory statistics and page replacement data structures are maintained on a

 * per-zone basis.

 */

/*
 * Forward declaration: only a pointer to struct bootmem_data (bdata) is
 * stored below, so the full definition is not needed here.
 */
struct bootmem_data;

/*
 * Per-node memory descriptor. One pg_data_t describes the memory layout of
 * one node: its zones, the zone lists used for allocation, the range of
 * page frames it covers, and its kswapd state.
 */
typedef struct pglist_data {

/* The zones belonging to this node. */
struct zone node_zones[MAX_NR_ZONES];

/* Order in which zones are tried when allocating memory. */
struct zonelist node_zonelists[MAX_ZONELISTS];

int nr_zones; /* number of zones in this node */

#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */

/* First entry of this node's struct page array (its mem_map). */
struct page *node_mem_map;

#ifdef CONFIG_CGROUP_MEM_RES_CTLR

struct page_cgroup *node_page_cgroup;

#endif

#endif

#ifndef CONFIG_NO_BOOTMEM

struct bootmem_data *bdata;/* boot-time memory allocator data (presumably for this node -- confirm) */

#endif

#ifdef CONFIG_MEMORY_HOTPLUG

/*

 * Must be held any time you expect node_start_pfn, node_present_pages

 * or node_spanned_pages stay constant.  Holding this will also

 * guarantee that any pfn_valid() stays that way.

 *

 * Nests above zone->lock and zone->size_seqlock.

 */

spinlock_t node_size_lock;

#endif

unsigned long node_start_pfn; /* first page frame number (PFN) of this node */

unsigned long node_present_pages; /* total number of physical pages in this node */

unsigned long node_spanned_pages; /* total size of the physical page
                                     range of this node, including holes */

int node_id;              /* node id (NID) */

wait_queue_head_t kswapd_wait;/* wait queue of this node's kswapdN */

struct task_struct *kswapd;   /* the kswapdN task of this node */

int kswapd_max_order;         /* NOTE(review): left unexplained in the original notes;
                                 presumably the highest allocation order kswapd was
                                 asked to reclaim for -- confirm against mm/vmscan.c */

} pg_data_t;


node_zones:

该节点的区。

zone_type 区类型

/*
 * Types of memory zone within a node. ZONE_DMA, ZONE_DMA32 and ZONE_HIGHMEM
 * exist only when the corresponding CONFIG_* option is defined; ZONE_NORMAL
 * and ZONE_MOVABLE are always present. __MAX_NR_ZONES is the last
 * enumerator, so its value equals the number of zone types compiled in.
 */
enum zone_type {

#ifdef CONFIG_ZONE_DMA

/*

* ZONE_DMA is used when there are devices that are not able

* to do DMA to all of addressable memory (ZONE_NORMAL). Then we

* carve out the portion of memory that is needed for these devices.

* The range is arch specific.

*

* Some examples

*

* Architecture Limit

* ---------------------------

* parisc, ia64, sparc <4G

* s390 <2G

* arm Various

* alpha Unlimited or 0-16MB.

*

* i386, x86_64 and multiple other arches

* <16M.

*/

ZONE_DMA,

#endif

#ifdef CONFIG_ZONE_DMA32

/*

* x86_64 needs two ZONE_DMAs because it supports devices that are

* only able to do DMA to the lower 16M but also 32 bit devices that

* can only do DMA areas below 4G.

*/

ZONE_DMA32,

#endif

/*

* Normal addressable memory is in ZONE_NORMAL. DMA operations can be

* performed on pages in ZONE_NORMAL if the DMA devices support

* transfers to all addressable memory.

*/

ZONE_NORMAL,

#ifdef CONFIG_HIGHMEM

/*

* A memory area that is only addressable by the kernel through

* mapping portions into its own address space. This is for example

* used by i386 to allow the kernel to address the memory beyond

* 900MB. The kernel will set up special mappings (page

* table entries on i386) for each page that the kernel needs to

* access.

*/

ZONE_HIGHMEM,

#endif

ZONE_MOVABLE,

/* Not a zone: terminating count of the zone types defined above. */
__MAX_NR_ZONES

};



node_zonelists:内存分配时,依次尝试各个区的先后顺序。

start_kernel() ==> build_all_zonelists(NULL) ==> build_zonelists()


node_mem_map: 指向struct page数组的第一个页面,即mem_map数组,指向该node所有的页;

free_area_init_node=>alloc_node_mem_map

mem_map = NODE_DATA(0)->node_mem_map;