Linux把物理内存划分为三个层次来管理
层次 |
描述 |
节点(Node) |
为了支持NUMA模型而引入的概念,node的划分主要根据访问速度,一个node中所有内存对于某个cpu来说访问速度相同,对于SMP来说,每个cpu都有本地内存,本地cpu访问速度较快,但是其他cpu却访问较慢,每个node用pg_data_t结构体来描述 |
管理区(Zone) |
每个节点node根据用途不同被划分为多个zone, 用于表示不同范围的内存, 内核可以使用不同的映射方式映射物理内存,每个zone用struct zone结构体来描述 |
页框(Page) |
内存被细分为多个页面帧, 页框是最基本的页面分配的单位,每个页框用struct page结构体来描述 |
初始化node数据结构
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
unsigned long start_pfn, end_pfn;
int i, nid;
/*
Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
sizeof(arch_zone_lowest_possible_pfn));
memset(arch_zone_highest_possible_pfn, 0,
sizeof(arch_zone_highest_possible_pfn));
arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
//确定每个zone的物理内存页框范围
arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
for (i = 1; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
arch_zone_lowest_possible_pfn[i] =
arch_zone_highest_possible_pfn[i-1];
arch_zone_highest_possible_pfn[i] =
max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
}
arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;
/* Find the PFNs that ZONE_MOVABLE begins at in each node */
memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
find_zone_movable_pfns_for_nodes(zone_movable_pfn);
/* Print out the zone ranges */
printk("Zone PFN ranges:\n");
for (i = 0; i < MAX_NR_ZONES; i++) {
if (i == ZONE_MOVABLE)
continue;
printk(" %-8s ", zone_names[i]);
if (arch_zone_lowest_possible_pfn[i] ==
arch_zone_highest_possible_pfn[i])
printk("empty\n");
else
printk("%0#10lx -> %0#10lx\n",
arch_zone_lowest_possible_pfn[i],
arch_zone_highest_possible_pfn[i]);
}
/* Print out the PFNs ZONE_MOVABLE begins at in each node */
printk("Movable zone start PFN for each node\n");
for (i = 0; i < MAX_NUMNODES; i++) {
if (zone_movable_pfn[i])
printk(" Node %d: %lu\n", i, zone_movable_pfn[i]);
}
/* Print out the early_node_map[] */
printk("Early memory PFN ranges\n");
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
printk(" %3d: %0#10lx -> %0#10lx\n", nid, start_pfn, end_pfn);
/*
Initialise every node */
mminit_verify_pageflags_layout();
setup_nr_node_ids();
for_each_online_node(nid) {
//初始化每个node
pg_data_t *pgdat = NODE_DATA(nid);
free_area_init_node(nid, NULL,
//nid为node id
find_min_pfn_for_node(nid), NULL);
/* Any memory on that node */
if (pgdat->node_present_pages)
node_set_state(nid, N_HIGH_MEMORY);
check_for_regular_memory(pgdat);
}
}
只有每个node中物理页框的分布和空洞情况是通过CONFIG_HAVE_MEMBLOCK_NODE_MAP来描述的,这里不详细介绍。
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
unsigned long node_start_pfn, unsigned long *zholes_size)
{
pg_data_t *pgdat = NODE_DATA(nid);
pgdat->node_id = nid;
//nid为node id
pgdat->node_start_pfn = node_start_pfn;
//该node 起始页框
calculate_node_totalpages(pgdat, zones_size, zholes_size);
alloc_node_mem_map(pgdat);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
nid, (unsigned long)pgdat,
(unsigned long)pgdat->node_mem_map);
#endif
free_area_init_core(pgdat, zones_size, zholes_size);
}
static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
{
unsigned long realtotalpages, totalpages = 0;
enum zone_type i;
for (i = 0; i < MAX_NR_ZONES; i++)
totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
zones_size);
pgdat->node_spanned_pages = totalpages;
//该node中包含的页框数,包括空洞页框
realtotalpages = totalpages;
for (i = 0; i < MAX_NR_ZONES; i++)
realtotalpages -=
zone_absent_pages_in_node(pgdat->node_id, i,
zholes_size);
pgdat->node_present_pages = realtotalpages;
//该node中实际的页框数,不包括空洞页框
printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
realtotalpages);
}
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
//分配页描述符数组
{
/* Skip empty nodes */
if (!pgdat->node_spanned_pages)
return;
#ifdef CONFIG_FLAT_NODE_MEM_MAP
//FLAT内存模型,指node中没有空洞,即node中内存连续
/* ia64 gets its own node_mem_map, before this, without bootmem */
if (!pgdat->node_mem_map) {
unsigned long size, start, end;
struct page *map;
/*
* The zone's endpoints aren't required to be MAX_ORDER
* aligned but the node_mem_map endpoints must be in order
* for the buddy allocator to function correctly.
*/
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
end = pgdat->node_start_pfn + pgdat->
node_spanned_pages;
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(
struct page);
//该node包含的页框数,包括空洞
map = alloc_remap(pgdat->node_id, size);
//该函数一般返回NULL
if (!map)
map = alloc_bootmem_node_nopanic(pgdat, size);
//从bootmem分配页框描述符数组
pgdat->
node_mem_map = map + (pgdat->node_start_pfn - start);
//指向该node包含的所有页框的页描述符数组
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
//系统只包含一个node,即UMA
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
*/
if (pgdat == NODE_DATA(0)) {
mem_map = NODE_DATA(0)->node_mem_map;
//struct page *mem_map; 对于UMA来说
mem_map存放页描述符数组指针
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET);
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
}
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
//初始化node中的zone
unsigned long *zones_size, unsigned long *zholes_size)
{
enum zone_type j;
int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
pgdat_resize_init(pgdat);
pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
pgdat->kswapd_max_order = 0;
pgdat_page_cgroup_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
//初始化每个zone
struct zone *zone = pgdat->
node_zones + j;
unsigned long size, realsize, memmap_pages;
enum lru_list lru;
size = zone_spanned_pages_in_node(nid, j, zones_size);
realsize = size - zone_absent_pages_in_node(nid, j,
zholes_size);
/*
* Adjust realsize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
memmap_pages =
PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
if (realsize >= memmap_pages) {
realsize -= memmap_pages;
if (memmap_pages)
printk(KERN_DEBUG
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
printk(KERN_WARNING
" %s zone: %lu pages exceeds realsize %lu\n",
zone_names[j], memmap_pages, realsize);
/* Account for reserved pages */
if (j == 0 && realsize > dma_reserve) {
realsize -= dma_reserve;
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
zone_names[0], dma_reserve);
}
if (!is_highmem_idx(j))
nr_kernel_pages += realsize;
nr_all_pages += realsize;
zone->spanned_pages = size;
//zone包括的页框数,包括空洞
zone->present_pages = realsize;
//zone存在的页框数,剔除空洞
#ifdef CONFIG_NUMA
zone->node = nid;
zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
/ 100;
zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
for_each_lru(lru)
INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
zone->reclaim_stat.recent_rotated[0] = 0;
zone->reclaim_stat.recent_rotated[1] = 0;
zone->reclaim_stat.recent_scanned[0] = 0;
zone->reclaim_stat.recent_scanned[1] = 0;
zap_zone_vm_stats(zone);
zone->flags = 0;
if (!size)
continue;
set_pageblock_order(pageblock_default_order());
setup_usemap(pgdat, zone, size);
ret = init_currently_empty_zone(zone, zone_start_pfn,
size, MEMMAP_EARLY);
BUG_ON(ret);
memmap_init(size, nid, j, zone_start_pfn);
zone_start_pfn += size;
}
}
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context)
{
struct page *page;
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
struct zone *z;
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
z = &NODE_DATA(nid)->node_zones[zone];
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s
* handed to this function. They do not
* exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
if (!early_pfn_valid(pfn))
continue;
if (!early_pfn_in_nid(pfn, nid))
continue;
}
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
//设置页描述符属于哪个node和zone
mminit_verify_page_links(page, zone, nid, pfn);
init_page_count(page);
//初始的count设置为1
reset_page_mapcount(page);
SetPageReserved(page);
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
* to reserve their blocks rather than leaking throughout
* the address space during boot when many long-lived
* kernel allocations are made. Later some blocks near
* the start are marked MIGRATE_RESERVE by
* setup_zone_migrate_reserve()
*
* bitmap is created for zone's valid pfn range. but memmap
* can be created for invalid pages (for alignment)
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
*/
if ((z->zone_start_pfn <= pfn)
&& (pfn < z->zone_start_pfn + z->spanned_pages)
&& !(pfn & (pageblock_nr_pages - 1)))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
if (!is_highmem_idx(zone))
set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}
}
#if defined(CONFIG_FLATMEM)
#define __pfn_to_page(pfn) (mem_map + ((pfn) - ARCH_PFN_OFFSET))
#endif
static inline void set_page_links(struct page *page, enum zone_type zone,
unsigned long node, unsigned long pfn)
{
set_page_zone(page, zone);
set_page_node(page, node);
#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
set_page_section(page, pfn_to_section_nr(pfn));
#endif
}
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}
static inline void set_page_node(struct page *page, unsigned long node)
{
page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}
static inline void init_page_count(struct page *page)
{
atomic_set(&page->_count, 1);
}
static inline void reset_page_mapcount(struct page *page)
{
atomic_set(&(page)->_mapcount, -1);
}
SetPageReserved的定义为:
#define SETPAGEFLAG(uname, lname) \
static inline void SetPage##uname(struct page *page) \
{ set_bit(PG_##lname, &page->flags); }