节点和管理区是内存管理中所涉及的重要概念,其数据结构在前文《linux物理内存概述》中已经介绍,现在让我们来看看linux是如何完成节点和管理区的初始化的。
内核首先通过setup_arch()-->paging_init()-->zone_sizes_init()来初始化节点和管理区的一些数据项
/*
 * Collect the upper page-frame-number (PFN) boundary of each zone and
 * pass the table to free_area_init_nodes() for the rest of the setup.
 */
static void __init zone_sizes_init(void)
{
	/* Slots that are not listed explicitly stay zero. */
	unsigned long max_zone_pfns[MAX_NR_ZONES] = {
		[ZONE_DMA]	= virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT,
		[ZONE_NORMAL]	= max_low_pfn,
#ifdef CONFIG_HIGHMEM
		[ZONE_HIGHMEM]	= highend_pfn,
#endif
	};

	free_area_init_nodes(max_zone_pfns);
}
在获取了三个管理区各自的最大页框号后,通过free_area_init_nodes()来完成后续工作
/*
 * Record the PFN range of every zone, print the resulting layout and
 * then initialise each online node.
 *
 * @max_zone_pfn: table indexed by zone holding the highest PFN of each zone.
 */
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
	unsigned long nid;
	int idx;

	/* Initialisation assumes early_node_map is sorted. */
	sort_node_map();

	/* Start from empty zone-boundary tables. */
	memset(arch_zone_lowest_possible_pfn, 0,
				sizeof(arch_zone_lowest_possible_pfn));
	memset(arch_zone_highest_possible_pfn, 0,
				sizeof(arch_zone_highest_possible_pfn));

	/* The first zone starts at the lowest PFN of any active region. */
	arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
	arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];

	/*
	 * Every further zone begins where its predecessor ended; its end is
	 * never allowed to fall below its own start.  ZONE_MOVABLE is left
	 * out here and reset explicitly right after the loop.
	 */
	for (idx = 1; idx < MAX_NR_ZONES; idx++) {
		if (idx != ZONE_MOVABLE) {
			arch_zone_lowest_possible_pfn[idx] =
				arch_zone_highest_possible_pfn[idx - 1];
			arch_zone_highest_possible_pfn[idx] =
				max(max_zone_pfn[idx],
				    arch_zone_lowest_possible_pfn[idx]);
		}
	}
	arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
	arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;

	/* Find the PFN at which ZONE_MOVABLE begins in each node. */
	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
	find_zone_movable_pfns_for_nodes(zone_movable_pfn);

	/* Report the zone ranges (ZONE_MOVABLE is reported per node below). */
	printk("Zone PFN ranges:\n");
	for (idx = 0; idx < MAX_NR_ZONES; idx++) {
		if (idx != ZONE_MOVABLE) {
			printk("  %-8s %0#10lx -> %0#10lx\n",
			       zone_names[idx],
			       arch_zone_lowest_possible_pfn[idx],
			       arch_zone_highest_possible_pfn[idx]);
		}
	}

	/* Report only the nodes that have a non-zero ZONE_MOVABLE start. */
	printk("Movable zone start PFN for each node\n");
	for (idx = 0; idx < MAX_NUMNODES; idx++) {
		if (zone_movable_pfn[idx] != 0)
			printk("  Node %d: %lu\n", idx, zone_movable_pfn[idx]);
	}

	/* Dump early_node_map[]. */
	printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
	for (idx = 0; idx < nr_nodemap_entries; idx++) {
		printk("  %3d: %0#10lx -> %0#10lx\n",
		       early_node_map[idx].nid,
		       early_node_map[idx].start_pfn,
		       early_node_map[idx].end_pfn);
	}

	/* Initialise every online node. */
	mminit_verify_pageflags_layout();
	setup_nr_node_ids();
	for_each_online_node(nid) {
		pg_data_t *node = NODE_DATA(nid);

		free_area_init_node(nid, NULL,
				    find_min_pfn_for_node(nid), NULL);

		/* Flag the node if it has any memory at all. */
		if (node->node_present_pages)
			node_set_state(nid, N_HIGH_MEMORY);
		check_for_regular_memory(node);
	}
}
其中核心函数为free_area_init_node(),用来针对特定的节点进行初始化
/*
 * Initialise a single node.
 *
 * @nid:            id of the node to set up
 * @zones_size:     forwarded unchanged to the page-count helpers
 * @node_start_pfn: first page frame number of the node
 * @zholes_size:    forwarded unchanged to the page-count helpers
 */
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
		unsigned long node_start_pfn, unsigned long *zholes_size)
{
	pg_data_t *node = NODE_DATA(nid);

	node->node_start_pfn = node_start_pfn;
	node->node_id = nid;

	/* Spanned page count and present (holes excluded) page count. */
	calculate_node_totalpages(node, zones_size, zholes_size);

	/* Allocate the node's mem_map. */
	alloc_node_mem_map(node);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
	printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
		nid, (unsigned long)node,
		(unsigned long)node->node_mem_map);
#endif

	/* Fill in the per-zone data of this node. */
	free_area_init_core(node, zones_size, zholes_size);
}
/*
 * Fill in pgdat->node_spanned_pages (sum over all zones, holes included)
 * and pgdat->node_present_pages (the same sum with the holes removed).
 */
static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	unsigned long spanned = 0;
	unsigned long holes = 0;
	unsigned long present;
	enum zone_type zt;

	/* Pages spanned by every zone of the node. */
	for (zt = 0; zt < MAX_NR_ZONES; zt++) {
		spanned += zone_spanned_pages_in_node(pgdat->node_id, zt,
						      zones_size);
	}
	pgdat->node_spanned_pages = spanned;

	/* Pages that fall into holes, again summed over every zone. */
	for (zt = 0; zt < MAX_NR_ZONES; zt++) {
		holes += zone_absent_pages_in_node(pgdat->node_id, zt,
						   zholes_size);
	}

	present = spanned - holes;
	pgdat->node_present_pages = present;
	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
							present);
}
/*
 * Set up the zone data structures of one node:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
		unsigned long *zones_size, unsigned long *zholes_size)
{
	enum zone_type zi;
	int nid = pgdat->node_id;
	unsigned long next_start_pfn = pgdat->node_start_pfn;
	int err;

	/* Node-level state. */
	pgdat_resize_init(pgdat);
	pgdat->nr_zones = 0;
	init_waitqueue_head(&pgdat->kswapd_wait);
	pgdat->kswapd_max_order = 0;
	pgdat_page_cgroup_init(pgdat);

	for (zi = 0; zi < MAX_NR_ZONES; zi++) {
		struct zone *zone = &pgdat->node_zones[zi];
		unsigned long spanned, usable, memmap_pages;
		enum lru_list lru;
		int k;

		/* Spanned pages, and spanned pages minus the holes. */
		spanned = zone_spanned_pages_in_node(nid, zi, zones_size);
		usable = spanned - zone_absent_pages_in_node(nid, zi,
							     zholes_size);

		/*
		 * Adjust the usable count so that it accounts for how much
		 * memory this zone spends on its memmap.  This affects the
		 * watermark and per-cpu initialisations.
		 */
		memmap_pages =
			PAGE_ALIGN(spanned * sizeof(struct page)) >> PAGE_SHIFT;
		if (usable < memmap_pages) {
			/* The memmap would not fit; leave the count alone. */
			printk(KERN_WARNING
				"  %s zone: %lu pages exceeds realsize %lu\n",
				zone_names[zi], memmap_pages, usable);
		} else {
			usable -= memmap_pages;
			if (memmap_pages)
				printk(KERN_DEBUG
				       "  %s zone: %lu pages used for memmap\n",
				       zone_names[zi], memmap_pages);
		}

		/* Account for reserved pages (first zone only). */
		if (zi == 0 && usable > dma_reserve) {
			usable -= dma_reserve;
			printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
					zone_names[0], dma_reserve);
		}

		/* Highmem zones do not count towards nr_kernel_pages. */
		if (!is_highmem_idx(zi))
			nr_kernel_pages += usable;
		nr_all_pages += usable;

		zone->spanned_pages = spanned;
		zone->present_pages = usable;
#ifdef CONFIG_NUMA
		zone->node = nid;
		zone->min_unmapped_pages = (usable*sysctl_min_unmapped_ratio)
						/ 100;
		zone->min_slab_pages = (usable * sysctl_min_slab_ratio) / 100;
#endif
		zone->name = zone_names[zi];
		spin_lock_init(&zone->lock);
		spin_lock_init(&zone->lru_lock);
		zone_seqlock_init(zone);
		zone->zone_pgdat = pgdat;	/* back-pointer to the owning node */

		zone->prev_priority = DEF_PRIORITY;

		zone_pcp_init(zone);
		for_each_lru(lru) {
			INIT_LIST_HEAD(&zone->lru[lru].list);
			zone->reclaim_stat.nr_saved_scan[lru] = 0;
		}

		/* Reclaim statistics all start at zero. */
		for (k = 0; k < 2; k++) {
			zone->reclaim_stat.recent_rotated[k] = 0;
			zone->reclaim_stat.recent_scanned[k] = 0;
		}
		zap_zone_vm_stats(zone);
		zone->flags = 0;

		/* A zone that spans no pages needs nothing further. */
		if (spanned == 0)
			continue;

		set_pageblock_order(pageblock_default_order());
		setup_usemap(pgdat, zone, spanned);

		/* Any failure here is treated as fatal. */
		err = init_currently_empty_zone(zone, next_start_pfn,
						spanned, MEMMAP_EARLY);
		BUG_ON(err);

		/* Set up the page descriptors covering this zone. */
		memmap_init(spanned, nid, zi, next_start_pfn);

		/* The next zone starts right behind this one. */
		next_start_pfn += spanned;
	}
}
至此,节点和管理区的关键数据已完成初始化,内核在后面为内存管理做的另一个准备工作就是将所有节点的管理区都链入到zonelist中,便于后面内存分配工作的进行
内核在start_kernel()-->build_all_zonelists()中完成zonelist的初始化
void build_all_zonelists(void) { /*设定zonelist的顺序,是按节点还是按管理区排序,只对NUMA有意义*/ set_zonelist_order(); if (system_state == SYSTEM_BOOTING) { __build_all_zonelists(NULL); mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } else { /* we have to stop all cpus to guarantee there is no user of zonelist */ stop_machine(__build_all_zonelists, NULL, NULL); /* cpuset refresh routine should be here */ } /*得到所有管理区可分配的空闲页面数*/ vm_total_pages = nr_free_pagecache_pages(); /* * Disable grouping by mobility if the number of pages in the * system is too low to allow the mechanism to work. It would be * more accurate, but expensive to check per-zone. This check is * made on memory-hotadd so a system can start with mobility * disabled and enable it later */ if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES)) page_group_by_mobility_disabled = 1; else page_group_by_mobility_disabled = 0; printk("Built %i zonelists in %s order, mobility grouping %s. " "Total pages: %ld\n", nr_online_nodes, zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); #ifdef CONFIG_NUMA printk("Policy zone: %s\n", zone_names[policy_zone]); #endif }
static int __build_all_zonelists(void *dummy) { int nid; #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); #endif for_each_online_node(nid) {/*遍历节点*/ pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); build_zonelist_cache(pgdat); } return 0; }
UMA架构下的build_zonelists()函数为
/*
 * Fill node_zonelists[0] of @pgdat: the node's own zones come first,
 * followed by the zones of every other online node.
 */
static void build_zonelists(pg_data_t *pgdat)
{
	int node, local_node, step;
	enum zone_type filled;
	struct zonelist *zonelist;

	local_node = pgdat->node_id;
	zonelist = &pgdat->node_zonelists[0];

	/* The local node's zones go in first. */
	filled = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);

	/*
	 * Now we build the zonelist so that it contains the zones
	 * of all the other nodes.
	 * We don't want to pressure a particular node, so when
	 * building the zones for node N, we make sure that the
	 * zones coming right after the local ones are those from
	 * node N+1 (modulo N)
	 *
	 * Walking local_node + 1, ..., MAX_NUMNODES - 1, 0, ...,
	 * local_node - 1 is done with a single wrap-around index.
	 */
	for (step = 1; step < MAX_NUMNODES; step++) {
		node = (local_node + step) % MAX_NUMNODES;
		if (node_online(node))
			filled = build_zonelists_node(NODE_DATA(node),
						      zonelist, filled,
						      MAX_NR_ZONES - 1);
	}

	/* Terminate the list with an empty entry. */
	zonelist->_zonerefs[filled].zone = NULL;
	zonelist->_zonerefs[filled].zone_idx = 0;
}
/*
 * Append the populated zones of @pgdat to @zonelist, walking from
 * @zone_type down to zone 0.
 *
 * @nr_zones: index of the first free slot in zonelist->_zonerefs
 *
 * Returns the index of the first free slot after the append.
 */
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
				int nr_zones, enum zone_type zone_type)
{
	struct zone *zone;

	BUG_ON(zone_type >= MAX_NR_ZONES);

	for (;;) {
		zone = &pgdat->node_zones[zone_type];
		if (populated_zone(zone)) {
			zoneref_set_zone(zone, &zonelist->_zonerefs[nr_zones]);
			nr_zones++;
			check_highest_zone(zone_type);
		}

		/* Zone 0 is the last one visited. */
		if (zone_type == 0)
			break;
		zone_type--;
	}

	return nr_zones;
}
zoneref_set_zone()就是对_zonerefs的设置
/* Point @zoneref at @zone and store the zone's index alongside it. */
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
	zoneref->zone_idx = zone_idx(zone);
	zoneref->zone = zone;
}
到这里内存管理的前期准备工作已基本完成,包括页表的建立以及管理区的建立,接下来就可以建立伙伴系统和slab分配器来完成具体的内存管理工作了。