体系结构相关代码需要在启动期间建立以下信息:
1.系统中各个内存域的页帧边界,保存在max_zone_pfn中
2.个结点页帧的分配情况,保存在全局变量early_node_map中。
从内核版本2.6.10开始提供一个通用的框架,用于将上述信息转换为伙伴系统预期的结点和内存域数据结构。在这以前,各个体系结构必须自行建立相关结构。现在,体系结构相关代码只需要建立前述的简单结构,将繁重的工作留给free_area_init_nodes即可。图1给出了该过程概述,图2给出了free_area_init_nodes的代码流程图。
图1:free_area_init_nodes过程概述
图2:free_area_init_nodes代码流程图
free_area_init_nodes的源代码的详细分析如下:
void __init free_area_init_nodes(unsigned long *max_zone_pfn) { unsigned long nid; enum zone_type i; /* Sort early_node_map as initialisation assumes it is sorted */ sort_node_map();//排序使得后续的任务稍微容易些,排序本身并不特别复杂 /* Record where the zone boundaries are */ memset(arch_zone_lowest_possible_pfn, 0, sizeof(arch_zone_lowest_possible_pfn));//全局数组arch_zone_lowest_possible_pfn用来存储各个内存域可使用的最低内存页帧编号 memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn));//全局数组arch_zone_highest_possible_pfn用来存储各个内存域可使用的最高内存页帧编号 arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();//辅助函数find_min_pfn_with_active_regions用于找到注册的最低内存域中可用的编号最小的页帧 arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];//max_zone_pfn记录了各个内存域包含的最大页帧号 for (i = 1; i < MAX_NR_ZONES; i++) {//依次遍历,确定各个内存域的边界 if (i == ZONE_MOVABLE)//由于ZONE_MOVABLE是一个虚拟内存域,不与真正的硬件内存域关联,该内存域的边界总是设置为0,如后面的代码所示 continue; arch_zone_lowest_possible_pfn[i] = arch_zone_highest_possible_pfn[i-1];//第n个内存域的最小页帧,即前一个(第n-1个)内存域的最大页帧 arch_zone_highest_possible_pfn[i] = max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);//不出意外,当前内存域的最大页帧由max_zone_pfn给出 } arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0; arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0; /* Find the PFNs that ZONE_MOVABLE begins at in each node */ memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn)); find_zone_movable_pfns_for_nodes(zone_movable_pfn);//用于计算进入ZONE_MOVABLE的内存数量,详细分析见下文 /* Print out the zone ranges */ printk("Zone PFN ranges:\n"); for (i = 0; i < MAX_NR_ZONES; i++) {//将各个内存域的最大、最小页帧号显示出来 if (i == ZONE_MOVABLE) continue; printk(" %-8s %8lu -> %8lu\n", zone_names[i], arch_zone_lowest_possible_pfn[i], arch_zone_highest_possible_pfn[i]); } /* Print out the PFNs ZONE_MOVABLE begins at in each node */ printk("Movable zone start PFN for each node\n"); for (i = 0; i < MAX_NUMNODES; i++) { if (zone_movable_pfn[i])//对每个结点来说,zone_movable_pfn[node_id]表示ZONE_MOVABLE在movable_zone内存域中所取得内存的起始地址。内核确保这些页将用于满足符合ZONE_MOVABLE职责的内存分配 printk(" Node %d: %lu\n", i, zone_movable_pfn[i]); } /* Print out the early_node_map[] */ printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries); for (i = 0; i < nr_nodemap_entries; i++)//显示各个内存域的分配情况 printk(" %3d: %8lu -> %8lu\n", early_node_map[i].nid, early_node_map[i].start_pfn, early_node_map[i].end_pfn); /* Initialise every node */ setup_nr_node_ids(); for_each_online_node(nid) {//代码遍历所有的活动结点,并分别对各个结点调用free_area_init_node建立数据结构,该函数需要结点第一个可用的页帧作为一个参数,而find_min_pfn_for_node则从early_node_map数组提取该信息 pg_data_t *pgdat = NODE_DATA(nid); free_area_init_node(nid, pgdat, NULL, find_min_pfn_for_node(nid), NULL); /* Any memory on that node */ if (pgdat->node_present_pages)//根据node_present_pages字段判断结点具有内存,则在结点位图中设置N_HIGH_MEMORY标志,该标志只表示结点上存在普通或高端内存,因此check_for_regular_memory进一步检查低于ZONE_HIGHMEM的内存域中是否有内存,并据此在结点位图中相应地设置N_NORMAL_MEMORY node_set_state(nid, N_HIGH_MEMORY); check_for_regular_memory(pgdat); } }
free_area_init_node源代码详细分析:
void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; calculate_node_totalpages(pgdat, zones_size, zholes_size);//首先累计各个内存域的页数,计算结点中页的总数。对连续内存模型而言,这可以通过zone_sizes_init完成,但calculate_node_totalpages还考虑了内存空洞 alloc_node_mem_map(pgdat);//分配了该节点的页面描述符数组[pgdat->node_mem_map数组的内存分配] free_area_init_core(pgdat, zones_size, zholes_size);//对该节点的每个区[DMA,NORMAL,HIGH]的的结构进行初始化 }calculate_node_totalpages源代码详细分析:
static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { unsigned long realtotalpages, totalpages = 0; enum zone_type i; for (i = 0; i < MAX_NR_ZONES; i++) totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, zones_size);//累计计算各个内存域包含空洞的内存总页数 pgdat->node_spanned_pages = totalpages; realtotalpages = totalpages; for (i = 0; i < MAX_NR_ZONES; i++) realtotalpages -= zone_absent_pages_in_node(pgdat->node_id, i, zholes_size)//;以包含空洞的内存总页数累计减去各个内存域中空洞的数量,就可以得出实际可用的内存页数 pgdat->node_present_pages = realtotalpages; printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); }alloc_node_mem_map源代码详细分析:
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat) { /* Skip empty nodes */ if (!pgdat->node_spanned_pages)//如果内存结点没有没存页,直接返回 return; #ifdef CONFIG_FLAT_NODE_MEM_MAP /* ia64 gets its own node_mem_map, before this, without bootmem */ if (!pgdat->node_mem_map) {//如果还没有为结点分配mem_map,则需要为结点分配mem_map unsigned long size, start, end; struct page *map; /* * The zone's endpoints aren't required to be MAX_ORDER * aligned but the node_mem_map endpoints must be in order * for the buddy allocator to function correctly. */ start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);//确定起点,以MAX_ORDER_NR_PAGES的大小对齐 end = pgdat->node_start_pfn + pgdat->node_spanned_pages;//计算结束点 end = ALIGN(end, MAX_ORDER_NR_PAGES);//以MAX_ORDER_NR_PAGES对齐,与上面的功能一致,将内存映射对齐到伙伴系统的最大分配阶 size = (end - start) * sizeof(struct page);//计算所需内存的大小 map = alloc_remap(pgdat->node_id, size);//为内存映射分配内存 if (!map)//如果分配不成功,则使用普通的自举内存分配器进行分配 map = alloc_bootmem_node(pgdat, size); pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); } #ifndef CONFIG_NEED_MULTIPLE_NODES /* * With no DISCONTIG, the global mem_map is just set as node 0's */ if (pgdat == NODE_DATA(0)) { mem_map = NODE_DATA(0)->node_mem_map; #ifdef CONFIG_ARCH_POPULATES_NODE_MAP if (page_to_pfn(mem_map) != pgdat->node_start_pfn) mem_map -= (pgdat->node_start_pfn - ARCH_PFN_OFFSET); #endif /* CONFIG_ARCH_POPULATES_NODE_MAP */ } #endif #endif /* CONFIG_FLAT_NODE_MEM_MAP */ }
free_area_init_core源代码详细分析:
static void __meminit free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { enum zone_type j; int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; int ret; pgdat_resize_init(pgdat); pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, memmap_pages; size = zone_spanned_pages_in_node(nid, j, zones_size);//内存域跨域的页数 realsize = size - zone_absent_pages_in_node(nid, j, zholes_size);//内存域的可用长度,可通过跨域的页数减去空洞覆盖的页数而得到 /* * Adjust realsize so that it accounts for how much memory * is used by this zone for memmap. This affects the watermark * and per-cpu initialisations */ memmap_pages = (size * sizeof(struct page)) >> PAGE_SHIFT;//用于内存映射需要的页数 if (realsize >= memmap_pages) {如果内存域的可用长度大于用于内存映射需要的页数 realsize -= memmap_pages;//则将需要映射的页数分配出去 printk(KERN_DEBUG " %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else//否则,显示警告信息,可用内存不足 printk(KERN_WARNING " %s zone: %lu pages exceeds realsize %lu\n", zone_names[j], memmap_pages, realsize); /* Account for reserved pages */ if (j == 0 && realsize > dma_reserve) { realsize -= dma_reserve; printk(KERN_DEBUG " %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); }//除去用于保留的内存页 if (!is_highmem_idx(j)) nr_kernel_pages += realsize;//nr_kernel_pages表示不包含高端内存的系统内存共有的内存页面数,用于统计所有一致映射的页 nr_all_pages += realsize; zone->spanned_pages = size;//跨域的内存页 zone->present_pages = realsize;//经过一系列初始化之后,还可使用的内存页 #ifdef CONFIG_NUMA zone->node = nid; zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)/ 100;//这句话不理解,请指教 zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;//这句话不理解,请指教 #endif zone->name = zone_names[j]; spin_lock_init(&zone->lock);//关于锁机制,自己还没有学到,后面会详细介绍锁机制 spin_lock_init(&zone->lru_lock); zone_seqlock_init(zone); zone->zone_pgdat = pgdat; zone->prev_priority = DEF_PRIORITY; zone_pcp_init(zone);//初始化该内存域的per_cpu缓存 INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); zone->nr_scan_active = 0; zone->nr_scan_inactive = 0; zap_zone_vm_stats(zone); zone->flags = 0; if (!size) continue; set_pageblock_order(pageblock_default_order()); setup_usemap(pgdat, zone, size); ret = init_currently_empty_zone(zone, zone_start_pfn, size, MEMMAP_EARLY);//init_currently_empty_zone用于初始化free_area列表,并将属于该内存域的所有page实例都设置为初始默认值 BUG_ON(ret); zone_start_pfn += size; } }check_for_regular_memory源代码详细分析:
static void check_for_regular_memory(pg_data_t *pgdat) { #ifdef CONFIG_HIGHMEM enum zone_type zone_type; for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {//进一步检查低于ZONE_HIGHMEM的内存域中是否有内存 struct zone *zone = &pgdat->node_zones[zone_type]; if (zone->present_pages) node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);//并根据上面的检查在结点位图中相应地设置N_NORMAL_MEMORY } #endif }