The Linux physical memory zones are initialized from the start_kernel() path. By this point the boot-time allocator (bootmem) is already set up, so any memory needed here can be allocated from bootmem.
1. Global variable initialization
max_pfn: the highest physical page frame number
start_kernel()->setup_arch()->e820_end_of_ram_pfn() finds the highest usable RAM page frame number.
void __init setup_arch(char **cmdline_p)
{
    ……
    /*
     * partially used pages are not usable - thus
     * we are rounding upwards:
     */
    /* Walk e820.map and find the highest RAM page frame in the system;
       on x86-32 it must lie below 4 GB */
    max_pfn = e820_end_of_ram_pfn();
    ……
}
unsigned long __init e820_end_of_ram_pfn(void)
{
    /* MAX_ARCH_PFN corresponds to the 4 GB address space */
    return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
}
/*
 * Find the highest page frame number we have available
 */
static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
{
    int i;
    unsigned long last_pfn = 0;
    unsigned long max_arch_pfn = MAX_ARCH_PFN; /* number of pages in the 4 GB address space */

    /* Iterate over all e820 entries; e820 holds the memory map probed from the BIOS */
    for (i = 0; i < e820.nr_map; i++) {
        struct e820entry *ei = &e820.map[i]; /* the i-th physical memory block */
        unsigned long start_pfn;
        unsigned long end_pfn;

        if (ei->type != type) /* not the type we are looking for */
            continue;

        /* page frame number of the start address */
        start_pfn = ei->addr >> PAGE_SHIFT;
        /* page frame number of the end address */
        end_pfn = (ei->addr + ei->size) >> PAGE_SHIFT;

        if (start_pfn >= limit_pfn)
            continue;
        if (end_pfn > limit_pfn) {
            last_pfn = limit_pfn; /* the end pfn found exceeds the limit */
            break;
        }

        if (end_pfn > last_pfn)
            last_pfn = end_pfn; /* record the new highest pfn */
    }

    if (last_pfn > max_arch_pfn) /* beyond the 4 GB address space */
        last_pfn = max_arch_pfn;

    /* print the result */
    printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n",
             last_pfn, max_arch_pfn);
    /* return the last page frame number */
    return last_pfn;
}
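To make the pfn arithmetic concrete, here is a minimal user-space sketch with a hypothetical E820_RAM entry (the address and size values are chosen purely for illustration):

#include <stdio.h>

#define PAGE_SHIFT 12 /* 4 KiB pages, as on x86 */

int main(void)
{
    /* Hypothetical E820_RAM entry: starts at 1 MiB, spans 2 GiB - 1 MiB */
    unsigned long long addr = 0x100000ULL;
    unsigned long long size = 0x7FF00000ULL;

    unsigned long start_pfn = addr >> PAGE_SHIFT;          /* 0x100   */
    unsigned long end_pfn   = (addr + size) >> PAGE_SHIFT; /* 0x80000 */

    /* For a map consisting of just this entry, last_pfn would be 0x80000,
     * i.e. roughly 2 GiB of addressable RAM. */
    printf("start_pfn=%#lx end_pfn=%#lx\n", start_pfn, end_pfn);
    return 0;
}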
max_low_pfn: the maximum number of low-memory page frames
start_kernel()->setup_arch()->find_low_pfn_range()
/*
 * Determine low and high memory ranges:
 */
/* Find the maximum number of low-memory page frames and initialize the two variables */
void __init find_low_pfn_range(void)
{
    /* it could update max_pfn */

    /* If total RAM already fits below the low-memory limit,
       there is no high-memory mapping at all */
    if (max_pfn <= MAXMEM_PFN)
        lowmem_pfn_init();
    else /* the usual path on a typical PC: high memory exists */
        highmem_pfn_init();
}
We will look directly at the path taken when high memory is present.
/*
 * We have more RAM than fits into lowmem - we try to put it into
 * highmem, also taking the highmem=x boot parameter into account:
 */
/* The number of highmem pages can be set on the boot command line;
   if it is not, it is computed here */
void __init highmem_pfn_init(void)
{
    /* MAXMEM_PFN is the maximum physical address minus (4M + 4M + 8K + 128M),
       so low memory is actually a bit smaller than the often-quoted 896 MB */
    max_low_pfn = MAXMEM_PFN;

    if (highmem_pages == -1) /* no highmem size was given at boot */
        highmem_pages = max_pfn - MAXMEM_PFN; /* total pages minus low-memory pages */

    /* If highmem_pages was set on the command line, it may be inconsistent
       with the detected memory, so check it here */
    if (highmem_pages + MAXMEM_PFN < max_pfn)
        max_pfn = MAXMEM_PFN + highmem_pages;

    if (highmem_pages + MAXMEM_PFN > max_pfn) {
        printk(KERN_WARNING MSG_HIGHMEM_TOO_SMALL,
            pages_to_mb(max_pfn - MAXMEM_PFN),
            pages_to_mb(highmem_pages));
        highmem_pages = 0;
    }
#ifndef CONFIG_HIGHMEM
    /* Maximum memory usable is what is directly addressable */
    printk(KERN_WARNING "Warning only %ldMB will be used.\n", MAXMEM>>20);
    if (max_pfn > MAX_NONPAE_PFN)
        printk(KERN_WARNING "Use a HIGHMEM64G enabled kernel.\n");
    else
        printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n");
    max_pfn = MAXMEM_PFN;
#else /* !CONFIG_HIGHMEM */ /* high memory is configured */
#ifndef CONFIG_HIGHMEM64G
    /* Without HIGHMEM64G (no PAE), usable memory cannot exceed 4 GB */
    if (max_pfn > MAX_NONPAE_PFN) {
        max_pfn = MAX_NONPAE_PFN;
        printk(KERN_WARNING MSG_HIGHMEM_TRIMMED);
    }
#endif /* !CONFIG_HIGHMEM64G */
#endif /* !CONFIG_HIGHMEM */
}
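As a rough worked example of the lowmem/highmem split (a user-space sketch; MAXMEM_PFN is taken here as exactly 896 MiB worth of pages, while the real value is slightly smaller):

#include <stdio.h>

int main(void)
{
    /* Hypothetical values: 2 GiB of RAM, no highmem= boot parameter */
    unsigned long max_pfn    = 0x80000; /* 2 GiB   / 4 KiB */
    unsigned long maxmem_pfn = 0x38000; /* 896 MiB / 4 KiB (assumed) */

    unsigned long max_low_pfn   = maxmem_pfn;
    unsigned long highmem_pages = max_pfn - maxmem_pfn;

    /* -> lowmem: 896 MiB, highmem: 1152 MiB */
    printf("lowmem %lu MiB, highmem %lu MiB\n",
           max_low_pfn >> 8, highmem_pages >> 8);
    return 0;
}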
2. Zone initialization
start_kernel()->setup_arch()->paging_init()->zone_sizes_init()
static void __init zone_sizes_init(void)
{
    /* Record the maximum pfn of each zone; used below for the actual setup */
    unsigned long max_zone_pfns[MAX_NR_ZONES];
    memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
    max_zone_pfns[ZONE_DMA] = /* highest pfn of the DMA zone; the others follow the same idea */
        virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
    max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
#ifdef CONFIG_HIGHMEM
    max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
#endif
    /* Build the node/zone data structures, including the initial
       (still empty) buddy-system lists */
    free_area_init_nodes(max_zone_pfns);
}
On x86-32 (non-PAE), MAX_DMA_ADDRESS is 3 GB + 16 MB:
/* The maximum address that we can perform a DMA transfer to on this platform */
#define MAX_DMA_ADDRESS      (PAGE_OFFSET + 0x1000000)
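With PAGE_OFFSET = 0xC0000000, the max_zone_pfns[ZONE_DMA] computation in zone_sizes_init() works out as follows (a user-space sketch; on the kernel direct mapping, virt_to_phys() is simply a subtraction of PAGE_OFFSET):

#include <stdio.h>

#define PAGE_SHIFT       12
#define PAGE_OFFSET      0xC0000000UL
#define MAX_DMA_ADDRESS  (PAGE_OFFSET + 0x1000000)

int main(void)
{
    /* virt_to_phys() on the direct mapping: virtual minus PAGE_OFFSET */
    unsigned long dma_phys    = MAX_DMA_ADDRESS - PAGE_OFFSET;  /* 0x1000000 = 16 MiB */
    unsigned long dma_max_pfn = dma_phys >> PAGE_SHIFT;         /* 0x1000 = 4096 pages */

    printf("ZONE_DMA ends at pfn %#lx (%lu MiB)\n",
           dma_max_pfn, dma_max_pfn >> 8);
    return 0;
}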
/**
 * free_area_init_nodes - Initialise all pg_data_t and zone data
 * @max_zone_pfn: an array of max PFNs for each zone
 *
 * This will call free_area_init_node() for each active node in the system.
 * Using the page ranges provided by add_active_range(), the size of each
 * zone in each node and their holes is calculated. If the maximum PFN
 * between two adjacent zones match, it is assumed that the zone is empty.
 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
 * starts where the previous one ended. For example, ZONE_DMA32 starts
 * at arch_max_dma_pfn.
 */
void __init free_area_init_nodes(unsigned long *max_zone_pfn)
{
    unsigned long nid;
    int i;

    /* Sort early_node_map as initialisation assumes it is sorted */
    /* Sort the active regions; active regions are described later */
    sort_node_map();

    /* Record where the zone boundaries are */
    memset(arch_zone_lowest_possible_pfn, 0,
                sizeof(arch_zone_lowest_possible_pfn));
    memset(arch_zone_highest_possible_pfn, 0,
                sizeof(arch_zone_highest_possible_pfn));
    /* Find the lowest pfn among the active regions; the in-code comments are detailed */
    arch_zone_lowest_possible_pfn[0] = find_min_pfn_with_active_regions();
    arch_zone_highest_possible_pfn[0] = max_zone_pfn[0];
    for (i = 1; i < MAX_NR_ZONES; i++) {
        if (i == ZONE_MOVABLE)
            continue;
        /* Zones are assumed contiguous: the next zone starts where the previous one ends */
        arch_zone_lowest_possible_pfn[i] =
            arch_zone_highest_possible_pfn[i-1];
        arch_zone_highest_possible_pfn[i] =
            max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
    }
    /* ZONE_MOVABLE boundaries are cleared here */
    arch_zone_lowest_possible_pfn[ZONE_MOVABLE] = 0;
    arch_zone_highest_possible_pfn[ZONE_MOVABLE] = 0;

    /* Find the PFNs that ZONE_MOVABLE begins at in each node */
    memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
    /* Work out the movable pages of each node; ZONE_MOVABLE is a newer
       mechanism that will be analysed in detail in a later article */
    find_zone_movable_pfns_for_nodes(zone_movable_pfn);

    /* Print out the zone ranges */
    printk("Zone PFN ranges:\n");
    for (i = 0; i < MAX_NR_ZONES; i++) {
        if (i == ZONE_MOVABLE)
            continue;
        printk("  %-8s %0#10lx -> %0#10lx\n",
                zone_names[i],
                arch_zone_lowest_possible_pfn[i],
                arch_zone_highest_possible_pfn[i]);
    }

    /* Print out the PFNs ZONE_MOVABLE begins at in each node */
    printk("Movable zone start PFN for each node\n");
    for (i = 0; i < MAX_NUMNODES; i++) {
        if (zone_movable_pfn[i])
            printk("  Node %d: %lu\n", i, zone_movable_pfn[i]);
    }

    /* Print out the early_node_map[] */
    printk("early_node_map[%d] active PFN ranges\n", nr_nodemap_entries);
    for (i = 0; i < nr_nodemap_entries; i++)
        printk("  %3d: %0#10lx -> %0#10lx\n", early_node_map[i].nid,
                early_node_map[i].start_pfn,
                early_node_map[i].end_pfn);

    /* Initialise every node */
    mminit_verify_pageflags_layout(); /* for debugging */
    setup_nr_node_ids();
    for_each_online_node(nid) {
        pg_data_t *pgdat = NODE_DATA(nid);
        /* Initialize the zone data; the buddy-system structures are created
           but hold no pages yet - pages are handed over later in mem_init() */
        free_area_init_node(nid, NULL,
                find_min_pfn_for_node(nid), NULL);

        /* Any memory on that node */
        if (pgdat->node_present_pages)
            node_set_state(nid, N_HIGH_MEMORY);
        /* related memory checks */
        check_for_regular_memory(pgdat);
    }
}
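To see how the zone boundaries come out of the loop above, here is a small user-space sketch with hypothetical max_zone_pfn values for a 2 GiB x86-32 machine (the real numbers depend on the e820 map and MAXMEM_PFN):

#include <stdio.h>

int main(void)
{
    /* Hypothetical max_zone_pfns: DMA ends at 16 MiB, NORMAL at ~896 MiB,
     * HIGHMEM at 2 GiB. */
    enum { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, NR };
    unsigned long max_zone_pfn[NR] = { 0x1000, 0x38000, 0x80000 };
    unsigned long lo[NR], hi[NR];
    unsigned long min_pfn = 0x10;   /* assumed lowest active pfn (64 KiB) */
    int i;

    lo[0] = min_pfn;
    hi[0] = max_zone_pfn[0];
    for (i = 1; i < NR; i++) {
        lo[i] = hi[i - 1];          /* each zone starts where the previous one ends */
        hi[i] = max_zone_pfn[i] > lo[i] ? max_zone_pfn[i] : lo[i];
    }
    for (i = 0; i < NR; i++)
        printf("zone %d: %#lx -> %#lx\n", i, lo[i], hi[i]);
    return 0;
}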
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
        unsigned long node_start_pfn, unsigned long *zholes_size)
{
    pg_data_t *pgdat = NODE_DATA(nid);

    pgdat->node_id = nid;
    pgdat->node_start_pfn = node_start_pfn; /* obtained by the caller above */
    /* Count all physical pages of node nid and store the totals in pgdat */
    calculate_node_totalpages(pgdat, zones_size, zholes_size);

    /* When there is only one node, its mem_map is also stored in the global variable */
    alloc_node_mem_map(pgdat);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
    printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
        nid, (unsigned long)pgdat,
        (unsigned long)pgdat->node_mem_map);
#endif
    /* Initialize the zone data: buddy system, wait queues,
       related variables, data structures and lists */
    free_area_init_core(pgdat, zones_size, zholes_size);
}
The actual per-zone initialization is done in the following function.
/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
        unsigned long *zones_size, unsigned long *zholes_size)
{
    enum zone_type j;
    int nid = pgdat->node_id;
    unsigned long zone_start_pfn = pgdat->node_start_pfn;
    int ret;

    pgdat_resize_init(pgdat);
    pgdat->nr_zones = 0;
    init_waitqueue_head(&pgdat->kswapd_wait);
    pgdat->kswapd_max_order = 0;
    pgdat_page_cgroup_init(pgdat);

    for (j = 0; j < MAX_NR_ZONES; j++) {
        struct zone *zone = pgdat->node_zones + j;
        unsigned long size, realsize, memmap_pages;
        enum lru_list l;

        /* The next two calls give the real memory size of this zone on this node */
        size = zone_spanned_pages_in_node(nid, j, zones_size);
        realsize = size - zone_absent_pages_in_node(nid, j,
                                zholes_size);

        /*
         * Adjust realsize so that it accounts for how much memory
         * is used by this zone for memmap. This affects the watermark
         * and per-cpu initialisations
         */
        /* memory needed to hold the struct page array of this zone */
        memmap_pages =
            PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
        if (realsize >= memmap_pages) {
            realsize -= memmap_pages;
            if (memmap_pages)
                printk(KERN_DEBUG
                       "  %s zone: %lu pages used for memmap\n",
                       zone_names[j], memmap_pages);
        } else
            printk(KERN_WARNING
                "  %s zone: %lu pages exceeds realsize %lu\n",
                zone_names[j], memmap_pages, realsize);

        /* Account for reserved pages */
        if (j == 0 && realsize > dma_reserve) {
            realsize -= dma_reserve; /* subtract the pages reserved for DMA */
            printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
                    zone_names[0], dma_reserve);
        }

        if (!is_highmem_idx(j)) /* if this is not a highmem zone */
            nr_kernel_pages += realsize;
        nr_all_pages += realsize;

        /* Initialize the zone structure fields */
        zone->spanned_pages = size;
        zone->present_pages = realsize;
#ifdef CONFIG_NUMA
        zone->node = nid;
        zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
                        / 100;
        zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
#endif
        zone->name = zone_names[j];
        spin_lock_init(&zone->lock);
        spin_lock_init(&zone->lru_lock);
        zone_seqlock_init(zone);
        zone->zone_pgdat = pgdat;
        zone->prev_priority = DEF_PRIORITY;

        zone_pcp_init(zone);
        for_each_lru(l) { /* initialize the LRU lists */
            INIT_LIST_HEAD(&zone->lru[l].list);
            zone->reclaim_stat.nr_saved_scan[l] = 0;
        }
        zone->reclaim_stat.recent_rotated[0] = 0;
        zone->reclaim_stat.recent_rotated[1] = 0;
        zone->reclaim_stat.recent_scanned[0] = 0;
        zone->reclaim_stat.recent_scanned[1] = 0;
        zap_zone_vm_stats(zone); /* zero zone->vm_stat */
        zone->flags = 0;
        if (!size)
            continue;

        /* needs a related config macro; not defined in this kernel version */
        set_pageblock_order(pageblock_default_order());
        /* allocate zone->pageblock_flags from the bootmem allocator */
        setup_usemap(pgdat, zone, size);
        /* Initialize the zone wait queues and its buddy system (MAX_ORDER free
           lists); the buddy system is covered separately later */
        ret = init_currently_empty_zone(zone, zone_start_pfn,
                        size, MEMMAP_EARLY);
        BUG_ON(ret);
        /* Initialize the struct page of every page in the zone */
        memmap_init(size, nid, j, zone_start_pfn);
        zone_start_pfn += size;
    }
}
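A quick sketch of the memmap_pages calculation above, with hypothetical numbers (sizeof(struct page) is assumed to be 32 bytes here; the real size depends on the kernel configuration):

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
/* Round up to a whole number of pages, like the kernel's PAGE_ALIGN() */
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
    unsigned long size = 0x38000;          /* hypothetical zone spanning 896 MiB, in pages */
    unsigned long sizeof_struct_page = 32; /* assumed; varies with config */

    unsigned long memmap_pages =
        PAGE_ALIGN(size * sizeof_struct_page) >> PAGE_SHIFT;

    /* 229376 pages * 32 B = 7 MiB of struct page, i.e. 1792 pages deducted from realsize */
    printf("memmap_pages = %lu\n", memmap_pages);
    return 0;
}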
3. Initialization of the allocation fallback zone lists (non-CONFIG_NUMA)
Data structure representation
struct zoneref {
    struct zone *zone;  /* Pointer to actual zone */
    int zone_idx;       /* zone_idx(zoneref->zone) */
};

/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * If zlcache_ptr is not NULL, then it is just the address of zlcache,
 * as explained above. If zlcache_ptr is NULL, there is no zlcache.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()      - Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx()  - Return the index of the zone for an entry
 * zonelist_node_idx()  - Return the index of the node for an entry
 */
/* the zone fallback scheme used by every allocation */
struct zonelist {
    struct zonelist_cache *zlcache_ptr;              // NULL or &zlcache
    struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
#ifdef CONFIG_NUMA
    struct zonelist_cache zlcache;                   // optional ...
#endif
};
The English comments in the code are already quite detailed.
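To illustrate how an allocation consumes this fallback list, here is a simplified sketch (not the kernel's real allocator loop, which goes through the for_each_zone_zonelist() helpers and is limited by the GFP flags); it only relies on the NULL terminator written at the end of build_zonelists() below:

/* Illustrative only: return the first populated zone in the fallback list.
 * The real page allocator also restricts the walk to the zones allowed by
 * the GFP flags and consults the zonelist_cache on NUMA. */
static struct zone *pick_first_populated_zone(struct zonelist *zonelist)
{
    struct zoneref *z;

    for (z = zonelist->_zonerefs; z->zone != NULL; z++) {
        if (populated_zone(z->zone))    /* zone actually has pages */
            return z->zone;
    }
    return NULL;
}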
Initialization
start_kernel()->build_all_zonelists()
void build_all_zonelists(void)
{
    /* set the global variable current_zonelist_order */
    set_zonelist_order();

    if (system_state == SYSTEM_BOOTING) {
        /* build the zonelists of every node */
        __build_all_zonelists(NULL);
        /* for debugging */
        mminit_verify_zonelist();
        cpuset_init_current_mems_allowed();
    } else {
        /* we have to stop all cpus to guarantee there is no user
           of zonelist */
        stop_machine(__build_all_zonelists, NULL, NULL);
        /* cpuset refresh routine should be here */
    }
    /* total number of allocatable pages across all zones */
    vm_total_pages = nr_free_pagecache_pages();
    /*
     * Disable grouping by mobility if the number of pages in the
     * system is too low to allow the mechanism to work. It would be
     * more accurate, but expensive to check per-zone. This check is
     * made on memory-hotadd so a system can start with mobility
     * disabled and enable it later
     */
    if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
        page_group_by_mobility_disabled = 1;
    else
        page_group_by_mobility_disabled = 0;

    printk("Built %i zonelists in %s order, mobility grouping %s. "
        "Total pages: %ld\n",
            nr_online_nodes,
            zonelist_order_name[current_zonelist_order],
            page_group_by_mobility_disabled ? "off" : "on",
            vm_total_pages);
#ifdef CONFIG_NUMA
    printk("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}
/* return values int ....just for stop_machine() */
static int __build_all_zonelists(void *dummy)
{
    int nid;

#ifdef CONFIG_NUMA
    memset(node_load, 0, sizeof(node_load));
#endif
    for_each_online_node(nid) {
        pg_data_t *pgdat = NODE_DATA(nid);
        /* Build the zonelists; this array provides the fallback order
           walked when allocating memory */
        build_zonelists(pgdat);
        /* On UMA this merely sets the related pointer to NULL */
        build_zonelist_cache(pgdat);
    }
    return 0;
}
static void build_zonelists(pg_data_t *pgdat)
{
    int node, local_node;
    enum zone_type j;
    struct zonelist *zonelist;

    local_node = pgdat->node_id;

    zonelist = &pgdat->node_zonelists[0];
    /* Add this node's zones to the zonelist; page allocation and
       related operations rely on this fallback list */
    j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);

    /*
     * Now we build the zonelist so that it contains the zones
     * of all the other nodes.
     * We don't want to pressure a particular node, so when
     * building the zones for node N, we make sure that the
     * zones coming right after the local ones are those from
     * node N+1 (modulo N)
     */
    /* append the zones of the other online nodes */
    for (node = local_node + 1; node < MAX_NUMNODES; node++) {
        if (!node_online(node))
            continue;
        j = build_zonelists_node(NODE_DATA(node), zonelist, j,
                            MAX_NR_ZONES - 1);
    }
    for (node = 0; node < local_node; node++) {
        if (!node_online(node))
            continue;
        j = build_zonelists_node(NODE_DATA(node), zonelist, j,
                            MAX_NR_ZONES - 1);
    }

    zonelist->_zonerefs[j].zone = NULL;
    zonelist->_zonerefs[j].zone_idx = 0;
}
/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
 */
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
                int nr_zones, enum zone_type zone_type)
{
    struct zone *zone;

    BUG_ON(zone_type >= MAX_NR_ZONES);
    zone_type++;

    do {
        zone_type--;
        zone = pgdat->node_zones + zone_type;
        if (populated_zone(zone)) { /* the zone's total size in pages is non-zero */
            /* record the zone (and its index) in the zonelist */
            zoneref_set_zone(zone,
                &zonelist->_zonerefs[nr_zones++]);
            check_highest_zone(zone_type);
        }
    } while (zone_type);
    return nr_zones;
}
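Because the do/while loop above walks zone_type from high to low, a single-node x86-32 machine with high memory ends up with a fallback order like the following (an illustrative layout; only zones that are actually populated get an entry):

/* Illustrative contents of node 0's zonelist after build_zonelists():
 *
 *   _zonerefs[0] -> ZONE_HIGHMEM
 *   _zonerefs[1] -> ZONE_NORMAL
 *   _zonerefs[2] -> ZONE_DMA
 *   _zonerefs[3] -> NULL terminator (set at the end of build_zonelists())
 *
 * An allocation that may use high memory starts at entry 0 and falls back
 * towards ZONE_DMA; a GFP_KERNEL allocation skips ZONE_HIGHMEM and starts
 * at ZONE_NORMAL.
 */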
Zone initialization relies mainly on the bootmem allocator and the already-initialized e820 global variable. Once the zones are initialized, the buddy system, the slab allocator and other mechanisms can be built on top of them; these will be summarized step by step in later articles.