建立内存管理架构

5.2.5 建立内存管理架构

回到setup_arch函数中,第995行调用initmem_init来启用初始化期间的早期(early)内存管理器。这个函数在两个文件中有定义:arch/x86/mm/init_32.c和arch/x86/mm/numa_32.c,具体使用哪一个取决于是否启用了编译选项CONFIG_NEED_MULTIPLE_NODES。这个编译选项是什么意思?这得从NUMA说起。NUMA(Non-Uniform Memory Access)翻译成中文叫“非一致内存访问体系”,其目的是为多CPU或大型计算机集群提供一个分布式内存访问环境,而每一个分布式节点就叫做NODE。我们单机系统通常是UMA(Uniform Memory Access),即“一致内存访问体系”,其实就是只有一个NODE的环境。

 

而每个NODE下物理内存分成几个Zone(区域),Zone再对物理页面进行管理。所以,不管是我们的PC,还是大型集群服务器,只要安装了Linux操作系统,就是一个NODE->Zone->Page这样一个三层物理内存管理体系。

 

我的这台烂PC,居然定义了这个选项的,所以实际上应该是调用arch/x86/mm/numa_32.c中的那个initmem_init函数。可能考虑到它是双核的原因吧,不过我们简单起见,只考虑仅有一个NODE的情况,所以仅对arch/x86/mm/init_32.c中的initmem_init函数进行分析,对NUMA感兴趣的可以去研究另一个initmem_init函数。

 

/* arch/x86/mm/init_32.c */

708void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,

 709                                int acpi, int k8)

 710{

 711#ifdef CONFIG_HIGHMEM

 712        highstart_pfn = highend_pfn = max_pfn;

 713        if (max_pfn > max_low_pfn)

 714                highstart_pfn = max_low_pfn;

 715        e820_register_active_regions(0, 0, highend_pfn);

 716        sparse_memory_present_with_active_regions(0);

 717        printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",

 718                pages_to_mb(highend_pfn - highstart_pfn));

 719        num_physpages = highend_pfn;

 720        high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;

 721#else

 722        e820_register_active_regions(0, 0, max_low_pfn);

 723        sparse_memory_present_with_active_regions(0);

 724        num_physpages = max_low_pfn;

 725        high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;

 726#endif

 727#ifdef CONFIG_FLATMEM

 728        max_mapnr = num_physpages;

 729#endif

 730        __vmalloc_start_set = true;

 731

 732        printk(KERN_NOTICE "%ldMB LOWMEM available.\n",

 733                        pages_to_mb(max_low_pfn));

 734

 735        setup_bootmem_allocator();

 736}

 

这个函数首先调用e820_register_active_regions设置early_node_map[],传入给它的参数是0、0和max_low_pfn(或者是highend_pfn,取决于是否定义了编译选项CONFIG_HIGHMEM):

 

932void __init e820_register_active_regions(int nid, unsigned long start_pfn,

 933                                         unsigned long last_pfn)

 934{

 935        unsigned long ei_startpfn;

 936        unsigned long ei_endpfn;

 937        int i;

 938

 939        for (i = 0; i < e820.nr_map; i++)

 940                if (e820_find_active_region(&e820.map[i],

 941                                            start_pfn, last_pfn,

 942                                            &ei_startpfn, &ei_endpfn))

 943                        add_active_range(nid, ei_startpfn, ei_endpfn);

 944}

 

e820_register_active_regions函数的940行,调用e820_find_active_region函数,其中ei_startpfn和ei_endpfn是结果参数:

 

902int __init e820_find_active_region(const struct e820entry *ei,

 903                                  unsigned long start_pfn,

 904                                  unsigned long last_pfn,

 905                                  unsigned long *ei_startpfn,

 906                                  unsigned long *ei_endpfn)

 907{

 908        u64 align = PAGE_SIZE;

 909

 910        *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;

 911        *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;

 912

 913        /* Skip map entries smaller than a page */

 914        if (*ei_startpfn >= *ei_endpfn)

 915                return 0;

 916

 917        /* Skip if map is outside the node */

 918        if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||

 919                                    *ei_startpfn >= last_pfn)

 920                return 0;

 921

 922        /* Check for overlaps */

 923        if (*ei_startpfn < start_pfn)

 924                *ei_startpfn = start_pfn;

 925        if (*ei_endpfn > last_pfn)

 926                *ei_endpfn = last_pfn;

 927

 928        return 1;

 929}

 

e820_find_active_region函数的910行、911行计算出e820.map[i]元素的起始和结束页框的页号,并赋给结果参数ei_startpfn和ei_endpfn。那么回到e820_register_active_regions,943行调用add_active_range函数,并传入参数nid、ei_startpfn和ei_endpfn。这个nid是什么东西?在initmem_init中传入的是0,表示0号NODE。除此之外,传入add_active_range的参数还有刚才获得的ei_startpfn和ei_endpfn页框号,其中,ei_startpfn就是0。

 

3984void __init add_active_range(unsigned int nid, unsigned long start_pfn,

3985                                                unsigned long end_pfn)

3986{

3987        int i;

3988

3989        mminit_dprintk(MMINIT_TRACE, "memory_register",

3990                        "Entering add_active_range(%d, %#lx, %#lx) "

3991                        "%d entries of %d used\n",

3992                        nid, start_pfn, end_pfn,

3993                        nr_nodemap_entries, MAX_ACTIVE_REGIONS);

3994

3995        mminit_validate_memmodel_limits(&start_pfn, &end_pfn);

3996

3997        /* Merge with existing active regions if possible */

3998        for (i = 0; i < nr_nodemap_entries; i++) {

3999                if (early_node_map[i].nid != nid)

4000                        continue;

4001

4002                /* Skip if an existing region covers this new one */

4003                if (start_pfn >= early_node_map[i].start_pfn &&

4004                                end_pfn <= early_node_map[i].end_pfn)

4005                        return;

4006

4007                /* Merge forward if suitable */

4008                if (start_pfn <= early_node_map[i].end_pfn &&

4009                                end_pfn > early_node_map[i].end_pfn) {

4010                        early_node_map[i].end_pfn = end_pfn;

4011                        return;

4012                }

4013

4014                /* Merge backward if suitable */

4015                if (start_pfn < early_node_map[i].start_pfn &&

4016                                end_pfn >= early_node_map[i].start_pfn) {

4017                        early_node_map[i].start_pfn = start_pfn;

4018                        return;

4019                }

4020        }

4021

4022        /* Check that early_node_map is large enough */

4023        if (i >= MAX_ACTIVE_REGIONS) {

4024                printk(KERN_CRIT "More than %d memory regions, truncating\n",

4025                                                 MAX_ACTIVE_REGIONS);

4026                return;

4027        }

4028

4029        early_node_map[i].nid = nid;

4030        early_node_map[i].start_pfn = start_pfn;

4031        early_node_map[i].end_pfn = end_pfn;

4032        nr_nodemap_entries = i + 1;

4033}

 

3989 – 3995行代码主要是打印一些相关信息,我们主要还是关注全局变量early_node_map数组是怎样被初始化的。那么early_node_map到底是个什么东西呢?这个东西定义在mm/page_alloc.c中:

static struct node_active_region __meminitdata early_node_map[MAX_ACTIVE_REGIONS];

struct node_active_region {

       unsigned long start_pfn;

       unsigned long end_pfn;

       int nid;

};

这里关键是数组的大小,也就是MAX_ACTIVE_REGIONS的值,代表early_node_map中可登记的活动内存区域(active region)的最大数量,本来应该等于CONFIG_MAX_ACTIVE_REGIONS。但我们在.config中没有配置它,所以它的值为256。

 

nr_nodemap_entries也是一个全局变量,它的初始值为0。函数3998 – 4020行的循环测试一下start_pfn和end_pfn是否与early_node_map[]的某个已有元素重合,如果已被覆盖或者可以合并,则处理后直接返回。而我们这里,early_node_map[0]也是空的,全是0,所以直接来到4029行,early_node_map[0].nid给赋成0;early_node_map[i].start_pfn给赋值成start_pfn,我们传进来的是e820.map[i]起始地址对应的页号;early_node_map[i].end_pfn给赋值成end_pfn,我们传进来的是e820.map[i]结束地址对应的页框号;最后nr_nodemap_entries被设置为i+1,返回。返回后,全局变量nr_nodemap_entries就是全局数组early_node_map中已使用元素的个数(size)。

 

回到e820_register_active_regions,最后939行的for循环执行完成后,全局early_node_map数组就存放了每个可用e820 RAM的起始地址和结束地址对应的页框号,它们的nid都是0,共有e820.nr_map个元素;全局变量nr_nodemap_entries的值最后也变成了e820 RAM可用内存区的总数e820.nr_map。

 

 

 

建立内存管理架构_第1张图片

 

initmem_init函数在设置好全局变量early_node_map[]数组之后,随后716或723行调用的是sparse_memory_present_with_active_regions(0):

 

3471void __init sparse_memory_present_with_active_regions(int nid)

3472{

3473        int i;

3474

3475        for_each_active_range_index_in_nid(i, nid)

3476                memory_present(early_node_map[i].nid,

3477                                early_node_map[i].start_pfn,

3478                                early_node_map[i].end_pfn);

3479}

 

3475行,有一个for_each_active_range_index_in_nid宏,这个宏后面会经常用到,这里只讲一次,来自mm/page_alloc.c:

#define for_each_active_range_index_in_nid(i, nid) \
        for (i = first_active_region_index_in_nid(nid); i != -1; \
                                i = next_active_region_index_in_nid(i, nid))

其中,first_active_region_index_in_nid函数定义为:

static int __meminit first_active_region_index_in_nid(int nid)

{

       int i;

       for (i = 0; i < nr_nodemap_entries; i++)

              if (nid == MAX_NUMNODES || early_node_map[i].nid == nid)

                     return i;

       return -1;

}

也就是说,i的初始值为该函数的返回值,取决于early_node_map[0].nid的值。由于我们也没有配置CONFIG_NODES_SHIFT,MAX_NUMNODES为1;全局变量early_node_map的所有nid都是0,所以first_active_region_index_in_nid返回0。

 

next_active_region_index_in_nid函数定义如下:

static int __meminit next_active_region_index_in_nid(int index, int nid)

{

       for (index = index + 1; index < nr_nodemap_entries; index++)

              if (nid == MAX_NUMNODES || early_node_map[index].nid == nid)

                     return index;

 

       return -1;

}

 

除非index >= nr_nodemap_entries(此时返回-1,循环结束),否则它将返回index,即从1到nr_nodemap_entries-1,也就是遍历了所有early_node_map数组的元素。for_each_active_range_index_in_nid就是这么一个循环。这说明了一个什么问题?那就是验证了前面的那句话,笔者的这个单机PC的NUMA只有一个NODE。我们没有配置CONFIG_HAVE_MEMORY_PRESENT,所以memory_present是一个空函数。

 

回到initmem_init中,第三步是调用setup_bootmem_allocator()函数来设置引导启动阶段所涉及到的页映射位:

 

775void __init setup_bootmem_allocator(void)

 776{

 777#ifndef CONFIG_NO_BOOTMEM

 778        int nodeid;

 779        unsigned long bootmap_size, bootmap;

 780        /*

 781         * Initialize the boot-time allocator (with low memory only):

 782         */

 783        bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;

 784        bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,

 785                                 PAGE_SIZE);

 786        if (bootmap == -1L)

 787                panic("Cannot find bootmem map of size %ld\n", bootmap_size);

 788        reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");

 789#endif

 790

 791        printk(KERN_INFO "  mapped low ram: 0 - %08lx\n",

 792                 max_pfn_mapped<<PAGE_SHIFT);

 793        printk(KERN_INFO "  low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);

 794

 795#ifndef CONFIG_NO_BOOTMEM

 796        for_each_online_node(nodeid) {

 797                 unsigned long start_pfn, end_pfn;

 798

 799#ifdef CONFIG_NEED_MULTIPLE_NODES

……

 806#else

 807                start_pfn = 0;

 808                end_pfn = max_low_pfn;

 809#endif

 810                bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,

 811                                                 bootmap);

 812        }

 813#endif

 814

 815        after_bootmem = 1;

 816}

 

我们看到,由于我们.config文件中配置了CONFIG_NO_BOOTMEM,所以这个函数也是什么都不干,仅仅打印几个信息并把全局变量after_bootmem设置为1。这个函数结束后,我们的initmem_init也就结束了,至此,内核永久页表建立好了,并且初始化时期内存管理相关的数据结构框架都搭建起来了,下面就该向这个框架内“添砖加瓦”了。

 

 

你可能感兴趣的:(数据结构,框架,集群,struct,each,merge)