内核中描述memory数据结构的演变

内核从 U-Boot 中得知 memory 的有关参数(如大小等)之后,是怎样使用这些参数的?又是怎样一步步演变出各种数据结构,直到复杂得让我们看不懂?

本文希望能对 kernel memory 有个比较清晰的理解。

描述memory的数据结构变化:

early_mem -> meminfo -> memblock -> node, zone, page -> 内核提供的接口函数。

第 4、5 节对页表的建立进行了描述。


1] setup_arch -> parse_early_param -> early_mem

内核会通过输入参数中的 mem=xxxM得到内核可用 memory 的大小。是怎样调到 early_mem的?

可以从下面的back trace 中得到答案。

------------[ cut here ]------------

WARNING: at arch/arm/kernel/setup.c:564 early_mem+0x20/0xc4()
Modules linked in:
Backtrace:
[<c0012058>] (dump_backtrace+0x0/0x110) from [<c05334d0>] (dump_stack+0x18/0x1c)
 r6:00000234 r5:c06ad348 r4:00000000 r3:c0722a08
[<c05334b8>] (dump_stack+0x0/0x1c) from [<c0029260>] (warn_slowpath_common+0x54/0x6c)
[<c002920c>] (warn_slowpath_common+0x0/0x6c) from [<c002929c>] (warn_slowpath_null+0x24/0x2c)
 r8:c06d3a9c r7:c06d3a93 r6:c06d3a97 r5:c06d3a93 r4:c06d3a97
r3:00000009
[<c0029278>] (warn_slowpath_null+0x0/0x2c) from [<c06ad348>] (early_mem+0x20/0xc4)
[<c06ad328>] (early_mem+0x0/0xc4) from [<c06ab9a4>] (do_early_param+0x70/0xb0)
 r5:c06d3a93 r4:c06d9350
[<c06ab934>] (do_early_param+0x0/0xb0) from [<c0041af0>] (parse_args+0x1f4/0x2c0)
 r6:c06d3a97 r5:c060bee9 r4:00000000 r3:c06ab934
[<c00418fc>] (parse_args+0x0/0x2c0) from [<c06ab260>] (parse_early_options+0x38/0x48)
[<c06ab228>] (parse_early_options+0x0/0x48) from [<c06ab680>] (parse_early_param+0x34/0x48)
[<c06ab64c>] (parse_early_param+0x0/0x48) from [<c06ada50>] (setup_arch+0x4f8/0x8b4)
 r4:c06cde4c r3:00000000
[<c06ad558>] (setup_arch+0x0/0x8b4) from [<c06ab71c>] (start_kernel+0x88/0x2a0)
[<c06ab694>] (start_kernel+0x0/0x2a0) from [<80008040>] (0x80008040)
 r7:c06f3f9c r6:c06d5b14 r5:c06ecee4 r4:10c53c7d
---[ end trace 1b75b31a2719ed1c ]---

1.1] early_mem: get the memory size and physical start address;

Memory: usermem==0 value meminfo.nr_banks = 0
Memory: how call into early_mem
Memory: get the memory start phy Address 0x80000000, size 0x28000000

1.2] early_mem -> arm_add_memory: increments the nr_banks member of meminfo;

early_mem may be called multiple times.
Memory: arm_add_memory: start 0x80000000, size 0x28000000


2] setup_arch -> sanity_check_meminfo: process the meminfo: membank

Memory:sanity_check_meminfo:banks 0x1
Memory:sanity_check_meminfo:arm_lowmem_limit 0xa8000000, high_memory 0xe8000000

3] arm_memblock_init :  memblock

3.1] Contiguous Memory Allocator for DMA mapping framework
cma: Memory: dma size_cmdline 0xffffffff
cma: Memory: dma selected size 0x1000000 [0xa8000000 - a7000000]
cma: CMA: reserved 16 MiB at a7000000

MEMBLOCK configuration:
 memory size = 0x28000000[640M] reserved size = 0x19436fc[25M]
 memory.cnt  = 0x1
 memory[0x0]    [0x00000080000000-0x000000a7ffffff], 0x28000000 bytes

 reserved.cnt  = 0x3
 /*16K #define SWAPPER_PG_DIR_SIZE (PTRS_PER_PGD * sizeof(pgd_t)): 2048*8*/
 reserved[0x0]  [0x00000080004000-0x00000080007fff], 0x4000 bytes
 reserved[0x1]  [0x00000080008180-0x0000008094787b], 0x93f6fc bytes
 reserved[0x2]  [0x000000a7000000-0x000000a7ffffff], 0x1000000 bytes [CMA]

Memory policy: ECC disabled, Data cache writealloc


4] arch/arm/mm/mmu.c:    prepare_page_table

[0, 0xbf000000], [0xbf000000, 0xc0000000][0xe8000000, VMALLOC_START]
Memory: prepare_page_table MODULES_VADDR0xbf000000, PMD_SIZE0x200000

Memory: prepare_page_table PAGE_OFFSETc0000000, PMD_SIZE0x200000

Memory: prepare_page_table end0xa8000000, arm_lowmem_limit0xa8000000

这里的pmd_clear(pmd_off_k(addr));做了些什么事情?

4.1 找到 page directory item

先看:pmd_off_k(addr),又出来一个更狠的角色
/*
 * Return the pmd entry for the kernel virtual address @virt by chaining
 * pgd_offset_k() -> pud_offset() -> pmd_offset().
 */
static inline pmd_t *pmd_off_k(unsigned long virt)
{
    return pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt);
}

/* to find an entry in a kernel page-table-directory */
#define pgd_offset_k(addr)    pgd_offset(&init_mm, addr)
pgd_offset_k(addr) 得到的是 directory item 的地址。

/* to find an entry in a page-table-directory
 * arch/arm/include/asm/pgtable-2level.h
 */
#define pgd_offset(mm, addr)    ((mm)->pgd + pgd_index(addr))
#define pgd_index(addr)        ((addr) >> PGDIR_SHIFT)

现在看pud_offset(pgd, start)
文件:include/asm-generic/4level-fixup.h中
#define pud_offset(pgd, start)        (pgd)

pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt) ->
pmd_offset(pgd_offset_k(virt), virt)
又:
/*
 * Return @pud itself, recast to pmd_t *.  @addr is not used.
 */
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
{
    return (pmd_t *)pud;
}
pmd_offset(pgd_offset_k(virt), virt) ->
pgd_offset_k(virt) -> pgd_offset(&init_mm, addr) -> ((mm)->pgd + pgd_index(addr))

4.2 找到这个地址干什么?

/*
 * Zero both entries pmdp[0] and pmdp[1], then call clean_pmd_entry()
 * on pmdp.
 */
#define pmd_clear(pmdp)            \
    do {                \
        pmdp[0] = __pmd(0);    \
        pmdp[1] = __pmd(0);    \
        clean_pmd_entry(pmdp);    \
    } while (0)


/* Wrap a raw value in the pte_t / pmd_t type via a compound literal. */
#define __pte(x)        ((pte_t) { (x) } )
#define __pmd(x)        ((pmd_t) { (x) } )

arch/arm/include/asm/tlbflush.h

/*
 * Both expand to __tlb_op(); they differ only in the operand prefix that
 * is prepended to @regs: "p15, 0, %0, " versus "p15, 1, %0, ".
 */
#define tlb_op(f, regs, arg)    __tlb_op(f, "p15, 0, %0, " regs, arg)
#define tlb_l2_op(f, regs, arg)    __tlb_op(f, "p15, 1, %0, " regs, arg)


/*
 * Issue the TLB_DCLEAN operation and the TLB_L2CLEAN_FR operation on the
 * address @pmd.
 */
static inline void clean_pmd_entry(void *pmd)
{
    /*
     * NOTE(review): __tlb_flag is not referenced directly in this body;
     * presumably the __tlb_op() expansion reads it — confirm in tlbflush.h.
     */
    const unsigned int __tlb_flag = __cpu_tlb_flags;

    tlb_op(TLB_DCLEAN, "c7, c10, 1    @ flush_pmd", pmd);
    tlb_l2_op(TLB_L2CLEAN_FR, "c15, c9, 1  @ L2 flush_pmd", pmd);
}

4.3prepare_page_table

prepare_page_table:这里说的pmd_clear就是把主内核页表目录的对应项清0.
1]Clear out all the mappings below the kernel image.
2]Clear out all the kernel space mappings, except for the first memory bank, up to the vmalloc region.
3]这里没有清所说的低端内存。


5 map_lowmem: memblock_region

/*
 * Walk the memblock "memory" regions and create an MT_MEMORY kernel mapping
 * for the part of each region that lies below arm_lowmem_limit.
 */
static void __init map_lowmem(void)
{
    struct memblock_region *reg;
    phys_addr_t start;
    phys_addr_t end;
    struct map_desc map;

    /* Map all the lowmem memory banks. */
    for_each_memblock(memory, reg) {
        start = reg->base;
        end = start + reg->size;

        /* Clamp the region to the lowmem limit. */
        if (end > arm_lowmem_limit)
            end = arm_lowmem_limit;
        /* Nothing of this region is below the limit: stop walking. */
        if (start >= end)
            break;
/* Quoted here for reference: physical address -> page frame number. */
#define    __phys_to_pfn(paddr)    ((unsigned long)((paddr) >> PAGE_SHIFT))

        /* Translate the memblock region into a map_desc. */
        map.pfn = __phys_to_pfn(start);
        map.virtual = __phys_to_virt(start);
        map.length = end - start;
        map.type = MT_MEMORY;

        create_mapping(&map, false);
    }
}


这里有关 memory的数据结构从memblock_region到map_desc
/* Describes one mapping request handed to create_mapping(). */
struct map_desc {
    unsigned long virtual;  /* virtual start address of the mapping */
    unsigned long pfn;      /* page frame number of the physical start */
    unsigned long length;   /* length of the mapping in bytes */
    unsigned int type;      /* index into mem_types[], e.g. MT_MEMORY */
};

5.1 how to create the first and second level table

/************************************************************************************/
/*
 * Install the kernel mapping described by @md, one pgd entry at a time.
 *
 * @md:          virtual address, pfn, length and memory type of the mapping
 * @force_pages: passed through unchanged to alloc_init_pud()
 */
static void __init create_mapping(struct map_desc *md, bool force_pages)
{
    unsigned long addr, length, end;
    phys_addr_t phys;
    const struct mem_type *type;
    pgd_t *pgd;

    type = &mem_types[md->type];

    /* Mask the start with PAGE_MASK ... */
    addr = md->virtual & PAGE_MASK;
    phys = __pfn_to_phys(md->pfn);
    /* ... and add the masked-off in-page offset to the length before aligning. */
    length = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));

    pgd = pgd_offset_k(addr);
    end = addr + length;

    do {
        /* Next PGDIR_SIZE boundary, or @end if that comes first. */
        unsigned long next = pgd_addr_end(addr, end);

        alloc_init_pud(pgd, addr, next, phys, type, force_pages);

        /* Advance the physical address by the amount just mapped. */
        phys += next - addr;
        addr = next;
    } while (pgd++, addr != end);
}

/************************************************************************************/
/*
 * Walk the pud entries under @pgd that cover [addr, end) and hand each
 * sub-range to alloc_init_section().  @phys is advanced in step with @addr.
 */
static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr,
    unsigned long end, unsigned long phys, const struct mem_type *type,
    bool force_pages)
{
    pud_t *pud = pud_offset(pgd, addr);
    unsigned long next;

    do {
        next = pud_addr_end(addr, end);
        alloc_init_section(pud, addr, next, phys, type, force_pages);
        phys += next - addr;
    } while (pud++, addr = next, addr != end);
}


/************************************************************************************/
/*
 * Populate the pmd entries covering [addr, end) with either section
 * mappings or a pointer to a pte table.
 *
 * The two cases were previously pasted as two separate definitions of the
 * same function; they are the two branches of one if/else:
 *   3.1] section mapping - loop over pmd in SECTION_SIZE steps
 *   3.2] page entries    - delegated to alloc_init_pte()
 *
 * @pud:         pud entry that contains the pmd entries to fill
 * @addr, @end:  virtual range to map
 * @phys:        physical address that @addr maps to
 * @type:        memory type supplying prot_sect for the section case
 * @force_pages: when true, never use section mappings
 */
static void __init alloc_init_section(pud_t *pud, unsigned long addr,
                      unsigned long end, phys_addr_t phys,
                      const struct mem_type *type,
                      bool force_pages)
{
    pmd_t *pmd = pmd_offset(pud, addr);

    /*
     * Try a section mapping - end, addr and phys must all be aligned
     * to a section boundary.  Note that PMDs refer to the individual
     * L1 entries, whereas PGDs refer to a group of L1 entries making
     * up one logical pointer to an L2 table.
     */
    if (type->prot_sect && ((addr | end | phys) & ~SECTION_MASK) == 0 &&
        !force_pages) {
        /* Remember the first entry so it can be flushed afterwards. */
        pmd_t *p = pmd;

        /* Start at the second entry when SECTION_SIZE bit of addr is set. */
        if (addr & SECTION_SIZE)
            pmd++;

        do {
            *pmd = __pmd(phys | type->prot_sect);
            phys += SECTION_SIZE;
        } while (pmd++, addr += SECTION_SIZE, addr != end);

        flush_pmd_entry(p);
    } else {
        /*
         * No need to loop; pte's aren't interested in the
         * individual L1 entries.
         */
        alloc_init_pte(pmd, addr, end, __phys_to_pfn(phys), type);
    }
}

/************************************************************************************/
/*
 * Fill the pte entries for [addr, end) with consecutive page frames starting
 * at @pfn, using @type->prot_pte as protection bits, then install the pte
 * table into @pmd with @type->prot_l1.
 */
static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
                  unsigned long end, unsigned long pfn,
                  const struct mem_type *type)
{
    /*allocate memory for page table item of each pmd*/
    pte_t *start_pte = early_pte_alloc(pmd);
    /* First pte to write: the slot for @addr within the table. */
    pte_t *pte = start_pte + pte_index(addr);

    /* If replacing a section mapping, the whole section must be replaced */
    BUG_ON(pmd_bad(*pmd) && ((addr | end) & ~PMD_MASK));

    /* One pte per PAGE_SIZE step, each pointing at the next page frame. */
    do {
        set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)), 0);
        pfn++;
    } while (pte++, addr += PAGE_SIZE, addr != end);

    early_pte_install(pmd, start_pte, type->prot_l1);
}

/************************************************************************************/
/*
 * Point @pmd at the pte table @pte (converted to a physical address) with
 * protection bits @prot, then BUG if the resulting pmd tests as bad.
 */
static void __init early_pte_install(pmd_t *pmd, pte_t *pte, unsigned long prot)
{
    __pmd_populate(pmd, __pa(pte), prot);
    BUG_ON(pmd_bad(*pmd));
}

/************************************************************************************/
/*
 * Write the two pmd entries pmdp[0] and pmdp[1] so that they point into the
 * pte table at physical address @pte, then flush the pmd entry.
 */
static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte,
                  pmdval_t prot)
{
    /* First entry: table address offset by PTE_HWTABLE_OFF, plus @prot. */
    pmdval_t pmdval = (pte + PTE_HWTABLE_OFF) | prot;
    pmdp[0] = __pmd(pmdval);
    /*
     * All pte values have already been written; here the table is merely
     * split into two parts: the second entry points 256 ptes further on.
     */
    pmdp[1] = __pmd(pmdval + 256 * sizeof(pte_t));

    flush_pmd_entry(pmdp);
}

/*1] The outer loop (in create_mapping) steps by PGDIR_SIZE = 2^21 = 2M; its
 *   control variables are pgd and PGDIR_SIZE.
 *2] The inner loop in alloc_init_pud is controlled by pud and PUD_SIZE;
 *   because pgd == pud it actually runs only once.
 *3] The innermost level, alloc_init_section, has two parts:
 *3.1] a section mapping: the loop is controlled by pmd and SECTION_SIZE.
 *3.2] page entries: the loop is controlled by PAGE_SIZE and pte.
 *
 * pgd_addr_end(addr, end) yields the next PGDIR_SIZE boundary above addr,
 * or end if that comes first; the "- 1" on both sides of the comparison
 * keeps the test correct when the boundary wraps to 0.
 */
#define pgd_addr_end(addr, end)                        \
({    unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;    \
    (__boundary - 1 < (end) - 1)? __boundary: (end);        \
})



/* to find an entry in a kernel page-table-directory */
#define pgd_offset_k(addr)    pgd_offset(&init_mm, addr)
pgd_offset_k(addr) 得到的是 directory item 的地址。

/* to find an entry in a page-table-directory
 * arch/arm/include/asm/pgtable-2level.h
 */
#define pgd_offset(mm, addr)    ((mm)->pgd + pgd_index(addr))
#define pgd_index(addr)        ((addr) >> PGDIR_SHIFT)

/*
 * PMD_SHIFT determines the size of the area a second-level page table can map
 * PGDIR_SHIFT determines what a third-level page table entry can map
 */
/* Both a pmd and a pgd entry cover 2^21 bytes = 2MB here. */
#define PMD_SHIFT        21
#define PGDIR_SHIFT        21

/* Size of, and mask for rounding down to, a PMD / PGDIR boundary. */
#define PMD_SIZE        (1UL << PMD_SHIFT)
#define PMD_MASK        (~(PMD_SIZE-1))
#define PGDIR_SIZE        (1UL << PGDIR_SHIFT)
#define PGDIR_MASK        (~(PGDIR_SIZE-1))

/*
 * section address mask and size definitions.
 */
#define SECTION_SHIFT        20
#define SECTION_SIZE        (1UL << SECTION_SHIFT)
#define SECTION_MASK        (~(SECTION_SIZE-1))

/*得到pmd_t是pud_t,而
crash> pgd_t
typedef unsigned int [2] pgd_t;
SIZE: 8
crash> pud_t
typedef struct {
    pgd_t pgd;
} pud_t;
SIZE: 8
crash> pmd_t
typedef unsigned int pmd_t;
SIZE: 4
crash> pte_t
typedef unsigned int pte_t;
SIZE: 4
返回的是8个字节的首地址,8个字节中的后面的高4个字节怎样使用的?
pmd++
 */

ARM hardware page tables
§  1st level contains 4096 entries (4 pages for PGD), each entry being either
   -  a 1MB section, or
   -  a pointer to a 2nd level table
§  Implementation-defined 16MB supersections
§  2nd level contains 256 entries pointing to 4KB page each
§  1KB per 2nd level page table
   -  a 2nd level page table does not fill a full 4K page

ARM Linux workarounds
§  Separate array for the Linux PTE bits
§  1st level entry consists of two 32-bit locations pointing to 2KB 2nd level
   page table entries


5.2 how to allocate memory for pte

/* Allocate @sz bytes, using @sz as the alignment as well. */
static void __init *early_alloc(unsigned long sz)
{
    return early_alloc_aligned(sz, sz);
}

/*
 * Allocate @sz bytes from memblock with alignment @align, zero the block,
 * and return its kernel virtual address.
 */
static void __init *early_alloc_aligned(unsigned long sz, unsigned long align)
{
    void *block;

    /* memblock hands back a physical address; convert it with __va(). */
    block = __va(memblock_alloc(sz, align));
    memset(block, 0, sz);

    return block;
}

/* Allocate with the address limit set to MEMBLOCK_ALLOC_ACCESSIBLE. */
phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
{
    return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
}
/*
 * Allocate @size bytes aligned to @align below @max_addr.
 * Panics when the allocation fails; otherwise returns the physical address.
 */
phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
    phys_addr_t base = __memblock_alloc_base(size, align, max_addr);

    /* A return of 0 from the helper signals failure. */
    if (!base)
        panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
              (unsigned long long) size, (unsigned long long) max_addr);

    return base;
}
/*
 * Non-panicking variant: forwards to memblock_alloc_base_nid() with
 * MAX_NUMNODES as the node id and returns its result (0 on failure).
 */
phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
    return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES);
}

/*
 * Find a free range of @size bytes (rounded up to @align) below @max_addr
 * and reserve it.  Returns the physical base address, or 0 when no range
 * was found or the reservation failed.
 */
static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
                    phys_addr_t align, phys_addr_t max_addr,
                    int nid)
{
    phys_addr_t base;

    /* align @size to avoid excessive fragmentation on reserved array */
    size = round_up(size, align);

    base = memblock_find_in_range_node(0, max_addr, size, align, nid);
    if (!base)
        return 0;

    /* memblock_reserve() returns non-zero on failure. */
    if (memblock_reserve(base, size))
        return 0;

    return base;
}

5.3 how to value the pte

#define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext)

/*从函数cpu_v7_set_pte_ext可以看出:每个entry被存储两次,一次在offset 0(Linux version),一次在offset 2048(hardware version)*/
r0: pte address, r1: pte content(要写入的pte值), r2: ext(这里为0)
crash> dis cpu_v7_set_pte_ext -l    /*linux version*/
0xc0019a0c <cpu_v7_set_pte_ext>:        str     r1, [r0]

0xc0019a10 <cpu_v7_set_pte_ext+4>:      bic     r3, r1, #1008   ; 0x3f0
0xc0019a14 <cpu_v7_set_pte_ext+8>:      bic     r3, r3, #3
0xc0019a18 <cpu_v7_set_pte_ext+12>:     orr     r3, r3, r2
0xc0019a1c <cpu_v7_set_pte_ext+16>:     orr     r3, r3, #18
0xc0019a20 <cpu_v7_set_pte_ext+20>:     tst     r1, #16
0xc0019a24 <cpu_v7_set_pte_ext+24>:     orrne   r3, r3, #64     ; 0x40
0xc0019a28 <cpu_v7_set_pte_ext+28>:     eor     r1, r1, #64     ; 0x40
0xc0019a2c <cpu_v7_set_pte_ext+32>:     tst     r1, #192        ; 0xc0
0xc0019a30 <cpu_v7_set_pte_ext+36>:     orrne   r3, r3, #512    ; 0x200
0xc0019a34 <cpu_v7_set_pte_ext+40>:     tst     r1, #256        ; 0x100
0xc0019a38 <cpu_v7_set_pte_ext+44>:     orrne   r3, r3, #32
0xc0019a3c <cpu_v7_set_pte_ext+48>:     tst     r1, #512        ; 0x200
0xc0019a40 <cpu_v7_set_pte_ext+52>:     orrne   r3, r3, #1
0xc0019a44 <cpu_v7_set_pte_ext+56>:     tst     r1, #2
0xc0019a48 <cpu_v7_set_pte_ext+60>:     tstne   r1, #1
0xc0019a4c <cpu_v7_set_pte_ext+64>:     moveq   r3, #0
                    /*hardware version is stored at +2048 bytes*/
0xc0019a50 <cpu_v7_set_pte_ext+68>:     str     r3, [r0, #2048]!        ; 0x800
0xc0019a54 <cpu_v7_set_pte_ext+72>:     mcr     15, 0, r0, cr7, cr10, {1}
0xc0019a58 <cpu_v7_set_pte_ext+76>:     mov     pc, lr

这里把两项合成一项
pfn_pte(pfn, __pgprot(type->prot_pte))
#define __pgprot(x)     ((pgprot_t) { (x) } )

/* Build a pte value: physical address of page frame @pfn OR'ed with @prot. */
#define pfn_pte(pfn,prot)    __pte(__pfn_to_phys(pfn) | pgprot_val(prot))
/* pgprot_val() yields its argument unchanged. */
#define pgprot_val(x)   (x)

/*先看一下 mem_type *type */

crash> mem_type
struct mem_type {
    pteval_t prot_pte;
    pmdval_t prot_l1;
    pmdval_t prot_sect;
    unsigned int domain;
}
SIZE: 16

[MT_MEMORY] = {
    .prot_pte  = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
    .prot_l1   = PMD_TYPE_TABLE,
    .prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
    .domain    = DOMAIN_KERNEL,
},


6 dma_contiguous_remap



/*
 * Remap every region recorded in dma_mmu_remap[] as MT_MEMORY_DMA_READY.
 *
 * For each region: clamp it to arm_lowmem_limit, clear the existing
 * low-memory pmd entries covering it, then recreate the mapping through
 * iotable_init().
 */
void __init dma_contiguous_remap(void)
{
    int i;
    for (i = 0; i < dma_mmu_remap_num; i++) {
        phys_addr_t start = dma_mmu_remap[i].base;
        phys_addr_t end = start + dma_mmu_remap[i].size;
        struct map_desc map;
        unsigned long addr;

        if (end > arm_lowmem_limit)
            end = arm_lowmem_limit;
        /*
         * A region entirely above the lowmem limit has nothing to remap.
         * Skip just this region: returning here would silently leave
         * every later region in the table un-remapped.
         */
        if (start >= end)
            continue;

        map.pfn = __phys_to_pfn(start);
        map.virtual = __phys_to_virt(start);
        map.length = end - start;
        map.type = MT_MEMORY_DMA_READY;

        /*
         * Clear previous low-memory mapping
         */
        for (addr = __phys_to_virt(start); addr < __phys_to_virt(end);
             addr += PGDIR_SIZE)
            pmd_clear(pmd_off_k(addr));

        iotable_init(&map, 1);
    }
}


7. devicemaps_init

/*
 * Set up the device mappings.  Since we clear out the page tables for all
 * mappings above VMALLOC_START, we will remove any debug device mappings.
 * This means you have to be careful how you debug this function, or any
 * called function.  This means you can't use any function or debugging
 * method which may touch any device, otherwise the kernel _will_ crash.
 */
/*
 * @mdesc: machine descriptor; its optional map_io() hook is called to map
 *         the statically mapped devices.
 *
 * Steps: allocate and initialise the vector page, clear every pmd entry
 * from VMALLOC_START upwards, map the optional cache-flushing regions, map
 * the vector page, call mdesc->map_io(), and finally flush TLB and caches.
 */
static void __init devicemaps_init(struct machine_desc *mdesc)
{
    struct map_desc map;
    unsigned long addr;
    void *vectors;

    /*
     * Allocate the vector page early.
     */
    vectors = early_alloc(PAGE_SIZE);

    early_trap_init(vectors);

    /* The loop ends when addr wraps around to 0 at the top of the address space. */
    for (addr = VMALLOC_START; addr; addr += PMD_SIZE)
        pmd_clear(pmd_off_k(addr));


    /*
     * Map the cache flushing regions.
     */
#ifdef FLUSH_BASE
    map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS);
    map.virtual = FLUSH_BASE;
    map.length = SZ_1M;
    map.type = MT_CACHECLEAN;
    /* create_mapping() takes (md, force_pages); pass false like the calls below. */
    create_mapping(&map, false);
#endif
#ifdef FLUSH_BASE_MINICACHE
    map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS + SZ_1M);
    map.virtual = FLUSH_BASE_MINICACHE;
    map.length = SZ_1M;
    map.type = MT_MINICLEAN;
    create_mapping(&map, false);
#endif
    /*
     * Create a mapping for the machine vectors at the high-vectors
     * location (0xffff0000).  If we aren't using high-vectors, also
     * create a mapping at the low-vectors virtual address.
     */
    map.pfn = __phys_to_pfn(virt_to_phys(vectors));
    map.virtual = 0xffff0000;
    map.length = PAGE_SIZE;
    map.type = MT_HIGH_VECTORS;
    create_mapping(&map, false);

    if (!vectors_high()) {
        /* Same page frame and length, mapped a second time at address 0. */
        map.virtual = 0;
        map.type = MT_LOW_VECTORS;
        create_mapping(&map, false);
    }

    /*
     * Ask the machine support to map in the statically mapped devices.
     */
    if (mdesc->map_io)
        mdesc->map_io();

    /*
     * Finally flush the caches and tlb to ensure that we're in a
     * consistent state wrt the writebuffer.  This also ensures that
     * any write-allocated cache lines in the vector page are written
     * back.  After this point, we can start to touch devices again.
     */
    local_flush_tlb_all();
    flush_cache_all();
}

7.1 如果有平台相关的map_io

.map_io        = ns115_map_io,

/*
 *ns115 io table
 */
/* Static I/O mappings for ns115, handed to iotable_init() by ns115_map_io(). */
struct map_desc ns115_io_desc[] __initdata = {
    {
        /* GIC CPU interface: physical base mapped at its IO_ADDRESS(). */
        .virtual    = IO_ADDRESS(NS115_GIC_CPU_BASE),
        .pfn        = __phys_to_pfn(NS115_GIC_CPU_BASE),
        .length        = SZ_256,
        .type        = MT_DEVICE,
    },
};
/*
 * Platform map_io hook: register the entries of ns115_io_desc[] and set
 * the consistent DMA size to SZ_16M - SZ_2M.
 */
void __init ns115_map_io(void)
{
    iotable_init(ns115_io_desc, ARRAY_SIZE(ns115_io_desc));
    init_consistent_dma_size(SZ_16M - SZ_2M);
}

静态映射:没有占用虚拟地址
arch/arm/mach-ns115/include/mach/hardware.h
/*
 * Statically mapped addresses:
 *
 * 10xx xxxx -> fbxx xxxx
 * 1exx xxxx -> fdxx xxxx
 * 1fxx xxxx -> fexx xxxx
 */
#define IO_ADDRESS(x)        (((x) & 0x03ffffff) + 0xfb000000)


8. kmap_init

/*
 * With CONFIG_HIGHMEM, make sure a pte table is installed for the pmd entry
 * covering PKMAP_BASE and store the pte pointer for PKMAP_BASE in
 * pkmap_page_table.  Without CONFIG_HIGHMEM the function body is empty.
 */
static void __init kmap_init(void)
{
#ifdef CONFIG_HIGHMEM
    pkmap_page_table = early_pte_alloc_and_install(pmd_off_k(PKMAP_BASE),
        PKMAP_BASE, _PAGE_KERNEL_TABLE);
#endif
}

/* The pkmap window starts one PMD_SIZE below PAGE_OFFSET. */
#define PKMAP_BASE        (PAGE_OFFSET - PMD_SIZE)


/*
 * Ensure @pmd has a pte table (allocating and installing one with @prot
 * when the entry is empty) and return the pte slot for @addr in it.
 */
static pte_t * __init early_pte_alloc_and_install(pmd_t *pmd,
    unsigned long addr, unsigned long prot)
{
    /* Only an empty pmd entry gets a freshly allocated table. */
    if (pmd_none(*pmd))
        early_pte_install(pmd, early_pte_alloc(pmd), prot);

    BUG_ON(pmd_bad(*pmd));

    return pte_offset_kernel(pmd, addr);
}
#define pte_offset_kernel(pmd,addr)    (pmd_page_vaddr(*(pmd)) + pte_index(addr))


9] paging_init -> bootmem_init

bootmem_init引入了memory的管理结构 node, zone and page
每个 page 都对应一个 page 结构体,存放在数组 mem_map 中。
/*
 * Determine the pfn limits, set up the bootmem allocator for low memory,
 * then initialise the node/zone structures via arm_bootmem_free().
 */
void __init bootmem_init(void)
{
    unsigned long min, max_low, max_high;

    max_low = max_high = 0;
    /* Observed on this board: min 0x80000, max_low 0xa8000, max_high 0xa8000 */
    find_limits(&min, &max_low, &max_high);

    arm_bootmem_init(min, max_low);

    /* CONFIG_SPARSEMEM is not defined here, so arm_memory_present() does nothing. */
    arm_memory_present();

    /*
     * sparse_init() needs the bootmem allocator up and running.
     */
    /* CONFIG_SPARSEMEM is not defined here, so sparse_init() does nothing. */
    sparse_init();

    /*
     * Now free the memory - free_area_init_node needs
     * the sparse mem_map arrays initialized by sparse_init()
     * for memmap_init_zone(), otherwise all PFNs are invalid.
     */
    arm_bootmem_free(min, max_low, max_high);
}

/*******************************************************************/

9.1] get the pfn information from the meminfo


/*
 * Derive the pfn limits from meminfo:
 *   *min      - start pfn of bank 0
 *   *max_low  - end pfn of the bank just before the first highmem bank
 *   *max_high - end pfn of the last bank
 */
static void __init find_limits(unsigned long *min, unsigned long *max_low,
                   unsigned long *max_high)
{
    struct meminfo *mi = &meminfo;
    int i;

    /* This assumes the meminfo array is properly sorted */
    *min = bank_pfn_start(&mi->bank[0]);
    for_each_bank (i, mi)/* stop at the first highmem bank */
        if (mi->bank[i].highmem)
                break;
    /*
     * Despite the names, both values are end pfns: max_low is the end of
     * low memory and max_high the end of all memory.
     * NOTE(review): presumably i == nr_banks when no bank is highmem, which
     * makes the two values equal — confirm against for_each_bank().
     */
    *max_low = bank_pfn_end(&mi->bank[i - 1]);
    *max_high = bank_pfn_end(&mi->bank[mi->nr_banks - 1]);
}

/*******************************************************************/

9.2] arm_bootmem_init

/*
 * Set up the bootmem allocator for the pfn range [start_pfn, end_pfn):
 * allocate its bitmap from memblock, initialise node 0, then replay the
 * memblock "memory" regions as free and the "reserved" regions as reserved.
 */
static void __init arm_bootmem_init(unsigned long start_pfn,
    unsigned long end_pfn)
{
    struct memblock_region *reg;
    unsigned int boot_pages;
    phys_addr_t bitmap;
    pg_data_t *pgdat;

    /*
     * Allocate the bootmem bitmap page.  This must be in a region
     * of memory which has already been mapped.
     */
    boot_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
    /* memblock_alloc_base(size in bytes, alignment, max physical address) */
    bitmap = memblock_alloc_base(boot_pages << PAGE_SHIFT, L1_CACHE_BYTES,
                __pfn_to_phys(end_pfn));

    /*
     * Initialise the bootmem allocator, handing the
     * memory banks over to bootmem.
     */
    /*extern struct pglist_data contig_page_data;
     *#define NODE_DATA(nid)        (&contig_page_data)
     *#define NODE_MEM_MAP(nid)    mem_map,
     * what about the bootmem node?
     * This is where the node data structure pg_data_t is introduced.
     */
    node_set_online(0);
    pgdat = NODE_DATA(0);
    init_bootmem_node(pgdat, __phys_to_pfn(bitmap), start_pfn, end_pfn);

    /* Author's note: mark_bootmem is called to set the reserved/free bits. */
    /* Free the lowmem regions from memblock into bootmem. */
    for_each_memblock(memory, reg) {
        unsigned long start = memblock_region_memory_base_pfn(reg);
        unsigned long end = memblock_region_memory_end_pfn(reg);

        /* Clamp to the end of the range handled by bootmem. */
        if (end >= end_pfn)
            end = end_pfn;
        if (start >= end)
            break;

        free_bootmem(__pfn_to_phys(start), (end - start) << PAGE_SHIFT);
    }

    /* Reserve the lowmem memblock reserved regions in bootmem. */
    for_each_memblock(reserved, reg) {
        unsigned long start = memblock_region_reserved_base_pfn(reg);
        unsigned long end = memblock_region_reserved_end_pfn(reg);

        if (end >= end_pfn)
            end = end_pfn;
        if (start >= end)
            break;

        reserve_bootmem(__pfn_to_phys(start),
                    (end - start) << PAGE_SHIFT, BOOTMEM_DEFAULT);
    }
}

/*******************************************************************/

9.3] using the pfn information to value the zone


/*
 * Compute per-zone sizes and hole sizes (in pages) from the pfn limits and
 * the memblock "memory" regions, then pass them to free_area_init_node()
 * for node 0.
 *
 * @min:      first pfn of the node
 * @max_low:  end pfn of low memory
 * @max_high: end pfn of all memory
 */
static void __init arm_bootmem_free(unsigned long min, unsigned long max_low,
    unsigned long max_high)
{
    unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
    struct memblock_region *reg;

    /*
     * initialise the zones.
     */
    memset(zone_size, 0, sizeof(zone_size));

    /*
     * The memory size has already been determined.  If we need
     * to do anything fancy with the allocation of this memory
     * to the zones, now is the time to do it.
     */
    zone_size[0] = max_low - min;
#ifdef CONFIG_HIGHMEM
    zone_size[ZONE_HIGHMEM] = max_high - max_low;
#endif

    /*
     * Calculate the size of the holes.
     *  holes = node_size - sum(bank_sizes)
     */
    /* Start from the full zone span and subtract every populated range. */
    memcpy(zhole_size, zone_size, sizeof(zhole_size));
    for_each_memblock(memory, reg) {
        unsigned long start = memblock_region_memory_base_pfn(reg);
        unsigned long end = memblock_region_memory_end_pfn(reg);

        /* Part of the region that falls below max_low (zone 0). */
        if (start < max_low) {
            unsigned long low_end = min(end, max_low);
            zhole_size[0] -= low_end - start;
        }
#ifdef CONFIG_HIGHMEM
        /* Part of the region that falls at or above max_low (highmem). */
        if (end > max_low) {
            unsigned long high_start = max(start, max_low);
            zhole_size[ZONE_HIGHMEM] -= end - high_start;
        }
#endif
    }

    free_area_init_node(0, zone_size, min, zhole_size);
}

/*******************************************************************/

9.3.1] free_area_init_node


/*
 * Initialise node @nid: record its id and start pfn, compute its page
 * totals, allocate its mem_map and set up its zones.
 *
 * @zones_size:     per-zone sizes in pages
 * @node_start_pfn: first pfn of the node
 * @zholes_size:    per-zone hole sizes in pages
 */
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
        unsigned long node_start_pfn, unsigned long *zholes_size)
{
    pg_data_t *pgdat = NODE_DATA(nid);

    pgdat->node_id = nid;
    pgdat->node_start_pfn = node_start_pfn;
    calculate_node_totalpages(pgdat, zones_size, zholes_size);

    alloc_node_mem_map(pgdat);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
    printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
        nid, (unsigned long)pgdat,
        (unsigned long)pgdat->node_mem_map);
#endif

    free_area_init_core(pgdat, zones_size, zholes_size);
}


/*******************************************************************/

9.3.1.1]alloc_node_mem_map:如名字所示分配mem_map

/*
 * Allocate the struct page array (node_mem_map) for @pgdat when
 * CONFIG_FLAT_NODE_MEM_MAP is set, and publish node 0's array as the global
 * mem_map when CONFIG_NEED_MULTIPLE_NODES is not set.
 */
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
{
    /* Skip empty nodes */
    if (!pgdat->node_spanned_pages)
        return;

#ifdef CONFIG_FLAT_NODE_MEM_MAP
    /* ia64 gets its own node_mem_map, before this, without bootmem */
    if (!pgdat->node_mem_map) {
        unsigned long size, start, end;
        struct page *map;

        /*
         * The zone's endpoints aren't required to be MAX_ORDER
         * aligned but the node_mem_map endpoints must be in order
         * for the buddy allocator to function correctly.
         */
        start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
        end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
        end = ALIGN(end, MAX_ORDER_NR_PAGES);
        size =  (end - start) * sizeof(struct page);
        map = alloc_remap(pgdat->node_id, size);
        if (!map)
            map = alloc_bootmem_node_nopanic(pgdat, size);
        /* Skip the entries added by rounding @start down. */
        pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
    }
#ifndef CONFIG_NEED_MULTIPLE_NODES
    /*
     * With no DISCONTIG, the global mem_map is just set as node 0's
     */
    if (pgdat == NODE_DATA(0)) {
        mem_map = NODE_DATA(0)->node_mem_map;
    }
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}

/*******************************************************************/

9.3.1.2] Set up the zone data structures

/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
/*
 * @pgdat:       node being initialised
 * @zones_size:  per-zone sizes in pages
 * @zholes_size: per-zone hole sizes in pages
 *
 * Initialises the node-level fields, then every zone of the node in turn.
 */
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
        unsigned long *zones_size, unsigned long *zholes_size)
{
    enum zone_type j;
    int nid = pgdat->node_id;
    unsigned long zone_start_pfn = pgdat->node_start_pfn;
    int ret;

    pgdat_resize_init(pgdat);
    pgdat->nr_zones = 0;
    init_waitqueue_head(&pgdat->kswapd_wait);
    pgdat->kswapd_max_order = 0;
    pgdat_page_cgroup_init(pgdat);

    for (j = 0; j < MAX_NR_ZONES; j++) {
        struct zone *zone = pgdat->node_zones + j;
        unsigned long size, realsize, memmap_pages;
        enum lru_list lru;

        /* size = pages spanned by the zone; realsize = size minus holes. */
        size = zone_spanned_pages_in_node(nid, j, zones_size);
        realsize = size - zone_absent_pages_in_node(nid, j,
                                zholes_size);

        /*
         * Adjust realsize so that it accounts for how much memory
         * is used by this zone for memmap. This affects the watermark
         * and per-cpu initialisations
         */
        memmap_pages =
            PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
        if (realsize >= memmap_pages) {
            realsize -= memmap_pages;
            if (memmap_pages)
                printk(KERN_DEBUG
                       "  %s zone: %lu pages used for memmap\n",
                       zone_names[j], memmap_pages);
        } else
            printk(KERN_WARNING
                "  %s zone: %lu pages exceeds realsize %lu\n",
                zone_names[j], memmap_pages, realsize);

        /* Account for reserved pages */
        if (j == 0 && realsize > dma_reserve) {
            realsize -= dma_reserve;
            printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
                    zone_names[0], dma_reserve);
        }

        /* Highmem zones count towards nr_all_pages only. */
        if (!is_highmem_idx(j))
            nr_kernel_pages += realsize;
        nr_all_pages += realsize;

        zone->spanned_pages = size;
        zone->present_pages = realsize;
#ifdef CONFIG_NUMA
        zone->node = nid;
        zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
                        / 100;
        zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
#endif
        zone->name = zone_names[j];
        /* NOTE(review): looks like a locally added debug trace — confirm. */
        pr_err("free_area_init_core %s\n", zone->name);
        spin_lock_init(&zone->lock);
        spin_lock_init(&zone->lru_lock);
        zone_seqlock_init(zone);
        zone->zone_pgdat = pgdat;

        zone_pcp_init(zone);
        for_each_lru(lru)
            INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
        zone->reclaim_stat.recent_rotated[0] = 0;
        zone->reclaim_stat.recent_rotated[1] = 0;
        zone->reclaim_stat.recent_scanned[0] = 0;
        zone->reclaim_stat.recent_scanned[1] = 0;
        zap_zone_vm_stats(zone);
        zone->flags = 0;
        /* An empty zone needs no usemap, free lists or memmap setup. */
        if (!size)
            continue;

        set_pageblock_order(pageblock_default_order());
        setup_usemap(pgdat, zone, size);
        ret = init_currently_empty_zone(zone, zone_start_pfn,
                        size, MEMMAP_EARLY);
        BUG_ON(ret);
        memmap_init(size, nid, j, zone_start_pfn);
        /* The next zone starts where this one ends. */
        zone_start_pfn += size;
    }
}




10 . build_all_zonelists

Built 1 zonelists in Zone order, mobility grouping on.  Total pages: 162560


printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
Kernel command line: root=/dev/sda2 rw rootwait mem=640M console=ttyS0,115200 init=/init video=nusmartfb:1024x600-16

11. mm_init

/*******************************************************************/
/*
 * Set up kernel memory allocators
 */
/* Calls the memory allocator initialisation routines in the order listed. */
static void __init mm_init(void)
{
    /*
     * page_cgroup requires contiguous pages,
     * bigger than MAX_ORDER unless SPARSEMEM.
     */
    page_cgroup_init_flatmem();
    mem_init();
    kmem_cache_init();
    percpu_init_late();
    pgtable_cache_init();
    vmalloc_init();
}


/*******************************************************************/
/*
 * mem_init() marks the free areas in the mem_map and tells us how much
 * memory is free.  This is done after various parts of the system have
 * claimed their memory after the kernel image.
 */
/*
 * Release the boot-time memory to the page allocator, count free and
 * reserved pages, and print the memory summary followed by the virtual
 * kernel memory layout.
 */
void __init mem_init(void)
{
    unsigned long reserved_pages, free_pages;
    struct memblock_region *reg;
    int i;

    max_mapnr   = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map;

    /* this will put all unused low memory onto the freelists */
    free_unused_memmap(&meminfo);

    totalram_pages += free_all_bootmem();
    free_highpages();
    /* Count the free and reserved pages by walking every bank in meminfo. */
    reserved_pages = free_pages = 0;
    for_each_bank(i, &meminfo) {
        struct membank *bank = &meminfo.bank[i];
        unsigned int pfn1, pfn2;
        struct page *page, *end;

        pfn1 = bank_pfn_start(bank);
        pfn2 = bank_pfn_end(bank);

        /* [page, end) is the struct page range covering this bank. */
        page = pfn_to_page(pfn1);
        end  = pfn_to_page(pfn2 - 1) + 1;

        do {
            if (PageReserved(page))
                reserved_pages++;
            else if (!page_count(page))
                free_pages++;
            page++;
        } while (page < end);
    }

    /*
     * Since our memory may not be contiguous, calculate the
     * real number of pages we have in this system
     */
    printk(KERN_INFO "Memory:");
    num_physpages = 0;
    for_each_memblock(memory, reg) {
        unsigned long pages = memblock_region_memory_end_pfn(reg) -
            memblock_region_memory_base_pfn(reg);
        num_physpages += pages;
        printk(" %ldMB", pages >> (20 - PAGE_SHIFT));
    }
    printk(" = %luMB total\n", num_physpages >> (20 - PAGE_SHIFT));

    printk(KERN_NOTICE "Memory: %luk/%luk available, %luk reserved, %luK highmem\n",
        nr_free_pages() << (PAGE_SHIFT-10),
        free_pages << (PAGE_SHIFT-10),
        reserved_pages << (PAGE_SHIFT-10),
        totalhigh_pages << (PAGE_SHIFT-10));

    /*
     * Each helper expands to three printk arguments: base, top and the
     * size in kB (MLK, MLK_ROUNDUP) or MB (MLM).  The opening printk and
     * these helpers had been dropped from the excerpt, leaving a dangling
     * argument list; they are restored to match the layout output shown
     * in the boot log.
     */
#define MLK(b, t) b, t, ((t) - (b)) >> 10
#define MLM(b, t) b, t, ((t) - (b)) >> 20
#define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K)

    printk(KERN_NOTICE "Virtual kernel memory layout:\n"
        "    vector  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
        "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
        "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
        "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
        "    pkmap   : 0x%08lx - 0x%08lx   (%4ld MB)\n"
        "    modules : 0x%08lx - 0x%08lx   (%4ld MB)\n"
        "      .text : 0x%p" " - 0x%p" "   (%4d kB)\n"
        "      .init : 0x%p" " - 0x%p" "   (%4d kB)\n"
        "      .data : 0x%p" " - 0x%p" "   (%4d kB)\n"
        "       .bss : 0x%p" " - 0x%p" "   (%4d kB)\n",

    MLK(UL(CONFIG_VECTORS_BASE), UL(CONFIG_VECTORS_BASE) +
        (PAGE_SIZE)),

    MLK(FIXADDR_START, FIXADDR_TOP),
    MLM(VMALLOC_START, VMALLOC_END),
    MLM(PAGE_OFFSET, (unsigned long)high_memory),

    MLM(PKMAP_BASE, (PKMAP_BASE) + (LAST_PKMAP) *
        (PAGE_SIZE)),

    MLM(MODULES_VADDR, MODULES_END),

    MLK_ROUNDUP(_text, _etext),
    MLK_ROUNDUP(__init_begin, __init_end),
    MLK_ROUNDUP(_sdata, _edata),
    MLK_ROUNDUP(__bss_start, __bss_stop));

#undef MLK
#undef MLM
#undef MLK_ROUNDUP
}

Memory: 640MB = 640MB total
这里 available + reserved = 611M < 640M ?
what about the kmap?可以申请到的高端内存只能有2M?
Memory: 623420k/623420k available, 31940k reserved, 0K highmem
Virtual kernel memory layout:
    vector  : 0xffff0000 - 0xffff1000   (   4 kB)
    fixmap  : 0xfff00000 - 0xfffe0000   ( 896 kB)
    vmalloc : 0xe8800000 - 0xff000000   ( 360 MB)
    lowmem  : 0xc0000000 - 0xe8000000   ( 640 MB)
    pkmap   : 0xbfe00000 - 0xc0000000   (   2 MB)
    modules : 0xbf000000 - 0xbfe00000   (  14 MB)
      .text : 0xc0008000 - 0xc06ab000   (6796 kB)
      .init : 0xc06ab000 - 0xc06db920   ( 195 kB)
      .data : 0xc06dc000 - 0xc0721c20   ( 280 kB)
       .bss : 0xc0721c44 - 0xc094787c   (2200 kB)


Memory: 824MB = 824MB total
Memory: 810364k/810364k available, 33412k reserved, 65536K highmem
Virtual kernel memory layout:
    vector  : 0xffff0000 - 0xffff1000   (   4 kB)
    fixmap  : 0xfff00000 - 0xfffe0000   ( 896 kB)
    vmalloc : 0xf0000000 - 0xff000000   ( 240 MB)
    lowmem  : 0xc0000000 - 0xef800000   ( 760 MB)
    pkmap   : 0xbfe00000 - 0xc0000000   (   2 MB)
    modules : 0xbf000000 - 0xbfe00000   (  14 MB)
      .text : 0xc0008000 - 0xc06ab000   (6796 kB)
      .init : 0xc06ab000 - 0xc06db920   ( 195 kB)
      .data : 0xc06dc000 - 0xc0721c20   ( 280 kB)
       .bss : 0xc0721c44 - 0xc094787c   (2200 kB)


你可能感兴趣的:(kernel,memory)