How does the kernel use the memory parameters (size and so on) that it learns from U-Boot, and how do they evolve into the kernel's many memory data structures, which quickly become bewildering? This article tries to build a reasonably clear picture of kernel memory.
The memory description evolves through these data structures:
early_mem -> meminfo -> memblock -> node, zone, page -> the interface functions the kernel exposes.
Sections 4 and 5 describe how the page tables are built.
The kernel learns the amount of usable memory from the mem=xxxM boot parameter. How does control reach early_mem? The back trace below answers it: early_mem is registered with early_param("mem", early_mem), so the chain is start_kernel -> setup_arch -> parse_early_param -> do_early_param -> early_mem.
------------[ cut here ]------------
WARNING: at arch/arm/kernel/setup.c:564 early_mem+0x20/0xc4()
Memory policy: ECC disabled, Data cache writealloc
prepare_page_table then clears three virtual ranges, [0, 0xbf000000], [0xbf000000, 0xc0000000] and [0xe8000000, VMALLOC_START], as its debug output shows:
Memory: prepare_page_table MODULES_VADDR 0xbf000000, PMD_SIZE 0x200000
Memory: prepare_page_table PAGE_OFFSET 0xc0000000, PMD_SIZE 0x200000
Memory: prepare_page_table end 0xa8000000, arm_lowmem_limit 0xa8000000
What does the pmd_clear(pmd_off_k(addr)) call in those prepare_page_table loops actually do?
Start with pmd_off_k(addr), which pulls in yet another layer of machinery:
static inline pmd_t *pmd_off_k(unsigned long virt)
{
return pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt);
}
/* to find an entry in a kernel page-table-directory */
#define pgd_offset_k(addr) pgd_offset(&init_mm, addr)
pgd_offset_k(addr) returns the address of the kernel page-directory entry that covers addr.
/* to find an entry in a page-table-directory
* arch/arm/include/asm/pgtable-2level.h
*/
#define pgd_offset(mm, addr) ((mm)->pgd + pgd_index(addr))
#define pgd_index(addr) ((addr) >> PGDIR_SHIFT)
Now look at pud_offset(pgd, start), from include/asm-generic/4level-fixup.h:
#define pud_offset(pgd, start) (pgd)
pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt) ->
pmd_offset(pgd_offset_k(virt), virt)
And pmd_offset is just a cast:
static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
{
return (pmd_t *)pud;
}
pmd_offset(pgd_offset_k(virt), virt) ->
pgd_offset_k(virt) -> pgd_offset(&init_mm, addr) -> ((mm)->pgd + pgd_index(addr))
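So on this 2-level configuration, pmd_off_k(virt) folds down to plain pointer arithmetic on the master kernel page directory. A minimal user-space sketch of that arithmetic (PGDIR_SHIFT = 21, as defined later in this article):
#include <stdio.h>

#define PGDIR_SHIFT 21	/* from arch/arm/include/asm/pgtable-2level.h */

int main(void)
{
	/* pmd_off_k(virt) == init_mm.pgd + (virt >> PGDIR_SHIFT) */
	unsigned long virt = 0xbf000000UL;	/* MODULES_VADDR in this article */

	printf("virt 0x%08lx -> pgd index %lu (each entry covers 2 MiB)\n",
	       virt, virt >> PGDIR_SHIFT);
	return 0;
}
Now pmd_clear itself: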
#define pmd_clear(pmdp) \
do { \
pmdp[0] = __pmd(0); \
pmdp[1] = __pmd(0); \
clean_pmd_entry(pmdp); \
} while (0)
#define __pte(x) ((pte_t) { (x) } )
#define __pmd(x) ((pmd_t) { (x) } )
arch/arm/include/asm/tlbflush.h
#define tlb_op(f, regs, arg) __tlb_op(f, "p15, 0, %0, " regs, arg)
#define tlb_l2_op(f, regs, arg) __tlb_op(f, "p15, 1, %0, " regs, arg)
static inline void clean_pmd_entry(void *pmd)
{
const unsigned int __tlb_flag = __cpu_tlb_flags;
tlb_op(TLB_DCLEAN, "c7, c10, 1 @ flush_pmd", pmd);
tlb_l2_op(TLB_L2CLEAN_FR, "c15, c9, 1 @ L2 flush_pmd", pmd);
}
prepare_page_table: the pmd_clear here simply zeroes the corresponding entries of the master kernel page directory. It works in three steps:
1] Clear out all the mappings below the kernel image.
2] Clear out all the kernel space mappings, except for the first memory bank, up to the vmalloc region.
3] The lowmem mapping itself is not cleared (see the sketch below).
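prepare_page_table is not quoted above; here is a condensed sketch reconstructed from its debug output (details may differ slightly in this exact tree):
static inline void prepare_page_table(void)
{
	unsigned long addr;
	phys_addr_t end;

	/* 1] clear everything below the kernel/modules area */
	for (addr = 0; addr < MODULES_VADDR; addr += PMD_SIZE)
		pmd_clear(pmd_off_k(addr));
	for ( ; addr < PAGE_OFFSET; addr += PMD_SIZE)
		pmd_clear(pmd_off_k(addr));

	/* end of the first memory bank, clamped to lowmem */
	end = memblock.memory.regions[0].base + memblock.memory.regions[0].size;
	if (end >= arm_lowmem_limit)
		end = arm_lowmem_limit;

	/* 2] clear kernel space above the first bank, up to VMALLOC_START */
	for (addr = __phys_to_virt(end); addr < VMALLOC_START; addr += PMD_SIZE)
		pmd_clear(pmd_off_k(addr));
}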
/* for reference: #define __phys_to_pfn(paddr) ((unsigned long)((paddr) >> PAGE_SHIFT)) */
static void __init map_lowmem(void)
{
	struct memblock_region *reg;
	phys_addr_t start;
	phys_addr_t end;
	struct map_desc map;

	/* Map all the lowmem memory banks. */
	for_each_memblock(memory, reg) {
		start = reg->base;
		end = start + reg->size;
		if (end > arm_lowmem_limit)
			end = arm_lowmem_limit;
		if (start >= end)
			break;

		map.pfn = __phys_to_pfn(start);
		map.virtual = __phys_to_virt(start);
		map.length = end - start;
		map.type = MT_MEMORY;
		create_mapping(&map, false);
	}
}
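For the mem=640M boot used throughout this article (one bank at physical 0x80000000, arm_lowmem_limit 0xa8000000), the loop runs exactly once: map.pfn = 0x80000, map.virtual = 0xc0000000, map.length = 0x28000000 (640 MB), map.type = MT_MEMORY.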
The description of memory thus moves from struct memblock_region to struct map_desc:
struct map_desc {
unsigned long virtual;
unsigned long pfn;
unsigned long length;
unsigned int type;
};
/************************************************************************************/
static void __init create_mapping(struct map_desc *md, bool force_pages)
{
unsigned long addr, length, end;
phys_addr_t phys;
const struct mem_type *type;
pgd_t *pgd;
type = &mem_types[md->type];
addr = md->virtual & PAGE_MASK;
phys = __pfn_to_phys(md->pfn);
length = PAGE_ALIGN(md->length + (md->virtual & ~PAGE_MASK));
pgd = pgd_offset_k(addr);
end = addr + length;
do {
unsigned long next = pgd_addr_end(addr, end);
alloc_init_pud(pgd, addr, next, phys, type, force_pages);
phys += next - addr;
addr = next;
} while (pgd++, addr != end);
}
/************************************************************************************/
static void __init alloc_init_pud(pgd_t *pgd, unsigned long addr,
unsigned long end, unsigned long phys, const struct mem_type *type,
bool force_pages)
{
pud_t *pud = pud_offset(pgd, addr);
unsigned long next;
do {
next = pud_addr_end(addr, end);
alloc_init_section(pud, addr, next, phys, type, force_pages);
phys += next - addr;
} while (pud++, addr = next, addr != end);
}
/************************************************************************************/
alloc_init_section chooses between a 1 MB section mapping and a fall-back to individual ptes:
static void __init alloc_init_section(pud_t *pud, unsigned long addr,
				      unsigned long end, phys_addr_t phys,
				      const struct mem_type *type,
				      bool force_pages)
{
	pmd_t *pmd = pmd_offset(pud, addr);
	/*
	 * Try a section mapping - end, addr and phys must all be aligned
	 * to a section boundary. Note that PMDs refer to the individual
	 * L1 entries, whereas PGDs refer to a group of L1 entries making
	 * up one logical pointer to an L2 table.
	 */
	if (type->prot_sect && ((addr | end | phys) & ~SECTION_MASK) == 0 &&
	    !force_pages) {
		pmd_t *p = pmd;
		if (addr & SECTION_SIZE)
			pmd++;
		do {
			*pmd = __pmd(phys | type->prot_sect);
			phys += SECTION_SIZE;
		} while (pmd++, addr += SECTION_SIZE, addr != end);
		flush_pmd_entry(p);
	} else {
		/*
		 * No need to loop; pte's aren't interested in the
		 * individual L1 entries.
		 */
		alloc_init_pte(pmd, addr, end, __phys_to_pfn(phys), type);
	}
}
/************************************************************************************/
static void __init alloc_init_pte(pmd_t *pmd, unsigned long addr,
unsigned long end, unsigned long pfn,
const struct mem_type *type)
{
/* allocate (or find) the pte table backing this pmd */
pte_t *start_pte = early_pte_alloc(pmd);
pte_t *pte = start_pte + pte_index(addr);
/* If replacing a section mapping, the whole section must be replaced */
BUG_ON(pmd_bad(*pmd) && ((addr | end) & ~PMD_MASK));
do {
set_pte_ext(pte, pfn_pte(pfn, __pgprot(type->prot_pte)), 0);
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
early_pte_install(pmd, start_pte, type->prot_l1);
}
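early_pte_alloc is used above but not quoted; in kernels of this generation it is roughly (treat this as a sketch):
static pte_t * __init early_pte_alloc(pmd_t *pmd)
{
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		/* one 4KB block: 2KB of Linux ptes + 2KB of hardware ptes */
		return early_alloc(PTE_HWTABLE_OFF + PTE_HWTABLE_SIZE);
	return pmd_page_vaddr(*pmd);
}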
/************************************************************************************/
static void __init early_pte_install(pmd_t *pmd, pte_t *pte, unsigned long prot)
{
__pmd_populate(pmd, __pa(pte), prot);
BUG_ON(pmd_bad(*pmd));
}
/************************************************************************************/
static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte,
pmdval_t prot)
{
pmdval_t pmdval = (pte + PTE_HWTABLE_OFF) | prot;
pmdp[0] = __pmd(pmdval);
/* every pte value has already been written; this just publishes the table as two halves */
pmdp[1] = __pmd(pmdval + 256 * sizeof(pte_t));
flush_pmd_entry(pmdp);
}
/* 1] The outer loop (in create_mapping) steps by PGDIR_SIZE = 2^21 = 2MB;
 *    its control variables are pgd and PGDIR_SIZE.
 * 2] The inner loop in alloc_init_pud is controlled by pud and PUD_SIZE;
 *    since pgd == pud here, it runs exactly once per call.
 * 3] The innermost level, alloc_init_section, has two cases:
 * 3.1] section mappings, stepping by SECTION_SIZE with pmd;
 * 3.2] page entries, stepping by PAGE_SIZE with pte.
 */
#define pgd_addr_end(addr, end) \
({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \
(__boundary - 1 < (end) - 1)? __boundary: (end); \
})
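Worked through for the 640MB lowmem mapping: pgd_addr_end clamps each outer step to the next 2MB boundary, so the create_mapping loop runs 0x28000000 / 0x200000 = 320 times; each alloc_init_pud call covers one 2MB window in a single iteration; and the section branch of alloc_init_section writes two 1MB section entries (pmd[0] and pmd[1]) per window.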
The shift and size constants behind all of this (pgd_offset_k, pgd_offset and pgd_index were quoted above) live in arch/arm/include/asm/pgtable-2level.h:
/*
* PMD_SHIFT determines the size of the area a second-level page table can map
* PGDIR_SHIFT determines what a third-level page table entry can map
*/
#define PMD_SHIFT 21
#define PGDIR_SHIFT 21
#define PMD_SIZE (1UL << PMD_SHIFT)
#define PMD_MASK (~(PMD_SIZE-1))
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE-1))
/*
* section address mask and size definitions.
*/
#define SECTION_SHIFT 20
#define SECTION_SIZE (1UL << SECTION_SHIFT)
#define SECTION_MASK (~(SECTION_SIZE-1))
The pmd_t we end up with is really the pud_t, i.e. the pgd entry itself; crash confirms the type sizes:
crash> pgd_t
typedef unsigned int [2] pgd_t;
SIZE: 8
crash> pud_t
typedef struct {
pgd_t pgd;
} pud_t;
SIZE: 8
crash> pmd_t
typedef unsigned int pmd_t;
SIZE: 4
crash> pte_t
typedef unsigned int pte_t;
SIZE: 4
What is returned is therefore the start address of an 8-byte entry. How is the upper 4-byte half used? Through pmd++: both the section loop in alloc_init_section and __pmd_populate write pmdp[0] and pmdp[1], one 32-bit hardware L1 entry each.
1st level: 4096 entries (4 pages for the PGD), each entry either
  - a 1MB section mapping, or
  - a pointer to a 2nd-level table
  - (implementation-defined 16MB supersections also exist)
2nd level: 256 entries, each mapping a 4KB page
  - so one 2nd-level table is only 1KB
A 2nd-level page table therefore does not fill a full 4KB page.
ARM Linux works around this:
  - a separate array holds the Linux PTE bits
  - each 1st-level "entry" is two consecutive 32-bit words, pointing into a combined 2KB block of 2nd-level tables
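Putting numbers on that workaround: each pte page allocated by early_pte_alloc is a single 4KB block holding two 1KB Linux tables followed by two 1KB hardware tables, which is why __pmd_populate above adds PTE_HWTABLE_OFF and writes pmdp[1] as pmdval + 256 * sizeof(pte_t). A compilable sketch of the arithmetic (constants taken from mainline pgtable-2level.h, assumed to match this tree):
#include <stdio.h>

typedef unsigned int pte_t;	/* 4 bytes, matching the crash output above */

#define PTRS_PER_PTE	512	/* 2 x 256 Linux entries per pte page */
#define PTE_HWTABLE_OFF	(PTRS_PER_PTE * sizeof(pte_t))	/* 2048 */

int main(void)
{
	/* layout of one 4KB pte page */
	printf("Linux pt 0 at +0, Linux pt 1 at +1024\n");
	printf("h/w pt 0 at +%zu, h/w pt 1 at +%zu\n",
	       PTE_HWTABLE_OFF,				/* the [r0, #2048] seen below */
	       PTE_HWTABLE_OFF + 256 * sizeof(pte_t));
	return 0;
}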
static void __init *early_alloc(unsigned long sz)
{
return early_alloc_aligned(sz, sz);
}
static void __init *early_alloc_aligned(unsigned long sz, unsigned long align)
{
void *ptr = __va(memblock_alloc(sz, align));
memset(ptr, 0, sz);
return ptr;
}
phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
{
return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
}
phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
phys_addr_t alloc;
alloc = __memblock_alloc_base(size, align, max_addr);
if (alloc == 0)
panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
(unsigned long long) size, (unsigned long long) max_addr);
return alloc;
}
phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
return memblock_alloc_base_nid(size, align, max_addr, MAX_NUMNODES);
}
static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t max_addr,
int nid)
{
phys_addr_t found;
/* align @size to avoid excessive fragmentation on reserved array */
size = round_up(size, align);
found = memblock_find_in_range_node(0, max_addr, size, align, nid);
if (found && !memblock_reserve(found, size))
return found;
return 0;
}
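Tracing one concrete call through this chain: the vector page allocated below via early_alloc(PAGE_SIZE) becomes memblock_alloc(0x1000, 0x1000) -> memblock_find_in_range_node, which in this kernel generation scans free memory top-down, so the early page tables and the vector page land near the top of lowmem; memblock_reserve then records the block so that bootmem later treats it as reserved.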
#define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext)
/* cpu_v7_set_pte_ext shows that each entry is stored twice: the Linux copy at offset 0 and the hardware copy at offset 2048 */
r0: pte pointer, r1: pte value, r2: ext bits (0 here)
crash> dis cpu_v7_set_pte_ext -l
0xc0019a0c <cpu_v7_set_pte_ext>: str r1, [r0] /* the Linux version */
0xc0019a10 <cpu_v7_set_pte_ext+4>: bic r3, r1, #1008 ; 0x3f0
0xc0019a14 <cpu_v7_set_pte_ext+8>: bic r3, r3, #3
0xc0019a18 <cpu_v7_set_pte_ext+12>: orr r3, r3, r2
0xc0019a1c <cpu_v7_set_pte_ext+16>: orr r3, r3, #18
0xc0019a20 <cpu_v7_set_pte_ext+20>: tst r1, #16
0xc0019a24 <cpu_v7_set_pte_ext+24>: orrne r3, r3, #64 ; 0x40
0xc0019a28 <cpu_v7_set_pte_ext+28>: eor r1, r1, #64 ; 0x40
0xc0019a2c <cpu_v7_set_pte_ext+32>: tst r1, #192 ; 0xc0
0xc0019a30 <cpu_v7_set_pte_ext+36>: orrne r3, r3, #512 ; 0x200
0xc0019a34 <cpu_v7_set_pte_ext+40>: tst r1, #256 ; 0x100
0xc0019a38 <cpu_v7_set_pte_ext+44>: orrne r3, r3, #32
0xc0019a3c <cpu_v7_set_pte_ext+48>: tst r1, #512 ; 0x200
0xc0019a40 <cpu_v7_set_pte_ext+52>: orrne r3, r3, #1
0xc0019a44 <cpu_v7_set_pte_ext+56>: tst r1, #2
0xc0019a48 <cpu_v7_set_pte_ext+60>: tstne r1, #1
0xc0019a4c <cpu_v7_set_pte_ext+64>: moveq r3, #0
/*hardware version is stored at +2048 bytes*/
0xc0019a50 <cpu_v7_set_pte_ext+68>: str r3, [r0, #2048]! ; 0x800
0xc0019a54 <cpu_v7_set_pte_ext+72>: mcr 15, 0, r0, cr7, cr10, {1}
0xc0019a58 <cpu_v7_set_pte_ext+76>: mov pc, lr
pfn_pte merges the two ingredients, page frame number and protection bits, into one pte value:
pfn_pte(pfn, __pgprot(type->prot_pte))
#define __pgprot(x) ((pgprot_t) { (x) } )
#define pfn_pte(pfn,prot) __pte(__pfn_to_phys(pfn) | pgprot_val(prot))
#define pgprot_val(x) (x)
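Expanded for the first lowmem page frame (pfn 0x80000): __pfn_to_phys(0x80000) = 0x80000 << 12 = 0x80000000, so pfn_pte(0x80000, __pgprot(prot_pte)) = 0x80000000 | prot_pte; the physical address occupies the high bits and the Linux protection bits the low 12.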
/* First, a look at struct mem_type, the *type argument: */
crash> mem_type
struct mem_type {
pteval_t prot_pte;
pmdval_t prot_l1;
pmdval_t prot_sect;
unsigned int domain;
}
SIZE: 16
[MT_MEMORY] = {
.prot_pte = L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY,
.prot_l1 = PMD_TYPE_TABLE,
.prot_sect = PMD_TYPE_SECT | PMD_SECT_AP_WRITE,
.domain = DOMAIN_KERNEL,
},
void __init dma_contiguous_remap(void)
{
int i;
for (i = 0; i < dma_mmu_remap_num; i++) {
phys_addr_t start = dma_mmu_remap[i].base;
phys_addr_t end = start + dma_mmu_remap[i].size;
struct map_desc map;
unsigned long addr;
if (end > arm_lowmem_limit)
end = arm_lowmem_limit;
if (start >= end)
return;
map.pfn = __phys_to_pfn(start);
map.virtual = __phys_to_virt(start);
map.length = end - start;
map.type = MT_MEMORY_DMA_READY;
/*
* Clear previous low-memory mapping
*/
for (addr = __phys_to_virt(start); addr < __phys_to_virt(end);
addr += PGDIR_SIZE)
pmd_clear(pmd_off_k(addr));
iotable_init(&map, 1);
}
}
/*
* Set up the device mappings. Since we clear out the page tables for all
* mappings above VMALLOC_START, we will remove any debug device mappings.
* This means you have to be careful how you debug this function, or any
* called function. This means you can't use any function or debugging
* method which may touch any device, otherwise the kernel _will_ crash.
*/
static void __init devicemaps_init(struct machine_desc *mdesc)
{
struct map_desc map;
unsigned long addr;
void *vectors;
/*
* Allocate the vector page early.
*/
vectors = early_alloc(PAGE_SIZE);
early_trap_init(vectors);
for (addr = VMALLOC_START; addr; addr += PMD_SIZE)
pmd_clear(pmd_off_k(addr));
/*
* Map the cache flushing regions.
*/
#ifdef FLUSH_BASE
map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS);
map.virtual = FLUSH_BASE;
map.length = SZ_1M;
map.type = MT_CACHECLEAN;
create_mapping(&map, false);
#endif
#ifdef FLUSH_BASE_MINICACHE
map.pfn = __phys_to_pfn(FLUSH_BASE_PHYS + SZ_1M);
map.virtual = FLUSH_BASE_MINICACHE;
map.length = SZ_1M;
map.type = MT_MINICLEAN;
create_mapping(&map, false);
#endif
/*
* Create a mapping for the machine vectors at the high-vectors
* location (0xffff0000). If we aren't using high-vectors, also
* create a mapping at the low-vectors virtual address.
*/
map.pfn = __phys_to_pfn(virt_to_phys(vectors));
map.virtual = 0xffff0000;
map.length = PAGE_SIZE;
map.type = MT_HIGH_VECTORS;
create_mapping(&map, false);
if (!vectors_high()) {
map.virtual = 0;
map.type = MT_LOW_VECTORS;
create_mapping(&map, false);
}
/*
* Ask the machine support to map in the statically mapped devices.
*/
if (mdesc->map_io)
mdesc->map_io();
/*
* Finally flush the caches and tlb to ensure that we're in a
* consistent state wrt the writebuffer. This also ensures that
* any write-allocated cache lines in the vector page are written
* back. After this point, we can start to touch devices again.
*/
local_flush_tlb_all();
flush_cache_all();
}
.map_io = ns115_map_io,
/*
*ns115 io table
*/
struct map_desc ns115_io_desc[] __initdata = {
{
.virtual = IO_ADDRESS(NS115_GIC_CPU_BASE),
.pfn = __phys_to_pfn(NS115_GIC_CPU_BASE),
.length = SZ_256,
.type = MT_DEVICE,
},
};
void __init ns115_map_io(void)
{
iotable_init(ns115_io_desc, ARRAY_SIZE(ns115_io_desc));
init_consistent_dma_size(SZ_16M - SZ_2M);
}
Static mappings: the virtual address is fixed at compile time by the IO_ADDRESS formula rather than allocated from the vmalloc area at run time.
arch/arm/mach-ns115/include/mach/hardware.h
/*
* Statically mapped addresses:
*
* 10xx xxxx -> fbxx xxxx
* 1exx xxxx -> fdxx xxxx
* 1fxx xxxx -> fexx xxxx
*/
#define IO_ADDRESS(x) (((x) & 0x03ffffff) + 0xfb000000)
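Checking the macro against its comment: IO_ADDRESS(0x1e000000) = (0x1e000000 & 0x03ffffff) + 0xfb000000 = 0x02000000 + 0xfb000000 = 0xfd000000, i.e. 1exx xxxx -> fdxx xxxx as advertised.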
static void __init kmap_init(void)
{
#ifdef CONFIG_HIGHMEM
pkmap_page_table = early_pte_alloc_and_install(pmd_off_k(PKMAP_BASE),
PKMAP_BASE, _PAGE_KERNEL_TABLE);
#endif
}
#define PKMAP_BASE (PAGE_OFFSET - PMD_SIZE)
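With PAGE_OFFSET = 0xc0000000 and PMD_SIZE = 2MB, PKMAP_BASE = 0xc0000000 - 0x200000 = 0xbfe00000, which is exactly the pkmap range 0xbfe00000 - 0xc0000000 shown in the boot logs at the end of this article.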
static pte_t * __init early_pte_alloc_and_install(pmd_t *pmd,
unsigned long addr, unsigned long prot)
{
if (pmd_none(*pmd)) {
pte_t *pte = early_pte_alloc(pmd);
early_pte_install(pmd, pte, prot);
}
BUG_ON(pmd_bad(*pmd));
return pte_offset_kernel(pmd, addr);
}
#define pte_offset_kernel(pmd,addr) (pmd_page_vaddr(*(pmd)) + pte_index(addr))
bootmem_init introduces the memory-management structures node, zone and page: every page frame gets a struct page, and they are stored in the mem_map array.
void __init bootmem_init(void)
{
unsigned long min, max_low, max_high;
max_low = max_high = 0;
//bootmem_init: min 0x80000, max_low 0xa8000, max_high 0xa8000
find_limits(&min, &max_low, &max_high);
arm_bootmem_init(min, max_low);
/* CONFIG_SPARSEMEM is not defined, so arm_memory_present() is a no-op */
arm_memory_present();
/*
* sparse_init() needs the bootmem allocator up and running.
*/
/* CONFIG_SPARSEMEM is not defined, so sparse_init() is a no-op as well */
sparse_init();
/*
* Now free the memory - free_area_init_node needs
* the sparse mem_map arrays initialized by sparse_init()
* for memmap_init_zone(), otherwise all PFNs are invalid.
*/
arm_bootmem_free(min, max_low, max_high);
}
/*******************************************************************/
static void __init find_limits(unsigned long *min, unsigned long *max_low,
unsigned long *max_high)
{
struct meminfo *mi = &meminfo;
int i;
/* This assumes the meminfo array is properly sorted */
*min = bank_pfn_start(&mi->bank[0]);
for_each_bank (i, mi)/* stop at the first highmem bank */
if (mi->bank[i].highmem)
break;
/* the names are confusing: both are end pfns; max_low ends the last lowmem bank, max_high ends the last bank overall */
*max_low = bank_pfn_end(&mi->bank[i - 1]);
*max_high = bank_pfn_end(&mi->bank[mi->nr_banks - 1]);
}
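Plugging in the values logged above: min = 0x80000, max_low = max_high = 0xa8000 (these are page frame numbers). With PAGE_SHIFT = 12 that is physical 0x80000000..0xa8000000: 0xa8000 - 0x80000 = 0x28000 = 163840 pages = 640MB, with no highmem bank (max_low == max_high).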
/*******************************************************************/
static void __init arm_bootmem_init(unsigned long start_pfn,
unsigned long end_pfn)
{
struct memblock_region *reg;
unsigned int boot_pages;
phys_addr_t bitmap;
pg_data_t *pgdat;
/*
* Allocate the bootmem bitmap page. This must be in a region
* of memory which has already been mapped.
*/
boot_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
/* memblock_alloc_base(size in bytes, alignment, highest acceptable physical address) */
bitmap = memblock_alloc_base(boot_pages << PAGE_SHIFT, L1_CACHE_BYTES,
__pfn_to_phys(end_pfn));
/*
* Initialise the bootmem allocator, handing the
* memory banks over to bootmem.
*/
/* extern struct pglist_data contig_page_data;
 * #define NODE_DATA(nid)	(&contig_page_data)
 * #define NODE_MEM_MAP(nid)	mem_map
 * This is where the per-node structure, pg_data_t, enters the picture.
 * (But what about the bootmem node?)
 */
node_set_online(0);
pgdat = NODE_DATA(0);
init_bootmem_node(pgdat, __phys_to_pfn(bitmap), start_pfn, end_pfn);
/* free_bootmem/reserve_bootmem below end up in mark_bootmem, which clears/sets the per-page bits */
/* Free the lowmem regions from memblock into bootmem. */
for_each_memblock(memory, reg) {
unsigned long start = memblock_region_memory_base_pfn(reg);
unsigned long end = memblock_region_memory_end_pfn(reg);
if (end >= end_pfn)
end = end_pfn;
if (start >= end)
break;
free_bootmem(__pfn_to_phys(start), (end - start) << PAGE_SHIFT);
}
/* Reserve the lowmem memblock reserved regions in bootmem. */
for_each_memblock(reserved, reg) {
unsigned long start = memblock_region_reserved_base_pfn(reg);
unsigned long end = memblock_region_reserved_end_pfn(reg);
if (end >= end_pfn)
end = end_pfn;
if (start >= end)
break;
reserve_bootmem(__pfn_to_phys(start),
(end - start) << PAGE_SHIFT, BOOTMEM_DEFAULT);
}
}
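Sizing the bitmap for the 640MB case: bootmem_bootmap_pages(0x28000) needs one bit per page frame, i.e. 163840 / 8 = 20480 bytes, rounded up to 5 pages, so memblock_alloc_base reserves a 20KB bitmap just below __pfn_to_phys(end_pfn).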
/*******************************************************************/
static void __init arm_bootmem_free(unsigned long min, unsigned long max_low,
unsigned long max_high)
{
unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
struct memblock_region *reg;
/*
* initialise the zones.
*/
memset(zone_size, 0, sizeof(zone_size));
/*
* The memory size has already been determined. If we need
* to do anything fancy with the allocation of this memory
* to the zones, now is the time to do it.
*/
zone_size[0] = max_low - min;
#ifdef CONFIG_HIGHMEM
zone_size[ZONE_HIGHMEM] = max_high - max_low;
#endif
/*
* Calculate the size of the holes.
* holes = node_size - sum(bank_sizes)
*/
memcpy(zhole_size, zone_size, sizeof(zhole_size));
for_each_memblock(memory, reg) {
unsigned long start = memblock_region_memory_base_pfn(reg);
unsigned long end = memblock_region_memory_end_pfn(reg);
if (start < max_low) {
unsigned long low_end = min(end, max_low);
zhole_size[0] -= low_end - start;
}
#ifdef CONFIG_HIGHMEM
if (end > max_low) {
unsigned long high_start = max(start, max_low);
zhole_size[ZONE_HIGHMEM] -= end - high_start;
}
#endif
}
free_area_init_node(0, zone_size, min, zhole_size);
}
/*******************************************************************/
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
unsigned long node_start_pfn, unsigned long *zholes_size)
{
pg_data_t *pgdat = NODE_DATA(nid);
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
calculate_node_totalpages(pgdat, zones_size, zholes_size);
alloc_node_mem_map(pgdat);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
nid, (unsigned long)pgdat,
(unsigned long)pgdat->node_mem_map);
#endif
free_area_init_core(pgdat, zones_size, zholes_size);
}
/*******************************************************************/
static void __init_refok alloc_node_mem_map(struct pglist_data *pgdat)
{
/* Skip empty nodes */
if (!pgdat->node_spanned_pages)
return;
#ifdef CONFIG_FLAT_NODE_MEM_MAP
/* ia64 gets its own node_mem_map, before this, without bootmem */
if (!pgdat->node_mem_map) {
unsigned long size, start, end;
struct page *map;
/*
* The zone's endpoints aren't required to be MAX_ORDER
* aligned but the node_mem_map endpoints must be in order
* for the buddy allocator to function correctly.
*/
start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
map = alloc_remap(pgdat->node_id, size);
if (!map)
map = alloc_bootmem_node_nopanic(pgdat, size);
pgdat->node_mem_map = map + (pgdat->node_start_pfn - start);
}
#ifndef CONFIG_NEED_MULTIPLE_NODES
/*
* With no DISCONTIG, the global mem_map is just set as node 0's
*/
if (pgdat == NODE_DATA(0)) {
mem_map = NODE_DATA(0)->node_mem_map;
}
#endif
#endif /* CONFIG_FLAT_NODE_MEM_MAP */
}
/*******************************************************************/
/*
* Set up the zone data structures:
* - mark all pages reserved
* - mark all memory queues empty
* - clear the memory bitmaps
*/
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
{
enum zone_type j;
int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
pgdat_resize_init(pgdat);
pgdat->nr_zones = 0;
init_waitqueue_head(&pgdat->kswapd_wait);
pgdat->kswapd_max_order = 0;
pgdat_page_cgroup_init(pgdat);
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, memmap_pages;
enum lru_list lru;
size = zone_spanned_pages_in_node(nid, j, zones_size);
realsize = size - zone_absent_pages_in_node(nid, j,
zholes_size);
/*
* Adjust realsize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
memmap_pages =
PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
if (realsize >= memmap_pages) {
realsize -= memmap_pages;
if (memmap_pages)
printk(KERN_DEBUG
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
printk(KERN_WARNING
" %s zone: %lu pages exceeds realsize %lu\n",
zone_names[j], memmap_pages, realsize);
/* Account for reserved pages */
if (j == 0 && realsize > dma_reserve) {
realsize -= dma_reserve;
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
zone_names[0], dma_reserve);
}
if (!is_highmem_idx(j))
nr_kernel_pages += realsize;
nr_all_pages += realsize;
zone->spanned_pages = size;
zone->present_pages = realsize;
#ifdef CONFIG_NUMA
zone->node = nid;
zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
/ 100;
zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
pr_err("free_area_init_core %s\n", zone->name);
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
for_each_lru(lru)
INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
zone->reclaim_stat.recent_rotated[0] = 0;
zone->reclaim_stat.recent_rotated[1] = 0;
zone->reclaim_stat.recent_scanned[0] = 0;
zone->reclaim_stat.recent_scanned[1] = 0;
zap_zone_vm_stats(zone);
zone->flags = 0;
if (!size)
continue;
set_pageblock_order(pageblock_default_order());
setup_usemap(pgdat, zone, size);
ret = init_currently_empty_zone(zone, zone_start_pfn,
size, MEMMAP_EARLY);
BUG_ON(ret);
memmap_init(size, nid, j, zone_start_pfn);
zone_start_pfn += size;
}
}
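A sanity check on those numbers for the 640MB boot (assuming sizeof(struct page) == 32 on this 32-bit build, which the log below confirms): 640MB spans 163840 page frames, so memmap_pages = 163840 * 32 / 4096 = 1280, and realsize = 163840 - 1280 = 162560, exactly the "Total pages: 162560" printed next.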
Built 1 zonelists in Zone order, mobility grouping on. Total pages: 162560
printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
Kernel command line: root=/dev/sda2 rw rootwait mem=640M console=ttyS0,115200 init=/init video=nusmartfb:1024x600-16
/*******************************************************************/
/*
* Set up kernel memory allocators
*/
static void __init mm_init(void)
{
/*
* page_cgroup requires contiguous pages,
* bigger than MAX_ORDER unless SPARSEMEM.
*/
page_cgroup_init_flatmem();
mem_init();
kmem_cache_init();
percpu_init_late();
pgtable_cache_init();
vmalloc_init();
}
/*******************************************************************/
/*
* mem_init() marks the free areas in the mem_map and tells us how much
* memory is free. This is done after various parts of the system have
* claimed their memory after the kernel image.
*/
void __init mem_init(void)
{
unsigned long reserved_pages, free_pages;
struct memblock_region *reg;
int i;
max_mapnr = pfn_to_page(max_pfn + PHYS_PFN_OFFSET) - mem_map;
/* this will put all unused low memory onto the freelists */
free_unused_memmap(&meminfo);
totalram_pages += free_all_bootmem();
free_highpages();
/* count the free and reserved pages by walking meminfo */
reserved_pages = free_pages = 0;
for_each_bank(i, &meminfo) {
struct membank *bank = &meminfo.bank[i];
unsigned int pfn1, pfn2;
struct page *page, *end;
pfn1 = bank_pfn_start(bank);
pfn2 = bank_pfn_end(bank);
page = pfn_to_page(pfn1);
end = pfn_to_page(pfn2 - 1) + 1;
do {
if (PageReserved(page))
reserved_pages++;
else if (!page_count(page))
free_pages++;
page++;
} while (page < end);
}
/*
* Since our memory may not be contiguous, calculate the
* real number of pages we have in this system
*/
printk(KERN_INFO "Memory:");
num_physpages = 0;
for_each_memblock(memory, reg) {
unsigned long pages = memblock_region_memory_end_pfn(reg) -
memblock_region_memory_base_pfn(reg);
num_physpages += pages;
printk(" %ldMB", pages >> (20 - PAGE_SHIFT));
}
printk(" = %luMB total\n", num_physpages >> (20 - PAGE_SHIFT));
printk(KERN_NOTICE "Memory: %luk/%luk available, %luk reserved, %luK highmem\n",
nr_free_pages() << (PAGE_SHIFT-10),
free_pages << (PAGE_SHIFT-10),
reserved_pages << (PAGE_SHIFT-10),
totalhigh_pages << (PAGE_SHIFT-10));
/* arguments to the (elided) "Virtual kernel memory layout" printk: */
MLK(UL(CONFIG_VECTORS_BASE), UL(CONFIG_VECTORS_BASE) +
(PAGE_SIZE)),
MLK(FIXADDR_START, FIXADDR_TOP),
MLM(VMALLOC_START, VMALLOC_END),
MLM(PAGE_OFFSET, (unsigned long)high_memory),
MLM(PKMAP_BASE, (PKMAP_BASE) + (LAST_PKMAP) *
(PAGE_SIZE)),
MLM(MODULES_VADDR, MODULES_END),
MLK_ROUNDUP(_text, _etext),
MLK_ROUNDUP(__init_begin, __init_end),
MLK_ROUNDUP(_sdata, _edata),
MLK_ROUNDUP(__bss_start, __bss_stop));
}
Memory: 640MB = 640MB total
Memory: 623420k/623420k available, 31940k reserved, 0K highmem
Does available + reserved fall short of 640MB? No: 623420k + 31940k = 655360k, which is exactly 640MB.
And kmap: the 2MB pkmap region is only a window for temporarily mapping highmem pages, not a cap on total highmem; this boot has 0K highmem anyway, while the 824MB boot below shows 65536K.
Virtual kernel memory layout:
vector : 0xffff0000 - 0xffff1000 ( 4 kB)
fixmap : 0xfff00000 - 0xfffe0000 ( 896 kB)
vmalloc : 0xe8800000 - 0xff000000 ( 360 MB)
lowmem : 0xc0000000 - 0xe8000000 ( 640 MB)
pkmap : 0xbfe00000 - 0xc0000000 ( 2 MB)
modules : 0xbf000000 - 0xbfe00000 ( 14 MB)
.text : 0xc0008000 - 0xc06ab000 (6796 kB)
.init : 0xc06ab000 - 0xc06db920 ( 195 kB)
.data : 0xc06dc000 - 0xc0721c20 ( 280 kB)
.bss : 0xc0721c44 - 0xc094787c (2200 kB)
For comparison, the same kernel booted with 824MB (this time with highmem in play):
Memory: 824MB = 824MB total
Memory: 810364k/810364k available, 33412k reserved, 65536K highmem
Virtual kernel memory layout:
vector : 0xffff0000 - 0xffff1000 ( 4 kB)
fixmap : 0xfff00000 - 0xfffe0000 ( 896 kB)
vmalloc : 0xf0000000 - 0xff000000 ( 240 MB)
lowmem : 0xc0000000 - 0xef800000 ( 760 MB)
pkmap : 0xbfe00000 - 0xc0000000 ( 2 MB)
modules : 0xbf000000 - 0xbfe00000 ( 14 MB)
.text : 0xc0008000 - 0xc06ab000 (6796 kB)
.init : 0xc06ab000 - 0xc06db920 ( 195 kB)
.data : 0xc06dc000 - 0xc0721c20 ( 280 kB)
.bss : 0xc0721c44 - 0xc094787c (2200 kB)