Chipset:MSM8x25Q
Codebase:Android 4.1
Linux Kernel: 3.4.0
在linux Kernel中,一开始内存相关的信息是由struct meminfo来保存的,每个物理连续的内存区域被保存为meminfo中的一个元素,也就是说在Linux使用中,整块物理内存可能是不连续的,可能其中某一中间区域是被其他cpu给使用掉了。
那么内存相关信息又是从哪里收集到的呢,系统在boot阶段,如u-boot会将当前物理内存linux可以使用的部分通过TAG的形式传递给linux内核。Qualcomm使用的是叫lk的boot,不管用的是哪种boot类型,使用TAG来传递参数的原理是一样的。
下面我们看下Linux内核是如何收集内存信息的。
Meminfo信息收集
系统启动有如下流程:
start_kernel -> setup_arch -> setup_machine_tags-> parse_tags -> parse_tag.
[html] view plain copy print ?
- static int __init parse_tag(const struct tag *tag)
- {
- extern struct tagtable __tagtable_begin, __tagtable_end;
- struct tagtable *t;
-
- for (t = &__tagtable_begin; t < &__tagtable_end; t++)
- if (tag->hdr.tag == t->tag) {
- t->parse(tag);
- break;
- }
-
- return t < &__tagtable_end;
- }
/*
 * Dispatch one boot tag to the handler registered for its type.
 *
 * Walks the table of struct tagtable entries bounded by __tagtable_begin
 * and __tagtable_end and calls the parse callback of the first entry whose
 * tag value equals tag->hdr.tag.
 *
 * Returns 1 when a matching entry was found (its callback has been run),
 * 0 when no entry matches.
 */
static int __init parse_tag(const struct tag *tag)
{
	extern struct tagtable __tagtable_begin, __tagtable_end;
	struct tagtable *entry = &__tagtable_begin;

	while (entry < &__tagtable_end) {
		if (entry->tag == tag->hdr.tag) {
			/* The handler's own return value is not used. */
			entry->parse(tag);
			return 1;
		}
		entry++;
	}

	return 0;
}
__tagtable_begin被定义在kernel/arch/arm/kernel/vmlinux.lds.S中:
[html] view plain copy print ?
- .init.tagtable : {
- __tagtable_begin = .;
- *(.taglist.init)
- __tagtable_end = .;
- }
/* Output section that gathers every ".taglist.init" input section. */
.init.tagtable : {
/* Symbol set to the address where the gathered entries begin. */
__tagtable_begin = .;
*(.taglist.init)
/* Symbol set to the address just past the last gathered entry. */
__tagtable_end = .;
}
另外,在arch/arm/kernel/setup.c中有如下函数定义:
[html] view plain copy print ?
- static int __init parse_tag_mem32(const struct tag *tag)
- {
- return arm_add_memory(tag->u.mem.start, tag->u.mem.size);
- }
- __tagtable(ATAG_MEM, parse_tag_mem32);
/*
 * Handler for ATAG_MEM tags: passes the start address and size carried in
 * the tag to arm_add_memory() and returns that function's result.
 */
static int __init parse_tag_mem32(const struct tag *tag)
{
	int ret;

	ret = arm_add_memory(tag->u.mem.start, tag->u.mem.size);
	return ret;
}

/* Register the handler in the tag table under ATAG_MEM. */
__tagtable(ATAG_MEM, parse_tag_mem32);
__tagtable是个宏定义:
[html] view plain copy print ?
- #define __tagtable(tag, fn) \
- static const struct tagtable __tagtable_##fn __tag = { tag, fn }
/*
 * Define a tag-table entry named __tagtable_<fn> that pairs the tag value
 * `tag` with the handler `fn`. The __tag annotation on the variable selects
 * where the entry is placed.
 */
#define __tagtable(tag, fn) \
static const struct tagtable __tagtable_##fn __tag = { tag, fn }
里面的__tag的宏定义又如下:
[html] view plain copy print ?
- #define __tag __used __attribute__((__section__(".taglist.init")))
/*
 * Variable annotation used by __tagtable(): applies the __used annotation
 * (defined elsewhere) and places the object in the ".taglist.init" section.
 */
#define __tag __used __attribute__((__section__(".taglist.init")))
__attribute__是一个特殊的GNU关键字,在这里的用法是:告诉编译器需要将其作用的函数或者数据放入".taglist.init"这一段区域。
也就是说,由__tagtable定义的struct tagtable表项(其中记录了TAG值和对应的解析函数指针)将会被放在section ".taglist.init"这个区域,而且__tagtable_begin指向的就是这个区域的首地址。所以parse_tag()在for循环中遍历这些表项时,只要boot传过来的TAG为ATAG_MEM,
就必然会调用到parse_tag_mem32()。
其中一点要注意的是,parse_tag_mem32()的TAG为ATAG_MEM, 所以在boot传过来的TAG参数如果是要定义为memory参数的话TAG一定要定义为ATAG_MEM,否则parse_tag_mem32()是无法解析到的!
parse_tag_mem32()调用arm_add_memory().
/*start和size参数是从boot传过来的。*/
[html] view plain copy print ?
- int __init arm_add_memory(phys_addr_t start, unsigned long size)
- {
- /*第一次进来meminfo.nr_banks值为0.*/
- struct membank *bank = &meminfo.bank[meminfo.nr_banks];
- /*最多能保存NR_BANKS个bank,本平台为8.*/
- if (meminfo.nr_banks >= NR_BANKS) {
- printk(KERN_CRIT "NR_BANKS too low, "
- "ignoring memory at 0x%08llx\n", (long long)start);
- return -EINVAL;
- }
- /*页对齐后保存物理起始地址。*/
- size -= start & ~PAGE_MASK;
- bank->start = PAGE_ALIGN(start);
- /*保存本bank size.*/
- bank->size = size & PAGE_MASK;
-
- /*
- * Check whether this memory region has non-zero size or
- * invalid node number.
- */
- if (bank->size == 0)
- return -EINVAL;
- /*记录当前拥有bank数量。*/
- meminfo.nr_banks++;
- return 0;
- }
/*
 * arm_add_memory() - append one physical memory region to meminfo.
 * @start: physical start address of the region
 * @size:  size of the region in bytes
 *
 * The region is trimmed to whole pages: the start is rounded up to the next
 * page boundary, the bytes skipped by that rounding are taken off the size,
 * and the remaining size is rounded down to a multiple of the page size.
 *
 * Returns 0 when a bank was recorded, -EINVAL when the bank table is already
 * full or when no whole page is left after trimming.
 */
int __init arm_add_memory(phys_addr_t start, unsigned long size)
{
	/* Next free slot; nr_banks counts the banks recorded so far. */
	struct membank *bank = &meminfo.bank[meminfo.nr_banks];
	phys_addr_t aligned_start;
	phys_addr_t skipped;

	/* meminfo can hold at most NR_BANKS banks. */
	if (meminfo.nr_banks >= NR_BANKS) {
		printk(KERN_CRIT "NR_BANKS too low, "
			"ignoring memory at 0x%08llx\n",
			(unsigned long long)start);
		return -EINVAL;
	}

	/*
	 * Round the start up to a page boundary and drop exactly the bytes
	 * that the rounding skipped. Subtracting the in-page offset of
	 * 'start' instead would either let the bank extend past the real
	 * end of the region or throw away a usable page.
	 */
	aligned_start = PAGE_ALIGN(start);
	skipped = aligned_start - start;
	if (skipped >= size)
		return -EINVAL;
	size -= skipped;

	bank->start = aligned_start;
	/* Round the remaining size down to a whole number of pages. */
	bank->size = size & PAGE_MASK;

	/* Reject the region if less than one full page remains. */
	if (bank->size == 0)
		return -EINVAL;

	/* The slot is now in use. */
	meminfo.nr_banks++;
	return 0;
}
Meminfo检查
在meminfo信息收集完成之后,系统会先对它作一个检查:
start_kernel -> setup_arch -> sanity_check_meminfo.
[html] view plain copy print ?
- void __init sanity_check_meminfo(void)
- {
- int i, j, highmem = 0;
- ~~snip
- /*对每个bank都做检查。*/
- for (i = 0, j = 0; i < meminfo.nr_banks; i++) {
- struct membank *bank = &meminfo.bank[j];
- *bank = meminfo.bank[i];
- /*这里表示是PAE扩展的情况???*/
- if (bank->start > ULONG_MAX)
- highmem = 1;
-
- #ifdef CONFIG_HIGHMEM
- /*如果物理地址比在vmalloc_min之上或者小于内核逻辑
- 映射地址空间(俗称lowmem或者低端内存),那么就被认为是高端内存。
- vmalloc_min被定义为vmalloc的最低地址。关于vmalloc可以了解下linux
- 的虚拟内存空间布局划分。其实它和lowmem最高地址中间还留有8M的
- 空间防止越界。*/
- if (__va(bank->start) >= vmalloc_min ||
- __va(bank->start) < (void *)PAGE_OFFSET)
- highmem = 1;
-
- bank->highmem = highmem;
-
- /*
- * Split those memory banks which are partially overlapping
- * the vmalloc area greatly simplifying things later.
- */
- /*表示meminfo其中的一个bank的物理地址其中一部分处于
- Lowmem,一部分却又处于Highmem,这种情况需要将bank再重新划分
- 成两个bank。*/
- if (!highmem && __va(bank->start) < vmalloc_min &&
- bank->size > vmalloc_min - __va(bank->start)) {
- if (meminfo.nr_banks >= NR_BANKS) {
- printk(KERN_CRIT "NR_BANKS too low, "
- "ignoring high memory\n");
- } else {
- /*将当前跟着的bank元素都往后挪一个位置,以保存新划分出来的
- Bank。*/
- memmove(bank + 1, bank,
- (meminfo.nr_banks - i) * sizeof(*bank));
- meminfo.nr_banks++;
- i++;
- /*保存size和start,既然代码跑这里来了,肯定为highmem了。*/
- bank[1].size -= vmalloc_min - __va(bank->start);
- bank[1].start = __pa(vmalloc_min - 1) + 1;
- bank[1].highmem = highmem = 1;
- j++;
- }
- /*lowmem的size, start保持不变。*/
- bank->size = vmalloc_min - __va(bank->start);
- }
- #else
- bank->highmem = highmem;
- /*系统没有enable high memory时直接忽略highmem.*/
- /*
- * Highmem banks not allowed with !CONFIG_HIGHMEM.
- */
- if (highmem) {
- printk(KERN_NOTICE "Ignoring RAM at %.8llx-%.8llx "
- "(!CONFIG_HIGHMEM).\n",
- (unsigned long long)bank->start,
- (unsigned long long)bank->start + bank->size - 1);
- continue;
- }
- /*判断物理起始地址是不是落在vmalloc区域,或者小于lowmem区域。*/
- /*
- * Check whether this memory bank would entirely overlap
- * the vmalloc area.
- */
- if (__va(bank->start) >= vmalloc_min ||
- __va(bank->start) < (void *)PAGE_OFFSET) {
- printk(KERN_NOTICE "Ignoring RAM at %.8llx-%.8llx "
- "(vmalloc region overlap).\n",
- (unsigned long long)bank->start,
- (unsigned long long)bank->start + bank->size - 1);
- continue;
- }
- /*判断物理结束地址是不是落在vmalloc区域*/
- /*
- * Check whether this memory bank would partially overlap
- * the vmalloc area.
- */
- if (__va(bank->start + bank->size) > vmalloc_min ||
- __va(bank->start + bank->size) < __va(bank->start)) {
- unsigned long newsize = vmalloc_min - __va(bank->start);
- printk(KERN_NOTICE "Truncating RAM at %.8llx-%.8llx "
- "to -%.8llx (vmalloc region overlap).\n",
- (unsigned long long)bank->start,
- (unsigned long long)bank->start + bank->size - 1,
- (unsigned long long)bank->start + newsize - 1);
- bank->size = newsize;
- }
- #endif
- /*当bank的结束地址比当前的arm_lowmem_limit 还要大的话重新更新。*/
- if (!bank->highmem && bank->start + bank->size > arm_lowmem_limit)
- arm_lowmem_limit = bank->start + bank->size;
-
- j++;
- }
- #ifdef CONFIG_HIGHMEM
- if (highmem) {
- const char *reason = NULL;
- /*vipt属于arm cache的一种模式,如果alias了vipt,那么Highmem就
- 不会被使用了。*/
- if (cache_is_vipt_aliasing()) {
- /*
- * Interactions between kmap and other mappings
- * make highmem support with aliasing VIPT caches
- * rather difficult.
- */
- reason = "with VIPT aliasing cache";
- }
- if (reason) {
- printk(KERN_CRIT "HIGHMEM is not supported %s, ignoring high memory\n",
- reason);
- while (j > 0 && meminfo.bank[j - 1].highmem)
- j--;
- }
- }
- #endif
- meminfo.nr_banks = j;
- /* arm_lowmem_limit 以上都被认为是高端内存了。*/
- high_memory = __va(arm_lowmem_limit - 1) + 1;
- memblock_set_current_limit(arm_lowmem_limit);
- }
/*
 * Validate the banks recorded in meminfo and compact the table in place.
 *
 * Each bank is classified as lowmem or highmem by comparing the virtual
 * address of its start with vmalloc_min and PAGE_OFFSET. With
 * CONFIG_HIGHMEM a bank that crosses vmalloc_min is split into a lowmem and
 * a highmem bank; without it, highmem banks are dropped and banks crossing
 * vmalloc_min are truncated. On exit meminfo.nr_banks, arm_lowmem_limit,
 * high_memory and the memblock current limit have been updated.
 */
void __init sanity_check_meminfo(void)
{
int i, j, highmem = 0;
/* NOTE: the article elides part of the original function at this point. */
~~snip
/*
 * Examine every bank. i indexes the entry being read, j the slot being
 * written; a 'continue' skips the j++ at the bottom, so a dropped bank's
 * slot is reused by the next one.
 */
for (i = 0, j = 0; i < meminfo.nr_banks; i++) {
struct membank *bank = &meminfo.bank[j];
*bank = meminfo.bank[i];
/*
 * A start address above ULONG_MAX marks the bank as highmem.
 * NOTE(review): presumably the large-physical-address (PAE) case - confirm.
 * The flag is never cleared inside this loop, so it also applies to
 * every later bank.
 */
if (bank->start > ULONG_MAX)
highmem = 1;
#ifdef CONFIG_HIGHMEM
/*
 * If the virtual address of the bank's start is at or above vmalloc_min,
 * or below PAGE_OFFSET (the start of the kernel's linear "lowmem"
 * mapping), the bank is treated as high memory. Per the article,
 * vmalloc_min is the lowest address of the vmalloc area and an 8MB gap
 * is kept between it and the top of lowmem as a guard.
 */
if (__va(bank->start) >= vmalloc_min ||
__va(bank->start) < (void *)PAGE_OFFSET)
highmem = 1;
bank->highmem = highmem;
/*
* Split those memory banks which are partially overlapping
* the vmalloc area greatly simplifying things later.
*/
/*
 * The bank starts below vmalloc_min but is large enough to reach past it,
 * i.e. part of it is lowmem and part is highmem: divide it into two banks.
 */
if (!highmem && __va(bank->start) < vmalloc_min &&
bank->size > vmalloc_min - __va(bank->start)) {
if (meminfo.nr_banks >= NR_BANKS) {
printk(KERN_CRIT "NR_BANKS too low, "
"ignoring high memory\n");
} else {
/*
 * Move this bank and the not-yet-processed banks up by one slot to
 * make room for the newly created bank.
 */
memmove(bank + 1, bank,
(meminfo.nr_banks - i) * sizeof(*bank));
meminfo.nr_banks++;
i++;
/*
 * Set size and start of the new upper bank. It begins at the physical
 * address matching vmalloc_min, so it is highmem.
 */
bank[1].size -= vmalloc_min - __va(bank->start);
bank[1].start = __pa(vmalloc_min - 1) + 1;
bank[1].highmem = highmem = 1;
j++;
}
/* The lower (lowmem) bank keeps its start; its size is cut at vmalloc_min. */
bank->size = vmalloc_min - __va(bank->start);
}
#else
bank->highmem = highmem;
/* Without CONFIG_HIGHMEM, highmem banks are simply ignored. */
/*
* Highmem banks not allowed with !CONFIG_HIGHMEM.
*/
if (highmem) {
printk(KERN_NOTICE "Ignoring RAM at %.8llx-%.8llx "
"(!CONFIG_HIGHMEM).\n",
(unsigned long long)bank->start,
(unsigned long long)bank->start + bank->size - 1);
continue;
}
/* Does the bank start at/above vmalloc_min or below PAGE_OFFSET? */
/*
* Check whether this memory bank would entirely overlap
* the vmalloc area.
*/
if (__va(bank->start) >= vmalloc_min ||
__va(bank->start) < (void *)PAGE_OFFSET) {
printk(KERN_NOTICE "Ignoring RAM at %.8llx-%.8llx "
"(vmalloc region overlap).\n",
(unsigned long long)bank->start,
(unsigned long long)bank->start + bank->size - 1);
continue;
}
/* Does the bank end above vmalloc_min (or wrap around)? */
/*
* Check whether this memory bank would partially overlap
* the vmalloc area.
*/
if (__va(bank->start + bank->size) > vmalloc_min ||
__va(bank->start + bank->size) < __va(bank->start)) {
unsigned long newsize = vmalloc_min - __va(bank->start);
printk(KERN_NOTICE "Truncating RAM at %.8llx-%.8llx "
"to -%.8llx (vmalloc region overlap).\n",
(unsigned long long)bank->start,
(unsigned long long)bank->start + bank->size - 1,
(unsigned long long)bank->start + newsize - 1);
bank->size = newsize;
}
#endif
/* Raise arm_lowmem_limit when this lowmem bank ends above the current value. */
if (!bank->highmem && bank->start + bank->size > arm_lowmem_limit)
arm_lowmem_limit = bank->start + bank->size;
j++;
}
#ifdef CONFIG_HIGHMEM
if (highmem) {
const char *reason = NULL;
/*
 * With a VIPT aliasing cache highmem is not used: the trailing highmem
 * banks are dropped from the table below.
 */
if (cache_is_vipt_aliasing()) {
/*
* Interactions between kmap and other mappings
* make highmem support with aliasing VIPT caches
* rather difficult.
*/
reason = "with VIPT aliasing cache";
}
if (reason) {
printk(KERN_CRIT "HIGHMEM is not supported %s, ignoring high memory\n",
reason);
while (j > 0 && meminfo.bank[j - 1].highmem)
j--;
}
}
#endif
/* j is the number of banks that were kept. */
meminfo.nr_banks = j;
/* Everything above arm_lowmem_limit is regarded as high memory. */
high_memory = __va(arm_lowmem_limit - 1) + 1;
memblock_set_current_limit(arm_lowmem_limit);
}
vmalloc_min在编译的时候就已经被赋予了初始值:
[html] view plain copy print ?
- static void * __initdata vmalloc_min =
- (void*)(VMALLOC_END - (240 << 20) - VMALLOC_OFFSET);
/*
 * Boundary address used by the early memory setup to separate lowmem from
 * the vmalloc area. Compile-time default: VMALLOC_END minus 240MB
 * (240 << 20) minus VMALLOC_OFFSET.
 */
static void * __initdata vmalloc_min =
(void*)(VMALLOC_END - (240 << 20) - VMALLOC_OFFSET);
VMALLOC_END:表示vmalloc区域结束地址。
240<<20:vmalloc区域有240M大小。
VMALLOC_OFFSET:为8M。vmalloc区域和lowmem区域有8M的空闲区间,防止访问越界。
当然,vmalloc_min也可以通过cmdline的方式传到kernel中作修改。
[html] view plain copy print ?
- static int __init early_vmalloc(char *arg)
- {
- /*将vmalloc size解析成unsigned long类型。*/
- unsigned long vmalloc_reserve = memparse(arg, NULL);
-
- if (vmalloc_reserve < SZ_16M) {
- vmalloc_reserve = SZ_16M;
- printk(KERN_WARNING
- "vmalloc area too small, limiting to %luMB\n",
- vmalloc_reserve >> 20);
- }
-
- if (vmalloc_reserve > VMALLOC_END - (PAGE_OFFSET + SZ_32M)) {
- vmalloc_reserve = VMALLOC_END - (PAGE_OFFSET + SZ_32M);
- printk(KERN_WARNING
- "vmalloc area is too big, limiting to %luMB\n",
- vmalloc_reserve >> 20);
- }
- /*改变vmalloc_min变量,这样就得到了自己想要的vmalloc size了。*/
- vmalloc_min = (void *)(VMALLOC_END - vmalloc_reserve);
- return 0;
- }
- early_param("vmalloc", early_vmalloc);
/*
 * Handler for the "vmalloc" early parameter.
 *
 * Parses the requested vmalloc size from @arg with memparse(). A value
 * below SZ_16M is raised to SZ_16M and a value above
 * VMALLOC_END - (PAGE_OFFSET + SZ_32M) is lowered to that bound; a warning
 * is printed in either case. vmalloc_min is then set so that the span from
 * vmalloc_min up to VMALLOC_END has the resulting size.
 *
 * Always returns 0.
 */
static int __init early_vmalloc(char *arg)
{
	unsigned long reserve = memparse(arg, NULL);

	/* Enforce the lower bound first ... */
	if (SZ_16M > reserve) {
		reserve = SZ_16M;
		printk(KERN_WARNING
		       "vmalloc area too small, limiting to %luMB\n",
		       reserve >> 20);
	}

	/* ... then the upper bound. */
	if (VMALLOC_END - (PAGE_OFFSET + SZ_32M) < reserve) {
		reserve = VMALLOC_END - (PAGE_OFFSET + SZ_32M);
		printk(KERN_WARNING
		       "vmalloc area is too big, limiting to %luMB\n",
		       reserve >> 20);
	}

	/* Move vmalloc_min to obtain the requested vmalloc size. */
	vmalloc_min = (void *)(VMALLOC_END - reserve);

	return 0;
}
early_param("vmalloc", early_vmalloc);
vmalloc_min的变化也会导致lowmem也就是低端的内存大小的变化。所以实际应用中,high memory的定义并非一定像书上所说的为896M之上。
Meminfo使用:
做完了检查之后就是使用了。在使用部分,meminfo的信息其实都传给了一个叫struct memblock的结构,后续由它来承担保存内存区域信息的责任。它会将一些必要的区域保留出来供系统使用,例如kernel的text(代码)段;其余未被保留的部分系统才能使用。来看看实现函数arm_memblock_init()。
[html] view plain copy print ?
- void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
- {
- int i;
- /*将struct meminfo的信息都放入到了struct memblock中去,它会将保留的区域和空闲的区域用memory 和reserved变量分别保存。*/
- for (i = 0; i < mi->nr_banks; i++)
- memblock_add(mi->bank[i].start, mi->bank[i].size);
-
- /* kernel的text段需要作为保留部分。其实看system.map会发现
- _stext为symbol里的起始地址,而_end为结束地址。所以这块memblock
- Region包括了virtual memory layout中的.init, .bss, .data, .text这几个区域。*/
- memblock_reserve(__pa(_stext), _end - _stext);
- /* 本平台的phys_initrd_start 这里为0.*/
- #ifdef CONFIG_BLK_DEV_INITRD
- if (phys_initrd_size &&
- !memblock_is_region_memory(phys_initrd_start, phys_initrd_size)) {
- pr_err("INITRD: 0x%08lx+0x%08lx is not a memory region - disabling initrd\n",
- phys_initrd_start, phys_initrd_size);
- phys_initrd_start = phys_initrd_size = 0;
- }
- if (phys_initrd_size &&
- memblock_is_region_reserved(phys_initrd_start, phys_initrd_size)) {
- pr_err("INITRD: 0x%08lx+0x%08lx overlaps in-use memory region - disabling initrd\n",
- phys_initrd_start, phys_initrd_size);
- phys_initrd_start = phys_initrd_size = 0;
- }
- if (phys_initrd_size) {
- memblock_reserve(phys_initrd_start, phys_initrd_size);
-
- /* Now convert initrd to virtual addresses */
- initrd_start = __phys_to_virt(phys_initrd_start);
- initrd_end = initrd_start + phys_initrd_size;
- }
- #endif
- /*这部分空间是给页表留着的。*/
- arm_mm_memblock_reserve();
- /*空函数。*/
- arm_dt_memblock_reserve();
- /*如果平台有定义自己的reserve函数,那么调用它。
- 在前面的mempool文章中,我们已经分析过了,平台会
- 预留一百多M的memory供系统ION分配。*/
- /* reserve any platform specific memblock areas */
- if (mdesc->reserve)
- mdesc->reserve();
- /*关于cma,是系统为了reserved memory而出现的。
- 它的优点是:当某些模块如audio/camera需要连续物理大块内存
- 时,能申请到,而不用的时候,又可以被其他模块申请。避免了内存
- 浪费。其原理利用的是内存数据迁移。不过本平台没有使用到。*/
- /*
- * reserve memory for DMA contigouos allocations,
- * must come from DMA area inside low memory
- */
- dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));
-
- arm_memblock_steal_permitted = false;
- memblock_allow_resize();
- memblock_dump_all();
- }
/*
 * Hand the memory banks in @mi over to memblock and reserve the regions
 * that must not be given out: the kernel image, the initrd (when
 * configured and valid), whatever arm_mm_memblock_reserve() and
 * arm_dt_memblock_reserve() reserve, the machine's own reservations via
 * @mdesc->reserve, and the DMA contiguous area.
 *
 * @mi:    bank table to register with memblock
 * @mdesc: machine descriptor; its optional reserve() hook is called
 */
void __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
{
int i;
/*
 * Register every bank of struct meminfo with memblock. Per the article,
 * memblock tracks available and reserved regions separately in its
 * "memory" and "reserved" lists (not visible in this excerpt).
 */
for (i = 0; i < mi->nr_banks; i++)
memblock_add(mi->bank[i].start, mi->bank[i].size);
/*
 * Reserve the kernel image, from _stext up to _end. Author's note: judging
 * by System.map these are the first and last symbol addresses, so the
 * reserved region covers the .init, .bss, .data and .text areas of the
 * virtual memory layout.
 */
memblock_reserve(__pa(_stext), _end - _stext);
/* Author's note: phys_initrd_start is 0 on this platform. */
#ifdef CONFIG_BLK_DEV_INITRD
if (phys_initrd_size &&
!memblock_is_region_memory(phys_initrd_start, phys_initrd_size)) {
pr_err("INITRD: 0x%08lx+0x%08lx is not a memory region - disabling initrd\n",
phys_initrd_start, phys_initrd_size);
phys_initrd_start = phys_initrd_size = 0;
}
if (phys_initrd_size &&
memblock_is_region_reserved(phys_initrd_start, phys_initrd_size)) {
pr_err("INITRD: 0x%08lx+0x%08lx overlaps in-use memory region - disabling initrd\n",
phys_initrd_start, phys_initrd_size);
phys_initrd_start = phys_initrd_size = 0;
}
if (phys_initrd_size) {
memblock_reserve(phys_initrd_start, phys_initrd_size);
/* Now convert initrd to virtual addresses */
initrd_start = __phys_to_virt(phys_initrd_start);
initrd_end = initrd_start + phys_initrd_size;
}
#endif
/* Author's note: this reserves the space kept for the page tables. */
arm_mm_memblock_reserve();
/* Author's note: an empty function on this platform. */
arm_dt_memblock_reserve();
/*
 * If the machine defines its own reserve hook, call it. Author's note: on
 * this platform it sets aside a hundred-odd MB of memory for ION
 * allocations (covered in the author's earlier mempool article).
 */
/* reserve any platform specific memblock areas */
if (mdesc->reserve)
mdesc->reserve();
/*
 * Author's note on CMA: it exists to handle reserved memory. Modules such
 * as audio/camera that need large physically contiguous buffers can get
 * them, and while they do not use the memory other users can allocate it,
 * which avoids waste; it relies on migrating memory contents. It is not
 * used on this platform.
 */
/*
* reserve memory for DMA contigouos allocations,
* must come from DMA area inside low memory
*/
dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));
arm_memblock_steal_permitted = false;
memblock_allow_resize();
memblock_dump_all();
}
到这里,接下来的任务基本上就交给struct memblock完成了!
20130318