详细的概念性解释就不说了,如果对vmalloc没有一点概念的话,可以稍微找些资料了解下,这里主要就是分析下在内核中vmalloc的实现;
直接物理内存映射(内核逻辑地址)-- 8 MB -- vm -- 1 page -- vm -- 1page --vm ......
大概就是这样:逻辑地址以high_memory为结束边界;然后是 8MB 的空洞(主要是防止指针越界访问);接着以VMALLOC_START为起始边界,开始vmalloc区域,该区域由多个vm小区域组成,每个小区域之间有1页(一个page大小)的空洞地址,作用还是防止越界访问;vmalloc区域以VMALLOC_END为结束边界,后面还有一个空洞地址,最后就是固定映射和临时映射的区域了;
结构体:
/*
 * Descriptor for one sub-area of the vmalloc region of the kernel
 * virtual address space.
 */
struct vm_struct {
	struct vm_struct *next;		/* links every vm_struct into one list; vmlist is the list head */
	void *addr;			/* start virtual address of the allocated sub-area */
	unsigned long size;		/* length of the area */
	unsigned long flags;		/* flag bits (e.g. VM_ALLOC, VM_IOREMAP, VM_VPAGES, VM_UNLIST) */
	struct page **pages;		/* array of pointers, one entry per mapped page */
	unsigned int nr_pages;		/* number of pages that are mapped */
	phys_addr_t phys_addr;
	const void *caller;		/* return address passed down by the allocation path */
};
/*
 * Worth noting: this structure closely resembles the VMA used to describe
 * a process's virtual address space.
 */
下面这个结构体是用来管理KVA(kernel virtual address,内核虚拟地址)区间的:
/*
 * One allocated range of kernel virtual address space (KVA).
 * alloc_vmap_area() fills in va_start/va_end and links the node into the
 * address-sorted rbtree and list.
 */
struct vmap_area {
	unsigned long va_start;		/* first address of the range */
	unsigned long va_end;		/* va_start + size, i.e. one past the last byte */
	unsigned long flags;
	struct rb_node rb_node;		/* address sorted rbtree */
	struct list_head list;		/* address sorted list */
	struct list_head purge_list;	/* "lazy purge" list */
	struct vm_struct *vm;
	struct rcu_head rcu_head;
};
/**
 *	vmalloc  -  allocate virtually contiguous memory
 *	@size:		allocation size
 *	Allocate enough pages to cover @size from the page level
 *	allocator and map them into contiguous kernel virtual space.
 *
 *	For tight control over page level allocator and protection flags
 *	use __vmalloc() instead.
 */
void *vmalloc(unsigned long size)
{
	/* __GFP_HIGHMEM: the backing pages may be taken from high memory */
	return __vmalloc_node_flags(size, NUMA_NO_NODE,
				    GFP_KERNEL | __GFP_HIGHMEM);
}

/*
 * Forward to __vmalloc_node() with an alignment of 1, PAGE_KERNEL
 * protection, and the return address obtained here as the caller tag.
 */
static inline void *__vmalloc_node_flags(unsigned long size,
					int node, gfp_t flags)
{
	return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
					node, __builtin_return_address(0));
}
/*
 * __builtin_return_address(0) yields the return address of the current
 * function, i.e. the address execution continues at once this function,
 * having been called by some other function, finishes and returns.
 * __builtin_return_address(1) yields the return address of the current
 * function's caller. Note that it is the caller's *return address*, not the
 * start address of a function.
 */
/**
 *	__vmalloc_node  -  allocate virtually contiguous memory
 *	@size:		allocation size
 *	@align:		desired alignment
 *	@gfp_mask:	flags for the page level allocator
 *	@prot:		protection mask for the allocated pages
 *	@node:		node to use for allocation or NUMA_NO_NODE
 *	@caller:	caller's return address
 *
 *	Allocate enough pages to cover @size from the page level
 *	allocator with @gfp_mask flags.  Map them into contiguous
 *	kernel virtual space, using a pagetable protection of @prot.
 */
static void *__vmalloc_node(unsigned long size, unsigned long align,
			    gfp_t gfp_mask, pgprot_t prot,
			    int node, const void *caller)
{
	/* Thin wrapper: search the window bounded by VMALLOC_START and VMALLOC_END. */
	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
				gfp_mask, prot, node, caller);
}
这是个主要函数,说明下参数:
unsigned long size :表示要分配的内存大小;
unsigned long align:表示以什么对齐,到这里是 1;
unsigned long start:表示映射区域从什么地方开始查找,这里为:VMALLOC_START;
unsigned long end :表示映射区域从什么地方结束查找,这里为:VMALLOC_END;
gfp_t gfp_mask:表示分配的标识,这里为:GFP_KERNEL | __GFP_HIGHMEM;
pgprot_t prot:表示区域的保护模式,这里为:PAGE_KERNEL;
int node:表示分配节点,这里为:-1;
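const void *caller:表示调用者的返回地址;这里的值是在__vmalloc_node_flags()中通过__builtin_return_address(0)取得的,由于__vmalloc_node_flags()是static inline函数,会被内联进vmalloc(),所以得到的实际上是vmalloc()的返回地址(即调用vmalloc()的那段代码中的地址),而不是__vmalloc_node的返回地址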
/**
 *	__vmalloc_node_range  -  allocate virtually contiguous memory
 *	@size:		allocation size
 *	@align:		desired alignment
 *	@start:		vm area range start
 *	@end:		vm area range end
 *	@gfp_mask:	flags for the page level allocator
 *	@prot:		protection mask for the allocated pages
 *	@node:		node to use for allocation or NUMA_NO_NODE
 *	@caller:	caller's return address
 *
 *	Allocate enough pages to cover @size from the page level
 *	allocator with @gfp_mask flags.  Map them into contiguous
 *	kernel virtual space, using a pagetable protection of @prot.
 *
 *	Returns the start address of the mapping, or NULL on failure.
 */
void *__vmalloc_node_range(unsigned long size, unsigned long align,
			unsigned long start, unsigned long end, gfp_t gfp_mask,
			pgprot_t prot, int node, const void *caller)
{
	struct vm_struct *area;
	void *addr;
	unsigned long real_size = size;	/* unrounded size, kept for kmemleak and the warning */

	/* Round up to whole pages: the mapping is established page by page. */
	size = PAGE_ALIGN(size);
	/* Reject empty requests and requests for more pages than totalram_pages. */
	if (!size || (size >> PAGE_SHIFT) > totalram_pages)
		goto fail;

	/* Reserve the virtual range and its vm_struct; may return NULL. */
	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
				  start, end, node, gfp_mask, caller);
	if (!area)
		goto fail;

	addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
	/*
	 * No "goto fail" here: every failure path of __vmalloc_area_node()
	 * has already released the area itself.
	 */
	if (!addr)
		return NULL;

	/*
	 * In this function, newly allocated vm_struct has VM_UNLIST flag.
	 * It means that vm_struct is not fully initialized.
	 * Now, it is fully initialized, so remove this flag here.
	 */
	clear_vm_unlist(area);

	/*
	 * A ref_count = 3 is needed because the vm_struct and vmap_area
	 * structures allocated in the __get_vm_area_node() function contain
	 * references to the virtual address of the vmalloc'ed block.
	 */
	kmemleak_alloc(addr, real_size, 3, gfp_mask);

	return addr;

fail:
	warn_alloc_failed(gfp_mask, 0,
			  "vmalloc: allocation failure: %lu bytes\n",
			  real_size);
	return NULL;
}
/*
 * Allocate a vm_struct and reserve a range of kernel virtual addresses for
 * it inside [start, end].  One extra guard page is added to @size before the
 * range is reserved.  Returns the new vm_struct, or NULL if @size rounds to
 * zero, if the vm_struct cannot be allocated, or if no address range is
 * available.  Must not be called from interrupt context (BUG_ON).
 */
static struct vm_struct *__get_vm_area_node(unsigned long size,
		unsigned long align, unsigned long flags, unsigned long start,
		unsigned long end, int node, gfp_t gfp_mask, const void *caller)
{
	struct vmap_area *va;
	struct vm_struct *area;

	BUG_ON(in_interrupt());
	/*
	 * For ioremap mappings the alignment is derived from the size:
	 * 2^fls(size), clamped to the range
	 * [1 << PAGE_SHIFT, 1 << IOREMAP_MAX_ORDER].
	 */
	if (flags & VM_IOREMAP) {
		int bit = fls(size);

		if (bit > IOREMAP_MAX_ORDER)
			bit = IOREMAP_MAX_ORDER;
		else if (bit < PAGE_SHIFT)
			bit = PAGE_SHIFT;

		align = 1ul << bit;
	}

	/* Round up to a whole number of pages. */
	size = PAGE_ALIGN(size);
	if (unlikely(!size))
		return NULL;

	/* Allocate the (zeroed) vm_struct descriptor itself. */
	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!area))
		return NULL;

	/*
	 * We always allocate a guard page.
	 */
	/*
	 * The guard page only consumes address space: __vmalloc_area_node()
	 * subtracts PAGE_SIZE again before allocating physical pages.
	 */
	size += PAGE_SIZE;

	/* Reserve a range of kernel virtual address space (KVA). */
	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
	if (IS_ERR(va)) {
		kfree(area);
		return NULL;
	}

	/*
	 * When this function is called from __vmalloc_node_range,
	 * we add VM_UNLIST flag to avoid accessing uninitialized
	 * members of vm_struct such as pages and nr_pages fields.
	 * They will be set later.
	 */
	/*
	 * NOTE(review): both helpers presumably copy the range from va into
	 * area; their bodies are not shown here, confirm.
	 */
	if (flags & VM_UNLIST)
		setup_vmalloc_vm(area, va, flags, caller);
	else
		insert_vmalloc_vm(area, va, flags, caller);

	return area;
}
下面是从[vstart, vend]范围内,按指定的大小和对齐方式分配一段KVA(内核虚拟地址)区域的函数:
/*
 * Allocate a region of KVA of the specified size and alignment, within the
 * vstart and vend.
 *
 * Returns the new vmap_area on success, ERR_PTR(-ENOMEM) if the descriptor
 * cannot be allocated, or ERR_PTR(-EBUSY) if no suitable hole exists even
 * after one purge_vmap_area_lazy() pass.
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va;
	struct rb_node *n;
	unsigned long addr;
	int purged = 0;
	struct vmap_area *first;

	BUG_ON(!size);				/* size must not be zero */
	BUG_ON(size & ~PAGE_MASK);		/* size must be page aligned */
	BUG_ON(!is_power_of_2(align));		/* align must be a power of two */

	/* Allocate the descriptor before taking the lock. */
	va = kmalloc_node(sizeof(struct vmap_area),
			gfp_mask & GFP_RECLAIM_MASK, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

retry:
	spin_lock(&vmap_area_lock);
	/*
	 * Invalidate cache if we have more permissive parameters.
	 * cached_hole_size notes the largest hole noticed _below_
	 * the vmap_area cached in free_vmap_cache: if size fits
	 * into that hole, we want to scan from vstart to reuse
	 * the hole instead of allocating above free_vmap_cache.
	 * Note that __free_vmap_area may update free_vmap_cache
	 * without updating cached_hole_size or cached_align.
	 */
	/*
	 * Decide whether the cached starting point is usable: it must exist,
	 * and this request's size, start and alignment must not be smaller
	 * than the values the cache was built with.
	 */
	if (!free_vmap_cache ||
			size < cached_hole_size ||
			vstart < cached_vstart ||
			align < cached_align) {
nocache:
		/* Also entered by goto below; the search then restarts from vstart. */
		cached_hole_size = 0;
		free_vmap_cache = NULL;
	}
	/* record if we encounter less permissive parameters */
	cached_vstart = vstart;
	cached_align = align;

	/* find starting point for our search */
	if (free_vmap_cache) {
		/* Start right after the cached area, aligned as requested. */
		first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
		addr = ALIGN(first->va_end, align);
		if (addr < vstart)	/* cached area ends below the window: cache is useless */
			goto nocache;
		if (addr + size < addr)	/* address arithmetic wrapped around */
			goto overflow;

	} else {
		/* No cache: start at the aligned beginning of the window. */
		addr = ALIGN(vstart, align);
		if (addr + size < addr)
			goto overflow;

		n = vmap_area_root.rb_node;
		first = NULL;

		/*
		 * Walk the rbtree for the lowest area whose end is at or
		 * above addr; stop early if an area contains addr.
		 */
		while (n) {
			struct vmap_area *tmp;
			tmp = rb_entry(n, struct vmap_area, rb_node);
			if (tmp->va_end >= addr) {
				first = tmp;
				if (tmp->va_start <= addr)	/* addr lies inside tmp */
					break;
				n = n->rb_left;		/* look for a lower candidate */
			} else
				n = n->rb_right;	/* tmp ends below addr: go higher */
		}

		/* Every existing area ends below addr, so addr itself is free. */
		if (!first)
			goto found;
	}

	/* from the starting point, walk areas until a suitable hole is found */
	while (addr + size > first->va_start && addr + size <= vend) {
		/* Track the largest hole skipped so far. */
		if (addr + cached_hole_size < first->va_start)
			cached_hole_size = first->va_start - addr;
		/* Skip past this area: the next candidate starts at its end. */
		addr = ALIGN(first->va_end, align);
		if (addr + size < addr)
			goto overflow;

		/* Past the last area everything up to vend is free. */
		if (list_is_last(&first->list, &vmap_area_list))
			goto found;

		first = list_entry(first->list.next,
				struct vmap_area, list);
	}

found:
	/* The candidate must still fit below the end of the window. */
	if (addr + size > vend)
		goto overflow;

	/* Fill in the descriptor and publish it in the rbtree and list. */
	va->va_start = addr;
	va->va_end = addr + size;
	va->flags = 0;
	__insert_vmap_area(va);
	free_vmap_cache = &va->rb_node;
	spin_unlock(&vmap_area_lock);

	BUG_ON(va->va_start & (align-1));
	BUG_ON(va->va_start < vstart);
	BUG_ON(va->va_end > vend);

	return va;

overflow:
	/* No room: purge lazily-freed areas once and retry before giving up. */
	spin_unlock(&vmap_area_lock);
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}
	if (printk_ratelimit())
		printk(KERN_WARNING
			"vmap allocation for size %lu failed: "
			"use vmalloc=<size> to increase size.\n", size);
	kfree(va);
	return ERR_PTR(-EBUSY);
}
/*
 * Allocate the physical pages backing @area and map them into its virtual
 * range with protection @prot.  Returns area->addr on success.  On failure
 * the area is released here and NULL is returned, so the caller must not
 * free it again.
 */
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
				 pgprot_t prot, int node, const void *caller)
{
	const int order = 0;
	struct page **pages;
	unsigned int nr_pages, array_size, i;
	/* Flags for the page-pointer array only; __GFP_ZERO makes the array start out zeroed. */
	gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;

	/* area->size includes the guard page, which gets no physical page. */
	nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
	array_size = (nr_pages * sizeof(struct page *));

	area->nr_pages = nr_pages;
	/* Please note that the recursion is strictly bounded. */
	if (array_size > PAGE_SIZE) {
		/* Array larger than one page: allocate it with vmalloc (recursive). */
		pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
				PAGE_KERNEL, node, caller);
		/* Remember this so __vunmap() releases the array with vfree(). */
		area->flags |= VM_VPAGES;
	} else {
		/* Small array: kmalloc is enough. */
		pages = kmalloc_node(array_size, nested_gfp, node);
	}
	area->pages = pages;
	/*
	 * Return address handed down from the entry point.
	 * NOTE(review): presumably kept for diagnostics (who allocated this
	 * area) - confirm against the users of vm_struct.caller.
	 */
	area->caller = caller;
	if (!area->pages) {
		/* No page array: drop the virtual range and the descriptor. */
		remove_vm_area(area->addr);
		kfree(area);
		return NULL;
	}

	for (i = 0; i < area->nr_pages; i++) {
		struct page *page;
		gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;

		if (node < 0)
			page = alloc_page(tmp_mask);
		else
			page = alloc_pages_node(node, tmp_mask, order);

		/* A single failed page fails the whole allocation. */
		if (unlikely(!page)) {
			/* Successfully allocated i pages, free them in __vunmap() */
			area->nr_pages = i;
			goto fail;
		}
		area->pages[i] = page;
	}

	/* Install the page table entries for the collected pages. */
	if (map_vm_area(area, prot, &pages))
		goto fail;
	return area->addr;

fail:
	warn_alloc_failed(gfp_mask, order,
			  "vmalloc: allocation failure, allocated %ld of %ld bytes\n",
			  (area->nr_pages*PAGE_SIZE), area->size);
	/* vfree() releases the pages recorded so far, the array and the area. */
	vfree(area->addr);
	return NULL;
}
-------------------------------释放vmalloc分配的页==vfree()-------------------------------------
/**
 *	vfree  -  release memory allocated by vmalloc()
 *	@addr:		memory base address
 *
 *	Free the virtually continuous memory area starting at @addr, as
 *	obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
 *	NULL, no operation is performed.
 *
 *	Must not be called in NMI context (strictly speaking, only if we don't
 *	have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
 *	conventions for vfree() arch-dependent would be a really bad idea)
 *
 *	NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
 *
 */
void vfree(const void *addr)
{
	BUG_ON(in_nmi());

	/* kmemleak bookkeeping hook; called before the NULL check below. */
	kmemleak_free(addr);

	if (!addr)
		return;
	if (unlikely(in_interrupt())) {
		/*
		 * In interrupt context the unmap is deferred: the block
		 * itself is reused as an llist_node, queued on this CPU's
		 * vfree_deferred list, and a work item is scheduled.
		 */
		struct vfree_deferred *p = &__get_cpu_var(vfree_deferred);
		llist_add((struct llist_node *)addr, &p->list);
		schedule_work(&p->wq);
	} else
		__vunmap(addr, 1);	/* 1: also free the backing pages */
}
释放的主要函数,vmalloc和其他虚拟映射的地址释放也是调用该函数:参数是:addr和1
/*
 * Tear down the vm area that starts at @addr.  When @deallocate_pages is
 * non-zero, the physical pages recorded in the area and the page-pointer
 * array are freed as well.  A NULL, misaligned or unknown @addr is ignored,
 * the latter two with a warning.
 */
static void __vunmap(const void *addr, int deallocate_pages)
{
	struct vm_struct *area;

	if (!addr)
		return;

	/* The address must be page aligned. */
	if ((PAGE_SIZE-1) & (unsigned long)addr) {
		WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
		return;
	}

	/* Release the virtual address range and get its descriptor back. */
	area = remove_vm_area(addr);
	if (unlikely(!area)) {
		WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
				addr);
		return;
	}

	debug_check_no_locks_freed(addr, area->size);
	debug_check_no_obj_freed(addr, area->size);

	if (deallocate_pages) {
		int i;

		/* Give every backing physical page back. */
		for (i = 0; i < area->nr_pages; i++) {
			struct page *page = area->pages[i];

			BUG_ON(!page);
			__free_page(page);
		}

		/*
		 * The page array was itself vmalloc'ed when it was larger
		 * than one page (VM_VPAGES); otherwise it came from kmalloc.
		 */
		if (area->flags & VM_VPAGES)
			vfree(area->pages);
		else
			kfree(area->pages);
	}

	kfree(area);
	return;
}