// brk系统调用:用来扩大或者缩小进程的数据段边界,参数brk为新的数据段边界,其实现在文件mm/mmap.c中。
函数实现如下:
/*
 * brk system call: move the end of the calling process's data segment to
 * @brk, either growing or shrinking it.
 *
 * @brk: requested new end address of the data segment.
 *
 * Returns mm->brk as it stands on exit: @brk when the request was applied,
 * otherwise the unchanged previous value. No error code is returned.
 */
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
	unsigned long rlim, retval;
	unsigned long newbrk, oldbrk;
	struct mm_struct *mm = current->mm;
	unsigned long min_brk;

	/* Take mmap_sem for writing; it is released at the out label. */
	down_write(&mm->mmap_sem);

#ifdef CONFIG_COMPAT_BRK
	min_brk = mm->end_code;
#else
	min_brk = mm->start_brk;
#endif
	/* Argument check: refuse a break below the lowest permitted address. */
	if (brk < min_brk)
		goto out;

	/*
	 * Check against rlimit here. If this check is done later after the test
	 * of oldbrk with newbrk then it can escape the test and let the data
	 * segment grow beyond its set limit the in case where the limit is
	 * not page aligned -Ram Gupta
	 */
	rlim = rlimit(RLIMIT_DATA);	/* data segment limit, in bytes */
	if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
			(mm->end_data - mm->start_data) > rlim)
		goto out;

	/* Page-align both the requested and the current break. */
	newbrk = PAGE_ALIGN(brk);
	oldbrk = PAGE_ALIGN(mm->brk);
	if (oldbrk == newbrk)
		goto set_brk;

	/*
	 * Always allow shrinking brk: the new boundary is not above the
	 * current one, so unmap the pages in between.
	 */
	if (brk <= mm->brk) {
		if (!do_munmap(mm, newbrk, oldbrk-newbrk))
			goto set_brk;
		goto out;
	}

	/* Check against existing mmap mappings in the range we would grow into. */
	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
		goto out;

	/* Ok, looks good - let it rip: perform the actual expansion. */
	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
		goto out;
set_brk:
	mm->brk = brk;
out:
	retval = mm->brk;
	up_write(&mm->mmap_sem);
	return retval;
}
由于这个函数既可以用来分配空间,即把动态分配区(堆)的边界往上推;也可以用来释放空间,即把边界往下拉、归还空间,因此它的代码大致可以分为两部分:收缩数据区和伸长数据区。下面分这两种情况来分析,首先是收缩操作。
从上面的代码我们可以看出。用户空间的收缩操作相应的接口是:do_munmap()。代码如下:
/* Munmap is split into 2 main parts -- this part which finds
 * what needs doing, and the areas themselves, which do the
 * work.  This now handles partial unmappings.
 * Jeremy Fitzhardinge <[email protected]>
 *
 * @mm:    memory descriptor whose mappings are being removed
 * @start: first address to unmap; must be page aligned
 * @len:   length of the range in bytes; it is page-aligned before use
 *
 * Returns 0 on success or when no vma overlaps the range, -EINVAL for an
 * invalid range, -ENOMEM when a split would exceed sysctl_max_map_count,
 * or the error returned by __split_vma().
 */
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
{
	unsigned long end;
	struct vm_area_struct *vma, *prev, *last;

	if ((start & ~PAGE_MASK) || start > TASK_SIZE || len > TASK_SIZE-start)
		return -EINVAL;

	if ((len = PAGE_ALIGN(len)) == 0)
		return -EINVAL;

	/*
	 * Find the first overlapping VMA, i.e. the first one whose end
	 * address is above start; prev receives the vma preceding it.
	 */
	vma = find_vma_prev(mm, start, &prev);
	if (!vma)
		return 0;
	/* we have  start < vma->vm_end  */

	/*
	 * if it doesn't overlap, we have nothing..
	 *
	 * On the brk() shrink path start is the new break and len the amount
	 * being released, so start + len is the old break.
	 */
	end = start + len;
	if (vma->vm_start >= end)
		return 0;

	/*
	 * If we need to split any vma, do it now to save pain later.
	 *
	 * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
	 * unmapped vm_area_struct will remain in use: so lower split_vma
	 * places tmp vma above, and higher split_vma places tmp vma below.
	 */
	/* start falls inside the first vma: split that vma in two at start. */
	if (start > vma->vm_start) {
		int error;

		/*
		 * Make sure that map_count on return from munmap() will
		 * not exceed its limit; but let map_count go just above
		 * its limit temporarily, to help free resources as expected.
		 */
		if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
			return -ENOMEM;

		error = __split_vma(mm, vma, start, 0);
		if (error)
			return error;
		prev = vma;
	}

	/* Does it split the last one? Look up the vma containing end. */
	last = find_vma(mm, end);
	/* end falls inside the last vma: split that vma in two at end. */
	if (last && end > last->vm_start) {
		int error = __split_vma(mm, last, end, 1);
		if (error)
			return error;
	}
	vma = prev? prev->vm_next: mm->mmap;

	/*
	 * unlock any mlock()ed ranges before detaching vmas
	 */
	if (mm->locked_vm) {
		struct vm_area_struct *tmp = vma;
		while (tmp && tmp->vm_start < end) {
			if (tmp->vm_flags & VM_LOCKED) {
				mm->locked_vm -= vma_pages(tmp);
				munlock_vma_pages_all(tmp);
			}
			tmp = tmp->vm_next;
		}
	}

	/*
	 * Remove the vma's, and unmap the actual pages
	 */
	/* Unlink the vmas covering [start, end) from the mm. */
	detach_vmas_to_be_unmapped(mm, vma, prev, end);
	/* Update the page table entries and release the page frames. */
	unmap_region(mm, vma, prev, start, end);

	/*
	 * Fix up all other VM information: every vma to be freed now hangs
	 * off the list headed by vma, which remove_vma_list() processes.
	 */
	remove_vma_list(mm, vma);

	return 0;
}
为了弄清楚收缩的整个过程,有必要详细的分析一下函数所调用的各个子函数。
__split_vma:将一个vma劈成两个:
//参数含义:
//mm:进程的内存描述符 vma:要劈分的vma addr:为界线地址 new_below:为0时,vma保留下一半(新建的vma为上一半);为1时,vma保留上一半(新建的vma为下一半)
/* * __split_vma() bypasses sysctl_max_map_count checking. We use this on the * munmap path where it doesn't make sense to fail. */ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, int new_below) { struct mempolicy *pol; struct vm_area_struct *new; int err = -ENOMEM; //如果进程的vma总数超过了限制值 zlh if (is_vm_hugetlb_page(vma) && (addr & ~(huge_page_mask(hstate_vma(vma))))) return -EINVAL; //新申请一个vma new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) goto out_err; //将新的vma赋值为旧的vma,使其两者相等 /* most fields are the same, copy all, and then fixup */ *new = *vma; INIT_LIST_HEAD(&new->anon_vma_chain); //new_below为1的时候,vma为上一半,对应的new为下一半 if (new_below) new->vm_end = addr; else { new->vm_start = addr; new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } pol = mpol_dup(vma_policy(vma)); if (IS_ERR(pol)) { err = PTR_ERR(pol); goto out_free_vma; } vma_set_policy(new, pol); if (anon_vma_clone(new, vma)) goto out_free_mpol; if (new->vm_file) { get_file(new->vm_file); if (vma->vm_flags & VM_EXECUTABLE) added_exe_file_vma(mm); } //如果定义了open操作 if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); //经过前面的初始化之后,再由vma_adjust调整vma的边界 if (new_below) err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), new); else err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); /* Success. */ if (!err) return 0; /* Clean everything up if vma_adjust failed. */ //如果调整失败,清理工作 if (new->vm_ops && new->vm_ops->close) new->vm_ops->close(new); if (new->vm_file) { if (vma->vm_flags & VM_EXECUTABLE) removed_exe_file_vma(mm); fput(new->vm_file); } unlink_anon_vmas(new); out_free_mpol: mpol_put(pol); out_free_vma: kmem_cache_free(vm_area_cachep, new); out_err: return err; }
转入vma_adjust(),它用来完成调整vma的起始边界和结束边界、将新的vma插入到进程的vma链等操作,函数原型为:
/*
 * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
 * is already present in an i_mmap tree without adjusting the tree.
 * The following helper function should be used when such adjustments
 * are necessary.  The "insert" vma (if any) is to be inserted
 * before we drop the necessary locks.
 *
 * Only the prototype is quoted here; the function body is not shown.
 */
int vma_adjust(struct vm_area_struct *vma, unsigned long start,
	unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
第二个要分析的函数是:detach_vmas_to_be_unmapped()
它主要是将要删除的vma链到一起,同时将要删除的vma从mm中脱链
//参数说明:
/*
Mm: 进程的内存描述符
Vma:要删除的起始vma
Prev:vma的前一个vma区
End:结束地址
*/
/* * Create a list of vma's touched by the unmap, removing them from the mm's * vma list as we go.. */ static void detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long end) { struct vm_area_struct **insertion_point; struct vm_area_struct *tail_vma = NULL; unsigned long addr; insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; do { //从红黑树中释放掉vma rb_erase(&vma->vm_rb, &mm->mm_rb); //更新vma计数 mm->map_count--; tail_vma = vma; vma = vma->vm_next; } while (vma && vma->vm_start < end); //将要删除的vma从链表中脱落 *insertion_point = vma; if (vma) vma->vm_prev = prev; //最后元素后向指针置NULL tail_vma->vm_next = NULL; if (mm->unmap_area == arch_unmap_area) addr = prev ? prev->vm_end : mm->mmap_base; else addr = vma ? vma->vm_start : mm->mmap_base; mm->unmap_area(mm, addr); //由于进行了删除操作。mmap_cache失效了,置NULL mm->mmap_cache = NULL; /* Kill the cache. */ }
接下来要分析的调用函数是remove_vma_list()
它主要对删除的vma链进行处理。具体代码如下所示:
//参数说明:
//mm:进程的内存描述符
//vma:要删除的链表的头节点
/* * Ok - we have the memory areas we should free on the vma list, * so release them, and do the vma updates. * * Called with the mm semaphore held. */ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) { /* Update high watermark before we lower total_vm */ //更新mm的total_vm update_hiwater_vm(mm); do { long nrpages = vma_pages(vma); mm->total_vm -= nrpages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); vma = remove_vma(vma); } while (vma); validate_mm(mm); }
update_hiwater_vm() 函数定义在mm.h中,其定义为:
/*
 * Raise mm->hiwater_vm to mm->total_vm when the current total exceeds the
 * recorded high watermark; otherwise leave it untouched.
 */
static inline void update_hiwater_vm(struct mm_struct *mm)
{
	if (mm->total_vm <= mm->hiwater_vm)
		return;

	mm->hiwater_vm = mm->total_vm;
}
vma_pages()函数用来计算一个vma所覆盖的页数,即(vm_end - vm_start)右移PAGE_SHIFT位:
/*
 * Return the size of @vma in pages: the byte span from vm_start to vm_end
 * shifted right by PAGE_SHIFT.
 */
static inline unsigned long vma_pages(struct vm_area_struct *vma)
{
	unsigned long span = vma->vm_end - vma->vm_start;

	return span >> PAGE_SHIFT;
}
unmap_region是整个收缩过程中的核心,它主要完成相应页表项的修改,以及所映射页框的释放。
代码如下:
/* * Get rid of page table information in the indicated region. * * Called with the mm semaphore held. */ static void unmap_region(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, unsigned long start, unsigned long end) { struct vm_area_struct *next = prev? prev->vm_next: mm->mmap; struct mmu_gather *tlb; unsigned long nr_accounted = 0; lru_add_drain(); tlb = tlb_gather_mmu(mm, 0); update_hiwater_rss(mm); //断开具体的vma映射 unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); //因为删除了一些映射,会造成一个页表空闲的情况,回收页表项所占的空间 free_pgtables(tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS, next? next->vm_start: 0); tlb_finish_mmu(tlb, start, end); }
unmap_vmas用来释放pte所映射的页面。代码如下:
//参数说明: //mm:进程描述符 vma:要删除的起始vma start_addr:要删除的线性区的起始地址 // end_addr:要删除的线性区的结束地址 details:在调用的时候置为了NULL /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlbp: address of the caller's struct mmu_gather * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here * @details: details of nonlinear truncation or shared cache invalidation * * Returns the end address of the unmapping (restart addr if interrupted). * * Unmap all pages in the vma list. * * We aim to not hold locks for too long (for scheduling latency reasons). * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to * return the ending mmu_gather to the caller. * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. * * unmap_vmas() assumes that the caller will flush the whole unmapped address * range after unmap_vmas() returns. So the only responsibility here is to * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ unsigned long unmap_vmas(struct mmu_gather **tlbp, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) { long zap_work = ZAP_BLOCK_SIZE; unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? 
details->i_mmap_lock: NULL; int fullmm = (*tlbp)->fullmm; struct mm_struct *mm = vma->vm_mm; mmu_notifier_invalidate_range_start(mm, start_addr, end_addr); //遍历要删除的vma链表 for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { unsigned long end; //确定要断开映射的起始地址跟结束地址 start = max(vma->vm_start, start_addr); if (start >= vma->vm_end) continue; end = min(vma->vm_end, end_addr); if (end <= vma->vm_start) continue; if (vma->vm_flags & VM_ACCOUNT) *nr_accounted += (end - start) >> PAGE_SHIFT; if (unlikely(is_pfn_mapping(vma))) untrack_pfn_vma(vma, 0, 0); //while循环开始断开start到end的所有被映射的页框,在足够的情况下一次释放zap_bytes while (start != end) { if (!tlb_start_valid) { tlb_start = start; tlb_start_valid = 1; } //在条件编译下is_vm_hugetlb_page()为空 if (unlikely(is_vm_hugetlb_page(vma))) { /* * It is undesirable to test vma->vm_file as it * should be non-null for valid hugetlb area. * However, vm_file will be NULL in the error * cleanup path of do_mmap_pgoff. When * hugetlbfs ->mmap method fails, * do_mmap_pgoff() nullifies vma->vm_file * before calling this function to clean up. * Since no pte has actually been setup, it is * safe to do nothing in this case. */ if (vma->vm_file) { unmap_hugepage_range(vma, start, end, NULL); zap_work -= (end - start) / pages_per_huge_page(hstate_vma(vma)); } start = end; } else start = unmap_page_range(*tlbp, vma, start, end, &zap_work, details); if (zap_work > 0) { BUG_ON(start != end); break; } tlb_finish_mmu(*tlbp, tlb_start, start); if (need_resched() || (i_mmap_lock && spin_needbreak(i_mmap_lock))) { if (i_mmap_lock) { *tlbp = NULL; goto out; } cond_resched(); } *tlbp = tlb_gather_mmu(vma->vm_mm, fullmm); tlb_start_valid = 0; zap_work = ZAP_BLOCK_SIZE; } } out: mmu_notifier_invalidate_range_end(mm, start_addr, end_addr); return start; /* which is now the end (or restart) address */ }
跟进unmap_page_range():
static unsigned long unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, long *zap_work, struct zap_details *details) { pgd_t *pgd; unsigned long next; if (details && !details->check_mapping && !details->nonlinear_vma) details = NULL; BUG_ON(addr >= end); mem_cgroup_uncharge_start(); tlb_start_vma(tlb, vma); //取得页目录 pgd = pgd_offset(vma->vm_mm, addr); //断开pgd项对应的pmd do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { (*zap_work)--; continue; } next = zap_pud_range(tlb, vma, pgd, addr, next, zap_work, details); } while (pgd++, addr = next, (addr != end && *zap_work > 0)); tlb_end_vma(tlb, vma); mem_cgroup_uncharge_end(); return addr; }
4.1.1.4.1.1.1 zap_pud_range
跟进zap_pud_range():
/*
 * Walk the pud entries under @pgd that cover [@addr, @end) and pass each
 * usable one to zap_pmd_range(). An entry rejected by
 * pud_none_or_clear_bad() only costs one unit of *@zap_work. The walk ends
 * at @end or once *@zap_work is no longer positive.
 *
 * Returns the address at which the walk stopped.
 */
static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pgd_t *pgd,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	pud_t *entry = pud_offset(pgd, addr);
	unsigned long boundary;

	do {
		boundary = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(entry))
			(*zap_work)--;
		else
			boundary = zap_pmd_range(tlb, vma, entry, addr,
						 boundary, zap_work, details);
		entry++;
		addr = boundary;
	} while (addr != end && *zap_work > 0);

	return addr;
}
4.1.1.4.1.1.1.1 zap_pmd_range
转入zap_pmd_range():
/*
 * Walk the pmd entries under @pud that cover [@addr, @end) and pass each
 * usable one to zap_pte_range(). An entry rejected by
 * pmd_none_or_clear_bad() only costs one unit of *@zap_work. The walk ends
 * at @end or once *@zap_work is no longer positive.
 *
 * Returns the address at which the walk stopped.
 */
static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pud_t *pud,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	pmd_t *entry = pmd_offset(pud, addr);
	unsigned long boundary;

	do {
		boundary = pmd_addr_end(addr, end);
		if (pmd_none_or_clear_bad(entry))
			(*zap_work)--;
		else
			boundary = zap_pte_range(tlb, vma, entry, addr,
						 boundary, zap_work, details);
		entry++;
		addr = boundary;
	} while (addr != end && *zap_work > 0);

	return addr;
}
4.1.1.4.1.1.1.1.1 zap_pte_range
继续跟进zap_pte_range():
/*
 * Clear the page table entries under @pmd that cover [@addr, @end),
 * releasing the pages and swap entries they refer to.
 *
 * @tlb:      mmu_gather collecting the pages and TLB entries to release
 * @vma:      vma being unmapped
 * @pmd:      pmd entry whose pte table is walked
 * @addr:     start address
 * @end:      end address
 * @zap_work: remaining work budget; reduced by 1 for an empty pte and by
 *            PAGE_SIZE for any other; the walk stops once it is not positive
 * @details:  optional restrictions on which entries are removed, or NULL
 *
 * Returns the address at which the walk stopped.
 */
static unsigned long zap_pte_range(struct mmu_gather *tlb,
				struct vm_area_struct *vma, pmd_t *pmd,
				unsigned long addr, unsigned long end,
				long *zap_work, struct zap_details *details)
{
	struct mm_struct *mm = tlb->mm;
	pte_t *pte;
	spinlock_t *ptl;
	int rss[NR_MM_COUNTERS];

	init_rss_vec(rss);

	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
	arch_enter_lazy_mmu_mode();
	do {
		pte_t ptent = *pte;
		/* This pte maps nothing. */
		if (pte_none(ptent)) {
			(*zap_work)--;
			continue;
		}

		(*zap_work) -= PAGE_SIZE;

		/* The page is present in memory. */
		if (pte_present(ptent)) {
			struct page *page;

			page = vm_normal_page(vma, addr, ptent);
			if (unlikely(details) && page) {
				/*
				 * unmap_shared_mapping_pages() wants to
				 * invalidate cache without truncating:
				 * unmap shared but keep private pages.
				 */
				if (details->check_mapping &&
				    details->check_mapping != page->mapping)
					continue;
				/*
				 * Each page->index must be checked when
				 * invalidating or truncating nonlinear.
				 */
				if (details->nonlinear_vma &&
				    (page->index < details->first_index ||
				     page->index > details->last_index))
					continue;
			}
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			tlb_remove_tlb_entry(tlb, pte, addr);
			if (unlikely(!page))
				continue;
			if (unlikely(details) && details->nonlinear_vma
			    && linear_page_index(details->nonlinear_vma,
						addr) != page->index)
				set_pte_at(mm, addr, pte,
					   pgoff_to_pte(page->index));
			if (PageAnon(page))
				rss[MM_ANONPAGES]--;
			else {
				if (pte_dirty(ptent))
					set_page_dirty(page);
				if (pte_young(ptent) &&
				    likely(!VM_SequentialReadHint(vma)))
					mark_page_accessed(page);
				rss[MM_FILEPAGES]--;
			}
			page_remove_rmap(page);
			if (unlikely(page_mapcount(page) < 0))
				print_bad_pte(vma, addr, ptent, page);
			tlb_remove_page(tlb, page);
			continue;
		}
		/*
		 * If details->check_mapping, we leave swap entries;
		 * if details->nonlinear_vma, we leave file entries.
		 */
		if (unlikely(details))
			continue;
		if (pte_file(ptent)) {
			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
				print_bad_pte(vma, addr, ptent, NULL);
		} else {
			swp_entry_t entry = pte_to_swp_entry(ptent);

			if (!non_swap_entry(entry))
				rss[MM_SWAPENTS]--;
			if (unlikely(!free_swap_and_cache(entry)))
				print_bad_pte(vma, addr, ptent, NULL);
		}
		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
	} while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));

	add_mm_rss_vec(mm, rss);
	arch_leave_lazy_mmu_mode();
	/* pte was advanced past the last entry examined, hence pte - 1. */
	pte_unmap_unlock(pte - 1, ptl);

	return addr;
}
通过上面的分析可以看到,内核是如何通过线性地址从pgd找到pte再释放相关页面的。注意,到这一步只是释放了pte所映射的页框,因此可能会留下很多不再映射任何页面的pte项,这部分pte所占的空间其实是可以回收的。回收工作是在free_pgtables()函数中完成的。代码如下:
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling) { while (vma) { struct vm_area_struct *next = vma->vm_next; unsigned long addr = vma->vm_start;// 把虚拟区的起始地址赋给addr /* * Hide vma from rmap and truncate_pagecache before freeing * pgtables */ unlink_anon_vmas(vma); unlink_file_vma(vma); if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } else { /* * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; unlink_anon_vmas(vma); unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } vma = next; } }
prev指向的是什么?
调用这个函数的时候,prev指向的是什么区域的vma呢?
刚开始的时候:
detach_vmas_to_be_unmapped后:
从上面可以看出:在free_pgtables()(旧版本内核中对应的函数为clear_page_tables)中,要操作的线性地址范围即为prev与prev->vm_next之间的空洞,也就是unmap_region()传入的floor(prev->vm_end)到ceiling(next->vm_start)这一段。