1: Page fault handling
A call to malloc() only creates virtual address space; no physical memory is allocated behind it. When the process later touches virtual memory that has no mapping established, a page fault is raised. The handler has many cases to consider: anonymous pages, KSM pages, page-cache pages, copy-on-write, private mappings, and shared mappings.
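This lazy behavior can be seen from user space. A small illustrative program of my own (not from the kernel sources; it assumes a Linux system where a large malloc() is served by mmap()) counts the minor faults taken when the pages are first touched:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>

static long minor_faults(void)
{
    struct rusage ru;
    getrusage(RUSAGE_SELF, &ru);
    return ru.ru_minflt;
}

int main(void)
{
    size_t len = 64 * 1024 * 1024;
    char *buf = malloc(len);        /* only virtual address space so far */
    long before, after;

    if (!buf)
        return 1;
    before = minor_faults();
    memset(buf, 1, len);            /* first touch: one fault per page */
    after = minor_faults();
    printf("minor faults while touching 64MB: %ld\n", after - before);
    free(buf);
    return 0;
}

With 4KB pages the difference is roughly 16384, one fault per page, confirming that malloc() itself mapped nothing.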
Page fault handling depends on the processor architecture, so the low-level fault path lives in the architecture-specific part of the kernel source.
__vectors_start is the entry point for exception handling; for a data-abort page fault on ARM the path is:
__vectors_start-->vector_dabt-->__dabt_usr/__dabt_svc-->dabt_helper-->v7_early_abort-->do_DataAbort-->fsr_info-->do_translation_fault/do_page_fault/do_sect_fault。
do_page_fault() is the core page fault function. It delegates the main work to __do_page_fault() and then performs error handling via __do_kernel_fault() and __do_user_fault(). __do_page_fault() looks up the vma covering the fault address and hands the real work to handle_mm_fault(), whose core in turn is handle_pte_fault().
handle_pte_fault() dispatches on whether the page is present and on the pte contents, into several cases: do_fault() (file-mapping fault), do_anonymous_page() (anonymous-page fault), do_swap_page() (page swapped out), and do_wp_page() (copy-on-write).
static int __kprobes
do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
    struct task_struct *tsk;
    struct mm_struct *mm;
    int fault, sig, code;
    unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;

    if (notify_page_fault(regs, fsr))
        return 0;

    tsk = current;      /* task_struct of the current process */
    mm  = tsk->mm;      /* its memory descriptor (mm_struct) */

    /* Enable interrupts if they were enabled in the parent context. */
    if (interrupts_enabled(regs))
        local_irq_enable();

    /*
     * If we're in an interrupt or have no user
     * context, we must not take the fault..
     *
     * in_atomic() is true in interrupt context or with preemption
     * disabled; a NULL mm means this is a kernel thread. Either way,
     * bail out to no_context.
     */
    if (in_atomic() || !mm)
        goto no_context;

    if (user_mode(regs))
        flags |= FAULT_FLAG_USER;
    if (fsr & FSR_WRITE)
        flags |= FAULT_FLAG_WRITE;

    /*
     * As per x86, we may deadlock here. However, since the kernel only
     * validly references user space from well defined areas of the code,
     * we can bug out early if this is from code which shouldn't.
     */
    if (!down_read_trylock(&mm->mmap_sem)) {
        /*
         * Fault in kernel space and the address is not covered by an
         * exception-table fixup: go to no_context.
         */
        if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc))
            goto no_context;
retry:
        down_read(&mm->mmap_sem);   /* user space: sleep until the lock holder releases it */
    } else {
        /*
         * The above down_read_trylock() might have succeeded in
         * which case, we'll have missed the might_sleep() from
         * down_read()
         */
        might_sleep();
#ifdef CONFIG_DEBUG_VM
        if (!user_mode(regs) && !search_exception_tables(regs->ARM_pc))
            goto no_context;
#endif
    }

    /*
     * __do_page_fault() looks up the vma covering the faulting addr;
     * if find_vma() finds no vma, addr is not yet part of the process
     * address space.
     */
    fault = __do_page_fault(mm, addr, fsr, flags, tsk);

    /* If we need to retry but a fatal signal is pending, handle the
     * signal first. We do not need to release the mmap_sem because
     * it would already be released in __lock_page_or_retry in
     * mm/filemap.c. */
    if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
        return 0;

    /*
     * Major/minor page fault accounting is only done on the
     * initial attempt. If we go through a retry, it is extremely
     * likely that the page will be found in page cache at that point.
     */
    perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);
    if (!(fault & VM_FAULT_ERROR) && flags & FAULT_FLAG_ALLOW_RETRY) {
        if (fault & VM_FAULT_MAJOR) {
            tsk->maj_flt++;
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, addr);
        } else {
            tsk->min_flt++;
            perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, addr);
        }
        if (fault & VM_FAULT_RETRY) {
            /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
             * of starvation. */
            flags &= ~FAULT_FLAG_ALLOW_RETRY;
            flags |= FAULT_FLAG_TRIED;
            goto retry;
        }
    }

    up_read(&mm->mmap_sem);

    /*
     * Handle the "normal" case first - VM_FAULT_MAJOR / VM_FAULT_MINOR.
     * No error bits set: the fault has been handled successfully.
     */
    if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP | VM_FAULT_BADACCESS))))
        return 0;

    /*
     * If we are in kernel mode at this point, we
     * have no context to handle this fault with.
     * user_mode() inspects the CPSR mode bits (M[4:0]); if the low
     * four bits are 0, the fault happened in user mode.
     */
    if (!user_mode(regs))
        goto no_context;    /* kernel-space error handling */

    if (fault & VM_FAULT_OOM) {
        /*
         * We ran out of memory, call the OOM killer, and return to
         * userspace (which will retry the fault, or kill us if we
         * got oom-killed)
         */
        pagefault_out_of_memory();  /* invoke the OOM handling, then return */
        return 0;
    }

    if (fault & VM_FAULT_SIGBUS) {
        /*
         * We had some memory, but were unable to
         * successfully fix up this page fault.
         */
        sig = SIGBUS;
        code = BUS_ADRERR;
    } else {
        /*
         * Something tried to access memory that
         * isn't in our memory map..
         */
        sig = SIGSEGV;
        code = fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR;
    }

    /* User-mode error handling: deliver SIGBUS/SIGSEGV to the process. */
    __do_user_fault(tsk, addr, fsr, sig, code, regs);
    return 0;

no_context:
    /* The fault happened in kernel mode; if it cannot be fixed up, oops. */
    __do_kernel_fault(mm, addr, fsr, regs);
    return 0;
}
do_page_fault->handle_mm_fault->__handle_mm_fault->handle_pte_fault
static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 unsigned long address, unsigned int flags)
{
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    pte_t *pte;

    if (unlikely(is_vm_hugetlb_page(vma)))
        return hugetlb_fault(mm, vma, address, flags);

    pgd = pgd_offset(mm, address);      /* PGD entry for address in this process's page table */
    pud = pud_alloc(mm, pgd, address);  /* find (or allocate) the PUD entry for address */
    if (!pud)
        return VM_FAULT_OOM;
    pmd = pmd_alloc(mm, pud, address);  /* find (or allocate) the PMD entry for address */
    if (!pmd)
        return VM_FAULT_OOM;
    if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) {
        int ret = VM_FAULT_FALLBACK;
        if (!vma->vm_ops)
            ret = do_huge_pmd_anonymous_page(mm, vma, address,
                    pmd, flags);
        if (!(ret & VM_FAULT_FALLBACK))
            return ret;
    } else {
        pmd_t orig_pmd = *pmd;
        int ret;

        barrier();
        if (pmd_trans_huge(orig_pmd)) {
            unsigned int dirty = flags & FAULT_FLAG_WRITE;

            /*
             * If the pmd is splitting, return and retry the
             * the fault. Alternative: wait until the split
             * is done, and goto retry.
             */
            if (pmd_trans_splitting(orig_pmd))
                return 0;

            if (pmd_protnone(orig_pmd))
                return do_huge_pmd_numa_page(mm, vma, address,
                                 orig_pmd, pmd);

            if (dirty && !pmd_write(orig_pmd)) {
                ret = do_huge_pmd_wp_page(mm, vma, address, pmd,
                              orig_pmd);
                if (!(ret & VM_FAULT_FALLBACK))
                    return ret;
            } else {
                huge_pmd_set_accessed(mm, vma, address, pmd,
                              orig_pmd, dirty);
                return 0;
            }
        }
    }

    if (unlikely(pmd_none(*pmd)) &&
        unlikely(__pte_alloc(mm, vma, pmd, address)))
        return VM_FAULT_OOM;
    /* if an huge pmd materialized from under us just retry later */
    if (unlikely(pmd_trans_huge(*pmd)))
        return 0;

    pte = pte_offset_map(pmd, address); /* pte pointer for address within this pmd */

    return handle_pte_fault(mm, vma, address, pte, pmd, flags);
}
1.2: Distinguishing the types of page fault
static int handle_pte_fault(struct mm_struct *mm,
             struct vm_area_struct *vma, unsigned long address,
             pte_t *pte, pmd_t *pmd, unsigned int flags)
{
    pte_t entry;
    spinlock_t *ptl;

    /*
     * some architectures can have larger ptes than wordsize,
     * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
     * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
     * The code below just needs a consistent view for the ifs and
     * we later double check anyway with the ptl lock held. So here
     * a barrier will do.
     */
    entry = *pte;
    barrier();
    if (!pte_present(entry)) {      /* L_PTE_PRESENT clear: no physical page behind this pte */
        if (pte_none(entry)) {      /* pte empty and no physical page: a brand-new fault */
            if (vma->vm_ops) {
                /* vm_ops->fault exists: a file-mapping fault */
                if (likely(vma->vm_ops->fault))
                    return do_fault(mm, vma, address, pte,
                            pmd, flags, entry);
            }
            /* no vm_ops->fault: an anonymous-page fault */
            return do_anonymous_page(mm, vma, address,
                         pte, pmd, flags);
        }
        /*
         * Physical page not present but the pte is non-empty:
         * the page has been swapped out to the swap area.
         */
        return do_swap_page(mm, vma, address,
                    pte, pmd, flags, entry);
    }

    /* From here on, the physical page is present. */

    if (pte_protnone(entry))
        return do_numa_page(mm, vma, address, entry, pte, pmd);

    ptl = pte_lockptr(mm, pmd);
    spin_lock(ptl);
    if (unlikely(!pte_same(*pte, entry)))
        goto unlock;
    if (flags & FAULT_FLAG_WRITE) {
        /* a write fault on a read-only page triggers copy-on-write */
        if (!pte_write(entry))
            return do_wp_page(mm, vma, address,
                    pte, pmd, ptl, entry);
        entry = pte_mkdirty(entry);
    }
    entry = pte_mkyoung(entry);
    if (ptep_set_access_flags(vma, address, pte, entry,
                  flags & FAULT_FLAG_WRITE)) {
        /* the pte changed: write the new value back and flush TLB/cache */
        update_mmu_cache(vma, address, pte);
    } else {
        if (flags & FAULT_FLAG_WRITE)
            flush_tlb_fix_spurious_fault(vma, address);
    }
unlock:
    pte_unmap_unlock(pte, ptl);
    return 0;
}
File-mapping fault: handler do_fault(); the vma has vm_ops (fault operations set), the pte is empty, and the page is not in memory (the pte maps no physical page).
Anonymous fault: handler do_anonymous_page(); no vm_ops, the pte is empty, and the page is not in memory.
Page swapped out to the swap area: handler do_swap_page(); the pte has contents, but the page is not in memory.
Copy-on-write: handler do_wp_page(); the page is in memory. (The demo below exercises three of these paths.)
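For reference, a small user-space demo (illustrative, not from the original text; /etc/hostname is an arbitrary, assumed non-empty file, and 4KB pages are assumed):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    /* 1. Reading an untouched anonymous page: do_anonymous_page()
     *    maps the global zero page read-only. */
    char *anon = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
              MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    volatile char c;

    if (anon == MAP_FAILED)
        return 1;
    c = anon[0];

    /* 2. First write to that page: the zero-page pte is read-only,
     *    so the write goes through do_wp_page() (copy-on-write). */
    anon[0] = 1;

    /* 3. Touching a file mapping: do_fault() maps a page-cache page. */
    int fd = open("/etc/hostname", O_RDONLY);
    if (fd >= 0) {
        char *file = mmap(NULL, 4096, PROT_READ, MAP_PRIVATE, fd, 0);
        if (file != MAP_FAILED) {
            c = file[0];
            munmap(file, 4096);
        }
        close(fd);
    }
    munmap(anon, 4096);
    (void)c;
    return 0;
}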
1.3: Anonymous page faults
Anonymous pages are the counterpart of file-mapped pages: Linux calls any page that is not backed by a file mapping an anonymous page. The core handler is do_anonymous_page().
handle_pte_fault->do_anonymous_page
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pte_t *page_table, pmd_t *pmd,
        unsigned int flags)
{
    struct mem_cgroup *memcg;
    struct page *page;
    spinlock_t *ptl;
    pte_t entry;

    pte_unmap(page_table);

    /* Check if we need to add a guard page to the stack */
    if (check_stack_guard_page(vma, address) < 0)
        return VM_FAULT_SIGSEGV;

    /*
     * Use the zero-page for reads: a read-only fault is satisfied by
     * mapping the global zeroed page empty_zero_page.
     */
    if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
        entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
                        vma->vm_page_prot));
        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
        if (!pte_none(*page_table))
            goto unlock;
        goto setpte;    /* jump to setpte to install the new PTE entry */
    }

    /* Allocate our own private page. */
    if (unlikely(anon_vma_prepare(vma)))
        goto oom;
    /*
     * Writable page: the allocation mask is __GFP_MOVABLE | __GFP_WAIT |
     * __GFP_IO | __GFP_FS | __GFP_HARDWALL | __GFP_HIGHMEM; this ends
     * up in alloc_pages() and prefers highmem.
     */
    page = alloc_zeroed_user_highpage_movable(vma, address);
    if (!page)
        goto oom;
    /*
     * The memory barrier inside __SetPageUptodate makes sure that
     * preceeding stores to the page contents become visible before
     * the set_pte_at() write.
     */
    __SetPageUptodate(page);

    if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
        goto oom_free_page;

    entry = mk_pte(page, vma->vm_page_prot);
    if (vma->vm_flags & VM_WRITE)
        entry = pte_mkwrite(pte_mkdirty(entry));    /* build the new PTE entry */

    page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
    if (!pte_none(*page_table))
        goto release;

    inc_mm_counter_fast(mm, MM_ANONPAGES);      /* bump the anonymous-page counter MM_ANONPAGES */
    page_add_new_anon_rmap(page, vma, address); /* add the anonymous page to the RMAP system */
    mem_cgroup_commit_charge(page, memcg, false);
    lru_cache_add_active_or_unevictable(page, vma); /* add the anonymous page to the LRU lists */
setpte:
    set_pte_at(mm, address, page_table, entry); /* install entry into the hardware page table */

    /* No need to invalidate - it was non-present before */
    update_mmu_cache(vma, address, page_table);
unlock:
    pte_unmap_unlock(page_table, ptl);
    return 0;
release:
    mem_cgroup_cancel_charge(page, memcg);
    page_cache_release(page);
    goto unlock;
oom_free_page:
    page_cache_release(page);
oom:
    return VM_FAULT_OOM;
}
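The zero-page optimization above can be observed from user space. An illustrative sketch of mine (needs root, since /proc/self/pagemap hides PFNs from unprivileged users on recent kernels; 4KB pages assumed): two anonymous pages that are only read should resolve to the same physical frame.

#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

static uint64_t pfn_of(void *addr)
{
    uint64_t entry = 0;
    int fd = open("/proc/self/pagemap", O_RDONLY);

    pread(fd, &entry, 8, ((uintptr_t)addr / 4096) * 8);
    close(fd);
    return entry & ((1ULL << 55) - 1);  /* bits 0-54 hold the PFN if present */
}

int main(void)
{
    char *a = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    char *b = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    volatile char c = a[0] + b[0];      /* read faults only: zero page mapped */

    printf("pfn(a)=%" PRIu64 " pfn(b)=%" PRIu64 " (equal => zero page)\n",
           pfn_of(a), pfn_of(b));
    (void)c;
    return 0;
}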
1.4: File-mapping page faults
static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pmd_t *pmd,
        pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
    struct page *fault_page;
    spinlock_t *ptl;
    pte_t *pte;
    int ret = 0;

    /*
     * fault_around_bytes defaults to
     * rounddown_pow_of_two(65536), i.e. a 64KB window:
     * static unsigned long fault_around_bytes __read_mostly =
     *        rounddown_pow_of_two(65536);
     */
    if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        /*
         * Map as many pages as possible around the faulting address in
         * advance. Pre-establishing the mappings between the process
         * address space and the page cache reduces the number of future
         * page faults; note that only already-present page-cache pages
         * are mapped here, none are created.
         */
        do_fault_around(vma, address, pte, pgoff, flags);
        if (!pte_same(*pte, orig_pte))
            goto unlock_out;
        pte_unmap_unlock(pte, ptl);
    }

    /* The actual work of creating/reading the page-cache page. */
    ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
    if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
        return ret;

    pte = pte_offset_map_lock(mm, pmd, address, &ptl);
    if (unlikely(!pte_same(*pte, orig_pte))) {
        pte_unmap_unlock(pte, ptl);
        unlock_page(fault_page);
        page_cache_release(fault_page);
        return ret;
    }
    /* Build a new PTE entry and install it into the hardware page table. */
    do_set_pte(vma, address, fault_page, pte, false, false);
    unlock_page(fault_page);
unlock_out:
    pte_unmap_unlock(pte, ptl);
    return ret;
}
do_fault_around(): pre-maps existing page-cache pages around the faulting address; it does not allocate new page cache.
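The effect can be observed from user space with a cached file. An illustrative sketch (assumptions: /bin/ls is at least 64KB, already in the page cache, 4KB pages, and fault-around at its default 64KB; the reported count should then stay close to 1 because the neighbouring pages were pre-mapped by do_fault_around()):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <unistd.h>

static long minflt(void)
{
    struct rusage ru;
    getrusage(RUSAGE_SELF, &ru);
    return ru.ru_minflt;
}

int main(int argc, char **argv)
{
    const char *path = argc > 1 ? argv[1] : "/bin/ls";
    size_t len = 16 * 4096;         /* 16 pages == one 64KB window */
    int fd = open(path, O_RDONLY);
    volatile char sum = 0;
    long before;
    char *p;
    size_t i;

    if (fd < 0)
        return 1;
    p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
    if (p == MAP_FAILED)
        return 1;
    before = minflt();
    for (i = 0; i < len; i += 4096)
        sum += p[i];                /* only the first touch should fault */
    printf("minor faults for 16 pages: %ld\n", minflt() - before);
    munmap(p, len);
    close(fd);
    (void)sum;
    return 0;
}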
do_cow_fault() handles a private file mapping on which copy-on-write (COW) occurs.
static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pmd_t *pmd,
        pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
    struct page *fault_page, *new_page;
    struct mem_cgroup *memcg;
    spinlock_t *ptl;
    pte_t *pte;
    int ret;

    if (unlikely(anon_vma_prepare(vma)))
        return VM_FAULT_OOM;

    /* Allocate a movable page, preferably from highmem. */
    new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
    if (!new_page)
        return VM_FAULT_OOM;

    if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
        page_cache_release(new_page);
        return VM_FAULT_OOM;
    }

    /* Read the file contents into fault_page via vma->vm_ops->fault(). */
    ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
    if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
        goto uncharge_out;

    if (fault_page)
        copy_user_highpage(new_page, fault_page, address, vma); /* copy fault_page's contents into new_page */
    __SetPageUptodate(new_page);

    pte = pte_offset_map_lock(mm, pmd, address, &ptl);
    /*
     * If pte differs from orig_pte, someone modified the pte underneath
     * us: release fault_page and new_page and bail out.
     */
    if (unlikely(!pte_same(*pte, orig_pte))) {
        pte_unmap_unlock(pte, ptl);
        if (fault_page) {
            unlock_page(fault_page);
            page_cache_release(fault_page);
        } else {
            /*
             * The fault handler has no page to lock, so it holds
             * i_mmap_lock for read to protect against truncate.
             */
            i_mmap_unlock_read(vma->vm_file->f_mapping);
        }
        goto uncharge_out;
    }
    do_set_pte(vma, address, new_page, pte, true, true);    /* install the new PTE entry into the page table */
    mem_cgroup_commit_charge(new_page, memcg, false);
    lru_cache_add_active_or_unevictable(new_page, vma);     /* add the new page to the LRU lists */
    pte_unmap_unlock(pte, ptl);
    if (fault_page) {
        unlock_page(fault_page);
        page_cache_release(fault_page);     /* drop the reference on fault_page */
    } else {
        /*
         * The fault handler has no page to lock, so it holds
         * i_mmap_lock for read to protect against truncate.
         */
        i_mmap_unlock_read(vma->vm_file->f_mapping);
    }
    return ret;
uncharge_out:
    mem_cgroup_cancel_charge(new_page, memcg);
    page_cache_release(new_page);
    return ret;
}
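From user space the private-COW semantics look like this (illustrative sketch; assumes /tmp is writable): the write lands in a private copy and never reaches the file.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    char tmpl[] = "/tmp/cowXXXXXX";
    int fd = mkstemp(tmpl);
    char buf[5] = "aaaa";
    char *p;

    if (fd < 0)
        return 1;
    write(fd, buf, 4);
    p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
    if (p == MAP_FAILED)
        return 1;
    p[0] = 'Z';                 /* COW: modifies a private copy only */
    pread(fd, buf, 4, 0);
    printf("mapping: %.4s  file: %.4s\n", p, buf);  /* "Zaaa" vs "aaaa" */
    munmap(p, 4096);
    close(fd);
    unlink(tmpl);
    return 0;
}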
do_shared_fault() handles page faults in shared file mappings.
static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pmd_t *pmd,
        pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
    struct page *fault_page;
    struct address_space *mapping;
    spinlock_t *ptl;
    pte_t *pte;
    int dirtied = 0;
    int ret, tmp;

    /* Read the file contents into fault_page. */
    ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
    if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
        return ret;

    if (vma->vm_ops->page_mkwrite) {
        unlock_page(fault_page);
        /*
         * Notify the address space that fault_page is about to become
         * writable; the process may need to wait for the page's
         * contents to be written back first.
         */
        tmp = do_page_mkwrite(vma, fault_page, address);
        if (unlikely(!tmp ||
                (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
            page_cache_release(fault_page);
            return tmp;
        }
    }

    pte = pte_offset_map_lock(mm, pmd, address, &ptl);
    /*
     * Check whether the hardware pte for the faulting address still
     * matches orig_pte; if not, release fault_page and return.
     */
    if (unlikely(!pte_same(*pte, orig_pte))) {
        pte_unmap_unlock(pte, ptl);
        unlock_page(fault_page);
        page_cache_release(fault_page);
        return ret;
    }
    /* Build a new PTE entry from fault_page and install it into pte. */
    do_set_pte(vma, address, fault_page, pte, true, false);
    pte_unmap_unlock(pte, ptl);

    if (set_page_dirty(fault_page))     /* mark the page dirty */
        dirtied = 1;
    mapping = fault_page->mapping;
    unlock_page(fault_page);
    if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
        /*
         * Each time a page is dirtied, check whether writeback is
         * needed and, if so, write back a batch of pages.
         */
        balance_dirty_pages_ratelimited(mapping);
    }

    if (!vma->vm_ops->page_mkwrite)
        file_update_time(vma->vm_file);

    return ret;
}
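The shared counterpart, for contrast (illustrative sketch; assumes /tmp is writable): with MAP_SHARED the write dirties the page-cache page, and the change is visible in the file after writeback (forced here with msync()).

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
    char tmpl[] = "/tmp/shrXXXXXX";
    int fd = mkstemp(tmpl);
    char buf[5] = "aaaa";
    char *p;

    if (fd < 0)
        return 1;
    write(fd, buf, 4);
    p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (p == MAP_FAILED)
        return 1;
    p[0] = 'Z';                 /* dirties the shared page */
    msync(p, 4096, MS_SYNC);    /* write the dirty page back */
    pread(fd, buf, 4, 0);
    printf("file now reads: %.4s\n", buf);  /* "Zaaa" */
    munmap(p, 4096);
    close(fd);
    unlink(tmpl);
    return 0;
}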
1.5: Copy-on-write
do_wp_page() handles writes to pages whose pte lacks the writable attribute: it allocates a new page and copies the old page's contents into it.
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pte_t *page_table, pmd_t *pmd,
        spinlock_t *ptl, pte_t orig_pte)
    __releases(ptl)
{
    struct page *old_page, *new_page = NULL;
    pte_t entry;
    int ret = 0;
    int page_mkwrite = 0;
    bool dirty_shared = false;
    unsigned long mmun_start = 0;   /* For mmu_notifiers */
    unsigned long mmun_end = 0;     /* For mmu_notifiers */
    struct mem_cgroup *memcg;

    /*
     * Look up the struct page for the faulting address; this returns
     * the page only for a normal mapping.
     */
    old_page = vm_normal_page(vma, address, orig_pte);
    if (!old_page) {
        /* old_page == NULL: this is a special mapping. */
        if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                     (VM_WRITE|VM_SHARED))
            goto reuse;
        goto gotten;
    }

    /* Handle anonymous, non-KSM pages. */
    if (PageAnon(old_page) && !PageKsm(old_page)) {
        if (!trylock_page(old_page)) {
            page_cache_get(old_page);
            pte_unmap_unlock(page_table, ptl);
            lock_page(old_page);
            page_table = pte_offset_map_lock(mm, pmd, address,
                             &ptl);
            if (!pte_same(*page_table, orig_pte)) {
                unlock_page(old_page);
                goto unlock;
            }
            page_cache_release(old_page);
        }
        if (reuse_swap_page(old_page)) {
            page_move_anon_rmap(old_page, vma, address);
            unlock_page(old_page);
            goto reuse;
        }
        unlock_page(old_page);
    /* Everything else: here, a writable shared mapping. */
    } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                    (VM_WRITE|VM_SHARED))) {
        page_cache_get(old_page);
        if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
            int tmp;

            pte_unmap_unlock(page_table, ptl);
            tmp = do_page_mkwrite(vma, old_page, address);
            if (unlikely(!tmp || (tmp &
                    (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) {
                page_cache_release(old_page);
                return tmp;
            }
            page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
            if (!pte_same(*page_table, orig_pte)) {
                unlock_page(old_page);
                goto unlock;
            }
            page_mkwrite = 1;
        }

        dirty_shared = true;

reuse:
        if (old_page)
            page_cpupid_xchg_last(old_page,
                    (1 << LAST_CPUPID_SHIFT) - 1);

        flush_cache_page(vma, address, pte_pfn(orig_pte));
        entry = pte_mkyoung(orig_pte);
        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        if (ptep_set_access_flags(vma, address, page_table, entry, 1))
            update_mmu_cache(vma, address, page_table);
        pte_unmap_unlock(page_table, ptl);
        ret |= VM_FAULT_WRITE;

        if (dirty_shared) {
            struct address_space *mapping;
            int dirtied;

            if (!page_mkwrite)
                lock_page(old_page);

            dirtied = set_page_dirty(old_page);
            VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
            mapping = old_page->mapping;
            unlock_page(old_page);
            page_cache_release(old_page);

            if ((dirtied || page_mkwrite) && mapping) {
                balance_dirty_pages_ratelimited(mapping);
            }

            if (!page_mkwrite)
                file_update_time(vma->vm_file);
        }

        return ret;
    }

    page_cache_get(old_page);

    /* A new page must be allocated: this is the actual copy-on-write. */
gotten:
    pte_unmap_unlock(page_table, ptl);

    if (unlikely(anon_vma_prepare(vma)))
        goto oom;

    if (is_zero_pfn(pte_pfn(orig_pte))) {
        /* allocate a zeroed, movable, preferably highmem page */
        new_page = alloc_zeroed_user_highpage_movable(vma, address);
        if (!new_page)
            goto oom;
    } else {
        /* allocate a movable, preferably highmem page */
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
        if (!new_page)
            goto oom;
        cow_user_page(new_page, old_page, address, vma);
    }
    __SetPageUptodate(new_page);

    if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
        goto oom_free_new;

    mmun_start  = address & PAGE_MASK;
    mmun_end    = mmun_start + PAGE_SIZE;
    mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

    page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
    if (likely(pte_same(*page_table, orig_pte))) {
        if (old_page) {
            if (!PageAnon(old_page)) {
                dec_mm_counter_fast(mm, MM_FILEPAGES);
                inc_mm_counter_fast(mm, MM_ANONPAGES);
            }
        } else
            inc_mm_counter_fast(mm, MM_ANONPAGES);
        flush_cache_page(vma, address, pte_pfn(orig_pte));
        entry = mk_pte(new_page, vma->vm_page_prot);    /* build a PTE entry from new_page and vma */
        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        ptep_clear_flush_notify(vma, address, page_table);
        /* add new_page to the RMAP reverse-mapping system; _mapcount becomes 0 */
        page_add_new_anon_rmap(new_page, vma, address);
        mem_cgroup_commit_charge(new_page, memcg, false);
        /* add new_page to the active LRU list */
        lru_cache_add_active_or_unevictable(new_page, vma);
        set_pte_at_notify(mm, address, page_table, entry);
        update_mmu_cache(vma, address, page_table);
        if (old_page) {
            page_remove_rmap(old_page); /* decrement old_page's _mapcount */
        }

        /* Free the old page.. */
        new_page = old_page;
        ret |= VM_FAULT_WRITE;
    } else
        mem_cgroup_cancel_charge(new_page, memcg);

    if (new_page)
        page_cache_release(new_page);   /* drop the reference; here new_page == old_page */
unlock:
    pte_unmap_unlock(page_table, ptl);
    if (mmun_end > mmun_start)
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
    if (old_page) {
        /*
         * Don't let another task, with possibly unlocked vma,
         * keep the mlocked page.
         */
        if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
            lock_page(old_page);    /* LRU manipulation */
            munlock_vma_page(old_page);
            unlock_page(old_page);
        }
        page_cache_release(old_page);
    }
    return ret;
oom_free_new:
    page_cache_release(new_page);
oom:
    if (old_page)
        page_cache_release(old_page);
    return VM_FAULT_OOM;
}
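The classic way to watch do_wp_page() at work is fork() (illustrative sketch, not from the original text): after fork(), parent and child share the heap page read-only, and the child's first write triggers the copy, so the parent never sees the change.

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    int *val = malloc(sizeof(*val));

    if (!val)
        return 1;
    *val = 42;          /* fault the page in; shared read-only after fork */
    if (fork() == 0) {
        *val = 7;       /* write fault -> do_wp_page() copies the page */
        _exit(0);
    }
    wait(NULL);
    printf("parent still sees: %d\n", *val);    /* 42 */
    free(val);
    return 0;
}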
2: Page reference counting
Memory management mostly revolves around pages: every physical page is described by a struct page structure.
/*
 * ... who is mapping it: we cannot tell which process is using a page,
 * but via the RMAP structures we can tell who has it mapped.
 */
struct page {
    /* First double word block */
    unsigned long flags;        /* Atomic flags, some possibly
                                 * updated asynchronously */
    union {
        /*
         * The address space the page belongs to. The low two bits
         * distinguish the mapping type: bit 0 marks an anonymous
         * page, bit 1 a KSM page.
         */
        struct address_space *mapping;
        void *s_mem;            /* slab first object: start address of the
                                 * first object in a slab; shares storage
                                 * with mapping */
    };

    /* Second double word */
    struct {
        union {
            pgoff_t index;      /* Our offset within mapping. */
            void *freelist;     /* sl[aou]b first free object */
            bool pfmemalloc;    /* If set by the page allocator, ... */
        };
        union {
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
    defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
            /* Used for cmpxchg_double in slub */
            unsigned long counters;
#else
            unsigned counters;
#endif
            struct {
                union {
                    atomic_t _mapcount;
                    struct {    /* SLUB */
                        unsigned inuse:16;
                        unsigned objects:15;
                        unsigned frozen:1;
                    };
                    int units;  /* SLOB */
                };
                atomic_t _count;    /* Usage count, see below. */
            };
            unsigned int active;    /* SLAB */
        };
    };
    ...
};
enum pageflags {
    PG_locked,          /* Page is locked. Don't touch. While set, other parts
                         * of memory management must stay away from the page
                         * to avoid races. */
    PG_error,           /* An error occurred while operating on the page. */
    PG_referenced,      /* Tracks page activity; used by kswapd page reclaim. */
    PG_uptodate,        /* The page's data has been read successfully from the
                         * block device. */
    PG_dirty,           /* The page's contents changed and have not yet been
                         * synchronized with backing storage. */
    PG_lru,             /* The page is on an LRU list; the kernel uses the LRU
                         * lists to manage active and inactive pages. */
    PG_active,          /* Tracks page activity; used by kswapd page reclaim. */
    PG_slab,            /* Used by the slab allocator. */
    PG_owner_priv_1,    /* Owner use. If pagecache, fs may use */
    PG_arch_1,          /* Architecture-specific page state bit. */
    PG_reserved,        /* The page must not be swapped out. */
    PG_private,         /* If pagecache, has fs-private data */
    PG_private_2,       /* If pagecache, has fs aux data */
    PG_writeback,       /* Page is under writeback to the block device */
#ifdef CONFIG_PAGEFLAGS_EXTENDED
    PG_head,            /* A head page */
    PG_tail,            /* A tail page */
#else
    PG_compound,        /* A compound page */
#endif
    PG_swapcache,       /* Swap page: swp_entry_t in private; the page is in
                         * the swap cache. */
    PG_mappedtodisk,    /* Has blocks allocated on-disk */
    PG_reclaim,         /* To be reclaimed asap */
    PG_swapbacked,      /* Page is backed by RAM/swap; usually only anonymous
                         * pages can be written to the swap area. */
    PG_unevictable,     /* Page is "unevictable": it cannot be reclaimed. */
#ifdef CONFIG_MMU
    PG_mlocked,         /* Page is vma mlocked: the owning VMA is locked. */
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
    PG_uncached,        /* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
    PG_hwpoison,        /* hardware poisoned page. Don't touch */
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
    PG_compound_lock,
#endif
    __NR_PAGEFLAGS,

    /* Filesystems */
    PG_fscache = PG_private_2,  /* page backed by cache */

    /* XEN */
    /* Pinned in Xen as a read-only pagetable page. */
    PG_pinned = PG_owner_priv_1,
    /* Pinned as part of domain save (see xen_mm_pin_all()). */
    PG_savepinned = PG_dirty,
    /* Has a grant mapping of another (foreign) domain's page. */
    PG_foreign = PG_owner_priv_1,

    /* SLOB */
    PG_slob_free = PG_private,
};
2.1: _count is the number of references the kernel holds on the page.
_count == 0: the page is free or about to be freed.
_count > 0: the page has been allocated and is in use by the kernel; it will not be freed for now.
The kernel APIs that manipulate the _count reference count are get_page() and put_page().
(1) When a page is allocated, its _count becomes 1.
On a page freshly returned by alloc_pages() the _count is still 0; set_page_refcounted() then sets it to 1.
(2) When a page is added to the LRU lists it becomes visible to the kswapd kernel thread, so its _count is incremented.
Take malloc() as an example: after the page fault, do_anonymous_page() allocates a page and, before the hardware PTE is set, calls lru_cache_add() to put the anonymous page on the LRU lists; along that path the page_cache_get() macro increments _count.
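How the API moves the count, as a kernel-context sketch (illustrative module-style code, to be read under the above kernel version; the function name refcount_demo is mine):

#include <linux/gfp.h>
#include <linux/mm.h>

static void refcount_demo(void)
{
    struct page *page = alloc_page(GFP_KERNEL);  /* _count set to 1 */

    if (!page)
        return;
    get_page(page);     /* take an extra reference: _count == 2 */
    pr_info("page count: %d\n", page_count(page));
    put_page(page);     /* drop it: _count == 1 */
    put_page(page);     /* last reference dropped: the page is freed */
}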
2.2: _mapcount counts how many processes map this page, i.e. how many user page-table entries (ptes) point to it.
Every user process has its own page tables, so several address spaces can map the same physical page at once; the RMAP reverse-mapping system is built on exactly this property.
_mapcount is used mainly by the RMAP reverse-mapping system.
_mapcount == -1: no pte maps the page.
_mapcount == 0: only the parent process maps the page.
A freshly allocated anonymous page has _mapcount initialized to 0.
_mapcount > 0: processes other than the parent also map the page.
For example, when the parent's pte is copied into a child (as fork() does), the page's _mapcount is incremented.
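The map count can be watched via /proc/kpagecount, which reports, indexed by PFN, how many times a page is mapped. An illustrative sketch of mine (needs root; assumes 4KB pages; the sleep() is crude synchronization with the child):

#include <fcntl.h>
#include <inttypes.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

static uint64_t read_u64(const char *path, uint64_t index)
{
    uint64_t v = 0;
    int fd = open(path, O_RDONLY);

    pread(fd, &v, 8, index * 8);
    close(fd);
    return v;
}

int main(void)
{
    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
               MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    uint64_t pfn;
    pid_t pid;

    if (p == MAP_FAILED)
        return 1;
    p[0] = 1;           /* fault the page in */
    pfn = read_u64("/proc/self/pagemap",
               (uintptr_t)p / 4096) & ((1ULL << 55) - 1);
    printf("mapcount before fork: %" PRIu64 "\n",
           read_u64("/proc/kpagecount", pfn));
    pid = fork();
    if (pid == 0) {
        pause();        /* keep the child's mapping alive */
        _exit(0);
    }
    sleep(1);           /* let the child's page tables exist */
    printf("mapcount after fork:  %" PRIu64 "\n",
           read_u64("/proc/kpagecount", pfn));
    kill(pid, SIGKILL);
    wait(NULL);
    return 0;
}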
2.3: The page lock, PG_locked
PG_locked implements the per-page lock; two functions acquire it: lock_page() and trylock_page().
lock_page() acquires the page lock, sleeping if another process currently holds it.
trylock_page() checks PG_locked as well but never waits: if PG_locked is already set it returns false, meaning another process holds the lock; true means the lock was acquired.
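For reference, lock_page() in kernels of this era is essentially a trylock_page() fast path plus a sleeping slow path (from include/linux/pagemap.h; kernel context, not standalone code):

static inline void lock_page(struct page *page)
{
    might_sleep();
    if (!trylock_page(page))
        __lock_page(page);  /* sleep until PG_locked is released */
}

So callers who cannot sleep use trylock_page() directly and back off on failure, while everyone else calls lock_page() and pays the sleep only under contention.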