在pte_handle_fault()中,如果触发异常的页存在于主存中,那么该异常往往是由写了一个只读页触发的,此时需要进行COW(写时复制操作)。如当一个父进程通过fork()创建了一个子进程时,子进程将会共享父进程的页框。之后,无论是父进程还是子进程要对相应的内存进行写操作,都要进行COW,也就是为自己重新分配一个页框,并把之前的数据复制到页框中去,再写。
static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *pte, pmd_t *pmd, unsigned int flags) { pte_t entry; spinlock_t *ptl; entry = *pte; ... ... ... /********页在主存中的情况***********/ ptl = pte_lockptr(mm, pmd); spin_lock(ptl); if (unlikely(!pte_same(*pte, entry))) goto unlock; if (flags & FAULT_FLAG_WRITE) {//异常由写访问触发 if (!pte_write(entry))//而对应的页是不可写的 return do_wp_page(mm, vma, address, //此时必须进行写时复制的操作 pte, pmd, ptl, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { update_mmu_cache(vma, address, entry); } else { /* * This is needed only for protection faults but the arch code * is not yet telling us if this is a protection fault or not. * This still avoids useless tlb flushes for .text page faults * with threads. */ if (flags & FAULT_FLAG_WRITE) flush_tlb_page(vma, address); } unlock: pte_unmap_unlock(pte, ptl); return 0; }
可以看到,hand_pte_fault()函数处理页存在于主存中的情况的关键操作都集中在do_wp_page()函数上。该函数是用来处理COW的,不过在COW之前先要做一些检查,比如说,如果对应的页只有一个进程使用,那么便可以直接修改页的权限为可读可写,而不进行COW。总之,不到不得以的情况下是不会进行COW的。
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte) { struct page *old_page, *new_page; pte_t entry; int reuse = 0, ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; old_page = vm_normal_page(vma, address, orig_pte);//获取共享页 if (!old_page) {//获取共享页失败 /* * VM_MIXEDMAP !pfn_valid() case * * We should not cow pages in a shared writeable mapping. * Just mark the pages writable as we can't do any dirty * accounting on raw pfn maps. */ /*如果vma的映射本来就是共享且可写的,则跳转至reuse直接使用orig_pte对应的页*/ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) goto reuse; /*否则跳转至gotten分配一个页*/ goto gotten; } /* * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ /*下面首先判断匿名页的情况,如果old_page是匿名页,并且只有一个进程使用它(reuse为1),则 则直接使用该页*/ if (PageAnon(old_page) && !PageKsm(old_page)) { /*这里先判断是否有其他进程竞争,修改了页表*/ if (!trylock_page(old_page)) { page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); lock_page(old_page); page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); page_cache_release(old_page); goto unlock; } page_cache_release(old_page); } /*确定没有其他进程竞争,则进行reuse判断,通过reuse_swap_page()函数判断 old_page的_mapcount字段是否为0,是的话则表明只有一个进程使用该匿名页*/ reuse = reuse_swap_page(old_page); unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) {//如果vma的映射本来就是共享且可写的 /* * Only catch write-faults on shared writable pages, * read-only shared pages can get COWed by * get_user_pages(.write=1, .force=1). */ if (vma->vm_ops && vma->vm_ops->page_mkwrite) { struct vm_fault vmf; int tmp; vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = old_page->index; vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; vmf.page = old_page; /* * Notify the address space that the page is about to * become writable so that it can prohibit this or wait * for the page to get into an appropriate state. * * We do this without the lock held, so that it can * sleep if it needs to. */ page_cache_get(old_page);//增加old_page的引用计数作为保护 pte_unmap_unlock(page_table, ptl); /*这里通知即将修改页的权限*/ tmp = vma->vm_ops->page_mkwrite(vma, &vmf); /*如果无法修改的话,则跳转到unwritable_page*/ if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { ret = tmp; goto unwritable_page; } if (unlikely(!(tmp & VM_FAULT_LOCKED))) { lock_page(old_page); if (!old_page->mapping) { ret = 0; /* retry the fault */ unlock_page(old_page); goto unwritable_page; } } else VM_BUG_ON(!PageLocked(old_page)); /* * Since we dropped the lock we need to revalidate * the PTE as someone else may have changed it. If * they did, we just return, as we can count on the * MMU to tell us if they didn't also make it writable. */ /*走到这里表示已经成功修改了页的权限了,这里同样重新获取页表,判断是否和之前一致*/ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); page_cache_release(old_page); goto unlock; } page_mkwrite = 1; } dirty_page = old_page; get_page(dirty_page); reuse = 1; } if (reuse) {//reuse处理,也就是说不进行COW,可以直接在old_page上进行写操作 reuse: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte);//标记_PAGE_ACCESSED位 entry = maybe_mkwrite(pte_mkdirty(entry), vma);//将页的权限修改为可读可写,并且标记为脏页 if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, entry); ret |= VM_FAULT_WRITE; goto unlock; } /* * Ok, we need to copy. Oh, well.. */ /***************终于走到了不得已的一步了,下面只好进行COW了********************/ page_cache_get(old_page); gotten: pte_unmap_unlock(page_table, ptl); if (unlikely(anon_vma_prepare(vma))) goto oom; if (is_zero_pfn(pte_pfn(orig_pte))) { new_page = alloc_zeroed_user_highpage_movable(vma, address);//分配一个零页面 if (!new_page) goto oom; } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);//分配一个非零页面 if (!new_page) goto oom; cow_user_page(new_page, old_page, address, vma);//将old_page中的数据拷贝到new_page } __SetPageUptodate(new_page); /* * Don't let another task, with possibly unlocked vma, * keep the mlocked page. */ if ((vma->vm_flags & VM_LOCKED) && old_page) { lock_page(old_page); /* for LRU manipulation */ clear_page_mlock(old_page); unlock_page(old_page); } if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; /* * Re-check the pte - we dropped the lock */ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter(mm, file_rss); inc_mm_counter(mm, anon_rss); } } else inc_mm_counter(mm, anon_rss); flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot);//获取new_page的pte entry = maybe_mkwrite(pte_mkdirty(entry), vma);//修改new_page的权限 /* * Clear the pte entry and flush it first, before updating the * pte with the new entry. This will avoid a race condition * seen in the presence of one thread doing SMC and another * thread doing COW. */ ptep_clear_flush(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. */ set_pte_at_notify(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); if (old_page) { /* * Only after switching the pte to the new page may * we remove the mapcount here. Otherwise another * process may come and find the rmap count decremented * before the pte is switched to the new page, and * "reuse" the old page writing into it while our pte * here still points into it and can be read by other * threads. * * The critical issue is to order this * page_remove_rmap with the ptp_clear_flush above. * Those stores are ordered by (if nothing else,) * the barrier present in the atomic_add_negative * in page_remove_rmap. * * Then the TLB flush in ptep_clear_flush ensures that * no process can access the old page before the * decremented mapcount is visible. And the old page * cannot be reused until after the decremented * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ page_remove_rmap(old_page); } /* Free the old page.. */ new_page = old_page; ret |= VM_FAULT_WRITE; } else mem_cgroup_uncharge_page(new_page); if (new_page) page_cache_release(new_page); if (old_page) page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); if (dirty_page) { /* * Yes, Virginia, this is actually required to prevent a race * with clear_page_dirty_for_io() from clearing the page dirty * bit after it clear all dirty ptes, but before a racing * do_wp_page installs a dirty pte. * * do_no_page is protected similarly. */ if (!page_mkwrite) { wait_on_page_locked(dirty_page); set_page_dirty_balance(dirty_page, page_mkwrite); } put_page(dirty_page); if (page_mkwrite) { struct address_space *mapping = dirty_page->mapping; set_page_dirty(dirty_page); unlock_page(dirty_page); page_cache_release(dirty_page); if (mapping) { /* * Some device drivers do not set page.mapping * but still dirty their pages */ balance_dirty_pages_ratelimited(mapping); } } /* file_update_time outside page_lock */ if (vma->vm_file) file_update_time(vma->vm_file); } return ret; oom_free_new: page_cache_release(new_page); oom: if (old_page) { if (page_mkwrite) { unlock_page(old_page); page_cache_release(old_page); } page_cache_release(old_page); } return VM_FAULT_OOM; unwritable_page: page_cache_release(old_page); return ret; }