当进程 A 使用系统调用 fock 创建一个子进程 B 时,由于子进程 B 实际上是父进程 A 的一个拷贝,因
此会拥有与父进程相同的物理页面。也即为了达到节约内存和加快创建速度的目标, fork()函数会让子进程
B 以只读方式共享父进程 A 的物理页面。同时将父进程 A 对这些物理页面的访问权限也设成只读!!!。
这样一来,当父进程 A 或子进程 B 任何一方对这些以共享的物理页面执行写操作时,都会产生页面出错异常(page_fault int14)中断。当这个异常发生时,最终会调用到handle_pte_fault()函数来进行pte权限的检查。
来看一下handle_pte_fault()函数。
int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
{
//如果pte的presetn flag为0,表示页面根本不在内存当中,所以需要检查匿名页,nonlinear,swap等。这个后
//面再讲
if (!pte_present(entry)) {
if (pte_none(entry)) {
if (vma->vm_ops) {
if (likely(vma->vm_ops->fault))
return do_linear_fault(mm, vma, address,
pte, pmd, flags, entry);
}
return do_anonymous_page(mm, vma, address,
pte, pmd, flags);
}
if (pte_file(entry))
return do_nonlinear_fault(mm, vma, address,
pte, pmd, flags, entry);
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
}
...
....
//这里可以看到如果是写操作但检查pte没有写权限,则会调用do_wp_page来分配一个新页面并设置写权限,然后把旧页
//面复制到新的页面了
if (flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(mm, vma, address,
pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
...
}
从上面的代码可以看到,如果发现是写操作异常,而且pte flags没有write权限,则会调用do_wp_page()来试图解决这个异常。根据<<深入理解Linux内核>>(第三版)的说明,do_wp_page()会检查_count这个page flag来决定是否真正需要COW。因为page flag,_count表示当前有几个process在映射或者使用这个page。初始值为-1,有process分配这个page的时候会加1。所以当发现_count为0的时候是不需要做COW的,只需要使用就可以。但判断_count值来判断有几个process映射这个page的过程比较复杂。因为放到swap cache的时候_count值也会加1,而且PG_private也会被设置。判断完之后,如果发现不需要COW,直接就mark当前的page对应的pte flag权限为write,就不会再发生write异常了(?????还没看到do_wp_page()函数哪里会检查_count ~~~~ 需要继续看一下)。
后面当然就是发现一定要COW,就重新申请一个page,然后把旧的page拷贝到新的page的过程了。
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
spinlock_t *ptl, pte_t orig_pte)
__releases(ptl)
{
struct page *old_page, *new_page = NULL;
pte_t entry;
int ret = 0;
int page_mkwrite = 0;
struct page *dirty_page = NULL;
unsigned long mmun_start = 0; /* For mmu_notifiers */
unsigned long mmun_end = 0; /* For mmu_notifiers */
old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page) {
/* * VM_MIXEDMAP !pfn_valid() case * * We should not cow pages in a shared writeable mapping. * Just mark the pages writable as we can't do any dirty * accounting on raw pfn maps. */
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
goto reuse;
goto gotten;
}
/* * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */
if (PageAnon(old_page) && !PageKsm(old_page)) {
if (!trylock_page(old_page)) {
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
lock_page(old_page);
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
goto unlock;
}
page_cache_release(old_page);
}
if (reuse_swap_page(old_page)) {
/* * The page is all ours. Move it to our anon_vma so * the rmap code will not search our parent or siblings. * Protected against the rmap code by the page lock. */
page_move_anon_rmap(old_page, vma, address);
unlock_page(old_page);
goto reuse;
}
unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
/* * Only catch write-faults on shared writable pages, * read-only shared pages can get COWed by * get_user_pages(.write=1, .force=1). */
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
struct vm_fault vmf;
int tmp;
vmf.virtual_address = (void __user *)(address &
PAGE_MASK);
vmf.pgoff = old_page->index;
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
vmf.page = old_page;
/* * Notify the address space that the page is about to * become writable so that it can prohibit this or wait * for the page to get into an appropriate state. * * We do this without the lock held, so that it can * sleep if it needs to. */
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(tmp &
(VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
ret = tmp;
goto unwritable_page;
}
if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
lock_page(old_page);
if (!old_page->mapping) {
ret = 0; /* retry the fault */
unlock_page(old_page);
goto unwritable_page;
}
} else
VM_BUG_ON(!PageLocked(old_page));
/* * Since we dropped the lock we need to revalidate * the PTE as someone else may have changed it. If * they did, we just return, as we can count on the * MMU to tell us if they didn't also make it writable. */
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
goto unlock;
}
page_mkwrite = 1;
}
dirty_page = old_page;
get_page(dirty_page);
reuse:
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
if (ptep_set_access_flags(vma, address, page_table, entry,1))
update_mmu_cache(vma, address, page_table);
pte_unmap_unlock(page_table, ptl);
ret |= VM_FAULT_WRITE;
if (!dirty_page)
return ret;
/* * Yes, Virginia, this is actually required to prevent a race * with clear_page_dirty_for_io() from clearing the page dirty * bit after it clear all dirty ptes, but before a racing * do_wp_page installs a dirty pte. * * __do_fault is protected similarly. */
if (!page_mkwrite) {
wait_on_page_locked(dirty_page);
set_page_dirty_balance(dirty_page, page_mkwrite);
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
}
put_page(dirty_page);
if (page_mkwrite) {
struct address_space *mapping = dirty_page->mapping;
set_page_dirty(dirty_page);
unlock_page(dirty_page);
page_cache_release(dirty_page);
if (mapping) {
/* * Some device drivers do not set page.mapping * but still dirty their pages */
balance_dirty_pages_ratelimited(mapping);
}
}
return ret;
}
/* * Ok, we need to copy. Oh, well.. */
page_cache_get(old_page);
gotten:
pte_unmap_unlock(page_table, ptl);
if (unlikely(anon_vma_prepare(vma)))
goto oom;
if (is_zero_pfn(pte_pfn(orig_pte))) {
new_page = alloc_zeroed_user_highpage_movable(vma, address);
if (!new_page)
goto oom;
} else {
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (!new_page)
goto oom;
cow_user_page(new_page, old_page, address, vma);
}
__SetPageUptodate(new_page);
if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
mmun_start = address & PAGE_MASK;
mmun_end = mmun_start + PAGE_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/* * Re-check the pte - we dropped the lock */
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (likely(pte_same(*page_table, orig_pte))) {
if (old_page) {
if (!PageAnon(old_page)) {
dec_mm_counter_fast(mm, MM_FILEPAGES);
inc_mm_counter_fast(mm, MM_ANONPAGES);
}
} else
inc_mm_counter_fast(mm, MM_ANONPAGES);
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/* * Clear the pte entry and flush it first, before updating the * pte with the new entry. This will avoid a race condition * seen in the presence of one thread doing SMC and another * thread doing COW. */
ptep_clear_flush(vma, address, page_table);
page_add_new_anon_rmap(new_page, vma, address);
/* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. */
set_pte_at_notify(mm, address, page_table, entry);
update_mmu_cache(vma, address, page_table);
if (old_page) {
/* * Only after switching the pte to the new page may * we remove the mapcount here. Otherwise another * process may come and find the rmap count decremented * before the pte is switched to the new page, and * "reuse" the old page writing into it while our pte * here still points into it and can be read by other * threads. * * The critical issue is to order this * page_remove_rmap with the ptp_clear_flush above. * Those stores are ordered by (if nothing else,) * the barrier present in the atomic_add_negative * in page_remove_rmap. * * Then the TLB flush in ptep_clear_flush ensures that * no process can access the old page before the * decremented mapcount is visible. And the old page * cannot be reused until after the decremented * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */
page_remove_rmap(old_page);
}
/* Free the old page.. */
new_page = old_page;
ret |= VM_FAULT_WRITE;
} else
mem_cgroup_uncharge_page(new_page);
if (new_page)
page_cache_release(new_page);
unlock:
pte_unmap_unlock(page_table, ptl);
if (mmun_end > mmun_start)
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
if (old_page) {
/* * Don't let another task, with possibly unlocked vma, * keep the mlocked page. */
if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
lock_page(old_page); /* LRU manipulation */
munlock_vma_page(old_page);
unlock_page(old_page);
}
page_cache_release(old_page);
}
return ret;
oom_free_new:
page_cache_release(new_page);
oom:
if (old_page)
page_cache_release(old_page);
return VM_FAULT_OOM;
unwritable_page:
page_cache_release(old_page);
return ret;
}
Linux几种进程创建函数对COW的实现:
http://www.cnblogs.com/biyeymyhjob/archive/2012/07/20/2601655.html