Linux Kernel Architecture: The Copy-on-Write (COW) Mechanism

When process A creates a child process B with the fork() system call, the child starts out as a copy of its parent and therefore maps the same physical pages. To save memory and speed up process creation, fork() lets child B share parent A's physical pages read-only, and it also downgrades parent A's own access to those pages to read-only.
As a result, whenever either the parent or the child writes to one of these shared physical pages, the CPU raises a page fault exception (interrupt vector 14 on x86). Handling that fault eventually reaches handle_pte_fault(), which checks the permissions recorded in the pte.
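
Before diving into the kernel path, here is a minimal userspace sketch (my own illustration, not from the original article or the kernel source) that exercises exactly this situation: the parent populates a private page before fork(), and the child's store to it is the kind of write that takes the fault path discussed below.

/* cow_demo.c - minimal COW demonstration (illustrative sketch).
 * The child's store to an inherited private page is the write that
 * triggers the page-fault path described in this article.
 * Build: gcc -o cow_demo cow_demo.c
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    char *buf = malloc(4096);           /* roughly one page of private memory */
    if (!buf)
        return 1;
    strcpy(buf, "parent data");         /* populate the page before fork() */

    pid_t pid = fork();                 /* child now shares the page read-only */
    if (pid < 0)
        return 1;

    if (pid == 0) {
        buf[0] = 'C';                   /* write fault -> handle_pte_fault() -> do_wp_page() */
        printf("child : %s\n", buf);    /* child now owns a private copy */
        _exit(0);
    }

    waitpid(pid, NULL, 0);
    printf("parent: %s\n", buf);        /* parent's page is unchanged */
    free(buf);
    return 0;
}

The parent still prints "parent data" because the child's write was redirected to a freshly allocated copy; the shared read-only page itself is never modified.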
Let's take a look at handle_pte_fault().

int handle_pte_fault(struct mm_struct *mm,
             struct vm_area_struct *vma, unsigned long address,
             pte_t *pte, pmd_t *pmd, unsigned int flags)
{
        pte_t entry;
        spinlock_t *ptl;

        entry = *pte;

        // If the pte's present flag is 0, the page is not in memory at all,
        // so the anonymous-page, nonlinear and swap cases have to be
        // considered; more on those later.
        if (!pte_present(entry)) {
            if (pte_none(entry)) {
                if (vma->vm_ops) {
                    if (likely(vma->vm_ops->fault))
                        return do_linear_fault(mm, vma, address,
                            pte, pmd, flags, entry);
                }
                return do_anonymous_page(mm, vma, address,
                             pte, pmd, flags);
            }
            if (pte_file(entry))
                return do_nonlinear_fault(mm, vma, address,
                        pte, pmd, flags, entry);
            return do_swap_page(mm, vma, address,
                        pte, pmd, flags, entry);
        }
        ...
        // If this is a write access but the pte has no write permission,
        // do_wp_page() is called: it allocates a new page with write
        // permission and copies the old page's contents into it.
        if (flags & FAULT_FLAG_WRITE) {
                if (!pte_write(entry))
                        return do_wp_page(mm, vma, address,
                                          pte, pmd, ptl, entry);
                entry = pte_mkdirty(entry);
        }
        ...
}

From the code above we can see that, on a write fault where the pte lacks write permission, do_wp_page() is called to try to resolve the fault. According to Understanding the Linux Kernel (3rd edition), do_wp_page() looks at the page descriptor field _count to decide whether a COW copy is really necessary. _count records how many users currently map or reference the page: it starts at -1 and is incremented each time a process takes a reference, so a value of 0 means the page has a single user, no copy is needed, and the page can simply be reused. Deriving "how many processes map this page" from _count is more involved than it sounds, though, because adding the page to the swap cache also increments _count and sets PG_private. If the check concludes that no COW is needed, do_wp_page() simply marks the pte as writable, so no further write faults occur on that page. (I still haven't found where do_wp_page() checks _count in this version; I need to keep looking.)
After that comes, of course, the case where COW really is required: allocate a new page and copy the old page's contents into it.
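
As for the open question above: in the kernel version quoted below, the reuse decision that the book attributes to _count is made by reuse_swap_page() (mm/swapfile.c), which do_wp_page() calls on the anonymous-page path; it works from the page's mapcount plus its swap count rather than from _count directly. What follows is a simplified sketch of that function's logic as I remember it from kernels of this era, not a verbatim copy; check it against your own tree.

/* Sketch of reuse_swap_page() (mm/swapfile.c, ~3.x kernels), paraphrased.
 * Returns non-zero when the faulting task is effectively the only user of
 * the anonymous page, in which case do_wp_page() can simply make the pte
 * writable instead of copying the page.
 */
int reuse_swap_page(struct page *page)
{
    int count;

    VM_BUG_ON(!PageLocked(page));
    if (unlikely(PageKsm(page)))
        return 0;                        /* KSM pages are never reused in place */
    count = page_mapcount(page);         /* how many ptes map this page */
    if (count <= 1 && PageSwapCache(page)) {
        count += page_swapcount(page);   /* references held via swap entries */
        if (count == 1 && !PageWriteback(page)) {
            /* sole user: drop the swap-cache copy and keep the page dirty */
            delete_from_swap_cache(page);
            SetPageDirty(page);
        }
    }
    return count <= 1;
}

So the "is anyone else using this page?" test here is mapcount-based, and the swap-cache complication mentioned above survives as the page_swapcount() term.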

static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
        unsigned long address, pte_t *page_table, pmd_t *pmd,
        spinlock_t *ptl, pte_t orig_pte)
    __releases(ptl)
{
    struct page *old_page, *new_page = NULL;
    pte_t entry;
    int ret = 0;
    int page_mkwrite = 0;
    struct page *dirty_page = NULL;
    unsigned long mmun_start = 0;   /* For mmu_notifiers */
    unsigned long mmun_end = 0; /* For mmu_notifiers */

    old_page = vm_normal_page(vma, address, orig_pte);
    if (!old_page) {
        /*
         * VM_MIXEDMAP !pfn_valid() case
         *
         * We should not cow pages in a shared writeable mapping.
         * Just mark the pages writable as we can't do any dirty
         * accounting on raw pfn maps.
         */
        if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                     (VM_WRITE|VM_SHARED))
            goto reuse;
        goto gotten;
    }

    /*
     * Take out anonymous pages first, anonymous shared vmas are
     * not dirty accountable.
     */
    if (PageAnon(old_page) && !PageKsm(old_page)) {
        if (!trylock_page(old_page)) {
            page_cache_get(old_page);
            pte_unmap_unlock(page_table, ptl);
            lock_page(old_page);
            page_table = pte_offset_map_lock(mm, pmd, address,
                             &ptl);
            if (!pte_same(*page_table, orig_pte)) {
                unlock_page(old_page);
                goto unlock;
            }
            page_cache_release(old_page);
        }
        if (reuse_swap_page(old_page)) {
            /*
             * The page is all ours. Move it to our anon_vma so
             * the rmap code will not search our parent or siblings.
             * Protected against the rmap code by the page lock.
             */
            page_move_anon_rmap(old_page, vma, address);
            unlock_page(old_page);
            goto reuse;
        }
        unlock_page(old_page);
    } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
                    (VM_WRITE|VM_SHARED))) {
        /*
         * Only catch write-faults on shared writable pages,
         * read-only shared pages can get COWed by
         * get_user_pages(.write=1, .force=1).
         */
        if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
            struct vm_fault vmf;
            int tmp;

            vmf.virtual_address = (void __user *)(address &
                                PAGE_MASK);
            vmf.pgoff = old_page->index;
            vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
            vmf.page = old_page;

            /*
             * Notify the address space that the page is about to
             * become writable so that it can prohibit this or wait
             * for the page to get into an appropriate state.
             *
             * We do this without the lock held, so that it can
             * sleep if it needs to.
             */
            page_cache_get(old_page);
            pte_unmap_unlock(page_table, ptl);

            tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
            if (unlikely(tmp &
                    (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
                ret = tmp;
                goto unwritable_page;
            }
            if (unlikely(!(tmp & VM_FAULT_LOCKED))) {
                lock_page(old_page);
                if (!old_page->mapping) {
                    ret = 0; /* retry the fault */
                    unlock_page(old_page);
                    goto unwritable_page;
                }
            } else
                VM_BUG_ON(!PageLocked(old_page));

            /*
             * Since we dropped the lock we need to revalidate
             * the PTE as someone else may have changed it.  If
             * they did, we just return, as we can count on the
             * MMU to tell us if they didn't also make it writable.
             */
            page_table = pte_offset_map_lock(mm, pmd, address,
                             &ptl);
            if (!pte_same(*page_table, orig_pte)) {
                unlock_page(old_page);
                goto unlock;
            }

            page_mkwrite = 1;
        }
        dirty_page = old_page;
        get_page(dirty_page);

reuse:
        flush_cache_page(vma, address, pte_pfn(orig_pte));
        entry = pte_mkyoung(orig_pte);
        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        if (ptep_set_access_flags(vma, address, page_table, entry,1))
            update_mmu_cache(vma, address, page_table);
        pte_unmap_unlock(page_table, ptl);
        ret |= VM_FAULT_WRITE;

        if (!dirty_page)
            return ret;

        /*
         * Yes, Virginia, this is actually required to prevent a race
         * with clear_page_dirty_for_io() from clearing the page dirty
         * bit after it clear all dirty ptes, but before a racing
         * do_wp_page installs a dirty pte.
         *
         * __do_fault is protected similarly.
         */
        if (!page_mkwrite) {
            wait_on_page_locked(dirty_page);
            set_page_dirty_balance(dirty_page, page_mkwrite);
            /* file_update_time outside page_lock */
            if (vma->vm_file)
                file_update_time(vma->vm_file);
        }
        put_page(dirty_page);
        if (page_mkwrite) {
            struct address_space *mapping = dirty_page->mapping;

            set_page_dirty(dirty_page);
            unlock_page(dirty_page);
            page_cache_release(dirty_page);
            if (mapping)    {
                /*
                 * Some device drivers do not set page.mapping
                 * but still dirty their pages
                 */
                balance_dirty_pages_ratelimited(mapping);
            }
        }

        return ret;
    }

    /*
     * Ok, we need to copy. Oh, well..
     */
    page_cache_get(old_page);
gotten:
    pte_unmap_unlock(page_table, ptl);

    if (unlikely(anon_vma_prepare(vma)))
        goto oom;

    if (is_zero_pfn(pte_pfn(orig_pte))) {
        new_page = alloc_zeroed_user_highpage_movable(vma, address);
        if (!new_page)
            goto oom;
    } else {
        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
        if (!new_page)
            goto oom;
        cow_user_page(new_page, old_page, address, vma);
    }
    __SetPageUptodate(new_page);

    if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
        goto oom_free_new;

    mmun_start  = address & PAGE_MASK;
    mmun_end    = mmun_start + PAGE_SIZE;
    mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);

    /*
     * Re-check the pte - we dropped the lock
     */
    page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
    if (likely(pte_same(*page_table, orig_pte))) {
        if (old_page) {
            if (!PageAnon(old_page)) {
                dec_mm_counter_fast(mm, MM_FILEPAGES);
                inc_mm_counter_fast(mm, MM_ANONPAGES);
            }
        } else
            inc_mm_counter_fast(mm, MM_ANONPAGES);
        flush_cache_page(vma, address, pte_pfn(orig_pte));
        entry = mk_pte(new_page, vma->vm_page_prot);
        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        /*
         * Clear the pte entry and flush it first, before updating the
         * pte with the new entry. This will avoid a race condition
         * seen in the presence of one thread doing SMC and another
         * thread doing COW.
         */
        ptep_clear_flush(vma, address, page_table);
        page_add_new_anon_rmap(new_page, vma, address);
        /*
         * We call the notify macro here because, when using secondary
         * mmu page tables (such as kvm shadow page tables), we want the
         * new page to be mapped directly into the secondary page table.
         */
        set_pte_at_notify(mm, address, page_table, entry);
        update_mmu_cache(vma, address, page_table);
        if (old_page) {
            /*
             * Only after switching the pte to the new page may
             * we remove the mapcount here. Otherwise another
             * process may come and find the rmap count decremented
             * before the pte is switched to the new page, and
             * "reuse" the old page writing into it while our pte
             * here still points into it and can be read by other
             * threads.
             *
             * The critical issue is to order this
             * page_remove_rmap with the ptp_clear_flush above.
             * Those stores are ordered by (if nothing else,)
             * the barrier present in the atomic_add_negative
             * in page_remove_rmap.
             *
             * Then the TLB flush in ptep_clear_flush ensures that
             * no process can access the old page before the
             * decremented mapcount is visible. And the old page
             * cannot be reused until after the decremented
             * mapcount is visible. So transitively, TLBs to
             * old page will be flushed before it can be reused.
             */
            page_remove_rmap(old_page);
        }

        /* Free the old page.. */
        new_page = old_page;
        ret |= VM_FAULT_WRITE;
    } else
        mem_cgroup_uncharge_page(new_page);

    if (new_page)
        page_cache_release(new_page);
unlock:
    pte_unmap_unlock(page_table, ptl);
    if (mmun_end > mmun_start)
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
    if (old_page) {
        /*
         * Don't let another task, with possibly unlocked vma,
         * keep the mlocked page.
         */
        if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) {
            lock_page(old_page);    /* LRU manipulation */
            munlock_vma_page(old_page);
            unlock_page(old_page);
        }
        page_cache_release(old_page);
    }
    return ret;
oom_free_new:
    page_cache_release(new_page);
oom:
    if (old_page)
        page_cache_release(old_page);
    return VM_FAULT_OOM;

unwritable_page:
    page_cache_release(old_page);
    return ret;
}

How the various Linux process-creation functions implement COW:
http://www.cnblogs.com/biyeymyhjob/archive/2012/07/20/2601655.html
