缺页异常的几种情况处理机制简介

匿名页面:

do_anonymous_page():

static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags)
{
………………
	if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),---------------(1)
						vma->vm_page_prot));
    goto setpte;
………………
	page = alloc_zeroed_user_highpage_movable(vma, address);---------------(2)
    lru_cache_add_active_or_unevictable(page, vma);------------------(3)
setpte:
	set_pte_at(mm, address, page_table, entry);----------------------(4)
}

(1)如果缺页不是由写操作触发(flags中没有FAULT_FLAG_WRITE),并且mm允许使用零页面,则不分配新页面,而是把页表项指向系统的零页面(全填充为零的页面),零页面在系统初始化时候已经初始化好了

(2)如果是写操作触发的缺页,则正常调用alloc_zeroed_user_highpage_movable(),最后调用alloc_page()分配一个页面

(3)将匿名页面添加到LRU链表中。

(4)调用set_pte_at()设置到硬件页表中。

具体流程图如下:

缺页异常的几种情况处理机制简介_第1张图片

文件映射缺页中断:

static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		unsigned int flags, pte_t orig_pte)
{
	pgoff_t pgoff = (((address & PAGE_MASK)
			- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

	pte_unmap(page_table);
	/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
	if (!vma->vm_ops->fault)
		return VM_FAULT_SIGBUS;
	if (!(flags & FAULT_FLAG_WRITE))--------------------(1)
		return do_read_fault(mm, vma, address, pmd, pgoff, flags,
				orig_pte);
	if (!(vma->vm_flags & VM_SHARED))----------------(2)
		return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
				orig_pte);
	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);-------------(3)
}

(1)如果缺页不是由写操作触发(flags中没有FAULT_FLAG_WRITE),则执行do_read_fault().

(2)如果是写操作触发的缺页,但VMA为私有映射(没有设置VM_SHARED),则执行do_cow_fault().

(3)其他情况,共享页面则执行do_shared_fault().

do_read_fault():

static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd,
		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
…………
    ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);-----------(1)
	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
		return ret;
    pte = pte_offset_map_lock(mm, pmd, address, &ptl);------------------(2)
…………
    do_set_pte(vma, address, fault_page, pte, false, false);---------------(3)
}

(1)调用__do_fault()进而调用vm_ops.fault()函数来完成页面的申请,vm_ops.fault函数主要由各个模块自己实现,例如ext4文件系统的:

static const struct vm_operations_struct ext4_file_vm_ops = {
    .fault        = ext4_filemap_fault,
    .map_pages    = filemap_map_pages,
    .page_mkwrite   = ext4_page_mkwrite,
};

追踪代码最后发现申请页面最后仍然使用的是alloc_pages()函数来实现。

(2)获取当前页表项pte

(3)将新生成的PTE entry设置到硬件页表项中

do_cow_fault():

static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd,
		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
…………
    new_page = alloc_page_vma(gfp, vma, address);------------(1)
…………
    ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);------(2)
…………
   	if (fault_page)
		copy_user_highpage(new_page, fault_page, address, vma);----------(3)
…………
    pte = pte_offset_map_lock(mm, pmd, address, &ptl);------------(4)
…………
    do_set_pte(vma, address, new_page, pte, true, true);----------(5)
    lru_cache_add_active_or_unevictable(new_page, vma);------------(6)
…………
}

(1)申请一个新的页面。

(2)使用__do_fault通过vma->vm_ops->fault()将文件内容读取到fault_page页面。

(3)如果fault_page存在,则将fault_page的内容复制到new_page中。

(4)重新获取异常地址对应的页表项。

(5)将new_page对应的PTE entry设置到硬件页表里面.

(6)将new_page页面添加到对应的LRU链表。

do_shared_fault():

static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd,
		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
…………
    ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);------------(1)
	if (vma->vm_ops->page_mkwrite) {
…………
		tmp = do_page_mkwrite(vma, fault_page, address);------------(2)
…………
    }
    pte = pte_offset_map_lock(mm, pmd, address, &ptl);--------------(3)
…………
    do_set_pte(vma, address, fault_page, pte, true, false);----------(4)
    if (set_page_dirty(fault_page))---------(5)
		dirtied = 1;
…………
    if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {------------(6)
		/*
		 * Some device drivers do not set page.mapping but still
		 * dirty their pages
		 */
		balance_dirty_pages_ratelimited(mapping);
	}
…………
}

(1)读取文件到fault_page中

(2)使页面变为可写页面(与do_read_fault()函数不同之处)

(3)获取fault_page对应的pte

(4)将新生成的PTE entry设置到硬件页表中

(5)将page标记为dirty(与do_read_fault()函数不同之处)

(6)通过balance_dirty_pages_ratelimited()来平衡并回写一部分脏页。

写时复制:

do_wp_page():

static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, pte_t *page_table, pmd_t *pmd,
		spinlock_t *ptl, pte_t orig_pte, unsigned int flags)
	__releases(ptl)
{
…………
	old_page = vm_normal_page(vma, address, orig_pte);--------------(1)
	if (!old_page) {
		/*
		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
		 * VM_PFNMAP VMA.
		 *
		 * We should not cow pages in a shared writeable mapping.
		 * Just mark the pages writable and/or call ops->pfn_mkwrite.
		 */
		if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
				     (VM_WRITE|VM_SHARED))
			return wp_pfn_shared(mm, vma, address, page_table, ptl,
					     orig_pte, pmd);-----------(2)

		pte_unmap_unlock(page_table, ptl);
		return wp_page_copy(mm, vma, address, page_table, pmd,
				    orig_pte, old_page, gfp);---------(3)
	}
/*
	 * Take out anonymous pages first, anonymous shared vmas are
	 * not dirty accountable.
	 */
	if (PageAnon(old_page) && !PageKsm(old_page)) {----------(4)
		if (!trylock_page(old_page)) {
			page_cache_get(old_page);
			pte_unmap_unlock(page_table, ptl);
			lock_page(old_page);
			page_table = pte_offset_map_lock(mm, pmd, address,
							 &ptl);
			if (!pte_same(*page_table, orig_pte)) {
				unlock_page(old_page);
				pte_unmap_unlock(page_table, ptl);
				page_cache_release(old_page);
				return 0;
			}
			page_cache_release(old_page);
		}
		if (reuse_swap_page(old_page)) {-------------(5)
			/*
			 * The page is all ours.  Move it to our anon_vma so
			 * the rmap code will not search our parent or siblings.
			 * Protected against the rmap code by the page lock.
			 */
			page_move_anon_rmap(old_page, vma, address);
			unlock_page(old_page);
			return wp_page_reuse(mm, vma, address, page_table, ptl,
					     orig_pte, old_page, 0, 0);----------------(6)
		}
		unlock_page(old_page);
	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
					(VM_WRITE|VM_SHARED))) {
		return wp_page_shared(mm, vma, address, page_table, pmd,
				      ptl, orig_pte, old_page);--------------(7)
	}

	/*
	 * Ok, we need to copy. Oh, well..
	 */
	page_cache_get(old_page);-----------(7)

	pte_unmap_unlock(page_table, ptl);
	return wp_page_copy(mm, vma, address, page_table, pmd,
			    orig_pte, old_page, gfp);-----------------(8)
}

(1)获取一个normal_mapping的页面

(2)处理special mapping的情况,如果vma是可写且共享,则调用wp_pfn_shared(),继续使用这个页面,不做写时复制操作

(3)否则调用wp_page_copy()重新分配一个页面进行写时复制。

(4)处理Anon page且不是KSM的情况,主要是加锁以及增加page引用计数。

(5)通过reuse_swap_page()函数判断page的count值是否为1,判断页面是否为只有一个进程映射的匿名页面,如果是则继续使用此页面,不做写时复制操作。

(6)如果是page cache或者KSM页面,然后继续使用此页面不做写时复制

(7)增加page->count计数

(8)此时需要写时复制,调用wp_page_copy()完成操作。

wp_pfn_shared():

static int wp_pfn_shared(struct mm_struct *mm,
			struct vm_area_struct *vma, unsigned long address,
			pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
			pmd_t *pmd)
{
…………
	if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
…………
    ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);---------(1)
…………
    }
    page_table = pte_offset_map_lock(mm, pmd, address, &ptl);------------------(2)
…………
    return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
			     NULL, 0, 0);------------(3)
}

static inline int wp_page_reuse(struct mm_struct *mm,
			struct vm_area_struct *vma, unsigned long address,
			pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
			struct page *page, int page_mkwrite,
			int dirty_shared)
	__releases(ptl)
{
…………
	flush_cache_page(vma, address, pte_pfn(orig_pte));-----------(4)
	entry = pte_mkyoung(orig_pte);-------------(4)
	entry = maybe_mkwrite(pte_mkdirty(entry), vma);-------------(5)
	if (dirty_shared) {-------(6)
…………
    dirtied = set_page_dirty(page);-----------(7)
…………
        if ((dirtied || page_mkwrite) && mapping) {
			/*
			 * Some device drivers do not set page.mapping
			 * but still dirty their pages
			 */
			balance_dirty_pages_ratelimited(mapping);---------(8)
	    }
    }
…………
}

(1)通知之前的只读页面变成了可写属性

(2)获取页面对应的PTE entry

(3)调用wp_page_reuse进一步设置页面相关属性

(4)刷新页面对应cache

(5)设置pte访问位

(6)根据pte的标志位设置页面的可写属性,同时设置pte的dirty位

(7)如果是dirty_shared页面,则设置页面的dirty位

(8)平衡页面并回写一部分脏页

以上只是根据字面意思来理解,不是很懂,大体上和缺页中断发生非写时复制操作类似。

开始写时复制的核心函数wp_page_copy():

static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long address, pte_t *page_table, pmd_t *pmd,
			pte_t orig_pte, struct page *old_page, gfp_t gfp)
{
…………
	if (unlikely(anon_vma_prepare(vma)))
		goto oom;
	if (is_zero_pfn(pte_pfn(orig_pte))) {
		new_page = alloc_zeroed_user_highpage(gfp, vma, address);--------------(1)
		if (!new_page)
			goto oom;
	} else {
		new_page = alloc_page_vma(gfp, vma, address);-----------(2)
		if (!new_page)
			goto oom;
		cow_user_page(new_page, old_page, address, vma);----------------(3)
	}
…………
	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);-----------(4)
	if (likely(pte_same(*page_table, orig_pte))) {
…………
		flush_cache_page(vma, address, pte_pfn(orig_pte));-----------(5)
		entry = mk_pte(new_page, vma->vm_page_prot);---------------(6)
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);---------------(7)
…………
		page_add_new_anon_rmap(new_page, vma, address);------------(8)
		lru_cache_add_active_or_unevictable(new_page, vma);---------------(9)
    }
        set_pte_at_notify(mm, address, page_table, entry);-------------(10)
		update_mmu_cache(vma, address, page_table);-------------------(11)
}

(1)判断old page是否为零页面,如果是则alloc_zeroed_user_highpage()分配一个全是零的页面

(2)不是0页面则分配一个新页面new_page

(3)将旧页面的内容复制到新页面。

(4)重新获取pte,并判断pte是否被修改过

(5)刷新page对应的cache

(6)利用新页面和VMA属性重新生成一个PTE entry

(7)设置PTE entry的DIRTY和WRITABLE位

(8)把new_page添加到RMAP反向映射

(9)将new_page添加到对应的LRU链表

(10)将新生成的PTE entry设置到硬件页表

(11)更新MMU cache。

总结一下,以上步骤基本上和file映射里面发生写时复制的操作大同小异。

具体流程图如下:

缺页异常的几种情况处理机制简介_第2张图片

你可能感兴趣的:(Linux,memory,management)