匿名页面:
do_anonymous_page():
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags)
{
………………
if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),---------------(1)
vma->vm_page_prot));
goto setpte;
………………
page = alloc_zeroed_user_highpage_movable(vma, address);---------------(2)
lru_cache_add_active_or_unevictable(page, vma);------------------(3)
setpte:
set_pte_at(mm, address, page_table, entry);----------------------(4)
}
(1)如果本次缺页是读访问(不带FAULT_FLAG_WRITE)且系统允许使用零页面,则直接把pte映射到系统全局的零页面上,零页面在系统初始化时已经准备好。
(2)如果是写访问,则调用alloc_zeroed_user_highpage_movable()(最终通过alloc_page())分配一个内容清零的新页面。
(3)将匿名页面添加到LRU链表中。
(4)调用set_pte_at()设置到硬件页表中。
具体流程图如下:
文件映射缺页中断:
/*
 * Dispatch handler for file-backed (vma->vm_ops) page faults.
 * Picks between a plain read fault, a private copy-on-write fault
 * and a shared writable fault, based on the fault flags and the
 * VMA's sharing mode.
 */
static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
/* Linear page offset of the faulting address within the mapped file. */
pgoff_t pgoff = (((address & PAGE_MASK)
- vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
pte_unmap(page_table);
/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
if (!vma->vm_ops->fault)
return VM_FAULT_SIGBUS;
/* Read access: map the page-cache page without write permission. */
if (!(flags & FAULT_FLAG_WRITE))--------------------(1)
return do_read_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
/* Write access to a private mapping: copy-on-write into a new page. */
if (!(vma->vm_flags & VM_SHARED))----------------(2)
return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
orig_pte);
/* Write access to a shared mapping: write the page cache directly. */
return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);-------------(3)
}
(1)如果本次缺页是读访问(不带FAULT_FLAG_WRITE),则执行do_read_fault()。
(2)如果是写访问,但VMA是私有映射(无VM_SHARED标志),则执行do_cow_fault()进行写时复制。
(3)其余情况,即对共享映射的写访问,执行do_shared_fault()。
do_read_fault():
static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
…………
ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);-----------(1)
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
pte = pte_offset_map_lock(mm, pmd, address, &ptl);------------------(2)
…………
do_set_pte(vma, address, fault_page, pte, false, false);---------------(3)
}
(1)调用__do_fault()进而调用vm_ops->fault()函数来完成页面的申请,vm_ops->fault函数主要由各个文件系统或驱动模块自己实现,例如ext4的:
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = ext4_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
};
追踪代码可以发现,申请页面最终仍然是通过alloc_pages()函数实现的。
(2)获取当前页表项pte
(3)将新生成的PTE entry设置到硬件页表项中
do_cow_fault():
static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
…………
new_page = alloc_page_vma(gfp, vma, address);------------(1)
…………
ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);------(2)
…………
if (fault_page)
copy_user_highpage(new_page, fault_page, address, vma);----------(3)
…………
pte = pte_offset_map_lock(mm, pmd, address, &ptl);------------(4)
…………
do_set_pte(vma, address, new_page, pte, true, true);----------(5)
lru_cache_add_active_or_unevictable(new_page, vma);------------(6)
…………
}
(1)申请一个新的页面。
(2)使用__do_fault通过vma->vm_ops->fault()将文件内容读取到fault_page页面。
(3)如果fault_page存在,则将fault_page的内容复制到new_page中。
(4)重新获取异常地址对应的页表项。
(5)将new_page对应的PTE entry设置到硬件页表里面.
(6)将new_page页面添加到对应的LRU链表。
do_shared_fault():
static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
{
…………
ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);------------(1)
if (vma->vm_ops->page_mkwrite) {
…………
tmp = do_page_mkwrite(vma, fault_page, address);------------(2)
…………
}
pte = pte_offset_map_lock(mm, pmd, address, &ptl);--------------(3)
…………
do_set_pte(vma, address, fault_page, pte, true, false);----------(4)
if (set_page_dirty(fault_page))---------(5)
dirtied = 1;
…………
if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {------------(6)
/*
* Some device drivers do not set page.mapping but still
* dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);
}
…………
}
(1)读取文件到fault_page中
(2)调用do_page_mkwrite()通知文件系统该页面即将变为可写(与do_read_fault()函数不同之处)。
(3)获取fault_page对应的pte
(4)将新生成的PTE entry设置到硬件页表中
(5)将page标记为dirty(与do_read_fault()函数不同之处)。
(6)通过balance_dirty_pages_ratelimited()来平衡并回写一部分脏页。
写时复制:
do_wp_page():
static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
spinlock_t *ptl, pte_t orig_pte, unsigned int flags)
__releases(ptl)
{
…………
old_page = vm_normal_page(vma, address, orig_pte);--------------(1)
if (!old_page) {
/*
* VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
* VM_PFNMAP VMA.
*
* We should not cow pages in a shared writeable mapping.
* Just mark the pages writable and/or call ops->pfn_mkwrite.
*/
if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))
return wp_pfn_shared(mm, vma, address, page_table, ptl,
orig_pte, pmd);-----------(2)
pte_unmap_unlock(page_table, ptl);
return wp_page_copy(mm, vma, address, page_table, pmd,
orig_pte, old_page, gfp);---------(3)
}
/*
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
if (PageAnon(old_page) && !PageKsm(old_page)) {----------(4)
if (!trylock_page(old_page)) {
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);
lock_page(old_page);
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte)) {
unlock_page(old_page);
pte_unmap_unlock(page_table, ptl);
page_cache_release(old_page);
return 0;
}
page_cache_release(old_page);
}
if (reuse_swap_page(old_page)) {-------------(5)
/*
* The page is all ours. Move it to our anon_vma so
* the rmap code will not search our parent or siblings.
* Protected against the rmap code by the page lock.
*/
page_move_anon_rmap(old_page, vma, address);
unlock_page(old_page);
return wp_page_reuse(mm, vma, address, page_table, ptl,
orig_pte, old_page, 0, 0);----------------(6)
}
unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(mm, vma, address, page_table, pmd,
ptl, orig_pte, old_page);--------------(7)
}
/*
* Ok, we need to copy. Oh, well..
*/
page_cache_get(old_page);-----------(7)
pte_unmap_unlock(page_table, ptl);
return wp_page_copy(mm, vma, address, page_table, pmd,
orig_pte, old_page, gfp);-----------------(8)
}
(1)通过vm_normal_page()由pte找到对应的normal mapping页面(struct page);如果是special mapping则返回NULL,进入下面的!old_page分支。
(2)处理special mapping的情况,如果vma是可写且共享,则调用wp_pfn_shared(),继续使用这个页面,不做写时复制操作
(3)否则调用wp_page_copy()重新分配一个页面进行写时复制。
(4)处理Anon page且不是KSM的情况,主要是加锁以及增加page引用计数。
(5)通过reuse_swap_page()判断该匿名页面是否只被一个进程映射(页面引用计数为1),如果是则继续使用此页面,不做写时复制操作。
(6)调用wp_page_reuse()继续复用该匿名页面,并重新设置pte的相关属性。
(7)如果是可写且共享的VMA中的页面(page cache或KSM页面),则调用wp_page_shared()继续使用此页面,不做写时复制;注意代码中page_cache_get()处也标注为(7),其作用是增加page的引用计数。
(8)其余情况需要写时复制,调用wp_page_copy()完成操作。
wp_pfn_shared():
static int wp_pfn_shared(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
pmd_t *pmd)
{
…………
if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) {
…………
ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);---------(1)
…………
}
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);------------------(2)
…………
return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte,
NULL, 0, 0);------------(3)
}
static inline int wp_page_reuse(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *page_table, spinlock_t *ptl, pte_t orig_pte,
struct page *page, int page_mkwrite,
int dirty_shared)
__releases(ptl)
{
…………
flush_cache_page(vma, address, pte_pfn(orig_pte));-----------(4)
entry = pte_mkyoung(orig_pte);-------------(4)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);-------------(5)
if (dirty_shared) {-------(6)
…………
dirtied = set_page_dirty(page);-----------(7)
…………
if ((dirtied || page_mkwrite) && mapping) {
/*
* Some device drivers do not set page.mapping
* but still dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);---------(8)
}
}
…………
}
(1)通知之前的只读页面变成了可写属性
(2)获取页面对应的PTE entry。
(3)调用wp_page_reuse进一步设置页面相关属性
(4)调用flush_cache_page()刷新页面对应的cache,并通过pte_mkyoung()设置pte的访问位(代码中这两行均标注为(4))。
(5)通过maybe_mkwrite()根据VMA的属性设置pte的可写位,同时通过pte_mkdirty()设置pte的dirty位。
(6)如果是dirty_shared页面,则进入脏页处理分支。
(7)调用set_page_dirty()设置页面的dirty标志。
(8)通过balance_dirty_pages_ratelimited()平衡并回写一部分脏页。
以上只是根据字面意思来理解,不是很懂,大体上和缺页中断发生非写时复制操作类似。
开始写时复制的核心函数wp_page_copy():
static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
pte_t orig_pte, struct page *old_page, gfp_t gfp)
{
…………
if (unlikely(anon_vma_prepare(vma)))
goto oom;
if (is_zero_pfn(pte_pfn(orig_pte))) {
new_page = alloc_zeroed_user_highpage(gfp, vma, address);--------------(1)
if (!new_page)
goto oom;
} else {
new_page = alloc_page_vma(gfp, vma, address);-----------(2)
if (!new_page)
goto oom;
cow_user_page(new_page, old_page, address, vma);----------------(3)
}
…………
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);-----------(4)
if (likely(pte_same(*page_table, orig_pte))) {
…………
flush_cache_page(vma, address, pte_pfn(orig_pte));-----------(5)
entry = mk_pte(new_page, vma->vm_page_prot);---------------(6)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);---------------(7)
…………
page_add_new_anon_rmap(new_page, vma, address);------------(8)
lru_cache_add_active_or_unevictable(new_page, vma);---------------(9)
}
set_pte_at_notify(mm, address, page_table, entry);-------------(10)
update_mmu_cache(vma, address, page_table);-------------------(11)
}
(1)判断old page是否为零页面,如果是则alloc_zeroed_user_highpage()分配一个全是零的页面
(2)不是0页面则分配一个新页面new_page
(3)调用cow_user_page()将旧页面old_page的内容复制到新页面new_page。
(4)重新获取pte,并判断pte是否被修改过。
(5)刷新page对应的cache。
(6)利用new_page和VMA的页保护属性,通过mk_pte()生成新的PTE entry。
(7)设置PTE entry的DIRTY和WRITABLE位。
(8)把new_page添加到RMAP反向映射
(9)将new_page添加到对应的LRU链表
(10)将新生成的PTE entry设置到硬件页表
(11)更新MMU cache。
总结一下,以上步骤基本上和file映射里面发生写时复制的操作大同小异。
具体流程图如下: