在下面几种情况下会发生,页面出错异常(也叫缺页中断):
1、相应的页面目录项或者页面表项为空,也就是该线性地址与物理地址的映射关系尚未建立,或者已经撤销。
2、相应的物理页面不在内存中。 本文讨论的就是这种情况。
3、指令中规定的访问方式与页面的权限不符,例如企图写一个“只读”的页面。
假设已经建立好了映射,但是页表项最后一位P为0,表示页面不在内存中;整个页表项如下图,offset表示页面在一个磁盘设备的位置,也就是磁盘设备的逻辑页面号;而type则是指该页面在哪一个磁盘设备中。
图 1 页面交换项结构
这里假定CPU的运行已经到达了页面异常服务程序的主体do_page_fault()的入口处。
代码如下: arch/i386/mm/fault.c
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct * vma; unsigned long address; unsigned long page; unsigned long fixup; int write; siginfo_t info; /* get the address */ __asm__("movl %%cr2,%0":"=r" (address));//把映射的失败的地址保存在address中 tsk = current;//task_struct /* * We fault-in kernel-space virtual memory on-demand. The * 'reference' page table is init_mm.pgd. * * NOTE! We MUST NOT take any locks for this case. We may * be in an interrupt or a critical region, and should * only copy the information from the master page table, * nothing more. */ if (address >= TASK_SIZE) goto vmalloc_fault; mm = tsk->mm;//mm_struct info.si_code = SEGV_MAPERR; /* * If we're in an interrupt or have no user * context, we must not take the fault.. */ if (in_interrupt() || !mm) goto no_context; down(&mm->mmap_sem); vma = find_vma(mm, address);//找出结束地址大于给定地址的第一个区间。 if (!vma)//没有找到,说明没有一个区间的结束地址高于给定的地址,参考上图,说明这个地址是在堆栈之下,也就是3G字节以上了。 goto bad_area; if (vma->vm_start <= address)//起始地址不高于address,说明映射已经建立,转到good_area去进一步检查失败原因。 goto good_area; if (!(vma->vm_flags & VM_GROWSDOWN)) goto bad_area; .... /* * Ok, we have a good vm_area for this memory access, so * we can handle it.. */ good_area: info.si_code = SEGV_ACCERR; write = 0; switch (error_code & 3) {// 110 & 011 = 2 default: /* 3: write, present */ #ifdef TEST_VERIFY_AREA if (regs->cs == KERNEL_CS) printk("WP fault at %08lx\n", regs->eip); #endif /* fall through */ case 2: /* write, not present */ if (!(vma->vm_flags & VM_WRITE)) goto bad_area; write++;//执行到这里 break; case 1: /* read, present */ goto bad_area; case 0: /* read, not present */ if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto bad_area; } /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo * the fault. */ switch (handle_mm_fault(mm, vma, address, write)) { case 1: tsk->min_flt++; break; case 2: tsk->maj_flt++; break; case 0: goto do_sigbus; default: goto out_of_memory; } /* * Did it hit the DOS screen memory VA from vm86 mode? */ if (regs->eflags & VM_MASK) { unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT; if (bit < 32) tsk->thread.screen_bitmap |= 1 << bit; } up(&mm->mmap_sem); return; ....... }
内核的中断/异常响应机制还传过来两个参数。一个是pt_regs结构指针regs,它指向例外发生前夕CPU中各寄存器内容的一份副本。而error_code则进一步指明映射失败的具体原因。
error_code:
bit 0 == 0 means no page found, 1 means protection fault
bit 1 == 0 means read, 1 means write
bit 2 == 0 means kernel, 1 means user-mode 此时,error_code为110,用户态,页面不在内存中,写。
handle_mm_fault函数,代码如下:
int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, int write_access) { int ret = -1; pgd_t *pgd; pmd_t *pmd; pgd = pgd_offset(mm, address);//返回页面表项指针 pmd = pmd_alloc(pgd, address);//中转了一下,还是页目录表项指针 if (pmd) { pte_t * pte = pte_alloc(pmd, address);//返回指向页表项的指针 if (pte) ret = handle_pte_fault(mm, vma, address, write_access, pte); } return ret; }
static inline int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, int write_access, pte_t * pte) { pte_t entry; /* * We need the page table lock to synchronize with kswapd * and the SMP-safe atomic PTE updates. */ spin_lock(&mm->page_table_lock); entry = *pte;//页表项中内容 if (!pte_present(entry)) {//页面不在内存中 /* * If it truly wasn't present, we know that kswapd * and the PTE updates will not touch it later. So * drop the lock. */ spin_unlock(&mm->page_table_lock); if (pte_none(entry))//页表项不为空 return do_no_page(mm, vma, address, write_access, pte); return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);//执行到这里 } if (write_access) { if (!pte_write(entry)) return do_wp_page(mm, vma, address, pte, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); establish_pte(vma, address, pte, entry); spin_unlock(&mm->page_table_lock); return 1; }
do_swap_page函数,如下:
static int do_swap_page(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, pte_t * page_table, swp_entry_t entry, int write_access) { struct page *page = lookup_swap_cache(entry);//从hash表中寻找 pte_t pte; if (!page) { lock_kernel(); swapin_readahead(entry);//预读页面 page = read_swap_cache(entry);//真正得到一个页面,这个页面可能从hash表中寻找到,因为上面预读了。或者自己申请页面,并且从盘上将其内容读进来。 unlock_kernel(); if (!page) return -1; flush_page_to_ram(page); flush_icache_page(vma, page); } mm->rss++; pte = mk_pte(page, vma->vm_page_prot);//形成页表项 /* * Freeze the "shared"ness of the page, ie page_count + swap_count. * Must lock page before transferring our swap count to already * obtained page count. */ lock_page(page); swap_free(entry); if (write_access && !is_page_shared(page)) pte = pte_mkwrite(pte_mkdirty(pte));//页表项赋予已写过对应的物理页,可进行读、写或者执行 UnlockPage(page); set_pte(page_table, pte);//页表项(属性刚才已经设置了)指向对应的页面 /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); return 1; /* Minor fault */ }
一、下面分别解释各个函数。首先解释swapin_readahead函数,如下:
void swapin_readahead(swp_entry_t entry) { int i, num; struct page *new_page; unsigned long offset; /* * Get the number of handles we should do readahead io to. Also, * grab temporary references on them, releasing them as io completes. */ num = valid_swaphandles(entry, &offset); for (i = 0; i < num; offset++, i++) { ...... new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0); if (new_page != NULL) page_cache_release(new_page);//page使用计数减1 swap_free(SWP_ENTRY(SWP_TYPE(entry), offset)); } return; }
提前预读相邻的盘面,根据下面的描述,__get_free_page,page使用计数为1,add_to_swap_cache,page使用计数再加1;此时page_cache_release,page使用计数又变成了1。直到有进程认领,才变成2。
struct page * read_swap_cache_async(swp_entry_t entry, int wait) { struct page *found_page = 0, *new_page; unsigned long new_page_addr; /* * Make sure the swap entry is still in use. */ if (!swap_duplicate(entry)) /* Account for the swap cache */ goto out; /* * Look for the page in the swap cache. */ found_page = lookup_swap_cache(entry);//假设没有找到 if (found_page) goto out_free_swap; new_page_addr = __get_free_page(GFP_USER);//刚申请的page结构,使用计数为1 if (!new_page_addr) goto out_free_swap; /* Out of memory */ new_page = virt_to_page(new_page_addr);//转化成对应的page结构指针 /* * Check the swap cache again, in case we stalled above. */ found_page = lookup_swap_cache(entry);//假设没有找到 if (found_page) goto out_free_page; /* * Add it to the swap cache and read its contents. */ lock_page(new_page); add_to_swap_cache(new_page, entry);//加入到对应的链表上 rw_swap_page(READ, new_page, wait);//真正的把磁盘上的数据读到新申请的page上,等待块设备驱动一章再来看 return new_page; out_free_page: page_cache_release(new_page); out_free_swap: swap_free(entry); out: return found_page; }
void add_to_swap_cache(struct page *page, swp_entry_t entry) { unsigned long flags; #ifdef SWAP_CACHE_INFO swap_cache_add_total++; #endif if (!PageLocked(page)) BUG(); if (PageTestandSetSwapCache(page)) BUG(); if (page->mapping) BUG(); flags = page->flags & ~((1 << PG_error) | (1 << PG_arch_1)); page->flags = flags | (1 << PG_uptodate); add_to_page_cache_locked(page, &swapper_space, entry.val); }
add_to_page_cache_locked函数,代码如下:
void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index) { if (!PageLocked(page)) BUG(); page_cache_get(page);//增加了使用计数,现在使用计数为2 spin_lock(&pagecache_lock); page->index = index;//index存着页面交换项 add_page_to_inode_queue(mapping, page);//page->list链入mapping->clean_pages add_page_to_hash_queue(page, page_hash(mapping, index));//page->next_hash和page->pprev_hash链入全局的Hash表 lru_cache_add(page);//page->lru链入了全局的active_list spin_unlock(&pagecache_lock); }
static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page) { struct list_head *head = &mapping->clean_pages; mapping->nrpages++; list_add(&page->list, head);//page->list链入mapping->clean_pages page->mapping = mapping;//mapping指向了swapper_space }
struct address_space swapper_space = { LIST_HEAD_INIT(swapper_space.clean_pages), LIST_HEAD_INIT(swapper_space.dirty_pages), LIST_HEAD_INIT(swapper_space.locked_pages), 0, /* nrpages */ &swap_aops, };
static void add_page_to_hash_queue(struct page * page, struct page **p) { struct page *next = *p;//page->next_hash和page->pprev_hash链入全局的Hash表 *p = page; page->next_hash = next; page->pprev_hash = p; if (next) next->pprev_hash = &page->next_hash; if (page->buffers) PAGE_BUG(page); atomic_inc(&page_cache_size); }
void lru_cache_add(struct page * page) { spin_lock(&pagemap_lru_lock); if (!PageLocked(page)) BUG(); DEBUG_ADD_PAGE add_page_to_active_list(page); /* This should be relatively rare */ if (!page->age) deactivate_page_nolock(page); spin_unlock(&pagemap_lru_lock); }
#define add_page_to_active_list(page) { \ DEBUG_ADD_PAGE \ ZERO_PAGE_BUG \ SetPageActive(page); \ list_add(&(page)->lru, &active_list); \ //page->lru链入了全局的active_list nr_active_pages++; \ //全局的nr_active_pages加1 }
#define read_swap_cache(entry) read_swap_cache_async(entry, 1);还是调用read_swap_cache_async函数,只是本次执行,很可能从lookup_swap_cache函数,找到了page。
struct page * read_swap_cache_async(swp_entry_t entry, int wait) { struct page *found_page = 0, *new_page; unsigned long new_page_addr; /* * Make sure the swap entry is still in use. */ if (!swap_duplicate(entry)) /* Account for the swap cache */ goto out; /* * Look for the page in the swap cache. */ found_page = lookup_swap_cache(entry);//假设在hash表中找到对应的page,有进程认领了,使用计数为2 if (found_page) goto out_free_swap; new_page_addr = __get_free_page(GFP_USER); if (!new_page_addr) goto out_free_swap; /* Out of memory */ new_page = virt_to_page(new_page_addr); /* * Check the swap cache again, in case we stalled above. */ found_page = lookup_swap_cache(entry);//有可能__get_free_page,没有足够的可分配的页面,切换到其他进程了,再切回来时,在Hash表中再寻找一遍 if (found_page) goto out_free_page; /* * Add it to the swap cache and read its contents. */ lock_page(new_page); add_to_swap_cache(new_page, entry); rw_swap_page(READ, new_page, wait); return new_page; out_free_page: page_cache_release(new_page); out_free_swap: swap_free(entry); out: return found_page; }
三、lookup_swap_cache函数,如下:
struct page * lookup_swap_cache(swp_entry_t entry) { struct page *found; #ifdef SWAP_CACHE_INFO swap_cache_find_total++; #endif while (1) { /* * Right now the pagecache is 32-bit only. But it's a 32 bit index. =) */ repeat: found = find_lock_page(&swapper_space, entry.val);//entry.val为页面交换项 if (!found) return 0; /* * Though the "found" page was in the swap cache an instant * earlier, it might have been removed by refill_inactive etc. * Re search ... Since find_lock_page grabs a reference on * the page, it can not be reused for anything else, namely * it can not be associated with another swaphandle, so it * is enough to check whether the page is still in the scache. */ if (!PageSwapCache(found)) { UnlockPage(found); page_cache_release(found); goto repeat; } if (found->mapping != &swapper_space) goto out_bad; #ifdef SWAP_CACHE_INFO swap_cache_find_success++; #endif UnlockPage(found); return found; }
#define find_lock_page(mapping, index) \ __find_lock_page(mapping, index, page_hash(mapping, index))
struct page * __find_lock_page (struct address_space *mapping, unsigned long offset, struct page **hash) { struct page *page; /* * We scan the hash list read-only. Addition to and removal from * the hash-list needs a held write-lock. */ repeat: spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, offset, *hash);//得到了hash表的其中一个链表的头 if (page) { page_cache_get(page);//增加使用计数 spin_unlock(&pagecache_lock); lock_page(page); /* Is the page still hashed? Ok, good.. */ if (page->mapping) return page; /* Nope: we raced. Release and try again.. */ UnlockPage(page); page_cache_release(page); goto repeat; } spin_unlock(&pagecache_lock); return NULL; }
static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page) { goto inside; for (;;) { page = page->next_hash;//从hash表中寻找 inside: if (!page) goto not_found; if (page->mapping != mapping) continue; if (page->index == offset) break; } /* * Touching the page may move it to the active list. * If we end up with too few inactive pages, we wake * up kswapd. */ age_page_up(page); if (inactive_shortage() > inactive_target / 2 && free_shortage()) wakeup_kswapd(0); not_found: return page; }
根据页面交换项,在hash表中寻找page结构。
swapin_readahead(entry);//预读页面 page = read_swap_cache(entry);//真正得到一个页面,这个页面可能从hash表中寻找到,因为上面预读了。或者自己申请页面,并且从盘上将其内容读进来。read_swap_cache无论从hash表中读取页面,还是自己申请页面,并加入到对应的链表。最后使用计数都是2。
swapin_readahead预读了很多页面,如果没有被进程认领,那么使用计数为1。