Linux内核源代码情景分析-内存管理之用户页面的换入

    在下面几种情况下会发生,页面出错异常(也叫缺页中断):

    1、相应的页面目录项或者页面表项为空,也就是该线性地址与物理地址的映射关系尚未建立,或者已经撤销。

    2、相应的物理页面不在内存中。 本文讨论的就是这种情况。 

    3、指令中规定的访问方式与页面的权限不符,例如企图写一个“只读”的页面。


    假设已经建立好了映射,但是页表项最后一位P为0,表示页面不在内存中;整个页表项如下图,offset表示页面在一个磁盘设备的位置,也就是磁盘设备的逻辑页面号;而type则是指该页面在哪一个磁盘设备中。


                                 图 1  页面交换项结构


    这里假定CPU的运行已经到达了页面异常服务程序的主体do_page_fault()的入口处。

    代码如下: arch/i386/mm/fault.c

asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	unsigned long page;
	unsigned long fixup;
	int write;
	siginfo_t info;

	/* get the address */
	__asm__("movl %%cr2,%0":"=r" (address));//把映射的失败的地址保存在address中

	tsk = current;//task_struct

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 */
	if (address >= TASK_SIZE)
		goto vmalloc_fault;

	mm = tsk->mm;//mm_struct
	info.si_code = SEGV_MAPERR;

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (in_interrupt() || !mm)
		goto no_context;

	down(&mm->mmap_sem);

	vma = find_vma(mm, address);//找出结束地址大于给定地址的第一个区间。
	if (!vma)//没有找到,说明没有一个区间的结束地址高于给定的地址,参考上图,说明这个地址是在堆栈之下,也就是3G字节以上了。
		goto bad_area;
	if (vma->vm_start <= address)//起始地址不高于address,说明映射已经建立,转到good_area去进一步检查失败原因。
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	....
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
	switch (error_code & 3) {// 110 & 011 = 2
		default:	/* 3: write, present */
#ifdef TEST_VERIFY_AREA
			if (regs->cs == KERNEL_CS)
				printk("WP fault at %08lx\n", regs->eip);
#endif
			/* fall through */
		case 2:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
			write++;//执行到这里
			break;
		case 1:		/* read, present */
			goto bad_area;
		case 0:		/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
				goto bad_area;
	}

	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case 1:
		tsk->min_flt++;
		break;
	case 2:
		tsk->maj_flt++;
		break;
	case 0:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (regs->eflags & VM_MASK) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
	up(&mm->mmap_sem);
	return;
        .......
}

    内核的中断/异常响应机制还传过来两个参数。一个是pt_regs结构指针regs,它指向例外发生前夕CPU中各寄存器内容的一份副本。而error_code则进一步指明映射失败的具体原因。

    error_code:

    bit 0 == 0 means no page found, 1 means protection fault 

    bit 1 == 0 means read, 1 means write 

    bit 2 == 0 means kernel, 1 means user-mode 此时,error_code为110,用户态,页面不在内存中,写。


    handle_mm_fault函数,代码如下:

int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
	unsigned long address, int write_access)
{
	int ret = -1;
	pgd_t *pgd;
	pmd_t *pmd;


	pgd = pgd_offset(mm, address);//返回页面表项指针
	pmd = pmd_alloc(pgd, address);//中转了一下,还是页目录表项指针


	if (pmd) {
		pte_t * pte = pte_alloc(pmd, address);//返回指向页表项的指针
		if (pte)
			ret = handle_pte_fault(mm, vma, address, write_access, pte);
	}
	return ret;
}

    handle_pte_fault函数,如下:

static inline int handle_pte_fault(struct mm_struct *mm,
	struct vm_area_struct * vma, unsigned long address,
	int write_access, pte_t * pte)
{
	pte_t entry;

	/*
	 * We need the page table lock to synchronize with kswapd
	 * and the SMP-safe atomic PTE updates.
	 */
	spin_lock(&mm->page_table_lock);
	entry = *pte;//页表项中内容
	if (!pte_present(entry)) {//页面不在内存中
		/*
		 * If it truly wasn't present, we know that kswapd
		 * and the PTE updates will not touch it later. So
		 * drop the lock.
		 */
		spin_unlock(&mm->page_table_lock);
		if (pte_none(entry))//页表项不为空
			return do_no_page(mm, vma, address, write_access, pte);
		return do_swap_page(mm, vma, address, pte, pte_to_swp_entry(entry), write_access);//执行到这里
	}

	if (write_access) {
		if (!pte_write(entry))
			return do_wp_page(mm, vma, address, pte, entry);

		entry = pte_mkdirty(entry);
	}
	entry = pte_mkyoung(entry);
	establish_pte(vma, address, pte, entry);
	spin_unlock(&mm->page_table_lock);
	return 1;
}

    do_swap_page函数,如下:

static int do_swap_page(struct mm_struct * mm,
	struct vm_area_struct * vma, unsigned long address,
	pte_t * page_table, swp_entry_t entry, int write_access)
{
	struct page *page = lookup_swap_cache(entry);//从hash表中寻找
	pte_t pte;

	if (!page) {
		lock_kernel();
		swapin_readahead(entry);//预读页面
		page = read_swap_cache(entry);//真正得到一个页面,这个页面可能从hash表中寻找到,因为上面预读了。或者自己申请页面,并且从盘上将其内容读进来。
		unlock_kernel();
		if (!page)
			return -1;

		flush_page_to_ram(page);
		flush_icache_page(vma, page);
	}

	mm->rss++;

	pte = mk_pte(page, vma->vm_page_prot);//形成页表项

	/*
	 * Freeze the "shared"ness of the page, ie page_count + swap_count.
	 * Must lock page before transferring our swap count to already
	 * obtained page count.
	 */
	lock_page(page);
	swap_free(entry);
	if (write_access && !is_page_shared(page))
		pte = pte_mkwrite(pte_mkdirty(pte));//页表项赋予已写过对应的物理页,可进行读、写或者执行
	UnlockPage(page);

	set_pte(page_table, pte);//页表项(属性刚才已经设置了)指向对应的页面
	/* No need to invalidate - it was non-present before */
	update_mmu_cache(vma, address, pte);
	return 1;	/* Minor fault */
}


    一、下面分别解释各个函数。首先解释swapin_readahead函数,如下:

void swapin_readahead(swp_entry_t entry)
{
	int i, num;
	struct page *new_page;
	unsigned long offset;

	/*
	 * Get the number of handles we should do readahead io to. Also,
	 * grab temporary references on them, releasing them as io completes.
	 */
	num = valid_swaphandles(entry, &offset);
	for (i = 0; i < num; offset++, i++) {
		......
		new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset), 0);
		if (new_page != NULL)
			page_cache_release(new_page);//page使用计数减1
		swap_free(SWP_ENTRY(SWP_TYPE(entry), offset));
	}
	return;
}

    提前预读相邻的盘面,根据下面的描述,__get_free_page,page使用计数为1,add_to_swap_cache,page使用计数再加1;此时page_cache_release,page使用计数又变成了1。直到有进程认领,才变成2。


    read_swap_cache_async函数,如下:

struct page * read_swap_cache_async(swp_entry_t entry, int wait)
{
	struct page *found_page = 0, *new_page;
	unsigned long new_page_addr;
	
	/*
	 * Make sure the swap entry is still in use.
	 */
	if (!swap_duplicate(entry))	/* Account for the swap cache */
		goto out;
	/*
	 * Look for the page in the swap cache.
	 */
	found_page = lookup_swap_cache(entry);//假设没有找到
	if (found_page)
		goto out_free_swap;

	new_page_addr = __get_free_page(GFP_USER);//刚申请的page结构,使用计数为1
	if (!new_page_addr)
		goto out_free_swap;	/* Out of memory */
	new_page = virt_to_page(new_page_addr);//转化成对应的page结构指针

	/*
	 * Check the swap cache again, in case we stalled above.
	 */
	found_page = lookup_swap_cache(entry);//假设没有找到
	if (found_page)
		goto out_free_page;
	/* 
	 * Add it to the swap cache and read its contents.
	 */
	lock_page(new_page);
	add_to_swap_cache(new_page, entry);//加入到对应的链表上
	rw_swap_page(READ, new_page, wait);//真正的把磁盘上的数据读到新申请的page上,等待块设备驱动一章再来看
	return new_page;

out_free_page:
	page_cache_release(new_page);
out_free_swap:
	swap_free(entry);
out:
	return found_page;
}

    add_to_swap_cache函数是重点,代码如下:

void add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	unsigned long flags;

#ifdef SWAP_CACHE_INFO
	swap_cache_add_total++;
#endif
	if (!PageLocked(page))
		BUG();
	if (PageTestandSetSwapCache(page))
		BUG();
	if (page->mapping)
		BUG();
	flags = page->flags & ~((1 << PG_error) | (1 << PG_arch_1));
	page->flags = flags | (1 << PG_uptodate);
	add_to_page_cache_locked(page, &swapper_space, entry.val);
}


    add_to_page_cache_locked函数,代码如下:

void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
{
	if (!PageLocked(page))
		BUG();

	page_cache_get(page);//增加了使用计数,现在使用计数为2
	spin_lock(&pagecache_lock);
	page->index = index;//index存着页面交换项
	add_page_to_inode_queue(mapping, page);//page->list链入mapping->clean_pages
	add_page_to_hash_queue(page, page_hash(mapping, index));//page->next_hash和page->pprev_hash链入全局的Hash表
	lru_cache_add(page);//page->lru链入了全局的active_list
	spin_unlock(&pagecache_lock);
}


    add_page_to_inode_queue函数,代码如下:

static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
{
	struct list_head *head = &mapping->clean_pages;

	mapping->nrpages++;
	list_add(&page->list, head);//page->list链入mapping->clean_pages
	page->mapping = mapping;//mapping指向了swapper_space
}

struct address_space swapper_space = {
	LIST_HEAD_INIT(swapper_space.clean_pages),
	LIST_HEAD_INIT(swapper_space.dirty_pages),
	LIST_HEAD_INIT(swapper_space.locked_pages),
	0,				/* nrpages	*/
	&swap_aops,
};

    add_page_to_hash_queue函数,如下:

static void add_page_to_hash_queue(struct page * page, struct page **p)
{
	struct page *next = *p;//page->next_hash和page->pprev_hash链入全局的Hash表

	*p = page;
	page->next_hash = next;
	page->pprev_hash = p;
	if (next)
		next->pprev_hash = &page->next_hash;
	if (page->buffers)
		PAGE_BUG(page);
	atomic_inc(&page_cache_size);
}


    lru_cache_add函数,如下:

void lru_cache_add(struct page * page)
{
	spin_lock(&pagemap_lru_lock);
	if (!PageLocked(page))
		BUG();
	DEBUG_ADD_PAGE
	add_page_to_active_list(page);
	/* This should be relatively rare */
	if (!page->age)
		deactivate_page_nolock(page);
	spin_unlock(&pagemap_lru_lock);
}


    add_page_to_active_list函数,如下:
#define add_page_to_active_list(page) { \
	DEBUG_ADD_PAGE \
	ZERO_PAGE_BUG \
	SetPageActive(page); \
	list_add(&(page)->lru, &active_list); \ //page->lru链入了全局的active_list
	nr_active_pages++; \ //全局的nr_active_pages加1
}

    二、下面解释read_swap_cache函数,如下:

#define read_swap_cache(entry) read_swap_cache_async(entry, 1);
    还是调用read_swap_cache_async函数,只是本次执行,很可能从lookup_swap_cache函数,找到了page。
struct page * read_swap_cache_async(swp_entry_t entry, int wait)
{
	struct page *found_page = 0, *new_page;
	unsigned long new_page_addr;
	
	/*
	 * Make sure the swap entry is still in use.
	 */
	if (!swap_duplicate(entry))	/* Account for the swap cache */
		goto out;
	/*
	 * Look for the page in the swap cache.
	 */
	found_page = lookup_swap_cache(entry);//假设在hash表中找到对应的page,有进程认领了,使用计数为2
	if (found_page)
		goto out_free_swap;

	new_page_addr = __get_free_page(GFP_USER);
	if (!new_page_addr)
		goto out_free_swap;	/* Out of memory */
	new_page = virt_to_page(new_page_addr);

	/*
	 * Check the swap cache again, in case we stalled above.
	 */
	found_page = lookup_swap_cache(entry);//有可能__get_free_page,没有足够的可分配的页面,切换到其他进程了,再切回来时,在Hash表中再寻找一遍
	if (found_page)
		goto out_free_page;
	/* 
	 * Add it to the swap cache and read its contents.
	 */
	lock_page(new_page);
	add_to_swap_cache(new_page, entry);
	rw_swap_page(READ, new_page, wait);
	return new_page;

out_free_page:
	page_cache_release(new_page);
out_free_swap:
	swap_free(entry);
out:
	return found_page;
}

    

    三、lookup_swap_cache函数,如下:

struct page * lookup_swap_cache(swp_entry_t entry)
{
	struct page *found;

#ifdef SWAP_CACHE_INFO
	swap_cache_find_total++;
#endif
	while (1) {
		/*
		 * Right now the pagecache is 32-bit only.  But it's a 32 bit index. =)
		 */
repeat:
		found = find_lock_page(&swapper_space, entry.val);//entry.val为页面交换项
		if (!found)
			return 0;
		/*
		 * Though the "found" page was in the swap cache an instant
		 * earlier, it might have been removed by refill_inactive etc.
		 * Re search ... Since find_lock_page grabs a reference on
		 * the page, it can not be reused for anything else, namely
		 * it can not be associated with another swaphandle, so it
		 * is enough to check whether the page is still in the scache.
		 */
		if (!PageSwapCache(found)) {
			UnlockPage(found);
			page_cache_release(found);
			goto repeat;
		}
		if (found->mapping != &swapper_space)
			goto out_bad;
#ifdef SWAP_CACHE_INFO
		swap_cache_find_success++;
#endif
		UnlockPage(found);
		return found;
}

    find_lock_page函数,如下:

#define find_lock_page(mapping, index) \
		__find_lock_page(mapping, index, page_hash(mapping, index))

    __find_lock_page函数,如下:

struct page * __find_lock_page (struct address_space *mapping,
				unsigned long offset, struct page **hash)
{
	struct page *page;

	/*
	 * We scan the hash list read-only. Addition to and removal from
	 * the hash-list needs a held write-lock.
	 */
repeat:
	spin_lock(&pagecache_lock);
	page = __find_page_nolock(mapping, offset, *hash);//得到了hash表的其中一个链表的头
	if (page) {
		page_cache_get(page);//增加使用计数
		spin_unlock(&pagecache_lock);

		lock_page(page);

		/* Is the page still hashed? Ok, good.. */
		if (page->mapping)
			return page;

		/* Nope: we raced. Release and try again.. */
		UnlockPage(page);
		page_cache_release(page);
		goto repeat;
	}
	spin_unlock(&pagecache_lock);
	return NULL;
}

    __find_page_nolock函数,如下:

static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
{
	goto inside;

	for (;;) {
		page = page->next_hash;//从hash表中寻找
inside:
		if (!page)
			goto not_found;
		if (page->mapping != mapping)
			continue;
		if (page->index == offset)
			break;
	}
	/*
	 * Touching the page may move it to the active list.
	 * If we end up with too few inactive pages, we wake
	 * up kswapd.
	 */
	age_page_up(page);
	if (inactive_shortage() > inactive_target / 2 && free_shortage())
			wakeup_kswapd(0);
not_found:
	return page;
}

    根据页面交换项,在hash表中寻找page结构。


      swapin_readahead(entry);//预读页面
      page = read_swap_cache(entry);//真正得到一个页面,这个页面可能从hash表中寻找到,因为上面预读了。或者自己申请页面,并且从盘上将其内容读进来。
    read_swap_cache无论从hash表中读取页面,还是自己申请页面,并加入到对应的链表。最后使用计数都是2。

    swapin_readahead预读了很多页面,如果没有被进程认领,那么使用计数为1。

你可能感兴趣的:(Linux内核源代码情景分析-内存管理之用户页面的换入)