struct page是mm中最核心的结构体之一,可以说整个内存管理就是围绕page展开的。不同场景下page的状态各有不同,page->flags标志位是描述page状态的重要成员,定义在include/linux/page-flags.h:
enum pageflags {
PG_locked, /* Page is locked. Don't touch. */
//跟page reclaim的二次机会法有关
PG_referenced,
//page缓存内存和磁盘数据一致
PG_uptodate,
//代表是脏页
PG_dirty,
//page在lru链表中
PG_lru,
//page在active lru链表中
PG_active,
PG_workingset,
PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */
//IO错误
PG_error,
//page是对应的是slab内存
PG_slab,
PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
PG_arch_1,
//不能换出
PG_reserved,
PG_private, /* If pagecache, has fs-private data */
PG_private_2, /* If pagecache, has fs aux data */
//正在回写
PG_writeback, /* Page is under writeback */
PG_head, /* A head page */
PG_mappedtodisk, /* Has blocks allocated on-disk */
//马上开始回收,回收前设置
PG_reclaim, /* To be reclaimed asap */
//匿名页和shmem page设置该条件
PG_swapbacked, /* Page is backed by RAM/swap */
PG_unevictable, /* Page is "unevictable" */
#ifdef CONFIG_MMU
//被mlock了
PG_mlocked, /* Page is vma mlocked */
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
PG_young,
PG_idle,
#endif
};
PG_locked:表示page被lock。上锁之后,其他等待该lock的调用者会睡眠等待,主要用于page的竞态保护,内核提供lock_page/trylock_page/unlock_page等相关函数。
trylock_page:检测PG_locked flag,如果已经设置返回false,否则设置返回true。
lock_page: 先检测是否上锁,如果trylock_page返回false代表已经设置过了,那么调用进__lock_page会睡眠等待,如果未设置过,那么trylock_page直接设置返回。
unlock_page: clear掉PG_locked flags,唤醒等待的进程
PG_dirty:到底哪些类型的页面、在什么场景下会被标记为脏页?
设置:只要页面需要写回磁盘或者交换分区(包括zram压缩方式的page),就会设置PG_dirty,比如要回收的匿名页(在add_to_swap中设置PG_dirty)、shmem页面,或者file-backed的页面。因为如果不设置PG_dirty标志(真正clean的page除外),就无法通过pageout写回磁盘或者交换区(包括zram)。
清除:通常准备写入磁盘或者交换分区前clear,见下面的pageout函数。
/*
 * shrink_page_list() returns the number of reclaimed pages
 *
 * NOTE: this is an abridged excerpt. Every "..." line stands for code that
 * is not shown, so the braces below do not balance as written and several
 * locals/parameters have no visible use.
 *
 * What the visible lines show: @stat is zeroed on entry, the loop runs until
 * @page_list is empty, and for a page that tests PageDirty() the code calls
 * try_to_unmap_flush_dirty() and then switches on the result of
 * pageout(page, mapping). The function returns nr_reclaimed.
 */
static unsigned int shrink_page_list(struct list_head *page_list,
struct pglist_data *pgdat,
struct scan_control *sc,
enum ttu_flags ttu_flags,
struct reclaim_stat *stat,
bool ignore_references)
{
/* Two local list heads; their use is in the omitted code. */
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
unsigned int nr_reclaimed = 0;
unsigned int pgactivate = 0;
/* Start from zeroed statistics for the caller. */
memset(stat, 0, sizeof(*stat));
cond_resched();
/* Keep going until every entry has been taken off page_list. */
while (!list_empty(page_list)) {
...
/* Dirty-page path: this is where PG_dirty leads to a pageout() call. */
if (PageDirty(page)) {
...
try_to_unmap_flush_dirty();
/* The handling of each pageout() result is in the omitted code. */
switch (pageout(page, mapping)) {
...
}
return nr_reclaimed;
}
PG_writeback:表示page正在回写。向swap分区写入和文件系统向磁盘写入时都会设置该标志位,一般通过调用set_page_writeback函数设置:在向块设备层提交(submit)IO之前设置,IO完成后清除。
swap分区写入场景举例:
设置writeback:
/*
 * Abridged excerpt of the swap write path. The "..." line stands for omitted
 * code; bio and ret are not declared in the visible lines, and nothing
 * visible jumps to the out label.
 *
 * The visible lines show the ordering around I/O submission: the page is
 * marked as under writeback, then unlocked, and only then is the bio
 * submitted. The function returns ret.
 */
int __swap_writepage(struct page *page, struct writeback_control *wbc,
bio_end_io_t end_write_func)
{
...
/* Mark the page as being written back before the I/O is submitted. */
set_page_writeback(page);
unlock_page(page);
submit_bio(bio);
out:
return ret;
}
IO完成取消writeback标志:
PG_reclaim:一般在写回磁盘或者交换分区之前设置,回写完成后清除该标志(ClearPageReclaim在end_page_writeback中调用)。不论是普通文件系统把文件回写到磁盘,还是回写交换分区,IO完成后都会回调end_page_writeback;由于回写之前设置了PG_reclaim,回写完成时就会在这里把它清除掉。
设置代码:
/*
 * pageout - try to write one dirty page back through its mapping.
 *
 * shrink_page_list() calls this for each dirty page; the write itself is
 * delegated to mapping->a_ops->writepage().
 *
 * Policy notes carried over from the original comment: a dirty page is only
 * written back if that write will be non-blocking, so the allocation that
 * triggered reclaim is not stalled by pagecache activity (stalls are still
 * possible if get_block() has to run). Swapcache pages are written even if
 * that would block, which gives some throttling.
 *
 * Return:
 *   PAGE_KEEP     - is_page_cache_freeable() rejected the page, the page has
 *                   no mapping and its buffers could not be released, or
 *                   may_write_to_inode() refused the host inode.
 *   PAGE_ACTIVATE - the mapping has no ->writepage, or ->writepage returned
 *                   AOP_WRITEPAGE_ACTIVATE.
 *   PAGE_CLEAN    - an orphaned page had its buffers freed, or
 *                   clear_page_dirty_for_io() returned false.
 *   PAGE_SUCCESS  - ->writepage was called and did not ask for activation.
 */
static pageout_t pageout(struct page *page, struct address_space *mapping)
{
	/* Writeback request passed to ->writepage: reclaim-driven, WB_SYNC_NONE. */
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
		.nr_to_write = SWAP_CLUSTER_MAX,
		.range_start = 0,
		.range_end = LLONG_MAX,
		.for_reclaim = 1,
	};
	int res;

	if (!is_page_cache_freeable(page))
		return PAGE_KEEP;

	if (!mapping) {
		/*
		 * Some data journaling orphaned pages can be dirty with clean
		 * buffers while page->mapping is NULL. If the buffers can be
		 * dropped the page is treated as clean; otherwise keep it.
		 */
		if (!page_has_private(page) || !try_to_free_buffers(page))
			return PAGE_KEEP;

		ClearPageDirty(page);
		pr_info("%s: orphaned page\n", __func__);
		return PAGE_CLEAN;
	}

	if (!mapping->a_ops->writepage)
		return PAGE_ACTIVATE;
	if (!may_write_to_inode(mapping->host))
		return PAGE_KEEP;

	/* Clear the dirty state now that the page is about to be written out. */
	if (!clear_page_dirty_for_io(page))
		return PAGE_CLEAN;

	/* Flag the page as about to be reclaimed before issuing the write. */
	SetPageReclaim(page);
	res = mapping->a_ops->writepage(page, &wbc);
	if (res < 0)
		handle_write_error(mapping, page, res);
	if (res == AOP_WRITEPAGE_ACTIVATE) {
		ClearPageReclaim(page);
		return PAGE_ACTIVATE;
	}

	/* Not under writeback after the call: synchronous write or broken a_ops? */
	if (!PageWriteback(page))
		ClearPageReclaim(page);

	trace_mm_vmscan_writepage(page);
	inc_node_page_state(page, NR_VMSCAN_WRITE);
	return PAGE_SUCCESS;
}