该值初始化函数:mm/page_alloc.c:init_per_zone_wmark_min->refresh_zone_stat_thresholds()
常规条件下zone区域的free_pages获取方式如下所示:
free_pages = atomic_long_read(&zone->vm_stat[NR_FREE_PAGES]);
若free_pages < zone->percpu_drift_mark,则该zone的free_pages需要使用更加精确的获取方式(主要通过zone_page_state_snapshot函数实现):
free_pages = atomic_long_read(&zone->vm_stat[NR_FREE_PAGES]);
for_each_online_cpu(cpu)
free_pages += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[NR_FREE_PAGES];
//mm/vmscan.c
/* Work out how many page cache pages we can reclaim in this reclaim_mode */
static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
{
	unsigned long nr_pagecache_reclaimable;
	unsigned long delta = 0;

	/*
	 * If RECLAIM_UNMAP is set, then all file pages are considered
	 * potentially reclaimable. Otherwise, we have to worry about
	 * pages like swapcache and node_unmapped_file_pages() provides
	 * a better estimate
	 */
	if (node_reclaim_mode & RECLAIM_UNMAP)
		/* reclaim may unmap: every page cache (file) page on the node counts */
		nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
	else
		/* reclaim cannot unmap: count only the unmapped page cache (file) pages */
		nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);

	/* If we can't clean pages, remove dirty pages from consideration */
	/* dirty file pages on this node cannot be written back in this mode */
	if (!(node_reclaim_mode & RECLAIM_WRITE))
		delta += node_page_state(pgdat, NR_FILE_DIRTY);

	/* Watch for any possible underflows due to delta */
	if (unlikely(delta > nr_pagecache_reclaimable))
		delta = nr_pagecache_reclaimable;

	return nr_pagecache_reclaimable - delta;
}
在linux内核中,我们可以利用node_page_state函数和enum node_stat_item枚举类型来获取指定内存节点中很多的内存相关的数据:
//pgdat内存节点中文件页总数
node_page_state(pgdat, NR_FILE_PAGES);
//pgdat内存节点上处于不活跃文件页lru链表中页总数
node_page_state(pgdat, NR_INACTIVE_FILE)
//pgdat内存节点中脏文件页总数
node_page_state(pgdat, NR_FILE_DIRTY)
相关源码
//mm/vmstat.c
/*
 * Read the node-wide value of a single statistics counter.
 */
unsigned long node_page_state(struct pglist_data *pgdat,
			      enum node_stat_item item)
{
	long value = atomic_long_read(&pgdat->vm_stat[item]);

#ifdef CONFIG_SMP
	/*
	 * Unfolded per-cpu deltas can transiently drive the global
	 * counter below zero; never report a negative value.
	 */
	if (value < 0)
		value = 0;
#endif
	return value;
}
//include/linux/mmzone.h
/*
 * Per-node statistics counters, used as indices into
 * pglist_data->vm_stat[] and read via node_page_state().
 * NOTE: the first LRU entries must keep the order of the LRU lists.
 */
enum node_stat_item {
	NR_LRU_BASE,
	NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
	NR_ACTIVE_ANON,		/*  "     "     "   "       "         */
	NR_INACTIVE_FILE,	/*  "     "     "   "       "         */
	NR_ACTIVE_FILE,		/*  "     "     "   "       "         */
	NR_UNEVICTABLE,		/*  "     "     "   "       "         */
	NR_ISOLATED_ANON,	/* Temporary isolated pages from anon lru */
	NR_ISOLATED_FILE,	/* Temporary isolated pages from file lru */
	NR_PAGES_SCANNED,	/* pages scanned since last reclaim */
	WORKINGSET_REFAULT,
	WORKINGSET_ACTIVATE,
	WORKINGSET_NODERECLAIM,
	NR_ANON_MAPPED,		/* Mapped anonymous pages */
	NR_FILE_MAPPED,		/* pagecache pages mapped into pagetables.
				   only modified from process context */
	NR_FILE_PAGES,
#ifdef CONFIG_COUNT_AVAILABLE_MEMORY
	NR_FILE_RAMFS,		/* pages for ramfs */
	NR_FILE_TMPFS,		/* pages for tmpfs */
#endif
	NR_FILE_DIRTY,
	NR_WRITEBACK,
	NR_WRITEBACK_TEMP,	/* Writeback using temporary buffers */
	NR_SHMEM,		/* shmem pages (included tmpfs/GEM pages) */
	NR_SHMEM_THPS,
	NR_SHMEM_PMDMAPPED,
	NR_ANON_THPS,
	NR_UNSTABLE_NFS,	/* NFS unstable pages */
	NR_VMSCAN_WRITE,
	NR_VMSCAN_IMMEDIATE,	/* Prioritise for reclaim when writeback ends */
	NR_DIRTIED,		/* page dirtyings since bootup */
	NR_WRITTEN,		/* page writings since bootup */
	NR_VM_NODE_STAT_ITEMS
};
https://www.linuxidc.com/Linux/2013-06/85344.htm
//mm/percpu.c
/**
 * __alloc_percpu - allocate dynamic percpu area
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 *
 * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
 */
void __percpu *__alloc_percpu(size_t size, size_t align)
{
	/* GFP_KERNEL: a non-atomic allocation that may sleep */
	return pcpu_alloc(size, align, false, GFP_KERNEL);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);
/**
* pcpu_alloc - the percpu allocator
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
* @reserved: allocate from the reserved chunk if available
* @gfp: allocation flags
*
* Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't
* contain %GFP_KERNEL, the allocation is atomic.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
gfp_t gfp)
arm64 Linux内核内存伙伴系统—相关知识点汇总
https://www.cnblogs.com/aspirs/p/13899609.html
PageAnon实现代码如下,判断物理页对应描述符struct page中的mapping成员的第0位是否为1,若是则PageAnon函数返回true.
//include/linux/page-flags.h
/* Is this page anonymous? Bit 0 of ->mapping flags an anon_vma pointer. */
static __always_inline int PageAnon(struct page *page)
{
	struct page *head = compound_head(page);

	return ((unsigned long)head->mapping & PAGE_MAPPING_ANON) != 0;
}
/*
 * Map any page to the head page of its compound page (identity for
 * non-tail pages). A tail page has bit 0 of ->compound_head set and
 * the remaining bits point at the head page.
 */
static inline struct page *compound_head(struct page *page)
{
	unsigned long head = READ_ONCE(page->compound_head);

	return (head & 1) ? (struct page *)(head - 1) : page;
}
2.PageSwapBacked(page):函数会检查页对应描述符struct page的flag成员的PG_swapbacked位是否设置为1,为1则返回 true
3.PageSwapCache(page):则是在page为匿名页的情况下判断该页是否分配了swap空间。往往在内存回收时若匿名页没有分配
swap空间则会通过add_to_swap()函数为其分配交换空间,分配后PageSwapCache(page)会返回true。
内核总会出现如下几种状态的页:
PageAnon(page)&&PageSwapBacked(page)为true:该page对应一个标准的匿名页.page->mapping第0位为1,且
page->mapping指向一个anon_vma数据结构.
!PageAnon(page)&&PageSwapBacked(page)返回true:该页page->mapping的第0位为0,但page->flags的PG_swapbacked位为1.此时
page->mapping指向的是一个struct address_space*数据结构(一个inode->i_mapping).shmem共享页就处于上述状态.
PageAnon(page)&&!PageSwapCache(page)返回true:表示该匿名页未被分配交换空间,在进行页面回收扫描不活跃匿名页lru链表的过
程中遇到此种状态的页会通过add_to_swap()函数为其分配交换空间.
PageAnon(page)&&!PageSwapBacked(page)返回true:该页是一处于临时状态的匿名页,在进行页面回收扫描不活跃匿名页lru链表的过程
中shrink_page_list函数在最后会将page的PG_swapbacked标志清除,清除后该匿名页立马能够被释放回收.
进一步了解可参考博文:你是什么内存: PageAnon 与 PageSwapBacked
PG_locked用于设置页面锁,有两个函数用于申请页面锁:lock_page()和trylock_page()。
/*
 * Atomically try to acquire (set) the wait bit. If the bit is already
 * set, the caller is put to sleep on the wait queue until woken via
 * wake_up_bit(), and then retries. Returns 0 once the bit has been
 * taken, or the non-zero result of @action on error/interruption.
 *
 * NOTE(review): the original snippet had the stray text
 * "wait_on_bit_lock()" pasted into the parameter list, which is not
 * valid C; the signature below is the corrected form.
 */
int __sched
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
		   wait_bit_action_f *action, unsigned mode)
{
	do {
		int ret;

		prepare_to_wait_exclusive(wq, &q->wait, mode);
		/* Bit already clear? Try to grab it in the loop condition. */
		if (!test_bit(q->key.bit_nr, q->key.flags))
			continue;
		/* Bit is held by someone else: sleep (e.g. bit_wait_io). */
		ret = action(&q->key);
		if (!ret)
			continue;
		/* action() failed (e.g. signal): bail out of the wait. */
		abort_exclusive_wait(wq, &q->wait, mode, &q->key);
		return ret;
	} while (test_and_set_bit(q->key.bit_nr, q->key.flags));
	finish_wait(wq, &q->wait);
	return 0;
}
/* Slow path of lock_page(): sleep until PG_locked can be taken. */
void __lock_page(struct page *page)
{
	/* Declare which bit of page->flags we wait on: PG_locked. */
	DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);

	__wait_on_bit_lock(page_waitqueue(page), &wait, bit_wait_io,
			   TASK_UNINTERRUPTIBLE);
}
/*
 * lock_page may only be called if we have the page's inode pinned.
 */
static inline void lock_page(struct page *page)
{
	might_sleep();
	/*
	 * If PG_locked is already set in page->flags, fall back to
	 * __lock_page() and wait for the current holder to release it.
	 */
	if (!trylock_page(page))
		__lock_page(page);
}
#define test_and_set_bit_lock(nr, addr) test_and_set_bit(nr, addr)

/* Try to take PG_locked without blocking; non-zero on success. */
static inline int trylock_page(struct page *page)
{
	/*
	 * Atomically set PG_locked in page->flags and return the old
	 * value of the bit; this never waits.
	 */
	return (likely(!test_and_set_bit_lock(PG_locked, &page->flags)));
}
本文翻译:https://blog.csdn.net/hbcbgcx/article/details/88356449
_count(高版本_refcount)表示内核中引用该页面的次数:
内核通过get_page()和put_page()操作page的引用计数_count。
/* Take one additional reference on @page. */
static inline void get_page(struct page *page)
{
	/* Tail pages of a compound page are pinned via the head page. */
	if (unlikely(PageTail(page)))
		if (likely(__get_page_tail(page)))
			return;

	/*
	 * Getting a normal page or the head of a compound page
	 * requires to already have an elevated page->_count.
	 */
	/*
	 * _count must be > 0 here: the buddy allocator hands pages out
	 * with an initial refcount of 1.
	 */
	VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page);
	/* Atomically bump the reference count. */
	atomic_inc(&page->_count);
}
/* Drop one reference; returns true if _count reached zero. */
static inline int put_page_testzero(struct page *page)
{
	/* _count == 0 would mean this page has already been freed. */
	VM_BUG_ON_PAGE(atomic_read(&page->_count) == 0, page);
	return atomic_dec_and_test(&page->_count);
}
/* Release one reference on @page, freeing it on the last drop. */
void put_page(struct page *page)
{
	if (unlikely(PageCompound(page)))
		put_compound_page(page);
	else if (put_page_testzero(page))	/* _count hit zero: free it */
		__put_single_page(page);	/* release the page */
}
内核还有一对常用宏来操作页的引用计数_count.
#define page_cache_get(page) get_page(page)
#define page_cache_release(page) put_page(page)
内核中用引用计数_count跟踪内核page的使用情况:
分配页面时_count引用计数会变成1
//分配页面函数alloc_pages()刚从伙伴系统拿到页面时_count为0,随后由set_page_refcounted()将其设置为1
/*
 * Turn a non-refcounted page (->_count == 0) into refcounted with
 * a count of one.
 */
static inline void set_page_refcounted(struct page *page)
{
	/* Must be a head page with no existing references. */
	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(atomic_read(&page->_count), page);
	set_page_count(page, 1);
}
加入LRU链表时,page会被kswapd内核线程使用,因此_count引用计数会加1
/*
*以malloc()为用户程序分配内存为例,发生缺页中断后do_anonymous_page()函数成功分配出来一个页面,在设置硬
*件PTE之前,调用lru_cache_add()函数把这个匿名页面添加到LRU链表中,在这个过程中,使用page_cache_get()宏
*来增加_count引用计数。
*/
/*
 * Queue @page on the per-cpu pagevec for addition to the LRU lists.
 *
 * NOTE(review): the original snippet had an annotation
 * ("---------------------增加计数") fused onto the page_cache_get()
 * statement, which is not valid C; it is restored as a comment here.
 */
static void __lru_cache_add(struct page *page)
{
	struct pagevec *pvec = &get_cpu_var(lru_add_pvec);

	/* The LRU holds its own reference: bump _count. */
	page_cache_get(page);
	if (!pagevec_space(pvec))
		__pagevec_lru_add(pvec);
	pagevec_add(pvec, page);
	put_cpu_var(lru_add_pvec);
}
被映射到其他用户进程pte时,_count引用计数会加1
1.子进程在被创建时共享父进程地址空间,设置父进程的pte页表项内容到子进程中并增加该页面的_count计数
页面的private中私有数据
1.对于PG_swapable页面,__add_to_swap_cache函数会增加_count引用计数。
2.对于PG_private页面,主要在block模块的buffer_head中引用。
内核对页面进行操作等关键路径上也会使_count引用计数加1
_mapcount引用计数表示这个页面被进程映射的个数,即已经映射了多少个用户pte页表。每个用户进程地址空间都有一份独立的页表,有可能出现多个用户进程地址空间同时映射到一个物理页面的情况,RMAP反向映射系统就是利用这个特性来实现的。_mapcount引用计数主要用于RMAP反向映射系统中。
_mapcount == -1:表示没有pte映射到页面中。
_mapcount == 0:表示只有父进程映射了页面。匿名页面刚分配时,_mapcount引用计数初始化为0.
/* Set up reverse mapping for a brand-new anonymous page. */
void page_add_new_anon_rmap(struct page *page,
	struct vm_area_struct *vma, unsigned long address)
{
	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
	SetPageSwapBacked(page);
	/* First mapping: move _mapcount from its initial -1 up to 0. */
	atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
	if (PageTransHuge(page))
		__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
	__mod_zone_page_state(page_zone(page), NR_ANON_PAGES,
			      hpage_nr_pages(page));
	__page_set_anon_rmap(page, vma, address, 1);
}
_mapcount > 0:表示除了父进程外还有其他进程映射了这个页面
/* Excerpt: fork-time copy of one pte from parent to child (elided). */
static inline unsigned long
copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
	     pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
	     unsigned long addr, int *rss)
{
	...
	page = vm_normal_page(vma, addr, pte);
	if (page) {
		/* The child takes a reference: bump _count. */
		get_page(page);
		/* Another pte now maps this page: bump _mapcount. */
		page_dup_rmap(page);
		if (PageAnon(page))
			rss[MM_ANONPAGES]++;
		else
			rss[MM_FILEPAGES]++;
	}
	...
}
//mm/vmscan.c
/* Check if a page is dirty or under writeback */
static void page_check_dirty_writeback(struct page *page,
				       bool *dirty, bool *writeback)
{
	struct address_space *aspace;

	/*
	 * Anonymous pages are not handled by flushers and must be written
	 * from reclaim context. Do not stall reclaim based on them
	 */
	if (!page_is_file_cache(page)) {
		*dirty = false;
		*writeback = false;
		return;
	}

	/* Start from the page flags, which are usually accurate. */
	*dirty = PageDirty(page);
	*writeback = PageWriteback(page);

	/* Let the filesystem refine the answer when it supports that. */
	if (!page_has_private(page))
		return;

	aspace = page_mapping(page);
	if (aspace && aspace->a_ops->is_dirty_writeback)
		aspace->a_ops->is_dirty_writeback(page, dirty, writeback);
}
// include/linux/pagemap.h
static inline void wait_on_page_writeback(struct page *page)
/*
 *1.page从buddy系统中分配出来的时候,count为1.但是在加入page cache后,count为3.因为加入page cache时,也会把
 * page一并加入zone的lru(add_to_page_cache_lru).这个时候_count是3.如果加上buffer head后,应该是4.
 * page_count(page) - page_has_private(page) 可以理解为:4-1,3-0和2-0:
 * a.4-1是指,page在page cache中,又在lru中,并且page的private上有mapping的私有数据.这种情况下不能回收.
 * b.3-0是指,page在page cache中,又在lru中,并且page的private上没有mapping的私有数据.这种情况下不能回收.
 * c.2-0是指,page只在lru中,已经从page cache中release,其private上没有mapping的私有数据.这种情况可以回收.
 * 所以page_count(page) - page_has_private(page) == 2为true表示可以回收.
 *2.两个!!的作用是把数值归一化为0或1.
 */
/* Can this page cache page be freed by reclaim right now? */
static inline int is_page_cache_freeable(struct page *page)
{
	/*
	 * A freeable page cache page is referenced only by the caller
	 * that isolated it, the page cache radix tree, and an optional
	 * set of buffer heads at page->private.
	 */
	int expected_refs = 2 + page_has_private(page);

	return page_count(page) == expected_refs;
}
PageBuddy(page):
1.True:在伙伴系统中
2.通过page->_mapcount = PAGE_BUDDY_MAPCOUNT_VALUE判定
order = page_order_unsafe(page)
#define page_order_unsafe(page) READ_ONCE(page_private(page))
#define page_private(page) ((page)->private)
1.概念判定
2.源码判定
struct page {
……
atomic_t _mapcount;
union {
……
struct {
……
struct address_space *mapping;
};
页描述符struct page中的struct address_space * mapping成员用于区分匿名页面和基于文件映射的页面:
两个访问位用于在页面回收机制扫描链表中的页时,判断该页最近是否被经常访问。