[摘要]
[正文]重要数据结构介绍
[正文]重要资源初始化(包括pgdat、struct zone等)
[正文]内存回收时机
[正文]内存回收过程
[总结]
[其他]
注意:请使用谷歌浏览器阅读(IE浏览器排版混乱)
【摘要】
本文将介绍linux内存回收机制.
【正文】重要数据结构介绍
1 membank/meminfo
meminfo描述了整个系统的内存信息,meminfo中可以包含NR_BANKS个membank;
struct meminfo {
int nr_banks;
struct membank bank[NR_BANKS];//NR_BANKS=8
};
每个membank都是通过arm_add_memory()初始化的:setup_arch()->setup_machine_fdt()->arm_add_memory();
struct membank {
phys_addr_t start;
phys_addr_t size;
unsigned int highmem;
};
2 memblock/memblock_type
struct memblock_region {
phys_addr_t base;
phys_addr_t size;
};
struct memblock_type {
unsigned long cnt;/* number of regions */
unsigned long max;/* size of the allocated array */
phys_addr_t total_size;/* size of all regions */
struct memblock_region *regions;
};
struct memblock {
phys_addr_t current_limit;
struct memblock_type memory;
struct memblock_type reserved;
};
初始化:arm_memblock_init->memblock_add->memblock_add_region-> type->regions[0].base =mi->bank[i].startvoid __init arm_memblock_init(struct meminfo *mi, struct machine_desc *mdesc)
{
int i;
for (i = 0; i < mi->nr_banks; i++)
memblock_add(mi->bank[i].start, mi->bank[i].size);
/* Register the kernel text, kernel data and initrd with memblock. */
#ifdef CONFIG_XIP_KERNEL
memblock_reserve(__pa(_sdata), _end - _sdata);
#else
memblock_reserve(__pa(_stext), _end - _stext);
#endif
arm_mm_memblock_reserve();
arm_dt_memblock_reserve();
/* reserve any platform specific memblock areas */
if (mdesc->reserve)
mdesc->reserve();
/*
* reserve memory for DMA contigouos allocations,
* must come from DMA area inside low memory
*/
dma_contiguous_reserve(min(arm_dma_limit, arm_lowmem_limit));
arm_memblock_steal_permitted = false;
memblock_allow_resize();
/*
memblock_dump(&memblock.memory, "memory");
memblock_dump(&memblock.reserved, "reserved");
*/
memblock_dump_all();
}
3 pg_data_t *pgdat=NODE_DATA(0);可以通过NODE_DATA(0)找到ZONE_NORMAL内存区,进而获取这个内存区信息,
如:可用内存大小,zone->managed_pages
typedef struct pglist_data {
struct zone node_zones[MAX_NR_ZONES];
struct zonelist node_zonelists[MAX_ZONELISTS];
int nr_zones;
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
struct page *node_mem_map;
#endif
unsigned long node_start_pfn;
unsigned long node_present_pages; /* total number of physical pages */
unsigned long node_spanned_pages; /* total size of physical page
range, including holes */
int node_id;
nodemask_t reclaim_nodes;/* Nodes allowed to reclaim from */
wait_queue_head_t kswapd_wait;
wait_queue_head_t pfmemalloc_wait;
struct task_struct *kswapd;/* Protected by lock_memory_hotplug() */
int kswapd_max_order;
enum zone_type classzone_idx;
} pg_data_t;
pg_data_t *pgdat=NODE_DATA(0);enum lru_list {
LRU_INACTIVE_ANON = LRU_BASE,
LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
LRU_UNEVICTABLE,
NR_LRU_LISTS
};
【正文】重要资源初始化(包括pgdat、struct zone等)
1 meminfo与membank初始化
系统启动过程中,根据bootargs参数初始化meminfo,membank结构:
bootargs = console=ttyS0,115200 mem=110M root=/dev/mtdblock1 rootfstype=squashfs init=/linuxrc
举例:fdt中指定系统内存起始地址是0x200000,系统内存大小是110M=0x6e00000;
注意:atags或fdt参数都对系统内存的起始地址和内存大小进行了配置,但系统内存的大小最后以bootargs中的mem参数为准.
1.1 初始化membank的方法:
int __init arm_add_memory(phys_addr_t start, phys_addr_t size)
{
struct membank *bank = &meminfo.bank[meminfo.nr_banks];
u64 aligned_start;
if (meminfo.nr_banks >= NR_BANKS) {
printk(KERN_CRIT "NR_BANKS too low, "
"ignoring memory at 0x%08llx\n", (long long)start);
return -EINVAL;
}
/*
* Ensure that start/size are aligned to a page boundary.
* Size is appropriately rounded down, start is rounded up.
*/
size -= start & ~PAGE_MASK;
aligned_start = PAGE_ALIGN(start);
#ifndef CONFIG_ARCH_PHYS_ADDR_T_64BIT
if (aligned_start > ULONG_MAX) {
printk(KERN_CRIT "Ignoring memory at 0x%08llx outside "
"32-bit physical address space\n", (long long)start);
return -EINVAL;
}
if (aligned_start + size > ULONG_MAX) {
printk(KERN_CRIT "Truncating memory at 0x%08llx to fit in "
"32-bit physical address space\n", (long long)start);
/*
* To ensure bank->start + bank->size is representable in
* 32 bits, we use ULONG_MAX as the upper limit rather than 4GB.
* This means we lose a page after masking.
*/
size = ULONG_MAX - aligned_start;
}
#endif
if (aligned_start < PHYS_OFFSET) {
if (aligned_start + size <= PHYS_OFFSET) {
pr_info("Ignoring memory below PHYS_OFFSET: 0x%08llx-0x%08llx\n",
aligned_start, aligned_start + size);
return -EINVAL;
}
pr_info("Ignoring memory below PHYS_OFFSET: 0x%08llx-0x%08llx\n",
aligned_start, (u64)PHYS_OFFSET);
size -= PHYS_OFFSET - aligned_start;
aligned_start = PHYS_OFFSET;
}
/* atags或fdt中指定的系统内存的起始地址 */
bank->start = aligned_start;
/* atags或fdt中指定的系统内存的大小,bootargs中可以对系统内存大小进行修改 */
bank->size = size & ~(phys_addr_t)(PAGE_SIZE - 1);
/*
* Check whether this memory region has non-zero size or
* invalid node number.
*/
if (bank->size == 0)
return -EINVAL;
/*
atags或fdt中对应meminfo.membank[0],初始化想要的membank后nr_banks要加1,
下次再有membank加入则对应meminfo.membank[1]
*/
meminfo.nr_banks++;
return 0;
}
1.2 初始化membank的时机:
<1> atags或fdt解析时,根据系统内存的其实地址和大小创建meminfo.membank[0]
void __init early_init_dt_add_memory_arch(u64 base, u64 size)
{
/*根据fdt参数指定的系统内存和大小创建meminfo.membank[0],
如:membank.start=0x200000;membank.size=0x6e00000<110M>
系统内存大小会根据bootargs中的mem参数进行二次修改.
*/
arm_add_memory(base,size);
}
<2> 解析bootargs中mem=118M时:
static int __init early_mem(char *p)
{
static int usermem __initdata = 0;
phys_addr_t size;
phys_addr_t start;
char *endp;
/*
* If the user specifies memory size, we
* blow away any automatically generated
* size.
*/
if (usermem == 0) {
usermem = 1;
/* 注意解析fdt时已经添加了membank,此时meminfo.nr_banks=1;
此处再次将meminfo.nr_banks=0,表明bootargs中mem参数中指定的内存大小
对应meminfo.membank[0],即是对fdt中指定的系统内存大小进行二次配置.
*/
meminfo.nr_banks = 0;
}
start = PHYS_OFFSET;
size = memparse(p, &endp);
if (*endp == '@')
start = memparse(endp + 1, NULL);
arm_add_memory(start, size);
return 0;
}
2 初始化pgdat、struct zone: setup_arch()->paging_init()->bootmem_init()->
void __init bootmem_init(void)
{
unsigned long min, max_low, max_high;
max_low = max_high = 0;
/* 根据上面创建好的meminfo.membank[0] ,获取物理地址的页帧号:
min=0x200;max_low=0x6e00;max_high=0x6e00*/
find_limits(&min, &max_low, &max_high);
arm_bootmem_init(min, max_low);
/*
* Sparsemem tries to allocate bootmem in memory_present(),
* so must be done after the fixed reservations
*/
arm_memory_present();
/*
* sparse_init() needs the bootmem allocator up and running.
*/
sparse_init();
/*
* Now free the memory - free_area_init_node needs
* the sparse mem_map arrays initialized by sparse_init()
* for memmap_init_zone(), otherwise all PFNs are invalid.
*/
arm_bootmem_free(min, max_low, max_high);
/*
* This doesn't seem to be used by the Linux memory manager any
* more, but is used by ll_rw_block. If we can get rid of it, we
* also get rid of some of the stuff above as well.
*
* Note: max_low_pfn and max_pfn reflect the number of _pages_ in
* the system, not the maximum PFN.
*/
max_low_pfn = max_low - PHYS_PFN_OFFSET;
/*系统中物理页个数:0x6c00=0x6e00-0x200*/
max_pfn = max_high - PHYS_PFN_OFFSET;
}
setup_arch()->paging_init()->bootm_init()->arm_bootmem_init()
static void __init arm_bootmem_init(unsigned long start_pfn,
unsigned long end_pfn)
{
struct memblock_region *reg;
unsigned int boot_pages;
phys_addr_t bitmap;
pg_data_t *pgdat;
/*
* Allocate the bootmem bitmap page. This must be in a region
* of memory which has already been mapped.
*/
boot_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
bitmap = memblock_alloc_base(boot_pages << PAGE_SHIFT, L1_CACHE_BYTES,
__pfn_to_phys(end_pfn));
/*
* Initialise the bootmem allocator, handing the
* memory banks over to bootmem.
*/
node_set_online(0);
pgdat = NODE_DATA(0);
init_bootmem_node(pgdat, __phys_to_pfn(bitmap), start_pfn, end_pfn);
/* Free the lowmem regions from memblock into bootmem. */
for_each_memblock(memory, reg) {
/*物理页帧号:start=0x200;end=0x6e00;*/
unsigned long start = memblock_region_memory_base_pfn(reg);
unsigned long end = memblock_region_memory_end_pfn(reg);
if (end >= end_pfn)
end = end_pfn;
if (start >= end)
break;
free_bootmem(__pfn_to_phys(start), (end - start) << PAGE_SHIFT);
}
/* Reserve the lowmem memblock reserved regions in bootmem. */
for_each_memblock(reserved, reg) {
unsigned long start = memblock_region_reserved_base_pfn(reg);
unsigned long end = memblock_region_reserved_end_pfn(reg);
if (end >= end_pfn)
end = end_pfn;
if (start >= end)
break;
reserve_bootmem(__pfn_to_phys(start),
(end - start) << PAGE_SHIFT, BOOTMEM_DEFAULT);
}
}
setup_arch()->paging_init()->bootm_init()->arm_bootmem_free()
static void __init arm_bootmem_free(unsigned long min, unsigned long max_low,
unsigned long max_high)
{
unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
struct memblock_region *reg;
/*
* initialise the zones.
*/
memset(zone_size, 0, sizeof(zone_size));
/*
* The memory size has already been determined. If we need
* to do anything fancy with the allocation of this memory
* to the zones, now is the time to do it.
*/
/*
0表示ZONE_NORMAL,系统内存大小对应的物理页个数,即0x6c00=0x6e00-0x200;
zone_size[1]=0;表示ZONE_MOVABLE
*/
zone_size[0] = max_low - min;
/*
* Calculate the size of the holes.
* holes = node_size - sum(bank_sizes)
*/
memcpy(zhole_size, zone_size, sizeof(zhole_size));
for_each_memblock(memory, reg) {
unsigned long start = memblock_region_memory_base_pfn(reg);
unsigned long end = memblock_region_memory_end_pfn(reg);
if (start < max_low) {
unsigned long low_end = min(end, max_low);
zhole_size[0] -= low_end - start;
}
#ifdef CONFIG_HIGHMEM
if (end > max_low) {
unsigned long high_start = max(start, max_low);
zhole_size[ZONE_HIGHMEM] -= end - high_start;
}
#endif
}
#ifdef CONFIG_ZONE_DMA
/*
* Adjust the sizes according to any special requirements for
* this machine type.
*/
if (arm_dma_zone_size)
arm_adjust_dma_zone(zone_size, zhole_size,
arm_dma_zone_size >> PAGE_SHIFT);
#endif
free_area_init_node(0, zone_size, min, zhole_size);
}
setup_arch()->paging_init()->bootm_init()->arm_bootmem_free()->free_area_init_node():
void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
unsigned long node_start_pfn, unsigned long *zholes_size)
{
pg_data_t *pgdat = NODE_DATA(nid);
/* pg_data_t should be reset to zero when it's allocated */
WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
/*nid = 0*/
pgdat->node_id = nid;
/* 系统内存起始地址对应的物理页帧号:0x200 */
pgdat->node_start_pfn = node_start_pfn;
init_zone_allows_reclaim(nid);
/*系统内存大小:以页为单位(0x6e00-0x200 个页)*/
calculate_node_totalpages(pgdat, zones_size, zholes_size);
alloc_node_mem_map(pgdat);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
nid, (unsigned long)pgdat,
(unsigned long)pgdat->node_mem_map);
#endif
free_area_init_core(pgdat, zones_size, zholes_size);
}
setup_arch()->paging_init()->bootm_init()->arm_bootmem_free()->free_area_init_node()->free_area_init_core():
实现struct zone: pgdat->node_zones的初始化
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
unsigned long *zones_size, unsigned long *zholes_size)
{
enum zone_type j;
int nid = pgdat->node_id;
unsigned long zone_start_pfn = pgdat->node_start_pfn;
int ret;
pgdat_resize_init(pgdat);
#ifdef CONFIG_NUMA_BALANCING
spin_lock_init(&pgdat->numabalancing_migrate_lock);
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies;
#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_cgroup_init(pgdat);
/*MAX_NR_ZONES=2;0表示ZONE_NORMAL;1表示ZONE_MOVALBE*/
for (j = 0; j < MAX_NR_ZONES; j++) {
/*初始化struct zone: pgdat->node_zones*/
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
/*系统内存大小以页为单位:
即zone_names[0]="NORMAL";zone_size[0]=0x6c00=0x6e00-0x200;zholes_size[0]=0;
zone_names[1]="Movable";zone_size[1]=0;zholes_size[1]=0;
dma_reserve=0;
*/
size = zone_spanned_pages_in_node(nid, j, zones_size);
/*系统内存大小以页为单位:0x6c00*/
realsize = freesize = size - zone_absent_pages_in_node(nid, j,
zholes_size);
/*
* Adjust freesize so that it accounts for how much memory
* is used by this zone for memmap. This affects the watermark
* and per-cpu initialisations
*/
/*计算页描述符占用的大小:0x6c00*sizeof(struct page)>>PAGE_SHIFT*/
memmap_pages = calc_memmap_size(size, realsize);
if (freesize >= memmap_pages) {
freesize -= memmap_pages;
if (memmap_pages)
printk(KERN_DEBUG
" %s zone: %lu pages used for memmap\n",
zone_names[j], memmap_pages);
} else
printk(KERN_WARNING
" %s zone: %lu pages exceeds freesize %lu\n",
zone_names[j], memmap_pages, freesize);
/* Account for reserved pages */
if (j == 0 && freesize > dma_reserve) {
freesize -= dma_reserve;
printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
zone_names[0], dma_reserve);
}
if (!is_highmem_idx(j))
nr_kernel_pages += freesize;
/* Charge for highmem memmap if there are enough kernel pages */
else if (nr_kernel_pages > memmap_pages * 2)
nr_kernel_pages -= memmap_pages;
nr_all_pages += freesize;
/*系统内存大小以页为单位:0x6c00*/
zone->spanned_pages = size;
zone->present_pages = realsize;
/* 注意:在之后的mm_init->mem_init->free_all_bootmem中会重新初始化
* Set an approximate value for lowmem here, it will be adjusted
* when the bootmem allocator frees pages into the buddy system.
* And all highmem pages will be managed by the buddy system.
*/
zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
#ifdef CONFIG_NUMA
zone->node = nid;
zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
/ 100;
zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
zone_pcp_init(zone);
/*初始化zone->lruvec :包括LRU_INACTIVE_ANON/LRU_ACTIVE_ANON等*/
lruvec_init(&zone->lruvec);
if (!size)
continue;
set_pageblock_order();
setup_usemap(pgdat, zone, zone_start_pfn, size);
ret = init_currently_empty_zone(zone, zone_start_pfn,
size, MEMMAP_EARLY);
BUG_ON(ret);
/*初始化页描述符 */
memmap_init(size, nid, j, zone_start_pfn);
zone_start_pfn += size;
}
}
查看struct zone方法,如查看ZONE_NORMAL区:
方法一:struct zone=NODE_DATA(nid)->node_zones(ZONE_NORMAL);其中nid=0;ZONE_NORMAL=0;
方法二:struct *zone=page_zone(struct *page);可以根据page找到所在的struct zone;
注意:zone.managed_pages是系统允许使用的内存页个数,zone.watermark是以此为基准进行校验的,可以参考后文zone_watermark_ok.
zone.managed_pagess在mm_init->mem_init->free_all_bootmem中会重新初始化,并且后续使用的zone.managed_pages以此为准.
memmap_init=memmap_init_zone
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context)
{
struct page *page;
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
struct zone *z;
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
/*pgdat->node_zones[0]:ZONE_NORMAL*/
z = &NODE_DATA(nid)->node_zones[zone];
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s
* handed to this function. They do not
* exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
if (!early_pfn_valid(pfn))
continue;
if (!early_pfn_in_nid(pfn, nid))
continue;
}
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
mminit_verify_page_links(page, zone, nid, pfn);
/* page->count初始化为1,表示该页被申请了,在以后的启动过程中会通过free_all_bootmem->__free_one_page
释放所有系统内存,那时page->count设置为0
*/
init_page_count(page);
/*page->_mapcount设置-1*/
page_mapcount_reset(page);
page_nid_reset_last(page);
SetPageReserved(page);
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
* to reserve their blocks rather than leaking throughout
* the address space during boot when many long-lived
* kernel allocations are made. Later some blocks near
* the start are marked MIGRATE_RESERVE by
* setup_zone_migrate_reserve()
*
* bitmap is created for zone's valid pfn range. but memmap
* can be created for invalid pages (for alignment)
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
*/
if ((z->zone_start_pfn <= pfn)
&& (pfn < zone_end_pfn(z))
&& !(pfn & (pageblock_nr_pages - 1)))
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
INIT_LIST_HEAD(&page->lru);
}
}
【正文】内存回收时机
1 首先系统启动过程中,会对所有系统内存进行一次释放:
mm_init->mem_init->free_all_bootmem->free_all_bootmem_core->free_hot_cold_page/__free_pages_ok
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
struct page *page;
unsigned long start, end, pages, count = 0;
if (!bdata->node_bootmem_map)
return 0;
start = bdata->node_min_pfn;
end = bdata->node_low_pfn;
/*start物理内存起始地址物理页帧号0x200;物理内存结束地址页帧号0x6e00*/
while (start < end) {
unsigned long *map, idx, vec;
unsigned shift;
map = bdata->node_bootmem_map;
idx = start - bdata->node_min_pfn;
shift = idx & (BITS_PER_LONG - 1);
/*
* vec holds at most BITS_PER_LONG map bits,
* bit 0 corresponds to start.
*/
vec = ~map[idx / BITS_PER_LONG];
if (shift) {
vec >>= shift;
if (end - start >= BITS_PER_LONG)
vec |= ~map[idx / BITS_PER_LONG + 1] <<
(BITS_PER_LONG - shift);
}
/*
* If we have a properly aligned and fully unreserved
* BITS_PER_LONG block of pages in front of us, free
* it in one go.
*/
if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) {
int order = ilog2(BITS_PER_LONG);
__free_pages_bootmem(pfn_to_page(start), order);
count += BITS_PER_LONG;
start += BITS_PER_LONG;
} else {
unsigned long cur = start;
start = ALIGN(start + 1, BITS_PER_LONG);
while (vec && cur != start) {
if (vec & 1) {
page = pfn_to_page(cur);
__free_pages_bootmem(page, 0);
count++;
}
vec >>= 1;
++cur;
}
}
}
page = virt_to_page(bdata->node_bootmem_map);
pages = bdata->node_low_pfn - bdata->node_min_pfn;
pages = bootmem_bootmap_pages(pages);
count += pages;
while (pages--)
__free_pages_bootmem(page++, 0);
bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
return count;
}
2 alloc_pages过程中因为剩余内存不足引起的内存的回收:
可结合 linux内存管理之伙伴系统管理 一文观看;
alloc_pages(gfp_mask,order)->alloc_pages_node(numa_node_id(),gfp_mask,order)->
__alloc_pages(gfp_mask,order,node_zonelist(nid,gfp_mask))->__alloc_pages_nodemask
/* zonelist= node_zonelist(nid,gfp_mask)=NODE_DATA(nid)->node_zonelists;
其中nid=0;pg_data_t *pgdat=NODE_DATA(0)*/
struct page *__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
struct zone *preferred_zone;
struct page *page = NULL;
int migratetype = allocflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
struct mem_cgroup *memcg = NULL;
gfp_mask &= gfp_allowed_mask;
lockdep_trace_alloc(gfp_mask);
might_sleep_if(gfp_mask & __GFP_WAIT);
if (should_fail_alloc_page(gfp_mask, order))
return NULL;
/*
* Check the zones suitable for the gfp_mask contain at least one
* valid zone. It's possible to have an empty zonelist as a result
* of GFP_THISNODE and a memoryless node
*/
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
/*
* Will only have any effect when __GFP_KMEMCG is set. This is
* verified in the (always inline) callee
*/
if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
return NULL;
retry_cpuset:
cpuset_mems_cookie = get_mems_allowed();
/* The preferred zone is used for statistics later */
first_zones_zonelist(zonelist, high_zoneidx,
nodemask ? : &cpuset_current_mems_allowed,
&preferred_zone);
if (!preferred_zone)
goto out;
/* First allocation attempt */
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, alloc_flags,preferred_zone, migratetype);
if (unlikely(!page)) {
/*
* Runtime PM, block IO and its error handling path
* can deadlock because I/O on the device might not
* complete.
*/
gfp_mask = memalloc_noio_flags(gfp_mask);
/* 系统空闲内存紧张时 get_page_from_freelist失败时 */
page = __alloc_pages_slowpath(gfp_mask, order,
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
}
trace_mm_page_alloc(page, order, gfp_mask, migratetype);
out:
/*
* When updating a task's mems_allowed, it is possible to race with
* parallel threads in such a way that an allocation can fail while
* the mask is being updated. If a page allocation is about to fail,
* check if the cpuset changed during allocation and if so, retry.
*/
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
goto retry_cpuset;
memcg_kmem_commit_charge(page, memcg, order);
return page;
}
alloc_pages(gfp_mask,order)->alloc_pages_node(numa_node_id(),gfp_mask,order)->
__alloc_pages(gfp_mask,order,node_zonelist(nid,gfp_mask))->__alloc_pages_nodemask->
get_page_from_freelist->zone_reclaim->shrink_zone
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
struct zone *preferred_zone, int migratetype)
{
struct zoneref *z;
struct page *page = NULL;
int classzone_idx;
struct zone *zone;
nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
int zlc_active = 0;/* set if using zonelist_cache */
int did_zlc_setup = 0;/* just call zlc_setup() one time */
classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
if ((alloc_flags & ALLOC_WMARK_LOW) &&
(gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
goto this_zone_full;
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
int ret;
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
/* zone_watermark_ok在空闲内存不足时返回0;足够时返回1 */
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto try_this_zone;
if (IS_ENABLED(CONFIG_NUMA) &&
!did_zlc_setup && nr_online_nodes > 1) {
/*
* we do zlc_setup if there are multiple nodes
* and before considering the first zone allowed
* by the cpuset.
*/
allowednodes = zlc_setup(zonelist, alloc_flags);
zlc_active = 1;
did_zlc_setup = 1;
}
if (zone_reclaim_mode == 0 ||
!zone_allows_reclaim(preferred_zone, zone))
goto this_zone_full;
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
/* zone_reclaim进行内存回收,zone_watermark_ok返回0时,表明空闲内存不足,此时通过zone_reclarim->shrink_zone回收内存;
如果回收后,空闲内存仍不足,则通过:__alloc_pages_nodemask->__alloc_pages_slowpath->
wake_all_kswapd 唤醒守护进程继续释放内存:kswapd->balance_pgdat->shrink_zone
*/
ret = zone_reclaim(zone, gfp_mask, order);
switch (ret) {
case ZONE_RECLAIM_NOSCAN:
/* did not scan */
continue;
case ZONE_RECLAIM_FULL:
/* scanned but unreclaimable */
continue;
default:
/* did we reclaim enough 内存回收之后,系统空余内存是否足够*/
if (zone_watermark_ok(zone, order, mark,
classzone_idx, alloc_flags))
goto try_this_zone;
if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
ret == ZONE_RECLAIM_SOME)
goto this_zone_full;
continue;
}
}
try_this_zone:
/*申请内存:alloc_pages->__alloc_pages_nodemask->get_page_from_freelist->buffered_rmqueue->rmqueue_bulk:
分解大的伙伴.如4页分成2*2页,申请失败时调用*/
page = buffered_rmqueue(preferred_zone, zone, order,
gfp_mask, migratetype);
if (page)
break;
this_zone_full:
if (IS_ENABLED(CONFIG_NUMA))
zlc_mark_zone_full(zonelist, z);
}
if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
/* Disable zlc cache for second zonelist scan */
zlc_active = 0;
goto zonelist_scan;
}
if (page)
page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
return page;
}
alloc_pages->
get_page_from_freelist
申请过程中通过
zone_watermark_ok
,判断是否有足够空闲页,否则调用zone_reclaim->shrink_zone回收,如果还未成功,则:
__alloc_pages_slowpath->__alloc_pages_direct_reclaim->__perform_reclaim->try_to_free_pages->shrink_zones->shrink_zone
bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags)
{
return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,zone_page_state(z, NR_FREE_PAGES));
}
/*
* Return true if free pages are above 'mark'. This takes into account the order
* of the allocation.
*/
static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
int classzone_idx, int alloc_flags, long free_pages)
{
/* free_pages my go negative - that's OK */
long min = mark;
long lowmem_reserve = z->lowmem_reserve[classzone_idx];
int o;
long free_cma = 0;
free_pages -= (1 << order) - 1;
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
if (alloc_flags & ALLOC_HARDER)
min -= min / 4;
#ifdef CONFIG_CMA
/* If allocation can't use CMA areas don't use free CMA pages */
if (!(alloc_flags & ALLOC_CMA))
free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif
/* 系统空闲内存不足时,返回0 */
if (free_pages - free_cma <= min + lowmem_reserve)
return false;
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
free_pages -= z->free_area[o].nr_free << o;
/* Require fewer higher order pages to be free */
min >>= 1;
/* 系统空闲内存不足时,返回0 */
if (free_pages <= min)
return false;
}
return true;
}
zone->watermark和zone->lowmem_reserve初始化过程:init_per_zone_wmark_min->setup_per_zone_wmarks
/*
* Initialise min_free_kbytes.
*
* For small machines we want it small (128k min). For large machines
* we want it large (64MB max). But it is not linear, because network
* bandwidth does not increase linearly with machine size. We use
*
* min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
* min_free_kbytes = sqrt(lowmem_kbytes * 16)
*
* which yields
*
* 16MB: 512k
* 32MB: 724k
* 64MB: 1024k
* 128MB: 1448k
* 256MB: 2048k
* 512MB: 2896k
* 1024MB: 4096k
* 2048MB: 5792k
* 4096MB: 8192k
* 8192MB: 11584k
* 16384MB: 16384k
*/
int __meminit init_per_zone_wmark_min(void)
{
unsigned long lowmem_kbytes;
/*举例:系统内存为118M(bootargs参数中mem=118M)
zone->managed_pages=28764个page;表示系统允许申请的内存页个数.
lowmem_kbytes=(zone->managed_pages=28764)*4=115056kbytes;
*/
lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
/*
min_free_kbytes =1356kbytes*/
min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
if (min_free_kbytes < 128)
min_free_kbytes = 128;
if (min_free_kbytes > 65536)
min_free_kbytes = 65536;
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
setup_per_zone_inactive_ratio();
return 0;
}
module_init(init_per_zone_wmark_min)
zone->watermark和zone->lowmem_reserve通过proc设置过程:
min_free_kbytes_sysctl_handler->setup_per_zone_wmarks
static void __setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
struct zone *zone;
unsigned long flags;
/* Calculate total number of !ZONE_HIGHMEM pages */
for_each_zone(zone) {
if (!is_highmem(zone))
lowmem_pages += zone->managed_pages;
}
for_each_zone(zone) {
u64 tmp;
spin_lock_irqsave(&zone->lock, flags);
tmp = (u64)pages_min * zone->managed_pages;
do_div(tmp, lowmem_pages);
if (is_highmem(zone)) {
/*
* __GFP_HIGH and PF_MEMALLOC allocations usually don't
* need highmem pages, so cap pages_min to a small
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
* deltas controls asynch page reclaim, and so should
* not be capped for highmem.
*/
unsigned long min_pages;
min_pages = zone->managed_pages / 1024;
min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
zone->watermark[WMARK_MIN] = min_pages;
} else {
/*
* If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
zone->watermark[WMARK_MIN] = tmp;
}
/*举例:系统内存为118M(bootargs参数中mem=118M)
zone->managed_pages=28764个page;表示系统允许申请的内存页个数.
lowmem_kbytes=(zone->managed_pages=28764)*4=115056kbytes;
min_free_kbytes =1356kbytes;
min_wmark_pages(zone)=zone->watermark[WMARK_MIN]=tmp=min_free_kbytes/4=339个pages;
zone->watermark[WMARK_LOW]=339+339/4=423;
zone->watermark[WMARK_HIGH]=339+339/2=508;
*/
zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
setup_zone_migrate_reserve(zone);
spin_unlock_irqrestore(&zone->lock, flags);
}
/* update totalreserve_pages */
calculate_totalreserve_pages();
}
3 wake_all_kswapd唤醒守护进程释放空闲内存:kswapd->balance_pgdat->shrink_zone
get_page_from_freelist申请内存失败时,alloc_pages(gfp_mask,order)->alloc_pages_node(numa_node_id(),gfp_mask,order)->
__alloc_pages(gfp_mask,order,node_zonelist(nid,gfp_mask))->__alloc_pages_nodemask->
__alloc_pages_slowpath->wake_all_kswapd唤醒回收机制.
【正文】内存回收过程
void lruvec_init(struct lruvec *lruvec)
{
enum lru_list lru;
memset(lruvec, 0, sizeof(struct lruvec));
for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
}
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
struct zone_reclaim_stat reclaim_stat;
};
enum lru_list {
LRU_INACTIVE_ANON = LRU_BASE,
LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
LRU_UNEVICTABLE,
NR_LRU_LISTS
};
用户进程的物理页面都是通过page fault进行分配的。通过page fault进行分配的页面都是可以进行回收的。 这些页面总体可以划分为两种,分别是文件页(file cache)和匿名页(anonymous cache).下面介绍一下缺页异常处理的主要函数:
int handle_pte_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pte_t *pte, pmd_t *pmd, unsigned int flags)
{
pte_t entry;
spinlock_t *ptl;
entry = *pte;
/* 页未被申请 present位被清除 */
if (!pte_present(entry)) {
if (pte_none(entry)) {
if (vma->vm_ops) {
if (likely(vma->vm_ops->fault))
return do_linear_fault(mm, vma, address,
pte, pmd, flags, entry);
}
/*匿名页缺页处理:最后会调用到filemap_fault,见后文分析*/
return do_anonymous_page(mm, vma, address,
pte, pmd, flags);
}
if (pte_file(entry)){
/*文件页缺页处理*/
return do_nonlinear_fault(mm, vma, address,
pte, pmd, flags, entry);
}
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
}
if (pte_numa(entry))
return do_numa_page(mm, vma, address, entry, pte, pmd);
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
if (unlikely(!pte_same(*pte, entry)))
goto unlock;
if (flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(mm, vma, address,
pte, pmd, ptl, entry);
entry = pte_mkdirty(entry);
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vma, address, pte);
} else {
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
* This still avoids useless tlb flushes for .text page faults
* with threads.
*/
if (flags & FAULT_FLAG_WRITE)
flush_tlb_fix_spurious_fault(vma, address);
}
unlock:
pte_unmap_unlock(pte, ptl);
return 0;
}
4.1 匿名页申请
匿名页缺页处理:handle_pte_fault->do_anonymous_page()
static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags)
{
struct page *page;
spinlock_t *ptl;
pte_t entry;
pte_unmap(page_table);
/* Check if we need to add a guard page to the stack */
if (check_stack_guard_page(vma, address) < 0)
return VM_FAULT_SIGBUS;
/* Use the zero-page for reads */
if (!(flags & FAULT_FLAG_WRITE)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
vma->vm_page_prot));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto unlock;
goto setpte;
}
/* Allocate our own private page. */
if (unlikely(anon_vma_prepare(vma)))
goto oom;
/*申请物理页*/
page = alloc_zeroed_user_highpage_movable(vma, address);
if (!page)
goto oom;
/*
* The memory barrier inside __SetPageUptodate makes sure that
* preceeding stores to the page contents become visible before
* the set_pte_at() write.
*/
/*设置page->flags ;page-flags.h中定义宏*/
__SetPageUptodate(page);
if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
goto oom_free_page;
entry = mk_pte(page, vma->vm_page_prot);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto release;
inc_mm_counter_fast(mm, MM_ANONPAGES);
/*
设置page->flags;page->mapping;page->index等.
将申请的页添加到LRU_ACTIVE_ANON或LRU_UNEVICTABLE链表
*/
page_add_new_anon_rmap(page, vma, address);
setpte:
set_pte_at(mm, address, page_table, entry);
/* No need to invalidate - it was non-present before */
update_mmu_cache(vma, address, page_table);
unlock:
pte_unmap_unlock(page_table, ptl);
return 0;
release:
mem_cgroup_uncharge_page(page);
page_cache_release(page);
goto unlock;
oom_free_page:
page_cache_release(page);
oom:
return VM_FAULT_OOM;
}
handle_pte_fault->do_anonymous_page->page_add_new_anon_rmap
void page_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
SetPageSwapBacked(page);
atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
if (!PageTransHuge(page))
__inc_zone_page_state(page, NR_ANON_PAGES);
else
__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
__page_set_anon_rmap(page, vma, address, 1);
/*
判断这段内存是否mlock锁定,mlock(用户虚拟起始地址,锁定的内存大小);
锁定的内存不会被动kswapd回收机制动态释放
*/
if (!mlocked_vma_newpage(vma, page))
/*将页添加到LRU_ACTIVE_ANON*/
lru_cache_add_lru(page, LRU_ACTIVE_ANON);
else
/*将页添加到LRU_UNEVICTABLE链表;动态回收过程(shrink_zone)不会释放锁定的内存*/
add_page_to_unevictable_list(page);
}
handle_pte_fault->do_wp_page->
page_move_anon_rmap
void page_move_anon_rmap(struct page *page,struct vm_area_struct *vma, unsigned long address)
{
struct anon_vma *anon_vma = vma->anon_vma;
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
}
匿名vma初始化:anon_vma_alloc()
4.2 文件页的申请
1)读取文件可以直接使用read系统调用进行读取;也可以mmap文件句柄,最后会使用文件系统层的mmap注册文件页的缺页异常处理函数,注意此处不是匿名页的缺页异常处理。
以squashfs文件系统为例:
const struct file_operations generic_ro_fops={
.llseek = generic_file_llseek,
.read = do_sync_read,
.aio_read = generic_file_aio_read, --直接读取
.mmap = generic_file_readonly_mmap,--触发缺页异常,通过filemap_fault读取
.splice_read = generic_file_splice_read,
}
2)mmap一个文件的过程:generic_file_readonly_mmap->generic_file_mmap
const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
.page_mkwrite = filemap_page_mkwrite,
.remap_pages = generic_file_remap_pages,
};
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
return -EINVAL;
return generic_file_mmap(file, vma);
}
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
struct address_space *mapping = file->f_mapping;
if (!mapping->a_ops->readpage)
return -ENOEXEC;
file_accessed(file);
/* const struct vm_operations_struct generic_file_vm_ops = {
.fault=filemap_fault,//缺页异常时调用
}
*/
vma->vm_ops = &generic_file_vm_ops;
return 0;
}
3)read方式读取文件,系统调用处理过程:generic_file_aio_read->do_generic_file_read()mmap方式读取文件,缺页异常处理过程:handle_pte_fault->do_nolinear_fault->__do_fault->filemap_fault
int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
int error;
struct file *file = vma->vm_file;
struct address_space *mapping = file->f_mapping;
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
pgoff_t offset = vmf->pgoff;
struct page *page;
pgoff_t size;
int ret = 0;
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (offset >= size)
return VM_FAULT_SIGBUS;
/*
* Do we have something in the page cache already?
*/
page = find_get_page(mapping, offset);
if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
/*
* We found the page, so try async readahead before
* waiting for the lock.
*/
/*调用路径:
->page_cache_async_readahead->ondemand_readahead->__do_page_cache_readahead->
read_pages->squashfs_readpage
->page_cache_async_readahead->ondemand_readahead->ra_submit->__do_page_cache_readahead->
read_pages->squashfs_readpage
*/
do_async_mmap_readahead(vma, ra, file, page, offset);
} else if (!page) {
/* No page in the page cache at all */
/*调用路径:
->ra_submit->__do_page_cache_readahead->read_pages->squashfs_readpage
*/
do_sync_mmap_readahead(vma, ra, file, offset);
print_readpage_flag = 0;
count_vm_event(PGMAJFAULT);
mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
retry_find:
page = find_get_page(mapping, offset);
if (!page)
goto no_cached_page;
}
if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
page_cache_release(page);
return ret | VM_FAULT_RETRY;
}
/* Did it get truncated? */
if (unlikely(page->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto retry_find;
}
VM_BUG_ON(page->index != offset);
/*
* We have a locked page in the page cache, now we need to check
* that it's up-to-date. If not, it is going to be due to an error.
*/
if (unlikely(!PageUptodate(page)))
goto page_not_uptodate;
/*
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
if (unlikely(offset >= size)) {
unlock_page(page);
page_cache_release(page);
return VM_FAULT_SIGBUS;
}
vmf->page = page;
return ret | VM_FAULT_LOCKED;
no_cached_page:
/*
* We're only likely to ever get here if MADV_RANDOM is in
* effect.
*/
error = page_cache_read(file, offset);
/*
* The page we want has now been added to the page cache.
* In the unlikely event that someone removed it in the
* meantime, we'll just come back here and read it again.
*/
if (error >= 0)
goto retry_find;
/*
* An error return from page_cache_read can result if the
* system is low on memory, or a problem occurs while trying
* to schedule I/O.
*/
if (error == -ENOMEM)
return VM_FAULT_OOM;
return VM_FAULT_SIGBUS;
page_not_uptodate:
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
* because there really aren't any performance issues here
* and we need to check for errors.
*/
ClearPageError(page);
/* 调用路径:->squashfs_readpage */
error = mapping->a_ops->readpage(file, page);
if (!error) {
wait_on_page_locked(page);
if (!PageUptodate(page))
error = -EIO;
}
page_cache_release(page);
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
/* Things didn't work out. Return zero to tell the mm layer so. */
shrink_readahead_size_eio(file, ra);
return VM_FAULT_SIGBUS;
}
分析file_fault中代码得知:do_sync_mmap_readahead/do_async_mmap_readahead/page_cache_read最后都会通过squashfs_readpage读取文件.
举例:系统内存紧张时,会释放进程的动态库,此时若再加载动态库,则需要通过file_fault获取,多数情况通过do_sync_mmap_readahead加载代码段到dram上.
下面分析一下do_sync_mmap_readahead/do_async_mmap_readahead/page_cache_read这三个函数,这三种读取文件的过程都有可能申请文件页,且此时申请的文件页
需要通过add_to_page_cache_lru挂载到lru链表上.do_sync_mmap_readahead/do_async_mmap_readahead实现类似统一介绍:
无论哪种方式文件也得申请都形如(do_generic_file_read中虽未直接调用page_cache_read,但也有类似实现add_to_page_cache_lru):filemap_fault->page_cache_async_readahead->ondemand_readahead->__do_page_cache_readahead->
read_pages->squashfs_readpage
filemap_fault->page_cache_async_readahead->ondemand_readahead->ra_submit->__do_page_cache_readahead->
read_pages->squashfs_readpage
4)申请文件页时机:__do_page_cache_readahead->过程申请文件页:
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
pgoff_t offset, unsigned long nr_to_read,
unsigned long lookahead_size)
{
struct inode *inode = mapping->host;
struct page *page;
unsigned long end_index; /* The last page we want to read */
LIST_HEAD(page_pool);
int page_idx;
int ret = 0;
loff_t isize = i_size_read(inode);
if (isize == 0)
goto out;
end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
/*
* Preallocate as many pages as we will need.
*/
for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
pgoff_t page_offset = offset + page_idx;
if (page_offset > end_index)
break;
rcu_read_lock();
page = radix_tree_lookup(&mapping->page_tree, page_offset);
rcu_read_unlock();
if (page)
continue;
/*申请文件页,在之后read_pages->add_to_page_cache_lru中将申请到的文件页添加到lru链表*/
page = page_cache_alloc_readahead(mapping);
if (!page)
break;
page->index = page_offset;
list_add(&page->lru, &page_pool);
if (page_idx == nr_to_read - lookahead_size)
SetPageReadahead(page);
ret++;
}
/*
* Now start the IO. We ignore I/O errors - if the page is not
* uptodate then the caller will launch readpage again, and
* will then handle the error.
*/
if (ret)
read_pages(mapping, filp, &page_pool, ret);
BUG_ON(!list_empty(&page_pool));
out:
return ret;
}
__do_page_cache_readahead->read_pages
static int read_pages(struct address_space *mapping, struct file *filp,
struct list_head *pages, unsigned nr_pages)
{
struct blk_plug plug;
unsigned page_idx;
int ret;
blk_start_plug(&plug);
if (mapping->a_ops->readpages) {
ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
/* Clean up the remaining pages */
put_pages_list(pages);
goto out;
}
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page = list_to_page(pages);
list_del(&page->lru);
if (!add_to_page_cache_lru(page, mapping,
page->index, GFP_KERNEL)) {
mapping->a_ops->readpage(filp, page);
}
page_cache_release(page);
}
ret = 0;
out:
blk_finish_plug(&plug);
return ret;
}
5)申请文件页时机:squashfs_readpage过程申请文件页:可以参考linux文件读取过程一文
static int squashfs_readpage(struct file *file, struct page *page)
{
struct inode *inode = page->mapping->host;
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
int bytes, i, offset = 0, sparse = 0;
struct squashfs_cache_entry *buffer = NULL;
void *pageaddr;
int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1;
int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT);
int start_index = page->index & ~mask;
int end_index = start_index | mask;
int file_end = i_size_read(inode) >> msblk->block_log;
if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
PAGE_CACHE_SHIFT))
goto out;
if (index < file_end || squashfs_i(inode)->fragment_block ==
SQUASHFS_INVALID_BLK) {
/*
* Reading a datablock from disk. Need to read block list
* to get location and block size.
*/
u64 block = 0;
int bsize = read_blocklist(inode, index, &block);
if (bsize < 0)
goto error_out;
if (bsize == 0) { /* hole */
bytes = index == file_end ?
(i_size_read(inode) & (msblk->block_size - 1)) :
msblk->block_size;
sparse = 1;
} else {
/*
* Read and decompress datablock.
*/
buffer = squashfs_get_datablock(inode->i_sb,
block, bsize);
if (buffer->error) {
squashfs_file_lookup(inode,file->f_dentry,0);
ERROR("Unable to read page0 ,block %llx, size %x"
"\n", block, bsize);
squashfs_cache_put(buffer);
goto error_out;
}
bytes = buffer->length;
}
} else {
/*
* Datablock is stored inside a fragment (tail-end packed
* block).
*/
if(print_readpage_flag)
printk("read page, fragment block %llx, size %x\n",
squashfs_i(inode)->fragment_block,
squashfs_i(inode)->fragment_size);
buffer = squashfs_get_fragment(inode->i_sb,
squashfs_i(inode)->fragment_block,
squashfs_i(inode)->fragment_size);
if (buffer->error) {
ERROR("Unable to read page, block %llx, size %x\n",
squashfs_i(inode)->fragment_block,
squashfs_i(inode)->fragment_size);
squashfs_cache_put(buffer);
goto error_out;
}
bytes = i_size_read(inode) & (msblk->block_size - 1);
offset = squashfs_i(inode)->fragment_offset;
}
/*
* Loop copying datablock into pages. As the datablock likely covers
* many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly
* grab the pages from the page cache, except for the page that we've
* been called to fill.
*/
for (i = start_index; i <= end_index && bytes > 0; i++,
bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) {
struct page *push_page;
int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE);
TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail);
/*申请文件页,在之后grab_cache_page_nowait->add_to_page_cache_lru中将申请到的文件页添加到lru链表
squashfs_readpage一次读取一个逻辑块大小,而入参只是一个页,所以要再申请文件页
*/
push_page = (i == page->index) ? page :
grab_cache_page_nowait(page->mapping, i);
if (!push_page)
continue;
if (PageUptodate(push_page))
goto skip_page;
pageaddr = kmap_atomic(push_page);
squashfs_copy_data(pageaddr, buffer, offset, avail);
memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail);
kunmap_atomic(pageaddr);
flush_dcache_page(push_page);
SetPageUptodate(push_page);
skip_page:
unlock_page(push_page);
if (i != page->index)
page_cache_release(push_page);
}
if (!sparse)
squashfs_cache_put(buffer);
return 0;
error_out:
SetPageError(page);
out:
pageaddr = kmap_atomic(page);
memset(pageaddr, 0, PAGE_CACHE_SIZE);
kunmap_atomic(pageaddr);
flush_dcache_page(page);
if (!PageError(page))
SetPageUptodate(page);
unlock_page(page);
return 0;
}
squashfs_readpage->grab_cache_page_nowait
struct page *
grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
{
struct page *page = find_get_page(mapping, index);
if (page) {
if (trylock_page(page))
return page;
page_cache_release(page);
return NULL;
}
/*申请文件页,在之后grab_cache_page_nowait->add_to_page_cache_lru中
将申请到的文件页添加到lru链表*/
page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
page_cache_release(page);
page = NULL;
}
return page;
}
6)申请文件页时机:filemap_fault->page_cache_read:过程申请文件页
do_nolinear_fault->__do_fault->filemap_fault->page_cache_read
static int page_cache_read(struct file *file, pgoff_t offset)
{
struct address_space *mapping = file->f_mapping;
struct page *page;
int ret;
do {
/*通过alloc_pages申请内存*/
page = page_cache_alloc_cold(mapping);
if (!page)
return -ENOMEM;
/*
设置page->flags;page->mapping;page->index等.
将申请的页添加到LRU_INACTIVE_FILE链表
*/
ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
if (ret == 0)
ret = mapping->a_ops->readpage(file, page);
else if (ret == -EEXIST)
ret = 0; /* losing race to add is OK */
page_cache_release(page);
} while (ret == AOP_TRUNCATED_PAGE);
return ret;
}
do_nolinear_fault->__do_fault->filemap_fault->page_cache_read->add_to_page_cache_lru()
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
int ret;
/*设置PG_locked;调用add_to_page_cache_locked*/
ret = add_to_page_cache(page, mapping, offset, gfp_mask);
if (ret == 0)
/*将申请的页添加到LRU_INACTIVE_FILE链表*/
lru_cache_add_file(page);
return ret;
}
add_to_page_cache_locked设置page->mapping=file->f_mapping;page->index=offset;
注意:do_dentry_open时f_mapping=inode->i_mapping;
inode->i_mapping的初始化:
1>是在do_last->lookup_open->vfs_create->ubifs_create->ubifs_new_inode创建一个文件时初始化的.
2>对于已存在文件,是在do_last->lookup_open->lookup_real->ubifs_lookup->ubifs_iget
add_to_page_cache_lru->add_to_page_cache->add_to_page_cache_locked 和 __delete_from_page_cache对应;
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
{
int error;
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageSwapBacked(page));
error = mem_cgroup_cache_charge(page, current->mm,
gfp_mask & GFP_RECLAIM_MASK);
if (error)
goto out;
error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error == 0) {
page_cache_get(page);
/*设置page->mapping=file->f_mapping*/
page->mapping = mapping;
/*offset表示要访问文件的偏移地址所在页*/
page->index = offset;
spin_lock_irq(&mapping->tree_lock);
error = radix_tree_insert(&mapping->page_tree, offset, page);
if (likely(!error)) {
mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
trace_mm_filemap_add_to_page_cache(page);
} else {
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
page_cache_release(page);
}
radix_tree_preload_end();
} else
mem_cgroup_uncharge_cache_page(page);
out:
return error;
}
7)申请文件页时机:do_generic_file_read过程申请文件页
详见博文:linux文件系统实现原理简述 http://write.blog.csdn.net/postedit/71249365 ;
5 内存回收的代码介绍:
alloc_pages申请内存过程,主要有两个时机会回收内存:
时机一:get_page_from_freelist->zone_reclaim->shrink_zone 主要根据zone_watermark_ok判断是否zone_reclaim;
时机二:__alloc_pages_slowpath->wake_all_kswapd唤醒回收机制.kswapd->balance_pgdat->shrink_zone;
alloc_pages过程中,如果get_page_from_freelist申请失败,则唤醒守护进程进行内存回收.
无论哪种方式,最后都是通过形如shrink_zone的函数进行内存回收的.
shrink_zone->shrink_lruvc/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
{
unsigned long nr[NR_LRU_LISTS];
unsigned long nr_to_scan;
enum lru_list lru;
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
struct blk_plug plug;
/*获取NR_LRU_LISTS链表上每种物理页个数*/
get_scan_count(lruvec, sc, nr);
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
for_each_evictable_lru(lru) {
if (nr[lru]) {
nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
lruvec, sc);
}
}
/*
* On large memory systems, scan >> priority can become
* really large. This is fine for the starting priority;
* we want to put equal scanning pressure on each zone.
* However, if the VM has a harder time of freeing pages,
* with multiple processes reclaiming pages, the total
* freeing target can get unreasonably large.
*/
if (nr_reclaimed >= nr_to_reclaim &&
sc->priority < DEF_PRIORITY)
break;
}
blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
/*
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.inactive_anon_is_low(lruvec)=0
*/
if (inactive_anon_is_low(lruvec))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
throttle_vm_writeout(sc->gfp_mask);
}
shrink_lruvec->shrink_list()
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
/*inactive_list_is_low返回值:
如果是文件页:LRU_ACTIVE_FILE链表上的页比LRU_INACTIVE_FILE链表上的页多,则为真,否则为假;
如果是匿名页则一直未假;
*/
if (inactive_list_is_low(lruvec, lru))
/* 把页从active链表移到inactive链表 */
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
/*真正的页面回收函数
shrink_inactive_list->shrink_page_list
*/
return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
}
当inactive链表上的页数不够的时候,会调用shrink_active_list,该函数会将active链表上的页move到inactive链表上;
shrink_lruvec->shrink_list()->shrink_active_list()
static void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
enum lru_list lru)
{
unsigned long nr_taken;
unsigned long nr_scanned;
unsigned long vm_flags;
LIST_HEAD(l_hold);/* The pages which were snipped off */
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
struct page *page;
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
unsigned long nr_rotated = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct zone *zone = lruvec_zone(lruvec);
unsigned long vm_oflags = 0;
static int age = 0;
int puttoinactive = 0;
lru_add_drain();
if (!sc->may_unmap)
isolate_mode |= ISOLATE_UNMAPPED;
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;
spin_lock_irq(&zone->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, isolate_mode, lru);
if (global_reclaim(sc))
zone->pages_scanned += nr_scanned;
reclaim_stat->recent_scanned[file] += nr_taken;
__count_zone_vm_events(PGREFILL, zone, nr_scanned);
__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
spin_unlock_irq(&zone->lru_lock);
while (!list_empty(&l_hold)) {
cond_resched();
page = lru_to_page(&l_hold);
list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
putback_lru_page(page);
continue;
}
if (unlikely(buffer_heads_over_limit)) {
if (page_has_private(page) && trylock_page(page)) {
if (page_has_private(page))
try_to_release_page(page, 0);
unlock_page(page);
}
}
/*
page_referenced返回0:表示该页需要添加到inactive链表上,如果某些特殊情况不想释放特定页可以在此时做特殊处理;
page_referenced返回1:如果文件页是代码段,则添加到active链表上,因此代码段有更多机会留在内存上;
*/
if (page_referenced(page, 0, sc->target_mem_cgroup,
&vm_flags)) {
nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
* that executable code get better chances to stay in
* memory under moderate memory pressure. Anon pages
* are not likely to be evicted by use-once streaming
* IO, plus JVM can create lots of anon VM_EXEC pages,
* so we ignore them here.
*/
/*此时表示文件页*/
if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) {
list_add(&page->lru, &l_active);
continue;
}
}
ClearPageActive(page); /* we are de-activating */
/* 将页添加到inactive链表中 */
list_add(&page->lru, &l_inactive);
}
/*
* Move pages back to the lru list.
*/
spin_lock_irq(&zone->lru_lock);
/*
* Count referenced pages from currently used mappings as rotated,
* even though only some of them are actually re-activated. This
* helps balance scan pressure between file and anonymous pages in
* get_scan_ratio.
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
free_hot_cold_page_list(&l_hold, 1);
}
shrink_active_list将页移到inactive链表上;
下一步进行真正的页面回收函数
shrink_inactive_list->shrink_page_list
static noinline_for_stack unsigned long
shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
struct scan_control *sc, enum lru_list lru)
{
LIST_HEAD(page_list);
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
unsigned long nr_dirty = 0;
unsigned long nr_writeback = 0;
isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct zone *zone = lruvec_zone(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
while (unlikely(too_many_isolated(zone, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
return SWAP_CLUSTER_MAX;
}
lru_add_drain();
if (!sc->may_unmap)
isolate_mode |= ISOLATE_UNMAPPED;
if (!sc->may_writepage)
isolate_mode |= ISOLATE_CLEAN;
spin_lock_irq(&zone->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, isolate_mode, lru);
__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
if (global_reclaim(sc)) {
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
__count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned);
else
__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned);
}
spin_unlock_irq(&zone->lru_lock);
if (nr_taken == 0)
return 0;
/*真正回收页面*/
nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP,
&nr_dirty, &nr_writeback, false);
spin_lock_irq(&zone->lru_lock);
reclaim_stat->recent_scanned[file] += nr_taken;
if (global_reclaim(sc)) {
if (current_is_kswapd())
__count_zone_vm_events(PGSTEAL_KSWAPD, zone,
nr_reclaimed);
else
__count_zone_vm_events(PGSTEAL_DIRECT, zone,
nr_reclaimed);
}
putback_inactive_pages(lruvec, &page_list);
__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&zone->lru_lock);
free_hot_cold_page_list(&page_list, 1);
/*
* If reclaim is isolating dirty pages under writeback, it implies
* that the long-lived page allocation rate is exceeding the page
* laundering rate. Either the global limits are not being effective
* at throttling processes due to the page distribution throughout
* zones or there is heavy usage of a slow backing device. The
* only option is to throttle from reclaim context which is not ideal
* as there is no guarantee the dirtying process is throttled in the
* same way balance_dirty_pages() manages.
*
* This scales the number of dirty pages that must be under writeback
* before throttling depending on priority. It is a simple backoff
* function that has the most effect in the range DEF_PRIORITY to
* DEF_PRIORITY-2 which is the priority reclaim is considered to be
* in trouble and reclaim is considered to be in trouble.
*
* DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
* DEF_PRIORITY-1 50% must be PageWriteback
* DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
* ...
* DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
* isolated page is PageWriteback
*/
if (nr_writeback && nr_writeback >=
(nr_taken >> (DEF_PRIORITY - sc->priority)))
wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
zone_idx(zone),
nr_scanned, nr_reclaimed,
sc->priority,
trace_shrink_flags(file));
return nr_reclaimed;
}
shrink_inactive_list->shrink_page_list()->__remove_mapping()
static int __remove_mapping(struct address_space *mapping, struct page *page)
{
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
spin_lock_irq(&mapping->tree_lock);
/*
* The non racy check for a busy page.
*
* Must be careful with the order of the tests. When someone has
* a ref to the page, it may be possible that they dirty it then
* drop the reference. So if PageDirty is tested before page_count
* here, then the following race may occur:
*
* get_user_pages(&page);
* [user mapping goes away]
* write_to(page);
* !PageDirty(page) [good]
* SetPageDirty(page);
* put_page(page);
* !page_count(page) [good, discard it]
*
* [oops, our write_to data is lost]
*
* Reversing the order of the tests ensures such a situation cannot
* escape unnoticed. The smp_rmb is needed to ensure the page->flags
* load is not satisfied before that of page->_count.
*
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under tree_lock, then this ordering is not required.
*/
if (!page_freeze_refs(page, 2))
goto cannot_free;
/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
if (unlikely(PageDirty(page))) {
page_unfreeze_refs(page, 2);
goto cannot_free;
}
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
__delete_from_swap_cache(page);
spin_unlock_irq(&mapping->tree_lock);
swapcache_free(swap, page);
} else {
void (*freepage)(struct page *);
freepage = mapping->a_ops->freepage;
__delete_from_page_cache(page);
spin_unlock_irq(&mapping->tree_lock);
mem_cgroup_uncharge_cache_page(page);
if (freepage != NULL)
freepage(page);
}
return 1;
cannot_free:
spin_unlock_irq(&mapping->tree_lock);
return 0;
}
__remove_mapping->delete_from_page_cache->__delete_from_page_cache()
void __delete_from_page_cache(struct page *page)
{
struct address_space *mapping = page->mapping;
trace_mm_filemap_delete_from_page_cache(page);
/*
* if we're uptodate, flush out into the cleancache, otherwise
* invalidate any existing cleancache entries. We can't leave
* stale data around in the cleancache once our page is gone
*/
if (PageUptodate(page) && PageMappedToDisk(page))
cleancache_put_page(page);
else
cleancache_invalidate_page(mapping, page);
/*从lru链表上删除,和add_to_page_cache_locked对应*/
radix_tree_delete(&mapping->page_tree, page->index);
page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
mapping->nrpages--;
__dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
__dec_zone_page_state(page, NR_SHMEM);
BUG_ON(page_mapped(page));
/*
* Some filesystems seem to re-dirty the page even after
* the VM has canceled the dirty bit (eg ext3 journaling).
*
* Fix it up by doing a final dirty accounting check after
* having removed the page entirely.
*/
if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
}
}
【总结】ps:伙伴系统buddy:11个块链表大小分别为2的0、1、2、、、10次方个页,最多申请连续1024个页=4M
4通过page fault申请文件页
1) 映射页:page->mapping = file->f_mapping:
filemap_fault->add_to_page_cache_lru->add_to_page_cache_locked-》add_to_page_cache_locked:page->mapping = mapping;page->index = offset;
2)匿名页:handle_pte_fault->do_wp_page->page_move_anon_rmap
3) 匿名vma初始化:anon_vma_alloc()
5 动态内存回收机制,主要回收通过page fault进行申请的页.
1)alloc_pages因为剩余内存不足引起的回收:
try_to_free_pages->shrink_zone;
zone_reclaim->shrink_zone;
shrink_zone -> shrink_lruvec -> shrink_list->shrink_inactive_list->shrink_page_list
shrink_zone -> shrink_lruvec -> shrink_list->shrink_active_list
2)守护进程释放kswapd->balance_pgdat->shrink_zone
kswapd_init->kswapd_run->kthread_run(kswapd,pgdat,"kswapd%d",nid)
6 真正的页面回收函数
shrink_page_list->try_to_unmap_anon:
匿名线性区赋值,存放在页描述符的mapping字段。当mapping最低位为0时,存放的是映射页address_space;非0存放匿名页线性区。
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;(可以插入几个进程的线性区(),只有所有进程都释放了该页(mapcount为-1时)
才能释放,即从active,inactive链表中删除,并调用free_hot_page释放页框;初始化page->_mapcount:copy_one_pte->page_dup_rmap(page);
try_to_unmap_anon逐个线性区释放。
1)一个线性区如何释放?
shrink_active_list的执行条件:非活动链表中比活动链表小,则开始缩小活动链表 。
move_active_pages_to_lru:list_move(&page->lru, &lruvec->lists[lru]),将活动也链到活动也链表中。
1)mark_page_accessed: page从inactive链表向active链表切换,其中采用了二次机会。
inactive reference=0-> inactive reference=1 ->active reference=0->active reference=1
2)page_referenced:page从active向inactive表切换.每当置换算法扫描一次页面,就老化一次。
3)用户进程的页面都是通过缺页异常 page fault分配的handle_pte_fault,都是可回收的,分为:文件页和匿名页。
4) handle_pte_fault函数(do_swap_page换入,pageout换出,页不在内存时-present标记清0时调用。匿名页添加到活动lru;文件页加到非活动lru)
【其他】
1 不断触发内存动态回收方法,内存紧张时,代码段可能被回收:
static int wake_all_kswapd_debug(void)
{
/* 每个两秒触发一次内存回收,触发30次 */
for(j=0;j<30;j++)
{
/*8表示连续256个page即1M大小;NODE_DATA(0)表示ZONE_NORMAL*/
wake_all_kswapd(8,NODE_DATA(0)->node_zonelists,0,0);
msleep(2000);
}
}
2 判断test进程的代码段,动态库等被重新加载的方法:filemap_fault函数中加入打印信息.
int filemap_fault()
{
if(strncmp(current->comm,"test",4)==0)
{
printk("[%s]filemap_fault:%s\n",
current->comm,file->f_path.dentry->d_iname);
}
}
3 统计加载一个动态库到ram上时,需要申请的文件页数,根据上文得知申请文件页的时机,则分别统计即可.
文件页个数增加过程,一般来说系统通过add_to_page_cache_lru讲申请到的页作为文件页,可以据此统计文件页个数:
1>do_generic_file_read过程申请文件页,do_generic_file_read->add_to_page_cache_lru之后统计文件页个数;
2>filemap_fault->page_cache_read->add_to_page_cache_lru之后统计文件页个数;
3>filemap_fault->__do_page_cache_readahead->read_pages->->add_to_page_cache_lru之后统计文件页个数;
4>do_generic_file_read/filemap_fault->(mapping->a_ops->readpage=squashfs_readpage)->
squashfs_readpage->grab_cache_page_nowait之后统计文件页个数,注意不统计入参的一个文件页;
文件页减少过程:
shrink_page_list->remove_mapping->__remove_mapping->__delete_from_page_cache后统计;
[其他]
vm.min_free_kbytes=409600;
vm.vfs_cache_pressure=200; // sysctl_vfs_cache_pressure
vm.swappiness=40
调整MIN_FREE_KBYTES的目的是保持物理内存有足够的空闲空间,防止突发性的换页。swapiness缺省为60,减少swapiness会使系统尽快通过swapout不使用的进程资源来释放更多的物理内存。vfs_cache_pressure的缺省值是100,加大这个参数设置了虚拟内存回收directory和i-node缓冲的倾向,这个值越大,回收的倾向越严重。调整这3个参数的目的就是让操作系统在平时就尽快回收缓冲,释放物理内存,这样就可以避免突发性的大规模换页。