Memory Allocation: Pages

1. Pages

The kernel treats the physical page as the basic unit of memory management and represents every physical page in the system with a struct page structure.

struct page {
	unsigned long		flags;		/* page status bits */
	atomic_t		_count;		/* usage (reference) count */
	atomic_t		_mapcount;	/* count of page-table mappings */
	unsigned long		private;	/* owner-private data */
	struct address_space	*mapping;	/* owning address space, if any */
	pgoff_t			index;		/* offset within the mapping */
	struct list_head	lru;		/* linkage for the LRU lists */
	void			*virtual;	/* kernel virtual address, or NULL */
};

flags holds the page's status bits (dirty, locked in memory, and so on).

_count is the page's reference count; a negative value means the page is free. Query it through page_count(), which returns 0 for a free page and a positive value for a page in use.

virtual is the page's kernel virtual address. For high memory, which is mapped dynamically, this field is NULL until the page is mapped.

lru links the page into the LRU lists used by the page-reclaim algorithm.
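
In practice these fields are read through accessor helpers rather than directly. A minimal sketch, assuming a kernel-module context (page_field_demo is a made-up name for illustration):

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/printk.h>

static void page_field_demo(void)
{
	struct page *pg = alloc_page(GFP_KERNEL);	/* one physical page */

	if (!pg)
		return;
	/* flags is tested with the PageXxx() helpers, _count via page_count() */
	pr_info("dirty=%d refs=%d\n", PageDirty(pg), page_count(pg));
	__free_page(pg);	/* drop the reference taken by alloc_page */
}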

2. Zones

Linux divides the system's pages into zones, forming separate memory pools that can be drawn from according to the intended use. The exact layout depends on the hardware architecture; the common zones are ZONE_DMA, ZONE_NORMAL, and ZONE_HIGHMEM (on 32-bit x86: physical memory below 16 MB, from 16 MB to 896 MB, and above 896 MB, respectively). Each zone is described by a struct zone:
 

struct zone {
	unsigned long		watermark[NR_WMARK];
	unsigned long percpu_drift_mark;
	unsigned long		lowmem_reserve[MAX_NR_ZONES];
	unsigned long		dirty_balance_reserve;
#ifdef CONFIG_NUMA
	int node;
	unsigned long		min_unmapped_pages;
	unsigned long		min_slab_pages;
#endif
	struct per_cpu_pageset __percpu *pageset;
	spinlock_t		lock;
	int                     all_unreclaimable; /* All pages pinned */
#ifdef CONFIG_MEMORY_HOTPLUG
	/* see spanned/present_pages for more description */
	seqlock_t		span_seqlock;
#endif
	struct free_area	free_area[MAX_ORDER];

#ifndef CONFIG_SPARSEMEM
	unsigned long		*pageblock_flags;
#endif /* CONFIG_SPARSEMEM */

#ifdef CONFIG_COMPACTION
	unsigned int		compact_considered;
	unsigned int		compact_defer_shift;
	int			compact_order_failed;
#endif

	ZONE_PADDING(_pad1_)
	spinlock_t		lru_lock;
	struct lruvec		lruvec;

	struct zone_reclaim_stat reclaim_stat;

	unsigned long		pages_scanned;	   /* since last reclaim */
	unsigned long		flags;		   /* zone flags, see below */

	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
	unsigned int inactive_ratio;
	ZONE_PADDING(_pad2_)
	wait_queue_head_t	* wait_table;
	unsigned long		wait_table_hash_nr_entries;
	unsigned long		wait_table_bits;

	struct pglist_data	*zone_pgdat;
	unsigned long		zone_start_pfn;

	unsigned long		spanned_pages;	/* total size, including holes */
	unsigned long		present_pages;	/* amount of memory (excluding holes) */

	const char		*name;
} ____cacheline_internodealigned_in_smp;

The watermark array holds the zone's minimum, low, and high watermarks: WMARK_MIN, WMARK_LOW, and WMARK_HIGH. The allocator compares a zone's free-page count against these marks to decide how hard it must work to satisfy a request.
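
The check itself amounts to comparing the zone's free-page count against the selected mark. A simplified sketch of the idea behind zone_watermark_ok() (not the kernel's actual code, which also honors lowmem_reserve and walks the per-order free lists):

static bool watermark_ok_sketch(struct zone *z, unsigned int order,
				unsigned long mark)
{
	long free = zone_page_state(z, NR_FREE_PAGES);

	/* the allocation must leave at least `mark` pages free */
	return free - (1L << order) >= (long)mark;
}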

3. Getting Pages

The kernel provides a low-level mechanism for requesting memory. The core function is alloc_pages(gfp_mask, order), which allocates 2^order physically contiguous pages:

#define alloc_pages(gfp_mask, order) \
		alloc_pages_node(numa_node_id(), gfp_mask, order)

static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
						unsigned int order)
{
	/* Unknown node is current node */
	if (nid < 0)
		nid = numa_node_id();

	return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
}

static inline struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
		struct zonelist *zonelist)
{
	return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
}

alloc_pages ultimately calls __alloc_pages_nodemask, which is analyzed in detail below. alloc_pages returns a struct page pointer; page_address converts it into the corresponding logical address (the definition shown is the WANT_PAGE_VIRTUAL variant; on most configurations page_address() is a function that computes the address rather than reading page->virtual):

#define page_address(page) ((page)->virtual)
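
For example, a caller might grab eight contiguous pages (order 3) and turn the returned descriptor into a usable pointer; a hypothetical snippet:

static void *grab_eight_pages(void)
{
	struct page *pg = alloc_pages(GFP_KERNEL, 3);	/* 2^3 = 8 pages */

	if (!pg)
		return NULL;
	return page_address(pg);	/* logical address (lowmem only) */
}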

If the struct page descriptor itself is not needed, __get_free_pages can be called directly:

unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;

	/*
	 * __get_free_pages() returns a 32-bit address, which cannot represent
	 * a highmem page
	 */
	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}

As the code shows, __get_free_pages is simply alloc_pages combined with page_address.
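
A typical (hypothetical) usage pair; the order passed to free_pages must match the one used for allocation:

unsigned long addr = __get_free_pages(GFP_KERNEL, 2);	/* 2^2 = 4 pages */
if (addr) {
	/* ... use the 4-page buffer ... */
	free_pages(addr, 2);	/* same order as the allocation */
}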

The allocation mask gfp_mask is an important parameter. Its flags fall into the following classes (the excerpt after this list shows how the familiar composite masks are built from them):

  • Zone modifiers: select which zone the memory is allocated from
  • Mobility modifiers: determine whether the allocated pages are migratable
  • Watermark modifiers: control whether the emergency reserves may be tapped
  • Page-reclaim modifiers
  • Action modifiers
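
For example, in kernels of this vintage the composite masks most callers use are assembled from these modifiers in include/linux/gfp.h:

#define GFP_ATOMIC	(__GFP_HIGH)			/* may dip into reserves, never sleeps */
#define GFP_NOIO	(__GFP_WAIT)			/* may sleep, but may not start I/O */
#define GFP_NOFS	(__GFP_WAIT | __GFP_IO)		/* I/O allowed, no filesystem calls */
#define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)
#define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
#define GFP_HIGHUSER	(GFP_USER | __GFP_HIGHMEM)	/* user pages, highmem allowed */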

4. A Closer Look at __alloc_pages_nodemask

struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
			struct zonelist *zonelist, nodemask_t *nodemask)
{
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);	/* pick the highest usable zone from gfp_mask */
	struct zone *preferred_zone;
	struct page *page = NULL;
	int migratetype = allocflags_to_migratetype(gfp_mask);	/* derive the migrate type of the allocation from gfp_mask */
	unsigned int cpuset_mems_cookie;

	gfp_mask &= gfp_allowed_mask;

	lockdep_trace_alloc(gfp_mask);

	might_sleep_if(gfp_mask & __GFP_WAIT);

	if (should_fail_alloc_page(gfp_mask, order))
		return NULL;

	/*
	 * Check the zones suitable for the gfp_mask contain at least one
	 * valid zone. It's possible to have an empty zonelist as a result
	 * of GFP_THISNODE and a memoryless node
	 */
	if (unlikely(!zonelist->_zonerefs->zone))
		return NULL;

retry_cpuset:
	cpuset_mems_cookie = get_mems_allowed();

	/*
	 * The preferred zone is used for statistics later. Find the zone in
	 * the zonelist whose index matches high_zoneidx, i.e. the zone
	 * selected above.
	 */
	first_zones_zonelist(zonelist, high_zoneidx,
				nodemask ? : &cpuset_current_mems_allowed,
				&preferred_zone);
	if (!preferred_zone)
		goto out;

	/*
	 * First attempt: scan the zones starting from the preferred one
	 * until a zone with enough free pages is found, then allocate from
	 * the requested migrate-type list, falling back to the other
	 * migrate types if that list is empty.
	 */
	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
			preferred_zone, migratetype);
	/*
	 * If the first attempt fails, take the slow path, which may wake the
	 * kswapd page-out daemons, retry with relaxed constraints, etc.
	 */
	if (unlikely(!page))
		page = __alloc_pages_slowpath(gfp_mask, order,
				zonelist, high_zoneidx, nodemask,
				preferred_zone, migratetype);

	trace_mm_page_alloc(page, order, gfp_mask, migratetype);

out:
	/*
	 * When updating a task's mems_allowed, it is possible to race with
	 * parallel threads in such a way that an allocation can fail while
	 * the mask is being updated. If a page allocation is about to fail,
	 * check if the cpuset changed during allocation and if so, retry.
	 */
	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
		goto retry_cpuset;

	return page;
}
static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
		struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
		struct zone *preferred_zone, int migratetype)
{
	struct zoneref *z;
	struct page *page = NULL;
	int classzone_idx;
	struct zone *zone;
	nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
	int zlc_active = 0;		/* set if using zonelist_cache */
	int did_zlc_setup = 0;		/* just call zlc_setup() one time */

	classzone_idx = zone_idx(preferred_zone);	/* index of the preferred zone */
zonelist_scan:
	/*
	 * Scan the zonelist, looking for a zone with enough free pages,
	 * starting from the chosen zone. If high_zoneidx corresponds to
	 * ZONE_HIGHMEM the scan order is HIGHMEM -> NORMAL -> DMA; if it
	 * corresponds to ZONE_NORMAL, the order is NORMAL -> DMA.
	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
	 */
	for_each_zone_zonelist_nodemask(zone, z, zonelist,
						high_zoneidx, nodemask) {
		if (NUMA_BUILD && zlc_active &&
			!zlc_zone_worth_trying(zonelist, z, allowednodes))
				continue;
		if ((alloc_flags & ALLOC_CPUSET) &&
			!cpuset_zone_allowed_softwall(zone, gfp_mask))
				continue;
		/*
		 * When allocating a page cache page for writing, we
		 * want to get it from a zone that is within its dirty
		 * limit, such that no single zone holds more than its
		 * proportional share of globally allowed dirty pages.
		 * The dirty limits take into account the zone's
		 * lowmem reserves and high watermark so that kswapd
		 * should be able to balance it without having to
		 * write pages from its LRU list.
		 *
		 * This may look like it could increase pressure on
		 * lower zones by failing allocations in higher zones
		 * before they are full.  But the pages that do spill
		 * over are limited as the lower zones are protected
		 * by this very same mechanism.  It should not become
		 * a practical burden to them.
		 *
		 * XXX: For now, allow allocations to potentially
		 * exceed the per-zone dirty limit in the slowpath
		 * (ALLOC_WMARK_LOW unset) before going into reclaim,
		 * which is important when on a NUMA setup the allowed
		 * zones are together not big enough to reach the
		 * global limit.  The proper fix for these situations
		 * will require awareness of zones in the
		 * dirty-throttling and the flusher threads.
		 */
		if ((alloc_flags & ALLOC_WMARK_LOW) &&
		    (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))
			goto this_zone_full;

		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
			unsigned long mark;
			int ret;

			/*
			 * alloc_flags selects which watermark applies
			 * (WMARK_MIN, WMARK_LOW or WMARK_HIGH); allocation
			 * may proceed only if it leaves the zone's free
			 * pages above that mark.
			 */
			mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
			if (zone_watermark_ok(zone, order, mark,
				    classzone_idx, alloc_flags))
				goto try_this_zone;

			/* what follows is NUMA-specific zone reclaim */
			if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
				/*
				 * we do zlc_setup if there are multiple nodes
				 * and before considering the first zone allowed
				 * by the cpuset.
				 */
				allowednodes = zlc_setup(zonelist, alloc_flags);
				zlc_active = 1;
				did_zlc_setup = 1;
			}

			if (zone_reclaim_mode == 0)
				goto this_zone_full;

			/*
			 * As we may have just activated ZLC, check if the first
			 * eligible zone has failed zone_reclaim recently.
			 */
			if (NUMA_BUILD && zlc_active &&
				!zlc_zone_worth_trying(zonelist, z, allowednodes))
				continue;

			ret = zone_reclaim(zone, gfp_mask, order);
			switch (ret) {
			case ZONE_RECLAIM_NOSCAN:
				/* did not scan */
				continue;
			case ZONE_RECLAIM_FULL:
				/* scanned but unreclaimable */
				continue;
			default:
				/* did we reclaim enough */
				if (!zone_watermark_ok(zone, order, mark,
						classzone_idx, alloc_flags))
					goto this_zone_full;
			}
		}

try_this_zone:		/* allocate 2^order pages */
		page = buffered_rmqueue(preferred_zone, zone, order,
						gfp_mask, migratetype);
		if (page)
			break;
this_zone_full:
		if (NUMA_BUILD)
			zlc_mark_zone_full(zonelist, z);
	}

	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
		/* Disable zlc cache for second zonelist scan */
		zlc_active = 0;
		goto zonelist_scan;
	}
	return page;
}
/*
 * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
 * we cheat by calling it from here, in the order > 0 path.  Saves a branch
 * or two.
 */
 /*
 * This function handles two cases: a request for a single page frame and
 * a request for multiple contiguous page frames. For a single page the
 * kernel allocates from the per-CPU page cache, whose core structure is
 * likewise an array of MIGRATE_TYPES lists, except that the elements are
 * individual pages. These pages are divided into hot and cold pages: a
 * hot page is one still resident in the CPU cache, a cold page is not.
 * For single-frame requests, handing out a hot page improves efficiency.
 * Note that pages near the list head are hotter and pages near the tail
 * are colder, because a freed single page frame is inserted at the head;
 * pages near the head were freed most recently and are therefore the most
 * likely to still be in the CPU cache.
 */
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
			struct zone *zone, int order, gfp_t gfp_flags,
			int migratetype)
{
	unsigned long flags;
	struct page *page;
	int cold = !!(gfp_flags & __GFP_COLD);

again:
	if (likely(order == 0)) {     /* order 0: allocate a single page */
		struct per_cpu_pages *pcp;
		struct list_head *list;

		local_irq_save(flags);
		pcp = &this_cpu_ptr(zone->pageset)->pcp;   /* this CPU's per-cpu page lists */
		list = &pcp->lists[migratetype];
		
		/*
		 * If the list is empty there is nothing to hand out; refill
		 * it with pcp->batch single pages taken from the buddy
		 * system.
		 */
		if (list_empty(list)) {
			pcp->count += rmqueue_bulk(zone, 0,
					pcp->batch, list,
					migratetype, cold);
			if (unlikely(list_empty(list)))
				goto failed;
		}

		if (cold)   /* a cold page comes from the list tail */
			page = list_entry(list->prev, struct page, lru);
		else        /* a hot page comes from the list head */
			page = list_entry(list->next, struct page, lru);

		list_del(&page->lru);
		pcp->count--;
	} else {
		/* multiple contiguous frames are allocated via __rmqueue() below */
		if (unlikely(gfp_flags & __GFP_NOFAIL)) {
			/*
			 * __GFP_NOFAIL is not to be used in new code.
			 *
			 * All __GFP_NOFAIL callers should be fixed so that they
			 * properly detect and handle allocation failures.
			 *
			 * We most definitely don't want callers attempting to
			 * allocate greater than order-1 page units with
			 * __GFP_NOFAIL.
			 */
			WARN_ON_ONCE(order > 1);
		}
		spin_lock_irqsave(&zone->lock, flags);
		page = __rmqueue(zone, order, migratetype);   /* pick a suitable block from the zone's buddy system */
		spin_unlock(&zone->lock);
		if (!page)
			goto failed;
		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
	}

	__count_zone_vm_events(PGALLOC, zone, 1 << order);
	zone_statistics(preferred_zone, zone, gfp_flags);
	local_irq_restore(flags);

	VM_BUG_ON(bad_range(zone, page));
	if (prep_new_page(page, order, gfp_flags))
		goto again;
	return page;

failed:
	local_irq_restore(flags);
	return NULL;
}
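
As a usage note, a caller that knows the CPU will not touch the data soon (say, a buffer about to be filled by DMA) can pass __GFP_COLD to steer buffered_rmqueue toward the tail of the per-CPU list; a hypothetical one-liner:

struct page *pg = alloc_pages(GFP_KERNEL | __GFP_COLD, 0);	/* prefer a cache-cold page */
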
static struct page *__rmqueue(struct zone *zone, unsigned int order,
						int migratetype)
{
	struct page *page;

retry_reserve:
	page = __rmqueue_smallest(zone, order, migratetype);  /* allocate a block of the requested migrate type */

	/*
	 * If that fails and the migrate type is not MIGRATE_RESERVE, try the
	 * fallback migrate types (MIGRATE_RESERVE means no other migrate
	 * type is left to choose from).
	 */
	if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
		page = __rmqueue_fallback(zone, order, migratetype);

		/*
		 * Use MIGRATE_RESERVE rather than fail an allocation. goto
		 * is used because __rmqueue_smallest is an inline function
		 * and we want just one call site
		 */
		if (!page) {
			migratetype = MIGRATE_RESERVE;
			goto retry_reserve;
		}
	}

	trace_mm_page_alloc_zone_locked(page, order, migratetype);
	return page;
}
static inline
struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
						int migratetype)
{
	unsigned int current_order;
	struct free_area * area;
	struct page *page;

	/* Find a page of the appropriate size in the preferred list */
	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
		area = &(zone->free_area[current_order]);    /* free_area for the current order */
		if (list_empty(&area->free_list[migratetype]))
			continue;

		page = list_entry(area->free_list[migratetype].next,    /* first page descriptor of a suitable block */
							struct page, lru);
		list_del(&page->lru);
		rmv_page_order(page);    /* clear the buddy flag and reset page->private to 0 */
		area->nr_free--;         /* one fewer free block at this order */
		expand(zone, page, order, current_order, area, migratetype);   /* split off the unused part */
		return page;
	}

	return NULL;
}
static inline void expand(struct zone *zone, struct page *page,
	int low, int high, struct free_area *area,
	int migratetype)
{
	unsigned long size = 1 << high;

	/*
	 * The request was for order `low`, but the block taken has order
	 * `high`. While high > low, split the block in half and return the
	 * upper half to the free list one order below.
	 */
	while (high > low) {
		area--;       /* free_area of the next lower order */
		high--;       /* one more split performed */
		size >>= 1;   /* each split halves the block size */
		VM_BUG_ON(bad_range(zone, &page[size]));

#ifdef CONFIG_DEBUG_PAGEALLOC
		if (high < debug_guardpage_minorder()) {
			/*
			 * Mark as guard pages (or page), that will allow to
			 * merge back to allocator when buddy will be freed.
			 * Corresponding page table entries will not be touched,
			 * pages will stay not present in virtual address space
			 */
			INIT_LIST_HEAD(&page[size].lru);
			set_page_guard_flag(&page[size]);
			set_page_private(&page[size], high);
			/* Guard pages are not available for any usage */
			__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high));
			continue;
		}
#endif
		list_add(&page[size].lru, &area->free_list[migratetype]);
		area->nr_free++;
		set_page_order(&page[size], high);
	}
}
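
A worked trace: a request for a single page (low = 0) satisfied from an order-3 block of eight pages (high = 3) runs the loop three times. The first iteration puts pages 4..7 on the order-2 free list, the second puts pages 2..3 on the order-1 list, and the third puts page 1 on the order-0 list, leaving page 0 for the caller.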

Reference: http://blog.csdn.net/vanbreaker/article/details/7621289
