The memory-management code in the Linux kernel has changed quite a bit, mostly in the details.
Let's start the walkthrough from __get_free_pages():
/*
 * Common helper functions.
 */
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
        struct page *page;

        /*
         * __get_free_pages() returns a 32-bit address, which cannot represent
         * a highmem page
         */
        /* it returns a logical address, so an unmapped highmem page must not be used */
        VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

        page = alloc_pages(gfp_mask, order);
        if (!page)
                return 0;
        /* convert the physical page to its linear logical address */
        return (unsigned long) page_address(page);
}
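To see the function from the caller's side, here is a minimal, hedged usage sketch (a hypothetical helper in kernel-module context; error handling trimmed to the essentials):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>

/* Hypothetical helper: allocate 2^1 = 2 physically contiguous lowmem
 * pages and release them again. GFP_KERNEL may sleep, so this must not
 * be called from atomic context. */
static int demo_get_free_pages(void)
{
        unsigned long addr = __get_free_pages(GFP_KERNEL, 1);

        if (!addr)
                return -ENOMEM;

        memset((void *)addr, 0, 2 * PAGE_SIZE); /* addr is a logical address */
        free_pages(addr, 1);
        return 0;
}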
/* alloc_pages() can allocate pages from any zone */
static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
{
        return alloc_pages_current(gfp_mask, order);
}

struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
        /* get current's mempolicy; if it is NULL, fall back to the
         * mempolicy in &preferred_node_policy[node] */
        struct mempolicy *pol = get_task_policy(current);
        struct page *page;
        unsigned int cpuset_mems_cookie;

        /* use the default policy when pol is NULL, we are in interrupt
         * context, or the allocation is pinned to the local node */
        if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
                pol = &default_policy;

retry_cpuset:
        cpuset_mems_cookie = get_mems_allowed();

        /*
         * No reference counting needed for current->mempolicy
         * nor system default_policy
         */
        if (pol->mode == MPOL_INTERLEAVE)       /* interleave policy */
                page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
        else
                page = __alloc_pages_nodemask(gfp, order,
                                policy_zonelist(gfp, pol, numa_node_id()),
                                policy_nodemask(gfp, pol));

        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
                goto retry_cpuset;

        return page;
}
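For contrast with __get_free_pages(), a hedged sketch of using alloc_pages() directly (hypothetical function name; because the caller gets the struct page rather than an address, __GFP_HIGHMEM would also be legal here):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/printk.h>

/* Hypothetical helper: allocate one page and map it to a logical
 * address with page_address(), then free it. */
static int demo_alloc_pages(void)
{
        struct page *page = alloc_pages(GFP_KERNEL, 0);

        if (!page)
                return -ENOMEM;

        pr_info("page mapped at %p\n", page_address(page));
        __free_pages(page, 0);
        return 0;
}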
struct mempolicy {
        atomic_t refcnt;
        unsigned short mode;    /* See MPOL_* above */
        unsigned short flags;   /* See set_mempolicy() MPOL_F_* above */
        union {
                short            preferred_node; /* preferred */
                nodemask_t       nodes;          /* interleave/bind */
                /* undefined for default */
        } v;
        union {
                nodemask_t cpuset_mems_allowed;  /* relative to these nodes */
                nodemask_t user_nodemask;        /* nodemask passed by user */
        } w;
};

The meaning of each member is explained well enough by the in-code comments.
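To make the coupling between mode and the 'v' union concrete, a hypothetical helper (illustration only, not kernel code) showing how 'v' is read for each mode:

#include <linux/mempolicy.h>
#include <linux/nodemask.h>
#include <linux/topology.h>

/* Hypothetical illustration: which member of the 'v' union is valid
 * depends entirely on 'mode'. */
static int policy_example_node(struct mempolicy *pol)
{
        switch (pol->mode) {
        case MPOL_PREFERRED:
                /* a single node id, unless MPOL_F_LOCAL is set in flags */
                return pol->v.preferred_node;
        case MPOL_BIND:
        case MPOL_INTERLEAVE:
                /* a bitmap of allowed nodes; take the lowest set bit */
                return first_node(pol->v.nodes);
        default:
                /* MPOL_DEFAULT leaves 'v' undefined */
                return numa_node_id();
        }
}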
There are several allocation policies:

enum {
        MPOL_DEFAULT,
        MPOL_PREFERRED,
        MPOL_BIND,
        MPOL_INTERLEAVE,
        MPOL_LOCAL,
        MPOL_MAX,       /* always last member of enum */
};
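These MPOL_* values are what userspace selects through set_mempolicy(2)/mbind(2). A hedged userspace sketch (link with -lnuma; assumes nodes 0 and 1 exist) that steers the task into the MPOL_INTERLEAVE branch of alloc_pages_current() above:

#include <numaif.h>     /* set_mempolicy(), MPOL_* */
#include <stdlib.h>
#include <string.h>

int main(void)
{
        unsigned long nodemask = (1UL << 0) | (1UL << 1); /* nodes 0 and 1 */
        char *buf;

        /* maxnode tells the kernel how many bits of nodemask to read */
        if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * 8))
                return 1;

        /* pages faulted in from now on alternate between the two nodes */
        buf = malloc(1 << 20);
        if (buf)
                memset(buf, 0, 1 << 20);
        free(buf);
        return 0;
}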
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));

static unsigned interleave_nodes(struct mempolicy *policy)
{
        unsigned nid, next;
        struct task_struct *me = current;

        nid = me->il_next;
        /* policy->v.nodes is a bitmap of the existing node ids; starting
         * from nid, find the next bit that is set */
        next = next_node(nid, policy->v.nodes);
        if (next >= MAX_NUMNODES)       /* past the highest node id: wrap around and search from the start */
                next = first_node(policy->v.nodes);
        if (next < MAX_NUMNODES)
                me->il_next = next;
        return nid;
}

static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
                                          unsigned nid)
{
        struct zonelist *zl;
        struct page *page;

        /* NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags): with
         * __GFP_THISNODE this is pgdat(nid)->node_zonelists[1], otherwise
         * node_zonelists[0] */
        zl = node_zonelist(nid, gfp);
        page = __alloc_pages(gfp, order, zl);   /* this is the function we mainly analyze */
        if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
                inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
        return page;
}
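The round-robin step is easy to model outside the kernel. A standalone sketch (plain bitmask instead of nodemask_t, MAX_NUMNODES shrunk to 8) of the next_node()/first_node() wrap-around:

#include <stdio.h>

#define MAX_NUMNODES 8

/* Model of interleave_nodes(): return the node to allocate on, and
 * advance *il_next to the next set bit, wrapping past the end. */
static unsigned next_interleave(unsigned long nodes, unsigned *il_next)
{
        unsigned nid = *il_next, next;

        /* next_node(): first set bit strictly above nid */
        for (next = nid + 1; next < MAX_NUMNODES; next++)
                if (nodes & (1UL << next))
                        break;
        if (next >= MAX_NUMNODES)       /* wrap: first_node() */
                for (next = 0; next < MAX_NUMNODES; next++)
                        if (nodes & (1UL << next))
                                break;
        if (next < MAX_NUMNODES)
                *il_next = next;
        return nid;                     /* allocate on the old nid */
}

int main(void)
{
        unsigned long nodes = 0x0b;     /* nodes 0, 1 and 3 allowed */
        unsigned il_next = 0;
        int i;

        for (i = 0; i < 6; i++)
                printf("%u ", next_interleave(nodes, &il_next));
        printf("\n");                   /* prints: 0 1 3 0 1 3 */
        return 0;
}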
page = __alloc_pages_nodemask(gfp, order,
                policy_zonelist(gfp, pol, numa_node_id()),
                policy_nodemask(gfp, pol));

static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
                                        int nd)
{
        switch (policy->mode) {
        case MPOL_PREFERRED:    /* preferred mode */
                /* in this mode the 'v' member of mempolicy denotes a single
                 * node; in the other modes it is a node mask */
                if (!(policy->flags & MPOL_F_LOCAL))
                        nd = policy->v.preferred_node;
                break;
        case MPOL_BIND:
                /*
                 * Normally, MPOL_BIND allocations are node-local within the
                 * allowed nodemask. However, if __GFP_THISNODE is set and the
                 * current node isn't part of the mask, we use the zonelist for
                 * the first node in the mask instead.
                 */
                /* local allocation requested, but nd is not in the mask:
                 * take the first set bit of v as the usable node */
                if (unlikely(gfp & __GFP_THISNODE) &&
                                unlikely(!node_isset(nd, policy->v.nodes)))
                        nd = first_node(policy->v.nodes);
                break;
        default:
                BUG();
        }
        return node_zonelist(nd, gfp);  /* as above, pick one of node_zonelists[] */
}

static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
        /* Lower zones don't get a nodemask applied for MPOL_BIND */
        /*
         * From apply_policy_zone(): if policy->v.nodes has movable memory
         * only, we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
         * policy->v.nodes is intersected with node_states[N_MEMORY], so if
         * the test fails, it implies policy->v.nodes has movable memory only.
         */
        if (unlikely(policy->mode == MPOL_BIND) &&
                        apply_policy_zone(policy, gfp_zone(gfp)) &&
                        /* does it intersect current->mems_allowed? */
                        cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
                return &policy->v.nodes;

        return NULL;
}
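The MPOL_BIND branch has a direct userspace counterpart in mbind(2). A hedged sketch (link with -lnuma; assumes node 0 exists) that forces faults in a mapping through the MPOL_BIND paths above:

#include <numaif.h>     /* mbind(), MPOL_BIND */
#include <sys/mman.h>

int main(void)
{
        size_t len = 1 << 20;
        unsigned long nodemask = 1UL << 0;      /* node 0 only */
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        if (mbind(p, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, 0))
                return 1;
        p[0] = 1;       /* this fault must be satisfied from node 0 */
        munmap(p, len);
        return 0;
}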
/*
 * This is the 'heart' of the zoned buddy allocator.
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
                        struct zonelist *zonelist, nodemask_t *nodemask)
{
        /* highest zone allowed for this mask; fallback usually runs
         * high -> normal -> dma */
        enum zone_type high_zoneidx = gfp_zone(gfp_mask);
        struct zone *preferred_zone;
        struct page *page = NULL;
        /* migrate type, which selects zone->free_area.free_list[migratetype] */
        int migratetype = allocflags_to_migratetype(gfp_mask);
        unsigned int cpuset_mems_cookie;
        /* allocation flags: low watermark, and the node must belong to the
         * task's cpuset */
        int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
        struct mem_cgroup *memcg = NULL;

        gfp_mask &= gfp_allowed_mask;   /* mask off flags that are not allowed */

        lockdep_trace_alloc(gfp_mask);

        might_sleep_if(gfp_mask & __GFP_WAIT);

        if (should_fail_alloc_page(gfp_mask, order))
                return NULL;

        /*
         * Check the zones suitable for the gfp_mask contain at least one
         * valid zone. It's possible to have an empty zonelist as a result
         * of GFP_THISNODE and a memoryless node
         */
        if (unlikely(!zonelist->_zonerefs->zone))       /* no zone in the fallback list */
                return NULL;

        /*
         * Will only have any effect when __GFP_KMEMCG is set. This is
         * verified in the (always inline) callee
         */
        if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
                return NULL;

retry_cpuset:
        cpuset_mems_cookie = get_mems_allowed();

        /* The preferred zone is used for statistics later */
        /* walk every zone in the fallback list and pick the most suitable one */
        first_zones_zonelist(zonelist, high_zoneidx,
                                nodemask ? : &cpuset_current_mems_allowed,
                                &preferred_zone);
        if (!preferred_zone)
                goto out;

#ifdef CONFIG_CMA
        if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
                alloc_flags |= ALLOC_CMA;
#endif
        /* First allocation attempt */
        /* the fast path */
        page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
                        zonelist, high_zoneidx, alloc_flags,
                        preferred_zone, migratetype);
        if (unlikely(!page)) {
                /*
                 * Runtime PM, block IO and its error handling path
                 * can deadlock because I/O on the device might not
                 * complete.
                 */
                gfp_mask = memalloc_noio_flags(gfp_mask);
                /* the slow path */
                page = __alloc_pages_slowpath(gfp_mask, order,
                                zonelist, high_zoneidx, nodemask,
                                preferred_zone, migratetype);
        }

        trace_mm_page_alloc(page, order, gfp_mask, migratetype);

out:
        /*
         * When updating a task's mems_allowed, it is possible to race with
         * parallel threads in such a way that an allocation can fail while
         * the mask is being updated. If a page allocation is about to fail,
         * check if the cpuset changed during allocation and if so, retry.
         */
        if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
                goto retry_cpuset;

        memcg_kmem_commit_charge(page, memcg, order);

        return page;
}
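Before diving into the helpers, the fast path's shape can be modeled in a few lines. A standalone sketch (not kernel code; invented zone_model type, and the watermark check reduced to a single comparison) of "walk the zonelist, take the first zone that stays above its low watermark, otherwise fall to the slow path":

#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };

struct zone_model {
        enum zone_type idx;
        long free_pages;
        long watermark_low;
};

/* Model of the fast path: skip zones above high_zoneidx, return the
 * first zone whose free pages stay above the low watermark. */
static struct zone_model *fastpath(struct zone_model *zl, int n,
                                   enum zone_type high_zoneidx, long pages)
{
        int i;

        for (i = 0; i < n; i++) {
                if (zl[i].idx > high_zoneidx)   /* zone too high for this gfp */
                        continue;
                if (zl[i].free_pages - pages >= zl[i].watermark_low)
                        return &zl[i];          /* get_page_from_freelist succeeds */
        }
        return NULL;                            /* fall to __alloc_pages_slowpath */
}

int main(void)
{
        /* the zonelist is ordered highmem -> normal -> dma */
        struct zone_model zl[] = {
                { ZONE_HIGHMEM, 100, 200 },     /* below its low watermark */
                { ZONE_NORMAL,  900, 300 },
                { ZONE_DMA,      50,  20 },
        };
        struct zone_model *z = fastpath(zl, 3, ZONE_HIGHMEM, 1);

        printf("picked zone idx %d\n", z ? (int)z->idx : -1); /* 1 = ZONE_NORMAL */
        return 0;
}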
int migratetype = allocflags_to_migratetype(gfp_mask);

/* Convert GFP flags to their corresponding migrate type */
static inline int allocflags_to_migratetype(gfp_t gfp_flags)
{
        /* GFP_MOVABLE_MASK covers both the reclaimable and the movable bit */
        WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);

        /* page mobility grouping (anti-fragmentation) may be disabled,
         * typically on machines with little memory */
        if (unlikely(page_group_by_mobility_disabled))
                return MIGRATE_UNMOVABLE;       /* which is simply 0 */

        /* Group based on mobility */
        /* build the migrate type from the movable/reclaimable bits; it is
         * used later when allocating from the free lists */
        return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
                ((gfp_flags & __GFP_RECLAIMABLE) != 0);
}
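A worked example of the bit math, using the flag values from this kernel's gfp.h (__GFP_MOVABLE = 0x08u, __GFP_RECLAIMABLE = 0x10u) in a standalone model:

#include <stdio.h>

#define __GFP_MOVABLE      0x08u
#define __GFP_RECLAIMABLE  0x10u

/* Same expression as allocflags_to_migratetype(): the movable bit lands
 * in position 1, the reclaimable bit in position 0. */
static int to_migratetype(unsigned gfp)
{
        return (((gfp & __GFP_MOVABLE) != 0) << 1) |
                ((gfp & __GFP_RECLAIMABLE) != 0);
}

int main(void)
{
        printf("%d %d %d\n",
               to_migratetype(0),                  /* 0 = MIGRATE_UNMOVABLE   */
               to_migratetype(__GFP_RECLAIMABLE),  /* 1 = MIGRATE_RECLAIMABLE */
               to_migratetype(__GFP_MOVABLE));     /* 2 = MIGRATE_MOVABLE     */
        return 0;
}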
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes,
                                        struct zone **zone)
{
        return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx,
                                        nodes, zone);
}

/* Returns the next zone at or below highest_zoneidx in a zonelist */
/* highest_zoneidx is the highest zone allowed, so any zone at or below it
 * qualifies as the preferred zone */
struct zoneref *next_zones_zonelist(struct zoneref *z,  /* z points at a series of candidate zones */
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes,
                                        struct zone **zone)
{
        /*
         * Find the next suitable zone to use for the allocation.
         * Only filter based on nodemask if it's set
         */
        if (likely(nodes == NULL))      /* e.g. the interleave path: every zone is eligible */
                while (zonelist_zone_idx(z) > highest_zoneidx) /* find the preferred zone */
                        z++;
        else
                while (zonelist_zone_idx(z) > highest_zoneidx ||
                                (z->zone && !zref_in_nodemask(z, nodes)))
                        z++;

        *zone = zonelist_zone(z);
        return z;
}

In short, this yields the preferred zone from the zonelist.
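A worked example: for a GFP_KERNEL request, high_zoneidx is ZONE_NORMAL, so on a zonelist ordered HIGHMEM -> NORMAL -> DMA the loop skips the first zoneref and returns NORMAL as the preferred zone. A standalone model of that filtering:

#include <stdio.h>

int main(void)
{
        /* zone indices as in the kernel: DMA=0, NORMAL=1, HIGHMEM=2 */
        int zonerefs[] = { 2 /* HIGHMEM */, 1 /* NORMAL */, 0 /* DMA */ };
        int highest_zoneidx = 1;        /* e.g. GFP_KERNEL -> ZONE_NORMAL */
        int i = 0;

        /* skip every zoneref whose index exceeds highest_zoneidx */
        while (zonerefs[i] > highest_zoneidx)
                i++;
        printf("preferred zone idx = %d\n", zonerefs[i]); /* 1 = NORMAL */
        return 0;
}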