Linux memory management -- preparation work for buddy-system page allocation

The Linux kernel's memory-management code has changed quite a bit, mostly in the details.


Let's start the walkthrough from __get_free_pages():

unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)

/*                      
 * Common helper functions.
 */ 
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{   
    struct page *page;
    
    /*
     * __get_free_pages() returns a 32-bit address, which cannot represent
     * a highmem page
     */
    VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);//a logical (linearly mapped) address is returned, so highmem pages, which may be unmapped, are not allowed

    page = alloc_pages(gfp_mask, order); 
    if (!page)
        return 0;
    return (unsigned long) page_address(page);//convert the physical page to its linear (logical) address
} 
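
As a quick usage illustration (a minimal sketch, not part of the kernel sources discussed here; demo_alloc() and its order value are made up for the example):

#include <linux/errno.h>
#include <linux/gfp.h>

static int demo_alloc(void)
{
    /* ask the buddy allocator for 2^2 = 4 physically contiguous pages */
    unsigned long addr = __get_free_pages(GFP_KERNEL, 2);

    if (!addr)
        return -ENOMEM;

    /* ... use the linearly mapped buffer at 'addr' ... */

    free_pages(addr, 2);    /* the order must match the allocation */
    return 0;
}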

struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)

static inline struct page *
alloc_pages(gfp_t gfp_mask, unsigned int order)//alloc_pages() can allocate pages from any zone
{
    return alloc_pages_current(gfp_mask, order);
}  

struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{   
    struct mempolicy *pol = get_task_policy(current);//get current's mempolicy; if it is NULL this returns &preferred_node_policy[node] instead
    struct page *page;
    unsigned int cpuset_mems_cookie;

    if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))//no policy, in interrupt context, or allocation pinned to this node: use the default policy
        pol = &default_policy;

retry_cpuset:
    cpuset_mems_cookie = get_mems_allowed();

    /*
     * No reference counting needed for current->mempolicy
     * nor system default_policy
     */
    if (pol->mode == MPOL_INTERLEAVE)//interleave policy
        page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
    else
        page = __alloc_pages_nodemask(gfp, order,
                policy_zonelist(gfp, pol, numa_node_id()),
                policy_nodemask(gfp, pol));

    if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
        goto retry_cpuset;

    return page;
}

In the code above,
cpuset_mems_cookie = get_mems_allowed();
...
if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
        goto retry_cpuset;
appear as a pair. This is a sequence lock, and what it protects is the current->mems_allowed member, because every page allocation has to check whether the task is allowed to allocate on a given node. If mems_allowed is modified while the allocation is in flight, then even if a page was obtained the task may no longer be allowed to use it; so once the allocation finishes we check whether mems_allowed changed in the meantime. If it did change and no page was obtained, the whole allocation is retried from the top.
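
In kernels of this vintage the two helpers are thin wrappers around a seqcount embedded in the task; roughly like the following sketch (based on include/linux/cpuset.h of that era -- verify against your own tree):

/* read-side seqcount pattern protecting current->mems_allowed */
static inline unsigned int get_mems_allowed(void)
{
    return read_seqcount_begin(&current->mems_allowed_seq);
}

static inline bool put_mems_allowed(unsigned int seq)
{
    /* true means mems_allowed was NOT changed while we were allocating */
    return !read_seqcount_retry(&current->mems_allowed_seq, seq);
}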


Next, the allocation policy. Its main job here is to pick a memory node id (and through it the pgdat structure). This is the policy structure:

struct mempolicy {
    atomic_t refcnt;
    unsigned short mode;    /* See MPOL_* above */
    unsigned short flags;   /* See set_mempolicy() MPOL_F_* above */
    union {     
        short        preferred_node; /* preferred */
        nodemask_t   nodes;     /* interleave/bind */
        /* undefined for default */
    } v;
    union {
        nodemask_t cpuset_mems_allowed; /* relative to these nodes */
        nodemask_t user_nodemask;   /* nodemask passed by user */
    } w;
};
The inline comments already explain the members fairly well.

There are several allocation policies:
enum {
    MPOL_DEFAULT,
    MPOL_PREFERRED,
    MPOL_BIND,
    MPOL_INTERLEAVE,
    MPOL_LOCAL,
    MPOL_MAX,   /* always last member of enum */
}; 
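
From user space these modes are normally selected with the set_mempolicy() system call (declared in <numaif.h>, link with -lnuma). A minimal hedged sketch; the node mask value is just an example:

#include <numaif.h>     /* set_mempolicy(), MPOL_INTERLEAVE */
#include <stdio.h>

int main(void)
{
    /* interleave this task's future page allocations over nodes 0 and 1 */
    unsigned long nodemask = (1UL << 0) | (1UL << 1);

    if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * 8) != 0) {
        perror("set_mempolicy");
        return 1;
    }
    return 0;
}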

Interleave policy:
Continuing with the code above, for the pol->mode == MPOL_INTERLEAVE case:

page = alloc_page_interleave(gfp, order, interleave_nodes(pol));

static unsigned interleave_nodes(struct mempolicy *policy)
{
    unsigned nid, next;
    struct task_struct *me = current;

    nid = me->il_next;
    next = next_node(nid, policy->v.nodes);//policy->v.nodes is a bitmap of the allowed nodes; find the next set bit after nid
    if (next >= MAX_NUMNODES)//past the highest node id, so wrap around to the first allowed node
        next = first_node(policy->v.nodes);
    if (next < MAX_NUMNODES)
        me->il_next = next;
    return nid;
}


static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
                    unsigned nid)
{
    struct zonelist *zl;
    struct page *page;

    zl = node_zonelist(nid, gfp);// NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags); with __GFP_THISNODE this is pgdat(nid)->node_zonelists[1], otherwise node_zonelists[0]
    page = __alloc_pages(gfp, order, zl);//this is the call we mainly analyze
    if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
        inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
    return page;
}

alloc_page_interleave():
    First, interleave_nodes() obtains nid, the memory node id, from current, round-robin over the allowed nodes;
    Then nid and gfp select a zonelist; node_zonelists[0] holds the zones of every memory node ordered by distance from the current node, while node_zonelists[1] holds only the local node's zones (see the zonelist initialization write-up);
    Finally it calls __alloc_pages(), which boils down to __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL).
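
To make the round-robin concrete, here is a small user-space sketch of the same wrap-around logic (illustrative only; nodes_allowed, MAX_NODES and next_node_sketch() are made-up stand-ins for policy->v.nodes, MAX_NUMNODES and next_node()):

#include <stdio.h>

#define MAX_NODES 8

/* find the next set bit strictly after nid, mirroring next_node() */
static int next_node_sketch(int nid, unsigned int mask)
{
    for (int i = nid + 1; i < MAX_NODES; i++)
        if (mask & (1U << i))
            return i;
    return MAX_NODES;                          /* "not found", like >= MAX_NUMNODES */
}

int main(void)
{
    unsigned int nodes_allowed = 0x0D;         /* nodes 0, 2 and 3 */
    int il_next = 0;                           /* plays the role of task->il_next */

    for (int i = 0; i < 6; i++) {
        int nid = il_next;                     /* node used for this allocation */
        int next = next_node_sketch(nid, nodes_allowed);
        if (next >= MAX_NODES)                 /* wrap around: first_node() */
            next = __builtin_ctz(nodes_allowed);
        il_next = next;
        printf("allocation %d -> node %d\n", i, nid);
    }
    return 0;                                  /* prints nodes 0 2 3 0 2 3 */
}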



Bind policy and preferred policy:

page = __alloc_pages_nodemask(gfp, order,
                policy_zonelist(gfp, pol, numa_node_id()),
                policy_nodemask(gfp, pol));


static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy,
    int nd)
{
    switch (policy->mode) {
    case MPOL_PREFERRED://preferred mode
        if (!(policy->flags & MPOL_F_LOCAL))
            nd = policy->v.preferred_node;//in this mode the v member of mempolicy names a single node; in the other modes it is a node mask
        break;
    case MPOL_BIND:
        /*
         * Normally, MPOL_BIND allocations are node-local within the
         * allowed nodemask.  However, if __GFP_THISNODE is set and the
         * current node isn't part of the mask, we use the zonelist for
         * the first node in the mask instead.
         */
        if (unlikely(gfp & __GFP_THISNODE) &&
                unlikely(!node_isset(nd, policy->v.nodes)))//__GFP_THISNODE was requested but nd is not in the allowed mask
            nd = first_node(policy->v.nodes);//so fall back to the first node set in v
        break;
    default:
        BUG();
    }
    return node_zonelist(nd, gfp);//pick the node_zonelists[] entry, same as above
}


static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
    /* Lower zones don't get a nodemask applied for MPOL_BIND */
    if (unlikely(policy->mode == MPOL_BIND) &&
            apply_policy_zone(policy, gfp_zone(gfp)) &&
            cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))//does v.nodes intersect current->mems_allowed?
        return &policy->v.nodes;

    return NULL;
}

The comment that had slipped into the middle of this listing belongs with the apply_policy_zone() check, which implements the middle condition:

    /*
     * if policy->v.nodes has movable memory only,
     * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only.
     *
     * policy->v.nodes is intersect with node_states[N_MEMORY].
     * so if the following test fails, it implies
     * policy->v.nodes has movable memory only.
     */

policy_zonelist(gfp, pol, numa_node_id()):
    First, in preferred mode, policy->v.preferred_node names a single node rather than a node mask;
    Then, in bind mode, if __GFP_THISNODE was requested and the current node is not in the allowed mask, the first node of the policy's mask is used instead;
    Finally, all other cases use the node returned by numa_node_id().
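
A quick illustrative example (the numbers are made up): with MPOL_BIND over nodes {1, 3} and a GFP_KERNEL allocation issued from a CPU on node 0, policy_zonelist() typically returns node 0's distance-ordered zonelist while policy_nodemask() returns {1, 3}, so __alloc_pages_nodemask() walks node 0's fallback list but only accepts zones that live on nodes 1 or 3.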


struct page *__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask)

/*      
 * This is the 'heart' of the zoned buddy allocator.
 */     
struct page *   
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
            struct zonelist *zonelist, nodemask_t *nodemask)
{   
    enum zone_type high_zoneidx = gfp_zone(gfp_mask);//the highest zone index the gfp mask allows; fallback usually runs highmem -> normal -> dma
    struct zone *preferred_zone;
    struct page *page = NULL;
    int migratetype = allocflags_to_migratetype(gfp_mask);//migrate type, used to index zone->free_area[].free_list[migratetype]

    unsigned int cpuset_mems_cookie;
    int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;//allocation flags: check against the low watermark and honor the task's cpuset
    struct mem_cgroup *memcg = NULL;

    gfp_mask &= gfp_allowed_mask;//mask off flags that are currently not allowed

    lockdep_trace_alloc(gfp_mask);

    might_sleep_if(gfp_mask & __GFP_WAIT);

    if (should_fail_alloc_page(gfp_mask, order))
        return NULL;

    /*
     * Check the zones suitable for the gfp_mask contain at least one
     * valid zone. It's possible to have an empty zonelist as a result
     * of GFP_THISNODE and a memoryless node
     */
    if (unlikely(!zonelist->_zonerefs->zone))//the fallback zonelist contains no zone at all
        return NULL;

    /*
     * Will only have any effect when __GFP_KMEMCG is set.  This is
     * verified in the (always inline) callee
     */
    if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
        return NULL;

retry_cpuset:
    cpuset_mems_cookie = get_mems_allowed();

    /* The preferred zone is used for statistics later *///walk the fallback list and pick the first suitable zone as the preferred one
    first_zones_zonelist(zonelist, high_zoneidx,
                nodemask ? : &cpuset_current_mems_allowed,
                &preferred_zone);
    if (!preferred_zone)
        goto out;

#ifdef CONFIG_CMA
    if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
        alloc_flags |= ALLOC_CMA;
#endif
    /* First allocation attempt *///fast-path allocation of the pages
    page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
            zonelist, high_zoneidx, alloc_flags,
            preferred_zone, migratetype);
    if (unlikely(!page)) {
        /*
         * Runtime PM, block IO and its error handling path
         * can deadlock because I/O on the device might not
         * complete.
         */
        gfp_mask = memalloc_noio_flags(gfp_mask);
        page = __alloc_pages_slowpath(gfp_mask, order,//slow-path allocation
                zonelist, high_zoneidx, nodemask,
                preferred_zone, migratetype);
    }

    trace_mm_page_alloc(page, order, gfp_mask, migratetype);

out:
    /*
     * When updating a task's mems_allowed, it is possible to race with
     * parallel threads in such a way that an allocation can fail while
     * the mask is being updated. If a page allocation is about to fail,
     * check if the cpuset changed during allocation and if so, retry.
     */
    if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
        goto retry_cpuset;

    memcg_kmem_commit_charge(page, memcg, order);

    return page;
}


The following is part of the anti-fragmentation work, and it changed the old zone->free_area.free_list: free_list used to link page blocks directly, but it is now an array of lists, one per migrate type.
int migratetype = allocflags_to_migratetype(gfp_mask);

/* Convert GFP flags to their corresponding migrate type */
static inline int allocflags_to_migratetype(gfp_t gfp_flags)
{   
    WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);//GFP_MOVABLE_MASK covers both the reclaimable and the movable flag
    
    if (unlikely(page_group_by_mobility_disabled))//is grouping by mobility (anti-fragmentation) disabled? it usually is on machines with little memory
        return MIGRATE_UNMOVABLE;//which is just 0

    /* Group based on mobility */
    return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
        ((gfp_flags & __GFP_RECLAIMABLE) != 0);//build the migrate type from the movable/reclaimable bits; used later when allocating
}
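
So the two gfp bits map straight onto the low migrate-type values, and those values index the per-mobility free lists inside each zone. For reference, this is roughly what the relevant definitions look like in kernels of this era (see include/linux/mmzone.h for the exact layout):

/* allocflags_to_migratetype() results:
 *   neither bit set    -> 0 == MIGRATE_UNMOVABLE    (e.g. GFP_KERNEL)
 *   __GFP_RECLAIMABLE  -> 1 == MIGRATE_RECLAIMABLE
 *   __GFP_MOVABLE      -> 2 == MIGRATE_MOVABLE      (e.g. GFP_HIGHUSER_MOVABLE)
 */

struct free_area {
    struct list_head free_list[MIGRATE_TYPES];  /* one list of free blocks per migrate type */
    unsigned long    nr_free;                   /* number of free blocks of this order */
};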


static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
                    enum zone_type highest_zoneidx,
                    nodemask_t *nodes,
                    struct zone **zone)
{
    return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
                                zone);
}   

Since highest_zoneidx is the highest zone index the allocation may use (for example, gfp_zone(GFP_KERNEL) is ZONE_NORMAL, so ZONE_HIGHMEM entries are skipped), the first zonelist entry at or below that index is the preferred zone.

/* Returns the next zone at or below highest_zoneidx in a zonelist */
struct zoneref *next_zones_zonelist(struct zoneref *z,//z points into the zonelist's array of zone references
                    enum zone_type highest_zoneidx,
                    nodemask_t *nodes,
                    struct zone **zone)
{
    /*
     * Find the next suitable zone to use for the allocation.
     * Only filter based on nodemask if it's set
     */
    if (likely(nodes == NULL))//no nodemask restriction (e.g. the interleave path passes NULL), so every node's zones are eligible
        while (zonelist_zone_idx(z) > highest_zoneidx)//skip zones above the allowed index until we reach the preferred one
            z++;
    else
        while (zonelist_zone_idx(z) > highest_zoneidx ||
                (z->zone && !zref_in_nodemask(z, nodes)))
            z++;
                    
    *zone = zonelist_zone(z);
    return z;//in effect this yields the preferred zone entry of the zonelist
}
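
Callers rarely use next_zones_zonelist() directly; the usual pattern is the for_each_zone_zonelist() iterator built on top of first_zones_zonelist()/next_zones_zonelist(). A simplified sketch of how get_page_from_freelist()-style code walks the candidate zones (not the real function body):

struct zoneref *z;
struct zone *zone;

for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
    /* zones come back in fallback order and never above high_zoneidx;
     * the real allocator checks watermarks and then tries to take pages here */
    if (zone_watermark_ok(zone, order, low_wmark_pages(zone), high_zoneidx, 0))
        break;      /* found a zone worth allocating from */
}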












