Linux memory management -- fast-path page allocation


First, the parameters passed into the fast path of page allocation:

gfp_mask  on entering the fast path, __GFP_HARDWALL is OR'ed in; it enforces the cpuset hardwall, i.e. the allocation is confined to the memory nodes allowed by the current task's cpuset;

nodemask  the node mask, a bitmap saying on which NUMA nodes memory may be allocated;

order  the allocation order: the request is for 2^order contiguous pages;

zonelist  when preferred_zone has no suitable pages to hand out, the fallback zones in this zonelist are scanned in order and tried one by one;

high_zoneidx  the highest zone this allocation may use, derived from gfp_zone(gfp_mask); fallback normally runs high -> normal -> dma, because memory gets scarcer (more "expensive") the lower you go, so allocation is attempted from high down towards dma;

alloc_flags  flags controlling how the allocation is carried out (which watermark to use, whether to try harder, and so on);

preferred_zone  the first suitable zone found at or below high_zoneidx; allocation is normally attempted from it. If that fails, the scan moves on and the next suitable zone in the zonelist becomes the new preferred candidate;

migratetype  the migration type, used as the index into zone->free_area[].free_list[]; it exists for anti-fragmentation. The old free_area structure was extended with an array indexed by migration type, and each array element heads a list of free pages of that type, as sketched below.
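For reference, a minimal sketch of what the reworked free_area looks like in kernels of this era (field names follow the mainline source):

struct free_area {
    struct list_head    free_list[MIGRATE_TYPES];   /* one free list per migration type */
    unsigned long       nr_free;                    /* number of free blocks at this order */
};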

    page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
            zonelist, high_zoneidx, alloc_flags,
            preferred_zone, migratetype);


static struct page *
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
        struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
        struct zone *preferred_zone, int migratetype)
{
    struct zoneref *z;
    struct page *page = NULL;
    int classzone_idx;
    struct zone *zone;
    nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
    int zlc_active = 0;     /* set if using zonelist_cache */
    int did_zlc_setup = 0;      /* just call zlc_setup() one time */

    classzone_idx = zone_idx(preferred_zone);// index of the preferred zone
zonelist_scan:
    /*
     * Scan zonelist, looking for a zone with enough free.
     * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
     */
    for_each_zone_zonelist_nodemask(zone, z, zonelist,
                        high_zoneidx, nodemask) {// this macro pulls suitable zones out of the zonelist->_zonerefs array; see the detailed walkthrough further down
        if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
            !zlc_zone_worth_trying(zonelist, z, allowednodes))// skip z if its node disallows allocation or the zone is already marked full
                continue;
        if ((alloc_flags & ALLOC_CPUSET) &&
            !cpuset_zone_allowed_softwall(zone, gfp_mask))// cpuset checking is enabled and this zone may not serve allocations for the current task's cpuset, so skip it
                continue;
        /*
         * When allocating a page cache page for writing, we
         * want to get it from a zone that is within its dirty
         * limit, such that no single zone holds more than its
         * proportional share of globally allowed dirty pages.
         * The dirty limits take into account the zone's
         * lowmem reserves and high watermark so that kswapd
         * should be able to balance it without having to
         * write pages from its LRU list.
         *
         * This may look like it could increase pressure on
         * lower zones by failing allocations in higher zones
         * before they are full.  But the pages that do spill
         * over are limited as the lower zones are protected
         * by this very same mechanism.  It should not become
         * a practical burden to them.
         *
         * XXX: For now, allow allocations to potentially
         * exceed the per-zone dirty limit in the slowpath
         * (ALLOC_WMARK_LOW unset) before going into reclaim,
         * which is important when on a NUMA setup the allowed
         * zones are together not big enough to reach the
         * global limit.  The proper fix for these situations
         * will require awareness of zones in the
         * dirty-throttling and the flusher threads.
         */
        if ((alloc_flags & ALLOC_WMARK_LOW) &&
            (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone))// check whether this zone's dirty pages exceed its limit
            goto this_zone_full;// over the dirty limit: mark the zone full, which balances dirty pages across zones

        BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
        if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
            unsigned long mark;
            int ret;

            mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
            if (zone_watermark_ok(zone, order, mark,
                    classzone_idx, alloc_flags))// check whether this zone has enough free pages for the request; detailed analysis below
                goto try_this_zone;

            if (IS_ENABLED(CONFIG_NUMA) &&
                    !did_zlc_setup && nr_online_nodes > 1) {
                /*
                 * we do zlc_setup if there are multiple nodes
                 * and before considering the first zone allowed
                 * by the cpuset.
                 */
                allowednodes = zlc_setup(zonelist, alloc_flags);
                zlc_active = 1;
                did_zlc_setup = 1;
            }
            // zone_watermark_ok() above said this zone cannot satisfy us; if zone reclaim is disabled, or this zone is outside the preferred zone's reclaim range, mark it full so the next scan skips it
            if (zone_reclaim_mode == 0 ||
                !zone_allows_reclaim(preferred_zone, zone))
                goto this_zone_full;

            /*
             * As we may have just activated ZLC, check if the first
             * eligible zone has failed zone_reclaim recently.
             */// zlc_active was just set above, so re-check this zone against the cache and skip it if it is not worth trying
            if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
                !zlc_zone_worth_trying(zonelist, z, allowednodes))
                continue;

            ret = zone_reclaim(zone, gfp_mask, order);// reaching this point means the zone is allowed to reclaim pages
            switch (ret) {
            case ZONE_RECLAIM_NOSCAN:
                /* did not scan */
                continue;
            case ZONE_RECLAIM_FULL:
                /* scanned but unreclaimable */
                continue;
            default:// the two cases above reclaimed nothing; getting here means some pages were reclaimed
                /* did we reclaim enough */// pages came back, so re-check whether the zone can satisfy the allocation now
                if (zone_watermark_ok(zone, order, mark,
                        classzone_idx, alloc_flags))
                    goto try_this_zone;

                /*
                 * Failed to reclaim enough to meet watermark.
                 * Only mark the zone full if checking the min
                 * watermark or if we failed to reclaim just
                 * 1<<order pages or else the page allocator
                 * fastpath will prematurely mark zones full
                 * when the watermark is between the low and
                 * min watermarks.
                 */// nothing more to do here: reclaim genuinely tried its best
                if (((alloc_flags & ALLOC_WMARK_MASK) == ALLOC_WMARK_MIN) ||
                    ret == ZONE_RECLAIM_SOME)
                    goto this_zone_full;

                continue;
            }
        }

try_this_zone:// the ideal case: actually allocate from this zone
        page = buffered_rmqueue(preferred_zone, zone, order,
                        gfp_mask, migratetype);
        if (page)
            break;
this_zone_full:// this zone is full; record that in the zonelist cache
        if (IS_ENABLED(CONFIG_NUMA))
            zlc_mark_zone_full(zonelist, z);
    }// end of the loop over candidate zones

    if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
        /* Disable zlc cache for second zonelist scan */
        zlc_active = 0;
        goto zonelist_scan;// scan the zonelist once more, this time with the zlc filter disabled
    }

    if (page)
        /*
         * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
         * necessary to allocate the page. The expectation is
         * that the caller is taking steps that will free more
         * memory. The caller should avoid the page being used
         * for !PFMEMALLOC purposes.
         */// a page obtained only by ignoring watermarks means the zone has few usable pages left, so set pfmemalloc to push the system to reclaim some memory
        page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);

    return page;
}



This macro walks the zonelist->_zonerefs array and hands back suitable zones:

 for_each_zone_zonelist_nodemask(zone, z, zonelist,
                        high_zoneidx, nodemask) {

zone is the variable available inside the loop body; the macro iterates over the elements of zonelist->_zonerefs:

#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
    for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
        zone;                           \
        z = next_zones_zonelist(++z, highidx, nodemask, &zone)) \


static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
                    enum zone_type highest_zoneidx,
                    nodemask_t *nodes,
                    struct zone **zone)
{
    return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
                                zone);
}

struct zoneref {
    struct zone *zone;  /* Pointer to actual zone */
    int zone_idx;       /* zone_idx(zoneref->zone) */
};


highest_zoneidx is the largest zone index the caller will accept; any zoneref above it is unusable, so z is advanced until one at or below the limit is found (equal being the best fit):

/* Returns the next zone at or below highest_zoneidx in a zonelist */
struct zoneref *next_zones_zonelist(struct zoneref *z, 
                    enum zone_type highest_zoneidx,
                    nodemask_t *nodes,
                    struct zone **zone)
{
    /*  
     * Find the next suitable zone to use for the allocation.
     * Only filter based on nodemask if it's set
     */
    if (likely(nodes == NULL))
        while (zonelist_zone_idx(z) > highest_zoneidx)
            z++;    
    else
        while (zonelist_zone_idx(z) > highest_zoneidx ||
                (z->zone && !zref_in_nodemask(z, nodes)))
            z++;

    *zone = zonelist_zone(z);
    return z;
}
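For completeness, the two accessors used above are trivial inline wrappers around struct zoneref (as defined in the kernel headers of this era):

static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
    return zoneref->zone;       /* the zone this zoneref points at */
}

static inline int zonelist_zone_idx(struct zoneref *zoneref)
{
    return zoneref->zone_idx;   /* cached zone_idx(zoneref->zone) */
}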



To understand zlc_zone_worth_trying(), first look at a couple of structures:

struct zonelist {
    struct zonelist_cache *zlcache_ptr;          // NULL or &zlcache
    struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
#ifdef CONFIG_NUMA
    struct zonelist_cache zlcache;               // optional ...
#endif
};


struct zonelist_cache {
    unsigned short z_to_n[MAX_ZONES_PER_ZONELIST];      /* zone->nid */
    DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST);  /* zone full? */
    unsigned long last_full_zap;        /* when last zap'd (jiffies) */
};
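The last_full_zap field is aged by zlc_setup(), which get_page_from_freelist() calls once per allocation. Roughly, from the same era of the kernel, it clears the stale "full" bits about once a second and picks the allowed-nodes mask:

static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
{
    struct zonelist_cache *zlc; /* cached zonelist speedup info */
    nodemask_t *allowednodes;   /* zonelist_cache approximation */

    zlc = zonelist->zlcache_ptr;
    if (!zlc)
        return NULL;

    /* forget all the stale "zone full" bits about once per second */
    if (time_after(jiffies, zlc->last_full_zap + HZ)) {
        bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
        zlc->last_full_zap = jiffies;
    }

    /* honour the cpuset's node mask unless we are in interrupt context */
    allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
                    &cpuset_current_mems_allowed :
                    &node_states[N_MEMORY];
    return allowednodes;
}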


static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
                        nodemask_t *allowednodes)
{
    struct zonelist_cache *zlc; /* cached zonelist speedup info */
    int i;              /* index of *z in zonelist zones */
    int n;              /* node that zone *z is on */

    zlc = zonelist->zlcache_ptr;// as the struct above shows, zlcache_ptr holds the address of zlcache
    if (!zlc)// no zlcache means a UMA (non-NUMA) configuration; every zone is worth trying
        return 1;

    i = z - zonelist->_zonerefs;// index of z within _zonerefs
    n = zlc->z_to_n[i];// map that index to the node id (nid) the zone lives on

    /* This zone is worth trying if it is allowed but not full */
    return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);// the zone is worth trying only if its node is allowed and the zone is not marked full
}
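Its counterpart zlc_mark_zone_full(), invoked at the this_zone_full label in get_page_from_freelist(), simply sets the matching bit in the fullzones bitmap:

static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
{
    struct zonelist_cache *zlc; /* cached zonelist speedup info */
    int i;                      /* index of *z in zonelist zones */

    zlc = zonelist->zlcache_ptr;
    if (!zlc)
        return;

    i = z - zonelist->_zonerefs;
    set_bit(i, zlc->fullzones);
}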


/*
 * Return true if free pages are above 'mark'. This takes into account
 * the order of the allocation.
 */
static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
              int classzone_idx, int alloc_flags, long free_pages)
{
    /* free_pages may go negative - that's OK */
    long min = mark;
    long lowmem_reserve = z->lowmem_reserve[classzone_idx];// reserved pages that may only be handed out in emergencies
    int o;
    long free_cma = 0;

    free_pages -= (1 << order) - 1;// deduct the pages this request would consume (1 << order, less one)
    if (alloc_flags & ALLOC_HIGH)
        min -= min / 2;
    if (alloc_flags & ALLOC_HARDER)
        min -= min / 4;
#ifdef CONFIG_CMA
    /* If allocation can't use CMA areas don't use free CMA pages */
    if (!(alloc_flags & ALLOC_CMA))
        free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
#endif       

    if (free_pages - free_cma <= min + lowmem_reserve)// min + lowmem_reserve is the floor; at or below it the zone must not serve this allocation
        return false;
    for (o = 0; o < order; o++) {// strip out the free pages sitting at orders below the requested order
        /* At the next order, this order's pages become unavailable */
        free_pages -= z->free_area[o].nr_free << o;

        /* Require fewer higher order pages to be free */
        min >>= 1;// fewer free pages are demanded at higher orders

        if (free_pages <= min)
            return false;
    }            
    return true;
}
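__zone_watermark_ok() is normally reached through a thin wrapper that feeds it the zone's current free-page count:

bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
              int classzone_idx, int alloc_flags)
{
    return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
                    zone_page_state(z, NR_FREE_PAGES));
}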



