linux内存管理之初始化zonelists

linux内存管理代码有些改变,记录下来。

首先是内存初始化:从 init/main.c 中带 __init 标记的 start_kernel() 函数开始,调用链为 start_kernel() --> build_all_zonelists(NULL, NULL) --> __build_all_zonelists(NULL):

/*
 * Build the zonelists and the zonelist cache of every online node, then
 * prime the per-cpu boot pagesets.
 *
 * @data: NULL on the boot path, where the chain is
 *        build_all_zonelists() -> __build_all_zonelists(NULL).
 *        Otherwise it is treated as a pg_data_t pointer; if that node is
 *        not online yet, its lists are built here too.
 *
 * Always returns 0.  The int return type exists only for stop_machine().
 */
static int __build_all_zonelists(void *data)
{
    pg_data_t *extra_pgdat = data;
    pg_data_t *node_pgdat;
    int node;
    int cpu_id;

#ifdef CONFIG_NUMA
    /* node_load[] is an array indexed by node; start from all zeroes. */
    memset(node_load, 0, sizeof(node_load));
#endif

    /* Never taken during boot, because data (and so extra_pgdat) is NULL. */
    if (extra_pgdat != NULL && !node_online(extra_pgdat->node_id)) {
        build_zonelists(extra_pgdat);
        build_zonelist_cache(extra_pgdat);
    }

    /* Visit every online node in the system. */
    for_each_online_node(node) {
        node_pgdat = NODE_DATA(node);
        build_zonelists(node_pgdat);
        build_zonelist_cache(node_pgdat);
    }

    /*
     * Set up the boot_pagesets used by bootstrapping processors.  The
     * real per-zone pagesets are allocated later, once the per cpu
     * allocator is available.
     *
     * The boot_pagesets also serve offline cpus that are bootstrapped
     * after the system has booted, since pagesets are required to
     * initialize allocators on a given cpu.  For example, the percpu
     * allocator needs the page allocator, which in turn needs the percpu
     * allocator to allocate its pagesets (a chicken-egg dilemma).
     */
    for_each_possible_cpu(cpu_id) {
        setup_pageset(&per_cpu(boot_pageset, cpu_id), 0);

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
        /*
         * The "local memory node" of each node is known by now: it is
         * the node of the first zone in the generic zonelist.  Record
         * it in the numa_mem percpu variable of on-line cpus only.
         * At boot just the boot cpu should be on-line; secondary cpus
         * get their numa_mem when they come on-line, and node/memory
         * hotplug fixes up all on-line cpus.
         */
        if (cpu_online(cpu_id)) {
            set_cpu_numa_mem(cpu_id,
                             local_memory_node(cpu_to_node(cpu_id)));
        }
#endif
    }

    return 0;
}


/*
 * Build all zonelists of @pgdat.
 *
 * Every zonelist of the node is first reset to empty.  The nodes are then
 * visited in the order returned by find_next_best_node().  Depending on
 * current_zonelist_order, each node's zones are appended immediately
 * (node order), or the node sequence is remembered in node_order[] and the
 * list is filled afterwards zone type by zone type (zone order).  Finally
 * the list restricted to this node's own zones is built.
 */
static void build_zonelists(pg_data_t *pgdat)
{
    int j, node, load;
    enum zone_type i;
    nodemask_t used_mask;
    int local_node, prev_node;
    struct zonelist *zonelist;
    int order = current_zonelist_order;

    /*
     * Initialize zonelists: make each one empty by placing the NULL
     * terminator in slot 0.  (Original note: MAX_ZONELISTS = 2.)
     */
    for (i = 0; i < MAX_ZONELISTS; i++) {
        zonelist = pgdat->node_zonelists + i;
        zonelist->_zonerefs[0].zone = NULL;
        zonelist->_zonerefs[0].zone_idx = 0;
    }

    /* NUMA-aware ordering of nodes */
    local_node = pgdat->node_id;
    load = nr_online_nodes;
    prev_node = local_node;
    nodes_clear(used_mask);

    /* node_order is declared as: static int node_order[MAX_NUMNODES] */
    memset(node_order, 0, sizeof(node_order));
    j = 0;

    /* Repeatedly pick the not-yet-used node closest to local_node. */
    while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
        /*
         * We don't want to pressure a particular node.
         * So adding penalty to the first node in same
         * distance group to make it round-robin.
         */
        if (node_distance(local_node, node) !=
            node_distance(local_node, prev_node))
            node_load[node] = load;

        prev_node = node;
        load--;
        /* The ordering mode was chosen earlier by set_zonelist_order(). */
        if (order == ZONELIST_ORDER_NODE)
            build_zonelists_in_node_order(pgdat, node);
        else
            node_order[j++] = node; /* remember order */
    }
    /*
     * In node order, node_zonelists[0]._zonerefs[] of the local node ends
     * up grouped per node:
     *   n1.m n1.h n1.n n1.d32 n1.d  n2.m n2.h n2.n n2.d32 n2.d ...
     * where n1, n2 are nodes and m = movable, h = highmem, n = normal,
     * d32 = DMA32, d = DMA.
     */

    /* Node ordering or zone ordering: decided by set_zonelist_order(). */
    if (order == ZONELIST_ORDER_ZONE) {
        /* calculate node order -- i.e., DMA last! */
        build_zonelists_in_zone_order(pgdat, j);
        /*
         * Zone order gives a different layout from the one above; the
         * list is grouped per zone type:
         *   n1.m n2.m n3.m  n1.h n2.h n3.h ...
         */
    }

    /* Build the list that contains only this node's own zones. */
    build_thisnode_zonelists(pgdat);
}


ZONELIST_ORDER_NODE :

/*
 * Append the zones of @node to node_zonelists[0] of @pgdat, so the list
 * ends up ordered by node, and by zone within each node.
 *
 * This gives maximum locality -- the normal zone overflows into the local
 * DMA zone, if any -- at the risk of exhausting the DMA zone.
 */
static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
{
    struct zonelist *zl = &pgdat->node_zonelists[0];
    int tail = 0;

    /*
     * Seek to the current NULL terminator.  On the first call the list is
     * still empty and the loop body never runs; later calls continue right
     * where the previous call wrote its terminator.
     */
    while (zl->_zonerefs[tail].zone != NULL)
        tail++;

    tail = build_zonelists_node(NODE_DATA(node), zl, tail,
                                MAX_NR_ZONES - 1);

    /* Write a fresh end marker; the next call starts appending here. */
    zl->_zonerefs[tail].zone_idx = 0;
    zl->_zonerefs[tail].zone = NULL;
}

/*
 * Copy the populated zones of @pgdat into @zonelist, walking the zone
 * types from @zone_type down to 0 (inclusive).
 *
 * @nr_zones: index of the first _zonerefs[] slot to fill.
 *
 * Returns the index just past the last slot that was filled.
 */
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
                int nr_zones, enum zone_type zone_type)
{
    BUG_ON(zone_type >= MAX_NR_ZONES);

    for (;;) {
        struct zone *cur = &pgdat->node_zones[zone_type];

        if (populated_zone(cur)) {
            zoneref_set_zone(cur, &zonelist->_zonerefs[nr_zones]);
            nr_zones++;
            check_highest_zone(zone_type);
        }

        /* Zone type 0 is the last one handled. */
        if (zone_type == 0)
            break;
        zone_type--;
    }

    return nr_zones;
}


ZONELIST_ORDER_ZONE:

/*
 * Fill node_zonelists[0] of @pgdat in zone order: for each zone type, from
 * the highest down to 0, append that zone of every node recorded in
 * node_order[0 .. nr_nodes-1], skipping zones that are not populated.
 */
static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
{
    struct zonelist *zl = &pgdat->node_zonelists[0];
    int filled = 0;
    int ztype = MAX_NR_ZONES - 1;   /* signed: must be able to drop below 0 */
    int k;

    while (ztype >= 0) {
        for (k = 0; k < nr_nodes; k++) {
            struct zone *cand =
                &NODE_DATA(node_order[k])->node_zones[ztype];

            if (!populated_zone(cand))
                continue;

            zoneref_set_zone(cand, &zl->_zonerefs[filled++]);
            check_highest_zone(ztype);
        }
        ztype--;
    }

    /* Terminate the list. */
    zl->_zonerefs[filled].zone_idx = 0;
    zl->_zonerefs[filled].zone = NULL;
}


配置 node_zonelists[1],也就是只包含本节点各 zone 的排序:

/*
 * Build the gfp_thisnode zonelist: node_zonelists[1] of @pgdat receives
 * only the zones of @pgdat itself, followed by the NULL terminator.
 */
static void build_thisnode_zonelists(pg_data_t *pgdat)
{
    struct zonelist *own = &pgdat->node_zonelists[1];
    int end = build_zonelists_node(pgdat, own, 0, MAX_NR_ZONES - 1);
    struct zoneref *terminator = &own->_zonerefs[end];

    terminator->zone = NULL;
    terminator->zone_idx = 0;
}

/*
 * Called from build_thisnode_zonelists() as:
 *     j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
 *
 * Copy the populated zones of @pgdat into @zonelist, walking the zone
 * types from @zone_type down to 0 (inclusive).
 *
 * @nr_zones: index of the first _zonerefs[] slot to fill.
 *
 * Returns the index just past the last slot that was filled.
 */
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
                int nr_zones, enum zone_type zone_type)
{
    BUG_ON(zone_type >= MAX_NR_ZONES);

    for (;;) {
        struct zone *cur = &pgdat->node_zones[zone_type];

        if (populated_zone(cur)) {
            zoneref_set_zone(cur, &zonelist->_zonerefs[nr_zones]);
            nr_zones++;
            check_highest_zone(zone_type);
        }

        /* Zone type 0 is the last one handled. */
        if (zone_type == 0)
            break;
        zone_type--;
    }

    return nr_zones;
}

/*
 * Initialise the zonelist cache embedded in node_zonelists[0] of @pgdat:
 * point zlcache_ptr at it, clear its fullzones bitmap, and store for every
 * entry of the list the value returned by zonelist_node_idx() in z_to_n[],
 * at the same index the entry has in _zonerefs[].
 */
static void build_zonelist_cache(pg_data_t *pgdat)
{
    struct zonelist *zl = &pgdat->node_zonelists[0];
    struct zonelist_cache *cache = &zl->zlcache;
    int slot;

    zl->zlcache_ptr = cache;
    bitmap_zero(cache->fullzones, MAX_ZONES_PER_ZONELIST);

    /* Walk the list up to its NULL terminator. */
    for (slot = 0; zl->_zonerefs[slot].zone != NULL; slot++)
        cache->z_to_n[slot] = zonelist_node_idx(&zl->_zonerefs[slot]);
}


主要还是建议自己阅读源代码。初始化完成后,每个 node 节点中的 node_zonelists 是这样的:

m1 == node1 movable ;    h1 == node1 highmem;    n1 == node1 normal;  d32.1 ==  node1 DMA32;  d1 == node1 DMA

m2 == node2 movable;   .........


ZONELIST_ORDER_NODE:pgdat->node_zonelists[0]._zonerefs[0 ~ 所有节点中已填充(populated)的 zone 总数 - 1] = m1 h1 n1 d32.1 d1 m2 h2 n2 d32.2 d2.....

ZONELIST_ORDER_ZONE:pgdat->node_zonelists[0]._zonerefs[0 ~ 所有节点中已填充(populated)的 zone 总数 - 1] = m1 m2 m3 h1 h2 h3 n1 n2 n3 d32.1 d32.2 d32.3 d1 d2 d3(以 3 个节点为例)

本节点:pgdat->node_zonelists[1]._zonerefs[0 ~ 本节点已填充(populated)的 zone 数 - 1] = m h n d32 d


有不正确的地方,欢迎指正,谢谢。

转载地址:http://blog.csdn.net/yuzhihui_no1/article/details/50759567

你可能感兴趣的:(linux内核)