memory management slab allocator (1)

/***************************************************************************/
 * The memory is organized in caches, one cache for each object type.
 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
 * Each cache consists of many slabs (they are small (usually one
 * page long) and always contiguous), and each slab contains multiple
 * initialized objects.
 *
 * This means, that your constructor is used only for newly allocated
 * slabs and you must pass objects with the same initializations to
 * kmem_cache_free.
 *
 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
 * normal). If you need a special memory type, then must create a new
 * cache for that memory type.
 *
 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
 *   full slabs with 0 free objects
 *   partial slabs
 *   empty slabs with no allocated objects (nothing handed out from them)
 *
 * If partial slabs exist, then new allocations come from these slabs,
 * otherwise from empty slabs or new slabs are allocated.
 *
 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
 *
 * Each cache has a short per-cpu head array, most allocs
 * and frees go into that array, and if that array overflows, then 1/2
 * of the entries in the array are given back into the global cache.
 * The head array is strictly LIFO and should improve the cache hit rates.
 * On SMP, it additionally reduces the spinlock operations.
 *
 * The c_cpuarray may not be read with enabled local interrupts -
 * it's changed with a smp_call_function().
 *
 * SMP synchronization:
 *  constructors and destructors are called without any locking.
 *  Several members in struct kmem_cache and struct slab never change, they
 *    are accessed without any locking.
 *  The per-cpu arrays are never accessed from the wrong cpu, no locking
 *    is needed, and local interrupts are disabled so slab code is
 *    preempt-safe.
 *  The non-constant members are protected with a per-cache irq spinlock.
 */

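Before digging into the implementation, here is a minimal sketch of how a client of this API is typically written. The struct, the cache name and the constructor below are made up for illustration; only the slab calls themselves are the real interface.

#include <linux/slab.h>
#include <linux/list.h>
#include <linux/errno.h>

/* illustrative object type, not taken from the kernel sources */
struct my_obj {
    int id;
    struct list_head link;
};

static struct kmem_cache *my_obj_cache;

/* constructor: runs once per object, when a new slab is populated */
static void my_obj_ctor(void *p)
{
    struct my_obj *o = p;

    o->id = 0;
    INIT_LIST_HEAD(&o->link);
}

static int my_cache_init(void)
{
    my_obj_cache = kmem_cache_create("my_obj", sizeof(struct my_obj),
                     0, SLAB_HWCACHE_ALIGN, my_obj_ctor);
    return my_obj_cache ? 0 : -ENOMEM;
}

static void my_cache_use(void)
{
    struct my_obj *o = kmem_cache_alloc(my_obj_cache, GFP_KERNEL);

    if (!o)
        return;
    /* ... use o ..., then return it in its constructed state */
    kmem_cache_free(my_obj_cache, o);
}
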
kmem_cache_create


/**
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called from within an interrupt, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * @name must be valid until the cache is destroyed. This implies that
 * the module calling this has to destroy the cache before getting unloaded.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
    unsigned long flags, void (*ctor)(void *))
{
    size_t left_over, slab_size, ralign;
    struct kmem_cache *cachep = NULL, *pc;
    gfp_t gfp;

    list_for_each_entry(pc, &cache_chain, next) {
        if (!strcmp(pc->name, name)) {
            printk(KERN_ERR
                   "kmem_cache_create: duplicate cache %s\n", name);
            dump_stack();
            goto oops;
        }
    }

    /*
     * Check that size is in terms of words.  This is needed to avoid
     * unaligned accesses for some archs when redzoning is used, and makes
     * sure any on-slab bufctl's are also correctly aligned.
     */
    if (size & (BYTES_PER_WORD - 1)) {
        size += (BYTES_PER_WORD - 1);
        size &= ~(BYTES_PER_WORD - 1);
    }

    ralign = BYTES_PER_WORD;
    align = ralign;
    gfp = GFP_KERNEL;

    /*
     * Get cache's description obj.
     * struct kmem_cache itself is also managed by a cache; the resulting
     * chicken-and-egg problem is solved by allocating that cache
     * (cache_cache) statically.
     */

    cachep = kmem_cache_zalloc(&cache_cache, gfp);
    if (!cachep)
        goto oops;
    cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids];

    /*
     * Determine if the slab management is 'on' or 'off' slab.
     * (bootstrapping cannot cope with offslab caches so don't do
     * it too early on. Always use on-slab management when
     * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak)
     */

    if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init &&
        !(flags & SLAB_NOLEAKTRACE))
        /*
         * Size is large, assume best to place the slab management obj
         * off-slab (should allow better packing of objs).
         */
        flags |= CFLGS_OFF_SLAB;

    size = ALIGN(size, align);

    left_over = calculate_slab_order(cachep, size, align, flags);
    /* size of the per-slab management data: the struct slab itself plus one kmem_bufctl_t per object */
    slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
              + sizeof(struct slab), align);

    /* slab colouring: successive slabs are offset by multiples of colour_off so their objects do not all map to the same hardware cache lines */
    cachep->colour_off = cache_line_size();
    /* Offset must be a multiple of the alignment. */
    if (cachep->colour_off < align)
        cachep->colour_off = align;
    cachep->colour = left_over / cachep->colour_off;
    cachep->slab_size = slab_size;
    cachep->flags = flags;
    cachep->gfpflags = 0;
    if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
        cachep->gfpflags |= GFP_DMA;
    cachep->buffer_size = size;
    cachep->reciprocal_buffer_size = reciprocal_value(size);
    cachep->name = name;

    setup_cpu_cache(cachep, gfp);

    /* cache setup completed, link it into the list */
    list_add(&cachep->next, &cache_chain);
oops:
    /* (error handling such as SLAB_PANIC is omitted in this excerpt) */
    return cachep;
}


calculate_slab_order

/**
 * calculate_slab_order - calculate size (page order) of slabs
 * @cachep: pointer to the cache that is being created
 * @size: size of objects to be created in this cache.
 * @align: required alignment for the objects.
 * @flags: slab allocation flags
 *
 * Given the object size, this function decides how many pages (the page
 * order) each slab occupies and how many objects fit into one slab.
 *
 * This could be made much more intelligent.  For now, try to avoid using
 * high order pages for slabs.  When the gfp() functions are more friendly
 * towards high-order requests, this should be changed.
 */

static size_t calculate_slab_order(struct kmem_cache *cachep,
            size_t size, size_t align, unsigned long flags)
{
    size_t left_over = 0;
    int gfporder;

    /* Try increasing page orders until the fit is good enough.
     * (Other exit conditions of the real function are omitted here.) */
    for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
        unsigned int num;
        size_t remainder;

        /* how many objects fit into 2^gfporder pages, and what is left over? */
        cache_estimate(gfporder, size, align, flags, &remainder, &num);
        if (!num)
            continue;

        /* Found something acceptable - save it away */
        cachep->num = num;            /* number of objects per slab */
        cachep->gfporder = gfporder;  /* page order used by each slab */
        left_over = remainder;

        /* Acceptable internal fragmentation? (waste at most 1/8 of the slab) */
        if (left_over * 8 <= (PAGE_SIZE << gfporder))
            break;
    }
    return left_over;
}
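
A rough worked example of what this produces (every number is an assumption chosen to make the arithmetic easy, not taken from a real build): PAGE_SIZE = 4096, object size 256, align = BYTES_PER_WORD = 4, sizeof(struct slab) = 44, sizeof(kmem_bufctl_t) = 4, cache_line_size() = 64, on-slab management, no debug flags. For gfporder = 0, cache_estimate() fits num = 15 objects, since 44 + 15*4 + 15*256 = 3944 <= 4096 while a 16th object would need 4204 bytes; left_over = 4096 - 3944 = 152. Because 152 * 8 = 1216 <= 4096, the order-0 layout is accepted, so each slab is a single page holding 15 objects. Back in kmem_cache_create(), slab_size = ALIGN(15*4 + 44, 4) = 104, colour_off = 64 and colour = 152 / 64 = 2, so successive slabs of this cache start their management/object area at offsets 0, 64, 0, 64, ..., spreading the objects of different slabs over different cache lines.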


setup_cpu_cache

static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
    if (g_cpucache_up == FULL)
        return enable_cpucache(cachep, gfp);
    /* (the bootstrap paths taken before g_cpucache_up reaches FULL are omitted) */
    return 0;
}


/* Called with cache_chain_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
    int err;
    int limit, shared;

    /*
     * The head array serves three purposes:
     * - create a LIFO ordering, i.e. return objects that are cache-warm
     * - reduce the number of spinlock operations.
     * - reduce the number of linked list operations on the slab and
     *   bufctl chains: array operations are cheaper.
     * The numbers are guessed, we should auto-tune as described by
     * Bonwick.
 */
    if (cachep->buffer_size > 131072)
        limit = 1;
    else if (cachep->buffer_size > PAGE_SIZE)
        limit = 8;
    else if (cachep->buffer_size > 1024)
        limit = 24;
    else if (cachep->buffer_size > 256)
        limit = 54;
    else
        limit = 120;

    /*
     * CPU bound tasks (e.g. network routing) can exhibit cpu bound
     * allocation behaviour: Most allocs on one cpu, most free operations
     * on another cpu. For these cases, an efficient object passing between
     * cpus is necessary. This is provided by a shared array. The array
     * replaces Bonwick's magazine layer.
     * On uniprocessor, it's functionally equivalent (but less efficient)
     * to a larger limit. Thus disabled by default.
 */
    shared = 0;
    if (cachep->buffer_size <= PAGE_SIZE && num_possible_cpus() > 1)
        shared = 8;


    err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
    if (err)
        printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
               cachep->name, -err);
    return err;
}
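
Continuing the illustrative 256-byte example: buffer_size = 256 is not greater than 256, so limit = 120, and do_tune_cpucache() below is called with batchcount = (120 + 1) / 2 = 60; on an SMP machine with more than one possible CPU and buffer_size <= PAGE_SIZE, shared = 8.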

do_tune_cpucache

/* Always called with the cache_chain_mutex held */
static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
                int batchcount, int shared, gfp_t gfp)
{
    struct ccupdate_struct *new;
    int i;

    new = kzalloc(sizeof(*new) + nr_cpu_ids * sizeof(struct array_cache *),
              gfp);
    if (!new)
        return -ENOMEM;

    for_each_online_cpu(i) {
        new->new[i] = alloc_arraycache(cpu_to_mem(i), limit,
                        batchcount, gfp);
        if (!new->new[i]) {
            for (i--; i >= 0; i--)
                kfree(new->new[i]);
            kfree(new);
            return -ENOMEM;
        }
    }
    new->cachep = cachep;

    on_each_cpu(do_ccupdate_local, (void *)new, 1);

    check_irq_on();
    cachep->batchcount = batchcount;
    cachep->limit = limit;
    cachep->shared = shared;

    for_each_online_cpu(i) {
        struct array_cache *ccold = new->new[i];
        if (!ccold)
            continue;
        spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
        free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i));
        spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock);
        kfree(ccold);
    }
    kfree(new);
    return alloc_kmemlist(cachep, gfp);
}
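
on_each_cpu() above runs do_ccupdate_local() on every CPU; that helper is not shown in this excerpt. Reconstructed from the same kernel generation (treat it as a sketch rather than the authoritative source), it simply swaps the cache's per-cpu array pointer for the freshly allocated one, so the old array can be drained and freed afterwards:

static void do_ccupdate_local(void *info)
{
    struct ccupdate_struct *new = info;
    struct array_cache *old;

    check_irq_off();
    old = cpu_cache_get(new->cachep);

    /* install the new per-cpu array; hand the old one back for freeing */
    new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
    new->new[smp_processor_id()] = old;
}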


alloc_kmemlist

/*
 * This initializes kmem_list3 or resizes the various caches for all
 * (NUMA memory) nodes.
 */
static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
{
    int node;
    struct kmem_list3 *l3;
    struct array_cache *new_shared;
    struct array_cache **new_alien = NULL;

    for_each_online_node(node) {

        if (use_alien_caches) {
            new_alien = alloc_alien_cache(node, cachep->limit, gfp);
            if (!new_alien)
                goto fail;
        }

        new_shared = NULL;
        if (cachep->shared) {
            new_shared = alloc_arraycache(node,
                cachep->shared*cachep->batchcount,
                    0xbaadf00d, gfp);
            if (!new_shared) {
                free_alien_cache(new_alien);
                goto fail;
            }
        }

        l3 = cachep->nodelists[node];
        if (l3) {
            struct array_cache *shared = l3->shared;

            spin_lock_irq(&l3->list_lock);

            if (shared)
                free_block(cachep, shared->entry,
                        shared->avail, node);

            l3->shared = new_shared;
            if (!l3->alien) {
                l3->alien = new_alien;
                new_alien = NULL;
            }
            l3->free_limit = (1 + nr_cpus_node(node)) *
                    cachep->batchcount + cachep->num;
            spin_unlock_irq(&l3->list_lock);
            kfree(shared);
            free_alien_cache(new_alien);
            continue;
        }
        l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
        if (!l3) {
            free_alien_cache(new_alien);
            kfree(new_shared);
            goto fail;
        }

        kmem_list3_init(l3);
        l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
                ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
        l3->shared = new_shared;
        l3->alien = new_alien;
        l3->free_limit = (1 + nr_cpus_node(node)) *
                    cachep->batchcount + cachep->num;
        cachep->nodelists[node] = l3;
    }
    return 0;

fail:
    /* (rollback of nodes that were already resized is omitted in this excerpt) */
    return -ENOMEM;
}
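
With the same illustrative numbers (batchcount = 60, num = 15) and, say, 4 CPUs on a node, free_limit = (1 + 4) * 60 + 15 = 315, i.e. the node keeps at most roughly that many free objects around before completely free slabs are given back to the page allocator.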

Note that kmem_cache_create() itself does not allocate any memory for objects; pages are only allocated on demand, when an allocation request comes in, as in kmem_cache_alloc() below.

kmem_cache_alloc

/**
 * kmem_cache_alloc - Allocate an object
 * @cachep: The cache to allocate from.
 * @flags: See kmalloc().
 *
 * Allocate an object from this cache.  The flags are only relevant
 * if the cache has no available objects.
 */
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));

    trace_kmem_cache_alloc(_RET_IP_, ret,
                   obj_size(cachep), cachep->buffer_size, flags);

    return ret;
}


static __always_inline void *
__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
{
    unsigned long save_flags;
    void *objp;

    flags &= gfp_allowed_mask;

    lockdep_trace_alloc(flags);

    if (slab_should_failslab(cachep, flags))
        return NULL;

    cache_alloc_debugcheck_before(cachep, flags);
    local_irq_save(save_flags);
    objp = __do_cache_alloc(cachep, flags);
    local_irq_restore(save_flags);
    objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
    kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
                 flags);
    prefetchw(objp);

    if (likely(objp))
        kmemcheck_slab_alloc(cachep, flags, objp, obj_size(cachep));

    if (unlikely((flags & __GFP_ZERO) && objp))
        memset(objp, 0, obj_size(cachep));

    return objp;
}

static __always_inline void *
__do_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
    return ____cache_alloc(cachep, flags);
}

____cache_alloc

static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)

{
    void *objp;
    struct array_cache *ac;

    check_irq_off();
    /* whether we can take the fast path is decided by the array_cache's avail field */
    ac = cpu_cache_get(cachep);
    if (likely(ac->avail)) {
        STATS_INC_ALLOCHIT(cachep);
        ac->touched = 1;
        objp = ac->entry[--ac->avail];
    } else {
        STATS_INC_ALLOCMISS(cachep);
        objp = cache_alloc_refill(cachep, flags);
        /*
         * the 'ac' may be updated by cache_alloc_refill(),
         * and kmemleak_erase() requires its correct value.
         */
        ac = cpu_cache_get(cachep);
    }
    /*
     * To avoid a false negative, if an object that is in one of the
     * per-CPU caches is leaked, we need to make sure kmemleak doesn't
     * treat the array pointers as a reference to the object.
     */
    if (objp)
        kmemleak_erase(&ac->entry[ac->avail]);
    return objp;
}

cache_alloc_refill

static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
{
    int batchcount;
    struct kmem_list3 *l3;
    struct array_cache *ac;
    int node;

retry:
    check_irq_off();
    node = numa_mem_id();
    ac = cpu_cache_get(cachep);
    batchcount = ac->batchcount;
    if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
        /*
         * If there was little recent activity on this cache, then
         * perform only a partial refill.  Otherwise we could generate
         * refill bouncing.
         */
        batchcount = BATCHREFILL_LIMIT;
    }
    l3 = cachep->nodelists[node];

    BUG_ON(ac->avail > 0 || !l3);
    spin_lock(&l3->list_lock);

    /*
     * See if we can refill from the shared array: transfer entries from
     * the shared array_cache to the current CPU's array_cache.
     */
    if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) {
        l3->shared->touched = 1;
        goto alloc_done;
    }
    /*
     * Two nested loops follow: the outer one is bounded by batchcount and
     * picks a slab from the partial (or, failing that, free) list; the
     * inner one runs while slabp->inuse < cachep->num and fills the
     * per-cpu array via ac->entry[ac->avail++] = slab_get_obj().
     */

    while (batchcount > 0) {
        struct list_head *entry;
        struct slab *slabp;
        /* Get slab alloc is to come from. */
        entry = l3->slabs_partial.next;
        if (entry == &l3->slabs_partial) {
            l3->free_touched = 1;
            entry = l3->slabs_free.next;
            if (entry == &l3->slabs_free)
                goto must_grow;
        }

        slabp = list_entry(entry, struct slab, list);
        check_slabp(cachep, slabp);
        check_spinlock_acquired(cachep);

        /*
         * The slab was either on partial or free list so
         * there must be at least one object available for
         * allocation.
         */
        BUG_ON(slabp->inuse >= cachep->num);

        while (slabp->inuse < cachep->num && batchcount--) {
            STATS_INC_ALLOCED(cachep);
            STATS_INC_ACTIVE(cachep);
            STATS_SET_HIGH(cachep);

            ac->entry[ac->avail++] = slab_get_obj(cachep, slabp,
                                node);
        }
        check_slabp(cachep, slabp);

        /* move slabp to correct slabp list: */
        list_del(&slabp->list);
        if (slabp->free == BUFCTL_END)
            list_add(&slabp->list, &l3->slabs_full);
        else
            list_add(&slabp->list, &l3->slabs_partial);
    }

must_grow:
    l3->free_objects -= ac->avail;
alloc_done:
    spin_unlock(&l3->list_lock);

    if (unlikely(!ac->avail)) {
        int x;
        x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL);

        /* cache_grow can reenable interrupts, then ac could change. */
        ac = cpu_cache_get(cachep);
        if (!x && ac->avail == 0)    /* no objects in sight? abort */
            return NULL;

        if (!ac->avail)        /* objects refilled by interrupt? */
            goto retry;
    }
    ac->touched = 1;
    return ac->entry[--ac->avail];
}

static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
                int nodeid)
{
    void *objp = index_to_obj(cachep, slabp, slabp->free);
    kmem_bufctl_t next;

    slabp->inuse++;
    next = slab_bufctl(slabp)[slabp->free];
#if DEBUG
    slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
    WARN_ON(slabp->nodeid != nodeid);
#endif
    slabp->free = next;

    return objp;
}

static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
                 unsigned int idx)
{
    return slab->s_mem + cache->buffer_size * idx;
}

cache_grow

/*
 * Grow (by 1) the number of slabs within a cache.  This is called by
 * kmem_cache_alloc() when there are no active objs left in a cache.
 */
static int cache_grow(struct kmem_cache *cachep,
        gfp_t flags, int nodeid, void *objp)
{
    struct slab *slabp;
    size_t offset;
    gfp_t local_flags;
    struct kmem_list3 *l3;

    /*
     * Be lazy and only check for valid flags here,  keeping it out of the
     * critical path in kmem_cache_alloc().
     */
    BUG_ON(flags & GFP_SLAB_BUG_MASK);
    local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);

    /* Take the l3 list lock to change the colour_next on this node */
    check_irq_off();
    l3 = cachep->nodelists[nodeid];
    spin_lock(&l3->list_lock);

    /* Get colour for the slab, and calculate the next value. */
    offset = l3->colour_next;
    l3->colour_next++;
    if (l3->colour_next >= cachep->colour)
        l3->colour_next = 0;
    spin_unlock(&l3->list_lock);

    offset *= cachep->colour_off;

    if (local_flags & __GFP_WAIT)
        local_irq_enable();

    /*
     * The test for missing atomic flag is performed here, rather than
     * the more obvious place, simply to reduce the critical path length
     * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
     * will eventually be caught here (where it matters).
     */
    kmem_flagcheck(cachep, flags);

    /*
     * Get mem for the objs.  Attempt to allocate a physical page from
     * 'nodeid'.
     */

    if (!objp)
        objp = kmem_getpages(cachep, local_flags, nodeid);
    if (!objp)
        goto failed;

     /* Get slab management. */
    slabp = alloc_slabmgmt(cachep, objp, offset,
            local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
    if (!slabp)
        goto opps1;

    slab_map_pages(cachep, slabp, objp);

    cache_init_objs(cachep, slabp);

    if (local_flags & __GFP_WAIT)
        local_irq_disable();
    check_irq_off();
    spin_lock(&l3->list_lock);

    /* Make slab active. */
    list_add_tail(&slabp->list, &(l3->slabs_free));
    STATS_INC_GROWN(cachep);
    l3->free_objects += cachep->num;
    spin_unlock(&l3->list_lock);
    return 1;
opps1:
    kmem_freepages(cachep, objp);
failed:
    if (local_flags & __GFP_WAIT)
        local_irq_disable();
    return 0;
}
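
alloc_slabmgmt() and slab_map_pages() are referenced above but not shown. The on-slab branch of alloc_slabmgmt() is where the colour offset computed in cache_grow() actually takes effect; the sketch below is reconstructed from the same kernel generation and simplified, so treat the field-by-field details as approximate:

static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
                   int colour_off, gfp_t local_flags, int nodeid)
{
    struct slab *slabp;

    if (OFF_SLAB(cachep)) {
        /* management structure lives in a separate cache */
        slabp = kmem_cache_alloc_node(cachep->slabp_cache,
                          local_flags, nodeid);
        if (!slabp)
            return NULL;
    } else {
        /* management structure sits at the (coloured) start of the slab */
        slabp = objp + colour_off;
        colour_off += cachep->slab_size;
    }
    slabp->inuse = 0;
    slabp->colouroff = colour_off;    /* objects begin after colour + mgmt data */
    slabp->s_mem = objp + colour_off;
    slabp->nodeid = nodeid;
    slabp->free = 0;
    return slabp;
}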



static void cache_init_objs(struct kmem_cache *cachep,
                struct slab *slabp)
{
    int i;

    for (i = 0; i < cachep->num; i++) {
        void *objp = index_to_obj(cachep, slabp, i);

        /* the constructor runs here, once per object, when the slab is created */
        if (cachep->ctor)
            cachep->ctor(objp);
        /* chain the objects into a free list: entry i points to object i + 1 */
        slab_bufctl(slabp)[i] = i + 1;
    }
    slab_bufctl(slabp)[i - 1] = BUFCTL_END;
}
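
Continuing the illustrative 15-objects-per-slab example: after cache_init_objs(), slab_bufctl(slabp) holds {1, 2, 3, ..., 14, BUFCTL_END} and slabp->free is 0 (initialized together with the management structure). The first slab_get_obj() returns index_to_obj(cachep, slabp, 0) = s_mem + 0 * buffer_size and advances slabp->free to 1; once all 15 objects have been handed out, slabp->free == BUFCTL_END and cache_alloc_refill() moves the slab onto the slabs_full list.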


The main purpose of the debug support (e.g. SLAB_POISON and SLAB_RED_ZONE mentioned above) is to make memory-corruption errors debuggable; the next topic will focus on the debug information that slab adds to memory.
