FreeBSD zone allocator

The FreeBSD zone allocator is implemented as a slab allocator, although it differs from the original slab allocator in a number of details.
In the zone allocator, a keg acts as the back end and a zone acts as the front end.
The main data structures are struct uma_keg, struct uma_zone, struct uma_slab, struct uma_cache, and struct uma_bucket.
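
Before looking at these structures, it helps to see the consumer-facing API documented in zone(9): a zone is created once with uma_zcreate, and items are then taken from and returned to it with uma_zalloc / uma_zfree (thin wrappers around uma_zalloc_arg / uma_zfree_arg with udata == NULL). The sketch below is illustrative only; "foo" and struct foo are made-up names, and the exact uma_zcreate prototype (alignment and flag argument types) varies slightly between FreeBSD releases.

#include <sys/param.h>
#include <sys/malloc.h>
#include <vm/uma.h>

struct foo {                            /* hypothetical object cached by the zone */
    int f_refs;
};

static uma_zone_t foo_zone;

static void
foo_zone_setup(void)
{
    /* One zone backed by one keg; items are cached per CPU in buckets. */
    foo_zone = uma_zcreate("foo", sizeof(struct foo),
        NULL, NULL,                     /* ctor/dtor: run on every alloc/free */
        NULL, NULL,                     /* init/fini: run when items enter/leave slabs */
        UMA_ALIGN_PTR, 0);
}

static struct foo *
foo_alloc(void)
{
    return (uma_zalloc(foo_zone, M_WAITOK | M_ZERO));
}

static void
foo_free(struct foo *fp)
{
    uma_zfree(foo_zone, fp);
}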

/* 
 * Keg management structure 
 * 
 * TODO: Optimize for cache line size 
 * 
 */  
struct uma_keg {  
     LIST_ENTRY(uma_keg) uk_link;    /* List of all kegs */  
     struct mtx  uk_lock;    /* Lock for the keg */  
     struct uma_hash uk_hash;  
     LIST_HEAD(,uma_zone)    uk_zones;   /* Keg's zones */  
     LIST_HEAD(,uma_slab)    uk_part_slab;   /* partially allocated slabs */  
     LIST_HEAD(,uma_slab)    uk_free_slab;   /* empty slab list */  
     LIST_HEAD(,uma_slab)    uk_full_slab;   /* full slabs */  
     u_int32_t   uk_recurse; /* Allocation recursion count */  
     u_int32_t   uk_align;   /* Alignment mask */  
     u_int32_t   uk_pages;   /* Total page count */  
     u_int32_t   uk_free;    /* Count of items free in slabs */  
     u_int32_t   uk_size;    /* Requested size of each item */  
     u_int32_t   uk_rsize;   /* Real size of each item */  
     u_int32_t   uk_maxpages;    /* Maximum number of pages to alloc */  
     uma_init    uk_init;    /* Keg's init routine */  
     uma_fini    uk_fini;    /* Keg's fini routine */  
     uma_alloc   uk_allocf;  /* Allocation function */  
     uma_free    uk_freef;   /* Free routine */  
     struct vm_object    *uk_obj;    /* Zone specific object */  
     vm_offset_t uk_kva;     /* Base kva for zones with objs */  
     uma_zone_t  uk_slabzone;    /* Slab zone backing us, if OFFPAGE */  
     u_int16_t   uk_pgoff;   /* Offset to uma_slab struct */  
     u_int16_t   uk_ppera;   /* pages per allocation from backend */  
     u_int16_t   uk_ipers;   /* Items per slab */  
};  

A keg keeps three lists of slabs: uk_free_slab (slabs with no items in use), uk_part_slab (partially used slabs), and uk_full_slab (fully used slabs). The uk_link field links every keg in the system into a single list whose head is stored in uma_kegs, so that zone_foreach can walk all kegs.
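
As a preview of how these lists are consumed, here is a distilled sketch of the selection logic that uma_zone_slab (shown further below) implements; it is not kernel source. Partial slabs are preferred over completely free ones to limit fragmentation, and every slab on uk_part_slab is guaranteed to hold at least one free item, so taking the list head is enough.

/* Sketch only; assumes the uma_int.h definitions quoted in this article. */
static uma_slab_t
keg_pick_slab_sketch(uma_keg_t keg)
{
    uma_slab_t slab;

    if (!LIST_EMPTY(&keg->uk_part_slab))
        return (LIST_FIRST(&keg->uk_part_slab));
    if (!LIST_EMPTY(&keg->uk_free_slab)) {
        /* Promote a completely free slab to the partial list. */
        slab = LIST_FIRST(&keg->uk_free_slab);
        LIST_REMOVE(slab, us_link);
        LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
        return (slab);
    }
    return (NULL);                      /* the keg must grow: see slab_zalloc */
}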
/* Page management structure */  
/* Sorry for the union, but space efficiency is important */  
struct uma_slab_head {  
    uma_keg_t   us_keg;         /* Keg we live in */  
    union {  
        LIST_ENTRY(uma_slab)    _us_link;   /* slabs in zone */  
        unsigned long   _us_size;   /* Size of allocation */  
    } us_type;  
    SLIST_ENTRY(uma_slab)   us_hlink;   /* Link for hash table */  
    u_int8_t    *us_data;       /* First item */  
    u_int8_t    us_flags;       /* Page flags see uma.h */  
    u_int8_t    us_freecount;   /* How many are free? */  
    u_int8_t    us_firstfree;   /* First free item index */  
};  
/* The standard slab structure */  
struct uma_slab {  
    struct uma_slab_head    us_head;    /* slab header data */  
    struct {  
        u_int8_t    us_item;  
    } us_freelist[1];           /* actual number bigger */  
};  
/* 
 * The slab structure for UMA_ZONE_REFCNT zones for whose items we 
 * maintain reference counters in the slab for. 
 */  
struct uma_slab_refcnt {  
    struct uma_slab_head    us_head;    /* slab header data */  
    struct {  
        u_int8_t    us_item;  
        u_int32_t   us_refcnt;  
    } us_freelist[1];           /* actual number bigger */  
};  
#define us_keg      us_head.us_keg  
#define us_link     us_head.us_type._us_link  
#define us_size     us_head.us_type._us_size  
#define us_hlink    us_head.us_hlink  
#define us_data     us_head.us_data  
#define us_flags    us_head.us_flags  
#define us_freecount    us_head.us_freecount  
#define us_firstfree    us_head.us_firstfree  

us_link links all slabs that belong to the same keg. For allocations made through uma_large_malloc, the slab is not managed by a keg, and us_size records the size of the allocation instead (hence the union with _us_link).
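
One detail worth spelling out: the free items inside a slab are chained together by item index, not by pointer. us_firstfree holds the index of the first free item, and us_freelist[i].us_item holds the index of the next free one (slab_zalloc below initializes the chain to 0, 1, 2, ...). A minimal sketch of walking that chain, for illustration only:

/* Sketch only; assumes the uma_int.h definitions quoted in this article. */
static void
slab_walk_free_sketch(uma_keg_t keg, uma_slab_t slab)
{
    u_int8_t idx;
    void *item;
    int n;

    idx = slab->us_firstfree;
    for (n = 0; n < slab->us_freecount; n++) {
        item = slab->us_data + keg->uk_rsize * idx; /* address of a free item */
        (void)item;                                 /* e.g. debug-check it */
        idx = slab->us_freelist[idx].us_item;       /* index of the next free item */
    }
}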
/* 
 * Structures for per cpu queues. 
 */  
struct uma_bucket {  
    LIST_ENTRY(uma_bucket)  ub_link;    /* Link into the zone */  
    int16_t ub_cnt;             /* Count of free items. */  
    int16_t ub_entries;         /* Max items. */  
    void    *ub_bucket[];           /* actual allocation storage */  
};  
typedef struct uma_bucket * uma_bucket_t;  
struct uma_cache {  
    uma_bucket_t    uc_freebucket;  /* Bucket we're freeing to */  
    uma_bucket_t    uc_allocbucket; /* Bucket to allocate from */  
    u_int64_t   uc_allocs;  /* Count of allocations */  
    u_int64_t   uc_frees;   /* Count of frees */  
};  

On allocation, the current CPU's uc_allocbucket and uc_freebucket are checked first for a cached item; on free, they are checked first for an empty slot that can hold the released item. uc_allocs and uc_frees count the allocations and frees that have so far been satisfied entirely within the per-CPU cache (they are later folded into the zone-wide counters while the zone lock is held).
/* 
 * Zone management structure  
 * 
 * TODO: Optimize for cache line size 
 * 
 */  
struct uma_zone {  
    char        *uz_name;   /* Text name of the zone */  
    struct mtx  *uz_lock;   /* Lock for the zone (keg's lock) */  
    uma_keg_t   uz_keg;     /* Our underlying Keg */  
    LIST_ENTRY(uma_zone)    uz_link;    /* List of all zones in keg */  
    LIST_HEAD(,uma_bucket)  uz_full_bucket; /* full buckets */  
    LIST_HEAD(,uma_bucket)  uz_free_bucket; /* Buckets for frees */  
    uma_ctor    uz_ctor;    /* Constructor for each allocation */  
    uma_dtor    uz_dtor;    /* Destructor */  
    uma_init    uz_init;    /* Initializer for each item */  
    uma_fini    uz_fini;    /* Discards memory */  
    u_int64_t   uz_allocs;  /* Total number of allocations */  
    u_int64_t   uz_frees;   /* Total number of frees */  
    u_int64_t   uz_fails;   /* Total number of alloc failures */  
    uint16_t    uz_fills;   /* Outstanding bucket fills */  
    uint16_t    uz_count;   /* Highest value ub_ptr can have */  
    /* 
     * This HAS to be the last item because we adjust the zone size 
     * based on NCPU and then allocate the space for the zones. 
     */  
    struct uma_cache    uz_cpu[1];  /* Per cpu caches */  
};  

The uz_full_bucket list holds buckets that still contain cached items, while uz_free_bucket holds buckets that are empty. On allocation, if the CPU cache has no cached item, a bucket from uz_full_bucket can be moved into the CPU cache to satisfy the request; on free, if the CPU cache has no free slot for the released item, an empty bucket from uz_free_bucket can be moved into the CPU cache to hold it.

The main functions that grow a keg are uma_zone_slab and slab_zalloc.
static uma_slab_t  
uma_zone_slab(uma_zone_t zone, int flags)  
{  
    uma_slab_t slab;  
    uma_keg_t keg;  
    keg = zone->uz_keg;  
    /* 
     * This is to prevent us from recursively trying to allocate 
     * buckets.  The problem is that if an allocation forces us to 
     * grab a new bucket we will call page_alloc, which will go off 
     * and cause the vm to allocate vm_map_entries.  If we need new 
     * buckets there too we will recurse in kmem_alloc and bad 
     * things happen.  So instead we return a NULL bucket, and make 
     * the code that allocates buckets smart enough to deal with it 
     * 
     * XXX: While we want this protection for the bucket zones so that 
     * recursion from the VM is handled (and the calling code that 
     * allocates buckets knows how to deal with it), we do not want 
     * to prevent allocation from the slab header zones (slabzone 
     * and slabrefzone) if uk_recurse is not zero for them.  The 
     * reason is that it could lead to NULL being returned for 
     * slab header allocations even in the M_WAITOK case, and the 
     * caller can't handle that.  
     */  
    if (keg->uk_flags & UMA_ZFLAG_INTERNAL && keg->uk_recurse != 0)  
         if (zone != slabzone && zone != slabrefzone && zone != zones)  
            return (NULL);  
    slab = NULL;  
    for (;;) {  
        /* 
         * Find a slab with some space.  Prefer slabs that are partially 
         * used over those that are totally full.  This helps to reduce 
         * fragmentation. 
         */  
        if (keg->uk_free != 0) {  
            if (!LIST_EMPTY(&keg->uk_part_slab)) {  
                slab = LIST_FIRST(&keg->uk_part_slab);  
            } else {  
                slab = LIST_FIRST(&keg->uk_free_slab);  
                LIST_REMOVE(slab, us_link);  
                LIST_INSERT_HEAD(&keg->uk_part_slab, slab,  
                    us_link);  
            }  
            return (slab);  
        }  
        /* 
         * M_NOVM means don't ask at all! 
         */  
        if (flags & M_NOVM)  
            break;  
        if (keg->uk_maxpages &&  
            keg->uk_pages >= keg->uk_maxpages) {  
            keg->uk_flags |= UMA_ZFLAG_FULL;  
            if (flags & M_NOWAIT)  
                break;  
            else  
                msleep(keg, &keg->uk_lock, PVM,  
                    "zonelimit", 0);  
            continue;  
        }  
        keg->uk_recurse++;  
        slab = slab_zalloc(zone, flags);  
        keg->uk_recurse--;  
        /* 
         * If we got a slab here it's safe to mark it partially used 
         * and return.  We assume that the caller is going to remove 
         * at least one item. 
         */  
        if (slab) {  
            LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);  
            return (slab);  
        }  
        /* 
         * We might not have been able to get a slab but another cpu 
         * could have while we were unlocked.  Check again before we 
         * fail. 
         */  
        if (flags & M_NOWAIT)  
            flags |= M_NOVM;  
    }  
    return (slab);  
}  

/* 
 * Allocate a new slab for a zone.  This does not insert the slab onto a list. 
 * 
 * Arguments: 
 *  zone  The zone to allocate slabs for 
 *  wait  Shall we wait? 
 * 
 * Returns: 
 *  The slab that was allocated or NULL if there is no memory and the 
 *  caller specified M_NOWAIT. 
 */  
static uma_slab_t  
slab_zalloc(uma_zone_t zone, int wait)  
{  
    uma_slabrefcnt_t slabref;  
    uma_slab_t slab;  
    uma_keg_t keg;  
    u_int8_t *mem;  
    u_int8_t flags;  
    int i;  
    slab = NULL;  
    keg = zone->uz_keg;  
#ifdef UMA_DEBUG  
    printf("slab_zalloc:  Allocating a new slab for %s\n", zone->uz_name);  
#endif  
    ZONE_UNLOCK(zone);  
    if (keg->uk_flags & UMA_ZONE_OFFPAGE) {  
        slab = uma_zalloc_internal(keg->uk_slabzone, NULL, wait);  
        if (slab == NULL) {  
            ZONE_LOCK(zone);  
            return NULL;  
        }  
    }  
    /* 
     * This reproduces the old vm_zone behavior of zero filling pages the 
     * first time they are added to a zone. 
     * 
     * Malloced items are zeroed in uma_zalloc. 
     */  
    if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)  
        wait |= M_ZERO;  
    else  
        wait &= ~M_ZERO;  
    mem = keg->uk_allocf(zone, keg->uk_ppera * UMA_SLAB_SIZE,  
        &flags, wait);  
    if (mem == NULL) {  
        if (keg->uk_flags & UMA_ZONE_OFFPAGE)  
            uma_zfree_internal(keg->uk_slabzone, slab, NULL,  
                SKIP_NONE, ZFREE_STATFREE);  
        ZONE_LOCK(zone);  
        return (NULL);  
    }  
    /* Point the slab into the allocated memory */  
    if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))  
        slab = (uma_slab_t )(mem + keg->uk_pgoff);  
    if ((keg->uk_flags & UMA_ZONE_MALLOC) ||  
        (keg->uk_flags & UMA_ZONE_REFCNT))  
        for (i = 0; i < keg->uk_ppera; i++)  
            vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);  
    slab->us_keg = keg;  
    slab->us_data = mem;  
    slab->us_freecount = keg->uk_ipers;  
    slab->us_firstfree = 0;  
    slab->us_flags = flags;  
    if (keg->uk_flags & UMA_ZONE_REFCNT) {  
        slabref = (uma_slabrefcnt_t)slab;  
        for (i = 0; i < keg->uk_ipers; i++) {  
            slabref->us_freelist[i].us_refcnt = 0;  
            slabref->us_freelist[i].us_item = i+1;  
        }  
    } else {  
        for (i = 0; i < keg->uk_ipers; i++)  
            slab->us_freelist[i].us_item = i+1;  
    }  
    if (keg->uk_init != NULL) {  
        for (i = 0; i < keg->uk_ipers; i++)  
            if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),  
                keg->uk_size, wait) != 0)  
                break;  
        if (i != keg->uk_ipers) {  
            if (keg->uk_fini != NULL) {  
                for (i--; i > -1; i--)  
                    keg->uk_fini(slab->us_data +  
                        (keg->uk_rsize * i),  
                        keg->uk_size);  
            }  
            if ((keg->uk_flags & UMA_ZONE_MALLOC) ||  
                (keg->uk_flags & UMA_ZONE_REFCNT)) {  
                vm_object_t obj;  
                if (flags & UMA_SLAB_KMEM)  
                    obj = kmem_object;  
                else if (flags & UMA_SLAB_KERNEL)  
                    obj = kernel_object;  
                else  
                    obj = NULL;  
                for (i = 0; i < keg->uk_ppera; i++)  
                    vsetobj((vm_offset_t)mem +  
                        (i * PAGE_SIZE), obj);  
            }  
            if (keg->uk_flags & UMA_ZONE_OFFPAGE)  
                uma_zfree_internal(keg->uk_slabzone, slab,  
                    NULL, SKIP_NONE, ZFREE_STATFREE);  
            keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera,  
                flags);
            ZONE_LOCK(zone);  
            return (NULL);  
        }  
    }  
    ZONE_LOCK(zone);  
    if (keg->uk_flags & UMA_ZONE_HASH)  
        UMA_HASH_INSERT(&keg->uk_hash, slab, mem);  
    keg->uk_pages += keg->uk_ppera;  
    keg->uk_free += keg->uk_ipers;  
    return (slab);  
}  

uma_zone_slab returns a slab that is not fully used. If uk_free is 0 when it is called (in which case both uk_free_slab and uk_part_slab are empty), slab_zalloc is called to allocate a new slab. slab_zalloc obtains the item memory through keg->uk_allocf and, depending on keg->uk_flags, either allocates the slab header from keg->uk_slabzone (UMA_ZONE_OFFPAGE) or places it at the end of the item block (at offset uk_pgoff). If slab_zalloc succeeds, keg->uk_pages grows by keg->uk_ppera. On a later call to uma_zone_slab for the same keg with uk_free still 0, if uk_maxpages is set and uk_pages has reached uk_maxpages, the M_WAITOK path sleeps in msleep on the keg with wait message "zonelimit" (uk_lock is dropped while sleeping) until wakeup(keg) in uma_zfree_internal releases it (see lines 1977-2005, the msleep path in uma_zone_slab above).
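
The uk_maxpages limit that triggers this sleeping is normally established through the public uma_zone_set_max interface, which converts an item count into a page count. A hypothetical usage sketch (foo_zone and limit are made-up names):

static void
foo_zone_limit_sketch(uma_zone_t foo_zone, int limit)
{
    /*
     * Cap the zone at roughly `limit` items.  Once the backing keg has
     * grown to uk_maxpages pages, M_WAITOK allocations sleep on the
     * "zonelimit" channel in uma_zone_slab and M_NOWAIT allocations fail.
     */
    uma_zone_set_max(foo_zone, limit);
}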

uma_zalloc_arg allocates an item from a zone.
void *  
uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)  
{  
    void *item;  
    uma_cache_t cache;  
    uma_bucket_t bucket;  
    int cpu;  
    /* This is the fast path allocation */  
#ifdef UMA_DEBUG_ALLOC_1  
    printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);  
#endif  
    CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread,  
        zone->uz_name, flags);  
    if (flags & M_WAITOK) {  
        WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,  
            "uma_zalloc_arg: zone \"%s\"", zone->uz_name);  
    }  
    /* 
     * If possible, allocate from the per-CPU cache.  There are two 
     * requirements for safe access to the per-CPU cache: (1) the thread 
     * accessing the cache must not be preempted or yield during access, 
     * and (2) the thread must not migrate CPUs without switching which 
     * cache it accesses.  We rely on a critical section to prevent 
     * preemption and migration.  We release the critical section in 
     * order to acquire the zone mutex if we are unable to allocate from 
     * the current cache; when we re-acquire the critical section, we 
     * must detect and handle migration if it has occurred. 
     */  
zalloc_restart:  
    critical_enter();  
    cpu = curcpu;  
    cache = &zone->uz_cpu[cpu];  
zalloc_start:  
    bucket = cache->uc_allocbucket;  
    if (bucket) {  
        if (bucket->ub_cnt > 0) {  
            bucket->ub_cnt--;  
            item = bucket->ub_bucket[bucket->ub_cnt];  
#ifdef INVARIANTS  
            bucket->ub_bucket[bucket->ub_cnt] = NULL;  
#endif  
            KASSERT(item != NULL,  
                ("uma_zalloc: Bucket pointer mangled."));  
            cache->uc_allocs++;  
            critical_exit();  
#ifdef INVARIANTS  
            ZONE_LOCK(zone);  
            uma_dbg_alloc(zone, NULL, item);  
            ZONE_UNLOCK(zone);  
#endif  
            if (zone->uz_ctor != NULL) {  
                if (zone->uz_ctor(item, zone->uz_keg->uk_size,  
                    udata, flags) != 0) {  
                    uma_zfree_internal(zone, item, udata,  
                        SKIP_DTOR, ZFREE_STATFAIL |  
                        ZFREE_STATFREE);  
                    return (NULL);  
                }  
            }  
            if (flags & M_ZERO)  
                bzero(item, zone->uz_keg->uk_size);  
            return (item);  
        } else if (cache->uc_freebucket) {  
            /* 
             * We have run out of items in our allocbucket. 
             * See if we can switch with our free bucket. 
             */  
            if (cache->uc_freebucket->ub_cnt > 0) {  
#ifdef UMA_DEBUG_ALLOC  
                printf("uma_zalloc: Swapping empty with"  
                    " alloc.\n");  
#endif  
                bucket = cache->uc_freebucket;  
                cache->uc_freebucket = cache->uc_allocbucket;  
                cache->uc_allocbucket = bucket;  
                goto zalloc_start;  
            }  
        }  
    }  
    /* 
     * Attempt to retrieve the item from the per-CPU cache has failed, so 
     * we must go back to the zone.  This requires the zone lock, so we 
     * must drop the critical section, then re-acquire it when we go back 
     * to the cache.  Since the critical section is released, we may be 
     * preempted or migrate.  As such, make sure not to maintain any 
     * thread-local state specific to the cache from prior to releasing 
     * the critical section. 
     */  
    critical_exit();  
    ZONE_LOCK(zone);  
    critical_enter();  
    cpu = curcpu;  
    cache = &zone->uz_cpu[cpu];  
    bucket = cache->uc_allocbucket;  
    if (bucket != NULL) {  
        if (bucket->ub_cnt > 0) {  
            ZONE_UNLOCK(zone);  
            goto zalloc_start;  
        }  
        bucket = cache->uc_freebucket;  
        if (bucket != NULL && bucket->ub_cnt > 0) {  
            ZONE_UNLOCK(zone);  
            goto zalloc_start;  
        }  
    }  
    /* Since we have locked the zone we may as well send back our stats */  
    zone->uz_allocs += cache->uc_allocs;  
    cache->uc_allocs = 0;  
    zone->uz_frees += cache->uc_frees;  
    cache->uc_frees = 0;  
    /* Our old one is now a free bucket */  
    if (cache->uc_allocbucket) {  
        KASSERT(cache->uc_allocbucket->ub_cnt == 0,  
            ("uma_zalloc_arg: Freeing a non free bucket."));  
        LIST_INSERT_HEAD(&zone->uz_free_bucket,  
            cache->uc_allocbucket, ub_link);  
        cache->uc_allocbucket = NULL;  
    }  
    /* Check the free list for a new alloc bucket */  
    if ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {  
        KASSERT(bucket->ub_cnt != 0,  
            ("uma_zalloc_arg: Returning an empty bucket."));  
        LIST_REMOVE(bucket, ub_link);  
        cache->uc_allocbucket = bucket;  
        ZONE_UNLOCK(zone);  
        goto zalloc_start;  
    }  
    /* We are no longer associated with this CPU. */  
    critical_exit();  
    /* Bump up our uz_count so we get here less */  
    if (zone->uz_count < BUCKET_MAX)  
        zone->uz_count++;  
    /* 
     * Now lets just fill a bucket and put it on the free list.  If that 
     * works we'll restart the allocation from the begining. 
     */  
    if (uma_zalloc_bucket(zone, flags)) {  
        ZONE_UNLOCK(zone);  
        goto zalloc_restart;  
    }  
    ZONE_UNLOCK(zone);  
    /* 
     * We may not be able to get a bucket so return an actual item. 
     */  
#ifdef UMA_DEBUG  
    printf("uma_zalloc_arg: Bucketzone returned NULL\n");  
#endif  
    return (uma_zalloc_internal(zone, udata, flags));  
}  

uma_zalloc_arg first tries to allocate an item from the current CPU's cache. If that fails and the zone's uz_full_bucket list is not empty, its first bucket is moved into the CPU cache and the allocation is retried; otherwise uma_zalloc_bucket is called to fill a new bucket and insert it into uz_full_bucket. If uma_zalloc_bucket also fails, uma_zalloc_internal is called to allocate the item directly from a slab.
static int  
uma_zalloc_bucket(uma_zone_t zone, int flags)  
{  
    uma_bucket_t bucket;  
    uma_slab_t slab;  
    int16_t saved;  
    int max, origflags = flags;  
    /* 
     * Try this zone's free list first so we don't allocate extra buckets. 
     */  
    if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {  
        KASSERT(bucket->ub_cnt == 0,  
            ("uma_zalloc_bucket: Bucket on free list is not empty."));  
        LIST_REMOVE(bucket, ub_link);  
    } else {  
        int bflags;  
        bflags = (flags & ~M_ZERO);  
        if (zone->uz_keg->uk_flags & UMA_ZFLAG_CACHEONLY)  
            bflags |= M_NOVM;  
        ZONE_UNLOCK(zone);  
        bucket = bucket_alloc(zone->uz_count, bflags);  
        ZONE_LOCK(zone);  
    }  
    if (bucket == NULL)  
        return (0);  
#ifdef SMP  
    /* 
     * This code is here to limit the number of simultaneous bucket fills 
     * for any given zone to the number of per cpu caches in this zone. This 
     * is done so that we don't allocate more memory than we really need. 
     */  
    if (zone->uz_fills >= mp_ncpus)  
        goto done;  
#endif  
    zone->uz_fills++;  
    max = MIN(bucket->ub_entries, zone->uz_count);  
    /* Try to keep the buckets totally full */  
    saved = bucket->ub_cnt;  
    while (bucket->ub_cnt < max &&  
        (slab = uma_zone_slab(zone, flags)) != NULL) {  
        while (slab->us_freecount && bucket->ub_cnt < max) {  
            bucket->ub_bucket[bucket->ub_cnt++] =  
                uma_slab_alloc(zone, slab);  
        }  
        /* Don't block on the next fill */  
        flags |= M_NOWAIT;  
    }  
    /* 
     * We unlock here because we need to call the zone's init. 
     * It should be safe to unlock because the slab dealt with 
     * above is already on the appropriate list within the keg 
     * and the bucket we filled is not yet on any list, so we 
     * own it. 
     */  
    if (zone->uz_init != NULL) {  
        int i;  
        ZONE_UNLOCK(zone);  
        for (i = saved; i < bucket->ub_cnt; i++)  
            if (zone->uz_init(bucket->ub_bucket[i],  
                zone->uz_keg->uk_size, origflags) != 0)  
                break;  
        /* 
         * If we couldn't initialize the whole bucket, put the 
         * rest back onto the freelist. 
         */  
        if (i != bucket->ub_cnt) {  
            int j;  
            for (j = i; j < bucket->ub_cnt; j++) {  
                uma_zfree_internal(zone, bucket->ub_bucket[j],  
                    NULL, SKIP_FINI, 0);  
#ifdef INVARIANTS  
                bucket->ub_bucket[j] = NULL;  
#endif  
            }  
            bucket->ub_cnt = i;  
        }  
        ZONE_LOCK(zone);  
    }  
    zone->uz_fills--;  
    if (bucket->ub_cnt != 0) {  
        LIST_INSERT_HEAD(&zone->uz_full_bucket,  
            bucket, ub_link);  
        return (1);  
    }  
#ifdef SMP  
done:  
#endif  
    bucket_free(bucket);  
    return (0);  
}  
/* 
 * Allocates an item for an internal zone 
 * 
 * Arguments 
 *  zone   The zone to alloc for. 
 *  udata  The data to be passed to the constructor. 
 *  flags  M_WAITOK, M_NOWAIT, M_ZERO. 
 * 
 * Returns 
 *  NULL if there is no memory and M_NOWAIT is set 
 *  An item if successful 
 */  
static void *  
uma_zalloc_internal(uma_zone_t zone, void *udata, int flags)  
{  
    uma_keg_t keg;  
    uma_slab_t slab;  
    void *item;  
    item = NULL;  
    keg = zone->uz_keg;  
#ifdef UMA_DEBUG_ALLOC  
    printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);  
#endif  
    ZONE_LOCK(zone);  
    slab = uma_zone_slab(zone, flags);  
    if (slab == NULL) {  
        zone->uz_fails++;  
        ZONE_UNLOCK(zone);  
        return (NULL);  
    }  
    item = uma_slab_alloc(zone, slab);  
    zone->uz_allocs++;  
    ZONE_UNLOCK(zone);  
    /* 
     * We have to call both the zone's init (not the keg's init) 
     * and the zone's ctor.  This is because the item is going from 
     * a keg slab directly to the user, and the user is expecting it 
     * to be both zone-init'd as well as zone-ctor'd. 
     */  
    if (zone->uz_init != NULL) {  
        if (zone->uz_init(item, keg->uk_size, flags) != 0) {  
            uma_zfree_internal(zone, item, udata, SKIP_FINI,  
                ZFREE_STATFAIL | ZFREE_STATFREE);  
            return (NULL);  
        }  
    }  
    if (zone->uz_ctor != NULL) {  
        if (zone->uz_ctor(item, keg->uk_size, udata, flags) != 0) {  
            uma_zfree_internal(zone, item, udata, SKIP_DTOR,  
                ZFREE_STATFAIL | ZFREE_STATFREE);  
            return (NULL);  
        }  
    }  
    if (flags & M_ZERO)  
        bzero(item, keg->uk_size);  
    return (item);  
} 

uma_zalloc_internal first calls uma_zone_slab to get a slab that still has free items, then calls uma_slab_alloc to carve one item out of that slab. uma_zone_slab assumes the caller will take at least one item from the returned slab, so if the slab it returns is completely unused it is moved from uk_free_slab to uk_part_slab before being returned. uma_slab_alloc, in turn, assumes the slab is already on the partial list, so the only list manipulation it needs is moving the slab to uk_full_slab when us_freecount drops to 0.
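
uma_slab_alloc itself is not reproduced in this article; the following is a reconstruction of its core logic from the fields discussed above. It is a sketch, not the verbatim kernel source, and it omits the uma_slabrefcnt_t variant used by UMA_ZONE_REFCNT zones.

static void *
uma_slab_alloc_sketch(uma_zone_t zone, uma_slab_t slab)
{
    uma_keg_t keg;
    void *item;
    u_int8_t freei;

    keg = zone->uz_keg;

    /* Pop the head of the embedded free-index chain. */
    freei = slab->us_firstfree;
    slab->us_firstfree = slab->us_freelist[freei].us_item;
    item = slab->us_data + (keg->uk_rsize * freei);

    slab->us_freecount--;
    keg->uk_free--;

    /* The slab just ran out of items: move it to the full-slab list. */
    if (slab->us_freecount == 0) {
        LIST_REMOVE(slab, us_link);
        LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link);
    }
    return (item);
}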

uma_zfree_arg returns an item previously allocated from a zone back to that zone.
void  
uma_zfree_arg(uma_zone_t zone, void *item, void *udata)  
{  
    uma_keg_t keg;  
    uma_cache_t cache;  
    uma_bucket_t bucket;  
    int bflags;  
    int cpu;  
    keg = zone->uz_keg;  
#ifdef UMA_DEBUG_ALLOC_1  
    printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);  
#endif  
    CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,  
        zone->uz_name);  
    if (zone->uz_dtor)  
        zone->uz_dtor(item, keg->uk_size, udata);  
#ifdef INVARIANTS  
    ZONE_LOCK(zone);  
    if (keg->uk_flags & UMA_ZONE_MALLOC)  
        uma_dbg_free(zone, udata, item);  
    else  
        uma_dbg_free(zone, NULL, item);  
    ZONE_UNLOCK(zone);  
#endif  
    /* 
     * The race here is acceptable.  If we miss it we'll just have to wait 
     * a little longer for the limits to be reset. 
     */  
    if (keg->uk_flags & UMA_ZFLAG_FULL)  
        goto zfree_internal;  
    /* 
     * If possible, free to the per-CPU cache.  There are two 
     * requirements for safe access to the per-CPU cache: (1) the thread 
     * accessing the cache must not be preempted or yield during access, 
     * and (2) the thread must not migrate CPUs without switching which 
     * cache it accesses.  We rely on a critical section to prevent 
     * preemption and migration.  We release the critical section in 
     * order to acquire the zone mutex if we are unable to free to the 
     * current cache; when we re-acquire the critical section, we must 
     * detect and handle migration if it has occurred. 
     */  
zfree_restart:  
    critical_enter();  
    cpu = curcpu;  
    cache = &zone->uz_cpu[cpu];  
zfree_start:  
    bucket = cache->uc_freebucket;  
    if (bucket) {  
        /* 
         * Do we have room in our bucket? It is OK for this uz count 
         * check to be slightly out of sync. 
         */  
        if (bucket->ub_cnt < bucket->ub_entries) {  
            KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,  
                ("uma_zfree: Freeing to non free bucket index."));  
            bucket->ub_bucket[bucket->ub_cnt] = item;  
            bucket->ub_cnt++;  
            cache->uc_frees++;  
            critical_exit();  
            return;  
        } else if (cache->uc_allocbucket) {  
#ifdef UMA_DEBUG_ALLOC  
            printf("uma_zfree: Swapping buckets.\n");  
#endif  
            /* 
             * We have run out of space in our freebucket. 
             * See if we can switch with our alloc bucket. 
             */  
            if (cache->uc_allocbucket->ub_cnt <  
                cache->uc_freebucket->ub_cnt) {  
                bucket = cache->uc_freebucket;  
                cache->uc_freebucket = cache->uc_allocbucket;  
                cache->uc_allocbucket = bucket;  
                goto zfree_start;  
            }  
        }  
    }  
    /* 
     * We can get here for two reasons: 
     * 
     * 1) The buckets are NULL 
     * 2) The alloc and free buckets are both somewhat full. 
     * 
     * We must go back the zone, which requires acquiring the zone lock, 
     * which in turn means we must release and re-acquire the critical 
     * section.  Since the critical section is released, we may be 
     * preempted or migrate.  As such, make sure not to maintain any 
     * thread-local state specific to the cache from prior to releasing 
     * the critical section. 
     */  
    critical_exit();  
    ZONE_LOCK(zone);  
    critical_enter();  
    cpu = curcpu;  
    cache = &zone->uz_cpu[cpu];  
    if (cache->uc_freebucket != NULL) {  
        if (cache->uc_freebucket->ub_cnt <  
            cache->uc_freebucket->ub_entries) {  
            ZONE_UNLOCK(zone);  
            goto zfree_start;  
        }  
        if (cache->uc_allocbucket != NULL &&  
            (cache->uc_allocbucket->ub_cnt <  
            cache->uc_freebucket->ub_cnt)) {  
            ZONE_UNLOCK(zone);  
            goto zfree_start;  
        }  
    }  
    /* Since we have locked the zone we may as well send back our stats */  
    zone->uz_allocs += cache->uc_allocs;  
    cache->uc_allocs = 0;  
    zone->uz_frees += cache->uc_frees;  
    cache->uc_frees = 0;  
    bucket = cache->uc_freebucket;  
    cache->uc_freebucket = NULL;  
    /* Can we throw this on the zone full list? */  
    if (bucket != NULL) {  
#ifdef UMA_DEBUG_ALLOC  
        printf("uma_zfree: Putting old bucket on the free list.\n");  
#endif  
        /* ub_cnt is pointing to the last free item */  
        KASSERT(bucket->ub_cnt != 0,  
            ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n")); 
        LIST_INSERT_HEAD(&zone->uz_full_bucket,  
            bucket, ub_link);  
    }  
    if ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {  
        LIST_REMOVE(bucket, ub_link);  
        ZONE_UNLOCK(zone);  
        cache->uc_freebucket = bucket;  
        goto zfree_start;  
    }  
    /* We are no longer associated with this CPU. */  
    critical_exit();  
    /* And the zone.. */  
    ZONE_UNLOCK(zone);  
#ifdef UMA_DEBUG_ALLOC  
    printf("uma_zfree: Allocating new free bucket.\n");  
#endif  
    bflags = M_NOWAIT;  
    if (keg->uk_flags & UMA_ZFLAG_CACHEONLY)  
        bflags |= M_NOVM;  
    bucket = bucket_alloc(zone->uz_count, bflags);  
    if (bucket) {  
        ZONE_LOCK(zone);  
        LIST_INSERT_HEAD(&zone->uz_free_bucket,  
            bucket, ub_link);  
        ZONE_UNLOCK(zone);  
        goto zfree_restart;  
    }  
    /* 
     * If nothing else caught this, we'll just do an internal free. 
     */  
zfree_internal:  
    uma_zfree_internal(zone, item, udata, SKIP_DTOR, ZFREE_STATFREE);  
    return;  
}

/* 
 * Frees an item to an INTERNAL zone or allocates a free bucket 
 * 
 * Arguments: 
 *  zone   The zone to free to 
 *  item   The item we're freeing 
 *  udata  User supplied data for the dtor 
 *  skip   Skip dtors and finis 
 */  
static void  
uma_zfree_internal(uma_zone_t zone, void *item, void *udata,  
    enum zfreeskip skip, int flags)  
{  
    uma_slab_t slab;  
    uma_slabrefcnt_t slabref;  
    uma_keg_t keg;  
    u_int8_t *mem;  
    u_int8_t freei;  
    keg = zone->uz_keg;  
    if (skip < SKIP_DTOR && zone->uz_dtor)  
        zone->uz_dtor(item, keg->uk_size, udata);  
    if (skip < SKIP_FINI && zone->uz_fini)  
        zone->uz_fini(item, keg->uk_size);  
    ZONE_LOCK(zone);  
    if (flags & ZFREE_STATFAIL)  
        zone->uz_fails++;  
    if (flags & ZFREE_STATFREE)  
        zone->uz_frees++;  
    if (!(keg->uk_flags & UMA_ZONE_MALLOC)) {  
        mem = (u_int8_t *)((unsigned long)item & (~UMA_SLAB_MASK));  
        if (keg->uk_flags & UMA_ZONE_HASH)  
            slab = hash_sfind(&keg->uk_hash, mem);  
        else {  
            mem += keg->uk_pgoff;  
            slab = (uma_slab_t)mem;  
        }  
    } else {  
        slab = (uma_slab_t)udata;  
    }  
    /* Do we need to remove from any lists? */  
    if (slab->us_freecount+1 == keg->uk_ipers) {  
        LIST_REMOVE(slab, us_link);  
        LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);  
    } else if (slab->us_freecount == 0) {  
        LIST_REMOVE(slab, us_link);  
        LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);  
    }  
    /* Slab management stuff */  
    freei = ((unsigned long)item - (unsigned long)slab->us_data)  
        / keg->uk_rsize;  
#ifdef INVARIANTS  
    if (!skip)  
        uma_dbg_free(zone, slab, item);  
#endif  
    if (keg->uk_flags & UMA_ZONE_REFCNT) {  
        slabref = (uma_slabrefcnt_t)slab;  
        slabref->us_freelist[freei].us_item = slab->us_firstfree;  
    } else {  
        slab->us_freelist[freei].us_item = slab->us_firstfree;  
    }  
    slab->us_firstfree = freei;  
    slab->us_freecount++;  
    /* Zone statistics */  
    keg->uk_free++;  
    if (keg->uk_flags & UMA_ZFLAG_FULL) {  
        if (keg->uk_pages < keg->uk_maxpages)  
            keg->uk_flags &= ~UMA_ZFLAG_FULL;  
        /*  
         * We can handle one more allocation. Since we're clearing ZFLAG_FULL, 
         * wake up all procs blocked on pages. This should be uncommon, so  
         * keeping this simple for now (rather than adding count of blocked  
         * threads etc). 
         */  
        wakeup(keg);  
    }  
    ZONE_UNLOCK(zone);  
} 

uma_zfree_arg mirrors uma_zalloc_arg: it first tries to cache the item in the current CPU's cache. If that fails and uz_free_bucket is not empty, its first bucket is moved into the CPU cache to hold the item; otherwise bucket_alloc is called to allocate an empty bucket, which is inserted into uz_free_bucket before the free is retried. If bucket_alloc also fails, uma_zfree_internal is called to release the item. Before any of this, however, uma_zfree_arg checks whether the keg has reached its page limit (UMA_ZFLAG_FULL); if so, it frees the item through uma_zfree_internal directly, because uma_zfree_internal checks whether the keg's page count has dropped back below the limit and wakes up the threads sleeping on the keg.
/* 
 * Drain the cached buckets from a zone.  Expects a locked zone on entry. 
 */  
static void  
bucket_cache_drain(uma_zone_t zone)  
{  
    uma_bucket_t bucket;  
    /* 
     * Drain the bucket queues and free the buckets, we just keep two per 
     * cpu (alloc/free). 
     */  
    while ((bucket = LIST_FIRST(&zone->uz_full_bucket)) != NULL) {  
        LIST_REMOVE(bucket, ub_link);  
        ZONE_UNLOCK(zone);  
        bucket_drain(zone, bucket);  
        bucket_free(bucket);  
        ZONE_LOCK(zone);  
    }  
    /* Now we do the free queue.. */  
    while ((bucket = LIST_FIRST(&zone->uz_free_bucket)) != NULL) {  
        LIST_REMOVE(bucket, ub_link);  
        bucket_free(bucket);  
    }  
}  
/* 
 * Frees pages from a zone back to the system.  This is done on demand from 
 * the pageout daemon. 
 * 
 * Arguments: 
 *  zone  The zone to free pages from 
 *   all  Should we drain all items? 
 * 
 * Returns: 
 *  Nothing. 
 */  
void  
zone_drain(uma_zone_t zone)  
{  
    struct slabhead freeslabs = { 0 };  
    uma_keg_t keg;  
    uma_slab_t slab;  
    uma_slab_t n;  
    u_int8_t flags;  
    u_int8_t *mem;  
    int i;  
    keg = zone->uz_keg;  
    /* 
     * We don't want to take pages from statically allocated zones at this 
     * time 
     */  
    if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)  
        return;  
    ZONE_LOCK(zone);  
#ifdef UMA_DEBUG  
    printf("%s free items: %u\n", zone->uz_name, keg->uk_free);  
#endif  
    bucket_cache_drain(zone);  
    if (keg->uk_free == 0)  
        goto finished;  
    slab = LIST_FIRST(&keg->uk_free_slab);  
    while (slab) {  
        n = LIST_NEXT(slab, us_link);  
        /* We have no where to free these to */  
        if (slab->us_flags & UMA_SLAB_BOOT) {  
            slab = n;  
            continue;  
        }  
        LIST_REMOVE(slab, us_link);  
        keg->uk_pages -= keg->uk_ppera;  
        keg->uk_free -= keg->uk_ipers;  
        if (keg->uk_flags & UMA_ZONE_HASH)  
            UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);  
        SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);  
        slab = n;  
    }  
finished:  
    ZONE_UNLOCK(zone);  
    while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {  
        SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);  
        if (keg->uk_fini)  
            for (i = 0; i < keg->uk_ipers; i++)  
                keg->uk_fini(  
                    slab->us_data + (keg->uk_rsize * i),  
                    keg->uk_size);  
        flags = slab->us_flags;  
        mem = slab->us_data;  
        if ((keg->uk_flags & UMA_ZONE_MALLOC) ||  
            (keg->uk_flags & UMA_ZONE_REFCNT)) {  
            vm_object_t obj;  
            if (flags & UMA_SLAB_KMEM)  
                obj = kmem_object;  
            else if (flags & UMA_SLAB_KERNEL)  
                obj = kernel_object;  
            else  
                obj = NULL;  
            for (i = 0; i < keg->uk_ppera; i++)  
                vsetobj((vm_offset_t)mem + (i * PAGE_SIZE),  
                    obj);  
        }  
        if (keg->uk_flags & UMA_ZONE_OFFPAGE)  
            uma_zfree_internal(keg->uk_slabzone, slab, NULL,  
                SKIP_NONE, ZFREE_STATFREE);  
#ifdef UMA_DEBUG  
        printf("%s: Returning %d bytes.\n",  
            zone->uz_name, UMA_SLAB_SIZE * keg->uk_ppera);  
#endif  
        keg->uk_freef(mem, UMA_SLAB_SIZE * keg->uk_ppera, flags);  
    }  
}

zone_drain calls bucket_cache_drain to return the items cached on the zone's uz_full_bucket list to the keg (items sitting in the per-CPU caches are not returned) and to free every bucket on both uz_full_bucket and uz_free_bucket; it then releases the pages backing the slabs on the keg's uk_free_slab list back to the system.
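
bucket_drain, called by bucket_cache_drain above but not shown, simply hands each cached item back through uma_zfree_internal. A reconstruction of the idea follows; it is a sketch, not the verbatim source (the zone destructor already ran in uma_zfree_arg before the item was cached, and UMA_ZONE_MALLOC zones additionally look the slab up with vtoslab and pass it as udata).

static void
bucket_drain_sketch(uma_zone_t zone, uma_bucket_t bucket)
{
    void *item;

    if (bucket == NULL)
        return;
    while (bucket->ub_cnt > 0) {
        bucket->ub_cnt--;
        item = bucket->ub_bucket[bucket->ub_cnt];
        /* SKIP_DTOR: the zone dtor already ran when the item was cached. */
        uma_zfree_internal(zone, item, NULL, SKIP_DTOR, 0);
    }
}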
static void  
zone_foreach(void (*zfunc)(uma_zone_t))  
{  
    uma_keg_t keg;  
    uma_zone_t zone;  
    mtx_lock(&uma_mtx);  
    LIST_FOREACH(keg, &uma_kegs, uk_link) {  
        LIST_FOREACH(zone, &keg->uk_zones, uz_link)  
            zfunc(zone);  
    }  
    mtx_unlock(&uma_mtx);  
}

zone_foreach invokes zfunc on every zone reachable from the kegs on the uma_kegs list.
void  
uma_reclaim(void)  
{  
#ifdef UMA_DEBUG  
    printf("UMA: vm asked us to release pages!\n");  
#endif  
    bucket_enable();  
    zone_foreach(zone_drain);  
    /* 
     * Some slabs may have been freed but this zone will be visited early 
     * we visit again so that we can free pages that are empty once other 
     * zones are drained.  We have to do the same for buckets. 
     */  
    zone_drain(slabzone);  
    zone_drain(slabrefzone);  
    bucket_zone_drain();  
}

uma_reclaim reclaims all pages that are not currently needed; it is intended to be called only by the pageout daemon.

References:
    * Jeff Bonwick, The Slab Allocator: An Object-Caching Kernel Memory Allocator (1994)
    * FreeBSD zone(9) manual page
