Memcached Memory Pool Analysis

Based on the Memcached 1.4.15 source code.

1. The complete slabs memory pool diagram

Here is my diagram of the object relationships in memcached's slabs memory pool:

[Figure 1: slabs memory pool object relationship diagram]


2. Memory pool data structures

typedef struct {
    unsigned int size;      /* size of each item (chunk) in this class */
    unsigned int perslab;   /* how many items fit in one page */
 
    void *slots;           //free item list; points to the most recently freed chunk (for a freshly split page, the page's last chunk). If perslab == 1, slots and slab_list[0] point to the same address
    unsigned int sl_curr;   //number of items currently free (the list is consumed from back to front)
 
    unsigned int slabs;     //number of pages (slabs) already allocated to this class
 
    void **slab_list;       //array of page pointers; each entry is also the address of that page's first chunk, so (item *)((&slabclass[N])->slab_list[0]) gives the first item
    unsigned int list_size; //capacity of slab_list in pages, 16 by default
 
    unsigned int killing;  /* index+1 of dying slab, or zero if none */
    size_t requested; //total bytes requested from this class
} slabclass_t;

//slots: head of the free-chunk list for this class. In 1.4.15 the free chunks are chained
//       through their item headers (it->next / it->prev), and free chunks from every slab
//       belonging to the same slabclass all hang off this single list;
//slabs: number of pages (slabs) currently allocated to this slabclass;
//slab_list: array of pointers to the pages (slabs) allocated to this slabclass; each element
//       points to the start of one page;
//list_size starts at 16 and doubles whenever slab_list fills up (see grow_slab_list):
//        size_t new_size =  (p->list_size != 0) ? p->list_size * 2 : 16;
//        void *new_list = realloc(p->slab_list, new_size * sizeof(void *));
//        if (new_list == 0) return 0;
//        p->list_size = new_size;
//        p->slab_list = new_list;

static slabclass_t slabclass[MAX_NUMBER_OF_SLAB_CLASSES];
static size_t mem_limit = 0;//memory limit in bytes (64MB by default), set in slabs_init
static size_t mem_malloced = 0;//bytes already allocated
static int power_largest;//index of the largest slab class in use, 42 by default
 
static void *mem_base = NULL;
static void *mem_current = NULL;//current position within the preallocated block
static size_t mem_avail = 0;//bytes still available in the preallocated block
 
/**
 * slab thread locks
 */
static pthread_mutex_t slabs_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t slabs_rebalance_lock = PTHREAD_MUTEX_INITIALIZER;
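
To see how slots, sl_curr and slab_list fit together, here is a minimal sketch (not memcached code) that walks the free list of one slab class. It assumes the slabclass_t and item definitions above and is only meant to illustrate the layout:

/* Illustrative sketch: free chunks are chained through their item headers,
 * so walking the free list is just a matter of following it->next.        */
static unsigned int count_free_chunks(const slabclass_t *p) {
    unsigned int n = 0;
    for (const item *it = (const item *)p->slots; it != NULL; it = it->next)
        n++;
    return n;   /* should equal p->sl_curr */
}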

3. Initialization: slabs_init

/* slab initialization */
/* limit: memory limit in bytes; factor: growth factor; prealloc: whether to allocate all memory up front */
void slabs_init(const size_t limit, const double factor, const bool prealloc) {
    int i = POWER_SMALLEST - 1;//0
    unsigned int size = sizeof(item) + settings.chunk_size;//chunk_size is the minimum space reserved for key and data
 
    mem_limit = limit;//memory limit in bytes
 
    if (prealloc) {//allocate the whole configured limit in one go
        /* Allocate everything in a big chunk with malloc */
        mem_base = malloc(mem_limit);
        if (mem_base != NULL) {
            mem_current = mem_base;
            mem_avail = mem_limit;
        } else {
            fprintf(stderr, "Warning: Failed to allocate requested memory in one large chunk.\nWill allocate in smaller chunks\n");
        }
    }
 
    memset(slabclass, 0, sizeof(slabclass));
//settings.item_size_max = 1024 * 1024 = 1M; /* The famous 1MB upper limit. */
//settings.item_size_max / factor = 1048576 / 1.25 = 838860.8, the largest chunk size (in bytes) the loop below will produce
//slabclass[41]    {size=717184 perslab=1 slots=0x00000000 ...}    slabclass_t
//so the loop exits once i reaches 42
//slabclass[42]    {size=1048576 perslab=1 slots=0x00000000 ...}    slabclass_t
//class 43 is never filled in
//slabclass[43]    {size=0 perslab=0 slots=0x00000000 ...}    slabclass_t
    while (++i < POWER_LARGEST && size <= settings.item_size_max / factor) {
        /* Make sure items are always n-byte aligned */
        if (size % CHUNK_ALIGN_BYTES)//round the size up to a multiple of 8 bytes
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
 
        slabclass[i].size = size;//chunk size for this class
        slabclass[i].perslab = settings.item_size_max / slabclass[i].size;//items per page
        size *= factor;//multiply by the growth factor
        if (settings.verbose > 1) {
            fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",i, slabclass[i].size, slabclass[i].perslab);
        }
    }
 
    power_largest = i;//42 by default
    slabclass[power_largest].size = settings.item_size_max;
    slabclass[power_largest].perslab = 1;//the largest class stores only one item per page
    if (settings.verbose > 1) {
        fprintf(stderr, "slab class %3d: chunk size %9u perslab %7u\n",i, slabclass[i].size, slabclass[i].perslab);
    }
 
    /* for the test suite:  faking of how much we've already malloc'd */
    {
        char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
        if (t_initial_malloc) {
            mem_malloced = (size_t)atol(t_initial_malloc);
        }
 
    }
 
#ifndef DONT_PREALLOC_SLABS
    {
        char *pre_alloc = getenv("T_MEMD_SLABS_ALLOC");
 
        if (pre_alloc == NULL || atoi(pre_alloc) != 0) {
            slabs_preallocate(power_largest);
        }
    }
#endif
}
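
With the default settings this loop produces 42 classes. The standalone sketch below (not memcached code) reproduces the chunk-size table that slabs_init prints under -vv; it assumes sizeof(item) == 48, chunk_size = 48, factor = 1.25, item_size_max = 1 MB and 8-byte alignment:

#include <stdio.h>

int main(void) {
    const unsigned int item_size_max = 1024 * 1024;
    const double factor = 1.25;
    unsigned int size = 48 + 48;            /* sizeof(item) + settings.chunk_size */
    int i = 0;

    while (++i < 200 && size <= item_size_max / factor) {
        if (size % 8)                       /* keep chunks 8-byte aligned */
            size += 8 - (size % 8);
        printf("slab class %3d: chunk size %9u perslab %7u\n",
               i, size, item_size_max / size);
        size *= factor;
    }
    /* the last class always gets a single full-size chunk per page */
    printf("slab class %3d: chunk size %9u perslab %7u\n", i, item_size_max, 1u);
    return 0;
}

Class 1 comes out as chunk size 96 with perslab 10922, which matches the gdb output later in this article.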

4. The first slab allocation

Allocation starts from do_item_alloc, which calls down to do_slabs_alloc:

#0  do_item_alloc (key=0x7ffff0013754 "key", nkey=3, flags=0, exptime=0, nbytes=5, cur_hv=0) at items.c:190
#1  0x0000000000415706 in item_alloc (key=0x7ffff0013754 "key", nkey=3, flags=0, exptime=0, nbytes=5) at thread.c:486
#2  0x000000000040a38e in process_update_command (c=0x7ffff0013550, tokens=0x7ffff7ae4b00, ntokens=6, comm=2, handle_cas=false) at memcached.c:2917
#3  0x000000000040b43d in process_command (c=0x7ffff0013550, command=0x7ffff0013750 "set") at memcached.c:3258
#4  0x000000000040bfa1 in try_read_command (c=0x7ffff0013550) at memcached.c:3504
#5  0x000000000040cc25 in drive_machine (c=0x7ffff0013550) at memcached.c:3824
#6  0x000000000040d81f in event_handler (fd=37, which=2, arg=0x7ffff0013550) at memcached.c:4065
#7  0x00007ffff7dc9e0c in event_process_active_single_queue (base=0x635bb0, flags=0) at event.c:1350
#8  event_process_active (base=0x635bb0, flags=0) at event.c:1420
#9 event_base_loop (base=0x635bb0, flags=0) at event.c:1621
#10 0x0000000000415416 in worker_libevent (arg=0x628d60) at thread.c:384
#11 0x0000003441607851 in start_thread () from /lib64/libpthread.so.0
#12 0x00000034412e890d in clone () from /lib64/libc.so.6


A client request to store a key/value pair ends up in do_item_alloc:

/*@null@*/
item *do_item_alloc(char *key, const size_t nkey, const int flags,
                    const rel_time_t exptime, const int nbytes,
                    const uint32_t cur_hv) {
    uint8_t nsuffix;
    item *it = NULL;
    char suffix[40];
    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);//ntotal is the total size to allocate: the data plus the item struct itself (plus the CAS field here)
    }

    unsigned int id = slabs_clsid(ntotal);
    if (id == 0)
        return 0;

    mutex_lock(&cache_lock);
    /* do a quick check if we have any expired items in the tail.. */
    int tries = 5;
    int tried_alloc = 0;
    item *search;
    void *hold_lock = NULL;
    rel_time_t oldest_live = settings.oldest_live;

    search = tails[id];
    /* We walk up *only* for locked items. Never searching for expired.
     * Waste of CPU for almost all deployments */
    for (; tries > 0 && search != NULL; tries--, search=search->prev) {
        uint32_t hv = hash(ITEM_key(search), search->nkey, 0);
        /* Attempt to hash item lock the "search" item. If locked, no
         * other callers can incr the refcount
         */
        /* FIXME: I think we need to mask the hv here for comparison? */
        if (hv != cur_hv && (hold_lock = item_trylock(hv)) == NULL)
            continue;
        /* Now see if the item is refcount locked */
        if (refcount_incr(&search->refcount) != 2) {
            refcount_decr(&search->refcount);
            /* Old rare bug could cause a refcount leak. We haven't seen
             * it in years, but we leave this code in to prevent failures
             * just in case */
            if (search->time + TAIL_REPAIR_TIME < current_time) {
                itemstats[id].tailrepairs++;
                search->refcount = 1;
                do_item_unlink_nolock(search, hv);
            }
            if (hold_lock)
                item_trylock_unlock(hold_lock);
            continue;
        }

        /* Expired or flushed */
        if ((search->exptime != 0 && search->exptime < current_time)
            || (search->time <= oldest_live && oldest_live <= current_time)) {
            itemstats[id].reclaimed++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                itemstats[id].expired_unfetched++;
            }
            it = search;
            slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
            do_item_unlink_nolock(it, hv);
            /* Initialize the item block: */
            it->slabs_clsid = 0;
        } else if ((it = slabs_alloc(ntotal, id)) == NULL) {
        /* ... the rest of do_item_alloc is omitted here ... */
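
For the request shown in the backtrace above (nkey=3, nbytes=5, i.e. 3 data bytes plus the trailing "\r\n"), ntotal works out to 71 bytes, which is exactly the size=71 seen later in do_slabs_alloc. A rough, illustrative breakdown, assuming sizeof(item) == 48 and CAS enabled:

/* Illustrative arithmetic only -- not memcached code: */
size_t ntotal = 48      /* sizeof(item), the header struct                 */
              + 3 + 1   /* nkey ("key") plus one byte                      */
              + 6       /* suffix " 0 3\r\n" built by item_make_header     */
              + 5       /* nbytes: 3 data bytes plus "\r\n"                */
              + 8;      /* sizeof(uint64_t), added when settings.use_cas   */
/* ntotal == 71  ->  slabs_clsid(71) == 1, the 96-byte class               */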


do_item_alloc first calls slabs_clsid():

//find the slab class whose chunk size can hold an item of the given size
unsigned int slabs_clsid(const size_t size) {
    int res = POWER_SMALLEST;
 
    if (size == 0)
        return 0;
    while (size > slabclass[res].size)//find the first class whose chunk size is at least the item size
        if (res++ == power_largest)
            return 0;
    return res;
}
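
With the default class table, for example, the 71-byte item from the trace above lands in class 1 (96-byte chunks) and a 100-byte request lands in class 2 (120-byte chunks). An illustrative check, assuming slabs_init() has already run with the defaults:

#include <assert.h>

void check_clsid_examples(void) {
    assert(slabs_clsid(71)  == 1);   /* 71 fits in a 96-byte chunk          */
    assert(slabs_clsid(100) == 2);   /* 96 < 100 <= 120, so class 2         */
    assert(slabs_clsid(0)   == 0);   /* size 0 means "no class"             */
}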

do_item_alloc then calls slabs_alloc, which goes to do_slabs_alloc; it returns the item at the head of slots and advances slots to the next free item:

/* allocate an item (chunk) from slab class id */
static void *do_slabs_alloc(const size_t size, unsigned int id) {
    slabclass_t *p;
    void *ret = NULL;
    item *it = NULL;

    if (id < POWER_SMALLEST || id > power_largest) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
        return NULL;
    }

    p = &slabclass[id];
    assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);

    /* fail unless we have space at the end of a recently allocated page,
       we have something on our freelist, or we could allocate a new page */
    //if p->sl_curr != 0 there are still free chunks, so do_slabs_newslab is not called to grab another page
    if (! (p->sl_curr != 0 || do_slabs_newslab(id) != 0)) {
        /* We don't have more memory available */
        ret = NULL;
    } else if (p->sl_curr != 0) {
        /* return off our freelist */
        it = (item *)p->slots;//take the head of p's free list; for a freshly split page this is the page's last chunk
        p->slots = it->next;//advance slots to the next free chunk, the one carved just before it
        if (it->next) it->next->prev = 0;//the new head no longer has a predecessor, since that chunk has just been handed out
        p->sl_curr--;//one fewer free chunk; sl_curr equals perslab right after splitting, but only sl_curr shrinks, while perslab permanently records the chunks per page
        ret = (void *)it;
    }

    if (ret) {
        p->requested += size;
        MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
    } else {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
    }

    return ret;
}

Here is the call stack captured while debugging under gdb:

#0  do_slabs_alloc (size=71, id=1) at slabs.c:241
#1  0x000000000041161d in slabs_alloc (size=71, id=1) at slabs.c:404
#2  0x0000000000412ae6 in do_item_alloc (key=0x7ffff0013754 "key", nkey=3, flags=0, exptime=0, nbytes=5, cur_hv=0) at items.c:188
#3  0x0000000000415706 in item_alloc (key=0x7ffff0013754 "key", nkey=3, flags=0, exptime=0, nbytes=5) at thread.c:486
#4  0x000000000040a38e in process_update_command (c=0x7ffff0013550, tokens=0x7ffff7ae4b00, ntokens=6, comm=2, handle_cas=false) at memcached.c:2917
#5  0x000000000040b43d in process_command (c=0x7ffff0013550, command=0x7ffff0013750 "set") at memcached.c:3258
#6  0x000000000040bfa1 in try_read_command (c=0x7ffff0013550) at memcached.c:3504
#7  0x000000000040cc25 in drive_machine (c=0x7ffff0013550) at memcached.c:3824
#8  0x000000000040d81f in event_handler (fd=37, which=2, arg=0x7ffff0013550) at memcached.c:4065
#9  0x00007ffff7dc9e0c in event_process_active_single_queue (base=0x635bb0, flags=0) at event.c:1350
#10 event_process_active (base=0x635bb0, flags=0) at event.c:1420
#11 event_base_loop (base=0x635bb0, flags=0) at event.c:1621
#12 0x0000000000415416 in worker_libevent (arg=0x628d60) at thread.c:384
#13 0x0000003441607851 in start_thread () from /lib64/libpthread.so.0
#14 0x00000034412e890d in clone () from /lib64/libc.so.6

5. The first slab_list

do_slabs_newslab initializes a new slab (page) and allocates its chunks:

static int do_slabs_newslab(const unsigned int id) {
    slabclass_t *p = &slabclass[id];
//settings.item_size_max = 1024 * 1024; /* The famous 1MB upper limit. */
    int len = settings.slab_reassign ? settings.item_size_max
        : p->size * p->perslab;
    char *ptr;

    if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0) ||
        (grow_slab_list(id) == 0) ||
        ((ptr = memory_allocate((size_t)len)) == 0)) {

        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    memset(ptr, 0, (size_t)len);
    split_slab_page_into_freelist(ptr, id);
//A neat detail: on the first do_slabs_newslab call p->slabs is 0, so after the ++ it becomes 1;
//on the second call it is 2, and that second page (ptr) is hung on p->slab_list[1]
    p->slab_list[p->slabs++] = ptr;
    mem_malloced += len;
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);

    return 1;
}
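
With slab_reassign disabled (the default), len is p->size * p->perslab, which for class 1 comes to just under the 1 MB page. A quick worked check of that arithmetic (illustrative only, assuming the default 96-byte chunk):

#include <stdio.h>

int main(void) {
    unsigned int size    = 96;                    /* slabclass[1].size       */
    unsigned int perslab = (1024 * 1024) / size;  /* 10922 chunks per page   */
    printf("len = %u bytes (page limit is %u)\n",
           size * perslab, 1024u * 1024);         /* prints len = 1048512    */
    return 0;
}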


On the first call, p->slab_list is allocated to hold 16 pointers (16 * 8 bytes on a 64-bit build):

//grow the slab_list array: it starts with room for 16 pages and doubles from there
static int grow_slab_list (const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    if (p->slabs == p->list_size) {
        //new_size defaults to 16
        size_t new_size =  (p->list_size != 0) ? p->list_size * 2 : 16;
        //realloc is used because the array will keep growing;
        //p->slab_list starts out NULL; sizeof(void *) is 8 on 64-bit, 4 on 32-bit
        void *new_list = realloc(p->slab_list, new_size * sizeof(void *));
        if (new_list == 0) return 0;
        p->list_size = new_size;
        p->slab_list = new_list;
    }
    return 1;
}
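
The growth pattern is easy to see in isolation: the capacity starts at 16 page pointers and doubles whenever the page count catches up. A small standalone sketch of just that logic:

#include <stdio.h>

int main(void) {
    size_t list_size = 0;
    for (size_t slabs = 0; slabs < 100; slabs++) {
        if (slabs == list_size) {
            list_size = (list_size != 0) ? list_size * 2 : 16;
            printf("page %zu triggers growth -> list_size = %zu\n",
                   slabs, list_size);             /* 16, 32, 64, 128 ...    */
        }
    }
    return 0;
}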

split_slab_page_into_freelist is then called to carve ptr into chunks.


6. Splitting ptr

split_slab_page_into_freelist:

static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int x;
    for (x = 0; x < p->perslab; x++) {
        do_slabs_free(ptr, 0, id);
        ptr += p->size;//advance ptr by one chunk size (96 bytes for class 1 under the default configuration)
    }
}

This function is short: it just loops over the page, calling do_slabs_free once per chunk.

//hang one chunk onto the class's free list (the same function also frees real items)
static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
    slabclass_t *p;
    item *it;//declare an item pointer first

    assert(((item *)ptr)->slabs_clsid == 0);
    assert(id >= POWER_SMALLEST && id <= power_largest);
    if (id < POWER_SMALLEST || id > power_largest)
        return;

    MEMCACHED_SLABS_FREE(size, id, ptr);
    p = &slabclass[id];//fetch slabclass_t number id and call it p

    it = (item *)ptr;//treat the memory being carved as an item *
    it->it_flags |= ITEM_SLABBED;
    it->prev = 0;//the new head has no predecessor
    it->next = p->slots;//it->next is the current head of slots, i.e. the chunk carved in the previous iteration
    if (it->next) it->next->prev = it;//if a previously carved chunk exists (from the second iteration on),
    //point its prev back at the current item: ordinary doubly linked list maintenance
    p->slots = it;//slots always points at the most recently freed item; every do_slabs_free pushes one chunk and fixes up the links

    p->sl_curr++;//one more free item in this class
    p->requested -= size;
    return;
}

First call:
(gdb) p    ptr
$5 = 0x7ffff51e1010 ""

VS2012:        ptr    0x0000000000230070 ""    char *

>    split_slab_page_into_freelist(ptr, id);

Second call:
(gdb) p    ptr
$25 = (void *) 0x7ffff51e1070
0x70 - 0x10 = 0x60 = 96, exactly one chunk size

Third call:
(gdb) p ptr
$50 = (void *) 0x7ffff51e10d0
(gdb) p *(it-2)
$46 = {next = 0x7ffff51e1010, prev = 0x7ffff51e10d0, h_next = 0x0, time = 0, exptime = 0, nbytes = 0, refcount = 0, nsuffix = 0 '\000', it_flags = 4 '\004', slabs_clsid = 0 '\000', nkey = 0 '\000', data = 0x7ffff51e1070}
(gdb) p *(it-4)
$54 = {next = 0x0, prev = 0x7ffff51e1070, h_next = 0x0, time = 0, exptime = 0, nbytes = 0, refcount = 0, nsuffix = 0 '\000', it_flags = 4 '\004', slabs_clsid = 0 '\000', nkey = 0 '\000', data = 0x7ffff51e1010}

Why is it-2 the previous item?
Because:
(gdb) p sizeof(it)
$57 = 8
(gdb) p sizeof(item)
$58 = 48
it is a pointer, so sizeof(it) is 8, while the item struct itself is 48 bytes; 96 happens to be exactly twice 48, so it-2 steps back one 96-byte chunk. This is a coincidence of the default configuration, and it is not the right way to find the previous item!
static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int x;
    for (x = 0; x < p->perslab; x++) {
        do_slabs_free(ptr, 0, id);
        ptr += p->size;
    }
}
Each iteration advances ptr by p->size bytes, so you can simply follow the next pointer instead:
(gdb) p it->next
$67 = (struct _stritem *) 0x7ffff51e1070
(gdb) p *it->next
$68 = {next = 0x7ffff51e1010, prev = 0x7ffff51e10d0, h_next = 0x0, time = 0, exptime = 0, nbytes = 0, refcount = 0, nsuffix = 0 '\000', it_flags = 4 '\004', slabs_clsid = 0 '\000', nkey = 0 '\000', data = 0x7ffff51e1070}
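
Putting the pieces together: right after split_slab_page_into_freelist finishes on the very first page of a class, the free list runs from the last chunk back toward the first. The following sketch spells out the invariants the gdb session above confirmed; it is illustrative only and assumes the structs shown earlier, with ptr and p taken from the do_slabs_newslab context:

#include <assert.h>

static void check_fresh_page(slabclass_t *p, char *ptr) {
    item *head = (item *)p->slots;
    /* the most recently carved chunk (highest address) sits at the head    */
    assert((char *)head == ptr + (p->perslab - 1) * p->size);
    /* each next pointer steps one chunk back toward the start of the page  */
    if (p->perslab > 1)
        assert(head->next == (item *)((char *)head - p->size));
    /* every chunk of the page is on the free list                          */
    assert(p->sl_curr == p->perslab);
}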



Below is the corresponding debugging data in VS2012; one glance makes it clear:

[Figure 2: VS2012 view of the free list after the page has been split]


7. After ptr has been split

Back in do_slabs_newslab, p->slab_list[p->slabs++] = ptr; and mem_malloced += len; are executed.

[Figure 3: slabclass state after do_slabs_newslab]


Then control returns to do_slabs_alloc:

        it = (item *)p->slots;//take the head of p's free list; for a freshly split page this is the page's last chunk
        p->slots = it->next;//advance slots to the next free chunk, the one carved just before it
        if (it->next) it->next->prev = 0;//the new head no longer has a predecessor
        p->sl_curr--;//one fewer free chunk; perslab stays constant
        ret = (void *)it;
(gdb) p *p
$72 = {size = 96, perslab = 10922, slots = 0x7ffff52e0f70, sl_curr = 10922, slabs = 1, slab_list = 0x7ffff00169e0, list_size = 16, killing = 0, requested = 0}
(gdb) n
(gdb) p *p
$73 = {size = 96, perslab = 10922, slots = 0x7ffff52e0f10, sl_curr = 10922, slabs = 1, slab_list = 0x7ffff00169e0, list_size = 16, killing = 0, requested = 0}

Debugging data after p->sl_curr--: you can see sl_curr has already decreased by 1, while perslab is unchanged.

(gdb) p *p
$75 = {size = 96, perslab = 10922, slots = 0x7ffff52e0f10, sl_curr = 10921, slabs = 1, slab_list = 0x7ffff00169e0, list_size = 16, killing = 0, requested = 0}

Below is the result in VS2012, which is the same:

[Figure 4: VS2012 view of the same state]


8. After the item has been allocated

Back in do_item_alloc, the allocated item is in hand:

(gdb) p *it
$79 = {next = 0x7ffff52e0f10, prev = 0x0, h_next = 0x0, time = 0, exptime = 0, nbytes = 0, refcount = 0, nsuffix = 0 '\000', it_flags = 4 '\004', slabs_clsid = 0 '\000', nkey = 0 '\000', data = 0x7ffff52e0f70}


9. Simulating memcached's calls into slabs


1. Three allocations of the same size:

[Figure 5: three same-size allocations]


2. Deliberately request 100 bytes, larger than the default 96-byte chunk; the debugging info below shows that id2 is computed as the second slab class:

int main()
{
    item *it1 = NULL; item *it2 = NULL; item *it3 = NULL; item *it4 = NULL;
    int preallocate = 0;
    size_t ntotal = 63;   // item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    size_t ntotal2 = 100;
    unsigned int id = 0;  unsigned int id2 = 0;
    settings_init();

    slabs_init(settings.maxbytes, settings.factor, preallocate);

    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }
    id = slabs_clsid(ntotal);
    it1 = slabs_alloc(ntotal, id);
    it2 = slabs_alloc(ntotal, id);
    it3 = slabs_alloc(ntotal, id);
    if (settings.use_cas) {
        ntotal2 += sizeof(uint64_t);
    }
    id2 = slabs_clsid(ntotal2);
    it4 = slabs_alloc(ntotal2, id2);
    return 0;
}
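
If you add the checks below just before the return 0; above (and include <assert.h>), they should hold: consecutive same-size allocations pop the free list from its head, so it1, it2 and it3 come from the same page at addresses that descend by one 96-byte chunk each time, while the larger request maps to a different class. This is only an illustrative expectation based on the free-list behaviour described earlier:

    assert((char *)it1 - (char *)it2 == 96);   /* one chunk apart            */
    assert((char *)it2 - (char *)it3 == 96);
    assert(id2 != id);                         /* the 100-byte request maps to another class */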
[Figure 6: debug view of the 100-byte allocation landing in the second class]

Creating a perslab = 1 scenario

Request a size of 717184 bytes (the chunk size of slabclass[41], close to settings.item_size_max / factor), large enough that only a single chunk fits in a page.

This lands in slab class id = 41, slabclass[41], and we make two allocations of 717184 bytes:

[Figure 7: slabclass[41] with two single-chunk pages]

The debugging results show that slabclass[41] now has two entries in slab_list. Because each of those pages contains only a single chunk, the item stored in that chunk has both next and prev equal to 0,

because during the one and only do_slabs_free call while splitting the page:

    it->prev = 0;
    it->next = p->slots;
Since p->slots starts out as 0 and no further chunks are carved afterwards, this item's next and prev both stay 0.

    item *it1 = NULL; item *it2 = NULL; item *it3 = NULL; item *it4 = NULL; item *it5 = NULL;
    int preallocate = 0;
    size_t ntotal = 63;      // item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    size_t ntotal2 = 717184;
    unsigned int id = 0;  unsigned int id2 = 0;
    settings_init();

    slabs_init(settings.maxbytes, settings.factor, preallocate);

    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }
    id = slabs_clsid(ntotal);
    it1 = slabs_alloc(ntotal, id);
    it2 = slabs_alloc(ntotal, id);
    it3 = slabs_alloc(ntotal, id);
    if (settings.use_cas) {
        ntotal2 += sizeof(uint64_t);
    }
    id2 = slabs_clsid(ntotal2);
    it4 = slabs_alloc(ntotal2, id2);
    it5 = slabs_alloc(ntotal2, id2);
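
As an illustrative check (again add it before the end of main and include <assert.h>): since each 717184-byte allocation comes from a page that holds a single chunk, the returned items are not linked to any neighbours, exactly as argued above:

    assert(it4 != NULL && it5 != NULL);
    assert(it4->next == NULL && it4->prev == NULL);  /* lone chunk on its page  */
    assert(it5->next == NULL && it5->prev == NULL);  /* second page, also lone  */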
