memcached采用了内存页面以及内存页面上的内存块技术实现了内存管理器;对item的指针采用了hashtable的方法,通过item的key值实现快速定位、查找item指针。这里详细剖析一下实现的关键代码。
首先是内存管理,在Slabs.c代码中实现.
(1).void slabs_init()
/*
 * Initialise the slab allocator.
 *
 * Records the memory limit, optionally grabs the whole budget with a single
 * malloc(), fills in the slab class table, and (unless compiled out or
 * disabled through the environment) pre-allocates one page per class.
 *
 * limit    - stored in mem_limit; also the size of the up-front malloc()
 *            when prealloc is true
 * factor   - multiplier applied to the chunk size from one class to the next
 * prealloc - when true, try to obtain all of `limit` as one contiguous region
 */
void slabs_init(const size_t limit, const double factor, const bool prealloc) {
    /* Pre-incremented by the loop below, so the first class filled in is
       POWER_SMALLEST. */
    int i = POWER_SMALLEST - 1;
    /* Chunk size of the first class: item header plus the configured
       minimum payload. */
    unsigned int size = sizeof(item) + settings.chunk_size;

    /* Factor of 2.0 means use the default memcached behavior */
    if (factor == 2.0 && size < 128) {
        size = 128;
    }

    mem_limit = limit;

    if (prealloc) {
        /* Allocate everything in a big chunk with malloc */
        mem_base = malloc(mem_limit);
        if (mem_base != NULL) {
            mem_current = mem_base;  /* next unused byte of the region */
            mem_avail = mem_limit;   /* bytes still unused in the region */
        } else {
            /* Not fatal: mem_base stays NULL, so later allocations are not
               served from a preallocated region. */
            fprintf(stderr, "Warning: Failed to allocate requested memory in"
                    " one large chunk.\nWill allocate in smaller chunks\n");
        }
    }

    memset(slabclass, 0, sizeof(slabclass));

    /* Compute the chunk size and chunks-per-page of each class. Stop once
       the table is full or a chunk would exceed half of POWER_BLOCK; the
       final class is then set to one POWER_BLOCK-sized chunk below.
       NOTE(review): the original author notes POWER_BLOCK is 1048576 (1 MB)
       and CHUNK_ALIGN_BYTES is 8 - confirm against the headers. */
    while (++i < POWER_LARGEST && size <= POWER_BLOCK / 2) {
        /* Make sure items are always n-byte aligned */
        if (size % CHUNK_ALIGN_BYTES) {
            size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
        }

        slabclass[i].size = size;
        slabclass[i].perslab = POWER_BLOCK / slabclass[i].size;
        size *= factor;
        if (settings.verbose > 1) {
            fprintf(stderr, "slab class %3d: chunk size %6u perslab %5u\n",
                    i, slabclass[i].size, slabclass[i].perslab);
        }
    }

    /* The largest class holds exactly one chunk of POWER_BLOCK bytes. */
    power_largest = i;
    slabclass[power_largest].size = POWER_BLOCK;
    slabclass[power_largest].perslab = 1;

    /* for the test suite: faking of how much we've already malloc'd */
    {
        char *t_initial_malloc = getenv("T_MEMD_INITIAL_MALLOC");
        if (t_initial_malloc) {
            mem_malloced = (size_t)atol(t_initial_malloc);
        }
    }

#ifndef DONT_PREALLOC_SLABS
    {
        char *pre_alloc = getenv("T_MEMD_SLABS_ALLOC");

        /* Pre-allocate unless T_MEMD_SLABS_ALLOC is set to a value that
           atoi() parses as 0. */
        if (pre_alloc == NULL || atoi(pre_alloc) != 0) {
            slabs_preallocate(power_largest);
        }
    }
#endif
}
(2).void slabs_preallocate():
#ifndef DONT_PREALLOC_SLABS
/*
 * Request one new page for each slab class, starting at POWER_SMALLEST,
 * stopping after `maxslabs` classes or once POWER_LARGEST has been handled,
 * whichever comes first. The result of each page request is not checked.
 */
static void slabs_preallocate (const unsigned int maxslabs) {
    /* pre-allocate a 1MB slab in every size class so people don't get
       confused by non-intuitive "SERVER_ERROR out of memory"
       messages.  this is the most common question on the mailing
       list.  if you really don't want this, you can rebuild without
       these three lines.  */
    unsigned int done = 0;
    int cls = POWER_SMALLEST;

    while (cls <= POWER_LARGEST && done < maxslabs) {
        do_slabs_newslab(cls);
        done++;
        cls++;
    }
}
#endif
(3).int grow_slab_list():
//当一个slab(内存页面)用光后,又有新的item要插入这个id,那么它就会重新申请新的slab,申请新的slab时,对应id的slab链表就要增长,这个链表是成倍增长的,
//在grow_slab_list函数中,这个链表的容量第一次增长时从0变成16,之后每次翻倍:从16变成32,从32变成64……
/*
 * Make sure the page-pointer array of slab class `id` has room for one more
 * entry. When the array is full its capacity is doubled (or set to 16 if it
 * was empty) via realloc().
 *
 * Returns 1 when there is room (whether or not a resize was needed) and 0
 * when realloc() fails; on failure the existing array is left untouched.
 */
static int grow_slab_list (const unsigned int id) {
    slabclass_t *cls = &slabclass[id];

    /* Still at least one free entry: nothing to do. */
    if (cls->slabs != cls->list_size) {
        return 1;
    }

    size_t grown = (cls->list_size == 0) ? 16 : cls->list_size * 2;
    void *resized = realloc(cls->slab_list, grown * sizeof(void *));
    if (resized == 0) {
        return 0;
    }

    cls->slab_list = resized;
    cls->list_size = grown;
    return 1;
}
(4). int do_slabs_newslab()
//该函数分配一个新的内存页,每个slabclass_t会有多个页面
/*
 * Allocate one new page for slab class `id` and make it the class's current
 * "end page" from which fresh chunks are handed out.
 *
 * The request is refused when any of these holds, checked in this order:
 *   - a memory limit is set, the page would push mem_malloced past it, and
 *     the class already owns at least one page;
 *   - the class's page-pointer array cannot be grown;
 *   - the underlying memory allocation fails.
 *
 * Returns 1 on success, 0 on refusal.
 */
static int do_slabs_newslab(const unsigned int id) {
    slabclass_t *cls = &slabclass[id];
#ifdef ALLOW_SLABS_REASSIGN
    int page_bytes = POWER_BLOCK;
#else
    /* chunk size times chunks per page */
    int page_bytes = cls->size * cls->perslab;
#endif
    char *page;

    if (mem_limit && mem_malloced + page_bytes > mem_limit && cls->slabs > 0) {
        goto refused;
    }
    if (grow_slab_list(id) == 0) {
        goto refused;
    }
    page = memory_allocate((size_t)page_bytes);
    if (page == 0) {
        goto refused;
    }

    /* Hand out zeroed memory. */
    memset(page, 0, (size_t)page_bytes);

    cls->end_page_ptr = page;             /* first unused chunk of the new page */
    cls->end_page_free = cls->perslab;    /* every chunk of it is unused */
    cls->slab_list[cls->slabs++] = page;  /* remember the page itself */
    mem_malloced += page_bytes;
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);
    return 1;

refused:
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
    return 0;
}
(5). void *do_slabs_alloc()
//内存单元:集合中维护的"逻辑内存块"的大小,它是以8字节对齐的
//内存页: slabclass_t分配内存的时候是以perslab个内存单元分配的,这perslab个连续的内存单元就是内存页
//slabs_alloc是一个宏,对于多线程模式和单线程模式,它会映射到不同的函数
/*
 * Hand out one chunk from slab class `id`.
 *
 * size - requested size; used for the memory-limit check and malloc() in the
 *        USE_SYSTEM_MALLOC build, and passed to the probes
 * id   - slab class index; must lie in [POWER_SMALLEST, power_largest]
 *
 * Returns a pointer to the chunk, or NULL when `id` is out of range or no
 * memory can be obtained.
 *
 * Sources are tried in this order: the class's free list (slots), then the
 * unused tail of the most recently allocated page, with a new page being
 * requested only when both are empty.
 */
void *do_slabs_alloc(const size_t size, unsigned int id) {
    slabclass_t *p;
    void *ret = NULL;

    if (id < POWER_SMALLEST || id > power_largest) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
        return NULL;
    }

    p = &slabclass[id];
    /* The chunk on top of the free list, if any, must be marked as not
       belonging to a class (slabs_clsid == 0). */
    assert(p->sl_curr == 0 || ((item *)p->slots[p->sl_curr - 1])->slabs_clsid == 0);

#ifdef USE_SYSTEM_MALLOC
    if (mem_limit && mem_malloced + size > mem_limit) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
        return 0;
    }
    ret = malloc(size);
    if (ret == NULL) {
        /* Do not count memory that was never obtained, and report the
           failure instead of a successful allocation. */
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
        return NULL;
    }
    mem_malloced += size;
    MEMCACHED_SLABS_ALLOCATE(size, id, 0, ret);
    return ret;
#endif

    /* fail unless we have space at the end of a recently allocated page,
       we have something on our freelist, or we could allocate a new page */
    if (! (p->end_page_ptr != 0 || p->sl_curr != 0 || do_slabs_newslab(id) != 0)) {
        /* We don't have more memory available */
        ret = NULL;
    } else if (p->sl_curr != 0) {
        /* return off our freelist */
        ret = p->slots[--p->sl_curr];
    } else {
        /* if we recently allocated a whole page, return from that */
        assert(p->end_page_ptr != NULL);
        ret = p->end_page_ptr;
        if (--p->end_page_free != 0) {
            /* advance to the next unused chunk of the page */
            p->end_page_ptr += p->size;
        } else {
            /* page exhausted */
            p->end_page_ptr = 0;
        }
    }

    if (ret) {
        MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
    } else {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
    }

    return ret;
}
(6). void do_slabs_free()
/*
 * Return a chunk to slab class `id`.
 *
 * ptr  - chunk being released; its slabs_clsid is asserted to be 0
 * size - size of the chunk; passed to the probe and, in the
 *        USE_SYSTEM_MALLOC build, subtracted from mem_malloced
 * id   - slab class index; calls with an out-of-range id are ignored
 *
 * In the default build the chunk is pushed onto the class's free list
 * (slots) rather than being given back to the system. The list doubles in
 * capacity when full; if that realloc() fails the function returns without
 * recording the chunk.
 */
void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
    slabclass_t *cls;

    assert(((item *)ptr)->slabs_clsid == 0);
    assert(id >= POWER_SMALLEST && id <= power_largest);
    if (!(id >= POWER_SMALLEST && id <= power_largest)) {
        return;
    }

    MEMCACHED_SLABS_FREE(size, id, ptr);
    cls = &slabclass[id];

#ifdef USE_SYSTEM_MALLOC
    mem_malloced -= size;
    free(ptr);
    return;
#endif

    if (cls->sl_curr == cls->sl_total) {
        /* need more space on the free list; 16 is arbitrary */
        int grown = (cls->sl_total == 0) ? 16 : cls->sl_total * 2;
        void **resized = realloc(cls->slots, grown * sizeof(void *));
        if (resized == 0) {
            return;
        }
        cls->sl_total = grown;
        cls->slots = resized;
    }

    /* Push the chunk on top of the free list. */
    cls->slots[cls->sl_curr] = ptr;
    cls->sl_curr++;
}
(7). void *memory_allocate()
//从mem_base中分配size大小的内存
/*
 * Obtain `size` bytes of raw memory for a slab page.
 *
 * When no preallocated region exists (mem_base is NULL) this is a plain
 * malloc(). Otherwise the bytes are carved off the front of the unused part
 * of the region: NULL is returned if `size` exceeds what is left, else the
 * current position is returned and then advanced by `size` rounded up to a
 * multiple of CHUNK_ALIGN_BYTES.
 */
static void *memory_allocate(size_t size) {
    if (mem_base == NULL) {
        /* We are not using a preallocated large memory chunk */
        return malloc(size);
    }

    void *chunk = mem_current;

    /* The check uses the unrounded request. */
    if (size > mem_avail) {
        return NULL;
    }

    /* mem_current pointer _must_ be aligned!!! */
    const size_t misalign = size % CHUNK_ALIGN_BYTES;
    if (misalign) {
        size += CHUNK_ALIGN_BYTES - misalign;
    }

    mem_current += size;
    /* Rounding may take the request up to or past what is left; clamp at 0. */
    mem_avail = (size < mem_avail) ? mem_avail - size : 0;

    return chunk;
}
然后是通过item的key值实现快速定位item指针地址的hashtable,在assoc.c中。
(1). void assoc_init()
/*hashtable的初始化,计算hashtable的大小-->分配空间-->初始化空间为NULL*/
//分配hashtable的所需的内存
/*
 * Allocate the primary hash table: hashsize(hashpower) bucket pointers, all
 * zero-initialised by calloc(). Exits the process if the allocation fails.
 *
 * NOTE(review): per the original author's notes hashsize(n) is ((ub4)1<<(n))
 * and hashpower starts at 16, which would give 1<<16 = 65536 buckets -
 * confirm against the definitions.
 */
void assoc_init(void) {
    primary_hashtable = calloc(hashsize(hashpower), sizeof(void *));
    if (! primary_hashtable) {
        fprintf(stderr, "Failed to init hashtable.\n");
        exit(EXIT_FAILURE);
    }
}
(2). item *assoc_find()
//根据键寻找对应的值
/*
 * Look up an item by key.
 *
 * key  - key bytes
 * nkey - key length in bytes
 *
 * Returns the matching item, or NULL when the key is not present.
 *
 * While a table expansion is in progress, a key whose old-table bucket index
 * is at or beyond expand_bucket is looked up in old_hashtable; every other
 * key is looked up in primary_hashtable.
 */
item *assoc_find(const char *key, const size_t nkey) {
    const uint32_t hv = hash(key, nkey, 0);
    unsigned int oldbucket = 0;
    int in_old = 0;

    if (expanding) {
        oldbucket = (hv & hashmask(hashpower - 1));
        in_old = (oldbucket >= expand_bucket);
    }

    item *cursor = in_old ? old_hashtable[oldbucket]
                          : primary_hashtable[hv & hashmask(hashpower)];

    /* Walk the bucket's chain; depth counts the entries skipped before the
       search ended. */
    int depth = 0;
    for (; cursor != NULL; cursor = cursor->h_next, ++depth) {
        if ((nkey == cursor->nkey) && (memcmp(key, ITEM_key(cursor), nkey) == 0)) {
            break;
        }
    }

    MEMCACHED_ASSOC_FIND(key, depth);
    return cursor;
}
(3). item** _hashitem_before()
//寻找key对应的元素的指针变量的地址
/*
 * Find the link that points at the item with the given key.
 *
 * Returns the address of the pointer (a bucket head or some item's h_next)
 * that refers to the matching item. If the key is absent, the returned
 * address is that of the NULL pointer terminating the chain, so *result is
 * NULL. The table is chosen the same way as for a lookup: the old table
 * while expanding if the key's old bucket index is at or beyond
 * expand_bucket, the primary table otherwise.
 */
static item** _hashitem_before (const char *key, const size_t nkey) {
    const uint32_t hv = hash(key, nkey, 0);
    unsigned int oldbucket = 0;
    int in_old = 0;
    item **link;

    if (expanding) {
        oldbucket = (hv & hashmask(hashpower - 1));
        in_old = (oldbucket >= expand_bucket);
    }

    link = in_old ? &old_hashtable[oldbucket]
                  : &primary_hashtable[hv & hashmask(hashpower)];

    /* Follow the chain until it ends or the key matches. */
    for (;;) {
        item *cur = *link;
        if (cur == NULL) {
            break;
        }
        if (nkey == cur->nkey && memcmp(key, ITEM_key(cur), nkey) == 0) {
            break;
        }
        link = &cur->h_next;
    }

    return link;
}
(4). void assoc_expand()
/*
 * Begin growing the hash table to twice its size.
 *
 * The current table becomes old_hashtable and a zeroed table with
 * hashsize(hashpower + 1) buckets becomes primary_hashtable. On success
 * hashpower is incremented, expansion state is set up, and the first bucket
 * is migrated immediately. If the new table cannot be allocated, the
 * existing table is put back and nothing else changes.
 */
static void assoc_expand(void) {
    old_hashtable = primary_hashtable;

    primary_hashtable = calloc(hashsize(hashpower + 1), sizeof(void *));
    if (primary_hashtable) {
        if (settings.verbose > 1)
            fprintf(stderr, "Hash table expansion starting\n");
        hashpower++;
        expanding = true;
        expand_bucket = 0;  /* no old bucket has been migrated yet */
        do_assoc_move_next_bucket();
    } else {
        primary_hashtable = old_hashtable;
        /* Bad news, but we can keep running. */
    }
}
(5). void do_assoc_move_next_bucket()
//被static void conn_set_state()调用
/*
 * Migrate one bucket from the old hash table to the primary one.
 *
 * Does nothing unless an expansion is in progress. Otherwise every item
 * chained in old_hashtable[expand_bucket] is rehashed with the current
 * hashpower and pushed onto the front of its primary-table chain, the old
 * bucket is cleared and expand_bucket is advanced. Once all
 * hashsize(hashpower - 1) old buckets have been processed, the expansion is
 * marked finished and the old table is freed.
 */
void do_assoc_move_next_bucket(void) {
    item *it, *next;
    int bucket;

    if (expanding) {
        for (it = old_hashtable[expand_bucket]; NULL != it; it = next) {
            /* Save the successor before relinking the item. */
            next = it->h_next;

            bucket = hash(ITEM_key(it), it->nkey, 0) & hashmask(hashpower);
            it->h_next = primary_hashtable[bucket];
            primary_hashtable[bucket] = it;
        }

        old_hashtable[expand_bucket] = NULL;

        expand_bucket++;
        if (expand_bucket == hashsize(hashpower - 1)) {
            expanding = false;
            free(old_hashtable);
            if (settings.verbose > 1)
                fprintf(stderr, "Hash table expansion done\n");
        }
    }
}
(6). int assoc_insert()
//将item加入到hashtable中
/*
 * Insert an item into the hash table.
 *
 * The item is pushed onto the front of its bucket's chain. While an
 * expansion is in progress, an item whose old-table bucket index is at or
 * beyond expand_bucket goes into old_hashtable; otherwise it goes into
 * primary_hashtable. After the insert, if no expansion is running and the
 * item count exceeds 1.5 times the bucket count, an expansion is started.
 *
 * The key is asserted not to be present already. Always returns 1.
 */
int assoc_insert(item *it) {
    uint32_t hv;
    unsigned int oldbucket;
    item **head;

    assert(assoc_find(ITEM_key(it), it->nkey) == 0);  /* shouldn't have duplicately named things defined */

    hv = hash(ITEM_key(it), it->nkey, 0);
    if (expanding && (oldbucket = (hv & hashmask(hashpower - 1))) >= expand_bucket) {
        head = &old_hashtable[oldbucket];
    } else {
        head = &primary_hashtable[hv & hashmask(hashpower)];
    }

    /* The new item becomes the head; the previous head follows it. */
    it->h_next = *head;
    *head = it;

    hash_items++;
    if (!expanding && hash_items > (hashsize(hashpower) * 3) / 2) {
        assoc_expand();
    }

    MEMCACHED_ASSOC_INSERT(ITEM_key(it), hash_items);
    return 1;
}
(7). void assoc_delete()
//从hashtable中删除对应key的item
/*
 * Remove the item with the given key from the hash table.
 *
 * key  - key bytes
 * nkey - key length in bytes
 *
 * The item is unlinked from its chain and its h_next is cleared; the item
 * itself is not freed here. Deleting a key that is not present trips an
 * assertion.
 */
void assoc_delete(const char *key, const size_t nkey) {
    item **before = _hashitem_before(key, nkey);

    if (*before == 0) {
        /* Note:  we never actually get here.  the callers don't delete things
           they can't find. */
        assert(*before != 0);
        return;
    }

    item *nxt;
    hash_items--;
    /* The DTrace probe cannot be triggered as the last instruction
     * due to possible tail-optimization by the compiler
     */
    MEMCACHED_ASSOC_DELETE(key, hash_items);

    /* Splice the item out: its predecessor's link now skips over it. */
    nxt = (*before)->h_next;
    (*before)->h_next = 0;   /* probably pointless, but whatever. */
    *before = nxt;
}