导航
memcached源码分析
memcached源码分析-网络模块
memcached源码分析-指令解析模块
memcached源码分析-哈希表(hashtable)模块
memcached源码分析-slab存储机制
1.前言
这一章节主要介绍了memcached的内存管理机制,我们在指令解析模块中说到了memcached是如何解析处理客户端请求指令的。
例如:执行存储指令 set <key> <flags> <exptime> <bytes>\r\n
<value>\r\n
memcached接收来自客户端的存储请求,那么服务端是如何存储来自客户端的存储内容的呢?这里就涉及到了slabs存储机制。在此之前,首先介绍memcached中关于内存管理的几个重要概念
item 数据存储节点
// A single cache entry. The flexible data[] tail holds, in order: an
// optional 8-byte CAS value, the NUL-terminated key, the " flags length\r\n"
// suffix, and finally the value terminated by "\r\n".
typedef struct _stritem {
/* Protected by LRU locks */
// next item; links both the LRU chain and the slabclass freelist chain
struct _stritem *next;
// previous item; links both the LRU chain and the slabclass freelist chain
struct _stritem *prev;
/* Rest are protected by an item lock */
// next item in the same hash-table bucket (separate chaining)
struct _stritem *h_next; /* hash chain next */
// time of the most recent access
rel_time_t time; /* least recent access */
// absolute cache expiration time
rel_time_t exptime; /* expire time */
int nbytes; /* size of data */
// reference count; tells whether other threads are still operating on
// this item — the node may only be released when refcount == 1
unsigned short refcount;
uint8_t nsuffix; /* length of flags-and-length string */
uint8_t it_flags; /* ITEM_* above */
// index of the slabclass_t this item was allocated from
uint8_t slabs_clsid;/* which slab class we're in */
uint8_t nkey; /* key length, w/terminating null and padding */
/* this odd type prevents type-punning issues when we do
* the little shuffle to save space when not using CAS. */
union {
uint64_t cas;
char end;
} data[];
/* if it_flags & ITEM_CAS we have 8 bytes CAS */
/* then null-terminated key */
/* then " flags length\r\n" (no terminating null) */
/* then data with terminating \r\n (no terminating null; it's binary!) */
} item;
slab与chunk
slab是一块内存空间,默认大小为1M,memcached会把一个slab分割成一个个chunk, 这些被切割的小的内存块,主要用来存储item
slabclass
每个item的大小都可能不一样,item存储于chunk,如果chunk大小不够,则不足以分配给item使用;如果chunk过大,则太过于浪费内存空间。因此memcached采取的做法是,将slab切割成不同大小的chunk,这样就满足了不同大小item的存储。被划分为不同大小chunk的slab内存,在memcached中就是用slabclass这个结构体来表示的
// Descriptor for one slab class: every chunk in the class has the same size.
typedef struct {
// chunk size served by this class, in bytes
unsigned int size; /* sizes of items */
// how many chunks one slab page (default 1MB) is divided into
unsigned int perslab; /* how many items per slab */
// head of the freelist of currently unused chunks
void *slots; /* list of item ptrs */
// number of free chunks currently on the freelist
unsigned int sl_curr; /* total free items in list */
// how many slab pages have been allocated to this class so far
unsigned int slabs; /* how many slabs were allocated for this class */
// array of pointers to the allocated slab pages
void **slab_list; /* array of slab pointers */
// current capacity of slab_list
unsigned int list_size; /* size of prev array */
size_t requested; /* The number of requested bytes */
} slabclass_t;
1.slabclass数组初始化的时候,每个slabclass_t都会分配一个1M大小的slab,slab会被切分为N个小的内存块,这个小的内存块的大小取决于slabclass_t结构上的size的大小
2.每个slabclass_t都只存储一定大小范围的数据,并且下一个slabclass切割的chunk块大于前一个slabclass切割的chunk块大小
3.memcached中slabclass数组默认大小为64,slabclass切割块大小的增长因子默认是1.25
例如:slabclass[1]切割的chunk块大小为100字节,slabclass[2]为125,如果需要存储一个110字节的缓存,那么就需要到slabclass[2] 的空闲链表中获取一个空闲节点进行存储
2.slabclass的初始化
/**
* Determines the chunk sizes and initializes the slab class descriptors
* accordingly.
*/
/**
 * Determines the chunk sizes and initializes the slab class descriptors
 * accordingly.
 *
 * limit      - total cache memory budget in bytes
 * factor     - growth factor between consecutive chunk sizes (default 1.25)
 * prealloc   - if true, grab the whole budget up front in one malloc
 * slab_sizes - optional explicit per-class chunk sizes; overrides factor
 */
void slabs_init(const size_t limit, const double factor, const bool prealloc, const uint32_t *slab_sizes) {
int i = POWER_SMALLEST - 1;
// smallest chunk size: the item header plus the configured minimum payload
unsigned int size = sizeof(item) + settings.chunk_size;
/* Some platforms use runtime transparent hugepages. If for any reason
* the initial allocation fails, the required settings do not persist
* for remaining allocations. As such it makes little sense to do slab
* preallocation. */
bool __attribute__ ((unused)) do_slab_prealloc = false;
mem_limit = limit;
if (prealloc) {
//... (code elided in this excerpt)
/* Allocate everything in a big chunk with malloc */
// reserve the entire memory budget up front
mem_base = malloc(mem_limit);
do_slab_prealloc = true;
if (mem_base != NULL) {
mem_current = mem_base;
mem_avail = mem_limit;
} else {
fprintf(stderr, "Warning: Failed to allocate requested memory in"
" one large chunk.\nWill allocate in smaller chunks\n");
}
}
memset(slabclass, 0, sizeof(slabclass));
while (++i < MAX_NUMBER_OF_SLAB_CLASSES-1) {
if (slab_sizes != NULL) {
if (slab_sizes[i-1] == 0)
break;
size = slab_sizes[i-1];
} else if (size >= settings.slab_chunk_size_max / factor) {
break;
}
/* Make sure items are always n-byte aligned */
if (size % CHUNK_ALIGN_BYTES)
size += CHUNK_ALIGN_BYTES - (size % CHUNK_ALIGN_BYTES);
// chunk size carved out of slab pages for this class
slabclass[i].size = size;
slabclass[i].perslab = settings.slab_page_size / slabclass[i].size;
if (slab_sizes == NULL)
// the next class's chunk size grows by the configured factor
size *= factor;
//... (code elided in this excerpt)
}
// the last class always serves the maximum chunk size
power_largest = i;
slabclass[power_largest].size = settings.slab_chunk_size_max;
slabclass[power_largest].perslab = settings.slab_page_size / settings.slab_chunk_size_max;
//... (code elided in this excerpt)
if (prealloc && do_slab_prealloc) {
// give each class its first slab page (default 1MB), split the page into
// size-sized chunks and thread them onto the class freelist; the chunks
// are linked together through the items' next/prev pointers
slabs_preallocate(power_largest);
}
}
// Gives each slab class (up to maxslabs classes) its first slab page;
// aborts the process if any allocation fails during startup.
static void slabs_preallocate (const unsigned int maxslabs) {
int i;
unsigned int prealloc = 0;
//... (code elided in this excerpt)
for (i = POWER_SMALLEST; i < MAX_NUMBER_OF_SLAB_CLASSES; i++) {
if (++prealloc > maxslabs)
return;
// allocate a slab page for class i and carve it into free chunks
if (do_slabs_newslab(i) == 0) {
fprintf(stderr, "Error while preallocating slab memory!\n"
"If using -L or other prealloc options, max memory must be "
"at least %d megabytes.\n", power_largest);
exit(1);
}
}
}
/* Attach one brand-new slab page to slabclass[id] and carve it into chunks.
 * Returns 1 on success, 0 when the memory limit is reached or allocation
 * fails. */
static int do_slabs_newslab(const unsigned int id) {
    slabclass_t *cls = &slabclass[id];
    slabclass_t *pool = &slabclass[SLAB_GLOBAL_PAGE_POOL];
    char *page;
    int len;

    /* When page reassignment is possible every page must be full-sized;
     * otherwise the page only needs to hold exactly perslab chunks. */
    if (settings.slab_reassign || settings.slab_chunk_size_max != settings.slab_page_size) {
        len = settings.slab_page_size;
    } else {
        len = cls->size * cls->perslab;
    }

    /* Refuse to grow past the memory budget once this class already owns a
     * page and the global pool has nothing left to recycle. */
    if (mem_limit && mem_malloced + len > mem_limit && cls->slabs > 0
            && pool->slabs == 0) {
        mem_limit_reached = true;
        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    /* Make room in the class's page-pointer array (slab_list) for one more
     * slab page. */
    if (grow_slab_list(id) == 0) {
        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    /* Prefer recycling a page from the global pool; fall back to a fresh
     * allocation. */
    page = get_page_from_global_pool();
    if (page == NULL) {
        page = memory_allocate((size_t)len);
        if (page == NULL) {
            MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
            return 0;
        }
    }

    memset(page, 0, (size_t)len);
    /* Split the new page into chunks and push them onto the freelist. */
    split_slab_page_into_freelist(page, id);
    cls->slab_list[cls->slabs++] = page;
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);
    return 1;
}
/* Carve a freshly allocated slab page into p->perslab fixed-size chunks and
 * push every chunk onto the freelist of slab class `id`. */
static void split_slab_page_into_freelist(char *ptr, const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int i;

    for (i = 0; i < p->perslab; i++, ptr += p->size) {
        /* do_slabs_free() does not release memory: it casts the chunk to an
         * item and links it into the class freelist via the item's
         * next/prev pointers — the exact same path a live item takes when
         * it is returned to the allocator, hence the shared interface. */
        do_slabs_free(ptr, 0, id);
    }
}
3.item节点的分配流程
1、 根据大小,找到合适的slabclass
2、 slabclass空闲列表中是否有空闲的item节点,如果有直接分配item,用于存储内容
3、 空闲列表没有空闲的item可以分配,会重新开辟一个slab(默认大小为1M)的内存块,然后切割slab并放入到空闲列表中,然后从空闲列表中获取节点
/* Public allocation entry point: forwards straight to do_item_alloc(),
 * which performs its own locking. */
item *item_alloc(char *key, size_t nkey, int flags, rel_time_t exptime, int nbytes) {
    return do_item_alloc(key, nkey, flags, exptime, nbytes);
}
// Allocates and initializes an item large enough for the given key, flags
// and value. Returns 0 (NULL) when no slab class can hold the item.
item *do_item_alloc(char *key, const size_t nkey, const unsigned int flags,
const rel_time_t exptime, const int nbytes) {
//... (code elided in this excerpt; `it`, `suffix` and `nsuffix` are declared here)
// total space needed: item header + key + suffix + value
size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
if (settings.use_cas) {
ntotal += sizeof(uint64_t);
}
// map the required size to the index of the matching slab class
unsigned int id = slabs_clsid(ntotal);
unsigned int hdr_id = 0;
if (id == 0)
return 0;
/* This is a large item. Allocate a header object now, lazily allocate
* chunks while reading the upload.
*/
if (ntotal > settings.slab_chunk_size_max) {
//... (code elided in this excerpt)
} else {
// pull one free item node from slabclass[id]
it = do_item_alloc_pull(ntotal, id);
}
//... (code elided; NOTE(review): presumably the elided code returns early
// when `it` is NULL — otherwise the dereference below would crash; confirm
// against the full source)
/* Refcount is seeded to 1 by slabs_alloc() */
it->next = it->prev = 0;
// LRU tiering: pick which LRU sub-list this item belongs to (the LRU
// mechanism is analyzed in a later section)
if (settings.temp_lru &&
exptime - current_time <= settings.temporary_ttl) {
id |= TEMP_LRU;
} else if (settings.lru_segmented) {
id |= HOT_LRU;
} else {
/* There is only COLD in compat-mode */
id |= COLD_LRU;
}
it->slabs_clsid = id;
//... (code elided in this excerpt)
return it;
}
// Fetches one free item of at least ntotal bytes from slab class `id`.
// The elided code implements LRU eviction retries when the class has no
// free memory (covered in a later section).
item *do_item_alloc_pull(const size_t ntotal, const unsigned int id) {
item *it = NULL;
int i;
// LRU eviction logic lives here (discussed later)
//... (code elided in this excerpt; `total_bytes` is declared here)
it = slabs_alloc(ntotal, id, &total_bytes, 0);
//... (code elided in this excerpt)
return it;
}
/* Thread-safe chunk allocation: serializes every slab operation behind the
 * global slabs_lock and delegates the real work to do_slabs_alloc().
 * Returns the allocated chunk, or NULL on failure. */
void *slabs_alloc(size_t size, unsigned int id, uint64_t *total_bytes,
                  unsigned int flags) {
    void *result;

    pthread_mutex_lock(&slabs_lock);
    result = do_slabs_alloc(size, id, total_bytes, flags);
    pthread_mutex_unlock(&slabs_lock);
    return result;
}
// Core allocator; caller must hold slabs_lock. Pops one chunk off the
// freelist of slab class `id`, first growing the class with a brand-new
// slab page when the freelist is empty. Returns NULL on failure.
static void *do_slabs_alloc(const size_t size, unsigned int id, uint64_t *total_bytes,
unsigned int flags) {
slabclass_t *p;
item *it = NULL;
//... (code elided in this excerpt; `ret` is declared here)
p = &slabclass[id];
//... (code elided in this excerpt)
/* fail unless we have space at the end of a recently allocated page,
we have something on our freelist, or we could allocate a new page */
// freelist is empty: allocate a new slab page, split it into chunks and
// add them to the freelist (unless the caller forbids new pages)
if (p->sl_curr == 0 && flags != SLABS_ALLOC_NO_NEWPAGE) {
// open a brand-new slab page for this class
do_slabs_newslab(id);
}
// freelist now has (or already had) a spare node
if (p->sl_curr != 0) {
/* return off our freelist */
// unlink the head node from the freelist and hand it out
it = (item *)p->slots;
p->slots = it->next;
if (it->next) it->next->prev = 0;
/* Kill flag and initialize refcount here for lock safety in slab
* mover's freeness detection. */
it->it_flags &= ~ITEM_SLABBED;
it->refcount = 1;
p->sl_curr--;
ret = (void *)it;
} else {
ret = NULL;
}
//... (code elided in this excerpt)
return ret;
}
4.item节点的释放
释放一个item节点,并不会free内存空间,而是将item节点归还到slabclass的空闲列表中
/* Thread-safe free: returns `ptr` to the freelist of slab class `id` under
 * slabs_lock. Memory is never handed back to the OS here. */
void slabs_free(void *ptr, size_t size, unsigned int id) {
pthread_mutex_lock(&slabs_lock);
// hand the node back to the slabclass freelist
do_slabs_free(ptr, size, id);
pthread_mutex_unlock(&slabs_lock);
}
// Core free path; caller must hold slabs_lock. "Freeing" pushes the chunk
// back onto the head of its class freelist rather than releasing memory.
static void do_slabs_free(void *ptr, const size_t size, unsigned int id) {
slabclass_t *p;
item *it;
//... (code elided in this excerpt)
p = &slabclass[id];
it = (item *)ptr;
if ((it->it_flags & ITEM_CHUNKED) == 0) {
//... (code elided in this excerpt)
// mark the node as sitting on a slab freelist
it->it_flags = ITEM_SLABBED;
it->slabs_clsid = 0;
// push the node onto the head of the doubly linked freelist
it->prev = 0;
it->next = p->slots;
if (it->next) it->next->prev = it;
p->slots = it;
p->sl_curr++;
//... (code elided in this excerpt)
} else {
// large items made of multiple chunks take a separate path
do_slabs_free_chunked(it, size);
}
return;
}