写作不易,转载请注明出处:
http://blog.csdn.net/wbin233/article/details/82320575 ,谢谢。
glusterfs IO缓存池主要包括三个数据结构:iobuf_pool,iobuf_arena,iobuf。
最小的io buffer单元
/*
 * struct iobuf - the smallest unit of I/O buffer.
 *
 * Each iobuf belongs to exactly one iobuf_arena and refers to a region of
 * that arena's memory through ->ptr.
 */
struct iobuf {
        /*
         * List linkage used to chain this iobuf with the other iobufs of
         * the same iobuf_arena.  The anonymous struct overlays the same
         * storage so the links can also be reached as ->next / ->prev.
         */
        union {
                struct list_head      list;
                struct {
                        struct iobuf *next;
                        struct iobuf *prev;
                };
        };

        /* The arena this iobuf belongs to. */
        struct iobuf_arena *iobuf_arena;

        /* Protects ->ptr and ->ref. */
        gf_lock_t           lock;

        /* Reference count: 0 == passive, >0 == active. */
        gf_atomic_t         ref;

        /* Start of the actual data area; assigned while the owning arena
         * initialises its iobufs. */
        void               *ptr;

        /* In case of stdalloc, this is the pointer that must be freed. */
        void               *free_ptr;
};
iobuf_arena:包含了多个iobuf
/*
 * struct iobuf_arena - a group of equally sized iobufs carved out of one
 * contiguous memory region.
 */
struct iobuf_arena {
        /*
         * List linkage used to chain this arena with other arenas of the
         * same iobuf_pool.  The anonymous struct overlays the same storage
         * so the links can also be reached as ->next / ->prev.
         */
        union {
                struct list_head            list;
                struct {
                        struct iobuf_arena *next;
                        struct iobuf_arena *prev;
                };
        };

        /* Linkage into iobuf_pool->all_arenas. */
        struct list_head   all_list;

        /* Size of a single iobuf in this arena (one of the page sizes
         * listed in gf_iobuf_init_config). */
        size_t             page_size;

        /* Total buffer space of this arena: page_size * page_count. */
        size_t             arena_size;

        /* Number of iobufs held by this arena. */
        size_t             page_count;

        /* The pool this arena belongs to. */
        struct iobuf_pool *iobuf_pool;

        /* Base address of the arena's memory; obtained with mmap() when
         * the arena is created. */
        void              *mem_base;

        /* Array of page_count iobuf descriptors, allocated when the
         * arena's iobufs are initialised. */
        struct iobuf      *iobufs;

        int                active_cnt;
        struct iobuf       active;      /* head node iobuf (unused by itself) */

        int                passive_cnt;
        struct iobuf       passive;     /* head node iobuf (unused by itself) */

        uint64_t           alloc_cnt;   /* total allocs in this pool */
        int                max_active;  /* max active buffers at a given time */
};      /* the terminating ';' was missing in the original listing */
相应的,iobuf_pool中则包含了多个iobuf_arena
/*
 * struct iobuf_pool - top-level container holding many iobuf_arenas,
 * bucketed by the page size of the iobufs they provide.
 */
struct iobuf_pool {
        pthread_mutex_t   mutex;

        /* Total buffer space of the pool, accumulated from the arenas
         * created for it. */
        size_t            arena_size;

        /* Default size of an iobuf. */
        size_t            default_page_size;

        /* Number of iobuf_arenas contained in the pool. */
        int               arena_cnt;

        /* Links every iobuf_arena of the pool. */
        struct list_head  all_arenas;

        /*
         * Arenas are sorted into "buckets" according to their page_size
         * (the size of the iobufs they hand out).  Each array element is a
         * list of arenas that all share the same page_size.
         */
        struct list_head  arenas[GF_VARIABLE_IOBUF_COUNT];

        /* Same layout as arenas[], but holding arenas that have no free
         * iobuf left to hand out. */
        struct list_head  filled[GF_VARIABLE_IOBUF_COUNT];

        /* Same layout as arenas[], but holding arenas that may be purged. */
        struct list_head  purge[GF_VARIABLE_IOBUF_COUNT];

        /*
         * Number of request misses.  Per the original author's note: the
         * selectable iobuf sizes come from gf_iobuf_init_config and top out
         * at 1M; a request larger than that bumps this counter and has to be
         * served with standard calloc().
         */
        uint64_t          request_misses;

        int               rdma_device_count;
        struct list_head *mr_list[GF_RDMA_DEVICE_COUNT];
        void             *device[GF_RDMA_DEVICE_COUNT];
        int (*rdma_registration)(void **, void*);
        int (*rdma_deregistration)(struct list_head**, struct iobuf_arena *);
};
/*
 * struct iobref - a reference-counted collection of iobuf pointers.
 */
struct iobref {
        gf_lock_t      lock;
        gf_atomic_t    ref;

        /* Array of iobuf pointers. */
        struct iobuf **iobrefs;

        /* Number of slots allocated (via GF_CALLOC) in iobrefs. */
        int            alloced;

        /* Number of slots in iobrefs that are already in use. */
        int            used;
};
上文多次提到的iobuf分配大小可选项,对应下面gf_iobuf_init_config中的pagesize,而num_pages则表示相应的iobuf_arena包含的iobuf个数。比如page_size为128的iobuf_arena,会分配1024个这样大小的iobuf。
/*
 * Table of the arena flavours created for a new pool.  Each row gives the
 * size of one iobuf (pagesize) and how many iobufs of that size a single
 * arena holds (num_pages).
 */
struct iobuf_init_config gf_iobuf_init_config[] = {
        /* { pagesize,        num_pages } */
        {128,              1024},       /* 128 B  */
        {512,              512},        /* 512 B  */
        {2 * 1024,         512},        /* 2 KiB  */
        {8 * 1024,         128},        /* 8 KiB  */
        {32 * 1024,        64},         /* 32 KiB */
        {128 * 1024,       32},         /* 128 KiB */
        {256 * 1024,       8},          /* 256 KiB */
        {1 * 1024 * 1024,  2},          /* 1 MiB  */
};
比较粗糙地画了个glusterfs IO缓存池的大致结构,如下。
OK,了解完基本数据结构后,分析其具体实现。
首先是iobuf_pool的初始化函数。
/*
 * iobuf_pool_new - allocate and initialise a new iobuf pool.
 *
 * Zero-allocates the pool, initialises its mutex and list heads, clears the
 * RDMA hooks, then creates one arena for every entry of
 * gf_iobuf_init_config plus a stdalloc arena for larger requests.
 *
 * Returns the new pool, or NULL if the pool structure itself could not be
 * allocated.
 */
struct iobuf_pool *
iobuf_pool_new (void)
{
        struct iobuf_pool  *iobuf_pool = NULL;
        int                 i          = 0;
        size_t              page_size  = 0;
        size_t              arena_size = 0;
        int32_t             num_pages  = 0;

        iobuf_pool = GF_CALLOC (sizeof (*iobuf_pool), 1,
                                gf_common_mt_iobuf_pool);
        if (!iobuf_pool)
                goto out;

        INIT_LIST_HEAD (&iobuf_pool->all_arenas);
        pthread_mutex_init (&iobuf_pool->mutex, NULL);

        /* "<=" is intentional: one more bucket than there are config
         * entries is initialised here. */
        for (i = 0; i <= IOBUF_ARENA_MAX_INDEX; i++) {
                INIT_LIST_HEAD (&iobuf_pool->arenas[i]);
                INIT_LIST_HEAD (&iobuf_pool->filled[i]);
                INIT_LIST_HEAD (&iobuf_pool->purge[i]);
        }

        iobuf_pool->default_page_size = 128 * GF_UNIT_KB;

        iobuf_pool->rdma_registration = NULL;
        iobuf_pool->rdma_deregistration = NULL;

        for (i = 0; i < GF_RDMA_DEVICE_COUNT; i++) {
                iobuf_pool->device[i] = NULL;
                iobuf_pool->mr_list[i] = NULL;
        }

        /* Add one arena per gf_iobuf_init_config entry (per the original
         * author's note, IOBUF_ARENA_MAX_INDEX is the size of that array). */
        for (i = 0; i < IOBUF_ARENA_MAX_INDEX; i++) {
                page_size = gf_iobuf_init_config[i].pagesize;
                num_pages = gf_iobuf_init_config[i].num_pages;

                /* Only account for arenas that were really created, so
                 * that arena_size does not overstate the pool's memory
                 * when an arena could not be added. */
                if (iobuf_pool_add_arena (iobuf_pool, page_size, num_pages))
                        arena_size += page_size * num_pages;
        }

        /* Need an arena to handle all the bigger iobuf requests. */
        iobuf_create_stdalloc_arena (iobuf_pool);

        iobuf_pool->arena_size = arena_size;
out:
        return iobuf_pool;
}
主要就是对iobuf_pool中的几个参数和各个链表进行初始化,然后根据gf_iobuf_init_config中设定的几组pagesize和num_pages,添加相应的arena,并链接到iobuf_pool->arenas[index]中。
因此比较关键的就是添加arena的函数iobuf_pool_add_arena (iobuf_pool, page_size, num_pages)
/*
 * iobuf_pool_add_arena - add an arena to the pool while holding the pool
 * mutex.
 *
 * @iobuf_pool: pool that receives the arena
 * @page_size:  requested size of each iobuf in the arena
 * @num_pages:  number of iobufs the arena should contain
 *
 * Returns whatever __iobuf_pool_add_arena() returns, or NULL when
 * iobuf_pool does not pass validation.
 */
struct iobuf_arena *
iobuf_pool_add_arena (struct iobuf_pool *iobuf_pool, size_t page_size,
                      int32_t num_pages)
{
        struct iobuf_arena *added = NULL;

        GF_VALIDATE_OR_GOTO ("iobuf", iobuf_pool, out);

        /* The real work is done by the unlocked helper; this wrapper only
         * takes care of serialising access to the pool. */
        pthread_mutex_lock (&iobuf_pool->mutex);
        added = __iobuf_pool_add_arena (iobuf_pool, page_size, num_pages);
        pthread_mutex_unlock (&iobuf_pool->mutex);

out:
        return added;
}
/*
 * __iobuf_pool_add_arena - internal implementation of
 * iobuf_pool_add_arena(); it takes no lock itself.
 *
 * @iobuf_pool: pool that receives the arena
 * @page_size:  requested size of each iobuf in the arena
 * @num_pages:  number of iobufs the arena should contain
 *
 * Returns the arena that was linked into iobuf_pool->arenas[], or NULL if
 * page_size maps to no bucket or no arena could be obtained.
 */
struct iobuf_arena *
__iobuf_pool_add_arena (struct iobuf_pool *iobuf_pool, size_t page_size,
                        int32_t num_pages)
{
        struct iobuf_arena *arena  = NULL;
        /* Bucket for this page_size; presumably the first
         * gf_iobuf_init_config entry large enough to hold it. */
        int                 bucket = gf_iobuf_get_arena_index (page_size);

        if (bucket == -1) {
                gf_msg ("iobuf", GF_LOG_ERROR, 0, LG_MSG_PAGE_SIZE_EXCEEDED,
                        "page_size (%zu) of iobufs in arena being added is "
                        "greater than max available", page_size);
                return NULL;
        }

        /* Prefer recycling an arena from iobuf_pool->purge so nothing new
         * has to be allocated; fall back to a fresh allocation otherwise. */
        arena = __iobuf_arena_unprune (iobuf_pool, page_size);
        if (arena == NULL)
                arena = __iobuf_arena_alloc (iobuf_pool, page_size,
                                             num_pages);

        if (arena == NULL) {
                gf_msg (THIS->name, GF_LOG_WARNING, 0, LG_MSG_ARENA_NOT_FOUND,
                        "arena not found");
                return NULL;
        }

        /* Link the arena into the bucket that matches its page size. */
        list_add (&arena->list, &iobuf_pool->arenas[bucket]);

        return arena;
}
获取iobuf_arena时,优先从iobuf_pool->purge[index]中查找是否有可复用的iobuf_arena:如果有则直接复用,无需重新申请分配;否则需要重新申请分配一个新的iobuf_arena。
/* 从iobuf_pool->purge[index]中查找是否有iobuf_arena */
struct iobuf_arena *
__iobuf_arena_unprune (struct iobuf_pool *iobuf_pool, size_t page_size)
{
struct iobuf_arena *iobuf_arena = NULL;
struct iobuf_arena *tmp = NULL;
int index = 0;
GF_VALIDATE_OR_GOTO ("iobuf", iobuf_pool, out);
index = gf_iobuf_get_arena_index (page_size);
if (index == -1) {
gf_msg ("iobuf", GF_LOG_ERROR, 0, LG_MSG_PAGE_SIZE_EXCEEDED,
"page_size (%zu) of iobufs in arena being added is "
"greater than max available", page_size);
return NULL;
}
list_for_each_entry (tmp, &iobuf_pool->purge[index], list) {
list_del_init (&tmp->list);
iobuf_arena = tmp;
break;
}
out:
return iobuf_arena;
}
/*
 * __iobuf_arena_alloc - allocate a new arena for the given page size and
 * iobuf count, and link it into iobuf_pool->all_arenas.
 *
 * @iobuf_pool: pool the arena will belong to
 * @page_size:  requested iobuf size; rounded through
 *              gf_iobuf_get_pagesize() before use
 * @num_iobufs: number of iobufs the arena should contain
 *
 * Returns the new arena, or NULL on failure.  On failure after the initial
 * validation the partially built arena is handed to
 * __iobuf_arena_destroy().
 */
struct iobuf_arena *
__iobuf_arena_alloc (struct iobuf_pool *iobuf_pool, size_t page_size,
                     int32_t num_iobufs)
{
        struct iobuf_arena *arena     = NULL;
        size_t              slot_size = 0;

        GF_VALIDATE_OR_GOTO ("iobuf", iobuf_pool, out);

        arena = GF_CALLOC (sizeof (*arena), 1, gf_common_mt_iobuf_arena);
        if (arena == NULL)
                goto err;

        /* Start with every list head empty. */
        INIT_LIST_HEAD (&arena->list);
        INIT_LIST_HEAD (&arena->all_list);
        INIT_LIST_HEAD (&arena->active.list);
        INIT_LIST_HEAD (&arena->passive.list);
        arena->iobuf_pool = iobuf_pool;

        /* Round the request to a configured page size (presumably the
         * first gf_iobuf_init_config size that can hold it). */
        slot_size = gf_iobuf_get_pagesize (page_size);

        arena->page_size  = slot_size;                  /* size of one iobuf */
        arena->page_count = num_iobufs;                 /* number of iobufs */
        arena->arena_size = slot_size * num_iobufs;     /* total bytes */

        /* Back the arena with an anonymous private mapping. */
        arena->mem_base = mmap (NULL, arena->arena_size,
                                PROT_READ|PROT_WRITE,
                                MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        if (MAP_FAILED == arena->mem_base) {
                gf_msg (THIS->name, GF_LOG_WARNING, 0, LG_MSG_MAPPING_FAILED,
                        "mapping failed");
                goto err;
        }

        /* Invoke the RDMA registration hook if the pool has one. */
        if (iobuf_pool->rdma_registration != NULL)
                iobuf_pool->rdma_registration (iobuf_pool->device, arena);

        list_add_tail (&arena->all_list, &iobuf_pool->all_arenas);

        /* Set up the individual iobufs; a NULL ->iobufs afterwards means
         * the initialisation did not succeed. */
        __iobuf_arena_init_iobufs (arena);
        if (arena->iobufs == NULL) {
                gf_msg (THIS->name, GF_LOG_ERROR, 0, LG_MSG_INIT_IOBUF_FAILED,
                        "init failed");
                goto err;
        }

        iobuf_pool->arena_cnt++;

        return arena;

err:
        __iobuf_arena_destroy (iobuf_pool, arena);

out:
        return NULL;
}
关键步骤为调用__iobuf_arena_init_iobufs来初始化iobuf_arena中的iobuf。
/*
 * __iobuf_arena_init_iobufs - create and initialise the iobuf descriptors
 * of an arena.
 *
 * @iobuf_arena: arena whose page_count, page_size and mem_base are already
 *               set
 *
 * Allocates page_count descriptors into iobuf_arena->iobufs, points each
 * one at its own page_size-sized slice of mem_base and puts it on the
 * arena's passive list.  On allocation failure iobuf_arena->iobufs is left
 * NULL, which is how the caller detects the error.
 */
void
__iobuf_arena_init_iobufs (struct iobuf_arena *iobuf_arena)
{
        int           iobuf_cnt = 0;
        struct iobuf *iobuf     = NULL;
        char         *base      = NULL;
        /* size_t, not int: the offset accumulates size_t page sizes and
         * must not truncate or overflow for large arenas. */
        size_t        offset    = 0;
        int           i         = 0;

        GF_VALIDATE_OR_GOTO ("iobuf", iobuf_arena, out);

        iobuf_cnt = iobuf_arena->page_count;

        iobuf_arena->iobufs = GF_CALLOC (sizeof (*iobuf), iobuf_cnt,
                                         gf_common_mt_iobuf);
        if (!iobuf_arena->iobufs)
                return;

        /* Byte-wise pointer arithmetic is done on a char pointer; doing it
         * on void * directly is a compiler extension. */
        base = iobuf_arena->mem_base;

        iobuf = iobuf_arena->iobufs;
        for (i = 0; i < iobuf_cnt; i++) {
                INIT_LIST_HEAD (&iobuf->list);
                LOCK_INIT (&iobuf->lock);

                iobuf->iobuf_arena = iobuf_arena;

                /* Data area of this iobuf: arena base plus running offset. */
                iobuf->ptr = base + offset;

                /* Every freshly created iobuf starts on the passive list. */
                list_add (&iobuf->list, &iobuf_arena->passive.list);
                iobuf_arena->passive_cnt++;

                /* Advance to the next slice. */
                offset += iobuf_arena->page_size;
                iobuf++;
        }

out:
        return;
}
其余的操作比如iobuf_get、iobuf_get2等大同小异,比较简单,暂时就不介绍啦,闲下来有空再补上,有需要的看源码即可。