# YDB is a key-value storage library.
# Simple API, only 6 methods (open, close, sync, get, set, del).
# The data files are append-only (aka: log or journal).
# As the data files are immutable it's rsync-friendly (though index is mutable, but should be reasonably small)
# Index is stored in the memory and rarely dumped to the disk.
# As the index is in memory, GET requests need at most one disk-seek.
# SET request requires at most one disk-seek to write the data. On average the cost is 0 disk seeks, due to append-only file structure and operating system write caches.
# It's spinning-disk friendly: it's optimized to take advantage of the fast sequential access to the disk.
# It's flash-disk friendly: data is never modified, so there's no need to read-clear-write flash sectors.
# Once in a while the garbage-collector is started to remove old log files to save disk space. This can slow down the db up to two times.
# Simple API, only 6 methods (open, close, sync, get, set, del).
# The data files are append-only (aka: log or journal).
# As the data files are immutable it's rsync-friendly (though index is mutable, but should be reasonably small)
# Index is stored in the memory and rarely dumped to the disk.
# As the index is in memory, GET requests need at most one disk-seek.
# SET request requires at most one disk-seek to write the data. On average the cost is 0 disk seeks, due to append-only file structure and operating system write caches.
# It's spinning-disk friendly: it's optimized to take advantage of the fast sequential access to the disk.
# It's flash-disk friendly: data is never modified, so there's no need to read-clear-write flash sectors.
# Once in a while the garbage-collector is started to remove old log files to save disk space. This can slow down the db up to two times.
struct db { u32 magic; ///这个就是你的数据以及index的存储文件的目录。 char *top_dir; ///tree用来表示index。 struct tree tree; ///保存了当前的所有的数据文件的信息。 struct loglist loglist; /// int overcommit_ratio; int flags; ///下面这几个是用来gc。 int gc_enabled; int gc_running; int gc_finished; pthread_t gc_thread; pthread_mutex_t lock; /**/ };
struct tree { char *fname; /* Base name of index file. */ ///这里是为了防止索引文件被破坏,因此会有个备份。 char *fname_new; /* Name of new index file, *.new */ char *fname_old; /* Name of previous index, *.old */ ///红黑树的根结点。 struct rb_root root; ///表示最后一次提交,也就是写入的数据文件的number以及在当前文件的偏移。 int commited_last_record_logno; u64 commited_last_record_offset; ///和上面的区别就是这里的两个值是没有提交的。 int last_record_logno; u64 last_record_offset; ///key的个数和大小。 u64 key_counter; u64 key_bytes; /* used to store keys (incl header and padding) */ u64 value_bytes; /* used to store values (incl header and padding) */ ///这个域包含了每个数据文件的引用计数(也就是每个数据文件所包含的元数据的个数) r_arr refcnt; };
struct item { struct rb_node node; ///当前key对应的数据所在的文件的number。 int logno; ///当前key所对应的数据所在的文件的偏移 u64 value_offset; ///数据的大小。 u32 value_sz; ///key的大小以及key的值。 u16 key_sz; char key[]; };
/*
 * Record layout used when reading entries of the index file; each entry is
 * validated before it is inserted into the in-memory tree.
 *
 * Member order is part of the layout and must not be changed.
 */
struct index_item {
    /* NOTE(review): presumably the fields the validity check relies on - confirm. */
    u32 magic;
    u32 checksum;

    /* Same meaning as the identically named members of struct item. */
    int logno;
    u64 value_offset;
    u16 key_sz;
    u32 value_sz;

    /* Key bytes (flexible array member). */
    char key[];
};
struct loglist { ///这里包含了log(也就是数据文件)的数组。 r_arr logs; char *top_dir; char *unlink_base; ///当前最新的一个文件的number,以及fd。(由于ydb的结构类似log,因此每次写都是写在最新的那个文件) int write_logno; int write_fd; u64 min_log_size; u64 total_bytes; /* total size of all logs */ u64 appended_bytes; }
/*
 * A single data file (log).
 */
struct log {
    int fd;          /* descriptor of the file */
    char *fname;     /* name of the file */
    u64 file_size;   /* size of the file */
};
#define YDB_CREAT (0x01) #define YDB_RDONLY (0x02) #define YDB_GCDISABLE (0x04) YDB ydb_open(char *top_dir, int overcommit_ratio, unsigned long long min_log_size, int flags)
struct db *db = (struct db *)zmalloc(sizeof(struct db)); db->magic = YDB_STRUCT_MAGIC; db->top_dir = strdup(top_dir); db->overcommit_ratio = overcommit_ratio; db->lock = (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; db->flags = flags;
char buf[256]; snprintf(buf, sizeof(buf), "%s%s%s", top_dir, PATH_DELIMITER, "index.ydb"); int logno = 0; u64 record_offset = 0; if(tree_open(&db->tree, buf, &logno, &record_offset, flags) < 0) { if(!(flags & YDB_CREAT)) { log_error("Failed to load index file from %s", top_dir); /* TODO: memleaks here */ return(NULL); } } int r = loglist_open(&db->loglist, top_dir, min_log_size, max_descriptors()); if(r < 0){ /* TODO: memleaks here */ return(NULL); }
int record_sz = END_OF_FILE; while(1) { if(record_sz == END_OF_FILE) { /* find an edge */ ///得到当前的数据文件。 struct log *log = slot_get(&db->loglist, logno); 。。。。。。。。。。。。。。。。。。。。。。。。。。。。。 char key[MAX_KEY_SIZE]; u16 key_sz = MAX_KEY_SIZE-1; int flags; u64 value_offset; u32 value_sz; ///其中flags是元数据的属性,因为在ydb中的删除是标记删除的,因此当flags为FLAG_DELETE时,我们需要从索引树中删除这条索引。 record_sz = loglist_get_record(&db->loglist, logno, record_offset, key, &key_sz, &value_offset, &value_sz, &flags); ///这个表示一个数据文件已经遍历结束,该遍历下一个文件。 if(record_sz == END_OF_FILE) { logno++; record_offset = 0; continue; } ///文件遍历完毕。 if(record_sz == NO_MORE_DATA) break; if(FLAG_DELETE & flags) { ///删除索引 tree_del(&db->tree, key, key_sz, logno, record_offset); }else{ tree_add(&db->tree, key, key_sz, logno, value_offset, value_sz, record_offset); } record_offset += record_sz; }
int start = MIN(rarr_min(db->tree.refcnt), rarr_min(db->loglist.logs)); int stop = MAX(rarr_max(db->tree.refcnt), rarr_max(db->loglist.logs)); for(logno=start; lognotree, logno); ///表示当前的数据文件。 struct log *log = slot_get(&db->loglist, logno); if(refcnt == 0 && log == NULL) continue; if(refcnt && log) continue; if(refcnt) { log_error("Log %i(0x%x) used, but file is not loaded!", logno, logno); log_error("Sorry to say but you'd lost %i key-values.", refcnt); log_error("Closing db"); /* we're in inconsistent state */ db_close(db, 0); return(NULL); } if(log) continue; } return db; }
1 首先通过mmap映射index.ydb到内存。
2 然后通过index_item结构来存取每个索引,并进行合法性校验。
3 最后将通过校验的item插入到红黑树。
/*
 * Prepare `tree` for the index file `fname` and load the index from it.
 *
 * Stores the index file name together with its "<fname>.old" and
 * "<fname>.new" companions, creates the reference-count array, resets the
 * red-black tree root and finally delegates to tree_load_index().
 *
 * last_record_logno / last_record_offset and flags are passed through to
 * tree_load_index() untouched.
 *
 * Returns the result of tree_load_index().
 */
int tree_open(struct tree *tree, char *fname, int *last_record_logno,
              u64 *last_record_offset, int flags)
{
    char buf[256];

    tree->fname = strdup(fname);
    tree->refcnt = rarr_new();

    /* Keep backup names around so a lost index file can be survived. */
    snprintf(buf, sizeof(buf), "%s.old", fname);
    tree->fname_old = strdup(buf);
    snprintf(buf, sizeof(buf), "%s.new", fname);
    tree->fname_new = strdup(buf);

    /* Initialise the root of the red-black tree. */
    tree->root = RB_ROOT;

    /* Load the index file into the red-black tree. */
    return tree_load_index(tree, last_record_logno, last_record_offset, flags);
}
1 搜索top_dir目录,找到所有的数据文件。
2 然后对找到的每个数据文件调用log_open生成log,并插入到logs数组。
3 最后打开最后一个数据文件(也就是将要写的那个文件),并将它的相关属性赋值给loglist.
/*
 * Discover and open every data file in `top_dir`, then open the newest one
 * for writing.
 *
 * When no data file is found, log number 0 is created first.
 *
 * Returns 0 on success, -1 when the initial log cannot be created or the
 * writer cannot be opened. A data file that fails to open is reported and
 * skipped.
 */
int loglist_open(struct loglist *llist, char *top_dir, u64 min_log_size,
                 int max_descriptors)
{
    llist->min_log_size = min_log_size;
    llist->logs = rarr_new();
    llist->top_dir = strdup(top_dir);

    char unlink_base[256];
    snprintf(unlink_base, sizeof(unlink_base), "%s%s%s%s.old",
             top_dir, PATH_DELIMITER, DATA_FNAME, DATA_EXT);
    llist->unlink_base = strdup(unlink_base);

    /*
     * ... elided in the excerpt: this is where glob_str, globbuf, off,
     * prefix_len, suffix_len and max_logno are set up ...
     */

    /* Load data files */
    /* glob() collects the files matching the glob_str pattern. */
    glob(glob_str, 0, NULL, &globbuf);
    for(off=globbuf.gl_pathv; off && *off; off++) {
        /* Derive the log number from the file name. */
        int logno = logno_from_fname(*off, prefix_len, suffix_len);
        log_info("Opening log: %s (%04i)", *off, logno);

        /* Create the log and insert it into the array. */
        if(log_open(llist, logno) < 0) {
            log_error("Unable to open log %5i/0x%04x", logno, logno);
            continue;
        }
        max_logno = MAX(max_logno, logno);
    }
    globfree(&globbuf);

    if(max_logno < 0) {
        /* empty directory yet*/
        if(log_create(llist, 0) < 0)
            return(-1);
        max_logno = 0;
    }

    /* Open the writer on the newest log. */
    if(log_open_writer(llist, max_logno) < 0)
        return(-1);

    return(0);
}
/*
 * Retrieve a value for selected key.
 *
 *   ydb     - database handle, as returned by ydb_open().
 *   key     - key bytes; key_sz is their length.
 *   buf     - caller-supplied buffer of buf_sz bytes; presumably receives
 *             the value - confirm against the implementation.
 *
 * NOTE(review): the return-value convention is not visible in this excerpt.
 */
int ydb_get(YDB ydb, char *key, unsigned short key_sz, char *buf, unsigned int buf_sz);
/*
 * Add/modify a key.
 *
 *   ydb     - database handle, as returned by ydb_open().
 *   key     - key bytes; key_sz is their length.
 *   value   - value bytes; value_sz is their length.
 *
 * NOTE(review): the return-value convention is not visible in this excerpt.
 */
int ydb_add(YDB ydb, char *key, unsigned short key_sz, char *value, unsigned int value_sz);
1 调用loglist_append,来将数据写入到对应的log文件,并更新loglist的对应域
2 调用tree_add,来新建或者修改一个item,并加入到索引树中。
/*
 * Store `value` under `key`.
 *
 * The record is first appended to the data files with FLAG_SET; the
 * position reported by the append is then recorded in the index tree.
 *
 * Always returns 1.
 */
int db_add(struct db *db, char *key, u16 key_sz, char *value, u32 value_sz)
{
    struct loglist *logs = &db->loglist;
    struct tree *index = &db->tree;

    /* TODO: error handling on write? */
    struct append_info where = loglist_append(logs, key, key_sz,
                                              value, value_sz, FLAG_SET);

    tree_add(index, key, key_sz,
             where.logno, where.value_offset, value_sz,
             where.record_offset);

    return 1;
}