上面一篇博文主要说了leveldb的理论原理,接下来将说明leveldb中如何去实现这些组件,主要是按照下面的思路进行的:首先分别分析LSM的各个组件,例如memtable,commit log,compaction实现,之后将这些功能串联起来分析一下leveldb的读写流程。这篇中主要是关于memtable的分析。
memtable常驻于内存,需要按照key进行排序,通常意义上的话,可以使用二叉查找树来实现,跟进一步可以使用红黑树保证树的平衡,但是leveldb中使用了另外的一种数据结构:跳表Skip List。
memtable声明在db/memtable.h中,定义如下:
class MemTable { public: // MemTables are reference counted. The initial reference count // is zero and the caller must call Ref() at least once. explicit MemTable(const InternalKeyComparator& comparator); // Increase reference count. void Ref() { ++refs_; } // Drop reference count. Delete if no more references exist. void Unref() { --refs_; assert(refs_ >= 0); if (refs_ <= 0) // 如果引用数为0,删除该对象 { delete this; } } // Returns an estimate of the number of bytes of data in use by this // data structure. // // REQUIRES: external synchronization to prevent simultaneous // operations on the same MemTable. // 返回使用的内存量 size_t ApproximateMemoryUsage(); // Return an iterator that yields the contents of the memtable. // // The caller must ensure that the underlying MemTable remains live // while the returned iterator is live. The keys returned by this // iterator are internal keys encoded by AppendInternalKey in the // db/format.{h,cc} module. // 返回迭代器,遍历该memtable Iterator* NewIterator(); // Add an entry into memtable that maps key to value at the // specified sequence number and with the specified type. // Typically value will be empty if type==kTypeDeletion. // 由于采用了LSM结构,所以没有数据删除,只有数据的添加,如果type= // kTypeDeletion,这时value为空 // sequence number:递增的序列,用于数据的恢复 void Add(SequenceNumber seq, ValueType type, const Slice& key, const Slice& value); // If memtable contains a value for key, store it in *value and return true. // If memtable contains a deletion for key, store a NotFound() error // in *status and return true. // Else, return false. // 查询该memtable bool Get(const LookupKey& key, std::string* value, Status* s); private: // 私有化析构函数,这样保证只有Unref()方法能够删除该对象 ~MemTable(); // Private since only Unref() should be used to delete it struct KeyComparator { const InternalKeyComparator comparator; explicit KeyComparator(const InternalKeyComparator& c) : comparator(c) { } int operator()(const char* a, const char* b) const; }; // 迭代器 friend class MemTableIterator; friend class MemTableBackwardIterator; // 跳表类型?作用 typedef SkipList<const char*, KeyComparator> Table; // key比较器 KeyComparator comparator_; // 对象被引用的数量,如果该值为0,则删除该对象 int refs_; // 内存区域的封装 Arena arena_; // 使用跳表数据结构保证内存数据按照key排序 Table table_; // No copying allowed MemTable(const MemTable&); void operator=(const MemTable&); };
初始化函数定义如下:
MemTable::MemTable(const InternalKeyComparator& cmp) : comparator_(cmp), refs_(0), table_(comparator_, &arena_) { }
void MemTable:Add(SequenceNumber s, ValueType type, const Slice& key, const Slice& value) { // Format of an entry is concatenation of: // key_size : varint32 of internal_key.size() // key bytes : char[internal_key.size()] // value_size : varint32 of value.size() // value bytes : char[value.size()] // 首先格式化kv数据,之后分别写入,格式如下: // key_size, key_bytes, sequence_number|type(固定64位), // value_size,value size_t key_size = key.size(); size_t val_size = value.size(); size_t internal_key_size = key_size + 8; const size_t encoded_len = VarintLength(internal_key_size) + internal_key_size + VarintLength(val_size) + val_size; char* buf = arena_.Allocate(encoded_len); char* p = EncodeVarint32(buf, internal_key_size); memcpy(p, key.data(), key_size); p += key_size; EncodeFixed64(p, (s << 8) | type); // (sequencenum << 8) | type p += 8; p = EncodeVarint32(p, val_size); memcpy(p, value.data(), val_size); assert((p + val_size) - buf == encoded_len); // 插入数据 table_.Insert(buf); }
查询的操作代码如下:
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s) { Slice memkey = key.memtable_key(); Table::Iterator iter(&table_); // 查找key iter.Seek(memkey.data()); if (iter.Valid()) { // entry format is: // klength varint32 // userkey char[klength] // tag uint64 // vlength varint32 // value char[vlength] // Check that it belongs to same user key. We do not check the // sequence number since the Seek() call above should have skipped // all entries with overly large sequence numbers. const char* entry = iter.key(); uint32_t key_length; const char* key_ptr = GetVarint32Ptr(entry, entry+5, &key_length); if (comparator_.comparator.user_comparator()->Compare( Slice(key_ptr, key_length - 8), key.user_key()) == 0) // memtable找到的key和用户查找的key相同 { // Correct user key const uint64_t tag = DecodeFixed64(key_ptr + key_length - 8); switch (static_cast<ValueType>(tag & 0xff)) // 屏蔽低8位 { case kTypeValue: // 如果是kTypeValue类型,表明找到 { Slice v = GetLengthPrefixedSlice(key_ptr + key_length); value->assign(v.data(), v.size()); return true; } case kTypeDeletion: // 如果是kTypeDeletion删除的数据 *s = Status::NotFound(Slice()); return true; } } } return false; }
// A helper class useful for DBImpl::Get() class LookupKey{ public: // Initialize *this for looking up user_key at a snapshot with // the specified sequence number. LookupKey(const Slice& user_key, SequenceNumber sequence); ~LookupKey(); // Return a key suitable for lookup in a MemTable. Slice memtable_key() const { return Slice(start_, end_ - start_); } // Return an internal key (suitable for passing to an internal iterator) Slice internal_key() const { return Slice(kstart_, end_ - kstart_); } // Return the user key Slice user_key() const { return Slice(kstart_, end_ - kstart_ - 8); } private: // We construct a char array of the form: // klength varint32 <-- start_ // userkey char[klength] <-- kstart_ // tag uint64 // <-- end_ // The array is a suitable MemTable key. // The suffix starting with "userkey" can be used as an InternalKey. const char* start_; const char* kstart_; const char* end_; char space_[200]; // Avoid allocation for short keys // No copying allowed LookupKey(const LookupKey&); void operator=(const LookupKey&); };
1. SkipList数据结构
2. Arena内存分配策略