RocksDB使用skiplist作为内存的基本数据结构,skiplist的介绍可以查看网上资料。
// Base class of the memtable representations (SkipListRep below derives from
// it). Excerpt: only the allocator member is shown.
class MemTableRep {
Allocator* allocator_; // memory allocator
};
// A skip-list node (excerpt). Key() returns the address just past next_[0],
// i.e. the key bytes are expected to start right behind the node.
struct InlineSkipList<Comparator>::Node {
// next_[0] is the lowest level link (level 0). Higher levels are
// stored _earlier_, so level 1 is at next_[-1].
std::atomic<Node*> next_[1];
// Returns a pointer to the key bytes, which begin at &next_[1].
const char* Key() const { return reinterpret_cast<const char*>(&next_[1]); } // key
};
// Per-level bracket (prev_[i], next_[i]) around a key position (excerpt).
struct InlineSkipList<Comparator>::Splice {
// The invariant of a Splice is that prev_[i+1].key <= prev_[i].key <
// next_[i].key <= next_[i+1].key for all i. That means that if a
// key is bracketed by prev_[i] and next_[i] then it is bracketed by
// all higher levels. It is _not_ required that prev_[i]->Next(i) ==
// next_[i] (it probably did at some point in the past, but intervening
// or concurrent operations might have inserted nodes in between).
// NOTE(review): presumably the number of levels for which prev_/next_ are
// valid -- confirm against the insertion code.
int height_ = 0;
// prev_[i]: lower bracket node on level i (see the invariant above).
Node** prev_;
// next_[i]: upper bracket node on level i (see the invariant above).
Node** next_;
};
// Skip list used by the memtable (excerpt showing only the data members).
class InlineSkipList {
Allocator* const allocator_; // memory allocator
Comparator const compare_; // key comparator
Node* const head_; // head
std::atomic<int> max_height_; // Height of the entire list
// Splice kept by the list itself. See the comments on Splice and Node.
// NOTE(review): the original note states that it is allocated by allocator_
// as one chunk of sizeof(Node)*max_height_ bytes and accessed through offsets
// of Node::next_ -- this is not visible in this excerpt; confirm against the
// allocation code.
Splice* seq_splice_;
};
// MemTableRep implementation that keeps its entries in an InlineSkipList
// (excerpt).
class SkipListRep : public MemTableRep {
InlineSkipList<const MemTableRep::KeyComparator&> skip_list_; // skip_list_ stores the encoded key/value entries
const MemTableRep::KeyComparator& cmp_; // key comparator
};
// In-memory table (excerpt): holds the key comparator and the MemTableRep
// that actually stores the data.
class MemTable {
// MemTableRep::KeyComparator that carries an InternalKeyComparator.
struct KeyComparator : public MemTableRep::KeyComparator {
const InternalKeyComparator comparator;
};
KeyComparator comparator_; // key comparator, used to order keys
std::unique_ptr<MemTableRep> table_; // the actual memtable storage
};
根据上面类的定义可以看出,memtable上面记录了table,table里面有InlineSkipList,SkipList由Node组成的多层链表构成,每个Node中包含了自己所有level的next_指针;Splice则记录了某个key在各个level上的前驱prev_和后继next_,用于定位插入位置。在不同level间切换使用Node中next_数组下标即可。
真正的key紧跟在Node之后,通过&next_[1]的地址即可直接访问。如下图所示(图片来自https://zhuanlan.zhihu.com/p/444460663)。
核心的代码是这一行。
// Inserts the entry whose key bytes start at `key` (excerpt; the rest of the
// body is elided).
bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
bool allow_partial_splice_fix) {
// Step back one Node from the key pointer to obtain the node itself; the key
// can then be reached again through &next_[1] (see Node::Key()).
Node* x = reinterpret_cast<Node*>(const_cast<char*>(key)) - 1;
...
}
由于跳表没有kv的概念,因此将key和value进行统一编码。SkipList中的key由如下元素依次构成:internal_key_size(varint32)、user key、seq_num和type(打包后共8字节)、value_size(varint32)、value。
key的编码逻辑如下:
// Encoding part of MemTable::Add (excerpt). The entry is written into `buf`
// in this order: varint32 internal_key_size | user key bytes |
// fixed64 packed(sequence, type) | varint32 val_size | value bytes.
Status MemTable::Add(SequenceNumber s, ValueType type,
const Slice& key, /* user key */
const Slice& value,
const ProtectionInfoKVOS64* kv_prot_info,
bool allow_concurrent,
MemTablePostProcessInfo* post_process_info, void** hint) {
...
char* p = EncodeVarint32(buf, internal_key_size); // write internal_key_size as a varint32 (byte-granular, no padding)
memcpy(p, key.data(), key_size); // copy the user key
p += key_size;
uint64_t packed = PackSequenceAndType(s, type); // pack seq_num and type into one 64-bit value
EncodeFixed64(p, packed);
p += 8; // seq_num and type occupy 8 bytes together
p = EncodeVarint32(p, val_size); // write val_size as a varint32 (byte-granular, no padding)
memcpy(p, value.data(), val_size); // copy the value
...
}
// Call site (taken from InlineSkipList::Insert): decode the encoded key before
// it is compared.
const DecodedKey key_decoded = compare_.decode_key(key);
// Decodes a memtable key by reading its length prefix and returning the
// length-prefixed payload.
virtual DecodedType decode_key(const char* key) const {
// The format of key is frozen and can be treated as a part of the API
// contract. Refer to MemTable::Add for details.
return GetLengthPrefixedSlice(key);
}
// Reads a varint32 length prefix at `data` and returns a Slice covering the
// bytes that follow it (the prefix itself is skipped).
inline Slice GetLengthPrefixedSlice(const char* data) {
  // We assume "data" is not corrupted, so the parse limit is simply data + 5:
  // each varint byte carries 7 payload bits, hence a 32-bit value needs at
  // most 5 bytes.
  const char* const limit = data + 5;
  uint32_t payload_len = 0;
  // Skip the size prefix written during encoding and fetch it into payload_len.
  const char* payload = GetVarint32Ptr(data, limit, &payload_len);
  return Slice(payload, payload_len);
}
key的比较在比较器InternalKeyComparator中实现,该比较器由db_impl逐层传递到InlineSkipList中。比较的核心函数如下:
// Three-way comparison of two parsed internal keys.
// Ordering rules, applied in turn:
//   1. user key ascending (according to the user-supplied comparator)
//   2. sequence number descending
//   3. type descending (though sequence# should be enough to disambiguate)
// Returns a negative value if a sorts before b, a positive value if a sorts
// after b, and 0 if all three components are equal.
int InternalKeyComparator::Compare(const ParsedInternalKey& a,
                                   const ParsedInternalKey& b) const {
  const int user_key_order = user_comparator_.Compare(a.user_key, b.user_key);
  if (user_key_order != 0) {
    return user_key_order;
  }
  // Same user key: the larger (newer) sequence number sorts first.
  if (a.sequence != b.sequence) {
    return a.sequence > b.sequence ? -1 : +1;
  }
  // Same sequence number as well: the larger type sorts first.
  if (a.type != b.type) {
    return a.type > b.type ? -1 : +1;
  }
  return 0;
}
根据前文的介绍,Insert代码流程如下:
// Insert flow of MemTable::Add (excerpt): encode the entry, insert it into the
// table, then add the key to the bloom filter.
Status MemTable::Add(SequenceNumber s, ValueType type,
const Slice& key, /* user key */
const Slice& value,
const ProtectionInfoKVOS64* kv_prot_info,
bool allow_concurrent,
MemTablePostProcessInfo* post_process_info, void** hint) {
// encode the key, see the previous section
...
// insert the key
bool res = table->InsertKey(handle);
// add the key to the bloom filter
bloom_filter_->Add(key_without_ts);
...
}
// Outline of InlineSkipList::Insert (excerpt): derive the node from the key
// pointer, decode the key, then insert by comparing keys (elided).
bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
bool allow_partial_splice_fix) {
// Obtain the Node in front of the key, so the key is reachable via next_[1]
Node* x = reinterpret_cast<Node*>(const_cast<char*>(key)) - 1;
// decode the key
const DecodedKey key_decoded = compare_.decode_key(key);
// find the position by comparing keys and insert the key there
...
}
InlineSkipList是支持并发写入的,并发插入通过CAS操作完成。代码如下:
// CAS-based linking step of InlineSkipList::Insert (excerpt; declarations such
// as `height`, `x` and `key_decoded`, and the return value, are elided).
bool InlineSkipList<Comparator>::Insert(const char* key, Splice* splice,
bool allow_partial_splice_fix) {
if (UseCAS) {
// Link the new node level by level, starting at level 0.
for (int i = 0; i < height; ++i) {
while (true) {
// compare keys
...
// Try to link x in with CAS; if it fails, the while loop compares and
// tries again.
if (splice->prev_[i]->CASNext(i, splice->next_[i], x)) {
// success
break;
}
// CAS failed: recompute prev_[i]/next_[i] for this level before retrying.
FindSpliceForLevel<false>(key_decoded, splice->prev_[i], nullptr, i,
&splice->prev_[i], &splice->next_[i]);
}
}
}
}
上文介绍SkipList中的key除了user key之外还包含seq_num和type,而get的时候用户只提供user key,没有seq_num,该如何读取呢?为此RocksDB引入了LookupKey。
根据上面key compare的章节可知,user key相同时seq_num越大排序越靠前,因此只要使用不小于已写入数据的seq_num进行查找,就能定位到该user key最新的数据。所以LookupKey在encode的时候使用的seq_num为最后一次写入成功的seq_num(快照读时使用快照的seq_num)。代码如下:
// Helper that builds the lookup form of a key from a user key and a sequence
// number (excerpt). The buffer layout is described in the comment below.
class LookupKey {
// Builds the encoded key for _user_key at the given sequence number.
LookupKey(const Slice& _user_key, SequenceNumber sequence);
// We construct a char array of the form:
// klength varint32 <-- start_
// userkey char[klength] <-- kstart_
// tag uint64
// <-- end_
// The array is a suitable MemTable key.
// The suffix starting with "userkey" can be used as an InternalKey.
const char* start_;
const char* kstart_;
const char* end_;
char space_[200]; // Avoid allocation for short keys
// Return an internal key (suitable for passing to an internal iterator)
Slice internal_key() const {
return Slice(kstart_, static_cast<size_t>(end_ - kstart_));
}
// Return the user key
Slice user_key() const {
// Drop the trailing 8-byte tag (uint64) from the [kstart_, end_) range.
return Slice(kstart_, static_cast<size_t>(end_ - kstart_ - 8));
}
};
// Point-lookup entry (excerpt): pick the sequence number to read at, build a
// LookupKey from it, then probe the memtable; the immutable-memtable and
// sstable lookups are elided.
// NOTE(review): the body refers to `get_impl_options`, `sv`, `timestamp`, `s`,
// `merge_context`, `max_covering_tombstone_seq` and `done`, none of which are
// declared in this excerpt, and `get_impl_options` is not in the parameter
// list shown -- the excerpt may mix two versions of the function; confirm.
Status DBImpl::GetImpl(const ReadOptions& read_options,
ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* pinnable_val, bool* value_found,
ReadCallback* callback, bool* is_blob_index) {
if (read_options.snapshot != nullptr) {
// Snapshot read: use the sequence number stored in the snapshot
snapshot =
reinterpret_cast<const SnapshotImpl*>(read_options.snapshot)->number_;
} else {
// Non-snapshot read: use the latest sequence number from versions_
// (last sequence or last published sequence, depending on the flag)
snapshot = last_seq_same_as_publish_seq_
? versions_->LastSequence()
: versions_->LastPublishedSequence();
}
// build the LookupKey
LookupKey lkey(key, snapshot);
// read from the memtable
if (sv->mem->Get(lkey, get_impl_options.value->GetSelf(), timestamp, &s,
&merge_context, &max_covering_tombstone_seq,
read_options, get_impl_options.callback,
get_impl_options.is_blob_index)) {
done = true;
get_impl_options.value->PinSelf();
RecordTick(stats_, MEMTABLE_HIT);
}
// read from the immutable memtables
...
// read from the sstables
...
}
// Looks up `key` in this memtable by delegating to GetFromTable (excerpt; the
// declarations of found_final_value / merge_in_progress and the return
// statement are elided).
bool MemTable::Get(const LookupKey& key, std::string* value,
std::string* timestamp, Status* s,
MergeContext* merge_context,
SequenceNumber* max_covering_tombstone_seq,
SequenceNumber* seq, const ReadOptions& read_opts,
ReadCallback* callback, bool* is_blob_index, bool do_merge) {
GetFromTable(key, *max_covering_tombstone_seq, do_merge, callback,
is_blob_index, value, timestamp, s, merge_context, seq,
&found_final_value, &merge_in_progress);
}
由于seq_num也在key中,查找到的key是否是我们真正想要的还需要额外判断:memtable使用Saver作为上下文,在回调函数SaveValue中进行处理。
// Fills a Saver with the lookup context and output pointers, runs the table
// lookup with SaveValue as the callback, then reports the sequence number
// recorded in the Saver through *seq (excerpt).
void MemTable::GetFromTable(const LookupKey& key,
SequenceNumber max_covering_tombstone_seq,
bool do_merge, ReadCallback* callback,
bool* is_blob_index, std::string* value,
std::string* timestamp, Status* s,
MergeContext* merge_context, SequenceNumber* seq,
bool* found_final_value, bool* merge_in_progress) {
Saver saver;
saver.status = s;
saver.found_final_value = found_final_value;
saver.merge_in_progress = merge_in_progress;
saver.key = &key;
saver.value = value;
...
// Look the key up in the skip list; the code is not listed in detail here
table_->Get(key, &saver, SaveValue);
*seq = saver.seq;
}
// Lookup callback invoked with a candidate entry (excerpt).
// NOTE(review): the declarations of `s`, `key_ptr`, `key_length` and `type`,
// and the `switch` statement that encloses the case labels below, are elided
// in this excerpt.
static bool SaveValue(void* arg, const char* entry) {
// First check whether the entry's user key equals the user key being looked
// up; if it does not, return directly (that path is elided here)
if (s->mem->GetInternalKeyComparator().user_comparator()->Equal(
Slice(key_ptr, key_length - 8), s->key->user_key())) {
case kTypeValue: {
if (s->inplace_update_support) {
s->mem->GetLock(s->key->user_key())->ReadLock();
}
// The value is stored length-prefixed right after the key bytes.
Slice v = GetLengthPrefixedSlice(key_ptr + key_length);
*(s->status) = Status::OK();
if (*(s->merge_in_progress)) {
...
} else if (s->value != nullptr) {
// Store the value in the saver so it can be returned to the user
s->value->assign(v.data(), v.size());
}
if (s->inplace_update_support) {
s->mem->GetLock(s->key->user_key())->ReadUnlock();
}
*(s->found_final_value) = true;
if (s->is_blob_index != nullptr) {
*(s->is_blob_index) = (type == kTypeBlobIndex);
}
return false;
}
case kTypeDeletion:
case kTypeSingleDeletion:
case kTypeRangeDeletion: {
if (*(s->merge_in_progress)) {
...
} else {
// A deletion entry with no merge in progress: report NotFound.
*(s->status) = Status::NotFound();
}
*(s->found_final_value) = true;
return false;
}
}
}
http://mysql.taobao.org/monthly/2018/11/05/
https://zhuanlan.zhihu.com/p/444460663