以leveldb 1.20版本,源码解析,先看下leveldb中自定义的基本组件
字符串处理类,不采用std::string的原因是避免内存拷贝
class LEVELDB_EXPORT Slice {
private:
const char* data_;
size_t size_;
};
leveldb中的错误码处理,包含错误码和错误信息
class LEVELDB_EXPORT Status {
private:
enum Code {
kOk = 0,
kNotFound = 1,
kCorruption = 2,
kNotSupported = 3,
kInvalidArgument = 4,
kIOError = 5
};
// stats_的不同bit表示错误码和错误信息
const char* state_;
};
定长编解码:
void PutFixed32(std::string* dst, uint32_t value) {
char buf[sizeof(value)];
EncodeFixed32(buf, value);
dst->append(buf, sizeof(buf));
}
inline void EncodeFixed32(char* dst, uint32_t value) {
uint8_t* const buffer = reinterpret_cast<uint8_t*>(dst);
// Recent clang and gcc optimize this to a single mov / str instruction.
// 小端字节序
buffer[0] = static_cast<uint8_t>(value);
buffer[1] = static_cast<uint8_t>(value >> 8);
buffer[2] = static_cast<uint8_t>(value >> 16);
buffer[3] = static_cast<uint8_t>(value >> 24);
}
inline uint32_t DecodeFixed32(const char* ptr) {
const uint8_t* const buffer = reinterpret_cast<const uint8_t*>(ptr);
// Recent clang and gcc optimize this to a single mov / ldr instruction.
return (static_cast<uint32_t>(buffer[0])) |
(static_cast<uint32_t>(buffer[1]) << 8) |
(static_cast<uint32_t>(buffer[2]) << 16) |
(static_cast<uint32_t>(buffer[3]) << 24);
}
不固定长度编解码,目的是减少空间占用,通过每个byte的最高bit来判断该byte是否有值,这种编码思想在protobuf中广泛使用
void PutVarint32(std::string* dst, uint32_t v) {
char buf[5];
char* ptr = EncodeVarint32(buf, v);
dst->append(buf, ptr - buf);
}
char* EncodeVarint32(char* dst, uint32_t v) {
// Operate on characters as unsigneds
uint8_t* ptr = reinterpret_cast<uint8_t*>(dst);
static const int B = 128;
if (v < (1 << 7)) {
*(ptr++) = v;
} else if (v < (1 << 14)) {
*(ptr++) = v | B;
*(ptr++) = v >> 7;
} else if (v < (1 << 21)) {
*(ptr++) = v | B;
*(ptr++) = (v >> 7) | B;
*(ptr++) = v >> 14;
} else if (v < (1 << 28)) {
*(ptr++) = v | B;
*(ptr++) = (v >> 7) | B;
*(ptr++) = (v >> 14) | B;
*(ptr++) = v >> 21;
} else {
*(ptr++) = v | B;
*(ptr++) = (v >> 7) | B;
*(ptr++) = (v >> 14) | B;
*(ptr++) = (v >> 21) | B;
*(ptr++) = v >> 28;
}
return reinterpret_cast<char*>(ptr);
}
const char* GetVarint32PtrFallback(const char* p, const char* limit,
uint32_t* value) {
uint32_t result = 0;
for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
uint32_t byte = *(reinterpret_cast<const uint8_t*>(p));
p++;
if (byte & 128) {
// More bytes are present
result |= ((byte & 127) << shift);
} else {
result |= (byte << shift);
*value = result;
return reinterpret_cast<const char*>(p);
}
}
return nullptr;
}
bool GetVarint32(Slice* input, uint32_t* value) {
const char* p = input->data();
const char* limit = p + input->size();
const char* q = GetVarint32Ptr(p, limit, value);
if (q == nullptr) {
return false;
} else {
*input = Slice(q, limit - q);
return true;
}
}
inline const char* GetVarint32Ptr(const char* p, const char* limit,
uint32_t* value) {
if (p < limit) {
uint32_t result = *(reinterpret_cast<const uint8_t*>(p));
if ((result & 128) == 0) {
// 先尝试进行简单的<128判断
*value = result;
return p + 1;
}
}
return GetVarint32PtrFallback(p, limit, value);
}
leveldb中的内存池,用来给memtable分配内存,当memtable转化为immemtable然后Minor compaction形成sstable之后,这个memtable的Arena内存被统一回收:
// 为memtable服务的内存池,每次申请一大块内存,然后给memtable分配小块内存
// 当memtable析构时,随着memtble一起释放内存
class Arena {
public:
Arena();
Arena(const Arena&) = delete;
Arena& operator=(const Arena&) = delete;
~Arena();
// Return a pointer to a newly allocated memory block of "bytes" bytes.
char* Allocate(size_t bytes);
// Allocate memory with the normal alignment guarantees provided by malloc.
char* AllocateAligned(size_t bytes);
// Returns an estimate of the total memory usage of data allocated
// by the arena.
size_t MemoryUsage() const {
return memory_usage_.load(std::memory_order_relaxed);
}
private:
char* AllocateFallback(size_t bytes);
char* AllocateNewBlock(size_t block_bytes);
// Allocation state
char* alloc_ptr_; // 当前内存块的分配指针
size_t alloc_bytes_remaining_; // 当前内存块的剩余大小
// Array of new[] allocated memory blocks
std::vector<char*> blocks_; // 每个Arena由多个内存块组成
// Total memory usage of the arena.
//
// TODO(costan): This member is accessed via atomics, but the others are
// accessed without any locking. Is this OK?
std::atomic<size_t> memory_usage_; // Arena的内存占用
};
inline char* Arena::Allocate(size_t bytes) {
// The semantics of what to return are a bit messy if we allow
// 0-byte allocations, so we disallow them here (we don't need
// them for our internal use).
assert(bytes > 0);
// 当前的内存块中还有空间
if (bytes <= alloc_bytes_remaining_) {
char* result = alloc_ptr_;
alloc_ptr_ += bytes;
alloc_bytes_remaining_ -= bytes;
return result;
}
return AllocateFallback(bytes);
}
Arena::~Arena() {
// 析构删除每个内存块
for (size_t i = 0; i < blocks_.size(); i++) {
delete[] blocks_[i];
}
}
char* Arena::AllocateFallback(size_t bytes) {
if (bytes > kBlockSize / 4) {
// 大约1/4直接申请,避免内存碎片
// Object is more than a quarter of our block size. Allocate it separately
// to avoid wasting too much space in leftover bytes.
char* result = AllocateNewBlock(bytes);
return result;
}
// We waste the remaining space in the current block.
// 申请新的block,之前的block的剩余空间会浪费掉,但是目的是为了快
alloc_ptr_ = AllocateNewBlock(kBlockSize);
alloc_bytes_remaining_ = kBlockSize;
char* result = alloc_ptr_;
alloc_ptr_ += bytes;
alloc_bytes_remaining_ -= bytes;
return result;
}
// 对齐方式的内存分配,即分配的内存开始空间时地址大小的整倍数
char* Arena::AllocateAligned(size_t bytes) {
const int align = (sizeof(void*) > 8) ? sizeof(void*) : 8;
static_assert((align & (align - 1)) == 0,
"Pointer size should be a power of 2");
size_t current_mod = reinterpret_cast<uintptr_t>(alloc_ptr_) & (align - 1);
// 计算需要填充的数据量
size_t slop = (current_mod == 0 ? 0 : align - current_mod);
size_t needed = bytes + slop;
char* result;
if (needed <= alloc_bytes_remaining_) {
result = alloc_ptr_ + slop;
alloc_ptr_ += needed;
alloc_bytes_remaining_ -= needed;
} else {
// AllocateFallback always returned aligned memory
result = AllocateFallback(bytes);
}
assert((reinterpret_cast<uintptr_t>(result) & (align - 1)) == 0);
return result;
}
memtable通过skiplist实现KV对的有序排列,代码太多,看下Insert过程,其他过程类似:
template <typename Key, class Comparator>
void SkipList<Key, Comparator>::Insert(const Key& key) {
// TODO(opt): We can use a barrier-free variant of FindGreaterOrEqual()
// here since Insert() is externally synchronized.
Node* prev[kMaxHeight];
// 找到>=key的一系列Node
Node* x = FindGreaterOrEqual(key, prev);
// Our data structure does not allow duplicate insertion
assert(x == nullptr || !Equal(key, x->key));
int height = RandomHeight();
if (height > GetMaxHeight()) {
for (int i = GetMaxHeight(); i < height; i++) {
prev[i] = head_;
}
// It is ok to mutate max_height_ without any synchronization
// with concurrent readers. A concurrent reader that observes
// the new value of max_height_ will see either the old value of
// new level pointers from head_ (nullptr), or a new value set in
// the loop below. In the former case the reader will
// immediately drop to the next level since nullptr sorts after all
// keys. In the latter case the reader will use the new node.
max_height_.store(height, std::memory_order_relaxed);
}
// 构建Node并插入
x = NewNode(key, height);
for (int i = 0; i < height; i++) {
// NoBarrier_SetNext() suffices since we will add a barrier when
// we publish a pointer to "x" in prev[i].
x->NoBarrier_SetNext(i, prev[i]->NoBarrier_Next(i));
prev[i]->SetNext(i, x);
}
}
emplate <typename Key, class Comparator>
typename SkipList<Key, Comparator>::Node*
SkipList<Key, Comparator>::FindGreaterOrEqual(const Key& key,
Node** prev) const {
Node* x = head_;
int level = GetMaxHeight() - 1;
while (true) {
Node* next = x->Next(level);
// 在每一层的Node上比较大小
if (KeyIsAfterNode(key, next)) {
// Keep searching in this list
x = next;
} else {
if (prev != nullptr) prev[level] = x;
// 达到最下面一层
if (level == 0) {
return next;
} else {
// Switch to next list
level--;
}
}
}
}
leveldb为了支持跨平台,抽象出ENV这个接口类,封装了对文件系统的操作,有三个继承者:
1、PosixENV:linux系统下的文件系统操作
2、WindowsENV:Windows系统下的文件系统操作
3、ENVWrapper:通过用户传入,让用户自定义
leveldb中用cache来加快寻找sstable和KV对,cache的淘汰策略是LRU
DBImpl中有TableCache来缓存sstable的句柄,key是sstable文件的fie_num,value是打开并解析到内存中的sstable句柄
sstable中有block_cache来缓存Block的句柄,key是该Block在sstable中的offset,value是解析完成的Block
cache通过list+hash表实现的一个LRUCache,有两个注意点:
1、当value从cache中淘汰时,有可能还在被外部使用,所以不能立即释放,通过引用计数方式释放内存
2、LRUCache是线程安全的,为了提供性能,把一个大cache继续hash到一个个小的LRUCache,也就是ShardedLRUCache