SSTable 是 LevelDB 最终落盘存储的文件,关于 SSTable 的详细格式可参见《SSTable存储结构说明》。本篇主要研读 SSTable 读写流程的代码。
写流程就是按照SSTable的格式去写,阅读起来并不是太复杂。
namespace leveldb {
// Holds all state of a TableBuilder; the builder owns one instance via rep_.
struct TableBuilder::Rep {
// Copies `opt` into both option sets and then forces the index block's
// restart interval to 1. A FilterBlockBuilder is allocated only when a
// filter policy is configured; otherwise filter_block stays nullptr.
Rep(const Options& opt, WritableFile* f)
: options(opt),
index_block_options(opt),
file(f),
offset(0),
data_block(&options),
index_block(&index_block_options),
num_entries(0),
closed(false),
filter_block(opt.filter_policy == nullptr
? nullptr
: new FilterBlockBuilder(opt.filter_policy)),
pending_index_entry(false) {
index_block_options.block_restart_interval = 1;
}
// Options used when writing data blocks (data_block points at this object).
Options options;
// Options used when writing the index block. Same as `options` except that
// block_restart_interval is forced to 1 by the constructor and by
// ChangeOptions().
Options index_block_options;
// Destination file; blocks and the footer are appended to it.
WritableFile* file;
// Number of bytes appended so far; used as the offset of the next block.
uint64_t offset;
// Result of the most recent file operation performed by the builder.
Status status;
// Builder for the data block currently being filled.
BlockBuilder data_block;
// Builder for the index block; receives one entry per flushed data block.
BlockBuilder index_block;
// Last key passed to Add(). It is rewritten in place when it is reused as
// the key of an index entry.
std::string last_key;
// Number of key/value pairs added so far.
int64_t num_entries;
bool closed; // Either Finish() or Abandon() has been called.
// Filter block builder, or nullptr when no filter policy is configured.
FilterBlockBuilder* filter_block;
// We do not emit the index entry for a block until we have seen the
// first key for the next data block. This allows us to use shorter
// keys in the index block. For example, consider a block boundary
// between the keys "the quick brown fox" and "the who". We can use
// "the r" as the key for the index block entry since it is >= all
// entries in the first block and < all entries in subsequent
// blocks.
//
// Invariant: r->pending_index_entry is true only if data_block is empty.
//
bool pending_index_entry;
// Location of the most recently flushed data block, kept until its index
// entry is emitted.
BlockHandle pending_handle; // Handle to add to index block
// Scratch buffer that receives Snappy output in WriteBlock(); cleared after
// every block.
std::string compressed_output;
};
// Creates a builder that writes a table to `file` using `options`.
// When a filter builder exists, it is told that the first block starts at
// offset 0.
TableBuilder::TableBuilder(const Options& options, WritableFile* file)
    : rep_(new Rep(options, file)) {
  if (rep_->filter_block == nullptr) {
    return;  // No filter policy configured: nothing else to set up.
  }
  rep_->filter_block->StartBlock(0);
}
// Releases the builder's state. The caller must have called Finish() or
// Abandon() first; this is checked with an assertion.
TableBuilder::~TableBuilder() {
  auto* const state = rep_;
  assert(state->closed);  // Catch errors where caller forgot to call Finish()
  // The filter builder is owned through a raw pointer, so free it before the
  // state object itself.
  delete state->filter_block;
  delete state;
}
// Replaces the options used by this builder.
//
// Returns InvalidArgument if the new options use a different comparator than
// the one the builder was created with; in that case nothing is changed.
// Otherwise both option sets are overwritten and the index block's restart
// interval is forced back to 1.
Status TableBuilder::ChangeOptions(const Options& options) {
  auto* const state = rep_;

  // Note: if more fields are added to Options, update
  // this function to catch changes that should not be allowed to
  // change in the middle of building a Table.
  const bool comparator_changed =
      (options.comparator != state->options.comparator);
  if (comparator_changed) {
    return Status::InvalidArgument("changing comparator while building table");
  }

  // Note that any live BlockBuilders point to rep_->options and therefore
  // will automatically pick up the updated options.
  state->options = options;
  state->index_block_options = options;
  state->index_block_options.block_restart_interval = 1;
  return Status::OK();
}
// Appends a key/value pair to the table being built.
// Must not be called after Finish()/Abandon() (asserted). Returns early
// without adding anything when ok() is false.
void TableBuilder::Add(const Slice& key, const Slice& value) {
Rep* r = rep_;
// Adding to a closed builder is a caller error.
assert(!r->closed);
if (!ok()) return;
/*
num_entries > 0 means at least one key has already been added.
Keys are expected to arrive in strictly increasing comparator order,
so the new key must compare greater than the previously added key.
*/
if (r->num_entries > 0) {
assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0);
}
/*
A data block was flushed just before this key. Its index entry was
deferred until now, when the first key of the next block is known.
*/
if (r->pending_index_entry) {
assert(r->data_block.empty());
/*
Let the comparator rewrite last_key in place into a separator that
sorts between the flushed block's last key and `key`.
*/
r->options.comparator->FindShortestSeparator(&r->last_key, key);
// Serialize the flushed block's handle; it becomes the index entry value.
std::string handle_encoding;
r->pending_handle.EncodeTo(&handle_encoding);
// Index entry: separator key -> encoded handle of the flushed block.
r->index_block.Add(r->last_key, Slice(handle_encoding));
// The deferred index entry has now been written.
r->pending_index_entry = false;
}
// Feed the key to the filter builder when a filter policy is configured.
if (r->filter_block != nullptr) {
r->filter_block->AddKey(key);
}
// Remember this key for the next ordering check and the next index entry.
r->last_key.assign(key.data(), key.size());
// Count the entry.
r->num_entries++;
// Append the pair to the data block currently being filled.
r->data_block.Add(key, value);
/*
Flush the data block once its estimated size reaches the configured
block_size.
*/
const size_t estimated_block_size = r->data_block.CurrentSizeEstimate();
if (estimated_block_size >= r->options.block_size) {
Flush();
}
}
//
void TableBuilder::Flush() {
//
Rep* r = rep_;
assert(!r->closed);
if (!ok()) return;
if (r->data_block.empty()) return;
/*
*/
assert(!r->pending_index_entry);
//
WriteBlock(&r->data_block, &r->pending_handle);
if (ok()) {
//
r->pending_index_entry = true;
//
r->status = r->file->Flush();
}
//
if (r->filter_block != nullptr) {
r->filter_block->StartBlock(r->offset);
}
}
// Finishes `block`, optionally compresses it, writes it to the file and
// stores its location in `*handle`. The block builder is reset afterwards so
// it can be reused. Requires ok() (asserted).
void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
  // File format contains a sequence of blocks where each block has:
  // block_data: uint8[n]
  // type: uint8
  // crc: uint32
  assert(ok());
  Rep* const rep = rep_;

  // Serialized, uncompressed block contents.
  const Slice raw = block->Finish();

  // Pick the bytes that actually go to disk, together with the type tag that
  // is stored in the block trailer.
  Slice payload;
  CompressionType type = rep->options.compression;
  // TODO(postrelease): Support more compression options: zlib?
  if (type == kNoCompression) {
    payload = raw;
  } else if (type == kSnappyCompression) {
    std::string* const scratch = &rep->compressed_output;
    const bool worth_keeping =
        port::Snappy_Compress(raw.data(), raw.size(), scratch) &&
        scratch->size() < raw.size() - (raw.size() / 8u);
    if (worth_keeping) {
      payload = *scratch;
    } else {
      // Snappy not supported, or compressed less than 12.5%, so just
      // store uncompressed form
      payload = raw;
      type = kNoCompression;
    }
  }

  WriteRawBlock(payload, type, handle);

  // Drop the scratch data and make the builder ready for the next block.
  rep->compressed_output.clear();
  block->Reset();
}
void TableBuilder::WriteRawBlock(const Slice& block_contents,
CompressionType type, BlockHandle* handle) {
Rep* r = rep_;
/*
*/
handle->set_offset(r->offset);
handle->set_size(block_contents.size());
//
r->status = r->file->Append(block_contents);
if (r->status.ok()) {
/*
*/
char trailer[kBlockTrailerSize];
trailer[0] = type;
uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size());
crc = crc32c::Extend(crc, trailer, 1); // Extend crc to cover block type
EncodeFixed32(trailer + 1, crc32c::Mask(crc));
r->status = r->file->Append(Slice(trailer, kBlockTrailerSize));
if (r->status.ok()) {
//
r->offset += block_contents.size() + kBlockTrailerSize;
}
}
}
// Returns a copy of the builder's current status.
Status TableBuilder::status() const {
  const auto* const state = rep_;
  return state->status;
}
/*
Finishes building the table. Flushes the last data block, then writes, in
this order: the filter block (if a filter builder exists), the metaindex
block, the index block and the footer. Each stage is skipped once ok() is
false. Marks the builder as closed and returns the resulting status.
*/
Status TableBuilder::Finish() {
Rep* r = rep_;
Flush();
assert(!r->closed);
r->closed = true;
BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle;
/*
The filter block is written as-is with kNoCompression; its location is
captured in filter_block_handle.
*/
// Write filter block
if (ok() && r->filter_block != nullptr) {
WriteRawBlock(r->filter_block->Finish(), kNoCompression,
&filter_block_handle);
}
/*
The metaindex block maps "filter.<policy name>" to the encoded handle of the
filter block. It is written even when it has no entries.
*/
// Write metaindex block
if (ok()) {
BlockBuilder meta_index_block(&r->options);
if (r->filter_block != nullptr) {
// Add mapping from "filter.Name" to location of filter data
std::string key = "filter.";
key.append(r->options.filter_policy->Name());
std::string handle_encoding;
filter_block_handle.EncodeTo(&handle_encoding);
meta_index_block.Add(key, handle_encoding);
}
// TODO(postrelease): Add stats and other meta blocks
WriteBlock(&meta_index_block, &metaindex_block_handle);
}
// The last flushed data block may still have a deferred index entry. No
// further key will arrive, so last_key is passed to FindShortSuccessor
// (which may rewrite it in place) and used as that entry's key.
// Write index block
if (ok()) {
if (r->pending_index_entry) {
r->options.comparator->FindShortSuccessor(&r->last_key);
std::string handle_encoding;
r->pending_handle.EncodeTo(&handle_encoding);
r->index_block.Add(r->last_key, Slice(handle_encoding));
r->pending_index_entry = false;
}
WriteBlock(&r->index_block, &index_block_handle);
}
/*
The footer stores the handles of the metaindex and index blocks and is the
last thing appended to the file.
*/
// Write footer
if (ok()) {
Footer footer;
footer.set_metaindex_handle(metaindex_block_handle);
footer.set_index_handle(index_block_handle);
std::string footer_encoding;
footer.EncodeTo(&footer_encoding);
r->status = r->file->Append(footer_encoding);
if (r->status.ok()) {
r->offset += footer_encoding.size();
}
}
// Report the final status, including any error from the stages above.
return r->status;
}
// Marks the builder as closed without writing anything further, so the
// destructor's "closed" assertion passes. Must not be called on a builder
// that is already closed (asserted).
void TableBuilder::Abandon() {
  assert(!rep_->closed);
  rep_->closed = true;
}
// Returns the number of key/value pairs added so far.
uint64_t TableBuilder::NumEntries() const {
  const auto* const state = rep_;
  return state->num_entries;
}
// Returns the number of bytes written to the file so far (the builder's
// running offset).
uint64_t TableBuilder::FileSize() const {
  const auto* const state = rep_;
  return state->offset;
}
} // namespace leveldb
读流程中涉及到了 Table Cache 等知识,待后续篇章再去解读。
针对读流程中出现的二级迭代器,会在下篇文章中介绍。
namespace leveldb {
// State owned by a Table; created in Table::Open() and deleted by ~Table().
struct Table::Rep {
// Frees the filter reader, the filter bytes (filter_data is only non-null
// when the filter block was heap allocated) and the index block.
~Rep() {
delete filter;
delete[] filter_data;
delete index_block;
}
// Options the table was opened with.
Options options;
// NOTE(review): not assigned by any code visible in this section.
Status status;
// File the table is read from. It is not deleted by ~Rep().
RandomAccessFile* file;
// Id used as the first half of block cache keys; 0 when no block cache is
// configured.
uint64_t cache_id;
// Reader for the filter block, or nullptr when no filter was loaded.
FilterBlockReader* filter;
// Heap buffer backing the filter block, or nullptr.
const char* filter_data;
BlockHandle metaindex_handle; // Handle to metaindex_block: saved from footer
// Index block, read once in Table::Open().
Block* index_block;
};
// Opens the table stored in the first `size` bytes of `file`.
//
// The footer is read and decoded first, then the index block is loaded into
// memory. Later lookups consult that in-memory index to decide which data
// block (if any) to read, which avoids extra I/O.
//
// On success `*table` points to a newly allocated Table; on any failure
// `*table` is nullptr and the error status is returned.
Status Table::Open(const Options& options, RandomAccessFile* file,
                   uint64_t size, Table** table) {
  *table = nullptr;

  // A valid table ends with a fixed-size footer.
  // NOTE(review): the original author's note says the footer is 48 bytes;
  // the code relies only on Footer::kEncodedLength.
  if (size < Footer::kEncodedLength) {
    return Status::Corruption("file is too short to be an sstable");
  }

  // Read the footer; it holds the metaindex and index block handles.
  char footer_space[Footer::kEncodedLength];
  Slice footer_input;
  Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength,
                        &footer_input, footer_space);
  if (!s.ok()) {
    return s;
  }

  Footer footer;
  s = footer.DecodeFrom(&footer_input);
  if (!s.ok()) {
    return s;
  }

  // Read the index block. Checksums are verified only when paranoid_checks
  // is enabled.
  BlockContents index_block_contents;
  ReadOptions opt;
  if (options.paranoid_checks) {
    opt.verify_checksums = true;
  }
  s = ReadBlock(file, opt, footer.index_handle(), &index_block_contents);
  if (!s.ok()) {
    return s;
  }

  // We've successfully read the footer and the index block: we're
  // ready to serve requests.
  Block* index_block = new Block(index_block_contents);
  Rep* rep = new Table::Rep;
  rep->options = options;
  rep->file = file;
  rep->metaindex_handle = footer.metaindex_handle();
  rep->index_block = index_block;
  // cache_id becomes the first half of this table's block cache keys (see
  // BlockReader); it is 0 when no block cache is configured.
  rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0);
  rep->filter_data = nullptr;
  rep->filter = nullptr;

  *table = new Table(rep);
  // Load the filter block, if any. Failures there are not reported here.
  (*table)->ReadMeta(footer);
  return s;
}
// Loads optional metadata (currently only the filter block) for this table.
//
// Does nothing when no filter policy is configured. Otherwise the metaindex
// block is read and searched for the key "filter.<policy name>"; if that
// exact key is present, its value is handed to ReadFilter(). Read errors are
// swallowed on purpose.
void Table::ReadMeta(const Footer& footer) {
  if (rep_->options.filter_policy == nullptr) {
    return;  // Do not need any metadata
  }

  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
  // it is an empty block.
  ReadOptions opt;
  if (rep_->options.paranoid_checks) {
    opt.verify_checksums = true;
  }

  BlockContents contents;
  if (!ReadBlock(rep_->file, opt, footer.metaindex_handle(), &contents).ok()) {
    // Do not propagate errors since meta info is not needed for operation
    return;
  }

  // The metaindex block is an ordinary block whose keys are compared
  // bytewise, so it can be searched with a normal block iterator.
  Block* meta = new Block(contents);
  Iterator* iter = meta->NewIterator(BytewiseComparator());

  std::string key("filter.");
  key.append(rep_->options.filter_policy->Name());
  iter->Seek(key);

  // Seek() may land on a later key, so require an exact match.
  const bool has_filter_entry = iter->Valid() && iter->key() == Slice(key);
  if (has_filter_entry) {
    ReadFilter(iter->value());
  }

  delete iter;
  delete meta;
}
void Table::ReadFilter(const Slice& filter_handle_value) {
Slice v = filter_handle_value;
BlockHandle filter_handle;
if (!filter_handle.DecodeFrom(&v).ok()) {
return;
}
// We might want to unify with ReadBlock() if we start
// requiring checksum verification in Table::Open.
ReadOptions opt;
if (rep_->options.paranoid_checks) {
opt.verify_checksums = true;
}
//读取filter block 数据
BlockContents block;
if (!ReadBlock(rep_->file, opt, filter_handle, &block).ok()) {
return;
}
//如果heap_allocated为true表示读取
//filter block的时候new了内存,后续需要删除
if (block.heap_allocated) {
rep_->filter_data = block.data.data(); // Will need to delete later
}
//构造一个读取filter block的实例
rep_->filter = new FilterBlockReader(rep_->options.filter_policy, block.data);
}
// Destroys the table together with its internal state.
Table::~Table() {
  delete rep_;
}
static void DeleteBlock(void* arg, void* ignored) {
delete reinterpret_cast<Block*>(arg);
}
// Block cache deleter: deletes the Block stored as the cache entry's value.
// The cache key is unused.
static void DeleteCachedBlock(const Slice& /*key*/, void* value) {
  delete static_cast<Block*>(value);
}
static void ReleaseBlock(void* arg, void* h) {
Cache* cache = reinterpret_cast<Cache*>(arg);
Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
cache->Release(handle);
}
// Reads the block identified by index_value (an encoded offset + size).
// Convert an index iterator value (i.e., an encoded BlockHandle)
// into an iterator over the contents of the corresponding block.
// `arg` is the Table the block belongs to. If the block cannot be obtained,
// an error iterator carrying the failure status is returned instead.
Iterator* Table::BlockReader(void* arg, const ReadOptions& options,
const Slice& index_value) {
Table* table = reinterpret_cast<Table*>(arg);
Cache* block_cache = table->rep_->options.block_cache;
Block* block = nullptr;
Cache::Handle* cache_handle = nullptr;
BlockHandle handle;
Slice input = index_value;
Status s = handle.DecodeFrom(&input);
// We intentionally allow extra stuff in index_value so that we
// can add more features in the future.
if (s.ok()) {
BlockContents contents;
if (block_cache != nullptr) {
// A block cache is configured, so look there first.
// The cache key is the table's cache_id followed by the block's offset,
// each encoded as a fixed 64-bit value.
char cache_key_buffer[16];
EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
EncodeFixed64(cache_key_buffer + 8, handle.offset());
Slice key(cache_key_buffer, sizeof(cache_key_buffer));
cache_handle = block_cache->Lookup(key);
// 1. Cache hit: use the cached Block directly.
// 2. Cache miss: read the block from the table file.
if (cache_handle != nullptr) {
block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
} else {
s = ReadBlock(table->rep_->file, options, handle, &contents);
if (s.ok()) {
block = new Block(contents);
// Insert the new block into the cache only when its contents are
// cachable and the caller asked for the cache to be filled.
if (contents.cachable && options.fill_cache) {
cache_handle = block_cache->Insert(key, block, block->size(),
&DeleteCachedBlock);
}
}
}
} else {
// 3. No block cache configured: read straight from the table file.
s = ReadBlock(table->rep_->file, options, handle, &contents);
if (s.ok()) {
block = new Block(contents);
}
}
}
Iterator* iter;
if (block != nullptr) {
iter = block->NewIterator(table->rep_->options.comparator);
// 1. cache_handle == nullptr: the block is not held by the cache, so
//    DeleteBlock is registered to delete it during iterator cleanup.
// 2. cache_handle != nullptr: the block is held by the cache, so
//    ReleaseBlock is registered to release the handle during cleanup.
if (cache_handle == nullptr) {
iter->RegisterCleanup(&DeleteBlock, block, nullptr);
} else {
iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
}
} else {
// No block was obtained: return an iterator that reports the error.
iter = NewErrorIterator(s);
}
return iter;
}
// Returns a two-level iterator over the table. The first level walks the
// index block; each index value is turned into a block iterator by
// Table::BlockReader, with this table passed as the callback argument.
Iterator* Table::NewIterator(const ReadOptions& options) const {
  Iterator* const index_iter =
      rep_->index_block->NewIterator(rep_->options.comparator);
  return NewTwoLevelIterator(index_iter, &Table::BlockReader,
                             const_cast<Table*>(this), options);
}
// Looks up `k` in this table. If an entry at or after `k` is found in the
// candidate data block, `handle_result` is invoked with `arg` and that
// entry's key and value.
//
// Steps: seek the index block to find the candidate data block, consult the
// filter (when one is loaded) to skip blocks that cannot contain `k`, then
// seek inside the data block. Returns the data block iterator's status if it
// is an error, otherwise the index iterator's status.
Status Table::InternalGet(const ReadOptions& options, const Slice& k, void* arg,
                          void (*handle_result)(void*, const Slice&,
                                                const Slice&)) {
  Status s;
  Iterator* index_iter =
      rep_->index_block->NewIterator(rep_->options.comparator);
  index_iter->Seek(k);

  if (index_iter->Valid()) {
    // The index value is the encoded handle (offset + size) of a data block.
    Slice handle_value = index_iter->value();
    FilterBlockReader* const filter = rep_->filter;
    BlockHandle handle;

    // The key is definitely absent only when a filter exists, the handle
    // decodes, and the filter rejects the key for that block's offset.
    const bool filtered_out = filter != nullptr &&
                              handle.DecodeFrom(&handle_value).ok() &&
                              !filter->KeyMayMatch(handle.offset(), k);
    if (!filtered_out) {
      // The filter can report false positives, so the data block still has
      // to be searched. Pass a fresh copy of the index value because
      // handle_value was handed to DecodeFrom() above.
      Iterator* block_iter = BlockReader(this, options, index_iter->value());
      block_iter->Seek(k);
      if (block_iter->Valid()) {
        (*handle_result)(arg, block_iter->key(), block_iter->value());
      }
      s = block_iter->status();
      delete block_iter;
    }
    // Otherwise: not found, nothing to report.
  }

  if (s.ok()) {
    s = index_iter->status();
  }
  delete index_iter;
  return s;
}
//预估key的大致偏移位。
//1、在index_block中查找到了就返回index_block中对应的DataBlock的offset。
//2、如果在index_block中查找到了但是无法解码出offset+size,就默认给metaindex_block的offset。
//3、Seek是查到大于等于这个key的值,若未找到,说明这个key比较大,默认给metaindex_block的offset。
uint64_t Table::ApproximateOffsetOf(const Slice& key) const {
Iterator* index_iter =
rep_->index_block->NewIterator(rep_->options.comparator);
index_iter->Seek(key);
uint64_t result;
if (index_iter->Valid()) {
BlockHandle handle;
Slice input = index_iter->value();
Status s = handle.DecodeFrom(&input);
if (s.ok()) {
result = handle.offset();
} else {
// Strange: we can't decode the block handle in the index block.
// We'll just return the offset of the metaindex block, which is
// close to the whole file size for this case.
result = rep_->metaindex_handle.offset();
}
} else {
// key is past the last key in the file. Approximate the offset
// by returning the offset of the metaindex block (which is
// right near the end of the file).
result = rep_->metaindex_handle.offset();
}
delete index_iter;
return result;
}
} // namespace leveldb