class Block; // Block data structure; it is mainly accessed through Block::Iter
class Block::Iter : public Iterator ; // Iterator over the entries of a Block; uses DecodeEntry to parse each entry
static inline const char* DecodeEntry(const char* p, // Parses one entry header (shared key length, non-shared key length, value length)
const char* limit,
uint32_t* shared,
uint32_t* non_shared,
uint32_t* value_length) ;
// Reads the block identified by handle (offset/size) from file into *block (verifies the CRC when options.verify_checksums is set, and decompresses the data if it was stored compressed)
Status ReadBlock(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, Block** block);
// Writes one block (data followed by a type byte and CRC) to the file and records its offset and size in *handle
void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) ; // Per the original notes, TableBuilder is what builds an sst file
BlockBuilder::Add(const Slice& key, const Slice& value) ; // Appends one key/value entry to the buffer of the block being built
// Entry layout: shared key length + non-shared key length + value length + non-shared key bytes + value bytes
BlockBuilder::Finish() ; // Appends the restart array and the restart count to the buffer of the block being built
// Declarations and implementations follow:
// Block: holds the raw bytes of one block and hands out iterators over its entries.
class Block {
public:
// Initialize the block with the specified contents.
// Takes ownership of data[] and will delete[] it when done.
Block(const char* data, size_t size); // Constructor; stores data/size, e.g. new Block(buf, n)
~Block();
size_t size() const { return size_; } // Size in bytes; the constructor sets this to 0 as an error marker for malformed contents
Iterator* NewIterator(const Comparator* comparator); // Creates an iterator over the block's entries (defined outside this chunk)
private:
uint32_t NumRestarts() const; // Number of restart points stored in the block
const char* data_; // Block contents (owned, released with delete[])
size_t size_; // Size of data_ in bytes
uint32_t restart_offset_; // Offset in data_ of restart array
// No copying allowed
Block(const Block&); // Copy constructor and copy assignment are declared private to forbid copying
void operator=(const Block&);
class Iter;
};
// Returns the number of restart points. The count is stored as a fixed32 in
// the last sizeof(uint32_t) bytes of the block contents.
inline uint32_t Block::NumRestarts() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  const char* const count_field = data_ + size_ - sizeof(uint32_t);
  return DecodeFixed32(count_field);
}
// Constructs a Block over data[0, size) and locates its restart array.
// Example: new Block(buf, n).
//
// Layout assumed by this constructor: the last sizeof(uint32_t) bytes hold the
// restart count as a fixed32, and the restart array (one fixed32 per restart
// point) sits immediately before that count.
//
// If the contents are too small to hold the trailer they claim to have,
// size_ is set to 0 as an error marker and restart_offset_ stays 0.
Block::Block(const char* data, size_t size)
    : data_(data), size_(size), restart_offset_(0)
{
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker: no room even for the restart count.
    return;
  }

  // Decode the count directly rather than through NumRestarts(): that helper
  // asserts size_ >= 2*sizeof(uint32_t), which is stricter than the check
  // above and would abort debug builds on 4..7 byte inputs.
  const uint32_t num_restarts = DecodeFixed32(data_ + size_ - sizeof(uint32_t));

  // Validate the count BEFORE computing the offset. Computing the offset
  // first and then testing the truncated 32-bit result for wrap-around can
  // miss corrupt counts whose wrapped offset happens to land inside the block.
  const size_t max_restarts_allowed =
      (size_ - sizeof(uint32_t)) / sizeof(uint32_t);
  if (num_restarts > max_restarts_allowed) {
    size_ = 0;  // Error marker: the claimed restart array does not fit.
    return;
  }

  // Each restart point occupies 4 bytes; the extra 1 accounts for the count.
  restart_offset_ = static_cast<uint32_t>(
      size_ - (1 + static_cast<size_t>(num_restarts)) * sizeof(uint32_t));
}
// Destructor: frees the contents buffer that was handed to the constructor.
Block::~Block() {
delete[] data_; // The Block owns the buffer it was given and releases it here
}
// Helper routine: decode the header of the block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively.  Will not dereference past "limit".
//
// If any errors are detected, returns NULL.  Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
static inline const char* DecodeEntry(const char* p,
                                      const char* limit,
                                      uint32_t* shared,
                                      uint32_t* non_shared,
                                      uint32_t* value_length)
{
  // An entry for a particular key-value pair has the form:
  //     shared_bytes: varint32
  //     unshared_bytes: varint32
  //     value_length: varint32
  //     key_delta: char[unshared_bytes]
  //     value: char[value_length]
  // shared_bytes == 0 for restart points.
  // restarts[i] contains the offset within the block of the ith restart point.
  if (limit - p < 3) return NULL;
  *shared = reinterpret_cast<const unsigned char*>(p)[0];
  *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
  *value_length = reinterpret_cast<const unsigned char*>(p)[2];
  if ((*shared | *non_shared | *value_length) < 128) {
    // Fast path: the high bit of all three bytes is clear, so each value is
    // encoded in exactly one byte.
    p += 3;
  } else {
    // Slow path: decode the three lengths one by one as varint32.
    if ((p = GetVarint32Ptr(p, limit, shared)) == NULL) return NULL;
    if ((p = GetVarint32Ptr(p, limit, non_shared)) == NULL) return NULL;
    if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL;
  }

  // The key delta and the value must both fit in what is left before limit.
  // The sum is formed in 64 bits: in 32 bits, corrupt lengths such as
  // 0xFFFFFFFF + 2 would wrap to a small number and pass the check.
  const uint64_t remaining = static_cast<uint64_t>(limit - p);
  const uint64_t needed = static_cast<uint64_t>(*non_shared) + *value_length;
  if (remaining < needed) {
    return NULL;
  }
  return p;
}
// Iterator over the entries of a Block. Entries are prefix-compressed against
// the previous key, so the iterator keeps the fully reconstructed current key
// in key_ and always moves forward from a restart point.
class Block::Iter : public Iterator {
private:
const Comparator* const comparator_;
const char* const data_; // underlying block contents
uint32_t const restarts_; // Offset of restart array (list of fixed32)
uint32_t const num_restarts_; // Number of uint32_t entries in restart array
// current_ is offset in data_ of current entry. >= restarts_ if !Valid
uint32_t current_;
uint32_t restart_index_; // Index of restart block in which current_ falls
std::string key_; // Full key of the current entry (shared prefix + key delta)
Slice value_; // Value of the current entry; its end marks where the next entry starts
Status status_;
inline int Compare(const Slice& a, const Slice& b) const { // Forwards to the user-supplied comparator
return comparator_->Compare(a, b);
}
// Return the offset in data_ just past the end of the current entry.
// This is also the offset at which the next entry begins.
inline uint32_t NextEntryOffset() const {
return (value_.data() + value_.size()) - data_; // value_ belongs to the current entry
}
// Returns the entry offset stored in slot "index" of the restart array.
uint32_t GetRestartPoint(uint32_t index) {
assert(index < num_restarts_);
return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
}
// Positions the iterator just before the first entry of restart block "index".
// The entry itself is not decoded until ParseNextKey() is called.
void SeekToRestartPoint(uint32_t index) {
key_.clear();
restart_index_ = index;
// current_ will be fixed by ParseNextKey();
// ParseNextKey() starts at the end of value_, so set value_ accordingly
uint32_t offset = GetRestartPoint(index); // Offset of the entry the restart point refers to
value_ = Slice(data_ + offset, 0); // Empty slice at the entry start, so NextEntryOffset() == offset
}
public:
Iter(const Comparator* comparator, const char* data, uint32_t restarts, uint32_t num_restarts)
: comparator_(comparator), data_(data), restarts_(restarts), num_restarts_(num_restarts),
current_(restarts_), restart_index_(num_restarts_) // Starts out invalid (current_ == restarts_) until a Seek* call
{
assert(num_restarts_ > 0);
}
virtual bool Valid() const // True while current_ refers to an entry, i.e. lies before the restart array
{ return current_ < restarts_; }
virtual Status status() const { return status_; }
virtual Slice key() const { // Current key; requires Valid()
assert(Valid());
return key_;
}
virtual Slice value() const { // Current value; requires Valid()
assert(Valid());
return value_;
}
// Advances to the next entry; requires Valid().
virtual void Next() {
assert(Valid());
ParseNextKey();
}
// Moves to the previous entry; requires Valid(). Entries can only be decoded
// forwards, so this backs up to a restart point and re-scans from there.
virtual void Prev()
{
assert(Valid());
// Scan backwards to a restart point before current_
const uint32_t original = current_;
while (GetRestartPoint(restart_index_) >= original)
{
if (restart_index_ == 0) // Even restart point 0 is not before the current entry, so nothing precedes it
{
// No more entries
current_ = restarts_;
restart_index_ = num_restarts_;
return;
}
restart_index_--;
}
SeekToRestartPoint(restart_index_); // Jump to the start of that restart block
do {
// Loop until end of current entry hits the start of original entry
} while (ParseNextKey() && NextEntryOffset() < original); // Stops on the entry that ends where the original entry began
// The empty do/while above is equivalent to:
//   while (ParseNextKey() && NextEntryOffset() < original) { /* keep skipping */ }
}
// Positions the iterator at the first entry whose key is >= target_key.
// The iterator becomes invalid if no such entry exists or on corruption.
virtual void Seek(const Slice& target_key) {
// Binary search in restart array to find the last restart point
// with a key < target (left stays 0 if no restart key is < target)
uint32_t left = 0;
uint32_t right = num_restarts_ - 1;
while (left < right) { // Binary search for the restart block to start scanning from
uint32_t mid = (left + right + 1) / 2;
uint32_t region_offset = GetRestartPoint(mid);
uint32_t shared, non_shared, value_length;
const char* key_ptr = DecodeEntry(data_ + region_offset,
data_ + restarts_,
&shared, &non_shared, &value_length);
if (key_ptr == NULL || (shared != 0)) { // A restart entry must decode cleanly and carry its full key
CorruptionError();
return;
}
Slice mid_key(key_ptr, non_shared); // Full key, since shared == 0 here
if (Compare(mid_key, target_key) < 0) {
// Key at "mid" is smaller than "target". Therefore all
// blocks before "mid" are uninteresting.
left = mid;
} else {
// Key at "mid" is >= "target". Therefore all blocks at or
// after "mid" are uninteresting.
right = mid - 1;
}
}
// Linear search (within restart block) for first key >= target
SeekToRestartPoint(left);
while (true) {
if (!ParseNextKey()) {
return;
}
if (Compare(key_, target_key) >= 0) { // Stop at the first key that is >= target_key
return;
}
}
}
// Positions the iterator at the first entry of the block.
virtual void SeekToFirst() {
SeekToRestartPoint(0); // Restart block 0 begins at the first entry
ParseNextKey(); // Decode that entry (key and value)
}
// Positions the iterator at the last entry of the block.
virtual void SeekToLast() {
SeekToRestartPoint(num_restarts_ - 1); // Start from the last restart block
while (ParseNextKey() && NextEntryOffset() < restarts_) // Stop on the entry that ends where the restart array begins
{
// Keep skipping
}
}
private:
// Marks the iterator invalid and records a Corruption status.
void CorruptionError() {
current_ = restarts_;
restart_index_ = num_restarts_;
status_ = Status::Corruption("bad entry in block");
key_.clear();
value_.clear();
}
// Decodes the entry that follows the current one and makes it current.
// Returns false, leaving the iterator invalid, at the end of the data or on corruption.
bool ParseNextKey() {
current_ = NextEntryOffset(); // The next entry begins where the previous value ended
const char* p = data_ + current_; // Start address of the entry to decode
const char* limit = data_ + restarts_; // Restarts come right after data
if (p >= limit) {
// No more entries to return. Mark as invalid.
current_ = restarts_;
restart_index_ = num_restarts_;
return false;
}
// Decode next entry
uint32_t shared, non_shared, value_length;
p = DecodeEntry(p, limit, &shared, &non_shared, &value_length); // p now points at the key delta
if (p == NULL || key_.size() < shared) // The shared prefix cannot be longer than the previous key
{
CorruptionError();
return false;
}
else
{
key_.resize(shared); // key_ still holds the previous entry's key; keep only the shared prefix
key_.append(p, non_shared);
value_ = Slice(p + non_shared, value_length); // data contents
while (restart_index_ + 1 < num_restarts_ &&
GetRestartPoint(restart_index_ + 1) < current_) // NOTE(review): strict '<' means restart_index_ does not advance when current_ sits exactly on the next restart point; Prev() uses '>=' in its backward scan, which appears to cover that case
{ // Advance restart_index_ past every restart point that lies before current_
++restart_index_;
}
return true;
}
}
};
// The ReadBlock function:
// Reads the block described by handle (offset and size) from file and hands
// it back through *block.
//
// The on-disk record is the block payload followed by a trailer holding a
// type byte and a masked CRC. The CRC is verified only when
// options.verify_checksums is set. Snappy-compressed payloads are
// decompressed before the Block is created.
//
// On any failure *block is left NULL and a non-OK Status is returned; every
// buffer allocated here is released before returning.
Status ReadBlock(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, Block** block)
{
  *block = NULL;

  // Read the block contents as well as the type/crc footer.
  // See table_builder.cc for the code that built this structure.
  size_t payload_len = static_cast<size_t>(handle.size());
  char* storage = new char[payload_len + kBlockTrailerSize];

  Slice fetched;
  Status read_status = file->Read(handle.offset(), payload_len + kBlockTrailerSize, &fetched, storage);
  if (!read_status.ok()) {
    delete[] storage;
    return read_status;
  }
  if (fetched.size() != payload_len + kBlockTrailerSize) {
    // Fewer (or more) bytes came back than the handle promised.
    delete[] storage;
    return Status::Corruption("truncated block read");
  }

  // "raw" points at wherever Read() put the bytes, which is not necessarily
  // our own storage buffer.
  const char* raw = fetched.data();
  if (options.verify_checksums) {
    // The stored CRC follows the type byte and covers payload + type byte.
    const uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(raw + payload_len + 1));
    const uint32_t computed_crc = crc32c::Value(raw, payload_len + 1);
    if (computed_crc != expected_crc) {
      delete[] storage;
      return Status::Corruption("block checksum mismatch");
    }
  }

  // The byte right after the payload says how the payload is encoded.
  const CompressionType kind = static_cast<CompressionType>(raw[payload_len]);
  if (kind == kNoCompression) {
    if (raw != storage) {
      // File implementation gave us pointer to some other data.
      // Copy into storage[] so the Block can own a buffer it may delete[].
      memcpy(storage, raw, payload_len + kBlockTrailerSize);
    }
  } else if (kind == kSnappyCompression) {
    size_t inflated_len = 0;
    if (!port::Snappy_GetUncompressedLength(raw, payload_len, &inflated_len)) {
      delete[] storage;
      return Status::Corruption("corrupted compressed block contents");
    }
    char* inflated = new char[inflated_len];
    if (!port::Snappy_Uncompress(raw, payload_len, inflated)) {
      delete[] storage;
      delete[] inflated;
      return Status::Corruption("corrupted compressed block contents");
    }
    // The compressed bytes are no longer needed; continue with the
    // decompressed buffer and its length.
    delete[] storage;
    storage = inflated;
    payload_len = inflated_len;
  } else {
    delete[] storage;
    return Status::Corruption("bad block type");
  }

  *block = new Block(storage, payload_len);  // Block takes ownership of storage[]
  return Status::OK();
}
// Finishes "block", appends it to the file (optionally Snappy-compressed)
// followed by its type/CRC trailer, and records the block's offset and size
// in *handle. The builder's compressed-output scratch string and the block
// itself are reset before returning, whether or not the writes succeeded.
void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
  // File format contains a sequence of blocks where each block has:
  //    block_data: uint8[n]
  //    type: uint8
  //    crc: uint32
  assert(ok());
  Rep* rep = rep_;
  Slice raw = block->Finish();

  Slice payload;
  CompressionType type = rep->options.compression;
  // TODO(postrelease): Support more compression options: zlib?
  if (type == kNoCompression) {
    payload = raw;
  } else if (type == kSnappyCompression) {
    std::string* scratch = &rep->compressed_output;
    // Keep the compressed form only if compression ran and saved more than
    // one eighth (12.5%) of the raw size.
    const bool worth_keeping =
        port::Snappy_Compress(raw.data(), raw.size(), scratch) &&
        scratch->size() < raw.size() - (raw.size() / 8u);
    if (worth_keeping) {
      payload = *scratch;
    } else {
      // Snappy not supported, or compressed less than 12.5%, so just
      // store uncompressed form
      payload = raw;
      type = kNoCompression;
    }
  }

  // Tell the caller where this block lives in the file.
  handle->set_offset(rep->offset);
  handle->set_size(payload.size());

  rep->status = rep->file->Append(payload);
  if (rep->status.ok()) {
    // Trailer: one type byte followed by the masked CRC of payload + type.
    char trailer[kBlockTrailerSize];
    trailer[0] = type;
    const uint32_t payload_crc = crc32c::Value(payload.data(), payload.size());
    const uint32_t full_crc = crc32c::Extend(payload_crc, trailer, 1);  // Extend crc to cover block type
    EncodeFixed32(trailer + 1, crc32c::Mask(full_crc));
    rep->status = rep->file->Append(Slice(trailer, kBlockTrailerSize));
    if (rep->status.ok()) {
      rep->offset += payload.size() + kBlockTrailerSize;
    }
  }

  rep->compressed_output.clear();
  block->Reset();
}
// BlockBuilder accumulates key/value entries into a single block buffer,
// prefix-compressing each key against the previous one and recording
// restart points.
class BlockBuilder {
public:
explicit BlockBuilder(const Options* options); // Constructor; the options pointer is stored, not copied
// Reset the contents as if the BlockBuilder was just constructed.
void Reset();
// REQUIRES: Finish() has not been called since the last call to Reset().
// REQUIRES: key is larger than any previously added key
void Add(const Slice& key, const Slice& value); // Appends one key/value entry
// Finish building the block and return a slice that refers to the
// block contents. The returned slice will remain valid for the
// lifetime of this builder or until Reset() is called.
Slice Finish();
// Returns an estimate of the current (uncompressed) size of the block
// we are building.
size_t CurrentSizeEstimate() const;
// Return true iff no entries have been added since the last Reset()
bool empty() const {
return buffer_.empty();
}
private:
const Options* options_;
std::string buffer_; // Destination buffer
std::vector<uint32_t> restarts_; // Restart points (offsets into buffer_)
int counter_; // Number of entries emitted since restart
bool finished_; // Has Finish() been called?
std::string last_key_; // The key most recently passed to Add()
// No copying allowed
BlockBuilder(const BlockBuilder&);
void operator=(const BlockBuilder&);
};
// Creates an empty builder that reads its settings through "options".
// The restart interval must be at least 1.
BlockBuilder::BlockBuilder(const Options* options)
    : options_(options), counter_(0), finished_(false)
{
  assert(options_->block_restart_interval >= 1);
  // Every block starts with a restart point at offset 0.
  restarts_.push_back(0);
}
// Returns the builder to its freshly constructed state: no buffered data,
// no remembered key, and a single restart point at offset 0.
void BlockBuilder::Reset() {
  finished_ = false;
  counter_ = 0;
  last_key_.clear();
  buffer_.clear();
  restarts_.assign(1, 0);  // First restart point is at offset 0
}
// Estimates the size the finished block would have right now: the entry
// bytes buffered so far, plus one fixed32 per restart point, plus the
// fixed32 that stores the restart count.
size_t BlockBuilder::CurrentSizeEstimate() const {
  const size_t entry_bytes = buffer_.size();                               // Raw data buffer
  const size_t restart_array_bytes = restarts_.size() * sizeof(uint32_t);  // Restart array
  const size_t restart_count_bytes = sizeof(uint32_t);                     // Restart array length
  return entry_bytes + restart_array_bytes + restart_count_bytes;
}
// Completes the block by appending the restart array followed by the number
// of restart points, marks the builder finished, and returns a slice that
// refers to the builder's internal buffer.
Slice BlockBuilder::Finish() {
  // Append restart array
  for (std::vector<uint32_t>::const_iterator it = restarts_.begin();
       it != restarts_.end(); ++it) {
    PutFixed32(&buffer_, *it);
  }
  // Followed by how many restart points there are
  PutFixed32(&buffer_, restarts_.size());
  finished_ = true;
  return Slice(buffer_);
}
void BlockBuilder::Add(const Slice& key, const Slice& value) { // 向当前Block的buffer中添加一条key/value
Slice last_key_piece(last_key_);
assert(!finished_);
assert(counter_ <= options_->block_restart_interval); // 如果>了,表示已经开始下一个restart周期了,counter_ reset
assert(buffer_.empty() // No values yet?
|| options_->comparator->Compare(key, last_key_piece) > 0); //+ or key是顺序添加的
size_t shared = 0;
if (counter_ < options_->block_restart_interval) // 还没有到配置的restart块数量
{
// See how much sharing to do with previous string
const size_t min_length = std::min(last_key_piece.size(), key.size());
while ((shared < min_length) && (last_key_piece[shared] == key[shared])) {
shared++; // 与last key共享的长度
}
}
else // 新的restart块
{
// Restart compression
restarts_.push_back(buffer_.size()); // buffer_为内容缓冲区, size()表示当前位置
counter_ = 0;
}
const size_t non_shared = key.size() - shared; // 非共享key长度
// 写入一个entry/record
// Add "<shared><non_shared><value_size>" to buffer_
PutVarint32(&buffer_, shared); // shared key 长度
PutVarint32(&buffer_, non_shared); // 非shared key长度
PutVarint32(&buffer_, value.size()); // value长度
// Add string delta to buffer_ followed by value
buffer_.append(key.data() + shared, non_shared); // 非共享key内容
buffer_.append(value.data(), value.size()); // value内容
// Update state
last_key_.resize(shared);
// last_key就是当前添加的key
last_key_.append(key.data() + shared, non_shared); //+ 可能出于对效率的考虑,共享的数据较多时,
//+ 直接赋值(last_key_=key)将产生大的数据拷贝
assert(Slice(last_key_) == key);
counter_++;
}