levelDB源码分析-SSTable:Block

levelDB中涉及Block操作的接口及函数基本如下:

   class Block;     											// Block数据结构定义,主要通过Block::Iter操作
    class Block::Iter : public Iterator ;				// Block中每个entry的迭代器,内部使用DecodeEntry解析每个entry
	
    static inline const char* DecodeEntry(const char* p,                // 解析一个entry(共享key长度、非共享key长度、value长度)
                                          const char* limit,
                                          uint32_t* shared,
                                          uint32_t* non_shared,
                                          uint32_t* value_length) ;
    
   
   // 从file文件中,读取handle(offset_/size_)指定的Block数据到**block中(按需进行CRC校验;如果数据为压缩过的则解压缩)
   Status ReadBlock(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, Block** block);
    
   // 向文件写入一个Block(数据及type和CRC),并设置index Block项(index Block在sst文件完毕阶段写入)
   void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) ; // TableBuilder对应sst文件构建
   
   BlockBuilder::Add(const Slice& key, const Slice& value) ;			// 向当前Block的buffer中添加一条key/value
   															// 格式:共享key长度+非共享key长度+value长度+非共享key数据+value数据 
   
   BlockBuilder::Finish() ;										// 向当前Block的buffer添加restart信息  



声明及实现如下:

// Block数据结构
    // An in-memory view of one block: entry data followed by a restart array
    // and a trailing fixed32 restart count.  Entries are read through
    // Block::Iter.
    class Block {
      public:
        // Initialize the block with the specified contents.
        // Takes ownership of data[] and will delete[] it when done.
        // Typical use: new Block(buf, n).
        Block(const char* data, size_t size);

        ~Block();

        // Size of the block in bytes.  The constructor resets this to 0 as an
        // error marker when the contents are malformed.
        size_t size() const { return size_; }

        // Creates an iterator over the block.
        // NOTE(review): the definition is not shown here; presumably the
        // caller owns the returned iterator -- confirm.
        Iterator* NewIterator(const Comparator* comparator);

      private:
        // Number of restart points, read from the last four bytes of the block.
        uint32_t NumRestarts() const;

        const char* data_;            // Block contents (owned, released with delete[])
        size_t size_;                 // Length of data_ in bytes; 0 marks a bad block
        uint32_t restart_offset_;     // Offset in data_ of restart array

        // No copying allowed: copy constructor and assignment are private and
        // left undefined.
        Block(const Block&);
        void operator=(const Block&);

        class Iter;
    };
    
    // Returns the number of restart points, which is stored as a fixed32 in
    // the last four bytes of the block.
    //
    // Precondition: the block holds at least those four bytes.  The Block
    // constructor calls this for every block of at least sizeof(uint32_t)
    // bytes, so the assertion must not demand more than that; otherwise a
    // short (corrupt) block would abort a debug build instead of being
    // reported through the size_ == 0 error marker.
    inline uint32_t Block::NumRestarts() const {
        assert(size_ >= sizeof(uint32_t));
        return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
    }
    // Wraps "data" (of "size" bytes) as a block, e.g. new Block(buf, n), and
    // locates the restart array at its tail.
    //
    // Block layout: entry data, then NumRestarts() fixed32 restart offsets,
    // then a fixed32 holding NumRestarts() itself.
    //
    // On malformed contents size_ is set to 0 as an error marker and
    // restart_offset_ stays 0.
    Block::Block(const char* data, size_t size)
      : data_(data), size_(size), restart_offset_(0)
    {
        if (size_ < sizeof(uint32_t)) 
        {
            size_ = 0;  // Error marker: no room for the restart count
        } 
        else 
        {
            // Validate the restart count BEFORE computing the offset.
            // Computing size_ - (1 + NumRestarts()) * 4 first and checking the
            // truncated 32-bit result afterwards lets some huge counts (for
            // example 0x40000000) wrap around to a plausible offset, so a
            // corrupt block would be accepted.
            const size_t max_restarts_allowed = (size_ - sizeof(uint32_t)) / sizeof(uint32_t);
            const uint32_t num_restarts = NumRestarts();
            if (num_restarts > max_restarts_allowed) 
            {
                // The size is too small for NumRestarts()
                size_ = 0;
            }
            else 
            {
                // Each restart point takes 4 bytes, plus 4 bytes for the count.
                restart_offset_ = static_cast<uint32_t>(
                    size_ - (1 + static_cast<size_t>(num_restarts)) * sizeof(uint32_t));
            }
        }
    }

    // The block owns the buffer handed to its constructor, so the array is
    // released here.
    Block::~Block()
    {
        delete[] data_;
    }
    
     // Helper routine: decode the next block entry starting at "p",
    // storing the number of shared key bytes, non_shared key bytes,
    // and the length of the value in "*shared", "*non_shared", and
    // "*value_length", respectively.  Will not dereference past "limit".
    //
    // If any errors are detected, returns NULL.  Otherwise, returns a
    // pointer to the key delta (just past the three decoded values).
    //
    // An entry for a particular key-value pair has the form:
    //     shared_bytes: varint32
    //     unshared_bytes: varint32
    //     value_length: varint32
    //     key_delta: char[unshared_bytes]
    //     value: char[value_length]
    // shared_bytes == 0 for restart points.
    static inline const char* DecodeEntry(const char* p,
                                          const char* limit,
                                          uint32_t* shared,
                                          uint32_t* non_shared,
                                          uint32_t* value_length) 
    {
        if (limit - p < 3) return NULL;
        *shared = reinterpret_cast<const unsigned char*>(p)[0];
        *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
        *value_length = reinterpret_cast<const unsigned char*>(p)[2];
        if ((*shared | *non_shared | *value_length) < 128) 
        { 
            // Fast path: the high bit of every byte is clear, so all three
            // values are encoded in one byte each
            p += 3;
        }
        else 
        {
            // Slow path: decode the three varint32 lengths one after another
            if ((p = GetVarint32Ptr(p, limit, shared)) == NULL) return NULL;
            if ((p = GetVarint32Ptr(p, limit, non_shared)) == NULL) return NULL;
            if ((p = GetVarint32Ptr(p, limit, value_length)) == NULL) return NULL;
        }

        // The key delta and the value must both fit before "limit".  The sum
        // is formed in 64 bits: added as uint32_t, two large lengths could
        // wrap around to a small number and slip past this check.
        const uint64_t remaining = static_cast<uint64_t>(limit - p);
        const uint64_t needed = static_cast<uint64_t>(*non_shared) + *value_length;
        if (remaining < needed) {
            return NULL;
        }
        return p;
    }
    // Iterator over the entries of one block.  Most block operations are
    // implemented here.  Keys are prefix-compressed, so the iterator rebuilds
    // the current key in key_ as it walks forward from a restart point.
    class Block::Iter : public Iterator {
      private:
        const Comparator* const comparator_;
        const char* const data_;      // underlying block contents
        uint32_t const restarts_;     // Offset of restart array (list of fixed32)
        uint32_t const num_restarts_; // Number of uint32_t entries in restart array

        // current_ is offset in data_ of current entry.  >= restarts_ if !Valid
        uint32_t current_;
        uint32_t restart_index_;  // Index of restart block in which current_ falls
        std::string key_;         // Full key of the current entry
        Slice value_;             // Value of the current entry (points into data_)
        Status status_;

        // Three-way comparison of two keys using the supplied comparator.
        inline int Compare(const Slice& a, const Slice& b) const {
            return comparator_->Compare(a, b);
        }

        // Return the offset in data_ just past the end of the current entry.
        // value_ refers to the current entry's value, so its end is where the
        // next entry begins.
        inline uint32_t NextEntryOffset() const {
            return (value_.data() + value_.size()) - data_;
        }

        // Returns the entry offset stored in the index-th slot of the restart
        // array.
        uint32_t GetRestartPoint(uint32_t index) {
            assert(index < num_restarts_);
            return DecodeFixed32(data_ + restarts_ + index * sizeof(uint32_t));
        }

        // Positions the iterator just before the first entry of restart block
        // "index".  A following ParseNextKey() decodes that entry.
        void SeekToRestartPoint(uint32_t index) {
            key_.clear();
            restart_index_ = index;
            // current_ will be fixed by ParseNextKey();

            // ParseNextKey() starts at the end of value_, so set value_ accordingly
            uint32_t offset = GetRestartPoint(index);  // offset of the entry the restart point refers to
            value_ = Slice(data_ + offset, 0);         // empty slice located at the start of that entry
        }

      public:
        // The iterator starts out invalid: current_ is at the restart array
        // and restart_index_ is one past the last restart block.
        Iter(const Comparator* comparator, const char* data, uint32_t restarts, uint32_t num_restarts)
          : comparator_(comparator), data_(data), restarts_(restarts), num_restarts_(num_restarts),
            current_(restarts_), restart_index_(num_restarts_)
        {
            assert(num_restarts_ > 0);
        }

        // True while current_ refers to an entry, i.e. lies before the restart array.
        virtual bool Valid() const
            { return current_ < restarts_; } 
        virtual Status status() const { return status_; }
        // Returns the full key of the current entry.
        virtual Slice key() const {
            assert(Valid());
            return key_;
        }
        // Returns the value of the current entry.
        virtual Slice value() const {
            assert(Valid());
            return value_;
        }
        // Advances to the next entry.
        virtual void Next() {
            assert(Valid());
            ParseNextKey();
        }

        // Moves to the previous entry.  Entries can only be decoded forwards,
        // so this backs up to a restart point before the current entry and
        // re-parses forward until reaching the entry that ends where the
        // current one starts.
        virtual void Prev() 
        {
            assert(Valid());

            // Scan backwards to a restart point before current_
            const uint32_t original = current_;
            while (GetRestartPoint(restart_index_) >= original)
            {
                if (restart_index_ == 0) // no restart point lies before the current entry
                {
                    // No more entries
                    current_ = restarts_;
                    restart_index_ = num_restarts_;
                    return;
                }
                restart_index_--;
            }

            SeekToRestartPoint(restart_index_); // go to the start of that restart block
            do {
                // Loop until end of current entry hits the start of original entry
            } while (ParseNextKey() && NextEntryOffset() < original);// stops at the entry for which NextEntryOffset() == original
//          // Equivalent formulation:
//          while (  ParseNextKey() && NextEntryOffset() < original) // stops at the entry for which NextEntryOffset() == original
//          {
//                // Keep skipping
//          }
        }

        // Positions the iterator at the first entry with a key >= target_key.
        // The iterator becomes invalid if there is no such entry or a corrupt
        // entry is met.
        virtual void Seek(const Slice& target_key) {
            // Binary search in restart array to find the last restart point
            // with a key < target (restart point 0 if there is none)
            uint32_t left = 0;
            uint32_t right = num_restarts_ - 1;
            while (left < right) {  							// binary search for the restart block
                uint32_t mid = (left + right + 1) / 2;
                uint32_t region_offset = GetRestartPoint(mid);
                uint32_t shared, non_shared, value_length;
                const char* key_ptr = DecodeEntry(data_ + region_offset,
                data_ + restarts_,
                &shared, &non_shared, &value_length);
                // An entry at a restart point stores its whole key, so a
                // non-zero shared length means the block is corrupt.
                if (key_ptr == NULL || (shared != 0)) {
                    CorruptionError();
                    return;
                }
                Slice mid_key(key_ptr, non_shared);  // full key of the entry at restart point "mid"
                if (Compare(mid_key, target_key) < 0) {
                    // Key at "mid" is smaller than "target".  Therefore all
                    // blocks before "mid" are uninteresting.
                    left = mid;
                    } else {
                    // Key at "mid" is >= "target".  Therefore all blocks at or
                    // after "mid" are uninteresting.
                    right = mid - 1;
                }
            }

            // Linear search (within restart block) for first key >= target
            SeekToRestartPoint(left);
            while (true) {
                if (!ParseNextKey()) {
                    return;
                }
                if (Compare(key_, target_key) >= 0) { // stop at the first key_ >= target_key
                    return;
                }
            }
        }

        // Positions the iterator at the first entry of the block.
        virtual void SeekToFirst() {
            SeekToRestartPoint(0);      // start of restart block 0, i.e. the beginning
            ParseNextKey();             // decode one entry (key/value)
        }

        // Positions the iterator at the last entry of the block.
        virtual void SeekToLast() {
            SeekToRestartPoint(num_restarts_ - 1);  // start of the last restart block
            while (ParseNextKey() && NextEntryOffset() < restarts_) // stops at the entry for which NextEntryOffset() == restarts_
            {
                // Keep skipping
            }
        }

      private:
        // Marks the iterator invalid and records a corruption status.
        void CorruptionError() {
            current_ = restarts_;
            restart_index_ = num_restarts_;
            status_ = Status::Corruption("bad entry in block");
            key_.clear();
            value_.clear();
        }

        // Decodes the entry that follows the current one and makes it current.
        // Returns false, leaving the iterator invalid, at the end of the entry
        // data or on a corrupt entry.
        bool ParseNextKey() {
            current_ = NextEntryOffset();               // offset of the entry to decode (end of the previous value)
            const char* p = data_ + current_;           // start address of that entry
            const char* limit = data_ + restarts_;      // Restarts come right after data
            if (p >= limit) {
                // No more entries to return.  Mark as invalid.
                current_ = restarts_;
                restart_index_ = num_restarts_;
                return false;
            }

            // Decode next entry
            uint32_t shared, non_shared, value_length;
            p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
            // The shared prefix is taken from the previous key, so it cannot
            // be longer than that key.
            if (p == NULL || key_.size() < shared) 
            {
                CorruptionError();
                return false;
            } 
            else 
            {
                key_.resize(shared);                                // key_ still holds the previous entry's key; keep the shared prefix
                key_.append(p, non_shared);
                value_ = Slice(p + non_shared, value_length);       // value bytes follow the key delta
                // Advance restart_index_ past every restart point that lies
                // strictly before current_.
                // NOTE(review): with '<', restart_index_ is not advanced when
                // current_ is exactly the next restart point; Prev() scans
                // with '>=' and appears to tolerate this -- confirm before
                // changing to '<='.
                while (restart_index_ + 1 < num_restarts_ &&
                    GetRestartPoint(restart_index_ + 1) < current_)
                {
                    ++restart_index_;
                }
                return true;
            }
        }
    };

        
 ReadBlock函数:
         
    // Reads the block that "handle" (offset/size) designates from "file" and
    // returns it through "*block" as a newly allocated Block.  The stored crc
    // is verified when options.verify_checksums is set, and Snappy-compressed
    // payloads are uncompressed.  On any failure "*block" is NULL and a
    // non-OK status is returned.
    Status ReadBlock(RandomAccessFile* file, const ReadOptions& options, const BlockHandle& handle, Block** block) 
    {
        *block = NULL;

        // Each stored block is its payload followed by a type/crc trailer.
        // See table_builder.cc for the code that built this structure.
        size_t payload_len = static_cast<size_t>(handle.size());
        const size_t record_len = payload_len + kBlockTrailerSize;
        char* scratch = new char[record_len];
        Slice contents;
        Status s = file->Read(handle.offset(), record_len, &contents, scratch);
        if (!s.ok()) {
            delete[] scratch;
            return s;
        }
        if (contents.size() != record_len) {
            // Fewer (or more) bytes than payload + trailer came back.
            delete[] scratch;
            return Status::Corruption("truncated block read");
        }

        // The crc covers the payload plus the one-byte type that follows it.
        const char* data = contents.data();    // Pointer to where Read put the data
        if (options.verify_checksums) {
            const uint32_t stored_crc = crc32c::Unmask(DecodeFixed32(data + payload_len + 1));
            const uint32_t computed_crc = crc32c::Value(data, payload_len + 1);
            if (computed_crc != stored_crc) {
                delete[] scratch;
                return Status::Corruption("block checksum mismatch");
            }
        }

        // The byte right after the payload tells how the payload is encoded.
        const CompressionType type = static_cast<CompressionType>(data[payload_len]);
        if (type == kNoCompression) {
            if (data != scratch) {
                // File implementation gave us pointer to some other data.
                // Copy into scratch, because the Block created below takes
                // ownership of that buffer.
                memcpy(scratch, data, record_len);
            }
        } else if (type == kSnappyCompression) {
            size_t ulength = 0;
            if (!port::Snappy_GetUncompressedLength(data, payload_len, &ulength)) {
                delete[] scratch;
                return Status::Corruption("corrupted compressed block contents");
            }
            char* ubuf = new char[ulength];
            if (!port::Snappy_Uncompress(data, payload_len, ubuf)) {
                delete[] scratch;
                delete[] ubuf;
                return Status::Corruption("corrupted compressed block contents");
            }
            // From here on the uncompressed buffer is the block contents.
            delete[] scratch;
            scratch = ubuf;
            payload_len = ulength;
        } else {
            delete[] scratch;
            return Status::Corruption("bad block type");
        }

        *block = new Block(scratch, payload_len);  // Block takes ownership of scratch
        return Status::OK();
    }
    
    // Finishes "block", appends its (possibly compressed) contents and a
    // type/crc trailer to the file, and stores the location of the written
    // block in "*handle" for the index block entry.
    void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
        // File format contains a sequence of blocks where each block has:
        //    block_data: uint8[n]
        //    type: uint8
        //    crc: uint32
        assert(ok());
        Rep* const rep = rep_;
        Slice raw = block->Finish();

        // Choose the bytes to store: the raw block, or its compressed form.
        // TODO(postrelease): Support more compression options: zlib?
        Slice payload;
        CompressionType type = rep->options.compression;
        if (type == kNoCompression) {
            payload = raw;
        } else if (type == kSnappyCompression) {
            std::string* const packed = &rep->compressed_output;
            const bool worthwhile =
                port::Snappy_Compress(raw.data(), raw.size(), packed) &&
                packed->size() < raw.size() - (raw.size() / 8u);
            if (worthwhile) {
                payload = *packed;
            } else {
                // Snappy not supported, or compressed less than 12.5%, so just
                // store uncompressed form
                payload = raw;
                type = kNoCompression;
            }
        }

        // Record where this block lives, then append its data.
        handle->set_offset(rep->offset);
        handle->set_size(payload.size());
        rep->status = rep->file->Append(payload);
        if (rep->status.ok()) {
            // Trailer: one type byte followed by the masked crc of data + type.
            char trailer[kBlockTrailerSize];
            trailer[0] = type;
            const uint32_t data_crc = crc32c::Value(payload.data(), payload.size());
            const uint32_t full_crc = crc32c::Extend(data_crc, trailer, 1);  // cover the block type too
            EncodeFixed32(trailer+1, crc32c::Mask(full_crc));
            rep->status = rep->file->Append(Slice(trailer, kBlockTrailerSize));
            if (rep->status.ok()) {
                rep->offset += payload.size() + kBlockTrailerSize;
            }
        }

        // Get ready for the next block.
        rep->compressed_output.clear();
        block->Reset();
    }

 // Builds the contents of one block: prefix-compressed key/value entries
 // followed by a restart array.
 class BlockBuilder {
      public:
        explicit BlockBuilder(const Options* options);

        // Reset the contents as if the BlockBuilder was just constructed.
        void Reset();

        // Appends one key/value entry to the block.
        // REQUIRES: Finish() has not been called since the last call to Reset().
        // REQUIRES: key is larger than any previously added key
        void Add(const Slice& key, const Slice& value);

        // Finish building the block and return a slice that refers to the
        // block contents.  The returned slice will remain valid for the
        // lifetime of this builder or until Reset() is called.
        Slice Finish();

        // Returns an estimate of the current (uncompressed) size of the block
        // we are building.
        size_t CurrentSizeEstimate() const;

        // Return true iff no entries have been added since the last Reset()
        bool empty() const {
            return buffer_.empty();
        }

      private:
        const Options*        options_;
        std::string           buffer_;      // Destination buffer
        std::vector<uint32_t> restarts_;    // Restart points (offsets into buffer_)
        int                   counter_;     // Number of entries emitted since restart
        bool                  finished_;    // Has Finish() been called?
        std::string           last_key_;    // Key of the most recently added entry

        // No copying allowed
        BlockBuilder(const BlockBuilder&);
        void operator=(const BlockBuilder&);
   };   
        // Creates an empty builder configured by "options"; the restart
        // interval must be at least 1.
        BlockBuilder::BlockBuilder(const Options* options)
    : options_(options), restarts_(), counter_(0), finished_(false)
    {
        assert(options->block_restart_interval >= 1);

        // Every block begins with a restart point at offset 0.
        restarts_.push_back(0);
    }

    // Returns the builder to its freshly constructed state so that it can be
    // used for the next block.
    void BlockBuilder::Reset() {
        finished_ = false;
        counter_ = 0;
        last_key_.clear();
        buffer_.clear();
        // Drop the old restart points, keeping only the initial one at offset 0.
        restarts_.assign(1, 0);
    }

    // Estimates the size the block would have if it were finished now:
    // the entry data, one fixed32 per restart point, and the fixed32 that
    // holds the number of restart points.
    size_t BlockBuilder::CurrentSizeEstimate() const {
        const size_t entry_bytes = buffer_.size();
        const size_t restart_array_bytes = restarts_.size() * sizeof(uint32_t);
        const size_t restart_count_bytes = sizeof(uint32_t);
        return entry_bytes + restart_array_bytes + restart_count_bytes;
    }

    
    // Completes the block by appending the restart array and the number of
    // restart points to the buffer, then returns a slice over the buffer.
    Slice BlockBuilder::Finish() {
        const std::vector<uint32_t>& points = restarts_;
        for (std::vector<uint32_t>::const_iterator it = points.begin();
             it != points.end(); ++it) {
            PutFixed32(&buffer_, *it);          // one fixed32 per restart point
        }
        PutFixed32(&buffer_, points.size());    // followed by how many there are
        finished_ = true;
        return Slice(buffer_);
    }

    void BlockBuilder::Add(const Slice& key, const Slice& value) {      // 向当前Block的buffer中添加一条key/value
        Slice last_key_piece(last_key_);
        assert(!finished_);
        assert(counter_ <= options_->block_restart_interval);           // 如果>了,表示已经开始下一个restart周期了,counter_ reset
        assert(buffer_.empty()                                          // No values yet?
            || options_->comparator->Compare(key, last_key_piece) > 0); //+ or key是顺序添加的

        size_t shared = 0;
        if (counter_ < options_->block_restart_interval)                // 还没有到配置的restart块数量
        {              
            // See how much sharing to do with previous string
            const size_t min_length = std::min(last_key_piece.size(), key.size());
            while ((shared < min_length) && (last_key_piece[shared] == key[shared])) {
                shared++;                                               // 与last key共享的长度
            }
        } 
        else                                                            // 新的restart块
        {
            // Restart compression
            restarts_.push_back(buffer_.size());                        // buffer_为内容缓冲区, size()表示当前位置
            counter_ = 0;
        }
        const size_t non_shared = key.size() - shared;                  // 非共享key长度

                                                                        // 写入一个entry/record
        // Add "<shared><non_shared><value_size>" to buffer_
        PutVarint32(&buffer_, shared);                                  // shared key 长度           
        PutVarint32(&buffer_, non_shared);                              // 非shared key长度
        PutVarint32(&buffer_, value.size());                            // value长度

        // Add string delta to buffer_ followed by value
        buffer_.append(key.data() + shared, non_shared);                // 非共享key内容
        buffer_.append(value.data(), value.size());                     // value内容

        // Update state
        last_key_.resize(shared);
        														// last_key就是当前添加的key
        last_key_.append(key.data() + shared, non_shared);			//+ 可能出于对效率的考虑,共享的数据较多时,
        														//+ 直接赋值(last_key_=key)将产生大的数据拷贝
        assert(Slice(last_key_) == key);
        counter_++;
    }
        


你可能感兴趣的:(String,null,delete,iterator,buffer,compression)