从这个小例子可以看出逻辑记录和物理Block之间的关系,LevelDb一次物理读取为一个Block,然后根据类型情况拼接出逻辑记录,供后续流程处理。
// Type tag stored in the last byte of every physical record header.
enum RecordType {
  // Zero is reserved for preallocated files.
  kZeroType = 0,

  // The physical record carries a whole logical record.
  kFullType = 1,

  // Fragment types: a logical record split over several physical records
  // is written as one First, zero or more Middle, and one Last fragment.
  kFirstType = 2,
  kMiddleType = 3,
  kLastType = 4
};
// Size of one log block in bytes (32 KiB).
static const int kBlockSize = 32 * 1024;

// Record header is checksum (4 bytes), length (2 bytes), type (1 byte).
static const int kHeaderSize = 4 + 2 + 1;
写日志类Writer:
namespace log {

// Appends logical records to a log file, emitting them as one or more
// physical records.
class Writer {
 public:
  // Create a writer that will append data to "*dest".
  // "*dest" must be initially empty.
  // "*dest" must remain live while this Writer is in use.
  explicit Writer(WritableFile* dest);

  // The destructor is empty; it does not touch "*dest".
  ~Writer() {}

  // Append "slice" to the log as one logical record.
  Status AddRecord(const Slice& slice);

 private:
  // No copying allowed: the copy constructor and assignment operator are
  // declared private.
  Writer(const Writer&);
  void operator=(const Writer&);

  // Write a single physical record of the given type whose payload is the
  // "length" bytes starting at "ptr".
  Status EmitPhysicalRecord(RecordType type, const char* ptr, size_t length);

  // File that receives the log data.
  WritableFile* dest_;

  // Current offset in block.
  int block_offset_;

  // crc32c values for all supported record types. These are
  // pre-computed to reduce the overhead of computing the crc of the
  // record type stored in the header.
  uint32_t type_crc_[kMaxRecordType + 1];
};

}  // namespace log
// Constructs a Writer that appends to "*dest", starting at block offset 0.
//
// The crc32c of each record-type byte (0..kMaxRecordType) is computed once
// here and cached in type_crc_, so later checksum computations can extend
// the cached value instead of re-hashing the type byte.
Writer::Writer(WritableFile* dest)
    : dest_(dest),
      block_offset_(0) {
  for (int i = 0; i <= kMaxRecordType; i++) {
    // The type tag is hashed as a single byte.
    char t = static_cast<char>(i);
    type_crc_[i] = crc32c::Value(&t, 1);
  }
}
// Appends "slice" to the log as one logical record.
//
// The payload is cut into fragments so that no physical record crosses a
// block boundary: a record that fits is written as kFullType, otherwise as
// kFirstType, any number of kMiddleType, and a final kLastType fragment.
// Returns the status of the last EmitPhysicalRecord call; writing stops at
// the first failed fragment.
Status Writer::AddRecord(const Slice& slice) {
  const char* cursor = slice.data();
  size_t remaining = slice.size();

  // Even for an empty slice the loop body runs once, so a single
  // zero-length record is emitted.
  Status s;
  bool is_first = true;
  do {
    const int space_in_block = kBlockSize - block_offset_;
    assert(space_in_block >= 0);
    if (space_in_block < kHeaderSize) {
      // Not enough room left for a header: switch to a new block.
      if (space_in_block > 0) {
        // Zero-fill the trailer. The literal below relies on kHeaderSize
        // being 7 (at most 6 bytes are ever needed). The result of this
        // Append is not checked.
        assert(kHeaderSize == 7);
        dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", space_in_block));
      }
      block_offset_ = 0;
    }

    // Invariant: we never leave < kHeaderSize bytes in a block.
    assert(kBlockSize - block_offset_ - kHeaderSize >= 0);

    // Payload bytes that still fit in this block after the header.
    const size_t payload_room = kBlockSize - block_offset_ - kHeaderSize;
    const size_t chunk = (payload_room <= remaining) ? payload_room : remaining;

    // The fragment is the last one when it consumes everything that is left.
    const bool is_last = (chunk == remaining);
    const RecordType type =
        is_first ? (is_last ? kFullType : kFirstType)
                 : (is_last ? kLastType : kMiddleType);

    s = EmitPhysicalRecord(type, cursor, chunk);
    cursor += chunk;
    remaining -= chunk;
    is_first = false;
  } while (s.ok() && remaining > 0);
  return s;
}
// Writes one physical record of type "t" whose payload is the "n" bytes
// starting at "ptr": a 7-byte header followed by the payload, then a flush.
//
// "n" must fit in two bytes and the record must fit in the current block
// (both are asserted). Returns the status of the first failing
// Append/Flush, or the Flush status when everything succeeds.
Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n)
{
  assert(n <= 0xffff);  // Must fit in two bytes
  assert(block_offset_ + kHeaderSize + n <= kBlockSize);

  // Format the header: checksum (4 bytes) + length (2 bytes) + type (1 byte).
  char buf[kHeaderSize];
  buf[4] = static_cast<char>(n & 0xff);         // length, low byte first
  buf[5] = static_cast<char>((n >> 8) & 0xff);  // length, high byte second
  buf[6] = static_cast<char>(t);

  // Compute the crc of the record type and the payload, starting from the
  // cached crc of the type byte.
  uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n);
  crc = crc32c::Mask(crc);  // Adjust for storage
  EncodeFixed32(buf, crc);  // The checksum occupies the first 4 header bytes

  // Write the header and the payload; the payload is only written when the
  // header write succeeded, and the flush only when both writes succeeded.
  Status s = dest_->Append(Slice(buf, kHeaderSize));
  if (s.ok()) {
    s = dest_->Append(Slice(ptr, n));
    if (s.ok()) {
      s = dest_->Flush();
    }
  }

  // The block offset advances whether or not the writes succeeded.
  block_offset_ += kHeaderSize + n;
  return s;
}
// Reads logical records back from a log file.
class Reader {
 public:
  // Interface for reporting errors.
  class Reporter {
   public:
    virtual ~Reporter();

    // Some corruption was detected. "bytes" is the approximate number
    // of bytes dropped due to the corruption.
    virtual void Corruption(size_t bytes, const Status& status) = 0;
  };

  // Create a reader that will return log records from "*file".
  // "*file" must remain live while this Reader is in use.
  //
  // If "reporter" is non-NULL, it is notified whenever some data is
  // dropped due to a detected corruption. "*reporter" must remain
  // live while this Reader is in use.
  //
  // If "checksum" is true, verify checksums if available.
  //
  // The Reader will start reading at the first record located at physical
  // position >= initial_offset within the file.
  Reader(SequentialFile* file, Reporter* reporter, bool checksum,
         uint64_t initial_offset);

  ~Reader();

  // Read the next record into *record. Returns true if read
  // successfully, false if we hit end of the input. May use
  // "*scratch" as temporary storage. The contents filled in *record
  // will only be valid until the next mutating operation on this
  // reader or the next mutation to *scratch.
  bool ReadRecord(Slice* record, std::string* scratch);

  // Returns the physical offset of the last record returned by ReadRecord.
  //
  // Undefined before the first call to ReadRecord.
  uint64_t LastRecordOffset();

 private:
  // No copying allowed.
  Reader(const Reader&);
  void operator=(const Reader&);

  // Extend record types with the following special values.
  enum {
    kEof = kMaxRecordType + 1,

    // Returned whenever we find an invalid physical record.
    // Currently there are three situations in which this happens:
    // * The record has an invalid CRC (ReadPhysicalRecord reports a drop)
    // * The record is a 0-length record (No drop is reported)
    // * The record is below constructor's initial_offset (No drop is reported)
    kBadRecord = kMaxRecordType + 2
  };

  // Skips all blocks that are completely before "initial_offset_".
  //
  // Returns true on success. Handles reporting.
  bool SkipToInitialBlock();

  // Reads the next physical record and stores its payload in *result.
  // Return type, or one of the preceding special values.
  unsigned int ReadPhysicalRecord(Slice* result);

  // Reports dropped bytes to the reporter.
  // buffer_ must be updated to remove the dropped bytes prior to invocation.
  void ReportCorruption(size_t bytes, const char* reason);
  void ReportDrop(size_t bytes, const Status& reason);

  SequentialFile* const file_;
  Reporter* const reporter_;
  const bool checksum_;
  char* const backing_store_;  // Storage passed to file_->Read()
  Slice buffer_;               // Data read from the file and not yet consumed
  bool eof_;  // Last Read() indicated EOF by returning < kBlockSize

  // Offset of the last record returned by ReadRecord.
  uint64_t last_record_offset_;

  // Offset of the first location past the end of buffer_.
  uint64_t end_of_buffer_offset_;

  // Offset at which to start looking for the first record to return.
  const uint64_t initial_offset_;
};
主要函数为:
// Moves the file position to the start of the first block that can contain
// a record at or after initial_offset_, and sets end_of_buffer_offset_ to
// that position.
//
// Returns true on success. If the underlying Skip fails, the skipped byte
// count is reported through ReportDrop and false is returned.
bool Reader::SkipToInitialBlock()
{
  const size_t offset_in_block = initial_offset_ % kBlockSize;
  uint64_t first_block_start = initial_offset_ - offset_in_block;

  // Don't search a block if we'd be in the trailer: the last 6 bytes of a
  // block are too short for a 7-byte record header, so the first candidate
  // record lives in the following block.
  if (offset_in_block > kBlockSize - 6) {
    first_block_start += kBlockSize;
  }

  end_of_buffer_offset_ = first_block_start;

  // Starting in the very first block: nothing to skip.
  if (first_block_start == 0) {
    return true;
  }

  // Skip to start of first block that can contain the initial record.
  Status skip_status = file_->Skip(first_block_start);
  if (skip_status.ok()) {
    return true;
  }
  ReportDrop(first_block_start, skip_status);
  return false;
}
// Reads the next physical record, refilling buffer_ from file_ one block
// (kBlockSize bytes) at a time when fewer than kHeaderSize bytes remain.
//
// On success stores the record payload in *result and returns the type
// byte from the record header. Otherwise returns:
//   kEof       - the read failed, the input is exhausted, or only a
//                truncated record remains at the end of the file;
//   kBadRecord - bad length, zero-type zero-length record, checksum
//                mismatch, or a record that starts before initial_offset_.
unsigned int Reader::ReadPhysicalRecord(Slice* result)
{
  while (true)
  {
    // Fewer than kHeaderSize bytes left cannot hold a record header.
    if (buffer_.size() < kHeaderSize)
    {
      if (!eof_)
      {
        // Last read was a full read, so this is a trailer to skip
        buffer_.clear();
        // Read the next block; backing_store_ provides the storage.
        Status status = file_->Read(kBlockSize, &buffer_, backing_store_);
        end_of_buffer_offset_ += buffer_.size();
        if (!status.ok())
        {
          buffer_.clear();
          ReportDrop(kBlockSize, status);
          eof_ = true;
          return kEof;
        }
        else if (buffer_.size() < kBlockSize)
        {
          // A short read marks the end of the file.
          eof_ = true;
        }
        // Re-check the freshly read data: it may itself be shorter than a
        // header, which the branches below handle.
        continue;
      }
      else if (buffer_.size() == 0)
      {
        // End of file
        return kEof;
      }
      else
      {
        // Leftover bytes at end of file that cannot form a header.
        size_t drop_size = buffer_.size();
        buffer_.clear();
        ReportCorruption(drop_size, "truncated record at end of file");
        return kEof;
      }
    }

    // Parse the header: bytes 4-5 hold the length (low byte first),
    // byte 6 holds the record type.
    const char* header = buffer_.data();
    const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
    const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
    const unsigned int type = header[6];
    const uint32_t length = a | (b << 8);
    if (kHeaderSize + length > buffer_.size())
    {
      // The claimed length runs past the buffered data.
      size_t drop_size = buffer_.size();
      buffer_.clear();
      ReportCorruption(drop_size, "bad record length");
      return kBadRecord;
    }

    if (type == kZeroType && length == 0)
    {
      // Skip zero length record without reporting any drops since
      // such records are produced by the mmap based writing code in
      // env_posix.cc that preallocates file regions.
      buffer_.clear();
      return kBadRecord;
    }

    // Check crc
    if (checksum_)
    {
      uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header));
      // The checksum covers the type byte followed by the payload.
      uint32_t actual_crc = crc32c::Value(header + 6, 1 + length);
      if (actual_crc != expected_crc)
      {
        // Drop the rest of the buffer since "length" itself may have
        // been corrupted and if we trust it, we could find some
        // fragment of a real log record that just happens to look
        // like a valid log record.
        size_t drop_size = buffer_.size();
        buffer_.clear();
        ReportCorruption(drop_size, "checksum mismatch");
        return kBadRecord;
      }
    }

    // Consume this record from the buffer.
    buffer_.remove_prefix(kHeaderSize + length);

    // Skip physical record that started before initial_offset_
    if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length < initial_offset_)
    {
      result->clear();
      return kBadRecord;
    }

    *result = Slice(header + kHeaderSize, length);
    return type;
  }  // while
}
// Reads the next logical record into *record.
//
// Physical records are fetched one at a time with ReadPhysicalRecord.
// A kFullType record is returned directly; kFirstType / kMiddleType /
// kLastType fragments are accumulated in *scratch and returned once the
// last fragment arrives. Inconsistent sequences are reported through
// ReportCorruption and reading continues.
//
// Returns true when a record was produced, false when kEof was reached or
// the initial skip failed. last_record_offset_ is updated on every
// successful return.
bool Reader::ReadRecord(Slice* record, std::string* scratch)
{
if (last_record_offset_ < initial_offset_)
{
if (!SkipToInitialBlock())// Position the file at the block that can hold the first record
{
return false;
}
}
scratch->clear();
record->clear();
// True while fragments of a multi-part record are being accumulated.
bool in_fragmented_record = false;
// Record offset of the logical record that we're reading
// 0 is a dummy value to make compilers happy
uint64_t prospective_record_offset = 0;
Slice fragment;
while (true)
{
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); // Offset of the physical record about to be read
const unsigned int record_type = ReadPhysicalRecord(&fragment); // Read one physical record (fragment)
switch (record_type)
{
case kFullType:
if (in_fragmented_record)
{
// Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
// of a block followed by a kFullType or kFirstType record
// at the beginning of the next block.
if (scratch->empty())
{
in_fragmented_record = false;
}
else
{
ReportCorruption(scratch->size(), "partial record without end(1)");
}
}
prospective_record_offset = physical_record_offset;
scratch->clear();
*record = fragment;
last_record_offset_ = prospective_record_offset; // Remember the offset of the record being returned
return true;
case kFirstType:
if (in_fragmented_record)
{
// Handle bug in earlier versions of log::Writer where
// it could emit an empty kFirstType record at the tail end
// of a block followed by a kFullType or kFirstType record
// at the beginning of the next block.
if (scratch->empty())
{
in_fragmented_record = false;
}
else
{
ReportCorruption(scratch->size(), "partial record without end(2)");
}
}
// Start a new fragmented record; scratch is overwritten with this fragment.
prospective_record_offset = physical_record_offset;
scratch->assign(fragment.data(), fragment.size());
in_fragmented_record = true;
break;
case kMiddleType:
if (!in_fragmented_record)
{
ReportCorruption(fragment.size(),
"missing start of fragmented record(1)");
}
else // Middle fragment of the record being assembled: append it
{
scratch->append(fragment.data(), fragment.size());
}
break;
case kLastType:
if (!in_fragmented_record)
{
ReportCorruption(fragment.size(),
"missing start of fragmented record(2)");
}
else
{
// Final fragment: the assembled record lives in *scratch.
scratch->append(fragment.data(), fragment.size());
*record = Slice(*scratch);
last_record_offset_ = prospective_record_offset;// Offset of the first fragment of the record being returned
return true;
}
break;
case kEof:
if (in_fragmented_record)
{
// Input ended in the middle of a fragmented record.
ReportCorruption(scratch->size(), "partial record without end(3)");
scratch->clear();
}
return false;
case kBadRecord:
if (in_fragmented_record)
{
// Discard the partially assembled record and keep reading.
ReportCorruption(scratch->size(), "error in middle of record");
in_fragmented_record = false;
scratch->clear();
}
break;
default:
// Any other type value: report the fragment (plus anything already
// accumulated) as dropped and reset the assembly state.
char buf[40];
snprintf(buf, sizeof(buf), "unknown record type %u", record_type);
ReportCorruption(
(fragment.size() + (in_fragmented_record ? scratch->size() : 0)),
buf);
in_fragmented_record = false;
scratch->clear();
break;
}// switch
} // while
// Not reached: the loop above only exits through return statements.
return false;
}
使用方法:
日志文件每个record中数据域格式就是WriteBatch::rep_格式。
1、在DBImpl::Write中调用,写的是log文件:
status = log_->AddRecord(WriteBatchInternal::Contents(updates));
WriteBatchInternal::Contents(updates)返回的数据格式为:
// 格式:
// WriteBatch::rep_ :=
// sequence: fixed64
// count: fixed32
// data: record[count]
// record :=
// kTypeValue varstring varstring |
// kTypeDeletion varstring
// varstring :=
// len: varint32
// data: uint8[len]
std::string record;
new_db.EncodeTo(&record); // 具体数据格式在manifest中介绍
s = log.AddRecord(record);