上一篇博客写到RocksDB将WriteBatch的内容写入memtable,这个过程主要用到的是MemTableInserter这个类的SeekToColumnFamily和PutCF这两个方法:
// WriteBatch handler that applies the records of a batch to the memtable of
// the column family each record addresses. An instance is handed to
// WriteBatch::Iterate(), which calls back into PutCF() for every put record.
class MemTableInserter : public WriteBatch::Handler {
 public:
  // Sequence number given to the next record; bumped once per PutCF() call,
  // whether or not the record actually reaches a memtable.
  SequenceNumber sequence_;
  // Column-family-to-memtable lookup; Seek() selects the family that the
  // subsequent GetMemTable()/GetLogNumber()/GetColumnFamilyHandle() refer to.
  ColumnFamilyMemTables* cf_mems_;
  // When true, records addressed to an unknown column family are skipped
  // with an OK status instead of failing with InvalidArgument.
  bool ignore_missing_column_families_;
  // Number of the log being replayed; 0 on the regular (non-recovery) write
  // path.
  uint64_t log_number_;
  // Used by PutCF() to read the previous value of a key when an in-place
  // update callback is configured and the key is not in the memtable.
  DBImpl* db_;
  const bool dont_filter_deletes_;

  // `cf_mems` must be non-null. `db` must be non-null whenever
  // `dont_filter_deletes` is false (both are asserted).
  MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems,
                   bool ignore_missing_column_families, uint64_t log_number,
                   DB* db, const bool dont_filter_deletes)
      : sequence_(sequence),
        cf_mems_(cf_mems),
        ignore_missing_column_families_(ignore_missing_column_families),
        log_number_(log_number),
        db_(reinterpret_cast<DBImpl*>(db)),
        dont_filter_deletes_(dont_filter_deletes) {
    assert(cf_mems);
    if (!dont_filter_deletes_) {
      assert(db_);
    }
  }

  // Positions cf_mems_ on `column_family_id`.
  //
  // Returns true when the caller should apply the record to the selected
  // column family. Returns false when the record must be skipped; in that
  // case *s says whether skipping is an error (InvalidArgument for an unknown
  // column family that may not be ignored) or expected (OK).
  bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
    // We are only allowed to call this from a single-threaded write thread
    // (or while holding DB mutex)
    bool found = cf_mems_->Seek(column_family_id);
    if (!found) {
      if (ignore_missing_column_families_) {
        *s = Status::OK();
      } else {
        *s = Status::InvalidArgument(
            "Invalid column family specified in write batch");
      }
      return false;
    }
    if (log_number_ != 0 && log_number_ < cf_mems_->GetLogNumber()) {
      // This is true only in recovery environment (log_number_ is always 0 in
      // non-recovery, regular write code-path)
      // * If log_number_ < cf_mems_->GetLogNumber(), this means that column
      // family already contains updates from this log. We can't apply updates
      // twice because of update-in-place or merge workloads -- ignore the
      // update
      *s = Status::OK();
      return false;
    }
    return true;
  }

  // Applies one put record to the memtable of `column_family_id`.
  //
  // Returns the status produced by SeekToColumnFamily() when the record is
  // skipped, otherwise Status::OK(). The sequence number is advanced on every
  // path.
  virtual Status PutCF(uint32_t column_family_id, const Slice& key,
                       const Slice& value) override {
    Status seek_status;
    // On success this also makes the column family current in cf_mems_, so
    // the GetMemTable() call below refers to it.
    if (!SeekToColumnFamily(column_family_id, &seek_status)) {
      ++sequence_;
      return seek_status;
    }
    MemTable* mem = cf_mems_->GetMemTable();
    auto* moptions = mem->GetMemTableOptions();
    if (!moptions->inplace_update_support) {
      // In-place updates are disabled: add the record as a new entry.
      mem->Add(sequence_, kTypeValue, key, value);
    } else if (moptions->inplace_callback == nullptr) {
      // In-place updates are enabled but no callback is configured: let the
      // memtable update the entry itself.
      mem->Update(sequence_, key, value);
      RecordTick(moptions->statistics, NUMBER_KEYS_UPDATED);
    } else if (!mem->UpdateCallback(sequence_, key, value)) {
      // key not found in memtable. Do sst get, update, add
      // Read the previous value through the DB, using a snapshot pinned at
      // the sequence number of this record.
      SnapshotImpl read_from_snapshot;
      read_from_snapshot.number_ = sequence_;
      ReadOptions ropts;
      ropts.snapshot = &read_from_snapshot;

      std::string prev_value;
      std::string merged_value;

      auto cf_handle = cf_mems_->GetColumnFamilyHandle();
      if (cf_handle == nullptr) {
        cf_handle = db_->DefaultColumnFamily();
      }
      Status s = db_->Get(ropts, cf_handle, key, &prev_value);

      // The callback receives a mutable view of the previous value, or null
      // pointers when the read did not succeed.
      char* prev_buffer = const_cast<char*>(prev_value.c_str());
      uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
      auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr,
                                               s.ok() ? &prev_size : nullptr,
                                               value, &merged_value);
      if (status == UpdateStatus::UPDATED_INPLACE) {
        // prev_value is updated in-place with final value.
        mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size));
        RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
      } else if (status == UpdateStatus::UPDATED) {
        // merged_value contains the final value.
        mem->Add(sequence_, kTypeValue, key, Slice(merged_value));
        RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
      }
      // Any other callback result adds nothing to the memtable.
    }
    // Since all Puts are logged in transaction logs (if enabled), always bump
    // sequence number. Even if the update eventually fails and does not result
    // in memtable add/update.
    sequence_++;
    cf_mems_->CheckMemtableFull();
    return Status::OK();
  }
};
这个类在WriteBatchInternal::InsertInto这个方法中被构建,并作为参数传入WriteBatch的Iterate方法。
// Applies every record of batch `b` to `memtables`.
//
// Builds a MemTableInserter that starts at the sequence number stored in the
// batch, then lets WriteBatch::Iterate() drive it. The remaining arguments
// are forwarded to the inserter unchanged. Returns whatever Iterate() returns.
Status WriteBatchInternal::InsertInto(const WriteBatch* b,
                                      ColumnFamilyMemTables* memtables,
                                      bool ignore_missing_column_families,
                                      uint64_t log_number, DB* db,
                                      const bool dont_filter_deletes) {
  const auto starting_sequence = WriteBatchInternal::Sequence(b);
  MemTableInserter memtable_handler(starting_sequence, memtables,
                                    ignore_missing_column_families,
                                    log_number, db, dont_filter_deletes);
  return b->Iterate(&memtable_handler);
}
而WriteBatch::Iterate这个方法做的事情,就是先去掉WriteBatch内容头部的12个字节,然后一条条取出记录,再根据记录的类型调用handler(这里是MemTableInserter)里对应的方法进行处理:
// Decodes the batch record by record and dispatches each one to `handler`.
//
// Returns Corruption when the batch is shorter than its header, contains an
// unknown tag, or holds a different number of records than its stored count;
// returns the first non-OK status from decoding or from the handler;
// otherwise returns OK.
Status WriteBatch::Iterate(Handler* handler) const {
  Slice remaining(rep_);
  if (remaining.size() < kHeader) {
    return Status::Corruption("malformed WriteBatch (too small)");
  }
  // Skip the fixed-size header; what follows is the sequence of records.
  remaining.remove_prefix(kHeader);

  Slice key, value, blob;
  int records_seen = 0;  // put/delete/single-delete/merge records dispatched
  // The handler is asked whether to continue before each record is decoded.
  while (!remaining.empty() && handler->Continue()) {
    char tag = 0;
    uint32_t column_family = 0;  // default
    Status s = ReadRecordFromWriteBatch(&remaining, &tag, &column_family,
                                        &key, &value, &blob);
    if (!s.ok()) {
      return s;
    }

    switch (tag) {
      case kTypeColumnFamilyValue:
      case kTypeValue:
        s = handler->PutCF(column_family, key, value);
        ++records_seen;
        break;
      case kTypeColumnFamilyDeletion:
      case kTypeDeletion:
        s = handler->DeleteCF(column_family, key);
        ++records_seen;
        break;
      case kTypeColumnFamilySingleDeletion:
      case kTypeSingleDeletion:
        s = handler->SingleDeleteCF(column_family, key);
        ++records_seen;
        break;
      case kTypeColumnFamilyMerge:
      case kTypeMerge:
        s = handler->MergeCF(column_family, key, value);
        ++records_seen;
        break;
      case kTypeLogData:
        // Log data is passed through and does not count as a record.
        handler->LogData(blob);
        break;
      default:
        return Status::Corruption("unknown WriteBatch tag");
    }

    // A handler failure stops the iteration immediately.
    if (!s.ok()) {
      return s;
    }
  }

  // The number of dispatched records must match the count stored in the batch.
  if (records_seen == WriteBatchInternal::Count(this)) {
    return Status::OK();
  }
  return Status::Corruption("WriteBatch has wrong count");
}
因为是写操作,所以调用的是MemTableInserter::PutCF:
// Applies one put record to the memtable of `column_family_id`.
//
// Returns the status produced by SeekToColumnFamily() when the record is
// skipped, otherwise Status::OK(). The sequence number is advanced on every
// path.
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
                     const Slice& value) override {
  Status seek_status;
  // A false return means the record is not applied; seek_status then carries
  // the status to hand back to the caller.
  if (!SeekToColumnFamily(column_family_id, &seek_status)) {
    ++sequence_;
    return seek_status;
  }
  // Memtable of the column family that the seek above made current.
  MemTable* mem = cf_mems_->GetMemTable();
  auto* moptions = mem->GetMemTableOptions();
  if (!moptions->inplace_update_support) {
    // In-place updates are disabled: add the record as a new entry.
    mem->Add(sequence_, kTypeValue, key, value);
  } else if (moptions->inplace_callback == nullptr) {
    // In-place updates are enabled but no callback is configured: let the
    // memtable update the entry itself.
    mem->Update(sequence_, key, value);
    RecordTick(moptions->statistics, NUMBER_KEYS_UPDATED);
  } else if (!mem->UpdateCallback(sequence_, key, value)) {
    // key not found in memtable. Do sst get, update, add
    // Read the previous value through the DB, using a snapshot pinned at the
    // sequence number of this record.
    SnapshotImpl read_from_snapshot;
    read_from_snapshot.number_ = sequence_;
    ReadOptions ropts;
    ropts.snapshot = &read_from_snapshot;

    std::string prev_value;
    std::string merged_value;

    auto cf_handle = cf_mems_->GetColumnFamilyHandle();
    if (cf_handle == nullptr) {
      cf_handle = db_->DefaultColumnFamily();
    }
    Status s = db_->Get(ropts, cf_handle, key, &prev_value);

    // The callback receives a mutable view of the previous value, or null
    // pointers when the read did not succeed.
    char* prev_buffer = const_cast<char*>(prev_value.c_str());
    uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
    auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr,
                                             s.ok() ? &prev_size : nullptr,
                                             value, &merged_value);
    if (status == UpdateStatus::UPDATED_INPLACE) {
      // prev_value is updated in-place with final value.
      mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size));
      RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
    } else if (status == UpdateStatus::UPDATED) {
      // merged_value contains the final value.
      mem->Add(sequence_, kTypeValue, key, Slice(merged_value));
      RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
    }
    // Any other callback result adds nothing to the memtable.
  }
  // Since all Puts are logged in transaction logs (if enabled), always bump
  // sequence number. Even if the update eventually fails and does not result
  // in memtable add/update.
  sequence_++;
  cf_mems_->CheckMemtableFull();
  return Status::OK();
}
而PutCF主要调用的是MemTable::Add方法往memtable里面添加记录,添加时会先对记录进行编码(长度字段使用变长整型,以节省空间):
// Encodes one entry and inserts it into the memtable.
//
// `s` is the sequence number and `type` the value type packed into the
// internal key; `key` is the user key and `value` the payload. Besides the
// insertion itself, this updates the entry/size/delete counters, the prefix
// bloom filter (when one is configured), the first/earliest sequence numbers
// and the should-flush flag.
void MemTable::Add(SequenceNumber s, ValueType type,
                   const Slice& key, /* user key */
                   const Slice& value) {
  // Format of an entry is concatenation of:
  //  key_size     : varint32 of internal_key.size()
  //  key bytes    : char[internal_key.size()]
  //  value_size   : varint32 of value.size()
  //  value bytes  : char[value.size()]
  // Both lengths are written as variable-length integers to save space.
  uint32_t key_size = static_cast<uint32_t>(key.size());
  uint32_t val_size = static_cast<uint32_t>(value.size());
  // The internal key is the user key followed by the 8-byte packed
  // (sequence, type) word written below.
  uint32_t internal_key_size = key_size + 8;
  const uint32_t encoded_len = VarintLength(internal_key_size) +
                               internal_key_size + VarintLength(val_size) +
                               val_size;
  char* buf = nullptr;
  // Ask the underlying table for a buffer of exactly encoded_len bytes.
  KeyHandle handle = table_->Allocate(encoded_len, &buf);
  assert(buf != nullptr);

  // Serialize: internal key length, user key, packed sequence/type,
  // value length, value.
  char* p = EncodeVarint32(buf, internal_key_size);
  memcpy(p, key.data(), key_size);
  p += key_size;
  uint64_t packed = PackSequenceAndType(s, type);
  EncodeFixed64(p, packed);
  p += 8;
  p = EncodeVarint32(p, val_size);
  memcpy(p, value.data(), val_size);
  assert(static_cast<unsigned>(p + val_size - buf) ==
         static_cast<unsigned>(encoded_len));

  // Note from the original write-up: the in-memory representation behind
  // table_ is selectable through configuration; it names skiplist,
  // hash-skiplist (a skiplist per hash bucket) and hash-linklist (a linked
  // list per hash bucket).
  table_->Insert(handle);

  // Separate relaxed load and store rather than an atomic read-modify-write.
  num_entries_.store(num_entries_.load(std::memory_order_relaxed) + 1,
                     std::memory_order_relaxed);
  data_size_.store(data_size_.load(std::memory_order_relaxed) + encoded_len,
                   std::memory_order_relaxed);
  if (type == kTypeDeletion) {
    num_deletes_++;
  }

  if (prefix_bloom_) {
    assert(prefix_extractor_);
    prefix_bloom_->Add(prefix_extractor_->Transform(key));
  }

  // The first sequence number inserted into the memtable
  assert(first_seqno_ == 0 || s > first_seqno_);
  if (first_seqno_ == 0) {
    first_seqno_ = s;

    if (earliest_seqno_ == kMaxSequenceNumber) {
      earliest_seqno_ = first_seqno_;
    }
    assert(first_seqno_ >= earliest_seqno_);
  }

  should_flush_ = ShouldFlushNow();
}
这样就完成了往memtable中添加记录的过程。
参考
http://kernelmaker.github.io/Rocksdb_Study_4
http://www.cnblogs.com/KevinT/category/590804.html