1.Slice
// Slice is the basic container RocksDB uses to pass data around.
// It has just two member variables, data_ and size_
// (a pointer to the key/value bytes and their length), plus a handful of helper functions.
class Slice {
public:
// Create an empty slice.
Slice() : data_(""), size_(0) { }
// Create a slice that refers to d[0,n-1].
Slice(const char* d, size_t n) : data_(d), size_(n) { }
// Create a slice that refers to the contents of "s"
/* implicit */
Slice(const std::string& s) : data_(s.data()), size_(s.size()) { }
// Create a slice that refers to s[0,strlen(s)-1]
/* implicit */
Slice(const char* s) : data_(s), size_(strlen(s)) { }
// Create a single slice from SliceParts using buf as storage.
// buf must exist as long as the returned Slice exists.
Slice(const struct SliceParts& parts, std::string* buf);
// Return a pointer to the beginning of the referenced data
const char* data() const { return data_; }
// Return the length (in bytes) of the referenced data
size_t size() const { return size_; }
// Return true iff the length of the referenced data is zero
bool empty() const { return size_ == 0; }
// Return the ith byte in the referenced data.
// REQUIRES: n < size()
char operator[](size_t n) const {
assert(n < size());
return data_[n];
}
// Change this slice to refer to an empty array
void clear() { data_ = ""; size_ = 0; }
// Drop the first "n" bytes from this slice.
void remove_prefix(size_t n) {
assert(n <= size());
data_ += n;
size_ -= n;
}
// Return a string that contains the copy of the referenced data.
std::string ToString(bool hex = false) const;
// Three-way comparison. Returns value:
// < 0 iff "*this" < "b",
// == 0 iff "*this" == "b",
// > 0 iff "*this" > "b"
int compare(const Slice& b) const;
// Return true iff "x" is a prefix of "*this"
bool starts_with(const Slice& x) const {
return ((size_ >= x.size_) &&
(memcmp(data_, x.data_, x.size_) == 0));
}
// Compare two slices and returns the first byte where they differ
size_t difference_offset(const Slice& b) const;
// private: make these public for rocksdbjni access
const char* data_;
size_t size_;
// Intentionally copyable
};
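A quick usage sketch of the public API (hypothetical example code, not from the RocksDB sources) that also shows the main pitfall: a Slice never owns its bytes.

#include <cassert>
#include <string>
#include "rocksdb/slice.h"

int main() {
  std::string owner = "user:1234";
  rocksdb::Slice s(owner);      // refers to owner's bytes; nothing is copied
  assert(s.size() == 9);
  assert(s.starts_with("user:"));

  s.remove_prefix(5);           // the slice now refers to "1234"
  assert(s.ToString() == "1234");

  // Pitfall: Slice does not own its data. If `owner` is destroyed or
  // mutated while `s` is still alive, `s` dangles.
  return 0;
}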
2.WriteOptions
struct WriteOptions {
// Default: false
bool sync; // whether the write must be synced to stable storage before returning
// If true, writes will not first go to the write ahead log,
// and the write may be lost after a crash.
bool disableWAL; // whether to skip writing the WAL
// The option is deprecated. It's not used anymore.
uint64_t timeout_hint_us; // deadline hint for when this write must complete (deprecated)
// If true and if user is trying to write to column families that don't exist
// (they were dropped), ignore the write (don't return an error). If there
// are multiple writes in a WriteBatch, other writes will succeed.
// Default: false
bool ignore_missing_column_families;
WriteOptions()
: sync(false),
disableWAL(false),
timeout_hint_us(0),
ignore_missing_column_families(false) {}
};
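A short sketch of how these options are typically set (hypothetical example; `db` is an open rocksdb::DB*, and returned Statuses are ignored for brevity):

#include "rocksdb/db.h"

// Sketch: a durable write vs. a fast, non-durable write.
void WriteExamples(rocksdb::DB* db) {
  rocksdb::WriteOptions durable;
  durable.sync = true;    // fsync/fdatasync the WAL before returning
  db->Put(durable, "k1", "v1");

  rocksdb::WriteOptions fast;
  fast.disableWAL = true; // skip the WAL; the write may be lost on a crash
  db->Put(fast, "k2", "v2");
}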
3.WriteBatch
// On the write path RocksDB batches updates through the WriteBatch class.
// WriteBatch has a single member variable, a string holding a serialized
// sequence of records. The string is built in a fixed format, and records are
// parsed back out one by one in that same format when they are consumed.
// This member, rep_, stores every record of the batch operation in the
// following format:
// WriteBatch::rep_ :=
// sequence: fixed64
// count: fixed32
// data: record[count]
// record :=
// kTypeValue varstring varstring
// kTypeDeletion varstring
// kTypeSingleDeletion varstring
// kTypeMerge varstring varstring
// kTypeColumnFamilyValue varint32 varstring varstring
// kTypeColumnFamilyDeletion varint32 varstring
// kTypeColumnFamilySingleDeletion varint32 varstring
// kTypeColumnFamilyMerge varint32 varstring varstring
// varstring :=
// len: varint32
// data: uint8[len]
// The string thus begins with a 12-byte header: an 8-byte sequence number
// followed by a 4-byte record count, which is why the class defines
// static const size_t kHeader = 12;
// as the minimum length of rep_. After the header come the records, one by one:
// an insert record is kTypeValue + key length + key + value length + value;
// a delete record is kTypeDeletion + key length + key.
// The write and delete operations of this class are implemented by delegating
// to the static methods of WriteBatchInternal.
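To make the layout concrete, here is a minimal standalone sketch (not RocksDB code) that decodes the 12-byte header. It assumes a little-endian host, which matches the fixed-width encoding that RocksDB's EncodeFixed64/EncodeFixed32 helpers produce on common platforms:

#include <cstdint>
#include <cstring>
#include <string>

// Decode the WriteBatch header: an 8-byte sequence number followed by a
// 4-byte record count, both little-endian fixed-width integers.
bool DecodeBatchHeader(const std::string& rep,
                       uint64_t* sequence, uint32_t* count) {
  static const size_t kHeader = 12;
  if (rep.size() < kHeader) return false;
  std::memcpy(sequence, rep.data(), 8);   // fixed64 at offset 0
  std::memcpy(count, rep.data() + 8, 4);  // fixed32 at offset 8
  return true;
}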
class WriteBatch : public WriteBatchBase {
public:
explicit WriteBatch(size_t reserved_bytes = 0);
~WriteBatch();
using WriteBatchBase::Put;
// Store the mapping "key->value" in the database.
void Put(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override;
void Put(const Slice& key, const Slice& value) override {
Put(nullptr, key, value);
}
// Variant of Put() that gathers output like writev(2). The key and value
// that will be written to the database are concatenations of arrays of
// slices.
void Put(ColumnFamilyHandle* column_family, const SliceParts& key,
const SliceParts& value) override;
void Put(const SliceParts& key, const SliceParts& value) override {
Put(nullptr, key, value);
}
using WriteBatchBase::Delete;
// If the database contains a mapping for "key", erase it. Else do nothing.
void Delete(ColumnFamilyHandle* column_family, const Slice& key) override;
void Delete(const Slice& key) override { Delete(nullptr, key); }
// variant that takes SliceParts
void Delete(ColumnFamilyHandle* column_family,
const SliceParts& key) override;
void Delete(const SliceParts& key) override { Delete(nullptr, key); }
using WriteBatchBase::SingleDelete;
// If the database contains a mapping for "key", erase it. Expects that the
// key was not overwritten. Else do nothing.
void SingleDelete(ColumnFamilyHandle* column_family,
const Slice& key) override;
void SingleDelete(const Slice& key) override { SingleDelete(nullptr, key); }
// variant that takes SliceParts
void SingleDelete(ColumnFamilyHandle* column_family,
const SliceParts& key) override;
void SingleDelete(const SliceParts& key) override {
SingleDelete(nullptr, key);
}
// ... (Merge, PutLogData, Iterate, Data and other members elided)
};
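Typical usage of the public API, as a sketch (hypothetical example code): several updates are staged in one batch and applied atomically with a single Write call.

#include "rocksdb/db.h"
#include "rocksdb/write_batch.h"

// Sketch: move a value from one key to another, atomically.
rocksdb::Status AtomicMove(rocksdb::DB* db,
                           const rocksdb::Slice& from,
                           const rocksdb::Slice& to) {
  std::string value;
  rocksdb::Status s = db->Get(rocksdb::ReadOptions(), from, &value);
  if (!s.ok()) return s;

  rocksdb::WriteBatch batch;
  batch.Delete(from);   // appended to rep_ as a kTypeDeletion record
  batch.Put(to, value); // appended to rep_ as a kTypeValue record
  return db->Write(rocksdb::WriteOptions(), &batch); // one atomic write
}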
4.WriteBatchInternal
This class's job is to manipulate the WriteBatch's underlying string: get/set the sequence number, get/set the record count, insert a WriteBatch into the memtables, and so on. All of its member functions are declared static:
class WriteBatchInternal {
public:
// WriteBatch methods with column_family_id instead of ColumnFamilyHandle*
static void Put(WriteBatch* batch, uint32_t column_family_id,
const Slice& key, const Slice& value);
static void Put(WriteBatch* batch, uint32_t column_family_id,
const SliceParts& key, const SliceParts& value);
static void Delete(WriteBatch* batch, uint32_t column_family_id,
const SliceParts& key);
static void Delete(WriteBatch* batch, uint32_t column_family_id,
const Slice& key);
static void SingleDelete(WriteBatch* batch, uint32_t column_family_id,
const SliceParts& key);
static void SingleDelete(WriteBatch* batch, uint32_t column_family_id,
const Slice& key);
static void Merge(WriteBatch* batch, uint32_t column_family_id,
const Slice& key, const Slice& value);
static void Merge(WriteBatch* batch, uint32_t column_family_id,
const SliceParts& key, const SliceParts& value);
// Return the number of entries in the batch.
static int Count(const WriteBatch* batch);
// Set the count for the number of entries in the batch.
static void SetCount(WriteBatch* batch, int n);
// Return the sequence number for the start of this batch.
static SequenceNumber Sequence(const WriteBatch* batch);
// Store the specified number as the sequence number for the start of
// this batch.
static void SetSequence(WriteBatch* batch, SequenceNumber seq);
// Returns the offset of the first entry in the batch.
// This offset is only valid if the batch is not empty.
static size_t GetFirstOffset(WriteBatch* batch);
static Slice Contents(const WriteBatch* batch) {
return Slice(batch->rep_);
}
static size_t ByteSize(const WriteBatch* batch) {
return batch->rep_.size();
}
static void SetContents(WriteBatch* batch, const Slice& contents);
// Inserts batch entries into memtable
// If dont_filter_deletes is false AND options.filter_deletes is true,
// then --> Drops deletes in batch if db->KeyMayExist returns false
// If ignore_missing_column_families == true, updates in the WriteBatch
// that reference a non-existing column family are ignored.
// However, if ignore_missing_column_families == false, any update
// referencing a non-existing column family will return an
// InvalidArgument() failure.
//
// If log_number is non-zero, the memtable will be updated only if
// memtables->GetLogNumber() >= log_number
static Status InsertInto(const WriteBatch* batch,
ColumnFamilyMemTables* memtables,
bool ignore_missing_column_families = false,
uint64_t log_number = 0, DB* db = nullptr,
const bool dont_filter_deletes = true);
static void Append(WriteBatch* dst, const WriteBatch* src);
};
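Count, SetCount and Append reduce to fixed-width reads and writes on rep_. A standalone sketch of the idea follows (hypothetical helper names; the real code uses RocksDB's EncodeFixed32/DecodeFixed32, and a little-endian host is assumed here to match that encoding):

#include <cstdint>
#include <cstring>
#include <string>

static const size_t kHeader = 12;  // 8-byte sequence + 4-byte count

uint32_t BatchCount(const std::string& rep) {
  uint32_t n;
  std::memcpy(&n, rep.data() + 8, sizeof(n));  // fixed32 at offset 8
  return n;
}

void SetBatchCount(std::string* rep, uint32_t n) {
  // rep must already hold at least the 12-byte header
  std::memcpy(&(*rep)[8], &n, sizeof(n));
}

// Append: concatenate src's records after dst's and sum the two counts.
// src's 12-byte header is skipped, so its sequence number is discarded.
void AppendBatch(std::string* dst, const std::string& src) {
  SetBatchCount(dst, BatchCount(*dst) + BatchCount(src));
  dst->append(src.data() + kHeader, src.size() - kHeader);
}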
5.MemTableInserter
// This class inserts normal key-value records and deletion records into the
// memtable. An instance of it is passed as the handler to the function that
// parses rep_ (WriteBatch::Iterate).
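To see how such a handler is driven, here is a sketch against the public API: WriteBatch::Iterate() parses rep_ record by record and invokes one Handler callback per record. WriteBatchInternal::InsertInto() does exactly this, with a MemTableInserter as the handler. The toy handler below just prints each record:

#include <iostream>
#include "rocksdb/write_batch.h"

// A toy handler that prints each record instead of inserting it into a
// memtable (hypothetical example code).
class PrintingHandler : public rocksdb::WriteBatch::Handler {
 public:
  void Put(const rocksdb::Slice& key, const rocksdb::Slice& value) override {
    std::cout << "Put(" << key.ToString() << ", " << value.ToString() << ")\n";
  }
  void Delete(const rocksdb::Slice& key) override {
    std::cout << "Delete(" << key.ToString() << ")\n";
  }
};

int main() {
  rocksdb::WriteBatch batch;
  batch.Put("k", "v");
  batch.Delete("old");
  PrintingHandler handler;
  batch.Iterate(&handler);  // parses rep_ and dispatches record by record
  return 0;
}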
class MemTableInserter : public WriteBatch::Handler {
public:
SequenceNumber sequence_;
ColumnFamilyMemTables* cf_mems_;
bool ignore_missing_column_families_;
uint64_t log_number_;
DBImpl* db_;
const bool dont_filter_deletes_;
MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems,
bool ignore_missing_column_families, uint64_t log_number,
DB* db, const bool dont_filter_deletes)
: sequence_(sequence),
cf_mems_(cf_mems),
ignore_missing_column_families_(ignore_missing_column_families),
log_number_(log_number),
db_(reinterpret_cast<DBImpl*>(db)),
dont_filter_deletes_(dont_filter_deletes) {
assert(cf_mems);
if (!dont_filter_deletes_) {
assert(db_);
}
}
bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
// We are only allowed to call this from a single-threaded write thread
// (or while holding DB mutex)
bool found = cf_mems_->Seek(column_family_id);
if (!found) {
if (ignore_missing_column_families_) {
*s = Status::OK();
} else {
*s = Status::InvalidArgument(
"Invalid column family specified in write batch");
}
return false;
}
if (log_number_ != 0 && log_number_ < cf_mems_->GetLogNumber()) {
// This is true only in recovery environment (log_number_ is always 0 in
// non-recovery, regular write code-path)
// * If log_number_ < cf_mems_->GetLogNumber(), this means that column
// family already contains updates from this log. We can't apply updates
// twice because of update-in-place or merge workloads -- ignore the
// update
*s = Status::OK();
return false;
}
return true;
}
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
const Slice& value) override {
Status seek_status;
// if the column family passed in cannot be found in the memtables, bump the sequence and return
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
++sequence_;
return seek_status;
}
// fetch the memtable for this column family
MemTable* mem = cf_mems_->GetMemTable();
auto* moptions = mem->GetMemTableOptions();
// if the memtable does not support in-place updates, simply append the record
if (!moptions->inplace_update_support) {
mem->Add(sequence_, kTypeValue, key, value);
// otherwise update the record in place
} else if (moptions->inplace_callback == nullptr) {
mem->Update(sequence_, key, value);
RecordTick(moptions->statistics, NUMBER_KEYS_UPDATED);
} else {
// or update it through the user-supplied callback
if (mem->UpdateCallback(sequence_, key, value)) {
} else {
// if the key is not in the memtable, fetch it from the SSTs, update, and add
// key not found in memtable. Do sst get, update, add
SnapshotImpl read_from_snapshot;
read_from_snapshot.number_ = sequence_;
ReadOptions ropts;
ropts.snapshot = &read_from_snapshot;
std::string prev_value;
std::string merged_value;
auto cf_handle = cf_mems_->GetColumnFamilyHandle();
if (cf_handle == nullptr) {
cf_handle = db_->DefaultColumnFamily();
}
// call DB::Get at the snapshot to fetch the key's previous value
Status s = db_->Get(ropts, cf_handle, key, &prev_value);
char* prev_buffer = const_cast<char*>(prev_value.c_str());
uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr,
s.ok() ? &prev_size : nullptr,
value, &merged_value);
if (status == UpdateStatus::UPDATED_INPLACE) {
// prev_value is updated in-place with final value.
mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size));
RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
} else if (status == UpdateStatus::UPDATED) {
// merged_value contains the final value.
mem->Add(sequence_, kTypeValue, key, Slice(merged_value));
RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
}
}
}
// Since all Puts are logged in transaction logs (if enabled), always bump
// sequence number. Even if the update eventually fails and does not result
// in memtable add/update.
sequence_++;
cf_mems_->CheckMemtableFull();
return Status::OK();
}
virtual Status DeleteCF(uint32_t column_family_id,
const Slice& key) override {
Status seek_status;
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
++sequence_;
return seek_status;
}
MemTable* mem = cf_mems_->GetMemTable();
auto* moptions = mem->GetMemTableOptions();
if (!dont_filter_deletes_ && moptions->filter_deletes) {
SnapshotImpl read_from_snapshot;
read_from_snapshot.number_ = sequence_;
ReadOptions ropts;
ropts.snapshot = &read_from_snapshot;
std::string value;
auto cf_handle = cf_mems_->GetColumnFamilyHandle();
if (cf_handle == nullptr) {
cf_handle = db_->DefaultColumnFamily();
}
if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) {
RecordTick(moptions->statistics, NUMBER_FILTERED_DELETES);
return Status::OK();
}
}
mem->Add(sequence_, kTypeDeletion, key, Slice());
sequence_++;
cf_mems_->CheckMemtableFull();
return Status::OK();
}
virtual Status SingleDeleteCF(uint32_t column_family_id,
const Slice& key) override {
Status seek_status;
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
++sequence_;
return seek_status;
}
MemTable* mem = cf_mems_->GetMemTable();
auto* moptions = mem->GetMemTableOptions();
if (!dont_filter_deletes_ && moptions->filter_deletes) {
SnapshotImpl read_from_snapshot;
read_from_snapshot.number_ = sequence_;
ReadOptions ropts;
ropts.snapshot = &read_from_snapshot;
std::string value;
auto cf_handle = cf_mems_->GetColumnFamilyHandle();
if (cf_handle == nullptr) {
cf_handle = db_->DefaultColumnFamily();
}
if (!db_->KeyMayExist(ropts, cf_handle, key, &value)) {
RecordTick(moptions->statistics, NUMBER_FILTERED_DELETES);
return Status::OK();
}
}
mem->Add(sequence_, kTypeSingleDeletion, key, Slice());
sequence_++;
cf_mems_->CheckMemtableFull();
return Status::OK();
}
virtual Status MergeCF(uint32_t column_family_id, const Slice& key,
const Slice& value) override {
Status seek_status;
if (!SeekToColumnFamily(column_family_id, &seek_status)) {
++sequence_;
return seek_status;
}
MemTable* mem = cf_mems_->GetMemTable();
auto* moptions = mem->GetMemTableOptions();
bool perform_merge = false;
if (moptions->max_successive_merges > 0 && db_ != nullptr) {
LookupKey lkey(key, sequence_);
// Count the number of successive merges at the head
// of the key in the memtable
size_t num_merges = mem->CountSuccessiveMergeEntries(lkey);
if (num_merges >= moptions->max_successive_merges) {
perform_merge = true;
}
}
if (perform_merge) {
// 1) Get the existing value
std::string get_value;
// Pass in the sequence number so that we also include previous merge
// operations in the same batch.
SnapshotImpl read_from_snapshot;
read_from_snapshot.number_ = sequence_;
ReadOptions read_options;
read_options.snapshot = &read_from_snapshot;
auto cf_handle = cf_mems_->GetColumnFamilyHandle();
if (cf_handle == nullptr) {
cf_handle = db_->DefaultColumnFamily();
}
db_->Get(read_options, cf_handle, key, &get_value);
Slice get_value_slice = Slice(get_value);
// 2) Apply this merge
auto merge_operator = moptions->merge_operator;
assert(merge_operator);
std::deque<std::string> operands;
operands.push_front(value.ToString());
std::string new_value;
bool merge_success = false;
{
StopWatchNano timer(Env::Default(), moptions->statistics != nullptr);
PERF_TIMER_GUARD(merge_operator_time_nanos);
merge_success = merge_operator->FullMerge(
key, &get_value_slice, operands, &new_value, moptions->info_log);
RecordTick(moptions->statistics, MERGE_OPERATION_TOTAL_TIME,
timer.ElapsedNanos());
}
if (!merge_success) {
// Failed to merge!
RecordTick(moptions->statistics, NUMBER_MERGE_FAILURES);
// Store the delta in memtable
perform_merge = false;
} else {
// 3) Add value to memtable
mem->Add(sequence_, kTypeValue, key, new_value);
}
}
if (!perform_merge) {
// Add merge operator to memtable
mem->Add(sequence_, kTypeMerge, key, value);
}
sequence_++;
cf_mems_->CheckMemtableFull();
return Status::OK();
}
};
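The inplace_callback invoked in PutCF above is a user hook configured via Options (inplace_update_support plus inplace_callback). Below is a hypothetical callback whose signature matches the call site above; it is illustrative only, not RocksDB's own code:

#include <cstring>
#include <string>
#include "rocksdb/options.h"

// Hypothetical in-place update hook matching the call site in PutCF above.
// If the new value fits in the existing buffer, overwrite it in place;
// otherwise hand the final value back through *merged_value.
rocksdb::UpdateStatus MyInplaceCallback(char* existing_value,
                                        uint32_t* existing_value_size,
                                        rocksdb::Slice delta_value,
                                        std::string* merged_value) {
  if (existing_value != nullptr &&
      delta_value.size() <= *existing_value_size) {
    std::memcpy(existing_value, delta_value.data(), delta_value.size());
    *existing_value_size = static_cast<uint32_t>(delta_value.size());
    return rocksdb::UpdateStatus::UPDATED_INPLACE;
  }
  merged_value->assign(delta_value.data(), delta_value.size());
  return rocksdb::UpdateStatus::UPDATED;
}

It would be installed with options.inplace_update_support = true and options.inplace_callback = MyInplaceCallback.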
Tracing a write from the C API: rocksdb_put() ends up in DB::Put().
rocksdb_put(db, writeoptions, key, strlen(key), value, strlen(value) + 1, &err);
// which calls
SaveError(errptr, db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen)));
// which in turn calls
db->rep->Put(options->rep, Slice(key, keylen), Slice(val, vallen))
Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
const Slice& key, const Slice& value) {
// Pre-allocate size of write batch conservatively.
// 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
// and we allocate 11 extra bytes for key length, as well as value length.
WriteBatch batch(key.size() + value.size() + 24); // build the batch; 8 + 4 + 1 + 11 = 24 bytes of overhead
batch.Put(column_family, key, value);
return Write(opt, &batch); // write the batch
}
// First the WriteBatch is populated:
void WriteBatch::Put(ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) {
WriteBatchInternal::Put(this, GetColumnFamilyID(column_family), key, value);
}
// which actually delegates to WriteBatchInternal::Put:
void WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
const Slice& key, const Slice& value) {
// increment the batch's record count
WriteBatchInternal::SetCount(b, WriteBatchInternal::Count(b) + 1);
// records in the default column family (id 0) omit the column family id
if (column_family_id == 0) {
b->rep_.push_back(static_cast<char>(kTypeValue)); // append the record type
} else {
b->rep_.push_back(static_cast<char>(kTypeColumnFamilyValue)); // append the record type
PutVarint32(&b->rep_, column_family_id);
}
PutLengthPrefixedSlice(&b->rep_, key); // append key length and key bytes
PutLengthPrefixedSlice(&b->rep_, value); // append value length and value bytes
}
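PutLengthPrefixedSlice writes a varint32 length followed by the raw bytes. A self-contained sketch of that encoding (it mirrors RocksDB's varint format), plus the resulting record bytes for Put("key", "value") in the default column family (kTypeValue is 0x1 in RocksDB's ValueType enum):

#include <cstdint>
#include <string>

// Sketch of varint32 encoding: 7 bits per byte, high bit set on every
// byte except the last.
void PutVarint32(std::string* dst, uint32_t v) {
  while (v >= 128) {
    dst->push_back(static_cast<char>(v | 128));
    v >>= 7;
  }
  dst->push_back(static_cast<char>(v));
}

void PutLengthPrefixedSlice(std::string* dst, const std::string& s) {
  PutVarint32(dst, static_cast<uint32_t>(s.size()));
  dst->append(s);
}

// After Put("key", "value") in the default column family, the record
// appended to rep_ is:
//   0x01 0x03 'k' 'e' 'y' 0x05 'v' 'a' 'l' 'u' 'e'
//   type len  ---key---   len  -----value-----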
// The filled WriteBatch is then written to the database through DB::Write:
virtual Status Write(const WriteOptions& options, WriteBatch* updates) = 0;

Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
return WriteImpl(write_options, my_batch, nullptr);
}
// the actual write is implemented in WriteImpl
Status DBImpl::WriteImpl(const WriteOptions& write_options,
WriteBatch* my_batch, WriteCallback* callback) {
if (my_batch == nullptr) {
return Status::Corruption("Batch is nullptr!");
}
if (write_options.timeout_hint_us != 0) {
return Status::InvalidArgument("timeout_hint_us is deprecated");
}
Status status;
bool callback_failed = false;
bool xfunc_attempted_write = false;
/* cross-functional test hook: a test may intercept and perform the write itself */
XFUNC_TEST("transaction", "transaction_xftest_write_impl",
xf_transaction_write1, xf_transaction_write, write_options,
db_options_, my_batch, callback, this, &status,
&xfunc_attempted_write);
if (xfunc_attempted_write) {
// Test already did the write
return status;
}
PERF_TIMER_GUARD(write_pre_and_post_process_time);
WriteThread::Writer w; // construct the write task
w.batch = my_batch; // the data to write
w.sync = write_options.sync; // whether to fsync/fdatasync the WAL (transaction log)
w.disableWAL = write_options.disableWAL; // whether to skip the WAL
w.in_batch_group = false;
// in_batch_group is the interesting field. Internally, RocksDB batches user
// writes as aggressively as it can, using a queue of WriteThread::Writer*
// inside write_thread_. When the task at the head of the queue is about to be
// processed, BuildBatchGroup() gathers the writes queued behind it into one
// BatchGroup and commits them to the database in a single shot.
w.done = false; // set once the write has completed
w.has_callback = (callback != nullptr) ? true : false;
if (!write_options.disableWAL) {
// record the number of Write calls that request WAL
RecordTick(stats_, WRITE_WITH_WAL);
}
// stopwatch for write timing; used for DB statistics
StopWatch write_sw(env_, db_options_.statistics.get(), DB_WRITE);
// Enqueue the write task @w on the write queue and sleep on mutex_ until:
// 1) the write had a timeout set and it expired; or
// 2) every task ahead of @w has finished and @w is at the head of the queue; or
// 3) @w was completed by another writer thread.
// Case 3 means the task was merged into a preceding WriteBatchGroup, in which
// case @w is marked in_batch_group. Interestingly, if JoinBatchGroup() wakes
// up on a timeout and finds in_batch_group set, it goes back to waiting:
// another thread has already claimed the task for a BatchGroup that is about
// to be written to the database.
write_thread_.JoinBatchGroup(&w); // enqueue the Writer carrying our batch
if (w.done) {
// write was done by someone else, no need to grab mutex
RecordTick(stats_, WRITE_DONE_BY_OTHER);
return w.status;
}
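JoinBatchGroup and EnterAsBatchGroupLeader implement a leader/follower group commit. A simplified, hypothetical sketch of the pattern follows (RocksDB's actual WriteThread is more elaborate: it handles timeouts and tracks a last_writer):

#include <condition_variable>
#include <deque>
#include <mutex>
#include <string>
#include <vector>

struct Writer {
  std::string batch;  // stand-in for a WriteBatch
  bool done = false;
};

class GroupCommitQueue {
 public:
  // Block until this writer is either completed by another leader or
  // reaches the head of the queue. Returns true if the caller became
  // the leader and must commit the group itself.
  bool Join(Writer* w) {
    std::unique_lock<std::mutex> lock(mu_);
    queue_.push_back(w);
    while (!w->done && queue_.front() != w) {
      cv_.wait(lock);
    }
    return !w->done;
  }

  // Leader only: snapshot the currently queued writers as one group.
  std::vector<Writer*> CollectGroup() {
    std::unique_lock<std::mutex> lock(mu_);
    return std::vector<Writer*>(queue_.begin(), queue_.end());
  }

  // Leader only: after committing the merged batch, mark the group done
  // and wake the followers; the next queued writer becomes the new leader.
  void CompleteGroup(const std::vector<Writer*>& group) {
    std::unique_lock<std::mutex> lock(mu_);
    for (Writer* w : group) {
      w->done = true;
      queue_.pop_front();  // group members are a prefix of the queue
    }
    cv_.notify_all();
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::deque<Writer*> queue_;
};

A writer calls Join(); on true it concatenates every batch from CollectGroup(), performs a single WAL append and memtable insert, then calls CompleteGroup(). On false, its batch was already committed by another leader, which corresponds to the WRITE_DONE_BY_OTHER path above.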
// else we are the leader of the write batch group
WriteContext context;
mutex_.Lock();
// if the WAL is enabled, count the write in the DB stats
if (!write_options.disableWAL) {
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL, 1);
}
// we were not absorbed into another group: we write our own batch as group leader
RecordTick(stats_, WRITE_DONE_BY_SELF);
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1);
// Once reaches this point, the current writer "w" will try to do its write
// job. It may also pick up some of the remaining writers in the "writers_"
// when it finds suitable, and finish them in the same write batch.
// This is how a write job could be done by the other writer.
assert(!single_column_family_mode_ ||
versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1);
// compute the maximum total WAL size; if max_total_wal_size is 0 it defaults
// to 4x the total memtable budget
uint64_t max_total_wal_size = (db_options_.max_total_wal_size == 0)
? 4 * max_total_in_memory_state_
: db_options_.max_total_wal_size;
if (UNLIKELY(!single_column_family_mode_) &&
alive_log_files_.begin()->getting_flushed == false &&
total_log_size_ > max_total_wal_size) {
// If there are multiple column families, the memtable backed by the oldest
// live WAL has not yet been flushed to disk, and the total log size exceeds
// the configured maximum, then new memtables must be allocated and the old
// immutable memtables flushed to disk.
uint64_t flush_column_family_if_log_file = alive_log_files_.begin()->number; // number of the oldest live WAL
alive_log_files_.begin()->getting_flushed = true;
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
"Flushing all column families with data in WAL number %" PRIu64
". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
flush_column_family_if_log_file, total_log_size_, max_total_wal_size);
// no need to refcount because drop is happening in write thread, so can't
// happen while we're in the write thread
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (cfd->IsDropped()) {
continue;
}
// every column family whose log number is <= the oldest live WAL's number
// must switch to a new memtable
if (cfd->GetLogNumber() <= flush_column_family_if_log_file) {
// allocate a new memtable for this column family
status = SwitchMemtable(cfd, &context);
if (!status.ok()) {
break;
}
cfd->imm()->FlushRequested();
// schedule the pending flush
SchedulePendingFlush(cfd);
}
}
// schedule flushes or compactions
MaybeScheduleFlushOrCompaction();
}
/* otherwise, check whether the write buffer has exceeded its limit and needs a flush */
else if (UNLIKELY(write_buffer_.ShouldFlush())) {
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
"Flushing all column families. Write buffer is using %" PRIu64
" bytes out of a total of %" PRIu64 ".",
write_buffer_.memory_usage(), write_buffer_.buffer_size());
// no need to refcount because drop is happening in write thread, so can't
// happen while we're in the write thread
// flush every column family whose memtable is non-empty
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (cfd->IsDropped()) {
continue;
}
if (!cfd->mem()->IsEmpty()) {
status = SwitchMemtable(cfd, &context);
if (!status.ok()) {
break;
}
cfd->imm()->FlushRequested();
SchedulePendingFlush(cfd);
}
}
// schedule flushes or compactions
MaybeScheduleFlushOrCompaction();
}
if (UNLIKELY(status.ok() && !bg_error_.ok())) {
status = bg_error_;
}
/* flush_scheduler_ has column families waiting to be flushed */
if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
status = ScheduleFlushes(&context);
}
/* write_controller_ decides whether writes must be stopped or delayed */
if (UNLIKELY(status.ok()) &&
(write_controller_.IsStopped() || write_controller_.NeedsDelay())) {
PERF_TIMER_STOP(write_pre_and_post_process_time);
PERF_TIMER_GUARD(write_delay_time);
// We don't know the size of the current batch, so we always use the size
// of the previous one. This may create a fairness issue: expiration
// might hit smaller writes while larger writes go through.
// Can optimize it if it becomes an issue.
status = DelayWrite(last_batch_group_size_);
PERF_TIMER_START(write_pre_and_post_process_time);
}
uint64_t last_sequence = versions_->LastSequence();
WriteThread::Writer* last_writer = &w;
autovector<WriteBatch*> write_batch_group;
/* decide whether the WAL and the WAL directory need to be synced */
bool need_log_sync = !write_options.disableWAL && write_options.sync;
bool need_log_dir_sync = need_log_sync && !log_dir_synced_;
// become the batch group leader; if syncing, first wait for in-flight WAL syncs
if (status.ok()) {
// as leader, gather the writers queued behind us into the batch group
last_batch_group_size_ = write_thread_.EnterAsBatchGroupLeader(
&w, &last_writer, &write_batch_group);
if (need_log_sync) {
while (logs_.front().getting_synced) {
log_sync_cv_.Wait();
}
for (auto& log : logs_) {
assert(!log.getting_synced);
log.getting_synced = true;
}
}
// Add to log and apply to memtable. We can release the lock
// during this phase since &w is currently responsible for logging
// and protects against concurrent loggers and concurrent writes
// into memtables
mutex_.Unlock();
if (callback != nullptr) {
// If this write has a validation callback, check to see if this write
// is able to be written. Must be called on the write thread.
status = callback->Callback(this);
callback_failed = true;
}
} else {
mutex_.Unlock();
}
// At this point the mutex is unlocked
// from here on: merge the group, write the WAL, then the memtables
if (status.ok()) {
// merge every WriteBatch in write_batch_group into a single batch,
// 'updates' (named to distinguish it from my_batch above)
WriteBatch* updates = nullptr;
if (write_batch_group.size() == 1) {
updates = write_batch_group[0];
} else {
updates = &tmp_batch_;
for (size_t i = 0; i < write_batch_group.size(); ++i) {
// append this group member to the merged batch
WriteBatchInternal::Append(updates, write_batch_group[i]);
}
}
// the merged batch starts at the version set's last sequence number + 1
const SequenceNumber current_sequence = last_sequence + 1;
// stamp the sequence number into the batch header
WriteBatchInternal::SetSequence(updates, current_sequence);
// number of records in the merged batch
int my_batch_count = WriteBatchInternal::Count(updates);
// advance last_sequence past this batch: e.g. if last_sequence was 100 and
// the batch holds 3 records, they get sequence numbers 101-103 and
// last_sequence becomes 103
last_sequence += my_batch_count;
const uint64_t batch_size = WriteBatchInternal::ByteSize(updates);
// Record statistics
RecordTick(stats_, NUMBER_KEYS_WRITTEN, my_batch_count);
RecordTick(stats_, BYTES_WRITTEN, batch_size);
if (write_options.disableWAL) {
flush_on_destroy_ = true;
}
PERF_TIMER_STOP(write_pre_and_post_process_time);
uint64_t log_size = 0;
// WAL enabled: append the batch to the log
if (!write_options.disableWAL) {
PERF_TIMER_GUARD(write_wal_time);
// the log entry is simply the batch's serialized contents (rep_)
Slice log_entry = WriteBatchInternal::Contents(updates);
// append the record to the newest log file
status = logs_.back().writer->AddRecord(log_entry);
total_log_size_ += log_entry.size();
// account the entry's size against the newest live log file
alive_log_files_.back().AddSize(log_entry.size());
log_empty_ = false;
log_size = log_entry.size();
RecordTick(stats_, WAL_FILE_BYTES, log_size);
/* sync the WAL if requested */
if (status.ok() && need_log_sync) {
RecordTick(stats_, WAL_FILE_SYNCED);
StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS);
// It's safe to access logs_ with unlocked mutex_ here because:
// - we've set getting_synced=true for all logs,
// so other threads won't pop from logs_ while we're here,
// - only writer thread can push to logs_, and we're in
// writer thread, so no one will push to logs_,
// - as long as other threads don't modify it, it's safe to read
// from std::deque from multiple threads concurrently.
for (auto& log : logs_) {
status = log.writer->file()->Sync(db_options_.use_fsync);
if (!status.ok()) {
break;
}
}
if (status.ok() && need_log_dir_sync) {
// We only sync WAL directory the first time WAL syncing is
// requested, so that in case users never turn on WAL sync,
// we can avoid the disk I/O in the write code path.
status = directories_.GetWalDir()->Fsync();
}
}
}
// everything above handled the WAL
if (status.ok()) {
PERF_TIMER_GUARD(write_memtable_time);
/**************** now insert the WriteBatch into the memtables ***********/
// InsertInto replays the merged batch into the memtables
status = WriteBatchInternal::InsertInto(
updates, column_family_memtables_.get(),
write_options.ignore_missing_column_families, 0, this, false);
// A non-OK status here indicates iteration failure (either in-memory
// writebatch corruption (very bad), or the client specified invalid
// column family). This will later on trigger bg_error_.
//
// Note that existing logic was not sound. Any partial failure writing
// into the memtable would result in a state that some write ops might
// have succeeded in memtable but Status reports error for all writes.
SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence);
}
// cleanup and bookkeeping
PERF_TIMER_START(write_pre_and_post_process_time);
if (updates == &tmp_batch_) {
tmp_batch_.Clear();
}
mutex_.Lock();
// internal stats
default_cf_internal_stats_->AddDBStats(
InternalStats::BYTES_WRITTEN, batch_size);
default_cf_internal_stats_->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN,
my_batch_count);
if (!write_options.disableWAL) {
if (write_options.sync) {
default_cf_internal_stats_->AddDBStats(InternalStats::WAL_FILE_SYNCED,
1);
}
default_cf_internal_stats_->AddDBStats(
InternalStats::WAL_FILE_BYTES, log_size);
}
if (status.ok()) {
versions_->SetLastSequence(last_sequence);
}
} else {
// Operation failed. Make sure the mutex is held for the cleanup code below.
mutex_.Lock();
}
if (db_options_.paranoid_checks && !status.ok() && !callback_failed &&
!status.IsBusy() && bg_error_.ok()) {
bg_error_ = status; // stop compaction & fail any further writes
}
mutex_.AssertHeld();
if (need_log_sync) {
MarkLogsSynced(logfile_number_, need_log_dir_sync, status);
}
uint64_t writes_for_other = write_batch_group.size() - 1;
if (writes_for_other > 0) {
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER,
writes_for_other);
if (!write_options.disableWAL) {
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL,
writes_for_other);
}
}
mutex_.Unlock();
write_thread_.ExitAsBatchGroupLeader(&w, last_writer, status);
return status;
}
References
http://blog.csdn.net/wang_xijue/article/details/46521605
http://kernelmaker.github.io/Rocksdb_Study_4