badger 数据库目录下的文件
LOCK
目录下的文件,用来保护目录只能被一个进程打开,避免多进程badger同时打开这个目录
*.vlog
用来记录value长度大于阈值(默认1M)的kv对
*.mem
记录lsm tree中memtable和immutable memtable(下文简称immemtable)对应的预写日志(WAL)文件
*.sst
当value的长度大于默认1M就先写vlog,再插入lsm tree,小于1M的就直接插入lsm tree,这种方式的写入主要是为了k,v分离,lsm tree里面不存大的value,在lsm tree合并压缩的时候就不需要读写大value,整体减少磁盘资源的消耗。lsm tree的实现就是只有追加写,把put和delete都改为写日志。
每次的写都是一个entry,entry包括key和value,通过meta来区分是delete还是put,version字段用的是txn.commitTs提交时间
// Entry provides Key, Value, UserMeta and ExpiresAt. This struct can be used by
// the user to set data.
//
// Every write is represented as one Entry. The exported fields are set by the
// caller; the unexported fields are bookkeeping maintained internally.
type Entry struct {
// Key is the key of the key/value pair.
Key []byte
// Value is the value of the key/value pair.
Value []byte
ExpiresAt uint64 // time.Unix
// version is the entry's version; per the accompanying notes it is taken
// from the transaction's commit timestamp (txn.commitTs).
version uint64
offset uint32 // offset is an internal field: the entry's offset within the file.
// UserMeta is a metadata byte set by the user.
UserMeta byte
// meta is the internal metadata byte; per the accompanying notes it is what
// distinguishes a put from a delete.
meta byte
// Fields maintained internally.
hlen int // Length of the header.
// NOTE(review): presumably the value-size threshold applied to this entry
// when deciding whether the value goes to the value log — confirm.
valThreshold int64
}
大于阈值默认1M的value会写入vlog,所有的entry都会写入lsm tree,写入lsm tree其实就是写入memtable,写入前先判断memtable是否满了,满了就创建一个新的memtable,把旧的memtable转为immemtable
// Excerpt from the body of what appears to be writeRequests (the enclosing
// function header, and the definitions of reqs and done, are outside this
// snippet). It writes the batch to the value log, notifies subscribers, and
// then applies each request to the LSM tree, waiting for memtable room first.
db.opt.Debugf("writeRequests called. Writing to value log")
// Write to the value log first. Per the original notes, values with
// len(value) < opt.ValueThreshold are not written to the vlog file.
err := db.vlog.write(reqs)
if err != nil {
done(err)
return err
}
db.opt.Debugf("Sending updates to subscribers")
db.pub.sendUpdates(reqs)
db.opt.Debugf("Writing to memtable")
// count accumulates the total number of entries across all requests.
var count int
for _, b := range reqs {
// Requests without entries have nothing to write.
if len(b.Entries) == 0 {
continue
}
count += len(b.Entries)
// i counts polling attempts so the debug message is emitted only on every
// 100th retry.
var i uint64
var err error
// Retry until ensureRoomForWrite stops reporting errNoRoom; any other
// result (nil or a different error) ends the loop.
for err = db.ensureRoomForWrite(); err == errNoRoom; err = db.ensureRoomForWrite() {
i++
if i%100 == 0 {
db.opt.Debugf("Making room for writes")
}
// We need to poll a bit because both hasRoomForWrite and the flusher need access to s.imm.
// When flushChan is full and you are blocked there, and the flusher is trying to update s.imm,
// you will get a deadlock.
time.Sleep(10 * time.Millisecond)
}
if err != nil {
done(err)
return y.Wrap(err, "writeRequests")
}
// Write this request's entries into the LSM tree.
if err := db.writeToLSM(b); err != nil {
done(err)
return y.Wrap(err, "writeRequests")
}
}
memtable在内存上用的是一个跳表的数据结构,在磁盘上是一个以.mem结尾的文件,写memtable是先把entry写入文件,再修改跳表里面的内存。
// Put records a key/value pair in the memtable. When a WAL is attached the
// entry is first written to it (the .mem file), and only afterwards is the
// in-memory skiplist updated. Transaction finish markers are written to the
// WAL but never inserted into the skiplist.
//
// It returns a wrapped error if the WAL write fails, and nil otherwise.
func (mt *memTable) Put(key []byte, value y.ValueStruct) error {
	e := &Entry{
		Key:       key,
		Value:     value.Value,
		ExpiresAt: value.ExpiresAt,
		UserMeta:  value.UserMeta,
		meta:      value.Meta,
	}

	// The WAL is nil only when badger runs in in-memory mode, in which case
	// there is nothing to persist. If the WAL exceeds opt.ValueLogFileSize
	// the memtable is force-flushed; see the logic in ensureRoomForWrite.
	if mt.wal != nil {
		walErr := mt.wal.writeEntry(mt.buf, e, mt.opt)
		if walErr != nil {
			return y.Wrapf(walErr, "cannot write entry to WAL file")
		}
	}

	// The last entry of a transaction is only a marker: it belongs in the
	// WAL but must not be inserted into the skiplist.
	if e.meta&bitFinTxn != 0 {
		return nil
	}

	// Insert into the skiplist, then track the highest version seen so far.
	mt.sl.Put(key, value)
	ts := y.ParseTs(e.Key)
	if mt.maxVersion < ts {
		mt.maxVersion = ts
	}
	return nil
}
把memtable写满,也就是超过默认配置64M后,memtable 转为immemtable
// ensureRoomForWrite is always called serially.
//
// It returns nil when the active memtable still has room. When the memtable
// is full it tries, without blocking, to hand the memtable to the flusher via
// db.flushChan. On success the memtable is appended to db.imm and replaced by
// a fresh one. If the channel is full it returns errNoRoom so the caller can
// release the lock and retry.
func (db *DB) ensureRoomForWrite() error {
	db.lock.Lock()
	defer db.lock.Unlock()

	// A nil mt indicates that DB is being closed.
	y.AssertTrue(db.mt != nil)

	// Per the original notes, the memtable counts as full once it passes
	// opt.MemTableSize (64M by default).
	if !db.mt.isFull() {
		return nil
	}

	// Non-blocking hand-off to the flusher. Per the original notes, flushChan
	// has capacity opt.NumMemtables (5 by default).
	select {
	case db.flushChan <- flushTask{mt: db.mt}:
		// Task queued; continue below and rotate the memtable.
	default:
		// Bail out so the lock is released and the flusher can modify imm.
		return errNoRoom
	}

	db.opt.Debugf("Flushing memtable, mt.size=%d size of flushChan: %d\n",
		db.mt.sl.MemSize(), len(db.flushChan))

	// The full memtable becomes immutable; a new memtable (and, per the
	// original notes, a new .mem file) takes its place.
	db.imm = append(db.imm, db.mt)
	fresh, err := db.newMemTable()
	db.mt = fresh
	if err != nil {
		return y.Wrapf(err, "cannot create new mem table")
	}

	// The new memtable is empty, so there is certainly room.
	return nil
}
immemtable先转成一个迭代器,从迭代器中读出kv对并写入一个新的table(也就是一个新的sst文件),然后再把这个table加入lsm tree的第0层
// handleFlushTask must be run serially.
//
// It builds a level-0 table from the flush task, materialises it either in
// memory or as a file under db.opt.Dir, and registers it with the level
// controller. If the builder ends up empty, no table is created and nil is
// returned.
func (db *DB) handleFlushTask(ft flushTask) error {
	// ft.mt could be nil with ft.itr being the valid field.
	bopts := buildTableOptions(db)
	builder := buildL0Table(ft, bopts)
	defer builder.Close()

	// The builder can be empty if none of the items in the skiplist were
	// added to it, e.g. when drop prefix is set and every item is skipped.
	if builder.Empty() {
		builder.Finish()
		return nil
	}

	fileID := db.lc.reserveFileID()

	var (
		tbl     *table.Table
		openErr error
	)
	switch {
	case db.opt.InMemory:
		data := builder.Finish()
		tbl, openErr = table.OpenInMemoryTable(data, fileID, &bopts)
	default:
		// Persist the immutable memtable's contents as a table file
		// (an .sst file, per the original notes).
		tbl, openErr = table.CreateTable(table.NewFilename(fileID, db.opt.Dir), builder)
	}
	if openErr != nil {
		return y.Wrap(openErr, "error while creating table")
	}

	// We own a ref on tbl. addLevel0Table takes its own ref, so ours is
	// released afterwards regardless of the outcome.
	addErr := db.lc.addLevel0Table(tbl)
	_ = tbl.DecrRef()
	return addErr
}