badger的写过程

文件后缀

badgerdb 数据库目录下的文件

  • LOCK
    目录锁文件,保证该目录同一时间只能被一个进程打开,避免多个badger进程同时打开同一个目录

  • *.vlog
    value log文件,用来记录value长度大于阈值(opt.ValueThreshold,默认1M)的kv对

  • *.mem
    memtable和immemtable在磁盘上对应的文件(WAL),记录lsm tree中memtable和immemtable的内容

  • *.sst
    lsm tree中的table文件,immemtable刷盘后会生成一个新的sst文件并插入lsm tree第0层

整体写流程

当value的长度大于阈值(默认1M)时,先写vlog,再插入lsm tree;小于阈值的kv对则直接插入lsm tree。这种写入方式主要是为了实现k,v分离:lsm tree里面不存大的value,这样lsm tree在合并压缩的时候就不需要读写大value,整体上减少了磁盘资源的消耗。lsm tree的实现只有追加写,put和delete都被转换为写日志。

写日志项

每次写入都是一个entry,entry包括key和value,通过meta字段来区分是delete还是put,version字段使用的是事务的提交时间戳txn.commitTs。

// Entry provides Key, Value, UserMeta and ExpiresAt. This struct can be used by
// the user to set data.
//
// The exported fields are the ones a caller fills in; the unexported fields
// are bookkeeping maintained internally.
type Entry struct {
	Key       []byte
	Value     []byte
	ExpiresAt uint64 // time.Unix
	// NOTE(review): presumably the commit timestamp of the writing
	// transaction — confirm against the code that sets it.
	version   uint64
	offset    uint32 // offset is an internal field: the entry's offset within its file.
	UserMeta  byte
	// Internal flag bits; memTable.Put tests it against bitFinTxn.
	meta      byte

	// Fields maintained internally.
	hlen         int // Length of the header.
	valThreshold int64
}

先写vlog再写lsm tree

大于阈值(默认1M)的value会写入vlog,而所有的entry都会写入lsm tree。写入lsm tree其实就是写入memtable:写入前先判断memtable是否已满,满了就把旧的memtable转为immemtable,并创建一个新的memtable。

	// Excerpt from the write path (the error wraps below use the prefix
	// "writeRequests"). The requests are first handed to the value log, then
	// published to subscribers, and finally applied to the LSM tree one
	// request at a time.
	db.opt.Debugf("writeRequests called. Writing to value log")
	// Write to the value log. Author's note: entries with
	// len(value) < opt.ValueThreshold are not written to the vlog file.
	err := db.vlog.write(reqs)
	if err != nil {
		done(err)
		return err
	}

	db.opt.Debugf("Sending updates to subscribers")
	db.pub.sendUpdates(reqs)
	db.opt.Debugf("Writing to memtable")
	var count int
	for _, b := range reqs {
		// Requests without entries have nothing to apply.
		if len(b.Entries) == 0 {
			continue
		}
		count += len(b.Entries)
		var i uint64
		var err error
		// Keep retrying while ensureRoomForWrite reports errNoRoom, sleeping
		// 10ms between attempts and logging every 100th attempt.
		for err = db.ensureRoomForWrite(); err == errNoRoom; err = db.ensureRoomForWrite() {
			i++
			if i%100 == 0 {
				db.opt.Debugf("Making room for writes")
			}
			// We need to poll a bit because both hasRoomForWrite and the flusher need access to s.imm.
			// When flushChan is full and you are blocked there, and the flusher is trying to update s.imm,
			// you will get a deadlock.
			time.Sleep(10 * time.Millisecond)
		}
		// Any error other than errNoRoom aborts the whole write.
		if err != nil {
			done(err)
			return y.Wrap(err, "writeRequests")
		}
		// Write the request's entries to the LSM tree.
		if err := db.writeToLSM(b); err != nil {
			done(err)
			return y.Wrap(err, "writeRequests")
		}
	}

写memtable

memtable在内存中使用跳表这一数据结构,在磁盘上对应一个以.mem结尾的文件。写memtable时先把entry写入.mem文件,再更新内存中的跳表。

// Put stores the key/value pair in the memtable. When a WAL is attached, the
// entry is appended to it first; only then is the in-memory skiplist updated.
// Transaction finish markers are written to the WAL but never reach the
// skiplist. A WAL write failure is returned wrapped, and the skiplist is left
// untouched in that case.
func (mt *memTable) Put(key []byte, value y.ValueStruct) error {
	rec := new(Entry)
	rec.Key = key
	rec.Value = value.Value
	rec.UserMeta = value.UserMeta
	rec.meta = value.Meta
	rec.ExpiresAt = value.ExpiresAt

	// wal is nil only when badger is running in in-memory mode, where no WAL
	// is needed. If the WAL exceeds opt.ValueLogFileSize, the memTable is
	// force flushed; see the logic in ensureRoomForWrite.
	if mt.wal != nil {
		err := mt.wal.writeEntry(mt.buf, rec, mt.opt)
		if err != nil {
			return y.Wrapf(err, "cannot write entry to WAL file")
		}
	}

	// The finish marker goes into the WAL but is not inserted into the
	// skiplist.
	isFinMarker := rec.meta&bitFinTxn != 0
	if isFinMarker {
		return nil
	}

	// Insert into the skiplist and track the highest version seen so far.
	mt.sl.Put(key, value)
	ts := y.ParseTs(rec.Key)
	if mt.maxVersion < ts {
		mt.maxVersion = ts
	}
	return nil
}

memtable转immemtable

memtable写满(也就是超过默认配置的64M)后,会被转为immemtable

// ensureRoomForWrite is always called serially.
//
// It returns nil when the current memtable still has room. When the memtable
// is full it tries, without blocking, to queue the memtable on flushChan: on
// success the memtable is appended to db.imm and replaced by a fresh one; if
// the channel cannot accept the task, errNoRoom is returned so the caller can
// retry after db.lock has been released.
func (db *DB) ensureRoomForWrite() error {
	db.lock.Lock()
	defer db.lock.Unlock()

	// A nil mt indicates that DB is being closed.
	y.AssertTrue(db.mt != nil)
	if !db.mt.isFull() {
		return nil
	}

	// Non-blocking hand-off of the full memtable to the flusher.
	select {
	case db.flushChan <- flushTask{mt: db.mt}:
		// The task was queued; fall through and swap the memtables.
	default:
		// Returning here unlocks db.lock and lets the flusher modify imm.
		return errNoRoom
	}

	db.opt.Debugf("Flushing memtable, mt.size=%d size of flushChan: %d\n",
		db.mt.sl.MemSize(), len(db.flushChan))

	// The full memtable joins the immutable list and a new one takes its
	// place.
	db.imm = append(db.imm, db.mt)
	var err error
	db.mt, err = db.newMemTable()
	if err != nil {
		return y.Wrapf(err, "cannot create new mem table")
	}
	// The new memtable is empty, so there is certainly room.
	return nil
}

immemtable转sst

immemtable先被转成一个迭代器,从迭代器中读出kv对,再写入一个新的table(也就是一个新的sst文件),最后把这个table插入lsm tree的第0层

// handleFlushTask must be run serially.
//
// It builds a level-0 table from the flush task and registers it through
// db.lc.addLevel0Table. When the builder ends up empty, no table is created
// and nil is returned. Table creation failures are returned wrapped; otherwise
// the result of addLevel0Table is returned.
func (db *DB) handleFlushTask(ft flushTask) error {
	// ft.mt could be nil with ft.itr being the valid field.
	tableOpts := buildTableOptions(db)
	b := buildL0Table(ft, tableOpts)
	defer b.Close()

	// The builder is empty when none of the items in the skiplist were added
	// to it. This can happen when drop prefix is set and all the items are
	// skipped.
	if b.Empty() {
		b.Finish()
		return nil
	}

	var (
		id  = db.lc.reserveFileID()
		t   *table.Table
		err error
	)
	if !db.opt.InMemory {
		// Create the table under a file name derived from the reserved ID and
		// db.opt.Dir.
		path := table.NewFilename(id, db.opt.Dir)
		t, err = table.CreateTable(path, b)
	} else {
		t, err = table.OpenInMemoryTable(b.Finish(), id, &tableOpts)
	}
	if err != nil {
		return y.Wrap(err, "error while creating table")
	}

	// We own a ref on t. addLevel0Table will incrRef, after which our own ref
	// is released regardless of the outcome.
	addErr := db.lc.addLevel0Table(t)
	_ = t.DecrRef()
	return addErr
}

你可能感兴趣的:(badgerdb)