Mongodb源码分析--插入记录及索引B树构建

在之前的一篇文章中,介绍了assembleResponse函数(位于instance.cpp第224行),它会根据op操作枚举类型来调用相应的crud操作,枚举类型定义如下:

enum Operations{
opReply
= 1 , /* reply.responseToisset. */
dbMsg
= 1000 , /* genericmsgcommandfollowedbyastring */
dbUpdate
= 2001 , /* updateobject */
dbInsert
= 2002 ,
// dbGetByOID=2003,
dbQuery = 2004 ,
dbGetMore
= 2005 ,
dbDelete
= 2006 ,
dbKillCursors
= 2007
};


可以看到dbInsert = 2002 为插入操作枚举值,下面我们看一下assembleResponse在确定是插入操作时调用的方法,如下:
assembleResponse(Message & m,DbResponse & dbresponse, const SockAddr & client){
.....
try {
if (op == dbInsert){ // 添加记录操作
receivedInsert(m,currentOp);
}
else if (op == dbUpdate){ // 更新记录
receivedUpdate(m,currentOp);
}
else if (op == dbDelete){ // 删除记录
receivedDelete(m,currentOp);
}
else if (op == dbKillCursors){ // 删除Cursors(游标)对象
currentOp.ensureStarted();
logThreshold
= 10 ;
ss
<< " killcursors " ;
receivedKillCursors(m);
}
else {
mongo::log()
<< " operationisn'tsupported: " << op << endl;
currentOp.done();
log
= true ;
}
}
.....
}
}

从上面代码可以看出,系统在确定dbInsert操作时,调用了receivedInsert()方法(位于instance.cpp文件第570行),下面是该方法的定义:

void receivedInsert(Message & m,CurOp & op){
DbMessaged(m);
// 初始化数据库格式的消息
const char * ns = d.getns(); // 获取名空间,用于接下来insert数据
assert( * ns);
uassert(
10058 , " notmaster " ,isMasterNs(ns));
op.debug().str
<< ns;

writelocklk(ns);
// 声明写锁

if (handlePossibleShardedMessage(m, 0 )) // 查看是不是sharding信息,如果是则处理
return ;

Client::Contextctx(ns);
int n = 0 ;
while (d.moreJSObjs()){ // 循环获取当前消息体中的BSONObj数据(数据库记录)
BSONObjjs = d.nextJsObj();
uassert(
10059 , " objecttoinserttoolarge " ,js.objsize() <= BSONObjMaxUserSize);
{
// 声明BSONObj迭代器,以查看里面元素是否有更新操作,如setincpushpull等
BSONObjIteratori(js);
while (i.more()){
BSONElemente
= i.next();
uassert(
13511 , " objecttoinsertcan'thave$modifiers " ,e.fieldName()[ 0 ] != ' $ ' );
}
}
// 插入记录操作,god=false用于标识当前BSONObj对象为有效数据
theDataFileMgr.insertWithObjMod(ns,js, false );
logOp(
" i " ,ns,js); // 日志操作,包括master状态下及sharding分片情况

if ( ++ n % 4 == 0 ){
// 在插入一些数据后,进行持久化操作,有关持久化部分参见我的这篇文章
// http://www.cnblogs.com/daizhj/archive/2011/03/21/1990344.html
getDur().commitIfNeeded();
}
}
globalOpCounters.incInsertInWriteLock(n);
// 在写锁环境下添加已插入记录数(n),锁采用InterlockedIncrement实现数的原子性
}

上面的方法中,主要是在“写锁”环境下执行插入数据操作,并且在插入记录之前进行简单的数据对象检查,如对象长度是否超过上限(BSONObjMaxUserSize),以及待插入对象的字段名是否以$开头(即是否包含set、inc、push、pull等更新修改器),以确保数据的最终有效性。
最终上面代码会调用 insertWithObjMod()方法(位于pdfile.cpp 文件第1432行),该方法定义如下:
DiskLocDataFileMgr::insertWithObjMod( const char * ns,BSONObj & o, bool god){
DiskLocloc
= insert(ns,o.objdata(),o.objsize(),god);
if ( ! loc.isNull()) // 判断返回记录地址是否为空(记录是否插入成功)
o = BSONObj(loc.rec()); // 如有效,则用记录地地址上的记录(record类型指针)绑定到o上
return loc;
}


该方法只是一个对插入操作及返回结果的封装,其中ns为数据对象的名空间,o就是要插入的数据对象(BSONObj),god用于标识当前BSONObj对象是否为有效数据(false=有效),这里之所以要传入god这个参数,是因为在接下来的insert方法里同时支持添加名空间(及索引)和插入记录操作(都会不断调用该方法),而在添加名空间时god=true。

下面我们看一下insert方法(pdfile.cpp 第1467行),因为其内容较长,请详见注释:

DiskLocDataFileMgr::insert( const char * ns, const void * obuf, int len, bool god, const BSONElement & writeId, bool mayAddIndex){
bool wouldAddIndex = false ;
massert(
10093 , " cannotinsertintoreserved$collection " ,god || isANormalNSName(ns));
uassert(
10094 ,str::stream() << " invalidns: " << ns,isValidNS(ns));
const char * sys = strstr(ns, " system. " );
if (sys){ // 对插入记录的ns进行判断,是否要插入保留的数据库名(system),如是则停止执行其它代码
uassert( 10095 , " attempttoinsertinreserveddatabasename'system' " ,sys != ns);
if (strstr(ns, " .system. " )){
// later:checkfordba-typepermissionshereifhavethatatsomepointseparate
if (strstr(ns, " .system.indexes " )) // 判断是否创建索引
wouldAddIndex = true ;
else if (legalClientSystemNS(ns, true ))
;
else if ( ! god){ // 表示obuf有数据,但这就意味着要向system下插入数据(把system当成数据表了)
out () << " ERROR:attempttoinsertinsystemnamespace " << ns << endl;
return DiskLoc();
}
}
else
sys
= 0 ;
}

bool addIndex = wouldAddIndex && mayAddIndex; // 判断是否需要添加索引

NamespaceDetails
* d = nsdetails(ns); // 获取ns的详细信息
if (d == 0 ){
addNewNamespaceToCatalog(ns);
// 向systemcatalog添加新的名空间,它会再次调用当前insert()方法
/* todo:shouldn'tbeinthenamespacecataloguntilaftertheallocationsherework.
alsoifthisisanaddIndex,thosechecksshouldhappenbeforethis!
*/
// 创建第一个数据库文件.
cc().database() -> allocExtent(ns,Extent::initialSize(len), false );
d
= nsdetails(ns);
if ( ! god)
ensureIdIndexForNewNs(ns);
}
d
-> paddingFits();

NamespaceDetails
* tableToIndex = 0 ;

string tabletoidxns;
BSONObjfixedIndexObject;
if (addIndex){
assert(obuf);
BSONObjio((
const char * )obuf);
// 做索引准备工作,这里并不真正创建索引,只是进行参数检查,以及索引是否已存在等
if ( ! prepareToBuildIndex(io,god,tabletoidxns,tableToIndex,fixedIndexObject))
return DiskLoc();

if ( ! fixedIndexObject.isEmpty()){
obuf
= fixedIndexObject.objdata();
len
= fixedIndexObject.objsize();
}

}

const BSONElement * newId = & writeId;
int addID = 0 ;
if ( ! god){
// 检查对象是否有_id字段,没有则添加
// Notethatbtreebucketswhichweinsertaren'tBSONObj's,butinthatcasegod==true.
BSONObjio(( const char * )obuf);
BSONElementidField
= io.getField( " _id " );
uassert(
10099 , " _idcannotbeanarray " ,idField.type() != Array);

if (idField.eoo() /* 判断是否是结束元素 */ && ! wouldAddIndex && strstr(ns, " .local. " ) == 0 ){
addID
= len;
if (writeId.eoo()){
// 初始化一个_id随机值(因为_id可能是12byte类型或其它类型)
idToInsert_.oid.init();
newId
= & idToInsert; // 绑定初始化的_id值
}
len
+= newId -> size();
}
// 如果io对象中有时间戳元素时,并用当前时间进行更新
BSONElementManipulator::lookForTimestamps(io);
}

// 兼容旧的数据文件
DiskLocextentLoc;
int lenWHdr = len + Record::HeaderSize;
lenWHdr
= ( int )(lenWHdr * d -> paddingFactor);
if (lenWHdr == 0 ){
assert(d
-> paddingFactor == 0 );
* getDur().writing( & d -> paddingFactor) = 1.0 ;
lenWHdr
= len + Record::HeaderSize;
}

// 在对新的对象分配空间前检查数据是否会造成索引冲突(唯一索引)
// capped标识是否是固定大小的集合类型,这种类型下系统会自动将过于陈旧的数据remove掉
// 注:此cap与nosql中常说的cap无太大关联
// nosqlcap即:一致性,有效性,分区容忍性
// 参见这篇文章: http://blog.nosqlfan.com/html/1112.html ,
// http://blog.nosqlfan.com/html/96.html )
if (d -> nIndexes && d -> capped && ! god){
checkNoIndexConflicts(d,BSONObj(reinterpret_cast
< const char *> (obuf)));
}

DiskLocloc
= d -> alloc(ns,lenWHdr,extentLoc); // 为当前记录分配空间namespace.cpp__stdAlloc方法
if (loc.isNull()){ // 如果分配失效
if (d -> capped == 0 ){ // cap大小未增加,即
log( 1 ) << " allocatingnewextentfor " << ns << " padding: " << d -> paddingFactor << " lenWHdr: " << lenWHdr << endl;
// 尝试从空闲空间列表中分配空间
cc().database() -> allocExtent(ns,Extent::followupSize(lenWHdr,d -> lastExtentSize), false );
// 尝试再次为当前记录分配空间
loc = d -> alloc(ns,lenWHdr,extentLoc);
if (loc.isNull()){
log()
<< " WARNING:alloc()failedafterallocatingnewextent.lenWHdr: " << lenWHdr << " lastextentsize: " << d -> lastExtentSize << " ;tryingagain\n " ;
for ( int zzz = 0 ;zzz < 10 && lenWHdr > d -> lastExtentSize;zzz ++ ){ // 最多尝试循环10次分配空间
log() << " try# " << zzz << endl;
cc().database()
-> allocExtent(ns,Extent::followupSize(len,d -> lastExtentSize), false );
loc
= d -> alloc(ns,lenWHdr,extentLoc);
if ( ! loc.isNull())
break ;
}
}
}
if (loc.isNull()){ // 最终未分配空间给对象
log() << " insert:couldn'tallocspaceforobjectns: " << ns << " capped: " << d -> capped << endl;
assert(d
-> capped);
return DiskLoc();
}
}

Record
* r = loc.rec();
{
assert(r
-> lengthWithHeaders >= lenWHdr);
r
= (Record * )getDur().writingPtr(r,lenWHdr); // 持久化插入记录信息
if (addID){
/* alittleeffortwasmadeheretoavoidadoublecopywhenweaddanID */
((
int & ) * r -> data) = * (( int * )obuf) + newId -> size();
memcpy(r
-> data + 4 ,newId -> rawdata(),newId -> size()); // 拷贝_id字段到指定记录内存空间
memcpy(r -> data + 4 + newId -> size(),(( char * )obuf) + 4 ,addID - 4 ); // 拷贝数据到指定内存空间
}
else {
if (obuf)
memcpy(r
-> data,obuf,len); // 直接拷贝数据到记录字段r
}
}

{
Extent
* e = r -> myExtent(loc);
if (e -> lastRecord.isNull()){ // 如果未尾记录为空,本人理解:即之前未插入过记录
Extent::FL * fl = getDur().writing(e -> fl());
fl
-> firstRecord = fl -> lastRecord = loc;
r
-> prevOfs = r -> nextOfs = DiskLoc::NullOfs;
}
else {
Record
* oldlast = e -> lastRecord.rec(); // 否则将新记录添加到最后一条记录的后面
r -> prevOfs = e -> lastRecord.getOfs();
r
-> nextOfs = DiskLoc::NullOfs;
getDur().writingInt(oldlast
-> nextOfs) = loc.getOfs();
getDur().writingDiskLoc(e
-> lastRecord) = loc;
}
}

/* 持久化操作并更新相应统计信息 */
{
NamespaceDetails::Stats
* s = getDur().writing( & d -> stats);
s
-> datasize += r -> netLength();
s
-> nrecords ++ ;
}

// 在god时会清空stats信息,同时会添加一个btreebucket(占据存储空间)
if ( ! god)
NamespaceDetailsTransient::get_w(ns).notifyOfWriteOp();
// 在写操作时清空缓存,优化查询优化

if (tableToIndex){
uassert(
13143 , " can'tcreateindexonsystem.indexes " ,tabletoidxns.find( " .system.indexes " ) == string ::npos);

BSONObjinfo
= loc.obj();
bool background = info[ " background " ].trueValue();
if (background && cc().isSyncThread()){
/* don'tdobackgroundindexingonslaves.therearenuances.thiscouldbeaddedlaterbutrequiresmorecode. */
log()
<< " info:indexinginforegroundonthisreplica;wasabackgroundindexbuildontheprimary " << endl;
background
= false ;
}

int idxNo = tableToIndex -> nIndexes;
IndexDetails
& idx = tableToIndex -> addIndex(tabletoidxns.c_str(), ! background); // 清空临时缓存信息;同时递增索引数量
getDur().writingDiskLoc(idx.info) = loc;
try {
buildAnIndex(tabletoidxns,tableToIndex,idx,idxNo,background);
// 创建索引
}
catch (DBException & e){
// 保存异常信息,并执行dropIndexes
LastError * le = lastError. get ();
int savecode = 0 ;
string saveerrmsg;
if (le){
savecode
= le -> code;
saveerrmsg
= le -> msg;
}
else {
savecode
= e.getCode();
saveerrmsg
= e.what();
}

// 回滚索引操作(drop索引)
string name = idx.indexName();
BSONObjBuilderb;
string errmsg;
bool ok = dropIndexes(tableToIndex,tabletoidxns.c_str(),name.c_str(),errmsg,b, true );
if ( ! ok){
log()
<< " failedtodropindexafterauniquekeyerrorbuildingit: " << errmsg << ' ' << tabletoidxns << ' ' << name << endl;
}

assert(le
&& ! saveerrmsg.empty());
raiseError(savecode,saveerrmsg.c_str());
throw ;
}
}

/* 将记录数据添加到索引信息(btree)中 */
if (d -> nIndexes){
try {
BSONObjobj(r
-> data);
indexRecord(d,obj,loc);
}
catch (AssertionException & e){
// _idindex键值重复
if (tableToIndex || d -> capped){
massert(
12583 , " unexpectedindexinsertionfailureoncappedcollection " , ! d -> capped);
string s = e.toString();
s
+= " :onaddIndex/capped-collectionanditsindexwillnotmatch " ;
uassert_nothrow(s.c_str());
error()
<< s << endl;
}
else {
// 回滚上述操作
_deleteRecord(d,ns,r,loc);
throw ;
}
}
}

// out()<<"insertedatloc:"<<hex<<loc.getOfs()<<"lenwhdr:"<<hex<<lenWHdr<<dec<<''<<ns<<endl;
return loc;
}


正如之前所说,该方法会完成添加名空间,添加索引,添加数据记录(memcpy调用)。其中名空间的添加方法 addNewNamespaceToCatalog比较简单,下面主要介绍一下索引的创建过程,这里分为了两步:

1.创建索引树(b树)

2.将数据(主要是地址)添加到索引(树)中

先看一下创建索引过程:

static void buildAnIndex( string ns,NamespaceDetails * d,IndexDetails & idx, int idxNo, bool background){
tlog()
<< " buildingnewindexon " << idx.keyPattern() << " for " << ns << (background ? " background " : "" ) << endl;
Timert;
unsigned
long long n;

if (background){
log(
2 ) << " buildAnIndex:background=true\n " ;
}

assert(
! BackgroundOperation::inProgForNs(ns.c_str())); // shouldhavebeencheckedearlier,betternotbe...
assert(d -> indexBuildInProgress == 0 );
assertInWriteLock();
RecoverableIndexStaterecoverable(d);
if (inDBRepair || ! background){ // 当数据库在repair时或非后台工作方式下
n = fastBuildIndex(ns.c_str(),d,idx,idxNo); // 创建索引
assert( ! idx.head.isNull());
}
else {
BackgroundIndexBuildJobj(ns.c_str());
// 以后台方式创建索引
n = j.go(ns,d,idx,idxNo);
}
tlog()
<< " donefor " << n << " records " << t.millis() / 1000.0 << " secs " << endl;
}



创建索引方法会根据创建方式(是否是后台线程等),使用不同的方法,这里主要讲解非后台方式,也就是上面的fastBuildIndex方法(pdfile.cpp第1101行),其定义如下(内容详见注释):


unsigned long long fastBuildIndex( const char * ns,NamespaceDetails * d,IndexDetails & idx, int idxNo){
CurOp
* op = cc().curop(); // 设置当前操作指针,用于设置操作信息

Timert;

tlog(
1 ) << " fastBuildIndex " << ns << " idxNo: " << idxNo << ' ' << idx.info.obj().toString() << endl;

bool dupsAllowed = ! idx.unique();
bool dropDups = idx.dropDups() || inDBRepair;
BSONObjorder
= idx.keyPattern();

getDur().writingDiskLoc(idx.head).Null();

if (logLevel > 1 )printMemInfo( " beforeindexstart " );

/* 获取并排序所有键值----- */
unsigned
long long n = 0 ;
shared_ptr
< Cursor > c = theDataFileMgr.findAll(ns);
BSONObjExternalSortersorter(order);
sorter.hintNumObjects(d
-> stats.nrecords);
unsigned
long long nkeys = 0 ;
ProgressMeterHolderpm(op
-> setMessage( " index:(1/3)externalsort " ,d -> stats.nrecords, 10 ));
while (c -> ok()){
BSONObjo
= c -> current();
DiskLocloc
= c -> currLoc();

BSONObjSetDefaultOrderkeys;
idx.getKeysFromObject(o,keys);
// 从对象中获取键值信息
int k = 0 ;
for (BSONObjSetDefaultOrder::iteratori = keys.begin();i != keys.end();i ++ ){
if ( ++ k == 2 ){ // 是否是多键索引
d -> setIndexIsMultikey(idxNo);
}
sorter.add(
* i,loc); // 向排序器添加键值和记录位置信息
nkeys ++ ;
}

c
-> advance();
n
++ ;
pm.hit();
if (logLevel > 1 && n % 10000 == 0 ){
printMemInfo(
" \titeratingobjects " );
}

};
pm.finished();

if (logLevel > 1 )printMemInfo( " beforefinalsort " );
sorter.sort();
if (logLevel > 1 )printMemInfo( " afterfinalsort " );

log(t.seconds()
> 5 ? 0 : 1 ) << " \texternalsortused: " << sorter.numFiles() << " files " << " in " << t.seconds() << " secs " << endl;

list
< DiskLoc > dupsToDrop;

/* 创建索引 */
{
BtreeBuilderbtBuilder(dupsAllowed,idx);
// 实例化b树索引对象
// BSONObjkeyLast;
auto_ptr < BSONObjExternalSorter::Iterator > i = sorter.iterator(); // 初始化迭代器用于下面遍历
assert(pm == op -> setMessage( " index:(2/3)btreebottomup " ,nkeys, 10 ));
while (i -> more()){
RARELYkillCurrentOp.checkForInterrupt();
// 检查冲突如shutdown或kill指令
BSONObjExternalSorter::Datad = i -> next();

try {
btBuilder.addKey(d.first,d.second);
// 向b树索引对象中添加索引键值和记录位置信息
}
catch (AssertionException & e){
if (dupsAllowed){
// unknowexception??
throw ;
}

if (e.interrupted())
throw ;

if ( ! dropDups)
throw ;

/* wecouldqueuetheseondisk,butnormallythereareveryfewdups,soinsteadwe
keepinramandhavealimit.
*/
dupsToDrop.push_back(d.second);
uassert(
10092 , " toomaydupsonindexbuildwithdropDups=true " ,dupsToDrop.size() < 1000000 );
}
pm.hit();
}
pm.finished();
op
-> setMessage( " index:(3/3)btree-middle " );
log(t.seconds()
> 10 ? 0 : 1 ) << " \tdonebuildingbottomlayer,goingtocommit " << endl;
btBuilder.commit();
// 提交创建索引操作,该方法会完成最终构造Btree索引操作
wassert(btBuilder.getn() == nkeys || dropDups);
}

log(
1 ) << " \tfastBuildIndexdupsToDrop: " << dupsToDrop.size() << endl;
// 删除索引中已出现的重复记录
for (list < DiskLoc > ::iteratori = dupsToDrop.begin();i != dupsToDrop.end();i ++ )
theDataFileMgr.deleteRecord(ns,i
-> rec(), * i, false , true );

return n;
}

上面方法主要对要创建的索引信息进行提取,并封装到一个BtreeBuilder中,顾名思义,该对象用于进行b树的创建(因为索引也是一个b树),当信息收集排序完成后,就开始创建索引,如下:

btree.cpp1842行
void BtreeBuilder::commit(){
buildNextLevel(first);
committed
= true ;
}

void BtreeBuilder::buildNextLevel(DiskLocloc){
int levels = 1 ;
while ( 1 ){
if (loc.btree() -> tempNext().isNull()){
// 在当前层级上只有一个bucket
getDur().writingDiskLoc(idx.head) = loc;
break ;
}
levels
++ ;

DiskLocupLoc
= BtreeBucket::addBucket(idx); // 添加bucket并实例化上一层DiskLoc
DiskLocupStart = upLoc;
BtreeBucket
* up = upLoc.btreemod(); // 获取上一层的bucket指针

DiskLocxloc
= loc;
while ( ! xloc.isNull()){
RARELY{
getDur().commitIfNeeded();
b
= cur.btreemod();
up
= upLoc.btreemod();
}

BtreeBucket
* x = xloc.btreemod();
BSONObjk;
DiskLocr;
x
-> popBack(r,k); // 弹出当前bucket中最右边的键
bool keepX = (x -> n != 0 ); // 当前bucket中元素个数是否为0
DiskLockeepLoc = keepX ? xloc:x -> nextChild;

// 压入上面弹出的最右边的键值,该键值为当前up(bucket)中最大值
if ( ! up -> _pushBack(r,k,ordering,keepLoc))
{
// 当前bucket已满,则新创建一个addBucket
DiskLocn = BtreeBucket::addBucket(idx);
up
-> tempNext() = n;
upLoc
= n;
up
= upLoc.btreemod();
up
-> pushBack(r,k,ordering,keepLoc);
}

DiskLocnextLoc
= x -> tempNext(); // getnextinchainatcurrentlevel
if (keepX){ // 表示当前结点非顶层结点,则设置它的父结点
x -> parent = upLoc;
}
else {
if ( ! x -> nextChild.isNull())
x
-> nextChild.btreemod() -> parent = upLoc;
x
-> deallocBucket(xloc,idx); // 删除xlocbucket
}
xloc
= nextLoc; // 指向当前层的下个元素
}

loc
= upStart; // 升级当前结点
mayCommitProgressDurably();
}

if (levels > 1 )
log(
2 ) << " btreelevels: " << levels << endl;
}


上面的buildNextLevel方法自下而上根据之前抽取的键值逐层构造一个b树。这里有一个问题需要注意一下,因为mongodb使用bucket来作为b树中的一个层次结点或叶子结点容器(如下图),bucket最大尺寸为8192字节。有关b树索引的更多内容,可以参见下面这篇文章:
mongodb目前关于B树索引的文档 :http://blog.nosqlfan.com/html/758.html
Mongodb源码分析--插入记录及索引B树构建

当初始化了b树索引及空间信息之后,下面就会将数据绑定到相应信息结点上了,也就是DataFileMgr::insert方法(pdfile.cpp文件)的如下代码:

/* 将记录数据添加到索引信息(btree)中 */

if (d -> nIndexes){
try {
BSONObjobj(r
-> data);
indexRecord(d,obj,loc);
}
......
}


上面的indexRecord方法会将键值和数据(包括存储位置)添加到索引中(其中参数d包括之前创建的B树索引信息), 该方法定义如下(pdfile.cpp 第1355行):


/* 将键值和数据(包括存储位置)添加到索引中 */
static void indexRecord(NamespaceDetails * d,BSONObjobj,DiskLocloc){
int n = d -> nIndexesBeingBuilt(); // 获取已(及正在)构建的索引数
for ( int i = 0 ;i < n;i ++ ){
try {
bool unique = d -> idx(i).unique();
// 内联函数(inline):将索引和记录相关信息初始化到btree中
_indexRecord(d,i /* 索引顺序位 */ ,obj,loc, /* dupsAllowed */ ! unique);
}
catch (DBException & ){
/* 如果发生异常,则进行回滚操作
note<=i(not<i)isimportanthereastheindexwewerejustattempted
maybemultikeyandrequiresomecleanup.
*/
for ( int j = 0 ;j <= i;j ++ ){
try {
_unindexRecord(d
-> idx(j),obj,loc, false );
}
catch (...){
log(
3 ) << " unindexfailsonrollbackafteruniquefailure\n " ;
}
}
throw ;
}
}
}


上面的_indexRecord为内联函数(pdfile.cpp)(inline关键字参见C++说明),该函数定义如下:


static inline void _indexRecord(NamespaceDetails * d, int idxNo,BSONObj & obj,DiskLocrecordLoc, bool dupsAllowed){
IndexDetails
& idx = d -> idx(idxNo); //
BSONObjSetDefaultOrderkeys;
idx.getKeysFromObject(obj,keys);
// 从对象信息中获取键属性信息
BSONObjorder = idx.keyPattern();
Orderingordering
= Ordering::make(order); // 初始化排序方式用于下面传参
int n = 0 ;
for (BSONObjSetDefaultOrder::iteratori = keys.begin();i != keys.end();i ++ ){
if ( ++ n == 2 ){
d
-> setIndexIsMultikey(idxNo); // 设置多键值索引
}
assert(
! recordLoc.isNull());
try {
idx.head
/* DiskLoc */ .btree() /* BtreeBucket */ -> bt_insert(idx.head,recordLoc, // 执行向btree中添加记录和绑定索引信息的操作
* i,ordering,dupsAllowed,idx);
}
catch (AssertionException & e){
if (e.getCode() == 10287 && idxNo == d -> nIndexes){
DEVlog()
<< " info:caughtkeyalreadyinindexonbgindexing(ok) " << endl;
continue ;
}
if ( ! dupsAllowed){
// 重复键值异常
throw ;
}
problem()
<< " caughtassertion_indexRecord " << idx.indexNamespace() << endl;
}
}
}


上面方法最终会执行b树插入方法bt_insert(btree.cpp文件1622行),如下(详情见注释):

int BtreeBucket::bt_insert( const DiskLocthisLoc, const DiskLocrecordLoc,
const BSONObj & key, const Ordering & order, bool dupsAllowed,
IndexDetails
& idx, bool toplevel) const {
if (toplevel){ // 如果是顶级节点(如果是通过构造索引方式调用,则toplevel=true)
// 判断键值是否过界(因为其会存储在system.indexs中),其中:KeyMax=8192/10.mongodb开发团队可能会在更高版本中扩大该值
if (key.objsize() > KeyMax){
problem()
<< " Btree::insert:keytoolargetoindex,skipping " << idx.indexNamespace() << ' ' << key.objsize() << ' ' << key.toString() << endl;
return 3 ;
}
}
// 执行添加操作
int x = _insert(thisLoc,recordLoc,key,order,dupsAllowed,DiskLoc(),DiskLoc(),idx);
assertValid(order);
// assert排序方式是否有效

return x;
}


上面代码紧接着会调用btree.cpp文件的内部方法_insert(btree.cpp文件 1554行):


int BtreeBucket::_insert( const DiskLocthisLoc, const DiskLocrecordLoc,
const BSONObj & key, const Ordering & order, bool dupsAllowed,
const DiskLoclChild, const DiskLocrChild,IndexDetails & idx) const {
if (key.objsize() > KeyMax){
problem()
<< " ERROR:keytoolargelen: " << key.objsize() << " max: " << KeyMax << ' ' << key.objsize() << ' ' << idx.indexNamespace() << endl;
return 2 ;
}
assert(key.objsize()
> 0 );

int pos;
// 在btreebucket中使用二分查询,查看键值是否已在所索引信息中
bool found = find(idx,key,recordLoc,order,pos /* 返回该索引信息所在或应该在的位置 */ , ! dupsAllowed);
if (insert_debug){
out () << " " << thisLoc.toString() << ' . ' << " _insert " <<
key.toString()
<< ' / ' << recordLoc.toString() <<
" l: " << lChild.toString() << " r: " << rChild.toString() << endl;
out () << " found: " << found << " pos: " << pos << " n: " << n << endl;
}

if (found){
const _KeyNode & kn = k(pos); // 获取指定磁盘位置的节点信息,_KeyNode
if (kn.isUnused()){ // 查看已存在的键结点是否已使用
log( 4 ) << " btree_insert:reusingunusedkey " << endl;
massert(
10285 , " _insert:reusekeybutlchildisnotnull " ,lChild.isNull());
massert(
10286 , " _insert:reusekeybutrchildisnotnull " ,rChild.isNull());
kn.writing().setUsed();
return 0 ;
}

DEV{
log()
<< " _insert():keyalreadyexistsinindex(okforbackground:true)\n " ;
log()
<< " " << idx.indexNamespace() << " thisLoc: " << thisLoc.toString() << ' \n ' ;
log()
<< " " << key.toString() << ' \n ' ;
log()
<< " " << " recordLoc: " << recordLoc.toString() << " pos: " << pos << endl;
log()
<< " oldlr: " << childForPos(pos).toString() << ' ' << childForPos(pos + 1 ).toString() << endl;
log()
<< " newlr: " << lChild.toString() << ' ' << rChild.toString() << endl;
}
alreadyInIndex();
// 提示键值结点已在索引中,不必再创建,并抛出异常
}

DEBUGGING
out () << " TEMP:key: " << key.toString() << endl;
DiskLocchild
= childForPos(pos); // 查询当前pos的子结点信息,以寻找插入位置
if (insert_debug)
out () << " getChild( " << pos << " ): " << child.toString() << endl;
if (child.isNull() || ! rChild.isNull() /* 在当前buckets中插入,即'internal'插入 */ ){
insertHere(thisLoc,pos,recordLoc,key,order,lChild,rChild,idx);
// 在当前buckets中插入
return 0 ;
}
// 如果有子结点,则在子结点上执行插入操作
return child.btree() -> bt_insert(child,recordLoc,key,order,dupsAllowed,idx, /* toplevel */ false );
}


上面_insert方法首先会使用二分法查找要插入的记录是否已存在于索引中,同时会返回一个插入点(pos),如不存在则会进一步查看插入点位置的子结点(childForPos),以决定是在当前bucket中插入,还是在当前pos位置的(右)子结点(bucket)上插入(这会再次递归调用上面的bt_insert方法),这里我们假定在当前bucket插入,则会执行insertHere方法(btree.cpp文件1183行),它的定义如下:


/* *
*insertakeyinthisbucket,splittingifnecessary.
*@keypos-wheretoinsertthekeyinrange0..n.0=makeleftmost,n=makerightmost.
*NOTEthisfunctionmayfreesomedata,andasaresultthevaluepassedforkeyposmay
*beinvalidaftercallinginsertHere()
*/
void BtreeBucket::insertHere( const DiskLocthisLoc, int keypos,
const DiskLocrecordLoc, const BSONObj & key, const Ordering & order,
const DiskLoclchild, const DiskLocrchild,IndexDetails & idx) const {
if (insert_debug)
out () << " " << thisLoc.toString() << " .insertHere " << key.toString() << ' / ' << recordLoc.toString() << ' '
<< lchild.toString() << ' ' << rchild.toString() << " keypos: " << keypos << endl;

DiskLocoldLoc
= thisLoc;
// 根据keypos插入相应位置并将数据memcpy到内存指定位置
if ( ! basicInsert(thisLoc,keypos,recordLoc,key,order)){
// 如果插入无效,表示当前bucket已满,则分割记录并放到新创建的bucket中
thisLoc.btreemod() -> split(thisLoc,keypos,recordLoc,key,order,lchild,rchild,idx);
return ;
}

{
// 持久化当前thisLoc的结点信息并根据插入位置(是否最后一个key),来更新当前thisLoc(及后面key结点)的子结点信息
const _KeyNode * _kn = & k(keypos);
_KeyNode
* kn = (_KeyNode * )getDur().alreadyDeclared((_KeyNode * )_kn); // alreadydeclaredintentinbasicInsert()
if (keypos + 1 == n){ // n为pack(打包后)存储的记录数,这里"判断等于n"表示为最后(last)一个key
if (nextChild != lchild){ // 如果是最后元素,那么"当前最高键值的右子结点应该与要插入的左子结点相同
out () << " ERRORnextChild!=lchild " << endl;
out () << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl;
out () << " keyPos: " << keypos << " n: " << n << endl;
out () << " nextChild: " << nextChild.toString() << " lchild: " << lchild.toString() << endl;
out () << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl;
out () << " key: " << key.toString() << endl;
dump();
assert(
false );
}
kn
-> prevChildBucket = nextChild; // "当前最高键值的右子结点”绑定到持久化结点的左子结点
assert(kn -> prevChildBucket == lchild);
nextChild.writing()
= rchild; // 持久化"当前最高键值的右子结点”,并将“要插入结点”的右子结点绑定到
if ( ! rchild.isNull()) // 如果有右子结点,则更新右子结点的父结点信息为当前thisLoc
rchild.btree() -> parent.writing() = thisLoc;
}
else {
// 如果keypos位置不是最后一个
kn -> prevChildBucket = lchild; // 将左子结点绑定到keypos位置结点的左子结点上
if (k(keypos + 1 ).prevChildBucket != lchild){ // 这时左子结点应该与下一个元素的左子结点相同
out () << " ERRORk(keypos+1).prevChildBucket!=lchild " << endl;
out () << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl;
out () << " keyPos: " << keypos << " n: " << n << endl;
out () << " k(keypos+1).pcb: " << k(keypos + 1 ).prevChildBucket.toString() << " lchild: " << lchild.toString() << endl;
out () << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl;
out () << " key: " << key.toString() << endl;
dump();
assert(
false );
}
const DiskLoc * pc = & k(keypos + 1 ).prevChildBucket; // 获取keypos后面元素的左子结点信息
* getDur().alreadyDeclared((DiskLoc * )pc) = rchild; // 将右子结点绑定到下一个元素(keypos+1)的左子结点上declaredinbasicInsert()
if ( ! rchild.isNull()) // 如果有右子结点,则更新右子结点的父结点信息为当前thisLoc
rchild.btree() -> parent.writing() = thisLoc;
}
return ;
}
}


该方法中会调用一个叫basicInsert的方法,它主要会在当前bucket中指定位置(keypos)添加记录信息,同时持久化该结点信息,如下:


// tree.cpp1183
bool BucketBasics::basicInsert( const DiskLocthisLoc, int & keypos, const DiskLocrecordLoc, const BSONObj & key, const Ordering & order) const {
assert(keypos
>= 0 && keypos <= n);
// 判断bucket剩余的空间是否满足当前数据需要的存储空间
int bytesNeeded = key.objsize() + sizeof (_KeyNode);
if (bytesNeeded > emptySize){
_pack(thisLoc,order,keypos);
// 如不够用,进行一次整理打包操作,以为bucket中整理更多空间
if (bytesNeeded > emptySize) // 如还不够用,则返回
return false ;
}

BucketBasics
* b; // 声明Bucket管理对象指针,该对象提供了Bucket存储管理的基本操作和属性,如insert,_pack等
{
const char * p = ( const char * ) & k(keypos);
const char * q = ( const char * ) & k(n + 1 );
// declarethatwewillwriteto[k(keypos),k(n)]
// todo:thiswritesamediumamounttothejournal.wemaywanttoaddaverb"shift"totheredologso
// wecanlogaverysmallamount.
b = (BucketBasics * )getDur().writingAtOffset(( void * ) this ,p - ( char * ) this ,q - p);
// 如已有3个结点,目前要插到第三个结点之间,则对每三个元素进行迁移,
// e.g.n==3,keypos==2
// 149
// ->
// 14_9
for ( int j = n;j > keypos;j -- ) // makeroom
b -> k(j) = b -> k(j - 1 );
}
getDur().declareWriteIntent(
& b -> emptySize, 12 ); // [b->emptySize..b->n]is12bytesandwearegoingtowritethose
b -> emptySize -= sizeof (_KeyNode); // 将当前bucket中的剩余空闲空间减少
b -> n ++ ; // 已有结点数加1

_KeyNode
& kn = b -> k(keypos);
kn.prevChildBucket.Null();
// 设置当前结点的左子结点为空
kn.recordLoc = recordLoc; // 绑定结点记录信息
kn.setKeyDataOfs(( short )b -> _alloc(key.objsize())); // 设置结点数据偏移信息
char * p = b -> dataAt(kn.keyDataOfs()); // 实例化指向磁盘数据(journal文件)位置(含偏移量)的指针
getDur().declareWriteIntent(p,key.objsize()); // 持久化结点数据信息
memcpy(p,key.objdata(),key.objsize()); // 将当前结点信息复制到p指向的地址空间
return true ;
}


如果上面方法调用失效(即basicInsert返回false),则意味着当前bucket在整理打包(_pack)之后仍已无可用空间插入新记录,这时系统会调用split(btree.cpp文件 1240行)方法来进行bucket分割,以创建新的bucket并将信息塞入其中,如下:

void BtreeBucket::split( const DiskLocthisLoc, int keypos, const DiskLocrecordLoc, const BSONObj & key, const Ordering & order, const DiskLoclchild, const DiskLocrchild,IndexDetails & idx){
assertWritable();

if (split_debug)
out () << " " << thisLoc.toString() << " .split " << endl;

int split = splitPos(keypos); // 找到要迁移的数据位置
DiskLocrLoc = addBucket(idx); // 添加一个新的BtreeBucket
BtreeBucket * r = rLoc.btreemod();
if (split_debug)
out () << " split: " << split << ' ' << keyNode(split).key.toString() << " n: " << n << endl;
for ( int i = split + 1 ;i < n;i ++ ){
KeyNodekn
= keyNode(i);
r
-> pushBack(kn.recordLoc,kn.key,order,kn.prevChildBucket); // 向新bucket中迁移过剩数据
}
r
-> nextChild = nextChild; // 绑定新bucket的右子结点
r -> assertValid(order);

if (split_debug)
out () << " newrLoc: " << rLoc.toString() << endl;
r
= 0 ;
rLoc.btree()
-> fixParentPtrs(rLoc); // 设置当前bucket树的父指针信息

{
KeyNodesplitkey
= keyNode(split); // 获取内存中分割点位置所存储的数据
nextChild = splitkey.prevChildBucket; // 提升splitkey键,它的子结点将会是thisLoc(l)和rLoc(r)
if (split_debug){
out () << " splitkeykey: " << splitkey.key.toString() << endl;
}

// 将splitkey提升为父结点
if (parent.isNull()){
// 如果无父结点时,则创建一个,并将
DiskLocL = addBucket(idx);
BtreeBucket
* p = L.btreemod();
p
-> pushBack(splitkey.recordLoc,splitkey.key,order,thisLoc);
p
-> nextChild = rLoc; // 将分割的bucket为了当前
p -> assertValid(order);
parent
= idx.head.writing() = L; // 将splitkey提升为父结点
if (split_debug)
out () << " wewereroot,makingnewroot: " << hex << parent.getOfs() << dec << endl;
rLoc.btree()
-> parent.writing() = parent;
}
else {
// setthisbeforecalling_insert-ifitsplitsitwilldofixParent()logicandchangethevalue.
rLoc.btree() -> parent.writing() = parent;
if (split_debug)
out () << " promotingsplitkeykey " << splitkey.key.toString() << endl;
// 提升splitkey键,它的左子结点thisLoc,右子点rLoc
parent.btree() -> _insert(parent,splitkey.recordLoc,splitkey.key,order, /* dupsallowed */ true ,thisLoc,rLoc,idx);
}
}

int newpos = keypos;
// 打包压缩数据(pack,移除无用数据),以提供更多空间
truncateTo(split,order,newpos); // notethismaytrashsplitkey.key.thuswehadtopromoteitbeforefinishinguphere.

// addournewkey,thereisroomnow
{
if (keypos <= split){ // 如果还有空间存储新键
if (split_debug)
out () << " keypos<split,insertHere()thenewkey " << endl;
insertHere(thisLoc,newpos,recordLoc,key,order,lchild,rchild,idx);
// 再次向当前bucket中添加记录
}
else { // 如压缩之后依旧无可用空间,则向新创建的bucket中添加节点
int kp = keypos - split - 1 ;
assert(kp
>= 0 );
rLoc.btree()
-> insertHere(rLoc,kp,recordLoc,key,order,lchild,rchild,idx);
}
}

if (split_debug)
out () << " splitend " << hex << thisLoc.getOfs() << dec << endl;
}




好了,今天的内容到这里就告一段落了,在接下来的文章中,将会介绍客户端发起Delete操作时,Mongodb的执行流程和相应实现部分。

原文链接:http://www.cnblogs.com/daizhj/archive/2011/03/30/1999699.html
作者: daizhj, 代震军
微博: http://t.sina.com.cn/daizhj
Tags: mongodb,c++,btree

作者:代震军发表于 2011-03-30 13:23原文链接

你可能感兴趣的:(mongodb)