In this post we analyze the migration of a single chunk; the next post will look at MongoDB's shard balancing policy. The topic is split across two posts because chunk migration involves too many commands and too much code to cover in one. Let us start with the migration flow:
1. The donating shard A first records the locations of the data in the chunk to be migrated.
2. A notifies the remote shard B to run _recvChunkStart, starting the chunk migration.
3. B first reads the index definitions from A's system.indexes and inserts them into its own.
4. B reads the chunk's data from A and inserts it into its own collection.
5. B applies the deletes and inserts that occurred on A while B was reading the data.
6. While B is reading data, A keeps asking B whether it has finished and is ready to commit.
7. B reports that it is ready and waits to commit the data.
8. A tells B to commit the data.
9. B commits the data.
10. A updates the configuration on the config server and updates its own chunk manager.
11. A clears its recorded migration-data locations and removes the data that has been moved to B.
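To make the parameters concrete before diving into the code, here is a minimal sketch of issuing the command with the 2.2-era C++ driver. This is only for illustration: in practice mongos or the balancer builds this command, and the hosts, namespace and shardId string below are made up.
#include <iostream>
#include "mongo/client/dbclient.h"

using namespace mongo;

int main() {
    DBClientConnection conn;
    std::string errmsg;
    if ( ! conn.connect( HostAndPort( "shard-a-host:27018" ) , errmsg ) ) { // the donating shard
        std::cout << "connect failed: " << errmsg << std::endl;
        return 1;
    }

    BSONObj cmd = BSON( "moveChunk" << "test.users" <<               // ns is the command's first element
                        "from" << "shard-a-host:27018" <<
                        "to" << "shard-b-host:27018" <<
                        "min" << BSON( "uid" << 0 ) <<                // the chunk covers [min, max)
                        "max" << BSON( "uid" << 100 ) <<
                        "shardId" << "test.users-uid_0" <<
                        "maxChunkSizeBytes" << 64LL * 1024 * 1024 <<
                        "configdb" << "config-host:27019" <<
                        "secondaryThrottle" << false );

    BSONObj res;
    bool ok = conn.runCommand( "admin" , cmd , res );
    std::cout << ( ok ? "moved: " : "move failed: " ) << res.toString() << std::endl;
    return ok ? 0 : 1;
}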
Now to the code. Chunk migration is carried out by the moveChunk command, which runs on the shard the chunk is migrated from. The code is very long, so we will go through it in several parts.
bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
// 1. parse options
// 2. make sure my view is complete and lock
// 3. start migrate
// in a read lock, get all DiskLoc and sort so we can do as little seeking as possible
// tell to start transferring
// 4. pause till migrate caught up
// 5. LOCK
// a) update my config, essentially locking
// b) finish migrate
// c) update config server
// d) logChange to config server
// 6. wait for all current cursors to expire
// 7. remove data locally
// 1. parse and validate the parameters
string ns = cmdObj.firstElement().str();
string to = cmdObj["to"].str();
string from = cmdObj["from"].str(); // my public address, a tad redundant, but safe
// if we do a w=2 after every write
bool secondaryThrottle = cmdObj["secondaryThrottle"].trueValue();
if ( secondaryThrottle && ! anyReplEnabled() )
secondaryThrottle = false;
BSONObj min = cmdObj["min"].Obj();
BSONObj max = cmdObj["max"].Obj();
BSONElement shardId = cmdObj["shardId"];
BSONElement maxSizeElem = cmdObj["maxChunkSizeBytes"];
const long long maxChunkSize = maxSizeElem.numberLong(); // in bytes
if ( ! shardingState.enabled() ) {
string configdb = cmdObj["configdb"].String();
shardingState.enable( configdb );
configServer.init( configdb );
}
MoveTimingHelper timing( "from" , ns , min , max , 6 /* steps */ , errmsg );
// Make sure we're as up-to-date as possible with shard information
// This catches the case where we previously had to change a shard's host by
// removing/adding a shard with the same name
Shard::reloadShardInfo();
// So 2.2 mongod can interact with 2.0 mongos, mongod needs to handle either a conn
// string or a shard in the to/from fields. The Shard constructor handles this,
// eventually we should break the compatibility.
Shard fromShard( from );
Shard toShard( to );
timing.done(1);
// 2.
// take the distributed lock for the collection named by ns
DistributedLock lockSetup( ConnectionString( shardingState.getConfigServer() , ConnectionString::SYNC ) , ns );
dist_lock_try dlk;
dlk = dist_lock_try( &lockSetup , (string)"migrate-" + min.toString() );
dlk.got();
BSONObj chunkInfo = BSON("min" << min << "max" << max << "from" << fromShard.getName() << "to" << toShard.getName() );
configServer.logChange( "moveChunk.start" , ns , chunkInfo );
ShardChunkVersion maxVersion;
string myOldShard;
{
scoped_ptr<ScopedDbConnection> conn(ScopedDbConnection::getInternalScopedDbConnection(shardingState.getConfigServer()) );
BSONObj x;
BSONObj currChunk;
x = conn->get()->findOne( ShardNS::chunk , Query( BSON( "ns" << ns ) ).sort( BSON( "lastmod" << -1 ) ) ); // the chunk with the highest lastmod, i.e. the collection's current max version
currChunk = conn->get()->findOne( ShardNS::chunk , shardId.wrap( "_id" ) ); // currChunk is the chunk about to be moved
maxVersion = ShardChunkVersion::fromBSON( x, "lastmod" );
myOldShard = currChunk["shard"].String();
conn->done();
BSONObj currMin = currChunk["min"].Obj();
BSONObj currMax = currChunk["max"].Obj();
if ( myOldShard != fromShard.getName() )
return false;
if ( maxVersion < shardingState.getVersion( ns ) )
return false;
// since this could be the first call that enable sharding we also make sure to have the chunk manager up to date
shardingState.gotShardName( myOldShard );
// Using the maxVersion we just found will enforce a check - if we use zero version,
// it's possible this shard will be *at* zero version from a previous migrate and
// no refresh will be done
// TODO: Make this less fragile
ShardChunkVersion shardVersion = maxVersion; // refresh the version info and load the chunk manager
shardingState.trySetVersion( ns , shardVersion /* will return updated */ );
}
timing.done(2);
// 3.
ShardChunkManagerPtr chunkManager = shardingState.getShardChunkManager( ns );
BSONObj shardKeyPattern = chunkManager->getKey();
MigrateStatusHolder statusHolder( ns , min , max , shardKeyPattern );
{
// this gets a read lock, so we know we have a checkpoint for mods
// record the disk locations (DiskLocs) of this chunk's documents so that B's
// subsequent reads can fetch them; a set is used because the locations must be sorted
if ( ! migrateFromStatus.storeCurrentLocs( maxChunkSize , errmsg , result ) )
return false;
scoped_ptr<ScopedDbConnection> connTo(
ScopedDbConnection::getScopedDbConnection( toShard.getConnString() ) );
BSONObj res;
bool ok;
ok = connTo->get()->runCommand( "admin" , // tell B to start receiving the chunk
BSON( "_recvChunkStart" << ns <<
"from" << fromShard.getConnString() <<
"min" << min <<
"max" << max <<
"shardKeyPattern" << shardKeyPattern <<
"configServer" << configServer.modelServer() <<
"secondaryThrottle" << secondaryThrottle
) ,
res );
connTo->done();
}
Next let us look at the _recvChunkStart command handler, which runs on B.
bool run(const string& , BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool) {
if ( migrateStatus.getActive() ) { // a chunk migration is already in progress
errmsg = "migrate already in progress";
return false;
}
if ( ! configServer.ok() )
configServer.init( cmdObj["configServer"].String() );
migrateStatus.prepare();
migrateStatus.ns = cmdObj.firstElement().String();
migrateStatus.from = cmdObj["from"].String();
migrateStatus.min = cmdObj["min"].Obj().getOwned();
migrateStatus.max = cmdObj["max"].Obj().getOwned();
migrateStatus.shardKeyPattern = cmdObj["shardKeyPattern"].Obj().getOwned();
migrateStatus.secondaryThrottle = cmdObj["secondaryThrottle"].trueValue();
if ( migrateStatus.secondaryThrottle && ! anyReplEnabled() )
migrateStatus.secondaryThrottle = false;
boost::thread m( migrateThread ); // spawn a dedicated thread to do the migration work
result.appendBool( "started" , true );
return true;
}
Now for the migrateThread thread function. It calls migrateStatus.go to do the migration work, and go is merely a wrapper around _go.
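For completeness, the wrapper is roughly the following try/catch shell (a sketch modeled on the 2.2 source, with logging elided): any exception marks the migration as FAIL, and the active flag is cleared on the way out.
void go() {
    try {
        _go();
    }
    catch ( std::exception& e ) {
        state = FAIL;
        errmsg = e.what();
    }
    catch ( ... ) {
        state = FAIL;
        errmsg = "UNKNOWN ERROR";
    }
    setActive( false );
}
With the wrapper out of the way, here is _go itself: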
void _go() {
slaveCount = ( getSlaveCount() / 2 ) + 1;
scoped_ptr<ScopedDbConnection> connPtr(ScopedDbConnection::getScopedDbConnection( from ) );
ScopedDbConnection& conn = *connPtr; // connection back to the from shard
conn->getLastError(); // just test connection
{
// 0. copy system.namespaces entry if collection doesn't already exist
Client::WriteContext ctx( ns );
// Only copy if ns doesn't already exist
if ( ! nsdetails( ns.c_str() ) ) { // create the collection locally if it does not already exist
string system_namespaces = NamespaceString( ns ).db + ".system.namespaces";
BSONObj entry = conn->findOne( system_namespaces, BSON( "name" << ns ) );
if ( entry["options"].isABSONObj() ) {
string errmsg;
if ( ! userCreateNS( ns.c_str(), entry["options"].Obj(), errmsg, true, 0 ) )
warning() << "failed to create collection with options: " << errmsg
<< endl;
}
}
}
{
// 1. copy indexes
vector<BSONObj> all;
{
// read all of this collection's index definitions from A
auto_ptr<DBClientCursor> indexes = conn->getIndexes( ns );
while ( indexes->more() ) {
all.push_back( indexes->next().getOwned() );
}
}
// inserting a document into <db>.system.indexes automatically builds the
// corresponding index; see the post on data insertion for details
for ( unsigned i=0; i<all.size(); i++ ) {
BSONObj idx = all[i];
Client::WriteContext ct( ns );
string system_indexes = cc().database()->name + ".system.indexes";
theDataFileMgr.insertAndLog( system_indexes.c_str() , idx, true /* flag fromMigrate in oplog */ );
}
}
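// For reference, each index spec copied above is a document of roughly this
// shape (illustrative): { ns: "test.users", key: { uid: 1 }, name: "uid_1" }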
{
// 2. delete any data already in range
RemoveSaver rs( "moveChunk" , ns , "preCleanup" );
long long num = Helpers::removeRange( ns ,
min ,
max ,
findShardKeyIndexPattern_unlocked( ns , shardKeyPattern ) ,
false , /*maxInclusive*/
secondaryThrottle , /* secondaryThrottle */
cmdLine.moveParanoia ? &rs : 0 , /*callback*/
true ); /* flag fromMigrate in oplog */
}
{
// 3. initial bulk clone
state = CLONE;
while ( true ) { // clone the chunk's data from the from shard into the local database
BSONObj res; // ask A for a batch of documents
if ( ! conn->runCommand( "admin" , BSON( "_migrateClone" << 1 ) , res ) ) { // gets array of objects to copy, in disk order
state = FAIL;
conn.done();
return;
}
BSONObj arr = res["objects"].Obj(); // the actual documents
int thisTime = 0;
BSONObjIterator i( arr );
while( i.more() ) {
BSONObj o = i.next().Obj();
{
PageFaultRetryableSection pgrs;
while ( 1 ) {
try {
Lock::DBWrite lk( ns );
Helpers::upsert( ns, o, true ); // insert the document locally
break;
}
catch ( PageFaultException& e ) {
e.touch();
}
}
}
thisTime++;
numCloned++;
clonedBytes += o.objsize();
if ( secondaryThrottle ) { // with secondaryThrottle set, wait for the write to replicate to a secondary before continuing
if ( ! waitForReplication( cc().getLastOp(), 2, 60 /* seconds to wait */ ) ) {
// replication could not keep up; the real code logs a warning here
}
}
}
if ( thisTime == 0 )
break;
}
}
// if running on a replicated system, we'll need to flush the docs we cloned to the secondaries
ReplTime lastOpApplied = cc().getLastOp().asDate();
{
// 4. do bulk of mods
state = CATCHUP;
// the data cloned from the from shard may have been modified while the clone was
// in progress, so keep pulling those modifications over and applying them locally
while ( true ) {
BSONObj res;
if ( ! conn->runCommand( "admin" , BSON( "_transferMods" << 1 ) , res ) ) {
state = FAIL;
conn.done();
return;
}
if ( res["size"].number() == 0 )
break;
apply( res , &lastOpApplied ); // apply the modifications received from A
const int maxIterations = 3600*50;
int i;
for ( i = 0; i < maxIterations; i++ ) { // wait for the mods to replicate to the secondaries
if ( state == ABORT ) {
timing.note( "aborted" );
return;
}
if ( opReplicatedEnough( lastOpApplied ) )
break;
sleepmillis( 20 );
}
}
timing.done(4);
}
{
// 5. wait for commit
state = STEADY;
while ( state == STEADY || state == COMMIT_START ) {
BSONObj res;
if ( ! conn->runCommand( "admin" , BSON( "_transferMods" << 1 ) , res ) ) {
state = FAIL;
conn.done();
return;
}
if ( res["size"].number() > 0 && apply( res , &lastOpApplied ) )
continue;
if ( state == ABORT ) {
timing.note( "aborted" );
return;
}
if ( state == COMMIT_START ) {
if ( flushPendingWrites( lastOpApplied ) )
break;
}
sleepmillis( 10 );
}
if ( state == FAIL ) {
return;
}
timing.done(5);
}
state = DONE;
conn.done();
}
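Throughout _go, state refers to MigrateStatus's small state machine, which in the 2.2 source is roughly the following enum. A's polling loop (shown further below) waits for B to report steady, and the _recvChunkCommit command moves B into COMMIT_START:
enum State { READY , CLONE , CATCHUP , STEADY , COMMIT_START , DONE , FAIL , ABORT } state;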
Now back to the A side. First _migrateClone, the command B runs against A to read the chunk's data. The handler is MigrateFromStatus::clone.
bool clone( string& errmsg , BSONObjBuilder& result ) {
if ( ! _getActive() ) { // the chunk migration must already be under way
return false;
}
ElapsedTracker tracker (128, 10); // same as ClientCursor::_yieldSometimesTracker
int allocSize;
{
Client::ReadContext ctx( _ns );
NamespaceDetails *d = nsdetails( _ns.c_str() );
scoped_spinlock lk( _trackerLocks );
allocSize = std::min(BSONObjMaxUserSize, (int)((12 + d->averageObjectSize()) * _cloneLocs.size()));
}
BSONArrayBuilder a (allocSize);
while ( 1 ) {
bool filledBuffer = false;
auto_ptr<LockMongoFilesShared> fileLock;
Record* recordToTouch = 0;
{
Client::ReadContext ctx( _ns );
scoped_spinlock lk( _trackerLocks );
// iterate over the chunk-document locations recorded earlier
set<DiskLoc>::iterator i = _cloneLocs.begin();
for ( ; i!=_cloneLocs.end(); ++i ) {
if (tracker.intervalHasElapsed()) // should I yield?
break;
DiskLoc dl = *i; // load the actual record
Record* r = dl.rec();
if ( ! r->likelyInPhysicalMemory() ) {
fileLock.reset( new LockMongoFilesShared() );
recordToTouch = r;
break;
}
BSONObj o = dl.obj();
// use the builder size instead of accumulating 'o's size so that we take into consideration
// the overhead of BSONArray indices
if ( a.len() + o.objsize() + 1024 > BSONObjMaxUserSize ) {
filledBuffer = true; // break out of outer while loop
break;
}
a.append( o ); // copy the actual document
}
_cloneLocs.erase( _cloneLocs.begin() , i ); // drop the locations that have already been copied
if ( _cloneLocs.empty() || filledBuffer )
break;
}
if ( recordToTouch ) {
// its safe to touch here because we have a LockMongoFilesShared
// we can't do it where we get the lock because we would have to unlock the main readlock and the _trackerLocks
// simpler to handle this out there
recordToTouch->touch();
recordToTouch = 0;
}
}
result.appendArray( "objects" , a.arr() ); // this array is what gets sent back to B
return true;
}
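Two details of clone are worth a worked example. The array builder is preallocated to min(BSONObjMaxUserSize, (12 + averageObjectSize) * number of recorded locations): with, say, 1 KB average documents and 50,000 recorded locations the product is roughly 50 MB, so it gets clamped to the 16 MB BSON limit. And the copy loop breaks once the builder comes within 1 KB of BSONObjMaxUserSize, so each _migrateClone call returns at most one ~16 MB batch; that is why B keeps calling it until a call yields an empty array (thisTime == 0).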
Next, the _transferMods command, which transfers to B the log of modifications made on A while B was reading data. The handler is MigrateFromStatus::transferMods.
bool transferMods( string& errmsg , BSONObjBuilder& b ) {
if ( ! _getActive() )
return false;
long long size = 0;
Client::ReadContext cx( _ns ); // ship the deleted and reload op logs to B
xfer( &_deleted , b , "deleted" , size , false );
xfer( &_reload , b , "reload" , size , true );
b.append( "size" , size );
return true;
}
void xfer( list<BSONObj> * l , BSONObjBuilder& b , const char * name , long long& size , bool explode ) {
const long long maxSize = 1024 * 1024;
if ( l->size() == 0 || size > maxSize )
return;
BSONArrayBuilder arr(b.subarrayStart(name));
list<BSONObj>::iterator i = l->begin();
while ( i != l->end() && size < maxSize ) {
BSONObj t = *i;
if ( explode ) {
BSONObj it; // only the _id was logged, so fetch the current full document
if ( Helpers::findById( cc() , _ns.c_str() , t, it ) ) {
arr.append( it );
size += it.objsize();
}
}
else
arr.append( t );
i = l->erase( i );
size += t.objsize();
}
arr.done();
}
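On the B side, the apply call seen earlier in _go consumes exactly this format: documents under "deleted" carry only an _id and are deleted locally, while documents under "reload" are full documents and are upserted. A condensed sketch, modeled on the 2.2 source (logging, the RemoveSaver and the out-of-range delete check are elided, and the delete call is simplified):
bool apply( const BSONObj& xfer , ReplTime* lastOpApplied ) {
    bool didAnything = false;
    if ( xfer["deleted"].isABSONObj() ) {       // ids deleted on A: delete them here too
        Lock::DBWrite lk( ns );
        Client::Context cx( ns );
        BSONObjIterator i( xfer["deleted"].Obj() );
        while ( i.more() ) {
            BSONObj id = i.next().Obj();
            deleteObjects( ns.c_str() , id , true /* justOne */ , true /* logop */ ); // simplified; the real code goes through Helpers
            *lastOpApplied = cc().getLastOp().asDate();
            didAnything = true;
        }
    }
    if ( xfer["reload"].isABSONObj() ) {        // docs inserted/updated on A: upsert them here
        Lock::DBWrite lk( ns );
        Client::Context cx( ns );
        BSONObjIterator i( xfer["reload"].Obj() );
        while ( i.more() ) {
            Helpers::upsert( ns , i.next().Obj() );
            *lastOpApplied = cc().getLastOp().asDate();
            didAnything = true;
        }
    }
    return didAnything;
}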
Now let us see where _deleted and _reload are populated.
void logOp(const char *opstr, const char *ns, const BSONObj& obj, BSONObj *patt, bool *b, bool fromMigrate) {
if ( replSettings.master )
_logOp(opstr, ns, 0, obj, patt, b, fromMigrate);
logOpForSharding( opstr , ns , obj , patt );
}
void logOpForSharding( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt ) {
migrateFromStatus.logOp( opstr , ns , obj , patt );
}
void logOp( const char * opstr , const char * ns , const BSONObj& obj , BSONObj * patt ) {
// while a chunk migration is active, record deletes and inserts/updates so they can be replayed on B
if ( ! _getActive() )
return;
if ( _ns != ns )
return;
// no need to log if this is not an insertion, an update, or an actual deletion
// note: opstr 'db' isn't a deletion but a mention that a database exists (for replication
// machinery mostly)
char op = opstr[0];
if ( op == 'n' || op =='c' || ( op == 'd' && opstr[1] == 'b' ) )
return;
BSONElement ide;
if ( patt )
ide = patt->getField( "_id" );
else
ide = obj["_id"];
BSONObj it;
switch ( opstr[0] ) {
case 'd': {
if ( getThreadName() == cleanUpThreadName ) {
// we don't want to xfer things we're cleaning
// as then they'll be deleted on TO
// which is bad
return;
}
// can't filter deletes :(
_deleted.push_back( ide.wrap() );
_memoryUsed += ide.size() + 5;
return;
}
case 'i':
it = obj;
break;
case 'u':
if ( ! Helpers::findById( cc() , _ns.c_str() , ide.wrap() , it ) ) {
return;
}
break;
}
if ( ! isInRange( it , _min , _max ) )
return;
_reload.push_back( ide.wrap() );
_memoryUsed += ide.size() + 5;
}
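The isInRange check above keeps _reload from collecting writes that fall outside the migrating chunk. A plausible shape for it (an assumption, reconstructed from its use here rather than copied from the source): extract the shard-key fields, and the document belongs to the chunk iff min <= key < max.
static bool isInRange( const BSONObj& obj , const BSONObj& min , const BSONObj& max ) {
    BSONObj k = obj.extractFields( min , true ); // pull out the shard-key fields
    return k.woCompare( min ) >= 0 && k.woCompare( max ) < 0;
}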
Now back to the moveChunk command on the A side:
// 4.
for ( int i=0; i<86400; i++ ) { // don't want a single chunk move to take more than a day
sleepsecs( 1 );
scoped_ptr<ScopedDbConnection> conn(
ScopedDbConnection::getScopedDbConnection( toShard.getConnString() ) );
BSONObj res;
bool ok;
// poll the receiving side: has it finished copying and is it ready to commit?
ok = conn->get()->runCommand( "admin" , BSON( "_recvChunkStatus" << 1 ) , res );
res = res.getOwned();
conn->done();
if ( ! ok || res["state"].String() == "fail" ) {
result.append( "cause" , res );
return false;
}
if ( res["state"].String() == "steady" )//等待ready状态
break;//迁移用内存太多,告知B端终止动作
if ( migrateFromStatus.mbUsed() > (500 * 1024 * 1024) ) {
// this is too much memory for us to use for this
// so we're going to abort the migrate
scoped_ptr<ScopedDbConnection> conn(ScopedDbConnection::getScopedDbConnection( toShard.getConnString() ) );
BSONObj res;
conn->get()->runCommand( "admin" , BSON( "_recvChunkAbort" << 1 ) , res );
res = res.getOwned();
conn->done();
result.appendBool( "split" , true );
return false;
}
killCurrentOp.checkForInterrupt();
}
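// For reference, the status document returned by _recvChunkStatus looks roughly
// like this (the exact field set is an assumption; only "state" is used here):
// { active: true, ns: "test.users", min: {...}, max: {...}, state: "steady",
//   counts: { cloned: ..., clonedBytes: ..., catchup: ..., steady: ... }, ok: 1 }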
// 5.
{ // the actual commit
// 5.a
// we're under the collection lock here, so no other migrate can change maxVersion or ShardChunkManager state
migrateFromStatus.setInCriticalSection( true );
ShardChunkVersion currVersion = maxVersion;
ShardChunkVersion myVersion = currVersion;
myVersion.incMajor(); // the chunk migration bumps the chunk's major version by one
{
Lock::DBWrite lk( ns );
verify( myVersion > shardingState.getVersion( ns ) );
// bump the chunks manager's version up and "forget" about the chunk being moved
// this is not the commit point, but in practice the state in this shard won't change until the commit is done
shardingState.donateChunk( ns , min , max , myVersion ); // drop this chunk from the local chunk manager
}
// 5.b
// we're under the collection lock here, too, so we can undo the chunk donation because no other state change
// could be ongoing
{
BSONObj res;
scoped_ptr<ScopedDbConnection> connTo(ScopedDbConnection::getScopedDbConnection( toShard.getConnString() ) );
bool ok; // tell B to commit the data
ok = connTo->get()->runCommand( "admin" ,BSON( "_recvChunkCommit" << 1 ) ,res );
connTo->done();
if ( ! ok ) { // the commit failed, so add the donated chunk back
Lock::DBWrite lk( ns );
// revert the chunk manager back to the state before "forgetting" about the chunk
shardingState.undoDonateChunk( ns , min , max , currVersion );
result.append( "cause" , res );
return false;
}
}
// 5.c
// version at which the next highest lastmod will be set
// if the chunk being moved is the last in the shard, nextVersion is that chunk's lastmod
// otherwise the highest version is from the chunk being bumped on the FROM-shard
ShardChunkVersion nextVersion;
// we want to go only once to the configDB but perhaps change two chunks, the one being migrated and another
// local one (so to bump version for the entire shard)
// we use the 'applyOps' mechanism to group the two updates and make them safer
// TODO pull config update code to a module
BSONObjBuilder cmdBuilder;
// update the chunk metadata on the config server: the moved chunk now belongs
// to another shard, so its entry in the chunks collection must be rewritten
BSONArrayBuilder updates( cmdBuilder.subarrayStart( "applyOps" ) );
{
// update for the chunk being moved
BSONObjBuilder op;
op.append( "op" , "u" );
op.appendBool( "b" , false /* no upserting */ );
op.append( "ns" , ShardNS::chunk );
BSONObjBuilder n( op.subobjStart( "o" ) );
n.append( "_id" , Chunk::genID( ns , min ) );
myVersion.addToBSON( n, "lastmod" );
n.append( "ns" , ns );
n.append( "min" , min );
n.append( "max" , max );
n.append( "shard" , toShard.getName() );
n.done();
BSONObjBuilder q( op.subobjStart( "o2" ) );
q.append( "_id" , Chunk::genID( ns , min ) );
q.done();
updates.append( op.obj() );
}
nextVersion = myVersion;
// if we have chunks left on the FROM shard, update the version of one of them as well
// we can figure that out by grabbing the chunkManager installed on 5.a
// TODO expose that manager when installing it
ShardChunkManagerPtr chunkManager = shardingState.getShardChunkManager( ns );
if( chunkManager->getNumChunks() > 0 ) {
// get another chunk on that shard
BSONObj lookupKey;
BSONObj bumpMin, bumpMax;
do {
chunkManager->getNextChunk( lookupKey , &bumpMin , &bumpMax );
lookupKey = bumpMin;
}
while( bumpMin == min );
BSONObjBuilder op;
op.append( "op" , "u" );
op.appendBool( "b" , false );
op.append( "ns" , ShardNS::chunk );
nextVersion.incMinor(); // same as used on donateChunk
BSONObjBuilder n( op.subobjStart( "o" ) );
n.append( "_id" , Chunk::genID( ns , bumpMin ) );
nextVersion.addToBSON( n, "lastmod" );
n.append( "ns" , ns );
n.append( "min" , bumpMin );
n.append( "max" , bumpMax );
n.append( "shard" , fromShard.getName() );
n.done();
BSONObjBuilder q( op.subobjStart( "o2" ) );
q.append( "_id" , Chunk::genID( ns , bumpMin ) );
q.done();
updates.append( op.obj() );
}
updates.done();
BSONArrayBuilder preCond( cmdBuilder.subarrayStart( "preCondition" ) );
{
BSONObjBuilder b;
b.append( "ns" , ShardNS::chunk );
b.append( "q" , BSON( "query" << BSON( "ns" << ns ) << "orderby" << BSON( "lastmod" << -1 ) ) );
{
BSONObjBuilder bb( b.subobjStart( "res" ) );
// TODO: For backwards compatibility, we can't yet require an epoch here
bb.appendTimestamp( "lastmod", maxVersion.toLong() );
bb.done();
}
preCond.append( b.obj() );
}
preCond.done();
BSONObj cmd = cmdBuilder.obj();
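// At this point cmd looks roughly like the following (values illustrative):
// { applyOps: [ { op: "u", b: false, ns: "config.chunks",
//                 o:  { _id: "test.users-uid_0", lastmod: <myVersion>, ns: "test.users",
//                       min: {...}, max: {...}, shard: <B's name> },
//                 o2: { _id: "test.users-uid_0" } },
//               /* plus, if A keeps other chunks, one more update bumping a
//                  remaining chunk's lastmod to nextVersion on shard A */ ],
//   preCondition: [ { ns: "config.chunks",
//                     q: { query: { ns: "test.users" }, orderby: { lastmod: -1 } },
//                     res: { lastmod: <maxVersion> } } ] }
// The preCondition makes the update atomic: it only applies if nobody has bumped
// the collection's highest lastmod since maxVersion was read at the start.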
bool ok = false;
BSONObj cmdResult; // run the update against the config server
scoped_ptr<ScopedDbConnection> conn(
ScopedDbConnection::getInternalScopedDbConnection(
shardingState.getConfigServer() ) );
ok = conn->get()->runCommand( "config" , cmd , cmdResult );
conn->done();
migrateFromStatus.setInCriticalSection( false );
// 5.d
configServer.logChange( "moveChunk.commit" , ns , chunkInfo );
}
migrateFromStatus.done();
{ // finally, remove the local copy of the chunk's data
// 6.
OldDataCleanup c;
c.secondaryThrottle = secondaryThrottle;
c.ns = ns;
c.min = min.getOwned();
c.max = max.getOwned();
c.shardKeyPattern = shardKeyPattern.getOwned();
ClientCursor::find( ns , c.initial );
if ( c.initial.size() ) {
boost::thread t( boost::bind( &cleanupOldData , c ) );
}
else {
// 7.
c.doRemove();
}
}
return true;
}
At this point the migration of a chunk is complete. The flow is long, but the code is fairly easy to read; just take care not to mix up which side, A or B, a given piece of code runs on.
Original article: mongodb源码分析(二十三)mongos chunk的迁移
Author: yhjj0108, 杨浩