本文将分析mongodb中数据平衡的策略.先来看看流程.mongodb开启一个线程balancer专门负责数据的平衡工作,其查看系统中所有的shard,发现有不平衡的情况就选择将其中shard服务器的chunk迁移到其它服务器,让整个系统达到平衡.来看看平衡策略.
1. shard数据大小超过了shard配置的数据大小,从中选取chunk迁移到别处.
2. 找到shard中违反tag规则的chunk,将这些chunk迁移到符合tag规则的shard中.
3. 对所有tag(额外加入了一个空tag,以保证不存在tag时也有tag可供判断)逐一处理:找出该tag中chunk数最多的shard,记其chunk数为max;找出同一tag中chunk数最少的shard,记其chunk数为min;总chunk数记为total.当max-min>=threshold(threshold由下面的代码确定)
时,将尝试从max所在的shard迁移一个chunk到min所在的shard.
//balancedLastTime表示上一个循环是否发生了chunk迁移,发生了则其中记录了迁移的chunk数 if(balancedLastTime || total<20) threshold=2; else if(total<80) threshold=4; else threshold=8; 下面就来具体分析源码.源码位置为mongo\s\balance.cpp run函数,其在mongos初始化时被启动,作为一个单独的线程来管理系统的平衡.
void Balancer::run() { // this is the body of a BackgroundJob so if we throw here we're basically ending the balancer thread prematurely while ( ! inShutdown() ) { if ( ! _init() ) {//初始化不成功则等待1min继续初始化 sleepsecs( 60 ); continue; } break; } int sleepTime = 30; // getConnectioString and dist lock constructor does not throw, which is what we expect on while // on the balancer thread ConnectionString config = configServer.getConnectionString(); DistributedLock balanceLock( config , "balancer" ); while ( ! inShutdown() ) { try { scoped_ptr<ScopedDbConnection> connPtr(//连接配置服务器 ScopedDbConnection::getInternalScopedDbConnection( config.toString() ) ); ScopedDbConnection& conn = *connPtr; // ping has to be first so we keep things in the config server in sync _ping( conn.conn() );//更新config.mongos的ping值,保持自己的连接状态 // use fresh shard state Shard::reloadShardInfo(); // refresh chunk size (even though another balancer might be active) Chunk::refreshChunkSize(); BSONObj balancerConfig; // now make sure we should even be running //当前正在做平衡或者当前collection设置了不需要平衡或者系统设置了不需要自动 //平衡,则这里返回false,否则true. if ( ! grid.shouldBalance( "", &balancerConfig ) ) { // Ping again so scripts can determine if we're active without waiting _ping( conn.conn(), true ); conn.done(); sleepsecs( sleepTime ); continue; } sleepTime = balancerConfig["_nosleep"].trueValue() ? 30 : 6; {//分布式锁 dist_lock_try lk( &balanceLock , "doing balance round" ); if ( ! 
lk.got() ) {//没有得到锁 // Ping again so scripts can determine if we're active without waiting _ping( conn.conn(), true ); conn.done(); sleepsecs( sleepTime ); // no need to wake up soon continue; } vector<CandidateChunkPtr> candidateChunks;//这里选择的CandidateChunk中最多每一个collection一个 _doBalanceRound( conn.conn() , &candidateChunks );//这里的vector中不会同时出现一个collection中两个 if ( candidateChunks.size() == 0 ) { //chunks的情况 _balancedLastTime = 0; } else {//这里设置 _balancedLastTime = _moveChunks( &candidateChunks, balancerConfig["_secondaryThrottle"].trueValue() ); } } // Ping again so scripts can determine if we're active without waiting _ping( conn.conn(), true ); conn.done(); sleepsecs( _balancedLastTime ? sleepTime / 6 : sleepTime ); } catch ( std::exception& e ) { sleepsecs( sleepTime ); // sleep a fair amount b/c of error continue; } } }run->_doBalanceRound
void Balancer::_doBalanceRound( DBClientBase& conn, vector<CandidateChunkPtr>* candidateChunks ) { // 1. Check whether there is any sharded collection to be balanced by querying // the ShardsNS::collections collection auto_ptr<DBClientCursor> cursor = conn.query( ShardNS::collection , BSONObj() ); vector< string > collections; while ( cursor->more() ) { BSONObj col = cursor->nextSafe(); // sharded collections will have a shard "key". if ( ! col["key"].eoo() && ! col["noBalance"].trueValue() )//若collection没有配置不能迁移则将其先记录 collections.push_back( col["_id"].String() ); } cursor.reset(); if ( collections.empty() ) return; // 2. Get a list of all the shards that are participating in this balance round // along with any maximum allowed quotas and current utilization. We get the // latter by issuing db.serverStatus() (mem.mapped) to all shards. vector<Shard> allShards; Shard::getAllShards( allShards ); if ( allShards.size() < 2) //总共才1个shard无法完成迁移工作 return; ShardInfoMap shardInfo; for ( vector<Shard>::const_iterator it = allShards.begin(); it != allShards.end(); ++it ) { const Shard& s = *it; ShardStatus status = s.getStatus();//得到所有shard的配置信息 shardInfo[ s.getName() ] = ShardInfo( s.getMaxSize(), status.mapped(), s.isDraining(), status.hasOpsQueued(),//表示其中还有待写回的数据,下文将会分析到 s.tags() ); } // 3. For each collection, check if the balancing policy recommends moving anything around. 
for (vector<string>::const_iterator it = collections.begin(); it != collections.end(); ++it ) { const string& ns = *it; map< string,vector<BSONObj> > shardToChunksMap; cursor = conn.query( ShardNS::chunk , QUERY( "ns" << ns ).sort( "min" ) ); while ( cursor->more() ) { BSONObj chunk = cursor->nextSafe(); if ( chunk["jumbo"].trueValue() )//这个chunk过大,上一次迁移失败将其标记为了jumbo continue; vector<BSONObj>& chunks = shardToChunksMap[chunk["shard"].String()]; chunks.push_back( chunk.getOwned() ); } cursor.reset(); if (shardToChunksMap.empty()) continue; for ( vector<Shard>::iterator i=allShards.begin(); i!=allShards.end(); ++i ) { // this just makes sure there is an entry in shardToChunksMap for every shard Shard s = *i; shardToChunksMap[s.getName()].size(); } DistributionStatus status( shardInfo, shardToChunksMap ); // load tags,确保tags上建好了索引,这里加载tags条件 conn.ensureIndex( ShardNS::tags, BSON( "ns" << 1 << "min" << 1 ), true ); cursor = conn.query( ShardNS::tags , QUERY( "ns" << ns ).sort( "min" ) ); while ( cursor->more() ) { BSONObj tag = cursor->nextSafe(); uassert( 16356 , str::stream() << "tag ranges not valid for: " << ns , status.addTagRange( TagRange( tag["min"].Obj().getOwned(), tag["max"].Obj().getOwned(), tag["tag"].String() ) ) ); } cursor.reset(); CandidateChunk* p = _policy->balance( ns, status, _balancedLastTime ); if ( p ) candidateChunks->push_back( CandidateChunkPtr( p ) ); } }run->_doBalanceRound->BalancerPolicy::balance
MigrateInfo* BalancerPolicy::balance( const string& ns, const DistributionStatus& distribution, int balancedLastTime ) { // 1) check for shards that policy require to us to move off of // draining, maxSize // 2) check tag policy violations // 3) then we make sure chunks are balanced for each tag // 1) check things we have to move {//第一个move条件,shard的大小超出了配置的大小 const set<string>& shards = distribution.shards(); for ( set<string>::const_iterator z = shards.begin(); z != shards.end(); ++z ) { string shard = *z; const ShardInfo& info = distribution.shardInfo( shard ); if ( ! info.isSizeMaxed() && ! info.isDraining() )//当前shard的大小没有超过shard配置大小,maxsize=0表示shard可无限大 continue; if ( distribution.numberOfChunksInShard( shard ) == 0 )//这个shard没有chunks continue; // now we know we need to move to chunks off this shard // we will if we are allowed if ( info.hasOpsQueued() ) continue; const vector<BSONObj>& chunks = distribution.getChunks( shard ); // since we have to move all chunks, lets just do in order for ( unsigned i=0; i<chunks.size(); i++ ) {//按照tags要求找到一个最适合的shard,将其迁移到这个shard上 BSONObj chunkToMove = chunks[i]; string tag = distribution.getTagForChunk( chunkToMove ); string to = distribution.getBestReceieverShard( tag ); if ( to.size() == 0 ) continue; return new MigrateInfo( ns, to, shard, chunkToMove.getOwned() ); } } } //tag不对,需要移动 // 2) tag violations if ( distribution.tags().size() > 0 ) { const set<string>& shards = distribution.shards(); for ( set<string>::const_iterator i = shards.begin(); i != shards.end(); ++i ) { string shard = *i; const ShardInfo& info = distribution.shardInfo( shard ); const vector<BSONObj>& chunks = distribution.getChunks( shard ); for ( unsigned j = 0; j < chunks.size(); j++ ) { string tag = distribution.getTagForChunk( chunks[j] ); if ( info.hasTag( tag ) )//tags满足这个规则,否则将其迁移 continue; // uh oh, this chunk is in the wrong place string to = distribution.getBestReceieverShard( tag ); if ( to.size() == 0 ) continue; return new MigrateInfo( ns, 
to, shard, chunks[j].getOwned() ); } } } // 3) for each tag balance //根据tag的balance,选取tag中chunks数目中最多的chunks和最小的chunks //当最多的chunks A与最小的chunks B形成A>B+threshold时,从A中迁移一个 //chunks满足tag的chunk到B int threshold = 8; if ( balancedLastTime || distribution.totalChunks() < 20 ) threshold = 2; else if ( distribution.totalChunks() < 80 ) threshold = 4; // randomize the order in which we balance the tags // this is so that one bad tag doesn't prevent others from getting balanced vector<string> tags; { set<string> t = distribution.tags(); for ( set<string>::const_iterator i = t.begin(); i != t.end(); ++i ) tags.push_back( *i ); tags.push_back( "" );//打乱顺序,避免每次按照同样的规则迁移出现问题 std::random_shuffle( tags.begin(), tags.end() ); } for ( unsigned i=0; i<tags.size(); i++ ) { string tag = tags[i]; string from = distribution.getMostOverloadedShard( tag );//这个tag中数据最多的shard名 if ( from.size() == 0 ) continue; unsigned max = distribution.numberOfChunksInShardWithTag( from, tag ); if ( max == 0 ) continue; string to = distribution.getBestReceieverShard( tag ); if ( to.size() == 0 ) return NULL; unsigned min = distribution.numberOfChunksInShardWithTag( to, tag ); const int imbalance = max - min; if ( imbalance < threshold )//相差达到了threshold,将其迁移 continue; //超过了threshold,选取一个shard中tag为该tag的chunks作为迁移对象 const vector<BSONObj>& chunks = distribution.getChunks( from ); for ( unsigned j = 0; j < chunks.size(); j++ ) { if ( distribution.getTagForChunk( chunks[j] ) != tag ) continue; return new MigrateInfo( ns, to, from, chunks[j] ); } } // Everything is balanced here! return NULL; }回到run函数继续看_moveChunks函数.
int Balancer::_moveChunks( const vector<CandidateChunkPtr>* candidateChunks , bool secondaryThrottle ) { int movedCount = 0; for ( vector<CandidateChunkPtr>::const_iterator it = candidateChunks->begin(); it != candidateChunks->end(); ++it ) { const CandidateChunk& chunkInfo = *it->get(); DBConfigPtr cfg = grid.getDBConfig( chunkInfo.ns );//加载相应的配置与chunkManager ChunkManagerPtr cm = cfg->getChunkManager( chunkInfo.ns ); ChunkPtr c = cm->findChunk( chunkInfo.chunk.min );//比照不等说明发生了chunk的拆分,不能再迁移这个chunk了 if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) { // likely a split happened somewhere cm = cfg->getChunkManager( chunkInfo.ns , true /* reload */); c = cm->findChunk( chunkInfo.chunk.min ); if ( c->getMin().woCompare( chunkInfo.chunk.min ) || c->getMax().woCompare( chunkInfo.chunk.max ) ) continue; } BSONObj res;//真正的chunk迁移,前文已经分析,这里不再分析 if ( c->moveAndCommit( Shard::make( chunkInfo.to ) , Chunk::MaxChunkSize , secondaryThrottle , res ) ) { movedCount++;//记录这次迁移的chunk数 continue; } // the move requires acquiring the collection metadata's lock, which can fail if ( res["chunkTooBig"].trueValue() ) {//迁移失败,太大了,将其拆分 // reload just to be safe cm = cfg->getChunkManager( chunkInfo.ns ); c = cm->findChunk( chunkInfo.chunk.min ); res = BSONObj(); c->singleSplit( true , res ); if ( ! res["ok"].trueValue() ) {//拆分失败,将其标记为jumbo c->markAsJumbo(); // we increment moveCount so we do another round right away movedCount++; } } } return movedCount; }到这里mongodb数据的平衡分析完毕,因为前面一系列分析文章的基础,这里的流程还是挺简单的.
原文链接:mongodb源码分析(二十四)mongos数据的平衡
作者: yhjj0108,杨浩