In this post we look at how MongoDB splits a chunk. There are two cases.
1. A chunk whose range reaches MinKey or MaxKey is the first (or last) chunk of the collection and is expected to receive most of the future inserts, so it is not cut in the middle. If the chunk starts at MinKey, the split point is the smallest shard-key value actually present, say min1, producing [MinKey, min1) and [min1, max); if the chunk ends at MaxKey, the split point is the largest existing value max1, producing [min, max1) and [max1, MaxKey). The nearly empty chunk left at the edge is there to absorb the inserts that are expected to follow.
2. An ordinary chunk [a, b) is split by size: the split point is chosen so that the two resulting chunks hold roughly the same amount of data.
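For example (hypothetical shard key { x: 1 } and key values), the very first chunk of a collection covers the whole key space and gets the edge treatment: if the smallest stored key is { x: 1 }, the split produces

{ min: { x: MinKey }, max: { x: 1 } }       // nearly empty edge chunk
{ min: { x: 1 },      max: { x: MaxKey } }  // holds all the existing data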
Now let's walk through the source, starting with Chunk::singleSplit.
BSONObj Chunk::singleSplit( bool force , BSONObj& res ) const {
vector<BSONObj> splitPoint;
// if splitting is not obligatory we may return early if there are not enough data
// we cap the number of objects that would fall in the first half (before the split point)
// the rationale is we'll find a split point without traversing all the data
if ( ! force ) {
vector<BSONObj> candidates;
const int maxPoints = 2;// ask for at most two candidate split points; getCurrentDesiredChunkSize()
// is the target chunk size in bytes and MaxObjectPerChunk caps the number of objects per chunk
pickSplitVector( candidates , getManager()->getCurrentDesiredChunkSize() , maxPoints , MaxObjectPerChunk );
if ( candidates.size() <= 1 ) {
// no split points means there isn't enough data to split on
// 1 split point means we have between half the chunk size to full chunk size
// so we shouldn't split
return BSONObj();
}
splitPoint.push_back( candidates.front() );
}
else {// forced split: cut the existing chunk at its median key, i.e. roughly in half
// if forcing a split, use the chunk's median key
BSONObj medianKey;
pickMedianKey( medianKey );
if ( ! medianKey.isEmpty() )
splitPoint.push_back( medianKey );
}
// We assume that if the chunk being split is the first (or last) one on the collection, this chunk is
// likely to see more insertions. Instead of splitting mid-chunk, we use the very first (or last) key
// as a split point.
if ( minIsInf() ) {// this chunk starts at MinKey, i.e. it is the first chunk of the collection;
splitPoint.clear();// expecting more inserts, split at an edge key instead of mid-chunk
BSONObj key = _getExtremeKey( 1 );// ascending sort: the smallest shard key actually present
if ( ! key.isEmpty() ) {
splitPoint.push_back( key );
}
}
else if ( maxIsInf() ) {// this chunk ends at MaxKey, i.e. it is the last chunk of the collection;
splitPoint.clear();// split at the largest existing key so new inserts land in a nearly empty chunk
BSONObj key = _getExtremeKey( -1 );// descending sort: the largest shard key actually present
if ( ! key.isEmpty() ) {
splitPoint.push_back( key );
}
}
// Normally, we'd have a sound split point here if the chunk is not empty. It's also a good place to
// sanity check.
if ( splitPoint.empty() || _min == splitPoint.front() || _max == splitPoint.front() ) {
return BSONObj();
}
if (multiSplit( splitPoint , res ))// multiSplit performs the actual split
return splitPoint.front();
else
return BSONObj();
}
Leaving the forced case aside, let's follow the !force path into pickSplitVector.
void Chunk::pickSplitVector( vector<BSONObj>& splitPoints , int chunkSize /* bytes */, int maxPoints, int maxObjs ) const {
// Ask the mongod holding this chunk to figure out the split points.
scoped_ptr<ScopedDbConnection> conn(// send a splitVector command to the shard that owns this chunk
ScopedDbConnection::getInternalScopedDbConnection( getShard().getConnString() ) );
BSONObj result;
BSONObjBuilder cmd;
cmd.append( "splitVector" , _manager->getns() );
cmd.append( "keyPattern" , _manager->getShardKey().key() );
cmd.append( "min" , getMin() );
cmd.append( "max" , getMax() );
cmd.append( "maxChunkSizeBytes" , chunkSize );
cmd.append( "maxSplitPoints" , maxPoints );
cmd.append( "maxChunkObjects" , maxObjs );
BSONObj cmdObj = cmd.obj();
if ( ! conn->get()->runCommand( "admin" , cmdObj , result )) {
conn->done();
ostringstream os;
os << "splitVector command failed: " << result;
uassert( 13345 , os.str() , 0 );
}
BSONObjIterator it( result.getObjectField( "splitKeys" ) );
while ( it.more() ) {
splitPoints.push_back( it.next().Obj().getOwned() );
}
conn->done();
}
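To make the exchange concrete, the command that pickSplitVector sends to the shard's admin database and the reply look roughly like this (namespace, key values and sizes are made up; the field names come straight from the code above):

{ splitVector: "test.users",
  keyPattern: { x: 1 },
  min: { x: MinKey }, max: { x: MaxKey },
  maxChunkSizeBytes: 67108864,   // 64MB desired chunk size
  maxSplitPoints: 2,
  maxChunkObjects: 250000 }

{ splitKeys: [ { x: 105 } ], ok: 1 }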
Next, the splitVector command handler that runs on the shard:
bool run(const string& dbname, BSONObj& jsobj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
// 1.a We'll parse the parameters in two steps. First, make sure the we can use the split index to get
// a good approximation of the size of the chunk -- without needing to access the actual data.
const char* ns = jsobj.getStringField( "splitVector" );
BSONObj keyPattern = jsobj.getObjectField( "keyPattern" );
// If min and max are not provided use the "minKey" and "maxKey" for the sharding key pattern.
BSONObj min = jsobj.getObjectField( "min" );
BSONObj max = jsobj.getObjectField( "max" );
long long maxSplitPoints = 0;
BSONElement maxSplitPointsElem = jsobj[ "maxSplitPoints" ];//like 2
if ( maxSplitPointsElem.isNumber() )
maxSplitPoints = maxSplitPointsElem.numberLong();
long long maxChunkObjects = Chunk::MaxObjectPerChunk;
BSONElement MaxChunkObjectsElem = jsobj[ "maxChunkObjects" ];
if ( MaxChunkObjectsElem.isNumber() )
maxChunkObjects = MaxChunkObjectsElem.numberLong();//like 250000
vector<BSONObj> splitKeys;
{
// Get the size estimate for this namespace
Client::ReadContext ctx( ns );
NamespaceDetails *d = nsdetails( ns );
const IndexDetails *idx = d->findIndexByPrefix( keyPattern ,true ); /* require single key */
// extend min to get (min, MinKey, MinKey, ....)
// Helpers::modifiedRangeBound pads the fields of the bound that were not supplied:
// -1 pads them with MinKey, 1 pads them with MaxKey
min = Helpers::modifiedRangeBound( min , idx->keyPattern() , -1 );
if ( max.isEmpty() ) {
// if max not specified, make it (MaxKey, Maxkey, MaxKey...)
max = Helpers::modifiedRangeBound( max , idx->keyPattern() , 1 );
} else {
// otherwise make it (max,MinKey,MinKey...) so that bound is non-inclusive
max = Helpers::modifiedRangeBound( max , idx->keyPattern() , -1 );
}
const long long recCount = d->stats.nrecords;
const long long dataSize = d->stats.datasize;
// 1.b Now that we have the size estimate, go over the remaining parameters and apply any maximum size
// restrictions specified there.
// 'force'-ing a split is equivalent to having maxChunkSize be the size of the current chunk, i.e., the
// logic below will split that chunk in half
long long maxChunkSize = 0;
bool force = false;
{
BSONElement maxSizeElem = jsobj[ "maxChunkSize" ];
BSONElement forceElem = jsobj[ "force" ];
if ( forceElem.trueValue() ) {
force = true;
maxChunkSize = dataSize;
}
else if ( maxSizeElem.isNumber() )
maxChunkSize = maxSizeElem.numberLong() * 1<<20;
else {
maxSizeElem = jsobj["maxChunkSizeBytes"];
if ( maxSizeElem.isNumber() )
maxChunkSize = maxSizeElem.numberLong();
}
}
// If there's not enough data for more than one chunk, no point continuing.
if ( dataSize < maxChunkSize || recCount == 0 ) {// not enough data to make more than one chunk
vector<BSONObj> emptyVector;
result.append( "splitKeys" , emptyVector );
return true;
}
// We'll use the average object size and number of object to find approximately how many keys
// each chunk should have. We'll split at half the maxChunkSize or maxChunkObjects, if
// provided.
const long long avgRecSize = dataSize / recCount;
long long keyCount = maxChunkSize / (2 * avgRecSize);
if ( maxChunkObjects && ( maxChunkObjects < keyCount ) )
keyCount = maxChunkObjects;
// 2. Traverse the index and add the keyCount-th key to the result vector. If that key
// appeared in the vector before, we omit it. The invariant here is that all the
// instances of a given key value live in the same chunk.
Timer timer;
long long currCount = 0;
long long numChunks = 0;// walk the index from min to max and pick the split points
BtreeCursor * bc = BtreeCursor::make( d , d->idxNo(*idx) , *idx , min , max , false , 1 );
shared_ptr<Cursor> c( bc );
auto_ptr<ClientCursor> cc( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
// Use every 'keyCount'-th key as a split point. We add the initial key as a sentinel, to be removed
// at the end. If a key appears more times than entries allowed on a chunk, we issue a warning and
// split on the following key.
set<BSONObj> tooFrequentKeys;
splitKeys.push_back( bc->prettyKey( c->currKey().getOwned() ).extractFields( keyPattern ) );
while ( 1 ) {
while ( cc->ok() ) {
currCount++;
if ( currCount > keyCount ) {// keyCount is roughly half a chunk's worth of documents
BSONObj currKey = bc->prettyKey( c->currKey() ).extractFields(keyPattern);
// Do not use this split key if it is the same used in the previous split point.
if ( currKey.woCompare( splitKeys.back() ) == 0 )
tooFrequentKeys.insert( currKey.getOwned() );
else {// use this key as a split point
splitKeys.push_back( currKey.getOwned() );
currCount = 0;
numChunks++;
}
}
cc->advance();
// Stop if we have enough split points.
if ( maxSplitPoints && ( numChunks >= maxSplitPoints ) )
break;
if ( ! cc->yieldSometimes( ClientCursor::DontNeed ) ) {// periodically yield the read lock
// we were near the end and got pushed to the end
// i think returning the splits we've already found is fine
// don't use the btree cursor pointer to access keys beyond this point but ok
// to use it for format the keys we've got already
cc.release();
break;
}
}
if ( splitKeys.size() > 1 || ! force )
break;
force = false;
keyCount = currCount / 2;
currCount = 0;
bc = BtreeCursor::make( d , d->idxNo(*idx) , *idx , min , max , false , 1 );
c.reset( bc );
cc.reset( new ClientCursor( QueryOption_NoCursorTimeout , c , ns ) );
}
// Remove the sentinel at the beginning before returning
splitKeys.erase( splitKeys.begin() );
}
result.append( "splitKeys" , splitKeys );
return true;
}
So the split points are found by walking the shard-key index and approximating chunk size from the average document size; measuring precisely by document size would mean loading the actual documents from disk, which would be much slower.
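A quick worked example of the keyCount computation above, with assumed numbers (64MB target chunk size, 512-byte average document):

keyCount = maxChunkSize / (2 * avgRecSize)
         = 67108864 / (2 * 512)
         = 65536        // below maxChunkObjects (250000), so the cap does not kick in

so the cursor records a split point roughly every half chunk's worth of index entries.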
Continuing from singleSplit into multiSplit:
bool Chunk::multiSplit( const vector<BSONObj>& m , BSONObj& res ) const {
const size_t maxSplitPoints = 8192;
scoped_ptr<ScopedDbConnection> conn(
ScopedDbConnection::getInternalScopedDbConnection( getShard().getConnString() ) );
BSONObjBuilder cmd;
cmd.append( "splitChunk" , _manager->getns() );
cmd.append( "keyPattern" , _manager->getShardKey().key() );
cmd.append( "min" , getMin() );
cmd.append( "max" , getMax() );
cmd.append( "from" , getShard().getName() );
cmd.append( "splitKeys" , m );
cmd.append( "shardId" , genID() );
cmd.append( "configdb" , configServer.modelServer() );
BSONObj cmdObj = cmd.obj();// run the splitChunk command on the shard, which performs the actual split
if ( ! conn->get()->runCommand( "admin" , cmdObj , res )) {
conn->done();
// Mark the minor version for *eventual* reload
_manager->markMinorForReload( this->_lastmod );
return false;
}
conn->done();
// force reload of config
_manager->reload();// the chunk layout changed, so force the ChunkManager to reload (reload() defaults to force=true)
return true;
}
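With the same made-up values as before, the splitChunk command that multiSplit sends to the shard looks roughly like this (the fields mirror the appends above; the shardId and configdb values are illustrative):

{ splitChunk: "test.users",
  keyPattern: { x: 1 },
  min: { x: MinKey }, max: { x: MaxKey },
  from: "shard0000",
  splitKeys: [ { x: 105 } ],
  shardId: "test.users-x_MinKey",
  configdb: "cfg1:29000,cfg2:29000,cfg3:29000" }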
The chain is singleSplit->multiSplit->splitChunk. Here is the splitChunk command handler that runs on the shard:
bool run(const string& dbname, BSONObj& cmdObj, int, string& errmsg, BSONObjBuilder& result, bool fromRepl ) {
// 1. check whether parameters passed to splitChunk are sound
const string ns = cmdObj.firstElement().str();
const BSONObj keyPattern = cmdObj["keyPattern"].Obj();
const BSONObj min = cmdObj["min"].Obj();
const BSONObj max = cmdObj["max"].Obj();
const string from = cmdObj["from"].str();
const BSONObj splitKeysElem = cmdObj["splitKeys"].Obj();
vector<BSONObj> splitKeys;
BSONObjIterator it( splitKeysElem );
while ( it.more() )
splitKeys.push_back( it.next().Obj().getOwned() );
const BSONElement shardId = cmdObj["shardId"];
// It is possible that this is the first sharded command this mongod is asked to perform. If so,
// start sharding apparatus. We'd still be missing some more shard-related info but we'll get it
// in step 2. below.
if ( ! shardingState.enabled() ) {
string configdb = cmdObj["configdb"].String();
shardingState.enable( configdb );
configServer.init( configdb );
}
Shard myShard( from );// look up the shard by its name
// 2. lock the collection's metadata and get highest version for the current shard
// Take the distributed lock for this collection. We will not analyze the distributed lock in detail;
// briefly, a dedicated thread keeps pinging the config server to show that this process is alive and
// reachable (a certain amount of clock skew is tolerated). Two collections are used: config.lockpings
// records the pings and config.locks records the locks themselves. A lock has three states: 0 = free,
// 1 = being acquired, 2 = held. If a lock appears to be held, the holder's last ping is checked; if
// the ping is stale the holder is assumed to be down and the lock can be taken over.
DistributedLock lockSetup( ConnectionString( shardingState.getConfigServer() , ConnectionString::SYNC) , ns );
dist_lock_try dlk;
dlk = dist_lock_try( &lockSetup, string("split-") + min.toString() );
// TODO This is a check migrate does to the letter. Factor it out and share. 2010-10-22
ShardChunkVersion maxVersion;
string shard;
ChunkInfo origChunk;
{
scoped_ptr<ScopedDbConnection> conn(ScopedDbConnection::getInternalScopedDbConnection(shardingState.getConfigServer()));
BSONObj x = conn->get()->findOne( ShardNS::chunk,Query( BSON( "ns" << ns ) ).sort( BSON( "lastmod" << -1 ) ) );
maxVersion = ShardChunkVersion::fromBSON( x, "lastmod" );// highest chunk version for this collection
BSONObj currChunk = conn->get()->findOne( ShardNS::chunk , shardId.wrap( "_id" ) ).getOwned();
shard = currChunk["shard"].String();
conn->done();
BSONObj currMin = currChunk["min"].Obj();
BSONObj currMax = currChunk["max"].Obj();
if ( currMin.woCompare( min ) || currMax.woCompare( max ) ) {
errmsg = "chunk boundaries are outdated (likely a split occurred)";
result.append( "currMin" , currMin );
result.append( "currMax" , currMax );
result.append( "requestedMin" , min );
result.append( "requestedMax" , max );
return false;
}
origChunk.min = currMin.getOwned();
origChunk.max = currMax.getOwned();
origChunk.lastmod = ShardChunkVersion::fromBSON( currChunk["lastmod"] );
// since this could be the first call that enable sharding we also make sure to have the chunk manager up to date
shardingState.gotShardName( shard );
ShardChunkVersion shardVersion;// fetch the shard version and make sure the chunk manager is loaded
shardingState.trySetVersion( ns , shardVersion /* will return updated */ );
}
// 3. create the batch of updates to metadata ( the new chunks ) to be applied via 'applyOps' command
BSONObjBuilder logDetail;
origChunk.appendShortVersion( "before" , logDetail );
vector<ChunkInfo> newChunks;
ShardChunkVersion myVersion = maxVersion;
BSONObj startKey = min;
splitKeys.push_back( max ); // makes it easier to have 'max' in the next loop. remove later.
BSONObjBuilder cmdBuilder;
// The metadata change is expressed as a batch of oplog-style update operations against
// config.chunks and sent to the config server via the applyOps command. applyOps applies the
// whole batch in one shot, and only if the preCondition built below still holds, so the
// config.chunks update is atomic and conditional on nobody having bumped the chunk versions
// in the meantime.
BSONArrayBuilder updates( cmdBuilder.subarrayStart( "applyOps" ) );// build the updates to config.chunks
for ( vector<BSONObj>::const_iterator it = splitKeys.begin(); it != splitKeys.end(); ++it ) {
BSONObj endKey = *it;
// splits only update the 'minor' portion of version
myVersion.incMinor();
// build an update operation against the chunks collection of the config database with
// upsert true
BSONObjBuilder op;
op.append( "op" , "u" );
op.appendBool( "b" , true );
op.append( "ns" , ShardNS::chunk );
// add the modified (new) chunk information as the update object
BSONObjBuilder n( op.subobjStart( "o" ) );
n.append( "_id" , Chunk::genID( ns , startKey ) );
myVersion.addToBSON( n, "lastmod" );
n.append( "ns" , ns );
n.append( "min" , startKey );
n.append( "max" , endKey );
n.append( "shard" , shard );
n.done();
// add the chunk's _id as the query part of the update statement
BSONObjBuilder q( op.subobjStart( "o2" ) );
q.append( "_id" , Chunk::genID( ns , startKey ) );
q.done();
updates.append( op.obj() );
// remember this chunk info for logging later
newChunks.push_back( ChunkInfo( startKey , endKey, myVersion ) );
startKey = endKey;
}
updates.done();
{
BSONArrayBuilder preCond( cmdBuilder.subarrayStart( "preCondition" ) );
BSONObjBuilder b;
b.append( "ns" , ShardNS::chunk );
b.append( "q" , BSON( "query" << BSON( "ns" << ns ) << "orderby" << BSON( "lastmod" << -1 ) ) );
{
BSONObjBuilder bb( b.subobjStart( "res" ) );
// TODO: For backwards compatibility, we can't yet require an epoch here
bb.appendTimestamp( "lastmod", maxVersion.toLong() );
bb.done();
}
preCond.append( b.obj() );
preCond.done();
}
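// Putting the pieces together, the command assembled in cmdBuilder has roughly this shape
// (values are illustrative only):
//   { applyOps: [ { op: "u", b: true, ns: "config.chunks",
//                   o:  { _id: "test.users-x_MinKey", lastmod: <new version>, ns: "test.users",
//                         min: { x: MinKey }, max: { x: 105 }, shard: "shard0000" },
//                   o2: { _id: "test.users-x_MinKey" } },
//                 ...one such update per new chunk... ],
//     preCondition: [ { ns: "config.chunks",
//                       q: { query: { ns: "test.users" }, orderby: { lastmod: -1 } },
//                       res: { lastmod: <maxVersion> } } ] }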
// 4. apply the batch of updates to metadata and to the chunk manager
BSONObj cmd = cmdBuilder.obj();
bool ok;// applyOps reuses the replica-set machinery: a secondary normally replays the primary's
BSONObj cmdResult;// oplog to stay in sync; here we hand the config server hand-built oplog-like
{ // entries so that it performs the config.chunks updates for this split
scoped_ptr<ScopedDbConnection> conn(
ScopedDbConnection::getInternalScopedDbConnection(
shardingState.getConfigServer() ) );
ok = conn->get()->runCommand( "config" , cmd , cmdResult );// this is where config.chunks is actually updated
conn->done();
}
// install a chunk manager with knowledge about newly split chunks in this shard's state
splitKeys.pop_back(); // 'max' was used as sentinel
maxVersion.incMinor();// bump the version and install the new chunks in this shard's in-memory sharding state
shardingState.splitChunk( ns , min , max , splitKeys , maxVersion );
// 5. logChanges
// single splits are logged different than multisplits
if ( newChunks.size() == 2 ) {// log the outcome of the split to config.changelog
newChunks[0].appendShortVersion( "left" , logDetail );
newChunks[1].appendShortVersion( "right" , logDetail );
configServer.logChange( "split" , ns , logDetail.obj() );
}
else {
BSONObj beforeDetailObj = logDetail.obj();
BSONObj firstDetailObj = beforeDetailObj.getOwned();
const int newChunksSize = newChunks.size();
for ( int i=0; i < newChunksSize; i++ ) {
BSONObjBuilder chunkDetail;
chunkDetail.appendElements( beforeDetailObj );
chunkDetail.append( "number", i+1 );
chunkDetail.append( "of" , newChunksSize );
newChunks[i].appendShortVersion( "chunk" , chunkDetail );
configServer.logChange( "multi-split" , ns , chunkDetail.obj() );
}
}
if (newChunks.size() == 2){
// If one of the chunks has only one object in it we should move it
static const BSONObj fields = BSON("_id" << 1 );
DBDirectClient conn;
for (int i=1; i >= 0 ; i--){ // high chunk more likely to have only one obj
ChunkInfo chunk = newChunks[i];
Query q = Query().minKey(chunk.min).maxKey(chunk.max);
scoped_ptr<DBClientCursor> c (conn.query(ns, q, /*limit*/-2, 0, &fields));
if (c && c->itcount() == 1) {
result.append("shouldMigrate", BSON("min" << chunk.min << "max" << chunk.max));
break;
}
}
}
return true;
}
That completes the split. In short, splitting just divides one chunk into several chunks as needed; during the split only the metadata on the config server is modified, and the actual data on the shard is neither moved nor changed.
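As an illustration (made-up values again), the only visible effect of the split is on config.chunks; the documents stay on the same shard:

// before the split
{ _id: "test.users-x_MinKey", ns: "test.users", min: { x: MinKey }, max: { x: MaxKey },
  shard: "shard0000", lastmod: Timestamp(1, 0) }
// after the split: same shard, same data, only metadata and chunk versions changed
{ _id: "test.users-x_MinKey", ns: "test.users", min: { x: MinKey }, max: { x: 105 },
  shard: "shard0000", lastmod: Timestamp(1, 1) }
{ _id: "test.users-x_105",    ns: "test.users", min: { x: 105 },    max: { x: MaxKey },
  shard: "shard0000", lastmod: Timestamp(1, 2) }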