The previous article analyzed replset initialization. Here we continue with the replset synchronization part, which involves two threads and one function:
producerThread: on a non-primary server, picks a sync target and reads the oplog from it.
startSyncThread: consumes the oplog entries fetched by producerThread and replays them (a minimal sketch of this hand-off follows the list below).
msgCheckNewState function: responsible for switching each server's state, secondary <-> primary.
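Before diving into the real code, here is a minimal standalone sketch of the producer/consumer hand-off between the first two threads: the producer pushes fetched oplog entries into a blocking queue and the sync thread pops and applies them. This is not MongoDB's BlockingQueue or BackgroundSync implementation; all names here are illustrative only.

// Minimal sketch of the producer/consumer hand-off described above (illustrative names).
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <string>
#include <thread>

class BlockingQueue {
public:
    void push(const std::string& op) {
        std::unique_lock<std::mutex> lk(_m);
        _q.push(op);
        _cv.notify_one();                       // wake a waiting consumer
    }
    std::string pop() {
        std::unique_lock<std::mutex> lk(_m);
        _cv.wait(lk, [this] { return !_q.empty(); });
        std::string op = _q.front();
        _q.pop();
        return op;
    }
private:
    std::mutex _m;
    std::condition_variable _cv;
    std::queue<std::string> _q;
};

int main() {
    BlockingQueue buffer;                       // plays the role of the producer's buffer
    // producer: pretends to tail a sync target's oplog
    std::thread producer([&] {
        for (int i = 0; i < 5; ++i)
            buffer.push("op#" + std::to_string(i));
    });
    // consumer: plays the role of the sync thread replaying ops
    std::thread consumer([&] {
        for (int i = 0; i < 5; ++i)
            std::cout << "apply " << buffer.pop() << std::endl;
    });
    producer.join();
    consumer.join();
    return 0;
}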
Let's first look at producerThread. It loops, calling _producerThread to fetch oplog entries, so we go straight to _producerThread:
void BackgroundSync::_producerThread() {
    MemberState state = theReplSet->state();
    // we want to pause when the state changes to primary
    if (state.primary()) { // a primary does not need to sync from others; pause
        if (!_pause) {
            stop();
        }
        sleepsecs(1);
        return;
    }
    if (state.fatal() || state.startup()) {
        sleepsecs(5);
        return;
    }
    // if this member has an empty oplog, we cannot start syncing
    if (theReplSet->lastOpTimeWritten.isNull()) {
        sleepsecs(1);
        return;
    }
    // we want to unpause when we're no longer primary
    // start() also loads _lastOpTimeFetched, which we know is set from the "if"
    else if (_pause) { // _pause is true at initialization; start here and record the time of the last
                       // operation, which for a fresh init is the time the config was written to the oplog
        start();
    }
    produce();
}
_producerThread->produce: internally it picks a sync target, reads oplog entries from it, and pushes them into a blocking queue for the sync thread to consume.
void BackgroundSync::produce() {
    // this oplog reader does not do a handshake because we don't want the server it's syncing
    // from to track how far it has synced
    OplogReader r(false /* doHandshake */);
    // find a target to sync from the last op time written
    getOplogReader(r); // pick a sync target
    // no server found
    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        r.tailingQueryGTE(rsoplog, _lastOpTimeFetched); // query for oplog entries newer than the last one fetched
    }
    if (isRollbackRequired(r)) { // rollback: undo the operations that need to be rolled back
        stop();                  // not analyzed further here; read it yourself if you are interested
        return;
    }
    while (!inShutdown()) {
        while (!inShutdown()) {
            if (!r.moreInCurrentBatch()) { // no more data
                if (theReplSet->gotForceSync()) { // a forced sync target set via rs.syncFrom exists
                    return;
                }
                if (theReplSet->isPrimary()) { // the primary does not need to sync
                    return;
                }
                {
                    boost::unique_lock<boost::mutex> lock(_mutex);
                    if (!_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                        return;
                    }
                }
                r.more();
            }
            if (!r.more())
                break;
            BSONObj o = r.nextSafe().getOwned();
            Timer timer;
            _buffer.push(o); // the fetched entry goes into _buffer for the sync thread to consume
            {
                boost::unique_lock<boost::mutex> lock(_mutex);
                // update counters
                _queueCounter.waitTime += timer.millis();
                _queueCounter.numElems++;
                _lastH = o["h"].numberLong();
                _lastOpTimeFetched = o["ts"]._opTime(); // record the timestamp of the newest fetched op
            }
        } // end while
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            if (_pause || !_currentSyncTarget || !_currentSyncTarget->hbinfo().hbstate.readable()) {
                return;
            }
        }
        r.tailCheck();
        if( !r.haveCursor() ) {
            return;
        }
        // looping back is ok because this is a tailable cursor
    }
}
_producerThread->produce->getOplogReader
void BackgroundSync::getOplogReader(OplogReader& r) {
    Member *target = NULL, *stale = NULL;
    BSONObj oldest;
    // then we're initial syncing and we're still waiting for this to be set
    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        if (_lastOpTimeFetched.isNull()) {
            _currentSyncTarget = NULL;
            return;
        }
    }
    // Pick a member whose oplog is ahead of ours and that has been reachable recently.
    // If there are several candidates, the one with the lowest ping time wins; if a forced
    // sync target was specified, it is chosen directly. Once chosen, the target becomes
    // _currentSyncTarget and stays unchanged unless it becomes unreachable, its oplog goes
    // stale, or stop() is called.
    while ((target = theReplSet->getMemberToSyncTo()) != NULL) {
        string current = target->fullName();
        if (!r.connect(current)) { // cannot connect; do not consider it as a sync source for a while
            r.resetConnection();
            theReplSet->veto(current);
            continue;
        }
        if (isStale(r, oldest)) { // our newest local op is older than the sync source's oldest op
            r.resetConnection();
            theReplSet->veto(current, 600);
            stale = target;
            continue;
        }
        // if we made it here, the target is up and not stale
        {
            boost::unique_lock<boost::mutex> lock(_mutex);
            _currentSyncTarget = target;
        }
        return;
    }
    // the only viable sync target was stale
    if (stale) {
        theReplSet->goStale(stale, oldest);
        sleepsecs(120);
    }
    {
        boost::unique_lock<boost::mutex> lock(_mutex);
        _currentSyncTarget = NULL;
    }
}
_producerThread->produce->getOplogReader->getMemberToSyncTo
Member* ReplSetImpl::getMemberToSyncTo() {
    lock lk(this);
    bool buildIndexes = true;
    // if we have a target we've requested to sync from, use it
    if (_forceSyncTarget) { // a forced sync target is chosen directly
        Member* target = _forceSyncTarget;
        _forceSyncTarget = 0;
        return target;
    }
    // wait for 2N pings before choosing a sync target
    if (_cfg) { // require at least twice the number of members in pings
        int needMorePings = config().members.size()*2 - HeartbeatInfo::numPings;
        if (needMorePings > 0) {
            return NULL;
        }
        buildIndexes = myConfig().buildIndexes;
    }
    // find the member with the lowest ping time that has more data than me
    // Find primary's oplog time. Reject sync candidates that are more than
    // MAX_SLACK_TIME seconds behind.
    OpTime primaryOpTime;
    static const unsigned maxSlackDurationSeconds = 10 * 60; // 10 minutes
    const Member* primary = box.getPrimary();
    if (primary)
        primaryOpTime = primary->hbinfo().opTime;
    else
        // choose a time that will exclude no candidates, since we don't see a primary
        primaryOpTime = OpTime(maxSlackDurationSeconds, 0);
    if ( primaryOpTime.getSecs() < maxSlackDurationSeconds ) {
        // erh - I think this means there was just a new election
        // and we don't yet know the new primary's optime
        primaryOpTime = OpTime(maxSlackDurationSeconds, 0);
    }
    OpTime oldestSyncOpTime(primaryOpTime.getSecs() - maxSlackDurationSeconds, 0);
    Member *closest = 0;
    time_t now = 0;
    // Make two attempts. The first attempt, we ignore those nodes with
    // slave delay higher than our own. The second attempt includes such
    // nodes, in case those are the only ones we can reach.
    // This loop attempts to set 'closest'.
    for (int attempts = 0; attempts < 2; ++attempts) {
        for (Member *m = _members.head(); m; m = m->next()) {
            if (!m->hbinfo().up())
                continue;
            // make sure members with buildIndexes sync from other members w/indexes
            if (buildIndexes && !m->config().buildIndexes) // a member that builds indexes may only sync from members that build indexes
                continue;
            if (!m->state().readable()) // only primary or secondary states are readable
                continue;
            if (m->state() == MemberState::RS_SECONDARY) {
                // only consider secondaries that are ahead of where we are
                if (m->hbinfo().opTime <= lastOpTimeWritten) // the remote oplog timestamp is older than ours; skip it
                    continue;
                // omit secondaries that are excessively behind, on the first attempt at least.
                if (attempts == 0 && m->hbinfo().opTime < oldestSyncOpTime)
                    continue;
            }
            // omit nodes that are more latent than anything we've already considered
            if (closest && // a candidate was already found; keep the one with the lower ping time
                (m->hbinfo().ping > closest->hbinfo().ping))
                continue;
            if ( attempts == 0 && myConfig().slaveDelay < m->config().slaveDelay ) {
                continue; // skip this one in the first attempt
            }
            // This member was chosen as a sync source before but could not be reached.
            // We must wait until vetoed->second before it can be a candidate again;
            // the veto duration is configurable and defaults to 10s.
            map<string,time_t>::iterator vetoed = _veto.find(m->fullName());
            if (vetoed != _veto.end()) {
                // Do some veto housekeeping
                if (now == 0) {
                    now = time(0);
                }
                // if this was on the veto list, check if it was vetoed in the last "while".
                // if it was, skip.
                if (vetoed->second >= now) {
                    continue;
                }
                _veto.erase(vetoed);
                // fall through, this is a valid candidate now
            }
            // This candidate has passed all tests; set 'closest'
            closest = m;
        }
        if (closest) break; // no need for second attempt
    }
    if (!closest) {
        return NULL;
    }
    return closest;
}
This concludes the analysis of the producer thread. A small illustrative sketch of the target-selection rule follows; after that we continue with startSyncThread, whose entry point is ReplSetImpl::syncThread.
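To make the selection rule above concrete, here is a tiny self-contained sketch of its core idea: among reachable members whose oplog is ahead of ours and that are not currently vetoed, pick the one with the lowest ping time. The struct and function here are hypothetical stand-ins, not MongoDB's types, and the sketch ignores the slave-delay and buildIndexes refinements.

// Illustrative sketch of the sync-target selection rule (hypothetical types, not MongoDB's).
#include <ctime>
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Candidate {
    std::string name;
    long long opTimeSecs; // last oplog timestamp (seconds)
    int pingMillis;       // heartbeat round-trip time
    bool up;              // reachable according to heartbeats
};

const Candidate* chooseSyncTarget(const std::vector<Candidate>& members,
                                  long long myOpTimeSecs,
                                  const std::map<std::string, time_t>& veto) {
    const Candidate* closest = 0;
    time_t now = time(0);
    for (size_t i = 0; i < members.size(); ++i) {
        const Candidate& m = members[i];
        if (!m.up) continue;                                // unreachable
        if (m.opTimeSecs <= myOpTimeSecs) continue;         // not ahead of us
        std::map<std::string, time_t>::const_iterator v = veto.find(m.name);
        if (v != veto.end() && v->second >= now) continue;  // still vetoed
        if (!closest || m.pingMillis < closest->pingMillis)
            closest = &m;                                   // lowest ping wins
    }
    return closest;                                         // NULL if nothing qualifies
}

int main() {
    std::vector<Candidate> members;
    members.push_back({"a:27017", 200, 5, true});
    members.push_back({"b:27017", 300, 2, true});
    std::map<std::string, time_t> veto;
    const Candidate* t = chooseSyncTarget(members, 100, veto);
    std::cout << (t ? t->name : std::string("none")) << std::endl; // "b:27017": ahead of us, lower ping
    return 0;
}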
void ReplSetImpl::syncThread() {
    while( 1 ) {
        // After a reconfig, we may not be in the replica set anymore, so
        // check that we are in the set (and not an arbiter) before
        // trying to sync with other replicas.
        if( ! _self ) {
            sleepsecs(20);
            continue;
        }
        if( myConfig().arbiterOnly ) { // an arbiter does not need to sync
            return;
        }
        try {
            _syncThread(); // the actual sync work
        }
        catch(DBException& e) {
            sethbmsg(str::stream() << "syncThread: " << e.toString());
            sleepsecs(10);
        }
        catch(...) {
            sethbmsg("unexpected exception in syncThread()");
            sleepsecs(60);
        }
        sleepsecs(1);
    }
}
void ReplSetImpl::_syncThread() {
    StateBox::SP sp = box.get();
    if( sp.state.primary() ) { // a primary does not need to sync
        sleepsecs(1);
        return;
    }
    // everything below runs only on a secondary
    if( _blockSync || sp.state.fatal() || sp.state.startup() ) {
        sleepsecs(5);
        return;
    }
    /* do we have anything at all? */
    if( lastOpTimeWritten.isNull() ) { // no sync timestamp yet: do an initial sync, cloning the databases from the remote
        syncDoInitialSync();
        return; // _syncThread will be recalled, starts from top again in case sync failed.
    }
    /* we have some data. continue tailing. */
    replset::SyncTail tail(replset::BackgroundSync::get()); // BackgroundSync is a global singleton holding the buffer of oplog entries fetched by the producer thread
    tail.oplogApplication(); // read oplog entries and replay them
}
_syncThread->syncDoInitialSync
void ReplSetImpl::syncDoInitialSync() {
    const static int maxFailedAttempts = 10;
    createOplog(); // creates local.oplog.rs if it does not exist yet
    int failedAttempts = 0;
    while ( failedAttempts < maxFailedAttempts ) {
        try {
            _syncDoInitialSync();
            break;
        }
        catch(DBException& e) {
            failedAttempts++;
            sleepsecs(30);
        }
    }
}
_syncThread->syncDoInitialSync->_syncDoInitialSync
void ReplSetImpl::_syncDoInitialSync() {
    replset::InitialSync init(replset::BackgroundSync::get());
    // if this is the first node, it may have already become primary
    const Member *source = getMemberToSyncTo();
    string sourceHostname = source->h().toString();
    init.setHostname(sourceHostname);
    OplogReader r;
    BSONObj lastOp = r.getLastOp(rsoplog);
    if (replSettings.fastsync) { // fastsync applies only the last op; this option has restrictions and,
        log() << "fastsync: skipping database clone" << rsLog; // used carelessly, some data may never be synced
        // prime oplog
        init.oplogApplication(lastOp, lastOp);
        return;
    }
    else {
        dropAllDatabasesExceptLocal();
        list<string> dbs = r.conn()->getDatabaseNames();
        // clone every database except local from the source; this is a blocking copy of all databases
        if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, true ) ) {
            veto(source->fullName(), 600); // veto this server: do not try to sync from it for 600s
            sleepsecs(300);
            return;
        }
        BSONObj minValid; // apply the oplog from lastOp through minValid
        if ( ! _syncDoInitialSync_applyToHead( init, &r , source , lastOp , minValid ) ) {
            return;
        }
        lastOp = minValid;
        // its currently important that lastOp is equal to the last op we actually pulled
        // this is because the background thread only pulls each op once now
        // so if its now, we'll be waiting forever
        {
            // this takes whatever the last op the we got is
            // and stores it locally before we wipe it out below
            Lock::DBRead lk(rsoplog);
            Helpers::getLast(rsoplog, lastOp);
            lastOp = lastOp.getOwned();
        }
        // reset state, as that "didn't count"
        emptyOplog();
        lastOpTimeWritten = OpTime();
        lastH = 0;
        if ( ! _syncDoInitialSync_clone( sourceHostname.c_str(), dbs, false ) ) { // clone the indexes
            veto(source->fullName(), 600);
            sleepsecs(300);
            return;
        }
    }
    BSONObj minValid;
    if ( ! _syncDoInitialSync_applyToHead( init, &r, source, lastOp, minValid ) ) { // apply the oplog once more
        return;
    }
    {
        Client::WriteContext cx( "local." );
        cx.ctx().db()->flushFiles(true);
        Helpers::putSingleton("local.replset.minvalid", minValid);
        cx.ctx().db()->flushFiles(true);
    }
    changeState(MemberState::RS_RECOVERING);
}
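The function above is long, so here is a condensed sketch of the phases it goes through, with each phase reduced to a stub. This is purely illustrative; the stub names stand in for the real helpers (_syncDoInitialSync_clone, _syncDoInitialSync_applyToHead, and so on) and do no real work.

// Condensed, illustrative outline of the initial-sync phases (stub functions, not MongoDB's API).
#include <iostream>

void dropAllDatabasesExceptLocal() { std::cout << "drop local copies" << std::endl; }
bool cloneData(bool dataPass)      { std::cout << (dataPass ? "clone data" : "clone indexes") << std::endl; return true; }
bool applyOplogToHead()            { std::cout << "apply oplog produced during the clone" << std::endl; return true; }
void storeMinValid()               { std::cout << "persist local.replset.minvalid" << std::endl; }

bool initialSync() {
    dropAllDatabasesExceptLocal();         // 1. start from a clean slate
    if (!cloneData(true)) return false;    // 2. copy every database except local
    if (!applyOplogToHead()) return false; // 3. catch up on ops written while cloning
    if (!cloneData(false)) return false;   // 4. build the indexes
    if (!applyOplogToHead()) return false; // 5. catch up once more
    storeMinValid();                       // 6. remember how far we must replay before going live
    return true;                           // the node then moves to RS_RECOVERING
}

int main() { return initialSync() ? 0 : 1; }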
_syncThread->syncDoInitialSync->_syncDoInitialSync->_syncDoInitialSync_applyToHead
bool ReplSetImpl::_syncDoInitialSync_applyToHead( replset::InitialSync& init, OplogReader* r,
                                                  const Member* source, const BSONObj& lastOp,
                                                  BSONObj& minValid ) {
    /* our cloned copy will be strange until we apply oplog events that occurred
       through the process. we note that time point here. */
    try {
        // It may have been a long time since we last used this connection to
        // query the oplog, depending on the size of the databases we needed to clone.
        // A common problem is that TCP keepalives are set too infrequent, and thus
        // our connection here is terminated by a firewall due to inactivity.
        // Solution is to increase the TCP keepalive frequency.
        minValid = r->getLastOp(rsoplog);
    } catch ( SocketException & ) {
        if( !r->connect(source->h().toString()) ) {
            throw;
        }
        // retry
        minValid = r->getLastOp(rsoplog);
    }
    OpTime mvoptime = minValid["ts"]._opTime();
    OpTime startingTS = lastOp["ts"]._opTime();
    // apply startingTS..mvoptime portion of the oplog
    init.oplogApplication(lastOp, minValid); // replay the oplog
    return true;
}
_syncThread->syncDoInitialSync->_syncDoInitialSync_applyToHead->InitialSync::oplogApplication
void InitialSync::oplogApplication(const BSONObj& applyGTEObj, const BSONObj& minValidObj) {
    OpTime applyGTE = applyGTEObj["ts"]._opTime();
    OpTime minValid = minValidObj["ts"]._opTime();
    syncApply(applyGTEObj);   // apply the first oplog entry
    _logOpObjRS(applyGTEObj); // record it in the local local.oplog.rs and update lastOpTimeWritten
    // if there were no writes during the initial sync, there will be nothing in the queue so
    // just go live
    if (minValid == applyGTE) {
        return;
    }
    OpTime ts;
    time_t start = time(0);
    unsigned long long n = 0, lastN = 0;
    while( ts < minValid ) { // more oplog entries were produced while the databases or indexes were being cloned
        OpQueue ops;
        while (ops.getSize() < replBatchSizeBytes) { // wait for more data
            if (tryPopAndWaitForMore(&ops)) {
                break;
            }
        }
        multiApply(ops.getDeque(), multiInitialSyncApply); // thread pool: each worker calls multiInitialSyncApply to replay its share of the oplog
        n += ops.getDeque().size();
        if ( n > lastN + 1000 ) {
            time_t now = time(0);
            if (now - start > 10) {
                // simple progress metering
                log() << "replSet initialSyncOplogApplication applied " << n << " operations, synced to "
                      << ts.toStringPretty() << rsLog;
                start = now;
                lastN = n;
            }
        }
        // we want to keep a record of the last op applied, to compare with minvalid
        const BSONObj& lastOp = ops.getDeque().back();
        OpTime tempTs = lastOp["ts"]._opTime();
        applyOpsToOplog(&ops.getDeque()); // record the batch in the local local.oplog.rs
        ts = tempTs;
    }
}
_syncThread->syncDoInitialSync->_syncDoInitialSync_applyToHead->InitialSync::oplogApplication->multiApply
void SyncTail::multiApply( std::deque<BSONObj>& ops, MultiSyncApplyFunc applyFunc ) {
    // Use a ThreadPool to prefetch all the operations in a batch.
    prefetchOps(ops); // prefetch indexes and data into memory on multiple threads
    std::vector< std::vector<BSONObj> > writerVectors(theReplSet->replWriterThreadCount);
    fillWriterVectors(ops, &writerVectors); // distribute the ops across writers, one thread per writer
    // We must grab this because we're going to grab write locks later.
    // We hold this mutex the entire time we're writing; it doesn't matter
    // because all readers are blocked anyway.
    SimpleMutex::scoped_lock fsynclk(filesLockedFsync);
    // stop all readers until we're done
    Lock::ParallelBatchWriterMode pbwm;
    applyOps(writerVectors, applyFunc); // each thread calls applyFunc to apply its share of the ops
}
_syncThread->syncDoInitialSync->_syncDoInitialSync_applyToHead->InitialSync::oplogApplication->multiInitialSyncApply
void multiInitialSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) {
    initializeWriterThread();
    for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) {
        if (!st->syncApply(*it)) { // the actual apply
            bool status;
            {
                Lock::GlobalWrite lk;
                status = st->shouldRetry(*it); // the target document does not exist, probably because we are
                                               // still in initial sync; fetch it from the sync source, insert
                                               // it locally, then retry the op
            } // why can a document be missing even though the databases were cloned? see the Cloner comment below
            if (status) {
                // retry
                fassert(15915, st->syncApply(*it));
            }
            // If shouldRetry() returns false, fall through.
            // This can happen if the document that was moved and missed by Cloner
            // subsequently got deleted and no longer exists on the Sync Target at all
        }
    }
}
_syncThread->syncDoInitialSync->_syncDoInitialSync_applyToHead->InitialSync::oplogApplication->multiInitialSyncApply->syncApply
bool SyncTail::syncApply(const BSONObj &op, bool convertUpdateToUpsert/*=false*/) {
    const char *ns = op.getStringField("ns");
    bool isCommand(op["op"].valuestrsafe()[0] == 'c');
    boost::scoped_ptr<Lock::ScopedLock> lk;
    if(isCommand) {
        // a command may need a global write lock. so we will conservatively go
        // ahead and grab one here. suboptimal. :-(
        lk.reset(new Lock::GlobalWrite());
    }
    else {
        // DB level lock for this operation
        lk.reset(new Lock::DBWrite(ns));
    }
    Client::Context ctx(ns, dbpath, false);
    ctx.getClient()->curop()->reset();
    // For non-initial-sync, we convert updates to upserts
    // to suppress errors when replaying oplog entries.
    bool ok = !applyOperation_inlock(op, true, convertUpdateToUpsert); // replay the oplog entry; this was analyzed in the master/slave article, so it is not repeated here
    getDur().commitIfNeeded();
    return ok;
}
Back in _syncThread, the non-initial-sync path runs the code below. Note that this oplogApplication is not the same as the InitialSync::oplogApplication analyzed above.
    /* we have some data. continue tailing. */
    replset::SyncTail tail(replset::BackgroundSync::get());
    tail.oplogApplication();
void SyncTail::oplogApplication() {
    while( 1 ) {
        OpQueue ops;
        time_t lastTimeChecked = time(0);
        // always fetch a few ops first
        // tryPopAndWaitForMore returns true when we need to end a batch early
        while (!tryPopAndWaitForMore(&ops) && (ops.getSize() < replBatchSizeBytes)) {
            time_t now = time(0);
            // occasionally check some things
            if (ops.empty() || now > lastTimeChecked) {
                lastTimeChecked = now;
                // can we become secondary?
                // we have to check this before calling mgr, as we must be a secondary to
                // become primary
                if (!theReplSet->isSecondary()) { // during initialization this is where the node first becomes a secondary
                    OpTime minvalid;
                    theReplSet->tryToGoLiveAsASecondary(minvalid); // after becoming secondary, the new state reaches the other members via the heartbeat protocol
                }
                // normally msgCheckNewState gets called periodically, but in a single node repl set
                // there are no heartbeat threads, so we do it here to be sure. this is relevant if the
                // singleton member has done a stepDown() and needs to come back up.
                if (theReplSet->config().members.size() == 1 && // a single-node set must trigger the state change itself
                    theReplSet->myConfig().potentiallyHot()) {
                    Manager* mgr = theReplSet->mgr;
                    // When would mgr be null? During replsettest'ing.
                    if (mgr) mgr->send(boost::bind(&Manager::msgCheckNewState, theReplSet->mgr));
                    sleepsecs(1);
                    return;
                }
            }
        }
        const BSONObj& lastOp = ops.getDeque().back();
        handleSlaveDelay(lastOp);
        // Set minValid to the last op to be applied in this next batch.
        // This will cause this node to go into RECOVERING state
        // if we should crash and restart before updating the oplog
        {
            Client::WriteContext cx( "local" );
            Helpers::putSingleton("local.replset.minvalid", lastOp);
        }
        multiApply(ops.getDeque(), multiSyncApply); // the actual oplog replay
        applyOpsToOplog(&ops.getDeque());
    }
}
_syncThread->SyncTail::oplogApplication->multiSyncApply
void multiSyncApply(const std::vector<BSONObj>& ops, SyncTail* st) {
    initializeWriterThread();
    for (std::vector<BSONObj>::const_iterator it = ops.begin(); it != ops.end(); ++it) {
        try {
            fassert(16359, st->syncApply(*it, true)); // the actual apply, analyzed above
        } catch (DBException& e) {
            error() << "writer worker caught exception: " << e.what() << " on: " << it->toString() << endl;
            fassertFailed(16360);
        }
    }
}
This concludes the analysis of the sync threads: at their core they read oplog entries and replay them on multiple worker threads.
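As a minimal illustration of that batch replay pattern, the sketch below buckets ops by namespace so that ops on the same collection stay on the same writer (preserving their relative order), runs each writer on its own thread, and only moves on once every worker has joined. It uses plain std::thread instead of MongoDB's ThreadPool, and all names here are made up for the example.

// Illustrative sketch of batched, multi-threaded oplog replay (hypothetical names).
#include <functional>
#include <iostream>
#include <string>
#include <thread>
#include <vector>

struct Op { std::string ns; std::string doc; };

void multiApplySketch(const std::vector<Op>& batch, size_t writerThreads) {
    // distribute: same namespace -> same writer, so per-collection order is kept
    std::vector<std::vector<Op> > writerVectors(writerThreads);
    std::hash<std::string> hasher;
    for (const Op& op : batch)
        writerVectors[hasher(op.ns) % writerThreads].push_back(op);

    // apply: one thread per writer vector, wait for all before the next batch
    std::vector<std::thread> workers;
    for (size_t i = 0; i < writerVectors.size(); ++i) {
        workers.push_back(std::thread([&writerVectors, i] {
            for (const Op& op : writerVectors[i])
                std::cout << "writer " << i << " applies " << op.ns << " " << op.doc << std::endl;
        }));
    }
    for (std::thread& t : workers)
        t.join(); // the batch counts as applied only when every worker is done
}

int main() {
    std::vector<Op> batch = { {"test.foo", "{a:1}"}, {"test.bar", "{b:1}"}, {"test.foo", "{a:2}"} };
    multiApplySketch(batch, 2);
    return 0;
}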
Finally, let's look at Manager::msgCheckNewState, the function responsible for establishing the primary.
void Manager::msgCheckNewState() {
    {
        RSBase::lock lk(rs);
        if( busyWithElectSelf ) return;
        checkElectableSet(); // add or remove ourselves from the set of primary candidates; the highest-priority member may take over as primary
        checkAuth();         // check for unreachable servers and servers with authentication problems
        const Member *p = rs->box.getPrimary();
        if( p && p != rs->_self ) {
            if( !p->hbinfo().up() || !p->hbinfo().hbstate.primary() ) { // the primary is unreachable; clear it
                p = 0;
                rs->box.setOtherPrimary(0);
            }
        }
        const Member *p2; // is there another node that considers itself primary?
        bool two;
        p2 = findOtherPrimary(two);
        if( two ) {
            /* two other nodes think they are primary (asynchronously polled) -- wait for things to settle down. */
            return;
        }
        if( p2 ) { // record it as the primary
            noteARemoteIsPrimary(p2);
            return;
        }
        if( p ) {
            /* we are already primary */
            if( p != rs->_self ) {
                return;
            }
            if( rs->elect.shouldRelinquish() ) { // the reachable servers no longer hold a majority of the votes, so step down
                rs->relinquish();
            }
            return;
        }
        if( !rs->iAmPotentiallyHot() ) { // if not we never try to be primary
            return;
        }
        /* no one seems to be primary. shall we try to elect ourself? */
        if( !rs->elect.aMajoritySeemsToBeUp() ) { // we cannot see a majority of the votes; give up for now
            static time_t last;
            static int n;
            int ll = 0;
            if( ++n > 5 ) ll++;
            if( last + 60 > time(0) ) ll++;
            log(ll) << "replSet can't see a majority, will not try to elect self" << rsLog;
            last = time(0);
            return;
        }
        if( !rs->iAmElectable() ) {
            return;
        }
        busyWithElectSelf = true; // don't try to do further elections & such while we are already working on one.
    }
    try {
        rs->elect.electSelf(); // try to elect ourselves primary
    }
    catch(RetryAfterSleepException&) { // requeue this task, i.e. run the function again later
        /* we want to process new inbounds before trying this again. so we just put a checkNewstate in the queue for eval later. */
        requeue();
    }
    busyWithElectSelf = false;
}
Manager::msgCheckNewState->checkElectableSet
void Manager::checkElectableSet() {
    unsigned otherOp = rs->lastOtherOpTime().getSecs(); // the most recent op time among the other servers
    // make sure the electable set is up-to-date
    if (rs->elect.aMajoritySeemsToBeUp() && // we are within 10s of the most recent op time seen elsewhere
        rs->iAmPotentiallyHot() &&
        (otherOp == 0 || rs->lastOpTimeWritten.getSecs() >= otherOp - 10)) {
        theReplSet->addToElectable(rs->selfId()); // add ourselves to the set of primary candidates
    }
    else {
        theReplSet->rmFromElectable(rs->selfId());
    }
    // check if we should ask the primary (possibly ourselves) to step down
    const Member *highestPriority = theReplSet->getMostElectable(); // the member with the highest priority
    const Member *primary = rs->box.getPrimary();
    if (primary && highestPriority &&
        highestPriority->config().priority > primary->config().priority &&
        // if we're stepping down to allow another member to become primary, we
        // better have another member (otherOp), and it should be up-to-date
        otherOp != 0 && highestPriority->hbinfo().opTime.getSecs() >= otherOp - 10) {
        if (primary->h().isSelf()) { // we are not the highest-priority member; give up the primary role
            // replSetStepDown tries to acquire the same lock
            // msgCheckNewState takes, so we can't call replSetStepDown on
            // ourselves.
            rs->relinquish();
        }
        else {
            BSONObj cmd = BSON( "replSetStepDown" << 1 );
            ScopedConn conn(primary->fullName());
            BSONObj result;
            try { // ask the remote primary to step down
                if (!conn.runCommand( "admin", cmd, result, 0,
                                      &AuthenticationTable::getInternalSecurityAuthenticationTable())) {
                    // (error handling omitted in this excerpt)
                }
            }
            catch (DBException&) {
                // (error handling omitted in this excerpt)
            }
        }
    }
}
Back in msgCheckNewState, the server tries to elect itself primary by calling Consensus::electSelf, which in turn calls Consensus::_electSelf.
void Consensus::_electSelf() {
    bool allUp;
    int nTies;
    if( !weAreFreshest(allUp, nTies) ) { // are we the freshest? compared via lastOpTimeWritten
        return;
    }
    if( !allUp && time(0) - started < 60 * 5 ) { // not every member that looked healthy over heartbeats replied with its time; some may have crashed
        /* the idea here is that if a bunch of nodes bounce all at once, we don't want to drop data
           if we don't have to -- we'd rather be offline and wait a little longer instead
           todo: make this configurable.
        */
        return;
    }
    Member& me = *rs._self;
    if( nTies ) { // several servers share the same last write time; sleep a random amount to avoid colliding votes
        /* tie? we then randomly sleep to try to not collide on our voting. */
        /* todo: smarter. */
        if( me.id() == 0 || sleptLast ) {
            // would be fine for one node not to sleep
            // todo: biggest / highest priority nodes should be the ones that get to not sleep
        }
        else {
            unsigned ms = ((unsigned) rand()) % 1000 + 50;
            sleptLast = true;
            sleepmillis(ms);
            throw RetryAfterSleepException();
        }
    }
    sleptLast = false;
    time_t start = time(0);
    unsigned meid = me.id();
    int tally = yea( meid );
    bool success = false;
    try { // nominate ourselves as primary via the replSetElect command
        BSONObj electCmd = BSON(
            "replSetElect" << 1 <<
            "set" << rs.name() <<
            "who" << me.fullName() <<
            "whoid" << me.hbinfo().id() <<
            "cfgver" << rs._cfg->version <<
            "round" << OID::gen() /* this is just for diagnostics */
        );
        int configVersion;
        list<Target> L;
        rs.getTargets(L, configVersion);
        multiCommand(electCmd, L); // run the command against each target on separate threads
        {
            for( list<Target>::iterator i = L.begin(); i != L.end(); i++ ) {
                DEV log() << "replSet elect res: " << i->result.toString() << rsLog;
                if( i->ok ) {
                    int v = i->result["vote"].Int();
                    tally += v;
                }
            }
            if( tally*2 <= totalVotes() ) { // did not win more than half of the votes
                log() << "replSet couldn't elect self, only received " << tally << " votes" << rsLog;
            }
            else if( time(0) - start > 30 ) { // the election took too long
                // defensive; should never happen as we have timeouts on connection and operation for our conn
                log() << "replSet too much time passed during our election, ignoring result" << rsLog;
            }
            else if( configVersion != rs.config().version ) { // the config version changed underneath us
                log() << "replSet config version changed during our election, ignoring result" << rsLog;
            }
            else {
                /* succeeded. */
                log(1) << "replSet election succeeded, assuming primary role" << rsLog;
                success = true;
                rs.assumePrimary(); // make ourselves primary
            }
        }
    }
    catch( std::exception& ) {
        if( !success ) electionFailed(meid);
        throw;
    }
    if( !success ) electionFailed(meid);
}
Manager::msgCheckNewState->Consensus::electSelf->Consensus::_electSelf->CmdReplSetElect::run->Consensus::electCmdReceived
void Consensus::electCmdReceived(BSONObj cmd, BSONObjBuilder* _b) {
    BSONObjBuilder& b = *_b;
    string set = cmd["set"].String();
    unsigned whoid = cmd["whoid"].Int();
    int cfgver = cmd["cfgver"].Int();
    OID round = cmd["round"].OID();
    int myver = rs.config().version;
    const Member* primary = rs.box.getPrimary();
    const Member* hopeful = rs.findById(whoid);
    const Member* highestPriority = rs.getMostElectable();
    int vote = 0;
    // (some preliminary checks from the original source are elided in this excerpt)
    if( myver > cfgver ) { // our config is newer than the requester's; vote -10000 to make sure it cannot become primary
        vote = -10000;
    }
    else if( !hopeful ) { // we cannot find the requesting member at all
        vote = -10000;
    }
    else if( primary && primary == rs._self && rs.lastOpTimeWritten >= hopeful->hbinfo().opTime ) {
        vote = -10000; // we are the primary and our op time is at least as new; do not let it become primary
    }
    else if( primary && primary->hbinfo().opTime >= hopeful->hbinfo().opTime ) {
        vote = -10000; // a primary already exists and its op time is at least as new
    }
    else if( highestPriority && highestPriority->config().priority > hopeful->config().priority) {
        vote = -10000; // a server with a higher priority exists
    }
    else {
        vote = yea(whoid); // cast our votes for the requester
        rs.relinquish();
    }
    b.append("vote", vote);
    b.append("round", round);
}
This shows that to become primary a member needs the agreement of the other servers, and its tally must exceed half of the total votes.
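A minimal sketch of that majority rule, assuming the behavior shown above: the candidate starts with its own votes, adds whatever each member returns (a veto comes back as -10000), and wins only if tally*2 > totalVotes. This is illustrative only, not MongoDB's Consensus class.

// Illustrative sketch of the election tally and majority check.
#include <iostream>
#include <vector>

bool electionSucceeded(int ownVotes, const std::vector<int>& replies, int totalVotes) {
    int tally = ownVotes;
    for (size_t i = 0; i < replies.size(); ++i)
        tally += replies[i];          // a -10000 reply acts as a veto
    return tally * 2 > totalVotes;    // strictly more than half of all configured votes
}

int main() {
    // 3-member set, one vote each: two yes replies -> elected
    std::cout << electionSucceeded(1, {1, 1}, 3) << std::endl;      // prints 1
    // one member vetoes -> the tally collapses, not elected
    std::cout << electionSucceeded(1, {1, -10000}, 3) << std::endl; // prints 0
    return 0;
}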
That wraps up replset synchronization and state switching. The flow is fairly involved; pay attention to the heartbeat part, the producer thread, and the state transitions driven by msgCheckNewState. Also note that a member with priority 0 can never become primary, and the set only operates normally while more than half of the votes are reachable.
Original article: MongoDB source code analysis (16): replication — replset synchronization and state switching
Author: yhjj0108, Yang Hao