官方文档https://docs.mongodb.com/v3.6/tutorial/configure-a-non-voting-replica-set-member/
主节点全部失败了,涉及Config Server、Shard Server,这个时候,通过mongos可以连接到副节点,但是执行操作时会报错, 说连不上shard2的primay主节点
"errmsg" : "Could not find host matching read preference { mode: \"primary\", tags: [ {} ] } for set shard2"
处理方法
方法1、一般会自动切换,如果shard是由3个节点组成的replica set,主节无法连接到另外两个副节点时,和rs.conf()里面的electionTimeoutMillis值相关默认是10秒,两个副节点自动会有选择一个节点成为主节点;如果shard是由2个节点组成的replica set,把主节点启动后该主节点自动变成了副节点,原副节点自动变成了新的主节点
方法2、如果主节点机器完全无法使用了,需要对某个shard的副本节点执行切换,切换到primay状态,mongo命令进入这个shard的辅助节点,查看rs.conf()配置信息,并查看rs.status()状态,看到主节点无法连接,使用rs.reconfig()重新配置,rs.reconfig括号里面的内容参考rs.conf()的信息,去掉primay节点的信息,并添加{ "force": true }。之后退出重新登录这个shard,发现副本切换到了primay状态,重新登录mongos发现正常了
使用mongo登录mongod实例执行rs.status()或db.isMaster()可以看到谁是主节点,谁是副节点
mongos实例执行rs.status()会报错replSetGetStatus is not supported through mongos
mongos实例执行db.isMaster()正常,默认连接到主节点
实验案例
1、切换到TDB6执行show tables报错,说这个数据库对应的shard的primay节点无法找到
mongo --host 172.22.138.157 --port 27001
mongos> use TDB6
mongos> show tables
2019-06-20T00:16:17.935-0700 E QUERY [thread1] Error: listCollections failed: {
"ok" : 0,
"errmsg" : "Could not find host matching read preference { mode: \"primary\", tags: [ {} ] } for set shard28003",
2、只能进入这个shard的辅助节点,查看rs.conf()配置信息,并查看rs.status()状态,看到主节点无法连接,使用rs.reconfig()重新配置,rs.reconfig括号里面的内容参考rs.conf()的信息,去掉primay节点的信息,并添加{ "force": true }
mongo --host 172.22.138.157 --port 28003
shard28003:SECONDARY>rs.conf()
{
"_id" : "shard28003",
"version" : 1,
"protocolVersion" : NumberLong(1),
"members" : [
{
"_id" : 0,
"host" : "172.22.138.157:28003",
"arbiterOnly" : false,
"buildIndexes" : true,
"hidden" : false,
"priority" : 1,
"tags" : {
},
"slaveDelay" : NumberLong(0),
"votes" : 1
},
{
"_id" : 1,
"host" : "172.22.138.158:28003",
"arbiterOnly" : false,
"buildIndexes" : true,
"hidden" : false,
"priority" : 1,
"tags" : {
},
"slaveDelay" : NumberLong(0),
"votes" : 1
}
],
"settings" : {
"chainingAllowed" : true,
"heartbeatIntervalMillis" : 2000,
"heartbeatTimeoutSecs" : 10,
"electionTimeoutMillis" : 10000,
"catchUpTimeoutMillis" : -1,
"catchUpTakeoverDelayMillis" : 30000,
"getLastErrorModes" : {
},
"getLastErrorDefaults" : {
"w" : 1,
"wtimeout" : 0
},
"replicaSetId" : ObjectId("5d09d2c89fb43c4506d995ac")
}
}
shard28003:SECONDARY> rs.status()
{
"set" : "shard28003",
"date" : ISODate("2019-06-20T07:25:07.438Z"),
"myState" : 2,
"term" : NumberLong(2),
"syncingTo" : "",
"syncSourceHost" : "",
"syncSourceId" : -1,
"heartbeatIntervalMillis" : NumberLong(2000),
"optimes" : {
"lastCommittedOpTime" : {
"ts" : Timestamp(1561001587, 1),
"t" : NumberLong(2)
},
"readConcernMajorityOpTime" : {
"ts" : Timestamp(1561001587, 1),
"t" : NumberLong(2)
},
"appliedOpTime" : {
"ts" : Timestamp(1561001587, 1),
"t" : NumberLong(2)
},
"durableOpTime" : {
"ts" : Timestamp(1561001587, 1),
"t" : NumberLong(2)
}
},
"members" : [
{
"_id" : 0,
"name" : "172.22.138.157:28003",
"health" : 1,
"state" : 2,
"stateStr" : "SECONDARY",
"uptime" : 15254,
"optime" : {
"ts" : Timestamp(1561001587, 1),
"t" : NumberLong(2)
},
"optimeDate" : ISODate("2019-06-20T03:33:07Z"),
"syncingTo" : "",
"syncSourceHost" : "",
"syncSourceId" : -1,
"infoMessage" : "could not find member to sync from",
"configVersion" : 1,
"self" : true,
"lastHeartbeatMessage" : ""
},
{
"_id" : 1,
"name" : "172.22.138.158:28003",
"health" : 0,
"state" : 8,
"stateStr" : "(not reachable/healthy)",
"uptime" : 0,
"optime" : {
"ts" : Timestamp(0, 0),
"t" : NumberLong(-1)
},
"optimeDurable" : {
"ts" : Timestamp(0, 0),
"t" : NumberLong(-1)
},
"optimeDate" : ISODate("1970-01-01T00:00:00Z"),
"optimeDurableDate" : ISODate("1970-01-01T00:00:00Z"),
"lastHeartbeat" : ISODate("2019-06-20T07:25:07.121Z"),
"lastHeartbeatRecv" : ISODate("2019-06-20T03:33:08.040Z"),
"pingMs" : NumberLong(0),
"lastHeartbeatMessage" : "Connection refused",
"syncingTo" : "",
"syncSourceHost" : "",
"syncSourceId" : -1,
"infoMessage" : "",
"configVersion" : -1
}
],
"ok" : 1,
"operationTime" : Timestamp(1561001587, 1),
"$gleStats" : {
"lastOpTime" : Timestamp(0, 0),
"electionId" : ObjectId("000000000000000000000000")
},
"$configServerState" : {
"opTime" : {
"ts" : Timestamp(1561001589, 1),
"t" : NumberLong(2)
}
},
"$clusterTime" : {
"clusterTime" : Timestamp(1561015481, 1),
"signature" : {
"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
"keyId" : NumberLong(0)
}
}
}
shard28003:SECONDARY> rs.reconfig({
"_id" : "shard28003",
"version" : 1,
"protocolVersion" : NumberLong(1),
"members" : [
{
"_id" : 0,
"host" : "172.22.138.157:28003",
"arbiterOnly" : false,
"buildIndexes" : true,
"hidden" : false,
"priority" : 1,
"tags" : {
},
"slaveDelay" : NumberLong(0),
"votes" : 1
}
],
"settings" : {
"chainingAllowed" : true,
"heartbeatIntervalMillis" : 2000,
"heartbeatTimeoutSecs" : 10,
"electionTimeoutMillis" : 10000,
"catchUpTimeoutMillis" : -1,
"catchUpTakeoverDelayMillis" : 30000,
"getLastErrorModes" : {
},
"getLastErrorDefaults" : {
"w" : 1,
"wtimeout" : 0
},
"replicaSetId" : ObjectId("5d09d2c89fb43c4506d995ac")
}
},
{ "force": true }
)
3、退出后,重新登录,发现副本切换到了primay状态
mongo --host 172.22.138.157 --port 28003
shard28003:PRIMARY>
4、重新登录mongos,发现可以正常操作show tables,但是无法db.createCollection,因为Config Server的replica set的主节点坏了,重新按上面1、2、3方法切换Config Server的replica set的副节点为主节点就好了
mongo --host 172.22.138.157 --port 27001
mongos> show tables
test06
mongos> use testdb2
switched to db testdb2
mongos> db.createCollection("table1")
{
"ok" : 0,
"errmsg" : "Database testdb2 not found due to Could not confirm non-existence of database testdb 2 due to Could not find host matching read preference { mode: \"primary\" } for set config29001",
"code" : 133,
"codeName" : "FailedToSatisfyReadPreference",
"operationTime" : Timestamp(1561034171, 1),
"$clusterTime" : {
"clusterTime" : Timestamp(1561034171, 1),
"signature" : {
"hash" : BinData(0,"AAAAAAAAAAAAAAAAAAAAAAAAAAA="),
"keyId" : NumberLong(0)
}
}
}
5、遇到其他shard也是这种情况下,继续上面1、2、3的方法操作
来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/30126024/viewspace-2648280/,如需转载,请注明出处,否则将追究法律责任。
转载于:http://blog.itpub.net/30126024/viewspace-2648280/