哨兵机制
基本原理
命令发送
- sentinel每10s每个Sentinel向master\slaves发送INFO命令
- 发现salve节点
- 确认主从关系
- sentinel每2s每个Sentinel向master\slaves\sentinels用PUBLISH发布信息
- 发布对节点的“看法”和自身信息
- 发现其他sentinel节点
- sentinel每1s每个sentinel向master\slaves\sentinels发送PING命令
- 心跳检测,下线判定的依据
- 客户端将sentinel作为redis发现的配置中心,也通过sentinel的频道作为配置变更的提醒机制
- SENTINEL GET-MASTER-ADDR-BY-NAME获取主节点信息
- SENTINEL SLAVES获取从节点信息
- 通过switch-master频道获取主备切换的信息
- 通过convert-to-slave, +sdown, +slave, +slave-reconf-done等频道获取slave节点配置变更信息
命令接收
- 收到master\slaves的INFO回复
- 收到master\slaves\sentinels对PING命令的回复
- 收到订阅频道master/slaves的sentinel:hello的订阅信息,和其它sentinels用PUBLISH发来的信息
流程图
源码分析
相关常量和结构体
/* 正在监控的RedisInstance对象 */
#define SRI_MASTER (1<<0) /* master */
#define SRI_SLAVE (1<<1) /* slave */
#define SRI_SENTINEL (1<<2) /* sentinel */
#define SRI_S_DOWN (1<<3) /* 主观下线 */
#define SRI_O_DOWN (1<<4) /* 客观下线 */
#define SRI_MASTER_DOWN (1<<5) /* 当前sentinel认为master下线 */
#define SRI_FAILOVER_IN_PROGRESS (1<<6) /* 对于这个master,正在进行故障转移 */
#define SRI_PROMOTED (1<<7) /* 这个slave已经被选为待晋升 */
#define SRI_RECONF_SENT (1<<8) /* SLAVEOF 已经发送. */
#define SRI_RECONF_INPROG (1<<9) /* slave正在同步master */
#define SRI_RECONF_DONE (1<<10) /* slave已经完成到master的同步 */
#define SRI_FORCE_FAILOVER (1<<11) /* Force failover with master up. */
#define SRI_SCRIPT_KILL_SENT (1<<12) /* SCRIPT KILL already sent on -BUSY */
/* Note: times are in milliseconds. */
#define SENTINEL_INFO_PERIOD 10000
#define SENTINEL_PING_PERIOD 1000
#define SENTINEL_ASK_PERIOD 1000
#define SENTINEL_PUBLISH_PERIOD 2000
#define SENTINEL_DEFAULT_DOWN_AFTER 30000
#define SENTINEL_HELLO_CHANNEL "__sentinel__:hello"
#define SENTINEL_TILT_TRIGGER 2000
#define SENTINEL_TILT_PERIOD (SENTINEL_PING_PERIOD*30)
#define SENTINEL_DEFAULT_SLAVE_PRIORITY 100
#define SENTINEL_SLAVE_RECONF_TIMEOUT 10000
#define SENTINEL_DEFAULT_PARALLEL_SYNCS 1
#define SENTINEL_MIN_LINK_RECONNECT_PERIOD 15000
#define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*3*1000)
#define SENTINEL_MAX_PENDING_COMMANDS 100
#define SENTINEL_ELECTION_TIMEOUT 10000
#define SENTINEL_MAX_DESYNC 1000
#define SENTINEL_DEFAULT_DENY_SCRIPTS_RECONFIG 1
/* Failover machine different states. */
#define SENTINEL_FAILOVER_STATE_NONE 0 /* 没有故障转移在进行中 */
#define SENTINEL_FAILOVER_STATE_WAIT_START 1 /* 等待开始一个故障转移 Wait for failover_start_time*/
#define SENTINEL_FAILOVER_STATE_SELECT_SLAVE 2 /* 正选择一个slave晋升 */
#define SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE 3 /* 发送slaveof no one给slave */
#define SENTINEL_FAILOVER_STATE_WAIT_PROMOTION 4 /* 等待所选择的slave晋升完成 */
#define SENTINEL_FAILOVER_STATE_RECONF_SLAVES 5 /* 正在重新配置其它salves:SLAVEOF newmaster */
#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 6 /* 其它slave重新配置完成,监视晋升的slave . */
#define SENTINEL_MASTER_LINK_STATUS_UP 0
#define SENTINEL_MASTER_LINK_STATUS_DOWN 1
/* 一条到sentinelRedisInstance的连接。我们可能会有一个sentinel集合监控很多master。
* 比如5个sentinel监控100个master,那么对于100个master如果我们都各创建5个到sentinel的instanceLink,
* 我们将会创建500个instanceLink,但实际上我们只需要创建5个到sentinel的instanceLink共享就行了,
* refcount被用来实现共享,这样不仅用5个instanceLink代替了500个instanceLink,还用5个PING代替了500个PING。
*
* instanceLink的共享仅用于sentinels,master和slave的refcount总是1。 */
typedef struct instanceLink {
int refcount; /* 这个连接被多少sentinelRedisInstance共用 */
int disconnected; /* 如果cc或pc连接需要重连 */
int pending_commands; /* 这条连接上正在等待reply的数量 */
redisAsyncContext *cc; /* 命令连接的Hiredis上下文 */
redisAsyncContext *pc; /* 命令连接的Hiredis上下文 */
mstime_t cc_conn_time; /* cc连接的时间 */
mstime_t pc_conn_time; /* pc连接的时间 */
mstime_t pc_last_activity; /* 我们最后收到消息的时间 */
mstime_t last_avail_time; /* 最后一次这个实例收到一个合法的PING回复的时间 */
/* 正在等待回复的最后一次PING的时间。
* 当收到回复时时设置为0
* 当值为0并发送一个新的PING时设置为当前时间, */
mstime_t act_ping_time;
/* 最后一次发送ping的时间,仅用于防止在失败时发送过多的PING。空闲时间使用act_ping_time计算。 */
mstime_t last_ping_time;
/* 收到上一次回复的时间,无论收到的回复是什么。用来检查连接是否空闲,从而必须重连。 */
mstime_t last_pong_time;
mstime_t last_reconn_time; /* 当连接段开始,最后一次企图重连的时间 */
} instanceLink;
typedef struct sentinelRedisInstance {
int flags; /* SRI_...标志 */
char *name; /* 从这个sentinel视角看到的master名 */
char *runid; /* 这个实例的runid或者这个sentinel的uniqueID */
uint64_t config_epoch; /* 配置纪元 */
sentinelAddr *addr; /* Master host. */
instanceLink *link; /* 到这个实例的连接,对于sentinels可能是共享的 */
mstime_t last_pub_time; /* 最后一次我们我们发送hello的时间 */
mstime_t last_hello_time; /* 仅Sentinel使用,最后一次我们收到hello消息的时间 */
mstime_t last_master_down_reply_time; /* 最后一次收到SENTINEL is-master-down的回复时间 */
mstime_t s_down_since_time; /* Subjectively down since time. */
mstime_t o_down_since_time; /* Objectively down since time. */
mstime_t down_after_period; /* 如果经历了 Consider it down after that period. */
mstime_t info_refresh; /* 我们最后一次收到INFO回复的时间点 */
dict *renamed_commands; /* 重命名的命令 */
/* 角色和我们第一次观察到变更为该角色的时间。
* 在延迟替换时这是很有用的。我们需要等待一段时间来给新的leader报告新的配置。 */
int role_reported;
mstime_t role_reported_time;
mstime_t slave_conf_change_time; /* 最后一次slave的master地址改变的时间点 */
/* master相关 */
dict *sentinels; /* 监视相同的master的其他sentinels */
dict *slaves; /* 这个master的slaves */
unsigned int quorum;/* 故障转移时需要统一的sentinels数量 */
int parallel_syncs; /* 同时有多少slaves进行重配置。 */
char *auth_pass; /* 验证master和slaves时需要提供的密码 */
/* slave相关 */
mstime_t master_link_down_time; /* slave的复制连接下线的时间 */
int slave_priority; /* 根据INFO输出的slave的优先级 */
mstime_t slave_reconf_sent_time; /* 发送SLAVE OF 的时间 */
struct sentinelRedisInstance *master; /* 如果是slave,这个字段表示她的master */
char *slave_master_host; /* INFO报告的master host */
int slave_master_port; /* INFO报告的master port */
int slave_master_link_status; /* Master link status as reported by INFO */
unsigned long long slave_repl_offset; /* Slave replication offset. */
/* Failover */
char *leader; /* 如果是master,表示执行故障转移时的runid,如果是sentinel,表示这个sentinel投票的leader */
uint64_t leader_epoch; /* leader的配置纪元 */
uint64_t failover_epoch; /* 当前执行的故障转移的配置纪元 */
int failover_state; /* See SENTINEL_FAILOVER_STATE_* defines. */
mstime_t failover_state_change_time;
mstime_t failover_start_time; /* 最后一次故障转移企图开始的时间 */
mstime_t failover_timeout; /* 刷新故障转移状态的最大时间 */
mstime_t failover_delay_logged; /* For what failover_start_time value we
logged the failover delay. */
struct sentinelRedisInstance *promoted_slave; /* 晋升的slave实例 */
/* 用来提醒管理员和重新配置客户端的脚本:如果为NULL则没有脚本会执行 */
char *notification_script;
char *client_reconfig_script;
sds info; /* cached INFO output */
} sentinelRedisInstance;
/* Sentinel状态 */
struct sentinelState {
char myid[CONFIG_RUN_ID_SIZE + 1]; /* 这个Sentinel的ID */
uint64_t current_epoch; /* 当前纪元 */
dict *masters; /* 所有master的字典。key是实例名,value是sentinelRedisInstance指针 */
int tilt; /* 我们是否处于TILT模式? */
int running_scripts; /* 正在执行的脚本的数量 */
mstime_t tilt_start_time; /* TILT模式开始时间 */
mstime_t previous_time; /* 上次运行定时器函数的时间 */
list *scripts_queue; /* 等待执行的用户脚本队列 */
char *announce_ip; /* gossiped给其它sentinels的IP地址 */
int announce_port; /* gossiped给其它sentinels的IP端口 */
unsigned long simfailure_flags; /* 模拟故障转移 */
int deny_scripts_reconfig; /* 是否运行通过Allow SENTINEL SET ...在运行时改变脚本地址 */
} sentinel;
初始化
int main(int argc, char **argv) {
...
/* 初始化端口号常量和数据结构 */
if (server.sentinel_mode) {
initSentinelConfig();
initSentinel();
}
...
if (!server.sentinel_mode) {
...
} else {
sentinelIsRunning();
}
...
aeSetBeforeSleepProc(server.el,beforeSleep);
aeSetAfterSleepProc(server.el,afterSleep);
aeMain(server.el);
aeDeleteEventLoop(server.el);
return 0;
}
/* 这个函数会在server运行在Sentinel模式下时调用,来启动、加载配置和为正常操作做准备 */
void sentinelIsRunning(void) {
int j;
if (server.configfile == NULL) {
serverLog(LL_WARNING,
"Sentinel started without a config file. Exiting...");
exit(1);
} else if (access(server.configfile, W_OK) == -1) {
serverLog(LL_WARNING,
"Sentinel config file %s is not writable: %s. Exiting...",
server.configfile, strerror(errno));
exit(1);
}
/* 如果Sentinel在配置文件中还没有一个ID,我们将会随机生成一个并持久化到磁盘上。
* 从现在开始,即使重启也会使用同一个ID */
for (j = 0; j < CONFIG_RUN_ID_SIZE; j++)
if (sentinel.myid[j] != 0) break;
if (j == CONFIG_RUN_ID_SIZE) {
/* 选择一个ID并持久化到配置文件中 */
getRandomHexChars(sentinel.myid, CONFIG_RUN_ID_SIZE);
sentinelFlushConfig();
}
/* Log its ID to make debugging of issues simpler. */
serverLog(LL_WARNING, "Sentinel ID is %s", sentinel.myid);
/* 在启动时,对于每个配置的master,我们想要生成一个+monitor的事件 */
sentinelGenerateInitialMonitorEvents();
}
void sentinelTimer(void) {
sentinelCheckTiltCondition(); // 检查TILT条件
sentinelHandleDictOfRedisInstances(sentinel.masters);
sentinelRunPendingScripts();
sentinelCollectTerminatedScripts();
sentinelKillTimedoutScripts();
/* 我们持续的修改Redis定时器中断的频率是为了防止各个sentinel的定时器同步,
* 从而降低同时发起领导选举的可能性 */
server.hz = CONFIG_DEFAULT_HZ + rand() % CONFIG_DEFAULT_HZ;
}
/* 这个函数检查我们是否需要进入TITL模式。
* 当我们在两次时间中断调用中遇到以下情况时,如果调用时间差为负或者超过了2s,我们会进入TITL模式。
* 注意:如果我们认为100ms左右是正常的,如果我们需要进入TITL模式,说明我们遇到了以下情况:
* 1) Sentinel进程因为某些原因阻塞住了,可能是:负载太高,IO冻结,信号停止等等。
* 2) 系统时钟被修改了。
* 在上面两种情况下Sentinel会认为出现了超时甚至故障,这是我们进入TILT,并且在SENTINEL_TILT_PERIOD
* 时间内我们都不执行任何操作。
* 在TILT期间,我们仍然收集信息,但是我们不执行操作。 */
void sentinelCheckTiltCondition(void) {
mstime_t now = mstime();
mstime_t delta = now - sentinel.previous_time;
if (delta < 0 || delta > SENTINEL_TILT_TRIGGER) {
sentinel.tilt = 1;
sentinel.tilt_start_time = mstime();
sentinelEvent(LL_WARNING, "+tilt", NULL, "#tilt mode entered");
}
sentinel.previous_time = mstime();
}
/* 对所有监控的master执行调度操作。 */
void sentinelHandleDictOfRedisInstances(dict *instances) {
dictIterator *di;
dictEntry *de;
sentinelRedisInstance *switch_to_promoted = NULL;
/* 对于每个master,有一些额外的事情要执行 */
di = dictGetIterator(instances);
while ((de = dictNext(di)) != NULL) {
sentinelRedisInstance *ri = dictGetVal(de);
sentinelHandleRedisInstance(ri);
if (ri->flags & SRI_MASTER) {
// 如果是master,需要递归的处理slaves和sentinels
sentinelHandleDictOfRedisInstances(ri->slaves);
sentinelHandleDictOfRedisInstances(ri->sentinels);
if (ri->failover_state == SENTINEL_FAILOVER_STATE_UPDATE_CONFIG) {
switch_to_promoted = ri;
}
}
}
if (switch_to_promoted)
sentinelFailoverSwitchToPromotedSlave(switch_to_promoted);
dictReleaseIterator(di);
}
/* 对指定的Redis实例执行调度操作 */
void sentinelHandleRedisInstance(sentinelRedisInstance *ri) {
/* ========== MONITORING HALF ============ */
/* 对每种实例都要执行 */
sentinelReconnectInstance(ri); // 如果断开,重连
sentinelSendPeriodicCommands(ri); // 发送周期命令
/* ============== ACTING HALF ============= */
/* 如果我们在TILT模式,则不会执行故障转移的相关操作 */
if (sentinel.tilt) {
if (mstime() - sentinel.tilt_start_time < SENTINEL_TILT_PERIOD) return;
sentinel.tilt = 0;
sentinelEvent(LL_WARNING, "-tilt", NULL, "#tilt mode exited");
}
/* 对master、slave或sentinel检查是否客观下线 */
sentinelCheckSubjectivelyDown(ri);
/* 如果是master或者slave */
if (ri->flags & (SRI_MASTER | SRI_SLAVE)) {
/* Nothing so far. */
}
/* 如果当前实例是master */
if (ri->flags & SRI_MASTER) {
sentinelCheckObjectivelyDown(ri);
if (sentinelStartFailoverIfNeeded(ri))
sentinelAskMasterStateToOtherSentinels(ri, SENTINEL_ASK_FORCED);
sentinelFailoverStateMachine(ri);
sentinelAskMasterStateToOtherSentinels(ri, SENTINEL_NO_FLAGS);
}
}
/* 如果ri的link是断连状态,创建一个异步的连接。
* 注意:命令连接和订阅连接只要有一条断开,link->disconnected都会变成true */
void sentinelReconnectInstance(sentinelRedisInstance *ri) {
if (ri->link->disconnected == 0) return;
if (ri->addr->port == 0) return; /* port == 0 means invalid address. */
instanceLink *link = ri->link;
mstime_t now = mstime();
if (now - ri->link->last_reconn_time < SENTINEL_PING_PERIOD) return;
ri->link->last_reconn_time = now;
/* 命令连接 */
if (link->cc == NULL) {
// 发起一个异步连接
link->cc = redisAsyncConnectBind(ri->addr->ip, ri->addr->port, NET_FIRST_BIND_ADDR);
if (link->cc->err) {
sentinelEvent(LL_DEBUG, "-cmd-link-reconnection", ri, "%@ #%s",
link->cc->errstr);
instanceLinkCloseConnection(link, link->cc);
} else {
link->pending_commands = 0;
link->cc_conn_time = mstime();
link->cc->data = link;
// 加入事件循环,创建Connect回调,Disconnect回调,发送AUTH消息,设置Client名字,发送PING消息
redisAeAttach(server.el, link->cc);
redisAsyncSetConnectCallback(link->cc,
sentinelLinkEstablishedCallback);
redisAsyncSetDisconnectCallback(link->cc,
sentinelDisconnectCallback);
sentinelSendAuthIfNeeded(ri, link->cc);
sentinelSetClientName(ri, link->cc, "cmd");
/* 当重连后我们尽快先发送一个PING命令 */
sentinelSendPing(ri);
}
}
/* 对于master和slave,发起订阅连接 */
if ((ri->flags & (SRI_MASTER | SRI_SLAVE)) && link->pc == NULL) {
link->pc = redisAsyncConnectBind(ri->addr->ip, ri->addr->port, NET_FIRST_BIND_ADDR);
if (link->pc->err) {
sentinelEvent(LL_DEBUG, "-pubsub-link-reconnection", ri, "%@ #%s",
link->pc->errstr);
instanceLinkCloseConnection(link, link->pc);
} else {
int retval;
link->pc_conn_time = mstime();
link->pc->data = link;
// 将pc socket附加到ae事件循环框架上
redisAeAttach(server.el, link->pc);
redisAsyncSetConnectCallback(link->pc,
sentinelLinkEstablishedCallback);
redisAsyncSetDisconnectCallback(link->pc,
sentinelDisconnectCallback);
sentinelSendAuthIfNeeded(ri, link->pc);
sentinelSetClientName(ri, link->pc, "pubsub");
/* 订阅__sentinel:hello频道 */
retval = redisAsyncCommand(link->pc,
sentinelReceiveHelloMessages, ri, "%s %s",
sentinelInstanceMapCommand(ri, "SUBSCRIBE"),
SENTINEL_HELLO_CHANNEL);
if (retval != C_OK) {
/* 如果我们订阅失败,我们就关闭连接重试 */
instanceLinkCloseConnection(link, link->pc);
return;
}
}
}
/* 清理disconnected状态 */
if (link->cc && (ri->flags & SRI_SENTINEL || link->pc))
link->disconnected = 0;
}
心跳检测
通过sentinel:hello频道发现其它sentinel。
/* 处理从master或者slave收到的__sentinel__:hello频道中的hello消息,或者从其它sentinel直接发来的信息。
* 如果消息中指定的master name未知,消息将被丢弃。 */
void sentinelProcessHelloMessage(char *hello, int hello_len) {
/* Format is composed of 8 tokens:
* 0=ip,1=port,2=runid,3=current_epoch,4=master_name,
* 5=master_ip,6=master_port,7=master_config_epoch. */
int numtokens, port, removed, master_port;
uint64_t current_epoch, master_config_epoch;
char **token = sdssplitlen(hello, hello_len, ",", 1, &numtokens);
sentinelRedisInstance *si, *master;
if (numtokens == 8) {
/* 包含一个master的引用 */
master = sentinelGetMasterByName(token[4]);
if (!master) goto cleanup; /* 未知的master,跳过 */
/* 首先,检查我们是否有相同ip:port和runid的sentinel的信息 */
port = atoi(token[1]);
master_port = atoi(token[6]);
si = getSentinelRedisInstanceByAddrAndRunID(
master->sentinels, token[0], port, token[2]);
current_epoch = strtoull(token[3], NULL, 10);
master_config_epoch = strtoull(token[7], NULL, 10);
/* Sentinel处理情况分析:
* 1. 相同runid并且相同ip:port,什么都不必做
* 2. 相同runid但是不同ip:port,说明这个sentinel出现了地址切换,删除,并重新添加
* 3. 不同runid但是相同ip:port,说明这个ip:port所在的sentinel地址是非法的,我们需要标示所有具有该runid的sentinel非法,然后新增一个新的。
* 4. 不同runid并且不同ip:port,直接新增。 */
if (!si) { // 如果没发现相同ip和runid的sentinel,说明这是一个新的sentinel
/* 如果没有,因为sentinel的地址切换,我们需要移除所有相同runid的sentinels,
* 我们将会在之后添加一个相同runid但是有新的地址的sentinel */
removed = removeMatchingSentinelFromMaster(master, token[2]);
if (removed) {
// 如果找到并删除了相同runid但不同ip的sentinel,说明是sentinel进行了地址切换
sentinelEvent(LL_NOTICE, "+sentinel-address-switch", master,
"%@ ip %s port %d for %s", token[0], port, token[2]);
} else {
/* 如果找到了相同ip:port但不同runid的sentinel,说明这个sentinel是非法的,
* 我们将把这个ip:port关联的sentinel标记为非法,我们将把port设置为0,来标示地址非法。
* 我们将会在之后收到带有该runid实例的Hello消息时更新。 */
sentinelRedisInstance *other =
getSentinelRedisInstanceByAddrAndRunID(
master->sentinels, token[0], port, NULL);
if (other) {
sentinelEvent(LL_NOTICE, "+sentinel-invalid-addr", other, "%@");
other->addr->port = 0; /* 这意味着:地址是非法的 */
sentinelUpdateSentinelAddressInAllMasters(other);
}
}
/* 增加一个新的sentinel,并增加到master的sentinels中 */
si = createSentinelRedisInstance(token[2], SRI_SENTINEL,
token[0], port, master->quorum, master);
if (si) {
if (!removed) sentinelEvent(LL_NOTICE, "+sentinel", si, "%@");
/* 刚创建完实例,runid为空,我们需要立即填充它,否则以后没机会了 */
si->runid = sdsnew(token[2]);
sentinelTryConnectionSharing(si);
if (removed) sentinelUpdateSentinelAddressInAllMasters(si);
sentinelFlushConfig();
}
}
...
/* 更新sentinel的状态 */
if (si) si->last_hello_time = mstime();
}
cleanup:
sdsfreesplitres(token, numtokens);
}
通过master的INFO响应发现从节点
/* 处理从master接收到的INFO命令的回复 */
void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
sds *lines;
int numlines, j;
int role = 0;
/* 缓存INFO的回复信息 */
sdsfree(ri->info);
ri->info = sdsnew(info);
/* 如果在INFO输出中没有下面的域,那么需要被设定为给定的值 */
ri->master_link_down_time = 0;
/* 按行解析 */
lines = sdssplitlen(info, strlen(info), "\r\n", 2, &numlines);
for (j = 0; j < numlines; j++) {
sentinelRedisInstance *slave;
sds l = lines[j];
/* 查找runid:run_id:<40 hex chars>*/
if (sdslen(l) >= 47 && !memcmp(l, "run_id:", 7)) {
if (ri->runid == NULL) { // 如果为空,则填充
ri->runid = sdsnewlen(l + 7, 40);
} else { // 如果不为空,说明该实例进行过重启
if (strncmp(ri->runid, l + 7, 40) != 0) {
sentinelEvent(LL_NOTICE, "+reboot", ri, "%@");
sdsfree(ri->runid);
ri->runid = sdsnewlen(l + 7, 40);
}
}
}
/* old versions: slave0:,,
* new versions: slave0:ip=127.0.0.1,port=9999,... */
/* 如果我们是从master收到的slave信息,则新增slave */
if ((ri->flags & SRI_MASTER) &&
sdslen(l) >= 7 &&
!memcmp(l, "slave", 5) && isdigit(l[5])) {
char *ip, *port, *end;
if (strstr(l, "ip=") == NULL) {
/* Old format. */
ip = strchr(l, ':');
if (!ip) continue;
ip++; /* Now ip points to start of ip address. */
port = strchr(ip, ',');
if (!port) continue;
*port = '\0'; /* nul term for easy access. */
port++; /* Now port points to start of port number. */
end = strchr(port, ',');
if (!end) continue;
*end = '\0'; /* nul term for easy access. */
} else {
/* New format. */
ip = strstr(l, "ip=");
if (!ip) continue;
ip += 3; /* Now ip points to start of ip address. */
port = strstr(l, "port=");
if (!port) continue;
port += 5; /* Now port points to start of port number. */
/* Nul term both fields for easy access. */
end = strchr(ip, ',');
if (end) *end = '\0';
end = strchr(port, ',');
if (end) *end = '\0';
}
/* 如果我们没有记录这个slave,则新增 */
if (sentinelRedisInstanceLookupSlave(ri, ip, atoi(port)) == NULL) {
if ((slave = createSentinelRedisInstance(NULL, SRI_SLAVE, ip,
atoi(port), ri->quorum, ri)) != NULL) {
sentinelEvent(LL_NOTICE, "+slave", slave, "%@");
sentinelFlushConfig();
}
}
}
/* master_link_down_since_seconds: */
/* 更新主从断开时间 */
if (sdslen(l) >= 32 &&
!memcmp(l, "master_link_down_since_seconds", 30)) {
ri->master_link_down_time = strtoll(l + 31, NULL, 10) * 1000;
}
/* role: */
if (!memcmp(l, "role:master", 11)) role = SRI_MASTER;
else if (!memcmp(l, "role:slave", 10)) role = SRI_SLAVE;
// 如果当前角色是SLAVE,则更新slave相关的信息
if (role == SRI_SLAVE) {
/* master_host: */
if (sdslen(l) >= 12 && !memcmp(l, "master_host:", 12)) {
if (ri->slave_master_host == NULL ||
strcasecmp(l + 12, ri->slave_master_host)) {
sdsfree(ri->slave_master_host);
ri->slave_master_host = sdsnew(l + 12);
ri->slave_conf_change_time = mstime();
}
}
/* master_port: */
if (sdslen(l) >= 12 && !memcmp(l, "master_port:", 12)) {
int slave_master_port = atoi(l + 12);
if (ri->slave_master_port != slave_master_port) {
ri->slave_master_port = slave_master_port;
ri->slave_conf_change_time = mstime();
}
}
/* master_link_status: */
if (sdslen(l) >= 19 && !memcmp(l, "master_link_status:", 19)) {
ri->slave_master_link_status =
(strcasecmp(l + 19, "up") == 0) ?
SENTINEL_MASTER_LINK_STATUS_UP :
SENTINEL_MASTER_LINK_STATUS_DOWN;
}
/* slave_priority: */
if (sdslen(l) >= 15 && !memcmp(l, "slave_priority:", 15))
ri->slave_priority = atoi(l + 15);
/* slave_repl_offset: */
if (sdslen(l) >= 18 && !memcmp(l, "slave_repl_offset:", 18))
ri->slave_repl_offset = strtoull(l + 18, NULL, 10);
}
}
ri->info_refresh = mstime();
sdsfreesplitres(lines, numlines);
...
}
故障发现
心跳探测
/* 发送周期性的PING、INFO和PUBLISH命令到指定的Redis实例 */
void sentinelSendPeriodicCommands(sentinelRedisInstance *ri) {
mstime_t now = mstime();
mstime_t info_period, ping_period;
int retval;
/* 如果当前没有成功连接,直接返回 */
if (ri->link->disconnected) return;
/* 像INFO、PING、PUBLISH这样的命令并不重要,如果这个连接上阻塞的命令过多,我们直接返回。
* 当网络环境不好时,我们不希望发送为此消耗过多的内存。
* 注意我们有保护措施,如果这条连接检测到超时,连接会断开并重连。 */
if (ri->link->pending_commands >=
SENTINEL_MAX_PENDING_COMMANDS * ri->link->refcount)
return;
/* 当该实例是一个处于客观下线或者故障转移中的master的slave时,INFO的发送周期从10s一次改为1s一次。
* 这样我们可以更快的捕捉到slave向master的晋升。
* 类似的,如果这个slave和master断连了,我们也会更频繁的监视INFO的输出,来更快的捕捉到恢复。*/
if ((ri->flags & SRI_SLAVE) &&
((ri->master->flags & (SRI_O_DOWN | SRI_FAILOVER_IN_PROGRESS)) ||
(ri->master_link_down_time != 0))) {
info_period = 1000;
} else {
info_period = SENTINEL_INFO_PERIOD;
}
/* ping的周期为min(down_after_period,SENTINEL_PING_PERIOD) */
ping_period = ri->down_after_period;
if (ping_period > SENTINEL_PING_PERIOD) ping_period = SENTINEL_PING_PERIOD;
/* 对master和slaves发送INFO命令 */
if ((ri->flags & SRI_SENTINEL) == 0 &&
(ri->info_refresh == 0 ||
(now - ri->info_refresh) > info_period)) {
retval = redisAsyncCommand(ri->link->cc,
sentinelInfoReplyCallback, ri, "%s",
sentinelInstanceMapCommand(ri, "INFO"));
if (retval == C_OK) ri->link->pending_commands++;
}
/* 向所有实例发送PING命令 */
if ((now - ri->link->last_pong_time) > ping_period &&
(now - ri->link->last_ping_time) > ping_period / 2) {
sentinelSendPing(ri);
}
/* 每2s向所有实例发布PUBLISH hello消息:
* 其中对于master和slaves通过频道发送,对sentinel通过PUBLISH发送 */
if ((now - ri->last_pub_time) > SENTINEL_PUBLISH_PERIOD) {
sentinelSendHello(ri);
}
}
主观下线
/* 从我们的视角看这个节点是否是主观下线的 */
void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) {
mstime_t elapsed = 0;
if (ri->link->act_ping_time)
elapsed = mstime() - ri->link->act_ping_time;
else if (ri->link->disconnected)
elapsed = mstime() - ri->link->last_avail_time;
/* 检查是否我们需要重连链接
* 1) 如果命令连接已建立超过15s且发送了PING,但是下线周期已超过一半,还没有收到回复,就重连。
* 2) 如果订阅连接已建立超过15s且发送了PING,但是已经过3个订阅周期=6s都没有收到回复,就重连。*/
if (ri->link->cc &&
(mstime() - ri->link->cc_conn_time) >
SENTINEL_MIN_LINK_RECONNECT_PERIOD &&
ri->link->act_ping_time != 0 && /* 有一个PING正在等待响应 */
/* 这个阻塞的PING延迟了,我们甚至都没有收到错误信息,可能redis在执行一个很长的阻塞命令 */
(mstime() - ri->link->act_ping_time) > (ri->down_after_period / 2) &&
(mstime() - ri->link->last_pong_time) > (ri->down_after_period / 2)) {
instanceLinkCloseConnection(ri->link, ri->link->cc);
}
if (ri->link->pc &&
(mstime() - ri->link->pc_conn_time) >
SENTINEL_MIN_LINK_RECONNECT_PERIOD &&
(mstime() - ri->link->pc_last_activity) > (SENTINEL_PUBLISH_PERIOD * 3)) {
instanceLinkCloseConnection(ri->link, ri->link->pc);
}
/* 如果这个实例满足以下两个条件时,我们认为它主观下线了:
* 1) 在down_after_period时间内,没有收到PING的回复或者没有重连上。
* 2) 我们认为这是一个master,但是他说自己是一个slave,并且已经报告了很久了。*/
if (elapsed > ri->down_after_period ||
(ri->flags & SRI_MASTER &&
ri->role_reported == SRI_SLAVE &&
mstime() - ri->role_reported_time >
(ri->down_after_period + SENTINEL_INFO_PERIOD * 2))) {
/* 客观下线 */
if ((ri->flags & SRI_S_DOWN) == 0) {
sentinelEvent(LL_WARNING, "+sdown", ri, "%@");
ri->s_down_since_time = mstime();
ri->flags |= SRI_S_DOWN;
}
} else {
/* 客观上线 */
if (ri->flags & SRI_S_DOWN) {
sentinelEvent(LL_WARNING, "-sdown", ri, "%@");
ri->flags &= ~(SRI_S_DOWN | SRI_SCRIPT_KILL_SENT);
}
}
}
客观下线
void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int flags) {
dictIterator *di;
dictEntry *de;
di = dictGetIterator(master->sentinels);
while ((de = dictNext(di)) != NULL) {
sentinelRedisInstance *ri = dictGetVal(de);
mstime_t elapsed = mstime() - ri->last_master_down_reply_time;
char port[32];
int retval;
/* 如果从其他sentinel得到state的时间过长,我们认为失效了,就清理掉 */
if (elapsed > SENTINEL_ASK_PERIOD * 5) {
ri->flags &= ~SRI_MASTER_DOWN;
sdsfree(ri->leader);
ri->leader = NULL;
}
/* 仅当满足以下情况时,我们才发送询问消息:
* 1) 当前master处于主观下线。
* 2) 和sentinel是连接状态。
* 3) 在1s内我们没有接受过sentinel is-master-down-by-addr回复信息。*/
if ((master->flags & SRI_S_DOWN) == 0) continue;
if (ri->link->disconnected) continue;
if (!(flags & SENTINEL_ASK_FORCED) &&
mstime() - ri->last_master_down_reply_time < SENTINEL_ASK_PERIOD)
continue;
/* 询问其他sentinel */
ll2string(port, sizeof(port), master->addr->port);
retval = redisAsyncCommand(ri->link->cc,
sentinelReceiveIsMasterDownReply, ri,
"%s is-master-down-by-addr %s %s %llu %s",
sentinelInstanceMapCommand(ri, "SENTINEL"),
master->addr->ip, port,
sentinel.current_epoch,
(master->failover_state > SENTINEL_FAILOVER_STATE_NONE) ?
sentinel.myid : "*");
if (retval == C_OK) ri->link->pending_commands++;
}
dictReleaseIterator(di);
}
/* 根据配置的quorum,该master是否处于客观下线状态。
* 注意:客观下线是个法定人数,它仅仅意味着在给定的时间内有足够多的sentinels到这个实例是不可达的。
* 然而这个消息可能会延迟,所有它不是一个强保证,不能保证:
* N个实例在同一时刻都认为某个实例处于主观下线状态。*/
void sentinelCheckObjectivelyDown(sentinelRedisInstance *master) {
dictIterator *di;
dictEntry *de;
unsigned int quorum = 0, odown = 0;
if (master->flags & SRI_S_DOWN) {
quorum = 1; /* 当前sentinel认为已经下线 */
/* 统计其他节点是否认为已经下线 */
di = dictGetIterator(master->sentinels);
while ((de = dictNext(di)) != NULL) {
sentinelRedisInstance *ri = dictGetVal(de);
if (ri->flags & SRI_MASTER_DOWN) quorum++;
}
dictReleaseIterator(di);
// 如果超过了预设的法定人数,则认为客观下线了
if (quorum >= master->quorum) odown = 1;
}
/* 根据odown设置master状态 */
if (odown) {
if ((master->flags & SRI_O_DOWN) == 0) {
sentinelEvent(LL_WARNING, "+odown", master, "%@ #quorum %d/%d",
quorum, master->quorum);
master->flags |= SRI_O_DOWN;
master->o_down_since_time = mstime();
}
} else {
if (master->flags & SRI_O_DOWN) {
sentinelEvent(LL_WARNING, "-odown", master, "%@");
master->flags &= ~SRI_O_DOWN;
}
}
}
/* 这个实例用来检查是否可以开始故障转移,需要满足以下条件:
* 1) master必须处于客观下线条件。
* 2) 没有故障转移正在进行中。
* 3) 故障转移冷却中:在之前的failover_timeout*2的时间内有一个故障转移开始的企图。
* 我们还不知道我们是否能够赢得选举,所以有可能我们可是一个故障转移但是不做事情。
* 如果故障转移开始了,我们将会返回非0。 */
int sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) {
/* 非客观下线,不开始故障转移 */
if (!(master->flags & SRI_O_DOWN)) return 0;
/* 故障转移进行中,不开始 */
if (master->flags & SRI_FAILOVER_IN_PROGRESS) return 0;
/* 故障转移,冷却中 */
if (mstime() - master->failover_start_time <
master->failover_timeout * 2) {
if (master->failover_delay_logged != master->failover_start_time) {
time_t clock = (master->failover_start_time +
master->failover_timeout * 2) / 1000;
char ctimebuf[26];
ctime_r(&clock, ctimebuf);
ctimebuf[24] = '\0'; /* Remove newline. */
master->failover_delay_logged = master->failover_start_time;
serverLog(LL_WARNING,
"Next failover delay: I will not start a failover before %s",
ctimebuf);
}
return 0;
}
sentinelStartFailover(master);
return 1;
}
故障转移
领导选举
/* 设置master状态来开始一个故障转移 */
void sentinelStartFailover(sentinelRedisInstance *master) {
serverAssert(master->flags & SRI_MASTER);
master->failover_state = SENTINEL_FAILOVER_STATE_WAIT_START;
master->flags |= SRI_FAILOVER_IN_PROGRESS;
master->failover_epoch = ++sentinel.current_epoch;
sentinelEvent(LL_WARNING, "+new-epoch", master, "%llu",
(unsigned long long) sentinel.current_epoch);
sentinelEvent(LL_WARNING, "+try-failover", master, "%@");
master->failover_start_time = mstime() + rand() % SENTINEL_MAX_DESYNC;
master->failover_state_change_time = mstime();
}
// 发起一次投票
void sentinelHandleRedisInstance(sentinelRedisInstance *ri) {
...
/* 如果当前实例是master */
if (ri->flags & SRI_MASTER) {
...
if (sentinelStartFailoverIfNeeded(ri))
sentinelAskMasterStateToOtherSentinels(ri, SENTINEL_ASK_FORCED);
sentinelFailoverStateMachine(ri);
...
}
}
void sentinelCommand(client *c) {
if (!strcasecmp(c->argv[1]->ptr, "masters")) {
...
} else if (!strcasecmp(c->argv[1]->ptr, "is-master-down-by-addr")) {
/* SENTINEL IS-MASTER-DOWN-BY-ADDR
* 参数:
* ip:port是要检查的master的ip和port,注意这个命令将不会通过name检查,
* 因为理论上来说,不同的sentinel可能监控带有相同name的不同master。
* current-epoch是为了理解我们是否被允许进行一次故障转移的投票。
* 每一个sentinel对于一个epoch仅能投票一次。
* runid不为空意味着我们需要为了故障转移而投票,否则将会仅进行查询。*/
sentinelRedisInstance *ri;
long long req_epoch;
uint64_t leader_epoch = 0;
char *leader = NULL;
long port;
int isdown = 0;
if (c->argc != 6) goto numargserr;
if (getLongFromObjectOrReply(c, c->argv[3], &port, NULL) != C_OK ||
getLongLongFromObjectOrReply(c, c->argv[4], &req_epoch, NULL)
!= C_OK)
return;
ri = getSentinelRedisInstanceByAddrAndRunID(sentinel.masters,
c->argv[2]->ptr, port, NULL);
/* 是否存在,是否是master,是否是主观下线状态?
* 注意:如果我们处于TILT状态,我们总是回复0 */
if (!sentinel.tilt && ri && (ri->flags & SRI_S_DOWN) &&
(ri->flags & SRI_MASTER))
isdown = 1;
/* 为这个master投票或者拉取之前的投票结果 */
if (ri && ri->flags & SRI_MASTER && strcasecmp(c->argv[5]->ptr, "*")) {
leader = sentinelVoteLeader(ri, (uint64_t) req_epoch,
c->argv[5]->ptr,
&leader_epoch);
}
/* 返回回复:down state, leader, vote epoch */
addReplyMultiBulkLen(c, 3);
addReply(c, isdown ? shared.cone : shared.czero);
addReplyBulkCString(c, leader ? leader : "*");
addReplyLongLong(c, (long long) leader_epoch);
if (leader) sdsfree(leader);
} else if (!strcasecmp(c->argv[1]->ptr, "reset")) {
/* SENTINEL RESET */
if (c->argc != 3) goto numargserr;
addReplyLongLong(c, sentinelResetMastersByPattern(c->argv[2]->ptr, SENTINEL_GENERATE_EVENT));
} else {
...
}
}
/* 接收SENTINEL is-master-down-by-addr命令回复 */
void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *privdata) {
sentinelRedisInstance *ri = privdata;
instanceLink *link = c->data;
redisReply *r;
if (!reply || !link) return;
link->pending_commands--;
r = reply;
/* 忽略错误或者不期望的回复。
* 注意:如果命令回复了错误,我们将会在timeout之后清理SRI_MASTER_DOWN标志。
* 回复格式:0: 主节点下线状态 1:runid 2: epoch */
if (r->type == REDIS_REPLY_ARRAY && r->elements == 3 &&
r->element[0]->type == REDIS_REPLY_INTEGER &&
r->element[1]->type == REDIS_REPLY_STRING &&
r->element[2]->type == REDIS_REPLY_INTEGER) {
ri->last_master_down_reply_time = mstime();
if (r->element[0]->integer == 1) {
ri->flags |= SRI_MASTER_DOWN;
} else {
ri->flags &= ~SRI_MASTER_DOWN;
}
if (strcmp(r->element[1]->str, "*")) {
/* 如果runid不是*,说明对端sentinel进行了一次投票。 */
sdsfree(ri->leader);
if ((long long) ri->leader_epoch != r->element[2]->integer)
serverLog(LL_WARNING,
"%s voted for %s %llu", ri->name,
r->element[1]->str,
(unsigned long long) r->element[2]->integer);
ri->leader = sdsnew(r->element[1]->str);
ri->leader_epoch = r->element[2]->integer;
}
}
}
故障转移
void sentinelFailoverStateMachine(sentinelRedisInstance *ri) {
serverAssert(ri->flags & SRI_MASTER);
if (!(ri->flags & SRI_FAILOVER_IN_PROGRESS)) return;
switch (ri->failover_state) {
case SENTINEL_FAILOVER_STATE_WAIT_START:
sentinelFailoverWaitStart(ri);
break;
case SENTINEL_FAILOVER_STATE_SELECT_SLAVE:
sentinelFailoverSelectSlave(ri);
break;
case SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE:
sentinelFailoverSendSlaveOfNoOne(ri);
break;
case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION:
sentinelFailoverWaitPromotion(ri);
break;
case SENTINEL_FAILOVER_STATE_RECONF_SLAVES:
sentinelFailoverReconfNextSlave(ri);
break;
}
}
// leader选举成功
void sentinelFailoverWaitStart(sentinelRedisInstance *ri) {
char *leader;
int isleader;
/* 检查我们是否是当前故障转移纪元的leader */
leader = sentinelGetLeader(ri, ri->failover_epoch);
isleader = leader && strcasecmp(leader, sentinel.myid) == 0;
sdsfree(leader);
/* 如果我不是领导者,并且也不是通过SENTINEL FAILOVER强行开启的故障转移,那我们不能继续 */
if (!isleader && !(ri->flags & SRI_FORCE_FAILOVER)) {
int election_timeout = SENTINEL_ELECTION_TIMEOUT;
/* 选举超时的时间是max(SENTINEL_ELECTION_TIMEOUT,failover_timeout) */
if (election_timeout > ri->failover_timeout)
election_timeout = ri->failover_timeout;
/* 如果我在选举超时时间内都没有成为leader,则中止故障转移过程 */
if (mstime() - ri->failover_start_time > election_timeout) {
sentinelEvent(LL_WARNING, "-failover-abort-not-elected", ri, "%@");
sentinelAbortFailover(ri);
}
return;
}
sentinelEvent(LL_WARNING, "+elected-leader", ri, "%@");
if (sentinel.simfailure_flags & SENTINEL_SIMFAILURE_CRASH_AFTER_ELECTION)
sentinelSimFailureCrash();
ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE;
ri->failover_state_change_time = mstime();
sentinelEvent(LL_WARNING, "+failover-state-select-slave", ri, "%@");
}
// 选择要晋升的slave
void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) {
sentinelRedisInstance *slave = sentinelSelectSlave(ri);
/* 这个状态下我们不处理超时,因为这个函数会自己中止或者进入下一阶段 */
if (slave == NULL) {
sentinelEvent(LL_WARNING, "-failover-abort-no-good-slave", ri, "%@");
sentinelAbortFailover(ri);
} else {
sentinelEvent(LL_WARNING, "+selected-slave", slave, "%@");
slave->flags |= SRI_PROMOTED;
ri->promoted_slave = slave;
ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE;
ri->failover_state_change_time = mstime();
sentinelEvent(LL_NOTICE, "+failover-state-send-slaveof-noone",
slave, "%@");
}
}
/* 选择一个合适的slave来进行晋升。这个算法仅仅允许满足下列的实例:
* 1) 不带有下列标志:S_DOWN, O_DOWN, DISCONNECTED。
* 2) 最后收到PING回复的时间不超过5个PING周期。
* 3) info_refresh不超过3个INFO周期。
* 4) master_link_down_time到现在的时间不超过:
* (now - master->s_down_since_time) + (master->down_after_period * 10)。
* 基本上,从我们的视角看到master下线,slave将会被断开不超过10个down-after-period。
* 这个想法是,因为master下线了,所以slave将会堆积,但不应该堆积过久。无论如何,
* 我们应该根据复制偏移量选择一个最好的slave。
* 5) slave优先级不能为0,不然我们会放弃这个。
* 满足以上条件时,我们将会按照以下条件排序:
* - 更小的优先级
* - 更大的复制偏移量
* - 更小字典序的runid
* 如果找到了合适的slave,将会返回,没找到则返回NULL */
/* sentinelSelectSlave()的辅助函数,被用于qsort()来选出"better first"的slave。 */
int compareSlavesForPromotion(const void *a, const void *b) {
sentinelRedisInstance **sa = (sentinelRedisInstance **) a,
**sb = (sentinelRedisInstance **) b;
char *sa_runid, *sb_runid;
/* 选择最大优先级 */
if ((*sa)->slave_priority != (*sb)->slave_priority)
return (*sa)->slave_priority - (*sb)->slave_priority;
/* 选择最大复制量 */
if ((*sa)->slave_repl_offset > (*sb)->slave_repl_offset) {
return -1; /* a < b */
} else if ((*sa)->slave_repl_offset < (*sb)->slave_repl_offset) {
return 1; /* a > b */
}
/* 选择最小的runid,注意:低版本的redis不会在INFO发布runid,所以是NULL */
sa_runid = (*sa)->runid;
sb_runid = (*sb)->runid;
if (sa_runid == NULL && sb_runid == NULL) return 0;
else if (sa_runid == NULL) return 1; /* a > b */
else if (sb_runid == NULL) return -1; /* a < b */
return strcasecmp(sa_runid, sb_runid);
}
sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) {
sentinelRedisInstance **instance =
zmalloc(sizeof(instance[0]) * dictSize(master->slaves));
sentinelRedisInstance *selected = NULL;
int instances = 0;
dictIterator *di;
dictEntry *de;
mstime_t max_master_down_time = 0;
if (master->flags & SRI_S_DOWN)
max_master_down_time += mstime() - master->s_down_since_time;
max_master_down_time += master->down_after_period * 10;
di = dictGetIterator(master->slaves);
while ((de = dictNext(di)) != NULL) {
sentinelRedisInstance *slave = dictGetVal(de);
mstime_t info_validity_time;
if (slave->flags & (SRI_S_DOWN | SRI_O_DOWN)) continue;
if (slave->link->disconnected) continue;
if (mstime() - slave->link->last_avail_time > SENTINEL_PING_PERIOD * 5) continue;
if (slave->slave_priority == 0) continue;
/* 如果master处于SDOWN状态,我们将会从slaves每1秒获取一次INFO信息,否则是每10s一次 */
if (master->flags & SRI_S_DOWN)
info_validity_time = SENTINEL_PING_PERIOD * 5;
else
info_validity_time = SENTINEL_INFO_PERIOD * 3;
if (mstime() - slave->info_refresh > info_validity_time) continue;
if (slave->master_link_down_time > max_master_down_time) continue;
instance[instances++] = slave;
}
dictReleaseIterator(di);
if (instances) {
qsort(instance, instances, sizeof(sentinelRedisInstance *),
compareSlavesForPromotion);
selected = instance[0];
}
zfree(instance);
return selected;
}
// 提升选定的slave
void sentinelFailoverSendSlaveOfNoOne(sentinelRedisInstance *ri) {
int retval;
/* 如果和要晋升的slave断开了,我们无法发送命令。重试直到超时然后中止 */
if (ri->promoted_slave->link->disconnected) {
if (mstime() - ri->failover_state_change_time > ri->failover_timeout) {
sentinelEvent(LL_WARNING, "-failover-abort-slave-timeout", ri, "%@");
sentinelAbortFailover(ri);
}
return;
}
/* 发送SLAVEOF NO ONE到slave,从而把这个slave转换成一个master,
* 我们注册了一个通用的回调,因为我们不关心回复的内容,
* 我们将会通过不断检查INFO的返回来判断是否切换成功:slave -> master */
retval = sentinelSendSlaveOf(ri->promoted_slave, NULL, 0);
if (retval != C_OK) return;
sentinelEvent(LL_NOTICE, "+failover-state-wait-promotion",
ri->promoted_slave, "%@");
ri->failover_state = SENTINEL_FAILOVER_STATE_WAIT_PROMOTION;
ri->failover_state_change_time = mstime();
}
// 等待要晋升的slave晋升完成
/* 我们将会通过一直检查INFO命令的输出来确定是否这个slave已经转变成了master */
void sentinelFailoverWaitPromotion(sentinelRedisInstance *ri) {
/* 仅仅处理这个超时。切换到下一个状态是通过解析INFO命令的回复来确定slave的晋升的 */
if (mstime() - ri->failover_state_change_time > ri->failover_timeout) {
sentinelEvent(LL_WARNING, "-failover-abort-slave-timeout", ri, "%@");
sentinelAbortFailover(ri);
}
}
/* 处理从master接收到的INFO命令的回复 */
void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
...
/* ---------------------------- Acting half -----------------------------
/* 如果处于TILT模式,则只会记录相关信息,不执行某些操作 */
/* 当上次报告的角色和本次报告的不一样时 */
if (role != ri->role_reported) {
ri->role_reported_time = mstime();
ri->role_reported = role;
if (role == SRI_SLAVE) ri->slave_conf_change_time = mstime();
/* 如果本次汇报的角色配置和当前配置是一致的,我们记录+role-change事件,
* 如果本次汇报的角色配置和当前配置不一致,我们记录-role-change事件。 */
sentinelEvent(LL_VERBOSE,
((ri->flags & (SRI_MASTER | SRI_SLAVE)) == role) ?
"+role-change" : "-role-change",
ri, "%@ new reported role is %s",
role == SRI_MASTER ? "master" : "slave",
ri->flags & SRI_MASTER ? "master" : "slave");
}
/* 下面的行为不能在TILT模式下执行 */
if (sentinel.tilt) return;
/* 处理 master -> slave 角色转变 */
if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE) {
/* 我们什么都不做,但是一个声明为slave的master被sentinel认为是无法访问的,
* 如果该实例一直这样报告,我们将会认为它是主观下线的,最终可能会触发一次故障转移。 */
}
/* 处理slave->master的角色转变 */
if ((ri->flags & SRI_SLAVE) && role == SRI_MASTER) {
/* 如果该slave是晋升的slave,我们需要修改故障转移状态机 */
if ((ri->flags & SRI_PROMOTED) &&
(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
(ri->master->failover_state ==
SENTINEL_FAILOVER_STATE_WAIT_PROMOTION)) {
/* 注意:我们确认了slave已经重新配置为了master,所以我们把master的配置纪元作为当前纪元,
* 我们是通过这个纪元赢得故障转移的选举的。
* 这将会强制其他sentinels更新它们的配置(假设没有一个更新的纪元可用)。 */
ri->master->config_epoch = ri->master->failover_epoch;
ri->master->failover_state = SENTINEL_FAILOVER_STATE_RECONF_SLAVES;
ri->master->failover_state_change_time = mstime();
sentinelFlushConfig();
sentinelEvent(LL_WARNING, "+promoted-slave", ri, "%@");
if (sentinel.simfailure_flags &
SENTINEL_SIMFAILURE_CRASH_AFTER_PROMOTION)
sentinelSimFailureCrash();
sentinelEvent(LL_WARNING, "+failover-state-reconf-slaves",
ri->master, "%@");
sentinelCallClientReconfScript(ri->master, SENTINEL_LEADER,
"start", ri->master->addr, ri->addr);
sentinelForceHelloUpdateForMaster(ri->master);
} else {
/* 另外一个slave转化为了master。我们将原来master重新配置为slave。
* 在此之前等待8s时间,来接收新的配置,减少数据包乱序带来的影响。 */
mstime_t wait_time = SENTINEL_PUBLISH_PERIOD * 4;
if (!(ri->flags & SRI_PROMOTED) &&
sentinelMasterLooksSane(ri->master) &&
sentinelRedisInstanceNoDownFor(ri, wait_time) &&
mstime() - ri->role_reported_time > wait_time) {
int retval = sentinelSendSlaveOf(ri,
ri->master->addr->ip,
ri->master->addr->port);
if (retval == C_OK)
sentinelEvent(LL_NOTICE, "+convert-to-slave", ri, "%@");
}
}
}
/* slaves开始跟从一个新的master */
if ((ri->flags & SRI_SLAVE) &&
role == SRI_SLAVE &&
(ri->slave_master_port != ri->master->addr->port ||
strcasecmp(ri->slave_master_host, ri->master->addr->ip))) {
mstime_t wait_time = ri->master->failover_timeout;
/* 在更新slave之前确保master是正常的 */
if (sentinelMasterLooksSane(ri->master) &&
sentinelRedisInstanceNoDownFor(ri, wait_time) &&
mstime() - ri->slave_conf_change_time > wait_time) {
int retval = sentinelSendSlaveOf(ri,
ri->master->addr->ip,
ri->master->addr->port);
if (retval == C_OK)
sentinelEvent(LL_NOTICE, "+fix-slave-config", ri, "%@");
}
}
/* 检查slave重配置的进度状态 */
if ((ri->flags & SRI_SLAVE) && role == SRI_SLAVE &&
(ri->flags & (SRI_RECONF_SENT | SRI_RECONF_INPROG))) {
/* SRI_RECONF_SENT -> SRI_RECONF_INPROG. */
if ((ri->flags & SRI_RECONF_SENT) &&
ri->slave_master_host &&
strcmp(ri->slave_master_host,
ri->master->promoted_slave->addr->ip) == 0 &&
ri->slave_master_port == ri->master->promoted_slave->addr->port) {
ri->flags &= ~SRI_RECONF_SENT;
ri->flags |= SRI_RECONF_INPROG;
sentinelEvent(LL_NOTICE, "+slave-reconf-inprog", ri, "%@");
}
/* SRI_RECONF_INPROG -> SRI_RECONF_DONE */
if ((ri->flags & SRI_RECONF_INPROG) &&
ri->slave_master_link_status == SENTINEL_MASTER_LINK_STATUS_UP) {
ri->flags &= ~SRI_RECONF_INPROG;
ri->flags |= SRI_RECONF_DONE;
sentinelEvent(LL_NOTICE, "+slave-reconf-done", ri, "%@");
}
}
}
// 配置其它的slaves同步新的master
/* 发送slave of 对所有未完成配置更新的slaves */
void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) {
dictIterator *di;
dictEntry *de;
int in_progress = 0;
di = dictGetIterator(master->slaves);
while ((de = dictNext(di)) != NULL) {
sentinelRedisInstance *slave = dictGetVal(de);
if (slave->flags & (SRI_RECONF_SENT | SRI_RECONF_INPROG))
in_progress++;
}
dictReleaseIterator(di);
di = dictGetIterator(master->slaves);
while (in_progress < master->parallel_syncs &&
(de = dictNext(di)) != NULL) {
sentinelRedisInstance *slave = dictGetVal(de);
int retval;
/* 跳过晋升的slave和已经完成配置的slave */
if (slave->flags & (SRI_PROMOTED | SRI_RECONF_DONE)) continue;
/* 如果发送SLAVEOF 之后超过10s,则认为超时,我们认为它已经完成,
* sentinels将会检查出这种情况并且在之后进行修复 */
if ((slave->flags & SRI_RECONF_SENT) &&
(mstime() - slave->slave_reconf_sent_time) >
SENTINEL_SLAVE_RECONF_TIMEOUT) {
sentinelEvent(LL_NOTICE, "-slave-reconf-sent-timeout", slave, "%@");
slave->flags &= ~SRI_RECONF_SENT;
slave->flags |= SRI_RECONF_DONE;
}
/* 对于断连的或者处于同步中的,我们直接跳过 */
if (slave->flags & (SRI_RECONF_SENT | SRI_RECONF_INPROG)) continue;
if (slave->link->disconnected) continue;
/* 发送SLAVEOF 命令 */
retval = sentinelSendSlaveOf(slave,
master->promoted_slave->addr->ip,
master->promoted_slave->addr->port);
if (retval == C_OK) {
slave->flags |= SRI_RECONF_SENT;
slave->slave_reconf_sent_time = mstime();
sentinelEvent(LL_NOTICE, "+slave-reconf-sent", slave, "%@");
in_progress++;
}
}
dictReleaseIterator(di);
/* 检查是否所有的slaves都已经重新配置或者处理了超时 */
sentinelFailoverDetectEnd(master);
}
void sentinelFailoverDetectEnd(sentinelRedisInstance *master) {
int not_reconfigured = 0, timeout = 0;
dictIterator *di;
dictEntry *de;
mstime_t elapsed = mstime() - master->failover_state_change_time;
/* 如果这个新晋升的slave不可达,我们不认为故障转移完成 */
if (master->promoted_slave == NULL ||
master->promoted_slave->flags & SRI_S_DOWN)
return;
/* 如果所有可达的slaves都已经配置好了,则故障转移就结束了 */
di = dictGetIterator(master->slaves);
while ((de = dictNext(di)) != NULL) {
sentinelRedisInstance *slave = dictGetVal(de);
if (slave->flags & (SRI_PROMOTED | SRI_RECONF_DONE)) continue;
if (slave->flags & SRI_S_DOWN) continue;
not_reconfigured++;
}
dictReleaseIterator(di);
/* 如果故障转移超时了,我们强制结束 */
if (elapsed > master->failover_timeout) {
not_reconfigured = 0;
timeout = 1;
sentinelEvent(LL_WARNING, "+failover-end-for-timeout", master, "%@");
}
if (not_reconfigured == 0) {
sentinelEvent(LL_WARNING, "+failover-end", master, "%@");
master->failover_state = SENTINEL_FAILOVER_STATE_UPDATE_CONFIG;
master->failover_state_change_time = mstime();
}
/* 如果是因为超时导致的,则向所有还没有同步master的slaves发送slaveof命令 */
if (timeout) {
dictIterator *di;
dictEntry *de;
di = dictGetIterator(master->slaves);
while ((de = dictNext(di)) != NULL) {
sentinelRedisInstance *slave = dictGetVal(de);
int retval;
if (slave->flags & (SRI_RECONF_DONE | SRI_RECONF_SENT)) continue;
if (slave->link->disconnected) continue;
retval = sentinelSendSlaveOf(slave,
master->promoted_slave->addr->ip,
master->promoted_slave->addr->port);
if (retval == C_OK) {
sentinelEvent(LL_NOTICE, "+slave-reconf-sent-be", slave, "%@");
slave->flags |= SRI_RECONF_SENT;
}
}
dictReleaseIterator(di);
}
}
/* 这个函数当slave处于SENTINEL_FAILOVER_STATE_UPDATE_CONFIG状态时被调用。
* 在这种情况下,我们将会把master从master表中移除,并把晋升的slave加入master表。 */
void sentinelFailoverSwitchToPromotedSlave(sentinelRedisInstance *master) {
sentinelRedisInstance *ref = master->promoted_slave ?
master->promoted_slave : master;
sentinelEvent(LL_WARNING, "+switch-master", master, "%s %s %d %s %d",
master->name, master->addr->ip, master->addr->port,
ref->addr->ip, ref->addr->port);
sentinelResetMasterAndChangeAddress(master, ref->addr->ip, ref->addr->port);
}
JedisSentinelPool实现:主备切换
- 找到任意一个可用的sentinel,通过sentinel get-master-addr-by-name获取master地址
- 通过订阅+switch-master频道,获取master地址的变更事件
- 缺点:过于依赖sentinel的+switch-master,事件丢失则无法完成客户端切换
private HostAndPort initSentinels(Set sentinels, final String masterName) {
HostAndPort master = null;
boolean sentinelAvailable = false;
log.info("Trying to find master from available Sentinels...");
for (String sentinel : sentinels) {
final HostAndPort hap = HostAndPort.parseString(sentinel);
log.debug("Connecting to Sentinel {}", hap);
Jedis jedis = null;
try {
jedis = new Jedis(hap.getHost(), hap.getPort(), sentinelConnectionTimeout, sentinelSoTimeout);
if (sentinelUser != null) {
jedis.auth(sentinelUser, sentinelPassword);
} else if (sentinelPassword != null) {
jedis.auth(sentinelPassword);
}
if (sentinelClientName != null) {
jedis.clientSetname(sentinelClientName);
}
List masterAddr = jedis.sentinelGetMasterAddrByName(masterName);
// connected to sentinel...
sentinelAvailable = true;
if (masterAddr == null || masterAddr.size() != 2) {
log.warn("Can not get master addr, master name: {}. Sentinel: {}", masterName, hap);
continue;
}
master = toHostAndPort(masterAddr);
log.debug("Found Redis master at {}", master);
break;
} catch (JedisException e) {
// resolves #1036, it should handle JedisException there's another chance
// of raising JedisDataException
log.warn(
"Cannot get master address from sentinel running @ {}. Reason: {}. Trying next one.", hap, e);
} finally {
if (jedis != null) {
jedis.close();
}
}
}
if (master == null) {
if (sentinelAvailable) {
// can connect to sentinel, but master name seems to not
// monitored
throw new JedisException("Can connect to sentinel, but " + masterName
+ " seems to be not monitored...");
} else {
throw new JedisConnectionException("All sentinels down, cannot determine where is "
+ masterName + " master is running...");
}
}
log.info("Redis master running at {}, starting Sentinel listeners...", master);
for (String sentinel : sentinels) {
final HostAndPort hap = HostAndPort.parseString(sentinel);
MasterListener masterListener = new MasterListener(masterName, hap.getHost(), hap.getPort());
// whether MasterListener threads are alive or not, process can be stopped
masterListener.setDaemon(true);
masterListeners.add(masterListener);
masterListener.start();
}
return master;
}
protected class MasterListener extends Thread {
...
@Override
public void run() {
running.set(true);
while (running.get()) {
try {
// double check that it is not being shutdown
if (!running.get()) {
break;
}
j = new Jedis(host, port, sentinelConnectionTimeout, sentinelSoTimeout);
if (sentinelUser != null) {
j.auth(sentinelUser, sentinelPassword);
} else if (sentinelPassword != null) {
j.auth(sentinelPassword);
}
if (sentinelClientName != null) {
j.clientSetname(sentinelClientName);
}
// code for active refresh
List masterAddr = j.sentinelGetMasterAddrByName(masterName);
if (masterAddr == null || masterAddr.size() != 2) {
log.warn("Can not get master addr, master name: {}. Sentinel: {}:{}.", masterName, host, port);
} else {
initPool(toHostAndPort(masterAddr));
}
j.subscribe(new JedisPubSub() {
@Override
public void onMessage(String channel, String message) {
log.debug("Sentinel {}:{} published: {}.", host, port, message);
String[] switchMasterMsg = message.split(" ");
if (switchMasterMsg.length > 3) {
if (masterName.equals(switchMasterMsg[0])) {
initPool(toHostAndPort(Arrays.asList(switchMasterMsg[3], switchMasterMsg[4])));
} else {
log.debug(
"Ignoring message on +switch-master for master name {}, our master name is {}",
switchMasterMsg[0], masterName);
}
} else {
log.error(
"Invalid message received on Sentinel {}:{} on channel +switch-master: {}", host,
port, message);
}
}
}, "+switch-master");
} catch (JedisException e) {
if (running.get()) {
log.error("Lost connection to Sentinel at {}:{}. Sleeping 5000ms and retrying.", host,
port, e);
try {
Thread.sleep(subscribeRetryWaitTimeMillis);
} catch (InterruptedException e1) {
log.error("Sleep interrupted: ", e1);
}
} else {
log.debug("Unsubscribing from Sentinel at {}:{}", host, port);
}
} finally {
if (j != null) {
j.close();
}
}
}
}
...
}
Redisson实现:主从切换
- 通过sentinel get-master-addr-by-name获取master地址
- 通过sentinel slaves获取slaves地址
- 通过周期性定时轮询来获取变更地址信息变更