对于replication.c的源码分析,我将会分两部分介绍主从复制的过程和主从同步的复制《redis replication主
从复制的源码分析(2)》。本文主要分析slave连接master进行主从复制的过程实现。
redis-cli通过向从服务器发送slaveof命令,可以使从服务器去复制一个主服务器:
slaveof <masterip> <masterport>
主从复制的详细的步骤如下:
1、设置主服务器的地址和端口
2、建立套接字连接
3、发送ping命令
4、身份验证
5、发送端口信息
6、同步
7、命令传播
replicationSetMaster() 设置主服务器的地址和端口,初始化replication状态
void replicationSetMaster(char *ip, int port) {
sdsfree(server.masterhost);
server.masterhost = sdsnew(ip);
server.masterport = port;
if (server.master) freeClient(server.master);
disconnectAllBlockedClients(); /* Clients blocked in master, now slave. */
disconnectSlaves(); /* Force our slaves to resync with us as well. */
replicationDiscardCachedMaster(); /* Don't try a PSYNC. */
freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */
cancelReplicationHandshake();
server.repl_state = REPL_STATE_CONNECT;//设置repl_state,准备开始replication
server.master_repl_offset = 0;//初始化replication的偏移
server.repl_down_since = 0;
}
connectWithMaster()建立套接字连接,设置事件回调syncWithMaster()
/*
 * Create the socket towards the configured master and register
 * syncWithMaster() as the event handler for it.
 *
 * Returns C_OK when the socket was created and the handler registered
 * (server.repl_state becomes REPL_STATE_CONNECTING). Returns C_ERR when
 * the socket could not be created or the event could not be registered;
 * in the latter case the socket is closed before returning.
 */
int connectWithMaster(void) {
    int sock = anetTcpNonBlockBestEffortBindConnect(NULL,
        server.masterhost, server.masterport, NET_FIRST_BIND_ADDR);

    if (sock == -1) return C_ERR;

    /* syncWithMaster() is invoked on both readable and writable events. */
    if (aeCreateFileEvent(server.el, sock, AE_READABLE|AE_WRITABLE,
                          syncWithMaster, NULL) != AE_ERR)
    {
        server.repl_transfer_lastio = server.unixtime;
        server.repl_transfer_s = sock;
        /* Connection attempt is now in progress. */
        server.repl_state = REPL_STATE_CONNECTING;
        return C_OK;
    }

    /* Could not register the handler: give the socket back. */
    close(sock);
    return C_ERR;
}
syncWithMaster()发送ping命令,身份验证,发送端口信息,通知master自己可以解析rdb,进行同步。基本流程如下:
/*
 * File-event handler for the master socket during the replication
 * handshake (it is the callback registered by connectWithMaster()).
 * Each invocation advances server.repl_state by one or more steps:
 * PING -> AUTH -> REPLCONF listening-port -> REPLCONF capa eof -> PSYNC,
 * and finally installs readSyncBulkPayload() as the read handler and
 * switches to REPL_STATE_TRANSFER.
 *
 * NOTE: this is an abridged excerpt. "……" marks code omitted by the
 * article's author; the declarations of err, psync_result, dfd and
 * tmpfile are among the omitted parts.
 */
void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
……
// No replication is active (REPL_STATE_NONE): close the socket and return
if (server.repl_state == REPL_STATE_NONE) {
close(fd);
return;
}
……
// The socket is connected: send PING to the master
if (server.repl_state == REPL_STATE_CONNECTING) {
aeDeleteFileEvent(server.el,fd,AE_WRITABLE);
server.repl_state = REPL_STATE_RECEIVE_PONG;// now waiting for the PING reply
err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"PING",NULL);
if (err) goto write_error;
return;
}// The PING reply has arrived: read it
if (server.repl_state == REPL_STATE_RECEIVE_PONG) {
err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
// A '+' reply is accepted, and so are the -NOAUTH and
// "-ERR operation not permitted" errors; any other reply is a failure.
if (err[0] != '+' &&strncmp(err,"-NOAUTH",7) != 0 &&
strncmp(err,"-ERR operation not permitted",28) != 0)
{
……
goto error;
}
// NOTE(review): no sdsfree(err) is visible on this success path; it may
// have been dropped when the excerpt was abridged. Confirm against the full source.
server.repl_state = REPL_STATE_SEND_AUTH;// next step: send AUTH
}
// Authentication: send AUTH with server.masterauth to the master
if (server.repl_state == REPL_STATE_SEND_AUTH) {
if (server.masterauth) {
err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"AUTH",server.masterauth,NULL);
if (err) goto write_error;
server.repl_state = REPL_STATE_RECEIVE_AUTH;
return;
} else {// No masterauth configured: skip straight to REPL_STATE_SEND_PORT
server.repl_state = REPL_STATE_SEND_PORT;
}
}// Read the reply to AUTH
if (server.repl_state == REPL_STATE_RECEIVE_AUTH) {
err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
if (err[0] == '-') {
serverLog(LL_WARNING,"Unable to AUTH to MASTER: %s",err);
sdsfree(err);
goto error;
}
// NOTE(review): err is freed only on the error branch above; no sdsfree(err)
// is visible on the success path in this excerpt. Confirm against the full source.
server.repl_state = REPL_STATE_SEND_PORT;
}// Send our listening port to the master
if (server.repl_state == REPL_STATE_SEND_PORT) {
sds port = sdsfromlonglong(server.port);
err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
"listening-port",port, NULL);
……
server.repl_state = REPL_STATE_RECEIVE_PORT;
return;
}
if (server.repl_state == REPL_STATE_RECEIVE_PORT) {
// Read the reply to "REPLCONF listening-port"
err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
if (err[0] == '-') {
……
}
// NOTE(review): no sdsfree(err) is visible here either; possibly elided.
server.repl_state = REPL_STATE_SEND_CAPA;
}
// Advertise the "eof" capability to the master (per the author: tells the
// master that this slave can parse that RDB transfer format)
if (server.repl_state == REPL_STATE_SEND_CAPA) {
err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
"capa","eof",NULL);
if (err) goto write_error;
sdsfree(err);
server.repl_state = REPL_STATE_RECEIVE_CAPA;
return;
}// Read the reply to "REPLCONF capa eof"
if (server.repl_state == REPL_STATE_RECEIVE_CAPA) {
err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
if (err[0] == '-') {
……
}
sdsfree(err);
server.repl_state = REPL_STATE_SEND_PSYNC;
}
// Send PSYNC to the master. Per the author: a partial resynchronization is
// attempted when there is a cached master, otherwise a full resynchronization.
if (server.repl_state == REPL_STATE_SEND_PSYNC) {
if (slaveTryPartialResynchronization(fd,0) == PSYNC_WRITE_ERROR) {
err = sdsnew("Write error sending the PSYNC command.");
goto write_error;
}
server.repl_state = REPL_STATE_RECEIVE_PSYNC;
return;
}
/* If reached this point, we should be in REPL_STATE_RECEIVE_PSYNC. */
if (server.repl_state != REPL_STATE_RECEIVE_PSYNC) {
……
goto error;
}
// Read the master's reply to PSYNC and proceed with the synchronization
psync_result = slaveTryPartialResynchronization(fd,1);
if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */
if (psync_result == PSYNC_CONTINUE) {
serverLog(LL_NOTICE, "MASTER <-> SLAVE sync: Master accepted a Partial Resynchronization.");
return;
}
disconnectSlaves(); /* Force our slaves to resync with us as well. */
freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */
// The master does not support PSYNC: fall back to SYNC (the older mechanism)
if (psync_result == PSYNC_NOT_SUPPORTED) {
serverLog(LL_NOTICE,"Retrying with SYNC...");
if (syncWrite(fd,"SYNC\r\n",6,server.repl_syncio_timeout*1000) == -1) {
……
goto error;
}
}
……
// Register the read handler that will receive the synchronization payload
if (aeCreateFileEvent(server.el,fd, AE_READABLE,readSyncBulkPayload,NULL)
== AE_ERR)
{
……
}
// Enter the transfer state and reset the transfer bookkeeping fields
server.repl_state = REPL_STATE_TRANSFER;
server.repl_transfer_size = -1;
server.repl_transfer_read = 0;
server.repl_transfer_last_fsync_off = 0;
server.repl_transfer_fd = dfd;
server.repl_transfer_lastio = server.unixtime;
server.repl_transfer_tmpfile = zstrdup(tmpfile);
return;
error:
……
write_error: /* Handle sendSynchronousCommand(SYNC_CMD_WRITE) errors. */
……
}
接下来看看主从复制的调度中心replicationCron,主要负责监控主从复制过程中的各个状态,
并根据不同情况作出不同处理。
// replicationCron is the scheduling hub of replication. Per the author it is
// run once per second from serverCron, the callback of Redis's single time
// event. Abridged excerpt: "……" marks omitted code.
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
……
run_with_period(1000) replicationCron();
……
}
/*
 * Periodic replication housekeeping. In order, it handles: handshake,
 * transfer and master-link timeouts; connecting to the master when the
 * state is REPL_STATE_CONNECT; sending an ACK to the master; sending PING
 * and newline keepalives to slaves; dropping timed-out slaves; freeing an
 * idle replication backlog; flushing the script cache; and starting a
 * BGSAVE for slaves waiting in SLAVE_STATE_WAIT_BGSAVE_START.
 */
void replicationCron(void) {
static long long replication_cron_loops = 0;
// Slave side: the non-blocking connect / handshake has timed out
if (server.masterhost &&
(server.repl_state == REPL_STATE_CONNECTING ||
slaveIsInHandshakeState()) &&
(time(NULL)-server.repl_transfer_lastio) > server.repl_timeout)
{
cancelReplicationHandshake();
}
// Slave side: timed out while receiving the .rdb
if (server.masterhost && server.repl_state == REPL_STATE_TRANSFER &&
(time(NULL)-server.repl_transfer_lastio) > server.repl_timeout)
{
cancelReplicationHandshake();
}
// Slave side: connected, but no interaction with the master for longer
// than repl_timeout
if (server.masterhost && server.repl_state == REPL_STATE_CONNECTED &&
(time(NULL)-server.master->lastinteraction) > server.repl_timeout)
{
freeClient(server.master);
}
// Slave side: check whether we need to connect to the master
if (server.repl_state == REPL_STATE_CONNECT) {
serverLog(LL_NOTICE,"Connecting to MASTER %s:%d",
server.masterhost, server.masterport);
// Open the socket connection to the master
if (connectWithMaster() == C_OK) {
serverLog(LL_NOTICE,"MASTER <-> SLAVE sync started");
}
}// Slave side: send an ACK to the master
if (server.masterhost && server.master &&
!(server.master->flags & CLIENT_PRE_PSYNC))
replicationSendAck();
listIter li;
listNode *ln;
robj *ping_argv[1];
// Master side: send PING to the slaves every repl_ping_slave_period loops
if ((replication_cron_loops % server.repl_ping_slave_period) == 0) {
ping_argv[0] = createStringObject("PING",4);
replicationFeedSlaves(server.slaves, server.slaveseldb,
ping_argv, 1);
decrRefCount(ping_argv[0]);
}
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
// Master side: write a single newline to every slave that is either
// 1. waiting for the master to start producing an RDB file, or
// 2. waiting for an RDB (not of the socket child type) to finish.
// Per the author this refreshes the slave's last-I/O timer.
if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START ||
(slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END &&
server.rdb_child_type != RDB_CHILD_TYPE_SOCKET))
{
if (write(slave->fd, "\n", 1) == -1) {
/* Don't worry, it's just a ping. */
}
}
}// Master side: disconnect online slaves whose last ACK is older than repl_timeout
if (listLength(server.slaves)) {
listIter li;
listNode *ln;
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
if (slave->replstate != SLAVE_STATE_ONLINE) continue;
if (slave->flags & CLIENT_PRE_PSYNC) continue;
if ((server.unixtime - slave->repl_ack_time) > server.repl_timeout)
{
freeClient(slave);
}
}
}// Master side: with no slaves attached, free the replication backlog once
// it has been idle for longer than repl_backlog_time_limit
if (listLength(server.slaves) == 0 && server.repl_backlog_time_limit &&
server.repl_backlog)
{
time_t idle = server.unixtime - server.repl_no_slaves_since;
if (idle > server.repl_backlog_time_limit) {
freeReplicationBacklog();
}
}// Master side: AOF is off and there are no slaves, so flush the script cache
if (listLength(server.slaves) == 0 &&
server.aof_state == AOF_OFF &&
listLength(server.repl_scriptcache_fifo) != 0)
{
replicationScriptCacheFlush();
}// Master side: only when no RDB or AOF child process is running
if (server.rdb_child_pid == -1 && server.aof_child_pid == -1) {
time_t idle, max_idle = 0;
int slaves_waiting = 0;
int mincapa = -1;
listNode *ln;
listIter li;
listRewind(server.slaves,&li);
// Count the slaves in SLAVE_STATE_WAIT_BGSAVE_START, tracking the largest
// idle time and the capabilities common to all of them (bitwise AND)
while((ln = listNext(&li))) {
client *slave = ln->value;
if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) {
idle = server.unixtime - slave->lastinteraction;
if (idle > max_idle) max_idle = idle;
slaves_waiting++;
mincapa = (mincapa == -1) ? slave->slave_capa :
(mincapa & slave->slave_capa);
}
}
if (slaves_waiting && max_idle > server.repl_diskless_sync_delay) {
// A waiting slave has been idle longer than repl_diskless_sync_delay:
// start the BGSAVE for replication
startBgsaveForReplication(mincapa);
}
}
// Refresh the number of slaves whose lag is below the threshold (per the author)
refreshGoodSlavesCount();
replication_cron_loops++; /* Incremented with frequency 1 HZ. */
}
replication.c的主要函数
/* ---------------------------------- MASTER -------------------------------- */
void createReplicationBacklog(void) /* 创建复制积压缓冲区 */
void resizeReplicationBacklog(long long newsize) /* 调整复制积压缓冲区的大小*/
void freeReplicationBacklog(void) /* 释放复制积压缓冲区*/
void feedReplicationBacklog(void *ptr, size_t len) /* 将写命令添加到复制积压缓冲区*/
void feedReplicationBacklogWithObject(robj *o) /*将写命令添加到复制积压缓冲区,但以对象的格式作为参数 */
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) /* 将主数据库复制到从数据库 */
void replicationFeedMonitors(redisClient *c, list *monitors, int dictid,
robj **argv, int argc) /* 发送数据给monitor监听者 */
long long addReplyReplicationBacklog(redisClient *c, long long offset)
/* 将复制积压缓冲区的offset到end的添加client的reply*/
int masterTryPartialResynchronization(redisClient *c) /* 主服务器尝试部分重同步 */
void syncCommand(redisClient *c) /* 同步命令函数 */
void replconfCommand(redisClient *c) /* 此函数用于从服务器进行配置复制进程中的执行参数设置 */
void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) /* 给slave发送BULK数据 */
void updateSlavesWaitingBgsave(int bgsaveerr, int type) /* 此方法将用于后台保存进程快结束时调用,更新slave */
/* ----------------------------------- SLAVE -------------------------------- */
void replicationAbortSyncTransfer(void) /* 中止与master的同步操作 */
void replicationSendNewlineToMaster(void)
void replicationEmptyDbCallback(void *privdata)
void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask)
/* 从服务器读取同步的Sync的BULK数据 */
char *sendSynchronousCommand(int flags, int fd, ...) /* 从服务器给主服务器进行同步数据的命令和接收相应的回复 */
int slaveTryPartialResynchronization(int fd, int read_reply) /* 从服务器尝试部分重同步操作 */
void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask)
/* 与主服务器保持同步,期间包括发送ping命令,身份验证,发送端口信息 */
int connectWithMaster(void) /* 连接服务器,设置事件回调 syncWithMaster*/
void undoConnectWithMaster(void) /* 断开与主服务器的连接 */
int cancelReplicationHandshake(void) /* 当已经存在一个复制进程时,中止一个非阻塞的replication复制的尝试 */
void replicationSetMaster(char *ip, int port) /* 设置主服务器的ip地址和端口号 */
void replicationUnsetMaster(void)
void slaveofCommand(redisClient *c)
void roleCommand(redisClient *c)
void replicationSendAck(void) /* 发送ACK包给主服务器 ,告知当前的进程偏移量 */
/* ---------------------- MASTER CACHING FOR PSYNC -------------------------- */
void replicationCacheMaster(redisClient *c) /* 缓存主服务器信息 */
void replicationDiscardCachedMaster(void) /* 当某个从服务器将不会再回复的时候,可以释放掉缓存的主服务器信息 */
void replicationResurrectCachedMaster(int newfd) /* 将缓存主服务器复活 */
/* ------------------------- MIN-SLAVES-TO-WRITE --------------------------- */
void refreshGoodSlavesCount(void) /*刷新延迟小于阈值的slave的数量*/
void replicationScriptCacheInit(void)
void replicationScriptCacheFlush(void)
void replicationScriptCacheAdd(sds sha1)
int replicationScriptCacheExists(sds sha1)
void replicationCron(void) //主从复制的调度中心