源码版本:redis-3.2.3
1、redis的主从复制实现简单却功能强大,其具有以下特点:
1. 一个master支持多个slave连接,slave可以接受其他slave的连接
2. 主从同步时,master和slave都是非阻塞的
2、redis主从复制可以用来:
1. data redundancy
2. slave作为master的扩展,提供一些read-only的服务
3. 可以将数据持久化放在slave做,从而提升master性能
3、redis 主从复制配置项(redis.conf):
slaveof <masterip> <masterport>
表示该redis服务作为slave,masterip和masterport分别为master的ip和port。
如果没有其他特殊需求,只配置此选项即可;启动slave时加载该配置文件。
masterauth <master-password>
如果master设置了安全密码,则此处设置为相应的密码
slave-serve-stale-data yes
当slave丢失master或者同步正在进行时,如果发生对slave的服务请求:
slave-serve-stale-data设置为yes则slave依然正常提供服务
slave-serve-stale-data设置为no则slave返回client错误:"SYNC with master in progress"
repl-ping-slave-period 10
master向slave发送PING的时间间隔,单位为秒(见下文replicationCron中通过replicationFeedSlaves周期性发送PING的代码)
repl-timeout 60
复制超时时间(秒):slave侧用于判断连接/握手超时、接收.rdb数据超时、与master无交互超时;master侧用于判断slave的ACK超时
4、代码层分析
从主从复制的调度中心replicationCron函数开始分析,它由serverCron周期性调用:
/* Abridged excerpt of serverCron(): only the call into the replication
 * cron is shown here; the rest of the body (including the return) is omitted. */
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
/* Replication cron function -- used to reconnect to master,
* detect transfer failures, start background RDB transfers and so forth. */
run_with_period(1000) replicationCron(); // executed once per second (per the original note)
}
/* Periodic replication housekeeping, covering both the slave role
 * (timeouts, connecting to the master, sending ACKs) and the master role
 * (pinging slaves, dropping timed-out slaves, releasing unused resources,
 * starting a BGSAVE for waiting slaves).
 *
 * NOTE(review): this is an abridged excerpt. ping_argv, idle, max_idle,
 * slaves_waiting and mincapa are used without a visible declaration, and
 * li/ln are only declared inside the slave-timeout block below although
 * they are also used outside it -- presumably the upstream function
 * declares them at the top; confirm against replication.c. */
void replicationCron(void) {
static long long replication_cron_loops = 0;
// Slave side: the non-blocking connect / handshake with the master timed out.
if (server.masterhost &&
(server.repl_state == REPL_STATE_CONNECTING ||
slaveIsInHandshakeState()) &&
(time(NULL)-server.repl_transfer_lastio) > server.repl_timeout)
{
serverLog(LL_WARNING,"Timeout connecting to the MASTER...");
cancelReplicationHandshake();
}
/* Bulk transfer I/O timeout? */
// Slave side: timed out while receiving the .rdb payload.
if (server.masterhost && server.repl_state == REPL_STATE_TRANSFER &&
(time(NULL)-server.repl_transfer_lastio) > server.repl_timeout)
{
serverLog(LL_WARNING,"Timeout receiving bulk data from MASTER... If the problem persists try to set the 'repl-timeout' parameter in redis.conf to a larger value.");
cancelReplicationHandshake();
}
/* Timed out master when we are an already connected slave? */
// Slave side: connected to the master, but the last interaction is older than repl_timeout.
if (server.masterhost && server.repl_state == REPL_STATE_CONNECTED &&
(time(NULL)-server.master->lastinteraction) > server.repl_timeout)
{
serverLog(LL_WARNING,"MASTER timeout: no data nor PING received...");
freeClient(server.master);
}
// Slave side: check whether a connection to the master has to be started.
if (server.repl_state == REPL_STATE_CONNECT) {
serverLog(LL_NOTICE,"Connecting to MASTER %s:%d",
server.masterhost, server.masterport);
// Establish the socket connection to the master.
if (connectWithMaster() == C_OK) {
serverLog(LL_NOTICE,"MASTER <-> SLAVE sync started");
}
}
// Slave side: send an ACK to the master (skipped when the master client has CLIENT_PRE_PSYNC set).
if (server.masterhost && server.master &&
!(server.master->flags & CLIENT_PRE_PSYNC))
replicationSendAck();
// Master side: feed a PING to the slaves every repl_ping_slave_period iterations of this cron.
if ((replication_cron_loops % server.repl_ping_slave_period) == 0) {
ping_argv[0] = createStringObject("PING",4);
replicationFeedSlaves(server.slaves, server.slaveseldb,
ping_argv, 1);
decrRefCount(ping_argv[0]);
}
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
// Master side: write a single newline to every slave that is in either of these states:
//   1. waiting for a BGSAVE to start (SLAVE_STATE_WAIT_BGSAVE_START), or
//   2. waiting for a BGSAVE to end while the RDB child type is not RDB_CHILD_TYPE_SOCKET.
// Per the original note, the purpose is to refresh the slave's last-I/O timer.
if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START ||
(slave->replstate == SLAVE_STATE_WAIT_BGSAVE_END &&
server.rdb_child_type != RDB_CHILD_TYPE_SOCKET))
{
if (write(slave->fd, "\n", 1) == -1) {
/* Don't worry, it's just a ping. */
}
}
}
// Master side: disconnect online slaves whose last ACK is older than repl_timeout.
if (listLength(server.slaves)) {
listIter li;
listNode *ln;
listRewind(server.slaves,&li);
while((ln = listNext(&li))) {
client *slave = ln->value;
if (slave->replstate != SLAVE_STATE_ONLINE) continue;
if (slave->flags & CLIENT_PRE_PSYNC) continue;
if ((server.unixtime - slave->repl_ack_time) > server.repl_timeout)
{
serverLog(LL_WARNING, "Disconnecting timedout slave: %s",
replicationGetSlaveName(slave));
freeClient(slave); // drop the connection
}
}
}
// Master side: with no slaves attached for longer than repl_backlog_time_limit, free the replication backlog.
if (listLength(server.slaves) == 0 && server.repl_backlog_time_limit &&
server.repl_backlog)
{
time_t idle = server.unixtime - server.repl_no_slaves_since;
if (idle > server.repl_backlog_time_limit) {
freeReplicationBacklog();
serverLog(LL_NOTICE,
"Replication backlog freed after %d seconds "
"without connected slaves.",
(int) server.repl_backlog_time_limit);
}
}
// Master side: with AOF off and no slaves, flush the replication script cache if it is not empty.
if (listLength(server.slaves) == 0 &&
server.aof_state == AOF_OFF &&
listLength(server.repl_scriptcache_fifo) != 0)
{
replicationScriptCacheFlush();
}
// Master side: only when neither an RDB child nor an AOF child is running.
if (server.rdb_child_pid == -1 && server.aof_child_pid == -1) {
listRewind(server.slaves,&li);
// Over the slaves in SLAVE_STATE_WAIT_BGSAVE_START, collect: how many there are,
// the largest idle time, and the bitwise AND of their slave_capa flags.
while((ln = listNext(&li))) {
client *slave = ln->value;
if (slave->replstate == SLAVE_STATE_WAIT_BGSAVE_START) {
idle = server.unixtime - slave->lastinteraction;
if (idle > max_idle) max_idle = idle;
slaves_waiting++;
mincapa = (mincapa == -1) ? slave->slave_capa :
(mincapa & slave->slave_capa);
}
}
if (slaves_waiting && max_idle > server.repl_diskless_sync_delay) {
// At least one waiting slave has been idle longer than repl_diskless_sync_delay: start the BGSAVE.
startBgsaveForReplication(mincapa);
}
}
// Refresh the count of "good" slaves (per the original note: slaves whose lag is below the threshold).
refreshGoodSlavesCount();
replication_cron_loops++; /* Incremented with frequency 1 HZ. */
}
/* Slave-side handshake state machine, driven by server.repl_state:
 * PING -> AUTH -> REPLCONF listening-port -> REPLCONF capa -> PSYNC,
 * followed by registering readSyncBulkPayload as the readable handler.
 *
 * NOTE(review): abridged excerpt. The "..." lines mark elided code; the
 * declarations of sockerr, errlen, err and psync_result and the
 * error / write_error labels are not shown here. */
void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
...
/* Check for errors in the socket. */
if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &sockerr, &errlen) == -1)
sockerr = errno;
if (sockerr) {
serverLog(LL_WARNING,"Error condition on socket for SYNC: %s",
strerror(sockerr));
goto error;
}
// Send PING to the master, then return until the next event.
if (server.repl_state == REPL_STATE_CONNECTING) {
serverLog(LL_NOTICE,"Non blocking connect for SYNC fired the event.");
aeDeleteFileEvent(server.el,fd,AE_WRITABLE);
server.repl_state = REPL_STATE_RECEIVE_PONG;
err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"PING",NULL);
if (err) goto write_error;
return;
}
if (server.repl_state == REPL_STATE_RECEIVE_PONG) {
// Read the reply to PING; a '+' reply, "-NOAUTH" and
// "-ERR operation not permitted" are all accepted as a live master.
err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
if (err[0] != '+' &&
strncmp(err,"-NOAUTH",7) != 0 &&
strncmp(err,"-ERR operation not permitted",28) != 0)
{
serverLog(LL_WARNING,"Error reply to PING from master: '%s'",err);
sdsfree(err);
goto error;
} else {
serverLog(LL_NOTICE,
"Master replied to PING, replication can continue...");
}
sdsfree(err);
server.repl_state = REPL_STATE_SEND_AUTH;
}
// Authentication: send AUTH with server.masterauth.
// NOTE(review): as excerpted, the state moves straight to SEND_PORT, so the
// RECEIVE_AUTH branch below is never entered in this call, and err is neither
// checked nor freed here or in the following abridged steps. Upstream
// presumably handles the no-password case and the reply -- confirm.
if (server.repl_state == REPL_STATE_SEND_AUTH) {
err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"AUTH",server.masterauth,NULL);
server.repl_state = REPL_STATE_SEND_PORT;
}
if (server.repl_state == REPL_STATE_RECEIVE_AUTH) {
// Read the reply to AUTH.
err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
server.repl_state = REPL_STATE_SEND_PORT;
}
// Send our listening port to the master via "REPLCONF listening-port".
// NOTE(review): port is not freed in this excerpt.
if (server.repl_state == REPL_STATE_SEND_PORT) {
sds port = sdsfromlonglong(server.port);
err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
"listening-port",port, NULL);
server.repl_state = REPL_STATE_RECEIVE_PORT;
}
// Read the reply to "REPLCONF listening-port".
if (server.repl_state == REPL_STATE_RECEIVE_PORT) {
err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
server.repl_state = REPL_STATE_SEND_CAPA;
}
// Tell the master which RDB transfer format we can parse ("REPLCONF capa eof").
if (server.repl_state == REPL_STATE_SEND_CAPA) {
err = sendSynchronousCommand(SYNC_CMD_WRITE,fd,"REPLCONF",
"capa","eof",NULL);
server.repl_state = REPL_STATE_RECEIVE_CAPA;
}
// Read the reply to "REPLCONF capa eof".
if (server.repl_state == REPL_STATE_RECEIVE_CAPA) {
err = sendSynchronousCommand(SYNC_CMD_READ,fd,NULL);
server.repl_state = REPL_STATE_SEND_PSYNC;
}
// Send PSYNC to the master. Per the original note: with a cached master a
// partial resynchronization is attempted, otherwise a full one.
if (server.repl_state == REPL_STATE_SEND_PSYNC) {
if (slaveTryPartialResynchronization(fd,0) == PSYNC_WRITE_ERROR) {
err = sdsnew("Write error sending the PSYNC command.");
goto write_error;
}
server.repl_state = REPL_STATE_RECEIVE_PSYNC;
return;
}
// Read the master's reply to PSYNC and act on it.
psync_result = slaveTryPartialResynchronization(fd,1);
if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */
if (psync_result == PSYNC_CONTINUE) {
serverLog(LL_NOTICE, "MASTER <-> SLAVE sync: Master accepted a Partial Resynchronization.");
return;
}
disconnectSlaves(); /* Force our slaves to resync with us as well. */
freeReplicationBacklog(); /* Don't allow our chained slaves to PSYNC. */
// If the master does not support PSYNC, fall back to SYNC (the older synchronization mechanism).
if (psync_result == PSYNC_NOT_SUPPORTED) {
serverLog(LL_NOTICE,"Retrying with SYNC...");
if (syncWrite(fd,"SYNC\r\n",6,server.repl_syncio_timeout*1000) == -1) {
serverLog(LL_WARNING,"I/O error writing to MASTER: %s",
strerror(errno));
goto error;
}
}
/* Setup the non blocking download of the bulk file. */
// readSyncBulkPayload reads the sync file produced by the master from the
// connection; it may take several calls to read all of the data.
// Original author's note (not visible in this excerpt): once more than 8MB
// has been received, each read is followed by a flush to disk, because
// deferring all disk writes to the very end could cause a large latency spike.
if (aeCreateFileEvent(server.el,fd, AE_READABLE,readSyncBulkPayload,NULL)
== AE_ERR)
{
serverLog(LL_WARNING,
"Can't create readable event for SYNC: %s (fd=%d)",
strerror(errno),fd);
goto error;
}
...
}
/* Readable-event handler registered by syncWithMaster for the bulk transfer.
 *
 * NOTE(review): heavily abridged. Only the "transfer complete" branch is
 * shown, and it compares against REDIS_OK (older naming) rather than the
 * C_OK used by the other 3.2.3 excerpts -- confirm against replication.c. */
void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) {
// Per the original author's notes: reads the RDB size and the file content
// sent by the master and stores them in a local file; once everything has
// been read, rdbLoad is called to load the file, and restarting the AOF
// (startAppendOnly) is considered. The reading part is not shown here.
// Has the whole file been received?
if (server.repl_transfer_read == server.repl_transfer_size) {
serverLog(LL_NOTICE, "MASTER <-> SLAVE sync: Flushing old data");
// Empty the whole dataset. Original author's note: this is a very heavy
// operation; if a BGSAVE is in progress it defeats the snapshot's
// copy-on-write sharing and can consume a lot of physical memory.
emptyDb(replicationEmptyDbCallback);
// Load the RDB file into the in-memory data structures; per the original
// note this can take a considerable amount of time.
if (rdbLoad(server.rdb_filename) != REDIS_OK) {
...
}
}
}
master端:
master对slave的连接和普通client的连接统一处理:在接收到slave发出的SYNC命令后,执行syncCommand。syncCommand会查看当前状态,如果正在做快照,则等待;否则启动后台进程做快照。(注:以下master端代码使用redisClient、redisLog等旧命名,应摘自较早版本的redis源码,与上文3.2.3的命名不同。)
/* Master-side handler for the SYNC command: rejects the request when the
 * client still has pending replies, otherwise either handles an already
 * running background save (elided) or starts a new one and marks the client
 * as waiting for that BGSAVE to end.
 *
 * NOTE(review): abridged excerpt that uses older naming (redisClient,
 * redisLog, REDIS_OK) than the 3.2.3 excerpts elsewhere in this file. */
void syncCommand(redisClient *c) {
/* SYNC can't be issued when the server has pending data to send to
* the client about already issued commands. We need a fresh reply
* buffer registering the differences between the BGSAVE and the current
* dataset, so that we can copy to other slaves if needed. */
if (listLength(c->reply) != 0) {
addReplyError(c,"SYNC is invalid with pending input");
return;
}
redisLog(REDIS_NOTICE,"Slave ask for synchronization");
/* Here we need to check if there is a background saving operation
* in progress, or if it is required to start one */
if (server.bgsavechildpid != -1) {
.....
} else {
/* Ok we don't have a BGSAVE in progress, let's start one */
redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC");
if (rdbSaveBackground(server.dbfilename) != REDIS_OK) {
redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE");
addReplyError(c,"Unable to perform background save");
return;
}
c->replstate = REDIS_REPL_WAIT_BGSAVE_END;
}
}
/* Master-side handler that streams the dump file to a slave one chunk per
 * call: seek to the slave's current offset, read a chunk, write it to the
 * socket and advance the offset by the number of bytes actually written.
 *
 * NOTE(review): abridged excerpt. The "......" lines mark elided code; the
 * declarations of slave, buf, buflen and nwritten are not shown. */
void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) {
......
lseek(slave->repldbfd,slave->repldboff,SEEK_SET); // seek to the offset reached by the previous send
buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN); // read up to REDIS_IOBUF_LEN bytes (16K per the original note)
......
if ((nwritten = write(fd,buf,buflen)) == -1) { // transfer the chunk to the slave
if (errno != EAGAIN) {
redisLog(REDIS_WARNING,"Write error sending DB to slave: %s",
strerror(errno));
freeClient(slave);
}
return;
}
slave->repldboff += nwritten; // advance the sent offset by the bytes actually written
......
}
比我分析得更好的文章:
http://www.cnblogs.com/lukexwang/p/4711977.html