redis replication
//////////////////////////////////////////////// slave /////////////////////////////////////////////
1. Slave init方式void slaveofCommand(redisClient *c) { if (!strcasecmp(c->argv[1]->ptr,"no") && !strcasecmp(c->argv[2]->ptr,"one")) { //该slave将变成一个master if (server.masterhost) { sdsfree(server.masterhost); server.masterhost = NULL; if (server.master) freeClient(server.master); if (server.replstate == REDIS_REPL_TRANSFER) replicationAbortSyncTransfer(); server.replstate = REDIS_REPL_NONE; redisLog(REDIS_NOTICE,"MASTER MODE enabled (user request)"); } } else { sdsfree(server.masterhost); server.masterhost = sdsdup(c->argv[1]->ptr); server.masterport = atoi(c->argv[2]->ptr); if (server.master) freeClient(server.master); if (server.replstate == REDIS_REPL_TRANSFER) replicationAbortSyncTransfer(); server.replstate = REDIS_REPL_CONNECT; redisLog(REDIS_NOTICE,"SLAVE OF %s:%d enabled (user request)", server.masterhost, server.masterport); } addReply(c,shared.ok); }注:该命令还可将一个slave变成一个master节点(slaveof no one)。
loadServerConfig { else if (!strcasecmp(argv[0],"slaveof") && argc == 3) { server.masterhost = sdsnew(argv[1]); server.masterport = atoi(argv[2]); server.replstate = REDIS_REPL_CONNECT; }
serverCron{ /* Replication cron function -- used to reconnect to master and * to detect transfer failures. */ if (!(loops % 10)) replicationCron(); server.cronloops++; } void replicationCron(void) { ... /* Check if we should connect to a MASTER */ if (server.replstate == REDIS_REPL_CONNECT) { redisLog(REDIS_NOTICE,"Connecting to MASTER..."); if (connectWithMaster() == REDIS_OK) { redisLog(REDIS_NOTICE,"MASTER <-> SLAVE sync started"); } } ... }下面的函数完成与master的连接,并且注册file event事件及回调函数syncWithMaster,并且slave进入 REDIS_REPL_CONNECTING状态。
int connectWithMaster(void) { int fd; fd = anetTcpNonBlockConnect(NULL,server.masterhost,server.masterport); ... if (aeCreateFileEvent(server.el,fd,AE_READABLE|AE_WRITABLE,syncWithMaster,NULL) == AE_ERR) ... server.repl_transfer_s = fd; server.replstate = REDIS_REPL_CONNECTING; return REDIS_OK; }连接成功之后,看是否需要密码验证,并且向master发送sync命令,注册新的file event事件来处理master发送的rdb数据(readSyncBulkPayload),然后进入 REDIS_REPL_TRANSFER状态,等待数据的传输。
syncWithMaster (aeEventLoop *el, int fd, void *privdata, int mask) { /* This event should only be triggered once since it is used to have a * non-blocking connect(2) to the master. It has been triggered when this * function is called, so we can delete it. */ aeDeleteFileEvent(server.el,fd,AE_READABLE|AE_WRITABLE); /*从客户端接收到slaveof no one,即该server将变成master,此时不再去sync数据*/ if (server.replstate == REDIS_REPL_NONE) { close(fd); return; } /* AUTH with the master if required. */ if(server.masterauth) { authlen = snprintf(authcmd,sizeof(authcmd),"AUTH %s\r\n",server.masterauth); if (syncWrite(fd,authcmd,authlen,server.repl_syncio_timeout) == -1) { //这是redis少有的阻塞io之一(通过select来实现),因为slave只有在完成sync之后才会对外提供服务 } /* Read the AUTH result. */ if (syncReadLine(fd,buf,1024,server.repl_syncio_timeout) == -1) … } /* 发送sync命令 */ if (syncWrite(fd,"SYNC \r\n",7,server.repl_syncio_timeout) == -1) /* 创建一个临时文件来保存从master传送过来的rdb数据 */ while(maxtries--) { snprintf(tmpfile,256, "temp-%d.%ld.rdb",(int)time(NULL),(long int)getpid()); dfd = open(tmpfile,O_CREAT|O_WRONLY|O_EXCL,0644); if (dfd != -1) break; sleep(1); } /*增加file event事件,来读取master的响应*/ if (aeCreateFileEvent(server.el,fd, AE_READABLE,readSyncBulkPayload,NULL) server.replstate = REDIS_REPL_TRANSFER; server.repl_transfer_left = -1; server.repl_transfer_fd = dfd; server.repl_transfer_lastio = time(NULL); server.repl_transfer_tmpfile = zstrdup(tmpfile); return; }到此请先看下面的master处理流程,等看完sendBulkToSlave的处理,再回来看接下来的函数:readSyncBulkPayload,这个函数就是用来接收master send to slave的rdb数据的。
void readSyncBulkPayload(aeEventLoop *el, int fd, void *privdata, int mask) { if (server.repl_transfer_left == -1) { //还没有接收到master发送过来的第一个报文:rdb文件大小的报文 if (syncReadLine(fd,buf,1024,server.repl_syncio_timeout) == -1) if (buf[0] == '-') { //master 出错 } else if (buf[0] == '\0') { //这是一个connection live的ping操作 server.repl_transfer_lastio = time(NULL); return; } else if (buf[0] != '$') { //其它报文,见master过程的sendBulkToSlave …} server.repl_transfer_left = strtol(buf+1,NULL,10); //赋值等待接收的数据量 return; } /* Read bulk data 真正的数据报文*/ readlen = (server.repl_transfer_left < (signed)sizeof(buf)) ? server.repl_transfer_left : (signed)sizeof(buf); nread = read(fd,buf,readlen); //读数据 server.repl_transfer_lastio = time(NULL); if (write(server.repl_transfer_fd,buf,nread) != nread) { //写到前面创建的临时文件 server.repl_transfer_left -= nread; /* Check if the transfer is now complete */ if (server.repl_transfer_left == 0) { //接收完毕 if (rename(server.repl_transfer_tmpfile,server.dbfilename) == -1) { …} redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Loading DB in memory"); emptyDb(); aeDeleteFileEvent(server.el,server.repl_transfer_s,AE_READABLE);//删除该file event事件 if (rdbLoad(server.dbfilename) != REDIS_OK) {//把rdb文件加载到内存 } zfree(server.repl_transfer_tmpfile); close(server.repl_transfer_fd); server.master = createClient(server.repl_transfer_s); //为该fd创建新的client,该client的file event为aeCreateFileEvent(server.el,fd,AE_READABLE, readQueryFromClient, c) server.master->flags |= REDIS_MASTER; server.master->authenticated = 1; server.replstate = REDIS_REPL_CONNECTED; redisLog(REDIS_NOTICE, "MASTER <-> SLAVE sync: Finished with success"); /* Rewrite the AOF file now that the dataset changed. */ if (server.appendonly) rewriteAppendOnlyFileBackground(); //写aof文件 } }该函数主要分三个过程:读取第一个长度报文,读取数据报文,结束时把rdb加载到内存,创建新的file event 可读事件(readQueryFromClient),更新slave server状态到 REDIS_REPL_CONNECTED。到此master-slave进入增量的命令同步,slave把来自master的更新命令当做一般的client命令来处理,slave也可对外提供服务。
////////////////////////////////////////////// master /////////////////////////////////////////////
3. Master slaveclient的状态转移Master在收到client发送过来的sync命令后,调用该回调函数:void syncCommand(redisClient *c) { /* 如果该client已经sync过了*/ if (c->flags & REDIS_SLAVE) return; /* 如果本server自身是一个slave(配置了masterhost),但是它处于一个非REDIS_REPL_CONNECTED 的状态,显然它还没有处于正常的同步状态,此时它还不应该被别人sync*/ if (server.masterhost && server.replstate != REDIS_REPL_CONNECTED) … /* SYNC can't be issued when the server has pending data to send to * the client about already issued commands. We need a fresh reply * buffer registering the differences between the BGSAVE and the current * dataset, so that we can copy to other slaves if needed. */ if (listLength(c->reply) != 0) { addReplyError(c,"SYNC is invalid with pending input"); return; } redisLog(REDIS_NOTICE,"Slave ask for synchronization"); /* 检查当前是否有后台save正在操作当中*/ if (server.bgsavechildpid != -1) { //检查是否有其它的slave刚好也在等待bgsave的完成 listRewind(server.slaves,&li); while((ln = listNext(&li))) { slave = ln->value; if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) break; } if (ln) { /* 是的,则把先前的这个slave的reply回复给新的这个client*/ listRelease(c->reply); c->reply = listDup(slave->reply); c->replstate = REDIS_REPL_WAIT_BGSAVE_END; redisLog(REDIS_NOTICE,"Waiting for end of BGSAVE for SYNC"); } else { /* 没有,则该client必须等待该bgsave结束(是master自动发起的而不是由其它的slave发起的),然后重新进行一个bgsave*/ c->replstate = REDIS_REPL_WAIT_BGSAVE_START; redisLog(REDIS_NOTICE,"Waiting for next BGSAVE for SYNC"); } } else { /* 当前没有bgsave,开启一个新的进程 */ redisLog(REDIS_NOTICE,"Starting BGSAVE for SYNC"); if (rdbSaveBackground(server.dbfilename) != REDIS_OK) { redisLog(REDIS_NOTICE,"Replication failed, can't BGSAVE"); addReplyError(c,"Unable to perform background save"); return; } c->replstate = REDIS_REPL_WAIT_BGSAVE_END; //设置该slave client的状态 } c->repldbfd = -1; //这个为之后传输rdb文件的描述符,即server.dbfilename c->flags |= REDIS_SLAVE; //标志该client为slave c->slaveseldb = 0; listAddNodeTail(server.slaves,c); return; 
}接下来master会在它的serverCron的时候等待该bgsave子进程的结束(该过程我们已经在上一个章节里讲过),这里我们直接跳到我们当时跳过的replication的处理过程。在wait3的处理函数backgroundSaveDoneHandler的最后一步:updateSlavesWaitingBgsave(exitcode == 0 ? REDIS_OK : REDIS_ERR);
void updateSlavesWaitingBgsave(int bgsaveerr) { listRewind(server.slaves,&li); while((ln = listNext(&li))) { redisClient *slave = ln->value; if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) { //该client迟了一点点,所以只能等待下一次的bgsave startbgsave = 1; //在处理完所有的wait end的slave后马上再bgsave slave->replstate = REDIS_REPL_WAIT_BGSAVE_END; } else if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_END) { //该client正好是等待当前的bgsave struct redis_stat buf; if (bgsaveerr != REDIS_OK) … if ((slave->repldbfd = open(server.dbfilename,O_RDONLY)) == -1 || redis_fstat(slave->repldbfd,&buf) == -1) … slave->repldboff = 0; slave->repldbsize = buf.st_size; slave->replstate = REDIS_REPL_SEND_BULK; //标志新的状态 aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE); if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendBulkToSlave, slave) == AE_ERR) …//将该slave之前的file event del掉,因为它当前的回调函数已经变为sendBulk } } if (startbgsave) { //新的bgsave来满足之前是REDIS_REPL_WAIT_BGSAVE_START状态的slave client if (rdbSaveBackground(server.dbfilename) != REDIS_OK) { listIter li; listRewind(server.slaves,&li); redisLog(REDIS_WARNING,"SYNC failed. BGSAVE failed"); while((ln = listNext(&li))) { redisClient *slave = ln->value; //bgsave失败,则结束所有还没开始的slave 请求 if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) freeClient(slave); } } } }从该函数我们可以看到对于所有 REDIS_REPL_WAIT_BGSAVE_END状态的slave client,master 打开rdb文件,并且注册file event事件,它的回调函数是:sendBulkToSlave,并且slave client进入 REDIS_REPL_SEND_BULK状态。另外对于状态为 REDIS_REPL_WAIT_BGSAVE_START的,则再次调用bgsave重写rdb。下面我们看一下sendBulkToSlave回调函数(这个函数是与slave 节点的readSyncBulkPayload函数相对应的):
void sendBulkToSlave(aeEventLoop *el, int fd, void *privdata, int mask) { if (slave->repldboff == 0) { //先把文件大小发送给slave节点,第一个报文并且以$开头 bulkcount = sdscatprintf(sdsempty(),"$%lld\r\n",(unsigned long long) slave->repldbsize); if (write(fd,bulkcount,sdslen(bulkcount)) != (signed)sdslen(bulkcount)) … } lseek(slave->repldbfd,slave->repldboff,SEEK_SET); buflen = read(slave->repldbfd,buf,REDIS_IOBUF_LEN); //读取rdb文件 if ((nwritten = write(fd,buf,buflen)) == -1) {…} //写到slave client fd slave->repldboff += nwritten; //修改偏移量 if (slave->repldboff == slave->repldbsize) { //文件读取完毕 close(slave->repldbfd); slave->repldbfd = -1; aeDeleteFileEvent(server.el,slave->fd,AE_WRITABLE); //结束该事件 slave->replstate = REDIS_REPL_ONLINE; //设置新的slave client状态 if (aeCreateFileEvent(server.el, slave->fd, AE_WRITABLE, sendReplyToClient, slave) == AE_ERR) { //添加新可写事件 freeClient(slave); return; }… } }该函数就是用于向slave 节点发送rdb文件,直到结束时添加新的file event(AE_WRITABLE, sendReplyToClient)事件,以便来同步save rdb文件之后的更新操作,我们可以看到这个回调函数就是一般的响应客户请求的回调函数,同时slave client进入 REDIS_REPL_ONLINE状态。下面我们将看到master是在什么时候向slave发送后面的更新操作:
void call(redisClient *c) { … if ((dirty > 0 || c->cmd->flags & REDIS_CMD_FORCE_REPLICATION) && listLength(server.slaves)) replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc); … }是的,在每次执行客户端请求之后判断是否有更新内容,以及master的slave队列是否有成员,调用replicationFeedSlaves来向slave client的reply buf里增加数据。
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) { listRewind(slaves,&li); //遍历所有的slave client while((ln = listNext(&li))) { redisClient *slave = ln->value; if (slave->replstate == REDIS_REPL_WAIT_BGSAVE_START) continue; //对于还没有bgsave的则不需要,因为后面的bgsave 的rdb会包含这些更新 if (slave->slaveseldb != dictid) { //查看db是否改变 robj *selectcmd; … selectcmd = createObject(REDIS_STRING, sdscatprintf(sdsempty(),"select %d\r\n",dictid)); //构造select db命令 addReply(slave,selectcmd); slave->slaveseldb = dictid; } } addReplyMultiBulkLen(slave,argc); //发送更新命令 for (j = 0; j < argc; j++) addReplyBulk(slave,argv[j]); }
4. 状态转移图
上面我们详细地介绍了slave与master进行同步的一个过程。下面我们通过一张图来总结该过程:
图1 slave-master sync状态转移图