AOF简介
Redis拥有两种持久化存储的方式:一种是rdb,序列化存储db的信息;另外一种就是aof。
Append-only file,将“操作 + 数据”以格式化指令的方式追加到操作日志文件的尾部,在append操作返回后(已经写入到文件或者即将写入),才进行实际的数据变更,“日志文件”保存了历史所有的操作过程;当server需要数据恢复时,可以直接replay此日志文件,即可还原所有的操作过程。AOF相对可靠.
AOF记录的是修改的操作日志所以一般来说AOF文件会比rdb序列化出来的大。恢复的速度也会相对来说慢些。
AOF编码
其实aof就是一个日志文件。所以我们来查看下这个日志,包含了所有的write操作。所以来看下他是怎么对每个write操作进行保存的
对于aof的编码来说相对就显得特别的简单了,他没有对长度啊那些进行编码那些。所以我们来看两个例子就好啦
selectdb
*2\r\n$6\r\nSELECT\r\n$1\r\n2\r\n
这句话保存了一个最简单的命令选择一个db
*代表着一个command的开始标记
2代表着这一个command总共需要表示的字符串的个数
\r\n是作为一个结束符代表一个参数的结束
$作为字符串长度的标记
所以对我们的select 2来说
*2是因为一共需要使用两个字符串(SELECT和2)
\r\n结束总字符串个数长度
$6表示我们的select字符串的长度
\r\n select字符串结束
$1代表我们db的长度为1
\r\n结束字符串
再根据我们的总字符串长度为2确定我们的command结束
其他的command同理
/* Serialize one command in the AOF text protocol and append it to dst.
 *
 * Output layout: "*<argc>\r\n" followed, for every argument, by
 * "$<len>\r\n<bytes>\r\n".
 *
 * dst  - sds string to append to; the (possibly reallocated) result is returned.
 * argc - number of arguments in argv.
 * argv - command arguments; each one is passed through getDecodedObject()
 *        before use and released again with decrRefCount(). */
sds catAppendOnlyGenericCommand(sds dst, int argc, robj **argv) {
    char hdr[32];
    int hdrlen;
    int idx = 0;

    /* Multi-bulk header: '*' + argument count + CRLF. */
    hdr[0] = '*';
    hdrlen = 1 + ll2string(hdr+1,sizeof(hdr)-1,argc);
    hdr[hdrlen++] = '\r';
    hdr[hdrlen++] = '\n';
    dst = sdscatlen(dst,hdr,hdrlen);

    while (idx < argc) {
        robj *arg = getDecodedObject(argv[idx]);

        /* Bulk header: '$' + payload length + CRLF. */
        hdr[0] = '$';
        hdrlen = 1 + ll2string(hdr+1,sizeof(hdr)-1,sdslen(arg->ptr));
        hdr[hdrlen++] = '\r';
        hdr[hdrlen++] = '\n';
        dst = sdscatlen(dst,hdr,hdrlen);

        /* Payload followed by the terminating CRLF. */
        dst = sdscatlen(dst,arg->ptr,sdslen(arg->ptr));
        dst = sdscatlen(dst,"\r\n",2);

        decrRefCount(arg);
        idx++;
    }
    return dst;
}
AOF文件的追加
打开aof
appendonly yes
appendfilename "appendonly.aof"
这样就会开启aof模式
/* With AOF enabled, open the append-only file for appending (creating it
 * with mode 0644 if it does not exist). Failure to open is fatal. */
if (server.aof_state == AOF_ON) {
    const int open_flags = O_WRONLY|O_APPEND|O_CREAT;

    server.aof_fd = open(server.aof_filename,open_flags,0644);
    if (server.aof_fd == -1) {
        serverLog(LL_WARNING, "Can't open the append-only file: %s",
            strerror(errno));
        exit(1);
    }
}
command操作追加到内存
/* Serialize a write command and queue it for the append-only file.
 *
 * cmd    - command table entry; cmd->proc selects the translation applied.
 * dictid - id of the DB the command ran against.
 * argv   - command arguments, argc - their count.
 *
 * The serialized bytes are appended to server.aof_buf when AOF is on and
 * handed to aofRewriteBufferAppend() when a rewrite child exists. */
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
    sds out = sdsempty();

    /* The target DB differs from the one used by the last appended command:
     * emit a SELECT first and remember the new DB. */
    if (dictid != server.aof_selected_db) {
        char dbname[64];

        snprintf(dbname,sizeof(dbname),"%d",dictid);
        out = sdscatprintf(out,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
            (unsigned long)strlen(dbname),dbname);
        server.aof_selected_db = dictid;
    }

    if (cmd->proc == expireCommand || cmd->proc == pexpireCommand ||
        cmd->proc == expireatCommand) {
        /* Translate EXPIRE/PEXPIRE/EXPIREAT into PEXPIREAT */
        out = catAppendOnlyExpireAtCommand(out,cmd,argv[1],argv[2]);
    } else if (cmd->proc == setexCommand || cmd->proc == psetexCommand) {
        /* Translate SETEX/PSETEX to SET and PEXPIREAT */
        robj *setargv[3];

        setargv[0] = createStringObject("SET",3);
        setargv[1] = argv[1];   /* key */
        setargv[2] = argv[3];   /* value */
        out = catAppendOnlyGenericCommand(out,3,setargv);
        decrRefCount(setargv[0]);
        out = catAppendOnlyExpireAtCommand(out,cmd,argv[1],argv[2]);
    } else if (cmd->proc == setCommand && argc > 3) {
        /* Translate SET [EX seconds][PX milliseconds] to SET and PEXPIREAT */
        robj *exarg = NULL, *pxarg = NULL;
        int k;

        out = catAppendOnlyGenericCommand(out,3,argv);
        /* Remember the argument that follows an "ex" or "px" option. */
        for (k = 3; k < argc; k++) {
            if (strcasecmp(argv[k]->ptr, "ex") == 0) exarg = argv[k+1];
            if (strcasecmp(argv[k]->ptr, "px") == 0) pxarg = argv[k+1];
        }
        serverAssert(!(exarg && pxarg));
        if (exarg) {
            out = catAppendOnlyExpireAtCommand(out,server.expireCommand,
                argv[1],exarg);
        }
        if (pxarg) {
            out = catAppendOnlyExpireAtCommand(out,server.pexpireCommand,
                argv[1],pxarg);
        }
    } else {
        /* All the other commands don't need translation or need the
         * same translation already operated in the command vector
         * for the replication itself. */
        out = catAppendOnlyGenericCommand(out,argc,argv);
    }

    /* Append to the AOF buffer. This will be flushed on disk just before
     * re-entering the event loop, so before the client will get a
     * positive reply about the operation performed. */
    if (server.aof_state == AOF_ON) {
        server.aof_buf = sdscatlen(server.aof_buf,out,sdslen(out));
    }

    /* If a background append only file rewriting is in progress we want to
     * accumulate the differences between the child DB and the current one
     * in a buffer, so that when the child process will do its work we
     * can append the differences to the new append only file. */
    if (server.aof_child_pid != -1) {
        aofRewriteBufferAppend((unsigned char*)out,sdslen(out));
    }

    sdsfree(out);
}
在对于有write操作的命令会调用到feedAppendOnlyFile方法来追加操作序列
序列化操作的方式上面有对于带有过期信息的command将会统一被转换为PEXPIREAT命令来进行追加
最后把buf追加到aof_buf中。aof_buf在程序初始化的时候会被设置成sdsempty()
aof_buf写入到文件
/* Write server.aof_buf to the AOF file descriptor and fsync according to
 * the configured policy.
 *
 * force - when non-zero, the write is performed even if the "everysec"
 *         policy would otherwise postpone it because a background fsync
 *         is still pending.
 *
 * On a failed or short write the function either exits (policy "always")
 * or marks server.aof_last_write_status as C_ERR, keeps the unwritten data
 * in server.aof_buf and returns so that a later call can retry. */
void flushAppendOnlyFile(int force) {
    ssize_t nwritten;
    int sync_in_progress = 0;
    int write_errno;
    mstime_t latency;

    /* Nothing buffered, nothing to do. */
    if (sdslen(server.aof_buf) == 0) return;

    /* Is a background fsync job still queued? */
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
        sync_in_progress = bioPendingJobsOfType(BIO_AOF_FSYNC) != 0;

    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
        /* With this append fsync policy we do background fsyncing.
         * If the fsync is still in progress we can try to delay
         * the write for a couple of seconds. */
        if (sync_in_progress) {
            if (server.aof_flush_postponed_start == 0) {
                /* No previous write postponing, remember that we are
                 * postponing the flush and return. */
                server.aof_flush_postponed_start = server.unixtime;
                return;
            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
                /* We were already waiting for fsync to finish, but for less
                 * than two seconds this is still ok. Postpone again. */
                return;
            }
            /* Otherwise fall through, and go write since we can't wait
             * over two seconds. */
            server.aof_delayed_fsync++;
            serverLog(LL_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
        }
    }

    /* We want to perform a single write. This should be guaranteed atomic
     * at least if the filesystem we are writing is a real physical one.
     * While this will save us against the server being killed I don't think
     * there is much to do about the whole server stopping for power problems
     * or alike */
    latencyStartMonitor(latency);
    nwritten = aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
    /* Capture errno right away: the latency and logging calls below may
     * overwrite it before the error is reported and recorded. */
    write_errno = errno;
    latencyEndMonitor(latency);

    /* We want to capture different events for delayed writes:
     * when the delay happens with a pending fsync, or with a saving child
     * active, and when the above two conditions are missing.
     * We also use an additional event name to save all samples which is
     * useful for graphing / monitoring purposes. */
    if (sync_in_progress) {
        latencyAddSampleIfNeeded("aof-write-pending-fsync",latency);
    } else if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) {
        latencyAddSampleIfNeeded("aof-write-active-child",latency);
    } else {
        latencyAddSampleIfNeeded("aof-write-alone",latency);
    }
    latencyAddSampleIfNeeded("aof-write",latency);

    /* We performed the write so reset the postponed flush sentinel to zero. */
    server.aof_flush_postponed_start = 0;

    if (nwritten != (ssize_t)sdslen(server.aof_buf)) {
        static time_t last_write_error_log = 0;
        int can_log = 0;

        /* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */
        if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) {
            can_log = 1;
            last_write_error_log = server.unixtime;
        }

        /* Log the AOF write error and record the error code. */
        if (nwritten == -1) {
            /* Nothing was written at all. */
            if (can_log) {
                serverLog(LL_WARNING,"Error writing to the AOF file: %s",
                    strerror(write_errno));
            }
            /* Record the error code on every failure, not only when the
             * rate limiter allows a log line. */
            server.aof_last_write_errno = write_errno;
        } else {
            /* Short write: only part of the buffer reached the file. */
            if (can_log) {
                serverLog(LL_WARNING,"Short write while writing to "
                                     "the AOF file: (nwritten=%lld, "
                                     "expected=%lld)",
                                     (long long)nwritten,
                                     (long long)sdslen(server.aof_buf));
            }

            /* Try to cut the file back to its previous size so it does not
             * end with a truncated command. */
            if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
                if (can_log) {
                    serverLog(LL_WARNING, "Could not remove short write "
                             "from the append-only file. Redis may refuse "
                             "to load the AOF the next time it starts. "
                             "ftruncate: %s", strerror(errno));
                }
            } else {
                /* If the ftruncate() succeeded we can set nwritten to
                 * -1 since there is no longer partial data into the AOF. */
                nwritten = -1;
            }
            server.aof_last_write_errno = ENOSPC;
        }

        /* Handle the AOF write error. */
        if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
            /* We can't recover when the fsync policy is ALWAYS since the
             * reply for the client is already in the output buffers, and we
             * have the contract with the user that on acknowledged write data
             * is synced on disk. */
            serverLog(LL_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");
            exit(1);
        } else {
            /* Recover from failed write leaving data into the buffer. However
             * set an error to stop accepting writes as long as the error
             * condition is not cleared. */
            server.aof_last_write_status = C_ERR;

            /* Trim the sds buffer if there was a partial write, and there
             * was no way to undo it with ftruncate(2). */
            if (nwritten > 0) {
                server.aof_current_size += nwritten;
                sdsrange(server.aof_buf,nwritten,-1);
            }
            return; /* We'll try again on the next call... */
        }
    } else {
        /* Successful write(2). If AOF was in error state, restore the
         * OK state and log the event. */
        if (server.aof_last_write_status == C_ERR) {
            serverLog(LL_WARNING,
                "AOF write error looks solved, Redis can write again.");
            server.aof_last_write_status = C_OK;
        }
    }
    server.aof_current_size += nwritten;

    /* Re-use AOF buffer when it is small enough. The maximum comes from the
     * arena size of 4k minus some overhead (but is otherwise arbitrary). */
    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
        sdsclear(server.aof_buf);
    } else {
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }

    /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
     * children doing I/O in the background. */
    if (server.aof_no_fsync_on_rewrite &&
        (server.aof_child_pid != -1 || server.rdb_child_pid != -1))
            return;

    /* Perform the fsync if needed. */
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
        /* aof_fsync is defined as fdatasync() for Linux in order to avoid
         * flushing metadata. */
        latencyStartMonitor(latency);
        aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("aof-fsync-always",latency);
        server.aof_last_fsync = server.unixtime;
    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) {
        /* At most one background fsync per second, and only when no
         * earlier one is still pending. */
        if (!sync_in_progress) aof_background_fsync(server.aof_fd);
        server.aof_last_fsync = server.unixtime;
    }
}
aof有三种刷到磁盘的方法
appendfsync always
appendfsync everysec
appendfsync no
always 每次写入都要刷入磁盘,这样的话会卡io
everysec 每秒刷一次
no 操作系统自己刷
刷入磁盘的方法主要是在beforeSleep的时候调用
这个方式redis在进入睡眠之前调用
在主循环serverCron中
/* A previous flushAppendOnlyFile() call postponed its write (it recorded a
 * non-zero aof_flush_postponed_start); try the flush again. */
if (server.aof_flush_postponed_start != 0) {
    flushAppendOnlyFile(0);
}
这个是在flushAppendOnlyFile调用的时候发现异步线程还在刷入磁盘
如果刷新还是没完成
/* Presumably executed about once per 1000 ms (run_with_period is defined
 * outside this excerpt): if the last AOF write failed, retry the flush. */
run_with_period(1000) {
    if (server.aof_last_write_status == C_ERR) {
        flushAppendOnlyFile(0);
    }
}
每秒钟检查一次:如果上次写入没有成功,就再尝试刷入一次
其实对于appendfsync的刷aof_buf到磁盘上来说的话always是最安全的,他会每次都会在完成写入完成的时候调用aof_fsync来将数据写入到磁盘里面。这是个同步的操作,会等待系统io的返回。属于最耗时的一种
everysec 将会使用aof_background_fsync方法来刷新。看代码可以知道其实在异步刷新时间过长两秒以内是不会再进入write方法的。并且在刷新没有完成的情况新写入也不会被刷入。这时候崩溃的话丢失的数据可能会大于1秒
no 就是不刷新了 操作系统自己玩
刷新使用的是fdatasync。fdatasync的功能与fsync类似,但是仅仅在必要的情况下才会同步metadata,因此可以减少一次IO写操作。那么,什么是“必要的情况”?比如文件的尺寸(st_size)如果变化,是需要立即同步的,否则OS一旦崩溃,即使文件的数据部分已同步,由于metadata没有同步,依然读不到修改的内容。
但是由于我们的aof使用的是append,每次写入都会触发文件大小的变化,所以fdatasync实际上就退化成了fsync。
Rewrite
我们都知道我们的aof文件是以追加的方式的。这样的话随着时间的推移。我们的aof文件会越来越大。而且里面的有些元素本身就是被删除了。如果这时候我们进行还原的话就会有很多不必要的操作。所以Redis提供了一个rewrite。当aof文件到一定大小。或者是跟上次相比涨到一定程度的情况下。把数据库进行一次序列化。然后在接着写aof文件。这样可以去掉一些很多无用的字段。但是这样对于恢复来说的话。也可能是会存在一些问题的。看个人取舍了
配置:
auto-aof-rewrite-percentage 100 //涨的百分比 0就是不rewrite
auto-aof-rewrite-min-size 64mb //最小开始rewrite的大小
/* Trigger an AOF rewrite if needed: AOF must be on, no RDB or AOF child
 * may be running, the growth percentage must be non-zero, and the file
 * must already be larger than the configured minimum size. */
if (server.aof_state == AOF_ON &&
    server.rdb_child_pid == -1 &&
    server.aof_child_pid == -1 &&
    server.aof_rewrite_perc &&
    server.aof_current_size > server.aof_rewrite_min_size)
{
    long long base = 1;     /* fallback avoids a division by zero */
    long long growth;

    if (server.aof_rewrite_base_size) base = server.aof_rewrite_base_size;

    /* Growth of the current file relative to the base size, in percent. */
    growth = (server.aof_current_size*100/base) - 100;
    if (growth >= server.aof_rewrite_perc) {
        serverLog(LL_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth);
        rewriteAppendOnlyFileBackground();
    }
}
}
对于rewrite来说,Redis使用的是fork:启用一个新的子进程,然后在子进程里面把db里面的数据序列化到新的aof文件中。在子进程写数据的过程中,使用pipe通信来接收并保存主进程新的修改,在数据写完之后把这些修改追加到aof文件中。随后子进程通知主进程停止发送新的修改,写入完成后结束。主进程把aof文件替换,也把子进程到主进程切换过程中的写入追加到aof,aof_fd切换成新的fd。整个流程完成。
首先查看pipe的建立
int aofCreatePipes(void) {
int fds[6] = {-1, -1, -1, -1, -1, -1}; // 一对1 data ack ack
int j;
if (pipe(fds) == -1) goto error; /* parent -> children data. */
if (pipe(fds+2) == -1) goto error; /* children -> parent ack. */
if (pipe(fds+4) == -1) goto error; /* parent -> children ack. */
/* Parent -> children data is non blocking. */
if (anetNonBlock(NULL,fds[0]) != ANET_OK) goto error;// NonBlock
if (anetNonBlock(NULL,fds[1]) != ANET_OK) goto error;
if (aeCreateFileEvent(server.el, fds[2], AE_READABLE, aofChildPipeReadable, NULL) == AE_ERR) goto error; //aof_pipe_read_ack_from_child //增加读的回调
server.aof_pipe_write_data_to_child = fds[1];
server.aof_pipe_read_data_from_parent = fds[0]; // read 0 write 1
server.aof_pipe_write_ack_to_parent = fds[3];
server.aof_pipe_read_ack_from_child = fds[2];
server.aof_pipe_write_ack_to_child = fds[5];
server.aof_pipe_read_ack_from_parent = fds[4];
server.aof_stop_sending_diff = 0;// 标记可以发送 diff 在aofChildPipeReadable中会收到结束发送的标记 设置为不发送
return C_OK;
error:
serverLog(LL_WARNING,"Error opening /setting AOF rewrite IPC pipes: %s",
strerror(errno));
for (j = 0; j < 6; j++) if(fds[j] != -1) close(fds[j]);
return C_ERR;
}
/* Open the pipe stored in server.child_info_pipe, make its read end
 * non-blocking and zero server.child_info_data. If either step fails the
 * pipe is torn down with closeChildInfoPipe(). */
void openChildInfoPipe(void) {
    /* On a pipe() error the two descriptors should still be -1, but calling
     * closeChildInfoPipe() anyway can't hurt. */
    if (pipe(server.child_info_pipe) == -1 ||
        anetNonBlock(NULL,server.child_info_pipe[0]) != ANET_OK)
    {
        closeChildInfoPipe();
        return;
    }
    memset(&server.child_info_data,0,sizeof(server.child_info_data));
}
一共有三组pipe。对于pipe来说第一个作为读第二个作为写端。对于data这种数据量比较大的来说。将会采用异步的方式进行发送。对于ack这类消息。因为只有一个字节。就不那么麻烦了。同步锤完。
来看子进程核心的写方法rewriteAppendOnlyFile
int rewriteAppendOnlyFile(char *filename) {
rio aof;
FILE *fp;
char tmpfile[256];
char byte;
/* Note that we have to use a different temp name here compared to the
* one used by rewriteAppendOnlyFileBackground() function. */
snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());// 创建文件
fp = fopen(tmpfile,"w");//write
if (!fp) {
serverLog(LL_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));
return C_ERR;
}
server.aof_child_diff = sdsempty(); // 父进程写过来的数据 保存在写rbd期间 主进程的修改信息 格式是个aof追加的格式是一致的
rioInitWithFile(&aof,fp);//rio
if (server.aof_rewrite_incremental_fsync)
rioSetAutoSync(&aof,AOF_AUTOSYNC_BYTES);
if (server.aof_use_rdb_preamble) {//写入rdb数据的方式
int error;
if (rdbSaveRio(&aof,&error,RDB_SAVE_AOF_PREAMBLE,NULL) == C_ERR) {//这里是使用rdb的编码方式写入aof文件
errno = error;
goto werr;
}
} else {
if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr;//这里是将rdb里面的内容转换成command的模式写入
}
/* Do an initial slow fsync here while the parent is still sending
* data, in order to make the next final fsync faster. */
if (fflush(fp) == EOF) goto werr; //刷入
if (fsync(fileno(fp)) == -1) goto werr;
/* Read again a few times to get more data from the parent.
* We can't read forever (the server may receive data from clients
* faster than it is able to send data to the child), so we try to read
* some more data in a loop as soon as there is a good chance more data
* will come. If it looks like we are wasting time, we abort (this
* happens after 20 ms without new data). */
int nodata = 0;
mstime_t start = mstime();
while(mstime()-start < 1000 && nodata < 20) {//讲道理20ms你没给我发消息了 我就不等了 我特最多等一秒钟
if (aeWait(server.aof_pipe_read_data_from_parent, AE_READABLE, 1) <= 0)//每秒等待一次数据的到来
{
nodata++;
continue;
}
nodata = 0; /* Start counting from zero, we stop on N *contiguous*
timeouts. */
aofReadDiffFromParent();//同步父进程发来的更新信息
}
/* Ask the master to stop sending diffs. */
if (write(server.aof_pipe_write_ack_to_parent,"!",1) != 1) goto werr; //通知父进程 停止发送diff
if (anetNonBlock(NULL,server.aof_pipe_read_ack_from_parent) != ANET_OK)//异步收取父进程ack
goto werr;
/* We read the ACK from the server using a 10 seconds timeout. Normally
* it should reply ASAP, but just in case we lose its reply, we are sure
* the child will eventually get terminated. */
if (syncRead(server.aof_pipe_read_ack_from_parent,&byte,1,5000) != 1 ||
byte != '!') goto werr; //给你五秒钟的时间考虑给我发消息
serverLog(LL_NOTICE,"Parent agreed to stop sending diffs. Finalizing AOF...");
/* Read the final diff if any. */
aofReadDiffFromParent();// 我发送的时候 父进程可能又锤了点数据过来
/* Write the received diff to the file. */
serverLog(LL_NOTICE,
"Concatenating %.2f MB of AOF diff received from parent.",
(double) sdslen(server.aof_child_diff) / (1024*1024)); //获取更新的大小
if (rioWrite(&aof,server.aof_child_diff,sdslen(server.aof_child_diff)) == 0)//把这个更新的数据锤进去
goto werr;
/* Make sure data will not remain on the OS's output buffers */
if (fflush(fp) == EOF) goto werr; //flush
if (fsync(fileno(fp)) == -1) goto werr; //fsync
if (fclose(fp) == EOF) goto werr;
/* Use RENAME to make sure the DB file is changed atomically only
* if the generate DB file is ok. */
if (rename(tmpfile,filename) == -1) { //改名字
serverLog(LL_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
unlink(tmpfile);
return C_ERR;
}
serverLog(LL_NOTICE,"SYNC append only file rewrite performed");
return C_OK;
werr:
serverLog(LL_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
fclose(fp);
unlink(tmpfile);
return C_ERR;
}
/* Dump every non-empty DB to 'aof' as a sequence of commands.
 *
 * For each DB a SELECT is written, then one entry per key using the writer
 * matching the value type, followed by a PEXPIREAT when the key has an
 * expire. Keys that are already expired are skipped. From time to time the
 * diff sent by the parent is read via aofReadDiffFromParent().
 *
 * Returns C_OK on success, C_ERR if any write fails. */
int rewriteAppendOnlyFileRio(rio *aof) {
    dictIterator *iter = NULL;
    dictEntry *entry;
    size_t last_diff_check = 0;
    long long start_ms = mstime();
    int dbid;

    for (dbid = 0; dbid < server.dbnum; dbid++) {
        char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
        redisDb *db = server.db+dbid;
        dict *keyspace = db->dict;

        if (dictSize(keyspace) == 0) continue;
        iter = dictGetSafeIterator(keyspace);

        /* SELECT the new DB */
        if (rioWrite(aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;
        if (rioWriteBulkLongLong(aof,dbid) == 0) goto werr;

        /* Iterate this DB writing every entry */
        while ((entry = dictNext(iter)) != NULL) {
            sds keystr = dictGetKey(entry);
            robj *o = dictGetVal(entry);
            robj key;
            long long expiretime;
            int ok = 1;

            initStaticStringObject(key,keystr);
            expiretime = getExpire(db,&key);

            /* If this key is already expired skip it */
            if (expiretime != -1 && expiretime < start_ms) continue;

            /* Save the key and associated value, dispatching on the type. */
            switch (o->type) {
            case OBJ_STRING: {
                /* Emit a SET command: header, key, value. */
                char cmd[]="*3\r\n$3\r\nSET\r\n";
                ok = rioWrite(aof,cmd,sizeof(cmd)-1) != 0 &&
                     rioWriteBulkObject(aof,&key) != 0 &&
                     rioWriteBulkObject(aof,o) != 0;
                break;
            }
            case OBJ_LIST:
                ok = rewriteListObject(aof,&key,o) != 0;
                break;
            case OBJ_SET:
                ok = rewriteSetObject(aof,&key,o) != 0;
                break;
            case OBJ_ZSET:
                ok = rewriteSortedSetObject(aof,&key,o) != 0;
                break;
            case OBJ_HASH:
                ok = rewriteHashObject(aof,&key,o) != 0;
                break;
            case OBJ_MODULE:
                ok = rewriteModuleObject(aof,&key,o) != 0;
                break;
            default:
                serverPanic("Unknown object type");
            }
            if (!ok) goto werr;

            /* Save the expire time */
            if (expiretime != -1) {
                char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n";
                if (rioWrite(aof,cmd,sizeof(cmd)-1) == 0) goto werr;
                if (rioWriteBulkObject(aof,&key) == 0) goto werr;
                if (rioWriteBulkLongLong(aof,expiretime) == 0) goto werr;
            }

            /* Read some diff from the parent process from time to time. */
            if (aof->processed_bytes > last_diff_check+AOF_READ_DIFF_INTERVAL_BYTES) {
                last_diff_check = aof->processed_bytes;
                aofReadDiffFromParent();
            }
        }
        dictReleaseIterator(iter);
        iter = NULL;
    }
    return C_OK;

werr:
    if (iter) dictReleaseIterator(iter);
    return C_ERR;
}
这个流程里面都是常规的操作。把db的数据写进aof有两种方式,随意选一种。在通知父进程终止之后再读取一次diff,是因为在发送通知的过程中父进程可能又发了消息,这样可以防止这个消息的丢失。
最后把diff写入aof这个过程就结束了。
来看下rewrite的时候主进程做的事情
/* Start a background AOF rewrite.
 *
 * Creates the parent/child pipes, forks, and lets the child run
 * rewriteAppendOnlyFile() on a temp file named after its pid. The parent
 * records fork statistics and the child pid.
 *
 * Returns C_ERR when a child is already running, when the pipes cannot be
 * created, or when fork() fails; C_OK otherwise (in the parent). */
int rewriteAppendOnlyFileBackground(void) {
pid_t childpid;
long long start;
/* Refuse to start while an AOF or RDB child is already running. */
if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR;
/* Set up the data/ack pipes; aofCreatePipes() makes both ends of the
 * parent -> child data pipe non-blocking. */
if (aofCreatePipes() != C_OK) return C_ERR;
/* Open the child info pipe; openChildInfoPipe() makes its read end
 * non-blocking. */
openChildInfoPipe();
start = ustime();
if ((childpid = fork()) == 0) {
char tmpfile[256];
/* Child */
closeListeningSockets(0); /* close the listening sockets in the child */
redisSetProcTitle("redis-aof-rewrite"); /* set the process title */
snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid()); /* temp file name based on the child pid */
if (rewriteAppendOnlyFile(tmpfile) == C_OK) {
size_t private_dirty = zmalloc_get_private_dirty(-1); /* value logged below as copy-on-write memory */
if (private_dirty) {
serverLog(LL_NOTICE,
"AOF rewrite: %zu MB of memory used by copy-on-write",
private_dirty/(1024*1024));
}
server.child_info_data.cow_size = private_dirty;
sendChildInfo(CHILD_INFO_TYPE_AOF); /* send the info record to the parent */
exitFromChild(0); /* exit code 0: success */
} else {
exitFromChild(1); /* exit code 1: failure */
}
} else {
/* Parent */
/* Record how long the fork() call took. */
server.stat_fork_time = ustime()-start;
server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */
latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000);
if (childpid == -1) { /* fork() failed */
closeChildInfoPipe(); /* close the info pipe */
serverLog(LL_WARNING,
"Can't rewrite append only file in background: fork: %s",
strerror(errno));
aofClosePipes(); /* close the rewrite pipes */
return C_ERR;
}
serverLog(LL_NOTICE,
"Background append only file rewriting started by pid %d",childpid);
server.aof_rewrite_scheduled = 0; /* the scheduled rewrite has now been started */
server.aof_rewrite_time_start = time(NULL);
server.aof_child_pid = childpid;
/* NOTE(review): presumably disables dict resizing while a child exists, to
 * limit the number of memory pages copied after fork — confirm against
 * updateDictResizePolicy(). */
updateDictResizePolicy();
/* We set appendseldb to -1 in order to force the next call to the
 * feedAppendOnlyFile() to issue a SELECT command, so the differences
 * accumulated by the parent into server.aof_rewrite_buf will start
 * with a SELECT statement and it will be safe to merge. */
server.aof_selected_db = -1;
replicationScriptCacheFlush();
return C_OK;
}
return C_OK; /* unreached */
}
在操作中主要是打开管道通信然后发起fork。记录子进程的开始时间,pid这些信息。最主要的一点是updateDictResizePolicy();这句话是意思是屏蔽rehash.因为我们的子进程使用的是fork。fork以后子进程就拥有一个主进程的内存拷贝。但是真正的拷贝的话,在db特别大的时候很花时间,所以操作系统使用的是写时拷贝。在物理内存上,主进程和子进程指向的同一个地址,逻辑上不一样。内存的读取不会触发额外操作。只有在写的时候触发缺页中断,造成拷贝。子进程和主进程内存页的分离。所以要尽量避免主进程和子进程内相同内存块的修改。刚好rehash会造成大量相同内存页的修改,他就被屏蔽了。
子进程建立完成操作
子进程建立完成之后主进程主要是监控子进程的退出,发送write数据到子进程。接收子进程返回的信息。
write
在我们看aof_buf追加的时候看到一句话
/* While a rewrite child exists, also queue the serialized command in the
 * rewrite buffer so it can be sent to the child. */
if (server.aof_child_pid != -1) {
    aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));
}
对这句话就是在rewrite的时候发送write到子进程
/* Append 'len' bytes starting at 's' to the list of rewrite buffer blocks
 * (server.aof_rewrite_buf_blocks), allocating new blocks as needed, and make
 * sure a writable event is installed on the parent -> child data pipe.
 *
 * s   - bytes to append.
 * len - number of bytes. */
void aofRewriteBufferAppend(unsigned char *s, unsigned long len) {
    listNode *tail = listLast(server.aof_rewrite_buf_blocks);
    aofrwblock *blk = NULL;

    if (tail) blk = tail->value;

    while (len > 0) {
        /* If we already got at least an allocated block, try appending
         * at least some piece into it. */
        if (blk) {
            unsigned long chunk = len;

            if (blk->free < len) chunk = blk->free;
            if (chunk) { /* The current block is not already full. */
                memcpy(blk->buf+blk->used, s, chunk);
                blk->used += chunk;
                blk->free -= chunk;
                s += chunk;
                len -= chunk;
            }
        }

        if (len == 0) continue;

        /* First block to allocate, or need another block. */
        {
            int numblocks;

            blk = zmalloc(sizeof(*blk));
            blk->free = AOF_RW_BUF_BLOCK_SIZE;
            blk->used = 0;
            listAddNodeTail(server.aof_rewrite_buf_blocks,blk);

            /* Log every time we cross more 10 or 100 blocks, respectively
             * as a notice or warning. */
            numblocks = listLength(server.aof_rewrite_buf_blocks);
            if (((numblocks+1) % 10) == 0) {
                int level = LL_NOTICE;

                if (((numblocks+1) % 100) == 0) level = LL_WARNING;
                serverLog(level,"Background AOF buffer size: %lu MB",
                    aofRewriteBufferSize()/(1024*1024));
            }
        }
    }

    /* Install a file event to send data to the rewrite child if there is
     * not one already. */
    if (aeGetFileEvents(server.el,server.aof_pipe_write_data_to_child) == 0) {
        aeCreateFileEvent(server.el, server.aof_pipe_write_data_to_child,
            AE_WRITABLE, aofChildWriteDiffData, NULL);
    }
}
在aof_pipe_write_data_to_child的write准备好的时候调用aofChildWriteDiffData写消息
/* Writable-event handler for the parent -> child data pipe: sends the
 * buffered rewrite blocks to the child, head first.
 *
 * The handler removes itself when sending has been stopped
 * (server.aof_stop_sending_diff) or when no block is left, and returns
 * early when write(2) returns zero or a negative value. All four parameters
 * are unused. */
void aofChildWriteDiffData(aeEventLoop *el, int fd, void *privdata, int mask) {
    UNUSED(el);
    UNUSED(fd);
    UNUSED(privdata);
    UNUSED(mask);

    for (;;) {
        listNode *head = listFirst(server.aof_rewrite_buf_blocks);
        aofrwblock *blk = NULL;

        if (head) blk = head->value;

        /* Told to stop, or nothing left to send: uninstall the handler. */
        if (server.aof_stop_sending_diff || !blk) {
            aeDeleteFileEvent(server.el,server.aof_pipe_write_data_to_child,
                              AE_WRITABLE);
            return;
        }

        if (blk->used > 0) {
            ssize_t sent = write(server.aof_pipe_write_data_to_child,
                                 blk->buf,blk->used);

            /* Error, or nothing could be written right now. */
            if (sent <= 0) return;

            /* Shift the unsent remainder to the start of the block. */
            memmove(blk->buf,blk->buf+sent,blk->used-sent);
            blk->used -= sent;
            blk->free += sent;
        }

        /* Fully drained block: drop it from the list. */
        if (blk->used == 0) listDelNode(server.aof_rewrite_buf_blocks,head);
    }
}
接收子进程信号
在aofCreatePipes有这么一句
/* Install aofChildPipeReadable() as the read callback on fds[2], the
 * parent's read end of the child -> parent ack pipe
 * (server.aof_pipe_read_ack_from_child). */
if (aeCreateFileEvent(server.el, fds[2], AE_READABLE,
                      aofChildPipeReadable, NULL) == AE_ERR)
{
    goto error;
}
/* Readable-event handler for the child -> parent ack pipe.
 *
 * When the child sends the single byte '!', the parent stops sending diffs
 * (server.aof_stop_sending_diff = 1) and answers with '!' on the
 * parent -> child ack pipe. The handler always removes itself afterwards.
 *
 * fd - descriptor to read the byte from; el, privdata and mask are unused. */
void aofChildPipeReadable(aeEventLoop *el, int fd, void *privdata, int mask) {
    char byte;
    ssize_t got;

    UNUSED(el);
    UNUSED(privdata);
    UNUSED(mask);

    got = read(fd,&byte,1);
    if (got == 1 && byte == '!') {
        ssize_t acked;

        serverLog(LL_NOTICE,"AOF rewrite child asks to stop sending diffs.");
        server.aof_stop_sending_diff = 1;

        acked = write(server.aof_pipe_write_ack_to_child,"!",1);
        if (acked != 1) {
            /* If we can't send the ack, inform the user, but don't try again
             * since in the other side the children will use a timeout if the
             * kernel can't buffer our write, or, the children was
             * terminated. */
            serverLog(LL_WARNING,"Can't send ACK to AOF child: %s",
                strerror(errno));
        }
    }

    /* Remove the handler since this can be called only one time during a
     * rewrite. */
    aeDeleteFileEvent(server.el,server.aof_pipe_read_ack_from_child,AE_READABLE);
}
接收到标记后会停止往子进程写write。如果这个时候主线程很忙,子进程在等待五秒之后就会结束。造成rewrite的失败
主进程的监控
/* Check whether a background child (RDB save, AOF rewrite, or an ldb child)
 * has terminated, without blocking, and dispatch to the matching handler. */
if (server.rdb_child_pid != -1 || server.aof_child_pid != -1 ||
    ldbPendingChildren())
{
    /* Initialized so the status macros below never read an indeterminate
     * value when wait3() fails and leaves it untouched. */
    int statloc = 0;
    pid_t pid;

    /* WNOHANG: returns 0 immediately when no child has changed state. */
    if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
        int exitcode = WEXITSTATUS(statloc);
        int bysignal = 0;

        /* Non-zero only when the child was terminated by a signal. */
        if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc);

        if (pid == -1) {
            /* wait3() itself failed. */
            serverLog(LL_WARNING,"wait3() returned an error: %s. "
                "rdb_child_pid = %d, aof_child_pid = %d",
                strerror(errno),
                (int) server.rdb_child_pid,
                (int) server.aof_child_pid);
        } else if (pid == server.rdb_child_pid) {
            /* The RDB child finished. */
            backgroundSaveDoneHandler(exitcode,bysignal);
            if (!bysignal && exitcode == 0) receiveChildInfo();
        } else if (pid == server.aof_child_pid) {
            /* The AOF rewrite child finished. */
            backgroundRewriteDoneHandler(exitcode,bysignal);
            if (!bysignal && exitcode == 0) receiveChildInfo();
        } else {
            if (!ldbRemoveChild(pid)) {
                serverLog(LL_WARNING,
                    "Warning, detected child with unmatched pid: %ld",
                    (long)pid);
            }
        }
        updateDictResizePolicy();
        closeChildInfoPipe();
    }
}
主进程在循环中监控子进程的状态,判断是否退出。然后根据程序的返回值进行操作
结束后的操作
/* Called in the parent when the AOF rewrite child has terminated.
 *
 * exitcode - exit status of the child.
 * bysignal - terminating signal number, or 0 if the child exited normally.
 *
 * On success the diff still buffered in the parent is appended to the
 * child's temp file, the temp file is renamed over server.aof_filename and,
 * when AOF is enabled, server.aof_fd is switched to the new file. In every
 * case the pipes, rewrite buffer, temp file and child bookkeeping are
 * cleaned up at the end. */
void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
    if (!bysignal && exitcode == 0) {
        /* The child terminated normally and reported success. */
        int newfd, oldfd;
        char tmpfile[256];
        long long now = ustime();
        mstime_t latency;

        serverLog(LL_NOTICE,
            "Background AOF rewrite terminated with success");

        /* Flush the differences accumulated by the parent to the
         * rewritten AOF. */
        latencyStartMonitor(latency);
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof",
            (int)server.aof_child_pid);
        newfd = open(tmpfile,O_WRONLY|O_APPEND);
        if (newfd == -1) {
            serverLog(LL_WARNING,
                "Unable to open the temporary AOF produced by the child: %s", strerror(errno));
            goto cleanup;
        }

        /* Append whatever is still buffered in the parent. */
        if (aofRewriteBufferWrite(newfd) == -1) {
            serverLog(LL_WARNING,
                "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno));
            close(newfd);
            goto cleanup;
        }
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("aof-rewrite-diff-write",latency);

        serverLog(LL_NOTICE,
            "Residual parent diff successfully flushed to the rewritten AOF (%.2f MB)", (double) aofRewriteBufferSize() / (1024*1024));

        if (server.aof_fd == -1) {
            /* AOF disabled */

            /* Don't care if this fails: oldfd will be -1 and we handle that.
             * One notable case of -1 return is if the old file does
             * not exist. */
            oldfd = open(server.aof_filename,O_RDONLY|O_NONBLOCK);
        } else {
            /* AOF enabled */
            oldfd = -1; /* We'll set this to the current AOF filedes later. */
        }

        /* Rename the temporary file. This will not unlink the target file if
         * it exists, because we reference it with "oldfd". */
        latencyStartMonitor(latency);
        if (rename(tmpfile,server.aof_filename) == -1) {
            serverLog(LL_WARNING,
                "Error trying to rename the temporary AOF file %s into %s: %s",
                tmpfile,
                server.aof_filename,
                strerror(errno));
            close(newfd);
            if (oldfd != -1) close(oldfd);
            goto cleanup;
        }
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("aof-rename",latency);

        if (server.aof_fd == -1) {
            /* AOF disabled, we don't need to set the AOF file descriptor
             * to this new file, so we can close it. */
            close(newfd);
        } else {
            /* AOF enabled, replace the old fd with the new one. */
            oldfd = server.aof_fd;
            server.aof_fd = newfd;
            if (server.aof_fsync == AOF_FSYNC_ALWAYS)
                aof_fsync(newfd);
            else if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
                aof_background_fsync(newfd);
            server.aof_selected_db = -1; /* Make sure SELECT is re-issued */
            aofUpdateCurrentSize();
            server.aof_rewrite_base_size = server.aof_current_size;

            /* Clear regular AOF buffer since its contents was just written to
             * the new AOF from the background rewrite buffer. */
            sdsfree(server.aof_buf);
            server.aof_buf = sdsempty();
        }

        server.aof_lastbgrewrite_status = C_OK;

        serverLog(LL_NOTICE, "Background AOF rewrite finished successfully");
        /* Change state from WAIT_REWRITE to ON if needed */
        if (server.aof_state == AOF_WAIT_REWRITE)
            server.aof_state = AOF_ON;

        /* Asynchronously close the overwritten AOF. */
        if (oldfd != -1) bioCreateBackgroundJob(BIO_CLOSE_FILE,(void*)(long)oldfd,NULL,NULL);

        serverLog(LL_VERBOSE,
            "Background AOF rewrite signal handler took %lldus", ustime()-now);
    } else if (!bysignal && exitcode != 0) {
        /* The child exited on its own with a failure code. */
        server.aof_lastbgrewrite_status = C_ERR;

        serverLog(LL_WARNING,
            "Background AOF rewrite terminated with error");
    } else {
        /* The child was killed by a signal.
         * SIGUSR1 is whitelisted, so we have a way to kill a child without
         * triggering an error condition. */
        if (bysignal != SIGUSR1)
            server.aof_lastbgrewrite_status = C_ERR;

        serverLog(LL_WARNING,
            "Background AOF rewrite terminated by signal %d", bysignal);
    }

cleanup:
    aofClosePipes();
    aofRewriteBufferReset();
    aofRemoveTempFile(server.aof_child_pid);
    server.aof_child_pid = -1;
    server.aof_rewrite_time_last = time(NULL)-server.aof_rewrite_time_start;
    server.aof_rewrite_time_start = -1;
    /* Schedule a new rewrite if we are waiting for it to switch the AOF ON. */
    if (server.aof_state == AOF_WAIT_REWRITE)
        server.aof_rewrite_scheduled = 1;
}
主进程首先是判断rewrite进程是否写入成功,写入成功的话。主进程主要做这些事情
1. aofRewriteBufferWrite把没有发送到子进程的writebuf写入新的aof文件
2. rename(tmpfile,server.aof_filename):把新的aof文件命名为aof_filename。其中rename在目标文件已存在时会先删除它,再重命名
3.更新变量信息。比如大小。新的fd。关闭老的fd等。后面的aof就会使用新的fd进行书写了
4.关闭pipe通信。如果失败的话。设置aof_rewrite_scheduled标志