Redis AOF持久化存储

AOF简介

Redis拥有两种持久化存储的方式:一种是rdb,序列化存储db的信息;另外一种就是aof。
Append-only file,将“操作 + 数据”以格式化指令的方式追加到操作日志文件的尾部,在append操作返回后(已经写入到文件或者即将写入),才进行实际的数据变更,“日志文件”保存了历史所有的操作过程;当server需要数据恢复时,可以直接replay此日志文件,即可还原所有的操作过程。AOF相对可靠.
AOF记录的是修改的操作日志所以一般来说AOF文件会比rdb序列化出来的大。恢复的速度也会相对来说慢些。

AOF编码

其实aof就是一个日志文件。所以我们来查看下这个日志,包含了所有的write操作。所以来看下他是怎么对每个write操作进行保存的
对于aof的编码来说相对就显得特别的简单了,他没有对长度啊那些进行编码那些。所以我们来看两个例子就好啦
selectdb

*2\r\n$6\r\nSELECT\r\n$1\r\n2\r\n

这句话保存了一个最简单的命令选择一个db
*代表着一个command的开始标记
2代表这一个command 总共需要表示的字符串的个数
\r\n是作为一个结束符代表一个参数的结束
$作为字符串长度的标记

所以对我们的select 2来说
*2是因为一共需要使用两个字符串
\r\n结束总字符串个数长度
$6表示我们的select字符串的长度
\r\n select字符串结束
$1代表我们db的长度为1
\r\n结束字符串
再根据我们的总字符串长度为2确定我们的command结束
其他的command同理

/* Append one command, serialized in the AOF text format, to 'dst'.
 *
 * The output is "*<argc>\r\n" followed, for every argument, by
 * "$<length>\r\n<bytes>\r\n".
 *
 * dst:  sds string the serialized command is appended to.
 * argc: number of entries in argv.
 * argv: the command arguments.
 *
 * Returns the (possibly reallocated) sds string; callers must use the
 * returned pointer instead of the one they passed in. */
sds catAppendOnlyGenericCommand(sds dst, int argc, robj **argv) {
    char prefix[32];
    int plen;
    int i = 0;

    /* Command header: '*' + argument count + CRLF. */
    prefix[0] = '*';
    plen = 1+ll2string(prefix+1,sizeof(prefix)-1,argc);
    prefix[plen] = '\r';
    prefix[plen+1] = '\n';
    dst = sdscatlen(dst,prefix,plen+2);

    while (i < argc) {
        /* Presumably yields a string-encoded object holding its own
         * reference; it is released with decrRefCount() below. */
        robj *arg = getDecodedObject(argv[i]);
        size_t arglen = sdslen(arg->ptr);

        /* Argument header: '$' + byte length + CRLF. */
        prefix[0] = '$';
        plen = 1+ll2string(prefix+1,sizeof(prefix)-1,arglen);
        prefix[plen] = '\r';
        prefix[plen+1] = '\n';
        dst = sdscatlen(dst,prefix,plen+2);

        /* Argument payload followed by the terminating CRLF. */
        dst = sdscatlen(dst,arg->ptr,arglen);
        dst = sdscatlen(dst,"\r\n",2);

        decrRefCount(arg);
        i++;
    }
    return dst;
}

AOF文件的追加

打开aof
appendonly yes
appendfilename "appendonly.aof"
这样就会开启aof模式

    /* When AOF is enabled, open the AOF file for writing. */
    if (server.aof_state == AOF_ON) {// AOF is turned on
        server.aof_fd = open(server.aof_filename,
                               O_WRONLY|O_APPEND|O_CREAT,0644);// open in append mode, creating the file (mode 0644) if it does not exist
        if (server.aof_fd == -1) {
            /* open() failed: log the reason and terminate the process. */
            serverLog(LL_WARNING, "Can't open the append-only file: %s",
                strerror(errno));
            exit(1);
        }
    }

command操作追加到内存

/* Serialize a write command and queue it for the append only file.
 *
 * cmd:    the command that was executed (its 'proc' selects the translation).
 * dictid: id of the database the command targeted.
 * argv:   the command argument vector.
 * argc:   number of entries in argv.
 *
 * A SELECT is emitted first when dictid differs from the database of the
 * previously appended command. Commands carrying a relative expire are
 * rewritten through catAppendOnlyExpireAtCommand(); everything else is
 * serialized unchanged. The result is appended to server.aof_buf when AOF is
 * on, and handed to aofRewriteBufferAppend() when an AOF child exists. */
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
    sds out = sdsempty();

    /* The target DB changed since the last appended command: emit SELECT. */
    if (server.aof_selected_db != dictid) {
        char dbname[64];

        snprintf(dbname,sizeof(dbname),"%d",dictid);
        out = sdscatprintf(out,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
            (unsigned long)strlen(dbname),dbname);
        server.aof_selected_db = dictid;
    }

    int plain_expire = cmd->proc == expireCommand ||
                       cmd->proc == pexpireCommand ||
                       cmd->proc == expireatCommand;
    int set_with_ttl = cmd->proc == setexCommand ||
                       cmd->proc == psetexCommand;

    if (plain_expire) {
        /* EXPIRE / PEXPIRE / EXPIREAT: key is argv[1], the time is argv[2]. */
        out = catAppendOnlyExpireAtCommand(out,cmd,argv[1],argv[2]);
    } else if (set_with_ttl) {
        /* SETEX / PSETEX key ttl value: emit "SET key value" and then the
         * expire built from argv[2]. */
        robj *setargv[3];

        setargv[0] = createStringObject("SET",3);
        setargv[1] = argv[1];
        setargv[2] = argv[3];
        out = catAppendOnlyGenericCommand(out,3,setargv);
        decrRefCount(setargv[0]);
        out = catAppendOnlyExpireAtCommand(out,cmd,argv[1],argv[2]);
    } else if (cmd->proc == setCommand && argc > 3) {
        /* SET key value [EX seconds] [PX milliseconds]: emit the first three
         * arguments as a plain SET, then the expire found among the options. */
        robj *exarg = NULL, *pxarg = NULL;
        int k = 3;

        out = catAppendOnlyGenericCommand(out,3,argv);
        while (k < argc) {
            if (strcasecmp(argv[k]->ptr, "ex") == 0) exarg = argv[k+1];
            if (strcasecmp(argv[k]->ptr, "px") == 0) pxarg = argv[k+1];
            k++;
        }
        /* Both options at once are not expected here. */
        serverAssert(!(exarg && pxarg));
        if (exarg)
            out = catAppendOnlyExpireAtCommand(out,server.expireCommand,
                                               argv[1],exarg);
        if (pxarg)
            out = catAppendOnlyExpireAtCommand(out,server.pexpireCommand,
                                               argv[1],pxarg);
    } else {
        /* Any other command is written as received. */
        out = catAppendOnlyGenericCommand(out,argc,argv);
    }

    /* Queue into the in-memory AOF buffer; flushAppendOnlyFile() is the one
     * that writes server.aof_buf to the file. */
    if (server.aof_state == AOF_ON)
        server.aof_buf = sdscatlen(server.aof_buf,out,sdslen(out));

    /* While an AOF child exists, also accumulate the command in the rewrite
     * buffer. */
    if (server.aof_child_pid != -1)
        aofRewriteBufferAppend((unsigned char*)out,sdslen(out));

    sdsfree(out);
}

在对于有write操作的命令会调用到feedAppendOnlyFile方法来追加操作序列
序列化操作的方式上面有对于带有过期信息的command将会统一被转换为PEXPIREAT命令来进行追加
最后把buf放入到aof_buf。aof_buf在程序初始化的时候会被设置成为sdsempty()

aof_buf写入到文件

/* Write the in-memory AOF buffer (server.aof_buf) to the AOF file and fsync
 * it according to server.aof_fsync.
 *
 * force: when non-zero the write is performed even if, under the "everysec"
 *        policy, a background fsync is still pending.
 *
 * With AOF_FSYNC_EVERYSEC and force == 0 the write is postponed (for less
 * than two seconds) while a background fsync is in progress. On a failed or
 * short write the error is recorded in server.aof_last_write_errno and
 * server.aof_last_write_status; with AOF_FSYNC_ALWAYS the process exits
 * instead, because the data cannot be guaranteed to be on disk. */
void flushAppendOnlyFile(int force) {
    ssize_t nwritten;
    int sync_in_progress = 0;
    int write_errno = 0; /* errno of the failed write, captured immediately. */
    mstime_t latency;

    if (sdslen(server.aof_buf) == 0) return;

    /* Check whether a background fsync job is still queued or running. */
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
        sync_in_progress = bioPendingJobsOfType(BIO_AOF_FSYNC) != 0;

    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
        /* With this append fsync policy we do background fsyncing.
         * If the fsync is still in progress we can try to delay
         * the write for a couple of seconds. */
        if (sync_in_progress) {
            if (server.aof_flush_postponed_start == 0) {
                /* No previous write postponing, remember that we are
                 * postponing the flush and return. */
                server.aof_flush_postponed_start = server.unixtime;
                return;
            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
                /* We were already waiting for fsync to finish, but for less
                 * than two seconds this is still ok. Postpone again. */
                return;
            }
            /* Otherwise fall through, and go write since we can't wait
             * over two seconds. */
            server.aof_delayed_fsync++;
            serverLog(LL_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
        }
    }
    /* We want to perform a single write. This should be guaranteed atomic
     * at least if the filesystem we are writing is a real physical one.
     * While this will save us against the server being killed I don't think
     * there is much to do about the whole server stopping for power problems
     * or alike */

    latencyStartMonitor(latency);
    nwritten = aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
    /* Save errno right away: the latency and logging calls below may
     * overwrite it before the error is reported. */
    if (nwritten == -1) write_errno = errno;
    latencyEndMonitor(latency);
    /* We want to capture different events for delayed writes:
     * when the delay happens with a pending fsync, or with a saving child
     * active, and when the above two conditions are missing.
     * We also use an additional event name to save all samples which is
     * useful for graphing / monitoring purposes. */
    if (sync_in_progress) {
        latencyAddSampleIfNeeded("aof-write-pending-fsync",latency);
    } else if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) {
        latencyAddSampleIfNeeded("aof-write-active-child",latency);
    } else {
        latencyAddSampleIfNeeded("aof-write-alone",latency);
    }
    latencyAddSampleIfNeeded("aof-write",latency);

    /* We performed the write so reset the postponed flush sentinel to zero. */
    server.aof_flush_postponed_start = 0;

    if (nwritten != (ssize_t)sdslen(server.aof_buf)) {
        /* The write failed or was only partial. */
        static time_t last_write_error_log = 0;
        int can_log = 0;

        /* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */
        if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) {
            can_log = 1;
            last_write_error_log = server.unixtime;
        }

        /* Log the AOF write error and record the error code. */
        if (nwritten == -1) {
            /* Nothing was written at all. */
            if (can_log) {
                serverLog(LL_WARNING,"Error writing to the AOF file: %s",
                    strerror(write_errno));
            }
            /* Record the error code even when the log line is suppressed by
             * the rate limiter, so the stored value always matches the most
             * recent failure. */
            server.aof_last_write_errno = write_errno;
        } else {
            /* Short write. */
            if (can_log) {
                serverLog(LL_WARNING,"Short write while writing to "
                                       "the AOF file: (nwritten=%lld, "
                                       "expected=%lld)",
                                       (long long)nwritten,
                                       (long long)sdslen(server.aof_buf));
            }

            /* Cut the file back to its size before this write so that it
             * does not end with a partially written command. */
            if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
                if (can_log) {
                    serverLog(LL_WARNING, "Could not remove short write "
                             "from the append-only file.  Redis may refuse "
                             "to load the AOF the next time it starts.  "
                             "ftruncate: %s", strerror(errno));
                }
            } else {
                /* If the ftruncate() succeeded we can set nwritten to
                 * -1 since there is no longer partial data into the AOF. */
                nwritten = -1;
            }
            server.aof_last_write_errno = ENOSPC;
        }

        /* Handle the AOF write error. */
        if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
            /* We can't recover when the fsync policy is ALWAYS since the
             * reply for the client is already in the output buffers, and we
             * have the contract with the user that on acknowledged write data
             * is synced on disk. */
            serverLog(LL_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");
            exit(1);
        } else {
            /* Recover from failed write leaving data into the buffer. However
             * set an error to stop accepting writes as long as the error
             * condition is not cleared. */
            server.aof_last_write_status = C_ERR;

            /* Trim the sds buffer if there was a partial write, and there
             * was no way to undo it with ftruncate(2). */
            if (nwritten > 0) {
                server.aof_current_size += nwritten;
                sdsrange(server.aof_buf,nwritten,-1);
            }
            return; /* We'll try again on the next call... */
        }
    } else {
        /* Successful write(2). If AOF was in error state, restore the
         * OK state and log the event. */
        if (server.aof_last_write_status == C_ERR) {
            serverLog(LL_WARNING,
                "AOF write error looks solved, Redis can write again.");
            server.aof_last_write_status = C_OK;
        }
    }
    server.aof_current_size += nwritten;

    /* Re-use AOF buffer when it is small enough. The maximum comes from the
     * arena size of 4k minus some overhead (but is otherwise arbitrary). */
    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
        sdsclear(server.aof_buf);
    } else {
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }

    /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
     * children doing I/O in the background. */
    if (server.aof_no_fsync_on_rewrite &&
        (server.aof_child_pid != -1 || server.rdb_child_pid != -1))
            return;

    /* Perform the fsync if needed. */
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
        /* aof_fsync is defined as fdatasync() for Linux in order to avoid
         * flushing metadata. */
        latencyStartMonitor(latency);
        aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("aof-fsync-always",latency);
        server.aof_last_fsync = server.unixtime;
    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) {
        /* At most once per second: schedule the fsync in the background
         * unless one is already pending. */
        if (!sync_in_progress) aof_background_fsync(server.aof_fd);
        server.aof_last_fsync = server.unixtime;
    }
}

aof有三种刷到磁盘的方法
appendfsync always
appendfsync everysec
appendfsync no
always 每次都是要刷入磁盘 这样的话就是会卡io
everysec 每秒刷一次
no 操作系统自己刷
刷入磁盘的方法主要是在beforeSleep的时候调用
这个方式redis在进入睡眠之前调用
在主循环serverCron中
/* A previous flush was postponed (aof_flush_postponed_start is non-zero): try it again. */
if (server.aof_flush_postponed_start) flushAppendOnlyFile(0);
这个是在flushAppendOnlyFile调用的时候发现异步线程还在刷入磁盘
如果刷新还是没完成
/* Periodic retry (presumably once every 1000 ms): if the last AOF write
 * left an error status, attempt the flush again. */
run_with_period(1000) {
if (server.aof_last_write_status == C_ERR)
flushAppendOnlyFile(0);
}
每秒钟对于上次刷盘没成功的情况进行一次再刷入
其实对于appendfsync的刷aof_buf到磁盘上来说的话always是最安全的,他会每次都会在完成写入完成的时候调用aof_fsync来将数据写入到磁盘里面。这是个同步的操作,会等待系统io的返回。属于最耗时的一种
everysec 将会使用aof_background_fsync方法来刷新。看代码可以知道其实在异步刷新时间过长两秒以内是不会再进入write方法的。并且在刷新没有完成的情况新写入也不会被刷入。这时候崩溃的话丢失的数据可能会大于1秒
no 就是不刷新了 操作系统自己玩
刷新使用的是fdatasync;fdatasync的功能与fsync类似,但是仅仅在必要的情况下才会同步metadata,因此可以减少一次IO写操作。那么,什么是“必要的情况”?比如文件的尺寸(st_size)如果变化,是需要立即同步的,否则OS一旦崩溃,即使文件的数据部分已同步,由于metadata没有同步,依然读不到修改的内容。
但是喃由于我们的aof使用的是append。每次写入都会触发文件大小的变化。就被退化成了fsync。

Rewrite

我们都知道我们的aof文件是以追加的方式的。这样的话随着时间的推移。我们的aof文件会越来越大。而且里面的有些元素本身就是被删除了。如果这时候我们进行还原的话就会有很多不必要的操作。所以Redis提供了一个rewrite。当aof文件到一定大小。或者是跟上次相比涨到一定程度的情况下。把数据库进行一次序列化。然后在接着写aof文件。这样可以去掉一些很多无用的字段。但是这样对于恢复来说的话。也可能是会存在一些问题的。看个人取舍了
配置:
auto-aof-rewrite-percentage 100 //涨的百分比 0就是不rewrite
auto-aof-rewrite-min-size 64mb //最小开始rewrite的大小

         /* Trigger an AOF rewrite if needed: AOF must be on, no RDB or AOF
          * child may exist, percentage-based rewriting must be enabled and
          * the file must be larger than the configured minimum size. */
         if (server.aof_state == AOF_ON && // AOF is enabled
             server.rdb_child_pid == -1 && // no RDB child process
             server.aof_child_pid == -1 && // no AOF child process
             server.aof_rewrite_perc && // percentage-based rewrite is enabled (non-zero)
             server.aof_current_size > server.aof_rewrite_min_size)// file is above the minimum rewrite size
         {
            /* Use 1 as the base when no base size is recorded, to avoid a
             * division by zero. */
            long long base = server.aof_rewrite_base_size ?
                            server.aof_rewrite_base_size : 1;
            /* Growth of the current size over the base, in percent. */
            long long growth = (server.aof_current_size*100/base) - 100;
            if (growth >= server.aof_rewrite_perc) { // grew at least the configured percentage: rewrite
                serverLog(LL_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth);
                rewriteAppendOnlyFileBackground(); // start the background rewrite
            }
         }
    }

对于rewrite来说Redis使用的是fork,启用一个新的子进程,然后在子进程里面把rdb里面的数据序列化到新的aof文件中。在子进程写rdb的过程中,使用pipe通信来把主进程新的修改接收保存。在rdb写完之后追加到aof文件中。报告写入完成,停止接收新的修改最后结束。主进程把aof文件替换。也把子进程到主进程切换过程中的写入追加到aof。aof_fd切换成新的fd。整个流程完成。
首先查看pipe的建立

/* Create the three pipes used between the parent and the AOF rewrite child
 * and store their ends in the server structure:
 *
 *   fds[0]/fds[1]  parent -> child data (both ends set non blocking)
 *   fds[2]/fds[3]  child -> parent ack
 *   fds[4]/fds[5]  parent -> child ack
 *
 * A readable handler (aofChildPipeReadable) is installed on the read end of
 * the child -> parent ack pipe.
 *
 * Returns C_OK on success. On any failure the error is logged, every
 * descriptor opened so far is closed and C_ERR is returned. */
int aofCreatePipes(void) {
    int fds[6];
    int i;

    for (i = 0; i < 6; i++) fds[i] = -1;

    /* Open the pipes pair by pair: data, child ack, parent ack. */
    for (i = 0; i < 6; i += 2) {
        if (pipe(&fds[i]) == -1) goto fail;
    }

    /* Parent -> children data is non blocking. */
    if (anetNonBlock(NULL,fds[0]) != ANET_OK ||
        anetNonBlock(NULL,fds[1]) != ANET_OK)
    {
        goto fail;
    }

    /* Get notified when the child writes on its ack pipe. */
    if (aeCreateFileEvent(server.el, fds[2], AE_READABLE,
                          aofChildPipeReadable, NULL) == AE_ERR)
    {
        goto fail;
    }

    /* In each pair index 0 is the read end and index 1 the write end. */
    server.aof_pipe_read_data_from_parent = fds[0];
    server.aof_pipe_write_data_to_child = fds[1];
    server.aof_pipe_read_ack_from_child = fds[2];
    server.aof_pipe_write_ack_to_parent = fds[3];
    server.aof_pipe_read_ack_from_parent = fds[4];
    server.aof_pipe_write_ack_to_child = fds[5];
    /* Diffs may be sent until aofChildPipeReadable() sets this flag. */
    server.aof_stop_sending_diff = 0;
    return C_OK;

fail:
    serverLog(LL_WARNING,"Error opening /setting AOF rewrite IPC pipes: %s",
        strerror(errno));
    i = 0;
    while (i < 6) {
        if (fds[i] != -1) close(fds[i]);
        i++;
    }
    return C_ERR;
}
/* Open the pipe stored in server.child_info_pipe and make its read end
 * non blocking. On success server.child_info_data is zeroed; if either step
 * fails closeChildInfoPipe() is called instead. */
void openChildInfoPipe(void) {
    /* On a pipe() error the two descriptors should still be -1, but
     * closeChildInfoPipe() is called anyway since it can't hurt. */
    if (pipe(server.child_info_pipe) == -1 ||
        anetNonBlock(NULL,server.child_info_pipe[0]) != ANET_OK)
    {
        closeChildInfoPipe();
        return;
    }
    memset(&server.child_info_data,0,sizeof(server.child_info_data));
}

一共有三组pipe。对于pipe来说第一个作为读第二个作为写端。对于data这种数据量比较大的来说。将会采用异步的方式进行发送。对于ack这类消息。因为只有一个字节。就不那么麻烦了。同步锤完。
来看子进程核心的写方法rewriteAppendOnlyFile

int rewriteAppendOnlyFile(char *filename) {
    rio aof;
    FILE *fp;
    char tmpfile[256];
    char byte;

    /* Note that we have to use a different temp name here compared to the
     * one used by rewriteAppendOnlyFileBackground() function. */
    snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());// 创建文件
    fp = fopen(tmpfile,"w");//write
    if (!fp) {
        serverLog(LL_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));
        return C_ERR;
    }

    server.aof_child_diff = sdsempty(); //  父进程写过来的数据 保存在写rbd期间 主进程的修改信息 格式是个aof追加的格式是一致的
    rioInitWithFile(&aof,fp);//rio

    if (server.aof_rewrite_incremental_fsync)
        rioSetAutoSync(&aof,AOF_AUTOSYNC_BYTES);

    if (server.aof_use_rdb_preamble) {//写入rdb数据的方式
        int error;
        if (rdbSaveRio(&aof,&error,RDB_SAVE_AOF_PREAMBLE,NULL) == C_ERR) {//这里是使用rdb的编码方式写入aof文件
            errno = error;
            goto werr;
        }
    } else {
        if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr;//这里是将rdb里面的内容转换成command的模式写入
    }

    /* Do an initial slow fsync here while the parent is still sending
     * data, in order to make the next final fsync faster. */
    if (fflush(fp) == EOF) goto werr; //刷入
    if (fsync(fileno(fp)) == -1) goto werr;

    /* Read again a few times to get more data from the parent.
     * We can't read forever (the server may receive data from clients
     * faster than it is able to send data to the child), so we try to read
     * some more data in a loop as soon as there is a good chance more data
     * will come. If it looks like we are wasting time, we abort (this
     * happens after 20 ms without new data). */
    int nodata = 0;
    mstime_t start = mstime();
    while(mstime()-start < 1000 && nodata < 20) {//讲道理20ms你没给我发消息了 我就不等了 我特最多等一秒钟
        if (aeWait(server.aof_pipe_read_data_from_parent, AE_READABLE, 1) <= 0)//每秒等待一次数据的到来
        {
            nodata++;
            continue;
        }
        nodata = 0; /* Start counting from zero, we stop on N *contiguous*
                       timeouts. */
        aofReadDiffFromParent();//同步父进程发来的更新信息
    }

    /* Ask the master to stop sending diffs. */
    if (write(server.aof_pipe_write_ack_to_parent,"!",1) != 1) goto werr; //通知父进程 停止发送diff
    if (anetNonBlock(NULL,server.aof_pipe_read_ack_from_parent) != ANET_OK)//异步收取父进程ack
        goto werr;
    /* We read the ACK from the server using a 10 seconds timeout. Normally
     * it should reply ASAP, but just in case we lose its reply, we are sure
     * the child will eventually get terminated. */
    if (syncRead(server.aof_pipe_read_ack_from_parent,&byte,1,5000) != 1 ||
        byte != '!') goto werr; //给你五秒钟的时间考虑给我发消息 
    serverLog(LL_NOTICE,"Parent agreed to stop sending diffs. Finalizing AOF...");

    /* Read the final diff if any. */
    aofReadDiffFromParent();// 我发送的时候 父进程可能又锤了点数据过来

    /* Write the received diff to the file. */
    serverLog(LL_NOTICE,
        "Concatenating %.2f MB of AOF diff received from parent.",
        (double) sdslen(server.aof_child_diff) / (1024*1024)); //获取更新的大小
    if (rioWrite(&aof,server.aof_child_diff,sdslen(server.aof_child_diff)) == 0)//把这个更新的数据锤进去
        goto werr;

    /* Make sure data will not remain on the OS's output buffers */
    if (fflush(fp) == EOF) goto werr; //flush
    if (fsync(fileno(fp)) == -1) goto werr; //fsync
    if (fclose(fp) == EOF) goto werr;

    /* Use RENAME to make sure the DB file is changed atomically only
     * if the generate DB file is ok. */
    if (rename(tmpfile,filename) == -1) { //改名字
        serverLog(LL_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
        unlink(tmpfile);
        return C_ERR;
    }
    serverLog(LL_NOTICE,"SYNC append only file rewrite performed");
    return C_OK;

werr:
    serverLog(LL_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
    fclose(fp);
    unlink(tmpfile);
    return C_ERR;
}
/* Write the whole dataset to 'aof' as a sequence of commands.
 *
 * For every non-empty database a SELECT is emitted, followed by the commands
 * recreating each key and, for keys with an expire, a PEXPIREAT. Keys whose
 * expire time is already in the past are skipped. From time to time the diff
 * sent by the parent is read via aofReadDiffFromParent().
 *
 * Returns C_OK on success, C_ERR as soon as a write reports failure. */
int rewriteAppendOnlyFileRio(rio *aof) {
    char select_hdr[] = "*2\r\n$6\r\nSELECT\r\n";
    char set_hdr[] = "*3\r\n$3\r\nSET\r\n";
    char pexpireat_hdr[] = "*3\r\n$9\r\nPEXPIREAT\r\n";
    dictIterator *iter = NULL;
    dictEntry *entry;
    size_t last_diff_read = 0;
    long long started = mstime();
    int dbid;

    for (dbid = 0; dbid < server.dbnum; dbid++) {
        redisDb *db = server.db+dbid;
        dict *keys = db->dict;

        /* Nothing to emit for an empty database. */
        if (dictSize(keys) == 0) continue;
        iter = dictGetSafeIterator(keys);

        /* SELECT the new DB */
        if (rioWrite(aof,select_hdr,sizeof(select_hdr)-1) == 0 ||
            rioWriteBulkLongLong(aof,dbid) == 0)
        {
            goto werr;
        }

        /* Iterate this DB writing every entry */
        for (entry = dictNext(iter); entry != NULL; entry = dictNext(iter)) {
            sds keyname = dictGetKey(entry);
            robj *val = dictGetVal(entry);
            robj keyobj;
            long long expire_at;
            int ok = 1;

            initStaticStringObject(keyobj,keyname);
            expire_at = getExpire(db,&keyobj);

            /* If this key is already expired skip it */
            if (expire_at != -1 && expire_at < started) continue;

            /* Save the key and associated value; every type other than
             * strings is delegated to its own rewrite helper. */
            if (val->type == OBJ_STRING) {
                /* Emit a SET command: header, key, value. */
                ok = rioWrite(aof,set_hdr,sizeof(set_hdr)-1) != 0 &&
                     rioWriteBulkObject(aof,&keyobj) != 0 &&
                     rioWriteBulkObject(aof,val) != 0;
            } else if (val->type == OBJ_LIST) {
                ok = rewriteListObject(aof,&keyobj,val) != 0;
            } else if (val->type == OBJ_SET) {
                ok = rewriteSetObject(aof,&keyobj,val) != 0;
            } else if (val->type == OBJ_ZSET) {
                ok = rewriteSortedSetObject(aof,&keyobj,val) != 0;
            } else if (val->type == OBJ_HASH) {
                ok = rewriteHashObject(aof,&keyobj,val) != 0;
            } else if (val->type == OBJ_MODULE) {
                ok = rewriteModuleObject(aof,&keyobj,val) != 0;
            } else {
                serverPanic("Unknown object type");
            }
            if (!ok) goto werr;

            /* Save the expire time */
            if (expire_at != -1) {
                if (rioWrite(aof,pexpireat_hdr,sizeof(pexpireat_hdr)-1) == 0 ||
                    rioWriteBulkObject(aof,&keyobj) == 0 ||
                    rioWriteBulkLongLong(aof,expire_at) == 0)
                {
                    goto werr;
                }
            }

            /* Read some diff from the parent process from time to time. */
            if (aof->processed_bytes > last_diff_read+AOF_READ_DIFF_INTERVAL_BYTES) {
                last_diff_read = aof->processed_bytes;
                aofReadDiffFromParent();
            }
        }
        dictReleaseIterator(iter);
        iter = NULL;
    }
    return C_OK;

werr:
    if (iter) dictReleaseIterator(iter);
    return C_ERR;
}

这个流程里面都是常规的操作。把rdb锤进aof有两种方式嘛,随意选一种。在通知父进程终止之后再来读取diff一次是因为在发送过程中父进程可能已经又发了消息。这样可以防止这个消息的丢失。
最后把diff写入aof这个过程就结束了。
来看下rewrite的时候主进程做的事情

/* Start an AOF rewrite in a forked child.
 *
 * The child writes the rewritten file to "temp-rewriteaof-bg-<pid>.aof" via
 * rewriteAppendOnlyFile(), reports its info to the parent and exits with
 * code 0 on success or 1 on failure. The parent records fork statistics and
 * the child's pid.
 *
 * Returns C_ERR when an RDB or AOF child already exists, when the pipes
 * cannot be created, or when fork() fails; C_OK otherwise. */
int rewriteAppendOnlyFileBackground(void) {
    pid_t pid;
    long long fork_started;

    /* Only one child at a time. */
    if (server.rdb_child_pid != -1 || server.aof_child_pid != -1) return C_ERR;
    /* Pipes used to exchange diffs and acks with the child. */
    if (aofCreatePipes() != C_OK) return C_ERR;
    /* Pipe the child uses to report its info. */
    openChildInfoPipe();

    fork_started = ustime();
    pid = fork();

    if (pid == 0) {
        /* Child */
        char tmpfile[256];

        closeListeningSockets(0);
        redisSetProcTitle("redis-aof-rewrite");
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
        if (rewriteAppendOnlyFile(tmpfile) != C_OK) {
            /* Exit code 1 tells the parent the rewrite failed. */
            exitFromChild(1);
        } else {
            size_t private_dirty = zmalloc_get_private_dirty(-1);

            if (private_dirty) {
                serverLog(LL_NOTICE,
                    "AOF rewrite: %zu MB of memory used by copy-on-write",
                    private_dirty/(1024*1024));
            }

            server.child_info_data.cow_size = private_dirty;
            sendChildInfo(CHILD_INFO_TYPE_AOF);
            exitFromChild(0);
        }
        return C_OK; /* unreached */
    }

    /* Parent: record how long the fork() call took and the resulting rate. */
    server.stat_fork_time = ustime()-fork_started;
    server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */
    latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000);

    if (pid == -1) {
        /* fork() failed: release both sets of pipes. */
        closeChildInfoPipe();
        serverLog(LL_WARNING,
            "Can't rewrite append only file in background: fork: %s",
            strerror(errno));
        aofClosePipes();
        return C_ERR;
    }

    serverLog(LL_NOTICE,
        "Background append only file rewriting started by pid %d",pid);
    server.aof_rewrite_scheduled = 0;
    server.aof_rewrite_time_start = time(NULL);
    server.aof_child_pid = pid;
    /* Presumably adjusts hash table resizing now that a child exists --
     * confirm against updateDictResizePolicy(). */
    updateDictResizePolicy();
    /* We set appendseldb to -1 in order to force the next call to the
     * feedAppendOnlyFile() to issue a SELECT command, so the differences
     * accumulated by the parent into server.aof_rewrite_buf will start
     * with a SELECT statement and it will be safe to merge. */
    server.aof_selected_db = -1;
    replicationScriptCacheFlush();
    return C_OK;
}

在操作中主要是打开管道通信然后发起fork。记录子进程的开始时间,pid这些信息。最主要的一点是updateDictResizePolicy();这句话是意思是屏蔽rehash.因为我们的子进程使用的是fork。fork以后子进程就拥有一个主进程的内存拷贝。但是真正的拷贝的话,在db特别大的时候很花时间,所以操作系统使用的是写时拷贝。在物理内存上,主进程和子进程指向的同一个地址,逻辑上不一样。内存的读取不会触发额外操作。只有在写的时候触发缺页中断,造成拷贝。子进程和主进程内存页的分离。所以要尽量避免主进程和子进程内相同内存块的修改。刚好rehash会造成大量相同内存页的修改,他就被屏蔽了。

子进程建立完成操作

子进程建立完成之后主进程主要是监控子进程的退出,发送write数据到子进程。接收子进程返回的信息。

write

在我们看aof_buf追加的时候看到一句话
if (server.aof_child_pid != -1)
aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));// an AOF child exists: queue the data for it
对,这句话就是在rewrite的时候发送write到子进程

/* Append 'len' bytes starting at 's' to the AOF rewrite buffer, a list of
 * fixed-size blocks (server.aof_rewrite_buf_blocks).
 *
 * The last block is filled first; new blocks of AOF_RW_BUF_BLOCK_SIZE bytes
 * are allocated as needed. Finally a writable event handler is installed on
 * the data pipe to the child if no event is registered on it yet. */
void aofRewriteBufferAppend(unsigned char *s, unsigned long len) {
    listNode *tail = listLast(server.aof_rewrite_buf_blocks);
    aofrwblock *cur = NULL;

    if (tail) cur = tail->value;

    while (len != 0) {
        /* If we already got at least an allocated block, try appending
         * at least some piece into it. */
        if (cur != NULL) {
            unsigned long chunk = len;

            if (cur->free < len) chunk = cur->free;
            if (chunk != 0) { /* The current block is not already full. */
                memcpy(cur->buf+cur->used, s, chunk);
                cur->used += chunk;
                cur->free -= chunk;
                s += chunk;
                len -= chunk;
            }
        }

        /* Everything fit: no new block is required. */
        if (len == 0) break;

        /* First block to allocate, or need another block. */
        cur = zmalloc(sizeof(*cur));
        cur->free = AOF_RW_BUF_BLOCK_SIZE;
        cur->used = 0;
        listAddNodeTail(server.aof_rewrite_buf_blocks,cur);

        /* Log every time we cross more 10 or 100 blocks, respectively
         * as a notice or warning. */
        int numblocks = listLength(server.aof_rewrite_buf_blocks);
        if (((numblocks+1) % 10) == 0) {
            int level;

            if (((numblocks+1) % 100) == 0)
                level = LL_WARNING;
            else
                level = LL_NOTICE;
            serverLog(level,"Background AOF buffer size: %lu MB",
                aofRewriteBufferSize()/(1024*1024));
        }
    }

    /* Install a file event to send data to the rewrite child if there is
     * not one already. */
    if (aeGetFileEvents(server.el,server.aof_pipe_write_data_to_child) == 0) {
        aeCreateFileEvent(server.el, server.aof_pipe_write_data_to_child,
            AE_WRITABLE, aofChildWriteDiffData, NULL);
    }
}

在aof_pipe_write_data_to_child的write准备好的时候调用aofChildWriteDiffData写消息

/* Event handler called when the data pipe to the rewrite child is writable.
 *
 * Blocks of the rewrite buffer are written to the pipe starting from the
 * head of the list; fully written blocks are removed. The handler
 * unregisters itself when the list is empty or when
 * server.aof_stop_sending_diff is set, and simply returns when write()
 * transfers nothing (error or pipe not ready). */
void aofChildWriteDiffData(aeEventLoop *el, int fd, void *privdata, int mask) {
    UNUSED(el);
    UNUSED(fd);
    UNUSED(privdata);
    UNUSED(mask);

    for (;;) {
        listNode *head = listFirst(server.aof_rewrite_buf_blocks);
        aofrwblock *blk = head ? head->value : NULL;

        /* Nothing left to send, or asked to stop. */
        if (blk == NULL || server.aof_stop_sending_diff) break;

        if (blk->used > 0) {
            ssize_t sent = write(server.aof_pipe_write_data_to_child,
                                 blk->buf,blk->used);

            /* Failure or no room in the pipe: wait for the next event. */
            if (sent <= 0) return;

            /* Move the unsent remainder to the start of the block. */
            memmove(blk->buf,blk->buf+sent,blk->used-sent);
            blk->used -= sent;
            blk->free += sent;
        }

        /* The block is drained: drop it from the list. */
        if (blk->used == 0) listDelNode(server.aof_rewrite_buf_blocks,head);
    }

    aeDeleteFileEvent(server.el,server.aof_pipe_write_data_to_child,
                      AE_WRITABLE);
}
接收子进程信号

在aofCreatePipes有这么一句
if (aeCreateFileEvent(server.el, fds[2], AE_READABLE, aofChildPipeReadable, NULL) == AE_ERR) goto error; // fds[2] is aof_pipe_read_ack_from_child: register the readable callback on it

/* Event handler called when the child -> parent ack pipe is readable.
 *
 * If the single byte read is '!', the child asked to stop receiving diffs:
 * server.aof_stop_sending_diff is set and a '!' is written back on the
 * parent -> child ack pipe. In every case the handler is then removed. */
void aofChildPipeReadable(aeEventLoop *el, int fd, void *privdata, int mask) {
    char c;
    ssize_t got;
    UNUSED(el);
    UNUSED(privdata);
    UNUSED(mask);

    got = read(fd,&c,1);
    if (got == 1 && c == '!') {
        ssize_t acked;

        serverLog(LL_NOTICE,"AOF rewrite child asks to stop sending diffs.");
        server.aof_stop_sending_diff = 1;
        acked = write(server.aof_pipe_write_ack_to_child,"!",1);
        if (acked != 1) {
            /* If we can't send the ack, inform the user, but don't try again
             * since in the other side the children will use a timeout if the
             * kernel can't buffer our write, or, the children was
             * terminated. */
            serverLog(LL_WARNING,"Can't send ACK to AOF child: %s",
                strerror(errno));
        }
    }

    /* Remove the handler since this can be called only one time during a
     * rewrite. */
    aeDeleteFileEvent(server.el,server.aof_pipe_read_ack_from_child,AE_READABLE);
}

接收到标记后会停止往子进程写write。如果这个时候主线程很忙,子进程在等待五秒之后就会结束。造成rewrite的失败

主进程的监控
    /* If an RDB child, an AOF child or a pending ldb child exists, check
     * whether some child terminated and dispatch to the matching handler. */
    if (server.rdb_child_pid != -1 || server.aof_child_pid != -1 ||
        ldbPendingChildren())
    {
        int statloc;
        pid_t pid;

        if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {// WNOHANG: do not block waiting for a child
            /* NOTE(review): statloc is read here before the pid == -1 check
             * below; when wait3() fails it may not have been written. */
            int exitcode = WEXITSTATUS(statloc);
            int bysignal = 0;

            if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc); // the child was terminated by a signal: remember which one

            if (pid == -1) {// wait3() itself failed
                serverLog(LL_WARNING,"wait3() returned an error: %s. "
                    "rdb_child_pid = %d, aof_child_pid = %d",
                    strerror(errno),
                    (int) server.rdb_child_pid,
                    (int) server.aof_child_pid);
            } else if (pid == server.rdb_child_pid) {// the RDB child terminated
                backgroundSaveDoneHandler(exitcode,bysignal);
                if (!bysignal && exitcode == 0) receiveChildInfo();
            } else if (pid == server.aof_child_pid) {// the AOF rewrite child terminated
                backgroundRewriteDoneHandler(exitcode,bysignal);
                if (!bysignal && exitcode == 0) receiveChildInfo();
            } else {
                /* Neither the RDB nor the AOF child: try the ldb children. */
                if (!ldbRemoveChild(pid)) {
                    serverLog(LL_WARNING,
                        "Warning, detected child with unmatched pid: %ld",
                        (long)pid);
                }
            }
            updateDictResizePolicy();
            closeChildInfoPipe();
        }
    }

主进程在循环中监控子进程的状态,判断是否退出。然后根据程序的返回值进行操作

结束后的操作
/* Called by the parent once the background AOF rewrite child has terminated.
 *
 * exitcode: exit status of the child (meaningful when bysignal is 0).
 * bysignal: 0 if the child exited by itself, otherwise the number of the
 *           signal that terminated it.
 *
 * On a clean exit the parent appends the diff it accumulated during the
 * rewrite to the temporary file produced by the child, renames that file
 * over server.aof_filename and, when AOF is enabled, switches server.aof_fd
 * to the new file. On every path the 'cleanup' section closes the rewrite
 * pipes, resets the rewrite buffer, removes the temp file and clears the
 * child bookkeeping. */
void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
    if (!bysignal && exitcode == 0) { /* The child terminated normally. */
        int newfd, oldfd;
        char tmpfile[256];
        long long now = ustime();
        mstime_t latency;

        serverLog(LL_NOTICE,
            "Background AOF rewrite terminated with success");

        /* Flush the differences accumulated by the parent to the
         * rewritten AOF. */
        latencyStartMonitor(latency);
        snprintf(tmpfile,sizeof(tmpfile),"temp-rewriteaof-bg-%d.aof",
            (int)server.aof_child_pid);
        newfd = open(tmpfile,O_WRONLY|O_APPEND);
        if (newfd == -1) {
            serverLog(LL_WARNING,
                "Unable to open the temporary AOF produced by the child: %s", strerror(errno));
            goto cleanup;
        }

        /* Append the data that was still buffered in the parent. */
        if (aofRewriteBufferWrite(newfd) == -1) {
            serverLog(LL_WARNING,
                "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno));
            close(newfd);
            goto cleanup;
        }
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("aof-rewrite-diff-write",latency);

        serverLog(LL_NOTICE,
            "Residual parent diff successfully flushed to the rewritten AOF (%.2f MB)", (double) aofRewriteBufferSize() / (1024*1024));

        if (server.aof_fd == -1) {
            /* AOF disabled */

            /* Don't care if this fails: oldfd will be -1 and we handle that.
             * One notable case of -1 return is if the old file does
             * not exist. */
            oldfd = open(server.aof_filename,O_RDONLY|O_NONBLOCK);
        } else {
            /* AOF enabled */
            oldfd = -1; /* We'll set this to the current AOF filedes later. */
        }

        /* Rename the temporary file. This will not unlink the target file if
         * it exists, because we reference it with "oldfd". rename() replaces
         * an already existing destination. */
        latencyStartMonitor(latency);
        if (rename(tmpfile,server.aof_filename) == -1) {
            serverLog(LL_WARNING,
                "Error trying to rename the temporary AOF file %s into %s: %s",
                tmpfile,
                server.aof_filename,
                strerror(errno));
            close(newfd);
            if (oldfd != -1) close(oldfd);
            goto cleanup;
        }
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("aof-rename",latency);

        if (server.aof_fd == -1) {
            /* AOF disabled, we don't need to set the AOF file descriptor
             * to this new file, so we can close it. */
            close(newfd);
        } else {
            /* AOF enabled, replace the old fd with the new one. */
            oldfd = server.aof_fd;
            server.aof_fd = newfd;
            if (server.aof_fsync == AOF_FSYNC_ALWAYS)
                aof_fsync(newfd);
            else if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
                aof_background_fsync(newfd); /* fsync in the background */
            server.aof_selected_db = -1; /* Make sure SELECT is re-issued */
            aofUpdateCurrentSize();
            /* The size right after the rewrite becomes the new base size. */
            server.aof_rewrite_base_size = server.aof_current_size;

            /* Clear regular AOF buffer since its contents was just written to
             * the new AOF from the background rewrite buffer. */
            sdsfree(server.aof_buf);
            server.aof_buf = sdsempty();
        }

        server.aof_lastbgrewrite_status = C_OK;

        serverLog(LL_NOTICE, "Background AOF rewrite finished successfully");
        /* Change state from WAIT_REWRITE to ON if needed */
        if (server.aof_state == AOF_WAIT_REWRITE)
            server.aof_state = AOF_ON;

        /* Asynchronously close the overwritten AOF. */
        if (oldfd != -1) bioCreateBackgroundJob(BIO_CLOSE_FILE,(void*)(long)oldfd,NULL,NULL);

        serverLog(LL_VERBOSE,
            "Background AOF rewrite signal handler took %lldus", ustime()-now);
    } else if (!bysignal && exitcode != 0) {
        /* The child exited by itself with a non-zero status. */
        server.aof_lastbgrewrite_status = C_ERR;

        serverLog(LL_WARNING,
            "Background AOF rewrite terminated with error");
    } else {
        /* The child was terminated by a signal.
         *
         * SIGUSR1 is whitelisted, so we have a way to kill a child without
         * triggering an error condition. The check has to live in this
         * branch: bysignal is always 0 in the two branches above. */
        if (bysignal != SIGUSR1)
            server.aof_lastbgrewrite_status = C_ERR;

        serverLog(LL_WARNING,
            "Background AOF rewrite terminated by signal %d", bysignal);
    }

cleanup:
    aofClosePipes();
    aofRewriteBufferReset();
    aofRemoveTempFile(server.aof_child_pid);
    server.aof_child_pid = -1;
    server.aof_rewrite_time_last = time(NULL)-server.aof_rewrite_time_start;
    server.aof_rewrite_time_start = -1;
    /* Schedule a new rewrite if we are waiting for it to switch the AOF ON. */
    if (server.aof_state == AOF_WAIT_REWRITE)
        server.aof_rewrite_scheduled = 1;
}

主进程首先判断rewrite子进程是否成功结束。成功的话,主进程主要做这些事情:
1. aofRewriteBufferWrite:把rewrite缓冲区中还没有发送到子进程的数据写入新的aof文件
2. rename(tmpfile,server.aof_filename):把新的aof文件重命名为aof_filename。如果目标文件已经存在,rename会用新文件原子地替换它
3. 更新变量信息,比如文件大小、新的fd,并在后台任务中关闭老的fd等。后面的aof追加就会使用新的fd进行写入了
4. 关闭pipe通信并删除临时文件。如果此时aof_state仍然是AOF_WAIT_REWRITE(说明本次rewrite没有成功),则设置aof_rewrite_scheduled标志,稍后重新进行rewrite

你可能感兴趣的:(Redis AOF持久化存储)