Redis源码分析(十五)——持久化AOF

Redis提供的第二种持久化机制AOF(Append-Only File),正如其名字,是以协议文本的方式将所有对数据库进行写的命令(及其参数)记录到追加到AOF文件,以此达到记录数据库状态的目的。


基本过程为: 客户端通过网络协议向服务器发起命令请求——>服务器选择命令函数并创建命令参数对象——>数数据库执行命令操作。/同时服务器将命令操作广播到AOF程序——>AOF程序将命令及参数对象、数据库ID等参数还原为协议文本——>追加到服务器的aof_buf缓存中。——>当服务器的常规任务函数或者事件处理器执行时调用flushAppendOnlyFile函数将aof_buf缓存中数据写入临时AOF文件——>由同步函数fsync/fdatasync将AOF文件保存到磁盘。


其中主要是文件的写入和保存:

函数flushAppendOnlyFile(简称函数Func)执行一下两个工作:

WRITE:根据条件,将aof_buf中的缓存写入到AOF文件

SAVE:根据条件,调用fsync或fdatasync函数将AOF文件保存到磁盘

两个步骤都需要根据一定的条件来执行,条件由AOF所使用的保存模式决定

AOF保存模式:

** AOF_FSYNC_NO:不保存   (WRITE/SAVE都由主进程成完成)

**AOF_FSYNC_EVERYSEC: 每秒钟保存一次  (WRITE由主进程完成,SAVE由子线程完成)

**AOF_FSYNC_ALWAYS:每执行一次命令保存一次 (  WRITE/SAVE都由主进程成完成


其中 AOF_FSYNC_NO模式,每次执行Func都会执行WRITE,但是SAVE只有在以下情况才会执行: Redis被关闭 或者  AOF功能别关闭  或者 系统的缓村被刷新 。

AOF_FSYNC_EVERYSEC模式: 原则上每秒钟执行一次SAVE(后后台子线程完成),但实际上与当前Redis的状态有关如图:

Redis源码分析(十五)——持久化AOF_第1张图片

因此在每秒钟保存一次的模式下,如果情况1发生故障停机,那么将最多损失小于2秒的数据,在情况2下则可能超过损失2秒的数据。

AOF_FSYNC_ALWAYS模式:在这种模式下,每次执行一个命令都会执行WRITE 和SAVE。且由主进程完成。


以上三种模式,其安全性能依次增加,而效率逐渐下降。 



读入AOF文件和保存的主要函数:


将 AOF 缓存写入到文件中:

void flushAppendOnlyFile(int force) {
    ssize_t nwritten;
    int sync_in_progress = 0;

    // 缓冲区中没有任何内容,直接返回
    if (sdslen(server.aof_buf) == 0) return;

    // 策略为每秒 FSYNC : 实际将根据当前Redis的状态有关,而非真正的每秒钟就能执行一次SAVE操作:一共四种情况:
	if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
        // 是否有 SYNC 正在后台进行?
        sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0;

    // 每秒 fsync ,并且强制写入为假
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {

        /* With this append fsync policy we do background fsyncing.
         *
         * 当 fsync 策略为每秒钟一次时, fsync 在后台执行。
         *
         * If the fsync is still in progress we can try to delay
         * the write for a couple of seconds. 
         *
         * 如果后台仍在执行 FSYNC ,那么我们可以延迟写操作一两秒
         * (如果强制执行 write 的话,服务器主线程将阻塞在 write 上面)
         */
        if (sync_in_progress) {

            // 有 fsync 正在后台进行 。。。

            if (server.aof_flush_postponed_start == 0) {
                /* No previous write postponinig, remember that we are
                 * postponing the flush and return. 
                 *
                 * 前面没有推迟过 write 操作,这里将推迟写操作的时间记录下来
                 * 然后就返回,不执行 write 或者 fsync
                 */
                server.aof_flush_postponed_start = server.unixtime;
                return;

            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
                /* We were already waiting for fsync to finish, but for less
                 * than two seconds this is still ok. Postpone again. 
                 *
                 * 如果之前已经因为 fsync 而推迟了 write 操作
                 * 但是推迟的时间不超过 2 秒,那么直接返回
                 * 不执行 write 或者 fsync
                 */
                return;

            }

            /* Otherwise fall trough, and go write since we can't wait
             * over two seconds. 
             *
             * 如果后台还有 fsync 在执行,并且 write 已经推迟 >= 2 秒
             * 那么执行写操作(write 将被阻塞)
             */
            server.aof_delayed_fsync++;
            redisLog(REDIS_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
        }
    }

    /* If you are following this code path, then we are going to write so
     * set reset the postponed flush sentinel to zero. 
     *
     * 执行到这里,程序会对 AOF 文件进行写入。
     *
     * 清零延迟 write 的时间记录
     */
    server.aof_flush_postponed_start = 0;

    /* We want to perform a single write. This should be guaranteed atomic
     * at least if the filesystem we are writing is a real physical one.
     *
     * 执行单个 write 操作,如果写入设备是物理的话,那么这个操作应该是原子的
     *
     * While this will save us against the server being killed I don't think
     * there is much to do about the whole server stopping for power problems
     * or alike 
     *
     * 当然,如果出现像电源中断这样的不可抗现象,那么 AOF 文件也是可能会出现问题的
     * 这时就要用 redis-check-aof 程序来进行修复。
     */
    nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));//将aof_buf缓冲写入到AOF文件
    if (nwritten != (signed)sdslen(server.aof_buf)) {

        static time_t last_write_error_log = 0;
        int can_log = 0;

        /* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */
        // 将日志的记录频率限制在每行 AOF_WRITE_LOG_ERROR_RATE 秒
        if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) {
            can_log = 1;
            last_write_error_log = server.unixtime;
        }

        /* Lof the AOF write error and record the error code. */
        // 如果写入出错,那么尝试将该情况写入到日志里面
        if (nwritten == -1) {
            if (can_log) {
                redisLog(REDIS_WARNING,"Error writing to the AOF file: %s",
                    strerror(errno));
                server.aof_last_write_errno = errno;
            }
        } else {
            if (can_log) {
                redisLog(REDIS_WARNING,"Short write while writing to "
                                       "the AOF file: (nwritten=%lld, "
                                       "expected=%lld)",
                                       (long long)nwritten,
                                       (long long)sdslen(server.aof_buf));
            }

            // 尝试移除新追加的不完整内容
            if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
                if (can_log) {
                    redisLog(REDIS_WARNING, "Could not remove short write "
                             "from the append-only file.  Redis may refuse "
                             "to load the AOF the next time it starts.  "
                             "ftruncate: %s", strerror(errno));
                }
            } else {
                /* If the ftrunacate() succeeded we can set nwritten to
                 * -1 since there is no longer partial data into the AOF. */
                nwritten = -1;
            }
            server.aof_last_write_errno = ENOSPC;
        }

        /* Handle the AOF write error. */
        // 处理写入 AOF 文件时出现的错误
        if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
            /* We can't recover when the fsync policy is ALWAYS since the
             * reply for the client is already in the output buffers, and we
             * have the contract with the user that on acknowledged write data
             * is synched on disk. */
            redisLog(REDIS_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");
            exit(1);
        } else {
            /* Recover from failed write leaving data into the buffer. However
             * set an error to stop accepting writes as long as the error
             * condition is not cleared. */
            server.aof_last_write_status = REDIS_ERR;

            /* Trim the sds buffer if there was a partial write, and there
             * was no way to undo it with ftruncate(2). */
            if (nwritten > 0) {
                server.aof_current_size += nwritten;
                sdsrange(server.aof_buf,nwritten,-1);
            }
            return; /* We'll try again on the next call... */
        }
    } else {
        /* Successful write(2). If AOF was in error state, restore the
         * OK state and log the event. */
        // 写入成功,更新最后写入状态
        if (server.aof_last_write_status == REDIS_ERR) {
            redisLog(REDIS_WARNING,
                "AOF write error looks solved, Redis can write again.");
            server.aof_last_write_status = REDIS_OK;
        }
    }

    // 更新写入后的 AOF 文件大小
    server.aof_current_size += nwritten;

    /* Re-use AOF buffer when it is small enough. The maximum comes from the
     * arena size of 4k minus some overhead (but is otherwise arbitrary). 
     *
     * 如果 AOF 缓存的大小足够小的话,那么重用这个缓存,
     * 否则的话,释放 AOF 缓存。
     */
    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
        // 清空缓存中的内容,等待重用
        sdsclear(server.aof_buf);
    } else {
        // 释放缓存
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }

    /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
     * children doing I/O in the background. 
     *
     * 如果 no-appendfsync-on-rewrite 选项为开启状态,
     * 并且有 BGSAVE 或者 BGREWRITEAOF 正在进行的话,
     * 那么不执行 fsync 
     */
    if (server.aof_no_fsync_on_rewrite &&
        (server.aof_child_pid != -1 || server.rdb_child_pid != -1))
            return;

    /* Perform the fsync if needed. */

    // AOF_FSYNC_ALWAYS
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
        /* aof_fsync is defined as fdatasync() for Linux in order to avoid
         * flushing metadata. */
        aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */ 

        // 更新最后一次执行 fsnyc 的时间
        server.aof_last_fsync = server.unixtime;

    // 策略为每秒 fsnyc ,并且距离上次 fsync 已经超过 1 秒
    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) {
        // 放到后台执行
        if (!sync_in_progress) aof_background_fsync(server.aof_fd);
        // 更新最后一次执行 fsync 的时间
        server.aof_last_fsync = server.unixtime;
    }

  
}


接下来就是 逆向的处理过程:

AOF文件的读取和数据还原:

AOF文件以通信协议的文本格式保存了Redis数据库的状态。因此可以创建一个伪客户端来执行AOF文件中的命令,从而还原数据库的状态。

基本过程如下:

1、创建一个不带网络连接的伪客户端

2、读取AOF所保存的文本,并根据内容还原出命令、参数以及参数个数

3、用伪客户端执行读取的命令

4、执行2 、3直到AOF文件中保存的数据库都被还原出来。

完成第四步之后,AOF所保存的数据库状态就被完整的还原出来了。

加载函数:

int loadAppendOnlyFile(char *filename) {

    // 为客户端
    struct redisClient *fakeClient;

    // 打开 AOF 文件
    FILE *fp = fopen(filename,"r");

    struct redis_stat sb;
    int old_aof_state = server.aof_state;
    long loops = 0;

    // 检查文件的正确性
    if (fp && redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) {
        server.aof_current_size = 0;
        fclose(fp);
        return REDIS_ERR;
    }

    // 检查文件是否正常打开
    if (fp == NULL) {
        redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
        exit(1);
    }

    /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
     * to the same file we're about to read. 
     *
     * 暂时性地关闭 AOF ,防止在执行 MULTI 时,
     * EXEC 命令被传播到正在打开的 AOF 文件中。
     */
    server.aof_state = REDIS_AOF_OFF;

    fakeClient = createFakeClient();

    // 设置服务器的状态为:正在载入
    // startLoading 定义于 rdb.c
    startLoading(fp);

    while(1) {
        int argc, j;
        unsigned long len;
        robj **argv;
        char buf[128];
        sds argsds;
        struct redisCommand *cmd;

        /* Serve the clients from time to time 
         *
         * 间隔性地处理客户端发送来的请求
         * 因为服务器正处于载入状态,所以能正常执行的只有 PUBSUB 等模块
         */
        if (!(loops++ % 1000)) {
            loadingProgress(ftello(fp));
            processEventsWhileBlocked();
        }

        // 读入文件内容到缓存
        if (fgets(buf,sizeof(buf),fp) == NULL) {
            if (feof(fp))
                // 文件已经读完,跳出
                break;
            else
                goto readerr;
        }

        // 确认协议格式,比如 *3\r\n
        if (buf[0] != '*') goto fmterr;
        
        // 取出命令参数,比如 *3\r\n 中的 3
        argc = atoi(buf+1);

        // 至少要有一个参数(被调用的命令)
        if (argc < 1) goto fmterr;

        // 从文本中创建字符串对象:包括命令,以及命令参数
        // 例如 $3\r\nSET\r\n$3\r\nKEY\r\n$5\r\nVALUE\r\n
        // 将创建三个包含以下内容的字符串对象:
        // SET 、 KEY 、 VALUE
        argv = zmalloc(sizeof(robj*)*argc);
        for (j = 0; j < argc; j++) {
            if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;

            if (buf[0] != '$') goto fmterr;

            // 读取参数值的长度
            len = strtol(buf+1,NULL,10);
            // 读取参数值
            argsds = sdsnewlen(NULL,len);
            if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
            // 为参数创建对象
            argv[j] = createObject(REDIS_STRING,argsds);

            if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
        }

        /* Command lookup 
         *
         * 查找命令
         */
        cmd = lookupCommand(argv[0]->ptr);
        if (!cmd) {
            redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", (char*)argv[0]->ptr);
            exit(1);
        }

        /* Run the command in the context of a fake client 
         *
         * 调用伪客户端,执行命令
         */
        fakeClient->argc = argc;
        fakeClient->argv = argv;
        cmd->proc(fakeClient);

        /* The fake client should not have a reply */
        redisAssert(fakeClient->bufpos == 0 && listLength(fakeClient->reply) == 0);
        /* The fake client should never get blocked */
        redisAssert((fakeClient->flags & REDIS_BLOCKED) == 0);

        /* Clean up. Command code may have changed argv/argc so we use the
         * argv/argc of the client instead of the local variables. 
         *
         * 清理命令和命令参数对象
         */
        for (j = 0; j < fakeClient->argc; j++)
            decrRefCount(fakeClient->argv[j]);
        zfree(fakeClient->argv);
    }

    /* This point can only be reached when EOF is reached without errors.
     * If the client is in the middle of a MULTI/EXEC, log error and quit. 
     *
     * 如果能执行到这里,说明 AOF 文件的全部内容都可以正确地读取,
     * 但是,还要检查 AOF 是否包含未正确结束的事务
     */
    if (fakeClient->flags & REDIS_MULTI) goto readerr;

    // 关闭 AOF 文件
    fclose(fp);
    // 释放伪客户端
    freeFakeClient(fakeClient);
    // 复原 AOF 状态
    server.aof_state = old_aof_state;
    // 停止载入
    stopLoading();
    // 更新服务器状态中, AOF 文件的当前大小
    aofUpdateCurrentSize();
    // 记录前一次重写时的大小
    server.aof_rewrite_base_size = server.aof_current_size;
    
    return REDIS_OK;

// 读入错误
readerr:
    // 非预期的末尾,可能是 AOF 文件在写入的中途遭遇了停机
    if (feof(fp)) {
        redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
    
    // 文件内容出错
    } else {
        redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
    }
    exit(1);

// 内容格式错误
fmterr:
    redisLog(REDIS_WARNING,"Bad file format reading the append only file: make a backup of your AOF file, then use ./redis-check-aof --fix ");
    exit(1);
}

AOF文件的重写REWRITE / BGREWRITE:

AOF采用同步方式将执行的命令及参数等记录到AOF文件中,这将随着时间使得AOF文件越来越大。 因此Redis实现了AOF文件的重写机制,即将当前数据库的状态用最少的命令及参数记录到新的AOF文件中,然后利用新的AOF文件代替旧的AOF文件。 注意: 此处的”重写“实际是不准确的,因为新的AOF文件时直接来与当前的数据库的状态,而非旧的AOF文件。

  REWRITE由主进程来执行重写,这将阻塞主进程,由于重写数据量较大,所以这将严重影响性能。  而在后台重写 BGREWRITE ,以上过程(及新AOF文件的写入)都将由子进程来完成,当子进程写好新的AOF文件后,将发送信号给主进程,到此子进程结束。接着主进程将把在子进程写新AOF文件期间,新到的命令缓存到服务器的 AOF重写缓冲中,此时主进程将重写缓存中的内容追加到新的AOF文件中,然后用新的AOF文件代替就得AOF文件。 就此完成整个重写过程。


将AOF重写缓存中的内容写入指定的AOF文件:

/* Write the buffer (possibly composed of multiple blocks) into the specified
 * fd. If a short write or any other error happens -1 is returned,
 * otherwise the number of bytes written is returned. 
 *
 * 将重写缓存中的所有内容(可能由多个块组成)写入到给定 fd 中。
 *
 * 如果没有 short write 或者其他错误发生,那么返回写入的字节数量,
 * 否则,返回 -1 。
 */
ssize_t aofRewriteBufferWrite(int fd) {
    listNode *ln;
    listIter li;
    ssize_t count = 0;

    // 遍历所有缓存块
    listRewind(server.aof_rewrite_buf_blocks,&li);
    while((ln = listNext(&li))) {
        aofrwblock *block = listNodeValue(ln);
        ssize_t nwritten;

        if (block->used) {

            // 写入缓存块内容到 fd
            nwritten = write(fd,block->buf,block->used);
            if (nwritten != block->used) {
                if (nwritten == 0) errno = EIO;
                return -1;
            }

            // 积累写入字节
            count += nwritten;
        }
    }

    return count;
}

重写AOF文件(REWRITE/ BGREWRITE 都将调用该函数):

/* Write a sequence of commands able to fully rebuild the dataset into
 * "filename". Used both by REWRITEAOF and BGREWRITEAOF.
 *
 * 将一集足以还原当前数据集的命令写入到 filename 指定的文件中。
 *
 * 这个函数被 REWRITEAOF 和 BGREWRITEAOF 两个命令调用。
 * (REWRITEAOF 似乎已经是一个废弃的命令)
 *
 * In order to minimize the number of commands needed in the rewritten
 * log Redis uses variadic commands when possible, such as RPUSH, SADD
 * and ZADD. However at max REDIS_AOF_REWRITE_ITEMS_PER_CMD items per time
 * are inserted using a single command. 
 *
 * 为了最小化重建数据集所需执行的命令数量,
 * Redis 会尽可能地使用接受可变参数数量的命令,比如 RPUSH 、SADD 和 ZADD 等。
 *
 * 不过单个命令每次处理的元素数量不能超过 REDIS_AOF_REWRITE_ITEMS_PER_CMD 。
 */
int rewriteAppendOnlyFile(char *filename) {
    dictIterator *di = NULL;
    dictEntry *de;
    rio aof;
    FILE *fp;
    char tmpfile[256];
    int j;
    long long now = mstime();

    /* Note that we have to use a different temp name here compared to the
     * one used by rewriteAppendOnlyFileBackground() function. 
     *
     * 创建临时文件
     *
     * 注意这里创建的文件名和 rewriteAppendOnlyFileBackground() 创建的文件名稍有不同
     */
    snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
    fp = fopen(tmpfile,"w");
    if (!fp) {
        redisLog(REDIS_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));
        return REDIS_ERR;
    }

    // 初始化文件 io
    rioInitWithFile(&aof,fp);

    // 设置每写入 REDIS_AOF_AUTOSYNC_BYTES 字节
    // 就执行一次 FSYNC 
    // 防止缓存中积累太多命令内容,造成 I/O 阻塞时间过长
    if (server.aof_rewrite_incremental_fsync)
        rioSetAutoSync(&aof,REDIS_AOF_AUTOSYNC_BYTES);

    // 遍历所有数据库
    for (j = 0; j < server.dbnum; j++) {

        char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";

        redisDb *db = server.db+j;

        // 指向键空间
        dict *d = db->dict;
        if (dictSize(d) == 0) continue;

        // 创建键空间迭代器
        di = dictGetSafeIterator(d);
        if (!di) {
            fclose(fp);
            return REDIS_ERR;
        }

        /* SELECT the new DB 
         *
         * 首先写入 SELECT 命令,确保之后的数据会被插入到正确的数据库上
         */
        if (rioWrite(&aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;
        if (rioWriteBulkLongLong(&aof,j) == 0) goto werr;//写入数据库编号

        /* Iterate this DB writing every entry 
         *
         * 遍历数据库所有键,并通过命令将它们的当前状态(值)记录到新 AOF 文件中
         */
        while((de = dictNext(di)) != NULL) {
            sds keystr;
            robj key, *o;
            long long expiretime;

            // 取出键
            keystr = dictGetKey(de);

            // 取出值
            o = dictGetVal(de);
            initStaticStringObject(key,keystr);

            // 取出过期时间
            expiretime = getExpire(db,&key);

            /* If this key is already expired skip it 
             *
             * 如果键已经过期,那么跳过它,不保存
             */
            if (expiretime != -1 && expiretime < now) continue;

            /* Save the key and associated value 
             *
             * 根据值的类型,选择适当的命令来保存值
             */
            if (o->type == REDIS_STRING) {
                /* Emit a SET command */
                char cmd[]="*3\r\n$3\r\nSET\r\n";
                if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
                /* Key and value */
                if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
                if (rioWriteBulkObject(&aof,o) == 0) goto werr;
            } else if (o->type == REDIS_LIST) {
                if (rewriteListObject(&aof,&key,o) == 0) goto werr;
            } else if (o->type == REDIS_SET) {
                if (rewriteSetObject(&aof,&key,o) == 0) goto werr;
            } else if (o->type == REDIS_ZSET) {
                if (rewriteSortedSetObject(&aof,&key,o) == 0) goto werr;
            } else if (o->type == REDIS_HASH) {
                if (rewriteHashObject(&aof,&key,o) == 0) goto werr;
            } else {
                redisPanic("Unknown object type");
            }

            /* Save the expire time 
             *
             * 保存键的过期时间
             */
            if (expiretime != -1) {
                char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n";

                // 写入 PEXPIREAT expiretime 命令
                if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
                if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
                if (rioWriteBulkLongLong(&aof,expiretime) == 0) goto werr;
            }
        }

        // 释放迭代器
        dictReleaseIterator(di);
    }

    /* Make sure data will not remain on the OS's output buffers */
    // 冲洗并关闭新 AOF 文件
    if (fflush(fp) == EOF) goto werr;
    if (aof_fsync(fileno(fp)) == -1) goto werr;
    if (fclose(fp) == EOF) goto werr;

    /* Use RENAME to make sure the DB file is changed atomically only
     * if the generate DB file is ok. 
     *
     * 原子地改名,用重写后的新 AOF 文件覆盖旧 AOF 文件
     */
    if (rename(tmpfile,filename) == -1) {
        redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
        unlink(tmpfile);
        return REDIS_ERR;
    }

    redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");

    return REDIS_OK;

werr:
    fclose(fp);
    unlink(tmpfile);
    redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
    if (di) dictReleaseIterator(di);
    return REDIS_ERR;
}

后台重写AOF文件( BGREWRITE):

/* This is how rewriting of the append only file in background works:
 * 
 * 以下是后台重写 AOF 文件(BGREWRITEAOF)的工作步骤:
 *
 * 1) The user calls BGREWRITEAOF
 *    用户调用 BGREWRITEAOF
 *
 * 2) Redis calls this function, that forks():
 *    Redis 调用这个函数,它执行 fork() :
 *
 *    2a) the child rewrite the append only file in a temp file.
 *        子进程在临时文件中对 AOF 文件进行重写
 *
 *    2b) the parent accumulates differences in server.aof_rewrite_buf.
 *        父进程将新输入的写命令追加到 server.aof_rewrite_buf 中
 *
 * 3) When the child finished '2a' exists.
 *    当步骤 2a 执行完之后,子进程结束
 *
 * 4) The parent will trap the exit code, if it's OK, will append the
 *    data accumulated into server.aof_rewrite_buf into the temp file, and
 *    finally will rename(2) the temp file in the actual file name.
 *    The the new file is reopened as the new append only file. Profit!
 *
 *    父进程会捕捉子进程的退出信号,
 *    如果子进程的退出状态是 OK 的话,
 *    那么父进程将新输入命令的缓存追加到临时文件,
 *    然后使用 rename(2) 对临时文件改名,用它代替旧的 AOF 文件,
 *    至此,后台 AOF 重写完成。
 */
int rewriteAppendOnlyFileBackground(void) {
    pid_t childpid;
    long long start;

    // 已经有进程在进行 AOF 重写了
    if (server.aof_child_pid != -1) return REDIS_ERR;

    // 记录 fork 开始前的时间,计算 fork 耗时用
    start = ustime();

    if ((childpid = fork()) == 0) {
        char tmpfile[256];

        /* Child */

        // 关闭网络连接 fd
        closeListeningSockets(0);

        // 为进程设置名字,方便记认
        redisSetProcTitle("redis-aof-rewrite");

        // 创建临时文件,并进行 AOF 重写
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
        if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
            size_t private_dirty = zmalloc_get_private_dirty();

            if (private_dirty) {
                redisLog(REDIS_NOTICE,
                    "AOF rewrite: %zu MB of memory used by copy-on-write",
                    private_dirty/(1024*1024));
            }
            // 发送重写成功信号
            exitFromChild(0);//父进程完成AOF重写缓存的追加;并以新的AOF文件重命名代替旧的AOF文件
        } else {
            // 发送重写失败信号
            exitFromChild(1);
        }
    } else {
        /* Parent */
        // 记录执行 fork 所消耗的时间
        server.stat_fork_time = ustime()-start;

        if (childpid == -1) {
            redisLog(REDIS_WARNING,
                "Can't rewrite append only file in background: fork: %s",
                strerror(errno));
            return REDIS_ERR;
        }

        redisLog(REDIS_NOTICE,
            "Background append only file rewriting started by pid %d",childpid);

        // 记录 AOF 重写的信息
        server.aof_rewrite_scheduled = 0;
        server.aof_rewrite_time_start = time(NULL);
        server.aof_child_pid = childpid;

        // 关闭字典自动 rehash
        updateDictResizePolicy();

        /* We set appendseldb to -1 in order to force the next call to the
         * feedAppendOnlyFile() to issue a SELECT command, so the differences
         * accumulated by the parent into server.aof_rewrite_buf will start
         * with a SELECT statement and it will be safe to merge. 
         *
         * 将 aof_selected_db 设为 -1 ,
         * 强制让 feedAppendOnlyFile() 下次执行时引发一个 SELECT 命令,
         * 从而确保之后新添加的命令会设置到正确的数据库中
         */
        server.aof_selected_db = -1;
        replicationScriptCacheFlush();
        return REDIS_OK;
    }
    return REDIS_OK; /* unreached */
}

子进程后台重写AOF文件完成后,将由父进程调用该函数完成AOF重写缓存的追加,和以新AOF代替旧AOF文件的操作:

/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
 * Handle this. 
 *
 * 当子线程完成 AOF 重写时,父进程调用这个函数。
 */
void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
    if (!bysignal && exitcode == 0) {
        int newfd, oldfd;
        char tmpfile[256];
        long long now = ustime();

        redisLog(REDIS_NOTICE,
            "Background AOF rewrite terminated with success");

        /* Flush the differences accumulated by the parent to the
         * rewritten AOF. */
        // 打开保存新 AOF 文件内容的临时文件
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof",
            (int)server.aof_child_pid);
        newfd = open(tmpfile,O_WRONLY|O_APPEND);
        if (newfd == -1) {
            redisLog(REDIS_WARNING,
                "Unable to open the temporary AOF produced by the child: %s", strerror(errno));
            goto cleanup;
        }

        // 将累积的重写缓存写入到临时文件中
        // 这个函数调用的 write 操作会阻塞主进程
        if (aofRewriteBufferWrite(newfd) == -1) {
            redisLog(REDIS_WARNING,
                "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno));
            close(newfd);
            goto cleanup;
        }

        redisLog(REDIS_NOTICE,
            "Parent diff successfully flushed to the rewritten AOF (%lu bytes)", aofRewriteBufferSize());

        /* The only remaining thing to do is to rename the temporary file to
         * the configured file and switch the file descriptor used to do AOF
         * writes. We don't want close(2) or rename(2) calls to block the
         * server on old file deletion.
         *
         * 剩下的工作就是将临时文件改名为 AOF 程序指定的文件名,
         * 并将新文件的 fd 设为 AOF 程序的写目标。
         *
         * 不过这里有一个问题 ——
         * 我们不想 close(2) 或者 rename(2) 在删除旧文件时阻塞。
         *
         * There are two possible scenarios:
         *
         * 以下是两个可能的场景:
         *
         * 1) AOF is DISABLED and this was a one time rewrite. The temporary
         * file will be renamed to the configured file. When this file already
         * exists, it will be unlinked, which may block the server.
         *
         * AOF 被关闭,这个是一次单次的写操作。
         * 临时文件会被改名为 AOF 文件。
         * 本来已经存在的 AOF 文件会被 unlink ,这可能会阻塞服务器。
         *
         * 2) AOF is ENABLED and the rewritten AOF will immediately start
         * receiving writes. After the temporary file is renamed to the
         * configured file, the original AOF file descriptor will be closed.
         * Since this will be the last reference to that file, closing it
         * causes the underlying file to be unlinked, which may block the
         * server.
         *
         * AOF 被开启,并且重写后的 AOF 文件会立即被用于接收新的写入命令。
         * 当临时文件被改名为 AOF 文件时,原来的 AOF 文件描述符会被关闭。
         * 因为 Redis 会是最后一个引用这个文件的进程,
         * 所以关闭这个文件会引起 unlink ,这可能会阻塞服务器。
         *
         * To mitigate the blocking effect of the unlink operation (either
         * caused by rename(2) in scenario 1, or by close(2) in scenario 2), we
         * use a background thread to take care of this. First, we
         * make scenario 1 identical to scenario 2 by opening the target file
         * when it exists. The unlink operation after the rename(2) will then
         * be executed upon calling close(2) for its descriptor. Everything to
         * guarantee atomicity for this switch has already happened by then, so
         * we don't care what the outcome or duration of that close operation
         * is, as long as the file descriptor is released again. 
         *
         * 为了避免出现阻塞现象,程序会将 close(2) 放到后台线程执行,
         * 这样服务器就可以持续处理请求,不会被中断。
         */
        if (server.aof_fd == -1) {
            /* AOF disabled */

             /* Don't care if this fails: oldfd will be -1 and we handle that.
              * One notable case of -1 return is if the old file does
              * not exist. */
             oldfd = open(server.aof_filename,O_RDONLY|O_NONBLOCK);
        } else {
            /* AOF enabled */
            oldfd = -1; /* We'll set this to the current AOF filedes later. */
        }

        /* Rename the temporary file. This will not unlink the target file if
         * it exists, because we reference it with "oldfd". 
         *
         * 对临时文件进行改名,替换现有的 AOF 文件。
         *
         * 旧的 AOF 文件不会在这里被 unlink ,因为 oldfd 引用了它。
         */
        if (rename(tmpfile,server.aof_filename) == -1) {
            redisLog(REDIS_WARNING,
                "Error trying to rename the temporary AOF file: %s", strerror(errno));
            close(newfd);
            if (oldfd != -1) close(oldfd);
            goto cleanup;
        }

        if (server.aof_fd == -1) {
            /* AOF disabled, we don't need to set the AOF file descriptor
             * to this new file, so we can close it. 
             *
             * AOF 被关闭,直接关闭 AOF 文件,
             * 因为关闭 AOF 本来就会引起阻塞,所以这里就算 close 被阻塞也无所谓
             */
            close(newfd);
        } else {
            /* AOF enabled, replace the old fd with the new one. 
             *
             * 用新 AOF 文件的 fd 替换原来 AOF 文件的 fd
             */
            oldfd = server.aof_fd;
            server.aof_fd = newfd;

            // 因为前面进行了 AOF 重写缓存追加,所以这里立即 fsync 一次
            if (server.aof_fsync == AOF_FSYNC_ALWAYS)
                aof_fsync(newfd);
            else if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
                aof_background_fsync(newfd);

            // 强制引发 SELECT
            server.aof_selected_db = -1; /* Make sure SELECT is re-issued */

            // 更新 AOF 文件的大小
            aofUpdateCurrentSize();

            // 记录前一次重写时的大小
            server.aof_rewrite_base_size = server.aof_current_size;

            /* Clear regular AOF buffer since its contents was just written to
             * the new AOF from the background rewrite buffer. 
             *
             * 清空 AOF 缓存,因为它的内容已经被写入过了,没用了
             */
            sdsfree(server.aof_buf);
            server.aof_buf = sdsempty();
        }

        server.aof_lastbgrewrite_status = REDIS_OK;

        redisLog(REDIS_NOTICE, "Background AOF rewrite finished successfully");

        /* Change state from WAIT_REWRITE to ON if needed 
         *
         * 如果是第一次创建 AOF 文件,那么更新 AOF 状态
         */
        if (server.aof_state == REDIS_AOF_WAIT_REWRITE)
            server.aof_state = REDIS_AOF_ON;

        /* Asynchronously close the overwritten AOF. 
         *
         * 异步关闭旧 AOF 文件
         */
        if (oldfd != -1) bioCreateBackgroundJob(REDIS_BIO_CLOSE_FILE,(void*)(long)oldfd,NULL,NULL);

        redisLog(REDIS_VERBOSE,
            "Background AOF rewrite signal handler took %lldus", ustime()-now);

    // BGREWRITEAOF 重写出错
    } else if (!bysignal && exitcode != 0) {
        server.aof_lastbgrewrite_status = REDIS_ERR;

        redisLog(REDIS_WARNING,
            "Background AOF rewrite terminated with error");

    // 未知错误
    } else {
        server.aof_lastbgrewrite_status = REDIS_ERR;

        redisLog(REDIS_WARNING,
            "Background AOF rewrite terminated by signal %d", bysignal);
    }

cleanup:

    // 清空 AOF 缓冲区
    aofRewriteBufferReset();

    // 移除临时文件
    aofRemoveTempFile(server.aof_child_pid);

    // 重置默认属性
    server.aof_child_pid = -1;
    server.aof_rewrite_time_last = time(NULL)-server.aof_rewrite_time_start;
    server.aof_rewrite_time_start = -1;

    /* Schedule a new rewrite if we are waiting for it to switch the AOF ON. */
    if (server.aof_state == REDIS_AOF_WAIT_REWRITE)
        server.aof_rewrite_scheduled = 1;
}





你可能感兴趣的:(Redis源码分析)