Redis源码研究之AOF持久化

本文主要研究AOF持久化策略的实现方式,了解AOF数据组织方式和运作机制。。

建议阅读
1、Redis AOF持久化方式的理论说明见:Redis之AOF持久化小探

I、上帝视角

为了更好的理解AOF持久化,我们首先将其拆分为文件追加文件写入文件同步三个部分:

1.1 文件追加

struct redisServer结构中维护了一个AOF缓冲区,服务器在执行完一个写命令后,会以协议的格式调用feedAppendOnlyFile将命令追加到服务器状态的aof_buf中(这个缓冲区是在内存中):

/*src/redis.h*/
struct redisServer {
    ...
    //AOF缓冲区
    sds aof_buf;
};
1.2 文件写入与同步

1、要理解文件写入与同步,首先我们要理解缓存机制
为了提高文件的写入效率,在现代操作系统中,当用户调用write函数,将一些数据写入到文件的时候,操作系统通常会将写入数据暂时保存在一个内存缓冲区里面,等到内存缓冲区的空间被填满,或者超过了指定的时限后,才真正的将缓冲区中的数据写入到磁盘里面,这里的内存缓冲区就是1.1中说明的aof_buf

2、因为服务器在处理文件事件时可能会执行写命令,使得一些内容被追加到aof_buf中,所以,服务器每次结束一个事件循环之前都会调用flushAppendOnlyFile,判断是否需要将aof_buf缓冲区的内容写入和保存到AOF文件中。下面将重点说明flushAppendOnlyFile函数。

II、命令传播

propagate()函数的作用是将命令传播给AOF以及slave中,对于slave是Redis集群部分的内容,我们暂且不说。而propagate()将命令传播到AOF中是通过调用feedAppendOnlyFile()函数:

/*命令传播到AOF和slave*/
/*src/redis.c/propagate*/
void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,
    int flags)
    {
    // AOF 策略需要打开,且设置AOF 传播标记,将更新发布给本地文件
    if (server.aof_state != REDIS_AOF_OFF && flags & REDIS_PROPAGATE_AOF)
        feedAppendOnlyFile(cmd,dbid,argv,argc);

    // 设置了从机传播标记,将更新发布给从机
    if (flags & REDIS_PROPAGATE_REPL)
        replicationFeedSlaves(server.slaves,dbid,argv,argc);
    }
}

下面看被调用的feedAppendOnlyFile函数做了什么:

/*命令追加至aof_buf缓冲区,如果正在执行重写功能,则也追加到重写缓冲区中*/  
/*src/aof.c/feedAppendOnlyFile*/
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
    sds buf = sdsempty();
    robj *tmpargv[3];

    /* The DB this command was targeting is not the same as the last command
     * we appendend. To issue a SELECT command is needed. 
     *
     * 使用 SELECT 命令,显式设置数据库,确保之后的命令被设置到正确的数据库
     */
    if (dictid != server.aof_selected_db) {
        char seldb[64];

        snprintf(seldb,sizeof(seldb),"%d",dictid);
        buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
            (unsigned long)strlen(seldb),seldb);

        server.aof_selected_db = dictid;
    }

    // EXPIRE 、 PEXPIRE 和 EXPIREAT 命令
    if (cmd->proc == expireCommand || cmd->proc == pexpireCommand ||
        cmd->proc == expireatCommand) {
        /* Translate EXPIRE/PEXPIRE/EXPIREAT into PEXPIREAT 
         *
         * 将 EXPIRE 、 PEXPIRE 和 EXPIREAT 都翻译成 PEXPIREAT
         */
        buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);

    // SETEX 和 PSETEX 命令
    } else if (cmd->proc == setexCommand || cmd->proc == psetexCommand) {
        /* Translate SETEX/PSETEX to SET and PEXPIREAT 
         *
         * 将两个命令都翻译成 SET 和 PEXPIREAT
         */

        // SET
        tmpargv[0] = createStringObject("SET",3);
        tmpargv[1] = argv[1];
        tmpargv[2] = argv[3];
        buf = catAppendOnlyGenericCommand(buf,3,tmpargv);

        // PEXPIREAT
        decrRefCount(tmpargv[0]);
        buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);

    // 其他命令
    } else {
        /* All the other commands don't need translation or need the
         * same translation already operated in the command vector
         * for the replication itself. */
        buf = catAppendOnlyGenericCommand(buf,argc,argv);
    }

    /* Append to the AOF buffer. This will be flushed on disk just before
     * of re-entering the event loop, so before the client will get a
     * positive reply about the operation performed. 
     *
     * 将命令追加到 AOF 缓存中,
     * 在重新进入事件循环之前,这些命令会被冲洗到磁盘上,
     * 并向客户端返回一个回复。
     */
    if (server.aof_state == REDIS_AOF_ON)
        server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf));

    /* If a background append only file rewriting is in progress we want to
     * accumulate the differences between the child DB and the current one
     * in a buffer, so that when the child process will do its work we
     * can append the differences to the new append only file. 
     *
     * 如果 BGREWRITEAOF 正在进行,
     * 那么我们还需要将命令追加到重写缓存中,
     * 从而记录当前正在重写的 AOF 文件和数据库当前状态的差异。
     */
    if (server.aof_child_pid != -1)
        aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));

    // 释放
    sdsfree(buf);
}

注释中有一句很重要的话就是:将命令追加大AOF缓存中,在重新进入事件循环之前,这些命令会被冲洗到磁盘上,并向客户端返回一个回复,注意给客户端返回回复的时间,这是数据库常见的备份操作,需要先将数据备份,在返回给客户端回复,如果反过来先给客户端回复,此时数据库down掉,则会出现不一致的情况。

III、同步磁盘

同步即将所有累积的aof_buf中的内容写入到磁盘中,由flushAppendOnlyFile()函数完成:

// 同步磁盘;将所有累积的更新server.aof_buf 写入磁盘
/*src/aof.c/flushAppendOnlyFile*/
/* Write the append only file buffer on disk.
 *
 * 将 AOF 缓存写入到文件中。
 *
 * Since we are required to write the AOF before replying to the client,
 * and the only way the client socket can get a write is entering when the
 * the event loop, we accumulate all the AOF writes in a memory
 * buffer and write it on disk using this function just before entering
 * the event loop again.
 *
 * 因为程序需要在回复客户端之前对 AOF 执行写操作。
 * 而客户端能执行写操作的唯一机会就是在事件 loop 中,
 * 因此,程序将所有 AOF 写累积到缓存中,
 * 并在重新进入事件 loop 之前,将缓存写入到文件中。
 *
 * About the 'force' argument:
 *
 * 关于 force 参数:
 *
 * When the fsync policy is set to 'everysec' we may delay the flush if there
 * is still an fsync() going on in the background thread, since for instance
 * on Linux write(2) will be blocked by the background fsync anyway.
 *
 * 当 fsync 策略为每秒钟保存一次时,如果后台线程仍然有 fsync 在执行,
 * 那么我们可能会延迟执行冲洗(flush)操作,
 * 因为 Linux 上的 write(2) 会被后台的 fsync 阻塞。
 *
 * When this happens we remember that there is some aof buffer to be
 * flushed ASAP, and will try to do that in the serverCron() function.
 *
 * 当这种情况发生时,说明需要尽快冲洗 aof 缓存,
 * 程序会尝试在 serverCron() 函数中对缓存进行冲洗。
 *
 * However if force is set to 1 we'll write regardless of the background
 * fsync. 
 *
 * 不过,如果 force 为 1 的话,那么不管后台是否正在 fsync ,
 * 程序都直接进行写入。
 */
#define AOF_WRITE_LOG_ERROR_RATE 30 /* Seconds between errors logging. */
void flushAppendOnlyFile(int force) {
    ssize_t nwritten;
    int sync_in_progress = 0;

    // 缓冲区中没有任何内容,直接返回
    if (sdslen(server.aof_buf) == 0) return;

    // 策略为每秒 FSYNC 
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
        // 是否有 SYNC 正在后台进行?
        sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0;

    // 每秒 fsync ,并且强制写入为假
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {

        /* With this append fsync policy we do background fsyncing.
         *
         * 当 fsync 策略为每秒钟一次时, fsync 在后台执行。
         *
         * If the fsync is still in progress we can try to delay
         * the write for a couple of seconds. 
         *
         * 如果后台仍在执行 FSYNC ,那么我们可以延迟写操作一两秒
         * (如果强制执行 write 的话,服务器主线程将阻塞在 write 上面)
         */
        if (sync_in_progress) {

            // 有 fsync 正在后台进行 。。。

            if (server.aof_flush_postponed_start == 0) {
                /* No previous write postponinig, remember that we are
                 * postponing the flush and return. 
                 *
                 * 前面没有推迟过 write 操作,这里将推迟写操作的时间记录下来
                 * 然后就返回,不执行 write 或者 fsync
                 */
                server.aof_flush_postponed_start = server.unixtime;
                return;

            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
                /* We were already waiting for fsync to finish, but for less
                 * than two seconds this is still ok. Postpone again. 
                 *
                 * 如果之前已经因为 fsync 而推迟了 write 操作
                 * 但是推迟的时间不超过 2 秒,那么直接返回
                 * 不执行 write 或者 fsync
                 */
                return;

            }

            /* Otherwise fall trough, and go write since we can't wait
             * over two seconds. 
             *
             * 如果后台还有 fsync 在执行,并且 write 已经推迟 >= 2 秒
             * 那么执行写操作(write 将被阻塞)
             */
            server.aof_delayed_fsync++;
            redisLog(REDIS_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
        }
    }

    /* If you are following this code path, then we are going to write so
     * set reset the postponed flush sentinel to zero. 
     *
     * 执行到这里,程序会对 AOF 文件进行写入。
     *
     * 清零延迟 write 的时间记录
     */
    server.aof_flush_postponed_start = 0;

    /* We want to perform a single write. This should be guaranteed atomic
     * at least if the filesystem we are writing is a real physical one.
     *
     * 执行单个 write 操作,如果写入设备是物理的话,那么这个操作应该是原子的
     *
     * While this will save us against the server being killed I don't think
     * there is much to do about the whole server stopping for power problems
     * or alike 
     *
     * 当然,如果出现像电源中断这样的不可抗现象,那么 AOF 文件也是可能会出现问题的
     * 这时就要用 redis-check-aof 程序来进行修复。
     */
    nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
    if (nwritten != (signed)sdslen(server.aof_buf)) {

        static time_t last_write_error_log = 0;
        int can_log = 0;

        /* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */
        // 将日志的记录频率限制在每行 AOF_WRITE_LOG_ERROR_RATE 秒
        if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) {
            can_log = 1;
            last_write_error_log = server.unixtime;
        }

        /* Lof the AOF write error and record the error code. */
        // 如果写入出错,那么尝试将该情况写入到日志里面
        if (nwritten == -1) {
            if (can_log) {
                redisLog(REDIS_WARNING,"Error writing to the AOF file: %s",
                    strerror(errno));
                server.aof_last_write_errno = errno;
            }
        } else {
            if (can_log) {
                redisLog(REDIS_WARNING,"Short write while writing to "
                                       "the AOF file: (nwritten=%lld, "
                                       "expected=%lld)",
                                       (long long)nwritten,
                                       (long long)sdslen(server.aof_buf));
            }

            // 尝试移除新追加的不完整内容
            if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
                if (can_log) {
                    redisLog(REDIS_WARNING, "Could not remove short write "
                             "from the append-only file.  Redis may refuse "
                             "to load the AOF the next time it starts.  "
                             "ftruncate: %s", strerror(errno));
                }
            } else {
                /* If the ftrunacate() succeeded we can set nwritten to
                 * -1 since there is no longer partial data into the AOF. */
                nwritten = -1;
            }
            server.aof_last_write_errno = ENOSPC;
        }

        /* Handle the AOF write error. */
        // 处理写入 AOF 文件时出现的错误
        if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
            /* We can't recover when the fsync policy is ALWAYS since the
             * reply for the client is already in the output buffers, and we
             * have the contract with the user that on acknowledged write data
             * is synched on disk. */
            redisLog(REDIS_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");
            exit(1);
        } else {
            /* Recover from failed write leaving data into the buffer. However
             * set an error to stop accepting writes as long as the error
             * condition is not cleared. */
            server.aof_last_write_status = REDIS_ERR;

            /* Trim the sds buffer if there was a partial write, and there
             * was no way to undo it with ftruncate(2). */
            if (nwritten > 0) {
                server.aof_current_size += nwritten;
                sdsrange(server.aof_buf,nwritten,-1);
            }
            return; /* We'll try again on the next call... */
        }
    } else {
        /* Successful write(2). If AOF was in error state, restore the
         * OK state and log the event. */
        // 写入成功,更新最后写入状态
        if (server.aof_last_write_status == REDIS_ERR) {
            redisLog(REDIS_WARNING,
                "AOF write error looks solved, Redis can write again.");
            server.aof_last_write_status = REDIS_OK;
        }
    }

    // 更新写入后的 AOF 文件大小
    server.aof_current_size += nwritten;

    /* Re-use AOF buffer when it is small enough. The maximum comes from the
     * arena size of 4k minus some overhead (but is otherwise arbitrary). 
     *
     * 如果 AOF 缓存的大小足够小的话,那么重用这个缓存,
     * 否则的话,释放 AOF 缓存。
     */
    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
        // 清空缓存中的内容,等待重用
        sdsclear(server.aof_buf);
    } else {
        // 释放缓存
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }

    /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
     * children doing I/O in the background. 
     *
     * 如果 no-appendfsync-on-rewrite 选项为开启状态,
     * 并且有 BGSAVE 或者 BGREWRITEAOF 正在进行的话,
     * 那么不执行 fsync 
     */
    if (server.aof_no_fsync_on_rewrite &&
        (server.aof_child_pid != -1 || server.rdb_child_pid != -1))
            return;

    /* Perform the fsync if needed. */

    // 总是执行 fsnyc
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
        /* aof_fsync is defined as fdatasync() for Linux in order to avoid
         * flushing metadata. */
        aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */

        // 更新最后一次执行 fsnyc 的时间
        server.aof_last_fsync = server.unixtime;

    // 策略为每秒 fsnyc ,并且距离上次 fsync 已经超过 1 秒
    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) {
        // 放到后台执行
        if (!sync_in_progress) aof_background_fsync(server.aof_fd);
        // 更新最后一次执行 fsync 的时间
        server.aof_last_fsync = server.unixtime;
    }

    // 其实上面无论执行 if 部分还是 else 部分都要更新 fsync 的时间
    // 可以将代码挪到下面来
    // server.aof_last_fsync = server.unixtime;
}

这里的重点是fsync操作,这是Linux的I/O同步操作,系统会将自己缓存中的内容以后台的方式写到磁盘上。

IV、AOF重写

AOF后台子进程方式重写操作由rewriteAppendOnlyFileBackground函数完成:

/*src/aof.c/rewriteAppendOnlyFileBackground*/
/* This is how rewriting of the append only file in background works:
 * 
 * 以下是后台重写 AOF 文件(BGREWRITEAOF)的工作步骤:
 *
 * 1) The user calls BGREWRITEAOF
 *    用户调用 BGREWRITEAOF
 *
 * 2) Redis calls this function, that forks():
 *    Redis 调用这个函数,它执行 fork() :
 *
 *    2a) the child rewrite the append only file in a temp file.
 *        子进程在临时文件中对 AOF 文件进行重写
 *
 *    2b) the parent accumulates differences in server.aof_rewrite_buf.
 *        父进程将新输入的写命令追加到 server.aof_rewrite_buf 中
 *
 * 3) When the child finished '2a' exists.
 *    当步骤 2a 执行完之后,子进程结束
 *
 * 4) The parent will trap the exit code, if it's OK, will append the
 *    data accumulated into server.aof_rewrite_buf into the temp file, and
 *    finally will rename(2) the temp file in the actual file name.
 *    The the new file is reopened as the new append only file. Profit!
 *
 *    父进程会捕捉子进程的退出信号,
 *    如果子进程的退出状态是 OK 的话,
 *    那么父进程将新输入命令的缓存追加到临时文件,
 *    然后使用 rename(2) 对临时文件改名,用它代替旧的 AOF 文件,
 *    至此,后台 AOF 重写完成。
 */
int rewriteAppendOnlyFileBackground(void) {
    pid_t childpid;
    long long start;

    // 已经有进程在进行 AOF 重写了
    if (server.aof_child_pid != -1) return REDIS_ERR;

    // 记录 fork 开始前的时间,计算 fork 耗时用
    start = ustime();

    if ((childpid = fork()) == 0) {
        char tmpfile[256];    //临时文件

        /* 子进程设置 */

        // 关闭网络连接 fd
        closeListeningSockets(0);

        // 为进程设置名字,方便记认
        redisSetProcTitle("redis-aof-rewrite");

        // 创建临时文件,并进行 AOF 重写
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());

        /*rewriteAppendOnlyFile函数将命令写入到tmpfile中,下面说明*/
        if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
            size_t private_dirty = zmalloc_get_private_dirty();

            if (private_dirty) {
                redisLog(REDIS_NOTICE,
                    "AOF rewrite: %zu MB of memory used by copy-on-write",
                    private_dirty/(1024*1024));
            }
            // 发送重写成功信号
            exitFromChild(0);
        } else {
            // 发送重写失败信号
            exitFromChild(1);
        }
    } else {
        /* Parent */
        // 记录执行 fork 所消耗的时间
        server.stat_fork_time = ustime()-start;

        if (childpid == -1) {
            redisLog(REDIS_WARNING,
                "Can't rewrite append only file in background: fork: %s",
                strerror(errno));
            return REDIS_ERR;
        }

        redisLog(REDIS_NOTICE,
            "Background append only file rewriting started by pid %d",childpid);

        // 记录 AOF 重写的信息
        server.aof_rewrite_scheduled = 0;
        server.aof_rewrite_time_start = time(NULL);
        server.aof_child_pid = childpid;

        // 关闭字典自动 rehash
        updateDictResizePolicy();

        /* We set appendseldb to -1 in order to force the next call to the
         * feedAppendOnlyFile() to issue a SELECT command, so the differences
         * accumulated by the parent into server.aof_rewrite_buf will start
         * with a SELECT statement and it will be safe to merge. 
         *
         * 将 aof_selected_db 设为 -1 ,
         * 强制让 feedAppendOnlyFile() 下次执行时引发一个 SELECT 命令,
         * 从而确保之后新添加的命令会设置到正确的数据库中
         */
        server.aof_selected_db = -1;
        replicationScriptCacheFlush();
        return REDIS_OK;
    }
    return REDIS_OK; /* unreached */
}

可以看出rewriteAppendOnlyFileBackground主要调用rewriteAppendOnlyFile函数完成重写,下面看这个函数做了什么,也就是AOF持久化具体是怎么做的:

// AOF 持久化主函数。只在rewriteAppendOnlyFileBackground() 中会调用此函数
/*src/aof.c/rewriteAppendOnlyFile*/
/* Write a sequence of commands able to fully rebuild the dataset into
* "filename". Used both by REWRITEAOF and BGREWRITEAOF.
**
In order to minimize the number of commands needed in the rewritten
* log Redis uses variadic commands when possible, such as RPUSH, SADD
* and ZADD. However at max REDIS_AOF_REWRITE_ITEMS_PER_CMD items per time
* are inserted using a single command. */

/*将一集足以还原当前数据集的命令写入到 filename 指定的文件中*/
int rewriteAppendOnlyFile(char *filename) {
    dictIterator *di = NULL;
    dictEntry *de;
    rio aof;
    FILE *fp;
    char tmpfile[256];
    int j;
    long long now = mstime();
    /* Note that we have to use a different temp name here compared to the
    * one used by rewriteAppendOnlyFileBackground() function. */

    snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
    // 打开文件
    fp = fopen(tmpfile,"w");
    if (!fp) {
        redisLog(REDIS_WARNING, "Opening the temp file for AOF rewrite in"
        "rewriteAppendOnlyFile(): %s", strerror(errno));
        return REDIS_ERR;
    }

    // 初始化rio 结构体
    rioInitWithFile(&aof,fp);

    // 如果设置了自动备份参数,将进行设置
    if (server.aof_rewrite_incremental_fsync)
        rioSetAutoSync(&aof,REDIS_AOF_AUTOSYNC_BYTES);
    // 备份每一个数据集
    for (j = 0; j < server.dbnum; j++) {
        char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
        redisDb *db = server.db+j;
        dict *d = db->dict;
    if (dictSize(d) == 0) continue;
        // 获取数据集的迭代器
        di = dictGetSafeIterator(d);
    if (!di) {
        fclose(fp);
        return REDIS_ERR;
    }
    // 写入AOF 操作码
    /* SELECT the new DB */
    if (rioWrite(&aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;
    // 写入数据集序号
    if (rioWriteBulkLongLong(&aof,j) == 0) goto werr;
    // 写入数据集中每一个数据项
    /* Iterate this DB writing every entry */
    while((de = dictNext(di)) != NULL) {
        sds keystr;
        robj key, *o;
        long long expiretime;
        keystr = dictGetKey(de);
        o = dictGetVal(de);
        // 将keystr 封装在robj 里
        initStaticStringObject(key,keystr);
        // 获取过期时间
        expiretime = getExpire(db,&key);

        // 如果已经过期,放弃存储
        /* If this key is already expired skip it */
    if (expiretime != -1 && expiretime < now) continue;
        // 写入键值对应的写操作
        /* Save the key and associated value */
    if (o->type == REDIS_STRING) {
        /* Emit a SET command */
        char cmd[]="*3\r\n$3\r\nSET\r\n";
    if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
        /* Key and value */
    if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
    if (rioWriteBulkObject(&aof,o) == 0) goto werr;
    } else if (o->type == REDIS_LIST) {
    if (rewriteListObject(&aof,&key,o) == 0) goto werr;
    } else if (o->type == REDIS_SET) {
    if (rewriteSetObject(&aof,&key,o) == 0) goto werr;
    } else if (o->type == REDIS_ZSET) {
    if (rewriteSortedSetObject(&aof,&key,o) == 0) goto werr;
    } else if (o->type == REDIS_HASH) {
    if (rewriteHashObject(&aof,&key,o) == 0) goto werr;
    } else {
        redisPanic("Unknown object type");
    }
    // 写入过期时间
    /* Save the expire time */
    if (expiretime != -1) {
        char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n";
    if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
    if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
    if (rioWriteBulkLongLong(&aof,expiretime) == 0) goto werr;
    }
}
    // 释放迭代器
    dictReleaseIterator(di);
}
    // 写入磁盘
    /* Make sure data will not remain on the OS's output buffers */
    fflush(fp);
    aof_fsync(fileno(fp));
    fclose(fp);
    // 重写文件名
    /* Use RENAME to make sure the DB file is changed atomically only
    * if the generate DB file is ok. */
    if (rename(tmpfile,filename) == -1) {
        redisLog(REDIS_WARNING,"Error moving temp append only file on the "
        "final destination: %s", strerror(errno));
        unlink(tmpfile);
        return REDIS_ERR;
    }
    redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
    return REDIS_OK;
    werr:
    // 清理工作
    fclose(fp);
    unlink(tmpfile);
    redisLog(REDIS_WARNING,"Write error writing append only file on disk: "
    "%s", strerror(errno));
    if (di) dictReleaseIterator(di);
        return REDIS_ERR;
}  

在子进程完成AOF重写是,父进程会调用backgroundRewriteDoneHandler函数,将重写缓冲区server.aof_rewrite_buf_blocks中的内容写入磁盘:

/*src/aof.c/backgroundRewriteDoneHandler*/
void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
    ......
    // 将AOF 缓存server.aof_rewrite_buf_blocks 的AOF 写入磁盘
    //这个函数调用write会阻塞主进程
    if (aofRewriteBufferWrite(newfd) == -1) {
        redisLog(REDIS_WARNING,
        "Error trying to flush the parent diff to the rewritten AOF: %s",
        strerror(errno));
        close(newfd);
        goto cleanup;
    }
    ......
}  

这其中主要又调用了aofRewriteBufferrWrite函数,完成真正的追加操作:

/*src/aof.c/aofRewriteBufferWrite*/
// 将累积的更新缓存server.aof_rewrite_buf_blocks 同步到磁盘
/* Write the buffer (possibly composed of multiple blocks) into the specified
* fd. If no short write or any other error happens -1 is returned,
* otherwise the number of bytes written is returned. */
    ssize_t aofRewriteBufferWrite(int fd) {
    listNode *ln;
    listIter li;
    ssize_t count = 0;
    listRewind(server.aof_rewrite_buf_blocks,&li);
    //遍历所有缓存块
    while((ln = listNext(&li))) {
        aofrwblock *block = listNodeValue(ln);
        ssize_t nwritten;
        if (block->used) {
            //写入缓存内容
            nwritten = write(fd,block->buf,block->used);
            if (nwritten != block->used) {
                 if (nwritten == 0) errno = EIO;
                 return -1;
            }
            count += nwritten;
        }
  }
  return count;
}

V、AOF的恢复过程

伪客户端的方式重新读入AOF文件以恢复内容主要在loadAppendOnlyFile函数中实现:

// 加载AOF 文件,恢复数据
/**src/aof.c/loadAppendOnlyFile*/
/* Replay the append log file. On error REDIS_OK is returned. On non fatal
* error (the append only file is zero-length) REDIS_ERR is returned. On
* fatal error an error message is logged and the program exists. */
int loadAppendOnlyFile(char *filename) {
    struct redisClient *fakeClient;
    FILE *fp = fopen(filename,"r");
    struct redis_stat sb;
    int old_aof_state = server.aof_state;
    long loops = 0;
    // 文件大小不能为0
    if (fp && redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) {
        server.aof_current_size = 0;
        fclose(fp);
        return REDIS_ERR;
    }
    if (fp == NULL) {
        redisLog(REDIS_WARNING,"Fatal error: can't open the append log file "
        "for reading: %s",strerror(errno));
        exit(1);
    }
    // 正在执行AOF 加载操作,于是暂时禁止AOF 的所有操作,以免混淆
    /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
    * to the same file we're about to read. */
    server.aof_state = REDIS_AOF_OFF;
    // 虚拟出一个客户端,即redisClient
    fakeClient = createFakeClient();
    startLoading(fp);
    while(1) {
        int argc, j;
        unsigned long len;
        robj **argv;
        char buf[128];
        sds argsds;
        struct redisCommand *cmd;
        // 每循环1000 次,在恢复数据的同时,服务器也为客户端服务。
        // aeProcessEvents() 会进入事件循环
        /* Serve the clients from time to time */
    if (!(loops++ % 1000)) {
        loadingProgress(ftello(fp));
        aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT);
    }
    // 可能aof 文件到了结尾
    if (fgets(buf,sizeof(buf),fp) == NULL) {
    if (feof(fp))
        break;
    else
        goto readerr;
    }
    // 必须以“*”开头,格式不对,退出
    if (buf[0] != '*') goto fmterr;
        // 参数的个数
        argc = atoi(buf+1);
        // 参数个数错误
    if (argc < 1) goto fmterr;
        // 为参数分配空间
        argv = zmalloc(sizeof(robj*)*argc);
        // 依次读取参数
    for (j = 0; j < argc; j++) {
    if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
    if (buf[0] != '$') goto fmterr;
        len = strtol(buf+1,NULL,10);
        argsds = sdsnewlen(NULL,len);
    if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
        argv[j] = createObject(REDIS_STRING,argsds);
    if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
    }
    // 找到相应的命令
    /* Command lookup */
    cmd = lookupCommand(argv[0]->ptr);
    if (!cmd) {
        redisLog(REDIS_WARNING,"Unknown command '%s' reading the "
        "append only file", (char*)argv[0]->ptr);
        exit(1);
    }
    // 执行命令,模拟服务客户端请求的过程,从而写入数据
    /* Run the command in the context of a fake client */
    fakeClient->argc = argc;
    fakeClient->argv = argv;
    cmd->proc(fakeClient);
    /* The fake client should not have a reply */
    redisAssert(fakeClient->bufpos == 0 && listLength(fakeClient->reply)
    == 0);
    /* The fake client should never get blocked */
    redisAssert((fakeClient->flags & REDIS_BLOCKED) == 0);
    // 释放虚拟客户端空间
    /* Clean up. Command code may have changed argv/argc so we use the
    * argv/argc of the client instead of the local variables. */
    for (j = 0; j < fakeClient->argc; j++)
        decrRefCount(fakeClient->argv[j]);
        zfree(fakeClient->argv);
    }
    /* This point can only be reached when EOF is reached without errors.
    * If the client is in the middle of a MULTI/EXEC, log error and quit. */
    if (fakeClient->flags & REDIS_MULTI) goto readerr;
        // 清理工作
        fclose(fp);
        freeFakeClient(fakeClient);
        // 恢复旧的AOF 状态
        server.aof_state = old_aof_state;
        stopLoading();
        // 记录最近AOF 操作的文件大小
        aofUpdateCurrentSize();
        server.aof_rewrite_base_size = server.aof_current_size;
        return REDIS_OK;
        readerr:
    // 错误,清理工作
    if (feof(fp)) {
        redisLog(REDIS_WARNING,"Unexpected end of file reading the append "
        "only file");
    } else {
        redisLog(REDIS_WARNING,"Unrecoverable error reading the append only "
        "file: %s", strerror(errno));
    }
    exit(1);
    fmterr:
    redisLog(REDIS_WARNING,"Bad file format reading the append only file: "
    "make a backup of your AOF file, then use ./redis-check-aof --fix "
    "");
    exit(1);
}

【参考】
[1] 《Redis设计与实现》

你可能感兴趣的:(Redis源码研究之AOF持久化)