Redis提供的第二种持久化机制AOF(Append-Only File),正如其名字,是以协议文本的方式将所有对数据库进行写操作的命令(及其参数)追加记录到AOF文件中,以此达到记录数据库状态的目的。
基本过程为: 客户端通过网络协议向服务器发起命令请求——>服务器选择命令函数并创建命令参数对象——>数据库执行命令操作,同时服务器将命令操作广播到AOF程序——>AOF程序将命令及参数对象、数据库ID等参数还原为协议文本——>追加到服务器的aof_buf缓存中——>当服务器的常规任务函数或者事件处理器执行时,调用flushAppendOnlyFile函数将aof_buf缓存中的数据写入临时AOF文件——>由同步函数fsync/fdatasync将AOF文件保存到磁盘。
其中主要是文件的写入和保存:
函数flushAppendOnlyFile(简称函数Func)执行以下两个工作:
WRITE:根据条件,将aof_buf中的缓存写入到AOF文件
SAVE:根据条件,调用fsync或fdatasync函数将AOF文件保存到磁盘
两个步骤都需要根据一定的条件来执行,条件由AOF所使用的保存模式决定
AOF保存模式:
** AOF_FSYNC_NO:不保存 (WRITE/SAVE都由主进程完成)
** AOF_FSYNC_EVERYSEC:每秒钟保存一次 (WRITE由主进程完成,SAVE由子线程完成)
** AOF_FSYNC_ALWAYS:每执行一次命令保存一次 (WRITE/SAVE都由主进程完成)
其中 AOF_FSYNC_NO模式下,每次执行Func都会执行WRITE,但是SAVE只有在以下情况才会执行: Redis被关闭,或者AOF功能被关闭,或者系统的写缓存被刷新。
AOF_FSYNC_EVERYSEC模式: 原则上每秒钟执行一次SAVE(由后台子线程完成),但实际上与当前Redis的状态有关,如图:
因此在每秒钟保存一次的模式下,如果在情况1下发生故障停机,那么最多损失不到2秒的数据;在情况2下则可能损失超过2秒的数据。
AOF_FSYNC_ALWAYS模式:在这种模式下,每次执行一个命令都会执行WRITE 和SAVE。且由主进程完成。
以上三种模式,其安全性能依次增加,而效率逐渐下降。
读入AOF文件和保存的主要函数:
将 AOF 缓存写入到文件中:
/* Write the append only file buffer on disk.
 *
 * Two steps are performed, each subject to the configured policy:
 *
 *   WRITE: the whole content of server.aof_buf is handed to a single
 *          write(2) call on server.aof_fd.
 *   SAVE:  depending on server.aof_fsync the file is fsynced right here
 *          (AOF_FSYNC_ALWAYS), handed to a background fsync job
 *          (AOF_FSYNC_EVERYSEC), or not synced by this function at all.
 *
 * force: when non-zero and the policy is AOF_FSYNC_EVERYSEC, the write is
 *        performed even if a background fsync is still pending, instead of
 *        being postponed.
 *
 * On a write error with policy AOF_FSYNC_ALWAYS the process exits. With the
 * other policies server.aof_last_write_status is set to REDIS_ERR, the
 * unwritten data is left in the buffer and the function returns so that the
 * write is attempted again on the next call. */
void flushAppendOnlyFile(int force) {
    ssize_t nwritten;
    int sync_in_progress = 0;

    /* Nothing buffered, nothing to do. */
    if (sdslen(server.aof_buf) == 0) return;

    /* With the every-second policy, check whether a background fsync job
     * is still pending: what happens next depends on it. */
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
        sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0;

    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
        /* With this append fsync policy we do background fsyncing.
         * If the fsync is still in progress we can try to delay
         * the write for a couple of seconds (a write performed while the
         * fsync is running could block the main thread). */
        if (sync_in_progress) {
            if (server.aof_flush_postponed_start == 0) {
                /* No previous write postponing, remember that we are
                 * postponing the flush and return. */
                server.aof_flush_postponed_start = server.unixtime;
                return;
            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
                /* We were already waiting for fsync to finish, but for less
                 * than two seconds this is still ok. Postpone again. */
                return;
            }
            /* Otherwise fall through, and go write since we can't wait
             * over two seconds. */
            server.aof_delayed_fsync++;
            redisLog(REDIS_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
        }
    }

    /* If you are following this code path, then we are going to write so
     * reset the postponed flush sentinel to zero. */
    server.aof_flush_postponed_start = 0;

    /* We want to perform a single write. This should be guaranteed atomic
     * at least if the filesystem we are writing is a real physical one.
     * While this will save us against the server being killed I don't think
     * there is much to do about the whole server stopping for power problems
     * or alike (redis-check-aof is the tool to repair the file then). */
    nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
    if (nwritten != (signed)sdslen(server.aof_buf)) {
        static time_t last_write_error_log = 0;
        int can_log = 0;
        /* Capture errno before any other call (logging included) has the
         * chance to overwrite it. Only meaningful when nwritten == -1. */
        int write_errno = errno;

        /* Limit logging rate to 1 line per AOF_WRITE_LOG_ERROR_RATE seconds. */
        if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) {
            can_log = 1;
            last_write_error_log = server.unixtime;
        }

        /* Log the AOF write error and record the error code. */
        if (nwritten == -1) {
            if (can_log) {
                redisLog(REDIS_WARNING,"Error writing to the AOF file: %s",
                    strerror(write_errno));
            }
            /* Record the error code even when the log line is rate limited,
             * like the short write branch below does. */
            server.aof_last_write_errno = write_errno;
        } else {
            if (can_log) {
                redisLog(REDIS_WARNING,"Short write while writing to "
                                       "the AOF file: (nwritten=%lld, "
                                       "expected=%lld)",
                                       (long long)nwritten,
                                       (long long)sdslen(server.aof_buf));
            }

            /* Try to remove the partial data that was just appended. */
            if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
                if (can_log) {
                    redisLog(REDIS_WARNING, "Could not remove short write "
                             "from the append-only file.  Redis may refuse "
                             "to load the AOF the next time it starts.  "
                             "ftruncate: %s", strerror(errno));
                }
            } else {
                /* If the ftruncate() succeeded we can set nwritten to
                 * -1 since there is no longer partial data into the AOF. */
                nwritten = -1;
            }
            server.aof_last_write_errno = ENOSPC;
        }

        /* Handle the AOF write error. */
        if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
            /* We can't recover when the fsync policy is ALWAYS since the
             * reply for the client is already in the output buffers, and we
             * have the contract with the user that on acknowledged write data
             * is synched on disk. */
            redisLog(REDIS_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");
            exit(1);
        } else {
            /* Recover from failed write leaving data into the buffer. However
             * set an error to stop accepting writes as long as the error
             * condition is not cleared. */
            server.aof_last_write_status = REDIS_ERR;

            /* Trim the sds buffer if there was a partial write, and there
             * was no way to undo it with ftruncate(2). */
            if (nwritten > 0) {
                server.aof_current_size += nwritten;
                sdsrange(server.aof_buf,nwritten,-1);
            }
            return; /* We'll try again on the next call... */
        }
    } else {
        /* Successful write(2). If AOF was in error state, restore the
         * OK state and log the event. */
        if (server.aof_last_write_status == REDIS_ERR) {
            redisLog(REDIS_WARNING,
                "AOF write error looks solved, Redis can write again.");
            server.aof_last_write_status = REDIS_OK;
        }
    }

    /* Account for the bytes just appended to the AOF. */
    server.aof_current_size += nwritten;

    /* Re-use AOF buffer when it is small enough. The maximum comes from the
     * arena size of 4k minus some overhead (but is otherwise arbitrary).
     * Larger buffers are released and replaced by an empty one. */
    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
        sdsclear(server.aof_buf);
    } else {
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }

    /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
     * children doing I/O in the background. */
    if (server.aof_no_fsync_on_rewrite &&
        (server.aof_child_pid != -1 || server.rdb_child_pid != -1))
            return;

    /* Perform the fsync if needed. */
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
        /* aof_fsync is defined as fdatasync() for Linux in order to avoid
         * flushing metadata. */
        aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */
        server.aof_last_fsync = server.unixtime;
    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) {
        /* Every-second policy and the clock moved past the last fsync time:
         * schedule the fsync in the background unless one is already
         * pending. */
        if (!sync_in_progress) aof_background_fsync(server.aof_fd);
        server.aof_last_fsync = server.unixtime;
    }
}
接下来就是 逆向的处理过程:
AOF文件的读取和数据还原:
AOF文件以通信协议的文本格式保存了Redis数据库的状态。因此可以创建一个伪客户端来执行AOF文件中的命令,从而还原数据库的状态。
基本过程如下:
1、创建一个不带网络连接的伪客户端
2、读取AOF所保存的文本,并根据内容还原出命令、参数以及参数个数
3、用伪客户端执行读取的命令
4、重复执行第2、3步,直到AOF文件中保存的数据库状态都被还原出来。
完成第四步之后,AOF所保存的数据库状态就被完整的还原出来了。
加载函数:
int loadAppendOnlyFile(char *filename) {
// 为客户端
struct redisClient *fakeClient;
// 打开 AOF 文件
FILE *fp = fopen(filename,"r");
struct redis_stat sb;
int old_aof_state = server.aof_state;
long loops = 0;
// 检查文件的正确性
if (fp && redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) {
server.aof_current_size = 0;
fclose(fp);
return REDIS_ERR;
}
// 检查文件是否正常打开
if (fp == NULL) {
redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
exit(1);
}
/* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
* to the same file we're about to read.
*
* 暂时性地关闭 AOF ,防止在执行 MULTI 时,
* EXEC 命令被传播到正在打开的 AOF 文件中。
*/
server.aof_state = REDIS_AOF_OFF;
fakeClient = createFakeClient();
// 设置服务器的状态为:正在载入
// startLoading 定义于 rdb.c
startLoading(fp);
while(1) {
int argc, j;
unsigned long len;
robj **argv;
char buf[128];
sds argsds;
struct redisCommand *cmd;
/* Serve the clients from time to time
*
* 间隔性地处理客户端发送来的请求
* 因为服务器正处于载入状态,所以能正常执行的只有 PUBSUB 等模块
*/
if (!(loops++ % 1000)) {
loadingProgress(ftello(fp));
processEventsWhileBlocked();
}
// 读入文件内容到缓存
if (fgets(buf,sizeof(buf),fp) == NULL) {
if (feof(fp))
// 文件已经读完,跳出
break;
else
goto readerr;
}
// 确认协议格式,比如 *3\r\n
if (buf[0] != '*') goto fmterr;
// 取出命令参数,比如 *3\r\n 中的 3
argc = atoi(buf+1);
// 至少要有一个参数(被调用的命令)
if (argc < 1) goto fmterr;
// 从文本中创建字符串对象:包括命令,以及命令参数
// 例如 $3\r\nSET\r\n$3\r\nKEY\r\n$5\r\nVALUE\r\n
// 将创建三个包含以下内容的字符串对象:
// SET 、 KEY 、 VALUE
argv = zmalloc(sizeof(robj*)*argc);
for (j = 0; j < argc; j++) {
if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
if (buf[0] != '$') goto fmterr;
// 读取参数值的长度
len = strtol(buf+1,NULL,10);
// 读取参数值
argsds = sdsnewlen(NULL,len);
if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
// 为参数创建对象
argv[j] = createObject(REDIS_STRING,argsds);
if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
}
/* Command lookup
*
* 查找命令
*/
cmd = lookupCommand(argv[0]->ptr);
if (!cmd) {
redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", (char*)argv[0]->ptr);
exit(1);
}
/* Run the command in the context of a fake client
*
* 调用伪客户端,执行命令
*/
fakeClient->argc = argc;
fakeClient->argv = argv;
cmd->proc(fakeClient);
/* The fake client should not have a reply */
redisAssert(fakeClient->bufpos == 0 && listLength(fakeClient->reply) == 0);
/* The fake client should never get blocked */
redisAssert((fakeClient->flags & REDIS_BLOCKED) == 0);
/* Clean up. Command code may have changed argv/argc so we use the
* argv/argc of the client instead of the local variables.
*
* 清理命令和命令参数对象
*/
for (j = 0; j < fakeClient->argc; j++)
decrRefCount(fakeClient->argv[j]);
zfree(fakeClient->argv);
}
/* This point can only be reached when EOF is reached without errors.
* If the client is in the middle of a MULTI/EXEC, log error and quit.
*
* 如果能执行到这里,说明 AOF 文件的全部内容都可以正确地读取,
* 但是,还要检查 AOF 是否包含未正确结束的事务
*/
if (fakeClient->flags & REDIS_MULTI) goto readerr;
// 关闭 AOF 文件
fclose(fp);
// 释放伪客户端
freeFakeClient(fakeClient);
// 复原 AOF 状态
server.aof_state = old_aof_state;
// 停止载入
stopLoading();
// 更新服务器状态中, AOF 文件的当前大小
aofUpdateCurrentSize();
// 记录前一次重写时的大小
server.aof_rewrite_base_size = server.aof_current_size;
return REDIS_OK;
// 读入错误
readerr:
// 非预期的末尾,可能是 AOF 文件在写入的中途遭遇了停机
if (feof(fp)) {
redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
// 文件内容出错
} else {
redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
}
exit(1);
// 内容格式错误
fmterr:
redisLog(REDIS_WARNING,"Bad file format reading the append only file: make a backup of your AOF file, then use ./redis-check-aof --fix ");
exit(1);
}
AOF采用同步方式将执行的命令及参数等记录到AOF文件中,这将随着时间使得AOF文件越来越大。 因此Redis实现了AOF文件的重写机制,即将当前数据库的状态用最少的命令及参数记录到新的AOF文件中,然后利用新的AOF文件代替旧的AOF文件。 注意: 此处的“重写”实际是不准确的,因为新的AOF文件是直接来自当前数据库的状态,而非旧的AOF文件。
REWRITE由主进程来执行重写,这将阻塞主进程,由于重写数据量较大,所以这将严重影响性能。 而在后台重写(BGREWRITEAOF)时,以上过程(即新AOF文件的写入)都将由子进程来完成;在子进程写新AOF文件期间,主进程会把新到的写命令缓存到服务器的AOF重写缓冲中。当子进程写好新的AOF文件后,将发送信号给主进程,到此子进程结束。接着主进程将AOF重写缓冲中的内容追加到新的AOF文件中,然后用新的AOF文件代替旧的AOF文件。 就此完成整个重写过程。
将AOF重写缓存中的内容写入指定的AOF文件:
/* Flush the AOF rewrite buffer into the file descriptor "fd".
 *
 * The buffer is a list of blocks (server.aof_rewrite_buf_blocks); every block
 * holding data is written with one write(2) call, in list order.
 *
 * Returns the total number of bytes written, or -1 as soon as a write(2)
 * fails or writes fewer bytes than the block holds. When write(2) wrote
 * zero bytes errno is set to EIO. */
ssize_t aofRewriteBufferWrite(int fd) {
    listIter iter;
    listNode *node;
    ssize_t total = 0;

    listRewind(server.aof_rewrite_buf_blocks,&iter);
    for (node = listNext(&iter); node != NULL; node = listNext(&iter)) {
        aofrwblock *blk = listNodeValue(node);
        ssize_t n;

        /* Blocks without data need no write. */
        if (!blk->used) continue;

        n = write(fd,blk->buf,blk->used);
        if (n != blk->used) {
            if (n == 0) errno = EIO;
            return -1;
        }
        total += n;
    }
    return total;
}
/* Write a sequence of commands able to fully rebuild the dataset into
 * "filename". Used both by REWRITEAOF and BGREWRITEAOF.
 *
 * In order to minimize the number of commands needed in the rewritten
 * log Redis uses variadic commands when possible, such as RPUSH, SADD
 * and ZADD. However at max REDIS_AOF_REWRITE_ITEMS_PER_CMD items per time
 * are inserted using a single command.
 *
 * The data is first written to a temp file named after the process pid; only
 * once it has been flushed, fsynced and closed it is renamed to "filename".
 *
 * Returns REDIS_OK on success. On any failure the temp file is removed and
 * REDIS_ERR is returned. */
int rewriteAppendOnlyFile(char *filename) {
    dictIterator *di = NULL;
    dictEntry *de;
    rio aof;
    FILE *fp;
    char tmpfile[256];
    int j;
    long long now = mstime();

    /* Note that we have to use a different temp name here compared to the
     * one used by rewriteAppendOnlyFileBackground() function. */
    snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
    fp = fopen(tmpfile,"w");
    if (!fp) {
        redisLog(REDIS_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));
        return REDIS_ERR;
    }

    rioInitWithFile(&aof,fp);

    /* When incremental fsync is enabled, sync every REDIS_AOF_AUTOSYNC_BYTES
     * written so that a huge amount of data is not accumulated and flushed
     * in a single, long blocking I/O operation. */
    if (server.aof_rewrite_incremental_fsync)
        rioSetAutoSync(&aof,REDIS_AOF_AUTOSYNC_BYTES);

    for (j = 0; j < server.dbnum; j++) {
        char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
        redisDb *db = server.db+j;
        dict *d = db->dict;

        /* Empty databases produce no output at all. */
        if (dictSize(d) == 0) continue;

        di = dictGetSafeIterator(d);
        if (!di) {
            /* Do not leave the partially written temp file behind. */
            fclose(fp);
            unlink(tmpfile);
            return REDIS_ERR;
        }

        /* SELECT the new DB, so that the commands that follow are applied
         * to the right database when the file is loaded. */
        if (rioWrite(&aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;
        if (rioWriteBulkLongLong(&aof,j) == 0) goto werr;

        /* Iterate this DB writing every entry */
        while((de = dictNext(di)) != NULL) {
            sds keystr;
            robj key, *o;
            long long expiretime;

            keystr = dictGetKey(de);
            o = dictGetVal(de);
            initStaticStringObject(key,keystr);

            expiretime = getExpire(db,&key);

            /* If this key is already expired skip it */
            if (expiretime != -1 && expiretime < now) continue;

            /* Save the key and associated value, using the command that
             * matches the type of the value. */
            if (o->type == REDIS_STRING) {
                /* Emit a SET command */
                char cmd[]="*3\r\n$3\r\nSET\r\n";
                if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
                /* Key and value */
                if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
                if (rioWriteBulkObject(&aof,o) == 0) goto werr;
            } else if (o->type == REDIS_LIST) {
                if (rewriteListObject(&aof,&key,o) == 0) goto werr;
            } else if (o->type == REDIS_SET) {
                if (rewriteSetObject(&aof,&key,o) == 0) goto werr;
            } else if (o->type == REDIS_ZSET) {
                if (rewriteSortedSetObject(&aof,&key,o) == 0) goto werr;
            } else if (o->type == REDIS_HASH) {
                if (rewriteHashObject(&aof,&key,o) == 0) goto werr;
            } else {
                redisPanic("Unknown object type");
            }

            /* Save the expire time as a PEXPIREAT command. */
            if (expiretime != -1) {
                char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n";
                if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
                if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
                if (rioWriteBulkLongLong(&aof,expiretime) == 0) goto werr;
            }
        }
        dictReleaseIterator(di);
        /* Forget the released iterator: a later jump to werr must not
         * release it a second time. */
        di = NULL;
    }

    /* Make sure data will not remain on the OS's output buffers */
    if (fflush(fp) == EOF) goto werr;
    if (aof_fsync(fileno(fp)) == -1) goto werr;
    if (fclose(fp) == EOF) {
        /* The stream is gone even when fclose() fails: make sure the error
         * path does not close it again. */
        fp = NULL;
        goto werr;
    }
    fp = NULL;

    /* Use RENAME to make sure the DB file is changed atomically only
     * if the generate DB file is ok. */
    if (rename(tmpfile,filename) == -1) {
        redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
        unlink(tmpfile);
        return REDIS_ERR;
    }
    redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
    return REDIS_OK;

werr:
    /* Log first: the cleanup calls below may overwrite errno. */
    redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
    if (fp) fclose(fp);
    unlink(tmpfile);
    if (di) dictReleaseIterator(di);
    return REDIS_ERR;
}
/* This is how rewriting of the append only file in background works:
 *
 * 1) The user calls BGREWRITEAOF
 * 2) Redis calls this function, that forks():
 *    2a) the child rewrite the append only file in a temp file.
 *    2b) the parent accumulates differences in server.aof_rewrite_buf.
 * 3) When the child finished '2a' exists.
 * 4) The parent will trap the exit code, if it's OK, will append the
 *    data accumulated into server.aof_rewrite_buf into the temp file, and
 *    finally will rename(2) the temp file in the actual file name.
 *    The the new file is reopened as the new append only file. Profit!
 *
 * Returns REDIS_ERR when a rewrite child is already running or fork() fails,
 * REDIS_OK in the parent once the child has been started. */
int rewriteAppendOnlyFileBackground(void) {
    pid_t pid;
    long long fork_started;

    /* Only one AOF rewrite child at a time. */
    if (server.aof_child_pid != -1) return REDIS_ERR;

    /* Taken before fork() so that its duration can be measured. */
    fork_started = ustime();
    pid = fork();

    if (pid == 0) {
        /* Child */
        char path[256];

        closeListeningSockets(0);
        redisSetProcTitle("redis-aof-rewrite");

        /* Rewrite into the "bg" temp file; the exit code tells the parent
         * whether the rewrite succeeded (0) or failed (1). */
        snprintf(path,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
        if (rewriteAppendOnlyFile(path) != REDIS_OK) {
            exitFromChild(1);
        } else {
            size_t cow_bytes = zmalloc_get_private_dirty();

            if (cow_bytes) {
                redisLog(REDIS_NOTICE,
                    "AOF rewrite: %zu MB of memory used by copy-on-write",
                    cow_bytes/(1024*1024));
            }
            exitFromChild(0);
        }
        return REDIS_OK; /* unreached */
    }

    /* Parent */
    server.stat_fork_time = ustime()-fork_started;
    if (pid == -1) {
        redisLog(REDIS_WARNING,
            "Can't rewrite append only file in background: fork: %s",
            strerror(errno));
        return REDIS_ERR;
    }
    redisLog(REDIS_NOTICE,
        "Background append only file rewriting started by pid %d",pid);

    /* Book-keeping about the rewrite that just started. */
    server.aof_rewrite_scheduled = 0;
    server.aof_rewrite_time_start = time(NULL);
    server.aof_child_pid = pid;
    updateDictResizePolicy();

    /* We set appendseldb to -1 in order to force the next call to the
     * feedAppendOnlyFile() to issue a SELECT command, so the differences
     * accumulated by the parent into server.aof_rewrite_buf will start
     * with a SELECT statement and it will be safe to merge. */
    server.aof_selected_db = -1;
    replicationScriptCacheFlush();
    return REDIS_OK;
}
/* A background append only file rewriting (BGREWRITEAOF) terminated its work.
 * Handle this.
 *
 * exitcode: exit status of the rewrite child; 0 is treated as success.
 * bysignal: non-zero when the child was terminated by a signal (the value is
 *           logged as the signal number).
 *
 * On success the rewrite buffer accumulated by the parent is appended to the
 * temp file produced by the child, which is then renamed over
 * server.aof_filename. In every case the rewrite buffer is reset, the temp
 * file is removed and the child bookkeeping fields are cleared. */
void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
    if (!bysignal && exitcode == 0) {
        int newfd, oldfd;
        char tmpfile[256];
        long long now = ustime();

        redisLog(REDIS_NOTICE,
            "Background AOF rewrite terminated with success");

        /* Flush the differences accumulated by the parent to the
         * rewritten AOF. */
        /* Open the temp file written by the child, in append mode. */
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof",
            (int)server.aof_child_pid);
        newfd = open(tmpfile,O_WRONLY|O_APPEND);
        if (newfd == -1) {
            redisLog(REDIS_WARNING,
                "Unable to open the temporary AOF produced by the child: %s", strerror(errno));
            goto cleanup;
        }

        /* Append the accumulated rewrite buffer to the temp file. The
         * write(2) calls are issued from this function, synchronously. */
        if (aofRewriteBufferWrite(newfd) == -1) {
            redisLog(REDIS_WARNING,
                "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno));
            close(newfd);
            goto cleanup;
        }

        redisLog(REDIS_NOTICE,
            "Parent diff successfully flushed to the rewritten AOF (%lu bytes)", aofRewriteBufferSize());

        /* The only remaining thing to do is to rename the temporary file to
         * the configured file and switch the file descriptor used to do AOF
         * writes. We don't want close(2) or rename(2) calls to block the
         * server on old file deletion.
         *
         * There are two possible scenarios:
         *
         * 1) AOF is DISABLED and this was a one time rewrite. The temporary
         * file will be renamed to the configured file. When this file already
         * exists, it will be unlinked, which may block the server.
         *
         * 2) AOF is ENABLED and the rewritten AOF will immediately start
         * receiving writes. After the temporary file is renamed to the
         * configured file, the original AOF file descriptor will be closed.
         * Since this will be the last reference to that file, closing it
         * causes the underlying file to be unlinked, which may block the
         * server.
         *
         * To mitigate the blocking effect of the unlink operation (either
         * caused by rename(2) in scenario 1, or by close(2) in scenario 2), we
         * use a background thread to take care of this. First, we
         * make scenario 1 identical to scenario 2 by opening the target file
         * when it exists. The unlink operation after the rename(2) will then
         * be executed upon calling close(2) for its descriptor. Everything to
         * guarantee atomicity for this switch has already happened by then, so
         * we don't care what the outcome or duration of that close operation
         * is, as long as the file descriptor is released again. */
        if (server.aof_fd == -1) {
            /* AOF disabled */

             /* Don't care if this fails: oldfd will be -1 and we handle that.
              * One notable case of -1 return is if the old file does
              * not exist. */
             oldfd = open(server.aof_filename,O_RDONLY|O_NONBLOCK);
        } else {
            /* AOF enabled */
            oldfd = -1; /* We'll set this to the current AOF filedes later. */
        }

        /* Rename the temporary file. This will not unlink the target file if
         * it exists, because we reference it with "oldfd". */
        if (rename(tmpfile,server.aof_filename) == -1) {
            redisLog(REDIS_WARNING,
                "Error trying to rename the temporary AOF file: %s", strerror(errno));
            close(newfd);
            if (oldfd != -1) close(oldfd);
            goto cleanup;
        }

        if (server.aof_fd == -1) {
            /* AOF disabled, we don't need to set the AOF file descriptor
             * to this new file, so we can close it. */
            close(newfd);
        } else {
            /* AOF enabled, replace the old fd with the new one. */
            oldfd = server.aof_fd;
            server.aof_fd = newfd;
            /* The rewrite buffer was just appended to the new file, so sync
             * it right away according to the configured policy. */
            if (server.aof_fsync == AOF_FSYNC_ALWAYS)
                aof_fsync(newfd);
            else if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
                aof_background_fsync(newfd);
            server.aof_selected_db = -1; /* Make sure SELECT is re-issued */
            /* Refresh the recorded AOF size and use it as the new rewrite
             * base size. */
            aofUpdateCurrentSize();
            server.aof_rewrite_base_size = server.aof_current_size;

            /* Clear regular AOF buffer since its contents was just written to
             * the new AOF from the background rewrite buffer. */
            sdsfree(server.aof_buf);
            server.aof_buf = sdsempty();
        }

        server.aof_lastbgrewrite_status = REDIS_OK;

        redisLog(REDIS_NOTICE, "Background AOF rewrite finished successfully");
        /* Change state from WAIT_REWRITE to ON if needed */
        if (server.aof_state == REDIS_AOF_WAIT_REWRITE)
            server.aof_state = REDIS_AOF_ON;

        /* Asynchronously close the overwritten AOF. */
        if (oldfd != -1) bioCreateBackgroundJob(REDIS_BIO_CLOSE_FILE,(void*)(long)oldfd,NULL,NULL);

        redisLog(REDIS_VERBOSE,
            "Background AOF rewrite signal handler took %lldus", ustime()-now);
    /* The child exited by itself, but with a non-zero exit code. */
    } else if (!bysignal && exitcode != 0) {
        server.aof_lastbgrewrite_status = REDIS_ERR;

        redisLog(REDIS_WARNING,
            "Background AOF rewrite terminated with error");
    /* The child was terminated by a signal. */
    } else {
        server.aof_lastbgrewrite_status = REDIS_ERR;

        redisLog(REDIS_WARNING,
            "Background AOF rewrite terminated by signal %d", bysignal);
    }

cleanup:
    /* Drop the accumulated rewrite buffer. */
    aofRewriteBufferReset();
    /* Remove the temp file of the child (must run before aof_child_pid is
     * cleared below, since the pid is passed to it). */
    aofRemoveTempFile(server.aof_child_pid);
    /* Reset the rewrite bookkeeping and record how long the rewrite took. */
    server.aof_child_pid = -1;
    server.aof_rewrite_time_last = time(NULL)-server.aof_rewrite_time_start;
    server.aof_rewrite_time_start = -1;
    /* Schedule a new rewrite if we are waiting for it to switch the AOF ON. */
    if (server.aof_state == REDIS_AOF_WAIT_REWRITE)
        server.aof_rewrite_scheduled = 1;
}