本文基于redis 5.0.7的源码分析aof持久化。
redis的aof持久化方式有两个重要的组成部分:写命令的追加与同步(写入aof文件),以及aof文件重写。
redis有个缓冲区,未被写入磁盘的命令首先被存入缓冲区,达到条件后再写入磁盘
/* Excerpt of the server state structure: only the AOF buffer field is shown. */
struct redisServer {
    /* ... other fields omitted in this excerpt ... */

    /* sds is the character-string type defined by Redis. */
    sds aof_buf; /* AOF buffer, written before entering the event loop */
};
redis每次执行完写操作后,会调用propagate函数将写操作追加到aof_buf缓冲区
/* Forwards a command to feedAppendOnlyFile() and/or replicationFeedSlaves(),
 * as selected by the PROPAGATE_AOF and PROPAGATE_REPL bits of flags. */
void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc, int flags)
{
    int aof_enabled = (server.aof_state != AOF_OFF);
    int wants_aof = (flags & PROPAGATE_AOF) != 0;
    int wants_repl = (flags & PROPAGATE_REPL) != 0;

    /* Append to the AOF only when AOF is not off and the caller asked for it. */
    if (aof_enabled && wants_aof) {
        feedAppendOnlyFile(cmd, dbid, argv, argc);
    }

    /* Feed the replicas listed in server.slaves when requested. */
    if (wants_repl) {
        replicationFeedSlaves(server.slaves, dbid, argv, argc);
    }
}
aof缓冲区同步到磁盘有三种策略
appendfsync always // 每条写命令同步一次
appendfsync everysec // 每秒钟同步一次
appendfsync no // 不手动同步,由操作系统决定同步的时刻
说明: appendfsync no策略仅仅调用write函数将缓冲区的数据写入操作系统内核缓冲区,至于内核缓冲区的数据什么时候写入磁盘,由操作系统决定。
每秒钟同步一次是一种折中(compromise)策略。同步需要调用系统函数fsync,涉及到操作系统用户态和核心态的切换,同时真正的磁盘IO发生在这里,比较耗性能。
redis在每次进入事件循环之前(beforeSleep)调用flushAppendOnlyFile把aof_buf写入文件并按策略同步数据;serverCron里也会在写入被推迟或上次写入出错时再次调用它
/* Writes server.aof_buf to the AOF file descriptor and then, depending on
 * server.aof_fsync, either fsyncs inline or hands the fsync to a background
 * job. This is an abridged excerpt: the "... ..." lines mark omitted code.
 *
 * force: not referenced in the visible part of this excerpt. */
void flushAppendOnlyFile(int force) {
ssize_t nwritten;
/* NOTE(review): only the initialisation is visible here; presumably updated
 * in the omitted code -- confirm against the full source. */
int sync_in_progress = 0;
mstime_t latency;
... ...
latencyStartMonitor(latency);
// Write the contents of aof_buf to the AOF file; at this point the data has
// only been handed to the kernel and has not been fsynced yet.
nwritten = aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
latencyEndMonitor(latency);
... ...
try_fsync:
/* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
 * children doing I/O in the background. */
if (server.aof_no_fsync_on_rewrite &&
(server.aof_child_pid != -1 || server.rdb_child_pid != -1))
return;
/* Perform the fsync if needed. */
if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
/* redis_fsync is defined as fdatasync() for Linux in order to avoid
 * flushing metadata. */
latencyStartMonitor(latency);
// Call redis_fsync() directly, in this thread, to push the kernel-buffered
// data to disk.
redis_fsync(server.aof_fd); /* Let's try to get this data on the disk */
latencyEndMonitor(latency);
latencyAddSampleIfNeeded("aof-fsync-always",latency);
server.aof_fsync_offset = server.aof_current_size;
server.aof_last_fsync = server.unixtime;
} else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
server.unixtime > server.aof_last_fsync)) {
if (!sync_in_progress) {
// Hand the fsync to the background instead of blocking here:
// aof_background_fsync() queues a BIO_AOF_FSYNC job for the fd.
aof_background_fsync(server.aof_fd);
server.aof_fsync_offset = server.aof_current_size;
}
// Updated even when no job was queued, so this branch is taken at most once
// per value of server.unixtime.
server.aof_last_fsync = server.unixtime;
}
}
如果是每秒同步一次,主线程不会直接调用fsync,而是向后台IO线程的任务队列提交一个BIO_AOF_FSYNC任务,由后台线程调用fsync同步数据。
/* Schedules an fsync() of the given file descriptor (the AOF file's one) to
 * run in another thread, by creating a BIO_AOF_FSYNC background job. */
void aof_background_fsync(int fd) {
    /* The descriptor is carried in the job's first pointer argument. */
    void *fd_arg = (void*)(long)fd;

    bioCreateBackgroundJob(BIO_AOF_FSYNC, fd_arg, NULL, NULL);
}
后台调度线程会从任务队列中取出任务并执行
/* Worker loop that processes background jobs of a single type. arg carries
 * the job type, which indexes the per-type queue, mutex, pending counter and
 * step condition variable used below. The loop never exits.
 *
 * NOTE(review): this looks like an abridged excerpt. As shown, no
 * pthread_mutex_lock() precedes the first pthread_mutex_unlock(), ln->value
 * is read without checking that the queue is non-empty, and sigset is
 * declared but never used. Confirm against the full source before reusing. */
void *bioProcessBackgroundJobs(void *arg) {
struct bio_job *job;
unsigned long type = (unsigned long) arg;
sigset_t sigset;
while(1) {
listNode *ln;
// Take the job at the head of this type's queue.
ln = listFirst(bio_jobs[type]);
job = ln->value;
/* It is now possible to unlock the background system as we now have
 * a stand alone job structure to process.*/
pthread_mutex_unlock(&bio_mutex[type]);
// Run the job according to its type.
if (type == BIO_CLOSE_FILE) {
// Close the file descriptor carried in arg1.
close((long)job->arg1);
} else if (type == BIO_AOF_FSYNC) {
// fsync the file descriptor carried in arg1.
redis_fsync((long)job->arg1);
} else if (type == BIO_LAZY_FREE) {
// Which lazy-free routine runs depends on which arguments are set.
if (job->arg1)
lazyfreeFreeObjectFromBioThread(job->arg1);
else if (job->arg2 && job->arg3)
lazyfreeFreeDatabaseFromBioThread(job->arg2,job->arg3);
else if (job->arg3)
lazyfreeFreeSlotsMapFromBioThread(job->arg3);
} else {
serverPanic("Wrong job type in bioProcessBackgroundJobs().");
}
// Free the job structure.
zfree(job);
// Re-acquire the lock before touching the queue and the pending counter.
pthread_mutex_lock(&bio_mutex[type]);
// Remove the node that was taken from the head of the queue.
listDelNode(bio_jobs[type],ln);
bio_pending[type]--;
/* Unblock threads blocked on bioWaitStepOfType() if any. */
pthread_cond_broadcast(&bio_step_cond[type]);
}
}
很多帖子说redis是单线程的,非也非也。redis是多进程多线程的,只是单线程执行读写命令而已。
write系统调用不保证数据最终写入磁盘,如果遇到机器掉电,内核缓冲区的数据没有写入磁盘,那么数据也就丢了。所以仅调用write是不够的。
与redis的aof机制类似,MySQL同样是将写命令追加到binlog里。给mysql的master配置一个slave时,slave会拿master的binlog来恢复数据,从而和master数据保持一致。
随着写操作的进行,aof文件越来越大,同时冗余的数据也很多。假设有这样的一个操作序列
set key value1
set key value2
set key value3
... ...
set key valueN
那么aof文件仅仅需要保存 set key valueN
这个命令用于数据库还原。
auto-aof-rewrite-percentage 100 // 当前aof文件大小相对上次重写后文件大小的增长百分比达到该值时触发重写
auto-aof-rewrite-min-size 64mb // 触发自动重写所需的aof文件最小大小
在serverCron大事件循环里判断是否满足重写的条件
/* Abridged excerpt of serverCron() showing only the automatic AOF rewrite
 * trigger; the "... ..." lines mark omitted code. */
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
... ...
/* Trigger an AOF rewrite if needed. */
if (server.aof_state == AOF_ON &&
server.rdb_child_pid == -1 &&
server.aof_child_pid == -1 &&
server.aof_rewrite_perc &&
// The current AOF size must exceed the configured minimum size.
server.aof_current_size > server.aof_rewrite_min_size)
{
// Fall back to a base of 1 when aof_rewrite_base_size is 0, so the division
// below never divides by zero.
long long base = server.aof_rewrite_base_size ?
server.aof_rewrite_base_size : 1;
// Growth of the current size over the base, in percent.
long long growth = (server.aof_current_size*100/base) - 100;
// The growth has reached the configured rewrite percentage.
if (growth >= server.aof_rewrite_perc) {
serverLog(LL_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth);
// Start the AOF rewrite in the background.
rewriteAppendOnlyFileBackground();
}
}
... ...
}
在子进程里,redis将数据库中所有的键值对转换成对应格式的命令写入新的aof文件。源码就不贴了,基本就是这个流程。