Redis(九) AOF持久化介绍及部分源码解析

一.概述

    AOF持久化不同于RDB存储整个数据结构的方式,AOF是通过记录redis服务器所执行的写命令来记录数据库状态的。

二.AOF持久化的实现

       AOF持久化操作分为三个步骤,分别是:命令追加,文件写入,文件同步。具体如下:redis在每次执行写命令时(比如增删改),都会将该条命令加入到服务器结构redisServer的AOF缓冲区中。redis像muduo一样是基于eventloop(事件循环)的,在每次循环中,redis都会根据设置的策略决定是否要将AOF缓冲区中的数据写入AOF文件,以及是否要等待文件同步(fsync,参博文《文件访问》)。

      Redis事件循环中的serverCron函数默认会每隔100ms执行一次,其中会调用flushAppendOnlyFile函数,该函数会判断是否将aof_buf内的数据写入到文件以及是否进行文件同步,且当同步策略为everysec时文件的同步操作由后台线程完成,具体代码如下所示:

1.redis任务线程代码:

     bio.c文件中定义了redis工作线程池,每种任务选项都对应了一个线程(如:AOF的文件同步操作便对应着一个线程),但工作线程只执行两种操作:关闭文件和AOF文件同步。redis的线程实现如下:

static pthread_t bio_threads[REDIS_BIO_NUM_OPS];        // thread identifiers, one slot per job type
static pthread_mutex_t bio_mutex[REDIS_BIO_NUM_OPS]; 	// per-type mutex: guards the job queue and is paired with the condition variable
static pthread_cond_t bio_condvar[REDIS_BIO_NUM_OPS];	// per-type condition variable
static list *bio_jobs[REDIS_BIO_NUM_OPS];               // per-type job queue (one queue per worker thread)
static unsigned long long bio_pending[REDIS_BIO_NUM_OPS]; // number of jobs still pending for each type

// Describes one background job. Per the original note, this structure is
// only used inside this file.
struct bio_job {
    time_t time;  // time at which the job was created
    void *arg1, *arg2, *arg3; // job arguments; if more than three are needed, pass a pointer to a struct instead
};

/* Set up the background I/O subsystem.
 *
 * For every job type this initializes the mutex, condition variable, job
 * queue and pending counter, then spawns one worker thread per type whose
 * entry point is bioProcessBackgroundJobs(). The process exits if a thread
 * cannot be created. */
void bioInit(void) {
    pthread_attr_t thread_attr;
    size_t stack_bytes;
    int op;

    /* Per-type synchronization objects and queue state. */
    op = 0;
    while (op < REDIS_BIO_NUM_OPS) {
        pthread_mutex_init(&bio_mutex[op],NULL);
        pthread_cond_init(&bio_condvar[op],NULL);
        bio_jobs[op] = listCreate();
        bio_pending[op] = 0;
        op++;
    }

    /* Start from the attribute's current stack size (treating 0 as 1) and
     * keep doubling it until it is at least REDIS_THREAD_STACK_SIZE, since
     * the default may be too small on some systems. */
    pthread_attr_init(&thread_attr);
    pthread_attr_getstacksize(&thread_attr,&stack_bytes);
    if (stack_bytes == 0) stack_bytes = 1;
    while (stack_bytes < REDIS_THREAD_STACK_SIZE) stack_bytes *= 2;
    pthread_attr_setstacksize(&thread_attr, stack_bytes);

    /* One worker per job type; the type is passed as the thread argument. */
    for (op = 0; op < REDIS_BIO_NUM_OPS; op++) {
        pthread_t tid;

        if (pthread_create(&tid,&thread_attr,bioProcessBackgroundJobs,
                           (void*)(unsigned long) op) != 0) {
            redisLog(REDIS_WARNING,"Fatal: Can't initialize Background Jobs.");
            exit(1);
        }
        bio_threads[op] = tid; /* remember the thread identifier */
    }
}

/* Create a job and queue it for the worker thread of the given type.
 *
 * type             : job type, used as index into the per-type arrays
 * arg1, arg2, arg3 : opaque arguments stored in the job */
void bioCreateBackgroundJob(int type, void *arg1, void *arg2, void *arg3) {
    struct bio_job *new_job = zmalloc(sizeof(struct bio_job));

    /* Fill in the arguments and the creation timestamp. */
    new_job->arg1 = arg1;
    new_job->arg2 = arg2;
    new_job->arg3 = arg3;
    new_job->time = time(NULL);

    /* Under the per-type lock: append to the tail of the queue, bump the
     * pending counter and signal the condition variable the worker waits on. */
    pthread_mutex_lock(&bio_mutex[type]);
    listAddNodeTail(bio_jobs[type],new_job);
    bio_pending[type] += 1;
    pthread_cond_signal(&bio_condvar[type]);
    pthread_mutex_unlock(&bio_mutex[type]);
}

// 返回type类型任务队列中挂起任务的个数
unsigned long long bioPendingJobsOfType(int type) {
    unsigned long long val;
    pthread_mutex_lock(&bio_mutex[type]);
    val = bio_pending[type];
    pthread_mutex_unlock(&bio_mutex[type]);
    return val;
}


/* Cancel every background worker thread and wait for it to terminate.
 *
 * For each job type the thread is cancelled with pthread_cancel(); if the
 * cancellation request was accepted the thread is joined, and the outcome of
 * the join is logged. Threads for which pthread_cancel() fails are skipped
 * silently. */
void bioKillThreads(void) {
    int err, j;

    for (j = 0; j < REDIS_BIO_NUM_OPS; j++) {
        if (pthread_cancel(bio_threads[j]) != 0) continue;

        err = pthread_join(bio_threads[j],NULL);
        if (err != 0) {
            /* pthread_join() returns the error number directly. The message
             * previously read "can be joined", the opposite of what
             * happened. */
            redisLog(REDIS_WARNING,
                "Bio thread for job type #%d can not be joined: %s",
                    j, strerror(err));
        } else {
            redisLog(REDIS_WARNING,
                "Bio thread for job type #%d terminated",j);
        }
    }
}


/* Entry point of a background worker thread.
 *
 * arg carries the job type (an index into the per-type arrays) cast to a
 * pointer. The thread loops forever: it waits for jobs of its type, pops the
 * first one, runs it without holding the lock, then removes it from the
 * queue and decrements the pending counter. The function never returns. */
void *bioProcessBackgroundJobs(void *arg) {
    struct bio_job *job;
    unsigned long type = (unsigned long) arg;
    sigset_t sigset;
    int sigmask_err;

    /* Cancellation settings are not part of pthread_attr_t; they are set by
     * the thread itself. Allow this thread to be cancelled... */
    pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);

    /* ...and make cancellation asynchronous, i.e. it may take effect at any
     * time instead of only at cancellation points (the deferred default). */
    pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

    pthread_mutex_lock(&bio_mutex[type]);

    /* Block SIGALRM in this thread so that, per the original note, only the
     * main thread receives the watchdog signal. */
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGALRM);
    /* pthread_sigmask() reports failure through its return value and does
     * not set errno, so the returned code is what must be formatted. */
    sigmask_err = pthread_sigmask(SIG_BLOCK, &sigset, NULL);
    if (sigmask_err != 0)
        redisLog(REDIS_WARNING,
            "Warning: can't mask SIGALRM in bio.c thread: %s", strerror(sigmask_err));

    while(1) {
        listNode *ln;

        /* Every iteration starts with the mutex held. If the queue is empty,
         * sleep on the condition variable; pthread_cond_wait() releases the
         * mutex while waiting and re-acquires it before returning. */
        if (listLength(bio_jobs[type]) == 0) {
            pthread_cond_wait(&bio_condvar[type],&bio_mutex[type]);
            continue;
        }

        /* Take the first job of the queue (it stays linked until done). */
        ln = listFirst(bio_jobs[type]);
        job = ln->value;
        /* Run the job without holding the lock. */
        pthread_mutex_unlock(&bio_mutex[type]);

        /* Only two kinds of work exist: closing a file descriptor and
         * fsync-ing the AOF file descriptor. */
        if (type == REDIS_BIO_CLOSE_FILE) {
            close((long)job->arg1);
        } else if (type == REDIS_BIO_AOF_FSYNC) {
            aof_fsync((long)job->arg1);
        } else {
            redisPanic("Wrong job type in bioProcessBackgroundJobs().");
        }
        zfree(job);

        /* Re-acquire the lock before the next iteration, then unlink the
         * finished job and update the pending counter. */
        pthread_mutex_lock(&bio_mutex[type]);
        listDelNode(bio_jobs[type],ln);
        bio_pending[type]--;
    }
}

      在了解了redis任务线程后,接下来说明flushAppendOnlyFile如何进行AOF持久化操作,其代码如下所示:

#define AOF_WRITE_LOG_ERROR_RATE 30 /* Seconds between errors logging. */
/* Write the AOF buffer (server.aof_buf) to the AOF file, then fsync it
 * according to the configured policy (server.aof_fsync).
 *
 * force: when non-zero, the EVERYSEC "postpone the write while a background
 * fsync is still pending" check below is skipped and the write is attempted
 * immediately. */
void flushAppendOnlyFile(int force) {
    ssize_t nwritten;
    int sync_in_progress = 0;
    mstime_t latency;

    // Nothing buffered, nothing to persist.
    if (sdslen(server.aof_buf) == 0) return;

    // server.aof_fsync selects the fsync policy.
    // With AOF_FSYNC_EVERYSEC the fsync is executed by a background thread.
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
        sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0; // is a background fsync job still pending?

	// EVERYSEC policy and the caller did not force the write.
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
        // If a background fsync is still pending, postpone this write for up
        // to 2 seconds to give the fsync a chance to complete.
        if (sync_in_progress) {
            if (server.aof_flush_postponed_start == 0) {
                // First postponement: remember when postponing started.
                server.aof_flush_postponed_start = server.unixtime;
                return;
            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
                // server.unixtime is a cached, second-resolution clock that
                // avoids a system call on every use.
                return;
            }
            // Postponed for 2 seconds or more and the fsync is still
            // pending: count the event and write anyway.
            server.aof_delayed_fsync++;
            redisLog(REDIS_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
        }
    }

    // Write aof_buf to the AOF file and measure how long the write takes.
    latencyStartMonitor(latency);
    nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
    latencyEndMonitor(latency);

    // Report the write latency under an event name that reflects what else
    // was going on during the write.
    if (sync_in_progress) {
        // A background fsync job was still pending.
        latencyAddSampleIfNeeded("aof-write-pending-fsync",latency);
    } else if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) {
        // An AOF rewrite or RDB save child is running.
        latencyAddSampleIfNeeded("aof-write-active-child",latency);
    } else {
        // Nothing else was going on.
        latencyAddSampleIfNeeded("aof-write-alone",latency);
    }
    // Also report under the generic "aof-write" event.
    latencyAddSampleIfNeeded("aof-write",latency);

    // A write was attempted, so reset the postponement timestamp.
    server.aof_flush_postponed_start = 0;

    // A result different from the buffer length means an error (-1) or a
    // short write.
    if (nwritten != (signed)sdslen(server.aof_buf)) {
        static time_t last_write_error_log = 0;
        int can_log = 0;

        // Rate-limit error logging: log only if more than
        // AOF_WRITE_LOG_ERROR_RATE seconds passed since the last logged error.
        if ((server.unixtime - last_write_error_log) > AOF_WRITE_LOG_ERROR_RATE) {
            can_log = 1;
            last_write_error_log = server.unixtime;
        }

        // Log the failure and record an error code.
        if (nwritten == -1) {
            if (can_log) {
                // write() failed outright.
                // NOTE(review): aof_last_write_errno is only updated when
                // logging is allowed, and errno is read again after
                // redisLog() — confirm this is intended.
                redisLog(REDIS_WARNING,"Error writing to the AOF file: %s",
                    strerror(errno));
                server.aof_last_write_errno = errno;
            }
        } else {
            if (can_log) {
                // Short write: only part of the buffer reached the file.
                redisLog(REDIS_WARNING,"Short write while writing to "
                                       "the AOF file: (nwritten=%lld, "
                                       "expected=%lld)",
                                       (long long)nwritten,
                                       (long long)sdslen(server.aof_buf));
            }

            // Try to remove the partial data by truncating the file back to
            // its size before this write.
            if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
                if (can_log) {
                    redisLog(REDIS_WARNING, "Could not remove short write "
                             "from the append-only file.  Redis may refuse "
                             "to load the AOF the next time it starts.  "
                             "ftruncate: %s", strerror(errno));
                }
            } else {
                // Truncation succeeded: treat the write as if nothing had
                // been written, so the partial-write bookkeeping is skipped.
                nwritten = -1;
            }
            server.aof_last_write_errno = ENOSPC;
        }

        /* Handle the AOF write error. */
        if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
            // With the "always" policy a failed write cannot be recovered
            // from: log and terminate the process.
            redisLog(REDIS_WARNING,"Can't recover from AOF write error when the AOF fsync policy is 'always'. Exiting...");
            exit(1);
        } else {
            server.aof_last_write_status = REDIS_ERR;
            if (nwritten > 0) {
                // Only reached when the truncation failed: account for the
                // bytes that are in the file and drop them from aof_buf.
                server.aof_current_size += nwritten;
                sdsrange(server.aof_buf,nwritten,-1);
            }
			
            return; /* We'll try again on the next call... */
        }
    } else {
        // The whole buffer was written.
        if (server.aof_last_write_status == REDIS_ERR) {
            // The previous write had failed and this one succeeded: log the
            // recovery and clear the error status.
            redisLog(REDIS_WARNING,
                "AOF write error looks solved, Redis can write again.");
            server.aof_last_write_status = REDIS_OK;
        }
    }
    // Reaching this point means the write completed normally.
    server.aof_current_size += nwritten;

    // If the buffer's total allocation (used + available) is below 4000
    // bytes, just clear it for reuse; otherwise free it and start over with
    // an empty buffer.
    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
        sdsclear(server.aof_buf);
    } else {
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }

    /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
     * children doing I/O in the background. */
    /* That is: with server.aof_no_fsync_on_rewrite set and an RDB or AOF
     * child running, return here without fsync-ing.
     */
    if (server.aof_no_fsync_on_rewrite &&
        (server.aof_child_pid != -1 || server.rdb_child_pid != -1))
            return;

    // AOF_FSYNC_ALWAYS: fsync here, in the calling thread.
    // AOF_FSYNC_EVERYSEC: hand the fsync to the background thread, at most
    // once per server.unixtime tick.
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
        latencyStartMonitor(latency);
        aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("aof-fsync-always",latency);
        server.aof_last_fsync = server.unixtime;
    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) {
        if (!sync_in_progress) aof_background_fsync(server.aof_fd);
        server.aof_last_fsync = server.unixtime;
    }
}

 

3.AOF重写

AOF重写的总体示意图如下:

    Redis(九) AOF持久化介绍及部分源码解析_第1张图片

在定期执行的serverCron函数中将会判断aof文件是否需要重写,当以下两个条件同时满足时便会进行aof重写:1.aof文件大小超过了最小重写阈值。2.与上次重写之后的aof文件大小相比若增长率超过了增长阈值。其判断代码如下所述:

/* Excerpt of serverCron(): automatic AOF rewrite trigger (surrounding code
 * is elided with "..."). */
void serverCron()
{  
    ...
    // server.aof_rewrite_base_size: AOF size recorded after the last rewrite.
    // server.aof_rewrite_perc: growth-percentage threshold (0 disables the check).
    // Start a rewrite when no RDB/AOF child is running, the current AOF size
    // exceeds the minimum rewrite size, and the growth threshold is reached.
    if (server.rdb_child_pid == -1 &&
        server.aof_child_pid == -1 &&
        server.aof_rewrite_perc &&
        server.aof_current_size > server.aof_rewrite_min_size)
    {
        // Use 1 as the base when no base size is recorded, to avoid dividing by zero.
        long long base = server.aof_rewrite_base_size ? server.aof_rewrite_base_size : 1;
			
        // Growth, in percent, relative to the base size.
        long long growth = (server.aof_current_size*100/base) - 100;
        if (growth >= server.aof_rewrite_perc) { // growth threshold reached
           redisLog(REDIS_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth);
           // Kick off the background rewrite.
           rewriteAppendOnlyFileBackground();
        }
    }
    ...
}

 

        在rewriteAppendOnlyFileBackground()函数中首先会创建3个管道用于服务器进程与aof重写子进程间的通信,关于管道的介绍可参博文《进程间通信(IPC)》。其创建函数如下所示:

/* Readable-event callback, in the server process, for the pipe on which the
 * rewrite child sends its "stop sending diffs" request.
 *
 * If exactly one byte is read and it is '!', the server sets
 * server.aof_stop_sending_diff and answers with a '!' on the ack pipe to the
 * child. In every case the readable event on
 * server.aof_pipe_read_ack_from_child is removed afterwards. */
void aofChildPipeReadable(aeEventLoop *el, int fd, void *privdata, int mask) {
    char ch;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(privdata);
    REDIS_NOTUSED(mask);

    if (read(fd,&ch,1) == 1 && ch == '!') {
        ssize_t acked;

        redisLog(REDIS_NOTICE,"AOF rewrite child asks to stop sending diffs.");
        server.aof_stop_sending_diff = 1;

        acked = write(server.aof_pipe_write_ack_to_child,"!",1);
        if (acked != 1) {
            /* The ack could not be sent: tell the user but do not retry.
             * The child uses a timeout on its side if the kernel cannot
             * buffer the write, or the child may already be gone. */
            redisLog(REDIS_WARNING,"Can't send ACK to AOF child: %s",
                strerror(errno));
        }
    }

    aeDeleteFileEvent(server.el,server.aof_pipe_read_ack_from_child,AE_READABLE);
}

/* Create the three anonymous pipes used by the server process and the AOF
 * rewrite child to talk to each other, and record their descriptors in
 * `server`.
 *
 *   fds[0]/fds[1] : parent -> child, diff data (both ends non-blocking)
 *   fds[2]/fds[3] : child -> parent, "stop sending diffs" request
 *   fds[4]/fds[5] : parent -> child, ack of that request
 *
 * Returns REDIS_OK on success. On failure every descriptor opened so far is
 * closed and REDIS_ERR is returned. */
int aofCreatePipes(void) {
    int fds[6] = {-1, -1, -1, -1, -1, -1};
    int j;

    /* Each pipe() call fills a (read end, write end) pair. */
    for (j = 0; j < 6; j += 2) {
        if (pipe(fds+j) == -1) goto error;
    }

    /* The data pipe must not block either side. */
    if (anetNonBlock(NULL,fds[0]) != ANET_OK ||
        anetNonBlock(NULL,fds[1]) != ANET_OK) goto error;

    /* Watch the read end of the child -> parent pipe: when the child sends
     * '!' to ask the server to stop sending diffs, aofChildPipeReadable()
     * runs. */
    if (aeCreateFileEvent(server.el, fds[2], AE_READABLE, aofChildPipeReadable, NULL) == AE_ERR) goto error;

    /* Data: parent writes fds[1], child reads fds[0]. */
    server.aof_pipe_read_data_from_parent = fds[0];
    server.aof_pipe_write_data_to_child = fds[1];

    /* Request: child writes fds[3], parent reads fds[2]. */
    server.aof_pipe_read_ack_from_child = fds[2];
    server.aof_pipe_write_ack_to_parent = fds[3];

    /* Ack: parent writes fds[5], child reads fds[4]. */
    server.aof_pipe_read_ack_from_parent = fds[4];
    server.aof_pipe_write_ack_to_child = fds[5];

    server.aof_stop_sending_diff = 0;
    return REDIS_OK;

error:
    redisLog(REDIS_WARNING,"Error opening /setting AOF rewrite IPC pipes: %s",
        strerror(errno));
    for (j = 0; j < 6; j++) {
        if (fds[j] != -1) close(fds[j]);
    }
    return REDIS_ERR;
}

      每当执行写命令时,都会调用feedAppendOnlyFile()函数将命令追加到aof_buf,若正在进行aof重写,该函数还会额外添加到aof重写缓存中一份。aof重写缓存是一个由一个个缓存单元组成的链表(server.aof_rewrite_buf_blocks)(类似于TCP/IP协议中的mbuf)。当进行aof重写的过程中,服务器进程会通过管道将aof重写缓存中的数据发送给aof重写进程,这样可以减少aof重写进程结束后服务器进程要进行的合并数据,从而减少了服务器失去响应的时间。feedAppendOnlyFile函数及aof重写缓存的实现如下所示:

/* Serialize one executed command and feed it to the AOF machinery.
 *
 * cmd       : the command that was executed (its proc selects the encoding)
 * dictid    : id of the database the command ran against
 * argv/argc : the command's argument vector
 *
 * The serialized bytes are appended to server.aof_buf when AOF is on, and
 * also to the rewrite buffer while an AOF rewrite child is running. */
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
    sds cmdbuf = sdsempty();
    int is_expire_family, is_setex_family;

    /* If the command targets a database other than the one last selected in
     * the AOF stream, emit a SELECT for it first. */
    if (server.aof_selected_db != dictid) {
        char dbnum[64];

        snprintf(dbnum,sizeof(dbnum),"%d",dictid);
        cmdbuf = sdscatprintf(cmdbuf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
            (unsigned long)strlen(dbnum),dbnum);
        server.aof_selected_db = dictid;
    }

    is_expire_family = (cmd->proc == expireCommand ||
                        cmd->proc == pexpireCommand ||
                        cmd->proc == expireatCommand);
    is_setex_family = (cmd->proc == setexCommand ||
                       cmd->proc == psetexCommand);

    if (is_expire_family) {
        /* Expire-style commands go through the dedicated encoder. */
        cmdbuf = catAppendOnlyExpireAtCommand(cmdbuf,cmd,argv[1],argv[2]);
    } else if (is_setex_family) {
        /* Translate SETEX/PSETEX to SET and PEXPIREAT */
        robj *setargv[3];

        setargv[0] = createStringObject("SET",3);
        setargv[1] = argv[1];
        setargv[2] = argv[3];
        cmdbuf = catAppendOnlyGenericCommand(cmdbuf,3,setargv);
        decrRefCount(setargv[0]);
        cmdbuf = catAppendOnlyExpireAtCommand(cmdbuf,cmd,argv[1],argv[2]);
    } else {
        /* Every other command is stored verbatim. */
        cmdbuf = catAppendOnlyGenericCommand(cmdbuf,argc,argv);
    }

    /* Regular AOF buffer. */
    if (server.aof_state == REDIS_AOF_ON) {
        server.aof_buf = sdscatlen(server.aof_buf,cmdbuf,sdslen(cmdbuf));
    }

    /* Rewrite buffer, only while a rewrite child exists. */
    if (server.aof_child_pid != -1) {
        aofRewriteBufferAppend((unsigned char*)cmdbuf,sdslen(cmdbuf));
    }

    sdsfree(cmdbuf);
}

      aof重写缓存如下所示:

#define AOF_RW_BUF_BLOCK_SIZE (1024*1024*10)    /* 10 MB per block */
// One block of the AOF rewrite buffer; the buffer is a list of such blocks.
typedef struct aofrwblock {
    unsigned long used, free;   // bytes used and bytes still available in buf
    char buf[AOF_RW_BUF_BLOCK_SIZE];
} aofrwblock;

/* Return the current size of the AOF rewrite buffer, i.e. the sum of the
 * `used` fields of every block in server.aof_rewrite_buf_blocks. */
unsigned long aofRewriteBufferSize(void) {
    listIter it;
    listNode *node;
    unsigned long total = 0;

    listRewind(server.aof_rewrite_buf_blocks,&it);
    for (node = listNext(&it); node != NULL; node = listNext(&it)) {
        aofrwblock *blk = listNodeValue(node);

        total += blk->used;
    }
    return total;
}

/* Append `len` bytes starting at `s` to the AOF rewrite buffer, allocating
 * new aofrwblock blocks at the tail of the list whenever the last block is
 * full. Afterwards, make sure the write end of the data pipe to the child is
 * registered for writable events so the data gets sent. */
void aofRewriteBufferAppend(unsigned char *s, unsigned long len) {
    listNode *tail = listLast(server.aof_rewrite_buf_blocks);
    aofrwblock *cur = NULL;

    if (tail) cur = tail->value;

    while (len > 0) {
        /* Copy as much as fits into the current (last) block, if any. */
        if (cur != NULL && cur->free > 0) {
            unsigned long chunk = len;

            if (cur->free < chunk) chunk = cur->free;
            memcpy(cur->buf+cur->used, s, chunk);
            cur->used += chunk;
            cur->free -= chunk;
            s += chunk;
            len -= chunk;
        }

        /* Data left over: first block to allocate, or another one needed. */
        if (len > 0) {
            int numblocks;

            cur = zmalloc(sizeof(*cur));
            cur->free = AOF_RW_BUF_BLOCK_SIZE;
            cur->used = 0;
            listAddNodeTail(server.aof_rewrite_buf_blocks,cur);

            /* Log whenever (block count + 1) is a multiple of 10: as a
             * warning if it is also a multiple of 100, as a notice
             * otherwise. */
            numblocks = listLength(server.aof_rewrite_buf_blocks);
            if (((numblocks+1) % 10) == 0) {
                int level;

                if (((numblocks+1) % 100) == 0)
                    level = REDIS_WARNING;
                else
                    level = REDIS_NOTICE;
                redisLog(level,"Background AOF buffer size: %lu MB",
                    aofRewriteBufferSize()/(1024*1024));
            }
        }
    }

    /* If no event is registered yet for the write end of the data pipe,
     * install aofChildWriteDiffData() as its writable handler. */
    if (aeGetFileEvents(server.el,server.aof_pipe_write_data_to_child) == 0) {
        aeCreateFileEvent(server.el, server.aof_pipe_write_data_to_child,
            AE_WRITABLE, aofChildWriteDiffData, NULL);
    }
}

/* Write the whole AOF rewrite buffer to file descriptor `fd`.
 *
 * Per the original note, the server uses this after the rewrite child has
 * finished, to append the remaining diff data to the new AOF file.
 *
 * Returns the number of bytes written, or -1 as soon as one block cannot be
 * written in full (errno is set to EIO when write() returned 0). */
ssize_t aofRewriteBufferWrite(int fd) {
    listIter it;
    listNode *node;
    ssize_t total = 0;

    listRewind(server.aof_rewrite_buf_blocks,&it);
    while ((node = listNext(&it)) != NULL) {
        aofrwblock *blk = listNodeValue(node);
        ssize_t n;

        /* Empty blocks contribute nothing. */
        if (blk->used == 0) continue;

        n = write(fd,blk->buf,blk->used);
        if (n != (ssize_t)blk->used) {
            if (n == 0) errno = EIO;
            return -1;
        }
        total += n;
    }
    return total;
}


/* Writable-event handler for the parent -> child data pipe.
 *
 * While a rewrite is in progress it pushes the contents of the rewrite
 * buffer to the child, so that less data is left for the server to merge
 * once the child has finished. The handler unregisters itself when the
 * child asked to stop receiving diffs or when the buffer is empty. */
void aofChildWriteDiffData(aeEventLoop *el, int fd, void *privdata, int mask) {
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(fd);
    REDIS_NOTUSED(privdata);
    REDIS_NOTUSED(mask);

    for (;;) {
        listNode *head = listFirst(server.aof_rewrite_buf_blocks);
        aofrwblock *blk = head ? head->value : NULL;

        /* Nothing (more) to send: stop listening for writability. */
        if (server.aof_stop_sending_diff || !blk) {
            aeDeleteFileEvent(server.el,server.aof_pipe_write_data_to_child,
                              AE_WRITABLE);
            return;
        }

        if (blk->used > 0) {
            ssize_t sent = write(server.aof_pipe_write_data_to_child,
                                 blk->buf,blk->used);

            /* Could not write now: return and wait for the next event. */
            if (sent <= 0) return;

            /* Shift the unsent tail to the front and shrink the block. */
            memmove(blk->buf,blk->buf+sent,blk->used-sent);
            blk->used -= sent;
        }

        /* A fully drained block is removed from the list. */
        if (blk->used == 0) listDelNode(server.aof_rewrite_buf_blocks,head);
    }
}

       创建子进程调用重写函数的实现如下所示:

/* Start an AOF rewrite in a forked child so the server keeps serving
 * clients while the rewrite runs.
 *
 * Returns REDIS_ERR if a rewrite child already exists, if the pipes cannot
 * be created, or if fork() fails; REDIS_OK in the parent otherwise. The
 * child terminates through exitFromChild() with 0 on success and 1 on
 * failure.
 *
 * NOTE(review): when fork() fails, the pipes and the file event created by
 * aofCreatePipes() are not released here — confirm whether a caller or
 * another path cleans them up. */
int rewriteAppendOnlyFileBackground(void) {
    pid_t pid;
    long long fork_started;

    if (server.aof_child_pid != -1) return REDIS_ERR;
    /* Pipes for parent/child communication during the rewrite. */
    if (aofCreatePipes() != REDIS_OK) return REDIS_ERR;

    fork_started = ustime();
    pid = fork();

    if (pid == 0) {
        /* Child */
        char tmpfile[256];

        closeListeningSockets(0);
        redisSetProcTitle("redis-aof-rewrite");
        /* The temp file name embeds the child's own pid. */
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
        if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
            size_t private_dirty = zmalloc_get_private_dirty();

            if (private_dirty) {
                redisLog(REDIS_NOTICE,
                    "AOF rewrite: %zu MB of memory used by copy-on-write",
                    private_dirty/(1024*1024));
            }
            exitFromChild(0);
        } else {
            exitFromChild(1);
        }
        return REDIS_OK; /* unreached */
    }

    /* Parent (or fork failure): record fork timing statistics first. */
    server.stat_fork_time = ustime()-fork_started;
    server.stat_fork_rate = (double) zmalloc_used_memory() * 1000000 / server.stat_fork_time / (1024*1024*1024); /* GB per second. */
    latencyAddSampleIfNeeded("fork",server.stat_fork_time/1000);

    if (pid == -1) {
        redisLog(REDIS_WARNING,
            "Can't rewrite append only file in background: fork: %s",
            strerror(errno));
        return REDIS_ERR;
    }

    redisLog(REDIS_NOTICE,
        "Background append only file rewriting started by pid %d",pid);
    server.aof_rewrite_scheduled = 0;
    server.aof_rewrite_time_start = time(NULL);
    server.aof_child_pid = pid;
    /* Re-evaluate the dict resize policy now that a child exists. */
    updateDictResizePolicy();

    /* Reset the selected db to -1 so the next feedAppendOnlyFile() call
     * emits a SELECT first; that makes merging the diff with the rewritten
     * file safe, since the rewrite itself prefixes each database with its
     * own SELECT. */
    server.aof_selected_db = -1;
    /* NOTE(review): purpose not visible from this file; the original author
     * suspected it relates to replication. */
    replicationScriptCacheFlush();
    return REDIS_OK;
}

     子进程中调用的rewriteAppendOnlyFile函数执行真正的aof重写操作,其过程如下所示:

/* Called in the rewrite child: drain whatever diff data the parent has
 * written to the data pipe and append it to server.aof_child_diff, which
 * accumulates the commands the server executed after the rewrite started.
 *
 * Returns the number of bytes read by this call. */
ssize_t aofReadDiffFromParent(void) {
    char chunk[65536]; /* Default pipe buffer size on most Linux systems. */
    ssize_t total = 0;

    for (;;) {
        ssize_t got = read(server.aof_pipe_read_data_from_parent,
                           chunk,sizeof(chunk));

        /* Stop on end-of-data or error (the pipe may simply be empty). */
        if (got <= 0) break;
        server.aof_child_diff = sdscatlen(server.aof_child_diff,chunk,got);
        total += got;
    }
    return total;
} 

/* Perform the actual AOF rewrite (invoked in the forked child by
 * rewriteAppendOnlyFileBackground()).
 *
 * The dataset is written as a sequence of commands to a temporary file,
 * the diff data received from the parent is appended, and the temporary
 * file is finally renamed to `filename`.
 *
 * Returns REDIS_OK on success, REDIS_ERR on failure. */
int rewriteAppendOnlyFile(char *filename) {
    dictIterator *di = NULL;
    dictEntry *de;
    rio aof;
    FILE *fp;
    char tmpfile[256];
    int j;
    long long now = mstime();
    char byte;
    size_t processed = 0;

    // Create the temporary output file, named after this process' pid.
    snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
    fp = fopen(tmpfile,"w");
    if (!fp) {
        redisLog(REDIS_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));
        return REDIS_ERR;
    }

    server.aof_child_diff = sdsempty();
    // Bind the rio object to the file so all writes below go through it.
    rioInitWithFile(&aof,fp);
    // When incremental fsync is enabled, have rio fsync automatically every
    // REDIS_AOF_AUTOSYNC_BYTES, to spread the disk I/O load over the rewrite.
    if (server.aof_rewrite_incremental_fsync)
        rioSetAutoSync(&aof,REDIS_AOF_AUTOSYNC_BYTES);

    // Walk every database and dump its contents.
    for (j = 0; j < server.dbnum; j++) {
        // Each non-empty database is introduced by a SELECT command.
        char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
        redisDb *db = server.db+j;
        dict *d = db->dict;
        if (dictSize(d) == 0) continue;
		// Iterator over the database's key space.
        di = dictGetSafeIterator(d);
        if (!di) {
            // NOTE(review): unlike the werr path, this exit does not unlink
            // the temporary file.
            fclose(fp);
            return REDIS_ERR;
        }

        if (rioWrite(&aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;
        if (rioWriteBulkLongLong(&aof,j) == 0) goto werr;

        // Emit, for every key, the command(s) that rebuild its value.
        while((de = dictNext(di)) != NULL) {
            sds keystr;
            robj key, *o;
            long long expiretime;

            keystr = dictGetKey(de);// key of the entry
            o = dictGetVal(de);		// value object of the entry
            initStaticStringObject(key,keystr);

            // Expire time of the key; -1 is treated below as "no expire".
            expiretime = getExpire(db,&key);

            // Keys that are already expired are not written.
            if (expiretime != -1 && expiretime < now) continue;

            // Pick the encoding according to the value's type.
            if (o->type == REDIS_STRING) {
                char cmd[]="*3\r\n$3\r\nSET\r\n";
                // Command header.
                if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
                // Key and value.
                if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
                if (rioWriteBulkObject(&aof,o) == 0) goto werr;
            } else if (o->type == REDIS_LIST) {
                if (rewriteListObject(&aof,&key,o) == 0) goto werr;
            } else if (o->type == REDIS_SET) {
                if (rewriteSetObject(&aof,&key,o) == 0) goto werr;
            } else if (o->type == REDIS_ZSET) {
                if (rewriteSortedSetObject(&aof,&key,o) == 0) goto werr;
            } else if (o->type == REDIS_HASH) {
                if (rewriteHashObject(&aof,&key,o) == 0) goto werr;
            } else {
                redisPanic("Unknown object type");
            }
            // If the key has an expire, also emit a PEXPIREAT for it.
            if (expiretime != -1) {
                char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n";
                if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
                // Key and expire time.
                if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
                if (rioWriteBulkLongLong(&aof,expiretime) == 0) goto werr;
            }
            // Each time aof.processed_bytes has advanced by more than 10 KB
            // since the last check, drain the diff data the parent sent on
            // the pipe. (processed_bytes is presumably the byte count
            // handled by the rio object so far — confirm in rio.h.)
            if (aof.processed_bytes > processed+1024*10) {
                processed = aof.processed_bytes;
                aofReadDiffFromParent();
            }
        }
        dictReleaseIterator(di);
        di = NULL;
    }

    if (fflush(fp) == EOF) goto werr; // flush stdio buffers to the kernel
    if (fsync(fileno(fp)) == -1) goto werr; // force kernel buffers to disk


    int nodata = 0;
    mstime_t start = mstime();
    // Keep reading diffs from the parent for at most 1000 ms, stopping
    // early after 20 consecutive waits that produced no data.
    while(mstime()-start < 1000 && nodata < 20) {
        // Wait (timeout argument 1, presumably milliseconds) for the data
        // pipe to become readable.
        if (aeWait(server.aof_pipe_read_data_from_parent, AE_READABLE, 1) <= 0)
        {
            nodata++;
            continue;
        }
        nodata = 0; 
        // Data arrived: append it to server.aof_child_diff.
        aofReadDiffFromParent();
    }

    // Send '!' to the parent, asking it to stop sending diffs.
    if (write(server.aof_pipe_write_ack_to_parent,"!",1) != 1) goto werr;
    // Make the ack pipe from the parent non-blocking.
    if (anetNonBlock(NULL,server.aof_pipe_read_ack_from_parent) != ANET_OK)
        goto werr;
    // Read the parent's reply (timeout argument 5000); anything other than
    // a single '!' byte is an error.
    if (syncRead(server.aof_pipe_read_ack_from_parent,&byte,1,5000) != 1 ||
        byte != '!') goto werr;
    redisLog(REDIS_NOTICE,"Parent agreed to stop sending diffs. Finalizing AOF...");

    // Final read: the parent may have sent more data while the stop request
    // was in flight.
    aofReadDiffFromParent();

    redisLog(REDIS_NOTICE,
        "Concatenating %.2f MB of AOF diff received from parent.",
        (double) sdslen(server.aof_child_diff) / (1024*1024));
    // Append the accumulated diff to the rewritten file.
    if (rioWrite(&aof,server.aof_child_diff,sdslen(server.aof_child_diff)) == 0)
        goto werr;

    if (fflush(fp) == EOF) goto werr;	// flush stdio buffers to the kernel
    if (fsync(fileno(fp)) == -1) goto werr; // force kernel buffers to disk
    if (fclose(fp) == EOF) goto werr; // close the file

    // Move the temporary file to its final name.
    if (rename(tmpfile,filename) == -1) {
        redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
        unlink(tmpfile);
        return REDIS_ERR;
    }
    redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
    return REDIS_OK;

werr:
    // Common failure path: log, close and remove the temporary file, and
    // release the iterator if one is still open.
    redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
    fclose(fp);
    unlink(tmpfile);
    if (di) dictReleaseIterator(di);
    return REDIS_ERR;
}

      服务器进程会在serverCron函数中以非阻塞的方式调用wait3函数。若aof重写进程已结束,服务器进程会获取aof重写进程的退出码,并将剩余的aof差异数据(aof重写缓存中的数据)写入新的aof文件。其代码如下所示:

void serverCron()
{
    if (server.rdb_child_pid != -1 || server.aof_child_pid != -1) {
        int statloc;
        pid_t pid;

        // 调用wait3函数判断子进程是否结束,当返回值>0则说明进程以终止,若 == 0则未终止, 若<0则出错
        // statloc: 用于保存进程的终止状态
        // WNOHANG:若由pid指定的子进程的结束状态不可立刻获得(子进程还未结束),则不阻塞,(参 APUE p193)
        if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
			// 获取子进程传送给exit或_exit参数的低8位
            int exitcode = WEXITSTATUS(statloc);
            int bysignal = 0;

            // WIFSIGNALED宏:当statloc为异常终止的返回状态,则返回true,此时可使用WTERMSIG获取使子进程终止的信号编号
            if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc);

            if (pid == -1) { // wait调用出错(如:无子进程调用了wait)
                redisLog(LOG_WARNING,"wait3() returned an error: %s. "
                    "rdb_child_pid = %d, aof_child_pid = %d",
                    strerror(errno),
                    (int) server.rdb_child_pid,
                    (int) server.aof_child_pid);
            } else if (pid == server.rdb_child_pid) { // 若是rdb持久化子进程结束
                backgroundSaveDoneHandler(exitcode,bysignal);
            } else if (pid == server.aof_child_pid) { // 若是aof重写进程结束
                backgroundRewriteDoneHandler(exitcode,bysignal);
            } else {
                redisLog(REDIS_WARNING,
                    "Warning, detected child with unmatched pid: %ld",
                    (long)pid);
            }
            updateDictResizePolicy();
        }
    }
}


/* Runs in the server process when the AOF rewrite child has terminated and
 * performs the final merge step of the rewrite.
 *
 * exitcode: exit status of the child (as extracted by the caller).
 * bysignal: number of the signal that killed the child, or 0 if it was not
 *           terminated by a signal.
 *
 * On success (exitcode == 0 and no signal) the data accumulated in the AOF
 * rewrite buffer is appended to the temp file written by the child, the temp
 * file is renamed over the configured AOF file, and the old AOF file
 * descriptor is handed to a background job to be closed. Every path, success
 * or failure, ends in the `cleanup` section, which resets the rewrite state.
 *
 * NOTE(review): the `goto cleanup` failure paths inside the success branch
 * skip both the REDIS_OK assignment and the REDIS_ERR assignments, so
 * server.aof_lastbgrewrite_status keeps its previous value there - confirm
 * this is intended. */
void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
    if (!bysignal && exitcode == 0) {
        int newfd, oldfd;
        char tmpfile[256];
        long long now = ustime();
        mstime_t latency;

        redisLog(REDIS_NOTICE,
            "Background AOF rewrite terminated with success");

        latencyStartMonitor(latency); // measure how long flushing the rewrite buffer to the AOF file takes
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof",
            (int)server.aof_child_pid);
        // Open the new AOF file (written by the child) in append mode
        newfd = open(tmpfile,O_WRONLY|O_APPEND);
        if (newfd == -1) {
            redisLog(REDIS_WARNING,
                "Unable to open the temporary AOF produced by the child: %s", strerror(errno));
            goto cleanup;
        }

        // Write the data remaining in the AOF rewrite buffer to the new AOF file
        if (aofRewriteBufferWrite(newfd) == -1) {
            redisLog(REDIS_WARNING,
                "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno));
            close(newfd);
            goto cleanup;
        }
        latencyEndMonitor(latency); // compute the elapsed time
        latencyAddSampleIfNeeded("aof-rewrite-diff-write",latency);

        redisLog(REDIS_NOTICE,
            "Residual parent diff successfully flushed to the rewritten AOF (%.2f MB)", (double) aofRewriteBufferSize() / (1024*1024));

        /* The only thing left is to rename the new AOF from its temporary
         * name to the configured file name and to close the old AOF file.
         * We do not want close(2) or rename(2) to block the server process
         * while the old file gets deleted.
         *
         * Two scenarios are possible:
         *
         * 1) The old AOF file is already closed and this was a one-time
         * rewrite. The new AOF is renamed from the temp name to the
         * configured name. If an old AOF file exists, the rename removes its
         * directory entry, which decrements the link count of its inode;
         * when no links remain the file is deleted (unless some process
         * still has it open, in which case deletion is deferred until it is
         * closed). Deleting the file takes time.
         *
         * 2) The old AOF file is still open. It gets closed and the new AOF
         * is renamed, which leads to the same deletion cost as in 1).
         *
         * To mitigate the blocking effect of the unlink, a background thread
         * is used to take care of it.
         */
        if (server.aof_fd == -1) {
            // The old AOF file is closed: open it first, because a file that some
            // process holds open is not deleted when it is unlinked. The descriptor
            // is closed later in the background job thread.
             oldfd = open(server.aof_filename,O_RDONLY|O_NONBLOCK);
        } else {
            oldfd = -1; /* We'll set this to the current AOF filedes later. */
        }

        latencyStartMonitor(latency);
        if (rename(tmpfile,server.aof_filename) == -1) { // rename failed
            redisLog(REDIS_WARNING,
                "Error trying to rename the temporary AOF file: %s", strerror(errno));
            close(newfd);
            if (oldfd != -1) close(oldfd);
            goto cleanup;
        }
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("aof-rename",latency);

        if (server.aof_fd == -1) {
            // No AOF file descriptor was in use, so the server does not need
            // to keep the new AOF file open either.
            close(newfd);
        } else {
            /* The old AOF file is open: remember its descriptor in oldfd so
             * it can be passed to the background job thread to be closed. */
            oldfd = server.aof_fd;

            // Store the descriptor of the new AOF file in the server state
            server.aof_fd = newfd;

            // Sync the new file according to the configured fsync policy
            if (server.aof_fsync == AOF_FSYNC_ALWAYS)
                aof_fsync(newfd);
            else if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
                aof_background_fsync(newfd);
            server.aof_selected_db = -1; /* Make sure SELECT is re-issued */
            aofUpdateCurrentSize();
            server.aof_rewrite_base_size = server.aof_current_size;

            // Replace the regular AOF buffer with a fresh empty one
            sdsfree(server.aof_buf);
            server.aof_buf = sdsempty();
        }

        server.aof_lastbgrewrite_status = REDIS_OK;

        redisLog(REDIS_NOTICE, "Background AOF rewrite finished successfully");

        // Switch from "waiting for a rewrite" to ON if needed
        if (server.aof_state == REDIS_AOF_WAIT_REWRITE)
            server.aof_state = REDIS_AOF_ON;

        // Close the old AOF file in the background job thread
        if (oldfd != -1) bioCreateBackgroundJob(REDIS_BIO_CLOSE_FILE,(void*)(long)oldfd,NULL,NULL);

        redisLog(REDIS_VERBOSE,
            "Background AOF rewrite signal handler took %lldus", ustime()-now);
    } else if (!bysignal && exitcode != 0) {
        // The child exited on its own with a non-zero exit code
        server.aof_lastbgrewrite_status = REDIS_ERR;

        redisLog(REDIS_WARNING,
            "Background AOF rewrite terminated with error");
    } else {
        // The child was terminated by a signal
        server.aof_lastbgrewrite_status = REDIS_ERR;

        redisLog(REDIS_WARNING,
            "Background AOF rewrite terminated by signal %d", bysignal);
    }

    // Reached on every path: reset the rewrite-related state.
cleanup:
    aofClosePipes();
    aofRewriteBufferReset();
    aofRemoveTempFile(server.aof_child_pid);
    server.aof_child_pid = -1;
    server.aof_rewrite_time_last = time(NULL)-server.aof_rewrite_time_start;
    server.aof_rewrite_time_start = -1;
    /* Schedule a new rewrite if we are waiting for it to switch the AOF ON. */
    if (server.aof_state == REDIS_AOF_WAIT_REWRITE)
        server.aof_rewrite_scheduled = 1;
}

【注】:当调用rename函数且newname已存在时,会调用unlink减少索引节点的硬链接数,当硬链接数减至0且无进程打开该文件(索引节点的引用计数i_count为0)则删除文件,若仍有进程打开该文件,则不会立刻删除。

【注】:当关闭一个文件时,操作系统会检查该文件的硬链接数与引用计数(即是否有进程打开),若二者都为0则删除文件。

【注】:正是因为以上原因,redis为了避免在主线程中删除文件而造成阻塞,会先保证旧的AOF文件在rename之前处于被打开状态,之后再在任务线程中关闭该文件。

 

你可能感兴趣的:(Redis笔记)