本文源码基于redis2.2分析
为了解决生成快照时,因为保存rdb文件而导致redis阻塞的问题,redis引入了bgsave方式;
进一步,为了解决不能实时保存的问题,redis引入了aof机制。
以独立日志的方式记录每次写命令,
重启时再重新执行AOF文件中的命令达到恢复数据的目的。AOF的主要作用
是解决了数据持久化的实时性,目前已经是Redis持久化的主流方式。
############################## APPEND ONLY MODE ###############################
# 是否开启AOF,默认不开启,aof和rdb可同时开启,同时开启的情况下,优先加载aof的文件
appendonly no
# aof写入磁盘的文件名
appendfilename "appendonly.aof"
# 指定fsync()函数的调用时机,调用fsync()告诉os写入数据到磁盘的时机
# always: 每一次写入的数据都要立即写入aof文件,数据最安全,不怕丢失
# no: 当os觉得时机成熟的时候,再把数据写入aof文件,性能好(能够批量写,而不是每一次都写)
# everysec: 以上两种方案的折中,每秒写入一次
# 如果不确定用哪种,那就用 everysec(默认这种方式)
# appendfsync always
appendfsync everysec
# appendfsync no
# aof写入文件以及重写aof文件,会出现两个操作竞争同一块磁盘资源,
# 而重写会涉及大量的磁盘io操作,会造成写aof文件时阻塞。如果设置这个参数为yes,
# 则重写期间先不对aof文件做fsync,等待重写完aof文件后,再把缓冲区的数据刷入,极端情况下redis挂掉时缓冲区数据没有写入会丢数据
# (这和我们持久化的目的相反);如果设置为no(默认为no),则表示可以同时写,不丢失数据,但是要忍受写入aof文件时的阻塞。
no-appendfsync-on-rewrite no
# 自动重写aof文件的时机。由于aof文件直接存储写入命令,所以会存在冗余
# 如果重写合并命令,只写入能生成当前结果的命令,则可以大大节省空间。(如对
# 字符类型的键key1进行几次操作,set key1 121; set key1 34; set key1 2;如果不重写
# aof文件,那么会存在三条命令,如果重写后,仅存写入生成最终value的命令)
# 当相比上一次重写后的大小增长超过100%时,重写
auto-aof-rewrite-percentage 100
# 当大于64M时重写文件
auto-aof-rewrite-min-size 64mb
# 当redis运行时发生崩溃或者断电,可能会导致写入的aof命令是不完整的,之后如果重启
# redis可能会导致启动失败,下面这个参数控制是否忽略最后一条可能存在问题的指令。
# 默认值是yes: 忽略最后一条有可能有问题的命令
# no: 不忽略最后一条可能有问题的指令。
aof-load-truncated yes
#redis4.0后支持这个配置
# 支持rdb与aof混合模式,yes表示打开混合模式,将rdb写入到aof中
aof-use-rdb-preamble yes
redis启动:
4. 加载aof文件到内存中
执行redis客户端命令前的检验
1. 客户端是否退出,如果退出关闭服务端的客户端的socket
2. 查询输入的命令是否正确,如果命令不存在返回客户端错误信息
3. 检查命令的参数个数是否正确
4. 检查客户端是否已认证,如果没有认证返回无权操作信息
5. 释放一些没有使用的内存,
6. 是否在订阅模式下,订阅模式仅支持客户端SUBSCRIBE and UNSUBSCRIBE
7. 是否主从模式
8. 如果正在加载db,则拒绝处理命令(INFO命令除外)
9. 执行命令
10. 执行命令以及进行aof(调用函数call)
/**
 ** Pre-execution checks for a client command (redis.c).
 ** Runs every validation required before a command may execute: QUIT
 ** handling, command lookup, arity check, authentication, memory limits,
 ** pub/sub and replication restrictions, loading state, and MULTI
 ** queueing. Returns REDIS_OK when the caller may keep processing this
 ** client, REDIS_ERR when the client must not be read further (it is
 ** closing, or blocked on swapped keys).
 **/
int processCommand(redisClient *c) {
    /* 1. QUIT is handled inline: acknowledge it and mark the connection
     * to be closed once the reply has been flushed. */
    if (!strcasecmp(c->argv[0]->ptr,"quit")) {
        addReply(c,shared.ok);
        c->flags |= REDIS_CLOSE_AFTER_REPLY;
        return REDIS_ERR;
    }
    /* 2. Look the command up in the command table; unknown commands get
     * an error reply and are otherwise ignored. */
    c->cmd = lookupCommand(c->argv[0]->ptr);
    if (!c->cmd) {
        addReplyErrorFormat(c,"unknown command '%s'",
            (char*)c->argv[0]->ptr);
        return REDIS_OK;
    }
    /* 3. Check the argument count: a positive arity must match exactly,
     * a negative arity means "at least -arity arguments". */
    else if ((c->cmd->arity > 0 && c->cmd->arity != c->argc) ||
        (c->argc < -c->cmd->arity)) {
        addReplyErrorFormat(c,"wrong number of arguments for '%s' command",
            c->cmd->name);
        return REDIS_OK;
    }
    /* 4. When a password is configured, only AUTH is allowed until the
     * client has authenticated. */
    if (server.requirepass && !c->authenticated && c->cmd->proc != authCommand)
    {
        addReplyError(c,"operation not permitted");
        return REDIS_OK;
    }
    /* 5. First try to free some memory, then refuse commands flagged as
     * memory-growing while usage is still above the maxmemory limit. */
    if (server.maxmemory) freeMemoryIfNeeded();
    if (server.maxmemory && (c->cmd->flags & REDIS_CMD_DENYOOM) &&
        zmalloc_used_memory() > server.maxmemory)
    {
        addReplyError(c,"command not allowed when used memory > 'maxmemory'");
        return REDIS_OK;
    }
    /* 6. A client in subscribe mode may only issue (P)SUBSCRIBE and
     * (P)UNSUBSCRIBE (QUIT was already handled above). */
    if ((dictSize(c->pubsub_channels) > 0 || listLength(c->pubsub_patterns) > 0)
        &&
        c->cmd->proc != subscribeCommand &&
        c->cmd->proc != unsubscribeCommand &&
        c->cmd->proc != psubscribeCommand &&
        c->cmd->proc != punsubscribeCommand) {
        addReplyError(c,"only (P)SUBSCRIBE / (P)UNSUBSCRIBE / QUIT allowed in this context");
        return REDIS_OK;
    }
    /* 7. A slave with a broken master link refuses commands unless
     * slave-serve-stale-data is enabled (INFO and SLAVEOF stay allowed). */
    if (server.masterhost && server.replstate != REDIS_REPL_CONNECTED &&
        server.repl_serve_stale_data == 0 &&
        c->cmd->proc != infoCommand && c->cmd->proc != slaveofCommand)
    {
        addReplyError(c,
            "link with MASTER is down and slave-serve-stale-data is set to no");
        return REDIS_OK;
    }
    /* 8. While loading the dataset (RDB/AOF), only INFO is served; every
     * other command gets the loading error reply. */
    if (server.loading && c->cmd->proc != infoCommand) {
        addReply(c, shared.loadingerr);
        return REDIS_OK;
    }
    /* 9. Inside a MULTI transaction, queue the command instead of running
     * it (EXEC, DISCARD, MULTI and WATCH still execute immediately). */
    if (c->flags & REDIS_MULTI &&
        c->cmd->proc != execCommand && c->cmd->proc != discardCommand &&
        c->cmd->proc != multiCommand && c->cmd->proc != watchCommand)
    {
        queueMultiCommand(c);
        addReply(c,shared.queued);
    } else {
        /* With threaded VM enabled, block the client until the keys it
         * needs are loaded back from swap. */
        if (server.vm_enabled && server.vm_max_threads > 0 &&
            blockClientOnSwappedKeys(c)) return REDIS_ERR;
        /* 10. Execute the command (and propagate it to the AOF and the
         * replicas) via call(). */
        call(c);
    }
    return REDIS_OK;
}
执行客户端命令以及传播aof、复制、
1. 当前时间,如果命令执行过长,记录到慢日志
2. 执行命令(proc指向命令的函数)
3. 计算命令执行耗费时间,如果过长,记录慢日志
4. 如果开启了aof,命令追加到aof_buf(调用函数:feedAppendOnlyFile)
/**
 ** Execute a client command and propagate it (redis.c).
 ** Runs the command handler, records slow executions in the slow log,
 ** and feeds the write to the AOF buffer, the replication stream and any
 ** MONITOR clients when appropriate.
 **/
void call(redisClient *c) {
    long long dirty_before = server.dirty;
    long long t_start = ustime();
    long long delta, elapsed;

    /* Dispatch to the command implementation. */
    c->cmd->proc(c);
    delta = server.dirty - dirty_before;
    elapsed = ustime() - t_start;

    /* Record the command in the slow log when it took too long. */
    slowlogPushEntryIfNeeded(c->argv,c->argc,elapsed);

    /* Append to the AOF buffer only if the dataset was modified. */
    if (server.appendonly && delta > 0)
        feedAppendOnlyFile(c->cmd,c->db->id,c->argv,c->argc);

    /* Propagate to slaves when the dataset changed (or when the command
     * forces replication), and to MONITOR clients unconditionally. */
    if ((delta > 0 || c->cmd->flags & REDIS_CMD_FORCE_REPLICATION) &&
        listLength(server.slaves))
        replicationFeedSlaves(server.slaves,c->db->id,c->argv,c->argc);
    if (listLength(server.monitors))
        replicationFeedMonitors(server.monitors,c->db->id,c->argv,c->argc);

    server.stat_numcommands++;
}
追加写入操作的命令到aof_buf
1.标识aof当前记录是哪个db上的写操作
2.构造写入aof的过期命令
3.构造写入aof的set命令
4.构造其他命令
5.本次写入的命令追加到aof_buf
6.如果此时正在执行重写(存在重写子进程),则命令同时也写入到bgrewritebuf,保证重写完成后的新aof文件不丢失这个时间段客户端执行的命令
7.释放函数内的变量
/**
 ** Append one executed write command to the AOF buffers (aof.c).
 ** Emits a SELECT when the target db differs from the last db written to
 ** the AOF, rewrites EXPIRE as EXPIREAT and SETEX as SET+EXPIREAT so the
 ** replay is independent of wall-clock time, then appends the encoded
 ** command to server.aofbuf and, while a rewrite child is running, to
 ** server.bgrewritebuf as well.
 **/
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
    sds out = sdsempty();
    robj *setargv[3];

    /* Emit a SELECT when this command targets a different db than the
     * one last recorded in the AOF stream. */
    if (dictid != server.appendseldb) {
        char seldb[64];

        snprintf(seldb,sizeof(seldb),"%d",dictid);
        out = sdscatprintf(out,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
            (unsigned long)strlen(seldb),seldb);
        server.appendseldb = dictid;
    }

    if (cmd->proc == expireCommand) {
        /* Translate EXPIRE into EXPIREAT */
        out = catAppendOnlyExpireAtCommand(out,argv[1],argv[2]);
    } else if (cmd->proc == setexCommand) {
        /* Translate SETEX to SET and EXPIREAT */
        setargv[0] = createStringObject("SET",3);
        setargv[1] = argv[1];
        setargv[2] = argv[3];
        out = catAppendOnlyGenericCommand(out,3,setargv);
        decrRefCount(setargv[0]);
        out = catAppendOnlyExpireAtCommand(out,argv[1],argv[2]);
    } else {
        /* Every other command is encoded verbatim. */
        out = catAppendOnlyGenericCommand(out,argc,argv);
    }

    /* Queue the encoded command for the next flush to the AOF file. */
    server.aofbuf = sdscatlen(server.aofbuf,out,sdslen(out));
    /* While a rewrite child is active, accumulate the command in the
     * rewrite buffer too, so the new AOF file will not miss writes that
     * happened during the rewrite. */
    if (server.bgrewritechildpid != -1)
        server.bgrewritebuf = sdscatlen(server.bgrewritebuf,out,sdslen(out));

    sdsfree(out);
}
aof_buf缓存刷入到aof文件中
1. 如果aof_buf为空,则直接返回
2. 写入aof_buf到aof文件中
3. 判断写入是否正确,不正确日志输出错误信息
4. 释放aof_buf
5. 判断是否需要同步刷新
/**
 ** Flush server.aofbuf to the append only file (aof.c).
 ** Writes the buffered commands with a single write(2) and exits the
 ** server on error or short write (a partial AOF would be corrupt).
 ** Then performs an fsync according to the configured appendfsync
 ** policy, unless a background save/rewrite is in progress and
 ** no-appendfsync-on-rewrite is enabled.
 **/
void flushAppendOnlyFile(void) {
    ssize_t written;
    time_t unixtime;

    /* Nothing buffered: nothing to do. */
    if (sdslen(server.aofbuf) == 0) return;

    written = write(server.appendfd,server.aofbuf,sdslen(server.aofbuf));
    if (written != (signed)sdslen(server.aofbuf)) {
        /* A failed or partial write would leave a corrupt AOF, so the
         * only safe reaction is to log the problem and exit. */
        if (written == -1) {
            redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
        } else {
            redisLog(REDIS_WARNING,"Exiting on short write while writing to the append-only file: %s",strerror(errno));
        }
        exit(1);
    }

    /* Reset the buffer now that its content reached the kernel. */
    sdsfree(server.aofbuf);
    server.aofbuf = sdsempty();

    /* Skip the fsync while a child is saving or rewriting when the user
     * asked for no-appendfsync-on-rewrite. */
    if (server.no_appendfsync_on_rewrite &&
        (server.bgrewritechildpid != -1 || server.bgsavechildpid != -1))
        return;

    /* Fsync if needed */
    unixtime = time(NULL);
    if (server.appendfsync == APPENDFSYNC_ALWAYS ||
        (server.appendfsync == APPENDFSYNC_EVERYSEC &&
         unixtime-server.lastfsync > 1))
    {
        /* aof_fsync is defined as fdatasync() for Linux in order to avoid
         * flushing metadata. */
        aof_fsync(server.appendfd); /* Let's try to get this data on the disk */
        server.lastfsync = unixtime;
    }
}
1.客户端执行bgrewriteaof或者达到条件触发了重写
2. 父进程fork出子进程(这里耗费的时间和bgsave一样)
3.1 fork完成后父进程继续aof追加机制,写入到aof文件
3.2 fork完成后父进程追加写入的命令到aof_rewrite_buf(保证在子进程写入到新的aof时不丢失写入的命令)
4. 子进程写入当前所有的键值对到新的aof文件中
5.1 子进程完成写入后退出,父进程通过信号(SIGCHLD)/wait3得知其完成
5.2 父进程写入aof_rewrite_buf追加到新的aof文件中
5.3 5.2写入完成,新的aof文件重命名为配置文件的appendFileName
/**
 ** Command-table entry for BGREWRITEAOF (redis.c).
 ** Excerpt of readonlyCommandTable[] showing only the bgrewriteaof row:
 ** name, handler, arity, flags and the VM-related fields.
 ** BUGFIX: the array definition was missing its terminating semicolon,
 ** which is a compile error in C.
 **/
struct redisCommand readonlyCommandTable[] = {
    {"bgrewriteaof",bgrewriteaofCommand,1,0,NULL,0,0,0}
};
fork子进程顶层抽象函数
1. 如果当前有子进程则直接返回
2. fork子进程执行
3. fork失败返回错误信息
/**
 ** BGREWRITEAOF command handler (aof.c).
 ** Refuses to start when a rewrite child already exists; otherwise forks
 ** a background rewrite and reports whether it was started.
 **/
void bgrewriteaofCommand(redisClient *c) {
    /* Only one rewrite child at a time. */
    if (server.bgrewritechildpid != -1) {
        addReplyError(c,"Background append only file rewriting already in progress");
        return;
    }
    /* Fork the rewrite child; on fork failure reply with a generic error. */
    if (rewriteAppendOnlyFileBackground() != REDIS_OK) {
        addReply(c,shared.err);
        return;
    }
    addReplyStatus(c,"Background append only file rewriting started");
}
fork子进程的函数
1. 如果当前有子进程,返回失败
2. fork出子进程
3.使用子进程的pid构造aof文件名
4. 子进程写入所有的键值对到新的aof文件中
5. 如果fork子进程失败,返回错误信息
6. redis输出日志信息开始重写aof
/**
 ** Start an AOF rewrite in a child process (aof.c).
 ** The child dumps the whole dataset into temp-rewriteaof-bg-<pid>.aof
 ** via rewriteAppendOnlyFile() and exits; the parent records the child
 ** pid and resets appendseldb so the next AOF append re-emits SELECT.
 ** Returns REDIS_OK when the child was started, REDIS_ERR otherwise.
 **/
int rewriteAppendOnlyFileBackground(void) {
    pid_t childpid;

    /* A rewrite is already in progress: refuse. */
    if (server.bgrewritechildpid != -1) return REDIS_ERR;
    if (server.vm_enabled) waitEmptyIOJobsQueue();

    childpid = fork();
    if (childpid == -1) {
        /* fork() failed: log and report the error. */
        redisLog(REDIS_WARNING,
            "Can't rewrite append only file in background: fork: %s",
            strerror(errno));
        return REDIS_ERR;
    }
    if (childpid == 0) {
        /* Child: close inherited listening sockets, dump the dataset to a
         * temp file named after our pid, and exit with 0 on success. */
        char tmpfile[256];

        if (server.vm_enabled) vmReopenSwapFile();
        if (server.ipfd > 0) close(server.ipfd);
        if (server.sofd > 0) close(server.sofd);
        snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
        _exit(rewriteAppendOnlyFile(tmpfile) == REDIS_OK ? 0 : 1);
    }
    /* Parent: remember the child and adjust global state. */
    redisLog(REDIS_NOTICE,
        "Background append only file rewriting started by pid %d",childpid);
    server.bgrewritechildpid = childpid;
    updateDictResizePolicy();
    server.appendseldb = -1;
    return REDIS_OK;
}
子进程操作当前所有键值对写入到新的aof文件中
1. 构造临时文件名
2. 打开临时文件
3.循环写入每个db的键
4. 写入当前db的开始标识
5.写入当前db的number
6. 迭代当前db的键值对
7. 写入数据类型为string的键及其值
8. 写入数据类型为list的键及其值
8.1 写入数据类型为list的内部编码为zipList键及其值
8.2 写入数据类型为list的内部编码为linkedlist键及其值
8.3 不存在的list内部编码
9. 写入数据类型为set的键及其值
9.1 写入数据类型为set的内部编码为intset键及其值
9.2 写入数据类型为set的内部编码为hashtable键及其值
9.3 不存在set的内部编码
10. 写入数据类型为zset(有序集合)的键及其值
11. 写入数据类型为hash的键及其值
11.1 写入数据类型为hash的内部编码为zipmap键及其值
11.2 写入数据类型为hash的内部编码为其他键及其值
12. 未知的数据类型
13. 保存键的过期时间
14. 释放当前数据库的键的迭代器
15.数据刷入磁盘
16. 关闭文件描述符
17. 重命名临时文件为调用方传入的文件名(后台重写时为 temp-rewriteaof-bg-#{子进程pid}.aof),注意之后父进程会把aof_rewrite_buf追加到这个文件中
/**
 ** Dump the whole dataset into a new AOF file (aof.c).
 ** Run by the rewrite child (or synchronously): for every db it writes
 ** the minimal commands needed to rebuild each key, flushes and fsyncs
 ** the file, then renames the temp file onto 'filename'.
 ** Returns REDIS_OK on success, REDIS_ERR on any error.
 **/
int rewriteAppendOnlyFile(char *filename) {
    dictIterator *di = NULL;
    dictEntry *de;
    FILE *fp;
    char tmpfile[256];
    int j;
    time_t now = time(NULL);

    /* 1. Build a temp file name unique to this process pid. */
    snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
    /* 2. Open the temp file for writing. */
    fp = fopen(tmpfile,"w");
    /* On open failure, log the error and give up. */
    if (!fp) {
        redisLog(REDIS_WARNING, "Failed rewriting the append only file: %s", strerror(errno));
        return REDIS_ERR;
    }
    /* 3. Dump every non-empty database. */
    for (j = 0; j < server.dbnum; j++) {
        char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
        redisDb *db = server.db+j;
        dict *d = db->dict;

        if (dictSize(d) == 0) continue;
        di = dictGetSafeIterator(d);
        if (!di) {
            fclose(fp);
            return REDIS_ERR;
        }
        /* 4. Emit the SELECT command header for this db... */
        if (fwrite(selectcmd,sizeof(selectcmd)-1,1,fp) == 0) goto werr;
        /* 5. ...followed by the db number as its bulk argument. */
        if (fwriteBulkLongLong(fp,j) == 0) goto werr;
        /* 6. Iterate every key/value pair of this db. */
        while((de = dictNext(di)) != NULL) {
            sds keystr = dictGetEntryKey(de);
            robj key, *o;
            time_t expiretime;
            int swapped;

            /* NOTE(review): this re-read is redundant — keystr was
             * already initialized with the same value above. */
            keystr = dictGetEntryKey(de);
            o = dictGetEntryVal(de);
            initStaticStringObject(key,keystr);
            /* With VM enabled, obtain a preview of swapped-out values so
             * they can be serialized; it must be released afterwards. */
            if (!server.vm_enabled || o->storage == REDIS_VM_MEMORY ||
                o->storage == REDIS_VM_SWAPPING) {
                swapped = 0;
            } else {
                o = vmPreviewObject(o);
                swapped = 1;
            }
            expiretime = getExpire(db,&key);
            /* 7. Strings: a single SET rebuilds the key. */
            if (o->type == REDIS_STRING) {
                char cmd[]="*3\r\n$3\r\nSET\r\n";

                if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
                if (fwriteBulkObject(fp,&key) == 0) goto werr;
                if (fwriteBulkObject(fp,o) == 0) goto werr;
            } else if (o->type == REDIS_LIST) {
                /* 8. Lists: emit the RPUSHes needed to rebuild the list */
                char cmd[]="*3\r\n$5\r\nRPUSH\r\n";

                /* 8.1 ziplist-encoded list. */
                if (o->encoding == REDIS_ENCODING_ZIPLIST) {
                    unsigned char *zl = o->ptr;
                    unsigned char *p = ziplistIndex(zl,0);
                    unsigned char *vstr;
                    unsigned int vlen;
                    long long vlong;

                    while(ziplistGet(p,&vstr,&vlen,&vlong)) {
                        if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
                        if (fwriteBulkObject(fp,&key) == 0) goto werr;
                        if (vstr) {
                            if (fwriteBulkString(fp,(char*)vstr,vlen) == 0)
                                goto werr;
                        } else {
                            if (fwriteBulkLongLong(fp,vlong) == 0)
                                goto werr;
                        }
                        p = ziplistNext(zl,p);
                    }
                } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) {
                    /* 8.2 linkedlist-encoded list. */
                    list *list = o->ptr;
                    listNode *ln;
                    listIter li;

                    listRewind(list,&li);
                    while((ln = listNext(&li))) {
                        robj *eleobj = listNodeValue(ln);

                        if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
                        if (fwriteBulkObject(fp,&key) == 0) goto werr;
                        if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
                    }
                } else {
                    /* 8.3 unknown list encoding: cannot happen. */
                    redisPanic("Unknown list encoding");
                }
            } else if (o->type == REDIS_SET) {
                /* 9. Sets: emit the SADDs needed to rebuild the set. */
                char cmd[]="*3\r\n$4\r\nSADD\r\n";

                /* 9.1 intset-encoded set. */
                if (o->encoding == REDIS_ENCODING_INTSET) {
                    int ii = 0;
                    int64_t llval;

                    while(intsetGet(o->ptr,ii++,&llval)) {
                        if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
                        if (fwriteBulkObject(fp,&key) == 0) goto werr;
                        if (fwriteBulkLongLong(fp,llval) == 0) goto werr;
                    }
                } else if (o->encoding == REDIS_ENCODING_HT) {
                    /* 9.2 hashtable-encoded set. */
                    dictIterator *di = dictGetIterator(o->ptr);
                    dictEntry *de;

                    while((de = dictNext(di)) != NULL) {
                        robj *eleobj = dictGetEntryKey(de);

                        if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
                        if (fwriteBulkObject(fp,&key) == 0) goto werr;
                        if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
                    }
                    dictReleaseIterator(di);
                } else {
                    /* 9.3 unknown set encoding: cannot happen. */
                    redisPanic("Unknown set encoding");
                }
            } else if (o->type == REDIS_ZSET) {
                /* 10. Sorted sets: emit the ZADDs needed to rebuild the
                 * sorted set. */
                zset *zs = o->ptr;
                dictIterator *di = dictGetIterator(zs->dict);
                dictEntry *de;

                while((de = dictNext(di)) != NULL) {
                    char cmd[]="*4\r\n$4\r\nZADD\r\n";
                    robj *eleobj = dictGetEntryKey(de);
                    double *score = dictGetEntryVal(de);

                    if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
                    if (fwriteBulkObject(fp,&key) == 0) goto werr;
                    if (fwriteBulkDouble(fp,*score) == 0) goto werr;
                    if (fwriteBulkObject(fp,eleobj) == 0) goto werr;
                }
                dictReleaseIterator(di);
            } else if (o->type == REDIS_HASH) {
                /* 11. Hashes: emit one HSET per field. */
                char cmd[]="*4\r\n$4\r\nHSET\r\n";

                /* 11.1 zipmap-encoded hash. */
                if (o->encoding == REDIS_ENCODING_ZIPMAP) {
                    unsigned char *p = zipmapRewind(o->ptr);
                    unsigned char *field, *val;
                    unsigned int flen, vlen;

                    while((p = zipmapNext(p,&field,&flen,&val,&vlen)) != NULL) {
                        if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
                        if (fwriteBulkObject(fp,&key) == 0) goto werr;
                        if (fwriteBulkString(fp,(char*)field,flen) == 0)
                            goto werr;
                        if (fwriteBulkString(fp,(char*)val,vlen) == 0)
                            goto werr;
                    }
                } else {
                    /* 11.2 hashtable-encoded hash. */
                    dictIterator *di = dictGetIterator(o->ptr);
                    dictEntry *de;

                    while((de = dictNext(di)) != NULL) {
                        robj *field = dictGetEntryKey(de);
                        robj *val = dictGetEntryVal(de);

                        if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
                        if (fwriteBulkObject(fp,&key) == 0) goto werr;
                        if (fwriteBulkObject(fp,field) == 0) goto werr;
                        if (fwriteBulkObject(fp,val) == 0) goto werr;
                    }
                    dictReleaseIterator(di);
                }
            } else {
                /* 12. Unknown object type: cannot happen. */
                redisPanic("Unknown object type");
            }
            /* 13. Persist the key's expire time as an absolute EXPIREAT. */
            if (expiretime != -1) {
                char cmd[]="*3\r\n$8\r\nEXPIREAT\r\n";

                /* If this key is already expired skip it */
                if (expiretime < now) continue;
                /* NOTE(review): the 'continue' above also skips the
                 * 'if (swapped) decrRefCount(o);' below, so with VM
                 * enabled a previewed object may leak for already-expired
                 * keys. This matches upstream 2.2 behavior — confirm
                 * before changing. */
                if (fwrite(cmd,sizeof(cmd)-1,1,fp) == 0) goto werr;
                if (fwriteBulkObject(fp,&key) == 0) goto werr;
                if (fwriteBulkLongLong(fp,expiretime) == 0) goto werr;
            }
            /* Release the VM preview object when one was created. */
            if (swapped) decrRefCount(o);
        }
        /* 14. Release this db's safe iterator. */
        dictReleaseIterator(di);
    }
    /* 15. Flush the stdio buffers and fsync the data to disk. */
    fflush(fp);
    aof_fsync(fileno(fp));
    /* 16. Close the file. */
    fclose(fp);
    /* 17. Atomically rename the temp file to the requested name (for the
     * background rewrite that is temp-rewriteaof-bg-<childpid>.aof, the
     * file the parent later appends the rewrite buffer to). */
    if (rename(tmpfile,filename) == -1) {
        redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
        unlink(tmpfile);
        return REDIS_ERR;
    }
    redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
    return REDIS_OK;

werr:
    /* Write error: close and remove the temp file, release the iterator. */
    fclose(fp);
    unlink(tmpfile);
    redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
    if (di) dictReleaseIterator(di);
    return REDIS_ERR;
}
/**
 ** Periodic server cron (redis.c). Shown here because it is the caller
 ** of backgroundRewriteDoneHandler(): when the AOF rewrite child
 ** terminates, wait3() below reaps it and dispatches to that handler.
 ** The rest of the housekeeping (LRU clock, shutdown, expiry, VM
 ** swapping, replication) is not analyzed in depth here.
 **/
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    int j, loops = server.cronloops;
    REDIS_NOTUSED(eventLoop);
    REDIS_NOTUSED(id);
    REDIS_NOTUSED(clientData);

    /* We take a cached value of the unix time in the global state because
     * with virtual memory and aging there is a need to store the current
     * time in objects at every object access, and accuracy is not needed.
     * To access a global var is faster than calling time(NULL) */
    server.unixtime = time(NULL);

    /* We have just 22 bits per object for LRU information.
     * So we use an (eventually wrapping) LRU clock with 10 seconds resolution.
     * 2^22 bits with 10 seconds resolution is more or less 1.5 years.
     *
     * Note that even if this will wrap after 1.5 years it's not a problem,
     * everything will still work but just some object will appear younger
     * to Redis. But for this to happen a given object should never be touched
     * for 1.5 years.
     *
     * Note that you can change the resolution altering the
     * REDIS_LRU_CLOCK_RESOLUTION define.
     */
    updateLRUClock();

    /* We received a SIGTERM, shutting down here in a safe way, as it is
     * not ok doing so inside the signal handler. */
    if (server.shutdown_asap) {
        if (prepareForShutdown() == REDIS_OK) exit(0);
        redisLog(REDIS_WARNING,"SIGTERM received but errors trying to shut down the server, check the logs for more information");
    }

    /* Show some info about non-empty databases */
    for (j = 0; j < server.dbnum; j++) {
        long long size, used, vkeys;

        size = dictSlots(server.db[j].dict);
        used = dictSize(server.db[j].dict);
        vkeys = dictSize(server.db[j].expires);
        if (!(loops % 50) && (used || vkeys)) {
            redisLog(REDIS_VERBOSE,"DB %d: %lld keys (%lld volatile) in %lld slots HT.",j,used,vkeys,size);
            /* dictPrintStats(server.dict); */
        }
    }

    /* We don't want to resize the hash tables while a background saving
     * is in progress: the saving child is created using fork() that is
     * implemented with a copy-on-write semantic in most modern systems, so
     * if we resize the HT while there is the saving child at work actually
     * a lot of memory movements in the parent will cause a lot of pages
     * copied. */
    if (server.bgsavechildpid == -1 && server.bgrewritechildpid == -1) {
        if (!(loops % 10)) tryResizeHashTables();
        if (server.activerehashing) incrementallyRehash();
    }

    /* Show information about connected clients */
    if (!(loops % 50)) {
        redisLog(REDIS_VERBOSE,"%d clients connected (%d slaves), %zu bytes in use",
            listLength(server.clients)-listLength(server.slaves),
            listLength(server.slaves),
            zmalloc_used_memory());
    }

    /* Close connections of timedout clients */
    if ((server.maxidletime && !(loops % 100)) || server.bpop_blocked_clients)
        closeTimedoutClients();

    /* Check if a background saving or AOF rewrite in progress terminated.
     * The terminated child is reaped with wait3() and dispatched on its
     * pid — this is where backgroundRewriteDoneHandler() is called from. */
    if (server.bgsavechildpid != -1 || server.bgrewritechildpid != -1) {
        int statloc;
        pid_t pid;

        if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
            if (pid == server.bgsavechildpid) {
                backgroundSaveDoneHandler(statloc);
            } else {
                backgroundRewriteDoneHandler(statloc);
            }
            updateDictResizePolicy();
        }
    } else {
        /* If there is not a background saving in progress check if
         * we have to save now */
        time_t now = time(NULL);

        for (j = 0; j < server.saveparamslen; j++) {
            struct saveparam *sp = server.saveparams+j;

            if (server.dirty >= sp->changes &&
                now-server.lastsave > sp->seconds) {
                redisLog(REDIS_NOTICE,"%d changes in %d seconds. Saving...",
                    sp->changes, sp->seconds);
                rdbSaveBackground(server.dbfilename);
                break;
            }
        }
    }

    /* Expire a few keys per cycle, only if this is a master.
     * On slaves we wait for DEL operations synthesized by the master
     * in order to guarantee a strict consistency. */
    if (server.masterhost == NULL) activeExpireCycle();

    /* Swap a few keys on disk if we are over the memory limit and VM
     * is enabled. Try to free objects from the free list first. */
    if (vmCanSwapOut()) {
        while (server.vm_enabled && zmalloc_used_memory() >
               server.vm_max_memory)
        {
            int retval = (server.vm_max_threads == 0) ?
                vmSwapOneObjectBlocking() :
                vmSwapOneObjectThreaded();

            if (retval == REDIS_ERR && !(loops % 300) &&
                zmalloc_used_memory() >
                (server.vm_max_memory+server.vm_max_memory/10))
            {
                redisLog(REDIS_WARNING,"WARNING: vm-max-memory limit exceeded by more than 10%% but unable to swap more objects out!");
            }
            /* Note that when using threaded I/O we free just one object,
             * because anyway when the I/O thread in charge to swap this
             * object out will finish, the handler of completed jobs
             * will try to swap more objects if we are still out of memory. */
            if (retval == REDIS_ERR || server.vm_max_threads > 0) break;
        }
    }

    /* Replication cron function -- used to reconnect to master and
     * to detect transfer failures. */
    if (!(loops % 10)) replicationCron();

    server.cronloops++;
    return 100;
}
重写缓冲区中的命令追加到子进程生成的文件中
1. 子进程退出后,父进程(在serverCron中通过wait3)得知其完成了键值的重新写入
2. 构造aof写入的文件名(注意:这里文件名和子进程写入的是同一个文件,相当于把aof_rewrite_buf追加到了子进程的aof文件中)
3. 打开文件
4. 打开失败,错误信息写入日志
5. 重写缓存bgrewritebuf的数据追加到子进程的aof文件中
6. 如果追加失败,输出错误日志
7. 追加缓存成功,输出日志
8. 重命名子进程的aof为配置文件的aof名
9. 后面是一些关闭文件、收尾工作
/**
** 重写缓存区中的命令追加到子进程的文件中
**aof.c
**/
void backgroundRewriteDoneHandler(int statloc) {
int exitcode = WEXITSTATUS(statloc);
int bysignal = WIFSIGNALED(statloc);
//1. 子进程信号量通知父进程完成了键值的重新写入
if (!bysignal && exitcode == 0) {
int fd;
char tmpfile[256];
redisLog(REDIS_NOTICE,
//2. 构造aof写入的文件名(注意:这里文件名和子进程写入的时同一个文件,相当于把aof_rewrite_buf追加到了子进程的aof文件中)
snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) server.bgrewritechildpid);
//3. 打开文件
fd = open(tmpfile,O_WRONLY|O_APPEND);
if (fd == -1) {
//4. 打开失败。错误信息写入日志
redisLog(REDIS_WARNING, "Not able to open the temp append only file produced by the child: %s", strerror(errno));
goto cleanup;
}
//5. 重写缓存bgrewritebuf的数据追加到子进程的apf文件中
if (write(fd,server.bgrewritebuf,sdslen(server.bgrewritebuf)) !=
(signed) sdslen(server.bgrewritebuf)) {
//6. 如果追加失败,输出错误日志
redisLog(REDIS_WARNING, "Error or short write trying to flush the parent diff of the append log file in the child temp file: %s", strerror(errno));
close(fd);
goto cleanup;
}
//7. 追加缓存成功,输出日志
redisLog(REDIS_NOTICE,"Parent diff flushed into the new append log file with success (%lu bytes)",sdslen(server.bgrewritebuf));
//8. 重命令子进程的aof为配置文件的aof名
if (rename(tmpfile,server.appendfilename) == -1) {
redisLog(REDIS_WARNING,"Can't rename the temp append only file into the stable one: %s", strerror(errno));
close(fd);
goto cleanup;
}
//9. 后面是一些关闭文件、收尾工作
redisLog(REDIS_NOTICE,"Append only file successfully rewritten.");
if (server.appendfd != -1) {
/* If append only is actually enabled... */
close(server.appendfd);
server.appendfd = fd;
if (server.appendfsync != APPENDFSYNC_NO) aof_fsync(fd);
server.appendseldb = -1; /* Make sure it will issue SELECT */
redisLog(REDIS_NOTICE,"The new append only file was selected for future appends.");
/* Clear regular AOF buffer since its contents was just written to
* the new AOF from the background rewrite buffer. */
sdsfree(server.aofbuf);
server.aofbuf = sdsempty();
} else {
/* If append only is disabled we just generate a dump in this
* format. Why not? */
close(fd);
}
} else if (!bysignal && exitcode != 0) {
redisLog(REDIS_WARNING, "Background append only file rewriting error");
} else {
redisLog(REDIS_WARNING,
"Background append only file rewriting terminated by signal %d",
WTERMSIG(statloc));
}
cleanup:
sdsfree(server.bgrewritebuf);
server.bgrewritebuf = sdsempty();
aofRemoveTempFile(server.bgrewritechildpid);
server.bgrewritechildpid = -1;
}
no-appendfsync-on-rewrite参数
Linux之file_struct&fd(file descriptor)
Linux open()