Redis 之BIO与RIO

一、BIO 之后台IO操作
BIO : Background I/O service for Redis. 

负责我们需要在后台执行的操作。现在redis的版本中只有两类的操作,后台的close及fsync 系统调用。
当某个文件描述符是文件的最后一个引用(owner)时,对它执行close会触发真正的unlink(删除文件),这个过程可能比较慢并阻塞server;为了避免这种情况,redis把这类操作交给单独的后台线程来执行。

把数据从内存真正写入磁盘这一点非常重要,这需要调用 fsync()(或 fdatasync())把文件数据(以及文件元信息)强制刷新到磁盘中。这类调用的速度比较慢,而调用频度又可能很高,所以不能让这部分IO堵住现有的主流程,有必要放到后台去执行。

REDIS 允许有三种不同的策略:

/* Append only defines */
// Never fsync explicitly; flushing is left to the kernel (the original note
// says the kernel's background flush may run about every 30 seconds by
// default -- TODO confirm for the target system).
#define AOF_FSYNC_NO 0  
// fsync after every write to the AOF.
#define AOF_FSYNC_ALWAYS 1 
// fsync once per second.
#define AOF_FSYNC_EVERYSEC 2 
// Policy used when none is configured.
#define CONFIG_DEFAULT_AOF_FSYNC AOF_FSYNC_EVERYSEC

AOF_FSYNC_EVERYSEC 是在性能和数据安全这两点之间一个很好的折中。

/* Write the accumulated AOF buffer (server.aof_buf) to the AOF file
 * descriptor and fsync it according to the configured policy
 * (server.aof_fsync).
 *
 * force: when zero and the policy is AOF_FSYNC_EVERYSEC, the write may be
 *        postponed (for less than two seconds) while a background fsync is
 *        still pending; when non-zero the postponing logic is skipped.
 *
 * NOTE(review): 'nwritten' and 'latency' are used below without a visible
 * declaration, and the result of write() is not checked for errors or short
 * writes -- this looks like an abridged excerpt; confirm against the full
 * function. */
void flushAppendOnlyFile(int force) {
	int sync_in_progress = 0;
	
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC) // is a background fsync job still pending?
        sync_in_progress = bioPendingJobsOfType(BIO_AOF_FSYNC) != 0;

	// Decide, based on elapsed time, whether this write can be postponed.
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
        /* With this append fsync policy we do background fsyncing.
         * If the fsync is still in progress we can try to delay
         * the write for a couple of seconds. */
        if (sync_in_progress) {
            if (server.aof_flush_postponed_start == 0) {
                /* No previous write postponing, remember that we are
                 * postponing the flush and return. */
                server.aof_flush_postponed_start = server.unixtime;
                return;
            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
                /* We were already waiting for fsync to finish, but for less
                 * than two seconds this is still ok. Postpone again. */
                return;
            }
            /* Otherwise fall through, and go write since we can't wait
             * over two seconds. */
            server.aof_delayed_fsync++;
            serverLog(LL_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
        }
    }
    
    /* We want to perform a single write. This should be guaranteed atomic
     * at least if the filesystem we are writing is a real physical one.
     * While this will save us against the server being killed I don't think
     * there is much to do about the whole server stopping for power problems
     * or alike */
	// The write is wrapped in latency monitoring. Per the original author's
	// note, write() normally returns quickly because aof_buf is usually small;
	// presumably a slow write means it is stuck behind an in-flight
	// fdatasync -- TODO confirm.
    latencyStartMonitor(latency);
    nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
    latencyEndMonitor(latency);
    
    // Account for the bytes just written in the running AOF size.
    server.aof_current_size += nwritten;

	// Reuse the buffer's memory when it is small, otherwise release it.
    /* Re-use AOF buffer when it is small enough. The maximum comes from the
     * arena size of 4k minus some overhead (but is otherwise arbitrary). */
    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
        sdsclear(server.aof_buf);
    } else {
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }
    
    /* Perform the fsync if needed. */
    if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
        /* aof_fsync is defined as fdatasync() for Linux in order to avoid
         * flushing metadata. */
        latencyStartMonitor(latency);
        aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("aof-fsync-always",latency);
        server.aof_last_fsync = server.unixtime;
    } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) {
        // Hand the fsync to the background thread unless one is already
        // pending; the timestamp is refreshed either way.
        if (!sync_in_progress) 
        	aof_background_fsync(server.aof_fd);
        server.aof_last_fsync = server.unixtime;
    }
}

用法很简单、下面重点看下BIO内部是如何实现的.

/* Background job opcodes. Each opcode identifies one job type; the value is
 * also what a worker thread receives as its argument. */
#define BIO_CLOSE_FILE    0 /* Deferred close(2) syscall. */
#define BIO_AOF_FSYNC     1 /* Deferred AOF fsync. */
#define BIO_NUM_OPS       2 /* Number of job types (loop bound in bioInit()). */

主要两类作业类型:1.close 2.aof_fsync

//使用互斥量+条件变量,作为线程的保护条件
static pthread_mutex_t bio_mutex[REDIS_BIO_NUM_OPS];
static pthread_cond_t bio_condvar[REDIS_BIO_NUM_OPS];

//两类作业的队列、工作与挂起队列
static list *bio_jobs[REDIS_BIO_NUM_OPS];
static unsigned long long bio_pending[REDIS_BIO_NUM_OPS];

/* A single unit of background work. The type is private to this file: the
 * public API never exposes its layout. */
struct bio_job {
    /* Timestamp taken when the job was created. */
    time_t time;
    /* Opaque, job-specific arguments. A caller that needs more than three
     * values can pass a pointer to a structure through one of them. */
    void *arg1;
    void *arg2;
    void *arg3;
};

/* Set up the background I/O subsystem: initialize the per-type lock,
 * condition variable, job queue and pending counter, then start one worker
 * thread (bioProcessBackgroundJobs) per job type. The process exits with
 * status 1 if a worker thread cannot be created. */
void bioInit(void) {
    pthread_attr_t attr;
    pthread_t thread;
    size_t stacksize;
    int type;

    /* Per-type state: lock, condition variable, empty queue, zero counter. */
    for (type = 0; type < BIO_NUM_OPS; type++) {
        pthread_mutex_init(&bio_mutex[type],NULL);
        pthread_cond_init(&bio_condvar[type],NULL);
        bio_jobs[type] = listCreate();
        bio_pending[type] = 0;
    }

    /* The default stack may be small on some systems: start from the default
     * (or from 1 if it is reported as 0, a Solaris workaround) and keep
     * doubling until it reaches REDIS_THREAD_STACK_SIZE. */
    pthread_attr_init(&attr);
    pthread_attr_getstacksize(&attr,&stacksize);
    if (stacksize == 0) stacksize = 1;
    while (stacksize < REDIS_THREAD_STACK_SIZE) stacksize <<= 1;
    pthread_attr_setstacksize(&attr, stacksize);

    /* Start the workers. The job type each thread is responsible for is
     * smuggled through the single void* argument of the thread function. */
    for (type = 0; type < BIO_NUM_OPS; type++) {
        void *worker_arg = (void*)(unsigned long) type;
        int rc = pthread_create(&thread,&attr,bioProcessBackgroundJobs,worker_arg);

        if (rc != 0) {
            serverLog(LL_WARNING,"Fatal: Can't initialize Background Jobs.");
            exit(1);
        }
        bio_threads[type] = thread;
    }
}

/* Queue a new background job of the given type.
 *
 * type:            job opcode; selects the queue, lock and condition variable.
 * arg1/arg2/arg3:  opaque arguments stored in the job for the worker.
 *
 * The job is appended to the tail of the type's queue under that type's
 * mutex, the pending counter is incremented, and the condition variable is
 * signalled so a worker blocked in pthread_cond_wait() can pick it up. */
void bioCreateBackgroundJob(int type, void *arg1, void *arg2, void *arg3) {
    struct bio_job *new_job = zmalloc(sizeof(struct bio_job));

    new_job->arg1 = arg1;
    new_job->arg2 = arg2;
    new_job->arg3 = arg3;
    new_job->time = time(NULL);

    pthread_mutex_lock(&bio_mutex[type]);
    listAddNodeTail(bio_jobs[type],new_job);
    bio_pending[type] += 1;
    pthread_cond_signal(&bio_condvar[type]);
    pthread_mutex_unlock(&bio_mutex[type]);
}
 
/* Worker thread entry point.
 *
 * arg: the job type this thread serves, passed as an integer cast to void*.
 *
 * Returns NULL only when the type is out of range; otherwise the thread
 * loops forever, taking jobs from the head of its queue.
 *
 * Locking: the per-type mutex is held at the top of every loop iteration and
 * released only while the job itself runs. A job stays in the queue (and in
 * bio_pending) until it has finished, so the pending count includes the job
 * currently being executed. */
void *bioProcessBackgroundJobs(void *arg) {
    struct bio_job *job;
    unsigned long type = (unsigned long) arg;
    sigset_t sigset;

    /* Check that the type is within the right interval. */
    if (type >= BIO_NUM_OPS) {
        serverLog(LL_WARNING,
            "Warning: bio thread started with wrong type %lu",type);
        return NULL;
    }

    /* Make the thread killable at any time, so that bioKillThreads()
     * can work reliably. */
    pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL);
    pthread_setcanceltype(PTHREAD_CANCEL_ASYNCHRONOUS, NULL);

    pthread_mutex_lock(&bio_mutex[type]);
    /* Block SIGALRM so we are sure that only the main thread will
     * receive the watchdog signal. */
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGALRM);
    if (pthread_sigmask(SIG_BLOCK, &sigset, NULL))
        serverLog(LL_WARNING,
            "Warning: can't mask SIGALRM in bio.c thread: %s", strerror(errno));

    while(1) {
        listNode *ln;

		// Nothing queued: wait on the condition variable (this releases the
		// mutex while blocked), then re-check the queue.
        /* The loop always starts with the lock hold. */
        if (listLength(bio_jobs[type]) == 0) {
            pthread_cond_wait(&bio_condvar[type],&bio_mutex[type]);
            continue;
        }
        
        // Take the job at the head of the list; the node is not removed yet.
        /* Pop the job from the queue. */
        ln = listFirst(bio_jobs[type]);
        job = ln->value;
        /* It is now possible to unlock the background system as we now have
         * a stand alone job structure to process.*/
        pthread_mutex_unlock(&bio_mutex[type]);

		// Run the job without holding the lock.
        /* Process the job accordingly to its type. */
        if (type == BIO_CLOSE_FILE) {
            close((long)job->arg1);
        } else if (type == BIO_AOF_FSYNC) {
            aof_fsync((long)job->arg1);
        } else {
            serverPanic("Wrong job type in bioProcessBackgroundJobs().");
        }
        zfree(job);

        /* Lock again before reiterating the loop, if there are no longer
         * jobs to process we'll block again in pthread_cond_wait(). */
        pthread_mutex_lock(&bio_mutex[type]);
        // Remove the completed job's node from the list.
        listDelNode(bio_jobs[type],ln);
        // One fewer pending job.
        bio_pending[type]--;
    }
}

总结:
1、针对耗时的 close及fsync 进行另起线程后台执行、可以避免主线程阻塞问题。
2、对于高性能的文件刷新还有一些好的创意、这个好好再细看下。

二、RIO (统一buffer、file、socket不同对象IO操作)
I/O操作对于每个系统来说都是必不可少的一部分、而I/O操作的好坏,在一定程度上也会影响着系统的效率问题。

提供三个方面内容:
1、读写操作、获取偏移量操作等相关的回调函数。
   rio可以处理buffer、file、socket三种不同类型的I/O对象,不同的rio对象底层使用相应的系统调用完成
   read、write、tell、flush操作。比如,对于file rio对象,底层通过fwrite函数完成写操作,通过fread
   函数完成读操作。
2、校验和操作。rio使用了CRC64算法计算校验和,具体实现可以参看crc64.h和crc64.c文件。
3、IO变量。_rio中的io成员是一个联合体,针对不同的I/O情况进行不同的处理:当执行内存buffer的I/O操作时,
  使用rio.buffer结构体;当执行文件I/O操作时,使用rio.file结构体;当执行socket的I/O操作时,使用rio.fdset结构体。
   
先看一下 struct rio 结构:

// Wrapper around stream-oriented I/O: one set of callbacks plus
// backend-specific state for a memory buffer, a stdio file, or a set of
// file descriptors.
struct _rio {
    /* Backend functions.
     * Since this functions do not tolerate short writes or reads the return
     * value is simplified to: zero on error, non zero on complete success. */
    // Read callback for the stream.
    size_t (*read)(struct _rio *, void *buf, size_t len);
    // Write callback for the stream.
    size_t (*write)(struct _rio *, const void *buf, size_t len);
    // Returns the current read/write offset.
    off_t (*tell)(struct _rio *);
    // Flush callback.
    int (*flush)(struct _rio *);
    /* The update_cksum method if not NULL is used to compute the checksum of
     * all the data that was read or written so far. The method should be
     * designed so that can be called with the current checksum, and the buf
     * and len fields pointing to the new block of data to add to the checksum
     * computation. */
    void (*update_cksum)(struct _rio *, const void *buf, size_t len);

    /* The current checksum */
    uint64_t cksum;

    /* number of bytes read or written */
    size_t processed_bytes;

    /* maximum single read or write chunk size */
    // A value of 0 means no limit (see rioRead()/rioWrite()).
    size_t max_processing_chunk;

    /* Backend-specific vars. */
    // Only the member matching the active backend is meaningful.
    union {
        /* In-memory buffer target. */
        // Buffer pointer plus current offset into it.
        struct {
            sds ptr;
            off_t pos;
        } buffer;
        
        /* Stdio file pointer target. */
        struct {
            FILE *fp;
            off_t buffered; /* Bytes written since last fsync. */
            off_t autosync; /* fsync after 'autosync' bytes written. */
        } file;
        
        /* Multiple FDs target (used to write to N sockets). */
        // The same data is written to every descriptor in the set.
        struct {
            int *fds;       /* File descriptors. */
            int *state;     /* Error state of each fd. 0 (if ok) or errno. */
            // Number of entries in 'fds'.
            int numfds;
            // Current offset.
            off_t pos;
            // Buffer.
            sds buf;
        } fdset;
    } io;
};

再看rio统一定义的读写方法:

/* Write 'len' bytes from 'buf' to the rio stream.
 *
 * The data is split into pieces of at most r->max_processing_chunk bytes
 * (no limit when that field is 0). For each piece the checksum callback, if
 * set, runs BEFORE the backend write; r->processed_bytes is advanced after a
 * successful write.
 *
 * Returns 1 when everything was written (including len == 0), 0 as soon as
 * the backend write callback reports failure. */
static inline size_t rioWrite(rio *r, const void *buf, size_t len) {
    const char *cursor = buf;
    size_t remaining = len;

    while (remaining > 0) {
        size_t chunk = remaining;

        /* Clamp the piece to the configured maximum, if any. */
        if (r->max_processing_chunk && r->max_processing_chunk < remaining)
            chunk = r->max_processing_chunk;

        if (r->update_cksum) r->update_cksum(r,cursor,chunk);
        if (r->write(r,cursor,chunk) == 0) return 0;

        cursor += chunk;
        remaining -= chunk;
        r->processed_bytes += chunk;
    }
    return 1;
}

/* Read 'len' bytes from the rio stream into 'buf'.
 *
 * The request is split into pieces of at most r->max_processing_chunk bytes
 * (no limit when that field is 0). For each piece the checksum callback, if
 * set, runs AFTER the backend read has filled the buffer; r->processed_bytes
 * is then advanced.
 *
 * Returns 1 when everything was read (including len == 0), 0 as soon as the
 * backend read callback reports failure. */
static inline size_t rioRead(rio *r, void *buf, size_t len) {
    char *cursor = buf;
    size_t remaining = len;

    while (remaining > 0) {
        size_t chunk = remaining;

        /* Clamp the piece to the configured maximum, if any. */
        if (r->max_processing_chunk && r->max_processing_chunk < remaining)
            chunk = r->max_processing_chunk;

        if (r->read(r,cursor,chunk) == 0) return 0;
        if (r->update_cksum) r->update_cksum(r,cursor,chunk);

        cursor += chunk;
        remaining -= chunk;
        r->processed_bytes += chunk;
    }
    return 1;
}

每次通过rio读写数据时,如果设置了update_cksum回调,Redis都会对这部分数据更新一次校验和(写入前更新、读取后更新),用的算法就是CRC64算法。

下面继续分析 buffer IO和File IO及Socket IO.
rioFileIO使用标准C流式文件IO进行流式IO操作
rioBufferIO使用sds进行内存流式IO操作
rioFdsetIO使用多个socket fd写数据的IO操作

static const rio rioBufferIO = {
    rioBufferRead,
    rioBufferWrite,
    rioBufferTell,
    rioBufferFlush,
    NULL,           /* update_checksum */
    0,              /* current checksum */
    0,              /* bytes read or written */
    0,              /* read/write chunk size */
    { { NULL, 0 } } /* union for io-specific vars */
};

static const rio rioFileIO = {
    rioFileRead,
    rioFileWrite,
    rioFileTell,
    rioFileFlush,
    NULL,           /* update_checksum */
    0,              /* current checksum */
    0,              /* bytes read or written */
    0,              /* read/write chunk size */
    { { NULL, 0 } } /* union for io-specific vars */
};

static const rio rioFdsetIO = {
    rioFdsetRead,
    rioFdsetWrite,
    rioFdsetTell,
    rioFdsetFlush,
    NULL,           /* update_checksum */
    0,              /* current checksum */
    0,              /* bytes read or written */
    0,              /* read/write chunk size */
    { { NULL, 0 } } /* union for io-specific vars */
};

以上的几个函数都很简单、稍微看下就能明白意思,就不细讲了。这里说下file write函数,有个细节是
每次写入后都会把写入的字节数累加到rio.io.file.buffered;当设置了autosync并且累计字节数达到autosync时,会先fflush再调用aof_fsync把内容刷新到磁盘,并把buffered清零。

/* Write callback of the file backend.
 *
 * Writes 'len' bytes as a single fwrite() item and returns fwrite()'s result
 * (number of items written). 'len' is always added to the running
 * io.file.buffered counter; when autosync is non-zero and the counter has
 * reached it, the stdio buffer is flushed, the descriptor is fsynced via
 * aof_fsync(), and the counter is reset to zero. */
static size_t rioFileWrite(rio *r, const void *buf, size_t len) {
    size_t items_written = fwrite(buf,len,1,r->io.file.fp);

    r->io.file.buffered += len;

    /* Auto-sync disabled, or threshold not reached yet: nothing more to do. */
    if (!r->io.file.autosync) return items_written;
    if (r->io.file.buffered < r->io.file.autosync) return items_written;

    fflush(r->io.file.fp);
    aof_fsync(fileno(r->io.file.fp));
    r->io.file.buffered = 0;
    return items_written;
}

Redis中的rio模块还封装了一些辅助生成AOF协议的函数:

// NOTE(review): the "<...>" placeholders in the formats below are
// reconstructed. The original notes read "*\r\n" and "$\r\n\r\n", apparently
// with their angle-bracket fields stripped -- confirm against rio.c.

// Writes 'count' as a string in the form "*<count>\r\n" to the rio object
// and returns the number of bytes written.
size_t rioWriteBulkCount(rio *r, char prefix, int count);

// Writes a binary-safe string to the rio object in the form
// "$<len>\r\n<payload>\r\n".
size_t rioWriteBulkString(rio *r, const char *buf, size_t len);

// Writes a long long value to the rio object in the form
// "$<len>\r\n<payload>\r\n".
size_t rioWriteBulkLongLong(rio *r, long long l);

// Writes a double value to the rio object in the form
// "$<len>\r\n<payload>\r\n".
size_t rioWriteBulkDouble(rio *r, double d);

总结:
1、rio提供了基于文件流、内存流以及多个socket fd的读、写、位置通告、校验和操作方法
2、若设置了校验和方法,写入数据前、读取数据后会进行校验和更新操作
3、提供了用于写Redis协议的高层API函数


你可能感兴趣的:(分布式存储)