linux 2.6以上内核提供以下几个系统调用来支持aio:
1、 SYS_io_setup:建立aio 的context
2、 SYS_io_submit: 提交I/O操作请求
3、 SYS_io_getevents:获取已完成的I/O事件
4、 SYS_io_cancel:取消I/O操作请求
5、 SYS_io_destroy:毁销aio的context
nginx中将aio和epoll事件模型(假设nginx使用epoll事件模型)组合起来使用,当请求的I/O操作完成时调用epoll相关函数通知应用程序来读取。组合的关键点在于使用了eventfd对象,那什么是eventfd呢?有关eventfd的详解资料参考http://linux.die.net/man/2/eventfd。
nginx在woker工作进程启动后初始化epoll事件模型时初始化aio(ngx_epoll_aio_init函数负责aio的初始化)。实现原理如下:在aio初始化时调用eventfd系统调用创建一个eventfd对象,eventfd系统调用返回一个与eventfd对象关联的文件描述符,设置该描述符为非阻塞并添加到epoll中,当该描述符可读时epoll_wait函数返回调用read函数读取当前完成的I/O操作个数,我们通过调用SYS_io_getevents系统调用就可以获取已完成的I/o事件。但eventfd描述符什么时候可读呢?这需要在提交I/O事件时将eventfd与aio关联起来(实现代码在ngx_file_aio_read函数中),提交I/O事件后如果I/O事件已完成就将当前完成的事件个数写入到eventfd描述符相关的计数器中并标识eventfd可读。下面来看看nginx中与aio相关的源代码:
ngx_epoll_aio_init函数:
static void
ngx_epoll_aio_init(ngx_cycle_t *cycle, ngx_epoll_conf_t *epcf)
{
int n;
struct epoll_event ee;
/***调用eventfd系统调用新建一个eventfd对象,第二个参数中的0表示eventfd的计数器初始值为0,
系统调用成功返回的是与eventfd对象关联的描述符***/
ngx_eventfd = syscall(SYS_eventfd, 0);
if (ngx_eventfd == -1) {
ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno,
"eventfd() failed");
ngx_file_aio = 0;
return;
}
ngx_log_debug1(NGX_LOG_DEBUG_EVENT, cycle->log, 0,
"eventfd: %d", ngx_eventfd);
n = 1;
//设置描述符为非阻塞
if (ioctl(ngx_eventfd, FIONBIO, &n) == -1) {
ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno,
"ioctl(eventfd, FIONBIO) failed");
goto failed;
}
//调用SYS_io_setup系统调用建立aio context
if (io_setup(epcf->aio_requests, &ngx_aio_ctx) == -1) {
ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno,
"io_setup() failed");
goto failed;
}
//ngx_epoll_eventfd_handler函数当ngx_eventfd描述符可读时被调用
ngx_eventfd_event.data = &ngx_eventfd_conn;
ngx_eventfd_event.handler = ngx_epoll_eventfd_handler;
ngx_eventfd_event.log = cycle->log;
ngx_eventfd_event.active = 1;
ngx_eventfd_conn.fd = ngx_eventfd;
ngx_eventfd_conn.read = &ngx_eventfd_event;
ngx_eventfd_conn.log = cycle->log;
ee.events = EPOLLIN|EPOLLET;
ee.data.ptr = &ngx_eventfd_conn;
//将ngx_eventfd加入到epoll中
if (epoll_ctl(ep, EPOLL_CTL_ADD, ngx_eventfd, &ee) != -1) {
return;
}
ngx_log_error(NGX_LOG_EMERG, cycle->log, ngx_errno,
"epoll_ctl(EPOLL_CTL_ADD, eventfd) failed");
//epoll_ctl失败时需毁销aio的context
if (io_destroy(ngx_aio_ctx) == -1) {
ngx_log_error(NGX_LOG_ALERT, cycle->log, ngx_errno,
"io_destroy() failed");
}
failed:
if (close(ngx_eventfd) == -1) {
ngx_log_error(NGX_LOG_ALERT, cycle->log, ngx_errno,
"eventfd close() failed");
}
ngx_eventfd = -1;
ngx_aio_ctx = 0;
ngx_file_aio = 0;
}
ngx_epoll_eventfd_handler函数:
static void
ngx_epoll_eventfd_handler(ngx_event_t *ev)
{
int n, events;
long i;
uint64_t ready;
ngx_err_t err;
ngx_event_t *e;
ngx_event_aio_t *aio;
struct io_event event[64];
struct timespec ts;
ngx_log_debug0(NGX_LOG_DEBUG_EVENT, ev->log, 0, "eventfd handler");
//调用read函数读取已完成的I/O的个数
n = read(ngx_eventfd, &ready, 8);
. . .
ts.tv_sec = 0;
ts.tv_nsec = 0;
/***循环调用io_getevents函数获取所有已完成的I/O操作***/
while (ready) {
events = io_getevents(ngx_aio_ctx, 1, 64, event, &ts);
ngx_log_debug1(NGX_LOG_DEBUG_EVENT, ev->log, 0,
"io_getevents: %l", events);
if (events > 0) {
. . .
continue;
}
if (events == 0) {
return;
}
/* events == -1 */
ngx_log_error(NGX_LOG_ALERT, ev->log, ngx_errno,
"io_getevents() failed");
return;
}
}
ngx_file_aio_read函数:
ssize_t
ngx_file_aio_read(ngx_file_t *file, u_char *buf, size_t size, off_t offset,
ngx_pool_t *pool)
{
ngx_err_t err;
struct iocb *piocb[1];
ngx_event_t *ev;
ngx_event_aio_t *aio;
. . .
ngx_memzero(&aio->aiocb, sizeof(struct iocb));
/***设置iocb结构,注意aio->aiocb.aio_flags与aio->aiocb.aio_resfd这两个成员***/
aio->aiocb.aio_data = (uint64_t) (uintptr_t) ev;
aio->aiocb.aio_lio_opcode = IOCB_CMD_PREAD;
aio->aiocb.aio_fildes = file->fd;
aio->aiocb.aio_buf = (uint64_t) (uintptr_t) buf;
aio->aiocb.aio_nbytes = size;
aio->aiocb.aio_offset = offset;
/***当IOCB_FLAG_RESFD标识被设置时就使用aio->aiocb.aio_resfd变量中的描述符中通知用户态I/O事件已完成***/
aio->aiocb.aio_flags = IOCB_FLAG_RESFD;
aio->aiocb.aio_resfd = ngx_eventfd;
ev->handler = ngx_file_aio_event_handler;
piocb[0] = &aio->aiocb;
//提交I/O事件
if (io_submit(ngx_aio_ctx, 1, piocb) == 1) {
ev->active = 1;
ev->ready = 0;
ev->complete = 0;
return NGX_AGAIN;
}
err = ngx_errno;
if (err == NGX_EAGAIN) {
return ngx_read_file(file, buf, size, offset);
}
ngx_log_error(NGX_LOG_CRIT, file->log, err,
"io_submit(\"%V\") failed", &file->name);
if (err == NGX_ENOSYS) {
ngx_file_aio = 0;
return ngx_read_file(file, buf, size, offset);
}
return NGX_ERROR;
}
io_setup函数:
static int
io_setup(u_int nr_reqs, aio_context_t *ctx)
{
return syscall(SYS_io_setup, nr_reqs, ctx);
}
io_destroy函数:
static int
io_destroy(aio_context_t ctx)
{
return syscall(SYS_io_destroy, ctx);
}
io_getevents函数:
static int
io_getevents(aio_context_t ctx, long min_nr, long nr, struct io_event *events,
struct timespec *tmo)
{
return syscall(SYS_io_getevents, ctx, min_nr, nr, events, tmo);
}
io_submit函数:
static int
io_submit(aio_context_t ctx, long n, struct iocb **paiocb)
{
return syscall(SYS_io_submit, ctx, n, paiocb);
}