The control block that describes the read or write to be performed:
/* Asynchronous I/O control block. */
struct aiocb
{
  int aio_fildes;                /* File descriptor. */
  int aio_lio_opcode;            /* Operation to be performed. */
  int aio_reqprio;               /* Request priority offset. */
  volatile void *aio_buf;        /* Location of buffer. */
  size_t aio_nbytes;             /* Length of transfer. */
  struct sigevent aio_sigevent;  /* Signal number and value. */

  /* Internal members. */
  struct aiocb *__next_prio;
  int __abs_prio;
  int __policy;
  int __error_code;
  __ssize_t __return_value;

#ifndef __USE_FILE_OFFSET64
  __off_t aio_offset;            /* File offset. */
  char __pad[sizeof (__off64_t) - sizeof (__off_t)];
#else
  __off64_t aio_offset;          /* File offset. */
#endif
  char __unused[32];
};
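A caller only fills in the public fields; the members starting with __ belong to the implementation. A minimal sketch of preparing a control block (fd, buf and len are assumed to come from the surrounding program):

#include <aio.h>
#include <signal.h>
#include <string.h>

/* Sketch: prepare a control block for reading `len` bytes at offset 0.
   `fd` is assumed to be an already-open file descriptor. */
static void
setup_aiocb (struct aiocb *cb, int fd, void *buf, size_t len)
{
  memset (cb, 0, sizeof (*cb));   /* leaves the internal __ members zeroed */
  cb->aio_fildes = fd;
  cb->aio_buf = buf;
  cb->aio_nbytes = len;
  cb->aio_offset = 0;
  cb->aio_sigevent.sigev_notify = SIGEV_NONE;   /* no completion signal */
}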
The parameters used to tune the asynchronous I/O implementation at initialization time:
/* To customize the implementation one can use the following struct.
   This implementation follows the one in Irix. */
struct aioinit
{
  int aio_threads;    /* Maximal number of threads. */
  int aio_num;        /* Number of expected simultaneous requests. */
  int aio_locks;      /* Not used. */
  int aio_usedba;     /* Not used. */
  int aio_debug;      /* Not used. */
  int aio_numusers;   /* Not used. */
  int aio_idle_time;  /* Number of seconds before idle thread
                         terminates. */
  int aio_reserved;
};
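This struct is consumed by the GNU extension aio_init, declared in <aio.h> when _GNU_SOURCE is defined; it must be called before the first request is submitted. A sketch with arbitrary example numbers:

#define _GNU_SOURCE
#include <aio.h>

/* Sketch: tune the worker pool before any aio_* call is made. */
static void
tune_aio (void)
{
  struct aioinit init = { 0 };
  init.aio_threads = 32;     /* allow up to 32 worker threads */
  init.aio_num = 256;        /* expect up to 256 simultaneous requests */
  init.aio_idle_time = 5;    /* idle workers exit after 5 seconds */
  aio_init (&init);
}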
The 32-bit and 64-bit control blocks combined into a union:
/* Union of the two request types. */
typedef union
{
  struct aiocb aiocb;
  struct aiocb64 aiocb64;
} aiocb_union;
This structure is the most important one: when the worker threads run, all of their data comes from it.
/* Used to queue requests. */
struct requestlist
{
  int running;

  /* last_fd/next_fd: doubly linked list of entries, in ascending order
     of file descriptor. */
  struct requestlist *last_fd;
  struct requestlist *next_fd;
  /* next_prio: singly linked list of the requests that share one file
     descriptor, in descending order of priority. */
  struct requestlist *next_prio;
  /* next_run: the next request to execute; this is the link used by the
     global runlist. */
  struct requestlist *next_run;

  /* Pointer to the actual data: the caller-supplied control block. */
  aiocb_union *aiocbp;

#ifdef BROKEN_THREAD_SIGNALS
  /* PID of the initiator thread.
     XXX This is only necessary for the broken signal handling on Linux. */
  pid_t caller_pid;
#endif

  /* List of waiting processes. */
  struct waitlist *waiting;
};
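The result is a two-dimensional structure: the last_fd/next_fd list holds one chain head per descriptor, and each head starts a next_prio chain for that descriptor. To make the shape concrete, here is an illustrative lookup written against the fields above (a sketch in the spirit of glibc's internal lookup helper, not its actual code):

/* Sketch: find the queued entry for a given control block by walking
   the fd-sorted outer list, then the priority-sorted inner list. */
static struct requestlist *
find_request (struct requestlist *head, const aiocb_union *aiocbp)
{
  struct requestlist *runp = head;

  /* Outer list: one chain per file descriptor, ascending. */
  while (runp != NULL
         && runp->aiocbp->aiocb.aio_fildes < aiocbp->aiocb.aio_fildes)
    runp = runp->next_fd;

  if (runp == NULL
      || runp->aiocbp->aiocb.aio_fildes != aiocbp->aiocb.aio_fildes)
    return NULL;

  /* Inner list: all requests for this descriptor, descending priority. */
  while (runp != NULL && runp->aiocbp != aiocbp)
    runp = runp->next_prio;
  return runp;
}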
Each requestlist entry carries a state:
/* Status of a request. */
enum
{
  no,
  queued,     /* linked into a descriptor's priority list */
  yes,        /* to be run later; waiting on the runlist */
  allocated,  /* a worker thread has been assigned */
  done
};
The important global variables:
/* Pool of request list entries.  Every requestlist entry is taken from
   here; the entries are preallocated, and the pool can also grow
   dynamically when it runs out. */
static struct requestlist **pool;

/* List of available entries. */
static struct requestlist *freelist;

/* List of requests waiting to be processed. */
static struct requestlist *runlist;

/* Structure list of all currently processed requests. */
static struct requestlist *requests;
aio_read and aio_write contain no real logic of their own; they call straight into the enqueue function:
int
aio_read (struct aiocb *aiocbp)
{
  return (__aio_enqueue_request ((aiocb_union *) aiocbp, LIO_READ) == NULL
          ? -1 : 0);
}

int
aio_write (struct aiocb *aiocbp)
{
  return (__aio_enqueue_request ((aiocb_union *) aiocbp, LIO_WRITE) == NULL
          ? -1 : 0);
}
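The LIO_READ/LIO_WRITE opcodes passed here are the same ones used by the batch interface lio_listio, which feeds the same enqueue path one control block at a time. A usage sketch (cb_r and cb_w are assumed to be fully initialized control blocks):

#include <aio.h>
#include <stddef.h>

/* Sketch: submit one read and one write in a single batch and wait
   for both to complete. */
static int
batch_io (struct aiocb *cb_r, struct aiocb *cb_w)
{
  cb_r->aio_lio_opcode = LIO_READ;
  cb_w->aio_lio_opcode = LIO_WRITE;
  struct aiocb *list[2] = { cb_r, cb_w };

  /* LIO_WAIT blocks until every request in the list has finished. */
  return lio_listio (LIO_WAIT, list, 2, NULL);
}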
The most important piece, the __aio_enqueue_request function:
/* The main function of the async I/O handling.  It enqueues requests
   and if necessary starts and handles threads. */
struct requestlist *
internal_function
__aio_enqueue_request (aiocb_union *aiocbp, int operation)
{
  int result = 0;
  int policy, prio;
  struct sched_param param;
  struct requestlist *last, *runp, *newp;
  int running = no;

  if (operation == LIO_SYNC || operation == LIO_DSYNC)
    aiocbp->aiocb.aio_reqprio = 0;
  else if (aiocbp->aiocb.aio_reqprio < 0
           || aiocbp->aiocb.aio_reqprio > AIO_PRIO_DELTA_MAX)
    {
      /* Invalid priority value. */
      __set_errno (EINVAL);
      aiocbp->aiocb.__error_code = EINVAL;
      aiocbp->aiocb.__return_value = -1;
      return NULL;
    }

  /* Compute priority for this request. */
  pthread_getschedparam (pthread_self (), &policy, &param);
  prio = param.sched_priority - aiocbp->aiocb.aio_reqprio;

  /* Get the mutex. */
  pthread_mutex_lock (&__aio_requests_mutex);
  last = NULL;
  runp = requests;
  /* First look whether the current file descriptor is currently
     worked with. */
  while (runp != NULL
         && runp->aiocbp->aiocb.aio_fildes < aiocbp->aiocb.aio_fildes)
    {
      last = runp;
      runp = runp->next_fd;
    }

  /* Get a new element for the waiting list. */
  newp = get_elem ();
  if (newp == NULL)
    {
      pthread_mutex_unlock (&__aio_requests_mutex);
      __set_errno (EAGAIN);
      return NULL;
    }
  newp->aiocbp = aiocbp;
#ifdef BROKEN_THREAD_SIGNALS
  newp->caller_pid = (aiocbp->aiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL
                      ? getpid () : 0);
#endif
  newp->waiting = NULL;

  aiocbp->aiocb.__abs_prio = prio;
  aiocbp->aiocb.__policy = policy;
  aiocbp->aiocb.aio_lio_opcode = operation;
  aiocbp->aiocb.__error_code = EINPROGRESS;
  aiocbp->aiocb.__return_value = 0;
  if (runp != NULL
      && runp->aiocbp->aiocb.aio_fildes == aiocbp->aiocb.aio_fildes)
    {
      /* The current file descriptor is worked on.  It makes no sense
         to start another thread since this new thread would fight
         with the running thread for the resources.  But we also cannot
         say that the thread processing this descriptor shall immediately
         after finishing the current job process this request if there
         are other threads in the running queue which have a higher
         priority. */

      /* Simply enqueue it after the running one according to the
         priority. */
      while (runp->next_prio != NULL
             && runp->next_prio->aiocbp->aiocb.__abs_prio >= prio)
        runp = runp->next_prio;

      newp->next_prio = runp->next_prio;
      runp->next_prio = newp;

      running = queued;
    }
  else
    {
      running = yes;
      /* Enqueue this request for a new descriptor. */
      if (last == NULL)
        {
          newp->last_fd = NULL;
          newp->next_fd = requests;
          if (requests != NULL)
            requests->last_fd = newp;
          requests = newp;
        }
      else
        {
          newp->next_fd = last->next_fd;
          newp->last_fd = last;
          last->next_fd = newp;
          if (newp->next_fd != NULL)
            newp->next_fd->last_fd = newp;
        }

      newp->next_prio = NULL;
    }
  if (running == yes)
    {
      /* We try to create a new thread for this file descriptor.  The
         function which gets called will handle all available requests
         for this descriptor and when all are processed it will
         terminate.

         If no new thread can be created or if the specified limit of
         threads for AIO is reached we queue the request. */

      /* See if we need to and are able to create a thread. */
      if (nthreads < optim.aio_threads && idle_thread_count == 0)
        {
          pthread_t thid;

          running = newp->running = allocated;

          /* Now try to start a thread.  The thread runs
             handle_fildes_io, which performs the actual I/O. */
          if (aio_create_helper_thread (&thid, handle_fildes_io, newp) == 0)
            /* We managed to enqueue the request.  All errors which can
               happen now can be recognized by calls to `aio_return' and
               `aio_error'. */
            ++nthreads;
          else
            {
              /* Reset the running flag.  The new request is not running. */
              running = newp->running = yes;

              if (nthreads == 0)
                /* We cannot create a thread in the moment and there is
                   also no thread running.  This is a problem.  `errno' is
                   set to EAGAIN if this is only a temporary problem. */
                result = -1;
            }
        }
    }
  /* Enqueue the request in the run queue if it is not yet running. */
  if (running == yes && result == 0)
    {
      add_request_to_runlist (newp);

      /* If there is a thread waiting for work, then let it know that we
         have just given it something to do. */
      if (idle_thread_count > 0)
        pthread_cond_signal (&__aio_new_request_notification);
    }

  if (result == 0)
    newp->running = running;
  else
    {
      /* Something went wrong. */
      __aio_free_request (newp);
      newp = NULL;
    }

  /* Release the mutex. */
  pthread_mutex_unlock (&__aio_requests_mutex);

  return newp;
}
In summary: the function first searches requests by file descriptor. If an entry for that descriptor already exists, the new request is inserted into its next_prio list in descending priority order; otherwise a new entry is added to requests. In the latter case, if the thread limit has not been reached and no thread is idle, a new worker thread is created to run the request; otherwise the request is appended to runlist and an idle thread, if any, is woken up. The actual read or write happens inside the worker thread, in handle_fildes_io.
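handle_fildes_io itself is not reproduced here, but per request it boils down to the following sketch (heavily simplified: the real function loops pulling further requests for its descriptor and from runlist, handles the 64-bit and fsync/fdatasync opcodes, and performs notification and waiter wake-up under the mutex):

#include <aio.h>
#include <errno.h>
#include <unistd.h>

/* Simplified sketch of the per-request work of a worker thread:
   do the transfer, then publish the outcome where aio_return and
   aio_error will find it. */
static void
process_one (struct requestlist *runp)
{
  struct aiocb *cb = &runp->aiocbp->aiocb;
  ssize_t n;

  if (cb->aio_lio_opcode == LIO_READ)
    n = pread (cb->aio_fildes, (void *) cb->aio_buf,
               cb->aio_nbytes, cb->aio_offset);
  else   /* LIO_WRITE */
    n = pwrite (cb->aio_fildes, (const void *) cb->aio_buf,
                cb->aio_nbytes, cb->aio_offset);

  cb->__return_value = n;
  cb->__error_code = n < 0 ? errno : 0;
}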
int
aio_error (const struct aiocb *aiocbp)
{
  return aiocbp->__error_code;
}

ssize_t
aio_return (struct aiocb *aiocbp)
{
  return aiocbp->__return_value;
}
These two functions do no work of their own; they simply return the status stored in the aiocb, which handle_fildes_io updates.
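The caller-visible protocol is therefore: poll aio_error until it stops returning EINPROGRESS, then collect the result with aio_return. A sketch:

#include <aio.h>
#include <errno.h>
#include <sys/types.h>

/* Sketch: non-blocking completion check for a submitted request.
   Returns 1 and stores the byte count once done, 0 while in flight. */
static int
check_done (struct aiocb *cb, ssize_t *result)
{
  int err = aio_error (cb);
  if (err == EINPROGRESS)
    return 0;                  /* still queued or running */
  *result = aio_return (cb);   /* byte count, or -1 with the error in err */
  return 1;
}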
In summary, asynchronous I/O in glibc starts multiple threads, with each thread responsible for the read and write operations of one file descriptor, in order to improve I/O throughput. Its weakness is the heavy reliance on a single global mutex:
/* Since the list is global we need a mutex protecting it. */
pthread_mutex_t __aio_requests_mutex = PTHREAD_RECURSIVE_MUTEX_INITIALIZER_NP;
A side note on the related open(2) flags. O_DIRECT alone only promises that the kernel will avoid copying data from user space to kernel space, and will instead transfer it directly via DMA where possible; the data does not go into the page cache. There is no strict guarantee that the call returns only after all data has been transferred.
O_SYNC guarantees that the call will not return before all data has been transferred to the disk (as far as the OS can tell). This still does not guarantee that the data isn't in the hard disk's write cache, but it is as much as the OS can guarantee.
O_DIRECT|O_SYNC is the combination of these, i.e. "DMA + guarantee".
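One caveat: O_DIRECT also requires the user buffer, the file offset, and the transfer length to be suitably aligned, typically to the logical block size. A sketch of the combination (4096 is assumed as the block size here; real code should query the device):

#define _GNU_SOURCE     /* exposes O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

/* Sketch: "DMA + guarantee".  `len` must be a multiple of the block
   size, and the buffer must be block-aligned, hence posix_memalign. */
static ssize_t
direct_sync_write (const char *path, size_t len)
{
  int fd = open (path, O_WRONLY | O_CREAT | O_DIRECT | O_SYNC, 0644);
  if (fd < 0)
    return -1;

  void *buf;
  if (posix_memalign (&buf, 4096, len) != 0)
    {
      close (fd);
      return -1;
    }
  /* ... fill buf with the data to write ... */

  ssize_t n = write (fd, buf, len);   /* returns only after the transfer */
  free (buf);
  close (fd);
  return n;
}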
When you open with O_SYNC, the data must go all the way to disk on
every write call. This means you get at least one disk access for
every write, and possibly more if the writes are large (>64k).
When you don't use O_SYNC and only flush after all writes have been
submitted by the application, the kernel is able to combine writes
in the cache and at the block device layer. You therefore end up with
fewer accesses to the physical disk, which is much faster.
On a Linux/Compaq Alpha I measured the following: if I open with O_SYNC,
I can flush the end of my file (it is a log file) to disk 170 times per
second. If I do not open with O_SYNC, but call fsync or fdatasync after
each write, I get only 50 writes per second.
This is generally the case. If you need to sync every write, O_SYNC
is usually faster than fsync. If you don't need to sync
every individual write, then a single fsync after the last
write is the fastest to get all the data to disk.
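The faster pattern from the last paragraph, batching the writes and syncing once at the end, in outline (short-write handling omitted):

#include <unistd.h>

/* Sketch: let the kernel combine the writes in the cache, then force
   everything out with a single fdatasync -- usually much faster than
   O_SYNC when per-write durability is not needed. */
static int
write_all_then_sync (int fd, const char *const chunks[],
                     const size_t lens[], int n)
{
  for (int i = 0; i < n; ++i)
    if (write (fd, chunks[i], lens[i]) < 0)
      return -1;
  return fdatasync (fd);   /* one flush covers the whole batch */
}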