http://bookjovi.iteye.com/blog/1186736
Linux中等待多个fd上IO事件的接口(即I/O多路复用,常被笼统地称为“异步IO等待”)无非就三种:select、poll和epoll(其中epoll实际上是epoll_create/epoll_ctl/epoll_wait一组系统调用)。很多人无法理解这三种调用的区别,或了解得不够深入,今天就结合Linux kernel code详细描述三者的区别!
select:
select 的限制就是最多只能处理1024个fd(fd的值必须小于__FD_SETSIZE),可以查看kernel中的posix_types.h,里面定义了fdset数据结构,显然select不适合需要同时监听大量fd的场景(如webserver)。
include/linux/posix_types.h :
- #undef __NFDBITS
- #define __NFDBITS (8 * sizeof(unsigned long))
-
- #undef __FD_SETSIZE
- #define __FD_SETSIZE 1024
-
- #undef __FDSET_LONGS
- #define __FDSET_LONGS (__FD_SETSIZE/__NFDBITS)
-
- #undef __FDELT
- #define __FDELT(d) ((d) / __NFDBITS)
-
- #undef __FDMASK
- #define __FDMASK(d) (1UL << ((d) % __NFDBITS))
-
- typedef struct {
- unsigned long fds_bits [__FDSET_LONGS];
- } __kernel_fd_set;
/* Number of bits in one unsigned long (computed as 8 bits per byte). */
#undef __NFDBITS
#define __NFDBITS (8 * sizeof(unsigned long))
/* Number of descriptor bits in the set; fixed at 1024. */
#undef __FD_SETSIZE
#define __FD_SETSIZE 1024
/* Number of unsigned long words needed to hold __FD_SETSIZE bits. */
#undef __FDSET_LONGS
#define __FDSET_LONGS (__FD_SETSIZE/__NFDBITS)
/* Index of the word in fds_bits that contains the bit for descriptor d. */
#undef __FDELT
#define __FDELT(d) ((d) / __NFDBITS)
/* Mask that selects the bit for descriptor d inside its word. */
#undef __FDMASK
#define __FDMASK(d) (1UL << ((d) % __NFDBITS))
/* Fixed-size bitmap: one bit per descriptor, __FD_SETSIZE bits in total. */
typedef struct {
unsigned long fds_bits [__FDSET_LONGS];
} __kernel_fd_set;
poll:
poll相对于select改进了fdset size的限制:poll没有再使用fdset数组结构,而是使用了pollfd,这样用户可以自定义非常大的pollfd数组。这个pollfd数组在kernel中的表现形式是poll_list链表,这样就不存在1024的限制了,除此之外poll相比select无太大区别。
- int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
- struct timespec *end_time)
- {
- struct poll_wqueues table;
- int err = -EFAULT, fdcount, len, size;
-
-
-
- long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
- struct poll_list *const head = (struct poll_list *)stack_pps;
- struct poll_list *walk = head;
- unsigned long todo = nfds;
-
- if (nfds > rlimit(RLIMIT_NOFILE))
- return -EINVAL;
-
- len = min_t(unsigned int, nfds, N_STACK_PPS);
- for (;;) {
- walk->next = NULL;
- walk->len = len;
- if (!len)
- break;
-
- if (copy_from_user(walk->entries, ufds + nfds-todo,
- sizeof(struct pollfd) * walk->len))
- goto out_fds;
-
- todo -= walk->len;
- if (!todo)
- break;
-
- len = min(todo, POLLFD_PER_PAGE);
- size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
- walk = walk->next = kmalloc(size, GFP_KERNEL);
- if (!walk) {
- err = -ENOMEM;
- goto out_fds;
- }
- }
/*
 * Excerpt of do_sys_poll(): copies the caller's pollfd array (ufds, nfds
 * entries) into a chain of poll_list chunks. The first chunk is placed in
 * the on-stack buffer stack_pps; every further chunk is obtained with
 * kmalloc(). The excerpt stops after the copy loop, so the out_fds label
 * and the remainder of the function are not shown here.
 */
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
struct timespec *end_time)
{
struct poll_wqueues table;
/* err starts as -EFAULT, which is what a failed copy_from_user() reports. */
int err = -EFAULT, fdcount, len, size;
/* Allocate small arguments on the stack to save memory and be
faster - use long to make sure the buffer is aligned properly
on 64 bit archs to avoid unaligned access */
long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
/* head is the first chunk of the list and aliases the stack buffer. */
struct poll_list *const head = (struct poll_list *)stack_pps;
struct poll_list *walk = head;
/* Number of pollfd entries that still have to be copied in. */
unsigned long todo = nfds;
/* Refuse requests for more entries than the RLIMIT_NOFILE limit. */
if (nfds > rlimit(RLIMIT_NOFILE))
return -EINVAL;
/* The first chunk holds at most N_STACK_PPS entries. */
len = min_t(unsigned int, nfds, N_STACK_PPS);
for (;;) {
walk->next = NULL;
walk->len = len;
/* Nothing to copy into this chunk. */
if (!len)
break;
/* ufds + nfds - todo is the first user entry not copied yet. */
if (copy_from_user(walk->entries, ufds + nfds-todo,
sizeof(struct pollfd) * walk->len))
goto out_fds;
todo -= walk->len;
/* Every entry has been copied. */
if (!todo)
break;
/* Later chunks hold at most POLLFD_PER_PAGE entries each. */
len = min(todo, POLLFD_PER_PAGE);
/* Chunk header plus room for len pollfd entries. */
size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
walk = walk->next = kmalloc(size, GFP_KERNEL);
if (!walk) {
err = -ENOMEM;
goto out_fds;
}
}
epoll:
select与poll的共同点是fd有数据后kernel会遍历所有fd,找到有效fd后初始化相应的revents,用户空间程序须再次遍历整个fdset,以找到有效的fd,这样实际上就遍历了两次fd数组表,对于极大量fd的情况,这样的性能非常不好,请看一下do_poll代码:
- static int do_poll(unsigned int nfds, struct poll_list *list,
- struct poll_wqueues *wait, struct timespec *end_time)
- {
- poll_table* pt = &wait->pt;
- ktime_t expire, *to = NULL;
- int timed_out = 0, count = 0;
- unsigned long slack = 0;
-
-
- if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
- pt = NULL;
- timed_out = 1;
- }
-
- if (end_time && !timed_out)
- slack = select_estimate_accuracy(end_time);
-
- for (;;) {
- struct poll_list *walk;
-
- for (walk = list; walk != NULL; walk = walk->next) {
- struct pollfd * pfd, * pfd_end;
-
- pfd = walk->entries;
- pfd_end = pfd + walk->len;
- for (; pfd != pfd_end; pfd++) {
-
-
-
-
-
-
-
- if (do_pollfd(pfd, pt)) {
- count++;
- pt = NULL;
- }
- }
- }
/*
 * Excerpt of do_poll(): walks every poll_list chunk in list and calls
 * do_pollfd() on each pollfd entry, counting the entries for which it
 * returns non-zero. The excerpt is cut off inside the outer for (;;)
 * loop, so the waiting and return logic is not shown here.
 */
static int do_poll(unsigned int nfds, struct poll_list *list,
struct poll_wqueues *wait, struct timespec *end_time)
{
poll_table* pt = &wait->pt;
ktime_t expire, *to = NULL;
int timed_out = 0, count = 0;
unsigned long slack = 0;
/* Optimise the no-wait case */
/* A zero end_time means: drop the poll_table and mark as timed out. */
if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
pt = NULL;
timed_out = 1;
}
/* Only estimate the slack when a non-zero timeout was supplied. */
if (end_time && !timed_out)
slack = select_estimate_accuracy(end_time);
for (;;) {
struct poll_list *walk;
/* Visit every chunk of the list, and every entry inside each chunk. */
for (walk = list; walk != NULL; walk = walk->next) {
struct pollfd * pfd, * pfd_end;
pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) {
/*
* Fish for events. If we found one, record it
* and kill the poll_table, so we don't
* needlessly register any other waiters after
* this. They'll get immediately deregistered
* when we break out and return.
*/
if (do_pollfd(pfd, pt)) {
count++;
pt = NULL;
}
}
}
epoll的出现解决了这种问题,那么epoll是如何做到的呢? 我们知道select, poll和epoll都是使用waitqueue调用callback函数去wakeup你的异步等待线程的,如果设置了timeout的话就起一个hrtimer。select和poll的callback函数除了唤醒等待者之外并没有做什么事情,但epoll的waitqueue callback函数会先把当前的有效fd加到ready list,然后再唤醒异步等待进程,所以epoll_wait返回的就是这个ready list中的内容。ready list中包含所有有效的fd,这样一来kernel不用去遍历所有的fd,用户空间程序也不用遍历所有的fd,而只需遍历返回的有效fd链表,所以epoll自然比select和poll更适合大数量fd的场景。
- static int ep_send_events(struct eventpoll *ep,
- struct epoll_event __user *events, int maxevents)
- {
- struct ep_send_events_data esed;
-
- esed.maxevents = maxevents;
- esed.events = events;
-
- return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
- }
/*
 * Packs the user-space event buffer pointer and maxevents into an
 * ep_send_events_data, then passes it to ep_scan_ready_list() together
 * with the ep_send_events_proc callback. Returns whatever
 * ep_scan_ready_list() returns.
 */
static int ep_send_events(struct eventpoll *ep,
struct epoll_event __user *events, int maxevents)
{
struct ep_send_events_data esed;
esed.maxevents = maxevents;
esed.events = events;
return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
}
现在大家应该明白select, poll和epoll的区别了吧!有人问既然select和poll有这么明显的缺陷,为什么不改掉kernel中的实现呢? 原因很简单:为了保持ABI的向后兼容。select和poll的ABI无法返回ready list,只能返回整个fd数组,所以用户只得再次遍历整个fd数组以找到哪些fd是有数据的。
epoll还包括 “Level-Triggered” 和 “Edge-Triggered”,这两个概念在这里就不多赘述了,因为"man epoll"里面解释的非常详细,还有使用epoll的example。