本文将介绍内核epoll实现的原理。基于kernel 2.6.32版本。
本文只描述epoll对其他fd的监听,由于epoll本身也是一种文件系统,也可以被监听,这一部分不在这里介绍。
epoll中主要数据结构有两个,一个是epoll_create创建的epoll_fd的结构体eventpoll,一个是事件源对应的epitem结构体。
eventpoll的数据结构及相应解释如下。这里需要注意的是,eventpoll中其实有两个ready list:一个是常规的rdllist,另一个是ovflist。两者的区别是:当epoll正在把ready list中的事件发送到用户空间时,设备状态改变触发的回调不能直接把事件添加到ready list,而是先添加到ovflist;等ready list处理完后,再把ovflist中的事件移回ready list。ovflist相当于一个备用队列。
/*
 * Per-instance epoll state. According to the surrounding article, one of
 * these is created by epoll_create() and stored as the epoll file's
 * private_data.
 */
struct eventpoll {
	/* Protects access to this structure */
	spinlock_t lock;

	/*
	 * This mutex is used to ensure that files are not removed
	 * while epoll is using them. This is held during the event
	 * collection loop, the file cleanup path, the epoll file exit
	 * code and the ctl operations.
	 */
	struct mutex mtx;

	/*
	 * Wait queue used by sys_epoll_wait(): ep_poll() queues the calling
	 * task here while it gives up the CPU waiting for an event or for the
	 * timeout to expire.
	 */
	wait_queue_head_t wq;

	/*
	 * Wait queue used by file->poll(), i.e. by the poll operation of the
	 * epoll file type itself.
	 */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors (items whose events have fired) */
	struct list_head rdllist;

	/*
	 * RB tree root used to store monitored fd structs. It manages every
	 * event source; per the article, epoll_ctl uses it to look up an
	 * epitem.
	 */
	struct rb_root rbr;

	/*
	 * This is a single linked list that chains all the "struct epitem" that
	 * happened while transferring ready events to userspace w/out
	 * holding ->lock.
	 *
	 * While epoll is handing events back to the user
	 * (ep_send_events_proc), a device state change reported through
	 * ep_poll_callback() is not added to the ready list directly; it is
	 * parked on ovflist instead. When ep_send_events_proc finishes,
	 * ep_scan_ready_list() moves the ovflist entries back onto the
	 * ready list.
	 */
	struct epitem *ovflist;

	/*
	 * The user that created the eventpoll descriptor. ep_insert() checks
	 * and increments user->epoll_watches, so this is mainly used to count
	 * how many fds are currently being watched.
	 */
	struct user_struct *user;

	/* The struct file that corresponds to this epoll fd (per the article). */
	struct file *file;

	/*
	 * used to optimize loop detection check
	 *
	 * Per the article: used for the check performed when one epoll fd is
	 * added to another epoll instance.
	 */
	int visited;
	struct list_head visited_list_link;
};
epitem结构体及相应解释如下:
struct epitem { /* RB tree node used to link this structure to the eventpoll RB tree */ /* 记录在struct eventpoll中的rbr节点。 */ struct rb_node rbn; /* List header used to link this structure to the eventpoll ready list */ /* 记录在struct eventpoll中的rdllist节点。 */ struct list_head rdllink; /* * Works together "struct eventpoll"->ovflist in keeping the * single linked chain of items. */ /* 记录在struct eventpoll中的ovflist节点,ovflist用处见eventpoll。 */ struct epitem *next; /* The file descriptor information this item refers to */ /* 文件描述符和对应的file结构体的封装 * struct epoll_filefd { * struct file *file; * int fd; * }; */ struct epoll_filefd ffd; /* Number of active wait queue attached to poll operations */ int nwait; /* List containing poll wait queues */ /* * struct eppoll_entry { * /* List header used to link this structure to the "struct epitem" */ * struct list_head llink; * /* The "base" pointer is set to the container "struct epitem" */ * struct epitem *base; * /* * * Wait queue item that will be linked to the target file wait * * queue head. * */ * wait_queue_t wait; * /* The wait queue head that linked the "wait" wait queue item */ * wait_queue_head_t *whead; * }; * struct eppoll_entry节点,该节点在每个事件源在相应的设备中注册(ep_ptable_queue_proc)时候创建, * 结构体中主要封装了当前事件源对应epitem,事件源在相应设备系统中的钩子和队列 * 当设备状态改变回调时,将通过eppoll_entry中的wait找到eppoll_entry结构体再 * 找到epitem(ep_item_from_wait) */ struct list_head pwqlist; /* The "container" of this item */ /* 记录epitem所在的eventpoll。 */ struct eventpoll *ep; /* List header used to link this item to the "struct file" items list */ /* struct file中的f_ep_links节点,好像是用作递归深度的检测,暂时没懂。 */ struct list_head fllink; /* The structure that describe the interested events and the source fd */ /* 用户监听的事件类型。 */ struct epoll_event event; };
epoll简单流程和reactor模式有一些相似:通过epoll来对事件源(用户关注的某个设备的某个状态)进行管理,每添加一个事件源,都会在对应设备上进行注册。当事件源有用户所关注的事件触发时,就在设备的唤醒回调中把它加入到epoll的ready list。当用户调用epoll_wait时,如果暂时没有事件触发,进程就交出调度权并等待,直到超时或被事件唤醒;进程被唤醒后如果有事件,就将事件通知给用户。简单示意图如下:
当然内在还有其他更多的细节处理,下文会描述。
epoll_create将创建一个属于epoll文件系统的file,同时创建一个eventpoll结构体,作为file的private_data。这里只需注意eventpoll的初始化函数ep_alloc中对ovflist的处理:当ovflist的值为EP_UNACTIVE_PTR时,表示不使用该队列;当该队列开放使用时,会被重新置为NULL(即0)。
epoll_ctl执行时,对于非删除事件需要把用户空间的数据拷贝到内核空间,然后根据epfd获取对应的eventpoll结构体,根据fd获取事件源对应file结构体,对这些数据进行基本的校验。当添加事件源时,需要额外判断事件源是否是epoll,是的话需要额外检查,这个系统调用外层比较直观,这里主要讲述下三个子操作,ep_insert,ep_remove,ep_modify。
/*
 * ep_insert - register a new event source with an epoll instance.
 *
 * @ep:    the epoll instance the new item is added to.
 * @event: interest mask and user data; copied by value into the new item.
 * @tfile: struct file of the descriptor being watched.
 * @fd:    number of the descriptor being watched.
 *
 * Returns 0 on success, -ENOSPC when the per-user watch limit has been
 * reached and -ENOMEM when the epitem cannot be allocated. On the two
 * later failure paths "error" is set to -ENOMEM (wait queue registration
 * failed) or -EINVAL (reverse_path_check() failed) before jumping to the
 * cleanup labels, which this listing elides.
 *
 * Per the comment inside, this is called with ep->mtx held.
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	struct epitem *epi;
	struct ep_pqueue epq;

	/* Has the user reached the limit on the number of watches? */
	if (unlikely(atomic_read(&ep->user->epoll_watches) >=
		     max_user_watches))
		return -ENOSPC;
	/* Allocate the epitem. */
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialization follow here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;

	/*
	 * Initialize the poll table using the queue callback.
	 *
	 * An on-stack ep_pqueue is used to register with the device
	 * (ep_ptable_queue_proc). Per the article, the callback runs
	 * immediately, during the f_op->poll() call below, so stack
	 * storage is sufficient. This is the only f_op->poll() call in
	 * this listing that passes a poll table; the others pass NULL and
	 * therefore only read the current state.
	 */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * high memory pressure.
	 */
	error = -ENOMEM;
	/* ep_ptable_queue_proc() sets nwait to -1 when registration fails. */
	if (epi->nwait < 0)
		goto error_unregister;

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_lock);
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 *
	 * From here on the item is managed by the eventpoll structure.
	 */
	ep_rbtree_insert(ep, epi);

	/*
	 * now check if we've created too many backpaths
	 *
	 * NOTE(article author): this checks wakeup paths and seems to be
	 * related to adding one epoll fd to another epoll fd; not yet
	 * understood.
	 */
	error = -EINVAL;
	if (reverse_path_check())
		goto error_remove_epi;

	/* We have to drop the new item inside our item list to keep track of it */
	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * If the file is already "ready" we drop it inside the ready list:
	 * when the watched event has already fired, queue the item on the
	 * ready list and wake up epoll.
	 */
	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	spin_unlock_irqrestore(&ep->lock, flags);

	/* Account for the new watch; pairs with the limit check at entry. */
	atomic_inc(&ep->user->epoll_watches);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;

	/*
	 * NOTE: the article elides the error-handling tail here. The labels
	 * error_remove_epi and error_unregister that the gotos above target
	 * are not shown in this listing.
	 */
…… exception handler ……
}
在ep_insert中注意向设备poll注册的函数ep_ptable_queue_proc:当事件源向设备注册时,如果注册成功,设备信息和epitem及回调函数信息都会封装在eppoll_entry中。
/*
 * This is the callback that is used to add our wait queue to the
 * target file wakeup lists.
 *
 * @file:  the file being polled (not used in this function).
 * @whead: the wait queue head the new entry is hooked onto.
 * @pt:    poll table; the owning epitem is recovered from it through
 *         ep_item_from_epqueue().
 *
 * On success a new eppoll_entry is linked both onto @whead and onto the
 * epitem's pwqlist, and epi->nwait is incremented. On failure epi->nwait
 * is set to -1, which ep_insert() checks after f_op->poll() returns.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	/*
	 * Skip the allocation once an earlier registration for this item has
	 * already failed (nwait < 0); the && short-circuits in that case.
	 */
	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		/* ep_poll_callback() is installed as the entry's wakeup function. */
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		pwq->whead = whead;
		pwq->base = epi;
		add_wait_queue(whead, &pwq->wait);
		list_add_tail(&pwq->llink, &epi->pwqlist);
		epi->nwait++;
	} else {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
	}
}
在这里先分析下设备唤醒的回调函数ep_poll_callback,看看设备状态改变后epoll所执行的回调,分析如下:
主要注意的是当设备有用户所关注的事件被激活时,需要根据当前epoll所处的时机决定添加到ready list还是ovflist中。
/*
 * This is the callback that is passed to the wait queue wakeup
 * mechanism. It is called by the stored file descriptors when they
 * have events to report.
 *
 * @wait: the wait queue entry that fired; the epitem is recovered from it
 *        through ep_item_from_wait().
 * @mode: not used in this function.
 * @sync: not used in this function.
 * @key:  when non-NULL, the event bits reported by the device.
 *
 * Depending on what epoll is doing at the moment, the item is chained on
 * ep->ovflist or added to ep->rdllist. Always returns 1.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;

	/*
	 * NOTE(article author): the POLLFREE call sites seem to be related to
	 * fork and threads; not yet investigated.
	 */
	if ((unsigned long)key & POLLFREE) {
		ep_pwq_from_wait(wait)->whead = NULL;
		/*
		 * whead = NULL above can race with ep_remove_wait_queue()
		 * which can do another remove_wait_queue() after us, so we
		 * can't use __remove_wait_queue(). whead->lock is held by
		 * the caller.
		 */
		list_del_init(&wait->task_list);
	}

	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * If the event mask does not contain any poll(2) event, we consider the
	 * descriptor to be disabled. This condition is likely the effect of the
	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
	 * until the next EPOLL_CTL_MOD will be issued.
	 *
	 * The article quotes:
	 *   #define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
	 * so when only trigger-mode bits are set and no real event type is
	 * requested, nothing is done.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * Check the events coming with the callback. At this stage, not
	 * every device reports the events in the "key" parameter of the
	 * callback. We need to be able to handle both cases here, hence the
	 * test for "key" != NULL before the event match test.
	 *
	 * Event types the user is not interested in are ignored.
	 */
	if (key && !((unsigned long) key & epi->event.events))
		goto out_unlock;

	/*
	 * If we are transferring events to userspace, we can hold no locks
	 * (because we're accessing user memory, and because of linux f_op->poll()
	 * semantics). All the events that happens during that period of time are
	 * chained in ep->ovflist and requeued later on.
	 *
	 * ovflist != EP_UNACTIVE_PTR means ep_scan_ready_list() has opened
	 * ovflist and is working on the ready list, so the item is pushed
	 * onto the backup ovflist instead of the ready list. The
	 * epi->next check keeps an item from being chained twice.
	 */
	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
		if (epi->next == EP_UNACTIVE_PTR) {
			epi->next = ep->ovflist;
			ep->ovflist = epi;
		}
		goto out_unlock;
	}

	/*
	 * If this file is already in the ready list we exit soon;
	 * otherwise add the item to the ready list.
	 */
	if (!ep_is_linked(&epi->rdllink))
		list_add_tail(&epi->rdllink, &ep->rdllist);

	/*
	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
	 * wait list.
	 */
	if (waitqueue_active(&ep->wq))
		wake_up_locked(&ep->wq);
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	spin_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 1;
}
ep_modify的逻辑比较简单,更新事件类型后获取一次设备状态,如果已经激活就添加到ready list,并激活epoll。
/*
 * Modify the interest event mask by dropping an event if the new mask
 * has a match in the current file status. Must be called with "mtx" held.
 *
 * @ep:    the epoll instance that owns @epi.
 * @epi:   the item whose interest mask and user data are replaced.
 * @event: the new interest mask and user data.
 *
 * Always returns 0.
 */
static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
{
	int pwake = 0;
	unsigned int revents;

	/*
	 * Set the new event interest mask before calling f_op->poll();
	 * otherwise we might miss an event that happens between the
	 * f_op->poll() call and the new event set registering.
	 */
	epi->event.events = event->events; /* need barrier below */
	epi->event.data = event->data; /* protected by mtx */

	/*
	 * The following barrier has two effects:
	 *
	 * 1) Flush epi changes above to other CPUs.  This ensures
	 *    we do not miss events from ep_poll_callback if an
	 *    event occurs immediately after we call f_op->poll().
	 *    We need this because we did not take ep->lock while
	 *    changing epi above (but ep_poll_callback does take
	 *    ep->lock).
	 *
	 * 2) We also need to ensure we do not miss _past_ events
	 *    when calling f_op->poll().  This barrier also
	 *    pairs with the barrier in wq_has_sleeper (see
	 *    comments for wq_has_sleeper).
	 *
	 * This barrier will now guarantee ep_poll_callback or f_op->poll
	 * (or both) will notice the readiness of an item.
	 */
	smp_mb();

	/*
	 * Get current event bits. We can safely use the file* here because
	 * its usage count has been increased by the caller of this function.
	 *
	 * ep_insert() already registered the wait queue hook, so NULL is
	 * passed here to only read the current state.
	 */
	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);

	/*
	 * If the item is "hot" and it is not registered inside the ready
	 * list, push it inside.
	 *
	 * The event has already fired, so the item goes on the ready list and
	 * epoll is woken up. ovflist is not needed here: ovflist is only
	 * open while ep_scan_ready_list() holds "mtx", and this function is
	 * itself called with "mtx" held, so epoll cannot be in the middle of
	 * returning data from epoll_wait.
	 */
	if (revents & event->events) {
		spin_lock_irq(&ep->lock);
		if (!ep_is_linked(&epi->rdllink)) {
			list_add_tail(&epi->rdllink, &ep->rdllist);

			/* Notify waiting tasks that events are available */
			if (waitqueue_active(&ep->wq))
				wake_up_locked(&ep->wq);
			if (waitqueue_active(&ep->poll_wait))
				pwake++;
		}
		spin_unlock_irq(&ep->lock);
	}

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;
}
ep_remove就是删除该事件源对应的epitem和其对应的eppoll_entry(即对应设备的等待队列和回调),代码比较明了,对应eppoll_entry的删除过程为ep_unregister_pollwait ----> ep_remove_wait_queue。
epoll_wait是用户调用的主要函数。系统调用入口处主要是对maxevents进行校验,并通过epfd获取file结构体进而获取eventpoll结构体。主逻辑在ep_poll函数中实现。
ep_poll代码比较简短,也很清晰:先计算超时时间;当没有事件触发时,把当前进程挂到eventpoll的等待队列wq上,并按超时时间让出当前调度权,等待超时结束或者事件到来,再根据结果进行处理。
/*
 * ep_poll - wait for events on an epoll instance and deliver them.
 *
 * @ep:        the epoll instance to wait on.
 * @events:    user-space buffer handed on to ep_send_events().
 * @maxevents: capacity of @events, handed on to ep_send_events().
 * @timeout:   timeout in milliseconds; a negative value, or one that is
 *             >= EP_MAX_MSTIMEO, is mapped to MAX_SCHEDULE_TIMEOUT.
 *
 * Returns -EINTR when a signal is pending during the wait, otherwise the
 * value returned by ep_send_events(), or 0 when no events were available.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res, eavail;
	unsigned long flags;
	long jtimeout;
	wait_queue_t wait;

	/*
	 * Calculate the timeout by checking for the "infinite" value (-1)
	 * and the overflow condition. The passed timeout is in milliseconds,
	 * that why (t * HZ) / 1000.
	 */
	jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
		MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;

retry:
	spin_lock_irqsave(&ep->lock, flags);

	res = 0;
	/* When the ready list is not empty the sleep below is skipped. */
	if (list_empty(&ep->rdllist)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be wake up by
		 * ep_poll_callback() when events will become available.
		 *
		 * An on-stack wait entry for the current task is queued on
		 * the eventpoll's wq.
		 */
		init_waitqueue_entry(&wait, current);
		wait.flags |= WQ_FLAG_EXCLUSIVE;
		__add_wait_queue(&ep->wq, &wait);

		/* Give up the CPU until an event arrives or the timeout expires. */
		for (;;) {
			/*
			 * We don't want to sleep if the ep_poll_callback() sends us
			 * a wakeup in between. That's why we set the task state
			 * to TASK_INTERRUPTIBLE before doing the checks.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (!list_empty(&ep->rdllist) || !jtimeout)
				break;
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			/* ep->lock is dropped around the sleep and retaken afterwards. */
			spin_unlock_irqrestore(&ep->lock, flags);
			jtimeout = schedule_timeout(jtimeout);
			spin_lock_irqsave(&ep->lock, flags);
		}
		__remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}
	/* Is it worth to try to dig for events ? */
	eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;

	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 *
	 * ep_send_events() is only called when no error was recorded and
	 * events look available; its result becomes the return value.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
		goto retry;

	return res;
}
这里主要再讲述下ep_send_events里的两个函数,ep_scan_ready_list和在ep_send_events中使用的函数指针ep_send_events_proc。
先看ep_scan_ready_list,这里会根据调用点传递进来的sproc去扫描ready list,在扫描前,会先把ready list移到另一个链表txlist中,再去扫描txlist,扫描前同时还开放ovflist的访问权限,扫描过程中触发的事件都会添加到ovflist中,在扫描完成时添加到ready list中,同时扫描完成后残余的激活事件也会重新接入回ready list中,如果有事件,还会重新唤醒epoll。
这里使用txlist的原因可能是对于非边缘触发的方式,需要重新添加回ready list,如果使用ready list遍历,则需要一个标志来判断是否是已询问过,并重新添加回ready list的事件,因此这里直接把list分离,遍历和ready list是两个不同的list,节约标志位的空间也节约了判断的时间。
/**
 * ep_scan_ready_list - Scans the ready list in a way that makes possible for
 *                      the scan code, to call f_op->poll(). Also allows for
 *                      O(NumReady) performance.
 *
 * @ep: Pointer to the epoll private data structure.
 * @sproc: Pointer to the scan callback.
 * @priv: Private opaque data passed to the @sproc callback.
 * @depth: The current depth of recursive f_op->poll calls.
 *
 * Returns: The same integer error code returned by the @sproc callback.
 */
static int ep_scan_ready_list(struct eventpoll *ep,
			      int (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
			      void *priv, int depth)
{
	int error, pwake = 0;
	unsigned long flags;
	struct epitem *epi, *nepi;
	LIST_HEAD(txlist);

	/*
	 * We need to lock this because we could be hit by
	 * eventpoll_release_file() and epoll_ctl().
	 */
	mutex_lock_nested(&ep->mtx, depth);

	/*
	 * Steal the ready list, and re-init the original one to the
	 * empty list. Also, set ep->ovflist to NULL so that events
	 * happening while looping w/out locks, are not lost. We cannot
	 * have the poll callback to queue directly on ep->rdllist,
	 * because we want the "sproc" callback to be able to do it
	 * in a lockless way.
	 *
	 * The ready list is about to be processed, so the backup queue
	 * ovflist is opened here. For (part of) the time spent in this
	 * function the ready list is closed to the poll callback, and newly
	 * arriving events are parked on ovflist until sproc has finished.
	 * NULL means ovflist is usable, EP_UNACTIVE_PTR means it is not.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	/* Move the whole ready list over to txlist. */
	list_splice_init(&ep->rdllist, &txlist);
	ep->ovflist = NULL;
	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Now call the callback function.
	 */
	error = (*sproc)(ep, &txlist, priv);

	spin_lock_irqsave(&ep->lock, flags);
	/*
	 * During the time we spent inside the "sproc" callback, some
	 * other events might have been queued by the poll callback.
	 * We re-insert them inside the main ready-list here.
	 *
	 * Each item taken off ovflist has its ->next reset to
	 * EP_UNACTIVE_PTR by the loop's step expression.
	 */
	for (nepi = ep->ovflist; (epi = nepi) != NULL;
	     nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
		/*
		 * We need to check if the item is already in the list.
		 * During the "sproc" callback execution time, items are
		 * queued into ->ovflist but the "txlist" might already
		 * contain them, and the list_splice() below takes care of them.
		 */
		if (!ep_is_linked(&epi->rdllink))
			list_add_tail(&epi->rdllink, &ep->rdllist);
	}
	/*
	 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
	 * releasing the lock, events will be queued in the normal way inside
	 * ep->rdllist. This closes ovflist again.
	 */
	ep->ovflist = EP_UNACTIVE_PTR;

	/*
	 * Quickly re-inject items left on "txlist".
	 *
	 * Whatever sproc left unprocessed is spliced back onto rdllist;
	 * txlist can still hold entries at this point, for example after a
	 * failed copy to user space.
	 */
	list_splice(&txlist, &ep->rdllist);

	if (!list_empty(&ep->rdllist)) {
		/*
		 * Wake up (if active) both the eventpoll wait list and
		 * the ->poll() wait list (delayed after we release the lock).
		 */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}
	spin_unlock_irqrestore(&ep->lock, flags);

	mutex_unlock(&ep->mtx);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return error;
}
ep_send_events_proc函数处理ready list中的事件,并拷贝数据回到用户空间。
这里就是把链表中的节点取出来拷贝必要信息给用户空间,这里主要注意的是对于非边缘触发的处理方式,会重新添加回ready list,以便下次epoll_wait的时候可以正常激活对应事件。
/*
 * ep_send_events_proc - copy ready events from a private list to user space.
 *
 * @ep:   the epoll instance; level-triggered items are re-queued on its
 *        rdllist.
 * @head: the list to drain; per the article this is the txlist built by
 *        ep_scan_ready_list().
 * @priv: a struct ep_send_events_data carrying the user buffer ("events")
 *        and its capacity ("maxevents").
 *
 * Returns the number of events copied out. When a copy to user space
 * fails, returns the count copied so far, or -EFAULT if that count is 0.
 */
static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
			       void *priv)
{
	/* struct ep_send_events_data wraps the user-supplied data. */
	struct ep_send_events_data *esed = priv;
	int eventcnt;
	unsigned int revents;
	struct epitem *epi;
	struct epoll_event __user *uevent;

	/*
	 * We can loop without lock because we are passed a task private list.
	 * Items cannot vanish during the loop because ep_scan_ready_list() is
	 * holding "mtx" during this call.
	 */
	for (eventcnt = 0, uevent = esed->events;
	     !list_empty(head) && eventcnt < esed->maxevents;) {
		/* Take the next item off head. */
		epi = list_first_entry(head, struct epitem, rdllink);

		list_del_init(&epi->rdllink);

		/*
		 * Re-read the current state and mask it with the interest set;
		 * events the user did not ask for are not reported.
		 */
		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
			epi->event.events;

		/*
		 * If the event mask intersect the caller-requested one,
		 * deliver the event to userspace. Again, ep_scan_ready_list()
		 * is holding "mtx", so no operations coming from userspace
		 * can change the item.
		 */
		if (revents) {
			if (__put_user(revents, &uevent->events) ||
			    __put_user(epi->event.data, &uevent->data)) {
				/*
				 * The copy failed: put the item back on head,
				 * which the caller splices back onto the
				 * ready list.
				 */
				list_add(&epi->rdllink, head);
				return eventcnt ? eventcnt : -EFAULT;
			}
			eventcnt++;
			uevent++;
			if (epi->event.events & EPOLLONESHOT)
				epi->event.events &= EP_PRIVATE_BITS;
			else if (!(epi->event.events & EPOLLET)) {
				/*
				 * If this file has been added with Level
				 * Trigger mode, we need to insert back inside
				 * the ready list, so that the next call to
				 * epoll_wait() will check again the events
				 * availability. At this point, noone can insert
				 * into ep->rdllist besides us. The epoll_ctl()
				 * callers are locked out by
				 * ep_scan_ready_list() holding "mtx" and the
				 * poll callback will queue them in ep->ovflist.
				 *
				 * Without edge triggering the item must go back
				 * on the ready list, so the next epoll_wait can
				 * return even if the device sees no new input.
				 * Example: the first epoll_wait reports 1000
				 * readable bytes but the user reads only 100;
				 * at the second epoll_wait no new data arrives,
				 * yet 900 bytes are still buffered, the ready
				 * list is still non-empty and epoll_wait
				 * returns at once.
				 *
				 * The item is added to the eventpoll's ready
				 * list rather than to head, so that this loop
				 * is not fed the same item forever and can
				 * terminate.
				 */
				list_add_tail(&epi->rdllink, &ep->rdllist);
			}
		}
	}

	return eventcnt;
}