/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree
 * (i.e. each epitem is a single node of that red-black tree).
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
union {
/* RB tree node links this structure to the eventpoll RB tree */
struct rb_node rbn;
/* Used to free the struct epitem */
struct rcu_head rcu;
};
/* List header used to link this structure to the eventpoll ready list */
struct list_head rdllink;
/*
 * Works together with "struct eventpoll"->ovflist in keeping the
 * single linked chain of items.
 */
struct epitem *next;
/* The file descriptor information this item refers to (the associated fd) */
struct epoll_filefd ffd;
/* Number of active wait queues attached to poll operations */
int nwait;
/* List containing poll wait queues */
struct list_head pwqlist;
/* The "container" of this item */
struct eventpoll *ep;
/* List header used to link this item to the "struct file" items list */
struct list_head fllink;
/* wakeup_source used when EPOLLWAKEUP is set */
struct wakeup_source __rcu *ws;
/*
 * The structure that describes the interested events and the source fd
 * (i.e. which states of the monitored file descriptor we care about).
 */
struct epoll_event event;
};
/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
/* Protect the access to this structure */
spinlock_t lock;
/*
 * This mutex is used to ensure that files are not removed
 * while epoll is using them. This is held during the event
 * collection loop, the file cleanup path, the epoll file exit
 * code and the ctl operations.
 */
struct mutex mtx;
/*
 * Wait queue used by sys_epoll_wait().
 * Doubly linked list; this is the wait queue of the epoll file. A process
 * calling epoll_wait may sleep on this queue until it is woken by
 * ep_poll_callback() or until it times out.
 */
wait_queue_head_t wq;
/*
 * Wait queue used by file->poll().
 * Doubly linked list; poll_wait is the wakeup queue of the eventpoll file
 * itself. Processes sleeping on it are waiting for events on the
 * eventpoll file itself.
 */
wait_queue_head_t poll_wait;
/* List of ready file descriptors (the ready list) */
struct list_head rdllist;
/* RB tree root used to store monitored fd structs */
struct rb_root rbr;
/*
 * This is a single linked list that chains all the "struct epitem" that
 * happened while transferring ready events to userspace w/out
 * holding ->lock.
 * While events are being delivered to userspace, items whose file
 * descriptors become ready are temporarily parked on this list;
 * otherwise they are added directly to the ready list rdllist.
 */
struct epitem *ovflist;
/* wakeup_source used when ep_scan_ready_list is running */
struct wakeup_source *ws;
/* The user that created the eventpoll descriptor */
struct user_struct *user;
struct file *file;
/* used to optimize loop detection check */
int visited;
struct list_head visited_list_link;
};
/* Poll table: carries the queueing callback and the event mask of interest. */
typedef struct poll_table_struct {
poll_queue_proc _qproc; // event callback function
unsigned long _key; // event bit mask (determined by epi->event.events)
} poll_table;
/* Wait-queue hook used by the poll callback */
struct eppoll_entry {
struct list_head llink;// links this entry into the owning epitem (epi->pwqlist)
struct epitem *base; // the owning epitem
wait_queue_t wait; // the element placed on the wait queue
wait_queue_head_t *whead;// pointer to the head of the wait queue this entry is on
};
/*
 * A single wait-queue entry. In these notes it is the type embedded as
 * eppoll_entry.wait and passed to ep_poll_callback() as "wait"
 * (presumably wait_queue_t is a typedef of this struct -- confirm
 * against the wait-queue header).
 */
struct __wait_queue {
/* Flag bits; WQ_FLAG_EXCLUSIVE below is presumably one of them -- TODO confirm */
unsigned int flags;
#define WQ_FLAG_EXCLUSIVE 0x01
/* Opaque per-entry pointer; not used by the code shown in this file */
void *private;
/*
 * Callback for this entry; presumably what init_waitqueue_func_entry()
 * sets (to ep_poll_callback for epoll entries) -- verify.
 */
wait_queue_func_t func;
/* List linkage of the entry; ep_poll_callback() unlinks it with list_del_init() */
struct list_head task_list;
};
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event)))
goto error_return;
epi = ep_find(ep, tf.file, fd);
static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
int kcmp;
struct rb_node *rbp;
struct epitem *epi, *epir = NULL;
struct epoll_filefd ffd;
ep_set_ffd(&ffd, file, fd);
for (rbp = ep->rbr.rb_node; rbp; ) {
epi = rb_entry(rbp, struct epitem, rbn);
kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
if (kcmp > 0)
rbp = rbp->rb_right;
else if (kcmp < 0)
rbp = rbp->rb_left;
else {
epir = epi;
break;
}
}
return epir;
}
switch (op) {
case EPOLL_CTL_ADD:
break;
case EPOLL_CTL_DEL:
break;
case EPOLL_CTL_MOD:
break;
}
error = ep_insert(ep, &epds, tf.file, fd, full_check);
注册ep_ptable_queue_proc()函数
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
在注册回调函数 ep_ptable_queue_proc()中
/*
 * Queueing callback registered on the poll table via init_poll_funcptr().
 *
 * Builds an eppoll_entry for the epitem behind @pt and hooks it onto the
 * target file's wait queue @whead, with ep_poll_callback() as the function
 * to run for that entry. The entry is also linked into epi->pwqlist and
 * epi->nwait is incremented.
 *
 * If epi->nwait is already negative, or the allocation fails, nothing is
 * queued and epi->nwait is set to -1 so the caller can detect the failure.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *entry = NULL;

	/* Only try to allocate while no earlier failure has been recorded. */
	if (epi->nwait >= 0)
		entry = kmem_cache_alloc(pwq_cache, GFP_KERNEL);

	if (!entry) {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
		return;
	}

	/*
	 * ep_poll_callback is what later moves the epitem described by this
	 * hook onto the eventpoll's ready queue.
	 */
	init_waitqueue_func_entry(&entry->wait, ep_poll_callback);
	entry->whead = whead;
	entry->base = epi;

	/*
	 * Actually hang the entry on the target file's wait queue; when the
	 * file raises an event, each entry on that queue has its
	 * ep_poll_callback invoked in turn.
	 */
	add_wait_queue(whead, &entry->wait);
	list_add_tail(&entry->llink, &epi->pwqlist);
	epi->nwait++;
}
回调函数ep_poll_callback()的实现过程(在对应的文件描述符中有事件发生时,将会被调用)
/*
 * Wait-queue callback installed by ep_ptable_queue_proc() for every
 * eppoll_entry; it runs when the monitored file signals an event.
 *
 * @wait: the wait-queue entry embedded in the eppoll_entry; used to recover
 *        the owning epitem (and through it the eventpoll).
 * @mode: not used by this function.
 * @sync: not used by this function.
 * @key:  event bits reported by the waker, or NULL when the device does not
 *        report them; also carries POLLFREE.
 *
 * Under ep->lock the item is either chained onto ep->ovflist (while that
 * list is active) or appended to ep->rdllist if it is not already linked
 * there, and in the latter path sleepers on ep->wq are woken. Waiters on
 * ep->poll_wait are woken after the lock is dropped.
 *
 * Always returns 1.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int pwake = 0;
unsigned long flags;
struct epitem *epi = ep_item_from_wait(wait);
struct eventpoll *ep = epi->ep;
if ((unsigned long)key & POLLFREE) {
ep_pwq_from_wait(wait)->whead = NULL;
/*
 * whead = NULL above can race with ep_remove_wait_queue()
 * which can do another remove_wait_queue() after us, so we
 * can't use __remove_wait_queue(). whead->lock is held by
 * the caller.
 *
 * NOTE(review): after whead is cleared here this function still goes
 * on to use epi and ep below; confirm that a concurrent teardown path
 * cannot free them in that window.
 */
list_del_init(&wait->task_list);
}
spin_lock_irqsave(&ep->lock, flags);
/*
 * If the event mask does not contain any poll(2) event, we consider the
 * descriptor to be disabled. This condition is likely the effect of the
 * EPOLLONESHOT bit that disables the descriptor when an event is received,
 * until the next EPOLL_CTL_MOD will be issued.
 */
if (!(epi->event.events & ~EP_PRIVATE_BITS))
goto out_unlock;
/*
 * Check the events coming with the callback. At this stage, not
 * every device reports the events in the "key" parameter of the
 * callback. We need to be able to handle both cases here, hence the
 * test for "key" != NULL before the event match test.
 */
if (key && !((unsigned long) key & epi->event.events))
goto out_unlock;
/*
 * If we are transferring events to userspace, we can hold no locks
 * (because we're accessing user memory, and because of linux f_op->poll()
 * semantics). All the events that happen during that period of time are
 * chained in ep->ovflist and requeued later on.
 */
if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
if (epi->next == EP_UNACTIVE_PTR) {
/*
 * The event raised on the target file matches what epi is listening
 * for, so chain epi onto the eventpoll's ovflist.
 */
epi->next = ep->ovflist;
ep->ovflist = epi;
if (epi->ws) {
/*
 * Activate ep->ws since epi->ws may get
 * deactivated at any time.
 */
__pm_stay_awake(ep->ws);
}
}
goto out_unlock;
}
/* If this file is already in the ready list we exit soon */
if (!ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake_rcu(epi);
}
/*
 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 * wait list.
 */
if (waitqueue_active(&ep->wq))
wake_up_locked(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
out_unlock:
spin_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&ep->poll_wait);
return 1;
}
/*
 * Register a new file descriptor with an eventpoll instance.
 *
 * @ep:         the eventpoll the item is added to.
 * @event:      the events of interest; copied into the new epitem.
 * @tfile:      the target file being monitored.
 * @fd:         the file descriptor number of @tfile.
 * @full_check: when non-zero, reverse_path_check() is run after the item
 *              has been inserted into the RB tree.
 *
 * Per the comments below, this is called with ep->mtx held.
 *
 * Returns 0 on success, -ENOSPC when the user's watch count has reached
 * max_user_watches, -ENOMEM when the epitem or a poll wait-queue entry
 * cannot be allocated, -EINVAL when reverse_path_check() fails, or the
 * error returned by ep_create_wakeup_source().
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
struct file *tfile, int fd, int full_check)
{
int error, revents, pwake = 0;
unsigned long flags;
long user_watches;
struct epitem *epi;
struct ep_pqueue epq;
/*
 * Check whether the number of file descriptors watched through epoll
 * has reached max_user_watches, which holds how many file descriptors
 * each user may watch via epoll.
 */
user_watches = atomic_long_read(&ep->user->epoll_watches);
if (unlikely(user_watches >= max_user_watches))
return -ENOSPC;
/*
 * Every file added to epoll is attached to an epitem instance;
 * allocate the epitem for the current file.
 */
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
return -ENOMEM;
/* Item initialization follow here ... */
INIT_LIST_HEAD(&epi->rdllink);
INIT_LIST_HEAD(&epi->fllink);
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
epi->event = *event;
epi->nwait = 0;
epi->next = EP_UNACTIVE_PTR;
if (epi->event.events & EPOLLWAKEUP) {
error = ep_create_wakeup_source(epi);
if (error)
goto error_create_wakeup_source;
} else {
RCU_INIT_POINTER(epi->ws, NULL);
}
/* Initialize the poll table using the queue callback (a function pointer) */
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
/*
 * Attach the item to the poll hooks and get current event bits.
 * We can safely use the file* here because its usage count has
 * been increased by the caller of this function. Note that after
 * this operation completes, the poll callback can start hitting
 * the new item.
 */
/*
 * If fd is a socket, f_op is socket_file_ops and the poll function is
 * sock_poll(); for a TCP socket that in turn calls tcp_poll().
 * The poll function is called here to fetch the current state of the
 * file descriptor, which is stored in revents.
 * The poll handler (tcp_poll()) calls sock_poll_wait(), and
 * sock_poll_wait() ends up calling the function epq.pt.qproc points to,
 * i.e. ep_ptable_queue_proc().
 */
revents = ep_item_poll(epi, &epq.pt);
/*
 * We have to check if something went wrong during the poll wait queue
 * install process. Namely an allocation for a wait queue failed due
 * high memory pressure.
 */
/*
 * ep_ptable_queue_proc() sets nwait to -1 when its memory
 * allocation fails.
 */
error = -ENOMEM;
if (epi->nwait < 0)
goto error_unregister;
/* Add the current item to the list of active epoll hook for this file */
spin_lock(&tfile->f_lock);
list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
spin_unlock(&tfile->f_lock);
/*
 * Add the current item to the RB tree. All RB tree operations are
 * protected by "mtx", and ep_insert() is called with "mtx" held.
 */
ep_rbtree_insert(ep, epi);
/* now check if we've created too many backpaths */
error = -EINVAL;
if (full_check && reverse_path_check())
goto error_remove_epi;
/* We have to drop the new item inside our item list to keep track of it */
spin_lock_irqsave(&ep->lock, flags);
/* If the file is already "ready" we drop it inside the ready list */
if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
/* Notify waiting tasks that events are available */
if (waitqueue_active(&ep->wq))
wake_up_locked(&ep->wq);
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
spin_unlock_irqrestore(&ep->lock, flags);
atomic_long_inc(&ep->user->epoll_watches);
/* We have to call this outside the lock */
if (pwake)
ep_poll_safewake(&ep->poll_wait);
return 0;
/*
 * Error unwinding: each label undoes the steps completed before the
 * corresponding failure point and falls through to the labels below it.
 */
error_remove_epi:
spin_lock(&tfile->f_lock);
list_del_rcu(&epi->fllink);
spin_unlock(&tfile->f_lock);
rb_erase(&epi->rbn, &ep->rbr);
error_unregister:
ep_unregister_pollwait(ep, epi);
/*
 * We need to do this because an event could have been arrived on some
 * allocated wait queue. Note that we don't care about the ep->ovflist
 * list, since that is used/cleaned only inside a section bound by "mtx".
 * And ep_insert() is called with "mtx" held.
 */
spin_lock_irqsave(&ep->lock, flags);
if (ep_is_linked(&epi->rdllink))
list_del_init(&epi->rdllink);
spin_unlock_irqrestore(&ep->lock, flags);
wakeup_source_unregister(ep_wakeup_source(epi));
error_create_wakeup_source:
kmem_cache_free(epi_cache, epi);
return error;
}
error = ep_remove(ep, epi);
error = ep_modify(ep, epi, &epds);
fdput(tf);
fdput(f);