First, let's look at what epoll_create really is:
SYSCALL_DEFINE1(epoll_create, int, size)
{
    if (size <= 0)
        return -EINVAL;
    // in other words, the size argument is not actually used beyond this check
    return sys_epoll_create1(0);
}
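From user space this means any positive size behaves identically, while a non-positive size fails with EINVAL. A minimal sketch to confirm this (just an illustration, the printed values depend on your process):

#include <stdio.h>
#include <errno.h>
#include <sys/epoll.h>

int main(void)
{
    int a = epoll_create(1);      /* size is only checked for > 0, then ignored */
    int b = epoll_create(100000); /* behaves exactly the same as size == 1      */
    int c = epoll_create(0);      /* size <= 0: returns -1 with errno == EINVAL */

    printf("a=%d b=%d c=%d errno=%d (EINVAL=%d)\n", a, b, c, errno, EINVAL);
    return 0;
}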
Next, let's look at epoll_create1:
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
    int error, fd;
    struct eventpoll *ep = NULL;
    struct file *file;

    BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
    if (flags & ~EPOLL_CLOEXEC)
        return -EINVAL;
    error = ep_alloc(&ep);
    if (error < 0)
        return error;
    fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC));
    if (fd < 0) {
        error = fd;
        goto out_free_ep;
    }
    file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC));
    if (IS_ERR(file)) {
        error = PTR_ERR(file);
        goto out_free_fd;
    }
    ep->file = file;
    fd_install(fd, file);
    return fd;

out_free_fd:
    put_unused_fd(fd);
out_free_ep:
    ep_free(ep);
    return error;
}
1. For epoll, the only valid flag at the moment is EPOLL_CLOEXEC.
2. ep_alloc initializes the spinlock_t lock and the mutex lock (see the abridged ep_alloc sketch after the struct below).
3. Every time epoll_create1 produces an epollfd, the kernel allocates an eventpoll to go with it:
struct eventpoll {
    spinlock_t lock;
    // held when adding/modifying/removing fds, when epoll_wait returns, and when the kernel
    // copies data to user space, so operating on one epoll instance from multiple threads is
    // safe -- the kernel protects it
    struct mutex mtx;
    /* Wait queue used by sys_epoll_wait() */
    wait_queue_head_t wq;
    /* Wait queue used by file->poll() */
    wait_queue_head_t poll_wait;
    // every triggered epitem is put on this list
    struct list_head rdllist;
    // root of the red-black tree; every epitem being monitored lives in this tree,
    // so each tree node can be viewed as an epitem
    struct rb_root rbr;
    /* wakeup_source used when ep_scan_ready_list is running */
    struct wakeup_source *ws;
    /* The user that created the eventpoll descriptor */
    struct user_struct *user;
    struct file *file;
    /* used to optimize loop detection check */
    int visited;
    struct list_head visited_list_link;
};
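For point 2 above, an abridged sketch of ep_alloc (based on the 3.x sources this article follows, with error handling trimmed): it allocates the eventpoll and sets up the locks, the two wait queues, the ready list and the red-black tree root.

static int ep_alloc(struct eventpoll **pep)
{
    struct eventpoll *ep;

    ep = kzalloc(sizeof(*ep), GFP_KERNEL);
    if (unlikely(!ep))
        return -ENOMEM;

    spin_lock_init(&ep->lock);           /* protects rdllist and the wait queues      */
    mutex_init(&ep->mtx);                /* serializes epoll_ctl against epoll_wait   */
    init_waitqueue_head(&ep->wq);        /* tasks sleeping in epoll_wait()            */
    init_waitqueue_head(&ep->poll_wait); /* used when this epollfd is itself polled   */
    INIT_LIST_HEAD(&ep->rdllist);        /* ready list starts empty                   */
    ep->rbr = RB_ROOT;                   /* empty red-black tree of epitems           */
    ep->user = get_current_user();       /* charged against the max watches limit     */

    *pep = ep;
    return 0;
}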
4. Because an epollfd does not have a real file behind it the way a socket does, the kernel allocates a real file structure with a real fd and associates it with the epollfd:
struct file {
    // the eventpoll is stored here
    void *private_data;
    struct list_head f_ep_links;
};
In this way, the epollfd leads us to its struct file in the kernel, and that struct file leads us to the eventpoll stored inside it.
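Put together, recovering the eventpoll from an epollfd is just two hops, which is exactly what epoll_ctl does further below:

struct file *file = fget(epfd);            /* epollfd -> struct file      */
struct eventpoll *ep = file->private_data; /* struct file -> eventpoll    */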
5. Each fd added to an epoll instance is represented by an epitem:
struct epitem {
    /* RB tree node used to link this structure to the eventpoll RB tree */
    struct rb_node rbn;
    // when this item is triggered, it gets linked onto the eventpoll's rdllist mentioned above
    struct list_head rdllink;
    // the fd this epitem corresponds to, together with its real struct file
    struct epoll_filefd ffd;
    /* Number of active wait queue attached to poll operations */
    int nwait;
    /* List containing poll wait queues */
    struct list_head pwqlist;
    // the eventpoll this epitem belongs to
    struct eventpoll *ep;
    /* List header used to link this item to the "struct file" items list */
    struct list_head fllink;
    /* wakeup_source used when EPOLLWAKEUP is set */
    struct wakeup_source *ws;
    /* The structure that describe the interested events and the source fd */
    // the events this epitem cares about
    struct epoll_event event;
};
struct epoll_filefd {
    struct file *file;
    int fd;
};
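This (file, fd) pair is what keys the red-black tree: ep_find walks rbr comparing epoll_filefd values. Roughly, the kernel's helpers look like this (a sketch based on eventpoll.c; exact details may differ between versions):

static inline void ep_set_ffd(struct epoll_filefd *ffd,
                              struct file *file, int fd)
{
    ffd->file = file;
    ffd->fd = fd;
}

/* Compare RB tree keys: order by struct file pointer first, then by fd */
static inline int ep_cmp_ffd(struct epoll_filefd *p1,
                             struct epoll_filefd *p2)
{
    return (p1->file > p2->file ? +1 :
            (p1->file < p2->file ? -1 : p1->fd - p2->fd));
}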
Now let's look at epoll_ctl:
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
        struct epoll_event __user *, event)
{
    int error;
    int did_lock_epmutex = 0;
    struct file *file, *tfile;
    struct eventpoll *ep;
    struct epitem *epi;
    struct epoll_event epds;

    error = -EFAULT;
    if (ep_op_has_event(op) &&
        copy_from_user(&epds, event, sizeof(struct epoll_event)))
        goto error_return;

    /* Get the "struct file *" for the eventpoll file */
    error = -EBADF;
    // as described earlier: the epollfd gives us its struct file, and later
    // that file gives us the eventpoll
    file = fget(epfd);
    if (!file)
        goto error_return;

    /* Get the "struct file *" for the target file */
    tfile = fget(fd);
    if (!tfile)
        goto error_fput;

    /* The target file descriptor must support poll */
    error = -EPERM;
    if (!tfile->f_op || !tfile->f_op->poll)
        goto error_tgt_fput;

    /* Check if EPOLLWAKEUP is allowed */
    if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
        epds.events &= ~EPOLLWAKEUP;
    /*
     * We have to check that the file structure underneath the file descriptor
     * the user passed to us _is_ an eventpoll file. And also we do not permit
     * adding an epoll file descriptor inside itself.
     */
    error = -EINVAL;
    // epoll cannot monitor itself
    if (file == tfile || !is_file_epoll(file))
        goto error_tgt_fput;

    /*
     * At this point it is safe to assume that the "private_data" contains
     * our own data structure.
     */
    // this is where the file gives us the corresponding eventpoll
    ep = file->private_data;

    /*
     * When we insert an epoll file descriptor, inside another epoll file
     * descriptor, there is the change of creating closed loops, which are
     * better be handled here, than in more critical paths. While we are
     * checking for loops we also determine the list of files reachable
     * and hang them on the tfile_check_list, so we can check that we
     * haven't created too many possible wakeup paths.
     *
     * We need to hold the epmutex across both ep_insert and ep_remove
     * b/c we want to make sure we are looking at a coherent view of
     * epoll network.
     */
    if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
        mutex_lock(&epmutex);
        did_lock_epmutex = 1;
    }
    if (op == EPOLL_CTL_ADD) {
        if (is_file_epoll(tfile)) {
            error = -ELOOP;
            if (ep_loop_check(ep, tfile) != 0) {
                clear_tfile_check_list();
                goto error_tgt_fput;
            }
        } else
            list_add(&tfile->f_tfile_llink, &tfile_check_list);
    }
    mutex_lock_nested(&ep->mtx, 0);

    /*
     * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
     * above, we can be sure to be able to use the item looked up by
     * ep_find() till we release the mutex.
     */
    // at the API level we know an fd can only be added once; in the red-black
    // tree that corresponds to a single epitem
    epi = ep_find(ep, tfile, fd);

    error = -EINVAL;
    switch (op) {
    case EPOLL_CTL_ADD:
        if (!epi) {
            epds.events |= POLLERR | POLLHUP;
            error = ep_insert(ep, &epds, tfile, fd);
        } else
            error = -EEXIST;
        clear_tfile_check_list();
        break;
    case EPOLL_CTL_DEL:
        if (epi)
            error = ep_remove(ep, epi);
        else
            error = -ENOENT;
        break;
    case EPOLL_CTL_MOD:
        if (epi) {
            epds.events |= POLLERR | POLLHUP;
            error = ep_modify(ep, epi, &epds);
        } else
            error = -ENOENT;
        break;
    }
    mutex_unlock(&ep->mtx);

error_tgt_fput:
    if (did_lock_epmutex)
        mutex_unlock(&epmutex);

    fput(tfile);
error_fput:
    fput(file);
error_return:
    return error;
}
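Before looking at what ep_insert and friends do internally, this is what the path we just read looks like from user space (a minimal sketch; sockfd stands for any pollable descriptor you already have):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/epoll.h>

/* Register sockfd for edge-triggered readability on an existing epollfd. */
static void add_to_epoll(int epfd, int sockfd)
{
    struct epoll_event ev;

    memset(&ev, 0, sizeof(ev));
    ev.events = EPOLLIN | EPOLLET;   /* EPOLL_CTL_ADD ends up in ep_insert() */
    ev.data.fd = sockfd;

    if (epoll_ctl(epfd, EPOLL_CTL_ADD, sockfd, &ev) < 0) {
        perror("epoll_ctl(EPOLL_CTL_ADD)");  /* e.g. EEXIST if added twice */
        exit(1);
    }
}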
Here we can see clearly that the EPOLL_CTL_ADD, EPOLL_CTL_DEL and EPOLL_CTL_MOD operations are all protected by locks.

ep_insert uses the spinlock_t lock. Internally it first looks at the user member of the eventpoll to check the maximum number of watches allowed, then allocates an epitem and registers the callback ep_ptable_queue_proc. That callback runs when the target file's poll is invoked: it adds a wait entry for the epitem to the wait queue owned by the fd and installs the callback ep_poll_callback on it. When the fd later has an event, ep_poll_callback fires and puts the triggered epitem onto the eventpoll's rdllist mentioned earlier.

Finally, epoll_wait simply walks this rdllist; if events have triggered, it copies the data from kernel space to user space, again under the spinlock_t lock. It is also here, after the copy, that the difference between ET and LT is implemented. With ET, the epitem does not go back onto the rdllist unless the fd changes state again and ep_poll_callback is called. With LT, regardless of whether there are still active events or valid data, the epitem is reinserted into the rdllist and handed back to you on the next epoll_wait.
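The ET/LT decision boils down to a small fragment at the end of ep_send_events_proc; roughly (a sketch of the logic, not the verbatim source):

/* inside ep_send_events_proc(), after the event has been copied to user space */
if (epi->event.events & EPOLLONESHOT)
    epi->event.events &= EP_PRIVATE_BITS;
else if (!(epi->event.events & EPOLLET)) {
    /*
     * LT: put the item back on the ready list, so the next epoll_wait()
     * checks it again and reports it if it is still ready. ET items are
     * NOT re-added: they only return to rdllist when ep_poll_callback()
     * fires again.
     */
    list_add_tail(&epi->rdllink, &ep->rdllist);
}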
Summary: