epoll系列的系统调用接口很简单,但是很强大:epoll_create()、epoll_ctl()、epoll_wait(),三个就够了。
一些重要的结构:
/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree, i.e. each
 * epitem is a single node of that red-black tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 */
struct epitem {
union {
/* RB tree node links this structure to the eventpoll RB tree */
struct rb_node rbn;
/* Used to free the struct epitem */
struct rcu_head rcu;
};
/* List header used to link this structure to the eventpoll ready list */
struct list_head rdllink;
/*
 * Works together "struct eventpoll"->ovflist in keeping the
 * single linked chain of items.
 */
struct epitem *next;
/* The file descriptor information this item refers to (the associated fd) */
struct epoll_filefd ffd;
/* Number of active wait queue attached to poll operations */
int nwait;
/* List containing poll wait queues */
struct list_head pwqlist;
/* The "container" of this item */
struct eventpoll *ep;
/* List header used to link this item to the "struct file" items list */
struct list_head fllink;
/* wakeup_source used when EPOLLWAKEUP is set */
struct wakeup_source __rcu *ws;
/*
 * The structure that describe the interested events and the source fd
 * (the states of the monitored file descriptor the caller cares about)
 */
struct epoll_event event;
};
/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
/* Protect the access to this structure */
spinlock_t lock;
/*
 * This mutex is used to ensure that files are not removed
 * while epoll is using them. This is held during the event
 * collection loop, the file cleanup path, the epoll file exit
 * code and the ctl operations.
 */
struct mutex mtx;
/*
 * Wait queue used by sys_epoll_wait(). This is the wait queue of the
 * epoll file (a doubly linked list): a process calling epoll_wait may
 * sleep on it until it is woken by ep_poll_callback() or times out.
 */
wait_queue_head_t wq;
/*
 * Wait queue used by file->poll(). This is the wakeup queue of the
 * eventpoll file itself (a doubly linked list): processes sleeping on it
 * are waiting for events to occur on the eventpoll file itself.
 */
wait_queue_head_t poll_wait;
/* List of ready file descriptors (the ready list) */
struct list_head rdllist;
/* RB tree root used to store monitored fd structs */
struct rb_root rbr;
/*
 * This is a single linked list that chains all the "struct epitem" that
 * happened while transferring ready events to userspace w/out
 * holding ->lock. In other words: while events are being handed to
 * userspace, items whose fd becomes ready are parked on this list for the
 * time being; otherwise they are added directly to the ready list
 * (rdllist).
 */
struct epitem *ovflist;
/* wakeup_source used when ep_scan_ready_list is running */
struct wakeup_source *ws;
/* The user that created the eventpoll descriptor */
struct user_struct *user;
/* The file backing this eventpoll instance; assigned in epoll_create1() */
struct file *file;
/* used to optimize loop detection check */
int visited;
struct list_head visited_list_link;
};
epoll_create函数:
创建一个epoll句柄。需要注意的是,epoll句柄创建好之后会占用一个fd值,在Linux下查看/proc/<进程id>/fd/就能看到这个fd。所以在使用完epoll后,必须调用close()关闭,否则可能导致fd被耗尽。
int epoll_create(int size);
int epoll_create1(int flags);
第一级:epoll_create()(注意:在Linux 2.6.8之后,size参数的值不再被使用,但仍然必须大于0,否则返回-EINVAL)
第二级: epoll_create1()
第三级:ep_alloc()创建内部数据(eventpoll)
在ep_alloc()中
1.初始化epoll文件等待队列wq(双向链表)
2.初始化eventpoll文件自身的唤醒队列poll_wait(双向链表)
3.初始化就绪队列rdllist(双向链表)
/*
 * Turn @list into an empty list head: both of its links are made to
 * point back at the head itself.
 */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
	list->prev = list->next = list;
}
4.初始化红黑树根节点
/*
 * Compound literal for an empty tree root: the first member is NULL and
 * any remaining members are zero-initialised.
 */
#define RB_ROOT (struct rb_root) { NULL, }
/* The tree of monitored items starts out empty. */
ep->rbr = RB_ROOT;
5.初始化ovflist(单链表):在向用户空间传递事件期间变为就绪的epitem会暂时挂在这个链表上
/* Sentinel pointer value: -1L converted to a pointer, so it is distinct from NULL. */
#define EP_UNACTIVE_PTR ((void *) -1L)
/*
 * ovflist starts out holding the sentinel.
 * NOTE(review): presumably this marks the overflow list as "not in use";
 * confirm against the code that transfers events to userspace.
 */
ep->ovflist = EP_UNACTIVE_PTR;
第三级:get_unused_fd_flags()获取一个空闲的文件描述符
第三级:anon_inode_getfile()创建一个匿名文件
第三级:fd_install()将文件与fd建立联系
/*
 * Open an eventpoll file descriptor.
 *
 * @flags may only contain EPOLL_CLOEXEC; any other bit yields -EINVAL.
 * On success the new file descriptor is returned, otherwise the negative
 * error code of whichever step failed.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	struct eventpoll *ep = NULL;
	struct file *epfile;
	int open_flags;
	int epfd;
	int err;

	/*
	 * Compile-time consistency check: the build fails if EPOLL_CLOEXEC
	 * and O_CLOEXEC ever differ, because the flag is passed straight
	 * through as an open flag below.
	 */
	BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);

	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;

	/* Create the internal data structure ("struct eventpoll"). */
	err = ep_alloc(&ep);
	if (err < 0)
		return err;

	/*
	 * Create everything needed to set up an eventpoll file, that is, a
	 * free file descriptor and a file structure. The eventpoll instance
	 * is handed to anon_inode_getfile() so that it ends up in the file's
	 * private_data member, which is where a file keeps the object it
	 * really stands for (for a socket fd, that would be the socket
	 * instance).
	 */
	open_flags = O_RDWR | (flags & O_CLOEXEC);

	epfd = get_unused_fd_flags(open_flags);
	if (epfd < 0) {
		err = epfd;
		goto out_free_ep;
	}

	epfile = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
				    open_flags);
	if (IS_ERR(epfile)) {
		err = PTR_ERR(epfile);
		goto out_free_fd;
	}

	/* Success: remember the file, then bind it to the descriptor. */
	ep->file = epfile;
	fd_install(epfd, epfile);
	return epfd;

out_free_fd:
	put_unused_fd(epfd);
out_free_ep:
	ep_free(ep);
	return err;
}
/*
 * Older entry point taking a size hint. The hint is only validated
 * (it must be positive); the work is done by epoll_create1() with no
 * flags set.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
	if (size > 0)
		return sys_epoll_create1(0);

	return -EINVAL;
}