昨晚分析了poll,通过阅读代码可以发现,poll操作有很多可以优化的地方。epoll是eventpoll的简称,它的效率非常高,我们今天就来看看它的实现。它的实现在fs/eventpoll.c,代码有1500多行,呵呵,怕了吧。
大家都知道,epoll有三个系统调用,C库封装成以下三个:
1. int epoll_create(int size);
2. int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
3. int epoll_wait(int epfd, struct epoll_event *events,int maxevents, int timeout);
epoll的源码这么多,我们就干脆跟着他们三个走着瞧。今天先搞定第一个---epoll_create
第一个是sys_epoll_create,它是用户态epoll_create在内核中的实现:
/* * It opens an eventpoll file descriptor by suggesting a storage of "size" * file descriptors. The size parameter is just an hint about how to size * data structures. It won't prevent the user to store more than "size" * file descriptors inside the epoll interface. It is the kernel part of * the userspace epoll_create(2). */ asmlinkage long sys_epoll_create(int size) { int error, fd; struct inode *inode; struct file *file; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", current, size)); /* Sanity check on the size parameter */ error = -EINVAL; if (size <= 0) goto eexit_1; /* * Creates all the items needed to setup an eventpoll file. That is, * a file structure, and inode and a free file descriptor. */ error = ep_getfd(&fd, &inode, &file); //(1) if (error) goto eexit_1; /* Setup the file internal data structure ( "struct eventpoll" ) */ error = ep_file_init(file); //(2) if (error) goto eexit_2; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", current, size, fd)); return fd; eexit_2: sys_close(fd); eexit_1: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", current, size, error)); return error; }
(1)这里用到了一个ep_getfd函数,从注释我们知道,这个函数建立eventpoll相关的file,当然,一个file要包括文件描述符、inode、还有文件对象,这也是我们传的三个参数。废话不说,看源码:
/* * Creates the file descriptor to be used by the epoll interface. */ static int ep_getfd(int *efd, struct inode **einode, struct file **efile) { struct qstr this; char name[32]; struct dentry *dentry; struct inode *inode; struct file *file; int error, fd; /* Get an ready to use file */ error = -ENFILE; file = get_empty_filp(); if (!file) goto eexit_1; /* Allocates an inode from the eventpoll file system */ inode = ep_eventpoll_inode(); error = PTR_ERR(inode); if (IS_ERR(inode)) goto eexit_2; /* Allocates a free descriptor to plug the file onto */ error = get_unused_fd(); if (error < 0) goto eexit_3; fd = error; /* * Link the inode to a directory entry by creating a unique name * using the inode number. */ error = -ENOMEM; sprintf(name, "[%lu]", inode->i_ino); this.name = name; this.len = strlen(name); this.hash = inode->i_ino; dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this); if (!dentry) goto eexit_4; dentry->d_op = &eventpollfs_dentry_operations; d_add(dentry, inode); file->f_vfsmnt = mntget(eventpoll_mnt); file->f_dentry = dentry; file->f_mapping = inode->i_mapping; file->f_pos = 0; file->f_flags = O_RDONLY; file->f_op = &eventpoll_fops; file->f_mode = FMODE_READ; file->f_version = 0; file->private_data = NULL; /* Install the new setup file into the allocated fd. */ fd_install(fd, file); *efd = fd; *einode = inode; *efile = file; return 0; eexit_4: put_unused_fd(fd); eexit_3: iput(inode); eexit_2: put_filp(file); eexit_1: return error; }
这个函数的注释都比较全,这里简单提一下,况且因为涉及到的函数太多,要深究起来涉及的知识太多,也不可能逐一去列代码。不过这个函数个人觉得比较经典,这函数就是创建一个文件的流程。
首先,我们得拿到一个file结构体,它由内核分配给我们;然后我们要拿到inode,调用ep_eventpoll_inode()就可以了;接着用get_unused_fd()拿到文件描述符;再用d_alloc()函数拿到一个dentry;d_add(dentry, inode)函数把dentry加入hash表并且绑定inode;后面是继续填充文件对象file;最后fd_install(fd, file)向进程注册文件,通过这样的方式把文件描述符和文件对象关联起来。
(2)在跟踪ep_file_init函数之前,我们先来看一下eventpoll结构体:
/*
 * Main data structure of the eventpoll interface.  It is stored inside
 * the "private_data" member of the file structure.
 */
struct eventpoll {
	/* Protects access to this structure */
	rwlock_t lock;

	/*
	 * This semaphore is used to ensure that files are not removed
	 * while epoll is using them. It is read-held during the event
	 * collection loop and write-held during the file cleanup
	 * path, the epoll file exit code and the ctl operations.
	 */
	struct rw_semaphore sem;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* RB-Tree root used to store the monitored fd structs */
	struct rb_root rbr;
};
注释也是相当清楚。可以看得出来,这个eventpoll是epoll的核心,它会把你想要监听的文件描述符保存在红黑树rbr里,这也是epoll高效的原因所在。
好,我们回到sys_epoll_create函数,开始跟踪ep_file_init函数:
static int ep_file_init(struct file *file) { struct eventpoll *ep; if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL))) return -ENOMEM; memset(ep, 0, sizeof(*ep)); rwlock_init(&ep->lock); init_rwsem(&ep->sem); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT; file->private_data = ep; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_file_init() ep=%p\n", current, ep)); return 0; }
其实也就是eventpoll结构体的初始化。
sys_epoll_create函数大概就这样了,明天接着看sys_epoll_ctl。