epoll源码解析

epoll函数

int epoll_create(int size);
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);
int epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);

#define EPOLL_PACKED __attribute__((packed)) // packed layout: no padding between members
struct epoll_event {
	__poll_t events;	/* event mask (EPOLLIN, EPOLLOUT, ...) */
	__u64 data;		/* opaque user data echoed back by epoll_wait() */
} EPOLL_PACKED;

初始化

/*
 * One-time initialization of the eventpoll subsystem, run during boot
 * at the fs initcall stage: sizes the per-user watch limit and creates
 * the slab caches used for epitem / eppoll_entry allocation.
 */
static int __init eventpoll_init(void)
{
	struct sysinfo si;

	si_meminfo(&si);
	/*
	 * Allows top 4% of lomem to be allocated for epoll watches (per user).
	 */
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
		EP_ITEM_COST;
	BUG_ON(max_user_watches < 0);

	/*
	 * Initialize the structure used to perform epoll file descriptor
	 * inclusion loops checks.
	 */
	ep_nested_calls_init(&poll_loop_ncalls);

	/* Initialize the structure used to perform safe poll wait head wake ups */
	ep_nested_calls_init(&poll_safewake_ncalls);

	/* Initialize the structure used to perform file's f_op->poll() calls */
	ep_nested_calls_init(&poll_readywalk_ncalls);

	/* Slab cache for struct epitem (one per watched fd) */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	/* Slab cache for struct eppoll_entry (poll-hook wait entries) */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
			sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);

	return 0;
}
fs_initcall(eventpoll_init);
fs_initcall(eventpoll_init);

epoll_create

/*
 * epoll_create(): the size argument is only validated (must be > 0)
 * for historical/ABI reasons and is otherwise ignored; all real work
 * is delegated to sys_epoll_create1() with no flags.
 */
SYSCALL_DEFINE1(epoll_create, int, size)
{
	return size > 0 ? sys_epoll_create1(0) : -EINVAL;
}

size仅仅用来检查是否大于0,并没有真正使用。然后调用sys_epoll_create1过程检查参数,然后调用epoll_create1。

sys_epoll_create1

epoll_create的过程主要是创建并初始化数据结构eventpoll,以及创建file实例,并放入file->private_data

/*
 * epoll_create1(): allocate a struct eventpoll and wrap it in an
 * anonymous-inode file; the fd returned to userspace refers to it.
 */
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
	int error;
	struct eventpoll *ep = NULL;

	/* EPOLL_CLOEXEC is the only flag accepted here */
	if (flags & ~EPOLL_CLOEXEC)
		return -EINVAL;

	/* Allocate and initialize the eventpoll structure */
	error = ep_alloc(&ep); 
	if (error < 0)
		return error;
    
	/*
	 * Create the file, stash ep in file->private_data, and bind the
	 * file to an unused fd.
	 */
	error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
				 O_RDWR | (flags & O_CLOEXEC));
	if (error < 0)
		ep_free(ep);

	return error;
}
/*
 * Allocate and initialize a struct eventpoll.  On success *pep points
 * to the new object and 0 is returned; on failure a negative errno is
 * returned and the user reference taken here is dropped.
 */
static int ep_alloc(struct eventpoll **pep)
{
	int error;
	struct user_struct *user;
	struct eventpoll *ep;

	/* Charge this epoll instance to the current user (for limits) */
	user = get_current_user();
	error = -ENOMEM;
	/* Zeroed allocation of the eventpoll itself */
	ep = kzalloc(sizeof(*ep), GFP_KERNEL);
	if (unlikely(!ep))
		goto free_uid;

	/* Member initialization */
	spin_lock_init(&ep->lock);
	mutex_init(&ep->mtx);
	init_waitqueue_head(&ep->wq);		/* waiters in epoll_wait() */
	init_waitqueue_head(&ep->poll_wait);	/* waiters in file->poll() */
	INIT_LIST_HEAD(&ep->rdllist);		/* ready list starts empty */
	ep->rbr = RB_ROOT;			/* empty rb-tree of epitems */
	ep->ovflist = EP_UNACTIVE_PTR;		/* overflow list inactive */
	ep->user = user;

	*pep = ep;

	return 0;

free_uid:
	free_uid(user);
	return error;
}

/*
 * Create a file backed by the shared anonymous inode, stash @priv in
 * file->private_data, and install it into an unused fd.  Returns the
 * new fd, or a negative errno on failure.
 */
int anon_inode_getfd(const char *name, const struct file_operations *fops,
		     void *priv, int flags)
{
	int error, fd;
	struct file *file;

	/* Reserve an unused file descriptor */
	error = get_unused_fd_flags(flags); 
	if (error < 0)
		return error;
	fd = error;
	/*
	 * Create the file instance (with an anonymous inode and dentry)
	 * and set file->private_data = priv (the eventpoll here).
	 */
	file = anon_inode_getfile(name, fops, priv, flags); 
	if (IS_ERR(file)) {
		error = PTR_ERR(file);
		goto err_put_unused_fd;
	}
	/* Publish the file in the fd table; fd is live from here on */
	fd_install(fd, file);

	return fd;

err_put_unused_fd:
	put_unused_fd(fd);
	return error;
}
EXPORT_SYMBOL_GPL(anon_inode_getfd);

看看重要的结构体

eventpoll

/* Per-instance state behind one epoll file descriptor. */
struct eventpoll {
	/* Spinlock protecting the ready list and wait queues */
	spinlock_t lock; 

	/* Mutex serializing epoll_ctl()/epoll_wait() access */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/*
	 * Wait queue used by file->poll(); when this epoll fd is itself
	 * watched (by another epoll instance), watchers queue their
	 * callback entries here.
	 */
	wait_queue_head_t poll_wait; 

	/* List of ready file descriptors; callbacks link epitems here */
	struct list_head rdllist; 

 	/* Root of the red-black tree holding all watched epitems */
	struct rb_root rbr;  

	/*
	 * While events are being transferred to userspace, newly ready
	 * epitems are chained here instead of going straight to rdllist.
	 */
	struct epitem *ovflist; 

	/* The user that created this eventpoll descriptor */
	struct user_struct *user; 
};

epitem

// One epitem exists for every watched file descriptor.
struct epitem {
	// Node linking this item into the eventpoll's red-black tree
	struct rb_node rbn;

	// List node; every ready epitem is linked into eventpoll's rdllist
	struct list_head rdllink; 

	// Works together with ovflist to keep a single-linked chain
	struct epitem *next;

	// The fd and struct file this epitem refers to
	struct epoll_filefd ffd;

	/* Number of active wait queue attached to poll operations */
	int nwait;

	// List of poll wait queues hooked for this item; entries are
	// added via list_add_tail(&pwq->llink, &epi->pwqlist);
	struct list_head pwqlist;

	// The eventpoll container of this item
	struct eventpoll *ep;

	// List node for the file's f_ep_links list of watching epitems
	struct list_head fllink;

	// The events this item watches; copied from userspace by epoll_ctl
	struct epoll_event event;
};

eppoll_entry

/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct list_head llink;

	/* The "base" pointer is set to the container "struct epitem" */
	struct epitem *base; /* back-pointer to the owning epitem */

	/* Wait queue entry carrying the wakeup callback (ep_poll_callback) */
	wait_queue_t wait;  

	/* The wait queue head this entry is queued on */
	wait_queue_head_t *whead; 
};

epoll_event

/*
 * On x86-64 the structure is packed so the 32-bit and 64-bit layouts
 * match, letting 32-bit userspace share the same ABI.  The quoted
 * snippet was missing the #else/#endif that balances the #ifdef;
 * restored here so the preprocessor block is well-formed.
 */
#ifdef __x86_64__
#define EPOLL_PACKED __attribute__((packed))
#else
#define EPOLL_PACKED
#endif

typedef unsigned __bitwise __poll_t;

struct epoll_event {
	__poll_t events;	/* event mask */
	__u64 data;		/* opaque user data */
} EPOLL_PACKED;

epoll_ctl

/*
 * epoll_ctl(): add, remove or modify the watch on target fd @fd inside
 * the epoll instance referred to by @epfd.  @event is copied in from
 * userspace for the operations that carry one (ADD/MOD).
 */
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
		struct epoll_event __user *, event)
{
	int error;
	int did_lock_epmutex = 0;
	struct file *file, *tfile;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;

	error = -EFAULT;
	/* Copy the epoll_event from userspace (ADD/MOD only) */
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	/*
	 * epfd was bound to this file at create time; its private_data
	 * holds the eventpoll.
	 */
	file = fget(epfd);
	if (!file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	/* The file instance of the fd we want to watch */
	tfile = fget(fd);
	if (!tfile)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!tfile->f_op || !tfile->f_op->poll)
		goto error_tgt_fput;

	error = -EINVAL;
	/* epfd must really be an epoll file, and must not target itself */
	if (file == tfile || !is_file_epoll(file))
		goto error_tgt_fput;

	/* Retrieve the eventpoll stashed at create time */
	ep = file->private_data;

	/*
	 * When we insert an epoll file descriptor, inside another epoll file
	 * descriptor, there is the change of creating closed loops, which are
	 * better be handled here, than in more critical paths.
	 *
	 * We hold epmutex across the loop check and the insert in this case, in
	 * order to prevent two separate inserts from racing and each doing the
	 * insert "at the same time" such that ep_loop_check passes on both
	 * before either one does the insert, thereby creating a cycle.
	 */
	if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
		mutex_lock(&epmutex);
		did_lock_epmutex = 1;
		error = -ELOOP;
		if (ep_loop_check(ep, tfile) != 0)
			goto error_tgt_fput;
	}


	mutex_lock(&ep->mtx);

	/* Search ep's red-black tree for an existing epitem for this fd */
	epi = ep_find(ep, tfile, fd); 

	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			/* POLLERR and POLLHUP are always reported */
			epds.events |= POLLERR | POLLHUP;
			error = ep_insert(ep, &epds, tfile, fd);
		} else
			error = -EEXIST;
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}
	mutex_unlock(&ep->mtx);

error_tgt_fput:
	if (unlikely(did_lock_epmutex))
		mutex_unlock(&epmutex);

	fput(tfile);
error_fput:
	fput(file);
error_return:

	return error;
}

ep_insert

主要聊聊ep_insert

/*
 * Install a new watch (epitem) for (tfile, fd) into @ep.  Called with
 * ep->mtx held.  Registers the poll-wait callback on the target file,
 * links the item into the rb-tree and, if the fd is already ready,
 * into the ready list (waking any waiters).
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	long user_watches;
	struct epitem *epi;
	struct ep_pqueue epq;

	/*
	 * Enforce the per-user cap on watched fds
	 * (static long max_user_watches __read_mostly;).
	 */
	user_watches = atomic_long_read(&ep->user->epoll_watches);
	if (unlikely(user_watches >= max_user_watches))
		return -ENOSPC;

	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
		return -ENOMEM;

	/* Item initialization follow here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	/* Record the watched (file, fd) pair inside the epitem */
	ep_set_ffd(&epi->ffd, tfile, fd);
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;

	/* Initialize the poll table using the queue callback */
	epq.epi = epi;
	/* ep_ptable_queue_proc will run inside tfile->f_op->poll() below */
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * Attach the item to the poll hooks and get current event bits.
	 * We can safely use the file* here because its usage count has
	 * been increased by the caller of this function. Note that after
	 * this operation completes, the poll callback can start hitting
	 * the new item.
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);

	/*
	 * We have to check if something went wrong during the poll wait queue
	 * install process. Namely an allocation for a wait queue failed due
	 * high memory pressure.
	 */
	error = -ENOMEM;
	if (epi->nwait < 0)
		goto error_unregister;

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_lock);
	/* Each file chains together all the epitems watching it */
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_lock);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	ep_rbtree_insert(ep, epi);

	/* We have to drop the new item inside our item list to keep track of it */
	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * If the fd is already ready but the callback has not yet linked
	 * the epitem into the ready list, link it now and wake waiters.
	 */
	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/* A watched event is pending: wake tasks in epoll_wait() */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	spin_unlock_irqrestore(&ep->lock, flags);

	atomic_long_inc(&ep->user->epoll_watches);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 0;

error_unregister:
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have been arrived on some
	 * allocated wait queue. Note that we don't care about the ep->ovflist
	 * list, since that is used/cleaned only inside a section bound by "mtx".
	 * And ep_insert() is called with "mtx" held.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	kmem_cache_free(epi_cache, epi);

	return error;
}
/* Return nonzero iff at least one waiter is queued on @q. */
static inline int waitqueue_active(wait_queue_head_t *q)
{
	return list_empty(&q->task_list) ? 0 : 1;
}
/* An empty circular list's head links back to itself. */
static inline int list_empty(const struct list_head *head)
{
	const struct list_head *first = head->next;
	return first == head;
}

梳理下调用链:tfile->f_op->poll(tfile, &epq.pt) -> ep_eventpoll_poll(struct file *file, poll_table *wait) -> poll_wait(file, &ep->poll_wait, wait) -> ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt)

也就是说,最终,tfile->f_op->poll(tfile, &epq.pt)调用的是ep_ptable_queue_proc这个回调函数:

/*
 * Callback invoked from inside the target file's poll() via poll_wait().
 * Allocates an eppoll_entry, arms ep_poll_callback as its wakeup
 * function, and queues it on @whead — the wait queue head supplied by
 * the polled file (for an ordinary device fd this is the driver's wait
 * queue; only when the target is itself an epoll fd is it that
 * instance's ep->poll_wait).
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;
	/* Create the eppoll_entry and set ep_poll_callback as its waker */
	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		pwq->whead = whead; 
		pwq->base = epi;
		// void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
		/* Hook our callback entry onto the target's wait queue */
		add_wait_queue(whead, &pwq->wait);
		list_add_tail(&pwq->llink, &epi->pwqlist);
		epi->nwait++;
	} else {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
	}
}

上面的代码就是ep_insert中要做的最重要的事:创建struct eppoll_entry,设置唤醒回调函数为ep_poll_callback,然后加入被监视文件的等待队列(对普通设备而言,whead 是设备驱动的等待队列;只有当被监视的是另一个 epoll 实例时,whead 才是其 eventpoll 中的 poll_wait)。将此回调函数链接起来。

只有这样,当设备准备就绪,唤醒队列上的等待进程,ep_poll_callback就会被调用

核心

每次调用poll系统调用时,操作系统都要把相应的fd挂到current(当前进程)上,当fd多的时候,这样挂就非常费事;而每次调用epoll_wait则没有这么啰嗦,epoll只在epoll_ctl时把fd设备挂在current(当前进程)上。如果设备有事件了,通过回调函数,会把fd放入rdllist,而每次调用epoll_wait就只是收集rdllist里的fd就可以。 — epoll巧妙地利用回调函数,实现了更高效的事件驱动模型。

那么这里也应该能猜出来ep_poll_callback会干什么了,肯定是把红黑树上的收到event的epitem(代表的fd)插入到ep->rdllist中,这样,当epoll_wait返回时,rdllist里就都是准备就绪的fd了。

ep_poll_callback

// Install a custom wakeup callback on the wait queue entry
init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
/*
 * NOTE(review): the definition quoted below is init_waitqueue_entry(),
 * the *default*-wakeup variant, shown for comparison.  The call above
 * actually uses init_waitqueue_func_entry(), which stores the given
 * callback in q->func instead of default_wake_function — verify
 * against include/linux/wait.h.
 */
static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
    q->flags = 0;
    q->private = p;
    q->func = default_wake_function;
}

/*
 * Wakeup callback queued on the watched file's wait queue.  Runs when
 * the file/device signals an event: links the owning epitem onto the
 * ready list (or onto ovflist during a userspace transfer) and wakes
 * any tasks sleeping in epoll_wait() or polling this epoll fd.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	/* Recover the epitem embedding this wait queue entry */
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;

	/* Take ep->lock with local interrupts disabled */
	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * If the event mask does not contain any poll(2) event, we consider the
	 * descriptor to be disabled. This condition is likely the effect of the
	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
	 * until the next EPOLL_CTL_MOD will be issued.
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * Check the events coming with the callback. At this stage, not
	 * every device reports the events in the "key" parameter of the
	 * callback. We need to be able to handle both cases here, hence the
	 * test for "key" != NULL before the event match test.
	 */
	if (key && !((unsigned long) key & epi->event.events))
		goto out_unlock;

	/*
	 * If we are transferring events to userspace, we can hold no locks
	 * (because we're accessing user memory, and because of linux f_op->poll()
	 * semantics). All the events that happen during that period of time are
	 * chained in ep->ovflist and requeued later on.
	 */
	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
		if (epi->next == EP_UNACTIVE_PTR) {
			epi->next = ep->ovflist;
			ep->ovflist = epi;
		}
		goto out_unlock;
	}

	/* If this file is already in the ready list we exit soon */
	if (!ep_is_linked(&epi->rdllink))
		list_add_tail(&epi->rdllink, &ep->rdllist);

	/*
	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
	 * wait list.
	 */
	if (waitqueue_active(&ep->wq))
		wake_up_locked(&ep->wq);
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	spin_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&ep->poll_wait);

	return 1;
}

查阅很多资料后才搞明白其实 epoll 也是一种文件类型,其底层驱动也实现了 file_operations 中的 poll 函数,因此一个 epoll 类型的 fd 可以被其他 epoll 实例监视。而 epoll 类型的 fd 只会有“读就绪”的事件。当 epoll 所监视的非 epoll 类型文件有“读就绪”事件时,当前 epoll 也会进入“读就绪”状态。

因此如果一个 epoll 实例监视了另一个 epoll 就会出现递归。举个例子,如图所示:
epoll源码解析_第1张图片
epollfd1 监视了 2 个“非 epoll”类型的 fd

epollfd2 监视了 epollfd1 和 2 个“非 epoll”类型的 fd

如果 epollfd1 所监视的 2 个 fd 中有可读事件触发,fd 的 ep_poll_callback 回调函数会触发将 fd 放到 epollfd1 的 rdllist 中。此时 epollfd1 本身的可读事件也会触发,就需要从 epollfd1 的 poll_wait 等待队列中找到 epollfd2,调用 epollfd1 的 ep_poll_callback(将 epollfd1 放到 epollfd2 的 rdllist 中)。因此 ep->poll_wait 是用来处理 epoll 间嵌套监视的情况的。

你可能感兴趣的:(Linux,后端,linux)