epoll源代码分析

文章目录

  • linux内核代码版本:v2.6.26.8
  • epoll_filefd
  • list_head
  • epoll_event
  • 红黑树的节点
  • epitem
  • eventpoll
  • eppoll_entry
  • sys_epoll_create(int size)
  • ep_alloc
  • eventpoll_fops
  • anon_inode_getfd
  • sys_epoll_ctl
  • ep_find
  • ep_poll_callback
  • ep_ptable_queue_proc
  • ep_insert
  • sys_epoll_wait
  • ep_poll

linux内核代码版本:v2.6.26.8

当系统启动时,epoll会进行初始化操作:

/*
 * State used by the "safe wake up" helpers (ep_poll_safewake_init() and
 * ep_poll_safewake()).
 * NOTE(review): intended to avoid re-entering the poll callback from inside
 * wake_up(); the helpers themselves are not shown here — confirm.
 */
struct poll_safewake {
	struct list_head wake_task_list;
	spinlock_t lock;
};

/* Mutex used to serialize ep_free() and eventpoll_release_file() */
static struct mutex epmutex;

/* Safe wake up implementation */
static struct poll_safewake psw;

/*
 * Boot-time initialization of epoll, registered through fs_initcall() below.
 * Sets up the global mutex, the safe wake up state and the two slab caches.
 * Always returns 0; the kmem_cache_create() results are not checked here.
 * NOTE(review): SLAB_PANIC presumably makes a failed cache creation fatal,
 * which would explain the missing checks — confirm.
 */
static int __init eventpoll_init(void)
{
	mutex_init(&epmutex);		/* initialize the global mutex */

	/* Initialize the structure used to perform safe poll wait head wake ups */
	ep_poll_safewake_init(&psw);

	/* Slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC,
			NULL);

	/* Slab cache used to allocate "struct eppoll_entry" items */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
			sizeof(struct eppoll_entry), 0,
			EPI_SLAB_DEBUG|SLAB_PANIC, NULL);

    return 0;
}
fs_initcall(eventpoll_init);

epoll需要三个级别的锁。

1.epmutex(mutex);
2.ep->mtx(mutex);
3.ep->lock(spinlock);

对于自旋锁ep->lock(spinlock),因为我们在poll回调内部操作对象,该回调可能是由wake_up触发的,而wake_up有可能从中断请求上下文调用。所以我们不能在poll回调中休眠,因此我们需要一个spinlock

在事件传输循环(从内核到用户空间)期间,由于copy_to_user()可能导致睡眠,我们需要一个允许睡眠的锁,即互斥锁ep->mtx。这个互斥锁是在事件传输循环、epoll_ctl(EPOLL_CTL_DEL)以及eventpoll_release_file()期间获取的。此外,我们还需要一个全局互斥锁(epmutex)来序列化eventpoll_release_file()和ep_free()。

这个全局互斥锁(epmutex)是在epoll文件清理路径期间由ep_free()获取的;如果某个文件已被加入(push)到epoll set中,随后在没有先调用epoll_ctl(EPOLL_CTL_DEL)的情况下被close(),那么eventpoll_release_file()也会获取它。

epoll_filefd

/*
 * Pairs a "struct file" pointer with a file descriptor number. It is embedded
 * in "struct epitem" as ->ffd and is what ep_find() compares (through
 * ep_cmp_ffd()) when searching the RB tree.
 */
struct epoll_filefd {
	struct file *file;
	int fd;
};

list_head

/*
 * Doubly linked list link: each node points to its next and previous
 * neighbour. Embedded in the epoll structures (rdllink, fllink, pwqlist,
 * llink, ...) to chain them into lists.
 */
struct list_head {
	struct list_head *next, *prev;
};

epoll_event

/*
 * Event description copied in from user space by sys_epoll_ctl().
 * NOTE(review): EPOLL_PACKED is defined elsewhere; presumably it packs the
 * structure on some architectures — confirm.
 */
struct epoll_event {
	__u32 events;	/* event bit mask (POLLIN, POLLERR, EPOLLET, ...) */
	__u64 data;	/* 64-bit value stored alongside the mask */
} EPOLL_PACKED;

红黑树的节点

/* Root of a red-black tree: a single pointer to the top node (NULL if empty). */
struct rb_root
{
	struct rb_node *rb_node;
};

/*
 * Red-black tree node, embedded in the structure being indexed
 * ("struct epitem" embeds one as ->rbn).
 */
struct rb_node
{
	/*
	 * NOTE(review): in the kernel rbtree this word holds the parent pointer
	 * with this node's colour kept in the low bit (hence the alignment
	 * attribute below), not merely "the parent's colour" — confirm against
	 * include/linux/rbtree.h.
	 */
	unsigned long  rb_parent_color;
#define	RB_RED		0
#define	RB_BLACK	1
	struct rb_node *rb_right;			/* right subtree */
	struct rb_node *rb_left;			/* left subtree */
} __attribute__((aligned(sizeof(long))));

epoll源代码分析_第1张图片

epitem

/*
 * Each file descriptor added to an eventpoll instance has one entry of this
 * type linked into the eventpoll RB tree.
 */
struct epitem {
	/* RB tree node used to link this structure to the eventpoll RB tree */
	struct rb_node rbn;

	/* List header used to link this structure to the eventpoll ready list */
	struct list_head rdllink;

	/*
	 * Works together "struct eventpoll"->ovflist in keeping the
	 * single linked chain of items.
	 */
	struct epitem *next;

	/* The file descriptor information this item refers to */
	struct epoll_filefd ffd;

	/* Number of active wait queues attached by poll operations */
	int nwait;

	/* List containing the poll wait queues */
	struct list_head pwqlist;

	/* The eventpoll instance that contains this item */
	struct eventpoll *ep;

	/* List header used to link this item to the "struct file" items list */
	struct list_head fllink;

	/* The events the caller is interested in, plus its user data */
	struct epoll_event event;
};

epoll源代码分析_第2张图片

eventpoll

/*
 * Main data structure of the eventpoll interface. It is stored in the
 * "private_data" member of the eventpoll "struct file".
 */
struct eventpoll {
	/* Protects the access to this structure */
	spinlock_t lock;

	/*
	 * This mutex is used to ensure that files are not removed while epoll
	 * is using them. It is held during the event collection loop, the
	 * file cleanup path, the epoll file exit code and the ctl operations.
	 */
	struct mutex mtx;

	/* Wait queue used by sys_epoll_wait() */
	wait_queue_head_t wq;

	/* Wait queue used by file->poll() */
	wait_queue_head_t poll_wait;

	/* List of ready file descriptors */
	struct list_head rdllist;

	/* RB tree root used to store the monitored fd structures */
	struct rb_root rbr;

	/*
	 * Singly linked list chaining every "struct epitem" that became ready
	 * while ready events were being transferred to user space, i.e. while
	 * the transfer code was not holding ->lock. ep_poll_callback() itself
	 * updates this list with ->lock held.
	 */
	struct epitem *ovflist;
};

epoll源代码分析_第3张图片

eppoll_entry

/* Wait structure used by the poll hooks */
struct eppoll_entry {
	/* List header used to link this structure to the "struct epitem" */
	struct list_head llink;

	/* The "base" pointer is set to the owning "struct epitem" */
	void *base;

	/*
	 * Wait queue item that will be linked to the target file wait
	 * queue head.
	 */
	wait_queue_t wait;

	/* The wait queue head that the "wait" wait queue item is linked to */
	wait_queue_head_t *whead;
};

epoll源代码分析_第4张图片

epoll源代码分析_第5张图片

sys_epoll_create(int size)

/*
 * Open an eventpoll file descriptor.
 *
 * @size: only sanity-checked here (must be greater than zero); it is not
 *        used for anything else in this function.
 *
 * Returns the new file descriptor on success, -EINVAL when size <= 0, or the
 * negative error reported by ep_alloc() / anon_inode_getfd().
 */
asmlinkage long sys_epoll_create(int size)
{
	struct eventpoll *ep;
	int fd, rc;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
		     current, size));

	/* Sanity check on the size parameter. */
	if (size <= 0) {
		fd = -EINVAL;
		goto out;
	}

	/* Allocate and initialize the "struct eventpoll". */
	rc = ep_alloc(&ep);
	if (rc < 0) {
		fd = rc;
		goto out;
	}

	/*
	 * Create everything needed to set up an eventpoll file: a file
	 * structure and a free file descriptor, bound together. The file uses
	 * eventpoll_fops (->release and ->poll) and carries "ep" as its
	 * private data. On failure the eventpoll structure is released again.
	 */
	fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep);
	if (fd < 0)
		ep_free(ep);

out:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
		     current, size, fd));

	return fd;
}

ep_alloc

static int ep_alloc(struct eventpoll **pep)
{
	struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);

	if (!ep)
		return -ENOMEM;

	spin_lock_init(&ep->lock);
	mutex_init(&ep->mtx);
	init_waitqueue_head(&ep->wq);
	init_waitqueue_head(&ep->poll_wait);
	INIT_LIST_HEAD(&ep->rdllist);
	ep->rbr = RB_ROOT;
	ep->ovflist = EP_UNACTIVE_PTR;

	*pep = ep;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
		     current, ep));
	return 0;
}

eventpoll_fops

/*
 * File callbacks that implement the eventpoll file behaviour. Only ->release
 * and ->poll are provided; sys_epoll_create() passes this table to
 * anon_inode_getfd().
 */
static const struct file_operations eventpoll_fops = {
	.release	= ep_eventpoll_release,
	.poll		= ep_eventpoll_poll
};

static int ep_eventpoll_release(struct inode *inode, struct file *file)
{
	struct eventpoll *ep = file->private_data; 	//file->private_data指向eventpoll结构

	if (ep)
		ep_free(ep);		//释放eventpoll

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
	return 0;
}

/*
 * ->poll callback of the eventpoll file itself.
 *
 * Registers the caller on ep->poll_wait and reports the file as readable
 * (POLLIN | POLLRDNORM) when the ready list is not empty, 0 otherwise.
 */
static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
{
	struct eventpoll *ep = file->private_data;
	unsigned long irqflags;
	unsigned int mask;

	/*
	 * Insert "wait" inside our poll wait queue. poll_wait() calls
	 * wait->qproc(file, &ep->poll_wait, wait) when both "wait" and the
	 * wait queue head are non-NULL; qproc is the function installed in
	 * the poll_table by init_poll_funcptr().
	 */
	poll_wait(file, &ep->poll_wait, wait);

	/* Check our condition with the ready list protected by ep->lock */
	spin_lock_irqsave(&ep->lock, irqflags);
	mask = list_empty(&ep->rdllist) ? 0 : (POLLIN | POLLRDNORM);
	spin_unlock_irqrestore(&ep->lock, irqflags);

	return mask;
}

anon_inode_getfd

/*
 * Called from sys_epoll_create(int size) as:
 *	fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep);
 */
/**
 * anon_inode_getfd - creates a new file instance by hooking it up to the
 *                    anonymous inode and a dentry that describes the
 *                    "class" of the file
 *
 * @name:    [in]    name of the "class" of the new file
 * @fops     [in]    file operations for the new file
 * @priv     [in]    private data for the new file (will be file's private_data)
 *
 * All the files created with anon_inode_getfd() share a single inode
 * (anon_inode_inode), which saves memory and avoids duplicating the
 * file/inode/dentry setup code.
 *
 * Returns the new file descriptor on success. On failure returns -ENODEV if
 * the anonymous inode is not available, the error from get_unused_fd(),
 * -ENOMEM if d_alloc() fails, or -ENFILE if alloc_file() fails.
 */

int anon_inode_getfd(const char *name, const struct file_operations *fops,
		     void *priv)
{
	/*
	 *  struct qstr {
	 *		unsigned int hash;
	 *		unsigned int len;
	 *		const unsigned char *name;
	 * };
	 */
    struct qstr this;
	struct dentry *dentry;
	struct file *file;
	int error, fd;

	if (IS_ERR(anon_inode_inode))
		return -ENODEV;

	error = get_unused_fd();  		/* reserve an unused file descriptor */
	if (error < 0)
		return error;
	fd = error;

	/*
	 * Link the inode to a directory entry. The dentry is named after the
	 * caller-supplied @name, with the hash left at 0.
	 */
	error = -ENOMEM;		/* reported if d_alloc() fails */
	this.name = name;
	this.len = strlen(name);
	this.hash = 0;
	dentry = d_alloc(anon_inode_mnt->mnt_sb->s_root, &this);
	if (!dentry)
		goto err_put_unused_fd;

	/*
	 * We know the anon_inode inode count is always greater than zero,
	 * so we can avoid doing an igrab() and we can use an open-coded
	 * atomic_inc().
	 */
	atomic_inc(&anon_inode_inode->i_count);

	dentry->d_op = &anon_inodefs_dentry_operations;
	/* Do not publish this dentry inside the global dentry hash table */
	dentry->d_flags &= ~DCACHE_UNHASHED;
	d_instantiate(dentry, anon_inode_inode);

	error = -ENFILE;
	file = alloc_file(anon_inode_mnt, dentry,
			  FMODE_READ | FMODE_WRITE, fops);
	if (!file)
		goto err_dput;
	file->f_mapping = anon_inode_inode->i_mapping;

	file->f_pos = 0;
	file->f_flags = O_RDWR;
	file->f_version = 0;
	file->private_data = priv;
	/* Install the file pointer in the fd array */
	fd_install(fd, file);

	return fd;

	/* Unwind in reverse order: drop the dentry, then release the fd */
err_dput:
	dput(dentry);
err_put_unused_fd:
	put_unused_fd(fd);
	return error;
}

epoll源代码分析_第6张图片

sys_epoll_ctl

/*
 * Controller for the eventpoll file: implements the insert / remove / change
 * operations on the interest set.
 *
 * @epfd:  descriptor of the eventpoll file
 * @op:    EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD
 * @fd:    descriptor of the target file
 * @event: user space event description (copied in only when
 *         ep_op_has_event(op) is true)
 *
 * Returns the result of ep_insert() / ep_remove() / ep_modify(), or:
 *   -EFAULT  the event could not be copied from user space
 *   -EBADF   epfd or fd is not an open descriptor
 *   -EPERM   the target file does not support poll
 *   -EINVAL  epfd is not an eventpoll file, epfd and fd refer to the same
 *            file, or op is not one of the three operations above
 *   -EEXIST  ADD of an fd that is already registered
 *   -ENOENT  DEL/MOD of an fd that is not registered
 */
asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
			      struct epoll_event __user *event)
{
	int error;
	struct file *file, *tfile;
	struct eventpoll *ep;
	struct epitem *epi;
	struct epoll_event epds;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
		     current, epfd, op, fd, event));

	/*
	 * Copy the event from user space only for operations that carry one.
	 * NOTE(review): ep_op_has_event() is not shown here; presumably it is
	 * false only for EPOLL_CTL_DEL, in which case epds stays unset and
	 * unused — confirm.
	 */
	error = -EFAULT;
	if (ep_op_has_event(op) &&
	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
		goto error_return;

	/* Get the "struct file *" for the eventpoll file */
	error = -EBADF;
	file = fget(epfd);
	if (!file)
		goto error_return;

	/* Get the "struct file *" for the target file */
	tfile = fget(fd);
	if (!tfile)
		goto error_fput;

	/* The target file descriptor must support poll */
	error = -EPERM;
	if (!tfile->f_op || !tfile->f_op->poll)
		goto error_tgt_fput;

	/*
	 * epfd must refer to an eventpoll file, and that file must not be the
	 * same file as the target.
	 */
	error = -EINVAL;
	if (file == tfile || !is_file_epoll(file))
		goto error_tgt_fput;

	/*
	 * file->private_data holds the eventpoll structure; it was attached
	 * when sys_epoll_create() called anon_inode_getfd().
	 */
	ep = file->private_data;

	mutex_lock(&ep->mtx);

	/*
	 * Look up the item for (tfile, fd) in the eventpoll RB tree.
	 * Returns the epitem, or NULL when the fd is not registered.
	 */
	epi = ep_find(ep, tfile, fd);

	/* An unrecognized op falls through the switch and returns -EINVAL */
	error = -EINVAL;
	switch (op) {
	case EPOLL_CTL_ADD:
		if (!epi) {
			/* Error and hang-up conditions are always reported */
			epds.events |= POLLERR | POLLHUP;

			/* Insert the new item into the RB tree of ep */
			error = ep_insert(ep, &epds, tfile, fd);
		} else
			error = -EEXIST;			/* already registered */
		break;
	case EPOLL_CTL_DEL:
		if (epi)
			error = ep_remove(ep, epi);	/* remove epi from the RB tree of ep */
		else
			error = -ENOENT;
		break;
	case EPOLL_CTL_MOD:				/* change the registered events */
		if (epi) {
			epds.events |= POLLERR | POLLHUP;
			error = ep_modify(ep, epi, &epds);
		} else
			error = -ENOENT;
		break;
	}
	mutex_unlock(&ep->mtx);

	/* Drop the file references in reverse order of acquisition */
error_tgt_fput:
	fput(tfile);
error_fput:
	fput(file);
error_return:
	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
		     current, epfd, op, fd, event, error));

	return error;
}

ep_find

static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
	int kcmp;
	struct rb_node *rbp;
	struct epitem *epi, *epir = NULL;
	struct epoll_filefd ffd;

	ep_set_ffd(&ffd, file, fd);				//初始化ffd
	for (rbp = ep->rbr.rb_node; rbp; ) {
		epi = rb_entry(rbp, struct epitem, rbn);
		kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
		if (kcmp > 0)
			rbp = rbp->rb_right;
		else if (kcmp < 0)
			rbp = rbp->rb_left;
		else {
			epir = epi;
			break;
		}
	}

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
		     current, file, epir));

	return epir;
}

ep_poll_callback

/*
 * This is the callback that is passed to the wait queue wakeup mechanism
 * (installed by ep_ptable_queue_proc()). It is called by the stored file
 * descriptors when they have events to report.
 *
 * Marks the item as ready (ready list, or overflow list while events are
 * being transferred to user space) and wakes up the waiters. Always
 * returns 1.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	int pwake = 0;
	unsigned long flags;
	/* Recover the epitem that owns this wait queue entry */
	struct epitem *epi = ep_item_from_wait(wait);
	struct eventpoll *ep = epi->ep;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
		     current, epi->ffd.file, epi, ep));

	spin_lock_irqsave(&ep->lock, flags);

	/*
	 * If the event mask does not contain any poll(2) event, we consider
	 * the descriptor to be disabled. This condition is likely the effect
	 * of the EPOLLONESHOT bit that disables the descriptor when an event
	 * is received, until the next EPOLL_CTL_MOD will be issued.
	 *
	 * #define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
	 * #define EPOLLONESHOT (1 << 30)
	 * #define EPOLLET (1 << 31)
	 */
	if (!(epi->event.events & ~EP_PRIVATE_BITS))
		goto out_unlock;

	/*
	 * While events are being transferred to user space the lock cannot be
	 * held (user memory is accessed, and because of the linux
	 * f_op->poll() semantics). All the events that happen during that
	 * period are chained in ep->ovflist and requeued later on. An item is
	 * chained only once: epi->next == EP_UNACTIVE_PTR means "not chained".
	 * #define EP_UNACTIVE_PTR ((void *) -1L)
	 */
	if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
		if (epi->next == EP_UNACTIVE_PTR) {
			epi->next = ep->ovflist;
			ep->ovflist = epi;
		}
		goto out_unlock;
	}

	/* If this file is already in the ready list we exit soon */
	if (ep_is_linked(&epi->rdllink))
		goto is_linked;
	/* Append the item to the tail of the ready list */
	list_add_tail(&epi->rdllink, &ep->rdllist);

is_linked:
	/*
	 * Wake up (if active) both the eventpoll wait list and the ->poll()
	 * wait list.
	 * static inline int waitqueue_active(wait_queue_head_t *q)
	 * {
	 *		return !list_empty(&q->task_list);
	 * }
	 */
	if (waitqueue_active(&ep->wq))
		wake_up_locked(&ep->wq);
	if (waitqueue_active(&ep->poll_wait))
		pwake++;

out_unlock:
	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * We have to call this outside the lock.
	 * struct poll_safewake {
	 * 	struct list_head wake_task_list;
	 * 	spinlock_t lock;
	 * };
	 * static struct poll_safewake psw;
	 */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait); /* safe wake up of the ->poll() waiters */

	return 1;
}

ep_ptable_queue_proc

/*
 * Queue callback installed in the poll_table by ep_insert(). It runs when
 * ep_insert() executes
 *	revents = tfile->f_op->poll(tfile, &epq.pt);
 * and the target file calls poll_wait(). It adds our wait queue entry to the
 * wake up list of the target file.
 *
 * An error is reported by setting epi->nwait to -1: either a previous call
 * already failed, or the eppoll_entry allocation failed now.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	/* The epitem this poll table belongs to */
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *entry = NULL;

	/* Only allocate when no earlier error has been flagged */
	if (epi->nwait >= 0)
		entry = kmem_cache_alloc(pwq_cache, GFP_KERNEL);

	if (!entry) {
		/* We have to signal that an error occurred */
		epi->nwait = -1;
		return;
	}

	/* ep_poll_callback() runs when the target wait queue is woken */
	init_waitqueue_func_entry(&entry->wait, ep_poll_callback);
	entry->whead = whead;
	entry->base = epi;
	add_wait_queue(whead, &entry->wait);

	/* Append the entry to the tail of the item's poll wait queue list */
	list_add_tail(&entry->llink, &epi->pwqlist);
	epi->nwait++;
}

/*
 * Invoke the queue callback stored in the poll table, passing it the file and
 * the wait queue head. Does nothing when either "p" or "wait_address" is NULL.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (!p || !wait_address)
		return;

	p->qproc(filp, wait_address, p);
}
/*
 * Head of a wait queue: a spinlock plus the list of queued entries.
 * waitqueue_active() tests whether task_list is non-empty.
 */
struct __wait_queue_head {
	spinlock_t lock;
	struct list_head task_list;
};
typedef struct __wait_queue_head wait_queue_head_t;

ep_insert

/*
 * Register a new (tfile, fd) item in the eventpoll instance "ep".
 *
 * Must be called with ep->mtx held. It is called from sys_epoll_ctl() as:
 *	error = ep_insert(ep, &epds, tfile, fd);
 *
 * Returns 0 on success, or -ENOMEM when either the epitem allocation or the
 * poll wait queue setup fails ("error" is never changed after its initial
 * assignment, so both failure paths report -ENOMEM).
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
		     struct file *tfile, int fd)
{
	int error, revents, pwake = 0;
	unsigned long flags;
	struct epitem *epi;
	struct ep_pqueue epq;
    /*
     *	struct ep_pqueue {
	 *		poll_table pt;
	 *		struct epitem *epi;
	 *	};
     */

	error = -ENOMEM;
	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))	/* allocate an epitem */
		goto error_return;

	/* Item initialization follow here ... */
	INIT_LIST_HEAD(&epi->rdllink);
	INIT_LIST_HEAD(&epi->fllink);
	INIT_LIST_HEAD(&epi->pwqlist);
	epi->ep = ep;
	ep_set_ffd(&epi->ffd, tfile, fd);	/* fill in the epoll_filefd key */
	epi->event = *event;
	epi->nwait = 0;
	epi->next = EP_UNACTIVE_PTR;

    /*
     *	typedef struct poll_table_struct {
	 *		poll_queue_proc qproc;
	 *	} poll_table;
     *	typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
	 *	static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
	 *	{
	 *		pt->qproc = qproc;
	 *	}
	 */
    /* Initialize the poll table using the queue callback */
	epq.epi = epi;
	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

	/*
	 * tfile->f_op->poll(tfile, &epq.pt) ends up calling poll_wait():
	 * static inline void poll_wait(struct file * filp, wait_queue_head_t * \
	 * wait_address, poll_table* p)
	 * {
	 *		if (p && wait_address)
	 *			p->qproc(filp, wait_address, p);
	 * }
	 * Here qproc is ep_ptable_queue_proc(), which attaches the item to the
	 * target file wait queue. The call then returns the bit mask of the
	 * operations that could be performed immediately without blocking.
	 */
	revents = tfile->f_op->poll(tfile, &epq.pt);

	/*
	 * Check whether an error occurred while setting up the poll wait
	 * queue (ep_ptable_queue_proc() signals it with nwait == -1).
	 */
	if (epi->nwait < 0)
		goto error_unregister;

	/* Add the current item to the list of active epoll hook for this file */
	spin_lock(&tfile->f_ep_lock);
    /* Append epi->fllink to the tail of tfile->f_ep_links */
	list_add_tail(&epi->fllink, &tfile->f_ep_links);
	spin_unlock(&tfile->f_ep_lock);

	/*
	 * Add the current item to the RB tree. All RB tree operations are
	 * protected by "mtx", and ep_insert() is called with "mtx" held.
	 */
	ep_rbtree_insert(ep, epi);

	/* We have to drop the new item inside our item list to keep track of it */
	spin_lock_irqsave(&ep->lock, flags);

	/* If the file is already "ready" we drop it inside the ready list */
	if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
		list_add_tail(&epi->rdllink, &ep->rdllist);

		/* Notify waiting tasks that events are available */
		if (waitqueue_active(&ep->wq))
			wake_up_locked(&ep->wq);
		if (waitqueue_active(&ep->poll_wait))
			pwake++;
	}

	spin_unlock_irqrestore(&ep->lock, flags);

	/* We have to call this outside the lock */
	if (pwake)
		ep_poll_safewake(&psw, &ep->poll_wait);	/* safe wake up of the ->poll() waiters */

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
		     current, ep, tfile, fd));

	return 0;

error_unregister:
	ep_unregister_pollwait(ep, epi);

	/*
	 * We need to do this because an event could have been arrived on some
	 * allocated wait queue. Note that we don't care about the ep->ovflist
	 * list, since that is used/cleaned only inside a section bound by "mtx".
	 * And ep_insert() is called with "mtx" held.
	 */
	spin_lock_irqsave(&ep->lock, flags);
	if (ep_is_linked(&epi->rdllink))
		list_del_init(&epi->rdllink);
	spin_unlock_irqrestore(&ep->lock, flags);

	kmem_cache_free(epi_cache, epi);
error_return:
	return error;
}

epoll源代码分析_第7张图片

sys_epoll_wait

/*
 * Implement the event wait interface for the eventpoll file. It is the
 * kernel part of the user space epoll_wait(2).
 *
 * Returns the result of ep_poll(), or:
 *   -EINVAL  maxevents is out of range (returned before the exit trace),
 *            or epfd is not an eventpoll file
 *   -EFAULT  the user buffer is not writeable
 *   -EBADF   epfd is not an open descriptor
 */
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
			       int maxevents, int timeout)
{
	struct file *epfile;
	int error;

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
		     current, epfd, events, maxevents, timeout));

	/* The maximum number of event must be greater than zero */
	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
		return -EINVAL;

	/* Verify that the area passed by the user is writeable */
	error = -EFAULT;
	if (access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
		/* Get the "struct file *" for the eventpoll file */
		error = -EBADF;
		epfile = fget(epfd);
		if (epfile) {
			/*
			 * Only an eventpoll file carries a "struct eventpoll"
			 * in ->private_data; anything else is rejected.
			 */
			if (is_file_epoll(epfile))
				/* Time to fish for events ... */
				error = ep_poll(epfile->private_data, events,
						maxevents, timeout);
			else
				error = -EINVAL;

			fput(epfile);
		}
	}

	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
		     current, epfd, events, maxevents, timeout, error));

	return error;
}

ep_poll

/*
 * Wait for events on "ep" and transfer them to user space.
 *
 * @events:    user space buffer that receives the events
 * @maxevents: capacity of that buffer
 * @timeout:   timeout in milliseconds; a negative value, or one that is
 *             >= EP_MAX_MSTIMEO, means "wait without a timeout"
 *
 * Returns the value of ep_send_events() when events are available, 0 when
 * the timeout expires with nothing to report, or -EINTR when a signal is
 * pending while waiting.
 */
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
		   int maxevents, long timeout)
{
	int res, eavail;
	unsigned long flags;
	long jtimeout;
	wait_queue_t wait;

	/*
	 * Calculate the timeout by checking for the "infinite" value ( -1 )
	 * and the overflow condition. The passed timeout is in milliseconds,
	 * so it is converted to jiffies with (t * HZ + 999) / 1000, i.e.
	 * rounded up.
	 */
	jtimeout = (timeout < 0 || timeout >= EP_MAX_MSTIMEO) ?
		MAX_SCHEDULE_TIMEOUT : (timeout * HZ + 999) / 1000;

retry:
	spin_lock_irqsave(&ep->lock, flags);

	res = 0;
	if (list_empty(&ep->rdllist)) {
		/*
		 * We don't have any available event to return to the caller.
		 * We need to sleep here, and we will be woken up by
		 * ep_poll_callback() when events become available.
		 */
        /* Queue the current task on the eventpoll wait queue */
		init_waitqueue_entry(&wait, current);
		/* NOTE(review): presumably so a wake up wakes a single waiter — confirm */
		wait.flags |= WQ_FLAG_EXCLUSIVE;
		__add_wait_queue(&ep->wq, &wait);		/* ep->lock is held here */

		for (;;) {
			/*
			 * Mark the task interruptible before testing the
			 * condition, so that a wake up arriving in between is
			 * not lost.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (!list_empty(&ep->rdllist) || !jtimeout)
				break;
            /* Stop waiting if a signal is pending */
			if (signal_pending(current)) {
				res = -EINTR;
				break;
			}

			spin_unlock_irqrestore(&ep->lock, flags);
            /*
             * Give up the CPU until ep_poll_callback() wakes us up or
             * the timeout expires. The return value is the time left.
             */
			jtimeout = schedule_timeout(jtimeout);
			spin_lock_irqsave(&ep->lock, flags);
		}
		__remove_wait_queue(&ep->wq, &wait);

		set_current_state(TASK_RUNNING);
	}

	/* Is it worth to try to dig for events ? */
	eavail = !list_empty(&ep->rdllist);

	spin_unlock_irqrestore(&ep->lock, flags);

	/*
	 * Try to transfer events to user space. In case we get 0 events and
	 * there's still timeout left over, we go trying again in search of
	 * more luck.
	 */
	if (!res && eavail &&
	    !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
		goto retry;

	return res;
}

你可能感兴趣的:(Linux网络编程基础)