函数原型:
/* User-space prototype: fds is the array of watched descriptors, nfds is the
 * number of elements in fds, timeout is the wait limit in milliseconds. */
int poll (struct pollfd *fds, nfds_t nfds, int timeout);
其中struct pollfd结构体包含三个成员:fd、events和revents。
几个重要的结构体:
/* One element of the fds array passed to poll(). */
struct pollfd {
int fd; /* descriptor to watch */
short events; /* presumably the mask of requested events -- confirm against poll(2) */
short revents; /* presumably the mask of returned events -- confirm against poll(2) */
};
/*
 * Every select() call initializes one poll_wqueues structure that
 * corresponds to that call. Its polling_task field points to the
 * task_struct of the calling process (i.e. current).
 *
 * The private field of each wait_queue_t node placed on a resource's
 * wait queue points to the owning poll_wqueues object. When a resource
 * becomes ready, the poll_wqueues object is obtained from the node and
 * its polling_task field yields the caller's PCB, which is then woken.
 */
struct poll_wqueues {
poll_table pt; /* embedded poll_table; it carries the queueing callback pointer */
struct poll_table_page *table; /* most recently allocated entry page, NULL until poll_get_entry() allocates one */
struct task_struct *polling_task; /* task_struct of the calling process */
int triggered; /* initialised to 0; set to 1 by __pollwake() */
int error; /* initialised to 0; set to -ENOMEM by poll_get_entry() when a page cannot be allocated */
int inline_index; /* number of inline_entries slots already handed out */
struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES]; /* inline cache used before any page is allocated */
};
/* Table passed to poll_wait(): a queueing callback plus an event mask. */
typedef struct poll_table_struct {
poll_queue_proc _qproc; /* callback invoked by poll_wait() when it is non-NULL */
unsigned long _key; /* mask of the events the caller wants to listen for */
} poll_table;
/*
 * Signature of the queueing callback stored in a poll_table. poll_wait()
 * calls it with the file, the wait-queue head and the poll_table itself.
 */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *,
struct poll_table_struct *);
/* The structure that is actually hooked onto a resource's wait queue. */
struct poll_table_entry {
struct file *filp; /* file being watched */
unsigned long key; /* mask of awaited events, e.g. POLLIN, POLLOUT, POLLERR */
wait_queue_t wait; /* queue node; its private member links to the poll_wqueues that owns this entry */
wait_queue_head_t *wait_address; /* head of the resource's wait queue */
};
static inline void poll_wait(struct file * filp,ait_queue_head_t * wait_address,
poll_table *p)
{
if (p && p->_qproc && wait_address)
p->_qproc(filp, wait_address, p);
}
/* One dynamically allocated page of poll_table_entry slots, kept in a singly linked list. */
struct poll_table_page {
struct poll_table_page * next; /* page that was the list head before this one was allocated, or NULL */
struct poll_table_entry * entry; /* next unused slot; starts at entries and advances on each allocation */
struct poll_table_entry entries[0]; /* entry storage that follows the header */
};
一些重要函数
/*
 * poll_initwait - initialise a poll_wqueues before it is used.
 * @pwq: the structure to initialise
 *
 * Resets the bookkeeping fields, records the current task as the one to
 * wake, and installs __pollwait as the queueing callback of the embedded
 * poll_table.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	/* No dynamic pages yet and the inline cache is unused. */
	pwq->table = NULL;
	pwq->inline_index = 0;

	/* Nothing has fired and nothing has failed so far. */
	pwq->triggered = 0;
	pwq->error = 0;

	/* Remember which task has to be woken up. */
	pwq->polling_task = current;

	/* Point the poll_table's callback at __pollwait. */
	init_poll_funcptr(&pwq->pt, __pollwait);
}
/*
 * init_poll_funcptr - install a queueing callback in a poll_table.
 * @pt:    poll table to set up
 * @qproc: callback that poll_wait() will invoke
 *
 * The field names match the poll_table definition (_qproc / _key); the
 * event mask is set to all ones so that every event is enabled.
 */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
	pt->_qproc = qproc;
	pt->_key = ~0UL; /* all events enabled */
}
/* Add a new entry */
/**
 * __pollwait - hook a poll_table_entry onto the wait queue of a watched file
 * @filp:         the file being watched
 * @wait_address: head of the watched file's wait queue
 * @p:            the poll_table set up in poll_initwait()
 *
 * Returns silently when no poll_table_entry can be obtained; in that case
 * poll_get_entry() has already recorded the error in the poll_wqueues.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
		       poll_table *p)
{
	/* Recover the poll_wqueues that embeds this poll_table. */
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
	/* Take one poll_table_entry from the poll_wqueues. */
	struct poll_table_entry *entry = poll_get_entry(pwq);

	if (!entry)
		return;

	/* Take a reference on the watched file. */
	get_file(filp);
	/* Associate the entry with the watched file. */
	entry->filp = filp;
	/* Remember the head of the file's wait queue. */
	entry->wait_address = wait_address;
	/* Record the events we want to listen for (poll_table field is _key). */
	entry->key = p->_key;
	/*
	 * Initialise the wait-queue node with pollwake as its wake-up
	 * function. This is the key point: the wake-up callback is pollwake.
	 */
	init_waitqueue_func_entry(&entry->wait, pollwake);
	/*
	 * Why the node's private field is set up this way:
	 * 1. Actual kernel design:
	 *    every wait_queue_t's private field points to the same
	 *    poll_wqueues, and that shared poll_wqueues stores the pointer
	 *    to the caller's PCB: n + 1 pointers in total.
	 * 2. Hypothetical alternative:
	 *    every wait_queue_t's private field points to the caller's PCB
	 *    and each poll_table_entry additionally stores a pointer to the
	 *    same poll_wqueues: n + n pointers in total.
	 */
	entry->wait.private = pwq;
	/* Hook the poll_table_entry onto the file's wait queue. */
	add_wait_queue(wait_address, &entry->wait);
}
/*
 * poll_get_entry - hand out the next free poll_table_entry of a poll_wqueues.
 * @p: the poll_wqueues to allocate from
 *
 * Slots come from the inline cache first, then from the current page, and
 * finally from a freshly allocated page that becomes the new list head.
 * Returns NULL and sets p->error to -ENOMEM when a page cannot be allocated.
 */
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
	struct poll_table_page *page = p->table;

	/* The inline cache still has room: use it. */
	if (p->inline_index < N_INLINE_POLL_ENTRIES) {
		struct poll_table_entry *slot = &p->inline_entries[p->inline_index];

		p->inline_index++;
		return slot;
	}

	/* A page exists and is not full: take its next slot. */
	if (page && !POLL_TABLE_FULL(page))
		return page->entry++;

	/* No page yet, or the current one is used up: allocate one page. */
	page = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
	if (!page) {
		p->error = -ENOMEM;
		return NULL;
	}

	/* Push the new page onto the front of the list. */
	page->entry = page->entries;
	page->next = p->table;
	p->table = page;

	return page->entry++;
}
void poll_freewait(struct poll_wqueues *pwq)
{
struct poll_table_page * p = pwq->table;
int i;
/* 对缓存数组中的poll_table_entry进行卸载 */
for (i = 0; i < pwq->inline_index; i++)
free_poll_entry(pwq->inline_entries + i);
/* 对动态内存中的poll_table_entry进行卸载 */
while (p) {
struct poll_table_entry * entry;
struct poll_table_page *old;
entry = p->entry;
do {
/* 一个poll_table_page对象中至少分配了一个
* poll_table_entry,所以entry--是安全的 */
entry--;
free_poll_entry(entry);
} while (entry > p->entries);
old = p;
p = p->next;
free_page((unsigned long) old);
}
}
static void free_poll_entry(struct poll_table_entry *entry)
{
remove_wait_queue(entry->wait_address, &entry->wait);
fput(entry->filp);
}
/**
 * pollwake - wake-up callback installed on every poll_table_entry
 * @wait: the wait member of a poll_table_entry
 * @mode: passed through unchanged to __pollwake()
 * @sync: passed through unchanged to __pollwake()
 * @key:  carries the resource's current state
 *
 * Checks whether the resource's current state contains any event this entry
 * cares about. If not, the wake-up is ignored and 0 is returned; otherwise
 * the call is handed on to __pollwake().
 */
static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	/* Recover the poll_table_entry that embeds this wait-queue node. */
	struct poll_table_entry *pte =
		container_of(wait, struct poll_table_entry, wait);
	unsigned long ready = (unsigned long)key;

	/* A non-NULL key with none of the events we wait for: ignore it. */
	if (key != NULL && (ready & pte->key) == 0)
		return 0;

	/* At least one event of interest (or no key at all): do the wake-up. */
	return __pollwake(wait, mode, sync, key);
}
/*
 * __pollwake - the wake-up callback that does the real work once a resource
 * is ready. Marks the owning poll_wqueues as triggered and wakes the task
 * stored in its polling_task field; returns whatever
 * default_wake_function() returns.
 */
static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
/* See the note on the wait-queue node's private field in __pollwait(). */
struct poll_wqueues *pwq = wait->private;
/* Build a valid wait-queue node whose private field points to the calling task's PCB. */
DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);
/*
* Although this function is called under waitqueue lock, LOCK
* doesn't imply write barrier and the users expect write
* barrier semantics on wakeup functions. The following
* smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
* and is paired with set_mb() in poll_schedule_timeout.
*/
smp_wmb();
/* Set the "triggered" flag. */
pwq->triggered = 1;
/*
* Perform the default wake up operation using a dummy
* waitqueue.
*
* TODO: This is hacky but there currently is no interface to
* pass in @sync. @sync is scheduled to be removed and once
* that happens, wake_up_process() can be used directly.
*/
/* Wake the task recorded in pwq->polling_task (the select() caller). */
return default_wake_function(&dummy_wait, mode, sync, key);
}
nfds指定被监听事件集合fds的大小
/* Type of poll()'s nfds argument (the size of the fds set). */
typedef unsigned long int nfds_t;
timeout参数指定poll的超时值,单位是毫秒
当超时为负值(通常写作-1)时,轮询调用将一直阻塞,直到某个事件发生;当超时为0时,轮询调用立即返回。
poll函数将超时时间格式进行转换,然后调用do_sys_poll函数
/*
 * poll system call entry point.
 * @ufds:          user-space array of struct pollfd
 * @nfds:          number of elements in @ufds
 * @timeout_msecs: timeout in milliseconds; a negative value means no deadline
 *
 * Converts the millisecond timeout into an end time and calls
 * do_sys_poll(). When that call returns -EINTR, the arguments are saved in
 * the thread's restart block (with do_restart_poll as the restart function)
 * and -ERESTART_RESTARTBLOCK is returned instead.
 */
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
		int, timeout_msecs)
{
	struct timespec end_time, *to = NULL;
	int ret;

	/* A negative timeout leaves 'to' NULL, so no deadline is passed on. */
	if (timeout_msecs >= 0) {
		to = &end_time;
		poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC,
			NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC));
	}

	ret = do_sys_poll(ufds, nfds, to);

	if (ret == -EINTR) {
		struct restart_block *restart_block;

		/* Save what do_restart_poll needs to resume this call. */
		restart_block = &current_thread_info()->restart_block;
		restart_block->fn = do_restart_poll;
		restart_block->poll.ufds = ufds;
		restart_block->poll.nfds = nfds;

		if (timeout_msecs >= 0) {
			/* Keep the computed end time for the restart. */
			restart_block->poll.tv_sec = end_time.tv_sec;
			restart_block->poll.tv_nsec = end_time.tv_nsec;
			restart_block->poll.has_timeout = 1;
		} else
			restart_block->poll.has_timeout = 0;

		ret = -ERESTART_RESTARTBLOCK;
	}
	return ret;
}