The poll call implements the same functionality as the select call; both are mechanisms for multiplexing network I/O. Let's first look at how poll is invoked.
1. The poll call
#include <poll.h>

int poll(struct pollfd fds[], nfds_t nfds, int timeout);

The struct pollfd structure is as follows (defined in the source file poll.h):
struct pollfd {
    int fd;
    short events;
    short revents;
};
In this structure, fd is the file descriptor, events is the set of events we ask to be monitored, and revents is the set of events reported back after the check: when a file descriptor has had a state change, revents comes back nonzero.
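To make the three fields concrete, here is a minimal user-space sketch of a poll call. The descriptor being watched (stdin, standing in for a socket) and the 5-second timeout are illustration values only and are not taken from the kernel code discussed below.

#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    struct pollfd fds[1];

    fds[0].fd = STDIN_FILENO;    /* the descriptor to watch                  */
    fds[0].events = POLLIN;      /* event we ask to be monitored: readable   */
    fds[0].revents = 0;          /* filled in by the kernel on return        */

    int n = poll(fds, 1, 5000);  /* block for at most 5000 ms                */
    if (n < 0)
        perror("poll");
    else if (n == 0)
        printf("timeout, nothing happened\n");
    else if (fds[0].revents & POLLIN)
        printf("fd %d is readable\n", fds[0].fd);
    return 0;
}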
2. Parameter handling

The corresponding system call entry, sys_poll, is implemented as follows:

asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
                         long timeout_msecs)
{
    s64 timeout_jiffies;
    int ret;

    if (timeout_msecs > 0) {
#if HZ > 1000
        /* We can only overflow if HZ > 1000 */
        if (timeout_msecs / 1000 > (s64)0x7fffffffffffffffULL / (s64)HZ)
            timeout_jiffies = -1;
        else
#endif
            timeout_jiffies = msecs_to_jiffies(timeout_msecs);
    } else {
        /* Infinite (< 0) or no (0) timeout */
        timeout_jiffies = timeout_msecs;
    }

    ret = do_sys_poll(ufds, nfds, &timeout_jiffies);
    if (ret == -EINTR) {
        struct restart_block *restart_block;

        restart_block = &current_thread_info()->restart_block;
        restart_block->fn = do_restart_poll;
        restart_block->arg0 = (unsigned long)ufds;
        restart_block->arg1 = nfds;
        restart_block->arg2 = timeout_jiffies & 0xFFFFFFFF;
        restart_block->arg3 = (u64)timeout_jiffies >> 32;
        ret = -ERESTART_RESTARTBLOCK;
    }
    return ret;
}

This function is fairly easy to understand; it does three things:
1. It converts the millisecond timeout into jiffies, guarding against overflow when HZ > 1000 (a negative value means wait forever, 0 means do not wait at all).
2. It calls do_sys_poll, which does the real work.
3. If do_sys_poll was interrupted by a signal (-EINTR), it saves the arguments in the thread's restart_block so the call can be restarted transparently, and returns -ERESTART_RESTARTBLOCK.
int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout)
{
    struct poll_wqueues table;
    int err = -EFAULT, fdcount, len, size;
    /* Allocate small arguments on the stack to save memory and be
       faster - use long to make sure the buffer is aligned properly
       on 64 bit archs to avoid unaligned access */
    long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
    struct poll_list *const head = (struct poll_list *)stack_pps;
    struct poll_list *walk = head;
    unsigned long todo = nfds;

    if (nfds > current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
        return -EINVAL;

    len = min_t(unsigned int, nfds, N_STACK_PPS);
    for (;;) {
        walk->next = NULL;
        walk->len = len;
        if (!len)
            break;

        if (copy_from_user(walk->entries, ufds + nfds-todo,
                           sizeof(struct pollfd) * walk->len))
            goto out_fds;

        todo -= walk->len;
        if (!todo)
            break;

        len = min(todo, POLLFD_PER_PAGE);
        size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
        walk = walk->next = kmalloc(size, GFP_KERNEL);
        if (!walk) {
            err = -ENOMEM;
            goto out_fds;
        }
    }

    poll_initwait(&table);
    fdcount = do_poll(nfds, head, &table, timeout);
    poll_freewait(&table);

    for (walk = head; walk; walk = walk->next) {
        struct pollfd *fds = walk->entries;
        int j;

        for (j = 0; j < walk->len; j++, ufds++)
            if (__put_user(fds[j].revents, &ufds->revents))
                goto out_fds;
    }

    err = fdcount;
out_fds:
    walk = head->next;
    while (walk) {
        struct poll_list *pos = walk;

        walk = walk->next;
        kfree(pos);
    }
    return err;
}

To speed things up and improve performance, a fixed on-stack buffer is used first. Its size is POLL_STACK_ALLOC, which is 256 on my system; this 256-byte stack buffer is treated as a struct poll_list and used to store the socket descriptors that need to be checked. struct poll_list looks like this:
struct poll_list {
    struct poll_list *next;
    int len;
    struct pollfd entries[0];
};

As you can see, entries is a flexible array of struct pollfd, which should look familiar: it is exactly where the socket descriptors to be checked by the poll call are stored. So how many struct pollfd can the stack buffer allocated above hold? That is computed as follows:
len = min_t(unsigned int, nfds, N_STACK_PPS);

Here N_STACK_PPS is precisely the number of struct pollfd that the fixed stack buffer mentioned above can hold:
#define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \
                     sizeof(struct pollfd))

Then len struct pollfd entries are copied from user space into the kernel. A careful reader will wonder: what happens when nfds is larger than N_STACK_PPS? Note that the function above runs a loop: if nfds is larger than N_STACK_PPS (in practice it usually is), more memory is requested and copying continues. That is this code fragment:
        len = min(todo, POLLFD_PER_PAGE);
        size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
        walk = walk->next = kmalloc(size, GFP_KERNEL);
        if (!walk) {
            err = -ENOMEM;
            goto out_fds;
        }
    }

POLLFD_PER_PAGE is the number of struct pollfd that fit in one page of memory. You can do the math: a page is 4 KB and a struct pollfd occupies 8 bytes, so one page can hold close to 512 socket descriptors. If one extra page is still not enough for nfds, that is fine: the loop does not exit, another page is allocated, and all the allocated blocks are chained together through struct poll_list. As seen above, the structure has a next field for exactly this purpose.
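To make those two limits concrete, here is a small user-space sketch that redoes the arithmetic with mirror definitions of the two structures. It assumes a 64-bit ABI and 4 KB pages, so the exact numbers (30 and 510 here) may differ on other configurations.

#include <stdio.h>

/* Mirrors of the kernel structures, used only to reproduce the size math. */
struct pollfd_mirror    { int fd; short events; short revents; };
struct poll_list_mirror { struct poll_list_mirror *next; int len;
                          struct pollfd_mirror entries[]; };

int main(void)
{
    const unsigned long stack_bytes = 256;   /* POLL_STACK_ALLOC on the author's system */
    const unsigned long page_bytes  = 4096;  /* assumed PAGE_SIZE                       */

    /* N_STACK_PPS: pollfds that fit in the fixed on-stack buffer. */
    printf("N_STACK_PPS     = %lu\n",
           (unsigned long)((stack_bytes - sizeof(struct poll_list_mirror)) /
                           sizeof(struct pollfd_mirror)));
    /* POLLFD_PER_PAGE: pollfds that fit in each additionally allocated page. */
    printf("POLLFD_PER_PAGE = %lu\n",
           (unsigned long)((page_bytes - sizeof(struct poll_list_mirror)) /
                           sizeof(struct pollfd_mirror)));
    return 0;
}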
poll_initwait(&table);
fdcount = do_poll(nfds, head, &table, timeout);
poll_freewait(&table);

This is the most important part, because what comes after it is fairly easy to understand: once these three calls are done, do_sys_poll only does two more things:
1. It copies each descriptor's revents result back to user space with __put_user.
2. It frees the poll_list blocks that were allocated with kmalloc earlier.

Let's look at the three calls themselves. poll_initwait operates on a struct poll_wqueues; that structure and the poll_table it embeds are defined as follows:
struct poll_wqueues {
    poll_table pt;
    struct poll_table_page *table;
    int error;
    int inline_index;
    struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};

typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *,
                                struct poll_table_struct *);

typedef struct poll_table_struct {
    poll_queue_proc qproc;
} poll_table;

The poll_initwait function is as follows:
void poll_initwait(struct poll_wqueues *pwq)
{
    /* Set the qproc function pointer in the poll_table to __pollwait,
       i.e. pwq->pt.qproc = __pollwait. This is a callback; the whole
       mechanism essentially rests on this callback. */
    init_poll_funcptr(&pwq->pt, __pollwait);
    pwq->error = 0;
    pwq->table = NULL;
    pwq->inline_index = 0;
}

So poll_initwait simply initializes the poll_wqueues table, the main point being to set the function pointer in its poll_table to __pollwait. What does that function do? Before answering, let's look at the function called after poll_initwait, namely do_poll, which is implemented as follows:
static int do_poll(unsigned int nfds, struct poll_list *list,
                   struct poll_wqueues *wait, s64 *timeout)
{
    int count = 0;
    poll_table *pt = &wait->pt;

    /* Optimise the no-wait case */
    if (!(*timeout))
        pt = NULL;

    for (;;) {
        struct poll_list *walk;
        long __timeout;

        set_current_state(TASK_INTERRUPTIBLE);
        for (walk = list; walk != NULL; walk = walk->next) {
            struct pollfd *pfd, *pfd_end;

            pfd = walk->entries;
            pfd_end = pfd + walk->len;
            for (; pfd != pfd_end; pfd++) {
                /*
                 * Fish for events. If we found one, record it
                 * and kill the poll_table, so we don't
                 * needlessly register any other waiters after
                 * this. They'll get immediately deregistered
                 * when we break out and return.
                 */
                if (do_pollfd(pfd, pt)) {
                    count++;
                    pt = NULL;
                }
            }
        }
        /*
         * All waiters have already been registered, so don't provide
         * a poll_table to them on the next loop iteration.
         */
        pt = NULL;
        if (!count) {
            count = wait->error;
            if (signal_pending(current))
                count = -EINTR;
        }
        if (count || !*timeout)
            break;

        if (*timeout < 0) {
            /* Wait indefinitely */
            __timeout = MAX_SCHEDULE_TIMEOUT;
        } else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT-1)) {
            /*
             * Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in
             * a loop
             */
            __timeout = MAX_SCHEDULE_TIMEOUT - 1;
            *timeout -= __timeout;
        } else {
            __timeout = *timeout;
            *timeout = 0;
        }

        __timeout = schedule_timeout(__timeout);
        if (*timeout >= 0)
            *timeout += __timeout;
    }
    __set_current_state(TASK_RUNNING);
    return count;
}

A few points about this function are worth noting:
1. Before each scan it sets the current task to TASK_INTERRUPTIBLE, so that if nothing is ready the process can later go to sleep and be woken by an event or a signal.
2. It walks every struct pollfd on the poll_list chain and calls do_pollfd on it; that is where the per-descriptor check happens.
3. As soon as one descriptor reports an event (and in any case after the first full pass), pt is set to NULL, so subsequent do_pollfd calls no longer register the process on any more wait queues.
4. If nothing is ready and the timeout has not expired, the process sleeps in schedule_timeout, and the loop starts over when it is woken.

The per-descriptor check, do_pollfd, is implemented as follows:
static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
{
    unsigned int mask;
    int fd;

    mask = 0;
    fd = pollfd->fd;
    if (fd >= 0) {
        int fput_needed;
        struct file *file;

        file = fget_light(fd, &fput_needed);
        mask = POLLNVAL;
        if (file != NULL) {
            mask = DEFAULT_POLLMASK;
            if (file->f_op && file->f_op->poll)
                mask = file->f_op->poll(file, pwait);
            /* Mask out unneeded events. */
            mask &= pollfd->events | POLLERR | POLLHUP;
            fput_light(file, fput_needed);
        }
    }
    pollfd->revents = mask;
    return mask;
}

This function is straightforward: it first looks up the struct file *file that the process associates with the socket descriptor (or file handle), and then makes the core call file->f_op->poll(file, pwait). This is part of the Linux VFS: which implementation runs depends on what kind of file this is. If file is a socket, the poll provided by the networking code is called; if file was opened on a filesystem such as ext3, the poll implemented by that filesystem is called. The contract every f_op->poll method has to honour is small and is sketched below; tcp_poll, shown right after, is a real instance of it, and we use it to see what a typical poll method does:
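The following is a minimal sketch of such a poll method for a hypothetical character device (demo_dev, demo_poll and their fields are invented for illustration; only poll_wait, struct file and the POLL* masks are the real kernel API): the method registers the caller on the object's wait queue and returns a mask of the events that are ready right now, without ever blocking by itself. The sleeping, if any, is done later by do_poll.

#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/wait.h>

/* Hypothetical per-device state; dev->wq is assumed to have been set up with
 * init_waitqueue_head() when the device was created. */
struct demo_dev {
    wait_queue_head_t wq;
    int readable;
    int writable;
};

static unsigned int demo_poll(struct file *file, poll_table *wait)
{
    struct demo_dev *dev = file->private_data;
    unsigned int mask = 0;

    /* Ends up in __pollwait(), which hangs the calling process on dev->wq. */
    poll_wait(file, &dev->wq, wait);

    if (dev->readable)
        mask |= POLLIN | POLLRDNORM;
    if (dev->writable)
        mask |= POLLOUT | POLLWRNORM;
    return mask;
}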
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
    unsigned int mask;
    struct sock *sk = sock->sk;
    struct tcp_sock *tp = tcp_sk(sk);

    poll_wait(file, sk->sk_sleep, wait);
    if (sk->sk_state == TCP_LISTEN)
        return inet_csk_listen_poll(sk);

    /* Socket is not locked. We are protected from async events
       by poll logic and correct handling of state changes
       made by another threads is impossible in any case.
     */
    mask = 0;
    if (sk->sk_err)
        mask = POLLERR;

    /*
     * POLLHUP is certainly not done right. But poll() doesn't
     * have a notion of HUP in just one direction, and for a
     * socket the read side is more interesting.
     *
     * Some poll() documentation says that POLLHUP is incompatible
     * with the POLLOUT/POLLWR flags, so somebody should check this
     * all. But careful, it tends to be safer to return too many
     * bits than too few, and you can easily break real applications
     * if you don't tell them that something has hung up!
     *
     * Check-me.
     *
     * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
     * our fs/select.c). It means that after we received EOF,
     * poll always returns immediately, making impossible poll() on write()
     * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
     * if and only if shutdown has been made in both directions.
     * Actually, it is interesting to look how Solaris and DUX
     * solve this dilemma. I would prefer, if PULLHUP were maskable,
     * then we could set it on SND_SHUTDOWN. BTW examples given
     * in Stevens' books assume exactly this behaviour, it explains
     * why PULLHUP is incompatible with POLLOUT. --ANK
     *
     * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
     * blocking on fresh not-connected or disconnected socket. --ANK
     */
    if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
        mask |= POLLHUP;
    if (sk->sk_shutdown & RCV_SHUTDOWN)
        mask |= POLLIN | POLLRDNORM | POLLRDHUP;

    /* Connected? */
    if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
        /* Potential race condition. If read of tp below will
         * escape above sk->sk_state, we can be illegally awaken
         * in SYN_* states. */
        if ((tp->rcv_nxt != tp->copied_seq) &&
            (tp->urg_seq != tp->copied_seq ||
             tp->rcv_nxt != tp->copied_seq + 1 ||
             sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data))
            mask |= POLLIN | POLLRDNORM;

        if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
            if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
                mask |= POLLOUT | POLLWRNORM;
            } else {  /* send SIGIO later */
                set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

                /* Race breaker. If space is freed after
                 * wspace test but before the flags are set,
                 * IO signal will be lost.
                 */
                if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
                    mask |= POLLOUT | POLLWRNORM;
            }
        }

        if (tp->urg_data & TCP_URG_VALID)
            mask |= POLLPRI;
    }
    return mask;
}

The tcp_poll above looks long, but the core call is:
poll_wait(file, sk->sk_sleep, wait);
struct sock *sk = sock->sk;

struct sock is the kernel's representation of a socket connection, and sk->sk_sleep has type wait_queue_head_t: it is the socket's wait queue. Every socket has its own wait queue, maintained by the kernel structure struct sock.
static inline void poll_wait(struct file *filp, wait_queue_head_t *wait_address,
                             poll_table *p)
{
    if (p && wait_address)
        p->qproc(filp, wait_address, p);
}

Now comes the turning point: we said earlier that the table's function pointer was initialized to __pollwait, so what actually runs here is __pollwait(filp, wait_address, p). The arguments are the process's struct file for this descriptor, the wait queue (wait_queue_head_t) of the socket or device, and the poll_table.
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
                       poll_table *p)
{
    struct poll_table_entry *entry = poll_get_entry(p);

    if (!entry)
        return;
    get_file(filp);
    entry->filp = filp;
    entry->wait_address = wait_address;
    init_waitqueue_entry(&entry->wait, current);
    add_wait_queue(wait_address, &entry->wait);
}

Let's now analyse what the kernel has done once __pollwait completes. First look at poll_get_entry(p):
static struct poll_table_entry *poll_get_entry(poll_table *_p)
{
    struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt);
    struct poll_table_page *table = p->table;

    if (p->inline_index < N_INLINE_POLL_ENTRIES)
        return p->inline_entries + p->inline_index++;

    if (!table || POLL_TABLE_FULL(table)) {
        struct poll_table_page *new_table;

        new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
        if (!new_table) {
            p->error = -ENOMEM;
            __set_current_state(TASK_RUNNING);
            return NULL;
        }
        new_table->entry = new_table->entries;
        new_table->next = table;
        p->table = new_table;
        table = new_table;
    }

    return table->entry++;
}

This function creates struct poll_table_page structures as needed. Since __pollwait is called many times during one poll, there may be several struct poll_table_page structures; each one is a wrapper around an array of struct poll_table_entry, as shown below:
struct poll_table_page {
    struct poll_table_page *next;
    struct poll_table_entry *entry;
    struct poll_table_entry entries[0];
};

struct poll_table_entry {
    struct file *filp;
    wait_queue_t wait;
    wait_queue_head_t *wait_address;
};

So poll_get_entry returns a fresh poll_table_entry, and one is produced on every call to __pollwait. Next, init_waitqueue_entry binds this newly created struct poll_table_entry to the current process, and add_wait_queue then adds the struct poll_table_entry to the socket's wait queue. This is what ties the current process to the socket's wait queue; plainly put, current is hung on that wait queue.
static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
    q->flags = 0;
    q->private = p;
    q->func = default_wake_function;
}

Note that this step also registers the wake-up function that will be invoked when data becomes ready:
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
{
    return try_to_wake_up(curr->private, mode, sync);
}

That completes the call. Now let's step back and review the whole flow.
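As part of that review, one piece the code above does not show is who eventually wakes the sleeping process. The sketch below is hypothetical (demo_dev and demo_data_arrived are invented names; wake_up_interruptible and the wait-queue entries are the real mechanism, but this is not the actual TCP receive path): when data arrives, the owner of the wait queue wakes it, which walks the entries queued by __pollwait and calls their func, i.e. default_wake_function, so the process sleeping in schedule_timeout inside do_poll becomes runnable and re-scans its descriptors. For a real TCP socket the analogous step is the protocol's data-ready callback waking sk->sk_sleep.

#include <linux/sched.h>
#include <linux/wait.h>

/* Hypothetical device standing in for the socket that owns sk->sk_sleep. */
struct demo_dev {
    wait_queue_head_t wq;    /* pollers were hung here by __pollwait() */
    int data_ready;
};

/* Imagined receive path, called when new data shows up for this device. */
static void demo_data_arrived(struct demo_dev *dev)
{
    dev->data_ready = 1;
    /* Walks dev->wq and invokes each queued entry's ->func; for entries added
     * by __pollwait() that is default_wake_function(), which calls
     * try_to_wake_up() on the process recorded in the entry's private field. */
    wake_up_interruptible(&dev->wq);
}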