select 系统调用原型如下所示:
int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);
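这个调用的参数的意思如下:nfds 是三个集合中最大的文件描述符加 1;readfds、writefds、exceptfds 分别是要检查可读、可写和异常条件的文件描述符集合(可以为 NULL);timeout 是最长等待时间,为 NULL 表示一直等待。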
select 的行为依赖于设备驱动程序的实现:驱动程序需要实现 poll 函数,select 正是通过调用它来查询设备状态的。下面我将分析 select 的调用过程。select 对应的系统调用为 sys_select,该函数位于 fs/select.c 文件中。
在分析具体的源码之前,先对整个调用有个了解,下面的源码将会简单的以这个控制路径为主线。
sys_select:处理传入的超时参数(检查其合法性并换算成时钟滴答数),然后调用core_sys_select,最后把剩余的时间传回用户空间。
core_sys_select:准备文件描述符位图,调用do_select。
do_select:做select/poll的实际工作。依次调用每个文件描述符对应的poll函数(对于socket即sock_poll),在没有描述符就绪时把自己挂起等待。
sock_poll:用函数指针分派到具体的协议层函数tcp_poll、udp_poll、datagram_poll。
/*
 * sys_select - entry point of the select() system call.
 * @n:   number of descriptors to examine (highest fd + 1)
 * @inp: user-space set of descriptors to check for readability, or NULL
 * @outp: user-space set of descriptors to check for writability, or NULL
 * @exp: user-space set of descriptors to check for exceptions, or NULL
 * @tvp: user-space timeout, or NULL to wait indefinitely
 *
 * Converts the user timeout into ticks (HZ units), hands the real work to
 * core_sys_select(), then writes the remaining time back to *tvp.
 *
 * Returns the value produced by core_sys_select(), -EFAULT if the timeout
 * cannot be read from user space, or -EINVAL if it has a negative field.
 */
asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
			fd_set __user *exp, struct timeval __user *tvp)
{
	s64 timeout = -1;	/* negative means "wait forever" */
	struct timeval tv;
	int ret;

	if (tvp) {
		if (copy_from_user(&tv, tvp, sizeof(tv)))
			return -EFAULT;

		if (tv.tv_sec < 0 || tv.tv_usec < 0)
			return -EINVAL;

		/*
		 * A timeout too large to be expressed in ticks is treated
		 * as an infinite wait; anything else is converted to ticks,
		 * rounding the microsecond part up.
		 */
		/* Cast to u64 to make GCC stop complaining */
		if ((u64)tv.tv_sec >= (u64)MAX_INT64_SECONDS)
			timeout = -1;	/* infinite */
		else {
			timeout = DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC/HZ);
			timeout += tv.tv_sec * HZ;
		}
	}

	ret = core_sys_select(n, inp, outp, exp, &timeout);

	if (tvp) {
		struct timeval rtv;

		if (current->personality & STICKY_TIMEOUTS)
			goto sticky;

		/* Split the remaining ticks back into seconds + microseconds. */
		rtv.tv_usec = jiffies_to_usecs(do_div((*(u64*)&timeout), HZ));
		rtv.tv_sec = timeout;

		/* Never report more time left than the caller asked for. */
		if (timeval_compare(&rtv, &tv) >= 0)
			rtv = tv;

		if (copy_to_user(tvp, &rtv, sizeof(rtv))) {
sticky:
			/*
			 * If an application puts its timeval in read-only
			 * memory, we don't want the Linux-specific update to
			 * the timeval to cause a fault after the select has
			 * completed successfully. However, because we're not
			 * updating the timeval, we can't restart the system
			 * call.
			 */
			if (ret == -ERESTARTNOHAND)
				ret = -EINTR;
		}
	}

	return ret;
}
/*
 * core_sys_select - prepare the descriptor bitmaps and run do_select().
 * @n:       number of descriptors to examine; clamped to the size of the
 *           current process's descriptor table
 * @inp:     user-space "read" set, or NULL
 * @outp:    user-space "write" set, or NULL
 * @exp:     user-space "exception" set, or NULL
 * @timeout: in/out timeout passed through to do_select()
 *
 * Six bitmaps are needed: three copied in from user space and three result
 * bitmaps that are copied back out on success.
 *
 * Returns the count reported by do_select() on success, -EINVAL when n is
 * negative, -ENOMEM when the bitmaps cannot be allocated, the error from
 * get_fd_set() when a set cannot be read, -ERESTARTNOHAND when nothing is
 * ready and a signal is pending, or -EFAULT when a result set cannot be
 * written back.
 */
static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
			   fd_set __user *exp, s64 *timeout)
{
	fd_set_bits fds;
	void *bits;
	int ret, max_fds;
	unsigned int size;
	struct fdtable *fdt;
	/* Allocate small arguments on the stack to save memory and be faster */
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

	ret = -EINVAL;
	if (n < 0)
		goto out_nofds;

	/* max_fds can increase, so grab it once to avoid race */
	rcu_read_lock();
	fdt = files_fdtable(current->files);	/* descriptor table of the current process */
	max_fds = fdt->max_fds;			/* limit taken from that table */
	rcu_read_unlock();
	if (n > max_fds)
		n = max_fds;	/* never examine more descriptors than the table holds */

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 */
	size = FDS_BYTES(n);	/* one bit per descriptor, rounded up to whole longs */
	bits = stack_fds;
	if (size > sizeof(stack_fds) / 6) {
		/* Not enough space in on-stack array; must use kmalloc */
		ret = -ENOMEM;
		bits = kmalloc(6 * size, GFP_KERNEL);
		if (!bits)
			goto out_nofds;
	}

	/* Carve the buffer into the six bitmaps. */
	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;

	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	ret = do_select(n, &fds, timeout);	/* the actual polling loop */

	if (ret < 0)
		goto out;
	if (!ret) {
		/* Nothing ready: report a restartable error if a signal is pending. */
		ret = -ERESTARTNOHAND;
		if (signal_pending(current))
			goto out;
		ret = 0;
	}

	/* set_fd_set() copies each result bitmap back to user space. */
	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	if (bits != stack_fds)
		kfree(bits);
out_nofds:
	return ret;
}
/*
 * Sizing helpers for descriptor bitmaps, which are stored as arrays of long.
 * Each preprocessor directive must sit on its own line; otherwise only the
 * first macro is defined and the rest becomes part of its replacement text.
 */

/* Number of descriptor bits held in one long. */
#define FDS_BITPERLONG	(8*sizeof(long))

/* Number of longs needed to hold nr descriptor bits (rounded up). */
#define FDS_LONGS(nr)	(((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)

/* Number of bytes needed to hold nr descriptor bits, in whole longs. */
#define FDS_BYTES(nr)	(FDS_LONGS(nr)*sizeof(long))
/*
 * set_fd_set - copy a result bitmap back to user space.
 * @nr:     number of descriptor bits covered by the bitmap
 * @ufdset: destination in user space; NULL means the caller did not
 *          supply this set and nothing is copied
 * @fdset:  kernel-side bitmap to copy from
 *
 * Returns 0 when there is nothing to copy, otherwise whatever
 * __copy_to_user() returns for FDS_BYTES(nr) bytes.
 */
static inline unsigned long __must_check
set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	if (!ufdset)
		return 0;

	return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
}
/*
 * do_select - the polling loop behind select().
 * @n:       number of descriptors to examine
 * @fds:     the three input bitmaps and the three result bitmaps
 * @timeout: in/out timeout in ticks; 0 means "do not sleep", a negative
 *           value means "wait indefinitely"
 *
 * Repeatedly calls the poll method of every requested descriptor and records
 * the ready ones in the result bitmaps. If nothing is ready the task sleeps
 * via schedule_timeout() and scans again after waking up.
 *
 * Returns the number of ready (descriptor, condition) pairs, 0 if none
 * became ready, a negative value from max_select_fd(), or table.error.
 */
int do_select(int n, fd_set_bits *fds, s64 *timeout)
{
	struct poll_wqueues table;
	poll_table *wait;	/* handed to each poll method so it can queue us */
	int retval, i;

	rcu_read_lock();
	retval = max_select_fd(n, fds);	/* only consider descriptors that are open */
	rcu_read_unlock();

	if (retval < 0)
		return retval;
	n = retval;

	poll_initwait(&table);	/* initialise the wait-queue bookkeeping */
	wait = &table.pt;
	if (!*timeout)
		wait = NULL;	/* zero timeout: never sleep, so never queue */
	retval = 0;

	/*
	 * Endless loop; it is left when something is ready, the timeout has
	 * run out, a signal is pending, or table.error is set.
	 */
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
		long __timeout;

		/*
		 * Mark the current task TASK_INTERRUPTIBLE before scanning;
		 * it is set back to TASK_RUNNING after the loop.
		 */
		set_current_state(TASK_INTERRUPTIBLE);

		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		/* Walk the bitmaps one long (__NFDBITS descriptors) at a time. */
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			const struct file_operations *f_op = NULL;
			struct file *file = NULL;

			in = *inp++; out = *outp++; ex = *exp++;
			all_bits = in | out | ex;
			if (all_bits == 0) {
				/* Nothing requested in this word: skip it whole. */
				i += __NFDBITS;
				continue;
			}

			for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
				int fput_needed;

				if (i >= n)
					break;
				if (!(bit & all_bits))
					continue;

				/* Look up the struct file for descriptor i. */
				file = fget_light(i, &fput_needed);
				if (file) {
					f_op = file->f_op;
					mask = DEFAULT_POLLMASK;
					/*
					 * For a socket descriptor f_op->poll is
					 * sock_poll. The wait table is passed
					 * only while nothing has been found ready
					 * (retval == 0); afterwards NULL is passed
					 * so no further queueing takes place.
					 */
					if (f_op && f_op->poll)
						mask = (*f_op->poll)(file, retval ? NULL : wait);
					/* Release the file obtained by fget_light(). */
					fput_light(file, fput_needed);

					/* Record each requested condition that is ready. */
					if ((mask & POLLIN_SET) && (in & bit)) {
						res_in |= bit;
						retval++;
					}
					if ((mask & POLLOUT_SET) && (out & bit)) {
						res_out |= bit;
						retval++;
					}
					if ((mask & POLLEX_SET) && (ex & bit)) {
						res_ex |= bit;
						retval++;
					}
				}
				/* Give other tasks a chance to run between descriptors. */
				cond_resched();
			}

			/* Write the results for this word back into the bitmaps. */
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
		}

		/* Queueing is only done on the first pass. */
		wait = NULL;
		if (retval || !*timeout || signal_pending(current))
			break;
		if (table.error) {
			retval = table.error;
			break;
		}

		if (*timeout < 0) {
			/* Wait indefinitely */
			__timeout = MAX_SCHEDULE_TIMEOUT;
		} else if (unlikely(*timeout >= (s64)MAX_SCHEDULE_TIMEOUT - 1)) {
			/* Wait for longer than MAX_SCHEDULE_TIMEOUT. Do it in a loop */
			__timeout = MAX_SCHEDULE_TIMEOUT - 1;
			*timeout -= __timeout;
		} else {
			/* Sleep for the whole remaining time in one go. */
			__timeout = *timeout;
			*timeout = 0;
		}

		/* Sleep; the return value is the time that was left over. */
		__timeout = schedule_timeout(__timeout);
		if (*timeout >= 0)
			*timeout += __timeout;
	}

	/* Make the task runnable again. */
	__set_current_state(TASK_RUNNING);

	/* Tear down the wait-queue entries created during polling. */
	poll_freewait(&table);

	return retval;
}
/*
 * sock_poll - poll method used for socket files.
 *
 * Fetches the socket stored in file->private_data and forwards the request
 * to that socket's protocol-specific poll operation, returning its mask.
 * Errors cannot be reported through poll, so the answer is only ever a
 * readiness mask. No kernel lock is held here.
 */
static unsigned int sock_poll(struct file *file, poll_table *wait)
{
	struct socket *s = file->private_data;

	return s->ops->poll(file, s, wait);
}
/* Excerpt: the two locals declared at the top of do_select() - the wait-queue bookkeeping and the poll_table pointer handed to each poll method. */
struct poll_wqueues table; poll_table *wait;
/*
 * Callback type invoked through poll_wait(): receives the file being
 * polled, the wait queue head to register on, and the poll_table itself.
 */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

/* Handle passed to every poll method; it only carries the callback. */
typedef struct poll_table_struct {
	poll_queue_proc qproc;	/* set by init_poll_funcptr() */
} poll_table;

/* Per-call bookkeeping for select/poll; set up by poll_initwait(). */
struct poll_wqueues {
	poll_table pt;			/* do_select() passes &pt to the poll methods */
	struct poll_table_page * table;	/* starts out NULL; presumably extra entry pages - TODO confirm */
	int error;			/* starts at 0; do_select() returns it when non-zero */
	int inline_index;		/* starts at 0; presumably next free slot in inline_entries - TODO confirm */
	struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};
/* Excerpt from do_select(): initialise the table, then point wait at its embedded poll_table. */
poll_initwait(&table); wait = &table.pt;

/*
 * poll_initwait - initialise a poll_wqueues before polling.
 * Installs __pollwait as the queueing callback and resets the error,
 * table and inline_index fields.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	init_poll_funcptr(&pwq->pt, __pollwait);
	pwq->error = 0;
	pwq->table = NULL;
	pwq->inline_index = 0;
}

/* init_poll_funcptr - store qproc as the callback of the given poll_table. */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
	pt->qproc = qproc;
}
/* Head of a wait queue, e.g. the inq/outq members polled by scull_p_poll(). */
struct __wait_queue_head {
	spinlock_t lock;		/* presumably protects task_list - TODO confirm */
	struct list_head task_list;	/* presumably the list of queued wait entries - TODO confirm */
};
typedef struct __wait_queue_head wait_queue_head_t;
/*
 * scull_p_poll - poll method of the scull pipe device.
 * @filp: file being polled; its private_data is the struct scull_pipe
 * @wait: poll table supplied by the caller (may be NULL)
 *
 * Registers on both the read and the write wait queue of the device and
 * returns a mask saying whether the device is currently readable and/or
 * writable. The device semaphore is held while the state is examined.
 */
static unsigned int scull_p_poll(struct file *filp, poll_table *wait)
{
	struct scull_pipe *pipe = filp->private_data;
	unsigned int events = 0;

	down(&pipe->sem);

	poll_wait(filp, &pipe->inq, wait);
	poll_wait(filp, &pipe->outq, wait);

	/* Read and write positions differ: there is data to read. */
	if (pipe->rp != pipe->wp)
		events |= POLLIN | POLLRDNORM;

	/* spacefree() is non-zero: a write can be accepted. */
	if (spacefree(pipe))
		events |= POLLOUT | POLLWRNORM;

	up(&pipe->sem);

	return events;
}
/*
 * poll_wait - let a poll method register on a wait queue.
 * @filp:         file being polled
 * @wait_address: wait queue head to register on
 * @p:            poll table supplied by the caller
 *
 * Does nothing when either p or wait_address is NULL; otherwise it invokes
 * the callback stored in p->qproc.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (p && wait_address)
		p->qproc(filp, wait_address, p);
}

/*
 * p->qproc is exactly the callback installed earlier by poll_initwait(),
 * namely __pollwait, which is defined as follows.
 */
/*
 * __pollwait - add a new entry.
 * @filp:         file being polled
 * @wait_address: wait queue head to register on
 * @p:            poll table the entry is obtained from
 *
 * Obtains an entry with poll_get_entry(), records the file and the wait
 * queue in it, and queues the current task on wait_address. Returns
 * silently when no entry is available.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
	struct poll_table_entry *entry = poll_get_entry(p);

	if (!entry)
		return;

	get_file(filp);
	entry->filp = filp;
	entry->wait_address = wait_address;
	/* Tie the wait-queue entry to the current task, then queue it. */
	init_waitqueue_entry(&entry->wait, current);
	add_wait_queue(wait_address, &entry->wait);
}

/*
 * This function uses init_waitqueue_entry() to initialise a wait-queue
 * entry associated with the current task and inserts it into wait_address.
 * wait_address is dev->inq or dev->outq, i.e. the device's read or write
 * wait queue.
 */