select源码剖析

select只有一个系统调用
int select(int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout);

nfds指定需要检查的文件描述符范围,应设为所有被监听文件描述符中的最大值+1(内核只检查编号0到nfds-1的描述符)。

readfds, writefds, exceptfds分别指向可读、可写、异常事件的文件描述符集合。应用程序调用select时,将这些参数传给内核空间,内核保存用户想监听的事件;当监听的fd上有事件发生后,内核通过修改这些参数告知应用程序哪些fd已经就绪。每次再调用select时都要重新设置readfds, writefds, exceptfds,因为文件描述符集合已被内核修改。

timeout参数设置select函数的超时时间,提供了秒和微秒两种时间单位。如果秒和微秒都为0,则select立刻返回。如果timeout是NULL,select将一直阻塞,直到某个文件描述符上有事件发生。

select中重要的数据结构有fd_set_bits、poll_table_page、poll_table_entry、poll_wqueues。

以下是select的源码,重要的地方我都做了注释,相信大家可以看懂

/*
 * Bundle of the six bitmap pointers that select works on inside the kernel.
 * sys_select() points all six at consecutive slices of one allocation:
 * the first three receive the caller's requested sets, the last three
 * collect the results written by do_select().
 */
typedef struct {
    unsigned long *in;      /* requested: readable */
    unsigned long *out;     /* requested: writable */
    unsigned long *ex;      /* requested: exceptional condition */
    unsigned long *res_in;  /* result: readable */
    unsigned long *res_out; /* result: writable */
    unsigned long *res_ex;  /* result: exceptional condition */
} fd_set_bits;
/*
 * sys_select - kernel entry point for the select system call.
 *
 * n:             number of descriptors to examine. Negative values fail with
 *                -EINVAL; values above current->files->max_fdset are clamped.
 * inp/outp/exp:  user-space fd sets. They are copied in with get_fd_set() and
 *                later overwritten with the result bitmaps via set_fd_set().
 * tvp:           optional user-space timeout. When NULL the timeout stays at
 *                MAX_SCHEDULE_TIMEOUT.
 *
 * Returns the value produced by do_select() on success, otherwise a negative
 * error: -EINVAL (bad n or negative time fields), -ENOMEM (bitmap allocation
 * failed), -ERESTARTNOHAND (nothing ready and a signal is pending), -EFAULT
 * (writing the results back failed), or whatever verify_area()/__get_user()/
 * get_fd_set()/do_select() reported.
 */
asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timeval __user *tvp)
{
    fd_set_bits fds;
    char *bits;
    long timeout;
    int ret, size, max_fdset;

    // Start from MAX_SCHEDULE_TIMEOUT; if the caller supplied a timeval,
    // read it from user space and convert it below.
    timeout = MAX_SCHEDULE_TIMEOUT;
    if (tvp) {
        time_t sec, usec;

        // The first failing step leaves its error code in ret.
        if ((ret = verify_area(VERIFY_READ, tvp, sizeof(*tvp)))
            || (ret = __get_user(sec, &tvp->tv_sec))
            || (ret = __get_user(usec, &tvp->tv_usec)))
            goto out_nofds;

        ret = -EINVAL;
        if (sec < 0 || usec < 0)
            goto out_nofds;

        // Convert seconds/microseconds into HZ-based ticks. If sec is not
        // below MAX_SELECT_SECONDS the timeout is left at
        // MAX_SCHEDULE_TIMEOUT.
        // NOTE(review): ROUND_UP presumably rounds the microseconds up to a
        // whole tick - confirm against its definition.
        if ((unsigned long) sec < MAX_SELECT_SECONDS) {
            timeout = ROUND_UP(usec, 1000000/HZ);
            timeout += sec * (unsigned long) HZ;
        }
    }

    ret = -EINVAL;
    if (n < 0)
        goto out_nofds;

    /* max_fdset can increase, so grab it once to avoid race */
    max_fdset = current->files->max_fdset;
    if (n > max_fdset)
        n = max_fdset;

    /*
     * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
     * since we used fdset we need to allocate memory in units of
     * long-words. 
     */
    ret = -ENOMEM;
    size = FDS_BYTES(n);
    // Allocate the backing storage for the bitmaps and carve it into six
    // slices of `size` bytes each, one per fd_set_bits member.
    bits = select_bits_alloc(size);
    if (!bits)
        goto out_nofds;
    fds.in      = (unsigned long *)  bits;
    fds.out     = (unsigned long *) (bits +   size);
    fds.ex      = (unsigned long *) (bits + 2*size);
    fds.res_in  = (unsigned long *) (bits + 3*size);
    fds.res_out = (unsigned long *) (bits + 4*size);
    fds.res_ex  = (unsigned long *) (bits + 5*size);
    // Copy the sets the caller is interested in from user space.
    if ((ret = get_fd_set(n, inp, fds.in)) ||
        (ret = get_fd_set(n, outp, fds.out)) ||
        (ret = get_fd_set(n, exp, fds.ex)))
        goto out;
    // Clear the result bitmaps; do_select() only stores non-zero words.
    zero_fd_set(n, fds.res_in);
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);

    // Poll / wait for the requested events.
    ret = do_select(n, &fds, &timeout);

    // Unless the STICKY_TIMEOUTS personality bit is set, write the timeout
    // as updated by do_select() back into the caller's timeval (presumably
    // the time left - confirm against schedule_timeout()). The put_user()
    // results are not checked.
    if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
        time_t sec = 0, usec = 0;
        if (timeout) {
            sec = timeout / HZ;
            usec = timeout % HZ;
            usec *= (1000000/HZ);
        }
        put_user(sec, &tvp->tv_sec);
        put_user(usec, &tvp->tv_usec);
    }

    if (ret < 0)
        goto out;
    // Nothing became ready: report -ERESTARTNOHAND if a signal is pending,
    // otherwise fall through with ret == 0.
    if (!ret) {
        ret = -ERESTARTNOHAND;
        if (signal_pending(current))
            goto out;
        ret = 0;
    }
    // Hand the result bitmaps back by overwriting the caller's fd sets;
    // this is how ready descriptors are reported to user space.
    if (set_fd_set(n, inp, fds.res_in) ||
        set_fd_set(n, outp, fds.res_out) ||
        set_fd_set(n, exp, fds.res_ex))
        ret = -EFAULT;

out:
    select_bits_free(bits, size);
out_nofds:
    return ret;
}
/*
 * do_select - core polling loop behind sys_select().
 *
 * n:       upper bound on the descriptors to scan; replaced by the value
 *          returned from max_select_fd().
 * fds:     requested bitmaps (in/out/ex) and result bitmaps (res_*). Result
 *          words are only written when non-zero, so the caller must have
 *          cleared them beforehand (sys_select() does this).
 * timeout: in/out; read on entry and overwritten with the final __timeout.
 *
 * Each pass polls every requested descriptor. The loop ends when at least
 * one event was recorded, the timeout is zero, a signal is pending, or
 * table.error is set; otherwise the task sleeps in schedule_timeout() and
 * scans again.
 *
 * Returns the number of (descriptor, event) bits recorded, or a negative
 * value taken from max_select_fd() or table.error.
 */
int do_select(int n, fd_set_bits *fds, long *timeout)
{
    struct poll_wqueues table;
    poll_table *wait;
    int retval, i;
    long __timeout = *timeout;

    /* max_select_fd() is called with the files' file_lock held. */
    spin_lock(&current->files->file_lock);
    retval = max_select_fd(n, fds);
    spin_unlock(&current->files->file_lock);

    if (retval < 0)
        return retval;
    n = retval;

    /*
     * Initialise the poll wait table.
     * NOTE(review): the callback installed here presumably queues the
     * current task on each polled file's wait queue - confirm against
     * poll_initwait().
     */
    poll_initwait(&table);
    wait = &table.pt;
    /* With a zero timeout we never sleep, so poll methods get no table. */
    if (!__timeout)
        wait = NULL;
    retval = 0;
    for (;;) {
        unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

        set_current_state(TASK_INTERRUPTIBLE);

        inp = fds->in; outp = fds->out; exp = fds->ex;
        rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

        /* One iteration per bitmap word; i is the descriptor number. */
        for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
            unsigned long in, out, ex, all_bits, bit = 1, mask, j;
            unsigned long res_in = 0, res_out = 0, res_ex = 0;
            struct file_operations *f_op = NULL;
            struct file *file = NULL;

            in = *inp++; out = *outp++; ex = *exp++;
            all_bits = in | out | ex;
            /* Nothing requested in this word: skip all its descriptors. */
            if (all_bits == 0) {
                i += __NFDBITS;
                continue;
            }

            for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
                if (i >= n)
                    break;
                if (!(bit & all_bits))
                    continue;
                file = fget(i);
                if (file) {
                    f_op = file->f_op;
                    mask = DEFAULT_POLLMASK;
                    /*
                     * Ask the file for its current event mask. The wait
                     * table is only passed while no event has been
                     * recorded yet; afterwards NULL is passed instead.
                     */
                    if (f_op && f_op->poll)
                        mask = (*f_op->poll)(file, retval ? NULL : wait);
                    fput(file);
                    /* Record each requested event that the mask reports. */
                    if ((mask & POLLIN_SET) && (in & bit)) {
                        res_in |= bit;
                        retval++;
                    }
                    if ((mask & POLLOUT_SET) && (out & bit)) {
                        res_out |= bit;
                        retval++;
                    }
                    if ((mask & POLLEX_SET) && (ex & bit)) {
                        res_ex |= bit;
                        retval++;
                    }
                }
                cond_resched();
            }
            /* Store this word's results into the result bitmaps. */
            if (res_in)
                *rinp = res_in;
            if (res_out)
                *routp = res_out;
            if (res_ex)
                *rexp = res_ex;
        }
        /* Later passes always call the poll methods with a NULL table. */
        wait = NULL;
        if (retval || !__timeout || signal_pending(current))
            break;
        if(table.error) {
            retval = table.error;
            break;
        }
        __timeout = schedule_timeout(__timeout);
    }
    __set_current_state(TASK_RUNNING);
    /* Tear down whatever poll_initwait() and the poll calls set up. */
    poll_freewait(&table);

    *timeout = __timeout;
    return retval;
}

一张图帮你理解select
select源码剖析_第1张图片

select的事件处理就说到这儿,留下几个问题,也是面试中常问的问题:
1.select和epoll有什么不同?
2.为什么epoll比select高效?
3.不论哪种情况,epoll永远比select高效吗?

你可能感兴趣的:(Linux)