Source Code Analysis: poll

1. poll

   From the kernel's point of view, thanks to the VFS, everything is a file.

// The file abstraction, include/linux/fs.h
struct file {
    const struct file_operations    *f_op;
    spinlock_t          f_lock;
    // implementation-private data of the file
    void               *private_data;
#ifdef CONFIG_EPOLL
    /* Used by fs/eventpoll.c to link all the hooks to this file */
    struct list_head    f_ep_links;
    struct list_head    f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */
    // other fields omitted ...
};

// File operations, include/linux/fs.h
struct file_operations {
    // Hook the file provides to poll/select/epoll:
    // calls the callback registered in the poll_table_struct and returns the file's current readiness state
    unsigned int (*poll) (struct file *, struct poll_table_struct *);
    // other methods: read/write, etc. ...
};

/**
 * Typical implementation of a file's poll method.
 * Calls the callback registered in the poll_table_struct, and returns the
 * mask of events currently ready on the file.
 * @param filp pointer to the file
 * @param wait pointer to the poll_table_struct
 * @return mask of the file's currently ready events
 */
unsigned int XXX_poll (struct file *filp, struct poll_table_struct *wait)
{
    unsigned int mask = 0;
    wait_queue_head_t * wait_queue;

    // 1. From the event mask wait->_key and the file's private data
    //    filp->private_data, obtain the wait queue head(s) that correspond
    //    to the events of interest
    some_code();

    // 2. Call poll_wait to add a wait queue entry to the wait queue obtained above
    poll_wait(filp, wait_queue, wait);

    // 3. Compute the mask of events currently ready on the file and store it in mask
    some_code();

    return mask;
}

// The interface structure through which select/poll/epoll register their
// readiness callbacks with a file
typedef struct poll_table_struct {
    // Callback that adds a wait queue entry (wait_queue_t) to the given
    // wait queue (wait_queue_head)
    poll_queue_proc _qproc;
    // Mask of the events of interest; the file's implementation uses this
    // mask to hand the matching wait queue(s) to _qproc
    unsigned long   _key;
} poll_table;

// Signature of the queueing callback stored in a poll_table
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);


// The generic poll_wait function; a file's f_op->poll usually calls it
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
    if (p && p->_qproc && wait_address) {
        // Invoke the callback _qproc registered in the poll_table_struct.
        // _qproc normally adds a wait queue entry to the given event wait queue:
        // for select/poll it is __pollwait, for epoll it is ep_ptable_queue_proc
        p->_qproc(filp, wait_address, p);
    }
}
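
To make the skeleton above concrete, here is a minimal sketch of a driver-style poll method: a hypothetical device with one read wait queue and a ready flag kept in private_data. All names here (struct xxx_dev, xxx_poll, read_wq, data_ready) are illustrative, not taken from any real driver.

#include <linux/fs.h>
#include <linux/poll.h>

/* Hypothetical per-device state, stored in filp->private_data. */
struct xxx_dev {
    wait_queue_head_t  read_wq;     /* woken by the driver when data arrives */
    bool               data_ready;
};

static unsigned int xxx_poll(struct file *filp, struct poll_table_struct *wait)
{
    struct xxx_dev *dev = filp->private_data;
    unsigned int mask = 0;

    /* Register this task on the device's wait queue. This does NOT sleep:
     * it only lets select/poll/epoll hook their callback in, so that a later
     * wake_up(&dev->read_wq) in the driver can wake the poller. */
    poll_wait(filp, &dev->read_wq, wait);

    /* Report the current readiness state. */
    if (dev->data_ready)
        mask |= POLLIN | POLLRDNORM;

    return mask;
}

Note that xxx_poll itself never blocks; any sleeping happens inside select/poll/epoll, driven by the mask returned here.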

2. UDP poll

2.1 poll

net/ipv4/af_inet.c

const struct proto_ops inet_dgram_ops = {
    // other members omitted
    .poll          = udp_poll,
    // other members omitted
};

2.2 udp_poll

net/ipv4/udp.c

/**
 *  udp_poll - wait for a UDP event.
 *  @file - file struct
 *  @sock - socket
 *  @wait - poll table
 *
 *  This is same as datagram poll, except for the special case of
 *  blocking sockets. If application is using a blocking fd
 *  and a packet with checksum error is in the queue;
 *  then it could get return from select indicating data available
 *  but then block when reading it. Add special case code
 *  to work around these arguably broken applications.
 */
unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
    unsigned int mask = datagram_poll(file, sock, wait);
    struct sock *sk = sock->sk;

    sock_rps_record_flow(sk);

    /* Check for false positives due to checksum errors */
    if ((mask & POLLRDNORM) && !(file->f_flags & O_NONBLOCK) &&
        !(sk->sk_shutdown & RCV_SHUTDOWN) && first_packet_length(sk) == -1)
        mask &= ~(POLLIN | POLLRDNORM);

    return mask;
}
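
For reference, first_packet_length() (also in net/ipv4/udp.c) walks sk_receive_queue, discarding frames whose checksum is bad, and returns the length of the first valid packet, or -1 if none remains. This is how udp_poll can retract the POLLIN | POLLRDNORM that datagram_poll reported: if the only queued packets are corrupt, a blocking reader would otherwise be told "readable" and then sleep inside recv().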

2.3 datagram_poll

net/core/datagram.c

/**
 *  datagram_poll - generic datagram poll
 *  @file: file struct
 *  @sock: socket
 *  @wait: poll table
 *
 *  Datagram poll: Again totally generic. This also handles
 *  sequenced packet sockets providing the socket receive queue
 *  is only ever holding data ready to receive.
 *
 *  Note: when you _don't_ use this routine for this protocol,
 *  and you use a different write policy from sock_writeable()
 *  then please supply your own write_space callback.
 */
unsigned int datagram_poll(struct file *file, struct socket *sock,
               poll_table *wait)
{
    struct sock *sk = sock->sk;
    unsigned int mask;

    sock_poll_wait(file, sk_sleep(sk), wait);
    mask = 0;

    /* exceptional events? */
    if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
        mask |= POLLERR |
            (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);

    if (sk->sk_shutdown & RCV_SHUTDOWN)
        mask |= POLLRDHUP | POLLIN | POLLRDNORM;
    if (sk->sk_shutdown == SHUTDOWN_MASK)
        mask |= POLLHUP;

    /* readable? */
    if (!skb_queue_empty(&sk->sk_receive_queue))
        mask |= POLLIN | POLLRDNORM;

    /* Connection-based need to check for termination and startup */
    if (connection_based(sk)) {
        if (sk->sk_state == TCP_CLOSE)
            mask |= POLLHUP;
        /* connection hasn't started yet? */
        if (sk->sk_state == TCP_SYN_SENT)
            return mask;
    }

    /* writable? */
    if (sock_writeable(sk))
        mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
    else
        sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

    return mask;
}
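
The write-side test delegates to sock_writeable(); in this era of the kernel (include/net/sock.h) it considers the socket writable while less than half of sk_sndbuf is consumed by in-flight write memory:

static inline bool sock_writeable(const struct sock *sk)
{
    return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1);
}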

2.4 sock_poll_wait

include/net/sock.h

/**
 * sock_poll_wait - place memory barrier behind the poll_wait call.
 * @filp:           file
 * @wait_address:   socket wait queue
 * @p:              poll_table
 *
 * See the comments in the wq_has_sleeper function.
 */
static inline void sock_poll_wait(struct file *filp,
        wait_queue_head_t *wait_address, poll_table *p)
{
    if (!poll_does_not_wait(p) && wait_address) {
        poll_wait(filp, wait_address, p);
        /* We need to be sure we are in sync with the
         * socket flags modification.
         *
         * This memory barrier is paired in the wq_has_sleeper.
         */
        smp_mb();
    }
}
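
The barrier pairs with one on the wakeup side. A simplified sketch of that side follows (the helper's exact name and signature vary across kernel versions, so treat this as illustrative):

/* Called from the wakeup path before deciding whether anyone needs waking.
 * The smp_mb() pairs with the one in sock_poll_wait(): either the waker
 * sees the waiter already on the queue, or the waiter sees the updated
 * socket state, so a wakeup cannot be lost between the two checks. */
static inline bool wq_has_sleeper(wait_queue_head_t *wq)
{
    smp_mb();
    return waitqueue_active(wq);
}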

2.5 poll_wait

include/linux/poll.h

static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
    if (p && p->_qproc && wait_address)
        p->_qproc(filp, wait_address, p);
}

2.6 Summary

poll -> udp_poll -> datagram_poll -> sock_poll_wait -> poll_wait
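
For context, here is a hypothetical userspace sketch of what drives this chain. A poll(2) call on a UDP socket goes through the socket layer's f_op->poll (sock_poll in net/socket.c), which dispatches to sock->ops->poll, i.e. udp_poll above. The port number and buffer size below are arbitrary, and error handling is trimmed for brevity.

#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);   /* served by inet_dgram_ops */
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(9999);               /* arbitrary example port */
    bind(fd, (struct sockaddr *)&addr, sizeof(addr));

    struct pollfd pfd = { .fd = fd, .events = POLLIN };

    /* The kernel evaluates udp_poll at least once per poll() call; the
     * first pass also registers this task on sk_sleep(sk) via poll_wait. */
    if (poll(&pfd, 1, 5000) > 0 && (pfd.revents & POLLIN)) {
        char buf[2048];
        ssize_t len = recv(fd, buf, sizeof(buf), 0);
        printf("readable: %zd bytes\n", len);
    }
    close(fd);
    return 0;
}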

3. TCP poll

3.1 poll

net/ipv4/af_inet.c

const struct proto_ops inet_stream_ops = {
    .family        = PF_INET,
    .owner         = THIS_MODULE,
    .release       = inet_release,
    .bind          = inet_bind,
    .connect       = inet_stream_connect,
    .socketpair    = sock_no_socketpair,
    .accept        = inet_accept,
    .getname       = inet_getname,
    .poll          = tcp_poll,
    .ioctl         = inet_ioctl,
    .listen        = inet_listen,
    .shutdown      = inet_shutdown,
    .setsockopt    = sock_common_setsockopt,
    .getsockopt    = sock_common_getsockopt,
    .sendmsg       = inet_sendmsg,
    .recvmsg       = inet_recvmsg,
    .mmap          = sock_no_mmap,
    .sendpage      = inet_sendpage,
    .splice_read       = tcp_splice_read,
    .read_sock     = tcp_read_sock,
    .peek_len      = tcp_peek_len,
#ifdef CONFIG_COMPAT
    .compat_setsockopt = compat_sock_common_setsockopt,
    .compat_getsockopt = compat_sock_common_getsockopt,
    .compat_ioctl      = inet_compat_ioctl,
#endif
};

3.2 tcp_poll

net/ipv4/tcp.c

/*
 *  Wait for a TCP event.
 *
 *  Note that we don't need to lock the socket, as the upper poll layers
 *  take care of normal races (between the test and the event) and we don't
 *  go look at any of the socket buffers directly.
 */
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
    unsigned int mask;
    struct sock *sk = sock->sk;
    const struct tcp_sock *tp = tcp_sk(sk);
    int state;

    sock_rps_record_flow(sk);

    sock_poll_wait(file, sk_sleep(sk), wait);

    state = sk_state_load(sk);
    if (state == TCP_LISTEN)
        return inet_csk_listen_poll(sk);

    /* Socket is not locked. We are protected from async events
     * by poll logic and correct handling of state changes
     * made by other threads is impossible in any case.
     */

    mask = 0;

    /*
     * POLLHUP is certainly not done right. But poll() doesn't
     * have a notion of HUP in just one direction, and for a
     * socket the read side is more interesting.
     *
     * Some poll() documentation says that POLLHUP is incompatible
     * with the POLLOUT/POLLWR flags, so somebody should check this
     * all. But careful, it tends to be safer to return too many
     * bits than too few, and you can easily break real applications
     * if you don't tell them that something has hung up!
     *
     * Check-me.
     *
     * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
     * our fs/select.c). It means that after we received EOF,
     * poll always returns immediately, making impossible poll() on write()
     * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
     * if and only if shutdown has been made in both directions.
     * Actually, it is interesting to look how Solaris and DUX
     * solve this dilemma. I would prefer, if POLLHUP were maskable,
     * then we could set it on SND_SHUTDOWN. BTW examples given
     * in Stevens' books assume exactly this behaviour, it explains
     * why POLLHUP is incompatible with POLLOUT.    --ANK
     *
     * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
     * blocking on fresh not-connected or disconnected socket. --ANK
     */
    if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
        mask |= POLLHUP;
    if (sk->sk_shutdown & RCV_SHUTDOWN)
        mask |= POLLIN | POLLRDNORM | POLLRDHUP;

    /* Connected or passive Fast Open socket? */
    if (state != TCP_SYN_SENT &&
        (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
        int target = sock_rcvlowat(sk, 0, INT_MAX);

        if (tp->urg_seq == tp->copied_seq &&
            !sock_flag(sk, SOCK_URGINLINE) &&
            tp->urg_data)
            target++;

        if (tp->rcv_nxt - tp->copied_seq >= target)
            mask |= POLLIN | POLLRDNORM;

        if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
            if (sk_stream_is_writeable(sk)) {
                mask |= POLLOUT | POLLWRNORM;
            } else {  /* send SIGIO later */
                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);

                /* Race breaker. If space is freed after
                 * wspace test but before the flags are set,
                 * IO signal will be lost. Memory barrier
                 * pairs with the input side.
                 */
                smp_mb__after_atomic();
                if (sk_stream_is_writeable(sk))
                    mask |= POLLOUT | POLLWRNORM;
            }
        } else
            mask |= POLLOUT | POLLWRNORM;

        if (tp->urg_data & TCP_URG_VALID)
            mask |= POLLPRI;
    }
    /* This barrier is coupled with smp_wmb() in tcp_reset() */
    smp_rmb();
    if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
        mask |= POLLERR;

    return mask;
}
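
For listening sockets, tcp_poll returns early through inet_csk_listen_poll, which simply reports readable when the accept queue is non-empty. For reference, from include/net/inet_connection_sock.h of the same era:

static inline unsigned int inet_csk_listen_poll(const struct sock *sk)
{
    return !reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue) ?
            (POLLIN | POLLRDNORM) : 0;
}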

3.3 sock_poll_wait

include/net/sock.h

Same as the sock_poll_wait shown in section 2.4 above.

3.4 poll_wait

include/linux/poll.h

Same as the poll_wait shown in section 2.5 above.

3.5 Summary

poll -> tcp_poll -> sock_poll_wait -> poll_wait
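
As a usage note, the POLLRDHUP bit that tcp_poll sets on RCV_SHUTDOWN lets an application distinguish "peer sent FIN" from "data available". A hypothetical sketch (connfd is assumed to be an already-connected TCP socket; error handling trimmed):

#define _GNU_SOURCE        /* POLLRDHUP is a GNU/Linux extension */
#include <poll.h>
#include <stdio.h>

void wait_on_tcp_socket(int connfd)
{
    struct pollfd pfd = { .fd = connfd, .events = POLLIN | POLLRDHUP };

    if (poll(&pfd, 1, -1) > 0) {
        if (pfd.revents & POLLRDHUP)
            printf("peer shut down its write side (FIN received)\n");
        else if (pfd.revents & POLLIN)
            printf("data available to read\n");
        if (pfd.revents & POLLHUP)
            printf("both directions are shut down\n");
    }
}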
