epoll源码探秘(epoll_ctl)

epoll源码探秘(epoll_ctl)

一些基本的数据结构

epitem

/*
 * Each file descriptor added to the eventpoll interface will
 * have an entry of this type linked to the "rbr" RB tree.
 * Avoid increasing the size of this struct, there can be many thousands
 * of these on a server and we do not want this to take another cache line.
 * (One instance is a single node of the red-black tree.)
 */
struct epitem {
    union {
        /* RB tree node links this structure to the eventpoll RB tree */
        struct rb_node rbn;
        /* Used to free the struct epitem */
        struct rcu_head rcu;
    };

    /* List header used to link this structure to the eventpoll ready list */
    struct list_head rdllink;

    /*
     * Works together "struct eventpoll"->ovflist in keeping the
     * single linked chain of items.
     */
    struct epitem *next;

    /* The file descriptor information this item refers to */
    struct epoll_filefd ffd;

    /* Number of active wait queue attached to poll operations */
    int nwait;

    /* List containing poll wait queues */
    struct list_head pwqlist;

    /* The "container" of this item */
    struct eventpoll *ep;

    /* List header used to link this item to the "struct file" items list */
    struct list_head fllink;

    /* wakeup_source used when EPOLLWAKEUP is set */
    struct wakeup_source __rcu *ws;

    /*
     * The structure that describe the interested events and the source fd
     * (i.e. the event mask the caller wants monitored on that descriptor)
     */
    struct epoll_event event;
};

eventpoll

/*
 * This structure is stored inside the "private_data" member of the file
 * structure and represents the main data structure for the eventpoll
 * interface.
 */
struct eventpoll {
    /* Protect the access to this structure */
    spinlock_t lock;

    /*
     * This mutex is used to ensure that files are not removed
     * while epoll is using them. This is held during the event
     * collection loop, the file cleanup path, the epoll file exit
     * code and the ctl operations.
     */
    struct mutex mtx;

    /*
     * Wait queue used by sys_epoll_wait().
     * This is the wait queue of the epoll file: a task calling epoll_wait
     * may sleep here until ep_poll_callback() wakes it or it times out.
     */
    wait_queue_head_t wq;

    /*
     * Wait queue used by file->poll().
     * This is the wake-up queue of the eventpoll file itself: tasks
     * sleeping here are waiting for events on the eventpoll file itself.
     */
    wait_queue_head_t poll_wait;

    /* List of ready file descriptors (the ready list) */
    struct list_head rdllist;

    /* RB tree root used to store monitored fd structs */
    struct rb_root rbr;

    /*
     * This is a single linked list that chains all the "struct epitem" that
     * happened while transferring ready events to userspace w/out
     * holding ->lock.
     * While events are being delivered to user space, items that become
     * ready are parked on this list; otherwise they are added directly to
     * the ready list rdllist.
     */
    struct epitem *ovflist;

    /* wakeup_source used when ep_scan_ready_list is running */
    struct wakeup_source *ws;

    /* The user that created the eventpoll descriptor */
    struct user_struct *user;

    /* NOTE(review): presumably the struct file backing this eventpoll - confirm */
    struct file *file;

    /* used to optimize loop detection check */
    int visited;
    struct list_head visited_list_link;
};

poll_table_struct

/* Poll table handed to a file's poll routine: a queueing callback plus an event mask. */
typedef struct poll_table_struct {
  poll_queue_proc _qproc; /* event callback function */
  unsigned long _key;       /* event bit mask (per the article, derived from epi->event.events) */
} poll_table;

eppoll_entry

/* Wait-queue hook used by the poll callback; one per wait queue an epitem is attached to */
struct eppoll_entry {
  struct list_head llink; /* links this entry into the owning epitem's pwqlist */
  struct epitem *base;    /* the owning epitem */
  wait_queue_t wait;      /* the element queued on the wait queue */
  wait_queue_head_t *whead; /* pointer to the head of the wait queue this entry sits on */
};

__wait_queue

/*
 * A single wait-queue entry.
 * NOTE(review): presumably wait_queue_t is a typedef of this struct - confirm.
 */
struct __wait_queue {
  unsigned int      flags;      /* flag bits; WQ_FLAG_EXCLUSIVE below is one of them */
#define WQ_FLAG_EXCLUSIVE 0x01
  void          *private;       /* opaque per-entry data; its use is not shown in this excerpt */
  wait_queue_func_t func;       /* callback for this entry (presumably invoked on wake-up - confirm) */
  struct list_head  task_list;  /* list linkage; ep_poll_callback() unlinks an entry through this field */
};

epoll_ctl函数:

/*
 * Control interface of an epoll instance.
 * epfd:  descriptor of the epoll instance (assumed from the name - confirm)
 * op:    one of EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD
 * fd:    the target descriptor the operation applies to
 * event: user-space event description, copied into the kernel for ops that carry one
 */
int epoll_ctl(int epfd, int op, int fd, struct epoll_event *event);

第一级:copy_from_user()(检查是否需要从用户空间拷贝关心事件)

if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event)))
     goto error_return;

第一级:ep_find()(在红黑树中查找对应fd的文件)

epi = ep_find(ep, tf.file, fd);

第二级:ep_find()(在已取得保护红黑树的互斥锁的情况下调用)

static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
{
    int kcmp;
    struct rb_node *rbp;
    struct epitem *epi, *epir = NULL;
    struct epoll_filefd ffd;

    ep_set_ffd(&ffd, file, fd);
    for (rbp = ep->rbr.rb_node; rbp; ) {
        epi = rb_entry(rbp, struct epitem, rbn);
        kcmp = ep_cmp_ffd(&ffd, &epi->ffd);
        if (kcmp > 0)
            rbp = rbp->rb_right;
        else if (kcmp < 0)
            rbp = rbp->rb_left;
        else {
            epir = epi;
            break;
        }
    }

    return epir;
}

第一级:case分支语句(ctl功能选择)

该函数首先在eventpoll中查找操作的fd对应的epitem对象是否存在,然后根据用户指定的命令参数,作相应的处理。每个添加到epoll的文件都会附加到一个epitem对象中。epoll的删除文件和修改文件命令,分别由ep_remove()和ep_modify()来完成,这两个函数比较简单,不作过多分析。主要关心的是epoll的添加命令对应的函数ep_insert()。
        /* Dispatch on the requested operation; the per-case work is elided in this skeleton. */
        switch (op) {
        case EPOLL_CTL_ADD:
            /* add a new item: handled by ep_insert() */
            break;
        case EPOLL_CTL_DEL:
            /* remove an existing item: handled by ep_remove() */
            break;
        case EPOLL_CTL_MOD:
            /* change an existing item's events: handled by ep_modify() */
            break;
        }

第二级:EPOLL_CTL_ADD操作(如果full_check为真,即进行了完整的环路检查,则在插入之后执行clear_tfile_check_list())

error = ep_insert(ep, &epds, tf.file, fd, full_check);

第三级:init_poll_funcptr()回调函数初始化

注册ep_ptable_queue_proc()函数

init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

在注册回调函数 ep_ptable_queue_proc()中

/*
 * Queueing callback registered through init_poll_funcptr().
 *
 * Allocates an eppoll_entry for the epitem that owns @pt, hooks it onto the
 * wait queue @whead with ep_poll_callback as its callback, and records it on
 * the epitem's pwqlist. On success epi->nwait is incremented.
 *
 * If the epitem is already marked failed (nwait < 0) or the allocation
 * fails, nwait is set to -1 so the caller can detect the error.
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
                 poll_table *pt)
{
    struct epitem *item = ep_item_from_epqueue(pt);
    struct eppoll_entry *entry;

    /* A previous failure is sticky: do not try to queue anything more. */
    if (item->nwait < 0)
        goto fail;

    entry = kmem_cache_alloc(pwq_cache, GFP_KERNEL);
    if (!entry)
        goto fail;

    /*
     * ep_poll_callback is what moves the epitem described by this hook
     * onto the eventpoll's ready list when the target file has an event.
     */
    init_waitqueue_func_entry(&entry->wait, ep_poll_callback);
    entry->whead = whead;
    entry->base = item;

    /*
     * Actually attach the entry to the target file's wait queue; when the
     * file triggers an event, the callbacks of the queued entries run.
     */
    add_wait_queue(whead, &entry->wait);
    list_add_tail(&entry->llink, &item->pwqlist);
    item->nwait++;
    return;

fail:
    /* We have to signal that an error occurred */
    item->nwait = -1;
}

回调函数ep_poll_callback()的实现过程(在对应的文件描述符中有事件发生时,将会被调用)

/*
 * Wait-queue callback attached by ep_ptable_queue_proc(); runs when the
 * monitored file signals an event.
 *
 * @wait: the wait-queue entry embedded in the eppoll_entry hook
 * @mode, @sync: not used by this function
 * @key:  event bits reported by the waker, or NULL if the waker reports none
 *
 * Queues the owning epitem on ep->rdllist (or on ep->ovflist while events
 * are being transferred to user space) and wakes up waiters on the
 * eventpoll. Always returns 1.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    int pwake = 0;
    unsigned long flags;
    struct epitem *epi = ep_item_from_wait(wait);
    struct eventpoll *ep = epi->ep;

    if ((unsigned long)key & POLLFREE) {
        ep_pwq_from_wait(wait)->whead = NULL;
        /*
         * whead = NULL above can race with ep_remove_wait_queue()
         * which can do another remove_wait_queue() after us, so we
         * can't use __remove_wait_queue(). whead->lock is held by
         * the caller.
         */
        list_del_init(&wait->task_list);
    }

    spin_lock_irqsave(&ep->lock, flags);

    /*
     * If the event mask does not contain any poll(2) event, we consider the
     * descriptor to be disabled. This condition is likely the effect of the
     * EPOLLONESHOT bit that disables the descriptor when an event is received,
     * until the next EPOLL_CTL_MOD will be issued.
     */
    if (!(epi->event.events & ~EP_PRIVATE_BITS))
        goto out_unlock;

    /*
     * Check the events coming with the callback. At this stage, not
     * every device reports the events in the "key" parameter of the
     * callback. We need to be able to handle both cases here, hence the
     * test for "key" != NULL before the event match test.
     */
    if (key && !((unsigned long) key & epi->event.events))
        goto out_unlock;

    /*
     * If we are transferring events to userspace, we can hold no locks
     * (because we're accessing user memory, and because of linux f_op->poll()
     * semantics). All the events that happen during that period of time are
     * chained in ep->ovflist and requeued later on.
     */
    if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
        if (epi->next == EP_UNACTIVE_PTR) {
            /*
             * The event on the target file matches what epi monitors, so
             * push epi onto the eventpoll's ovflist (the overflow list
             * used while the ready list is being handed to user space).
             */
            epi->next = ep->ovflist;
            ep->ovflist = epi;
            if (epi->ws) {
                /*
                 * Activate ep->ws since epi->ws may get
                 * deactivated at any time.
                 */
                __pm_stay_awake(ep->ws);
            }

        }
        goto out_unlock;
    }

    /* If this file is already in the ready list we exit soon */
    if (!ep_is_linked(&epi->rdllink)) {
        list_add_tail(&epi->rdllink, &ep->rdllist);
        ep_pm_stay_awake_rcu(epi);
    }

    /*
     * Wake up ( if active ) both the eventpoll wait list and the ->poll()
     * wait list.
     */
    if (waitqueue_active(&ep->wq))
        wake_up_locked(&ep->wq);
    if (waitqueue_active(&ep->poll_wait))
        pwake++;

out_unlock:
    spin_unlock_irqrestore(&ep->lock, flags);

    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(&ep->poll_wait);

    return 1;
}

第三级:ep_rbtree_insert()(将节点添加至红黑树中)

第三级:list_add_tail()(将节点添加至就绪队列中)

/*
 * Add a new monitored file to an eventpoll instance (EPOLL_CTL_ADD path).
 *
 * @ep:         the eventpoll the item is added to
 * @event:      kernel copy of the caller's event description
 * @tfile, @fd: the target file and its descriptor number
 * @full_check: when non-zero, reverse_path_check() is run after insertion
 *
 * Must be called with ep->mtx held (see the RB tree comment below).
 *
 * Returns 0 on success, -ENOSPC when the per-user watch limit is reached,
 * -ENOMEM on allocation failure, -EINVAL when the reverse path check fails,
 * or the error returned by ep_create_wakeup_source().
 */
static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
             struct file *tfile, int fd, int full_check)
{
    int error, revents, pwake = 0;
    unsigned long flags;
    long user_watches;
    struct epitem *epi;
    struct ep_pqueue epq;

    /*
     * Check whether the number of descriptors this user watches through
     * epoll has reached max_user_watches, the per-user limit on watched
     * descriptors.
     */
    user_watches = atomic_long_read(&ep->user->epoll_watches);
    if (unlikely(user_watches >= max_user_watches))
        return -ENOSPC;
    /*
     * Every file added to epoll is attached to its own epitem instance;
     * allocate the epitem for this file.
     */
    if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
        return -ENOMEM;

    /* Item initialization follow here ... */
    INIT_LIST_HEAD(&epi->rdllink);
    INIT_LIST_HEAD(&epi->fllink);
    INIT_LIST_HEAD(&epi->pwqlist);
    epi->ep = ep;
    ep_set_ffd(&epi->ffd, tfile, fd);
    epi->event = *event;
    epi->nwait = 0;
    epi->next = EP_UNACTIVE_PTR;
    if (epi->event.events & EPOLLWAKEUP) {
        error = ep_create_wakeup_source(epi);
        if (error)
            goto error_create_wakeup_source;
    } else {
        RCU_INIT_POINTER(epi->ws, NULL);
    }

    /* Initialize the poll table using the queue callback (a function pointer) */
    epq.epi = epi;
    init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);

    /*
     * Attach the item to the poll hooks and get current event bits.
     * We can safely use the file* here because its usage count has
     * been increased by the caller of this function. Note that after
     * this operation completes, the poll callback can start hitting
     * the new item.
     */
    /*
     * Article author's note (the call chain is not visible in this excerpt):
     * if fd is a socket, f_op is socket_file_ops and its poll function is
     * sock_poll(); for a TCP socket that in turn reaches tcp_poll(). The
     * poll call here reads the descriptor's current state into revents.
     * Inside the poll handler, sock_poll_wait() ends up invoking the
     * callback stored in the poll table, i.e. ep_ptable_queue_proc().
     */
    revents = ep_item_poll(epi, &epq.pt);

    /*
     * We have to check if something went wrong during the poll wait queue
     * install process. Namely an allocation for a wait queue failed due
     * high memory pressure.
     */
    /* ep_ptable_queue_proc() sets nwait to -1 when its allocation fails. */
    error = -ENOMEM;
    if (epi->nwait < 0)
        goto error_unregister;

    /* Add the current item to the list of active epoll hook for this file */
    spin_lock(&tfile->f_lock);
    list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
    spin_unlock(&tfile->f_lock);

    /*
     * Add the current item to the RB tree. All RB tree operations are
     * protected by "mtx", and ep_insert() is called with "mtx" held.
     */
    ep_rbtree_insert(ep, epi);

    /* now check if we've created too many backpaths */
    error = -EINVAL;
    if (full_check && reverse_path_check())
        goto error_remove_epi;

    /* We have to drop the new item inside our item list to keep track of it */
    spin_lock_irqsave(&ep->lock, flags);

    /* If the file is already "ready" we drop it inside the ready list */
    if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
        list_add_tail(&epi->rdllink, &ep->rdllist);
        ep_pm_stay_awake(epi);

        /* Notify waiting tasks that events are available */
        if (waitqueue_active(&ep->wq))
            wake_up_locked(&ep->wq);
        if (waitqueue_active(&ep->poll_wait))
            pwake++;
    }

    spin_unlock_irqrestore(&ep->lock, flags);

    /* Account the new watch against the creating user. */
    atomic_long_inc(&ep->user->epoll_watches);

    /* We have to call this outside the lock */
    if (pwake)
        ep_poll_safewake(&ep->poll_wait);

    return 0;

    /* Error unwinding: each label undoes the steps completed before the failure. */
error_remove_epi:
    spin_lock(&tfile->f_lock);
    list_del_rcu(&epi->fllink);
    spin_unlock(&tfile->f_lock);

    rb_erase(&epi->rbn, &ep->rbr);

error_unregister:
    ep_unregister_pollwait(ep, epi);

    /*
     * We need to do this because an event could have been arrived on some
     * allocated wait queue. Note that we don't care about the ep->ovflist
     * list, since that is used/cleaned only inside a section bound by "mtx".
     * And ep_insert() is called with "mtx" held.
     */
    spin_lock_irqsave(&ep->lock, flags);
    if (ep_is_linked(&epi->rdllink))
        list_del_init(&epi->rdllink);
    spin_unlock_irqrestore(&ep->lock, flags);

    wakeup_source_unregister(ep_wakeup_source(epi));

error_create_wakeup_source:
    kmem_cache_free(epi_cache, epi);

    return error;
}

第二级:EPOLL_CTL_DEL操作

error = ep_remove(ep, epi);

第二级:EPOLL_CTL_MOD操作

error = ep_modify(ep, epi, &epds);

第一级:fdput()(释放之前通过fdget()取得的两个文件引用,并不会关闭文件描述符)

    fdput(tf);
    fdput(f);

你可能感兴趣的:(linux,源码探秘,epoll)