Linux source: network packet reception -- from interrupt to the protocol stack

This article is based on the 4.11 kernel.

After a hardware interrupt fires, Linux on x86 enters the do_IRQ function (arch/x86/kernel/irq.c):

/*
 * do_IRQ handles all normal device IRQ's (the special
 * SMP cross-CPU interrupts have their own specific
 * handlers).
 */
__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
{
    struct pt_regs *old_regs = set_irq_regs(regs);
    struct irq_desc * desc;
    /* high bit used in ret_from_ code  */
    unsigned vector = ~regs->orig_ax;

    /*
     * NB: Unlike exception entries, IRQ entries do not reliably
     * handle context tracking in the low-level entry code.  This is
     * because syscall entries execute briefly with IRQs on before
     * updating context tracking state, so we can take an IRQ from
     * kernel mode with CONTEXT_USER.  The low-level entry code only
     * updates the context if we came from user mode, so we won't
     * switch to CONTEXT_KERNEL.  We'll fix that once the syscall
     * code is cleaned up enough that we can cleanly defer enabling
     * IRQs.
     */

    entering_irq();

    /* entering_irq() tells RCU that we're not quiescent.  Check it. */
    RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");

    desc = __this_cpu_read(vector_irq[vector]);

    if (!handle_irq(desc, regs)) {
        ack_APIC_irq();

        if (desc != VECTOR_RETRIGGERED) {
            pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
                         __func__, smp_processor_id(),
                         vector);
        } else {
            __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
        }
    }

    exiting_irq();

    set_irq_regs(old_regs);
    return 1;
}

set_irq_regs does not touch the hardware registers: it stores a pointer to the interrupted context's saved registers in a per-CPU variable and returns the previous pointer. Saving the old value at the start of do_IRQ and restoring it at the end is what lets interrupts nest safely.
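
The generic implementation (include/asm-generic/irq_regs.h) shows that only a per-CPU pointer is swapped:

static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
{
    struct pt_regs *old_regs;

    old_regs = __this_cpu_read(__irq_regs);
    __this_cpu_write(__irq_regs, new_regs);
    return old_regs;
}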

desc = __this_cpu_read(vector_irq[vector]) fetches the interrupt descriptor for this vector. The descriptor structure looks like this:

/**
 * struct irq_desc - interrupt descriptor
 * @irq_common_data:    per irq and chip data passed down to chip functions
 * @kstat_irqs:     irq stats per cpu
 * @handle_irq:     highlevel irq-events handler
 * @preflow_handler:    handler called before the flow handler (currently used by sparc)
 * @action:     the irq action chain
 * @status:     status information
 * @core_internal_state__do_not_mess_with_it: core internal status information
 * @depth:      disable-depth, for nested irq_disable() calls
 * @wake_depth:     enable depth, for multiple irq_set_irq_wake() callers
 * @irq_count:      stats field to detect stalled irqs
 * @last_unhandled: aging timer for unhandled count
 * @irqs_unhandled: stats field for spurious unhandled interrupts
 * @threads_handled:    stats field for deferred spurious detection of threaded handlers
 * @threads_handled_last: comparator field for deferred spurious detection of theraded handlers
 * @lock:       locking for SMP
 * @affinity_hint:  hint to user space for preferred irq affinity
 * @affinity_notify:    context for notification of affinity changes
 * @pending_mask:   pending rebalanced interrupts
 * @threads_oneshot:    bitfield to handle shared oneshot threads
 * @threads_active: number of irqaction threads currently running
 * @wait_for_threads:   wait queue for sync_irq to wait for threaded handlers
 * @nr_actions:     number of installed actions on this descriptor
 * @no_suspend_depth:   number of irqactions on a irq descriptor with
 *          IRQF_NO_SUSPEND set
 * @force_resume_depth: number of irqactions on a irq descriptor with
 *          IRQF_FORCE_RESUME set
 * @rcu:        rcu head for delayed free
 * @kobj:       kobject used to represent this struct in sysfs
 * @dir:        /proc/irq/ procfs entry
 * @name:       flow handler name for /proc/interrupts output
 */
struct irq_desc {
    struct irq_common_data  irq_common_data;
    struct irq_data     irq_data;
    unsigned int __percpu   *kstat_irqs;
    irq_flow_handler_t  handle_irq;
#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
    irq_preflow_handler_t   preflow_handler;
#endif
    struct irqaction    *action;    /* IRQ action list */
    unsigned int        status_use_accessors;
    unsigned int        core_internal_state__do_not_mess_with_it;
    unsigned int        depth;      /* nested irq disables */
    unsigned int        wake_depth; /* nested wake enables */
    unsigned int        irq_count;  /* For detecting broken IRQs */
    unsigned long       last_unhandled; /* Aging timer for unhandled count */
    unsigned int        irqs_unhandled;
    atomic_t        threads_handled;
    int         threads_handled_last;
    raw_spinlock_t      lock;
    struct cpumask      *percpu_enabled;
    const struct cpumask    *percpu_affinity;
#ifdef CONFIG_SMP
    const struct cpumask    *affinity_hint;
    struct irq_affinity_notify *affinity_notify;
#ifdef CONFIG_GENERIC_PENDING_IRQ
    cpumask_var_t       pending_mask;
#endif
#endif
    unsigned long       threads_oneshot;
    atomic_t        threads_active;
    wait_queue_head_t       wait_for_threads;
#ifdef CONFIG_PM_SLEEP
    unsigned int        nr_actions;
    unsigned int        no_suspend_depth;
    unsigned int        cond_suspend_depth;
    unsigned int        force_resume_depth;
#endif
#ifdef CONFIG_PROC_FS
    struct proc_dir_entry   *dir;
#endif
#ifdef CONFIG_SPARSE_IRQ
    struct rcu_head     rcu;
    struct kobject      kobj;
#endif
    int         parent_irq;
    struct module       *owner;
    const char      *name;
} ____cacheline_internodealigned_in_smp;
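
As background, vector_irq is a per-CPU array mapping vector numbers to descriptor pointers (arch/x86/include/asm/hw_irq.h):

typedef struct irq_desc *vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);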

handle_irq does little more than a stack-overflow check and a sanity check on the descriptor before dispatching to desc->handle_irq via generic_handle_irq_desc. do_IRQ then calls exiting_irq to finish interrupt handling; exiting_irq is essentially the irq_exit function familiar from earlier kernel versions.
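
The x86-64 variant (arch/x86/kernel/irq_64.c):

bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
    stack_overflow_check(regs);

    if (IS_ERR_OR_NULL(desc))
        return false;

    generic_handle_irq_desc(desc);
    return true;
}

irq_exit is where the interesting exit work happens: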

/*
 * Exit an interrupt context. Process softirqs if needed and possible:
 */
void irq_exit(void)
{
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
    local_irq_disable();
#else
    WARN_ON_ONCE(!irqs_disabled());
#endif

    account_irq_exit_time(current);
    preempt_count_sub(HARDIRQ_OFFSET);
    if (!in_interrupt() && local_softirq_pending())
        invoke_softirq();

    tick_irq_exit();
    rcu_irq_exit();
    trace_hardirq_exit(); /* must be last! */
}

irq_exit drops the hard-IRQ count from preempt_count via preempt_count_sub(HARDIRQ_OFFSET). in_interrupt then inspects the hard-IRQ, soft-IRQ and NMI fields of preempt_count to decide whether we are still nested inside some other interrupt context; if we are, those fields are non-zero. local_softirq_pending checks this CPU's __softirq_pending mask for raised softirqs. If we are not nested inside another interrupt and softirqs are pending, invoke_softirq is called.
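
The relevant preempt_count fields and the in_interrupt test (include/linux/preempt.h):

/*
 *         PREEMPT_MASK:    0x000000ff
 *         SOFTIRQ_MASK:    0x0000ff00
 *         HARDIRQ_MASK:    0x000f0000
 *             NMI_MASK:    0x00100000
 */
#define hardirq_count()    (preempt_count() & HARDIRQ_MASK)
#define softirq_count()    (preempt_count() & SOFTIRQ_MASK)
#define irq_count()        (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK | NMI_MASK))
#define in_interrupt()     (irq_count())

invoke_softirq then decides where the pending softirqs actually run: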

static inline void invoke_softirq(void)
{
    if (ksoftirqd_running())
        return;

    if (!force_irqthreads) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
        /*
         * We can safely execute softirq on the current stack if
         * it is the irq stack, because it should be near empty
         * at this stage.
         */
        __do_softirq();
#else
        /*
         * Otherwise, irq_exit() is called on the task stack that can
         * be potentially deep already. So call softirq in its own stack
         * to prevent from any overrun.
         */
        do_softirq_own_stack();
#endif
    } else {
        wakeup_softirqd();
    }
}

force_irqthreads is a switch: when it is on, all softirq work is pushed into ksoftirqd, but by default it is off. I once ran into a puzzle here. Running a trivial UDP receive program, mpstat -P ALL 1 showed soft% at only 1% at an ingress rate of 100 kpps (100,000 packets per second), about 8% at 400 kpps, yet a full 100% at 900 kpps, which baffled me at the time. It turned out that the soft% figure in that setup essentially reflects ksoftirqd's CPU usage, and ksoftirqd is only woken once softirq processing is repeatedly interrupted and work piles up; softirq cycles burned on the tail of a hard interrupt are charged to whatever task context the interrupt borrowed.

At low interrupt rates and low load, softirqs finish right on the tail of each hard interrupt and ksoftirqd never wakes, so even though handling 100 kpps should cost around 11% of a CPU, almost none of it runs inside ksoftirqd and the reported soft% stays tiny. As load climbs, more and more softirq runs are preempted by fresh hard interrupts, the backlog accumulates, ksoftirqd wakes more and more often, and soft% rises until it pins at 100% at saturation. I asked this on Stack Overflow and later answered it myself: https://stackoverflow.com/questions/44063602/the-linux-softirq-cpu-usage-looks-strange/44716705#44716705

asmlinkage __visible void __softirq_entry __do_softirq(void)
{
    unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
    unsigned long old_flags = current->flags;
    int max_restart = MAX_SOFTIRQ_RESTART;
    struct softirq_action *h;
    bool in_hardirq;
    __u32 pending;
    int softirq_bit;

    /*
     * Mask out PF_MEMALLOC s current task context is borrowed for the
     * softirq. A softirq handled such as network RX might set PF_MEMALLOC
     * again if the socket is related to swap
     */
    current->flags &= ~PF_MEMALLOC;

    pending = local_softirq_pending();
    account_irq_enter_time(current);

    __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
    in_hardirq = lockdep_softirq_start();

restart:
    /* Reset the pending bitmask before enabling irqs */
    set_softirq_pending(0);

    local_irq_enable();

    h = softirq_vec;

    while ((softirq_bit = ffs(pending))) {
        unsigned int vec_nr;
        int prev_count;

        h += softirq_bit - 1;

        vec_nr = h - softirq_vec;
        prev_count = preempt_count();

        kstat_incr_softirqs_this_cpu(vec_nr);

        trace_softirq_entry(vec_nr);
        h->action(h);
        trace_softirq_exit(vec_nr);
        if (unlikely(prev_count != preempt_count())) {
            pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
                   vec_nr, softirq_to_name[vec_nr], h->action,
                   prev_count, preempt_count());
            preempt_count_set(prev_count);
        }
        h++;
        pending >>= softirq_bit;
    }

    rcu_bh_qs();
    local_irq_disable();

    pending = local_softirq_pending();
    if (pending) {
        if (time_before(jiffies, end) && !need_resched() &&
            --max_restart)
            goto restart;

        wakeup_softirqd();
    }

    lockdep_softirq_end(in_hardirq);
    account_irq_exit_time(current);
    __local_bh_enable(SOFTIRQ_OFFSET);
    WARN_ON_ONCE(in_interrupt());
    tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}

MAX_SOFTIRQ_TIME caps how long one __do_softirq invocation may run, and MAX_SOFTIRQ_RESTART caps how many times its processing loop may restart. account_irq_enter_time charges the elapsed time to IRQ accounting, and __local_bh_disable_ip raises the softirq part of preempt_count, which in effect blocks preemption while softirqs run.
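
The 4.11 values (kernel/softirq.c):

#define MAX_SOFTIRQ_TIME  msecs_to_jiffies(2)
#define MAX_SOFTIRQ_RESTART 10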

Then comes the main loop. set_softirq_pending(0) first clears the pending mask, and local_irq_enable re-enables hard interrupts, so from here on softirq processing can itself be preempted by new hard interrupts. The loop picks the lowest-numbered (highest-priority) pending softirq, bumps its per-CPU statistics, runs h->action, and once the mask is drained disables hard interrupts again. __softirq_pending is then re-read to see whether new softirqs were raised in the meantime; if so, and the time budget, the restart budget, and need_resched() all permit, it jumps back to restart; otherwise it calls wakeup_softirqd to hand the remainder to the ksoftirqd thread.

Finally it accounts the softirq time to the current task, drops the softirq count from preempt_count, and restores the task's PF_MEMALLOC flag.
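
As a side note, the ffs()-driven walk above always serves the lowest-numbered (highest-priority) softirq first. A minimal runnable userspace model of that bitmask walk (not kernel code; the vector names mirror the kernel's softirq enum, and the running offset plays the role of the kernel's h pointer arithmetic over softirq_vec):

#include <stdio.h>
#include <strings.h>    /* ffs() */

/* Names mirror the kernel's enum: HI_SOFTIRQ = 0 ... RCU_SOFTIRQ = 9. */
static const char *names[] = {
    "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK",
    "IRQ_POLL", "TASKLET", "SCHED", "HRTIMER", "RCU",
};

int main(void)
{
    /* Pretend NET_RX (3), SCHED (7) and RCU (9) are pending. */
    unsigned int pending = (1u << 3) | (1u << 7) | (1u << 9);
    int base = 0, bit;

    while ((bit = ffs(pending))) {
        int vec_nr = base + bit - 1;   /* kernel: h += softirq_bit - 1 */

        printf("run softirq %d (%s)\n", vec_nr, names[vec_nr]);
        base += bit;                   /* kernel: h++ after the action */
        pending >>= bit;               /* kernel: pending >>= softirq_bit */
    }
    return 0;
}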

Packet reception starts in the NIC's hard-IRQ handler. Take drivers/net/ethernet/realtek/r8169.c as an example:

static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance)
{
    struct net_device *dev = dev_instance;
    struct rtl8169_private *tp = netdev_priv(dev);
    int handled = 0;
    u16 status;

    status = rtl_get_events(tp);
    if (status && status != 0xffff) {
        status &= RTL_EVENT_NAPI | tp->event_slow;
        if (status) {
            handled = 1;

            rtl_irq_disable(tp);
            napi_schedule(&tp->napi);
        }
    }
    return IRQ_RETVAL(handled);
}

The handler first reads the NIC's interrupt status register. If any NAPI-relevant events are set, it disables the device's interrupts and defers reception to NAPI, so the actual receiving runs with device interrupts masked. napi_schedule calls napi_schedule_prep to verify NAPI's preconditions (not disabled, not already scheduled) and then queues the instance for polling:

/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
                     struct napi_struct *napi)
{
    list_add_tail(&napi->poll_list, &sd->poll_list);
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
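
For context, the wrappers above ____napi_schedule (include/linux/netdevice.h and net/core/dev.c): napi_schedule_prep atomically wins the NAPI_STATE_SCHED bit so a given instance is only queued once, and __napi_schedule masks local IRQs around the list insertion:

static inline void napi_schedule(struct napi_struct *n)
{
    if (napi_schedule_prep(n))
        __napi_schedule(n);
}

void __napi_schedule(struct napi_struct *n)
{
    unsigned long flags;

    local_irq_save(flags);
    ____napi_schedule(this_cpu_ptr(&softnet_data), n);
    local_irq_restore(flags);
}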

__raise_softirq_irqoff raises NET_RX_SOFTIRQ, which just sets the corresponding bit in this CPU's pending mask (kernel/softirq.c):
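
void __raise_softirq_irqoff(unsigned int nr)
{
    trace_softirq_raise(nr);
    or_softirq_pending(1UL << nr);
}

When irq_exit later runs invoke_softirq, that bit is consumed by the NET_RX_SOFTIRQ handler, net_rx_action, and packet processing continues in softirq context: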

static __latent_entropy void net_rx_action(struct softirq_action *h)
{
    struct softnet_data *sd = this_cpu_ptr(&softnet_data);
    unsigned long time_limit = jiffies + 2;
    int budget = netdev_budget;
    LIST_HEAD(list);
    LIST_HEAD(repoll);

    local_irq_disable();
    list_splice_init(&sd->poll_list, &list);
    local_irq_enable();

    for (;;) {
        struct napi_struct *n;

        if (list_empty(&list)) {
            if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
                goto out;
            break;
        }

        n = list_first_entry(&list, struct napi_struct, poll_list);
        budget -= napi_poll(n, &repoll);

        /* If softirq window is exhausted then punt.
         * Allow this to run for 2 jiffies since which will allow
         * an average latency of 1.5/HZ.
         */
        if (unlikely(budget <= 0 ||
                 time_after_eq(jiffies, time_limit))) {
            sd->time_squeeze++;
            break;
        }
    }

    local_irq_disable();

    list_splice_tail_init(&sd->poll_list, &list);
    list_splice_tail(&repoll, &list);
    list_splice(&list, &sd->poll_list);
    if (!list_empty(&sd->poll_list))
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);

    net_rps_action_and_irq_enable(sd);
out:
    __kfree_skb_flush();
}

The softirq handler walks poll_list and calls each device's NAPI poll callback to pull packets in; hard interrupts are enabled during the polling, so new entries can still be appended to poll_list concurrently. This poll_list is exactly the one that ____napi_schedule populated with list_add_tail(&napi->poll_list, &sd->poll_list). The function also enforces both a time limit (two jiffies) and a total packet budget (netdev_budget); if either runs out while work remains on poll_list, it breaks out of the loop, bumps time_squeeze, and raises NET_RX_SOFTIRQ again so the leftovers are handled in a later softirq. The tail of the function hands control to the RPS machinery.
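
To make the poll contract concrete, here is a hypothetical driver skeleton (my_priv, my_clean_rx and my_enable_rx_irq are made-up names, not kernel or r8169 APIs): returning fewer packets than budget plus a napi_complete_done call takes the instance off poll_list and re-enables the device interrupt, while consuming the whole budget keeps it queued for the next round:

/* Hedged sketch of a driver's NAPI poll callback, not real driver code. */
static int my_napi_poll(struct napi_struct *napi, int budget)
{
    struct my_priv *priv = container_of(napi, struct my_priv, napi);
    int work_done;

    /* Assumed helper: drain at most @budget frames from the RX ring,
     * feeding each skb upward with napi_gro_receive(). */
    work_done = my_clean_rx(priv, budget);

    if (work_done < budget) {
        /* Ring drained: leave polled mode and unmask the RX interrupt. */
        napi_complete_done(napi, work_done);
        my_enable_rx_irq(priv);
    }
    return work_done;
}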

In the r8169 driver, the rtl_rx function does the heavy lifting of the NAPI poll:
static int rtl_rx(struct net_device *dev, struct rtl8169_private *tp, u32 budget)
{
    unsigned int cur_rx, rx_left;
    unsigned int count;

    cur_rx = tp->cur_rx;

    for (rx_left = min(budget, NUM_RX_DESC); rx_left > 0; rx_left--, cur_rx++) {
        unsigned int entry = cur_rx % NUM_RX_DESC;
        struct RxDesc *desc = tp->RxDescArray + entry;
        u32 status;

        status = le32_to_cpu(desc->opts1) & tp->opts1_mask;
        if (status & DescOwn)
            break;

        /* This barrier is needed to keep us from reading
         * any other fields out of the Rx descriptor until
         * we know the status of DescOwn
         */
        dma_rmb();

        if (unlikely(status & RxRES)) {
            netif_info(tp, rx_err, dev, "Rx ERROR. status = %08x\n",
                   status);
            dev->stats.rx_errors++;
            if (status & (RxRWT | RxRUNT))
                dev->stats.rx_length_errors++;
            if (status & RxCRC)
                dev->stats.rx_crc_errors++;
            if (status & RxFOVF) {
                rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
                dev->stats.rx_fifo_errors++;
            }
            if ((status & (RxRUNT | RxCRC)) &&
                !(status & (RxRWT | RxFOVF)) &&
                (dev->features & NETIF_F_RXALL))
                goto process_pkt;
        } else {
            struct sk_buff *skb;
            dma_addr_t addr;
            int pkt_size;

process_pkt:
            addr = le64_to_cpu(desc->addr);
            if (likely(!(dev->features & NETIF_F_RXFCS)))
                pkt_size = (status & 0x00003fff) - 4;
            else
                pkt_size = status & 0x00003fff;

            /*
             * The driver does not support incoming fragmented
             * frames. They are seen as a symptom of over-mtu
             * sized frames.
             */
            if (unlikely(rtl8169_fragmented_frame(status))) {
                dev->stats.rx_dropped++;
                dev->stats.rx_length_errors++;
                goto release_descriptor;
            }

            skb = rtl8169_try_rx_copy(tp->Rx_databuff[entry],
                          tp, pkt_size, addr);
            if (!skb) {
                dev->stats.rx_dropped++;
                goto release_descriptor;
            }

            rtl8169_rx_csum(skb, status);
            skb_put(skb, pkt_size);
            skb->protocol = eth_type_trans(skb, dev);

            rtl8169_rx_vlan_tag(desc, skb);

            if (skb->pkt_type == PACKET_MULTICAST)
                dev->stats.multicast++;

            napi_gro_receive(&tp->napi, skb);

            u64_stats_update_begin(&tp->rx_stats.syncp);
            tp->rx_stats.packets++;
            tp->rx_stats.bytes += pkt_size;
            u64_stats_update_end(&tp->rx_stats.syncp);
        }
release_descriptor:
        desc->opts2 = 0;
        rtl8169_mark_to_asic(desc, rx_buf_sz);
    }

    count = cur_rx - tp->cur_rx;
    tp->cur_rx = cur_rx;

    return count;
}

Broadly, this is one big loop draining completed descriptors from the RX ring: each frame's ownership and error bits are checked, an skb (socket buffer) is built for the payload, eth_type_trans parses the Ethernet header, and the skb is then either dropped or handed to napi_gro_receive, which eventually delivers it through netif_receive_skb into the kernel protocol stack. rtl_rx itself is invoked from rtl8169_poll, which the driver registers at initialization time with netif_napi_add(dev, &tp->napi, rtl8169_poll, R8169_NAPI_WEIGHT). For background on the NAPI poll mechanism, see reference 4.

If a driver does not use NAPI (the legacy netif_rx path), the default poll function of the per-CPU backlog runs instead:

static int process_backlog(struct napi_struct *napi, int quota)
{
    struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
    bool again = true;
    int work = 0;

    /* Check if we have pending ipi, its better to send them now,
     * not waiting net_rx_action() end.
     */
    if (sd_has_rps_ipi_waiting(sd)) {
        local_irq_disable();
        net_rps_action_and_irq_enable(sd);
    }

    napi->weight = dev_rx_weight;
    while (again) {
        struct sk_buff *skb;

        while ((skb = __skb_dequeue(&sd->process_queue))) {
            rcu_read_lock();
            __netif_receive_skb(skb);
            rcu_read_unlock();
            input_queue_head_incr(sd);
            if (++work >= quota)
                return work;

        }

        local_irq_disable();
        rps_lock(sd);
        if (skb_queue_empty(&sd->input_pkt_queue)) {
            /*
             * Inline a custom version of __napi_complete().
             * only current cpu owns and manipulates this napi,
             * and NAPI_STATE_SCHED is the only possible flag set
             * on backlog.
             * We can use a plain write instead of clear_bit(),
             * and we dont need an smp_mb() memory barrier.
             */
            napi->state = 0;
            again = false;
        } else {
            skb_queue_splice_tail_init(&sd->input_pkt_queue,
                           &sd->process_queue);
        }
        rps_unlock(sd);
        local_irq_enable();
    }

    return work;
}

This function dequeues packets from sd->process_queue (replenished from input_pkt_queue under the lock), handles each skb in turn, and likewise ends up calling __netif_receive_skb to push the packet into the protocol stack. The protocol stack itself, of course, still runs in bottom-half (softirq) context.
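
For completeness, a hedged sketch of the legacy driver pattern that feeds this backlog (my_build_skb is a made-up helper): the hard-IRQ handler builds the skb itself and passes it to netif_rx, which enqueues it on softnet_data->input_pkt_queue via enqueue_to_backlog and schedules the per-CPU backlog NAPI instance:

static irqreturn_t my_legacy_isr(int irq, void *dev_id)
{
    struct net_device *dev = dev_id;
    struct sk_buff *skb = my_build_skb(dev);  /* copy the frame out of the NIC */

    if (skb)
        netif_rx(skb);  /* enqueue_to_backlog() + raise NET_RX_SOFTIRQ */
    return IRQ_HANDLED;
}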

As this shows, a packet goes through a lot of machinery before it even reaches the layer-3 protocol stack, which is why the in-kernel Linux stack tops out at roughly a million packets per second per core. High-rate deployments therefore use kernel-bypass techniques to skip the kernel stack entirely, since past this point it adds yet more locking and other performance limiters.

References:

  1. Linux NIC driver receive handling: http://blog.csdn.net/yuan1164345228/article/details/18078539
  2. How Linux interrupts fire: http://www.cnblogs.com/tolimit/p/4444850.html
  3. The Linux softirq mechanism: http://www.cnblogs.com/tolimit/p/4495128.html
  4. How NAPI works: http://blog.csdn.net/zhangskd/article/details/21627963
