This article is based on the 4.11 kernel.
When a hardware interrupt fires, Linux enters the do_IRQ function (arch/x86/kernel/irq.c):
/*
 * do_IRQ handles all normal device IRQ's (the special
 * SMP cross-CPU interrupts have their own specific
 * handlers).
 */
__visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
{
    struct pt_regs *old_regs = set_irq_regs(regs);
    struct irq_desc *desc;
    /* high bit used in ret_from_ code */
    unsigned vector = ~regs->orig_ax;

    /*
     * NB: Unlike exception entries, IRQ entries do not reliably
     * handle context tracking in the low-level entry code. This is
     * because syscall entries execute briefly with IRQs on before
     * updating context tracking state, so we can take an IRQ from
     * kernel mode with CONTEXT_USER. The low-level entry code only
     * updates the context if we came from user mode, so we won't
     * switch to CONTEXT_KERNEL. We'll fix that once the syscall
     * code is cleaned up enough that we can cleanly defer enabling
     * IRQs.
     */
    entering_irq();

    /* entering_irq() tells RCU that we're not quiescent. Check it. */
    RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");

    desc = __this_cpu_read(vector_irq[vector]);

    if (!handle_irq(desc, regs)) {
        ack_APIC_irq();

        if (desc != VECTOR_RETRIGGERED) {
            pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
                                 __func__, smp_processor_id(),
                                 vector);
        } else {
            __this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
        }
    }

    exiting_irq();

    set_irq_regs(old_regs);
    return 1;
}
set_irq_regs stores a pointer to the interrupted context's register snapshot in a per-CPU variable and returns the previous value. Saving the old pointer at the top of do_IRQ and restoring it at the end is what allows interrupts to nest.
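For reference, the generic implementation (include/asm-generic/irq_regs.h) is little more than a per-CPU pointer swap:

static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
{
    struct pt_regs *old_regs;

    old_regs = __this_cpu_read(__irq_regs);
    __this_cpu_write(__irq_regs, new_regs);

    return old_regs;
}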
desc = __this_cpu_read(vector_irq[vector]) fetches the interrupt descriptor for this vector from the per-CPU vector table.
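vector_irq is a per-CPU table mapping vector numbers to interrupt descriptors, declared in arch/x86/include/asm/hw_irq.h:

typedef struct irq_desc *vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);

struct irq_desc itself is defined as follows: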
/**
 * struct irq_desc - interrupt descriptor
 * @irq_common_data:   per irq and chip data passed down to chip functions
 * @kstat_irqs:        irq stats per cpu
 * @handle_irq:        highlevel irq-events handler
 * @preflow_handler:   handler called before the flow handler (currently used by sparc)
 * @action:            the irq action chain
 * @status:            status information
 * @core_internal_state__do_not_mess_with_it: core internal status information
 * @depth:             disable-depth, for nested irq_disable() calls
 * @wake_depth:        enable depth, for multiple irq_set_irq_wake() callers
 * @irq_count:         stats field to detect stalled irqs
 * @last_unhandled:    aging timer for unhandled count
 * @irqs_unhandled:    stats field for spurious unhandled interrupts
 * @threads_handled:   stats field for deferred spurious detection of threaded handlers
 * @threads_handled_last: comparator field for deferred spurious detection of threaded handlers
 * @lock:              locking for SMP
 * @affinity_hint:     hint to user space for preferred irq affinity
 * @affinity_notify:   context for notification of affinity changes
 * @pending_mask:      pending rebalanced interrupts
 * @threads_oneshot:   bitfield to handle shared oneshot threads
 * @threads_active:    number of irqaction threads currently running
 * @wait_for_threads:  wait queue for sync_irq to wait for threaded handlers
 * @nr_actions:        number of installed actions on this descriptor
 * @no_suspend_depth:  number of irqactions on a irq descriptor with
 *                     IRQF_NO_SUSPEND set
 * @force_resume_depth: number of irqactions on a irq descriptor with
 *                     IRQF_FORCE_RESUME set
 * @rcu:               rcu head for delayed free
 * @kobj:              kobject used to represent this struct in sysfs
 * @dir:               /proc/irq/ procfs entry
 * @name:              flow handler name for /proc/interrupts output
 */
struct irq_desc {
    struct irq_common_data irq_common_data;
    struct irq_data        irq_data;
    unsigned int __percpu  *kstat_irqs;
    irq_flow_handler_t     handle_irq;
#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
    irq_preflow_handler_t  preflow_handler;
#endif
    struct irqaction       *action;        /* IRQ action list */
    unsigned int           status_use_accessors;
    unsigned int           core_internal_state__do_not_mess_with_it;
    unsigned int           depth;          /* nested irq disables */
    unsigned int           wake_depth;     /* nested wake enables */
    unsigned int           irq_count;      /* For detecting broken IRQs */
    unsigned long          last_unhandled; /* Aging timer for unhandled count */
    unsigned int           irqs_unhandled;
    atomic_t               threads_handled;
    int                    threads_handled_last;
    raw_spinlock_t         lock;
    struct cpumask         *percpu_enabled;
    const struct cpumask   *percpu_affinity;
#ifdef CONFIG_SMP
    const struct cpumask   *affinity_hint;
    struct irq_affinity_notify *affinity_notify;
#ifdef CONFIG_GENERIC_PENDING_IRQ
    cpumask_var_t          pending_mask;
#endif
#endif
    unsigned long          threads_oneshot;
    atomic_t               threads_active;
    wait_queue_head_t      wait_for_threads;
#ifdef CONFIG_PM_SLEEP
    unsigned int           nr_actions;
    unsigned int           no_suspend_depth;
    unsigned int           cond_suspend_depth;
    unsigned int           force_resume_depth;
#endif
#ifdef CONFIG_PROC_FS
    struct proc_dir_entry  *dir;
#endif
#ifdef CONFIG_SPARSE_IRQ
    struct rcu_head        rcu;
    struct kobject         kobj;
#endif
    int                    parent_irq;
    struct module          *owner;
    const char             *name;
} ____cacheline_internodealigned_in_smp;
handle_irq itself does little more than a kernel-stack overflow check before dispatching the interrupt to desc->handle_irq.
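On x86-64 (arch/x86/kernel/irq_64.c) it is essentially:

bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
    stack_overflow_check(regs);

    if (IS_ERR_OR_NULL(desc))
        return false;

    generic_handle_irq_desc(desc);
    return true;
}

do_IRQ then calls exiting_irq to wind down interrupt handling; exiting_irq is just a thin wrapper around irq_exit, the name familiar from earlier kernel versions: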
/*
 * Exit an interrupt context. Process softirqs if needed and possible:
 */
void irq_exit(void)
{
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
    local_irq_disable();
#else
    WARN_ON_ONCE(!irqs_disabled());
#endif
    account_irq_exit_time(current);
    preempt_count_sub(HARDIRQ_OFFSET);
    if (!in_interrupt() && local_softirq_pending())
        invoke_softirq();

    tick_irq_exit();
    rcu_irq_exit();
    trace_hardirq_exit(); /* must be last! */
}
irq_exit first drops this interrupt's own hardirq count from preempt_count via preempt_count_sub(HARDIRQ_OFFSET). in_interrupt then checks the hardirq and softirq counts remaining in preempt_count to decide whether we are nested inside another interrupt; if this interrupt arrived on top of another hardirq or a softirq, the count is still nonzero. local_softirq_pending checks this CPU's __softirq_pending mask for raised softirqs. Only when we are not nested and softirqs are pending does invoke_softirq run.
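For reference, in_interrupt and its relatives are simple mask tests on preempt_count; these are the macros from include/linux/preempt.h:

#define hardirq_count()  (preempt_count() & HARDIRQ_MASK)
#define softirq_count()  (preempt_count() & SOFTIRQ_MASK)
#define irq_count()      (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
                                             | NMI_MASK))

#define in_irq()         (hardirq_count())
#define in_softirq()     (softirq_count())
#define in_interrupt()   (irq_count())

invoke_softirq then decides where the pending softirqs actually run: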
static inline void invoke_softirq(void)
{
    if (ksoftirqd_running())
        return;

    if (!force_irqthreads) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
        /*
         * We can safely execute softirq on the current stack if
         * it is the irq stack, because it should be near empty
         * at this stage.
         */
        __do_softirq();
#else
        /*
         * Otherwise, irq_exit() is called on the task stack that can
         * be potentially deep already. So call softirq in its own stack
         * to prevent from any overrun.
         */
        do_softirq_own_stack();
#endif
    } else {
        wakeup_softirqd();
    }
}
force_irqthreads is a switch: when it is on, all softirqs are processed in ksoftirqd, but it is off by default. I once ran into a puzzle here while load-testing a simple UDP receive program. At an ingress rate of 100 kpps (100,000 packets per second), mpstat -P ALL 1 showed a soft% of barely 1%; at 400 kpps, soft% reached 8%; and at 900 kpps, soft% shot up to 100%. It baffled me for a long time. It turned out that the soft% figure was effectively only capturing ksoftirqd's CPU usage, and ksoftirqd is woken only when softirq work gets preempted frequently enough to pile up a backlog; softirq cycles burned directly at hard-interrupt exit are charged to the context of whatever process was interrupted. At low interrupt rates and light load, softirqs finish right after the hard interrupt without ever waking the ksoftirqd thread, so even though handling 100 kpps should have cost roughly 11% of a core, little of that work ran in ksoftirqd and the reported soft% stayed tiny. As load rose, more and more softirq work was preempted by hard interrupts and accumulated, ksoftirqd was woken frequently, and soft% climbed, until it pegged at 100% at saturation. I asked about this on Stack Overflow and eventually answered it myself: https://stackoverflow.com/questions/44063602/the-linux-softirq-cpu-usage-looks-strange/44716705#44716705
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
    unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
    unsigned long old_flags = current->flags;
    int max_restart = MAX_SOFTIRQ_RESTART;
    struct softirq_action *h;
    bool in_hardirq;
    __u32 pending;
    int softirq_bit;

    /*
     * Mask out PF_MEMALLOC as the current task context is borrowed for the
     * softirq. A softirq handler such as network RX might set PF_MEMALLOC
     * again if the socket is related to swap.
     */
    current->flags &= ~PF_MEMALLOC;

    pending = local_softirq_pending();
    account_irq_enter_time(current);

    __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
    in_hardirq = lockdep_softirq_start();

restart:
    /* Reset the pending bitmask before enabling irqs */
    set_softirq_pending(0);

    local_irq_enable();

    h = softirq_vec;

    while ((softirq_bit = ffs(pending))) {
        unsigned int vec_nr;
        int prev_count;

        h += softirq_bit - 1;

        vec_nr = h - softirq_vec;
        prev_count = preempt_count();

        kstat_incr_softirqs_this_cpu(vec_nr);

        trace_softirq_entry(vec_nr);
        h->action(h);
        trace_softirq_exit(vec_nr);
        if (unlikely(prev_count != preempt_count())) {
            pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
                   vec_nr, softirq_to_name[vec_nr], h->action,
                   prev_count, preempt_count());
            preempt_count_set(prev_count);
        }
        h++;
        pending >>= softirq_bit;
    }

    rcu_bh_qs();
    local_irq_disable();

    pending = local_softirq_pending();
    if (pending) {
        if (time_before(jiffies, end) && !need_resched() &&
            --max_restart)
            goto restart;

        wakeup_softirqd();
    }

    lockdep_softirq_end(in_hardirq);
    account_irq_exit_time(current);
    __local_bh_enable(SOFTIRQ_OFFSET);
    WARN_ON_ONCE(in_interrupt());
    tsk_restore_flags(current, old_flags, PF_MEMALLOC);
}
MAX_SOFTIRQ_TIME caps how long one __do_softirq invocation may run, and MAX_SOFTIRQ_RESTART caps how many times its loop may restart. account_irq_enter_time charges the CPU time consumed so far to the appropriate accounting bucket, and __local_bh_disable_ip adds SOFTIRQ_OFFSET to preempt_count, marking softirq context; since preempt_count is now nonzero, this also blocks preemption.
Then comes the main loop. set_softirq_pending(0) first clears the pending bitmask, and local_irq_enable re-enables hard interrupts, so from this point softirq processing can itself be preempted by hard interrupts. The loop picks off pending softirqs in priority order (lowest bit first), bumps the per-CPU softirq statistics, and runs each handler via h->action. Afterwards hard interrupts are disabled again and __softirq_pending is re-read to see whether new softirqs were raised in the meantime. If so, the loop restarts, provided the time budget is not exhausted, no reschedule is needed, and the restart budget remains; otherwise wakeup_softirqd wakes the ksoftirqd thread to take over.
Finally the function accounts the time this task spent in softirq, drops the softirq count from preempt_count, and restores the task's PF_MEMALLOC flag.
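The two budgets mentioned above are defined at the top of kernel/softirq.c: at most 2 ms worth of jiffies, and at most 10 restarts:

#define MAX_SOFTIRQ_TIME  msecs_to_jiffies(2)
#define MAX_SOFTIRQ_RESTART 10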
Now for how the NIC's hard-interrupt handler kicks off packet reception, taking drivers/net/ethernet/realtek/r8169.c as the example:
static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance)
{
    struct net_device *dev = dev_instance;
    struct rtl8169_private *tp = netdev_priv(dev);
    int handled = 0;
    u16 status;

    status = rtl_get_events(tp);
    if (status && status != 0xffff) {
        status &= RTL_EVENT_NAPI | tp->event_slow;
        if (status) {
            handled = 1;

            rtl_irq_disable(tp);
            napi_schedule(&tp->napi);
        }
    }
    return IRQ_RETVAL(handled);
}
The handler first reads the NIC's event status register. If any event of interest is set, it masks the device's interrupts with rtl_irq_disable and hands reception over to NAPI; the NIC's interrupt stays disabled for the duration of the receive pass. napi_schedule first runs napi_schedule_prep to check that the NAPI preconditions hold (not already scheduled, not disabled), then queues the poll.
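The scheduling chain is short (include/linux/netdevice.h and net/core/dev.c):

static inline void napi_schedule(struct napi_struct *n)
{
    if (napi_schedule_prep(n))
        __napi_schedule(n);
}

void __napi_schedule(struct napi_struct *n)
{
    unsigned long flags;

    local_irq_save(flags);
    ____napi_schedule(this_cpu_ptr(&softnet_data), n);
    local_irq_restore(flags);
}

which lands in ____napi_schedule: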
/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
    list_add_tail(&napi->poll_list, &sd->poll_list);
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
It appends this napi instance to the per-CPU poll_list, and __raise_softirq_irqoff raises the NET_RX_SOFTIRQ softirq; from here on, the packet is handled in softirq context.
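__raise_softirq_irqoff merely sets the corresponding bit in the per-CPU pending mask (kernel/softirq.c):

void __raise_softirq_irqoff(unsigned int nr)
{
    trace_softirq_raise(nr);
    or_softirq_pending(1UL << nr);
}

NET_RX_SOFTIRQ was bound to net_rx_action at boot, via open_softirq(NET_RX_SOFTIRQ, net_rx_action) in net_dev_init, so when the softirq fires we arrive here: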
static __latent_entropy void net_rx_action(struct softirq_action *h)
{
    struct softnet_data *sd = this_cpu_ptr(&softnet_data);
    unsigned long time_limit = jiffies + 2;
    int budget = netdev_budget;
    LIST_HEAD(list);
    LIST_HEAD(repoll);

    local_irq_disable();
    list_splice_init(&sd->poll_list, &list);
    local_irq_enable();

    for (;;) {
        struct napi_struct *n;

        if (list_empty(&list)) {
            if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
                goto out;
            break;
        }

        n = list_first_entry(&list, struct napi_struct, poll_list);
        budget -= napi_poll(n, &repoll);

        /* If softirq window is exhausted then punt.
         * Allow this to run for 2 jiffies since which will allow
         * an average latency of 1.5/HZ.
         */
        if (unlikely(budget <= 0 ||
                     time_after_eq(jiffies, time_limit))) {
            sd->time_squeeze++;
            break;
        }
    }

    local_irq_disable();

    list_splice_tail_init(&sd->poll_list, &list);
    list_splice_tail(&repoll, &list);
    list_splice(&list, &sd->poll_list);
    if (!list_empty(&sd->poll_list))
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);

    net_rps_action_and_irq_enable(sd);
out:
    __kfree_skb_flush();
}
The softirq handler drains the poll_list by invoking each device's NAPI poll callback, and hard interrupts are enabled while it runs, so new entries can keep being appended to poll_list in the meantime. This is the same poll_list that ____napi_schedule added to earlier via list_add_tail(&napi->poll_list, &sd->poll_list). The loop is bounded both in time (2 jiffies) and in packet count (netdev_budget, 300 by default); if either budget runs out, sd->time_squeeze is bumped and the loop ends, and since poll_list may still be non-empty, NET_RX_SOFTIRQ is raised once more. Finally, control passes to the RPS logic in net_rps_action_and_irq_enable.
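napi_poll (net/core/dev.c) is worth a glance here. Stripped of netpoll locking, GRO flushing and the napi_disable path, it boils down to roughly the following simplified sketch (not the verbatim kernel code):

/* Simplified sketch of napi_poll(); netpoll locking, GRO flushing and
 * the napi_disable path are omitted. */
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
    int work = 0, weight = n->weight;

    list_del_init(&n->poll_list);
    if (test_bit(NAPI_STATE_SCHED, &n->state))
        work = n->poll(n, weight);  /* driver callback, e.g. rtl8169_poll */

    /* A napi that consumed its whole weight must be polled again. */
    if (work >= weight)
        list_add_tail(&n->poll_list, repoll);

    return work;
}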
In the r8169 driver, the receive half of the NAPI poll work is done by the rtl_rx function:
static int rtl_rx(struct net_device *dev, struct rtl8169_private *tp, u32 budget)
{
    unsigned int cur_rx, rx_left;
    unsigned int count;

    cur_rx = tp->cur_rx;

    for (rx_left = min(budget, NUM_RX_DESC); rx_left > 0; rx_left--, cur_rx++) {
        unsigned int entry = cur_rx % NUM_RX_DESC;
        struct RxDesc *desc = tp->RxDescArray + entry;
        u32 status;

        status = le32_to_cpu(desc->opts1) & tp->opts1_mask;
        if (status & DescOwn)
            break;

        /* This barrier is needed to keep us from reading
         * any other fields out of the Rx descriptor until
         * we know the status of DescOwn
         */
        dma_rmb();

        if (unlikely(status & RxRES)) {
            netif_info(tp, rx_err, dev, "Rx ERROR. status = %08x\n",
                       status);
            dev->stats.rx_errors++;
            if (status & (RxRWT | RxRUNT))
                dev->stats.rx_length_errors++;
            if (status & RxCRC)
                dev->stats.rx_crc_errors++;
            if (status & RxFOVF) {
                rtl_schedule_task(tp, RTL_FLAG_TASK_RESET_PENDING);
                dev->stats.rx_fifo_errors++;
            }
            if ((status & (RxRUNT | RxCRC)) &&
                !(status & (RxRWT | RxFOVF)) &&
                (dev->features & NETIF_F_RXALL))
                goto process_pkt;
        } else {
            struct sk_buff *skb;
            dma_addr_t addr;
            int pkt_size;

process_pkt:
            addr = le64_to_cpu(desc->addr);
            if (likely(!(dev->features & NETIF_F_RXFCS)))
                pkt_size = (status & 0x00003fff) - 4;
            else
                pkt_size = status & 0x00003fff;

            /*
             * The driver does not support incoming fragmented
             * frames. They are seen as a symptom of over-mtu
             * sized frames.
             */
            if (unlikely(rtl8169_fragmented_frame(status))) {
                dev->stats.rx_dropped++;
                dev->stats.rx_length_errors++;
                goto release_descriptor;
            }

            skb = rtl8169_try_rx_copy(tp->Rx_databuff[entry],
                                      tp, pkt_size, addr);
            if (!skb) {
                dev->stats.rx_dropped++;
                goto release_descriptor;
            }

            rtl8169_rx_csum(skb, status);
            skb_put(skb, pkt_size);
            skb->protocol = eth_type_trans(skb, dev);

            rtl8169_rx_vlan_tag(desc, skb);

            if (skb->pkt_type == PACKET_MULTICAST)
                dev->stats.multicast++;

            napi_gro_receive(&tp->napi, skb);

            u64_stats_update_begin(&tp->rx_stats.syncp);
            tp->rx_stats.packets++;
            tp->rx_stats.bytes += pkt_size;
            u64_stats_update_end(&tp->rx_stats.syncp);
        }
release_descriptor:
        desc->opts2 = 0;
        rtl8169_mark_to_asic(desc, rx_buf_sz);
    }

    count = cur_rx - tp->cur_rx;
    tp->cur_rx = cur_rx;

    return count;
}
rtl_rx is essentially one big loop: it keeps pulling completed packets out of the rx ring buffer, builds an skb (socket buffer) for each, parses the Ethernet header via eth_type_trans, and either drops the frame or hands the skb to napi_gro_receive, which ultimately delivers it to the kernel protocol stack through netif_receive_skb. rtl_rx is called by rtl8169_poll, which was registered as the NAPI poll callback when the driver initialized:

netif_napi_add(dev, &tp->napi, rtl8169_poll, R8169_NAPI_WEIGHT);

For more on the NAPI poll mechanism, see [4] in the references.
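The shape of such a poll callback is common to NAPI drivers in general. Here is an illustrative skeleton under assumed names (example_priv, example_rx and example_irq_enable are hypothetical stand-ins, not r8169 code):

/* Illustrative NAPI poll skeleton; example_priv, example_rx and
 * example_irq_enable are hypothetical driver-specific pieces. */
static int example_poll(struct napi_struct *napi, int budget)
{
    struct example_priv *priv = container_of(napi, struct example_priv, napi);
    int work_done;

    /* Receive at most `budget` packets, as rtl_rx() does above. */
    work_done = example_rx(priv, budget);

    if (work_done < budget) {
        /* Ring drained: leave polling mode and re-enable the
         * device interrupt, closing the NAPI loop. */
        napi_complete_done(napi, work_done);
        example_irq_enable(priv);
    }
    return work_done;
}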
If the NIC does not support NAPI, the default poll function is process_backlog:
static int process_backlog(struct napi_struct *napi, int quota)
{
    struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
    bool again = true;
    int work = 0;

    /* Check if we have pending ipi, its better to send them now,
     * not waiting net_rx_action() end.
     */
    if (sd_has_rps_ipi_waiting(sd)) {
        local_irq_disable();
        net_rps_action_and_irq_enable(sd);
    }

    napi->weight = dev_rx_weight;
    while (again) {
        struct sk_buff *skb;

        while ((skb = __skb_dequeue(&sd->process_queue))) {
            rcu_read_lock();
            __netif_receive_skb(skb);
            rcu_read_unlock();
            input_queue_head_incr(sd);
            if (++work >= quota)
                return work;
        }

        local_irq_disable();
        rps_lock(sd);
        if (skb_queue_empty(&sd->input_pkt_queue)) {
            /*
             * Inline a custom version of __napi_complete().
             * only current cpu owns and manipulates this napi,
             * and NAPI_STATE_SCHED is the only possible flag set
             * on backlog.
             * We can use a plain write instead of clear_bit(),
             * and we dont need an smp_mb() memory barrier.
             */
            napi->state = 0;
            again = false;
        } else {
            skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                       &sd->process_queue);
        }
        rps_unlock(sd);
        local_irq_enable();
    }

    return work;
}
process_backlog dequeues packets from sd->process_queue and handles each skb, in the end likewise calling __netif_receive_skb to push the packet into the protocol stack. Protocol-stack processing, too, runs in bottom-half (softirq) context.
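This backlog NAPI instance is wired up per CPU at boot in net_dev_init (net/core/dev.c), and legacy drivers reach it through netif_rx, which enqueues the skb onto sd->input_pkt_queue and schedules the backlog napi:

/* Excerpt from net_dev_init(): each CPU's softnet_data carries a
 * backlog napi whose poll callback is process_backlog. */
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;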
Looking back, a packet goes through all of this machinery before it even reaches the layer-3 protocol stack, which is why the in-kernel Linux stack tops out at roughly a million packets per second on a single core. In high-rate scenarios, kernel-bypass techniques are therefore used to sidestep the kernel stack, since the stack above this point contains many locks and other factors that limit performance.
References: