1, Analysis of the irq_exit() function
/*
* Exit an interrupt context. Process softirqs if needed and possible:
*/
void irq_exit(void)
{
#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
local_irq_disable();
#else
lockdep_assert_irqs_disabled();
#endif
account_irq_exit_time(current);
preempt_count_sub(HARDIRQ_OFFSET); // the HARDIRQ part of preempt_count is dropped here
// Check whether we are still in interrupt context and whether any softirq is pending.
// Note: invoke_softirq() is entered only when both conditions hold at the same time.
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
tick_irq_exit();
rcu_irq_exit();
trace_hardirq_exit(); /* must be last! */
}
2, Explanation of in_interrupt():
include/linux/preempt.h
/*
* We put the hardirq and softirq counter into the preemption
* counter. The bitmask has the following meaning:
*
* - bits 0-7 are the preemption count (max preemption depth: 256)
* - bits 8-15 are the softirq count (max # of softirqs: 256)
*
* The hardirq count could in theory be the same as the number of
* interrupts in the system, but we run all interrupt handlers with
* interrupts disabled, so we cannot have nesting interrupts. Though
* there are a few palaeontologic drivers which reenable interrupts in
* the handler, so we need more than one bit here.
*
* PREEMPT_MASK: 0x000000ff
* SOFTIRQ_MASK: 0x0000ff00
* HARDIRQ_MASK: 0x000f0000
* NMI_MASK: 0x00100000
* PREEMPT_NEED_RESCHED: 0x80000000
*/
#define PREEMPT_BITS 8
#define SOFTIRQ_BITS 8
#define HARDIRQ_BITS 4
#define NMI_BITS 1
#define PREEMPT_SHIFT 0
#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS)
#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS)
#define __IRQ_MASK(x) ((1UL << (x))-1)
#define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
#define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT)
#define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT)
#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
#define NMI_OFFSET (1UL << NMI_SHIFT)
include/asm-generic/preempt.h
static __always_inline int preempt_count(void)
{
return READ_ONCE(current_thread_info()->preempt_count);
}
(1) Bits 0-7 are the preemption count, i.e. the maximum supported preemption depth is 256.
(2) Bits 8-15 are the softirq count, i.e. up to 256 levels of softirq nesting / BH-disable can be
counted. Softirqs are further limited by the pending state: a 32-bit variable in which each bit
marks one softirq type, so in practice at most 32 softirq types are supported.
(3) Bits 16-19 are the hardirq count, i.e. the hardware interrupt nesting depth. In theory the
hardirq count could equal the number of interrupts in the system, but since all interrupt
handlers run with interrupts disabled, interrupts cannot nest. A few ancient drivers do
re-enable interrupts inside their handlers, which is why more than one bit is reserved here.
(4) Bit 20 indicates NMI context.
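The bit layout above can be made concrete with a small helper. The sketch below is illustrative only (decode_preempt_count() is not a kernel function); it pulls the individual fields out of a preempt_count value using the masks and shifts defined above:

/* Hypothetical helper: decode a preempt_count value into its fields. */
static inline void decode_preempt_count(unsigned long pc)
{
	unsigned long preempt = (pc & PREEMPT_MASK) >> PREEMPT_SHIFT; /* bits 0-7   */
	unsigned long softirq = (pc & SOFTIRQ_MASK) >> SOFTIRQ_SHIFT; /* bits 8-15  */
	unsigned long hardirq = (pc & HARDIRQ_MASK) >> HARDIRQ_SHIFT; /* bits 16-19 */
	unsigned long nmi     = (pc & NMI_MASK)     >> NMI_SHIFT;     /* bit 20     */

	pr_info("preempt=%lu softirq=%lu hardirq=%lu nmi=%lu\n",
		preempt, softirq, hardirq, nmi);
}

For example, decode_preempt_count(0x00010100) would report hardirq=1 and softirq=1, i.e. a hard interrupt that arrived while a softirq was being processed.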
include/linux/preempt.h
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
| NMI_MASK))
/*
* Are we doing bottom half or hardware interrupt processing?
*
* in_irq() - We're in (hard) IRQ context
* in_softirq() - We have BH disabled, or are processing softirqs
* in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled
* in_serving_softirq() - We're in softirq context
* in_nmi() - We're in NMI context
* in_task() - We're in task context
*
* Note: due to the BH disabled confusion: in_softirq(),in_interrupt() really
* should not be used in new code.
*/
#define in_irq() (hardirq_count())
#define in_softirq() (softirq_count())
#define in_interrupt() (irq_count())
#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
#define in_nmi() (preempt_count() & NMI_MASK)
#define in_task() (!(preempt_count() & \
(NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
Therefore in_interrupt() is non-zero whenever we are in NMI, hardirq or softirq context (or have
BH disabled). It is used to decide whether the conditions for running softirqs are met, because a
softirq must not be preempted by another softirq, nor may softirq processing preempt a hardirq or
an NMI.
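As a worked illustration of the check in irq_exit() (the preempt_count values are assumed purely for this example):

/*
 * Case 1: the hard interrupt arrived in plain task context.
 *   on entry to irq_exit():  preempt_count = 0x00010000 (one HARDIRQ level)
 *   after preempt_count_sub(HARDIRQ_OFFSET): 0x00000000
 *   -> in_interrupt() == 0, so pending softirqs are handled via invoke_softirq().
 *
 * Case 2: the hard interrupt arrived while __do_softirq() was running.
 *   on entry to irq_exit():  preempt_count = 0x00010100 (HARDIRQ + SOFTIRQ_OFFSET)
 *   after preempt_count_sub(HARDIRQ_OFFSET): 0x00000100
 *   -> in_interrupt() != 0, so invoke_softirq() is skipped; the pending bit is
 *      picked up when the interrupted __do_softirq() loop resumes (see below).
 */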
3, invoke_softirq
static inline void invoke_softirq(void)
{
if (ksoftirqd_running(local_softirq_pending()))
return;
/* force_irqthreads indicates whether forced interrupt threading is enabled in
the system. If IRQ threading is not enabled, __do_softirq() or
do_softirq_own_stack() is called to process the softirqs; otherwise
wakeup_softirqd() is called and the threaded path finishes the work. */
if (!force_irqthreads) {
#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
/*
* We can safely execute softirq on the current stack if
* it is the irq stack, because it should be near empty
* at this stage.
*/
__do_softirq();
#else
/*
* Otherwise, irq_exit() is called on the task stack that can
* be potentially deep already. So call softirq in its own stack
* to prevent from any overrun.
*/
do_softirq_own_stack();
#endif
} else {
wakeup_softirqd();
}
}
ksoftirqd_running() checks whether the per-CPU ksoftirqd task has been put into the TASK_RUNNING state, i.e. whether ksoftirqd has already been scheduled to run. If ksoftirqd is already TASK_RUNNING, processing of the current softirqs is skipped here and left to ksoftirqd (unless they belong to SOFTIRQ_NOW_MASK, see below).
/*
* If ksoftirqd is scheduled, we do not want to process pending softirqs
* right now. Let ksoftirqd handle this at its own rate, to get fairness,
* unless we're doing some of the synchronous softirqs.
*/
#define SOFTIRQ_NOW_MASK ((1 << HI_SOFTIRQ) | (1 << TASKLET_SOFTIRQ))
static bool ksoftirqd_running(unsigned long pending)
{
struct task_struct *tsk = __this_cpu_read(ksoftirqd);
if (pending & SOFTIRQ_NOW_MASK)
return false;
return tsk && (tsk->state == TASK_RUNNING) &&
!__kthread_should_park(tsk);
}
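For reference, the softirq types that the pending bitmask (and SOFTIRQ_NOW_MASK above) indexes are defined in include/linux/interrupt.h; in kernels of this generation the list is:

enum
{
	HI_SOFTIRQ=0,
	TIMER_SOFTIRQ,
	NET_TX_SOFTIRQ,
	NET_RX_SOFTIRQ,
	BLOCK_SOFTIRQ,
	IRQ_POLL_SOFTIRQ,
	TASKLET_SOFTIRQ,
	SCHED_SOFTIRQ,
	HRTIMER_SOFTIRQ,
	RCU_SOFTIRQ,	/* Preferable RCU should always be the last softirq */

	NR_SOFTIRQS
};

This is why, even though the softirq count field in preempt_count is 8 bits wide, only 32 softirq types (and currently just NR_SOFTIRQS of them) can actually exist: the pending state is a single 32-bit word with one bit per type.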
#ifdef CONFIG_IRQ_FORCED_THREADING
__read_mostly bool force_irqthreads;
EXPORT_SYMBOL_GPL(force_irqthreads);
static int __init setup_forced_irqthreads(char *arg)
{
force_irqthreads = true;
return 0;
}
early_param("threadirqs", setup_forced_irqthreads);
#endif
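The loop limits referenced in __do_softirq() below (the 2 ms time budget and the 10 restarts mentioned in the comments) come from two constants defined near the top of kernel/softirq.c (the explanatory comment here is paraphrased):

/* Limits for the restart loop in __do_softirq(): stop after MAX_SOFTIRQ_TIME,
 * or after MAX_SOFTIRQ_RESTART passes, and hand the rest over to ksoftirqd.
 * The values were established experimentally to balance latency against
 * fairness. */
#define MAX_SOFTIRQ_TIME  msecs_to_jiffies(2)
#define MAX_SOFTIRQ_RESTART 10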
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
unsigned long old_flags = current->flags;
int max_restart = MAX_SOFTIRQ_RESTART;
struct softirq_action *h;
bool in_hardirq;
__u32 pending;
int softirq_bit;
/*
* Mask out PF_MEMALLOC as the current task context is borrowed for the
* softirq. A softirq handler such as network RX might set PF_MEMALLOC
* again if the socket is related to swap.
*/
current->flags &= ~PF_MEMALLOC;
pending = local_softirq_pending();
account_irq_enter_time(current);
__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
in_hardirq = lockdep_softirq_start();
restart:
// In each pass of the loop, the softirq pending bits are cleared before hard interrupts are allowed to preempt again.
/* Reset the pending bitmask before enabling irqs */
set_softirq_pending(0);
/* Only here are interrupts enabled again. Note: up to this point we have been
running with interrupts disabled, so only from here on can the current softirq
processing be preempted by a hardware interrupt. In other words, softirq
processing is not preemptible by hard interrupts from the very start; only the
code after this point can be preempted. */
local_irq_enable();
/* Note that the code below can be preempted by a hardware interrupt, but when
that hard interrupt finishes, any softirq it registered cannot run immediately.
Remember: although hard interrupts are now enabled, the earlier
__local_bh_disable_ip() call has masked softirq processing. So in this window
we can only be preempted by the hard interrupt itself; the softirq callbacks it
registers cannot run. The reason is that __local_bh_disable_ip() sets a flag
(SOFTIRQ_OFFSET in preempt_count) that acts as a mutex, and that flag is exactly
one of the conditions tested by the in_interrupt() check in irq_exit() and
do_softirq() above; in other words, in_interrupt() checks not only the hardirq
state but the softirq state as well. Therefore a softirq raised by a hard
interrupt in this window cannot re-enter this function; it can only set a
pending bit and wait for the restart loop below (at most MAX_SOFTIRQ_RESTART
passes) to process it. Now fetch the softirq vector table. */
h = softirq_vec;
while ((softirq_bit = ffs(pending))) {
unsigned int vec_nr;
int prev_count;
h += softirq_bit - 1;
vec_nr = h - softirq_vec;
prev_count = preempt_count();
kstat_incr_softirqs_this_cpu(vec_nr);
trace_softirq_entry(vec_nr);
h->action(h);
trace_softirq_exit(vec_nr);
if (unlikely(prev_count != preempt_count())) {
pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
vec_nr, softirq_to_name[vec_nr], h->action,
prev_count, preempt_count());
preempt_count_set(prev_count);
}
h++;
pending >>= softirq_bit;
}
rcu_bh_qs();
// Interrupts are disabled again for the code below. Note: from here on, hardware interrupts can no longer preempt us.
local_irq_disable();
/* As mentioned above, while interrupts were enabled we could only be preempted
by hard interrupts, and softirqs could not be processed during that time. Since
that window may have been interrupted several times, and each interrupt may have
registered a softirq, the full set of pending softirqs has to be re-read here so
that the code below can decide whether to jump back to the restart label. */
pending = local_softirq_pending();
if (pending) {
/* If a hardware interrupt fired during the interrupts-enabled window above and
registered a softirq, that softirq only set its pending bit; it could not run,
because softirq processing has been masked the whole time and, as explained
above, irq_exit() and do_softirq() cannot re-enter this path. Here it gets
another chance. Note that although we are still in a softirq-masked context,
this is where the softirqs registered by hard interrupts during that window are
given a chance to execute; once the softirq mechanism is understood, this is
nothing more than calling, in a particular context, the functions that ISRs
registered in the softirq vector table. If the hard interrupts that just fired
did register softirqs, and the restart count has not yet reached 10, we jump
back to the restart label and repeat all the steps described above: clear the
pending bits, re-enable interrupts, and so on. */
// Note: the loop restarts only if all three conditions hold: the time budget
// (MAX_SOFTIRQ_TIME) is not exhausted, no reschedule is pending, and
// max_restart (MAX_SOFTIRQ_RESTART, i.e. 10) has not been used up.
if (time_before(jiffies, end) && !need_resched() && --max_restart)
goto restart;
/* If softirqs are still pending after the steps above have been repeated up to
10 times (or the time budget ran out), the system is probably experiencing a
softirq burst. To balance the load, a dedicated ksoftirqd thread takes over, so
that softirq processing does not monopolize the CPU for too long. The ksoftirqd
thread itself is a big loop; to avoid overloading the CPU it can yield to other
processes, but note that it does so explicitly, via preempt_xxx()/schedule()
calls. The reason is that as soon as it detects pending softirqs via
local_softirq_pending(), it explicitly calls into the softirq processing code.
In other words, the ksoftirqd thread woken below may well come back into this
function, especially when the system has a very large number of softirqs to
service; that is also why do_softirq() checks in_interrupt() at its entry to
see whether softirq processing is already in progress; again, the purpose is to
prevent re-entry. For the ksoftirqd implementation, see the analysis of the
ksoftirqd() function below. */
wakeup_softirqd();
}
lockdep_softirq_end(in_hardirq);
account_irq_exit_time(current);
// Only at the very end is softirq execution enabled again. Note: local_bh_enable() is not used here, so this does not trigger another call into do_softirq().
__local_bh_enable(SOFTIRQ_OFFSET);
WARN_ON_ONCE(in_interrupt());
current_restore_flags(old_flags, PF_MEMALLOC);
}
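The comments above repeatedly talk about ISRs "registering" softirqs that are later picked up by the restart loop. In practice a driver does this indirectly through a tasklet, which rides on TASKLET_SOFTIRQ/HI_SOFTIRQ. The following is only a minimal sketch using the tasklet API as it existed in kernels of this generation (before the tasklet_setup() conversion); the device handling and the names my_tasklet/my_irq_handler are hypothetical:

#include <linux/interrupt.h>

/* Deferred half: runs later in softirq context, via tasklet_action(). */
static void my_tasklet_fn(unsigned long data)
{
	/* heavier, interrupt-tolerant work goes here (hypothetical) */
}
static DECLARE_TASKLET(my_tasklet, my_tasklet_fn, 0);

/* Top half: the hard-IRQ handler only acknowledges the device and defers. */
static irqreturn_t my_irq_handler(int irq, void *dev_id)
{
	/* ... acknowledge the hardware (hypothetical) ... */
	tasklet_schedule(&my_tasklet);	/* ends up raising TASKLET_SOFTIRQ */
	return IRQ_HANDLED;
}

tasklet_schedule() links the tasklet onto a per-CPU list and calls raise_softirq_irqoff(TASKLET_SOFTIRQ); that pending bit is exactly what the local_softirq_pending() call after local_irq_disable() above re-reads.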
/*
* we cannot loop indefinitely here to avoid userspace starvation,
* but we also don't want to introduce a worst case 1/HZ latency
* to the pending events, so lets the scheduler to balance
* the softirq load for us.
*/
static void wakeup_softirqd(void)
{
/* Interrupts are disabled: no need to stop preemption */
struct task_struct *tsk = __this_cpu_read(ksoftirqd);
if (tsk && tsk->state != TASK_RUNNING)
wake_up_process(tsk);
}
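For context on the thread being woken here: the body of the per-CPU ksoftirqd thread is quite small; in kernels of this generation it looks roughly like the following (reproduced from kernel/softirq.c from memory, so treat it as a reference sketch rather than an exact quote):

static void run_ksoftirqd(unsigned int cpu)
{
	local_irq_disable();
	if (local_softirq_pending()) {
		/*
		 * We can safely run softirq on the inline stack here, since we
		 * are not deep in the task stack.
		 */
		__do_softirq();
		local_irq_enable();
		cond_resched();
		return;
	}
	local_irq_enable();
}

Note how it calls __do_softirq() directly (with interrupts disabled, as __do_softirq() expects) and then cond_resched(), which matches the description above of ksoftirqd yielding explicitly.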
/**
* wake_up_process - Wake up a specific process
* @p: The process to be woken up.
*
* Attempt to wake up the nominated process and move it to the set of runnable
* processes.
*
* Return: 1 if the process was woken up, 0 if it was already running.
*
* This function executes a full memory barrier before accessing the task state.
*/
int wake_up_process(struct task_struct *p)
{
return try_to_wake_up(p, TASK_NORMAL, 0);
}
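The "full memory barrier before accessing the task state" guarantee mentioned above is what makes the classic sleep/wake pattern safe. A minimal sketch of that pattern follows; the flag event_ready and the pointer waiter_task are hypothetical, only the kernel APIs are real:

#include <linux/sched.h>

static bool event_ready;			/* hypothetical shared condition */
static struct task_struct *waiter_task;		/* hypothetical sleeping thread  */

/* Waiter side, e.g. the body of a kthread loop: */
static void wait_for_event(void)
{
	set_current_state(TASK_INTERRUPTIBLE);	/* publish the new state (implies a barrier) */
	if (!event_ready)			/* then re-check the condition */
		schedule();			/* really sleep until woken */
	__set_current_state(TASK_RUNNING);
}

/* Waker side, e.g. from an interrupt handler or another task: */
static void signal_event(void)
{
	event_ready = true;		/* publish the event first ...                */
	wake_up_process(waiter_task);	/* ... then wake; try_to_wake_up() executes a
					 * full barrier before it reads ->state       */
}

This is the same pairing that wakeup_softirqd() relies on: the softirq pending bits are already set when wake_up_process() is called, so the woken ksoftirqd is guaranteed to observe them.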
/**
* try_to_wake_up - wake up a thread
* @p: the thread to be awakened
* @state: the mask of task states that can be woken
* @wake_flags: wake modifier flags (WF_*)
*
* If (@state & @p->state) @p->state = TASK_RUNNING.
*
* If the task was not queued/runnable, also place it back on a runqueue.
*
* Atomic against schedule() which would dequeue a task, also see
* set_current_state().
*
* This function executes a full memory barrier before accessing the task
* state; see set_current_state().
*
* Return: %true if @p->state changes (an actual wakeup was done),
* %false otherwise.
*/
static int
try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
unsigned long flags;
int cpu, success = 0;
preempt_disable();
if (p == current) {
/*
* We're waking current, this means 'p->on_rq' and 'task_cpu(p)
* == smp_processor_id()'. Together this means we can special
* case the whole 'p->on_rq && ttwu_remote()' case below
* without taking any locks.
*
* In particular:
* - we rely on Program-Order guarantees for all the ordering,
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
if (!(p->state & state))
goto out;
success = 1;
cpu = task_cpu(p);
trace_sched_waking(p);
p->state = TASK_RUNNING; /* set the ksoftirqd task's state to TASK_RUNNING, where it waits to be run by the CPU */
trace_sched_wakeup(p);
goto out;
}
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
* reordered with p->state check below. This pairs with mb() in
* set_current_state() the waiting thread does.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
smp_mb__after_spinlock();
if (!(p->state & state))
goto unlock;
trace_sched_waking(p);
/* We're going to change ->state: */
success = 1;
cpu = task_cpu(p);
/*
* Ensure we load p->on_rq _after_ p->state, otherwise it would
* be possible to, falsely, observe p->on_rq == 0 and get stuck
* in smp_cond_load_acquire() below.
*
* sched_ttwu_pending() try_to_wake_up()
* STORE p->on_rq = 1 LOAD p->state
* UNLOCK rq->lock
*
* __schedule() (switch to task 'p')
* LOCK rq->lock smp_rmb();
* smp_mb__after_spinlock();
* UNLOCK rq->lock
*
* [task p]
* STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
*
* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
* __schedule(). See the comment for smp_mb__after_spinlock().
*/
smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags))
goto unlock;
#ifdef CONFIG_SMP
/*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0.
*
* One must be running (->on_cpu == 1) in order to remove oneself
* from the runqueue.
*
* __schedule() (switch to task 'p') try_to_wake_up()
* STORE p->on_cpu = 1 LOAD p->on_rq
* UNLOCK rq->lock
*
* __schedule() (put 'p' to sleep)
* LOCK rq->lock smp_rmb();
* smp_mb__after_spinlock();
* STORE p->on_rq = 0 LOAD p->on_cpu
*
* Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
* __schedule(). See the comment for smp_mb__after_spinlock().
*/
smp_rmb();
/*
* If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, wait until its done referencing the task.
*
* Pairs with the smp_store_release() in finish_task().
*
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
if (task_cpu(p) != cpu) {
wake_flags |= WF_MIGRATED;
psi_ttwu_dequeue(p);
set_task_cpu(p, cpu);
}
#else /* CONFIG_SMP */
if (p->in_iowait) {
delayacct_blkio_end(p);
atomic_dec(&task_rq(p)->nr_iowait);
}
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu, wake_flags);
unlock:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out:
if (success)
ttwu_stat(p, cpu, wake_flags);
preempt_enable();
return success;
}