__report_bad_irq() 是输出该crash log的函数。
/*
* If 99,900 of the previous 100,000 interrupts have not been handled
* then assume that the IRQ is stuck in some manner. Drop a diagnostic
* and try to turn the IRQ off.
*
* (The other 100-of-100,000 interrupts may have been a correctly
* functioning device sharing an IRQ with the failing one)
*/
static void
__report_bad_irq(unsigned int irq, struct irq_desc *desc,
irqreturn_t action_ret)
{
struct irqaction *action;
unsigned long flags;
if (bad_action_ret(action_ret)) {
printk(KERN_ERR "irq event %d: bogus return value %x\n",
irq, action_ret);
} else {
printk(KERN_ERR "irq %d: nobody cared (try booting with "
"the \"irqpoll\" option)\n", irq);
}
dump_stack();
printk(KERN_ERR "handlers:\n");
/*
* We need to take desc->lock here. note_interrupt() is called
* w/o desc->lock held, but IRQ_PROGRESS set. We might race
* with something else removing an action. It's ok to take
* desc->lock here. See synchronize_irq().
*/
raw_spin_lock_irqsave(&desc->lock, flags);
action = desc->action;
while (action) {
printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
if (action->thread_fn)
printk(KERN_CONT " threaded [<%p>] %pf",
action->thread_fn, action->thread_fn);
printk(KERN_CONT "\n");
action = action->next;
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
static void
report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
{
static int count = 100;
if (count > 0) {
count--;
__report_bad_irq(irq, desc, action_ret);
}
}
在note_interrupt() 函数中,封装了出现crash的两种情况。
void note_interrupt(unsigned int irq, struct irq_desc *desc,
irqreturn_t action_ret)
{
if (desc->istate & IRQS_POLL_INPROGRESS)
return;
/* we get here again via the threaded handler */
if (action_ret == IRQ_WAKE_THREAD)
return;
if (bad_action_ret(action_ret)) {
report_bad_irq(irq, desc, action_ret);
return;
}
if (unlikely(action_ret == IRQ_NONE)) {
/*
* If we are seeing only the odd spurious IRQ caused by
* bus asynchronicity then don't eventually trigger an error,
* otherwise the counter becomes a doomsday timer for otherwise
* working systems
*/
if (time_after(jiffies, desc->last_unhandled + HZ/10))
desc->irqs_unhandled = 1;
else
desc->irqs_unhandled++;
desc->last_unhandled = jiffies;
}
if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
int ok = misrouted_irq(irq);
if (action_ret == IRQ_NONE)
desc->irqs_unhandled -= ok;
}
desc->irq_count++;
if (likely(desc->irq_count < 100000))
return;
desc->irq_count = 0;
if (unlikely(desc->irqs_unhandled > 99900)) {
/*
* The interrupt is stuck
*/
__report_bad_irq(irq, desc, action_ret);
/*
* Now kill the IRQ
*/
printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
desc->istate |= IRQS_SPURIOUS_DISABLED;
desc->depth++;
irq_disable(desc);
mod_timer(&poll_spurious_irq_timer,
jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
desc->irqs_unhandled = 0;
}
从note_interrupt()中得到,当注册irq_action返回值错误(非IRQ_NONE,IRQ_HANDLED, IRQ_WAKE_THREAD)或返回值在很长时间都是IRQ_NONE时,都会导致这个crash。
中断被触发执行,此时却读到中断没有使能,这种情况最经常触发此crash。
irqreturn_t
handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
{
irqreturn_t retval = IRQ_NONE;
unsigned int flags = 0, irq = desc->irq_data.irq;
do {
irqreturn_t res;
trace_irq_handler_entry(irq, action);
res = action->handler(irq, action->dev_id);
trace_irq_handler_exit(irq, action, res);
if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
irq, action->handler))
local_irq_disable();
switch (res) {
case IRQ_WAKE_THREAD:
/*
* Catch drivers which return WAKE_THREAD but
* did not set up a thread function
*/
if (unlikely(!action->thread_fn)) {
warn_no_thread(irq, action);
break;
}
irq_wake_thread(desc, action);
/* Fall through to add to randomness */
case IRQ_HANDLED:
flags |= action->flags;
break;
default:
break;
}
retval |= res;
action = action->next;
} while (action);
add_interrupt_randomness(irq, flags);
if (!noirqdebug)
note_interrupt(irq, desc, retval);
return retval;
}
共享irq时,所有的irq_action都没有被正确处理才会引发该crash。