[root@localhost ~]# uname -a Linux localhost.localdomain 3.10.0-693.5.2.el7.x86_64
问题描述,在crash的时候,小内核因为分配中断号失败而触发panic,打印如下:(备注:本文大内核就是指正常运行的内核,小内核是用于kdump收集crash的内核,下同)
[ 17.428239] ------------[ cut here ]------------ [ 17.433467] kernel BUG at arch/x86/kernel/apic/io_apic.c:1358! [ 17.439916] invalid opcode: 0000 [#1] SMP [ 17.444670] Modules linked in: mpt3sas(OE+) raid_class scsi_transport_sas i40e(OE) ast i2c_algo_bit ptp drm_kms_helper pps_core syscopyarea sysfillrect sysimgblt fb_sys_fops tta [ 17.465081] CPU: 0 PID: 234 Comm: systemd-udevd Tainted: G OE ------------ 3.10.0-693.5.2.el7.x86_64 #1 [ 17.476265] Hardware name: Insyde Purley/Type2 - Board Product Name1, BIOS 00.1 08/24/2017 [ 17.485203] task: ffff880032419fa0 ti: ffff88002bfbc000 task.ti: ffff88002bfbc000 [ 17.493359] RIP: 0010:[] [ ] __clear_irq_vector+0x9d/0x100 [ 17.502671] RSP: 0000:ffff88002bfbf8a8 EFLAGS: 00010046 [ 17.508657] RAX: 0000000000000246 RBX: 00000000000000d6 RCX: 00000000fffffffa [ 17.516473] RDX: 0000000000000001 RSI: ffff880029e1db40 RDI: 00000000000000d6 [ 17.524295] RBP: ffff88002bfbf8d0 R08: 0000000000000000 R09: ffff88002e10eb68 [ 17.532118] R10: 0000000000000000 R11: ffffea0000ab3b80 R12: ffff880029e1db40 [ 17.539943] R13: 0000000000000000 R14: 0000000000000002 R15: ffff880029e1db40 [ 17.547761] FS: 00007f749dd4a8c0(0000) GS:ffff880033c00000(0000) knlGS:0000000000000000 [ 17.556538] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 17.562961] CR2: 00007f749dd52000 CR3: 0000000032402000 CR4: 00000000003407b0 [ 17.570777] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 17.578593] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 17.586393] Stack: [ 17.589077] 00000000000000d6 ffff880029e1db40 0000000000000246 0000000000000002 [ 17.597247] 0000000000000004 ffff88002bfbf8f8 ffffffff8105803e 0000000000000004 [ 17.605413] 00000000000000d6 ffff880029e1db40 ffff88002bfbf938 ffffffff8105902a [ 17.613574] Call Trace: [ 17.616697] [ ] arch_teardown_hwirq+0x3e/0x70 [ 17.623293] [ ] mp_irqdomain_unmap+0xba/0x100 [ 17.629882] [ ] irq_domain_disassociate_many+0xa7/0x130 [ 17.637336] [ ] 
irq_dispose_mapping+0x3c/0x60 [ 17.643922] [ ] mp_unmap_irq+0x81/0xb0 [ 17.649902] [ ] acpi_unregister_gsi_ioapic+0x31/0x40 [ 17.657100] [ ] acpi_unregister_gsi+0x17/0x20 [ 17.663690] [ ] acpi_pci_irq_disable+0xb6/0xc6 [ 17.670359] [ ] pcibios_disable_device+0x20/0x30 [ 17.677194] [ ] do_pci_disable_device+0x56/0x80 [ 17.683941] [ ] pci_disable_device+0x48/0x90 [ 17.690421] [ ] _base_unmap_resources+0xa8/0xf0 [mpt3sas] [ 17.698028] [ ] mpt3sas_base_map_resources+0x188/0x710 [mpt3sas]------调用_base_enable_msix--->_base_request_irq失败,注册中断失败。 [ 17.706242] [ ] mpt3sas_base_attach+0xec/0x9c0 [mpt3sas] [ 17.713763] [ ] _scsih_probe+0x6ad/0xb40 [mpt3sas] [ 17.720752] [ ] local_pci_probe+0x45/0xa0 [ 17.726966] [ ] pci_device_probe+0x109/0x160 [ 17.733434] [ ] driver_probe_device+0xc2/0x3e0 [ 17.740069] [ ] __driver_attach+0x93/0xa0 [ 17.746268] [ ] ? __device_attach+0x40/0x40 [ 17.752629] [ ] bus_for_each_dev+0x73/0xc0 [ 17.758900] [ ] driver_attach+0x1e/0x20 [ 17.764906] [ ] bus_add_driver+0x200/0x2d0 [ 17.771169] [ ] driver_register+0x64/0xf0 [ 17.777345] [ ] __pci_register_driver+0xa5/0xc0 [ 17.784033] [ ] ? 0xffffffffc01e7fff [ 17.789757] [ ] _mpt3sas_init+0x1fa/0x1000 [mpt3sas] [ 17.796857] [ ] do_one_initcall+0xb8/0x230 [ 17.803071] [ ] load_module+0x1f64/0x29e0 [ 17.809183] [ ] ? ddebug_proc_write+0xf0/0xf0 [ 17.815632] [ ] ? copy_module_from_fd.isra.42+0x53/0x150 [ 17.823029] [ ] SyS_finit_module+0xa6/0xd0 [ 17.829209] [ ] system_call_fastpath+0x16/0x1b [ 17.835729] Code: 3f 49 8b 7f 08 31 f6 48 c1 fa 03 41 c6 47 18 00 48 83 e2 f8 e8 e5 d9 2d 00 41 f6 47 19 01 75 0d 5b 41 5c 41 5d 41 5e 41 5f 5d c3 <0f> 0b b8 ff ff ff ff 48 c7 [ 17.857031] RIP [ ] __clear_irq_vector+0x9d/0x100 [ 17.863865] RSP [ 17.867900] ---[ end trace 389c806a74c30735 ]--- [ 17.999540] Kernel panic - not syncing: Fatal exception [ 18.005310] Kernel Offset: disabled [ 18.135825] Rebooting in 30 seconds.. [ 48.141388] ACPI MEMORY or I/O RESET_REG.
串口打印如下:
[ 18.291414] mpt3sas version 21.00.00.00 loaded [ 18.306304] mpt3sas_cm0: 32 BIT PCI BUS DMA ADDRESSING SUPPORTED, total mem (496792 kB) [ 18.395250] mpt3sas_cm0: IOC Number : 0 [ 18.399709] mpt3sas_cm0: CurrentHostPageSize is 0: Setting default host page size to 4k [ 18.419317] mpt3sas0: unable to allocate interrupt 214!
而在大内核中,加载打印如下:
[ 11.056440] mpt3sas version 21.00.00.00 loaded [ 11.062317] mpt3sas_cm0: 64 BIT PCI BUS DMA ADDRESSING SUPPORTED, total mem (393786508 kB) [ 11.072540] ahci 0000:00:11.5: version 3.0
一开始只注意到分配中断失败,但没有注意到小内核中加载的sas驱动是32位模式,而大内核中是64位,所以还以为是加载驱动有问题。一开始以为是sas驱动的21版本才有这个问题,
回退到系统自带的15版本也有这个问题,所以后面的代码以15版本描述为准,当然修改驱动还是修改的21版本。
走查sas驱动代码,加载的分支在:
static int _base_config_dma_addressing(struct MPT3SAS_ADAPTER *ioc, struct pci_dev *pdev) { struct sysinfo s; char *desc = NULL; if (sizeof(dma_addr_t) > 4) { const uint64_t required_mask = dma_get_required_mask(&pdev->dev); if ((required_mask > DMA_BIT_MASK(32)) && !pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) && !pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64))) { ioc->base_add_sg_single = &_base_add_sg_single_64; ioc->sge_size = sizeof(Mpi2SGESimple64_t); desc = "64"; goto out; } } if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) && !pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32))) { ioc->base_add_sg_single = &_base_add_sg_single_32; ioc->sge_size = sizeof(Mpi2SGESimple32_t); desc = "32"; } else return -ENODEV; out: si_meminfo(&s); printk(MPT3SAS_INFO_FMT "%s BIT PCI BUS DMA ADDRESSING SUPPORTED, " "total mem (%ld kB)\n", ioc->name, desc, convert_to_kb(s.totalram)); return 0; }
根据dma_get_required_mask是否大于DMA_BIT_MASK(32)来判断走32位流程还是64位流程。
通过加打印,获取到大内核中对应的打印是:
mpt3sas_cm0: required_mask: 0x7fffffffff DMA_BIT_MASK_32: 0xffffffff
而小内核中,对应的打印是:
mpt3sas_cm0: required_mask: 0x3fffffff DMA_BIT_MASK_32: 0xffffffff
这么说来,小内核中加载32位的驱动是正常的了,是故意为之,取决于保留的内存大小,这个走了弯路。排除了这种可能。
好,那么接下来,继续分析,为什么分配中断失败。先描述调用链:mpt3sas_base_map_resources-->_base_enable_msix-->_base_request_irq-->request_irq。
我们知道,mpt3sas_driver 和i40e的网卡驱动,都属于pci_driver。
static struct pci_driver mpt3sas_driver = { #ifdef MPT2SAS_SCSI .name = MPT2SAS_DRIVER_NAME, #else .name = MPT3SAS_DRIVER_NAME, #endif /* MPT2SAS_SCSI */ .id_table = mpt3sas_pci_table, .probe = _scsih_probe, .remove = scsih_remove, .shutdown = scsih_shutdown, .err_handler = &_mpt3sas_err_handler, #ifdef CONFIG_PM .suspend = scsih_suspend, .resume = scsih_resume, #endif }; static struct pci_driver i40e_driver = { .name = i40e_driver_name, .id_table = i40e_pci_tbl, .probe = i40e_probe, .remove = i40e_remove, #ifdef CONFIG_PM .suspend = i40e_suspend, .resume = i40e_resume, #endif .shutdown = i40e_shutdown, .err_handler = &i40e_err_handler, .sriov_configure = i40e_pci_sriov_configure, };
那么,继续推敲中断的注册,看看有没有什么猫腻。
sas的异常,我们修改sas的打印,获取如下:
[ 17.397195] mpt3sas0: new add vector: 214, name: mpt3sas0! [ 17.403222] mpt3sas0: unable to allocate interrupt 214, r: -38!--------增加了打印返回值,为-38 [ 17.409762] mpt3sas_cm0: _base_unmap_resources
返回值是-38,request_irq调用的是request_threaded_irq---->__setup_irq,这里多说一句,因为这里涉及到中断线程化的一些代码。
一般来说request_irq最终会生成一个irqaction 来调用setup_irq,而setup_irq是__setup_irq的包裹函数,在中断线程化之后,request_threaded_irq调用__setup_irq的上锁过程如下:
chip_bus_lock(desc); retval = __setup_irq(irq, desc, action); chip_bus_sync_unlock(desc);
而包裹函数setup_irq的上锁过程如下。
int setup_irq(unsigned int irq, struct irqaction *act) { int retval; struct irq_desc *desc = irq_to_desc(irq); if (WARN_ON(irq_settings_is_per_cpu_devid(desc))) return -EINVAL; chip_bus_lock(desc); retval = __setup_irq(irq, desc, act); chip_bus_sync_unlock(desc); return retval; }
两者的区别在于使用阶段:在slab初始化之前,只能用setup_irq,因为调用kmalloc申请irqaction,必须是在slab
初始化之后,包括mem_init以及kmem_cache_init。但是对于time_init来说,它位于mem_init代码之前,所以time_init触发的时钟驱动的初始化和注册中断处理函数就没法使用kmalloc来
申请irqaction ,除此之外,并不是所有的中断都可以被线程化,比如时钟中断,主要用来维护系统时间以及定时器等,其中定时器是操作系统的脉搏,一旦被线程化,就有可能被挂起,
有些级联的interrupt controller对应的IRQ也是不能线程化的,所以request_threaded_irq和setup_irq将是一个长期并存的过程,不能线程化的中断具有_IRQ_NOTHREAD标志。
言归正传,下面看__setup_irq 返回出错的地方:
static int __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) { struct irqaction *old, **old_ptr; unsigned long flags, thread_mask = 0; int ret, nested, shared = 0; cpumask_var_t mask; if (!desc) return -EINVAL; if (desc->irq_data.chip == &no_irq_chip) return -ENOSYS;--------------返回出错的地方
errno.h中
#define ENOSYS 38 /* Function not implemented */
在过去,我们会以IRQ number为index,从irq_desc这个全局数组中直接获取中断描述符。由于我们使能了CONFIG_SPARSE_IRQ,现在则需要从radix tree中搜索。使用radix树来存储中断描述符,会减少一些内存占用。所以下面就需要重点分析,为什么这个中断没有初始化chip信息了。
stap -l 'kernel.function("irq_to_desc")'
kernel.function("irq_to_desc@kernel/irq/irqdesc.c:133")
我们来查看调用链,看正常情况下,chip信息在哪初始化的。
_base_enable_msix函数中,既然调用了_base_request_irq,而且一开始没看到“pci_enable_msix_exact failed”这个打印,那么在此之上的pci_enable_msix_exact肯定返回正常。
pci_enable_msix_exact-->pci_enable_msix_range-->pci_enable_msix-->msix_capability_init-->arch_setup_msi_irqs-->native_setup_msi_irqs-->setup_msi_irq-->irq_set_chip_and_handler_name就决定了不会返回-38啊
int irq_set_chip(unsigned int irq, struct irq_chip *chip) { unsigned long flags; struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0); if (!desc) return -EINVAL; if (!chip) chip = &no_irq_chip;-------------赋值为这个的前提是,传入的chip为NULL desc->irq_data.chip = chip; irq_put_desc_unlock(desc, flags); /* * For !CONFIG_SPARSE_IRQ make the irq show up in * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is * already marked, and this call is harmless. */ irq_reserve_irq(irq); return 0; }
而传入给irq_set_chip的chip参数是由setup_msi_irq赋值的,默认为 &msi_chip
int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, unsigned int irq_base, unsigned int irq_offset) { struct irq_chip *chip = &msi_chip;---------------chip的默认参数 struct msi_msg msg; unsigned int irq = irq_base + irq_offset; int ret; ret = msi_compose_msg(dev, irq, &msg, -1); if (ret < 0) return ret; irq_set_msi_desc_off(irq_base, irq_offset, msidesc); /* * MSI-X message is written per-IRQ, the offset is always 0. * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. */ if (!irq_offset) write_msi_msg(irq, &msg); setup_remapped_irq(irq, irq_cfg(irq), chip); irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");-------这个会调用irq_set_chip,见下面。
void irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, irq_flow_handler_t handle, const char *name) { irq_set_chip(irq, chip);----这个chip传入的应该是&msi_chip
__irq_set_handler(irq, handle, 0, name); }
setup_msi_irq传给irq_set_chip_and_handler_name的第二个参数,也就是struct irq_chip指针,是&msi_chip,而不是&no_irq_chip,这个就比较奇怪了。
然后跟同事文洋讨论发现,由于打印级别的问题,pci_enable_msix_exact failed不会打印,所以有可能是pci_enable_msix_exact失败了,然后走try_ioapic流程,
然后在try_ioapic流程中,调用_base_request_irq,再次失败,并进入panic流程。
通过加打印,再次修改mpt3sas_base.c文件
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)) r = pci_enable_msix_exact(ioc->pdev, entries, ioc->reply_queue_count); #else r = pci_enable_msix(ioc->pdev, entries, ioc->reply_queue_count); #endif if (r) { dfailprintk(ioc, printk(MPT3SAS_INFO_FMT #if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)) "pci_enable_msix_exact " #else "pci_enable_msix " #endif "failed (r=%d) !!!\n", ioc->name, r)); kfree(entries); flag_caq=5;-----------设置返回标志,看从哪个fail返回的
goto try_ioapic;
获取打印如下,果然是pci_enable_msix 失败了。
[ 18.930924] mpt3sas_cm0: MSI-X vectors supported: 96, no of cores: 1, max_msix_vectors: -1
[ 18.958778] mpt3sas_cm0: pci_enable_msix failed (r=-1)
[ 18.964598] mpt3sas_cm0: caq enter try_ioapic and flag_caq=5
最终一步步加打印,包括修改打印级别,确定是因为中断数不够用了。因为小内核只使能了一个cpu,而我们大量的pci设备占用了很多中断。
为了减少中断数量,让小内核能够生成crash文件,我们做了两个尝试:一个是将大量申请中断的i40e驱动在kdump的配置中屏蔽掉,见《linux 3.10的kdump配置的小坑》描述;
另外一个尝试是使能多个cpu,也能解决这个问题。