一次 irq_fpu_usable 内核报错排查总结

开发了一个内核ko模块async_memory_reclaim_for_cold_file_area(内核版本5.14.0-284.11.1),使用kprobe技术捕捉内核copy_page_to_iter()函数,在里边执行自定义的hot_file_update_file_status()函数,统计文件页page的访问信息,源码简要如下:

  1. static void kprobe_handler_post(struct kprobe *p, struct pt_regs *regs,
  2.         unsigned long flags)
  3. {
  4.     struct page *page = (struct page *)(regs->di);
  5.     if(page){
  6.         hot_file_update_file_status(page);
  7.     }
  8. }
  9. static struct kprobe kp_read_cache_func = {
  10.     //buffer io read读取文件页page数据执行到 copy_folio_to_iter()
  11.     .symbol_name    = "copy_page_to_iter",
  12. };
  13. kp_read_cache_func.post_handler = kprobe_handler_post;
  14. //注册kprobe函数
  15. ret = register_kprobe(&kp_read_cache_func);

但是偶发触发内核异常告警,如下:

  • [ 2606.173350] WARNING: CPU: 3 PID: 3347 at arch/x86/kernel/fpu/core.c:60 irq_fpu_usable+0x39/0x50
  • [ 2606.173420]  fuse [last unloaded: async_memory_reclaim_for_cold_file_area_debug]
  • [ 2606.173423] CPU: 3 PID: 3347 Comm: bash Kdump: loaded Tainted: G           OE    -------- h---  5.14.0-284.11.1.el9_2.x86_64 #1
  • [ 2606.173425] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
  • [ 2606.173426] RIP: 0010:irq_fpu_usable+0x39/0x50
  • [ 2606.173432] RSP: 0018:ffffb6fe009076e8 EFLAGS: 00010006
  • [ 2606.173434] RAX: ffffffffa30c6794 RBX: ffffffffa30c6740 RCX: 0000000000000024
  • [ 2606.173435] RDX: 0000000080110002 RSI: ffffffffa30c6770 RDI: ffffffffa30c6740
  • [ 2606.173435] RBP: ffffffffa30c6770 R08: 0000000000000004 R09: 0000000000000004
  • [ 2606.173436] R10: ffffffffa30c6794 R11: 00000000000319c0 R12: 0000000000000001
  • [ 2606.173437] R13: ffffffffa30c6740 R14: 0000000000000024 R15: fffffc6140046780
  • [ 2606.173438] FS:  00007f6c674d4740(0000) GS:ffff971e39ec0000(0000) knlGS:0000000000000000
  • [ 2606.173461] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  • [ 2606.173462] CR2: 0000559a22f5dad0 CR3: 000000000c066004 CR4: 00000000003706e0
  • [ 2606.173475]  blake2s_compress+0x20/0xa0
  • [ 2606.173480]  blake2s_final+0x41/0x80
  • [ 2606.173483]  extract_entropy.constprop.0+0x94/0x250
  • [ 2606.173493]  crng_make_state+0x129/0x180
  • [ 2606.173495]  _get_random_bytes.part.0+0x4c/0x190
  • [ 2606.173497]  ? __alloc_pages+0xe6/0x230
  • [ 2606.173499]  get_random_u32+0x59/0x90
  • [ 2606.173501]  allocate_slab+0x14a/0x460
  • [ 2606.173505]  ___slab_alloc+0x44f/0x570
  • [ 2606.173509]  ? hot_file_update_file_status+0x164/0xa60 [async_memory_reclaim_for_cold_file_area]
  • [ 2606.173513]  ? find_busiest_group+0x65/0x190
  • [ 2606.173516]  ? hot_file_update_file_status+0x164/0xa60 [async_memory_reclaim_for_cold_file_area]
  • [ 2606.173519]  kmem_cache_alloc+0x294/0x300
  • [ 2606.173522]  hot_file_update_file_status+0x164/0xa60 [async_memory_reclaim_for_cold_file_area]
  • [ 2606.173526]  ? 0xffffffffc0572011
  • [ 2606.173528]  kprobe_post_process+0x25/0x70
  • [ 2606.173531]  ? 0xffffffffc0572012
  • [ 2606.173532]  kprobe_int3_handler+0x154/0x180
  • [ 2606.173534]  do_int3+0x3b/0x80
  • [ 2606.173536]  exc_int3+0x81/0xc0
  • [ 2606.173546] RSP: 0018:ffffb6fe00907bf0 EFLAGS: 00000246
  • [ 2606.173548] RBP: 0000000000000000 R08: 0000000000000402 R09: ffff971e10b97500
  • [ 2606.173549] R10: ffffb6fe00907d98 R11: ffffffffffffffff R12: ffffb6fe00907d38
  • [ 2606.173550] R13: ffffb6fe00907d60 R14: 0000000000000100 R15: 0000000000000000
  • [ 2606.173553]  ? 0xffffffffc0572012
  • [ 2606.173554]  filemap_read+0x18a/0x320
  • ...........................

显然是irq_fpu_usable()函数里触发的问题。为了查清楚根因,/proc/sys/kernel/panic_on_warn置1。下次触发这个内核告警后,触发了内核crash,并保存了vmcoe。通过vmcore看到出问题时完整函数流程:

  • crash> bt
  • PID: 33     TASK: ffff971d01358000  CPU: 3    COMMAND: "bash"
  •  #0 [ffffb6fe00907400] machine_kexec at ffffffffa126c237
  •  #1 [ffffb6fe00907458] __crash_kexec at ffffffffa13c3c9a
  •  #2 [ffffb6fe00907518] panic at ffffffffa1cd7dc6
  •  #3 [ffffb6fe00907598] __warn.cold at ffffffffa1cd7feb
  •  #4 [ffffb6fe009075d0] report_bug at ffffffffa17ac20e
  •  #5 [ffffb6fe00907608] handle_bug at ffffffffa1d254bc
  •  #6 [ffffb6fe00907618] exc_invalid_op at ffffffffa1d25684
  •  #7 [ffffb6fe00907630] asm_exc_invalid_op at ffffffffa1e00af6
  •     [exception RIP: irq_fpu_usable+57]
  •     RIP: ffffffffa1232d29  RSP: ffffb6fe009076e8  RFLAGS: 00010006
  •     RAX: ffffffffa30c6794  RBX: ffffffffa30c6740  RCX: 0000000000000024
  •     RDX: 0000000080110002  RSI: ffffffffa30c6770  RDI: ffffffffa30c6740
  •     RBP: ffffffffa30c6770   R8: 0000000000000004   R9: 0000000000000004
  •     R10: ffffffffa30c6794  R11: 00000000000319c0  R12: 0000000000000001
  •     R13: ffffffffa30c6740  R14: 0000000000000024  R15: fffffc6140046780
  •     ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
  •  #8 [ffffb6fe009076e8] blake2s_compress at ffffffffa12e5ae0
  •  #9 [ffffb6fe00907718] blake2s_final at ffffffffa176e391
  • #10 [ffffb6fe00907738] extract_entropy.constprop.0 at ffffffffa18e2904
  • #11 [ffffb6fe009077d8] crng_reseed.constprop.0 at ffffffffa18e2b25
  • #12 [ffffb6fe00907818] crng_make_state at ffffffffa18e2dd9
  • #13 [ffffb6fe00907848] _get_random_bytes at ffffffffa18e2e7c
  • #14 [ffffb6fe009078f8] get_random_u32 at ffffffffa18e3129
  • #15 [ffffb6fe00907918] allocate_slab at ffffffffa1577bea
  • #16 [ffffb6fe00907960] ___slab_alloc at ffffffffa157b17f
  • #17 [ffffb6fe00907a00] kmem_cache_alloc at ffffffffa157d194
  • #18 [ffffb6fe00907a48] hot_file_update_file_status at ffffffffc0dd9d24 [async_memory_reclaim_for_cold_file_area]
  • #19 [ffffb6fe00907aa8] kprobe_post_process at ffffffffa126ec65
  • #20 [ffffb6fe00907ab8] kprobe_int3_handler at ffffffffa126ef44
  • #21 [ffffb6fe00907ae8] do_int3 at ffffffffa1225b2b
  • #22 [ffffb6fe00907af8] exc_int3 at ffffffffa1d25f51
  • #23 [ffffb6fe00907b10] asm_exc_int3 at ffffffffa1e00b35
  • #24 [ffffb6fe00907b98] copy_page_to_iter at ffffffffa1764d32
  • #25 [ffffb6fe00907bf8] filemap_read at ffffffffa14d7b9a
  • ..................

触发问题的函数流程是,kprobe捕捉内核函数copy_page_to_iter。在读文件时执行到该函数时,产生int3中断,在中断服务函数里最后执行自定义的hot_file_update_file_status()函数,在该函数里执行kmem_cache_alloc()分配一个自定义slab数据结构。而在分配slab时,执行allocate_slab->shuffle_freelist->get_random_int分配一个随机数,源码如下:

  1. static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
  2. {
  3.     void *start;
  4.     void *cur;
  5.     void *next;
  6.     unsigned long idx, pos, page_limit, freelist_count;
  7.     if (page->objects < 2 || !s->random_seq)
  8.         return false;
  9.     freelist_count = oo_objects(s->oo);
  10.     //分配随机数,这里触发内核告警
  11.     pos = get_random_int() % freelist_count;
  12.     ............
  13.     next = next_freelist_entry(s, page, &pos, start, page_limit,freelist_count);
  14.     set_freepointer(s, cur, next);
  15.     ............
  16. }

为什么分配slab时要分配随机数呢?查资料说是为了使分配的slab obj地址随机,不容易被攻击,安全考虑。

把这个过程的函数流程再整理一下hot_file_update_file_status->kmem_cache_alloc->___slab_alloc->allocate_slab->shuffle_freelist->get_random_int->get_random_u32->_get_random_bytes->crng_make_state->crng_reseed->drain_entropy->extract_entropy->blake2s_final->__blake2s_final->blake2s_compress->kernel_fpu_begin->irq_fpu_usable。再看下最后的几个关键blake2s_compress函数调用:

  1. void blake2s_compress(struct blake2s_state *state, const u8 *block,
  2.               size_t nblocks, const u32 inc)
  3. {
  4.     kernel_fpu_begin();
  5. ..................
  6.     kernel_fpu_end();
  7. }
  8. static inline void kernel_fpu_begin(void)
  9. {
  10.     kernel_fpu_begin_mask(KFPU_MXCSR);
  11. }
  12. void kernel_fpu_begin_mask(unsigned int kfpu_mask)
  13. {
  14.     preempt_disable();
  15.     //这里触发内核告警
  16.     WARN_ON_FPU(!irq_fpu_usable());
  17.     WARN_ON_FPU(this_cpu_read(in_kernel_fpu));
  18.     this_cpu_write(in_kernel_fpu, true);
  19.     ..................
  20. }

是irq_fpu_usable()函数返回false触发的crash,看下这个函数的源码

  1. bool irq_fpu_usable(void)
  2. {
  3.     if (WARN_ON_ONCE(in_nmi()))
  4.         return false;
  5.     if (this_cpu_read(in_kernel_fpu))
  6.         return false;
  7.     if (!in_hardirq())
  8.         return true;
  9.     return !softirq_count();
  10. }

显然,in_nmi()、this_cpu_read(in_kernel_fpu)、softirq_count()大于0都可能会导致该函数返回false,得一一排查。

crash时的cpu是cpu3,很快查找cpu3的in_kernel_fpu这个per cpu 变量是0,

  • crash> in_kernel_fpu
  • PER-CPU DATA TYPE:
  •   bool in_kernel_fpu;
  • PER-CPU ADDRESSES:
  •   [0]: ffff971e39e18ed8
  •   [1]: ffff971e39e58ed8
  •   [2]: ffff971e39e98ed8
  •   //cpu3in_kernel_fpu 变量地址
  •   [3]: ffff971e39ed8ed8
  • crash> rd  0xffff971e39ed8ed8
  • ffff971e39ed8ed8:  0000000000000000                    ........    

in_nmi() 和 softirq_count() 得查看cpu3的preempt_count变量,是0x 80110003,如下:

  • crash> __preempt_count
  • PER-CPU DATA TYPE:
  •   int __preempt_count;
  • PER-CPU ADDRESSES:
  •   [0]: ffff971e39e18f00
  •   [1]: ffff971e39e58f00
  •   [2]: ffff971e39e98f00
  •   //cpu3preempt_count变量地址
  •   [3]: ffff971e39ed8f00
  • crash> rd -x  ffff971e39ed8f00
  • ffff971e39ed8f00:  0000000080110003 

然后查看in_nmi()和softirq_count(),全都来自preempt_count()函数。

  1. #define PREEMPT_NEED_RESCHED    0x80000000
  2. static __always_inline int preempt_count(void)
  3. {
  4.     return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
  5. }
  6. #define PREEMPT_BITS    8
  7. #define PREEMPT_SHIFT   0
  8. #define SOFTIRQ_BITS    8
  9. #define SOFTIRQ_SHIFT   (PREEMPT_SHIFT + PREEMPT_BITS)//8
  10. #define __IRQ_MASK(x)   ((1UL << (x))-1)
  11. #define HARDIRQ_SHIFT   (SOFTIRQ_SHIFT + SOFTIRQ_BITS)//16
  12. #define HARDIRQ_BITS    4
  13.                                                // ((1 << 4) - 1) << 16  0xF << 16
  14. #define HARDIRQ_MASK    (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
  15.                                       //preempt_count bit16~bit19
  16. #define hardirq_count() (preempt_count() & HARDIRQ_MASK)
  17.                                                 //((1 << 8) - 1) << 8    0x7F << 8
  18. #define SOFTIRQ_MASK    (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
  19.                                           //preempt_count bit8~bit15
  20. # define softirq_count()    (preempt_count() & SOFTIRQ_MASK)
  21. #define NMI_SHIFT   (HARDIRQ_SHIFT + HARDIRQ_BITS)//20
  22. #define NMI_BITS    4
  23.                                     //((1 << 4) - 1) << 20   0xF << 20
  24. #define NMI_MASK    (__IRQ_MASK(NMI_BITS)     << NMI_SHIFT)
  25.                                   //preempt_count bit20~bit23
  26. #define nmi_count() (preempt_count() & NMI_MASK)
  27. #define in_nmi()        (nmi_count())

cpu3的preempt_count()值是 0x80110003 , bit8~bit15 是0,bit16~bit19是1,bit20~bit23是1。即 in_nmi() 是1,in_hardirq()是1,softirq_count()是0。显然,问题根源竟然是 in_nmi() 是非0导致的。这个就是NMI计数,非0说明当前处于NMI中断。这怎么可能呢?怎么会处于NMI中断呢?什么是NMI中断,这个一般是当发生hard lock、cpu硬件级别错误时才会发生的不可屏蔽中断。可是这个问题场景只是我利用kprobe捕捉内核函数,然后产生int3中断,在中断服务函数里最后执行自定义的hot_file_update_file_status()函数,在该函数里执行kmem_cache_alloc()分配一个slab数据结构而已,这个普通场景怎么会触发NMI中断呢?

这个虚拟机是rocky 9.2 ,之前就出现过莫名奇妙的问题,怀疑内存数据有问题,难道这次又是内存数据有问题?当遇到莫名其妙的问题,就容易怀疑到内存有问题导致数据跳变、内存越界等导致?哪里会有这么多神奇的行为,还是先耐着性子看下源码吧。于是发现in_nmi()大于0是执行了 __nmi_enter()导致的,如下:

  1. #define __nmi_enter()                       \
  2.     do {                            \
  3.         lockdep_off();                  \
  4.         arch_nmi_enter();               \
  5.         BUG_ON(in_nmi() == NMI_MASK);           \
  6.         __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);   \
  7.     } while (0)

再进一步,发现int3中断里竟然在irqentry_nmi_enter()函数里执行了__nmi_enter,细节如下:

  1. irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
  2. {
  3.     irqentry_state_t irq_state;
  4.     irq_state.lockdep = lockdep_hardirqs_enabled();
  5.     __nmi_enter();// in_nmi 1
  6.     lockdep_hardirqs_off(CALLER_ADDR0);
  7.     lockdep_hardirq_enter();
  8.     rcu_nmi_enter();
  9.     instrumentation_begin();
  10.     trace_hardirqs_off_finish();
  11.     ftrace_nmi_enter();
  12.     instrumentation_end();
  13.     return irq_state;
  14. }
  15. DEFINE_IDTENTRY_RAW(exc_int3)
  16. {
  17.     if (poke_int3_handler(regs))
  18.         return;
  19.    if (user_mode(regs)) {
  20.        ……………
  21.     } else {
  22.         // in_nmi 1
  23.         irqentry_state_t irq_state = irqentry_nmi_enter(regs);
  24.         instrumentation_begin();
  25.         //执行kprobe 的函数
  26.         if (!do_int3(regs))
  27.             die("int3", regs, 0);
  28.         instrumentation_end();
  29.         // in_nmi 1
  30.         irqentry_nmi_exit(regs, irq_state);
  31.     }
  32. }
  33. static bool do_int3(struct pt_regs *regs)
  34. {
  35.     int res;
  36. #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
  37.     if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP,
  38.              SIGTRAP) == NOTIFY_STOP)
  39.         return true;
  40. #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
  41. #ifdef CONFIG_KPROBES
  42.     //执行kprobe 的函数
  43.     if (kprobe_int3_handler(regs))
  44.         return true;
  45. #endif
  46.     res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP);
  47.     return res == NOTIFY_STOP;
  48. }

简单说:从源码角度来说,每次kprobe触发的int3中断,在exc_int3()中执行kprobe hot_file_update_file_status函数前,都会先执行 irqentry_nmi_enter->__nmi_enter 令NMI中断加1。这样看到的,每次执行 hot_file_update_file_status->kmem_cache_alloc 函数分配slab 前,都会出发内核告警irq_fpu_usable。可实际测试是随机的?这是怎么回事?

我认为 hot_file_update_file_status->kmem_cache_alloc->___slab_alloc->allocate_slab->shuffle_freelist->get_random_int->get_random_u32->_get_random_bytes->crng_make_state->crng_reseed->drain_entropy->extract_entropy->blake2s_final->__blake2s_final->blake2s_compress->kernel_fpu_begin->irq_fpu_usable,这个流程并不是每次都执行到最后的 irq_fpu_usable ,中间有很多条件判断,但是一旦执行到irq_fpu_usable ,必然会触发内核告警。

简单说,kprobe的函数里,不能执行 kmem_cache_alloc()分配slab 内存。因为kprobe的init3中断会令NMI计数加1,然后执行kmem_cache_alloc()分配slab有概率执行到irq_fpu_usable,因为NMI计数大于0而触发内核告警。

怎么解决呢?

  • 1:在执行 hot_file_update_file_status->kmem_cache_alloc 前,自己的源码里强制令nmi计数减1,但怕对系统会有不良影响。
  • 2:自己实现一套简易的slab源码,不执行 get_random_int() 函数就行。

但是这个两个方案改动太大,研究源码后,其实有个很简单的方法,不用动内核。

禁止掉 allocate_slab random 功能,就不会执行 hot_file_update_file_status->kmem_cache_alloc->___slab_alloc->allocate_slab->shuffle_freelist->get_random_int 函数,具体改动如下:

  1. struct hot_cold_file_global
  2. {
  3.     struct kmem_cache *file_stat_cachep;
  4. }
  5. struct hot_cold_file_global hot_cold_file_global_info;
  6. static void (*cache_random_seq_destroy)(struct kmem_cache *cachep);
  7. //获取内核cache_random_seq_destroy函数指针
  8. cache_random_seq_destroy = (void *)kallsyms_lookup_name_async("cache_random_seq_destroy");
  9. cache_random_seq_destroy(hot_cold_file_global_info.file_stat_cachep);
  10. //禁止分配 file_stat_cachep  这个slab时使用随机数功能,分配slab时就不会执行到shuffle_freelist->get_random_int函数
  11. cache_random_seq_destroy(hot_cold_file_global_info.file_stat_cachep);

当执行cache_random_seq_destroy()函数后,就令slab的random_seq为NULL

  1. void cache_random_seq_destroy(struct kmem_cache *cachep)
  2. {
  3.     kfree(cachep->random_seq);
  4.     cachep->random_seq = NULL;
  5. }

然后将来分配slab,执行到allocate_slab->shuffle_freelist时,直接返回false,不会再执行到get_random_int()函数。

  1. static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
  2. {
  3.     void *start;
  4.     void *cur;
  5.     void *next;
  6.     unsigned long idx, pos, page_limit, freelist_count;
  7.      //直接return false
  8.     if (page->objects < 2 || !s->random_seq)
  9.         return false;
  10.     freelist_count = oo_objects(s->oo);
  11.     //分配随机数,这里触发crash
  12.     pos = get_random_int() % freelist_count;
  13.     ............
  14. }

你可能感兴趣的:(linux,性能优化,c语言,源码,服务器)