前段时间遇到一个问题,具有一定的代表性,特此记录,后续遇到类似问题时,可以参考这个方向
此问题为低概率煲机测试类问题,概率较低,需要长时间的运行才可以触发到,排查起来比较困难;
煲机12小时以后,概率性出现KE重启;
6台机器同时煲机12小时,其中4台出现上述KE重启现象,且部分机器不只发生一次;
根据以往经验,对于这类KE问题的排查基本可以分为如下几个类型:
<1>[ 1788.158209].(2)[567:mtk_wmtd_worker]Unable to handle kernel paging request at virtual address ffffffe5af4c12af
<1>[ 1788.158212].(2)[567:mtk_wmtd_worker]pgd = ffffffe5e030e000
<1>[ 1788.158217].(2)[567:mtk_wmtd_worker][ffffffe5af4c12af] *pgd=0000000000000000, *pud=0000000000000000
<0>[ 1788.158222]-(2)[567:mtk_wmtd_worker]Internal error: Oops: 96000061 [#1] PREEMPT SMP
<6>[ 1788.158229]-(2)[567:mtk_wmtd_worker]disable aee kernel api
<5>[ 1788.158228]-(2)[567:mtk_wmtd_worker]Kernel Offset: 0xaa3a00000 from 0xffffff8008000000
<4>[ 1788.158249]-(2)[567:mtk_wmtd_worker]Modules linked in: wlan_drv_gen4m ffffff88e98db000 (null) 1925120 0 (O) wmt_chrdev_wifi ffffff88e98d1000 (null) 28672 0 (O) gps_drv ffffff88e98bd000 (null) 61440 0 (O) fmradio_drv ffffff88e988d000 (null) 176128 0 (O) bt_drv ffffff88e9882000 (null) 32768 0 (O) wmt_drv ffffff88e974d000 (null) 1191936 0 (O) fpsgo ffffff88e9745000 (null) 16384 0 (O)
<5>[ 1789.158261]-(2)[567:mtk_wmtd_worker]Non-crashing CPUs did not react to IPI
<4>[ 1789.158272]-(2)[567:mtk_wmtd_worker]CPU: 2 PID: 567 Comm: mtk_wmtd_worker Tainted: G W O 4.9.117 #4
<4>[ 1789.158274]-(2)[567:mtk_wmtd_worker]Hardware name: AC8257V/WAB (DT)
<4>[ 1789.158278]-(2)[567:mtk_wmtd_worker]task: ffffffe5e0190000 task.stack: ffffffe5dcc70000
<4>[ 1789.158291]-(2)[567:mtk_wmtd_worker]PC is at alloc_vmap_area+0x314/0x3e8
<4>[ 1789.158294]-(2)[567:mtk_wmtd_worker]LR is at alloc_vmap_area+0x304/0x3e8
<4>[ 1789.158296]-(2)[567:mtk_wmtd_worker]pc : [] lr : [] pstate: 20400145
接下来是调用栈:
vmlinux __list_add_rcu(new=0xFFFFFFE5AF4C1730, next=0xFFFFFFE5E1192AB0) + 4
vmlinux list_add_rcu(new=0xFFFFFFE5AF4C1730) + 8
vmlinux __insert_vmap_area() + 144
vmlinux alloc_vmap_area(align=1, vstart=-549621596160, vend=-280246681600, node=37748930) + 788
vmlinux __get_vm_area_node(align=1, node=37748930) + 200
vmlinux __vmalloc_node_range(align=1, vm_flags=0, node=-1, caller=0xFFFFFF88E99EB86C) + 112
vmlinux __vmalloc_node(align=1, gfp_mask=37748930, node=-1) + 60
vmlinux __vmalloc_node_flags(node=-1, flags=37748930) + 116
vmlinux vmalloc(size=49) + 116
wlan_drv_gen4m.ko txPwrCtrlGlobalVariableToList() + 248
即最终是在alloc内存时访问未对齐地址(0xFFFFFFE5AF4C12AF)
<3>[ 6245.358909] (2)[3288:Binder:558_E]BUG kmalloc-128 (Tainted: G W O ): Error: Freelist has been corrupted, object: ffffffeeaf7a2000, next_object: 2
<4>[ 6245.358937] (2)[3288:Binder:558_E]Disabling lock debugging due to kernel taint
<3>[ 6245.358945] (2)[3288:Binder:558_E]INFO: Slab 0xffffffbfbabde880 objects=32 used=32 fp=0x (null) flags=0x0080
<4>[ 6245.358959]-(2)[3288:Binder:558_E]------------[ cut here ]------------
<2>[ 6245.358964]-(2)[3288:Binder:558_E]Kernel BUG at ffffff9824016cb8 [verbose debug info unavailable]
<0>[ 6245.358974]-(2)[3288:Binder:558_E]Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
<6>[ 6245.358983]-(2)[3288:Binder:558_E]disable aee kernel api
<5>[ 6245.358988]-(2)[3288:Binder:558_E]Kernel Offset: 0x181be00000 from 0xffffff8008000000
<4>[ 6245.358996]-(2)[3288:Binder:558_E]Modules linked in: wlan_drv_gen4m ffffffad59200000 (null) 1925120 0 (O) wmt_chrdev_wifi ffffffad591f6000 (null) 28672 0 (O) gps_drv ffffffad591e2000 (null) 61440 0 (O) fmradio_drv ffffffad591b2000 (null) 176128 0 (O) bt_drv ffffffad591a7000 (null) 32768 0 (O) wmt_drv ffffffad59072000 (null) 1191936 0 (O) fpsgo ffffffad5906a000 (null) 16384 0 (O)
<5>[ 6246.359046]-(2)[3288:Binder:558_E]Non-crashing CPUs did not react to IPI
<4>[ 6246.359068]-(2)[3288:Binder:558_E]CPU: 2 PID: 3288 Comm: Binder:558_E Tainted: G B W O 4.9.117 #4
<4>[ 6246.359075]-(2)[3288:Binder:558_E]Hardware name: AC8257V/WAB (DT)
<4>[ 6246.359082]-(2)[3288:Binder:558_E]task: ffffffeeccf91000 task.stack: ffffffee9334c000
<4>[ 6246.359098]-(2)[3288:Binder:558_E]PC is at __kmalloc+0x2bc/0x2fc
<4>[ 6246.359104]-(2)[3288:Binder:558_E]LR is at __kmalloc+0x2bc/0x2fc
调用栈:
vmlinux slab_alloc_node() + 628
vmlinux slab_alloc() + 628
vmlinux __kmalloc(size=104, flags=37781696) + 700
vmlinux kmalloc() + 12
vmlinux kzalloc() + 12
vmlinux binder_inc_ref_for_node(proc=0xFFFFFFEEC42F5000, node=0xFFFFFFEEFB5E5D00, strong=true, target_list=0xFFFFFFEEC503E448, rdata=0xFFFFFFEE9334FB38) + 304
vmlinux binder_translate_binder() + 1448
vmlinux binder_transaction() + 8448
vmlinux binder_thread_write(thread=0xFFFFFFEE9334FC80, binder_buffer=501282693376) + 1540
vmlinux binder_ioctl_write_read() + 1776
vmlinux binder_ioctl() + 456
vmlinux vfs_ioctl(filp=0xFFFFFFEED6C60A00, arg=501186143272) + 28
vmlinux do_vfs_ioctl(filp=0xFFFFFFEED6C60A00, fd=11, cmd=3224396289, arg=501186143272) + 2044
vmlinux SYSC_ioctl(fd=11, cmd=3224396289) + 100
vmlinux SyS_ioctl(fd=11, cmd=3224396289, arg=501186143272) + 132
vmlinux el0_svc() + 64
即在分配slab内存的时候触发了bug_on的调用
cat /proc/buddyinfo
在linux中使用buddy算法解决物理内存的外碎片问题,其把所有空闲的内存,以2的幂次方的形式,分成11个块链表,分别对应为1、2、4、8、16、32、64、128、256、512、1024个页块。
类似的还有:slubinfo、zoneinfo、pagetypeinfo
则既然有内存使用问题,且从几份log来看,都定位到了kmalloc分配内存的时候,则优先使用Linux内置的debug工具来做分析,即打开slub_debug配置来做煲机测试;
slub debug功能的话,有一个问题是只会在主动申请时检测,即无法定位到具体是被谁改动的,当然按照经验来看一般都是申请该块内存的程序存在同步问题(个人有限经验);
该部分机器添加修改后,一运行即提示KE异常;
直接看调用栈:
vmlinux check_bytes_and_report() + 136
vmlinux check_object(s=0xFFFFFFC6BA171180, page=0xFFFFFFBF1A6A2C00, object=0xFFFFFFC69A8B0F00, val=187) + 692
vmlinux alloc_consistency_checks() + 112
vmlinux alloc_debug_processing(s=0xFFFFFFC6BA171180, page=0xFFFFFFBF1A6A2C00, object=0xFFFFFFC69A8B0F00, addr=-426485574932) + 172
vmlinux ___slab_alloc(s=0xFFFFFFC6BA171180, gfpflags=37748736, node=-1283812628, c=0xFFFFFFC6FFE83E60) + 1384
vmlinux __slab_alloc() + 32
vmlinux slab_alloc_node() + 668
vmlinux slab_alloc(gfpflags=37748736) + 668
vmlinux __kmalloc(size=8, flags=37748736) + 740
vmlinux kmalloc() + 8
vmlinux usb_control_msg(dev=0xFFFFFFC6FB28D000, request=161, requesttype=192, value=0, index=20, data=0xFFFFFFC6FB30CF00, size=4, timeout=5000) + 156
vmlinux smsc9500_read_reg(dev=0xFFFFFFC6FB204880, index=20, data=0xFFFFFFC6F847F88C) + 192
vmlinux smsc9500_reset(dev=0xFFFFFFC6FB204880) + 84
vmlinux Smsc9500SystemResume(intf=0xFFFFFFC6FB292900) + 324
vmlinux usb_resume_interface() + 180
vmlinux usb_resume_both(udev=0xFFFFFFC6FB28D000) + 436
vmlinux usb_resume(dev=0xFFFFFFC6FB28D098, msg=16) + 28
vmlinux usb_dev_resume(dev=0xFFFFFFC6FB28D098) + 12
vmlinux dpm_run_callback(cb=usb_dev_resume(), dev=0xFFFFFFC6FB28D098, state=16, info=0xFFFFFF9CB408FD58) + 220
vmlinux device_resume(dev=0xFFFFFFC6FB28D098, state=16, async=false) + 652
vmlinux dpm_resume() + 428
vmlinux dpm_resume_end(state=16) + 20
vmlinux suspend_devices_and_enter() + 1936
vmlinux enter_state() + 1624
vmlinux pm_suspend(state=3) + 1752
vmlinux state_store(buf=0xFFFFFFC6ADB8BF00, n=3) + 76
vmlinux kobj_attr_store() + 16
vmlinux sysfs_kf_write(of=0xFFFFFFC6E851C3C0, buf=0xFFFFFFC6ADB8BF00, count=3, pos=0) + 60
vmlinux kernfs_fop_write(file=0xFFFFFFC68813B900, ppos=0xFFFFFFC6F847FE80) + 260
vmlinux __vfs_write(file=0xFFFFFFC68813B900, p=0x0000007005504409, count=3, pos=0xFFFFFFC6F847FE80) + 64
vmlinux vfs_write(file=0xFFFFFFC68813B900, buf=0x0000007005504409, count=3, pos=0xFFFFFFC6F847FE80) + 368
vmlinux SYSC_write(buf=0x0000007005504409) + 40
vmlinux SyS_write(buf=481125483529, count=3) + 80
vmlinux el0_svc() + 64
分析log信息为slub检测异常触发KE,具体slub_info 信息如下:
<6>[ 713.459426].(0)[4719:PhotonicModulat]usb 2-1.1: usb_control_msg, timeout 5000->2000, request=161, type=192, value=0, index=20
<3>[ 713.460888]-(0)[4719:PhotonicModulat]=============================================================================
<3>[ 713.462231]-(0)[4719:PhotonicModulat]BUG kmalloc-64 (Tainted: G W O ): Poison overwritten
<3>[ 713.463409]-(0)[4719:PhotonicModulat]-----------------------------------------------------------------------------
<3>[ 713.463409]
<4>[ 713.464966]-(0)[4719:PhotonicModulat]Disabling lock debugging due to kernel taint
<3>[ 713.465951]-(0)[4719:PhotonicModulat]INFO: 0xffffffc69a8b0f08-0xffffffc69a8b0f27. First byte 0x6a instead of 0x6b
<3>[ 713.467296]-(0)[4719:PhotonicModulat]INFO: Allocated in Smsc9500SystemResume+0x84/0x310 age=1 cpu=0 pid=4719
<3>[ 713.468579]-(0)[4719:PhotonicModulat] ___slab_alloc+0x56c/0x63c
<3>[ 713.469371]-(0)[4719:PhotonicModulat] __kmalloc+0x2e8/0x300
<3>[ 713.470119]-(0)[4719:PhotonicModulat] Smsc9500SystemResume+0x84/0x310
<3>[ 713.470978]-(0)[4719:PhotonicModulat] usb_resume_both+0x1b8/0x234
<3>[ 713.471793]-(0)[4719:PhotonicModulat]INFO: Freed in smsc9500_async_cmd_callback+0x60/0x74 age=1 cpu=0 pid=0
<3>[ 713.473059]-(0)[4719:PhotonicModulat] kfree+0x22c/0x5d8
<3>[ 713.473763]-(0)[4719:PhotonicModulat] smsc9500_async_cmd_callback+0x60/0x74
<3>[ 713.474688]-(0)[4719:PhotonicModulat] __usb_hcd_giveback_urb+0x88/0xe8
<3>[ 713.475556]-(0)[4719:PhotonicModulat] usb_hcd_giveback_urb+0x40/0xd0
<3>[ 713.476401]-(0)[4719:PhotonicModulat]INFO: Slab 0xffffffbf1a6a2c00 objects=21 used=21 fp=0x (null) flags=0x0080
<3>[ 713.477806]-(0)[4719:PhotonicModulat]INFO: Object 0xffffffc69a8b0f00 @offset=3840 fp=0xffffffc69a8b0000
<3>[ 713.477806]
<3>[ 713.479250]-(0)[4719:PhotonicModulat]Object ffffffc69a8b0f00: 6b 6b 6b 6b 6b 6b 6b 6b 6a 6b 6b 6b 6b 6b 6b 6b kkkkkkkkjkkkkkkk
<3>[ 713.480731]-(0)[4719:PhotonicModulat]Object ffffffc69a8b0f10: 6c 6b 6c 6b 6b 6b 6b 6b 18 0f 8b 9a c6 ff ff ff lklkkkkk…
<3>[ 713.482211]-(0)[4719:PhotonicModulat]Object ffffffc69a8b0f20: 18 0f 8b 9a c6 ff ff ff 6b 6b 6b 6b 6b 6b 6b 6b …kkkkkkkk
<3>[ 713.483693]-(0)[4719:PhotonicModulat]Object ffffffc69a8b0f30: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5 kkkkkkkkkkkkkkk.
<3>[ 713.485172]-(0)[4719:PhotonicModulat]Redzone ffffffc69a8b0f40: bb bb bb bb bb bb bb bb …
上述log信息解释:
对应code:
static void smsc9500_async_cmd_callback(struct urb *urb, struct pt_regs *regs)
{
struct USB_CONTEXT *usb_context = (struct USB_CONTEXT *)urb->context;
if (urb->status < 0)
SMSC_WARNING("smsc9500_async_cmd_callback() failed with %d\n",urb->status);
complete((struct completion *)&usb_context->notify);
kfree(&usb_context->req);//问题出在这里,仅释放了其中一部分
usb_free_urb(urb);
}
...
if (wait) {
/* wait_for_completion_timeout only implemented in 2.6.11 and higher kernel version */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11))
expire = msecs_to_jiffies(USB_CTRL_SET_TIMEOUT);
if (!wait_for_completion_timeout(&usb_context->notify, expire)) {
ret = ASYNC_RW_TIMEOUT;
SMSC_TRACE(DBG_WARNING, "urb timeout");
kfree(usb_context);
usb_free_urb(urb);
return ASYNC_RW_FAIL;
}
#endif
}
根据上述log信息可以说明该部分内存申请时,那部分内存被改写了,导致下次申请使用的时候出现问题,但是由于slub debug中并没有指明具体时哪里改写的该部分内容,所以我们简单做一个测试,即删除该模块,煲机测试,2天未出现问题,说明其实改写该部分内存的还是该模块程序,且根据上述调用栈信息和code逻辑来看,应该同步没有做好;
根据上述怀疑点,将smsc9500_async_cmd_callback函数中的free操作去除,而是在主函数中free,验证煲机两天无异常,则此问题暂时关闭
static void smsc9500_async_cmd_callback(struct urb *urb, struct pt_regs *regs)
{
struct USB_CONTEXT *usb_context = (struct USB_CONTEXT *)urb->context;
if (urb->status < 0)
SMSC_WARNING("smsc9500_async_cmd_callback() failed with %d\n",urb->status);
complete((struct completion *)&usb_context->notify);
//kfree(&usb_context->req);
//usb_free_urb(urb);
}
static int
smsc9500_read_reg_async(struct usbnet *dev, u32 index, void *data, int wait)
{
...
if (wait) {
/* wait_for_completion_timeout only implemented in 2.6.11 and higher kernel version */
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,11))
expire = msecs_to_jiffies(USB_CTRL_SET_TIMEOUT);
if (!wait_for_completion_timeout(&usb_context->notify, expire)) {
ret = ASYNC_RW_TIMEOUT;
SMSC_TRACE(DBG_WARNING, "urb timeout");
kfree(usb_context);
usb_free_urb(urb);
return ASYNC_RW_FAIL;
}
#endif
}
kfree(usb_context);
usb_free_urb(urb);//在这里释放
return ret;
}
由于此问题整理时间距离当时处理已经过去蛮久了,部分细节信息记录不是那么准确,特此记录