early_suspend 休眠死机

现象:

待机状态下,按Power键或者自动进入休眠,稳定死机.

1、取出db解开获取SYS_MINI_RDUMP, 使用 gdb 调试:
android@c0490:~/log/6601/20151220-goodix-KE$ arm-linux-androideabi-gdb vmlinux SYS_MINI_RDUMP 
GNU gdb (GDB) 7.6
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later 
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "--host=x86_64-linux-gnu --target=arm-linux-android".
For bug reporting instructions, please see:
...
Reading symbols from /home/android/log/6601/20151220-goodix-KE/vmlinux...done.
[New LWP 103]
[New LWP 1]
[New LWP 2]
[New LWP 3]
[New LWP 4]
[New LWP 5]
[New LWP 6]
[New LWP 7]
[New LWP 8]
Core was generated by `console=tty0 console=ttyMT0,921600n1 root=/dev/ram vmalloc=496M slub_max_order='.
#0  0xd9027400 in ?? ()
(gdb) bt
#0  0xd9027400 in ?? ()
#1  0xc00985c0 in early_suspend (work=) at /home/android/work/prj/6601-32bit-kernel/baseline/kernel-3.10/kernel/power/earlysuspend.c:144
#2  0xc0042b60 in process_one_work (worker=0xdaf6d000, work=0xc0e056a0 )
    at /home/android/work/prj/6601-32bit-kernel/baseline/kernel-3.10/kernel/workqueue.c:2216
#3  0xc0043098 in worker_thread (__worker=0xd90273dc) at /home/android/work/prj/6601-32bit-kernel/baseline/kernel-3.10/kernel/workqueue.c:2348
#4  0xc004c0bc in kthread (_create=0xdbc69df0) at /home/android/work/prj/6601-32bit-kernel/baseline/kernel-3.10/kernel/kthread.c:200
#5  0xc000f308 in ret_from_fork () at /home/android/work/prj/6601-32bit-kernel/baseline/kernel-3.10/arch/arm/kernel/entry-common.S:91
#6  0xc000f308 in ret_from_fork () at /home/android/work/prj/6601-32bit-kernel/baseline/kernel-3.10/arch/arm/kernel/entry-common.S:91
Backtrace stopped: previous frame identical to this frame (corrupt stack?)
(gdb) f 1
#1  0xc00985c0 in early_suspend (work=) at /home/android/work/prj/6601-32bit-kernel/baseline/kernel-3.10/kernel/power/earlysuspend.c:144
144                 pos->suspend(pos);
(gdb) list
139         if (pos->suspend != NULL) {
140             if (!(forbid_id & (0x1 << count))) {
141                 /* if (earlysuspend_debug_mask & DEBUG_VERBOSE) */
142                 pr_warn("ES handlers %d: [%pf], level: %d\n", count, pos->suspend,
143                     pos->level);
144                 pos->suspend(pos);
145             }
146             count++;
147         }
148     }
(gdb)

发现是死在了  kernel-3.10/kernel/power/earlysuspend.c:144 位置,查看具体挂掉的原因,打印汇编代码:
0xc009859c <+348>: b   0xc00984e8 
   0xc00985a0 <+352>: ldr r3, [r4, #8]
   0xc00985a4 <+356>: mov r1, r6
   0xc00985a8 <+360>: movw    r0, #15956 ; 0x3e54
   0xc00985ac <+364>: movt    r0, #49338 ; 0xc0ba
   0xc00985b0 <+368>: bl  0xc09849d8 
   0xc00985b4 <+372>: ldr r3, [r4, #12]
   0xc00985b8 <+376>: mov r0, r4
   0xc00985bc <+380>: blx r3
=> 0xc00985c0 <+384>: b   0xc0098550 
   0xc00985c4 <+388>: addsgt  r3, r10, r8, lsr sp
   0xc00985c8 <+392>: rscgt   r5, r0, r8, asr #12
End of assembler dump.
(gdb) info reg
r0             0xd90273dc  3640816604
r1             0x60785a    6322266
r2             0x0 0
r3             0xd90273e8  3640816616
r4             0xd90273dc  3640816604
r5             0xc0f08180  3236987264
r6             0xc 12
r7             0xc0e05648  3235927624
r8             0xdbc54400  3687138304
r9             0xdafcc030  3673997360
r10            0xdb70b6d8   3681597144
r11            0xdafcddec   3674004972
r12            0xc10af654   3238721108
sp             0xdafcddd0  0xdafcddd0
lr             0xc00985c0  3221849536
pc             0xc00985c0  0xc00985c0 
cpsr           0x200b0013    537591827
(gdb)

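=>  0xc00985c0 <+384>: b   0xc0098550  ---> 跳转挂了,这个是什么异常 o(╯□╰)o  ???
(注:这里的 0xc00985c0 其实是上一条 blx r3 的返回地址(lr 与 pc 同为 0xc00985c0),真正出问题的是 blx r3 跳过去的目标,也就是 frame 0 的 0xd9027400,即 pos->suspend 回调本身.)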

先不管,打印下当前的关键数据:
(gdb) p &pos->suspend
$1 = (void (**)(struct early_suspend *)) 0xd90273e8
(gdb) p pos
$2 = (struct early_suspend *) 0xd90273dc
(gdb)

发生异常的位置找出来了:0xd90273e8 是 pos->suspend 这个函数指针成员的地址,问题就出在这个 suspend 回调上!现在要做的就是想办法把这个回调是从哪里来的给抓出来,先分析 earlysuspend.c 内核代码中的 early_suspend 函数:
static void early_suspend(struct work_struct *work)
{
...
	list_for_each_entry(pos, &early_suspend_handlers, link) {
		if (pos->suspend != NULL) {
			if (!(forbid_id & (0x1 << count))) {
				/* if (earlysuspend_debug_mask & DEBUG_VERBOSE) */
				pr_warn("ES handlers %d: [%pf], level: %d\n", count, pos->suspend,
					pos->level);
				pos->suspend(pos);
			}
			count++;
		}
	}

可以看出,要找到  pos->suspend 就要找到 pos,而 pos 是从 early_suspend_handlers 这个链表得来的,所以看下 early_suspend_handlers 里的节点是在哪里加入的.
追踪代码会发现,链表节点是在   register_early_suspend  函数中注册的:
/*
 * Add @handler to the global early_suspend_handlers list.
 *
 * The walk below stops at the first registered entry whose level is
 * greater than the new handler's level, and the new node is linked in
 * front of that entry (or at the tail if no such entry exists).
 *
 * The list links &handler->link directly, i.e. it points into the
 * caller's structure; nothing here copies the handler.
 * NOTE(review): this implies the caller must unregister the handler
 * before its memory goes away -- confirm against unregister_early_suspend.
 */
void register_early_suspend(struct early_suspend *handler)
{
    struct list_head *pos;
 
    /* All list manipulation happens under early_suspend_lock. */
    mutex_lock(&early_suspend_lock);
    /* Find the insertion point: first entry with a higher level. */
    list_for_each(pos, &early_suspend_handlers) {
        struct early_suspend *e;
        e = list_entry(pos, struct early_suspend, link);
        if (e->level > handler->level)
            break;
    }
    /* Link the new node immediately before pos. */
    list_add_tail(&handler->link, pos);
    early_suspend_count++;
    /* If the SUSPENDED bit is already set, run the new handler's suspend callback right away. */
    if ((state & SUSPENDED) && handler->suspend)
        handler->suspend(handler);
    mutex_unlock(&early_suspend_lock);
}

所以应该只要在这个函数加打印,再抓开机log就可以把错误的 pos->suspend 揪出来!打印信息如下:
/*
 * Debug variant of register_early_suspend(): identical to the stock
 * function except for one extra printk emitted before the handler is
 * linked into early_suspend_handlers.
 *
 * The trace prints, in order: the function name, the handler address,
 * the address of the handler's suspend member, and the suspend callback
 * formatted with %pf. Matching the second address against the value of
 * &pos->suspend seen in gdb identifies which driver registered the
 * faulting handler.
 */
void register_early_suspend(struct early_suspend *handler)
{
    struct list_head *pos;
 
    /* All list manipulation happens under early_suspend_lock. */
    mutex_lock(&early_suspend_lock);
    /* Find the insertion point: first entry with a higher level. */
    list_for_each(pos, &early_suspend_handlers) {
        struct early_suspend *e;
        e = list_entry(pos, struct early_suspend, link);
        if (e->level > handler->level)
            break;
    }
 
    /* Debug trace: who is registering, and where its suspend pointer lives. */
    printk("#---^_^-->>>[%s],[0x%lx],[0x%lx],[%pf]\n", 
        __func__, 
        (long)handler, 
        (long)&handler->suspend, 
        handler->suspend);
     
    /* Link the new node immediately before pos. */
    list_add_tail(&handler->link, pos);
    early_suspend_count++;
    /* If the SUSPENDED bit is already set, run the new handler's suspend callback right away. */
    if ((state & SUSPENDED) && handler->suspend)
        handler->suspend(handler);
    mutex_unlock(&early_suspend_lock);
}

编译、烧机,复现问题(因为加打印改动了代码,地址可能发生变化,所以有可能需要重新复现问题抓db,再用gdb打印 &pos->suspend 的值与开机log做对比),抓开机log搜索: 0xd90273e8 
[   21.423978]<2>.(1)[1:swapper/0]#---^_^-->>>[register_early_suspend],[0xd90273dc],[0xd90273e8],[gfx1xm_early_suspend]


现在就很明显了,凶手就是  gfx1xm_early_suspend 函数,KE问题锁定在gfx1xm驱动范围内!
查看 gfx1xm驱动代码发现 xxx_probe里面有  register_early_suspend 函数:
gfx1xm_dev->early_fp.level       = EARLY_SUSPEND_LEVEL_DISABLE_FB - 1,
gfx1xm_dev->early_fp.suspend = gfx1xm_early_suspend,       
gfx1xm_dev->early_fp.resume      = gfx1xm_late_resume,     
register_early_suspend(&gfx1xm_dev->early_fp);

而复现问题的情况是,当gfx1xm设备不存在的时候才会出现.也就是说设备不存在的时候应该把这个 register 注册的 handler 反注册掉,而当前驱动的 xxx_remove 里面却没有对应的反注册函数,残留在 early_suspend_handlers 链表中的节点在休眠时仍然会被 early_suspend 回调,从而导致了此问题!
这样一分析,那么解决问题就简单了,直接在退出函数加上反注册函数就行了:
static int  gfx1xm_remove(struct spi_device *spi)
{
    struct gfx1xm_dev  *gfx1xm_dev = spi_get_drvdata(spi);
    FUNC_ENTRY();
 
    klog("unregister_early_suspend(&gfx1xm_dev->early_fp)\n");
    unregister_early_suspend(&gfx1xm_dev->early_fp);
...

编译、烧机,开机 验证ok  ==》问题解决




















你可能感兴趣的:(【解题笔记】,【系统异常分析】)