死机问题

作为系统的问题,死机,dump,panic, crash,tombstone 都是常见的问题
在 qcom 平台上,如果希望死机之后直接 reboot(而不是停留在 download 模式),可以先执行 echo 0 > download_mode。
引起 reboot 的原因有很多种:PMIC、OCP、瞬间掉电、panic、watchdog 等。

下面分析死机的一个例子:

log

 244.889835:   <6> Internal error: Accessing user space memory outside uaccess.h routines: 9600004f [#1] PREEMPT SMP
 244.901438:   <6> Modules linked in: wlan(O) machine_dlkm(O) wsa881x_dlkm(O) wcd9360_dlkm(O) wcd934x_dlkm(O) mbhc_dlkm(O) swr_ctrl_dlkm(O) wcd9xxx_dlkm(O) fs16xx_dlkm(O) wcd_core_dlkm(O) stub_dlkm(O) wcd_spi_dlkm(O) hdmi_dlkm(O) swr_dlkm(O) pinctrl_wcd_dlkm(O) usf_dlkm(O) native_dlkm(O) platform_dlkm(O) q6_dlkm(O) adsp_loader_dlkm(O) apr_dlkm(O) q6_notifier_dlkm(O) q6_pdr_dlkm(O) wglink_dlkm(O) msm_11ad_proxy
 244.939517:   <6> CPU: 5 PID: 4008 Comm: cat Tainted: G S      W  O    4.14.83+ #1
 244.948176:   <6> Hardware name: Qualcomm Technologies, Inc. SM8150 V2 PM8150 MTP (DT)
 244.957198:   <6> task: ffffffc42507d800 task.stack: ffffff8044fc8000
 244.964717:   <2> pc : __memcpy+0x30/0x180
 244.969828:   <2> lr : vsnprintf+0x114/0x4d8
 244.975101:   <2> sp : ffffff8044fcba60 pstate : 00400145
 244.981532:   <2> x29: ffffff8044fcbad0 x28: 0000000000000020 
 244.988407:   <2> x27: 000000000000002f x26: 00000000ffffffff 
 244.995286:   <2> x25: ffffff8044fcbbb0 x24: ffffff9a98ea9942 
 245.002169:   <2> x23: 000000621a4dc8b0 x22: ffffff9a98ea9971 
 245.009045:   <2> x21: 000000629a4dc8af x20: ffffff8044fcbb98 
 245.016009:   <2> x19: ffffff9a98a93778 x18: 0000000000000001 
 245.023120:   <2> x17: 000000773a09a78c x16: ffffff9a97a5c4a4 
 245.030006:   <2> x15: aaaaaaaaaaaaaaab x14: 0000000000000050 
 245.036892:   <2> x13: 0000000000000000 x12: ffffff80ffffffd0 
 245.043775:   <2> x11: 0000000000000000 x10: ffffff9a98a93930 
 245.050660:   <2> x9 : ffffff9a988dfac0 x8 : 000000000000002f 
 245.057539:   <2> x7 : 0000001dc76a24c1 x6 : 000000621a4dc8b0 
 245.064418:   <2> x5 : 0000000000000000 x4 : 000000000000000e 
 245.071301:   <2> x3 : 0000000000003d3d x2 : 0000000000000021 
 245.078182:   <2> x1 : ffffff9a98ea9944 x0 : 000000621a4dc8b0 
 245.085064:   <2> 
 PC: 0xffffff9a988d0270:
 245.093008:   <2> 0270  d65f03c0 00000000 00000000 00000000 aa0003e6 f100405f 540003c3 cb0103e4
 245.102829:   <2> 0290  f2400c84 540001c0 cb040042 36000064 38401423 380014c3 36080064 78402423
 245.112647:   <2> 02b0  780024c3 36100064 b8404423 b80044c3 36180064 f8408423 f80084c3 f101005f
 245.122474:   <2> 02d0  5400032a f27c0443 54000140 7100807f 54000080 540000ab a8c12027 a88120c7
 245.132299:   <2> 
 LR: 0xffffff9a988dfaa4:
 245.140243:   <2> faa4  f9400122 aa1703e0 aa1503e1 aa0803e3 9400037f aa0003f7 17ffffe0 eb1502ff
 245.150061:   <2> fac4  54000102 cb1702a8 eb1b011f aa1703e0 1a9bb108 aa1803e1 93407d02 97ffc1e8
 245.159878:   <2> fae4  8b1b02f7 17ffffd5 b9800328 36f800a8 11002109 7100013f b9000329 5400160d
 245.169706:   <2> fb04  f9400288 91002109 f9000289 b9400101 910043e0 940001f6 17ffffc8 b9800328
 245.179529:   <2> 
 SP: 0xffffff8044fcba20:
 245.187482:   <2> ba20  988d02b0 ffffff9a 00400145 00000000 98ea9971 ffffff9a 17b8f400 5549d54f
 245.197302:   <2> ba40  ffffffff 0000007f 98a93778 ffffff9a 44fcbad0 ffffff80 988d02b0 ffffff9a
 245.207126:   <2> ba60  7fffffff 00000000 1a4dc8b0 00000062 00000000 00000000 17b8f400 5549d54f
 245.216950:   <2> ba80  2507d800 ffffffc4 98901000 ffffff9a 998d3000 ffffff9a 00000124 00000000
 245.226782:   <2> 
 245.229750:   <6> Process cat (pid: 4008, stack limit = 0xffffff8044fc8000)
 245.237794:   <2> Call trace:
 245.241741:   <2>  __memcpy+0x30/0x180
 245.246596:   <2>  sprintf+0x7c/0xa4
 245.251165:   <2>  gt1x_debug_read_proc+0x74/0x368
 245.256977:   <2>  proc_reg_read+0x90/0xc4
 245.262084:   <2>  __vfs_read+0x54/0x144
 245.267020:   <2>  vfs_read+0xa4/0x13c
 245.271764:   <2>  SyS_read+0x60/0xc0
 245.276420:   <2>  el0_svc_naked+0x34/0x38
 245.281521:   <6> Code: 38401423 380014c3 36080064 78402423 (780024c3) 
 245.289205:   <6> ---[ end trace 8a0a0b8d7a66762a ]---
 245.295386:   <6> Kernel panic - not syncing: Fatal exception

第一步分析

根据上述log以及Call trace,可以确定是执行到__memcpy函数偏移0x30的地方crash。把vmlinux符号文件拷贝到Android源码路径:android/prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/bin/目录下,使用如下命令反汇编符号文件:aarch64-linux-androidkernel-objdump -D -t ./vmlinux > vmlinux_objdump.txt,然后使用UE工具打开vmlinux_objdump.txt。__memcpy函数入口地址为ffffff80090d0280,则__memcpy+0x30地址为ffffff80090d02b0。摘取vmlinux_objdump.txt反汇编文件中函数__memcpy的部分代码如下,可以看到死机时正在执行的指令为strh w3, [x6],#2(Store halfword from w3 to x6, then x6 = x6 + 2),其指令编码780024c3与log中Code行括号内的值一致。

 ffffff80090d0280 <__memcpy>:
 ffffff80090d0280:   aa0003e6    mov x6, x0
 ffffff80090d0284:   f100405f    cmp x2, #0x10
 ffffff80090d0288:   540003c3    b.cc    ffffff80090d0300 <__memcpy+0x80>
 ffffff80090d028c:   cb0103e4    neg x4, x1
 ffffff80090d0290:   f2400c84    ands    x4, x4, #0xf
 ffffff80090d0294:   540001c0    b.eq    ffffff80090d02cc <__memcpy+0x4c>
 ffffff80090d0298:   cb040042    sub x2, x2, x4
 ffffff80090d029c:   36000064    tbz w4, #0, ffffff80090d02a8 <__memcpy+0x28>
 ffffff80090d02a0:   38401423    ldrb    w3, [x1],#1
 ffffff80090d02a4:   380014c3    strb    w3, [x6],#1
 ffffff80090d02a8:   36080064    tbz w4, #1, ffffff80090d02b4 <__memcpy+0x34>
 ffffff80090d02ac:   78402423    ldrh    w3, [x1],#2
 ffffff80090d02b0:   780024c3    strh    w3, [x6],#2
 ffffff80090d02b4:   36100064    tbz w4, #2, ffffff80090d02c0 <__memcpy+0x40>
 ffffff80090d02b8:   b8404423    ldr w3, [x1],#4
 ffffff80090d02bc:   b80044c3    str w3, [x6],#4
 ffffff80090d02c0:   36180064    tbz w4, #3, ffffff80090d02cc <__memcpy+0x4c>
 ffffff80090d02c4:   f8408423    ldr x3, [x1],#8
 ffffff80090d02c8:   f80084c3    str x3, [x6],#8
 ffffff80090d02cc:   f101005f    cmp x2, #0x40
 ffffff80090d02d0:   5400032a    b.ge    ffffff80090d0334 <__memcpy+0xb4>
 ffffff80090d02d4:   f27c0443    ands    x3, x2, #0x30
 ffffff80090d02d8:   54000140    b.eq    ffffff80090d0300 <__memcpy+0x80>
 ffffff80090d02dc:   7100807f    cmp w3, #0x20
 ffffff80090d02e0:   54000080    b.eq    ffffff80090d02f0 <__memcpy+0x70>
 ffffff80090d02e4:   540000ab    b.lt    ffffff80090d02f8 <__memcpy+0x78>
 ffffff80090d02e8:   a8c12027    ldp x7, x8, [x1],#16
 ffffff80090d02ec:   a88120c7    stp x7, x8, [x6],#16
 ffffff80090d02f0:   a8c12027    ldp x7, x8, [x1],#16

第二步分析

strh w3, [x6],#2 向X6寄存器中保存的地址写入数据。从log中的寄存器信息可以看到,X6的值为000000621a4dc8b0,这是一个用户空间地址,内核不能直接对其进行读写,因此属于非法访问。从__memcpy入口处的指令mov x6, x0可以看到,X6是从寄存器X0拷贝而来,也就是说传递给__memcpy的第一个参数(目的地址)是一个用户空间地址。

  • 64位系统虚拟地址映射:用户地址的位 63:39 设为 0。内核地址的相同位设为 1
    0x0000_0000_0000_0000 – 0x0000_007F_FFFF_FFFF:用户空间
    0xFFFF_FF80_0000_0000 – 0xFFFF_FFFF_FFFF_FFFF:内核

  • AArch64(ARM64)的函数调用约定中,X0~X7用来传递第一至第八个参数,超出的参数通过堆栈来传递。X0寄存器同时用来存放函数的返回值。被调用的函数在返回前无须恢复这些寄存器的值。(32位ARM中对应的是R0~R3传递前四个参数,R0存放返回值。)

第三步分析

根据Call trace我们很容易追踪到非法地址的引入源头gt1x_debug_read_proc+0x74。注意偏移0x74对应的地址为ffffff80089b4560,但需要减去4个字节得到ffffff80089b455c:Call trace中记录的是返回地址,也就是bl指令的下一条指令的地址,而AArch64指令长度固定为4字节,所以减4之后才是实际发起调用的那条bl指令。使用命令aarch64-linux-android-addr2line -e vmlinux -Cf ffffff80089b455c可以转换为具体的代码行,如下可以直接查到第104行代码ptr += sprintf(ptr, "==== GT1X default config setting in driver====\n")

/android/prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/bin$ ./aarch64-linux-android-addr2line -e vmlinux -Cf ffffff80089b455c
gt1x_debug_read_proc
/android/kernel/msm-4.14/drivers/input/touchscreen/gt1x_v1.6_generic/gt1x_generic.c:104
 ffffff80089b44ec <gt1x_debug_read_proc>:
 ffffff80089b44ec:   a9ba6ffc    stp x28, x27, [sp,#-96]!
 ffffff80089b44f0:   a90167fa    stp x26, x25, [sp,#16]
 ffffff80089b44f4:   a9025ff8    stp x24, x23, [sp,#32]
 ffffff80089b44f8:   a90357f6    stp x22, x21, [sp,#48]
 ffffff80089b44fc:   a9044ff4    stp x20, x19, [sp,#64]
 ffffff80089b4500:   a9057bfd    stp x29, x30, [sp,#80]
 ffffff80089b4504:   910143fd    add x29, sp, #0x50
 ffffff80089b4508:   d10043ff    sub sp, sp, #0x10
 ffffff80089b450c:   d000afe8    adrp    x8, ffffff8009fb2000 
 ffffff80089b4510:   aa0303f3    mov x19, x3
 ffffff80089b4514:   aa0103f6    mov x22, x1
 ffffff80089b4518:   f941c508    ldr x8, [x8,#904]
 ffffff80089b451c:   f81a83a8    stur    x8, [x29,#-88]
 ffffff80089b4520:   97db8345    bl  ffffff8008095234 <_mcount>
 ffffff80089b4524:   f000b8fa    adrp    x26, ffffff800a0d3000 
 ffffff80089b4528:   910003e9    mov x9, sp
 ffffff80089b452c:   b941f748    ldr w8, [x26,#500]
 ffffff80089b4530:   91003d08    add x8, x8, #0xf
 ffffff80089b4534:   927c7108    and x8, x8, #0x1fffffff0
 ffffff80089b4538:   cb080135    sub x21, x9, x8
 ffffff80089b453c:   910002bf    mov sp, x21
 ffffff80089b4540:   f9400268    ldr x8, [x19]
 ffffff80089b4544:   b4000068    cbz x8, ffffff80089b4550 
 ffffff80089b4548:   aa1f03e0    mov x0, xzr
 ffffff80089b454c:   140000b4    b   ffffff80089b481c 
 ffffff80089b4550:   b00067a1    adrp    x1, ffffff80096a9000 
 ffffff80089b4554:   91250821    add x1, x1, #0x942
 ffffff80089b4558:   aa1603e0    mov x0, x22
 ffffff80089b455c:   941cb212    bl  ffffff80090e0da4 <sprintf>
 ffffff80089b4560:   d0010ed4    adrp    x20, ffffff800ab8e000 

第四步分析

这个死机问题其实比较简单,不需要前面的分析,也能判断出大概的死机原因。根据这条log:"Internal error: Accessing user space memory outside uaccess.h routines: 9600004f [#1] PREEMPT SMP",就可以大概猜测到传递给__memcpy函数的地址是一个用户空间地址,内核对它进行了直接访问。只需跟踪Call trace,找到非法参数引入的地方即可。查看gt1x_debug_read_proc代码,发现ptr += sprintf(ptr, "==== GT1X default config setting in driver====\n")中的ptr被直接赋值为用户空间指针page,sprintf直接向用户空间地址写数据,故引起死机。

 /*
  * gt1x_debug_read_proc - read handler that dumps GT1X debug information.
  *
  * Formats, in order: the driver's default config (gt1x_config), the config
  * read back from the chip, the chip version fields, and the IRQ state.
  *
  * @file: unused here.
  * @page: destination buffer supplied by the reader; annotated __user.
  * @size: requested read length; never consulted before writing (it is only
  *        overwritten with the produced length just before returning).
  * @ppos: file position; a non-zero value makes the call return 0 so the
  *        dump is produced only once per open.
  *
  * Returns the number of bytes formatted, or 0 when *ppos is non-zero.
  *
  * NOTE(review): this is the faulting version analysed in this document.
  * Every sprintf() below writes straight through a __user pointer, which
  * produces the "Accessing user space memory outside uaccess.h routines"
  * oops shown in the log. Output is also unbounded with respect to @size.
  */
 static ssize_t gt1x_debug_read_proc(struct file *file, char __user * page,
                                     size_t size, loff_t * ppos)
 {

     /* BUG: kernel write cursor aliases the user-space buffer. */
     char *ptr = page;
     /* Variable-length stack array sized by the runtime config length. */
     char temp_data[gt1x_cfg_length];
     struct irq_desc *irq_desc = NULL;
     int i;
     int ret;

     /* Second and later reads: report end of file. */
     if(*ppos) {
         return 0;
     }
 
     /* First sprintf through the __user pointer; this is the call that oopsed. */
     ptr += sprintf(ptr, "==== GT1X default config setting in driver====\n");
 
     /* Hex dump of the driver-side config, with a line break after every tenth value. */
     for(i = 0; i < gt1x_cfg_length; i++) {
         ptr += sprintf(ptr, "0x%02X,", gt1x_config[i]);
         if(i % 10 == 9 && i != GTP_CONFIG_ORG_LENGTH)
             ptr += sprintf(ptr, "\n");
 
         /* Separator after the last byte of the original-length section. */
         if(i == GTP_CONFIG_ORG_LENGTH - 1)
             ptr += sprintf(ptr, "\n-------------\n");
     }
 
     ptr += sprintf(ptr, "\n");
 
     ptr += sprintf(ptr, "==== GT1X config read from chip====\n");
     /* Non-zero return is treated as failure; it is logged but not propagated. */
     ret = gt1x_i2c_read(GTP_REG_CONFIG_DATA, temp_data,
                         GTP_CONFIG_ORG_LENGTH);
     if(ret)
         GTP_ERROR("gt1x_i2c_read GTP_REG_CONFIG_DATA fail!");
 
     /* Read the extended section only when the configured length includes it. */
     if(ret == 0 && gt1x_cfg_length == GTP_CONFIG_ORG_LENGTH +
             GTP_CONFIG_EXT_LENGTH) {
         ret = gt1x_i2c_read(GTP_REG_EXT_CONFIG,
                             &temp_data[GTP_CONFIG_ORG_LENGTH],
                             GTP_CONFIG_EXT_LENGTH);
         if(ret)
             GTP_ERROR("gt1x_i2c_read GTP_REG_EXT_CONFIG fail!");
     }
 
     /*
      * Hex dump of the chip-side config, same layout as above.
      * NOTE(review): temp_data is printed even when a read above failed,
      * in which case its contents were never filled in by this function.
      */
     for(i = 0; i < gt1x_cfg_length; i++) {
         ptr += sprintf(ptr, "0x%02X,", temp_data[i]);
         if(i % 10 == 9 && i !=  GTP_CONFIG_ORG_LENGTH)
             ptr += sprintf(ptr, "\n");
 
         if(i == GTP_CONFIG_ORG_LENGTH - 1)
             ptr += sprintf(ptr, "\n-------------\n");
     }
 
     ptr += sprintf(ptr, "\n");
     /* Touch PID & VID */
     ptr += sprintf(ptr, "==== GT1X Version Info ====\n");
 
     /* Reuses temp_data for the 12-byte version block; failure is only logged. */
     ret = gt1x_i2c_read(GTP_REG_VERSION, temp_data, 12);
     if(ret)
         GTP_ERROR("gt1x_i2c_read GTP_REG_VERSION fail!");
     ptr += sprintf(ptr, "ProductID: GT%c%c%c%c\n", temp_data[0],
                    temp_data[1], temp_data[2], temp_data[3]);
     ptr += sprintf(ptr, "PatchID: %02X%02X\n", temp_data[4], temp_data[5]);
     ptr += sprintf(ptr, "MaskID: %02X%02X\n", temp_data[7], temp_data[8]);
     ptr += sprintf(ptr, "SensorID: %02X\n", temp_data[10] & 0x0F);
 
     /* IRQ line is optional in the output: skipped when no descriptor is found. */
     irq_desc = irq_to_desc(gt1x_i2c_client->irq);
     if(irq_desc) {
         ptr += sprintf(ptr, "IRQ: %d, irq_desc->disable-depth:%d\n",
                        gt1x_i2c_client->irq, irq_desc->depth);
     }
     /* Advance the file position by the produced length so the next read returns 0. */
     *ppos += ptr - page;
     size = ptr - page;
     return size;
 }

第五步分析

进行fix code:

 diff --git a/drivers/input/touchscreen/gt1x_v1.6_generic/gt1x_generic.c b/drivers/input/touchscreen/gt1x_v1.6_generic/gt1x_generic.c
 index bd8bdae..669f287 100644
 --- a/drivers/input/touchscreen/gt1x_v1.6_generic/gt1x_generic.c
 +++ b/drivers/input/touchscreen/gt1x_v1.6_generic/gt1x_generic.c
 @@ -91,7 +91,8 @@ static void gt1x_deinit_debug_node(void)
  static ssize_t gt1x_debug_read_proc(struct file *file, char __user * page,
                                      size_t size, loff_t * ppos)
  {
 -    char *ptr = page;
 +    char *ptr;
 +    char *ptr_start;
      char temp_data[gt1x_cfg_length];
      struct irq_desc *irq_desc = NULL;
      int i;
 @@ -101,6 +102,15 @@ static ssize_t gt1x_debug_read_proc(struct file *file, char __user * page,
          return 0;
      }
  
 +    GTP_INFO("%s size=%d\n", __func__, size);
 +    ptr = kmalloc(size, GFP_KERNEL);
 +    if (!ptr) {
 +        GTP_ERROR("Fail to alloc memory");
 +        return -EFAULT;
 +    }
 +
 +    ptr_start = ptr;
 +
      ptr += sprintf(ptr, "==== GT1X default config setting in driver====\n");
  
      for(i = 0; i < gt1x_cfg_length; i++) {
 @@ -156,8 +166,16 @@ static ssize_t gt1x_debug_read_proc(struct file *file, char __user * page,
          ptr += sprintf(ptr, "IRQ: %d, irq_desc->disable-depth:%d\n",
                         gt1x_i2c_client->irq, irq_desc->depth);
      }
 -    *ppos += ptr - page;
 -    size = ptr - page;
 +
 +    size = ptr - ptr_start;
 +    if (copy_to_user(page, ptr_start, size)) {
 +        GTP_ERROR("copy_to_user failed");
 +        kfree(ptr_start);
 +        return -EFAULT;
 +    }
 +
 +    kfree(ptr_start);
 +    *ppos += size;
      return size;
  }

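备注:从上面的修复可以看出,page是用户空间指针,内核只能通过copy_to_user这类uaccess接口访问它;copy_to_user会检查该地址是否合法,拷贝失败时返回非0,因此必须检查其返回值,否则page非法时还可能引出更多的问题。另外,上面的补丁仍有可以改进的地方:kmalloc失败时应返回-ENOMEM而不是-EFAULT;sprintf没有边界检查,当输出总长度超过size时会写越界,建议改用scnprintf并限制长度;打印size_t类型的size应使用%zu而不是%d。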

REF:
https://www.jianshu.com/p/05a27d37ea3b

你可能感兴趣的:(死机问题)