死机、dump、panic、crash、tombstone 都是系统中常见的问题。
在 qcom 平台上,若希望死机时直接 reboot,可以先执行 echo 0 > download_mode(关闭 download 模式)。
引起 reboot 的原因有很多种:PMIC、OCP、瞬间掉电、panic、watchdog ……
下面分析死机的一个例子:
log
244.889835: <6> Internal error: Accessing user space memory outside uaccess.h routines: 9600004f [#1] PREEMPT SMP
244.901438: <6> Modules linked in: wlan(O) machine_dlkm(O) wsa881x_dlkm(O) wcd9360_dlkm(O) wcd934x_dlkm(O) mbhc_dlkm(O) swr_ctrl_dlkm(O) wcd9xxx_dlkm(O) fs16xx_dlkm(O) wcd_core_dlkm(O) stub_dlkm(O) wcd_spi_dlkm(O) hdmi_dlkm(O) swr_dlkm(O) pinctrl_wcd_dlkm(O) usf_dlkm(O) native_dlkm(O) platform_dlkm(O) q6_dlkm(O) adsp_loader_dlkm(O) apr_dlkm(O) q6_notifier_dlkm(O) q6_pdr_dlkm(O) wglink_dlkm(O) msm_11ad_proxy
244.939517: <6> CPU: 5 PID: 4008 Comm: cat Tainted: G S W O 4.14.83+ #1
244.948176: <6> Hardware name: Qualcomm Technologies, Inc. SM8150 V2 PM8150 MTP (DT)
244.957198: <6> task: ffffffc42507d800 task.stack: ffffff8044fc8000
244.964717: <2> pc : __memcpy+0x30/0x180
244.969828: <2> lr : vsnprintf+0x114/0x4d8
244.975101: <2> sp : ffffff8044fcba60 pstate : 00400145
244.981532: <2> x29: ffffff8044fcbad0 x28: 0000000000000020
244.988407: <2> x27: 000000000000002f x26: 00000000ffffffff
244.995286: <2> x25: ffffff8044fcbbb0 x24: ffffff9a98ea9942
245.002169: <2> x23: 000000621a4dc8b0 x22: ffffff9a98ea9971
245.009045: <2> x21: 000000629a4dc8af x20: ffffff8044fcbb98
245.016009: <2> x19: ffffff9a98a93778 x18: 0000000000000001
245.023120: <2> x17: 000000773a09a78c x16: ffffff9a97a5c4a4
245.030006: <2> x15: aaaaaaaaaaaaaaab x14: 0000000000000050
245.036892: <2> x13: 0000000000000000 x12: ffffff80ffffffd0
245.043775: <2> x11: 0000000000000000 x10: ffffff9a98a93930
245.050660: <2> x9 : ffffff9a988dfac0 x8 : 000000000000002f
245.057539: <2> x7 : 0000001dc76a24c1 x6 : 000000621a4dc8b0
245.064418: <2> x5 : 0000000000000000 x4 : 000000000000000e
245.071301: <2> x3 : 0000000000003d3d x2 : 0000000000000021
245.078182: <2> x1 : ffffff9a98ea9944 x0 : 000000621a4dc8b0
245.085064: <2>
PC: 0xffffff9a988d0270:
245.093008: <2> 0270 d65f03c0 00000000 00000000 00000000 aa0003e6 f100405f 540003c3 cb0103e4
245.102829: <2> 0290 f2400c84 540001c0 cb040042 36000064 38401423 380014c3 36080064 78402423
245.112647: <2> 02b0 780024c3 36100064 b8404423 b80044c3 36180064 f8408423 f80084c3 f101005f
245.122474: <2> 02d0 5400032a f27c0443 54000140 7100807f 54000080 540000ab a8c12027 a88120c7
245.132299: <2>
LR: 0xffffff9a988dfaa4:
245.140243: <2> faa4 f9400122 aa1703e0 aa1503e1 aa0803e3 9400037f aa0003f7 17ffffe0 eb1502ff
245.150061: <2> fac4 54000102 cb1702a8 eb1b011f aa1703e0 1a9bb108 aa1803e1 93407d02 97ffc1e8
245.159878: <2> fae4 8b1b02f7 17ffffd5 b9800328 36f800a8 11002109 7100013f b9000329 5400160d
245.169706: <2> fb04 f9400288 91002109 f9000289 b9400101 910043e0 940001f6 17ffffc8 b9800328
245.179529: <2>
SP: 0xffffff8044fcba20:
245.187482: <2> ba20 988d02b0 ffffff9a 00400145 00000000 98ea9971 ffffff9a 17b8f400 5549d54f
245.197302: <2> ba40 ffffffff 0000007f 98a93778 ffffff9a 44fcbad0 ffffff80 988d02b0 ffffff9a
245.207126: <2> ba60 7fffffff 00000000 1a4dc8b0 00000062 00000000 00000000 17b8f400 5549d54f
245.216950: <2> ba80 2507d800 ffffffc4 98901000 ffffff9a 998d3000 ffffff9a 00000124 00000000
245.226782: <2>
245.229750: <6> Process cat (pid: 4008, stack limit = 0xffffff8044fc8000)
245.237794: <2> Call trace:
245.241741: <2> __memcpy+0x30/0x180
245.246596: <2> sprintf+0x7c/0xa4
245.251165: <2> gt1x_debug_read_proc+0x74/0x368
245.256977: <2> proc_reg_read+0x90/0xc4
245.262084: <2> __vfs_read+0x54/0x144
245.267020: <2> vfs_read+0xa4/0x13c
245.271764: <2> SyS_read+0x60/0xc0
245.276420: <2> el0_svc_naked+0x34/0x38
245.281521: <6> Code: 38401423 380014c3 36080064 78402423 (780024c3)
245.289205: <6> ---[ end trace 8a0a0b8d7a66762a ]---
245.295386: <6> Kernel panic - not syncing: Fatal exception
第一步分析
根据上述 log 以及 Call trace,可以确定是执行到 __memcpy 函数偏移 0x30 的地方 crash。把 vmlinux 符号文件拷贝到 Android 源码路径 android/prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/bin/ 目录下,使用如下命令反汇编符号文件:
aarch64-linux-androidkernel-objdump -D -t ./vmlinux > vmlinux_objdump.txt
使用 UE 工具打开 vmlinux_objdump.txt,__memcpy 函数入口地址为 ffffff80090d0280,则 __memcpy+0x30 的地址为 ffffff80090d02b0。摘取 vmlinux_objdump.txt 中函数 __memcpy 的部分反汇编代码如下,可以看到死机时正在执行的指令为 strh w3, [x6],#2(把 w3 的低半字写入 x6 指向的地址,然后 x6 = x6 + 2),与 log 中 Code 行括号内的指令码 780024c3 一致:
ffffff80090d0280 <__memcpy>:
ffffff80090d0280: aa0003e6 mov x6, x0
ffffff80090d0284: f100405f cmp x2, #0x10
ffffff80090d0288: 540003c3 b.cc ffffff80090d0300 <__memcpy+0x80>
ffffff80090d028c: cb0103e4 neg x4, x1
ffffff80090d0290: f2400c84 ands x4, x4, #0xf
ffffff80090d0294: 540001c0 b.eq ffffff80090d02cc <__memcpy+0x4c>
ffffff80090d0298: cb040042 sub x2, x2, x4
ffffff80090d029c: 36000064 tbz w4, #0, ffffff80090d02a8 <__memcpy+0x28>
ffffff80090d02a0: 38401423 ldrb w3, [x1],#1
ffffff80090d02a4: 380014c3 strb w3, [x6],#1
ffffff80090d02a8: 36080064 tbz w4, #1, ffffff80090d02b4 <__memcpy+0x34>
ffffff80090d02ac: 78402423 ldrh w3, [x1],#2
ffffff80090d02b0: 780024c3 strh w3, [x6],#2
ffffff80090d02b4: 36100064 tbz w4, #2, ffffff80090d02c0 <__memcpy+0x40>
ffffff80090d02b8: b8404423 ldr w3, [x1],#4
ffffff80090d02bc: b80044c3 str w3, [x6],#4
ffffff80090d02c0: 36180064 tbz w4, #3, ffffff80090d02cc <__memcpy+0x4c>
ffffff80090d02c4: f8408423 ldr x3, [x1],#8
ffffff80090d02c8: f80084c3 str x3, [x6],#8
ffffff80090d02cc: f101005f cmp x2, #0x40
ffffff80090d02d0: 5400032a b.ge ffffff80090d0334 <__memcpy+0xb4>
ffffff80090d02d4: f27c0443 ands x3, x2, #0x30
ffffff80090d02d8: 54000140 b.eq ffffff80090d0300 <__memcpy+0x80>
ffffff80090d02dc: 7100807f cmp w3, #0x20
ffffff80090d02e0: 54000080 b.eq ffffff80090d02f0 <__memcpy+0x70>
ffffff80090d02e4: 540000ab b.lt ffffff80090d02f8 <__memcpy+0x78>
ffffff80090d02e8: a8c12027 ldp x7, x8, [x1],#16
ffffff80090d02ec: a88120c7 stp x7, x8, [x6],#16
ffffff80090d02f0: a8c12027 ldp x7, x8, [x1],#16
第二步分析
strh w3, [x6],#2 是往 X6 寄存器保存的地址写值。从 log 中的寄存器信息可以看到,X6 的值为 000000621a4dc8b0,这是一个用户空间地址,内核不能直接对它进行写操作。再看 __memcpy 入口处的第一条指令(mov x6, x0),X6 是从寄存器 X0 拷贝而来,也就是说传递给 __memcpy 的第一个参数(目的地址)是一个用户空间地址。
64 位系统(39 位虚拟地址)的虚拟地址映射:用户空间地址的位 63:39 全为 0,内核空间地址的相同位全为 1。
0x0000_0000_0000_0000 – 0x0000_007F_FFFF_FFFF:用户空间
0xFFFF_FF80_0000_0000 – 0xFFFF_FFFF_FFFF_FFFF:内核空间
在 32 位 ARM 汇编中,R0~R3 用来传递第一至第四个参数,超出的参数通过堆栈来传递;R0 寄存器同时用来存放函数的返回值,被调用的函数在返回前无须恢复这些寄存器的值。在 ARM64(AArch64)中与之对应的是用 X0~X7 传递前 8 个参数,X0 存放返回值(R0 对等 X0)。
第三步分析
根据 Call trace 我们很容易追踪到非法地址的引入源头 gt1x_debug_read_proc+0x74。注意偏移 0x74 对应的地址为 ffffff80089b4560,但需要减 4 个字节得到 ffffff80089b455c:Call trace 中记录的是返回地址(即 bl 指令的下一条指令),而 ARM64 每条指令固定为 4 字节,所以真正发起调用的 bl 指令位于返回地址减 4 处。使用命令 aarch64-linux-android-addr2line -e vmlinux -Cf ffffff80089b455c 可以转换出具体的代码行,如下可以直接查到第 104 行代码 ptr += sprintf(ptr, "==== GT1X default config setting in driver====\n")
/android/prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/bin$ ./aarch64-linux-android-addr2line -e vmlinux -Cf ffffff80089b455c
gt1x_debug_read_proc
/android/kernel/msm-4.14/drivers/input/touchscreen/gt1x_v1.6_generic/gt1x_generic.c:104
ffffff80089b44ec <gt1x_debug_read_proc>:
ffffff80089b44ec: a9ba6ffc stp x28, x27, [sp,#-96]!
ffffff80089b44f0: a90167fa stp x26, x25, [sp,#16]
ffffff80089b44f4: a9025ff8 stp x24, x23, [sp,#32]
ffffff80089b44f8: a90357f6 stp x22, x21, [sp,#48]
ffffff80089b44fc: a9044ff4 stp x20, x19, [sp,#64]
ffffff80089b4500: a9057bfd stp x29, x30, [sp,#80]
ffffff80089b4504: 910143fd add x29, sp, #0x50
ffffff80089b4508: d10043ff sub sp, sp, #0x10
ffffff80089b450c: d000afe8 adrp x8, ffffff8009fb2000
ffffff80089b4510: aa0303f3 mov x19, x3
ffffff80089b4514: aa0103f6 mov x22, x1
ffffff80089b4518: f941c508 ldr x8, [x8,#904]
ffffff80089b451c: f81a83a8 stur x8, [x29,#-88]
ffffff80089b4520: 97db8345 bl ffffff8008095234 <_mcount>
ffffff80089b4524: f000b8fa adrp x26, ffffff800a0d3000
ffffff80089b4528: 910003e9 mov x9, sp
ffffff80089b452c: b941f748 ldr w8, [x26,#500]
ffffff80089b4530: 91003d08 add x8, x8, #0xf
ffffff80089b4534: 927c7108 and x8, x8, #0x1fffffff0
ffffff80089b4538: cb080135 sub x21, x9, x8
ffffff80089b453c: 910002bf mov sp, x21
ffffff80089b4540: f9400268 ldr x8, [x19]
ffffff80089b4544: b4000068 cbz x8, ffffff80089b4550
ffffff80089b4548: aa1f03e0 mov x0, xzr
ffffff80089b454c: 140000b4 b ffffff80089b481c
ffffff80089b4550: b00067a1 adrp x1, ffffff80096a9000
ffffff80089b4554: 91250821 add x1, x1, #0x942
ffffff80089b4558: aa1603e0 mov x0, x22
ffffff80089b455c: 941cb212 bl ffffff80090e0da4 <sprintf>
ffffff80089b4560: d0010ed4 adrp x20, ffffff800ab8e000
第四步分析
这个死机问题其实比较简单,不需要前面的分析,也能判断大概的死机原因。根据 log 中的 Internal error: Accessing user space memory outside uaccess.h routines: 9600004f [#1] PREEMPT SMP,其实就可以大概猜测到传递给 __memcpy 函数的地址可能是非法的用户空间地址,只需跟踪 Call trace,找到非法参数引入的地方。查看 gt1x_debug_read_proc 代码,发现 ptr += sprintf(ptr, "==== GT1X default config setting in driver====\n") 中的参数 ptr 被赋值为用户空间地址 page,内核直接通过 sprintf 向它写数据,故引起死机。
/*
 * gt1x_debug_read_proc - read handler that dumps GT1X debug info as text.
 * @file: file being read; not referenced in this function.
 * @page: destination buffer supplied by the reader; it is a __user pointer.
 * @size: requested length; never compared with the amount of text produced.
 * @ppos: file offset; a non-zero value makes the function return 0.
 *
 * Formats, in order: the driver's default config (gt1x_config), the config
 * read back from the chip, the version registers and, when an irq_desc is
 * found, the IRQ number and its depth.
 *
 * Returns the number of bytes formatted, or 0 when *ppos is non-zero.
 *
 * NOTE(review): this is the pre-fix version of the function. Every sprintf()
 * below writes directly through the __user pointer, which is what triggers
 * the "Accessing user space memory outside uaccess.h routines" panic in the
 * log. The text has to be built in kernel memory and handed to user space
 * with copy_to_user().
 */
static ssize_t gt1x_debug_read_proc(struct file *file, char __user * page,
size_t size, loff_t * ppos)
{
/* BUG: ptr aliases the user-space buffer; it is later used as a sprintf() destination. */
char *ptr = page;
/* Variable-length array on the stack, sized by gt1x_cfg_length. */
char temp_data[gt1x_cfg_length];
struct irq_desc *irq_desc = NULL;
int i;
int ret;
/* A non-zero offset is treated as "nothing more to read". */
if(*ppos) {
return 0;
}
/* First sprintf() into the user pointer; this is line 104 resolved by addr2line. */
ptr += sprintf(ptr, "==== GT1X default config setting in driver====\n");
/* Dump the default config as hex bytes; extra newline/separator logic keyed on GTP_CONFIG_ORG_LENGTH. */
for(i = 0; i < gt1x_cfg_length; i++) {
ptr += sprintf(ptr, "0x%02X,", gt1x_config[i]);
if(i % 10 == 9 && i != GTP_CONFIG_ORG_LENGTH)
ptr += sprintf(ptr, "\n");
if(i == GTP_CONFIG_ORG_LENGTH - 1)
ptr += sprintf(ptr, "\n-------------\n");
}
ptr += sprintf(ptr, "\n");
ptr += sprintf(ptr, "==== GT1X config read from chip====\n");
/* Read the first GTP_CONFIG_ORG_LENGTH bytes of config; non-zero ret is treated as failure. */
ret = gt1x_i2c_read(GTP_REG_CONFIG_DATA, temp_data,
GTP_CONFIG_ORG_LENGTH);
if(ret)
GTP_ERROR("gt1x_i2c_read GTP_REG_CONFIG_DATA fail!");
/* Read the extended part only if the first read succeeded and the config length includes it. */
if(ret == 0 && gt1x_cfg_length == GTP_CONFIG_ORG_LENGTH +
GTP_CONFIG_EXT_LENGTH) {
ret = gt1x_i2c_read(GTP_REG_EXT_CONFIG,
&temp_data[GTP_CONFIG_ORG_LENGTH],
GTP_CONFIG_EXT_LENGTH);
if(ret)
GTP_ERROR("gt1x_i2c_read GTP_REG_EXT_CONFIG fail!");
}
/* temp_data is dumped even when a read above failed; it is never initialised in this function. */
for(i = 0; i < gt1x_cfg_length; i++) {
ptr += sprintf(ptr, "0x%02X,", temp_data[i]);
if(i % 10 == 9 && i != GTP_CONFIG_ORG_LENGTH)
ptr += sprintf(ptr, "\n");
if(i == GTP_CONFIG_ORG_LENGTH - 1)
ptr += sprintf(ptr, "\n-------------\n");
}
ptr += sprintf(ptr, "\n");
/* Touch PID & VID */
ptr += sprintf(ptr, "==== GT1X Version Info ====\n");
/* Reuses temp_data for the 12 version bytes; a failed read is logged but the bytes are still printed. */
ret = gt1x_i2c_read(GTP_REG_VERSION, temp_data, 12);
if(ret)
GTP_ERROR("gt1x_i2c_read GTP_REG_VERSION fail!");
ptr += sprintf(ptr, "ProductID: GT%c%c%c%c\n", temp_data[0],
temp_data[1], temp_data[2], temp_data[3]);
ptr += sprintf(ptr, "PatchID: %02X%02X\n", temp_data[4], temp_data[5]);
ptr += sprintf(ptr, "MaskID: %02X%02X\n", temp_data[7], temp_data[8]);
ptr += sprintf(ptr, "SensorID: %02X\n", temp_data[10] & 0x0F);
irq_desc = irq_to_desc(gt1x_i2c_client->irq);
if(irq_desc) {
ptr += sprintf(ptr, "IRQ: %d, irq_desc->disable-depth:%d\n",
gt1x_i2c_client->irq, irq_desc->depth);
}
/* Advance the offset by the bytes written so the next call sees *ppos != 0 and returns 0. */
*ppos += ptr - page;
/* The caller-supplied size is overwritten with the produced length and returned. */
size = ptr - page;
return size;
}
第五步分析
进行fix code:
diff --git a/drivers/input/touchscreen/gt1x_v1.6_generic/gt1x_generic.c b/drivers/input/touchscreen/gt1x_v1.6_generic/gt1x_generic.c
index bd8bdae..669f287 100644
--- a/drivers/input/touchscreen/gt1x_v1.6_generic/gt1x_generic.c
+++ b/drivers/input/touchscreen/gt1x_v1.6_generic/gt1x_generic.c
@@ -91,7 +91,8 @@ static void gt1x_deinit_debug_node(void)
static ssize_t gt1x_debug_read_proc(struct file *file, char __user * page,
size_t size, loff_t * ppos)
{
- char *ptr = page;
+ char *ptr;
+ char *ptr_start;
char temp_data[gt1x_cfg_length];
struct irq_desc *irq_desc = NULL;
int i;
@@ -101,6 +102,15 @@ static ssize_t gt1x_debug_read_proc(struct file *file, char __user * page,
return 0;
}
+ GTP_INFO("%s size=%d\n", __func__, size);
+ ptr = kmalloc(size, GFP_KERNEL);
+ if (!ptr) {
+ GTP_ERROR("Fail to alloc memory");
+ return -EFAULT;
+ }
+
+ ptr_start = ptr;
+
ptr += sprintf(ptr, "==== GT1X default config setting in driver====\n");
for(i = 0; i < gt1x_cfg_length; i++) {
@@ -156,8 +166,16 @@ static ssize_t gt1x_debug_read_proc(struct file *file, char __user * page,
ptr += sprintf(ptr, "IRQ: %d, irq_desc->disable-depth:%d\n",
gt1x_i2c_client->irq, irq_desc->depth);
}
- *ppos += ptr - page;
- size = ptr - page;
+
+ size = ptr - ptr_start;
+ if (copy_to_user(page, ptr_start, size)) {
+ GTP_ERROR("copy_to_user failed");
+ kfree(ptr_start);
+ return -EFAULT;
+ }
+
+ kfree(ptr_start);
+ *ppos += size;
return size;
}
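备注:修复后 page 只通过 copy_to_user 访问,因此 page 是否有效仍然很重要。copy_to_user 会检查用户地址是否合法,地址非法时返回未能拷贝的字节数(非 0),上面的补丁据此返回 -EFAULT,而不会再触发 panic。另外需要注意,这个补丁本身还有可以改进的地方:kmalloc 的大小直接取自用户传入的 size,而后续的 sprintf 并不检查缓冲区边界,当 size 小于实际输出长度时仍可能越界写(可改用 scnprintf 并限制长度);内存分配失败时更合适的返回值是 -ENOMEM;GTP_INFO 中打印 size_t 类型的 size 应使用 %zu。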
REF:
https://www.jianshu.com/p/05a27d37ea3b