kdump一般与crash工具联合使用,以便在故障发生的时候,进行问题追踪
配置kdump:
yum install kexec-tools
修改grub:
GRUB_CMDLINE_LINUX 中添加crashkernel=auto
#vim /etc/default/grub
GRUB_CMDLINE_LINUX="crashkernel=auto console=ttyS0 console=tty0 panic=5 net.ifnames=0 biosdevname=0"
#grub2-mkconfig -o /boot/grub2/grub.cfg
配置:
vim /etc/kdump.conf
path /var/crash #指定coredump文件放在/var/crash文件夹中
启动服务:
systemctl start kdump.service
systemctl enable kdump.service
重启主机(crashkernel参数需要重启后才会生效;重启后可用 systemctl status kdump.service 确认kdump服务已正常运行)
可进行手动触发验证:
#echo 1 > /proc/sys/kernel/sysrq
#echo c > /proc/sysrq-trigger
crash工具安装:
yum install crash
查看系统内核版本:
# uname -r
3.10.0-1062.9.1.el7.x86_64
下载对应的rpm包,进行安装:
下载地址: http://debuginfo.centos.org/7/x86_64/
下载如下两个内核版本对应的包:
kernel-debuginfo-common-x86_64
kernel-debuginfo-
例如(以下以3.10.0-327.el7为例,实际下载和安装时版本号必须与uname -r的输出完全一致,本文环境应为3.10.0-1062.9.1.el7.x86_64):
kernel-debuginfo-common-x86_64-3.10.0-327.el7.x86_64.rpm
kernel-debuginfo-3.10.0-327.el7.x86_64.rpm
# 安装
rpm -ivh kernel-debuginfo-common-x86_64-3.10.0-327.el7.x86_64.rpm
rpm -ivh kernel-debuginfo-3.10.0-327.el7.x86_64.rpm
# cd /var/crash
# ls -lrt
total 510856
drwxr-xr-x 2 root root 4096 Apr 14 12:46 127.0.0.1-2020-04-14-12:46:27
# cd 127.0.0.1-2020-04-14-12:46:27
# ls -l
total 170948
-rw------- 1 root root 174699120 Apr 14 12:46 vmcore
-rw-r--r-- 1 root root 341386 Apr 14 12:46 vmcore-dmesg.txt
查看
# more vmcore-dmesg.txt
[ 0.000000] Initializing cgroup subsys cpuset
[ 0.000000] Initializing cgroup subsys cpu
[ 0.000000] Initializing cgroup subsys cpuacct
[ 0.000000] Linux version 3.10.0-1062.9.1.el7.x86_64 (mockbuild@kbuilder.bsys.centos.org)
分析vmcore
# crash /usr/lib/debug/lib/modules/3.10.0-1062.9.1.el7.x86_64/vmlinux vmcore
crash 7.2.3-10.el7
Copyright (C) 2002-2017 Red Hat, Inc.
Copyright (C) 2004, 2005, 2006, 2010 IBM Corporation
Copyright (C) 1999-2006 Hewlett-Packard Co
Copyright (C) 2005, 2006, 2011, 2012 Fujitsu Limited
Copyright (C) 2006, 2007 VA Linux Systems Japan K.K.
Copyright (C) 2005, 2011 NEC Corporation
Copyright (C) 1999, 2002, 2007 Silicon Graphics, Inc.
Copyright (C) 1999, 2000, 2001, 2002 Mission Critical Linux, Inc.
This program is free software, covered by the GNU General Public License,
and you are welcome to change it and/or distribute copies of it under
certain conditions. Enter "help copying" to see the conditions.
This program has absolutely no warranty. Enter "help warranty" for details.
GNU gdb (GDB) 7.6
Copyright (C) 2013 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law. Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-unknown-linux-gnu"...
WARNING: kernel relocated [300MB]: patching 86886 gdb minimal_symbol values
please wait... (patching 86886 gdb minimal_symbol values)
KERNEL: /usr/lib/debug/lib/modules/3.10.0-1062.9.1.el7.x86_64/vmlinux
DUMPFILE: vmcore [PARTIAL DUMP]
CPUS: 8
DATE: Tue Apr 14 12:45:58 2020
UPTIME: 1 days, 14:32:12
LOAD AVERAGE: 2.62, 2.49, 2.46
TASKS: 460
NODENAME: VM_240_16_centos
RELEASE: 3.10.0-1062.9.1.el7.x86_64
VERSION: #1 SMP Fri Dec 6 15:49:49 UTC 2019
MACHINE: x86_64 (2394 Mhz)
MEMORY: 16 GB
PANIC: "Kernel panic - not syncing: softlockup: hung tasks"
PID: 21116
COMMAND: "kworker/u16:0"
TASK: ffff993ca497e2a0 [THREAD_INFO: ffff993d80bcc000]
CPU: 0
STATE: TASK_RUNNING (PANIC)
可以查看到基本信息:
panic原因:Kernel panic - not syncing: softlockup: hung tasks
哪个进程: PID: 21116
引起panic的命令: COMMAND: "kworker/u16:0"
在哪个cpu: CPU: 0
help查看帮助信息:
crash> help
* extend log rd task
alias files mach repeat timer
ascii foreach mod runq tree
bpf fuser mount search union
bt gdb net set vm
btop help p sig vtop
dev ipcs ps struct waitq
dis irq pte swap whatis
eval kmem ptob sym wr
exit list ptov sys q
crash version: 7.2.3-10.el7 gdb version: 7.6
For help on any command above, enter "help <command>".
For help on input options, enter "help input".
For help on output options, enter "help output".
bt查看堆栈信息:
crash> bt
PID: 21116 TASK: ffff993ca497e2a0 CPU: 0 COMMAND: "kworker/u16:0"
#0 [ffff993edfc03d38] machine_kexec at ffffffff93c65b24
#1 [ffff993edfc03d98] __crash_kexec at ffffffff93d22342
#2 [ffff993edfc03e68] panic at ffffffff94374972
#3 [ffff993edfc03ee8] watchdog_timer_fn at ffffffff93d4e731
#4 [ffff993edfc03f20] __hrtimer_run_queues at ffffffff93cca5ee
#5 [ffff993edfc03f78] hrtimer_interrupt at ffffffff93ccab4f
#6 [ffff993edfc03fc0] local_apic_timer_interrupt at ffffffff93c5c60b
#7 [ffff993edfc03fd8] smp_apic_timer_interrupt at ffffffff943929d3
#8 [ffff993edfc03ff0] apic_timer_interrupt at ffffffff9438eefa
--- <IRQ stack> ---
#9 [ffff993d80bcf618] apic_timer_interrupt at ffffffff9438eefa
[exception RIP: generic_exec_single+258]
RIP: ffffffff93d163c2 RSP: ffff993d80bcf6c0 RFLAGS: 00000202
RAX: 0000000000000008 RBX: ffff993d80bcf690 RCX: 0000000000000038
RDX: 00000000000000ff RSI: 0000000000000008 RDI: 0000000000000286
RBP: ffff993d80bcf708 R8: ffffffff944137c0 R9: 00000002b921e000
R10: ffff993edfc24d70 R11: ffffe23ed0314100 R12: 0000000000000006
R13: ffff993d80bcf7e8 R14: 0000000000000005 R15: ffff993ca45db000
ORIG_RAX: ffffffffffffff10 CS: 0010 SS: 0018
#10 [ffff993d80bcf710] smp_call_function_single at ffffffff93d164df
#11 [ffff993d80bcf740] smp_call_function_many at ffffffff93d16a8b
#12 [ffff993d80bcf788] native_flush_tlb_others at ffffffff93c7e478
#13 [ffff993d80bcf7d8] flush_tlb_page at ffffffff93c7e614
#14 [ffff993d80bcf7f8] ptep_clear_flush at ffffffff93e059d8
#15 [ffff993d80bcf830] page_mkclean_one at ffffffff93dfef66
#16 [ffff993d80bcf878] rmap_walk at ffffffff93e00d93
可以查看到:
进程:
PID: 21116 TASK: ffff993ca497e2a0 CPU: 0 COMMAND: "kworker/u16:0"
#9 [ffff993d80bcf618] apic_timer_interrupt at ffffffff9438eefa
[exception RIP: generic_exec_single+258]
generic_exec_single+258
task查看进程的task_struct结构体详细信息:
crash> task ffff993ca497e2a0
PID: 21116 TASK: ffff993ca497e2a0 CPU: 0 COMMAND: "kworker/u16:0"
struct task_struct {
state = 0,
stack = 0xffff993d80bcc000,
usage = {
counter = 2
},
flags = 77635680,
ptrace = 0,
wake_entry = {
next = 0x0
},
on_cpu = 1,
last_wakee = 0xffff993af5c23150,
wakee_flips = 5,
wakee_flip_decay_ts = 4433399691,
wake_cpu = 0,
on_rq = 1,
prio = 120,
static_prio = 120,
normal_prio = 120,
rt_priority = 0,
sched_class = 0xffffffff9441e3c0,
se = {
load = {
weight = 1024,
inv_weight = 4194304
},
run_node = {
__rb_parent_color = 1,
rb_right = 0x0,
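dis进行反汇编(注意:dis的参数应为代码地址或内核符号,例如 dis -r ffffffff93d163c2 可反汇编到异常RIP处;下面的示例传入的是task_struct的地址,属于数据而非代码,因此会提示找不到对应的内核符号):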
crash> dis -r ffff993ca497e2a0
dis: WARNING: ffff993ca497e2a0: no associated kernel symbol found
0xffff993ca497e2a0: add %al,(%rax)
crash>
log查看内核日志(即崩溃时的dmesg缓冲区内容):
crash> log
[ 0.000000] Initializing cgroup subsys cpuset
[ 0.000000] Initializing cgroup subsys cpu
[ 0.000000] Initializing cgroup subsys cpuacct
[ 0.000000] Linux version 3.10.0-1062.9.1.el7.x86_64 (mockbuild@kbuilder.bsys.centos.org) (gcc version 4.8.5 20150623 (Red Hat 4.8.5-39) (GCC) ) #1 SMP Fri Dec 6 15:49:49 UTC 2019
.....
ps查看进程:
crash> ps
PID PPID CPU TASK ST %MEM VSZ RSS COMM
0 0 0 ffffffff94818480 RU 0.0 0 0 [swapper/0]
0 0 1 ffff993c3a3141c0 RU 0.0 0 0 [swapper/1]
> 0 0 2 ffff993c3a315230 RU 0.0 0 0 [swapper/2]
> 0 0 3 ffff993c3a3162a0 RU 0.0 0 0 [swapper/3]
0 0 4 ffff993c3a338000 RU 0.0 0 0 [swapper/4]
> 0 0 5 ffff993c3a339070 RU 0.0 0 0 [swapper/5]
> 0 0 6 ffff993c3a33a0e0 RU 0.0 0 0 [swapper/6]
> 0 0 7 ffff993c3a33b150 RU 0.0 0 0 [swapper/7]
查看进程树:
crash> ps -p 21116
PID: 0 TASK: ffffffff94818480 CPU: 0 COMMAND: "swapper/0"
PID: 2 TASK: ffff993c3a2c1070 CPU: 3 COMMAND: "kthreadd"
PID: 21116 TASK: ffff993ca497e2a0 CPU: 0 COMMAND: "kworker/u16:0"
查看进程子进程:
crash> ps -c 21116
PID: 21116 TASK: ffff993ca497e2a0 CPU: 0 COMMAND: "kworker/u16:0"
(no children)
crash> ps -c 2
PID: 2 TASK: ffff993c3a2c1070 CPU: 3 COMMAND: "kthreadd"
PID: 4 TASK: ffff993c3a2c3150 CPU: 0 COMMAND: "kworker/0:0H"
PID: 6 TASK: ffff993c3a2c5230 CPU: 0 COMMAND: "ksoftirqd/0"
PID: 7 TASK: ffff993c3a2c62a0 CPU: 0 COMMAND: "migration/0"
PID: 8 TASK: ffff993c3a310000 CPU: 4 COMMAND: "rcu_bh"
PID: 9 TASK: ffff993c3a311070 CPU: 2 COMMAND: "rcu_sched"
PID: 10 TASK: ffff993c3a3120e0 CPU: 0 COMMAND: "lru-add-drain"
PID: 11 TASK: ffff993c3a313150 CPU: 0 COMMAND: "watchdog/0"
PID: 12 TASK: ffff993c3a33c1c0 CPU: 1 COMMAND: "watchdog/1"
查看进程活动时间:
crash> ps -t 21116
PID: 21116 TASK: ffff993ca497e2a0 CPU: 0 COMMAND: "kworker/u16:0"
RUN TIME: 00:34:46
START TIME: 136646
UTIME: 0
STIME: 27582000000