限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。
本文基于 ARM32 + Linux 4.14
内核源码进行分析。
Linux SysRq 是内核提供的一种机制:允许通过特殊按键和 procfs 文件节点向系统发起一些特殊请求,用于查看系统状态和调试。
使用 SysRq
功能,需要开启内核配置项 CONFIG_MAGIC_SYSRQ
。来看一下 SysRq
的初始化:
/* drivers/tty/sysrq.c */
...
/*
 * Input handler used for key-triggered SysRq requests: it is matched
 * against input devices through sysrq_ids and examines their events in
 * the sysrq_filter() callback.
 */
static struct input_handler sysrq_handler = {
	.name		= "sysrq",
	.id_table	= sysrq_ids,
	.connect	= sysrq_connect,
	.disconnect	= sysrq_disconnect,
	.filter		= sysrq_filter,
};
static bool sysrq_handler_registered;
static inline void sysrq_register_handler(void)
{
int error;
/* 注册 以 【特殊按键】 方式发起 SysRq 的按键处理接口 */
error = input_register_handler(&sysrq_handler);
if (error)
pr_err("Failed to register input handler, error %d", error);
else
sysrq_handler_registered = true;
}
...
/*
 * File operations for the SysRq procfs node: only writing is implemented
 * (write_sysrq_trigger); seeking is a no-op.
 */
static const struct file_operations proc_sysrq_trigger_operations = {
	.llseek	= noop_llseek,
	.write	= write_sysrq_trigger,
};
/*
 * Create the /proc/sysrq-trigger node (owner write-only) backed by
 * proc_sysrq_trigger_operations. A failure is only logged.
 */
static void sysrq_init_procfs(void)
{
	if (proc_create("sysrq-trigger", S_IWUSR, NULL,
			&proc_sysrq_trigger_operations))
		return;

	pr_err("Failed to register proc interface\n");
}
/*
 * SysRq initialisation, run as a device initcall.
 *
 * The procfs trigger node is always created; the key-combination input
 * handler is registered only when sysrq_on() reports SysRq as enabled.
 * Always returns 0.
 */
static int __init sysrq_init(void)
{
	sysrq_init_procfs();

	if (!sysrq_on())
		return 0;

	sysrq_register_handler();
	return 0;
}
device_initcall(sysrq_init);
本小节讨论以 procfs 文件节点
发起 SysRq
请求的方式,它们都是以向 /proc/sysrq-trigger
写入预定义字符发起:
# echo X > /proc/sysrq-trigger
操作需要特权用户权限
。下面讲述几个常见的 SysRq
请求操作,并对它们的实现做简单分析。所有支持的 SysRq
请求列举在数据表格 sysrq_key_table[]
中:
/*
 * Dispatch table of SysRq operations, one slot per request character:
 * '0'-'9' first, then 'a'-'z' (36 slots). A NULL slot has no built-in
 * operation. Entries replaced by "..." are elided in this excerpt.
 */
static struct sysrq_key_op *sysrq_key_table[36] = {
/* Change the kernel log level */
&sysrq_loglevel_op, /* 0 */
&sysrq_loglevel_op, /* 1 */
&sysrq_loglevel_op, /* 2 */
&sysrq_loglevel_op, /* 3 */
&sysrq_loglevel_op, /* 4 */
&sysrq_loglevel_op, /* 5 */
&sysrq_loglevel_op, /* 6 */
&sysrq_loglevel_op, /* 7 */
&sysrq_loglevel_op, /* 8 */
&sysrq_loglevel_op, /* 9 */
...
/* Trigger a system reboot without syncing or unmounting disks */
&sysrq_reboot_op, /* b */
/* Trigger a kernel crash */
&sysrq_crash_op, /* c */
/* Show all held locks (requires CONFIG_LOCKDEP) */
&sysrq_showlocks_op, /* d */
/* Send SIGTERM to all processes except init */
&sysrq_term_op, /* e */
/* Trigger OOM reclaim */
&sysrq_moom_op, /* f */
...
/* Send SIGKILL to all processes except init */
&sysrq_kill_op, /* i */
/* Thaw (unfreeze) filesystems */
#ifdef CONFIG_BLOCK
&sysrq_thaw_op, /* j */
#else
NULL, /* j */
#endif
&sysrq_SAK_op, /* k */
/* Show stack backtraces of all active CPUs */
#ifdef CONFIG_SMP
&sysrq_showallcpus_op, /* l */
#else
NULL, /* l */
#endif
/* Show system memory information, similar to cat /proc/meminfo */
&sysrq_showmem_op, /* m */
/* Turn all real-time tasks into normal tasks */
&sysrq_unrt_op, /* n */
/* o: This will often be registered as 'Off' at init time */
NULL, /* o */
/* Show the current CPU registers and flags */
&sysrq_showregs_op, /* p */
/* Show the high-resolution timers on every CPU */
&sysrq_show_timers_op, /* q */
/* Turn off keyboard RAW mode */
&sysrq_unraw_op, /* r */
/* Try to sync all mounted filesystems */
&sysrq_sync_op, /* s */
/* Dump the list of all current tasks and their information */
&sysrq_showstate_op, /* t */
/* Try to remount mounted filesystems read-only */
&sysrq_mountro_op, /* u */
/* v: May be registered for frame buffer console restore */
NULL, /* v */
/* Dump tasks in uninterruptible (blocked) state */
&sysrq_showstate_blocked_op, /* w */
/* x: May be registered on mips for TLB dump */
/* x: May be registered on ppc/powerpc for xmon */
/* x: May be registered on sparc64 for global PMU dump */
NULL, /* x */
/* y: May be registered on sparc64 for global register dump */
NULL, /* y */
/* Dump the ftrace buffer */
&sysrq_ftrace_dump_op, /* z */
};
对 /proc/sysrq-trigger
写入触发调用序列:
write()
...
write_sysrq_trigger()
char c;
get_user(c, buf);
__handle_sysrq(c, false)
/*
 * Look up the SysRq operation registered for @key and run its handler.
 *
 * @key:        request character (the byte written to /proc/sysrq-trigger,
 *              or the character translated from a key code on the keyboard
 *              path).
 * @check_mask: when true, the operation runs only if sysrq_on_mask()
 *              accepts its enable_mask; the /proc/sysrq-trigger path
 *              passes false and therefore bypasses that check.
 *
 * The branches replaced by "..." are elided in this excerpt.
 */
void __handle_sysrq(int key, bool check_mask)
{
struct sysrq_key_op *op_p;
int orig_log_level;
int i;
rcu_sysrq_start();
rcu_read_lock();
/*
* Raise the apparent loglevel to maximum so that the sysrq header
* is shown to provide the user with positive feedback. We do not
* simply emit this at KERN_EMERG as that would change message
* routing in the consumers of /proc/kmsg.
*/
orig_log_level = console_loglevel;
console_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
pr_info("SysRq : ");
op_p = __sysrq_get_key_op(key); /* look up the SysRq operation for key in sysrq_key_table[] */
if (op_p) {
/*
* Should we check for enabled operations (/proc/sysrq-trigger
* should not) and is the invoked operation enabled?
*/
if (!check_mask || sysrq_on_mask(op_p->enable_mask)) {
pr_cont("%s\n", op_p->action_msg);
/* Restore the caller's loglevel before the handler runs. */
console_loglevel = orig_log_level;
op_p->handler(key); /* invoke the SysRq operation: sysrq_handle_crash(), ... */
} else {
...
}
} else {
...
}
rcu_read_unlock();
rcu_sysrq_end();
}
# echo 3 > /proc/sysrq-trigger
[ 6956.852664] sysrq: SysRq : Changing Loglevel
[ 6956.856987] sysrq: Loglevel set to 3
write()
...
write_sysrq_trigger()
sysrq_handle_loglevel()
/* drivers/tty/sysrq.c */
/*
 * SysRq handler for the digit keys: sets console_loglevel to the numeric
 * value of @key ('0' -> 0 ... '9' -> 9).
 */
static void sysrq_handle_loglevel(int key)
{
	const int level = key - '0';

	/*
	 * Print the confirmation at the default console loglevel first
	 * (presumably so that it is not filtered out), then apply the
	 * requested level.
	 */
	console_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
	pr_info("Loglevel set to %d\n", level);
	console_loglevel = level;
}
# echo c > /proc/sysrq-trigger
[ 856.968802] sysrq: SysRq : Trigger a crash
[ 856.973059] Unable to handle kernel NULL pointer dereference at virtual address 00000000
[ 856.981194] pgd = 6e7ca3d4
[ 856.984471] [00000000] *pgd=8a84c831, *pte=00000000, *ppte=00000000
[ 856.990789] Internal error: Oops: 817 [#1] PREEMPT ARM
[ 856.995946] Modules linked in:
[ 856.999020] CPU: 0 PID: 123 Comm: sh Not tainted 4.19.94-g1194fe2-dirty #102
[ 857.006095] Hardware name: Generic AM33XX (Flattened Device Tree)
[ 857.012236] PC is at sysrq_handle_crash+0x2c/0x34
[ 857.016958] LR is at sysrq_handle_crash+0x28/0x34
[ 857.021678] pc : [<c04fd9bc>] lr : [<c04fd9b8>] psr: 60080013
[ 857.027969] sp : ca8bbe38 ip : ca8bbe38 fp : ca8bbe4c
[ 857.033212] r10: 00000004 r9 : ca8bbf60 r8 : c0e2c544
[ 857.038455] r7 : 00000000 r6 : 00000063 r5 : 00000007 r4 : 00000001
[ 857.045007] r3 : 00000000 r2 : 00000000 r1 : 00000000 r0 : c0e14618
[ 857.051561] Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none
[ 857.058723] Control: 10c5387d Table: 8a868019 DAC: 00000051
[ 857.064493] Process sh (pid: 123, stack limit = 0xce84bd3c)
[ 857.070087] Stack: (0xca8bbe38 to 0xca8bc000)
[ 857.074462] be20: c0e0ff48 00000007
[ 857.082676] be40: ca8bbe7c ca8bbe50 c04fdf6c c04fd99c 00000055 00000002 c04fe474 00000000
[ 857.090890] be60: ca8bbf60 00000002 ca8bbf60 00000004 ca8bbe94 ca8bbe80 c04fe4d0 c04fdec8
[ 857.099104] be80: cf26a980 c04fe474 ca8bbeb4 ca8bbe98 c029e968 c04fe480 c0e03048 ca87e240
[ 857.107317] bea0: c029e908 ca8bbf60 ca8bbf2c ca8bbeb8 c02371f8 c029e914 00000000 00000000
[ 857.115532] bec0: fffffff6 c0e03048 ca8bbf34 ca8bbed8 c012e2ac c01467e4 c0112fdc 00000004
[ 857.123745] bee0: 00000007 00000000 00000000 00000000 00000000 00000000 ca873000 c012bc68
[ 857.131960] bf00: 00000100 42749327 00000002 ca87e240 000c4b38 ca8bbf60 00000002 ca8ba000
[ 857.140174] bf20: ca8bbf5c ca8bbf30 c02374b4 c02371c4 c012e42c c012e228 ca8bbf5c ca87e240
[ 857.148388] bf40: c0e03048 ca87e240 000c4b38 00000002 ca8bbf94 ca8bbf60 c023773c c0237414
[ 857.156602] bf60: 00000000 00000000 ca8bbf94 42749327 c0257618 000c2d5c 00000001 000c4b38
[ 857.164816] bf80: 00000004 c0101204 ca8bbfa4 ca8bbf98 c02377c0 c02376dc 00000000 ca8bbfa8
[ 857.173030] bfa0: c0101000 c02377bc 000c2d5c 00000001 00000001 000c4b38 00000002 00000000
[ 857.181243] bfc0: 000c2d5c 00000001 000c4b38 00000004 00000001 00000020 00000000 00091144
[ 857.189458] bfe0: 00000000 bec9a4bc 0001a908 b6f53556 60080030 00000001 00000000 00000000
[ 857.197664] Backtrace:
[ 857.200127] [<c04fd990>] (sysrq_handle_crash) from [<c04fdf6c>] (__handle_sysrq+0xb0/0x180)
[ 857.208513] r5:00000007 r4:c0e0ff48
[ 857.212106] [<c04fdebc>] (__handle_sysrq) from [<c04fe4d0>] (write_sysrq_trigger+0x5c/0x6c)
[ 857.220494] r10:00000004 r9:ca8bbf60 r8:00000002 r7:ca8bbf60 r6:00000000 r5:c04fe474
[ 857.228354] r4:00000002 r3:00000055
[ 857.231948] [<c04fe474>] (write_sysrq_trigger) from [<c029e968>] (proc_reg_write+0x60/0x90)
[ 857.240332] r5:c04fe474 r4:cf26a980
[ 857.243927] [<c029e908>] (proc_reg_write) from [<c02371f8>] (__vfs_write+0x40/0x164)
[ 857.251703] r7:ca8bbf60 r6:c029e908 r5:ca87e240 r4:c0e03048
[ 857.257387] [<c02371b8>] (__vfs_write) from [<c02374b4>] (vfs_write+0xac/0x188)
[ 857.264729] r9:ca8ba000 r8:00000002 r7:ca8bbf60 r6:000c4b38 r5:ca87e240 r4:00000002
[ 857.272506] [<c0237408>] (vfs_write) from [<c023773c>] (ksys_write+0x6c/0xe0)
[ 857.279672] r8:00000002 r7:000c4b38 r6:ca87e240 r5:c0e03048 r4:ca87e240
[ 857.286402] [<c02376d0>] (ksys_write) from [<c02377c0>] (sys_write+0x10/0x14)
[ 857.293568] r8:c0101204 r7:00000004 r6:000c4b38 r5:00000001 r4:000c2d5c
[ 857.300300] [<c02377b0>] (sys_write) from [<c0101000>] (ret_fast_syscall+0x0/0x54)
[ 857.307899] Exception stack(0xca8bbfa8 to 0xca8bbff0)
[ 857.312971] bfa0: 000c2d5c 00000001 00000001 000c4b38 00000002 00000000
[ 857.321185] bfc0: 000c2d5c 00000001 000c4b38 00000004 00000001 00000020 00000000 00091144
[ 857.329397] bfe0: 00000000 bec9a4bc 0001a908 b6f53556
[ 857.334472] Code: e5834000 f57ff04e ebf05e88 e3a03000 (e5c34000)
[ 857.343688] ---[ end trace 0caa0a25d6458889 ]---
[ 857.348334] Kernel panic - not syncing: Fatal exception
[ 857.353587] ---[ end Kernel panic - not syncing: Fatal exception ]---
write()
...
write_sysrq_trigger()
sysrq_handle_crash()
/* drivers/tty/sysrq.c */
/*
 * SysRq handler for 'c': deliberately crash the kernel by writing through
 * a NULL pointer after forcing oopses to escalate to a panic.
 * @key is not used.
 */
static void sysrq_handle_crash(int key)
{
char *killer = NULL;
/* we need to release the RCU read lock here,
* otherwise we get an annoying
* 'BUG: sleeping function called from invalid context'
* complaint from the kernel before the panic.
*/
rcu_read_unlock();
/* Make the oops triggered below escalate to a kernel panic */
panic_on_oops = 1; /* force panic */
/* Write barrier placed between setting panic_on_oops and the faulting store */
wmb();
*killer = 1; /* write through the NULL pointer to raise a page-table (translation) fault */
}
/*
 * SysRq operation descriptor for the 'c' key: runs sysrq_handle_crash()
 * and is gated by the SYSRQ_ENABLE_DUMP mask bit.
 */
static struct sysrq_key_op sysrq_crash_op = {
	.enable_mask	= SYSRQ_ENABLE_DUMP,
	.help_msg	= "crash(c)",
	.action_msg	= "Trigger a crash",
	.handler	= sysrq_handle_crash,
};
/* Excerpt of the dispatch table: the 'c' slot points at sysrq_crash_op. */
static struct sysrq_key_op *sysrq_key_table[36] = {
...
&sysrq_crash_op, /* c */
...
};
假定使用 ARM32 3 级分页:
/* arch/arm/mm/fsr-3level.c */
/*
 * Excerpt of the fault-status table: level 1 and level 2 translation
 * faults are routed to do_translation_fault(), level 3 translation faults
 * to do_page_fault(); all three are reported as SIGSEGV / SEGV_MAPERR.
 */
static struct fsr_info fsr_info[] = {
...
{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" },
{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" },
{ do_page_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" },
...
};
do_translation_fault() / do_page_fault()
...
__do_kernel_fault(mm, addr, fsr, regs)
/* arch/arm/mm/fault.c */
/*
 * Report a fault on a kernel-mode access that could not be handled: print
 * the faulting address and its page-table entries, then raise an Oops via
 * die().
 *
 * This is an annotated sketch: the lines between die() and
 * bust_spinlocks(0) outline the path taken inside die(), and "..." marks
 * elided code.
 */
static void
__do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
struct pt_regs *regs)
{
...
bust_spinlocks(1);
pr_alert("Unable to handle kernel %s at virtual address %08lx\n",
(addr < PAGE_SIZE) ? "NULL pointer dereference" :
"paging request", addr);
show_pte(mm, addr); /* print the pgd, pmd, ... pte entries for @addr */
die("Oops", regs, fsr); /* arch/arm/kernel/traps.c */
...
oops_end(flags, regs, sig)
...
if (panic_on_oops) /* panic_on_oops turns the oops into a kernel panic */
panic("Fatal exception");
...
bust_spinlocks(0);
do_exit(SIGKILL);
}
有时候程序卡住了(如死锁),可能想知道当前的调用链,可以通过 c
触发 panic
导出堆栈记录。如果启用了 kexec + kdump
,还会进一步进入内核转储过程,后续可用 crash
工具对转储内核做进一步的分析。
感兴趣的读者可自行分析。
发起 SysRq
请求的具体按键,各个硬件平台各有不同,读者可参考文章末尾资料,或查阅相关资料了解。本小节对通过 特殊按键
的方式发起 SysRq
请求的过程做简要分析,如下:
/* drivers/input/input.c */
/*
 * Call-flow sketch (not compilable code): how a reported key event reaches
 * the handlers attached to an input device. The body that follows the
 * input_to_handler() call is that function's logic shown inline.
 */
input_report_key()
input_handle_event(dev, type, code, value)
input_pass_values(dev, dev->vals, dev->num_vals)
/* Pass the event data to every input_handler attached to the input device input_dev */
list_for_each_entry_rcu(handle, &dev->h_list, d_node)
if (handle->open) {
count = input_to_handler(handle, vals, count);
if (handler->filter) {
for (v = vals; v != vals + count; v++) {
if (handler->filter(handle, v->type, v->code, v->value)) /* sysrq_filter(), ... */
continue;
/* Keep events the filter did not consume, compacting them to the front of vals */
if (end != v)
*end = *v;
end++;
}
count = end - vals;
}
/* Every event was consumed by the filter (e.g. SysRq key events); nothing is left to process */
if (!count)
return 0;
/* Deliver the remaining events to the handler */
if (handler->events)
handler->events(handle, vals, count);
else if (handler->event)
for (v = vals; v != vals + count; v++)
handler->event(handle, v->type, v->code, v->value);
return count;
}
/* drivers/tty/sysrq.c */
sysrq_filter()
sysrq_handle_keypress(sysrq, code, value)
/*
 * Process one key event on the SysRq keyboard path (called from
 * sysrq_filter()).
 *
 * @sysrq: per-handle SysRq state
 * @code:  key code of the event
 * @value: key event value
 *
 * Alt and SysRq keys are handled by dedicated cases (elided here); any
 * other key, while the SysRq state is active, is translated through
 * sysrq_xlate[] and dispatched with the enable mask checked.
 */
static bool sysrq_handle_keypress(struct sysrq_state *sysrq,
unsigned int code, int value)
{
...
switch (code) {
case KEY_LEFTALT:
case KEY_RIGHTALT:
...
break;
case KEY_SYSRQ:
...
break;
default:
/* NOTE(review): value 2 is presumably key auto-repeat, which is ignored here — confirm */
if (sysrq->active && value && value != 2) {
sysrq->need_reinject = false;
__handle_sysrq(sysrq_xlate[code], true); /* handle the SysRq key event */
}
break;
}
...
}
更多按键处理的细节可参考博文:Linux输入子系统简析 。
https://www.kernel.org/doc/html/latest/translations/zh_CN/admin-guide/sysrq.html