Linux SysRq 简介

文章目录

  • 1. 前言
  • 2. 背景
  • 3. Linux SysRq
    • 3.1 SysRq 简介
      • 3.1.1 SysRq 初始化
    • 3.2 通过 procfs 发起 SysRq 请求
      • 3.2.1 修改内核日志等级
        • 3.2.1.1 触发
        • 3.2.1.2 实现简析
      • 3.2.2 手动触发内核 panic
        • 3.2.2.1 触发
        • 3.2.2.2 实现简析
        • 3.2.2.3 应用场景
      • 3.2.3 其它 SysRq 请求
    • 3.3 通过 特殊按键 发起 SysRq 请求
  • 4. 参考资料

1. 前言

限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。

2. 背景

本文基于 ARM32 + Linux 4.14 内核源码进行分析(3.2.2.1 小节中的示例日志采集自 4.19.94 内核)。

3. Linux SysRq

3.1 SysRq 简介

Linux SysRq,是内核提供的一种机制:允许通过 特殊按键 或 procfs 文件节点 向系统发起一些特殊请求,用于查看系统状态 和 调试。

3.1.1 SysRq 初始化

使用 SysRq 功能,需要开启内核配置项 CONFIG_MAGIC_SYSRQ 。来看一下 SysRq 的初始化:

/* drivers/tty/sysrq.c */

...

/*
 * Input handler used for the key-press way of issuing SysRq requests.
 * It is the object passed to input_register_handler() by
 * sysrq_register_handler(); its .filter callback is sysrq_filter().
 */
static struct input_handler sysrq_handler = {
	.filter  = sysrq_filter,
	.connect = sysrq_connect,
	.disconnect = sysrq_disconnect,
	.name  = "sysrq",
	.id_table = sysrq_ids,
};

/* Set to true once sysrq_handler has been registered successfully. */
static bool sysrq_handler_registered;

/*
 * Register sysrq_handler with the input layer.
 * On failure the error code is logged and sysrq_handler_registered is
 * left untouched; on success it is set to true.
 */
static inline void sysrq_register_handler(void)
{
	int error;

	/* Register the key handler used to issue SysRq requests via the special key combination */
	error = input_register_handler(&sysrq_handler);
	if (error)
		pr_err("Failed to register input handler, error %d", error);
	else
		sysrq_handler_registered = true;
}

...

/*
 * File operations installed on the /proc/sysrq-trigger node by
 * sysrq_init_procfs(): writes are handled by write_sysrq_trigger(),
 * seeks by noop_llseek. No other operation is set here.
 */
static const struct file_operations proc_sysrq_trigger_operations = {
	.write  = write_sysrq_trigger,
	.llseek  = noop_llseek,
};

/*
 * Create the procfs entry used to issue SysRq requests from user space.
 * The node is created with mode S_IWUSR and backed by
 * proc_sysrq_trigger_operations. A failure is only logged; nothing is
 * returned to the caller.
 */
static void sysrq_init_procfs(void)
{
	/* Create the /proc/sysrq-trigger node */
	if (!proc_create("sysrq-trigger", S_IWUSR, NULL,
			 &proc_sysrq_trigger_operations))
		pr_err("Failed to register proc interface\n");
}

/*
 * SysRq initialisation, run as a device_initcall.
 * The procfs trigger node is always set up; the keyboard input handler is
 * registered only when sysrq_on() returns true. Always returns 0.
 */
static int __init sysrq_init(void)
{
	sysrq_init_procfs();
	
	if (sysrq_on())
		sysrq_register_handler();
	
	return 0;
}
device_initcall(sysrq_init);

3.2 通过 procfs 发起 SysRq 请求

本小节讨论以 procfs 文件节点 发起 SysRq 请求的方式,它们都是以向 /proc/sysrq-trigger 写入预定义字符发起:

# echo X > /proc/sysrq-trigger

操作需要特权用户权限。下面讲述几个常见的 SysRq 请求操作,并对它们的实现做简单分析。所有支持的 SysRq 请求列举在数据表格 sysrq_key_table[] 中:

/*
 * Table of SysRq operations, one slot per request character (36 slots).
 * The trailing comment on each entry names the character of that slot:
 * the digits 0-9 first, then letters. "..." marks entries elided from
 * this excerpt; NULL marks a slot with no operation installed here.
 */
static struct sysrq_key_op *sysrq_key_table[36] = {
	/* Change the kernel log level */
	&sysrq_loglevel_op,  /* 0 */
	&sysrq_loglevel_op,  /* 1 */
	&sysrq_loglevel_op,  /* 2 */
	&sysrq_loglevel_op,  /* 3 */
	&sysrq_loglevel_op,  /* 4 */
	&sysrq_loglevel_op,  /* 5 */
	&sysrq_loglevel_op,  /* 6 */
	&sysrq_loglevel_op,  /* 7 */
	&sysrq_loglevel_op,  /* 8 */
	&sysrq_loglevel_op,  /* 9 */

	...
	/* Reboot the system immediately, without syncing or unmounting disks */
	&sysrq_reboot_op,  /* b */
	/* Trigger a kernel crash */
	&sysrq_crash_op,  /* c */
	/* Show all locks currently held (requires CONFIG_LOCKDEP) */
 	&sysrq_showlocks_op,  /* d */
 	/* Send SIGTERM to all processes except init */
	&sysrq_term_op,   /* e */
	/* Trigger OOM reclaim */
	&sysrq_moom_op,   /* f */
	...
	/* Send SIGKILL to all processes except init */
	&sysrq_kill_op,   /* i */
	/* Thaw frozen filesystems */
#ifdef CONFIG_BLOCK
	&sysrq_thaw_op,   /* j */
#else
	NULL,    /* j */
#endif
	&sysrq_SAK_op,   /* k */
	/* Show a stack backtrace of all active CPUs */
#ifdef CONFIG_SMP
	&sysrq_showallcpus_op,  /* l */
#else
	NULL,    /* l */
#endif
	/* Show system memory information, similar to cat /proc/meminfo */
	&sysrq_showmem_op,  /* m */
	/* Turn all real-time tasks into normal tasks */
	&sysrq_unrt_op,   /* n */
	/* o: This will often be registered as 'Off' at init time */
	NULL,    /* o */
	/* Show the current CPU registers and flags */
	&sysrq_showregs_op,  /* p */
	/* Show the high-resolution timers of every CPU */
	&sysrq_show_timers_op,  /* q */
	/* Turn off keyboard RAW mode */
	&sysrq_unraw_op,  /* r */
	/* Attempt to sync all mounted filesystems */
	&sysrq_sync_op,   /* s */
	/* Dump the list of all current tasks and their information */
	&sysrq_showstate_op,  /* t */
	/* Attempt to remount all mounted filesystems read-only */
	&sysrq_mountro_op,  /* u */
	/* v: May be registered for frame buffer console restore */
	NULL,    /* v */
	/* Dump tasks that are in uninterruptible (blocked) state */
	&sysrq_showstate_blocked_op, /* w */
	/* x: May be registered on mips for TLB dump */
	/* x: May be registered on ppc/powerpc for xmon */
	/* x: May be registered on sparc64 for global PMU dump */
	NULL,    /* x */
	/* y: May be registered on sparc64 for global register dump */
	NULL,    /* y */
	/* Dump the ftrace buffer */
	&sysrq_ftrace_dump_op,  /* z */
};

/proc/sysrq-trigger 写入触发调用序列:

write()
	...
	write_sysrq_trigger()
		char c;
		get_user(c, buf);
		__handle_sysrq(c, false)

/*
 * Dispatch one SysRq request.
 *
 * @key:        request character; used to look up the operation.
 * @check_mask: when true, the operation runs only if
 *              sysrq_on_mask(op_p->enable_mask) is true; when false
 *              (as passed by write_sysrq_trigger()) that check is skipped.
 *
 * The "..." branches (operation disabled / no operation found) are elided
 * in this excerpt.
 */
void __handle_sysrq(int key, bool check_mask)
{
	struct sysrq_key_op *op_p;
	int orig_log_level;
	int i;

	rcu_sysrq_start();
	rcu_read_lock();
	/*
	 * Raise the apparent loglevel to maximum so that the sysrq header
	 * is shown to provide the user with positive feedback.  We do not
	 * simply emit this at KERN_EMERG as that would change message
	 * routing in the consumers of /proc/kmsg.
	 */
	orig_log_level = console_loglevel;
	console_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
	pr_info("SysRq : ");

	op_p = __sysrq_get_key_op(key); /* Look up the SysRq operation for @key in sysrq_key_table[] */
	if (op_p) {
		/*
		 * Should we check for enabled operations (/proc/sysrq-trigger
		 * should not) and is the invoked operation enabled?
		 */
		if (!check_mask || sysrq_on_mask(op_p->enable_mask)) {
			pr_cont("%s\n", op_p->action_msg);
			/* Restore the saved loglevel before running the handler */
			console_loglevel = orig_log_level;
			op_p->handler(key); /* Invoke the SysRq operation: sysrq_handle_crash(), ... */
		} else {
			...
		}
	} else {
		...
	}
	rcu_read_unlock();
	rcu_sysrq_end();
}

3.2.1 修改内核日志等级

3.2.1.1 触发
# echo 3 > /proc/sysrq-trigger 
[ 6956.852664] sysrq: SysRq : Changing Loglevel
[ 6956.856987] sysrq: Loglevel set to 3
3.2.1.2 实现简析
write()
	...
	write_sysrq_trigger()
		sysrq_handle_loglevel()
/* drivers/tty/sysrq.c */

/*
 * SysRq handler for the digit keys: set console_loglevel to the numeric
 * value of @key (key - '0').
 */
static void sysrq_handle_loglevel(int key)
{
	int i;

	i = key - '0';
	/*
	 * Temporarily switch to the default loglevel while printing the
	 * message below (presumably so the message is visible regardless of
	 * the previous setting), then apply the requested level.
	 */
	console_loglevel = CONSOLE_LOGLEVEL_DEFAULT;
	pr_info("Loglevel set to %d\n", i);
	console_loglevel = i;
}

3.2.2 手动触发内核 panic

3.2.2.1 触发
# echo c > /proc/sysrq-trigger
[  856.968802] sysrq: SysRq : Trigger a crash
[  856.973059] Unable to handle kernel NULL pointer dereference at virtual address 00000000
[  856.981194] pgd = 6e7ca3d4
[  856.984471] [00000000] *pgd=8a84c831, *pte=00000000, *ppte=00000000
[  856.990789] Internal error: Oops: 817 [#1] PREEMPT ARM
[  856.995946] Modules linked in:
[  856.999020] CPU: 0 PID: 123 Comm: sh Not tainted 4.19.94-g1194fe2-dirty #102
[  857.006095] Hardware name: Generic AM33XX (Flattened Device Tree)
[  857.012236] PC is at sysrq_handle_crash+0x2c/0x34
[  857.016958] LR is at sysrq_handle_crash+0x28/0x34
[  857.021678] pc : [<c04fd9bc>]    lr : [<c04fd9b8>]    psr: 60080013
[  857.027969] sp : ca8bbe38  ip : ca8bbe38  fp : ca8bbe4c
[  857.033212] r10: 00000004  r9 : ca8bbf60  r8 : c0e2c544
[  857.038455] r7 : 00000000  r6 : 00000063  r5 : 00000007  r4 : 00000001
[  857.045007] r3 : 00000000  r2 : 00000000  r1 : 00000000  r0 : c0e14618
[  857.051561] Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none
[  857.058723] Control: 10c5387d  Table: 8a868019  DAC: 00000051
[  857.064493] Process sh (pid: 123, stack limit = 0xce84bd3c)
[  857.070087] Stack: (0xca8bbe38 to 0xca8bc000)
[  857.074462] be20:                                                       c0e0ff48 00000007
[  857.082676] be40: ca8bbe7c ca8bbe50 c04fdf6c c04fd99c 00000055 00000002 c04fe474 00000000
[  857.090890] be60: ca8bbf60 00000002 ca8bbf60 00000004 ca8bbe94 ca8bbe80 c04fe4d0 c04fdec8
[  857.099104] be80: cf26a980 c04fe474 ca8bbeb4 ca8bbe98 c029e968 c04fe480 c0e03048 ca87e240
[  857.107317] bea0: c029e908 ca8bbf60 ca8bbf2c ca8bbeb8 c02371f8 c029e914 00000000 00000000
[  857.115532] bec0: fffffff6 c0e03048 ca8bbf34 ca8bbed8 c012e2ac c01467e4 c0112fdc 00000004
[  857.123745] bee0: 00000007 00000000 00000000 00000000 00000000 00000000 ca873000 c012bc68
[  857.131960] bf00: 00000100 42749327 00000002 ca87e240 000c4b38 ca8bbf60 00000002 ca8ba000
[  857.140174] bf20: ca8bbf5c ca8bbf30 c02374b4 c02371c4 c012e42c c012e228 ca8bbf5c ca87e240
[  857.148388] bf40: c0e03048 ca87e240 000c4b38 00000002 ca8bbf94 ca8bbf60 c023773c c0237414
[  857.156602] bf60: 00000000 00000000 ca8bbf94 42749327 c0257618 000c2d5c 00000001 000c4b38
[  857.164816] bf80: 00000004 c0101204 ca8bbfa4 ca8bbf98 c02377c0 c02376dc 00000000 ca8bbfa8
[  857.173030] bfa0: c0101000 c02377bc 000c2d5c 00000001 00000001 000c4b38 00000002 00000000
[  857.181243] bfc0: 000c2d5c 00000001 000c4b38 00000004 00000001 00000020 00000000 00091144
[  857.189458] bfe0: 00000000 bec9a4bc 0001a908 b6f53556 60080030 00000001 00000000 00000000
[  857.197664] Backtrace: 
[  857.200127] [<c04fd990>] (sysrq_handle_crash) from [<c04fdf6c>] (__handle_sysrq+0xb0/0x180)
[  857.208513]  r5:00000007 r4:c0e0ff48
[  857.212106] [<c04fdebc>] (__handle_sysrq) from [<c04fe4d0>] (write_sysrq_trigger+0x5c/0x6c)
[  857.220494]  r10:00000004 r9:ca8bbf60 r8:00000002 r7:ca8bbf60 r6:00000000 r5:c04fe474
[  857.228354]  r4:00000002 r3:00000055
[  857.231948] [<c04fe474>] (write_sysrq_trigger) from [<c029e968>] (proc_reg_write+0x60/0x90)
[  857.240332]  r5:c04fe474 r4:cf26a980
[  857.243927] [<c029e908>] (proc_reg_write) from [<c02371f8>] (__vfs_write+0x40/0x164)
[  857.251703]  r7:ca8bbf60 r6:c029e908 r5:ca87e240 r4:c0e03048
[  857.257387] [<c02371b8>] (__vfs_write) from [<c02374b4>] (vfs_write+0xac/0x188)
[  857.264729]  r9:ca8ba000 r8:00000002 r7:ca8bbf60 r6:000c4b38 r5:ca87e240 r4:00000002
[  857.272506] [<c0237408>] (vfs_write) from [<c023773c>] (ksys_write+0x6c/0xe0)
[  857.279672]  r8:00000002 r7:000c4b38 r6:ca87e240 r5:c0e03048 r4:ca87e240
[  857.286402] [<c02376d0>] (ksys_write) from [<c02377c0>] (sys_write+0x10/0x14)
[  857.293568]  r8:c0101204 r7:00000004 r6:000c4b38 r5:00000001 r4:000c2d5c
[  857.300300] [<c02377b0>] (sys_write) from [<c0101000>] (ret_fast_syscall+0x0/0x54)
[  857.307899] Exception stack(0xca8bbfa8 to 0xca8bbff0)
[  857.312971] bfa0:                   000c2d5c 00000001 00000001 000c4b38 00000002 00000000
[  857.321185] bfc0: 000c2d5c 00000001 000c4b38 00000004 00000001 00000020 00000000 00091144
[  857.329397] bfe0: 00000000 bec9a4bc 0001a908 b6f53556
[  857.334472] Code: e5834000 f57ff04e ebf05e88 e3a03000 (e5c34000) 
[  857.343688] ---[ end trace 0caa0a25d6458889 ]---
[  857.348334] Kernel panic - not syncing: Fatal exception
[  857.353587] ---[ end Kernel panic - not syncing: Fatal exception ]---
3.2.2.2 实现简析
write()
	...
	write_sysrq_trigger()
		sysrq_handle_crash()
/* drivers/tty/sysrq.c */

/*
 * SysRq handler for 'c': deliberately crash the kernel.
 * It sets panic_on_oops and then writes through a NULL pointer, so the
 * resulting oops is turned into a panic. @key is not used.
 */
static void sysrq_handle_crash(int key)
{
	char *killer = NULL;
	
	/* we need to release the RCU read lock here,
	 * otherwise we get an annoying
	 * 'BUG: sleeping function called from invalid context'
	 * complaint from the kernel before the panic.
	 */
	rcu_read_unlock();
	/* Force the oops below to end in a kernel panic */
	panic_on_oops = 1; /* force panic */
	wmb();
	*killer = 1; /* Write through a NULL pointer to raise a page-table access fault */
}
/*
 * SysRq operation descriptor for the crash request.
 * .action_msg is the text __handle_sysrq() prints before calling
 * .handler; .enable_mask is what __handle_sysrq() tests with
 * sysrq_on_mask() when its check_mask argument is true.
 */
static struct sysrq_key_op sysrq_crash_op = {
	.handler = sysrq_handle_crash,
	.help_msg = "crash(c)",
	.action_msg = "Trigger a crash",
	.enable_mask = SYSRQ_ENABLE_DUMP,
};

/* Excerpt of the operation table showing only the 'c' slot; "..." marks elided entries. */
static struct sysrq_key_op *sysrq_key_table[36] = {
	...
	&sysrq_crash_op,  /* c */
	...
};

假定使用 ARM32 3 级分页:

/* arch/arm/mm/fsr-3level.c */

/*
 * Excerpt of the fault-status dispatch table ("..." marks elided entries).
 * Each entry appears to be { handler, signal, signal code, description }
 * -- NOTE(review): confirm field meanings against struct fsr_info.
 * Level 1 and level 2 translation faults go to do_translation_fault();
 * level 3 translation faults go to do_page_fault().
 */
static struct fsr_info fsr_info[] = {
	...
	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" },
	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" },
	{ do_page_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" },
	...
};

do_translation_fault() / do_page_fault()
	...
	__do_kernel_fault(mm, addr, fsr, regs)
/* arch/arm/mm/fault.c */

/*
 * Report a fault taken in kernel mode that could not be handled: print the
 * "Unable to handle kernel ..." message, dump the page-table entries for
 * @addr, then call die().
 *
 * This is an annotated excerpt: "..." marks elided code, and the indented
 * lines under die() are a sketch of what die() goes on to call, not
 * statements of this function.
 */
static void
__do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
		struct pt_regs *regs)
{
	...
	bust_spinlocks(1);
	/* Addresses below PAGE_SIZE are reported as a NULL pointer dereference */
	pr_alert("Unable to handle kernel %s at virtual address %08lx\n",
		(addr < PAGE_SIZE) ? "NULL pointer dereference" :
		"paging request", addr);
	
	show_pte(mm, addr); /* Print the pgd, pmd, ... pte entries for address @addr */
	die("Oops", regs, fsr); /* arch/arm/kernel/traps.c */
		...
		oops_end(flags, regs, sig)
			...
			if (panic_on_oops) /* Makes the oops end in a kernel panic */
				panic("Fatal exception");
			...
	bust_spinlocks(0);
	do_exit(SIGKILL);
}
3.2.2.3 应用场景

有时候程序卡住了(如死锁),可能想知道当前的调用链,可以通过 c 触发 panic 导出堆栈记录。如果启用了 kexec + kdump ,还会进一步进入内核转储过程,后续可用 crash 工具对转储内核做进一步的分析。

3.2.3 其它 SysRq 请求

感兴趣的读者可自行分析。

3.3 通过 特殊按键 发起 SysRq 请求

发起 SysRq 请求的具体按键,各个硬件平台各有不同,读者可参考文章末尾资料,或查阅相关资料了解。本小节对通过 特殊按键 的方式发起 SysRq 请求的过程做简要分析,如下:

/* drivers/input/input.c */
input_report_key()
	input_handle_event(dev, type, code, value)
		input_pass_values(dev, dev->vals, dev->num_vals)
			/* 将事件数据传递给挂接在输入设备 input_dev 上 input_handler 处理 */
			list_for_each_entry_rcu(handle, &dev->h_list, d_node)
				if (handle->open) {
					count = input_to_handler(handle, vals, count);
					if (handler->filter) {
						for (v = vals; v != vals + count; v++) {
							if (handler->filter(handle, v->type, v->code, v->value)) /* sysrq_filter(), ... */
								continue;
							if (end != v)
								*end = *v;
							end++;
						}
						count = end - vals;
					}
					
					/* 所有按键事件已被过滤处理(如 SysRq 按键事件),没有按键事件需做进一步处理 */
					if (!count)
						return 0;

					/* 按键事件处理 */
					if (handler->events)
						handler->events(handle, vals, count);
					else if (handler->event)
						for (v = vals; v != vals + count; v++)
							handler->event(handle, v->type, v->code, v->value);
					
					return count;
				}

/* drivers/tty/sysrq.c */
sysrq_filter()
	sysrq_handle_keypress(sysrq, code, value)

/*
 * Process one key event for the SysRq key-press path (called from
 * sysrq_filter()). Excerpt: "..." marks elided code, including the
 * handling of the Alt and SysRq keys themselves and the return value.
 *
 * Any other key, while sysrq->active is set, is translated through
 * sysrq_xlate[] and dispatched with check_mask == true.
 */
static bool sysrq_handle_keypress(struct sysrq_state *sysrq,
			unsigned int code, int value)
{
	...
	
	switch (code) {
	case KEY_LEFTALT:
	case KEY_RIGHTALT:
		...
		break;
		
	case KEY_SYSRQ:
		...
		break;
	
	default:
		/* NOTE(review): value 0 presumably means release and 2 autorepeat -- confirm against the input event definitions */
		if (sysrq->active && value && value != 2) {
			sysrq->need_reinject = false;
			__handle_sysrq(sysrq_xlate[code], true); /* Handle the SysRq key event */
		}
		break;
	}
	
	...
}

更多按键处理的细节可参考博文:Linux输入子系统简析 。

4. 参考资料

https://www.kernel.org/doc/html/latest/translations/zh_CN/admin-guide/sysrq.html

你可能感兴趣的:(#,追踪,&,调试,&,性能,linux,SysRq)