Ftrace分析cpu idle被周期唤醒原因

1. 背景介绍

Ftrace的配置和使用,参考链接:
https://blog.csdn.net/lunhui2016/article/details/88677464

在Linux系统中,为了降低功耗,基本都会配置上CONFIG_NO_HZ_IDLE选项:如果某个CPU上没有任何task需要运行,就会切换到idle task;在进入idle之前,内核会把该cpu的tick关闭,避免频繁的tick将cpu从idle中唤醒而增加功耗。但在实际项目中发现,cpu进入idle后,仍会被一个固定周期的tick中断唤醒,进而导致idle进出过于频繁,增加了系统的功耗。

本文介绍利用Ftrace提供的trace event来分析cpu在进入idle后,被周期性tick中断打断的原因并解决,以降低产品功耗。

2. 问题分析

2.1 idle下的tick中断统计

# Print the first two lines of /proc/interrupts once per second, in the background.
while true; do head -n 2 /proc/interrupts; sleep 1; done &

           CPU0       CPU1       CPU2       CPU3
  4:       7791       2456        462       4020     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       7829       2480        463       4051     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       7861       2510        463       4072     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       7896       2530        463       4090     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       7930       2550        463       4115     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       7967       2579        464       4143     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       8005       2611        464       4176     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       8038       2638        464       4199     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       8074       2656        464       4228     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       8111       2675        465       4256     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       8147       2694        465       4283     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       8184       2727        465       4316     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       8217       2747        465       4338     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       8251       2766        466       4363     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       8288       2784        466       4391     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       8325       2811        466       4418     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       8361       2836        466       4450     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       8400       2859        467       4473     GIC-0  27 Level     arch_timer
           CPU0       CPU1       CPU2       CPU3
  4:       8437       2880        467       4497     GIC-0  27 Level     arch_timer

从统计中可以看出,采样间隔为1s,而cpu2的arch_timer中断计数大约每4次采样才增加1,也就是说cpu2对应的tick中断基本每隔4s左右就会触发一次,导致cpu2从idle中退出。

2.2 idle下的tick中断产生原因分析

使用ftrace跟踪timer相关的event:

  • 配置ftrace event相关信息
:Remount the root filesystem read-write
adb shell "mount -o remount,rw /"

:Ftrace setup - stop tracing and clear the current trace buffer
adb shell "echo 0 > /sys/kernel/debug/tracing/tracing_on"
adb shell "echo > /sys/kernel/debug/tracing/trace"

:Only trace cpu2 (cpumask value 4 selects bit 2)
adb shell "echo 4 > /sys/kernel/debug/tracing/tracing_cpumask"

:Ftrace hrtimer setup - enable each hrtimer event and attach a traceon trigger to it
adb shell "echo 1 > /sys/kernel/debug/tracing/events/timer/hrtimer_start/enable"
adb shell "echo traceon > /sys/kernel/debug/tracing/events/timer/hrtimer_start/trigger"
adb shell "echo 1 > /sys/kernel/debug/tracing/events/timer/hrtimer_expire_entry/enable"
adb shell "echo traceon > /sys/kernel/debug/tracing/events/timer/hrtimer_expire_entry/trigger"
adb shell "echo 1 > /sys/kernel/debug/tracing/events/timer/hrtimer_expire_exit/enable"
adb shell "echo traceon > /sys/kernel/debug/tracing/events/timer/hrtimer_expire_exit/trigger"
adb shell "echo 1 > /sys/kernel/debug/tracing/events/timer/hrtimer_cancel/enable"
adb shell "echo traceon > /sys/kernel/debug/tracing/events/timer/hrtimer_cancel/trigger"

:Wait for a key press; everything below undoes the configuration made above
pause

:restore Ftrace config - disable the events, remove the traceon triggers, stop tracing
adb shell "echo 0 > /sys/kernel/debug/tracing/events/timer/hrtimer_start/enable"
adb shell "echo !traceon > /sys/kernel/debug/tracing/events/timer/hrtimer_start/trigger"
adb shell "echo 0 > /sys/kernel/debug/tracing/events/timer/hrtimer_expire_entry/enable"
adb shell "echo !traceon > /sys/kernel/debug/tracing/events/timer/hrtimer_expire_entry/trigger"
adb shell "echo 0 > /sys/kernel/debug/tracing/events/timer/hrtimer_expire_exit/enable"
adb shell "echo !traceon > /sys/kernel/debug/tracing/events/timer/hrtimer_expire_exit/trigger"
adb shell "echo 0 > /sys/kernel/debug/tracing/events/timer/hrtimer_cancel/enable"
adb shell "echo !traceon > /sys/kernel/debug/tracing/events/timer/hrtimer_cancel/trigger"
adb shell "echo 0 > /sys/kernel/debug/tracing/tracing_on"
  • 使能ftrace并且输出跟踪结果
# echo 1 > /sys/kernel/debug/tracing/tracing_on ;cat /sys/kernel/debug/tracing/trace
# tracer: nop
#
#                              _-----=> irqs-off
#                             / _----=> need-resched
#                            | /  _----=> need-resched_lazy
#                            || / _---=> hardirq/softirq
#                            ||| / _--=> preempt-depth
#                            |||| /     delay
#           TASK-PID   CPU#  |||||    TIMESTAMP  FUNCTION
#              | |       |   |||||       |         |
          -0     [002] d..h2..    48.050752: hrtimer_cancel: hrtimer=ffff80006ff1f888
          -0     [002] d..h1..    48.050757: hrtimer_expire_entry: hrtimer=ffff80006ff1f888 function=watchdog_timer_fn now=48046003339
          -0     [002] dn.h1..    48.050761: hrtimer_expire_exit: hrtimer=ffff80006ff1f888
          -0     [002] dn.h2..    48.050762: hrtimer_start: hrtimer=ffff80006ff1f888 function=watchdog_timer_fn expires=52046000000 softexpires=52046000000 mode=ABS
          -0     [002] dn..2..    48.050765: hrtimer_start: hrtimer=ffff80006ff1f730 function=tick_sched_timer expires=48047000000 softexpires=48047000000 mode=ABS|PINNED
          -0     [002] d...2..    48.050776: hrtimer_cancel: hrtimer=ffff80006ff1f730
          -0     [002] d..h2..    52.050750: hrtimer_cancel: hrtimer=ffff80006ff1f888
          -0     [002] d..h1..    52.050751: hrtimer_expire_entry: hrtimer=ffff80006ff1f888 function=watchdog_timer_fn now=52046003339
          -0     [002] dn.h1..    52.050753: hrtimer_expire_exit: hrtimer=ffff80006ff1f888
          -0     [002] dn.h2..    52.050754: hrtimer_start: hrtimer=ffff80006ff1f888 function=watchdog_timer_fn expires=56046000000 softexpires=56046000000 mode=ABS
          -0     [002] dn..2..    52.050757: hrtimer_start: hrtimer=ffff80006ff1f730 function=tick_sched_timer expires=52047000000 softexpires=52047000000 mode=ABS|PINNED
          -0     [002] d...2..    52.050766: hrtimer_cancel: hrtimer=ffff80006ff1f730
          -0     [002] d..h2..    56.050750: hrtimer_cancel: hrtimer=ffff80006ff1f888
          -0     [002] d..h1..    56.050751: hrtimer_expire_entry: hrtimer=ffff80006ff1f888 function=watchdog_timer_fn now=56046003673
          -0     [002] dn.h1..    56.050754: hrtimer_expire_exit: hrtimer=ffff80006ff1f888
          -0     [002] dn.h2..    56.050754: hrtimer_start: hrtimer=ffff80006ff1f888 function=watchdog_timer_fn expires=60046000000 softexpires=60046000000 mode=ABS
          -0     [002] dn..2..    56.050757: hrtimer_start: hrtimer=ffff80006ff1f730 function=tick_sched_timer expires=56047000000 softexpires=56047000000 mode=ABS|PINNED
          -0     [002] d...2..    56.050766: hrtimer_cancel: hrtimer=ffff80006ff1f730
          -0     [002] d..h2..    60.050749: hrtimer_cancel: hrtimer=ffff80006ff1f888
          -0     [002] d..h1..    60.050750: hrtimer_expire_entry: hrtimer=ffff80006ff1f888 function=watchdog_timer_fn now=60046003007
...

从Ftrace抓取的log可以看到,有个watchdog timer会周期性触发,时间也是4s左右,跟现象一样,因此很可能就是这个timer导致的了

搜索源码发现,是开启了CONFIG_SOFTLOCKUP_DETECTOR导致,代码如下所示:

/*
 * watchdog kicker functions
 *
 * hrtimer callback of the lockup watchdog. This is an excerpt: only the
 * beginning of the function is shown, the remainder is omitted.
 *
 * When watchdog_enabled is zero the callback returns HRTIMER_NORESTART
 * right away. Otherwise it kicks the hardlockup detector, wakes this CPU's
 * softlockup_watchdog task and forwards the timer by sample_period so that
 * it fires again (".. and repeat").
 */
static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
{
	unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
	struct pt_regs *regs = get_irq_regs();
	int duration;
	int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;

	if (!watchdog_enabled)
		return HRTIMER_NORESTART;

	/* kick the hardlockup detector */
	watchdog_interrupt_count();

	/* kick the softlockup detector */
	wake_up_process(__this_cpu_read(softlockup_watchdog));

	/* .. and repeat */
	hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));

	/* ... (remainder of the function omitted in this excerpt) ... */
}

其中softlockup检查的采样周期计算如下所示:

/*
 * Return the soft-lockup threshold.
 *
 * Hard-lockup warnings are meant to fire after only a few seconds, while
 * soft lockups can produce false positives under extreme conditions and
 * therefore want a larger threshold. The two are coupled by a fixed factor:
 * the soft threshold is twice the hard threshold (watchdog_thresh).
 */
static int get_softlockup_thresh(void)
{
	return 2 * watchdog_thresh;
}

static void set_sample_period(void)
{
	/*
	 * convert watchdog_thresh from seconds to ns
	 * the divide by 5 is to give hrtimer several chances (two
	 * or three with the current relation between the soft
	 * and hard thresholds) to increment before the
	 * hardlockup detector generates a warning
	 */
	sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
	watchdog_update_hrtimer_threshold(sample_period);
}

其中watchdog_thresh是一个全局变量,默认值为10,如下所示:

/* Global state and tunables of the lockup watchdog. */

/* NOTE(review): presumably the effective enable state; the timer callback stops re-arming when this is zero - confirm. */
unsigned long __read_mostly watchdog_enabled;
/* NOTE(review): the *_user_enabled names suggest user-requested settings - confirm against the sysctl handlers. */
int __read_mostly watchdog_user_enabled = 1;
int __read_mostly nmi_watchdog_user_enabled = NMI_WATCHDOG_DEFAULT;
int __read_mostly soft_watchdog_user_enabled = 1;
/* Lockup threshold in seconds (set_sample_period() converts it from seconds to ns); defaults to 10. */
int __read_mostly watchdog_thresh = 10;
int __read_mostly nmi_watchdog_available;

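代入可得:sample_period = watchdog_thresh × 2 × (NSEC_PER_SEC / 5) = 10 × 2 × 200,000,000 ns = 4,000,000,000 ns,即这个hrtimer的周期正好是4s,符合Ftrace抓取的log情况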

3. 问题解决

在开发过程中,打开CONFIG_SOFTLOCKUP_DETECTOR这个配置,是方便开发者进行系统问题定位和分析的,如果进行release,则该配置可以关闭,因此把这个配置关闭即可。

你可能感兴趣的:(Linux,debug,linux,linux)