主要想了解下Linux的hrtimer。
用来记录启动至今的滴答数。
因为 jiffies
存在溢出的可能,建议使用内核提供的宏来进行时间比较
#include <linux/jiffies.h>
time_after(a,b)
time_before(a,b)
time_after_eq(a,b)
time_before_eq(a,b)
time_in_range(a,b,c)
定义如下:
/* Time value with microsecond resolution: whole seconds plus a sub-second part. */
struct timeval {
__kernel_time_t tv_sec; /* seconds */
__kernel_suseconds_t tv_usec; /* microseconds */
};
一般下面2个函数使用这个结构体:
#include <sys/time.h>
/* struct timezone is obsolete; callers normally pass NULL for the tz argument. */
struct timezone {
int tz_minuteswest; /* minutes west of Greenwich */
int tz_dsttime; /* type of DST correction */
};
/* Read the current time of day into *tv; tz is normally NULL (see above). */
int gettimeofday(struct timeval *tv, struct timezone *tz);
/* Set the current time of day from *tv; tz is normally NULL (see above). */
int settimeofday(const struct timeval *tv, const struct timezone *tz);
定义如下:
/* Time value with nanosecond resolution: whole seconds plus a sub-second part. */
struct timespec {
__kernel_time_t tv_sec; /* seconds */
long tv_nsec; /* nanoseconds */
};
/* 内核提供的一些辅助函数 */
#include <linux/time64.h>
/* Convert a timespec64 into a signed 64-bit nanosecond count. */
s64 timespec64_to_ns(const struct timespec64 *ts);
/* Convert a signed 64-bit nanosecond count into a timespec64. */
struct timespec64 ns_to_timespec64(const s64 nsec);
内核通用时间框架,兼容32,64位大小端
/* 这是在 Linux5.15的定义,别的版本可能不一致 */
typedef s64>ktime_t;
提供的函数可在linux/ktime.h
中找到
几类时间:
RTC
时间xtime
:同RTC
时间,只是xtime
实际上是一个内存中的变量,它的访问速度非常快,xtime
记录的是自1970年1月1日0时(UTC)到当前时刻所经历的纳秒数
monotonic time
:单调递增的时间, 不包含系统休眠时间raw monotonic time
:与monotonic time
相同,不过不受NTP
时间调整的影响boot time
:与monotonic time
,不过会加上休眠时间,所以相当于总启动时间结构体定义如下:
/*
 * Per-clock read state embedded in struct timekeeper (once for the monotonic
 * clock and once for the raw monotonic clock).
 */
struct tk_read_base {
struct clocksource *clock; /* points to the clocksource being read */
u64 mask;
u64 cycle_last; /* cycle value obtained by the previous read */
u32 mult;
u32 shift;
u64 xtime_nsec;
ktime_t base; /* maintained by the timekeeper */
u64 base_real;
};
/*
 * Central timekeeping state: the monotonic and raw read bases, the current
 * wall-clock seconds, the offsets between the different clocks, and the
 * bookkeeping used for NTP adjustment.
 */
struct timekeeper {
struct tk_read_base tkr_mono;
struct tk_read_base tkr_raw;
u64 xtime_sec; /* the "xtime" wall-clock seconds described in the text above */
unsigned long ktime_sec;
/*
 * wall_to_monotonic: offset between xtime and monotonic time, such that
 * monotonic = xtime + wall_to_monotonic
 */
struct timespec64 wall_to_monotonic;
ktime_t offs_real;
ktime_t offs_boot;
ktime_t offs_tai;
s32 tai_offset;
unsigned int clock_was_set_seq;
u8 cs_was_changed_seq;
ktime_t next_leap_ktime;
u64 raw_sec;
struct timespec64 monotonic_to_boot;
/* The members below are for timekeeping-internal use. */
u64 cycle_interval;
u64 xtime_interval;
s64 xtime_remainder;
u64 raw_interval;
/* The ntp_tick_length() value currently in use.
 * This cached copy ensures we consistently apply the tick
 * length for an entire tick, as ntp_tick_length may change
 * mid-tick, and we don't want to apply that new value to
 * the tick in progress.
 */
u64 ntp_tick;
/* Difference between accumulated time and NTP time in ntp
 * shifted nano seconds. */
s64 ntp_error;
u32 ntp_error_shift;
u32 ntp_err_mult;
/* Flag used to avoid updating NTP twice with same second */
u32 skip_second_overflow;
#ifdef CONFIG_DEBUG_TIMEKEEPING
long last_warning;
/*
 * These simple flag variables are managed
 * without locks, which is racy, but they are
 * ok since we don't really care about being
 * super precise about how many events were
 * seen, just that a problem was observed.
 */
int underflow_seen;
int overflow_seen;
#endif
};
/*
 * kernel/time/timekeeping.c statically defines a single tk_core object: the
 * timekeeper together with the sequence counter (tied to timekeeper_lock)
 * that is initialised alongside it.
 */
static struct {
seqcount_raw_spinlock_t seq;
struct timekeeper timekeeper;
} tk_core ____cacheline_aligned = {
.seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_core.seq, &timekeeper_lock),
};
提供的接口在 linux/timekeeping.h
使用 struct clocksource
表示,主要是提供 read
函数
重要字段:
rating
:精度,有以下取值范围:
read
回调:返回计数值,不是真正的时间,还要依赖mult
和shift
进行转换
t = (cycle * mult) >> shift
,只要保证时钟频率 F = (1 << shift) / mult
,因为 t = cycle / F
时钟源通过 clocksource_register_hz
来注册。
如 ARM:
/*
 * Clocksource description for the ARM architected system counter, as shown in
 * the article; it is registered below with clocksource_register_hz().
 */
static struct clocksource clocksource_counter = {
.name = "arch_sys_counter",
.id = CSID_ARM_ARCH_COUNTER,
.rating = 400,
.read = arch_counter_read, /* returns the raw counter value, not a time */
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
...
clocksource_register_hz(&clocksource_counter, arch_timer_rate);
clocksource_register_hz做的工作:
mult, shift
&clocksource_list
clocksource_select
来选择一个最好的时钟源,记录到curr_clocksource
并调用timekeeping_notify
通知时钟源改变默认会创建一个jiffies
的时钟源(kernel/time/jiffies.c
)
/*
 * The default jiffies-based clocksource (kernel/time/jiffies.c). It carries
 * the lowest valid rating, so any other registered clocksource with a higher
 * rating is preferred over it.
 */
static struct clocksource clocksource_jiffies = {
.name = "jiffies",
.rating = 1, /* lowest valid rating*/
.uncertainty_margin = 32 * NSEC_PER_MSEC,
.read = jiffies_read,
.mask = CLOCKSOURCE_MASK(32),
.mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */
.shift = JIFFIES_SHIFT,
.max_cycles = 10,
};
/*
 * Register the jiffies clocksource and hand the registration result back to
 * the caller.
 */
static int __init init_jiffies_clocksource(void)
{
	int status = __clocksource_register(&clocksource_jiffies);

	return status;
}
然后:
如果平台级的代码在初始化时也注册真正的硬件clocksource
,经过clocksource_select()
函数后,curr_clocksource
将会被设为最合适的clocksource
如果clocksource_select
函数认为需要切换更好的时钟源,它会通过timekeeping_notify
通知timekeeping
系统,使用新的clocksource
进行时间计数和更新操作
可编程的时钟设备:
clock_event_device
tick_device
是基于clock_event_device
的进一步封装,代替原有的时钟滴答中断,给内核提供tick事件定义如下:
/* Operating states of a clock event device. */
enum clock_event_state {
CLOCK_EVT_STATE_DETACHED, /* detached */
CLOCK_EVT_STATE_SHUTDOWN, /* shut down */
CLOCK_EVT_STATE_PERIODIC, /* periodic */
CLOCK_EVT_STATE_ONESHOT, /* one-shot */
CLOCK_EVT_STATE_ONESHOT_STOPPED, /* one-shot, stopped */
};
/*
 * A programmable clock event device: a timer that can be told when to raise
 * its next event and which calls event_handler when that event fires.
 */
struct clock_event_device {
/* Assigned by the framework; called by the event source's low-level handler. */
void (*event_handler)(struct clock_event_device *);
/* Program the next event as a tick (device cycle) delta. */
int (*set_next_event)(unsigned long evt, struct clock_event_device *);
/* Program the next event as a ktime_t expiry. */
int (*set_next_ktime)(ktime_t expires, struct clock_event_device *);
/* Time of the next event. */
ktime_t next_event;
/* Largest / smallest programmable delta, in nanoseconds. */
u64 max_delta_ns;
u64 min_delta_ns;
u32 mult;
u32 shift;
/* Current device state; assigned by the core code. */
enum clock_event_state state_use_accessors;
unsigned int features;
unsigned long retries; /* number of forced programming retries */
/* Switch the device to periodic mode. */
int (*set_state_periodic)(struct clock_event_device *);
/* Switch the device to one-shot mode. */
int (*set_state_oneshot)(struct clock_event_device *);
/* Switch the device to the oneshot_stopped state. */
int (*set_state_oneshot_stopped)(struct clock_event_device *);
/* Switch the device to the shutdown state. */
int (*set_state_shutdown)(struct clock_event_device *);
/* Resume the tick event device. */
int (*tick_resume)(struct clock_event_device *);
/* Used to broadcast events to the CPUs in mask. */
void (*broadcast)(const struct cpumask *mask);
void (*suspend)(struct clock_event_device *);
void (*resume)(struct clock_event_device *);
unsigned long min_delta_ticks;
unsigned long max_delta_ticks;
const char *name;
int rating;
int irq;
int bound_on;
const struct cpumask *cpumask;
struct list_head list;
struct module *owner;
} ____cacheline_aligned;
功能:定期生成tick事件,供调度器使用(旧框架),切换到高精度模式后,高精度定时器就接管了tick_device
,不会再提供原有的tick事件机制
定义如下:
/* Operating mode of a tick device: periodic ticks or one-shot programming. */
enum tick_device_mode {
TICKDEV_MODE_PERIODIC,
TICKDEV_MODE_ONESHOT,
};
/*
 * Thin wrapper around a clock_event_device plus its tick mode.
 * There is one instance per CPU, defined in kernel/time/tick-common.c.
 */
struct tick_device {
struct clock_event_device *evtdev;
enum tick_device_mode mode;
};
初始化:
xxxx 由各类平台驱动代码调用
如riscv:
TIMER_OF_DECLARE(riscv_timer, "riscv", riscv_timer_init_dt);
riscv_timer_init_dt
riscv_timer_starting_cpu
clockevents_config_and_register
如arm:
TIMER_OF_DECLARE(armv7_arch_timer, "arm,armv7-timer", arch_timer_of_init);
arch_timer_of_init
arch_timer_register
arch_timer_starting_cpu
__arch_timer_setup
clockevents_config_and_register
clockevents_config_and_register
clockevents_register_device
list_add(&dev->list, &clockevent_devices);
tick_check_new_device(dev);
tick_setup_device(td, newdev, cpu, cpumask_of(cpu));
td->mode = TICKDEV_MODE_PERIODIC; // 如果是第一次注册 clock_event_device 会默认设置成周期模式
tick_setup_periodic(newdev, 0); /* TICKDEV_MODE_PERIODIC 走这里 */
dev->event_handler = tick_handle_periodic;
clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
tick_setup_oneshot(newdev, handler, next_event); /* 其他走这里 */
newdev->event_handler = handler;
clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT);
clockevents_program_event(newdev, next_event, true);
如上可以看到,tick_device
的处理函数是tick_handle_periodic
。
/*
 * Event handler for a tick device running in periodic mode.
 *
 * tick_periodic() does the per-tick work. According to the article's notes
 * this covers: updating jiffies_64 and the wall time, refreshing CPU load
 * statistics, running hrtimers via hrtimer_run_queues(), raising
 * TIMER_SOFTIRQ for low-resolution timers, checking RCU callbacks and
 * calling scheduler_tick().
 *
 * When the device is in one-shot state the next period is programmed by hand.
 */
void tick_handle_periodic(struct clock_event_device *dev)
{
	int this_cpu = smp_processor_id();
	ktime_t next_expiry = dev->next_event;

	tick_periodic(this_cpu);

#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON)
	/*
	 * tick_periodic() may have switched this CPU to high-resolution or
	 * NOHZ mode (update_process_times() -> run_local_timers() ->
	 * hrtimer_run_queues()). In that case the device has a new handler
	 * and there is nothing left to do here.
	 */
	if (dev->event_handler != tick_handle_periodic)
		return;
#endif

	/* Only a device in one-shot state needs to be re-armed manually. */
	if (!clockevent_state_oneshot(dev))
		return;

	while (1) {
		/* Advance the expiry by one tick period and try to program it. */
		next_expiry = ktime_add_ns(next_expiry, TICK_NSEC);
		if (clockevents_program_event(dev, next_expiry, false) == 0)
			return;

		/*
		 * Programming failed, so that expiry is already in the past:
		 * account for the missed tick (when timekeeping allows it)
		 * and try the following period.
		 */
		if (timekeeping_valid_for_hres())
			tick_periodic(this_cpu);
	}
}
主要用于
hrtimer
继续提供tick
事件定义如下:
/*
 * Per-CPU state used to keep providing tick events once hrtimers have taken
 * over the tick device. One instance per CPU, defined in
 * kernel/time/tick-sched.c.
 */
struct tick_sched {
struct hrtimer sched_timer;
/*
 * Notification flag for clock changes: bit 0 is set when a new
 * clock_event_device or clocksource has been registered with the system.
 */
unsigned long check_clocks;
enum tick_nohz_mode nohz_mode; /* current NOHZ mode */
unsigned int inidle : 1; /* CPU is in idle */
unsigned int tick_stopped : 1; /* the idle tick has been stopped */
unsigned int idle_active : 1; /* CPU is in tick-idle mode; reset during interrupts */
unsigned int do_timer_last : 1; /* this CPU was the last to run do_timer before going idle */
unsigned int got_idle_tick : 1; /* a tick arrived while in idle */
ktime_t last_tick; /* time of the previous tick */
ktime_t next_tick; /* time of the next tick */
unsigned long idle_jiffies; /* jiffies value when idle was entered */
unsigned long idle_calls; /* number of times idle was entered */
unsigned long idle_sleeps; /* number of idle entries made with the tick stopped (per the article: ticks spent idle) */
ktime_t idle_entrytime; /* time idle was entered */
ktime_t idle_waketime; /* time idle was interrupted */
ktime_t idle_exittime; /* time idle was left */
ktime_t idle_sleeptime; /* total time with the tick stopped (no I/O pending) */
ktime_t iowait_sleeptime; /* total time with the tick stopped while I/O was pending */
unsigned long last_jiffies; /* jiffies value when the tick was stopped */
u64 timer_expires; /* expiry of the anticipated timer when the tick was stopped */
u64 timer_expires_base; /* base time that timer_expires is relative to */
u64 next_timer; /* expiry time of the next timer */
ktime_t idle_expires;
atomic_t tick_dep_mask;
unsigned long last_tick_jiffies; /* jiffies value seen at the last tick */
unsigned int stalled_jiffies;
};
初始化
start_kernel
tick_init
tick_broadcast_init
tick_nohz_init
/* kernel/time/tick-broadcast.c */
/*
 * Zero-allocate the cpumasks used by the tick broadcast code. The three
 * oneshot-related masks only exist when CONFIG_TICK_ONESHOT is enabled.
 * GFP_NOWAIT is passed to every allocation; the return values are not checked.
 */
void __init tick_broadcast_init(void)
{
zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT);
zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);
#ifdef CONFIG_TICK_ONESHOT
zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
zalloc_cpumask_var(&tick_broadcast_pending_mask, GFP_NOWAIT);
zalloc_cpumask_var(&tick_broadcast_force_mask, GFP_NOWAIT);
#endif
}
/* kernel/time/tick-sched.c */
/*
 * Finish setting up full dynticks. Does nothing unless
 * tick_nohz_full_running is set. Otherwise it validates the architecture,
 * removes the current CPU from the nohz_full mask when the PM configuration
 * requires it, enables context tracking on the remaining CPUs and installs
 * the CPU hotplug callback.
 */
void __init tick_nohz_init(void)
{
	int boot_cpu, cpu, err;

	if (!tick_nohz_full_running)
		return;

	/*
	 * Full dynticks uses irq work to drive the tick rescheduling on safe
	 * locking contexts. But then we need irq work to raise its own
	 * interrupts to avoid circular dependency on the tick
	 */
	if (!arch_irq_work_has_interrupt()) {
		pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
		cpumask_clear(tick_nohz_full_mask);
		tick_nohz_full_running = false;
		return;
	}

	if (IS_ENABLED(CONFIG_PM_SLEEP_SMP) &&
	    !IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) {
		boot_cpu = smp_processor_id();

		/* Keep the CPU running this code out of the nohz_full set. */
		if (cpumask_test_cpu(boot_cpu, tick_nohz_full_mask)) {
			pr_warn("NO_HZ: Clearing %d from nohz_full range "
				"for timekeeping\n", boot_cpu);
			cpumask_clear_cpu(boot_cpu, tick_nohz_full_mask);
		}
	}

	for_each_cpu(cpu, tick_nohz_full_mask)
		context_tracking_cpu_set(cpu);

	err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					"kernel/nohz:predown", NULL,
					tick_nohz_cpu_down);
	WARN_ON(err < 0);

	pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
		cpumask_pr_args(tick_nohz_full_mask));
}
/*
 * hrtimer callback for the per-CPU tick_sched timer: emulates the periodic
 * tick and re-arms itself one tick period later unless the tick is stopped.
 */
static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
{
	struct tick_sched *ts = container_of(timer, struct tick_sched,
					     sched_timer);
	struct pt_regs *irq_regs = get_irq_regs();
	ktime_t now = ktime_get();

	tick_sched_do_timer(ts, now);

	/*
	 * Without a valid regs pointer we are not in interrupt context, so
	 * the tick handler must not be called; only reset next_tick.
	 */
	if (!irq_regs)
		ts->next_tick = 0;
	else
		tick_sched_handle(ts, irq_regs);

	/* No need to reprogram if we are in idle or full dynticks mode */
	if (unlikely(ts->tick_stopped))
		return HRTIMER_NORESTART;

	hrtimer_forward(timer, now, TICK_NSEC);

	return HRTIMER_RESTART;
}
/*
 * Per-tick jiffies maintenance for the tick_sched timer: take over the
 * do_timer duty if nobody holds it, update jiffies when this CPU is
 * responsible, force an update when jiffies appears stalled, and record that
 * a tick arrived while idle.
 */
static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
{
	int this_cpu = smp_processor_id();

#ifdef CONFIG_NO_HZ_COMMON
	/* Nobody owns the do_timer duty: this CPU picks it up. */
	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
#ifdef CONFIG_NO_HZ_FULL
		WARN_ON_ONCE(tick_nohz_full_running);
#endif
		tick_do_timer_cpu = this_cpu;
	}
#endif

	/* Only the CPU in charge of jiffies performs the regular update. */
	if (tick_do_timer_cpu == this_cpu)
		tick_do_update_jiffies64(now);

	/*
	 * If jiffies update stalled for too long (timekeeper in stop_machine()
	 * or VMEXIT'ed for several msecs), force an update.
	 */
	if (ts->last_tick_jiffies == jiffies) {
		if (++ts->stalled_jiffies == MAX_STALLED_JIFFIES) {
			tick_do_update_jiffies64(now);
			ts->stalled_jiffies = 0;
			ts->last_tick_jiffies = READ_ONCE(jiffies);
		}
	} else {
		ts->stalled_jiffies = 0;
		ts->last_tick_jiffies = READ_ONCE(jiffies);
	}

	if (ts->inidle)
		ts->got_idle_tick = 1;
}
初始化:
start_kernel
init_timers
init_timer_cpus();
init_timer_cpu(cpu);
posix_cputimers_init_work();
open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
基于内核的jiffies
定义如下:
/*
 * A low-resolution (jiffies-based) timer: list linkage, expiry, the callback
 * to run on expiry, and flags.
 */
struct timer_list {
/*
 * All fields that change during normal runtime grouped to the
 * same cacheline
 */
struct hlist_node entry;
unsigned long expires;
void (*function)(struct timer_list *);
u32 flags;
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
};
每一个轮有N
个实例,每个实例串着一堆将要到期的定时器。
比如第一个时间轮:单位是1个jiffies
,有256个。第1个链表挂载着所有1个jiffies后到期的定时器
第二个时间轮的单位是前一个时间轮的时间之和(相当于前一个轮转一圈,本轮转一个刻度),这里是1*256=256 jiffies
,第1个链表挂载着所有256-(512-1)个jiffies后到期的定时器。
Linux内核有5个时间轮:
1 jiffies
的时间轮,包含超时时间在1 - (2^8 - 1)
内超时的定时器256(2^8) jiffies
的时间轮,包含超时时间在2^8 - (2^14 - 1)
内超时的定时器 16384(2^14) jiffies
的时间轮,包含超时时间在2^14 - (2^20 - 1)
内超时的定时器1048576(2^20) jiffies
的时间轮,包含超时时间在2^20 - (2^26 - 1)
内超时的定时器67108864(2^26) jiffies
的时间轮,包含超时时间在2^26 - (2^32 - 1)
内超时的定时器所以最高一轮可以记录超时时间为2^32
的定时器。
如何检查:假设curr_time=0x12345678
,那么下一个检查的时刻为0x12345679
第一个时间轮检查低8位
tv1.bucket[0~7]
上链表非空,则下一个检查时刻有的定时器节点超时第二个时间轮检查8~13位
tv2.bucket[8~13]
上链表非空,则下一个检查时刻有的定时器节点超时以此类推
Linux内核定义:
/* Size of each clock level */
#define LVL_BITS 6
#define LVL_SIZE (1UL << LVL_BITS) /* each wheel level has 64 (2^6) buckets */
/* Level depth */
#if HZ > 100 /* the number of wheel levels depends on the tick frequency HZ */
# define LVL_DEPTH 9
# else
# define LVL_DEPTH 8
#endif
/* Total number of buckets across all levels. */
#define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH)
/* With CONFIG_NO_HZ_COMMON there are two timer bases per CPU, otherwise one. */
#ifdef CONFIG_NO_HZ_COMMON
# define NR_BASES 2
# define BASE_STD 0
# define BASE_DEF 1
#else
# define NR_BASES 1
# define BASE_STD 0
# define BASE_DEF 0
#endif
/*
 * One timer wheel: its lock, the timer currently running, wheel clock and
 * next-expiry bookkeeping, and the WHEEL_SIZE buckets with their pending map.
 */
struct timer_base {
raw_spinlock_t lock;
struct timer_list *running_timer;
#ifdef CONFIG_PREEMPT_RT
spinlock_t expiry_lock;
atomic_t timer_waiters;
#endif
unsigned long clk;
unsigned long next_expiry;
unsigned int cpu;
bool next_expiry_recalc;
bool is_idle;
bool timers_pending;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE]; /* every queued timer hangs off one of these bucket lists */
} ____cacheline_aligned;
/* Static per-CPU variable: every CPU has its own array of NR_BASES timer_base objects. */
static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
内核用一个hrtimer
结构来表示一个高精度定时器:
/* Return value of an hrtimer callback: whether the timer must be re-armed. */
enum hrtimer_restart {
HRTIMER_NORESTART, /* timer does not need to be restarted */
HRTIMER_RESTART, /* timer must be restarted */
};
truct timerqueue_node {
struct rb_node node;
ktime_t expires; /* 硬过时事件 */
};
/* A high-resolution timer. */
struct hrtimer {
struct timerqueue_node node; /* queue node; node.expires holds the hard expiry time */
ktime_t _softexpires; /* soft expiry time */
enum hrtimer_restart (*function)(struct hrtimer *); /* callback run on expiry */
struct hrtimer_clock_base *base;
u8 state;
u8 is_rel;
u8 is_soft;
u8 is_hard;
};
/*
 * Index of each per-CPU hrtimer clock base: four clocks, each with a hard
 * variant and a _SOFT variant. HRTIMER_MAX_CLOCK_BASES is the number of bases.
 */
enum hrtimer_base_type {
HRTIMER_BASE_MONOTONIC,
HRTIMER_BASE_REALTIME,
HRTIMER_BASE_BOOTTIME,
HRTIMER_BASE_TAI,
HRTIMER_BASE_MONOTONIC_SOFT,
HRTIMER_BASE_REALTIME_SOFT,
HRTIMER_BASE_BOOTTIME_SOFT,
HRTIMER_BASE_TAI_SOFT,
HRTIMER_MAX_CLOCK_BASES,
};
/* Queue of active hrtimers for one clock on one CPU. */
struct hrtimer_clock_base {
struct hrtimer_cpu_base *cpu_base;
unsigned int index;
clockid_t clockid;
seqcount_raw_spinlock_t seq;
struct hrtimer *running; /* points to the timer currently being run */
struct timerqueue_head active; /* root of the red-black tree of active timers */
ktime_t (*get_time)(void);
ktime_t offset; /* offset of this clock from monotonic time */
} __hrtimer_clock_base_align;
/* Per-CPU hrtimer state; every CPU has one instance of this structure. */
struct hrtimer_cpu_base {
raw_spinlock_t lock;
unsigned int cpu;
unsigned int active_bases; /* bit field marking the clock bases that have active timers */
unsigned int clock_was_set_seq;
unsigned int hres_active : 1, /* high-resolution mode is active */
in_hrtirq : 1,
hang_detected : 1, /* the last hrtimer_interrupt detected a hang */
softirq_activated : 1; /* the softirq has been raised */
#ifdef CONFIG_HIGH_RES_TIMERS
unsigned int nr_events;
unsigned short nr_retries;
unsigned short nr_hangs;
unsigned int max_hang_time; /* longest time spent in hrtimer_interrupt */
#endif
#ifdef CONFIG_PREEMPT_RT
spinlock_t softirq_expiry_lock;
atomic_t timer_waiters;
#endif
ktime_t expires_next; /* next expiry time */
struct hrtimer *next_timer; /* first timer that will expire next */
ktime_t softirq_expires_next; /* next expiry time among the soft timers */
struct hrtimer *softirq_next_timer; /* first soft timer that will expire next */
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
} ____cacheline_aligned;
_softexpires - expires
期间超时,主要是为了让多个定时器在同一时间超时,避免频繁中断CPU
都有一个 hrtimer_cpu_base
结构hrtimer
系统需要通过timekeeper获取当前的时间,计算与到期时间的差值,并根据该差值,设定该cpu
的tick_device(clock_event_device)
的下一次的到期时间,在clock_event_device
的事件回调函数中处理到期的hrtimer
tick_device
,一旦开启了hrtimer
,tick_device
所关联的clock_event_device
的回调会被修改为:hrtimer_interrupt
#include <linux/hrtimer.h>
/*
 * Initialise an hrtimer.
 *
 * @clock_id: the clock the timer is measured against, one of:
 * #define CLOCK_REALTIME 0
 * #define CLOCK_MONOTONIC 1
 * #define CLOCK_PROCESS_CPUTIME_ID 2
 * #define CLOCK_THREAD_CPUTIME_ID 3
 * #define CLOCK_MONOTONIC_RAW 4
 * #define CLOCK_REALTIME_COARSE 5
 * #define CLOCK_MONOTONIC_COARSE 6
 * #define CLOCK_BOOTTIME 7
 * #define CLOCK_REALTIME_ALARM 8
 * #define CLOCK_BOOTTIME_ALARM 9
 * @mode: one of the many hrtimer_mode values; see hrtimer.h for the list
 */
void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode);
/* Install the expiry callback. */
timer.function = hr_callback;
/* Start the timer. */
void hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode);
void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode);
/* Cancel the timer. */
int hrtimer_cancel(struct hrtimer *timer);
/* Push the timer's expiry forward in steps of @interval. */
u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval);
对到期定时器处理:
jiffies
的tick
事件中断进行查询和处理(调用hrtimer_run_queues
)HRTIMER_SOFTIRQ
软中断中进行查询和处理
timekeeper
系统中的时间需要修正,系统会发出HRTIMER_SOFTIRQ
软中断clock_event_device
的到期事件时间被重新编程,系统会发出HRTIMER_SOFTIRQ
软中断clock_event_device
的到期事件中断中进行查询和处理(hrtimer_interrupt
)hrtimer_run_queues
中会判断高精度模式是否已经启用,如果已经切换到了高精度模式,什么也不做,直接返回hrtimer_interrupt功能如下:
__hrtimer_run_queues
运行所有到期定时器的回调函数tick_program_event
设置下次tick_dev
的到期事件
hrtimer: interrupt took
警告
__hrtimer_run_queues
运行太久高精度定时器中断部分处理代码:
/*
 * High-resolution timer interrupt handler (abridged in this article).
 *
 * Runs all expired hard hrtimers of this CPU, raises HRTIMER_SOFTIRQ when a
 * soft hrtimer is due, and reprograms the clock event device for the next
 * expiry. If the next expiry has already passed, the whole pass is retried up
 * to three times. After that a hang is recorded and the next event is pushed
 * out so the handler cannot monopolise the CPU.
 *
 * @dev: the clock event device that fired (not referenced in this excerpt)
 */
void hrtimer_interrupt(struct clock_event_device *dev)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
	ktime_t expires_next, now, entry_time, delta;
	unsigned long flags;
	int retries = 0;

	raw_spin_lock_irqsave(&cpu_base->lock, flags);
	/* Record when the interrupt was entered; used below to measure a hang. */
	entry_time = now = hrtimer_update_base(cpu_base);
retry:
	cpu_base->in_hrtirq = 1;
	/*
	 * Temporarily set expires_next to the maximum so that timers enqueued
	 * while we run do not try to reprogram the device.
	 */
	cpu_base->expires_next = KTIME_MAX;

	/*
	 * If now is at or past softirq_expires_next, a soft timer is due, so
	 * raise the hrtimer softirq. !ktime_before() is used rather than
	 * ktime_after() so that now == softirq_expires_next also counts.
	 */
	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
		cpu_base->softirq_expires_next = KTIME_MAX;
		cpu_base->softirq_activated = 1;
		raise_softirq_irqoff(HRTIMER_SOFTIRQ);
	}

	/*
	 * Run every expired hard timer, i.e. those queued on
	 * HRTIMER_BASE_MONOTONIC, HRTIMER_BASE_REALTIME,
	 * HRTIMER_BASE_BOOTTIME and HRTIMER_BASE_TAI.
	 */
	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);

	/*
	 * Fetch the earliest upcoming hard/soft expiry. Per the article's
	 * notes, this helper also refreshes cpu_base->softirq_expires_next
	 * and cpu_base->next_timer.
	 */
	expires_next = hrtimer_update_next_event(cpu_base);
	cpu_base->expires_next = expires_next;
	cpu_base->in_hrtirq = 0;
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

	/*
	 * Reprogram this CPU's clock event device. A non-zero return means
	 * the requested expiry has already passed (-ETIME per the article).
	 */
	if (!tick_program_event(expires_next, 0)) {
		cpu_base->hang_detected = 0;
		return;
	}

	/*
	 * The expiry is already in the past. Possible reasons:
	 * 1. __hrtimer_run_queues() above ran for too long
	 * 2. the code is being traced and that took too long
	 * 3. we run in a virtual machine that was descheduled
	 *
	 * To avoid looping in this function forever, make at most three
	 * attempts to catch up.
	 */
	raw_spin_lock_irqsave(&cpu_base->lock, flags);
	now = hrtimer_update_base(cpu_base);
	/* Count each retry exactly once. */
	cpu_base->nr_retries++;
	if (++retries < 3)
		goto retry;

	/*
	 * Still behind after three passes: record a hang and give up. The
	 * next event is deliberately pushed out by the time spent in this
	 * handler (capped at 100 ms) so that the rest of the system gets CPU
	 * time instead of being starved by back-to-back timer interrupts.
	 */
	cpu_base->nr_hangs++;
	cpu_base->hang_detected = 1;
	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

	delta = ktime_sub(now, entry_time);
	if ((unsigned int)delta > cpu_base->max_hang_time)
		cpu_base->max_hang_time = (unsigned int) delta;

	if (delta > 100 * NSEC_PER_MSEC)
		expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
	else
		expires_next = ktime_add(now, delta);
	tick_program_event(expires_next, 1);
	pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta));
}
/*
 * Run the expired timers of every active clock base selected by @active_mask.
 *
 * For each base, timers are taken in expiry order from the head of the queue
 * and run until the first one whose soft expiry is still in the future.
 */
static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now,
				 unsigned long flags, unsigned int active_mask)
{
	unsigned int active = cpu_base->active_bases & active_mask;
	struct hrtimer_clock_base *base;

	for_each_active_base(base, cpu_base, active) {
		/* Current time expressed in this base's clock. */
		ktime_t basenow = ktime_add(now, base->offset);
		struct timerqueue_node *head;

		for (;;) {
			struct hrtimer *timer;

			/* Earliest-expiring timer of this base, if any. */
			head = timerqueue_getnext(&base->active);
			if (!head)
				break;

			timer = container_of(head, struct hrtimer, node);

			/* Not yet past its soft expiry: nothing more to run here. */
			if (basenow < hrtimer_get_softexpires_tv64(timer))
				break;

			__run_hrtimer(cpu_base, base, timer, &basenow, flags);
			if (active_mask == HRTIMER_ACTIVE_SOFT)
				hrtimer_sync_wait_running(cpu_base, flags);
		}
	}
}
/*
 * Run the callback of one expired hrtimer.
 *
 * Entered with cpu_base->lock held. The lock is dropped around the callback
 * and re-taken afterwards. The timer is removed from its queue before the
 * callback runs, and is re-enqueued afterwards if the callback asked for a
 * restart and did not already enqueue it itself.
 */
static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
struct hrtimer_clock_base *base,
struct hrtimer *timer, ktime_t *now,
unsigned long flags) __must_hold(&cpu_base->lock)
{
enum hrtimer_restart (*fn)(struct hrtimer *);
bool expires_in_hardirq;
int restart;
debug_deactivate(timer);
base->running = timer;
/*
 * Separate the ->running assignment from the ->state assignment.
 *
 * As with a regular write barrier, this ensures the read side in
 * hrtimer_active() cannot observe base->running == NULL &&
 * timer->state == INACTIVE.
 */
raw_write_seqcount_barrier(&base->seq);
/* Remove the timer from the red-black tree. */
__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
fn = timer->function;
if (IS_ENABLED(CONFIG_TIME_LOW_RES))
timer->is_rel = false;
/*
 * Drop the lock for the duration of the callback. The trace/lockdep
 * calls that follow are instrumentation only and can be ignored when
 * reading the logic.
 */
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
trace_hrtimer_expire_entry(timer, now);
expires_in_hardirq = lockdep_hrtimer_enter(timer);
/* Invoke the timer's callback. */
restart = fn(timer);
/*
 * Instrumentation again (lockdep and trace event); can be ignored.
 */
lockdep_hrtimer_exit(expires_in_hardirq);
trace_hrtimer_expire_exit(timer);
/* Re-take the lock before touching the queue again. */
raw_spin_lock_irq(&cpu_base->lock);
/* Re-queue only if a restart was requested and the timer is not already enqueued. */
if (restart != HRTIMER_NORESTART &&
!(timer->state & HRTIMER_STATE_ENQUEUED))
enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS);
raw_write_seqcount_barrier(&base->seq);
WARN_ON_ONCE(base->running != timer);
base->running = NULL;
}
内核并不是一开始就工作于高精度模式,系统在启动的开始阶段,还是按照传统的模式在运行:
tick_device
按HZ频率定期地产生tick事件hrtimer
工作在低分辨率模式,到期事件在每个tick事件中断中由hrtimer_run_queues
函数处理TIMER_SOFTIRQ
中,hrtimer_run_pending
会被调用,系统在这个函数中判断系统的条件是否满足切换到高精度模式,如果条件满足,则会切换至高分辨率模式,NO_HZ
模式也是在该函数中判断并切换。
/*
 * Low-resolution path: called from the periodic tick to run expired hrtimers.
 * It is also the place where the switch to high-resolution mode is attempted.
 */
void hrtimer_run_queues(void)
{
struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
unsigned long flags;
ktime_t now;
/* Already in high-resolution mode: hrtimer_interrupt does the work instead. */
if (__hrtimer_hres_active(cpu_base))
return;
/*
 * Switch to high-resolution mode when tick_check_oneshot_change()
 * reports that it is possible. Its argument allows it to switch to
 * NOHZ instead when high-resolution timers are not enabled.
 */
if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
hrtimer_switch_to_hres();
return;
}
raw_spin_lock_irqsave(&cpu_base->lock, flags);
now = hrtimer_update_base(cpu_base);
/* A soft timer is due: let the hrtimer softirq handle it. */
if (!ktime_before(now, cpu_base->softirq_expires_next)) {
cpu_base->softirq_expires_next = KTIME_MAX;
cpu_base->softirq_activated = 1;
raise_softirq_irqoff(HRTIMER_SOFTIRQ);
}
/* Run all expired hard timers. */
__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
}
/*
 * Check whether this CPU can leave periodic tick mode.
 *
 * Returns 1 when the caller should switch to high-resolution mode. Returns 0
 * when nothing changed, when the preconditions are not met, or when
 * @allow_nohz is set and the switch to low-resolution NOHZ mode was attempted
 * here instead.
 */
int tick_check_oneshot_change(int allow_nohz)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);

	/*
	 * Bit 0 of check_clocks is set (per the article, by
	 * tick_clock_notify() and tick_oneshot_notify()) when a better
	 * clocksource or a one-shot capable clock_event_device has been
	 * registered. Nothing to do unless it was set.
	 */
	if (!test_and_clear_bit(0, &ts->check_clocks))
		return 0;

	/* A NOHZ mode is already active. */
	if (ts->nohz_mode != NOHZ_MODE_INACTIVE)
		return 0;

	/*
	 * Both must hold: timekeeping is usable for high resolution, and the
	 * tick device supports one-shot operation.
	 */
	if (!(timekeeping_valid_for_hres() && tick_is_oneshot_available()))
		return 0;

	if (allow_nohz) {
		/* Switch tick_sched to NOHZ_MODE_LOWRES. */
		tick_nohz_switch_to_nohz();
		return 0;
	}

	return 1;
}
/*
 * Switch the hrtimer subsystem of this CPU to high-resolution mode.
 */
static void hrtimer_switch_to_hres(void)
{
	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);

	/*
	 * Set up high-resolution operation. Per the article's notes this
	 * switches this CPU's tick_device to TICKDEV_MODE_ONESHOT and sets
	 * the clock_event_device handler to hrtimer_interrupt.
	 */
	if (tick_init_highres() != 0) {
		pr_warn("Could not switch to high resolution mode on CPU %u\n", cpu_base->cpu);
		return;
	}

	cpu_base->hres_active = 1;
	hrtimer_resolution = HIGH_RES_NSEC;

	/*
	 * The tick device now belongs to the hrtimer code and no longer
	 * delivers the periodic tick, so start the tick_sched timer that
	 * emulates it.
	 */
	tick_setup_sched_timer();

	/*
	 * The tick device has not yet been programmed for the next expiring
	 * timer. Retrigger the next event so that the device is programmed
	 * correctly and high-resolution operation gets going.
	 */
	retrigger_next_event(NULL);
}
void tick_setup_sched_timer(void)
{
struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
ktime_t now = ktime_get();
/* 初始化定时器 */
hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);
ts->sched_timer.function = tick_sched_timer;
/* 设置超时时间为下一个tick */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
/* Offset the tick to avert jiffies_lock contention. */
if (sched_skew_tick) {
u64 offset = TICK_NSEC >> 1;
do_div(offset, num_possible_cpus());
offset *= smp_processor_id();
hrtimer_add_expires_ns(&ts->sched_timer, offset);
}
hrtimer_forward(&ts->sched_timer, now, TICK_NSEC);
hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD);
tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
}
依赖于CONFIG_NO_HZ
,主要用于空闲时减少tick
中断,以降低功耗,这一特性也叫tickless
。
切换到NO_HZ
模式条件:cpu
的clock_event_device
设备需要支持单次触发
由 idle
进程来决定是否停止tick
周期,退出idle
时需要恢复tick
周期
和高精度定时器切换的时候一样,每个中断都会检查是否可以切换到NOHZ模式跟高精度模式
tick_handle_periodic /* 低精度模式下,定期回调该函数 */
update_process_times
run_local_timers
hrtimer_run_queues
tick_check_oneshot_change
tick_nohz_switch_to_nohz (高精度模式没使能时会调用到这里)
/*
 * Switch this CPU to low-resolution NOHZ mode. Does nothing when NOHZ is
 * disabled or when the tick device cannot be switched to one-shot mode.
 */
static void tick_nohz_switch_to_nohz(void)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	struct hrtimer *tick_timer = &ts->sched_timer;

	if (!tick_nohz_enabled)
		return;

	/*
	 * Put this CPU's tick device into one-shot mode with
	 * tick_nohz_handler as its event handler; bail out on failure.
	 */
	if (tick_switch_to_oneshot(tick_nohz_handler) != 0)
		return;

	/*
	 * The hrtimer is only used here to keep track of the next tick
	 * expiry; it is initialised but not started in this function.
	 */
	hrtimer_init(tick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_HARD);

	/* Get the next period */
	hrtimer_set_expires(tick_timer, tick_init_jiffy_update());
	hrtimer_forward_now(tick_timer, TICK_NSEC);

	tick_program_event(hrtimer_get_expires(tick_timer), 1);
	tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
}
低精度模式,NOHZ回调函数:
/*
 * Clock event handler used in low-resolution NOHZ mode: does the tick work
 * and programs the device for the next tick unless the tick is stopped.
 */
static void tick_nohz_handler(struct clock_event_device *dev)
{
	struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
	struct pt_regs *irq_regs = get_irq_regs();
	struct hrtimer *tick_timer = &ts->sched_timer;
	ktime_t now = ktime_get();

	dev->next_event = KTIME_MAX;

	tick_sched_do_timer(ts, now);
	tick_sched_handle(ts, irq_regs);

	/* No need to reprogram if we are running tickless */
	if (unlikely(ts->tick_stopped))
		return;

	/* Advance the recorded expiry by one tick and arm the device with it. */
	hrtimer_forward(tick_timer, now, TICK_NSEC);
	tick_program_event(hrtimer_get_expires(tick_timer), 1);
}
其功能如下:
tick_sched_do_timer
,完成jiffies
的更新tick_sched_handle
update_process_times
SOFTIRQ
scheduler_tick
,通过scheduler_tick
触发调度系统进行进程统计和调度工作高精度模式下,tick_device
被高精度定时器中断接管,所以原来的tick
事件由tick_sched
来提供:
/*
 * As described earlier, tick_sched_timer is the callback of the tick_sched
 * hrtimer. It emulates the periodic tick in high-resolution mode and re-arms
 * itself one tick period later unless the tick is stopped.
 */
static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
{
	struct tick_sched *ts = container_of(timer, struct tick_sched,
					     sched_timer);
	struct pt_regs *irq_regs = get_irq_regs();
	ktime_t now = ktime_get();

	tick_sched_do_timer(ts, now);

	/*
	 * Do not call, when we are not in irq context and have
	 * no valid regs pointer
	 */
	if (!irq_regs)
		ts->next_tick = 0;
	else
		tick_sched_handle(ts, irq_regs);

	/* No need to reprogram if we are in idle or full dynticks mode */
	if (unlikely(ts->tick_stopped))
		return HRTIMER_NORESTART;

	hrtimer_forward(timer, now, TICK_NSEC);

	return HRTIMER_RESTART;
}
可以看到,tick_sched
的回调函数和tick_nohz_handler
函数基本一致,都是调用了 tick_sched_do_timer
和 tick_sched_handle
函数