There are two main data structures on the hardware timer side: struct clocksource and struct clock_event_device. The latter describes a device that can be programmed to deliver timer interrupts:

struct clock_event_device {
    const char *name;
    unsigned int features;
    unsigned long max_delta_ns;
    unsigned long min_delta_ns;
    unsigned long mult;
    int shift;
    int rating;
    int irq;
    cpumask_t cpumask;
    int (*set_next_event)(unsigned long evt, struct clock_event_device *);
    void (*set_mode)(enum clock_event_mode mode, struct clock_event_device *);
    void (*event_handler)(struct clock_event_device *);
    void (*broadcast)(cpumask_t mask);
    struct list_head list;
    enum clock_event_mode mode;
    ktime_t next_event;
};

Both structures carry fairly detailed comments in the kernel sources, in clocksource.h and clockchips.h respectively. Pay particular attention to the event_handler member of struct clock_event_device: it specifies what the kernel does when the hardware timer interrupt arrives, in other words it is the real timer interrupt handler.
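For comparison, the core fields of struct clocksource in 2.6.23 look roughly as follows (an abridged sketch from memory; the timekeeping-internal fields are omitted and details may differ slightly in your tree):

struct clocksource {
    char *name;                 /* e.g. "tsc", "hpet", "pit" */
    struct list_head list;      /* linked into clocksource_list */
    int rating;                 /* quality rating: higher is preferred */
    cycle_t (*read)(void);      /* read the current counter value */
    cycle_t mask;               /* bitmask of valid counter bits */
    u32 mult;                   /* cycles-to-nanoseconds multiplier */
    u32 shift;                  /* cycles-to-nanoseconds shift */
    unsigned long flags;
    /* ... timekeeping bookkeeping fields omitted ... */
};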
The Linux kernel maintains two linked lists that hold, respectively, all clock sources and all clock event devices registered in the system. Their list heads are clocksource_list and clockevent_devices. Figure 2-1 shows these two lists.
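The list heads themselves are plain static lists in kernel/time/ (reproduced from memory); the helper at the end is purely illustrative and is not kernel code:

/* kernel/time/clocksource.c */
static LIST_HEAD(clocksource_list);

/* kernel/time/clockevents.c */
static LIST_HEAD(clockevent_devices);
static RAW_NOTIFIER_HEAD(clockevents_chain);    /* used below when event_handler is installed */

/* Illustrative only: how the registered clock event devices could be walked. */
static void dump_clockevent_devices(void)
{
    struct clock_event_device *dev;

    list_for_each_entry(dev, &clockevent_devices, list)
        printk(KERN_INFO "clockevent: %s, rating %d\n", dev->name, dev->rating);
}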
/*
 * This is called directly from init code; we must delay timer setup in the
 * HPET case as we can't make the decision to turn on HPET this early in the
 * boot process.
 *
 * The chosen time_init function will usually be hpet_time_init, above, but
 * in the case of virtual hardware, an alternative function may be substituted.
 */
void __init time_init(void)
{
    tsc_init();
    late_time_init = choose_time_init();
}
/*
 * This is the same as the above, except we _also_ save the current
 * Time Stamp Counter value at the time of the timer interrupt, so that
 * we later on can estimate the time of day more exactly.
 */
irqreturn_t timer_interrupt(int irq, void *dev_id)
{
#ifdef CONFIG_X86_IO_APIC
    if (timer_ack) {
        /*
         * Subtle, when I/O APICs are used we have to ack timer IRQ
         * manually to reset the IRR bit for do_slow_gettimeoffset().
         * This will also deassert NMI lines for the watchdog if run
         * on an 82489DX-based system.
         */
        spin_lock(&i8259A_lock);
        outb(0x0c, PIC_MASTER_OCW3);
        /* Ack the IRQ; AEOI will end it automatically. */
        inb(PIC_MASTER_POLL);
        spin_unlock(&i8259A_lock);
    }
#endif

    do_timer_interrupt_hook();

    if (MCA_bus) {
        /* The PS/2 uses level-triggered interrupts. You can't
           turn them off, nor would you want to (any attempt to
           enable edge-triggered interrupts usually gets intercepted by a
           special hardware circuit). Hence we have to acknowledge
           the timer interrupt. Through some incredibly stupid
           design idea, the reset for IRQ 0 is done by setting the
           high bit of the PPI port B (0x61). Note that some PS/2s,
           notably the 55SX, work fine if this is removed. */
        u8 irq_v = inb_p(0x61);        /* read the current state */
        outb_p(irq_v | 0x80, 0x61);    /* reset the IRQ */
    }

    return IRQ_HANDLED;
}
How dev->event_handler gets initialised:

clockevents_register_device()
  -> clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev)
    -> raw_notifier_call_chain(&clockevents_chain, reason, dev)
      -> __raw_notifier_call_chain(nh, val, v, -1, NULL)
        -> notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls):
               ...
               ret = nb->notifier_call(nb, val, v);
               /* Note that the argument v points to the clock_event_device
                * passed to clockevents_register_device() (for example
                * global_clock_event), and nb comes from &clockevents_chain. */

The notifier_call pointer itself is registered earlier, in start_kernel() -> tick_init(), which calls clockevents_register_notifier(&tick_notifier), where tick_notifier is defined as:

static struct notifier_block tick_notifier = {
    .notifier_call = tick_notify,
};

And that is still not the end of the story:

tick_notify()
  -> tick_check_new_device()
    -> tick_setup_device()
      -> tick_setup_periodic()
        -> tick_set_periodic_handler()

Only here is dev->event_handler() finally initialised.
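For reference, tick_set_periodic_handler() is essentially the following (quoted from memory, so treat it as a sketch rather than an exact copy of the 2.6.23 source):

/* Install the handler the tick device will call on every timer interrupt. */
void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
{
    if (!broadcast)
        dev->event_handler = tick_handle_periodic;
    else
        dev->event_handler = tick_handle_periodic_broadcast;
}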
void tick_handle_periodic(struct clock_event_device *dev)
{
    int cpu = smp_processor_id();
    ktime_t next;

    tick_periodic(cpu);

    if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
        return;
    /*
     * Setup the next period for devices, which do not have
     * periodic mode:
     */
    next = ktime_add(dev->next_event, tick_period);
    for (;;) {
        if (!clockevents_program_event(dev, next, ktime_get()))
            return;
        tick_periodic(cpu);
        next = ktime_add(next, tick_period);
    }
}

The tick_periodic() call performs the familiar series of per-tick updates, including updating the running process's time slice and so on:

static void tick_periodic(int cpu)
{
    if (tick_do_timer_cpu == cpu) {
        write_seqlock(&xtime_lock);
        /* Keep track of the next tick event */
        tick_next_period = ktime_add(tick_next_period, tick_period);

        do_timer(1);
        write_sequnlock(&xtime_lock);
    }

    /* Update the clock-related bookkeeping of the currently running process. */
    update_process_times(user_mode(get_irq_regs()));
    profile_tick(CPU_PROFILING);
}
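update_process_times() is the bridge from the periodic tick to both the software timers and the scheduler. In 2.6.23 it looks roughly like this (abridged from memory; the time-accounting calls in particular may differ slightly in your tree):

void update_process_times(int user_tick)
{
    struct task_struct *p = current;
    int cpu = smp_processor_id();

    /* Charge this tick to the current task as user or system time. */
    if (user_tick)
        account_user_time(p, jiffies_to_cputime(1));
    else
        account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
    run_local_timers();        /* raises TIMER_SOFTIRQ, which ends up in __run_timers() */
    if (rcu_pending(cpu))
        rcu_check_callbacks(cpu, user_tick);
    scheduler_tick();
    run_posix_cpu_timers(p);
}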
static inline void __run_timers(tvec_base_t *base)
{
    struct timer_list *timer;

    spin_lock_irq(&base->lock);
    /*
     * Timer-processing loop: compare the global jiffies with the base's own
     * timer_jiffies. If jiffies is larger, there are ticks whose timers still
     * have to be processed; otherwise no timer has expired yet. jiffies can
     * run ahead of base->timer_jiffies, for example when interrupts were
     * disabled long enough for tick interrupts to be lost.
     */
    while (time_after_eq(jiffies, base->timer_jiffies)) {
        /* Declare and initialise a local list for the expired timers. */
        struct list_head work_list;
        struct list_head *head = &work_list;
        int index = base->timer_jiffies & TVR_MASK;

        /*
         * Cascade timers:
         * when index == 0, tv1 has gone through a full round, so refill tv1
         * from tv2; if the tv2 slot is empty, refill tv2 from tv3, and so on
         * up to tv5.
         */
        if (!index &&
            (!cascade(base, &base->tv2, INDEX(0))) &&
                (!cascade(base, &base->tv3, INDEX(1))) &&
                    !cascade(base, &base->tv4, INDEX(2)))
            cascade(base, &base->tv5, INDEX(3));
        /* Advance base->timer_jiffies. */
        ++base->timer_jiffies;
        /* Move the entries of base->tv1.vec[index] onto work_list and leave
         * the original slot empty. */
        list_replace_init(base->tv1.vec + index, &work_list);
        /*
         * Every timer chained on this list has expired: call each timer's
         * handler and remove it from the list, until the list is empty.
         */
        while (!list_empty(head)) {
            void (*fn)(unsigned long);
            unsigned long data;

            /* Take the first entry, run its handler and detach the timer. */
            timer = list_first_entry(head, struct timer_list, entry);
            fn = timer->function;
            data = timer->data;

            timer_stats_account_timer(timer);

            set_running_timer(base, timer);
            detach_timer(timer, 1);
            spin_unlock_irq(&base->lock);
            {
                int preempt_count = preempt_count();
                fn(data);
                if (preempt_count != preempt_count()) {
                    printk(KERN_WARNING "huh, entered %p "
                           "with preempt_count %08x, exited"
                           " with %08x?\n",
                           fn, preempt_count,
                           preempt_count());
                    BUG();
                }
            }
            spin_lock_irq(&base->lock);
        }
    }
    set_running_timer(base, NULL);
    spin_unlock_irq(&base->lock);
}
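The cascade() helper used above empties one slot of a higher-level wheel and re-sorts every timer on it through internal_add_timer(), which pushes each timer down into a lower level. Roughly (quoted from memory, details may differ):

static int cascade(tvec_base_t *base, tvec_t *tv, int index)
{
    /* Cascade all the timers from this tv slot down one level. */
    struct timer_list *timer, *tmp;
    struct list_head tv_list;

    list_replace_init(tv->vec + index, &tv_list);

    /*
     * We are removing _all_ timers from the list, so we
     * don't have to detach them individually.
     */
    list_for_each_entry_safe(timer, tmp, &tv_list, entry)
        internal_add_timer(base, timer);

    return index;
}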
void scheduler_tick(void)
{
    int cpu = smp_processor_id();
    struct rq *rq = cpu_rq(cpu);
    struct task_struct *curr = rq->curr;
    u64 next_tick = rq->tick_timestamp + TICK_NSEC;

    spin_lock(&rq->lock);
    __update_rq_clock(rq);
    /*
     * Let rq->clock advance by at least TICK_NSEC:
     */
    if (unlikely(rq->clock < next_tick))
        rq->clock = next_tick;
    rq->tick_timestamp = rq->clock;
    update_cpu_load(rq);
    if (curr != rq->idle) /* FIXME: needed? */
        curr->sched_class->task_tick(rq, curr);
    spin_unlock(&rq->lock);

#ifdef CONFIG_SMP
    rq->idle_at_tick = idle_cpu(cpu);
    trigger_load_balance(rq, cpu);
#endif
}
Software timer handling
The principle behind software timers is fairly simple: each time a hardware timer interrupt arrives, the kernel updates jiffies and then compares it with the expiry times of the software timers. If jiffies is equal to or greater than a timer's expiry time, the kernel runs the function that timer specifies.
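Seen from a driver, the API is straightforward. A minimal, hypothetical sketch (my_timer, my_timer_fn and the two-second delay are made up for illustration):

#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list my_timer;              /* hypothetical timer */

static void my_timer_fn(unsigned long data)
{
    /* Runs in softirq context once the timer expires. */
    printk(KERN_INFO "my_timer fired, data=%lu\n", data);
}

static void my_timer_arm(void)
{
    setup_timer(&my_timer, my_timer_fn, 0);
    /* Expire roughly two seconds from now. */
    mod_timer(&my_timer, jiffies + 2 * HZ);
}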
The next few sections look in detail at how Linux 2.6.23 implements software timers.
Main members of struct timer_list
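In 2.6.23 the structure itself is small; abridged from memory (the optional CONFIG_TIMER_STATS fields are omitted):

struct timer_list {
    struct list_head entry;             /* links the timer into one of the tv1~tv5 lists */
    unsigned long expires;              /* absolute expiry time, in jiffies */

    void (*function)(unsigned long);    /* handler called when the timer expires */
    unsigned long data;                 /* argument passed to function */

    struct tvec_base *base;             /* per-CPU timer base (tvec_base_t in the 2.6.23 source) */
};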
Members of struct tvec_base
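A sketch of the per-CPU timer base in 2.6.23, abridged and reproduced from memory (in the 2.6.23 sources the struct is actually declared through typedefs and named tvec_base_t, which is the name used in the code excerpts above and below):

struct tvec_base {
    spinlock_t lock;                    /* protects this base */
    struct timer_list *running_timer;   /* timer whose handler is currently executing */
    unsigned long timer_jiffies;        /* last tick processed by __run_timers() */
    struct tvec_root tv1;               /* 256 slots: timers due within the next 256 ticks */
    struct tvec tv2;                    /* 64 slots, each spanning 256 ticks */
    struct tvec tv3;
    struct tvec tv4;
    struct tvec tv5;
} ____cacheline_aligned;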
Here tv1 has type struct tvec_root, while tv2~tv5 have type struct tvec.
struct tvec {
    struct list_head vec[TVN_SIZE];
};

struct tvec_root {
    struct list_head vec[TVR_SIZE];
};
As the definitions show, these are really just arrays of struct list_head; TVN_SIZE and TVR_SIZE are defined as 64 and 256 respectively when CONFIG_BASE_SMALL is not configured.
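The sizes come from the following macros in kernel/timer.c (quoted from memory, but they should match the 2.6.23 source closely):

#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6)
#define TVR_BITS (CONFIG_BASE_SMALL ? 6 : 8)
#define TVN_SIZE (1 << TVN_BITS)
#define TVR_SIZE (1 << TVR_BITS)
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)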
The figure makes this clear: software timers (struct timer_list, shown as timer in the figure) are linked into doubly linked lists (struct list_head) and stored in the appropriate level (tv1~tv5) according to their expiry time. tv1 holds all software timers that expire within the next 256 ticks of timer_jiffies; tv2 holds those expiring within the next 256*64 ticks; tv3 within the next 256*64*64 ticks; tv4 within the next 256*64*64*64 ticks; and tv5 within the next 256*64*64*64*64 ticks. More concretely, taking a static view with timer_jiffies equal to 0: tv1[0] holds the timers that are due right now (expiry time equal to timer_jiffies) and must be processed immediately, tv1[1] holds those that expire when the next tick arrives, and in general tv1[n] (0 <= n <= 255) holds those that expire n ticks from now. At the second level, tv2[1] holds the timers expiring between 256 and 511 ticks from now, tv2[2] those expiring between 512 and 767 ticks from now, and in general tv2[n] (1 <= n <= 63) those expiring between 256*n and 256*(n+1)-1 ticks from now. tv3~tv5 follow the same pattern, each level covering 64 times the span of the level below it.
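A quick worked example using the internal_add_timer() arithmetic shown further below (the value 300 is an arbitrary illustration): with timer_jiffies = 0, a timer armed with expires = 300 gives idx = 300 - 0 = 300, which is not less than TVR_SIZE (256) but is less than 1 << (TVR_BITS + TVN_BITS) = 16384, so the timer goes into tv2 at slot (expires >> TVR_BITS) & TVN_MASK = (300 >> 8) & 63 = 1, i.e. tv2[1], matching the 256..511 range above. Once timer_jiffies reaches 256, __run_timers() cascades that slot back through internal_add_timer(): now idx = 300 - 256 = 44 < 256, so the timer lands in tv1[300 & 255] = tv1[44] and finally runs when timer_jiffies reaches 300.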
Note: this chapter is largely taken from the IBM developerWorks article below; the next chapter mainly analyses the software-timer interrupt code.
Reference: http://www.ibm.com/developerworks/cn/linux/l-cn-clocks/index.html
int mod_timer(struct timer_list *timer, unsigned long expires)
{
    /* A timer without a handler function is a bug. */
    BUG_ON(!timer->function);

    timer_stats_timer_set_start_info(timer);
    /*
     * This is a common optimization triggered by the
     * networking code - if the timer is re-modified
     * to be the same thing then just return:
     */
    /* If the requested expiry equals the current one and the timer is
     * already pending, there is nothing to do. */
    if (timer->expires == expires && timer_pending(timer))
        return 1;

    /* Otherwise call __mod_timer(), analysed below. */
    return __mod_timer(timer, expires);
}
int __mod_timer(struct timer_list *timer, unsigned long expires)
{
    tvec_base_t *base, *new_base;
    unsigned long flags;
    int ret = 0;

    timer_stats_timer_set_start_info(timer);
    BUG_ON(!timer->function);

    base = lock_timer_base(timer, &flags);

    if (timer_pending(timer)) {
        detach_timer(timer, 0);
        ret = 1;
    }

    /* Get the tvec_base of the current CPU. */
    new_base = __get_cpu_var(tvec_bases);

    if (base != new_base) {
        if (likely(base->running_timer != timer)) {
            /* See the comment in lock_timer_base() */
            timer_set_base(timer, NULL);
            spin_unlock(&base->lock);
            base = new_base;
            spin_lock(&base->lock);
            timer_set_base(timer, base);
        }
    }

    timer->expires = expires;
    internal_add_timer(base, timer);
    spin_unlock_irqrestore(&base->lock, flags);

    return ret;
}
static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
{
    unsigned long expires = timer->expires;
    unsigned long idx = expires - base->timer_jiffies;
    struct list_head *vec;

    if (idx < TVR_SIZE) {
        int i = expires & TVR_MASK;
        vec = base->tv1.vec + i;
    } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
        int i = (expires >> TVR_BITS) & TVN_MASK;
        vec = base->tv2.vec + i;
    } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
        int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
        vec = base->tv3.vec + i;
    } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
        int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
        vec = base->tv4.vec + i;
    } else if ((signed long) idx < 0) {
        /*
         * Can happen if you add a timer with expires == jiffies,
         * or you set a timer to go off in the past
         */
        vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
    } else {
        int i;
        /* If the timeout is larger than 0xffffffff on 64-bit
         * architectures then we use the maximum timeout:
         */
        if (idx > 0xffffffffUL) {
            idx = 0xffffffffUL;
            expires = idx + base->timer_jiffies;
        }
        i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
        vec = base->tv5.vec + i;
    }
    /*
     * Timers are FIFO:
     */
    list_add_tail(&timer->entry, vec);
}
static inline void __run_timers(struct tvec_base *base)
{
    ...
    spin_lock_irq(&base->lock);
    while (time_after_eq(jiffies, base->timer_jiffies)) {
        ...
        int index = base->timer_jiffies & TVR_MASK;

        if (!index &&
            (!cascade(base, &base->tv2, INDEX(0))) &&
                (!cascade(base, &base->tv3, INDEX(1))) &&
                    !cascade(base, &base->tv4, INDEX(2)))
            cascade(base, &base->tv5, INDEX(3));
        ++base->timer_jiffies;
        list_replace_init(base->tv1.vec + index, &work_list);
        while (!list_empty(head)) {
            ...
            timer = list_first_entry(head, struct timer_list, entry);
            fn = timer->function;
            data = timer->data;
            ...
            set_running_timer(base, timer);
            detach_timer(timer, 1);
            spin_unlock_irq(&base->lock);
            {
                int preempt_count = preempt_count();
                fn(data);
                ...
            }
            spin_lock_irq(&base->lock);
        }
    }
    set_running_timer(base, NULL);
    spin_unlock_irq(&base->lock);
}
fastcall signed long __sched schedule_timeout(signed long timeout)
{
    struct timer_list timer;
    unsigned long expire;

    switch (timeout)
    {
    case MAX_SCHEDULE_TIMEOUT:
        /*
         * These two special cases are useful to be comfortable
         * in the caller. Nothing more. We could take
         * MAX_SCHEDULE_TIMEOUT from one of the negative value
         * but I'd like to return a valid offset (>=0) to allow
         * the caller to do everything it want with the retval.
         */
        schedule();
        goto out;
    default:
        /*
         * Another bit of PARANOID. Note that the retval will be
         * 0 since no piece of kernel is supposed to do a check
         * for a negative retval of schedule_timeout() (since it
         * should never happens anyway). You just have the printk()
         * that will tell you if something is gone wrong and where.
         */
        if (timeout < 0) {
            printk(KERN_ERR "schedule_timeout: wrong timeout "
                   "value %lx\n", timeout);
            dump_stack();
            current->state = TASK_RUNNING;
            goto out;
        }
    }

    expire = timeout + jiffies;

    setup_timer(&timer, process_timeout, (unsigned long)current);
    __mod_timer(&timer, expire);
    schedule();
    del_singleshot_timer_sync(&timer);

    timeout = expire - jiffies;

 out:
    return timeout < 0 ? 0 : timeout;
}
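Typical usage of schedule_timeout(): the caller sets its task state first so that schedule() really sleeps until either the timer fires or the task is woken earlier. A hypothetical helper as illustration (assumes <linux/sched.h>):

/* Sleep for up to 'secs' seconds, interruptibly; the helper name is made up. */
static signed long my_sleep(unsigned int secs)
{
    set_current_state(TASK_INTERRUPTIBLE);
    /* Returns 0 on full timeout, or the remaining jiffies if woken early. */
    return schedule_timeout(secs * HZ);
}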