本文以Linux3.10版本为例进行学习,在Linux3.10的内核调度中,Linux的进程的调度时机主要如下几个时机(根据文档内容翻译)
__schedule() 是主要的调度函数。
驱动调度器从而进入这个函数的主要手段是:
1. 显式阻塞:mutex、semaphore、waitqueue等。
2. TIF_NEED_RESCHED 标志在中断和用户空间返回时被检查
路径。例如,请参见 arch/x86/entry_64.S。
为了驱动任务之间的抢占,调度器在定时器中设置标志
中断处理程序 scheduler_tick()。
3. 唤醒并不会真正导致进入 schedule()。他们添加了一个
任务到运行队列,就是这样。
现在,如果添加到运行队列的新任务抢占了当前任务
任务,然后唤醒设置 TIF_NEED_RESCHED 和 schedule() 获取
在最近的可能场合致电:
- 如果内核是可抢占的(CONFIG_PREEMPT=y):
- 在系统调用或异常上下文中,在下一个最外面
抢占启用()。 (这可能是唤醒()的尽快
自旋解锁()!)
- 在 IRQ 上下文中,从中断处理程序返回到
可抢占的上下文
- 如果内核不可抢占(CONFIG_PREEMPT 未设置)
然后在下一个:
- cond_resched() 调用
- 显式 schedule() 调用
- 从系统调用或异常返回到用户空间
- 从中断处理程序返回到用户空间
在显示调度的场景下,例如进程的退出、获取锁或者信号量的情况下等都会进行主动的进程调度。
举例以futex为例,在使用futex_wait的时候实际上的调用逻辑如下:
SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
struct timespec __user *, utime, u32 __user *, uaddr2,
u32, val3)
{
...
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
...
switch (cmd) {
...
return futex_wait(uaddr, flags, val, timeout, val3);
...
}
static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
ktime_t *abs_time, u32 bitset)
{
struct hrtimer_sleeper timeout, *to = NULL;
struct restart_block *restart;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
int ret;
if (!bitset)
return -EINVAL;
q.bitset = bitset;
if (abs_time) {
to = &timeout;
hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
CLOCK_REALTIME : CLOCK_MONOTONIC,
HRTIMER_MODE_ABS);
hrtimer_init_sleeper(to, current);
hrtimer_set_expires_range_ns(&to->timer, *abs_time,
current->timer_slack_ns);
}
...
/* queue_me and wait for wakeup, timeout, or a signal. */
futex_wait_queue_me(hb, &q, to);
...
}
static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
struct hrtimer_sleeper *timeout)
{
/*
* The task state is guaranteed to be set before another task can
* wake it. set_current_state() is implemented using set_mb() and
* queue_me() calls spin_unlock() upon completion, both serializing
* access to the hash list and forcing another memory barrier.
*/
set_current_state(TASK_INTERRUPTIBLE);
queue_me(q, hb);
/* Arm the timer */
if (timeout) {
hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
if (!hrtimer_active(&timeout->timer))
timeout->task = NULL;
}
/*
* If we have been removed from the hash list, then another task
* has tried to wake us, and we can skip the call to schedule().
*/
if (likely(!plist_node_empty(&q->list))) {
/*
* If the timer has already expired, current will already be
* flagged for rescheduling. Only call schedule if there
* is no timeout, or if it has yet to expire.
*/
if (!timeout || timeout->task)
schedule(); // 等待进行主动的进程调度
}
__set_current_state(TASK_RUNNING);
}
从整个的流程来看显示的调用场景基本上都会调用到schedule()函数。
有关semaphore相关的操作也可查看semaphore.c文件来看当中也调用了schedule_timeout来主动调度。
该标志位代表着该进程需要进行调度。在进程等待被唤醒或者在运行时间到了需要调度的情况下进行调度。我们简单的来了解一下在运行时间超时的情况下的调度机制。
在操作系统上面时间中断函数。该函数主要如下:
/*
* Default timer interrupt handler for PIT/HPET
*/
static irqreturn_t timer_interrupt(int irq, void *dev_id)
{
global_clock_event->event_handler(global_clock_event);
return IRQ_HANDLED;
}
static struct irqaction irq0 = {
.handler = timer_interrupt,
.flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_IRQPOLL | IRQF_TIMER,
.name = "timer"
};
因为在初始化阶段会进行time_init的初始化:
void __init setup_default_timer_irq(void)
{
setup_irq(0, &irq0);
}
/* Default timer init function */
void __init hpet_time_init(void)
{
if (!hpet_enable())
setup_pit_timer();
setup_default_timer_irq();
}
static __init void x86_late_time_init(void)
{
x86_init.timers.timer_init();
tsc_init();
}
/*
* Initialize TSC and delay the periodic timer init to
* late x86_late_time_init() so ioremap works.
*/
void __init time_init(void)
{
late_time_init = x86_late_time_init;
}
最终会调用到setup_pit_timer函数来建立时间中断的处理函数。
void __init setup_pit_timer(void)
{
clockevent_i8253_init(true);
global_clock_event = &i8253_clockevent;
}
...
void __init clockevent_i8253_init(bool oneshot)
{
if (oneshot)
i8253_clockevent.features |= CLOCK_EVT_FEAT_ONESHOT;
/*
* Start pit with the boot cpu mask. x86 might make it global
* when it is used as broadcast device later.
*/
i8253_clockevent.cpumask = cpumask_of(smp_processor_id());
clockevents_config_and_register(&i8253_clockevent, PIT_TICK_RATE,
0xF, 0x7FFF);
}
...
void clockevents_config_and_register(struct clock_event_device *dev,
u32 freq, unsigned long min_delta,
unsigned long max_delta)
{
dev->min_delta_ticks = min_delta;
dev->max_delta_ticks = max_delta;
clockevents_config(dev, freq);
clockevents_register_device(dev);
}
EXPORT_SYMBOL_GPL(clockevents_config_and_register);
...
/**
* clockevents_register_device - register a clock event device
* @dev: device to register
*/
void clockevents_register_device(struct clock_event_device *dev)
{
unsigned long flags;
BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
if (!dev->cpumask) {
WARN_ON(num_possible_cpus() > 1);
dev->cpumask = cpumask_of(smp_processor_id());
}
raw_spin_lock_irqsave(&clockevents_lock, flags);
list_add(&dev->list, &clockevent_devices);
clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
clockevents_notify_released();
raw_spin_unlock_irqrestore(&clockevents_lock, flags);
}
EXPORT_SYMBOL_GPL(clockevents_register_device);
...
/*
* Notify about a clock event change. Called with clockevents_lock
* held.
*/
static void clockevents_do_notify(unsigned long reason, void *dev)
{
raw_notifier_call_chain(&clockevents_chain, reason, dev);
}
通过clockevents来注册时间中断函数,最终调用了clockevents_do_notify来设置回调的handle。有关chain的回调注册如下:
static struct notifier_block tick_notifier = {
.notifier_call = tick_notify,
};
/**
* tick_init - initialize the tick control
*
* Register the notifier with the clockevents framework
*/
void __init tick_init(void)
{
clockevents_register_notifier(&tick_notifier);
tick_broadcast_init();
}
/*
* Notification about clock event devices
*/
static int tick_notify(struct notifier_block *nb, unsigned long reason,
void *dev)
{
switch (reason) {
case CLOCK_EVT_NOTIFY_ADD:
return tick_check_new_device(dev); // 添加任务注册任务
case CLOCK_EVT_NOTIFY_BROADCAST_ON:
case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
tick_broadcast_on_off(reason, dev);
break;
case CLOCK_EVT_NOTIFY_BROADCAST_ENTER:
case CLOCK_EVT_NOTIFY_BROADCAST_EXIT:
tick_broadcast_oneshot_control(reason);
break;
case CLOCK_EVT_NOTIFY_CPU_DYING:
tick_handover_do_timer(dev);
break;
case CLOCK_EVT_NOTIFY_CPU_DEAD:
tick_shutdown_broadcast_oneshot(dev);
tick_shutdown_broadcast(dev);
tick_shutdown(dev);
break;
case CLOCK_EVT_NOTIFY_SUSPEND:
tick_suspend();
tick_suspend_broadcast();
break;
case CLOCK_EVT_NOTIFY_RESUME:
tick_resume();
break;
default:
break;
}
return NOTIFY_OK;
}
最终的handler调用了tick_check_new_device函数。
/*
* Check, if the new registered device should be used.
*/
static int tick_check_new_device(struct clock_event_device *newdev)
{
struct clock_event_device *curdev;
struct tick_device *td;
int cpu, ret = NOTIFY_OK;
...
clockevents_exchange_device(curdev, newdev);
tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); // 建立tick设备
...
return ret;
}
...
/*
* Setup the tick device
*/
static void tick_setup_device(struct tick_device *td,
struct clock_event_device *newdev, int cpu,
const struct cpumask *cpumask)
{
ktime_t next_event;
void (*handler)(struct clock_event_device *) = NULL;
/*
* First device setup ?
*/
if (!td->evtdev) {
/*
* If no cpu took the do_timer update, assign it to
* this cpu:
*/
if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
if (!tick_nohz_full_cpu(cpu))
tick_do_timer_cpu = cpu;
else
tick_do_timer_cpu = TICK_DO_TIMER_NONE;
tick_next_period = ktime_get();
tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
}
/*
* Startup in periodic mode first.
*/
td->mode = TICKDEV_MODE_PERIODIC;
} else {
handler = td->evtdev->event_handler;
next_event = td->evtdev->next_event;
td->evtdev->event_handler = clockevents_handle_noop;
}
td->evtdev = newdev;
/*
* When the device is not per cpu, pin the interrupt to the
* current cpu:
*/
if (!cpumask_equal(newdev->cpumask, cpumask))
irq_set_affinity(newdev->irq, cpumask);
/*
* When global broadcasting is active, check if the current
* device is registered as a placeholder for broadcast mode.
* This allows us to handle this x86 misfeature in a generic
* way.
*/
if (tick_device_uses_broadcast(newdev, cpu))
return;
if (td->mode == TICKDEV_MODE_PERIODIC)
tick_setup_periodic(newdev, 0); // 建立周期的handler
else
tick_setup_oneshot(newdev, handler, next_event);
}
/*
* Setup the device for a periodic tick
*/
void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
{
tick_set_periodic_handler(dev, broadcast); // 设置回调handler
/* Broadcast setup ? */
if (!tick_device_is_functional(dev))
return;
if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
!tick_broadcast_oneshot_active()) { // 是否需要手动设置下次时间handler
clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
} else {
unsigned long seq;
ktime_t next;
do {
seq = read_seqbegin(&jiffies_lock);
next = tick_next_period;
} while (read_seqretry(&jiffies_lock, seq));
clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
for (;;) {
if (!clockevents_program_event(dev, next, false))
return;
next = ktime_add(next, tick_period); // 设置下次时间
}
}
}
...
/*
* Set the periodic handler depending on broadcast on/off
*/
void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
{
if (!broadcast)
dev->event_handler = tick_handle_periodic;
else
dev->event_handler = tick_handle_periodic_broadcast;
}
周期处理函数如下:
/*
* Event handler for periodic ticks
*/
void tick_handle_periodic(struct clock_event_device *dev)
{
int cpu = smp_processor_id();
ktime_t next;
tick_periodic(cpu);
if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
return;
/*
* Setup the next period for devices, which do not have
* periodic mode: 是否需要设置下一次的中断时间
*/
next = ktime_add(dev->next_event, tick_period);
for (;;) {
if (!clockevents_program_event(dev, next, false))
return;
/*
* Have to be careful here. If we're in oneshot mode,
* before we call tick_periodic() in a loop, we need
* to be sure we're using a real hardware clocksource.
* Otherwise we could get trapped in an infinite
* loop, as the tick_periodic() increments jiffies,
* when then will increment time, posibly causing
* the loop to trigger again and again.
*/
if (timekeeping_valid_for_hres())
tick_periodic(cpu);
next = ktime_add(next, tick_period);
}
}
...
/*
* Periodic tick
*/
static void tick_periodic(int cpu)
{
if (tick_do_timer_cpu == cpu) {
write_seqlock(&jiffies_lock);
/* Keep track of the next tick event */
tick_next_period = ktime_add(tick_next_period, tick_period);
do_timer(1);
write_sequnlock(&jiffies_lock);
}
update_process_times(user_mode(get_irq_regs())); // 更新进程的时间
profile_tick(CPU_PROFILING);
}
...
/*
* Called from the timer interrupt handler to charge one tick to the current
* process. user_tick is 1 if the tick is user time, 0 for system.
*/
void update_process_times(int user_tick)
{
struct task_struct *p = current;
int cpu = smp_processor_id();
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
rcu_check_callbacks(cpu, user_tick);
#ifdef CONFIG_IRQ_WORK
if (in_irq())
irq_work_run();
#endif
scheduler_tick();
run_posix_cpu_timers(p);
}
...
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
*/
void scheduler_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
sched_clock_tick();
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
update_cpu_load_active(rq);
curr->sched_class->task_tick(rq, curr, 0); // 调用当前进程的调度策略类进行task_tick
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq, cpu);
#endif
rq_last_tick_reset(rq);
}
假设当前的调度类是cfs,则task_tick函数为task_tick_fair函数。
/*
* scheduler tick hitting a task of our scheduling class:
*/
static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &curr->se;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
entity_tick(cfs_rq, se, queued);
}
if (sched_feat_numa(NUMA))
task_tick_numa(rq, curr);
update_rq_runnable_avg(rq, 1);
}
...
static void
entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
{
/*
* Update run-time statistics of the 'current'.
*/
update_curr(cfs_rq);
/*
* Ensure that runnable average is periodically updated.
*/
update_entity_load_avg(curr, 1);
update_cfs_rq_blocked_load(cfs_rq, 1);
#ifdef CONFIG_SCHED_HRTICK
/*
* queued ticks are scheduled to match the slice, so don't bother
* validating it and just reschedule.
*/
if (queued) {
resched_task(rq_of(cfs_rq)->curr);
return;
}
/*
* don't let the period tick interfere with the hrtick preemption
*/
if (!sched_feat(DOUBLE_TICK) &&
hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
return;
#endif
if (cfs_rq->nr_running > 1)
check_preempt_tick(cfs_rq, curr); //检查是否需要设置调度标志位
}
...
/*
* Preempt the current task with a newly woken task if needed:
*/
static void
check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
unsigned long ideal_runtime, delta_exec;
struct sched_entity *se;
s64 delta;
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
resched_task(rq_of(cfs_rq)->curr);
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
*/
clear_buddies(cfs_rq, curr);
return;
}
/*
* Ensure that a task that missed wakeup preemption by a
* narrow margin doesn't have to wait for a full slice.
* This also mitigates buddy induced latencies under load.
*/
if (delta_exec < sysctl_sched_min_granularity)
return;
se = __pick_first_entity(cfs_rq);
delta = curr->vruntime - se->vruntime;
if (delta < 0)
return;
if (delta > ideal_runtime)
resched_task(rq_of(cfs_rq)->curr); // 设置调度标志位
}
...
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
此时就设置了调度的标致位。时间中断情况下的标志位设置,涉及到了操作系统的时钟,当时间中断的时候会调用update_process_times函数来更新进程的运行时间,最终通过scheduler_tick函数来对进场进行时间片的更新,并检查该进程是否时间片用完需要进行调度,Linux操作系统支持不同的进程调度策略本文选用了cfs场景下的调度场景来进一步查看了时间片到期之后去设置中断标志位。
本文主要是学习了在进程调度的场景下显示主动调度和时间片中断模式下的标志位设置调度。有关时间中断的时钟源设置都简单的略过有兴趣的同学可自行了解,并且针对进场的调度策略本文也是简单的选用了cfs场景来举例。在显示场景下主要是直接调用schedule函数,后续再进一步学习Linux进场调度策略和中断场景下的调度。由于本人才疏学浅,如有错误请批评指正。