1)实时优先级队列rt_prio_array:在kernel/sched.c中,是一组链表,每个优先级对应一个链表。还维护一个由101 bit组成的bitmap,其中实时进程优先级为0-99,占100 bit,再加1 bit的定界符。当某个优先级别上有进程被插入列表时,相应的比特位就被置位。 通常用sched_find_first_bit()函数查询该bitmap,它返回当前被置位的最高优先级的数组下标。由于使用位图,查找一个任务来执行所需要的时间并不依赖于活动任务的个数,而是依赖于优先级的数量。可见实时调度是一个O(1)调度策略。
struct rt_prio_array { DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* 包含1 bit的定界符 */ struct list_head queue[MAX_RT_PRIO]; };这里用include/linux/types.h中的DECLARE_BITMAP宏来定义指定长度的位图,用include/linux/list.h中的struct list_head来为100个优先级定义各自的双链表。在实时调度中,运行进程根据优先级放到对应的队列里面;对于优先级相同的进程,后到的进程放到该优先级队列的队尾。对于FIFO/RR调度,各自的进程需要设置相关的属性。进程运行时,要根据task中的这些属性来判断并设置放弃CPU的时机(运行完或是时间片用完)。2)实时运行队列rt_rq:内嵌上面的优先级队列(active成员),并记录可运行实时进程的个数(rt_nr_running)等信息。
/* Real-time run queue: embeds the priority array of queued entities (active) together with a running-entity count and the throttling/runtime fields declared below. */ struct rt_rq { struct rt_prio_array active; /* per-priority lists plus bitmap */ unsigned long rt_nr_running; /* tested for zero before a task is picked */ #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest-priority real-time task */ #ifdef CONFIG_SMP int next; /* next-highest-priority task */ #endif } highest_prio; #endif #ifdef CONFIG_SMP unsigned long rt_nr_migratory; unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; #endif int rt_throttled; u64 rt_time; /* update_curr_rt() adds each measured execution delta here */ u64 rt_runtime; /* Nests inside the rq lock: */ spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED unsigned long rt_nr_boosted; struct rq *rq; struct list_head leaf_rt_rq_list; struct task_group *tg; struct sched_rt_entity *rt_se; #endif };3)实时调度实体sched_rt_entity:在include/linux/sched.h中,表示一个可实时调度的实体,包含了完整的实时调度信息。
struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned int time_slice; int nr_cpus_allowed; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity *parent; /* rq on which this entity is (to be) queued: */ struct rt_rq *rt_rq; /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif }; 4)实时调度类rt_sched_class:在kernel/sched_rt.c中。 static const struct sched_class rt_sched_class = { .next = &fair_sched_class, .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, .check_preempt_curr = check_preempt_curr_rt, .pick_next_task = pick_next_task_rt, .put_prev_task = put_prev_task_rt, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_rt, .load_balance = load_balance_rt, .move_one_task = move_one_task_rt, .set_cpus_allowed = set_cpus_allowed_rt, .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, .post_schedule = post_schedule_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, #endif .set_curr_task = set_curr_task_rt, .task_tick = task_tick_rt, .get_rr_interval = get_rr_interval_rt, .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, };(2)实时调度的主要操作:实时调度的操作在kernel/sched_rt.c中实现。1)进程插入enqueue_task_rt:把实时进程插入到与其优先级对应的队列中。调用链为enqueue_rt_entity()--->__enqueue_rt_entity(),如下:
/* Queue task p on the real-time run queue: clears the entity's timeout when wakeup is set, enqueues the entity, and calls enqueue_pushable_task() when task_current(rq, p) is false and p->rt.nr_cpus_allowed > 1. */ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) { struct sched_rt_entity *rt_se = &p->rt; if (wakeup) rt_se->timeout = 0; enqueue_rt_entity(rt_se, head); /* does the actual queueing */ if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); /* presumably records p as a candidate for pushing to another CPU (helper not shown) */ } /* Take rt_se's entity stack off the run queues, then queue every entity visited by for_each_sched_rt_entity() again. */ static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) { dequeue_rt_stack(rt_se); /* first remove from the run queues */ for_each_sched_rt_entity(rt_se) __enqueue_rt_entity(rt_se, head); /* then add again, at the head or the tail of the priority list depending on 'head' */ } /* Link one entity into the list for its priority, set that priority's bit in the bitmap and call inc_rt_tasks(). Returns without queueing when the entity's group queue is throttled or has no running entities. */ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; struct rt_rq *group_rq = group_rt_rq(rt_se); struct list_head *queue = array->queue + rt_se_prio(rt_se); /* * Don't enqueue the group if its throttled, or when empty. * The latter is a consequence of the former when a child group * get throttled and the current group doesn't have any other * active members. */ if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) return; if (head) list_add(&rt_se->run_list, queue); else list_add_tail(&rt_se->run_list, queue); __set_bit(rt_se_prio(rt_se), array->bitmap); inc_rt_tasks(rt_se, rt_rq); /* presumably increments the running count (helper not shown) */ }该函数先获取运行队列中的优先级队列,然后调用include/linux/list.h:list_add_tail()--->__list_add(),将进程插入到链表的末尾。如下:
/* Insert 'new' between the two nodes 'prev' and 'next' of a doubly linked list by rewriting the four pointers that join them. */ static inline void __list_add(struct list_head *new, struct list_head *prev, struct list_head *next) { next->prev = new; new->next = next; new->prev = prev; prev->next = new; }2)进程选择pick_next_task_rt:实时调度会选择最高优先级的实时进程来运行。调用_pick_next_task_rt()--->pick_next_rt_entity()来完成获取下一个进程的工作。如下:
/* Choose the next real-time task to run on rq, or return NULL when _pick_next_task_rt() finds none. A chosen task is passed to dequeue_pushable_task(). */ static struct task_struct *pick_next_task_rt(struct rq *rq) { struct task_struct *p = _pick_next_task_rt(rq); /* does the actual selection */ /* The running task is never eligible for pushing */ if (p) dequeue_pushable_task(rq, p); #ifdef CONFIG_SMP /* * We detect this state here so that we can avoid taking the RQ * lock again later if there is no need to push */ rq->post_schedule = has_pushable_tasks(rq); #endif return p; } /* Return NULL when the queue has no running entities or is throttled; otherwise pick an entity, follow group queues downwards until group_rt_rq() yields none, and return the task that owns the final entity. */ static struct task_struct *_pick_next_task_rt(struct rq *rq) { struct sched_rt_entity *rt_se; struct task_struct *p; struct rt_rq *rt_rq; rt_rq = &rq->rt; if (unlikely(!rt_rq->rt_nr_running)) return NULL; if (rt_rq_throttled(rt_rq)) return NULL; do { /* descend one level per iteration while the picked entity has a group queue */ rt_se = pick_next_rt_entity(rq, rt_rq); BUG_ON(!rt_se); rt_rq = group_rt_rq(rt_se); } while (rt_rq); p = rt_task_of(rt_se); /* stamp the start of this execution interval with the rq task clock */ p->se.exec_start = rq->clock_task; return p; } /* Return the entity at the front of the list whose priority index is the first set bit in rt_rq's bitmap. */ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct rt_rq *rt_rq) { struct rt_prio_array *array = &rt_rq->active; struct sched_rt_entity *next = NULL; struct list_head *queue; int idx; /* index of the first set bit */ idx = sched_find_first_bit(array->bitmap); BUG_ON(idx >= MAX_RT_PRIO); /* select the list for that index */ queue = array->queue + idx; next = list_entry(queue->next, struct sched_rt_entity, run_list); /* hand back the entity found */ return next; }该函数调用include/asm-generic/bitops/sched.h:sched_find_first_bit()返回位图中当前被置位的最高优先级,以作为这组链表的数组下标找到其优先级队列。然后调用include/linux/list.h:list_entry()--->include/linux/kernel.h:container_of(),返回该优先级队列中的第一个进程,以作为下一个要运行的实时进程。例如当前所有实时进程中最高优先级为45(换句话说,系统中没有任何实时进程的优先级小于45),则直接读取rt_prio_array中的queue[45],得到优先级为45的进程队列指针。该队列头上的第一个进程就是被选中的进程。这种算法的复杂度为O(1)。
/* Return the index of the first set bit in bitmap b, checking one word at a time: two words when BITS_PER_LONG is 64, four when it is 32. The last word is handed to __ffs() without a zero test, so callers presumably guarantee that at least one bit is set (e.g. a delimiter bit) - TODO confirm. */ static inline int sched_find_first_bit(const unsigned long *b) { #if BITS_PER_LONG == 64 if (b[0]) return __ffs(b[0]); return __ffs(b[1]) + 64; #elif BITS_PER_LONG == 32 if (b[0]) return __ffs(b[0]); if (b[1]) return __ffs(b[1]) + 32; if (b[2]) return __ffs(b[2]) + 64; return __ffs(b[3]) + 96; #else #error BITS_PER_LONG not defined #endif }3)进程删除dequeue_task_rt:从优先级队列中删除实时进程,并更新调度信息,然后把这个进程添加到队尾。调用链为dequeue_rt_entity()--->dequeue_rt_stack()--->__dequeue_rt_entity(),如下:
/* Remove task p from the real-time run queue: accounts the current task's runtime, dequeues p's entity and calls dequeue_pushable_task(). */ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) { struct sched_rt_entity *rt_se = &p->rt; /* update the scheduling statistics */ update_curr_rt(rq); /* does the actual work: takes rt_se's stack off the run queues; entities whose group queue still has running entities are queued again at the tail */ dequeue_rt_entity(rt_se); /* presumably drops p from the pushable-task bookkeeping (helper not shown) */ dequeue_pushable_task(rq, p); } /* Charge the time rq->curr has run since exec_start to its statistics and, when RT bandwidth control is enabled, to the rt_time of each queue in its entity chain, requesting a reschedule if a queue's runtime is exceeded. */ static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; struct rt_rq *rt_rq = rt_rq_of_se(rt_se); u64 delta_exec; if (!task_has_rt_policy(curr)) /* nothing to do unless the current task has a real-time policy */ return; /* time run since exec_start; clamped to 0 if it comes out negative */ delta_exec = rq->clock_task - curr->se.exec_start; if (unlikely((s64)delta_exec < 0)) delta_exec = 0; schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); /* add to the task's cumulative execution time */ curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); /* start a new accounting interval */ curr->se.exec_start = rq->clock_task; cpuacct_charge(curr, delta_exec); /* group-scheduling related (per the original annotation) */ sched_rt_avg_update(rq, delta_exec); if (!rt_bandwidth_enabled()) return; for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); spin_unlock(&rt_rq->rt_runtime_lock); } } } /* Dequeue rt_se's whole entity stack, then re-enqueue at the tail every entity whose group_rt_rq() is non-NULL and still has running entities. */ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) { dequeue_rt_stack(rt_se); /* remove from the run queues */ for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) __enqueue_rt_entity(rt_se, false); /* queue again at the tail (head == false) */ } } /* Record the entity chain in reverse through the ->back pointers, then walk it in that reverse order and dequeue each entity that is on a run queue. */ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) { struct sched_rt_entity *back = NULL; for_each_sched_rt_entity(rt_se) { /* walk the whole entity chain */ rt_se->back = back; /* ->back ends up pointing at the entity visited just before this one */ back = rt_se; } /* starting from the last entity visited, dequeue every entity that is on a run queue */ for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) __dequeue_rt_entity(rt_se); } } /* Unlink one entity from its priority list, clear that priority's bitmap bit if the list became empty, and call dec_rt_tasks(). */ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = 
&rt_rq->active; /* unlink the entity */ list_del_init(&rt_se->run_list); /* if the list is now empty, clear the corresponding bit in the bitmap */ if (list_empty(array->queue + rt_se_prio(rt_se))) __clear_bit(rt_se_prio(rt_se), array->bitmap); dec_rt_tasks(rt_se, rt_rq); /* presumably decrements the running count (helper not shown) */ }可见更新调度信息的函数为update_curr_rt(),在dequeue_rt_entity()中将当前实时进程从运行队列中移除,并添加到队尾。完成工作函数为dequeue_rt_stack()--->__dequeue_rt_entity(),它调用include/linux/list.h:list_del_init()--->__list_del()删除进程。然后如果链表变为空,则将位图中对应优先级的bit位清零。如下:
static inline void __list_del(struct list_head * prev, struct list_head * next) { next->prev = prev; prev->next = next; }从上面的介绍可以看出,对于实时调度,Linux的实现比较简单,仍然采用之前的O(1)调度策略,把所有的运行进程根据优先级放到不同的队列里面,并用位图记录各优先级队列是否有进程。进队列仅仅是删除原来队列里面的本进程,然后将它挂到队列尾部;而对于“移除”操作,也仅仅是从队列里面移除后添加到运行队列尾部。