/**/
__schedule 调用函数pick_next_task从rq中得到一个task
/*
 * Choose the task that should run next on @rq.
 *
 * Each scheduling class is asked in turn for a candidate; the first
 * non-NULL answer wins.
 */
static inline struct task_struct *
pick_next_task(struct rq *rq)
{
	const struct sched_class *cls;
	struct task_struct *next;

	/*
	 * Fast path: when the runqueue's total runnable count equals the
	 * CFS hierarchical count, every runnable task belongs to the fair
	 * class, so that class can be queried directly.
	 */
	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
		next = fair_sched_class.pick_next_task(rq);
		if (likely(next))
			return next;
	}

	/* Slow path: walk the class list and take the first candidate. */
	for_each_class(cls) {
		next = cls->pick_next_task(rq);
		if (next)
			return next;
	}

	/* Not reached: the idle class will always have a runnable task. */
	BUG();
}
/* Head of the scheduling-class list; iteration starts at the stop class. */
#define sched_class_highest	(&stop_sched_class)

/*
 * Visit every scheduling class, starting at sched_class_highest and
 * following the ->next links until a NULL link ends the list.
 * @class: iteration cursor, a pointer to struct sched_class.
 */
#define for_each_class(class)				\
	for ((class) = sched_class_highest;		\
	     (class);					\
	     (class) = (class)->next)
sched_class的next成员指向下一个class,形成一个单向list
由list command得到所有的class.
crash> sym stop_sched_class
c0454ee8 (R) stop_sched_class
crash> list sched_class.next c0454ee8
c0454ee8
c0454e80
c0454e24
c0454dc8
crash> sym c0454ee8 c0454e80 c0454e24 c0454dc8
c0454ee8 (R) stop_sched_class
c0454e80 (R) rt_sched_class
c0454e24 (R) fair_sched_class
c0454dc8 (R) idle_sched_class
crash> list sched_class.next -s sched_class.pick_next_task c0454ee8
c0454ee8
pick_next_task = 0xc0055850 <pick_next_task_stop>
c0454e80
pick_next_task = 0xc0054b4c <pick_next_task_rt>
c0454e24
pick_next_task = 0xc0050e4c <pick_next_task_fair>
c0454dc8
pick_next_task = 0xc004fe08 <pick_next_task_idle>
各sched_class是怎样得到next_task的?
idle:直接得到
/*
 * Idle class pick: always hands back the runqueue's own idle task, so
 * this class never returns NULL.
 */
static struct task_struct *
pick_next_task_idle(struct rq *rq)
{
	/* Bump the sched_goidle statistic for this runqueue. */
	schedstat_inc(rq, sched_goidle);

	/* Presumably informs load-average accounting that the CPU goes idle. */
	calc_load_account_idle(rq);

	return rq->idle;
}
fair class:通过红黑树
/*
 * Fair (CFS) class pick.
 *
 * Returns the task behind the scheduling entity chosen from @rq's CFS
 * runqueue, or NULL when that runqueue has no runnable entity.
 */
static struct task_struct *pick_next_task_fair(struct rq *rq)
{
	struct task_struct *p;
	struct cfs_rq *cfs_rq = &rq->cfs;
	struct sched_entity *se;

	if (!cfs_rq->nr_running)
		return NULL;

	/*
	 * Pick an entity at the current level, mark it as the running one,
	 * then descend into the runqueue that entity owns.  group_cfs_rq()
	 * yields NULL for a plain task entity, which ends the walk.
	 *
	 * The descent step is required: cfs_rq starts out as &rq->cfs, which
	 * is never NULL, so without the reassignment the loop cannot exit.
	 */
	do {
		se = pick_next_entity(cfs_rq);
		set_next_entity(cfs_rq, se);
		cfs_rq = group_cfs_rq(se);
	} while (cfs_rq);

	p = task_of(se);
	return p;
}
pick_next_entity -> __pick_first_entity -> {struct rb_node *left = cfs_rq->rb_leftmost;}
cfs:这里才用到rb_node
rt:
pick_next_task_rt:
使用的是位图bitmap
/*
 * Task priorities span 0..MAX_PRIO-1:
 *
 *   0 .. MAX_RT_PRIO-1         real-time priorities
 *   MAX_RT_PRIO .. MAX_PRIO-1  SCHED_NORMAL / SCHED_BATCH tasks
 *
 * The scale is inverted: a lower p->prio value means a higher priority.
 *
 * MAX_USER_RT_PRIO is the maximum exported to user-space and may differ
 * from the real maximum RT priority, so that kernel threads can be given
 * a priority above any user task.  MAX_RT_PRIO must not be smaller than
 * MAX_USER_RT_PRIO.
 */
#define MAX_USER_RT_PRIO	100
#define MAX_RT_PRIO		MAX_USER_RT_PRIO

#define MAX_PRIO		(MAX_RT_PRIO + 40)
#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
/*
 * This is the priority-queue data structure of the RT scheduling class:
 * an array of per-priority lists plus a bitmap covering the same range.
 * Member order and sizes determine the structure layout, so they are
 * left exactly as they are.
 */
struct rt_prio_array {
/*
 * MAX_RT_PRIO bits, one per RT priority level, plus one delimiter bit.
 * Presumably a set bit marks a non-empty queue[] entry - TODO confirm
 * against the enqueue/dequeue code.
 */
DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
/* One list head per RT priority level, indices 0..MAX_RT_PRIO-1. */
struct list_head queue[MAX_RT_PRIO];
};
并不是每个优先级都对应一个 queue,只有 RT 优先级 [0, 99](即 0..MAX_RT_PRIO-1)才有
进程的 sched_class 成员是何时赋值的?
创建进程时或设置优先级时
1.
void sched_set_stop_task(int cpu, struct task_struct *stop)
-> stop->sched_class = &stop_sched_class;
2./*设置优先级时*/
void rt_mutex_setprio(struct task_struct *p, int prio)
-> if (rt_prio(prio))
p->sched_class = &rt_sched_class;
3.
void sched_fork(struct task_struct *p)
-> if (!rt_prio(p->prio))
p->sched_class = &fair_sched_class;
4.
void __cpuinit init_idle(struct task_struct *idle, int cpu)
idle->sched_class = &idle_sched_class;
与进程调度有关的数据结构
crash> rt_rq -o
struct rt_rq {
[0] struct rt_prio_array active;
[816] unsigned long rt_nr_running;
struct {
int curr;
int next;
[820] } highest_prio;
[828] unsigned long rt_nr_migratory;
[832] unsigned long rt_nr_total;
[836] int overloaded;
[840] struct plist_head pushable_tasks;
[848] int rt_throttled;
[856] u64 rt_time;
[864] u64 rt_runtime;
[872] raw_spinlock_t rt_runtime_lock;
}
SIZE: 880
其中的rt_prio_array占了880字节中的816字节。
crash> rt_prio_array
struct rt_prio_array {
unsigned long bitmap[4];
struct list_head queue[100];
}
SIZE: 816
crash> cfs_rq
struct cfs_rq {
struct load_weight load;
unsigned long nr_running;
unsigned long h_nr_running;
u64 exec_clock;
u64 min_vruntime;
u64 min_vruntime_copy;
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
struct sched_entity *curr;
struct sched_entity *next;
struct sched_entity *last;
struct sched_entity *skip;
unsigned int nr_spread_over;
}
SIZE: 72
crash> struct rq -o
struct rq {
[0] raw_spinlock_t lock;
[4] unsigned long nr_running;
[8] unsigned long cpu_load[5];
[28] unsigned long last_load_update_tick;
[32] u64 nohz_stamp;
[40] unsigned long nohz_flags;
[44] int skip_clock_update;
[48] struct load_weight load;
[56] unsigned long nr_load_updates;
[64] u64 nr_switches;
[72] struct cfs_rq cfs;
[144] struct rt_rq rt;
[1024] unsigned long nr_uninterruptible;
[1028] struct task_struct *curr;
[1032] struct task_struct *idle;
[1036] struct task_struct *stop;
[1040] unsigned long next_balance;
[1044] struct mm_struct *prev_mm;
[1048] u64 clock;
[1056] u64 clock_task;
[1064] atomic_t nr_iowait;
[1068] struct root_domain *rd;
[1072] struct sched_domain *sd;
[1076] unsigned long cpu_power;
[1080] unsigned char idle_balance;
[1084] int post_schedule;
[1088] int active_balance;
[1092] int push_cpu;
[1096] struct cpu_stop_work active_balance_work;
[1116] int cpu;
[1120] int online;
[1124] struct list_head cfs_tasks;
[1136] u64 rt_avg;
[1144] u64 age_stamp;
[1152] u64 idle_stamp;
[1160] u64 avg_idle;
[1168] unsigned long calc_load_update;
[1172] long calc_load_active;
[1176] struct llist_head wake_list;
}
SIZE: 1184
crash> task_struct -o
struct task_struct {
[0] volatile long state;
[4] void *stack;
[8] atomic_t usage;
[12] unsigned int flags;
[16] unsigned int ptrace;
[20] struct llist_node wake_entry;
[24] int on_cpu;
[28] int on_rq;
[32] int prio;
[36] int static_prio;
[40] int normal_prio;
[44] unsigned int rt_priority;
[48] const struct sched_class *sched_class;
[56] struct sched_entity se;
[128] struct sched_rt_entity rt;
[152] unsigned char fpu_counter;
[156] unsigned int policy;
[160] cpumask_t cpus_allowed;
[164] struct list_head tasks;
[172] struct plist_node pushable_tasks;
[192] struct mm_struct *mm;
[196] struct mm_struct *active_mm;
[200] unsigned int brk_randomized : 1;
[204] struct task_rss_stat rss_stat;
[220] int exit_state;
[224] int exit_code;
[228] int exit_signal;
[232] int pdeath_signal;
[236] unsigned int jobctl;
[240] unsigned int personality;
[244] unsigned int did_exec : 1;
[244] unsigned int in_execve : 1;
[244] unsigned int in_iowait : 1;
[244] unsigned int sched_reset_on_fork : 1;
[244] unsigned int sched_contributes_to_load : 1;
[244] unsigned int irq_thread : 1;
[248] pid_t pid;
[252] pid_t tgid;
[256] struct task_struct *real_parent;
[260] struct task_struct *parent;
[264] struct list_head children;
[272] struct list_head sibling;
[280] struct task_struct *group_leader;
[284] struct list_head ptraced;
[292] struct list_head ptrace_entry;
[300] struct pid_link pids[3];
[336] struct list_head thread_group;
[344] struct completion *vfork_done;
[348] int *set_child_tid;
[352] int *clear_child_tid;
[356] cputime_t utime;
[360] cputime_t stime;
[364] cputime_t utimescaled;
[368] cputime_t stimescaled;
[372] cputime_t gtime;
[376] cputime_t prev_utime;
[380] cputime_t prev_stime;
[384] unsigned long nvcsw;
[388] unsigned long nivcsw;
[392] struct timespec start_time;
[400] struct timespec real_start_time;
[408] unsigned long min_flt;
[412] unsigned long maj_flt;
[416] struct task_cputime cputime_expires;
[432] struct list_head cpu_timers[3];
[456] const struct cred *real_cred;
[460] const struct cred *cred;
[464] struct cred *replacement_session_keyring;
[468] char comm[16];
[484] int link_count;
[488] int total_link_count;
[492] struct sysv_sem sysvsem;
[496] unsigned long last_switch_count;
[500] struct thread_struct thread;
[512] struct fs_struct *fs;
[516] struct files_struct *files;
[520] struct nsproxy *nsproxy;
[524] struct signal_struct *signal;
[528] struct sighand_struct *sighand;
[532] sigset_t blocked;
[540] sigset_t real_blocked;
[548] sigset_t saved_sigmask;
[556] struct sigpending pending;
[572] unsigned long sas_ss_sp;
[576] size_t sas_ss_size;
[580] int (*notifier)(void *);
[584] void *notifier_data;
[588] sigset_t *notifier_mask;
[592] struct audit_context *audit_context;
[596] seccomp_t seccomp;
[596] u32 parent_exec_id;
[600] u32 self_exec_id;
[604] spinlock_t alloc_lock;
[608] raw_spinlock_t pi_lock;
[612] struct plist_head pi_waiters;
[620] struct rt_mutex_waiter *pi_blocked_on;
[624] void *journal_info;
[628] struct bio_list *bio_list;
[632] struct blk_plug *plug;
[636] struct reclaim_state *reclaim_state;
[640] struct backing_dev_info *backing_dev_info;
[644] struct io_context *io_context;
[648] unsigned long ptrace_message;
[652] siginfo_t *last_siginfo;
[656] struct task_io_accounting ioac;
[656] struct robust_list_head *robust_list;
[660] struct list_head pi_state_list;
[668] struct futex_pi_state *pi_state_cache;
[672] struct rcu_head rcu;
[680] struct pipe_inode_info *splice_pipe;
[684] int nr_dirtied;
[688] int nr_dirtied_pause;
[692] unsigned long dirty_paused_when;
[696] unsigned long timer_slack_ns;
[700] unsigned long default_timer_slack_ns;
[704] struct list_head *scm_work_list;
}
SIZE: 712
crash> mm_struct -o
struct mm_struct {
[0] struct vm_area_struct *mmap;
[4] struct rb_root mm_rb;
[8] struct vm_area_struct *mmap_cache;
[12] unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
[16] void (*unmap_area)(struct mm_struct *, unsigned long);
[20] unsigned long mmap_base;
[24] unsigned long task_size;
[28] unsigned long cached_hole_size;
[32] unsigned long free_area_cache;
[36] pgd_t *pgd;
[40] atomic_t mm_users;
[44] atomic_t mm_count;
[48] int map_count;
[52] spinlock_t page_table_lock;
[56] struct rw_semaphore mmap_sem;
[72] struct list_head mmlist;
[80] unsigned long hiwater_rss;
[84] unsigned long hiwater_vm;
[88] unsigned long total_vm;
[92] unsigned long locked_vm;
[96] unsigned long pinned_vm;
[100] unsigned long shared_vm;
[104] unsigned long exec_vm;
[108] unsigned long stack_vm;
[112] unsigned long reserved_vm;
[116] unsigned long def_flags;
[120] unsigned long nr_ptes;
[124] unsigned long start_code;
[128] unsigned long end_code;
[132] unsigned long start_data;
[136] unsigned long end_data;
[140] unsigned long start_brk;
[144] unsigned long brk;
[148] unsigned long start_stack;
[152] unsigned long arg_start;
[156] unsigned long arg_end;
[160] unsigned long env_start;
[164] unsigned long env_end;
[168] unsigned long saved_auxv[40];
[328] struct mm_rss_stat rss_stat;
[340] struct linux_binfmt *binfmt;
[344] cpumask_var_t cpu_vm_mask_var;
[348] mm_context_t context;
[360] unsigned int faultstamp;
[364] unsigned int token_priority;
[368] unsigned int last_interval;
[372] unsigned long flags;
[376] struct core_state *core_state;
[380] spinlock_t ioctx_lock;
[384] struct hlist_head ioctx_list;
[388] struct file *exe_file;
[392] unsigned long num_exe_file_vmas;
}
下面的问题是:怎样设置、怎样维护每个CPU的runqueue?
1. 怎样把唤醒的task 加入run queue?
2. 怎样把睡眠的task从runqueue中删除?