1. Each physical CPU has a run queue of virtual CPUs (VCPUs).
2. Every VCPU on a run queue carries a credit value.
3. The credit value is what the VCPU's priority is derived from.
4. The CPU schedules the earliest-queued VCPU that is in the UNDER state.
5. Every 10 msec is one time slice, ended by a timer interrupt. If the running VCPU's credits have fallen into the OVER state, it is not scheduled any further; its credits are recalculated and the next queued VCPU in the UNDER state is scheduled instead.
6. If after 3 time slices (30 msec) the original VCPU is still not in the OVER state, it is descheduled anyway, its credits are recalculated, and scheduling proceeds as in point 5.
7. The recalculation in the code is: VCPU credit = credit − CSCHED_CREDITS_PER_TICK (100) + (one 30 msec period's credits)/n, where n is the number of VCPUs; see the sketch after this list.
8. A VCPU in the OVER state does not have its credits increased.
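To make points 5-7 concrete, here is a minimal, self-contained simulation (not Xen code): one physical CPU shared by two VCPUs, with the constants named above. Each 10 msec tick debits the running VCPU; each 30 msec accounting period tops every VCPU up with an equal share of one period's credits.

#include <stdio.h>

#define CREDITS_PER_TICK 100   /* debit per 10 msec tick (points 5 and 7) */
#define TICKS_PER_ACCT     3   /* 3 ticks = one 30 msec accounting period (point 6) */
#define NUM_VCPUS          2   /* two VCPUs sharing one physical CPU */

int main(void)
{
    int credit[NUM_VCPUS] = { 0, 0 };

    for (int period = 0; period < 4; period++) {
        for (int tick = 0; tick < TICKS_PER_ACCT; tick++) {
            /* Run the VCPU with the most credit (UNDER sorts ahead of OVER). */
            int run = 0;
            for (int v = 1; v < NUM_VCPUS; v++)
                if (credit[v] > credit[run])
                    run = v;
            credit[run] -= CREDITS_PER_TICK;          /* point 5: debit per tick */
        }
        /* Point 7: each VCPU earns one period's credits divided by #VCPUs. */
        for (int v = 0; v < NUM_VCPUS; v++)
            credit[v] += CREDITS_PER_TICK * TICKS_PER_ACCT / NUM_VCPUS;

        printf("after period %d:", period);
        for (int v = 0; v < NUM_VCPUS; v++)
            printf("  vcpu%d=%+d", v, credit[v]);
        printf("\n");
    }
    return 0;
}

Over a few periods the two credit values oscillate around zero, which is exactly the fairness property the list describes: neither VCPU can stay in the UNDER state forever.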
Concepts: Credit, Weight, Cap
To really pin these three terms down, the most important thing is to read and understand one function: sched_credit.c/csched_acct()
Credit: this belongs to the scheduler as a whole, not to any one domain.
csched_priv.credit = CSCHED_CREDITS_PER_ACCT * #_of_PCPU. (for example: 300 * 4 = 1200)
Weight: this is per-domain. The scheduler hands out credits to the domains in proportion to their weights, so it is a "relative" quantity.
For example, 256:256 and 512:512 behave the same, each side getting half. So what is the difference?
512:512 allows finer-grained control than 256:256 (the sketch after the formulas below makes this concrete).
/*
* A domain's fair share is computed using its weight in competition
* with that of all other active domains.
*
* At most, a domain can use credits to run all its active VCPUs
* for one full accounting period. We allow a domain to earn more
* only when the system-wide credit balance is negative.
*/
Cap: this is also per-domain, but an "absolute" quantity. 100 stands for one whole PCPU's worth of cycles; 50 means the domain may run on at most half a PCPU's cycles.
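A quick worked example of the cap, using the round-up formula that csched_acct applies (shown in the listing further below) and assuming the 300-credits-per-PCPU accounting period from the Credit example above:

#include <stdio.h>

#define CSCHED_ACCT_PERIOD 300   /* credits per PCPU per accounting period (per the 300 * 4 example above) */

/* The credit ceiling a cap implies; same rounding as in csched_acct. */
static unsigned int credit_cap(unsigned int cap)
{
    return ((cap * CSCHED_ACCT_PERIOD) + 99) / 100;   /* round up */
}

int main(void)
{
    printf("cap=100 -> %u credits (one full PCPU)\n", credit_cap(100));  /* 300 */
    printf("cap=50  -> %u credits (half a PCPU)\n",   credit_cap(50));   /* 150 */
    return 0;
}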
In the csched_acct function:
(1) the total credit is first distributed across the domains according to their weights:
credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
) / weight_total;
(2) each domain's credit is then divided evenly among that domain's VCPUs:
credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
) / sdom->active_vcpu_count;
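The two formulas can be exercised with concrete numbers. The sketch below reuses the 1200-credit, four-PCPU example and also makes the 512:512 precision point measurable: the ratio is identical, but a one-unit weight change moves the share by roughly half as much.

#include <stdio.h>

/* Step (1): a domain's weighted share of the total credit, rounded up. */
static unsigned int dom_share(unsigned int credit_total,
                              unsigned int weight, unsigned int weight_total)
{
    return ((credit_total * weight) + (weight_total - 1)) / weight_total;
}

int main(void)
{
    unsigned int credit_total = 1200;   /* 300 * 4 PCPUs, as above */

    /* 256:256 and 512:512 produce the same split... */
    printf("256 of 512:   %u\n", dom_share(credit_total, 256, 512));    /* 600 */
    printf("512 of 1024:  %u\n", dom_share(credit_total, 512, 1024));   /* 600 */

    /* ...but a one-unit tweak moves the share half as far at 512:512. */
    printf("257 of 513:   %u\n", dom_share(credit_total, 257, 513));    /* 602 */
    printf("513 of 1025:  %u\n", dom_share(credit_total, 513, 1025));   /* 601 */

    /* Step (2): split the domain's share evenly over its VCPUs, rounded up. */
    unsigned int vcpus = 3;
    printf("600 credits over %u VCPUs = %u each\n",
           vcpus, (600 + vcpus - 1) / vcpus);                           /* 200 */
    return 0;
}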
This article comes from a CSDN blog; please credit the source when reposting: http://blog.csdn.net/snailhit/archive/2010/12/30/6107279.aspx
Analysis of the credit scheduling algorithm in the Xen virtual machine
Song Wei, Lenovo Research
Scheduling overview
Xen has two main scheduling algorithms: the credit algorithm and the SEDF algorithm. The credit algorithm lets every VCPU (virtual CPU) use the physical CPU resources fairly. The SEDF algorithm dynamically adjusts each VCPU's priority according to its load.
The scheduler's entry code in Xen looks like this:
void __init scheduler_init(void)
{
int i;
open_softirq(SCHEDULE_SOFTIRQ, schedule); // register schedule() as the handler for the SCHEDULE softirq
for_each_cpu ( i )
{ /* Arm a timer for each CPU. When it expires, the callback s_timer_fn
raises the schedule softirq (sets its pending bit); after the CPU takes a
VM exit it checks the pending softirq mask and invokes the handler of any
bit that is set. */
spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
}
for ( i = 0; schedulers[i] != NULL; i++ )
{
ops = *schedulers[i];
if ( strcmp(ops.opt_name, opt_sched) == 0 )
break;
}
if ( schedulers[i] == NULL )
printk("Could not find scheduler: %s/n", opt_sched);
printk("Using scheduler: %s (%s)/n", ops.name, ops.opt_name);
SCHED_OP(init);
}
schedule.c is really just the scheduling abstraction layer; the concrete implementations live in sched_credit.c and sched_sedf.c. Which one is used is selected via opt_sched (the sched= boot option), which the string comparison above matches against each scheduler's opt_name.
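The indirection works through a table of function pointers. The following is a simplified, compilable model of that pattern, not the real Xen definitions: struct scheduler is reduced to a name plus one zero-argument init hook, and SCHED_OP to a null-checked call, whereas the real ops table carries many more entries (do_schedule, wake, sleep, and so on).

#include <stdio.h>
#include <string.h>

/* Simplified model of the scheduler ops table (not the exact Xen layout). */
struct scheduler {
    const char *name;
    const char *opt_name;
    void (*init)(void);
    /* ... do_schedule, wake, sleep, etc. in the real struct ... */
};

static void credit_init(void) { printf("credit scheduler init\n"); }
static void sedf_init(void)   { printf("sedf scheduler init\n");   }

static const struct scheduler sched_credit = {
    "SMP Credit Scheduler", "credit", credit_init,
};
static const struct scheduler sched_sedf = {
    "Simple EDF Scheduler", "sedf", sedf_init,
};
static const struct scheduler *schedulers[] = { &sched_credit, &sched_sedf, NULL };

static struct scheduler ops;   /* the chosen scheduler, copied at boot */
#define SCHED_OP(fn) (ops.fn ? ops.fn() : (void)0)   /* dispatch through the table */

int main(void)
{
    const char *opt_sched = "credit";   /* in Xen this comes from the sched= boot option */
    for (int i = 0; schedulers[i] != NULL; i++)
        if (strcmp(schedulers[i]->opt_name, opt_sched) == 0)
            ops = *schedulers[i];
    SCHED_OP(init);   /* same shape as the SCHED_OP(init) call in scheduler_init */
    return 0;
}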
The credit algorithm
Each physical CPU has a runq, a run queue sorted by the priority of each VCPU on it. The time-share priorities are UNDER (below its fair share) and OVER (above its fair share).
(Figure 1) Overall structure of the scheduling queues
(Figure 2) Concrete layout of the credit scheduler's run queues
(Figure 3) Flow of starting the scheduler at kernel boot
(Figure 4) Scheduler initialization flow
(Figure 5) How the credit scheduler computes priorities
As the figures show, the BSP sorts the other APs' run queues according to the computed priorities.
(Figure 6) Flow chart of work stealing in the credit algorithm
Task priority order (highest first):
1. CSCHED_PRI_TS_UNDER
2. CSCHED_PRI_TS_OVER
3. CSCHED_PRI_IDLE
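The runq keeps that order by inserting each VCPU ahead of the first entry of strictly lower priority. Below is a minimal sketch of that sorted insert, modeled on __runq_insert in sched_credit.c rather than copied from it; the priority constants are the values used in the Xen 3.x source (higher runs first), so double-check them against your tree.

#include <stdio.h>

/* Priority values as in sched_credit.c (higher runs first). */
#define CSCHED_PRI_TS_UNDER  (-1)
#define CSCHED_PRI_TS_OVER   (-2)
#define CSCHED_PRI_IDLE      (-64)

struct runq_entry {
    int pri;
    struct runq_entry *next;
};

/* Insert in front of the first entry with strictly lower priority,
 * mirroring __runq_insert: equal priorities keep FIFO order. */
static void runq_insert(struct runq_entry **head, struct runq_entry *svc)
{
    while (*head != NULL && (*head)->pri >= svc->pri)
        head = &(*head)->next;
    svc->next = *head;
    *head = svc;
}

int main(void)
{
    struct runq_entry a = { CSCHED_PRI_TS_OVER,  NULL };
    struct runq_entry b = { CSCHED_PRI_TS_UNDER, NULL };
    struct runq_entry c = { CSCHED_PRI_IDLE,     NULL };
    struct runq_entry *runq = NULL;

    runq_insert(&runq, &a);
    runq_insert(&runq, &c);
    runq_insert(&runq, &b);   /* UNDER ends up at the head */

    for (struct runq_entry *e = runq; e != NULL; e = e->next)
        printf("pri=%d\n", e->pri);   /* prints -1, -2, -64 */
    return 0;
}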
static void
csched_acct(void)   /* recomputes every task's credits and priority */
{
unsigned long flags;
struct list_head *iter_vcpu, *next_vcpu;
struct list_head *iter_sdom, *next_sdom;
struct csched_vcpu *svc;
struct csched_dom *sdom;
uint32_t credit_total;
uint32_t weight_total;
uint32_t weight_left;
uint32_t credit_fair;
uint32_t credit_peak;
int credit_balance;
int credit_xtra;
int credit;
spin_lock_irqsave(&csched_priv.lock, flags);
weight_total = csched_priv.weight;   /* sum of the weights of all active domains */
credit_total = csched_priv.credit;   /* system-wide credits for this period: #PCPUs * CSCHED_CREDITS_PER_ACCT (300) */
/* Converge balance towards 0 when it drops negative */
if ( csched_priv.credit_balance < 0 )
{
credit_total -= csched_priv.credit_balance;
CSCHED_STAT_CRANK(acct_balance);
}
if ( unlikely(weight_total == 0) )   /* no active domains: nothing to account */
{
csched_priv.credit_balance = 0;
spin_unlock_irqrestore(&csched_priv.lock, flags);
CSCHED_STAT_CRANK(acct_no_work);
return;
}
CSCHED_STAT_CRANK(acct_run);
weight_left = weight_total;
credit_balance = 0;
credit_xtra = 0;
The loop below iterates over the active domains: it first computes each domain's credit, then splits that evenly among the domain's VCPUs, and finally recomputes each VCPU's priority.
list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
{
sdom = list_entry(iter_sdom, struct csched_dom, active_sdom_elem);
BUG_ON( is_idle_domain(sdom->dom) );
BUG_ON( sdom->active_vcpu_count == 0 );
BUG_ON( sdom->weight == 0 );
BUG_ON( sdom->weight > weight_left );
weight_left -= sdom->weight;
/*
* A domain's fair share is computed using its weight in competition
* with that of all other active domains.
*
* At most, a domain can use credits to run all its active VCPUs
* for one full accounting period. We allow a domain to earn more
* only when the system-wide credit balance is negative.
*/
credit_peak = sdom->active_vcpu_count * CSCHED_ACCT_PERIOD;
if ( csched_priv.credit_balance < 0 )
{
credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
(weight_total - 1)
) / weight_total;
}
if ( sdom->cap != 0U )
{
uint32_t credit_cap = ((sdom->cap * CSCHED_ACCT_PERIOD) + 99) / 100;
if ( credit_cap < credit_peak )
credit_peak = credit_cap;
}
credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
) / weight_total;   /* the current domain's weighted share of the credits */
if ( credit_fair < credit_peak )
{
/*
 * The weighted share is lower than what this domain could actually
 * use (its peak), i.e. it deserves more credits than the weights give
 * it. Flag this so that domains holding surplus credits get moved to
 * the head of csched_priv.active_sdom below, leaving their unused
 * credits up for grabs in later accounting periods.
 */
credit_xtra = 1;
}
else
{
/* The weighted share exceeds what this domain can actually use. */
if ( weight_left != 0U )   /* pass this domain's surplus on to the remaining domains */
{
/* Give other domains a chance at unused credits */
credit_total += ( ( ( credit_fair - credit_peak
) * weight_total
) + ( weight_left - 1 )
) / weight_left;
}
if ( credit_xtra )
{
/* Move this surplus domain to the front of the active list: */
/*
* Lazily keep domains with extra credits at the head of
* the queue to give others a chance at them in future
* accounting periods.
*/
CSCHED_STAT_CRANK(acct_reorder);
list_del(&sdom->active_sdom_elem);
list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
}
credit_fair = credit_peak;   /* hand out only what the domain can use (its
peak), not the larger weighted share */
}
/* Compute fair share per VCPU */
credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
) / sdom->active_vcpu_count;
list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
{
/* Distribute the current domain's credits to each of its VCPUs. */
svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
BUG_ON( sdom != svc->sdom );
/* Increment credit */
atomic_add(credit_fair, &svc->credit);
credit = atomic_read(&svc->credit);
/*
* Recompute priority or, if VCPU is idling, remove it from
* the active list.
*/
if ( credit < 0 )   /* credits exhausted: lower the VCPU's priority */
{
if ( sdom->cap == 0U )
svc->pri = CSCHED_PRI_TS_OVER;
else
svc->pri = CSCHED_PRI_TS_PARKED;
if ( credit < -CSCHED_TSLICE )
{
CSCHED_STAT_CRANK(acct_min_credit);
credit = -CSCHED_TSLICE;
atomic_set(&svc->credit, credit);
}
}
else
{
svc->pri = CSCHED_PRI_TS_UNDER;
if ( credit > CSCHED_TSLICE )   /* unspent credits piling up: the VCPU is idling, drop it from the active list */
__csched_vcpu_acct_idle_locked(svc);
}
svc->credit_last = credit;
svc->credit_incr = credit_fair;
credit_balance += credit;
}
}
csched_priv.credit_balance = credit_balance;
spin_unlock_irqrestore(&csched_priv.lock, flags);
/* Inform each CPU that its runq needs to be sorted */
csched_priv.runq_sort++;
}
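To tie the walkthrough together, here is a small self-contained calculation with hypothetical numbers: one PCPU, two single-VCPU domains with weights 256 and 768, the heavier one capped at 50. It reproduces the credit_peak / credit_fair arithmetic above and shows both branches of the if being taken.

#include <stdio.h>

#define CSCHED_ACCT_PERIOD 300   /* assumed: credits per PCPU per accounting period */

int main(void)
{
    unsigned int credit_total = 1 * CSCHED_ACCT_PERIOD;    /* 300, one PCPU */
    unsigned int weight_total = 256 + 768;                 /* 1024 */

    /* dom0: weight 256, no cap, one active VCPU */
    unsigned int peak0 = 1 * CSCHED_ACCT_PERIOD;                                      /* 300 */
    unsigned int fair0 = ((credit_total * 256) + (weight_total - 1)) / weight_total;  /* 75  */

    /* dom1: weight 768, but cap 50 clamps its peak to half a PCPU */
    unsigned int peak1 = ((50 * CSCHED_ACCT_PERIOD) + 99) / 100;                      /* 150 */
    unsigned int fair1 = ((credit_total * 768) + (weight_total - 1)) / weight_total;  /* 225 */

    printf("dom0: fair=%u < peak=%u -> keeps %u, credit_xtra is set\n",
           fair0, peak0, fair0);
    printf("dom1: fair=%u > peak=%u -> clamped to %u, surplus goes back\n",
           fair1, peak1, peak1);
    return 0;
}

dom1's unused credits feed back into credit_total, and because credit_xtra was set, dom1 is moved to the head of the active list so that dom0 gets a chance at that surplus in later accounting periods.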
static struct csched_vcpu *
csched_runq_steal(struct csched_pcpu *spc, int cpu, int pri)
{
struct list_head *iter;
struct csched_vcpu *speer;
struct vcpu *vc;
list_for_each( iter, &spc->runq )
遍历
spc
(
vcpu
)对应物理
cpu
的调度队列
{
speer = __runq_elem(iter);   /* the scheduling state behind this runq entry */
/*
* If next available VCPU here is not of higher priority than ours,
* this PCPU is useless to us.
*/
if ( speer->pri <= CSCHED_PRI_IDLE || speer->pri <= pri )
{
/* The best candidate here is no better than what our CPU would run
 * anyway, so there is nothing worth stealing from this PCPU. */
CSCHED_STAT_CRANK(steal_peer_idle);
break;
}
/* Is this VCPU runnable on our PCPU? */
vc = speer->vcpu;   /* found a task of higher priority than our own */
BUG_ON( is_idle_vcpu(vc) );   /* a runq candidate can never be the idle VCPU */
if ( __csched_vcpu_is_stealable(cpu, vc) )
{
/* We got a candidate. Grab it! */
__runq_remove(speer);
vc->processor = cpu;
return speer;
}
}
return NULL;
}
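For context, csched_runq_steal is invoked from the scheduler's load-balancing path: a CPU about to run something of low priority probes its peers for better work. The sketch below is illustrative only, not verbatim Xen code: csched_pcpu_of() and num_online_cpus() stand in for the real per-CPU lookups, and the runq locking that the real code must trylock is omitted.

/* Simplified sketch of the caller side (illustrative helpers, no locking). */
static struct csched_vcpu *
load_balance_sketch(int cpu, int snext_pri)
{
    struct csched_vcpu *speer;
    int peer;

    for (peer = 0; peer < num_online_cpus(); peer++) {   /* hypothetical helper */
        if (peer == cpu)
            continue;                                    /* skip ourselves */
        /* Try to lift a higher-priority VCPU off this peer's runq. */
        speer = csched_runq_steal(csched_pcpu_of(peer),  /* hypothetical lookup */
                                  cpu, snext_pri);
        if (speer != NULL)
            return speer;   /* got one: it will now run on our CPU */
    }
    return NULL;            /* no peer had anything better */
}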