• 计算机系统启动的时候,需要读BIOS获得机器的内存,硬盘参数等物理信息。在虚拟化的情况下,BIOS是不存在的。所以VMM需要模拟这部分的功能。
• VMM运行在保护模式,而Guest OS也运行在保护模式,需要提供保护模式下的信息共享机制。
• VMM运行在最高特权级(0级),而Guest OS运行在较低的特权级。这意味着虚拟机的内核不能执行某些特权指令,VMM必须提供执行这些特权指令的接口。
• VMM要把事件通知给VM,需要一种机制来实现这种事件通知。
• Linux系统进程之间有通信机制。而虚拟机之间也需要一种安全高效的通信机制。
启动信息页包含了内核启动所需要的信息。启动信息页是一个start_info的数据结构,定义在/xen/include/public/xen.h文件。启动信息页包括了分配给domain的内存页面数,xen store通信页表的机器页号,保存共享信息页的物理地址等等。
共享信息页主要是与VCPU和虚拟机状态相关的信息,包括VCPU状态信息,时钟信息和虚拟中断状态信息。共享信息页能够被xen和Guest OS访问,因此可以用来在xen和Guest OS之间共享信息。
超级调用为Guest OS提供了执行特权指令的机制。在linux系统中,内核提供了系统调用功能,这是通过软中断指令(int 0x80)实现的。超级调用也是通过软中断实现的,它使用了0x82这个软中断号。
Xen store类似windows里面的注册表。Xen store存储了各个VM的配置信息,前后端设备的信息,虚拟机状态等等。Xen store是一种高级通信机制,它是基于共享页面和事件通道这两种低级通信机制来实现的。Xen store提供了更高级的操作,它提供了一个具有层次结构的目录,类似linux里面的树形目录。通过xen store可以列出目录,读取值,写入值等等。
Xen bus可以看做是一条虚拟的总线。
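• Vcpu结构:保存vcpu的基本信息(cpu ID,调度相关信息,状态信息等)
• Arch_vcpu结构:保存与体系结构相关的vcpu信息(包含vcpu_guest_context,上下文切换函数指针等)
• Vcpu_guest_context:保存vcpu的寄存器上下文(FPU寄存器,用户寄存器,控制寄存器等)
• Vcpu_info:位于共享信息页,保存事件通道信息和时间信息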
VCPU结构保存了vcpu的基本信息,同时有成员指针指向arch_vcpu结构。Vcpu的基本信息包括cpu ID,vcpu调度相关信息,vcpu状态信息等。
代码清单2-1 VCPU结构
int vcpu_id;
int processor;
vcpu_info_t *vcpu_info;
struct domain *domain;
struct vcpu *next_in_list;
uint64_t periodic_period;
uint64_t periodic_last_event;
struct timer periodic_timer;
struct timer singleshot_timer;
struct timer poll_timer; /* timeout for SCHEDOP_poll */
void *sched_priv; /* scheduler-specific data */
struct vcpu_runstate_info runstate;
/* Has the FPU been initialised? */
bool_t fpu_initialised;
/* Has the FPU been used since it was last saved? */
bool_t fpu_dirtied;
/* Is this VCPU polling any event channels (SCHEDOP_poll)? */
bool_t is_polling;
/* Initialization completed for this VCPU? */
bool_t is_initialised;
/* Currently running on a CPU? */
bool_t is_running;
/* NMI callback pending for this VCPU? */
bool_t nmi_pending;
/* Avoid NMI reentry by allowing NMIs to be masked for short periods. */
bool_t nmi_masked;
/* Require shutdown to be deferred for some asynchronous operation? */
bool_t defer_shutdown;
/* VCPU is paused following shutdown request (d->is_shutting_down)? */
bool_t paused_for_shutdown;
unsigned long pause_flags;
atomic_t pause_count;
u16 virq_to_evtchn[NR_VIRQS];
/* Bitmask of CPUs on which this VCPU may run. */
cpumask_t cpu_affinity;
unsigned long nmi_addr; /* NMI callback address. */
/* Bitmask of CPUs which are holding onto this VCPU's state. */
cpumask_t vcpu_dirty_cpumask;
struct arch_vcpu arch;
代码清单2-2 vcpu运行状态
/* VCPU's current state (RUNSTATE_*). */
int state;
/* When was current state entered (system time, ns)? */
uint64_t state_entry_time;
* Time spent in each RUNSTATE_* (ns). The sum of these times is
* guaranteed not to drift from system time.
uint64_t time[4];
代码清单2-3 Arch_vcpu
/* Needs 16-byte aligment for FXSAVE/FXRSTOR. */
struct vcpu_guest_context guest_context
struct pae_l3_cache pae_l3_cache;
unsigned long flags; /* TF_ */
void (*schedule_tail) (struct vcpu *);
void (*ctxt_switch_from) (struct vcpu *);
void (*ctxt_switch_to) (struct vcpu *);
/* Bounce information for propagating an exception to guest OS. */
struct trap_bounce trap_bounce;
/* I/O-port access bitmap. */
XEN_GUEST_HANDLE(uint8_t) iobmp; /* Guest kernel virtual address of the bitmap. */
int iobmp_limit; /* Number of ports represented in the bitmap. */
int iopl; /* Current IOPL for this VCPU. */
struct desc_struct int80_desc;
/* Virtual Machine Extensions */
struct hvm_vcpu hvm_vcpu;
l1_pgentry_t *perdomain_ptes;
pagetable_t guest_table; /* (MFN) guest notion of cr3 */
/* guest_table holds a ref to the page, and also a type-count unless
* shadow refcounts are in use */
pagetable_t shadow_table[4]; /* (MFN) shadow(s) of guest */
pagetable_t monitor_table; /* (MFN) hypervisor PT (for HVM) */
unsigned long cr3; /* (MA) value to install in HW CR3 */
/* Current LDT details. */
unsigned long shadow_ldt_mapcnt;
struct paging_vcpu paging;
} __cacheline_aligned;
而函数指针ctxt_switch_from 和ctxt_switch_to是要在vcpu切换的时候调用,对于xen的半虚拟化和全虚拟化来说,它们的实现也是各自不同的。
代码清单2-4 vcpu_guest_context
struct vcpu_guest_context {
/* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
#define VGCF_I387_VALID (1<<0)
#define VGCF_IN_KERNEL (1<<2)
#define _VGCF_i387_valid 0
#define VGCF_i387_valid (1<<_VGCF_i387_valid)
#define _VGCF_in_kernel 2
#define VGCF_in_kernel (1<<_VGCF_in_kernel)
#define _VGCF_failsafe_disables_events 3
#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events)
#define _VGCF_syscall_disables_events 4
#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events)
#define _VGCF_online 5
#define VGCF_online (1<<_VGCF_online)
unsigned long flags; /* VGCF_* flags */
struct cpu_user_regs user_regs; /* User-level CPU registers */
struct trap_info trap_ctxt[256]; /* Virtual IDT */
unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
/* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
#ifdef __i386__
unsigned long event_callback_cs; /* CS:EIP of event callback */
unsigned long event_callback_eip;
unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
unsigned long failsafe_callback_eip;
unsigned long event_callback_eip;
unsigned long failsafe_callback_eip;
#ifdef __XEN__
union {
unsigned long syscall_callback_eip;
struct {
unsigned int event_callback_cs; /* compat CS of event cb */
unsigned int failsafe_callback_cs; /* compat CS of failsafe cb */
unsigned long syscall_callback_eip;
unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
#ifdef __x86_64__
/* Segment base addresses. */
uint64_t fs_base;
uint64_t gs_base_kernel;
uint64_t gs_base_user;
struct vcpu_guest_context {
/* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
#define VGCF_I387_VALID (1<<0)
#define VGCF_IN_KERNEL (1<<2)
#define _VGCF_i387_valid 0
#define VGCF_i387_valid (1<<_VGCF_i387_valid)
#define _VGCF_in_kernel 2
#define VGCF_in_kernel (1<<_VGCF_in_kernel)
#define _VGCF_failsafe_disables_events 3
#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events)
#define _VGCF_syscall_disables_events 4
#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events)
#define _VGCF_online 5
#define VGCF_online (1<<_VGCF_online)
unsigned long flags; /* VGCF_* flags */
struct cpu_user_regs user_regs; /* User-level CPU registers */
struct trap_info trap_ctxt[256]; /* Virtual IDT */
unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
/* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
#ifdef __i386__
unsigned long event_callback_cs; /* CS:EIP of event callback */
unsigned long event_callback_eip;
unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
unsigned long failsafe_callback_eip;
unsigned long event_callback_eip;
unsigned long failsafe_callback_eip;
#ifdef __XEN__
union {
unsigned long syscall_callback_eip;
struct {
unsigned int event_callback_cs; /* compat CS of event cb */
unsigned int failsafe_callback_cs; /* compat CS of failsafe cb */
unsigned long syscall_callback_eip;
unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
#ifdef __x86_64__
/* Segment base addresses. */
uint64_t fs_base;
uint64_t gs_base_kernel;
uint64_t gs_base_user;
struct vcpu_guest_context {
/* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
#define VGCF_I387_VALID (1<<0)
#define VGCF_IN_KERNEL (1<<2)
#define _VGCF_i387_valid 0
#define VGCF_i387_valid (1<<_VGCF_i387_valid)
#define _VGCF_in_kernel 2
#define VGCF_in_kernel (1<<_VGCF_in_kernel)
#define _VGCF_failsafe_disables_events 3
#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events)
#define _VGCF_syscall_disables_events 4
#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events)
#define _VGCF_online 5
#define VGCF_online (1<<_VGCF_online)
unsigned long flags; /* VGCF_* flags */
struct cpu_user_regs user_regs; /* User-level CPU registers */
struct trap_info trap_ctxt[256]; /* Virtual IDT */
unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
/* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
#ifdef __i386__
unsigned long event_callback_cs; /* CS:EIP of event callback */
unsigned long event_callback_eip;
unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
unsigned long failsafe_callback_eip;
unsigned long event_callback_eip;
unsigned long failsafe_callback_eip;
#ifdef __XEN__
union {
unsigned long syscall_callback_eip;
struct {
unsigned int event_callback_cs; /* compat CS of event cb */
unsigned int failsafe_callback_cs; /* compat CS of failsafe cb */
unsigned long syscall_callback_eip;
unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
#ifdef __x86_64__
/* Segment base addresses. */
uint64_t fs_base;
uint64_t gs_base_kernel;
uint64_t gs_base_user;
struct vcpu_guest_context {
/* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
#define VGCF_I387_VALID (1<<0)
#define VGCF_IN_KERNEL (1<<2)
#define _VGCF_i387_valid 0
#define VGCF_i387_valid (1<<_VGCF_i387_valid)
#define _VGCF_in_kernel 2
#define VGCF_in_kernel (1<<_VGCF_in_kernel)
#define _VGCF_failsafe_disables_events 3
#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events)
#define _VGCF_syscall_disables_events 4
#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events)
#define _VGCF_online 5
#define VGCF_online (1<<_VGCF_online)
unsigned long flags; /* VGCF_* flags */
struct cpu_user_regs user_regs; /* User-level CPU registers */
struct trap_info trap_ctxt[256]; /* Virtual IDT */
unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
/* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
#ifdef __i386__
unsigned long event_callback_cs; /* CS:EIP of event callback */
unsigned long event_callback_eip;
unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
unsigned long failsafe_callback_eip;
unsigned long event_callback_eip;
unsigned long failsafe_callback_eip;
#ifdef __XEN__
union {
unsigned long syscall_callback_eip;
struct {
unsigned int event_callback_cs; /* compat CS of event cb */
unsigned int failsafe_callback_cs; /* compat CS of failsafe cb */
unsigned long syscall_callback_eip;
unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
#ifdef __x86_64__
/* Segment base addresses. */
uint64_t fs_base;
uint64_t gs_base_kernel;
uint64_t gs_base_user;
struct vcpu_guest_context {
/* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
#define VGCF_I387_VALID (1<<0)
#define VGCF_IN_KERNEL (1<<2)
#define _VGCF_i387_valid 0
#define VGCF_i387_valid (1<<_VGCF_i387_valid)
#define _VGCF_in_kernel 2
#define VGCF_in_kernel (1<<_VGCF_in_kernel)
#define _VGCF_failsafe_disables_events 3
#define VGCF_failsafe_disables_events (1<<_VGCF_failsafe_disables_events)
#define _VGCF_syscall_disables_events 4
#define VGCF_syscall_disables_events (1<<_VGCF_syscall_disables_events)
#define _VGCF_online 5
#define VGCF_online (1<<_VGCF_online)
unsigned long flags; /* VGCF_* flags */
struct cpu_user_regs user_regs; /* User-level CPU registers */
struct trap_info trap_ctxt[256]; /* Virtual IDT */
unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
/* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
#ifdef __i386__
unsigned long event_callback_cs; /* CS:EIP of event callback */
unsigned long event_callback_eip;
unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
unsigned long failsafe_callback_eip;
unsigned long event_callback_eip;
unsigned long failsafe_callback_eip;
#ifdef __XEN__
union {
unsigned long syscall_callback_eip;
struct {
unsigned int event_callback_cs; /* compat CS of event cb */
unsigned int failsafe_callback_cs; /* compat CS of failsafe cb */
unsigned long syscall_callback_eip;
unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
#ifdef __x86_64__
/* Segment base addresses. */
uint64_t fs_base;
uint64_t gs_base_kernel;
uint64_t gs_base_user;
代码清单2-5 Vcpu_info
struct vcpu_info {
* 'evtchn_upcall_pending' is written non-zero by Xen to indicate
* a pending notification for a particular VCPU. It is then cleared
* by the guest OS /before/ checking for pending work, thus avoiding
* a set-and-check race. Note that the mask is only accessed by Xen
* on the CPU that is currently hosting the VCPU. This means that the
* pending and mask flags can be updated by the guest without special
* synchronisation (i.e., no need for the x86 LOCK prefix).
* This may seem suboptimal because if the pending flag is set by
* a different CPU then an IPI may be scheduled even when the mask
* is set. However, note:
* 1. The task of 'interrupt holdoff' is covered by the per-event-
* channel mask bits. A 'noisy' event that is continually being
* triggered can be masked at source at this very precise
* granularity.
* 2. The main purpose of the per-VCPU mask is therefore to restrict
* reentrant execution: whether for concurrency control, or to
* prevent unbounded stack usage. Whatever the purpose, we expect
* that the mask will be asserted only for short periods at a time,
* and so the likelihood of a 'spurious' IPI is suitably small.
* The mask is read before making an event upcall to the guest: a
* non-zero mask therefore guarantees that the VCPU will not receive
* an upcall activation. The mask is cleared when the VCPU requests
* to block: this avoids wakeup-waiting races.
uint8_t evtchn_upcall_pending;
uint8_t evtchn_upcall_mask;
unsigned long evtchn_pending_sel;
struct arch_vcpu_info arch;
struct vcpu_time_info time;
}; /* 64 bytes (x86) */
/*
 * Per-VCPU event-notification and time state. According to the surrounding
 * text this structure lives in the shared info page, so it is visible to
 * both Xen and the guest OS.
 */
struct vcpu_info {
/* Written non-zero by Xen when a notification is pending for this VCPU;
 * the guest clears it before checking for pending work. */
uint8_t evtchn_upcall_pending;
/* Read by Xen before making an event upcall: a non-zero value means the
 * VCPU will not receive an upcall activation. */
uint8_t evtchn_upcall_mask;
/* Selector word: the guest's upcall handler atomically swaps it with 0 and
 * treats each set bit as the index of a word of event channels to scan. */
unsigned long evtchn_pending_sel;
/* Architecture-specific part (type declared elsewhere). */
struct arch_vcpu_info arch;
/* Time information exposed to the guest (type declared elsewhere). */
struct vcpu_time_info time;
}; /* 64 bytes (x86) */
Vcpu_info位于共享信息页,因此可以被Guest OS所访问。它包括event_chan的信息和系统时间信息。
代码清单2-6 init_idle_domain
static void __init init_idle_domain(void)
struct domain *idle_domain;
/* Domain creation requires that scheduler structures are initialised. */
idle_domain = domain_create(IDLE_DOMAIN_ID, 0, 0);
if ( (idle_domain == NULL) || (alloc_vcpu(idle_domain, 0, 0) == NULL) )
idle_vcpu[0] = this_cpu(curr_vcpu) = current;
然后创建一个 domain结构,通过alloc_vcpu为domain分配一个vcpu。这里创建的domain是一个idle domain。Idle domain和idle进程有点像,都是用来填补物理cpu的空闲,如果cpu找不到合适的进程投入运行,那就运行idle 进程,而在xen里面,如果没合适的vcpu运行,就运行idle domain的idle vcpu。
代码清单2-7 Alloc_vcpu
struct vcpu *alloc_vcpu(
struct domain *d, unsigned int vcpu_id, unsigned int cpu_id)
struct vcpu *v;
BUG_ON(d->vcpu[vcpu_id] != NULL);
if ( (v = alloc_vcpu_struct()) == NULL )
return NULL;
v->domain = d;
v->vcpu_id = vcpu_id;
/*设置状态,如果是idle domain,状态设置为运行态,否则设置为离线态*/
v->runstate.state = is_idle_vcpu(v) ? RUNSTATE_running : RUNSTATE_offline;
v->runstate.state_entry_time = NOW();
/*为非idle domain,设置共享信息页*/
if ( !is_idle_domain(d) )
set_bit(_VPF_down, &v->pause_flags);
v->vcpu_info = shared_info_addr(d, vcpu_info[vcpu_id]);
if ( sched_init_vcpu(v, cpu_id) != 0 )
return NULL;
if ( vcpu_initialise(v) != 0 )
return NULL;
d->vcpu[vcpu_id] = v;
if ( vcpu_id != 0 )
d->vcpu[v->vcpu_id-1]->next_in_list = v;
/* Must be called after making new vcpu visible to for_each_vcpu(). */
return v;
创建vcpu之后,要把vcpu和具体的物理处理器绑定,投入调度。因为是一个idle domain,所以立即投入运行。这个工作是通过sched_init_vcpu来完成的。
代码清单2-8 sched_init_vcpu
int sched_init_vcpu(struct vcpu *v, unsigned int processor)
struct domain *d = v->domain;
* Initialize processor and affinity settings. The idler, and potentially
* domain-0 VCPUs, are pinned onto their respective physical CPUs.
v->processor = processor;
if ( is_idle_domain(d) || ((d->domain_id == 0) && opt_dom0_vcpus_pin) )
v->cpu_affinity = cpumask_of_cpu(processor);
/* Initialise the per-vcpu timers. */
init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
v, v->processor);
init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
v, v->processor);
init_timer(&v->poll_timer, poll_timer_fn,
v, v->processor);
/*如英文注解,idle domain立即进入运行*/
/* Idle VCPUs are scheduled immediately. */
if ( is_idle_domain(d) )
per_cpu(schedule_data, v->processor).curr = v;
per_cpu(schedule_data, v->processor).idle = v;
v->is_running = 1;
TRACE_2D(TRC_SCHED_DOM_ADD, v->domain->domain_id, v->vcpu_id);
return SCHED_OP(init_vcpu, v);
periodic_timer和singleshot_timer这两个计时器对Guest OS的运行具有重大意义。periodic_timer是周期计时器,它用来触发时钟中断,控制Guest OS时间的更新;而singleshot_timer是单次计时器,用来帮助Guest OS完成某些与时间相关的任务。这两个计时器在后面将继续分析。
void __init scheduler_init(void)
int i;
open_softirq(SCHEDULE_SOFTIRQ, schedule);
for_each_cpu ( i )
spin_lock_init(&per_cpu(schedule_data, i).schedule_lock);
init_timer(&per_cpu(schedule_data, i).s_timer, s_timer_fn, NULL, i);
for ( i = 0; schedulers[i] != NULL; i++ )
ops = *schedulers[i];
if ( strcmp(ops.opt_name, opt_sched) == 0 )
if ( schedulers[i] == NULL )
printk("Could not find scheduler: %s\n", opt_sched);
printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
static void s_timer_fn(void *unused)
代码清单2-11 虚
struct scheduler {
char *name; /* full name for this scheduler */
char *opt_name; /* option name for this scheduler */
unsigned int sched_id; /* ID for this scheduler */
void (*init) (void);
int (*init_domain) (struct domain *);
void (*destroy_domain) (struct domain *);
int (*init_vcpu) (struct vcpu *);
void (*destroy_vcpu) (struct vcpu *);
void (*sleep) (struct vcpu *);
void (*wake) (struct vcpu *);
struct task_slice (*do_schedule) (s_time_t);
int (*pick_cpu) (struct vcpu *);
int (*adjust) (struct domain *,
struct xen_domctl_scheduler_op *);
void (*dump_settings) (void);
void (*dump_cpu_state) (int);
static void schedule(void)
/* get policy-specific decision on scheduling... */
next_slice = ops.do_schedule(now);
next = next_slice.task;
if ( unlikely(prev == next) )
return continue_running(prev);
vcpu_runstate_change(next, RUNSTATE_running, now);
context_switch(prev, next);
调用调度器的do_schedule函数来找到下一个运行的vcpu,如果下一个vcpu等于当前的vcpu,说明不需要切换,那么调用continue_running继续运行当前的vcpu,否则,要切换next vcpu的状态,然后调用context_switch执行vcpu的切换。
用户和Guest OS都需要控制vcpu的运行,比如用户想暂停domain。Xen提供了超级调用__HYPERVISOR_vcpu_op来完成这个工作。
代码清单2-13 xen
void __init init_IRQ(void)
int i;
for ( i = 0; i < NR_IRQS; i++ )
irq_desc[i].status = IRQ_DISABLED;
irq_desc[i].handler = &no_irq_type;
irq_desc[i].action = NULL;
irq_desc[i].depth = 1;
set_intr_gate(i, interrupt[i]);
for ( i = 0; i < 16; i++ )
vector_irq[LEGACY_VECTOR(i)] = i;
irq_desc[LEGACY_VECTOR(i)].handler = &i8259A_irq_type;
/* Set the clock to HZ Hz */
#define CLOCK_TICK_RATE 1193180 /* crystal freq (Hz) */
#define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ)
outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
outb(LATCH >> 8, PIT_CH0); /* MSB */
setup_irq(2, &cascade);
irq_desc是一个有256个成员的全局数组,每个成员代表一个中断。set_intr_gate(i, interrupt[i])这句是设置中断的通用处理函数。interrupt这个变量的定义非常繁琐,其目的是把256个中断的处理函数都定义为同一个通用的处理函数,即common_interrupt,所有的中断都由这个处理函数来处理。
代码清单2-14 xen
/*
 * BUILD_COMMON_IRQ() expands to an __asm__ block that defines the
 * `common_interrupt` label, the shared entry stub for interrupt vectors.
 *
 * The stub (32-bit x86, AT&T syntax):
 *   1. copies the current stack pointer (%esp) into %eax and pushes it, so
 *      it becomes the single stack argument of the call that follows --
 *      presumably the pointer to the saved register frame that do_IRQ()
 *      receives as its `regs` parameter (TODO confirm against the entry code);
 *   2. calls do_IRQ;
 *   3. discards the 4-byte argument again with `addl $4,%esp`;
 *   4. jumps to ret_from_intr, which is defined outside this macro.
 */
#define BUILD_COMMON_IRQ() \
__asm__( \
"\n" __ALIGN_STR"\n" \
"common_interrupt:\n\t" \
"movl %esp,%eax\n\t" \
"pushl %eax\n\t" \
"call " STR(do_IRQ) "\n\t" \
"addl $4,%esp\n\t" \
"jmp ret_from_intr\n");
代码清单2-15 do_IRQ
asmlinkage void do_IRQ(struct cpu_user_regs *regs)
unsigned int vector = regs->entry_vector;
irq_desc_t *desc = &irq_desc[vector];
struct irqaction *action;
if ( likely(desc->status & IRQ_GUEST) )
desc->status &= ~IRQ_REPLAY;
desc->status |= IRQ_PENDING;
* Since we set PENDING, if another processor is handling a different
* instance of this same irq, the other processor will take care of it.
if ( desc->status & (IRQ_DISABLED | IRQ_INPROGRESS) )
goto out;
desc->status |= IRQ_INPROGRESS;
action = desc->action;
while ( desc->status & IRQ_PENDING )
desc->status &= ~IRQ_PENDING;
action->handler(vector_to_irq(vector), action->dev_id, regs);
desc->status &= ~IRQ_INPROGRESS;
Do_IRQ要根据中断的类型进行不同的处理。如果是IRQ_GUEST类型的中断,说明是由Guest OS处理的中断,要送给VM处理。如果是xen处理的中断,那么调用注册进来的handler函数处理。处理时候要设置中断状态,避免同一个中断再次进入。
代码清单2-16 xen
void __init early_time_init(void)
u64 tmp = calibrate_boot_tsc();
set_time_scale(&per_cpu(cpu_time, 0).tsc_scale, tmp);
do_div(tmp, 1000);
cpu_khz = (unsigned long)tmp;
printk("Detected %lu.%03lu MHz processor.\n",
cpu_khz / 1000, cpu_khz % 1000);
setup_irq(0, &irq0);
虚拟中断要从两方面分析。一个方面是xen是如何发送虚拟中断的,一个方面是Guest OS是如何注册中断处理函数,然后接收xen发送过来的虚拟中断。
代码清单2-17 __do_IRQ_guest
static void __do_IRQ_guest(int vector)
unsigned int irq = vector_to_irq(vector);
irq_desc_t *desc = &irq_desc[vector];
irq_guest_action_t *action = (irq_guest_action_t *)desc->action;
struct domain *d;
int i, sp;
struct pending_eoi *peoi = this_cpu(pending_eoi);
if ( unlikely(action->nr_guests == 0) )
/* An interrupt may slip through while freeing an ACKTYPE_EOI irq. */
ASSERT(action->ack_type == ACKTYPE_EOI);
ASSERT(desc->status & IRQ_DISABLED);
if ( action->ack_type == ACKTYPE_EOI )
sp = pending_eoi_sp(peoi);
ASSERT((sp == 0) || (peoi[sp-1].vector < vector));
peoi[sp].vector = vector;
peoi[sp].ready = 0;
pending_eoi_sp(peoi) = sp+1;
cpu_set(smp_processor_id(), action->cpu_eoi_map);
for ( i = 0; i < action->nr_guests; i++ )
d = action->guest[i];
if ( (action->ack_type != ACKTYPE_NONE) &&
!test_and_set_bit(irq, d->pirq_mask) )
send_guest_pirq(d, irq);
代码清单2-18 xen
void send_guest_pirq(struct domain *d, int pirq)
int port = d->pirq_to_evtchn[pirq];
struct evtchn *chn;
ASSERT(port != 0);
chn = evtchn_from_port(d, port);
evtchn_set_pending(d->vcpu[chn->notify_vcpu_id], port);
在xen通过send_guest_pirq向Guest OS发送事件通知后,Guest OS会调用函数evtchn_do_upcall处理。
代码清单2-19 evtchn_do_upcall
asmlinkage void evtchn_do_upcall(struct pt_regs *regs)
unsigned long l1, l2;
unsigned int l1i, l2i, port, count;
int irq, cpu = smp_processor_id();
shared_info_t *s = HYPERVISOR_shared_info;
vcpu_info_t *vcpu_info = &s->vcpu_info[cpu];
do {
/* Avoid a callback storm when we reenable delivery. */
vcpu_info->evtchn_upcall_pending = 0;
/* Nested invocations bail immediately. */
if (unlikely(per_cpu(upcall_count, cpu)++))
#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */
/* Clear master flag /before/ clearing selector flag. */
l1 = xchg(&vcpu_info->evtchn_pending_sel, 0);
while (l1 != 0) {
l1i = __ffs(l1);
l1 &= ~(1UL << l1i);
while ((l2 = active_evtchns(cpu, s, l1i)) != 0) {
l2i = __ffs(l2);
port = (l1i * BITS_PER_LONG) + l2i;
if ((irq = evtchn_to_irq[port]) != -1)
do_IRQ(irq, regs);
else {
/* If there were nested callbacks then we have more to do. */
count = per_cpu(upcall_count, cpu);
per_cpu(upcall_count, cpu) = 0;
} while (unlikely(count != 1));
时间是整个计算机系统运行的重要概念。VM和 xen的调度,都需要依赖时间来执行。
代码清单2-20 xen
void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
/* Update jiffies counter. */
(*(volatile unsigned long *)&jiffies)++;
/* Rough hack to allow accurate timers to sort-of-work with no APIC. */
if ( !cpu_has_apic )
if ( using_pit )
代码清单2-21 xen
/* Late init function (after all CPUs are booted). */
int __init init_xen_time(void)
{ /*获得cmos的时间*/
wc_sec = get_cmos_time();
/*初始化一个cpu 计时器*/
stime_platform_stamp = 0;
return 0;
代码清单2-22 xen
static void local_time_calibration(void *unused)
/* Record new timestamp information. */
t->tsc_scale.mul_frac = calibration_mul_frac;
t->tsc_scale.shift = tsc_shift;
t->local_tsc_stamp = curr_tsc;
t->stime_local_stamp = curr_local_stime;
t->stime_master_stamp = curr_master_stime;
set_timer(&t->calibration_timer, NOW() + EPOCH);
if ( smp_processor_id() == 0 )
这个初始系统时间是在domain 创建时候,通过update_domain_wallclock_time函数来设置。
代码清单2-23 xen
int do_settimeofday(struct timespec *tv)
time_t sec;
s64 nsec;
unsigned int cpu;
struct shadow_time_info *shadow;
struct xen_platform_op op;
if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
return -EINVAL;
cpu = get_cpu();
shadow = &per_cpu(shadow_time, cpu);
* Ensure we don't get blocked for a long time so that our time delta
* overflows. If that were to happen then our shadow time values would
* be stale, so we can retry with fresh ones.
for (;;) {
nsec = tv->tv_nsec - get_nsec_offset(shadow);
if (time_values_up_to_date(cpu))
sec = tv->tv_sec;
__normalize_time(&sec, &nsec);
if (is_initial_xendomain() && !independent_wallclock) {
op.cmd = XENPF_settime;
op.u.settime.secs = sec;
op.u.settime.nsecs = nsec;
op.u.settime.system_time = shadow->system_timestamp;
} else if (independent_wallclock) {
nsec -= shadow->system_timestamp;
__normalize_time(&sec, &nsec);
__update_wallclock(sec, nsec);
return 0;
在vcpu的创建时,就创建了一个周期计时器,这个计时器每10ms发送一个虚拟时钟中断VIRQ_TIMER给虚拟机。这个虚拟时钟中断同样是通过事件通道的方式发送到Guest OS。
代码清单2-24 xen
static void vcpu_periodic_timer_fn(void *data)
struct vcpu *v = data;
代码清单2-25 xen
static void vcpu_periodic_timer_work(struct vcpu *v)
s_time_t now = NOW();
uint64_t periodic_next_event;
if ( v->periodic_period == 0 )
periodic_next_event = v->periodic_last_event + v->periodic_period;
if ( now > periodic_next_event )
v->periodic_last_event = now;
periodic_next_event = now + v->periodic_period;
v->periodic_timer.cpu = smp_processor_id();
set_timer(&v->periodic_timer, periodic_next_event);
代码清单2-26 xen
void send_timer_event(struct vcpu *v)
/*送VIRQ_TIMER到Guest os*/
send_guest_vcpu_virq(v, VIRQ_TIMER);