2.2.1 VMX 的基本工作流程与操作
Intel VT 硬件虚拟化技术在硬件级别上完成计算机的虚拟化。为实现硬件虚拟化, VT 增加了 12 条新的 CPU VMX 指令:
a. VMCS控制 5 条:VMPTRLD VMPTRST VMCLEAR VMREAD VMWRITE
b.VMX命令 5条:VMLAUNCH VMCALL VMXON VMXOFF VMRESUME
c. 缓存(TLB)刷新 2条:INVEPT INVVPID
arch\x86\include\asm\vmx.h 包含这些指令的opcode定义
#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7,0x30"
#define ASM_VMX_INVEPT ".byte 0x66, 0x0f,0x38, 0x80, 0x08"
#define ASM_VMX_INVVPID ".byte 0x66, 0x0f,0x38, 0x81, 0x08"
基本操作的源代码在arch\x86\kvm\vmx.c中 (AMD cpu在svm.c中)
vmcs_load, vmcs_clear, vmcs_readl, vmcs_writel
kvm_cpu_vmxoff, kvm_cpu_vmxon
__invept, __invvpid
vmresume则在vmx_vcpu_run中被调用
(1)VMX控制命令
在KVM Host CPU处于VMMonitor状态下使用.
KVM初始化一个VMCS (Virtual Machine Control Structure)内存区域后,启动VMM (Virtual Machine Monitor); virtual machine启动后, CPU通过VM-Entry进入Guest OS(non-root).
当有某些异常或中断发生时或一些寄存器访问,CPU VM-Exit 到VMM(vm-root), VMM代码(KVM+QEMU)会处理导致VM-Exit发生的原因,然后重新VM-Entry使Guest-OS继续执行。
要让CPU进入VMM模式,需要调用VMXON指令, 而VMXOFF使CPU退出VMM状态
VMLAUNCH用于首次发起VM-Entry, VMRESUME用于从VM-EXIT后重新VM-Entry
VMCALL用于从non-root调用,并产生vm-exit
(2) VMCS
1) VMCS 是一个4K的内存区域.在逻辑上,虚拟机控制结构被划分为 6 部分:
1) GUEST-STATE 域:虚拟机从根操作模式进入非根操作模式时,处理器所处的状态;
2) HOST-STATE 域:虚拟机从非根操作模式退出到根操作模式时,处理器所处的状态;
3) VM 执行控制域:虚拟机在非根操作模式运行的时候,控制处理器在哪些情况下从非根操作模式退出到根操作模式;
4) VM 退出控制域:虚拟机从非根操作模式下退出时,需要保存的信息;
5) VM 进入控制域:虚拟机从根操作模式进入非根操作模式时,需要读取的信息;
6) VM 退出信息域:虚拟机从非根操作模式退出到根操作模式时,将退出的原因保存到该域中。
VMREAD VMWRITE 用于读写这些字段
VMCS有三种属性: active/inactive, current/non-current, launched/clear.
Active表明VMCS已经加载过。
vmptrld 将使vmcs进入active, 但只有最后一个执行该指令的vmcs处于current状态
VMREAD, VMWRITE, VMLAUNCH等指令只能操作在current vmcs上
vmclear使vmcs进入inactive和clear状态
一旦VM-Entry后就进入launched状态
(3) Cache刷新
INVEPT用于刷新GPA->HPA(EPT)相关的缓存
INVVPID用于刷新线性地址到物理地址映射的缓存
2.2.2 KVM初始化
这一节我们根据用户接口的调用流程分析kvm的初始化
(1) 内核KVM模块初始化
static int __init vmx_init(void) (arch\x86\kvm\vmx.c) ==>
kvm_init(virt\kvm\kvm_main.c) ==> kvm_arch_init
a. r = kvm_mmu_module_init(); 内存管理结构初始化
b. kvm_init_msr_list
c. kvm_timer_init, kvm_lapic_init
kvm_init==> kvm_arch_hardware_setup ==>kvm_x86_ops->hardware_setup = hardware_setup(vmx.c)
a. setup_vmcs_config: 初始化 static struct vmcs_config 全局变量vmcs_config
b. alloc_kvm_area 为每个cpu调用alloc_vmcs_cpu分配vmcs区域
static struct vmcs*alloc_vmcs_cpu(int cpu) {
int node = cpu_to_node(cpu);
struct page *pages;
struct vmcs *vmcs;
pages = alloc_pages_exact_node(node, GFP_KERNEL,vmcs_config.order);
if (!pages)
return NULL;
vmcs = page_address(pages);
memset(vmcs, 0, vmcs_config.size);
vmcs->revision_id = vmcs_config.revision_id; /* vmcs revisionid */
return vmcs;
}
per_cpu(vmxarea, cpu) = vmcs;
struct vmcs {
u32 revision_id;
u32 abort;
char data[0];
};
(2) VM的创建与初始化
首先分析vm创建的流程:
kvm_dev_ioctl_create_vm ==>kvm_create_vm
A. kvm*kvm = kvm_arch_alloc_vm();
创建kvm虚拟机结构, 定义如下:
struct kvm {
。。。。。。
struct mm_struct *mm; /* userspace tied tothis vm */
struct kvm_memslots *memslots;
。。。。。。
struct kvm_arch arch;
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
}
B. kvm_arch_init_vm, 初始化struct kvm_arch arch字段
其中注册了
INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work,kvmclock_sync_fn);
C. hardware_enable_all
对每个cpu通过smp call 调用hardware_enable_nolock ==》 kvm_x86_ops->hardware_enable
a) crash_enable_local_vmclear 在crash vmclear位图中置位当前cpu,
b) 判断MSR_IA32_FEATURE_CONTROL寄存器中VMX相关的使能位是否已设置,未设置则将所需的位写入该寄存器,
c) 将CR4的X86_CR4_VMXE位置位: write_cr4(read_cr4() | X86_CR4_VMXE);
d) if (vmm_exclusive)//只有一个VMM时
{
kvm_cpu_vmxon打开VMX操作模式,
ept_sync_global 刷新ept缓存.
}
kvm->mm = current->mm;
e) 初始化srcu
f) kvm_eventfd_init
g) 初始化内存管理结构并list_add(&kvm->vm_list, &vm_list); 加入到全局vm_list中
D. kvm_dev_ioctl_create_vm ==> kvm_coalesced_mmio_init初始化mmio
(3) vcpu的创建与初始化
kvm_vm_ioctl_create_vcpu ==>kvm_arch_vcpu_create => vmx_create_vcpu
1) vcpu_vmx *vmx =kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
为vcpu分配 vpid: allocate_vpid(vmx);
2) kvm_vcpu_init
a) vcpu->run = page_address(page); 分配run结构, 该结构映射到qemu
b) kvm_arch_vcpu_init初始化vcpu->arch结构
struct kvm_vcpu 定义在 include\linux\kvm_host.h
struct kvm_vcpu_arch arch; 定义在 arch\x86\include\asm\kvm_host.h
3) vmx操作:
vmx->loaded_vmcs->vmcs= alloc_vmcs(); //分配vmcs到vmx->loaded_vmcs
if(!vmx->loaded_vmcs->vmcs)
gotofree_msrs;
if (!vmm_exclusive) //不只有一个 VMM时在此时, vmxon
kvm_cpu_vmxon(__pa(per_cpu(vmxarea,raw_smp_processor_id())));
loaded_vmcs_init(vmx->loaded_vmcs);//vmclear
if(!vmm_exclusive)
kvm_cpu_vmxoff();//vmxoff
4) vmx_vcpu_load
if(!vmm_exclusive)
kvm_cpu_vmxon(phys_addr);
else if(vmx->loaded_vmcs->cpu != cpu)
loaded_vmcs_clear(vmx->loaded_vmcs);
if(per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
per_cpu(current_vmcs,cpu) = vmx->loaded_vmcs->vmcs;
vmcs_load(vmx->loaded_vmcs->vmcs);// VMPTRLD
}
5) 第一次初始化时执行下面代码
kvm_make_request(KVM_REQ_TLB_FLUSH,vcpu);
。。。。。。
vmcs_writel(HOST_TR_BASE,kvm_read_tr_base()); /* 22.2.4 */
vmcs_writel(HOST_GDTR_BASE,gdt->address); /* 22.2.4 */
rdmsrl(MSR_IA32_SYSENTER_ESP,sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP,sysenter_esp); /* 22.2.3 */
vmx->loaded_vmcs->cpu= cpu;
kvm_make_request管理软件标志位的设置, 当准备进入vm-entry时将集中处理这些标志
vmcs_writel将设置vmcs相关字段,
6) vmx_vcpu_setup初始化vmcs的各个字段;下一节将分析VMCS字段的设置
7) 分配alloc_apic_access_page apic页
接下来分析vcpu寄存器的初始化
kvm_vm_ioctl_create_vcpu==》kvm_arch_vcpu_setup==》kvm_arch_vcpu_load
1) vcpu_load ==》vmx_vcpu_load
2) kvm_vcpu_reset==》 kvm_x86_ops->vcpu_reset =vmx_vcpu_reset
初始化vmcs中的寄存器, 例子
seg_setup(VCPU_SREG_SS);
vmcs_write16(GUEST_TR_SELECTOR, 0);
vmcs_writel(GUEST_SYSENTER_EIP, 0);
3) kvm_mmu_setup==》init_kvm_mmu 初始化 mmu管理模块
当ept enable时 tdp_enabled = true;
init_kvm_mmu ==> init_kvm_tdp_mmu(vcpu)
(4) vCPU 的运行
kvm_arch_vcpu_ioctl_run==> _vcpu_run ==> vcpu_enter_guest
a) 处理 vcpu->requests 中的请求 (由kvm_make_request设置)
b) kvm_mmu_reload ==> kvm_mmu_load
c) kvm_x86_ops->prepare_guest_switch = vmx_save_host_state
保存当前host的状态到vmcs host区域和vmx->host_state变量
d) kvm_guest_enter
e) kvm_x86_ops->run(vcpu) = vmx_vcpu_run
下面分析vmx_vcpu_run
1) 从 vcpu->arch.regs设置guest rip, rsp
if(test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
if (test_bit(VCPU_REGS_RIP, (unsigned long*)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
vcpu->arch.regs由qemu设置, 参考前一节2.13.3
2) /* Store host registers */
"push %%" _ASM_DX "; push %%" _ASM_BP";"
"push %%" _ASM_CX " \n\t" /* placeholderfor guest rcx */
"push %%" _ASM_CX " \n\t"
"cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
"je 1f \n\t"
"mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
__ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
保存host的寄存器
3) /* Reload cr2 if changed */
"mov %c[cr2](%0), %%" _ASM_AX " \n\t"
"mov %%cr2, %%" _ASM_DX " \n\t"
"cmp %%" _ASM_AX ", %%" _ASM_DX "\n\t"
"je 2f \n\t"
"mov %%" _ASM_AX", %%cr2 \n\t"
"2: \n\t"
重新加载cr2: 从vcpu对应的结构体中读取guest的cr2值,若与当前cr2寄存器的值不同,则写入cr2寄存器
4) /* Check if vmlaunch of vmresume is needed */
"cmpl $0, %c[launched](%0) \n\t"
判断使用vmlaunch还是vmresume: 若该vmcs此前已经launch过(launched不为0),则使用vmresume;否则(首次进入)需要使用vmlaunch
5) /* Load guestregisters. Don't clobber flags. */
"mov %c[rax](%0), %%" _ASM_AX " \n\t"
"mov %c[rbx](%0), %%" _ASM_BX " \n\t"
.......
加载guest os相关的寄存器,从vcpu对应的结构体中获取
6) /* Enter guest mode */
"jne 1f \n\t"
__ex(ASM_VMX_VMLAUNCH) "\n\t"
"jmp 2f \n\t"
"1: " __ex(ASM_VMX_VMRESUME) "\n\t"
"2: "
调用VM launch或者vm resume进入non root state,运行vcpu。
7) 当vm-exit发生时才执行以下代码
/* Save guest registers, load host registers, keep flags */
"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
"pop %0 \n\t"
"mov %%" _ASM_AX ", %c[rax](%0) \n\t"
"mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
__ASM_SIZE(pop) " %c[rcx](%0) \n\t"
"mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
"mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
"mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
"mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
保存guest os对应寄存器的值到对应变量中
"pop %%"_ASM_BP "; pop %%" _ASM_DX" \n\t"
"setbe %c[fail](%0) \n\t"
恢复host os的寄存器取值
8) 获取vm_exit 信息
vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
| (1 << VCPU_EXREG_RFLAGS)
| (1 << VCPU_EXREG_PDPTR)
| (1 << VCPU_EXREG_SEGMENTS)
| (1 << VCPU_EXREG_CR3));
vcpu->arch.regs_dirty = 0;
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
vmx->loaded_vmcs->launched = 1;
vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
(5) VCPU的调度保证
多核的环境下如果cpu1上启用了vmx,cpu2没有;如果当前线程从cpu1切换到cpu2,那么就发生异常,KVM为了避免这种情况,采用如下方法:
在vcpu n的线程被sched out和sched in的时候,采用某种机制(preempt notifier)来得知这个事件,并且对vmcs等作出处理。
int vcpu_load(struct kvm_vcpu *vcpu)
{
int cpu;
cpu = get_cpu();
preempt_notifier_register(&vcpu->preempt_notifier);
kvm_arch_vcpu_load(vcpu, cpu);
put_cpu();
return 0;
}
static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
{
struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
if (vcpu->preempted)
vcpu->preempted = false;
kvm_arch_vcpu_load(vcpu, cpu);
}
static void kvm_sched_out(struct preempt_notifier *pn, struct task_struct *next)
{
struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
if (current->state == TASK_RUNNING)
vcpu->preempted = true;
kvm_arch_vcpu_put(vcpu);
}
这样当cpu切换时,就可以恢复与保存vcpu的设置了。