Kernel 3.10内核源码分析--KVM相关--虚拟机运行

1、基本原理
KVM虚拟机通过字符设备/dev/kvm的ioctl接口创建和运行,相关原理见之前的文章说明。
虚拟机的运行通过/dev/kvm设备ioctl VCPU接口的KVM_RUN指令实现, VM VCPU 创建好并完成初始化后,就可以调度该虚拟机 运行了,通常,一个VCPU对应于一个线程,虚拟机运行的本质为调度该虚拟机相关的VCPU所在线程运行。 虚拟机 (VCPU) 的运行主要任务是要进行上下文切换,上下文主要包括相关寄存器、 APIC 状态、 TLB 等,通常上下文切换的过程如下:
1、    保存当前的上下文。
2、     使用 kvm_vcpu 结构体中的上下文信息,加载到物理 CPU 中。
3、    执行 kvm_x86_ops 中的 run 函数,调用硬件相关的指令 ( VMLAUNCH) ,进入虚拟机运行环境中
虚拟机运行于qemu-kvm的进程上下文中,从硬件的角度看,虚拟机的运行过程,实质为相关指令的执行过程,虚拟机编译后的代码也就是相应的CPU指令序列,而虚拟机的指令跟Host机的指令执行过程并没有太多的差别,最关键的差别为“敏感指令”(通常为IO、内存等关键操作)的执行,这也是虚拟化实现的本质所在,当在虚拟机中(Guest模式)执行“敏感指令”时,会触发(由硬件触发)VM-exit,使当前CPU从Guest模式(non-root模式)切换到root模式,当前CPU的控制权随之转交给VMM(Hypervisor,KVM中即Host),由VMM进行相应的处理,处理完成后再次通过相应硬件指令(VMLAUNCH),重新进入到Guest模式,从而进入虚拟机运行环境中继续运行
本文简单解释及分析在3.10版本内核代码中的相关流程,用户态qemu-kvm部分暂不包括。


2、大致流程:
Qemu-kvm
可以通过 ioctl(KVM_RUN…) 使虚拟机运行,最终进入内核态,由KVM相关内核流程处理,在内核态执行的大致过程如下:
kvm_vcpu_ioctl
 -->
    kvm_arch_vcpu_ioctl_run
具体由内核函数
kvm_arch_vcpu_ioctl_run 完成相关工作。主要流程如下:


1、    Sigprocmask()屏蔽信号,防止在此过程中受到信号的干扰。

2、    检查当前VCPU状态是否为KVM_MP_STATE_UNINITIALIZED

3、    配置APIC mmio相关信息

4、    VCPU中保存的上下文信息写入指定位置

5、    然后的工作交由__vcpu_run完成

6、    __vcpu_run最终调用vcpu_enter_guest,该函数实现了进入Guest,并执行Guest OS具体指令的操作。       

7、    vcpu_enter_guest最终调用kvm_x86_ops中的run函数运行。对应于Intel平台,该函数为vmx_vcpu_run(设置Guest CR3和其他寄存器、EPT/影子页表相关设置、汇编代码VMLAUNCH切换到非根模式,执行Guest目标代码)

8、    Guest代码执行到敏感指令或因其他原因(比如中断/异常)VM-Exit退出非根模式,返回到vcpu_enter_guest函数继续执行。

9、    vcpu_enter_guest函数中会判断VM-Exit原因,并进行相应处理。

10、处理完成后VM-Entry进入Guest重新执行Guest代码,或重新等待下次调度。


3、代码分析
kvm_vcpu_ioctl():

点击(此处)折叠或打开

  1. /*
  2.   * kvm ioctl VCPU指令的入口,传入的fd为KVM_CREATE_VCPU中返回的fd。
  3.   * 主要针对具体的VCPU进行参数设置。如:相关寄存器的读
  4.   * 写、中断控制等
  5.   */

  6. static long kvm_vcpu_ioctl(struct file *filp,
  7.              unsigned int ioctl, unsigned long arg)
  8. {
  9.     struct kvm_vcpu *vcpu = filp->private_data;
  10.     void __user *argp = (void __user *)arg;
  11.     int r;
  12.     struct kvm_fpu *fpu = NULL;
  13.     struct kvm_sregs *kvm_sregs = NULL;

  14.     if (vcpu->kvm->mm != current->mm)
  15.         return -EIO;

  16. #if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
  17.     /*
  18.      * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
  19.      * so vcpu_load() would break it.
  20.      */
  21.     if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
  22.         return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
  23. #endif

  24.     // KVM虚拟机VCPU数据结构载入物理CPU
  25.     r = vcpu_load(vcpu);
  26.     if (r)
  27.         return r;
  28.     switch (ioctl) {
  29.     /* 
  30.      * 运行虚拟机,最终通过执行VMLAUNCH指令进入non root模式,
  31.      * 进入虚拟机运行。当虚拟机内部执行敏感指令时,由硬
  32.      * 件触发VM-exit,返回到root模式
  33.      */
  34.     case KVM_RUN:
  35.         r = -EINVAL;
  36.         // 不能带参数。
  37.         if (arg)
  38.             goto out;
  39.         // 运行VCPU(即运行虚拟机)的入口函数
  40.         r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run);
  41.         trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
  42.         break;
  43. ...

kvm_vcpu_ioctl()-->kvm_arch_vcpu_ioctl_run ()-->__vcpu_run():

点击(此处)折叠或打开

  1. static int __vcpu_run(struct kvm_vcpu *vcpu)
  2. {
  3.     int r;
  4.     struct kvm *kvm = vcpu->kvm;

  5.     vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
  6.     /*设置vcpu->arch.apic->vapic_page*/
  7.     r = vapic_enter(vcpu);
  8.     if (r) {
  9.         srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
  10.         return r;
  11.     }

  12.     r = 1;
  13.     while (r > 0) {
  14.         /*检查状态*/
  15.         if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
  16.          !vcpu->arch.apf.halted)
  17.          /* 进入Guest模式,最终通过VMLAUNCH指令实现*/
  18.             r = vcpu_enter_guest(vcpu);
  19.         else {/*什么情况下会走到这里?*/
  20.             srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
  21.             /*阻塞VCPU,其实就是schedule()调度出去,但在有特殊情况时(比如有挂起的定时器或信号时),不进行调度而直接退出*/
  22.             kvm_vcpu_block(vcpu);
  23.             vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
  24.             if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
  25.                 kvm_apic_accept_events(vcpu);
  26.                 switch(vcpu->arch.mp_state) {
  27.                 case KVM_MP_STATE_HALTED:
  28.                     vcpu->arch.pv.pv_unhalted = false;
  29.                     vcpu->arch.mp_state =
  30.                         KVM_MP_STATE_RUNNABLE;
  31.                 case KVM_MP_STATE_RUNNABLE:
  32.                     vcpu->arch.apf.halted = false;
  33.                     break;
  34.                 case KVM_MP_STATE_INIT_RECEIVED:
  35.                     break;
  36.                 default:
  37.                     r = -EINTR;
  38.                     break;
  39.                 }
  40.             }
  41.         }

  42.         if (r <= 0)
  43.             break;

  44.         clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
  45.         if (kvm_cpu_has_pending_timer(vcpu))
  46.             kvm_inject_pending_timer_irqs(vcpu);

  47.         if (dm_request_for_irq_injection(vcpu)) {
  48.             r = -EINTR;
  49.             vcpu->run->exit_reason = KVM_EXIT_INTR;
  50.             ++vcpu->stat.request_irq_exits;
  51.         }

  52.         kvm_check_async_pf_completion(vcpu);

  53.         if (signal_pending(current)) {
  54.             r = -EINTR;
  55.             vcpu->run->exit_reason = KVM_EXIT_INTR;
  56.             ++vcpu->stat.signal_exits;
  57.         }
  58.         /*这是kvm中的一个调度时机点,即选择新VCPU运行的时机点*/
  59.         if (need_resched()) {
  60.             srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
  61.             kvm_resched(vcpu);
  62.             vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
  63.         }
  64.     }

  65.     srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);

  66.     vapic_exit(vcpu);

  67.     return r;
  68. }
kvm_vcpu_ioctl()-->kvm_arch_vcpu_ioctl_run()-->__vcpu_run() -->vcpu_enter_guest():

点击(此处)折叠或打开

  1. /* 进入Guest模式,最终通过VMLAUNCH指令实现*/
  2. static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
  3. {
  4.     int r;
  5.     bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
  6.         vcpu->run->request_interrupt_window;
  7.     bool req_immediate_exit = false;
  8.     /*进入Guest模式前先处理相关挂起的请求*/
  9.     if (vcpu->requests) {
  10.         /*卸载MMU*/
  11.         if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
  12.             kvm_mmu_unload(vcpu);
  13.         /*定时器迁移*/
  14.         if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
  15.             __kvm_migrate_timers(vcpu);
  16.         /*主时钟更新*/
  17.         if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
  18.             kvm_gen_update_masterclock(vcpu->kvm);
  19.         /*全局时钟更新*/
  20.         if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
  21.             kvm_gen_kvmclock_update(vcpu);
  22.         /*虚拟机时钟更新*/
  23.         if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
  24.             r = kvm_guest_time_update(vcpu);
  25.             if (unlikely(r))
  26.                 goto out;
  27.         }
  28.         /*更新mmu*/
  29.         if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
  30.             kvm_mmu_sync_roots(vcpu);
  31.         /*刷新TLB*/
  32.         if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
  33.             kvm_x86_ops->tlb_flush(vcpu);
  34.         if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
  35.             vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
  36.             r = 0;
  37.             goto out;
  38.         }
  39.         if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
  40.             vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
  41.             r = 0;
  42.             goto out;
  43.         }
  44.         if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
  45.             vcpu->fpu_active = 0;
  46.             kvm_x86_ops->fpu_deactivate(vcpu);
  47.         }
  48.         if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
  49.             /* Page is swapped out. Do synthetic halt */
  50.             vcpu->arch.apf.halted = true;
  51.             r = 1;
  52.             goto out;
  53.         }
  54.         if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
  55.             record_steal_time(vcpu);
  56.         if (kvm_check_request(KVM_REQ_NMI, vcpu))
  57.             process_nmi(vcpu);
  58.         if (kvm_check_request(KVM_REQ_PMU, vcpu))
  59.             kvm_handle_pmu_event(vcpu);
  60.         if (kvm_check_request(KVM_REQ_PMI, vcpu))
  61.             kvm_deliver_pmi(vcpu);
  62.         if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
  63.             vcpu_scan_ioapic(vcpu);
  64.     }
  65.     // 检查是否有事件请求
  66.     if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
  67.         kvm_apic_accept_events(vcpu);
  68.         if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
  69.             r = 1;
  70.             goto out;
  71.         }
  72.         // 注入阻塞的事件,中断,异常和nmi等
  73.         inject_pending_event(vcpu);

  74.         /* enable NMI/IRQ window open exits if needed */
  75.         /*
  76.          * 使能NMI/IRQ window,参见Intel64 System Programming Guide 25.3节
  77.          * 当使能了interrupt-window exiting或NMI-window exiting(由VMCS中相关字段控制)
  78.          * 表示在刚进入虚拟机后,就会立刻因为有pending或注入的中断导致VM-exit
  79.          */
  80.         if (vcpu->arch.nmi_pending)
  81.             req_immediate_exit =
  82.                 kvm_x86_ops->enable_nmi_window(vcpu) != 0;
  83.         else if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
  84.             req_immediate_exit =
  85.                 kvm_x86_ops->enable_irq_window(vcpu) != 0;

  86.         if (kvm_lapic_enabled(vcpu)) {
  87.             /*
  88.              * Update architecture specific hints for APIC
  89.              * virtual interrupt delivery.
  90.              */
  91.             if (kvm_x86_ops->hwapic_irr_update)
  92.                 kvm_x86_ops->hwapic_irr_update(vcpu,
  93.                     kvm_lapic_find_highest_irr(vcpu));
  94.             update_cr8_intercept(vcpu);
  95.             kvm_lapic_sync_to_vapic(vcpu);
  96.         }
  97.     }
  98.     // 装载MMU,待深入分析
  99.     r = kvm_mmu_reload(vcpu);
  100.     if (unlikely(r)) {
  101.         goto cancel_injection;
  102.     }

  103.     preempt_disable();
  104.     // 进入Guest前期准备,架构相关
  105.     kvm_x86_ops->prepare_guest_switch(vcpu);
  106.     if (vcpu->fpu_active)
  107.         kvm_load_guest_fpu(vcpu);
  108.     kvm_load_guest_xcr0(vcpu);

  109.     vcpu->mode = IN_GUEST_MODE;

  110.     /* We should set ->mode before check ->requests,
  111.      * see the comment in make_all_cpus_request.
  112.      */
  113.     smp_mb();

  114.     local_irq_disable();
  115.     /* 
  116.      * 如果VCPU处于EXITING_GUEST_MODE或者vcpu->requests(?)或者需要调度或者
  117.      * 有挂起的信号,则放弃
  118.      */
  119.     if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
  120.      || need_resched() || signal_pending(current)) {
  121.         vcpu->mode = OUTSIDE_GUEST_MODE;
  122.         smp_wmb();
  123.         local_irq_enable();
  124.         preempt_enable();
  125.         r = 1;
  126.         goto cancel_injection;
  127.     }

  128.     srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
  129.     // req_immediate_exit在前面使能NMI/IRQ window失败时设置,此时需要立即退出,触发重新调度
  130.     if (req_immediate_exit)
  131.         smp_send_reschedule(vcpu->cpu);
  132.     // 计算虚拟机的enter时间
  133.     kvm_guest_enter();
  134.     // 调试相关
  135.     if (unlikely(vcpu->arch.switch_db_regs)) {
  136.         set_debugreg(0, 7);
  137.         set_debugreg(vcpu->arch.eff_db[0], 0);
  138.         set_debugreg(vcpu->arch.eff_db[1], 1);
  139.         set_debugreg(vcpu->arch.eff_db[2], 2);
  140.         set_debugreg(vcpu->arch.eff_db[3], 3);
  141.     }

  142.     trace_kvm_entry(vcpu->vcpu_id);
  143.     // 调用架构相关的run接口(vmx_vcpu_run),进入Guest模式
  144.     kvm_x86_ops->run(vcpu);

  145.     
  146.     // 此处开始,说明已经发生了VM-exit,退出了Guest模式
  147.     /*
  148.      * If the guest has used debug registers, at least dr7
  149.      * will be disabled while returning to the host.
  150.      * If we don't have active breakpoints in the host, we don't
  151.      * care about the messed up debug address registers. But if
  152.      * we have some of them active, restore the old state.
  153.      */
  154.     if (hw_breakpoint_active())
  155.         hw_breakpoint_restore();
  156.     /*记录Guest退出前的TSC时钟*/
  157.     vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
  158.                              native_read_tsc());
  159.     // 设置模式
  160.     vcpu->mode = OUTSIDE_GUEST_MODE;
  161.     smp_wmb();

  162.     /* Interrupt is enabled by handle_external_intr() */
  163.     kvm_x86_ops->handle_external_intr(vcpu);

  164.     ++vcpu->stat.exits;

  165.     /*
  166.      * We must have an instruction between local_irq_enable() and
  167.      * kvm_guest_exit(), so the timer interrupt isn't delayed by
  168.      * the interrupt shadow. The stat.exits increment will do nicely.
  169.      * But we need to prevent reordering, hence this barrier():
  170.      */
  171.     barrier();
  172.     // 计算虚拟机的退出时间,其中还开中断了?
  173.     kvm_guest_exit();
  174.     
  175.     preempt_enable();

  176.     vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);

  177.     /*
  178.      * Profile KVM exit RIPs:
  179.      */
  180.     // Profile(采样计数,用于性能分析和调优)相关
  181.     if (unlikely(prof_on == KVM_PROFILING)) {
  182.         unsigned long rip = kvm_rip_read(vcpu);
  183.         profile_hit(KVM_PROFILING, (void *)rip);
  184.     }

  185.     if (unlikely(vcpu->arch.tsc_always_catchup))
  186.         kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

  187.     if (vcpu->arch.apic_attention)
  188.         kvm_lapic_sync_from_vapic(vcpu);
  189.     /* 
  190.      * 调用vmx_handle_exit()处理虚拟机异常,异常原因及其它关键信息
  191.      * 已经在之前获取。
  192.      */
  193.     r = kvm_x86_ops->handle_exit(vcpu);
  194.     return r;

  195. cancel_injection:
  196.     kvm_x86_ops->cancel_injection(vcpu);
  197.     if (unlikely(vcpu->arch.apic_attention))
  198.         kvm_lapic_sync_from_vapic(vcpu);
  199. out:
  200.     return r;
  201. }

kvm_vcpu_ioctl()-->kvm_arch_vcpu_ioctl_run()-->__vcpu_run()-->vcpu_enter_guest() -->vmx_vcpu_run():

点击(此处)折叠或打开

  1. /*
  2.   * 运行虚拟机,进入Guest模式,即non root模式
  3.   */
  4. static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
  5. {
  6.     struct vcpu_vmx *vmx = to_vmx(vcpu);
  7.     unsigned long debugctlmsr;

  8.     /* Record the guest's net vcpu time for enforced NMI injections. */
  9.     // nmi注入?跟nmi_watchdog相关?
  10.     if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
  11.         vmx->entry_time = ktime_get();

  12.     /* Don't enter VMX if guest state is invalid, let the exit handler
  13.      start emulation until we arrive back to a valid state */
  14.     if (vmx->emulation_required)
  15.         return;

  16.     if (vmx->nested.sync_shadow_vmcs) {
  17.         copy_vmcs12_to_shadow(vmx);
  18.         vmx->nested.sync_shadow_vmcs = false;
  19.     }
  20.     // 写入Guest的RSP寄存器信息至VMCS相关位置中
  21.     if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
  22.         vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
  23.     // 写入Guest的RIP寄存器信息至VMCS相关位置中
  24.     if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
  25.         vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);

  26.     /* When single-stepping over STI and MOV SS, we must clear the
  27.      * corresponding interruptibility bits in the guest state. Otherwise
  28.      * vmentry fails as it then expects bit 14 (BS) in pending debug
  29.      * exceptions being set, but that's not correct for the guest debugging
  30.      * case. */
  31.     // 单步调试时,需要禁用Guest中断
  32.     if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
  33.         vmx_set_interrupt_shadow(vcpu, 0);

  34.     atomic_switch_perf_msrs(vmx);
  35.     debugctlmsr = get_debugctlmsr();
  36.     // vmx->__launched用于判断当前VCPU是否已经VMLAUNCH了
  37.     vmx->__launched = vmx->loaded_vmcs->launched;
  38.     // 执行VMLAUNCH指令进入Guest模式,虚拟机开始运行
  39.     asm(
  40.         /* Store host registers */
  41.         /*将相关寄存器压栈*/
  42.         "push %%" _ASM_DX "; push %%" _ASM_BP ";"/*BP压栈*/
  43.         /*为guest的rcx寄存器保留个位置,所以这里压两次栈*/
  44.         "push %%" _ASM_CX " \n\t" /* placeholder for guest rcx */
  45.         "push %%" _ASM_CX " \n\t"
  46.         /*
  47.          * %c表示用来表示使用立即数替换,但不使用立即数的语法,at&t汇编中表示立即数的语法前面有一个$,而用了%c后,就去掉了这个$。
  48.          * 主要是用在间接寻址的情况,这种情况下如果直接使用$立即数的方式的话,会报语法错误。
  49.          * [host_rsp]是后面输入部分定义的tag,使用%tag方式可以直接引用,%0是后面输入输出部分中的第一个操作数,即vmx,这里是间接寻址
  50.          * %c[host_rsp](%0)整体来看就是vmx(以寄存器ecx传入)中的host_rsp成员。
  51.          * 所以,如下语句的整体含义就是比较当前SP寄存器和vmx->host_rsp的值。
  52.          */
  53.         /*如果当前RSP和vmx->rsp相等,那就不用mov了,否则将当前RSP保存到vmx中*/
  54.         "cmp %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
  55.         "je 1f \n\t"
  56.         "mov %%" _ASM_SP ", %c[host_rsp](%0) \n\t"
  57.        /*
            * 执行ASM_VMX_VMWRITE_RSP_RDX指令(Writes the contents of a primary source operand (register or memory) to a specified field in a VMCS,即将RSP的值写入vmcs中,field由RDX寄存器指定,
            * 而此时的RDX寄存器的内容由后面的约束条件:"d"((unsigned long)HOST_RSP指定为HOST_RSP,所以这句命令的作用为:将rsp的值写vmcs,field是HOST_RSP。),
            * 当出现异常时直接重启,由__ex()实现
            */

  58.         __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
  59.         "1: \n\t"
  60.         /* Reload cr2 if changed */
  61.         /*比较当前CR2寄存器和vmx中保存的CR2寄存器内容,如果不相等,就从vmx中重新CR2内容到当前CR2寄存器中*/
  62.         "mov %c[cr2](%0), %%" _ASM_AX " \n\t"
  63.         "mov %%cr2, %%" _ASM_DX " \n\t"
  64.         "cmp %%" _ASM_AX ", %%" _ASM_DX " \n\t"
  65.         "je 2f \n\t"
  66.         "mov %%" _ASM_AX", %%cr2 \n\t"
  67.         "2: \n\t"
  68.         /* Check if vmlaunch of vmresume is needed */
  69.         /*判断vcpu_vmx->__launched,确认是否需要执行VMLAUNCH*/
  70.         "cmpl $0, %c[launched](%0) \n\t"
  71.         /* Load guest registers. Don't clobber flags. */
  72.         /*加载guest寄存器,其实就是从vmx中加载*/
  73.         "mov %c[rax](%0), %%" _ASM_AX " \n\t"
  74.         "mov %c[rbx](%0), %%" _ASM_BX " \n\t"
  75.         "mov %c[rdx](%0), %%" _ASM_DX " \n\t"
  76.         "mov %c[rsi](%0), %%" _ASM_SI " \n\t"
  77.         "mov %c[rdi](%0), %%" _ASM_DI " \n\t"
  78.         "mov %c[rbp](%0), %%" _ASM_BP " \n\t"
  79. #ifdef CONFIG_X86_64
  80.         "mov %c[r8](%0), %%r8 \n\t"
  81.         "mov %c[r9](%0), %%r9 \n\t"
  82.         "mov %c[r10](%0), %%r10 \n\t"
  83.         "mov %c[r11](%0), %%r11 \n\t"
  84.         "mov %c[r12](%0), %%r12 \n\t"
  85.         "mov %c[r13](%0), %%r13 \n\t"
  86.         "mov %c[r14](%0), %%r14 \n\t"
  87.         "mov %c[r15](%0), %%r15 \n\t"
  88. #endif
  89.         "mov %c[rcx](%0), %%" _ASM_CX " \n\t" /* kills %(ecx) */

  90.         /* Enter guest mode */
  91.         "jne 1f \n\t"
  92.         /* 执行VMLAUNCH指令,进入Guest模式*/
  93.         __ex(ASM_VMX_VMLAUNCH) "\n\t"
  94.         "jmp 2f \n\t"
  95.         /* 执行VMRESUME指令,从Guest模式恢复到root模式*/
  96.         "1: " __ex(ASM_VMX_VMRESUME) "\n\t"
  97.         "2: "
  98.         /* Save guest registers, load host registers, keep flags */
  99.         "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"
  100.         "pop %0 \n\t"
  101.         "mov %%" _ASM_AX ", %c[rax](%0) \n\t"
  102.         "mov %%" _ASM_BX ", %c[rbx](%0) \n\t"
  103.         __ASM_SIZE(pop) " %c[rcx](%0) \n\t"
  104.         "mov %%" _ASM_DX ", %c[rdx](%0) \n\t"
  105.         "mov %%" _ASM_SI ", %c[rsi](%0) \n\t"
  106.         "mov %%" _ASM_DI ", %c[rdi](%0) \n\t"
  107.         "mov %%" _ASM_BP ", %c[rbp](%0) \n\t"
  108. #ifdef CONFIG_X86_64
  109.         "mov %%r8, %c[r8](%0) \n\t"
  110.         "mov %%r9, %c[r9](%0) \n\t"
  111.         "mov %%r10, %c[r10](%0) \n\t"
  112.         "mov %%r11, %c[r11](%0) \n\t"
  113.         "mov %%r12, %c[r12](%0) \n\t"
  114.         "mov %%r13, %c[r13](%0) \n\t"
  115.         "mov %%r14, %c[r14](%0) \n\t"
  116.         "mov %%r15, %c[r15](%0) \n\t"
  117. #endif
  118.         "mov %%cr2, %%" _ASM_AX " \n\t"
  119.         "mov %%" _ASM_AX ", %c[cr2](%0) \n\t"

  120.         "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t"
  121.         "setbe %c[fail](%0) \n\t"
  122.         ".pushsection .rodata \n\t"
  123.         ".global vmx_return \n\t"
  124.         "vmx_return: " _ASM_PTR " 2b \n\t"
  125.         ".popsection"
  126.      : : "c"(vmx), "d"((unsigned long)HOST_RSP),
  127.         [launched]"i"(offsetof(struct vcpu_vmx, __launched)),
  128.         [fail]"i"(offsetof(struct vcpu_vmx, fail)),
  129.         /*[host_rsp]是tag,可以在前面以%[host_rsp]方式引用*/
  130.         [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
  131.         [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
  132.         [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
  133.         [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
  134.         [rdx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDX])),
  135.         [rsi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RSI])),
  136.         [rdi]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RDI])),
  137.         [rbp]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBP])),
  138. #ifdef CONFIG_X86_64
  139.         [r8]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R8])),
  140.         [r9]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R9])),
  141.         [r10]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R10])),
  142.         [r11]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R11])),
  143.         [r12]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R12])),
  144.         [r13]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R13])),
  145.         [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
  146.         [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
  147. #endif
  148.         [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
  149.         [wordsize]"i"(sizeof(ulong))
  150.      : "cc", "memory"/*clobber list,cc表示寄存器,memory表示内存*/
  151. #ifdef CONFIG_X86_64
  152.         , "rax", "rbx", "rdi", "rsi"
  153.         , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
  154. #else
  155.         , "eax", "ebx", "edi", "esi"
  156. #endif
  157.      );
  158.     // 运行到这里,说明已经发生了VM-exit,返回到了root模式
  159.     /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
  160.     if (debugctlmsr)
  161.         update_debugctlmsr(debugctlmsr);

  162. #ifndef CONFIG_X86_64
  163.     /*
  164.      * The sysexit path does not restore ds/es, so we must set them to
  165.      * a reasonable value ourselves.
  166.      *
  167.      * We can't defer this to vmx_load_host_state() since that function
  168.      * may be executed in interrupt context, which saves and restore segments
  169.      * around it, nullifying its effect.
  170.      */
  171.     /*重新加载ds/es段寄存器,因为VM-exit不会自动加载他们*/
  172.     loadsegment(ds, __USER_DS);
  173.     loadsegment(es, __USER_DS);
  174. #endif

  175.     vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
  176.                  | (1 << VCPU_EXREG_RFLAGS)
  177.                  | (1 << VCPU_EXREG_CPL)
  178.                  | (1 << VCPU_EXREG_PDPTR)
  179.                  | (1 << VCPU_EXREG_SEGMENTS)
  180.                  | (1 << VCPU_EXREG_CR3));
  181.     vcpu->arch.regs_dirty = 0;
  182.     // 从硬件VMCS中读取中断向量表信息
  183.     vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

  184.     vmx->loaded_vmcs->launched = 1;
  185.     // 从硬件VMCS中读取VM-exit原因信息,这些信息是VM-exit过程中由硬件自动写入的
  186.     vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
  187.     trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX);
  188.     /*处理MCE异常和NMI中断*/
  189.     vmx_complete_atomic_exit(vmx);
  190.     vmx_recover_nmi_blocking(vmx);
  191.     vmx_complete_interrupts(vmx);
  192. }

你可能感兴趣的:(Kernel,虚拟化)