qemu-kvm 内存虚拟化---ept

qemu-kvm内存虚拟化
内存虚拟化实际就是进行地址转换:客户机虚拟地址-->客户机物理地址-->宿主机物理地址。转换的实现方式有两种:硬件内存虚拟化和软件影子页表。下面主要分析基于intel ept的硬件内存虚拟化实现,此实现主要做两件事情:
1.开启ept功能;2.构造转换页表。注意该页表的构造采用动态方式(常说的懒惰方式),即不到万不得已不创建。页表是通过捕获ept violation一步一步创建起来的,人看起来觉得十分费劲,但机器擅长做这种费劲的事情。
我们还得从vcpu_enter_guest这个函数说起,可见此函数的重要性:虚拟机每一次运行,都必须载入ept页表。
/*
 * Abbreviated excerpt of vcpu_enter_guest(): only the kvm_mmu_reload() call
 * is shown. The declaration of r, the handling of its value and the rest of
 * the function body are not part of this excerpt.
 */
static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
{
     r = kvm_mmu_reload(vcpu);
}

kvm_mmu_reload在根页表(root_hpa)无效时会调用kvm_mmu_load。如果cr3内容无效,则分配物理页作为ept页表的根,并把此物理页地址作为cr3寄存器内容,也就是ept根目录,所有页表查询和转换都基于它进行;如果有效就不必再分配,直接使用。
int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
        int r;
        
        r = mmu_topup_memory_caches(vcpu);
        if (r)  
                goto out;
        spin_lock(&vcpu->kvm->mmu_lock);
        kvm_mmu_free_some_pages(vcpu);
        spin_unlock(&vcpu->kvm->mmu_lock);
        r = mmu_alloc_roots(vcpu);
        spin_lock(&vcpu->kvm->mmu_lock);
        mmu_sync_roots(vcpu);
        spin_unlock(&vcpu->kvm->mmu_lock);
        if (r)
                goto out;
        /* set_cr3() should ensure TLB has been flushed */
        kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
out:
        return r;       
}


读取EXIT_QUALIFICATION的内容,了解ept violation退出的原因,例如由读、写等访问引起。
static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qualification;
        gpa_t gpa;
        int gla_validity;

        exit_qualification = vmcs_readl(EXIT_QUALIFICATION);

        if (exit_qualification & (1 << 6)) {
                printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
                return -EINVAL;
        }

        gla_validity = (exit_qualification >> 7) & 0x3;
        if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) {
                printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
                printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
                        (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
                        vmcs_readl(GUEST_LINEAR_ADDRESS));
                printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
                        (long unsigned int)exit_qualification);
                vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
                vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
                return 0;
        }       
        
        gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
        trace_kvm_page_fault(gpa, exit_qualification);
        return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0);



VMM 将截获此故障,handle_ept_violation函数被调用,通过EPT的故障处理函数tdp_page_fault进行GPA到HPA处理。如果相应ept页表不存在,构建此页表。

/*
 * Page-fault handler that resolves the faulting address to a host pfn and
 * installs the mapping through __direct_map().
 *
 * @vcpu:       vcpu that took the fault.
 * @gpa:        faulting address (declared as gva_t, but shifted by
 *              PAGE_SHIFT here to obtain a gfn).
 * @error_code: fault error code; only PFERR_WRITE_MASK is examined here.
 *
 * Returns:
 *   the non-zero result of mmu_topup_memory_caches() if that fails;
 *   -EFAULT or 1 when gfn_to_pfn() yields an error pfn;
 *   0 when mmu_notifier_retry() reports that the lookup must be redone;
 *   otherwise the result of __direct_map().
 */
static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,

                                u32 error_code)
{
        pfn_t pfn;
        int r;
        int level;
        gfn_t gfn = gpa >> PAGE_SHIFT;
        unsigned long mmu_seq;

        ASSERT(vcpu);
        ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));

        r = mmu_topup_memory_caches(vcpu);
        if (r)
                return r;

        /*
         * Sample the notifier sequence before looking up the pfn; it is
         * compared again under mmu_lock by mmu_notifier_retry() below.
         */
        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
        level = mapping_level(vcpu, gfn);

        /* Round gfn down to the first page of the chosen mapping level. */
        gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);

        pfn = gfn_to_pfn(vcpu->kvm, gfn);
        if (is_error_pfn(pfn)) {
                kvm_release_pfn_clean(pfn);
                return is_fault_pfn(pfn) ? -EFAULT : 1;
        }
        spin_lock(&vcpu->kvm->mmu_lock);
        if (mmu_notifier_retry(vcpu, mmu_seq))
                goto out_unlock;
        kvm_mmu_free_some_pages(vcpu);
        r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
                         level, gfn, pfn);
        spin_unlock(&vcpu->kvm->mmu_lock);
        return r;

out_unlock:
        /* Retry path: drop the lock and the pfn reference, map nothing. */
        spin_unlock(&vcpu->kvm->mmu_lock);
        kvm_release_pfn_clean(pfn);
        return 0;
}
mmu_topup_memory_caches(vcpu)函数是kvm自己实现的内存管理功能:在处理缺页之前,预先补充好MMU后续建立页表时要用到的对象缓存。

客户机物理地址转换为客户机物理页框号,将客户机物理页框号转换为宿主机物理地址页框号。
将客户机物理页框号转换为宿主机物理地址页框号分为两步:先由客户机页框号得到宿主机虚拟地址(gfn_to_hva),再由宿主机虚拟地址得到宿主机物理页框号(hva_to_pfn)。

/*
 * Translate a guest frame number into a host page frame number.
 *
 * The gfn is first turned into a host virtual address with gfn_to_hva();
 * that address is then resolved with hva_to_pfn(). If the host virtual
 * address is an error value, a reference is taken on bad_page and its pfn
 * is returned instead.
 */
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
        unsigned long hva = gfn_to_hva(kvm, gfn);

        if (!kvm_is_error_hva(hva))
                return hva_to_pfn(kvm, hva);

        /* No usable host virtual address: hand back a referenced bad_page. */
        get_page(bad_page);
        return page_to_pfn(bad_page);
}
客户机页框号转换为宿主机虚拟地址
/*
 * Translate a guest frame number into a host virtual address.
 *
 * The gfn is un-aliased, its memory slot is looked up, and the address is
 * computed as the slot's userspace_addr plus the page offset of the gfn
 * within the slot. Returns bad_hva() when no slot is found or the slot is
 * flagged KVM_MEMSLOT_INVALID.
 */
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
        struct kvm_memory_slot *memslot;
        gfn_t index;

        gfn = unalias_gfn_instantiation(kvm, gfn);
        memslot = gfn_to_memslot_unaliased(kvm, gfn);

        if (!memslot)
                return bad_hva();
        if (memslot->flags & KVM_MEMSLOT_INVALID)
                return bad_hva();

        /* Page index of the gfn inside its slot. */
        index = gfn - memslot->base_gfn;
        return (memslot->userspace_addr + index * PAGE_SIZE);
}
将宿主机虚拟地址转换为宿主机物理地址,并将宿主机物理地址转换为宿主机物理页框号。注意此转换可能涉及宿主机缺页(物理页不存在),那么需要分配相应的物理页。
pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
{
        struct page *page[1];
        int npages;
        pfn_t pfn;

        might_sleep();

        npages = get_user_pages_fast(addr, 1, 1, page);

        if (unlikely(npages != 1)) {
                struct vm_area_struct *vma;

                down_read(&current->mm->mmap_sem);
                vma = find_vma(current->mm, addr);

                if (vma == NULL || addr < vma->vm_start ||
                    !(vma->vm_flags & VM_PFNMAP)) {
                        up_read(&current->mm->mmap_sem);
                        get_page(fault_page);
                        return page_to_pfn(fault_page);
                }

                pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
                up_read(&current->mm->mmap_sem);
                BUG_ON(!kvm_is_mmio_pfn(pfn));
        } else
                pfn = page_to_pfn(page[0]);

        return pfn;
}
在ept页表相应页表项中设置客户机物理地址所对应的宿主机物理地址(页框号)。
大概过程如下:如果找到最终level的相应ept表项,设置物理地址。否则相应level不存在分配ept页表,把分配页表物理地址设置上一级level页表项中,重复该过程
最终level设置函数调用mmu_set_spte,中间level的设置函数为调用__set_spte 其实本质一样的,只不过相应表项内容的权限不一样。
/*
 * Install a mapping for @gfn -> @pfn at page-table level @level, creating
 * any missing intermediate tables on the way down.
 *
 * The walk iterates over the shadow entries for the address
 * (gfn << PAGE_SHIFT). At the target level the leaf entry is written with
 * mmu_set_spte(); above it, a missing table is obtained from
 * kvm_mmu_get_page() and linked into the parent entry with __set_spte().
 *
 * @v is not referenced in this body.
 *
 * Returns pt_write (as set by mmu_set_spte()), or -ENOMEM when
 * kvm_mmu_get_page() fails, in which case the pfn reference is released.
 */
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
                        int level, gfn_t gfn, pfn_t pfn)
{
        struct kvm_shadow_walk_iterator iterator;
        struct kvm_mmu_page *sp;
        int pt_write = 0;
        gfn_t pseudo_gfn;

        for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
                /* Reached the requested level: write the final entry and stop. */
                if (iterator.level == level) {
                        mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
                                     0, write, 1, &pt_write,
                                     level, gfn, pfn, false, true);
                        ++vcpu->stat.pf_fixed;
                        break;
                }
        
                /* An ordinary present entry already points to the next table. */
                if (is_shadow_present_pte(*iterator.sptep) &&
                    !is_large_pte(*iterator.sptep))
                        continue;

                /*
                 * A large entry sits above the target level: drop it so a
                 * table can be linked in its place.
                 */
                if (is_large_pte(*iterator.sptep)) {
                        rmap_remove(vcpu->kvm, iterator.sptep);
                        __set_spte(iterator.sptep, shadow_trap_nonpresent_pte);
                        kvm_flush_remote_tlbs(vcpu->kvm);
                }

                if (*iterator.sptep == shadow_trap_nonpresent_pte) {
                        pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
                        sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
                                           iterator.level - 1,
                                              1, ACC_ALL, iterator.sptep);
                        if (!sp) {
                                pgprintk("nonpaging_map: ENOMEM\n");
                                kvm_release_pfn_clean(pfn);
                                return -ENOMEM;
                        }

                        /* Link the new table into this entry. */
                        __set_spte(iterator.sptep,
                                   __pa(sp->spt)
                                   | PT_PRESENT_MASK | PT_WRITABLE_MASK
                                   | shadow_user_mask | shadow_x_mask);
                }
        }
        return pt_write;
}
如果要用2MB大页的PMD项覆盖原来指向PTE页表的指针,需要先解除现已不可达的PTE页表与其父表项之间的关联。
/*
 * Write the shadow page-table entry at @sptep for @gfn -> @pfn and keep the
 * reverse-map and pfn reference bookkeeping consistent.
 *
 * Steps, in order:
 *   1. If the existing entry is an rmap spte, either unlink the child table
 *      it points to (when a large mapping replaces a table pointer), remove
 *      its rmap (when it maps a different pfn), or remember that it was
 *      already rmapped to the same pfn.
 *   2. Build and write the new entry with set_spte(); when that returns
 *      non-zero, report a write fault through *@ptwrite and flush the TLB.
 *   3. Update the large-page statistic and the slot bitmap, add the rmap if
 *      needed, and release the pfn reference.
 *   4. For speculative updates, record the entry in vcpu->arch.
 *
 * @pt_access is not referenced in this body.
 */
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                         unsigned pt_access, unsigned pte_access,
                         int user_fault, int write_fault, int dirty,
                         int *ptwrite, int level, gfn_t gfn,
                         pfn_t pfn, bool speculative,
                         bool reset_host_protection)
{
        int was_rmapped = 0;
        /* Sampled before the entry is overwritten; used when releasing pfn. */
        int was_writeble = is_writeble_pte(*sptep);
        int rmap_count;

        if (is_rmap_spte(*sptep)) {
                /*
                 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
                 * the parent of the now unreachable PTE.
                 */
                if (level > PT_PAGE_TABLE_LEVEL &&
                    !is_large_pte(*sptep)) {
                struct kvm_mmu_page *child;
                        u64 pte = *sptep;

                        child = page_header(pte & PT64_BASE_ADDR_MASK);
                        mmu_page_remove_parent_pte(child, sptep);
                        __set_spte(sptep, shadow_trap_nonpresent_pte);
                        kvm_flush_remote_tlbs(vcpu->kvm);
                } else if (pfn != spte_to_pfn(*sptep)) {
                        /* Entry maps a different pfn: drop the old rmap. */
                        pgprintk("hfn old %lx new %lx\n",
                                 spte_to_pfn(*sptep), pfn);
                        rmap_remove(vcpu->kvm, sptep);
                } else
                        was_rmapped = 1;
        }

        /* can_unsync is always passed as true from here. */
        if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
                      dirty, level, gfn, pfn, speculative, true,
                      reset_host_protection)) {
                if (write_fault)
                        *ptwrite = 1;
                kvm_x86_ops->tlb_flush(vcpu);
        }

          if (!was_rmapped && is_large_pte(*sptep))
                ++vcpu->kvm->stat.lpages;

        page_header_update_slot(vcpu->kvm, sptep, gfn);
        if (!was_rmapped) {
                rmap_count = rmap_add(vcpu, sptep, gfn);
                kvm_release_pfn_clean(pfn);
                if (rmap_count > RMAP_RECYCLE_THRESHOLD)
                        rmap_recycle(vcpu, sptep, gfn);
        } else {
                /* Same pfn was already mapped: release according to old state. */
                if (was_writeble)
                        kvm_release_pfn_dirty(pfn);
                else
                        kvm_release_pfn_clean(pfn);
        }
        if (speculative) {
                vcpu->arch.last_pte_updated = sptep;
                vcpu->arch.last_pte_gfn = gfn;
        }
}

/*
 * Assemble the value of a shadow page-table entry for @gfn -> @pfn and
 * store it at @sptep with __set_spte().
 *
 * The value starts from shadow_base_present_pte | shadow_dirty_mask and is
 * extended with accessed / execute / user / page-size bits, the bits
 * returned by kvm_x86_ops->get_mt_mask() when tdp_enabled is set,
 * SPTE_HOST_WRITEABLE when @reset_host_protection is set, and the pfn
 * shifted by PAGE_SHIFT. Write permission is added only when the access
 * mask or the fault type asks for it and no write protection is required.
 *
 * Returns 1 when the entry had to be left non-writable or non-present
 * because of write protection (a write-protected page under a large
 * mapping, or mmu_need_write_protect() returning true); 0 otherwise.
 */
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
                    unsigned pte_access, int user_fault,
                    int write_fault, int dirty, int level,
                    gfn_t gfn, pfn_t pfn, bool speculative,
                    bool can_unsync, bool reset_host_protection)
{       
        u64 spte;     
        int ret = 0;  
                
        /*              
         * We don't set the accessed bit, since we sometimes want to see
         * whether the guest actually used the pte (in order to detect
         * demand paging).
         */
        spte = shadow_base_present_pte | shadow_dirty_mask;
        if (!speculative)
                spte |= shadow_accessed_mask;
        /* A non-dirty mapping is never given write access. */
        if (!dirty)
                pte_access &= ~ACC_WRITE_MASK;
        if (pte_access & ACC_EXEC_MASK)
                spte |= shadow_x_mask;
        else
                spte |= shadow_nx_mask;
        if (pte_access & ACC_USER_MASK)
                spte |= shadow_user_mask;
        /* Levels above the last page-table level are mapped as large pages. */
        if (level > PT_PAGE_TABLE_LEVEL)
                spte |= PT_PAGE_SIZE_MASK;
        if (tdp_enabled)
                spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
                        kvm_is_mmio_pfn(pfn));

        if (reset_host_protection)
                spte |= SPTE_HOST_WRITEABLE;
        spte |= (u64)pfn << PAGE_SHIFT;

        if ((pte_access & ACC_WRITE_MASK)
            || (write_fault && !is_write_protection(vcpu) && !user_fault)) {

                /*
                 * A large mapping that covers a write-protected page is not
                 * installed at all: the entry becomes non-present.
                 */
                if (level > PT_PAGE_TABLE_LEVEL &&
                    has_wrprotected_page(vcpu->kvm, gfn, level)) {
                        ret = 1;
                        rmap_remove(vcpu->kvm, sptep);
                        spte = shadow_trap_nonpresent_pte;
                        goto set_pte;
                }

                spte |= PT_WRITABLE_MASK;

                if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK))
                        spte &= ~PT_USER_MASK;

                /*
                 * Optimization: for pte sync, if spte was writable the hash
                 * lookup is unnecessary (and expensive). Write protection
                 * is responsibility of mmu_get_page / kvm_sync_page.
                 * Same reasoning can be applied to dirty page accounting.
                 */
                if (!can_unsync && is_writeble_pte(*sptep))
                        goto set_pte;

                if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
                        pgprintk("%s: found shadow page for %lx, marking ro\n",
                                 __func__, gfn);
                        ret = 1;
                  pte_access &= ~ACC_WRITE_MASK;
                        if (is_writeble_pte(spte))
                                spte &= ~PT_WRITABLE_MASK;
                }
        }

        /* Only mappings that end up with write access dirty the page. */
        if (pte_access & ACC_WRITE_MASK)
                mark_page_dirty(vcpu->kvm, gfn);

set_pte:
        __set_spte(sptep, spte);
        return ret;
}

sptep是ept页表项指针,spte是要写入该页表项的内容(宿主机物理地址加上各权限位)。
/*
 * Store the 64-bit value @spte into the page-table entry at @sptep using
 * set_64bit(). The pointer type handed to set_64bit() depends on the
 * configuration: unsigned long * with CONFIG_X86_64, unsigned long long *
 * otherwise.
 */
static void __set_spte(u64 *sptep, u64 spte)
{
#ifdef CONFIG_X86_64
        unsigned long *entry = (unsigned long *)sptep;
#else
        unsigned long long *entry = (unsigned long long *)sptep;
#endif

        set_64bit(entry, spte);
}

你可能感兴趣的:(qemu-kvm 内存虚拟化---ept)