3.2 CPU Virtualization: Data Structures

Table of Contents

  • 1. The kvm struct: one virtual machine
  • 2. The kvm_vcpu struct
    • 2.1. The kvm_run struct
    • 2.2. The kvm_vcpu_arch struct
  • 3. The relationship between kvm, kvm_vcpu, vcpu_vmx and loaded_vmcs
  • 4. struct vmcs_host_state: host state loaded on VM-exit
  • 5. The kvm_x86_ops struct
  • 6. Key structs in the KVM API
    • 6.1. kvm_chardev_ops
    • 6.2. kvm_vm_fops
    • 6.3. kvm_vcpu_fops

1. The kvm struct: one virtual machine

In the KVM architecture, the kvm struct represents a single concrete virtual machine.

When a new KVM virtual machine is created via the KVM_CREATE_VM ioctl, a new kvm struct object is allocated for it.
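
A minimal userspace sketch of that call sequence (error handling omitted; the file descriptors are illustrative):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm_fd = open("/dev/kvm", O_RDWR);       /* the KVM system fd */
	/* Each successful KVM_CREATE_VM allocates one struct kvm in the
	 * kernel and returns a VM fd referring to it. */
	int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);
	return vm_fd >= 0 ? 0 : 1;
}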

// include/linux/kvm_host.h
struct kvm {
	// Spinlock protecting the MMU; the widest-scope lock in the MMU code
	spinlock_t mmu_lock;
    // Lock for memory slot operations
	struct mutex slots_lock;
	// Points to the mm_struct of the QEMU userspace process
	struct mm_struct *mm; /* userspace tied to this vm */
    // All memslots belonging to this VM
	struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
	struct srcu_struct srcu;
	struct srcu_struct irq_srcu;
    /* The virtual CPUs */
	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];

	/*
	 * created_vcpus is protected by kvm->lock, and is incremented
	 * at the beginning of KVM_CREATE_VCPU.  online_vcpus is only
	 * incremented after storing the kvm_vcpu pointer in vcpus,
	 * and is accessed atomically.
	 */
	// Number of online vCPUs
	atomic_t online_vcpus;
	int created_vcpus;
	int last_boosted_vcpu;
	// Links this VM into the host-wide VM list
	struct list_head vm_list;
	struct mutex lock;
	struct kvm_io_bus *buses[KVM_NR_BUSES];
	// Eventfd support (event channels)
#ifdef CONFIG_HAVE_KVM_EVENTFD
	struct {
		spinlock_t        lock;
		struct list_head  items;
		struct list_head  resampler_list;
		struct mutex      resampler_lock;
	} irqfds;
	struct list_head ioeventfds;
#endif
	struct kvm_vm_stat stat;
	// Architecture-specific parameters of this VM
	struct kvm_arch arch;
	// Reference count
	atomic_t users_count;
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
	struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
	spinlock_t ring_lock;
	struct list_head coalesced_zones;
#endif

	struct mutex irq_lock;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
	/*
	 * Update side is protected by irq_lock.
	 */
	// Interrupt routing table
	struct kvm_irq_routing_table __rcu *irq_routing;
#endif
#ifdef CONFIG_HAVE_KVM_IRQFD
	struct hlist_head irq_ack_notifier_list;
#endif

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	// MMU notifier chain
	struct mmu_notifier mmu_notifier;
	unsigned long mmu_notifier_seq;
	long mmu_notifier_count;
#endif
	// Number of dirty TLBs
	long tlbs_dirty;
	struct list_head devices;
	struct dentry *debugfs_dentry;
	struct kvm_stat_data **debugfs_stat_data;
};

The kvm struct contains information about the vCPUs, memory, APIC, IRQs, MMU, event management, and so on.

The information in this struct is used mainly inside KVM itself to track the state of the virtual machine.

The definition of the kvm struct members is interleaved with many compile-time switches, each corresponding to a different feature of the KVM architecture.

The kvm struct links together several important members that play a key role in running the virtual machine:

  • struct kvm_memslots *memslots;

The memory slots allocated to the virtual machine, modeled after physical memory banks. Note the array size is bounded by KVM_ADDRESS_SPACE_NUM (1 here).

The kvm_memslots struct is a wrapper around kvm_memory_slot: it contains an array of kvm_memory_slot entries, one for each memory region (slot) used by the virtual machine, storing the address information of those slots in array form.

kvm_memory_slot is the main data structure of KVM memory management. It describes the mapping between guest physical addresses (GPA) and host virtual addresses (HVA); each kvm_memory_slot represents the mapping of one memory region (slot). Userspace registers such a slot with the KVM_SET_USER_MEMORY_REGION ioctl (see the sketch at the end of this list).

  • struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];

The array of vCPU structs contained in the virtual machine.

Each vCPU occupies one array element.

  • struct list_head vm_list;

Linkage into the host-wide VM management list.

  • struct kvm_io_bus *buses[KVM_NR_BUSES];

The array of I/O bus structs in the virtual machine.

Each bus, such as the ISA bus or the PCI bus, corresponds to one kvm_io_bus struct.

  • struct kvm_vm_stat stat;

Runtime statistics of the virtual machine, such as page-table and MMU runtime state counters.

  • struct kvm_arch arch;

Architecture-specific parameters needed by KVM; they will be described in detail later when discussing KVM's implementation.
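
The memslot registration referenced above (under struct kvm_memslots *memslots): a minimal userspace sketch using the KVM_SET_USER_MEMORY_REGION ioctl (vm_fd, the size and the addresses are illustrative; error handling omitted):

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

int add_slot(int vm_fd)
{
	/* Back 2 MiB of guest RAM with anonymous host memory (the HVA side). */
	void *hva = mmap(NULL, 2 << 20, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	struct kvm_userspace_memory_region region = {
		.slot            = 0,            /* index into the memslot array */
		.guest_phys_addr = 0x100000,     /* where the RAM appears as GPA */
		.memory_size     = 2 << 20,
		.userspace_addr  = (__u64)(unsigned long)hva, /* the HVA */
	};
	return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}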

2. The kvm_vcpu struct

When userspace requests vCPU creation via the KVM_CREATE_VCPU ioctl, the KVM subsystem creates a kvm_vcpu struct, initializes it accordingly, and returns the corresponding vcpu_fd file descriptor.

KVM's internal scheduling of virtual machines operates on the relevant KVM data in units of kvm_vcpu.

The kvm_vcpu struct has many fields; the important members are shown below.

// include/linux/kvm_host.h
struct kvm_vcpu {
    // Points to the kvm struct of the VM this vCPU belongs to
    struct kvm *kvm;
#ifdef CONFIG_PREEMPT_NOTIFIERS
    struct preempt_notifier preempt_notifier;
#endif
    int cpu;
    // vCPU id, uniquely identifying this vCPU
    int vcpu_id;
    int srcu_idx;
    int mode;
    unsigned long requests;
    unsigned long guest_debug;

    struct mutex mutex;
    // The kvm_run area for this vCPU: its runtime state, shared with userspace
    struct kvm_run *run;

    int fpu_active;
    int guest_fpu_loaded, guest_xcr0_loaded;
	// Wait queue for this vCPU
    wait_queue_head_t wq;
    struct pid *pid;
    int sigset_active;
    sigset_t sigset;
    // vCPU statistics
    struct kvm_vcpu_stat stat;
    // MMIO handling state
#ifdef CONFIG_HAS_IOMEM
    int mmio_needed;
    int mmio_read_completed;
    int mmio_is_write;
    int mmio_cur_fragment;
    int mmio_nr_fragments;
    struct kvm_mmio_fragment mmio_fragments[KVM_MAX_MMIO_FRAGMENTS];
#endif

#ifdef CONFIG_KVM_ASYNC_PF
    struct {
        u32 queued;
        struct list_head queue;
        struct list_head done;
        spinlock_t lock;
    } async_pf;
#endif

#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
    /*
     * Cpu relax intercept or pause loop exit optimization
     * in_spin_loop: set when a vcpu does a pause loop exit
     * or cpu relax intercepted.
     * dy_eligible: indicates whether vcpu is eligible for directed yield.
     */
    struct {
        bool in_spin_loop;
        bool dy_eligible;
    } spin_loop;
#endif
    bool preempted;
	// The architecture this vCPU virtualizes (x86 here)
    // Architecture-specific state: registers, APIC, MMU and other arch-specific content
    struct kvm_vcpu_arch arch;
};
  • struct kvm *kvm;

The kvm struct of the VM this vCPU belongs to.

  • int vcpu_id;

The ID of this vCPU.

  • struct kvm_run *run;

The runtime parameters of the vCPU, i.e., its runtime state.

It holds dynamic information such as register state, memory information, and virtual machine status.

  • struct kvm_vcpu_arch arch;

The architecture-specific information of this vCPU.

It stores KVM runtime parameters such as timers, interrupts, and memory slots.

In addition, kvm_vcpu contains the data structures needed to handle iomem (MMIO) requests. Creating a vCPU from userspace is sketched below.
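
A hedged sketch of the userspace side (vm_fd is assumed to come from a prior KVM_CREATE_VM call):

#include <sys/ioctl.h>
#include <linux/kvm.h>

int create_vcpu(int vm_fd)
{
	/* The last argument becomes vcpu_id in kvm_vcpu; the kernel
	 * allocates and initializes the kvm_vcpu struct and returns a
	 * vcpu_fd used for all further per-vCPU ioctls. */
	return ioctl(vm_fd, KVM_CREATE_VCPU, 0 /* vcpu_id */);
}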

2.1. The kvm_run struct

The vCPU's runtime state, such as the reason for a VM exit (a decoding sketch follows the field notes below).

// include/uapi/linux/kvm.h
struct kvm_run {
	/* in */
	__u8 request_interrupt_window;
	__u8 immediate_exit;
	__u8 padding1[6];

	/* out */
	__u32 exit_reason;
	__u8 ready_for_interrupt_injection;
	__u8 if_flag;
	__u16 flags;

	/* in (pre_kvm_run), out (post_kvm_run) */
	__u64 cr8;
	__u64 apic_base;

	union {
		/* KVM_EXIT_UNKNOWN */
		struct {
			__u64 hardware_exit_reason;
		} hw;
		/* KVM_EXIT_FAIL_ENTRY */
		struct {
			__u64 hardware_entry_failure_reason;
		} fail_entry;
		/* KVM_EXIT_EXCEPTION */
		struct {
			__u32 exception;
			__u32 error_code;
		} ex;
		/* KVM_EXIT_IO */
		struct {
#define KVM_EXIT_IO_IN  0
#define KVM_EXIT_IO_OUT 1
			__u8 direction;
			__u8 size; /* bytes */
			__u16 port;
			__u32 count;
			__u64 data_offset; /* relative to kvm_run start */
		} io; // On a VM exit caused by an I/O instruction, the I/O details are stored here
		/* KVM_EXIT_DEBUG */
		struct {
			struct kvm_debug_exit_arch arch;
		} debug;
		/* KVM_EXIT_MMIO */
		struct {
			__u64 phys_addr;
			__u8  data[8];
			__u32 len;
			__u8  is_write;
		} mmio;
		/* KVM_EXIT_HYPERCALL */
		struct {
			__u64 nr;
			__u64 args[6];
			__u64 ret;
			__u32 longmode;
			__u32 pad;
		} hypercall;
		/* KVM_EXIT_TPR_ACCESS */
		struct {
			__u64 rip;
			__u32 is_write;
			__u32 pad;
		} tpr_access;
		/* KVM_EXIT_DCR (deprecated) */
		struct {
			__u32 dcrn;
			__u32 data;
			__u8  is_write;
		} dcr;
		/* KVM_EXIT_INTERNAL_ERROR */
		struct {
			__u32 suberror;
			/* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
			__u32 ndata;
			__u64 data[16];
		} internal;
		/* KVM_EXIT_OSI */
		struct {
			__u64 gprs[32];
		} osi;
		/* KVM_EXIT_PAPR_HCALL */
		struct {
			__u64 nr;
			__u64 ret;
			__u64 args[9];
		} papr_hcall;
		/* KVM_EXIT_EPR */
		struct {
			__u32 epr;
		} epr;
		/* KVM_EXIT_SYSTEM_EVENT */
		struct {
#define KVM_SYSTEM_EVENT_SHUTDOWN       1
#define KVM_SYSTEM_EVENT_RESET          2
#define KVM_SYSTEM_EVENT_CRASH          3
			__u32 type;
			__u64 flags;
		} system_event;
		/* KVM_EXIT_IOAPIC_EOI */
		struct {
			__u8 vector;
		} eoi;
		/* KVM_EXIT_HYPERV */
		struct kvm_hyperv_exit hyperv;
		/* Fix the size of the union. */
		char padding[256];
	};

	/* 2048 is the size of the char array used to bound/pad the size
	 * of the union that holds sync regs.
	 */
	#define SYNC_REGS_SIZE_BYTES 2048
	/*
	 * shared registers between kvm and userspace.
	 * kvm_valid_regs specifies the register classes set by the host
	 * kvm_dirty_regs specified the register classes dirtied by userspace
	 * struct kvm_sync_regs is architecture specific, as well as the
	 * bits for kvm_valid_regs and kvm_dirty_regs
	 */
	__u64 kvm_valid_regs;
	__u64 kvm_dirty_regs;
	union {
		struct kvm_sync_regs regs;
		char padding[SYNC_REGS_SIZE_BYTES];
	} s;
};
  • __u8 request_interrupt_window;

Set by userspace when it wants to inject an interrupt: it asks the vCPU to prepare for interrupt injection and exit once injection becomes possible.

  • __u8 ready_for_interrupt_injection;

Answers request_interrupt_window: when set, the vCPU is ready to accept an interrupt.

  • __u8 if_flag;

Interrupt-enable flag; it is not meaningful when the in-kernel APIC is in use.
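
The decoding sketch referenced above: a hedged example of handling KVM_EXIT_IO in userspace. Here run is the mmap'ed kvm_run area (see 6.3), and the handle_io_exit helper and the serial-port number are illustrative assumptions:

#include <stdio.h>
#include <linux/kvm.h>

static void handle_io_exit(struct kvm_run *run)
{
	if (run->exit_reason != KVM_EXIT_IO)
		return;
	if (run->io.direction == KVM_EXIT_IO_OUT && run->io.port == 0x3f8) {
		/* data_offset is relative to the start of the kvm_run
		 * mapping, so the payload lives in the same shared area. */
		const char *data = (const char *)run + run->io.data_offset;
		fwrite(data, run->io.size, run->io.count, stdout);
	}
}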

2.2. The kvm_vcpu_arch struct

The architecture-specific information of this vCPU.

It stores KVM runtime parameters such as timers, interrupts, and memory slots.

// arch/x86/include/asm/kvm_host.h
struct kvm_vcpu_arch {
        /*
         * rip and regs accesses must go through
         * kvm_{register,rip}_{read,write} functions.
         */
        unsigned long regs[NR_VCPU_REGS];
        u32 regs_avail;
        u32 regs_dirty;
        // Registers like these cache the values of the real CPU registers
        unsigned long cr0;
        unsigned long cr0_guest_owned_bits;
        unsigned long cr2;
        unsigned long cr3;
        unsigned long cr4;
        unsigned long cr4_guest_owned_bits;
        unsigned long cr8;
        u32 pkru;
        u32 hflags;
        u64 efer;
        u64 apic_base;
        struct kvm_lapic *apic;    /* kernel irqchip context */
        bool apicv_active;
        bool load_eoi_exitmap_pending;
        DECLARE_BITMAP(ioapic_handled_vectors, 256);
        unsigned long apic_attention;
        int32_t apic_arb_prio;
        int mp_state;
        u64 ia32_misc_enable_msr;
        u64 smbase;
        u64 smi_count;
        bool tpr_access_reporting;
        bool xsaves_enabled;
        u64 ia32_xss;
        u64 microcode_version;
        u64 arch_capabilities;

        /*
         * Paging state of the vcpu
         *
         * If the vcpu runs in guest mode with two level paging this still saves
         * the paging mode of the l1 guest. This context is always used to
         * handle faults.
         */
        // Pointer to the MMU context in active use (with its callbacks)
        struct kvm_mmu *mmu;

        /* Non-nested MMU for L1 */
        // MMU of the guest when not nested (L1)
        struct kvm_mmu root_mmu;

        /* L1 MMU when running nested */
        // MMU of L1 when running nested
        struct kvm_mmu guest_mmu;

        /*
         * Paging state of an L2 guest (used for nested npt)
         *
         * This context will save all necessary information to walk page tables
         * of an L2 guest. This context is only initialized for page table
         * walking and not for faulting since we never handle l2 page faults on
         * the host.
         */
        struct kvm_mmu nested_mmu;

        /*
         * Pointer to the mmu context currently used for
         * gva_to_gpa translations.
         */
        // Used for GVA-to-GPA translation
        struct kvm_mmu *walk_mmu;

        struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
        struct kvm_mmu_memory_cache mmu_page_cache;
        struct kvm_mmu_memory_cache mmu_page_header_cache;

        /*
         * QEMU userspace and the guest each have their own FPU state.
         * In vcpu_run, we switch between the user and guest FPU contexts.
         * While running a VCPU, the VCPU thread will have the guest FPU
         * context.
         *
         * Note that while the PKRU state lives inside the fpu registers,
         * it is switched out separately at VMENTER and VMEXIT time. The
         * "guest_fpu" state here contains the guest FPU context, with the
         * host PRKU bits.
         */
        struct fpu *user_fpu;
        struct fpu *guest_fpu;

        u64 xcr0;
        u64 guest_supported_xcr0;
        u32 guest_xstate_size;

        struct kvm_pio_request pio;
        void *pio_data;

        u8 event_exit_inst_len;

        struct kvm_queued_exception {
                bool pending;
                bool injected;
                bool has_error_code;
                u8 nr;
                u32 error_code;
                unsigned long payload;
                bool has_payload;
                u8 nested_apf;
        } exception;

        struct kvm_queued_interrupt {
                bool injected;
                bool soft;
                u8 nr;
        } interrupt;

        int halt_request; /* real mode on Intel only */

        int cpuid_nent;
        struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];

        int maxphyaddr;

        /* emulate context */
        // Context for KVM's software instruction emulator, used when an
        // instruction cannot be executed natively (e.g. for MMIO emulation)
        struct x86_emulate_ctxt *emulate_ctxt;
        bool emulate_regs_need_sync_to_vcpu;
        bool emulate_regs_need_sync_from_vcpu;
        int (*complete_userspace_io)(struct kvm_vcpu *vcpu);

        gpa_t time;
        struct pvclock_vcpu_time_info hv_clock;
        unsigned int hw_tsc_khz;
        struct gfn_to_hva_cache pv_time;
        bool pv_time_enabled;
        /* set guest stopped flag in pvclock flags field */
        bool pvclock_set_guest_stopped_request;

        struct {
                u8 preempted;
                u64 msr_val;
                u64 last_steal;
                struct gfn_to_pfn_cache cache;
        } st;

        u64 tsc_offset;
        u64 last_guest_tsc;
        u64 last_host_tsc;
        u64 tsc_offset_adjustment;
        u64 this_tsc_nsec;
        u64 this_tsc_write;
        u64 this_tsc_generation;
        bool tsc_catchup;
        bool tsc_always_catchup;
        s8 virtual_tsc_shift;
        u32 virtual_tsc_mult;
        u32 virtual_tsc_khz;
        s64 ia32_tsc_adjust_msr;
        u64 msr_ia32_power_ctl;
        u64 tsc_scaling_ratio;

        atomic_t nmi_queued;  /* unprocessed asynchronous NMIs */
        unsigned nmi_pending; /* NMI queued after currently running handler */
        bool nmi_injected;    /* Trying to inject an NMI this entry */
        bool smi_pending;    /* SMI queued after currently running handler */

        struct kvm_mtrr mtrr_state;
        u64 pat;

        unsigned switch_db_regs;
        unsigned long db[KVM_NR_DB_REGS];
        unsigned long dr6;
        unsigned long dr7;
        unsigned long eff_db[KVM_NR_DB_REGS];
        unsigned long guest_debug_dr7;
        u64 msr_platform_info;
        u64 msr_misc_features_enables;

        u64 mcg_cap;
        u64 mcg_status;
        u64 mcg_ctl;
        u64 mcg_ext_ctl;
        u64 *mce_banks;

        /* Cache MMIO info */
        u64 mmio_gva;
        unsigned mmio_access;
        gfn_t mmio_gfn;
        u64 mmio_gen;

        struct kvm_pmu pmu;

        /* used for guest single stepping over the given code position */
        unsigned long singlestep_rip;

        struct kvm_vcpu_hv hyperv;

        cpumask_var_t wbinvd_dirty_mask;

        unsigned long last_retry_eip;
        unsigned long last_retry_addr;

        struct {
                bool halted;
                gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
                struct gfn_to_hva_cache data;
                u64 msr_val;
                u32 id;
                bool send_user_only;
                u32 host_apf_reason;
                unsigned long nested_apf_token;
                bool delivery_as_pf_vmexit;
        } apf;

        /* OSVW MSRs (AMD only) */
        struct {
                u64 length;
                u64 status;
        } osvw;

        struct {
                u64 msr_val;
                struct gfn_to_hva_cache data;
        } pv_eoi;

        u64 msr_kvm_poll_control;

        /*
         * Indicates the guest is trying to write a gfn that contains one or
         * more of the PTEs used to translate the write itself, i.e. the access
         * is changing its own translation in the guest page tables.  KVM exits
         * to userspace if emulation of the faulting instruction fails and this
         * flag is set, as KVM cannot make forward progress.
         *
         * If emulation fails for a write to guest page tables, KVM unprotects
         * (zaps) the shadow page for the target gfn and resumes the guest to
         * retry the non-emulatable instruction (on hardware).  Unprotecting the
         * gfn doesn't allow forward progress for a self-changing access because
         * doing so also zaps the translation for the gfn, i.e. retrying the
         * instruction will hit a !PRESENT fault, which results in a new shadow
         * page and sends KVM back to square one.
         */
        bool write_fault_to_shadow_pgtable;

        /* set at EPT violation at this point */
        unsigned long exit_qualification;

        /* pv related host specific info */
        // Paravirt state: pv_unhalted marks a halted vCPU kicked awake by the guest
        struct {
                bool pv_unhalted;
        } pv;

        int pending_ioapic_eoi;
        int pending_external_vector;

        /* be preempted when it's in kernel-mode(cpl=0) */
        bool preempted_in_kernel;

        /* Flush the L1 Data cache for L1TF mitigation on VMENTER */
        bool l1tf_flush_l1d;

        /* AMD MSRC001_0015 Hardware Configuration */
        u64 msr_hwcr;
};

3. The relationship between kvm, kvm_vcpu, vcpu_vmx and loaded_vmcs

First, look at the relevant parts of the structs involved.

// include/linux/kvm_host.h
struct kvm
{
    struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
    struct list_head vm_list; // Links into the global vm_list
    struct kvm_arch arch; // Architecture-specific part
}

struct kvm represents one virtual machine and contains multiple vCPUs.

// include/linux/kvm_host.h
struct kvm_vcpu {
	int cpu;    /* Number of the physical CPU currently running this vCPU */
}

struct kvm_vcpu represents one vCPU; at any given moment it maps to exactly one physical CPU.

// arch/x86/kvm/vmx/vmx.h
struct vcpu_vmx {
	struct kvm_vcpu       vcpu;
	/*
	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
	 * non-nested (L1) guest, it always points to vmcs01. For a nested
	 * guest (L2), it points to a different VMCS.
	 */
	struct loaded_vmcs    vmcs01;
	struct loaded_vmcs   *loaded_vmcs;
}

struct vcpu_vmx is the vCPU's execution environment on Intel hardware; loaded_vmcs points to the VMCS the vCPU is currently using. For a non-nested (L1) guest it always points to vmcs01; for a nested guest (L2) it points to a different VMCS. Since kvm_vcpu is embedded in vcpu_vmx, the containing struct can be recovered as shown below.
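
Because struct kvm_vcpu is embedded inside struct vcpu_vmx, the VMX code recovers the containing structure with container_of; the upstream helper in arch/x86/kvm/vmx/vmx.h is essentially:

// arch/x86/kvm/vmx/vmx.h
static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
{
	return container_of(vcpu, struct vcpu_vmx, vcpu);
}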

// arch/x86/kvm/vmx/vmcs.h
struct loaded_vmcs {
	struct vmcs *vmcs;  /* The VMCS backing this vCPU */
	int cpu;            /* CPU this VMCS was last loaded on */
	int launched;		/* Whether this VMCS has been launched (VMLAUNCH vs VMRESUME) */
	struct list_head loaded_vmcss_on_cpu_link;	/* Links into the per-CPU list of all loaded VMCSs, cleaned up when the CPU goes down */
};

The vmcs struct itself looks like this:

struct vmcs_hdr {
        u32 revision_id:31;
        u32 shadow_vmcs:1;
};
// The concrete layout of the VMCS is defined by the hardware implementation;
// software accesses it only through the VMWRITE and VMREAD instructions.
struct vmcs {
        struct vmcs_hdr hdr;
        u32 abort;
        char data[0];
};
// One current-VMCS pointer per logical CPU
DECLARE_PER_CPU(struct vmcs *, current_vmcs);
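
Because the layout is opaque, even the kernel reads fields by encoding rather than by byte offset. A minimal sketch of such an accessor (my_vmcs_readl is a hypothetical simplification of vmcs_readl() in arch/x86/kvm/vmx, which adds error handling and instrumentation):

static __always_inline unsigned long my_vmcs_readl(unsigned long field)
{
	unsigned long value;

	/* `field` is the architectural VMCS field encoding, not an
	 * offset into struct vmcs. */
	asm volatile("vmread %1, %0" : "=rm"(value) : "r"(field) : "cc");
	return value;
}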

[Figure 1: relationship between kvm, kvm_vcpu, vcpu_vmx and loaded_vmcs]

vcpu_vmx is really an execution environment for the vCPU. The cpu member of loaded_vmcs is what ties a VMCS to a physical CPU.

A vCPU can of course run on different physical CPUs over time; it suffices to update the cpu field of loaded_vmcs.

But why can one vCPU correspond to several different VMCSs? Because of nested virtualization: when an L2 guest's vCPU is loaded, the VMCS in use is not the L1-level VMCS but an L2-level one. In effect, L2 treats the L1 vCPU as its physical CPU, and a physical CPU can naturally have multiple VMCSs.

[Figure 2: loaded_vmcs pointing to vmcs01 for L1 or a nested VMCS for L2]

Normally, in L1, loaded_vmcs simply points to vmcs01; when the physical CPU running the vCPU changes, only the cpu member of loaded_vmcs needs updating:

static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
 
    /* Allocate the VMCS */
	vmx->loaded_vmcs = &vmx->vmcs01;
	vmx->loaded_vmcs->vmcs = alloc_vmcs();
 
}

In L2, loaded_vmcs switches between the L1 VMCS and the various L2 VMCSs, so the loaded_vmcs pointer itself must be repointed:

static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{
	cpu = get_cpu();
	vmx->loaded_vmcs = vmcs02;
	vmx_vcpu_put(vcpu);
	vmx_vcpu_load(vcpu, cpu);
	vcpu->cpu = cpu;
	put_cpu();
}

4. struct vmcs_host_state: host state loaded on VM-exit

/*
 * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
 * and whose values change infrequently, but are not constant.  I.e. this is
 * used as a write-through cache of the corresponding VMCS fields.
 */
struct vmcs_host_state {
        unsigned long cr3;      /* May not match real cr3 */
        unsigned long cr4;      /* May not match real cr4 */
        unsigned long gs_base;
        unsigned long fs_base;
        unsigned long rsp;

        u16           fs_sel, gs_sel, ldt_sel;
#ifdef CONFIG_X86_64
        u16           ds_sel, es_sel;
#endif
};

On a VM exit, this state is loaded from the VMCS; it is modified only rarely.

5. The kvm_x86_ops struct

The kvm_x86_ops struct holds the function pointers through which virtualization is driven on a concrete CPU architecture; it is defined in arch/x86/include/asm/kvm_host.h in the Linux kernel.

The struct mainly covers:

  • CPU and VMM state, hardware initialization
  • vCPU creation and management
  • Interrupt management
  • Register management
  • Clock management

All members of kvm_x86_ops are function pointers. The kvm-intel.ko and kvm-amd.ko modules provide different implementations for their respective architectures. During KVM initialization and all later operation, the KVM core performs the actual hardware operations through the functions in this struct.

The kvm_x86_ops struct is statically initialized:

  • for the AMD architecture, the initialization code is in svm.c;
  • for the Intel architecture, it is in vmx.c.

Part of the AMD kvm_x86_ops initialization is listed below.

Listing 5-2: kvm_x86_ops initialization fragment in svm.c

// arch/x86/kvm/svm.c
static struct kvm_x86_ops svm_x86_ops = {
	.cpu_has_kvm_support = has_svm,
	.disabled_by_bios = is_disabled,
	.hardware_setup = svm_hardware_setup,
	.hardware_unsetup = svm_hardware_unsetup,
	.check_processor_compatibility = svm_check_processor_compat,
	.hardware_enable = svm_hardware_enable,
	.hardware_disable = svm_hardware_disable,
	.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
	...

Note that because KVM must support different architectures at once, kvm_x86_ops is registered and exported as a global variable during KVM initialization, so that all KVM submodules can call through it conveniently.

arch/x86/kvm/x86.c 中,定义了名为 kvm_x86_ops静态变量,通过export_symbol 宏全局范围!!! 内导出。

// arch/x86/kvm/x86.c
struct kvm_x86_ops *kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);

kvm_init初始化过程中,通过调用kvm_arch_init函数给kvm_x86_ops赋值,代码如下,其中,ops就是通过svm.c调用kvm_init函数时传入的kvm_x86_ops结构体

Listing 5-3: assigning kvm_x86_ops

// arch/x86/kvm/x86.c
	kvm_init_msr_list();

	kvm_x86_ops = ops;
	kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
	kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
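
Once the pointer is set, the common x86 code is vendor-neutral. A hedged sketch of how a call site dispatches through it (run_and_handle itself is illustrative; the member names follow the kvm_x86_ops of this kernel era, and exact signatures vary across versions):

static int run_and_handle(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops->prepare_guest_switch(vcpu); /* vendor-specific state save */
	kvm_x86_ops->run(vcpu);                  /* VMLAUNCH/VMRESUME or VMRUN */
	return kvm_x86_ops->handle_exit(vcpu);   /* dispatch on the exit reason */
}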

A detailed description of each member is deferred to the later analysis of KVM initialization and other key code paths.

6. Key structs in the KVM API

KVM communicates with userspace programs mainly through the /dev/kvm device file.

As the later discussion of KVM initialization shows, /dev/kvm is a character device backed by a set of standard Linux structs, chiefly kvm_chardev_ops, kvm_vm_fops and kvm_vcpu_fops, which implement the operations on the character device, on the VM file descriptor, and on the vCPU file descriptor, respectively.

6.1. kvm_chardev_ops

kvm_chardev_ops is defined in virt/kvm/kvm_main.c, as follows:

During module initialization, kvm_init() calls misc_register(&kvm_dev) to register the /dev/kvm device file.

These are the device-file structs:

// virt/kvm/kvm_main.c
static struct file_operations kvm_chardev_ops = {
    .unlocked_ioctl = kvm_dev_ioctl,
    .llseek         = noop_llseek,
    KVM_COMPAT(kvm_dev_ioctl),
};
static struct miscdevice kvm_dev = {
    KVM_MINOR,
    "kvm",
    &kvm_chardev_ops,
};

Listing 5-4: kvm_chardev_ops

// virt/kvm/kvm_main.c
static struct file_operations kvm_chardev_ops = {
	.unlocked_ioctl = kvm_dev_ioctl,
	.compat_ioctl   = kvm_dev_ioctl,
	.llseek         = noop_llseek,
};

kvm_chardev_ops is a standard file_operations struct, but only the ioctl operation is provided; common system calls such as read, open and write fall back to the default implementations. Userspace can therefore drive /dev/kvm only through ioctl.
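
A condensed sketch of the dispatcher behind that single entry point (based on kvm_dev_ioctl() in virt/kvm/kvm_main.c; the real function validates arguments and defers unknown commands to the arch code):

static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	switch (ioctl) {
	case KVM_GET_API_VERSION:
		return KVM_API_VERSION;              /* 12 on modern kernels */
	case KVM_CREATE_VM:
		return kvm_dev_ioctl_create_vm(arg); /* allocates a struct kvm */
	case KVM_CHECK_EXTENSION:
		return kvm_vm_ioctl_check_extension_generic(NULL, arg);
	default:
		return -EINVAL;                      /* simplified */
	}
}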

6.2. kvm_vm_fops

As described earlier, KVM_CREATE_VM returns an fd representing the VM; inside the KVM subsystem, operations on this fd map to the kvm_vm_fops struct, whose main content follows.

Listing 5-5: the kvm_vm_fops struct

// virt/kvm/kvm_main.c
static struct file_operations kvm_vm_fops = {
	.release        = kvm_vm_release,
	.unlocked_ioctl = kvm_vm_ioctl, // ioctl interface of the VM fd: VM-level control
#ifdef CONFIG_COMPAT
	.compat_ioctl   = kvm_vm_compat_ioctl,
#endif
	.mmap           = kvm_vm_mmap,
	.llseek         = noop_llseek,
};

The VM file operations provide two entry points, ioctl and mmap: mmap maps the guest OS's physical address space so it can be read and written directly, while ioctl sends KVM control commands.

6.3. kvm_vcpu_fops

On the VM fd, the KVM_CREATE_VCPU ioctl creates a vCPU and yields its vcpu_fd; inside the KVM subsystem, operations on that fd map to this struct.

The vcpu_fd operations are contained in kvm_vcpu_fops, whose main definition follows.

Listing 5-6: the kvm_vcpu_fops struct

// virt/kvm/kvm_main.c
static struct file_operations kvm_vcpu_fops = {
	.release        = kvm_vcpu_release,
	.unlocked_ioctl = kvm_vcpu_ioctl,
	.compat_ioctl   = kvm_vcpu_ioctl,
	.mmap           = kvm_vcpu_mmap,
	.llseek         = noop_llseek,
};

ioctl中,通过发送ioctl,即可对vCPU进行控制

Through mmap, the kvm_run struct becomes accessible; it holds the vCPU's run and control information, and its runtime parameters can be set there, as the run-loop sketch below shows.
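
Putting the pieces together, a minimal sketch of the vCPU run loop (kvm_fd, vcpu_fd and handle_io_exit refer back to the earlier sketches; error handling omitted):

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

void handle_io_exit(struct kvm_run *run); /* see the sketch in 2.1 */

int run_loop(int kvm_fd, int vcpu_fd)
{
	/* The size of the kvm_run mapping comes from the system fd. */
	long size = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
	struct kvm_run *run = mmap(NULL, size, PROT_READ | PROT_WRITE,
				   MAP_SHARED, vcpu_fd, 0);

	for (;;) {
		ioctl(vcpu_fd, KVM_RUN, 0);  /* returns on each VM exit */
		switch (run->exit_reason) {
		case KVM_EXIT_IO:
			handle_io_exit(run);
			break;
		case KVM_EXIT_HLT:
			return 0;            /* guest executed HLT */
		default:
			return -1;
		}
	}
}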
