The kvm struct represents a single virtual machine in the KVM architecture.
When a new KVM virtual machine is created with the KVM_CREATE_VM command, a new kvm struct object is created for it.
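As a minimal userspace sketch of this creation flow, using only the standard /dev/kvm ioctls (error handling omitted; the function name is made up for illustration):
// Hypothetical userspace sketch: create a VM and one vCPU through /dev/kvm.
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int create_vm_and_vcpu(void)
{
    int kvm_fd = open("/dev/kvm", O_RDWR);           /* the character device */
    if (ioctl(kvm_fd, KVM_GET_API_VERSION, 0) != 12) /* stable API version */
        return -1;
    int vm_fd = ioctl(kvm_fd, KVM_CREATE_VM, 0);     /* allocates a struct kvm */
    int vcpu_fd = ioctl(vm_fd, KVM_CREATE_VCPU, 0);  /* allocates a struct kvm_vcpu */
    return vcpu_fd;
}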
// include/linux/kvm_host.h
struct kvm {
// spinlock protecting the MMU; the widest-scoped MMU lock
spinlock_t mmu_lock;
// lock for memory-slot operations
struct mutex slots_lock;
// points to the mm_struct of the QEMU userspace process
struct mm_struct *mm; /* userspace tied to this vm */
// all memslots belonging to this VM
struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
struct srcu_struct srcu;
struct srcu_struct irq_srcu;
/* the emulated CPUs (vCPUs) */
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
/*
* created_vcpus is protected by kvm->lock, and is incremented
* at the beginning of KVM_CREATE_VCPU. online_vcpus is only
* incremented after storing the kvm_vcpu pointer in vcpus,
* and is accessed atomically.
*/
// number of online vCPUs
atomic_t online_vcpus;
int created_vcpus;
int last_boosted_vcpu;
// links this VM into the host-wide VM management list
struct list_head vm_list;
struct mutex lock;
struct kvm_io_bus *buses[KVM_NR_BUSES];
// eventfd-related state
#ifdef CONFIG_HAVE_KVM_EVENTFD
struct {
spinlock_t lock;
struct list_head items;
struct list_head resampler_list;
struct mutex resampler_lock;
} irqfds;
struct list_head ioeventfds;
#endif
struct kvm_vm_stat stat;
// architecture-specific parameters on the host
struct kvm_arch arch;
// reference count
atomic_t users_count;
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
spinlock_t ring_lock;
struct list_head coalesced_zones;
#endif
struct mutex irq_lock;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
/*
* Update side is protected by irq_lock.
*/
// interrupt routing table
struct kvm_irq_routing_table __rcu *irq_routing;
#endif
#ifdef CONFIG_HAVE_KVM_IRQFD
struct hlist_head irq_ack_notifier_list;
#endif
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
// MMU notifier chain
struct mmu_notifier mmu_notifier;
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
#endif
// number of dirty TLBs
long tlbs_dirty;
struct list_head devices;
struct dentry *debugfs_dentry;
struct kvm_stat_data **debugfs_stat_data;
};
The kvm struct object contains the vCPU, memory, APIC, IRQ, MMU, event management, and other per-VM information.
The information in this struct is used mainly inside KVM itself to track the state of the virtual machine.
The definition of the kvm struct's members incorporates many compile-time switches, each corresponding to a different feature of the KVM subsystem.
The following struct members hang off struct kvm and matter most to the operation of a virtual machine.
struct kvm_memslots *memslots;
The memory slots allocated to this VM, following the "memory DIMM" model; note the maximum of KVM_ADDRESS_SPACE_NUM address spaces (1 here). kvm_memslots is a wrapper around kvm_memory_slot: it holds an array of kvm_memory_slot entries covering all memory regions (slots) the VM uses, storing each slot's address information in array form. kvm_memory_slot is the central data structure of KVM memory management; it expresses the mapping between guest physical addresses (GPA) and host virtual addresses (HVA), one kvm_memory_slot describing the mapping of one memory region (slot). A minimal userspace sketch of filling in one slot follows this list.
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
The array of vCPU structs contained in the VM, one array entry per vCPU.
struct list_head vm_list;
Links the VM into the host-wide VM management list.
struct kvm_io_bus *buses[KVM_NR_BUSES];
The array of I/O bus structs of the VM; each bus, such as the ISA bus or the PCI bus, corresponds to one kvm_io_bus.
struct kvm_vm_stat stat;
Runtime statistics of the VM, such as its page-table and MMU state counters.
struct kvm_arch arch;
The architecture-specific parameters KVM needs; these are described in detail later when the implementation of KVM is discussed.
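As referenced above, here is a hedged sketch of how userspace establishes one GPA-to-HVA slot mapping. The ioctl and struct are the standard KVM UAPI; the slot index, guest physical address, and size are made-up values.
// Hypothetical sketch: back 16 MiB of guest physical memory (GPA 0)
// with an anonymous host mapping (HVA), populating one kvm_memory_slot.
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int add_memslot(int vm_fd)
{
    size_t size = 16 << 20;
    void *hva = mmap(NULL, size, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    struct kvm_userspace_memory_region region = {
        .slot = 0,                                  /* index into the memslot array */
        .guest_phys_addr = 0x0,                     /* GPA where the slot starts */
        .memory_size = size,
        .userspace_addr = (__u64)(unsigned long)hva /* HVA backing the GPA range */
    };
    return ioctl(vm_fd, KVM_SET_USER_MEMORY_REGION, &region);
}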
After userspace requests vCPU creation through the KVM_CREATE_VCPU ioctl, the KVM module creates a kvm_vcpu struct, initializes it accordingly, and returns the corresponding vcpu_fd descriptor.
KVM's internal virtual machine scheduling then operates on kvm_vcpu and the related data in struct kvm.
kvm_vcpu has a fairly large number of fields; the important members are listed below.
// include/linux/kvm_host.h
struct kvm_vcpu {
// points to the kvm struct of the VM this vCPU belongs to
struct kvm *kvm;
#ifdef CONFIG_PREEMPT_NOTIFIERS
struct preempt_notifier preempt_notifier;
#endif
int cpu;
// vCPU id, uniquely identifying this vCPU
int vcpu_id;
int srcu_idx;
int mode;
unsigned long requests;
unsigned long guest_debug;
struct mutex mutex;
// the kvm_run struct of this vCPU, holding its runtime state
struct kvm_run *run;
int fpu_active;
int guest_fpu_loaded, guest_xcr0_loaded;
// wait queue the vCPU sleeps on (e.g. while halted)
wait_queue_head_t wq;
struct pid *pid;
int sigset_active;
sigset_t sigset;
// vCPU statistics
struct kvm_vcpu_stat stat;
// MMIO-related fields
#ifdef CONFIG_HAS_IOMEM
int mmio_needed;
int mmio_read_completed;
int mmio_is_write;
int mmio_cur_fragment;
int mmio_nr_fragments;
struct kvm_mmio_fragment mmio_fragments[KVM_MAX_MMIO_FRAGMENTS];
#endif
#ifdef CONFIG_KVM_ASYNC_PF
struct {
u32 queued;
struct list_head queue;
struct list_head done;
spinlock_t lock;
} async_pf;
#endif
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
/*
* Cpu relax intercept or pause loop exit optimization
* in_spin_loop: set when a vcpu does a pause loop exit
* or cpu relax intercepted.
* dy_eligible: indicates whether vcpu is eligible for directed yield.
*/
struct {
bool in_spin_loop;
bool dy_eligible;
} spin_loop;
#endif
bool preempted;
// the architecture this vCPU virtualizes (x86 by default);
// architecture-specific state such as registers, the APIC, and the MMU
struct kvm_vcpu_arch arch;
};
struct kvm *kvm;
The kvm struct of the VM this vCPU belongs to.
int vcpu_id;
The ID of this vCPU.
struct kvm_run *run;
The vCPU's runtime parameters, i.e. its runtime state, holding register contents, memory information, virtual machine state, and other dynamic information.
struct kvm_vcpu_arch arch;
The architecture-specific state this vCPU virtualizes. It stores runtime parameters of the KVM virtual machine such as timers, interrupts, and memory slots.
In addition, kvm_vcpu contains the data structures needed for iomem, used to handle iomem requests.
kvm_run records the vCPU's run state, such as the reason for a VM-Exit:
// include/uapi/linux/kvm.h
struct kvm_run {
/* in */
__u8 request_interrupt_window;
__u8 immediate_exit;
__u8 padding1[6];
/* out */
__u32 exit_reason;
__u8 ready_for_interrupt_injection;
__u8 if_flag;
__u16 flags;
/* in (pre_kvm_run), out (post_kvm_run) */
__u64 cr8;
__u64 apic_base;
union {
/* KVM_EXIT_UNKNOWN */
struct {
__u64 hardware_exit_reason;
} hw;
/* KVM_EXIT_FAIL_ENTRY */
struct {
__u64 hardware_entry_failure_reason;
} fail_entry;
/* KVM_EXIT_EXCEPTION */
struct {
__u32 exception;
__u32 error_code;
} ex;
/* KVM_EXIT_IO */
struct {
#define KVM_EXIT_IO_IN 0
#define KVM_EXIT_IO_OUT 1
__u8 direction;
__u8 size; /* bytes */
__u16 port;
__u32 count;
__u64 data_offset; /* relative to kvm_run start */
} io; // when an I/O operation causes a VM-Exit, the I/O details are stored here
/* KVM_EXIT_DEBUG */
struct {
struct kvm_debug_exit_arch arch;
} debug;
/* KVM_EXIT_MMIO */
struct {
__u64 phys_addr;
__u8 data[8];
__u32 len;
__u8 is_write;
} mmio;
/* KVM_EXIT_HYPERCALL */
struct {
__u64 nr;
__u64 args[6];
__u64 ret;
__u32 longmode;
__u32 pad;
} hypercall;
/* KVM_EXIT_TPR_ACCESS */
struct {
__u64 rip;
__u32 is_write;
__u32 pad;
} tpr_access;
/* KVM_EXIT_DCR (deprecated) */
struct {
__u32 dcrn;
__u32 data;
__u8 is_write;
} dcr;
/* KVM_EXIT_INTERNAL_ERROR */
struct {
__u32 suberror;
/* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
__u32 ndata;
__u64 data[16];
} internal;
/* KVM_EXIT_OSI */
struct {
__u64 gprs[32];
} osi;
/* KVM_EXIT_PAPR_HCALL */
struct {
__u64 nr;
__u64 ret;
__u64 args[9];
} papr_hcall;
/* KVM_EXIT_EPR */
struct {
__u32 epr;
} epr;
/* KVM_EXIT_SYSTEM_EVENT */
struct {
#define KVM_SYSTEM_EVENT_SHUTDOWN 1
#define KVM_SYSTEM_EVENT_RESET 2
#define KVM_SYSTEM_EVENT_CRASH 3
__u32 type;
__u64 flags;
} system_event;
/* KVM_EXIT_IOAPIC_EOI */
struct {
__u8 vector;
} eoi;
/* KVM_EXIT_HYPERV */
struct kvm_hyperv_exit hyperv;
/* Fix the size of the union. */
char padding[256];
};
/* 2048 is the size of the char array used to bound/pad the size
* of the union that holds sync regs.
*/
#define SYNC_REGS_SIZE_BYTES 2048
/*
* shared registers between kvm and userspace.
* kvm_valid_regs specifies the register classes set by the host
* kvm_dirty_regs specifies the register classes dirtied by userspace
* struct kvm_sync_regs is architecture specific, as well as the
* bits for kvm_valid_regs and kvm_dirty_regs
*/
__u64 kvm_valid_regs;
__u64 kvm_dirty_regs;
union {
struct kvm_sync_regs regs;
char padding[SYNC_REGS_SIZE_BYTES];
} s;
};
__u8 request_interrupt_window;
Set by userspace to ask KVM to exit as soon as the vCPU is able to accept an interrupt, so that one can be injected.
__u8 ready_for_interrupt_injection;
The answer to request_interrupt_window: when set, the vCPU can accept an interrupt.
__u8 if_flag;
The interrupt-enable flag; it is meaningless when the in-kernel APIC is in use.
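A hedged sketch of how these three fields cooperate from userspace. It assumes run is the vCPU's mmap'ed kvm_run, no in-kernel irqchip, and an arbitrary vector number 32; KVM_INTERRUPT is the standard ioctl for queuing an external interrupt in that setup.
// Hypothetical sketch: inject IRQ vector 32 once the guest can accept it.
#include <sys/ioctl.h>
#include <linux/kvm.h>

void inject_when_ready(int vcpu_fd, struct kvm_run *run)
{
    run->request_interrupt_window = 1;   /* ask KVM to exit when injectable */
    ioctl(vcpu_fd, KVM_RUN, 0);
    if (run->ready_for_interrupt_injection && run->if_flag) {
        struct kvm_interrupt irq = { .irq = 32 };
        ioctl(vcpu_fd, KVM_INTERRUPT, &irq); /* queue the vector */
    }
}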
struct kvm_vcpu_arch (defined in arch/x86/include/asm/kvm_host.h) describes the architecture the vCPU virtualizes and stores runtime parameters of the KVM virtual machine such as timers, interrupts, and memory slots.
struct kvm_vcpu_arch {
/*
* rip and regs accesses must go through
* kvm_{register,rip}_{read,write} functions.
*/
unsigned long regs[NR_VCPU_REGS];
u32 regs_avail;
u32 regs_dirty;
// fields like these cache the values of the real CPU registers
unsigned long cr0;
unsigned long cr0_guest_owned_bits;
unsigned long cr2;
unsigned long cr3;
unsigned long cr4;
unsigned long cr4_guest_owned_bits;
unsigned long cr8;
u32 pkru;
u32 hflags;
u64 efer;
u64 apic_base;
struct kvm_lapic *apic; /* kernel irqchip context */
bool apicv_active;
bool load_eoi_exitmap_pending;
DECLARE_BITMAP(ioapic_handled_vectors, 256);
unsigned long apic_attention;
int32_t apic_arb_prio;
int mp_state;
u64 ia32_misc_enable_msr;
u64 smbase;
u64 smi_count;
bool tpr_access_reporting;
bool xsaves_enabled;
u64 ia32_xss;
u64 microcode_version;
u64 arch_capabilities;
/*
* Paging state of the vcpu
*
* If the vcpu runs in guest mode with two level paging this still saves
* the paging mode of the l1 guest. This context is always used to
* handle faults.
*/
// the active MMU context, holding the function pointers operated on directly
struct kvm_mmu *mmu;
/* Non-nested MMU for L1 */
// the VM's MMU in the non-nested case
struct kvm_mmu root_mmu;
/* L1 MMU when running nested */
// the L1 MMU in the nested case
struct kvm_mmu guest_mmu;
/*
* Paging state of an L2 guest (used for nested npt)
*
* This context will save all necessary information to walk page tables
* of an L2 guest. This context is only initialized for page table
* walking and not for faulting since we never handle l2 page faults on
* the host.
*/
struct kvm_mmu nested_mmu;
/*
* Pointer to the mmu context currently used for
* gva_to_gpa translations.
*/
// used to translate GVA to GPA
struct kvm_mmu *walk_mmu;
struct kvm_mmu_memory_cache mmu_pte_list_desc_cache;
struct kvm_mmu_memory_cache mmu_page_cache;
struct kvm_mmu_memory_cache mmu_page_header_cache;
/*
* QEMU userspace and the guest each have their own FPU state.
* In vcpu_run, we switch between the user and guest FPU contexts.
* While running a VCPU, the VCPU thread will have the guest FPU
* context.
*
* Note that while the PKRU state lives inside the fpu registers,
* it is switched out separately at VMENTER and VMEXIT time. The
* "guest_fpu" state here contains the guest FPU context, with the
* host PRKU bits.
*/
struct fpu *user_fpu;
struct fpu *guest_fpu;
u64 xcr0;
u64 guest_supported_xcr0;
u32 guest_xstate_size;
struct kvm_pio_request pio;
void *pio_data;
u8 event_exit_inst_len;
struct kvm_queued_exception {
bool pending;
bool injected;
bool has_error_code;
u8 nr;
u32 error_code;
unsigned long payload;
bool has_payload;
u8 nested_apf;
} exception;
struct kvm_queued_interrupt {
bool injected;
bool soft;
u8 nr;
} interrupt;
int halt_request; /* real mode on Intel only */
int cpuid_nent;
struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
int maxphyaddr;
/* emulate context */
// below is KVM's software-emulation mode, i.e. the path without VMX; it is rarely used in practice
struct x86_emulate_ctxt *emulate_ctxt;
bool emulate_regs_need_sync_to_vcpu;
bool emulate_regs_need_sync_from_vcpu;
int (*complete_userspace_io)(struct kvm_vcpu *vcpu);
gpa_t time;
struct pvclock_vcpu_time_info hv_clock;
unsigned int hw_tsc_khz;
struct gfn_to_hva_cache pv_time;
bool pv_time_enabled;
/* set guest stopped flag in pvclock flags field */
bool pvclock_set_guest_stopped_request;
struct {
u8 preempted;
u64 msr_val;
u64 last_steal;
struct gfn_to_pfn_cache cache;
} st;
u64 tsc_offset;
u64 last_guest_tsc;
u64 last_host_tsc;
u64 tsc_offset_adjustment;
u64 this_tsc_nsec;
u64 this_tsc_write;
u64 this_tsc_generation;
bool tsc_catchup;
bool tsc_always_catchup;
s8 virtual_tsc_shift;
u32 virtual_tsc_mult;
u32 virtual_tsc_khz;
s64 ia32_tsc_adjust_msr;
u64 msr_ia32_power_ctl;
u64 tsc_scaling_ratio;
atomic_t nmi_queued; /* unprocessed asynchronous NMIs */
unsigned nmi_pending; /* NMI queued after currently running handler */
bool nmi_injected; /* Trying to inject an NMI this entry */
bool smi_pending; /* SMI queued after currently running handler */
struct kvm_mtrr mtrr_state;
u64 pat;
unsigned switch_db_regs;
unsigned long db[KVM_NR_DB_REGS];
unsigned long dr6;
unsigned long dr7;
unsigned long eff_db[KVM_NR_DB_REGS];
unsigned long guest_debug_dr7;
u64 msr_platform_info;
u64 msr_misc_features_enables;
u64 mcg_cap;
u64 mcg_status;
u64 mcg_ctl;
u64 mcg_ext_ctl;
u64 *mce_banks;
/* Cache MMIO info */
u64 mmio_gva;
unsigned mmio_access;
gfn_t mmio_gfn;
u64 mmio_gen;
struct kvm_pmu pmu;
/* used for guest single stepping over the given code position */
unsigned long singlestep_rip;
struct kvm_vcpu_hv hyperv;
cpumask_var_t wbinvd_dirty_mask;
unsigned long last_retry_eip;
unsigned long last_retry_addr;
struct {
bool halted;
gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
struct gfn_to_hva_cache data;
u64 msr_val;
u32 id;
bool send_user_only;
u32 host_apf_reason;
unsigned long nested_apf_token;
bool delivery_as_pf_vmexit;
} apf;
/* OSVW MSRs (AMD only) */
struct {
u64 length;
u64 status;
} osvw;
struct {
u64 msr_val;
struct gfn_to_hva_cache data;
} pv_eoi;
u64 msr_kvm_poll_control;
/*
* Indicates the guest is trying to write a gfn that contains one or
* more of the PTEs used to translate the write itself, i.e. the access
* is changing its own translation in the guest page tables. KVM exits
* to userspace if emulation of the faulting instruction fails and this
* flag is set, as KVM cannot make forward progress.
*
* If emulation fails for a write to guest page tables, KVM unprotects
* (zaps) the shadow page for the target gfn and resumes the guest to
* retry the non-emulatable instruction (on hardware). Unprotecting the
* gfn doesn't allow forward progress for a self-changing access because
* doing so also zaps the translation for the gfn, i.e. retrying the
* instruction will hit a !PRESENT fault, which results in a new shadow
* page and sends KVM back to square one.
*/
bool write_fault_to_shadow_pgtable;
/* set at EPT violation at this point */
unsigned long exit_qualification;
/* pv related host specific info */
// emulated virtualization when VMX is not supported
struct {
bool pv_unhalted;
} pv;
int pending_ioapic_eoi;
int pending_external_vector;
/* be preempted when it's in kernel-mode(cpl=0) */
bool preempted_in_kernel;
/* Flush the L1 Data cache for L1TF mitigation on VMENTER */
bool l1tf_flush_l1d;
/* AMD MSRC001_0015 Hardware Configuration */
u64 msr_hwcr;
};
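The header comment above ("rip and regs accesses must go through kvm_{register,rip}_{read,write} functions") refers to the caching scheme behind regs, regs_avail, and regs_dirty. Below is a from-memory sketch of the read side in the spirit of arch/x86/kvm/kvm_cache_regs.h of this kernel era, so treat the details as approximate.
// arch/x86/kvm/kvm_cache_regs.h (sketch)
static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
					      enum kvm_reg reg)
{
	/* pull the value out of hardware (e.g. the VMCS) only on first use */
	if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
		kvm_x86_ops->cache_reg(vcpu, reg);

	return vcpu->arch.regs[reg];
}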
First, a look at the relevant parts of the structs involved.
// include/linux/kvm_host.h
struct kvm {
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
struct list_head vm_list; // links this VM into the global vm_list
struct kvm_arch arch; // platform-specific state
};
struct kvm represents one virtual machine and contains multiple vCPUs.
// include/linux/kvm_host.h
struct kvm_vcpu {
int cpu; /* number of the physical CPU currently running this vCPU */
};
struct kvm_vcpu represents one vCPU; at any given moment it corresponds to at most one physical CPU.
// arch/x86/kvm/vmx/vmx.h
struct vcpu_vmx {
struct kvm_vcpu vcpu;
/*
* loaded_vmcs points to the VMCS currently used in this vcpu. For a
* non-nested (L1) guest, it always points to vmcs01. For a nested
* guest (L2), it points to a different VMCS.
*/
struct loaded_vmcs vmcs01;
struct loaded_vmcs *loaded_vmcs;
};
struct vcpu_vmx represents a vCPU's execution environment. loaded_vmcs points to the VMCS the vCPU is currently using: for a non-nested (L1) guest it always points to vmcs01, while for a nested guest (L2) it points to a different VMCS.
// arch/x86/kvm/vmx/vmcs.h
struct loaded_vmcs {
struct vmcs *vmcs; /* the VMCS backing this vCPU */
int cpu; /* number of the CPU this VMCS was last loaded on */
int launched; /* whether VMLAUNCH has already been executed on that CPU */
struct list_head loaded_vmcss_on_cpu_link; /* links into the per-CPU list of all loaded VMCSs, cleaned up when the CPU goes down */
};
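For reference, the per-CPU list that loaded_vmcss_on_cpu_link chains into is declared in vmx.c roughly as follows (a sketch from memory):
// arch/x86/kvm/vmx/vmx.c (sketch)
/* every loaded_vmcs on a given physical CPU is linked here, so the
 * list can be walked and cleared when that CPU goes offline */
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);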
The vmcs structure itself looks like this:
struct vmcs_hdr {
u32 revision_id:31;
u32 shadow_vmcs:1;
};
// the concrete layout of the VMCS is implemented by the hardware; software only accesses it through the VMWRITE and VMREAD instructions
struct vmcs {
struct vmcs_hdr hdr;
u32 abort;
char data[0];
};
// one current VMCS per logical CPU
DECLARE_PER_CPU(struct vmcs *, current_vmcs);
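To illustrate that access model, here is a simplified, hedged version of a VMREAD helper in the style of the kernel's vmcs_read family; the real helpers add fault handling around the asm.
// Simplified sketch of reading a field of the current (loaded) VMCS.
static __always_inline unsigned long vmcs_readl_sketch(unsigned long field)
{
	unsigned long value;

	/* VMREAD: source operand names the VMCS field, destination gets its value */
	asm volatile("vmread %1, %0" : "=r"(value) : "r"(field) : "cc");
	return value;
}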
vcpu_vmx is really an execution environment for the vCPU. Through its loaded_vmcs and cpu members it ties the VMCS to a physical CPU. A vCPU can of course run on different physical CPUs; it suffices to update the cpu number inside loaded_vmcs. But why would one vCPU correspond to several different VMCSs? The reason is nested virtualization: when the vCPU of an L2 guest is loaded, the VMCS it uses is not the L1-level VMCS but an L2-level one. In effect, the L1 vCPU serves as a physical CPU for L2, and a physical CPU can naturally have multiple VMCSs. In the plain L1 case, loaded_vmcs simply points to vmcs01, and when the vCPU migrates to another physical CPU only the cpu member of loaded_vmcs needs to change:
static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
{
/* allocation of the VMCS */
vmx->loaded_vmcs = &vmx->vmcs01;
vmx->loaded_vmcs->vmcs = alloc_vmcs();
}
In L2, loaded_vmcs switches between the L1 VMCS and the individual L2 VMCSs, which requires redirecting the loaded_vmcs pointer itself:
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{
cpu = get_cpu();
vmx->loaded_vmcs = vmcs02;
vmx_vcpu_put(vcpu);
vmx_vcpu_load(vcpu, cpu);
vcpu->cpu = cpu;
put_cpu();
}
/*
* vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
* and whose values change infrequently, but are not constant. I.e. this is
* used as a write-through cache of the corresponding VMCS fields.
*/
struct vmcs_host_state {
unsigned long cr3; /* May not match real cr3 */
unsigned long cr4; /* May not match real cr4 */
unsigned long gs_base;
unsigned long fs_base;
unsigned long rsp;
u16 fs_sel, gs_sel, ldt_sel;
#ifdef CONFIG_X86_64
u16 ds_sel, es_sel;
#endif
};
On VM-Exit, these values are loaded from the VMCS; they are rarely modified.
The kvm_x86_ops struct contains the function-pointer hooks invoked when virtualizing a concrete CPU architecture; it is defined in the kernel file arch/x86/include/asm/kvm_host.h.
All members of kvm_x86_ops are function pointers, and the two modules kvm-intel.ko and kvm-amd.ko provide different implementations for their respective architectures. During KVM's initialization and subsequent operation, the KVM subsystem performs the actual hardware operations through the functions in this struct.
kvm_x86_ops is initialized statically.
Part of the AMD kvm_x86_ops initialization is listed below.
Listing 5-2: kvm_x86_ops initialization fragment from svm.c
// arch/x86/kvm/svm.c
static struct kvm_x86_ops svm_x86_ops = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
.hardware_setup = svm_hardware_setup,
.hardware_unsetup = svm_hardware_unsetup,
.check_processor_compatibility = svm_check_processor_compat,
.hardware_enable = svm_hardware_enable,
.hardware_disable = svm_hardware_disable,
.cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr,
...
Note that the KVM architecture must simultaneously support different processor families. For that reason, kvm_x86_ops is registered during KVM initialization and exported as a global variable, so that KVM's submodules can call through it conveniently.
In arch/x86/kvm/x86.c, a static variable named kvm_x86_ops is defined and exported globally via the EXPORT_SYMBOL_GPL macro.
// arch/x86/kvm/x86.c
struct kvm_x86_ops *kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
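For context, the vendor module hands its ops table to the common code at load time; the svm.c module-init path looks roughly like this (a sketch, details vary across kernel versions):
// arch/x86/kvm/svm.c (sketch)
static int __init svm_init(void)
{
	/* passes svm_x86_ops down so kvm_arch_init() can install it */
	return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm),
			__alignof__(struct vcpu_svm), THIS_MODULE);
}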
During initialization in kvm_init, kvm_arch_init is called to assign kvm_x86_ops, as shown below; ops here is the kvm_x86_ops struct passed in when svm.c calls kvm_init.
Listing 5-3: assigning kvm_x86_ops
// arch/x86/kvm/x86.c
kvm_init_msr_list();

kvm_x86_ops = ops;
kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
The individual members of this struct are described in detail later, in the analysis of KVM initialization and the other key code paths.
KVM interacts with userspace programs mainly through the /dev/kvm device file.
As the later walk-through of KVM initialization shows, /dev/kvm is a character device supported by a set of standard Linux structs, chiefly kvm_chardev_ops, kvm_vm_fops, and kvm_vcpu_fops, corresponding to operations on the character device, on VM file descriptors, and on vCPU file descriptors respectively.
kvm_chardev_ops is defined in virt/kvm/kvm_main.c. During module initialization, kvm_init() calls misc_register(&kvm_dev) to register the /dev/kvm device file; the file-operations struct and the device itself are shown below.
Listing 5-4: kvm_chardev_ops
// virt/kvm/kvm_main.c
static struct file_operations kvm_chardev_ops = {
.unlocked_ioctl = kvm_dev_ioctl,
.llseek = noop_llseek,
KVM_COMPAT(kvm_dev_ioctl),
};
static struct miscdevice kvm_dev = {
KVM_MINOR,
"kvm",
&kvm_chardev_ops,
};
kvm_chardev_ops is a standard file_operations struct, but it implements only the ioctl function; read, open, write, and the other common system calls all use the default implementations. Userspace can therefore only drive the device through ioctl.
As described earlier, KVM_CREATE_VM returns an fd representing the VM. Inside the KVM module, operations on that fd are dispatched through the kvm_vm_fops struct, whose main content is as follows.
Listing 5-5: the kvm_vm_fops struct
// virt/kvm/kvm_main.c
static struct file_operations kvm_vm_fops = {
.release = kvm_vm_release,
.unlocked_ioctl = kvm_vm_ioctl, // ioctl interface of the VM fd, providing VM-level control
#ifdef CONFIG_COMPAT
.compat_ioctl = kvm_vm_compat_ioctl,
#endif
.mmap = kvm_vm_mmap,
.llseek = noop_llseek,
};
The VM file operations provide two functions, ioctl and mmap. mmap maps the guest OS's physical address space, so the guest's memory can be read and written directly, while ioctl delivers KVM control commands.
Issued against the VM fd, the KVM_CREATE_VCPU command creates a KVM vCPU and returns its vcpu_fd; inside the KVM module, operations on that descriptor correspond to the struct below.
The vcpu_fd operations are collected in kvm_vcpu_fops, whose main definition is as follows.
Listing 5-6: the kvm_vcpu_fops struct
// virt/kvm/kvm_main.c
static struct file_operations kvm_vcpu_fops = {
.release = kvm_vcpu_release,
.unlocked_ioctl = kvm_vcpu_ioctl,
.compat_ioctl = kvm_vcpu_ioctl,
.mmap = kvm_vcpu_mmap,
.llseek = noop_llseek,
};
Sending ioctls to the vcpu_fd controls the vCPU.
Through mmap, userspace can access the kvm_run struct, which stores the vCPU's run and control information and through which its run parameters can be set.
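Putting the pieces together, here is a hedged end-to-end sketch of that mmap-plus-ioctl pattern; KVM_GET_VCPU_MMAP_SIZE is the standard ioctl for querying the mapping size, and error handling is omitted.
// Hypothetical sketch: map kvm_run, run the vCPU, and decode VM-Exits.
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/kvm.h>

void run_loop(int kvm_fd, int vcpu_fd)
{
    long sz = ioctl(kvm_fd, KVM_GET_VCPU_MMAP_SIZE, 0);
    struct kvm_run *run = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                               MAP_SHARED, vcpu_fd, 0);
    for (;;) {
        ioctl(vcpu_fd, KVM_RUN, 0);          /* enter the guest */
        switch (run->exit_reason) {          /* why we came back */
        case KVM_EXIT_IO:
            /* the I/O data itself lives at run->io.data_offset in the mapping */
            printf("port 0x%x, %u byte(s)\n", run->io.port, run->io.size);
            break;
        case KVM_EXIT_MMIO:
            printf("mmio at 0x%llx\n", (unsigned long long)run->mmio.phys_addr);
            break;
        default:
            return;
        }
    }
}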