kvm 模块加载后会生成/dev/kvm字符设备,/dev/kvm是一个标准的字符设备,可以使用常用的open、close、ioctl接口,使用ioctl代替read,write接口与kvm交互。KVM API从功能上可以分为三大类:
1、虚拟化system指令,针对虚拟化系统的全局性参数设置和控制。
2、VM指令,针对VM虚拟机进行控制,如:内存设置、创建VCPU等。
3、 VCPU指令,针对具体的VCPU进行参数设置。如:相关寄存器的读写、中断控制等。
通常对于KVM的操作都是从open /dev/kvm设备文件开始的,open后,会获得相应的文件描述符(fd),然后通过ioctl系统指令对该fd进行进一步的操作,比如通过KVM_CREATE_VM指令,可以创建一个虚拟机并返回虚拟机对应的文件描述符,然后根据该描述符来进一步控制虚拟机的行为,比如通过KVM_CREATE_VCPU指令来为该虚拟机创建VCPU。
一、 System指令
System ioctl指令用于控制KVM运行环境的参数,包括全局性的参数设置和虚拟机创建等工作,主要的指令字包括:
KVM_CREATE_VM 创建KVM虚拟机
KVM_GET_API_VERSION 查询当前KVM API版本
KVM_GET_MSR_INDEX_LIST 获得MSR索引列表
KVM_CHECK_EXTENSION 检查扩展支持情况
KVM_GET_VCPU_MMAP_SIZE 获取KVM_RUN所使用的、内核与用户态空间共享的一片内存区域(即kvm_run结构体所在区域)的大小
其中,KVM_CREATE_VM比较重要,用于创建虚拟机,并返回一个代表该虚拟机的描述符(fd)。新创建的虚拟机没有VCPU,也没有内存等资源,需要对创建虚拟机时返回的描述符,通过ioctl指令,进行进一步的配置。
二、VM指令
VM ioctl指令实现对虚拟机的控制,大多需要从KVM_CREATE_VM中返回的fd来进行操作,具体操作包括:配置内存、配置VCPU、运行虚拟机等,主要指令如下:
KVM_CREATE_VCPU 为虚拟机创建VCPU
KVM_RUN 根据kvm_run结构体信息,运行VCPU(注意:该指令实际作用于KVM_CREATE_VCPU返回的VCPU fd,属于VCPU ioctl)
KVM_CREATE_IRQCHIP 创建虚拟APIC,且随后创建的VCPU都关联到此APIC
KVM_IRQ_LINE 对某虚拟APIC发出中断信号
KVM_GET_IRQCHIP 读取APIC的中断标志信息
KVM_SET_IRQCHIP 写入APIC的中断标志信息
KVM_GET_DIRTY_LOG 返回脏内存页的位图
KVM_CREATE_VCPU和KVM_RUN是运行虚拟机时最重要的两个指令字:先在VM的fd上通过KVM_CREATE_VCPU为虚拟机创建VCPU,获得该VCPU对应的fd描述符后,再对这个VCPU fd调用KVM_RUN,以启动该虚拟机(或称为调度VCPU)。
Kvm结构体代表一个具体的虚拟机,当通过KVM_CREATE_VM指令字创建一个虚拟机后,就会创建一个新的kvm结构体对象。Kvm结构体中包括了VCPU、内存、APIC、IRQ、MMU、Event事件等相关信息,该结构体主要在KVM虚拟机内部使用,用于跟踪虚拟机状态。
/*
 * struct kvm - state of one virtual machine.
 *
 * Groups the vCPU pointers, memory slots, I/O buses, interrupt routing,
 * MMU-notifier bookkeeping and statistics that belong to a single VM.
 * Several members exist only when the matching CONFIG_* option is enabled.
 */
struct kvm {
spinlock_t mmu_lock;
struct mutex slots_lock;
struct mm_struct *mm; /* userspace tied to this vm */
struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];/* memory slots allocated to the VM; used for GPA->HVA translation (memory virtualization) */
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];/* vCPU pointers; KVM_MAX_VCPUS is the maximum number of vCPUs KVM supports */
/*
 * created_vcpus is protected by kvm->lock, and is incremented
 * at the beginning of KVM_CREATE_VCPU. online_vcpus is only
 * incremented after storing the kvm_vcpu pointer in vcpus,
 * and is accessed atomically.
 */
atomic_t online_vcpus;
int created_vcpus;
int last_boosted_vcpu;
/* NOTE(review): presumably links this VM into a global list of VMs - confirm */
struct list_head vm_list;
struct mutex lock;
struct kvm_io_bus __rcu *buses[KVM_NR_BUSES];
#ifdef CONFIG_HAVE_KVM_EVENTFD
/* irqfd bookkeeping; only present with CONFIG_HAVE_KVM_EVENTFD */
struct {
spinlock_t lock;
struct list_head items;
struct list_head resampler_list;
struct mutex resampler_lock;
} irqfds;
struct list_head ioeventfds;
#endif
struct kvm_vm_stat stat;/* runtime status information of the VM, e.g. page-table and MMU state */
struct kvm_arch arch;
refcount_t users_count;
#ifdef CONFIG_KVM_MMIO
/* coalesced-MMIO state; only present with CONFIG_KVM_MMIO */
struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
spinlock_t ring_lock;
struct list_head coalesced_zones;
#endif
struct mutex irq_lock;
#ifdef CONFIG_HAVE_KVM_IRQCHIP
/*
 * Update side is protected by irq_lock.
 */
struct kvm_irq_routing_table __rcu *irq_routing;
#endif
#ifdef CONFIG_HAVE_KVM_IRQFD
struct hlist_head irq_ack_notifier_list;
#endif
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
struct mmu_notifier mmu_notifier;
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
#endif
long tlbs_dirty;
struct list_head devices;
bool manual_dirty_log_protect;
struct dentry *debugfs_dentry;
struct kvm_stat_data **debugfs_stat_data;
struct srcu_struct srcu;
struct srcu_struct irq_srcu;
/* NOTE(review): presumably the pid of the userspace process that owns this VM - confirm */
pid_t userspace_pid;
struct kvm_mig_opt mig_opt;
};
kvm_run结构体定义在include/uapi/linux/kvm.h中,可以通过该结构体了解KVM的内部运行状态。
/*
 * struct kvm_run - per-vCPU state shared between KVM and userspace.
 *
 * For KVM_RUN; returned by mmap(vcpu_fd, offset=0).
 *
 * Userspace fills the "in" members before issuing KVM_RUN and reads the
 * "out" members after it returns. exit_reason selects which member of the
 * anonymous union below carries the details of the exit.
 */
struct kvm_run {
	/* in */
	__u8 request_interrupt_window;	/* ask KVM_RUN to return once an interrupt can be injected into the vCPU */
	__u8 immediate_exit;
	__u8 padding1[6];

	/* out */
	__u32 exit_reason;		/* why KVM_RUN returned (see the KVM_EXIT_* cases below) */
	__u8 ready_for_interrupt_injection; /* answer to request_interrupt_window: when set, the vCPU can accept an interrupt */
	__u8 if_flag;			/* interrupt-enable flag; not valid when the in-kernel APIC is used */
	__u16 flags;

	/* in (pre_kvm_run), out (post_kvm_run) */
	__u64 cr8;
	__u64 apic_base;

#ifdef __KVM_S390
	/* the processor status word for s390 */
	__u64 psw_mask; /* psw upper half */
	__u64 psw_addr; /* psw lower half */
#endif
	union {
		/* KVM_EXIT_UNKNOWN */
		struct {
			__u64 hardware_exit_reason;
		} hw;
		/* KVM_EXIT_FAIL_ENTRY */
		struct {
			__u64 hardware_entry_failure_reason;
		} fail_entry;
		/* KVM_EXIT_EXCEPTION */
		struct {
			__u32 exception;
			__u32 error_code;
		} ex;
		/* KVM_EXIT_IO */
		struct {
#define KVM_EXIT_IO_IN 0
#define KVM_EXIT_IO_OUT 1
			__u8 direction;
			__u8 size; /* bytes */
			__u16 port;
			__u32 count;
			__u64 data_offset; /* relative to kvm_run start */
		} io; /* holds the I/O details when the VM exit was caused by an I/O operation */
		/* KVM_EXIT_DEBUG */
		struct {
			struct kvm_debug_exit_arch arch;
		} debug;
		/* KVM_EXIT_MMIO */
		struct {
			__u64 phys_addr;
			__u8 data[8];
			__u32 len;
			__u8 is_write;
		} mmio;
		/* KVM_EXIT_HYPERCALL */
		struct {
			__u64 nr;
			__u64 args[6];
			__u64 ret;
			__u32 longmode;
			__u32 pad;
		} hypercall; /* hypercall exit */
		/* KVM_EXIT_TPR_ACCESS */
		struct {
			__u64 rip;
			__u32 is_write;
			__u32 pad;
		} tpr_access;
		/* KVM_EXIT_S390_SIEIC */
		struct {
			__u8 icptcode;
			__u16 ipa;
			__u32 ipb;
		} s390_sieic;
		/* KVM_EXIT_S390_RESET */
#define KVM_S390_RESET_POR 1
#define KVM_S390_RESET_CLEAR 2
#define KVM_S390_RESET_SUBSYSTEM 4
#define KVM_S390_RESET_CPU_INIT 8
#define KVM_S390_RESET_IPL 16
		__u64 s390_reset_flags;
		/* KVM_EXIT_S390_UCONTROL */
		struct {
			__u64 trans_exc_code;
			__u32 pgm_code;
		} s390_ucontrol;
		/* KVM_EXIT_DCR (deprecated) */
		struct {
			__u32 dcrn;
			__u32 data;
			__u8 is_write;
		} dcr;
		/* KVM_EXIT_INTERNAL_ERROR */
		struct {
			__u32 suberror;
			/* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
			__u32 ndata;
			__u64 data[16];
		} internal;
		/* KVM_EXIT_OSI */
		struct {
			__u64 gprs[32];
		} osi;
		/* KVM_EXIT_PAPR_HCALL */
		struct {
			__u64 nr;
			__u64 ret;
			__u64 args[9];
		} papr_hcall;
		/* KVM_EXIT_S390_TSCH */
		struct {
			__u16 subchannel_id;
			__u16 subchannel_nr;
			__u32 io_int_parm;
			__u32 io_int_word;
			__u32 ipb;
			__u8 dequeued;
		} s390_tsch;
		/* KVM_EXIT_EPR */
		struct {
			__u32 epr;
		} epr;
		/* KVM_EXIT_SYSTEM_EVENT */
		struct {
#define KVM_SYSTEM_EVENT_SHUTDOWN 1
#define KVM_SYSTEM_EVENT_RESET 2
#define KVM_SYSTEM_EVENT_CRASH 3
			__u32 type;
			__u64 flags;
		} system_event;
		/* KVM_EXIT_S390_STSI */
		struct {
			__u64 addr;
			__u8 ar;
			__u8 reserved;
			__u8 fc;
			__u8 sel1;
			__u16 sel2;
		} s390_stsi;
		/* KVM_EXIT_IOAPIC_EOI */
		struct {
			__u8 vector;
		} eoi;
		/* KVM_EXIT_HYPERV */
		struct kvm_hyperv_exit hyperv;
		/* Fix the size of the union. */
		char padding[256];
	};

	/* 2048 is the size of the char array used to bound/pad the size
	 * of the union that holds sync regs.
	 */
#define SYNC_REGS_SIZE_BYTES 2048
	/*
	 * shared registers between kvm and userspace.
	 * kvm_valid_regs specifies the register classes set by the host
	 * kvm_dirty_regs specified the register classes dirtied by userspace
	 * struct kvm_sync_regs is architecture specific, as well as the
	 * bits for kvm_valid_regs and kvm_dirty_regs
	 */
	__u64 kvm_valid_regs;
	__u64 kvm_dirty_regs;
	union {
		struct kvm_sync_regs regs;
		char padding[SYNC_REGS_SIZE_BYTES];
	} s;
};	/* terminating ';' was missing in the original listing */
三、 VCPU指令
VCPU ioctl指令主要针对具体的VCPU进行配置,包括寄存器读写、中断设置、内存设置、时钟管理、调试开关等,可以对KVM虚拟机进行运行时配置。主要指令字包括:
1. 寄存器控制方面
KVM_GET_REGS 获取通用寄存器信息
KVM_SET_REGS 设置通用寄存器信息
KVM_GET_SREGS 获取特殊寄存器信息
KVM_SET_SREGS 设置特殊寄存器信息
KVM_GET_MSRS 获取MSR寄存器信息
KVM_SET_MSRS 设置MSR寄存器信息
KVM_GET_FPU 获取浮点寄存器信息
KVM_SET_FPU 设置浮点寄存器信息
KVM_GET_XSAVE 获取VCPU的xsave寄存器信息
KVM_SET_XSAVE 设置VCPU的xsave寄存器信息
KVM_GET_XCRS 获取VCPU的xcr寄存器信息
KVM_SET_XCRS 设置VCPU的xcr寄存器信息
2. 中断和事件管理方面
KVM_INTERRUPT 在VCPU上产生中断(当APIC无效时)
KVM_SET_SIGNAL_MASK 设置某个VCPU在执行KVM_RUN期间使用的信号(signal)屏蔽掩码
KVM_GET_CPU_EVENTS 获取VCPU中被挂起待延时处理的事件,如中断、NMI或异常
KVM_SET_CPU_EVENTS 设置VCPU的事件,如中断、NMI或异常
3. 内存管理方面
KVM_TRANSLATE 按照VCPU当前的地址转换模式,将客户机虚拟地址(GVA)翻译成客户机物理地址(GPA)
KVM_SET_USER_MEMORY_REGION 创建、修改或删除虚拟机的内存区域(memory slot;该指令实际作用于VM fd)
KVM_SET_TSS_ADDR 初始化TSS内存区域(Intel架构专用)
KVM_SET_IDENTITY_MAP_ADDR 创建EPT页表(Intel架构专用)
4. 其他方面(如:CPUID的设置、调试接口等)
kvm中使用kvm_vcpu结构体(定义在include/linux/kvm_host.h中)保存vcpu相关的信息。
struct kvm_vcpu {
struct kvm *kvm;/*记录虚拟机相关信息*/
#ifdef CONFIG_PREEMPT_NOTIFIERS
struct preempt_notifier preempt_notifier; /*vcpu抢占通知*/
#endif
int cpu;
int vcpu_id;/*vcpu id*/
int srcu_idx;
int mode;
u64 requests;
unsigned long guest_debug;
int pre_pcpu;
struct list_head blocked_vcpu_list;
struct mutex mutex;
struct kvm_run *run; /*记录虚拟机运行状态*/
int guest_xcr0_loaded;
struct swait_queue_head wq;
struct pid __rcu *pid;
int sigset_active;
sigset_t sigset;
struct kvm_vcpu_stat stat;
unsigned int halt_poll_ns;
bool valid_wakeup;
#ifdef CONFIG_HAS_IOMEM
int mmio_needed;
int mmio_read_completed;
int mmio_is_write;
int mmio_cur_fragment;
int mmio_nr_fragments;
struct kvm_mmio_fragment mmio_fragments[KVM_MAX_MMIO_FRAGMENTS];
#endif
#ifdef CONFIG_KVM_ASYNC_PF
struct {
u32 queued;
struct list_head queue;
struct list_head done;
spinlock_t lock;
} async_pf;
#endif
#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
/*
* Cpu relax intercept or pause loop exit optimization
* in_spin_loop: set when a vcpu does a pause loop exit
* or cpu relax intercepted.
* dy_eligible: indicates whether vcpu is eligible for directed yield.
*/
struct {
bool in_spin_loop;
bool dy_eligible;
} spin_loop;
#endif
bool preempted;
bool ready;
struct kvm_vcpu_arch arch;
struct dentry *debugfs_dentry;