本节首先分析Qemu的初始化top level流程;从而引出Qemu各大功能模块的描述。最后分析Qemu与内核态KVM的通讯接口。
1.1.1Main的主流程
main– (vl.c function main)
a) module_call_init(MODULE_INIT_QOM);--设备驱动初始化和注册 type_init(x86_cpu_register_types)(target-i386/cpu.c)
b) module_call_init(MODULE_INIT_MACHINE); -- 机器类型注册初始化
machine_init(pc_machine_init)
c) socket_init
d) qemu_init_cpu_loop
e) configure_accelerator--tcg对KVM而言采用kvm type, 并调用kvm_init
accel_list[i].init();accel_list[] = {
{ "tcg", "tcg",tcg_available, tcg_init, &tcg_allowed },
{ "xen", "Xen",xen_available, xen_init, &xen_allowed },
{ "kvm","KVM", kvm_available, kvm_init, &kvm_allowed }, //open /dev/kvm
{ "qtest", "QTest",qtest_available, qtest_init, &qtest_allowed }, }
f) qemu_init_main_loop; –
qemu_mutex_lock
qemu_event_init
qemu_signal_init
g) qemu_init_cpu_loop
h) cpu_exec_init_all
1) memory_map_init建立系统内存的管理信息,第3章分析
memory_region_init
address_space_init
2) call io_mem_init ==> memory_region_init_io
i) bdrv_init_with_whitelist();块设备类型初始化
block层的注册函数:block_init(bdrv_init)
j) blk_mig_init块设备migration功能初始化
k) qdev_machine_initQemu要使用的Machine信息初始化
l) machine->init(&args);调用machine初始化,建立虚拟机的硬件信息
对于PC 而言该函数是pc_init_pci(pc_piix.c);Machine type 在pc_machine_init中注册
m) init_displaystate --qemu 本身的display init
n) vm_start() 启动虚拟机,使vcpu开始执行
o) main_loop()
1.1.2Qemu 设备管理架构
(1) Qemu设备分类
Qemu将设备分为如下几类:
// Categories under which init functions are registered. The registration
// macros block_init, machine_init, qapi_init and type_init each map to one
// of these values.
typedef enum {
    MODULE_INIT_BLOCK,   // storage devices, e.g. scsi, qcow2
    MODULE_INIT_MACHINE, // target machine types of the virtual machine
    MODULE_INIT_QAPI,
    MODULE_INIT_QOM,     // device classes inside the virtual machine
    MODULE_INIT_MAX      // last enumerator; equals the number of categories
} module_init_type;
不同类别的注册函数如下:
// Registration macros, one per module_init_type category. Each forwards to
// module_init() with the matching category value.
#define block_init(function) module_init(function, MODULE_INIT_BLOCK)
#define machine_init(function) module_init(function, MODULE_INIT_MACHINE)
#define qapi_init(function) module_init(function, MODULE_INIT_QAPI)
#define type_init(function) module_init(function, MODULE_INIT_QOM)

// Emits a function named do_qemu_init_<function> that calls
// register_module_init(function, type). The function is marked
// __attribute__((constructor)), so the compiler arranges for it to run
// before main().
#define module_init(function, type)                                         \
static void __attribute__((constructor)) do_qemu_init_ ## function(void) {  \
    register_module_init(function, type);                                   \
}
(2) Cpu类别
(target-i386/cpu.c)
// Type description for the x86 CPU: a non-abstract type named TYPE_X86_CPU
// whose parent is TYPE_CPU. Instances are X86CPU-sized and initialised by
// x86_cpu_initfn; the class is X86CPUClass-sized and initialised by
// x86_cpu_common_class_init.
static const TypeInfo x86_cpu_type_info = {
    .name          = TYPE_X86_CPU,
    .parent        = TYPE_CPU,
    .abstract      = false,
    .class_size    = sizeof(X86CPUClass),
    .class_init    = x86_cpu_common_class_init,
    .instance_size = sizeof(X86CPU),
    .instance_init = x86_cpu_initfn,
};

// Hands the x86 CPU type description to type_register_static(). According
// to the original note, the registry it is stored in is a hash table.
static void x86_cpu_register_types(void)
{
    type_register_static(&x86_cpu_type_info);
}

// Register x86_cpu_register_types through the type_init() macro.
type_init(x86_cpu_register_types)
(target-i386/helper.c)
// Excerpt of cpu_x86_init: only the object-creation step is shown; the
// dotted lines stand for code omitted from this excerpt.
X86CPU *cpu_x86_init(const char *cpu_model)
{
    X86CPU *cpu;
    .............
    // Create a CPU instance from the TYPE_X86_CPU type.
    cpu = X86_CPU(object_new(TYPE_X86_CPU));
    ..................
}
object的管理代码在(object.c中)
object_new ==> object_new_with_type==> object_initialize_with_type==>
type_initialize==>type_info->class_init
object_initialize_with_type==> type_info-> instance_init
Qemu采用了类似面向对象的方式来管理虚拟机中的设备;2.1节将分析这种机制。
(3) Machine 类别:
(hw/pc_piix.c)
// Machine description named "pc-1.3" with alias "pc". Its init hook is
// pc_init_pci and it is flagged as the default machine (is_default = 1).
static QEMUMachine pc_machine_v1_3 = {
    .name = "pc-1.3",
    .alias = "pc",
    .desc = "Standard PC",
    .init = pc_init_pci,
    .max_cpus = 255,
    .is_default = 1,
    .default_machine_opts = KVM_MACHINE_OPTIONS,
};
// Registers pc_machine_v1_3 with qemu_register_machine(); the rest of the
// body is omitted from this excerpt (dotted line).
static void pc_machine_init(void)
{
    qemu_register_machine(&pc_machine_v1_3);
    .................
}
// Register pc_machine_init through the machine_init() macro.
machine_init(pc_machine_init);
machine->init(vl.c) ==> pc_init_pci==>pc_init1
(4) Object 与 Objectclass
Object用于记录设备对象的信息,而object class则记录设备的类别信息。
object_initialize_with_type ==> x86_cpu_initfn(Object * object)
// Per-instance record of a device object.
struct Object
{
    /*< private >*/
    // Pointer to the device's class (type) information.
    ObjectClass *class;
    // List of properties; one object can carry several properties.
    QTAILQ_HEAD(, ObjectProperty) properties;
    // Reference count.
    uint32_t ref;
    // Pointer to a parent object. The original note describes it as the
    // parent-class object used to implement inheritance.
    // NOTE(review): confirm — this may instead be the containing object.
    Object *parent;
};
a. type_initialize 中将分配class:
ti->class =g_malloc0(ti->class_size);
ti->class->type = ti;
b. object_initialize_with_type 中 obj->class = type->class;
1.1.3Qemu 调用kvm内核模块流程
(1) KVM初始化
configure_accelerator--tcg ==> kvm_init
==>qemu_open("/dev/kvm", O_RDWR); //KVM访问句柄
==>kvm_ioctl(s, KVM_GET_API_VERSION, 0);
==>kvm_ioctl(s,KVM_CREATE_VM, 0); //创建virtual machine访问句柄
==> kvm_arch_init ==>
kvm_vm_ioctl(s,KVM_SET_IDENTITY_MAP_ADDR, &identity_base)
kvm_vm_ioctl(s,KVM_SET_TSS_ADDR, identity_base + 0x1000)
kvm_vm_ioctl(s,KVM_SET_NR_MMU_PAGES, shadow_mem)
==>kvm_irqchip_create(s) ==> kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP)
(2) CPU虚拟化访问接口
pc_init1 ==> pc_cpus_init(hw/pc.c) ==> pc_new_cpu ==> cpu_x86_init(target-i386/helper.c) ==> x86_cpu_realize(target-i386/cpu.c) ==> qemu_init_vcpu(cpus.c) ==> qemu_kvm_start_vcpu
qemu_kvm_start_vcpu
// Starts the vcpu thread for env and blocks until that thread has marked
// itself as created.
static void qemu_kvm_start_vcpu(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);

    cpu->thread = g_malloc0(sizeof(QemuThread));
    env->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(env->halt_cond);
    // Launch the thread running qemu_kvm_cpu_thread_fn with env as argument.
    qemu_thread_create(cpu->thread, qemu_kvm_cpu_thread_fn, env,
                       QEMU_THREAD_JOINABLE);
    // Wait on qemu_cpu_cond until env->created becomes non-zero.
    while (env->created == 0) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}
qemu_kvm_cpu_thread_fn => kvm_init_vcpu(kvm-all.c)
// Thread body for one vcpu. It initialises the vcpu through kvm_init_vcpu,
// marks env as created and signals qemu_cpu_cond, then loops forever:
// run the vcpu when cpu_can_run() allows it, then wait for an I/O event.
// arg is the CPUArchState of the vcpu. The trailing return is never
// reached because the loop has no exit.
static void *qemu_kvm_cpu_thread_fn(void*arg)
{
    CPUArchState *env = arg;
    CPUState *cpu = ENV_GET_CPU(env);
    int r;
    qemu_mutex_lock(&qemu_global_mutex);
    qemu_thread_get_self(cpu->thread);
    env->thread_id = qemu_get_thread_id();
    cpu_single_env = env;
    r= kvm_init_vcpu(env);
    // A negative result carries the negated errno; report it and exit.
    if (r < 0) {
        fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));
        exit(1);
    }
    qemu_kvm_init_cpu_signals(env);
    /* signal CPU creation */
    env->created = 1;
    qemu_cond_signal(&qemu_cpu_cond);
    while (1) {
        // Only when the VM is in the running state (per the original note,
        // main ==> vm_start puts the VM into that state).
        if (cpu_can_run(env)) {
            r = kvm_cpu_exec(env);
            if (r == EXCP_DEBUG) {
                cpu_handle_guest_debug(env);
            }
        }
        qemu_kvm_wait_io_event(env);
    }
    return NULL;
}
// Excerpt of kvm_init_vcpu: creates the vcpu in KVM, stores the resulting
// handle in env and maps the vcpu's shared run area. Dotted lines stand for
// code omitted from this excerpt (declarations and the err label included).
int kvm_init_vcpu(CPUArchState *env)
{
    .......
    // Create the vcpu; the ioctl result is kept as the vcpu handle.
    ret= kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);
    if (ret < 0) {
        DPRINTF("kvm_create_vcpu failed\n");
        goto err;
    }
    env->kvm_fd = ret;
    env->kvm_state = s;
    env->kvm_vcpu_dirty = 1;
    // Ask KVM for the size to pass to mmap() below (per the original note:
    // the size of the kernel's vcpu structure).
    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE,0);
    ......
    // Map that area from the vcpu handle, shared and read/write.
    env->kvm_run = mmap(NULL, mmap_size,PROT_READ | PROT_WRITE, MAP_SHARED,
                        env->kvm_fd, 0);
    ......
    // Architecture-specific vcpu setup (per the original note: obtains the
    // kernel's CPU virtualization support parameters).
    ret = kvm_arch_init_vcpu(env);
    ......
}
// Excerpt of the vcpu run loop: issues KVM_RUN, then dispatches on the exit
// reason reported in run->exit_reason, repeating while the handler result
// is 0. A negative result dumps the CPU state and stops the VM. Dotted
// lines stand for code omitted from this excerpt.
int kvm_cpu_exec(CPUArchState *env)
{
    ......
    do {
        ......
        kvm_arch_pre_run(env, run);
        if (env->exit_request) {
            qemu_cpu_kick_self();
        }
        // The iothread lock is dropped around KVM_RUN and retaken after it.
        qemu_mutex_unlock_iothread();
        // The CPU performs VM-Entry here; when this call returns, either a
        // VM-Exit occurred or the VM-Entry failed.
        run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
        qemu_mutex_lock_iothread();
        ......//
        // Handle the VM-Exit according to its reason.
        switch (run->exit_reason) {
        case KVM_EXIT_IO: // VM-Exit caused by an I/O port operation
            kvm_handle_io(run->io.port,
                          (uint8_t *)run +run->io.data_offset,
                          run->io.direction,
                          run->io.size,
                          run->io.count);
            ret = 0;
            break;
        case KVM_EXIT_MMIO: // VM-Exit caused by an MMIO access
            cpu_physical_memory_rw(run->mmio.phys_addr,
                                   run->mmio.data,
                                   run->mmio.len,
                                   run->mmio.is_write);
            ret = 0;
            break;
        case KVM_EXIT_IRQ_WINDOW_OPEN:
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_SHUTDOWN:
            qemu_system_reset_request();
            ret = EXCP_INTERRUPT;
            break;
        case KVM_EXIT_UNKNOWN:
            fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64"\n",
                    (uint64_t)run->hw.hardware_exit_reason);
            ret = -1;
            break;
        case KVM_EXIT_INTERNAL_ERROR: // internal CPU error (exception)
            ret = kvm_handle_internal_error(env,run);
            break;
        default:
            // Every other exit reason goes to the architecture handler.
            ret = kvm_arch_handle_exit(env,run);
            break;
        }
    // After a VM-Exit has been handled, loop to VM-Entry again unless the
    // guest halted or a serious error occurred (ret != 0).
    }while (ret == 0);
    if (ret < 0) {
        cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);
        vm_stop(RUN_STATE_INTERNAL_ERROR);
    }
    env->exit_request = 0;
    return ret;
}
小结Qemu访问KVM的句柄有:
(1) KVM访问句柄 (2)虚拟机访问句柄 (3) VCPU访问句柄
(3) 内存虚拟化访问接口
内存管理结构初始化
kvm_init ==> memory_listener_register(&kvm_memory_listener,NULL);
// Memory listener through which KVM receives memory-topology callbacks.
// It is passed to memory_listener_register() from kvm_init.
static MemoryListener kvm_memory_listener = {
    // update bracketing
    .begin            = kvm_begin,
    .commit           = kvm_commit,
    // region changes
    .region_add       = kvm_region_add,
    .region_del       = kvm_region_del,
    .region_nop       = kvm_region_nop,
    // logging hooks
    .log_start        = kvm_log_start,
    .log_stop         = kvm_log_stop,
    .log_sync         = kvm_log_sync,
    .log_global_start = kvm_log_global_start,
    .log_global_stop  = kvm_log_global_stop,
    // eventfd hooks
    .eventfd_add      = kvm_eventfd_add,
    .eventfd_del      = kvm_eventfd_del,
    .priority         = 10,
};
memory_region_add_subregion ==> listener_add_address_space ==>region_add
kvm_region_add==> kvm_set_phys_mem==>kvm_set_user_memory_region ==>
kvm_vm_ioctl(s,KVM_SET_USER_MEMORY_REGION, &mem);
虚拟机内存初始化:
pc_init1==>pc_memory_init==>memory_region_add_subregion(memory.c) 添加内存区域到虚拟机的内存管理结构; 第3章将分析内存虚拟化。
1.1.4Qemu IO管理
main==>cpu_exec_init_all()
// Called from main: runs memory_map_init() followed by io_mem_init().
void cpu_exec_init_all(void)
{
    memory_map_init();
    io_mem_init();
}
X86 有两种硬件访问方式PIO 与 MMIO, 下面分别讲解
(1) PIO
isa_cirrus_vga采用IO port方式访问
IO port的注册
vga_initfn (cirrus_vga.c) ==> cirrus_init_common ==> register_ioport_read(ioport.c)
// Excerpt of register_ioport_read: installs func as the read handler for
// every port in [start, start + length) and records opaque for each port.
// The dotted line stands for omitted code, which includes the declarations
// of i and bsize. Returns 0 on the path shown.
int register_ioport_read(pio_addr_t start, int length, int size,
                         IOPortReadFunc *func, void *opaque)
{
    ......
    for (i = start; i < start + length; ++i) {
        ioport_read_table[bsize][i] = func;
        // A port that already has a different opaque pointer is reported
        // through hw_error().
        if (ioport_opaque[i] != NULL && ioport_opaque[i] != opaque) {
            hw_error("register_ioport_read: invalid opaque for address 0x%x",
                     i);
        }
        ioport_opaque[i] = opaque;
    }
    return 0;
}
当虚拟机由IO port引起VM-Exit时
kvm_handle_io==> cpu_inl (ioport.c)==> ioport_read
static uint32_t ioport_read(int index,uint32_t address)
{
static IOPortReadFunc * const default_func[3] = {
default_ioport_readb,
default_ioport_readw,
default_ioport_readl
};
IOPortReadFunc *func = ioport_read_table[index][address];
if (!func)
func = default_func[index];
return func(ioport_opaque[address], address);
}
(2)MMIO
// Excerpt of cirrus_init_common showing the low-memory MMIO setup: a
// 0x20000-byte container region is created, an I/O region backed by
// cirrus_vga_mem_ops is created with s as its opaque pointer, and the I/O
// region is added to the container at offset 0. Dotted lines stand for
// code omitted from this excerpt.
static void cirrus_init_common(CirrusVGAState * s, int device_id, int is_pci,
                               MemoryRegion *system_memory)
{
    ........
    memory_region_init(&s->low_mem_container,
                       "cirrus-lowmem-container", 0x20000);
    memory_region_init_io(&s->low_mem, &cirrus_vga_mem_ops, s,
                          "cirrus-low-memory", 0x20000);
    memory_region_add_subregion(&s->low_mem_container, 0, &s->low_mem);
    .......
}
定义mmio的read,write
static const MemoryRegionOpscirrus_vga_mem_ops = {
.read = cirrus_vga_mem_read,
.write = cirrus_vga_mem_write,
.endianness = DEVICE_LITTLE_ENDIAN,
.impl = {
.min_access_size = 1,
.max_access_size = 1,
},
};
当虚拟机由MMIO引起VM-Exit时
cpu_physical_memory_rw(exec.c)==>io_mem_read(memory.c)==>
memory_region_dispatch_read==> access_with_adjusted_size
// Excerpt of memory_region_dispatch_read1: reads `size` bytes at `addr`
// from region mr through access_with_adjusted_size(), using the region's
// implementation access-size limits and memory_region_read_accessor, and
// returns the value read. The dotted line stands for omitted code, which
// includes the declaration of data.
static uint64_t memory_region_dispatch_read1(MemoryRegion *mr,
                                             target_phys_addr_t addr,
                                             unsigned size)
{
    .......
    access_with_adjusted_size(addr, &data, size,
                              mr->ops->impl.min_access_size,
                              mr->ops->impl.max_access_size,
                              memory_region_read_accessor, mr);
    return data;
}
memory_region_read_accessor ==>mr->ops->read
第5.1节将详细介绍io的管理框架
1.1.5Qemu IO thread
IO thread 用来管理虚拟机的IO 读写,如对block设备的访问。5.4节将做详细介绍
// One iteration of the main loop: fill the descriptor sets, wait in
// os_host_main_loop_wait(), poll the I/O handlers, run timers, then run
// bottom halves. nonblocking != 0 forces a zero timeout; otherwise the
// timeout starts at UINT32_MAX and may be adjusted by the callees it is
// passed to. Returns the result of os_host_main_loop_wait().
// nfds, rfds, wfds and xfds are not declared in this excerpt.
int main_loop_wait(int nonblocking)
{
    int ret;
    uint32_t timeout = UINT32_MAX;
    if (nonblocking) {
        timeout = 0;
    }else {
        qemu_bh_update_timeout(&timeout);
    }
    /* poll any events */
    /* XXX: separate device handlers from system ones */
    // Reset the descriptor sets before they are refilled below.
    nfds = -1;
    FD_ZERO(&rfds);
    FD_ZERO(&wfds);
    FD_ZERO(&xfds);
#ifdef CONFIG_SLIRP
    slirp_update_timeout(&timeout);
    slirp_select_fill(&nfds, &rfds, &wfds, &xfds);
#endif
    qemu_iohandler_fill(&nfds, &rfds, &wfds, &xfds);
    ret = os_host_main_loop_wait(timeout);
    qemu_iohandler_poll(&rfds, &wfds, &xfds, ret);
#ifdef CONFIG_SLIRP
    // The last argument tells slirp whether the wait reported an error.
    slirp_select_poll(&rfds, &wfds, &xfds, (ret < 0));
#endif
    qemu_run_all_timers();
    /* Check bottom-halves last in case any of the earlier events triggered
       them. */
    qemu_bh_poll();
    return ret;
}
Qemu中常用的IO描述符有下面几类:
· block io:虚拟磁盘相关的io,为了保证高性能,主要使用aio;
· qemu_notify_event
例子:qemu的时钟模拟利用了linux kernel的signalfd, 定期产生SIGALRM信号(qemu-timer.c);
· eventfd:主要用于qemu和kvm之间的notifier, 比如qemu的模拟设备可以通过notifier向kvm发送一个模拟中断,kvm也可以通过notifier向qemu报告guest的各种状态;
address_space_update_topology==>address_space_update_ioeventfds==>address_space_add_del_ioeventfds==>MEMORY_LISTENER_CALL==>eventfd_add(kvm_mem_ioeventfd_add)==>kvm_vm_ioctl(kvm_state,KVM_IOEVENTFD, &iofd);
· socket:用于虚拟机迁移,qmp管理等
该函数同时还负责轮询系统中所有的定时器,并调用定时器的回调函数;
IO Handler
用来表示一个IO描述符,其结构定义如下;iohandler.c中定义了一个全局的链表io_handlers,并提供qemu_set_fd_handler()和qemu_set_fd_handler2()函数将一个fd加入到这个链表QLIST_INSERT_HEAD; 在IO thread主循环中qemu_iohandler_fill()函数负责将io_handlers链表中的所有描述符,加入select测试集合。
Qemu IO thread和vcpu thread使用一个全局共享线程锁来保证同步,函数qemu_mutex_lock_iothread()和qemu_mutex_unlock_iothread()分别用来获取和释放该锁。
1.1.6 Qemu的模块
下面的表格是本系列文章将会分析到的代码和其对应的模块:
模块名与描述 |
文件 |
章节 |
参数管理与main函数 |
Vl.c Qemu-config.c Arch_init.c Qemu-opt.c |
1.1 8.3 |
Kvm访问接口层 |
Target-i386\Kvm.c Kvm-all.c |
1.2 2章 |
设备对象模型 |
Qdev.c; qdev-properties.c module.c |
2.1 |
Machine与cpu管理 |
Hw\pc_piix.c Hw\pc.c Target-i386\Machine.c cpu_exec.c |
第2章 |
中断与时间管理 |
Hw\kvm\(ioapic.c, i8259.c, i8254.c, apic.c, clock.c) |
第4章 |
内存管理 |
Memory.c Memory-mapping.c Exec.c |
第3章 |
硬件辅助虚拟化 |
Hw\(pci.c, pcie.c,pci_bridge.c,piix_pci.c) Hw\ide\(core.c,pci.c,piix.c, piix.c) |
5.1 5.2 5.3 |
半虚拟化 |
Hw\(virtio.c, virtio-pci.c, virtio-balloon.c) |
6章 |
直接io |
Hw\kvm\pci-assign.c |
7章 |
块设备 |
Block.c Blockdev.c Block\raw-posix.c |
5.4 |
异步io |
Aio.c posix-aio-compat.c iohandler.c main-loop.c |
5.4 |
字符设备 |
Qemu-char.c |
8.1 |
管理模块 |
Qmp.c; hmp.c qdev-monitor.c Monitor.c Vmsave.c |
8.1 8.2 |