1.1Qemu 用户态架构


本节首先分析Qemu的初始化top level流程;从而引出Qemu各大功能模块的描述。最后分析Qemu与内核态KVM的通讯接口。

1.1.1Main的主流程

main–  (vl.c function main)

a)   module_call_init(MODULE_INIT_QOM);--设备驱动初始化和注册 type_init(x86_cpu_register_types)(target-i386/cpu.c)

b)  module_call_init(MODULE_INIT_MACHINE); -- 机器类型注册初始化

machine_init(pc_machine_init)

 

c)  socket_init

d)  qemu_init_cpu_loop

 

e)  configure_accelerator--tcg对KVM而言采用kvm type, 并调用kvm_init

accel_list[i].init();accel_list[] = {

{ "tcg", "tcg",tcg_available, tcg_init, &tcg_allowed },

{ "xen", "Xen",xen_available, xen_init, &xen_allowed },

{ "kvm","KVM", kvm_available, kvm_init, &kvm_allowed }, //open /dev/kvm

{ "qtest", "QTest",qtest_available, qtest_init, &qtest_allowed }, }

 

f)   qemu_init_main_loop; –

qemu_mutex_lock

qemu_event_init

qemu_signal_init

 

g)   qemu_init_cpu_loop

 

h)   cpu_exec_init_all

1)  memory_map_init建立系统内存的管理信息,第3章分析

memory_region_init

 address_space_init

  2) call io_mem_init ==> memory_region_init_io

 

i)  bdrv_init_with_whitelist();块设备类型初始化

 block层的注册函数:block_init(bdrv_init)

j)  blk_mig_init块设备migration功能初始化

k)  qdev_machine_initQemu要使用的Machine信息初始化

 

l)  machine->init(&args);调用machine初始化,建立虚拟机的硬件信息

对于PC而言该函数是pc_init_pci (pc_piix.c); Machine type 在pc_machine_init中注册

 

m)   init_displaystate --qemu 本身的display init

n)   vm_start() 启动虚拟机,是vcpu开始执行

o)   main_loop()

 

1.1.2Qemu 设备管理架构

(1) Qemu设备分类

Qemu将设备分为如下几类:

/* Module (registration) categories used by module_call_init(). */
typedef enum {
    MODULE_INIT_BLOCK,   /* storage/block drivers, e.g. scsi, qcow2 */
    MODULE_INIT_MACHINE, /* target machine types of the VM */
    MODULE_INIT_QAPI,
    MODULE_INIT_QOM,     /* device (QOM) types inside the VM */
    MODULE_INIT_MAX      /* number of categories (sentinel) */
} module_init_type;

 

不同类别的注册函数如下:

/* Per-category registration helpers: each expands to a constructor that
 * registers `function` under the corresponding module_init_type. */
#define block_init(function)   module_init(function, MODULE_INIT_BLOCK)
#define machine_init(function) module_init(function, MODULE_INIT_MACHINE)
#define qapi_init(function)    module_init(function, MODULE_INIT_QAPI)
#define type_init(function)    module_init(function, MODULE_INIT_QOM)

/* The GCC `constructor` attribute makes the generated function run before
 * main(), queueing `function` for the later module_call_init(type) call. */
#define module_init(function, type)                                         \
static void __attribute__((constructor)) do_qemu_init_ ## function(void)    \
{                                                                           \
    register_module_init(function, type);                                   \
}

 

 

(2) Cpu类别

(target-i386/cpu.c)

/* QOM type description for the x86 CPU. Registered below via
 * type_register_static(); class_init runs once per class,
 * instance_init once per created object (see the object_new chain). */
static const TypeInfo x86_cpu_type_info = {

   .name = TYPE_X86_CPU,

   .parent = TYPE_CPU,   /* inherits from the generic CPU type */

   .instance_size = sizeof(X86CPU),

    .instance_init= x86_cpu_initfn,   /* per-instance constructor */

   .abstract = false,

   .class_size = sizeof(X86CPUClass),

   .class_init = x86_cpu_common_class_init,   /* one-time class constructor */

};

 

/* MODULE_INIT_QOM hook: registers the x86 CPU type with the QOM type
 * system at startup (wired up by type_init() below). */
static void x86_cpu_register_types(void)

{

   type_register_static(&x86_cpu_type_info); //registered into a hash table

}

 

type_init(x86_cpu_register_types)

 

(target-i386/helper.c)

/* Create an X86CPU instance for the given cpu_model string.
 * (Body abridged in this excerpt.) */
X86CPU *cpu_x86_init(const char *cpu_model)

{

   X86CPU *cpu;

  .............

   cpu = X86_CPU(object_new(TYPE_X86_CPU)); //instantiate a CPU object from the registered type

    ..................

}

 

object的管理代码在(object.c中)

object_new ==> object_new_with_type==> object_initialize_with_type==>

           type_initialize==>type_info->Class_init

           object_initialize_with_type==> type_info-> instance_init

 

Qemu采用了类似面向对象的方式来管理虚拟机中的设备;2.1节将分析这种机制。

 

(3) Machine 类别:

(hw/pc_piix.c)

/* Machine description for the "pc-1.3" machine type (alias "pc");
 * registered below via qemu_register_machine(). */
static QEMUMachine pc_machine_v1_3 = {

   .name = "pc-1.3",

   .alias = "pc",

   .desc = "Standard PC",

   .init = pc_init_pci,   /* invoked from main() via machine->init(&args) */

   .max_cpus = 255,

   .is_default = 1,   /* chosen when no -machine option is given */

   .default_machine_opts = KVM_MACHINE_OPTIONS,

};

 

/* MODULE_INIT_MACHINE hook: registers the PC machine variants.
 * (Only one registration shown; the rest is abridged.) */
static void pc_machine_init(void)

{

   qemu_register_machine(&pc_machine_v1_3);

    .................

}

 

machine_init(pc_machine_init);

machine->init(main.c) ==> pc_init_pci==>pc_init1

 

(4) Object 与 Objectclass

Object用于记录设备对象的信息,而object class则记录设备的类别信息。

object_initialize_with_type  ==> x86_cpu_initfn(Object * object)

 

/* Base object of QEMU's QOM object model (managed in object.c).
 * Object records per-device-instance state; ObjectClass records the
 * per-type (class) information. */
struct Object

{

   /*< private >*/

   ObjectClass *class;//pointer to this object's class (type) information

   QTAILQ_HEAD(, ObjectProperty) properties; //an object may carry multiple properties

   uint32_t ref;//reference count

   Object *parent; //pointer to the parent object, used (per the author) to model inheritance

};

 

a. type_initialize 中将分配class:

ti->class =g_malloc0(ti->class_size);

ti->class->type = ti;

 

b. object_initialize_with_type 中 obj->class = type->class;

 

1.1.3Qemu 调用kvm内核模块流程

(1) KVM初始化

configure_accelerator--tcg ==> kvm_init

    ==>qemu_open("/dev/kvm", O_RDWR); //KVM访问句柄

    ==>kvm_ioctl(s, KVM_GET_API_VERSION, 0);

    ==>kvm_ioctl(s,KVM_CREATE_VM, 0); //创建virtual machine访问句柄

   ==> kvm_arch_init ==>

           kvm_vm_ioctl(s,KVM_SET_IDENTITY_MAP_ADDR, &identity_base

           kvm_vm_ioctl(s,KVM_SET_TSS_ADDR, identity_base + 0x1000)

           kvm_vm_ioctl(s,KVM_SET_NR_MMU_PAGES, shadow_mem)

    ==>kvm_irqchip_create(s) ==> kvm_vm_ioctl(s, KVM_CREATE_IRQCHIP)

      

 

(2) CPU虚拟化访问接口

 

pc_init1 ==> pc_cpus_init(hw/pc.c)==> pc_new_cpu ==>cpu_x86_init==>x86_cpu_realize(hw/helper.c)==>x86_cpu_realize(target-i386/cpus.c)==> qemu_init_vcpu==>qemu_kvm_start_vcpu

qemu_kvm_start_vcpu

 

/* Create and start the per-VCPU KVM thread, then block until the thread
 * reports that the in-kernel VCPU has been created. */
static void qemu_kvm_start_vcpu(CPUArchState *env)
{
    CPUState *cpu = ENV_GET_CPU(env);

    cpu->thread = g_malloc0(sizeof(QemuThread));
    env->halt_cond = g_malloc0(sizeof(QemuCond));
    qemu_cond_init(env->halt_cond);
    /* Spawn the VCPU thread; it runs qemu_kvm_cpu_thread_fn(). */
    qemu_thread_create(cpu->thread, qemu_kvm_cpu_thread_fn, env,
                       QEMU_THREAD_JOINABLE);
    /* Wait until the thread sets env->created and signals qemu_cpu_cond. */
    while (env->created == 0) {
        qemu_cond_wait(&qemu_cpu_cond, &qemu_global_mutex);
    }
}

qemu_kvm_cpu_thread_fn => kvm_init_vcpu(kvm-all.c)

 

/* Per-VCPU thread entry point: creates the in-kernel VCPU, signals the
 * parent (qemu_kvm_start_vcpu), then loops forever running guest code
 * through KVM and servicing I/O events between exits. */
static void *qemu_kvm_cpu_thread_fn(void*arg)

{

   CPUArchState *env = arg;

   CPUState *cpu = ENV_GET_CPU(env);

   int r;

   qemu_mutex_lock(&qemu_global_mutex);

   qemu_thread_get_self(cpu->thread);

   env->thread_id = qemu_get_thread_id();

   cpu_single_env = env;

    /* Create the kernel-side VCPU (KVM_CREATE_VCPU etc., see kvm_init_vcpu). */
    r= kvm_init_vcpu(env);

   if (r < 0) {

       fprintf(stderr, "kvm_init_vcpu failed: %s\n", strerror(-r));

       exit(1);

    }

    qemu_kvm_init_cpu_signals(env);

   /* signal CPU creation */

   env->created = 1;

   qemu_cond_signal(&qemu_cpu_cond);

   while (1) {

//If the VM is in the running state (main ==> vm_start puts the VM into running)

       if (cpu_can_run(env)) {

           /* Enter the guest; returns on VM-Exit (see kvm_cpu_exec). */
           r = kvm_cpu_exec(env);  

           if (r == EXCP_DEBUG) {

                cpu_handle_guest_debug(env);

           }

       }

       qemu_kvm_wait_io_event(env);

    }

   return NULL;

}

 

/* Create the kernel-side VCPU for `env` and map its shared kvm_run
 * structure into this process. (Body abridged in this excerpt.) */
int kvm_init_vcpu(CPUArchState *env)

{

    .......

    ret= kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index); //create the vcpu fd (handle)

   if (ret < 0) {

       DPRINTF("kvm_create_vcpu failed\n");

       goto err;

    }

   env->kvm_fd = ret;

   env->kvm_state = s;

   env->kvm_vcpu_dirty = 1;

    mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE,0); //size of the kernel's vcpu mmap area

    ......

    env->kvm_run = mmap(NULL, mmap_size,PROT_READ | PROT_WRITE, MAP_SHARED,

                        env->kvm_fd, 0); //map the kernel's kvm_run structure into userspace

    ......

   ret = kvm_arch_init_vcpu(env); //query/apply the kernel's CPU-virtualization parameters

    ......

}

 

/* Run the guest VCPU: repeatedly VM-Enter via KVM_RUN and handle each
 * VM-Exit reason until the guest halts or an error occurs.
 * (Body abridged in this excerpt.) */
int kvm_cpu_exec(CPUArchState *env)

{

    ......

   do {

       ......

       kvm_arch_pre_run(env, run);

       if (env->exit_request) {

           qemu_cpu_kick_self();

       }

       qemu_mutex_unlock_iothread();

    //VM-Entry: the ioctl returns when the CPU takes a VM-Exit (or VM-Entry failed)

       run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);

       qemu_mutex_lock_iothread();

       ......//

       //dispatch on the reason for the VM-Exit

       switch (run->exit_reason) {

       case KVM_EXIT_IO: //VM-Exit caused by a port I/O operation

           kvm_handle_io(run->io.port,

                          (uint8_t *)run +run->io.data_offset,

                          run->io.direction,

                          run->io.size,

                          run->io.count);

           ret = 0;

           break;

       case KVM_EXIT_MMIO: //VM-Exit caused by MMIO

           cpu_physical_memory_rw(run->mmio.phys_addr,

                                  run->mmio.data,

                                  run->mmio.len,

                                   run->mmio.is_write);

           ret = 0;

           break;

       case KVM_EXIT_IRQ_WINDOW_OPEN:

           ret = EXCP_INTERRUPT;

           break;

       case KVM_EXIT_SHUTDOWN:

           qemu_system_reset_request();

           ret = EXCP_INTERRUPT;

            break;

       case KVM_EXIT_UNKNOWN:

           fprintf(stderr, "KVM: unknown exit, hardware reason %" PRIx64"\n",

                   (uint64_t)run->hw.hardware_exit_reason);

           ret = -1;

           break;

       case KVM_EXIT_INTERNAL_ERROR: //internal CPU error (exception)

           ret = kvm_handle_internal_error(env,run);

           break;

       default:

           ret = kvm_arch_handle_exit(env,run);

           break;

       }

    }while (ret == 0); //loop back to VM-Entry unless the guest halted or a serious error occurred

   if (ret < 0) {

       cpu_dump_state(env, stderr, fprintf, CPU_DUMP_CODE);

       vm_stop(RUN_STATE_INTERNAL_ERROR);

    }

   env->exit_request = 0;

   return ret;

}

 1.1Qemu 用户态架构_第1张图片

小结Qemu访问KVM的句柄有:

(1) KVM访问句柄 (2)虚拟机访问句柄 (3) VCPU访问句柄

 

(3) 内存虚拟化访问接口

内存管理结构初始化

kvm_init ==> memory_listener_register(&kvm_memory_listener,NULL);

 

/* MemoryListener registered by kvm_init(): forwards memory-topology
 * changes (region add/del, dirty-log control, ioeventfds) to KVM, e.g.
 * region_add eventually issues KVM_SET_USER_MEMORY_REGION. */
static MemoryListener kvm_memory_listener ={

   .begin = kvm_begin,

   .commit = kvm_commit,

   .region_add = kvm_region_add,

   .region_del = kvm_region_del,

   .region_nop = kvm_region_nop,

   .log_start = kvm_log_start,

   .log_stop = kvm_log_stop,

   .log_sync = kvm_log_sync,

   .log_global_start = kvm_log_global_start,

   .log_global_stop = kvm_log_global_stop,

   .eventfd_add = kvm_eventfd_add,

   .eventfd_del = kvm_eventfd_del,

   .priority = 10,

};

 

 

memory_region_add_subregion  ==> listener_add_address_space ==>region_add

 

kvm_region_add==> kvm_set_phys_mem==>kvm_set_user_memory_region ==>

kvm_vm_ioctl(s,KVM_SET_USER_MEMORY_REGION, &mem);

 

虚拟机内存初始化:

pc_init1==>pc_memory_init==>memory_region_add_subregion(memory.c) 添加内存区域到虚拟机的内存管理结构; 第3章将分析内存虚拟化。

 

1.1.4Qemu IO管理

main==>cpu_exec_init_all()

/* One-time init of QEMU's memory and MMIO infrastructure, called from
 * main(). */
void cpu_exec_init_all(void)

{

   memory_map_init();   /* system memory management info, see chapter 3 */

    io_mem_init();       /* MMIO region setup (==> memory_region_init_io) */

}

 

X86 有两种硬件访问方式PIO 与 MMIO, 下面分别讲解

(1) PIO

isa_cirrus_vga采用IO port方式访问

IO port的注册

vga_initfn (cirrus_vga.c) ==> cirrus_init_common ==> register_ioport_read(ioport.c)

 

/* Register `func` as the read handler for the `length` I/O ports starting
 * at `start`, for accesses of width `size`. (Body abridged: `bsize` is
 * presumably derived from `size` in the elided code — verify in ioport.c.) */
int register_ioport_read(pio_addr_t start,int length, int size,

                          IOPortReadFunc *func,void *opaque)

{

    ......

   for(i = start; i < start + length; ++i) {

       ioport_read_table[bsize][i] = func;

       /* A port may only have one opaque; re-registering with a different
        * one is a fatal configuration error. */
       if (ioport_opaque[i] != NULL && ioport_opaque[i] != opaque)

           hw_error("register_ioport_read: invalid opaque for address0x%x",

                     i);

       ioport_opaque[i] = opaque;

    }

   return 0;

}

 

当虚拟机由IO port引起VM-Exit时

kvm_handle_io==> cpu_inl (ioport.c)==> ioport_read

/* Dispatch a port read of size class `index` (0 = byte, 1 = word,
 * 2 = long, per default_func ordering) at `address` to the registered
 * handler, falling back to a default handler when none is registered. */
static uint32_t ioport_read(int index,uint32_t address)

{

   static IOPortReadFunc * const default_func[3] = {

       default_ioport_readb,

       default_ioport_readw,

       default_ioport_readl

   };

   IOPortReadFunc *func = ioport_read_table[index][address];

   if (!func)

        func = default_func[index];

   return func(ioport_opaque[address], address);

}

 

 (2)MMIO

/* Common cirrus VGA initialization: sets up the low-memory MMIO container
 * and its I/O-backed subregion. (Excerpt; body abridged.) */
static void cirrus_init_common(CirrusVGAState *s, int device_id, int is_pci,
                               MemoryRegion *system_memory)
{
    ........
    memory_region_init(&s->low_mem_container,
                       "cirrus-lowmem-container", 0x20000);

    /* MMIO region whose read/write callbacks are cirrus_vga_mem_ops. */
    memory_region_init_io(&s->low_mem, &cirrus_vga_mem_ops, s,
                          "cirrus-low-memory", 0x20000);

    memory_region_add_subregion(&s->low_mem_container, 0, &s->low_mem);
    .......
}

 

定义mmio的read,write

static const MemoryRegionOpscirrus_vga_mem_ops = {

   .read = cirrus_vga_mem_read,

   .write = cirrus_vga_mem_write,

   .endianness = DEVICE_LITTLE_ENDIAN,

   .impl = {

       .min_access_size = 1,

       .max_access_size = 1,

   },

};

 

当虚拟机由MMIO引起VM-Exit时

cpu_physical_memory_rw(exec.c)==>io_mem_read(memory.c)==>

memory_region_dispatch_read==>    access_with_adjusted_size

                

static uint64_tmemory_region_dispatch_read1(MemoryRegion *mr,

                                            target_phys_addr_t addr,

                                            unsigned size)

{

    .......

   access_with_adjusted_size(addr, &data, size,

                             mr->ops->impl.min_access_size,

                             mr->ops->impl.max_access_size,

                             memory_region_read_accessor, mr);

 

   return data;

}

memory_region_read_accessor ==>mr->ops->read

第5.1节将详细介绍io的管理框架

1.1.5Qemu IO thread

IO thread 用来管理虚拟机的IO 读写,如对block设备的访问。5.4节将做详细介绍

/* One iteration of the main (I/O thread) loop: compute the timeout, fill
 * the fd sets from registered I/O handlers, wait for events, dispatch
 * ready handlers, run expired timers, then run bottom halves. */
int main_loop_wait(int nonblocking)

{

   int ret;

   uint32_t timeout = UINT32_MAX;

   if (nonblocking) {

       timeout = 0;

    }else {

       qemu_bh_update_timeout(&timeout);   /* shorten timeout for pending bottom halves */

    }

   /* poll any events */

   /* XXX: separate device handlers from system ones */

   nfds = -1;

   FD_ZERO(&rfds);

   FD_ZERO(&wfds);

   FD_ZERO(&xfds);

#ifdef CONFIG_SLIRP

   slirp_update_timeout(&timeout);

   slirp_select_fill(&nfds, &rfds, &wfds, &xfds);

#endif

   qemu_iohandler_fill(&nfds, &rfds, &wfds, &xfds);   /* add io_handlers fds to the sets */

   ret = os_host_main_loop_wait(timeout);

   qemu_iohandler_poll(&rfds, &wfds, &xfds, ret);   /* dispatch ready handlers */

#ifdef CONFIG_SLIRP

   slirp_select_poll(&rfds, &wfds, &xfds, (ret < 0));

#endif

   qemu_run_all_timers();

   /* Check bottom-halves last in case any of the earlier events triggered

      them.  */

   qemu_bh_poll();

   return ret;

}

Qemu中常用的IO描述符有下面几类:

·        block io:虚拟磁盘相关的io,为了保证高性能,主要使用aio;

·        qemu_notify_event

例子:qemu的时钟模拟利用了linux kernel的signalfd, 定期产生SIGALRM信号 (qemu-timer.c);

·        eventfd:主要用于qemu和kvm之间的notifier, 比如qemu的模拟设备可以通过notifier向kvm发送一个模拟中断,kvm也可以通过notifier向qemu报告guest的各种状态;

 

   address_space_update_topology==>address_space_update_ioeventfds==>address_space_add_del_ioeventfds==>MEMORY_LISTENER_CALL==>eventfd_add(kvm_mem_ioeventfd_add)==>kvm_vm_ioctl(kvm_state,KVM_IOEVENTFD, &iofd);

 

·        socket:用于虚拟机迁移,qmp管理等

该函数同时还负责轮询系统中所有的定时器,并调用定时器的回调函数;

 

IO Handler

用来表示一个IO描述符,其结构定义如下;iohandler.c中定义了一个全局的链表io_handlers,并提供qemu_set_fd_handler()和qemu_set_fd_handler2()函数将一个fd加入到这个链表QLIST_INSERT_HEAD; 在IO thread主循环中qemu_iohandler_fill()函数负责将io_handlers链表中的所有描述符,加入select测试集合。

 

IO thread同步

Qemu IO thread和vcputhread使用一个全局共享线程锁来保证同步,函数qemu_mutex_lock_iothread()和qemu_mutex_unlock_iothread()分别用来获取和释放该锁

 

1.1.6 Qemu的模块

下面的表格是本系列文章将会分析到的代码和其对应的模块:

 

 

模块名与描述

文件

章节

参数管理与main函数

Vl.c

Qemu-config.c

Arch_init.c

Qemu-opt.c

1.1

8.3

Kvm访问接口层

Target-i386\Kvm.c

Kvm-all.c

1.2

2章

设备对象模型

Qdev.c;

qdev-propreties.c

module.c

2.1

Machine与cpu管理

Hw\pc_piix.c

Hw\pc.c

Target-i386\Machine.c

cpu_exec.c

第2章

中断与时间管理

Hw\kvm\(ioapic.c, i8259.c, i8254.c, apic.c, clock.c)

第4章

内存管理

Memory.c

Memory-mapping.c

Exec.c

第3章

硬件辅助虚拟化

Hw\(pci.c, pcie.c, pci-bridge.c, piix_pci.c)

Hw\ide\(core.c, pci.c, piix.c)

5.1

5.2

5.3

半虚拟化

Hw\(virtio.c, virtio-pci.c, virtio-ballon.c)

6章

直接io

Hw\kvm\pci-assign.c)

7章

块设备

Block.c

Blockdev.c

Block\raw-posix.c

5.4

异步io

Aio.c

posix-aio-compact.c

iohandler.c

main-loop.c

5.4

字符设备

Qemu-char.c

8.1

管理模块

Qmp.c; hmp.c

qdev-monitor.c

Monitor.c

Vmsave.c

8.1

8.2

 

你可能感兴趣的:(虚拟化)