[虚拟化/云] kvm的架构分析

预备知识

1. 客户机物理页框到宿主机虚拟地址转换

http://blog.csdn.net/zhuriyuxiao/article/details/8968781

http://www.tuicool.com/articles/NjY3uu

2. KVM API

简单的API例子

http://smilejay.com/2013/03/use-kvm-api/

hejie 同学的《使用KVM API实现Emulator Demo》
http://soulxu.github.io/blog/2014/08/11/use-kvm-api-write-emulator/

3. wenyi 同学的 《KVM 内存虚拟化及其实现》

http://www.ibm.com/developerworks/cn/linux/l-cn-kvm-mem/

4. KVM 官方文档

$ git clone http://git.kernel.org/pub/scm/virt/kvm/kvm.git

$ vim Documentation/virtual/kvm/api.txt

实战

该实例由Mark Wu同学提供。

KVM API General Description

• The kvm API is centered around file descriptors.
• An initial open("/dev/kvm") obtains a handle to the kvm subsystem; this handle can be used to issue system ioctls.
• A KVM_CREATE_VM ioctl on this handle will create a VM file descriptor which can be used to issue VM ioctls.
• A KVM_CREATE_VCPU ioctl on a VM fd will create a virtual cpu and return a file descriptor pointing to it.
• Finally, ioctls on a vcpu fd can be used to control the vcpu, including the important task of actually running guest code.

• KVM related file descriptors in qemu.

 

 1 (gdb) p kvm_state->fd

 2 $1 = 3

 3 (gdb) p kvm_state->vmfd

 4 $2 = 4

 5 (gdb) info threads

 6 4 Thread 0x7f86a60f0700 (LWP 13455) 0x00007f86ad0803dc in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0

 7 3 Thread 0x7f86a56ef700 (LWP 13456) 0x00007f86ad0803dc in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0

 8 2 Thread 0x7f86a6af1700 (LWP 13960) 0x00007f86ad08075b in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0

 9 * 1 Thread 0x7f86ae478940 (LWP 13453) 0x00007f86a97772f3 in select () from /lib64/libc.so.6

10 (gdb) t 3

11 [Switching to thread 3 (Thread 0x7f86a56ef700 (LWP 13456))]#0 0x00007f86ad0803dc in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0

12 (gdb) bt

13 #0 0x00007f86ad0803dc in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0

14 #1 0x00007f86ae60a2e9 in qemu_cond_wait (cond=<value optimized out>, mutex=<value optimized out>) at qemu-thread-posix.c:113

15 #2 0x00007f86ae67772f in qemu_kvm_wait_io_event (arg=0x7f86b10a0930) at /home/mark/Work/qemu/qemu/cpus.c:710

16 #3 qemu_kvm_cpu_thread_fn (arg=0x7f86b10a0930) at /home/mark/Work/qemu/qemu/cpus.c:745

17 #4 0x00007f86ad07c7f1 in start_thread () from /lib64/libpthread.so.0

18 #5 0x00007f86a977e70d in clone () from /lib64/libc.so.6

19 (gdb) p ((CPUX86State *)0x7f86b10a0930)->kvm_fd

20 $3 = 12

21 (gdb) t 4

22 [Switching to thread 4 (Thread 0x7f86a60f0700 (LWP 13455))]#0 0x00007f86ad0803dc in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0

23 (gdb) bt

24 #0 0x00007f86ad0803dc in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0

25 #1 0x00007f86ae60a2e9 in qemu_cond_wait (cond=<value optimized out>, mutex=<value optimized out>) at qemu-thread-posix.c:113

26 #2 0x00007f86ae67772f in qemu_kvm_wait_io_event (arg=0x7f86b1088a00) at /home/mark/Work/qemu/qemu/cpus.c:710

27 #3 qemu_kvm_cpu_thread_fn (arg=0x7f86b1088a00) at /home/mark/Work/qemu/qemu/cpus.c:745

28 #4 0x00007f86ad07c7f1 in start_thread () from /lib64/libpthread.so.0

29 #5 0x00007f86a977e70d in clone () from /lib64/libc.so.6

30 (gdb) p ((CPUX86State *)0x7f86b1088a00)->kvm_fd

31 $4 = 11

• Dump KVM related files via crash 

 1 crash> files 15011

 2 PID: 15011 TASK: ffff880053ea0100 CPU: 0 COMMAND: "qemu-system-x86"

 3 ROOT: / CWD: /home/mark/Work/qemu/qemu

 4 FD FILE             DENTRY           INODE            TYPE PATH

 5  0 ffff880050b8c8c0 ffff88000ad77a80 ffff880134d13318 CHR /dev/pts/4

 6  1 ffff880050b8c8c0 ffff88000ad77a80 ffff880134d13318 CHR /dev/pts/4

 7  2 ffff880050b8c8c0 ffff88000ad77a80 ffff880134d13318 CHR /dev/pts/4

 8  3 ffff88008491fa80 ffff880134c9b0c0 ffff88013b372a78 CHR /dev/kvm

 9  4 ffff88012eb52140 ffff8800ae376e40 ffff88013b71e2d8 REG anon_inode:/kvm-vm

10  5 ffff8801357e7180 ffff8800ae3760c0 ffff88013b71e2d8 REG anon_inode:/[signalfd]

11  6 ffff880014255a80 ffff8800ae376180 ffff88013b71e2d8 REG anon_inode:/[eventfd]

12  7 ffff880014255a80 ffff8800ae376180 ffff88013b71e2d8 REG anon_inode:/[eventfd]

13  8 ffff880136751bc0 ffff880089da2c80 ffff88003f6490c0 REG /home/mark/Work/qemu/images/fedora.img

14  9 ffff8800a3c4d480 ffff8800ae376300 ffff880134cb1358 FIFO

15  10 ffff88008adc6980 ffff8800ae376300 ffff880134cb1358 FIFO

16  11 ffff88008ae865c0 ffff88012256f440 ffff88013b71e2d8 REG anon_inode:/kvm-vcpu

17  12 ffff88007bb11ec0 ffff88012256f2c0 ffff88013b71e2d8 REG anon_inode:/kvm-vcpu

18 crash> p ((struct file *)0xffff88008491fa80)->f_op

19 $5 = (const struct file_operations *) 0xffffffffa04f0e40

20 crash> sym 0xffffffffa04f0e40

21 ffffffffa04f0e40 (d) kvm_chardev_ops [kvm]

22 crash> px *((struct file*)0xffff88007bb11ec0)->f_op

23 $7 = {

24     owner = 0xffffffffa05249a0,

25     llseek = 0,

26     read = 0,

27     write = 0,

28     :

29     ioctl = 0,

30     unlocked_ioctl = 0xffffffffa04bae00,

31     compat_ioctl = 0xffffffffa04bae00,

32     mmap = 0xffffffffa04b9220,

33     open = 0,

34     flush = 0,

35     release = 0xffffffffa04bd830,

36     fsync = 0,

37     aio_fsync = 0,

38     :

39     setlease = 0

40 }

41 crash> sym 0xffffffffa04bae00

42 ffffffffa04bae00 (t) kvm_vcpu_ioctl [kvm]

43 crash> sym 0xffffffffa04b9220

44 ffffffffa04b9220 (t) kvm_vcpu_mmap [kvm]

45 crash> px ((struct file *)0xffff88012eb52140)->private_data

46 $15 = (void *) 0xffff880137c6c000

47 crash> px vm_list

48 vm_list = $16 = {

49     next = 0xffff880137c6c280,

50     prev = 0xffff880137c6c280

51 }

52 crash> sym vm_list

53 ffffffffa04f0aa0 (D) vm_list [kvm]

54 crash> px ((struct kvm*)0xffff880137c6c000)->vm_list

55 $17 = {

56     next = 0xffffffffa04f0aa0,

57     prev = 0xffffffffa04f0aa0

58 }

 

CPU Virtualization

vCPU initialization

• qemu-kvm backtrace of vcpu initialization

 1 (gdb) bt

 2 #0 qemu_init_vcpu (_env=0x7ffff8b18a00) at /home/mark/Work/qemu/qemu/cpus.c:936

 3 #1 0x00007ffff7e9f869 in cpu_x86_init (cpu_model=0x7ffff7f8fca9 "qemu64") at /home/mark/Work/qemu/qemu/target-i386/helper.c:1263

 4 #2 0x00007ffff7ee1de0 in pc_new_cpu (cpu_model=0x7ffff7f8fca9 "qemu64") at /home/mark/Work/qemu/qemu/hw/pc.c:936

 5 #3 pc_cpus_init (cpu_model=0x7ffff7f8fca9 "qemu64") at /home/mark/Work/qemu/qemu/hw/pc.c:963

 6 #4 0x00007ffff7ee297c in pc_init1 (system_memory=0x7ffff8b113f0, system_io=0x7ffff8b114f0, ram_size=536870912, boot_device=0x7fffffffdf10 "cad",

 7 kernel_filename=0x0, kernel_cmdline=0x7ffff7f668eb "", initrd_filename=0x0, cpu_model=0x0, pci_enabled=1, kvmclock_enabled=1)

 8 at /home/mark/Work/qemu/qemu/hw/pc_piix.c:103

 9 #5 0x00007ffff7ee30d8 in pc_init_pci (ram_size=536870912, boot_device=0x7fffffffdf10 "cad", kernel_filename=0x0, kernel_cmdline=0x7ffff7f668eb "",

10 initrd_filename=0x0, cpu_model=<value optimized out>) at /home/mark/Work/qemu/qemu/hw/pc_piix.c:245

11 #6 0x00007ffff7de57a9 in main (argc=<value optimized out>, argv=<value optimized out>, envp=<value optimized out>) at /home/mark/Work/qemu/qemu/vl.c:3351
 1 qemu_init_vcpu

 2     qemu_kvm_start_vcpu

 3         qemu_thread_create(env->thread, qemu_kvm_cpu_thread_fn, env);  /* One qemu thread per vCPU */

 4             qemu_kvm_cpu_thread_fn

 5                 kvm_init_vcpu

 6         +-->kvm_cpu_exec---+

 7         |                  |

 8         -------------------+

 9 

10 int kvm_init_vcpu(CPUState *env)

11 {

12     KVMState *s = kvm_state;

13     long mmap_size;

14     int ret;

15     DPRINTF("kvm_init_vcpu\n");

16     ret = kvm_vm_ioctl(s, KVM_CREATE_VCPU, env->cpu_index);

17     if (ret < 0) {

18         DPRINTF("kvm_create_vcpu failed\n");

19         goto err;

20     }

21     env->kvm_fd = ret;

22     env->kvm_state = s;

23     env->kvm_vcpu_dirty = 1;

24     mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);

25     if (mmap_size < 0) {

26         ret = mmap_size;

27         DPRINTF("KVM_GET_VCPU_MMAP_SIZE failed\n");

28         goto err;

29     }

30     env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,

31                         env->kvm_fd, 0);

32     :

33 }

 

Guest execution

• qemu function kvm_cpu_exec

 

 1 int kvm_cpu_exec(CPUState *env)

 2 {

 3     struct kvm_run *run = env->kvm_run;

 4     int ret, run_ret;

 5     DPRINTF("kvm_cpu_exec()\n");

 6     if (kvm_arch_process_async_events(env)) {

 7         env->exit_request = 0;

 8         return EXCP_HLT;

 9     }

10     cpu_single_env = env;

11     do {

12         if (env->kvm_vcpu_dirty) {

13             kvm_arch_put_registers(env, KVM_PUT_RUNTIME_STATE);

14             env->kvm_vcpu_dirty = 0;

15         }

16         kvm_arch_pre_run(env, run);

17         if (env->exit_request) {

18             DPRINTF("interrupt exit requested\n");

19             /*

20             * KVM requires us to reenter the kernel after IO exits to complete

21             * instruction emulation. This self-signal will ensure that we

22             * leave ASAP again.

23             */

24             qemu_cpu_kick_self();

25         }

26         cpu_single_env = NULL;

27         qemu_mutex_unlock_iothread();

28         run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);

29         qemu_mutex_lock_iothread();

30         cpu_single_env = env;

31         kvm_arch_post_run(env, run);

32         kvm_flush_coalesced_mmio_buffer();

33         if (run_ret < 0) {

34             if (run_ret == -EINTR || run_ret == -EAGAIN) {

35                 DPRINTF("io window exit\n");

36                 ret = EXCP_INTERRUPT;

37                 break;

38             }

39             DPRINTF("kvm run failed %s\n", strerror(-run_ret));

40             abort();

41         }

42         switch (run->exit_reason) {

43         case KVM_EXIT_IO:

44             DPRINTF("handle_io\n");

45             kvm_handle_io(run->io.port,

46             (uint8_t *)run + run->io.data_offset,

47             run->io.direction,

48             run->io.size,

49             run->io.count);

50             ret = 0;

51             break;

52         case KVM_EXIT_MMIO:

53             DPRINTF("handle_mmio\n");

54             cpu_physical_memory_rw(run->mmio.phys_addr,

55             run->mmio.data,

56             run->mmio.len,

57             run->mmio.is_write);

58             ret = 0;

59             break;

60             :

61         }

62     } while (ret == 0);

63     :

64     return ret;

65 }

• kernel code path

 1 sys_ioctl

 2     do_vfs_ioctl

 3         vfs_ioctl

 4             kvm_vcpu_ioctl /* kvm_vcpu_fops.unlocked_ioctl */

 5                 kvm_arch_vcpu_ioctl_run

 6                     __vcpu_run

 7                         vcpu_enter_guest

 8                             vmx_vcpu_run /* kvm_x86_ops->run */

 9                                        |

10                                        v vm entry

11                               +-----------------+

12                              |  guest code |

13                              |  on this cpu |

14                              +-------------------+

15                                        | vm exit

16                                        v

17                             vmx_handle_exit /* kvm_x86_ops->handle_exit */

18                                 return kvm_vmx_exit_handlers[exit_reason](vcpu)            

• kernel exit handlers

 1 /*

 2  * The exit handlers return 1 if the exit was handled fully and guest execution

 3  * may resume. Otherwise they set the kvm_run parameter to indicate what needs

 4  * to be done to userspace and return 0.

 5 */

 6 static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {

 7         [EXIT_REASON_EXCEPTION_NMI] = handle_exception,

 8         [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,

 9         [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,

10         [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,

11         [EXIT_REASON_IO_INSTRUCTION] = handle_io,

12         :

13         :

• guest runtime information shared between the kvm kernel module and qemu-kvm

 

 1 env->kvm_run = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,

 2                         env->kvm_fd, 0);

 3 (gdb) p ((struct CPUX86State*)0x7fcdbe63f930)->kvm_run

 4 $2 = (struct kvm_run *) 0x7fcdbcfa2000

 5 (gdb) p *((struct CPUX86State*)0x7fcdbe63f930)->kvm_run

 6 $3 = {request_interrupt_window = 0 '\000', padding1 = "\000\000\000\000\000\000", exit_reason = 10, ready_for_interrupt_injection = 0 '\000', if_flag =

 7 0 '\000', padding2 = "\000", cr8 = 0, apic_base = 4276094976, {hw = {hardware_exit_reason = 4276093104}, fail_entry = {hardware_entry_failure_reason =

 8 4276093104}, ex = {exception = 4276093104, error_code = 0}, io = {direction = 176 '\260', size = 0 '\000', port = 65248, count = 0, data_offset =

 9 513418191540584448}, debug = {arch = {exception = 4276093104, pad = 0, pc = 513418191540584448, dr6 = 4294967300, dr7 = 0}}, mmio = {phys_addr =

10 4276093104, data = "\000\000\000\000 \a \a", len = 4, is_write = 1 '\001'}, hypercall = {nr = 4276093104, args = {513418191540584448, 4294967300, 0, 0,

11 0, 0}, ret = 0, longmode = 0, pad = 0}, tpr_access = {rip = 4276093104, is_write = 0, pad = 119539488}, s390_sieic = {icptcode = 176 '\260', ipa =

12 65248, ipb = 0}, s390_reset_flags = 4276093104, dcr = {dcrn = 4276093104, data = 0, is_write = 0 '\000'}, internal = {suberror = 4276093104, ndata = 0,

13 data = {513418191540584448, 4294967300, 0 <repeats 14 times>}}, osi = {gprs = {4276093104, 513418191540584448, 4294967300, 0 <repeats 29 times>}},

14 papr_hcall = {nr = 4276093104, ret = 513418191540584448, args = {4294967300, 0, 0, 0, 0, 0, 0, 0, 0}}, padding =

15 "\260\000\340\376\000\000\000\000\000\000\000\000 \a \a\004\000\000\000\001", '\000' <repeats 234 times>}}
 1 crash> vtop 7fcdbcfa2000

 2 VIRTUAL        PHYSICAL

 3 7fcdbcfa2000 12eb3c000

 4 PML:  137dfd7f8    =>  136ff7067

 5 PUD:   136ff79b0   =>   134069067

 6 PMD:  134069f38  =>   13671c067

 7 PTE:   13671cd10  =>  800000012eb3c067

 8 PAGE:  12eb3c000  

 9 

10 PTE                                PHYSICAL        FLAGS

11 800000012eb3c067       12eb3c000       

12 (PRESENT|RW|USER|ACCESSED|DIRTY|NX)

13 

14 VMA                         START                END                   FLAGS     FILE

15 ffff8800aac39b70     7fcdbcfa2000     7fcdbcfa5000    fb     anon_inode:/kvm-vcpu

16 PAGE                        PHYSICAL    MAPPING    INDEX         CNT     FLAGS

17 ffffea0004237520     12eb3c000     0             ffff8800b72c9980    2     40000000000014

18 crash> px ((struct kvm*)0xffff880137c6c000)->vcpus[1]->run

19 $23 = (struct kvm_run *) 0xffff88012eb3c000

20 crash> vtop 0xffff88012eb3c000

21 VIRTUAL                   PHYSICAL

22 ffff88012eb3c000     12eb3c000

23 PML4 DIRECTORY: ffffffff81a85000

24 PAGE DIRECTORY: 1a86063

25     PUD: 1a86020 => a067

26     PMD: aba8 => 800000012ea001e3

27     PAGE: 12ea00000 (2MB)

28 

29 

30 PTE                                PHYSICAL        FLAGS

31 800000012ea001e3       12ea00000        (PRESENT|RW|ACCESSED|DIRTY|PSE|GLOBAL|NX)

32 

33 PAGE                        PHYSICAL    MAPPING    INDEX             CNT     FLAGS

34 ffffea0004237520     12eb3c000            0 ffff8800b72c9980     2     40000000000014

35 crash> px ((struct file*)0xffff88007bb11ec0)->private_data

36 $30 = (void *) 0xffff88013860c2b8

37 crash> px ((struct kvm_vcpu *)0xffff88013860c2b8)->run

38 $31 = (struct kvm_run *) 0xffff88012eb3c000

 

Physical Memory Virtualization

Physical memory initialization

• Qemu backtrace

 1 (gdb) bt

 2 #0 kvm_set_user_memory_region (s=0x7ffff8b100a0, slot=0x7ffff8b100a0) at /home/mark/Work/qemu/qemu/kvm-all.c:168

 3 #1 0x00007ffff7ea3fae in kvm_set_phys_mem (client=<value optimized out>, start_addr=<value optimized out>, size=<value optimized out>,

 4 phys_offset=<value optimized out>, log_dirty=false) at /home/mark/Work/qemu/qemu/kvm-all.c:650

 5 #2 kvm_client_set_memory (client=<value optimized out>, start_addr=<value optimized out>, size=<value optimized out>, phys_offset=<value optimized out>,

 6 log_dirty=false) at /home/mark/Work/qemu/qemu/kvm-all.c:663

 7 #3 0x00007ffff7e8405a in cpu_notify_set_memory (start_addr=0, size=134217728, phys_offset=0, region_offset=0, log_dirty=false)

 8 at /home/mark/Work/qemu/qemu/exec.c:1742

 9 #4 cpu_register_physical_memory_log (start_addr=0, size=134217728, phys_offset=0, region_offset=0, log_dirty=false)

10 at /home/mark/Work/qemu/qemu/exec.c:2675

11 #5 0x00007ffff7eaac70 in address_space_update_topology_pass (as=0x7ffff82f31e0, old_view=..., new_view=..., adding=true)

12 at /home/mark/Work/qemu/qemu/memory.c:731

13 #6 0x00007ffff7eacf31 in address_space_update_topology (as=0x7ffff82f31e0) at /home/mark/Work/qemu/qemu/memory.c:746

14 #7 0x00007ffff7ead514 in memory_region_update_topology () at /home/mark/Work/qemu/qemu/memory.c:760

15 #8 0x00007ffff7ee1787 in pc_memory_init (system_memory=0x7ffff8b11430, kernel_filename=<value optimized out>, kernel_cmdline=0x7ffff7f668eb "",

16 initrd_filename=0x0, below_4g_mem_size=134217728, above_4g_mem_size=0, rom_memory=0x7ffff8b32240, ram_memory=0x7fffffffe188)

17 at /home/mark/Work/qemu/qemu/hw/pc.c:996

18 #9 0x00007ffff7ee2d96 in pc_init1 (system_memory=0x7ffff8b11430, system_io=0x7ffff8b11530, ram_size=134217728, boot_device=0x7fffffffe500 "cad",

19 kernel_filename=0x0, kernel_cmdline=0x7ffff7f668eb "", initrd_filename=0x0, cpu_model=0x0, pci_enabled=1, kvmclock_enabled=1)

20 at /home/mark/Work/qemu/qemu/hw/pc_piix.c:128

21 #10 0x00007ffff7ee30d8 in pc_init_pci (ram_size=134217728, boot_device=0x7fffffffe500 "cad", kernel_filename=0x0, kernel_cmdline=0x7ffff7f668eb "",

22 initrd_filename=0x0, cpu_model=<value optimized out>) at /home/mark/Work/qemu/qemu/hw/pc_piix.c:245

23 #11 0x00007ffff7de57a9 in main (argc=<value optimized out>, argv=<value optimized out>, envp=<value optimized out>) at /home/mark/Work/qemu/qemu/vl.c:3351

24 --------------------------------------------

25 kvm_set_user_memory_region

26     kvm_vm_ioctl

27         ioctl(kvm_context->vm_fd, KVM_SET_USER_MEMORY_REGION, ...)

Guest physical memory mapping

• dump gpa <-> hva <-> hpa mapping via crash

  1 crash> px vm_list

  2 vm_list = $7 = {

  3     next = 0xffff880080cb4280,

  4     prev = 0xffff880080cb4280

  5 }

  6 crash> struct kvm.vm_list

  7 struct kvm {

  8     [640] struct list_head vm_list;

  9 }

 10 crash> px 0xffff880080cb4280-640

 11 $8 = 0xffff880080cb4000

 12 crash> pd ((struct kvm *)0xffff880080cb4000)->memslots

 13 $9 = (struct kvm_memslots *) 0xffff880139326000

 14 crash> px *((struct kvm *)0xffff880080cb4000)->memslots

 15 $6 = {

 16     nmemslots = 0x23,

 17     memslots = {{

 18         base_gfn = 0x0,

 19         npages = 0xa0,

 20         flags = 0x0,

 21         rmap = 0xffffc90016aac000,

 22         dirty_bitmap = 0x0,

 23         lpage_info = {0xffffc900175d6000, 0xffffc900175d9000},

 24         userspace_addr = 0x7f30dbe00000,

 25         user_alloc = 0x1,

 26         id = 0x0

 27     }, {

 28         base_gfn = 0xfffe0,

 29         npages = 0x20,

 30         flags = 0x0,

 31         rmap = 0xffffc90016a82000,

 32         dirty_bitmap = 0x0,

 33         lpage_info = {0xffffc90016a85000, 0xffffc90016a88000},

 34         userspace_addr = 0x7f310b1f0000,

 35         user_alloc = 0x1,

 36         id = 0x1

 37     }, {

 38         base_gfn = 0xc0,

 39         npages = 0xc,

 40         flags = 0x0,

 41         rmap = 0xffffc9001787f000,

 42         dirty_bitmap = 0x0,

 43         lpage_info = {0xffffc90017882000, 0xffffc90017885000},

 44         userspace_addr = 0x7f30dbec0000,

 45         user_alloc = 0x1,

 46         id = 0x2

 47     }, {

 48         base_gfn = 0xfc000,

 49         npages = 0x800,

 50         flags = 0x1,

 51         rmap = 0xffffc90017b39000,

 52         dirty_bitmap = 0xffffc90017b45000,

 53         lpage_info = {0xffffc90017b3f000, 0xffffc90017b42000},

 54         userspace_addr = 0x7f3101c00000,

 55         user_alloc = 0x1,

 56         id = 0x3

 57     }, {

 58         base_gfn = 0xcc,

 59         npages = 0x24,

 60         flags = 0x0,

 61         rmap = 0xffffc90017990000,

 62         dirty_bitmap = 0x0,

 63         lpage_info = {0xffffc90017993000, 0xffffc90017996000},

 64         userspace_addr = 0x7f30dbecc000,

 65         user_alloc = 0x1,

 66         id = 0x4

 67     }, {

 68         base_gfn = 0xf0,

 69         npages = 0x10,

 70         flags = 0x0,

 71         rmap = 0xffffc90017999000,

 72         dirty_bitmap = 0x0,

 73         lpage_info = {0xffffc9001799c000, 0xffffc9001799f000},

 74         userspace_addr = 0x7f30dbef0000,

 75         user_alloc = 0x1,

 76         id = 0x5

 77     }, {

 78         base_gfn = 0x100,

 79         npages = 0x1ff00,

 80         flags = 0x0,

 81         rmap = 0xffffc900179a2000,

 82         dirty_bitmap = 0x0,

 83         lpage_info = {0xffffc90017aa4000, 0xffffc90017aa7000},

 84         userspace_addr = 0x7f30dbf00000,

 85         user_alloc = 0x1,

 86         id = 0x6

 87     }, {

 88         base_gfn = 0x0,

 89         npages = 0x0,

 90         flags = 0x0,

 91         rmap = 0x0,

 92         dirty_bitmap = 0x0,

 93         lpage_info = {0x0, 0x0},

 94         userspace_addr = 0x0,

 95         user_alloc = 0x0,

 96         id = 0x0

 97     },

 98 

 99 On Guest:

100 [root@localhost ~]# ./hello

101 [0x400638]: Hello, world

102 crash> ps | grep hello

103 2203 2112 0 ffff88001d68ae60 IN 0.1 4124 356 hello

104 crash> set 2203

105 PID: 2203

106 COMMAND: "hello"

107 TASK: ffff88001d68ae60 [THREAD_INFO: ffff88001da6c000]

108 CPU: 0

109 STATE: TASK_INTERRUPTIBLE

110 crash> rd 0x400638 2

111           400638: 77202c6f6c6c6548 255b000a646c726f Hello, world..[%

112 crash> vtop 0x400638

113 VIRTUAL PHYSICAL

114 400638 30b2638

115 

116     PML: 1d669000 => 1dbfa067

117     PUD: 1dbfa000 => 1c82d067

118     PMD: 1c82d010 => 1ab49067

119     PTE: 1ab49000 => 30b2025

120     PAGE: 30b2000

121 

122 On Host:

123 crash> px 0x7f30dbf00000+0x30b2638-0x100000

124 $7 = 0x7f30deeb2638

125 crash> rd 0x7f30deeb2638 2

126 7f30deeb2638: 77202c6f6c6c6548 255b000a646c726f Hello, world..[%

MMU Virtualization

Extended Page Table

• Overview

1 Guest CR3             EPT Base Pointer

2     |                                |

3     +-->+-------------------+        +---->+---------------------+

4  GVA--->| Guest Page Table  |---> GPA ---> | Extended Page Table | ---> HPA

5         +-------------------+              +---------------------+

• EPT walkthrough

 1 crash> px ((struct kvm_vcpu *)0xffff88007768c078)->arch.mmu

 2 $18 = {

 3     new_cr3 = 0xffffffffa04dca40 <nonpaging_new_cr3>,

 4     page_fault = 0xffffffffa04e4410 <tdp_page_fault>,

 5     free = 0xffffffffa04e0870 <nonpaging_free>,

 6     gva_to_gpa = 0xffffffffa04e4b70 <paging64_gva_to_gpa>,

 7     prefetch_page = 0xffffffffa04dc7a0 <nonpaging_prefetch_page>,

 8     sync_page = 0xffffffffa04dc7d0 <nonpaging_sync_page>,

 9     invlpg = 0xffffffffa04dc7e0 <nonpaging_invlpg>,

10     root_hpa = 0x138457000,

11     root_level = 0x4,

12     shadow_root_level = 0x4,

13     base_role = {

14     word = 0x0,

15     {

16         glevels = 0x0,

17         level = 0x0,

18         quadrant = 0x0,

19         pad_for_nice_hex_output = 0x0,

20         direct = 0x0,

21         access = 0x0,

22         invalid = 0x0,

23         cr4_pge = 0x0,

24         nxe = 0x0,

25         cr0_wp = 0x0,

26         smep_andnot_wp = 0x0

27     }

28     },

29     pae_root = 0xffff88000d2c2000,

30     rsvd_bits_mask = {{0xfff0000000000, 0xfff0000000000, 0xfff0000000180, 0xfff0000000180}, {0x0, 0xfff00001fe000, 0xfff003fffe000, 0xfff0000000180}}

31 }

32 

33 crash> px (0x30b2638>>39)&0x1ff

34 $19 = 0x0

35 crash> rd -p 0x138457000

36        138457000: 0000000043138007        ...C....

37 crash> px (0x30b2638>>30)&0x1ff

38 $20 = 0x0

39 crash> rd -p 0x43138000

40        43138000: 0000000108c3c007        ........

41 crash> px (0x30b2638>>21)&0x1ff

42 $21 = 0x18

43 crash> px (0x108c3c007 & ~0xfff)+ (8*0x18)

44 $22 = 0x108c3c0c0

45 crash> rd -p 0x108c3c0c0

46        108c3c0c0: 0000000125713007        .0q%....

47 crash> px (0x30b2638>>12)&0x1ff

48 $23 = 0xb2

49 crash> px (0x125713007 & ~0xfff) + (8*0xb2)

50 $24 = 0x125713590

51 crash> rd -p 0x125713590

52        125713590: 000000011289a277        w.......

53 crash> vtop 7f30deeb2638

54 VIRTUAL        PHYSICAL

55 7f30deeb2638   11289a638

56     PML: 575e07f0 => 43236067

57     PUD: 43236618 => 7ef3c067

58     PMD: 7ef3c7b8 => 139495067

59     PTE: 139495590 => 800000011289a067

60     PAGE: 11289a000

61 PTE                PHYSICAL    FLAGS

62 800000011289a067   11289a000   (PRESENT|RW|USER|ACCESSED|DIRTY|NX)

63 VMA              START         END         FLAGS FILE

64 ffff88005d100788 7f30dbe00000 7f30fbe00000 80100073

65 PAGE             PHYSICAL   MAPPING         INDEX    CNT FLAGS

66 ffffea0003c0e1b0 11289a000 ffff88000d307f61 7f30deeb2 1 4000000010006c

 

Shadow Page Table

• Overview

 1 Guest CR3

 2    |

 3    +-->+-------------------+

 4 GVA--->| Guest Page Table | ---> GPA

 5        +-------------------+

 6 

 7 Host CR3

 8     |

 9     +-->+-------------------+

10 GVA--->| Shadow Page Table | ---> HPA

11        +-------------------+

12 ~                                

• Shadow page table walkthrough (with option ept=no for kernel module kvm_intel)

 1 crash> px ((struct kvm_vcpu *)0xffff88007768c078)->arch.mmu

 2     mmu = {

 3         new_cr3 = 0xffffffffa0914890 <paging_new_cr3>,

 4         page_fault = 0xffffffffa091a1b0 <paging64_page_fault>,

 5         free = 0xffffffffa0914880 <paging_free>,

 6         gva_to_gpa = 0xffffffffa0918b70 <paging64_gva_to_gpa>,

 7         prefetch_page = 0xffffffffa0915920 <paging64_prefetch_page>,

 8         sync_page = 0xffffffffa09177e0 <paging64_sync_page>,

 9         invlpg = 0xffffffffa0913b20 <paging64_invlpg>,

10         root_hpa = 0x8886d000,

11         root_level = 0x4,

12         shadow_root_level = 0x4,

13         base_role = {

14             word = 0xe00004,

15             {

16                 glevels = 0x4,

17                 level = 0x0,

18                 quadrant = 0x0,

19                 pad_for_nice_hex_output = 0x0,

20                 direct = 0x0,

21                 access = 0x0,

22                 invalid = 0x0,

23                 cr4_pge = 0x1,

24                 nxe = 0x1,

25                 cr0_wp = 0x1,

26                 smep_andnot_wp = 0x0

27             }

28         },

29         pae_root = 0xffff88008893e000,

30         rsvd_bits_mask = {{0xfff0000000000, 0xfff0000000000, 0xfff0000000180, 0xfff0000000180}, {0x0, 0xfff00001fe000, 0xfff003fffe000, 0xfff0000000180}}

31         },

32 crash> px (0x400608 >> 39) & 0x1ff

33 $17 = 0x0

34 crash> rd -p 0x8886d000

35         8886d000: 0000000081517027         'pQ.....

36 crash> px (0x400608 >> 30) & 0x1ff

37 $18 = 0x0

38 crash> px (0x81517027 & ~0xfff)

39 $19 = 0x81517000

40 crash> rd -p 0x81517000

41         81517000: 000000008159f027         '.Y.....

42 crash> px (0x400608 >> 21) & 0x1ff

43 $20 = 0x2

44 crash> px (0x8159f027 & ~0xfff)+(8*0x2)

45 $21 = 0x8159f010

46 crash> rd -p 0x8159f010

47         8159f010: 0000000069fd7027         'p.i....

48 crash> px (0x400608 >> 12) & 0x1ff

49 $22 = 0x0

50 crash> rd -p 0x69fd7000

51         69fd7000: 0000000055b99265         e..U....

52 crash> rd -p 55b99608 2

53         55b99608: 77202c6f6c6c6548 255b000a646c726f         Hello, world..[%

 

你可能感兴趣的:(kvm)