Memory sharing is one of the strengths of Xen PV. This post is a short introduction to the principle and code of PV memory sharing: DomU creates a shared page, which is then mapped into Dom0.
- Share a memory page between the two domains, Domain0 and DomainU
- Place a shared ring in that page
- Set up an event channel for the shared ring
- Pass some messages back and forth between Dom0 and DomU

Introduction
Virtual machines in Xen are called domains. Domain0 (Dom0) is special: it owns the device drivers that talk to the real hardware, for example the network card. Such a driver is called a backend driver, and in our example Dom0 is the backend domain.
A user domain, called DomainU (DomU), has a corresponding frontend driver, which is the interface to the virtual device. To communicate with the real device, the frontend driver in DomU connects to the backend driver. In the example below, this DomU is the frontend domain.
Xen provides grant tables for sharing memory between domains, and the device drivers work through them. Each domain has its own grant table, which it shares with Xen. Entries in this table are identified by grant references. A grant reference is passed between domains and names the shared page that the grant-table entry points to. A domain also sets up a shared ring structure on such a page, which is used to exchange data efficiently between domains.
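Stripped of module boilerplate, the frontend side of what follows reduces to a page allocation plus one grant call. The fragment below is only an orientation sketch, not the full module: error handling is omitted, and it assumes a PV Linux guest where `virt_to_mfn()` and the `gnttab_*` helpers are available.

```c
/* Orientation sketch of the frontend-side grant flow; not the full module. */
unsigned long page = __get_free_page(GFP_KERNEL);   /* page to be shared    */
grant_ref_t gref = gnttab_grant_foreign_access(
		0,                 /* domid of the peer, here Dom0           */
		virt_to_mfn(page), /* machine frame number of that page      */
		0);                /* 0 = read-write, non-zero = read-only   */
/* gref is then handed to the backend (real drivers publish it through the
 * XenStore), which maps the frame with GNTTABOP_map_grant_ref. */
```

With that picture in mind, here is the full DomU (frontend) code: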
```c
#include <linux/module.h>
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/bitops.h>
#include <linux/ethtool.h>
#include <linux/in.h>
#include <linux/if_ether.h>
#include <linux/io.h>
#include <linux/moduleparam.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/arp.h>
#include <net/route.h>
#include <asm/uaccess.h>
#include <asm/page.h>
#include <xenpvdrivers/evtchn.h>
#include <xenpvdrivers/xenbus.h>
#include <xenpvdrivers/interface/io/netif.h>
#include <xenpvdrivers/interface/memory.h>
#include <xenpvdrivers/balloon.h>
#include <xenpvdrivers/asm/maddr.h>
#include <xenpvdrivers/grant_table.h>

void *page;

struct as_request {
	unsigned int id;        /* private guest value echoed in resp */
	unsigned int status;
	unsigned int operation;
};

struct as_response {
	unsigned int id;        /* copied from request */
	unsigned int status;
	unsigned int operation; /* copied from request */
};

// The following makes the as_sring, as_front_ring and as_back_ring "types"
DEFINE_RING_TYPES(as, struct as_request, struct as_response);

struct info_t {
	struct as_front_ring ring;
	grant_ref_t gref;
	int irq;
	int port;
} info;

#define DOM0_ID 0

// Related procfs entries
static struct proc_dir_entry *proc_dir = NULL;
static struct proc_dir_entry *proc_file = NULL;
char proc_data[20];

#ifdef SHARED_MEM
/*
 * Send a request via the shared ring to Dom0, followed by an INT
 */
int send_request_to_dom0(void)
{
	struct as_request *ring_req;
	int notify;
	static int reqid = 9;

	/* Write a request into the ring and update the req-prod pointer */
	ring_req = RING_GET_REQUEST(&(info.ring), info.ring.req_prod_pvt);
	ring_req->id = reqid;
	ring_req->operation = reqid;
	ring_req->status = reqid;
	printk("\nxen:DomU: Fill in IDX-%d, with id=%d, op=%d, st=%d",
	       info.ring.req_prod_pvt, ring_req->id, ring_req->operation,
	       ring_req->status);
	reqid++;
	info.ring.req_prod_pvt += 1;

	// Send a request to the backend, followed by an int if needed
	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&(info.ring), notify);
	if (notify) {
		printk("\nxen:DomU: Sent a req to Dom0");
		notify_remote_via_irq(info.irq);
	} else {
		printk("\nxen:DomU: No notify req to Dom0");
		notify_remote_via_irq(info.irq);
	}
	printk("...\n");
	return 0;
}

ssize_t file_write(struct file *filp, const char __user *buff,
		   unsigned long len, void *data)
{
	int value;

	printk("\nxen:domU: file_write %lu bytes", len);
	if (len > sizeof(proc_data) - 1)	/* don't overflow proc_data */
		return -EINVAL;
	/*
	 * copy_from_user() copies data from user space into kernel space;
	 * it returns the number of bytes that could not be copied,
	 * 0 on success.  Here: buff -> proc_data.
	 */
	if (copy_from_user(&proc_data[0], buff, len))
		return -EFAULT;
	proc_data[len] = '\x0';
	//printk(" ,%s", &proc_data[0]);
	value = simple_strtol(proc_data, 0, 10); /* string -> signed long */
	switch (value) {
	case 1:
		send_request_to_dom0();
		printk(" ,value = %d", value);
		break;
	default:
		printk(" ,value not recognized !");
	}
	return len;
}

int file_read(char *page, char **start, off_t off, int count,
	      int *eof, void *data)
{
	/* Write the formatted data into the page buffer */
	sprintf(page, "%s", proc_data);
	return strlen(page);
}

/*
 * We create a /proc/demo/file entry. When we write a "1" into this file once
 * the module is loaded, the file_write() function above is called and this
 * sends a request on the shared ring to Dom0. This way we test the
 * event channel and shared ring routines.
 */
int create_procfs_entry(void)	/* create the virtual directory and file */
{
	int ret = 0;

	proc_dir = proc_mkdir("demo", NULL);
	if (!proc_dir) {
		printk("\nxen:domU Could not create demo entry in procfs");
		ret = -EAGAIN;
		return ret;
	}
	/*
	 * To create a virtual file in /proc, use create_proc_entry(). It
	 * takes a file name, a set of permissions and the location of the
	 * file within /proc, and returns a proc_dir_entry pointer (or NULL,
	 * meaning the create failed).
	 */
	proc_file = create_proc_entry("file", 0600, proc_dir);
	if (proc_file) {
		proc_file->read_proc = file_read;
		proc_file->write_proc = file_write;
#if PROC_OWNER
		proc_file->owner = THIS_MODULE;
#endif
	} else {
		printk("\nxen:domU Could not create /proc/demo/file");
		ret = -EAGAIN;
		return ret;
	}
	return ret;
}

/*
 * Our interrupt handler for the event channel that we set up
 */
static irqreturn_t as_int(int irq, void *dev_id)
{
	struct as_response *ring_resp;
	RING_IDX i, rp;

	printk("\nxen:DomU: as_int called");
again:
	rp = info.ring.sring->rsp_prod;
	printk("\nxen:DomU: ring pointers %d to %d", info.ring.rsp_cons, rp);
	for (i = info.ring.rsp_cons; i != rp; i++) {
		unsigned long id;

		// what did we get from Dom0
		ring_resp = RING_GET_RESPONSE(&(info.ring), i);
		printk("\nxen:DomU: Recvd in IDX-%d, with id=%d, op=%d, st=%d",
		       i, ring_resp->id, ring_resp->operation,
		       ring_resp->status);
		id = ring_resp->id;
		switch (ring_resp->operation) {
		case 0:
			printk("\nxen:DomU: operation:0");
			break;
		default:
			break;
		}
	}
	info.ring.rsp_cons = i;
	if (i != info.ring.req_prod_pvt) {
		int more_to_do;
		RING_FINAL_CHECK_FOR_RESPONSES(&info.ring, more_to_do);
		if (more_to_do)
			goto again;
	} else
		info.ring.sring->rsp_event = i + 1;
	return IRQ_HANDLED;
}
#endif

int init_module(void)
{
	int mfn;
#ifdef ENABLE_EVENT_IRQ
	int err;
#endif
	struct as_sring *sring;

	/*
	 * __get_free_pages() allocates several physically contiguous pages
	 * and returns a pointer to the first byte of the area; it does not
	 * zero the area. GFP_KERNEL - process may sleep.
	 *
	 * Kernel-space memory is allocated mainly with kmalloc(),
	 * __get_free_pages() and vmalloc(). kmalloc() and __get_free_pages()
	 * return memory from the directly mapped region (< 896M), which is
	 * physically contiguous and sits at a fixed offset from its physical
	 * address, so the virtual/physical conversion is simple. vmalloc()
	 * returns a range that is contiguous only in virtual address space;
	 * the underlying physical pages need not be contiguous, and there is
	 * no simple virtual-to-physical conversion for them.
	 */
	page = (void *)__get_free_pages(GFP_KERNEL, 1);
	if (page == NULL) {
		printk("\nxen:DomU: could not get free page");
		return -ENOMEM;
	}

#if ENABLE_SHARED_RING
	/* Put a shared ring structure on this page */
	sring = (struct as_sring *)page;
	SHARED_RING_INIT(sring);
	/*
	 * The frontend allocates a memory page for the shared communication
	 * ring, grants the backend domain access to it and puts the grant
	 * reference into the XenStore so that the backend can map the page.
	 * With a shared ring, this page is the master page through which
	 * further grant references are passed.
	 */
	/* info.ring is the front_ring structure */
	FRONT_RING_INIT(&(info.ring), sring, PAGE_SIZE);
#endif

	/* Machine frame number of the page we are about to share */
	mfn = virt_to_mfn(page);

	/*
	 * The following grant table func is in drivers/xen/grant-table.c.
	 * For shared pages, used for synchronous data, advertise a page to
	 * be shared via the hypervisor function call
	 * gnttab_grant_foreign_access(). This call notifies the hypervisor
	 * that other domains are allowed to access this page.
	 *
	 * gnttab_map() has been called earlier to set up the grant table
	 * during the init phase, with a call to
	 * HYPERVISOR_grant_table_op(GNTTABOP_setup_table...), and the
	 * "shared" pages have been malloc'ed. This "shared" page is then
	 * used below during the actual grant of a ref by this domain.
	 *
	 * gnttab_grant_foreign_access()
	 *   => get_free_entries()
	 *        gnttab_free_head  - points to the ref of the head
	 *        gnttab_free_count - keeps the number of free refs
	 *        Get a ref id by calling gnttab_entry(head),
	 *        i.e. gnttab_list[entry/RPP][entry%RPP]
	 *   => gnttab_grant_foreign_access_ref()
	 *   => update_grant_entry()
	 *        shared[ref].frame/domid/flags are updated; "shared" is a
	 *        pointer to struct grant_entry (flags/domid/frame)
	 */
	info.gref = gnttab_grant_foreign_access(DOM0_ID, mfn, 0);
	if (info.gref < 0) {
		printk("\nxen: could not grant foreign access");
		free_page((unsigned long)page);
		return -EFAULT;
	}

	/*
	 * The following strcpy was used initially to test that the memory
	 * page is indeed shared with Dom0: in Dom0 we print the same memory
	 * location and should get the same characters.
	 */
	strcpy((char *)page, "aseem sethi");

	/*
	 * TBD: Save gref to be sent via XenStore to Dom0. As of now both the
	 * gref and the event channel port id are passed manually during
	 * insmod of the Dom0 module.
	 */
	printk("\n gref = %d", info.gref);

	/* Setup an event channel to Dom0 */
#ifdef ENABLE_EVENT_IRQ
	err = bind_listening_port_to_irqhandler(DOM0_ID, as_int, 0,
						"xen-eg", &info);
	if (err < 0) {
		printk("\nxen:DomU failed to setup evtchn !");
		gnttab_end_foreign_access(info.gref, 0, (unsigned long)page);
		return -EFAULT;
	}
	info.irq = err;
	info.port = irq_to_evtchn_port(info.irq);
	printk(" interrupt = %d, local-port = %d", info.irq, info.port);
	printk("....\n...");
	create_procfs_entry();
#endif
	return 0;
}

void cleanup_module(void)
{
	printk("\nCleanup grant ref:");
	if (gnttab_query_foreign_access(info.gref) == 0) {
		// Remove the grant to the page
		printk("\n xen: No one has mapped this frame");
		// If the 3rd param is non-zero, the page is freed for us
		gnttab_end_foreign_access(info.gref, 0, (unsigned long)page);
		// free_pages(page, 1);
	} else {
		printk("\n xen: Someone has mapped this frame");
		// We still free the page, since we are being rmmod-ed
		gnttab_end_foreign_access(info.gref, 0, (unsigned long)page);
	}

	/* Cleanup proc entries */
	remove_proc_entry("file", proc_dir);
	remove_proc_entry("demo", NULL);
	printk("....\n...");
}

MODULE_LICENSE("GPL");
```
The comments in the code should already help with understanding the functions and structures involved; in outline:
int send_request_to_dom0(void) writes a request into the ring, which then reaches the backend through the shared ring.
file_write() copies the buffer contents from user space into the kernel-space proc_data buffer; file_read() writes proc_data back out through the page buffer.
create_procfs_entry() creates the virtual directory and file under /proc.
init_module() allocates kernel pages for page, puts the shared ring on them and initializes the front ring, and writes the string "aseem sethi" into the page so that the shared mapping can later be verified from Dom0. It also sets up the event channel. The gref and the event channel port have to be fed to the Dom0 module manually (see the usage sketch after this walkthrough).
cleanup_module() checks whether any domain has mapped this page, ends the foreign access, and removes the entries under /proc.
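To see the two modules talk to each other, a session goes roughly as follows. The module file names here are hypothetical, and the actual gref/port values must be read from the DomU's dmesg output after the first step:

1. In DomU: `insmod domU_share.ko`, then check `dmesg` for the lines printing `gref` and `local-port`.
2. In Dom0: `insmod dom0_map.ko gref=<gref> port=<port>`, using the values from step 1; dmesg should now show the bytes "aseem sethi" read from the shared page.
3. In DomU: `echo 1 > /proc/demo/file`; file_write() calls send_request_to_dom0(), Dom0's as_int() answers on the ring, and DomU's as_int() prints the response.
4. Unload in reverse order (`rmmod` the Dom0 module first), so the frame is unmapped before DomU revokes the grant.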
Now the Dom0 code:
```c
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#if 0
#include <xen/interface/grant_table.h>
#include <xen/interface/io/blkif.h>	// for the definition of blkif_sring_t
#include <xen/gnttab.h>
#include <linux/vmalloc.h>
#include <asm-x86/xen/hypervisor.h>
#include <xen/evtchn.h>
#else
#include <linux/vmalloc.h>
#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/events.h>
#include <xen/page.h>
#include <xen/grant_table.h>
#include <xen/interface/io/netif.h>
#include <xen/interface/memory.h>
#include <xen/interface/grant_table.h>
#endif

/* Maps the page identified by (domid, grant ref) into our address space */
struct gnttab_map_grant_ref ops;
/* Undoes that mapping */
struct gnttab_unmap_grant_ref unmap_ops;

struct as_request {
	unsigned int id;        /* private guest value, echoed in resp */
	unsigned int status;
	unsigned int operation;
};

struct as_response {
	unsigned int id;        /* copied from request */
	unsigned int status;
	unsigned int operation; /* copied from request */
};

typedef struct as_request as_request_t;
typedef struct as_response as_response_t;

// From include/xen/interface/io/ring.h:
// DEFINE_RING_TYPES(__name, __req_t, __rsp_t) makes the as_sring,
// as_front_ring and as_back_ring "types"
DEFINE_RING_TYPES(as, struct as_request, struct as_response);

struct info_t {
	int irq;
	int gref;
	int remoteDomain;
	int evtchn;
	struct as_back_ring ring;
} info;

/*
 * gref and port have to be passed in manually; they are printed by the DomU
 * module's init_module(). module_param() is how a kernel module takes
 * parameters at insmod time.
 */
int gref;
int port;
module_param(gref, int, 0644);
module_param(port, int, 0644);

/*
 * About interrupt handler return values: a handler returns the special type
 * irqreturn_t, but there are only two meaningful values, IRQ_NONE and
 * IRQ_HANDLED:
 *
 *   #ifndef _LINUX_IRQRETURN_H
 *   #define _LINUX_IRQRETURN_H
 *   typedef int irqreturn_t;
 *   #define IRQ_NONE      (0)
 *   #define IRQ_HANDLED   (1)
 *   #define IRQ_RETVAL(x) ((x) != 0)   // just returns 0 or non-0
 *   #endif
 */
#if ENABLE_SRING
/* I/O ring handling: consume requests, produce responses */
static irqreturn_t as_int(int irq, void *dev_id)
{
	RING_IDX rc, rp;	/* typedef unsigned int RING_IDX; */
	as_request_t req;
	as_response_t resp;
	int more_to_do, notify;

	// dev_id is a pointer to the info structure
	printk("\nxen:Dom0: as_int called with dev_id %x info=%x",
	       (unsigned int)dev_id, (unsigned int)&info);
	rc = info.ring.req_cons;
	rp = info.ring.sring->req_prod;
	printk(" rc =%d rp =%d", rc, rp);

	while (rc != rp) {
		/*
		 * #define RING_REQUEST_CONS_OVERFLOW(_r, _cons) \
		 *     (((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))
		 */
		if (RING_REQUEST_CONS_OVERFLOW(&info.ring, rc))
			break;
		// what did we get from the frontend at index rc
		memcpy(&req, RING_GET_REQUEST(&info.ring, rc), sizeof(req));
		resp.id = req.id;
		resp.operation = req.operation;
		resp.status = req.status + 1; // send back status+1 of what was recvd
		printk("\nxen:Dom0: Recvd at IDX-%d: id=%d, op=%d, status=%d",
		       rc, req.id, req.operation, req.status);
		// update the req-consumer
		info.ring.req_cons = ++rc;
		barrier();	/* compiler barrier: keep these accesses ordered */
		switch (req.operation) {
		case 0:
			printk("\nxen:Dom0: req.operation = 0");
			break;
		default:
			printk("\nxen:Dom0: req.operation = %d",
			       req.operation);
			break;
		}
		memcpy(RING_GET_RESPONSE(&info.ring, info.ring.rsp_prod_pvt),
		       &resp, sizeof(resp));
		info.ring.rsp_prod_pvt++;
		RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&info.ring, notify);
		if (info.ring.rsp_prod_pvt == info.ring.req_cons) {
			RING_FINAL_CHECK_FOR_REQUESTS(&info.ring, more_to_do);
		} else if (RING_HAS_UNCONSUMED_REQUESTS(&info.ring)) {
			// there are still unconsumed requests
			more_to_do = 1;
		}
		if (notify) {
			printk("\nxen:Dom0: Send notify to DomU");
			notify_remote_via_irq(info.irq);
		}
	}
	return IRQ_HANDLED;
}
#endif

int init_module(void)
{
	struct vm_struct *v_start;
#if ENABLE_SRING
	as_sring_t *sring;
#endif
	int err;

	info.gref = gref;
	info.remoteDomain = 1;
	info.evtchn = port;
	printk("\nxen: dom0: init_module with gref = %d", info.gref);

	/*
	 * alloc_vm_area() reserves a range of kernel address space and
	 * allocates pagetables to map that range. No actual mappings are
	 * created.
	 *
	 * struct vm_struct {
	 *     struct vm_struct *next;   // next area, speeds up lookups
	 *     void *addr;               // start address
	 *     unsigned long size;
	 *     unsigned long flags;
	 *     struct page **pages;
	 *     unsigned int nr_pages;
	 *     unsigned long phys_addr;  // physical address
	 * };
	 * struct vm_struct *alloc_vm_area(size_t size); // reserve the area
	 * void free_vm_area(struct vm_struct *area);    // release it
	 */
	v_start = alloc_vm_area(PAGE_SIZE);
	if (v_start == NULL) {
		printk("\nxen: dom0: could not allocate page");
		return -EFAULT;
	}

	/*
	 * ops struct in parameters:
	 *     host_addr, flags, ref
	 * ops struct out parameters:
	 *     status (zero if OK), handle (used to unmap later),
	 *     dev_bus_addr
	 */
	gnttab_set_map_op(&ops, (unsigned long)v_start->addr, GNTMAP_host_map,
			  info.gref, info.remoteDomain); /* flags, ref, domID */
	/*
	 * GNTTABOP_map_grant_ref is the operation code for mapping the
	 * granted frame into our own address space;
	 * HYPERVISOR_grant_table_op() is the hypercall that carries it out.
	 */
	if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &ops, 1)) {
		printk("\nxen: dom0: HYPERVISOR map grant ref failed");
		return -EFAULT;
	}
	if (ops.status) {
		printk("\nxen: dom0: HYPERVISOR map grant ref failed status = %d",
		       ops.status);
		return -EFAULT;
	}
	printk("\nxen: dom0: shared_page = %x, handle = %x, status = %x",
	       (unsigned int)v_start->addr, ops.handle, ops.status);

	// Used for unmapping
	unmap_ops.host_addr = (unsigned long)(v_start->addr);
	unmap_ops.handle = ops.handle;

#define ENABLE_PRINT_PAGE 1
#if ENABLE_PRINT_PAGE
	/* Verify the string "aseem sethi" that DomU wrote into the page */
	{
		int i;
		printk("\nBytes in page ");
		for (i = 0; i <= 10; i++) {
			printk("%c", ((char *)(v_start->addr))[i]);
		}
	}
#endif

#if ENABLE_SRING
	sring = (as_sring_t *)v_start->addr;
	BACK_RING_INIT(&info.ring, sring, PAGE_SIZE);

	/* Setup an event channel to the frontend */
	err = bind_interdomain_evtchn_to_irqhandler(info.remoteDomain,
						    info.evtchn, as_int, 0,
						    "dom0-backend", &info);
	if (err < 0) {
		printk("\nxen: dom0: init_module failed binding to evtchn !");
		err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
						&unmap_ops, 1);
		return -EFAULT;
	}
	info.irq = err;
	printk("\nxen: dom0: end init_module: int = %d", info.irq);
#endif
	printk("\nXEN: dom: end init_module\n");
	return 0;
}

void cleanup_module(void)
{
	int ret;

	printk("\nxen: dom0: cleanup_module");
	/*
	 * Unmap the foreign frame. unmap_ops.handle identifies the mapping
	 * that was set up in init_module(); unmap_ops.host_addr points to
	 * where the page was mapped.
	 */
	ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
					&unmap_ops, 1);
	if (ret == 0) {
		printk(" cleanup_module: unmapped shared frame");
	} else {
		printk(" cleanup_module: unmapped shared frame failed");
	}
	printk("...\n");
}

MODULE_LICENSE("GPL");
```
A rough walk through the Dom0 flow:
irqreturn_t as_int() checks whether the ring request index has overflowed, fetches each request, and produces a response for it; afterwards it checks whether any requests are still pending in the ring. This assumes some familiarity with how these shared rings work; a simplified view of the generated ring types follows.
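For orientation, DEFINE_RING_TYPES(as, struct as_request, struct as_response) expands to roughly the following (simplified from xen/interface/io/ring.h; the real macro adds padding and a few helper fields):

```c
/* Simplified view of what DEFINE_RING_TYPES(as, ...) generates. */
union as_sring_entry {
	struct as_request  req;
	struct as_response rsp;
};

struct as_sring {                 /* lives in the shared page             */
	RING_IDX req_prod, req_event; /* producer index/event for requests */
	RING_IDX rsp_prod, rsp_event; /* producer index/event for responses */
	union as_sring_entry ring[1]; /* the actual slots (power-of-2 count) */
};

struct as_front_ring {            /* frontend-private bookkeeping         */
	RING_IDX req_prod_pvt;        /* requests produced, not yet pushed */
	RING_IDX rsp_cons;            /* responses consumed so far         */
	unsigned int nr_ents;
	struct as_sring *sring;
};

struct as_back_ring {             /* backend-private bookkeeping          */
	RING_IDX rsp_prod_pvt;        /* responses produced, not yet pushed */
	RING_IDX req_cons;            /* requests consumed so far          */
	unsigned int nr_ents;
	struct as_sring *sring;
};
```

Only the sring lives in the shared page; each side keeps its private indices in its own front/back ring structure and publishes them through the RING_PUSH_* macros.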
Regarding the barrier() call, here is a link I recommend:
http://www.cnblogs.com/whyandinside/archive/2012/11/07/2759014.html
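In short, barrier() is a pure compiler barrier; for GCC the kernel defines it essentially as:

```c
/* From the kernel's compiler-gcc.h: an empty asm with a "memory" clobber.
 * It forbids the compiler from reordering memory accesses across it, but
 * emits no CPU fence instruction. */
#define barrier() __asm__ __volatile__("" : : : "memory")
```

That is enough at the point where as_int() uses it only because the real memory fences (mb()/wmb()) are supplied by the RING_PUSH_* macros themselves.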
init_module() reserves a kernel virtual address range, maps the granted page into Dom0's own address space, and verifies the string found at that address.
Finally, the mapping is torn down.
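A small design note on that unmapping: the Dom0 module fills unmap_ops.host_addr and unmap_ops.handle by hand, but there is also a helper that does the same. Assuming the tree in use exposes it (mainline's xen/grant_table.h does), the unmap setup could equally be written as:

```c
/* Equivalent unmap setup via the gnttab helper; the handle comes from the
 * earlier GNTTABOP_map_grant_ref call. */
gnttab_set_unmap_op(&unmap_ops, (unsigned long)v_start->addr,
		    GNTMAP_host_map, ops.handle);
/* ...and later, as in cleanup_module(): */
if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap_ops, 1))
	printk("\nxen: dom0: unmap failed");
```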