xen 前端后端

内存共享是XEN PV的一个优势,今天就简单介绍一下PV的内存共享(DomU创建一个页面共享,然后映射到Dom0中)的原理及其代码。

本文演示以下内容:

- 两个 Domain 之间的内存页共享 (Domain0 和 DomainU)
- 在这个页中设置一个共享 ring
- 为共享 ring 设置 event channel
- 在 Dom0 和 DomU 之间来回传递一些信息

在xen中的虚拟机被称为Domain. Domain0(Dom0)是特别的并拥有与真实设备交互的设备驱动, 例如网卡. 这个驱动被称为后端驱动. 在我们的例子中这被称为后端domain.

在被称为 DomainU(DomU)的用户Domain有一个相应的前端驱动, 其是虚拟设备的接口,为和真实设备通信在DomU中前端驱动要连接后端驱动. 在我们以下的例子中, 这个DomU被称为前端Domain.

Xen为共享Domain间的内存提供了授权表(Grant Tables). 设备驱动使用授权表工作. 每个Domain有它自己的授权表, 并与xen共享. 在这个表中的条目由授权引用(grant references)所标识. 授权引用在Domain间传递, 且所引用的共享页由授权表所指向, domain也设置一个共享环结构(ring structure), 其用于在domain间有效共享数据.
对于分离前/后端驱动, 前端分配一个用于共享通信 ring 的内存页, 授权它给后端domain, 并放授权引用到xenstore, 这样后端就能 map 这个页. 持有共享ring的这个页是一个主页, 用于传递更多的授权引用. 共享页由块设备和其它同步接收数据的设备所使用; 异步接收数据的网络设备使用已知的 page flipping 方法, 页的所有权在Domain间转移. 上图展示了这个共享ring, 以及这个ring所有的公有和私有指针. "Request Producer" 和 "Response Producer" 是两个公共变量, 共享这个页面的双方都能看到. "Response Consumer" 是前端(由前端设备所维护, 即DomU) ring 结构中的指针. "Request Consumer" 是后端(由后端设备维护, 即Dom0) ring 结构中的指针.

API 使用
在DomU kernel中的前端驱动广告一个页面用于共享, 这通过 hypervisor 函数调用("hypercall", 即 gnttab_grant_foreign_access 调用)完成. 这个 hypercall 通知 hypervisor 其它 domain 允许访问这个页. DomU然后把引用 ID 传递给被"授权"访问的远端 Domain. 在我们的代码中, 这个访问授权给了 Dom0. 一旦远端 domain 完成操作, 本地 domain 应调用 gnttab_end_foreign_access 删除授权.
网络设备和类似异步接收数据的其它设备使用已知的 page flipping 方法. 当页翻转时, 本地 domain kernel中的驱动会广告一个用于转移的页, 经由 gnttab_grant_foreign_transfer 调用完成. 这个调用会通知 hypervisor 其它 domain 可以接收这个页. 本地 domain 随后释放(free)已转移给远端 domain 的这个页 (经由 producer/consumer ring).

先上代码:

#include <linux/module.h>
#include <linux/version.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/bitops.h>
#include <linux/ethtool.h>
#include <linux/in.h>
#include <linux/if_ether.h>
#include <linux/io.h>
#include <linux/moduleparam.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/arp.h>
#include <net/route.h>
#include <asm/uaccess.h>
#include <asm/page.h>
#include <xenpvdrivers/evtchn.h>
#include <xenpvdrivers/xenbus.h>
#include <xenpvdrivers/interface/io/netif.h>
#include <xenpvdrivers/interface/memory.h>
#include <xenpvdrivers/balloon.h>
#include <xenpvdrivers/asm/maddr.h>

#include <xenpvdrivers/grant_table.h>

/* Virtual address of the single page shared with Dom0 (holds the ring). */
void *page;
/* Request format carried DomU -> Dom0 on the shared ring. */
struct as_request {
unsigned int id; /* private guest value echoed in resp */
unsigned int status;
unsigned int operation;
};

/* Response format sent back Dom0 -> DomU; fields echo the request. */
struct as_response {
unsigned int id; /* copied from request */
unsigned int status;
unsigned int operation; /* copied from request */
};

// Generates struct as_sring plus the as_front_ring / as_back_ring types
// (see xen's io/ring.h DEFINE_RING_TYPES).
DEFINE_RING_TYPES(as, struct as_request, struct as_response);

/* Frontend module state: ring view, grant ref and event-channel binding. */
struct info_t {
struct as_front_ring ring; /* frontend (DomU) view of the shared ring */
grant_ref_t gref; /* grant reference handed to Dom0 */
int irq; /* Linux irq bound to the event channel */
int port; /* local event-channel port number */
} info;

/* Domain id of the backend we grant access to (Dom0 is always id 0). */
#define DOM0_ID 0

// Related the proc fs entries
static struct proc_dir_entry *proc_dir = NULL; /* /proc/demo */
static struct proc_dir_entry *proc_file = NULL; /* /proc/demo/file */
char proc_data[20]; /* last string written to /proc/demo/file */

#ifdef SHARED_MEM
/*
* Send an request via the shared ring to Dom0, following by an INT
*/

/*
 * Place one request on the shared ring and kick Dom0 over the event
 * channel. The request fields are all filled with a monotonically
 * increasing test id so the backend's echo is easy to eyeball in dmesg.
 * Returns 0.
 */
int send_request_to_dom0(void)
{
	static int reqid = 9; /* rolling test id, survives across calls */
	struct as_request *req;
	int notify;

	/* Claim the next free slot at the private producer index. */
	req = RING_GET_REQUEST(&(info.ring), info.ring.req_prod_pvt);
	req->id = reqid;
	req->operation = reqid;
	req->status = reqid;
	printk("\nxen:DomU: Fill in IDX-%d, with id=%d, op=%d, st=%d",
	       info.ring.req_prod_pvt, req->id, req->operation, req->status);
	reqid++;
	info.ring.req_prod_pvt += 1;

	/* Publish the request; 'notify' says whether the backend needs a kick. */
	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&(info.ring), notify);

	if (notify)
		printk("\nxen:DomU: Sent a req to Dom0");
	else
		printk("\nxen:DomU: No notify req to Dom0");
	/* Test code kicks Dom0 unconditionally, regardless of 'notify'. */
	notify_remote_via_irq(info.irq);
	printk("...\n");
	return 0;
}

/*
 * write_proc handler for /proc/demo/file.
 *
 * Copies the user buffer into proc_data, parses it as a decimal integer,
 * and for the value 1 sends a test request to Dom0 over the shared ring.
 * Returns the number of bytes consumed, or -EFAULT on copy failure.
 *
 * Fix: the original copied 'len' bytes unchecked and then wrote
 * proc_data[len], overflowing the 20-byte proc_data buffer for any write
 * of 20 bytes or more. The length is now clamped to leave room for the
 * NUL terminator.
 */
ssize_t file_write (struct file *filp, const char __user *buff,
	unsigned long len, void *data)
{
	int value;

	printk("\nxen:domU: file_write %lu bytes", len);

	/* Clamp to buffer capacity minus one byte for the terminator. */
	if (len > sizeof(proc_data) - 1)
		len = sizeof(proc_data) - 1;

	/* copy_from_user returns the number of bytes NOT copied (0 == success). */
	if (copy_from_user(&proc_data[0], buff, len))
		return -EFAULT;
	proc_data[len] = '\x0';

	/* Parse the written string as a signed decimal command code. */
	value = simple_strtol(proc_data, 0, 10);

	switch (value) {
	case 1:
		send_request_to_dom0();
		printk(" ,value = %d", value);
		break;
	default:
		printk(" ,value not recognized !");
	}
	return len;
}

/*
 * read_proc handler for /proc/demo/file: hand back the last string that
 * was written into proc_data. Returns the number of bytes produced.
 */
int file_read (char* page, char**start, off_t off,
	int count, int *eof, void *data)
{
	/* sprintf's return value is the formatted length (== strlen(page)). */
	int n = sprintf(page, "%s", proc_data);
	return n;
}

/*
 * We create a /proc/demo/file entry. When we write a "1" into this file once
 * the module is loaded, the file_write() function above is called and this
 * sends a request on the shared ring to the Dom0. This way we test the
 * event channel and shared ring routines.
 */
/*
 * Create /proc/demo and /proc/demo/file and hook up the read/write
 * handlers. Returns 0 on success, -EAGAIN when either entry could not
 * be created. Uses the legacy create_proc_entry() API.
 */
int create_procfs_entry(void)
{
	proc_dir = proc_mkdir("demo", NULL);
	if (proc_dir == NULL) {
		printk("\nxen:domU Could not create demo entry in procfs");
		return -EAGAIN;
	}

	/* create_proc_entry returns NULL on failure. */
	proc_file = create_proc_entry("file", 0600, proc_dir);
	if (proc_file == NULL) {
		printk("\nxen:domU Could not create /proc/demo/file");
		return -EAGAIN;
	}

	proc_file->read_proc = file_read;
	proc_file->write_proc = file_write;
#if PROC_OWNER
	proc_file->owner = THIS_MODULE; /* only on kernels with an owner field */
#endif
	return 0;
}

/*
* Our interrupt handler for event channel that we set up
*/

/*
 * Frontend interrupt handler for the event channel: drain all responses
 * that Dom0 has produced on the shared ring, logging each one.
 * NOTE(review): rsp_prod is read without an explicit read barrier here;
 * the RING_* macros on re-check handle the race -- confirm for this tree.
 */
static irqreturn_t as_int (int irq, void *dev_id)
{
struct as_response *ring_resp;
RING_IDX i, rp;

printk("\nxen:DomU: as_int called");
again:
/* Snapshot the backend's response-producer index. */
rp = info.ring.sring->rsp_prod;
printk("\nxen:DomU: ring pointers %d to %d", info.ring.rsp_cons, rp);
/* Consume every response between our consumer index and rp. */
for(i=info.ring.rsp_cons; i != rp; i++) {
unsigned long id;
// what did we get from Dom0
ring_resp = RING_GET_RESPONSE(&(info.ring), i);
printk("\nxen:DomU: Recvd in IDX-%d, with id=%d, op=%d, st=%d",
i, ring_resp->id, ring_resp->operation, ring_resp->status);
id = ring_resp->id;
switch(ring_resp->operation) {
case 0:
printk("\nxen:DomU: operation:0");
break;
default:
break;
}
}

/* Publish the new consumer index. */
info.ring.rsp_cons = i;
if (i != info.ring.req_prod_pvt) {
int more_to_do;
/* Re-check for responses that raced in; loop again if any arrived. */
RING_FINAL_CHECK_FOR_RESPONSES(&info.ring, more_to_do);
if(more_to_do)
goto again;
} else
/* All outstanding requests answered; ask to be notified past i. */
info.ring.sring->rsp_event = i+1;
return IRQ_HANDLED;
}
#endif

int init_module(void)
{
int mfn;
#ifdef ENABLE_EVENT_IRQ
int err;
#endif
struct as_sring *sring;

/*
* Allocates and returns a pointer to the first byte of a memory area
* that is several physically contiguous pages long, and doesn't zero
* out the area.
* GFP_KERNEL - process may sleep
*/


/*
在linux内核空间申请内存涉及的函数主要包括kmalloc()、__get_free_pages()和vmalloc()等。
kmalloc()和__get_free_pages()申请的内存位于物理内存映射区域(《896M,所以容易操作,
可以得到虚拟地址与物理地址),而且在物理上也是连续的,它们与真实的物理地址只有一个
固定的偏移,因此存在简单的转换关系。而vmalloc()在虚拟内存空间给出一块连续的内存空间
(>896,虚拟地址上连续),实质上,这片连续的虚拟内存在物理内存中并不一定连续,
而vmalloc()申请的虚拟内存和物理内存之间也没有简单的换算关系。*/
page = __get_free_pages(GFP_KERNEL, 1);
if (page == 0) {
printk("\nxen:DomU: could not get free page");
return 0;
}

#if ENABLE_SHARED_RING
/* Put a shared ring structure on this page */

sring = (struct as_sring*) page;

SHARED_RING_INIT(sring);
/*前端分配一个用于共享通信 ring 的内存页, 授权它给后端domain, 并放授权引用到xenstore,
这样后端就能 map 这个页. 有共享ring这个页是一个主页, 用于传递更多的授权引用*/

/* info.ring is the front_ring structure */

FRONT_RING_INIT(&(info.ring), sring, PAGE_SIZE);
#endif



mfn = virt_to_mfn(page);//?????****

/*
* The following grant table func is in drivers/xen/grant-table.c
* For shared pages, used for synchronous data, advertise a page to
* be shared via the hypervisor fu[nction call gnttab_grant_foreign_access.
* This call notifies the hypervisor that other domains are allowed to
* access this page.
*
* gnttab_map() has been called earlier to setup gnttable_setup_table
* during init phase, with a call to HYPERVISOR_grant_table_op(
* GNTTAB_setup_table...) and
* "shared" pages have been malloc'ed. This "shared" page is then used
* below later during the actual grant of a ref by this DOM.
*
* gnttab_grant_foreign_access()
* => get_free_entries
* gnttab_free_head - points to the ref of the head
* gnttab_free_count- keeps number of free refs
*
* Get a ref id by calling gnttab_entry(head)
* gnttab_list[entry/RPP][entry%RPP]
* => gnttab_grat_foreign_access_ref
* =>update_grant_entry
* shared[ref].frame/domid/flags are updated
* "shared" above is a pointer to struct grant_entry (flags/domid/frame)
*/

info.gref = gnttab_grant_foreign_access(DOM0_ID, mfn, 0);

if (info.gref < 0) {

printk("\nxen: could not grant foreign access");

free_page((unsigned long)page);

return 0;

}

/*
* The following strcpy is commented out, but was used initally to test
* is the memory page is indeed shared with Dom0, when in Dom0, we do a
* sprintf of the same memory location and get the same characters.
*/

strcpy((char*)page, "aseem sethi");
/*
* TBD: Save gref to be sent via Xenstore to dom-0. As of now both the
* gref and the event channel port id is sent manually during insmod
* in the dom0 module.
*/

printk("\n gref = %d", info.gref);

/* Setup an event channel to Dom0 */
#ifdef ENABLE_EVENT_IRQ
err = bind_listening_port_to_irqhandler(DOM0_ID, as_int, 0,
"xen-eg", &info);
if (err < 0) {
printk("\nxen:DomU failed to setup evtchn !");
gnttab_end_foreign_access(info.gref, 0, page);
return 0;
}

info.irq = err;
info.port = irq_to_evtchn_port(info.irq);
printk(" interupt = %d, local-port = %d", info.irq, info.port);
printk("....\n...");
create_procfs_entry();
#endif
return 0;
}

/*
 * Module exit: report whether Dom0 still has the frame mapped, revoke the
 * grant either way (we are being rmmod-ed), then tear down the procfs
 * entries created at init time.
 */
void cleanup_module(void)
{
	printk("\nCleanup grant ref:");
	if (gnttab_query_foreign_access(info.gref) == 0)
		printk("\n xen: No one has mapped this frame");
	else
		printk("\n xen: Someone has mapped this frame");

	/* A non-NULL third argument would also free the page. */
	gnttab_end_foreign_access(info.gref, 0, page);

	/* Cleanup proc entry */
	remove_proc_entry("file", proc_dir);
	remove_proc_entry("demo", NULL);
	printk("....\n...");
}



MODULE_LICENSE("GPL");

里面的备注可以多多少少增进一下大家对函数以及结构体的理解,现在我们大体说一下:

int send_request_to_dom0(void) 将请求写入RING之后再通过RING到达backend

file_write() 和 file_read() 将buff内容从用户空间写入内核空间proc_data, 再写入page中

create_procfs_entry()创建虚拟文件及文件夹

init_module() 为page分配内核空间,并设置共享RING及初始化前端RING,并验证是否可以被共享映射,为了验证在page中写入字符串"aseem sethi"。建立事件通道。gref和事件通道port需要在Dom0代码中手动加入。

cleanup_module() 检查是否有dom映射过此页面, 撤销映射, 并且删除在/proc下的文件。

Dom0代码:


#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>


#if 0
#include <xen/interface/grant_table.h>
#include <xen/interface/io/blkif.h>  // for definition of blkif_sring_t
#include <xen/gnttab.h>
#include <linux/vmalloc.h>
#include <asm-x86/xen/hypervisor.h>
#include <xen/evtchn.h>
#else
#include <linux/vmalloc.h>
#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/events.h>
#include <xen/page.h>
#include <xen/grant_table.h>


#include <xen/interface/io/netif.h>
#include <xen/interface/memory.h>
#include <xen/interface/grant_table.h>
#endif


struct gnttab_map_grant_ref   ops;       /* maps the (domid, gref) page into our address space */
struct gnttab_unmap_grant_ref unmap_ops; /* used later to undo that mapping */


/* Request format on the shared ring; must match the DomU module's layout. */
struct as_request {
    unsigned int  id;           /* private guest value, echoed in resp  */
    unsigned int  status; 
    unsigned int  operation;   
};


/* Response format sent back to DomU; fields echo the request. */
struct as_response {
    unsigned int  id;              /* copied from request */
    unsigned int  status; 
    unsigned int  operation;       /* copied from request */
};


typedef struct as_request as_request_t;
typedef struct as_response as_response_t;


// From /include/xen/interface/io/ring.h
// Generates struct as_sring plus the as_front_ring / as_back_ring types.
DEFINE_RING_TYPES(as, struct as_request, struct as_response);

/* Backend-side state: event-channel binding and back-ring view. */
struct info_t {
      int irq;                  /* Linux irq bound to the event channel */
      int gref;                 /* grant reference published by DomU */
      int remoteDomain;         /* frontend domain id (hard-coded to 1 in init) */
      int evtchn;               /* remote event-channel port from DomU */
      struct as_back_ring ring; /* backend (Dom0) view of the shared ring */
} info;


int gref;
int port;
/* gref and port are taken as module parameters at insmod time; the DomU
 * module prints them from its init_module() (no xenstore handshake here). */
module_param(gref, int, 0644);
module_param(port, int, 0644);


/*关于中断处理函数的返回值:中断程序的返回值是一个特殊类型—irqreturn_t。但是中断程序的返回值却只有两个—IRQ_NONE和IRQ_HANDLED。
 
#ifndef _LINUX_IRQRETURN_H
#define _LINUX_IRQRETURN_H
typedef int irqreturn_t;
 
#define IRQ_NONE       (0)
#define IRQ_HANDLED       (1)
#define IRQ_RETVAL(x)      ((x) != 0)  //这个宏只是返回0或非0
 
#endif*/


#if ENABLE_SRING
static irqreturn_t as_int (int irq, void *dev_id)//io环操作 **请求,应答
{
      RING_IDX rc, rp;//typedef unsigned int RING_IDX;
      as_request_t req;
      as_response_t resp;
      int more_to_do, notify;


      // dev_id is a pointer to the info structure
      printk("\nxen:Dom0: as_int called with dev_id %x info=%x",
            (unsigned int)dev_id, (unsigned int)&info);
      rc = info.ring.req_cons;
      rp = info.ring.sring->req_prod;
      printk("  rc =%d rp =%d", rc, rp);
      while(rc!=rp) {
  /*
define RING_REQUEST_CONS_OVERFLOW(_r, _cons)                         
(((_cons) - (_r)->rsp_prod_pvt) >= RING_SIZE(_r))


*/
            if(RING_REQUEST_CONS_OVERFLOW(&info.ring, rc))//RING 请求溢出
                  break;
            // what did we get from the frontend at index rc
            memcpy(&req, RING_GET_REQUEST(&info.ring, rc), sizeof(req));
            resp.id = req.id;
            resp.operation = req.operation;
            resp.status = req.status+1; // Send back a status +1 of what was recvd
            printk("\nxen:Dom0: Recvd at IDX-%d: id=%d, op=%d, status=%d",
                  rc, req.id, req.operation, req.status);


            // update the req-consumer
            info.ring.req_cons = ++rc;
            barrier();//防止读写出错
            switch (req.operation) {
            case 0:
                  printk("\nxen:Dom0: req.operation = 0");
                  break;
            default:
                  printk("\nxen:Dom0: req.operation = %d", req.operation);
                  break;
            }


            memcpy(RING_GET_RESPONSE(&info.ring, info.ring.rsp_prod_pvt),
                        &resp, sizeof(resp));
            info.ring.rsp_prod_pvt++;
            RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&info.ring, notify);
            if(info.ring.rsp_prod_pvt == info.ring.req_cons) {
                  RING_FINAL_CHECK_FOR_REQUESTS(&info.ring, more_to_do);
            } else if (RING_HAS_UNCONSUMED_REQUESTS(&info.ring)) {//还有未处理req
                  more_to_do = 1;
            }


            if(notify) {
                  printk("\nxen:Dom0: Send notify to DomU");
                  notify_remote_via_irq(info.irq);
            }
      }


      return IRQ_HANDLED;
}
#endif
 
int init_module(void)
{
      struct vm_struct *v_start;
#if ENABLE_SRING
      as_sring_t *sring;
#endif
      int err;
 
      info.gref = gref;
      info.remoteDomain = 1;
      info.evtchn = port;
      printk("\nxen: dom0: init_module with gref = %d", info.gref);
 
      // The following function reserves a range of kernel address space and
      // allocates pagetables to map that range. No actual mappings are created. 
      v_start = alloc_vm_area(PAGE_SIZE);//分配虚拟地址结构
      if (v_start == 0) {//无法分配
            free_vm_area(v_start);
            printk("\nxen: dom0: could not allocate page");
            return -EFAULT;
      }


/* struct vm_struct {  
    struct vm_struct    *next;//指向下一虚拟地址,加速查询  
    void            *addr;//地址  
    unsigned long       size;//大小  
    unsigned long       flags;//标志  
    struct page     **pages;//页指针  
    unsigned int        nr_pages;  
    unsigned long       phys_addr;//物理地址  
};  
struct vm_struct *alloc_vm_area(size_t size);//分配虚拟地址结构  
void free_vm_area(struct vm_struct *area);//释放虚拟地址结构*/


      /*
       * ops struct in paramaeres
       * host_addr, flags, ref 
       * ops struct out parameters 
       * status (zero if OK), handle (used to unmap later), dev_bus_addr 
       */
//分配内存
      gnttab_set_map_op(&ops, (unsigned long)v_start->addr, GNTMAP_host_map,
                  info.gref, info.remoteDomain); /* flags, ref, domID */
//GNTTABOP_map_grant_ref  **操作码(映射到自己空间)   
//HYPERVISOR_grant_table_op超级调用  
      if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &ops, 1)) {
            printk("\nxen: dom0: HYPERVISOR map grant ref failed");
            return -EFAULT;
      }


      if (ops.status) {
            printk("\nxen: dom0: HYPERVISOR map grant ref failed status = %d",
                        ops.status);
            return -EFAULT;
      }
      printk("\nxen: dom0: shared_page = %x, handle = %x, status = %x",
                  (unsigned int)v_start->addr, ops.handle, ops.status);


      // Used for unmapping
      unmap_ops.host_addr = (unsigned long)(v_start->addr);
      unmap_ops.handle = ops.handle;
 
#define ENABLE_PRINT_PAGE 1
#if ENABLE_PRINT_PAGE  //验证DomU page中写入的字符串"aseem sethi"
{
int i;
printk("\nBytes in page ");
for(i=0;i<=10;i++)
{
printk("%c", ((char*)(v_start->addr))[i]);
}
}
#endif


#if ENABLE_SRING
      sring = (as_sring_t*)v_start->addr;
      BACK_RING_INIT(&info.ring, sring, PAGE_SIZE);


      /* Seetup an event channel to the frontend */
      err = bind_interdomain_evtchn_to_irqhandler(info.remoteDomain,
                info.evtchn, as_int, 0, "dom0-backend", &info);
        if (err < 0) {
            printk("\nxen: dom0: init_module failed binding to evtchn !");
            err = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
                  &unmap_ops, 1);
            return -EFAULT;
      }


      info.irq = err;


      printk("\nxen: dom0: end init_module: int = %d", info.irq);
#endif
      printk("\nXEN: dom: end init_module\n");
      return 0;
}


/*
 * Module exit: unmap the foreign frame mapped in init_module().
 * unmap_ops.host_addr/handle were filled in by init_module().
 * NOTE(review): only the hypercall's return is checked, not
 * unmap_ops.status; and the vm area from alloc_vm_area() is never freed
 * (its pointer was local to init_module()) -- both pre-existing issues.
 */
void cleanup_module(void) {
      int ret;


      printk("\nxen: dom0: cleanup_module");
      // Unmap foreign frames
      // ops.handle points to the pages that were initially mapped. Set in the
      // __init() function
      // ops.host_addr points to the heap where the pages were mapped
      ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unmap_ops, 1);
      if (ret == 0) {
            printk(" cleanup_module: unmapped shared frame");
      } else {
            printk(" cleanup_module: unmapped shared frame failed");
      }
      printk("...\n");
}


MODULE_LICENSE("GPL");


下面继续大致分析一下流程:

irqreturn_t as_int ()查看RING是否溢出,取得请求,取得回复,之后是检查RING中是否还有请求。这里需要大家有一些RING环的知识。关于barrier()函数,我这里有一个网址推荐

http://www.cnblogs.com/whyandinside/archive/2012/11/07/2759014.html

init_module(void) 映射到自己空间,并分配虚拟地址,验证addr地址中的字符串。

最后就是取消映射了

你可能感兴趣的:(xen 前端后端)