virtio后端方案vhost

vhost是virtio的另一种方案,用于跳过qemu,减少qemu和内核之间上下文切换的开销,对于网络IO而言提升尤其明显。vhost目前有两种实现方案,内核态和用户态,本文重点讨论内核态的vhost

vhost内核模块主要处理数据面的事情,控制面上还是交给qemu,vhost的数据结构如下

/* Per-device vhost state as kept by the userspace (qemu) side. */
struct vhost_dev {
    MemoryListener memory_listener;  /* set of callbacks for physical-memory operations */
    struct vhost_memory *mem;
    int n_mem_sections;
    MemoryRegionSection *mem_sections;
    struct vhost_virtqueue *vqs;  /* array of vhost_virtqueue; its length is nvqs */
    int nvqs;
    /* the first virtqueue which would be used by this vhost dev */
    int vq_index;
    unsigned long long features;  /* features supported by the vhost device */
    unsigned long long acked_features;  /* features acked by the guest */
    unsigned long long backend_features;  /* features supported by the backend, e.g. a tap device */
    bool started;
    bool log_enabled;
    vhost_log_chunk_t *log;
    unsigned long long log_size;
    Error *migration_blocker;
    bool force;
    bool memory_changed;
    hwaddr mem_changed_start_addr;
    hwaddr mem_changed_end_addr;
    const VhostOps *vhost_ops; /* kernel and user vhost each provide their own VhostOps; the kernel one ends up issuing ioctls */
    void *opaque;
};

/* Per-queue vhost state as kept by the userspace (qemu) side. */
struct vhost_virtqueue {
    int kick;  /* presumably the fd used for guest->host notification -- confirm */
    int call;  /* presumably the fd used for host->guest notification -- confirm */
    void *desc;
    void *avail;
    void *used;
    int num;
    unsigned long long used_phys;
    unsigned used_size;
    void *ring;
    unsigned long long ring_phys;
    unsigned ring_size;
    EventNotifier masked_notifier;
};
vhost的内存布局,也是由一组vhost_memory_region构成,

/* One entry of the vhost memory table (see struct vhost_memory.regions). */
struct vhost_memory_region {
    __u64 guest_phys_addr;
    __u64 memory_size; /* bytes */
    __u64 userspace_addr;
    __u64 flags_padding; /* No flags are currently specified. */
};

/* All region addresses and sizes must be 4K aligned. */
#define VHOST_PAGE_SIZE 0x1000

/*
 * Variable-length memory table: a fixed header followed by `nregions`
 * vhost_memory_region entries.
 *
 * `regions` is a C99 flexible array member rather than a zero-length
 * array (a GNU extension); sizeof(struct vhost_memory) and
 * offsetof(struct vhost_memory, regions) are unchanged.
 */
struct vhost_memory {
    __u32 nregions;  /* number of entries in regions[] */
    __u32 padding;   /* vhost_set_memory() rejects a non-zero value */
    struct vhost_memory_region regions[];
};
vhost的控制面由qemu来控制,通过ioctl操作vhost_xxx的内核模块,e.g.

/*
 * Device-level ioctl dispatcher (excerpt; some cases are elided below).
 *
 * VHOST_SET_OWNER is handled before the ownership check; every other
 * request first has to pass vhost_dev_check_owner(). Requests not matched
 * by an explicit case are forwarded to vhost_set_vring().
 *
 * Returns the result of the selected handler.
 */
long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
{
    void __user *argp = (void __user *)arg;
    struct file *eventfp, *filep = NULL;
    struct eventfd_ctx *ctx = NULL;
    u64 p;
    long r;
    int i, fd;

    /* If you are not the owner, you can become one */
    if (ioctl == VHOST_SET_OWNER) {
        r = vhost_dev_set_owner(d);
        goto done;
    }

    /* You must be the owner to do anything else */
    r = vhost_dev_check_owner(d);
    if (r)
        goto done;

    switch (ioctl) {
    case VHOST_SET_MEM_TABLE:
        r = vhost_set_memory(d, argp);
        break;
...
    default:
        /* anything else is treated as a per-vring request */
        r = vhost_set_vring(d, ioctl, argp);
        break;
    }
done:
    return r;
}

VHOST_SET_OWNER,用于把当前guest对应的qemu进程和vhost内核线程关联起来

VHOST_SET_OWNER
/*
 * Bind the device to the calling process and spawn its worker thread.
 * Caller should have device mutex.
 *
 * Returns 0 on success, -EBUSY if the device already has an owner, or the
 * error reported by thread creation, cgroup attach or iovec allocation.
 */
static long vhost_dev_set_owner(struct vhost_dev *dev)
{
    struct task_struct *kthread;
    int rc;

    /* An mm already recorded means somebody owns the device. */
    if (dev->mm)
        return -EBUSY;

    /* Take a reference on the caller's mm (the qemu process). */
    dev->mm = get_task_mm(current);

    /* Create the vhost worker thread, named after the caller's pid. */
    kthread = kthread_create(vhost_worker, dev, "vhost-%d", current->pid);
    if (IS_ERR(kthread)) {
        rc = PTR_ERR(kthread);
        goto drop_mm;
    }

    dev->worker = kthread;
    wake_up_process(kthread);    /* avoid contributing to loadavg */

    rc = vhost_attach_cgroups(dev);
    if (!rc)
        rc = vhost_dev_alloc_iovecs(dev); /* iovec storage for the virtqueues */
    if (rc)
        goto stop_worker;

    return 0;

stop_worker:
    kthread_stop(kthread);
    dev->worker = NULL;
drop_mm:
    if (dev->mm)
        mmput(dev->mm);
    dev->mm = NULL;
    return rc;
}
VHOST_SET_MEM_TABLE,初始化vhost_dev的vhost_memory内存成员
/*
 * VHOST_SET_MEM_TABLE: replace the device's memory table with the one
 * supplied by userspace.
 *
 * Returns 0 on success; -EFAULT if the user buffer cannot be copied or the
 * new table fails memory_access_ok(); -EOPNOTSUPP if padding is non-zero;
 * -E2BIG if there are too many regions; -ENOMEM on allocation failure.
 */
static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
{
    const unsigned long hdr_len = offsetof(struct vhost_memory, regions);
    struct vhost_memory hdr, *fresh, *stale;
    size_t regions_len;

    /* Fetch and validate the fixed-size header first. */
    if (copy_from_user(&hdr, m, hdr_len))
        return -EFAULT;
    if (hdr.padding)
        return -EOPNOTSUPP;
    if (hdr.nregions > VHOST_MEMORY_MAX_NREGIONS)
        return -E2BIG;

    /* Header plus nregions trailing vhost_memory_region entries. */
    regions_len = hdr.nregions * sizeof *m->regions;
    fresh = kmalloc(hdr_len + regions_len, GFP_KERNEL);
    if (!fresh)
        return -ENOMEM;

    memcpy(fresh, &hdr, hdr_len);
    if (copy_from_user(fresh->regions, m->regions, regions_len) ||
        !memory_access_ok(d, fresh, vhost_has_feature(d, VHOST_F_LOG_ALL))) {
        kfree(fresh);
        return -EFAULT;
    }

    /* Publish the new table, wait for readers, then drop the old one. */
    stale = d->memory;
    rcu_assign_pointer(d->memory, fresh);
    synchronize_rcu();
    kfree(stale);
    return 0;
}
VHOST_GET_FEATURES, VHOST_SET_FEATURES,用于读写vhost支持的features,目前只有vhost_net模块用到,
/*
 * Bitmask of feature bits accepted by vhost_net_ioctl():
 * VHOST_GET_FEATURES reports exactly this mask and VHOST_SET_FEATURES
 * rejects any bit outside it.
 */
enum {
    VHOST_FEATURES = (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
             (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
             (1ULL << VIRTIO_RING_F_EVENT_IDX) |
             (1ULL << VHOST_F_LOG_ALL) |
             (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
             (1ULL << VIRTIO_NET_F_MRG_RXBUF),
};

/*
 * ioctl entry point of the vhost-net device (excerpt; declarations and the
 * other cases are elided).
 */
static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
                unsigned long arg)
{
....
    case VHOST_GET_FEATURES:
        /* report the fixed VHOST_FEATURES mask to userspace */
        features = VHOST_FEATURES;
        if (copy_to_user(featurep, &features, sizeof features))
            return -EFAULT;
        return 0;
    case VHOST_SET_FEATURES:
        if (copy_from_user(&features, featurep, sizeof features))
            return -EFAULT;
        /* refuse any bit that is not part of VHOST_FEATURES */
        if (features & ~VHOST_FEATURES)
            return -EOPNOTSUPP;
        return vhost_net_set_features(n, features);
....
}
VHOST_SET_VRING_CALL,设置irqfd,把中断注入guest

VHOST_SET_VRING_KICK,设置ioeventfd,获取guest notify

    /*
     * Excerpt from the per-vring ioctl handler: the KICK and CALL cases
     * followed by the common tail that runs after the switch.
     */
    case VHOST_SET_VRING_KICK:
        if (copy_from_user(&f, argp, sizeof f)) {
            r = -EFAULT;
            break;
        }
        /* fd == -1 means "no eventfd" */
        eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
        if (IS_ERR(eventfp)) {
            r = PTR_ERR(eventfp);
            break;
        }
        if (eventfp != vq->kick) { /* new file differs from vq->kick: stop polling the old one, start polling the new one */
            pollstop = filep = vq->kick;
            pollstart = vq->kick = eventfp;
        } else
            filep = eventfp;  /* same file: no stop/start needed, only the extra reference is dropped below */
        break;
    case VHOST_SET_VRING_CALL:
        if (copy_from_user(&f, argp, sizeof f)) {
            r = -EFAULT;
            break;
        }
        eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
        if (IS_ERR(eventfp)) {
            r = PTR_ERR(eventfp);
            break;
        }
        if (eventfp != vq->call) {  /* new file differs from vq->call: install it and remember the old file/ctx for release below */
            filep = vq->call;
            ctx = vq->call_ctx;
            vq->call = eventfp;
            vq->call_ctx = eventfp ?
                eventfd_ctx_fileget(eventfp) : NULL;
        } else
            filep = eventfp;
        break;
    if (pollstop && vq->handle_kick)
        vhost_poll_stop(&vq->poll);

    if (ctx)
        eventfd_ctx_put(ctx); /* after the poll stop, release the previously held ctx */
    if (filep)
        fput(filep);  /* after the poll stop, release the previously held file */

    if (pollstart && vq->handle_kick)
        vhost_poll_start(&vq->poll, vq->kick);

    mutex_unlock(&vq->mutex);

    /* the flush is issued only after vq->mutex has been dropped */
    if (pollstop && vq->handle_kick)
        vhost_poll_flush(&vq->poll);
    return r;
下面来看下vhost的数据流。vhost与kvm模块之间的事件通知通过eventfd来实现:guest到host方向的kick event通过ioeventfd实现,host到guest方向的call event通过irqfd实现。

host到guest方向

首先host处理used ring,然后判断如果KVM_IRQFD成功设置,kvm模块会通过irqfd把中断注入guest。qemu是通过virtio_pci_set_guest_notifiers -> kvm_virtio_pci_vector_use -> kvm_virtio_pci_irqfd_use -> kvm_irqchip_add_irqfd_notifier -> kvm_irqchip_assign_irqfd最终调用kvm_vm_ioctl来设置kvm模块的irqfd的,包含write fd和read fd(可选)

/*
 * Set up the KVM MSI route (and, when the device class supports guest
 * notifier masking, the irqfd) for each of the first nvqs queues.
 *
 * Stops at the first queue whose size is 0. Queues whose vector is not
 * within the allocated MSI-X range are skipped. On failure everything set
 * up for earlier queues is released again and the negative error is
 * returned; returns 0 on success.
 */
static int kvm_virtio_pci_vector_use(VirtIOPCIProxy *proxy, int nvqs)
{
    PCIDevice *pdev = &proxy->pci_dev;
    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
    VirtioDeviceClass *vdc = VIRTIO_DEVICE_GET_CLASS(vdev);
    unsigned int vec;
    int rc, q;

    for (q = 0; q < nvqs && virtio_queue_get_num(vdev, q); q++) {
        vec = virtio_queue_vector(vdev, q);
        if (vec >= msix_nr_vectors_allocated(pdev)) {
            continue;
        }

        rc = kvm_virtio_pci_vq_vector_use(proxy, q, vec,
                                          msix_get_message(pdev, vec));
        if (rc < 0) {
            goto undo;
        }

        /* If guest supports masking, set up irqfd now.
         * Otherwise, delay until unmasked in the frontend.
         */
        if (!vdc->guest_notifier_mask) {
            continue;
        }
        rc = kvm_virtio_pci_irqfd_use(proxy, q, vec);
        if (rc < 0) {
            kvm_virtio_pci_vq_vector_release(proxy, vec);
            goto undo;
        }
    }
    return 0;

undo:
    /* Walk back over the queues that were fully set up before the failure. */
    for (q--; q >= 0; q--) {
        vec = virtio_queue_vector(vdev, q);
        if (vec >= msix_nr_vectors_allocated(pdev)) {
            continue;
        }
        if (vdc->guest_notifier_mask) {
            kvm_virtio_pci_irqfd_release(proxy, q, vec);
        }
        kvm_virtio_pci_vq_vector_release(proxy, vec);
    }
    return rc;
}
如果没有设置irqfd,则guest notifier fd会通知到等待fd的qemu进程,进入注册函数virtio_queue_guest_notifier_read,调用virtio_irq,最终调用到virtio_pci_notify

/*
 * Handler for a queue's guest notifier: if the notifier was pending
 * (test-and-clear succeeds), raise the queue's interrupt via virtio_irq().
 */
static void virtio_queue_guest_notifier_read(EventNotifier *n)
{
    VirtQueue *queue;

    if (!event_notifier_test_and_clear(n)) {
        return;
    }
    queue = container_of(n, VirtQueue, guest_notifier);
    virtio_irq(queue);
}

/*
 * Raise the interrupt for a virtqueue: set bit 0 of the device ISR and
 * notify the queue's vector.
 */
void virtio_irq(VirtQueue *vq)
{
    trace_virtio_irq(vq);
    vq->vdev->isr = vq->vdev->isr | 0x01;
    virtio_notify_vector(vq->vdev, vq->vector);
}

/*
 * Forward a vector notification to the parent bus class' notify hook,
 * if the bus class provides one.
 */
static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector)
{
    BusState *parent_bus = qdev_get_parent_bus(DEVICE(vdev));
    VirtioBusClass *bus_class = VIRTIO_BUS_GET_CLASS(parent_bus);

    if (!bus_class->notify) {
        return;
    }
    bus_class->notify(parent_bus->parent, vector);
}

/*
 * virtio-pci notify hook: deliver through MSI-X when it is enabled,
 * otherwise drive the PCI irq line from bit 0 of the device ISR.
 */
static void virtio_pci_notify(DeviceState *d, uint16_t vector)
{
    VirtIOPCIProxy *proxy = to_virtio_pci_proxy_fast(d);
    VirtIODevice *vdev;

    if (msix_enabled(&proxy->pci_dev)) {
        msix_notify(&proxy->pci_dev, vector);
        return;
    }

    vdev = virtio_bus_get_device(&proxy->bus);
    pci_set_irq(&proxy->pci_dev, vdev->isr & 1);
}
整个过程如图所示 ( 摘自http://royluo.org/2014/08/22/vhost/ )

guest到host方向

guest通过向virtio设备已注册ioeventfd的PIO/MMIO地址写入从而产生VMEXIT,被kvm截获之后触发已注册fd的notification

kvm_init:
    memory_listener_register(&kvm_memory_listener, &address_space_memory);
    memory_listener_register(&kvm_io_listener, &address_space_io);

/*
 * KVM's MemoryListener callbacks for memory regions, dirty logging,
 * ioeventfds and coalesced MMIO.
 */
static MemoryListener kvm_memory_listener = {
    .region_add = kvm_region_add,
    .region_del = kvm_region_del,
    .log_start = kvm_log_start,
    .log_stop = kvm_log_stop,
    .log_sync = kvm_log_sync,
    .log_global_start = kvm_log_global_start,
    .log_global_stop = kvm_log_global_stop,
    .eventfd_add = kvm_mem_ioeventfd_add,
    .eventfd_del = kvm_mem_ioeventfd_del,
    .coalesced_mmio_add = kvm_coalesce_mmio_region,
    .coalesced_mmio_del = kvm_uncoalesce_mmio_region,
    .priority = 10,
};

/*
 * KVM's MemoryListener for port I/O ioeventfds; it only provides the
 * eventfd add/del callbacks.
 */
static MemoryListener kvm_io_listener = {
    .eventfd_add = kvm_io_ioeventfd_add,
    .eventfd_del = kvm_io_ioeventfd_del,
    .priority = 10,
};

/*
 * eventfd_add callback of kvm_io_listener: register the notifier's fd as a
 * PIO ioeventfd covering the given section. Any failure is fatal: the error
 * is printed to stderr and the process aborts.
 */
static void kvm_io_ioeventfd_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
{
    const int notifier_fd = event_notifier_get_fd(e);
    int rc;

    rc = kvm_set_ioeventfd_pio(notifier_fd,
                               section->offset_within_address_space,
                               data, true, int128_get64(section->size),
                               match_data);
    if (rc >= 0) {
        return;
    }

    fprintf(stderr, "%s: error adding ioeventfd: %s\n",
            __func__, strerror(-rc));
    abort();
}
而kvm_io_ioeventfd_add最终调用了kvm_set_ioeventfd_pio,后者调用了kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick)进入到了kvm.ko中

static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
                                 bool assign, uint32_t size, bool datamatch)
{
    struct kvm_ioeventfd kick = {
        .datamatch = datamatch ? val : 0,
        .addr = addr,
        .flags = KVM_IOEVENTFD_FLAG_PIO,
        .len = size,
        .fd = fd,
    };
    int r;
    if (!kvm_enabled()) {
        return -ENOSYS;
    }
    if (datamatch) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
}
KVM_IOEVENTFD的ioctl最终调用了kvm的kvm_ioeventfd函数,后者会调用到kvm_assign_ioeventfd或者kvm_deassign_ioeventfd

/*
 * KVM_IOEVENTFD entry point: dispatch to deassign when the DEASSIGN flag
 * is set, otherwise to assign.
 */
int
kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
    const int deassign = args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN;

    return deassign ? kvm_deassign_ioeventfd(kvm, args)
                    : kvm_assign_ioeventfd(kvm, args);
}

static int
kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
    int                       pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
    enum kvm_bus              bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
    struct _ioeventfd        *p; /* ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. */
    struct eventfd_ctx       *eventfd;  /* mostly wait_queue_head_t */
    int                       ret;

    /* must be natural-word sized */
    switch (args->len) {
    case 1:
    case 2:
    case 4:
    case 8:
        break;
    default:
        return -EINVAL;
    }

    /* check for range overflow */
    if (args->addr + args->len < args->addr)
        return -EINVAL;

    /* check for extra flags that we don't understand */
    if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
        return -EINVAL;

    eventfd = eventfd_ctx_fdget(args->fd);  /* file->private_data */
    if (IS_ERR(eventfd))
        return PTR_ERR(eventfd);

    p = kzalloc(sizeof(*p), GFP_KERNEL); /* 分配一个_ioeventfd,并把内存地址,长度,eventfd_ctx与其关联起来 */
    if (!p) {
        ret = -ENOMEM;
        goto fail;
    }

    INIT_LIST_HEAD(&p->list);
    p->addr    = args->addr;
    p->length  = args->len;
    p->eventfd = eventfd;

    /* The datamatch feature is optional, otherwise this is a wildcard */
    if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
        p->datamatch = args->datamatch;
    else
        p->wildcard = true;

    mutex_lock(&kvm->slots_lock);

    /* Verify that there isnt a match already */
    if (ioeventfd_check_collision(kvm, p)) {
        ret = -EEXIST;
        goto unlock_fail;
    }

    kvm_iodevice_init(&p->dev, &ioeventfd_ops);

    ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev); /* 注册到kvm的pio bus或者mmio bus上 */
    if (ret < 0)
        goto unlock_fail;

    list_add_tail(&p->list, &kvm->ioeventfds); /* 添加到kvm.ko的ioeventfds的list中 */

    mutex_unlock(&kvm->slots_lock);

    return 0;

unlock_fail:
    mutex_unlock(&kvm->slots_lock);

fail:
    kfree(p);
    eventfd_ctx_put(eventfd);

    return ret;
}
kvm_assign_ioeventfd中,通过注册一个pio/mmio的地址段和一个fd,当访问这块内存产生的VMEXIT就会在kvm.ko中被转化成为fd的event notification,
/*
 * I/O device callbacks installed on every _ioeventfd by
 * kvm_iodevice_init(); only write and destructor are provided.
 */
static const struct kvm_io_device_ops ioeventfd_ops = {
    .write      = ioeventfd_write,
    .destructor = ioeventfd_destructor,
};

/* MMIO/PIO writes trigger an event if the addr/val match */
/*
 * MMIO/PIO writes trigger an event if the addr/val match.
 *
 * Returns 0 after signalling the eventfd, or -EOPNOTSUPP when the access
 * does not match this ioeventfd.
 */
static int
ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
        const void *val)
{
    struct _ioeventfd *ioev = to_ioeventfd(this);

    if (ioeventfd_in_range(ioev, addr, len, val)) {
        eventfd_signal(ioev->eventfd, 1);
        return 0;
    }

    return -EOPNOTSUPP;
}
最终event notification通过eventfd_signal,唤醒vhost线程,整体的流程如下图所示



vhost的控制面和数据面如下图所示





最后,以vhost-net为例说明下vhost网络报文的初始化以及收发流程,e.g.

qemu通过netdev tap,vhost=on在创建网络设备时指定后端基于vhost,net_init_tap会对vhost的每个queue,调用net_init_tap_one初始化vhost。初始化的工作通过vhost_net_init完成

/* Parameters handed to vhost_net_init(). */
typedef struct VhostNetOptions {
    VhostBackendType backend_type;  /* vhost kernel or userspace */
    NetClientState *net_backend;  /* TAPState device */
    void *opaque;  /* the vhost fd (from /dev/vhost-net) used for ioctls, cast to a pointer */
    bool force;
} VhostNetOptions;

/*
 * Initialise one tap queue (excerpt; the code before and after the vhost
 * setup is elided). When vhost is requested, obtain a vhost fd and create
 * the vhost_net instance for this queue.
 */
static int net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer,
                            const char *model, const char *name,
                            const char *ifname, const char *script,
                            const char *downscript, const char *vhostfdname,
                            int vnet_hdr, int fd)
{
...
    /* vhost is used if explicitly enabled, or implied by a vhost fd / vhostforce */
    if (tap->has_vhost ? tap->vhost :
        vhostfdname || (tap->has_vhostforce && tap->vhostforce)) {
        VhostNetOptions options;

        options.backend_type = VHOST_BACKEND_TYPE_KERNEL;
        options.net_backend = &s->nc;
        options.force = tap->has_vhostforce && tap->vhostforce;

        if ((tap->has_vhostfd || tap->has_vhostfds)) {
            /* fd was passed in by the management layer */
            vhostfd = monitor_handle_fd_param(cur_mon, vhostfdname);
            if (vhostfd == -1) {
                return -1;
            }
        } else {
            vhostfd = open("/dev/vhost-net", O_RDWR);  /* open /dev/vhost-net for ioctl usage */
            if (vhostfd < 0) {
                error_report("tap: open vhost char device failed: %s",
                           strerror(errno));
                return -1;
            }
        }
        qemu_set_cloexec(vhostfd);
        options.opaque = (void *)(uintptr_t)vhostfd;
        s->vhost_net = vhost_net_init(&options);  /* initialise struct vhost_net */
        if (!s->vhost_net) {
            /* NOTE(review): vhostfd is not closed on this path in the visible excerpt -- confirm ownership */
            error_report("vhost-net requested but could not be initialized");
            return -1;
        }
    }
...
}

/* qemu-side vhost-net instance; one per tap queue. */
struct vhost_net {
    struct vhost_dev dev;
    struct vhost_virtqueue vqs[2];  /* one TX and one RX queue; dev.vqs points here */
    int backend;                    /* fd of the net backend, or -1 for a non-kernel vhost backend */
    NetClientState *nc;             /* the net backend this instance serves */
};

/*
 * Allocate and initialise a vhost_net for the given backend.
 *
 * Returns the new instance, or NULL if no net backend was supplied, the
 * backend fd cannot be obtained, vhost_dev_init() fails, or (kernel
 * backend only) vhost lacks a feature the backend requires.
 */
struct vhost_net *vhost_net_init(VhostNetOptions *options)
{
    const bool kernel_backend =
        options->backend_type == VHOST_BACKEND_TYPE_KERNEL;
    struct vhost_net *vn = g_malloc(sizeof *vn);
    int rc;

    if (!options->net_backend) {
        fprintf(stderr, "vhost-net requires net backend to be setup\n");
        goto fail;
    }

    if (!kernel_backend) {
        vn->dev.backend_features = 0;
        vn->backend = -1;
    } else {
        rc = vhost_net_get_fd(options->net_backend);
        if (rc < 0) {
            goto fail;
        }
        vn->dev.backend_features = qemu_has_vnet_hdr(options->net_backend)
            ? 0 : (1 << VHOST_NET_F_VIRTIO_NET_HDR);
        vn->backend = rc;  /* fd belonging to the NetClientState */
    }
    vn->nc = options->net_backend;

    /* One TX and one RX queue; vhost_dev shares vhost_net's queue array. */
    vn->dev.nvqs = 2;
    vn->dev.vqs = vn->vqs;

    /* The original author notes the vhost kthread is created from here via VHOST_SET_OWNER. */
    rc = vhost_dev_init(&vn->dev, options->opaque,
                        options->backend_type, options->force);
    if (rc < 0) {
        goto fail;
    }

    if (!qemu_has_vnet_hdr_len(options->net_backend,
                               sizeof(struct virtio_net_hdr_mrg_rxbuf))) {
        vn->dev.features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);
    }

    /* Every feature the backend needs must be offered by vhost. */
    if (kernel_backend &&
        (~vn->dev.features & vn->dev.backend_features)) {
        fprintf(stderr, "vhost lacks feature mask %" PRIu64
               " for backend\n",
               (uint64_t)(~vn->dev.features & vn->dev.backend_features));
        vhost_dev_cleanup(&vn->dev);
        goto fail;
    }

    /* Set sane init value. Override when guest acks. */
    vhost_net_ack_features(vn, 0);
    return vn;

fail:
    g_free(vn);
    return NULL;
}
当guest启动成功,qemu会配置相应的vhost,调用virtio_net_set_status用于开启/关闭virtio-net设备及队列,virtio_net_set_status会调用到vhost_net_start用于打开vhost队列,调用vhost_net_stop用于关闭vhost队列

/*
 * Start vhost for all queues of a virtio-net device.
 *
 * Binds the guest notifiers for total_queues * 2 virtqueues, then starts
 * the vhost_net behind each NetClientState. On failure, instances already
 * started are stopped, the guest notifiers are unbound, and the negative
 * error is returned. Returns 0 on success.
 */
int vhost_net_start(VirtIODevice *dev, NetClientState *ncs,
                    int total_queues)
{
    BusState *parent_bus = BUS(qdev_get_parent_bus(DEVICE(dev)));
    VirtioBusState *vbus = VIRTIO_BUS(parent_bus);
    VirtioBusClass *bus_class = VIRTIO_BUS_GET_CLASS(vbus);
    int rc, unbind_rc, q;

    if (!vhost_net_device_endian_ok(dev)) {
        error_report("vhost-net does not support cross-endian");
        return -ENOSYS;
    }

    if (!bus_class->set_guest_notifiers) {
        error_report("binding does not support guest notifiers");
        return -ENOSYS;
    }

    /* Each queue pair takes two consecutive virtqueue indices. */
    for (q = 0; q < total_queues; q++) {
        vhost_net_set_vq_index(get_vhost_net(ncs[q].peer), q * 2);
    }

    /*
     * Configure the guest notifiers (irqfd etc.) through the bus binding.
     * Per the original author this path is also taken without vhost.
     */
    rc = bus_class->set_guest_notifiers(parent_bus->parent,
                                        total_queues * 2, true);
    if (rc < 0) {
        error_report("Error binding guest notifier: %d", -rc);
        return rc;
    }

    /*
     * With a multiqueue tap there is one NetClientState per tap queue, each
     * with its own vhost_net; start them one by one.
     */
    for (q = 0; q < total_queues; q++) {
        rc = vhost_net_start_one(get_vhost_net(ncs[q].peer), dev);
        if (rc < 0) {
            goto unwind;
        }
    }

    return 0;

unwind:
    /* Stop the instances started before the failing one, newest first. */
    for (q--; q >= 0; q--) {
        vhost_net_stop_one(get_vhost_net(ncs[q].peer), dev);
    }
    unbind_rc = bus_class->set_guest_notifiers(parent_bus->parent,
                                               total_queues * 2, false);
    if (unbind_rc < 0) {
        fprintf(stderr, "vhost guest notifier cleanup failed: %d\n", unbind_rc);
        fflush(stderr);
    }
    return rc;
}
/*
 * Start vhost processing for a single vhost_net instance.
 *
 * Enables the vhost notifiers, starts the vhost device, disables the
 * backend's own polling and, for a tap backend, attaches the tap fd to
 * every virtqueue with VHOST_NET_SET_BACKEND.
 *
 * Returns 0 on success (or if the device is already started) and a
 * negative value on failure, after undoing the steps already taken.
 */
static int vhost_net_start_one(struct vhost_net *net,
                               VirtIODevice *dev)
{
    struct vhost_vring_file file = { };
    int r;

    if (net->dev.started) {
        return 0;
    }

    net->dev.nvqs = 2;  /* one TX virtqueue and one RX virtqueue */
    net->dev.vqs = net->vqs;
    /*
     * Stop handling the guest's I/O notifications in qemu and start
     * handling them in vhost (per the original author, this enables the
     * vhost ioeventfd through the virtio-pci notifier setup).
     */
    r = vhost_dev_enable_notifiers(&net->dev, dev);
    if (r < 0) {
        goto fail_notifiers;
    }

    r = vhost_dev_start(&net->dev, dev);
    if (r < 0) {
        goto fail_start;
    }

    if (net->nc->info->poll) {
        net->nc->info->poll(net->nc, false);
    }

    if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) {
        /* qemu no longer services the tap fd itself */
        qemu_set_fd_handler(net->backend, NULL, NULL, NULL);
        file.fd = net->backend;
        for (file.index = 0; file.index < net->dev.nvqs; ++file.index) {
            const VhostOps *vhost_ops = net->dev.vhost_ops;
            r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND,
                                      &file);
            if (r < 0) {
                r = -errno;
                goto fail;
            }
        }
    }
    return 0;
fail:
    /* Detach the backend (fd = -1) from the queues attached so far. */
    file.fd = -1;
    if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) {
        while (file.index-- > 0) {
            const VhostOps *vhost_ops = net->dev.vhost_ops;
            /* separate variable so the original error in r is preserved */
            int undo_rc = vhost_ops->vhost_call(&net->dev,
                                                VHOST_NET_SET_BACKEND,
                                                &file);
            assert(undo_rc >= 0);
        }
    }
    if (net->nc->info->poll) {
        net->nc->info->poll(net->nc, true);
    }
    vhost_dev_stop(&net->dev, dev);
fail_start:
    vhost_dev_disable_notifiers(&net->dev, dev);
fail_notifiers:
    return r;
}
vhost net的各个数据结构之间关系如下图所示

我们来看下内核对vhost_net的定义,e.g.

/*
 * File operations of the vhost-net character device: open allocates the
 * per-open vhost_net, and ioctls are served by vhost_net_ioctl.
 */
static const struct file_operations vhost_net_fops = {
    .owner          = THIS_MODULE,
    .release        = vhost_net_release,
    .unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl   = vhost_net_compat_ioctl,
#endif
    .open           = vhost_net_open,
};

static struct miscdevice vhost_net_misc = {
    MISC_DYNAMIC_MINOR,
    "vhost-net",
    &vhost_net_fops,
};

/* Indices into vhost_net.vqs[] and vhost_net.poll[]. */
enum {
    VHOST_NET_VQ_RX = 0,
    VHOST_NET_VQ_TX = 1,
    VHOST_NET_VQ_MAX = 2,  /* number of queues, used as the array size */
};

/* State of TX socket polling; stored in vhost_net.tx_poll_state. */
enum vhost_net_poll_state {
    VHOST_NET_POLL_DISABLED = 0,  /* initial value set by vhost_net_open() */
    VHOST_NET_POLL_STARTED = 1,
    VHOST_NET_POLL_STOPPED = 2,
};

/* Kernel-side vhost-net instance, one per open of the vhost-net device. */
struct vhost_net {
    struct vhost_dev dev;
    struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; /* vhost virtqueues; their handle_kick callbacks are woken through the ioeventfd */
    struct vhost_poll poll[VHOST_NET_VQ_MAX];  /* one vhost_poll each for RX and TX socket I/O of the backend */
    /* Tells us whether we are polling a socket for TX.
     * We only do this when socket buffer fills up.
     * Protected by tx vq lock. */
    enum vhost_net_poll_state tx_poll_state;
};

/*
 * open() handler of the vhost-net device: allocate a vhost_net, wire up
 * the kick handlers and socket pollers, and stash it in f->private_data.
 *
 * Returns 0 on success, -ENOMEM if allocation fails, or the error from
 * vhost_dev_init().
 */
static int vhost_net_open(struct inode *inode, struct file *f)
{
    struct vhost_net *net;
    struct vhost_dev *vdev;
    int rc;

    net = kmalloc(sizeof *net, GFP_KERNEL);
    if (!net)
        return -ENOMEM;

    vdev = &net->dev;

    /* Callbacks run when the TX / RX virtqueue's kick fires. */
    net->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
    net->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;

    rc = vhost_dev_init(vdev, net->vqs, VHOST_NET_VQ_MAX);
    if (rc < 0) {
        kfree(net);
        return rc;
    }

    /* Socket-side pollers: TX waits for POLLOUT, RX for POLLIN. */
    vhost_poll_init(&net->poll[VHOST_NET_VQ_TX], handle_tx_net, POLLOUT, vdev);
    vhost_poll_init(&net->poll[VHOST_NET_VQ_RX], handle_rx_net, POLLIN, vdev);
    net->tx_poll_state = VHOST_NET_POLL_DISABLED;

    f->private_data = net;

    return 0;
}

handle_tx_kick/handle_rx_kick的实现和handle_tx_net/handle_rx_net完全一致,这里为什么要有两个不同的函数呢?看完下面的代码分析后你会有一个答案,不过我先在这里剧透下:handle_tx_kick/handle_rx_kick是等待在TX queue/RX queue的kick fd上的回调函数,handle_tx_net/handle_rx_net是等待在vhost_net TX poll/RX poll上的回调函数。无论对于TX还是RX而言,报文的路径都是一个两阶段的过程,e.g.

TX首先是kick virtqueue的fd,之后进行vring的buffer传递,最后通过NetClientState的socket fd发送,但socket有可能会出现缓冲区不足,或者本次发送的quota不够等情况,此时需要poll在socket的fd上阻塞等待。同理RX也是如此,一阶段阻塞在socket fd上,二阶段阻塞在virtqueue kick fd上


前面分析时已经提到过,qemu在vhost_virtqueue_start时,会取得VirtQueue的host_notifier的rfd,并把fd通过VHOST_SET_VRING_KICK传入vhost内核模块;同一个eventfd也通过KVM_IOEVENTFD注册到了kvm.ko,这样kvm.ko后续就会通过eventfd_signal通知这个fd。vhost模块会把这个fd和vhost_virtqueue->kick关联起来,并最终调用vhost_poll_start阻塞在这个poll fd上。

当guest发送报文时,ioeventfd触发了vhost_virtqueue的kick fd,POLLIN事件导致vhost_poll_wakeup被调用,最后唤醒了vhost worker线程,线程会调用注册的handle_kick函数,即handle_tx_kick

/*
 * Kick handler of the TX virtqueue: recover the virtqueue from its poll
 * work item, then the owning vhost_net from the queue's device, and run
 * the TX path.
 */
static void handle_tx_kick(struct vhost_work *work)
{
    struct vhost_virtqueue *txq;
    struct vhost_net *owner;

    txq = container_of(work, struct vhost_virtqueue, poll.work);
    owner = container_of(txq->dev, struct vhost_net, dev);

    handle_tx(owner);
}

/*
 * TX path: take buffers the guest made available on the TX virtqueue and
 * send them on the backend socket with sendmsg().
 *
 * The loop ends when the ring has no more buffers, a descriptor or send
 * error occurs, the socket is short on write memory (polling on the socket
 * is started instead), or VHOST_NET_WEIGHT bytes were sent (the work is
 * re-queued).
 */
static void handle_tx(struct vhost_net *net)
{
    struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
    unsigned out, in, s;
    int head;
    struct msghdr msg = {
        .msg_name = NULL,
        .msg_namelen = 0,
        .msg_control = NULL,
        .msg_controllen = 0,
        .msg_iov = vq->iov,
        .msg_flags = MSG_DONTWAIT,
    };
    size_t len, total_len = 0;
    int err, wmem;
    size_t hdr_size;
    struct vhost_ubuf_ref *uninitialized_var(ubufs);
    bool zcopy;
    struct socket *sock = rcu_dereference(vq->private_data); /* the backend socket is kept in the virtqueue as private_data */
    if (!sock)
        return;

    wmem = atomic_read(&sock->sk->sk_wmem_alloc);
    if (wmem >= sock->sk->sk_sndbuf) { /* write memory already allocated on the socket has reached the send buffer size */
        mutex_lock(&vq->mutex);
        tx_poll_start(net, sock); /* cannot send now: start polling the socket and return */
        mutex_unlock(&vq->mutex);
        return;
    }

    mutex_lock(&vq->mutex);
    vhost_disable_notify(&net->dev, vq); /* disable virtqueue notifications (per the original author, via the VRING_USED_F_NO_NOTIFY flag) */

    /* socket is less than half full: stop polling it */
    if (wmem < sock->sk->sk_sndbuf / 2)
        tx_poll_stop(net);
    hdr_size = vq->vhost_hlen;
    zcopy = vq->ubufs;

    for (;;) {
        /* Release DMAs done buffers first */
        if (zcopy)
            vhost_zerocopy_signal_used(vq);

        head = vhost_get_vq_desc(&net->dev, vq, vq->iov,   /* fetch the next available descriptor into vq->iov (original note: starting at last_avail_idx) */
                     ARRAY_SIZE(vq->iov),
                     &out, &in,
                     NULL, NULL);
        /* On error, stop handling until the next kick. */
        if (unlikely(head < 0))
            break;
        /* Nothing new?  Wait for eventfd to tell us they refilled. */
        if (head == vq->num) {  /* no new buffer from the frontend (original note: avail_idx == last_avail_idx) */
            int num_pends;

            wmem = atomic_read(&sock->sk->sk_wmem_alloc);
            if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
                tx_poll_start(net, sock);
                set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
                break;
            }
            /* If more outstanding DMAs, queue the work.
             * Handle upend_idx wrap around
             */
            num_pends = likely(vq->upend_idx >= vq->done_idx) ?
                    (vq->upend_idx - vq->done_idx) :
                    (vq->upend_idx + UIO_MAXIOV - vq->done_idx);
            if (unlikely(num_pends > VHOST_MAX_PEND)) {
                tx_poll_start(net, sock);
                set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
                break;
            }
            if (unlikely(vhost_enable_notify(&net->dev, vq))) { /* re-enable notifications before going idle */
                vhost_disable_notify(&net->dev, vq); /* non-zero return: presumably the avail ring changed meanwhile, so disable again and retry */
                continue;
            }
            break;
        }
        if (in) { /* TX descriptors are expected to be all "out" */
            vq_err(vq, "Unexpected descriptor format for TX: "
                   "out %d, int %d\n", out, in);
            break;
        }
        /* Skip header. TODO: support TSO. */
        s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out); /* per the original author, the hdr_size bytes are VNET_HDR metadata, not packet payload */
        msg.msg_iovlen = out;
        len = iov_length(vq->iov, out);
        /* Sanity check */
        if (!len) {
            vq_err(vq, "Unexpected header len for TX: "
                   "%zd expected %zd\n",
                   iov_length(vq->hdr, s), hdr_size);
            break;
        }
        /* use msg_control to pass vhost zerocopy ubuf info to skb */
        if (zcopy) {
            vq->heads[vq->upend_idx].id = head;
            if (len < VHOST_GOODCOPY_LEN) {
                /* copy don't need to wait for DMA done */
                vq->heads[vq->upend_idx].len =
                            VHOST_DMA_DONE_LEN;
                msg.msg_control = NULL;
                msg.msg_controllen = 0;
                ubufs = NULL;
            } else {
                struct ubuf_info *ubuf = &vq->ubuf_info[head];

                vq->heads[vq->upend_idx].len = len;
                ubuf->callback = vhost_zerocopy_callback;
                ubuf->arg = vq->ubufs;
                ubuf->desc = vq->upend_idx;
                msg.msg_control = ubuf;
                /* NOTE(review): sizeof(ubuf) is the size of the pointer, not of struct ubuf_info -- confirm intended */
                msg.msg_controllen = sizeof(ubuf);
                ubufs = vq->ubufs;
                kref_get(&ubufs->kref);
            }
            vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
        }
        /* TODO: Check specific error and bomb out unless ENOBUFS? */
        err = sock->ops->sendmsg(NULL, sock, &msg, len);
        if (unlikely(err < 0)) {
            if (zcopy) {
                if (ubufs)
                    vhost_ubuf_put(ubufs);
                vq->upend_idx = ((unsigned)vq->upend_idx - 1) %
                    UIO_MAXIOV;
            }
            vhost_discard_vq_desc(vq, 1); /* send failed: give the descriptor back (original note: rolls back last_avail_idx) */
            if (err == -EAGAIN || err == -ENOBUFS)
                tx_poll_start(net, sock);  /* poll the socket so the send is retried later */
            break;
        }
        if (err != len)
            pr_debug("Truncated TX packet: "
                 " len %d != %zd\n", err, len);
        if (!zcopy)
            vhost_add_used_and_signal(&net->dev, vq, head, 0); /* put the buffer on the used ring and signal the guest */
        else
            vhost_zerocopy_signal_used(vq);
        total_len += len;
        if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
            vhost_poll_queue(&vq->poll);  /* quota exceeded: re-queue the work and wait to be scheduled again */
            break;
        }
    }

    mutex_unlock(&vq->mutex);
}
收包过程首先是vhost阻塞在NetClientState的socket上,e.g.

vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev)

/*
 * Work handler of the RX socket poller: recover the owning vhost_net from
 * the RX poll's work item and run the RX path.
 */
static void handle_rx_net(struct vhost_work *work)
{
    struct vhost_net *owner;

    owner = container_of(work, struct vhost_net,
                         poll[VHOST_NET_VQ_RX].work);
    handle_rx(owner);
}

/*
 * Receive path for the RX virtqueue of a vhost-net device.
 *
 * While the backend socket reports a pending packet, fetch enough guest RX
 * buffers to hold it, receive the packet into those buffers with recvmsg,
 * fill in the virtio-net header fields, and publish the buffers as used.
 *
 * The loop stops when the socket has nothing pending, when no guest buffers
 * are available (after re-enabling notification), on an error, or once
 * total_len reaches VHOST_NET_WEIGHT, in which case the vq poll work is
 * queued again so processing resumes later.
 *
 * Returns immediately, without taking vq->mutex, if the virtqueue has no
 * socket attached (vq->private_data is NULL).
 */
static void handle_rx(struct vhost_net *net)
{
    struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
    unsigned uninitialized_var(in), log;
    struct vhost_log *vq_log;
    struct msghdr msg = {
        .msg_name = NULL,
        .msg_namelen = 0,
        .msg_control = NULL, /* FIXME: get and handle RX aux data. */
        .msg_controllen = 0,
        .msg_iov = vq->iov,
        .msg_flags = MSG_DONTWAIT,
    };

    /* Template header written in front of the packet when vhost_hlen != 0. */
    struct virtio_net_hdr_mrg_rxbuf hdr = {
        .hdr.flags = 0,
        .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
    };

    size_t total_len = 0;
    int err, headcount, mergeable;
    size_t vhost_hlen, sock_hlen;
    size_t vhost_len, sock_len;

    struct socket *sock = rcu_dereference(vq->private_data);

    if (!sock)
        return;

    mutex_lock(&vq->mutex);
    vhost_disable_notify(&net->dev, vq); /* turn off virtqueue event notification while we work */
    vhost_hlen = vq->vhost_hlen;
    sock_hlen = vq->sock_hlen;

    /* Only log writes when the VHOST_F_LOG_ALL feature is set. */
    vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
        vq->log : NULL;
    mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);

    while ((sock_len = peek_head_len(sock->sk))) {  /* length of the next packet */
        sock_len += sock_hlen;
        vhost_len = sock_len + vhost_hlen;
        /* get_rx_bufs takes available descriptors from the virtqueue until
         * their iovecs together can hold the next packet; this amounts to
         * calling vhost_get_vq_desc repeatedly. Without mergeable RX
         * buffers at most one head is requested. */
        headcount = get_rx_bufs(vq, vq->heads, vhost_len,
                    &in, vq_log, &log,
                    likely(mergeable) ? UIO_MAXIOV : 1);

        /* On error, stop handling until the next kick. */
        if (unlikely(headcount < 0))
            break;
        /* OK, now we need to know about added descriptors. */
        if (!headcount) {
            if (unlikely(vhost_enable_notify(&net->dev, vq))) {
                /* They have slipped one in as we were
                 * doing that: check again. */
                vhost_disable_notify(&net->dev, vq);
                continue;
            }
            /* Nothing new?  Wait for eventfd to tell us
             * they refilled. */
            break;
        }
        /* We don't need to be notified again. */
        if (unlikely((vhost_hlen)))
            /* Skip header. TODO: support TSO. */
            move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
        else
            /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
             * needed because recvmsg can modify msg_iov. */
            copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
        msg.msg_iovlen = in;
        err = sock->ops->recvmsg(NULL, sock, &msg,
                     sock_len, MSG_DONTWAIT | MSG_TRUNC);  /* packet lands in vq->iov (msg.msg_iov) */
        /* Userspace might have consumed the packet meanwhile:
         * it's not supposed to do this usually, but might be hard
         * to prevent. Discard data we got (if any) and keep going. */
        if (unlikely(err != sock_len)) {
            pr_debug("Discarded rx packet: "
                 " len %d, expected %zd\n", err, sock_len);
            vhost_discard_vq_desc(vq, headcount); /* undo get_rx_bufs: hand the headcount descriptors back */
            continue;
        }
        /* Write the template virtio-net header into the saved header iovec. */
        if (unlikely(vhost_hlen) &&
            memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
                      vhost_hlen)) {
            vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
                   vq->iov->iov_base);
            break;
        }
        /* TODO: Should check and handle checksum. */
        /* Store the number of buffers used into hdr.num_buffers.
         * NOTE(review): this copies the leading sizeof(hdr.num_buffers)
         * bytes of the int headcount, which looks byte-order dependent --
         * confirm behaviour on big-endian hosts. */
        if (likely(mergeable) &&
            memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
                      offsetof(typeof(hdr), num_buffers),
                      sizeof hdr.num_buffers)) {
            vq_err(vq, "Failed num_buffers write");
            vhost_discard_vq_desc(vq, headcount);
            break;
        }
        vhost_add_used_and_signal_n(&net->dev, vq, vq->heads,
                        headcount);  /* add headcount used elements and notify the guest */
        if (unlikely(vq_log))
            vhost_log_write(vq, vq_log, log, vhost_len);
        total_len += vhost_len;
        if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
            /* Quota exceeded: requeue and stop for now. Note that it is the
             * vq's poll that is queued here, so (per the original author's
             * note) the next run enters through handle_rx_kick. */
            vhost_poll_queue(&vq->poll);
            break;
        }
    }

    mutex_unlock(&vq->mutex);
}

你可能感兴趣的:(virtio后端方案vhost)