vhost is an alternative virtio backend scheme that bypasses qemu on the data path, cutting the context-switch overhead between qemu and the kernel; the gain is especially noticeable for network I/O. vhost currently has two implementations, one in kernel space and one in user space; this article focuses on the kernel-space vhost.
The vhost kernel module handles only the data plane; the control plane is still left to qemu. vhost's data structures are as follows:
```c
struct vhost_dev {
    MemoryListener memory_listener;      /* MemoryListener is the set of callbacks for physical-memory operations */
    struct vhost_memory *mem;
    int n_mem_sections;
    MemoryRegionSection *mem_sections;
    struct vhost_virtqueue *vqs;         /* array of vhost_virtqueues and its size */
    int nvqs;
    /* the first virtqueue which would be used by this vhost dev */
    int vq_index;
    unsigned long long features;         /* features supported by the vhost device */
    unsigned long long acked_features;   /* features acked by the guest */
    unsigned long long backend_features; /* features supported by the backend, e.g. a tap device */
    bool started;
    bool log_enabled;
    vhost_log_chunk_t *log;
    unsigned long long log_size;
    Error *migration_blocker;
    bool force;
    bool memory_changed;
    hwaddr mem_changed_start_addr;
    hwaddr mem_changed_end_addr;
    const VhostOps *vhost_ops;           /* VhostOps has separate kernel and user implementations; the kernel one ends up issuing ioctls */
    void *opaque;
};

struct vhost_virtqueue {
    int kick;
    int call;
    void *desc;
    void *avail;
    void *used;
    int num;
    unsigned long long used_phys;
    unsigned used_size;
    void *ring;
    unsigned long long ring_phys;
    unsigned ring_size;
    EventNotifier masked_notifier;
};
```

vhost's memory layout is likewise described by a set of vhost_memory_region entries:
```c
struct vhost_memory_region {
    __u64 guest_phys_addr;
    __u64 memory_size;   /* bytes */
    __u64 userspace_addr;
    __u64 flags_padding; /* No flags are currently specified. */
};

/* All region addresses and sizes must be 4K aligned. */
#define VHOST_PAGE_SIZE 0x1000

struct vhost_memory {
    __u32 nregions;
    __u32 padding;
    struct vhost_memory_region regions[0];
};
```

The vhost control plane is driven by qemu, which operates on the vhost_xxx kernel module through ioctls, e.g.
```c
long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg)
{
    void __user *argp = (void __user *)arg;
    struct file *eventfp, *filep = NULL;
    struct eventfd_ctx *ctx = NULL;
    u64 p;
    long r;
    int i, fd;

    /* If you are not the owner, you can become one */
    if (ioctl == VHOST_SET_OWNER) {
        r = vhost_dev_set_owner(d);
        goto done;
    }

    /* You must be the owner to do anything else */
    r = vhost_dev_check_owner(d);
    if (r)
        goto done;

    switch (ioctl) {
    case VHOST_SET_MEM_TABLE:
        r = vhost_set_memory(d, argp);
        break;
    ...
    default:
        r = vhost_set_vring(d, ioctl, argp);
        break;
    }
done:
    return r;
}
```
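Seen from userspace, the whole control plane is just a sequence of ioctls on /dev/vhost-net. The following is a minimal sketch of the setup order (not qemu's actual code; error handling is omitted and mem is assumed to be a vhost_memory table already built from the guest memory map):

```c
#include <fcntl.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

static int setup_vhost(struct vhost_memory *mem)
{
    int vhost_fd = open("/dev/vhost-net", O_RDWR);

    /* become the owner: binds our mm to a newly created vhost worker */
    ioctl(vhost_fd, VHOST_SET_OWNER);

    /* tell the worker how to translate guest physical addresses */
    ioctl(vhost_fd, VHOST_SET_MEM_TABLE, mem);

    /* wire up the notification paths for queue 0; a real setup would
     * also issue VHOST_SET_VRING_NUM/ADDR/BASE and, for vhost-net,
     * VHOST_NET_SET_BACKEND */
    struct vhost_vring_file kick = { .index = 0, .fd = eventfd(0, 0) };
    struct vhost_vring_file call = { .index = 0, .fd = eventfd(0, 0) };
    ioctl(vhost_fd, VHOST_SET_VRING_KICK, &kick); /* guest -> vhost */
    ioctl(vhost_fd, VHOST_SET_VRING_CALL, &call); /* vhost -> guest */

    return vhost_fd;
}
```

The individual commands are examined one by one below.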
VHOST_SET_OWNER associates the qemu process of the current guest with a vhost kernel thread:
```c
/* Caller should have device mutex */
static long vhost_dev_set_owner(struct vhost_dev *dev)
{
    struct task_struct *worker;
    int err;

    /* Is there an owner already? */
    if (dev->mm) {
        err = -EBUSY;
        goto err_mm;
    }

    /* No owner, become one */
    dev->mm = get_task_mm(current); /* grab the qemu process's mm_struct, i.e. the guest memory layout */
    worker = kthread_create(vhost_worker, dev, "vhost-%d", current->pid); /* create the vhost worker thread */
    if (IS_ERR(worker)) {
        err = PTR_ERR(worker);
        goto err_worker;
    }

    dev->worker = worker;
    wake_up_process(worker); /* avoid contributing to loadavg */

    err = vhost_attach_cgroups(dev);
    if (err)
        goto err_cgroup;

    err = vhost_dev_alloc_iovecs(dev); /* allocate the iovec arrays for each vhost_virtqueue */
    if (err)
        goto err_cgroup;

    return 0;
err_cgroup:
    kthread_stop(worker);
    dev->worker = NULL;
err_worker:
    if (dev->mm)
        mmput(dev->mm);
    dev->mm = NULL;
err_mm:
    return err;
}
```

VHOST_SET_MEM_TABLE initializes the vhost_memory member of vhost_dev:
```c
static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
{
    struct vhost_memory mem, *newmem, *oldmem;
    unsigned long size = offsetof(struct vhost_memory, regions);

    if (copy_from_user(&mem, m, size))
        return -EFAULT;
    if (mem.padding)
        return -EOPNOTSUPP;
    if (mem.nregions > VHOST_MEMORY_MAX_NREGIONS)
        return -E2BIG;

    newmem = kmalloc(size + mem.nregions * sizeof *m->regions, GFP_KERNEL); /* allocate the array of vhost_memory_regions */
    if (!newmem)
        return -ENOMEM;

    memcpy(newmem, &mem, size);
    if (copy_from_user(newmem->regions, m->regions,
                       mem.nregions * sizeof *m->regions)) {
        kfree(newmem);
        return -EFAULT;
    }

    if (!memory_access_ok(d, newmem, vhost_has_feature(d, VHOST_F_LOG_ALL))) {
        kfree(newmem);
        return -EFAULT;
    }
    oldmem = d->memory;
    rcu_assign_pointer(d->memory, newmem);
    synchronize_rcu();
    kfree(oldmem);
    return 0;
}
```

VHOST_GET_FEATURES and VHOST_SET_FEATURES read and write the features vhost supports; currently only the vhost_net module uses them:
```c
enum {
    VHOST_FEATURES = (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) |
                     (1ULL << VIRTIO_RING_F_INDIRECT_DESC) |
                     (1ULL << VIRTIO_RING_F_EVENT_IDX) |
                     (1ULL << VHOST_F_LOG_ALL) |
                     (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) |
                     (1ULL << VIRTIO_NET_F_MRG_RXBUF),
};

static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
                            unsigned long arg)
{
    ....
    case VHOST_GET_FEATURES:
        features = VHOST_FEATURES;
        if (copy_to_user(featurep, &features, sizeof features))
            return -EFAULT;
        return 0;
    case VHOST_SET_FEATURES:
        if (copy_from_user(&features, featurep, sizeof features))
            return -EFAULT;
        if (features & ~VHOST_FEATURES)
            return -EOPNOTSUPP;
        return vhost_net_set_features(n, features);
    ....
}
```

VHOST_SET_VRING_CALL sets up the irqfd used to inject interrupts into the guest.
VHOST_SET_VRING_KICK sets up the ioeventfd used to receive guest notifications:
```c
case VHOST_SET_VRING_KICK:
    if (copy_from_user(&f, argp, sizeof f)) {
        r = -EFAULT;
        break;
    }
    eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
    if (IS_ERR(eventfp)) {
        r = PTR_ERR(eventfp);
        break;
    }
    if (eventfp != vq->kick) { /* eventfp differs from vq->kick: stop polling vq->kick and start polling eventfp */
        pollstop = filep = vq->kick;
        pollstart = vq->kick = eventfp;
    } else
        filep = eventfp; /* same file, no stop & start needed */
    break;
case VHOST_SET_VRING_CALL:
    if (copy_from_user(&f, argp, sizeof f)) {
        r = -EFAULT;
        break;
    }
    eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
    if (IS_ERR(eventfp)) {
        r = PTR_ERR(eventfp);
        break;
    }
    if (eventfp != vq->call) { /* eventfp differs from vq->call: swap in the new call fd and its eventfd_ctx */
        filep = vq->call;
        ctx = vq->call_ctx;
        vq->call = eventfp;
        vq->call_ctx = eventfp ? eventfd_ctx_fileget(eventfp) : NULL;
    } else
        filep = eventfp;
    break;

/* ... after the switch statement: */
if (pollstop && vq->handle_kick)
    vhost_poll_stop(&vq->poll);
if (ctx)
    eventfd_ctx_put(ctx); /* after pollstop, release the previously held ctx */
if (filep)
    fput(filep);          /* after pollstop, release the previously held filep */
if (pollstart && vq->handle_kick)
    vhost_poll_start(&vq->poll, vq->kick);

mutex_unlock(&vq->mutex);
if (pollstop && vq->handle_kick)
    vhost_poll_flush(&vq->poll);
return r;
```

Now let's look at vhost's data path. vhost and the kvm module talk to each other through eventfds: kick events in the guest-to-host direction go through ioeventfd, and call events in the host-to-guest direction go through irqfd.
Host-to-guest direction
The host first fills the used ring; then, if KVM_IRQFD was set up successfully, the kvm module injects the interrupt into the guest through the irqfd. qemu sets up the kvm module's irqfd along the path virtio_pci_set_guest_notifiers -> kvm_virtio_pci_vector_use -> kvm_virtio_pci_irqfd_use -> kvm_irqchip_add_irqfd_notifier -> kvm_irqchip_assign_irqfd, which finally issues a kvm_vm_ioctl carrying the write fd and an optional read fd.
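The payload of that final ioctl is a struct kvm_irqfd. A minimal sketch of the assignment, assuming vm_fd is the open KVM VM fd, efd the call eventfd, and virq a GSI already routed to the queue's MSI-X vector (these names are placeholders, not qemu's):

```c
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Once this succeeds, any eventfd_signal() on efd (e.g. from the vhost
 * worker) makes kvm inject the interrupt for gsi virq into the guest
 * without ever waking qemu. */
static int assign_irqfd(int vm_fd, int efd, unsigned int virq)
{
    struct kvm_irqfd irqfd = {
        .fd  = efd,  /* the "call" eventfd written by vhost */
        .gsi = virq, /* set KVM_IRQFD_FLAG_DEASSIGN in .flags to undo */
    };

    return ioctl(vm_fd, KVM_IRQFD, &irqfd);
}
```

On the qemu side, kvm_virtio_pci_vector_use walks the queues and sets this up for each MSI-X vector: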
```c
static int kvm_virtio_pci_vector_use(VirtIOPCIProxy *proxy, int nvqs)
{
    PCIDevice *dev = &proxy->pci_dev;
    VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
    VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
    unsigned int vector;
    int ret, queue_no;
    MSIMessage msg;

    for (queue_no = 0; queue_no < nvqs; queue_no++) {
        if (!virtio_queue_get_num(vdev, queue_no)) {
            break;
        }
        vector = virtio_queue_vector(vdev, queue_no);
        if (vector >= msix_nr_vectors_allocated(dev)) {
            continue;
        }
        msg = msix_get_message(dev, vector);
        ret = kvm_virtio_pci_vq_vector_use(proxy, queue_no, vector, msg);
        if (ret < 0) {
            goto undo;
        }
        /* If guest supports masking, set up irqfd now.
         * Otherwise, delay until unmasked in the frontend. */
        if (k->guest_notifier_mask) {
            ret = kvm_virtio_pci_irqfd_use(proxy, queue_no, vector);
            if (ret < 0) {
                kvm_virtio_pci_vq_vector_release(proxy, vector);
                goto undo;
            }
        }
    }
    return 0;

undo:
    while (--queue_no >= 0) {
        vector = virtio_queue_vector(vdev, queue_no);
        if (vector >= msix_nr_vectors_allocated(dev)) {
            continue;
        }
        if (k->guest_notifier_mask) {
            kvm_virtio_pci_irqfd_release(proxy, queue_no, vector);
        }
        kvm_virtio_pci_vq_vector_release(proxy, vector);
    }
    return ret;
}
```

If the irqfd has not been set up, the guest notifier fd instead wakes the qemu process waiting on it, entering the registered handler virtio_queue_guest_notifier_read, which calls virtio_irq and eventually virtio_pci_notify:
```c
static void virtio_queue_guest_notifier_read(EventNotifier *n)
{
    VirtQueue *vq = container_of(n, VirtQueue, guest_notifier);
    if (event_notifier_test_and_clear(n)) {
        virtio_irq(vq);
    }
}

void virtio_irq(VirtQueue *vq)
{
    trace_virtio_irq(vq);
    vq->vdev->isr |= 0x01;
    virtio_notify_vector(vq->vdev, vq->vector);
}

static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector)
{
    BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);

    if (k->notify) {
        k->notify(qbus->parent, vector);
    }
}

static void virtio_pci_notify(DeviceState *d, uint16_t vector)
{
    VirtIOPCIProxy *proxy = to_virtio_pci_proxy_fast(d);

    if (msix_enabled(&proxy->pci_dev))
        msix_notify(&proxy->pci_dev, vector);
    else {
        VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
        pci_set_irq(&proxy->pci_dev, vdev->isr & 1);
    }
}
```

The whole flow is shown in the figure below (source: http://royluo.org/2014/08/22/vhost/).
Guest-to-host direction

The guest notifies the host by writing to the device's PCI I/O space (the queue-notify register); the write causes a VMEXIT, which kvm intercepts and converts into a notification on the registered fd.
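The guest side of the kick is tiny. In the legacy virtio-pci guest driver it is roughly the following (simplified from the guest kernel's vp_notify):

```c
/* Guest-side kick (legacy virtio-pci, simplified): writing the queue
 * index to the VIRTIO_PCI_QUEUE_NOTIFY register in the device's I/O
 * BAR causes a VMEXIT, which kvm matches against its ioeventfds. */
static void vp_notify(struct virtqueue *vq)
{
    struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);

    iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY);
}
```

qemu registers the ioeventfds that kvm should match through its MemoryListener machinery: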
```c
/* in kvm_init(): */
memory_listener_register(&kvm_memory_listener, &address_space_memory);
memory_listener_register(&kvm_io_listener, &address_space_io);

static MemoryListener kvm_memory_listener = {
    .region_add = kvm_region_add,
    .region_del = kvm_region_del,
    .log_start = kvm_log_start,
    .log_stop = kvm_log_stop,
    .log_sync = kvm_log_sync,
    .log_global_start = kvm_log_global_start,
    .log_global_stop = kvm_log_global_stop,
    .eventfd_add = kvm_mem_ioeventfd_add,
    .eventfd_del = kvm_mem_ioeventfd_del,
    .coalesced_mmio_add = kvm_coalesce_mmio_region,
    .coalesced_mmio_del = kvm_uncoalesce_mmio_region,
    .priority = 10,
};

static MemoryListener kvm_io_listener = {
    .eventfd_add = kvm_io_ioeventfd_add,
    .eventfd_del = kvm_io_ioeventfd_del,
    .priority = 10,
};

static void kvm_io_ioeventfd_add(MemoryListener *listener,
                                 MemoryRegionSection *section,
                                 bool match_data, uint64_t data,
                                 EventNotifier *e)
{
    int fd = event_notifier_get_fd(e);
    int r;

    r = kvm_set_ioeventfd_pio(fd, section->offset_within_address_space,
                              data, true, int128_get64(section->size),
                              match_data);
    if (r < 0) {
        fprintf(stderr, "%s: error adding ioeventfd: %s\n",
                __func__, strerror(-r));
        abort();
    }
}
```

kvm_io_ioeventfd_add ends up in kvm_set_ioeventfd_pio, which calls kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick) and thereby enters kvm.ko:
```c
static int kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint16_t val,
                                 bool assign, uint32_t size, bool datamatch)
{
    struct kvm_ioeventfd kick = {
        .datamatch = datamatch ? val : 0,
        .addr = addr,
        .flags = KVM_IOEVENTFD_FLAG_PIO,
        .len = size,
        .fd = fd,
    };
    int r;

    if (!kvm_enabled()) {
        return -ENOSYS;
    }
    if (datamatch) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DATAMATCH;
    }
    if (!assign) {
        kick.flags |= KVM_IOEVENTFD_FLAG_DEASSIGN;
    }
    r = kvm_vm_ioctl(kvm_state, KVM_IOEVENTFD, &kick);
    if (r < 0) {
        return r;
    }
    return 0;
}
```

The KVM_IOEVENTFD ioctl lands in kvm's kvm_ioeventfd function, which dispatches to kvm_assign_ioeventfd or kvm_deassign_ioeventfd:
```c
int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
    if (args->flags & KVM_IOEVENTFD_FLAG_DEASSIGN)
        return kvm_deassign_ioeventfd(kvm, args);

    return kvm_assign_ioeventfd(kvm, args);
}

static int kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
{
    int pio = args->flags & KVM_IOEVENTFD_FLAG_PIO;
    enum kvm_bus bus_idx = pio ? KVM_PIO_BUS : KVM_MMIO_BUS;
    struct _ioeventfd *p;        /* ioeventfd: translate a PIO/MMIO memory write to an eventfd signal. */
    struct eventfd_ctx *eventfd; /* mostly wait_queue_head_t */
    int ret;

    /* must be natural-word sized */
    switch (args->len) {
    case 1:
    case 2:
    case 4:
    case 8:
        break;
    default:
        return -EINVAL;
    }

    /* check for range overflow */
    if (args->addr + args->len < args->addr)
        return -EINVAL;

    /* check for extra flags that we don't understand */
    if (args->flags & ~KVM_IOEVENTFD_VALID_FLAG_MASK)
        return -EINVAL;

    eventfd = eventfd_ctx_fdget(args->fd); /* file->private_data */
    if (IS_ERR(eventfd))
        return PTR_ERR(eventfd);

    p = kzalloc(sizeof(*p), GFP_KERNEL); /* allocate an _ioeventfd and tie the address, length and eventfd_ctx to it */
    if (!p) {
        ret = -ENOMEM;
        goto fail;
    }

    INIT_LIST_HEAD(&p->list);
    p->addr = args->addr;
    p->length = args->len;
    p->eventfd = eventfd;

    /* The datamatch feature is optional, otherwise this is a wildcard */
    if (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH)
        p->datamatch = args->datamatch;
    else
        p->wildcard = true;

    mutex_lock(&kvm->slots_lock);

    /* Verify that there isn't a match already */
    if (ioeventfd_check_collision(kvm, p)) {
        ret = -EEXIST;
        goto unlock_fail;
    }

    kvm_iodevice_init(&p->dev, &ioeventfd_ops);

    ret = kvm_io_bus_register_dev(kvm, bus_idx, &p->dev); /* register on kvm's pio or mmio bus */
    if (ret < 0)
        goto unlock_fail;

    list_add_tail(&p->list, &kvm->ioeventfds); /* add to kvm.ko's ioeventfds list */

    mutex_unlock(&kvm->slots_lock);
    return 0;

unlock_fail:
    mutex_unlock(&kvm->slots_lock);
fail:
    kfree(p);
    eventfd_ctx_put(eventfd);
    return ret;
}
```

kvm_assign_ioeventfd registers a pio/mmio address range together with an fd; a VMEXIT caused by an access to that range is then converted inside kvm.ko into an event notification on the fd:
```c
static const struct kvm_io_device_ops ioeventfd_ops = {
    .write      = ioeventfd_write,
    .destructor = ioeventfd_destructor,
};

/* MMIO/PIO writes trigger an event if the addr/val match */
static int ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
                           const void *val)
{
    struct _ioeventfd *p = to_ioeventfd(this);

    if (!ioeventfd_in_range(p, addr, len, val))
        return -EOPNOTSUPP;

    eventfd_signal(p->eventfd, 1);
    return 0;
}
```

In the end the event notification is delivered through eventfd_signal, which wakes the vhost thread; the overall flow is shown in the figure below.
The vhost control plane and data plane are summarized in the figure below.
Finally, taking vhost-net as an example, let's walk through vhost initialization and the packet transmit/receive flow, e.g.
qemu selects a vhost backend at network-device creation time via -netdev tap,...,vhost=on (an example command line is shown below). For each vhost queue, net_init_tap calls net_init_tap_one to initialize vhost; the actual initialization is done by vhost_net_init.
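For reference, a typical invocation that turns the kernel vhost backend on looks like the following; the id and device names are arbitrary placeholders, and multiqueue tap would additionally pass queues=N on the -netdev and mq=on on the -device:

```
qemu-system-x86_64 ... \
    -netdev tap,id=hostnet0,vhost=on \
    -device virtio-net-pci,netdev=hostnet0
```

The relevant qemu code: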
```c
typedef struct VhostNetOptions {
    VhostBackendType backend_type; /* vhost kernel or userspace */
    NetClientState *net_backend;   /* TAPState device */
    void *opaque;                  /* the vhostfd for ioctls, i.e. /dev/vhost-net */
    bool force;
} VhostNetOptions;

static int net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer,
                            const char *model, const char *name,
                            const char *ifname, const char *script,
                            const char *downscript, const char *vhostfdname,
                            int vnet_hdr, int fd)
{
    ...
    if (tap->has_vhost ? tap->vhost :
        vhostfdname || (tap->has_vhostforce && tap->vhostforce)) {
        VhostNetOptions options;

        options.backend_type = VHOST_BACKEND_TYPE_KERNEL;
        options.net_backend = &s->nc;
        options.force = tap->has_vhostforce && tap->vhostforce;

        if ((tap->has_vhostfd || tap->has_vhostfds)) {
            vhostfd = monitor_handle_fd_param(cur_mon, vhostfdname);
            if (vhostfd == -1) {
                return -1;
            }
        } else {
            vhostfd = open("/dev/vhost-net", O_RDWR); /* open /dev/vhost-net for ioctl usage */
            if (vhostfd < 0) {
                error_report("tap: open vhost char device failed: %s",
                             strerror(errno));
                return -1;
            }
        }
        qemu_set_cloexec(vhostfd);
        options.opaque = (void *)(uintptr_t)vhostfd;

        s->vhost_net = vhost_net_init(&options); /* initialize struct vhost_net */
        if (!s->vhost_net) {
            error_report("vhost-net requested but could not be initialized");
            return -1;
        }
    }
    ...
}

struct vhost_net {
    struct vhost_dev dev;
    struct vhost_virtqueue vqs[2];
    int backend;
    NetClientState *nc;
};

struct vhost_net *vhost_net_init(VhostNetOptions *options)
{
    int r;
    bool backend_kernel = options->backend_type == VHOST_BACKEND_TYPE_KERNEL;
    struct vhost_net *net = g_malloc(sizeof *net);

    if (!options->net_backend) {
        fprintf(stderr, "vhost-net requires net backend to be setup\n");
        goto fail;
    }

    if (backend_kernel) {
        r = vhost_net_get_fd(options->net_backend);
        if (r < 0) {
            goto fail;
        }
        net->dev.backend_features = qemu_has_vnet_hdr(options->net_backend)
            ? 0 : (1 << VHOST_NET_F_VIRTIO_NET_HDR);
        net->backend = r;           /* backend is the fd of the NetClientState */
    } else {
        net->dev.backend_features = 0;
        net->backend = -1;
    }
    net->nc = options->net_backend; /* nc points to the NetClientState */

    net->dev.nvqs = 2;              /* one TX queue and one RX queue */
    net->dev.vqs = net->vqs;        /* vhost_dev and vhost_net share the vhost_virtqueues */

    r = vhost_dev_init(&net->dev, options->opaque, options->backend_type,
                       options->force); /* initialize vhost_dev; the VHOST_SET_OWNER ioctl issued here creates the vhost kthread */
    if (r < 0) {
        goto fail;
    }
    if (!qemu_has_vnet_hdr_len(options->net_backend,
                               sizeof(struct virtio_net_hdr_mrg_rxbuf))) {
        net->dev.features &= ~(1 << VIRTIO_NET_F_MRG_RXBUF);
    }
    if (backend_kernel) {
        if (~net->dev.features & net->dev.backend_features) {
            fprintf(stderr, "vhost lacks feature mask %" PRIu64
                    " for backend\n",
                    (uint64_t)(~net->dev.features & net->dev.backend_features));
            vhost_dev_cleanup(&net->dev);
            goto fail;
        }
    }

    /* Set sane init value. Override when guest acks. */
    vhost_net_ack_features(net, 0);

    return net;

fail:
    g_free(net);
    return NULL;
}
```

Once the guest has booted, qemu configures vhost accordingly: virtio_net_set_status enables/disables the virtio-net device and its queues, calling vhost_net_start to bring the vhost queues up and vhost_net_stop to tear them down:
```c
int vhost_net_start(VirtIODevice *dev, NetClientState *ncs, int total_queues)
{
    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(dev)));
    VirtioBusState *vbus = VIRTIO_BUS(qbus);
    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(vbus);
    int r, e, i;

    if (!vhost_net_device_endian_ok(dev)) {
        error_report("vhost-net does not support cross-endian");
        r = -ENOSYS;
        goto err;
    }

    if (!k->set_guest_notifiers) {
        error_report("binding does not support guest notifiers");
        r = -ENOSYS;
        goto err;
    }

    for (i = 0; i < total_queues; i++) {
        vhost_net_set_vq_index(get_vhost_net(ncs[i].peer), i * 2);
    }

    /* calls virtio_pci_set_guest_notifiers to configure the irqfds etc.;
     * qemu goes through here even when vhost is not enabled */
    r = k->set_guest_notifiers(qbus->parent, total_queues * 2, true);
    if (r < 0) {
        error_report("Error binding guest notifier: %d", -r);
        goto err;
    }

    /* with a multiqueue tun device there are multiple NetClientStates, one
     * per tap queue, and each NetClientState has its own vhost_net struct */
    for (i = 0; i < total_queues; i++) {
        r = vhost_net_start_one(get_vhost_net(ncs[i].peer), dev); /* call vhost_net_start_one for each queue */
        if (r < 0) {
            goto err_start;
        }
    }

    return 0;

err_start:
    while (--i >= 0) {
        vhost_net_stop_one(get_vhost_net(ncs[i].peer), dev);
    }
    e = k->set_guest_notifiers(qbus->parent, total_queues * 2, false);
    if (e < 0) {
        fprintf(stderr, "vhost guest notifier cleanup failed: %d\n", e);
        fflush(stderr);
    }
err:
    return r;
}
```
```c
static int vhost_net_start_one(struct vhost_net *net, VirtIODevice *dev)
{
    struct vhost_vring_file file = { };
    int r;

    if (net->dev.started) {
        return 0;
    }

    net->dev.nvqs = 2;       /* vqs hold one TX virtqueue and one RX virtqueue */
    net->dev.vqs = net->vqs;

    /* enable the vhost ioeventfds: stop handling the guest's I/O
     * notifications in qemu, start handling them in vhost */
    r = vhost_dev_enable_notifiers(&net->dev, dev);
    if (r < 0) {
        goto fail_notifiers;
    }

    r = vhost_dev_start(&net->dev, dev);
    if (r < 0) {
        goto fail_start;
    }

    if (net->nc->info->poll) {
        net->nc->info->poll(net->nc, false);
    }

    if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) {
        qemu_set_fd_handler(net->backend, NULL, NULL, NULL);
        file.fd = net->backend;
        for (file.index = 0; file.index < net->dev.nvqs; ++file.index) {
            const VhostOps *vhost_ops = net->dev.vhost_ops;
            r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND, &file);
            if (r < 0) {
                r = -errno;
                goto fail;
            }
        }
    }
    return 0;

fail:
    file.fd = -1;
    if (net->nc->info->type == NET_CLIENT_OPTIONS_KIND_TAP) {
        while (file.index-- > 0) {
            const VhostOps *vhost_ops = net->dev.vhost_ops;
            int r = vhost_ops->vhost_call(&net->dev, VHOST_NET_SET_BACKEND, &file);
            assert(r >= 0);
        }
    }
    if (net->nc->info->poll) {
        net->nc->info->poll(net->nc, true);
    }
    vhost_dev_stop(&net->dev, dev);
fail_start:
    vhost_dev_disable_notifiers(&net->dev, dev);
fail_notifiers:
    return r;
}
```

The relationships among the vhost-net data structures are shown in the figure below.
Now let's look at the kernel's definition of vhost_net, e.g.
```c
static const struct file_operations vhost_net_fops = {
    .owner          = THIS_MODULE,
    .release        = vhost_net_release,
    .unlocked_ioctl = vhost_net_ioctl,
#ifdef CONFIG_COMPAT
    .compat_ioctl   = vhost_net_compat_ioctl,
#endif
    .open           = vhost_net_open,
};

static struct miscdevice vhost_net_misc = {
    MISC_DYNAMIC_MINOR,
    "vhost-net",
    &vhost_net_fops,
};

enum {
    VHOST_NET_VQ_RX = 0,
    VHOST_NET_VQ_TX = 1,
    VHOST_NET_VQ_MAX = 2,
};

enum vhost_net_poll_state {
    VHOST_NET_POLL_DISABLED = 0,
    VHOST_NET_POLL_STARTED = 1,
    VHOST_NET_POLL_STOPPED = 2,
};

struct vhost_net {
    struct vhost_dev dev;
    struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX]; /* vhost's virtqueue wrappers; their handle_kick callbacks are woken by the ioeventfd */
    struct vhost_poll poll[VHOST_NET_VQ_MAX];     /* two vhost_poll structures for the NetClientState's socket I/O */
    /* Tells us whether we are polling a socket for TX.
     * We only do this when socket buffer fills up.
     * Protected by tx vq lock. */
    enum vhost_net_poll_state tx_poll_state;
};

static int vhost_net_open(struct inode *inode, struct file *f)
{
    struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
    struct vhost_dev *dev;
    int r;

    if (!n)
        return -ENOMEM;

    dev = &n->dev;
    n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick; /* callback for the TX virtqueue's kick fd */
    n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick; /* callback for the RX virtqueue's kick fd */
    r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
    if (r < 0) {
        kfree(n);
        return r;
    }

    vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, POLLOUT, dev); /* initialize vhost_net's TX vhost_poll */
    vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);  /* initialize vhost_net's RX vhost_poll */
    n->tx_poll_state = VHOST_NET_POLL_DISABLED;

    f->private_data = n;

    return 0;
}
```
The implementations of handle_tx_kick/handle_rx_kick and handle_tx_net/handle_rx_net are essentially identical (each pair just calls handle_tx/handle_rx), so why have two sets of functions? The code analysis below gives the answer, but here is the spoiler: handle_tx_kick/handle_rx_kick are the callbacks that block on the TX/RX queue's kick fd, while handle_tx_net/handle_rx_net are the callbacks that block on vhost_net's TX/RX poll. For both TX and RX, a packet's path is a two-stage process, e.g.
For TX, stage one is the kick on the virtqueue's fd; the vring buffers are then handed over and finally sent out through the NetClientState's socket fd. The socket, however, may run out of send-buffer space, or this round's quota may be used up, in which case vhost has to poll and block on the socket fd instead. RX works the same way in reverse: stage one blocks on the socket fd, stage two on the virtqueue kick fd.
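On the TX side the switch between the two stages is done by a pair of small helpers; in the kernel version this article follows they look roughly like this:

```c
/* Caller must hold the TX vq mutex. Stop polling the socket; from now
 * on handle_tx runs off the virtqueue kick fd only. */
static void tx_poll_stop(struct vhost_net *net)
{
    if (likely(net->tx_poll_state != VHOST_NET_POLL_STARTED))
        return;
    vhost_poll_stop(net->poll + VHOST_NET_VQ_TX);
    net->tx_poll_state = VHOST_NET_POLL_STOPPED;
}

/* Caller must hold the TX vq mutex. Start polling the socket so that
 * handle_tx_net is called back once it becomes writable again. */
static void tx_poll_start(struct vhost_net *net, struct socket *sock)
{
    if (unlikely(net->tx_poll_state != VHOST_NET_POLL_STOPPED))
        return;
    vhost_poll_start(net->poll + VHOST_NET_VQ_TX, sock->file);
    net->tx_poll_state = VHOST_NET_POLL_STARTED;
}
```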
When the guest transmits a packet, the ioeventfd fires on the vhost_virtqueue's kick fd; the POLLIN event invokes vhost_poll_wakeup, which wakes the vhost worker thread, and the worker calls the registered handle_kick function, i.e. handle_tx_kick.
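The plumbing between the eventfd and the handler is worth a quick sketch. Simplified from the vhost core of the same era (the dequeue_work_or_sleep helper below is a made-up stand-in for the real work_list/work_lock loop):

```c
/* Sits on the kick fd's wait queue; when the eventfd is signalled,
 * queue the corresponding vhost_work and wake the worker kthread. */
static int vhost_poll_wakeup(wait_queue_t *wait, unsigned mode, int sync,
                             void *key)
{
    struct vhost_poll *poll = container_of(wait, struct vhost_poll, wait);

    if (!((unsigned long)key & poll->mask))
        return 0;
    vhost_poll_queue(poll);
    return 0;
}

/* The per-device kthread created by VHOST_SET_OWNER: pop vhost_work
 * items off the device's work list and run them. */
static int vhost_worker(void *data)
{
    struct vhost_dev *dev = data;
    struct vhost_work *work;

    use_mm(dev->mm); /* adopt qemu's mm so guest memory is addressable */
    for (;;) {
        work = dequeue_work_or_sleep(dev); /* hypothetical helper */
        if (work)
            work->fn(work); /* e.g. handle_tx_kick */
    }
}
```

The worker then runs the TX handler proper: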
```c
static void handle_tx_kick(struct vhost_work *work)
{
    struct vhost_virtqueue *vq = container_of(work, struct vhost_virtqueue,
                                              poll.work);
    struct vhost_net *net = container_of(vq->dev, struct vhost_net, dev);

    handle_tx(net);
}

static void handle_tx(struct vhost_net *net)
{
    struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
    unsigned out, in, s;
    int head;
    struct msghdr msg = {
        .msg_name = NULL,
        .msg_namelen = 0,
        .msg_control = NULL,
        .msg_controllen = 0,
        .msg_iov = vq->iov,
        .msg_flags = MSG_DONTWAIT,
    };
    size_t len, total_len = 0;
    int err, wmem;
    size_t hdr_size;
    struct vhost_ubuf_ref *uninitialized_var(ubufs);
    bool zcopy;

    struct socket *sock = rcu_dereference(vq->private_data); /* the NetClientState's socket is stashed in the vhost_virtqueue as private_data */
    if (!sock)
        return;

    wmem = atomic_read(&sock->sk->sk_wmem_alloc);
    if (wmem >= sock->sk->sk_sndbuf) { /* allocated socket write memory already exceeds the send buffer */
        mutex_lock(&vq->mutex);
        tx_poll_start(net, sock); /* cannot send now; block waiting on the socket */
        mutex_unlock(&vq->mutex);
        return;
    }

    mutex_lock(&vq->mutex);
    vhost_disable_notify(&net->dev, vq); /* disable virtqueue notifications via the VRING_USED_F_NO_NOTIFY flag */

    if (wmem < sock->sk->sk_sndbuf / 2)
        tx_poll_stop(net);
    hdr_size = vq->vhost_hlen;
    zcopy = vq->ubufs;

    for (;;) {
        /* Release DMAs done buffers first */
        if (zcopy)
            vhost_zerocopy_signal_used(vq);

        head = vhost_get_vq_desc(&net->dev, vq, vq->iov, /* copy avail descriptors over, starting at last_avail_idx */
                                 ARRAY_SIZE(vq->iov),
                                 &out, &in,
                                 NULL, NULL);
        /* On error, stop handling until the next kick. */
        if (unlikely(head < 0))
            break;
        /* Nothing new? Wait for eventfd to tell us they refilled. */
        if (head == vq->num) { /* vq->avail_idx == vq->last_avail_idx: no new buffers from the frontend */
            int num_pends;

            wmem = atomic_read(&sock->sk->sk_wmem_alloc);
            if (wmem >= sock->sk->sk_sndbuf * 3 / 4) {
                tx_poll_start(net, sock);
                set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
                break;
            }
            /* If more outstanding DMAs, queue the work.
             * Handle upend_idx wrap around
             */
            num_pends = likely(vq->upend_idx >= vq->done_idx) ?
                        (vq->upend_idx - vq->done_idx) :
                        (vq->upend_idx + UIO_MAXIOV - vq->done_idx);
            if (unlikely(num_pends > VHOST_MAX_PEND)) {
                tx_poll_start(net, sock);
                set_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
                break;
            }
            if (unlikely(vhost_enable_notify(&net->dev, vq))) { /* re-enable the event notify flag via vhost_enable_notify */
                vhost_disable_notify(&net->dev, vq); /* vhost_enable_notify returned true: avail_idx changed meanwhile, so retry */
                continue;
            }
            break;
        }
        if (in) { /* TX descriptors should all be out */
            vq_err(vq, "Unexpected descriptor format for TX: "
                   "out %d, int %d\n", out, in);
            break;
        }
        /* Skip header. TODO: support TSO. */
        s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out); /* the first hdr_size bytes are VNET_HDR metadata, not packet payload */
        msg.msg_iovlen = out;
        len = iov_length(vq->iov, out);
        /* Sanity check */
        if (!len) {
            vq_err(vq, "Unexpected header len for TX: "
                   "%zd expected %zd\n",
                   iov_length(vq->hdr, s), hdr_size);
            break;
        }
        /* use msg_control to pass vhost zerocopy ubuf info to skb */
        if (zcopy) {
            vq->heads[vq->upend_idx].id = head;
            if (len < VHOST_GOODCOPY_LEN) {
                /* copy don't need to wait for DMA done */
                vq->heads[vq->upend_idx].len = VHOST_DMA_DONE_LEN;
                msg.msg_control = NULL;
                msg.msg_controllen = 0;
                ubufs = NULL;
            } else {
                struct ubuf_info *ubuf = &vq->ubuf_info[head];

                vq->heads[vq->upend_idx].len = len;
                ubuf->callback = vhost_zerocopy_callback;
                ubuf->arg = vq->ubufs;
                ubuf->desc = vq->upend_idx;
                msg.msg_control = ubuf;
                msg.msg_controllen = sizeof(ubuf);
                ubufs = vq->ubufs;
                kref_get(&ubufs->kref);
            }
            vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
        }
        /* TODO: Check specific error and bomb out unless ENOBUFS? */
        err = sock->ops->sendmsg(NULL, sock, &msg, len);
        if (unlikely(err < 0)) {
            if (zcopy) {
                if (ubufs)
                    vhost_ubuf_put(ubufs);
                vq->upend_idx = ((unsigned)vq->upend_idx - 1) % UIO_MAXIOV;
            }
            vhost_discard_vq_desc(vq, 1); /* send failed; roll back last_avail_idx */
            if (err == -EAGAIN || err == -ENOBUFS)
                tx_poll_start(net, sock); /* block on vhost_net->poll and retry the send later */
            break;
        }
        if (err != len)
            pr_debug("Truncated TX packet: "
                     " len %d != %zd\n", err, len);
        if (!zcopy)
            vhost_add_used_and_signal(&net->dev, vq, head, 0); /* update the used ring, e.g. used_elem and last_used_idx */
        else
            vhost_zerocopy_signal_used(vq);
        total_len += len;
        if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
            vhost_poll_queue(&vq->poll); /* quota exceeded; requeue the work and yield */
            break;
        }
    }

    mutex_unlock(&vq->mutex);
}
```

On the receive side, vhost first blocks on the NetClientState's socket, e.g.
```c
vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, POLLIN, dev);
```
```c
static void handle_rx_net(struct vhost_work *work)
{
    struct vhost_net *net = container_of(work, struct vhost_net,
                                         poll[VHOST_NET_VQ_RX].work);
    handle_rx(net);
}

static void handle_rx(struct vhost_net *net)
{
    struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
    unsigned uninitialized_var(in), log;
    struct vhost_log *vq_log;
    struct msghdr msg = {
        .msg_name = NULL,
        .msg_namelen = 0,
        .msg_control = NULL, /* FIXME: get and handle RX aux data. */
        .msg_controllen = 0,
        .msg_iov = vq->iov,
        .msg_flags = MSG_DONTWAIT,
    };
    struct virtio_net_hdr_mrg_rxbuf hdr = {
        .hdr.flags = 0,
        .hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
    };
    size_t total_len = 0;
    int err, headcount, mergeable;
    size_t vhost_hlen, sock_hlen;
    size_t vhost_len, sock_len;

    struct socket *sock = rcu_dereference(vq->private_data);
    if (!sock)
        return;

    mutex_lock(&vq->mutex);
    vhost_disable_notify(&net->dev, vq); /* disable the virtqueue event-notify mechanism */
    vhost_hlen = vq->vhost_hlen;
    sock_hlen = vq->sock_hlen;

    vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
             vq->log : NULL;
    mergeable = vhost_has_feature(&net->dev, VIRTIO_NET_F_MRG_RXBUF);

    while ((sock_len = peek_head_len(sock->sk))) { /* length of the next packet */
        sock_len += sock_hlen;
        vhost_len = sock_len + vhost_hlen;
        headcount = get_rx_bufs(vq, vq->heads, vhost_len, /* get_rx_bufs grabs avail descriptors from the virtqueue until */
                                &in, vq_log, &log,        /* the iovecs together can hold the next packet; equivalent to */
                                likely(mergeable) ? UIO_MAXIOV : 1); /* calling vhost_get_vq_desc several times */
        /* On error, stop handling until the next kick. */
        if (unlikely(headcount < 0))
            break;
        /* OK, now we need to know about added descriptors. */
        if (!headcount) {
            if (unlikely(vhost_enable_notify(&net->dev, vq))) {
                /* They have slipped one in as we were
                 * doing that: check again. */
                vhost_disable_notify(&net->dev, vq);
                continue;
            }
            /* Nothing new? Wait for eventfd to tell us
             * they refilled. */
            break;
        }
        /* We don't need to be notified again. */
        if (unlikely((vhost_hlen)))
            /* Skip header. TODO: support TSO. */
            move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
        else
            /* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
             * needed because sendmsg can modify msg_iov. */
            copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
        msg.msg_iovlen = in;
        err = sock->ops->recvmsg(NULL, sock, &msg,
                                 sock_len, MSG_DONTWAIT | MSG_TRUNC); /* the packet is received into the virtqueue's iov */
        /* Userspace might have consumed the packet meanwhile:
         * it's not supposed to do this usually, but might be hard
         * to prevent. Discard data we got (if any) and keep going. */
        if (unlikely(err != sock_len)) {
            pr_debug("Discarded rx packet: "
                     " len %d, expected %zd\n", err, sock_len);
            vhost_discard_vq_desc(vq, headcount); /* roll back the consumed descriptors */
            continue;
        }
        if (unlikely(vhost_hlen) &&
            memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
                              vhost_hlen)) {
            vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
                   vq->iov->iov_base);
            break;
        }
        /* TODO: Should check and handle checksum. */
        if (likely(mergeable) &&
            memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
                              offsetof(typeof(hdr), num_buffers),
                              sizeof hdr.num_buffers)) {
            vq_err(vq, "Failed num_buffers write");
            vhost_discard_vq_desc(vq, headcount);
            break;
        }
        vhost_add_used_and_signal_n(&net->dev, vq, vq->heads, headcount); /* add multiple vring_used_elems and notify the frontend */
        if (unlikely(vq_log))
            vhost_log_write(vq, vq_log, log, vhost_len);
        total_len += vhost_len;
        if (unlikely(total_len >= VHOST_NET_WEIGHT)) {
            vhost_poll_queue(&vq->poll); /* quota exceeded; requeue and wait. Note this queues the vq's poll, so next time handle_rx_kick runs */
            break;
        }
    }

    mutex_unlock(&vq->mutex);
}
```
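To close the loop: the signalling half of vhost_add_used_and_signal/vhost_add_used_and_signal_n ends up in vhost_signal, which simply writes the call eventfd installed by VHOST_SET_VRING_CALL; kvm's irqfd then turns that write into a guest interrupt, which is exactly the host-to-guest path described earlier. A simplified sketch from the same-era vhost core:

```c
/* Signal the guest, if it wants to know: write the call eventfd and
 * let kvm's irqfd (KVM_IRQFD) inject the interrupt. */
void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq)
{
    __u16 flags;

    /* Flush used-index updates before reading the interrupt-suppression
     * flag; paired with the guest's barrier when enabling interrupts. */
    smp_mb();
    if (__get_user(flags, &vq->avail->flags)) {
        vq_err(vq, "Failed to get flags");
        return;
    }
    if (!(flags & VRING_AVAIL_F_NO_INTERRUPT) && vq->call_ctx)
        eventfd_signal(vq->call_ctx, 1);
}
```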