和内核一样,qemu也需要支持virtqueue,VirtQueue的定义如下
#define VIRTIO_PCI_VRING_ALIGN 4096 typedef struct VRingDesc { uint64_t addr; uint32_t len; uint16_t flags; uint16_t next; } VRingDesc; typedef struct VRingAvail { uint16_t flags; uint16_t idx; uint16_t ring[0]; } VRingAvail; typedef struct VRingUsedElem { uint32_t id; uint32_t len; } VRingUsedElem; typedef struct VRingUsed { uint16_t flags; uint16_t idx; VRingUsedElem ring[0]; } VRingUsed; typedef struct VRing { unsigned int num; unsigned int align; hwaddr desc; hwaddr avail; hwaddr used; } VRing; struct VirtQueue { VRing vring; /* vring的元数据 */ hwaddr pa; /* vring实际的内存地址 */ uint16_t last_avail_idx; /* Last used index value we have signalled on */ uint16_t signalled_used; /* Last used index value we have signalled on */ bool signalled_used_valid; /* Notification enabled? */ bool notification; uint16_t queue_index; int inuse; uint16_t vector; void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq); VirtIODevice *vdev; EventNotifier guest_notifier; EventNotifier host_notifier; };可以看出VRing结构体的定义,qemu和内核在ABI上是一致的。virtqueue_init用于初始化vring的元数据,同时qemu提供了一系列接口来读写vring的不同成员,e.g.
static inline uint64_t vring_desc_addr(VirtIODevice *vdev, hwaddr desc_pa, int i) /* 读取第i个VRingDesc的addr地址 */ { hwaddr pa; pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr); return virtio_ldq_phys(vdev, pa); } static inline uint32_t vring_desc_len(VirtIODevice *vdev, hwaddr desc_pa, int i) /* 读取第i个VRingDesc的len */ { hwaddr pa; pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len); return virtio_ldl_phys(vdev, pa); } static inline uint16_t vring_desc_flags(VirtIODevice *vdev, hwaddr desc_pa, /* 读取第i个VRingDesc的flags */ int i) { hwaddr pa; pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags); return virtio_lduw_phys(vdev, pa); } static inline uint16_t vring_desc_next(VirtIODevice *vdev, hwaddr desc_pa, /* 读取第i个VRingDesc的next索引 */ int i) { hwaddr pa; pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next); return virtio_lduw_phys(vdev, pa); } static inline uint16_t vring_avail_flags(VirtQueue *vq) /* 读取avail ring的flags */ { hwaddr pa; pa = vq->vring.avail + offsetof(VRingAvail, flags); return virtio_lduw_phys(vq->vdev, pa); } static inline uint16_t vring_avail_idx(VirtQueue *vq) /* 读取avail ring的idx */ { hwaddr pa; pa = vq->vring.avail + offsetof(VRingAvail, idx); return virtio_lduw_phys(vq->vdev, pa); } static inline uint16_t vring_avail_ring(VirtQueue *vq, int i) /* 读取avail ring的第i个idx */ { hwaddr pa; pa = vq->vring.avail + offsetof(VRingAvail, ring[i]); return virtio_lduw_phys(vq->vdev, pa); } static inline uint16_t vring_used_event(VirtQueue *vq) /* 读取avail ring中保存的used_event_idx */ { return vring_avail_ring(vq, vq->vring.num); } static inline void vring_used_ring_id(VirtQueue *vq, int i, uint32_t val) /* 修改used ring中第i个elem的id */ { hwaddr pa; pa = vq->vring.used + offsetof(VRingUsed, ring[i].id); virtio_stl_phys(vq->vdev, pa, val); } static inline void vring_used_ring_len(VirtQueue *vq, int i, uint32_t val) /* 修改used ring中第i个elem的len */ { hwaddr pa; pa = vq->vring.used + offsetof(VRingUsed, ring[i].len); 
virtio_stl_phys(vq->vdev, pa, val); } static uint16_t vring_used_idx(VirtQueue *vq) /* 读取used ring中的idx */ { hwaddr pa; pa = vq->vring.used + offsetof(VRingUsed, idx); return virtio_lduw_phys(vq->vdev, pa); } static inline void vring_used_idx_set(VirtQueue *vq, uint16_t val) /* 设置used ring中的idx */ { hwaddr pa; pa = vq->vring.used + offsetof(VRingUsed, idx); virtio_stw_phys(vq->vdev, pa, val); } static inline void vring_used_flags_set_bit(VirtQueue *vq, int mask) /* 设置used ring中flags的bit位 */ { VirtIODevice *vdev = vq->vdev; hwaddr pa; pa = vq->vring.used + offsetof(VRingUsed, flags); virtio_stw_phys(vdev, pa, virtio_lduw_phys(vdev, pa) | mask); } static inline void vring_used_flags_unset_bit(VirtQueue *vq, int mask) /* 清理used ring中flags的bit位 */ { VirtIODevice *vdev = vq->vdev; hwaddr pa; pa = vq->vring.used + offsetof(VRingUsed, flags); virtio_stw_phys(vdev, pa, virtio_lduw_phys(vdev, pa) & ~mask); }同时后端也提供了一系列接口来处理used ring,e.g.
virtqueue_pop主要用于从descriptor table中找到available ring中添加的buffer,即guest新添加并让后端处理的buffer
/*
 * Take the next buffer chain that was added to the avail ring and describe it
 * in elem.
 *
 * Walks the descriptor chain starting at the head index taken from the avail
 * ring (following an indirect table if the head descriptor carries
 * VRING_DESC_F_INDIRECT), records each descriptor's address and length in
 * elem's in/out arrays depending on VRING_DESC_F_WRITE, then maps them via
 * virtqueue_map_sg().
 *
 * Returns 0 when virtqueue_num_heads() reports nothing new, otherwise
 * elem->in_num + elem->out_num. Malformed rings terminate the process through
 * error_report() + exit(1).
 */
int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
{
    unsigned int i, head, max;
    hwaddr desc_pa = vq->vring.desc;
    VirtIODevice *vdev = vq->vdev;

    /*
     * Per the original note: compares vring_avail_idx(vq) with
     * vq->last_avail_idx to see whether the avail idx has advanced; 0 means
     * the avail ring holds no new buffer, so return right away.
     */
    if (!virtqueue_num_heads(vq, vq->last_avail_idx)) {
        return 0;
    }

    /* When we start there are none of either input nor output. */
    elem->out_num = elem->in_num = 0;

    max = vq->vring.num;

    /* Descriptor index the avail ring holds at last_avail_idx; then advance. */
    i = head = virtqueue_get_head(vq, vq->last_avail_idx++);
    /* Only when the guest enabled VIRTIO_RING_F_EVENT_IDX. */
    if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) {
        /* Per the original note: set avail_event_idx to the latest avail idx. */
        vring_avail_event(vq, vring_avail_idx(vq));
    }

    /* The head descriptor points at an indirect descriptor table. */
    if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
        /* The indirect length must be a whole multiple of sizeof(VRingDesc). */
        if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
            error_report("Invalid size for indirect buffer table");
            exit(1);
        }

        /* loop over the indirect descriptor table */
        max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
        /* From here on desc_pa is the indirect VRingDesc array. */
        desc_pa = vring_desc_addr(vdev, desc_pa, i);
        i = 0;
    }

    /* Collect all the descriptors */
    do {
        struct iovec *sg;

        if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) {
            if (elem->in_num >= ARRAY_SIZE(elem->in_sg)) {
                error_report("Too many write descriptors in indirect table");
                exit(1);
            }
            elem->in_addr[elem->in_num] = vring_desc_addr(vdev, desc_pa, i);
            sg = &elem->in_sg[elem->in_num++];
        } else {
            if (elem->out_num >= ARRAY_SIZE(elem->out_sg)) {
                error_report("Too many read descriptors in indirect table");
                exit(1);
            }
            elem->out_addr[elem->out_num] = vring_desc_addr(vdev, desc_pa, i);
            sg = &elem->out_sg[elem->out_num++];
        }

        /* Only the length is filled in here; the address went to in_addr/out_addr. */
        sg->iov_len = vring_desc_len(vdev, desc_pa, i);

        /* If we've got too many, that implies a descriptor loop. */
        if ((elem->in_num + elem->out_num) > max) {
            error_report("Looped descriptor");
            exit(1);
        }
    } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);

    /*
     * Now map what we have collected. Per the original note, the addresses are
     * mapped to host virtual addresses through cpu_physical_memory_map and
     * stored in sg->iov_base.
     */
    virtqueue_map_sg(elem->in_sg, elem->in_addr, elem->in_num, 1);
    virtqueue_map_sg(elem->out_sg, elem->out_addr, elem->out_num, 0);

    /* Remember the head descriptor index of this chain. */
    elem->index = head;

    vq->inuse++;

    trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
    /* Total number of descriptors collected. */
    return elem->in_num + elem->out_num;
}
virtqueue_fill当virtio host端(qemu/vhost)处理完guest放入avail ring中的buffer之后,把buffer解除映射并放入used ring
void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem, unsigned int len, unsigned int idx) { unsigned int offset; int i; trace_virtqueue_fill(vq, elem, len, idx); offset = 0; for (i = 0; i < elem->in_num; i++) { /* 取消sg_in的HVA内存映射 */ size_t size = MIN(len - offset, elem->in_sg[i].iov_len); cpu_physical_memory_unmap(elem->in_sg[i].iov_base, elem->in_sg[i].iov_len, 1, size); offset += size; } for (i = 0; i < elem->out_num; i++) /* 取消sg_out的HVA内存映射 */ cpu_physical_memory_unmap(elem->out_sg[i].iov_base, elem->out_sg[i].iov_len, 0, elem->out_sg[i].iov_len); idx = (idx + vring_used_idx(vq)) % vq->vring.num; /* 计算新的used ring idx值,通过idx + used_event_idx对vring.num取模 */ /* Get a pointer to the next entry in the used ring. */ vring_used_ring_id(vq, idx, elem->index); /* 配置新的used ring项的内容,id是elem->index指向的VRingDesc的索引,len为其长度 */ vring_used_ring_len(vq, idx, len); }virtqueue_flush用于更新user ring的idx
/*
 * Advance the used ring idx by count and reduce vq->inuse by the same amount.
 *
 * A write barrier is issued first so that earlier stores are ordered before
 * the index update.
 */
void virtqueue_flush(VirtQueue *vq, unsigned int count)
{
    uint16_t prev_idx;
    uint16_t next_idx;
    int16_t since_signalled;
    uint16_t advanced;

    /* Make sure buffer is written before we update index. */
    smp_wmb();

    trace_virtqueue_flush(vq, count);

    prev_idx = vring_used_idx(vq);
    next_idx = prev_idx + count;
    vring_used_idx_set(vq, next_idx);
    vq->inuse -= count;

    /*
     * Invalidate signalled_used when the signed 16-bit distance from it to the
     * new idx is smaller than the distance just advanced.
     * NOTE(review): presumably this detects the idx moving past
     * signalled_used across a wrap -- confirm.
     */
    since_signalled = (int16_t)(next_idx - vq->signalled_used);
    advanced = (uint16_t)(next_idx - prev_idx);
    if (unlikely(since_signalled < advanced)) {
        vq->signalled_used_valid = false;
    }
}