Suppose the Host has configured a virtqueue depth of 10. At initialization the Descriptor Table then has 10 entries, used to store 10 buffer pointers, as shown in the figure below. Each Descriptor Table entry initially points to the next element of the array, so all 10 entries form a single descriptor chain. The key virtqueue members start out as follows:
1. num_free: 10, meaning there are 10 free buffers.
2. free_head: 0, meaning entry 0 is the head of the free buffers in the Descriptor Table.
3. avail_idx_shadow: 0, the Avail Ring entry that will record the head of the next buffer chain the Guest adds; 0 means the head of the next chain the Guest adds is recorded in entry 0 of the Avail Ring.

The Avail Ring members are initialized as follows:
4. idx: 0, meaning the next time the Guest adds a buffer, its head index is recorded in entry 0 of the Avail Ring.

The key members of the Host-side VQ start out as follows (see virtio_reset below): last_avail_idx, shadow_avail_idx and used_idx are all 0.
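The Avail Ring and Used Ring layouts these fields refer to are fixed by the virtio specification; the minimal sketch below shows the two ring headers and their initial idx values (the QUEUE_NUM constant is ours, standing in for the negotiated queue depth):

#include <stdint.h>

#define QUEUE_NUM 10

/* Avail Ring: written by the Guest, read by the Host */
struct vring_avail {
    uint16_t flags;           /* e.g. VRING_AVAIL_F_NO_INTERRUPT */
    uint16_t idx;             /* starts at 0: next ring[] slot the Guest will fill */
    uint16_t ring[QUEUE_NUM]; /* head indexes of added descriptor chains */
};

/* Used Ring: written by the Host, read by the Guest */
struct vring_used_elem {
    uint32_t id;  /* head index of a consumed descriptor chain */
    uint32_t len; /* number of bytes the Host wrote into that chain */
};

struct vring_used {
    uint16_t flags;
    uint16_t idx; /* starts at 0: next ring[] slot the Host will fill */
    struct vring_used_elem ring[QUEUE_NUM];
};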
The Guest initializes the virtqueue with the following function:
struct virtqueue *__vring_new_virtqueue(unsigned int index,
                                        struct vring vring,
                                        struct virtio_device *vdev,
                                        bool weak_barriers,
                                        bool context,
                                        bool (*notify)(struct virtqueue *),
                                        void (*callback)(struct virtqueue *),
                                        const char *name)
{
    unsigned int i;
    struct vring_virtqueue *vq;

    vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
                 GFP_KERNEL);
    if (!vq)
        return NULL;

    vq->vring = vring;
    vq->vq.callback = callback;
    vq->vq.vdev = vdev;
    vq->vq.name = name;
    vq->vq.num_free = vring.num;
    vq->vq.index = index; // which of the device's virtqueues this is
    vq->we_own_ring = false;
    vq->queue_dma_addr = 0;
    vq->queue_size_in_bytes = 0;
    vq->notify = notify;
    vq->weak_barriers = weak_barriers;
    vq->broken = false;
    vq->last_used_idx = 0;
    vq->avail_flags_shadow = 0;
    vq->avail_idx_shadow = 0; // the head of the next added buffer chain goes into entry 0 of the Avail Ring
    vq->num_added = 0;
    list_add_tail(&vq->vq.list, &vdev->vqs);
    ......
    vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
    /* No callback? Tell other side not to bother us. */
    if (!callback) {
        vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
        if (!vq->event)
            vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
    }
    /* Put everything in free lists. */
    vq->free_head = 0; // head of the free buffers in the Descriptor Table
    for (i = 0; i < vring.num - 1; i++)
        vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1); // link all buffers into one descriptor chain
    memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
    return &vq->vq;
}
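To see the state this initialization leaves behind, the standalone sketch below replays the linking loop and walks the resulting free chain (an illustration with a simplified vring_desc, not kernel code; QUEUE_NUM stands in for vring.num):

#include <stdio.h>
#include <stdint.h>

#define QUEUE_NUM 10

/* simplified descriptor, mirroring struct vring_desc from the virtio spec */
struct vring_desc {
    uint64_t addr;  /* guest-physical address of the buffer */
    uint32_t len;   /* buffer length */
    uint16_t flags; /* VRING_DESC_F_NEXT, VRING_DESC_F_WRITE, ... */
    uint16_t next;  /* index of the next descriptor in the chain */
};

int main(void)
{
    struct vring_desc desc[QUEUE_NUM] = { 0 };
    unsigned int i;

    /* same loop as __vring_new_virtqueue: entry i points to entry i + 1 */
    for (i = 0; i < QUEUE_NUM - 1; i++)
        desc[i].next = i + 1;

    for (i = 0; i < QUEUE_NUM; i++)
        printf("desc[%u].next = %u\n", i, desc[i].next);
    /* prints 1, 2, ..., 9 and finally 0: with free_head = 0 and
     * num_free = 10, all 10 entries form a single free chain */
    return 0;
}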
On the Host side, QEMU establishes the corresponding initial values when the device is reset:

void virtio_reset(void *opaque)
{
    VirtIODevice *vdev = opaque;
    VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
    int i;

    virtio_set_status(vdev, 0);
    if (current_cpu) {
        /* Guest initiated reset */
        vdev->device_endian = virtio_current_cpu_endian();
    } else {
        /* System reset */
        vdev->device_endian = virtio_default_endian();
    }
    if (k->reset) {
        k->reset(vdev);
    }
    vdev->broken = false;
    vdev->guest_features = 0;
    vdev->queue_sel = 0;
    vdev->status = 0;
    atomic_set(&vdev->isr, 0);
    vdev->config_vector = VIRTIO_NO_VECTOR;
    virtio_notify_vector(vdev, vdev->config_vector);
    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
        vdev->vq[i].vring.desc = 0;
        vdev->vq[i].vring.avail = 0;
        vdev->vq[i].vring.used = 0;
        vdev->vq[i].last_avail_idx = 0; // Host-maintained index of the next available buffer in the Avail Ring
        vdev->vq[i].shadow_avail_idx = 0; // copy of the Guest-written avail->idx last fetched from the VQ
        vdev->vq[i].used_idx = 0; // initial position at which the Host records processed buffers
        virtio_queue_set_vector(vdev, i, VIRTIO_NO_VECTOR);
        vdev->vq[i].signalled_used = 0;
        vdev->vq[i].signalled_used_valid = false;
        vdev->vq[i].notification = true;
        vdev->vq[i].vring.num = vdev->vq[i].vring.num_default;
        vdev->vq[i].inuse = 0;
        virtio_virtqueue_reset_region_cache(&vdev->vq[i]);
    }
}
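These indexes drive the emptiness check used by virtqueue_pop() further below: the queue is empty once the Host has caught up with everything the Guest has published. The check in QEMU's virtio.c looks roughly like this (abridged sketch; vring_avail_idx() re-reads avail->idx from guest memory and refreshes shadow_avail_idx):

static int virtio_queue_empty_rcu(VirtQueue *vq)
{
    if (unlikely(!vq->vring.avail)) {
        return 1; /* ring not set up yet */
    }
    if (vq->shadow_avail_idx != vq->last_avail_idx) {
        return 0; /* the previously fetched avail->idx already shows pending buffers */
    }
    /* re-read avail->idx from guest memory; equal means nothing new was added */
    return vring_avail_idx(vq) == vq->last_avail_idx;
}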
The Guest adds buffers to the virtqueue with virtqueue_add():

static inline int virtqueue_add(struct virtqueue *_vq,
                                struct scatterlist *sgs[],
                                unsigned int total_sg,
                                unsigned int out_sgs, // buffers with data to send
                                unsigned int in_sgs, // buffers for data to receive
                                void *data,
                                void *ctx,
                                gfp_t gfp)
{
    struct vring_virtqueue *vq = to_vvq(_vq);
    struct scatterlist *sg;
    struct vring_desc *desc;
    unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx;
    int head;
    bool indirect;

    START_USE(vq);
    ......
    if (unlikely(vq->broken)) {
        END_USE(vq);
        return -EIO;
    }
    ......
    head = vq->free_head; // take the index of the first free buffer in the Descriptor Table
    ......
    {
        indirect = false;
        desc = vq->vring.desc; // address of the Descriptor Table
        i = head;
        descs_used = total_sg;
    }
    if (vq->vq.num_free < descs_used) { // more buffers to add than free Descriptor Table entries: fail with no space
        pr_debug("Can't add buf len %i - avail = %i\n",
                 descs_used, vq->vq.num_free);
        /* FIXME: for historical reasons, we force a notify here if
         * there are outgoing parts to the buffer. Presumably the
         * host should service the ring ASAP. */
        if (out_sgs)
            vq->notify(&vq->vq);
        if (indirect)
            kfree(desc);
        END_USE(vq);
        return -ENOSPC;
    }
    for (n = 0; n < out_sgs; n++) {
        for (sg = sgs[n]; sg; sg = sg_next(sg)) { // take each scatter-gather (vectored I/O) buffer in turn and record its address in a descriptor
            dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
            if (vring_mapping_error(vq, addr))
                goto unmap_release;
            desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT); // outgoing buffers are read-only to the device side
            desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
            desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
            prev = i;
            i = virtio16_to_cpu(_vq->vdev, desc[i].next);
        }
    }
    for (; n < (out_sgs + in_sgs); n++) {
        for (sg = sgs[n]; sg; sg = sg_next(sg)) {
            dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
            if (vring_mapping_error(vq, addr))
                goto unmap_release;
            desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE); // receive buffers are writable by the device side
            desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
            desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
            prev = i;
            i = virtio16_to_cpu(_vq->vdev, desc[i].next);
        }
    }
    /* Last one doesn't continue. */
    desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
    ......
    /* We're using some buffers from the free list. */
    vq->vq.num_free -= descs_used; // subtract the descriptors just consumed
    ......
    vq->free_head = i; // update the head of the free buffers in the Descriptor Table
    ......
    vq->desc_state[head].indir_desc = ctx;
    /* Put entry in available array (but don't update avail->idx until they
     * do sync). */
    avail = vq->avail_idx_shadow & (vq->vring.num - 1); // compute the Avail Ring slot to record into
    vq->vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); // write the head index of the chain just added into the Avail Ring
    /* Descriptors and available array need to be set before we expose the
     * new available array entries. */
    virtio_wmb(vq->weak_barriers);
    vq->avail_idx_shadow++; // advance the shadow Avail Ring index by one
    vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow); // publish via avail->idx the next Avail Ring slot to be filled
    vq->num_added++;

    pr_debug("Added buffer head %i to %p\n", head, vq);
    END_USE(vq);

    /* This is very unlikely, but theoretically possible. Kick
     * just in case. */
    if (unlikely(vq->num_added == (1 << 16) - 1))
        virtqueue_kick(_vq);
    return 0;
    ......
}
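Drivers normally reach virtqueue_add() through thin wrappers such as virtqueue_add_outbuf()/virtqueue_add_inbuf(). As a hedged illustration of the call path (the send_one_buffer() helper is ours; vq, buf and len are assumed to come from the driver's setup code):

#include <linux/virtio.h>
#include <linux/scatterlist.h>

/* Sketch: queue one outgoing buffer on vq, then kick the Host. */
static int send_one_buffer(struct virtqueue *vq, void *buf, unsigned int len)
{
    struct scatterlist sg;
    int err;

    sg_init_one(&sg, buf, len); /* single-entry scatter-gather list */

    /* fills one descriptor and one Avail Ring slot exactly as traced above;
     * "buf" doubles as the token returned later by virtqueue_get_buf() */
    err = virtqueue_add_outbuf(vq, &sg, 1, buf, GFP_ATOMIC);
    if (err)
        return err; /* e.g. -ENOSPC when no free descriptors remain */

    virtqueue_kick(vq); /* notify the Host that avail->idx has moved */
    return 0;
}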
On the QEMU side, virtqueue_pop() takes the buffer chains the Guest published off the Avail Ring:

void *virtqueue_pop(VirtQueue *vq, size_t sz)
{
    unsigned int i, head, max;
    VRingMemoryRegionCaches *caches;
    MemoryRegionCache indirect_desc_cache = MEMORY_REGION_CACHE_INVALID;
    MemoryRegionCache *desc_cache;
    int64_t len;
    VirtIODevice *vdev = vq->vdev;
    VirtQueueElement *elem = NULL;
    unsigned out_num, in_num, elem_entries;
    hwaddr addr[VIRTQUEUE_MAX_SIZE];
    struct iovec iov[VIRTQUEUE_MAX_SIZE];
    VRingDesc desc;
    int rc;
    ......
    rcu_read_lock();
    if (virtio_queue_empty_rcu(vq)) { // check whether the VQ holds data: when the Host's working position last_avail_idx
        goto done;                    // equals shadow_avail_idx (the Guest's position fetched from the VQ), the Guest has added no buffers and the VQ is empty
    }
    ......
    max = vq->vring.num; // VQ depth
    if (vq->inuse >= vq->vring.num) {
        virtio_error(vdev, "Virtqueue size exceeded");
        goto done;
    }
    if (!virtqueue_get_head(vq, vq->last_avail_idx++, &head)) { // use the Host's working position last_avail_idx to take the
        goto done;                                              // head index of an available buffer chain off the Avail Ring
    }
    if (virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
        vring_set_avail_event(vq, vq->last_avail_idx); // store last_avail_idx in the avail-event field at the end of the Used Ring
    }
    i = head;

    caches = vring_get_region_caches(vq);
    if (caches->desc.len < max * sizeof(VRingDesc)) {
        virtio_error(vdev, "Cannot map descriptor ring");
        goto done;
    }
    desc_cache = &caches->desc;
    vring_desc_read(vdev, &desc, desc_cache, i); // read the first descriptor of the chain from the Descriptor Table into desc
    ......
    /* Collect all the descriptors */ // walk every descriptor of the descriptor chain in order
    do {
        bool map_ok;

        if (desc.flags & VRING_DESC_F_WRITE) {
            map_ok = virtqueue_map_desc(vdev, &in_num, addr + out_num, // convert the Guest-mapped sg entries into iovecs
                                        iov + out_num,
                                        VIRTQUEUE_MAX_SIZE - out_num, true,
                                        desc.addr, desc.len);
        } else {
            if (in_num) {
                virtio_error(vdev, "Incorrect order for descriptors");
                goto err_undo_map;
            }
            map_ok = virtqueue_map_desc(vdev, &out_num, addr, iov,
                                        VIRTQUEUE_MAX_SIZE, false,
                                        desc.addr, desc.len);
        }
        if (!map_ok) {
            goto err_undo_map;
        }

        /* If we've got too many, that implies a descriptor loop. */
        if (++elem_entries > max) { // more entries than the queue depth: the chain loops
            virtio_error(vdev, "Looped descriptor");
            goto err_undo_map;
        }

        rc = virtqueue_read_next_desc(vdev, &desc, desc_cache, max, &i);
    } while (rc == VIRTQUEUE_READ_DESC_MORE);

    if (rc == VIRTQUEUE_READ_DESC_ERROR) {
        goto err_undo_map;
    }

    /* Now copy what we have collected and mapped */
    elem = virtqueue_alloc_element(sz, out_num, in_num);
    elem->index = head;
    for (i = 0; i < out_num; i++) {
        elem->out_addr[i] = addr[i];
        elem->out_sg[i] = iov[i];
    }
    for (i = 0; i < in_num; i++) {
        elem->in_addr[i] = addr[out_num + i];
        elem->in_sg[i] = iov[out_num + i];
    }

    vq->inuse++;

    trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
err_undo_map:
    ......
done:
    address_space_cache_destroy(&indirect_desc_cache);
    rcu_read_unlock();
    return elem;
}
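On the device-model side, virtqueue_pop() is typically driven from the queue's handler: pop elements until the queue is empty, consume or fill their iovecs, then put each chain on the Used Ring and signal the Guest. A hedged sketch of that pattern (the my_handle_output() name is ours; virtqueue_push() and virtio_notify() are QEMU's standard counterparts to pop):

static void my_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtQueueElement *elem;

    for (;;) {
        /* take the next descriptor chain off the Avail Ring */
        elem = virtqueue_pop(vq, sizeof(VirtQueueElement));
        if (!elem) {
            break; /* VQ empty: last_avail_idx has caught up with avail->idx */
        }

        /* ... read elem->out_sg / write elem->in_sg here ... */

        /* record the chain in the Used Ring; the second argument is the
         * number of bytes the device wrote back (0 for a pure send) */
        virtqueue_push(vq, elem, 0);
        g_free(elem);
    }
    virtio_notify(vdev, vq); /* inject an interrupt into the Guest if it wants one */
}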