In virtio, management of the data passed through the middle layer is a central topic. The queues and buffers mentioned in earlier articles map to the two most typical data structures here: vring and virtqueue. They are defined in virtio_ring.h and virtqueue.h. Let's look at the related definitions:
/* VirtIO ring descriptors: 16 bytes.
* These can chain together via "next". */
//A descriptor describes one guest-side data buffer used to pass data between client and host; note the "next" field, which chains several such buffers into a list
struct vring_desc {
uint64_t addr; /* Address (guest-physical). */
uint32_t len; /* Length. */
uint16_t flags; /* The flags as indicated above. */
uint16_t next; /* We chain unused descriptors via this. */
};
//the available ring: descriptor indices offered by the driver
struct vring_avail {
uint16_t flags;
uint16_t idx;
uint16_t ring[0];
};
/* id is a 16bit index. uint32_t is used here for ids for padding reasons. */
struct vring_used_elem {
/* Index of start of used descriptor chain. */
uint32_t id;
/* Total length of the descriptor chain which was written to. */
uint32_t len;
};
//the used ring: descriptors the device has finished with
struct vring_used {
uint16_t flags;
volatile uint16_t idx;
struct vring_used_elem ring[0];
};
/* For support of packed virtqueues in Virtio 1.1 the format of descriptors
* looks like this.
*/
struct vring_packed_desc {
uint64_t addr;
uint32_t len;
uint16_t id;
uint16_t flags;
};
#define RING_EVENT_FLAGS_ENABLE 0x0
#define RING_EVENT_FLAGS_DISABLE 0x1
#define RING_EVENT_FLAGS_DESC 0x2
struct vring_packed_desc_event {
uint16_t desc_event_off_wrap;
uint16_t desc_event_flags;
};
struct vring_packed {
unsigned int num;
struct vring_packed_desc *desc;
struct vring_packed_desc_event *driver;
struct vring_packed_desc_event *device;
};
//struct vring ties the three split-ring parts above together; the merged single-ring layout is the packed ring (struct vring_packed, introduced in Virtio 1.1)
struct vring {
unsigned int num;
struct vring_desc *desc;
struct vring_avail *avail;
struct vring_used *used;
};
The code above defines the vring data structures. Before Virtio 1.1 the three rings (descriptor table, avail ring, used ring) were separate; Virtio 1.1 merges them into a single ring of descriptors. The new layout is called the packed ring, and the older one the split ring.
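In the split layout, the three parts normally occupy one contiguous, aligned block of memory: descriptor table, then avail ring, then padding, then used ring. Below is a minimal sketch of that layout computation, assuming the struct definitions above; the helper name my_vring_init and the align parameter are illustrative (the standard virtio headers provide a similar vring_init() helper):
#include <stdint.h>

static void
my_vring_init(struct vring *vr, unsigned int num, uint8_t *p, uintptr_t align)
{
	vr->num = num;
	vr->desc = (struct vring_desc *)p;
	vr->avail = (struct vring_avail *)(p + num * sizeof(struct vring_desc));
	/* The used ring starts at the next align-byte boundary after
	 * avail->ring[num] plus the trailing 16-bit used_event field. */
	vr->used = (struct vring_used *)
		(((uintptr_t)&vr->avail->ring[num] + sizeof(uint16_t) + align - 1)
		 & ~(align - 1));
}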
Now look at the definition of the virtqueue:
struct virtio_net_ctrl_hdr {
uint8_t class;
uint8_t cmd;
} __attribute__((packed));
typedef uint8_t virtio_net_ctrl_ack;
#define VIRTIO_NET_OK 0
#define VIRTIO_NET_ERR 1
#define VIRTIO_MAX_CTRL_DATA 2048
struct virtio_pmd_ctrl {
struct virtio_net_ctrl_hdr hdr;
virtio_net_ctrl_ack status;
uint8_t data[VIRTIO_MAX_CTRL_DATA];
};
struct vq_desc_extra {
void *cookie;
uint16_t ndescs;
uint16_t next;
};
struct virtqueue {
struct virtio_hw *hw; /**< virtio_hw structure pointer. */
union {
struct {
/**< vring keeping desc, used and avail */
struct vring ring;
} vq_split;
struct {
/**< vring keeping descs and events */
struct vring_packed ring;
bool used_wrap_counter;
uint16_t cached_flags; /**< cached flags for descs */
uint16_t event_flags_shadow;
} vq_packed;
};
uint16_t vq_used_cons_idx; /**< last consumed descriptor */
uint16_t vq_nentries; /**< vring desc numbers */
uint16_t vq_free_cnt; /**< num of desc available */
uint16_t vq_avail_idx; /**< sync until needed */
uint16_t vq_free_thresh; /**< free threshold */
void *vq_ring_virt_mem; /**< linear address of vring*/
unsigned int vq_ring_size;
//type of the queue: receive, transmit or control
union {
struct virtnet_rx rxq;
struct virtnet_tx txq;
struct virtnet_ctl cq;
};
rte_iova_t vq_ring_mem; /**< physical address of vring,
* or virtual address for virtio_user. */
/**
* Head of the free chain in the descriptor table. If
* there are no free descriptors, this will be set to
* VQ_RING_DESC_CHAIN_END.
*/
uint16_t vq_desc_head_idx;
uint16_t vq_desc_tail_idx;
uint16_t vq_queue_index; /**< PCI queue index */
uint16_t offset; /**< relative offset to obtain addr in mbuf */
uint16_t *notify_addr;
struct rte_mbuf **sw_ring; /**< RX software ring. */
struct vq_desc_extra vq_descx[0];
};
As the code above shows, the virtqueue embeds the vring structure, which is a circular read/write area. At run time the client inserts buffers into this queue; how many queues a device gets depends on the device type. A network device typically has two queues, one for receiving and one for transmitting.
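To make that flow concrete, here is a simplified sketch, not taken from DPDK, of how the driver side exposes one buffer on a split ring: fill a free descriptor, record its index in the avail ring, and publish it by advancing avail->idx. The function name, the single-descriptor case and the barrier choice are illustrative:
#include <stdint.h>

static void
example_add_buffer_split(struct vring *vr, uint16_t desc_idx,
			 uint64_t buf_gpa, uint32_t buf_len, uint16_t flags)
{
	/* Describe the buffer (guest-physical address, length, direction). */
	vr->desc[desc_idx].addr  = buf_gpa;
	vr->desc[desc_idx].len   = buf_len;
	vr->desc[desc_idx].flags = flags; /* e.g. VRING_DESC_F_WRITE for RX */

	/* Expose the descriptor index in the next avail slot
	 * (split-ring sizes are powers of two, so masking works). */
	vr->avail->ring[vr->avail->idx & (vr->num - 1)] = desc_idx;

	/* Make the descriptor visible before the index update is seen. */
	__atomic_thread_fence(__ATOMIC_RELEASE);
	vr->avail->idx++;
}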
From the analysis above we know both the split ring and the packed ring; each serves as the data structure through which the front end and the back end exchange data. At run time, the queues built from these structures and the ring buffers inside them rely on chained memory buffers and copies to keep the data safe while preserving transfer efficiency.
The difference is that the split ring keeps its parts separate, so the entries in the virtq have to stay strictly ordered, the buffers and memory arrays it manages are addressed through descriptor indices, and it carries relatively few flag fields. In the packed ring the three rings are merged into one: extra flag bits are added, the next field is removed, a buffer ID is introduced, and support for chained entries is strengthened. The packed ring also maps more naturally onto hardware and makes better use of the cache.
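Those extra flag bits work together with a wrap counter: a packed descriptor is available when its AVAIL bit matches the driver's wrap counter and its USED bit does not. A minimal sketch of that test follows; the bit positions come from the Virtio 1.1 specification (AVAIL is bit 7, USED is bit 15), while the macro and function names are illustrative rather than the DPDK API:
#include <stdbool.h>
#include <stdint.h>

#define MY_VRING_PACKED_DESC_F_AVAIL (1 << 7)
#define MY_VRING_PACKED_DESC_F_USED  (1 << 15)

static bool
example_packed_desc_is_avail(struct vring_packed_desc *desc, bool wrap_counter)
{
	/* Real code loads flags with acquire semantics before reading the
	 * rest of the descriptor; a plain load is kept here for brevity. */
	uint16_t flags = desc->flags;
	bool avail = !!(flags & MY_VRING_PACKED_DESC_F_AVAIL);
	bool used = !!(flags & MY_VRING_PACKED_DESC_F_USED);

	return avail == wrap_counter && used != wrap_counter;
}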
Let's first look at the dispatch branch in the source code:
//lib/librte_vhost/virtio_net.c
static __rte_always_inline uint32_t
virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
struct rte_mbuf **pkts, uint32_t count)
{
struct vhost_virtqueue *vq;
uint32_t nb_tx = 0;
.......
if (count == 0)
goto out;
//this check decides whether the queue uses the packed ring layout
if (vq_is_packed(dev))
nb_tx = virtio_dev_rx_packed(dev, vq, pkts, count);
else
nb_tx = virtio_dev_rx_split(dev, vq, pkts, count);
out:
if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
vhost_user_iotlb_rd_unlock(vq);
out_access_unlock:
rte_spinlock_unlock(&vq->access_lock);
return nb_tx;
}
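The branch hinges on vq_is_packed(), which in lib/librte_vhost boils down to testing whether the VIRTIO_F_RING_PACKED feature (bit 34 in the Virtio specification) was negotiated. A sketch of that check, with an illustrative wrapper that takes the negotiated feature mask directly:
#include <stdbool.h>
#include <stdint.h>

#ifndef VIRTIO_F_RING_PACKED
#define VIRTIO_F_RING_PACKED 34 /* Virtio 1.1 packed ring feature bit */
#endif

static inline bool
example_vq_is_packed(uint64_t negotiated_features)
{
	return (negotiated_features & (1ULL << VIRTIO_F_RING_PACKED)) != 0;
}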
As the branch shows, each ring layout gets its own handler, which is how both formats remain supported side by side. Now look at the two handlers in detail:
static __rte_noinline uint32_t
virtio_dev_rx_packed(struct virtio_net *dev,
struct vhost_virtqueue *vq,
struct rte_mbuf **pkts,
uint32_t count)
{
uint32_t pkt_idx = 0;
uint32_t remained = count;
do {
//prefetch the next descriptors and try to copy mbufs into them in batches
rte_prefetch0(&vq->desc_packed[vq->last_avail_idx]);
if (remained >= PACKED_BATCH_SIZE) {
if (!virtio_dev_rx_batch_packed(dev, vq,
&pkts[pkt_idx])) {
pkt_idx += PACKED_BATCH_SIZE;
remained -= PACKED_BATCH_SIZE;
continue;
}
}
//fall back to filling a single packet
if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx]))
break;
pkt_idx++;
remained--;
} while (pkt_idx < count);
if (vq->shadow_used_idx) {
do_data_copy_enqueue(dev, vq);//complete the batched small-packet copies
vhost_flush_enqueue_shadow_packed(dev, vq);//update the used ring
}
if (pkt_idx)
vhost_vring_call_packed(dev, vq);//kick the front end
return pkt_idx;
}
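One detail worth noting in the packed path is the wrap counter: when last_avail_idx runs past the end of the ring it wraps back to zero and the counter flips, so the AVAIL/USED test sketched earlier keeps giving the right answer. A minimal sketch of that bookkeeping; the parameter names only loosely mirror the fields of struct vhost_virtqueue and are illustrative:
#include <stdbool.h>
#include <stdint.h>

static void
example_packed_advance(uint16_t *last_avail_idx, bool *avail_wrap_counter,
		       uint16_t ring_size, uint16_t ndescs)
{
	*last_avail_idx += ndescs;
	if (*last_avail_idx >= ring_size) {
		*last_avail_idx -= ring_size;
		/* Crossing the end of the ring flips the wrap counter. */
		*avail_wrap_counter = !*avail_wrap_counter;
	}
}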
The split-ring handler:
static __rte_noinline uint32_t
virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
struct rte_mbuf **pkts, uint32_t count)
{
//basic local variables
uint32_t pkt_idx = 0;
uint16_t num_buffers;
struct buf_vector buf_vec[BUF_VECTOR_MAX];
uint16_t avail_head;
avail_head = *((volatile uint16_t *)&vq->avail->idx);
/*
 * The ordering between avail index and
 * desc reads needs to be enforced.
 */
rte_smp_rmb();
rte_prefetch0(&vq->avail->ring[vq->last_avail_idx & (vq->size - 1)]);
for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
uint32_t pkt_len = pkts[pkt_idx]->pkt_len + dev->vhost_hlen;
uint16_t nr_vec = 0;
//check whether the current avail descriptors are enough to hold the mbuf
if (unlikely(reserve_avail_buf_split(dev, vq,
pkt_len, buf_vec, &num_buffers,
avail_head, &nr_vec) < 0)) {
VHOST_LOG_DEBUG(VHOST_DATA,
"(%d) failed to get enough desc from vring\n",
dev->vid);
vq->shadow_used_idx -= num_buffers;
break;
}
VHOST_LOG_DEBUG(VHOST_DATA, "(%d) current index %d | end index %d\n",
dev->vid, vq->last_avail_idx,
vq->last_avail_idx + num_buffers);
//copy the mbuf into the avail descriptors
if (copy_mbuf_to_desc(dev, vq, pkts[pkt_idx],
buf_vec, nr_vec,
num_buffers) < 0) {
vq->shadow_used_idx -= num_buffers;
break;
}
vq->last_avail_idx += num_buffers;
}
//complete the batched small-packet copies
do_data_copy_enqueue(dev, vq);
if (likely(vq->shadow_used_idx)) {
flush_shadow_used_ring_split(dev, vq);
vhost_vring_call_split(dev, vq);
}
return pkt_idx;
}
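Both paths accumulate completions in a shadow used ring and flush it at the end. A simplified sketch of what that flush amounts to on a split ring; this is illustrative and not the body of flush_shadow_used_ring_split():
#include <stdint.h>

static void
example_flush_used_split(struct vring *vr,
			 const struct vring_used_elem *shadow, uint16_t n)
{
	uint16_t used_idx = vr->used->idx;
	uint16_t i;

	/* Copy the completed (id, len) pairs into the used ring. */
	for (i = 0; i < n; i++)
		vr->used->ring[(used_idx + i) & (vr->num - 1)] = shadow[i];

	/* Make the entries visible before publishing the new index. */
	__atomic_thread_fence(__ATOMIC_RELEASE);
	vr->used->idx = used_idx + n;
}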
These two code paths handle the two ring layouts separately, bringing us back to the different in-memory structures introduced at the start of this section.
DPDK itself is a framework organized around data movement; at bottom, its goal is to push data-transfer efficiency as far as possible. By pinning down how the data objects being moved are defined, you uncover the algorithms, communication schemes, I/O patterns and data-management mechanisms DPDK uses to achieve that efficient flow. With these understood, the foundation for learning DPDK is in place, and higher-level topics such as CPU affinity and huge pages can then be analyzed one by one.