This article focuses on how to use the vhost PMD and the APIs exposed by the vhost library.
The previous chapters described the life cycle of a virtio-net device: device creation, configuration, service start, and device destruction.
Let's first recall how this life cycle maps onto the library's API:
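Roughly, each stage maps onto the library calls used in the example below (for the destruction stage, the example's unregister_drivers() presumably wraps rte_vhost_driver_unregister()):
- creation: rte_vhost_driver_register() creates the vhost-user socket for the device;
- configuration: rte_vhost_driver_set_features() / rte_vhost_driver_enable_features() / rte_vhost_driver_disable_features() adjust the negotiable feature bits, and rte_vhost_driver_callback_register() installs the device callbacks;
- service start: rte_vhost_driver_start() starts the vhost-user session on the socket;
- destruction: rte_vhost_driver_unregister() removes the socket and releases the device.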
A sample application that exercises virtio-net lives in the examples/vhost directory; let's walk through it:
int main(int argc, char *argv[])
{
unsigned lcore_id, core_id = 0;
unsigned nb_ports, valid_num_ports;
int ret, i;
uint8_t portid;
static pthread_t tid;
char thread_name[RTE_MAX_THREAD_NAME_LEN];
uint64_t flags = 0;
signal(SIGINT, sigint_handler);
/* init EAL */
ret = rte_eal_init(argc, argv);
if (ret < 0)
rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
argc -= ret;
argv += ret;
/* parse app arguments */
ret = us_vhost_parse_args(argc, argv);
if (ret < 0)
rte_exit(EXIT_FAILURE, "Invalid argument\n");
for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
if (rte_lcore_is_enabled(lcore_id))
lcore_ids[core_id++] = lcore_id;
}
if (rte_lcore_count() > RTE_MAX_LCORE)
rte_exit(EXIT_FAILURE,"Not enough cores\n");
/* Get the number of physical ports. */
nb_ports = rte_eth_dev_count();
/*
* Update the global var NUM_PORTS and global array PORTS
* and get value of var VALID_NUM_PORTS according to system ports number
*/
valid_num_ports = check_ports_num(nb_ports);
if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
"but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
return -1;
}
/*
* FIXME: here we are trying to allocate mbufs big enough for
* @MAX_QUEUES, but the truth is we're never going to use that
* many queues here. We probably should only do allocation for
* those queues we are going to use.
*/
create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
if (vm2vm_mode == VM2VM_HARDWARE) {
/* Enable VT loop back to let L2 switch to do it. */
vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
RTE_LOG(DEBUG, VHOST_CONFIG,
"Enable loop back for L2 switch in vmdq.\n");
}
/* initialize all ports */
for (portid = 0; portid < nb_ports; portid++) {
/* skip ports that are not enabled */
if ((enabled_port_mask & (1 << portid)) == 0) {
RTE_LOG(INFO, VHOST_PORT,
"Skipping disabled port %d\n", portid);
continue;
}
if (port_init(portid) != 0)
rte_exit(EXIT_FAILURE,
"Cannot initialize network ports\n");
}
/* Enable stats if the user option is set. */
if (enable_stats) {
ret = pthread_create(&tid, NULL, (void *)print_stats, NULL);
if (ret != 0)
rte_exit(EXIT_FAILURE,
"Cannot create print-stats thread\n");
/* Set thread_name for aid in debugging. */
snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN, "print-stats");
ret = rte_thread_setname(tid, thread_name);
if (ret != 0)
RTE_LOG(DEBUG, VHOST_CONFIG,
"Cannot set print-stats name\n");
}
/* Launch all data cores. */
RTE_LCORE_FOREACH_SLAVE(lcore_id)
rte_eal_remote_launch(switch_worker, NULL, lcore_id);
if (client_mode)
flags |= RTE_VHOST_USER_CLIENT;
if (dequeue_zero_copy)
flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
/* Register vhost user driver to handle vhost messages. */
for (i = 0; i < nb_sockets; i++) {
char *file = socket_files + i * PATH_MAX;
ret = rte_vhost_driver_register(file, flags);
if (ret != 0) {
unregister_drivers(i);
rte_exit(EXIT_FAILURE,
"vhost driver register failure.\n");
}
if (builtin_net_driver)
rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
if (mergeable == 0) {
rte_vhost_driver_disable_features(file,
1ULL << VIRTIO_NET_F_MRG_RXBUF);
}
if (enable_tx_csum == 0) {
rte_vhost_driver_disable_features(file,
1ULL << VIRTIO_NET_F_CSUM);
}
if (enable_tso == 0) {
rte_vhost_driver_disable_features(file,
1ULL << VIRTIO_NET_F_HOST_TSO4);
rte_vhost_driver_disable_features(file,
1ULL << VIRTIO_NET_F_HOST_TSO6);
rte_vhost_driver_disable_features(file,
1ULL << VIRTIO_NET_F_GUEST_TSO4);
rte_vhost_driver_disable_features(file,
1ULL << VIRTIO_NET_F_GUEST_TSO6);
}
if (promiscuous) {
rte_vhost_driver_enable_features(file,
1ULL << VIRTIO_NET_F_CTRL_RX);
}
ret = rte_vhost_driver_callback_register(file,
&virtio_net_device_ops);
if (ret != 0) {
rte_exit(EXIT_FAILURE,
"failed to register vhost driver callbacks.\n");
}
if (rte_vhost_driver_start(file) < 0) {
rte_exit(EXIT_FAILURE,
"failed to start vhost driver.\n");
}
}
RTE_LCORE_FOREACH_SLAVE(lcore_id)
rte_eal_wait_lcore(lcore_id);
return 0;
}
As the flow above shows, device creation is done through rte_vhost_driver_register(). The application then calls rte_vhost_driver_set_features() / rte_vhost_driver_disable_features() according to the command-line options to configure the device's feature bits, and finally calls rte_vhost_driver_start() to bring the virtio-net device up. Inside that call, vhost decides whether it runs in client or server mode on the vhost-user socket; depending on the role, it completes feature negotiation with the virtio front-end driver and maps the virtqueue/vring address space. Once all of this is done, the service is ready to receive and transmit packets.
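The callback table passed to rte_vhost_driver_callback_register() is what connects the library back to the application: the library invokes these callbacks when a front end connects or disconnects. For reference, a minimal table along the lines of the one in examples/vhost might look as follows (the struct type name differs across DPDK versions, and new_device()/destroy_device() stand for the example's own handlers that add/remove a device on a worker core's vdev_list):
/*
 * Minimal sketch of the device callbacks registered in main().
 * new_device() is called once the front end is ready; destroy_device()
 * when the connection is torn down.
 */
static const struct vhost_device_ops virtio_net_device_ops = {
	.new_device     = new_device,
	.destroy_device = destroy_device,
};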
All packet processing is done by switch_worker(), which runs on every forwarding core.
/*
* Main function of vhost-switch. It basically does:
*
* for each vhost device {
* - drain_eth_rx()
*
* Which drains the host eth Rx queue linked to the vhost device,
* and deliver all of them to guest virtio Rx ring associated with
* this vhost device.
*
* - drain_virtio_tx()
*
* Which drains the guest virtio Tx queue and deliver all of them
* to the target, which could be another vhost device, or the
* physical eth dev. The route is done in function "virtio_tx_route".
* }
*/
static int
switch_worker(void *arg __rte_unused)
{
unsigned i;
unsigned lcore_id = rte_lcore_id();
struct vhost_dev *vdev;
struct mbuf_table *tx_q;
RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
tx_q = &lcore_tx_queue[lcore_id];
for (i = 0; i < rte_lcore_count(); i++) {
if (lcore_ids[i] == lcore_id) {
tx_q->txq_id = i;
break;
}
}
while(1) {
drain_mbuf_table(tx_q);
/*
* Inform the configuration core that we have exited the
* linked list and that no devices are in use if requested.
*/
if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
/*
* Process vhost devices
*/
TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
lcore_vdev_entry) {
if (unlikely(vdev->remove)) {
unlink_vmdq(vdev);
vdev->ready = DEVICE_SAFE_REMOVE;
continue;
}
if (likely(vdev->ready == DEVICE_RX))
drain_eth_rx(vdev);
if (likely(!vdev->remove))
drain_virtio_tx(vdev);
}
}
return 0;
}
What this function does is fairly clear: it runs a while() loop on each core. At the top of every iteration it first checks whether the physical NIC's transmit queue still holds buffered packets that have not been sent, and flushes them if so. It then walks all devices bound to this core: in the receive direction it calls drain_eth_rx(vdev) to pull packets from the host's physical NIC and place them into the guest's Rx ring, and in the transmit direction it calls drain_virtio_tx(vdev) to take packets out of the guest's Tx ring and send them out through the host's physical NIC. Both of these functions are worth a closer look.
The receive side is implemented by drain_eth_rx(struct vhost_dev *vdev), which breaks down into two steps:
static inline void __attribute__((always_inline)) drain_eth_rx(struct vhost_dev *vdev)
{
uint16_t rx_count, enqueue_count;
struct rte_mbuf *pkts[MAX_PKT_BURST];
/* step 1: receive a burst of packets from the physical NIC */
rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
pkts, MAX_PKT_BURST);
if (!rx_count)
return;
/* step 2: place the received packets into the virtio Rx ring */
/*
* When "enable_retry" is set, here we wait and retry when there
* is no enough free slots in the queue to hold @rx_count packets,
* to diminish packet loss.
*/
/* before enqueuing, check whether the avail ring has enough free entries; if not, optionally wait and retry */
if (enable_retry &&
unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
VIRTIO_RXQ))) {
uint32_t retry;
for (retry = 0; retry < burst_rx_retry_num; retry++) {
rte_delay_us(burst_rx_delay_time);
if (rx_count <= rte_vhost_avail_entries(vdev->vid,
VIRTIO_RXQ))
break;
}
}
if (builtin_net_driver) {
enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
pkts, rx_count);
} else {
enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
pkts, rx_count);
}
if (enable_stats) {
rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
}
free_pkts(pkts, rx_count);
}
uint16_t
rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count)
{
struct virtio_net *dev = get_device(vid);
if (!dev)
return 0;
if (dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF))
return virtio_dev_merge_rx(dev, queue_id, pkts, count);
else
return virtio_dev_rx(dev, queue_id, pkts, count);
}
/**
* This function adds buffers to the virtio devices RX virtqueue. Buffers can
* be received from the physical port or from another virtio device. A packet
* count is returned to indicate the number of packets that are successfully
* added to the RX queue. This function works when the mbuf is scattered, but
* it doesn't support the mergeable feature.
*/
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
struct rte_mbuf **pkts, uint32_t count)
{
struct vhost_virtqueue *vq;
uint16_t avail_idx, free_entries, start_idx;
uint16_t desc_indexes[MAX_PKT_BURST];
struct vring_desc *descs;
uint16_t used_idx;
uint32_t i, sz;
LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
if (unlikely(!is_valid_virt_queue_idx(queue_id, 0, dev->nr_vring))) {
RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
dev->vid, __func__, queue_id);
return 0;
}
vq = dev->virtqueue[queue_id];
if (unlikely(vq->enabled == 0))
return 0;
avail_idx = *((volatile uint16_t *)&vq->avail->idx);
start_idx = vq->last_used_idx;
free_entries = avail_idx - start_idx;
count = RTE_MIN(count, free_entries);
count = RTE_MIN(count, (uint32_t)MAX_PKT_BURST);
if (count == 0)
return 0;
LOG_DEBUG(VHOST_DATA, "(%d) start_idx %d | end_idx %d\n",
dev->vid, start_idx, start_idx + count);
/* Retrieve all of the desc indexes first to avoid caching issues. */
rte_prefetch0(&vq->avail->ring[start_idx & (vq->size - 1)]);
for (i = 0; i < count; i++) {
used_idx = (start_idx + i) & (vq->size - 1);
desc_indexes[i] = vq->avail->ring[used_idx];
vq->used->ring[used_idx].id = desc_indexes[i];
vq->used->ring[used_idx].len = pkts[i]->pkt_len +
dev->vhost_hlen;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used, ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
}
rte_prefetch0(&vq->desc[desc_indexes[0]]);
for (i = 0; i < count; i++) {
uint16_t desc_idx = desc_indexes[i];
int err;
if (vq->desc[desc_idx].flags & VRING_DESC_F_INDIRECT) {
descs = (struct vring_desc *)(uintptr_t)
rte_vhost_gpa_to_vva(dev->mem,
vq->desc[desc_idx].addr);
if (unlikely(!descs)) {
count = i;
break;
}
desc_idx = 0;
sz = vq->desc[desc_idx].len / sizeof(*descs);
} else {
descs = vq->desc;
sz = vq->size;
}
err = copy_mbuf_to_desc(dev, descs, pkts[i], desc_idx, sz);
if (unlikely(err)) {
used_idx = (start_idx + i) & (vq->size - 1);
vq->used->ring[used_idx].len = dev->vhost_hlen;
vhost_log_used_vring(dev, vq,
offsetof(struct vring_used, ring[used_idx]),
sizeof(vq->used->ring[used_idx]));
}
if (i + 1 < count)
rte_prefetch0(&vq->desc[desc_indexes[i+1]]);
}
rte_smp_wmb();
*(volatile uint16_t *)&vq->used->idx += count;
vq->last_used_idx += count;
vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), sizeof(vq->used->idx));
/* flush used->idx update before we read avail->flags. */
rte_mb();
/* Kick the guest if necessary. */
if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)
&& (vq->callfd >= 0))
eventfd_write(vq->callfd, (eventfd_t)1);
return count;
}
The call path for step 2 is drain_eth_rx() -> rte_vhost_enqueue_burst() -> virtio_dev_rx(). virtio_dev_rx() copies the packets into the descriptors, updates the related vring/virtqueue indexes, and finally writes the eventfd to notify the guest so it can process the new buffers.
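Copying a packet into a descriptor relies on translating the guest-physical address carried in the descriptor into a host virtual address. A heavily simplified sketch of that idea is shown below; copy_one_seg() is a hypothetical helper, not the library's copy_mbuf_to_desc(), which additionally writes the virtio-net header, follows descriptor chains and handles partial copies:
/*
 * Hypothetical, simplified helper: copy a single-segment mbuf into one
 * descriptor, assuming the payload fits and ignoring the virtio-net header.
 */
static int
copy_one_seg(struct virtio_net *dev, struct vring_desc *desc, struct rte_mbuf *m)
{
	/* translate the descriptor's guest-physical address to a host VA */
	uint64_t va = rte_vhost_gpa_to_vva(dev->mem, desc->addr);

	if (!va || desc->len < rte_pktmbuf_data_len(m))
		return -1;

	rte_memcpy((void *)(uintptr_t)va,
		   rte_pktmbuf_mtod(m, void *), rte_pktmbuf_data_len(m));
	return 0;
}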
The transmit side is implemented by drain_virtio_tx(struct vhost_dev *vdev), which also breaks down into two steps:
static inline void __attribute__((always_inline))
drain_virtio_tx(struct vhost_dev *vdev)
{
struct rte_mbuf *pkts[MAX_PKT_BURST];
uint16_t count;
uint16_t i;
/* step 1: dequeue packets from the virtio Tx ring into the pkts array */
if (builtin_net_driver) {
count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
pkts, MAX_PKT_BURST);
} else {
count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
mbuf_pool, pkts, MAX_PKT_BURST);
}
/* step 2: send the packets in pkts out along the appropriate path */
/*
 * Setup VMDq for the first packet: the first packet (normally the RARP
 * announcement sent by the guest) is used for MAC learning and to set up
 * the VLAN tag mapping in VMDq.
 */
if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
free_pkts(pkts, count);
}
for (i = 0; i < count; ++i)
virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}
rte_vhost_dequeue_burst() used in step 1 is simply the reverse of rte_vhost_enqueue_burst(); the code speaks for itself:
uint16_t
rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
{
struct virtio_net *dev;
struct rte_mbuf *rarp_mbuf = NULL;
struct vhost_virtqueue *vq;
uint32_t desc_indexes[MAX_PKT_BURST];
uint32_t used_idx;
uint32_t i = 0;
uint16_t free_entries;
uint16_t avail_idx;
dev = get_device(vid);
if (!dev)
return 0;
if (unlikely(!is_valid_virt_queue_idx(queue_id, 1, dev->nr_vring))) {
RTE_LOG(ERR, VHOST_DATA, "(%d) %s: invalid virtqueue idx %d.\n",
dev->vid, __func__, queue_id);
return 0;
}
vq = dev->virtqueue[queue_id];
if (unlikely(vq->enabled == 0))
return 0;
if (unlikely(dev->dequeue_zero_copy)) {
struct zcopy_mbuf *zmbuf, *next;
int nr_updated = 0;
for (zmbuf = TAILQ_FIRST(&vq->zmbuf_list);
zmbuf != NULL; zmbuf = next) {
next = TAILQ_NEXT(zmbuf, next);
if (mbuf_is_consumed(zmbuf->mbuf)) {
used_idx = vq->last_used_idx++ & (vq->size - 1);
update_used_ring(dev, vq, used_idx,
zmbuf->desc_idx);
nr_updated += 1;
TAILQ_REMOVE(&vq->zmbuf_list, zmbuf, next);
rte_pktmbuf_free(zmbuf->mbuf);
put_zmbuf(zmbuf);
vq->nr_zmbuf -= 1;
}
}
update_used_idx(dev, vq, nr_updated);
}
/*
* Construct a RARP broadcast packet, and inject it to the "pkts"
* array, to make it look like the guest actually sent such a packet.
*
* Check user_send_rarp() for more information.
*
* broadcast_rarp shares a cacheline in the virtio_net structure
* with some fields that are accessed during enqueue and
* rte_atomic16_cmpset() causes a write if using cmpxchg. This could
* result in false sharing between enqueue and dequeue.
*
* Prevent unnecessary false sharing by reading broadcast_rarp first
* and only performing cmpset if the read indicates it is likely to
* be set.
*/
if (unlikely(rte_atomic16_read(&dev->broadcast_rarp) &&
rte_atomic16_cmpset((volatile uint16_t *)
&dev->broadcast_rarp.cnt, 1, 0))) {
rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
if (rarp_mbuf == NULL) {
RTE_LOG(ERR, VHOST_DATA,
"Failed to allocate memory for mbuf.\n");
return 0;
}
if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
rte_pktmbuf_free(rarp_mbuf);
rarp_mbuf = NULL;
} else {
count -= 1;
}
}
free_entries = *((volatile uint16_t *)&vq->avail->idx) -
vq->last_avail_idx;
if (free_entries == 0)
goto out;
LOG_DEBUG(VHOST_DATA, "(%d) %s\n", dev->vid, __func__);
/* Prefetch available and used ring */
avail_idx = vq->last_avail_idx & (vq->size - 1);
used_idx = vq->last_used_idx & (vq->size - 1);
rte_prefetch0(&vq->avail->ring[avail_idx]);
rte_prefetch0(&vq->used->ring[used_idx]);
count = RTE_MIN(count, MAX_PKT_BURST);
count = RTE_MIN(count, free_entries);
LOG_DEBUG(VHOST_DATA, "(%d) about to dequeue %u buffers\n",
dev->vid, count);
/* Retrieve all of the head indexes first to avoid caching issues. */
for (i = 0; i < count; i++) {
avail_idx = (vq->last_avail_idx + i) & (vq->size - 1);
used_idx = (vq->last_used_idx + i) & (vq->size - 1);
desc_indexes[i] = vq->avail->ring[avail_idx];
if (likely(dev->dequeue_zero_copy == 0))
update_used_ring(dev, vq, used_idx, desc_indexes[i]);
}
/* Prefetch descriptor index. */
rte_prefetch0(&vq->desc[desc_indexes[0]]);
for (i = 0; i < count; i++) {
struct vring_desc *desc;
uint16_t sz, idx;
int err;
if (likely(i + 1 < count))
rte_prefetch0(&vq->desc[desc_indexes[i + 1]]);
if (vq->desc[desc_indexes[i]].flags & VRING_DESC_F_INDIRECT) {
desc = (struct vring_desc *)(uintptr_t)
rte_vhost_gpa_to_vva(dev->mem,
vq->desc[desc_indexes[i]].addr);
if (unlikely(!desc))
break;
rte_prefetch0(desc);
sz = vq->desc[desc_indexes[i]].len / sizeof(*desc);
idx = 0;
} else {
desc = vq->desc;
sz = vq->size;
idx = desc_indexes[i];
}
pkts[i] = rte_pktmbuf_alloc(mbuf_pool);
if (unlikely(pkts[i] == NULL)) {
RTE_LOG(ERR, VHOST_DATA,
"Failed to allocate memory for mbuf.\n");
break;
}
err = copy_desc_to_mbuf(dev, desc, sz, pkts[i], idx, mbuf_pool);
if (unlikely(err)) {
rte_pktmbuf_free(pkts[i]);
break;
}
if (unlikely(dev->dequeue_zero_copy)) {
struct zcopy_mbuf *zmbuf;
zmbuf = get_zmbuf(vq);
if (!zmbuf) {
rte_pktmbuf_free(pkts[i]);
break;
}
zmbuf->mbuf = pkts[i];
zmbuf->desc_idx = desc_indexes[i];
/*
* Pin lock the mbuf; we will check later to see
* whether the mbuf is freed (when we are the last
* user) or not. If that's the case, we then could
* update the used ring safely.
*/
rte_mbuf_refcnt_update(pkts[i], 1);
vq->nr_zmbuf += 1;
TAILQ_INSERT_TAIL(&vq->zmbuf_list, zmbuf, next);
}
}
vq->last_avail_idx += i;
if (likely(dev->dequeue_zero_copy == 0)) {
vq->last_used_idx += i;
update_used_idx(dev, vq, i);
}
out:
if (unlikely(rarp_mbuf != NULL)) {
/*
* Inject it to the head of "pkts" array, so that switch's mac
* learning table will get updated first.
*/
memmove(&pkts[1], pkts, i * sizeof(struct rte_mbuf *));
pkts[0] = rarp_mbuf;
i += 1;
}
return i;
}
In step 2, virtio_tx_route() has to distinguish the destination of each packet: if the destination is a local VM and vm2vm_mode is VM2VM_SOFTWARE, the packet can be forwarded directly through the software switch; otherwise (no software VM2VM switching, or the packet has to leave through the physical NIC) it is buffered and eventually sent out via do_drain_mbuf_table() -> rte_eth_tx_burst(). A simplified sketch of this routing decision follows the listing below.
static inline void __attribute__((always_inline))
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
uint16_t count;
count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
tx_q->m_table, tx_q->len);
if (unlikely(count < tx_q->len))
free_pkts(&tx_q->m_table[count], tx_q->len - count);
tx_q->len = 0;
}
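For reference, the destination decision inside virtio_tx_route() can be reduced to the following sketch. route_one_packet() is a hypothetical condensed version; find_vhost_dev(), free_pkts(), struct mbuf_table and vm2vm_mode come from the example, and VLAN insertion, TSO handling and the VM2VM_HARDWARE path are omitted:
/*
 * Hypothetical, condensed routing step: deliver locally when the destination
 * MAC belongs to another vhost device and software VM2VM switching is on,
 * otherwise buffer the packet for the physical NIC.
 */
static void
route_one_packet(struct mbuf_table *tx_q, struct rte_mbuf *m)
{
	struct ether_hdr *eth = rte_pktmbuf_mtod(m, struct ether_hdr *);
	struct vhost_dev *dst = find_vhost_dev(&eth->d_addr);

	if (dst != NULL && vm2vm_mode == VM2VM_SOFTWARE) {
		/* VM to VM: copy straight into the peer's Rx virtqueue */
		rte_vhost_enqueue_burst(dst->vid, VIRTIO_RXQ, &m, 1);
		free_pkts(&m, 1);
	} else {
		/* buffer for the physical NIC; do_drain_mbuf_table() will
		 * flush it out with rte_eth_tx_burst() */
		tx_q->m_table[tx_q->len++] = m;
		if (unlikely(tx_q->len == MAX_PKT_BURST))
			do_drain_mbuf_table(tx_q);
	}
}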
To sum up: device life-cycle management relies on the key APIs covered in the previous section, while packet reception and transmission on a vhost device use the following two vhost lib interfaces:
uint16_t rte_vhost_enqueue_burst(int vid, uint16_t queue_id,
struct rte_mbuf **pkts, uint16_t count);
uint16_t rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count);
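Purely as an illustration (not code from the example), a minimal per-device forwarding step built on these two calls and the ethdev burst API could look like the sketch below; forward_one_device() is hypothetical, free_pkts() is the example's helper, and port/queue ids and the mbuf pool are assumed to be set up as in main() above:
/*
 * Hypothetical minimal forwarding step for one vhost device `vid`:
 * host NIC -> guest Rx ring, then guest Tx ring -> host NIC.
 * Retry logic, statistics and error handling are omitted.
 */
static void
forward_one_device(int vid, uint16_t port, uint16_t rxq, uint16_t txq,
		struct rte_mempool *pool)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t n, sent;

	/* host NIC -> guest: enqueue into the guest's Rx virtqueue */
	n = rte_eth_rx_burst(port, rxq, pkts, MAX_PKT_BURST);
	if (n) {
		rte_vhost_enqueue_burst(vid, VIRTIO_RXQ, pkts, n);
		free_pkts(pkts, n); /* enqueue copies the data, mbufs can be freed */
	}

	/* guest -> host NIC: dequeue from the guest's Tx virtqueue */
	n = rte_vhost_dequeue_burst(vid, VIRTIO_TXQ, pool, pkts, MAX_PKT_BURST);
	sent = rte_eth_tx_burst(port, txq, pkts, n);
	if (sent < n)
		free_pkts(&pkts[sent], n - sent);
}
In the example itself this logic is spread across drain_eth_rx(), drain_virtio_tx() and do_drain_mbuf_table(), as shown above.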