本文以virtio网络驱动为例,分析virtio驱动是如何处理网络IO的,驱动的定义如下
static struct virtio_device_id id_table[] = { { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID }, { 0 }, }; static unsigned int features[] = { VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, VIRTIO_NET_F_GSO, VIRTIO_NET_F_MAC, VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, }; static struct virtio_driver virtio_net_driver = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, .probe = virtnet_probe, .remove = __devexit_p(virtnet_remove), .config_changed = virtnet_config_changed, #ifdef CONFIG_PM .freeze = virtnet_freeze, .restore = virtnet_restore, #endif }; static int __init init(void) { return register_virtio_driver(&virtio_net_driver); /* 调用driver_register */ } static void __exit fini(void) { unregister_virtio_driver(&virtio_net_driver); /* 调用driver_unregister */ }virtnet_probe用于pci总线发现virtio net设备
struct virtnet_info { struct virtio_device *vdev; struct virtqueue *rvq, *svq, *cvq; struct net_device *dev; struct napi_struct napi; /* * Upstream uses the system_nrt workqueue; RHEL6 doesn't have * that, so we create a singlethread wq. */ struct workqueue_struct *st_wq; unsigned int status; /* Number of input buffers, and max we've ever had. */ unsigned int num, max; /* I like... big packets and I cannot lie! */ bool big_packets; /* Host will merge rx buffers for big packets (shake it! shake it!) */ bool mergeable_rx_bufs; /* Work struct for refilling if we run low on memory. */ struct delayed_work refill; /* Chain pages by the private ptr. */ struct page *pages; }; static const struct net_device_ops virtnet_netdev = { .ndo_open = virtnet_open, .ndo_stop = virtnet_close, .ndo_start_xmit = start_xmit, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = virtnet_set_mac_address, .ndo_set_rx_mode = virtnet_set_rx_mode, .ndo_change_mtu = virtnet_change_mtu, .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = virtnet_netpoll, #endif }; static const struct ethtool_ops virtnet_ethtool_ops = { .set_tx_csum = virtnet_set_tx_csum, .set_sg = ethtool_op_set_sg, .set_tso = ethtool_op_set_tso, .set_ufo = ethtool_op_set_ufo, .get_link = ethtool_op_get_link, }; static int virtnet_probe(struct virtio_device *vdev) { int err; struct net_device *dev; struct virtnet_info *vi; /* Allocate ourselves a network device with room for our info */ dev = alloc_etherdev(sizeof(struct virtnet_info)); if (!dev) return -ENOMEM; /* Set up network device as normal. */ dev->netdev_ops = &virtnet_netdev; dev->features = NETIF_F_HIGHDMA; SET_ETHTOOL_OPS(dev, &virtnet_ethtool_ops); SET_NETDEV_DEV(dev, &vdev->dev); /* Do we support "hardware" checksums? */ if (csum && virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) { /* This opens up the world of extra features. */ dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST; if (gso && virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) { dev->features |= NETIF_F_TSO | NETIF_F_UFO | NETIF_F_TSO_ECN | NETIF_F_TSO6; } /* Individual feature bits: what can host handle? */ if (gso && virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4)) dev->features |= NETIF_F_TSO; if (gso && virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6)) dev->features |= NETIF_F_TSO6; if (gso && virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN)) dev->features |= NETIF_F_TSO_ECN; if (gso && virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO)) dev->features |= NETIF_F_UFO; } /* Configuration may specify what MAC to use. Otherwise random. */ if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) { vdev->config->get(vdev, offsetof(struct virtio_net_config, mac), dev->dev_addr, dev->addr_len); } else random_ether_addr(dev->dev_addr); /* Set up our device-specific information */ vi = netdev_priv(dev); netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight); /* 初始化virtnet_info->napi为virtnet_poll */ vi->dev = dev; vi->vdev = vdev; vdev->priv = vi; vi->pages = NULL; vi->st_wq = create_singlethread_workqueue("virtio-net"); if (!vi->st_wq) { /* Can't get a precise err from function above */ err = -ENOMEM; goto free; } INIT_DELAYED_WORK(&vi->refill, refill_work); /* refill_work用来补充收包的buffer */ /* If we can receive ANY GSO packets, we must allocate large ones. */ if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN)) vi->big_packets = true; if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) vi->mergeable_rx_bufs = true; err = init_vqs(vi); /* 初始化TX, RX, CTRL三个virtqueue, 其中skb_recv_done, skb_xmit_done为RX,TX完成后的回调函数 */ if (err) goto free_wq; err = register_netdev(dev); if (err) { pr_debug("virtio_net: registering device failed\n"); goto free_vqs; } /* Last of all, set up some receive buffers. */ try_fill_recv(vi, GFP_KERNEL); /* recv virtqueue里调用add_buf,为收包准备好buffer */ /* If we didn't even get one input buffer, we're useless. */ if (vi->num == 0) { err = -ENOMEM; goto unregister; } /* Assume link up if device can't report link status, otherwise get link status from config. */ if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) { netif_carrier_off(dev); virtnet_update_status(vi); } else { vi->status = VIRTIO_NET_S_LINK_UP; netif_carrier_on(dev); } pr_debug("virtnet: registered device %s\n", dev->name); return 0; unregister: unregister_netdev(dev); free_vqs: vdev->config->del_vqs(vdev); free_wq: destroy_workqueue(vi->st_wq); free: free_netdev(dev); return err; }下面来看看virtnet_dev的net_device_ops,
static const struct net_device_ops virtnet_netdev = { .ndo_open = virtnet_open, .ndo_stop = virtnet_close, .ndo_start_xmit = start_xmit, .ndo_validate_addr = eth_validate_addr, .ndo_set_mac_address = virtnet_set_mac_address, .ndo_set_rx_mode = virtnet_set_rx_mode, .ndo_change_mtu = virtnet_change_mtu, .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = virtnet_netpoll, #endif }; static int virtnet_open(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); /* Make sure we have some buffers: if oom use wq. */ if (!try_fill_recv(vi, GFP_KERNEL)) queue_delayed_work(vi->st_wq, &vi->refill, 0); /* try_fill_recv为0,唤醒workqueue执行refill_work增加收方向buffer */ virtnet_napi_enable(vi); /* enable napi函数virtnet_poll */ return 0; } static int virtnet_close(struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); /* Make sure refill_work doesn't re-enable napi! */ cancel_delayed_work_sync(&vi->refill); /* 关闭refill_work的workqueue */ napi_disable(&vi->napi); /* 关闭napi */ return 0; } static int virtnet_set_mac_address(struct net_device *dev, void *p) { struct virtnet_info *vi = netdev_priv(dev); struct virtio_device *vdev = vi->vdev; struct sockaddr *addr = p; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN); dev->addr_assign_type &= ~NET_ADDR_RANDOM; if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) vdev->config->set(vdev, offsetof(struct virtio_net_config, mac), dev->dev_addr, dev->addr_len); /* 配置virtio配置空间设置mac地址 */ return 0; }start_xmit是发送函数,可以看出整个发送过程中,skb的数据都是零拷贝的方式交给后端
tatic unsigned int free_old_xmit_skbs(struct virtnet_info *vi) { struct sk_buff *skb; unsigned int len, tot_sgs = 0; while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) { /* flush发送virtqueue的所有skb */ pr_debug("Sent skb %p\n", skb); vi->dev->stats.tx_bytes += skb->len; vi->dev->stats.tx_packets++; tot_sgs += skb_vnet_hdr(skb)->num_sg; dev_kfree_skb_any(skb); } return tot_sgs; } static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb) { struct scatterlist sg[2+MAX_SKB_FRAGS]; struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb); const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest; sg_init_table(sg, 2+MAX_SKB_FRAGS); pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest); if (skb->ip_summed == CHECKSUM_PARTIAL) { /* 构造vnet_hdr的checksum部分 */ hdr->hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; hdr->hdr.csum_start = skb->csum_start - skb_headroom(skb); hdr->hdr.csum_offset = skb->csum_offset; } else { hdr->hdr.flags = 0; hdr->hdr.csum_offset = hdr->hdr.csum_start = 0; } if (skb_is_gso(skb)) { /* 构造vnet_hdr的gso部分 */ hdr->hdr.hdr_len = skb_headlen(skb); hdr->hdr.gso_size = skb_shinfo(skb)->gso_size; if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4; else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6; else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP; else BUG(); if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN) hdr->hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN; } else { hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE; hdr->hdr.gso_size = hdr->hdr.hdr_len = 0; } hdr->mhdr.num_buffers = 0; /* Encode metadata header at front. */ if (vi->mergeable_rx_bufs) sg_set_buf(sg, &hdr->mhdr, sizeof hdr->mhdr); else sg_set_buf(sg, &hdr->hdr, sizeof hdr->hdr); hdr->num_sg = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; /* 构造skb的scatterlist结构体 */ return virtqueue_add_buf(vi->svq, sg, hdr->num_sg, 0, skb); /* 把sg加入到free desc里面,增加avail idx */ } static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) { struct virtnet_info *vi = netdev_priv(dev); int capacity; again: /* Free up any pending old buffers before queueing new ones. */ free_old_xmit_skbs(vi); /* Try to transmit */ capacity = xmit_skb(vi, skb); /* This can happen with OOM and indirect buffers. */ if (unlikely(capacity < 0)) { /* virtqueue_add_buf返回-ENOSPC,此时发送队列空间不足 */ netif_stop_queue(dev); dev_warn(&dev->dev, "Unexpected full queue\n"); if (unlikely(!virtqueue_enable_cb(vi->svq))) { /* virtqueue_enable_cb返回false,表明used idx有变化,此时可以重试 */ virtqueue_disable_cb(vi->svq); netif_start_queue(dev); goto again; } return NETDEV_TX_BUSY; /* 发送队列满,返回NETDEV_TX_BUSY */ } virtqueue_kick(vi->svq); /* 通知后端 */ /* Don't wait up for transmitted skbs to be freed. */ skb_orphan(skb); nf_reset(skb); /* Apparently nice girls don't return TX_BUSY; stop the queue * before it gets out of hand. Naturally, this wastes entries. */ if (capacity < 2+MAX_SKB_FRAGS) { netif_stop_queue(dev); if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) { /* More just got used, free them then recheck. */ capacity += free_old_xmit_skbs(vi); if (capacity >= 2+MAX_SKB_FRAGS) { netif_start_queue(dev); virtqueue_disable_cb(vi->svq); } } } return NETDEV_TX_OK; }下面来看收包,其中napi的回调函数是virtnet_poll,refill_work作为workqueue的函数用于循环refill接收buffer
static void refill_work(struct work_struct *work) { struct virtnet_info *vi; bool still_empty; vi = container_of(work, struct virtnet_info, refill.work); napi_disable(&vi->napi); try_fill_recv(vi, GFP_KERNEL); /* 补充buffer用于接收报文,即增加avail idx和free desc */ still_empty = (vi->num == 0); virtnet_napi_enable(vi); /* In theory, this can happen: if we don't get any buffers in * we will *never* try to fill again. */ if (still_empty) queue_delayed_work(vi->st_wq, &vi->refill, HZ/2); } static void receive_buf(struct net_device *dev, void *buf, unsigned int len) { struct virtnet_info *vi = netdev_priv(dev); struct sk_buff *skb; struct page *page; struct skb_vnet_hdr *hdr; if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) { /* 非法报文 */ pr_debug("%s: short packet %i\n", dev->name, len); dev->stats.rx_length_errors++; if (vi->mergeable_rx_bufs || vi->big_packets) give_pages(vi, buf); else dev_kfree_skb(buf); return; } if (!vi->mergeable_rx_bufs && !vi->big_packets) { /* skb单个page,此时data[i]里保存的就是skb指针,skb数据页已经被写入 */ skb = buf; len -= sizeof(struct virtio_net_hdr); /* virtio_net_hdr保存在数据页首部 */ skb_trim(skb, len); /* 得到skb结构体 */ } else { page = buf; /* skb有多个page,此时buf是首页的指针,需要重新创建skb */ skb = page_to_skb(vi, page, len); if (unlikely(!skb)) { dev->stats.rx_dropped++; give_pages(vi, page); return; } if (vi->mergeable_rx_bufs) if (receive_mergeable(vi, skb)) { /* 继续接收剩余的page,存入skb shinfo的frags里 */ dev_kfree_skb(skb); return; } } hdr = skb_vnet_hdr(skb); skb->truesize += skb->data_len; dev->stats.rx_bytes += skb->len; dev->stats.rx_packets++; if (hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { pr_debug("Needs csum!\n"); if (!skb_partial_csum_set(skb, hdr->hdr.csum_start, hdr->hdr.csum_offset)) goto frame_err; } else if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID) { skb->ip_summed = CHECKSUM_UNNECESSARY; } skb->protocol = eth_type_trans(skb, dev); pr_debug("Receiving skb proto 0x%04x len %i type %i\n", ntohs(skb->protocol), skb->len, skb->pkt_type); if (hdr->hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) { pr_debug("GSO!\n"); switch (hdr->hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) { case VIRTIO_NET_HDR_GSO_TCPV4: skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; break; case VIRTIO_NET_HDR_GSO_UDP: skb_shinfo(skb)->gso_type = SKB_GSO_UDP; break; case VIRTIO_NET_HDR_GSO_TCPV6: skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6; break; default: if (net_ratelimit()) printk(KERN_WARNING "%s: bad gso type %u.\n", dev->name, hdr->hdr.gso_type); goto frame_err; } if (hdr->hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN) skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; skb_shinfo(skb)->gso_size = hdr->hdr.gso_size; if (skb_shinfo(skb)->gso_size == 0) { if (net_ratelimit()) printk(KERN_WARNING "%s: zero gso size.\n", dev->name); goto frame_err; } /* Header must be checked, and gso_segs computed. */ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; skb_shinfo(skb)->gso_segs = 0; } netif_receive_skb(skb); /* 完成skb元数据的修改,让内核接收该报文 */ return; frame_err: dev->stats.rx_frame_errors++; dev_kfree_skb(skb); }virtnet_poll是napi真正的接收函数
static int virtnet_poll(struct napi_struct *napi, int budget) { struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi); void *buf; unsigned int len, received = 0; again: while (received < budget && (buf = virtqueue_get_buf(vi->rvq, &len)) != NULL) { /* virtqueue_get_buf返回avail传入的token,即skb或者报文首page的指针 */ receive_buf(vi->dev, buf, len); /* 构造skb并交给内核协议栈 */ --vi->num; /* 接收队列空闲个数递减 */ received++; } if (vi->num < vi->max / 2) { /* 如果接收队列空闲不够,重新refill */ if (!try_fill_recv(vi, GFP_ATOMIC)) queue_delayed_work(vi->st_wq, &vi->refill, 0); } /* Out of packets? */ if (received < budget) { napi_complete(napi); if (unlikely(!virtqueue_enable_cb(vi->rvq)) && napi_schedule_prep(napi)) { virtqueue_disable_cb(vi->rvq); __napi_schedule(napi); goto again; } } return received; }对于后端的处理,vhost部分之前已经分析过了,本篇分析下后端是qemu的场景,其中virtio_net_device_realize函数在初始化队列时,已经初始化好了TX, RX, CTRL队列的回调函数
static void virtio_net_device_realize(DeviceState *dev, Error **errp) { VirtIODevice *vdev = VIRTIO_DEVICE(dev); VirtIONet *n = VIRTIO_NET(dev); NetClientState *nc; int i; virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, n->config_size); n->max_queues = MAX(n->nic_conf.peers.queues, 1); n->vqs = g_malloc0(sizeof(VirtIONetQueue) * n->max_queues); n->vqs[0].rx_vq = virtio_add_queue(vdev, 256, virtio_net_handle_rx); /* qemu RX发给guest之后的队列回调函数 */ n->curr_queues = 1; n->vqs[0].n = n; n->tx_timeout = n->net_conf.txtimer; if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) { n->vqs[0].tx_vq = virtio_add_queue(vdev, 256, virtio_net_handle_tx_timer); n->vqs[0].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, virtio_net_tx_timer, &n->vqs[0]); } else { n->vqs[0].tx_vq = virtio_add_queue(vdev, 256, virtio_net_handle_tx_bh); /* guest TX到qemu之后的队列回调函数 */ n->vqs[0].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[0]); } n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl); ... }对于收包而言,由NetClientState,e.g. tun对应的socket来触发,调用到virtio_net_receive
static NetClientInfo net_virtio_info = { .type = NET_CLIENT_OPTIONS_KIND_NIC, .size = sizeof(NICState), .can_receive = virtio_net_can_receive, .receive = virtio_net_receive, .cleanup = virtio_net_cleanup, .link_status_changed = virtio_net_set_link_status, .query_rx_filter = virtio_net_query_rxfilter, }; static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t size) { VirtIONet *n = qemu_get_nic_opaque(nc); VirtIONetQueue *q = virtio_net_get_subqueue(nc); VirtIODevice *vdev = VIRTIO_DEVICE(n); struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE]; struct virtio_net_hdr_mrg_rxbuf mhdr; unsigned mhdr_cnt = 0; size_t offset, i, guest_offset; if (!virtio_net_can_receive(nc)) { return -1; } /* hdr_len refers to the header we supply to the guest */ if (!virtio_net_has_buffers(q, size + n->guest_hdr_len - n->host_hdr_len)) { return 0; } if (!receive_filter(n, buf, size)) return size; offset = i = 0; while (offset < size) { VirtQueueElement elem; int len, total; const struct iovec *sg = elem.in_sg; total = 0; if (virtqueue_pop(q->rx_vq, &elem) == 0) { /* 从avail ring中取得可用buffer */ if (i == 0) return -1; error_report("virtio-net unexpected empty queue: " "i %zd mergeable %d offset %zd, size %zd, " "guest hdr len %zd, host hdr len %zd guest features 0x%x", i, n->mergeable_rx_bufs, offset, size, n->guest_hdr_len, n->host_hdr_len, vdev->guest_features); exit(1); } if (elem.in_num < 1) { error_report("virtio-net receive queue contains no in buffers"); exit(1); } /* 报文内容填充到elem里面 */ if (i == 0) { assert(offset == 0); if (n->mergeable_rx_bufs) { mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg), sg, elem.in_num, offsetof(typeof(mhdr), num_buffers), sizeof(mhdr.num_buffers)); } receive_header(n, sg, elem.in_num, buf, size); offset = n->host_hdr_len; total += n->guest_hdr_len; guest_offset = n->guest_hdr_len; } else { guest_offset = 0; } /* copy in packet. ugh */ len = iov_from_buf(sg, elem.in_num, guest_offset, buf + offset, size - offset); total += len; offset += len; /* If buffers can't be merged, at this point we * must have consumed the complete packet. * Otherwise, drop it. */ if (!n->mergeable_rx_bufs && offset < size) { #if 0 error_report("virtio-net truncated non-mergeable packet: " "i %zd mergeable %d offset %zd, size %zd, " "guest hdr len %zd, host hdr len %zd", i, n->mergeable_rx_bufs, offset, size, n->guest_hdr_len, n->host_hdr_len); #endif return size; } /* signal other side */ virtqueue_fill(q->rx_vq, &elem, total, i++); /* 修改used elem */ } if (mhdr_cnt) { virtio_stw_p(vdev, &mhdr.num_buffers, i); iov_from_buf(mhdr_sg, mhdr_cnt, 0, &mhdr.num_buffers, sizeof mhdr.num_buffers); } virtqueue_flush(q->rx_vq, i); /* 更新used idx */ virtio_notify(vdev, q->rx_vq); /* 通知guest */ return size; }对于发包而言,由guest触发,类似于一个硬中断到来,之后qemu会模拟一个软中断来执行,对应函数为virtio_net_tx_bh
static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq) { VirtIONet *n = VIRTIO_NET(vdev); VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))]; if (unlikely(q->tx_waiting)) { return; } q->tx_waiting = 1; /* This happens when device was stopped but VCPU wasn't. */ if (!vdev->vm_running) { return; } virtio_queue_set_notification(vq, 0); /* 关闭virtqueue notification */ qemu_bh_schedule(q->tx_bh); /* qemu模拟触发软中断 */ } static void virtio_net_tx_bh(void *opaque) { VirtIONetQueue *q = opaque; VirtIONet *n = q->n; VirtIODevice *vdev = VIRTIO_DEVICE(n); int32_t ret; /* This happens when device was stopped but BH wasn't. */ if (!vdev->vm_running) { /* Make sure tx waiting is set, so we'll run when restarted. */ assert(q->tx_waiting); return; } q->tx_waiting = 0; /* Just in case the driver is not ready on more */ if (unlikely(!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK))) { return; } ret = virtio_net_flush_tx(q); /* 实际发送报文的函数 */ if (ret == -EBUSY) { return; /* Notification re-enable handled by tx_complete */ } /* If we flush a full burst of packets, assume there are * more coming and immediately reschedule */ if (ret >= n->tx_burst) { qemu_bh_schedule(q->tx_bh); q->tx_waiting = 1; return; } /* If less than a full burst, re-enable notification and flush * anything that may have come in while we weren't looking. If * we find something, assume the guest is still active and reschedule */ virtio_queue_set_notification(q->tx_vq, 1); if (virtio_net_flush_tx(q) > 0) { virtio_queue_set_notification(q->tx_vq, 0); qemu_bh_schedule(q->tx_bh); q->tx_waiting = 1; } }virtio_net_flush_tx是真正发送报文的函数
static int32_t virtio_net_flush_tx(VirtIONetQueue *q) { VirtIONet *n = q->n; VirtIODevice *vdev = VIRTIO_DEVICE(n); VirtQueueElement elem; int32_t num_packets = 0; int queue_index = vq2q(virtio_get_queue_index(q->tx_vq)); if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) { return num_packets; } if (q->async_tx.elem.out_num) { virtio_queue_set_notification(q->tx_vq, 0); return num_packets; } while (virtqueue_pop(q->tx_vq, &elem)) { /* 从avail ring中获得要发送的报文内容 */ ssize_t ret, len; unsigned int out_num = elem.out_num; struct iovec *out_sg = &elem.out_sg[0]; struct iovec sg[VIRTQUEUE_MAX_SIZE]; if (out_num < 1) { error_report("virtio-net header not in first element"); exit(1); } if (n->has_vnet_hdr) { if (out_sg[0].iov_len < n->guest_hdr_len) { error_report("virtio-net header incorrect"); exit(1); } virtio_net_hdr_swap(vdev, (void *) out_sg[0].iov_base); } /* * If host wants to see the guest header as is, we can * pass it on unchanged. Otherwise, copy just the parts * that host is interested in. */ assert(n->host_hdr_len <= n->guest_hdr_len); if (n->host_hdr_len != n->guest_hdr_len) { unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg), out_sg, out_num, 0, n->host_hdr_len); sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num, out_sg, out_num, n->guest_hdr_len, -1); out_num = sg_num; out_sg = sg; } len = n->guest_hdr_len; ret = qemu_sendv_packet_async(qemu_get_subqueue(n->nic, queue_index), /* 通过qemu nic发送 */ out_sg, out_num, virtio_net_tx_complete); if (ret == 0) { virtio_queue_set_notification(q->tx_vq, 0); q->async_tx.elem = elem; q->async_tx.len = len; return -EBUSY; } len += ret; virtqueue_push(q->tx_vq, &elem, 0); virtio_notify(vdev, q->tx_vq); if (++num_packets >= n->tx_burst) { break; } } return num_packets; }