本文以virtio网络驱动为例,分析virtio驱动是如何处理网络IO的,驱动的定义如下
static struct virtio_device_id id_table[] = {
{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
{ 0 },
};
static unsigned int features[] = {
VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM,
VIRTIO_NET_F_GSO, VIRTIO_NET_F_MAC,
VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6,
VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6,
VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO,
VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ,
VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN,
};
static struct virtio_driver virtio_net_driver = {
.feature_table = features,
.feature_table_size = ARRAY_SIZE(features),
.driver.name = KBUILD_MODNAME,
.driver.owner = THIS_MODULE,
.id_table = id_table,
.probe = virtnet_probe,
.remove = __devexit_p(virtnet_remove),
.config_changed = virtnet_config_changed,
#ifdef CONFIG_PM
.freeze = virtnet_freeze,
.restore = virtnet_restore,
#endif
};
static int __init init(void)
{
return register_virtio_driver(&virtio_net_driver); /* 调用driver_register */
}
static void __exit fini(void)
{
unregister_virtio_driver(&virtio_net_driver); /* 调用driver_unregister */
}
virtnet_probe用于pci总线发现virtio net设备
struct virtnet_info
{
struct virtio_device *vdev;
struct virtqueue *rvq, *svq, *cvq;
struct net_device *dev;
struct napi_struct napi;
/*
* Upstream uses the system_nrt workqueue; RHEL6 doesn't have
* that, so we create a singlethread wq.
*/
struct workqueue_struct *st_wq;
unsigned int status;
/* Number of input buffers, and max we've ever had. */
unsigned int num, max;
/* I like... big packets and I cannot lie! */
bool big_packets;
/* Host will merge rx buffers for big packets (shake it! shake it!) */
bool mergeable_rx_bufs;
/* Work struct for refilling if we run low on memory. */
struct delayed_work refill;
/* Chain pages by the private ptr. */
struct page *pages;
};
static const struct net_device_ops virtnet_netdev = {
.ndo_open = virtnet_open,
.ndo_stop = virtnet_close,
.ndo_start_xmit = start_xmit,
.ndo_validate_addr = eth_validate_addr,
.ndo_set_mac_address = virtnet_set_mac_address,
.ndo_set_rx_mode = virtnet_set_rx_mode,
.ndo_change_mtu = virtnet_change_mtu,
.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = virtnet_netpoll,
#endif
};
static const struct ethtool_ops virtnet_ethtool_ops = {
.set_tx_csum = virtnet_set_tx_csum,
.set_sg = ethtool_op_set_sg,
.set_tso = ethtool_op_set_tso,
.set_ufo = ethtool_op_set_ufo,
.get_link = ethtool_op_get_link,
};
static int virtnet_probe(struct virtio_device *vdev)
{
int err;
struct net_device *dev;
struct virtnet_info *vi;
/* Allocate ourselves a network device with room for our info */
dev = alloc_etherdev(sizeof(struct virtnet_info));
if (!dev)
return -ENOMEM;
/* Set up network device as normal. */
dev->netdev_ops = &virtnet_netdev;
dev->features = NETIF_F_HIGHDMA;
SET_ETHTOOL_OPS(dev, &virtnet_ethtool_ops);
SET_NETDEV_DEV(dev, &vdev->dev);
/* Do we support "hardware" checksums? */
if (csum && virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
/* This opens up the world of extra features. */
dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
if (gso && virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
dev->features |= NETIF_F_TSO | NETIF_F_UFO
| NETIF_F_TSO_ECN | NETIF_F_TSO6;
}
/* Individual feature bits: what can host handle? */
if (gso && virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
dev->features |= NETIF_F_TSO;
if (gso && virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
dev->features |= NETIF_F_TSO6;
if (gso && virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
dev->features |= NETIF_F_TSO_ECN;
if (gso && virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO))
dev->features |= NETIF_F_UFO;
}
/* Configuration may specify what MAC to use. Otherwise random. */
if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
vdev->config->get(vdev,
offsetof(struct virtio_net_config, mac),
dev->dev_addr, dev->addr_len);
} else
random_ether_addr(dev->dev_addr);
/* Set up our device-specific information */
vi = netdev_priv(dev);
netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight); /* 初始化virtnet_info->napi为virtnet_poll */
vi->dev = dev;
vi->vdev = vdev;
vdev->priv = vi;
vi->pages = NULL;
vi->st_wq = create_singlethread_workqueue("virtio-net");
if (!vi->st_wq) {
/* Can't get a precise err from function above */
err = -ENOMEM;
goto free;
}
INIT_DELAYED_WORK(&vi->refill, refill_work); /* refill_work用来补充收包的buffer */
/* If we can receive ANY GSO packets, we must allocate large ones. */
if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4)
|| virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)
|| virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN))
vi->big_packets = true;
if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
vi->mergeable_rx_bufs = true;
err = init_vqs(vi); /* 初始化TX, RX, CTRL三个virtqueue, 其中skb_recv_done, skb_xmit_done为RX,TX完成后的回调函数 */
if (err)
goto free_wq;
err = register_netdev(dev);
if (err) {
pr_debug("virtio_net: registering device failed\n");
goto free_vqs;
}
/* Last of all, set up some receive buffers. */
try_fill_recv(vi, GFP_KERNEL); /* recv virtqueue里调用add_buf,为收包准备好buffer */
/* If we didn't even get one input buffer, we're useless. */
if (vi->num == 0) {
err = -ENOMEM;
goto unregister;
}
/* Assume link up if device can't report link status,
otherwise get link status from config. */
if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
netif_carrier_off(dev);
virtnet_update_status(vi);
} else {
vi->status = VIRTIO_NET_S_LINK_UP;
netif_carrier_on(dev);
}
pr_debug("virtnet: registered device %s\n", dev->name);
return 0;
unregister:
unregister_netdev(dev);
free_vqs:
vdev->config->del_vqs(vdev);
free_wq:
destroy_workqueue(vi->st_wq);
free:
free_netdev(dev);
return err;
}
下面来看看virtnet_dev的net_device_ops,
static const struct net_device_ops virtnet_netdev = {
.ndo_open = virtnet_open,
.ndo_stop = virtnet_close,
.ndo_start_xmit = start_xmit,
.ndo_validate_addr = eth_validate_addr,
.ndo_set_mac_address = virtnet_set_mac_address,
.ndo_set_rx_mode = virtnet_set_rx_mode,
.ndo_change_mtu = virtnet_change_mtu,
.ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = virtnet_netpoll,
#endif
};
static int virtnet_open(struct net_device *dev)
{
struct virtnet_info *vi = netdev_priv(dev);
/* Make sure we have some buffers: if oom use wq. */
if (!try_fill_recv(vi, GFP_KERNEL))
queue_delayed_work(vi->st_wq, &vi->refill, 0); /* try_fill_recv为0,唤醒workqueue执行refill_work增加收方向buffer */
virtnet_napi_enable(vi); /* enable napi函数virtnet_poll */
return 0;
}
static int virtnet_close(struct net_device *dev)
{
struct virtnet_info *vi = netdev_priv(dev);
/* Make sure refill_work doesn't re-enable napi! */
cancel_delayed_work_sync(&vi->refill); /* 关闭refill_work的workqueue */
napi_disable(&vi->napi); /* 关闭napi */
return 0;
}
static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
struct virtnet_info *vi = netdev_priv(dev);
struct virtio_device *vdev = vi->vdev;
struct sockaddr *addr = p;
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
memcpy(dev->dev_addr, addr->sa_data, ETH_ALEN);
dev->addr_assign_type &= ~NET_ADDR_RANDOM;
if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC))
vdev->config->set(vdev, offsetof(struct virtio_net_config, mac),
dev->dev_addr, dev->addr_len); /* 配置virtio配置空间设置mac地址 */
return 0;
}
start_xmit是发送函数,可以看出整个发送过程中,skb的数据都是零拷贝的方式交给后端
tatic unsigned int free_old_xmit_skbs(struct virtnet_info *vi)
{
struct sk_buff *skb;
unsigned int len, tot_sgs = 0;
while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) { /* flush发送virtqueue的所有skb */
pr_debug("Sent skb %p\n", skb);
vi->dev->stats.tx_bytes += skb->len;
vi->dev->stats.tx_packets++;
tot_sgs += skb_vnet_hdr(skb)->num_sg;
dev_kfree_skb_any(skb);
}
return tot_sgs;
}
static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
{
struct scatterlist sg[2+MAX_SKB_FRAGS];
struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;
sg_init_table(sg, 2+MAX_SKB_FRAGS);
pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);
if (skb->ip_summed == CHECKSUM_PARTIAL) { /* 构造vnet_hdr的checksum部分 */
hdr->hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
hdr->hdr.csum_start = skb->csum_start - skb_headroom(skb);
hdr->hdr.csum_offset = skb->csum_offset;
} else {
hdr->hdr.flags = 0;
hdr->hdr.csum_offset = hdr->hdr.csum_start = 0;
}
if (skb_is_gso(skb)) { /* 构造vnet_hdr的gso部分 */
hdr->hdr.hdr_len = skb_headlen(skb);
hdr->hdr.gso_size = skb_shinfo(skb)->gso_size;
if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
else
BUG();
if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN)
hdr->hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
} else {
hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
hdr->hdr.gso_size = hdr->hdr.hdr_len = 0;
}
hdr->mhdr.num_buffers = 0;
/* Encode metadata header at front. */
if (vi->mergeable_rx_bufs)
sg_set_buf(sg, &hdr->mhdr, sizeof hdr->mhdr);
else
sg_set_buf(sg, &hdr->hdr, sizeof hdr->hdr);
hdr->num_sg = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; /* 构造skb的scatterlist结构体 */
return virtqueue_add_buf(vi->svq, sg, hdr->num_sg, 0, skb); /* 把sg加入到free desc里面,增加avail idx */
}
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct virtnet_info *vi = netdev_priv(dev);
int capacity;
again:
/* Free up any pending old buffers before queueing new ones. */
free_old_xmit_skbs(vi);
/* Try to transmit */
capacity = xmit_skb(vi, skb);
/* This can happen with OOM and indirect buffers. */
if (unlikely(capacity < 0)) { /* virtqueue_add_buf返回-ENOSPC,此时发送队列空间不足 */
netif_stop_queue(dev);
dev_warn(&dev->dev, "Unexpected full queue\n");
if (unlikely(!virtqueue_enable_cb(vi->svq))) { /* virtqueue_enable_cb返回false,表明used idx有变化,此时可以重试 */
virtqueue_disable_cb(vi->svq);
netif_start_queue(dev);
goto again;
}
return NETDEV_TX_BUSY; /* 发送队列满,返回NETDEV_TX_BUSY */
}
virtqueue_kick(vi->svq); /* 通知后端 */
/* Don't wait up for transmitted skbs to be freed. */
skb_orphan(skb);
nf_reset(skb);
/* Apparently nice girls don't return TX_BUSY; stop the queue
* before it gets out of hand. Naturally, this wastes entries. */
if (capacity < 2+MAX_SKB_FRAGS) {
netif_stop_queue(dev);
if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) {
/* More just got used, free them then recheck. */
capacity += free_old_xmit_skbs(vi);
if (capacity >= 2+MAX_SKB_FRAGS) {
netif_start_queue(dev);
virtqueue_disable_cb(vi->svq);
}
}
}
return NETDEV_TX_OK;
}
下面来看收包,其中napi的回调函数是virtnet_poll,refill_work作为workqueue的函数用于循环refill接收buffer
static void refill_work(struct work_struct *work)
{
struct virtnet_info *vi;
bool still_empty;
vi = container_of(work, struct virtnet_info, refill.work);
napi_disable(&vi->napi);
try_fill_recv(vi, GFP_KERNEL); /* 补充buffer用于接收报文,即增加avail idx和free desc */
still_empty = (vi->num == 0);
virtnet_napi_enable(vi);
/* In theory, this can happen: if we don't get any buffers in
* we will *never* try to fill again. */
if (still_empty)
queue_delayed_work(vi->st_wq, &vi->refill, HZ/2);
}
static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
{
struct virtnet_info *vi = netdev_priv(dev);
struct sk_buff *skb;
struct page *page;
struct skb_vnet_hdr *hdr;
if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) { /* 非法报文 */
pr_debug("%s: short packet %i\n", dev->name, len);
dev->stats.rx_length_errors++;
if (vi->mergeable_rx_bufs || vi->big_packets)
give_pages(vi, buf);
else
dev_kfree_skb(buf);
return;
}
if (!vi->mergeable_rx_bufs && !vi->big_packets) { /* skb单个page,此时data[i]里保存的就是skb指针,skb数据页已经被写入 */
skb = buf;
len -= sizeof(struct virtio_net_hdr); /* virtio_net_hdr保存在数据页首部 */
skb_trim(skb, len); /* 得到skb结构体 */
} else {
page = buf; /* skb有多个page,此时buf是首页的指针,需要重新创建skb */
skb = page_to_skb(vi, page, len);
if (unlikely(!skb)) {
dev->stats.rx_dropped++;
give_pages(vi, page);
return;
}
if (vi->mergeable_rx_bufs)
if (receive_mergeable(vi, skb)) { /* 继续接收剩余的page,存入skb shinfo的frags里 */
dev_kfree_skb(skb);
return;
}
}
hdr = skb_vnet_hdr(skb);
skb->truesize += skb->data_len;
dev->stats.rx_bytes += skb->len;
dev->stats.rx_packets++;
if (hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
pr_debug("Needs csum!\n");
if (!skb_partial_csum_set(skb,
hdr->hdr.csum_start,
hdr->hdr.csum_offset))
goto frame_err;
} else if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
}
skb->protocol = eth_type_trans(skb, dev);
pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
ntohs(skb->protocol), skb->len, skb->pkt_type);
if (hdr->hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
pr_debug("GSO!\n");
switch (hdr->hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
case VIRTIO_NET_HDR_GSO_TCPV4:
skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
break;
case VIRTIO_NET_HDR_GSO_UDP:
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
break;
case VIRTIO_NET_HDR_GSO_TCPV6:
skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
break;
default:
if (net_ratelimit())
printk(KERN_WARNING "%s: bad gso type %u.\n",
dev->name, hdr->hdr.gso_type);
goto frame_err;
}
if (hdr->hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;
skb_shinfo(skb)->gso_size = hdr->hdr.gso_size;
if (skb_shinfo(skb)->gso_size == 0) {
if (net_ratelimit())
printk(KERN_WARNING "%s: zero gso size.\n",
dev->name);
goto frame_err;
}
/* Header must be checked, and gso_segs computed. */
skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
skb_shinfo(skb)->gso_segs = 0;
}
netif_receive_skb(skb); /* 完成skb元数据的修改,让内核接收该报文 */
return;
frame_err:
dev->stats.rx_frame_errors++;
dev_kfree_skb(skb);
}
virtnet_poll是napi真正的接收函数
static int virtnet_poll(struct napi_struct *napi, int budget)
{
struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
void *buf;
unsigned int len, received = 0;
again:
while (received < budget &&
(buf = virtqueue_get_buf(vi->rvq, &len)) != NULL) { /* virtqueue_get_buf返回avail传入的token,即skb或者报文首page的指针 */
receive_buf(vi->dev, buf, len); /* 构造skb并交给内核协议栈 */
--vi->num; /* 接收队列空闲个数递减 */
received++;
}
if (vi->num < vi->max / 2) { /* 如果接收队列空闲不够,重新refill */
if (!try_fill_recv(vi, GFP_ATOMIC))
queue_delayed_work(vi->st_wq, &vi->refill, 0);
}
/* Out of packets? */
if (received < budget) {
napi_complete(napi);
if (unlikely(!virtqueue_enable_cb(vi->rvq)) &&
napi_schedule_prep(napi)) {
virtqueue_disable_cb(vi->rvq);
__napi_schedule(napi);
goto again;
}
}
return received;
}
对于后端的处理,vhost部分之前已经分析过了,本篇分析下后端是qemu的场景,其中virtio_net_device_realize函数在初始化队列时,已经初始化好了TX, RX, CTRL队列的回调函数
static void virtio_net_device_realize(DeviceState *dev, Error **errp)
{
VirtIODevice *vdev = VIRTIO_DEVICE(dev);
VirtIONet *n = VIRTIO_NET(dev);
NetClientState *nc;
int i;
virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, n->config_size);
n->max_queues = MAX(n->nic_conf.peers.queues, 1);
n->vqs = g_malloc0(sizeof(VirtIONetQueue) * n->max_queues);
n->vqs[0].rx_vq = virtio_add_queue(vdev, 256, virtio_net_handle_rx); /* qemu RX发给guest之后的队列回调函数 */
n->curr_queues = 1;
n->vqs[0].n = n;
n->tx_timeout = n->net_conf.txtimer;
if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
n->vqs[0].tx_vq = virtio_add_queue(vdev, 256,
virtio_net_handle_tx_timer);
n->vqs[0].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, virtio_net_tx_timer,
&n->vqs[0]);
} else {
n->vqs[0].tx_vq = virtio_add_queue(vdev, 256,
virtio_net_handle_tx_bh); /* guest TX到qemu之后的队列回调函数 */
n->vqs[0].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[0]);
}
n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl);
...
}
对于收包而言,由NetClientState,e.g. tun对应的socket来触发,调用到virtio_net_receive
static NetClientInfo net_virtio_info = {
.type = NET_CLIENT_OPTIONS_KIND_NIC,
.size = sizeof(NICState),
.can_receive = virtio_net_can_receive,
.receive = virtio_net_receive,
.cleanup = virtio_net_cleanup,
.link_status_changed = virtio_net_set_link_status,
.query_rx_filter = virtio_net_query_rxfilter,
};
static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t size)
{
VirtIONet *n = qemu_get_nic_opaque(nc);
VirtIONetQueue *q = virtio_net_get_subqueue(nc);
VirtIODevice *vdev = VIRTIO_DEVICE(n);
struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
struct virtio_net_hdr_mrg_rxbuf mhdr;
unsigned mhdr_cnt = 0;
size_t offset, i, guest_offset;
if (!virtio_net_can_receive(nc)) {
return -1;
}
/* hdr_len refers to the header we supply to the guest */
if (!virtio_net_has_buffers(q, size + n->guest_hdr_len - n->host_hdr_len)) {
return 0;
}
if (!receive_filter(n, buf, size))
return size;
offset = i = 0;
while (offset < size) {
VirtQueueElement elem;
int len, total;
const struct iovec *sg = elem.in_sg;
total = 0;
if (virtqueue_pop(q->rx_vq, &elem) == 0) { /* 从avail ring中取得可用buffer */
if (i == 0)
return -1;
error_report("virtio-net unexpected empty queue: "
"i %zd mergeable %d offset %zd, size %zd, "
"guest hdr len %zd, host hdr len %zd guest features 0x%x",
i, n->mergeable_rx_bufs, offset, size,
n->guest_hdr_len, n->host_hdr_len, vdev->guest_features);
exit(1);
}
if (elem.in_num < 1) {
error_report("virtio-net receive queue contains no in buffers");
exit(1);
}
/* 报文内容填充到elem里面 */
if (i == 0) {
assert(offset == 0);
if (n->mergeable_rx_bufs) {
mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg),
sg, elem.in_num,
offsetof(typeof(mhdr), num_buffers),
sizeof(mhdr.num_buffers));
}
receive_header(n, sg, elem.in_num, buf, size);
offset = n->host_hdr_len;
total += n->guest_hdr_len;
guest_offset = n->guest_hdr_len;
} else {
guest_offset = 0;
}
/* copy in packet. ugh */
len = iov_from_buf(sg, elem.in_num, guest_offset,
buf + offset, size - offset);
total += len;
offset += len;
/* If buffers can't be merged, at this point we
* must have consumed the complete packet.
* Otherwise, drop it. */
if (!n->mergeable_rx_bufs && offset < size) {
#if 0
error_report("virtio-net truncated non-mergeable packet: "
"i %zd mergeable %d offset %zd, size %zd, "
"guest hdr len %zd, host hdr len %zd",
i, n->mergeable_rx_bufs,
offset, size, n->guest_hdr_len, n->host_hdr_len);
#endif
return size;
}
/* signal other side */
virtqueue_fill(q->rx_vq, &elem, total, i++); /* 修改used elem */
}
if (mhdr_cnt) {
virtio_stw_p(vdev, &mhdr.num_buffers, i);
iov_from_buf(mhdr_sg, mhdr_cnt,
0,
&mhdr.num_buffers, sizeof mhdr.num_buffers);
}
virtqueue_flush(q->rx_vq, i); /* 更新used idx */
virtio_notify(vdev, q->rx_vq); /* 通知guest */
return size;
}
对于发包而言,由guest触发,类似于一个硬中断到来,之后qemu会模拟一个软中断来执行,对应函数为virtio_net_tx_bh
static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq)
{
VirtIONet *n = VIRTIO_NET(vdev);
VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
if (unlikely(q->tx_waiting)) {
return;
}
q->tx_waiting = 1;
/* This happens when device was stopped but VCPU wasn't. */
if (!vdev->vm_running) {
return;
}
virtio_queue_set_notification(vq, 0); /* 关闭virtqueue notification */
qemu_bh_schedule(q->tx_bh); /* qemu模拟触发软中断 */
}
static void virtio_net_tx_bh(void *opaque)
{
VirtIONetQueue *q = opaque;
VirtIONet *n = q->n;
VirtIODevice *vdev = VIRTIO_DEVICE(n);
int32_t ret;
/* This happens when device was stopped but BH wasn't. */
if (!vdev->vm_running) {
/* Make sure tx waiting is set, so we'll run when restarted. */
assert(q->tx_waiting);
return;
}
q->tx_waiting = 0;
/* Just in case the driver is not ready on more */
if (unlikely(!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK))) {
return;
}
ret = virtio_net_flush_tx(q); /* 实际发送报文的函数 */
if (ret == -EBUSY) {
return; /* Notification re-enable handled by tx_complete */
}
/* If we flush a full burst of packets, assume there are
* more coming and immediately reschedule */
if (ret >= n->tx_burst) {
qemu_bh_schedule(q->tx_bh);
q->tx_waiting = 1;
return;
}
/* If less than a full burst, re-enable notification and flush
* anything that may have come in while we weren't looking. If
* we find something, assume the guest is still active and reschedule */
virtio_queue_set_notification(q->tx_vq, 1);
if (virtio_net_flush_tx(q) > 0) {
virtio_queue_set_notification(q->tx_vq, 0);
qemu_bh_schedule(q->tx_bh);
q->tx_waiting = 1;
}
}
virtio_net_flush_tx是真正发送报文的函数
static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
{
VirtIONet *n = q->n;
VirtIODevice *vdev = VIRTIO_DEVICE(n);
VirtQueueElement elem;
int32_t num_packets = 0;
int queue_index = vq2q(virtio_get_queue_index(q->tx_vq));
if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
return num_packets;
}
if (q->async_tx.elem.out_num) {
virtio_queue_set_notification(q->tx_vq, 0);
return num_packets;
}
while (virtqueue_pop(q->tx_vq, &elem)) { /* 从avail ring中获得要发送的报文内容 */
ssize_t ret, len;
unsigned int out_num = elem.out_num;
struct iovec *out_sg = &elem.out_sg[0];
struct iovec sg[VIRTQUEUE_MAX_SIZE];
if (out_num < 1) {
error_report("virtio-net header not in first element");
exit(1);
}
if (n->has_vnet_hdr) {
if (out_sg[0].iov_len < n->guest_hdr_len) {
error_report("virtio-net header incorrect");
exit(1);
}
virtio_net_hdr_swap(vdev, (void *) out_sg[0].iov_base);
}
/*
* If host wants to see the guest header as is, we can
* pass it on unchanged. Otherwise, copy just the parts
* that host is interested in.
*/
assert(n->host_hdr_len <= n->guest_hdr_len);
if (n->host_hdr_len != n->guest_hdr_len) {
unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg),
out_sg, out_num,
0, n->host_hdr_len);
sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num,
out_sg, out_num,
n->guest_hdr_len, -1);
out_num = sg_num;
out_sg = sg;
}
len = n->guest_hdr_len;
ret = qemu_sendv_packet_async(qemu_get_subqueue(n->nic, queue_index), /* 通过qemu nic发送 */
out_sg, out_num, virtio_net_tx_complete);
if (ret == 0) {
virtio_queue_set_notification(q->tx_vq, 0);
q->async_tx.elem = elem;
q->async_tx.len = len;
return -EBUSY;
}
len += ret;
virtqueue_push(q->tx_vq, &elem, 0);
virtio_notify(vdev, q->tx_vq);
if (++num_packets >= n->tx_burst) {
break;
}
}
return num_packets;
}