virtio network驱动分析

本文以virtio网络驱动为例,分析virtio驱动是如何处理网络IO的,驱动的定义如下

/*
 * Device match table for this driver: matches VIRTIO_ID_NET with a
 * wildcard (VIRTIO_DEV_ANY_ID) in the second field. The { 0 } entry
 * terminates the table. Referenced by virtio_net_driver.id_table.
 */
static struct virtio_device_id id_table[] = {
    { VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
    { 0 },
};

/*
 * Feature bits listed by the driver; handed to the virtio core through
 * virtio_net_driver.feature_table / feature_table_size.
 */
static unsigned int features[] = {
    VIRTIO_NET_F_CSUM,
    VIRTIO_NET_F_GUEST_CSUM,
    VIRTIO_NET_F_GSO,
    VIRTIO_NET_F_MAC,
    VIRTIO_NET_F_HOST_TSO4,
    VIRTIO_NET_F_HOST_UFO,
    VIRTIO_NET_F_HOST_TSO6,
    VIRTIO_NET_F_HOST_ECN,
    VIRTIO_NET_F_GUEST_TSO4,
    VIRTIO_NET_F_GUEST_TSO6,
    VIRTIO_NET_F_GUEST_ECN,
    VIRTIO_NET_F_GUEST_UFO,
    VIRTIO_NET_F_MRG_RXBUF,
    VIRTIO_NET_F_STATUS,
    VIRTIO_NET_F_CTRL_VQ,
    VIRTIO_NET_F_CTRL_RX,
    VIRTIO_NET_F_CTRL_VLAN,
};

static struct virtio_driver virtio_net_driver = {
    .feature_table = features,
    .feature_table_size = ARRAY_SIZE(features),
    .driver.name =  KBUILD_MODNAME,
    .driver.owner = THIS_MODULE,
    .id_table = id_table,
    .probe =    virtnet_probe,
    .remove =   __devexit_p(virtnet_remove),
    .config_changed = virtnet_config_changed,
#ifdef CONFIG_PM
    .freeze =   virtnet_freeze,
    .restore =  virtnet_restore,
#endif
};

/*
 * Module entry point: register the virtio-net driver.
 * Returns whatever register_virtio_driver() returns.
 */
static int __init init(void)
{
    int rc;

    /* Per the original annotation this ends up in driver_register(). */
    rc = register_virtio_driver(&virtio_net_driver);
    return rc;
}

/* Module exit point: unregister the virtio-net driver. */
static void __exit fini(void)
{
    unregister_virtio_driver(&virtio_net_driver);  /* per the original annotation, ends up in driver_unregister() */
}
virtnet_probe是virtio_net_driver的probe回调:当virtio总线上发现并匹配到virtio net设备时被调用,完成网络设备的初始化。相关的数据结构和实现如下

/*
 * Per-device private state of the virtio-net driver. It lives in the
 * private area of the net_device (see netdev_priv() in virtnet_probe()).
 */
struct virtnet_info
{
    /* The underlying virtio device. */
    struct virtio_device *vdev;
    /* rvq is polled for received buffers, svq carries transmitted skbs;
     * cvq is presumably the control queue (set up by init_vqs()). */
    struct virtqueue *rvq, *svq, *cvq;
    /* The network device this state belongs to. */
    struct net_device *dev;
    /* NAPI context; its poll function is virtnet_poll(). */
    struct napi_struct napi;
    /*
     * Upstream uses the system_nrt workqueue; RHEL6 doesn't have
     * that, so we create a singlethread wq.
     */
    struct workqueue_struct *st_wq;
    /* Link status; set to VIRTIO_NET_S_LINK_UP when the device cannot
     * report link status itself. */
    unsigned int status;

    /* Number of input buffers, and max we've ever had. */
    unsigned int num, max;

    /* I like... big packets and I cannot lie! */
    bool big_packets;

    /* Host will merge rx buffers for big packets (shake it! shake it!) */
    bool mergeable_rx_bufs;

    /* Work struct for refilling if we run low on memory. */
    struct delayed_work refill;

    /* Chain pages by the private ptr. */
    struct page *pages;
};

/*
 * net_device_ops for the virtio-net device; installed on the netdev in
 * virtnet_probe(). Open/stop, transmit and MAC-address handling map to
 * virtnet_open(), virtnet_close(), start_xmit() and
 * virtnet_set_mac_address() respectively.
 */
static const struct net_device_ops virtnet_netdev = {
    .ndo_open            = virtnet_open,
    .ndo_stop            = virtnet_close,
    .ndo_start_xmit      = start_xmit,
    .ndo_validate_addr   = eth_validate_addr,
    .ndo_set_mac_address = virtnet_set_mac_address,
    .ndo_set_rx_mode     = virtnet_set_rx_mode,
    .ndo_change_mtu      = virtnet_change_mtu,
    .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
    .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
    .ndo_poll_controller = virtnet_netpoll,
#endif
};

/*
 * ethtool operations for the virtio-net device; installed on the netdev
 * in virtnet_probe() via SET_ETHTOOL_OPS(). Only set_tx_csum is
 * driver-specific here; the other entries use generic ethtool_op_* helpers.
 */
static const struct ethtool_ops virtnet_ethtool_ops = {
    .set_tx_csum = virtnet_set_tx_csum,
    .set_sg = ethtool_op_set_sg,
    .set_tso = ethtool_op_set_tso,
    .set_ufo = ethtool_op_set_ufo,
    .get_link = ethtool_op_get_link,
};

/*
 * Probe callback for a virtio network device.
 *
 * Allocates the net_device (with a struct virtnet_info as its private
 * data), derives the netdev feature flags and the MAC address from the
 * virtio features, creates the refill workqueue, sets up the virtqueues,
 * registers the netdev and pre-fills the receive queue.
 *
 * Returns 0 on success or a negative errno. On failure, everything
 * acquired so far is released through the label ladder at the bottom
 * (each label undoes one step and falls through to the next).
 */
static int virtnet_probe(struct virtio_device *vdev)
{
    int err;
    struct net_device *dev;
    struct virtnet_info *vi;

    /* Allocate ourselves a network device with room for our info */
    dev = alloc_etherdev(sizeof(struct virtnet_info));
    if (!dev)
        return -ENOMEM;

    /* Set up network device as normal. */
    dev->netdev_ops = &virtnet_netdev;
    dev->features = NETIF_F_HIGHDMA;
    SET_ETHTOOL_OPS(dev, &virtnet_ethtool_ops);
    SET_NETDEV_DEV(dev, &vdev->dev);

    /* Do we support "hardware" checksums? */
    if (csum && virtio_has_feature(vdev, VIRTIO_NET_F_CSUM)) {
        /* This opens up the world of extra features. */
        dev->features |= NETIF_F_HW_CSUM|NETIF_F_SG|NETIF_F_FRAGLIST;
        if (gso && virtio_has_feature(vdev, VIRTIO_NET_F_GSO)) {
            dev->features |= NETIF_F_TSO | NETIF_F_UFO
                | NETIF_F_TSO_ECN | NETIF_F_TSO6;
        }
        /* Individual feature bits: what can host handle? */
        if (gso && virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO4))
            dev->features |= NETIF_F_TSO;
        if (gso && virtio_has_feature(vdev, VIRTIO_NET_F_HOST_TSO6))
            dev->features |= NETIF_F_TSO6;
        if (gso && virtio_has_feature(vdev, VIRTIO_NET_F_HOST_ECN))
            dev->features |= NETIF_F_TSO_ECN;
        if (gso && virtio_has_feature(vdev, VIRTIO_NET_F_HOST_UFO))
            dev->features |= NETIF_F_UFO;
    }

    /* Configuration may specify what MAC to use.  Otherwise random. */
    if (virtio_has_feature(vdev, VIRTIO_NET_F_MAC)) {
        vdev->config->get(vdev,
                  offsetof(struct virtio_net_config, mac),
                  dev->dev_addr, dev->addr_len);
    } else
        random_ether_addr(dev->dev_addr);

    /* Set up our device-specific information */
    vi = netdev_priv(dev);
    netif_napi_add(dev, &vi->napi, virtnet_poll, napi_weight);  /* register virtnet_poll as the poll function of vi->napi */
    vi->dev = dev;
    vi->vdev = vdev;
    vdev->priv = vi;
    vi->pages = NULL;
    vi->st_wq = create_singlethread_workqueue("virtio-net");
    if (!vi->st_wq) {
        /* Can't get a precise err from function above */
        err = -ENOMEM;
        goto free;
    }
    INIT_DELAYED_WORK(&vi->refill, refill_work); /* refill_work replenishes the receive buffers */

    /* If we can receive ANY GSO packets, we must allocate large ones. */
    if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4)
        || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)
        || virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN))
        vi->big_packets = true;

    if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
        vi->mergeable_rx_bufs = true;

    err = init_vqs(vi);  /* per the original annotation: sets up the TX, RX and CTRL virtqueues, with skb_recv_done / skb_xmit_done as the RX / TX completion callbacks */
    if (err)
        goto free_wq;

    err = register_netdev(dev);
    if (err) {
        pr_debug("virtio_net: registering device failed\n");
        goto free_vqs;
    }

    /* Last of all, set up some receive buffers. */
    try_fill_recv(vi, GFP_KERNEL);  /* per the original annotation: add_buf on the receive virtqueue so buffers are ready for incoming packets */

    /* If we didn't even get one input buffer, we're useless. */
    if (vi->num == 0) {
        err = -ENOMEM;
        goto unregister;
    }

    /* Assume link up if device can't report link status,
       otherwise get link status from config. */
    if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_STATUS)) {
        netif_carrier_off(dev);
        virtnet_update_status(vi);
    } else {
        vi->status = VIRTIO_NET_S_LINK_UP;
        netif_carrier_on(dev);
    }

    pr_debug("virtnet: registered device %s\n", dev->name);
    return 0;

    /* Error unwind: undo the setup steps in reverse order. */
unregister:
    unregister_netdev(dev);
free_vqs:
    vdev->config->del_vqs(vdev);
free_wq:
    destroy_workqueue(vi->st_wq);
free:
    free_netdev(dev);
    return err;
}
下面来看看virtnet_netdev这个net_device_ops中的几个主要回调函数,

/*
 * net_device_ops of the virtio-net device (repeated here for reference).
 * The open, stop and set-MAC callbacks are shown right below; start_xmit
 * is the transmit entry point.
 */
static const struct net_device_ops virtnet_netdev = {
    .ndo_open            = virtnet_open,
    .ndo_stop            = virtnet_close,
    .ndo_start_xmit      = start_xmit,
    .ndo_validate_addr   = eth_validate_addr,
    .ndo_set_mac_address = virtnet_set_mac_address,
    .ndo_set_rx_mode     = virtnet_set_rx_mode,
    .ndo_change_mtu      = virtnet_change_mtu,
    .ndo_vlan_rx_add_vid = virtnet_vlan_rx_add_vid,
    .ndo_vlan_rx_kill_vid = virtnet_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
    .ndo_poll_controller = virtnet_netpoll,
#endif
};

static int virtnet_open(struct net_device *dev)
{
    struct virtnet_info *vi = netdev_priv(dev);

    /* Make sure we have some buffers: if oom use wq. */
    if (!try_fill_recv(vi, GFP_KERNEL))
        queue_delayed_work(vi->st_wq, &vi->refill, 0);  /* try_fill_recv为0,唤醒workqueue执行refill_work增加收方向buffer */

    virtnet_napi_enable(vi);  /* enable napi函数virtnet_poll */
    return 0;
}

/*
 * ndo_stop callback: stop the refill work and disable NAPI.
 * Always returns 0.
 */
static int virtnet_close(struct net_device *dev)
{
    struct virtnet_info *priv = netdev_priv(dev);

    /*
     * The refill work re-enables NAPI when it finishes, so it has to be
     * cancelled (and waited for) before NAPI is disabled here.
     */
    cancel_delayed_work_sync(&priv->refill);

    /* No more polling from now on. */
    napi_disable(&priv->napi);

    return 0;
}

/*
 * ndo_set_mac_address callback.
 *
 * @dev: network device whose address is changed
 * @p:   pointer to a struct sockaddr carrying the new address in sa_data
 *
 * Returns -EADDRNOTAVAIL if the address is not a valid Ethernet address,
 * 0 otherwise.
 */
static int virtnet_set_mac_address(struct net_device *dev, void *p)
{
    struct virtnet_info *priv = netdev_priv(dev);
    struct virtio_device *vd = priv->vdev;
    struct sockaddr *sa = p;

    if (!is_valid_ether_addr(sa->sa_data))
        return -EADDRNOTAVAIL;

    /* Adopt the new address; it is no longer a randomly assigned one. */
    memcpy(dev->dev_addr, sa->sa_data, ETH_ALEN);
    dev->addr_assign_type &= ~NET_ADDR_RANDOM;

    /* Without VIRTIO_NET_F_MAC there is nothing to write back. */
    if (!virtio_has_feature(vd, VIRTIO_NET_F_MAC))
        return 0;

    /* Write the MAC into the device's virtio config space. */
    vd->config->set(vd, offsetof(struct virtio_net_config, mac),
                    dev->dev_addr, dev->addr_len);
    return 0;
}
start_xmit是发送函数,可以看出整个发送过程中,skb的数据都是零拷贝的方式交给后端

/*
 * Reclaim skbs that the send virtqueue has handed back.
 *
 * Every buffer returned by virtqueue_get_buf() on vi->svq is an skb that
 * was queued by xmit_skb(); account it in the TX statistics and free it.
 *
 * Returns the sum of the num_sg values of all skbs that were freed
 * (start_xmit() adds this to its capacity estimate).
 */
static unsigned int free_old_xmit_skbs(struct virtnet_info *vi)
{
    struct sk_buff *skb;
    unsigned int len, tot_sgs = 0;

    /* Drain every completed skb from the send virtqueue. */
    while ((skb = virtqueue_get_buf(vi->svq, &len)) != NULL) {
        pr_debug("Sent skb %p\n", skb);
        vi->dev->stats.tx_bytes += skb->len;
        vi->dev->stats.tx_packets++;
        tot_sgs += skb_vnet_hdr(skb)->num_sg;
        dev_kfree_skb_any(skb);
    }
    return tot_sgs;
}

/*
 * Queue one skb on the send virtqueue.
 *
 * Fills in the virtio-net header kept with the skb (checksum and GSO
 * fields), builds a scatterlist of header + skb data, and adds it to
 * vi->svq with the skb itself as the token.
 *
 * Returns the result of virtqueue_add_buf(); start_xmit() treats a
 * negative value as failure and a non-negative one as remaining capacity.
 */
static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb)
{
    struct scatterlist sg[2+MAX_SKB_FRAGS];
    struct skb_vnet_hdr *hdr = skb_vnet_hdr(skb);
    const unsigned char *dest = ((struct ethhdr *)skb->data)->h_dest;

    sg_init_table(sg, 2+MAX_SKB_FRAGS);

    pr_debug("%s: xmit %p %pM\n", vi->dev->name, skb, dest);

    if (skb->ip_summed == CHECKSUM_PARTIAL) {  /* fill in the checksum part of the vnet header */
        hdr->hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
        hdr->hdr.csum_start = skb->csum_start - skb_headroom(skb);
        hdr->hdr.csum_offset = skb->csum_offset;
    } else {
        hdr->hdr.flags = 0;
        hdr->hdr.csum_offset = hdr->hdr.csum_start = 0;
    }

    if (skb_is_gso(skb)) {  /* fill in the GSO part of the vnet header */
        hdr->hdr.hdr_len = skb_headlen(skb);
        hdr->hdr.gso_size = skb_shinfo(skb)->gso_size;
        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4)
            hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
        else if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6)
            hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
        else if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP)
            hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
        else
            BUG();
        if (skb_shinfo(skb)->gso_type & SKB_GSO_TCP_ECN)
            hdr->hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
    } else {
        hdr->hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
        hdr->hdr.gso_size = hdr->hdr.hdr_len = 0;
    }

    hdr->mhdr.num_buffers = 0;

    /* Encode metadata header at front. */
    if (vi->mergeable_rx_bufs)
        sg_set_buf(sg, &hdr->mhdr, sizeof hdr->mhdr);
    else
        sg_set_buf(sg, &hdr->hdr, sizeof hdr->hdr);

    /* sg[0] is the header; the skb data is mapped starting at sg[1]. */
    hdr->num_sg = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1;  /* build the scatterlist for the skb data */
    return virtqueue_add_buf(vi->svq, sg, hdr->num_sg, 0, skb); /* per the original annotation: places sg into free descriptors and advances the avail index */
}

/*
 * ndo_start_xmit callback: transmit one skb through the send virtqueue.
 *
 * Reclaims already-completed skbs, queues the new one via xmit_skb() and
 * kicks the backend. If the ring is getting full, the netdev queue is
 * stopped until more completions arrive.
 *
 * Returns NETDEV_TX_OK when the skb was queued, NETDEV_TX_BUSY when the
 * send queue had no room for it.
 */
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
{
    struct virtnet_info *vi = netdev_priv(dev);
    int capacity;

again:
    /* Free up any pending old buffers before queueing new ones. */
    free_old_xmit_skbs(vi);

    /* Try to transmit */
    capacity = xmit_skb(vi, skb);

    /* This can happen with OOM and indirect buffers. */
    if (unlikely(capacity < 0)) { /* virtqueue_add_buf failed (per the original annotation: -ENOSPC, not enough room in the send queue) */
        netif_stop_queue(dev);
        dev_warn(&dev->dev, "Unexpected full queue\n");
        if (unlikely(!virtqueue_enable_cb(vi->svq))) { /* enable_cb returned false: per the original annotation the used index moved meanwhile, so a retry can succeed */
            virtqueue_disable_cb(vi->svq);
            netif_start_queue(dev);
            goto again;
        }
        return NETDEV_TX_BUSY; /* send queue is full */
    }
    virtqueue_kick(vi->svq); /* notify the backend */

    /* Don't wait up for transmitted skbs to be freed. */
    skb_orphan(skb);
    nf_reset(skb);

    /* Apparently nice girls don't return TX_BUSY; stop the queue
     * before it gets out of hand.  Naturally, this wastes entries. */
    if (capacity < 2+MAX_SKB_FRAGS) {
        netif_stop_queue(dev);
        if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) {
            /* More just got used, free them then recheck. */
            capacity += free_old_xmit_skbs(vi);
            if (capacity >= 2+MAX_SKB_FRAGS) {
                netif_start_queue(dev);
                virtqueue_disable_cb(vi->svq);
            }
        }
    }

    return NETDEV_TX_OK;
}
下面来看收包,其中napi的回调函数是virtnet_poll,refill_work作为workqueue的函数用于循环refill接收buffer
/*
 * Delayed-work handler that replenishes the receive queue.
 *
 * NAPI is disabled while buffers are added and re-enabled afterwards.
 * If the queue still holds no buffers at all, the work re-arms itself
 * to try again after HZ/2.
 */
static void refill_work(struct work_struct *work)
{
    struct virtnet_info *priv =
        container_of(work, struct virtnet_info, refill.work);
    bool ring_empty;

    napi_disable(&priv->napi);

    /* Post fresh receive buffers to the device. */
    try_fill_recv(priv, GFP_KERNEL);

    /* Sample the buffer count before polling is allowed again. */
    ring_empty = (priv->num == 0);

    virtnet_napi_enable(priv);

    /* In theory, this can happen: if we don't get any buffers in
     * we will *never* try to fill again. */
    if (!ring_empty)
        return;

    queue_delayed_work(priv->st_wq, &priv->refill, HZ/2);
}

/*
 * Turn one buffer returned by the receive virtqueue into an skb and hand
 * it to the network stack.
 *
 * @dev: network device the buffer was received on
 * @buf: token returned by the virtqueue: an skb in small-buffer mode,
 *       otherwise the first page of the buffer
 * @len: number of bytes the device wrote
 *
 * Too-short buffers and frames with an invalid header are counted in the
 * device statistics and dropped.
 */
static void receive_buf(struct net_device *dev, void *buf, unsigned int len)
{
    struct virtnet_info *vi = netdev_priv(dev);
    struct sk_buff *skb;
    struct page *page;
    struct skb_vnet_hdr *hdr;

    if (unlikely(len < sizeof(struct virtio_net_hdr) + ETH_HLEN)) {  /* too short to be a valid frame */
        pr_debug("%s: short packet %i\n", dev->name, len);
        dev->stats.rx_length_errors++;
        if (vi->mergeable_rx_bufs || vi->big_packets)
            give_pages(vi, buf);
        else
            dev_kfree_skb(buf);
        return;
    }

    if (!vi->mergeable_rx_bufs && !vi->big_packets) { /* small-buffer mode: the token is the skb itself, and the device has already written the data into it */
        skb = buf;
        len -= sizeof(struct virtio_net_hdr); /* len includes the virtio_net_hdr; strip it */
        skb_trim(skb, len);  /* trim the skb to the payload length */
    } else {
        page = buf;  /* page-based buffer: buf is the first page, so an skb has to be built around it */
        skb = page_to_skb(vi, page, len);
        if (unlikely(!skb)) {
            dev->stats.rx_dropped++;
            give_pages(vi, page);
            return;
        }
        if (vi->mergeable_rx_bufs)
            if (receive_mergeable(vi, skb)) {  /* per the original annotation: collects the remaining pages into the skb's frags; non-zero means failure */
                dev_kfree_skb(skb);
                return;
            }
    }

    hdr = skb_vnet_hdr(skb);
    skb->truesize += skb->data_len;
    dev->stats.rx_bytes += skb->len;
    dev->stats.rx_packets++;

    /* Translate the checksum flags from the virtio-net header. */
    if (hdr->hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
        pr_debug("Needs csum!\n");
        if (!skb_partial_csum_set(skb,
                      hdr->hdr.csum_start,
                      hdr->hdr.csum_offset))
            goto frame_err;
    } else if (hdr->hdr.flags & VIRTIO_NET_HDR_F_DATA_VALID) {
        skb->ip_summed = CHECKSUM_UNNECESSARY;
    }

    skb->protocol = eth_type_trans(skb, dev);
    pr_debug("Receiving skb proto 0x%04x len %i type %i\n",
         ntohs(skb->protocol), skb->len, skb->pkt_type);

    /* Translate the GSO information from the virtio-net header. */
    if (hdr->hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
        pr_debug("GSO!\n");
        switch (hdr->hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
        case VIRTIO_NET_HDR_GSO_TCPV4:
            skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
            break;
        case VIRTIO_NET_HDR_GSO_UDP:
            skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
            break;
        case VIRTIO_NET_HDR_GSO_TCPV6:
            skb_shinfo(skb)->gso_type = SKB_GSO_TCPV6;
            break;
        default:
            if (net_ratelimit())
                printk(KERN_WARNING "%s: bad gso type %u.\n",
                       dev->name, hdr->hdr.gso_type);
            goto frame_err;
        }

        if (hdr->hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
            skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN;

        skb_shinfo(skb)->gso_size = hdr->hdr.gso_size;
        if (skb_shinfo(skb)->gso_size == 0) {
            if (net_ratelimit())
                printk(KERN_WARNING "%s: zero gso size.\n",
                       dev->name);
            goto frame_err;
        }

        /* Header must be checked, and gso_segs computed. */
        skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
        skb_shinfo(skb)->gso_segs = 0;
    }

    netif_receive_skb(skb);  /* skb metadata is complete; pass the packet up the stack */
    return;

frame_err:
    dev->stats.rx_frame_errors++;
    dev_kfree_skb(skb);
}
virtnet_poll是napi真正的接收函数

/*
 * NAPI poll function: receive up to @budget packets from the receive
 * virtqueue.
 *
 * Each returned buffer is passed to receive_buf(). When fewer than half
 * of the maximum number of buffers remain posted, the queue is refilled
 * (falling back to the refill work if the atomic refill does not succeed).
 * If the budget was not exhausted, polling is completed and virtqueue
 * callbacks are re-enabled, re-polling when enabling them reports that
 * more buffers arrived in the meantime.
 *
 * Returns the number of packets received.
 */
static int virtnet_poll(struct napi_struct *napi, int budget)
{
    struct virtnet_info *vi = container_of(napi, struct virtnet_info, napi);
    void *buf;
    unsigned int len, received = 0;

again:
    while (received < budget &&
           (buf = virtqueue_get_buf(vi->rvq, &len)) != NULL) { /* virtqueue_get_buf returns the token passed in when the buffer was posted, i.e. an skb or the first page of the packet */
        receive_buf(vi->dev, buf, len); /* build the skb and hand it to the network stack */
        --vi->num;  /* one fewer input buffer posted in the receive queue */
        received++;
    }

    if (vi->num < vi->max / 2) {  /* running low on posted receive buffers: refill */
        if (!try_fill_recv(vi, GFP_ATOMIC))
            queue_delayed_work(vi->st_wq, &vi->refill, 0);
    }

    /* Out of packets? */
    if (received < budget) {
        napi_complete(napi);
        if (unlikely(!virtqueue_enable_cb(vi->rvq)) &&
            napi_schedule_prep(napi)) {
            virtqueue_disable_cb(vi->rvq);
            __napi_schedule(napi);
            goto again;
        }
    }

    return received;
}
对于后端的处理,vhost部分之前已经分析过了,本篇分析下后端是qemu的场景,其中virtio_net_device_realize函数在初始化队列时,已经初始化好了TX, RX, CTRL队列的回调函数

/*
 * QEMU side: realize the virtio-net device (excerpt; the rest of the
 * function is elided below).
 *
 * The part shown initializes the virtio device and creates the RX, TX and
 * control virtqueues of the first queue pair together with their
 * notification handlers. The TX queue uses either the timer-based handler
 * (when net_conf.tx is "timer") or the bottom-half based one.
 */
static void virtio_net_device_realize(DeviceState *dev, Error **errp)
{
    VirtIODevice *vdev = VIRTIO_DEVICE(dev);
    VirtIONet *n = VIRTIO_NET(dev);
    NetClientState *nc;
    int i;

    virtio_init(vdev, "virtio-net", VIRTIO_ID_NET, n->config_size);

    n->max_queues = MAX(n->nic_conf.peers.queues, 1);
    n->vqs = g_malloc0(sizeof(VirtIONetQueue) * n->max_queues);
    n->vqs[0].rx_vq = virtio_add_queue(vdev, 256, virtio_net_handle_rx);  /* notification handler of the RX queue (the queue QEMU uses to deliver packets to the guest) */
    n->curr_queues = 1;
    n->vqs[0].n = n;
    n->tx_timeout = n->net_conf.txtimer;

    if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
        n->vqs[0].tx_vq = virtio_add_queue(vdev, 256,
                                           virtio_net_handle_tx_timer);
        n->vqs[0].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, virtio_net_tx_timer,
                                               &n->vqs[0]);
    } else {
        n->vqs[0].tx_vq = virtio_add_queue(vdev, 256,
                                           virtio_net_handle_tx_bh); /* notification handler of the TX queue (guest transmits to QEMU) */
        n->vqs[0].tx_bh = qemu_bh_new(virtio_net_tx_bh, &n->vqs[0]);
    }
    n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl);
...
}
对于收包而言,由NetClientState,e.g. tun对应的socket来触发,调用到virtio_net_receive

/*
 * NetClientInfo of the virtio-net NIC. Packets coming from the peer net
 * client are delivered through .receive (virtio_net_receive), gated by
 * .can_receive (virtio_net_can_receive).
 */
static NetClientInfo net_virtio_info = {
    .type = NET_CLIENT_OPTIONS_KIND_NIC,
    .size = sizeof(NICState),
    .can_receive = virtio_net_can_receive,
    .receive = virtio_net_receive,
    .cleanup = virtio_net_cleanup,
    .link_status_changed = virtio_net_set_link_status,
    .query_rx_filter = virtio_net_query_rxfilter,
};

/*
 * QEMU side: deliver one packet from the net client to the guest.
 *
 * @nc:   net client state of the NIC queue the packet arrived on
 * @buf:  packet data
 * @size: packet length in bytes
 *
 * Copies the packet into one or more buffers popped from the RX
 * virtqueue, marks them used and notifies the guest.
 *
 * Returns -1 when the device cannot receive right now (or no buffer could
 * be popped for the first chunk), 0 when the guest has not posted enough
 * buffers, and @size when the packet was consumed. Filtered packets and
 * packets that do not fit a non-mergeable buffer also return @size, i.e.
 * they are dropped.
 */
static ssize_t virtio_net_receive(NetClientState *nc, const uint8_t *buf, size_t size)
{
    VirtIONet *n = qemu_get_nic_opaque(nc);
    VirtIONetQueue *q = virtio_net_get_subqueue(nc);
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
    struct virtio_net_hdr_mrg_rxbuf mhdr;
    unsigned mhdr_cnt = 0;
    size_t offset, i, guest_offset;

    if (!virtio_net_can_receive(nc)) {
        return -1;
    }

    /* hdr_len refers to the header we supply to the guest */
    if (!virtio_net_has_buffers(q, size + n->guest_hdr_len - n->host_hdr_len)) {
        return 0;
    }

    if (!receive_filter(n, buf, size))
        return size;

    /* offset: bytes of buf consumed so far; i: number of buffers used. */
    offset = i = 0;

    while (offset < size) {
        VirtQueueElement elem;
        int len, total;
        const struct iovec *sg = elem.in_sg;

        total = 0;

        if (virtqueue_pop(q->rx_vq, &elem) == 0) {  /* take an available buffer from the avail ring */
            if (i == 0)
                return -1;
            error_report("virtio-net unexpected empty queue: "
                    "i %zd mergeable %d offset %zd, size %zd, "
                    "guest hdr len %zd, host hdr len %zd guest features 0x%x",
                    i, n->mergeable_rx_bufs, offset, size,
                    n->guest_hdr_len, n->host_hdr_len, vdev->guest_features);
            exit(1);
        }

        if (elem.in_num < 1) {
            error_report("virtio-net receive queue contains no in buffers");
            exit(1);
        }
        /* fill the packet contents into elem */
        if (i == 0) {
            assert(offset == 0);
            if (n->mergeable_rx_bufs) {
                /* Remember where num_buffers lives in the first buffer so
                 * it can be patched once the final count is known. */
                mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg),
                                    sg, elem.in_num,
                                    offsetof(typeof(mhdr), num_buffers),
                                    sizeof(mhdr.num_buffers));
            }

            receive_header(n, sg, elem.in_num, buf, size);
            offset = n->host_hdr_len;
            total += n->guest_hdr_len;
            guest_offset = n->guest_hdr_len;
        } else {
            guest_offset = 0;
        }

        /* copy in packet.  ugh */
        len = iov_from_buf(sg, elem.in_num, guest_offset,
                           buf + offset, size - offset);
        total += len;
        offset += len;

        /* If buffers can't be merged, at this point we
         * must have consumed the complete packet.
         * Otherwise, drop it. */
        if (!n->mergeable_rx_bufs && offset < size) {
#if 0
            error_report("virtio-net truncated non-mergeable packet: "
                         "i %zd mergeable %d offset %zd, size %zd, "
                         "guest hdr len %zd, host hdr len %zd",
                         i, n->mergeable_rx_bufs,
                         offset, size, n->guest_hdr_len, n->host_hdr_len);
#endif
            return size;
        }

        /* signal other side */
        virtqueue_fill(q->rx_vq, &elem, total, i++);  /* record the element in the used ring */
    }

    if (mhdr_cnt) {
        /* Patch the number of buffers used into the header of the first one. */
        virtio_stw_p(vdev, &mhdr.num_buffers, i);
        iov_from_buf(mhdr_sg, mhdr_cnt,
                     0,
                     &mhdr.num_buffers, sizeof mhdr.num_buffers);
    }

    virtqueue_flush(q->rx_vq, i); /* publish the new used index */
    virtio_notify(vdev, q->rx_vq);  /* notify the guest */

    return size;
}
对于发包而言,由guest触发(guest通知发送队列),类似于一个硬中断到来:qemu先进入队列回调函数virtio_net_handle_tx_bh,由它调度一个bottom half(类似软中断)来执行,真正的处理函数为virtio_net_tx_bh
/*
 * Notification handler of the TX virtqueue (bottom-half variant).
 *
 * Marks the queue as waiting for a flush and, if the VM is running,
 * turns off further queue notifications and schedules the bottom half
 * (virtio_net_tx_bh) that does the actual transmit work.
 */
static void virtio_net_handle_tx_bh(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtIONet *net = VIRTIO_NET(vdev);
    int qidx = vq2q(virtio_get_queue_index(vq));
    VirtIONetQueue *txq = &net->vqs[qidx];

    /* A flush is already pending for this queue. */
    if (unlikely(txq->tx_waiting)) {
        return;
    }
    txq->tx_waiting = 1;

    /* This happens when device was stopped but VCPU wasn't. */
    if (vdev->vm_running) {
        /* Stop further notifications, then defer the work to the BH. */
        virtio_queue_set_notification(vq, 0);
        qemu_bh_schedule(txq->tx_bh);
    }
}

/*
 * Bottom half that transmits packets queued by the guest.
 *
 * @opaque: the VirtIONetQueue this bottom half belongs to
 *
 * Flushes the TX queue via virtio_net_flush_tx(). After a full burst it
 * reschedules itself right away; otherwise it re-enables queue
 * notifications and flushes once more to catch packets that arrived
 * while notifications were off.
 */
static void virtio_net_tx_bh(void *opaque)
{
    VirtIONetQueue *q = opaque;
    VirtIONet *n = q->n;
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    int32_t ret;

    /* This happens when device was stopped but BH wasn't. */
    if (!vdev->vm_running) {
        /* Make sure tx waiting is set, so we'll run when restarted. */
        assert(q->tx_waiting);
        return;
    }

    q->tx_waiting = 0;

    /* Just in case the driver is not ready any more */
    if (unlikely(!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK))) {
        return;
    }

    ret = virtio_net_flush_tx(q);  /* the function that actually sends the packets */
    if (ret == -EBUSY) {
        return; /* Notification re-enable handled by tx_complete */
    }

    /* If we flush a full burst of packets, assume there are
     * more coming and immediately reschedule */
    if (ret >= n->tx_burst) {
        qemu_bh_schedule(q->tx_bh);
        q->tx_waiting = 1;
        return;
    }

    /* If less than a full burst, re-enable notification and flush
     * anything that may have come in while we weren't looking.  If
     * we find something, assume the guest is still active and reschedule */
    virtio_queue_set_notification(q->tx_vq, 1);
    if (virtio_net_flush_tx(q) > 0) {
        virtio_queue_set_notification(q->tx_vq, 0);
        qemu_bh_schedule(q->tx_bh);
        q->tx_waiting = 1;
    }
}
virtio_net_flush_tx是真正发送报文的函数

/*
 * Send the packets the guest has queued on a TX virtqueue.
 *
 * Pops elements from q->tx_vq, adjusts the scatter list when the host and
 * guest header lengths differ, and hands each packet to the net layer.
 * At most n->tx_burst packets are sent per call.
 *
 * Returns the number of packets sent, or -EBUSY when a packet could not
 * be sent immediately; in that case the element is parked in q->async_tx
 * and virtio_net_tx_complete is the completion callback. Returns 0
 * without sending when the driver is not ready or an asynchronous send is
 * still in flight.
 */
static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
{
    VirtIONet *n = q->n;
    VirtIODevice *vdev = VIRTIO_DEVICE(n);
    VirtQueueElement elem;
    int32_t num_packets = 0;
    int queue_index = vq2q(virtio_get_queue_index(q->tx_vq));
    if (!(vdev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
        return num_packets;
    }

    /* An earlier asynchronous send has not completed yet. */
    if (q->async_tx.elem.out_num) {
        virtio_queue_set_notification(q->tx_vq, 0);
        return num_packets;
    }

    while (virtqueue_pop(q->tx_vq, &elem)) {  /* fetch the next packet to send from the avail ring */
        ssize_t ret, len;
        unsigned int out_num = elem.out_num;
        struct iovec *out_sg = &elem.out_sg[0];
        struct iovec sg[VIRTQUEUE_MAX_SIZE];

        if (out_num < 1) {
            error_report("virtio-net header not in first element");
            exit(1);
        }

        if (n->has_vnet_hdr) {
            if (out_sg[0].iov_len < n->guest_hdr_len) {
                error_report("virtio-net header incorrect");
                exit(1);
            }
            virtio_net_hdr_swap(vdev, (void *) out_sg[0].iov_base);
        }

        /*
         * If host wants to see the guest header as is, we can
         * pass it on unchanged. Otherwise, copy just the parts
         * that host is interested in.
         */
        assert(n->host_hdr_len <= n->guest_hdr_len);
        if (n->host_hdr_len != n->guest_hdr_len) {
            unsigned sg_num = iov_copy(sg, ARRAY_SIZE(sg),
                                       out_sg, out_num,
                                       0, n->host_hdr_len);
            sg_num += iov_copy(sg + sg_num, ARRAY_SIZE(sg) - sg_num,
                             out_sg, out_num,
                             n->guest_hdr_len, -1);
            out_num = sg_num;
            out_sg = sg;
        }

        len = n->guest_hdr_len;

        ret = qemu_sendv_packet_async(qemu_get_subqueue(n->nic, queue_index),  /* send through the QEMU NIC queue */
                                      out_sg, out_num, virtio_net_tx_complete);
        if (ret == 0) {
            /* Could not send now: park the element until tx_complete runs. */
            virtio_queue_set_notification(q->tx_vq, 0);
            q->async_tx.elem = elem;
            q->async_tx.len  = len;
            return -EBUSY;
        }

        len += ret;

        /* Return the element to the guest and notify it. */
        virtqueue_push(q->tx_vq, &elem, 0);
        virtio_notify(vdev, q->tx_vq);

        if (++num_packets >= n->tx_burst) {
            break;
        }
    }
    return num_packets;
}



你可能感兴趣的:(KVM/QEMU)