Analysis of the Xen Network Backend Driver (Device Part)

netback looks much like an ordinary network device. Its private structure is xen_netif (netfront uses netfront_info, igb uses igb_adapter; same pattern). Note that the priv structure is a linear memory area placed right after the net_device, used to hold each driver's private data.
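A minimal sketch of this layout, using a hypothetical driver (struct my_priv and example_alloc are made up for illustration, not netback code): alloc_netdev() allocates the net_device and the private area in a single allocation, and netdev_priv() returns a pointer just past the net_device.

/* Minimal sketch (not netback code) of the net_device/priv layout. */
#include <linux/netdevice.h>
#include <linux/etherdevice.h>

struct my_priv {
    int example_field;              /* driver-private state lives here */
};

static struct net_device *example_alloc(void)
{
    struct net_device *dev;
    struct my_priv *priv;

    /* One allocation: the net_device followed by sizeof(struct my_priv)
     * bytes of private data. ether_setup() fills in Ethernet defaults,
     * just as netif_alloc does further below. */
    dev = alloc_netdev(sizeof(struct my_priv), "example%d", ether_setup);
    if (!dev)
        return NULL;

    /* netdev_priv() simply returns the (aligned) address right after the
     * net_device, so no separate allocation is needed. */
    priv = netdev_priv(dev);
    priv->example_field = 0;

    return dev;
}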

We start from the 2.6.31 netback in Jeremy's git tree, and will later compare it with 2.6.32 and the upstream netback updates. Like netfront, the 2.6.31 code also has an accelerator mechanism; it has largely been superseded by SR-IOV, so it is not covered here.

struct xen_netif {
    /* Unique identifier for this interface. */
    domid_t          domid;
    unsigned int     handle;

    u8               fe_dev_addr[6];

The MAC address of the netfront device, obtained via xenstore (written by the frontend, read by the backend).

    /* Physical parameters of the comms window. */
    grant_handle_t   tx_shmem_handle;
    grant_ref_t      tx_shmem_ref;
    grant_handle_t   rx_shmem_handle;
    grant_ref_t      rx_shmem_ref;
    unsigned int     irq;

The grant_handle_t / grant_ref_t pairs for the tx and rx I/O rings.

    /* The shared rings and indexes. */
    struct xen_netif_tx_back_ring tx;
    struct xen_netif_rx_back_ring rx;
    struct vm_struct *tx_comms_area;
    struct vm_struct *rx_comms_area;

    /* Set of features that can be turned on in dev->features. */
    int features;

    int smart_poll;

    /* Internal feature information. */
    u8 can_queue:1; /* can queue packets for receiver? */

    /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
    RING_IDX rx_req_cons_peek;

    /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
    unsigned long   credit_bytes;
    unsigned long   credit_usec;
    atomic64_t   remaining_credit;
    struct timer_list credit_timeout;

    /* Enforce draining of the transmit queue. */
    struct timer_list tx_queue_timeout;

    /* Statistics */
    int nr_copied_skbs;

    /* Miscellaneous private stuff. */
    struct list_head list;  /* scheduling list */
    atomic_t         refcnt;
    struct net_device *dev;
    struct net_device_stats stats;

    unsigned int carrier;

    wait_queue_head_t waiting_to_free;
};
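The credit_bytes / credit_usec / remaining_credit fields implement transmit shaping: the interface may send credit_bytes of traffic every credit_usec microseconds, with credit_timeout re-arming the budget. A simplified sketch of the idea, written against the fields above (tx_credit_refill and tx_credit_ok are illustrative names, not the real netback functions; the actual logic lives in the tx path covered in a later article):

/* Simplified illustration of the credit scheme (not verbatim netback code). */
static void tx_credit_refill(struct xen_netif *netif)
{
    /* Called from credit_timeout every credit_usec microseconds:
     * top the budget back up to credit_bytes. */
    atomic64_set(&netif->remaining_credit, netif->credit_bytes);
}

static int tx_credit_ok(struct xen_netif *netif, unsigned int pkt_len)
{
    /* A request is only processed if enough credit remains; otherwise
     * the caller waits for the next refill. */
    if (atomic64_read(&netif->remaining_credit) < pkt_len)
        return 0;
    atomic64_sub(pkt_len, &netif->remaining_credit);
    return 1;
}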


struct backend_info {
    struct xenbus_device *dev;
    struct xen_netif *netif;
    enum xenbus_state frontend_state;
    struct xenbus_watch hotplug_status_watch;
    int have_hotplug_status_watch:1;

    int have_rate_watch:1;
    struct xenbus_watch rate_watch;
};

netback watches two xenstore entries: rate_watch watches the "rate" node to detect changes to the transmit rate limit, and hotplug_status_watch watches the "hotplug-status" node written by the hotplug scripts. Changes in netfront's state are delivered through the otherend_changed callback (frontend_changed, shown later) and recorded in frontend_state. The states a frontend or backend driver can be in are:

enum xenbus_state
{
    XenbusStateUnknown      = 0,
    XenbusStateInitialising = 1,
    XenbusStateInitWait     = 2,  /* Finished early
                     initialisation, but waiting
                     for information from the peer
                     or hotplug scripts. */
    XenbusStateInitialised  = 3,  /* Initialised and waiting for a
                     connection from the peer. */
    XenbusStateConnected    = 4,
    XenbusStateClosing      = 5,  /* The device is being closed
                     due to an error or an unplug
                     event. */
    XenbusStateClosed       = 6,

    /*
    * Reconfiguring: The device is being reconfigured.
    */
    XenbusStateReconfiguring = 7,

    XenbusStateReconfigured  = 8
};


The netback device registers itself on xenbus by calling xenbus_register_backend. You can think of xenbus as the PCI bus connecting the frontend and backend drivers, with netback and netfront as the PCI devices sitting on it.

static struct xenbus_driver netback = {
    .name = "vif",
    .owner = THIS_MODULE,
    .ids = netback_ids,
    .probe = netback_probe,
    .remove = netback_remove,
    .uevent = netback_uevent,
    .otherend_changed = frontend_changed,
};

int netif_xenbus_init(void)
{
    printk(KERN_CRIT "registering netback\n");
    return xenbus_register_backend(&netback);
}


static int netback_probe(struct xenbus_device *dev,
             const struct xenbus_device_id *id)
{
    const char *message;
    struct xenbus_transaction xbt;
    int err;
    int sg;
    struct backend_info *be = kzalloc(sizeof(struct backend_info),
                      GFP_KERNEL);
    if (!be) {
        xenbus_dev_fatal(dev, -ENOMEM,
                 "allocating backend structure");
        return -ENOMEM;
    }

    be->dev = dev;
    dev_set_drvdata(&dev->dev, be);

A backend_info structure is allocated and attached to the xenbus_device.

    sg = 1;
    if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB)
        sg = 0;

    do {
        err = xenbus_transaction_start(&xbt);
        if (err) {
            xenbus_dev_fatal(dev, err, "starting transaction");
            goto fail;
        }

        err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
        if (err) {
            message = "writing feature-sg";
            goto abort_transaction;
        }

        err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
                    "%d", sg);
        if (err) {
            message = "writing feature-gso-tcpv4";
            goto abort_transaction;
        }

        /* We support rx-copy path. */
        err = xenbus_printf(xbt, dev->nodename,
                    "feature-rx-copy", "%d", 1);
        if (err) {
            message = "writing feature-rx-copy";
            goto abort_transaction;
        }

        /*
         * We don't support rx-flip path (except old guests who don't
         * grok this feature flag).
         */
        err = xenbus_printf(xbt, dev->nodename,
                    "feature-rx-flip", "%d", 0);
        if (err) {
            message = "writing feature-rx-flip";
            goto abort_transaction;
        }

        /* We support data smart poll mechanism */
        err = xenbus_printf(xbt, dev->nodename,
                    "feature-smart-poll", "%d", 1);
        if (err) {
            message = "writing feature-smart-poll";
            goto abort_transaction;
        }

        err = xenbus_transaction_end(xbt, 0);
    } while (err == -EAGAIN);

    if (err) {
        xenbus_dev_fatal(dev, err, "completing transaction");
        goto fail;
    }

    //netback_probe_accelerators(be, dev);

    err = xenbus_switch_state(dev, XenbusStateInitWait);
    if (err)
        goto fail;

    /* This kicks hotplug scripts, so do it immediately. */
    backend_create_netif(be);

    return 0;

abort_transaction:
    xenbus_transaction_end(xbt, 1);
    xenbus_dev_fatal(dev, err, "%s", message);
fail:
    DPRINTK("failed");
    netback_remove(dev);
    return err;
}

backend_create_netif calls netif_alloc to create a net_device whose private area holds the xen_netif structure.
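backend_create_netif itself is not reproduced in this article. Roughly (a hedged sketch from memory of the 2.6.31 tree, details may differ), it reads the "handle" key from the backend's xenstore node, uses the peer domain id as domid, and then calls netif_alloc:

/* Rough sketch of backend_create_netif (not verbatim; see the 2.6.31 tree). */
static void backend_create_netif(struct backend_info *be)
{
    int err;
    long handle;
    struct xenbus_device *dev = be->dev;

    if (be->netif != NULL)
        return;                         /* already created */

    /* The toolstack writes the interface handle under the backend node. */
    err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
    if (err != 1) {
        xenbus_dev_fatal(dev, err, "reading handle");
        return;
    }

    /* otherend_id is the frontend's domain id. */
    be->netif = netif_alloc(&dev->dev, dev->otherend_id, handle);
    if (IS_ERR(be->netif)) {
        err = PTR_ERR(be->netif);
        be->netif = NULL;
        xenbus_dev_fatal(dev, err, "creating interface");
        return;
    }

    /* Creating the netdev kicks the vif hotplug scripts in dom0. */
    kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
}

netif_alloc itself is shown next: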


struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle)
{
    int err = 0;
    struct net_device *dev;
    struct xen_netif *netif;
    char name[IFNAMSIZ] = {};

    snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
    dev = alloc_netdev(sizeof(struct xen_netif), name, ether_setup);
    if (dev == NULL) {
        DPRINTK("Could not create netif: out of memory\n");
        return ERR_PTR(-ENOMEM);
    }
Allocate the net_device structure.

    SET_NETDEV_DEV(dev, parent);

    netif = netdev_priv(dev);
    memset(netif, 0, sizeof(*netif));
    netif->domid  = domid;
    netif->handle = handle;
    netif->features = NETIF_F_SG;
    atomic_set(&netif->refcnt, 1);
    init_waitqueue_head(&netif->waiting_to_free);
    netif->dev = dev;
    INIT_LIST_HEAD(&netif->list);

    netback_carrier_off(netif);

Initialize the xen_netif structure.

    atomic64_set(&netif->remaining_credit, INT_MAX);
    netif->credit_bytes = INT_MAX;
    netif->credit_usec  = 0L;
    init_timer(&netif->credit_timeout);
    /* Initialize 'expires' now: it's used to track the credit window. */
    netif->credit_timeout.expires = jiffies;


    init_timer(&netif->tx_queue_timeout);

tx_queue_timeout is a timer used to drain stale packets out of the tx queue.

    dev->netdev_ops = &netback_ops;
    dev->features   = NETIF_F_IP_CSUM|NETIF_F_SG;

    SET_ETHTOOL_OPS(dev, &network_ethtool_ops);

    dev->tx_queue_len = netbk_queue_length;

The length of netback's tx queue; setting it too large increases latency.

    /*
     * Initialise a dummy MAC address. We choose the numerically
     * largest non-broadcast address to prevent the address getting
     * stolen by an Ethernet bridge for STP purposes.
     * (FE:FF:FF:FF:FF:FF)
     */
    memset(dev->dev_addr, 0xFF, ETH_ALEN);
    dev->dev_addr[0] &= ~0x01;

netback's MAC address is therefore always FE:FF:FF:FF:FF:FF.

    rtnl_lock();
    err = register_netdevice(dev);
    rtnl_unlock();
    if (err) {
        DPRINTK("Could not register new net device %s: err=%d\n",
            dev->name, err);
        free_netdev(dev);
        return ERR_PTR(err);
    }

    DPRINTK("Successfully created netif\n");
    return netif;
}

netback_ops is defined as follows:

static struct net_device_ops netback_ops =
{
    .ndo_start_xmit = netif_be_start_xmit,
    .ndo_get_stats  = netif_be_get_stats,
    .ndo_open   = net_open,
    .ndo_stop   = net_close,
    .ndo_change_mtu = netbk_change_mtu,
};


When netfront's state changes, netback learns of it through xenbus (the xenbus core watches the peer's state node), and the otherend_changed callback, frontend_changed, is invoked:

static void frontend_changed(struct xenbus_device *dev,
                 enum xenbus_state frontend_state)
{
    struct backend_info *be = dev_get_drvdata(&dev->dev);

    DPRINTK("%s", xenbus_strstate(frontend_state));

    be->frontend_state = frontend_state;

    switch (frontend_state) {
    case XenbusStateInitialising:
        if (dev->state == XenbusStateClosed) {
            printk(KERN_INFO "%s: %s: prepare for reconnect\n",
                   __FUNCTION__, dev->nodename);
            xenbus_switch_state(dev, XenbusStateInitWait);
        }
        break;

    case XenbusStateInitialised:
        break;

    case XenbusStateConnected:
        if (dev->state == XenbusStateConnected)
            break;
        backend_create_netif(be);
        if (be->netif)
            connect(be);
        break;

    case XenbusStateClosing:
        if (be->netif)
            kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
        disconnect_backend(dev);
        xenbus_switch_state(dev, XenbusStateClosing);
        break;

    case XenbusStateClosed:
        xenbus_switch_state(dev, XenbusStateClosed);
        if (xenbus_dev_is_online(dev))
            break;
        /* fall through if not online */
    case XenbusStateUnknown:
        device_unregister(&dev->dev);
        break;

    default:
        xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
                 frontend_state);
        break;
    }
}

For example, when the netfront state becomes XenbusStateConnected, connect() is called to establish the connection with netfront:

static void connect(struct backend_info *be)
{
    int err;
    struct xenbus_device *dev = be->dev;

    err = connect_rings(be);
    if (err)
        return;

    err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
    if (err) {
        xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
        return;
    }

    xen_net_read_rate(dev, &be->netif->credit_bytes,
              &be->netif->credit_usec);
    atomic64_set(&be->netif->remaining_credit, be->netif->credit_bytes);

    unregister_hotplug_status_watch(be);

    err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch,
                   hotplug_status_changed,
                   "%s/%s", dev->nodename, "hotplug-status");
    if (err) {
        /* Switch now, since we can't do a watch. */
        xenbus_switch_state(dev, XenbusStateConnected);
    } else {
        be->have_hotplug_status_watch = 1;
    }

Register the xenbus_watch on hotplug-status.

    unregister_rate_watch(be);
    err = xenbus_watch_pathfmt(dev, &be->rate_watch,
                   rate_changed, "%s/%s", dev->nodename, "rate");
    if (!err)
        be->have_rate_watch = 1;

Register the xenbus_watch on rate.

    netif_wake_queue(be->netif->dev);

Wake up the transmit queue.
}
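The rate watch is an addition in this tree (vanilla netback reads the rate only once, in connect()), and rate_changed is not shown in the article. A hypothetical sketch of what such a callback could look like (the body, including re-reading the node via xen_net_read_rate, is an assumption, not upstream code):

/* Hypothetical sketch of the rate watch callback (not verbatim code). */
static void rate_changed(struct xenbus_watch *watch,
             const char **vec, unsigned int len)
{
    struct backend_info *be = container_of(watch, struct backend_info,
                           rate_watch);

    if (!be->netif)
        return;

    /* Re-parse the "rate" node (typically "<bytes>,<usec>") and reset the
     * remaining credit so the new limit takes effect immediately. */
    xen_net_read_rate(be->dev, &be->netif->credit_bytes,
              &be->netif->credit_usec);
    atomic64_set(&be->netif->remaining_credit, be->netif->credit_bytes);
}

connect_rings, called at the top of connect(), reads the ring references and feature flags from the frontend's xenstore directory: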


static int connect_rings(struct backend_info *be)
{
    struct xenbus_device *dev = be->dev;
    unsigned long tx_ring_ref, rx_ring_ref;
    unsigned int evtchn, rx_copy;
    int err;
    int val;

    DPRINTK("");

    err = xenbus_gather(XBT_NIL, dev->otherend,
                "tx-ring-ref", "%lu", &tx_ring_ref,
                "rx-ring-ref", "%lu", &rx_ring_ref,
                "event-channel", "%u", &evtchn, NULL);
    if (err) {
        xenbus_dev_fatal(dev, err,
                 "reading %s/ring-ref and event-channel",
                 dev->otherend);
        return err;
    }

    err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
               &rx_copy);
    if (err == -ENOENT) {
        err = 0;
        rx_copy = 0;
    }
    if (err < 0) {
        xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
                 dev->otherend);
        return err;
    }

    if (!rx_copy)
        return -EOPNOTSUPP;

    if (be->netif->dev->tx_queue_len != 0) {
        if (xenbus_scanf(XBT_NIL, dev->otherend,
                 "feature-rx-notify", "%d", &val) < 0)
            val = 0;
        if (val)
            be->netif->can_queue = 1;
        else
            /* Must be non-zero for pfifo_fast to work. */
            be->netif->dev->tx_queue_len = 1;
    }

    if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
        val = 0;
    if (!val) {
        be->netif->features &= ~NETIF_F_SG;
        be->netif->dev->features &= ~NETIF_F_SG;
        if (be->netif->dev->mtu > ETH_DATA_LEN)
            be->netif->dev->mtu = ETH_DATA_LEN;
    }

    if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
             &val) < 0)
        val = 0;
    if (val) {
        be->netif->features |= NETIF_F_TSO;
        be->netif->dev->features |= NETIF_F_TSO;
    }

    if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
             "%d", &val) < 0)
        val = 0;
    if (val) {
        be->netif->features &= ~NETIF_F_IP_CSUM;
        be->netif->dev->features &= ~NETIF_F_IP_CSUM;
    }

    if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-smart-poll",
             "%d", &val) < 0)
        val = 0;
    if (val)
        be->netif->smart_poll = 1;
    else
        be->netif->smart_poll = 0;

Read netfront's parameters from xenbus.

    /* Map the shared frame, irq etc. */
    err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
    if (err) {
        xenbus_dev_fatal(dev, err,
                 "mapping shared-frames %lu/%lu port %u",
                 tx_ring_ref, rx_ring_ref, evtchn);
        return err;
    }
    return 0;
}

connect_rings then calls netif_map, which maps the frontend's I/O ring pages into the backend:

int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
          unsigned long rx_ring_ref, unsigned int evtchn)
{
    int err = -ENOMEM;
    struct xen_netif_tx_sring *txs;
    struct xen_netif_rx_sring *rxs;

    /* Already connected through? */
    if (netif->irq)
        return 0;

    netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
    if (netif->tx_comms_area == NULL)
        return -ENOMEM;
    netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
    if (netif->rx_comms_area == NULL)
        goto err_rx;

alloc_vm_area is called to reserve a vm_struct each for tx_comms_area and rx_comms_area; a vm_struct describes a contiguous range of kernel virtual addresses, into which the grant mapping below installs the frontend's ring pages.

    err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
    if (err)
        goto err_map;

    err = bind_interdomain_evtchn_to_irqhandler(
        netif->domid, evtchn, netif_be_int, 0,
        netif->dev->name, netif);
    if (err < 0)
        goto err_hypervisor;
    netif->irq = err;
    disable_irq(netif->irq);

Bind the inter-domain event channel to an irq, with netif_be_int as the handler.

    txs = (struct xen_netif_tx_sring *)netif->tx_comms_area->addr;
    BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);

    rxs = (struct xen_netif_rx_sring *)
        ((char *)netif->rx_comms_area->addr);
    BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);

Initialize the tx and rx I/O rings; each shared ring is one PAGE_SIZE in size.

    netif->rx_req_cons_peek = 0;

    netif_get(netif);

    rtnl_lock();
    netback_carrier_on(netif);
    if (netif_running(netif->dev))
        __netif_up(netif);
    rtnl_unlock();

    return 0;
err_hypervisor:
    unmap_frontend_pages(netif);
err_map:
    free_vm_area(netif->rx_comms_area);
err_rx:
    free_vm_area(netif->tx_comms_area);
    return err;
}
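For reference, BACK_RING_INIT comes from the Xen ring interface (xen/interface/io/ring.h); it simply points the backend's private ring state at the shared page and resets its indices (reproduced here with comments added):

/* From xen/interface/io/ring.h: initialize the backend's private view
 * of a shared ring (comments added here). */
#define BACK_RING_INIT(_r, _s, __size) do {                              \
    (_r)->rsp_prod_pvt = 0;                  /* next response to fill  */\
    (_r)->req_cons = 0;                      /* next request to read   */\
    (_r)->nr_ents = __RING_SIZE(_s, __size); /* entries in the page    */\
    (_r)->sring = (_s);                      /* the shared page itself */\
} while (0)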

map_frontend_pages, called above, takes the grant references passed over by netfront and maps the granted pages into netback's own address space. Mapping a page involves the following steps:

1. netfront creates a grant reference (by calling gnttab_claim_grant_reference); here these are the two GRs tx_ring_ref and rx_ring_ref.

2. netfront grants netback access to that GR (by calling gnttab_grant_foreign_access_ref); the grant entry now carries the page's mfn and the domid being granted access, i.e. netfront_info->xbdev->otherend_id. A sketch of this frontend side is shown below.
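On the frontend side this is usually done through the xenbus helper xenbus_grant_ring. A minimal sketch of how netfront shares its tx ring page, modelled on the upstream xen-netfront driver (example_setup_tx_ring is a hypothetical name; the netfront_info fields are from upstream):

/* Minimal sketch of the frontend side: allocate a shared tx ring page and
 * grant the backend access to it (modelled on upstream xen-netfront). */
static int example_setup_tx_ring(struct xenbus_device *dev,
                 struct netfront_info *info)
{
    struct xen_netif_tx_sring *txs;
    int gref;

    txs = (struct xen_netif_tx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH);
    if (!txs)
        return -ENOMEM;

    SHARED_RING_INIT(txs);                  /* shared producer/consumer indices */
    FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);

    /* Grant dev->otherend_id (the backend domain) access to the page;
     * returns the grant reference, or a negative errno. */
    gref = xenbus_grant_ring(dev, virt_to_mfn(txs));
    if (gref < 0) {
        free_page((unsigned long)txs);
        return gref;
    }

    /* This reference is later written to xenstore as "tx-ring-ref",
     * where connect_rings() above reads it. */
    info->tx_ring_ref = gref;
    return 0;
}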

3. netback calls map_frontend_pages to perform the mapping; the function looks like this:

static int map_frontend_pages(
    struct xen_netif *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
{
    struct gnttab_map_grant_ref op;

    gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr, GNTMAP_host_map, tx_ring_ref, netif->domid);

gnttab_set_map_op fills in the gnttab_map_grant_ref structure: the mapping type is GNTMAP_host_map, and the page referenced by tx_ring_ref is to be mapped at the local virtual address tx_comms_area->addr.

    if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
        BUG();

The grant-table hypercall is then issued to perform the mapping.

    if (op.status) {
        DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
        return op.status;
    }

    netif->tx_shmem_ref    = tx_ring_ref;
    netif->tx_shmem_handle = op.handle;

op.handle is filled in by the hypercall; it identifies this grant mapping and is needed later to unmap it.

    gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr, GNTMAP_host_map, rx_ring_ref, netif->domid);

    if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
        BUG();

    if (op.status) {
        struct gnttab_unmap_grant_ref unop;

        gnttab_set_unmap_op(&unop,
                    (unsigned long)netif->tx_comms_area->addr,
                    GNTMAP_host_map, netif->tx_shmem_handle);
        HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unop, 1);
        DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
        return op.status;
    }

    netif->rx_shmem_ref    = rx_ring_ref;
    netif->rx_shmem_handle = op.handle;

    return 0;
}

4. Once the mapping is in place, the page can be accessed. Tearing the mapping down afterwards is done by unmap_frontend_pages:

static void unmap_frontend_pages(struct xen_netif *netif)
{
    struct gnttab_unmap_grant_ref op;

    gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
                GNTMAP_host_map, netif->tx_shmem_handle);

    if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
        BUG();

    gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
                GNTMAP_host_map, netif->rx_shmem_handle);

    if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
        BUG();

Set up a gnttab_unmap_grant_ref for tx_comms_area and rx_comms_area respectively, then issue the hypercall to undo the mapping.

}

5. Finally, netfront revokes the foreign-access rights on its grant references, either via xennet_end_access or by calling gnttab_end_foreign_access_ref directly. In the netfront driver, tx_ring and rx_ring are released through xennet_end_access (called from xennet_disconnect_backend); the pages carrying transmitted data are reclaimed with gnttab_end_foreign_access_ref; and the receive pages, which are handed over via the grant-table page-transfer mechanism, are reclaimed with gnttab_end_foreign_transfer_ref (see xennet_uninit in the netfront driver, which calls xennet_release_tx_bufs and xennet_release_rx_bufs respectively).
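xennet_end_access is a small helper in the netfront driver; it looks essentially like this (quoted from memory of the upstream xen-netfront code, so treat the details as approximate):

/* netfront helper: revoke the backend's access to a granted page.
 * Freeing the page is a side effect of gnttab_end_foreign_access(). */
static void xennet_end_access(int ref, void *page)
{
    if (ref != GRANT_INVALID_REF)
        gnttab_end_foreign_access(ref, 0, (unsigned long)page);
}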


Finally, let's look at the net_device_ops interface that netback provides:

static struct net_device_ops netback_ops =
{
    .ndo_start_xmit = netif_be_start_xmit,
    .ndo_get_stats  = netif_be_get_stats,
    .ndo_open   = net_open,
    .ndo_stop   = net_close,
    .ndo_change_mtu = netbk_change_mtu,
};

static int net_open(struct net_device *dev)
{
    struct xen_netif *netif = netdev_priv(dev);
    if (netback_carrier_ok(netif)) {
        __netif_up(netif);
        netif_start_queue(dev);
    }
    return 0;
}

static int net_close(struct net_device *dev)
{
    struct xen_netif *netif = netdev_priv(dev);
    if (netback_carrier_ok(netif))
        __netif_down(netif);
    netif_stop_queue(dev);
    return 0;
}

__netif_up and __netif_down enable and disable the irq. In __netif_up, if there are packets waiting to be processed, the xen_netif is added to the global net_schedule_list, and maybe_schedule_tx_action is then called to decide whether to fire tasklet_schedule(&net_tx_tasklet) to do the actual packet transmission. A rough sketch of these helpers follows below.
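Roughly, these helpers look like this in the 2.6.31 netback (reproduced from memory, so details such as the MAX_PENDING_REQS threshold may differ slightly):

/* Rough sketch of the netback scheduling helpers (not verbatim). */
static void __netif_up(struct xen_netif *netif)
{
    enable_irq(netif->irq);
    netif_schedule_work(netif);     /* queue it if there is work pending */
}

static void __netif_down(struct xen_netif *netif)
{
    disable_irq(netif->irq);
    netif_deschedule_work(netif);   /* drop it from net_schedule_list */
}

static inline void maybe_schedule_tx_action(void)
{
    smp_mb();
    /* Kick the tx tasklet only if there is room for more pending
     * requests and some interface is actually queued for service. */
    if ((nr_pending_reqs() < (MAX_PENDING_REQS / 2)) &&
        !list_empty(&net_schedule_list))
        tasklet_schedule(&net_tx_tasklet);
}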


netif_be_start_xmit will be analyzed in a later article.



