netback looks much like an ordinary network device: its priv structure is xen_netif (netfront uses netfront_info, igb uses igb_adapter; they all follow the same pattern). Note that the priv structure is a linear memory area placed right behind the net_device, used to hold each driver's private state.
We will first analyse the 2.6.31 netback from Jeremy's git tree, and then compare it with the 2.6.32 and upstream netback updates. Like netfront, the 2.6.31 code also has an "accelerator" mechanism; it has largely been superseded by SR-IOV, so we will not cover it here.
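As a quick illustration of that note about the priv area (a minimal sketch, not from the netback source; my_priv, my_setup and my_create are made-up names), alloc_netdev reserves sizeof(priv) bytes directly behind the net_device in a single allocation, and netdev_priv() simply returns a pointer into that region:
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
/* Toy driver-private structure living right behind the net_device. */
struct my_priv {
	int some_state;
};
static void my_setup(struct net_device *dev)
{
	ether_setup(dev); /* the same setup helper netback passes to alloc_netdev */
}
static struct net_device *my_create(void)
{
	/* net_device + my_priv are one linear block (2.6.31-era 3-argument alloc_netdev) */
	struct net_device *dev = alloc_netdev(sizeof(struct my_priv), "dummy%d", my_setup);
	if (dev) {
		struct my_priv *priv = netdev_priv(dev); /* points just past the net_device */
		priv->some_state = 0;
	}
	return dev;
}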
struct xen_netif {
/* Unique identifier for this interface. */
domid_t domid;
unsigned int handle;
u8 fe_dev_addr[6];
The MAC address of the corresponding netfront interface, read from the mac node in xenstore (written when the vif is set up and read back by the backend).
/* Physical parameters of the comms window. */
grant_handle_t tx_shmem_handle;
grant_ref_t tx_shmem_ref;
grant_handle_t rx_shmem_handle;
grant_ref_t rx_shmem_ref;
unsigned int irq;
The grant handle and grant reference for each of the tx and rx I/O rings.
/* The shared rings and indexes. */
struct xen_netif_tx_back_ring tx;
struct xen_netif_rx_back_ring rx;
struct vm_struct *tx_comms_area;
struct vm_struct *rx_comms_area;
/* Set of features that can be turned on in dev->features. */
int features;
int smart_poll;
/* Internal feature information. */
u8 can_queue:1; /* can queue packets for receiver? */
/* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */
RING_IDX rx_req_cons_peek;
/* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */
unsigned long credit_bytes;
unsigned long credit_usec;
atomic64_t remaining_credit;
struct timer_list credit_timeout;
/* Enforce draining of the transmit queue. */
struct timer_list tx_queue_timeout;
/* Statistics */
int nr_copied_skbs;
/* Miscellaneous private stuff. */
struct list_head list; /* scheduling list */
atomic_t refcnt;
struct net_device *dev;
struct net_device_stats stats;
unsigned int carrier;
wait_queue_head_t waiting_to_free;
};
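The credit_bytes / credit_usec / remaining_credit / credit_timeout fields implement simple transmit shaping: the interface may send credit_bytes every credit_usec, and the timer re-opens the window when the budget runs out. The functions that consume and replenish the credit live in the tx path, which is not quoted in this post; the following is only a rough sketch of the idea written against the fields above (tx_credit_exceeded / tx_add_credit as shown here are reconstructions, not the tree's exact code):
/* Sketch only: consume/replenish the per-interface credit window. */
static void tx_add_credit(struct xen_netif *netif)
{
	atomic64_set(&netif->remaining_credit, netif->credit_bytes);
}
static int tx_credit_exceeded(struct xen_netif *netif, unsigned int size)
{
	unsigned long now = jiffies;
	unsigned long next_credit = netif->credit_timeout.expires +
				    usecs_to_jiffies(netif->credit_usec);
	/* Replenish timer already armed: still over budget. */
	if (timer_pending(&netif->credit_timeout))
		return 1;
	/* The credit window has elapsed: refill it and restart the window. */
	if (time_after_eq(now, next_credit)) {
		netif->credit_timeout.expires = now;
		tx_add_credit(netif);
	}
	/* Not enough credit for this packet: arm the timer and defer. */
	if (size > atomic64_read(&netif->remaining_credit)) {
		mod_timer(&netif->credit_timeout, next_credit);
		return 1;
	}
	atomic64_sub(size, &netif->remaining_credit);
	return 0;
}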
struct backend_info {
struct xenbus_device *dev;
struct xen_netif *netif;
enum xenbus_state frontend_state;
struct xenbus_watch hotplug_status_watch;
int have_hotplug_status_watch:1;
int have_rate_watch:1;
struct xenbus_watch rate_watch;
};
netback watches two xenstore entries: rate, to pick up changes to the configured transmit rate, and hotplug-status, to learn when the hotplug script has finished wiring the vif up. Changes in netfront's own state are delivered through the otherend_changed callback (frontend_changed, shown later) and recorded in frontend_state; reconstructed sketches of both watch callbacks appear further below. The frontend/backend driver states are:
enum xenbus_state
{
XenbusStateUnknown = 0,
XenbusStateInitialising = 1,
XenbusStateInitWait = 2, /* Finished early
initialisation, but waiting
for information from the peer
or hotplug scripts. */
XenbusStateInitialised = 3, /* Initialised and waiting for a
connection from the peer. */
XenbusStateConnected = 4,
XenbusStateClosing = 5, /* The device is being closed
due to an error or an unplug
event. */
XenbusStateClosed = 6,
/*
* Reconfiguring: The device is being reconfigured.
*/
XenbusStateReconfiguring = 7,
XenbusStateReconfigured = 8
};
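The hotplug-status watch callback itself is not quoted in this post. Roughly, once the vif hotplug script writes "connected" into the hotplug-status node, the backend switches itself to XenbusStateConnected; a reconstructed sketch (details may differ from the tree being discussed):
static void hotplug_status_changed(struct xenbus_watch *watch,
				   const char **vec, unsigned int vec_size)
{
	struct backend_info *be = container_of(watch, struct backend_info,
					       hotplug_status_watch);
	char *str;
	unsigned int len;
	str = xenbus_read(XBT_NIL, be->dev->nodename, "hotplug-status", &len);
	if (IS_ERR(str))
		return;
	if (len == sizeof("connected") - 1 && !memcmp(str, "connected", len)) {
		/* The hotplug script has wired the vif up: report Connected. */
		xenbus_switch_state(be->dev, XenbusStateConnected);
		unregister_hotplug_status_watch(be);
	}
	kfree(str);
}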
The netback device registers itself on xenbus by calling xenbus_register_backend. You can think of xenbus as the PCI bus of the split-driver world, with netback and netfront as the PCI devices sitting on it.
static struct xenbus_driver netback = {
.name = "vif",
.owner = THIS_MODULE,
.ids = netback_ids,
.probe = netback_probe,
.remove = netback_remove,
.uevent = netback_uevent,
.otherend_changed = frontend_changed,
};
int netif_xenbus_init(void)
{
printk(KERN_CRIT "registering netback\n");
return xenbus_register_backend(&netback);
}
static int netback_probe(struct xenbus_device *dev,
const struct xenbus_device_id *id)
{
const char *message;
struct xenbus_transaction xbt;
int err;
int sg;
struct backend_info *be = kzalloc(sizeof(struct backend_info),
GFP_KERNEL);
if (!be) {
xenbus_dev_fatal(dev, -ENOMEM,
"allocating backend structure");
return -ENOMEM;
}
be->dev = dev;
dev_set_drvdata(&dev->dev, be);
Allocate a backend_info for this xenbus_device and attach it as the device's driver data.
sg = 1;
if (netbk_copy_skb_mode == NETBK_ALWAYS_COPY_SKB)
sg = 0;
do {
err = xenbus_transaction_start(&xbt);
if (err) {
xenbus_dev_fatal(dev, err, "starting transaction");
goto fail;
}
err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", sg);
if (err) {
message = "writing feature-sg";
goto abort_transaction;
}
err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4",
"%d", sg);
if (err) {
message = "writing feature-gso-tcpv4";
goto abort_transaction;
}
/* We support rx-copy path. */
err = xenbus_printf(xbt, dev->nodename,
"feature-rx-copy", "%d", 1);
if (err) {
message = "writing feature-rx-copy";
goto abort_transaction;
}
/*
* We don't support rx-flip path (except old guests who don't
* grok this feature flag).
*/
err = xenbus_printf(xbt, dev->nodename,
"feature-rx-flip", "%d", 0);
if (err) {
message = "writing feature-rx-flip";
goto abort_transaction;
}
/* We support data smart poll mechanism */
err = xenbus_printf(xbt, dev->nodename,
"feature-smart-poll", "%d", 1);
if (err) {
message = "writing feature-smart-poll";
goto abort_transaction;
}
err = xenbus_transaction_end(xbt, 0);
} while (err == -EAGAIN);
if (err) {
xenbus_dev_fatal(dev, err, "completing transaction");
goto fail;
}
//netback_probe_accelerators(be, dev);
err = xenbus_switch_state(dev, XenbusStateInitWait);
if (err)
goto fail;
/* This kicks hotplug scripts, so do it immediately. */
backend_create_netif(be);
return 0;
abort_transaction:
xenbus_transaction_end(xbt, 1);
xenbus_dev_fatal(dev, err, "%s", message);
fail:
DPRINTK("failed");
netback_remove(dev);
return err;
}
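backend_create_netif itself is not quoted above; roughly, it reads the vif handle from xenstore and creates the network device if it does not already exist. A reconstructed sketch (details may differ):
static void backend_create_netif(struct backend_info *be)
{
	int err;
	long handle;
	struct xenbus_device *dev = be->dev;
	if (be->netif != NULL)
		return;
	err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle);
	if (err != 1) {
		xenbus_dev_fatal(dev, err, "reading handle");
		return;
	}
	/* The frontend domain is the "other end" of this xenbus device. */
	be->netif = netif_alloc(&dev->dev, dev->otherend_id, handle);
	if (IS_ERR(be->netif)) {
		err = PTR_ERR(be->netif);
		be->netif = NULL;
		xenbus_dev_fatal(dev, err, "creating interface");
		return;
	}
	/* Registering the vif netdev is what kicks the hotplug scripts. */
	kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE);
}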
backend_create_netif calls netif_alloc to create a net_device whose private area holds the xen_netif structure:
struct xen_netif *netif_alloc(struct device *parent, domid_t domid, unsigned int handle)
{
int err = 0;
struct net_device *dev;
struct xen_netif *netif;
char name[IFNAMSIZ] = {};
snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle);
dev = alloc_netdev(sizeof(struct xen_netif), name, ether_setup);
if (dev == NULL) {
DPRINTK("Could not create netif: out of memory\n");
return ERR_PTR(-ENOMEM);
}
The net_device, with room for the xen_netif private area right behind it, has now been allocated.
SET_NETDEV_DEV(dev, parent);
netif = netdev_priv(dev);
memset(netif, 0, sizeof(*netif));
netif->domid = domid;
netif->handle = handle;
netif->features = NETIF_F_SG;
atomic_set(&netif->refcnt, 1);
init_waitqueue_head(&netif->waiting_to_free);
netif->dev = dev;
INIT_LIST_HEAD(&netif->list);
netback_carrier_off(netif);
Initialize the xen_netif structure.
atomic64_set(&netif->remaining_credit,INT_MAX);
netif->credit_bytes = INT_MAX;
netif->credit_usec = 0L;
init_timer(&netif->credit_timeout);
/* Initialize 'expires' now: it's used to track the credit window. */
netif->credit_timeout.expires = jiffies;
init_timer(&netif->tx_queue_timeout);
tx_queue_timeout is a timer used to drop packets that have sat in the tx queue for too long.
dev->netdev_ops = &netback_ops;
dev->features = NETIF_F_IP_CSUM|NETIF_F_SG;
SET_ETHTOOL_OPS(dev, &network_ethtool_ops);
dev->tx_queue_len = netbk_queue_length;
The length of netback's tx queue; making it too large increases latency.
/*
* Initialise a dummy MAC address. We choose the numerically
* largest non-broadcast address to prevent the address getting
* stolen by an Ethernet bridge for STP purposes.
* (FE:FF:FF:FF:FF:FF)
*/
memset(dev->dev_addr, 0xFF, ETH_ALEN);
dev->dev_addr[0] &= ~0x01;
So every netback interface carries the MAC address FE:FF:FF:FF:FF:FF.
rtnl_lock();
err = register_netdevice(dev);
rtnl_unlock();
if (err) {
DPRINTK("Could not register new net device %s: err=%d\n",
dev->name, err);
free_netdev(dev);
return ERR_PTR(err);
}
DPRINTK("Successfully created netif\n");
return netif;
}
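netback_carrier_off (used above) and netback_carrier_on / netback_carrier_ok (used later) are not shown in this post; to the best of my recollection they are trivial accessors in common.h over the carrier field of xen_netif, i.e. netback's own "is the frontend connected" flag rather than the core stack's carrier state:
/* Reconstructed; essentially what common.h defines. */
#define netback_carrier_on(netif)	((netif)->carrier = 1)
#define netback_carrier_off(netif)	((netif)->carrier = 0)
#define netback_carrier_ok(netif)	((netif)->carrier)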
The netback_ops structure is:
static struct net_device_ops netback_ops =
{
.ndo_start_xmit = netif_be_start_xmit,
.ndo_get_stats = netif_be_get_stats,
.ndo_open = net_open,
.ndo_stop = net_close,
.ndo_change_mtu = netbk_change_mtu,
};
When netfront's state changes, netback learns about it through xenbus (via the otherend_changed callback registered above), and frontend_changed is invoked:
static void frontend_changed(struct xenbus_device *dev,
enum xenbus_state frontend_state)
{
struct backend_info *be = dev_get_drvdata(&dev->dev);
DPRINTK("%s", xenbus_strstate(frontend_state));
be->frontend_state = frontend_state;
switch (frontend_state) {
case XenbusStateInitialising:
if (dev->state == XenbusStateClosed) {
printk(KERN_INFO "%s: %s: prepare for reconnect\n",
__FUNCTION__, dev->nodename);
xenbus_switch_state(dev, XenbusStateInitWait);
}
break;
case XenbusStateInitialised:
break;
case XenbusStateConnected:
if (dev->state == XenbusStateConnected)
break;
backend_create_netif(be);
if (be->netif)
connect(be);
break;
case XenbusStateClosing:
if (be->netif)
kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE);
disconnect_backend(dev);
xenbus_switch_state(dev, XenbusStateClosing);
break;
case XenbusStateClosed:
xenbus_switch_state(dev, XenbusStateClosed);
if (xenbus_dev_is_online(dev))
break;
/* fall through if not online */
case XenbusStateUnknown:
device_unregister(&dev->dev);
break;
default:
xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
frontend_state);
break;
}
}
For example, when netfront reports XenbusStateConnected, connect() is called to establish the connection with the frontend:
static void connect(struct backend_info *be)
{
int err;
struct xenbus_device *dev = be->dev;
err = connect_rings(be);
if (err)
return;
err = xen_net_read_mac(dev, be->netif->fe_dev_addr);
if (err) {
xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename);
return;
}
xen_net_read_rate(dev, &be->netif->credit_bytes,
&be->netif->credit_usec);
atomic64_set(&be->netif->remaining_credit,be->netif->credit_bytes);
unregister_hotplug_status_watch(be);
err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch,
hotplug_status_changed,
"%s/%s", dev->nodename, "hotplug-status");
if (err) {
/* Switch now, since we can't do a watch. */
xenbus_switch_state(dev, XenbusStateConnected);
} else {
be->have_hotplug_status_watch = 1;
}
Register the xenbus watch on hotplug-status.
unregister_rate_watch(be);
err = xenbus_watch_pathfmt(dev, &be->rate_watch,
			   rate_changed, "%s/%s", dev->nodename, "rate");
if (!err) {
	be->have_rate_watch = 1;
}
Register the xenbus watch on rate.
netif_wake_queue(be->netif->dev);
Wake up the transmit queue.
}
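Neither rate_changed nor xen_net_read_rate is quoted in this post. xen_net_read_rate parses the rate node ("bytes,usec") under the backend's nodename; the rate watch callback presumably just re-reads it and refreshes the shaping parameters. A sketch under that assumption (not the actual patch):
static void rate_changed(struct xenbus_watch *watch,
			 const char **vec, unsigned int vec_size)
{
	struct backend_info *be = container_of(watch, struct backend_info,
					       rate_watch);
	/* Re-read "<nodename>/rate" and refresh the credit-based shaper. */
	xen_net_read_rate(be->dev, &be->netif->credit_bytes,
			  &be->netif->credit_usec);
	atomic64_set(&be->netif->remaining_credit, be->netif->credit_bytes);
}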
static int connect_rings(struct backend_info *be)
{
struct xenbus_device *dev = be->dev;
unsigned long tx_ring_ref, rx_ring_ref;
unsigned int evtchn, rx_copy;
int err;
int val;
DPRINTK("");
err = xenbus_gather(XBT_NIL, dev->otherend,
"tx-ring-ref", "%lu", &tx_ring_ref,
"rx-ring-ref", "%lu", &rx_ring_ref,
"event-channel", "%u", &evtchn, NULL);
if (err) {
xenbus_dev_fatal(dev, err,
"reading %s/ring-ref and event-channel",
dev->otherend);
return err;
}
err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u",
&rx_copy);
if (err == -ENOENT) {
err = 0;
rx_copy = 0;
}
if (err < 0) {
xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy",
dev->otherend);
return err;
}
if (!rx_copy)
return -EOPNOTSUPP;
if (be->netif->dev->tx_queue_len != 0) {
if (xenbus_scanf(XBT_NIL, dev->otherend,
"feature-rx-notify", "%d", &val) < 0)
val = 0;
if (val)
be->netif->can_queue = 1;
else
/* Must be non-zero for pfifo_fast to work. */
be->netif->dev->tx_queue_len = 1;
}
if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0)
val = 0;
if (!val) {
be->netif->features &= ~NETIF_F_SG;
be->netif->dev->features &= ~NETIF_F_SG;
if (be->netif->dev->mtu > ETH_DATA_LEN)
be->netif->dev->mtu = ETH_DATA_LEN;
}
if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d",
&val) < 0)
val = 0;
if (val) {
be->netif->features |= NETIF_F_TSO;
be->netif->dev->features |= NETIF_F_TSO;
}
if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload",
"%d", &val) < 0)
val = 0;
if (val) {
be->netif->features &= ~NETIF_F_IP_CSUM;
be->netif->dev->features &= ~NETIF_F_IP_CSUM;
}
if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-smart-poll",
"%d", &val) < 0)
val = 0;
if (val)
be->netif->smart_poll = 1;
else
be->netif->smart_poll = 0;
All of the above pulls the netfront feature parameters out of xenstore.
/* Map the shared frame, irq etc. */
err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn);
if (err) {
xenbus_dev_fatal(dev, err,
"mapping shared-frames %lu/%lu port %u",
tx_ring_ref, rx_ring_ref, evtchn);
return err;
}
return 0;
}
connect_rings in turn calls netif_map, which maps the frontend's I/O ring pages into netback:
int netif_map(struct xen_netif *netif, unsigned long tx_ring_ref,
unsigned long rx_ring_ref, unsigned int evtchn)
{
int err = -ENOMEM;
struct xen_netif_tx_sring *txs;
struct xen_netif_rx_sring *rxs;
/* Already connected through? */
if (netif->irq)
return 0;
netif->tx_comms_area = alloc_vm_area(PAGE_SIZE);
if (netif->tx_comms_area == NULL)
return -ENOMEM;
netif->rx_comms_area = alloc_vm_area(PAGE_SIZE);
if (netif->rx_comms_area == NULL)
goto err_rx;
alloc_vm_area reserves the address space for each ring; tx_comms_area and rx_comms_area are each a vm_struct, which describes a virtually contiguous range of kernel addresses.
err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref);
if (err)
goto err_map;
err = bind_interdomain_evtchn_to_irqhandler(
netif->domid, evtchn, netif_be_int, 0,
netif->dev->name, netif);
if (err < 0)
goto err_hypervisor;
netif->irq = err;
disable_irq(netif->irq);
Bind the inter-domain event channel to an irq handler.
txs = (struct xen_netif_tx_sring *)netif->tx_comms_area->addr;
BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE);
rxs = (struct xen_netif_rx_sring *)
((char *)netif->rx_comms_area->addr);
BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE);
Initialize the tx and rx back rings; each ring is one PAGE_SIZE in size (BACK_RING_INIT is sketched after this function).
netif->rx_req_cons_peek = 0;
netif_get(netif);
rtnl_lock();
netback_carrier_on(netif);
if (netif_running(netif->dev))
__netif_up(netif);
rtnl_unlock();
return 0;
err_hypervisor:
unmap_frontend_pages(netif);
err_map:
free_vm_area(netif->rx_comms_area);
err_rx:
free_vm_area(netif->tx_comms_area);
return err;
}
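BACK_RING_INIT comes from the Xen ring macros (xen/interface/io/ring.h); paraphrased, it points the private back ring at the shared page and resets the bookkeeping:
/* Paraphrased from the Xen ring header. */
#define BACK_RING_INIT(_r, _s, __size) do {		\
	(_r)->rsp_prod_pvt = 0;				\
	(_r)->req_cons = 0;				\
	(_r)->nr_ents = __RING_SIZE(_s, __size);	\
	(_r)->sring = (_s);				\
} while (0)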
map_frontend_pages, called above, maps the grant references handed over by netfront, i.e. it maps the granted pages into netback's own address space. A complete page-mapping cycle goes through the following steps:
1. netfront creates a grant reference (GR) by calling gnttab_claim_grant_reference; here these are the two references tx_ring_ref and rx_ring_ref.
2. netfront grants netback access to the GR (gnttab_grant_foreign_access_ref); at this point the grant entry records the page's mfn and the grantee domid, i.e. netfront_info->xbdev->otherend_id. A sketch of this frontend side follows.
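For reference, the frontend side of steps 1 and 2 looks roughly like this in netfront's ring setup (abridged sketch, not quoted verbatim; info and dev refer to netfront's netfront_info and its xenbus_device):
/* Sketch: allocate the shared tx ring page, initialise it, and grant
 * the backend (dev->otherend_id) access to it. */
struct xen_netif_tx_sring *txs;
int err;
txs = (struct xen_netif_tx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH);
SHARED_RING_INIT(txs);
FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
/* xenbus_grant_ring() (old single-page form) wraps gnttab_grant_foreign_access()
 * and returns the grant reference that netfront then writes into xenstore
 * as "tx-ring-ref". */
err = xenbus_grant_ring(dev, virt_to_mfn(txs));
if (err >= 0)
	info->tx_ring_ref = err;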
3. netback calls map_frontend_pages to perform the mapping; the function is:
static int map_frontend_pages(
struct xen_netif *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref)
{
struct gnttab_map_grant_ref op;
gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr, GNTMAP_host_map, tx_ring_ref, netif->domid);
gnttab_set_map_op fills in the gnttab_map_grant_ref structure: the map type is GNTMAP_host_map, and the GR identified by tx_ring_ref is to be mapped at the local virtual address tx_comms_area->addr.
if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
BUG();
Issue the grant-table hypercall to perform the mapping.
if (op.status) {
DPRINTK(" Gnttab failure mapping tx_ring_ref!\n");
return op.status;
}
netif->tx_shmem_ref = tx_ring_ref;
netif->tx_shmem_handle = op.handle;
op.handle is returned by the hypercall; it identifies this grant mapping and is needed later to unmap it.
gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr, GNTMAP_host_map, rx_ring_ref, netif->domid);
if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
BUG();
if (op.status) {
struct gnttab_unmap_grant_ref unop;
gnttab_set_unmap_op(&unop,
(unsigned long)netif->tx_comms_area->addr,
GNTMAP_host_map, netif->tx_shmem_handle);
HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &unop, 1);
DPRINTK(" Gnttab failure mapping rx_ring_ref!\n");
return op.status;
}
netif->rx_shmem_ref = rx_ring_ref;
netif->rx_shmem_handle = op.handle;
return 0;
}
4. Once the mapping is established the pages can be accessed directly; on teardown the mapping is undone by unmap_frontend_pages:
static void unmap_frontend_pages(struct xen_netif *netif)
{
struct gnttab_unmap_grant_ref op;
gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr,
GNTMAP_host_map, netif->tx_shmem_handle);
if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
BUG();
gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr,
GNTMAP_host_map, netif->rx_shmem_handle);
if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
BUG();
Set up a gnttab_unmap_grant_ref for tx_comms_area and rx_comms_area respectively, then issue the hypercall to tear each mapping down.
}
5. Finally, netfront revokes the foreign-access permission on the grant references, either via xennet_end_access or by calling gnttab_end_foreign_access_ref directly. In the netfront driver, the tx_ring and rx_ring references are released through xennet_end_access (called from xennet_disconnect_backend); pages used for transmitted data are reclaimed with gnttab_end_foreign_access_ref, while receive pages handed over through grant-table page transfer are reclaimed with gnttab_end_foreign_transfer_ref (see xennet_uninit in the netfront driver, which calls xennet_release_tx_bufs and xennet_release_rx_bufs).
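xennet_end_access itself is a thin wrapper in netfront (quoted from memory, so treat it as a sketch):
static void xennet_end_access(int ref, void *page)
{
	/* This frees the page as a side effect. */
	if (ref != GRANT_INVALID_REF)
		gnttab_end_foreign_access(ref, 0, (unsigned long)page);
}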
Finally, let's look at the net_device_ops interface that netback provides.
static struct net_device_ops netback_ops =
{
.ndo_start_xmit = netif_be_start_xmit,
.ndo_get_stats = netif_be_get_stats,
.ndo_open = net_open,
.ndo_stop = net_close,
.ndo_change_mtu = netbk_change_mtu,
};
static int net_open(struct net_device *dev)
{
struct xen_netif *netif = netdev_priv(dev);
if (netback_carrier_ok(netif)) {
__netif_up(netif);
netif_start_queue(dev);
}
return 0;
}
static int net_close(struct net_device *dev)
{
struct xen_netif *netif = netdev_priv(dev);
if (netback_carrier_ok(netif))
__netif_down(netif);
netif_stop_queue(dev);
return 0;
}
__netif_up and __netif_down enable/disable the interrupt. In __netif_up, if there are packets waiting to be sent, the xen_netif is put on the global net_schedule_list and maybe_schedule_tx_action is called, which decides whether to fire tasklet_schedule(&net_tx_tasklet) to do the actual transmission (a sketch follows).
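maybe_schedule_tx_action is small; roughly it looks like the following (simplified sketch; nr_pending_reqs() and MAX_PENDING_REQS are netback internals not shown in this post):
/* Only kick the tx tasklet when there is room for more pending requests
 * and at least one interface is waiting on net_schedule_list. */
static void maybe_schedule_tx_action(void)
{
	smp_mb();
	if ((nr_pending_reqs() < (MAX_PENDING_REQS / 2)) &&
	    !list_empty(&net_schedule_list))
		tasklet_schedule(&net_tx_tasklet);
}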
netif_be_start_xmit is left for a later post.