Configuring multi-queue for a virtual machine interface
Configuring multiple queues for a virtual machine interface improves the transmit/receive performance of the VM's NIC.
In the example below, one NIC of the VM is configured with 4 queues.
# virsh dumpxml 5a6a67e65b2d43c6850dc8998a6d51f1
......
......
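The part of the domain XML that matters here is the queues attribute on the interface's vhost driver element. A minimal sketch follows; the interface type, bridge name and everything except the driver line are placeholders, not taken from the dump above:

<interface type='bridge'>
  <source bridge='br0'/>
  <model type='virtio'/>
  <driver name='vhost' queues='4'/>
</interface>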
After the VM starts, you can see that both the tap device on the host and the NIC inside the VM now have 4 queues.
Implementation of vhost-net multi-queue
The implementation consists of several parts.
Multi-queue on the tap device
The tap device driver was introduced in https://www.jianshu.com/p/53b3199c9a92. A tap device is typically used as follows:
int open_tun (const char *dev, char *actual, int size)
{
    struct ifreq ifr;
    int fd;
    char *device = "/dev/net/tun";

    if ((fd = open (device, O_RDWR)) < 0)   // open the tun character device to get a descriptor
        msg (M_ERR, "Cannot open TUN/TAP dev %s", device);

    memset (&ifr, 0, sizeof (ifr));
    ifr.ifr_flags = IFF_NO_PI;
    if (!strncmp (dev, "tun", 3)) {
        ifr.ifr_flags |= IFF_TUN;
    } else if (!strncmp (dev, "tap", 3)) {
        ifr.ifr_flags |= IFF_TAP;
    } else {
        msg (M_FATAL, "I don't recognize device %s as a TUN or TAP device", dev);
    }

    if (strlen (dev) > 3)   /* unit number specified? */
        strncpy (ifr.ifr_name, dev, IFNAMSIZ);

    if (ioctl (fd, TUNSETIFF, (void *) &ifr) < 0)   // set up the virtual NIC on this descriptor
        msg (M_ERR, "Cannot ioctl TUNSETIFF %s", dev);

    set_nonblock (fd);
    msg (M_INFO, "TUN/TAP device %s opened", ifr.ifr_name);
    strncpynt (actual, ifr.ifr_name, size);
    return fd;
}
This involves two steps:
1) Open the tun character device, which returns a file descriptor. In the kernel this creates a tun_file structure, which is the abstraction of one queue.
2) Issue TUNSETIFF on that descriptor to set up the virtual NIC. This is what actually creates the tun device: the kernel allocates the net_device and its private data tun_struct, and attaches the queue (the tun_file) to the device.
A tun_struct represents one tun/tap device. Its definition contains an array of tun_file pointers, i.e. the device's queues. tun_attach() binds a tun_file to a tun_struct, and numqueues in the tun_struct is incremented for every queue attached. So each time userspace repeats the open_tun pattern above, opening "/dev/net/tun" once more and issuing TUNSETIFF against the same device name (with IFF_MULTI_QUEUE set, as the kernel code below requires), the tun/tap device gains one more queue, as sketched below.
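A minimal userspace sketch of that pattern; the function name, the device name and the lack of error handling are purely illustrative, not taken from the article:

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/if.h>
#include <linux/if_tun.h>

/* Open nqueues queues of the same tap device: the first TUNSETIFF creates
 * the device (net_device + tun_struct), every further open()+TUNSETIFF on
 * the same name attaches one more queue (one more tun_file). */
static int open_tap_queues(const char *name, int fds[], int nqueues)
{
    struct ifreq ifr;
    int i;

    for (i = 0; i < nqueues; i++) {
        fds[i] = open("/dev/net/tun", O_RDWR);
        if (fds[i] < 0)
            return -1;

        memset(&ifr, 0, sizeof(ifr));
        /* IFF_MULTI_QUEUE is what allows more than one tun_file per device */
        ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_MULTI_QUEUE;
        strncpy(ifr.ifr_name, name, IFNAMSIZ);
        if (ioctl(fds[i], TUNSETIFF, (void *)&ifr) < 0)
            return -1;
    }
    return 0;
}

For a 4-queue VM NIC, something like open_tap_queues("tap0", fds, 4) yields four queue fds on the same tap device.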
Judging from the definitions (3.10 kernel), a tun/tap device supports at most 8 queues (MAX_TAP_QUEUES in drivers/net/tun.c is DEFAULT_MAX_NUM_RSS_QUEUES):
#define DEFAULT_MAX_NUM_RSS_QUEUES  (8)

#define MAX_TAP_QUEUES  DEFAULT_MAX_NUM_RSS_QUEUES

struct tun_struct {
    struct tun_file __rcu *tfiles[MAX_TAP_QUEUES];
    unsigned int numqueues;
    ......
};
static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
{
    struct tun_struct *tun;
    struct tun_file *tfile = file->private_data;
    struct net_device *dev;
    int err;

    if (tfile->detached)
        return -EINVAL;

    dev = __dev_get_by_name(net, ifr->ifr_name);
    if (dev) {
        /* the tun device already exists: only attach the new queue to it */
        if (ifr->ifr_flags & IFF_TUN_EXCL)
            return -EBUSY;
        if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
            tun = netdev_priv(dev);
        else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
            tun = netdev_priv(dev);
        else
            return -EINVAL;

        if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
            !!(tun->flags & TUN_TAP_MQ))
            return -EINVAL;

        if (tun_not_capable(tun))
            return -EPERM;
        err = security_tun_dev_open(tun->security);
        if (err < 0)
            return err;

        err = tun_attach(tun, file);
        if (err < 0)
            return err;

        if (tun->flags & TUN_TAP_MQ &&
            (tun->numqueues + tun->numdisabled > 1)) {
            /* One or more queue has already been attached, no need
             * to initialize the device again.
             */
            return 0;
        }
    }
    else {
        char *name;
        unsigned long flags = 0;
        int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
                     MAX_TAP_QUEUES : 1;

        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
            return -EPERM;
        err = security_tun_dev_create();
        if (err < 0)
            return err;

        /* Set dev type */
        if (ifr->ifr_flags & IFF_TUN) {
            /* TUN device */
            flags |= TUN_TUN_DEV;
            name = "tun%d";
        } else if (ifr->ifr_flags & IFF_TAP) {
            /* TAP device */
            flags |= TUN_TAP_DEV;
            name = "tap%d";
        } else
            return -EINVAL;

        if (*ifr->ifr_name)
            name = ifr->ifr_name;

        /* create the tun device */
        dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
                               tun_setup, queues, queues);
        ......
        /* attach the queue (tun_file) to the tun device */
        err = tun_attach(tun, file);
        if (err < 0)
            goto err_free_dev;
        ......
}
static int tun_attach(struct tun_struct *tun, struct file *file)
{
    struct tun_file *tfile = file->private_data;
    int err;

    err = security_tun_dev_attach(tfile->socket.sk, tun->security);
    if (err < 0)
        goto out;

    err = -EINVAL;
    if (rtnl_dereference(tfile->tun) && !tfile->detached)
        goto out;

    err = -EBUSY;
    if (!(tun->flags & TUN_TAP_MQ) && tun->numqueues == 1)
        goto out;

    err = -E2BIG;
    if (!tfile->detached &&
        tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
        goto out;

    err = 0;

    /* Re-attach the filter to presist device */
    if (tun->filter_attached == true) {
        err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
        if (!err)
            goto out;
    }

    /* put the queue (tun_file) into the device's queue array and bump the count */
    tfile->queue_index = tun->numqueues;
    rcu_assign_pointer(tfile->tun, tun);
    rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
    tun->numqueues++;

    if (tfile->detached)
        tun_enable_queue(tfile);
    else
        sock_hold(&tfile->sk);

    tun_set_real_num_queues(tun);

    /* device is allowed to go away first, so no need to hold extra
     * refcnt.
     */

out:
    return err;
}
Setting up multi-queue on the vhost side
Setting up multiple queues for vhost is in fact very similar to the tap case.
Every time QEMU does open("/dev/vhost-net", O_RDWR), vhost_net_open() is called and one vhost device is created, i.e. one vhost_net structure, which corresponds to one queue. As shown below, vhost_net contains an array of vhost_net_virtqueue entries (VHOST_NET_VQ_MAX = 2), one RX and one TX virtqueue, i.e. one queue pair. So to give a VM interface 4 queues, "/dev/vhost-net" has to be opened four times (a small userspace sketch follows the structure definition).
struct vhost_net {
    struct vhost_dev dev;
    struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
    struct vhost_poll poll[VHOST_NET_VQ_MAX];
    /* Number of TX recently submitted.
     * Protected by tx vq lock. */
    unsigned tx_packets;
    /* Number of times zerocopy TX recently failed.
     * Protected by tx vq lock. */
    unsigned tx_zcopy_err;
    /* Flush in progress. Protected by tx vq lock. */
    bool tx_flush;
};
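A rough userspace sketch of this part; the function name, array name and the fixed queue count of 4 are illustrative, and error handling is omitted:

#include <fcntl.h>

#define NR_QUEUES 4

/* one open() per queue pair: each call runs vhost_net_open() in the kernel
 * and allocates an independent vhost_net (one RX vq + one TX vq) */
static int open_vhost_devices(int vhost_fds[NR_QUEUES])
{
    int i;

    for (i = 0; i < NR_QUEUES; i++) {
        vhost_fds[i] = open("/dev/vhost-net", O_RDWR);
        if (vhost_fds[i] < 0)
            return -1;
    }
    return 0;
}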
QEMU then uses VHOST_NET_SET_BACKEND to establish the relationship between the vhost device and the tap device. Strictly speaking it binds the vhost device to one tap queue, because the argument passed down is the fd of the socket that belongs to a tun_file, and a tun_file, as described above, is the abstraction of a single queue.
VHOST_NET_SET_BACKEND completes the binding between the vhost_net (the vhost device) and the tap socket: the vhost_net's vq->private_data is set to the tap socket, i.e. the vhost_net is now bound to one tap queue.
So for a multi-queue VM NIC, the VHOST_NET_SET_BACKEND path has to be taken multiple times, binding several vhost_net devices to their tap queues, roughly as in the sketch below.
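Sketched from userspace, the per-queue binding looks roughly like this, assuming per-queue tap fds and vhost fds obtained as in the sketches above. This is only the shape of the calls: a real backend such as QEMU must first issue VHOST_SET_OWNER (next section) and also set up the memory table, vring addresses and kick/call eventfds before the queue can carry traffic.

#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Bind one vhost device to one tap queue: both its RX virtqueue (index 0)
 * and TX virtqueue (index 1) get the same tap queue fd as backend. */
static int set_vhost_backend(int vhost_fd, int tap_fd)
{
    struct vhost_vring_file backend;
    unsigned int idx;

    for (idx = 0; idx < 2; idx++) {
        backend.index = idx;
        backend.fd = tap_fd;
        if (ioctl(vhost_fd, VHOST_NET_SET_BACKEND, &backend) < 0)
            return -1;
    }
    return 0;
}

In the kernel, the ioctl is dispatched by vhost_net_ioctl() and lands in vhost_net_set_backend():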
static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
                            unsigned long arg)
{
    struct vhost_net *n = f->private_data;
    void __user *argp = (void __user *)arg;
    u64 __user *featurep = argp;
    struct vhost_vring_file backend;
    u64 features;
    int r;

    switch (ioctl) {
    case VHOST_NET_SET_BACKEND:
        if (copy_from_user(&backend, argp, sizeof backend))
            return -EFAULT;
        return vhost_net_set_backend(n, backend.index, backend.fd);
    ......
}

static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
    struct socket *sock, *oldsock;
    struct vhost_virtqueue *vq;
    struct vhost_net_virtqueue *nvq;
    struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL;
    int r;
    ......
    sock = get_socket(fd);
    if (IS_ERR(sock)) {
        r = PTR_ERR(sock);
        goto err_vq;
    }

    /* start polling new socket */
    oldsock = rcu_dereference_protected(vq->private_data,
                                        lockdep_is_held(&vq->mutex));
    if (sock != oldsock) {
        ubufs = vhost_net_ubuf_alloc(vq,
                                     sock && vhost_sock_zcopy(sock));
        if (IS_ERR(ubufs)) {
            r = PTR_ERR(ubufs);
            goto err_ubufs;
        }

        vhost_net_disable_vq(n, vq);
        /* vq->private_data now points at the tap queue's socket */
        rcu_assign_pointer(vq->private_data, sock);
        r = vhost_init_used(vq);
        if (r)
            goto err_used;
        r = vhost_net_enable_vq(n, vq);
        if (r)
            goto err_used;
        ......
}
In the multi-queue case, one vhost kernel thread is created per queue.
QEMU uses VHOST_SET_OWNER to create one vhost kernel thread for each vhost_net (vhost device); see the other articles in this series for details. Since configuring multi-queue for a VM creates multiple vhost devices (vhost_net), VHOST_SET_OWNER is naturally issued once per vhost device, so the kernel ends up with as many vhost threads as there are queues.
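For illustration, assuming the per-queue vhost fds from the earlier sketch (the loop is not QEMU's actual code, just the shape of the calls):

#include <sys/ioctl.h>
#include <linux/vhost.h>

/* one VHOST_SET_OWNER per vhost fd; each call makes the kernel spawn a
 * "vhost-<pid>" worker thread for that vhost_net */
static int set_vhost_owners(int vhost_fds[], int nqueues)
{
    int i;

    for (i = 0; i < nqueues; i++)
        if (ioctl(vhost_fds[i], VHOST_SET_OWNER, NULL) < 0)
            return -1;
    return 0;
}

In the kernel the ioctl ends up in vhost_net_set_owner(), which calls vhost_dev_set_owner() to create the worker thread: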
static long vhost_net_set_owner(struct vhost_net *n)
{
    int r;

    mutex_lock(&n->dev.mutex);
    if (vhost_dev_has_owner(&n->dev)) {
        r = -EBUSY;
        goto out;
    }
    r = vhost_net_set_ubuf_info(n);
    if (r)
        goto out;
    /* vhost_dev_set_owner() creates the per-device vhost worker thread */
    r = vhost_dev_set_owner(&n->dev);
    if (r)
        vhost_net_clear_ubuf_info(n);
    vhost_net_flush(n);
out:
    mutex_unlock(&n->dev.mutex);
    return r;
}