vhost-net 3 -- NIC multiqueue

2022-11-17  苏苏林

Configuring interface multiqueue for a virtual machine

Configuring multiple queues for a virtual machine's interfaces improves the VM NIC's transmit and receive performance.
In the example below, one of the VM's NICs is configured with 4 queues.

# virsh dumpxml 5a6a67e65b2d43c6850dc8998a6d51f1
......
    <interface type='bridge'>
      <mac address='fa:b9:b3:7e:17:00'/>
      <source bridge='br_zsn0_31'/>
      <target dev='vnic285.0'/>
      <model type='virtio'/>
      <driver name='vhost' txmode='iothread' ioeventfd='on' event_idx='off' queues='4' rx_queue_size='256' tx_queue_size='256'/>
      <mtu size='1500'/>
      <alias name='net0'/>
      <address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/>
    </interface>
......
After the VM starts, both the tap device on the host and the NIC inside the VM show 4 queues.

[Figure: tap device with multiple queues]  [Figure: VM interface with multiple queues]

At the same time, the kernel starts four vhost threads for this VM. In other words, each queue of each interface is served by its own vhost thread.

[Figure: vhost threads created for the multiqueue interface]
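A quick way to check this from the command line (vnic285.0 is the tap name from the XML above; the guest interface name, assumed here to be eth0, depends on the image). On the host, list the tap queues and the vhost threads:

# ls /sys/class/net/vnic285.0/queues/
# ps -eLf | grep vhost

Inside the guest, the virtio NIC should report 4 combined channels:

# ethtool -l eth0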

Implementation of vhost-net multiqueue

It consists of several parts.

Tap device multiqueue

The tap device driver was introduced in https://www.jianshu.com/p/53b3199c9a92. A tap device is typically used like this:

int open_tun (const char *dev, char *actual, int size)  
{  
  struct ifreq ifr;  
  int fd;  
  char *device = "/dev/net/tun";  

  if ((fd = open (device, O_RDWR)) < 0) // open the tun character device, creating the fd
      msg (M_ERR, "Cannot open TUN/TAP dev %s", device);  

  memset (&ifr, 0, sizeof (ifr));  
  ifr.ifr_flags = IFF_NO_PI;  

  if (!strncmp (dev, "tun", 3)) {    
      ifr.ifr_flags |= IFF_TUN;  
  } else if (!strncmp (dev, "tap", 3)) {  
      ifr.ifr_flags |= IFF_TAP;  
  }  else {  
      msg (M_FATAL, "I don't recognize device %s as a TUN or TAP device",dev);  
  } 

  if (strlen (dev) > 3)      /* unit number specified? */  
      strncpy (ifr.ifr_name, dev, IFNAMSIZ);  
  if (ioctl (fd, TUNSETIFF, (void *) &ifr) < 0) // set up the virtual NIC
      msg (M_ERR, "Cannot ioctl TUNSETIFF %s", dev);  

  set_nonblock (fd);  
  msg (M_INFO, "TUN/TAP device %s opened", ifr.ifr_name);  
  strncpynt (actual, ifr.ifr_name, size);  
  return fd;  
} 

This involves two steps:
1) Open the tun character device, which returns a file descriptor. In the kernel this creates a tun_file structure, which is the abstraction of one queue;
2) TUNSETIFF sets up the virtual NIC for the fd. This is what actually creates the tun device: the kernel allocates the net_device and its private data tun_struct, and binds the queue (the tun_file) to the device.

tun_struct represents one tun/tap device. Its definition contains an array of tun_file pointers, i.e. the device's queues. tun_attach is what binds a tun_file to a tun_struct, and each attached queue increments tun_struct's numqueues. So every time the open_tun flow above is repeated, that is, "/dev/net/tun" is opened once more and TUNSETIFF is issued with the same device name, the tun/tap device gains one more queue.
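A minimal sketch of this multiqueue flow, modeled on the example in the kernel's Documentation/networking/tuntap.txt rather than on the open_tun code above (the device name and queue count are caller-supplied):

#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/if.h>
#include <linux/if_tun.h>

/* Open fds[0..queues-1] on one multiqueue tap device; each fd is one
 * queue (one tun_file in the kernel). */
int tap_open_multiqueue(const char *name, int fds[], int queues)
{
    struct ifreq ifr;
    int i;

    memset(&ifr, 0, sizeof(ifr));
    /* IFF_MULTI_QUEUE is what allows more than one tun_file to attach */
    ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_MULTI_QUEUE;
    strncpy(ifr.ifr_name, name, IFNAMSIZ);

    for (i = 0; i < queues; i++) {
        /* each open() of /dev/net/tun creates a new tun_file (queue)... */
        if ((fds[i] = open("/dev/net/tun", O_RDWR)) < 0)
            goto err;
        /* ...and TUNSETIFF with the same name attaches it to the device:
         * the first call creates the device, later calls just add queues */
        if (ioctl(fds[i], TUNSETIFF, (void *)&ifr) < 0) {
            close(fds[i]);
            goto err;
        }
    }
    return 0;
err:
    while (--i >= 0)
        close(fds[i]);
    return -1;
}

qemu receives such a set of fds, one per queue, and hands each of them to a vhost device, as described below.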
Judging from the definitions (in the 3.10 kernel, MAX_TAP_QUEUES is defined as DEFAULT_MAX_NUM_RSS_QUEUES, i.e. 8), a tun device supports at most 8 queues.

#define DEFAULT_MAX_NUM_RSS_QUEUES  (8)

struct tun_struct {
    struct tun_file __rcu   *tfiles[MAX_TAP_QUEUES];
    unsigned int            numqueues;
......
};



static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
{
    struct tun_struct *tun;
    struct tun_file *tfile = file->private_data;
    struct net_device *dev;
    int err;

    if (tfile->detached)
        return -EINVAL;

    dev = __dev_get_by_name(net, ifr->ifr_name);
    if (dev) {
        // if the tun device already exists, just attach the new queue to it
        if (ifr->ifr_flags & IFF_TUN_EXCL)
            return -EBUSY;
        if ((ifr->ifr_flags & IFF_TUN) && dev->netdev_ops == &tun_netdev_ops)
            tun = netdev_priv(dev);
        else if ((ifr->ifr_flags & IFF_TAP) && dev->netdev_ops == &tap_netdev_ops)
            tun = netdev_priv(dev);
        else
            return -EINVAL;

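        // the new queue's IFF_MULTI_QUEUE flag must match the device's
        // multiqueue (TUN_TAP_MQ) setting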
        if (!!(ifr->ifr_flags & IFF_MULTI_QUEUE) !=
            !!(tun->flags & TUN_TAP_MQ))
            return -EINVAL;

        if (tun_not_capable(tun))
            return -EPERM;
        err = security_tun_dev_open(tun->security);
        if (err < 0)
            return err;

        err = tun_attach(tun, file);
        if (err < 0)
            return err;

        if (tun->flags & TUN_TAP_MQ &&
            (tun->numqueues + tun->numdisabled > 1)) {
            /* One or more queue has already been attached, no need
             * to initialize the device again.
             */
            return 0;
        }
    }
    else {
        char *name;
        unsigned long flags = 0;
        int queues = ifr->ifr_flags & IFF_MULTI_QUEUE ?
                 MAX_TAP_QUEUES : 1;

        if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
            return -EPERM;
        err = security_tun_dev_create();
        if (err < 0)
            return err;

        /* Set dev type */
        if (ifr->ifr_flags & IFF_TUN) {
            /* TUN device */
            flags |= TUN_TUN_DEV;
            name = "tun%d";
        } else if (ifr->ifr_flags & IFF_TAP) {
            /* TAP device */
            flags |= TUN_TAP_DEV;
            name = "tap%d";
        } else
            return -EINVAL;

        if (*ifr->ifr_name)
            name = ifr->ifr_name;
        // create the tun device
        dev = alloc_netdev_mqs(sizeof(struct tun_struct), name,
                       tun_setup, queues, queues);

        ......
        // attach the queue to the tun device
        err = tun_attach(tun, file);
        if (err < 0)
            goto err_free_dev;

        ......
}

static int tun_attach(struct tun_struct *tun, struct file *file)
{
    struct tun_file *tfile = file->private_data;
    int err;

    err = security_tun_dev_attach(tfile->socket.sk, tun->security);
    if (err < 0)
        goto out;

    err = -EINVAL;
    if (rtnl_dereference(tfile->tun) && !tfile->detached)
        goto out;

    err = -EBUSY;
    if (!(tun->flags & TUN_TAP_MQ) && tun->numqueues == 1)
        goto out;

    err = -E2BIG;
    if (!tfile->detached &&
        tun->numqueues + tun->numdisabled == MAX_TAP_QUEUES)
        goto out;

    err = 0;

    /* Re-attach the filter to persist device */
    if (tun->filter_attached == true) {
        err = sk_attach_filter(&tun->fprog, tfile->socket.sk);
        if (!err)
            goto out;
    }
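    /* bind this tun_file to the device as its next queue */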
    tfile->queue_index = tun->numqueues;
    rcu_assign_pointer(tfile->tun, tun);
    rcu_assign_pointer(tun->tfiles[tun->numqueues], tfile);
    tun->numqueues++;

    if (tfile->detached)
        tun_enable_queue(tfile);
    else
        sock_hold(&tfile->sk);

    tun_set_real_num_queues(tun);

    /* device is allowed to go away first, so no need to hold extra
     * refcnt.
     */

out:
    return err;
}
Vhost multiqueue setup

Setting up multiqueue for vhost devices is actually quite similar to tap.

Every time qemu opens "/dev/vhost-net" (open("/dev/vhost-net", O_RDWR)), vhost_net_open is called to create one vhost device, i.e. one vhost_net structure, corresponding to one queue. As shown below, vhost_net contains an array of VHOST_NET_VQ_MAX (= 2) vhost_net_virtqueues, i.e. one TX/RX virtqueue pair, so to configure 4 queues for one VM interface, "/dev/vhost-net" must be opened four times.

struct vhost_net {
    struct vhost_dev dev;
    struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
    struct vhost_poll poll[VHOST_NET_VQ_MAX];
    /* Number of TX recently submitted.
     * Protected by tx vq lock. */
    unsigned tx_packets;
    /* Number of times zerocopy TX recently failed.
     * Protected by tx vq lock. */
    unsigned tx_zcopy_err;
    /* Flush in progress. Protected by tx vq lock. */
    bool tx_flush;
};

qemu then uses VHOST_NET_SET_BACKEND to associate the vhost device with the tap device. In fact, this associates the vhost device with one tap queue, because the parameter passed down is the fd of the socket belonging to a tun_file, and a tun_file, as described above, is the abstraction of one queue.

VHOST_NET_SET_BACKEND completes the binding between the vhost_net (vhost device) and the tap socket: vhost_net.vq->private_data is set to the tap socket. That is, the vhost_net is bound to one tap queue.

So for a VM NIC to support multiqueue, the VHOST_NET_SET_BACKEND flow must be run once per queue, binding the multiple vhost_nets to the multiple tap queues.
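Putting the pieces together, the userspace side can be sketched roughly as follows. This is a minimal illustration, not QEMU's actual code: tap_fd[] is assumed to hold the fds of the tap queues from the earlier sketch, and the mandatory memory-table and vring setup ioctls (VHOST_SET_MEM_TABLE, VHOST_SET_VRING_*) between VHOST_SET_OWNER and VHOST_NET_SET_BACKEND are elided.

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

#define NQUEUES 4

/* One vhost_net device (and hence one vhost kernel thread) per tap queue. */
int vhost_bind_queues(int tap_fd[NQUEUES], int vhost_fd[NQUEUES])
{
    struct vhost_vring_file backend;
    int i;

    for (i = 0; i < NQUEUES; i++) {
        /* each open() calls vhost_net_open(), creating one vhost_net */
        vhost_fd[i] = open("/dev/vhost-net", O_RDWR);
        if (vhost_fd[i] < 0)
            return -1;

        /* VHOST_SET_OWNER creates the vhost kernel thread for this device */
        if (ioctl(vhost_fd[i], VHOST_SET_OWNER, NULL) < 0)
            return -1;

        /* ... VHOST_SET_MEM_TABLE / VHOST_SET_VRING_* setup elided ... */

        /* bind the RX (index 0) and TX (index 1) vrings of this vhost_net
         * to the i-th tap queue's socket */
        backend.fd = tap_fd[i];
        backend.index = 0;
        if (ioctl(vhost_fd[i], VHOST_NET_SET_BACKEND, &backend) < 0)
            return -1;
        backend.index = 1;
        if (ioctl(vhost_fd[i], VHOST_NET_SET_BACKEND, &backend) < 0)
            return -1;
    }
    return 0;
}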

static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
                unsigned long arg)
{
    struct vhost_net *n = f->private_data;
    void __user *argp = (void __user *)arg;
    u64 __user *featurep = argp;
    struct vhost_vring_file backend;
    u64 features;
    int r;

    switch (ioctl) {
    case VHOST_NET_SET_BACKEND:
        if (copy_from_user(&backend, argp, sizeof backend))
            return -EFAULT;
        return vhost_net_set_backend(n, backend.index, backend.fd);

......
}


static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
    struct socket *sock, *oldsock;
    struct vhost_virtqueue *vq;
    struct vhost_net_virtqueue *nvq;
    struct vhost_net_ubuf_ref *ubufs, *oldubufs = NULL;
    int r;
......
    sock = get_socket(fd);
    if (IS_ERR(sock)) {
        r = PTR_ERR(sock);
        goto err_vq;
    }

    /* start polling new socket */
    oldsock = rcu_dereference_protected(vq->private_data,
                        lockdep_is_held(&vq->mutex));
    if (sock != oldsock) {
        ubufs = vhost_net_ubuf_alloc(vq,
                         sock && vhost_sock_zcopy(sock));
        if (IS_ERR(ubufs)) {
            r = PTR_ERR(ubufs);
            goto err_ubufs;
        }

        vhost_net_disable_vq(n, vq);
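        /* the binding described above: the tap queue's socket becomes
         * this virtqueue's private_data */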
        rcu_assign_pointer(vq->private_data, sock);
        r = vhost_init_used(vq);
        if (r)
            goto err_used;
        r = vhost_net_enable_vq(n, vq);
        if (r)
            goto err_used;

......
}

With multiqueue, one vhost kernel thread is created per queue.
qemu issues VHOST_SET_OWNER for each vhost_net (vhost device) to create its vhost kernel thread; see the other articles in this series for details. So setting up multiqueue for a VM creates multiple vhost devices (vhost_net), VHOST_SET_OWNER is naturally performed once per device, and the kernel ends up with as many vhost threads as there are queues.

static long vhost_net_set_owner(struct vhost_net *n)
{
    int r;

    mutex_lock(&n->dev.mutex);
    if (vhost_dev_has_owner(&n->dev)) {
        r = -EBUSY;
        goto out;
    }
    r = vhost_net_set_ubuf_info(n);
    if (r)
        goto out;
    r = vhost_dev_set_owner(&n->dev);
    if (r)
        vhost_net_clear_ubuf_info(n);
    vhost_net_flush(n);
out:
    mutex_unlock(&n->dev.mutex);
    return r;
}