From mboxrd@z Thu Jan 1 00:00:00 1970 From: Anthony Liguori Subject: Re: [PATCH 4/5] kvm: qemu: Use vringfd to eliminate copies Date: Sat, 14 Jun 2008 18:28:34 -0500 Message-ID: <48545422.1040109@us.ibm.com> References: <1213365481-23460-1-git-send-email-markmc@redhat.com> <1213365481-23460-2-git-send-email-markmc@redhat.com> <1213365481-23460-3-git-send-email-markmc@redhat.com> <1213365481-23460-4-git-send-email-markmc@redhat.com> <1213365481-23460-5-git-send-email-markmc@redhat.com> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-1; format=flowed Content-Transfer-Encoding: 7bit Cc: Avi Kivity , Rusty Russell , kvm@vger.kernel.org To: Mark McLoughlin Return-path: Received: from e31.co.us.ibm.com ([32.97.110.149]:57356 "EHLO e31.co.us.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754878AbYFNX3F (ORCPT ); Sat, 14 Jun 2008 19:29:05 -0400 Received: from d03relay04.boulder.ibm.com (d03relay04.boulder.ibm.com [9.17.195.106]) by e31.co.us.ibm.com (8.13.8/8.13.8) with ESMTP id m5ENSxca014207 for ; Sat, 14 Jun 2008 19:28:59 -0400 Received: from d03av04.boulder.ibm.com (d03av04.boulder.ibm.com [9.17.195.170]) by d03relay04.boulder.ibm.com (8.13.8/8.13.8/NCO v9.0) with ESMTP id m5ENSxL2168522 for ; Sat, 14 Jun 2008 17:28:59 -0600 Received: from d03av04.boulder.ibm.com (loopback [127.0.0.1]) by d03av04.boulder.ibm.com (8.12.11.20060308/8.13.3) with ESMTP id m5ENSw8V012325 for ; Sat, 14 Jun 2008 17:28:59 -0600 In-Reply-To: <1213365481-23460-5-git-send-email-markmc@redhat.com> Sender: kvm-owner@vger.kernel.org List-ID: Mark McLoughlin wrote: > Where a VLAN consists only of a single tap interface on the > host side and a single virtio interface on the guest side, > we can use the tun/tap driver's vringfd support to eliminate > copies. > > We set up a vringfd for both rings and pass them to the > tun/tap driver. When the guest informs us of the location of > the ring pages, we use the VRINGSETINFO ioctl() to inform > the /dev/vring driver. > This patch set is useful for testing (I have one too in my patch queue). We need to make some more pervasive changes to QEMU though to take advantage of vringfd upstream. Specifically, we need to introduce a RX/TX buffer adding/polling API for VLANClientState. We can then use this within a vringfd VLAN client to push the indexes to vringfd. We can't use the base/limit stuff in QEMU so we have to do translation. Not a big deal really. Have you benchmarked the driver? I wasn't seeing great performance myself although I think that was due to some bugs in the vringfd code. Regards, Anthony Liguori > On xmit, we write() to the vringfd to notify tun/tap of new > buffers. Once the buffers have been sent, the vringfd > becomes readable and we read() from it, causing the buffers > to be placed back on the used ring. We can then notify the > guest that that we're done with them. > > On the recv side, the guest makes buffers available via the > ring and we merely poll() the vringfd for notification of > tun/tap having buffers available. When the vringfd becomes > readable, we read() from it to make tun/tap copy its buffers > into the guest buffers and, finally, we notify the guest. > > Signed-off-by: Mark McLoughlin > --- > qemu/hw/virtio-net.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ > qemu/hw/virtio.c | 58 +++++++++++++++++++++++++++++++++++++ > qemu/hw/virtio.h | 4 ++ > qemu/net.h | 4 ++ > qemu/vl.c | 32 ++++++++++++++++++++ > 5 files changed, 176 insertions(+), 0 deletions(-) > > diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c > index 6c42bf0..f3ff356 100644 > --- a/qemu/hw/virtio-net.c > +++ b/qemu/hw/virtio-net.c > @@ -15,6 +15,7 @@ > #include "net.h" > #include "qemu-timer.h" > #include "qemu-kvm.h" > +#include "qemu-char.h" > > /* from Linux's virtio_net.h */ > > @@ -156,6 +157,12 @@ static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq) > if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK)) > return; > > + if (vq->vringfd > 0) { > + if (write(vq->vringfd, "", 0) != 0) > + fprintf(stderr, "Failed to notify host kernel of xmit buffers\n"); > + return; > + } > + > while (virtqueue_pop(vq, &elem)) { > ssize_t len = 0; > > @@ -205,6 +212,65 @@ static void virtio_net_tx_timer(void *opaque) > virtio_net_flush_tx(n, n->tx_vq); > } > > +static void virtio_net_rx_used(void *opaque) > +{ > + VirtIONet *n = opaque; > + > + if (read(n->rx_vq->vringfd, NULL, 0) != 0) { > + fprintf(stderr, "Failed to pull used rx buffers\n"); > + return; > + } > + > + virtio_notify(&n->vdev, n->rx_vq); > +} > + > +static void virtio_net_tx_used(void *opaque) > +{ > + VirtIONet *n = opaque; > + > + if (read(n->tx_vq->vringfd, NULL, 0) != 0) { > + fprintf(stderr, "Failed to pull used tx buffers\n"); > + return; > + } > + > + virtio_notify(&n->vdev, n->tx_vq); > +} > + > +static void virtio_net_close_vrings(VirtIONet *n) > +{ > + if (n->rx_vq->vringfd > 0) { > + qemu_set_fd_handler(n->rx_vq->vringfd, NULL, NULL, NULL); > + virtqueue_close_vringfd(n->rx_vq); > + } > + if (n->tx_vq->vringfd > 0) { > + qemu_set_fd_handler(n->tx_vq->vringfd, NULL, NULL, NULL); > + virtqueue_close_vringfd(n->tx_vq); > + } > +} > + > +static void virtio_net_init_vrings(VirtIONet *n) > +{ > + struct VLANState *vlan = n->vc->vlan; > + VLANClientState *host = vlan->first_client; > + > + if (vlan->nb_host_devs != 1 || vlan->nb_guest_devs != 1) > + return; > + > + if (!host->set_vring_fds) > + return; > + > + if (!virtqueue_open_vringfd(n->rx_vq) || !virtqueue_open_vringfd(n->tx_vq)) { > + virtio_net_close_vrings(n); > + return; > + } > + > + qemu_set_fd_handler(n->rx_vq->vringfd, virtio_net_rx_used, NULL, n); > + qemu_set_fd_handler(n->tx_vq->vringfd, virtio_net_tx_used, NULL, n); > + > + if (!host->set_vring_fds(host, n->rx_vq->vringfd, n->tx_vq->vringfd)) > + virtio_net_close_vrings(n); > +} > + > static void virtio_net_save(QEMUFile *f, void *opaque) > { > VirtIONet *n = opaque; > @@ -224,6 +290,16 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id) > > virtio_load(&n->vdev, f); > > + if (n->rx_vq->vringfd > 0 && n->tx_vq->vringfd > 0) { > + VLANClientState *host = n->vc->vlan->first_client; > + > + qemu_set_fd_handler(n->rx_vq->vringfd, virtio_net_rx_used, NULL, n); > + qemu_set_fd_handler(n->tx_vq->vringfd, virtio_net_tx_used, NULL, n); > + > + if (!host->set_vring_fds(host, n->rx_vq->vringfd, n->tx_vq->vringfd)) > + virtio_net_close_vrings(n); > + } > + > qemu_get_buffer(f, n->mac, 6); > n->tx_timer_active = qemu_get_be32(f); > > @@ -258,6 +334,8 @@ PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn) > n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n); > n->tx_timer_active = 0; > > + virtio_net_init_vrings(n); > + > register_savevm("virtio-net", virtio_net_id++, 1, > virtio_net_save, virtio_net_load, n); > > diff --git a/qemu/hw/virtio.c b/qemu/hw/virtio.c > index a1ee93f..e9ec1ad 100644 > --- a/qemu/hw/virtio.c > +++ b/qemu/hw/virtio.c > @@ -13,6 +13,7 @@ > > #include > #include > +#include > > #include "virtio.h" > #include "sysemu.h" > @@ -193,6 +194,56 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem) > return elem->in_num + elem->out_num; > } > > +/* vringfd functions */ > + > +static int virtqueue_setup_vringfd(VirtQueue *vq) > +{ > + struct vring_ioctl_info info; > + > + if (vq->vringfd <= 0 || !vq->vring.desc) > + return 1; > + > + info.num_descs = vq->vring.num; > + info.descs = (unsigned long)vq->vring.desc; > + info.base = (unsigned long)phys_ram_base; > + info.limit = phys_ram_size; > + > + if (ioctl(vq->vringfd, VRINGSETINFO, &info) != 0) { > + fprintf(stderr, "Failed to setup vring: %s\n", strerror(errno)); > + return 0; > + } > + > + return 1; > +} > + > +int virtqueue_open_vringfd(VirtQueue *vq) > +{ > + int fd; > + > + fd = open("/dev/vring", O_RDWR); > + if (fd < 0) { > + static int warned = 0; > + if (!warned) > + fprintf(stderr, "Warning: no vringfd support available: " > + "Failed to open /dev/vring\n"); > + warned = 1; > + return 0; > + } > + > + vq->vringfd = fd; > + > + return virtqueue_setup_vringfd(vq); > +} > + > +void virtqueue_close_vringfd(VirtQueue *vq) > +{ > + if (vq->vringfd <= 0) > + return; > + > + close(vq->vringfd); > + vq->vringfd = 0; > +} > + > /* virtio device */ > > static VirtIODevice *to_virtio_device(PCIDevice *pci_dev) > @@ -248,6 +299,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val) > size_t size = virtqueue_size(vdev->vq[vdev->queue_sel].vring.num); > virtqueue_init(&vdev->vq[vdev->queue_sel], > virtio_map_gpa(pa, size)); > + virtqueue_setup_vringfd(&vdev->vq[vdev->queue_sel]); > } > break; > case VIRTIO_PCI_QUEUE_SEL: > @@ -464,6 +516,8 @@ void virtio_save(VirtIODevice *vdev, QEMUFile *f) > qemu_put_be32(f, i); > > for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) { > + qemu_put_byte(f, vdev->vq[i].vringfd > 0); > + > if (vdev->vq[i].vring.num == 0) > break; > > @@ -490,6 +544,9 @@ void virtio_load(VirtIODevice *vdev, QEMUFile *f) > num = qemu_get_be32(f); > > for (i = 0; i < num; i++) { > + if (qemu_get_byte(f)) > + virtqueue_open_vringfd(&vdev->vq[i]); > + > vdev->vq[i].vring.num = qemu_get_be32(f); > qemu_get_be32s(f, &vdev->vq[i].pfn); > vq_last_avail(&vdev->vq[i]) = qemu_get_be16(f); > @@ -501,6 +558,7 @@ void virtio_load(VirtIODevice *vdev, QEMUFile *f) > pa = (ram_addr_t)vdev->vq[i].pfn << TARGET_PAGE_BITS; > size = virtqueue_size(vdev->vq[i].vring.num); > virtqueue_init(&vdev->vq[i], virtio_map_gpa(pa, size)); > + virtqueue_setup_vringfd(&vdev->vq[i]); > } > } > > diff --git a/qemu/hw/virtio.h b/qemu/hw/virtio.h > index 142ecbd..d3cf5fb 100644 > --- a/qemu/hw/virtio.h > +++ b/qemu/hw/virtio.h > @@ -93,6 +93,7 @@ struct VirtQueue > VRing vring; > uint32_t pfn; > int inuse; > + int vringfd; > void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq); > }; > > @@ -152,4 +153,7 @@ void virtio_save(VirtIODevice *vdev, QEMUFile *f); > > void virtio_load(VirtIODevice *vdev, QEMUFile *f); > > +int virtqueue_open_vringfd(VirtQueue *vq); > +void virtqueue_close_vringfd(VirtQueue *vq); > + > #endif > diff --git a/qemu/net.h b/qemu/net.h > index 9dc8b7c..e8ed926 100644 > --- a/qemu/net.h > +++ b/qemu/net.h > @@ -9,12 +9,15 @@ typedef ssize_t (IOReadvHandler)(void *, const struct iovec *, int); > > typedef struct VLANClientState VLANClientState; > > +typedef int (SetVringFDs)(VLANClientState *, int, int); > + > struct VLANClientState { > IOReadHandler *fd_read; > IOReadvHandler *fd_readv; > /* Packets may still be sent if this returns zero. It's used to > rate-limit the slirp code. */ > IOCanRWHandler *fd_can_read; > + SetVringFDs *set_vring_fds; > void *opaque; > struct VLANClientState *next; > struct VLANState *vlan; > @@ -26,6 +29,7 @@ struct VLANState { > VLANClientState *first_client; > struct VLANState *next; > unsigned int nb_guest_devs, nb_host_devs; > + unsigned int hotplug_disabled : 1; > }; > > VLANState *qemu_find_vlan(int id); > diff --git a/qemu/vl.c b/qemu/vl.c > index f573dce..d043ccd 100644 > --- a/qemu/vl.c > +++ b/qemu/vl.c > @@ -4166,6 +4166,31 @@ static void tap_send(void *opaque) > } while (s->size > 0); > } > > +#ifdef TUNSETRECVVRING > +static int tap_set_vring_fds(VLANClientState *vc, int recv, int xmit) > +{ > + TAPState *s = vc->opaque; > + > + if (ioctl(s->fd, TUNSETRECVVRING, recv) != 0) { > + fprintf(stderr, "TUNSETRECVVRING ioctl() failed: %s\n", > + strerror(errno)); > + return 0; > + } > + > + if (ioctl(s->fd, TUNSETXMITVRING, xmit) != 0) { > + fprintf(stderr, "TUNSETXMITVRING ioctl() failed: %s\n", > + strerror(errno)); > + exit(1); > + } > + > + qemu_set_fd_handler(s->fd, NULL, NULL, NULL); > + > + vc->vlan->hotplug_disabled = 1; > + > + return 1; > +} > +#endif > + > /* fd support */ > > static TAPState *net_tap_fd_init(VLANState *vlan, int fd) > @@ -4178,6 +4203,9 @@ static TAPState *net_tap_fd_init(VLANState *vlan, int fd) > s->fd = fd; > s->vc = qemu_new_vlan_client(vlan, tap_receive, NULL, s); > s->vc->fd_readv = tap_readv; > +#ifdef TUNSETRECVVRING > + s->vc->set_vring_fds = tap_set_vring_fds; > +#endif > qemu_set_fd_handler2(s->fd, tap_can_send, tap_send, NULL, s); > snprintf(s->vc->info_str, sizeof(s->vc->info_str), "tap: fd=%d", fd); > return s; > @@ -4974,6 +5002,10 @@ int net_client_init(const char *str) > fprintf(stderr, "Could not create vlan %d\n", vlan_id); > return -1; > } > + if (vlan->hotplug_disabled) { > + fprintf(stderr, "Hotplug disabled on vlan %d\n", vlan_id); > + return -1; > + } > if (!strcmp(device, "nic")) { > NICInfo *nd; > uint8_t *macaddr; >