From: Mark McLoughlin <markmc@redhat.com>
To: Anthony Liguori <aliguori@us.ibm.com>,
Avi Kivity <avi@qumranet.com>,
Rusty Russell <rusty@rustcorp.com.au>
Cc: kvm@vger.kernel.org, Mark McLoughlin <markmc@redhat.com>
Subject: [PATCH 4/5] kvm: qemu: Use vringfd to eliminate copies
Date: Fri, 13 Jun 2008 14:58:00 +0100 [thread overview]
Message-ID: <1213365481-23460-5-git-send-email-markmc@redhat.com> (raw)
In-Reply-To: <1213365481-23460-4-git-send-email-markmc@redhat.com>
Where a VLAN consists only of a single tap interface on the
host side and a single virtio interface on the guest side,
we can use the tun/tap driver's vringfd support to eliminate
copies.
We set up a vringfd for both rings and pass them to the
tun/tap driver. When the guest informs us of the location of
the ring pages, we use the VRINGSETINFO ioctl() to inform
the /dev/vring driver.
On xmit, we write() to the vringfd to notify tun/tap of new
buffers. Once the buffers have been sent, the vringfd
becomes readable and we read() from it, causing the buffers
to be placed back on the used ring. We can then notify the
guest that we're done with them.
On the recv side, the guest makes buffers available via the
ring and we merely poll() the vringfd for notification of
tun/tap having buffers available. When the vringfd becomes
readable, we read() from it to make tun/tap copy its buffers
into the guest buffers and, finally, we notify the guest.
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
---
qemu/hw/virtio-net.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++
qemu/hw/virtio.c | 58 +++++++++++++++++++++++++++++++++++++
qemu/hw/virtio.h | 4 ++
qemu/net.h | 4 ++
qemu/vl.c | 32 ++++++++++++++++++++
5 files changed, 176 insertions(+), 0 deletions(-)
diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c
index 6c42bf0..f3ff356 100644
--- a/qemu/hw/virtio-net.c
+++ b/qemu/hw/virtio-net.c
@@ -15,6 +15,7 @@
#include "net.h"
#include "qemu-timer.h"
#include "qemu-kvm.h"
+#include "qemu-char.h"
/* from Linux's virtio_net.h */
@@ -156,6 +157,12 @@ static void virtio_net_flush_tx(VirtIONet *n, VirtQueue *vq)
if (!(n->vdev.status & VIRTIO_CONFIG_S_DRIVER_OK))
return;
+ if (vq->vringfd > 0) {
+ if (write(vq->vringfd, "", 0) != 0)
+ fprintf(stderr, "Failed to notify host kernel of xmit buffers\n");
+ return;
+ }
+
while (virtqueue_pop(vq, &elem)) {
ssize_t len = 0;
@@ -205,6 +212,65 @@ static void virtio_net_tx_timer(void *opaque)
virtio_net_flush_tx(n, n->tx_vq);
}
+static void virtio_net_rx_used(void *opaque)
+{
+ VirtIONet *n = opaque;
+
+ if (read(n->rx_vq->vringfd, NULL, 0) != 0) {
+ fprintf(stderr, "Failed to pull used rx buffers\n");
+ return;
+ }
+
+ virtio_notify(&n->vdev, n->rx_vq);
+}
+
+static void virtio_net_tx_used(void *opaque)
+{
+ VirtIONet *n = opaque;
+
+ if (read(n->tx_vq->vringfd, NULL, 0) != 0) {
+ fprintf(stderr, "Failed to pull used tx buffers\n");
+ return;
+ }
+
+ virtio_notify(&n->vdev, n->tx_vq);
+}
+
+static void virtio_net_close_vrings(VirtIONet *n)
+{
+ if (n->rx_vq->vringfd > 0) {
+ qemu_set_fd_handler(n->rx_vq->vringfd, NULL, NULL, NULL);
+ virtqueue_close_vringfd(n->rx_vq);
+ }
+ if (n->tx_vq->vringfd > 0) {
+ qemu_set_fd_handler(n->tx_vq->vringfd, NULL, NULL, NULL);
+ virtqueue_close_vringfd(n->tx_vq);
+ }
+}
+
+static void virtio_net_init_vrings(VirtIONet *n)
+{
+ struct VLANState *vlan = n->vc->vlan;
+ VLANClientState *host = vlan->first_client;
+
+ if (vlan->nb_host_devs != 1 || vlan->nb_guest_devs != 1)
+ return;
+
+ if (!host->set_vring_fds)
+ return;
+
+ if (!virtqueue_open_vringfd(n->rx_vq) || !virtqueue_open_vringfd(n->tx_vq)) {
+ virtio_net_close_vrings(n);
+ return;
+ }
+
+ qemu_set_fd_handler(n->rx_vq->vringfd, virtio_net_rx_used, NULL, n);
+ qemu_set_fd_handler(n->tx_vq->vringfd, virtio_net_tx_used, NULL, n);
+
+ if (!host->set_vring_fds(host, n->rx_vq->vringfd, n->tx_vq->vringfd))
+ virtio_net_close_vrings(n);
+}
+
static void virtio_net_save(QEMUFile *f, void *opaque)
{
VirtIONet *n = opaque;
@@ -224,6 +290,16 @@ static int virtio_net_load(QEMUFile *f, void *opaque, int version_id)
virtio_load(&n->vdev, f);
+ if (n->rx_vq->vringfd > 0 && n->tx_vq->vringfd > 0) {
+ VLANClientState *host = n->vc->vlan->first_client;
+
+ qemu_set_fd_handler(n->rx_vq->vringfd, virtio_net_rx_used, NULL, n);
+ qemu_set_fd_handler(n->tx_vq->vringfd, virtio_net_tx_used, NULL, n);
+
+ if (!host->set_vring_fds(host, n->rx_vq->vringfd, n->tx_vq->vringfd))
+ virtio_net_close_vrings(n);
+ }
+
qemu_get_buffer(f, n->mac, 6);
n->tx_timer_active = qemu_get_be32(f);
@@ -258,6 +334,8 @@ PCIDevice *virtio_net_init(PCIBus *bus, NICInfo *nd, int devfn)
n->tx_timer = qemu_new_timer(vm_clock, virtio_net_tx_timer, n);
n->tx_timer_active = 0;
+ virtio_net_init_vrings(n);
+
register_savevm("virtio-net", virtio_net_id++, 1,
virtio_net_save, virtio_net_load, n);
diff --git a/qemu/hw/virtio.c b/qemu/hw/virtio.c
index a1ee93f..e9ec1ad 100644
--- a/qemu/hw/virtio.c
+++ b/qemu/hw/virtio.c
@@ -13,6 +13,7 @@
#include <inttypes.h>
#include <err.h>
+#include <sys/ioctl.h>
#include "virtio.h"
#include "sysemu.h"
@@ -193,6 +194,56 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement *elem)
return elem->in_num + elem->out_num;
}
+/* vringfd functions */
+
+static int virtqueue_setup_vringfd(VirtQueue *vq)
+{
+ struct vring_ioctl_info info;
+
+ if (vq->vringfd <= 0 || !vq->vring.desc)
+ return 1;
+
+ info.num_descs = vq->vring.num;
+ info.descs = (unsigned long)vq->vring.desc;
+ info.base = (unsigned long)phys_ram_base;
+ info.limit = phys_ram_size;
+
+ if (ioctl(vq->vringfd, VRINGSETINFO, &info) != 0) {
+ fprintf(stderr, "Failed to setup vring: %s\n", strerror(errno));
+ return 0;
+ }
+
+ return 1;
+}
+
+int virtqueue_open_vringfd(VirtQueue *vq)
+{
+ int fd;
+
+ fd = open("/dev/vring", O_RDWR);
+ if (fd < 0) {
+ static int warned = 0;
+ if (!warned)
+ fprintf(stderr, "Warning: no vringfd support available: "
+ "Failed to open /dev/vring\n");
+ warned = 1;
+ return 0;
+ }
+
+ vq->vringfd = fd;
+
+ return virtqueue_setup_vringfd(vq);
+}
+
+void virtqueue_close_vringfd(VirtQueue *vq)
+{
+ if (vq->vringfd <= 0)
+ return;
+
+ close(vq->vringfd);
+ vq->vringfd = 0;
+}
+
/* virtio device */
static VirtIODevice *to_virtio_device(PCIDevice *pci_dev)
@@ -248,6 +299,7 @@ static void virtio_ioport_write(void *opaque, uint32_t addr, uint32_t val)
size_t size = virtqueue_size(vdev->vq[vdev->queue_sel].vring.num);
virtqueue_init(&vdev->vq[vdev->queue_sel],
virtio_map_gpa(pa, size));
+ virtqueue_setup_vringfd(&vdev->vq[vdev->queue_sel]);
}
break;
case VIRTIO_PCI_QUEUE_SEL:
@@ -464,6 +516,8 @@ void virtio_save(VirtIODevice *vdev, QEMUFile *f)
qemu_put_be32(f, i);
for (i = 0; i < VIRTIO_PCI_QUEUE_MAX; i++) {
+ qemu_put_byte(f, vdev->vq[i].vringfd > 0);
+
if (vdev->vq[i].vring.num == 0)
break;
@@ -490,6 +544,9 @@ void virtio_load(VirtIODevice *vdev, QEMUFile *f)
num = qemu_get_be32(f);
for (i = 0; i < num; i++) {
+ if (qemu_get_byte(f))
+ virtqueue_open_vringfd(&vdev->vq[i]);
+
vdev->vq[i].vring.num = qemu_get_be32(f);
qemu_get_be32s(f, &vdev->vq[i].pfn);
vq_last_avail(&vdev->vq[i]) = qemu_get_be16(f);
@@ -501,6 +558,7 @@ void virtio_load(VirtIODevice *vdev, QEMUFile *f)
pa = (ram_addr_t)vdev->vq[i].pfn << TARGET_PAGE_BITS;
size = virtqueue_size(vdev->vq[i].vring.num);
virtqueue_init(&vdev->vq[i], virtio_map_gpa(pa, size));
+ virtqueue_setup_vringfd(&vdev->vq[i]);
}
}
diff --git a/qemu/hw/virtio.h b/qemu/hw/virtio.h
index 142ecbd..d3cf5fb 100644
--- a/qemu/hw/virtio.h
+++ b/qemu/hw/virtio.h
@@ -93,6 +93,7 @@ struct VirtQueue
VRing vring;
uint32_t pfn;
int inuse;
+ int vringfd;
void (*handle_output)(VirtIODevice *vdev, VirtQueue *vq);
};
@@ -152,4 +153,7 @@ void virtio_save(VirtIODevice *vdev, QEMUFile *f);
void virtio_load(VirtIODevice *vdev, QEMUFile *f);
+int virtqueue_open_vringfd(VirtQueue *vq);
+void virtqueue_close_vringfd(VirtQueue *vq);
+
#endif
diff --git a/qemu/net.h b/qemu/net.h
index 9dc8b7c..e8ed926 100644
--- a/qemu/net.h
+++ b/qemu/net.h
@@ -9,12 +9,15 @@ typedef ssize_t (IOReadvHandler)(void *, const struct iovec *, int);
typedef struct VLANClientState VLANClientState;
+typedef int (SetVringFDs)(VLANClientState *, int, int);
+
struct VLANClientState {
IOReadHandler *fd_read;
IOReadvHandler *fd_readv;
/* Packets may still be sent if this returns zero. It's used to
rate-limit the slirp code. */
IOCanRWHandler *fd_can_read;
+ SetVringFDs *set_vring_fds;
void *opaque;
struct VLANClientState *next;
struct VLANState *vlan;
@@ -26,6 +29,7 @@ struct VLANState {
VLANClientState *first_client;
struct VLANState *next;
unsigned int nb_guest_devs, nb_host_devs;
+ unsigned int hotplug_disabled : 1;
};
VLANState *qemu_find_vlan(int id);
diff --git a/qemu/vl.c b/qemu/vl.c
index f573dce..d043ccd 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -4166,6 +4166,31 @@ static void tap_send(void *opaque)
} while (s->size > 0);
}
+#ifdef TUNSETRECVVRING
+static int tap_set_vring_fds(VLANClientState *vc, int recv, int xmit)
+{
+ TAPState *s = vc->opaque;
+
+ if (ioctl(s->fd, TUNSETRECVVRING, recv) != 0) {
+ fprintf(stderr, "TUNSETRECVVRING ioctl() failed: %s\n",
+ strerror(errno));
+ return 0;
+ }
+
+ if (ioctl(s->fd, TUNSETXMITVRING, xmit) != 0) {
+ fprintf(stderr, "TUNSETXMITVRING ioctl() failed: %s\n",
+ strerror(errno));
+ exit(1);
+ }
+
+ qemu_set_fd_handler(s->fd, NULL, NULL, NULL);
+
+ vc->vlan->hotplug_disabled = 1;
+
+ return 1;
+}
+#endif
+
/* fd support */
static TAPState *net_tap_fd_init(VLANState *vlan, int fd)
@@ -4178,6 +4203,9 @@ static TAPState *net_tap_fd_init(VLANState *vlan, int fd)
s->fd = fd;
s->vc = qemu_new_vlan_client(vlan, tap_receive, NULL, s);
s->vc->fd_readv = tap_readv;
+#ifdef TUNSETRECVVRING
+ s->vc->set_vring_fds = tap_set_vring_fds;
+#endif
qemu_set_fd_handler2(s->fd, tap_can_send, tap_send, NULL, s);
snprintf(s->vc->info_str, sizeof(s->vc->info_str), "tap: fd=%d", fd);
return s;
@@ -4974,6 +5002,10 @@ int net_client_init(const char *str)
fprintf(stderr, "Could not create vlan %d\n", vlan_id);
return -1;
}
+ if (vlan->hotplug_disabled) {
+ fprintf(stderr, "Hotplug disabled on vlan %d\n", vlan_id);
+ return -1;
+ }
if (!strcmp(device, "nic")) {
NICInfo *nd;
uint8_t *macaddr;
--
1.5.4.1
next prev parent reply other threads:[~2008-06-13 13:58 UTC|newest]
Thread overview: 23+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-06-13 13:57 [PATCH 0/0][RFC] KVM use of vringfd Mark McLoughlin
2008-06-13 13:57 ` [PATCH 1/5] vring: Replace mmap() interface with ioctl() Mark McLoughlin
2008-06-13 13:57 ` [PATCH 2/5] lguest: Use VRINGSETINFO ioctl() instead of mmap() Mark McLoughlin
2008-06-13 13:57 ` [PATCH 3/5] kvm: qemu: Publish last_avail index in the ring Mark McLoughlin
2008-06-13 13:58 ` Mark McLoughlin [this message]
2008-06-13 13:58 ` [PATCH 5/5] kvm: qemu: Add support for partial csums and GSO Mark McLoughlin
2008-06-14 23:28 ` [PATCH 4/5] kvm: qemu: Use vringfd to eliminate copies Anthony Liguori
2008-06-16 2:10 ` Rusty Russell
2008-06-16 14:02 ` Anthony Liguori
2008-06-16 14:58 ` Avi Kivity
2008-06-18 5:43 ` Rusty Russell
2008-06-18 14:01 ` Avi Kivity
2008-06-17 14:08 ` Mark McLoughlin
2008-06-17 14:54 ` Anthony Liguori
2008-06-17 15:45 ` Mark McLoughlin
2008-06-13 14:09 ` [PATCH 1/5] vring: Replace mmap() interface with ioctl() Avi Kivity
2008-06-17 12:19 ` Mark McLoughlin
2008-06-18 14:05 ` Avi Kivity
2008-06-14 9:02 ` Rusty Russell
2008-06-14 14:20 ` Avi Kivity
2008-06-14 23:23 ` Anthony Liguori
2008-06-15 15:24 ` Avi Kivity
2008-06-15 19:13 ` Anthony Liguori
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1213365481-23460-5-git-send-email-markmc@redhat.com \
--to=markmc@redhat.com \
--cc=aliguori@us.ibm.com \
--cc=avi@qumranet.com \
--cc=kvm@vger.kernel.org \
--cc=rusty@rustcorp.com.au \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox