* [Qemu-devel] [PATCH 01/18] raw-posix: add raw_get_aio_fd() for virtio-blk-data-plane
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 02/18] configure: add CONFIG_VIRTIO_BLK_DATA_PLANE Stefan Hajnoczi
` (16 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
The raw_get_aio_fd() function allows virtio-blk-data-plane to get the
file descriptor of a raw image file with Linux AIO enabled. This
interface is really a layering violation that can be resolved once the
block layer is able to run outside the global mutex - at that point
virtio-blk-data-plane will switch from custom Linux AIO code to using
the block layer.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block.h | 9 +++++++++
block/raw-posix.c | 34 ++++++++++++++++++++++++++++++++++
2 files changed, 43 insertions(+)
diff --git a/block.h b/block.h
index 893448a..c96f028 100644
--- a/block.h
+++ b/block.h
@@ -365,6 +365,15 @@ void bdrv_disable_copy_on_read(BlockDriverState *bs);
void bdrv_set_in_use(BlockDriverState *bs, int in_use);
int bdrv_in_use(BlockDriverState *bs);
+#ifdef CONFIG_LINUX_AIO
+int raw_get_aio_fd(BlockDriverState *bs);
+#else
+static inline int raw_get_aio_fd(BlockDriverState *bs)
+{
+ return -ENOTSUP;
+}
+#endif
+
enum BlockAcctType {
BDRV_ACCT_READ,
BDRV_ACCT_WRITE,
diff --git a/block/raw-posix.c b/block/raw-posix.c
index abfedbe..634824b 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -1777,6 +1777,40 @@ static BlockDriver bdrv_host_cdrom = {
};
#endif /* __FreeBSD__ */
+#ifdef CONFIG_LINUX_AIO
+/**
+ * Return the file descriptor for Linux AIO
+ *
+ * This function is a layering violation and should be removed when it becomes
+ * possible to call the block layer outside the global mutex. It allows the
+ * caller to hijack the file descriptor so I/O can be performed outside the
+ * block layer.
+ */
+int raw_get_aio_fd(BlockDriverState *bs)
+{
+ BDRVRawState *s;
+
+ if (!bs->drv) {
+ return -ENOMEDIUM;
+ }
+
+ if (bs->drv == bdrv_find_format("raw")) {
+ bs = bs->file;
+ }
+
+ /* raw-posix has several protocols so just check for raw_aio_readv */
+ if (bs->drv->bdrv_aio_readv != raw_aio_readv) {
+ return -ENOTSUP;
+ }
+
+ s = bs->opaque;
+ if (!s->use_aio) {
+ return -ENOTSUP;
+ }
+ return s->fd;
+}
+#endif /* CONFIG_LINUX_AIO */
+
static void bdrv_file_init(void)
{
/*
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 02/18] configure: add CONFIG_VIRTIO_BLK_DATA_PLANE
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 01/18] raw-posix: add raw_get_aio_fd() for virtio-blk-data-plane Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 03/18] dataplane: add host memory mapping code Stefan Hajnoczi
` (15 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
The virtio-blk-data-plane feature only works with Linux AIO. Therefore
add a ./configure option and necessary checks to implement this
dependency.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
configure | 21 +++++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/configure b/configure
index 728caca..c3d7c1a 100755
--- a/configure
+++ b/configure
@@ -223,6 +223,7 @@ libiscsi=""
coroutine=""
seccomp=""
glusterfs=""
+virtio_blk_data_plane=""
# parse CC options first
for opt do
@@ -880,6 +881,10 @@ for opt do
;;
--enable-glusterfs) glusterfs="yes"
;;
+ --disable-virtio-blk-data-plane) virtio_blk_data_plane="no"
+ ;;
+ --enable-virtio-blk-data-plane) virtio_blk_data_plane="yes"
+ ;;
*) echo "ERROR: unknown option $opt"; show_help="yes"
;;
esac
@@ -2257,6 +2262,17 @@ EOF
fi
##########################################
+# adjust virtio-blk-data-plane based on linux-aio
+
+if test "$virtio_blk_data_plane" = "yes" -a \
+ "$linux_aio" != "yes" ; then
+ echo "Error: virtio-blk-data-plane requires Linux AIO, please try --enable-linux-aio"
+ exit 1
+elif test -z "$virtio_blk_data_plane" ; then
+ virtio_blk_data_plane=$linux_aio
+fi
+
+##########################################
# attr probe
if test "$attr" != "no" ; then
@@ -3257,6 +3273,7 @@ echo "build guest agent $guest_agent"
echo "seccomp support $seccomp"
echo "coroutine backend $coroutine_backend"
echo "GlusterFS support $glusterfs"
+echo "virtio-blk-data-plane $virtio_blk_data_plane"
if test "$sdl_too_old" = "yes"; then
echo "-> Your SDL version is too old - please upgrade to have SDL support"
@@ -3603,6 +3620,10 @@ if test "$glusterfs" = "yes" ; then
echo "CONFIG_GLUSTERFS=y" >> $config_host_mak
fi
+if test "$virtio_blk_data_plane" = "yes" ; then
+ echo "CONFIG_VIRTIO_BLK_DATA_PLANE=y" >> $config_host_mak
+fi
+
# USB host support
case "$usb" in
linux)
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 03/18] dataplane: add host memory mapping code
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 01/18] raw-posix: add raw_get_aio_fd() for virtio-blk-data-plane Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 02/18] configure: add CONFIG_VIRTIO_BLK_DATA_PLANE Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 04/18] dataplane: add virtqueue vring code Stefan Hajnoczi
` (14 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
The data plane thread needs to map guest physical addresses to host
pointers. Normally this is done with cpu_physical_memory_map() but the
function assumes the global mutex is held. The data plane thread does
not touch the global mutex and therefore needs a thread-safe memory
mapping mechanism.
Hostmem registers a MemoryListener similar to how vhost collects and
pushes memory region information into the kernel. There is a
fine-grained lock on the regions list which is held during lookup and
when installing a new regions list.
When the physical memory map changes the MemoryListener callbacks are
invoked. They build up a new list of memory regions which is finally
installed when the list has been completed.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/Makefile.objs | 2 +-
hw/dataplane/Makefile.objs | 3 +
hw/dataplane/hostmem.c | 176 +++++++++++++++++++++++++++++++++++++++++++++
hw/dataplane/hostmem.h | 57 +++++++++++++++
4 files changed, 237 insertions(+), 1 deletion(-)
create mode 100644 hw/dataplane/Makefile.objs
create mode 100644 hw/dataplane/hostmem.c
create mode 100644 hw/dataplane/hostmem.h
diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index d581d8d..cec84bc 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -1,4 +1,4 @@
-common-obj-y = usb/ ide/
+common-obj-y = usb/ ide/ dataplane/
common-obj-y += loader.o
common-obj-$(CONFIG_VIRTIO) += virtio-console.o
common-obj-$(CONFIG_VIRTIO) += virtio-rng.o
diff --git a/hw/dataplane/Makefile.objs b/hw/dataplane/Makefile.objs
new file mode 100644
index 0000000..8c8dea1
--- /dev/null
+++ b/hw/dataplane/Makefile.objs
@@ -0,0 +1,3 @@
+ifeq ($(CONFIG_VIRTIO), y)
+common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += hostmem.o
+endif
diff --git a/hw/dataplane/hostmem.c b/hw/dataplane/hostmem.c
new file mode 100644
index 0000000..bde821f
--- /dev/null
+++ b/hw/dataplane/hostmem.c
@@ -0,0 +1,176 @@
+/*
+ * Thread-safe guest to host memory mapping
+ *
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "exec-memory.h"
+#include "hostmem.h"
+
+static int hostmem_lookup_cmp(const void *phys_, const void *region_)
+{
+ hwaddr phys = *(const hwaddr *)phys_;
+ const HostMemRegion *region = region_;
+
+ if (phys < region->guest_addr) {
+ return -1;
+ } else if (phys >= region->guest_addr + region->size) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+/**
+ * Map guest physical address to host pointer
+ */
+void *hostmem_lookup(HostMem *hostmem, hwaddr phys, hwaddr len, bool is_write)
+{
+ HostMemRegion *region;
+ void *host_addr = NULL;
+ hwaddr offset_within_region;
+
+ qemu_mutex_lock(&hostmem->current_regions_lock);
+ region = bsearch(&phys, hostmem->current_regions,
+ hostmem->num_current_regions,
+ sizeof(hostmem->current_regions[0]),
+ hostmem_lookup_cmp);
+ if (!region) {
+ goto out;
+ }
+ if (is_write && region->readonly) {
+ goto out;
+ }
+ offset_within_region = phys - region->guest_addr;
+ if (len <= region->size - offset_within_region) {
+ host_addr = region->host_addr + offset_within_region;
+ }
+out:
+ qemu_mutex_unlock(&hostmem->current_regions_lock);
+
+ return host_addr;
+}
+
+/**
+ * Install new regions list
+ */
+static void hostmem_listener_commit(MemoryListener *listener)
+{
+ HostMem *hostmem = container_of(listener, HostMem, listener);
+
+ qemu_mutex_lock(&hostmem->current_regions_lock);
+ g_free(hostmem->current_regions);
+ hostmem->current_regions = hostmem->new_regions;
+ hostmem->num_current_regions = hostmem->num_new_regions;
+ qemu_mutex_unlock(&hostmem->current_regions_lock);
+
+ /* Reset new regions list */
+ hostmem->new_regions = NULL;
+ hostmem->num_new_regions = 0;
+}
+
+/**
+ * Add a MemoryRegionSection to the new regions list
+ */
+static void hostmem_append_new_region(HostMem *hostmem,
+ MemoryRegionSection *section)
+{
+ void *ram_ptr = memory_region_get_ram_ptr(section->mr);
+ size_t num = hostmem->num_new_regions;
+ size_t new_size = (num + 1) * sizeof(hostmem->new_regions[0]);
+
+ hostmem->new_regions = g_realloc(hostmem->new_regions, new_size);
+ hostmem->new_regions[num] = (HostMemRegion){
+ .host_addr = ram_ptr + section->offset_within_region,
+ .guest_addr = section->offset_within_address_space,
+ .size = section->size,
+ .readonly = section->readonly,
+ };
+ hostmem->num_new_regions++;
+}
+
+static void hostmem_listener_append_region(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+ HostMem *hostmem = container_of(listener, HostMem, listener);
+
+ /* Ignore non-RAM regions, we may not be able to map them */
+ if (!memory_region_is_ram(section->mr)) {
+ return;
+ }
+
+ /* Ignore regions with dirty logging, we cannot mark them dirty */
+ if (memory_region_is_logging(section->mr)) {
+ return;
+ }
+
+ hostmem_append_new_region(hostmem, section);
+}
+
+/* We don't implement most MemoryListener callbacks, use these nop stubs */
+static void hostmem_listener_dummy(MemoryListener *listener)
+{
+}
+
+static void hostmem_listener_section_dummy(MemoryListener *listener,
+ MemoryRegionSection *section)
+{
+}
+
+static void hostmem_listener_eventfd_dummy(MemoryListener *listener,
+ MemoryRegionSection *section,
+ bool match_data, uint64_t data,
+ EventNotifier *e)
+{
+}
+
+static void hostmem_listener_coalesced_mmio_dummy(MemoryListener *listener,
+ MemoryRegionSection *section,
+ hwaddr addr, hwaddr len)
+{
+}
+
+void hostmem_init(HostMem *hostmem)
+{
+ memset(hostmem, 0, sizeof(*hostmem));
+
+ qemu_mutex_init(&hostmem->current_regions_lock);
+
+ hostmem->listener = (MemoryListener){
+ .begin = hostmem_listener_dummy,
+ .commit = hostmem_listener_commit,
+ .region_add = hostmem_listener_append_region,
+ .region_del = hostmem_listener_section_dummy,
+ .region_nop = hostmem_listener_append_region,
+ .log_start = hostmem_listener_section_dummy,
+ .log_stop = hostmem_listener_section_dummy,
+ .log_sync = hostmem_listener_section_dummy,
+ .log_global_start = hostmem_listener_dummy,
+ .log_global_stop = hostmem_listener_dummy,
+ .eventfd_add = hostmem_listener_eventfd_dummy,
+ .eventfd_del = hostmem_listener_eventfd_dummy,
+ .coalesced_mmio_add = hostmem_listener_coalesced_mmio_dummy,
+ .coalesced_mmio_del = hostmem_listener_coalesced_mmio_dummy,
+ .priority = 10,
+ };
+
+ memory_listener_register(&hostmem->listener, &address_space_memory);
+ if (hostmem->num_new_regions > 0) {
+ hostmem_listener_commit(&hostmem->listener);
+ }
+}
+
+void hostmem_finalize(HostMem *hostmem)
+{
+ memory_listener_unregister(&hostmem->listener);
+ g_free(hostmem->new_regions);
+ g_free(hostmem->current_regions);
+ qemu_mutex_destroy(&hostmem->current_regions_lock);
+}
diff --git a/hw/dataplane/hostmem.h b/hw/dataplane/hostmem.h
new file mode 100644
index 0000000..61d9dc1
--- /dev/null
+++ b/hw/dataplane/hostmem.h
@@ -0,0 +1,57 @@
+/*
+ * Thread-safe guest to host memory mapping
+ *
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef HOSTMEM_H
+#define HOSTMEM_H
+
+#include "memory.h"
+#include "qemu-thread.h"
+
+typedef struct {
+ void *host_addr;
+ hwaddr guest_addr;
+ uint64_t size;
+ bool readonly;
+} HostMemRegion;
+
+typedef struct {
+ /* The listener is invoked when regions change and a new list of regions is
+ * built up completely before they are installed.
+ */
+ MemoryListener listener;
+ HostMemRegion *new_regions;
+ size_t num_new_regions;
+
+ /* Current regions are accessed from multiple threads either to lookup
+ * addresses or to install a new list of regions. The lock protects the
+ * pointer and the regions.
+ */
+ QemuMutex current_regions_lock;
+ HostMemRegion *current_regions;
+ size_t num_current_regions;
+} HostMem;
+
+void hostmem_init(HostMem *hostmem);
+void hostmem_finalize(HostMem *hostmem);
+
+/**
+ * Map a guest physical address to a pointer
+ *
+ * Note that there is map/unmap mechanism here. The caller must ensure that
+ * mapped memory is no longer used across events like hot memory unplug. This
+ * can be done with other mechanisms like bdrv_drain_all() that quiesce
+ * in-flight I/O.
+ */
+void *hostmem_lookup(HostMem *hostmem, hwaddr phys, hwaddr len, bool is_write);
+
+#endif /* HOSTMEM_H */
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 04/18] dataplane: add virtqueue vring code
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
` (2 preceding siblings ...)
2012-12-19 15:38 ` [Qemu-devel] [PATCH 03/18] dataplane: add host memory mapping code Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 05/18] dataplane: add event loop Stefan Hajnoczi
` (13 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
The virtio-blk-data-plane cannot access memory using the usual QEMU
functions since it executes outside the global mutex and the memory APIs
are this time are not thread-safe.
This patch introduces a virtqueue module based on the kernel's vhost
vring code. The trick is that we map guest memory ahead of time and
access it cheaply outside the global mutex.
Once the hardware emulation code can execute outside the global mutex it
will be possible to drop this code.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/dataplane/Makefile.objs | 2 +-
hw/dataplane/vring.c | 362 +++++++++++++++++++++++++++++++++++++++++++++
hw/dataplane/vring.h | 63 ++++++++
trace-events | 3 +
4 files changed, 429 insertions(+), 1 deletion(-)
create mode 100644 hw/dataplane/vring.c
create mode 100644 hw/dataplane/vring.h
diff --git a/hw/dataplane/Makefile.objs b/hw/dataplane/Makefile.objs
index 8c8dea1..34e6d57 100644
--- a/hw/dataplane/Makefile.objs
+++ b/hw/dataplane/Makefile.objs
@@ -1,3 +1,3 @@
ifeq ($(CONFIG_VIRTIO), y)
-common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += hostmem.o
+common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += hostmem.o vring.o
endif
diff --git a/hw/dataplane/vring.c b/hw/dataplane/vring.c
new file mode 100644
index 0000000..d5d4ef4
--- /dev/null
+++ b/hw/dataplane/vring.c
@@ -0,0 +1,362 @@
+/* Copyright 2012 Red Hat, Inc.
+ * Copyright IBM, Corp. 2012
+ *
+ * Based on Linux 2.6.39 vhost code:
+ * Copyright (C) 2009 Red Hat, Inc.
+ * Copyright (C) 2006 Rusty Russell IBM Corporation
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * Inspiration, some code, and most witty comments come from
+ * Documentation/virtual/lguest/lguest.c, by Rusty Russell
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#include "trace.h"
+#include "hw/dataplane/vring.h"
+
+/* Map the guest's vring to host memory */
+bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
+{
+ hwaddr vring_addr = virtio_queue_get_ring_addr(vdev, n);
+ hwaddr vring_size = virtio_queue_get_ring_size(vdev, n);
+ void *vring_ptr;
+
+ vring->broken = false;
+
+ hostmem_init(&vring->hostmem);
+ vring_ptr = hostmem_lookup(&vring->hostmem, vring_addr, vring_size, true);
+ if (!vring_ptr) {
+ error_report("Failed to map vring "
+ "addr %#" HWADDR_PRIx " size %" HWADDR_PRIu,
+ vring_addr, vring_size);
+ vring->broken = true;
+ return false;
+ }
+
+ vring_init(&vring->vr, virtio_queue_get_num(vdev, n), vring_ptr, 4096);
+
+ vring->last_avail_idx = 0;
+ vring->last_used_idx = 0;
+ vring->signalled_used = 0;
+ vring->signalled_used_valid = false;
+
+ trace_vring_setup(virtio_queue_get_ring_addr(vdev, n),
+ vring->vr.desc, vring->vr.avail, vring->vr.used);
+ return true;
+}
+
+void vring_teardown(Vring *vring)
+{
+ hostmem_finalize(&vring->hostmem);
+}
+
+/* Disable guest->host notifies */
+void vring_disable_notification(VirtIODevice *vdev, Vring *vring)
+{
+ if (!(vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX))) {
+ vring->vr.used->flags |= VRING_USED_F_NO_NOTIFY;
+ }
+}
+
+/* Enable guest->host notifies
+ *
+ * Return true if the vring is empty, false if there are more requests.
+ */
+bool vring_enable_notification(VirtIODevice *vdev, Vring *vring)
+{
+ if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) {
+ vring_avail_event(&vring->vr) = vring->vr.avail->idx;
+ } else {
+ vring->vr.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+ }
+ smp_mb(); /* ensure update is seen before reading avail_idx */
+ return !vring_more_avail(vring);
+}
+
+/* This is stolen from linux/drivers/vhost/vhost.c:vhost_notify() */
+bool vring_should_notify(VirtIODevice *vdev, Vring *vring)
+{
+ uint16_t old, new;
+ bool v;
+ /* Flush out used index updates. This is paired
+ * with the barrier that the Guest executes when enabling
+ * interrupts. */
+ smp_mb();
+
+ if ((vdev->guest_features & VIRTIO_F_NOTIFY_ON_EMPTY) &&
+ unlikely(vring->vr.avail->idx == vring->last_avail_idx)) {
+ return true;
+ }
+
+ if (!(vdev->guest_features & VIRTIO_RING_F_EVENT_IDX)) {
+ return !(vring->vr.avail->flags & VRING_AVAIL_F_NO_INTERRUPT);
+ }
+ old = vring->signalled_used;
+ v = vring->signalled_used_valid;
+ new = vring->signalled_used = vring->last_used_idx;
+ vring->signalled_used_valid = true;
+
+ if (unlikely(!v)) {
+ return true;
+ }
+
+ return vring_need_event(vring_used_event(&vring->vr), new, old);
+}
+
+/* This is stolen from linux/drivers/vhost/vhost.c. */
+static int get_indirect(Vring *vring,
+ struct iovec iov[], struct iovec *iov_end,
+ unsigned int *out_num, unsigned int *in_num,
+ struct vring_desc *indirect)
+{
+ struct vring_desc desc;
+ unsigned int i = 0, count, found = 0;
+
+ /* Sanity check */
+ if (unlikely(indirect->len % sizeof(desc))) {
+ error_report("Invalid length in indirect descriptor: "
+ "len %#x not multiple of %#zx",
+ indirect->len, sizeof(desc));
+ vring->broken = true;
+ return -EFAULT;
+ }
+
+ count = indirect->len / sizeof(desc);
+ /* Buffers are chained via a 16 bit next field, so
+ * we can have at most 2^16 of these. */
+ if (unlikely(count > USHRT_MAX + 1)) {
+ error_report("Indirect buffer length too big: %d", indirect->len);
+ vring->broken = true;
+ return -EFAULT;
+ }
+
+ do {
+ struct vring_desc *desc_ptr;
+
+ /* Translate indirect descriptor */
+ desc_ptr = hostmem_lookup(&vring->hostmem,
+ indirect->addr + found * sizeof(desc),
+ sizeof(desc), false);
+ if (!desc_ptr) {
+ error_report("Failed to map indirect descriptor "
+ "addr %#" PRIx64 " len %zu",
+ (uint64_t)indirect->addr + found * sizeof(desc),
+ sizeof(desc));
+ vring->broken = true;
+ return -EFAULT;
+ }
+ desc = *desc_ptr;
+
+ /* Ensure descriptor has been loaded before accessing fields */
+ barrier(); /* read_barrier_depends(); */
+
+ if (unlikely(++found > count)) {
+ error_report("Loop detected: last one at %u "
+ "indirect size %u", i, count);
+ vring->broken = true;
+ return -EFAULT;
+ }
+
+ if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
+ error_report("Nested indirect descriptor");
+ vring->broken = true;
+ return -EFAULT;
+ }
+
+ /* Stop for now if there are not enough iovecs available. */
+ if (iov >= iov_end) {
+ return -ENOBUFS;
+ }
+
+ iov->iov_base = hostmem_lookup(&vring->hostmem, desc.addr, desc.len,
+ desc.flags & VRING_DESC_F_WRITE);
+ if (!iov->iov_base) {
+ error_report("Failed to map indirect descriptor"
+ "addr %#" PRIx64 " len %u",
+ (uint64_t)desc.addr, desc.len);
+ vring->broken = true;
+ return -EFAULT;
+ }
+ iov->iov_len = desc.len;
+ iov++;
+
+ /* If this is an input descriptor, increment that count. */
+ if (desc.flags & VRING_DESC_F_WRITE) {
+ *in_num += 1;
+ } else {
+ /* If it's an output descriptor, they're all supposed
+ * to come before any input descriptors. */
+ if (unlikely(*in_num)) {
+ error_report("Indirect descriptor "
+ "has out after in: idx %u", i);
+ vring->broken = true;
+ return -EFAULT;
+ }
+ *out_num += 1;
+ }
+ i = desc.next;
+ } while (desc.flags & VRING_DESC_F_NEXT);
+ return 0;
+}
+
+/* This looks in the virtqueue and for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function returns the descriptor number found, or vq->num (which is
+ * never a valid descriptor number) if none was found. A negative code is
+ * returned on error.
+ *
+ * Stolen from linux/drivers/vhost/vhost.c.
+ */
+int vring_pop(VirtIODevice *vdev, Vring *vring,
+ struct iovec iov[], struct iovec *iov_end,
+ unsigned int *out_num, unsigned int *in_num)
+{
+ struct vring_desc desc;
+ unsigned int i, head, found = 0, num = vring->vr.num;
+ uint16_t avail_idx, last_avail_idx;
+
+ /* If there was a fatal error then refuse operation */
+ if (vring->broken) {
+ return -EFAULT;
+ }
+
+ /* Check it isn't doing very strange things with descriptor numbers. */
+ last_avail_idx = vring->last_avail_idx;
+ avail_idx = vring->vr.avail->idx;
+ barrier(); /* load indices now and not again later */
+
+ if (unlikely((uint16_t)(avail_idx - last_avail_idx) > num)) {
+ error_report("Guest moved used index from %u to %u",
+ last_avail_idx, avail_idx);
+ vring->broken = true;
+ return -EFAULT;
+ }
+
+ /* If there's nothing new since last we looked. */
+ if (avail_idx == last_avail_idx) {
+ return -EAGAIN;
+ }
+
+ /* Only get avail ring entries after they have been exposed by guest. */
+ smp_rmb();
+
+ /* Grab the next descriptor number they're advertising, and increment
+ * the index we've seen. */
+ head = vring->vr.avail->ring[last_avail_idx % num];
+
+ /* If their number is silly, that's an error. */
+ if (unlikely(head >= num)) {
+ error_report("Guest says index %u > %u is available", head, num);
+ vring->broken = true;
+ return -EFAULT;
+ }
+
+ if (vdev->guest_features & (1 << VIRTIO_RING_F_EVENT_IDX)) {
+ vring_avail_event(&vring->vr) = vring->vr.avail->idx;
+ }
+
+ /* When we start there are none of either input nor output. */
+ *out_num = *in_num = 0;
+
+ i = head;
+ do {
+ if (unlikely(i >= num)) {
+ error_report("Desc index is %u > %u, head = %u", i, num, head);
+ vring->broken = true;
+ return -EFAULT;
+ }
+ if (unlikely(++found > num)) {
+ error_report("Loop detected: last one at %u vq size %u head %u",
+ i, num, head);
+ vring->broken = true;
+ return -EFAULT;
+ }
+ desc = vring->vr.desc[i];
+
+ /* Ensure descriptor is loaded before accessing fields */
+ barrier();
+
+ if (desc.flags & VRING_DESC_F_INDIRECT) {
+ int ret = get_indirect(vring, iov, iov_end, out_num, in_num, &desc);
+ if (ret < 0) {
+ return ret;
+ }
+ continue;
+ }
+
+ /* If there are not enough iovecs left, stop for now. The caller
+ * should check if there are more descs available once they have dealt
+ * with the current set.
+ */
+ if (iov >= iov_end) {
+ return -ENOBUFS;
+ }
+
+ /* TODO handle non-contiguous memory across region boundaries */
+ iov->iov_base = hostmem_lookup(&vring->hostmem, desc.addr, desc.len,
+ desc.flags & VRING_DESC_F_WRITE);
+ if (!iov->iov_base) {
+ error_report("Failed to map vring desc addr %#" PRIx64 " len %u",
+ (uint64_t)desc.addr, desc.len);
+ vring->broken = true;
+ return -EFAULT;
+ }
+ iov->iov_len = desc.len;
+ iov++;
+
+ if (desc.flags & VRING_DESC_F_WRITE) {
+ /* If this is an input descriptor,
+ * increment that count. */
+ *in_num += 1;
+ } else {
+ /* If it's an output descriptor, they're all supposed
+ * to come before any input descriptors. */
+ if (unlikely(*in_num)) {
+ error_report("Descriptor has out after in: idx %d", i);
+ vring->broken = true;
+ return -EFAULT;
+ }
+ *out_num += 1;
+ }
+ i = desc.next;
+ } while (desc.flags & VRING_DESC_F_NEXT);
+
+ /* On success, increment avail index. */
+ vring->last_avail_idx++;
+ return head;
+}
+
+/* After we've used one of their buffers, we tell them about it.
+ *
+ * Stolen from linux/drivers/vhost/vhost.c.
+ */
+void vring_push(Vring *vring, unsigned int head, int len)
+{
+ struct vring_used_elem *used;
+ uint16_t new;
+
+ /* Don't touch vring if a fatal error occurred */
+ if (vring->broken) {
+ return;
+ }
+
+ /* The virtqueue contains a ring of used buffers. Get a pointer to the
+ * next entry in that used ring. */
+ used = &vring->vr.used->ring[vring->last_used_idx % vring->vr.num];
+ used->id = head;
+ used->len = len;
+
+ /* Make sure buffer is written before we update index. */
+ smp_wmb();
+
+ new = vring->vr.used->idx = ++vring->last_used_idx;
+ if (unlikely((int16_t)(new - vring->signalled_used) < (uint16_t)1)) {
+ vring->signalled_used_valid = false;
+ }
+}
diff --git a/hw/dataplane/vring.h b/hw/dataplane/vring.h
new file mode 100644
index 0000000..a495708
--- /dev/null
+++ b/hw/dataplane/vring.h
@@ -0,0 +1,63 @@
+/* Copyright 2012 Red Hat, Inc. and/or its affiliates
+ * Copyright IBM, Corp. 2012
+ *
+ * Based on Linux 2.6.39 vhost code:
+ * Copyright (C) 2009 Red Hat, Inc.
+ * Copyright (C) 2006 Rusty Russell IBM Corporation
+ *
+ * Author: Michael S. Tsirkin <mst@redhat.com>
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * Inspiration, some code, and most witty comments come from
+ * Documentation/virtual/lguest/lguest.c, by Rusty Russell
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ */
+
+#ifndef VRING_H
+#define VRING_H
+
+#include <linux/virtio_ring.h>
+#include "qemu-common.h"
+#include "qemu-barrier.h"
+#include "hw/dataplane/hostmem.h"
+#include "hw/virtio.h"
+
+typedef struct {
+ HostMem hostmem; /* guest memory mapper */
+ struct vring vr; /* virtqueue vring mapped to host memory */
+ uint16_t last_avail_idx; /* last processed avail ring index */
+ uint16_t last_used_idx; /* last processed used ring index */
+ uint16_t signalled_used; /* EVENT_IDX state */
+ bool signalled_used_valid;
+ bool broken; /* was there a fatal error? */
+} Vring;
+
+static inline unsigned int vring_get_num(Vring *vring)
+{
+ return vring->vr.num;
+}
+
+/* Are there more descriptors available? */
+static inline bool vring_more_avail(Vring *vring)
+{
+ return vring->vr.avail->idx != vring->last_avail_idx;
+}
+
+/* Fail future vring_pop() and vring_push() calls until reset */
+static inline void vring_set_broken(Vring *vring)
+{
+ vring->broken = true;
+}
+
+bool vring_setup(Vring *vring, VirtIODevice *vdev, int n);
+void vring_teardown(Vring *vring);
+void vring_disable_notification(VirtIODevice *vdev, Vring *vring);
+bool vring_enable_notification(VirtIODevice *vdev, Vring *vring);
+bool vring_should_notify(VirtIODevice *vdev, Vring *vring);
+int vring_pop(VirtIODevice *vdev, Vring *vring,
+ struct iovec iov[], struct iovec *iov_end,
+ unsigned int *out_num, unsigned int *in_num);
+void vring_push(Vring *vring, unsigned int head, int len);
+
+#endif /* VRING_H */
diff --git a/trace-events b/trace-events
index bb7621e..167d776 100644
--- a/trace-events
+++ b/trace-events
@@ -98,6 +98,9 @@ virtio_blk_rw_complete(void *req, int ret) "req %p ret %d"
virtio_blk_handle_write(void *req, uint64_t sector, size_t nsectors) "req %p sector %"PRIu64" nsectors %zu"
virtio_blk_handle_read(void *req, uint64_t sector, size_t nsectors) "req %p sector %"PRIu64" nsectors %zu"
+# hw/dataplane/vring.c
+vring_setup(uint64_t physical, void *desc, void *avail, void *used) "vring physical %#"PRIx64" desc %p avail %p used %p"
+
# thread-pool.c
thread_pool_submit(void *req, void *opaque) "req %p opaque %p"
thread_pool_complete(void *req, void *opaque, int ret) "req %p opaque %p ret %d"
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 05/18] dataplane: add event loop
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
` (3 preceding siblings ...)
2012-12-19 15:38 ` [Qemu-devel] [PATCH 04/18] dataplane: add virtqueue vring code Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 06/18] dataplane: add Linux AIO request queue Stefan Hajnoczi
` (12 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
Outside the safety of the global mutex we need to poll on file
descriptors. I found epoll(2) is a convenient way to do that, although
other options could replace this module in the future (such as an
AioContext-based loop or glib's GMainLoop).
One important feature of this small event loop implementation is that
the loop can be terminated in a thread-safe way. This allows QEMU to
stop the data plane thread cleanly.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/dataplane/Makefile.objs | 2 +-
hw/dataplane/event-poll.c | 100 +++++++++++++++++++++++++++++++++++++++++++++
hw/dataplane/event-poll.h | 40 ++++++++++++++++++
3 files changed, 141 insertions(+), 1 deletion(-)
create mode 100644 hw/dataplane/event-poll.c
create mode 100644 hw/dataplane/event-poll.h
diff --git a/hw/dataplane/Makefile.objs b/hw/dataplane/Makefile.objs
index 34e6d57..e26bd7d 100644
--- a/hw/dataplane/Makefile.objs
+++ b/hw/dataplane/Makefile.objs
@@ -1,3 +1,3 @@
ifeq ($(CONFIG_VIRTIO), y)
-common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += hostmem.o vring.o
+common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += hostmem.o vring.o event-poll.o
endif
diff --git a/hw/dataplane/event-poll.c b/hw/dataplane/event-poll.c
new file mode 100644
index 0000000..2b55c6e
--- /dev/null
+++ b/hw/dataplane/event-poll.c
@@ -0,0 +1,100 @@
+/*
+ * Event loop with file descriptor polling
+ *
+ * Copyright 2012 IBM, Corp.
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <sys/epoll.h>
+#include "hw/dataplane/event-poll.h"
+
+/* Add an event notifier and its callback for polling */
+void event_poll_add(EventPoll *poll, EventHandler *handler,
+ EventNotifier *notifier, EventCallback *callback)
+{
+ struct epoll_event event = {
+ .events = EPOLLIN,
+ .data.ptr = handler,
+ };
+ handler->notifier = notifier;
+ handler->callback = callback;
+ if (epoll_ctl(poll->epoll_fd, EPOLL_CTL_ADD,
+ event_notifier_get_fd(notifier), &event) != 0) {
+ fprintf(stderr, "failed to add event handler to epoll: %m\n");
+ exit(1);
+ }
+}
+
+/* Event callback for stopping event_poll() */
+static void handle_stop(EventHandler *handler)
+{
+ /* Do nothing */
+}
+
+void event_poll_init(EventPoll *poll)
+{
+ /* Create epoll file descriptor */
+ poll->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+ if (poll->epoll_fd < 0) {
+ fprintf(stderr, "epoll_create1 failed: %m\n");
+ exit(1);
+ }
+
+ /* Set up stop notifier */
+ if (event_notifier_init(&poll->stop_notifier, 0) < 0) {
+ fprintf(stderr, "failed to init stop notifier\n");
+ exit(1);
+ }
+ event_poll_add(poll, &poll->stop_handler,
+ &poll->stop_notifier, handle_stop);
+}
+
+void event_poll_cleanup(EventPoll *poll)
+{
+ event_notifier_cleanup(&poll->stop_notifier);
+ close(poll->epoll_fd);
+ poll->epoll_fd = -1;
+}
+
+/* Block until the next event and invoke its callback */
+void event_poll(EventPoll *poll)
+{
+ EventHandler *handler;
+ struct epoll_event event;
+ int nevents;
+
+ /* Wait for the next event. Only do one event per call to keep the
+ * function simple, this could be changed later. */
+ do {
+ nevents = epoll_wait(poll->epoll_fd, &event, 1, -1);
+ } while (nevents < 0 && errno == EINTR);
+ if (unlikely(nevents != 1)) {
+ fprintf(stderr, "epoll_wait failed: %m\n");
+ exit(1); /* should never happen */
+ }
+
+ /* Find out which event handler has become active */
+ handler = event.data.ptr;
+
+ /* Clear the eventfd */
+ event_notifier_test_and_clear(handler->notifier);
+
+ /* Handle the event */
+ handler->callback(handler);
+}
+
+/* Stop event_poll()
+ *
+ * This function can be used from another thread.
+ */
+void event_poll_notify(EventPoll *poll)
+{
+ event_notifier_set(&poll->stop_notifier);
+}
diff --git a/hw/dataplane/event-poll.h b/hw/dataplane/event-poll.h
new file mode 100644
index 0000000..ed3456c
--- /dev/null
+++ b/hw/dataplane/event-poll.h
@@ -0,0 +1,40 @@
+/*
+ * Event loop with file descriptor polling
+ *
+ * Copyright 2012 IBM, Corp.
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef EVENT_POLL_H
+#define EVENT_POLL_H
+
+#include "event_notifier.h"
+
+typedef struct EventHandler EventHandler;
+typedef void EventCallback(EventHandler *handler);
+struct EventHandler {
+ EventNotifier *notifier; /* eventfd */
+ EventCallback *callback; /* callback function */
+};
+
+typedef struct {
+ int epoll_fd; /* epoll(2) file descriptor */
+ EventNotifier stop_notifier; /* stop poll notifier */
+ EventHandler stop_handler; /* stop poll handler */
+} EventPoll;
+
+void event_poll_add(EventPoll *poll, EventHandler *handler,
+ EventNotifier *notifier, EventCallback *callback);
+void event_poll_init(EventPoll *poll);
+void event_poll_cleanup(EventPoll *poll);
+void event_poll(EventPoll *poll);
+void event_poll_notify(EventPoll *poll);
+
+#endif /* EVENT_POLL_H */
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 06/18] dataplane: add Linux AIO request queue
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
` (4 preceding siblings ...)
2012-12-19 15:38 ` [Qemu-devel] [PATCH 05/18] dataplane: add event loop Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 07/18] iov: add iov_discard_front/back() to remove data Stefan Hajnoczi
` (11 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
The IOQueue has a pool of iocb structs and a function to add new
read/write requests. Multiple requests can be added before calling the
submit function to actually tell the host kernel to begin I/O. This
allows callers to batch requests and submit them in one go.
The actual I/O is performed using Linux AIO.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/dataplane/Makefile.objs | 2 +-
hw/dataplane/ioq.c | 117 +++++++++++++++++++++++++++++++++++++++++++++
hw/dataplane/ioq.h | 57 ++++++++++++++++++++++
3 files changed, 175 insertions(+), 1 deletion(-)
create mode 100644 hw/dataplane/ioq.c
create mode 100644 hw/dataplane/ioq.h
diff --git a/hw/dataplane/Makefile.objs b/hw/dataplane/Makefile.objs
index e26bd7d..abd408f 100644
--- a/hw/dataplane/Makefile.objs
+++ b/hw/dataplane/Makefile.objs
@@ -1,3 +1,3 @@
ifeq ($(CONFIG_VIRTIO), y)
-common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += hostmem.o vring.o event-poll.o
+common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += hostmem.o vring.o event-poll.o ioq.o
endif
diff --git a/hw/dataplane/ioq.c b/hw/dataplane/ioq.c
new file mode 100644
index 0000000..0c9f5c4
--- /dev/null
+++ b/hw/dataplane/ioq.c
@@ -0,0 +1,117 @@
+/*
+ * Linux AIO request queue
+ *
+ * Copyright 2012 IBM, Corp.
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "hw/dataplane/ioq.h"
+
+void ioq_init(IOQueue *ioq, int fd, unsigned int max_reqs)
+{
+ int rc;
+
+ ioq->fd = fd;
+ ioq->max_reqs = max_reqs;
+
+ memset(&ioq->io_ctx, 0, sizeof ioq->io_ctx);
+ rc = io_setup(max_reqs, &ioq->io_ctx);
+ if (rc != 0) {
+ fprintf(stderr, "ioq io_setup failed %d\n", rc);
+ exit(1);
+ }
+
+ rc = event_notifier_init(&ioq->io_notifier, 0);
+ if (rc != 0) {
+ fprintf(stderr, "ioq io event notifier creation failed %d\n", rc);
+ exit(1);
+ }
+
+ ioq->freelist = g_malloc0(sizeof ioq->freelist[0] * max_reqs);
+ ioq->freelist_idx = 0;
+
+ ioq->queue = g_malloc0(sizeof ioq->queue[0] * max_reqs);
+ ioq->queue_idx = 0;
+}
+
+void ioq_cleanup(IOQueue *ioq)
+{
+ g_free(ioq->freelist);
+ g_free(ioq->queue);
+
+ event_notifier_cleanup(&ioq->io_notifier);
+ io_destroy(ioq->io_ctx);
+}
+
+EventNotifier *ioq_get_notifier(IOQueue *ioq)
+{
+ return &ioq->io_notifier;
+}
+
+struct iocb *ioq_get_iocb(IOQueue *ioq)
+{
+ /* Underflow cannot happen since ioq is sized for max_reqs */
+ assert(ioq->freelist_idx != 0);
+
+ struct iocb *iocb = ioq->freelist[--ioq->freelist_idx];
+ ioq->queue[ioq->queue_idx++] = iocb;
+ return iocb;
+}
+
+void ioq_put_iocb(IOQueue *ioq, struct iocb *iocb)
+{
+ /* Overflow cannot happen since ioq is sized for max_reqs */
+ assert(ioq->freelist_idx != ioq->max_reqs);
+
+ ioq->freelist[ioq->freelist_idx++] = iocb;
+}
+
+struct iocb *ioq_rdwr(IOQueue *ioq, bool read, struct iovec *iov,
+ unsigned int count, long long offset)
+{
+ struct iocb *iocb = ioq_get_iocb(ioq);
+
+ if (read) {
+ io_prep_preadv(iocb, ioq->fd, iov, count, offset);
+ } else {
+ io_prep_pwritev(iocb, ioq->fd, iov, count, offset);
+ }
+ io_set_eventfd(iocb, event_notifier_get_fd(&ioq->io_notifier));
+ return iocb;
+}
+
+int ioq_submit(IOQueue *ioq)
+{
+ int rc = io_submit(ioq->io_ctx, ioq->queue_idx, ioq->queue);
+ ioq->queue_idx = 0; /* reset */
+ return rc;
+}
+
+int ioq_run_completion(IOQueue *ioq, IOQueueCompletion *completion,
+ void *opaque)
+{
+ struct io_event events[ioq->max_reqs];
+ int nevents, i;
+
+ do {
+ nevents = io_getevents(ioq->io_ctx, 0, ioq->max_reqs, events, NULL);
+ } while (nevents < 0 && errno == EINTR);
+ if (nevents < 0) {
+ return nevents;
+ }
+
+ for (i = 0; i < nevents; i++) {
+ ssize_t ret = ((uint64_t)events[i].res2 << 32) | events[i].res;
+
+ completion(events[i].obj, ret, opaque);
+ ioq_put_iocb(ioq, events[i].obj);
+ }
+ return nevents;
+}
diff --git a/hw/dataplane/ioq.h b/hw/dataplane/ioq.h
new file mode 100644
index 0000000..890db22
--- /dev/null
+++ b/hw/dataplane/ioq.h
@@ -0,0 +1,57 @@
+/*
+ * Linux AIO request queue
+ *
+ * Copyright 2012 IBM, Corp.
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef IOQ_H
+#define IOQ_H
+
+#include <libaio.h>
+#include "event_notifier.h"
+
+typedef struct {
+ int fd; /* file descriptor */
+ unsigned int max_reqs; /* max length of freelist and queue */
+
+ io_context_t io_ctx; /* Linux AIO context */
+ EventNotifier io_notifier; /* Linux AIO eventfd */
+
+ /* Requests can complete in any order so a free list is necessary to manage
+ * available iocbs.
+ */
+ struct iocb **freelist; /* free iocbs */
+ unsigned int freelist_idx;
+
+ /* Multiple requests are queued up before submitting them all in one go */
+ struct iocb **queue; /* queued iocbs */
+ unsigned int queue_idx;
+} IOQueue;
+
+void ioq_init(IOQueue *ioq, int fd, unsigned int max_reqs);
+void ioq_cleanup(IOQueue *ioq);
+EventNotifier *ioq_get_notifier(IOQueue *ioq);
+struct iocb *ioq_get_iocb(IOQueue *ioq);
+void ioq_put_iocb(IOQueue *ioq, struct iocb *iocb);
+struct iocb *ioq_rdwr(IOQueue *ioq, bool read, struct iovec *iov,
+ unsigned int count, long long offset);
+int ioq_submit(IOQueue *ioq);
+
+static inline unsigned int ioq_num_queued(IOQueue *ioq)
+{
+ return ioq->queue_idx;
+}
+
+typedef void IOQueueCompletion(struct iocb *iocb, ssize_t ret, void *opaque);
+int ioq_run_completion(IOQueue *ioq, IOQueueCompletion *completion,
+ void *opaque);
+
+#endif /* IOQ_H */
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 07/18] iov: add iov_discard_front/back() to remove data
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
` (5 preceding siblings ...)
2012-12-19 15:38 ` [Qemu-devel] [PATCH 06/18] dataplane: add Linux AIO request queue Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 08/18] test-iov: add iov_discard_front/back() testcases Stefan Hajnoczi
` (10 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
The iov_discard_front/back() functions remove data from the front or
back of the vector. This is useful when peeling off header/footer
structs.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
iov.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
iov.h | 13 +++++++++++++
2 files changed, 64 insertions(+)
diff --git a/iov.c b/iov.c
index a81eedc..d3b19e3 100644
--- a/iov.c
+++ b/iov.c
@@ -354,3 +354,54 @@ size_t qemu_iovec_memset(QEMUIOVector *qiov, size_t offset,
{
return iov_memset(qiov->iov, qiov->niov, offset, fillc, bytes);
}
+
+size_t iov_discard_front(struct iovec **iov, unsigned int *iov_cnt,
+ size_t bytes)
+{
+ size_t total = 0;
+ struct iovec *cur;
+
+ for (cur = *iov; *iov_cnt > 0; cur++) {
+ if (cur->iov_len > bytes) {
+ cur->iov_base += bytes;
+ cur->iov_len -= bytes;
+ total += bytes;
+ break;
+ }
+
+ bytes -= cur->iov_len;
+ total += cur->iov_len;
+ *iov_cnt -= 1;
+ }
+
+ *iov = cur;
+ return total;
+}
+
+size_t iov_discard_back(struct iovec *iov, unsigned int *iov_cnt,
+ size_t bytes)
+{
+ size_t total = 0;
+ struct iovec *cur;
+
+ if (*iov_cnt == 0) {
+ return 0;
+ }
+
+ cur = iov + (*iov_cnt - 1);
+
+ while (*iov_cnt > 0) {
+ if (cur->iov_len > bytes) {
+ cur->iov_len -= bytes;
+ total += bytes;
+ break;
+ }
+
+ bytes -= cur->iov_len;
+ total += cur->iov_len;
+ cur--;
+ *iov_cnt -= 1;
+ }
+
+ return total;
+}
diff --git a/iov.h b/iov.h
index 34c8ec9..237e34c 100644
--- a/iov.h
+++ b/iov.h
@@ -95,3 +95,16 @@ void iov_hexdump(const struct iovec *iov, const unsigned int iov_cnt,
unsigned iov_copy(struct iovec *dst_iov, unsigned int dst_iov_cnt,
const struct iovec *iov, unsigned int iov_cnt,
size_t offset, size_t bytes);
+
+/*
+ * Remove a given number of bytes from the front or back of a vector.
+ * This may update iov and/or iov_cnt to exclude iovec elements that are
+ * no longer required.
+ *
+ * The number of bytes actually discarded is returned. This number may be
+ * smaller than requested if the vector is too small.
+ */
+size_t iov_discard_front(struct iovec **iov, unsigned int *iov_cnt,
+ size_t bytes);
+size_t iov_discard_back(struct iovec *iov, unsigned int *iov_cnt,
+ size_t bytes);
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 08/18] test-iov: add iov_discard_front/back() testcases
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
` (6 preceding siblings ...)
2012-12-19 15:38 ` [Qemu-devel] [PATCH 07/18] iov: add iov_discard_front/back() to remove data Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 09/18] iov: add qemu_iovec_concat_iov() Stefan Hajnoczi
` (9 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
tests/test-iov.c | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 150 insertions(+)
diff --git a/tests/test-iov.c b/tests/test-iov.c
index cbe7a89..720d95c 100644
--- a/tests/test-iov.c
+++ b/tests/test-iov.c
@@ -250,11 +250,161 @@ static void test_io(void)
#endif
}
+static void test_discard_front(void)
+{
+ struct iovec *iov;
+ struct iovec *iov_tmp;
+ unsigned int iov_cnt;
+ unsigned int iov_cnt_tmp;
+ void *old_base;
+ size_t size;
+ size_t ret;
+
+ /* Discard zero bytes */
+ iov_random(&iov, &iov_cnt);
+ iov_tmp = iov;
+ iov_cnt_tmp = iov_cnt;
+ ret = iov_discard_front(&iov_tmp, &iov_cnt_tmp, 0);
+ g_assert(ret == 0);
+ g_assert(iov_tmp == iov);
+ g_assert(iov_cnt_tmp == iov_cnt);
+ iov_free(iov, iov_cnt);
+
+ /* Discard more bytes than vector size */
+ iov_random(&iov, &iov_cnt);
+ iov_tmp = iov;
+ iov_cnt_tmp = iov_cnt;
+ size = iov_size(iov, iov_cnt);
+ ret = iov_discard_front(&iov_tmp, &iov_cnt_tmp, size + 1);
+ g_assert(ret == size);
+ g_assert(iov_cnt_tmp == 0);
+ iov_free(iov, iov_cnt);
+
+ /* Discard entire vector */
+ iov_random(&iov, &iov_cnt);
+ iov_tmp = iov;
+ iov_cnt_tmp = iov_cnt;
+ size = iov_size(iov, iov_cnt);
+ ret = iov_discard_front(&iov_tmp, &iov_cnt_tmp, size);
+ g_assert(ret == size);
+ g_assert(iov_cnt_tmp == 0);
+ iov_free(iov, iov_cnt);
+
+ /* Discard within first element */
+ iov_random(&iov, &iov_cnt);
+ iov_tmp = iov;
+ iov_cnt_tmp = iov_cnt;
+ old_base = iov->iov_base;
+ size = g_test_rand_int_range(1, iov->iov_len);
+ ret = iov_discard_front(&iov_tmp, &iov_cnt_tmp, size);
+ g_assert(ret == size);
+ g_assert(iov_tmp == iov);
+ g_assert(iov_cnt_tmp == iov_cnt);
+ g_assert(iov_tmp->iov_base == old_base + size);
+ iov_tmp->iov_base = old_base; /* undo before g_free() */
+ iov_free(iov, iov_cnt);
+
+ /* Discard entire first element */
+ iov_random(&iov, &iov_cnt);
+ iov_tmp = iov;
+ iov_cnt_tmp = iov_cnt;
+ ret = iov_discard_front(&iov_tmp, &iov_cnt_tmp, iov->iov_len);
+ g_assert(ret == iov->iov_len);
+ g_assert(iov_tmp == iov + 1);
+ g_assert(iov_cnt_tmp == iov_cnt - 1);
+ iov_free(iov, iov_cnt);
+
+ /* Discard within second element */
+ iov_random(&iov, &iov_cnt);
+ iov_tmp = iov;
+ iov_cnt_tmp = iov_cnt;
+ old_base = iov[1].iov_base;
+ size = iov->iov_len + g_test_rand_int_range(1, iov[1].iov_len);
+ ret = iov_discard_front(&iov_tmp, &iov_cnt_tmp, size);
+ g_assert(ret == size);
+ g_assert(iov_tmp == iov + 1);
+ g_assert(iov_cnt_tmp == iov_cnt - 1);
+ g_assert(iov_tmp->iov_base == old_base + (size - iov->iov_len));
+ iov_tmp->iov_base = old_base; /* undo before g_free() */
+ iov_free(iov, iov_cnt);
+}
+
+static void test_discard_back(void)
+{
+ struct iovec *iov;
+ unsigned int iov_cnt;
+ unsigned int iov_cnt_tmp;
+ void *old_base;
+ size_t size;
+ size_t ret;
+
+ /* Discard zero bytes */
+ iov_random(&iov, &iov_cnt);
+ iov_cnt_tmp = iov_cnt;
+ ret = iov_discard_back(iov, &iov_cnt_tmp, 0);
+ g_assert(ret == 0);
+ g_assert(iov_cnt_tmp == iov_cnt);
+ iov_free(iov, iov_cnt);
+
+ /* Discard more bytes than vector size */
+ iov_random(&iov, &iov_cnt);
+ iov_cnt_tmp = iov_cnt;
+ size = iov_size(iov, iov_cnt);
+ ret = iov_discard_back(iov, &iov_cnt_tmp, size + 1);
+ g_assert(ret == size);
+ g_assert(iov_cnt_tmp == 0);
+ iov_free(iov, iov_cnt);
+
+ /* Discard entire vector */
+ iov_random(&iov, &iov_cnt);
+ iov_cnt_tmp = iov_cnt;
+ size = iov_size(iov, iov_cnt);
+ ret = iov_discard_back(iov, &iov_cnt_tmp, size);
+ g_assert(ret == size);
+ g_assert(iov_cnt_tmp == 0);
+ iov_free(iov, iov_cnt);
+
+ /* Discard within last element */
+ iov_random(&iov, &iov_cnt);
+ iov_cnt_tmp = iov_cnt;
+ old_base = iov[iov_cnt - 1].iov_base;
+ size = g_test_rand_int_range(1, iov[iov_cnt - 1].iov_len);
+ ret = iov_discard_back(iov, &iov_cnt_tmp, size);
+ g_assert(ret == size);
+ g_assert(iov_cnt_tmp == iov_cnt);
+ g_assert(iov[iov_cnt - 1].iov_base == old_base);
+ iov_free(iov, iov_cnt);
+
+ /* Discard entire last element */
+ iov_random(&iov, &iov_cnt);
+ iov_cnt_tmp = iov_cnt;
+ old_base = iov[iov_cnt - 1].iov_base;
+ size = iov[iov_cnt - 1].iov_len;
+ ret = iov_discard_back(iov, &iov_cnt_tmp, size);
+ g_assert(ret == size);
+ g_assert(iov_cnt_tmp == iov_cnt - 1);
+ iov_free(iov, iov_cnt);
+
+ /* Discard within second-to-last element */
+ iov_random(&iov, &iov_cnt);
+ iov_cnt_tmp = iov_cnt;
+ old_base = iov[iov_cnt - 2].iov_base;
+ size = iov[iov_cnt - 1].iov_len +
+ g_test_rand_int_range(1, iov[iov_cnt - 2].iov_len);
+ ret = iov_discard_back(iov, &iov_cnt_tmp, size);
+ g_assert(ret == size);
+ g_assert(iov_cnt_tmp == iov_cnt - 1);
+ g_assert(iov[iov_cnt - 2].iov_base == old_base);
+ iov_free(iov, iov_cnt);
+}
+
int main(int argc, char **argv)
{
g_test_init(&argc, &argv, NULL);
g_test_rand_int();
g_test_add_func("/basic/iov/from-to-buf", test_to_from_buf);
g_test_add_func("/basic/iov/io", test_io);
+ g_test_add_func("/basic/iov/discard-front", test_discard_front);
+ g_test_add_func("/basic/iov/discard-back", test_discard_back);
return g_test_run();
}
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 09/18] iov: add qemu_iovec_concat_iov()
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
` (7 preceding siblings ...)
2012-12-19 15:38 ` [Qemu-devel] [PATCH 08/18] test-iov: add iov_discard_front/back() testcases Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 10/18] virtio-blk: restore VirtIOBlkConf->config_wce flag Stefan Hajnoczi
` (8 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
The qemu_iovec_concat() function copies a subset of a QEMUIOVector. The
new qemu_iovec_concat_iov() function does the same for a iov/cnt pair.
It is easy to define qemu_iovec_concat() in terms of
qemu_iovec_concat_iov(). The existing code is mostly unchanged, except
for the assertion src->size >= soffset, which cannot be efficiently
checked upfront on a iov/cnt pair. Instead we assert upon hitting the
end of src with an unsatisfied soffset.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
iov.c | 39 +++++++++++++++++++++++++++------------
qemu-common.h | 3 +++
2 files changed, 30 insertions(+), 12 deletions(-)
diff --git a/iov.c b/iov.c
index d3b19e3..0feab8e 100644
--- a/iov.c
+++ b/iov.c
@@ -289,34 +289,49 @@ void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len)
}
/*
- * Concatenates (partial) iovecs from src to the end of dst.
+ * Concatenates (partial) iovecs from src_iov to the end of dst.
* It starts copying after skipping `soffset' bytes at the
* beginning of src and adds individual vectors from src to
* dst copies up to `sbytes' bytes total, or up to the end
- * of src if it comes first. This way, it is okay to specify
+ * of src_iov if it comes first. This way, it is okay to specify
* very large value for `sbytes' to indicate "up to the end
* of src".
* Only vector pointers are processed, not the actual data buffers.
*/
-void qemu_iovec_concat(QEMUIOVector *dst,
- QEMUIOVector *src, size_t soffset, size_t sbytes)
+void qemu_iovec_concat_iov(QEMUIOVector *dst,
+ struct iovec *src_iov, unsigned int src_cnt,
+ size_t soffset, size_t sbytes)
{
int i;
size_t done;
- struct iovec *siov = src->iov;
assert(dst->nalloc != -1);
- assert(src->size >= soffset);
- for (i = 0, done = 0; done < sbytes && i < src->niov; i++) {
- if (soffset < siov[i].iov_len) {
- size_t len = MIN(siov[i].iov_len - soffset, sbytes - done);
- qemu_iovec_add(dst, siov[i].iov_base + soffset, len);
+ for (i = 0, done = 0; done < sbytes && i < src_cnt; i++) {
+ if (soffset < src_iov[i].iov_len) {
+ size_t len = MIN(src_iov[i].iov_len - soffset, sbytes - done);
+ qemu_iovec_add(dst, src_iov[i].iov_base + soffset, len);
done += len;
soffset = 0;
} else {
- soffset -= siov[i].iov_len;
+ soffset -= src_iov[i].iov_len;
}
}
- /* return done; */
+ assert(soffset == 0); /* offset beyond end of src */
+}
+
+/*
+ * Concatenates (partial) iovecs from src to the end of dst.
+ * It starts copying after skipping `soffset' bytes at the
+ * beginning of src and adds individual vectors from src to
+ * dst copies up to `sbytes' bytes total, or up to the end
+ * of src if it comes first. This way, it is okay to specify
+ * very large value for `sbytes' to indicate "up to the end
+ * of src".
+ * Only vector pointers are processed, not the actual data buffers.
+ */
+void qemu_iovec_concat(QEMUIOVector *dst,
+ QEMUIOVector *src, size_t soffset, size_t sbytes)
+{
+ qemu_iovec_concat_iov(dst, src->iov, src->niov, soffset, sbytes);
}
void qemu_iovec_destroy(QEMUIOVector *qiov)
diff --git a/qemu-common.h b/qemu-common.h
index e674786..2492189 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -329,6 +329,9 @@ void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov);
void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len);
void qemu_iovec_concat(QEMUIOVector *dst,
QEMUIOVector *src, size_t soffset, size_t sbytes);
+void qemu_iovec_concat_iov(QEMUIOVector *dst,
+ struct iovec *src_iov, unsigned int src_cnt,
+ size_t soffset, size_t sbytes);
void qemu_iovec_destroy(QEMUIOVector *qiov);
void qemu_iovec_reset(QEMUIOVector *qiov);
size_t qemu_iovec_to_buf(QEMUIOVector *qiov, size_t offset,
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 10/18] virtio-blk: restore VirtIOBlkConf->config_wce flag
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
` (8 preceding siblings ...)
2012-12-19 15:38 ` [Qemu-devel] [PATCH 09/18] iov: add qemu_iovec_concat_iov() Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 11/18] dataplane: add virtio-blk data plane code Stefan Hajnoczi
` (7 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
Two slightly different versions of a patch to conditionally set
VIRTIO_BLK_F_CONFIG_WCE through the "config-wce" qdev property have been
applied (ea776abca and eec7f96c2). David Gibson
<david@gibson.dropbear.id.au> noticed that the "config-wce"
property is broken as a result and fixed it recently.
The fix sets the host_features VIRTIO_BLK_F_CONFIG_WCE bit from a qdev
property. Unfortunately, the virtio device then has no chance to test
for the presence of the feature bit during virtio_blk_init().
Therefore, reinstate the VirtIOBlkConf->config_wce flag. Drop the
duplicate qdev property to set the host_features bit. The
VirtIOBlkConf->config_wce flag will be used by virtio-blk-data-plane in
a later patch.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/virtio-blk.c | 3 +++
hw/virtio-blk.h | 4 ++--
hw/virtio-pci.c | 1 +
3 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index e25cc96..fabf387 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -524,6 +524,9 @@ static uint32_t virtio_blk_get_features(VirtIODevice *vdev, uint32_t features)
features |= (1 << VIRTIO_BLK_F_BLK_SIZE);
features |= (1 << VIRTIO_BLK_F_SCSI);
+ if (s->blk->config_wce) {
+ features |= (1 << VIRTIO_BLK_F_CONFIG_WCE);
+ }
if (bdrv_enable_write_cache(s->bs))
features |= (1 << VIRTIO_BLK_F_WCE);
diff --git a/hw/virtio-blk.h b/hw/virtio-blk.h
index 651a000..454f445 100644
--- a/hw/virtio-blk.h
+++ b/hw/virtio-blk.h
@@ -104,10 +104,10 @@ struct VirtIOBlkConf
BlockConf conf;
char *serial;
uint32_t scsi;
+ uint32_t config_wce;
};
#define DEFINE_VIRTIO_BLK_FEATURES(_state, _field) \
- DEFINE_VIRTIO_COMMON_FEATURES(_state, _field), \
- DEFINE_PROP_BIT("config-wce", _state, _field, VIRTIO_BLK_F_CONFIG_WCE, true)
+ DEFINE_VIRTIO_COMMON_FEATURES(_state, _field)
#endif
diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 7684ac9..71f4fb5 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -895,6 +895,7 @@ static Property virtio_blk_properties[] = {
#ifdef __linux__
DEFINE_PROP_BIT("scsi", VirtIOPCIProxy, blk.scsi, 0, true),
#endif
+ DEFINE_PROP_BIT("config-wce", VirtIOPCIProxy, blk.config_wce, 0, true),
DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags, VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2),
DEFINE_VIRTIO_BLK_FEATURES(VirtIOPCIProxy, host_features),
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 11/18] dataplane: add virtio-blk data plane code
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
` (9 preceding siblings ...)
2012-12-19 15:38 ` [Qemu-devel] [PATCH 10/18] virtio-blk: restore VirtIOBlkConf->config_wce flag Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 12/18] virtio-blk: add x-data-plane=on|off performance feature Stefan Hajnoczi
` (6 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
virtio-blk-data-plane is a subset implementation of virtio-blk. It only
handles read, write, and flush requests. It does this using a dedicated
thread that executes an epoll(2)-based event loop and processes I/O
using Linux AIO.
This approach performs very well but can be used for raw image files
only. The number of IOPS achieved has been reported to be several times
higher than the existing virtio-blk implementation.
Eventually it should be possible to unify virtio-blk-data-plane with the
main body of QEMU code once the block layer and hardware emulation is
able to run outside the global mutex.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/dataplane/Makefile.objs | 2 +-
hw/dataplane/virtio-blk.c | 465 +++++++++++++++++++++++++++++++++++++++++++++
hw/dataplane/virtio-blk.h | 29 +++
hw/virtio-blk.h | 1 +
trace-events | 6 +
5 files changed, 502 insertions(+), 1 deletion(-)
create mode 100644 hw/dataplane/virtio-blk.c
create mode 100644 hw/dataplane/virtio-blk.h
diff --git a/hw/dataplane/Makefile.objs b/hw/dataplane/Makefile.objs
index abd408f..682aa9e 100644
--- a/hw/dataplane/Makefile.objs
+++ b/hw/dataplane/Makefile.objs
@@ -1,3 +1,3 @@
ifeq ($(CONFIG_VIRTIO), y)
-common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += hostmem.o vring.o event-poll.o ioq.o
+common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += hostmem.o vring.o event-poll.o ioq.o virtio-blk.o
endif
diff --git a/hw/dataplane/virtio-blk.c b/hw/dataplane/virtio-blk.c
new file mode 100644
index 0000000..a42eff3
--- /dev/null
+++ b/hw/dataplane/virtio-blk.c
@@ -0,0 +1,465 @@
+/*
+ * Dedicated thread for virtio-blk I/O processing
+ *
+ * Copyright 2012 IBM, Corp.
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "iov.h"
+#include "event-poll.h"
+#include "qemu-thread.h"
+#include "vring.h"
+#include "ioq.h"
+#include "migration.h"
+#include "hw/virtio-blk.h"
+#include "hw/dataplane/virtio-blk.h"
+
+enum {
+ SEG_MAX = 126, /* maximum number of I/O segments */
+ VRING_MAX = SEG_MAX + 2, /* maximum number of vring descriptors */
+ REQ_MAX = VRING_MAX, /* maximum number of requests in the vring,
+ * is VRING_MAX / 2 with traditional and
+ * VRING_MAX with indirect descriptors */
+};
+
+typedef struct {
+ struct iocb iocb; /* Linux AIO control block */
+ QEMUIOVector *inhdr; /* iovecs for virtio_blk_inhdr */
+ unsigned int head; /* vring descriptor index */
+} VirtIOBlockRequest;
+
+struct VirtIOBlockDataPlane {
+ bool started;
+ QEMUBH *start_bh;
+ QemuThread thread;
+
+ VirtIOBlkConf *blk;
+ int fd; /* image file descriptor */
+
+ VirtIODevice *vdev;
+ Vring vring; /* virtqueue vring */
+ EventNotifier *guest_notifier; /* irq */
+
+ EventPoll event_poll; /* event poller */
+ EventHandler io_handler; /* Linux AIO completion handler */
+ EventHandler notify_handler; /* virtqueue notify handler */
+
+ IOQueue ioqueue; /* Linux AIO queue (should really be per
+ dataplane thread) */
+ VirtIOBlockRequest requests[REQ_MAX]; /* pool of requests, managed by the
+ queue */
+
+ unsigned int num_reqs;
+
+ Error *migration_blocker;
+};
+
+/* Raise an interrupt to signal guest, if necessary */
+static void notify_guest(VirtIOBlockDataPlane *s)
+{
+ if (!vring_should_notify(s->vdev, &s->vring)) {
+ return;
+ }
+
+ event_notifier_set(s->guest_notifier);
+}
+
+static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
+{
+ VirtIOBlockDataPlane *s = opaque;
+ VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
+ struct virtio_blk_inhdr hdr;
+ int len;
+
+ if (likely(ret >= 0)) {
+ hdr.status = VIRTIO_BLK_S_OK;
+ len = ret;
+ } else {
+ hdr.status = VIRTIO_BLK_S_IOERR;
+ len = 0;
+ }
+
+ trace_virtio_blk_data_plane_complete_request(s, req->head, ret);
+
+ qemu_iovec_from_buf(req->inhdr, 0, &hdr, sizeof(hdr));
+ qemu_iovec_destroy(req->inhdr);
+ g_slice_free(QEMUIOVector, req->inhdr);
+
+ /* According to the virtio specification len should be the number of bytes
+ * written to, but for virtio-blk it seems to be the number of bytes
+ * transferred plus the status bytes.
+ */
+ vring_push(&s->vring, req->head, len + sizeof(hdr));
+
+ s->num_reqs--;
+}
+
+static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head,
+ QEMUIOVector *inhdr, unsigned char status)
+{
+ struct virtio_blk_inhdr hdr = {
+ .status = status,
+ };
+
+ qemu_iovec_from_buf(inhdr, 0, &hdr, sizeof(hdr));
+ qemu_iovec_destroy(inhdr);
+ g_slice_free(QEMUIOVector, inhdr);
+
+ vring_push(&s->vring, head, sizeof(hdr));
+ notify_guest(s);
+}
+
+/* Get disk serial number */
+static void do_get_id_cmd(VirtIOBlockDataPlane *s,
+ struct iovec *iov, unsigned int iov_cnt,
+ unsigned int head, QEMUIOVector *inhdr)
+{
+ char id[VIRTIO_BLK_ID_BYTES];
+
+ /* Serial number not NUL-terminated when shorter than buffer */
+ strncpy(id, s->blk->serial ? s->blk->serial : "", sizeof(id));
+ iov_from_buf(iov, iov_cnt, 0, id, sizeof(id));
+ complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
+}
+
+static int process_request(IOQueue *ioq, struct iovec iov[],
+ unsigned int out_num, unsigned int in_num,
+ unsigned int head)
+{
+ VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue);
+ struct iovec *in_iov = &iov[out_num];
+ struct virtio_blk_outhdr outhdr;
+ QEMUIOVector *inhdr;
+ size_t in_size;
+ struct iocb *iocb;
+
+ /* Copy in outhdr */
+ if (unlikely(iov_to_buf(iov, out_num, 0, &outhdr,
+ sizeof(outhdr)) != sizeof(outhdr))) {
+ error_report("virtio-blk request outhdr too short");
+ return -EFAULT;
+ }
+ iov_discard_front(&iov, &out_num, sizeof(outhdr));
+
+ /* Grab inhdr for later */
+ in_size = iov_size(in_iov, in_num);
+ if (in_size < sizeof(struct virtio_blk_inhdr)) {
+ error_report("virtio_blk request inhdr too short");
+ return -EFAULT;
+ }
+ inhdr = g_slice_new(QEMUIOVector);
+ qemu_iovec_init(inhdr, 1);
+ qemu_iovec_concat_iov(inhdr, in_iov, in_num,
+ in_size - sizeof(struct virtio_blk_inhdr),
+ sizeof(struct virtio_blk_inhdr));
+ iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
+
+ /* TODO Linux sets the barrier bit even when not advertised! */
+ outhdr.type &= ~VIRTIO_BLK_T_BARRIER;
+
+ switch (outhdr.type) {
+ case VIRTIO_BLK_T_IN:
+ iocb = ioq_rdwr(ioq, true, in_iov, in_num, outhdr.sector * 512);
+ break;
+
+ case VIRTIO_BLK_T_OUT:
+ iocb = ioq_rdwr(ioq, false, iov, out_num, outhdr.sector * 512);
+ break;
+
+ case VIRTIO_BLK_T_SCSI_CMD:
+ /* TODO support SCSI commands */
+ complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP);
+ return 0;
+
+ case VIRTIO_BLK_T_FLUSH:
+ /* TODO fdsync not supported by Linux AIO, do it synchronously here! */
+ if (qemu_fdatasync(s->fd) < 0) {
+ complete_request_early(s, head, inhdr, VIRTIO_BLK_S_IOERR);
+ } else {
+ complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
+ }
+ return 0;
+
+ case VIRTIO_BLK_T_GET_ID:
+ do_get_id_cmd(s, in_iov, in_num, head, inhdr);
+ return 0;
+
+ default:
+ error_report("virtio-blk unsupported request type %#x", outhdr.type);
+ qemu_iovec_destroy(inhdr);
+ g_slice_free(QEMUIOVector, inhdr);
+ return -EFAULT;
+ }
+
+ /* Fill in virtio block metadata needed for completion */
+ VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
+ req->head = head;
+ req->inhdr = inhdr;
+ return 0;
+}
+
+static void handle_notify(EventHandler *handler)
+{
+ VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
+ notify_handler);
+
+ /* There is one array of iovecs into which all new requests are extracted
+ * from the vring. Requests are read from the vring and the translated
+ * descriptors are written to the iovecs array. The iovecs do not have to
+ * persist across handle_notify() calls because the kernel copies the
+ * iovecs on io_submit().
+ *
+ * Handling io_submit() EAGAIN may require storing the requests across
+ * handle_notify() calls until the kernel has sufficient resources to
+ * accept more I/O. This is not implemented yet.
+ */
+ struct iovec iovec[VRING_MAX];
+ struct iovec *end = &iovec[VRING_MAX];
+ struct iovec *iov = iovec;
+
+ /* When a request is read from the vring, the index of the first descriptor
+ * (aka head) is returned so that the completed request can be pushed onto
+ * the vring later.
+ *
+ * The number of hypervisor read-only iovecs is out_num. The number of
+ * hypervisor write-only iovecs is in_num.
+ */
+ int head;
+ unsigned int out_num = 0, in_num = 0;
+ unsigned int num_queued;
+
+ for (;;) {
+ /* Disable guest->host notifies to avoid unnecessary vmexits */
+ vring_disable_notification(s->vdev, &s->vring);
+
+ for (;;) {
+ head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num);
+ if (head < 0) {
+ break; /* no more requests */
+ }
+
+ trace_virtio_blk_data_plane_process_request(s, out_num, in_num,
+ head);
+
+ if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) {
+ vring_set_broken(&s->vring);
+ break;
+ }
+ iov += out_num + in_num;
+ }
+
+ if (likely(head == -EAGAIN)) { /* vring emptied */
+ /* Re-enable guest->host notifies and stop processing the vring.
+ * But if the guest has snuck in more descriptors, keep processing.
+ */
+ if (vring_enable_notification(s->vdev, &s->vring)) {
+ break;
+ }
+ } else { /* head == -ENOBUFS or fatal error, iovecs[] is depleted */
+ /* Since there are no iovecs[] left, stop processing for now. Do
+ * not re-enable guest->host notifies since the I/O completion
+ * handler knows to check for more vring descriptors anyway.
+ */
+ break;
+ }
+ }
+
+ num_queued = ioq_num_queued(&s->ioqueue);
+ if (num_queued > 0) {
+ s->num_reqs += num_queued;
+
+ int rc = ioq_submit(&s->ioqueue);
+ if (unlikely(rc < 0)) {
+ fprintf(stderr, "ioq_submit failed %d\n", rc);
+ exit(1);
+ }
+ }
+}
+
+static void handle_io(EventHandler *handler)
+{
+ VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
+ io_handler);
+
+ if (ioq_run_completion(&s->ioqueue, complete_request, s) > 0) {
+ notify_guest(s);
+ }
+
+ /* If there were more requests than iovecs, the vring will not be empty yet
+ * so check again. There should now be enough resources to process more
+ * requests.
+ */
+ if (unlikely(vring_more_avail(&s->vring))) {
+ handle_notify(&s->notify_handler);
+ }
+}
+
+static void *data_plane_thread(void *opaque)
+{
+ VirtIOBlockDataPlane *s = opaque;
+
+ do {
+ event_poll(&s->event_poll);
+ } while (s->started || s->num_reqs > 0);
+ return NULL;
+}
+
+static void start_data_plane_bh(void *opaque)
+{
+ VirtIOBlockDataPlane *s = opaque;
+
+ qemu_bh_delete(s->start_bh);
+ s->start_bh = NULL;
+ qemu_thread_create(&s->thread, data_plane_thread,
+ s, QEMU_THREAD_JOINABLE);
+}
+
+bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
+ VirtIOBlockDataPlane **dataplane)
+{
+ VirtIOBlockDataPlane *s;
+ int fd;
+
+ *dataplane = NULL;
+
+ if (!blk->data_plane) {
+ return true;
+ }
+
+ if (blk->scsi) {
+ error_report("device is incompatible with x-data-plane, use scsi=off");
+ return false;
+ }
+
+ if (blk->config_wce) {
+ error_report("device is incompatible with x-data-plane, "
+ "use config-wce=off");
+ return false;
+ }
+
+ fd = raw_get_aio_fd(blk->conf.bs);
+ if (fd < 0) {
+ error_report("drive is incompatible with x-data-plane, "
+ "use format=raw,cache=none,aio=native");
+ return false;
+ }
+
+ s = g_new0(VirtIOBlockDataPlane, 1);
+ s->vdev = vdev;
+ s->fd = fd;
+ s->blk = blk;
+
+ /* Prevent block operations that conflict with data plane thread */
+ bdrv_set_in_use(blk->conf.bs, 1);
+
+ error_setg(&s->migration_blocker,
+ "x-data-plane does not support migration");
+ migrate_add_blocker(s->migration_blocker);
+
+ *dataplane = s;
+ return true;
+}
+
+void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
+{
+ if (!s) {
+ return;
+ }
+
+ virtio_blk_data_plane_stop(s);
+ migrate_del_blocker(s->migration_blocker);
+ error_free(s->migration_blocker);
+ bdrv_set_in_use(s->blk->conf.bs, 0);
+ g_free(s);
+}
+
+void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
+{
+ VirtQueue *vq;
+ int i;
+
+ if (s->started) {
+ return;
+ }
+
+ vq = virtio_get_queue(s->vdev, 0);
+ if (!vring_setup(&s->vring, s->vdev, 0)) {
+ return;
+ }
+
+ event_poll_init(&s->event_poll);
+
+ /* Set up guest notifier (irq) */
+ if (s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque,
+ true) != 0) {
+ fprintf(stderr, "virtio-blk failed to set guest notifier, "
+ "ensure -enable-kvm is set\n");
+ exit(1);
+ }
+ s->guest_notifier = virtio_queue_get_guest_notifier(vq);
+
+ /* Set up virtqueue notify */
+ if (s->vdev->binding->set_host_notifier(s->vdev->binding_opaque,
+ 0, true) != 0) {
+ fprintf(stderr, "virtio-blk failed to set host notifier\n");
+ exit(1);
+ }
+ event_poll_add(&s->event_poll, &s->notify_handler,
+ virtio_queue_get_host_notifier(vq),
+ handle_notify);
+
+ /* Set up ioqueue */
+ ioq_init(&s->ioqueue, s->fd, REQ_MAX);
+ for (i = 0; i < ARRAY_SIZE(s->requests); i++) {
+ ioq_put_iocb(&s->ioqueue, &s->requests[i].iocb);
+ }
+ event_poll_add(&s->event_poll, &s->io_handler,
+ ioq_get_notifier(&s->ioqueue), handle_io);
+
+ s->started = true;
+ trace_virtio_blk_data_plane_start(s);
+
+ /* Kick right away to begin processing requests already in vring */
+ event_notifier_set(virtio_queue_get_host_notifier(vq));
+
+ /* Spawn thread in BH so it inherits iothread cpusets */
+ s->start_bh = qemu_bh_new(start_data_plane_bh, s);
+ qemu_bh_schedule(s->start_bh);
+}
+
+void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
+{
+ if (!s->started) {
+ return;
+ }
+ s->started = false;
+ trace_virtio_blk_data_plane_stop(s);
+
+ /* Stop thread or cancel pending thread creation BH */
+ if (s->start_bh) {
+ qemu_bh_delete(s->start_bh);
+ s->start_bh = NULL;
+ } else {
+ event_poll_notify(&s->event_poll);
+ qemu_thread_join(&s->thread);
+ }
+
+ ioq_cleanup(&s->ioqueue);
+
+ s->vdev->binding->set_host_notifier(s->vdev->binding_opaque, 0, false);
+
+ event_poll_cleanup(&s->event_poll);
+
+ /* Clean up guest notifier (irq) */
+ s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, false);
+
+ vring_teardown(&s->vring);
+}
diff --git a/hw/dataplane/virtio-blk.h b/hw/dataplane/virtio-blk.h
new file mode 100644
index 0000000..1e8fdfe
--- /dev/null
+++ b/hw/dataplane/virtio-blk.h
@@ -0,0 +1,29 @@
+/*
+ * Dedicated thread for virtio-blk I/O processing
+ *
+ * Copyright 2012 IBM, Corp.
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef HW_DATAPLANE_VIRTIO_BLK_H
+#define HW_DATAPLANE_VIRTIO_BLK_H
+
+#include "hw/virtio.h"
+
+typedef struct VirtIOBlockDataPlane VirtIOBlockDataPlane;
+
+bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
+ VirtIOBlockDataPlane **dataplane);
+void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s);
+void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s);
+void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s);
+void virtio_blk_data_plane_drain(VirtIOBlockDataPlane *s);
+
+#endif /* HW_DATAPLANE_VIRTIO_BLK_H */
diff --git a/hw/virtio-blk.h b/hw/virtio-blk.h
index 454f445..43ca492 100644
--- a/hw/virtio-blk.h
+++ b/hw/virtio-blk.h
@@ -105,6 +105,7 @@ struct VirtIOBlkConf
char *serial;
uint32_t scsi;
uint32_t config_wce;
+ uint32_t data_plane;
};
#define DEFINE_VIRTIO_BLK_FEATURES(_state, _field) \
diff --git a/trace-events b/trace-events
index 167d776..4023a4c 100644
--- a/trace-events
+++ b/trace-events
@@ -98,6 +98,12 @@ virtio_blk_rw_complete(void *req, int ret) "req %p ret %d"
virtio_blk_handle_write(void *req, uint64_t sector, size_t nsectors) "req %p sector %"PRIu64" nsectors %zu"
virtio_blk_handle_read(void *req, uint64_t sector, size_t nsectors) "req %p sector %"PRIu64" nsectors %zu"
+# hw/dataplane/virtio-blk.c
+virtio_blk_data_plane_start(void *s) "dataplane %p"
+virtio_blk_data_plane_stop(void *s) "dataplane %p"
+virtio_blk_data_plane_process_request(void *s, unsigned int out_num, unsigned int in_num, unsigned int head) "dataplane %p out_num %u in_num %u head %u"
+virtio_blk_data_plane_complete_request(void *s, unsigned int head, int ret) "dataplane %p head %u ret %d"
+
# hw/dataplane/vring.c
vring_setup(uint64_t physical, void *desc, void *avail, void *used) "vring physical %#"PRIx64" desc %p avail %p used %p"
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 12/18] virtio-blk: add x-data-plane=on|off performance feature
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
` (10 preceding siblings ...)
2012-12-19 15:38 ` [Qemu-devel] [PATCH 11/18] dataplane: add virtio-blk data plane code Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 13/18] virtio-blk: Return UNSUPP for unknown request types Stefan Hajnoczi
` (5 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
The virtio-blk-data-plane feature is easy to integrate into
hw/virtio-blk.c. The data plane can be started and stopped similar to
vhost-net.
Users can take advantage of the virtio-blk-data-plane feature using the
new -device virtio-blk-pci,x-data-plane=on property.
The x-data-plane name was chosen because at this stage the feature is
experimental and likely to see changes in the future.
If the VM configuration does not support virtio-blk-data-plane an error
message is printed. Although we could fall back to regular virtio-blk,
I prefer the explicit approach since it prompts the user to fix their
configuration if they want the performance benefit of
virtio-blk-data-plane.
Limitations:
* Only format=raw is supported
* Live migration is not supported
* Block jobs, hot unplug, and other operations fail with -EBUSY
* I/O throttling limits are ignored
* Only Linux hosts are supported due to Linux AIO usage
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/virtio-blk.c | 44 +++++++++++++++++++++++++++++++++++++++++++-
hw/virtio-pci.c | 3 +++
2 files changed, 46 insertions(+), 1 deletion(-)
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index fabf387..538a0ae 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -17,6 +17,9 @@
#include "hw/block-common.h"
#include "blockdev.h"
#include "virtio-blk.h"
+#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
+#include "hw/dataplane/virtio-blk.h"
+#endif
#include "scsi-defs.h"
#ifdef __linux__
# include <scsi/sg.h>
@@ -33,6 +36,9 @@ typedef struct VirtIOBlock
VirtIOBlkConf *blk;
unsigned short sector_mask;
DeviceState *qdev;
+#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
+ VirtIOBlockDataPlane *dataplane;
+#endif
} VirtIOBlock;
static VirtIOBlock *to_virtio_blk(VirtIODevice *vdev)
@@ -407,6 +413,16 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, VirtQueue *vq)
.num_writes = 0,
};
+#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
+ /* Some guests kick before setting VIRTIO_CONFIG_S_DRIVER_OK so start
+ * dataplane here instead of waiting for .set_status().
+ */
+ if (s->dataplane) {
+ virtio_blk_data_plane_start(s->dataplane);
+ return;
+ }
+#endif
+
while ((req = virtio_blk_get_request(s))) {
virtio_blk_handle_request(req, &mrb);
}
@@ -446,8 +462,9 @@ static void virtio_blk_dma_restart_cb(void *opaque, int running,
{
VirtIOBlock *s = opaque;
- if (!running)
+ if (!running) {
return;
+ }
if (!s->bh) {
s->bh = qemu_bh_new(virtio_blk_dma_restart_bh, s);
@@ -457,6 +474,14 @@ static void virtio_blk_dma_restart_cb(void *opaque, int running,
static void virtio_blk_reset(VirtIODevice *vdev)
{
+#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
+ VirtIOBlock *s = to_virtio_blk(vdev);
+
+ if (s->dataplane) {
+ virtio_blk_data_plane_stop(s->dataplane);
+ }
+#endif
+
/*
* This should cancel pending requests, but can't do nicely until there
* are per-device request lists.
@@ -541,6 +566,12 @@ static void virtio_blk_set_status(VirtIODevice *vdev, uint8_t status)
VirtIOBlock *s = to_virtio_blk(vdev);
uint32_t features;
+#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
+ if (s->dataplane && !(status & VIRTIO_CONFIG_S_DRIVER)) {
+ virtio_blk_data_plane_stop(s->dataplane);
+ }
+#endif
+
if (!(status & VIRTIO_CONFIG_S_DRIVER_OK)) {
return;
}
@@ -638,6 +669,12 @@ VirtIODevice *virtio_blk_init(DeviceState *dev, VirtIOBlkConf *blk)
s->sector_mask = (s->conf->logical_block_size / BDRV_SECTOR_SIZE) - 1;
s->vq = virtio_add_queue(&s->vdev, 128, virtio_blk_handle_output);
+#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
+ if (!virtio_blk_data_plane_create(&s->vdev, blk, &s->dataplane)) {
+ virtio_cleanup(&s->vdev);
+ return NULL;
+ }
+#endif
qemu_add_vm_change_state_handler(virtio_blk_dma_restart_cb, s);
s->qdev = dev;
@@ -655,6 +692,11 @@ VirtIODevice *virtio_blk_init(DeviceState *dev, VirtIOBlkConf *blk)
void virtio_blk_exit(VirtIODevice *vdev)
{
VirtIOBlock *s = to_virtio_blk(vdev);
+
+#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
+ virtio_blk_data_plane_destroy(s->dataplane);
+ s->dataplane = NULL;
+#endif
unregister_savevm(s->qdev, "virtio-blk", s);
blockdev_mark_auto_del(s->bs);
virtio_cleanup(vdev);
diff --git a/hw/virtio-pci.c b/hw/virtio-pci.c
index 71f4fb5..32cc910 100644
--- a/hw/virtio-pci.c
+++ b/hw/virtio-pci.c
@@ -897,6 +897,9 @@ static Property virtio_blk_properties[] = {
#endif
DEFINE_PROP_BIT("config-wce", VirtIOPCIProxy, blk.config_wce, 0, true),
DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags, VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true),
+#ifdef CONFIG_VIRTIO_BLK_DATA_PLANE
+ DEFINE_PROP_BIT("x-data-plane", VirtIOPCIProxy, blk.data_plane, 0, false),
+#endif
DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 2),
DEFINE_VIRTIO_BLK_FEATURES(VirtIOPCIProxy, host_features),
DEFINE_PROP_END_OF_LIST(),
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 13/18] virtio-blk: Return UNSUPP for unknown request types
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
` (11 preceding siblings ...)
2012-12-19 15:38 ` [Qemu-devel] [PATCH 12/18] virtio-blk: add x-data-plane=on|off performance feature Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 14/18] cutils: change strtosz_suffix_unit function Stefan Hajnoczi
` (4 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
From: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Currently, all unknown requests are treated as VIRTIO_BLK_T_IN
Signed-off-by: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/virtio-blk.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/hw/virtio-blk.c b/hw/virtio-blk.c
index 538a0ae..132180c 100644
--- a/hw/virtio-blk.c
+++ b/hw/virtio-blk.c
@@ -398,10 +398,14 @@ static void virtio_blk_handle_request(VirtIOBlockReq *req,
qemu_iovec_init_external(&req->qiov, &req->elem.out_sg[1],
req->elem.out_num - 1);
virtio_blk_handle_write(req, mrb);
- } else {
+ } else if (type == VIRTIO_BLK_T_IN || type == VIRTIO_BLK_T_BARRIER) {
+ /* VIRTIO_BLK_T_IN is 0, so we can't just & it. */
qemu_iovec_init_external(&req->qiov, &req->elem.in_sg[0],
req->elem.in_num - 1);
virtio_blk_handle_read(req);
+ } else {
+ virtio_blk_req_complete(req, VIRTIO_BLK_S_UNSUPP);
+ g_free(req);
}
}
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 14/18] cutils: change strtosz_suffix_unit function
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
` (12 preceding siblings ...)
2012-12-19 15:38 ` [Qemu-devel] [PATCH 13/18] virtio-blk: Return UNSUPP for unknown request types Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 15/18] qemu-img: report size overflow error message Stefan Hajnoczi
` (3 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
From: liguang <lig.fnst@cn.fujitsu.com>
if value to be translated is larger than INT64_MAX,
this function will not be convenient for caller to
be aware of it, so change a little for this.
Signed-off-by: liguang <lig.fnst@cn.fujitsu.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
cutils.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/cutils.c b/cutils.c
index 4f0692f..0433e05 100644
--- a/cutils.c
+++ b/cutils.c
@@ -214,12 +214,13 @@ static int64_t suffix_mul(char suffix, int64_t unit)
/*
* Convert string to bytes, allowing either B/b for bytes, K/k for KB,
* M/m for MB, G/g for GB or T/t for TB. End pointer will be returned
- * in *end, if not NULL. Return -1 on error.
+ * in *end, if not NULL. Return -ERANGE on overflow, Return -EINVAL on
+ * other error.
*/
int64_t strtosz_suffix_unit(const char *nptr, char **end,
const char default_suffix, int64_t unit)
{
- int64_t retval = -1;
+ int64_t retval = -EINVAL;
char *endptr;
unsigned char c;
int mul_required = 0;
@@ -246,6 +247,7 @@ int64_t strtosz_suffix_unit(const char *nptr, char **end,
goto fail;
}
if ((val * mul >= INT64_MAX) || val < 0) {
+ retval = -ERANGE;
goto fail;
}
retval = val * mul;
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 15/18] qemu-img: report size overflow error message
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
` (13 preceding siblings ...)
2012-12-19 15:38 ` [Qemu-devel] [PATCH 14/18] cutils: change strtosz_suffix_unit function Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 16/18] block/raw-win32: Fix compiler warnings (wrong format specifiers) Stefan Hajnoczi
` (2 subsequent siblings)
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
From: liguang <lig.fnst@cn.fujitsu.com>
qemu-img will complain when qcow or qcow2
size overflow for 64 bits, report the right
message in this condition.
$./qemu-img create -f qcow2 /tmp/foo 0x10000000000000000
before change:
qemu-img: Invalid image size specified! You may use k, M, G or T suffixes for
qemu-img: kilobytes, megabytes, gigabytes and terabytes.
after change:
qemu-img: Image size must be less than 8 EiB!
[Resolved conflict with a9300911 goto removal -- Stefan]
Signed-off-by: liguang <lig.fnst@cn.fujitsu.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
qemu-img.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/qemu-img.c b/qemu-img.c
index c989a52..da44685 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -348,9 +348,13 @@ static int img_create(int argc, char **argv)
char *end;
sval = strtosz_suffix(argv[optind++], &end, STRTOSZ_DEFSUFFIX_B);
if (sval < 0 || *end) {
- error_report("Invalid image size specified! You may use k, M, G or "
- "T suffixes for ");
- error_report("kilobytes, megabytes, gigabytes and terabytes.");
+ if (sval == -ERANGE) {
+ error_report("Image size must be less than 8 EiB!");
+ } else {
+ error_report("Invalid image size specified! You may use k, M, "
+ "G or T suffixes for ");
+ error_report("kilobytes, megabytes, gigabytes and terabytes.");
+ }
return 1;
}
img_size = (uint64_t)sval;
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 16/18] block/raw-win32: Fix compiler warnings (wrong format specifiers)
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
` (14 preceding siblings ...)
2012-12-19 15:38 ` [Qemu-devel] [PATCH 15/18] qemu-img: report size overflow error message Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 17/18] sheepdog: don't update inode when create_and_write fails Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 18/18] sheepdog: pass oid directly to send_pending_req() Stefan Hajnoczi
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
From: Stefan Weil <sw@weilnetz.de>
Commit fbcad04d6bfdff937536eb23088a01a280a1a3af added fprintf statements
with wrong format specifiers.
GetLastError() returns a DWORD which is unsigned long, so %lu must be used.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/raw-win32.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/block/raw-win32.c b/block/raw-win32.c
index ce207a3..a2e155e 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -314,11 +314,11 @@ static int raw_truncate(BlockDriverState *bs, int64_t offset)
*/
dwPtrLow = SetFilePointer(s->hfile, low, &high, FILE_BEGIN);
if (dwPtrLow == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) {
- fprintf(stderr, "SetFilePointer error: %d\n", GetLastError());
+ fprintf(stderr, "SetFilePointer error: %lu\n", GetLastError());
return -EIO;
}
if (SetEndOfFile(s->hfile) == 0) {
- fprintf(stderr, "SetEndOfFile error: %d\n", GetLastError());
+ fprintf(stderr, "SetEndOfFile error: %lu\n", GetLastError());
return -EIO;
}
return 0;
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 17/18] sheepdog: don't update inode when create_and_write fails
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
` (15 preceding siblings ...)
2012-12-19 15:38 ` [Qemu-devel] [PATCH 16/18] block/raw-win32: Fix compiler warnings (wrong format specifiers) Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
2012-12-19 15:38 ` [Qemu-devel] [PATCH 18/18] sheepdog: pass oid directly to send_pending_req() Stefan Hajnoczi
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
From: Liu Yuan <tailai.ly@taobao.com>
For the error case such as SD_RES_NO_SPACE, we shouldn't update the inode bitmap
to avoid the scenario that the object is allocated but wasn't created at the
server side. This will result in VM's IO error on the failed object.
Cc: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Cc: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Liu Yuan <tailai.ly@taobao.com>
Reviewed-by: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/sheepdog.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/block/sheepdog.c b/block/sheepdog.c
index a48f58c..ef7bc81 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -714,10 +714,11 @@ static void coroutine_fn aio_read_response(void *opaque)
* and max_dirty_data_idx are changed to include updated
* index between them.
*/
- s->inode.data_vdi_id[idx] = s->inode.vdi_id;
- s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
- s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
-
+ if (rsp.result == SD_RES_SUCCESS) {
+ s->inode.data_vdi_id[idx] = s->inode.vdi_id;
+ s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
+ s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
+ }
/*
* Some requests may be blocked because simultaneous
* create requests are not allowed, so we search the
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 18/18] sheepdog: pass oid directly to send_pending_req()
2012-12-19 15:38 [Qemu-devel] [PULL 00/18] Block patches Stefan Hajnoczi
` (16 preceding siblings ...)
2012-12-19 15:38 ` [Qemu-devel] [PATCH 17/18] sheepdog: don't update inode when create_and_write fails Stefan Hajnoczi
@ 2012-12-19 15:38 ` Stefan Hajnoczi
17 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2012-12-19 15:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori
From: Liu Yuan <tailai.ly@taobao.com>
Cc: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Cc: Kevin Wolf <kwolf@redhat.com>
Signed-off-by: Liu Yuan <tailai.ly@taobao.com>
Reviewed-by: MORITA Kazutaka <morita.kazutaka@lab.ntt.co.jp>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
block/sheepdog.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/block/sheepdog.c b/block/sheepdog.c
index ef7bc81..ceabc00 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -724,7 +724,7 @@ static void coroutine_fn aio_read_response(void *opaque)
* create requests are not allowed, so we search the
* pending requests here.
*/
- send_pending_req(s, vid_to_data_oid(s->inode.vdi_id, idx));
+ send_pending_req(s, aio_req->oid);
}
break;
case AIOCB_READ_UDATA:
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread
* [Qemu-devel] [PATCH 11/18] dataplane: add virtio-blk data plane code
2013-01-02 15:15 [Qemu-devel] [PULL v2 00/18] Block patches Stefan Hajnoczi
@ 2013-01-02 15:15 ` Stefan Hajnoczi
0 siblings, 0 replies; 20+ messages in thread
From: Stefan Hajnoczi @ 2013-01-02 15:15 UTC (permalink / raw)
To: qemu-devel; +Cc: Anthony Liguori, Stefan Hajnoczi
virtio-blk-data-plane is a subset implementation of virtio-blk. It only
handles read, write, and flush requests. It does this using a dedicated
thread that executes an epoll(2)-based event loop and processes I/O
using Linux AIO.
This approach performs very well but can be used for raw image files
only. The number of IOPS achieved has been reported to be several times
higher than the existing virtio-blk implementation.
Eventually it should be possible to unify virtio-blk-data-plane with the
main body of QEMU code once the block layer and hardware emulation is
able to run outside the global mutex.
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
hw/dataplane/Makefile.objs | 2 +-
hw/dataplane/virtio-blk.c | 465 +++++++++++++++++++++++++++++++++++++++++++++
hw/dataplane/virtio-blk.h | 29 +++
hw/virtio-blk.h | 1 +
trace-events | 6 +
5 files changed, 502 insertions(+), 1 deletion(-)
create mode 100644 hw/dataplane/virtio-blk.c
create mode 100644 hw/dataplane/virtio-blk.h
diff --git a/hw/dataplane/Makefile.objs b/hw/dataplane/Makefile.objs
index abd408f..682aa9e 100644
--- a/hw/dataplane/Makefile.objs
+++ b/hw/dataplane/Makefile.objs
@@ -1,3 +1,3 @@
ifeq ($(CONFIG_VIRTIO), y)
-common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += hostmem.o vring.o event-poll.o ioq.o
+common-obj-$(CONFIG_VIRTIO_BLK_DATA_PLANE) += hostmem.o vring.o event-poll.o ioq.o virtio-blk.o
endif
diff --git a/hw/dataplane/virtio-blk.c b/hw/dataplane/virtio-blk.c
new file mode 100644
index 0000000..4c4ad84
--- /dev/null
+++ b/hw/dataplane/virtio-blk.c
@@ -0,0 +1,465 @@
+/*
+ * Dedicated thread for virtio-blk I/O processing
+ *
+ * Copyright 2012 IBM, Corp.
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "qemu/iov.h"
+#include "event-poll.h"
+#include "qemu/thread.h"
+#include "vring.h"
+#include "ioq.h"
+#include "migration/migration.h"
+#include "hw/virtio-blk.h"
+#include "hw/dataplane/virtio-blk.h"
+
+enum {
+ SEG_MAX = 126, /* maximum number of I/O segments */
+ VRING_MAX = SEG_MAX + 2, /* maximum number of vring descriptors */
+ REQ_MAX = VRING_MAX, /* maximum number of requests in the vring,
+ * is VRING_MAX / 2 with traditional and
+ * VRING_MAX with indirect descriptors */
+};
+
+typedef struct {
+ struct iocb iocb; /* Linux AIO control block */
+ QEMUIOVector *inhdr; /* iovecs for virtio_blk_inhdr */
+ unsigned int head; /* vring descriptor index */
+} VirtIOBlockRequest;
+
+struct VirtIOBlockDataPlane {
+ bool started;
+ QEMUBH *start_bh;
+ QemuThread thread;
+
+ VirtIOBlkConf *blk;
+ int fd; /* image file descriptor */
+
+ VirtIODevice *vdev;
+ Vring vring; /* virtqueue vring */
+ EventNotifier *guest_notifier; /* irq */
+
+ EventPoll event_poll; /* event poller */
+ EventHandler io_handler; /* Linux AIO completion handler */
+ EventHandler notify_handler; /* virtqueue notify handler */
+
+ IOQueue ioqueue; /* Linux AIO queue (should really be per
+ dataplane thread) */
+ VirtIOBlockRequest requests[REQ_MAX]; /* pool of requests, managed by the
+ queue */
+
+ unsigned int num_reqs;
+
+ Error *migration_blocker;
+};
+
+/* Raise an interrupt to signal guest, if necessary */
+static void notify_guest(VirtIOBlockDataPlane *s)
+{
+ if (!vring_should_notify(s->vdev, &s->vring)) {
+ return;
+ }
+
+ event_notifier_set(s->guest_notifier);
+}
+
+static void complete_request(struct iocb *iocb, ssize_t ret, void *opaque)
+{
+ VirtIOBlockDataPlane *s = opaque;
+ VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
+ struct virtio_blk_inhdr hdr;
+ int len;
+
+ if (likely(ret >= 0)) {
+ hdr.status = VIRTIO_BLK_S_OK;
+ len = ret;
+ } else {
+ hdr.status = VIRTIO_BLK_S_IOERR;
+ len = 0;
+ }
+
+ trace_virtio_blk_data_plane_complete_request(s, req->head, ret);
+
+ qemu_iovec_from_buf(req->inhdr, 0, &hdr, sizeof(hdr));
+ qemu_iovec_destroy(req->inhdr);
+ g_slice_free(QEMUIOVector, req->inhdr);
+
+ /* According to the virtio specification len should be the number of bytes
+ * written to, but for virtio-blk it seems to be the number of bytes
+ * transferred plus the status bytes.
+ */
+ vring_push(&s->vring, req->head, len + sizeof(hdr));
+
+ s->num_reqs--;
+}
+
+static void complete_request_early(VirtIOBlockDataPlane *s, unsigned int head,
+ QEMUIOVector *inhdr, unsigned char status)
+{
+ struct virtio_blk_inhdr hdr = {
+ .status = status,
+ };
+
+ qemu_iovec_from_buf(inhdr, 0, &hdr, sizeof(hdr));
+ qemu_iovec_destroy(inhdr);
+ g_slice_free(QEMUIOVector, inhdr);
+
+ vring_push(&s->vring, head, sizeof(hdr));
+ notify_guest(s);
+}
+
+/* Get disk serial number */
+static void do_get_id_cmd(VirtIOBlockDataPlane *s,
+ struct iovec *iov, unsigned int iov_cnt,
+ unsigned int head, QEMUIOVector *inhdr)
+{
+ char id[VIRTIO_BLK_ID_BYTES];
+
+ /* Serial number not NUL-terminated when shorter than buffer */
+ strncpy(id, s->blk->serial ? s->blk->serial : "", sizeof(id));
+ iov_from_buf(iov, iov_cnt, 0, id, sizeof(id));
+ complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
+}
+
+static int process_request(IOQueue *ioq, struct iovec iov[],
+ unsigned int out_num, unsigned int in_num,
+ unsigned int head)
+{
+ VirtIOBlockDataPlane *s = container_of(ioq, VirtIOBlockDataPlane, ioqueue);
+ struct iovec *in_iov = &iov[out_num];
+ struct virtio_blk_outhdr outhdr;
+ QEMUIOVector *inhdr;
+ size_t in_size;
+ struct iocb *iocb;
+
+ /* Copy in outhdr */
+ if (unlikely(iov_to_buf(iov, out_num, 0, &outhdr,
+ sizeof(outhdr)) != sizeof(outhdr))) {
+ error_report("virtio-blk request outhdr too short");
+ return -EFAULT;
+ }
+ iov_discard_front(&iov, &out_num, sizeof(outhdr));
+
+ /* Grab inhdr for later */
+ in_size = iov_size(in_iov, in_num);
+ if (in_size < sizeof(struct virtio_blk_inhdr)) {
+ error_report("virtio_blk request inhdr too short");
+ return -EFAULT;
+ }
+ inhdr = g_slice_new(QEMUIOVector);
+ qemu_iovec_init(inhdr, 1);
+ qemu_iovec_concat_iov(inhdr, in_iov, in_num,
+ in_size - sizeof(struct virtio_blk_inhdr),
+ sizeof(struct virtio_blk_inhdr));
+ iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
+
+ /* TODO Linux sets the barrier bit even when not advertised! */
+ outhdr.type &= ~VIRTIO_BLK_T_BARRIER;
+
+ switch (outhdr.type) {
+ case VIRTIO_BLK_T_IN:
+ iocb = ioq_rdwr(ioq, true, in_iov, in_num, outhdr.sector * 512);
+ break;
+
+ case VIRTIO_BLK_T_OUT:
+ iocb = ioq_rdwr(ioq, false, iov, out_num, outhdr.sector * 512);
+ break;
+
+ case VIRTIO_BLK_T_SCSI_CMD:
+ /* TODO support SCSI commands */
+ complete_request_early(s, head, inhdr, VIRTIO_BLK_S_UNSUPP);
+ return 0;
+
+ case VIRTIO_BLK_T_FLUSH:
+ /* TODO fdsync not supported by Linux AIO, do it synchronously here! */
+ if (qemu_fdatasync(s->fd) < 0) {
+ complete_request_early(s, head, inhdr, VIRTIO_BLK_S_IOERR);
+ } else {
+ complete_request_early(s, head, inhdr, VIRTIO_BLK_S_OK);
+ }
+ return 0;
+
+ case VIRTIO_BLK_T_GET_ID:
+ do_get_id_cmd(s, in_iov, in_num, head, inhdr);
+ return 0;
+
+ default:
+ error_report("virtio-blk unsupported request type %#x", outhdr.type);
+ qemu_iovec_destroy(inhdr);
+ g_slice_free(QEMUIOVector, inhdr);
+ return -EFAULT;
+ }
+
+ /* Fill in virtio block metadata needed for completion */
+ VirtIOBlockRequest *req = container_of(iocb, VirtIOBlockRequest, iocb);
+ req->head = head;
+ req->inhdr = inhdr;
+ return 0;
+}
+
+static void handle_notify(EventHandler *handler)
+{
+ VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
+ notify_handler);
+
+ /* There is one array of iovecs into which all new requests are extracted
+ * from the vring. Requests are read from the vring and the translated
+ * descriptors are written to the iovecs array. The iovecs do not have to
+ * persist across handle_notify() calls because the kernel copies the
+ * iovecs on io_submit().
+ *
+ * Handling io_submit() EAGAIN may require storing the requests across
+ * handle_notify() calls until the kernel has sufficient resources to
+ * accept more I/O. This is not implemented yet.
+ */
+ struct iovec iovec[VRING_MAX];
+ struct iovec *end = &iovec[VRING_MAX];
+ struct iovec *iov = iovec;
+
+ /* When a request is read from the vring, the index of the first descriptor
+ * (aka head) is returned so that the completed request can be pushed onto
+ * the vring later.
+ *
+ * The number of hypervisor read-only iovecs is out_num. The number of
+ * hypervisor write-only iovecs is in_num.
+ */
+ int head;
+ unsigned int out_num = 0, in_num = 0;
+ unsigned int num_queued;
+
+ for (;;) {
+ /* Disable guest->host notifies to avoid unnecessary vmexits */
+ vring_disable_notification(s->vdev, &s->vring);
+
+ for (;;) {
+ head = vring_pop(s->vdev, &s->vring, iov, end, &out_num, &in_num);
+ if (head < 0) {
+ break; /* no more requests */
+ }
+
+ trace_virtio_blk_data_plane_process_request(s, out_num, in_num,
+ head);
+
+ if (process_request(&s->ioqueue, iov, out_num, in_num, head) < 0) {
+ vring_set_broken(&s->vring);
+ break;
+ }
+ iov += out_num + in_num;
+ }
+
+ if (likely(head == -EAGAIN)) { /* vring emptied */
+ /* Re-enable guest->host notifies and stop processing the vring.
+ * But if the guest has snuck in more descriptors, keep processing.
+ */
+ if (vring_enable_notification(s->vdev, &s->vring)) {
+ break;
+ }
+ } else { /* head == -ENOBUFS or fatal error, iovecs[] is depleted */
+ /* Since there are no iovecs[] left, stop processing for now. Do
+ * not re-enable guest->host notifies since the I/O completion
+ * handler knows to check for more vring descriptors anyway.
+ */
+ break;
+ }
+ }
+
+ num_queued = ioq_num_queued(&s->ioqueue);
+ if (num_queued > 0) {
+ s->num_reqs += num_queued;
+
+ int rc = ioq_submit(&s->ioqueue);
+ if (unlikely(rc < 0)) {
+ fprintf(stderr, "ioq_submit failed %d\n", rc);
+ exit(1);
+ }
+ }
+}
+
+static void handle_io(EventHandler *handler)
+{
+ VirtIOBlockDataPlane *s = container_of(handler, VirtIOBlockDataPlane,
+ io_handler);
+
+ if (ioq_run_completion(&s->ioqueue, complete_request, s) > 0) {
+ notify_guest(s);
+ }
+
+ /* If there were more requests than iovecs, the vring will not be empty yet
+ * so check again. There should now be enough resources to process more
+ * requests.
+ */
+ if (unlikely(vring_more_avail(&s->vring))) {
+ handle_notify(&s->notify_handler);
+ }
+}
+
+static void *data_plane_thread(void *opaque)
+{
+ VirtIOBlockDataPlane *s = opaque;
+
+ do {
+ event_poll(&s->event_poll);
+ } while (s->started || s->num_reqs > 0);
+ return NULL;
+}
+
+static void start_data_plane_bh(void *opaque)
+{
+ VirtIOBlockDataPlane *s = opaque;
+
+ qemu_bh_delete(s->start_bh);
+ s->start_bh = NULL;
+ qemu_thread_create(&s->thread, data_plane_thread,
+ s, QEMU_THREAD_JOINABLE);
+}
+
+bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
+ VirtIOBlockDataPlane **dataplane)
+{
+ VirtIOBlockDataPlane *s;
+ int fd;
+
+ *dataplane = NULL;
+
+ if (!blk->data_plane) {
+ return true;
+ }
+
+ if (blk->scsi) {
+ error_report("device is incompatible with x-data-plane, use scsi=off");
+ return false;
+ }
+
+ if (blk->config_wce) {
+ error_report("device is incompatible with x-data-plane, "
+ "use config-wce=off");
+ return false;
+ }
+
+ fd = raw_get_aio_fd(blk->conf.bs);
+ if (fd < 0) {
+ error_report("drive is incompatible with x-data-plane, "
+ "use format=raw,cache=none,aio=native");
+ return false;
+ }
+
+ s = g_new0(VirtIOBlockDataPlane, 1);
+ s->vdev = vdev;
+ s->fd = fd;
+ s->blk = blk;
+
+ /* Prevent block operations that conflict with data plane thread */
+ bdrv_set_in_use(blk->conf.bs, 1);
+
+ error_setg(&s->migration_blocker,
+ "x-data-plane does not support migration");
+ migrate_add_blocker(s->migration_blocker);
+
+ *dataplane = s;
+ return true;
+}
+
+void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
+{
+ if (!s) {
+ return;
+ }
+
+ virtio_blk_data_plane_stop(s);
+ migrate_del_blocker(s->migration_blocker);
+ error_free(s->migration_blocker);
+ bdrv_set_in_use(s->blk->conf.bs, 0);
+ g_free(s);
+}
+
+void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s)
+{
+ VirtQueue *vq;
+ int i;
+
+ if (s->started) {
+ return;
+ }
+
+ vq = virtio_get_queue(s->vdev, 0);
+ if (!vring_setup(&s->vring, s->vdev, 0)) {
+ return;
+ }
+
+ event_poll_init(&s->event_poll);
+
+ /* Set up guest notifier (irq) */
+ if (s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque,
+ true) != 0) {
+ fprintf(stderr, "virtio-blk failed to set guest notifier, "
+ "ensure -enable-kvm is set\n");
+ exit(1);
+ }
+ s->guest_notifier = virtio_queue_get_guest_notifier(vq);
+
+ /* Set up virtqueue notify */
+ if (s->vdev->binding->set_host_notifier(s->vdev->binding_opaque,
+ 0, true) != 0) {
+ fprintf(stderr, "virtio-blk failed to set host notifier\n");
+ exit(1);
+ }
+ event_poll_add(&s->event_poll, &s->notify_handler,
+ virtio_queue_get_host_notifier(vq),
+ handle_notify);
+
+ /* Set up ioqueue */
+ ioq_init(&s->ioqueue, s->fd, REQ_MAX);
+ for (i = 0; i < ARRAY_SIZE(s->requests); i++) {
+ ioq_put_iocb(&s->ioqueue, &s->requests[i].iocb);
+ }
+ event_poll_add(&s->event_poll, &s->io_handler,
+ ioq_get_notifier(&s->ioqueue), handle_io);
+
+ s->started = true;
+ trace_virtio_blk_data_plane_start(s);
+
+ /* Kick right away to begin processing requests already in vring */
+ event_notifier_set(virtio_queue_get_host_notifier(vq));
+
+ /* Spawn thread in BH so it inherits iothread cpusets */
+ s->start_bh = qemu_bh_new(start_data_plane_bh, s);
+ qemu_bh_schedule(s->start_bh);
+}
+
+void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s)
+{
+ if (!s->started) {
+ return;
+ }
+ s->started = false;
+ trace_virtio_blk_data_plane_stop(s);
+
+ /* Stop thread or cancel pending thread creation BH */
+ if (s->start_bh) {
+ qemu_bh_delete(s->start_bh);
+ s->start_bh = NULL;
+ } else {
+ event_poll_notify(&s->event_poll);
+ qemu_thread_join(&s->thread);
+ }
+
+ ioq_cleanup(&s->ioqueue);
+
+ s->vdev->binding->set_host_notifier(s->vdev->binding_opaque, 0, false);
+
+ event_poll_cleanup(&s->event_poll);
+
+ /* Clean up guest notifier (irq) */
+ s->vdev->binding->set_guest_notifiers(s->vdev->binding_opaque, false);
+
+ vring_teardown(&s->vring);
+}
diff --git a/hw/dataplane/virtio-blk.h b/hw/dataplane/virtio-blk.h
new file mode 100644
index 0000000..1e8fdfe
--- /dev/null
+++ b/hw/dataplane/virtio-blk.h
@@ -0,0 +1,29 @@
+/*
+ * Dedicated thread for virtio-blk I/O processing
+ *
+ * Copyright 2012 IBM, Corp.
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef HW_DATAPLANE_VIRTIO_BLK_H
+#define HW_DATAPLANE_VIRTIO_BLK_H
+
+#include "hw/virtio.h"
+
+typedef struct VirtIOBlockDataPlane VirtIOBlockDataPlane;
+
+bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *blk,
+ VirtIOBlockDataPlane **dataplane);
+void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s);
+void virtio_blk_data_plane_start(VirtIOBlockDataPlane *s);
+void virtio_blk_data_plane_stop(VirtIOBlockDataPlane *s);
+void virtio_blk_data_plane_drain(VirtIOBlockDataPlane *s);
+
+#endif /* HW_DATAPLANE_VIRTIO_BLK_H */
diff --git a/hw/virtio-blk.h b/hw/virtio-blk.h
index 454f445..43ca492 100644
--- a/hw/virtio-blk.h
+++ b/hw/virtio-blk.h
@@ -105,6 +105,7 @@ struct VirtIOBlkConf
char *serial;
uint32_t scsi;
uint32_t config_wce;
+ uint32_t data_plane;
};
#define DEFINE_VIRTIO_BLK_FEATURES(_state, _field) \
diff --git a/trace-events b/trace-events
index 167d776..4023a4c 100644
--- a/trace-events
+++ b/trace-events
@@ -98,6 +98,12 @@ virtio_blk_rw_complete(void *req, int ret) "req %p ret %d"
virtio_blk_handle_write(void *req, uint64_t sector, size_t nsectors) "req %p sector %"PRIu64" nsectors %zu"
virtio_blk_handle_read(void *req, uint64_t sector, size_t nsectors) "req %p sector %"PRIu64" nsectors %zu"
+# hw/dataplane/virtio-blk.c
+virtio_blk_data_plane_start(void *s) "dataplane %p"
+virtio_blk_data_plane_stop(void *s) "dataplane %p"
+virtio_blk_data_plane_process_request(void *s, unsigned int out_num, unsigned int in_num, unsigned int head) "dataplane %p out_num %u in_num %u head %u"
+virtio_blk_data_plane_complete_request(void *s, unsigned int head, int ret) "dataplane %p head %u ret %d"
+
# hw/dataplane/vring.c
vring_setup(uint64_t physical, void *desc, void *avail, void *used) "vring physical %#"PRIx64" desc %p avail %p used %p"
--
1.8.0.2
^ permalink raw reply related [flat|nested] 20+ messages in thread