* [Qemu-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
@ 2008-04-17 19:26 Anthony Liguori
2008-04-17 19:26 ` [Qemu-devel] [PATCH 2/3] Split out posix-aio code Anthony Liguori
` (2 more replies)
0 siblings, 3 replies; 31+ messages in thread
From: Anthony Liguori @ 2008-04-17 19:26 UTC (permalink / raw)
To: qemu-devel; +Cc: kvm-devel, Anthony Liguori, Marcelo Tosatti
Posix AIO, especially as used by QEMU, is not very efficient for disk IO.
This patch introduces an AIO abstraction to allow multiple AIO implementations
to be used. We can't simply replace posix-aio with linux-aio because linux-aio
only works on some filesystems and only with files opened with O_DIRECT.
This patch adds a command line option (-aio) to select the AIO implementation
to be used. It avoids code motion to allow for easy review. The next patch
separates out the posix-aio implementation.
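For illustration only (not part of the diff below), a backend plugs into the
new interface roughly like this; the my_* names and "mybackend" are made up:

    static AIODriver my_aio_drv = {
        .name       = "mybackend",
        .aiocb_size = sizeof(MyAIOCB),
        .aio_init   = my_init,
        .aio_flush  = my_flush,
        .aio_wait   = my_wait,
        .aio_submit = my_submit,
        .aio_cancel = my_cancel,
    };

    int my_aio_init(void)
    {
        return qemu_register_aio(&my_aio_drv);
    }

The backend is then selected at startup with "-aio mybackend", which ends up
calling qemu_set_aio_driver("mybackend").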
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
diff --git a/block-aio.h b/block-aio.h
new file mode 100644
index 0000000..2fe8c58
--- /dev/null
+++ b/block-aio.h
@@ -0,0 +1,46 @@
+/*
+ * QEMU Block AIO API
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_AIO_H
+#define QEMU_AIO_H
+
+#include "qemu-common.h"
+#include "block.h"
+
+typedef struct AIODriver
+{
+ const char *name;
+ size_t aiocb_size;
+ void (*aio_init)(void);
+ void (*aio_wait_start)(void);
+ void (*aio_wait)(void);
+ void (*aio_wait_end)(void);
+ void (*aio_flush)(void);
+ BlockDriverAIOCB *(*aio_submit)(BlockDriverState *bs, int fd,
+ int64_t sector_num, void *buf,
+ int sectors, int write,
+ BlockDriverCompletionFunc *cb,
+ void *opaque);
+ void (*aio_cancel)(BlockDriverAIOCB *aiocb);
+ struct AIODriver *next;
+} AIODriver;
+
+int qemu_register_aio(AIODriver *drv);
+
+int qemu_set_aio_driver(const char *name);
+
+extern AIODriver *aio_drv;
+
+int posix_aio_init(void);
+
+#endif
diff --git a/block-raw-posix.c b/block-raw-posix.c
index 6b0009e..fee8422 100644
--- a/block-raw-posix.c
+++ b/block-raw-posix.c
@@ -27,6 +27,7 @@
#include "exec-all.h"
#endif
#include "block_int.h"
+#include "block-aio.h"
#include <assert.h>
#include <aio.h>
@@ -243,6 +244,11 @@ static int aio_sig_num = SIGUSR2;
static RawAIOCB *first_aio; /* AIO issued */
static int aio_initialized = 0;
+static void pa_poll(void *opaque);
+static void pa_wait_start(void);
+static void pa_wait(void);
+static void pa_wait_end(void);
+
static void aio_signal_handler(int signum)
{
#ifndef QEMU_IMG
@@ -259,11 +265,13 @@ static void aio_signal_handler(int signum)
#endif
}
-void qemu_aio_init(void)
+static void pa_init(void)
{
struct sigaction act;
- aio_initialized = 1;
+#ifndef QEMU_IMG
+ qemu_register_poll(pa_poll, NULL);
+#endif
sigfillset(&act.sa_mask);
act.sa_flags = 0; /* do not restart syscalls to interrupt select() */
@@ -284,7 +292,7 @@ void qemu_aio_init(void)
#endif
}
-void qemu_aio_poll(void)
+static void pa_poll(void *opaque)
{
RawAIOCB *acb, **pacb;
int ret;
@@ -326,31 +334,29 @@ void qemu_aio_poll(void)
}
/* Wait for all IO requests to complete. */
-void qemu_aio_flush(void)
+static void pa_flush(void)
{
- qemu_aio_wait_start();
- qemu_aio_poll();
+ pa_wait_start();
+ pa_poll(NULL);
while (first_aio) {
- qemu_aio_wait();
+ pa_wait();
}
- qemu_aio_wait_end();
+ pa_wait_end();
}
/* wait until at least one AIO was handled */
static sigset_t wait_oset;
-void qemu_aio_wait_start(void)
+static void pa_wait_start(void)
{
sigset_t set;
- if (!aio_initialized)
- qemu_aio_init();
sigemptyset(&set);
sigaddset(&set, aio_sig_num);
sigprocmask(SIG_BLOCK, &set, &wait_oset);
}
-void qemu_aio_wait(void)
+static void pa_wait(void)
{
sigset_t set;
int nb_sigs;
@@ -362,19 +368,18 @@ void qemu_aio_wait(void)
sigemptyset(&set);
sigaddset(&set, aio_sig_num);
sigwait(&set, &nb_sigs);
- qemu_aio_poll();
+ pa_poll(NULL);
}
-void qemu_aio_wait_end(void)
+static void pa_wait_end(void)
{
sigprocmask(SIG_SETMASK, &wait_oset, NULL);
}
-static RawAIOCB *raw_aio_setup(BlockDriverState *bs,
+static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int fd,
int64_t sector_num, uint8_t *buf, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque)
{
- BDRVRawState *s = bs->opaque;
RawAIOCB *acb;
if (fd_open(bs) < 0)
@@ -383,7 +388,7 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs,
acb = qemu_aio_get(bs, cb, opaque);
if (!acb)
return NULL;
- acb->aiocb.aio_fildes = s->fd;
+ acb->aiocb.aio_fildes = fd;
acb->aiocb.aio_sigevent.sigev_signo = aio_sig_num;
acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
acb->aiocb.aio_buf = buf;
@@ -397,39 +402,32 @@ static RawAIOCB *raw_aio_setup(BlockDriverState *bs,
return acb;
}
-static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs,
- int64_t sector_num, uint8_t *buf, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
+static BlockDriverAIOCB *pa_submit(BlockDriverState *bs,
+ int fd, int64_t sector_num,
+ void *buf, int nb_sectors, int write,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
{
RawAIOCB *acb;
+ int err;
- acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
+ acb = raw_aio_setup(bs, fd, sector_num, buf, nb_sectors, cb, opaque);
if (!acb)
return NULL;
- if (aio_read(&acb->aiocb) < 0) {
- qemu_aio_release(acb);
- return NULL;
- }
- return &acb->common;
-}
-static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs,
- int64_t sector_num, const uint8_t *buf, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
-{
- RawAIOCB *acb;
+ if (write)
+ err = aio_write(&acb->aiocb);
+ else
+ err = aio_read(&acb->aiocb);
- acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque);
- if (!acb)
- return NULL;
- if (aio_write(&acb->aiocb) < 0) {
+ if (err < 0) {
qemu_aio_release(acb);
return NULL;
}
return &acb->common;
}
-static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
+static void pa_cancel(BlockDriverAIOCB *blockacb)
{
int ret;
RawAIOCB *acb = (RawAIOCB *)blockacb;
@@ -456,6 +454,91 @@ static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
}
}
+static AIODriver posix_aio_drv = {
+ .name = "posix",
+ .aiocb_size = sizeof(RawAIOCB),
+ .aio_init = pa_init,
+ .aio_wait_start = pa_wait_start,
+ .aio_wait = pa_wait,
+ .aio_wait_end = pa_wait_end,
+ .aio_flush = pa_flush,
+ .aio_submit = pa_submit,
+ .aio_cancel = pa_cancel,
+};
+
+int posix_aio_init(void)
+{
+ return qemu_register_aio(&posix_aio_drv);
+}
+
+void qemu_aio_init(void)
+{
+ if (aio_initialized)
+ return;
+
+ aio_initialized = 1;
+ bdrv_host_device.aiocb_size = aio_drv->aiocb_size;
+ bdrv_raw.aiocb_size = aio_drv->aiocb_size;
+ if (aio_drv->aio_init)
+ aio_drv->aio_init();
+}
+
+void qemu_aio_flush(void)
+{
+ qemu_aio_init();
+ aio_drv->aio_flush();
+}
+
+void qemu_aio_wait_start(void)
+{
+ qemu_aio_init();
+ if (aio_drv->aio_wait_start)
+ aio_drv->aio_wait_start();
+}
+
+void qemu_aio_wait(void)
+{
+ qemu_aio_init();
+ aio_drv->aio_wait();
+}
+
+void qemu_aio_wait_end(void)
+{
+ if (aio_drv->aio_wait_end)
+ aio_drv->aio_wait_end();
+}
+
+static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs,
+ int64_t sector_num, uint8_t *buf, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVRawState *s = bs->opaque;
+
+ if (fd_open(bs) < 0)
+ return NULL;
+
+ return aio_drv->aio_submit(bs, s->fd, sector_num, buf, nb_sectors, 0,
+ cb, opaque);
+}
+
+static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs,
+ int64_t sector_num, const uint8_t *buf, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVRawState *s = bs->opaque;
+
+ if (fd_open(bs) < 0)
+ return NULL;
+
+ return aio_drv->aio_submit(bs, s->fd, sector_num, (void *)buf, nb_sectors,
+ 1, cb, opaque);
+}
+
+static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
+{
+ aio_drv->aio_cancel(blockacb);
+}
+
static void raw_close(BlockDriverState *bs)
{
BDRVRawState *s = bs->opaque;
@@ -559,7 +642,6 @@ BlockDriver bdrv_raw = {
.bdrv_aio_read = raw_aio_read,
.bdrv_aio_write = raw_aio_write,
.bdrv_aio_cancel = raw_aio_cancel,
- .aiocb_size = sizeof(RawAIOCB),
.protocol_name = "file",
.bdrv_pread = raw_pread,
.bdrv_pwrite = raw_pwrite,
@@ -911,7 +993,6 @@ BlockDriver bdrv_host_device = {
.bdrv_aio_read = raw_aio_read,
.bdrv_aio_write = raw_aio_write,
.bdrv_aio_cancel = raw_aio_cancel,
- .aiocb_size = sizeof(RawAIOCB),
.bdrv_pread = raw_pread,
.bdrv_pwrite = raw_pwrite,
.bdrv_getlength = raw_getlength,
diff --git a/block-raw-win32.c b/block-raw-win32.c
index 43d3f6c..6b40a27 100644
--- a/block-raw-win32.c
+++ b/block-raw-win32.c
@@ -350,10 +350,6 @@ void qemu_aio_init(void)
{
}
-void qemu_aio_poll(void)
-{
-}
-
void qemu_aio_flush(void)
{
}
diff --git a/block.c b/block.c
index eb610e0..44cb747 100644
--- a/block.c
+++ b/block.c
@@ -26,6 +26,7 @@
#include "console.h"
#endif
#include "block_int.h"
+#include "block-aio.h"
#ifdef _BSD
#include <sys/types.h>
@@ -1347,6 +1348,9 @@ void bdrv_init(void)
bdrv_register(&bdrv_vvfat);
bdrv_register(&bdrv_qcow2);
bdrv_register(&bdrv_parallels);
+#ifndef _WIN32
+ posix_aio_init();
+#endif
}
void *qemu_aio_get(BlockDriverState *bs, BlockDriverCompletionFunc *cb,
@@ -1378,6 +1382,40 @@ void qemu_aio_release(void *p)
drv->free_aiocb = acb;
}
+static AIODriver *aio_driver_list;
+AIODriver *aio_drv;
+
+int qemu_register_aio(AIODriver *drv)
+{
+ drv->next = aio_driver_list;
+ aio_driver_list = drv;
+ aio_drv = aio_driver_list;
+
+ return 0;
+}
+
+int qemu_set_aio_driver(const char *name)
+{
+ AIODriver *drv;
+
+ if (!strcmp(name, "?")) {
+ printf("Available aio drivers:\n");
+ for (drv = aio_driver_list; drv; drv = drv->next) {
+ printf("%s\n", drv->name);
+ }
+ exit(0);
+ }
+
+ for (drv = aio_driver_list; drv; drv = drv->next) {
+ if (!strcmp(name, drv->name))
+ break;
+ }
+
+ aio_drv = drv;
+
+ return 0;
+}
+
/**************************************************************/
/* removable device support */
diff --git a/block.h b/block.h
index 9d30db2..ff19425 100644
--- a/block.h
+++ b/block.h
@@ -94,7 +94,6 @@ BlockDriverAIOCB *bdrv_aio_write(BlockDriverState *bs, int64_t sector_num,
void bdrv_aio_cancel(BlockDriverAIOCB *acb);
void qemu_aio_init(void);
-void qemu_aio_poll(void);
void qemu_aio_flush(void);
void qemu_aio_wait_start(void);
void qemu_aio_wait(void);
diff --git a/sysemu.h b/sysemu.h
index 0078190..9931139 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -41,6 +41,8 @@ void qemu_system_powerdown(void);
#endif
void qemu_system_reset(void);
+void qemu_register_poll(IOHandler *poll, void *opaque);
+
void cpu_save(QEMUFile *f, void *opaque);
int cpu_load(QEMUFile *f, void *opaque, int version_id);
diff --git a/vl.c b/vl.c
index cc328b0..cebcdc3 100644
--- a/vl.c
+++ b/vl.c
@@ -36,6 +36,7 @@
#include "qemu-timer.h"
#include "qemu-char.h"
#include "block.h"
+#include "block-aio.h"
#include "audio/audio.h"
#include "balloon.h"
@@ -7371,6 +7372,33 @@ void qemu_bh_delete(QEMUBH *bh)
qemu_free(bh);
}
+ /***********************************************************/
+/* poll handlers */
+
+typedef struct PollHandler
+{
+ IOHandler *func;
+ void *opaque;
+ struct PollHandler *next;
+} PollHandler;
+
+static PollHandler *poll_handlers;
+
+void qemu_register_poll(IOHandler *poll, void *opaque)
+{
+ PollHandler *p;
+
+ p = qemu_mallocz(sizeof(*p));
+ if (p == NULL)
+ return;
+
+ p->func = poll;
+ p->opaque = opaque;
+ p->next = poll_handlers;
+
+ poll_handlers = p;
+}
+
/***********************************************************/
/* machine registration */
@@ -7689,7 +7717,12 @@ void main_loop_wait(int timeout)
slirp_select_poll(&rfds, &wfds, &xfds);
}
#endif
- qemu_aio_poll();
+ if (poll_handlers) {
+ PollHandler *poll;
+
+ for (poll = poll_handlers; poll; poll = poll->next)
+ poll->func(poll->opaque);
+ }
if (vm_running) {
qemu_run_timers(&active_timers[QEMU_TIMER_VIRTUAL],
@@ -7928,6 +7961,8 @@ static void help(int exitcode)
"-clock force the use of the given methods for timer alarm.\n"
" To see what timers are available use -clock ?\n"
"-startdate select initial date of the clock\n"
+ "-aio string Force aio type `string'\n"
+ " Use -aio ? to see available aio types.\n"
"\n"
"During emulation, the following keys are useful:\n"
"ctrl-alt-f toggle full screen\n"
@@ -8031,6 +8066,7 @@ enum {
QEMU_OPTION_old_param,
QEMU_OPTION_clock,
QEMU_OPTION_startdate,
+ QEMU_OPTION_aio,
};
typedef struct QEMUOption {
@@ -8142,6 +8178,7 @@ const QEMUOption qemu_options[] = {
#endif
{ "clock", HAS_ARG, QEMU_OPTION_clock },
{ "startdate", HAS_ARG, QEMU_OPTION_startdate },
+ { "aio", HAS_ARG, QEMU_OPTION_aio },
{ NULL },
};
@@ -8417,6 +8454,7 @@ int main(int argc, char **argv)
int fds[2];
const char *pid_file = NULL;
VLANState *vlan;
+ const char *aio_opt = NULL;
LIST_INIT (&vm_change_state_head);
#ifndef _WIN32
@@ -8991,6 +9029,9 @@ int main(int argc, char **argv)
}
}
break;
+ case QEMU_OPTION_aio:
+ aio_opt = optarg;
+ break;
}
}
}
@@ -9075,7 +9116,6 @@ int main(int argc, char **argv)
init_timers();
init_timer_alarm();
- qemu_aio_init();
#ifdef _WIN32
socket_init();
@@ -9146,6 +9186,11 @@ int main(int argc, char **argv)
bdrv_init();
+ if (aio_opt)
+ qemu_set_aio_driver(aio_opt);
+
+ qemu_aio_init();
+
/* we always create the cdrom drive, even if no disk is there */
if (nb_drives_opt < MAX_DRIVES)
^ permalink raw reply related [flat|nested] 31+ messages in thread
* [Qemu-devel] [PATCH 2/3] Split out posix-aio code
2008-04-17 19:26 [Qemu-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations Anthony Liguori
@ 2008-04-17 19:26 ` Anthony Liguori
2008-04-17 19:26 ` [Qemu-devel] [PATCH 3/3] Implement linux-aio backend Anthony Liguori
2008-04-17 19:38 ` [Qemu-devel] Re: [kvm-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations Daniel P. Berrange
2 siblings, 0 replies; 31+ messages in thread
From: Anthony Liguori @ 2008-04-17 19:26 UTC (permalink / raw)
To: qemu-devel; +Cc: kvm-devel, Anthony Liguori, Marcelo Tosatti
This patch moves the posix-aio code into a separate file. It's strictly code
motion; no new functionality is introduced.
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
diff --git a/Makefile b/Makefile
index a8df278..916f071 100644
--- a/Makefile
+++ b/Makefile
@@ -139,7 +139,7 @@ QEMU_IMG_BLOCK_OBJS = $(BLOCK_OBJS)
ifdef CONFIG_WIN32
QEMU_IMG_BLOCK_OBJS += qemu-img-block-raw-win32.o
else
-QEMU_IMG_BLOCK_OBJS += qemu-img-block-raw-posix.o
+QEMU_IMG_BLOCK_OBJS += qemu-img-block-raw-posix.o qemu-img-aio-posix.o
endif
######################################################################
diff --git a/Makefile.target b/Makefile.target
index 75de753..f635d68 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -485,7 +485,7 @@ OBJS=vl.o osdep.o monitor.o pci.o loader.o isa_mmio.o
ifdef CONFIG_WIN32
OBJS+=block-raw-win32.o
else
-OBJS+=block-raw-posix.o
+OBJS+=block-raw-posix.o aio-posix.o
endif
LIBS+=-lz
diff --git a/aio-posix.c b/aio-posix.c
new file mode 100644
index 0000000..b5fea7d
--- /dev/null
+++ b/aio-posix.c
@@ -0,0 +1,290 @@
+/*
+ * Block driver for RAW files (posix)
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#ifndef QEMU_IMG
+#include "qemu-timer.h"
+#include "exec-all.h"
+#endif
+#include "sysemu.h"
+#include "block_int.h"
+#include "block-aio.h"
+#include <assert.h>
+#include <aio.h>
+
+#ifdef CONFIG_COCOA
+#include <paths.h>
+#include <sys/param.h>
+#include <IOKit/IOKitLib.h>
+#include <IOKit/IOBSD.h>
+#include <IOKit/storage/IOMediaBSDClient.h>
+#include <IOKit/storage/IOMedia.h>
+#include <IOKit/storage/IOCDMedia.h>
+//#include <IOKit/storage/IOCDTypes.h>
+#include <CoreFoundation/CoreFoundation.h>
+#endif
+
+#ifdef __sun__
+#define _POSIX_PTHREAD_SEMANTICS 1
+#include <signal.h>
+#include <sys/dkio.h>
+#endif
+#ifdef __linux__
+#include <sys/ioctl.h>
+#include <linux/cdrom.h>
+#include <linux/fd.h>
+#endif
+#ifdef __FreeBSD__
+#include <sys/disk.h>
+#endif
+
+/***********************************************************/
+/* Unix AIO using POSIX AIO */
+
+typedef struct RawAIOCB {
+ BlockDriverAIOCB common;
+ struct aiocb aiocb;
+ struct RawAIOCB *next;
+} RawAIOCB;
+
+static int aio_sig_num = SIGUSR2;
+static RawAIOCB *first_aio; /* AIO issued */
+
+static void aio_signal_handler(int signum)
+{
+#ifndef QEMU_IMG
+ CPUState *env = cpu_single_env;
+ if (env) {
+ /* stop the currently executing cpu because a timer occured */
+ cpu_interrupt(env, CPU_INTERRUPT_EXIT);
+#ifdef USE_KQEMU
+ if (env->kqemu_enabled) {
+ kqemu_cpu_interrupt(env);
+ }
+#endif
+ }
+#endif
+}
+
+static void pa_poll(void *opaque)
+{
+ RawAIOCB *acb, **pacb;
+ int ret;
+
+ for(;;) {
+ pacb = &first_aio;
+ for(;;) {
+ acb = *pacb;
+ if (!acb)
+ goto the_end;
+ ret = aio_error(&acb->aiocb);
+ if (ret == ECANCELED) {
+ /* remove the request */
+ *pacb = acb->next;
+ qemu_aio_release(acb);
+ } else if (ret != EINPROGRESS) {
+ /* end of aio */
+ if (ret == 0) {
+ ret = aio_return(&acb->aiocb);
+ if (ret == acb->aiocb.aio_nbytes)
+ ret = 0;
+ else
+ ret = -EINVAL;
+ } else {
+ ret = -ret;
+ }
+ /* remove the request */
+ *pacb = acb->next;
+ /* call the callback */
+ acb->common.cb(acb->common.opaque, ret);
+ qemu_aio_release(acb);
+ break;
+ } else {
+ pacb = &acb->next;
+ }
+ }
+ }
+ the_end: ;
+}
+
+static void pa_init(void)
+{
+ struct sigaction act;
+
+#ifndef QEMU_IMG
+ qemu_register_poll(pa_poll, NULL);
+#endif
+
+ sigfillset(&act.sa_mask);
+ act.sa_flags = 0; /* do not restart syscalls to interrupt select() */
+ act.sa_handler = aio_signal_handler;
+ sigaction(aio_sig_num, &act, NULL);
+
+#if defined(__GLIBC__) && defined(__linux__)
+ {
+ /* XXX: aio thread exit seems to hang on RedHat 9 and this init
+ seems to fix the problem. */
+ struct aioinit ai;
+ memset(&ai, 0, sizeof(ai));
+ ai.aio_threads = 1;
+ ai.aio_num = 1;
+ ai.aio_idle_time = 365 * 100000;
+ aio_init(&ai);
+ }
+#endif
+}
+
+/* wait until at least one AIO was handled */
+static sigset_t wait_oset;
+
+static void pa_wait_start(void)
+{
+ sigset_t set;
+
+ sigemptyset(&set);
+ sigaddset(&set, aio_sig_num);
+ sigprocmask(SIG_BLOCK, &set, &wait_oset);
+}
+
+static void pa_wait(void)
+{
+ sigset_t set;
+ int nb_sigs;
+
+#ifndef QEMU_IMG
+ if (qemu_bh_poll())
+ return;
+#endif
+ sigemptyset(&set);
+ sigaddset(&set, aio_sig_num);
+ sigwait(&set, &nb_sigs);
+ pa_poll(NULL);
+}
+
+static void pa_wait_end(void)
+{
+ sigprocmask(SIG_SETMASK, &wait_oset, NULL);
+}
+
+/* Wait for all IO requests to complete. */
+static void pa_flush(void)
+{
+ pa_wait_start();
+ pa_poll(NULL);
+ while (first_aio) {
+ pa_wait();
+ }
+ pa_wait_end();
+}
+
+static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int fd,
+ int64_t sector_num, uint8_t *buf, int nb_sectors,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ RawAIOCB *acb;
+
+ acb = qemu_aio_get(bs, cb, opaque);
+ if (!acb)
+ return NULL;
+ acb->aiocb.aio_fildes = fd;
+ acb->aiocb.aio_sigevent.sigev_signo = aio_sig_num;
+ acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
+ acb->aiocb.aio_buf = buf;
+ if (nb_sectors < 0)
+ acb->aiocb.aio_nbytes = -nb_sectors;
+ else
+ acb->aiocb.aio_nbytes = nb_sectors * 512;
+ acb->aiocb.aio_offset = sector_num * 512;
+ acb->next = first_aio;
+ first_aio = acb;
+ return acb;
+}
+
+static BlockDriverAIOCB *pa_submit(BlockDriverState *bs,
+ int fd, int64_t sector_num,
+ void *buf, int nb_sectors, int write,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ RawAIOCB *acb;
+ int err;
+
+ acb = raw_aio_setup(bs, fd, sector_num, buf, nb_sectors, cb, opaque);
+ if (!acb)
+ return NULL;
+
+ if (write)
+ err = aio_write(&acb->aiocb);
+ else
+ err = aio_read(&acb->aiocb);
+
+ if (err < 0) {
+ qemu_aio_release(acb);
+ return NULL;
+ }
+ return &acb->common;
+}
+
+static void pa_cancel(BlockDriverAIOCB *blockacb)
+{
+ int ret;
+ RawAIOCB *acb = (RawAIOCB *)blockacb;
+ RawAIOCB **pacb;
+
+ ret = aio_cancel(acb->aiocb.aio_fildes, &acb->aiocb);
+ if (ret == AIO_NOTCANCELED) {
+ /* fail safe: if the aio could not be canceled, we wait for
+ it */
+ while (aio_error(&acb->aiocb) == EINPROGRESS);
+ }
+
+ /* remove the callback from the queue */
+ pacb = &first_aio;
+ for(;;) {
+ if (*pacb == NULL) {
+ break;
+ } else if (*pacb == acb) {
+ *pacb = acb->next;
+ qemu_aio_release(acb);
+ break;
+ }
+ pacb = &acb->next;
+ }
+}
+
+static AIODriver posix_aio_drv = {
+ .name = "posix",
+ .aiocb_size = sizeof(RawAIOCB),
+ .aio_init = pa_init,
+ .aio_wait_start = pa_wait_start,
+ .aio_wait = pa_wait,
+ .aio_wait_end = pa_wait_end,
+ .aio_flush = pa_flush,
+ .aio_submit = pa_submit,
+ .aio_cancel = pa_cancel,
+};
+
+int posix_aio_init(void)
+{
+ return qemu_register_aio(&posix_aio_drv);
+}
diff --git a/block-raw-posix.c b/block-raw-posix.c
index fee8422..f0a111a 100644
--- a/block-raw-posix.c
+++ b/block-raw-posix.c
@@ -29,7 +29,6 @@
#include "block_int.h"
#include "block-aio.h"
#include <assert.h>
-#include <aio.h>
#ifdef CONFIG_COCOA
#include <paths.h>
@@ -232,245 +231,10 @@ label__raw_write__success:
}
/***********************************************************/
-/* Unix AIO using POSIX AIO */
-
-typedef struct RawAIOCB {
- BlockDriverAIOCB common;
- struct aiocb aiocb;
- struct RawAIOCB *next;
-} RawAIOCB;
-
-static int aio_sig_num = SIGUSR2;
-static RawAIOCB *first_aio; /* AIO issued */
+/* AIO Interface */
+
static int aio_initialized = 0;
-static void pa_poll(void *opaque);
-static void pa_wait_start(void);
-static void pa_wait(void);
-static void pa_wait_end(void);
-
-static void aio_signal_handler(int signum)
-{
-#ifndef QEMU_IMG
- CPUState *env = cpu_single_env;
- if (env) {
- /* stop the currently executing cpu because a timer occured */
- cpu_interrupt(env, CPU_INTERRUPT_EXIT);
-#ifdef USE_KQEMU
- if (env->kqemu_enabled) {
- kqemu_cpu_interrupt(env);
- }
-#endif
- }
-#endif
-}
-
-static void pa_init(void)
-{
- struct sigaction act;
-
-#ifndef QEMU_IMG
- qemu_register_poll(pa_poll, NULL);
-#endif
-
- sigfillset(&act.sa_mask);
- act.sa_flags = 0; /* do not restart syscalls to interrupt select() */
- act.sa_handler = aio_signal_handler;
- sigaction(aio_sig_num, &act, NULL);
-
-#if defined(__GLIBC__) && defined(__linux__)
- {
- /* XXX: aio thread exit seems to hang on RedHat 9 and this init
- seems to fix the problem. */
- struct aioinit ai;
- memset(&ai, 0, sizeof(ai));
- ai.aio_threads = 1;
- ai.aio_num = 1;
- ai.aio_idle_time = 365 * 100000;
- aio_init(&ai);
- }
-#endif
-}
-
-static void pa_poll(void *opaque)
-{
- RawAIOCB *acb, **pacb;
- int ret;
-
- for(;;) {
- pacb = &first_aio;
- for(;;) {
- acb = *pacb;
- if (!acb)
- goto the_end;
- ret = aio_error(&acb->aiocb);
- if (ret == ECANCELED) {
- /* remove the request */
- *pacb = acb->next;
- qemu_aio_release(acb);
- } else if (ret != EINPROGRESS) {
- /* end of aio */
- if (ret == 0) {
- ret = aio_return(&acb->aiocb);
- if (ret == acb->aiocb.aio_nbytes)
- ret = 0;
- else
- ret = -EINVAL;
- } else {
- ret = -ret;
- }
- /* remove the request */
- *pacb = acb->next;
- /* call the callback */
- acb->common.cb(acb->common.opaque, ret);
- qemu_aio_release(acb);
- break;
- } else {
- pacb = &acb->next;
- }
- }
- }
- the_end: ;
-}
-
-/* Wait for all IO requests to complete. */
-static void pa_flush(void)
-{
- pa_wait_start();
- pa_poll(NULL);
- while (first_aio) {
- pa_wait();
- }
- pa_wait_end();
-}
-
-/* wait until at least one AIO was handled */
-static sigset_t wait_oset;
-
-static void pa_wait_start(void)
-{
- sigset_t set;
-
- sigemptyset(&set);
- sigaddset(&set, aio_sig_num);
- sigprocmask(SIG_BLOCK, &set, &wait_oset);
-}
-
-static void pa_wait(void)
-{
- sigset_t set;
- int nb_sigs;
-
-#ifndef QEMU_IMG
- if (qemu_bh_poll())
- return;
-#endif
- sigemptyset(&set);
- sigaddset(&set, aio_sig_num);
- sigwait(&set, &nb_sigs);
- pa_poll(NULL);
-}
-
-static void pa_wait_end(void)
-{
- sigprocmask(SIG_SETMASK, &wait_oset, NULL);
-}
-
-static RawAIOCB *raw_aio_setup(BlockDriverState *bs, int fd,
- int64_t sector_num, uint8_t *buf, int nb_sectors,
- BlockDriverCompletionFunc *cb, void *opaque)
-{
- RawAIOCB *acb;
-
- if (fd_open(bs) < 0)
- return NULL;
-
- acb = qemu_aio_get(bs, cb, opaque);
- if (!acb)
- return NULL;
- acb->aiocb.aio_fildes = fd;
- acb->aiocb.aio_sigevent.sigev_signo = aio_sig_num;
- acb->aiocb.aio_sigevent.sigev_notify = SIGEV_SIGNAL;
- acb->aiocb.aio_buf = buf;
- if (nb_sectors < 0)
- acb->aiocb.aio_nbytes = -nb_sectors;
- else
- acb->aiocb.aio_nbytes = nb_sectors * 512;
- acb->aiocb.aio_offset = sector_num * 512;
- acb->next = first_aio;
- first_aio = acb;
- return acb;
-}
-
-static BlockDriverAIOCB *pa_submit(BlockDriverState *bs,
- int fd, int64_t sector_num,
- void *buf, int nb_sectors, int write,
- BlockDriverCompletionFunc *cb,
- void *opaque)
-{
- RawAIOCB *acb;
- int err;
-
- acb = raw_aio_setup(bs, fd, sector_num, buf, nb_sectors, cb, opaque);
- if (!acb)
- return NULL;
-
- if (write)
- err = aio_write(&acb->aiocb);
- else
- err = aio_read(&acb->aiocb);
-
- if (err < 0) {
- qemu_aio_release(acb);
- return NULL;
- }
- return &acb->common;
-}
-
-static void pa_cancel(BlockDriverAIOCB *blockacb)
-{
- int ret;
- RawAIOCB *acb = (RawAIOCB *)blockacb;
- RawAIOCB **pacb;
-
- ret = aio_cancel(acb->aiocb.aio_fildes, &acb->aiocb);
- if (ret == AIO_NOTCANCELED) {
- /* fail safe: if the aio could not be canceled, we wait for
- it */
- while (aio_error(&acb->aiocb) == EINPROGRESS);
- }
-
- /* remove the callback from the queue */
- pacb = &first_aio;
- for(;;) {
- if (*pacb == NULL) {
- break;
- } else if (*pacb == acb) {
- *pacb = acb->next;
- qemu_aio_release(acb);
- break;
- }
- pacb = &acb->next;
- }
-}
-
-static AIODriver posix_aio_drv = {
- .name = "posix",
- .aiocb_size = sizeof(RawAIOCB),
- .aio_init = pa_init,
- .aio_wait_start = pa_wait_start,
- .aio_wait = pa_wait,
- .aio_wait_end = pa_wait_end,
- .aio_flush = pa_flush,
- .aio_submit = pa_submit,
- .aio_cancel = pa_cancel,
-};
-
-int posix_aio_init(void)
-{
- return qemu_register_aio(&posix_aio_drv);
-}
-
void qemu_aio_init(void)
{
if (aio_initialized)
^ permalink raw reply related [flat|nested] 31+ messages in thread
* [Qemu-devel] [PATCH 3/3] Implement linux-aio backend
2008-04-17 19:26 [Qemu-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations Anthony Liguori
2008-04-17 19:26 ` [Qemu-devel] [PATCH 2/3] Split out posix-aio code Anthony Liguori
@ 2008-04-17 19:26 ` Anthony Liguori
2008-04-18 15:09 ` [Qemu-devel] " Marcelo Tosatti
2008-04-17 19:38 ` [Qemu-devel] Re: [kvm-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations Daniel P. Berrange
2 siblings, 1 reply; 31+ messages in thread
From: Anthony Liguori @ 2008-04-17 19:26 UTC (permalink / raw)
To: qemu-devel; +Cc: kvm-devel, Anthony Liguori, Marcelo Tosatti
This patch introduces a Linux-aio backend that is disabled by default. To
use this backend effectively, the user should disable caching and select
it with the appropriate -aio option. For instance:
qemu-system-x86_64 -drive foo.img,cache=off -aio linux
There's no universal way to wait asynchronously with linux-aio. At some point,
signals were added to signal completion. More recently, an eventfd interface
was added. This patch relies on the latter.
We try hard to detect whether the right support is available in configure to
avoid compile failures.
Signed-off-by: Anthony Liguori <aliguori@us.ibm.com>
diff --git a/Makefile.target b/Makefile.target
index f635d68..289887c 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -487,6 +487,9 @@ OBJS+=block-raw-win32.o
else
OBJS+=block-raw-posix.o aio-posix.o
endif
+ifdef CONFIG_LINUX_AIO
+OBJS+=aio-linux.o
+endif
LIBS+=-lz
ifdef CONFIG_ALSA
diff --git a/aio-linux.c b/aio-linux.c
new file mode 100644
index 0000000..f5c222b
--- /dev/null
+++ b/aio-linux.c
@@ -0,0 +1,210 @@
+/*
+ * QEMU Linux AIO Support
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "qemu-char.h"
+#include "block.h"
+#include "block_int.h"
+#include "block-aio.h"
+#include "sysemu.h"
+
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <linux/aio_abi.h>
+
+int eventfd(unsigned int initval)
+{
+ return syscall(SYS_eventfd, initval);
+}
+
+int io_setup(unsigned nr_reqs, aio_context_t *ctx_id)
+{
+ return syscall(SYS_io_setup, nr_reqs, ctx_id);
+}
+
+int io_destroy(aio_context_t ctx_id)
+{
+ return syscall(SYS_io_destroy, ctx_id);
+}
+
+int io_getevents(aio_context_t ctx_id, long min_nr, long nr,
+ struct io_event *events, struct timespec *timeout)
+{
+ return syscall(SYS_io_getevents, ctx_id, min_nr, nr, events, timeout);
+}
+
+int io_submit(aio_context_t ctx_id, long nr, struct iocb **iocb)
+{
+ return syscall(SYS_io_submit, ctx_id, nr, iocb);
+}
+
+int io_cancel(aio_context_t ctx_id, struct iocb *iocb, struct io_event *result)
+{
+ return syscall(SYS_io_cancel, ctx_id, iocb, result);
+}
+
+typedef struct LinuxAIOCB {
+ BlockDriverAIOCB common;
+ struct iocb iocb;
+} LinuxAIOCB;
+
+static int aio_efd;
+static aio_context_t aio_ctxt_id;
+static int outstanding_requests;
+
+static BlockDriverAIOCB *la_submit(BlockDriverState *bs,
+ int fd, int64_t sector_num,
+ void *buf, int nb_sectors, int write,
+ BlockDriverCompletionFunc *cb,
+ void *opaque)
+{
+ LinuxAIOCB *aiocb;
+ struct iocb *iocbs[1];
+ int err;
+
+ aiocb = qemu_aio_get(bs, cb, opaque);
+ if (!aiocb) {
+ printf("returning null??\n");
+ return NULL;
+ }
+
+ if (write)
+ aiocb->iocb.aio_lio_opcode = IOCB_CMD_PWRITE;
+ else
+ aiocb->iocb.aio_lio_opcode = IOCB_CMD_PREAD;
+
+ aiocb->iocb.aio_data = (unsigned long)aiocb;
+ aiocb->iocb.aio_fildes = fd;
+ aiocb->iocb.aio_flags = IOCB_FLAG_RESFD;
+ aiocb->iocb.aio_resfd = aio_efd;
+ aiocb->iocb.aio_buf = (unsigned long)buf;
+ aiocb->iocb.aio_nbytes = nb_sectors * 512;
+ aiocb->iocb.aio_offset = sector_num * 512;
+
+ iocbs[0] = &aiocb->iocb;
+
+ do {
+ err = io_submit(aio_ctxt_id, 1, iocbs);
+ } while (err == -1 && errno == EINTR);
+
+ if (err != 1) {
+ fprintf(stderr, "failed to submit aio request: %m\n");
+ exit(1);
+ }
+
+ outstanding_requests++;
+
+ return &aiocb->common;
+}
+
+static void la_wait(void)
+{
+ main_loop_wait(10);
+}
+
+static void la_flush(void)
+{
+ while (outstanding_requests)
+ la_wait();
+}
+
+static void la_cancel(BlockDriverAIOCB *baiocb)
+{
+ LinuxAIOCB *aiocb = (void *)baiocb;
+ struct io_event result;
+ int err;
+
+ do {
+ err = io_cancel(aio_ctxt_id, &aiocb->iocb, &result);
+ } while (err == -1 && errno == EINTR);
+
+ /* it may have happened... we probably should check and complete */
+
+ outstanding_requests--;
+
+ qemu_aio_release(aiocb);
+}
+
+static void la_completion(void *opaque)
+{
+ struct io_event events[256];
+ struct timespec ts = {0, 0};
+ uint64_t count;
+ int i, ret;
+
+ do {
+ ret = read(aio_efd, &count, sizeof(count));
+ } while (ret == -1 && errno == EINTR);
+
+ if (ret != 8) {
+ fprintf(stderr, "bad read from eventfd\n");
+ exit(1);
+ }
+
+ do {
+ ret = io_getevents(aio_ctxt_id, count, ARRAY_SIZE(events),
+ events, &ts);
+ } while (ret == -1 && errno == EINTR);
+
+ if (ret < count) {
+ fprintf(stderr, "io_getevents failed\n");
+ exit(1);
+ }
+
+ for (i = 0; i < ret; i++) {
+ LinuxAIOCB *aiocb;
+ int res;
+
+ aiocb = (LinuxAIOCB *)(unsigned long)events[i].data;
+ res = events[i].res;
+
+ if (res > 0)
+ res = 0;
+
+ aiocb->common.cb(aiocb->common.opaque, res);
+ qemu_aio_release(aiocb);
+
+ outstanding_requests--;
+ }
+}
+
+static void la_init(void)
+{
+ aio_efd = eventfd(0);
+ if (aio_efd == -1) {
+ fprintf(stderr, "failed to allocate aio fd\n");
+ exit(1);
+ }
+
+ if (io_setup(256, &aio_ctxt_id) == -1) {
+ fprintf(stderr, "failed to initialize linux aio\n");
+ exit(1);
+ }
+
+ qemu_set_fd_handler2(aio_efd, NULL, la_completion, NULL, NULL);
+}
+
+static AIODriver linux_aio_drv = {
+ .name = "linux",
+ .aiocb_size = sizeof(LinuxAIOCB),
+ .aio_init = la_init,
+ .aio_wait = la_wait,
+ .aio_flush = la_flush,
+ .aio_submit = la_submit,
+ .aio_cancel = la_cancel,
+};
+
+int linux_aio_init(void)
+{
+ return qemu_register_aio(&linux_aio_drv);
+}
diff --git a/block-aio.h b/block-aio.h
index 2fe8c58..6e82cb5 100644
--- a/block-aio.h
+++ b/block-aio.h
@@ -42,5 +42,6 @@ int qemu_set_aio_driver(const char *name);
extern AIODriver *aio_drv;
int posix_aio_init(void);
+int linux_aio_init(void);
#endif
diff --git a/block.c b/block.c
index 44cb747..259bf3a 100644
--- a/block.c
+++ b/block.c
@@ -1349,6 +1349,11 @@ void bdrv_init(void)
bdrv_register(&bdrv_qcow2);
bdrv_register(&bdrv_parallels);
#ifndef _WIN32
+#ifndef QEMU_IMG
+#ifdef CONFIG_LINUX_AIO
+ linux_aio_init();
+#endif
+#endif
posix_aio_init();
#endif
}
diff --git a/configure b/configure
index 85cb68a..95fb88f 100755
--- a/configure
+++ b/configure
@@ -109,6 +109,7 @@ darwin_user="no"
build_docs="no"
uname_release=""
curses="yes"
+linux_aio="yes"
# OS specific
targetos=`uname -s`
@@ -326,6 +327,8 @@ for opt do
;;
--disable-curses) curses="no"
;;
+ --disable-linux-aio) linux_aio="no"
+ ;;
*) echo "ERROR: unknown option $opt"; show_help="yes"
;;
esac
@@ -418,6 +421,7 @@ echo " --enable-fmod enable FMOD audio driver"
echo " --enable-dsound enable DirectSound audio driver"
echo " --disable-vnc-tls disable TLS encryption for VNC server"
echo " --disable-curses disable curses output"
+echo " --disable-linux-aio disable Linux AIO support"
echo " --enable-system enable all system emulation targets"
echo " --disable-system disable all system emulation targets"
echo " --enable-linux-user enable all linux usermode emulation targets"
@@ -687,6 +691,24 @@ EOF
fi
fi # test "$curses"
+# linux aio probe
+
+if test "$linux_aio" = "yes" ; then
+ linux_aio=no
+ cat > $TMPC <<EOF
+#include <linux/aio_abi.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#ifndef SYS_eventfd
+#error No eventfd support
+#endif
+int main(void) { struct iocb iocb; (void)iocb.aio_resfd; return 0; }
+EOF
+ if $cc $ARCH_CFLAGS -o $TMPE $TMPC 2> /dev/null ; then
+ linux_aio=yes
+ fi
+fi
+
# Check if tools are available to build documentation.
if [ -x "`which texi2html 2>/dev/null`" ] && \
[ -x "`which pod2man 2>/dev/null`" ]; then
@@ -738,6 +760,7 @@ echo "SDL support $sdl"
if test "$sdl" != "no" ; then
echo "SDL static link $sdl_static"
fi
+echo "Linux AIO support $linux_aio"
echo "curses support $curses"
echo "mingw32 support $mingw32"
echo "Adlib support $adlib"
@@ -1001,6 +1024,10 @@ if test "$curses" = "yes" ; then
echo "CONFIG_CURSES=yes" >> $config_mak
echo "CURSES_LIBS=-lcurses" >> $config_mak
fi
+if test "$linux_aio" = "yes" ; then
+ echo "#define CONFIG_LINUX_AIO 1" >> $config_h
+ echo "CONFIG_LINUX_AIO=yes" >> $config_mak
+fi
# XXX: suppress that
if [ "$bsd" = "yes" ] ; then
^ permalink raw reply related [flat|nested] 31+ messages in thread
* [Qemu-devel] Re: [kvm-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-17 19:26 [Qemu-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations Anthony Liguori
2008-04-17 19:26 ` [Qemu-devel] [PATCH 2/3] Split out posix-aio code Anthony Liguori
2008-04-17 19:26 ` [Qemu-devel] [PATCH 3/3] Implement linux-aio backend Anthony Liguori
@ 2008-04-17 19:38 ` Daniel P. Berrange
2008-04-17 19:41 ` Anthony Liguori
2 siblings, 1 reply; 31+ messages in thread
From: Daniel P. Berrange @ 2008-04-17 19:38 UTC (permalink / raw)
To: Anthony Liguori; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel
On Thu, Apr 17, 2008 at 02:26:50PM -0500, Anthony Liguori wrote:
> Posix AIO, especially as used by QEMU, is not very efficient for disk IO.
> This patch introduces an AIO abstraction to allow multiple AIO implementations
> to be used. We can't simply replace posix-aio with linux-aio because linux-aio
> only works on some filesystems and only with files opened with O_DIRECT.
>
> This patch adds a command line option (-aio) to select the AIO implementation
> to be used. It avoids code motion to allow for easy review. The next patch
> separates out the posix-aio implementation.
This is not a very pleasant user experience. They cannot & should not be
expected to figure out which AIO impl works with their particular filesystem.
If the linux-aio impl doesn't work in some cases, then the code should detect
these and automatically fall back to posix-aio. The user should not have to
use a -aio flag to make it work.
Dan.
--
|: Red Hat, Engineering, Boston -o- http://people.redhat.com/berrange/ :|
|: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :|
|: http://autobuild.org -o- http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|
^ permalink raw reply [flat|nested] 31+ messages in thread
* [Qemu-devel] Re: [kvm-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-17 19:38 ` [Qemu-devel] Re: [kvm-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations Daniel P. Berrange
@ 2008-04-17 19:41 ` Anthony Liguori
2008-04-17 20:00 ` Daniel P. Berrange
0 siblings, 1 reply; 31+ messages in thread
From: Anthony Liguori @ 2008-04-17 19:41 UTC (permalink / raw)
To: Daniel P. Berrange; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel
Daniel P. Berrange wrote:
> On Thu, Apr 17, 2008 at 02:26:50PM -0500, Anthony Liguori wrote:
>
>> Posix AIO, especially as used by QEMU, is not very efficient for disk IO.
>> This patch introduces an AIO abstraction to allow multiple AIO implementations
>> to be used. We can't simply replace posix-aio with linux-aio because linux-aio
>> only works on some filesystems and only with files opened with O_DIRECT.
>>
>> This patch adds a command line option (-aio) to select the AIO implementation
>> to be used. It avoids code motion to allow for easy review. The next patch
>> separates out the posix-aio implementation.
>>
>
> This is not a very pleasant user experiance. They can not & should not be
> expected to figure out which AIO impl works with their particular filesystem.
> If the linux-aio impl doesn't work in some cases, then the code should detect
> these and automatically fallback to posix-aio. The user should not have to
> use a -aio flag to make it work.
>
Those cases aren't always discoverable. Linux-aio just falls back to
using synchronous IO. It's pretty terrible. We need a new AIO
interface for Linux (and yes, we're working on this). Once we have
something better, we'll change that to be the default and things will
Just Work for most users.
Regards,
Anthony Liguori
> Dan.
>
^ permalink raw reply [flat|nested] 31+ messages in thread
* [Qemu-devel] Re: [kvm-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-17 19:41 ` Anthony Liguori
@ 2008-04-17 20:00 ` Daniel P. Berrange
2008-04-17 20:05 ` Anthony Liguori
2008-04-18 12:43 ` Jamie Lokier
0 siblings, 2 replies; 31+ messages in thread
From: Daniel P. Berrange @ 2008-04-17 20:00 UTC (permalink / raw)
To: Anthony Liguori; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel
On Thu, Apr 17, 2008 at 02:41:32PM -0500, Anthony Liguori wrote:
> Daniel P. Berrange wrote:
> >On Thu, Apr 17, 2008 at 02:26:50PM -0500, Anthony Liguori wrote:
> >
> >>Posix AIO, especially as used by QEMU, is not very efficient for disk IO.
> >>This patch introduces an AIO abstraction to allow multiple AIO
> >>implementations to be used. We can't simply replace posix-aio with
> >>linux-aio because linux-aio only works on some filesystems and only
> >>with files opened with O_DIRECT.
> >>
> >>This patch adds a command line option (-aio) to select the AIO
> >>implementation to be used. It avoids code motion to allow for easy
> >>review. The next patch separates out the posix-aio implementation.
> >>
> >
> >This is not a very pleasant user experience. They cannot & should not be
> >expected to figure out which AIO impl works with their particular
> >filesystem.
> >If the linux-aio impl doesn't work in some cases, then the code should
> >detect these and automatically fall back to posix-aio. The user should
> >not have to use a -aio flag to make it work.
> >
>
> Those cases aren't always discoverable. Linux-aio just falls back to
> using synchronous IO. It's pretty terrible. We need a new AIO
> interface for Linux (and yes, we're working on this). Once we have
> something better, we'll change that to be the default and things will
> Just Work for most users.
If QEMU can't discover cases where it won't work, what criteria should
the end user use to decide between the impls, or for that matter, what
criteria should a management api/app like libvirt use? If the only decision
logic is 'try it & benchmark your VM' then it's not a particularly useful
option.
I've basically got a choice of making libvirt always add '-aio linux'
or never add it at all. My inclination is to the latter since it is
compatible with existing QEMU which has no -aio option. Presumably
'-aio linux' is intended to provide some performance benefit so it'd
be nice to use it. If we can't express some criteria under which it
should be turned on, I can't enable it; whereas if you can express
some criteria, then QEMU should apply them automatically.
Pushing this choice of AIO impls to the app or user invoking QEMU just
does not seem like a win here.
Dan.
--
|: Red Hat, Engineering, Boston -o- http://people.redhat.com/berrange/ :|
|: http://libvirt.org -o- http://virt-manager.org -o- http://ovirt.org :|
|: http://autobuild.org -o- http://search.cpan.org/~danberr/ :|
|: GnuPG: 7D3B9505 -o- F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 :|
^ permalink raw reply [flat|nested] 31+ messages in thread
* [Qemu-devel] Re: [kvm-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-17 20:00 ` Daniel P. Berrange
@ 2008-04-17 20:05 ` Anthony Liguori
2008-04-18 12:43 ` Jamie Lokier
1 sibling, 0 replies; 31+ messages in thread
From: Anthony Liguori @ 2008-04-17 20:05 UTC (permalink / raw)
To: Daniel P. Berrange; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel
Daniel P. Berrange wrote:
> If QEMU can't discover cases where it won't work, what criteria should
> the end user use to decide between the impls, or for that matter, what
> criteria should a management api/app like libvirt use ? If the only decision
> logic is 'try it & benchmark your VM' then its not a particularly useful
> option.
>
> I've basically got a choice of making libvirt always ad '-aio linux'
> or never add it at all. My inclination is to the latter since it is
> compatible with existing QEMU which has no -aio option. Presumably
> '-aio linux' is intended to provide some performance benefit so it'd
> be nice to use it. If we can't express some criteria under which it
> should be turned on, I can't enable it; where as if you can express
> some criteria, then QEMU should apply them automatically.
>
> Pushing this choice of AIO impls to the app or user invoking QEMU just
> does not seem like a win here.
>
The one thing we could possibly do is detect the cache where we see a
block device and then automagically enable cache=off and -aio linux.
Without cache=off, -aio linux is not so useful ATM. At the same time
though, not all users are going to want to disable the use of the host
page cache. It's not necessary an easy decision either way.
For libvirt, I'd recommend just never using -aio linux. We'll have a
better AIO option in the near future (based on Rusty's vringfd work) and
I'd like to detect and enable that by default.
Regards,
Anthony Liguori
> Dan.
>
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [Qemu-devel] Re: [kvm-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-17 20:00 ` Daniel P. Berrange
2008-04-17 20:05 ` Anthony Liguori
@ 2008-04-18 12:43 ` Jamie Lokier
2008-04-18 15:23 ` Anthony Liguori
1 sibling, 1 reply; 31+ messages in thread
From: Jamie Lokier @ 2008-04-18 12:43 UTC (permalink / raw)
To: Daniel P. Berrange, qemu-devel
Cc: kvm-devel, Anthony Liguori, Marcelo Tosatti
Daniel P. Berrange wrote:
> > Those cases aren't always discoverable. Linux-aio just falls back to
> > using synchronous IO. It's pretty terrible. We need a new AIO
> > interface for Linux (and yes, we're working on this). Once we have
> > something better, we'll change that to be the default and things will
> > Just Work for most users.
>
> If QEMU can't discover cases where it won't work, what criteria should
> the end user use to decide between the impls, or for that matter, what
> criteria should a management api/app like libvirt use? If the only decision
> logic is 'try it & benchmark your VM' then it's not a particularly useful
> option.
Good use of Linux-AIO requires that you basically "know" which cases
it handles well, and which ones it doesn't. Falling back to
synchronous I/O with no indication (except speed) is a pretty
atrocious API imho. But that's what the Linux folks decided to do.
I suspect what you have to do is:
1. Try opening the file with O_DIRECT.
2. Use fstat to check the filesystem type and block device type.
3. If it's on a whitelist of filesystem types,
4. and a whitelist of block device types,
5. and the kernel version is later than an fs+bd-dependent value,
6. then select an alignment size (kernel version dependent)
and use Linux-AIO with it.
Otherwise don't use Linux-AIO. You may then decide to use Glibc's
POSIX-AIO (which uses threads), or use threads for I/O yourself.
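A rough sketch of steps 1-3 above, just to illustrate (the filesystem
whitelist value is a placeholder and the block-device and kernel-version
checks are left as comments; this is not a vetted policy):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/vfs.h>

    #define EXT3_SUPER_MAGIC 0xEF53   /* placeholder whitelist entry */

    static int can_use_linux_aio(const char *filename)
    {
        struct statfs stfs;
        int fd;

        /* 1. the image must be usable with O_DIRECT at all */
        fd = open(filename, O_RDONLY | O_DIRECT);
        if (fd < 0)
            return 0;

        /* 2./3. the filesystem type must be on the whitelist */
        if (fstatfs(fd, &stfs) < 0 || stfs.f_type != EXT3_SUPER_MAGIC) {
            close(fd);
            return 0;
        }

        /* 4./5. a real check would also look at the block device type
           and the running kernel version before saying yes */
        close(fd);
        return 1;
    }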
In future, the above recipe will be more complicated, in that you have
to use the same decision tree to decide between:
- Synchronous IO.
- Your own thread based IO.
- Glibc POSIX-AIO using threads.
- Linux-AIO.
- Virtio thing or whatever is based around vringfd.
- Syslets if they gain traction and perform well.
> I've basically got a choice of making libvirt always add '-aio linux'
> or never add it at all. My inclination is to the latter since it is
> compatible with existing QEMU which has no -aio option. Presumably
> '-aio linux' is intended to provide some performance benefit so it'd
> be nice to use it. If we can't express some criteria under which it
> should be turned on, I can't enable it; whereas if you can express
> some criteria, then QEMU should apply them automatically.
I'm of the view that '-aio auto' would be a really good option - and
when it's proven itself, it should be the default. It could work on
all QEMU hosts: it would pick synchronous IO when there is nothing else.
The criteria for selecting a good AIO strategy on Linux are quite
complex, and might be worth hard coding. In that case, putting that
into QEMU itself would be much better than every program which
launches QEMU having its own implementation of the criteria.
> Pushing this choice of AIO impls to the app or user invoking QEMU just
> does not seem like a win here.
I think having the choice is very good, because whatever the hard
coded selection criteria, there will be times when it's wrong (ideally
in conservative ways - it should always be functional, just suboptimal).
So I do support this patch to add the switch.
But _forcing_ the user to decide is not good, since the criteria are
rather obscure and change with things like filesystem. At least, a
set of command line options to QEMU ought to work when you copy a VM
to another machine!
So I think '-aio auto', which invokes the selection criteria of the
day and is guaranteed to work (conservatively picking a slower method
if it cannot be sure a faster one will work) would be the most useful
option of all.
-- Jamie
^ permalink raw reply [flat|nested] 31+ messages in thread
* [Qemu-devel] Re: [PATCH 3/3] Implement linux-aio backend
2008-04-17 19:26 ` [Qemu-devel] [PATCH 3/3] Implement linux-aio backend Anthony Liguori
@ 2008-04-18 15:09 ` Marcelo Tosatti
2008-04-18 15:18 ` Anthony Liguori
0 siblings, 1 reply; 31+ messages in thread
From: Marcelo Tosatti @ 2008-04-18 15:09 UTC (permalink / raw)
To: Anthony Liguori; +Cc: kvm-devel, qemu-devel
On Thu, Apr 17, 2008 at 02:26:52PM -0500, Anthony Liguori wrote:
> This patch introduces a Linux-aio backend that is disabled by default. To
> use this backend effectively, the user should disable caching and select
> it with the appropriate -aio option. For instance:
>
> qemu-system-x86_64 -drive foo.img,cache=off -aio linux
>
> There's no universal way to wait asynchronously with linux-aio. At some point,
> signals were added to signal completion. More recently, an eventfd interface
> was added. This patch relies on the latter.
>
> We try hard to detect whether the right support is available in configure to
> avoid compile failures.
> + do {
> + err = io_submit(aio_ctxt_id, 1, iocbs);
> + } while (err == -1 && errno == EINTR);
> +
> + if (err != 1) {
> + fprintf(stderr, "failed to submit aio request: %m\n");
> + exit(1);
> + }
> +
> + outstanding_requests++;
> +
> + return &aiocb->common;
> +}
> +
> +static void la_wait(void)
> +{
> + main_loop_wait(10);
> +}
Sleeping in the context of vcpu's is extremely bad (eg virtio-block
blocks in write() throttling which kills performance). It should wait
on IO completions instead (qemu-kvm.c creates a pthread "waitqueue" to
resolve that issue).
Other than that looks fine to me, will give it a try.
^ permalink raw reply [flat|nested] 31+ messages in thread
* [Qemu-devel] Re: [PATCH 3/3] Implement linux-aio backend
2008-04-18 15:09 ` [Qemu-devel] " Marcelo Tosatti
@ 2008-04-18 15:18 ` Anthony Liguori
2008-04-18 17:46 ` Marcelo Tosatti
0 siblings, 1 reply; 31+ messages in thread
From: Anthony Liguori @ 2008-04-18 15:18 UTC (permalink / raw)
To: Marcelo Tosatti; +Cc: kvm-devel, qemu-devel
Marcelo Tosatti wrote:
> On Thu, Apr 17, 2008 at 02:26:52PM -0500, Anthony Liguori wrote:
>
>> This patch introduces a Linux-aio backend that is disabled by default. To
>> use this backend effectively, the user should disable caching and select
>> it with the appropriate -aio option. For instance:
>>
>> qemu-system-x86_64 -drive foo.img,cache=off -aio linux
>>
>> There's no universal way to wait asynchronously with linux-aio. At some point,
>> signals were added to signal completion. More recently, an eventfd interface
>> was added. This patch relies on the latter.
>>
>> We try hard to detect whether the right support is available in configure to
>> avoid compile failures.
>>
>
>
>> + do {
>> + err = io_submit(aio_ctxt_id, 1, iocbs);
>> + } while (err == -1 && errno == EINTR);
>> +
>> + if (err != 1) {
>> + fprintf(stderr, "failed to submit aio request: %m\n");
>> + exit(1);
>> + }
>> +
>> + outstanding_requests++;
>> +
>> + return &aiocb->common;
>> +}
>> +
>> +static void la_wait(void)
>> +{
>> + main_loop_wait(10);
>> +}
>>
>
> Sleeping in the context of vcpu's is extremely bad (eg virtio-block
> blocks in write() throttling which kills performance). It should wait
> on IO completions instead (qemu-kvm.c creates a pthread "waitqueue" to
> resolve that issue).
>
> Other than that looks fine to me, will give it a try.
>
FWIW, I'm not getting wonderful results in KVM. It's hard to tell
though because time seems wildly inaccurate (even with kvm clock in the
guest). The time issue appears unrelated to this set of patches.
Regards,
Anthony Liguori
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [Qemu-devel] Re: [kvm-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-18 12:43 ` Jamie Lokier
@ 2008-04-18 15:23 ` Anthony Liguori
2008-04-18 16:22 ` Jamie Lokier
2008-04-18 16:32 ` [kvm-devel] [Qemu-devel] " Avi Kivity
0 siblings, 2 replies; 31+ messages in thread
From: Anthony Liguori @ 2008-04-18 15:23 UTC (permalink / raw)
To: Daniel P. Berrange, qemu-devel, Anthony Liguori, kvm-devel,
Marcelo Tosatti
Jamie Lokier wrote:
>> I've basically got a choice of making libvirt always add '-aio linux'
>> or never add it at all. My inclination is to the latter since it is
>> compatible with existing QEMU which has no -aio option. Presumably
>> '-aio linux' is intended to provide some performance benefit so it'd
>> be nice to use it. If we can't express some criteria under which it
>> should be turned on, I can't enable it; whereas if you can express
>> some criteria, then QEMU should apply them automatically.
>>
>
> I'm of the view that '-aio auto' would be a really good option - and
> when it's proven itself, it should be the default. It could work on
> all QEMU hosts: it would pick synchronous IO when there is nothing else.
>
Right now, not specifying the -aio option is equivalent to your proposed
-aio auto.
I guess I should include an info aio to let the user know what type of
aio they are using. We can add selection criteria later but
semantically, not specifying an explicit -aio option allows QEMU to
choose whichever one it thinks is best.
Regards,
Anthony Liguori
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [Qemu-devel] Re: [kvm-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-18 15:23 ` Anthony Liguori
@ 2008-04-18 16:22 ` Jamie Lokier
2008-04-18 16:32 ` [kvm-devel] [Qemu-devel] " Avi Kivity
1 sibling, 0 replies; 31+ messages in thread
From: Jamie Lokier @ 2008-04-18 16:22 UTC (permalink / raw)
To: qemu-devel; +Cc: kvm-devel, Anthony Liguori, Marcelo Tosatti
Anthony Liguori wrote:
> >I'm of the view that '-aio auto' would be a really good option - and
> >when it's proven itself, it should be the default. It could work on
> >all QEMU hosts: it would pick synchronous IO when there is nothing else.
>
> Right now, not specifying the -aio option is equivalent to your proposed
> -aio auto.
>
> I guess I should include an info aio to let the user know what type of
> aio they are using. We can add selection criteria later but
> semantically, not specifying an explicit -aio option allows QEMU to
> choose whichever one it thinks is best.
Great. I guess the next step is to add selection criteria, otherwise
a million Wikis will tell everyone to use '-aio linux' :-)
Do you know what the selection criteria should be - or is there a
document/paper somewhere which says (ideally from benchmarks)? I'm
interested for an unrelated project using AIO - so I'm willing to help
get this right to some extent.
-- Jamie
^ permalink raw reply [flat|nested] 31+ messages in thread
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-18 15:23 ` Anthony Liguori
2008-04-18 16:22 ` Jamie Lokier
@ 2008-04-18 16:32 ` Avi Kivity
2008-04-20 15:49 ` Jamie Lokier
1 sibling, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2008-04-18 16:32 UTC (permalink / raw)
To: Anthony Liguori; +Cc: kvm-devel, Marcelo Tosatti, qemu-devel
Anthony Liguori wrote:
> Right now, not specifying the -aio option is equivalent to your proposed
> -aio auto.
>
> I guess I should include an info aio to let the user know what type of
> aio they are using. We can add selection criteria later but
> semantically, not specifying an explicit -aio option allows QEMU to
> choose whichever one it thinks is best.
>
>
For the majority of deployments posix aio should be sufficient. The few
that need something else can use Linux aio.
Of course, a managed environment can use Linux aio unconditionally if it
knows the kernel has all the needed goodies.
--
Any sufficiently difficult bug is indistinguishable from a feature.
^ permalink raw reply [flat|nested] 31+ messages in thread
* [Qemu-devel] Re: [PATCH 3/3] Implement linux-aio backend
2008-04-18 15:18 ` Anthony Liguori
@ 2008-04-18 17:46 ` Marcelo Tosatti
0 siblings, 0 replies; 31+ messages in thread
From: Marcelo Tosatti @ 2008-04-18 17:46 UTC (permalink / raw)
To: Anthony Liguori; +Cc: kvm-devel, qemu-devel
On Fri, Apr 18, 2008 at 10:18:33AM -0500, Anthony Liguori wrote:
> >Sleeping in the context of vcpu's is extremely bad (eg virtio-block
> >blocks in write() throttling which kills performance). It should wait
> >on IO completions instead (qemu-kvm.c creates a pthread "waitqueue" to
> >resolve that issue).
> >
> >Other than that looks fine to me, will give it a try.
> >
>
> FWIW, I'm not getting wonderful results in KVM. It's hard to tell
> though because time seems wildly inaccurate (even with kvm clock in the
> guest). The time issue appears unrelated to this set of patches.
Oh, you won't get completion signals on the aio eventfd. You might want
to try the select-with-timeout() stuff.
Will submit that with proper signalfd emulation shortly.
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-18 16:32 ` [kvm-devel] [Qemu-devel] " Avi Kivity
@ 2008-04-20 15:49 ` Jamie Lokier
2008-04-20 18:43 ` Avi Kivity
0 siblings, 1 reply; 31+ messages in thread
From: Jamie Lokier @ 2008-04-20 15:49 UTC (permalink / raw)
To: qemu-devel; +Cc: kvm-devel, Anthony Liguori, Marcelo Tosatti
Avi Kivity wrote:
> For the majority of deployments posix aio should be sufficient. The few
> that need something else can use Linux aio.
Does that mean "for the majority of deployments, the slow version is
sufficient. The few that care about performance can use Linux AIO?"
I'm under the impression that the entire and only point of Linux AIO
is that it's faster than POSIX AIO on Linux.
> Of course, a managed environment can use Linux aio unconditionally if it
> knows the kernel has all the needed goodies.
Does that mean "a managed environment can have some code which check
the host kernel version + filesystem type holding the VM image, to
conditionally enable Linux AIO?" (Since if you care about
performance, which is the sole reason for using Linux AIO, you
wouldn't want to enable Linux AIO on any host in your cluster where it
will trash performance.)
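For illustration only (this is not from the patches; the image path,
queue depth and use of libaio are my assumptions), such a check could
skip kernel-version parsing and simply try the two operations that
matter - whether io_setup() works and whether the image accepts
O_DIRECT:

/* Hypothetical probe for whether linux-aio is worth enabling; link with -laio. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <libaio.h>
#include <stdio.h>
#include <unistd.h>

static int linux_aio_usable(const char *image_path)
{
    io_context_t ctx = 0;
    int fd;

    if (io_setup(64, &ctx) < 0)        /* kernel lacks a working io_setup() */
        return 0;
    io_destroy(ctx);

    fd = open(image_path, O_RDONLY | O_DIRECT);  /* fs must accept O_DIRECT */
    if (fd < 0)
        return 0;
    close(fd);
    return 1;
}

int main(int argc, char **argv)
{
    const char *path = argc > 1 ? argv[1] : "/images/guest.img";

    printf("linux-aio %s for %s\n",
           linux_aio_usable(path) ? "looks usable" : "not usable", path);
    return 0;
}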
Just wondering.
Thanks,
-- Jamie
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-20 15:49 ` Jamie Lokier
@ 2008-04-20 18:43 ` Avi Kivity
2008-04-20 23:39 ` Jamie Lokier
0 siblings, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2008-04-20 18:43 UTC (permalink / raw)
To: qemu-devel, Anthony Liguori, kvm-devel, Marcelo Tosatti
Jamie Lokier wrote:
> Avi Kivity wrote:
>
>> For the majority of deployments posix aio should be sufficient. The few
>> that need something else can use Linux aio.
>>
>
> Does that mean "for the majority of deployments, the slow version is
> sufficient. The few that care about performance can use Linux AIO?"
>
>
In essence, yes. s/slow/slower/ and s/performance/ultimate block device
performance/.
Many deployments don't care at all about block device performance; they
care mostly about networking performance.
> I'm under the impression that the entire and only point of Linux AIO
> is that it's faster than POSIX AIO on Linux.
>
It is. I estimate posix aio adds a few microseconds above linux aio per
I/O request, when using O_DIRECT. Assuming 10 microseconds, you will
need 10,000 I/O requests per second per vcpu to have a 10% performance
difference (10 us x 10,000 requests/s is 100 ms of extra CPU per
second, i.e. 10% of one vcpu). That's definitely rare.
>> Of course, a managed environment can use Linux aio unconditionally if it
>> knows the kernel has all the needed goodies.
>>
>
> Does that mean "a managed environment can have some code which checks
> the host kernel version + filesystem type holding the VM image, to
> conditionally enable Linux AIO?" (Since if you care about
> performance, which is the sole reason for using Linux AIO, you
> wouldn't want to enable Linux AIO on any host in your cluster where it
> will trash performance.)
>
Either that, or mandate that all hosts use a filesystem and kernel which
provide the necessary performance. Take ovirt for example, which
provides the entire hypervisor environment, and so can guarantee this.
Also, I'd presume that those that need 10K IOPS and above will not place
their high throughput images on a filesystem; rather on a separate SAN LUN.
> Just wondering.
>
Hope this clarifies.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-20 18:43 ` Avi Kivity
@ 2008-04-20 23:39 ` Jamie Lokier
2008-04-21 6:39 ` Avi Kivity
0 siblings, 1 reply; 31+ messages in thread
From: Jamie Lokier @ 2008-04-20 23:39 UTC (permalink / raw)
To: qemu-devel; +Cc: kvm-devel, Anthony Liguori, Marcelo Tosatti
Avi Kivity wrote:
> >Does that mean "for the majority of deployments, the slow version is
> >sufficient. The few that care about performance can use Linux AIO?"
>
> In essence, yes. s/slow/slower/ and s/performance/ultimate block device
> performance/.
>
> Many deployments don't care at all about block device performance; they
> care mostly about networking performance.
That's interesting. I'd have expected block device performance to be
important for most things, for the same reason that disk performance
is (well, reasonably) important for non-virtual machines.
But as you say next:
> >I'm under the impression that the entire and only point of Linux AIO
> >is that it's faster than POSIX AIO on Linux.
>
> It is. I estimate posix aio adds a few microseconds above linux aio per
> I/O request, when using O_DIRECT. Assuming 10 microseconds, you will
> need 10,000 I/O requests per second per vcpu to have a 10% performance
> difference. That's definitely rare.
Oh, I didn't realise the difference was so small.
At such a tiny difference, I'm wondering why Linux-AIO exists at all,
as it complicates the kernel rather a lot. I can see the theoretical
appeal, but if performance is so marginal, I'm surprised it's in
there.
I'm also surprised the Glibc implementation of AIO using ordinary
threads is so close to it. And then, I'm wondering why use AIO at
all: it suggests QEMU would run about as fast doing synchronous I/O in
a few dedicated I/O threads.
> >Does that mean "a managed environment can have some code which checks
> >the host kernel version + filesystem type holding the VM image, to
> >conditionally enable Linux AIO?" (Since if you care about
> >performance, which is the sole reason for using Linux AIO, you
> >wouldn't want to enable Linux AIO on any host in your cluster where it
> >will trash performance.)
>
> Either that, or mandate that all hosts use a filesystem and kernel which
> provide the necessary performance. Take ovirt for example, which
> provides the entire hypervisor environment, and so can guarantee this.
>
> Also, I'd presume that those that need 10K IOPS and above will not place
> their high throughput images on a filesystem; rather on a separate SAN LUN.
Does the separate LUN make any difference? I thought O_DIRECT on a
filesystem was meant to be pretty close to block device performance.
I base this on messages here and there which say swapping to a file is
about as fast as swapping to a block device, nowadays.
Thanks for your useful remarks, btw. There doesn't seem to be a lot
of good info about Linux-AIO around.
-- Jamie
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-20 23:39 ` Jamie Lokier
@ 2008-04-21 6:39 ` Avi Kivity
2008-04-21 12:10 ` Jamie Lokier
0 siblings, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2008-04-21 6:39 UTC (permalink / raw)
To: qemu-devel, Anthony Liguori, kvm-devel, Marcelo Tosatti
Jamie Lokier wrote:
> Avi Kivity wrote:
>
>>> Does that mean "for the majority of deployments, the slow version is
>>> sufficient. The few that care about performance can use Linux AIO?"
>>>
>> In essence, yes. s/slow/slower/ and s/performance/ultimate block device
>> performance/.
>>
>> Many deployments don't care at all about block device performance; they
>> care mostly about networking performance.
>>
>
> That's interesting. I'd have expected block device performance to be
> important for most things, for the same reason that disk performance
> is (well, reasonably) important for non-virtual machines.
>
>
Seek time is important. Bandwidth is somewhat important. But for one-
and two-spindle workloads (the majority), the cpu utilization induced
by getting requests to the disk is not important, and that's what we're
optimizing here.
Disks work at around 300 Hz. Processors at around 3 GHz. That's seven
orders of magnitude difference. Even if you spend 100 usec calculating
the next best seek, it's a win if it saves you only 10% of seeks. And
of course modern processors spend a few microseconds at most getting a
request out.
You really need 50+ disks or a large write-back cache before
microoptimizations of the submission path are felt.
> But as you say next:
>
>
>>> I'm under the impression that the entire and only point of Linux AIO
>>> is that it's faster than POSIX AIO on Linux.
>>>
>> It is. I estimate posix aio adds a few microseconds above linux aio per
>> I/O request, when using O_DIRECT. Assuming 10 microseconds, you will
>> need 10,000 I/O requests per second per vcpu to have a 10% performance
>> difference. That's definitely rare.
>>
>
> Oh, I didn't realise the difference was so small.
>
> At such a tiny difference, I'm wondering why Linux-AIO exists at all,
> as it complicates the kernel rather a lot. I can see the theoretical
> appeal, but if performance is so marginal, I'm surprised it's in
> there.
>
>
Linux aio exists, but that's all that can be said for it. It works
mostly for raw disks, doesn't integrate with networking, and doesn't
advance at the same pace as the rest of the kernel. I believe only
databases use it (and a userspace filesystem I wrote some time ago).
> I'm also surprised the Glibc implementation of AIO using ordinary
> threads is so close to it.
Why are you surprised?
Actually the glibc implementation could be improved from what I've
heard. My estimates are for a thread pool implementation, but there is
no reason why glibc couldn't achieve exactly the same performance.
> And then, I'm wondering why use AIO at
> all: it suggests QEMU would run about as fast doing synchronous I/O in
> a few dedicated I/O threads.
>
>
Posix aio is the unix API for this, why not use it?
>> Also, I'd presume that those that need 10K IOPS and above will not place
>> their high throughput images on a filesystem; rather on a separate SAN LUN.
>>
>
> Does the separate LUN make any difference? I thought O_DIRECT on a
> filesystem was meant to be pretty close to block device performance.
>
On a good extent-based filesystem like XFS you will get good performance
(though more cpu overhead due to needing to go through additional
mapping layers). Old clunkers like ext3 will require additional seeks or
a ton of cache (1 GB per 1 TB).
> I base this on messages here and there which say swapping to a file is
> about as fast as swapping to a block device, nowadays.
>
Swapping to a file preloads the block mapping into memory, so the
filesystem is not involved at all in the I/O path.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-21 6:39 ` Avi Kivity
@ 2008-04-21 12:10 ` Jamie Lokier
2008-04-22 8:10 ` Avi Kivity
0 siblings, 1 reply; 31+ messages in thread
From: Jamie Lokier @ 2008-04-21 12:10 UTC (permalink / raw)
To: qemu-devel; +Cc: kvm-devel, Anthony Liguori, Marcelo Tosatti
Avi Kivity wrote:
> >At such a tiny difference, I'm wondering why Linux-AIO exists at all,
> >as it complicates the kernel rather a lot. I can see the theoretical
> >appeal, but if performance is so marginal, I'm surprised it's in
> >there.
>
> Linux aio exists, but that's all that can be said for it. It works
> mostly for raw disks, doesn't integrate with networking, and doesn't
> advance at the same pace as the rest of the kernel. I believe only
> databases use it (and a userspace filesystem I wrote some time ago).
And video streaming on some embedded devices with no MMU! (Due to the
page cache heuristics working poorly with no MMU, sustained reliable
streaming is managed with O_DIRECT and the app managing cache itself
(like a database), and that needs AIO to keep the request queue busy.
At least, that's the theory.)
> >I'm also surprised the Glibc implementation of AIO using ordinary
> >threads is so close to it.
>
> Why are you surprised?
Because I've read that Glibc AIO (which uses a thread pool) is a
relatively poor performer as AIO implementations go, and is only there
for API compatibility, not suggested for performance.
But I read that quite a while ago, perhaps it's changed.
> Actually the glibc implementation could be improved from what I've
> heard. My estimates are for a thread pool implementation, but there is
> no reason why glibc couldn't achieve exactly the same performance.
Erm... I thought you said it _does_ achieve nearly the same
performance, not that it _could_.
Do you mean it could achieve exactly the same performance by using
Linux AIO when possible?
> >And then, I'm wondering why use AIO at
> >all: it suggests QEMU would run about as fast doing synchronous I/O in
> >a few dedicated I/O threads.
>
> Posix aio is the unix API for this, why not use it?
Because far more host platforms have threads than have POSIX AIO. (I
suspect both options will end up supported in the end, as dedicated
I/O threads were already suggested for other things.)
> >>Also, I'd presume that those that need 10K IOPS and above will not place
> >>their high throughput images on a filesystem; rather on a separate SAN
> >>LUN.
> >
> >Does the separate LUN make any difference? I thought O_DIRECT on a
> >filesystem was meant to be pretty close to block device performance.
>
> On a good extent-based filesystem like XFS you will get good performance
> (though more cpu overhead due to needing to go through additional
> mapping layers). Old clunkers like ext3 will require additional seeks or
> a ton of cache (1 GB per 1 TB).
Hmm. Thanks. I may consider switching to XFS now....
-- Jamie
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-21 12:10 ` Jamie Lokier
@ 2008-04-22 8:10 ` Avi Kivity
2008-04-22 14:28 ` Jamie Lokier
0 siblings, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2008-04-22 8:10 UTC (permalink / raw)
To: qemu-devel, Anthony Liguori, kvm-devel, Marcelo Tosatti
Jamie Lokier wrote:
> Avi Kivity wrote:
>
>>> At such a tiny difference, I'm wondering why Linux-AIO exists at all,
>>> as it complicates the kernel rather a lot. I can see the theoretical
>>> appeal, but if performance is so marginal, I'm surprised it's in
>>> there.
>>>
>> Linux aio exists, but that's all that can be said for it. It works
>> mostly for raw disks, doesn't integrate with networking, and doesn't
>> advance at the same pace as the rest of the kernel. I believe only
>> databases use it (and a userspace filesystem I wrote some time ago).
>>
>
> And video streaming on some embedded devices with no MMU! (Due to the
> page cache heuristics working poorly with no MMU, sustained reliable
> streaming is managed with O_DIRECT and the app managing cache itself
> (like a database), and that needs AIO to keep the request queue busy.
> At least, that's the theory.)
>
>
Could use threads as well, no?
>>> I'm also surprised the Glibc implementation of AIO using ordinary
>>> threads is so close to it.
>>>
>> Why are you surprised?
>>
>
> Because I've read that Glibc AIO (which uses a thread pool) is a
> relatively poor performer as AIO implementations go, and is only there
> for API compatibility, not suggested for performance.
>
> But I read that quite a while ago, perhaps it's changed.
>
>
It's me at fault here. I just assumed that because it's easy to do aio
in a thread pool efficiently, that's what glibc does.
Unfortunately the code does some ridiculous things like not servicing
multiple requests on a single fd in parallel. I see absolutely no
reason for it (the code says "fight for resources").
So my comments only apply to linux-aio vs a sane thread pool. Sorry for
spreading confusion.
>> Actually the glibc implementation could be improved from what I've
>> heard. My estimates are for a thread pool implementation, but there is
>> no reason why glibc couldn't achieve exactly the same performance.
>>
>
> Erm... I thought you said it _does_ achieve nearly the same
> performance, not that it _could_.
>
> Do you mean it could achieve exactly the same performance by using
> Linux AIO when possible?
>
>
It could and should. It probably doesn't.
A simple thread pool implementation could come within 10% of Linux aio
for most workloads. It will never be "exactly", but for small numbers
of disks, close enough.
>>> And then, I'm wondering why use AIO at
>>> all: it suggests QEMU would run about as fast doing synchronous I/O in
>>> a few dedicated I/O threads.
>>>
>> Posix aio is the unix API for this, why not use it?
>>
>
> Because far more host platforms have threads than have POSIX AIO. (I
> suspect both options will end up supported in the end, as dedicated
> I/O threads were already suggested for other things.)
>
Agree.
>
>>>> Also, I'd presume that those that need 10K IOPS and above will not place
>>>> their high throughput images on a filesystem; rather on a separate SAN
>>>> LUN.
>>>>
>>> Does the separate LUN make any difference? I thought O_DIRECT on a
>>> filesystem was meant to be pretty close to block device performance.
>>>
>> On a good extent-based filesystem like XFS you will get good performance
>> (though more cpu overhead due to needing to go through additional
>> mapping layers). Old clunkers like ext3 will require additional seeks or
>> a ton of cache (1 GB per 1 TB).
>>
>
> Hmm. Thanks. I may consider switching to XFS now....
>
>
I'm rooting for btrfs myself.
--
error compiling committee.c: too many arguments to function
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-22 8:10 ` Avi Kivity
@ 2008-04-22 14:28 ` Jamie Lokier
2008-04-22 14:53 ` Anthony Liguori
2008-04-22 15:03 ` Avi Kivity
0 siblings, 2 replies; 31+ messages in thread
From: Jamie Lokier @ 2008-04-22 14:28 UTC (permalink / raw)
To: qemu-devel; +Cc: kvm-devel, Anthony Liguori, Marcelo Tosatti
Avi Kivity wrote:
> >And video streaming on some embedded devices with no MMU! (Due to the
> >page cache heuristics working poorly with no MMU, sustained reliable
> >streaming is managed with O_DIRECT and the app managing cache itself
> >(like a database), and that needs AIO to keep the request queue busy.
> >At least, that's the theory.)
>
> Could use threads as well, no?
Perhaps. This raises another point about AIO vs. threads:
If I submit sequential O_DIRECT reads with aio_read(), will they enter
the device read queue in the same order, and reach the disk in that
order (allowing for reordering when worthwhile by the elevator)?
With threads this isn't guaranteed and scheduling makes it quite
likely to issue the parallel synchronous reads out of order, and for
them to reach the disk out of order because the elevator doesn't see
them simultaneously.
With AIO (non-Glibc! (and non-kthreads)) it might be better at
keeping the intended issue order, I'm not sure.
It is highly desirable: O_DIRECT streaming performance depends on
avoiding seeks (no reordering) and on keeping the request queue
non-empty (no gap).
I read a man page for some other unix, describing AIO as better than
threaded parallel reads for reading tape drives because of this (tape
seeks are very expensive). But the rest of the man page didn't say
anything more. Unfortunately I don't remember where I read it. I
have no idea whether AIO submission order is nearly always preserved
in general, or expected to be.
> It's me at fault here. I just assumed that because it's easy to do aio
> in a thread pool efficiently, that's what glibc does.
>
> Unfortunately the code does some ridiculous things like not servicing
> multiple requests on a single fd in parallel. I see absolutely no
> reason for it (the code says "fight for resources").
Ouch. Perhaps that relates to my thought above, about multiple
requests to the same file causing seek storms when thread scheduling
is unlucky?
> So my comments only apply to linux-aio vs a sane thread pool. Sorry for
> spreading confusion.
Thanks. I thought you'd measured it :-)
> It could and should. It probably doesn't.
>
> A simple thread pool implementation could come within 10% of Linux aio
> for most workloads. It will never be "exactly", but for small numbers
> of disks, close enough.
I would wait for benchmark results for I/O patterns like sequential
reading and writing, because of potential for seeks caused by request
reordering, before being confident of that.
> >Hmm. Thanks. I may consider switching to XFS now....
>
> I'm rooting for btrfs myself.
In the unlikely event they backport btrfs to kernel 2.4.26-uc0, I'll
be happy to give it a try! :-)
-- Jamie
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-22 14:28 ` Jamie Lokier
@ 2008-04-22 14:53 ` Anthony Liguori
2008-04-22 15:05 ` Avi Kivity
2008-04-22 15:12 ` Jamie Lokier
2008-04-22 15:03 ` Avi Kivity
1 sibling, 2 replies; 31+ messages in thread
From: Anthony Liguori @ 2008-04-22 14:53 UTC (permalink / raw)
To: qemu-devel, kvm-devel, Marcelo Tosatti
Jamie Lokier wrote:
> Avi Kivity wrote:
>
>>> And video streaming on some embedded devices with no MMU! (Due to the
>>> page cache heuristics working poorly with no MMU, sustained reliable
>>> streaming is managed with O_DIRECT and the app managing cache itself
>>> (like a database), and that needs AIO to keep the request queue busy.
>>> At least, that's the theory.)
>>>
>> Could use threads as well, no?
>>
>
> Perhaps. This raises another point about AIO vs. threads:
>
> If I submit sequential O_DIRECT reads with aio_read(), will they enter
> the device read queue in the same order, and reach the disk in that
> order (allowing for reordering when worthwhile by the elevator)?
>
There's no guarantee that any sort of order will be preserved by AIO
requests. The same is true with writes. This is what fdsync is for, to
guarantee ordering.
Regards,
Anthony Liguori
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-22 14:28 ` Jamie Lokier
2008-04-22 14:53 ` Anthony Liguori
@ 2008-04-22 15:03 ` Avi Kivity
2008-04-22 15:36 ` Jamie Lokier
1 sibling, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2008-04-22 15:03 UTC (permalink / raw)
To: qemu-devel, Anthony Liguori, kvm-devel, Marcelo Tosatti
Jamie Lokier wrote:
> Avi Kivity wrote:
>
>>> And video streaming on some embedded devices with no MMU! (Due to the
>>> page cache heuristics working poorly with no MMU, sustained reliable
>>> streaming is managed with O_DIRECT and the app managing cache itself
>>> (like a database), and that needs AIO to keep the request queue busy.
>>> At least, that's the theory.)
>>>
>> Could use threads as well, no?
>>
>
> Perhaps. This raises another point about AIO vs. threads:
>
> If I submit sequential O_DIRECT reads with aio_read(), will they enter
> the device read queue in the same order, and reach the disk in that
> order (allowing for reordering when worthwhile by the elevator)?
>
Yes, unless the implementation in the kernel (or glibc) is threaded.
> With threads this isn't guaranteed and scheduling makes it quite
> likely to issue the parallel synchronous reads out of order, and for
> them to reach the disk out of order because the elevator doesn't see
> them simultaneously.
>
If the disk is busy, it doesn't matter. The requests will queue and the
elevator will sort them out. So it's just the first few requests that
may get to disk out of order.
> With AIO (non-Glibc! (and non-kthreads)) it might be better at
> keeping the intended issue order, I'm not sure.
>
> It is highly desirable: O_DIRECT streaming performance depends on
> avoiding seeks (no reordering) and on keeping the request queue
> non-empty (no gap).
>
> I read a man page for some other unix, describing AIO as better than
> threaded parallel reads for reading tape drives because of this (tape
> seeks are very expensive). But the rest of the man page didn't say
> anything more. Unfortunately I don't remember where I read it. I
> have no idea whether AIO submission order is nearly always preserved
> in general, or expected to be.
>
I haven't considered tape, but this is a good point indeed. I expect it
doesn't make much of a difference for a loaded disk.
>
>> It's me at fault here. I just assumed that because it's easy to do aio
>> in a thread pool efficiently, that's what glibc does.
>>
>> Unfortunately the code does some ridiculous things like not servicing
>> multiple requests on a single fd in parallel. I see absolutely no
>> reason for it (the code says "fight for resources").
>>
>
> Ouch. Perhaps that relates to my thought above, about multiple
> requests to the same file causing seek storms when thread scheduling
> is unlucky?
>
My first thought on seeing this is that it relates to a deficiency on
older kernels servicing multiple requests on a single fd (i.e. a
per-file lock). I don't know if such a deficiency ever existed, though.
>
>> It could and should. It probably doesn't.
>>
>> A simple thread pool implementation could come within 10% of Linux aio
>> for most workloads. It will never be "exactly", but for small numbers
>> of disks, close enough.
>>
>
> I would wait for benchmark results for I/O patterns like sequential
> reading and writing, because of potential for seeks caused by request
> reordering, before being confident of that.
>
>
I did have measurements (and a test rig) at a previous job (where I did
a lot of I/O work); IIRC the performance of a tuned thread pool was not
far behind aio, both for seeks and sequential. It was a while back though.
--
error compiling committee.c: too many arguments to function
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-22 14:53 ` Anthony Liguori
@ 2008-04-22 15:05 ` Avi Kivity
2008-04-22 15:23 ` Jamie Lokier
2008-04-22 15:12 ` Jamie Lokier
1 sibling, 1 reply; 31+ messages in thread
From: Avi Kivity @ 2008-04-22 15:05 UTC (permalink / raw)
To: qemu-devel; +Cc: kvm-devel, Marcelo Tosatti
Anthony Liguori wrote:
>>
>> If I submit sequential O_DIRECT reads with aio_read(), will they enter
>> the device read queue in the same order, and reach the disk in that
>> order (allowing for reordering when worthwhile by the elevator)?
>>
>
> There's no guarantee that any sort of order will be preserved by AIO
> requests. The same is true with writes. This is what fdsync is for,
> to guarantee ordering.
I believe he'd like a hint to get good scheduling, not a guarantee.
With a thread pool, if the threads are scheduled out of order, so are
your requests. If the elevator doesn't plug the queue, the first few
requests may not be optimally sorted.
--
error compiling committee.c: too many arguments to function
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-22 14:53 ` Anthony Liguori
2008-04-22 15:05 ` Avi Kivity
@ 2008-04-22 15:12 ` Jamie Lokier
1 sibling, 0 replies; 31+ messages in thread
From: Jamie Lokier @ 2008-04-22 15:12 UTC (permalink / raw)
To: qemu-devel; +Cc: kvm-devel, Marcelo Tosatti
Anthony Liguori wrote:
> >Perhaps. This raises another point about AIO vs. threads:
> >
> >If I submit sequential O_DIRECT reads with aio_read(), will they enter
> >the device read queue in the same order, and reach the disk in that
> >order (allowing for reordering when worthwhile by the elevator)?
>
> There's no guarantee that any sort of order will be preserved by AIO
> requests. The same is true with writes. This is what fdsync is for, to
> guarantee ordering.
You misunderstand. I'm not talking about guarantees, I'm talking
about expectations for the performance effect.
Basically, to do performant streaming read with O_DIRECT you need two
things:
1. Overlap at least 2 requests, so the device is kept busy.
2. Requests be sent to the disk in a good order, which is usually
(but not always) sequential offset order.
The kernel does this itself with buffered reads, doing readahead.
It works very well, unless you have other problems caused by readahead.
With O_DIRECT, an application has to do the equivalent of readahead
itself to get performant streaming.
If the app uses two threads calling pread(), it's hard to ensure the
kernel even _sees_ the first two calls in sequential offset order.
You spawn two threads, and then both threads call pread() with
non-deterministic scheduling. The problem starts before even entering
the kernel.
Then, depending on I/O scheduling in the kernel, it might send the
less good pread() to the disk immediately, and only later, after a
backward head seek, the other one. The elevator cannot fix this: it doesn't have
enough information, unless it adds artificial delays. But artificial
delays may harm too; it's not optimal.
After that, the two threads tend to call pread() in the best order
provided there are no scheduling conflicts, but are easily disrupted by
other tasks, especially on SMP (one reading thread per CPU, so when
one of them is descheduled, the other continues and issues a request
in the 'wrong' order.)
With AIO, even though you can't be sure what the kernel does, you can
be sure the kernel receives aio_read() calls in the exact order which
is most likely to perform well. Application knowledge of its access
pattern is passed along better.
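As a purely illustrative sketch of that submission pattern (not QEMU
code; the path, block size and window depth are invented): keep a small
window of O_DIRECT reads in flight, always issued in ascending offset
order.

/* Sketch: streaming O_DIRECT reads via POSIX AIO, preserving offset order.
 * Link with -lrt on older glibc. */
#define _GNU_SOURCE
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define BLOCK  (128 * 1024)   /* multiple of the device sector size */
#define WINDOW 4              /* requests kept in flight */

int main(void)
{
    struct aiocb cbs[WINDOW];
    char *bufs[WINDOW];
    off_t next = 0;
    int i, fd = open("/images/stream.img", O_RDONLY | O_DIRECT);

    if (fd < 0)
        return 1;
    memset(cbs, 0, sizeof(cbs));

    for (i = 0; i < WINDOW; i++) {        /* fill the window in offset order */
        posix_memalign((void **)&bufs[i], 4096, BLOCK);
        cbs[i].aio_fildes = fd;
        cbs[i].aio_buf    = bufs[i];
        cbs[i].aio_nbytes = BLOCK;
        cbs[i].aio_offset = next;
        next += BLOCK;
        aio_read(&cbs[i]);
    }

    for (;;) {
        for (i = 0; i < WINDOW; i++) {
            const struct aiocb *list[1] = { &cbs[i] };
            while (aio_error(&cbs[i]) == EINPROGRESS)
                aio_suspend(list, 1, NULL);       /* wait for the oldest request */
            if (aio_return(&cbs[i]) <= 0)
                goto out;                         /* EOF or error */
            /* ... consume bufs[i] here ... */
            cbs[i].aio_offset = next;             /* refill, keeping sequential order */
            next += BLOCK;
            aio_read(&cbs[i]);
        }
    }
out:
    close(fd);
    return 0;
}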
As I've said, I saw a man page which described why this makes AIO
superior to using threads for reading tapes on that OS. So it's not a
completely spurious point.
This has nothing to do with guarantees.
-- Jamie
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-22 15:05 ` Avi Kivity
@ 2008-04-22 15:23 ` Jamie Lokier
0 siblings, 0 replies; 31+ messages in thread
From: Jamie Lokier @ 2008-04-22 15:23 UTC (permalink / raw)
To: qemu-devel; +Cc: kvm-devel, Marcelo Tosatti
Avi Kivity wrote:
> Anthony Liguori wrote:
> >>If I submit sequential O_DIRECT reads with aio_read(), will they enter
> >>the device read queue in the same order, and reach the disk in that
> >>order (allowing for reordering when worthwhile by the elevator)?
> >>
> >There's no guarantee that any sort of order will be preserved by AIO
> >requests. The same is true with writes. This is what fdsync is for,
> >to guarantee ordering.
>
> I believe he'd like a hint to get good scheduling, not a guarantee.
> With a thread pool if the threads are scheduled out of order, so are
> your requests.
> If the elevator doesn't plug the queue, the first few requests may
> not be optimally sorted.
That's right. Then they tend to settle to a good order. But any
delay in scheduling one of the threads, or a signal received by one of
them, can make it lose order briefly, making the streaming stutter as
the disk performs a few local seeks until it settles to good order
again.
You can mitigate the disruption in various ways.
1. If all threads share an "offset" variable, and each reads and
increments it atomically just prior to calling pread(), that helps,
especially at the start; see the sketch after this list. (If threaded
I/O is used for QEMU disk emulation, I would suggest doing that, in
the more general form of popping a request from QEMU's internal
shared queue at the last moment.)
2. Using more threads helps keep it sustained, at the cost of more
wasted I/O when there's a cancellation (changed mind), and more
memory.
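A rough sketch of point 1, purely illustrative and not QEMU code (the
file name, chunk size and thread count are invented): each worker claims
the next offset atomically immediately before pread(), so the calls
reach the kernel in nearly sequential order.

/* Sketch: O_DIRECT streaming reads from a thread pool sharing one offset.
 * Build with -pthread (C11 for <stdatomic.h>). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>
#include <unistd.h>

#define CHUNK    (128 * 1024)
#define NTHREADS 2

static _Atomic off_t next_offset;
static int fd;

static void *reader(void *arg)
{
    char *buf;

    (void)arg;
    posix_memalign((void **)&buf, 4096, CHUNK);
    for (;;) {
        /* Claim the next chunk at the last possible moment. */
        off_t off = atomic_fetch_add(&next_offset, (off_t)CHUNK);
        if (pread(fd, buf, CHUNK, off) <= 0)
            break;                      /* EOF or error */
        /* ... consume buf here ... */
    }
    free(buf);
    return NULL;
}

int main(void)
{
    pthread_t tid[NTHREADS];
    int i;

    fd = open("/images/stream.img", O_RDONLY | O_DIRECT);
    if (fd < 0)
        return 1;
    for (i = 0; i < NTHREADS; i++)
        pthread_create(&tid[i], NULL, reader, NULL);
    for (i = 0; i < NTHREADS; i++)
        pthread_join(tid[i], NULL);
    close(fd);
    return 0;
}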
However, AIO, in principle (if not implementations...) could be better
at keeping the suggested I/O order than threads, without special tricks.
-- Jamie
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-22 15:03 ` Avi Kivity
@ 2008-04-22 15:36 ` Jamie Lokier
2008-05-02 16:37 ` Antonio Vargas
0 siblings, 1 reply; 31+ messages in thread
From: Jamie Lokier @ 2008-04-22 15:36 UTC (permalink / raw)
To: qemu-devel; +Cc: kvm-devel, Anthony Liguori, Marcelo Tosatti
Avi Kivity wrote:
> >Perhaps. This raises another point about AIO vs. threads:
> >
> >If I submit sequential O_DIRECT reads with aio_read(), will they enter
> >the device read queue in the same order, and reach the disk in that
> >order (allowing for reordering when worthwhile by the elevator)?
>
> Yes, unless the implementation in the kernel (or glibc) is threaded.
>
> >With threads this isn't guaranteed and scheduling makes it quite
> >likely to issue the parallel synchronous reads out of order, and for
> >them to reach the disk out of order because the elevator doesn't see
> >them simultaneously.
>
> If the disk is busy, it doesn't matter. The requests will queue and the
> elevator will sort them out. So it's just the first few requests that
> may get to disk out of order.
There are two cases where it matters to a read-streaming app:
1. Disk isn't busy with anything else, maximum streaming
performance is desired.
2. Disk is busy with unrelated things, but you're using I/O
priorities to give the streaming app near-absolute priority.
Then you need to maintain overlapped streaming requests,
otherwise the disk is given to lower-priority I/O. If that
happens often, you lose: the priority is ineffective. Because one
of the streaming requests is usually being serviced, the elevator
has similar limitations as for a disk which is not busy with
anything else.
> I haven't considered tape, but this is a good point indeed. I expect it
> doesn't make much of a difference for a loaded disk.
Yes, as long as it's loaded with unrelated requests at the same I/O
priority, the elevator has time to sort requests and hide thread
scheduling artifacts.
Btw, regarding QEMU: QEMU gets requests _after_ sorting by the guest's
elevator, then submits them to the host's elevator. If the guest and
host elevators are both configured 'anticipatory', do the anticipatory
delays add up?
-- Jamie
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-04-22 15:36 ` Jamie Lokier
@ 2008-05-02 16:37 ` Antonio Vargas
2008-05-02 17:18 ` Jamie Lokier
0 siblings, 1 reply; 31+ messages in thread
From: Antonio Vargas @ 2008-05-02 16:37 UTC (permalink / raw)
To: qemu-devel
On Tue, Apr 22, 2008 at 4:36 PM, Jamie Lokier <jamie@shareable.org> wrote:
*snip*
>
> Btw, regarding QEMU: QEMU gets requests _after_ sorting by the guest's
> elevator, then submits them to the host's elevator. If the guest and
> host elevators are both configured 'anticipatory', do the anticipatory
> delays add up?
>
Anticipatory is non-work-conserving. If the data is going to end up
passing through the host's deadline scheduler, it is probably better to
run the guest with deadline or maybe even no-op, since the guest doesn't
really know anything about the real disk locations of the data.
--
Greetz, Antonio Vargas aka winden of rgba^ntw^bg
http://winden.wordpress.com/
windenntw@gmail.com
Every day, every year
you have to work
you have to study
you have to scene.
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-05-02 16:37 ` Antonio Vargas
@ 2008-05-02 17:18 ` Jamie Lokier
2008-05-02 17:52 ` Anthony Liguori
0 siblings, 1 reply; 31+ messages in thread
From: Jamie Lokier @ 2008-05-02 17:18 UTC (permalink / raw)
To: qemu-devel
Antonio Vargas wrote:
> Btw, regarding QEMU: QEMU gets requests _after_ sorting by the
> guest's
> elevator, then submits them to the host's elevator. If the guest and
> host elevators are both configured 'anticipatory', do the
> anticipatory
> delays add up?
>
>
> Anticipatory is non-work-conserving. If the data is going to end up passing
> through the host's deadline scheduler, it is probably better to run the guest
> with deadline or maybe even no-op, since the guest doesn't really know
> anything about the real disk locations of the data.
That makes sense - especially for formats like qcow and snapshots, the
guest has very little knowledge of access timings.
It's a bit like a database accessing a large file: the database tries
to schedule and merge I/O requests internally before sending them to
the kernel. It doesn't know anything about the layout of disk blocks
in the file, but it can guess that nearby accesses are more likely to
involve lower seek times than far apart accesses.
There is still one reason for guests to do a little I/O scheduling,
and that's to merge adjacent requests into fewer ops passing through
the guest/host interface.
-- Jamie
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-05-02 17:18 ` Jamie Lokier
@ 2008-05-02 17:52 ` Anthony Liguori
2008-05-02 18:24 ` Jamie Lokier
0 siblings, 1 reply; 31+ messages in thread
From: Anthony Liguori @ 2008-05-02 17:52 UTC (permalink / raw)
To: qemu-devel
Jamie Lokier wrote:
> That makes sense - especially for formats like qcow and snapshots, the
> guest has very little knowledge of access timings.
>
> It's a bit like a database accessing a large file: the database tries
> to schedule and merge I/O requests internally before sending them to
> the kernel. It doesn't know anything about the layout of disk blocks
> in the file, but it can guess that nearby accesses are more likely to
> involve lower seek times than far apart accesses.
>
> There is still one reason for guests to do a little I/O scheduling,
> and that's to merge adjacent requests into fewer ops passing through
> the guest/host interface.
>
FWIW, in the process of optimizing the kernel driver for virtio-blk,
I've found that using a no-op scheduler helps a fair bit. As long as
you're using a reasonably sized ring, the back-end can merge adjacent
requests. That helps a fair bit too.
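For illustration only (this is not the virtio-blk code; the request
struct is invented, and data buffers, read/write direction and barriers
are ignored), the general shape of merging requests that turn out to be
contiguous on disk looks roughly like:

/* Sketch: coalesce requests that touch contiguous sectors into one. */
#include <stdint.h>
#include <stdlib.h>

struct req {
    uint64_t sector;     /* first 512-byte sector */
    uint32_t nsectors;   /* length in sectors */
};

static int cmp_sector(const void *a, const void *b)
{
    const struct req *ra = a, *rb = b;
    return (ra->sector > rb->sector) - (ra->sector < rb->sector);
}

/* Sort by start sector and merge in place; returns the new request count. */
static size_t merge_adjacent(struct req *reqs, size_t n)
{
    size_t i, out = 0;

    if (n == 0)
        return 0;
    qsort(reqs, n, sizeof(*reqs), cmp_sector);
    for (i = 1; i < n; i++) {
        if (reqs[i].sector == reqs[out].sector + reqs[out].nsectors)
            reqs[out].nsectors += reqs[i].nsectors;   /* contiguous: extend */
        else
            reqs[++out] = reqs[i];                    /* gap: keep separate */
    }
    return out + 1;
}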
Regards,
Anthony Liguori
> -- Jamie
>
>
>
* Re: [kvm-devel] [Qemu-devel] Re: [PATCH 1/3] Refactor AIO interface to allow other AIO implementations
2008-05-02 17:52 ` Anthony Liguori
@ 2008-05-02 18:24 ` Jamie Lokier
0 siblings, 0 replies; 31+ messages in thread
From: Jamie Lokier @ 2008-05-02 18:24 UTC (permalink / raw)
To: qemu-devel
Anthony Liguori wrote:
> FWIW, in the process of optimizing the kernel driver for virtio-blk,
> I've found that using a no-op scheduler helps a fair bit. As long as
> you're using a reasonably sized ring, the back-end can merge adjacent
> requests. This also helps a fair bit too.
By 'back-end' do you mean the virtio-blk driver itself merges
requests?
I'm assuming yes, for these questions: Does it honour barriers
properly? Isn't that the job of the guest elevator? Does it suggest
the various guest I/O schedulers could be enhanced to operate directly
on requests in flight in the virtio ring?
Oh, one last thing. Is it plausible to write a Windows driver which
uses the virtio-blk interface, to get the best performing
Windows-in-KVM? (It's nice to be able to say Linux makes a good host
for Windows guests too.)
Thanks,
- Jamie
Thread overview: 31+ messages
2008-04-17 19:26 [Qemu-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations Anthony Liguori
2008-04-17 19:26 ` [Qemu-devel] [PATCH 2/3] Split out posix-aio code Anthony Liguori
2008-04-17 19:26 ` [Qemu-devel] [PATCH 3/3] Implement linux-aio backend Anthony Liguori
2008-04-18 15:09 ` [Qemu-devel] " Marcelo Tosatti
2008-04-18 15:18 ` Anthony Liguori
2008-04-18 17:46 ` Marcelo Tosatti
2008-04-17 19:38 ` [Qemu-devel] Re: [kvm-devel] [PATCH 1/3] Refactor AIO interface to allow other AIO implementations Daniel P. Berrange
2008-04-17 19:41 ` Anthony Liguori
2008-04-17 20:00 ` Daniel P. Berrange
2008-04-17 20:05 ` Anthony Liguori
2008-04-18 12:43 ` Jamie Lokier
2008-04-18 15:23 ` Anthony Liguori
2008-04-18 16:22 ` Jamie Lokier
2008-04-18 16:32 ` [kvm-devel] [Qemu-devel] " Avi Kivity
2008-04-20 15:49 ` Jamie Lokier
2008-04-20 18:43 ` Avi Kivity
2008-04-20 23:39 ` Jamie Lokier
2008-04-21 6:39 ` Avi Kivity
2008-04-21 12:10 ` Jamie Lokier
2008-04-22 8:10 ` Avi Kivity
2008-04-22 14:28 ` Jamie Lokier
2008-04-22 14:53 ` Anthony Liguori
2008-04-22 15:05 ` Avi Kivity
2008-04-22 15:23 ` Jamie Lokier
2008-04-22 15:12 ` Jamie Lokier
2008-04-22 15:03 ` Avi Kivity
2008-04-22 15:36 ` Jamie Lokier
2008-05-02 16:37 ` Antonio Vargas
2008-05-02 17:18 ` Jamie Lokier
2008-05-02 17:52 ` Anthony Liguori
2008-05-02 18:24 ` Jamie Lokier