From: Ryan Harper <ryanh@us.ibm.com>
To: qemu-devel@nongnu.org
Cc: Ryan Harper <ryanh@us.ibm.com>
Subject: [Qemu-devel] [PATCH 2/3] Move aio implementation out of raw block driver
Date: Mon, 22 Sep 2008 18:17:34 -0500 [thread overview]
Message-ID: <1222125454-21744-4-git-send-email-ryanh@us.ibm.com> (raw)
In-Reply-To: <1222125454-21744-1-git-send-email-ryanh@us.ibm.com>
This patch adds a Linux AIO implementation for the raw block driver. If a raw
block device is opened with cache=off (O_DIRECT), then we can use Linux AIO to
submit I/O to/from the block device. Using Linux AIO allows multiple
outstanding requests to be in flight against the I/O device, potentially
providing higher I/O throughput. This implementation uses eventfd for
completion notification.
Block devices with cache enabled will use POSIX AIO, since Linux AIO will
fall back to synchronous I/O when used without O_DIRECT[1].
Signed-off-by: Ryan Harper <ryanh@us.ibm.com>
1. http://lse.sourceforge.net/io/aio.html
diff --git a/Makefile b/Makefile
index 18477ba..92ca5d9 100644
--- a/Makefile
+++ b/Makefile
@@ -60,7 +60,7 @@ BLOCK_OBJS += block-raw-posix.o
endif
ifdef CONFIG_AIO
-BLOCK_OBJS += compatfd.o aio-posix.o
+BLOCK_OBJS += compatfd.o aio-posix.o aio-linux.o
endif
######################################################################
diff --git a/Makefile.target b/Makefile.target
index 4c6b3d5..599fa8a 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -482,7 +482,7 @@ OBJS+=block-raw-posix.o
endif
ifdef CONFIG_AIO
-OBJS+=compatfd.o aio-posix.o
+OBJS+=compatfd.o aio-posix.o aio-linux.o
endif
LIBS+=-lz
diff --git a/aio-linux.c b/aio-linux.c
new file mode 100644
index 0000000..0043fd1
--- /dev/null
+++ b/aio-linux.c
@@ -0,0 +1,225 @@
+/*
+ * QEMU Linux AIO implementation for Block Raw devices
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Ryan Harper <ryanh@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+/***********************************************************/
+/* Unix AIO using LINUX AIO */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "block.h"
+#include "block-aio.h"
+#include "compatfd.h"
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+#include <linux/aio_abi.h>
+
+#define MAX_LINUX_AIO_EVENTS 256
+
+/* Invoke SYS_eventfd directly through syscall(2), passing initval as the
+ * only argument, and hand back the result narrowed to int. */
+int eventfd(unsigned int initval)
+{
+    long rc = syscall(SYS_eventfd, initval);
+
+    return rc;
+}
+
+/* Invoke SYS_io_setup through syscall(2): nr_reqs and the context-id
+ * pointer are forwarded unchanged; the result is narrowed to int. */
+int io_setup(unsigned nr_reqs, aio_context_t *ctx_id)
+{
+    long rc = syscall(SYS_io_setup, nr_reqs, ctx_id);
+
+    return rc;
+}
+
+/* Invoke SYS_io_destroy through syscall(2) for the given context id and
+ * return the result narrowed to int. */
+int io_destroy(aio_context_t ctx_id)
+{
+    long rc = syscall(SYS_io_destroy, ctx_id);
+
+    return rc;
+}
+
+/* Invoke SYS_io_getevents through syscall(2). All five arguments are
+ * forwarded unchanged; the result is narrowed to int. */
+int io_getevents(aio_context_t ctx_id, long min_nr, long nr,
+                 struct io_event *events, struct timespec *timeout)
+{
+    long rc = syscall(SYS_io_getevents, ctx_id, min_nr, nr, events, timeout);
+
+    return rc;
+}
+
+/* Invoke SYS_io_submit through syscall(2): nr and the array of iocb
+ * pointers are forwarded unchanged; the result is narrowed to int. */
+int io_submit(aio_context_t ctx_id, long nr, struct iocb **iocb)
+{
+    long rc = syscall(SYS_io_submit, ctx_id, nr, iocb);
+
+    return rc;
+}
+
+/* Invoke SYS_io_cancel through syscall(2) for one iocb, passing the
+ * caller's io_event buffer along; the result is narrowed to int. */
+int io_cancel(aio_context_t ctx_id, struct iocb *iocb, struct io_event *result)
+{
+    long rc = syscall(SYS_io_cancel, ctx_id, iocb, result);
+
+    return rc;
+}
+
+typedef AIOState LinuxAioState; /* local alias for the shared AIOState type */
+
+static int aio_efd; /* eventfd descriptor: set in la_init(), named as aio_resfd on each iocb, read in la_completion() */
+static aio_context_t aio_ctxt_id; /* context id filled in by io_setup(); passed to io_submit/io_cancel/io_getevents */
+static int outstanding_requests; /* incremented in la_submit(), decremented in la_completion() and la_cancel(); returned by la_flush() */
+static LinuxAioState *linux_aio_state; /* NULL until la_init() has run; holds first_aio and fd */
+
+/*
+ * Queue one asynchronous read or write on fd through Linux AIO.
+ *
+ * bs, cb, opaque: handed to qemu_aio_get() to obtain the request block.
+ * fd:             descriptor the transfer is issued against.
+ * sector_num:     first sector; converted to a byte offset (* 512).
+ * buf:            data buffer for the transfer.
+ * nb_sectors:     length in sectors; converted to a byte count (* 512).
+ * write:          non-zero selects IOCB_CMD_PWRITE, zero IOCB_CMD_PREAD.
+ *
+ * Returns the request on success, or NULL when no request block could be
+ * obtained or io_submit() did not accept the request.
+ */
+static RawAIOCB *la_submit(BlockDriverState *bs, int fd,
+                           int64_t sector_num, uint8_t *buf,
+                           int nb_sectors, int write,
+                           BlockDriverCompletionFunc *cb, void *opaque)
+{
+    RawAIOCB *acb;
+    struct iocb *iocbs[1];
+    int err;
+
+    acb = qemu_aio_get(bs, cb, opaque);
+    if (!acb) {
+        fprintf(stderr, "%s: qemu_aio_get returned NULL!?!\n", __FUNCTION__);
+        return NULL;
+    }
+
+    /* Build the iocb from a compound literal so that every field not named
+     * here (aio_key, aio_reqprio, the reserved words) is zero.  Nothing in
+     * this file shows qemu_aio_get() returning zeroed memory, so stale
+     * contents could otherwise reach io_submit(). */
+    acb->linux_aiocb = (struct iocb) {
+        .aio_data       = (unsigned long)acb,   /* recovered in la_completion() */
+        .aio_lio_opcode = write ? IOCB_CMD_PWRITE : IOCB_CMD_PREAD,
+        .aio_fildes     = fd,
+        .aio_flags      = IOCB_FLAG_RESFD,      /* signal completion on aio_efd */
+        .aio_resfd      = aio_efd,
+        .aio_buf        = (unsigned long)buf,
+        /* widen before multiplying so the byte count is not computed in int */
+        .aio_nbytes     = (uint64_t)nb_sectors * 512,
+        .aio_offset     = sector_num * 512,
+    };
+
+    iocbs[0] = &acb->linux_aiocb;
+
+    do {
+        err = io_submit(aio_ctxt_id, 1, iocbs);
+    } while (err == -1 && errno == EINTR);
+
+    if (err != 1) {
+        qemu_aio_release(acb);
+        return NULL;
+    }
+
+    /* Link the request only once the kernel has accepted it, so a request
+     * released on the failure path above is never left on the list.
+     * NOTE(review): nothing in this file unlinks entries from first_aio on
+     * completion or cancel, and nothing reads the list -- confirm whether
+     * it is needed at all. */
+    acb->next = linux_aio_state->first_aio;
+    linux_aio_state->first_aio = acb;
+
+    outstanding_requests++;
+
+    return acb;
+}
+
+/* Flush hook: report the current value of the outstanding_requests
+ * counter, i.e. requests submitted but not yet completed or cancelled. */
+static int la_flush(void)
+{
+    const int pending = outstanding_requests;
+
+    return pending;
+}
+
+/*
+ * Try to cancel an in-flight request.
+ *
+ * baiocb: the request previously returned by la_submit().
+ *
+ * The request is released and uncounted only when io_cancel() reports
+ * success.  If the kernel refuses (for example because the request cannot
+ * be cancelled or has already finished), the request is left alone: its
+ * completion event will still be reaped by la_completion(), which invokes
+ * the callback, releases the request and decrements the counter.
+ * Releasing it here as well would release it twice and drive
+ * outstanding_requests negative.
+ *
+ * NOTE(review): when cancellation fails the completion callback still
+ * runs later -- confirm callers of the cancel hook tolerate that.
+ */
+static void la_cancel(BlockDriverAIOCB *baiocb)
+{
+    RawAIOCB *acb = (void *)baiocb;
+    struct io_event result;
+    int err;
+
+    do {
+        err = io_cancel(aio_ctxt_id, &acb->linux_aiocb, &result);
+    } while (err == -1 && errno == EINTR);
+
+    if (err != 0) {
+        /* not cancelled: the kernel still owns the request */
+        return;
+    }
+
+    outstanding_requests--;
+
+    qemu_aio_release(acb);
+}
+
+/*
+ * fd handler for aio_efd: reap finished requests and run their callbacks.
+ *
+ * opaque: unused.
+ *
+ * Reads the eventfd counter (returning quietly on EAGAIN), then collects
+ * completion events with a zero timeout.  For each event the request is
+ * recovered from the event's data field, its callback is invoked with 0
+ * for any positive result or the (negative) result otherwise, the request
+ * is released and outstanding_requests is decremented.  A failed eventfd
+ * read or io_getevents() call terminates the process via exit(1).
+ *
+ * NOTE(review): any positive result is reported as success, so a short
+ * transfer is not distinguished from a full one -- confirm intended.
+ */
+static void la_completion(void *opaque)
+{
+    struct io_event events[MAX_LINUX_AIO_EVENTS];
+    struct timespec ts = {0, 0};
+    uint64_t count;
+    long min_nr;
+    int i, ret;
+
+    BLPRINTF("%s ->\n", __FUNCTION__);
+    do {
+        ret = read(aio_efd, &count, sizeof(count));
+        if (ret == -1 && errno == EAGAIN) {
+            BLPRINTF("linux: got EAGAIN\n");
+            return;
+        }
+    } while (ret == -1 && errno == EINTR);
+
+    if (ret != (int)sizeof(count)) {
+        BLPRINTF("bad read from eventfd (ret=%d errno=%d)\n", ret, errno);
+        exit(1);
+    }
+
+    BLPRINTF("%s: after fd read\n", __FUNCTION__);
+    /* cast so the argument matches the %lu conversion on every target */
+    BLPRINTF("%s: calling io_getevents, min=%lu events\n", __FUNCTION__,
+             (unsigned long)count);
+
+    /* min_nr may not exceed nr, so never ask for more than one batch */
+    min_nr = count < ARRAY_SIZE(events) ? (long)count
+                                        : (long)ARRAY_SIZE(events);
+
+    /* The eventfd counter has already been drained, so keep reaping while
+     * batches come back full; otherwise events beyond the first batch
+     * would sit in the ring until some later completion signals the fd. */
+    do {
+        do {
+            ret = io_getevents(aio_ctxt_id, min_nr, ARRAY_SIZE(events),
+                               events, &ts);
+        } while (ret == -1 && errno == EINTR);
+
+        if (ret < 0) {
+            BLPRINTF("io_getevents failed: %d %m\n", ret);
+            exit(1);
+        }
+
+        for (i = 0; i < ret; i++) {
+            RawAIOCB *acb;
+            int res;
+
+            acb = (RawAIOCB *)(unsigned long)events[i].data;
+            res = events[i].res;
+
+            if (res > 0)
+                res = 0;
+
+            acb->common.cb(acb->common.opaque, res);
+            qemu_aio_release(acb);
+
+            outstanding_requests--;
+        }
+
+        /* later passes only pick up whatever else is already queued */
+        min_nr = 0;
+    } while (ret == (int)ARRAY_SIZE(events));
+    BLPRINTF("%s <-\n", __FUNCTION__);
+}
+
+/*
+ * One-time setup of the Linux AIO backend.
+ *
+ * Creates the eventfd, creates the AIO context sized for
+ * MAX_LINUX_AIO_EVENTS, switches the eventfd to non-blocking mode,
+ * allocates the state structure and registers la_completion()/la_flush()
+ * as the handlers for the eventfd.  Calling it again after a successful
+ * setup is a no-op.
+ *
+ * Returns 0 on success or a negative errno value on failure; on failure
+ * everything created so far is torn down and linux_aio_state stays NULL.
+ */
+static int la_init(void)
+{
+    LinuxAioState *s;
+    int err;
+
+    if (linux_aio_state)
+        return 0;
+
+    aio_efd = eventfd(0);
+    if (aio_efd == -1)
+        return -errno;
+
+    /* start from a zero context id before asking for a new context */
+    aio_ctxt_id = 0;
+    if (io_setup(MAX_LINUX_AIO_EVENTS, &aio_ctxt_id) == -1) {
+        err = -errno;
+        goto fail_close;
+    }
+
+    /* switch to non-blocking eventfd mode */
+    if (fcntl(aio_efd, F_SETFL, O_NONBLOCK) == -1) {
+        err = -errno;
+        goto fail_destroy;
+    }
+
+    s = qemu_malloc(sizeof(*s));
+    if (s == NULL) {
+        err = -ENOMEM;
+        goto fail_destroy;
+    }
+    s->first_aio = NULL;
+    s->fd = aio_efd;
+
+    qemu_aio_set_fd_handler(aio_efd, la_completion, NULL, la_flush, NULL);
+
+    linux_aio_state = s;
+
+    return 0;
+
+fail_destroy:
+    /* err was captured before cleanup, so these calls cannot clobber it */
+    io_destroy(aio_ctxt_id);
+fail_close:
+    close(aio_efd);
+    return err;
+}
+
+static AIODriver linux_aio_drv = { /* driver descriptor handed out by linux_aio_init() */
+ .name = "linux",
+ .submit = la_submit, /* queues one read or write via io_submit() */
+ .cancel = la_cancel, /* issues io_cancel() for a request */
+ .flush = la_flush, /* returns the outstanding request count */
+};
+
+/* Public entry point: run la_init() and return the Linux AIO driver
+ * descriptor, or NULL when la_init() reports a non-zero status. */
+AIODriver *linux_aio_init(void)
+{
+    return la_init() == 0 ? &linux_aio_drv : NULL;
+}
diff --git a/block-aio.h b/block-aio.h
index b8597d0..b1492d9 100644
--- a/block-aio.h
+++ b/block-aio.h
@@ -21,6 +21,7 @@
#include "qemu-aio.h"
#ifdef CONFIG_AIO
#include <aio.h>
+#include <linux/aio_abi.h>
#endif
//#define DEBUG_BLOCK_AIO
@@ -33,6 +34,7 @@
typedef struct RawAIOCB {
BlockDriverAIOCB common;
struct aiocb posix_aiocb;
+ struct iocb linux_aiocb;
struct RawAIOCB *next;
int ret;
} RawAIOCB;
@@ -75,4 +77,5 @@ typedef struct AIOState
AIODriver* posix_aio_init(void);
+AIODriver* linux_aio_init(void);
#endif /* QEMU_BLOCK_AIO_H */
diff --git a/block-raw-posix.c b/block-raw-posix.c
index cab7094..80034ac 100644
--- a/block-raw-posix.c
+++ b/block-raw-posix.c
@@ -125,8 +125,11 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
}
}
#endif
- /* init aio driver for this block device */
- s->aio_dvr = posix_aio_init();
+ /* init aio driver for this block device, linux if O_DIRECT is enabled */
+ if (flags & BDRV_O_DIRECT)
+ s->aio_dvr = linux_aio_init();
+ else
+ s->aio_dvr = posix_aio_init();
return 0;
}
@@ -756,8 +759,11 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
s->fd_media_changed = 1;
}
#endif
- /* init aio driver for this block device */
- s->aio_dvr = posix_aio_init();
+ /* init aio driver for this block device, linux if O_DIRECT is enabled */
+ if (flags & BDRV_O_DIRECT)
+ s->aio_dvr = linux_aio_init();
+ else
+ s->aio_dvr = posix_aio_init();
return 0;
}
next prev parent reply other threads:[~2008-09-22 23:19 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2008-09-22 23:17 [Qemu-devel] [PATCH 0/3] Refactor AIO to allow multiple AIO implementations Ryan Harper
2008-09-22 23:17 ` [Qemu-devel] [PATCH 1/3] Only call aio flush handler if set Ryan Harper
2008-09-23 2:38 ` [Qemu-devel] " Anthony Liguori
2008-09-23 14:26 ` Ryan Harper
2008-09-23 14:34 ` Anthony Liguori
2008-09-23 14:41 ` Ryan Harper
2008-09-23 14:50 ` Anthony Liguori
2008-09-22 23:17 ` [Qemu-devel] [PATCH 2/3] Move aio implementation out of raw block driver Ryan Harper
2008-09-23 1:16 ` [Qemu-devel] " Ryan Harper
2008-09-23 2:45 ` Anthony Liguori
2008-09-23 14:39 ` Ryan Harper
2008-09-23 14:40 ` Anthony Liguori
2008-09-23 14:53 ` Gerd Hoffmann
2008-09-23 16:06 ` Anthony Liguori
2008-09-23 18:04 ` Gerd Hoffmann
2008-09-23 18:28 ` Anthony Liguori
2008-09-24 22:31 ` Marcelo Tosatti
2008-09-22 23:17 ` Ryan Harper [this message]
2008-09-23 1:22 ` [Qemu-devel] [PATCH 3/3] Add linux aio implementation for raw block devices Ryan Harper
2008-09-23 3:32 ` [Qemu-devel] Re: [PATCH 0/3] Refactor AIO to allow multiple AIO implementations Anthony Liguori
2008-09-23 14:43 ` Ryan Harper
2008-09-23 14:47 ` Anthony Liguori
2008-09-23 16:09 ` Anthony Liguori
2008-09-23 10:27 ` [Qemu-devel] " Jamie Lokier
2008-10-02 22:41 ` [Qemu-devel] " john cooper
2008-10-03 13:33 ` Ryan Harper
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1222125454-21744-4-git-send-email-ryanh@us.ibm.com \
--to=ryanh@us.ibm.com \
--cc=qemu-devel@nongnu.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).