From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from mailman by lists.gnu.org with tmda-scanned (Exim 4.43) id 1Khugw-000362-8t for qemu-devel@nongnu.org; Mon, 22 Sep 2008 19:19:42 -0400 Received: from exim by lists.gnu.org with spam-scanned (Exim 4.43) id 1Khugv-00035X-ET for qemu-devel@nongnu.org; Mon, 22 Sep 2008 19:19:41 -0400 Received: from [199.232.76.173] (port=44581 helo=monty-python.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1Khugv-00035R-B5 for qemu-devel@nongnu.org; Mon, 22 Sep 2008 19:19:41 -0400 Received: from e3.ny.us.ibm.com ([32.97.182.143]:37816) by monty-python.gnu.org with esmtps (TLS-1.0:DHE_RSA_AES_256_CBC_SHA1:32) (Exim 4.60) (envelope-from ) id 1Khugu-00058V-Vr for qemu-devel@nongnu.org; Mon, 22 Sep 2008 19:19:41 -0400 Received: from d01relay02.pok.ibm.com (d01relay02.pok.ibm.com [9.56.227.234]) by e3.ny.us.ibm.com (8.13.8/8.13.8) with ESMTP id m8MNHgxI028209 for ; Mon, 22 Sep 2008 19:17:42 -0400 Received: from d01av04.pok.ibm.com (d01av04.pok.ibm.com [9.56.224.64]) by d01relay02.pok.ibm.com (8.13.8/8.13.8/NCO v9.1) with ESMTP id m8MNHato281636 for ; Mon, 22 Sep 2008 19:17:36 -0400 Received: from d01av04.pok.ibm.com (loopback [127.0.0.1]) by d01av04.pok.ibm.com (8.12.11.20060308/8.13.3) with ESMTP id m8MNHaUj031674 for ; Mon, 22 Sep 2008 19:17:36 -0400 From: Ryan Harper Date: Mon, 22 Sep 2008 18:17:34 -0500 Message-Id: <1222125454-21744-4-git-send-email-ryanh@us.ibm.com> In-Reply-To: <1222125454-21744-1-git-send-email-ryanh@us.ibm.com> References: <1222125454-21744-1-git-send-email-ryanh@us.ibm.com> Subject: [Qemu-devel] [PATCH 2/3] Move aio implementation out of raw block driver Reply-To: qemu-devel@nongnu.org List-Id: qemu-devel.nongnu.org List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: qemu-devel@nongnu.org Cc: Ryan Harper This patch adds a linux aio raw block driver implementation. 
If a raw block device is opened with cache=off (O_DIRECT), we can use Linux
AIO to submit I/O to and from the block device.  Linux AIO allows multiple
outstanding requests to be in flight against the I/O device, potentially
providing higher I/O throughput.  This implementation uses eventfd for
completion notification.  Block devices with caching enabled keep using
POSIX AIO, since Linux AIO falls back to synchronous I/O when used without
O_DIRECT[1].

Signed-off-by: Ryan Harper

1. http://lse.sourceforge.net/io/aio.html

diff --git a/Makefile b/Makefile
index 18477ba..92ca5d9 100644
--- a/Makefile
+++ b/Makefile
@@ -60,7 +60,7 @@ BLOCK_OBJS += block-raw-posix.o
 endif
 
 ifdef CONFIG_AIO
-BLOCK_OBJS += compatfd.o aio-posix.o
+BLOCK_OBJS += compatfd.o aio-posix.o aio-linux.o
 endif
 
 ######################################################################
diff --git a/Makefile.target b/Makefile.target
index 4c6b3d5..599fa8a 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -482,7 +482,7 @@ OBJS+=block-raw-posix.o
 endif
 
 ifdef CONFIG_AIO
-OBJS+=compatfd.o aio-posix.o
+OBJS+=compatfd.o aio-posix.o aio-linux.o
 endif
 
 LIBS+=-lz
diff --git a/aio-linux.c b/aio-linux.c
new file mode 100644
index 0000000..0043fd1
--- /dev/null
+++ b/aio-linux.c
@@ -0,0 +1,377 @@
+/*
+ * QEMU Linux AIO implementation for Block Raw devices
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ *  Anthony Liguori
+ *  Ryan Harper
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+/***********************************************************/
+/* Unix AIO using LINUX AIO */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "block.h"
+#include "block-aio.h"
+#include "compatfd.h"
+#include <string.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/aio_abi.h>
+
+#define MAX_LINUX_AIO_EVENTS 256
+
+/*
+ * Thin wrappers around the raw eventfd and Linux AIO system calls.  Like
+ * syscall(2), they return -1 and set errno on failure.  They are static and
+ * carry a sys_ prefix so they cannot clash with the eventfd() and io_*()
+ * functions exported by libc and libaio, whose prototypes and error
+ * conventions differ.
+ */
+static int sys_eventfd(unsigned int initval)
+{
+    return syscall(SYS_eventfd, initval);
+}
+
+static int sys_io_setup(unsigned nr_reqs, aio_context_t *ctx_id)
+{
+    return syscall(SYS_io_setup, nr_reqs, ctx_id);
+}
+
+static int sys_io_destroy(aio_context_t ctx_id)
+{
+    return syscall(SYS_io_destroy, ctx_id);
+}
+
+static int sys_io_getevents(aio_context_t ctx_id, long min_nr, long nr,
+                            struct io_event *events, struct timespec *timeout)
+{
+    return syscall(SYS_io_getevents, ctx_id, min_nr, nr, events, timeout);
+}
+
+static int sys_io_submit(aio_context_t ctx_id, long nr, struct iocb **iocb)
+{
+    return syscall(SYS_io_submit, ctx_id, nr, iocb);
+}
+
+static int sys_io_cancel(aio_context_t ctx_id, struct iocb *iocb,
+                         struct io_event *result)
+{
+    return syscall(SYS_io_cancel, ctx_id, iocb, result);
+}
+
+typedef AIOState LinuxAioState;
+
+static int aio_efd;
+static aio_context_t aio_ctxt_id;
+static int outstanding_requests;
+static LinuxAioState *linux_aio_state;
+
+/*
+ * Completion events fetched from the kernel but not yet processed.  The
+ * buffer and its cursor live at file scope so that a nested call to
+ * la_process_events() (a completion callback may cancel another request)
+ * carries on where the outer call stopped instead of waiting for an event
+ * that has already been dequeued.
+ */
+static struct io_event la_events[MAX_LINUX_AIO_EVENTS];
+static int la_event_idx;
+static int la_event_max;
+
+/* Take acb off the list of in-flight requests, if it is on it. */
+static void la_unlink(RawAIOCB *acb)
+{
+    RawAIOCB **pacb = &linux_aio_state->first_aio;
+
+    while (*pacb != NULL) {
+        if (*pacb == acb) {
+            *pacb = acb->next;
+            acb->next = NULL;
+            return;
+        }
+        pacb = &(*pacb)->next;
+    }
+}
+
+/*
+ * Account for a request the kernel has finished with.  A request marked
+ * -ECANCELED is handed back to la_cancel(), which is waiting for it and
+ * releases it; any other request has its callback run and is released here.
+ */
+static void la_retire(RawAIOCB *acb, int res)
+{
+    la_unlink(acb);
+    /* the count the callback sees must no longer include this request */
+    outstanding_requests--;
+
+    if (acb->ret == -ECANCELED) {
+        acb->ret = 0;
+        return;
+    }
+
+    acb->common.cb(acb->common.opaque, res);
+    qemu_aio_release(acb);
+}
+
+/*
+ * Retire every buffered completion event, fetching a new batch from the
+ * kernel first if the buffer is empty.  min_nr and timeout are passed to
+ * io_getevents() unchanged.  Returns the number of events fetched by this
+ * call (0 if it only drained the buffer), or -1 with errno set.
+ */
+static int la_process_events(long min_nr, struct timespec *timeout)
+{
+    int fetched = 0;
+
+    if (la_event_idx == la_event_max) {
+        do {
+            fetched = sys_io_getevents(aio_ctxt_id, min_nr,
+                                       ARRAY_SIZE(la_events), la_events,
+                                       timeout);
+        } while (fetched == -1 && errno == EINTR);
+
+        if (fetched < 0)
+            return -1;
+
+        la_event_idx = 0;
+        la_event_max = fetched;
+    }
+
+    while (la_event_idx < la_event_max) {
+        struct io_event *ev = &la_events[la_event_idx++];
+        RawAIOCB *acb = (RawAIOCB *)(unsigned long)ev->data;
+        int res = ev->res;
+
+        /* io_event.res is a byte count on success; report that as 0 */
+        if (res > 0)
+            res = 0;
+
+        la_retire(acb, res);
+    }
+
+    return fetched;
+}
+
+/*
+ * Queue one read or write of nb_sectors 512-byte sectors starting at
+ * sector_num on fd.  Completion is signalled through aio_efd and reported
+ * to cb by la_completion().  Returns the AIOCB, or NULL if none could be
+ * allocated or the kernel rejected the request.
+ */
+static RawAIOCB *la_submit(BlockDriverState *bs, int fd,
+                           int64_t sector_num, uint8_t *buf,
+                           int nb_sectors, int write,
+                           BlockDriverCompletionFunc *cb, void *opaque)
+{
+    RawAIOCB *acb;
+    struct iocb *iocbs[1];
+    int err;
+
+    acb = qemu_aio_get(bs, cb, opaque);
+    if (!acb) {
+        fprintf(stderr, "%s: qemu_aio_get returned NULL!?!\n", __FUNCTION__);
+        return NULL;
+    }
+
+    /*
+     * Start from a zeroed iocb: the kernel rejects requests whose reserved
+     * fields are non-zero, and the AIOCB may not come back zeroed.
+     */
+    memset(&acb->linux_aiocb, 0, sizeof(acb->linux_aiocb));
+    acb->linux_aiocb.aio_lio_opcode = write ? IOCB_CMD_PWRITE : IOCB_CMD_PREAD;
+    acb->linux_aiocb.aio_data = (unsigned long)acb;
+    acb->linux_aiocb.aio_fildes = fd;
+    acb->linux_aiocb.aio_flags = IOCB_FLAG_RESFD;
+    acb->linux_aiocb.aio_resfd = aio_efd;
+    acb->linux_aiocb.aio_buf = (unsigned long)buf;
+    /* widen before multiplying so large requests cannot overflow int */
+    acb->linux_aiocb.aio_nbytes = (uint64_t)nb_sectors * 512;
+    acb->linux_aiocb.aio_offset = sector_num * 512;
+    acb->ret = -EINPROGRESS;
+
+    iocbs[0] = &acb->linux_aiocb;
+
+    do {
+        err = sys_io_submit(aio_ctxt_id, 1, iocbs);
+    } while (err == -1 && errno == EINTR);
+
+    if (err != 1) {
+        qemu_aio_release(acb);
+        return NULL;
+    }
+
+    /* only track the request once the kernel has actually accepted it */
+    acb->next = linux_aio_state->first_aio;
+    linux_aio_state->first_aio = acb;
+    outstanding_requests++;
+
+    return acb;
+}
+
+/* Flush handler for the eventfd: reports how many requests are in flight. */
+static int la_flush(void)
+{
+    return outstanding_requests;
+}
+
+/*
+ * Cancel a request.  On return the request is no longer in flight, its
+ * callback will not be run and the AIOCB has been released.
+ */
+static void la_cancel(BlockDriverAIOCB *baiocb)
+{
+    RawAIOCB *acb = (void *)baiocb;
+    struct io_event result;
+    int err;
+
+    do {
+        err = sys_io_cancel(aio_ctxt_id, &acb->linux_aiocb, &result);
+    } while (err == -1 && errno == EINTR);
+
+    if (err == 0) {
+        /* cancelled outright: no completion event will be queued for it */
+        la_unlink(acb);
+        outstanding_requests--;
+        qemu_aio_release(acb);
+        return;
+    }
+
+    /*
+     * The kernel could not cancel the request, so it still owns the iocb
+     * and the data buffer.  Releasing the AIOCB now would let la_completion()
+     * touch it after it has been released, so wait for the request to
+     * finish instead; la_retire() clears the marker without running the
+     * callback.
+     */
+    acb->ret = -ECANCELED;
+    while (acb->ret == -ECANCELED) {
+        if (la_process_events(1, NULL) < 0) {
+            fprintf(stderr, "%s: io_getevents failed: %s\n", __FUNCTION__,
+                    strerror(errno));
+            exit(1);
+        }
+    }
+
+    qemu_aio_release(acb);
+}
+
+/*
+ * Read handler for the eventfd: consume the completion count and retire
+ * whatever requests the kernel has finished.
+ */
+static void la_completion(void *opaque)
+{
+    struct timespec ts = {0, 0};
+    uint64_t count;
+    long min_nr;
+    int ret;
+
+    BLPRINTF("%s ->\n", __FUNCTION__);
+    do {
+        ret = read(aio_efd, &count, sizeof(count));
+        if (ret == -1 && errno == EAGAIN) {
+            BLPRINTF("linux: got EAGAIN\n");
+            return;
+        }
+    } while (ret == -1 && errno == EINTR);
+
+    if (ret != (int)sizeof(count)) {
+        BLPRINTF("bad read from eventfd (ret=%d errno=%d)\n", ret, errno);
+        exit(1);
+    }
+
+    /* io_getevents() rejects min_nr > nr with EINVAL, so cap the request */
+    min_nr = count < MAX_LINUX_AIO_EVENTS ? (long)count : MAX_LINUX_AIO_EVENTS;
+
+    BLPRINTF("%s: calling io_getevents, min=%ld events\n", __FUNCTION__,
+             min_nr);
+    do {
+        /* the zero timeout makes this a poll; a full batch may hide more */
+        ret = la_process_events(min_nr, &ts);
+    } while (ret == MAX_LINUX_AIO_EVENTS);
+
+    if (ret < 0) {
+        BLPRINTF("io_getevents failed: %d %m\n", ret);
+        exit(1);
+    }
+    BLPRINTF("%s <-\n", __FUNCTION__);
+}
+
+/*
+ * One-time setup of the eventfd and the kernel AIO context.  Returns 0 on
+ * success (or if already initialised) and a negative errno value on failure,
+ * in which case nothing is left allocated.
+ */
+static int la_init(void)
+{
+    LinuxAioState *s;
+    int ret;
+
+    if (linux_aio_state)
+        return 0;
+
+    s = qemu_malloc(sizeof(*s));
+    if (s == NULL)
+        return -ENOMEM;
+
+    aio_efd = sys_eventfd(0);
+    if (aio_efd == -1) {
+        ret = -errno;
+        goto fail_free;
+    }
+
+    /* io_setup() insists on a zeroed context handle */
+    aio_ctxt_id = 0;
+    if (sys_io_setup(MAX_LINUX_AIO_EVENTS, &aio_ctxt_id) == -1) {
+        ret = -errno;
+        goto fail_close;
+    }
+
+    /* switch to non-blocking eventfd mode */
+    if (fcntl(aio_efd, F_SETFL, O_NONBLOCK) == -1) {
+        ret = -errno;
+        goto fail_destroy;
+    }
+
+    s->first_aio = NULL;
+    s->fd = aio_efd;
+
+    /* la_completion handles reads; la_flush is passed as the flush hook */
+    qemu_aio_set_fd_handler(aio_efd, la_completion, NULL, la_flush, NULL);
+
+    linux_aio_state = s;
+
+    return 0;
+
+fail_destroy:
+    sys_io_destroy(aio_ctxt_id);
+fail_close:
+    close(aio_efd);
+fail_free:
+    qemu_free(s);
+    return ret;
+}
+
+static AIODriver linux_aio_drv = {
+    .name = "linux",
+    .submit = la_submit,
+    .cancel = la_cancel,
+    .flush = la_flush,
+};
+
+/*
+ * Returns the Linux AIO driver, initialising it on first use, or NULL if
+ * the eventfd or the AIO context could not be set up.
+ */
+AIODriver *linux_aio_init(void)
+{
+    if (la_init() != 0)
+        return NULL;
+    return &linux_aio_drv;
+}
diff --git a/block-aio.h b/block-aio.h
index b8597d0..b1492d9 100644
--- a/block-aio.h
+++ b/block-aio.h
@@ -21,6 +21,7 @@
 #include "qemu-aio.h"
 #ifdef CONFIG_AIO
 #include <aio.h>
+#include <linux/aio_abi.h>
 #endif
 
 //#define DEBUG_BLOCK_AIO
@@ -33,6 +34,7 @@
 typedef struct RawAIOCB {
     BlockDriverAIOCB common;
     struct aiocb posix_aiocb;
+    struct iocb linux_aiocb;
     struct RawAIOCB *next;
     int ret;
 } RawAIOCB;
@@ -75,4 +77,5 @@ typedef struct AIOState
 
 
 AIODriver* posix_aio_init(void);
+AIODriver* linux_aio_init(void);
 #endif /* QEMU_BLOCK_AIO_H */
diff --git a/block-raw-posix.c b/block-raw-posix.c
index cab7094..80034ac 100644
--- a/block-raw-posix.c
+++ b/block-raw-posix.c
@@ -125,8 +125,13 @@ static int raw_open(BlockDriverState *bs, const char *filename, int flags)
         }
     }
 #endif
-    /* init aio driver for this block device */
-    s->aio_dvr = posix_aio_init();
+    /* init aio driver for this block device, linux if O_DIRECT is enabled */
+    s->aio_dvr = NULL;
+    if (flags & BDRV_O_DIRECT)
+        s->aio_dvr = linux_aio_init();
+    /* use posix aio for cached access or if linux aio is unavailable */
+    if (s->aio_dvr == NULL)
+        s->aio_dvr = posix_aio_init();
     return 0;
 }
 
@@ -756,8 +761,13 @@ static int hdev_open(BlockDriverState *bs, const char *filename, int flags)
         s->fd_media_changed = 1;
     }
 #endif
-    /* init aio driver for this block device */
-    s->aio_dvr = posix_aio_init();
+    /* init aio driver for this block device, linux if O_DIRECT is enabled */
+    s->aio_dvr = NULL;
+    if (flags & BDRV_O_DIRECT)
+        s->aio_dvr = linux_aio_init();
+    /* use posix aio for cached access or if linux aio is unavailable */
+    if (s->aio_dvr == NULL)
+        s->aio_dvr = posix_aio_init();
     return 0;
 }
 