From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from mailman by lists.gnu.org with tmda-scanned (Exim 4.43) id 1JyQ4v-0000Fg-3E for qemu-devel@nongnu.org; Tue, 20 May 2008 07:32:25 -0400 Received: from exim by lists.gnu.org with spam-scanned (Exim 4.43) id 1JyQ4r-0000DB-0i for qemu-devel@nongnu.org; Tue, 20 May 2008 07:32:24 -0400 Received: from [199.232.76.173] (port=50708 helo=monty-python.gnu.org) by lists.gnu.org with esmtp (Exim 4.43) id 1JyQ4q-0000D3-42 for qemu-devel@nongnu.org; Tue, 20 May 2008 07:32:20 -0400 Received: from ecfrec.frec.bull.fr ([129.183.4.8]:58443) by monty-python.gnu.org with esmtp (Exim 4.60) (envelope-from ) id 1JyQ4p-0000BX-I6 for qemu-devel@nongnu.org; Tue, 20 May 2008 07:32:20 -0400 From: Laurent Vivier Content-Type: multipart/mixed; boundary="=-J9sPTdAR3sDRjb2Xg48P" Date: Tue, 20 May 2008 13:32:06 +0200 Message-Id: <1211283126.4314.70.camel@frecb07144> Mime-Version: 1.0 Subject: [Qemu-devel] [PATCH][v2] Align file accesses with cache=off (O_DIRECT) Reply-To: qemu-devel@nongnu.org List-Id: qemu-devel.nongnu.org List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: "qemu-devel@nongnu.org" Cc: Blue Swirl , Kevin Wolf --=-J9sPTdAR3sDRjb2Xg48P Content-Type: text/plain Content-Transfer-Encoding: 7bit This patch is the original patch from Kevin Wolf modified according comments given on the qemu-devel Mailing list. Original Description: "In December a patch was applied which introduced the cache=off option to -drive. When using this option files are opened with the O_DIRECT flag. This means that all accesses have to be aligned. The patch made a couple of changes in this respect, still in other places they are missing (e.g. you can't use cache=off with qcow(2) files). This patch implements wrappers for raw_pread and raw_pwrite which align all file accesses and make qcow(2) work with cache=off. This method might not be the most performant one (compared to fixing qcow, qcow2 and everything else that might be using unaligned accesses), but unaligned accesses don't happen that frequently and with this patch really all image accesses should be covered." Modifications: - Kevin has modified his patch to call the read/write AIO callback outside the aio_read/write - I've modified the buffer management to allocate buffer on open and not on each read/write. As mentioned by Kevin, this patch is really needed to be able to manage all disk images with "cache=off" option, so pleeeaaaase, apply (or comment...) A la GIT: Signed-off-by: Kevin Wolf Signed-off-by: Laurent Vivier -- ------------- Laurent.Vivier@bull.net --------------- "The best way to predict the future is to invent it." - Alan Kay --=-J9sPTdAR3sDRjb2Xg48P Content-Disposition: attachment; filename=align-odirect-accesses-v2.patch Content-Type: text/x-vhdl; name=align-odirect-accesses-v2.patch; charset=UTF-8 Content-Transfer-Encoding: 7bit --- block-raw-posix.c | 240 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 238 insertions(+), 2 deletions(-) Index: qemu/block-raw-posix.c =================================================================== --- qemu.orig/block-raw-posix.c 2008-05-16 17:08:13.000000000 +0200 +++ qemu/block-raw-posix.c 2008-05-20 10:31:59.000000000 +0200 @@ -70,6 +70,8 @@ #define FTYPE_CD 1 #define FTYPE_FD 2 +#define ALIGNED_BUFFER_SIZE (32 * 512) + /* if the FD is not accessed during that time (in ms), we try to reopen it to see if the disk has been changed */ #define FD_OPEN_TIMEOUT 1000 @@ -86,6 +88,9 @@ typedef struct BDRVRawState { int fd_got_error; int fd_media_changed; #endif +#if defined(O_DIRECT) && !defined(QEMU_IMG) + uint8_t* aligned_buf; +#endif } BDRVRawState; static int fd_open(BlockDriverState *bs); @@ -121,6 +126,17 @@ static int raw_open(BlockDriverState *bs return ret; } s->fd = fd; +#if defined(O_DIRECT) && !defined(QEMU_IMG) + s->aligned_buf = NULL; + if (flags & BDRV_O_DIRECT) { + s->aligned_buf = qemu_memalign(512, ALIGNED_BUFFER_SIZE); + if (s->aligned_buf == NULL) { + ret = -errno; + close(fd); + return ret; + } + } +#endif return 0; } @@ -141,7 +157,14 @@ static int raw_open(BlockDriverState *bs #endif */ -static int raw_pread(BlockDriverState *bs, int64_t offset, +/* + * offset and count are in bytes, but must be multiples of 512 for files + * opened with O_DIRECT. buf must be aligned to 512 bytes then. + * + * This function may be called without alignment if the caller ensures + * that O_DIRECT is not in effect. + */ +static int raw_pread_aligned(BlockDriverState *bs, int64_t offset, uint8_t *buf, int count) { BDRVRawState *s = bs->opaque; @@ -194,7 +217,14 @@ label__raw_read__success: return ret; } -static int raw_pwrite(BlockDriverState *bs, int64_t offset, +/* + * offset and count are in bytes, but must be multiples of 512 for files + * opened with O_DIRECT. buf must be aligned to 512 bytes then. + * + * This function may be called without alignment if the caller ensures + * that O_DIRECT is not in effect. + */ +static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset, const uint8_t *buf, int count) { BDRVRawState *s = bs->opaque; @@ -230,6 +260,164 @@ label__raw_write__success: return ret; } + +#if defined(O_DIRECT) && !defined(QEMU_IMG) +/* + * offset and count are in bytes and possibly not aligned. For files opened + * with O_DIRECT, necessary alignments are ensured before calling + * raw_pread_aligned to do the actual read. + */ +static int raw_pread(BlockDriverState *bs, int64_t offset, + uint8_t *buf, int count) +{ + BDRVRawState *s = bs->opaque; + int size, ret, shift, sum; + + sum = 0; + + if (s->aligned_buf != NULL) { + + if (offset & 0x1ff) { + /* align offset on a 512 bytes boundary */ + + shift = offset & 0x1ff; + size = (shift + count + 0x1ff) & ~0x1ff; + if (size > ALIGNED_BUFFER_SIZE) + size = ALIGNED_BUFFER_SIZE; + ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, size); + if (ret < 0) + return ret; + + size = 512 - shift; + if (size > count) + size = count; + memcpy(buf, s->aligned_buf + shift, size); + + buf += size; + offset += size; + count -= size; + sum += size; + + if (count == 0) + return sum; + } + if (count & 0x1ff || (uintptr_t) buf & 0x1ff) { + + /* read on aligned buffer */ + + while (count) { + + size = (count + 0x1ff) & ~0x1ff; + if (size > ALIGNED_BUFFER_SIZE) + size = ALIGNED_BUFFER_SIZE; + + ret = raw_pread_aligned(bs, offset, s->aligned_buf, size); + if (ret < 0) + return ret; + + size = ret; + if (size > count) + size = count; + + memcpy(buf, s->aligned_buf, size); + + buf += size; + offset += size; + count -= size; + sum += size; + } + + return sum; + } + } + + return raw_pread_aligned(bs, offset, buf, count) + sum; +} + +/* + * offset and count are in bytes and possibly not aligned. For files opened + * with O_DIRECT, necessary alignments are ensured before calling + * raw_pwrite_aligned to do the actual write. + */ +static int raw_pwrite(BlockDriverState *bs, int64_t offset, + const uint8_t *buf, int count) +{ + BDRVRawState *s = bs->opaque; + int size, ret, shift, sum; + + sum = 0; + + if (s->aligned_buf != NULL) { + + if (offset & 0x1ff) { + /* align offset on a 512 bytes boundary */ + shift = offset & 0x1ff; + ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, 512); + if (ret < 0) + return ret; + + size = 512 - shift; + if (size > count) + size = count; + memcpy(s->aligned_buf + shift, buf, size); + + ret = raw_pwrite_aligned(bs, offset - shift, s->aligned_buf, 512); + if (ret < 0) + return ret; + + buf += size; + offset += size; + count -= size; + sum += size; + + if (count == 0) + return sum; + } + if (count & 0x1ff || (uintptr_t) buf & 0x1ff) { + + while ((size = (count & ~0x1ff)) != 0) { + + if (size > ALIGNED_BUFFER_SIZE) + size = ALIGNED_BUFFER_SIZE; + + memcpy(s->aligned_buf, buf, size); + + ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, size); + if (ret < 0) + return ret; + + buf += ret; + offset += ret; + count -= ret; + sum += ret; + } + /* here, count < 512 because (count & ~0x1ff) == 0 */ + if (count) { + ret = raw_pread_aligned(bs, offset, s->aligned_buf, 512); + if (ret < 0) + return ret; + memcpy(s->aligned_buf, buf, count); + + ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, 512); + if (ret < 0) + return ret; + if (count < ret) + ret = count; + + sum += ret; + } + return sum; + } + } + return raw_pwrite_aligned(bs, offset, buf, count) + sum; +} + +#else +#define raw_pread raw_pread_aligned +#define raw_pwrite raw_pwrite_aligned +#endif + + /***********************************************************/ /* Unix AIO using POSIX AIO */ @@ -237,6 +425,7 @@ typedef struct RawAIOCB { BlockDriverAIOCB common; struct aiocb aiocb; struct RawAIOCB *next; + int ret; } RawAIOCB; static int aio_sig_num = SIGUSR2; @@ -397,12 +586,38 @@ static RawAIOCB *raw_aio_setup(BlockDriv return acb; } +#ifndef QEMU_IMG +static void raw_aio_em_cb(void* opaque) +{ + RawAIOCB *acb = opaque; + acb->common.cb(acb->common.opaque, acb->ret); + qemu_aio_release(acb); +} +#endif + static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors, BlockDriverCompletionFunc *cb, void *opaque) { RawAIOCB *acb; + /* + * If O_DIRECT is used and the buffer is not aligned fall back + * to synchronous IO. + */ +#if defined(O_DIRECT) && !defined(QEMU_IMG) + BDRVRawState *s = bs->opaque; + + if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) { + QEMUBH *bh; + acb = qemu_aio_get(bs, cb, opaque); + acb->ret = raw_pread(bs, 512 * sector_num, buf, 512 * nb_sectors); + bh = qemu_bh_new(raw_aio_em_cb, acb); + qemu_bh_schedule(bh); + return &acb->common; + } +#endif + acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque); if (!acb) return NULL; @@ -419,6 +634,23 @@ static BlockDriverAIOCB *raw_aio_write(B { RawAIOCB *acb; + /* + * If O_DIRECT is used and the buffer is not aligned fall back + * to synchronous IO. + */ +#if defined(O_DIRECT) && !defined(QEMU_IMG) + BDRVRawState *s = bs->opaque; + + if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) { + QEMUBH *bh; + acb = qemu_aio_get(bs, cb, opaque); + acb->ret = raw_pwrite(bs, 512 * sector_num, buf, 512 * nb_sectors); + bh = qemu_bh_new(raw_aio_em_cb, acb); + qemu_bh_schedule(bh); + return &acb->common; + } +#endif + acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque); if (!acb) return NULL; @@ -462,6 +694,10 @@ static void raw_close(BlockDriverState * if (s->fd >= 0) { close(s->fd); s->fd = -1; +#if defined(O_DIRECT) && !defined(QEMU_IMG) + if (s->aligned_buf != NULL) + qemu_free(s->aligned_buf); +#endif } } --=-J9sPTdAR3sDRjb2Xg48P--