qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
@ 2008-05-20 11:32 Laurent Vivier
  2008-05-20 19:47 ` [Qemu-devel] " Anthony Liguori
  0 siblings, 1 reply; 48+ messages in thread
From: Laurent Vivier @ 2008-05-20 11:32 UTC (permalink / raw)
  To: qemu-devel@nongnu.org; +Cc: Blue Swirl, Kevin Wolf

[-- Attachment #1: Type: text/plain, Size: 1454 bytes --]

This patch is the original patch from Kevin Wolf modified according
comments given on the qemu-devel Mailing list.

Original Description:

"In December a patch was applied which introduced the cache=off option
to -drive. When using this option files are opened with the O_DIRECT
flag.
This means that all accesses have to be aligned. The patch made a couple
of changes in this respect, still in other places they are missing (e.g.
you can't use cache=off with qcow(2) files).

This patch implements wrappers for raw_pread and raw_pwrite which align
all file accesses and make qcow(2) work with cache=off. This method
might not be the most performant one (compared to fixing qcow, qcow2 and
everything else that might be using unaligned accesses), but unaligned
accesses don't happen that frequently and with this patch really all
image accesses should be covered."

Modifications:

- Kevin has modified his patch to call the read/write AIO callback
outside the aio_read/write
- I've modified the buffer management to allocate buffer on open and not
on each read/write.

As mentioned by Kevin, this patch is really needed to be able to manage
all disk images with "cache=off" option, so pleeeaaaase, apply (or
comment...)

A la GIT:

Signed-off-by: Kevin Wolf <kwolf@suse.de>
Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
-- 
------------- Laurent.Vivier@bull.net ---------------
"The best way to predict the future is to invent it."
- Alan Kay

[-- Attachment #2: align-odirect-accesses-v2.patch --]
[-- Type: text/x-vhdl, Size: 9413 bytes --]

---
 block-raw-posix.c |  240 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 238 insertions(+), 2 deletions(-)

Index: qemu/block-raw-posix.c
===================================================================
--- qemu.orig/block-raw-posix.c	2008-05-16 17:08:13.000000000 +0200
+++ qemu/block-raw-posix.c	2008-05-20 10:31:59.000000000 +0200
@@ -70,6 +70,8 @@
 #define FTYPE_CD     1
 #define FTYPE_FD     2
 
+#define ALIGNED_BUFFER_SIZE (32 * 512)
+
 /* if the FD is not accessed during that time (in ms), we try to
    reopen it to see if the disk has been changed */
 #define FD_OPEN_TIMEOUT 1000
@@ -86,6 +88,9 @@ typedef struct BDRVRawState {
     int fd_got_error;
     int fd_media_changed;
 #endif
+#if defined(O_DIRECT) && !defined(QEMU_IMG)
+    uint8_t* aligned_buf;
+#endif
 } BDRVRawState;
 
 static int fd_open(BlockDriverState *bs);
@@ -121,6 +126,17 @@ static int raw_open(BlockDriverState *bs
         return ret;
     }
     s->fd = fd;
+#if defined(O_DIRECT) && !defined(QEMU_IMG)
+    s->aligned_buf = NULL;
+    if (flags & BDRV_O_DIRECT) {
+        s->aligned_buf = qemu_memalign(512, ALIGNED_BUFFER_SIZE);
+        if (s->aligned_buf == NULL) {
+            ret = -errno;
+            close(fd);
+            return ret;
+        }
+    }
+#endif
     return 0;
 }
 
@@ -141,7 +157,14 @@ static int raw_open(BlockDriverState *bs
 #endif
 */
 
-static int raw_pread(BlockDriverState *bs, int64_t offset,
+/*
+ * offset and count are in bytes, but must be multiples of 512 for files
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ */
+static int raw_pread_aligned(BlockDriverState *bs, int64_t offset,
                      uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
@@ -194,7 +217,14 @@ label__raw_read__success:
     return ret;
 }
 
-static int raw_pwrite(BlockDriverState *bs, int64_t offset,
+/*
+ * offset and count are in bytes, but must be multiples of 512 for files
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ */
+static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset,
                       const uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
@@ -230,6 +260,164 @@ label__raw_write__success:
     return ret;
 }
 
+
+#if defined(O_DIRECT) && !defined(QEMU_IMG)
+/*
+ * offset and count are in bytes and possibly not aligned. For files opened
+ * with O_DIRECT, necessary alignments are ensured before calling
+ * raw_pread_aligned to do the actual read.
+ */
+static int raw_pread(BlockDriverState *bs, int64_t offset,
+                     uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+    int size, ret, shift, sum;
+
+    sum = 0;
+
+    if (s->aligned_buf != NULL)  {
+
+        if (offset & 0x1ff) {
+            /* align offset on a 512 bytes boundary */
+
+            shift = offset & 0x1ff;
+            size = (shift + count + 0x1ff) & ~0x1ff;
+            if (size > ALIGNED_BUFFER_SIZE)
+                size = ALIGNED_BUFFER_SIZE;
+            ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, size);
+            if (ret < 0)
+                return ret;
+
+            size = 512 - shift;
+            if (size > count)
+                size = count;
+            memcpy(buf, s->aligned_buf + shift, size);
+
+            buf += size;
+            offset += size;
+            count -= size;
+            sum += size;
+
+            if (count == 0)
+                return sum;
+        }
+        if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
+
+            /* read on aligned buffer */
+
+            while (count) {
+
+                size = (count + 0x1ff) & ~0x1ff;
+                if (size > ALIGNED_BUFFER_SIZE)
+                    size = ALIGNED_BUFFER_SIZE;
+
+                ret = raw_pread_aligned(bs, offset, s->aligned_buf, size);
+                if (ret < 0)
+                    return ret;
+
+                size = ret;
+                if (size > count)
+                    size = count;
+
+                memcpy(buf, s->aligned_buf, size);
+
+                buf += size;
+                offset += size;
+                count -= size;
+                sum += size;
+            }
+
+            return sum;
+        }
+    }
+
+    return raw_pread_aligned(bs, offset, buf, count) + sum;
+}
+
+/*
+ * offset and count are in bytes and possibly not aligned. For files opened
+ * with O_DIRECT, necessary alignments are ensured before calling
+ * raw_pwrite_aligned to do the actual write.
+ */
+static int raw_pwrite(BlockDriverState *bs, int64_t offset,
+                      const uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+    int size, ret, shift, sum;
+
+    sum = 0;
+
+    if (s->aligned_buf != NULL) {
+
+        if (offset & 0x1ff) {
+            /* align offset on a 512 bytes boundary */
+            shift = offset & 0x1ff;
+            ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, 512);
+            if (ret < 0)
+                return ret;
+
+            size = 512 - shift;
+            if (size > count)
+                size = count;
+            memcpy(s->aligned_buf + shift, buf, size);
+
+            ret = raw_pwrite_aligned(bs, offset - shift, s->aligned_buf, 512);
+            if (ret < 0)
+                return ret;
+
+            buf += size;
+            offset += size;
+            count -= size;
+            sum += size;
+
+            if (count == 0)
+                return sum;
+        }
+        if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
+
+            while ((size = (count & ~0x1ff)) != 0) {
+
+                if (size > ALIGNED_BUFFER_SIZE)
+                    size = ALIGNED_BUFFER_SIZE;
+
+                memcpy(s->aligned_buf, buf, size);
+
+                ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, size);
+                if (ret < 0)
+                    return ret;
+
+                buf += ret;
+                offset += ret;
+                count -= ret;
+                sum += ret;
+            }
+            /* here, count < 512 because (count & ~0x1ff) == 0 */
+            if (count) {
+                ret = raw_pread_aligned(bs, offset, s->aligned_buf, 512);
+                if (ret < 0)
+                    return ret;
+                 memcpy(s->aligned_buf, buf, count);
+
+                 ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, 512);
+                 if (ret < 0)
+                     return ret;
+                 if (count < ret)
+                     ret = count;
+
+                 sum += ret;
+            }
+            return sum;
+        }
+    }
+    return raw_pwrite_aligned(bs, offset, buf, count) + sum;
+}
+
+#else
+#define raw_pread raw_pread_aligned
+#define raw_pwrite raw_pwrite_aligned
+#endif
+
+
 /***********************************************************/
 /* Unix AIO using POSIX AIO */
 
@@ -237,6 +425,7 @@ typedef struct RawAIOCB {
     BlockDriverAIOCB common;
     struct aiocb aiocb;
     struct RawAIOCB *next;
+    int ret;
 } RawAIOCB;
 
 static int aio_sig_num = SIGUSR2;
@@ -397,12 +586,38 @@ static RawAIOCB *raw_aio_setup(BlockDriv
     return acb;
 }
 
+#ifndef QEMU_IMG
+static void raw_aio_em_cb(void* opaque)
+{
+    RawAIOCB *acb = opaque;
+    acb->common.cb(acb->common.opaque, acb->ret);
+    qemu_aio_release(acb);
+}
+#endif
+
 static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs,
         int64_t sector_num, uint8_t *buf, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque)
 {
     RawAIOCB *acb;
 
+    /*
+     * If O_DIRECT is used and the buffer is not aligned fall back
+     * to synchronous IO.
+     */
+#if defined(O_DIRECT) && !defined(QEMU_IMG)
+    BDRVRawState *s = bs->opaque;
+
+    if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) {
+        QEMUBH *bh;
+        acb = qemu_aio_get(bs, cb, opaque);
+        acb->ret = raw_pread(bs, 512 * sector_num, buf, 512 * nb_sectors);
+        bh = qemu_bh_new(raw_aio_em_cb, acb);
+        qemu_bh_schedule(bh);
+        return &acb->common;
+    }
+#endif
+
     acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
     if (!acb)
         return NULL;
@@ -419,6 +634,23 @@ static BlockDriverAIOCB *raw_aio_write(B
 {
     RawAIOCB *acb;
 
+    /*
+     * If O_DIRECT is used and the buffer is not aligned fall back
+     * to synchronous IO.
+     */
+#if defined(O_DIRECT) && !defined(QEMU_IMG)
+    BDRVRawState *s = bs->opaque;
+
+    if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) {
+        QEMUBH *bh;
+        acb = qemu_aio_get(bs, cb, opaque);
+        acb->ret = raw_pwrite(bs, 512 * sector_num, buf, 512 * nb_sectors);
+        bh = qemu_bh_new(raw_aio_em_cb, acb);
+        qemu_bh_schedule(bh);
+        return &acb->common;
+    }
+#endif
+
     acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque);
     if (!acb)
         return NULL;
@@ -462,6 +694,10 @@ static void raw_close(BlockDriverState *
     if (s->fd >= 0) {
         close(s->fd);
         s->fd = -1;
+#if defined(O_DIRECT) && !defined(QEMU_IMG)
+        if (s->aligned_buf != NULL)
+            qemu_free(s->aligned_buf);
+#endif
     }
 }
 

^ permalink raw reply	[flat|nested] 48+ messages in thread

end of thread, other threads:[~2008-05-28  7:05 UTC | newest]

Thread overview: 48+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-05-20 11:32 [Qemu-devel] [PATCH][v2] Align file accesses with cache=off (O_DIRECT) Laurent Vivier
2008-05-20 19:47 ` [Qemu-devel] " Anthony Liguori
2008-05-20 22:36   ` Jamie Lokier
2008-05-20 22:52     ` Paul Brook
2008-05-20 22:59       ` Laurent Vivier
2008-05-21  0:54         ` Paul Brook
2008-05-21  7:59           ` Laurent Vivier
2008-05-21  0:58       ` Anthony Liguori
2008-05-21  1:04         ` Jamie Lokier
2008-05-21  1:05         ` Anthony Liguori
2008-05-21  8:06           ` Kevin Wolf
2008-05-21  1:05         ` Paul Brook
2008-05-21  1:14           ` Anthony Liguori
2008-05-21  8:24             ` Kevin Wolf
2008-05-21 12:26               ` Jamie Lokier
2008-05-21 12:37                 ` Avi Kivity
2008-05-21 13:41                   ` Jamie Lokier
2008-05-21 13:55                     ` Anthony Liguori
2008-05-21 14:17                       ` Avi Kivity
2008-05-21 14:26                         ` Anthony Liguori
2008-05-21 14:57                           ` Avi Kivity
2008-05-21 15:34                             ` Jamie Lokier
2008-05-21 16:02                               ` Anthony Liguori
2008-05-21 16:24                                 ` Jamie Lokier
2008-05-21 16:48                                   ` Avi Kivity
2008-05-21 17:01                                     ` Andrea Arcangeli
2008-05-21 17:18                                       ` Avi Kivity
2008-05-21 17:47                                         ` Andrea Arcangeli
2008-05-21 17:53                                           ` Anthony Liguori
2008-05-21 18:08                                             ` Andrea Arcangeli
2008-05-21 18:25                                               ` Anthony Liguori
2008-05-21 20:13                                                 ` Andrea Arcangeli
2008-05-21 20:35                                                   ` Anthony Liguori
2008-05-21 20:42                                                     ` Andrea Arcangeli
2008-05-21 18:29                                           ` Avi Kivity
2008-05-21 16:45                                 ` Avi Kivity
2008-05-21 16:44                               ` Avi Kivity
2008-05-20 23:04     ` Laurent Vivier
2008-05-20 23:13       ` Jamie Lokier
2008-05-21  1:00     ` Anthony Liguori
2008-05-21  1:19       ` Jamie Lokier
2008-05-21  2:12         ` Anthony Liguori
2008-05-21  8:27           ` Andreas Färber
2008-05-21 14:06             ` Anthony Liguori
2008-05-21 15:31               ` Jamie Lokier
2008-05-21 11:43           ` Jamie Lokier
2008-05-23  9:12   ` Laurent Vivier
2008-05-28  7:01     ` Kevin Wolf

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).