qemu-devel.nongnu.org archive mirror
* [Qemu-devel] [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
@ 2008-05-20 11:32 Laurent Vivier
  2008-05-20 19:47 ` [Qemu-devel] " Anthony Liguori
  0 siblings, 1 reply; 48+ messages in thread
From: Laurent Vivier @ 2008-05-20 11:32 UTC (permalink / raw)
  To: qemu-devel@nongnu.org; +Cc: Blue Swirl, Kevin Wolf

[-- Attachment #1: Type: text/plain, Size: 1454 bytes --]

This patch is the original patch from Kevin Wolf, modified according to
comments given on the qemu-devel mailing list.

Original Description:

"In December a patch was applied which introduced the cache=off option
to -drive. When using this option files are opened with the O_DIRECT
flag.
This means that all accesses have to be aligned. The patch made a couple
of changes in this respect, still in other places they are missing (e.g.
you can't use cache=off with qcow(2) files).

This patch implements wrappers for raw_pread and raw_pwrite which align
all file accesses and make qcow(2) work with cache=off. This method
might not be the most performant one (compared to fixing qcow, qcow2 and
everything else that might be using unaligned accesses), but unaligned
accesses don't happen that frequently and with this patch really all
image accesses should be covered."

Modifications:

- Kevin has modified his patch to call the read/write AIO callback
outside the aio_read/write
- I've modified the buffer management to allocate the buffer on open and
not on each read/write.

As mentioned by Kevin, this patch is really needed to be able to manage
all disk images with the "cache=off" option, so pleeeaaaase, apply (or
comment...)

A la GIT:

Signed-off-by: Kevin Wolf <kwolf@suse.de>
Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
-- 
------------- Laurent.Vivier@bull.net ---------------
"The best way to predict the future is to invent it."
- Alan Kay

[-- Attachment #2: align-odirect-accesses-v2.patch --]
[-- Type: text/x-vhdl, Size: 9413 bytes --]

---
 block-raw-posix.c |  240 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 238 insertions(+), 2 deletions(-)

Index: qemu/block-raw-posix.c
===================================================================
--- qemu.orig/block-raw-posix.c	2008-05-16 17:08:13.000000000 +0200
+++ qemu/block-raw-posix.c	2008-05-20 10:31:59.000000000 +0200
@@ -70,6 +70,8 @@
 #define FTYPE_CD     1
 #define FTYPE_FD     2
 
+#define ALIGNED_BUFFER_SIZE (32 * 512)
+
 /* if the FD is not accessed during that time (in ms), we try to
    reopen it to see if the disk has been changed */
 #define FD_OPEN_TIMEOUT 1000
@@ -86,6 +88,9 @@ typedef struct BDRVRawState {
     int fd_got_error;
     int fd_media_changed;
 #endif
+#if defined(O_DIRECT) && !defined(QEMU_IMG)
+    uint8_t* aligned_buf;
+#endif
 } BDRVRawState;
 
 static int fd_open(BlockDriverState *bs);
@@ -121,6 +126,17 @@ static int raw_open(BlockDriverState *bs
         return ret;
     }
     s->fd = fd;
+#if defined(O_DIRECT) && !defined(QEMU_IMG)
+    s->aligned_buf = NULL;
+    if (flags & BDRV_O_DIRECT) {
+        s->aligned_buf = qemu_memalign(512, ALIGNED_BUFFER_SIZE);
+        if (s->aligned_buf == NULL) {
+            ret = -errno;
+            close(fd);
+            return ret;
+        }
+    }
+#endif
     return 0;
 }
 
@@ -141,7 +157,14 @@ static int raw_open(BlockDriverState *bs
 #endif
 */
 
-static int raw_pread(BlockDriverState *bs, int64_t offset,
+/*
+ * offset and count are in bytes, but must be multiples of 512 for files
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ */
+static int raw_pread_aligned(BlockDriverState *bs, int64_t offset,
                      uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
@@ -194,7 +217,14 @@ label__raw_read__success:
     return ret;
 }
 
-static int raw_pwrite(BlockDriverState *bs, int64_t offset,
+/*
+ * offset and count are in bytes, but must be multiples of 512 for files
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ */
+static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset,
                       const uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
@@ -230,6 +260,164 @@ label__raw_write__success:
     return ret;
 }
 
+
+#if defined(O_DIRECT) && !defined(QEMU_IMG)
+/*
+ * offset and count are in bytes and possibly not aligned. For files opened
+ * with O_DIRECT, necessary alignments are ensured before calling
+ * raw_pread_aligned to do the actual read.
+ */
+static int raw_pread(BlockDriverState *bs, int64_t offset,
+                     uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+    int size, ret, shift, sum;
+
+    sum = 0;
+
+    if (s->aligned_buf != NULL)  {
+
+        if (offset & 0x1ff) {
+            /* align offset on a 512 bytes boundary */
+
+            shift = offset & 0x1ff;
+            size = (shift + count + 0x1ff) & ~0x1ff;
+            if (size > ALIGNED_BUFFER_SIZE)
+                size = ALIGNED_BUFFER_SIZE;
+            ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, size);
+            if (ret < 0)
+                return ret;
+
+            size = 512 - shift;
+            if (size > count)
+                size = count;
+            memcpy(buf, s->aligned_buf + shift, size);
+
+            buf += size;
+            offset += size;
+            count -= size;
+            sum += size;
+
+            if (count == 0)
+                return sum;
+        }
+        if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
+
+            /* read on aligned buffer */
+
+            while (count) {
+
+                size = (count + 0x1ff) & ~0x1ff;
+                if (size > ALIGNED_BUFFER_SIZE)
+                    size = ALIGNED_BUFFER_SIZE;
+
+                ret = raw_pread_aligned(bs, offset, s->aligned_buf, size);
+                if (ret < 0)
+                    return ret;
+
+                size = ret;
+                if (size > count)
+                    size = count;
+
+                memcpy(buf, s->aligned_buf, size);
+
+                buf += size;
+                offset += size;
+                count -= size;
+                sum += size;
+            }
+
+            return sum;
+        }
+    }
+
+    return raw_pread_aligned(bs, offset, buf, count) + sum;
+}
+
+/*
+ * offset and count are in bytes and possibly not aligned. For files opened
+ * with O_DIRECT, necessary alignments are ensured before calling
+ * raw_pwrite_aligned to do the actual write.
+ */
+static int raw_pwrite(BlockDriverState *bs, int64_t offset,
+                      const uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+    int size, ret, shift, sum;
+
+    sum = 0;
+
+    if (s->aligned_buf != NULL) {
+
+        if (offset & 0x1ff) {
+            /* align offset on a 512 bytes boundary */
+            shift = offset & 0x1ff;
+            ret = raw_pread_aligned(bs, offset - shift, s->aligned_buf, 512);
+            if (ret < 0)
+                return ret;
+
+            size = 512 - shift;
+            if (size > count)
+                size = count;
+            memcpy(s->aligned_buf + shift, buf, size);
+
+            ret = raw_pwrite_aligned(bs, offset - shift, s->aligned_buf, 512);
+            if (ret < 0)
+                return ret;
+
+            buf += size;
+            offset += size;
+            count -= size;
+            sum += size;
+
+            if (count == 0)
+                return sum;
+        }
+        if (count & 0x1ff || (uintptr_t) buf & 0x1ff) {
+
+            while ((size = (count & ~0x1ff)) != 0) {
+
+                if (size > ALIGNED_BUFFER_SIZE)
+                    size = ALIGNED_BUFFER_SIZE;
+
+                memcpy(s->aligned_buf, buf, size);
+
+                ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, size);
+                if (ret < 0)
+                    return ret;
+
+                buf += ret;
+                offset += ret;
+                count -= ret;
+                sum += ret;
+            }
+            /* here, count < 512 because (count & ~0x1ff) == 0 */
+            if (count) {
+                ret = raw_pread_aligned(bs, offset, s->aligned_buf, 512);
+                if (ret < 0)
+                    return ret;
+                memcpy(s->aligned_buf, buf, count);
+
+                ret = raw_pwrite_aligned(bs, offset, s->aligned_buf, 512);
+                if (ret < 0)
+                    return ret;
+                if (count < ret)
+                    ret = count;
+
+                sum += ret;
+            }
+            return sum;
+        }
+    }
+    return raw_pwrite_aligned(bs, offset, buf, count) + sum;
+}
+
+#else
+#define raw_pread raw_pread_aligned
+#define raw_pwrite raw_pwrite_aligned
+#endif
+
+
 /***********************************************************/
 /* Unix AIO using POSIX AIO */
 
@@ -237,6 +425,7 @@ typedef struct RawAIOCB {
     BlockDriverAIOCB common;
     struct aiocb aiocb;
     struct RawAIOCB *next;
+    int ret;
 } RawAIOCB;
 
 static int aio_sig_num = SIGUSR2;
@@ -397,12 +586,38 @@ static RawAIOCB *raw_aio_setup(BlockDriv
     return acb;
 }
 
+#ifndef QEMU_IMG
+static void raw_aio_em_cb(void* opaque)
+{
+    RawAIOCB *acb = opaque;
+    acb->common.cb(acb->common.opaque, acb->ret);
+    qemu_aio_release(acb);
+}
+#endif
+
 static BlockDriverAIOCB *raw_aio_read(BlockDriverState *bs,
         int64_t sector_num, uint8_t *buf, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque)
 {
     RawAIOCB *acb;
 
+    /*
+     * If O_DIRECT is used and the buffer is not aligned fall back
+     * to synchronous IO.
+     */
+#if defined(O_DIRECT) && !defined(QEMU_IMG)
+    BDRVRawState *s = bs->opaque;
+
+    if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) {
+        QEMUBH *bh;
+        acb = qemu_aio_get(bs, cb, opaque);
+        acb->ret = raw_pread(bs, 512 * sector_num, buf, 512 * nb_sectors);
+        bh = qemu_bh_new(raw_aio_em_cb, acb);
+        qemu_bh_schedule(bh);
+        return &acb->common;
+    }
+#endif
+
     acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
     if (!acb)
         return NULL;
@@ -419,6 +634,23 @@ static BlockDriverAIOCB *raw_aio_write(B
 {
     RawAIOCB *acb;
 
+    /*
+     * If O_DIRECT is used and the buffer is not aligned fall back
+     * to synchronous IO.
+     */
+#if defined(O_DIRECT) && !defined(QEMU_IMG)
+    BDRVRawState *s = bs->opaque;
+
+    if (unlikely(s->aligned_buf != NULL && ((uintptr_t) buf % 512))) {
+        QEMUBH *bh;
+        acb = qemu_aio_get(bs, cb, opaque);
+        acb->ret = raw_pwrite(bs, 512 * sector_num, buf, 512 * nb_sectors);
+        bh = qemu_bh_new(raw_aio_em_cb, acb);
+        qemu_bh_schedule(bh);
+        return &acb->common;
+    }
+#endif
+
     acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque);
     if (!acb)
         return NULL;
@@ -462,6 +694,10 @@ static void raw_close(BlockDriverState *
     if (s->fd >= 0) {
         close(s->fd);
         s->fd = -1;
+#if defined(O_DIRECT) && !defined(QEMU_IMG)
+        if (s->aligned_buf != NULL)
+            qemu_free(s->aligned_buf);
+#endif
     }
 }
 


* [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-20 11:32 [Qemu-devel] [PATCH][v2] Align file accesses with cache=off (O_DIRECT) Laurent Vivier
@ 2008-05-20 19:47 ` Anthony Liguori
  2008-05-20 22:36   ` Jamie Lokier
  2008-05-23  9:12   ` Laurent Vivier
  0 siblings, 2 replies; 48+ messages in thread
From: Anthony Liguori @ 2008-05-20 19:47 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: Blue Swirl, qemu-devel@nongnu.org, Kevin Wolf

Laurent Vivier wrote:
> This patch is the original patch from Kevin Wolf, modified according to
> comments given on the qemu-devel mailing list.
>
> Original Description:
>
> "In December a patch was applied which introduced the cache=off option
> to -drive. When using this option files are opened with the O_DIRECT
> flag.
> This means that all accesses have to be aligned. The patch made a couple
> of changes in this respect, still in other places they are missing (e.g.
> you can't use cache=off with qcow(2) files).
>
> This patch implements wrappers for raw_pread and raw_pwrite which align
> all file accesses and make qcow(2) work with cache=off. This method
> might not be the most performant one (compared to fixing qcow, qcow2 and
> everything else that might be using unaligned accesses), but unaligned
> accesses don't happen that frequently and with this patch really all
> image accesses should be covered."
>
> Modifications:
>
> - Kevin has modified his patch to call the read/write AIO callback
> outside the aio_read/write
> - I've modified the buffer management to allocate the buffer on open and
> not on each read/write.
>
> As mentioned by Kevin, this patch is really needed to be able to manage
> all disk images with the "cache=off" option, so pleeeaaaase, apply (or
> comment...)
>
> A la GIT:
>
> Signed-off-by: Kevin Wolf <kwolf@suse.de>
> Signed-off-by: Laurent Vivier <Laurent.Vivier@bull.net>
>   

Looks better to me.

Acked-by: Anthony Liguori <aliguori@us.ibm.com>

Regards,

Anthony Liguori


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-20 19:47 ` [Qemu-devel] " Anthony Liguori
@ 2008-05-20 22:36   ` Jamie Lokier
  2008-05-20 22:52     ` Paul Brook
                       ` (2 more replies)
  2008-05-23  9:12   ` Laurent Vivier
  1 sibling, 3 replies; 48+ messages in thread
From: Jamie Lokier @ 2008-05-20 22:36 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Kevin Wolf

Anthony Liguori wrote:
> >This patch implements wrappers for raw_pread and raw_pwrite which align
> >all file accesses and make qcow(2) work with cache=off. This method
> >might not be the most performant one (compared to fixing qcow, qcow2 and
> >everything else that might be using unaligned accesses), but unaligned
> >accesses don't happen that frequently and with this patch really all
> >image accesses should be covered."

It's a useful patch.

One little esoteric consequence you might want to document.

This occurs when a guest is running something like a database or
journalling filesystem, and is reliant on the host disk's integrity
properties.

One property of disks is that if you overwrite a sector and there's a
power loss, when read later that sector might be corrupt.  Even if the
new data is the same as the old data with only some bytes changed,
some of the _unchanged_ bytes may be corrupted by this.

When the guest writes to a sector-aligned offset, there is a possibility
that the guest is depending on power failure not causing corruption of
neighbouring sectors.  This is typical with some kinds of journalling.

When sector-aligned guest offsets are converted to sector-unaligned
writes (e.g. due to qcow2 etc.), that property is no longer satisfied,
and power failure of the host disk can cause more damage than the
guest is designed to be resistant to.

-- Jamie


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-20 22:36   ` Jamie Lokier
@ 2008-05-20 22:52     ` Paul Brook
  2008-05-20 22:59       ` Laurent Vivier
  2008-05-21  0:58       ` Anthony Liguori
  2008-05-20 23:04     ` Laurent Vivier
  2008-05-21  1:00     ` Anthony Liguori
  2 siblings, 2 replies; 48+ messages in thread
From: Paul Brook @ 2008-05-20 22:52 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Kevin Wolf

> When sector-aligned guest offsets are converted to sector-unaligned
> writes (e.g. due to qcow2 etc.), that property is no longer satisfied,
> and power failure of the host disk can cause more damage than the
> guest is designed to be resistant to.

Seems like the easiest solution would be to have qcow always align its writes.
We don't do on the fly compression, so it should be fairly easy to make this 
happen with minimal overhead.

Paul


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-20 22:52     ` Paul Brook
@ 2008-05-20 22:59       ` Laurent Vivier
  2008-05-21  0:54         ` Paul Brook
  2008-05-21  0:58       ` Anthony Liguori
  1 sibling, 1 reply; 48+ messages in thread
From: Laurent Vivier @ 2008-05-20 22:59 UTC (permalink / raw)
  To: Paul Brook; +Cc: Blue Swirl, qemu-devel, Kevin Wolf

On Tuesday 20 May 2008 at 23:52 +0100, Paul Brook wrote:
> > When sector-aligned guest offsets are converted to sector-unaligned
> > writes (e.g. due to qcow2 etc.), that property is no longer satisfied,
> > and power failure of the host disk can cause more damage than the
> > guest is designed to be resistant to.
> 
> Seems like the easiest solution would be to have qcow always align its writes.
> We don't do on the fly compression, so it should be fairly easy to make this 
> happen with minimal overhead.

I wrote the patch you describe and posted it to the mailing list on Tue,
22 Jan 2008 11:17:09 +0100; it was called
"[PATCH] snapshot=on and cache=off compatibility"
but never received any comments.

Regards,
Laurent
-- 
------------- Laurent.Vivier@bull.net ---------------
"The best way to predict the future is to invent it."
- Alan Kay


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-20 22:36   ` Jamie Lokier
  2008-05-20 22:52     ` Paul Brook
@ 2008-05-20 23:04     ` Laurent Vivier
  2008-05-20 23:13       ` Jamie Lokier
  2008-05-21  1:00     ` Anthony Liguori
  2 siblings, 1 reply; 48+ messages in thread
From: Laurent Vivier @ 2008-05-20 23:04 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: Blue Swirl, qemu-devel, Kevin Wolf

On Tuesday 20 May 2008 at 23:36 +0100, Jamie Lokier wrote:
> Anthony Liguori wrote:
> > >This patch implements wrappers for raw_pread and raw_pwrite which align
> > >all file accesses and make qcow(2) work with cache=off. This method
> > >might not be the most performant one (compared to fixing qcow, qcow2 and
> > >everything else that might be using unaligned accesses), but unaligned
> > >accesses don't happen that frequently and with this patch really all
> > >image accesses should be covered."
> 
> It's a useful patch.
> 
> One little esoteric consequence you might want to document.
> 
> This occurs when a guest is running something like a database or
> journalling filesystem, and is reliant on the host disk's integrity
> properties.
> 
> One property of disks is that if you overwrite a sector and there's a
> power loss, when read later that sector might be corrupt.  Even if the
> new data is the same as the old data with only some bytes changed,
> some of the _unchanged_ bytes may be corrupted by this.
> 
> When the guest writes to a sector-aligned offset, there is a possibility
> that the guest is depending on power failure not causing corruption of
> neighbouring sectors.  This is typical with some kinds of journalling.
> 
> When sector-aligned guest offsets are converted to sector-unaligned
> writes (e.g. due to qcow2 etc.), that property is no longer satisfied,
> and power failure of the host disk can cause more damage than the
> guest is designed to be resistant to.

Sector-unaligned guest offsets are converted to sector-aligned offsets.
Sector-aligned guest offsets are written as is.

And since we use O_DIRECT with aligned offsets and data counts, I think
we increase disk integrity (compared to the case without O_DIRECT and
without aligned accesses...), so we should document the case without
O_DIRECT, not the case with O_DIRECT...
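
To illustrate the wrapper's arithmetic with hypothetical numbers, take an
unaligned read at offset 1234 with count 100:

    /* Hypothetical request: offset = 1234, count = 100 (both unaligned) */
    int shift = 1234 & 0x1ff;                     /* 210 bytes past the boundary */
    int size  = (shift + 100 + 0x1ff) & ~0x1ff;   /* rounds up to 512 */
    /* raw_pread_aligned() then reads 512 bytes from offset 1234 - 210 = 1024,
       and the 100 requested bytes are memcpy'd from aligned_buf + 210. */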

Regards,
Laurent
-- 
------------- Laurent.Vivier@bull.net ---------------
"The best way to predict the future is to invent it."
- Alan Kay


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-20 23:04     ` Laurent Vivier
@ 2008-05-20 23:13       ` Jamie Lokier
  0 siblings, 0 replies; 48+ messages in thread
From: Jamie Lokier @ 2008-05-20 23:13 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: Blue Swirl, qemu-devel, Kevin Wolf

Laurent Vivier wrote:
> Sector-unaligned guest offsets are converted to sector-aligned offsets.

The guest cannot give a sector-unaligned offset.  How can it?  It
specifies offsets in sectors, through the IDE or SCSI interfaces.

> Sector-aligned guest offsets are written as is.
> 
> And since we use O_DIRECT with aligned offsets and data counts, I think
> we increase disk integrity (compared to the case without O_DIRECT and
> without aligned accesses...), so we should document the case without
> O_DIRECT, not the case with O_DIRECT...

That's fine for direct-mapped formats.  I'm thinking of the qcow format
etc., where guest-aligned writes are sometimes translated to unaligned
ones (is this even true?).

-- Jamie


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-20 22:59       ` Laurent Vivier
@ 2008-05-21  0:54         ` Paul Brook
  2008-05-21  7:59           ` Laurent Vivier
  0 siblings, 1 reply; 48+ messages in thread
From: Paul Brook @ 2008-05-21  0:54 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Kevin Wolf

> > Seems like the easiest solution would be to have qcow always align its
> > writes. We don't do on the fly compression, so it should be fairly easy
> > to make this happen with minimal overhead.
>
> > I wrote the patch you describe and posted it to the mailing list on Tue,
> > 22 Jan 2008 11:17:09 +0100; it was called
> > "[PATCH] snapshot=on and cache=off compatibility"
> > but never received any comments.

That patch messes with O_DIRECT on an open file descriptor, which
was generally agreed to be a bad idea.
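
(For context: "messes with O_DIRECT on an open file descriptor" presumably
means toggling the flag with fcntl(), roughly as sketched below; this is a
hypothetical reconstruction, not the actual patch.)

    #include <fcntl.h>

    /* Sketch of the discouraged approach: temporarily dropping O_DIRECT
       on an already-open descriptor to allow one unaligned access.
       Linux permits changing O_DIRECT via F_SETFL. */
    int flags = fcntl(fd, F_GETFL);
    fcntl(fd, F_SETFL, flags & ~O_DIRECT);
    /* ... perform the unaligned read or write ... */
    fcntl(fd, F_SETFL, flags);          /* restore the original flags */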

Paul


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-20 22:52     ` Paul Brook
  2008-05-20 22:59       ` Laurent Vivier
@ 2008-05-21  0:58       ` Anthony Liguori
  2008-05-21  1:04         ` Jamie Lokier
                           ` (2 more replies)
  1 sibling, 3 replies; 48+ messages in thread
From: Anthony Liguori @ 2008-05-21  0:58 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Kevin Wolf

Paul Brook wrote:
>> When sector-aligned guest offsets are converted to sector-unaligned
>> writes (e.g. due to qcow2 etc.), that property is no longer satisfied,
>> and power failure of the host disk can cause more damage than the
>> guest is designed to be resistant to.
>>     
>
> Seems like the easiest solution would be to have qcow always align its writes.
> We don't do on the fly compression, so it should be fairly easy to make this 
> happen with minimal overhead.
>   

That's not sufficient. O_DIRECT imposes not only offset alignment 
requirements but also requirements on the buffer being read to. Most of 
the code in QEMU does not properly align the read/write buffers.
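
(For illustration, "properly aligned" means the I/O buffer itself must be
allocated on a sector boundary, e.g. with posix_memalign; the patch's
qemu_memalign presumably wraps a mechanism of this kind.)

    #include <stdlib.h>

    /* Sketch: allocate a 512-byte-aligned buffer usable with O_DIRECT */
    void *buf;
    if (posix_memalign(&buf, 512, 32 * 512) != 0)
        buf = NULL;                     /* allocation failed */
    /* O_DIRECT requires alignment of the buffer address as well as of
       the file offset and the transfer length. */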

Regards,

Anthony Liguori

> Paul
>
>
>   


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-20 22:36   ` Jamie Lokier
  2008-05-20 22:52     ` Paul Brook
  2008-05-20 23:04     ` Laurent Vivier
@ 2008-05-21  1:00     ` Anthony Liguori
  2008-05-21  1:19       ` Jamie Lokier
  2 siblings, 1 reply; 48+ messages in thread
From: Anthony Liguori @ 2008-05-21  1:00 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Kevin Wolf

Jamie Lokier wrote:
> Anthony Liguori wrote:
>   
>>> This patch implements wrappers for raw_pread and raw_pwrite which align
>>> all file accesses and make qcow(2) work with cache=off. This method
>>> might not be the most performant one (compared to fixing qcow, qcow2 and
>>> everything else that might be using unaligned accesses), but unaligned
>>> accesses don't happen that frequently and with this patch really all
>>> image accesses should be covered."
>>>       
>
> It's a useful patch.
>
> One little esoteric consequence you might want to document.
>
> This occurs when a guest is running something like a database or
> journalling filesystem, and is reliant on the host disk's integrity
> properties.
>
> One property of disks is that if you overwrite a sector and there's a
> power loss, when read later that sector might be corrupt.  Even if the
> new data is the same as the old data with only some bytes changed,
> some of the _unchanged_ bytes may be corrupted by this.
>   

I don't think this is true.  What evidence do you have to support such 
claims?

Regards,

Anthony Liguori

> When the guest writes to a sector-aligned offset, there is a possibility
> that the guest is depending on power failure not causing corruption of
> neighbouring sectors.  This is typical with some kinds of journalling.
>
> When sector-aligned guest offsets are converted to sector-unaligned
> writes (e.g. due to qcow2 etc.), that property is no longer satisfied,
> and power failure of the host disk can cause more damage than the
> guest is designed to be resistant to.
>
> -- Jamie
>
>
>   


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21  0:58       ` Anthony Liguori
@ 2008-05-21  1:04         ` Jamie Lokier
  2008-05-21  1:05         ` Anthony Liguori
  2008-05-21  1:05         ` Paul Brook
  2 siblings, 0 replies; 48+ messages in thread
From: Jamie Lokier @ 2008-05-21  1:04 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Kevin Wolf

Anthony Liguori wrote:
> Paul Brook wrote:
> >>When sector-aligned guest offsets are converted to sector-unaligned
> >>writes (e.g. due to qcow2 etc.), that property is no longer satisfied,
> >>and power failure of the host disk can cause more damage than the
> >>guest is designed to be resistant to.
> >>    
> >
> >Seems like the easiest solution would be to have qcow always align its 
> >writes.
> >We don't do on the fly compression, so it should be fairly easy to make 
> >this happen with minimal overhead.
> >  
> 
> That's not sufficient. O_DIRECT imposes not only offset alignment 
> requirements but also requirements on the buffer being read to. Most of 
> the code in QEMU does not properly align the read/write buffers.

The offset when reading is not so important.  And of course, can't be
guaranteed with qcow2 - as the sector being read may be compressed.

For writing, if the memory isn't suitably aligned for the
_transformed_ offset, you can either transform it differently, or copy
the memory to somewhere with the right offset first.

-- Jamie


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21  0:58       ` Anthony Liguori
  2008-05-21  1:04         ` Jamie Lokier
@ 2008-05-21  1:05         ` Anthony Liguori
  2008-05-21  8:06           ` Kevin Wolf
  2008-05-21  1:05         ` Paul Brook
  2 siblings, 1 reply; 48+ messages in thread
From: Anthony Liguori @ 2008-05-21  1:05 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Kevin Wolf

Anthony Liguori wrote:
> Paul Brook wrote:
>>> When sector-aligned guest offsets are converted to sector-unaligned
>>> writes (e.g. due to qcow2 etc.), that property is no longer satisfied,
>>> and power failure of the host disk can cause more damage than the
>>> guest is designed to be resistant to.
>>>     
>>
>> Seems like the easiest solution would be to have qcow always align 
>> its writes.
>> We don't do on the fly compression, so it should be fairly easy to 
>> make this happen with minimal overhead.
>>   
>
> That's not sufficient. O_DIRECT imposes not only offset alignment 
> requirements but also requirements on the buffer being read to. Most 
> of the code in QEMU does not properly align the read/write buffers.

For instance, the kernel loading code passes a buffer on the stack to 
bdrv_read().

That's a problem with the current patch, honestly: we should not rely on
callers to align buffers.  bdrv_read()/bdrv_aio_read()/bdrv_pread() should
all be tolerant of unaligned buffers if we're going to support O_DIRECT.
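
(A minimal sketch of that kind of tolerance at the raw layer, with a
hypothetical helper name; it is essentially what this patch's wrapper
already does, minus the offset/count handling.)

    /* Sketch: bounce an unaligned caller buffer through the aligned one.
       Assumes offset and count are already sector-aligned and that count
       fits in ALIGNED_BUFFER_SIZE; the patch handles the general case. */
    static int bounce_pread(BlockDriverState *bs, int64_t offset,
                            uint8_t *buf, int count)
    {
        BDRVRawState *s = bs->opaque;
        int ret;

        if (((uintptr_t) buf & 0x1ff) == 0)   /* caller buffer is aligned */
            return raw_pread_aligned(bs, offset, buf, count);

        ret = raw_pread_aligned(bs, offset, s->aligned_buf, count);
        if (ret > 0)
            memcpy(buf, s->aligned_buf, ret); /* copy out to the caller */
        return ret;
    }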

Regards,

Anthony Liguori

> Regards,
>
> Anthony Liguori
>
>> Paul
>>
>>
>>   
>


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21  0:58       ` Anthony Liguori
  2008-05-21  1:04         ` Jamie Lokier
  2008-05-21  1:05         ` Anthony Liguori
@ 2008-05-21  1:05         ` Paul Brook
  2008-05-21  1:14           ` Anthony Liguori
  2 siblings, 1 reply; 48+ messages in thread
From: Paul Brook @ 2008-05-21  1:05 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Kevin Wolf

On Wednesday 21 May 2008, Anthony Liguori wrote:
> Paul Brook wrote:
> >> When sector-aligned guest offsets are converted to sector-unaligned
> >> writes (e.g. due to qcow2 etc.), that property is no longer satisfied,
> >> and power failure of the host disk can cause more damage than the
> >> guest is designed to be resistant to.
> >
> > Seems like the easiest solution would be to have qcow always align its
> > writes. We don't do on the fly compression, so it should be fairly easy
> > to make this happen with minimal overhead.
>
> That's not sufficient. O_DIRECT imposes not only offset alignment
> requirements but also requirements on the buffer being read to. Most of
> the code in QEMU does not properly align the read/write buffers.

In that case you need both. For correct operation the qcow layer needs to 
ensure that all file offsets are block aligned (amongst other things, I 
wouldn't be surprised if there are more subtle problems with metadata 
updates).

The memory buffer alignment can occur wherever it is most convenient;
that's trivially atomic w.r.t. unexpected interruptions.

Paul


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21  1:05         ` Paul Brook
@ 2008-05-21  1:14           ` Anthony Liguori
  2008-05-21  8:24             ` Kevin Wolf
  0 siblings, 1 reply; 48+ messages in thread
From: Anthony Liguori @ 2008-05-21  1:14 UTC (permalink / raw)
  To: Paul Brook; +Cc: Blue Swirl, Laurent Vivier, qemu-devel, Kevin Wolf

Paul Brook wrote:
> On Wednesday 21 May 2008, Anthony Liguori wrote:
>   
>> Paul Brook wrote:
>>     
>>>> When sector-aligned guest offsets are converted to sector-unaligned
>>>> writes (e.g. due to qcow2 etc.), that property is no longer satisfied,
>>>> and power failure of the host disk can cause more damage than the
>>>> guest is designed to be resistant to.
>>>>         
>>> Seems like the easiest solution would be to have qcow always align its
>>> writes. We don't do on the fly compression, so it should be fairly easy
>>> to make this happen with minimal overhead.
>>>       
>> That's not sufficient. O_DIRECT imposes not only offset alignment
>> requirements but also requirements on the buffer being read to. Most of
>> the code in QEMU does not properly align the read/write buffers.
>>     
>
> In that case you need both. For correct operation the qcow layer needs to 
> ensure that all file offsets are block aligned (amongst other things, I 
> wouldn't be surprised if there are more subtle problems with metadata 
> updates).
>
> The memory buffer alignment can occur wherever it is most convenient;
> that's trivially atomic w.r.t. unexpected interruptions.
>   

Yes, I don't think qcow is very safe at all wrt unexpected power events.
If we're going to support O_DIRECT, then it's important that the
underlying block device emulates accesses that don't meet the
requirements of O_DIRECT.  IMHO, this should all happen within
block-raw-posix.c.

Keep in mind, the requirement for O_DIRECT since Linux 2.5 is the hard
sector size (which is usually 512 bytes, but not always) for offset,
buffer, and size.  Pre-2.5, the requirement was the soft sector size,
which on a filesystem is usually 4k.

I don't think it's that important to try and guess the right alignment 
size, 512 is probably usually sufficient, but spreading alignment 
requirements of 512 throughout QEMU code is a bad idea because this is 
something that's very hardware/OS specific.
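
(On Linux, at least for block devices, the hard sector size can be
queried rather than guessed; a sketch:)

    #include <sys/ioctl.h>
    #include <linux/fs.h>

    /* Sketch: query the logical ("hard") sector size of a block device */
    int sector_size = 512;                  /* fallback assumption */
    if (ioctl(fd, BLKSSZGET, &sector_size) < 0)
        sector_size = 512;                  /* regular file or old kernel */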

For people that care about data integrity, we should be using O_SYNC, 
not O_DIRECT anyway.

Regards,

Anthony Liguori

> Paul
>   


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21  1:00     ` Anthony Liguori
@ 2008-05-21  1:19       ` Jamie Lokier
  2008-05-21  2:12         ` Anthony Liguori
  0 siblings, 1 reply; 48+ messages in thread
From: Jamie Lokier @ 2008-05-21  1:19 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Kevin Wolf

Anthony Liguori wrote:
> >One property of disks is that if you overwrite a sector and there's a
> >power loss, when read later that sector might be corrupt.  Even if the
> >new data is the same as the old data with only some bytes changed,
> >some of the _unchanged_ bytes may be corrupted by this.
> 
> I don't think this is true.  What evidence do you have to support such 
> claims?

What do you imagine happens when you pull the power in the middle of
writing a sector to a floppy disk (to pick a more easily imagined
example)?

There is not enough residual power to write the rest of the sector.
That sector's checksum will therefore be corrupt, and (hopefully) have
a CRC read error.  It can be written over again, wiping the CRC error.

No sector which wasn't being written will be corrupt: the write head
isn't activated over those.  The drive waits until it senses the start
of sector N, then activates the write head to write data bits.

The CRC error by itself may cause the whole sector to be reported as
corrupt with no data.  However, if you do manage to get back the bits
from the media, some bits of the sector being written whose values
were not intended to change may be different than expected.  This is
because the way data is recorded does not encode each bit separately,
but multiplexes them together for modulation, and also because bit
timing is not exact.

A modern hard disk uses much more complex data encoding, which further
adds to the effect of a truncated write corrupting even data bits not
intended to be changed, in the vicinity of those being changed.

But it should aim to provide the same basic guarantee that writing a
sector cannot corrupt neighbouring sectors on power failure, only the
one(s) being written.  This is because the robustness of journalling
filesystems and databases does rather depend on this property, and
simple old-fashioned disks do provide it.

I am just speculating; I don't know whether modern hard disks provide
this property, or under what circumstances they fail.  But it seems
they could provide it, because they still have physically independent
sectors.

(Interestingly, the journal block size used by Oracle on different
OSes is different, suggesting the "basic unit of corruption"
varies between OSes and is not always a single sector).

Although it's just speculation, do you think modern hard disks behave
differently from this?

-- Jamie


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21  1:19       ` Jamie Lokier
@ 2008-05-21  2:12         ` Anthony Liguori
  2008-05-21  8:27           ` Andreas Färber
  2008-05-21 11:43           ` Jamie Lokier
  0 siblings, 2 replies; 48+ messages in thread
From: Anthony Liguori @ 2008-05-21  2:12 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Kevin Wolf

Jamie Lokier wrote:
> Anthony Liguori wrote:
>   
>>> One property of disks is that if you overwrite a sector and there's a
>>> power loss, when read later that sector might be corrupt.  Even if the
>>> new data is the same as the old data with only some bytes changed,
>>> some of the _unchanged_ bytes may be corrupted by this.
>>>       
>> I don't think this is true.  What evidence do you have to support such 
>> claims?
>>     
>
> What do you imagine happens when you pull the power in the middle of
> writing a sector to a floppy disk (to pick a more easily imagined
> example)?
>
> There is not enough residual power to write the rest of the sector.
> That sector's checksum will therefore be corrupt, and (hopefully) have
> a CRC read error.  It can be written over again, wiping the CRC error.
>   

Why would the sector's checksum be corrupt?  The checksum wouldn't 
change after the data write.

> No sector which wasn't being written will be corrupt: the write head
> isn't activated over those.  The drive waits until it senses the start
> of sector N, then activates the write head to write data bits.
>
> The CRC error by itself may cause the whole sector to be reported as
> corrupt with no data.  However, if you do manage to get back the bits
> from the media, some bits of the sector being written whose values
> were not intended to change may be different than expected.  This is
> because the way data is recorded does not encode each bit separately,
> but multiplexes them together for modulation, and also because bit
> timing is not exact.
>
> A modern hard disk uses much more complex data encoding, which further
> adds to the effect of a truncated write corrupting even data bits not
> intended to be changed, in the vicinity of those being changed.
>
> But it should aim to provide the same basic guarantee that writing a
> sector cannot corrupt neighbouring sectors on power failure, only the
> one(s) being written.  This is because the robustness of journalling
> filesystems and databases does rather depend on this property, and
> simple old-fashioned disks do provide it.
>
> I am just speculating; I don't know whether modern hard disks provide
> this property, or under what circumstances they fail.  But it seems
> they could provide it, because they still have physically independent
> sectors.
>
> (Interestingly, the journal block size used by Oracle on different
> OSes is different, suggesting the "basic unit of corruption"
> varies between OSes and is not always a single sector).
>
> Although it's just speculation, do you think modern hard disks behave
> differently from this?
>   

Modern *enterprise* hard disks have battery backed caches so read/write 
operations always complete or fail.  Low-end disks don't tend to have 
battery backed caches but AFAIK, rewriting the same data will not result 
in any sort of disk corruption.

Regards,

Anthony Liguori


> -- Jamie
>
>
>   


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21  0:54         ` Paul Brook
@ 2008-05-21  7:59           ` Laurent Vivier
  0 siblings, 0 replies; 48+ messages in thread
From: Laurent Vivier @ 2008-05-21  7:59 UTC (permalink / raw)
  To: Paul Brook; +Cc: Blue Swirl, qemu-devel, Kevin Wolf

On Wednesday 21 May 2008 at 01:54 +0100, Paul Brook wrote:
> > > Seems like the easiest solution would be to have qcow always align its
> > > writes. We don't do on the fly compression, so it should be fairly easy
> > > to make this happen with minimal overhead.
> >
> > I wrote the patch you describe and posted it to the mailing list on Tue,
> > 22 Jan 2008 11:17:09 +0100; it was called
> > "[PATCH] snapshot=on and cache=off compatibility"
> > but never received any comments.
> 
> That patch messes with O_DIRECT on an open file descriptor, which
> was generally agreed to be a bad idea.

I agree with that; it's why I sent a new patch yesterday.
But look at the second part of the (first) patch, which tries to align
qcow buffers (it was in answer to your comment).
Moreover, we can't align all data, so we have to do some
buffer/count/offset alignment in block-raw-posix.c.

I sincerely think that the patch I sent yesterday should be applied.

Laurent
-- 
------------- Laurent.Vivier@bull.net ---------------
"The best way to predict the future is to invent it."
- Alan Kay


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21  1:05         ` Anthony Liguori
@ 2008-05-21  8:06           ` Kevin Wolf
  0 siblings, 0 replies; 48+ messages in thread
From: Kevin Wolf @ 2008-05-21  8:06 UTC (permalink / raw)
  To: anthony; +Cc: Blue Swirl, Laurent Vivier, qemu-devel

Anthony Liguori wrote:
> That's a problem with the current patch, honestly: we should not rely on
> callers to align buffers.  bdrv_read()/bdrv_aio_read()/bdrv_pread() should
> all be tolerant of unaligned buffers if we're going to support O_DIRECT.

I agree, we shouldn't rely on that. And actually we don't. ;-)

If (buf & 0x1ff) we're using the emulation with our own aligned buffer. 
Without this "tolerance" my test image wouldn't even boot up.

Kevin


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21  1:14           ` Anthony Liguori
@ 2008-05-21  8:24             ` Kevin Wolf
  2008-05-21 12:26               ` Jamie Lokier
  0 siblings, 1 reply; 48+ messages in thread
From: Kevin Wolf @ 2008-05-21  8:24 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Paul Brook

Anthony Liguori wrote:
> I don't think it's that important to try and guess the right alignment 
> size, 512 is probably usually sufficient, but spreading alignment 
> requirements of 512 throughout QEMU code is a bad idea because this is 
> something that's very hardware/OS specific.

So better introduce a #define in block.h?
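
Say, as a hypothetical sketch:

    /* block.h: one place for the sector alignment assumption */
    #define BDRV_SECTOR_SIZE 512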

> For people that care about data integrity, we should be using O_SYNC, 
> not O_DIRECT anyway.

Should we implement an option for O_SYNC then? (not in this patch, of 
course)

Kevin


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21  2:12         ` Anthony Liguori
@ 2008-05-21  8:27           ` Andreas Färber
  2008-05-21 14:06             ` Anthony Liguori
  2008-05-21 11:43           ` Jamie Lokier
  1 sibling, 1 reply; 48+ messages in thread
From: Andreas Färber @ 2008-05-21  8:27 UTC (permalink / raw)
  To: qemu-devel


On 21.05.2008 at 04:12, Anthony Liguori wrote:

> Jamie Lokier wrote:
>> What do you imagine happens when you pull the power in the middle of
>> writing a sector to a floppy disk (to pick a more easily imagined
>> example)?
>>
>> There is not enough residual power to write the rest of the sector.
>> That sector's checksum will therefore be corrupt, and (hopefully) have
>> a CRC read error.  It can be written over again, wiping the CRC error.
>>
>
> Why would the sector's checksum be corrupt?  The checksum wouldn't  
> change after the data write.

If you change part of the data but leave the checksum as-is, you'd  
hopefully get a checksum mismatch... ;)

Andreas


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21  2:12         ` Anthony Liguori
  2008-05-21  8:27           ` Andreas Färber
@ 2008-05-21 11:43           ` Jamie Lokier
  1 sibling, 0 replies; 48+ messages in thread
From: Jamie Lokier @ 2008-05-21 11:43 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Kevin Wolf

Anthony Liguori wrote:
> >There is not enough residual power to write the rest of the sector.
> >That sector's checksum will therefore be corrupt, and (hopefully) have
> >a CRC read error.  It can be written over again, wiping the CRC error.
> 
> Why would the sector's checksum be corrupt?  The checksum wouldn't 
> change after the data write.

Changing part of the data invalidates a CRC...

Also don't forget:

> >This is because the way data is recorded does not encode each bit
> >separately, but multiplexes them together for modulation, and also
> >because bit timing is not exact.

Inexact bit timing means the bits are not written exactly over the
previous bits, so there's a mismatch if you abort in the middle even
if all the bits are identical.

Finally, some kinds of data modulation use random numbers too, to
make the stored power spectrum independent of what data you are writing.

> Modern *enterprise* hard disks have battery backed caches so read/write 
> operations always complete or fail.

True.  And some have flash - when the battery or capacitor is low,
they can write to flash.

> Low-end disks don't tend to have battery backed caches but AFAIK,
> rewriting the same data will not result in any sort of disk corruption.

There's only one way to find out for sure... do the tests.  Or ask the
manufacturers, maybe they know.  Or ask Oracle, perhaps they have done
these tests.

-- Jamie


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21  8:24             ` Kevin Wolf
@ 2008-05-21 12:26               ` Jamie Lokier
  2008-05-21 12:37                 ` Avi Kivity
  0 siblings, 1 reply; 48+ messages in thread
From: Jamie Lokier @ 2008-05-21 12:26 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Paul Brook

Kevin Wolf wrote:
> Anthony Liguori wrote:
> >I don't think it's that important to try and guess the right alignment 
> >size, 512 is probably usually sufficient, but spreading alignment 
> >requirements of 512 throughout QEMU code is a bad idea because this is 
> >something that's very hardware/OS specific.
> 
> So better introduce a #define in block.h?
> 
> >For people that care about data integrity, we should be using O_SYNC, 
> >not O_DIRECT anyway.
> 
> Should we implement an option for O_SYNC then? (not in this patch, of 
> course)

Why would O_SYNC be better than O_DIRECT?

-- Jamie


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 12:26               ` Jamie Lokier
@ 2008-05-21 12:37                 ` Avi Kivity
  2008-05-21 13:41                   ` Jamie Lokier
  0 siblings, 1 reply; 48+ messages in thread
From: Avi Kivity @ 2008-05-21 12:37 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Paul Brook

Jamie Lokier wrote:
> Why would O_SYNC be better than O_DIRECT?
>   

O_SYNC is a write-through cache.  O_DIRECT is completely uncached.  Both 
have their uses (including in this context), so you can't say one is 
better than the other.

-- 
error compiling committee.c: too many arguments to function


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 12:37                 ` Avi Kivity
@ 2008-05-21 13:41                   ` Jamie Lokier
  2008-05-21 13:55                     ` Anthony Liguori
  0 siblings, 1 reply; 48+ messages in thread
From: Jamie Lokier @ 2008-05-21 13:41 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Paul Brook

Avi Kivity wrote:
> Jamie Lokier wrote:
> >Why would O_SYNC be better than O_DIRECT?
> 
> O_SYNC is a write-through cache.  O_DIRECT is completely uncached.  Both 
> have their uses (including in this context), so you can't say one is 
> better than the other.

Fine, but the question is: why would O_SYNC be better for
*data integrity* than O_DIRECT?  Referring to:

Anthony Liguori wrote:
> For people that care about data integrity, we should be using O_SYNC,
> not O_DIRECT anyway.

Could it be connected with this, from elsewhere?

Rob van Nieuwkerk wrote:
> It appears that somewhere between RH kernels 2.4.18-27.7.x and 2.4.20-18.9
> something has changed so that my application needs a O_SYNC too besides
> the O_DIRECT to make sure that writes will be synchronous.  If I leave
> the O_SYNC out with 2.4.20-18.9 the write will happen physically 35
> seconds after the write()

For that, O_SYNC is used in conjunction with O_DIRECT, rather than
instead of it.

-- Jamie


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 13:41                   ` Jamie Lokier
@ 2008-05-21 13:55                     ` Anthony Liguori
  2008-05-21 14:17                       ` Avi Kivity
  0 siblings, 1 reply; 48+ messages in thread
From: Anthony Liguori @ 2008-05-21 13:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Paul Brook

Jamie Lokier wrote:
> Avi Kivity wrote:
>   
>> Jamie Lokier wrote:
>>     
>>> Why would O_SYNC be better than O_DIRECT?
>>>       
>> O_SYNC is a write-through cache.  O_DIRECT is completely uncached.  Both 
>> have their uses (including in this context), so you can't say one is 
>> better than the other.
>>     
>
> Fine, but the question is: why would O_SYNC be better for
> *data integrity* than O_DIRECT?  Referring to:
>   

"cached" is not a terribly accurate term.  O_DIRECT avoids the host page 
cache but it doesn't guarantee that the disk is using write-through.  
For that, you need to use hdparm.

O_SYNC basically turns the host page cache into a write-through cache.  
In terms of data integrity, the only question that matters is whether 
you're misleading the guest into thinking data is on the disk when it 
isn't.  Both O_DIRECT and O_SYNC accomplish this.

If you just are concerned with data integrity, O_SYNC is probably better 
because you get the benefits of host caching.  O_DIRECT is really for 
circumstances where you know that using the host page cache is going to 
reduce performance.
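
(Concretely, the difference is just the flag passed at open time; a
sketch, noting that O_DIRECT needs _GNU_SOURCE on Linux:)

    #include <fcntl.h>

    /* Sketch: the two host-cache policies under discussion */
    int fd_sync   = open(path, O_RDWR | O_SYNC);   /* write-through page
                                                      cache: write() returns
                                                      only after data is
                                                      pushed to the device */
    int fd_direct = open(path, O_RDWR | O_DIRECT); /* bypass the page cache;
                                                      offset, buffer and
                                                      length must be aligned */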

Regards,

Anthony Liguori

> Anthony Liguori wrote:
>   
>> For people that care about data integrity, we should be using O_SYNC,
>> not O_DIRECT anyway.
>>     
>
> Could it be connected with this, from elsewhere?
>
> Rob van Nieuwkerk wrote:
>   
>> It appears that somewhere between RH kernels 2.4.18-27.7.x and 2.4.20-18.9
>> something has changed so that my application needs a O_SYNC too besides
>> the O_DIRECT to make sure that writes will be synchronous.  If I leave
>> the O_SYNC out with 2.4.20-18.9 the write will happen physically 35
>> seconds after the write()
>>     
>
> For that, O_SYNC is used in conjunction with O_DIRECT, rather than
> instead of it.
>
> -- Jamie
>
>
>
>   


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21  8:27           ` Andreas Färber
@ 2008-05-21 14:06             ` Anthony Liguori
  2008-05-21 15:31               ` Jamie Lokier
  0 siblings, 1 reply; 48+ messages in thread
From: Anthony Liguori @ 2008-05-21 14:06 UTC (permalink / raw)
  To: qemu-devel

Andreas Färber wrote:
>
> Am 21.05.2008 um 04:12 schrieb Anthony Liguori:
>
>> Jamie Lokier wrote:
>>> What do you imagine happens when you pull the power in the middle of
>>> writing a sector to a floppy disk (to pick a more easily imagined
>>> example)?
>>>
>>> There is not enough residual power to write the rest of the sector.
>>> That sector's checksum will therefore be corrupt, and (hopefully) have
>>> a CRC read error.  It can be written over again, wiping the CRC error.
>>>
>>
>> Why would the sector's checksum be corrupt?  The checksum wouldn't 
>> change after the data write.
>
> If you change part of the data but leave the checksum as-is, you'd 
> hopefully get a checksum mismatch... ;)

That's the point though, you're not changing part of the data.  You're 
rewriting the same data.  I don't think the checksum gets automatically 
invalidated whenever data is written to the disk.

Regards,

Anthony Liguori

> Andreas
>
>


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 13:55                     ` Anthony Liguori
@ 2008-05-21 14:17                       ` Avi Kivity
  2008-05-21 14:26                         ` Anthony Liguori
  0 siblings, 1 reply; 48+ messages in thread
From: Avi Kivity @ 2008-05-21 14:17 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Blue Swirl, Laurent Vivier, qemu-devel, Paul Brook

Anthony Liguori wrote:
>
> "cached" is not a terribly accurate term.  O_DIRECT avoids the host 
> page cache but it doesn't guarantee that the disk is using 
> write-through.  For that, you need to use hdparm.
>
> O_SYNC basically turns the host page cache into a write-through 
> cache.  In terms of data integrity, the only question that matters is 
> whether you're misleading the guest into thinking data is on the disk 
> when it isn't.  Both O_DIRECT and O_SYNC accomplish this.
>
> If you just are concerned with data integrity, O_SYNC is probably 
> better because you get the benefits of host caching.  O_DIRECT is 
> really for circumstances where you know that using the host page cache 
> is going to reduce performance.

In one specific circumstance O_SYNC has data integrity problems: shared 
disks with guests running on different hosts (or even a guest on one 
host, sharing a disk with another host).  In these cases, two reads can 
return different values without an intervening write.

In the general case, O_DIRECT gives better performance.  It avoids 
copying from the host pagecache to guest memory, and if you have spare 
memory to benefit from caching, give it to the guest; the nearer to the 
data consumer the cache is, the faster it performs.

In one specific case O_SYNC (or regular read/write cached operation) is 
better, rebooting the same guest over and over, as the guest cache is 
flushed on reboot.  Not a very interesting case, but unfortunately one 
that is very visible.

(if you have a backing file for a COW disk, then opening the backing 
file without O_DIRECT may be a good idea too, as the file can be shared 
among many guests).

-- 
error compiling committee.c: too many arguments to function


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 14:17                       ` Avi Kivity
@ 2008-05-21 14:26                         ` Anthony Liguori
  2008-05-21 14:57                           ` Avi Kivity
  0 siblings, 1 reply; 48+ messages in thread
From: Anthony Liguori @ 2008-05-21 14:26 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Blue Swirl, Laurent Vivier, qemu-devel, Paul Brook

Avi Kivity wrote:
> Anthony Liguori wrote:
>>
>> "cached" is not a terribly accurate term.  O_DIRECT avoids the host 
>> page cache but it doesn't guarantee that the disk is using 
>> write-through.  For that, you need to use hdparm.
>>
>> O_SYNC basically turns the host page cache into a write-through 
>> cache.  In terms of data integrity, the only question that matters is 
>> whether you're misleading the guest into thinking data is on the disk 
>> when it isn't.  Both O_DIRECT and O_SYNC accomplish this.
>>
>> If you just are concerned with data integrity, O_SYNC is probably 
>> better because you get the benefits of host caching.  O_DIRECT is 
>> really for circumstances where you know that using the host page 
>> cache is going to reduce performance.
>
> In one specific circumstance O_SYNC has data integrity problems: 
> shared disks with guests running on different hosts (or even a guest 
> on one host, sharing a disk with another host).  In these cases, two 
> reads can return different values without an intervening write.

Are you assuming the underlying disk sharing protocol does not keep the 
page cache coherent?

> In the general case, O_DIRECT gives better performance.  It avoids 
> copying from the host pagecache to guest memory, and if you have spare 
> memory to benefit from caching, give it to the guest; the nearer to 
> the data consumer the cache is, the faster it performs.

This assumes the only thing you're running on the machine is VMs.  If 
you're just running one VM on your desktop, it is probably preferable to 
go through the host page cache.  In particular, the host page cache can 
be automatically aged and adjusted whereas, in general, you cannot 
reduce the guest's page cache size.

> In one specific case O_SYNC (or regular read/write cached operation) 
> is better: rebooting the same guest over and over, as the guest cache 
> is flushed on reboot.  Not a very interesting case, but unfortunately 
> one that is very visible.
>
> (if you have a backing file for a COW disk, then opening the backing 
> file without O_DIRECT may be a good idea too, as the file can be 
> shared among many guests).

FWIW, we really only need to use O_SYNC when the guest has disabled 
write-back.  I think we should do that unconditionally too as it's an 
issue of correctness.
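
A minimal sketch of that idea, under the assumption that the image can
simply be reopened (raw_set_writethrough() is a hypothetical helper,
not an existing QEMU function; Linux does not reliably let you toggle
O_SYNC through fcntl(F_SETFL), hence the reopen):

#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

/* Hypothetical: switch an image to write-through when the guest turns
   off its drive's write-back cache.  Returns the new fd or -errno;
   the caller must have drained pending I/O on old_fd first. */
int raw_set_writethrough(const char *filename, int old_fd)
{
    int fd = open(filename, O_RDWR | O_SYNC);
    if (fd < 0)
        return -errno;
    close(old_fd);
    return fd;
}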

Regards,

Anthony Liguori


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 14:26                         ` Anthony Liguori
@ 2008-05-21 14:57                           ` Avi Kivity
  2008-05-21 15:34                             ` Jamie Lokier
  0 siblings, 1 reply; 48+ messages in thread
From: Avi Kivity @ 2008-05-21 14:57 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Blue Swirl, Laurent Vivier, qemu-devel, Paul Brook

Anthony Liguori wrote:
> Avi Kivity wrote:
>> Anthony Liguori wrote:
>>>
>>> "cached" is not a terribly accurate term.  O_DIRECT avoids the host 
>>> page cache but it doesn't guarantee that the disk is using 
>>> write-through.  For that, you need to use hdparm.
>>>
>>> O_SYNC basically turns the host page cache into a write-through 
>>> cache.  In terms of data integrity, the only question that matters 
>>> is whether you're misleading the guest into thinking data is on the 
>>> disk when it isn't.  Both O_DIRECT and O_SYNC accomplish this.
>>>
>>> If you are just concerned with data integrity, O_SYNC is probably 
>>> better because you get the benefits of host caching.  O_DIRECT is 
>>> really for circumstances where you know that using the host page 
>>> cache is going to reduce performance.
>>
>> In one specific circumstance O_SYNC has data integrity problems: 
>> shared disks with guests running on different hosts (or even a guest 
>> on one host, sharing a disk with another host).  In these cases, two 
>> reads can return different values without an intervening write.
>
> Are you assuming the underlying disk sharing protocol does not keep 
> the page cache coherent?
>

I was assuming access to a raw partition.  But yes, with a cluster file 
system this objection goes away.

At a significant cost, though.  You're now running two cache coherency 
protocols on top of each other.

>> In the general case, O_DIRECT gives better performance.  It avoids 
>> copying from the host pagecache to guest memory, and if you have 
>> spare memory to benefit from caching, give it to the guest; the 
>> nearer to the data consumer the cache is, the faster it performs.
>
> This assumes the only thing you're running on the machine is VMs.  If 
> you're just running one VM on your desktop, it is probably preferable 
> to go through the host page cache.  In particular, the host page cache 
> can be automatically aged and adjusted whereas, in general, you cannot 
> reduce the guest's page cache size.

Agreed.  For casual uses, O_DIRECT is overkill.  It does get rid of the 
data copies, though.

>
>> In one specific case O_SYNC (or regular read/write cached operation) 
>> is better: rebooting the same guest over and over, as the guest cache 
>> is flushed on reboot.  Not a very interesting case, but unfortunately 
>> one that is very visible.
>>
>> (if you have a backing file for a COW disk, then opening the backing 
>> file without O_DIRECT may be a good idea too, as the file can be 
>> shared among many guests).
>
> FWIW, we really only need to use O_SYNC when the guest has disabled 
> write-back.  I think we should do that unconditionally too as it's an 
> issue of correctness. 

If the guest is used for non-critical applications (like testing distro 
installers), then it's just a slowdown.

Even if the guest did not disable disk writeback, the host pagecache and 
the disk's write cache have vastly different characteristics, so I think 
we should set O_SYNC there as well.

Here's a summary of the use cases I saw so far:

- casual use, no critical data: write back cache

- backing file shared among many guests: read-only, cached

- desktop system, but don't lose my data: O_SYNC
(significant resources on the host)

- dedicated virtualization engine: O_DIRECT
(most host resources assigned to guests)
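
As a rough illustration, these policies boil down to different open(2)
flags (a sketch with assumed names; at this point QEMU's -drive option
only distinguishes cache=off):

#define _GNU_SOURCE
#include <fcntl.h>

enum cache_policy {
    CACHE_WRITEBACK,     /* casual use, no critical data */
    CACHE_SHARED_RO,     /* backing file shared among guests */
    CACHE_WRITETHROUGH,  /* desktop, don't lose data */
    CACHE_NONE           /* dedicated virtualization engine */
};

/* Hypothetical mapping of the use cases above onto open(2) flags. */
int cache_policy_flags(enum cache_policy p)
{
    switch (p) {
    case CACHE_SHARED_RO:    return O_RDONLY;
    case CACHE_WRITETHROUGH: return O_RDWR | O_SYNC;
    case CACHE_NONE:         return O_RDWR | O_DIRECT;
    case CACHE_WRITEBACK:
    default:                 return O_RDWR;
    }
}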

-- 
error compiling committee.c: too many arguments to function


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 14:06             ` Anthony Liguori
@ 2008-05-21 15:31               ` Jamie Lokier
  0 siblings, 0 replies; 48+ messages in thread
From: Jamie Lokier @ 2008-05-21 15:31 UTC (permalink / raw)
  To: qemu-devel

Anthony Liguori wrote:
> >If you change part of the data but leave the checksum as-is, you'd 
> >hopefully get a checksum mismatch... ;)
> 
> That's the point though, you're not changing part of the data.  You're 
> rewriting the same data.

Usually you're changing some bytes in a block but leaving others the same.
Why else would you write at all?

But even if you do write the same bits _exactly_, the bits written
will not overlay the original data bits precisely enough to make
aborting in the middle seamless.

-- Jamie


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 14:57                           ` Avi Kivity
@ 2008-05-21 15:34                             ` Jamie Lokier
  2008-05-21 16:02                               ` Anthony Liguori
  2008-05-21 16:44                               ` Avi Kivity
  0 siblings, 2 replies; 48+ messages in thread
From: Jamie Lokier @ 2008-05-21 15:34 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Paul Brook

Avi Kivity wrote:
> Here's a summary of the use cases I saw so far:
> 
> - casual use, no critical data: write back cache
> 
> - backing file shared among many guests: read-only, cached
> 
> - desktop system, but don't lose my data: O_SYNC
> (significant resources on the host)
> 
> - dedicated virtualization engine: O_DIRECT
> (most host resources assigned to guests)

Sounds alright, but on _my_ desktop system (a laptop), I would use O_DIRECT.

There isn't enough RAM in my system to be happy duplicating data in
guests and hosts at the same time.  VMs are quite demanding on RAM.

However, if you find a way to map host cached pages into the guest
without copying - so sharing the RAM - that would be excellent.  It
can be done in principle, by remapping pages to satisfy IDE/SCSI DMA
requests.  I don't know if it would be fast enough.  Perhaps it would
work better in KVM than QEMU.

-- Jamie


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 15:34                             ` Jamie Lokier
@ 2008-05-21 16:02                               ` Anthony Liguori
  2008-05-21 16:24                                 ` Jamie Lokier
  2008-05-21 16:45                                 ` Avi Kivity
  2008-05-21 16:44                               ` Avi Kivity
  1 sibling, 2 replies; 48+ messages in thread
From: Anthony Liguori @ 2008-05-21 16:02 UTC (permalink / raw)
  To: Jamie Lokier; +Cc: Blue Swirl, Laurent Vivier, qemu-devel, Paul Brook

Jamie Lokier wrote:
> Avi Kivity wrote:
>   
>> Here's a summary of the use cases I saw so far:
>>
>> - casual use, no critical data: write back cache
>>
>> - backing file shared among many guests: read-only, cached
>>
>> - desktop system, but don't lose my data: O_SYNC
>> (significant resources on the host)
>>
>> - dedicated virtualization engine: O_DIRECT
>> (most host resources assigned to guests)
>>     
>
> Sounds alright, but on _my_ desktop system (a laptop), I would use O_DIRECT.
>
> There isn't enough RAM in my system to be happy duplicating data in
> guests and hosts at the same time.  VMs are quite demanding on RAM.
>
> However, if you find a way to map host cached pages into the guest
> without copying - so sharing the RAM - that would be excellent.  It
> can be done in principle, by remapping pages to satisfy IDE/SCSI DMA
> requests.  I don't know if it would be fast enough.  Perhaps it would
> work better in KVM than QEMU.
>   

Actually, this is precisely what I'd like to do.  The key is to 
mmap(MAP_PRIVATE) from the underlying file directly into the guest's 
memory.  Should be just as applicable to QEMU as KVM (although for KVM 
we need mmu-notifiers first).

Should be a pretty good win when running multiple guests with the same 
backing file too.
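
A bare-bones sketch of that mapping (assumed names; n and m must be
page-aligned, and guest_ram_x is where guest physical address X lives
in QEMU's address space):

#include <sys/types.h>
#include <sys/mman.h>

/* Sketch: back a page-aligned region of guest RAM directly with the
   host page cache of the image file.  With MAP_PRIVATE, the first
   guest write triggers the kernel's copy-on-write, detaching the
   page from the file. */
int map_disk_into_guest(void *guest_ram_x, int image_fd, off_t n, size_t m)
{
    void *p = mmap(guest_ram_x, m, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_FIXED, image_fd, n);
    return p == MAP_FAILED ? -1 : 0;
}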

Regards,

Anthony Liguori

> -- Jamie
>   


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 16:02                               ` Anthony Liguori
@ 2008-05-21 16:24                                 ` Jamie Lokier
  2008-05-21 16:48                                   ` Avi Kivity
  2008-05-21 16:45                                 ` Avi Kivity
  1 sibling, 1 reply; 48+ messages in thread
From: Jamie Lokier @ 2008-05-21 16:24 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Blue Swirl, Laurent Vivier, qemu-devel, Paul Brook

Anthony Liguori wrote:
> Actually, this is precisely what I'd like to do.  The key is to 
> mmap(MAP_PRIVATE) from the underlying file directly into the guest's 
> memory.  Should be just as applicable to QEMU as KVM (although for KVM 
> we need mmu-notifiers first).

With QEMU, that would lead to a huge number of VMAs - different mmap
address regions.  Linux (or any host for all I know) doesn't handle
that well.

On Linux you could use remap_file_pages to avoid the VMA problem.
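
Roughly like this (a sketch; the window must be one big MAP_SHARED
mapping of the image, which is exactly the limitation raised later in
the thread):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <unistd.h>

/* Sketch: rearrange which file page backs which guest page inside a
   single MAP_SHARED window, without creating additional VMAs. */
int remap_one_page(void *shared_window, size_t guest_page, size_t file_page)
{
    long psz = sysconf(_SC_PAGESIZE);
    return remap_file_pages((char *)shared_window + guest_page * psz,
                            psz, 0 /* prot must be 0 */, file_page, 0);
}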

-- Jamie


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 15:34                             ` Jamie Lokier
  2008-05-21 16:02                               ` Anthony Liguori
@ 2008-05-21 16:44                               ` Avi Kivity
  1 sibling, 0 replies; 48+ messages in thread
From: Avi Kivity @ 2008-05-21 16:44 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Paul Brook

Jamie Lokier wrote:
> Avi Kivity wrote:
>   
>> Here's a summary of the use cases I saw so far:
>>
>> - casual use, no critical data: write back cache
>>
>> - backing file shared among many guests: read-only, cached
>>
>> - desktop system, but don't lose my data: O_SYNC
>> (significant resources on the host)
>>
>> - dedicated virtualization engine: O_DIRECT
>> (most host resources assigned to guests)
>>     
>
> Sounds alright, but on _my_ desktop system (a laptop), I would use O_DIRECT.
>
> There isn't enough RAM in my system to be happy duplicating data in
> guests and hosts at the same time.  VMs are quite demanding on RAM.
>
>   

Sure, if you're low on resources, and aren't rebooting often, that's the 
right thing to do.

> However, if you find a way to map host cached pages into the guest
> without copying - so sharing the RAM - that would be excellent.  It
> can be done in principle, by remapping pages to satisfy IDE/SCSI DMA
> requests.  I don't know if it would be fast enough.  Perhaps it would
> work better in KVM than QEMU.
>   

Sounds like a memory management nightmare.  With mmu notifiers (or plain 
qemu), though, it can be done.  Have the backing file also contain an 
area for guest RAM.  Use a nonlinear mapping to map this area as guest 
memory.  If the guest issues a properly-aligned read, call 
remap_file_pages() for that page, and write-protect it.  When you get a 
protection violation (as the guest writes to that memory), copy it to 
the RAM area and remap it again.

I don't think remap_file_pages() supports different protections in a 
single VMA; that could kill the whole idea.
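
The copy-out step of that scheme could look roughly like this in
userspace (purely a sketch: it assumes 4K pages, ignores sigaltstack
and error checking, and changes the mapping per page, which fragments
VMAs -- the very problem under discussion):

#define _GNU_SOURCE
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>

#define GUEST_PAGE 4096   /* assumed page size */

/* Sketch of the fault path: the guest wrote to a write-protected,
   file-backed page; detach it by copying it into anonymous RAM. */
static void wp_fault(int sig, siginfo_t *si, void *uc)
{
    char *page = (char *)((uintptr_t)si->si_addr &
                          ~(uintptr_t)(GUEST_PAGE - 1));
    char tmp[GUEST_PAGE];

    memcpy(tmp, page, GUEST_PAGE);            /* page is still readable */
    mmap(page, GUEST_PAGE, PROT_READ | PROT_WRITE,
         MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
    memcpy(page, tmp, GUEST_PAGE);            /* now plain guest RAM */
}

void install_wp_handler(void)
{
    struct sigaction sa;

    memset(&sa, 0, sizeof(sa));
    sa.sa_sigaction = wp_fault;
    sa.sa_flags = SA_SIGINFO;
    sigaction(SIGSEGV, &sa, NULL);
}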

-- 
error compiling committee.c: too many arguments to function


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 16:02                               ` Anthony Liguori
  2008-05-21 16:24                                 ` Jamie Lokier
@ 2008-05-21 16:45                                 ` Avi Kivity
  1 sibling, 0 replies; 48+ messages in thread
From: Avi Kivity @ 2008-05-21 16:45 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Paul Brook

Anthony Liguori wrote:
>
> Actually, this is precisely what I'd like to do.  The key is to 
> mmap(MAP_PRIVATE) from the underlying file directly into the guest's 
> memory.  Should be just as applicable to QEMU as KVM (although for KVM 
> we need mmu-notifiers first).
>

mmap()ing a page at a time will generate a horribly long vma list.  You 
need nonlinear mappings for this to be efficient.

-- 
error compiling committee.c: too many arguments to function


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 16:24                                 ` Jamie Lokier
@ 2008-05-21 16:48                                   ` Avi Kivity
  2008-05-21 17:01                                     ` Andrea Arcangeli
  0 siblings, 1 reply; 48+ messages in thread
From: Avi Kivity @ 2008-05-21 16:48 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier, Andrea Arcangeli, Paul Brook

Jamie Lokier wrote:
> Anthony Liguori wrote:
>   
>> Actually, this is precisely what I'd like to do.  The key is to 
>> mmap(MAP_PRIVATE) from the underlying file directly into the guest's 
>> memory.  Should be just as applicable to QEMU as KVM (although for KVM 
>> we need mmu-notifiers first).
>>     
>
> With QEMU, that would lead to a huge number of VMAs - different mmap
> address regions.  Linux (or any host for all I know) doesn't handle
> that well.
>
> On Linux you could use remap_file_pages to avoid the VMA problem.
>
>   

Hmm, if remap_file_pages() supports MAP_PRIVATE, that solves all the 
problems neatly.

Andrea, do you know if that combination works?

-- 
error compiling committee.c: too many arguments to function


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 16:48                                   ` Avi Kivity
@ 2008-05-21 17:01                                     ` Andrea Arcangeli
  2008-05-21 17:18                                       ` Avi Kivity
  0 siblings, 1 reply; 48+ messages in thread
From: Andrea Arcangeli @ 2008-05-21 17:01 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Blue Swirl, Laurent Vivier, qemu-devel, Paul Brook

On Wed, May 21, 2008 at 07:48:24PM +0300, Avi Kivity wrote:
> Hmm, if remap_file_pages() supports MAP_PRIVATE, that solves all the 
> problems neatly.
>
> Andrea, do you know if that combination works?

No, it only supports MAP_SHARED. Why would anyone want MAP_PRIVATE for
real I/O operations? MAP_PRIVATE is primarily for binaries and things
that are readonly on disk and that we may want to update (like for
dynamic linking w/o hardlinking).

Or is that for -snapshot support, to keep the modifications in
anonymous ram?

I imagine you want remap_file_pages for zerocopy I/O without using
O_DIRECT. If you use O_DIRECT you don't need mmap. However O_DIRECT
works best with kernel asyncio for small seeking blocks, so all I/O can
be submitted at the same time. writev also works better than write for
O_DIRECT.
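
For illustration, the kernel-asyncio path referred to here looks
roughly like this with libaio (a sketch: a single aligned O_DIRECT
read, submitted and reaped synchronously; real code would keep many
iocbs in flight):

#include <libaio.h>
#include <sys/types.h>

/* Sketch: submit one aligned read via kernel AIO and wait for it.
   fd is assumed open with O_DIRECT; buf/len/offset 512-byte aligned. */
int aio_read_once(int fd, void *buf, size_t len, off_t offset)
{
    io_context_t ctx = 0;
    struct iocb cb, *cbs[1] = { &cb };
    struct io_event ev;
    int ret = -1;

    if (io_setup(1, &ctx) < 0)
        return -1;
    io_prep_pread(&cb, fd, buf, len, offset);
    if (io_submit(ctx, 1, cbs) == 1 &&
        io_getevents(ctx, 1, 1, &ev, NULL) == 1 &&
        ev.res == len)
        ret = 0;
    io_destroy(ctx);
    return ret;
}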


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 17:01                                     ` Andrea Arcangeli
@ 2008-05-21 17:18                                       ` Avi Kivity
  2008-05-21 17:47                                         ` Andrea Arcangeli
  0 siblings, 1 reply; 48+ messages in thread
From: Avi Kivity @ 2008-05-21 17:18 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Blue Swirl, Laurent Vivier, qemu-devel, Paul Brook

Andrea Arcangeli wrote:
> On Wed, May 21, 2008 at 07:48:24PM +0300, Avi Kivity wrote:
>   
>> Hmm, if remap_file_pages() supports MAP_PRIVATE, that solves all the 
>> problems neatly.
>>
>> Andrea, do you know if that combination works?
>>     
>
> No, it only supports MAP_SHARED. Why would anyone want MAP_PRIVATE for
> real I/O operations? MAP_PRIVATE is primarily for binaries and things
> that are readonly on disk and that we may want to update (like for
> dynamic linking w/o hardlinking).
>
> Or is that for -snapshot support, to keep the modifications in
> anonymous ram?
>
> I imagine you want remap_file_pages for zerocopy I/O without using
> O_DIRECT. If you use O_DIRECT you don't need mmap. However O_DIRECT
> work best with kernel asyncio for small seeking blocks so all I/O can
> be submitted at the same time. writev also works better than write for
> O_DIRECT.
>   

Yes, that's the reason.  Here zerocopy is not the motivation; instead, 
we have host-cached pages that are used directly in the guest.  So we 
get both a reduced memory footprint and host caching.  O_DIRECT reduces 
the memory footprint but kills host caching.

The scenario is desktop/laptop use.  For server use O_DIRECT is clearly 
preferred due to much reduced overhead.

-- 
error compiling committee.c: too many arguments to function


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 17:18                                       ` Avi Kivity
@ 2008-05-21 17:47                                         ` Andrea Arcangeli
  2008-05-21 17:53                                           ` Anthony Liguori
  2008-05-21 18:29                                           ` Avi Kivity
  0 siblings, 2 replies; 48+ messages in thread
From: Andrea Arcangeli @ 2008-05-21 17:47 UTC (permalink / raw)
  To: Avi Kivity; +Cc: Blue Swirl, Laurent Vivier, qemu-devel, Paul Brook

On Wed, May 21, 2008 at 08:18:01PM +0300, Avi Kivity wrote:
> Yes, that's the reason.  Here zerocopy is not the motivation; instead, we 
> have host-cached pages that are used directly in the guest.  So we get both 
> a reduced memory footprint and host caching.  O_DIRECT reduces the memory 
> footprint but kills host caching.

Sure. So MAP_SHARED+remap_file_pages should work just fine to achieve
zerocopy I/O.

> The scenario is desktop/laptop use.  For server use O_DIRECT is clearly 
> preferred due to much reduced overhead.

Well in some ways there's more overhead with O_DIRECT because O_DIRECT
has to call get_user_pages and walk pagetables in software before
every I/O operation. MAP_SHARED walks them in hardware and it can take
advantage of the CPU tlb too.

The primary problem of MAP_SHARED isn't the overhead of the operation
itself (in fact it will be lower with MAP_SHARED once the cache is
allocated), but the write throttling and garbage collection of the host
caches. If you've a 250G guest image, MAP_SHARED will allocate as much
as 250G of cache and a cp /dev/zero /dev/hdb in the guest will mark
100% of guest RAM dirty. The mkclean methods and write throttling for
MAP_SHARED introduced in reasonably recent kernels can avoid filling
100% of host ram with dirty pages, but it still requires write
throttling and it'll pollute the host caches and it can result in
large ram allocations in the host having to block before the ram is
available, the same way as buffered writes in the host (current KVM
default).

I think O_DIRECT is the best solution and MAP_SHARED could become a
secondary option just for certain guest workloads with light I/O where
fairness isn't even a variable worth considering.

The cost of garbage collection of the mapped caches on the host isn't
trivial, and I don't mean because the nonlinear rmap logic has to scan
all pagetables; that's a minor cost compared to shrinking the host
caches before try_to_unmap is ever invoked, etc. Leaving the host
caches purely for host usage is surely fairer, and won't ever lead to
one guest doing heavy I/O thrashing the host caches and leaving all
other guests and host tasks hanging. If they do hang
for a few msec with O_DIRECT it'll be because they're waiting for I/O
and the elevator put them on the queue to wait for the disk to return
ready. But it won't be because of some write throttling during writes,
or in alloc_pages shrink methods that are calling ->writepage on the
dirty pages.

The other significant advantage of O_DIRECT is that you won't have to
call msync to provide journaling.
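
(For comparison, with a writable MAP_SHARED mapping a guest flush
would have to become something like this hypothetical helper, with
offset and len page-aligned:)

#include <sys/types.h>
#include <sys/mman.h>

/* Hypothetical: honour a guest cache flush (e.g. for journal
   ordering) on a writable MAP_SHARED mapping with an explicit msync. */
int flush_mapped_range(void *map_base, off_t offset, size_t len)
{
    return msync((char *)map_base + offset, len, MS_SYNC);
}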

I think O_DIRECT will work best for all usages, and it looks like a
higher priority to me than MAP_SHARED. MAP_SHARED will surely result
in better benchmarks for certain workloads though; imagine 'dd
if=/dev/hda of=/dev/zero iflag=direct bs=1M count=100' run on the
guest: it'll read from cache and do zero I/O starting from the second
run with MAP_SHARED ;).

If it was me I'd prefer O_DIRECT by default.

For full disclosure, you may also want to read this, though I strongly
disagree with those statements: http://kerneltrap.org/node/7563


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 17:47                                         ` Andrea Arcangeli
@ 2008-05-21 17:53                                           ` Anthony Liguori
  2008-05-21 18:08                                             ` Andrea Arcangeli
  2008-05-21 18:29                                           ` Avi Kivity
  1 sibling, 1 reply; 48+ messages in thread
From: Anthony Liguori @ 2008-05-21 17:53 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Blue Swirl, Laurent Vivier, qemu-devel, Paul Brook

Andrea Arcangeli wrote:
> On Wed, May 21, 2008 at 08:18:01PM +0300, Avi Kivity wrote:
>   
>> Yes, that's the reason.  Here zerocopy is not the motivation; instead, we 
>> have host-cached pages that are used directly in the guest.  So we get both 
>> a reduced memory footprint and host caching.  O_DIRECT reduces the memory 
>> footprint but kills host caching.
>>     
>
> Sure. So MAP_SHARED+remap_file_pages should work just fine to achieve
> zerocopy I/O.
>   

MAP_SHARED cannot be done transparently to the guest; that's the 
motivating reason behind MAP_PRIVATE.

Regards,

Anthony Liguori


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 17:53                                           ` Anthony Liguori
@ 2008-05-21 18:08                                             ` Andrea Arcangeli
  2008-05-21 18:25                                               ` Anthony Liguori
  0 siblings, 1 reply; 48+ messages in thread
From: Andrea Arcangeli @ 2008-05-21 18:08 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: Blue Swirl, Laurent Vivier, qemu-devel, Paul Brook

On Wed, May 21, 2008 at 12:53:52PM -0500, Anthony Liguori wrote:
> MAP_SHARED cannot be done transparently to the guest; that's the motivating 
> reason behind MAP_PRIVATE.

Could you elaborate on what 'done transparently' means? The only
difference is for writes. When the guest writes, MAP_PRIVATE will
copy-on-write. How can that be good if the guest generates many
copy-on-writes, evicting the cache pages from the mapping and
replacing them with anonymous memory?

I can't see how MAP_PRIVATE could replace O_DIRECT, there's no way to
write anything to disk with MAP_PRIVATE, msync on a MAP_PRIVATE is a
pure overhead noop for example, only MAP_SHARED has a chance to modify
any bit present on disk and it'll require msync at least every time
the host OS waits for I/O completion and assumes the journal
metadata/data is written on disk.

The one real advantage I see of MAP_PRIVATE/MAP_SHARED over O_DIRECT
is that the guest would boot the second time without triggering reads
from disk. But after the guest is booted, its runtime is likely going
to be better with O_DIRECT: the guest has its own filesystem caches in
guest memory, replicating them shouldn't pay off significantly for the
guest runtime even on a laptop, and doing so brings disadvantages on
the host by duplicating caches that already exist in the guest and by
decreasing the fairness of the system, not to mention the need for
msync for journaling. So besides the initial boot time I don't see
many advantages for MAP_PRIVATE/MAP_SHARED, at least unless you're
running msdos ;).


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 18:08                                             ` Andrea Arcangeli
@ 2008-05-21 18:25                                               ` Anthony Liguori
  2008-05-21 20:13                                                 ` Andrea Arcangeli
  0 siblings, 1 reply; 48+ messages in thread
From: Anthony Liguori @ 2008-05-21 18:25 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Laurent Vivier, Dave Hansen, qemu-devel, Blue Swirl, Paul Brook

Andrea Arcangeli wrote:
> On Wed, May 21, 2008 at 12:53:52PM -0500, Anthony Liguori wrote:
>   
>> MAP_SHARED cannot be done transparently to the guest; that's the motivating 
>> reason behind MAP_PRIVATE.
>>     
>
> Could you elaborate on what 'done transparently' means? The only
> difference is for writes. When the guest writes, MAP_PRIVATE will
> copy-on-write. How can that be good if the guest generates many
> copy-on-writes, evicting the cache pages from the mapping and
> replacing them with anonymous memory?
>   

I think we're talking about different things.  What I'm talking about is 
the following:

Guest issues DMA read from disk at offset N of size M to physical 
address X.   Today, we essentially read from the backing disk image from 
offset N into a temporary buffer of size M, and then memcpy() to 
physical address X.
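
In code, today's copying path is roughly (a sketch; the names are
illustrative, not the actual QEMU functions):

#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Sketch of the current path: DMA read of m bytes at image offset n,
   bounced through a temporary buffer into guest memory at x. */
int dma_read_bounce(int image_fd, off_t n, size_t m, void *x)
{
    void *bounce = malloc(m);
    ssize_t done;

    if (!bounce)
        return -1;
    done = pread(image_fd, bounce, m, n);
    if (done == (ssize_t)m)
        memcpy(x, bounce, m);     /* the copy that mmap() would avoid */
    free(bounce);
    return done == (ssize_t)m ? 0 : -1;
}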

What I would like to do, if N and M are multiples of PAGE_SIZE, is 
replace the memory at guest physical address X, with the host's page 
cache for N, M.  The guest is unaware of this though and it may decide 
to reclaim that memory for something else.  When this happens, we need 
to unmap guest physical address X and replace it with normal memory 
(essentially, CoW'ing).

The effect of this would be that if multiple guests are using the same 
disk image, they would end up sharing memory transparently.

With MMU notifiers, this is possible by just using mmap(MAP_PRIVATE | 
MAP_FIXED), assuming we fix gfn_to_pfn() to take a 'write' parameter; 
right now we always write-fault CoW mappings because we unconditionally 
call get_user_pages with write=1.

As has been pointed out, this is probably not ideal since it would cause 
heavy vma fragmentation.  We may be able to simulate this using the 
slots API although slots are quite similar to vma's in that we optimize 
for a small number of them.

I'm not really sure what's the best approach.

Regards,

Anthony Liguori

> I can't see how MAP_PRIVATE could replace O_DIRECT, there's no way to
> write anything to disk with MAP_PRIVATE, msync on a MAP_PRIVATE is a
> pure overhead noop for example, only MAP_SHARED has a chance to modify
> any bit present on disk and it'll require msync at least every time
> the host OS waits for I/O completion and assumes the journal
> metadata/data is written on disk.
>
> The one real advantage I see of MAP_PRIVATE/MAP_SHARED over O_DIRECT
> is that the guest would boot the second time without triggering reads
> from disk. But after the guest is booted, its runtime is likely going
> to be better with O_DIRECT: the guest has its own filesystem caches in
> guest memory, replicating them shouldn't pay off significantly for the
> guest runtime even on a laptop, and doing so brings disadvantages on
> the host by duplicating caches that already exist in the guest and by
> decreasing the fairness of the system, not to mention the need for
> msync for journaling. So besides the initial boot time I don't see
> many advantages for MAP_PRIVATE/MAP_SHARED, at least unless you're
> running msdos ;).
>   


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 17:47                                         ` Andrea Arcangeli
  2008-05-21 17:53                                           ` Anthony Liguori
@ 2008-05-21 18:29                                           ` Avi Kivity
  1 sibling, 0 replies; 48+ messages in thread
From: Avi Kivity @ 2008-05-21 18:29 UTC (permalink / raw)
  To: Andrea Arcangeli; +Cc: Blue Swirl, Laurent Vivier, qemu-devel, Paul Brook

Andrea Arcangeli wrote:
> On Wed, May 21, 2008 at 08:18:01PM +0300, Avi Kivity wrote:
>   
>> Yes, that's the reason.  Here zerocopy is not the motivation; instead, we 
>> have host-cached pages that are used directly in the guest.  So we get both 
>> a reduced memory footprint and host caching.  O_DIRECT reduces the memory 
>> footprint but kills host caching.
>>     
>
> Sure. So MAP_SHARED+remap_file_pages should work just fine to achieve
> zerocopy I/O.
>
>   

No, when the guest writes to memory, it will affect the disk, which 
doesn't happen with normal memory writes.  MAP_PRIVATE is needed.

>> The scenario is desktop/laptop use.  For server use O_DIRECT is clearly 
>> preferred due to much reduced overhead.
>>     
>
> Well in some ways there's more overhead with O_DIRECT because O_DIRECT
> has to call get_user_pages and walk pagetables in software before
> every I/O operation. MAP_SHARED walks them in hardware and it can take
> advantage of the CPU tlb too.
>
> The primary problem of MAP_SHARED isn't the overhead of the operation
> itself (in fact it will be lower with MAP_SHARED once the cache is
> allocated), but the write throttling and garbage collection of the host
> caches. If you've a 250G guest image, MAP_SHARED will allocate as much
> as 250G of cache and a cp /dev/zero /dev/hdb in the guest will mark
> 100% of guest RAM dirty. The mkclean methods and write throttling for
> MAP_SHARED introduced in reasonably recent kernels can avoid filling
> 100% of host ram with dirty pages, but it still requires write
> throttling and it'll pollute the host caches and it can result in
> large ram allocations in the host having to block before the ram is
> available, the same way as buffered writes in the host (current KVM
> default).
>   

I'd do writes via the normal write path, not mmap().

> I think O_DIRECT is the best solution and MAP_SHARED could become a
> secondary option just for certain guest workloads with light I/O where
> fairness isn't even a variable worth considering.
>
> The cost of garbage collection of the mapped caches on the host isn't
> trivial, and I don't mean because the nonlinear rmap logic has to scan
> all pagetables; that's a minor cost compared to shrinking the host
> caches before try_to_unmap is ever invoked, etc. Leaving the host
> caches purely for host usage is surely fairer, and won't ever lead to
> one guest doing heavy I/O thrashing the host caches and leaving all
> other guests and host tasks hanging. If they do hang
> for a few msec with O_DIRECT it'll be because they're waiting for I/O
> and the elevator put them on the queue to wait for the disk to return
> ready. But it won't be because of some write throttling during writes,
> or in alloc_pages shrink methods that are calling ->writepage on the
> dirty pages.
>
> The other significant advantage of O_DIRECT is that you won't have to
> call msync to provide journaling.
>
> I think O_DIRECT will work best for all usages, and it looks like a
> higher priority to me than MAP_SHARED. MAP_SHARED will surely result
> in better benchmarks for certain workloads though; imagine 'dd
> if=/dev/hda of=/dev/zero iflag=direct bs=1M count=100' run on the
> guest: it'll read from cache and do zero I/O starting from the second
> run with MAP_SHARED ;).
>
> If it was me I'd prefer O_DIRECT by default.
>
>   

Certainly O_DIRECT is the normal path.  We're considering mmap() as a 
way to get host caching while avoiding double-caching.

> For full disclosure, you may also want to read this, though I strongly
> disagree with those statements: http://kerneltrap.org/node/7563
>   

I disagree with them strongly too.  For general-purpose applications you 
want to avoid O_DIRECT, but for special-purpose applications that do 
their own caching (databases, virtualization, streaming servers), 
O_DIRECT is critical.

The kernel's cache management algorithms simply cannot compete with a 
specially tuned application, not to mention the additional overhead that 
comes from crossing a protection boundary.

[I've worked on a userspace filesystem that took every possible measure 
to get the OS out of the way: user level threads, O_DIRECT, aio, large 
pages]

-- 
Do not meddle in the internals of kernels, for they are subtle and quick to panic.


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 18:25                                               ` Anthony Liguori
@ 2008-05-21 20:13                                                 ` Andrea Arcangeli
  2008-05-21 20:35                                                   ` Anthony Liguori
  0 siblings, 1 reply; 48+ messages in thread
From: Andrea Arcangeli @ 2008-05-21 20:13 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Laurent Vivier, Dave Hansen, qemu-devel, Blue Swirl, Paul Brook

On Wed, May 21, 2008 at 01:25:59PM -0500, Anthony Liguori wrote:
> I think we're talking about different things.  What I'm talking about is 
> the following:
>
> Guest issues DMA read from disk at offset N of size M to physical address 
> X.   Today, we essentially read from the backing disk image from offset N 
> into a temporary buffer of size M, and then memcpy() to physical address X.
>
> What I would like to do, if N and M are multiples of PAGE_SIZE, is replace 
> the memory at guest physical address X, with the host's page cache for N, 
> M.  The guest is unaware of this though and it may decide to reclaim that 
> memory for something else.  When this happens, we need to unmap guest 
> physical address X and replace it with normal memory (essentially, 
> CoW'ing).
>
> The effect of this would be that if multiple guests are using the same disk 
> image, they would end up sharing memory transparently.
>
> With MMU notifiers, this is possible by just using mmap(MAP_PRIVATE | 
> MAP_FIXED), assuming we fix gfn_to_pfn() to take a 'write' parameter; right 
> now we always write-fault CoW mappings because we unconditionally call 
> get_user_pages with write=1.

Ok, now I see exactly what you're going after. So it'd save memory,
yes, but only with -snapshot... And it'd be zerocopy, yes, but it'd
need to flush the TLB of all CPUs (both the regular pte and the spte)
with IPIs for every pte overwritten, as the old pte could be cached in
the TLB even if this won't require further writes to the cache. IPIs
are likely more costly than a local memcpy of a 4k region. It's one
thing to call get_user_pages in O_DIRECT just to learn which physical
page the DMA should be directed to (in our case the anonymous page
pointed to by the gpa); it's quite another to mangle ptes and have to
update the TLBs for each emulated DMA operation, etc.

> As has been pointed out, this is probably not ideal since it would cause 
> heavy vma fragmentation.  We may be able to simulate this using the slots 
> API although slots are quite similar to vma's in that we optimize for a 
> small number of them.

I'm quite sure remap_file_pages can be extended to work on
MAP_PRIVATE. But I don't see a big benefit in sharing the RAM between
host and guest: having it in the guest is enough, this only works for
reads anyway, and it can only share RAM among different guests with
-snapshot.

So while it sounds like a clever trick, I doubt it's a worthwhile
optimization: it has downsides, and the worst is that I don't see how
we could extend this logic to work for writes, because the guest's
pagecache can't be written to disk before the DMA is explicitly
started by the guest.


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 20:13                                                 ` Andrea Arcangeli
@ 2008-05-21 20:35                                                   ` Anthony Liguori
  2008-05-21 20:42                                                     ` Andrea Arcangeli
  0 siblings, 1 reply; 48+ messages in thread
From: Anthony Liguori @ 2008-05-21 20:35 UTC (permalink / raw)
  To: Andrea Arcangeli
  Cc: Laurent Vivier, Dave Hansen, qemu-devel, Blue Swirl, Paul Brook

Andrea Arcangeli wrote:
>> As has been pointed out, this is probably not ideal since it would cause 
>> heavy vma fragmentation.  We may be able to simulate this using the slots 
>> API although slots are quite similar to vma's in that we optimize for a 
>> small number of them.
>>     
>
> I'm quite sure remap_file_pages can be extended to work on
> MAP_PRIVATE. But I don't see a big benefit in sharing the RAM between
> host and guest: having it in the guest is enough, this only works for
> reads anyway, and it can only share RAM among different guests with
> -snapshot.
>   

Or if multiple guests are using the same backing file (imagine each 
guest has its own qcow file backed to a common one).  Or if we had a 
more advanced storage system that did something like content addressable 
storage.

Regards,

Anthony Liguori

> So while it sounds like a clever trick, I doubt it's a worthwhile
> optimization: it has downsides, and the worst is that I don't see how
> we could extend this logic to work for writes, because the guest's
> pagecache can't be written to disk before the DMA is explicitly
> started by the guest.
>   


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-21 20:35                                                   ` Anthony Liguori
@ 2008-05-21 20:42                                                     ` Andrea Arcangeli
  0 siblings, 0 replies; 48+ messages in thread
From: Andrea Arcangeli @ 2008-05-21 20:42 UTC (permalink / raw)
  To: Anthony Liguori
  Cc: Laurent Vivier, Dave Hansen, qemu-devel, Blue Swirl, Paul Brook

On Wed, May 21, 2008 at 03:35:33PM -0500, Anthony Liguori wrote:
> Or if multiple guests are using the same backing file (imagine each guest 
> has its own qcow file backed to a common one).  Or if we had a more 
> advanced storage system that did something like content addressable 
> storage.

Agreed. At least the pagecache sharing among different guests can be
achieved with ksm too, though, even if they don't share any qcow.


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-20 19:47 ` [Qemu-devel] " Anthony Liguori
  2008-05-20 22:36   ` Jamie Lokier
@ 2008-05-23  9:12   ` Laurent Vivier
  2008-05-28  7:01     ` Kevin Wolf
  1 sibling, 1 reply; 48+ messages in thread
From: Laurent Vivier @ 2008-05-23  9:12 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Paul Brook, Kevin Wolf

As this patch has been approved by Anthony and all of Fabrice's
comments on the previous patch have been addressed, could it be
committed to the SVN repository?

Regards,
Laurent
-- 
------------- Laurent.Vivier@bull.net  --------------
          "Software is hard" - Donald Knuth


* Re: [Qemu-devel] Re: [PATCH][v2] Align file accesses with cache=off (O_DIRECT)
  2008-05-23  9:12   ` Laurent Vivier
@ 2008-05-28  7:01     ` Kevin Wolf
  0 siblings, 0 replies; 48+ messages in thread
From: Kevin Wolf @ 2008-05-28  7:01 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Paul Brook

Laurent Vivier schrieb:
> As this patch has been approved by Anthony and all of Fabrice's
> comments on the previous patch have been addressed, could it be
> committed to the SVN repository?

And still no answer... See, that's what I meant when I said that patches 
are completely ignored until they are committed. And only then are the 
reasons given why they haven't been committed.

Blue Swirl, Paul, Fabrice: do any of you have additional comments, or 
why don't you commit this patch?

Kevin
