qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
@ 2008-04-17 13:31 Kevin Wolf
  2008-04-28 15:34 ` Kevin Wolf
  0 siblings, 1 reply; 21+ messages in thread
From: Kevin Wolf @ 2008-04-17 13:31 UTC (permalink / raw)
  To: qemu-devel

[-- Attachment #1: Type: text/plain, Size: 767 bytes --]

In December a patch was applied which introduced the cache=off option to
-drive. When using this option files are opened with the O_DIRECT flag.
This means that all accesses have to be aligned. The patch made a couple
of changes in this respect, still in other places they are missing (e.g.
you can't use cache=off with qcow(2) files).

This patch implements wrappers for raw_pread and raw_pwrite which align
all file accesses and make qcow(2) work with cache=off. This method
might not be the most performant one (compared to fixing qcow, qcow2 and
everything else that might be using unaligned accesses), but unaligned
accesses don't happen that frequently and with this patch really all
image accesses should be covered.

Signed-off-by: Kevin Wolf <kwolf@suse.de>

[-- Attachment #2: align-odirect-accesses.patch --]
[-- Type: text/x-patch, Size: 4674 bytes --]

Index: block-raw-posix.c
===================================================================
--- block-raw-posix.c	(Revision 4215)
+++ block-raw-posix.c	(Arbeitskopie)
@@ -77,6 +77,7 @@
 typedef struct BDRVRawState {
     int fd;
     int type;
+    int flags;
     unsigned int lseek_err_cnt;
 #if defined(__linux__)
     /* linux floppy specific */
@@ -95,6 +96,7 @@
     BDRVRawState *s = bs->opaque;
     int fd, open_flags, ret;
 
+    s->flags = flags;
     s->lseek_err_cnt = 0;
 
     open_flags = O_BINARY;
@@ -141,7 +143,11 @@
 #endif
 */
 
-static int raw_pread(BlockDriverState *bs, int64_t offset,
+/* 
+ * offset and count are in bytes, but must be multiples of 512 (for files 
+ * opened with O_DIRECT). buf must be aligned to 512 bytes.
+ */
+static int raw_pread_aligned(BlockDriverState *bs, int64_t offset,
                      uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
@@ -194,7 +200,11 @@
     return ret;
 }
 
-static int raw_pwrite(BlockDriverState *bs, int64_t offset,
+/* 
+ * offset and count are in bytes, but must be multiples of 512 (for files 
+ * opened with O_DIRECT). buf must be aligned to 512 bytes.
+ */
+static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset,
                       const uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
@@ -230,6 +240,92 @@
     return ret;
 }
 
+
+#ifdef O_DIRECT
+/* 
+ * offset and count are in bytes and possibly not aligned. For files opened 
+ * with O_DIRECT, necessary alignments are ensured before calling 
+ * raw_pread_aligned to do the actual read.
+ */
+static int raw_pread(BlockDriverState *bs, int64_t offset,
+                     uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (unlikely((s->flags & BDRV_O_DIRECT) &&
+            (offset % 512 != 0 || (uintptr_t) buf % 512))) {
+
+        uint8_t* aligned_buf;
+        int64_t aligned_offs = offset & ~511;
+        int aligned_count = (count + offset - aligned_offs + 511) & ~511;
+        int ret;
+
+        aligned_buf = qemu_memalign(512, 512 * aligned_count);
+        ret = raw_pread_aligned(bs, aligned_offs, aligned_buf, aligned_count);
+
+        if (ret > count)
+            ret = count;
+        if (ret > 0)
+            memcpy(buf, aligned_buf + (offset - aligned_offs), ret);
+
+        qemu_free(aligned_buf);
+        return ret;
+
+    } else {
+        return raw_pread_aligned(bs, offset, buf, count);
+    }
+}
+
+/* 
+ * offset and count are in bytes and possibly not aligned. For files opened 
+ * with O_DIRECT, necessary alignments are ensured before calling 
+ * raw_pwrite_aligned to do the actual write.
+ */
+static int raw_pwrite(BlockDriverState *bs, int64_t offset,
+                      const uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (unlikely((s->flags & BDRV_O_DIRECT) &&
+            (offset % 512 != 0 || (uintptr_t) buf % 512))) {
+
+        uint8_t* aligned_buf;
+        int64_t aligned_offs = offset & ~511;
+        int aligned_count = (count + offset - aligned_offs + 511) & ~511;
+        int ret;
+
+        aligned_buf = qemu_memalign(512, 512 * aligned_count);
+
+        /* Read in the first block if needed */
+        if (offset - aligned_offs != 0)
+            raw_pread_aligned(bs, aligned_offs, aligned_buf, 512);
+
+        /* Read in the last block if needed */
+        if ((aligned_count > 512) && ((offset + count) % 512 != 0))
+            raw_pread_aligned(bs,
+                aligned_offs + aligned_count - 512,
+                aligned_buf + aligned_count - 512,
+                512);
+
+        memcpy(aligned_buf + (offset - aligned_offs), buf, count);
+        ret = raw_pwrite_aligned(bs, aligned_offs, aligned_buf, aligned_count);
+
+        if (ret > count)
+            ret = count;
+
+        qemu_free(aligned_buf);
+        return ret;
+    } else {
+        return raw_pwrite_aligned(bs, offset, buf, count);
+    }
+}
+
+#else
+#define raw_pread raw_pread_aligned
+#define raw_pwrite raw_pwrite_aligned
+#endif
+
+
 /***********************************************************/
 /* Unix AIO using POSIX AIO */
 
Index: Makefile
===================================================================
--- Makefile	(Revision 4215)
+++ Makefile	(Arbeitskopie)
@@ -34,7 +34,7 @@
 #######################################################################
 # BLOCK_OBJS is code used by both qemu system emulation and qemu-img
 
-BLOCK_OBJS=cutils.o
+BLOCK_OBJS=cutils.o osdep.o
 BLOCK_OBJS+=block-cow.o block-qcow.o aes.o block-vmdk.o block-cloop.o
 BLOCK_OBJS+=block-dmg.o block-bochs.o block-vpc.o block-vvfat.o
 BLOCK_OBJS+=block-qcow2.o block-parallels.o

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-04-17 13:31 [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT) Kevin Wolf
@ 2008-04-28 15:34 ` Kevin Wolf
  2008-04-29  9:01   ` Laurent Vivier
  0 siblings, 1 reply; 21+ messages in thread
From: Kevin Wolf @ 2008-04-28 15:34 UTC (permalink / raw)
  To: qemu-devel

Kevin Wolf schrieb:
> In December a patch was applied which introduced the cache=off option to
> -drive. When using this option files are opened with the O_DIRECT flag.
> This means that all accesses have to be aligned. The patch made a couple
> of changes in this respect, still in other places they are missing (e.g.
> you can't use cache=off with qcow(2) files).
> 
> This patch implements wrappers for raw_pread and raw_pwrite which align
> all file accesses and make qcow(2) work with cache=off. This method
> might not be the most performant one (compared to fixing qcow, qcow2 and
> everything else that might be using unaligned accesses), but unaligned
> accesses don't happen that frequently and with this patch really all
> image accesses should be covered.
> 
> Signed-off-by: Kevin Wolf <kwolf@suse.de>

Anything wrong with this one?

Kevin

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-04-28 15:34 ` Kevin Wolf
@ 2008-04-29  9:01   ` Laurent Vivier
  2008-04-29 14:49     ` Kevin Wolf
  0 siblings, 1 reply; 21+ messages in thread
From: Laurent Vivier @ 2008-04-29  9:01 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kevin Wolf


Le lundi 28 avril 2008 à 17:34 +0200, Kevin Wolf a écrit :
> Kevin Wolf schrieb:
> > In December a patch was applied which introduced the cache=off option to
> > -drive. When using this option files are opened with the O_DIRECT flag.
> > This means that all accesses have to be aligned. The patch made a couple
> > of changes in this respect, still in other places they are missing (e.g.
> > you can't use cache=off with qcow(2) files).
> > 
> > This patch implements wrappers for raw_pread and raw_pwrite which align
> > all file accesses and make qcow(2) work with cache=off. This method
> > might not be the most performant one (compared to fixing qcow, qcow2 and
> > everything else that might be using unaligned accesses), but unaligned
> > accesses don't happen that frequently and with this patch really all
> > image accesses should be covered.
> > 
> > Signed-off-by: Kevin Wolf <kwolf@suse.de>
> 
> Anything wrong with this one?

Hi Kevin,

I did a patch to manage this problem too:

http://thread.gmane.org/gmane.comp.emulators.qemu/22881

What I like in your patch:
- simplicity
What I don't like in your patch:
- memcpy() and memalign()

What I like in my patch:
- no memcpy()
What I don't like in my patch:
- complexity

But if we want to keep simplicity without memcpy(), we could only
de-activate O_DIRECT on pread() or pwrite() :

 fd_arg = fcntl(fd, F_GETFL);
 if ((fd_arg & O_DIRECT) == 0)
     return 0;
 fcntl(fd, F_SETFL, fd_arg & ~O_DIRECT);

And restore it after.

Regards,
Laurent
-- 
------------- Laurent.Vivier@bull.net ---------------
"The best way to predict the future is to invent it."
- Alan Kay

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-04-29  9:01   ` Laurent Vivier
@ 2008-04-29 14:49     ` Kevin Wolf
  2008-04-29 15:48       ` Laurent Vivier
  2008-04-30  0:02       ` Jamie Lokier
  0 siblings, 2 replies; 21+ messages in thread
From: Kevin Wolf @ 2008-04-29 14:49 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: qemu-devel

[-- Attachment #1: Type: text/plain, Size: 964 bytes --]

Hi Laurent,

Laurent Vivier schrieb:
> But if we want too keep simplicity without memcpy(), we could only
> de-activate O_DIRECT on pread() or pwrite() :

You're right, this approach is simpler, better and makes the patch 
smaller. I've attached a new version of the patch.

However, now that you've pointed me to your patch I realize that my 
patch might be simple but it is incomplete as well. Normal read/write 
operation on qcow images should work fine (only metadata is unaligned 
and there pread is used). Snapshots don't work though because they end 
up in aio requests which are not routed to raw_pread.

Disabling O_DIRECT for a single aio request is impossible (after all, 
aio is asynchronous), and disabling it for at least one aio request is 
going to be ugly. So maybe we better turn O_DIRECT off for snapshot 
saving/loading, even if it's not the generic fix I wanted to have when I 
started.

I'm still undecided, though. What do you think?

Kevin

[-- Attachment #2: align-odirect-accesses.patch --]
[-- Type: text/x-patch, Size: 3620 bytes --]

Index: block-raw-posix.c
===================================================================
--- block-raw-posix.c.orig
+++ block-raw-posix.c
@@ -77,6 +77,7 @@
 typedef struct BDRVRawState {
     int fd;
     int type;
+    int flags;
     unsigned int lseek_err_cnt;
 #if defined(__linux__)
     /* linux floppy specific */
@@ -95,6 +96,7 @@ static int raw_open(BlockDriverState *bs
     BDRVRawState *s = bs->opaque;
     int fd, open_flags, ret;
 
+    s->flags = flags;
     s->lseek_err_cnt = 0;
 
     open_flags = O_BINARY;
@@ -141,7 +143,14 @@ static int raw_open(BlockDriverState *bs
 #endif
 */
 
-static int raw_pread(BlockDriverState *bs, int64_t offset,
+/* 
+ * offset and count are in bytes, but must be multiples of 512 for files 
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ */
+static int raw_pread_aligned(BlockDriverState *bs, int64_t offset,
                      uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
@@ -194,7 +203,14 @@ label__raw_read__success:
     return ret;
 }
 
-static int raw_pwrite(BlockDriverState *bs, int64_t offset,
+/* 
+ * offset and count are in bytes, but must be multiples of 512 for files 
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ */
+static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset,
                       const uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
@@ -230,6 +246,69 @@ label__raw_write__success:
     return ret;
 }
 
+
+#ifdef O_DIRECT
+/* 
+ * offset and count are in bytes and possibly not aligned. For files opened 
+ * with O_DIRECT, necessary alignments are ensured before calling 
+ * raw_pread_aligned to do the actual read.
+ */
+static int raw_pread(BlockDriverState *bs, int64_t offset,
+                     uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (unlikely((s->flags & BDRV_O_DIRECT) &&
+            (offset % 512 != 0 || (uintptr_t) buf % 512))) {
+
+        int flags, ret;
+
+        // Temporarily disable O_DIRECT for unaligned access
+        flags = fcntl(s->fd, F_GETFL);
+        fcntl(s->fd, F_SETFL, flags & ~O_DIRECT);
+        ret = raw_pread_aligned(bs, offset, buf, count);
+        fcntl(s->fd, F_SETFL, flags);
+
+        return ret;
+
+    } else {
+        return raw_pread_aligned(bs, offset, buf, count);
+    }
+}
+
+/* 
+ * offset and count are in bytes and possibly not aligned. For files opened 
+ * with O_DIRECT, necessary alignments are ensured before calling 
+ * raw_pwrite_aligned to do the actual write.
+ */
+static int raw_pwrite(BlockDriverState *bs, int64_t offset,
+                      const uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (unlikely((s->flags & BDRV_O_DIRECT) &&
+            (offset % 512 != 0 || (uintptr_t) buf % 512))) {
+
+        int flags, ret;
+
+        // Temporarily disable O_DIRECT for unaligned access
+        flags = fcntl(s->fd, F_GETFL);
+        fcntl(s->fd, F_SETFL, flags & ~O_DIRECT);
+        ret = raw_pwrite_aligned(bs, offset, buf, count);
+        fcntl(s->fd, F_SETFL, flags);
+
+        return ret;
+    } else {
+        return raw_pwrite_aligned(bs, offset, buf, count);
+    }
+}
+
+#else
+#define raw_pread raw_pread_aligned
+#define raw_pwrite raw_pwrite_aligned
+#endif
+
+
 /***********************************************************/
 /* Unix AIO using POSIX AIO */
 

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-04-29 14:49     ` Kevin Wolf
@ 2008-04-29 15:48       ` Laurent Vivier
  2008-04-29 16:21         ` Kevin Wolf
  2008-04-30  0:02       ` Jamie Lokier
  1 sibling, 1 reply; 21+ messages in thread
From: Laurent Vivier @ 2008-04-29 15:48 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: qemu-devel


Le mardi 29 avril 2008 à 16:49 +0200, Kevin Wolf a écrit :
> Hi Laurent,
> 
> Laurent Vivier schrieb:
> > But if we want too keep simplicity without memcpy(), we could only
> > de-activate O_DIRECT on pread() or pwrite() :
> 
> You're right, this approach is simpler, better and makes the patch 
> smaller. I've attached a new version of the patch.
> 
> However, now that you've pointed me to your patch I realize that my 
> patch might be simple but it is incomplete as well. Normal read/write 
> operation on qcow images should work fine (only metadata is unaligned 
> and there pread is used). Snapshots don't work though because they end 
> up in aio requests which are not routed to raw_pread.
> 
> Disabling O_DIRECT for a single aio request is impossible (after all, 
> aio is asynchronous), and disabling it for at least one aio request is 

Perhaps I'm wrong, but I think it is possible: the only consequence is
the asynchronous I/O becomes synchronous...

> going to be ugly. So maybe we better turn O_DIRECT off for snapsnot 
> saving/loading, even if it's not the generic fix I wanted to have when I 
> started.

I don't think it is a good idea:

In linux world, there are three reasons to use O_DIRECT:

1- to use linux AIO (not POSIX AIO).

2- to avoid a buffer copy between user- and kernel- space
(performance ?)

3- to increase reliability: by using O_DIRECT you are sure your data are
on the disk when the write is over and your system can now crash (if it
wants).

And I think reliability is better when the snapshot is being saved...

> I'm still undecided, though. What do you think?

Is it possible to align the last AIO ?

And see comments below

> Index: block-raw-posix.c
> ===================================================================
> --- block-raw-posix.c.orig
> +++ block-raw-posix.c
> @@ -77,6 +77,7 @@
>  typedef struct BDRVRawState {
>      int fd;
>      int type;
> +    int flags;
>      unsigned int lseek_err_cnt;
>  #if defined(__linux__)
>      /* linux floppy specific */
> @@ -95,6 +96,7 @@ static int raw_open(BlockDriverState *bs
>      BDRVRawState *s = bs->opaque;
>      int fd, open_flags, ret;
>  
> +    s->flags = flags;
>      s->lseek_err_cnt = 0;

I think you should store open_flags instead of flags (see below).

>      open_flags = O_BINARY;
> @@ -141,7 +143,14 @@ static int raw_open(BlockDriverState *bs
>  #endif
>  */
>  
> -static int raw_pread(BlockDriverState *bs, int64_t offset,
> +/* 
> + * offset and count are in bytes, but must be multiples of 512 for
> files 
> + * opened with O_DIRECT. buf must be aligned to 512 bytes then.
> + *
> + * This function may be called without alignment if the caller
> ensures
> + * that O_DIRECT is not in effect.
> + */
> +static int raw_pread_aligned(BlockDriverState *bs, int64_t offset,
>                       uint8_t *buf, int count)
>  {
>      BDRVRawState *s = bs->opaque;
> @@ -194,7 +203,14 @@ label__raw_read__success:
>      return ret;
>  }
>  
> -static int raw_pwrite(BlockDriverState *bs, int64_t offset,
> +/* 
> + * offset and count are in bytes, but must be multiples of 512 for
> files 
> + * opened with O_DIRECT. buf must be aligned to 512 bytes then.
> + *
> + * This function may be called without alignment if the caller
> ensures
> + * that O_DIRECT is not in effect.
> + */
> +static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset,
>                        const uint8_t *buf, int count)
>  {
>      BDRVRawState *s = bs->opaque;
> @@ -230,6 +246,69 @@ label__raw_write__success:
>      return ret;
>  }
>  
> +
> +#ifdef O_DIRECT
> +/* 
> + * offset and count are in bytes and possibly not aligned. For files
> opened 
> + * with O_DIRECT, necessary alignments are ensured before calling 
> + * raw_pread_aligned to do the actual read.
> + */
> +static int raw_pread(BlockDriverState *bs, int64_t offset,
> +                     uint8_t *buf, int count)
> +{
> +    BDRVRawState *s = bs->opaque;
> +
> +    if (unlikely((s->flags & BDRV_O_DIRECT) &&
> +            (offset % 512 != 0 || (uintptr_t) buf % 512))) {
> +
> +        int flags, ret;
> +
> +        // Temporarily disable O_DIRECT for unaligned access
> +        flags = fcntl(s->fd, F_GETFL);
> +        fcntl(s->fd, F_SETFL, flags & ~O_DIRECT);
> +        ret = raw_pread_aligned(bs, offset, buf, count);
> +        fcntl(s->fd, F_SETFL, flags);
> +
> +        return ret;

if you store open_flag instead of flags, you can do:

if (unlikely((s->open_flags & O_DIRECT) && 
		(offset % 512 || (uintptr_t) buf % 512))) {

	fcntl(s->fd, F_SETFL, s->open_flags & ~O_DIRECT);
	ret = raw_pread_aligned(bs, offset, buf, count);
	fcntl(s->fd, F_SETFL, s->open_flags);
}

> +    } else {
> +        return raw_pread_aligned(bs, offset, buf, count);
> +    }
> +}
> +
> +/* 
> + * offset and count are in bytes and possibly not aligned. For files
> opened 
> + * with O_DIRECT, necessary alignments are ensured before calling 
> + * raw_pwrite_aligned to do the actual write.
> + */
> +static int raw_pwrite(BlockDriverState *bs, int64_t offset,
> +                      const uint8_t *buf, int count)
> +{
> +    BDRVRawState *s = bs->opaque;
> +
> +    if (unlikely((s->flags & BDRV_O_DIRECT) &&
> +            (offset % 512 != 0 || (uintptr_t) buf % 512))) {
> +
> +        int flags, ret;
> +
> +        // Temporarily disable O_DIRECT for unaligned access
> +        flags = fcntl(s->fd, F_GETFL);
> +        fcntl(s->fd, F_SETFL, flags & ~O_DIRECT);
> +        ret = raw_pwrite_aligned(bs, offset, buf, count);
> +        fcntl(s->fd, F_SETFL, flags);

ditto

> +        return ret;
> +    } else {
> +        return raw_pwrite_aligned(bs, offset, buf, count);
> +    }
> +}
> +
> +#else
> +#define raw_pread raw_pread_aligned
> +#define raw_pwrite raw_pwrite_aligned
> +#endif
> +
> +
>  /***********************************************************/
>  /* Unix AIO using POSIX AIO */
>  
> 
Regards,
Laurent
-- 
------------- Laurent.Vivier@bull.net ---------------
"The best way to predict the future is to invent it."
- Alan Kay

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-04-29 15:48       ` Laurent Vivier
@ 2008-04-29 16:21         ` Kevin Wolf
  2008-04-29 16:48           ` Laurent Vivier
  2008-04-30  0:05           ` Jamie Lokier
  0 siblings, 2 replies; 21+ messages in thread
From: Kevin Wolf @ 2008-04-29 16:21 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: qemu-devel

Laurent Vivier schrieb:
>> Disabling O_DIRECT for a single aio request is impossible (after all, 
>> aio is asynchronous), and disabling it for at least one aio request is 
> 
> Perhaps I'm wrong, but I think it is possible: the only consequence is
> the asynchronous I/O becomes synchronous...

Hm, yes. We could call raw_pread in raw_aio_read when O_DIRECT is used 
and the request is not properly aligned. Is this what you meant?

>> going to be ugly. So maybe we better turn O_DIRECT off for snapsnot 
>> saving/loading, even if it's not the generic fix I wanted to have when I 
>> started.
> 
> I don't think it is a good idea:
> 
> In linux world, there are three reasons to use O_DIRECT:
> 
> 1- to use linux AIO (not POSIX AIO).
> 
> 2- to avoid a buffer copy between user- and kernel- space
> (performance ?)
> 
> 3- to increase reliability: by using O_DIRECT you are sure your data are
> on the disk when the write is over and your system can now crash (if it
> wants).
> 
> And I think reliability is better when the snapshot is being saved...

I think we agree that it's mostly item 3 why one would use O_DIRECT with 
qemu. In terms of reliability, it is important that the data really is 
written to the disk when the guest OS thinks so. But when for example 
qemu crashes, I don't think it's too important if 40% or 50% of a 
snapshot have already been written - it's unusable anyway. A sync 
afterwards could be enough there.

>> I'm still undecided, though. What do you think?
> 
> Is it possible to align the last AIO ?

I have to admit that I neither know how to recognize the "last AIO" in 
the generic code nor do I understand what you want to achieve with it. 
The main problem are unaligned buffers and these occur on any request.

> And see comments below

Good suggestion, will change the patch accordingly.

Kevin

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-04-29 16:21         ` Kevin Wolf
@ 2008-04-29 16:48           ` Laurent Vivier
  2008-04-30  9:21             ` Kevin Wolf
  2008-04-30  0:05           ` Jamie Lokier
  1 sibling, 1 reply; 21+ messages in thread
From: Laurent Vivier @ 2008-04-29 16:48 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: qemu-devel


Le mardi 29 avril 2008 à 18:21 +0200, Kevin Wolf a écrit :
> Laurent Vivier schrieb:
> >> Disabling O_DIRECT for a single aio request is impossible (after all, 
> >> aio is asynchronous), and disabling it for at least one aio request is 
> > 
> > Perhaps I'm wrong, but I think it is possible: the only consequence is
> > the asynchronous I/O becomes synchronous...
> 
> Hm, yes. We could call raw_pread in raw_aio_read when O_DIRECT is used 
> and the request is not properly aligned. Is this what you meant?

No, it was just a (stupid) comment. I think we must not convert
asynchronous I/O to synchronous I/O.

> >> going to be ugly. So maybe we better turn O_DIRECT off for snapsnot 
> >> saving/loading, even if it's not the generic fix I wanted to have when I 
> >> started.
> > 
> > I don't think it is a good idea:
> > 
> > In linux world, there are three reasons to use O_DIRECT:
> > 
> > 1- to use linux AIO (not POSIX AIO).
> > 
> > 2- to avoid a buffer copy between user- and kernel- space
> > (performance ?)
> > 
> > 3- to increase reliability: by using O_DIRECT you are sure your data are
> > on the disk when the write is over and your system can now crash (if it
> > wants).
> > 
> > And I think reliability is better when the snapshot is being saved...
> 
> I think we agree that it's mostly item 3 why one would use O_DIRECT with 
> qemu. In terms of reliability, it is important that the data really is 
> written to the disk when the guest OS thinks so. But when for example 
> qemu crashes, I don't think it's too important if 40% or 50% of a 
> snapshot have already been written - it's unusable anyway. A sync 
> afterwards could be enough there.

I don't speak about "qemu crashes" but about "host crashes".

> 
> >> I'm still undecided, though. What do you think?
> > 
> > Is it possible to align the last AIO ?
> 
> I have to admit that I neither know how to recognize the "last AIO" in 
> the generic code nor do I understand what you want to achieve with it. 
> The main problem are unaligned buffers and these occur on any request.

I'm not in the spirit "my patch is better than yours" (and I don't think
so); but could you try to test my patch ? Because if I remember
correctly I think it manages all cases and this can help you to find a
solution (or perhaps you can add to your patch the part of my patch
about block-qcow2.c)

> > And see comments below
> 
> Good suggestion, will change the patch accordingly.
> 
> Kevin
> 

Regards,
Laurent
-- 
------------- Laurent.Vivier@bull.net ---------------
"The best way to predict the future is to invent it."
- Alan Kay

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-04-29 14:49     ` Kevin Wolf
  2008-04-29 15:48       ` Laurent Vivier
@ 2008-04-30  0:02       ` Jamie Lokier
  1 sibling, 0 replies; 21+ messages in thread
From: Jamie Lokier @ 2008-04-30  0:02 UTC (permalink / raw)
  To: qemu-devel; +Cc: Laurent Vivier

Kevin Wolf wrote:
> Disabling O_DIRECT for a single aio request is impossible (after all, 
> aio is asynchronous), and disabling it for at least one aio request is 
> going to be ugly.

You could try opening two fds to the same file/blockdev, one with
O_DIRECT set.  Then you can disable O_DIRECT as you like per aio
request, by choosing the fd.

I'm not sure if that works, though.  On some OSes, if a file has any
non-O_DIRECT open descriptor, all I/O is buffered ignoring the
O_DIRECT flag.  If both are allowed simultaneously, I'm not sure what
happens with cache-coherency between direct I/Os and buffered I/Os.

-- Jamie

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-04-29 16:21         ` Kevin Wolf
  2008-04-29 16:48           ` Laurent Vivier
@ 2008-04-30  0:05           ` Jamie Lokier
  1 sibling, 0 replies; 21+ messages in thread
From: Jamie Lokier @ 2008-04-30  0:05 UTC (permalink / raw)
  To: qemu-devel; +Cc: Laurent Vivier

Kevin Wolf wrote:
> I think we agree that it's mostly item 3 why one would use O_DIRECT with 
> qemu. In terms of reliability, it is important that the data really is 
> written to the disk when the guest OS thinks so. But when for example 
> qemu crashes, I don't think it's too important if 40% or 50% of a 
> snapshot have already been written - it's unusable anyway. A sync 
> afterwards could be enough there.

A call to fsync() after writing provides the same guarantee as
O_DIRECT, or stronger if fsync() does a hard disk cache flush, which
O_DIRECT does not.

-- Jamie

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-04-29 16:48           ` Laurent Vivier
@ 2008-04-30  9:21             ` Kevin Wolf
  2008-04-30  9:59               ` Laurent Vivier
  0 siblings, 1 reply; 21+ messages in thread
From: Kevin Wolf @ 2008-04-30  9:21 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: qemu-devel

[-- Attachment #1: Type: text/plain, Size: 2575 bytes --]

Laurent Vivier schrieb:
>> Hm, yes. We could call raw_pread in raw_aio_read when O_DIRECT is used 
>> and the request is not properly aligned. Is this what you meant?
> 
> No, it was just a (stupid) comment. I think we must not convert
> asynchronous I/O to synchronous I/O.

Why not? If I'm not mistaken (but if you think it's wrong, probably I 
am) the only possible drawback should be performance. And we're not 
talking about normal guest IO, these accesses are aligned anyway.

>> I think we agree that it's mostly item 3 why one would use O_DIRECT with 
>> qemu. In terms of reliability, it is important that the data really is 
>> written to the disk when the guest OS thinks so. But when for example 
>> qemu crashes, I don't think it's too important if 40% or 50% of a 
>> snapshot have already been written - it's unusable anyway. A sync 
>> afterwards could be enough there.
> 
> I don't speak about "qemu crashes" but about "host crashes".

Well, I've never seen a host crash where qemu survived. ;-)

What I wanted to say: If the snapshot is incomplete and not usable 
anyway, why bother if some bytes more or less have been written?

> I'm not in the spirit "my patch is better than yours" (and I don't think
> so); but could you try to test my patch ? Because if I remember
> correctly I think It manages all cases and this can help you to find a
> solution (or perhaps you can add to your patch the part of my patch
> about block-qcow2.c)

I really didn't want to say that your code is bad or it wouldn't work or 
something like that. I just tried it and it works fine as well.

But the approaches are quite different. Your patch makes every user of 
the block driver functions aware that O_DIRECT might be in effect. My 
patch tries to do things in one common place, even though possibly at 
the cost of performance (however, I'm not sure anymore about the bad 
performance now that I use your fcntl method).

So what I like about my patch is that it is one change in one place 
which should make everything work. Your patch could be still of use to 
speed things up, e.g. by making qcow2 aware that there is something like 
O_DIRECT (would have to do a real benchmark with both patches) and have 
it align its buffers in the first place.

I'll attach the current version of my patch which emulates AIO through 
synchronous requests for unaligned buffer. In comparison, with your 
patch the bootup of my test VM was slightly faster but loading/saving of 
snapshots was much faster with mine. Perhaps I'll try to combine them to 
get the best of both.

Kevin

[-- Attachment #2: align-odirect-accesses.patch --]
[-- Type: text/x-patch, Size: 5081 bytes --]

Index: block-raw-posix.c
===================================================================
--- block-raw-posix.c.orig
+++ block-raw-posix.c
@@ -77,6 +77,7 @@
 typedef struct BDRVRawState {
     int fd;
     int type;
+    int flags;
     unsigned int lseek_err_cnt;
 #if defined(__linux__)
     /* linux floppy specific */
@@ -111,6 +112,7 @@ static int raw_open(BlockDriverState *bs
         open_flags |= O_DIRECT;
 #endif
 
+    s->flags = open_flags;
     s->type = FTYPE_FILE;
 
     fd = open(filename, open_flags, 0644);
@@ -141,7 +143,14 @@ static int raw_open(BlockDriverState *bs
 #endif
 */
 
-static int raw_pread(BlockDriverState *bs, int64_t offset,
+/* 
+ * offset and count are in bytes, but must be multiples of 512 for files 
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ */
+static int raw_pread_aligned(BlockDriverState *bs, int64_t offset,
                      uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
@@ -194,7 +203,14 @@ label__raw_read__success:
     return ret;
 }
 
-static int raw_pwrite(BlockDriverState *bs, int64_t offset,
+/* 
+ * offset and count are in bytes, but must be multiples of 512 for files 
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ */
+static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset,
                       const uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
@@ -230,6 +246,67 @@ label__raw_write__success:
     return ret;
 }
 
+
+#ifdef O_DIRECT
+/* 
+ * offset and count are in bytes and possibly not aligned. For files opened 
+ * with O_DIRECT, necessary alignments are ensured before calling 
+ * raw_pread_aligned to do the actual read.
+ */
+static int raw_pread(BlockDriverState *bs, int64_t offset,
+                     uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (unlikely((s->flags & O_DIRECT) &&
+            (offset % 512 || count % 512 || (uintptr_t) buf % 512))) {
+
+        int ret;
+
+        // Temporarily disable O_DIRECT for unaligned access
+        fcntl(s->fd, F_SETFL, s->flags & ~O_DIRECT);
+        ret = raw_pread_aligned(bs, offset, buf, count);
+        fcntl(s->fd, F_SETFL, s->flags);
+
+        return ret;
+
+    } else {
+        return raw_pread_aligned(bs, offset, buf, count);
+    }
+}
+
+/* 
+ * offset and count are in bytes and possibly not aligned. For files opened 
+ * with O_DIRECT, necessary alignments are ensured before calling 
+ * raw_pwrite_aligned to do the actual write.
+ */
+static int raw_pwrite(BlockDriverState *bs, int64_t offset,
+                      const uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (unlikely((s->flags & O_DIRECT) &&
+            (offset % 512 || count % 512 || (uintptr_t) buf % 512))) {
+
+        int ret;
+
+        // Temporarily disable O_DIRECT for unaligned access
+        fcntl(s->fd, F_SETFL, s->flags & ~O_DIRECT);
+        ret = raw_pwrite_aligned(bs, offset, buf, count);
+        fcntl(s->fd, F_SETFL, s->flags);
+
+        return ret;
+    } else {
+        return raw_pwrite_aligned(bs, offset, buf, count);
+    }
+}
+
+#else
+#define raw_pread raw_pread_aligned
+#define raw_pwrite raw_pwrite_aligned
+#endif
+
+
 /***********************************************************/
 /* Unix AIO using POSIX AIO */
 
@@ -402,10 +479,26 @@ static BlockDriverAIOCB *raw_aio_read(Bl
         BlockDriverCompletionFunc *cb, void *opaque)
 {
     RawAIOCB *acb;
+    BDRVRawState *s = bs->opaque;
+
+    /* 
+     * If O_DIRECT is used and the buffer is not aligned fall back
+     * to synchronous IO.
+     */
+    if (unlikely((s->flags & O_DIRECT) && ((uintptr_t) buf % 512))) {
+        int ret;
+
+        acb = qemu_aio_get(bs, cb, opaque);
+        ret = raw_pread(bs, 512 * sector_num, buf, 512 * nb_sectors);
+        acb->common.cb(acb->common.opaque, ret);
+        qemu_aio_release(acb);
+        return &acb->common;
+    }
 
     acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
     if (!acb)
         return NULL;
+
     if (aio_read(&acb->aiocb) < 0) {
         qemu_aio_release(acb);
         return NULL;
@@ -418,6 +511,21 @@ static BlockDriverAIOCB *raw_aio_write(B
         BlockDriverCompletionFunc *cb, void *opaque)
 {
     RawAIOCB *acb;
+    BDRVRawState *s = bs->opaque;
+
+    /* 
+     * If O_DIRECT is used and the buffer is not aligned fall back
+     * to synchronous IO.
+     */
+    if (unlikely((s->flags & O_DIRECT) && ((uintptr_t) buf % 512))) {
+        int ret;
+
+        acb = qemu_aio_get(bs, cb, opaque);
+        ret = raw_pwrite(bs, 512 * sector_num, buf, 512 * nb_sectors);
+        acb->common.cb(acb->common.opaque, ret);
+        qemu_aio_release(acb);
+        return &acb->common;
+    }
 
     acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque);
     if (!acb)

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-04-30  9:21             ` Kevin Wolf
@ 2008-04-30  9:59               ` Laurent Vivier
  2008-04-30 12:08                 ` Kevin Wolf
  0 siblings, 1 reply; 21+ messages in thread
From: Laurent Vivier @ 2008-04-30  9:59 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: qemu-devel


Le mercredi 30 avril 2008 à 11:21 +0200, Kevin Wolf a écrit :
> Laurent Vivier schrieb:
> >> Hm, yes. We could call raw_pread in raw_aio_read when O_DIRECT is used 
> >> and the request is not properly aligned. Is this what you meant?
> > 
> > No, it was just a (stupid) comment. I think we must not convert
> > asynchronous I/O to synchronous I/O.
> 
> Why not? If I'm not mistaken (but if you think it's wrong, probably I 
> am) the only possible drawback should be performance. And we're not 
> talking about normal guest IO, these accesses are aligned anyway.

I think there can be side effects if the caller of the function thinks
the I/O is asynchronous and it is in fact synchronous (but perhaps we
can call that a "bug"...).

> >> I think we agree that it's mostly item 3 why one would use O_DIRECT with 
> >> qemu. In terms of reliability, it is important that the data really is 
> >> written to the disk when the guest OS thinks so. But when for example 
> >> qemu crashes, I don't think it's too important if 40% or 50% of a 
> >> snapshot have already been written - it's unusable anyway. A sync 
> >> afterwards could be enough there.
> > 
> > I don't speak about "qemu crashes" but about "host crashes".
> 
> Well, I've never seen a host crash where qemu survived. ;-)
> 
> What I wanted to say: If the snapshot is incomplete and not usable 
> anyway, why bother if some bytes more or less have been written?

What I mean is that with O_DIRECT we increase reliability because we are
sure the data is on disk when qemu has finished saving the snapshot;
without O_DIRECT qemu may have finished saving, but the snapshot is not
really on the disk yet, only in the host cache. And if the host crashes
between these two points, it's 'bad' (the user thinks the snapshot has
been saved but in fact it is not). But perhaps it's just a detail...

> > I'm not in the spirit "my patch is better than yours" (and I don't think
> > so); but could you try to test my patch ? Because if I remember
> > correctly I think It manages all cases and this can help you to find a
> > solution (or perhaps you can add to your patch the part of my patch
> > about block-qcow2.c)
> 
> I really didn't want to say that your code is bad or it wouldn't work or 
> something like that. I just tried it and it works fine as well.
> 
> But the approaches are quite different. Your patch makes every user of 
> the block driver functions aware that O_DIRECT might be in effect. My 
> patch tries to do things in one common place, even though possibly at 
> the cost of performance (however, I'm not sure anymore about the bad 
> performance now that I use your fcntl method).
> 
> So what I like about my patch is that it is one change in one place 
> which should make everything work. Your patch could be still of use to 
> speed things up, e.g. by making qcow2 aware that there is something like 
> O_DIRECT (would have to do a real benchmark with both patches) and have 
> it align its buffers in the first place.

OK, I agree with you.

> I'll attach the current version of my patch which emulates AIO through 
> synchronous requests for unaligned buffer. In comparison, with your 
> patch the bootup of my test VM was slightly faster but loading/saving of 
> snapshots was much faster with mine. Perhaps I'll try to combine them to 
> get the best of both.

just a comment on the patch: perhaps you can call your field
"open_flags" instead of "flags", and perhaps you can merge your field
with "fd_open_flags" ?

Some comments from Qemu maintainers ?
Is there someone ready to commit it in SVN ?

I personally think this functionality must be included...

Regards,
Laurent
-- 
------------- Laurent.Vivier@bull.net ---------------
"The best way to predict the future is to invent it."
- Alan Kay

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-04-30  9:59               ` Laurent Vivier
@ 2008-04-30 12:08                 ` Kevin Wolf
  2008-04-30 14:30                   ` Blue Swirl
  0 siblings, 1 reply; 21+ messages in thread
From: Kevin Wolf @ 2008-04-30 12:08 UTC (permalink / raw)
  To: Laurent Vivier; +Cc: qemu-devel

[-- Attachment #1: Type: text/plain, Size: 210 bytes --]

Laurent Vivier schrieb:
> just a comment on the patch: perhaps you can call your field
> "open_flags" instead of "flags", and perhaps you can merge your field
> with "fd_open_flags" ?

Here you are. ;-)

Kevin

[-- Attachment #2: align-odirect-accesses.patch --]
[-- Type: text/x-patch, Size: 6358 bytes --]

Index: block-raw-posix.c
===================================================================
--- block-raw-posix.c.orig
+++ block-raw-posix.c
@@ -77,10 +77,10 @@
 typedef struct BDRVRawState {
     int fd;
     int type;
+    int open_flags;
     unsigned int lseek_err_cnt;
 #if defined(__linux__)
     /* linux floppy specific */
-    int fd_open_flags;
     int64_t fd_open_time;
     int64_t fd_error_time;
     int fd_got_error;
@@ -111,6 +111,7 @@ static int raw_open(BlockDriverState *bs
         open_flags |= O_DIRECT;
 #endif
 
+    s->open_flags = open_flags;
     s->type = FTYPE_FILE;
 
     fd = open(filename, open_flags, 0644);
@@ -141,7 +142,14 @@ static int raw_open(BlockDriverState *bs
 #endif
 */
 
-static int raw_pread(BlockDriverState *bs, int64_t offset,
+/* 
+ * offset and count are in bytes, but must be multiples of 512 for files 
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ */
+static int raw_pread_aligned(BlockDriverState *bs, int64_t offset,
                      uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
@@ -194,7 +202,14 @@ label__raw_read__success:
     return ret;
 }
 
-static int raw_pwrite(BlockDriverState *bs, int64_t offset,
+/* 
+ * offset and count are in bytes, but must be multiples of 512 for files 
+ * opened with O_DIRECT. buf must be aligned to 512 bytes then.
+ *
+ * This function may be called without alignment if the caller ensures
+ * that O_DIRECT is not in effect.
+ */
+static int raw_pwrite_aligned(BlockDriverState *bs, int64_t offset,
                       const uint8_t *buf, int count)
 {
     BDRVRawState *s = bs->opaque;
@@ -230,6 +245,67 @@ label__raw_write__success:
     return ret;
 }
 
+
+#ifdef O_DIRECT
+/* 
+ * offset and count are in bytes and possibly not aligned. For files opened 
+ * with O_DIRECT, necessary alignments are ensured before calling 
+ * raw_pread_aligned to do the actual read.
+ */
+static int raw_pread(BlockDriverState *bs, int64_t offset,
+                     uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (unlikely((s->open_flags & O_DIRECT) &&
+            (offset % 512 || count % 512 || (uintptr_t) buf % 512))) {
+
+        int ret;
+
+        // Temporarily disable O_DIRECT for unaligned access
+        fcntl(s->fd, F_SETFL, s->open_flags & ~O_DIRECT);
+        ret = raw_pread_aligned(bs, offset, buf, count);
+        fcntl(s->fd, F_SETFL, s->open_flags);
+
+        return ret;
+
+    } else {
+        return raw_pread_aligned(bs, offset, buf, count);
+    }
+}
+
+/* 
+ * offset and count are in bytes and possibly not aligned. For files opened 
+ * with O_DIRECT, necessary alignments are ensured before calling 
+ * raw_pwrite_aligned to do the actual write.
+ */
+static int raw_pwrite(BlockDriverState *bs, int64_t offset,
+                      const uint8_t *buf, int count)
+{
+    BDRVRawState *s = bs->opaque;
+
+    if (unlikely((s->open_flags & O_DIRECT) &&
+            (offset % 512 || count % 512 || (uintptr_t) buf % 512))) {
+
+        int ret;
+
+        // Temporarily disable O_DIRECT for unaligned access
+        fcntl(s->fd, F_SETFL, s->open_flags & ~O_DIRECT);
+        ret = raw_pwrite_aligned(bs, offset, buf, count);
+        fcntl(s->fd, F_SETFL, s->open_flags);
+
+        return ret;
+    } else {
+        return raw_pwrite_aligned(bs, offset, buf, count);
+    }
+}
+
+#else
+#define raw_pread raw_pread_aligned
+#define raw_pwrite raw_pwrite_aligned
+#endif
+
+
 /***********************************************************/
 /* Unix AIO using POSIX AIO */
 
@@ -402,10 +478,26 @@ static BlockDriverAIOCB *raw_aio_read(Bl
         BlockDriverCompletionFunc *cb, void *opaque)
 {
     RawAIOCB *acb;
+    BDRVRawState *s = bs->opaque;
+
+    /* 
+     * If O_DIRECT is used and the buffer is not aligned fall back
+     * to synchronous IO.
+     */
+    if (unlikely((s->open_flags & O_DIRECT) && ((uintptr_t) buf % 512))) {
+        int ret;
+
+        acb = qemu_aio_get(bs, cb, opaque);
+        ret = raw_pread(bs, 512 * sector_num, buf, 512 * nb_sectors);
+        acb->common.cb(acb->common.opaque, ret);
+        qemu_aio_release(acb);
+        return &acb->common;
+    }
 
     acb = raw_aio_setup(bs, sector_num, buf, nb_sectors, cb, opaque);
     if (!acb)
         return NULL;
+
     if (aio_read(&acb->aiocb) < 0) {
         qemu_aio_release(acb);
         return NULL;
@@ -418,6 +510,21 @@ static BlockDriverAIOCB *raw_aio_write(B
         BlockDriverCompletionFunc *cb, void *opaque)
 {
     RawAIOCB *acb;
+    BDRVRawState *s = bs->opaque;
+
+    /* 
+     * If O_DIRECT is used and the buffer is not aligned fall back
+     * to synchronous IO.
+     */
+    if (unlikely((s->open_flags & O_DIRECT) && ((uintptr_t) buf % 512))) {
+        int ret;
+
+        acb = qemu_aio_get(bs, cb, opaque);
+        ret = raw_pwrite(bs, 512 * sector_num, buf, 512 * nb_sectors);
+        acb->common.cb(acb->common.opaque, ret);
+        qemu_aio_release(acb);
+        return &acb->common;
+    }
 
     acb = raw_aio_setup(bs, sector_num, (uint8_t*)buf, nb_sectors, cb, opaque);
     if (!acb)
@@ -679,7 +786,7 @@ static int hdev_open(BlockDriverState *b
         s->type = FTYPE_CD;
     } else if (strstart(filename, "/dev/fd", NULL)) {
         s->type = FTYPE_FD;
-        s->fd_open_flags = open_flags;
+        s->open_flags = open_flags;
         /* open will not fail even if no floppy is inserted */
         open_flags |= O_NONBLOCK;
     } else if (strstart(filename, "/dev/sg", NULL)) {
@@ -734,7 +841,7 @@ static int fd_open(BlockDriverState *bs)
 #endif
             return -EIO;
         }
-        s->fd = open(bs->filename, s->fd_open_flags);
+        s->fd = open(bs->filename, s->open_flags);
         if (s->fd < 0) {
             s->fd_error_time = qemu_get_clock(rt_clock);
             s->fd_got_error = 1;
@@ -831,7 +938,7 @@ static int raw_eject(BlockDriverState *b
                 close(s->fd);
                 s->fd = -1;
             }
-            fd = open(bs->filename, s->fd_open_flags | O_NONBLOCK);
+            fd = open(bs->filename, s->open_flags | O_NONBLOCK);
             if (fd >= 0) {
                 if (ioctl(fd, FDEJECT, 0) < 0)
                     perror("FDEJECT");

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-04-30 12:08                 ` Kevin Wolf
@ 2008-04-30 14:30                   ` Blue Swirl
  2008-04-30 21:05                     ` Kevin Wolf
  0 siblings, 1 reply; 21+ messages in thread
From: Blue Swirl @ 2008-04-30 14:30 UTC (permalink / raw)
  To: qemu-devel; +Cc: Laurent Vivier

On 4/30/08, Kevin Wolf <kwolf@suse.de> wrote:
> Laurent Vivier schrieb:
>
> > just a comment on the patch: perhaps you can call your field
> > "open_flags" instead of "flags", and perhaps you can merge your field
> > with "fd_open_flags" ?
> >
>
>  Here you are. ;-)

Maybe the alignment could be handled like AIO and synchronous IO
emulation layers are added in bdrv_register, but at open stage?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-04-30 14:30                   ` Blue Swirl
@ 2008-04-30 21:05                     ` Kevin Wolf
  2008-05-01 14:35                       ` Blue Swirl
  0 siblings, 1 reply; 21+ messages in thread
From: Kevin Wolf @ 2008-04-30 21:05 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier

Am Mittwoch, 30. April 2008 16:30:27 schrieb Blue Swirl:
> Maybe the alignment could be handled like AIO and synchronous IO
> emulation layers are added in bdrv_register, but at open stage?

You mean to preserve the original pread if the file is opened without O_DIRECT 
and replace it by the emulation function only if O_DIRECT is really used? 
Certainly possible in some way (having a function pointer in BDRVRawState), 
but this won't save us anything.

But maybe I'm just missing your point. How exactly do you want to handle 
things and what does it improve?

Kevin

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-04-30 21:05                     ` Kevin Wolf
@ 2008-05-01 14:35                       ` Blue Swirl
  2008-05-01 17:55                         ` Kevin Wolf
  0 siblings, 1 reply; 21+ messages in thread
From: Blue Swirl @ 2008-05-01 14:35 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: Laurent Vivier, qemu-devel

On 5/1/08, Kevin Wolf <mail@kevin-wolf.de> wrote:
> Am Mittwoch, 30. April 2008 16:30:27 schrieb Blue Swirl:
>
> > Maybe the alignment could be handled like AIO and synchronous IO
>  > emulation layers are added in bdrv_register, but at open stage?
>
>
> You mean to preserve the original pread if the file is opened without O_DIRECT
>  and replace it by the emulation function only if O_DIRECT is really used?

Right.

>  Certainly possible in some way (having a function pointer in BDRVRawState),
>  but this won't save us anything.
>
>  But maybe I'm just missing your point. How exactly do you want to handle
>  things and what does it improve?

Maybe it's slightly faster that way and it would be closer to how
other block emulations are handled. It's just an idea.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-05-01 14:35                       ` Blue Swirl
@ 2008-05-01 17:55                         ` Kevin Wolf
  2008-05-06  8:44                           ` Kevin Wolf
  0 siblings, 1 reply; 21+ messages in thread
From: Kevin Wolf @ 2008-05-01 17:55 UTC (permalink / raw)
  To: Blue Swirl; +Cc: Laurent Vivier, qemu-devel


Am Donnerstag, 1. Mai 2008 16:35:51 schrieb Blue Swirl:
> On 5/1/08, Kevin Wolf <mail@kevin-wolf.de> wrote:
> > Am Mittwoch, 30. April 2008 16:30:27 schrieb Blue Swirl:
> > > Maybe the alignment could be handled like AIO and synchronous IO
> > >
> >  > emulation layers are added in bdrv_register, but at open stage?
> >
> > You mean to preserve the original pread if the file is opened without
> > O_DIRECT and replace it by the emulation function only if O_DIRECT is
> > really used?
>
> Right.
>
> Maybe it's slightly faster that way and it would be closer to how
> other block emulations are handled. It's just an idea.

Maybe I'm missing something but AFAIK this pread pointer exists once for each 
block driver, i.e. every raw image uses the original pread or every raw image 
uses the emulating one. The difference between the O_DIRECT case and the 
AIO/sync emulation is that AIO/sync is the same for all devices of one driver 
while O_DIRECT can differ between images of the same driver.

So you would need to have one common pread which in turn calls a function 
pointer stored in the BlockDriverState. I doubt that this is much cheaper 
than an if in pread. And it wouldn't get too close to other emulations anyway 
because of the driver/device difference.

Kevin

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-05-01 17:55                         ` Kevin Wolf
@ 2008-05-06  8:44                           ` Kevin Wolf
  2008-05-06  9:02                             ` Laurent Vivier
  2008-05-06 16:42                             ` Blue Swirl
  0 siblings, 2 replies; 21+ messages in thread
From: Kevin Wolf @ 2008-05-06  8:44 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier

Kevin Wolf schrieb:
> Am Donnerstag, 1. Mai 2008 16:35:51 schrieb Blue Swirl:
>> On 5/1/08, Kevin Wolf <mail@kevin-wolf.de> wrote:
>>> Am Mittwoch, 30. April 2008 16:30:27 schrieb Blue Swirl:
>>>> Maybe the alignment could be handled like AIO and synchronous IO
>>>>
>>>  > emulation layers are added in bdrv_register, but at open stage?
>>>
>>> You mean to preserve the original pread if the file is opened without
>>> O_DIRECT and replace it by the emulation function only if O_DIRECT is
>>> really used?
>> Right.
>>
>> Maybe it's slightly faster that way and it would be closer to how
>> other block emulations are handled. It's just an idea.
> 
> Maybe I'm missing something but AFAIK this pread pointer exists once for each 
> block driver, i.e. every raw image uses the original pread or every raw image 
> uses the emulating one. The difference between the O_DIRECT case and the 
> AIO/sync emulation is that AIO/sync is the same for all devices of one driver 
> while O_DIRECT can differ between images of the same driver.
> 
> So you would need to have one common pread which in turn calls a function 
> pointer stored in the BlockDriverState. I doubt that this is much cheaper 
> than an if in pread. And it wouldn't get too close to other emulations anyway 
> because of the driver/device difference.

Should I change the patch now (even if I think it doesn't help anything) 
or will you apply the patch as it is? It is quite frustrating to get no 
answer at all.

Kevin

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-05-06  8:44                           ` Kevin Wolf
@ 2008-05-06  9:02                             ` Laurent Vivier
  2008-05-06 16:42                             ` Blue Swirl
  1 sibling, 0 replies; 21+ messages in thread
From: Laurent Vivier @ 2008-05-06  9:02 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: Blue Swirl, qemu-devel

Le mardi 06 mai 2008 à 10:44 +0200, Kevin Wolf a écrit :
> Kevin Wolf schrieb:
> > Am Donnerstag, 1. Mai 2008 16:35:51 schrieb Blue Swirl:
> >> On 5/1/08, Kevin Wolf <mail@kevin-wolf.de> wrote:
> >>> Am Mittwoch, 30. April 2008 16:30:27 schrieb Blue Swirl:
> >>>> Maybe the alignment could be handled like AIO and synchronous IO
> >>>>
> >>>  > emulation layers are added in bdrv_register, but at open stage?
> >>>
> >>> You mean to preserve the original pread if the file is opened without
> >>> O_DIRECT and replace it by the emulation function only if O_DIRECT is
> >>> really used?
> >> Right.
> >>
> >> Maybe it's slightly faster that way and it would be closer to how
> >> other block emulations are handled. It's just an idea.
> > 
> > Maybe I'm missing something but AFAIK this pread pointer exists once for each 
> > block driver, i.e. every raw image uses the original pread or every raw image 
> > uses the emulating one. The difference between the O_DIRECT case and the 
> > AIO/sync emulation is that AIO/sync is the same for all devices of one driver 
> > while O_DIRECT can differ between images of the same driver.
> > 
> > So you would need to have one common pread which in turn calls a function 
> > pointer stored in the BlockDriverState. I doubt that this is much cheaper 
> > than an if in pread. And it wouldn't get too close to other emulations anyway 
> > because of the driver/device difference.
> 
> Should I change the patch now (even if I think it doesn't help anything) 
> or will you apply the patch as it is? It is quite frustrating to get no 
> answer at all.

Well, what is missing from Qemu is a MAINTAINERS list...

Fabrice, did you at some point define who can commit code to the
repository, and for which parts? Perhaps you could write this down in a
"MAINTAINERS" file?
Then we could CC: the maintainer when we have code to submit.

As I'm relatively new here, perhaps such a thing already exists or has been defined...

Regards, 
Laurent

-- 
------------- Laurent.Vivier@bull.net ---------------
"The best way to predict the future is to invent it."
- Alan Kay

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-05-06  8:44                           ` Kevin Wolf
  2008-05-06  9:02                             ` Laurent Vivier
@ 2008-05-06 16:42                             ` Blue Swirl
  2008-05-06 16:56                               ` Kevin Wolf
  1 sibling, 1 reply; 21+ messages in thread
From: Blue Swirl @ 2008-05-06 16:42 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: Laurent Vivier, qemu-devel

On 5/6/08, Kevin Wolf <kwolf@suse.de> wrote:
> Kevin Wolf schrieb:
>
>
> > Am Donnerstag, 1. Mai 2008 16:35:51 schrieb Blue Swirl:
> >
> > > On 5/1/08, Kevin Wolf <mail@kevin-wolf.de> wrote:
> > >
> > > > Am Mittwoch, 30. April 2008 16:30:27 schrieb Blue Swirl:
> > > >
> > > > > Maybe the alignment could be handled like AIO and synchronous IO
> > > > >
> > > > >
> > > >  > emulation layers are added in bdrv_register, but at open stage?
> > > >
> > > > You mean to preserve the original pread if the file is opened without
> > > > O_DIRECT and replace it by the emulation function only if O_DIRECT is
> > > > really used?
> > > >
> > > Right.
> > >
> > > Maybe it's slightly faster that way and it would be closer to how
> > > other block emulations are handled. It's just an idea.
> > >
> >
> > Maybe I'm missing something but AFAIK this pread pointer exists once for
> each block driver, i.e. every raw image uses the original pread or every raw
> image uses the emulating one. The difference between the O_DIRECT case and
> the AIO/sync emulation is that AIO/sync is the same for all devices of one
> driver while O_DIRECT can differ between images of the same driver.
> >
> > So you would need to have one common pread which in turn calls a function
> pointer stored in the BlockDriverState. I doubt that this is much cheaper
> than an if in pread. And it wouldn't get too close to other emulations
> anyway because of the driver/device difference.
> >
>
>  Should I change the patch now (even if I think it doesn't help anything) or
> will you apply the patch as it is? It is quite frustrating to get no answer
> at all.

Well, the patch looks OK. But I try to test patches before I commit
them, and in this case I don't know how. Could you give some example
test cases?

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-05-06 16:42                             ` Blue Swirl
@ 2008-05-06 16:56                               ` Kevin Wolf
  2008-05-06 17:23                                 ` Blue Swirl
  0 siblings, 1 reply; 21+ messages in thread
From: Kevin Wolf @ 2008-05-06 16:56 UTC (permalink / raw)
  To: qemu-devel; +Cc: Blue Swirl, Laurent Vivier

Blue Swirl schrieb:
> Well, the patch looks OK. But I try to test the patches before I
> commit and for this I don't know how. Could you give some example test
> cases?

If you want to test that the patch fixes what it promises you only have 
to create a qcow or qcow2 image and start qemu with cache=off for this 
image. This will fail without the patch, and it should work with the 
patch applied. This covers the simple case. For the aio part of the 
patch you can try saving/loading of snapshots. Normal qcow2 operation 
doesn't trigger the aio code path.

Kevin

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT)
  2008-05-06 16:56                               ` Kevin Wolf
@ 2008-05-06 17:23                                 ` Blue Swirl
  0 siblings, 0 replies; 21+ messages in thread
From: Blue Swirl @ 2008-05-06 17:23 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: Laurent Vivier, qemu-devel

On 5/6/08, Kevin Wolf <kwolf@suse.de> wrote:
> Blue Swirl schrieb:
>
> > Well, the patch looks OK. But I try to test the patches before I
> > commit and for this I don't know how. Could you give some example test
> > cases?
> >
>
>  If you want to test that the patch fixes what it promises you only have to
> create a qcow or qcow2 image and start qemu with cache=off for this image.
> This will fail without the patch, and it should work with the patch applied.
> This covers the simple case. For the aio part of the patch you can try
> saving/loading of snapshots. Normal qcow2 operation doesn't trigger the aio
> code path.

Thanks, it seems to work.

^ permalink raw reply	[flat|nested] 21+ messages in thread

end of thread, other threads:[~2008-05-06 17:23 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-04-17 13:31 [Qemu-devel] [PATCH] Align file accesses with cache=off (O_DIRECT) Kevin Wolf
2008-04-28 15:34 ` Kevin Wolf
2008-04-29  9:01   ` Laurent Vivier
2008-04-29 14:49     ` Kevin Wolf
2008-04-29 15:48       ` Laurent Vivier
2008-04-29 16:21         ` Kevin Wolf
2008-04-29 16:48           ` Laurent Vivier
2008-04-30  9:21             ` Kevin Wolf
2008-04-30  9:59               ` Laurent Vivier
2008-04-30 12:08                 ` Kevin Wolf
2008-04-30 14:30                   ` Blue Swirl
2008-04-30 21:05                     ` Kevin Wolf
2008-05-01 14:35                       ` Blue Swirl
2008-05-01 17:55                         ` Kevin Wolf
2008-05-06  8:44                           ` Kevin Wolf
2008-05-06  9:02                             ` Laurent Vivier
2008-05-06 16:42                             ` Blue Swirl
2008-05-06 16:56                               ` Kevin Wolf
2008-05-06 17:23                                 ` Blue Swirl
2008-04-30  0:05           ` Jamie Lokier
2008-04-30  0:02       ` Jamie Lokier

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).