[Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

* [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
@ 2011-10-21 17:08 Kevin Wolf
  2011-10-21 17:08 ` [Qemu-devel] [PATCH 1/2] raw-posix: Convert to bdrv_co_flush Kevin Wolf
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Kevin Wolf @ 2011-10-21 17:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, avi

Avi complained that not even writing out qcow2's cache on bdrv_flush() made
cache=unsafe too unsafe to be useful. He's got a point.

Kevin Wolf (2):
  raw-posix: Convert to bdrv_co_flush
  block: Handle cache=unsafe only in raw-posix/win32

 block.c           |    4 +--
 block/raw-posix.c |   57 ++++++++++++++++++++++++++++++++++++++++------------
 block/raw-win32.c |    4 +++
 3 files changed, 49 insertions(+), 16 deletions(-)

-- 
1.7.6.4

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [Qemu-devel] [PATCH 1/2] raw-posix: Convert to bdrv_co_flush
  2011-10-21 17:08 [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe Kevin Wolf
@ 2011-10-21 17:08 ` Kevin Wolf
  2011-10-21 17:08 ` [Qemu-devel] [PATCH 2/2] block: Handle cache=unsafe only in raw-posix/win32 Kevin Wolf
  2011-10-21 18:44 ` [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe Paolo Bonzini
  2 siblings, 0 replies; 18+ messages in thread
From: Kevin Wolf @ 2011-10-21 17:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, avi

The next patch will introduce an early return. Using a bottom half to invoke
the AIO callback wouldn't be much less code, so let's go with the native
block layer interface.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/raw-posix.c |   53 ++++++++++++++++++++++++++++++++++++++++-------------
 1 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/block/raw-posix.c b/block/raw-posix.c
index a3de373..dcae88a 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -357,15 +357,42 @@ static BlockDriverAIOCB *raw_aio_writev(BlockDriverState *bs,
                           cb, opaque, QEMU_AIO_WRITE);
 }
 
-static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
-        BlockDriverCompletionFunc *cb, void *opaque)
+typedef struct CoroutineIOCompletion {
+    Coroutine *coroutine;
+    int ret;
+} CoroutineIOCompletion;
+
+static void raw_aio_flush_cb(void *opaque, int ret)
+{
+    CoroutineIOCompletion *co = opaque;
+
+    co->ret = ret;
+    qemu_coroutine_enter(co->coroutine, NULL);
+}
+
+static int raw_co_flush(BlockDriverState *bs)
 {
     BDRVRawState *s = bs->opaque;
+    BlockDriverAIOCB *acb;
+    int ret;
 
-    if (fd_open(bs) < 0)
-        return NULL;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
 
-    return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH);
+    ret = fd_open(bs);
+    if (ret < 0) {
+        return ret;
+    }
+
+    acb = paio_submit(bs, s->fd, 0, NULL, 0, raw_aio_flush_cb, &co, QEMU_AIO_FLUSH);
+    if (acb == NULL) {
+        return -EIO;
+    }
+
+    qemu_coroutine_yield();
+
+    return co.ret;
 }
 
 static void raw_close(BlockDriverState *bs)
@@ -635,9 +662,9 @@ static BlockDriver bdrv_file = {
     .bdrv_create = raw_create,
     .bdrv_co_discard = raw_co_discard,
 
-    .bdrv_aio_readv = raw_aio_readv,
+    .bdrv_aio_readv  = raw_aio_readv,
     .bdrv_aio_writev = raw_aio_writev,
-    .bdrv_aio_flush = raw_aio_flush,
+    .bdrv_co_flush   = raw_co_flush,
 
     .bdrv_truncate = raw_truncate,
     .bdrv_getlength = raw_getlength,
@@ -903,9 +930,9 @@ static BlockDriver bdrv_host_device = {
     .create_options     = raw_create_options,
     .bdrv_has_zero_init = hdev_has_zero_init,
 
-    .bdrv_aio_readv	= raw_aio_readv,
-    .bdrv_aio_writev	= raw_aio_writev,
-    .bdrv_aio_flush	= raw_aio_flush,
+    .bdrv_aio_readv     = raw_aio_readv,
+    .bdrv_aio_writev    = raw_aio_writev,
+    .bdrv_co_flush      = raw_co_flush,
 
     .bdrv_truncate      = raw_truncate,
     .bdrv_getlength	= raw_getlength,
@@ -1024,7 +1051,7 @@ static BlockDriver bdrv_host_floppy = {
 
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
-    .bdrv_aio_flush	= raw_aio_flush,
+    .bdrv_co_flush      = raw_co_flush,
 
     .bdrv_truncate      = raw_truncate,
     .bdrv_getlength	= raw_getlength,
@@ -1123,7 +1150,7 @@ static BlockDriver bdrv_host_cdrom = {
 
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
-    .bdrv_aio_flush	= raw_aio_flush,
+    .bdrv_co_flush      = raw_co_flush,
 
     .bdrv_truncate      = raw_truncate,
     .bdrv_getlength     = raw_getlength,
@@ -1242,7 +1269,7 @@ static BlockDriver bdrv_host_cdrom = {
 
     .bdrv_aio_readv     = raw_aio_readv,
     .bdrv_aio_writev    = raw_aio_writev,
-    .bdrv_aio_flush	= raw_aio_flush,
+    .bdrv_co_flush      = raw_co_flush,
 
     .bdrv_truncate      = raw_truncate,
     .bdrv_getlength     = raw_getlength,
-- 
1.7.6.4

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Qemu-devel] [PATCH 2/2] block: Handle cache=unsafe only in raw-posix/win32
  2011-10-21 17:08 [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe Kevin Wolf
  2011-10-21 17:08 ` [Qemu-devel] [PATCH 1/2] raw-posix: Convert to bdrv_co_flush Kevin Wolf
@ 2011-10-21 17:08 ` Kevin Wolf
  2011-10-21 18:44 ` [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe Paolo Bonzini
  2 siblings, 0 replies; 18+ messages in thread
From: Kevin Wolf @ 2011-10-21 17:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: kwolf, avi

The expected meaning of cache=unsafe with qcow2 is that on a flush the metadata
caches are written out, but no fsync is performed.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c           |    4 +---
 block/raw-posix.c |    4 ++++
 block/raw-win32.c |    4 ++++
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/block.c b/block.c
index 11c7f91..0b7bc06 100644
--- a/block.c
+++ b/block.c
@@ -2908,9 +2908,7 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
 
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
-    if (bs->open_flags & BDRV_O_NO_FLUSH) {
-        return 0;
-    } else if (!bs->drv) {
+    if (!bs->drv) {
         return 0;
     } else if (bs->drv->bdrv_co_flush) {
         return bs->drv->bdrv_co_flush(bs);
diff --git a/block/raw-posix.c b/block/raw-posix.c
index dcae88a..9a3d3af 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -380,6 +380,10 @@ static int raw_co_flush(BlockDriverState *bs)
         .coroutine = qemu_coroutine_self(),
     };
 
+    if (bs->open_flags & BDRV_O_NO_FLUSH) {
+        return 0;
+    }
+
     ret = fd_open(bs);
     if (ret < 0) {
         return ret;
diff --git a/block/raw-win32.c b/block/raw-win32.c
index f5f73bc..37a8bdb 100644
--- a/block/raw-win32.c
+++ b/block/raw-win32.c
@@ -156,6 +156,10 @@ static int raw_flush(BlockDriverState *bs)
     BDRVRawState *s = bs->opaque;
     int ret;
 
+    if (bs->open_flags & BDRV_O_NO_FLUSH) {
+        return 0;
+    }
+
     ret = FlushFileBuffers(s->hfile);
     if (ret == 0) {
         return -EIO;
-- 
1.7.6.4

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
  2011-10-21 17:08 [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe Kevin Wolf
  2011-10-21 17:08 ` [Qemu-devel] [PATCH 1/2] raw-posix: Convert to bdrv_co_flush Kevin Wolf
  2011-10-21 17:08 ` [Qemu-devel] [PATCH 2/2] block: Handle cache=unsafe only in raw-posix/win32 Kevin Wolf
@ 2011-10-21 18:44 ` Paolo Bonzini
  2011-10-22 15:07   ` Alexander Graf
  2011-10-24  7:37   ` Kevin Wolf
  2 siblings, 2 replies; 18+ messages in thread
From: Paolo Bonzini @ 2011-10-21 18:44 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: qemu-devel, avi

On 10/21/2011 07:08 PM, Kevin Wolf wrote:
> Avi complained that not even writing out qcow2's cache on bdrv_flush() made
> cache=unsafe too unsafe to be useful. He's got a point.

Why? cache=unsafe is explicitly allowing to s/data/manure/ on crash.

If you do this for raw-posix, you need to do it for all protocols.

> Kevin Wolf (2):
>    raw-posix: Convert to bdrv_co_flush
>    block: Handle cache=unsafe only in raw-posix/win32

Paolo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
  2011-10-21 18:44 ` [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe Paolo Bonzini
@ 2011-10-22 15:07   ` Alexander Graf
  2011-10-23 14:33     ` Paolo Bonzini
  2011-10-24  7:37   ` Kevin Wolf
  1 sibling, 1 reply; 18+ messages in thread
From: Alexander Graf @ 2011-10-22 15:07 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: Kevin Wolf, qemu-devel, avi


On 21.10.2011, at 11:44, Paolo Bonzini wrote:

> On 10/21/2011 07:08 PM, Kevin Wolf wrote:
>> Avi complained that not even writing out qcow2's cache on bdrv_flush() made
>> cache=unsafe too unsafe to be useful. He's got a point.
> 
> Why? cache=unsafe is explicitly allowing to s/data/manure/ on crash.

Exactly, but not on kill. By not flushing internal caches you're almost guaranteed to get an inconsistent qcow2 image.

> If you do this for raw-posix, you need to do it for all protocols.

Only for the ones that actually do flushes to the block layer. The idea is that while QEMU dies the OS can still finish writing unflushed blocks.


Alex

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
  2011-10-22 15:07   ` Alexander Graf
@ 2011-10-23 14:33     ` Paolo Bonzini
  2011-10-24  8:05       ` Kevin Wolf
  0 siblings, 1 reply; 18+ messages in thread
From: Paolo Bonzini @ 2011-10-23 14:33 UTC (permalink / raw)
  To: Alexander Graf; +Cc: Kevin Wolf, qemu-devel, avi

On 10/22/2011 05:07 PM, Alexander Graf wrote:
>
> On 21.10.2011, at 11:44, Paolo Bonzini wrote:
>
>> On 10/21/2011 07:08 PM, Kevin Wolf wrote:
>>> Avi complained that not even writing out qcow2's cache on
>>> bdrv_flush() made cache=unsafe too unsafe to be useful. He's got
>>> a point.
>>
>> Why? cache=unsafe is explicitly allowing to s/data/manure/ on
>> crash.
>
> Exactly, but not on kill. By not flushing internal caches you're
> almost guaranteed to get an inconsistent qcow2 image.

This should be covered already by termsig_handler.  bdrv_close_all 
closes all block devices, and qcow2_close does flush the caches.

SIGKILL doesn't give any guarantee of course but it does not in general, 
even without cache=unsafe; you might get a SIGKILL "a moment before" a 
bdrv_flush even without cache=unsafe, and get unclean qcow2 metadata.

> By not flushing internal caches you're almost guaranteed to get an
> inconsistent qcow2 image.

Of course the inconsistencies with cache=unsafe will be massive if you 
don't have a clean exit, but that's expected.  If in some cases you want 
a clean exit, but right now you don't, the place to fix those cases 
doesn't seem to be the block layer, but the main loop.

Also,

1) why should cache=unsafe differentiate an OS that sends a flush from 
one that doesn't (e.g. MS-DOS), from the point of view of image metadata?

2) why should the guest actually send a flush if cache=unsafe?  Currently

     if (flags & BDRV_O_CACHE_WB)
         bs->enable_write_cache = 1;

covers cache=unsafe.  However, in the end write cache enable means "do I 
need to flush data", and the answer is "no" when cache=unsafe, because 
the flushes are useless and guests are free to reorder requests.

<shot-in-the-dark>Perhaps what you want is to make qcow2 caches 
writethrough in cache=unsafe mode, so that at least a try is made to 
write the metadata</shot-in-the-dark> (even though the underlying raw 
protocol won't flush it)?  I'm not sure that is particularly useful, but 
maybe it can help me understanding the benefit of this change.

Paolo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
  2011-10-21 18:44 ` [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe Paolo Bonzini
  2011-10-22 15:07   ` Alexander Graf
@ 2011-10-24  7:37   ` Kevin Wolf
  2011-10-24  7:53     ` Paolo Bonzini
  1 sibling, 1 reply; 18+ messages in thread
From: Kevin Wolf @ 2011-10-24  7:37 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: qemu-devel, avi

Am 21.10.2011 20:44, schrieb Paolo Bonzini:
> On 10/21/2011 07:08 PM, Kevin Wolf wrote:
>> Avi complained that not even writing out qcow2's cache on bdrv_flush() made
>> cache=unsafe too unsafe to be useful. He's got a point.
> 
> Why? cache=unsafe is explicitly allowing to s/data/manure/ on crash.

It's surely expected on a host crash, but is it for a qemu crash?
cache=unsafe was introduced to avoid fsync() costs, which it still does
after this patch.

> If you do this for raw-posix, you need to do it for all protocols.

rbd could use it, too, right. Any other protocol I missed?

Kevin

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
  2011-10-24  7:37   ` Kevin Wolf
@ 2011-10-24  7:53     ` Paolo Bonzini
  2011-10-24  8:17       ` Kevin Wolf
  0 siblings, 1 reply; 18+ messages in thread
From: Paolo Bonzini @ 2011-10-24  7:53 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: Alexander Graf, qemu-devel, avi

On 10/24/2011 09:37 AM, Kevin Wolf wrote:
>> Why? cache=unsafe is explicitly allowing to s/data/manure/ on
>> crash.
>
> It's surely expected on a host crash, but is it for a qemu crash?
> cache=unsafe was introduced to avoid fsync() costs, which it still
> does after this patch.

I think it's not about "why is it there", but rather about "what is it 
useful for".  My interpretation of it is "I do not need the image 
anymore unless the command exits cleanly": VM installations, qemu-img 
conversions, BDRV_O_SNAPSHOT (doesn't do it yet, but it could).  Even 
SIGINT and SIGTERM would be excluded from this definition, but they cost 
nothing so it's nice to include them.

>> If you do this for raw-posix, you need to do it for all protocols.
>
> rbd could use it, too, right. Any other protocol I missed?

NBD could, but it doesn't support flush yet.

In general, even if it were useful to implement this, I'm not sure this 
is the best way to implement it.  For example you could simply clear 
BDRV_O_NO_FLUSH in qcow2_open.

Paolo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
  2011-10-23 14:33     ` Paolo Bonzini
@ 2011-10-24  8:05       ` Kevin Wolf
  0 siblings, 0 replies; 18+ messages in thread
From: Kevin Wolf @ 2011-10-24  8:05 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: Alexander Graf, qemu-devel, avi

Am 23.10.2011 16:33, schrieb Paolo Bonzini:
> On 10/22/2011 05:07 PM, Alexander Graf wrote:
>>
>> On 21.10.2011, at 11:44, Paolo Bonzini wrote:
>>
>>> On 10/21/2011 07:08 PM, Kevin Wolf wrote:
>>>> Avi complained that not even writing out qcow2's cache on
>>>> bdrv_flush() made cache=unsafe too unsafe to be useful. He's got
>>>> a point.
>>>
>>> Why? cache=unsafe is explicitly allowing to s/data/manure/ on
>>> crash.
>>
>> Exactly, but not on kill. By not flushing internal caches you're
>> almost guaranteed to get an inconsistent qcow2 image.
> 
> This should be covered already by termsig_handler.  bdrv_close_all 
> closes all block devices, and qcow2_close does flush the caches.
> 
> SIGKILL doesn't give any guarantee of course but it does not in general, 
> even without cache=unsafe; you might get a SIGKILL "a moment before" a 
> bdrv_flush even without cache=unsafe, and get unclean qcow2 metadata.

Unclean yes, in the sense that you may get cluster leaks. If getting
SIGKILL "a moment before" the flush led to real corruption however,
cache=none would be broken as well.

>> By not flushing internal caches you're almost guaranteed to get an
>> inconsistent qcow2 image.
> 
> Of course the inconsistencies with cache=unsafe will be massive if you 
> don't have a clean exit, but that's expected.  If in some cases you want 
> a clean exit, but right now you don't, the place to fix those cases 
> doesn't seem to be the block layer, but the main loop.

I don't think there's much the main loop can do against SIGKILL,
segfaults or abort().

> Also,
> 
> 1) why should cache=unsafe differentiate an OS that sends a flush from 
> one that doesn't (e.g. MS-DOS), from the point of view of image metadata?
> 
> 2) why should the guest actually send a flush if cache=unsafe?  Currently
> 
>      if (flags & BDRV_O_CACHE_WB)
>          bs->enable_write_cache = 1;
> 
> covers cache=unsafe.  However, in the end write cache enable means "do I 
> need to flush data", and the answer is "no" when cache=unsafe, because 
> the flushes are useless and guests are free to reorder requests.
>
> <shot-in-the-dark>Perhaps what you want is to make qcow2 caches 
> writethrough in cache=unsafe mode, so that at least a try is made to 
> write the metadata</shot-in-the-dark> (even though the underlying raw 
> protocol won't flush it)?  I'm not sure that is particularly useful, but 
> maybe it can help me understanding the benefit of this change.

Yes, this is the intention. It's about flushing metadata, not guest
data. The semantics that I think cache=unsafe should have is that after
a bdrv_flush() we have flushed all caches in qemu (so that the image
survives a qemu crash), but we don't care about flushing the host page
cache.

Kevin

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
  2011-10-24  7:53     ` Paolo Bonzini
@ 2011-10-24  8:17       ` Kevin Wolf
  2011-10-24  8:47         ` Paolo Bonzini
  2011-10-24  9:09         ` Peter Maydell
  0 siblings, 2 replies; 18+ messages in thread
From: Kevin Wolf @ 2011-10-24  8:17 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: Alexander Graf, qemu-devel, avi

Am 24.10.2011 09:53, schrieb Paolo Bonzini:
> On 10/24/2011 09:37 AM, Kevin Wolf wrote:
>>> Why? cache=unsafe is explicitly allowing to s/data/manure/ on
>>> crash.
>>
>> It's surely expected on a host crash, but is it for a qemu crash?
>> cache=unsafe was introduced to avoid fsync() costs, which it still
>> does after this patch.
> 
> I think it's not about "why is it there", but rather about "what is it 
> useful for".  My interpretation of it is "I do not need the image 
> anymore unless the command exits cleanly": VM installations, qemu-img 
> conversions, BDRV_O_SNAPSHOT (doesn't do it yet, but it could).  Even 
> SIGINT and SIGTERM would be excluded from this definition, but they cost 
> nothing so it's nice to include them.

I think another common interpretation is: "I don't run this VM in
production but for development. I want the VM to go faster and I can
recreate the image in the unlikely event that power fails during my
work. But it certainly would be nasty."

>>> If you do this for raw-posix, you need to do it for all protocols.
>>
>> rbd could use it, too, right. Any other protocol I missed?
> 
> NBD could, but it doesn't support flush yet.
> 
> In general, even if it were useful to implement this, I'm not sure this 
> is the best way to implement it.  For example you could simply clear 
> BDRV_O_NO_FLUSH in qcow2_open.

That could work, too. On the other hand I don't like block drivers to
modify their flags. For example, would query-block still provide the
correct cache mode then?

But I think that starting to make exceptions for single block drivers
isn't a good idea anyway. If we want bdrv_flush() to write out all
metadata internal to qemu, I think the approach with checking the flag
in drivers calling things like fsync() is better. The common thing is to
do the flush.

Kevin

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
  2011-10-24  8:17       ` Kevin Wolf
@ 2011-10-24  8:47         ` Paolo Bonzini
  2011-10-24  8:54           ` Kevin Wolf
  2011-10-24  9:09         ` Peter Maydell
  1 sibling, 1 reply; 18+ messages in thread
From: Paolo Bonzini @ 2011-10-24  8:47 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: Alexander Graf, qemu-devel, avi

On 10/24/2011 10:17 AM, Kevin Wolf wrote:
> >  I think it's not about "why is it there", but rather about "what is it
> >  useful for".  My interpretation of it is "I do not need the image
> >  anymore unless the command exits cleanly": VM installations, qemu-img
> >  conversions, BDRV_O_SNAPSHOT (doesn't do it yet, but it could).  Even
> >  SIGINT and SIGTERM would be excluded from this definition, but they cost
> >  nothing so it's nice to include them.
>
> I think another common interpretation is: "I don't run this VM in
> production but for development. I want the VM to go faster and I can
> recreate the image in the unlikely event that power fails during my
> work. But it certainly would be nasty."

Fair enough.

> But I think that starting to make exceptions for single block drivers
> isn't a good idea anyway. If we want bdrv_flush() to write out all
> metadata internal to qemu, I think the approach with checking the flag
> in drivers calling things like fsync() is better. The common thing is to
> do the flush.

I don't know... checking BDRV_O_NO_FLUSH in the drivers rather than in 
the generic code sounds like a layering violation.  Perhaps what you're 
after is a separation of bdrv_co_flush from bdrv_{,co_,aio_}fsync?  Then 
BDRV_O_NO_FLUSH (better renamed to BDRV_O_NO_FSYNC...) would only 
inhibit the latter.

Paolo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
  2011-10-24  8:47         ` Paolo Bonzini
@ 2011-10-24  8:54           ` Kevin Wolf
  2011-10-24  9:26             ` Paolo Bonzini
  0 siblings, 1 reply; 18+ messages in thread
From: Kevin Wolf @ 2011-10-24  8:54 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: Alexander Graf, qemu-devel, avi

Am 24.10.2011 10:47, schrieb Paolo Bonzini:
> On 10/24/2011 10:17 AM, Kevin Wolf wrote:
>>>  I think it's not about "why is it there", but rather about "what is it
>>>  useful for".  My interpretation of it is "I do not need the image
>>>  anymore unless the command exits cleanly": VM installations, qemu-img
>>>  conversions, BDRV_O_SNAPSHOT (doesn't do it yet, but it could).  Even
>>>  SIGINT and SIGTERM would be excluded from this definition, but they cost
>>>  nothing so it's nice to include them.
>>
>> I think another common interpretation is: "I don't run this VM in
>> production but for development. I want the VM to go faster and I can
>> recreate the image in the unlikely event that power fails during my
>> work. But it certainly would be nasty."
> 
> Fair enough.
> 
>> But I think that starting to make exceptions for single block drivers
>> isn't a good idea anyway. If we want bdrv_flush() to write out all
>> metadata internal to qemu, I think the approach with checking the flag
>> in drivers calling things like fsync() is better. The common thing is to
>> do the flush.
> 
> I don't know... checking BDRV_O_NO_FLUSH in the drivers rather than in 
> the generic code sounds like a layering violation.  Perhaps what you're 
> after is a separation of bdrv_co_flush from bdrv_{,co_,aio_}fsync?  Then 
> BDRV_O_NO_FLUSH (better renamed to BDRV_O_NO_FSYNC...) would only 
> inhibit the latter.

Why? All other cache related BDRV_O_* flags are interpreted by the block
drivers, so why should BDRV_O_NO_FLUSH be special?

Kevin

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
  2011-10-24  8:17       ` Kevin Wolf
  2011-10-24  8:47         ` Paolo Bonzini
@ 2011-10-24  9:09         ` Peter Maydell
  1 sibling, 0 replies; 18+ messages in thread
From: Peter Maydell @ 2011-10-24  9:09 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: Paolo Bonzini, avi, Alexander Graf, qemu-devel

On 24 October 2011 09:17, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 24.10.2011 09:53, schrieb Paolo Bonzini:
>> I think it's not about "why is it there", but rather about "what is it
>> useful for".  My interpretation of it is "I do not need the image
>> anymore unless the command exits cleanly": VM installations, qemu-img
>> conversions, BDRV_O_SNAPSHOT (doesn't do it yet, but it could).  Even
>> SIGINT and SIGTERM would be excluded from this definition, but they cost
>> nothing so it's nice to include them.
>
> I think another common interpretation is: "I don't run this VM in
> production but for development. I want the VM to go faster and I can
> recreate the image in the unlikely event that power fails during my
> work. But it certainly would be nasty."

So at the moment the documentation (qemu-doc.html) says:
"Writeback caching will report data writes as completed as soon as the
data is present in the host page cache. This is safe as long as you trust
your host. If your host crashes or loses power, then the guest may
experience data corruption."

and also:
"In case you don't care about data integrity over host failures, use
cache=unsafe. This option tells qemu that it never needs to write any
data to the disk but can instead keeps things in cache. If anything
goes wrong, like your host losing power, the disk storage getting
disconnected accidently, etc. you're image will most probably be
rendered unusable."

which to me reads in both cases as "will not corrupt data if the
guest crashes but may do so if the host crashes" and leaves me
with no idea what the difference is. It might be nice if we could
clarify that a bit...

-- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
  2011-10-24  8:54           ` Kevin Wolf
@ 2011-10-24  9:26             ` Paolo Bonzini
  2011-10-24  9:36               ` Kevin Wolf
  0 siblings, 1 reply; 18+ messages in thread
From: Paolo Bonzini @ 2011-10-24  9:26 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: avi, Alexander Graf, qemu-devel

On 10/24/2011 10:54 AM, Kevin Wolf wrote:
> >  I don't know... checking BDRV_O_NO_FLUSH in the drivers rather than in
> >  the generic code sounds like a layering violation.  Perhaps what you're
> >  after is a separation of bdrv_co_flush from bdrv_{,co_,aio_}fsync?  Then
> >  BDRV_O_NO_FLUSH (better renamed to BDRV_O_NO_FSYNC...) would only
> >  inhibit the latter.
>
> Why? All other cache related BDRV_O_* flags are interpreted by the block
> drivers, so why should BDRV_O_NO_FLUSH be special?

You're changing the API and asking for possibly non-trivial changes in 
all protocol drivers, in order to accomodate semantics that all format 
drivers potentially could desire.  So I wonder if the problem is simply 
that the current API is not expressive enough.

Paolo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
  2011-10-24  9:26             ` Paolo Bonzini
@ 2011-10-24  9:36               ` Kevin Wolf
  2011-10-24  9:40                 ` Paolo Bonzini
  0 siblings, 1 reply; 18+ messages in thread
From: Kevin Wolf @ 2011-10-24  9:36 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: avi, Alexander Graf, qemu-devel

Am 24.10.2011 11:26, schrieb Paolo Bonzini:
> On 10/24/2011 10:54 AM, Kevin Wolf wrote:
>>>  I don't know... checking BDRV_O_NO_FLUSH in the drivers rather than in
>>>  the generic code sounds like a layering violation.  Perhaps what you're
>>>  after is a separation of bdrv_co_flush from bdrv_{,co_,aio_}fsync?  Then
>>>  BDRV_O_NO_FLUSH (better renamed to BDRV_O_NO_FSYNC...) would only
>>>  inhibit the latter.
>>
>> Why? All other cache related BDRV_O_* flags are interpreted by the block
>> drivers, so why should BDRV_O_NO_FLUSH be special?
> 
> You're changing the API and asking for possibly non-trivial changes in 
> all protocol drivers, in order to accomodate semantics that all format 
> drivers potentially could desire.  So I wonder if the problem is simply 
> that the current API is not expressive enough.

Can you think of any cases where a caller would want to invoke
bdrv_flush, but not bdrv_fsync? (The other way round it makes even less
sense)

Kevin

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
  2011-10-24  9:36               ` Kevin Wolf
@ 2011-10-24  9:40                 ` Paolo Bonzini
  2011-10-24  9:53                   ` Kevin Wolf
  0 siblings, 1 reply; 18+ messages in thread
From: Paolo Bonzini @ 2011-10-24  9:40 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: avi, Alexander Graf, qemu-devel

On 10/24/2011 11:36 AM, Kevin Wolf wrote:
>> >  You're changing the API and asking for possibly non-trivial changes in
>> >  all protocol drivers, in order to accomodate semantics that all format
>> >  drivers potentially could desire.  So I wonder if the problem is simply
>> >  that the current API is not expressive enough.
> Can you think of any cases where a caller would want to invoke
> bdrv_flush, but not bdrv_fsync? (The other way round it makes even less
> sense)

I'm talking about the internal driver API only.  The external API is 
fine as is.

Paolo

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
  2011-10-24  9:40                 ` Paolo Bonzini
@ 2011-10-24  9:53                   ` Kevin Wolf
  2011-10-24 10:08                     ` Paolo Bonzini
  0 siblings, 1 reply; 18+ messages in thread
From: Kevin Wolf @ 2011-10-24  9:53 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: avi, Alexander Graf, qemu-devel

Am 24.10.2011 11:40, schrieb Paolo Bonzini:
> On 10/24/2011 11:36 AM, Kevin Wolf wrote:
>>>>  You're changing the API and asking for possibly non-trivial changes in
>>>>  all protocol drivers, in order to accomodate semantics that all format
>>>>  drivers potentially could desire.  So I wonder if the problem is simply
>>>>  that the current API is not expressive enough.
>> Can you think of any cases where a caller would want to invoke
>> bdrv_flush, but not bdrv_fsync? (The other way round it makes even less
>> sense)
> 
> I'm talking about the internal driver API only.  The external API is 
> fine as is.

Ok, so external callers don't force us to do it.

Yes, we could split bdrv_flush internally into two functions for "flush
one level to the OS" and "flush all the way down to the disk", but I'm
not sure if this really buys us anything or just adds complexity.

Kevin

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe
  2011-10-24  9:53                   ` Kevin Wolf
@ 2011-10-24 10:08                     ` Paolo Bonzini
  0 siblings, 0 replies; 18+ messages in thread
From: Paolo Bonzini @ 2011-10-24 10:08 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: avi, Alexander Graf, qemu-devel

On 10/24/2011 11:53 AM, Kevin Wolf wrote:
>> >  
>> >  I'm talking about the internal driver API only.  The external API is
>> >  fine as is.
> Ok, so external callers don't force us to do it.
> 
> Yes, we could split bdrv_flush internally into two functions for "flush
> one level to the OS" and "flush all the way down to the disk", but I'm
> not sure if this really buys us anything or just adds complexity.

I would say that:

1) the "safe" NBD and iSCSI drivers are not in-tree, but you would have to
convert those too.  iSCSI uses aio, so it would be non-trivial.

2) long-term you wanted to convert raw-posix to coroutines, but in the
meanwhile your patch introduces yet another partial transition;

3) your two patches are more complex compared to something like this:

diff --git a/block.c b/block.c
index 11c7f91..1e06a8a 100644
--- a/block.c
+++ b/block.c
@@ -2908,9 +2908,19 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
 
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
-    if (bs->open_flags & BDRV_O_NO_FLUSH) {
+    if (!bs->drv) {
         return 0;
-    } else if (!bs->drv) {
+    }
+
+    /* First send everything to the OS.  */
+    if (bs->drv->bdrv_co_flush_metadata) {
+        int ret = bs->drv->bdrv_co_flush_metadata(bs);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    /* Now flush the host cache to disk if we need to.  */
+    if (bs->open_flags & BDRV_O_NO_FLUSH) {
         return 0;
     } else if (bs->drv->bdrv_co_flush) {
         return bs->drv->bdrv_co_flush(bs);
diff --git a/block/qcow2.c b/block/qcow2.c
index a181932..cd13452 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1105,7 +1105,7 @@ fail:
     return ret;
 }
 
-static int qcow2_co_flush(BlockDriverState *bs)
+static int qcow2_co_flush_metadata(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
     int ret;
@@ -1121,7 +1121,11 @@ static int qcow2_co_flush(BlockDriverState *bs)
         return ret;
     }
     qemu_co_mutex_unlock(&s->lock);
+    return 0;
+}
 
+static int qcow2_co_flush(BlockDriverState *bs)
+{
     return bdrv_co_flush(bs->file);
 }
 
@@ -1244,6 +1248,7 @@ static BlockDriver bdrv_qcow2 = {
     .bdrv_co_readv      = qcow2_co_readv,
     .bdrv_co_writev     = qcow2_co_writev,
     .bdrv_co_flush      = qcow2_co_flush,
+    .bdrv_co_flush_metadata = qcow2_co_flush_metadata,
 
     .bdrv_co_discard        = qcow2_co_discard,
     .bdrv_truncate          = qcow2_truncate,
diff --git a/block_int.h b/block_int.h
index dac00f5..ab4ceea 100644
--- a/block_int.h
+++ b/block_int.h
@@ -84,6 +84,7 @@ struct BlockDriver {
     int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
     int coroutine_fn (*bdrv_co_flush)(BlockDriverState *bs);
+    int coroutine_fn (*bdrv_co_flush_metadata)(BlockDriverState *bs);
     int coroutine_fn (*bdrv_co_discard)(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors);
 
Paolo

^ permalink raw reply related	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2011-10-24 10:08 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2011-10-21 17:08 [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe Kevin Wolf
2011-10-21 17:08 ` [Qemu-devel] [PATCH 1/2] raw-posix: Convert to bdrv_co_flush Kevin Wolf
2011-10-21 17:08 ` [Qemu-devel] [PATCH 2/2] block: Handle cache=unsafe only in raw-posix/win32 Kevin Wolf
2011-10-21 18:44 ` [Qemu-devel] [PATCH 0/2] block: Write out internal caches even with cache=unsafe Paolo Bonzini
2011-10-22 15:07   ` Alexander Graf
2011-10-23 14:33     ` Paolo Bonzini
2011-10-24  8:05       ` Kevin Wolf
2011-10-24  7:37   ` Kevin Wolf
2011-10-24  7:53     ` Paolo Bonzini
2011-10-24  8:17       ` Kevin Wolf
2011-10-24  8:47         ` Paolo Bonzini
2011-10-24  8:54           ` Kevin Wolf
2011-10-24  9:26             ` Paolo Bonzini
2011-10-24  9:36               ` Kevin Wolf
2011-10-24  9:40                 ` Paolo Bonzini
2011-10-24  9:53                   ` Kevin Wolf
2011-10-24 10:08                     ` Paolo Bonzini
2011-10-24  9:09         ` Peter Maydell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).