* [Qemu-devel] [PATCH] bdrv_aio_flush
From: Andrea Arcangeli @ 2008-08-29 13:37 UTC
To: qemu-devel
Hello,
while reading the aio/ide code I noticed that the bdrv_flush operation is
unsafe. When a write command is submitted with bdrv_aio_write and
bdrv_flush is called later, the fsync does nothing useful: fsync only sees
the kernel writeback cache, while the write command is still queued in an
aio thread and therefore still invisible to the kernel. bdrv_aio_flush
instead covers both the regular bdrv_write (which submits data to the
kernel synchronously) and bdrv_aio_write, because the fsync is queued at
the end of the aio queue and is issued by the aio pthread itself.
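To illustrate the difference, roughly (an untested sketch with made-up
names, error handling omitted; it relies on the aio implementation keeping
per-descriptor ordering, which glibc does by serializing requests on one
thread per file descriptor):

#include <aio.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static struct aiocb write_req, sync_req;

void guest_write_then_barrier(int fd, void *buf, size_t len, off_t off)
{
    memset(&write_req, 0, sizeof(write_req));
    write_req.aio_fildes = fd;
    write_req.aio_buf    = buf;
    write_req.aio_offset = off;
    write_req.aio_nbytes = len;
    aio_write(&write_req);          /* data sits in an aio thread for now */

    fsync(fd);                      /* UNSAFE: only syncs what has already
                                       reached the kernel page cache */

    memset(&sync_req, 0, sizeof(sync_req));
    sync_req.aio_fildes = fd;
    aio_fsync(O_DSYNC, &sync_req);  /* SAFER: queued behind the pending
                                       aio_write; completion is reported
                                       via aio_error()/aio_return() */
}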
IDE works by luck because it can only submit one command at a time (no
tagged queueing), so presumably the guest kernel driver will wait for the
emulated IDE device to return ready before issuing a journaling barrier
with WIN_FLUSH_CACHE*. But with SCSI and tagged command queueing this bug
in the common aio code will become visible, and it will break the guest's
journaling guarantees if there's a power loss on the host. So it's not
urgent for IDE I think, but it clearly should be fixed in the qemu block
model eventually.
This should fix it. Unfortunately I didn't implement all the backends and
I have only tested it with KVM so far. I also don't know for sure whether
the BUSY bit should be set during the flush; I assumed yes. It was
irrelevant before because, from the guest's point of view, the flush
happened atomically before the ioport write returned.
Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>
Index: block_int.h
===================================================================
--- block_int.h (revision 5089)
+++ block_int.h (working copy)
@@ -54,6 +54,8 @@
BlockDriverAIOCB *(*bdrv_aio_write)(BlockDriverState *bs,
int64_t sector_num, const uint8_t *buf, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque);
+ BlockDriverAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque);
void (*bdrv_aio_cancel)(BlockDriverAIOCB *acb);
int aiocb_size;
Index: block-raw-posix.c
===================================================================
--- block-raw-posix.c (revision 5089)
+++ block-raw-posix.c (working copy)
@@ -638,6 +638,21 @@
return &acb->common;
}
+static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ RawAIOCB *acb;
+
+ acb = raw_aio_setup(bs, 0, NULL, 0, cb, opaque);
+ if (!acb)
+ return NULL;
+ if (aio_fsync(O_DSYNC, &acb->aiocb) < 0) {
+ qemu_aio_release(acb);
+ return NULL;
+ }
+ return &acb->common;
+}
+
static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs,
int64_t sector_num, const uint8_t *buf, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque)
@@ -857,6 +872,7 @@
#ifdef CONFIG_AIO
.bdrv_aio_read = raw_aio_read,
.bdrv_aio_write = raw_aio_write,
+ .bdrv_aio_flush = raw_aio_flush,
.bdrv_aio_cancel = raw_aio_cancel,
.aiocb_size = sizeof(RawAIOCB),
#endif
@@ -1211,6 +1227,7 @@
#ifdef CONFIG_AIO
.bdrv_aio_read = raw_aio_read,
.bdrv_aio_write = raw_aio_write,
+ .bdrv_aio_flush = raw_aio_flush,
.bdrv_aio_cancel = raw_aio_cancel,
.aiocb_size = sizeof(RawAIOCB),
#endif
Index: block-qcow2.c
===================================================================
--- block-qcow2.c (revision 5089)
+++ block-qcow2.c (working copy)
@@ -1369,6 +1369,42 @@
return &acb->common;
}
+static void qcow_aio_flush_cb(void *opaque, int ret)
+{
+ QCowAIOCB *acb = opaque;
+ BlockDriverState *bs = acb->common.bs;
+ BDRVQcowState *s = bs->opaque;
+
+ acb->hd_aiocb = NULL;
+ if (ret < 0) {
+ acb->common.cb(acb->common.opaque, ret);
+ qemu_aio_release(acb);
+ return;
+ }
+
+ /* request completed */
+ acb->common.cb(acb->common.opaque, 0);
+ qemu_aio_release(acb);
+}
+
+static BlockDriverAIOCB *qcow_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVQcowState *s = bs->opaque;
+ QCowAIOCB *acb;
+
+ acb = qcow_aio_setup(bs, 0, NULL, 0, cb, opaque);
+ if (!acb)
+ return NULL;
+
+ acb->hd_aiocb = bdrv_aio_flush(s->hd,
+ qcow_aio_flush_cb, acb);
+ if (acb->hd_aiocb == NULL)
+ qcow_aio_flush_cb(acb, -ENOMEM);
+
+ return &acb->common;
+}
+
static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
{
QCowAIOCB *acb = (QCowAIOCB *)blockacb;
@@ -2612,6 +2648,7 @@
.bdrv_aio_read = qcow_aio_read,
.bdrv_aio_write = qcow_aio_write,
+ .bdrv_aio_flush = qcow_aio_flush,
.bdrv_aio_cancel = qcow_aio_cancel,
.aiocb_size = sizeof(QCowAIOCB),
.bdrv_write_compressed = qcow_write_compressed,
Index: block.c
===================================================================
--- block.c (revision 5089)
+++ block.c (working copy)
@@ -1181,6 +1181,20 @@
return ret;
}
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BlockDriver *drv = bs->drv;
+ BlockDriverAIOCB *ret;
+
+ if (!drv)
+ return NULL;
+
+ ret = drv->bdrv_aio_flush(bs, cb, opaque);
+
+ return ret;
+}
+
void bdrv_aio_cancel(BlockDriverAIOCB *acb)
{
BlockDriver *drv = acb->bs->drv;
Index: block.h
===================================================================
--- block.h (revision 5089)
+++ block.h (working copy)
@@ -87,6 +87,8 @@
BlockDriverAIOCB *bdrv_aio_write(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque);
void bdrv_aio_cancel(BlockDriverAIOCB *acb);
void qemu_aio_init(void);
Index: hw/ide.c
===================================================================
--- hw/ide.c (revision 5089)
+++ hw/ide.c (working copy)
@@ -1969,6 +1969,13 @@
ide_if[1].select &= ~(1 << 7);
}
+static void ide_flush_dma_cb(void *opaque, int ret)
+{
+ IDEState *s = opaque;
+ s->status = READY_STAT | SEEK_STAT;
+ ide_set_irq(s);
+}
+
static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
{
IDEState *ide_if = opaque;
@@ -2237,8 +2244,11 @@
break;
case WIN_FLUSH_CACHE:
case WIN_FLUSH_CACHE_EXT:
- if (s->bs)
- bdrv_flush(s->bs);
+ if (s->bs) {
+ s->status = READY_STAT | SEEK_STAT | BUSY_STAT;
+ bdrv_aio_flush(s->bs, ide_flush_dma_cb, s);
+ break;
+ }
s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s);
break;
* Re: [Qemu-devel] [PATCH] bdrv_aio_flush
From: Ian Jackson @ 2008-09-01 11:27 UTC
To: qemu-devel
Andrea Arcangeli writes ("[Qemu-devel] [PATCH] bdrv_aio_flush"):
> while reading the aio/ide code I noticed the bdrv_flush operation is
> unsafe. When a write command is submitted with bdrv_aio_write and
> later bdrv_flush is called, fsync will do nothing. fsync only sees the
> kernel writeback cache. But the write command is still queued in the
> aio kernel thread and is still invisible to the kernel. bdrv_aio_flush
> will instead see both the regular bdrv_write (that submits data to the
> kernel synchronously) as well as the bdrv_aio_write as the fsync will
> be queued at the end of the aio queue and it'll be issued by the aio
> pthread thread itself.
I think this is fine. We discussed this some time ago. bdrv_flush
guarantees that _already completed_ IO operations are flushed. It
does not guarantee that in flight AIO operations are completed and
then flushed to disk.
This is fine.
> IDE works by luck because it can only submit one command at once (no
> tagged queueing) so supposedly the guest kernel driver will wait the
> IDE emulated device to return ready before issuing a journaling
> barrier with WIN_FLUSH_CACHE* but with scsi and tagged command
> queueing this bug in the aio common code will become visible and it'll
> break the journaling guarantees of the guest if there's a power loss
> in the host. So it's not urgent for IDE I think, but it clearly should
> be fixed in the qemu block model eventually.
I don't think this criticism is correct because I think the IDE FLUSH
CACHE command should be read the same way. The spec I have here is
admittedly quite unclear but I can't see any reason to think that the
`write cache' which is referred to by the spec is regarded as
containing data which has not yet been DMAd from the host to the disk
because the command which does that transfer is not yet complete.
Ian.
* Re: [Qemu-devel] [PATCH] bdrv_aio_flush
From: Andrea Arcangeli @ 2008-09-01 12:25 UTC
To: qemu-devel
On Mon, Sep 01, 2008 at 12:27:02PM +0100, Ian Jackson wrote:
> I think this is fine. We discussed this some time ago. bdrv_flush
You mean fsync is just fine, or you mean replacing fsync with
aio_fsync is fine and needed? ;)
> guarantees that _already completed_ IO operations are flushed. It
> does not guarantee that in flight AIO operations are completed and
> then flushed to disk.
In case you meant fsync is just fine: Linux uses
WIN_FLUSH_CACHE/WIN_FLUSH_CACHE_EXT for barriers, see
idedisk_prepare_flush:
if (barrier) {
ordered = QUEUE_ORDERED_DRAIN_FLUSH;
prep_fn = idedisk_prepare_flush;
}
so if we don't want guest journaling to break with scsi/virtio, we have
to make sure the AIO is committed to disk before the flush returns.
To be clear: this is only a problem if there's a power outage in the
host.
> [..] I can't see any reason to think that the
> `write cache' which is referred to by the spec is regarded as
> containing data which has not yet been DMAd from the host to the disk
> because the command which does that transfer is not yet complete.
I'm not sure I follow. IDE is safe because it submits one command at a
time and we don't simulate a dirty write cache. So by the time
bdrv_flush is called, the previous aio_write has already completed, and
in turn the dirty data is already visible to the kernel, which will
write it to disk with fsync.
But anything a bit more clever than IDE that allows the guest to
submit a barrier in a TCQ way, like scsi or virtio, will break guest
journaling if fsync is used. By the time the flush operation returns,
all previous data must be written to disk. Or at least the flush
operations should complete in order, so anything after the barrier
should be written after the data that preceded it. fsync can't
guarantee either, because it returns immediately even if the aio queue
is huge, and once the aio queue has been flushed to the kernel
writeback cache, the kernel is free to write that cache out in whatever
order it wants (Linux will try to write it in dirty-inode order first,
and then in logical order according to the offset of the dirty data
within each inode, walking the inode's radix tree).
* Re: [Qemu-devel] [PATCH] bdrv_aio_flush
From: Jamie Lokier @ 2008-09-01 13:25 UTC
To: qemu-devel
Ian Jackson wrote:
> Andrea Arcangeli writes ("[Qemu-devel] [PATCH] bdrv_aio_flush"):
> > while reading the aio/ide code I noticed the bdrv_flush operation is
> > unsafe. When a write command is submitted with bdrv_aio_write and
> > later bdrv_flush is called, fsync will do nothing. fsync only sees the
> > kernel writeback cache. But the write command is still queued in the
> > aio kernel thread and is still invisible to the kernel. bdrv_aio_flush
> > will instead see both the regular bdrv_write (that submits data to the
> > kernel synchronously) as well as the bdrv_aio_write as the fsync will
> > be queued at the end of the aio queue and it'll be issued by the aio
> > pthread thread itself.
>
> I think this is fine. We discussed this some time ago. bdrv_flush
> guarantees that _already completed_ IO operations are flushed. It
> does not guarantee that in flight AIO operations are completed and
> then flushed to disk.
Andrea thinks bdrv_aio_flush does guarantee that in flight operations
are flushed, while bdrv_flush definitely does not (fsync doesn't).
I vaguely recall from the discussion before, there was uncertainty
about whether that is true, and therefore the right thing to do was
wait for the in flight AIOs to complete _first_ and then issue an
fsync or aio_fsync call.
The Open Group text for aio_fsync says: "shall asynchronously force
all I/O operations [...] queued at the time of the call to aio_fsync
[...]".
We weren't certain if operations "queued at the time of the call"
definitely meant everything queued by earlier calls to aio_write(), or
if in implementations which start a thread to call write(), that
"queued" might not include AIOs which were still in a thread somewhere.
This is because we considered there might not be a strong ordering
between submitted AIOs: each one might behave _as if_ it launched an
independent thread to process the synchronous equivalent.
This also came up when somebody asked why Glibc's AIO has only one
thread per file descriptor, limiting its performance.
In fact there's a good reason for Glibc's one-thread limitation.
Since then, I've read the Open Group specifications more closely, and
some other OS man pages, and they are consistent that _writes_ always
occur in the order they are submitted to aio_write.
(The Linux man page is curiously an exception in not saying this.)
In other words, there is a queue, and submitted aio_writes() are
strongly ordered.[1]
So it seems very likely that aio_fsync _is_ specified as Andrea
thinks, i.e. that it flushes all writes which were queued with
aio_write before the call, and it's unfortunate that a different
interpretation of the specification's words is possible.[2]
[1] - This is especially important when writing to a socket, pipe,
tape drive. It's a little surprising it doesn't specify the
same about reads, since reading from sockets, pipes and tape
drives requires ordering guarantees too.
[2] - This is useful: with some devices, it can be much faster to keep
a queue going than to wait for it to drain to issue flushes for
barriers.
> > IDE works by luck because it can only submit one command at once (no
> > tagged queueing) so supposedly the guest kernel driver will wait the
> > IDE emulated device to return ready before issuing a journaling
> > barrier with WIN_FLUSH_CACHE* but with scsi and tagged command
> > queueing this bug in the aio common code will become visible and it'll
> > break the journaling guarantees of the guest if there's a power loss
> > in the host. So it's not urgent for IDE I think, but it clearly should
> > be fixed in the qemu block model eventually.
>
> I don't think this criticism is correct because I think the IDE FLUSH
> CACHE command should be read the same way. The spec I have here is
> admittedly quite unclear but I can't see any reason to think that the
> `write cache' which is referred to by the spec is regarded as
> containing data which has not yet been DMAd from the host to the disk
> because the command which does that transfer is not yet complete.
I have just checked ATA-8, the draft. With ATA TCQ or NCQ (those are
the features that allow more than one command in flight), the only
queueable commands are specific types of reads and writes.
For all other commands, only one command total can be in flight.
FLUSH CACHE cannot be queued: the OS must wait for preceding commands
to drain before it can issue FLUSH CACHE (it'll be refused otherwise).
So the question of flushing data not yet DMA'd doesn't apply (maybe in
a future ATA spec it will.)
What _can_ be queued is a WRITE FUA command: meaning write some data
and flush _this_ data to non-volatile storage.
Inside qemu, that should map to a write+fsync sequence somehow, or
write using an O_SYNC or O_DIRECT file descriptor. (Though a user
option to disable fsyncs and make _all_ writes cached would be handy,
for those temporary VMs you want to run as fast as possible.)
(By the way, READ FUA is also in the ATA spec. It means force a read
from the non-volatile medium, don't read from cache. If there is
dirty data in cache, flush it first.)
-- Jamie
* Re: [Qemu-devel] [PATCH] bdrv_aio_flush
From: Jamie Lokier @ 2008-09-01 13:54 UTC
To: qemu-devel
Andrea Arcangeli wrote:
> To be clear: this is only a problem if there's a power outage in the
> host.
Actually also if the host kernel crashes some time afterwards, because
we're talking about data that's written by AIO and not properly fsync'd.
-- Jamie
* Re: [Qemu-devel] [PATCH] bdrv_aio_flush
From: Ian Jackson @ 2008-09-02 10:46 UTC
To: qemu-devel
Jamie Lokier writes ("Re: [Qemu-devel] [PATCH] bdrv_aio_flush"):
> Andrea thinks bdrv_aio_flush does guarantee that in flight operations
> are flushed, while bdrv_flush definitely does not (fsync doesn't).
I read Andrea as complaining that bdrv_aio_flush _should_ flush in
flight operations but does not.
> I vaguely recall from the discussion before, there was uncertainty
> about whether that is true, and therefore the right thing to do was
> wait for the in flight AIOs to complete _first_ and then issue an
> fsync or aio_fsync call.
Whether bdrv_aio_flush should do this is a question of the qemu
internal API.
> The Open Group text for aio_fsync says: "shall asynchronously force
> all I/O operations [...] queued at the time of the call to aio_fsync
> [...]".
We discussed the meaning of the unix specs before. I did a close
textual analysis of it here:
http://lists.nongnu.org/archive/html/qemu-devel/2008-04/msg00046.html
> Since then, I've read the Open Group specifications more closely, and
> some other OS man pages, and they are consistent that _writes_ always
> occur in the order they are submitted to aio_write.
Can you give me a chapter and verse quote for that? I'm looking at
SuSv3, e.g.
http://www.opengroup.org/onlinepubs/009695399/nfindex.html
All I can see is this:
If O_APPEND is set for the file descriptor, write operations append
to the file in the same order as the calls were made.
If things are as you say and aio writes are always executed in the
order queued, there would be no need to specify that because it would
be implied by the behaviour of write(2) and O_APPEND.
> What _can_ be queued is a WRITE FUA command: meaning write some data
> and flush _this_ data to non-volatile storage.
In order to implement that interface without flushing other data
unnecessarily, we need to be able to
submit other IO requests
submit aio_write request for WRITE FUA
asynchronously await completion of the aio_write for WRITE FUA
submit and perhaps collect completion of other IO requests
collect completion of aio_write for WRITE FUA
submit and perhaps collect completion of other IO requests
submit aio_fsync (for WRITE FUA)
submit and perhaps collect completion of other IO requests
collect aio_fsync (for WRITE FUA)
This is still not perfect because we unnecessarily flush some data,
thus delaying reporting completion of the WRITE FUA. But there is at
least no need to wait for _other_ writes to complete.
So the semantics of bdrv_aio_flush should be `flush (at least) writes
which have already completed'.
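In code, the sequence above might look roughly like this (an untested
sketch in plain POSIX AIO terms; the function names are made up and the
bookkeeping for the other in-flight requests is omitted):

#include <aio.h>
#include <fcntl.h>
#include <string.h>

static struct aiocb fua_write, fua_sync;

/* Step 1: queue the FUA write like any other write; other IO keeps going. */
void submit_write_fua(int fd, void *buf, size_t len, off_t off)
{
    memset(&fua_write, 0, sizeof(fua_write));
    fua_write.aio_fildes = fd;
    fua_write.aio_buf    = buf;
    fua_write.aio_offset = off;
    fua_write.aio_nbytes = len;
    aio_write(&fua_write);
}

/* Step 2: once aio_error(&fua_write) reports completion, queue the sync. */
void fua_write_completed(int fd)
{
    memset(&fua_sync, 0, sizeof(fua_sync));
    fua_sync.aio_fildes = fd;
    aio_fsync(O_DSYNC, &fua_sync);
}

/* Step 3: report the WRITE FUA as done when fua_sync completes. */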
Ian.
* Re: [Qemu-devel] [PATCH] bdrv_aio_flush
From: Ian Jackson @ 2008-09-02 10:52 UTC
To: qemu-devel
Andrea Arcangeli writes ("Re: [Qemu-devel] [PATCH] bdrv_aio_flush"):
> In case you meant fsync is just fine, Linux will use the
> WIN_FLUSH_CACHE/WIN_FLUSH_CACHE_EXT see
> idedisk_prepare_flush:
>
> if (barrier) {
> ordered = QUEUE_ORDERED_DRAIN_FLUSH;
> prep_fn = idedisk_prepare_flush;
> }
I'm not sure I follow but I'm not familiar with the relevant Linux
code. Do you mean that Linux does this
submit IO 1 to device
submit flush to device
collect IO 1 completion
collect flush completion
and then expects IO 1 to be on disk ?
If this is a documented behaviour of the controller or device we're
emulating then I think the qemu emulation (hw/scsi-disk.c or whatever)
should do an aio_flush barrier before aio_fsync.
What spec should I be referring to ?
Perhaps I'm naive but I would expect the device interface to look more
like the kernel syscall interface, whose specification text (in SuSv3)
I analysed in April in
http://lists.nongnu.org/archive/html/qemu-devel/2008-04/msg00046.html
Ian.
* Re: [Qemu-devel] [PATCH] bdrv_aio_flush
From: Jens Axboe @ 2008-09-02 14:25 UTC
To: qemu-devel
On Tue, Sep 02 2008, Ian Jackson wrote:
> Andrea Arcangeli writes ("Re: [Qemu-devel] [PATCH] bdrv_aio_flush"):
> > In case you meant fsync is just fine, Linux will use the
> > WIN_FLUSH_CACHE/WIN_FLUSH_CACHE_EXT see
> > idedisk_prepare_flush:
> >
> > if (barrier) {
> > ordered = QUEUE_ORDERED_DRAIN_FLUSH;
> > prep_fn = idedisk_prepare_flush;
> > }
>
> I'm not sure I follow but I'm not familiar with the relevant Linux
> code. Do you mean that Linux does this
>
> submit IO 1 to device
> submit flush to device
> collect IO 1 completion
> collect flush completion
>
> and then expects IO 1 to be on disk ?
Given that this case refers to PATA/SATA and that you can't queue the
flush operation, it looks something like:
submit IO 1 to device
IO 1 completes
submit flush
flush completes
and at this point, flush completion either ends in error (in which case
we can't rely on IO 1 being safely stored), or it succeeds and Linux then
expects IO 1 to be on disk. If it isn't, then the device is broken (as
simple as that).
> If this is a documented behaviour of the controller or device we're
> emulating then I think the qemu emulation (hw/scsi-disk.c or whatever)
> should do an aio_flush barrier before aio_fsync.
Certainly it is: when the flush command returns, all dirty drive cache
must be safely on the platter.
> What spec should I be referring to ?
ata spec, see t13.org
> Perhaps I'm naive but I would expect the device interface to look more
> like the kernel syscall interface, whose specification text (in SuSv3)
> I analysed in April in
> http://lists.nongnu.org/archive/html/qemu-devel/2008-04/msg00046.html
>
> Ian.
>
>
--
Jens Axboe
* Re: [Qemu-devel] [PATCH] bdrv_aio_flush
From: Jens Axboe @ 2008-09-02 14:28 UTC
To: qemu-devel
On Tue, Sep 02 2008, Ian Jackson wrote:
> Jamie Lokier writes ("Re: [Qemu-devel] [PATCH] bdrv_aio_flush"):
> > Andrea thinks bdrv_aio_flush does guarantee that in flight operations
> > are flushed, while bdrv_flush definitely does not (fsync doesn't).
>
> I read Andrea as complaining that bdrv_aio_flush _should_ flush in
> flight operations but does not.
>
> > I vaguely recall from the discussion before, there was uncertainty
> > about whether that is true, and therefore the right thing to do was
> > wait for the in flight AIOs to complete _first_ and then issue an
> > fsync or aio_fsync call.
>
> Whether bdrv_aio_flush should do this is a question of the qemu
> internal API.
>
> > The Open Group text for aio_fsync says: "shall asynchronously force
> > all I/O operations [...] queued at the time of the call to aio_fsync
> > [...]".
>
> We discussed the meaning of the unix specs before. I did a close
> textual analysis of it here:
>
> http://lists.nongnu.org/archive/html/qemu-devel/2008-04/msg00046.html
>
> > Since then, I've read the Open Group specifications more closely, and
> > some other OS man pages, and they are consistent that _writes_ always
> > occur in the order they are submitted to aio_write.
>
> Can you give me a chapter and verse quote for that ? I'm looking at
> SuSv3 eg
> http://www.opengroup.org/onlinepubs/009695399/nfindex.html
>
> All I can see is this:
> If O_APPEND is set for the file descriptor, write operations append
> to the file in the same order as the calls were made.
>
> If things are as you say and aio writes are always executed in the
> order queued, there would be no need to specify that because it would
> be implied by the behaviour of write(2) and O_APPEND.
>
> > What _can_ be queued is a WRITE FUA command: meaning write some data
> > and flush _this_ data to non-volatile storage.
>
> In order to implement that interface without flushing other data
> unecessarily, we need to be able to
>
> submit other IO requests
> submit aio_write request for WRITE FUA
> asynchronously await completion of the aio_write for WRITE FUA
> submit and perhaps collect completion of other IO requests
> collect completion of aio_write for WRITE FUA
> submit and perhaps collect completion of other IO requests
> submit aio_fsync (for WRITE FUA)
> submit and perhaps collect completion of other IO requests
> collect aio_fsync (for WRITE FUA)
>
> This is still not perfect because we unnecessarily flush some data
> thus delaying reporting completion of the WRITE FUA. But there is at
> at least no need to wait for _other_ writes to complete.
I don't see how the above works. There's no ordering dependency between
FUA and non-FUA writes; in fact FUA writes tend to jump the device queue
because certain other operating systems use them for conditions where
that is appropriate. So unless you do all writes using FUA, there's no
way around a flush for committing dirty data. Unfortunately we don't
have a FLUSH_RANGE command; the flush is just a big sledgehammer.
--
Jens Axboe
* Re: [Qemu-devel] [PATCH] bdrv_aio_flush
From: Ian Jackson @ 2008-09-02 16:49 UTC
To: qemu-devel
Jens Axboe writes ("Re: [Qemu-devel] [PATCH] bdrv_aio_flush"):
> Given that this case refers to PATA/SATA and that you can't queue the
> flush operation, it looks something like:
>
> submit IO 1 to device
> IO 1 completes
> submit flush
> flush completes
That's what I thought.
> and at this point, flush completion either ends in error (in which case
> we can't rely on IO 1 being safely stored), or it suceeds and Linux then
> expects IO 1 to be on disk. If it isn't, then the device is broken (as
> simple as that).
Yes, I agree. And I think that (with my recent error handling and
cache flushing patchsets - 5 patches total) is what qemu
implements.
> > What spec should I be referring to ?
>
> ata spec, see t13.org
Thanks. I was looking at an older spec I found a link to from
wikipedia ...
Ian.
* Re: [Qemu-devel] [PATCH] bdrv_aio_flush
From: Ian Jackson @ 2008-09-02 16:52 UTC
To: qemu-devel
Jens Axboe writes ("Re: [Qemu-devel] [PATCH] bdrv_aio_flush"):
> On Tue, Sep 02 2008, Ian Jackson wrote:
> > This is still not perfect because we unnecessarily flush some data
> > thus delaying reporting completion of the WRITE FUA. But there is at
> > at least no need to wait for _other_ writes to complete.
>
> I don't see how the above works. There's no dependency on FUA and
> non-FUA writes, in fact FUA writes tend to jump the device queue due to
> certain other operating systems using it for conditions where that is
> appropriate. So unless you do all writes using FUA, there's no way
> around a flush for committing dirty data. Unfortunately we don't have a
> FLUSH_RANGE command, it's just a big sledge hammer.
Yes, certainly you do aio_sync _some_ data that doesn't need to be.
Without an O_FSYNC flag on aio_write that's almost inevitable.
But if bdrv_aio_fsync also does a flush first then you're going to
sync _even more_ unnecessarily: the difference between `bdrv_aio_fsync
does flush first' and `bdrv_aio_fsync does not flush' only affects
writes that are queued but not completed when bdrv_aio_fsync is called.
That is, non-FUA writes which were submitted after the FUA write.
There is no need to fsync these and that's what I think qemu should
do.
Andrea was making some comments about scsi and virtio. It's possible
that these have different intended semantics and perhaps those device
models (in hw/*) need to call flush explicitly before sync.
Ian.
* Re: [Qemu-devel] [PATCH] bdrv_aio_flush
From: Jamie Lokier @ 2008-09-02 18:01 UTC
To: qemu-devel
Ian Jackson wrote:
> > The Open Group text for aio_fsync says: "shall asynchronously force
> > all I/O operations [...] queued at the time of the call to aio_fsync
> > [...]".
>
> We discussed the meaning of the unix specs before. I did a close
> textual analysis of it here:
>
> http://lists.nongnu.org/archive/html/qemu-devel/2008-04/msg00046.html
>
> > Since then, I've read the Open Group specifications more closely, and
> > some other OS man pages, and they are consistent that _writes_ always
> > occur in the order they are submitted to aio_write.
>
> Can you give me a chapter and verse quote for that ? I'm looking at
> SuSv3 eg
> http://www.opengroup.org/onlinepubs/009695399/nfindex.html
>
> All I can see is this:
> If O_APPEND is set for the file descriptor, write operations append
> to the file in the same order as the calls were made.
Ack, you are right (again). The SuS language is unclear, and ordering
cannot be assumed. I'd forgotten about the O_APPEND constraint. My
brain was addled from reading more things (below).
> If things are as you say and aio writes are always executed in the
> order queued, there would be no need to specify that because it would
> be implied by the behaviour of write(2) and O_APPEND.
You're right. That's a subtlety I missed.
The following document on sun.com says:
"I/O Issues (Multi-threaded programming guide)"
[http://docs.sun.com/app/docs/doc/816-5137/gen-99376?a=view]
"In most situations, asynchronous I/O is not required because its
effects can be achieved with the use of threads, with each thread
execution of synchronous I/O. However, in a few situations, threads
cannot achieve what asynchronous I/O can.
"The most straightforward example is writing to a tape drive to make
the tape drive stream. Streaming prevents the tape drive from
stopping while the drive is being written to. The tape moves forward
at high speed while supplying a constant stream of data that is
written to tape.
"To support streaming, the tape driver in the kernel should use
threads. The tape driver in the kernel must issue a queued write
request when the tape driver responds to an interrupt. The interrupt
indicates that the previous tape-write operation has completed.
"Threads cannot guarantee that asynchronous writes are ordered
because the order in which threads execute is indeterminate. You
cannot, for example, specify the order of a write to a tape."
Yet I didn't find anything in Solaris man pages which implies anything
different than the SuSv2/3 description on Solaris, so I don't see how
the above strategy can work. It does sound like a useful strategy
when streaming data to any device.
Quite a lot of things use aio_read and aio_write with sockets,
although I don't see anything in SuS which permits that; it's notable,
though, that ESPIPE is not listed as an error. Socket AIO must
preserve request ordering for both aio_read and aio_write (to stream
sockets), otherwise it's limited to one request at a time.
So I'm back to agreeing with you that nothing in SuS that I've seen
says you can rely on the order of requests submitted by AIO (nor can
you expect AIO on sockets to work).
Ugh.
It would be interesting to see what, say, Oracle does with aio_fsync.
> > What _can_ be queued is a WRITE FUA command: meaning write some data
> > and flush _this_ data to non-volatile storage.
>
> In order to implement that interface without flushing other data
> unecessarily, we need to be able to
>
> submit other IO requests
> submit aio_write request for WRITE FUA
> asynchronously await completion of the aio_write for WRITE FUA
> submit and perhaps collect completion of other IO requests
> collect completion of aio_write for WRITE FUA
> submit and perhaps collect completion of other IO requests
> submit aio_fsync (for WRITE FUA)
> submit and perhaps collect completion of other IO requests
> collect aio_fsync (for WRITE FUA)
>
> This is still not perfect because we unnecessarily flush some data
> thus delaying reporting completion of the WRITE FUA. But there is at
> at least no need to wait for _other_ writes to complete.
>
> So the semantics of bdrv_aio_flush should be `flush (at least) writes
> which have already completed'.
That's an interesting reason for bdrv_aio_flush() to have that
specification.
It would also make fsync() a correct implementation for
bdrv_aio_flush() - aio_fsync() not required (but better).
Btw, on Linux you can use sync_file_range() to flush specific regions
of a file. It's messy though, and the documentation doesn't really
explain how to use it properly or what it does. There is no AIO
equivalent to it. And it doesn't invoke host disk barriers.
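For reference, a use of it might look roughly like this (Linux-specific,
untested sketch; as said, it only pushes page cache for that range, no
metadata and no disk barrier):

#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/types.h>

/* Write out and wait for just one region of the file; the rest of the
 * file's dirty data is left alone. */
int flush_file_range(int fd, off64_t offset, off64_t nbytes)
{
    return sync_file_range(fd, offset, nbytes,
                           SYNC_FILE_RANGE_WAIT_BEFORE |
                           SYNC_FILE_RANGE_WRITE |
                           SYNC_FILE_RANGE_WAIT_AFTER);
}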
-- Jamie
* Re: [Qemu-devel] [PATCH] bdrv_aio_flush
From: Jamie Lokier @ 2008-09-02 18:22 UTC
To: qemu-devel
Ian Jackson wrote:
> Jens Axboe writes ("Re: [Qemu-devel] [PATCH] bdrv_aio_flush"):
> > On Tue, Sep 02 2008, Ian Jackson wrote:
> > > This is still not perfect because we unnecessarily flush some data
> > > thus delaying reporting completion of the WRITE FUA. But there is at
> > > at least no need to wait for _other_ writes to complete.
> >
> > I don't see how the above works. There's no dependency on FUA and
> > non-FUA writes, in fact FUA writes tend to jump the device queue due to
> > certain other operating systems using it for conditions where that is
> > appropriate. So unless you do all writes using FUA, there's no way
> > around a flush for committing dirty data. Unfortunately we don't have a
> > FLUSH_RANGE command, it's just a big sledge hammer.
>
> Yes, certainly you do aio_sync _some_ data that doesn't need to be.
> Without an O_FSYNC flag on aio_write that's almost inevitable.
Btw, in principle for FUA writes you can set O_SYNC or O_DSYNC on the
file descriptor just for this operation. Either using fcntl() (but
I'm not sure I believe that would be portable and really work), or
using two file descriptors.
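The two-descriptor idea is roughly this (untested sketch, names made up;
one fd for ordinary cached writes, a second O_DSYNC fd used only for FUA
writes):

#include <fcntl.h>
#include <unistd.h>

int fd_plain, fd_fua;

int open_disk_image(const char *path)
{
    fd_plain = open(path, O_RDWR);             /* ordinary cached writes */
    fd_fua   = open(path, O_RDWR | O_DSYNC);   /* FUA writes only        */
    return (fd_plain < 0 || fd_fua < 0) ? -1 : 0;
}

/* A WRITE FUA then becomes a plain pwrite() on fd_fua: when it returns,
 * the data has been written with synchronized-data-integrity semantics,
 * without forcing a sync of unrelated dirty data. */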
> But if bdrv_aio_fsync also does a flush first then you're going to
> sync _even more_ unnecessarily: the difference between `bdrv_aio_fsync
> does flush first' and `bdrv_aio_fsync does not flush' only affects
> writes are queued but not completed when bdrv_aio_fsync is called.
>
> That is, non-FUA writes which were submitted after the FUA write.
> There is no need to fsync these and that's what I think qemu should
> do.
I agree, that's a clever reason to make bdrv_aio_fsync() guarantee
less rather than more.
(Who knows, that might be the reason SuS doesn't offer a stronger
guarantee too, although I doubt it - if that was serious they might
have defined a more selective sync instead.)
It would be interesting to see if using aio_fsync(O_DSYNC) were slower
or faster than fdatasync() on a range of hosts - just in case the
former syncs previously submitted AIOs and the latter doesn't.
Btw, on Linux aio_fsync(O_DSYNC) does the equivalent of fsync(), not
fdatasync(). This is because Glibc defines O_DSYNC to be the same as
O_SYNC. To get fdatasync(), you have to use the Linux-AIO API and
IOCB_CMD_FDSYNC.
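With the libaio wrappers that's roughly the following (a sketch;
io_prep_fdsync issues IOCB_CMD_FDSYNC under the hood - whether the host
kernel and filesystem actually implement it for a given file is another
matter):

#include <libaio.h>

/* Assumes ctx was set up elsewhere with io_setup() and that completions
 * are collected with io_getevents(). */
int submit_fdatasync(io_context_t ctx, int fd, struct iocb *iocb)
{
    struct iocb *iocbs[1] = { iocb };

    io_prep_fdsync(iocb, fd);
    return io_submit(ctx, 1, iocbs);
}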
> Andrea was making some comments about scsi and virtio. It's possible
> that these have different intended semantics and perhaps those device
> models (in hw/*) need to call flush explicitly before sync.
Or perhaps they would benefit from an async equivalent, so they don't
have to pause and can queue more requests?
-- Jamie
* Re: [Qemu-devel] [PATCH] bdrv_aio_flush
From: Ian Jackson @ 2008-09-03 10:01 UTC
To: qemu-devel
Jamie Lokier writes ("Re: [Qemu-devel] [PATCH] bdrv_aio_flush"):
> Ian Jackson wrote:
> > Yes, certainly you do aio_sync _some_ data that doesn't need to be.
> > Without an O_FSYNC flag on aio_write that's almost inevitable.
>
> Btw, in principle for FUA writes you can set O_SYNC or O_DSYNC on the
> file descriptor just for this operation. Either using fcntl() (but
> I'm not sure I believe that would be portable and really work), or
> using two file descriptors.
You'd have to have two file descriptors anyway because other IO on the
same virtual disk might well be happening concurrently.
> > Andrea was making some comments about scsi and virtio. It's possible
> > that these have different intended semantics and perhaps those device
> > models (in hw/*) need to call flush explicitly before sync.
>
> Or perhaps they would benefit from an async equivalent, so they don't
> have to pause and can queue more requests?
By `flush' and `sync' I meant qemu_aio_flush and bdrv_aio_flush.
The latter is in my patchset and is an aio version of bdrv_flush.
It is unfortunate that the qemu code uses `flush' for `sync' :-).
The hypothetical device I mentioned should probably keep its own list
of its aio operations so that it can start the sync when they're
complete. qemu_aio_flush isn't right because it flushes _all_ aio in
the whole emulator, even to unrelated devices.
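Something like this, perhaps (an untested sketch; it assumes the block
layer declarations from block.h are in scope, and everything except
bdrv_aio_flush, BlockDriverState and BlockDriverCompletionFunc is made
up for illustration):

typedef struct DeviceFlushState {
    BlockDriverState *bs;
    int outstanding;               /* writes this device has in flight */
    int sync_wanted;               /* a guest barrier is waiting       */
    BlockDriverCompletionFunc *cb; /* guest-visible completion         */
    void *opaque;
} DeviceFlushState;

/* Called from this device's own write completion callback. */
static void device_write_done(DeviceFlushState *d)
{
    if (--d->outstanding == 0 && d->sync_wanted) {
        d->sync_wanted = 0;
        bdrv_aio_flush(d->bs, d->cb, d->opaque);  /* sync only our data */
    }
}

/* Called when the guest issues its barrier/flush command. */
static void device_barrier(DeviceFlushState *d,
                           BlockDriverCompletionFunc *cb, void *opaque)
{
    d->cb = cb;
    d->opaque = opaque;
    if (d->outstanding == 0)
        bdrv_aio_flush(d->bs, cb, opaque);
    else
        d->sync_wanted = 1;
}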
Ian.
Thread overview: 14+ messages
2008-08-29 13:37 [Qemu-devel] [PATCH] bdrv_aio_flush Andrea Arcangeli
2008-09-01 11:27 ` Ian Jackson
2008-09-01 12:25 ` Andrea Arcangeli
2008-09-01 13:54 ` Jamie Lokier
2008-09-02 10:52 ` Ian Jackson
2008-09-02 14:25 ` Jens Axboe
2008-09-02 16:49 ` Ian Jackson
2008-09-01 13:25 ` Jamie Lokier
2008-09-02 10:46 ` Ian Jackson
2008-09-02 14:28 ` Jens Axboe
2008-09-02 16:52 ` Ian Jackson
2008-09-02 18:22 ` Jamie Lokier
2008-09-03 10:01 ` Ian Jackson
2008-09-02 18:01 ` Jamie Lokier