qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Andrea Arcangeli <andrea@qumranet.com>
To: qemu-devel@nongnu.org
Subject: [Qemu-devel] [PATCH] bdrv_aio_flush
Date: Fri, 29 Aug 2008 15:37:37 +0200	[thread overview]
Message-ID: <20080829133736.GH24884@duo.random> (raw)

Hello,

while reading the aio/ide code I noticed the bdrv_flush operation is
unsafe. When a write command is submitted with bdrv_aio_write and
later bdrv_flush is called, fsync will do nothing. fsync only sees the
kernel writeback cache. But the write command is still queued in the
aio kernel thread and is still invisible to the kernel. bdrv_aio_flush
will instead see both the regular bdrv_write (that submits data to the
kernel synchronously) as well as the bdrv_aio_write as the fsync will
be queued at the end of the aio queue and it'll be issued by the aio
pthread thread itself.

IDE works by luck because it can only submit one command at once (no
tagged queueing) so supposedly the guest kernel driver will wait the
IDE emulated device to return ready before issuing a journaling
barrier with WIN_FLUSH_CACHE* but with scsi and tagged command
queueing this bug in the aio common code will become visible and it'll
break the journaling guarantees of the guest if there's a power loss
in the host. So it's not urgent for IDE I think, but it clearly should
be fixed in the qemu block model eventually.

This should fix it. Unfortunately I didn't implement all the backends
and I only tested it with KVM so far. Also I don't know for sure if
the BUSY line should get set during the flush, I assumed yes. It was
irrelevant before because the flush was happening atomically before
returning from the ioport write from guest point of view.

Signed-off-by: Andrea Arcangeli <andrea@qumranet.com>

Index: block_int.h
===================================================================
--- block_int.h	(revision 5089)
+++ block_int.h	(working copy)
@@ -54,6 +54,8 @@
     BlockDriverAIOCB *(*bdrv_aio_write)(BlockDriverState *bs,
         int64_t sector_num, const uint8_t *buf, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque);
+    BlockDriverAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque);
     void (*bdrv_aio_cancel)(BlockDriverAIOCB *acb);
     int aiocb_size;
 
Index: block-raw-posix.c
===================================================================
--- block-raw-posix.c	(revision 5089)
+++ block-raw-posix.c	(working copy)
@@ -638,6 +638,21 @@
     return &acb->common;
 }
 
+static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    RawAIOCB *acb;
+
+    acb = raw_aio_setup(bs, 0, NULL, 0, cb, opaque);
+    if (!acb)
+        return NULL;
+    if (aio_fsync(O_DSYNC, &acb->aiocb) < 0) {
+        qemu_aio_release(acb);
+        return NULL;
+    }
+    return &acb->common;
+}
+
 static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs,
         int64_t sector_num, const uint8_t *buf, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque)
@@ -857,6 +872,7 @@
 #ifdef CONFIG_AIO
     .bdrv_aio_read = raw_aio_read,
     .bdrv_aio_write = raw_aio_write,
+    .bdrv_aio_flush = raw_aio_flush,
     .bdrv_aio_cancel = raw_aio_cancel,
     .aiocb_size = sizeof(RawAIOCB),
 #endif
@@ -1211,6 +1227,7 @@
 #ifdef CONFIG_AIO
     .bdrv_aio_read = raw_aio_read,
     .bdrv_aio_write = raw_aio_write,
+    .bdrv_aio_flush = raw_aio_flush,
     .bdrv_aio_cancel = raw_aio_cancel,
     .aiocb_size = sizeof(RawAIOCB),
 #endif
Index: block-qcow2.c
===================================================================
--- block-qcow2.c	(revision 5089)
+++ block-qcow2.c	(working copy)
@@ -1369,6 +1369,42 @@
     return &acb->common;
 }
 
+static void qcow_aio_flush_cb(void *opaque, int ret)
+{
+    QCowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVQcowState *s = bs->opaque;
+
+    acb->hd_aiocb = NULL;
+    if (ret < 0) {
+        acb->common.cb(acb->common.opaque, ret);
+        qemu_aio_release(acb);
+        return;
+    }
+
+    /* request completed */
+    acb->common.cb(acb->common.opaque, 0);
+    qemu_aio_release(acb);
+}
+
+static BlockDriverAIOCB *qcow_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowAIOCB *acb;
+
+    acb = qcow_aio_setup(bs, 0, NULL, 0, cb, opaque);
+    if (!acb)
+        return NULL;
+
+    acb->hd_aiocb = bdrv_aio_flush(s->hd,
+				   qcow_aio_flush_cb, acb);
+    if (acb->hd_aiocb == NULL)
+	    qcow_aio_flush_cb(acb, -ENOMEM);
+
+    return &acb->common;
+}
+
 static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
 {
     QCowAIOCB *acb = (QCowAIOCB *)blockacb;
@@ -2612,6 +2648,7 @@
 
     .bdrv_aio_read = qcow_aio_read,
     .bdrv_aio_write = qcow_aio_write,
+    .bdrv_aio_flush = qcow_aio_flush,
     .bdrv_aio_cancel = qcow_aio_cancel,
     .aiocb_size = sizeof(QCowAIOCB),
     .bdrv_write_compressed = qcow_write_compressed,
Index: block.c
===================================================================
--- block.c	(revision 5089)
+++ block.c	(working copy)
@@ -1181,6 +1181,20 @@
     return ret;
 }
 
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+				 BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BlockDriver *drv = bs->drv;
+    BlockDriverAIOCB *ret;
+
+    if (!drv)
+        return NULL;
+
+    ret = drv->bdrv_aio_flush(bs, cb, opaque);
+
+    return ret;
+}
+
 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
 {
     BlockDriver *drv = acb->bs->drv;
Index: block.h
===================================================================
--- block.h	(revision 5089)
+++ block.h	(working copy)
@@ -87,6 +87,8 @@
 BlockDriverAIOCB *bdrv_aio_write(BlockDriverState *bs, int64_t sector_num,
                                  const uint8_t *buf, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+                                 BlockDriverCompletionFunc *cb, void *opaque);
 void bdrv_aio_cancel(BlockDriverAIOCB *acb);
 
 void qemu_aio_init(void);
Index: hw/ide.c
===================================================================
--- hw/ide.c	(revision 5089)
+++ hw/ide.c	(working copy)
@@ -1969,6 +1969,13 @@
     ide_if[1].select &= ~(1 << 7);
 }
 
+static void ide_flush_dma_cb(void *opaque, int ret)
+{
+    IDEState *s = opaque;
+    s->status = READY_STAT | SEEK_STAT;
+    ide_set_irq(s);
+}
+
 static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
 {
     IDEState *ide_if = opaque;
@@ -2237,8 +2244,11 @@
             break;
         case WIN_FLUSH_CACHE:
         case WIN_FLUSH_CACHE_EXT:
-            if (s->bs)
-                bdrv_flush(s->bs);
+	    if (s->bs) {
+		s->status = READY_STAT | SEEK_STAT | BUSY_STAT;
+		bdrv_aio_flush(s->bs, ide_flush_dma_cb, s);
+		break;
+	    }
 	    s->status = READY_STAT | SEEK_STAT;
             ide_set_irq(s);
             break;

             reply	other threads:[~2008-08-29 14:17 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2008-08-29 13:37 Andrea Arcangeli [this message]
2008-09-01 11:27 ` [Qemu-devel] [PATCH] bdrv_aio_flush Ian Jackson
2008-09-01 12:25   ` Andrea Arcangeli
2008-09-01 13:54     ` Jamie Lokier
2008-09-02 10:52     ` Ian Jackson
2008-09-02 14:25       ` Jens Axboe
2008-09-02 16:49         ` Ian Jackson
2008-09-01 13:25   ` Jamie Lokier
2008-09-02 10:46     ` Ian Jackson
2008-09-02 14:28       ` Jens Axboe
2008-09-02 16:52         ` Ian Jackson
2008-09-02 18:22           ` Jamie Lokier
2008-09-03 10:01             ` Ian Jackson
2008-09-02 18:01       ` Jamie Lokier

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20080829133736.GH24884@duo.random \
    --to=andrea@qumranet.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).