Date: Fri, 29 Aug 2008 15:37:37 +0200
From: Andrea Arcangeli
Subject: [Qemu-devel] [PATCH] bdrv_aio_flush
Message-ID: <20080829133736.GH24884@duo.random>
To: qemu-devel@nongnu.org

Hello,

while reading the aio/ide code I noticed that the bdrv_flush operation is
unsafe. When a write command is submitted with bdrv_aio_write and bdrv_flush
is called later, the fsync does nothing: fsync only sees the kernel writeback
cache, but the write command may still be queued in the userspace aio thread
and hence still be invisible to the kernel.

bdrv_aio_flush instead covers both the regular bdrv_write (which submits data
to the kernel synchronously) and bdrv_aio_write, because the fsync is queued
at the end of the aio queue and issued by the aio pthread itself.

IDE works by luck because it can only submit one command at a time (no tagged
queueing), so presumably the guest kernel driver will wait for the emulated
IDE device to return ready before issuing a journaling barrier with
WIN_FLUSH_CACHE*. But with SCSI and tagged command queueing this bug in the
common aio code will become visible, and it will break the journaling
guarantees of the guest if there is a power loss on the host. So it's not
urgent for IDE I think, but it clearly should be fixed in the qemu block
model eventually.

This patch should fix it. Unfortunately I didn't implement all the backends,
and I have only tested it with KVM so far. Also, I don't know for sure
whether the BUSY line should be set during the flush; I assumed yes. It was
irrelevant before because, from the guest's point of view, the flush happened
atomically before the ioport write returned.
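To make the race concrete, here is a minimal standalone sketch of the two
orderings (not QEMU code; it assumes plain POSIX aio, linked with -lrt on
Linux, and omits error handling):

/* demo.c: why fsync() cannot order against pending userspace aio */
#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
    static char buf[512];
    int fd = open("disk.img", O_RDWR);
    struct aiocb wr = { 0 }, fl = { 0 };

    wr.aio_fildes = fd;
    wr.aio_buf    = buf;
    wr.aio_nbytes = sizeof(buf);
    wr.aio_offset = 0;
    aio_write(&wr);             /* handed off to an aio worker thread */

    /* UNSAFE: fsync() only covers data the kernel has already seen;
     * the aio_write above may still be sitting in the userspace aio
     * queue, so this flush can complete without syncing it */
    fsync(fd);

    /* SAFE: aio_fsync() syncs the operations queued at the time of
     * the call; it runs in the aio thread after the pending write,
     * so its completion implies the data reached the disk */
    fl.aio_fildes = fd;
    aio_fsync(O_DSYNC, &fl);

    while (aio_error(&fl) == EINPROGRESS)
        usleep(1000);           /* poll until the flush completes */

    close(fd);
    return 0;
}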
Signed-off-by: Andrea Arcangeli

Index: block_int.h
===================================================================
--- block_int.h	(revision 5089)
+++ block_int.h	(working copy)
@@ -54,6 +54,8 @@
     BlockDriverAIOCB *(*bdrv_aio_write)(BlockDriverState *bs,
         int64_t sector_num, const uint8_t *buf, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque);
+    BlockDriverAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque);
     void (*bdrv_aio_cancel)(BlockDriverAIOCB *acb);
     int aiocb_size;
 
Index: block-raw-posix.c
===================================================================
--- block-raw-posix.c	(revision 5089)
+++ block-raw-posix.c	(working copy)
@@ -638,6 +638,21 @@
     return &acb->common;
 }
 
+static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    RawAIOCB *acb;
+
+    acb = raw_aio_setup(bs, 0, NULL, 0, cb, opaque);
+    if (!acb)
+        return NULL;
+    if (aio_fsync(O_DSYNC, &acb->aiocb) < 0) {
+        qemu_aio_release(acb);
+        return NULL;
+    }
+    return &acb->common;
+}
+
 static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs,
         int64_t sector_num, const uint8_t *buf, int nb_sectors,
         BlockDriverCompletionFunc *cb, void *opaque)
@@ -857,6 +872,7 @@
 #ifdef CONFIG_AIO
     .bdrv_aio_read = raw_aio_read,
     .bdrv_aio_write = raw_aio_write,
+    .bdrv_aio_flush = raw_aio_flush,
     .bdrv_aio_cancel = raw_aio_cancel,
     .aiocb_size = sizeof(RawAIOCB),
 #endif
@@ -1211,6 +1227,7 @@
 #ifdef CONFIG_AIO
     .bdrv_aio_read = raw_aio_read,
     .bdrv_aio_write = raw_aio_write,
+    .bdrv_aio_flush = raw_aio_flush,
     .bdrv_aio_cancel = raw_aio_cancel,
     .aiocb_size = sizeof(RawAIOCB),
 #endif
Index: block-qcow2.c
===================================================================
--- block-qcow2.c	(revision 5089)
+++ block-qcow2.c	(working copy)
@@ -1369,6 +1369,42 @@
     return &acb->common;
 }
 
+static void qcow_aio_flush_cb(void *opaque, int ret)
+{
+    QCowAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVQcowState *s = bs->opaque;
+
+    acb->hd_aiocb = NULL;
+    if (ret < 0) {
+        acb->common.cb(acb->common.opaque, ret);
+        qemu_aio_release(acb);
+        return;
+    }
+
+    /* request completed */
+    acb->common.cb(acb->common.opaque, 0);
+    qemu_aio_release(acb);
+}
+
+static BlockDriverAIOCB *qcow_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BDRVQcowState *s = bs->opaque;
+    QCowAIOCB *acb;
+
+    acb = qcow_aio_setup(bs, 0, NULL, 0, cb, opaque);
+    if (!acb)
+        return NULL;
+
+    acb->hd_aiocb = bdrv_aio_flush(s->hd,
+                                   qcow_aio_flush_cb, acb);
+    if (acb->hd_aiocb == NULL)
+        qcow_aio_flush_cb(acb, -ENOMEM);
+
+    return &acb->common;
+}
+
 static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
 {
     QCowAIOCB *acb = (QCowAIOCB *)blockacb;
@@ -2612,6 +2648,7 @@
 
     .bdrv_aio_read = qcow_aio_read,
     .bdrv_aio_write = qcow_aio_write,
+    .bdrv_aio_flush = qcow_aio_flush,
     .bdrv_aio_cancel = qcow_aio_cancel,
     .aiocb_size = sizeof(QCowAIOCB),
     .bdrv_write_compressed = qcow_write_compressed,
Index: block.c
===================================================================
--- block.c	(revision 5089)
+++ block.c	(working copy)
@@ -1181,6 +1181,20 @@
     return ret;
 }
 
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque)
+{
+    BlockDriver *drv = bs->drv;
+    BlockDriverAIOCB *ret;
+
+    if (!drv)
+        return NULL;
+
+    ret = drv->bdrv_aio_flush(bs, cb, opaque);
+
+    return ret;
+}
+
 void bdrv_aio_cancel(BlockDriverAIOCB *acb)
 {
     BlockDriver *drv = acb->bs->drv;
Index: block.h
===================================================================
--- block.h	(revision 5089)
+++ block.h	(working copy)
@@ -87,6 +87,8 @@
 BlockDriverAIOCB *bdrv_aio_write(BlockDriverState *bs, int64_t sector_num,
                                  const uint8_t *buf, int nb_sectors,
                                  BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+        BlockDriverCompletionFunc *cb, void *opaque);
 void bdrv_aio_cancel(BlockDriverAIOCB *acb);
 
 void qemu_aio_init(void);
Index: hw/ide.c
===================================================================
--- hw/ide.c	(revision 5089)
+++ hw/ide.c	(working copy)
@@ -1969,6 +1969,13 @@
     ide_if[1].select &= ~(1 << 7);
 }
 
+static void ide_flush_dma_cb(void *opaque, int ret)
+{
+    IDEState *s = opaque;
+    s->status = READY_STAT | SEEK_STAT;
+    ide_set_irq(s);
+}
+
 static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
 {
     IDEState *ide_if = opaque;
@@ -2237,8 +2244,11 @@
         break;
     case WIN_FLUSH_CACHE:
    case WIN_FLUSH_CACHE_EXT:
-        if (s->bs)
-            bdrv_flush(s->bs);
+        if (s->bs) {
+            s->status = READY_STAT | SEEK_STAT | BUSY_STAT;
+            bdrv_aio_flush(s->bs, ide_flush_dma_cb, s);
+            break;
+        }
         s->status = READY_STAT | SEEK_STAT;
         ide_set_irq(s);
         break;
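
One caveat about the common code entry point, since not all backends grow the
new hook in this patch: bdrv_aio_flush() in block.c calls drv->bdrv_aio_flush
unconditionally, so a format driver that hasn't been converted yet would
crash there. A defensive variant (just a sketch, not part of the patch above)
could check the hook too:

/* sketch: fail gracefully when the driver lacks the new hook */
BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
        BlockDriverCompletionFunc *cb, void *opaque)
{
    BlockDriver *drv = bs->drv;

    if (!drv || !drv->bdrv_aio_flush)
        return NULL;

    return drv->bdrv_aio_flush(bs, cb, opaque);
}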