* [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush
@ 2008-04-01 17:18 Ian Jackson
2008-04-01 17:46 ` Paul Brook
` (2 more replies)
0 siblings, 3 replies; 15+ messages in thread
From: Ian Jackson @ 2008-04-01 17:18 UTC (permalink / raw)
To: qemu-devel
[-- Attachment #1: message body text --]
[-- Type: text/plain, Size: 606 bytes --]
This patch does the following, as previously discussed here:
* Makes guests able to control the caching of emulated IDE disks
using the SETFEATURES 0x02 and 0x82 commands.
* Introduces a new block entrypoint bdrv_aio_flush for asynchronous
flushing of the write cache.
* Makes IDE FLUSH_CACHE and FLUSH_CACHE_EXT use the new asynchronous
function so that guests can continue to execute, receive timer
interrupts, etc.
* Does likewise for the SCSI FLUSH CACHE command.
I think this should address all of the comments.
Regards,
Ian.
Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
[-- Attachment #2: block driver cache improvements --]
[-- Type: text/plain, Size: 16466 bytes --]
diff --git a/block-qcow.c b/block-qcow.c
index 59f1c66..4798ade 100644
--- a/block-qcow.c
+++ b/block-qcow.c
@@ -725,6 +725,13 @@ static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
qemu_aio_release(acb);
}
+static BlockDriverAIOCB *qcow_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVQcowState *s = bs->opaque;
+ return bdrv_aio_flush(s->hd, cb, opaque);
+}
+
static void qcow_close(BlockDriverState *bs)
{
BDRVQcowState *s = bs->opaque;
@@ -913,6 +920,7 @@ BlockDriver bdrv_qcow = {
.bdrv_aio_read = qcow_aio_read,
.bdrv_aio_write = qcow_aio_write,
.bdrv_aio_cancel = qcow_aio_cancel,
+ .bdrv_aio_flush = qcow_aio_flush,
.aiocb_size = sizeof(QCowAIOCB),
.bdrv_write_compressed = qcow_write_compressed,
.bdrv_get_info = qcow_get_info,
diff --git a/block-qcow2.c b/block-qcow2.c
index 9bad090..9c63a63 100644
--- a/block-qcow2.c
+++ b/block-qcow2.c
@@ -1007,6 +1007,13 @@ static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
qemu_aio_release(acb);
}
+static BlockDriverAIOCB *qcow_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVQcowState *s = bs->opaque;
+ return bdrv_aio_flush(s->hd, cb, opaque);
+}
+
static void qcow_close(BlockDriverState *bs)
{
BDRVQcowState *s = bs->opaque;
@@ -2241,6 +2248,7 @@ BlockDriver bdrv_qcow2 = {
.bdrv_aio_read = qcow_aio_read,
.bdrv_aio_write = qcow_aio_write,
.bdrv_aio_cancel = qcow_aio_cancel,
+ .bdrv_aio_flush = qcow_aio_flush,
.aiocb_size = sizeof(QCowAIOCB),
.bdrv_write_compressed = qcow_write_compressed,
diff --git a/block-raw-posix.c b/block-raw-posix.c
index 45d0a93..71e304d 100644
--- a/block-raw-posix.c
+++ b/block-raw-posix.c
@@ -429,6 +429,20 @@ static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs,
return &acb->common;
}
+static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ RawAIOCB *acb;
+
+ acb = raw_aio_setup(bs, 0, NULL, 0, cb, opaque);
+ if (!acb)
+ return NULL;
+ if (aio_fsync(O_SYNC, &acb->aiocb) < 0) {
+ qemu_aio_release(acb);
+ return NULL;
+ }
+ return &acb->common;
+}
static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
{
int ret;
@@ -762,6 +776,7 @@ static int fd_open(BlockDriverState *bs)
{
return 0;
}
+
#endif
#if defined(__linux__)
@@ -913,6 +928,7 @@ BlockDriver bdrv_host_device = {
.bdrv_aio_read = raw_aio_read,
.bdrv_aio_write = raw_aio_write,
.bdrv_aio_cancel = raw_aio_cancel,
+ .bdrv_aio_flush = raw_aio_flush,
.aiocb_size = sizeof(RawAIOCB),
.bdrv_pread = raw_pread,
.bdrv_pwrite = raw_pwrite,
diff --git a/block.c b/block.c
index fc32a60..348a215 100644
--- a/block.c
+++ b/block.c
@@ -50,6 +50,8 @@ static BlockDriverAIOCB *bdrv_aio_read_em(BlockDriverState *bs,
static BlockDriverAIOCB *bdrv_aio_write_em(BlockDriverState *bs,
int64_t sector_num, const uint8_t *buf, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque);
+static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque);
static void bdrv_aio_cancel_em(BlockDriverAIOCB *acb);
static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors);
@@ -137,6 +139,8 @@ static void bdrv_register(BlockDriver *bdrv)
bdrv->bdrv_read = bdrv_read_em;
bdrv->bdrv_write = bdrv_write_em;
}
+ if (!bdrv->bdrv_aio_flush)
+ bdrv->bdrv_aio_flush = bdrv_aio_flush_em;
bdrv->next = first_drv;
first_drv = bdrv;
}
@@ -1156,6 +1160,17 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
drv->bdrv_aio_cancel(acb);
}
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (!drv)
+ return NULL;
+
+ return drv->bdrv_aio_flush(bs, cb, opaque);
+}
+
/**************************************************************/
/* async block device emulation */
@@ -1181,6 +1196,15 @@ static BlockDriverAIOCB *bdrv_aio_write_em(BlockDriverState *bs,
return NULL;
}
+static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ int ret;
+ ret = bdrv_flush(bs);
+ cb(opaque, ret);
+ return NULL;
+}
+
static void bdrv_aio_cancel_em(BlockDriverAIOCB *acb)
{
}
@@ -1224,6 +1248,21 @@ static BlockDriverAIOCB *bdrv_aio_write_em(BlockDriverState *bs,
return &acb->common;
}
+static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BlockDriverAIOCBSync *acb;
+ int ret;
+
+ acb = qemu_aio_get(bs, cb, opaque);
+ if (!acb->bh)
+ acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
+ ret = bdrv_flush(bs);
+ acb->ret = ret;
+ qemu_bh_schedule(acb->bh);
+ return &acb->common;
+}
+
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
BlockDriverAIOCBSync *acb = (BlockDriverAIOCBSync *)blockacb;
diff --git a/block.h b/block.h
index 861a65b..0ecb0f8 100644
--- a/block.h
+++ b/block.h
@@ -86,6 +86,8 @@ BlockDriverAIOCB *bdrv_aio_read(BlockDriverState *bs, int64_t sector_num,
BlockDriverAIOCB *bdrv_aio_write(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque);
void bdrv_aio_cancel(BlockDriverAIOCB *acb);
void qemu_aio_init(void);
diff --git a/block_int.h b/block_int.h
index 796ce29..bda0eab 100644
--- a/block_int.h
+++ b/block_int.h
@@ -55,6 +55,8 @@ struct BlockDriver {
int64_t sector_num, const uint8_t *buf, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque);
void (*bdrv_aio_cancel)(BlockDriverAIOCB *acb);
+ BlockDriverAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque);
int aiocb_size;
const char *protocol_name;
diff --git a/hw/ide.c b/hw/ide.c
index b73dba2..8348f43 100644
--- a/hw/ide.c
+++ b/hw/ide.c
@@ -383,6 +383,7 @@ typedef struct IDEState {
PCIDevice *pci_dev;
struct BMDMAState *bmdma;
int drive_serial;
+ uint8_t write_cache;
/* ide regs */
uint8_t feature;
uint8_t error;
@@ -559,7 +560,8 @@ static void ide_identify(IDEState *s)
put_le16(p + 68, 120);
put_le16(p + 80, 0xf0); /* ata3 -> ata6 supported */
put_le16(p + 81, 0x16); /* conforms to ata5 */
- put_le16(p + 82, (1 << 14));
+ /* 14=nop 5=write_cache */
+ put_le16(p + 82, (1 << 14) | (1 << 5));
/* 13=flush_cache_ext,12=flush_cache,10=lba48 */
put_le16(p + 83, (1 << 14) | (1 << 13) | (1 <<12) | (1 << 10));
put_le16(p + 84, (1 << 14));
@@ -722,6 +724,7 @@ static inline void ide_abort_command(IDEState *s)
static inline void ide_set_irq(IDEState *s)
{
BMDMAState *bm = s->bmdma;
+ if (!s->bs) return; /* yikes */
if (!(s->cmd & IDE_CMD_DISABLE_IRQ)) {
if (bm) {
bm->status |= BM_STATUS_INT;
@@ -901,6 +904,8 @@ static void ide_read_dma_cb(void *opaque, int ret)
return;
}
+ if (!s->bs) return; /* yikes */
+
n = s->io_buffer_size >> 9;
sector_num = ide_get_sector(s);
if (n > 0) {
@@ -965,6 +970,9 @@ static void ide_sector_write(IDEState *s)
if (n > s->req_nb_sectors)
n = s->req_nb_sectors;
ret = bdrv_write(s->bs, sector_num, s->io_buffer, n);
+ if (ret == 0 && !s->write_cache) {
+ ret = bdrv_flush(s->bs);
+ }
if (ret != 0) {
ide_rw_error(s);
return;
@@ -1011,6 +1019,8 @@ static void ide_write_dma_cb(void *opaque, int ret)
return;
}
+ if (!s->bs) return; /* yikes */
+
n = s->io_buffer_size >> 9;
sector_num = ide_get_sector(s);
if (n > 0) {
@@ -1021,6 +1031,13 @@ static void ide_write_dma_cb(void *opaque, int ret)
/* end of transfer ? */
if (s->nsector == 0) {
+ if (!s->write_cache) {
+ ret = bdrv_flush(s->bs);
+ if (ret != 0) {
+ ide_dma_error(s);
+ return;
+ }
+ }
s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s);
eot:
@@ -1056,6 +1073,39 @@ static void ide_sector_write_dma(IDEState *s)
ide_dma_start(s, ide_write_dma_cb);
}
+static void ide_device_utterly_broken(IDEState *s) {
+ s->status |= BUSY_STAT;
+ s->bs = NULL;
+ /* This prevents all future commands from working. All of the
+ * asynchronous callbacks (and ide_set_irq, as a safety measure)
+ * check to see whether this has happened and bail if so.
+ */
+}
+
+static void ide_flush_cb(void *opaque, int ret)
+{
+ IDEState *s = opaque;
+
+ if (!s->bs) return; /* yikes */
+
+ if (ret) {
+ /* We are completely doomed. The IDE spec does not permit us
+ * to return an error from a flush except via a protocol which
+ * requires us to say where the error is and which
+ * contemplates the guest repeating the flush attempt to
+ * attempt flush the remaining data. We can't support that
+ * because f(data)sync (which is what the block drivers use
+ * eventually) doesn't report the necessary information or
+ * give us the necessary control. So we make the disk vanish.
+ */
+ ide_device_utterly_broken(s);
+ return;
+ }
+ else
+ s->status = READY_STAT;
+ ide_set_irq(s);
+}
+
static void ide_atapi_cmd_ok(IDEState *s)
{
s->error = 0;
@@ -1282,6 +1332,8 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret)
IDEState *s = bm->ide_if;
int data_offset, n;
+ if (!s->bs) return; /* yikes */
+
if (ret < 0) {
ide_atapi_io_error(s, ret);
goto eot;
@@ -1860,6 +1912,8 @@ static void cdrom_change_cb(void *opaque)
IDEState *s = opaque;
uint64_t nb_sectors;
+ if (!s->bs) return; /* yikes */
+
/* XXX: send interrupt too */
bdrv_get_geometry(s->bs, &nb_sectors);
s->nb_sectors = nb_sectors;
@@ -1963,8 +2017,8 @@ static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
printf("ide: CMD=%02x\n", val);
#endif
s = ide_if->cur_drive;
- /* ignore commands to non existant slave */
- if (s != ide_if && !s->bs)
+ /* ignore commands to non existant device */
+ if (!s->bs)
break;
switch(val) {
@@ -2102,8 +2156,6 @@ static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
switch(s->feature) {
case 0xcc: /* reverting to power-on defaults enable */
case 0x66: /* reverting to power-on defaults disable */
- case 0x02: /* write cache enable */
- case 0x82: /* write cache disable */
case 0xaa: /* read look-ahead enable */
case 0x55: /* read look-ahead disable */
case 0x05: /* set advanced power management mode */
@@ -2117,6 +2169,18 @@ static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s);
break;
+ case 0x02: /* write cache enable */
+ s->write_cache = 1;
+ s->status = READY_STAT | SEEK_STAT;
+ ide_set_irq(s);
+ break;
+ case 0x82: /* write cache disable */
+ s->write_cache = 0;
+ ret = bdrv_flush(s->bs);
+ if (ret != 0) goto abort_cmd;
+ s->status = READY_STAT | SEEK_STAT;
+ ide_set_irq(s);
+ break;
case 0x03: { /* set transfer mode */
uint8_t val = s->nsector & 0x07;
@@ -2147,12 +2211,8 @@ static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
break;
case WIN_FLUSH_CACHE:
case WIN_FLUSH_CACHE_EXT:
- if (s->bs) {
- ret = bdrv_flush(s->bs);
- if (ret) goto abort_cmd;
- }
- s->status = READY_STAT;
- ide_set_irq(s);
+ s->status = BUSY_STAT;
+ bdrv_aio_flush(s->bs, ide_flush_cb, s);
break;
case WIN_STANDBY:
case WIN_STANDBY2:
@@ -2629,6 +2689,7 @@ static void ide_init2(IDEState *ide_state,
s->irq = irq;
s->sector_write_timer = qemu_new_timer(vm_clock,
ide_sector_write_timer_cb, s);
+ s->write_cache = 0;
ide_reset(s);
}
}
@@ -2657,6 +2718,7 @@ static void ide_save(QEMUFile* f, IDEState *s)
if (s->identify_set) {
qemu_put_buffer(f, (const uint8_t *)s->identify_data, 512);
}
+ qemu_put_8s(f, &s->write_cache);
qemu_put_8s(f, &s->feature);
qemu_put_8s(f, &s->error);
qemu_put_be32s(f, &s->nsector);
@@ -2678,13 +2740,15 @@ static void ide_save(QEMUFile* f, IDEState *s)
}
/* load per IDE drive data */
-static void ide_load(QEMUFile* f, IDEState *s)
+static void ide_load(QEMUFile* f, IDEState *s, int version_id)
{
s->mult_sectors=qemu_get_be32(f);
s->identify_set=qemu_get_be32(f);
if (s->identify_set) {
qemu_get_buffer(f, (uint8_t *)s->identify_data, 512);
}
+ if (version_id >= 2)
+ qemu_get_8s(f, &s->write_cache);
qemu_get_8s(f, &s->feature);
qemu_get_8s(f, &s->error);
qemu_get_be32s(f, &s->nsector);
@@ -3029,7 +3093,7 @@ static int pci_ide_load(QEMUFile* f, void *opaque, int version_id)
PCIIDEState *d = opaque;
int ret, i;
- if (version_id != 1)
+ if (version_id != 1 && version_id != 2)
return -EINVAL;
ret = pci_device_load(&d->dev, f);
if (ret < 0)
@@ -3054,7 +3118,7 @@ static int pci_ide_load(QEMUFile* f, void *opaque, int version_id)
/* per IDE drive data */
for(i = 0; i < 4; i++) {
- ide_load(f, &d->ide_if[i]);
+ ide_load(f, &d->ide_if[i], version_id);
}
return 0;
}
@@ -3105,7 +3169,7 @@ void pci_piix3_ide_init(PCIBus *bus, BlockDriverState **hd_table, int devfn,
ide_init_ioport(&d->ide_if[0], 0x1f0, 0x3f6);
ide_init_ioport(&d->ide_if[2], 0x170, 0x376);
- register_savevm("ide", 0, 1, pci_ide_save, pci_ide_load, d);
+ register_savevm("ide", 0, 2, pci_ide_save, pci_ide_load, d);
}
/* hd_table must contain 4 block drivers */
@@ -3143,7 +3207,7 @@ void pci_piix4_ide_init(PCIBus *bus, BlockDriverState **hd_table, int devfn,
ide_init_ioport(&d->ide_if[0], 0x1f0, 0x3f6);
ide_init_ioport(&d->ide_if[2], 0x170, 0x376);
- register_savevm("ide", 0, 1, pci_ide_save, pci_ide_load, d);
+ register_savevm("ide", 0, 2, pci_ide_save, pci_ide_load, d);
}
/***********************************************************/
@@ -3577,7 +3641,7 @@ static int md_load(QEMUFile *f, void *opaque, int version_id)
s->ide->cur_drive = &s->ide[(drive1_selected != 0)];
for (i = 0; i < 2; i ++)
- ide_load(f, &s->ide[i]);
+ ide_load(f, &s->ide[i], version_id);
return 0;
}
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 30cc906..99aa259 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -276,6 +276,16 @@ static uint8_t *scsi_get_buf(SCSIDevice *d, uint32_t tag)
return r->dma_buf;
}
+static void scsi_flush_cb(void *opaque, int ret) {
+ SCSIRequest *r = opaque;
+ if (ret) {
+ BADF("scsi-disc: IO error on flush: %s\n", strerror(-ret));
+ scsi_command_complete(r, SENSE_HARDWARE_ERROR);
+ } else {
+ scsi_command_complete(r, SENSE_NO_SENSE);
+ }
+}
+
/* Execute a scsi command. Returns the length of the data expected by the
command. This will be Positive for data transfers from the device
(eg. disk reads), negative for transfers to the device (eg. disk writes),
@@ -621,13 +631,8 @@ static int32_t scsi_send_command(SCSIDevice *d, uint32_t tag,
break;
case 0x35:
DPRINTF("Synchronise cache (sector %d, count %d)\n", lba, len);
- ret = bdrv_flush(s->bdrv);
- if (ret) {
- DPRINTF("IO error on bdrv_flush\n");
- scsi_command_complete(r, SENSE_HARDWARE_ERROR);
- return 0;
- }
- break;
+ bdrv_aio_flush(s->bdrv, scsi_flush_cb, r);
+ return 0;
case 0x43:
{
int start_track, format, msf, toclen;
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush
2008-04-01 17:18 [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush Ian Jackson
@ 2008-04-01 17:46 ` Paul Brook
2008-04-01 18:45 ` Jamie Lokier
2008-04-02 16:27 ` Ian Jackson
2008-04-01 18:07 ` Samuel Thibault
2008-04-02 16:31 ` [Qemu-devel] " Ian Jackson
2 siblings, 2 replies; 15+ messages in thread
From: Paul Brook @ 2008-04-01 17:46 UTC (permalink / raw)
To: qemu-devel; +Cc: Ian Jackson
On Tuesday 01 April 2008, Ian Jackson wrote:
> + if (!s->bs) return; /* yikes */
This comment is distinctly unhelpful. I guess this is the condition mentioned
in ide_device_utterly_broken, but it's hard to tell.
> @@ -1021,6 +1031,13 @@ static void ide_write_dma_cb(void *opaque, int ret)
>
> /* end of transfer ? */
> if (s->nsector == 0) {
> + if (!s->write_cache) {
> + ret = bdrv_flush(s->bs);
> + if (ret != 0) {
> + ide_dma_error(s);
> + return;
> + }
> + }
By my reading this is adding a synchronous flush to the end of an async write
operation, which in practice makes the whole operation synchronous.
> +static void scsi_flush_cb(void *opaque, int ret) {
> + SCSIRequest *r = opaque;
> + if (ret) {
> + BADF("scsi-disc: IO error on flush: %s\n", strerror(-ret));
This is wrong for two reasons: BADF already adds a suitable prefix, and is for
reporting qemu bugs (i.e. missing features or things that should never
happen). This is just a normal IO error, which we correctly report to the
guest.
Paul
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush
2008-04-01 17:18 [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush Ian Jackson
2008-04-01 17:46 ` Paul Brook
@ 2008-04-01 18:07 ` Samuel Thibault
2008-04-02 14:21 ` Ian Jackson
2008-04-02 16:31 ` [Qemu-devel] " Ian Jackson
2 siblings, 1 reply; 15+ messages in thread
From: Samuel Thibault @ 2008-04-01 18:07 UTC (permalink / raw)
To: qemu-devel
Hello,
Oh, by the way, shouldn't this:
Ian Jackson, le Tue 01 Apr 2008 18:18:59 +0100, a écrit :
> put_le16(p + 81, 0x16); /* conforms to ata5 */
> - put_le16(p + 82, (1 << 14));
> + /* 14=nop 5=write_cache */
> + put_le16(p + 82, (1 << 14) | (1 << 5));
> /* 13=flush_cache_ext,12=flush_cache,10=lba48 */
use s->write_cache instead of always 1? Else when using hdparm -i one
would think that write cache is always enabled.
Samuel
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush
2008-04-01 17:46 ` Paul Brook
@ 2008-04-01 18:45 ` Jamie Lokier
2008-04-02 12:26 ` Paul Brook
2008-04-02 16:27 ` Ian Jackson
1 sibling, 1 reply; 15+ messages in thread
From: Jamie Lokier @ 2008-04-01 18:45 UTC (permalink / raw)
To: qemu-devel; +Cc: Ian Jackson
Paul Brook wrote:
> > @@ -1021,6 +1031,13 @@ static void ide_write_dma_cb(void *opaque, int ret)
> >
> > /* end of transfer ? */
> > if (s->nsector == 0) {
> > + if (!s->write_cache) {
> > + ret = bdrv_flush(s->bs);
> > + if (ret != 0) {
> > + ide_dma_error(s);
> > + return;
> > + }
> > + }
>
> By my reading this is adding a synchronous flush to the end of an
> async write operation, which in practice makes the whole operation
> synchronous.
Looks that way to me too. It might be simplest to open the device
with O_DSYNC when !s->write_cache and user actually wants fdatasync,
so that async write can be used. I suspect every platform with useful
Posix AIO has O_DSYNC.
Otherwise, chaining where the completion of aio_write triggers
aio_fsync instead of reporting completion to the guest?
-- Jamie
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush
2008-04-01 18:45 ` Jamie Lokier
@ 2008-04-02 12:26 ` Paul Brook
2008-04-02 15:23 ` Jamie Lokier
0 siblings, 1 reply; 15+ messages in thread
From: Paul Brook @ 2008-04-02 12:26 UTC (permalink / raw)
To: qemu-devel; +Cc: Ian Jackson
On Tuesday 01 April 2008, Jamie Lokier wrote:
> Paul Brook wrote:
> > > @@ -1021,6 +1031,13 @@ static void ide_write_dma_cb(void *opaque, int
> > > ret)
> > > /* end of transfer ? */
> > > if (s->nsector == 0) {
> > > + if (!s->write_cache) {
> > > + ret = bdrv_flush(s->bs);
> > > + if (ret != 0) {
> > > + ide_dma_error(s);
> > > + return;
> > > + }
> > > + }
> >
> > By my reading this is adding a synchronous flush to the end of an
> > async write operation, which in practice makes the whole operation
> > synchronous.
>
> Looks that way to me too. It might be simplest to open the device
> with O_DSYNC when !s->write_cache and user actually wants fdatasync,
> so that async write can be used. I suspect every platform with useful
> Posix AIO has O_DSYNC.
The cache is dynamically enabled/disabled by the target. This means you've got
to close and reopen the file every time it changes, which is likely to get
really hairy.
> Otherwise, chaining where the completion of aio_write triggers
> aio_fsync instead of reporting completion to the guest?
Yes, that should do it.
Paul
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush
2008-04-01 18:07 ` Samuel Thibault
@ 2008-04-02 14:21 ` Ian Jackson
2008-04-04 14:29 ` Samuel Thibault
0 siblings, 1 reply; 15+ messages in thread
From: Ian Jackson @ 2008-04-02 14:21 UTC (permalink / raw)
To: qemu-devel
Samuel Thibault writes ("Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush"):
> Oh, by the way, shouldn't this:
> Ian Jackson, le Tue 01 Apr 2008 18:18:59 +0100, a écrit :
> > put_le16(p + 81, 0x16); /* conforms to ata5 */
> > - put_le16(p + 82, (1 << 14));
> > + /* 14=nop 5=write_cache */
> > + put_le16(p + 82, (1 << 14) | (1 << 5));
> > /* 13=flush_cache_ext,12=flush_cache,10=lba48 */
>
> use s->write_cache instead of always 1? Else when using hdparm -i one
> would think that write cache is always enabled.
No. According to the ATA-7 draft I have in front of me, the meaning
of word 82 is `Command set supported' and the meaning of bit 5 is
`write cache supported'.
Word 85 bit 5 is `write cache enabled'. I see that the patch I
supplied earlier is missing this, which I will fix.
Ian.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush
2008-04-02 12:26 ` Paul Brook
@ 2008-04-02 15:23 ` Jamie Lokier
2008-04-02 16:20 ` Ian Jackson
0 siblings, 1 reply; 15+ messages in thread
From: Jamie Lokier @ 2008-04-02 15:23 UTC (permalink / raw)
To: Paul Brook; +Cc: Ian Jackson, qemu-devel
Paul Brook wrote:
> > Looks that way to me too. It might be simplest to open the device
> > with O_DSYNC when !s->write_cache and user actually wants fdatasync,
> > so that async write can be used. I suspect every platform with useful
> > Posix AIO has O_DSYNC.
>
> The cache is dynamically enabled/disabled by the target. This means
> you've got to close and repopen the file every time it changes,
> which is likely to get really hairy.
Just open two descriptors :-)
I'm not sure if F_SETFL can be used.
Both descriptors are useful with the cache enabled, if the SATA FUA
(force unit access) bit is set on a write command. Only those writes
would use the O_DSYNC descriptor.
> > Otherwise, chaining where the completion of aio_write triggers
> > aio_fsync instead of reporting completion to the guest?
>
> Yes, that should do it.
Yes, though O_DSYNC will save a system call.
-- Jamie
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush
2008-04-02 15:23 ` Jamie Lokier
@ 2008-04-02 16:20 ` Ian Jackson
2008-04-03 11:38 ` Jamie Lokier
0 siblings, 1 reply; 15+ messages in thread
From: Ian Jackson @ 2008-04-02 16:20 UTC (permalink / raw)
To: Jamie Lokier; +Cc: Paul Brook, qemu-devel
Jamie Lokier writes ("Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush"):
> Paul Brook wrote:
> > The cache is dynamically enabled/disabled by the target. This means
> > you've got to close and repopen the file every time it changes,
> > which is likely to get really hairy.
>
> Just open two descriptors :-)
Doing it with bdrv_aio_flush wasn't too hard, so that's what'll be in
my revised patch shortly.
(The original patch I'm porting from xen-unstable predates my
introduction of bdrv_aio_flush, hence the synchronous flush at that
point.)
Ian.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush
2008-04-01 17:46 ` Paul Brook
2008-04-01 18:45 ` Jamie Lokier
@ 2008-04-02 16:27 ` Ian Jackson
2008-04-03 0:40 ` Paul Brook
1 sibling, 1 reply; 15+ messages in thread
From: Ian Jackson @ 2008-04-02 16:27 UTC (permalink / raw)
To: Paul Brook; +Cc: qemu-devel
Paul Brook writes ("Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush"):
> On Tuesday 01 April 2008, Ian Jackson wrote:
> > +static void scsi_flush_cb(void *opaque, int ret) {
> > + SCSIRequest *r = opaque;
> > + if (ret) {
> > + BADF("scsi-disc: IO error on flush: %s\n", strerror(-ret));
>
> This is wrong for two reasons: BADF already adds a suitable prefix,
> and is for reporting qemu bugs (i.e. missing features or things that
> should never happen). This is just a normal IO error, which we
> correctly report to the guest.
I don't think flush failures are a normal IO error. They can only
occur when the corresponding host block device is having serious
trouble. Normally (if it's actually a host disk) this is very bad.
The principle I'm trying to follow with this is to not throw away the
errno value. When we turn a block device write failure (which might
be EIO or perhaps something weirder like EBADF indicating a bug in
qemu) into a SCSI error, we don't have the ability to properly report
that errno value to the guest. So we should log it somewhere.
In a previous thread on this subject we seemed to conclude that qemu's
stderr is fine for this purpose. But if not then I'm happy to be
advised to send it somewhere else ...
You're right about the prefix though (and it was the wrong prefix,
too, with a variant spelling of `disk').
Ian.
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Qemu-devel] Re: [PATCH] Asynchronous reliable and configurable cache flush
2008-04-01 17:18 [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush Ian Jackson
2008-04-01 17:46 ` Paul Brook
2008-04-01 18:07 ` Samuel Thibault
@ 2008-04-02 16:31 ` Ian Jackson
2 siblings, 0 replies; 15+ messages in thread
From: Ian Jackson @ 2008-04-02 16:31 UTC (permalink / raw)
To: qemu-devel
[-- Attachment #1: message body text --]
[-- Type: text/plain, Size: 826 bytes --]
This patch does the following, as previously discussed here:
* Makes guests able to control the caching of emulated IDE disks
using the SETFEATURES 0x02 and 0x82 commands.
* Introduces a new block entrypoint bdrv_aio_flush for asynchronous
flushing of the write cache.
* Makes IDE FLUSH_CACHE and FLUSH_CACHE_EXT use the new asynchronous
function so that guests can continue to execute, receive timer
interrupts, etc.
* Does likewise for the SCSI FLUSH CACHE command.
* Removes some clone-and-hackery in DMA completions so that the
code can be reused for the flush due to dma writes with write_cache
turned off.
Thanks for the review and comments, particularly to Paul Brook.
Ian.
Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com>
Signed-off-by: Samuel Thibault <samuel.thibault@eu.citrix.com>
[-- Attachment #2: qemu block driver caching and flushing improvements --]
[-- Type: text/plain, Size: 18472 bytes --]
diff --git a/block-qcow.c b/block-qcow.c
index 59f1c66..4798ade 100644
--- a/block-qcow.c
+++ b/block-qcow.c
@@ -725,6 +725,13 @@ static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
qemu_aio_release(acb);
}
+static BlockDriverAIOCB *qcow_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVQcowState *s = bs->opaque;
+ return bdrv_aio_flush(s->hd, cb, opaque);
+}
+
static void qcow_close(BlockDriverState *bs)
{
BDRVQcowState *s = bs->opaque;
@@ -913,6 +920,7 @@ BlockDriver bdrv_qcow = {
.bdrv_aio_read = qcow_aio_read,
.bdrv_aio_write = qcow_aio_write,
.bdrv_aio_cancel = qcow_aio_cancel,
+ .bdrv_aio_flush = qcow_aio_flush,
.aiocb_size = sizeof(QCowAIOCB),
.bdrv_write_compressed = qcow_write_compressed,
.bdrv_get_info = qcow_get_info,
diff --git a/block-qcow2.c b/block-qcow2.c
index 9bad090..9c63a63 100644
--- a/block-qcow2.c
+++ b/block-qcow2.c
@@ -1007,6 +1007,13 @@ static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
qemu_aio_release(acb);
}
+static BlockDriverAIOCB *qcow_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BDRVQcowState *s = bs->opaque;
+ return bdrv_aio_flush(s->hd, cb, opaque);
+}
+
static void qcow_close(BlockDriverState *bs)
{
BDRVQcowState *s = bs->opaque;
@@ -2241,6 +2248,7 @@ BlockDriver bdrv_qcow2 = {
.bdrv_aio_read = qcow_aio_read,
.bdrv_aio_write = qcow_aio_write,
.bdrv_aio_cancel = qcow_aio_cancel,
+ .bdrv_aio_flush = qcow_aio_flush,
.aiocb_size = sizeof(QCowAIOCB),
.bdrv_write_compressed = qcow_write_compressed,
diff --git a/block-raw-posix.c b/block-raw-posix.c
index 45d0a93..71e304d 100644
--- a/block-raw-posix.c
+++ b/block-raw-posix.c
@@ -429,6 +429,20 @@ static BlockDriverAIOCB *raw_aio_write(BlockDriverState *bs,
return &acb->common;
}
+static BlockDriverAIOCB *raw_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ RawAIOCB *acb;
+
+ acb = raw_aio_setup(bs, 0, NULL, 0, cb, opaque);
+ if (!acb)
+ return NULL;
+ if (aio_fsync(O_SYNC, &acb->aiocb) < 0) {
+ qemu_aio_release(acb);
+ return NULL;
+ }
+ return &acb->common;
+}
static void raw_aio_cancel(BlockDriverAIOCB *blockacb)
{
int ret;
@@ -762,6 +776,7 @@ static int fd_open(BlockDriverState *bs)
{
return 0;
}
+
#endif
#if defined(__linux__)
@@ -913,6 +928,7 @@ BlockDriver bdrv_host_device = {
.bdrv_aio_read = raw_aio_read,
.bdrv_aio_write = raw_aio_write,
.bdrv_aio_cancel = raw_aio_cancel,
+ .bdrv_aio_flush = raw_aio_flush,
.aiocb_size = sizeof(RawAIOCB),
.bdrv_pread = raw_pread,
.bdrv_pwrite = raw_pwrite,
diff --git a/block.c b/block.c
index fc32a60..348a215 100644
--- a/block.c
+++ b/block.c
@@ -50,6 +50,8 @@ static BlockDriverAIOCB *bdrv_aio_read_em(BlockDriverState *bs,
static BlockDriverAIOCB *bdrv_aio_write_em(BlockDriverState *bs,
int64_t sector_num, const uint8_t *buf, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque);
+static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque);
static void bdrv_aio_cancel_em(BlockDriverAIOCB *acb);
static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
uint8_t *buf, int nb_sectors);
@@ -137,6 +139,8 @@ static void bdrv_register(BlockDriver *bdrv)
bdrv->bdrv_read = bdrv_read_em;
bdrv->bdrv_write = bdrv_write_em;
}
+ if (!bdrv->bdrv_aio_flush)
+ bdrv->bdrv_aio_flush = bdrv_aio_flush_em;
bdrv->next = first_drv;
first_drv = bdrv;
}
@@ -1156,6 +1160,17 @@ void bdrv_aio_cancel(BlockDriverAIOCB *acb)
drv->bdrv_aio_cancel(acb);
}
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BlockDriver *drv = bs->drv;
+
+ if (!drv)
+ return NULL;
+
+ return drv->bdrv_aio_flush(bs, cb, opaque);
+}
+
/**************************************************************/
/* async block device emulation */
@@ -1181,6 +1196,15 @@ static BlockDriverAIOCB *bdrv_aio_write_em(BlockDriverState *bs,
return NULL;
}
+static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ int ret;
+ ret = bdrv_flush(bs);
+ cb(opaque, ret);
+ return NULL;
+}
+
static void bdrv_aio_cancel_em(BlockDriverAIOCB *acb)
{
}
@@ -1224,6 +1248,21 @@ static BlockDriverAIOCB *bdrv_aio_write_em(BlockDriverState *bs,
return &acb->common;
}
+static BlockDriverAIOCB *bdrv_aio_flush_em(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque)
+{
+ BlockDriverAIOCBSync *acb;
+ int ret;
+
+ acb = qemu_aio_get(bs, cb, opaque);
+ if (!acb->bh)
+ acb->bh = qemu_bh_new(bdrv_aio_bh_cb, acb);
+ ret = bdrv_flush(bs);
+ acb->ret = ret;
+ qemu_bh_schedule(acb->bh);
+ return &acb->common;
+}
+
static void bdrv_aio_cancel_em(BlockDriverAIOCB *blockacb)
{
BlockDriverAIOCBSync *acb = (BlockDriverAIOCBSync *)blockacb;
diff --git a/block.h b/block.h
index 861a65b..0ecb0f8 100644
--- a/block.h
+++ b/block.h
@@ -86,6 +86,8 @@ BlockDriverAIOCB *bdrv_aio_read(BlockDriverState *bs, int64_t sector_num,
BlockDriverAIOCB *bdrv_aio_write(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque);
+BlockDriverAIOCB *bdrv_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque);
void bdrv_aio_cancel(BlockDriverAIOCB *acb);
void qemu_aio_init(void);
diff --git a/block_int.h b/block_int.h
index 796ce29..bda0eab 100644
--- a/block_int.h
+++ b/block_int.h
@@ -55,6 +55,8 @@ struct BlockDriver {
int64_t sector_num, const uint8_t *buf, int nb_sectors,
BlockDriverCompletionFunc *cb, void *opaque);
void (*bdrv_aio_cancel)(BlockDriverAIOCB *acb);
+ BlockDriverAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
+ BlockDriverCompletionFunc *cb, void *opaque);
int aiocb_size;
const char *protocol_name;
diff --git a/hw/ide.c b/hw/ide.c
index b73dba2..6062c74 100644
--- a/hw/ide.c
+++ b/hw/ide.c
@@ -383,6 +383,7 @@ typedef struct IDEState {
PCIDevice *pci_dev;
struct BMDMAState *bmdma;
int drive_serial;
+ uint8_t write_cache;
/* ide regs */
uint8_t feature;
uint8_t error;
@@ -559,11 +560,12 @@ static void ide_identify(IDEState *s)
put_le16(p + 68, 120);
put_le16(p + 80, 0xf0); /* ata3 -> ata6 supported */
put_le16(p + 81, 0x16); /* conforms to ata5 */
- put_le16(p + 82, (1 << 14));
+ /* 14=nop 5=write_cache */
+ put_le16(p + 82, (1 << 14) | (1 << 5));
/* 13=flush_cache_ext,12=flush_cache,10=lba48 */
put_le16(p + 83, (1 << 14) | (1 << 13) | (1 <<12) | (1 << 10));
put_le16(p + 84, (1 << 14));
- put_le16(p + 85, (1 << 14));
+ put_le16(p + 85, (1 << 14) | (s->write_cache << 5));
/* 13=flush_cache_ext,12=flush_cache,10=lba48 */
put_le16(p + 86, (1 << 14) | (1 << 13) | (1 <<12) | (1 << 10));
put_le16(p + 87, (1 << 14));
@@ -722,6 +724,7 @@ static inline void ide_abort_command(IDEState *s)
static inline void ide_set_irq(IDEState *s)
{
BMDMAState *bm = s->bmdma;
+ if (!s->bs) return; /* yikes */
if (!(s->cmd & IDE_CMD_DISABLE_IRQ)) {
if (bm) {
bm->status |= BM_STATUS_INT;
@@ -889,6 +892,14 @@ static int dma_buf_rw(BMDMAState *bm, int is_write)
return 1;
}
+static void ide_dma_eot(BMDMAState *bm) {
+ bm->status &= ~BM_STATUS_DMAING;
+ bm->status |= BM_STATUS_INT;
+ bm->dma_cb = NULL;
+ bm->ide_if = NULL;
+ bm->aiocb = NULL;
+}
+
static void ide_read_dma_cb(void *opaque, int ret)
{
BMDMAState *bm = opaque;
@@ -901,6 +912,8 @@ static void ide_read_dma_cb(void *opaque, int ret)
return;
}
+ if (!s->bs) return; /* yikes */
+
n = s->io_buffer_size >> 9;
sector_num = ide_get_sector(s);
if (n > 0) {
@@ -916,11 +929,7 @@ static void ide_read_dma_cb(void *opaque, int ret)
s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s);
eot:
- bm->status &= ~BM_STATUS_DMAING;
- bm->status |= BM_STATUS_INT;
- bm->dma_cb = NULL;
- bm->ide_if = NULL;
- bm->aiocb = NULL;
+ ide_dma_eot(bm);
return;
}
@@ -965,6 +974,9 @@ static void ide_sector_write(IDEState *s)
if (n > s->req_nb_sectors)
n = s->req_nb_sectors;
ret = bdrv_write(s->bs, sector_num, s->io_buffer, n);
+ if (ret == 0 && !s->write_cache) {
+ ret = bdrv_flush(s->bs);
+ }
if (ret != 0) {
ide_rw_error(s);
return;
@@ -999,6 +1011,20 @@ static void ide_sector_write(IDEState *s)
}
}
+static void ide_write_flush_cb(void *opaque, int ret) {
+ BMDMAState *bm = opaque;
+ IDEState *s = bm->ide_if;
+
+ if (ret != 0) {
+ ide_dma_error(s);
+ return;
+ }
+ s->status = READY_STAT | SEEK_STAT;
+ ide_set_irq(s);
+ ide_dma_eot(bm);
+ return;
+}
+
static void ide_write_dma_cb(void *opaque, int ret)
{
BMDMAState *bm = opaque;
@@ -1011,6 +1037,8 @@ static void ide_write_dma_cb(void *opaque, int ret)
return;
}
+ if (!s->bs) return; /* yikes */
+
n = s->io_buffer_size >> 9;
sector_num = ide_get_sector(s);
if (n > 0) {
@@ -1021,14 +1049,14 @@ static void ide_write_dma_cb(void *opaque, int ret)
/* end of transfer ? */
if (s->nsector == 0) {
+ if (!s->write_cache) {
+ bdrv_aio_flush(s->bs, ide_write_flush_cb, bm);
+ return;
+ }
s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s);
eot:
- bm->status &= ~BM_STATUS_DMAING;
- bm->status |= BM_STATUS_INT;
- bm->dma_cb = NULL;
- bm->ide_if = NULL;
- bm->aiocb = NULL;
+ ide_dma_eot(bm);
return;
}
@@ -1056,6 +1084,39 @@ static void ide_sector_write_dma(IDEState *s)
ide_dma_start(s, ide_write_dma_cb);
}
+static void ide_device_utterly_broken(IDEState *s) {
+ s->status |= BUSY_STAT;
+ s->bs = NULL;
+ /* This prevents all future commands from working. All of the
+ * asynchronous callbacks (and ide_set_irq, as a safety measure)
+ * check to see whether this has happened and bail if so.
+ */
+}
+
+static void ide_flush_cb(void *opaque, int ret)
+{
+ IDEState *s = opaque;
+
+ if (!s->bs) return; /* yikes */
+
+ if (ret) {
+ /* We are completely doomed. The IDE spec does not permit us
+ * to return an error from a flush except via a protocol which
+ * requires us to say where the error is and which
+ * contemplates the guest repeating the flush attempt to
+ * attempt flush the remaining data. We can't support that
+ * because f(data)sync (which is what the block drivers use
+ * eventually) doesn't report the necessary information or
+ * give us the necessary control. So we make the disk vanish.
+ */
+ ide_device_utterly_broken(s);
+ return;
+ }
+ else
+ s->status = READY_STAT;
+ ide_set_irq(s);
+}
+
static void ide_atapi_cmd_ok(IDEState *s)
{
s->error = 0;
@@ -1282,6 +1343,8 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret)
IDEState *s = bm->ide_if;
int data_offset, n;
+ if (!s->bs) return; /* yikes */
+
if (ret < 0) {
ide_atapi_io_error(s, ret);
goto eot;
@@ -1314,11 +1377,7 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret)
s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO | ATAPI_INT_REASON_CD;
ide_set_irq(s);
eot:
- bm->status &= ~BM_STATUS_DMAING;
- bm->status |= BM_STATUS_INT;
- bm->dma_cb = NULL;
- bm->ide_if = NULL;
- bm->aiocb = NULL;
+ ide_dma_eot(bm);
return;
}
@@ -1860,6 +1919,8 @@ static void cdrom_change_cb(void *opaque)
IDEState *s = opaque;
uint64_t nb_sectors;
+ if (!s->bs) return; /* yikes */
+
/* XXX: send interrupt too */
bdrv_get_geometry(s->bs, &nb_sectors);
s->nb_sectors = nb_sectors;
@@ -1963,8 +2024,8 @@ static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
printf("ide: CMD=%02x\n", val);
#endif
s = ide_if->cur_drive;
- /* ignore commands to non existant slave */
- if (s != ide_if && !s->bs)
+ /* ignore commands to non existant device */
+ if (!s->bs)
break;
switch(val) {
@@ -2102,8 +2163,6 @@ static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
switch(s->feature) {
case 0xcc: /* reverting to power-on defaults enable */
case 0x66: /* reverting to power-on defaults disable */
- case 0x02: /* write cache enable */
- case 0x82: /* write cache disable */
case 0xaa: /* read look-ahead enable */
case 0x55: /* read look-ahead disable */
case 0x05: /* set advanced power management mode */
@@ -2117,6 +2176,18 @@ static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
s->status = READY_STAT | SEEK_STAT;
ide_set_irq(s);
break;
+ case 0x02: /* write cache enable */
+ s->write_cache = 1;
+ s->status = READY_STAT | SEEK_STAT;
+ ide_set_irq(s);
+ break;
+ case 0x82: /* write cache disable */
+ s->write_cache = 0;
+ ret = bdrv_flush(s->bs);
+ if (ret != 0) goto abort_cmd;
+ s->status = READY_STAT | SEEK_STAT;
+ ide_set_irq(s);
+ break;
case 0x03: { /* set transfer mode */
uint8_t val = s->nsector & 0x07;
@@ -2147,12 +2218,8 @@ static void ide_ioport_write(void *opaque, uint32_t addr, uint32_t val)
break;
case WIN_FLUSH_CACHE:
case WIN_FLUSH_CACHE_EXT:
- if (s->bs) {
- ret = bdrv_flush(s->bs);
- if (ret) goto abort_cmd;
- }
- s->status = READY_STAT;
- ide_set_irq(s);
+ s->status = BUSY_STAT;
+ bdrv_aio_flush(s->bs, ide_flush_cb, s);
break;
case WIN_STANDBY:
case WIN_STANDBY2:
@@ -2629,6 +2696,7 @@ static void ide_init2(IDEState *ide_state,
s->irq = irq;
s->sector_write_timer = qemu_new_timer(vm_clock,
ide_sector_write_timer_cb, s);
+ s->write_cache = 0;
ide_reset(s);
}
}
@@ -2657,6 +2725,7 @@ static void ide_save(QEMUFile* f, IDEState *s)
if (s->identify_set) {
qemu_put_buffer(f, (const uint8_t *)s->identify_data, 512);
}
+ qemu_put_8s(f, &s->write_cache);
qemu_put_8s(f, &s->feature);
qemu_put_8s(f, &s->error);
qemu_put_be32s(f, &s->nsector);
@@ -2678,13 +2747,15 @@ static void ide_save(QEMUFile* f, IDEState *s)
}
/* load per IDE drive data */
-static void ide_load(QEMUFile* f, IDEState *s)
+static void ide_load(QEMUFile* f, IDEState *s, int version_id)
{
s->mult_sectors=qemu_get_be32(f);
s->identify_set=qemu_get_be32(f);
if (s->identify_set) {
qemu_get_buffer(f, (uint8_t *)s->identify_data, 512);
}
+ if (version_id >= 2)
+ qemu_get_8s(f, &s->write_cache);
qemu_get_8s(f, &s->feature);
qemu_get_8s(f, &s->error);
qemu_get_be32s(f, &s->nsector);
@@ -3029,7 +3100,7 @@ static int pci_ide_load(QEMUFile* f, void *opaque, int version_id)
PCIIDEState *d = opaque;
int ret, i;
- if (version_id != 1)
+ if (version_id != 1 && version_id != 2)
return -EINVAL;
ret = pci_device_load(&d->dev, f);
if (ret < 0)
@@ -3054,7 +3125,7 @@ static int pci_ide_load(QEMUFile* f, void *opaque, int version_id)
/* per IDE drive data */
for(i = 0; i < 4; i++) {
- ide_load(f, &d->ide_if[i]);
+ ide_load(f, &d->ide_if[i], version_id);
}
return 0;
}
@@ -3105,7 +3176,7 @@ void pci_piix3_ide_init(PCIBus *bus, BlockDriverState **hd_table, int devfn,
ide_init_ioport(&d->ide_if[0], 0x1f0, 0x3f6);
ide_init_ioport(&d->ide_if[2], 0x170, 0x376);
- register_savevm("ide", 0, 1, pci_ide_save, pci_ide_load, d);
+ register_savevm("ide", 0, 2, pci_ide_save, pci_ide_load, d);
}
/* hd_table must contain 4 block drivers */
@@ -3143,7 +3214,7 @@ void pci_piix4_ide_init(PCIBus *bus, BlockDriverState **hd_table, int devfn,
ide_init_ioport(&d->ide_if[0], 0x1f0, 0x3f6);
ide_init_ioport(&d->ide_if[2], 0x170, 0x376);
- register_savevm("ide", 0, 1, pci_ide_save, pci_ide_load, d);
+ register_savevm("ide", 0, 2, pci_ide_save, pci_ide_load, d);
}
/***********************************************************/
@@ -3577,7 +3648,7 @@ static int md_load(QEMUFile *f, void *opaque, int version_id)
s->ide->cur_drive = &s->ide[(drive1_selected != 0)];
for (i = 0; i < 2; i ++)
- ide_load(f, &s->ide[i]);
+ ide_load(f, &s->ide[i], version_id);
return 0;
}
diff --git a/hw/scsi-disk.c b/hw/scsi-disk.c
index 30cc906..91fcc70 100644
--- a/hw/scsi-disk.c
+++ b/hw/scsi-disk.c
@@ -276,6 +276,16 @@ static uint8_t *scsi_get_buf(SCSIDevice *d, uint32_t tag)
return r->dma_buf;
}
+static void scsi_flush_cb(void *opaque, int ret) {
+ SCSIRequest *r = opaque;
+ if (ret) {
+ BADF("IO error on flush: %s\n", strerror(-ret));
+ scsi_command_complete(r, SENSE_HARDWARE_ERROR);
+ } else {
+ scsi_command_complete(r, SENSE_NO_SENSE);
+ }
+}
+
/* Execute a scsi command. Returns the length of the data expected by the
command. This will be Positive for data transfers from the device
(eg. disk reads), negative for transfers to the device (eg. disk writes),
@@ -621,13 +631,8 @@ static int32_t scsi_send_command(SCSIDevice *d, uint32_t tag,
break;
case 0x35:
DPRINTF("Synchronise cache (sector %d, count %d)\n", lba, len);
- ret = bdrv_flush(s->bdrv);
- if (ret) {
- DPRINTF("IO error on bdrv_flush\n");
- scsi_command_complete(r, SENSE_HARDWARE_ERROR);
- return 0;
- }
- break;
+ bdrv_aio_flush(s->bdrv, scsi_flush_cb, r);
+ return 0;
case 0x43:
{
int start_track, format, msf, toclen;
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush
2008-04-02 16:27 ` Ian Jackson
@ 2008-04-03 0:40 ` Paul Brook
2008-04-03 10:00 ` Ian Jackson
0 siblings, 1 reply; 15+ messages in thread
From: Paul Brook @ 2008-04-03 0:40 UTC (permalink / raw)
To: qemu-devel; +Cc: Ian Jackson
> > > + if (ret) {
> > > + BADF("scsi-disc: IO error on flush: %s\n", strerror(-ret));
> >
> > This is wrong for two reasons: BADF already adds a suitable prefix,
> > and is for reporting qemu bugs (i.e. missing features or things that
> > should never happen). This is just a normal IO error, which we
> > correctly report to the guest.
>
> I don't think flush failures are a normal IO error. They can only
> occur when the corresponding host block device is having serious
> trouble. Normally (if it's actually a host disk) this is very bad.
In practice any IO error indicates that the target device is pretty screwed.
I don't see why flushes should be special. When caches are enabled I'd be
fairly surprised if normal write operations ever returned an error.
Paul
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush
2008-04-03 0:40 ` Paul Brook
@ 2008-04-03 10:00 ` Ian Jackson
0 siblings, 0 replies; 15+ messages in thread
From: Ian Jackson @ 2008-04-03 10:00 UTC (permalink / raw)
To: qemu-devel
Paul Brook writes ("Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush"):
> [Ian Jackson:]
> > I don't think flush failures are a normal IO error. They can only
> > occur when the corresponding host block device is having serious
> > trouble. Normally (if it's actually a host disk) this is very bad.
>
> In practice any IO error indicates that the target device is pretty screwed.
> I don't see why flushes should be special. When caches are enabled I'd be
> fairly surprised if normal write operations ever returned an error.
The most common kind of `IO error' will be disk full (or the
equivalent) on the difference file for a COW device. That can't be
detected at flush time (at least, not with most host filesystems).
Ian.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush
2008-04-02 16:20 ` Ian Jackson
@ 2008-04-03 11:38 ` Jamie Lokier
2008-04-03 12:06 ` Ian Jackson
0 siblings, 1 reply; 15+ messages in thread
From: Jamie Lokier @ 2008-04-03 11:38 UTC (permalink / raw)
To: Ian Jackson; +Cc: Marcelo Tosatti, Paul Brook, qemu-devel
Ian Jackson wrote:
> > > The cache is dynamically enabled/disabled by the target. This means
> > > you've got to close and repopen the file every time it changes,
> > > which is likely to get really hairy.
> >
> > Just open two descriptors :-)
>
> Doing it with bdrv_aio_flush wasn't too hard, so that's what'll be in
> my revised patch shortly.
For "uncached" writes, do you wait until _after_ the aio_write has
completed before calling aio_fsync, or do you assume the aio_fsync
will wait for all aio_writes queued (but not completed) prior to it,
as Marcelo Tosatti believes the documentation implies?
Thanks,
-- Jamie
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush
2008-04-03 11:38 ` Jamie Lokier
@ 2008-04-03 12:06 ` Ian Jackson
0 siblings, 0 replies; 15+ messages in thread
From: Ian Jackson @ 2008-04-03 12:06 UTC (permalink / raw)
To: Jamie Lokier; +Cc: Marcelo Tosatti, Paul Brook, qemu-devel
Jamie Lokier writes ("Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush"):
> Ian Jackson wrote:
> > Doing it with bdrv_aio_flush wasn't too hard, so that's what'll be in
> > my revised patch shortly.
>
> For "uncached" writes, do you wait until _after_ the aio_write has
> completed before calling aio_fsync, or do you assume the aio_fsync
> will wait for all aio_writes queued (but not completed) prior to it,
> as Marcelo Tosatti believes the documentation implies?
I do the former. In general, I assume that in the following case:
aio_write
returns 0, operation is in progress
.
. aio_fsync
. returns 0, operation is in progress
. .
aio_return on the aio_write .
says completed OK .
.
aio_return on the aio_fsync
says completed OK
the aio_f(data)sync completion does not tell us that the write (which
had not been completed at the time aio_fsync was called) has been
synchronised.
I don't agree with Marcelo's analysis.
I think SuSv3 unhelpfully uses the word `queued' in two different
ways: firstly, to refer to data for which a write(2) or equivalent has
completed but where the data has not yet reached stable storage, and
secondly to refer to asynchronous IO operations (including aio_write
and aio_fsync) which have been submitted but not yet completed.
The former usage is demonstrated clearly in the SuSv3 page for
write(2) which describes the data as having been `queued' when
write(2) completes - ie when it is in the buffer cache. This
interpretation for write(2) is supported by the page for read(2) not
mentioning queues (except for STREAMS queues which are irrelevant).
The latter usage is demonstrated by the spec for aio_read, which talks
about a read having been queued, and even for aio_fsync which is
specified to return when `the synchronisation request has been
... queued'.
Clearly it is the former meaning of `queue' which is referred to by
the specification of fsync and fdatasync since those calls apply to
data from write(2) as well.
In the spec for aio_write, `queued' is used in the first paragraph
with an identical wording to that for aio_read, so has the latter of
the two meanings above.
Sadly in aio_fsync these two meanings of `queued' appear in different
parts of the page. Firstly, we have the statement that aio_fsync
shall return when the synchronisation request has been queued
(identical wording to that used for aio_read and aio_write). Then in
the second paragraph we refer to queued IO operations but this can
only refer to the former, fsync, sense (since the text explicitly says
`as if by a call to fsync' and since otherwise the reference would be
semi-recursive as the aio_fsync would itself be a queued IO
operation).
If one thinks that both of these meanings of `queue' are the same:
what is the difference between aio_write and write(2) ? Why can't
write(2) return immediately just like aio_write ?
Ian.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush
2008-04-02 14:21 ` Ian Jackson
@ 2008-04-04 14:29 ` Samuel Thibault
0 siblings, 0 replies; 15+ messages in thread
From: Samuel Thibault @ 2008-04-04 14:29 UTC (permalink / raw)
To: qemu-devel
Ian Jackson, le Wed 02 Apr 2008 15:21:32 +0100, a écrit :
> Samuel Thibault writes ("Re: [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush"):
> > Oh, by the way, shouldn't this:
> > Ian Jackson, le Tue 01 Apr 2008 18:18:59 +0100, a écrit :
> > > put_le16(p + 81, 0x16); /* conforms to ata5 */
> > > - put_le16(p + 82, (1 << 14));
> > > + /* 14=nop 5=write_cache */
> > > + put_le16(p + 82, (1 << 14) | (1 << 5));
> > > /* 13=flush_cache_ext,12=flush_cache,10=lba48 */
> >
> > use s->write_cache instead of always 1? Else when using hdparm -i one
> > would think that write cache is always enabled.
>
> No. According to the ATA-7 draft I have in front of me, the meaning
> of word 82 is `Command set supported' and the meaning of bit 5 is is
> `write cache supported'.
Oh, I was actually talking about word 85 indeed. In Xen we lack the
word 82 bit, I'll make a patch.
Samuel
^ permalink raw reply [flat|nested] 15+ messages in thread
end of thread, other threads:[~2008-04-04 14:30 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2008-04-01 17:18 [Qemu-devel] [PATCH] Asynchronous reliable and configurable cache flush Ian Jackson
2008-04-01 17:46 ` Paul Brook
2008-04-01 18:45 ` Jamie Lokier
2008-04-02 12:26 ` Paul Brook
2008-04-02 15:23 ` Jamie Lokier
2008-04-02 16:20 ` Ian Jackson
2008-04-03 11:38 ` Jamie Lokier
2008-04-03 12:06 ` Ian Jackson
2008-04-02 16:27 ` Ian Jackson
2008-04-03 0:40 ` Paul Brook
2008-04-03 10:00 ` Ian Jackson
2008-04-01 18:07 ` Samuel Thibault
2008-04-02 14:21 ` Ian Jackson
2008-04-04 14:29 ` Samuel Thibault
2008-04-02 16:31 ` [Qemu-devel] " Ian Jackson
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).