qemu-devel.nongnu.org archive mirror
* [Qemu-devel] [PATCH 0/2 v2] Open disk images with O_DIRECT
@ 2007-12-03 10:09 Laurent Vivier
  2007-12-03 10:09 ` [Qemu-devel] [PATCH 1/2 v2] Add "cache" parameter to "-drive" Laurent Vivier
  0 siblings, 1 reply; 31+ messages in thread
From: Laurent Vivier @ 2007-12-03 10:09 UTC (permalink / raw)
  Cc: qemu-devel


These patches allow disk image files to be opened with O_DIRECT, bypassing the host I/O cache.

This v2 is a new version incorporating comments from Anthony Liguori ("directio"
has been renamed to "cache"), from Balazs Attila-Mihaly (Win32 support,
not tested) and from Samuel Thibault (the generic qemu_memalign() function).

[PATCH 1/2] Add "cache" parameter to "-drive"

    Using "cache=off" with "-drive" will open the disk image 
    file using "O_DIRECT".

[PATCH 2/2] Direct IDE I/O

    This patch enhances the "-drive ,cache=off" mode with IDE drive emulation
    by removing the buffer used in the IDE emulation.

Laurent

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [Qemu-devel] [PATCH 1/2 v2] Add "cache" parameter to "-drive"
  2007-12-03 10:09 [Qemu-devel] [PATCH 0/2 v2] Open disk images with O_DIRECT Laurent Vivier
@ 2007-12-03 10:09 ` Laurent Vivier
  2007-12-03 10:09   ` [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O Laurent Vivier
  0 siblings, 1 reply; 31+ messages in thread
From: Laurent Vivier @ 2007-12-03 10:09 UTC (permalink / raw)
  Cc: qemu-devel


This patch adds a new "cache" parameter to "-drive".

Using "cache=off" with "-drive" will open the disk image file using
"O_DIRECT".

By default, "cache" is set to "on" to keep original behavior of qemu.

example:

"-drive file=my_disk.qcow2,cache=off"
---
 block-raw.c    |   12 ++++++++++++
 block.c        |    2 +-
 block.h        |    1 +
 hw/fdc.c       |    7 ++++++-
 hw/ide.c       |   18 ++++++++++++++----
 hw/scsi-disk.c |    3 ++-
 hw/sd.c        |   11 ++++++++++-
 osdep.c        |   20 ++++++++++++++++++++
 osdep.h        |    1 +
 vl.c           |   26 +++++++++++++++++++++++---
 10 files changed, 90 insertions(+), 11 deletions(-)

Index: qemu/block-raw.c
===================================================================
--- qemu.orig/block-raw.c	2007-12-03 09:53:31.000000000 +0100
+++ qemu/block-raw.c	2007-12-03 09:54:47.000000000 +0100
@@ -107,6 +107,10 @@ static int raw_open(BlockDriverState *bs
     }
     if (flags & BDRV_O_CREAT)
         open_flags |= O_CREAT | O_TRUNC;
+#ifdef O_DIRECT
+    if (flags & BDRV_O_DIRECT)
+        open_flags |= O_DIRECT;
+#endif
 
     s->type = FTYPE_FILE;
 
@@ -660,6 +664,10 @@ static int hdev_open(BlockDriverState *b
         open_flags |= O_RDONLY;
         bs->read_only = 1;
     }
+#ifdef O_DIRECT
+    if (flags & BDRV_O_DIRECT)
+        open_flags |= O_DIRECT;
+#endif
 
     s->type = FTYPE_FILE;
 #if defined(__linux__)
@@ -981,6 +989,8 @@ static int raw_open(BlockDriverState *bs
 #else
     overlapped = FILE_FLAG_OVERLAPPED;
 #endif
+    if (flags & BDRV_O_DIRECT)
+        overlapped |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
     s->hfile = CreateFile(filename, access_flags,
                           FILE_SHARE_READ, NULL,
                           create_flags, overlapped, NULL);
@@ -1349,6 +1359,8 @@ static int hdev_open(BlockDriverState *b
 #else
     overlapped = FILE_FLAG_OVERLAPPED;
 #endif
+    if (flags & BDRV_O_DIRECT)
+        overlapped |= FILE_FLAG_NO_BUFFERING | FILE_FLAG_WRITE_THROUGH;
     s->hfile = CreateFile(filename, access_flags,
                           FILE_SHARE_READ, NULL,
                           create_flags, overlapped, NULL);
Index: qemu/block.h
===================================================================
--- qemu.orig/block.h	2007-12-03 09:53:31.000000000 +0100
+++ qemu/block.h	2007-12-03 09:54:47.000000000 +0100
@@ -44,6 +44,7 @@ typedef struct QEMUSnapshotInfo {
                                      use a disk image format on top of
                                      it (default for
                                      bdrv_file_open()) */
+#define BDRV_O_DIRECT      0x0020
 
 #ifndef QEMU_IMG
 void bdrv_info(void);
Index: qemu/vl.c
===================================================================
--- qemu.orig/vl.c	2007-12-03 09:53:43.000000000 +0100
+++ qemu/vl.c	2007-12-03 09:54:47.000000000 +0100
@@ -4851,8 +4851,11 @@ static int drive_init(const char *str, i
     BlockDriverState *bdrv;
     int max_devs;
     int index;
+    int cache;
+    int bdrv_flags;
     char *params[] = { "bus", "unit", "if", "index", "cyls", "heads",
-                       "secs", "trans", "media", "snapshot", "file", NULL };
+                       "secs", "trans", "media", "snapshot", "file",
+                       "cache", NULL };
 
     if (check_params(buf, sizeof(buf), params, str) < 0) {
          fprintf(stderr, "qemu: unknowm parameter '%s' in '%s'\n",
@@ -4866,6 +4869,7 @@ static int drive_init(const char *str, i
     unit_id = -1;
     translation = BIOS_ATA_TRANSLATION_AUTO;
     index = -1;
+    cache = 1;
 
     if (!strcmp(machine->name, "realview") ||
         !strcmp(machine->name, "SS-5") ||
@@ -5005,6 +5009,17 @@ static int drive_init(const char *str, i
 	}
     }
 
+    if (get_param_value(buf, sizeof(buf), "cache", str)) {
+        if (!strcmp(buf, "off"))
+            cache = 0;
+        else if (!strcmp(buf, "on"))
+            cache = 1;
+        else {
+	    fprintf(stderr, "qemu: invalid cache option\n");
+	    return -1;
+        }
+    }
+
     get_param_value(file, sizeof(file), "file", str);
 
     /* compute bus and unit according index */
@@ -5092,8 +5107,12 @@ static int drive_init(const char *str, i
     }
     if (!file[0])
         return 0;
-    if (bdrv_open(bdrv, file, snapshot ? BDRV_O_SNAPSHOT : 0) < 0 ||
-        qemu_key_check(bdrv, file)) {
+    bdrv_flags = 0;
+    if (snapshot)
+        bdrv_flags |= BDRV_O_SNAPSHOT;
+    if (!cache)
+        bdrv_flags |= BDRV_O_DIRECT;
+    if (bdrv_open(bdrv, file, bdrv_flags) < 0 || qemu_key_check(bdrv, file)) {
         fprintf(stderr, "qemu: could not open disk image %s\n",
                         file);
         return -1;
@@ -7440,6 +7459,7 @@ static void help(int exitcode)
            "-cdrom file     use 'file' as IDE cdrom image (cdrom is ide1 master)\n"
 	   "-drive [file=file][,if=type][,bus=n][,unit=m][,media=d][index=i]\n"
            "       [,cyls=c,heads=h,secs=s[,trans=t]][snapshot=on|off]\n"
+           "       [cache=on|off]\n"
 	   "                use 'file' as a drive image\n"
            "-mtdblock file  use 'file' as on-board Flash memory image\n"
            "-sd file        use 'file' as SecureDigital card image\n"
Index: qemu/hw/ide.c
===================================================================
--- qemu.orig/hw/ide.c	2007-12-03 09:53:31.000000000 +0100
+++ qemu/hw/ide.c	2007-12-03 09:54:47.000000000 +0100
@@ -365,7 +365,7 @@ typedef struct IDEState {
     EndTransferFunc *end_transfer_func;
     uint8_t *data_ptr;
     uint8_t *data_end;
-    uint8_t io_buffer[MAX_MULT_SECTORS*512 + 4];
+    uint8_t *io_buffer;
     QEMUTimer *sector_write_timer; /* only used for win2k install hack */
     uint32_t irq_count; /* counts IRQs when using win2k install hack */
     /* CF-ATA extended error */
@@ -2372,17 +2372,24 @@ struct partition {
 static int guess_disk_lchs(IDEState *s,
                            int *pcylinders, int *pheads, int *psectors)
 {
-    uint8_t buf[512];
+    uint8_t *buf;
     int ret, i, heads, sectors, cylinders;
     struct partition *p;
     uint32_t nr_sects;
 
+    buf = qemu_memalign(512, 512);
+    if (buf == NULL)
+        return -1;
     ret = bdrv_read(s->bs, 0, buf, 1);
-    if (ret < 0)
+    if (ret < 0) {
+        qemu_free(buf);
         return -1;
+    }
     /* test msdos magic */
-    if (buf[510] != 0x55 || buf[511] != 0xaa)
+    if (buf[510] != 0x55 || buf[511] != 0xaa) {
+        qemu_free(buf);
         return -1;
+    }
     for(i = 0; i < 4; i++) {
         p = ((struct partition *)(buf + 0x1be)) + i;
         nr_sects = le32_to_cpu(p->nr_sects);
@@ -2403,9 +2410,11 @@ static int guess_disk_lchs(IDEState *s,
             printf("guessed geometry: LCHS=%d %d %d\n",
                    cylinders, heads, sectors);
 #endif
+            qemu_free(buf);
             return 0;
         }
     }
+    qemu_free(buf);
     return -1;
 }
 
@@ -2420,6 +2429,7 @@ static void ide_init2(IDEState *ide_stat
 
     for(i = 0; i < 2; i++) {
         s = ide_state + i;
+        s->io_buffer = qemu_memalign(512, MAX_MULT_SECTORS*512 + 4);
         if (i == 0)
             s->bs = hd0;
         else
Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2007-12-03 09:53:31.000000000 +0100
+++ qemu/block.c	2007-12-03 09:54:47.000000000 +0100
@@ -380,7 +380,7 @@ int bdrv_open2(BlockDriverState *bs, con
     /* Note: for compatibility, we open disk image files as RDWR, and
        RDONLY as fallback */
     if (!(flags & BDRV_O_FILE))
-        open_flags = BDRV_O_RDWR;
+        open_flags = BDRV_O_RDWR | (flags & BDRV_O_DIRECT);
     else
         open_flags = flags & ~(BDRV_O_FILE | BDRV_O_SNAPSHOT);
     ret = drv->bdrv_open(bs, filename, open_flags);
Index: qemu/hw/scsi-disk.c
===================================================================
--- qemu.orig/hw/scsi-disk.c	2007-12-03 09:53:31.000000000 +0100
+++ qemu/hw/scsi-disk.c	2007-12-03 09:54:47.000000000 +0100
@@ -46,7 +46,7 @@ typedef struct SCSIRequest {
     int sector_count;
     /* The amounnt of data in the buffer.  */
     int buf_len;
-    uint8_t dma_buf[SCSI_DMA_BUF_SIZE];
+    uint8_t *dma_buf;
     BlockDriverAIOCB *aiocb;
     struct SCSIRequest *next;
 } SCSIRequest;
@@ -78,6 +78,7 @@ static SCSIRequest *scsi_new_request(SCS
         free_requests = r->next;
     } else {
         r = qemu_malloc(sizeof(SCSIRequest));
+        r->dma_buf = qemu_memalign(512, SCSI_DMA_BUF_SIZE);
     }
     r->dev = s;
     r->tag = tag;
Index: qemu/hw/sd.c
===================================================================
--- qemu.orig/hw/sd.c	2007-12-03 09:53:31.000000000 +0100
+++ qemu/hw/sd.c	2007-12-03 09:54:47.000000000 +0100
@@ -1284,10 +1284,17 @@ int sd_do_command(SDState *sd, struct sd
 static void sd_blk_read(BlockDriverState *bdrv,
                 void *data, uint32_t addr, uint32_t len)
 {
-    uint8_t buf[512];
+    uint8_t *buf;
     uint32_t end = addr + len;
 
+    buf = qemu_memalign(512, 512);
+    if (buf == NULL) {
+        printf("sd_blk_read: cannot allocate memory\n");
+        return;
+    }
+
     if (!bdrv || bdrv_read(bdrv, addr >> 9, buf, 1) == -1) {
+        qemu_free(buf);
         printf("sd_blk_read: read error on host side\n");
         return;
     }
@@ -1297,11 +1304,13 @@ static void sd_blk_read(BlockDriverState
 
         if (bdrv_read(bdrv, end >> 9, buf, 1) == -1) {
             printf("sd_blk_read: read error on host side\n");
+            qemu_free(buf);
             return;
         }
         memcpy(data + 512 - (addr & 511), buf, end & 511);
     } else
         memcpy(data, buf + (addr & 511), len);
+    qemu_free(buf);
 }
 
 static void sd_blk_write(BlockDriverState *bdrv,
Index: qemu/hw/fdc.c
===================================================================
--- qemu.orig/hw/fdc.c	2007-12-03 09:53:31.000000000 +0100
+++ qemu/hw/fdc.c	2007-12-03 09:54:47.000000000 +0100
@@ -382,7 +382,7 @@ struct fdctrl_t {
     uint8_t cur_drv;
     uint8_t bootsel;
     /* Command FIFO */
-    uint8_t fifo[FD_SECTOR_LEN];
+    uint8_t *fifo;
     uint32_t data_pos;
     uint32_t data_len;
     uint8_t data_state;
@@ -598,6 +598,11 @@ fdctrl_t *fdctrl_init (qemu_irq irq, int
     fdctrl = qemu_mallocz(sizeof(fdctrl_t));
     if (!fdctrl)
         return NULL;
+    fdctrl->fifo = qemu_memalign(512, FD_SECTOR_LEN);
+    if (fdctrl->fifo == NULL) {
+        qemu_free(fdctrl);
+        return NULL;
+    }
     fdctrl->result_timer = qemu_new_timer(vm_clock,
                                           fdctrl_result_timer, fdctrl);
 
Index: qemu/osdep.c
===================================================================
--- qemu.orig/osdep.c	2007-12-03 09:53:31.000000000 +0100
+++ qemu/osdep.c	2007-12-03 09:54:47.000000000 +0100
@@ -60,6 +60,10 @@ void *qemu_malloc(size_t size)
 }
 
 #if defined(_WIN32)
+void *qemu_memalign(size_t alignment, size_t size)
+{
+    return VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE);
+}
 
 void *qemu_vmalloc(size_t size)
 {
@@ -171,6 +175,22 @@ static void kqemu_vfree(void *ptr)
 
 #endif
 
+void *qemu_memalign(size_t alignment, size_t size)
+{
+#if defined(_POSIX_C_SOURCE)
+    int ret;
+    void *ptr;
+    ret = posix_memalign(&ptr, alignment, size);
+    if (ret != 0)
+        return NULL;
+    return ptr;
+#elif defined(_BSD)
+    return valloc(size);
+#else
+    return memalign(alignment, size);
+#endif
+}
+
 /* alloc shared memory pages */
 void *qemu_vmalloc(size_t size)
 {
Index: qemu/osdep.h
===================================================================
--- qemu.orig/osdep.h	2007-12-03 09:53:31.000000000 +0100
+++ qemu/osdep.h	2007-12-03 09:54:47.000000000 +0100
@@ -48,6 +48,7 @@ void *qemu_mallocz(size_t size);
 void qemu_free(void *ptr);
 char *qemu_strdup(const char *str);
 
+void *qemu_memalign(size_t alignment, size_t size);
 void *qemu_vmalloc(size_t size);
 void qemu_vfree(void *ptr);
 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 10:09 ` [Qemu-devel] [PATCH 1/2 v2] Add "cache" parameter to "-drive" Laurent Vivier
@ 2007-12-03 10:09   ` Laurent Vivier
  2007-12-03 10:23     ` Fabrice Bellard
  0 siblings, 1 reply; 31+ messages in thread
From: Laurent Vivier @ 2007-12-03 10:09 UTC (permalink / raw)
  Cc: qemu-devel


This patch enhances the "-drive ,cache=off" mode with IDE drive emulation
by removing the buffer used in the IDE emulation.
---
 block.c     |   10 +++
 block.h     |    2 
 block_int.h |    1 
 cpu-all.h   |    1 
 exec.c      |   19 ++++++
 hw/ide.c    |  176 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 vl.c        |    1 
 7 files changed, 204 insertions(+), 6 deletions(-)

Index: qemu/block.c
===================================================================
--- qemu.orig/block.c	2007-12-03 09:54:47.000000000 +0100
+++ qemu/block.c	2007-12-03 09:54:53.000000000 +0100
@@ -758,6 +758,11 @@ void bdrv_set_translation_hint(BlockDriv
     bs->translation = translation;
 }
 
+void bdrv_set_cache_hint(BlockDriverState *bs, int cache)
+{
+    bs->cache = cache;
+}
+
 void bdrv_get_geometry_hint(BlockDriverState *bs,
                             int *pcyls, int *pheads, int *psecs)
 {
@@ -786,6 +791,11 @@ int bdrv_is_read_only(BlockDriverState *
     return bs->read_only;
 }
 
+int bdrv_is_cached(BlockDriverState *bs)
+{
+    return bs->cache;
+}
+
 /* XXX: no longer used */
 void bdrv_set_change_cb(BlockDriverState *bs,
                         void (*change_cb)(void *opaque), void *opaque)
Index: qemu/block.h
===================================================================
--- qemu.orig/block.h	2007-12-03 09:54:47.000000000 +0100
+++ qemu/block.h	2007-12-03 09:54:53.000000000 +0100
@@ -113,6 +113,7 @@ void bdrv_set_geometry_hint(BlockDriverS
                             int cyls, int heads, int secs);
 void bdrv_set_type_hint(BlockDriverState *bs, int type);
 void bdrv_set_translation_hint(BlockDriverState *bs, int translation);
+void bdrv_set_cache_hint(BlockDriverState *bs, int cache);
 void bdrv_get_geometry_hint(BlockDriverState *bs,
                             int *pcyls, int *pheads, int *psecs);
 int bdrv_get_type_hint(BlockDriverState *bs);
@@ -120,6 +121,7 @@ int bdrv_get_translation_hint(BlockDrive
 int bdrv_is_removable(BlockDriverState *bs);
 int bdrv_is_read_only(BlockDriverState *bs);
 int bdrv_is_inserted(BlockDriverState *bs);
+int bdrv_is_cached(BlockDriverState *bs);
 int bdrv_media_changed(BlockDriverState *bs);
 int bdrv_is_locked(BlockDriverState *bs);
 void bdrv_set_locked(BlockDriverState *bs, int locked);
Index: qemu/block_int.h
===================================================================
--- qemu.orig/block_int.h	2007-12-03 09:53:30.000000000 +0100
+++ qemu/block_int.h	2007-12-03 09:54:53.000000000 +0100
@@ -124,6 +124,7 @@ struct BlockDriverState {
        drivers. They are not used by the block driver */
     int cyls, heads, secs, translation;
     int type;
+    int cache;
     char device_name[32];
     BlockDriverState *next;
 };
Index: qemu/vl.c
===================================================================
--- qemu.orig/vl.c	2007-12-03 09:54:47.000000000 +0100
+++ qemu/vl.c	2007-12-03 09:54:53.000000000 +0100
@@ -5112,6 +5112,7 @@ static int drive_init(const char *str, i
         bdrv_flags |= BDRV_O_SNAPSHOT;
     if (!cache)
         bdrv_flags |= BDRV_O_DIRECT;
+    bdrv_set_cache_hint(bdrv, cache);
     if (bdrv_open(bdrv, file, bdrv_flags) < 0 || qemu_key_check(bdrv, file)) {
         fprintf(stderr, "qemu: could not open disk image %s\n",
                         file);
Index: qemu/hw/ide.c
===================================================================
--- qemu.orig/hw/ide.c	2007-12-03 09:54:47.000000000 +0100
+++ qemu/hw/ide.c	2007-12-03 09:54:53.000000000 +0100
@@ -816,7 +816,7 @@ static int dma_buf_rw(BMDMAState *bm, in
 }
 
 /* XXX: handle errors */
-static void ide_read_dma_cb(void *opaque, int ret)
+static void ide_read_dma_cb_buffered(void *opaque, int ret)
 {
     BMDMAState *bm = opaque;
     IDEState *s = bm->ide_if;
@@ -856,7 +856,86 @@ static void ide_read_dma_cb(void *opaque
     printf("aio_read: sector_num=%lld n=%d\n", sector_num, n);
 #endif
     bm->aiocb = bdrv_aio_read(s->bs, sector_num, s->io_buffer, n,
-                              ide_read_dma_cb, bm);
+                              ide_read_dma_cb_buffered, bm);
+}
+
+static void ide_read_dma_cb_unbuffered(void *opaque, int ret)
+{
+    BMDMAState *bm = opaque;
+    IDEState *s = bm->ide_if;
+    int64_t sector_num;
+    int nsector;
+    int len;
+    uint8_t *phy_addr;
+
+    if (s->nsector == 0) {
+        s->status = READY_STAT | SEEK_STAT;
+       ide_set_irq(s);
+    eot:
+       bm->status &= ~BM_STATUS_DMAING;
+       bm->status |= BM_STATUS_INT;
+       bm->dma_cb = NULL;
+       bm->ide_if = NULL;
+       bm->aiocb = NULL;
+       return;
+    }
+
+    /* launch next transfer */
+
+    if (bm->cur_prd_len == 0) {
+        struct {
+            uint32_t addr;
+            uint32_t size;
+        } prd;
+
+        cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
+
+        bm->cur_addr += 8;
+        prd.addr = le32_to_cpu(prd.addr);
+        prd.size = le32_to_cpu(prd.size);
+        len = prd.size & 0xfffe;
+        if (len == 0)
+            len = 0x10000;
+        bm->cur_prd_addr = prd.addr;
+       while ((bm->cur_addr - bm->addr) < 4096) {
+           int tmp_len;
+           cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
+           prd.addr = le32_to_cpu(prd.addr);
+           prd.size = le32_to_cpu(prd.size);
+           if (bm->cur_prd_addr + len != prd.addr)
+               break;
+            tmp_len = prd.size & 0xfffe;
+            if (tmp_len == 0)
+                tmp_len = 0x10000;
+           len += tmp_len;
+            bm->cur_addr += 8;
+           if (prd.size & 0x80000000)
+               break;
+       }
+        bm->cur_prd_len = len;
+    }
+
+    phy_addr = cpu_physical_page_addr(bm->cur_prd_addr);
+    if (phy_addr == (uint8_t *)-1)
+       goto eot;
+
+    len = (s->nsector<<9);
+    if (len > bm->cur_prd_len)
+       len = bm->cur_prd_len;
+
+    nsector = (len>>9);
+    bm->cur_prd_addr += (nsector<<9);
+    bm->cur_prd_len -= (nsector<<9);
+
+    sector_num = ide_get_sector(s);
+    ide_set_sector(s, sector_num + nsector);
+    s->nsector-=nsector;
+
+#ifdef DEBUG_AIO
+    printf("aio_read: sector_num=%lld n=%d\n", (unsigned long long)sector_num, nsector);
+#endif
+    bm->aiocb = bdrv_aio_read(s->bs, sector_num, phy_addr, nsector,
+                             ide_read_dma_cb_unbuffered, bm);
 }
 
 static void ide_sector_read_dma(IDEState *s)
@@ -864,7 +943,10 @@ static void ide_sector_read_dma(IDEState
     s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
     s->io_buffer_index = 0;
     s->io_buffer_size = 0;
-    ide_dma_start(s, ide_read_dma_cb);
+    if (bdrv_is_cached(s->bs))
+        ide_dma_start(s, ide_read_dma_cb_buffered);
+    else
+        ide_dma_start(s, ide_read_dma_cb_unbuffered);
 }
 
 static void ide_sector_write_timer_cb(void *opaque)
@@ -917,7 +999,7 @@ static void ide_sector_write(IDEState *s
 }
 
 /* XXX: handle errors */
-static void ide_write_dma_cb(void *opaque, int ret)
+static void ide_write_dma_cb_buffered(void *opaque, int ret)
 {
     BMDMAState *bm = opaque;
     IDEState *s = bm->ide_if;
@@ -958,7 +1040,86 @@ static void ide_write_dma_cb(void *opaqu
     printf("aio_write: sector_num=%lld n=%d\n", sector_num, n);
 #endif
     bm->aiocb = bdrv_aio_write(s->bs, sector_num, s->io_buffer, n,
-                               ide_write_dma_cb, bm);
+                               ide_write_dma_cb_buffered, bm);
+}
+
+static void ide_write_dma_cb_unbuffered(void *opaque, int ret)
+{
+    BMDMAState *bm = opaque;
+    IDEState *s = bm->ide_if;
+    int64_t sector_num;
+    int nsector;
+    int len;
+    uint8_t *phy_addr;
+
+    if (s->nsector == 0) {
+        s->status = READY_STAT | SEEK_STAT;
+       ide_set_irq(s);
+    eot:
+       bm->status &= ~BM_STATUS_DMAING;
+       bm->status |= BM_STATUS_INT;
+       bm->dma_cb = NULL;
+       bm->ide_if = NULL;
+       bm->aiocb = NULL;
+       return;
+    }
+
+    /* launch next transfer */
+
+    if (bm->cur_prd_len == 0) {
+        struct {
+            uint32_t addr;
+            uint32_t size;
+        } prd;
+
+        cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
+
+        bm->cur_addr += 8;
+        prd.addr = le32_to_cpu(prd.addr);
+        prd.size = le32_to_cpu(prd.size);
+        len = prd.size & 0xfffe;
+        if (len == 0)
+            len = 0x10000;
+        bm->cur_prd_addr = prd.addr;
+       while ((bm->cur_addr - bm->addr) < 4096) {
+           int tmp_len;
+           cpu_physical_memory_read(bm->cur_addr, (uint8_t *)&prd, 8);
+           prd.addr = le32_to_cpu(prd.addr);
+           prd.size = le32_to_cpu(prd.size);
+           if (bm->cur_prd_addr + len != prd.addr)
+               break;
+            tmp_len = prd.size & 0xfffe;
+            if (tmp_len == 0)
+                tmp_len = 0x10000;
+           len += tmp_len;
+            bm->cur_addr += 8;
+           if (prd.size & 0x80000000)
+               break;
+       }
+        bm->cur_prd_len = len;
+    }
+
+    phy_addr = cpu_physical_page_addr(bm->cur_prd_addr);
+    if (phy_addr == (uint8_t *)-1)
+       goto eot;
+
+    len = (s->nsector<<9);
+    if (len > bm->cur_prd_len)
+       len = bm->cur_prd_len;
+
+    nsector = (len>>9);
+    bm->cur_prd_addr += (nsector<<9);
+    bm->cur_prd_len -= (nsector<<9);
+
+    sector_num = ide_get_sector(s);
+    ide_set_sector(s, sector_num + nsector);
+    s->nsector-=nsector;
+
+#ifdef DEBUG_AIO
+    printf("aio_write: sector_num=%lld n=%d\n", (unsigned long long)sector_num, nsector);
+#endif
+    bm->aiocb = bdrv_aio_write(s->bs, sector_num, phy_addr, nsector,
+                             ide_write_dma_cb_unbuffered, bm);
 }
 
 static void ide_sector_write_dma(IDEState *s)
@@ -966,7 +1127,10 @@ static void ide_sector_write_dma(IDEStat
     s->status = READY_STAT | SEEK_STAT | DRQ_STAT | BUSY_STAT;
     s->io_buffer_index = 0;
     s->io_buffer_size = 0;
-    ide_dma_start(s, ide_write_dma_cb);
+    if (bdrv_is_cached(s->bs))
+        ide_dma_start(s, ide_write_dma_cb_buffered);
+    else
+        ide_dma_start(s, ide_write_dma_cb_unbuffered);
 }
 
 static void ide_atapi_cmd_ok(IDEState *s)
Index: qemu/exec.c
===================================================================
--- qemu.orig/exec.c	2007-12-03 09:53:30.000000000 +0100
+++ qemu/exec.c	2007-12-03 09:54:53.000000000 +0100
@@ -2054,6 +2054,25 @@ void cpu_register_physical_memory(target
     }
 }
 
+uint8_t * cpu_physical_page_addr(target_phys_addr_t addr)
+{
+    target_phys_addr_t page;
+    unsigned long pd;
+    PhysPageDesc *p;
+    unsigned long addr1;
+
+    page = addr & TARGET_PAGE_MASK;
+    p = phys_page_find(page >> TARGET_PAGE_BITS);
+    if (!p)
+       return (uint8_t*)-1;
+
+    pd = p->phys_offset;
+
+    addr1 = (pd & TARGET_PAGE_MASK) + (addr & ~TARGET_PAGE_MASK);
+
+    return phys_ram_base + addr1;
+}
+
 /* XXX: temporary until new memory mapping API */
 uint32_t cpu_get_physical_page_desc(target_phys_addr_t addr)
 {
Index: qemu/cpu-all.h
===================================================================
--- qemu.orig/cpu-all.h	2007-12-03 09:53:30.000000000 +0100
+++ qemu/cpu-all.h	2007-12-03 09:54:53.000000000 +0100
@@ -837,6 +837,7 @@ int cpu_register_io_memory(int io_index,
 CPUWriteMemoryFunc **cpu_get_io_memory_write(int io_index);
 CPUReadMemoryFunc **cpu_get_io_memory_read(int io_index);
 
+extern uint8_t * cpu_physical_page_addr(target_phys_addr_t addr);
 void cpu_physical_memory_rw(target_phys_addr_t addr, uint8_t *buf,
                             int len, int is_write);
 static inline void cpu_physical_memory_read(target_phys_addr_t addr,

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 10:09   ` [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O Laurent Vivier
@ 2007-12-03 10:23     ` Fabrice Bellard
  2007-12-03 10:30       ` Laurent Vivier
  2007-12-03 11:14       ` Johannes Schindelin
  0 siblings, 2 replies; 31+ messages in thread
From: Fabrice Bellard @ 2007-12-03 10:23 UTC (permalink / raw)
  To: Laurent Vivier, qemu-devel

Laurent Vivier wrote:
> This patch enhances the "-drive ,cache=off" mode with IDE drive emulation
> by removing the buffer used in the IDE emulation.
> ---
>  block.c     |   10 +++
>  block.h     |    2 
>  block_int.h |    1 
>  cpu-all.h   |    1 
>  exec.c      |   19 ++++++
>  hw/ide.c    |  176 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
>  vl.c        |    1 
>  7 files changed, 204 insertions(+), 6 deletions(-)

What's the use of keeping the buffered case?

Fabrice.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 10:23     ` Fabrice Bellard
@ 2007-12-03 10:30       ` Laurent Vivier
  2007-12-03 11:40         ` Markus Hitter
  2007-12-03 15:54         ` Anthony Liguori
  2007-12-03 11:14       ` Johannes Schindelin
  1 sibling, 2 replies; 31+ messages in thread
From: Laurent Vivier @ 2007-12-03 10:30 UTC (permalink / raw)
  To: Fabrice Bellard; +Cc: qemu-devel

[-- Attachment #1: Type: text/plain, Size: 1079 bytes --]

On Monday 03 December 2007 at 11:23 +0100, Fabrice Bellard wrote:
> Laurent Vivier wrote:
> > This patch enhances the "-drive ,cache=off" mode with IDE drive emulation
> > by removing the buffer used in the IDE emulation.
> > ---
> >  block.c     |   10 +++
> >  block.h     |    2 
> >  block_int.h |    1 
> >  cpu-all.h   |    1 
> >  exec.c      |   19 ++++++
> >  hw/ide.c    |  176 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
> >  vl.c        |    1 
> >  7 files changed, 204 insertions(+), 6 deletions(-)
> 
> What's the use of keeping the buffered case ?

Well, I don't like to remove code written by others...
and I don't want to break anything.

But if you think I should remove the buffered case, I can.

BTW, do you think I should enable "cache=off" by default?
Or even remove the option from the command line and always use
O_DIRECT?

Regards,
Laurent
-- 
------------- Laurent.Vivier@bull.net  --------------
       "Any sufficiently advanced technology is
  indistinguishable from magic." - Arthur C. Clarke

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 10:23     ` Fabrice Bellard
  2007-12-03 10:30       ` Laurent Vivier
@ 2007-12-03 11:14       ` Johannes Schindelin
  1 sibling, 0 replies; 31+ messages in thread
From: Johannes Schindelin @ 2007-12-03 11:14 UTC (permalink / raw)
  To: Fabrice Bellard; +Cc: Laurent Vivier, qemu-devel

Hi,

On Mon, 3 Dec 2007, Fabrice Bellard wrote:

> Laurent Vivier wrote:
> > This patch enhances the "-drive ,cache=off" mode with IDE drive emulation
> > by removing the buffer used in the IDE emulation.
> > ---
> >  block.c     |   10 +++
> >  block.h     |    2 
> >  block_int.h |    1 
> >  cpu-all.h   |    1 
> >  exec.c      |   19 ++++++
> >  hw/ide.c    |  176 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
> >  vl.c        |    1 
> >  7 files changed, 204 insertions(+), 6 deletions(-)
> 
> What's the use of keeping the buffered case ?

AFAICT if your guest is DOS without a disk caching driver, you do not 
really want to use O_DIRECT.

Ciao,
Dscho

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 10:30       ` Laurent Vivier
@ 2007-12-03 11:40         ` Markus Hitter
  2007-12-03 15:39           ` Paul Brook
  2007-12-03 15:54         ` Anthony Liguori
  1 sibling, 1 reply; 31+ messages in thread
From: Markus Hitter @ 2007-12-03 11:40 UTC (permalink / raw)
  To: qemu-devel


On 03.12.2007 at 11:30, Laurent Vivier wrote:

> But if you think I should remove the buffered case, I can.

When in doubt, less code is always better. In the unlikely case you break
something badly, there's always the option of reverting the patch.

> BTW, do you think I should enable "cache=off" by default ?


This would be fine for a transition phase, but the cache=on case would
likely be forgotten and never removed later. So, do it now.


my $ 0.02,
Markus

- - - - - - - - - - - - - - - - - - -
Dipl. Ing. Markus Hitter
http://www.jump-ing.de/

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 11:40         ` Markus Hitter
@ 2007-12-03 15:39           ` Paul Brook
  2007-12-03 19:26             ` Samuel Thibault
  0 siblings, 1 reply; 31+ messages in thread
From: Paul Brook @ 2007-12-03 15:39 UTC (permalink / raw)
  To: qemu-devel; +Cc: Markus Hitter

On Monday 03 December 2007, Markus Hitter wrote:
> On 03.12.2007 at 11:30, Laurent Vivier wrote:
> > But if you think I should remove the buffered case, I can.
>
> In doubt, less code is always better. For the unlikely case you broke
> something badly, there's always the option to take back the patch.
>
> > BTW, do you think I should enable "cache=off" by default ?
>
> This would be fine for a transition phase, but likely, the cache=on
> case gets forgotten to be removed later. So, do it now.

I think host caching is still useful enough to be enabled by default, and 
provides a significant performance increase in several cases. 

- The guest typically has a relatively small quantity of RAM, compared to a 
modern machine.  Allowing the host OS to act as a demand-based L2 cache 
allows this to be used without having to dedicate excessive quantities of RAM 
to qemu.
- I've seen reports that it significantly speeds up the Windows installer.
- Host cache is persistent between multiple qemu runs. If you're doing anything 
that requires frequent guest reboots (e.g. kernel debugging) this is going to 
be a huge win.
- You're running a guest OS that has limited or no caching (e.g. DOS).

I'd hope that the host OS would have cache use heuristics that would help 
limit cache pollution.

Paul

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 10:30       ` Laurent Vivier
  2007-12-03 11:40         ` Markus Hitter
@ 2007-12-03 15:54         ` Anthony Liguori
  2007-12-03 17:08           ` Samuel Thibault
  2007-12-03 19:00           ` Laurent Vivier
  1 sibling, 2 replies; 31+ messages in thread
From: Anthony Liguori @ 2007-12-03 15:54 UTC (permalink / raw)
  To: qemu-devel

Laurent Vivier wrote:
> On Monday 03 December 2007 at 11:23 +0100, Fabrice Bellard wrote:
>   
>> Laurent Vivier wrote:
>>     
>>> This patch enhances the "-drive ,cache=off" mode with IDE drive emulation
>>> by removing the buffer used in the IDE emulation.
>>> ---
>>>  block.c     |   10 +++
>>>  block.h     |    2 
>>>  block_int.h |    1 
>>>  cpu-all.h   |    1 
>>>  exec.c      |   19 ++++++
>>>  hw/ide.c    |  176 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
>>>  vl.c        |    1 
>>>  7 files changed, 204 insertions(+), 6 deletions(-)
>>>       
>> What's the use of keeping the buffered case ?
>>     
>
> Well, I don't like to remove code written by others...
> and I don't want to break something.
>
> But if you think I should remove the buffered case, I can.
>
> BTW, do you think I should enable "cache=off" by default ?
> Or even remove the option from the command line and always use
> O_DIRECT ?
>   

Hi Laurent,

Have you done any performance testing?  Buffered IO should absolutely 
beat direct IO simply because buffered IO allows writes to complete 
before they actually hit disk.  I've observed this myself.  Plus the 
host typically has a much larger page cache than the guest so the second 
level of caching helps an awful lot.

Regards,

Anthony Liguori

> Regards,
> Laurent
>   

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 15:54         ` Anthony Liguori
@ 2007-12-03 17:08           ` Samuel Thibault
  2007-12-03 17:17             ` Paul Brook
  2007-12-03 18:06             ` Anthony Liguori
  2007-12-03 19:00           ` Laurent Vivier
  1 sibling, 2 replies; 31+ messages in thread
From: Samuel Thibault @ 2007-12-03 17:08 UTC (permalink / raw)
  To: qemu-devel

Anthony Liguori, on Mon 03 Dec 2007 09:54:47 -0600, wrote:
> Have you done any performance testing?  Buffered IO should absolutely 
> beat direct IO simply because buffered IO allows writes to complete 
> before they actually hit disk.

Since qemu can use the aio interface, that shouldn't matter.
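
For illustration, a minimal use of the POSIX AIO interface referred to
here (a standalone sketch, not QEMU code; on glibc it is provided by
librt, so link with -lrt):

    #include <aio.h>
    #include <errno.h>
    #include <string.h>
    #include <unistd.h>

    /* Submit one asynchronous 512-byte read and wait for it to finish. */
    static ssize_t aio_read_sector(int fd, void *buf)
    {
        struct aiocb cb;

        memset(&cb, 0, sizeof(cb));
        cb.aio_fildes = fd;
        cb.aio_buf    = buf;
        cb.aio_nbytes = 512;
        cb.aio_offset = 0;

        if (aio_read(&cb) < 0)                  /* returns once the request is queued */
            return -1;
        while (aio_error(&cb) == EINPROGRESS)   /* real code would use a completion   */
            usleep(1000);                       /* callback or poll from the main loop */
        return aio_return(&cb);                 /* bytes read, or -1 on error */
    }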

Samuel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 17:08           ` Samuel Thibault
@ 2007-12-03 17:17             ` Paul Brook
  2007-12-03 17:49               ` Jamie Lokier
  2007-12-03 18:06             ` Anthony Liguori
  1 sibling, 1 reply; 31+ messages in thread
From: Paul Brook @ 2007-12-03 17:17 UTC (permalink / raw)
  To: qemu-devel; +Cc: Samuel Thibault

On Monday 03 December 2007, Samuel Thibault wrote:
> Anthony Liguori, on Mon 03 Dec 2007 09:54:47 -0600, wrote:
> > Have you done any performance testing?  Buffered IO should absolutely
> > beat direct IO simply because buffered IO allows writes to complete
> > before they actually hit disk.
>
> Since qemu can use the aio interface, that shouldn't matter.

Only if the emulated hardware and guest OS support multiple concurrent 
commands.  IDE supports async operation, but not concurrent commands. In 
practice this means you only get full performance if you're using the SCSI 
emulation.

Paul

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 17:17             ` Paul Brook
@ 2007-12-03 17:49               ` Jamie Lokier
  2007-12-03 18:08                 ` Anthony Liguori
  0 siblings, 1 reply; 31+ messages in thread
From: Jamie Lokier @ 2007-12-03 17:49 UTC (permalink / raw)
  To: qemu-devel; +Cc: Samuel Thibault

Paul Brook wrote:
> On Monday 03 December 2007, Samuel Thibault wrote:
> > Anthony Liguori, on Mon 03 Dec 2007 09:54:47 -0600, wrote:
> > > Have you done any performance testing?  Buffered IO should absolutely
> > > beat direct IO simply because buffered IO allows writes to complete
> > > before they actually hit disk.
> >
> > Since qemu can use the aio interface, that shouldn't matter.
> 
> Only if the emulated hardware and guest OS support multiple concurrent 
> commands.  IDE supports async operation, but not concurrent commands. In 
> practice this means you only get full performance if you're using the SCSI 
> emulation.

With the IDE emulation, when the emulated "disk write cache" flag is
on it may be reasonable to report a write as completed when the AIO is
dispatched, without waiting for the AIO to complete. 

An IDE flush cache command would wait for all outstanding write AIOs
to complete, and then issue a flush cache (fdatasync) to the real
device before reporting it has completed.

That's roughly equivalent to what an IDE disk with write caching does,
and it would provide exactly the guarantees for safe storage to the
real physical medium that a journalling filesystem or database in the
guest requires.
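
A rough sketch of that scheme, expressed with the QEMU callbacks already
used in this thread (the helper names and the outstanding-write counter
are invented for illustration; this is not an actual patch):

    /* Assumes IDEState, bdrv_aio_write(), bdrv_flush() and ide_set_irq()
     * as used elsewhere in this thread. */
    static int outstanding_writes;

    static void write_done_cb(void *opaque, int ret)
    {
        outstanding_writes--;                 /* data reached the backing file */
    }

    static void ide_cached_write(IDEState *s, int64_t sector, uint8_t *buf, int n)
    {
        outstanding_writes++;
        bdrv_aio_write(s->bs, sector, buf, n, write_done_cb, s);
        /* Emulated "disk write cache" on: report completion to the guest
         * immediately, without waiting for the AIO to finish. */
        s->status = READY_STAT | SEEK_STAT;
        ide_set_irq(s);
    }

    static void ide_flush_cache(IDEState *s)
    {
        /* FLUSH CACHE: drain every outstanding write AIO, then flush the
         * host device before signalling completion. */
        while (outstanding_writes > 0)
            qemu_aio_wait();                  /* assumed "wait for one AIO" helper */
        bdrv_flush(s->bs);                    /* fdatasync of the backing file */
        s->status = READY_STAT | SEEK_STAT;
        ide_set_irq(s);
    }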

If a guest doesn't use journalling with IDE write cache safely
(e.g. 2.4 Linux and earler), it can simply turn off the IDE "disk
write cache" flag, which is what it has to do on a real physical disk
too.

Terminating the qemu process abruptly might cancel some AIOs, but even
that is ok, as it's equivalent to pulling the power on a real disk
with uncommitted cached writes.

-- Jamie

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 17:08           ` Samuel Thibault
  2007-12-03 17:17             ` Paul Brook
@ 2007-12-03 18:06             ` Anthony Liguori
  2007-12-03 19:10               ` Laurent Vivier
  2007-12-03 19:14               ` Paul Brook
  1 sibling, 2 replies; 31+ messages in thread
From: Anthony Liguori @ 2007-12-03 18:06 UTC (permalink / raw)
  To: qemu-devel

Samuel Thibault wrote:
> Anthony Liguori, on Mon 03 Dec 2007 09:54:47 -0600, wrote:
>   
>> Have you done any performance testing?  Buffered IO should absolutely 
>> beat direct IO simply because buffered IO allows writes to complete 
>> before they actually hit disk.
>>     
>
> Since qemu can use the aio interface, that shouldn't matter.
>   

Well, let's separate a few things.  QEMU uses posix-aio which uses 
threads and normal read/write operations.  It also limits the number of 
threads that aio uses to 1 which effectively makes everything 
synchronous anyway.
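
For reference, the glibc knob behind that single-thread behaviour is the
aio_init() GNU extension; a minimal sketch (illustrative only, not a
quote of QEMU's block-raw.c):

    #define _GNU_SOURCE            /* struct aioinit / aio_init() are GNU extensions */
    #include <aio.h>
    #include <string.h>

    static void limit_posix_aio_threads(void)
    {
        struct aioinit ai;

        memset(&ai, 0, sizeof(ai));
        ai.aio_threads = 1;        /* one worker thread: requests are serialized   */
        ai.aio_num     = 1;        /* expected number of simultaneous requests     */
        aio_init(&ai);             /* call before the first aio_read()/aio_write() */
    }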

But it still doesn't matter.  When you issue a write() on an O_DIRECT 
fd, the write does not complete until the data has made its way to 
disk.  The guest can still run if you're using O_NONBLOCK but the IDE 
device will not submit another IO request until you complete the DMA 
operation.

The SCSI device supports multiple outstanding operations, but it's 
limited to 16, and you'll never see more than one request at a time in 
QEMU currently because of the limitation to a single thread.

Regards,

Anthony Liguori

> Samuel
>
>
>
>   

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 17:49               ` Jamie Lokier
@ 2007-12-03 18:08                 ` Anthony Liguori
  2007-12-03 18:40                   ` Jamie Lokier
  0 siblings, 1 reply; 31+ messages in thread
From: Anthony Liguori @ 2007-12-03 18:08 UTC (permalink / raw)
  To: qemu-devel; +Cc: Samuel Thibault

Jamie Lokier wrote:
> Paul Brook wrote:
>   
>> On Monday 03 December 2007, Samuel Thibault wrote:
>>     
>>> Anthony Liguori, on Mon 03 Dec 2007 09:54:47 -0600, wrote:
>>>       
>>>> Have you done any performance testing?  Buffered IO should absolutely
>>>> beat direct IO simply because buffered IO allows writes to complete
>>>> before they actually hit disk.
>>>>         
>>> Since qemu can use the aio interface, that shouldn't matter.
>>>       
>> Only if the emulated hardware and guest OS support multiple concurrent 
>> commands.  IDE supports async operation, but not concurrent commands. In 
>> practice this means you only get full performance if you're using the SCSI 
>> emulation.
>>     
>
> With the IDE emulation, when the emulated "disk write cache" flag is
> on it may be reasonable to report a write as completed when the AIO is
> dispatched, without waiting for the AIO to complete. 
>
> An IDE flush cache command would wait for all outstanding write AIOs
> to complete, and then issue a flush cache (fdatasync) to the real
> device before reporting it has completed.
>
> That's roughly equivalent to what an IDE disk with write caching does,
> and it would provide exactly the guarantees for safe storage to the
> real physical medium that a journalling filesystem or database in the
> guest requires.
>
> If a guest doesn't use journalling with IDE write cache safely
> (e.g. 2.4 Linux and earler), it can simply turn off the IDE "disk
> write cache" flag, which is what it has to do on a real physical disk
> too.
>
> Terminating the qemu process abruptly might cancel some AIOs, but even
> that is ok, as it's equivalent to pulling the power on a real disk
> with uncommitted cached writes.
>   

Except that in an enterprise environment, you typically have a battery-
backed disk cache.  It really doesn't matter though, because in QEMU today, 
submitting the request blocks until it's completed anyway (which is 
nearly instant since I/O is buffered).

Regards,

Anthony Liguori

> -- Jamie
>
>
>
>   

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 18:08                 ` Anthony Liguori
@ 2007-12-03 18:40                   ` Jamie Lokier
  0 siblings, 0 replies; 31+ messages in thread
From: Jamie Lokier @ 2007-12-03 18:40 UTC (permalink / raw)
  To: qemu-devel; +Cc: Samuel Thibault

Anthony Liguori wrote:
> >With the IDE emulation, when the emulated "disk write cache" flag is
> >on it may be reasonable to report a write as completed when the AIO is
> >dispatched, without waiting for the AIO to complete. 
> >
> >An IDE flush cache command would wait for all outstanding write AIOs
> >to complete, and then issue a flush cache (fdatasync) to the real
> >device before reporting it has completed.
> >
> >That's roughly equivalent to what an IDE disk with write caching does,
> >and it would provide exactly the guarantees for safe storage to the
> >real physical medium that a journalling filesystem or database in the
> >guest requires.
> 
> Except that in an enterprise environment, you typically have battery 
> backed disk cache.  It really doesn't matter though b/c in QEMU today, 
> submitting the request blocks until it's completed anyway (which is 
> nearly instant anyway since I/O is buffered).

Buffered I/O is less reliable in a sense.

With buffered I/O, if the host crashes, you may lose data that a
filesystem or database on the guest reported as committed to
applications.  That can result, on those rare occasions, in guest
journalled filesystem corruption (something that should be
impossible), and in database corruption or durability failure.

With direct I/O and write cache emulation (as described), when a guest
journalling filesystem or database reports data is committed, it has
much the same commitment/durability guarantee that the same
applications would have running on the host.  Namely, the data has
reached the disk, and the disk has reported it's committed.

This may matter if you want to run those sort of applications in a
guest, which clearly people often do, especially with KVM or Xen.

Anecdote: This is already a problem in some environments.  I have a
rented virtual machine; it's running UML.  The UML disk uses O_SYNC
writes (nowadays), because buffered host writes resulted in occasional
guest data loss, and journalled filesystem corruption.  Unfortunately,
this is a performance slowdown, but it's better than occasional
corruption.  I imagine similar things apply with Qemu machines
occasionally.

-- Jamie

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 15:54         ` Anthony Liguori
  2007-12-03 17:08           ` Samuel Thibault
@ 2007-12-03 19:00           ` Laurent Vivier
  1 sibling, 0 replies; 31+ messages in thread
From: Laurent Vivier @ 2007-12-03 19:00 UTC (permalink / raw)
  To: qemu-devel

[-- Attachment #1: Type: text/plain, Size: 2480 bytes --]

On Monday 03 December 2007 at 09:54 -0600, Anthony Liguori wrote:
> Laurent Vivier wrote:
> > On Monday 03 December 2007 at 11:23 +0100, Fabrice Bellard wrote:
> >   
> >> Laurent Vivier wrote:
> >>     
> >>> This patch enhances the "-drive ,cache=off" mode with IDE drive emulation
> >>> by removing the buffer used in the IDE emulation.
> >>> ---
> >>>  block.c     |   10 +++
> >>>  block.h     |    2 
> >>>  block_int.h |    1 
> >>>  cpu-all.h   |    1 
> >>>  exec.c      |   19 ++++++
> >>>  hw/ide.c    |  176 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
> >>>  vl.c        |    1 
> >>>  7 files changed, 204 insertions(+), 6 deletions(-)
> >>>       
> >> What's the use of keeping the buffered case ?
> >>     
> >
> > Well, I don't like to remove code written by others...
> > and I don't want to break something.
> >
> > But if you think I should remove the buffered case, I can.
> >
> > BTW, do you think I should enable "cache=off" by default ?
> > Or even remove the option from the command line and always use
> > O_DIRECT ?
> >   
> 
> Hi Laurent,

Hi Anthony,

> Have you done any performance testing?  Buffered IO should absolutely 
> beat direct IO simply because buffered IO allows writes to complete 
> before they actually hit disk.  I've observed this myself.  Plus the 
> host typically has a much larger page cache than the guest so the second 
> level of caching helps an awful lot.

I don't have real benchmarks. I just saw some improvements with dbench
(which is not a good benchmark, I know...).

Direct I/O can be good in some cases (because it avoids multiple copies),
and buffered I/O better in others (because it avoids disk accesses and, as
you say, doesn't wait for I/O completion).

But there are at least two other good reasons to use direct I/O:

- reliability: by avoiding the cache we improve the probability that the
data are on disk (and preserve the ordering of I/O). And as you say, since
we wait for write completion, we are sure the data have been written.

- isolation: it avoids polluting the host cache with guest data (and with
several guests, it avoids them impacting each other at the cache level).

But there is no perfect solution, which is why I think it is a good thing
to leave the choice to the user.

Laurent
-- 
------------- Laurent.Vivier@bull.net  --------------
       "Any sufficiently advanced technology is
  indistinguishable from magic." - Arthur C. Clarke

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 18:06             ` Anthony Liguori
@ 2007-12-03 19:10               ` Laurent Vivier
  2007-12-03 19:16                 ` Paul Brook
  2007-12-03 21:13                 ` Gerd Hoffmann
  2007-12-03 19:14               ` Paul Brook
  1 sibling, 2 replies; 31+ messages in thread
From: Laurent Vivier @ 2007-12-03 19:10 UTC (permalink / raw)
  To: qemu-devel

[-- Attachment #1: Type: text/plain, Size: 2040 bytes --]

On Monday 03 December 2007 at 12:06 -0600, Anthony Liguori wrote:
> Samuel Thibault wrote:
> > Anthony Liguori, on Mon 03 Dec 2007 09:54:47 -0600, wrote:
> >   
> >> Have you done any performance testing?  Buffered IO should absolutely 
> >> beat direct IO simply because buffered IO allows writes to complete 
> >> before they actually hit disk.
> >>     
> >
> > Since qemu can use the aio interface, that shouldn't matter.
> >   
> 
> Well, let's separate a few things.  QEMU uses posix-aio which uses 
> threads and normal read/write operations.  It also limits the number of 
> threads that aio uses to 1 which effectively makes everything 
> synchronous anyway.

Yes, librt provides posix-aio, and the librt coming with GNU libc uses
threads.
But if I remember correctly the librt coming with RHEL uses a mix of threads
and linux kernel AIO (you can have a look at the .srpm of libc).

There is also the libaio I wrote some years ago (with Sébastien Dugué)
which is purely linux kernel AIO (but the kernel patches were never merged
because of Zach Brown's Asynchronous System Call work).

BTW, if everyone thinks it could be a good idea I can port block-raw.c
to use linux kernel AIO (without removing POSIX AIO support, of course).

> But it still doesn't matter.  When you issue a write() on an O_DIRECT 
> fd, the write does not complete until the data has made it's way to 
> disk.  The guest can still run if you're using O_NONBLOCK but the IDE 
> device will not submit another IO request until you complete the DMA 
> operation.
> 
> The SCSI device supports multiple outstanding operations but it's 
> limited to 16 but you'll never see more than one request at a time in 
> QEMU currently because of the limitation to a single thread.
> 
> Regards,
> 
> Anthony Liguori
> 
> > Samuel
> >
> >
> >
> >   
> 
> 
> 
-- 
------------- Laurent.Vivier@bull.net  --------------
       "Any sufficiently advanced technology is
  indistinguishable from magic." - Arthur C. Clarke

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 18:06             ` Anthony Liguori
  2007-12-03 19:10               ` Laurent Vivier
@ 2007-12-03 19:14               ` Paul Brook
  1 sibling, 0 replies; 31+ messages in thread
From: Paul Brook @ 2007-12-03 19:14 UTC (permalink / raw)
  To: qemu-devel

> Well, let's separate a few things.  QEMU uses posix-aio which uses
> threads and normal read/write operations.  It also limits the number of
> threads that aio uses to 1 which effectively makes everything
> synchronous anyway.

This is a bug. Allegedly this is to work around an old broken glibc, so we 
should probably make it conditional on old glibc.

Paul

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 19:10               ` Laurent Vivier
@ 2007-12-03 19:16                 ` Paul Brook
  2007-12-03 21:36                   ` Anthony Liguori
  2007-12-04  8:13                   ` Laurent Vivier
  2007-12-03 21:13                 ` Gerd Hoffmann
  1 sibling, 2 replies; 31+ messages in thread
From: Paul Brook @ 2007-12-03 19:16 UTC (permalink / raw)
  To: qemu-devel; +Cc: Laurent Vivier

> Yes, librt is providing posix-aio, and librt coming with GNU libc uses
> threads.
> But if I remember correctly librt coming with RHEL uses a mix of threads
> and linux kernel AIO (you can have a look to the .srpm of libc).
>
> BTW, if everyone thinks it could be a good idea I can port block-raw.c
> to use linux kernel AIO (without removing POSIX AIO support, of course)

This seems rather pointless, given a user can just use a linux-AIO librt 
instead.

Paul

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 15:39           ` Paul Brook
@ 2007-12-03 19:26             ` Samuel Thibault
  0 siblings, 0 replies; 31+ messages in thread
From: Samuel Thibault @ 2007-12-03 19:26 UTC (permalink / raw)
  To: qemu-devel; +Cc: Markus Hitter

Paul Brook, on Mon 03 Dec 2007 15:39:48 +0000, wrote:
> I think host caching is still useful enough to be enabled by default, and 
> provides a significant performance increase in several cases. 
> 
> - The guest typically has a relatively small quantity of RAM, compared to a 
> modern machine.  Allowing the host OS to act as a demand-based L2 cache 
> allows this to be used without having to dedicate excessive quantities of RAM 
> to qemu.
> - I've seen reports that it significantly speeds up the Windows installer.
> - Host cache is persistent between multiple qemu runs. If you're doing anything 
> that requires frequent guest reboots (e.g. kernel debugging) this is going to 
> be a huge win.
> - You're running a guest OS that has limited or no caching (e.g. DOS).

Yes, and in other cases (e.g. real-production KVM/Xen servers), this is
just cache duplication.

> I'd hope that the host OS would have cache use heuristics that would help 
> limit cache pollution.

How could it?  It can't detect that the guest also has a buffer/page
cache.

Samuel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 19:10               ` Laurent Vivier
  2007-12-03 19:16                 ` Paul Brook
@ 2007-12-03 21:13                 ` Gerd Hoffmann
  2007-12-03 21:23                   ` Samuel Thibault
  2007-12-03 21:38                   ` Anthony Liguori
  1 sibling, 2 replies; 31+ messages in thread
From: Gerd Hoffmann @ 2007-12-03 21:13 UTC (permalink / raw)
  To: qemu-devel

  Hi,

> BTW, if everyone thinks it could be a good idea I can port block-raw.c
> to use linux kernel AIO (without removing POSIX AIO support, of course)

IMHO it would be a much better idea to kill the aio interface altogether
and instead make the block drivers reentrant.  Then you can use
(multiple) posix threads to run the I/O async if you want.
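
A minimal standalone sketch of that idea, doing the I/O synchronously in
a worker thread and signalling completion back to the main loop over a
pipe (all names here are invented for illustration; a real version would
keep a fixed pool of threads and a request queue):

    #include <pthread.h>
    #include <stdlib.h>
    #include <sys/types.h>
    #include <unistd.h>

    typedef struct IORequest {
        int fd;
        void *buf;
        size_t len;
        off_t offset;
        void (*cb)(struct IORequest *req, ssize_t ret);
    } IORequest;

    /* pipe(completion_pipe) is assumed to be called once at startup; the
     * read end is watched by the main select() loop. */
    static int completion_pipe[2];

    static void *io_worker(void *opaque)
    {
        IORequest *req = opaque;
        ssize_t ret = pread(req->fd, req->buf, req->len, req->offset);

        req->cb(req, ret);                     /* runs in the worker thread */
        write(completion_pipe[1], "x", 1);     /* wake up the main loop     */
        return NULL;
    }

    static void submit_read(IORequest *req)
    {
        pthread_t tid;

        pthread_create(&tid, NULL, io_worker, req);
        pthread_detach(tid);
    }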

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 21:13                 ` Gerd Hoffmann
@ 2007-12-03 21:23                   ` Samuel Thibault
  2007-12-03 21:38                   ` Anthony Liguori
  1 sibling, 0 replies; 31+ messages in thread
From: Samuel Thibault @ 2007-12-03 21:23 UTC (permalink / raw)
  To: qemu-devel

Gerd Hoffmann, on Mon 03 Dec 2007 22:13:07 +0100, wrote:
> > BTW, if everyone thinks it could be a good idea I can port block-raw.c
> > to use linux kernel AIO (without removing POSIX AIO support, of course)
> 
> IMHO it would be a much better idea to kill the aio interface altogether
> and instead make the block drivers reentrant.  Then you can use
> (multiple) posix threads to run the I/O async if you want.

Mmm, that will not make my life easier... I'm precisely trying to avoid
threads so as to get better throughput.

Samuel

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 19:16                 ` Paul Brook
@ 2007-12-03 21:36                   ` Anthony Liguori
  2007-12-04 12:49                     ` Gerd Hoffmann
  2007-12-04  8:13                   ` Laurent Vivier
  1 sibling, 1 reply; 31+ messages in thread
From: Anthony Liguori @ 2007-12-03 21:36 UTC (permalink / raw)
  To: qemu-devel; +Cc: Laurent Vivier

Paul Brook wrote:
>> Yes, librt is providing posix-aio, and librt coming with GNU libc uses
>> threads.
>> But if I remember correctly librt coming with RHEL uses a mix of threads
>> and linux kernel AIO (you can have a look to the .srpm of libc).
>>
>> BTW, if everyone thinks it could be a good idea I can port block-raw.c
>> to use linux kernel AIO (without removing POSIX AIO support, of course)
>>     
>
> This seems rather pointless, given a user can just use a linux-AIO librt 
> instead.
>   

Not at all.  linux-aio is the only interface that allows you to do an 
asynchronous fdsync, which simulates a barrier and allows for an 
ordered queue.

I have a patch that uses linux-aio for the virtio-blk driver, which I'll be 
posting tomorrow, and I'm extremely happy with the results.  In recent 
kernels, you can use an eventfd interface along with linux-aio so that 
polling is unnecessary.  Along with O_DIRECT and the preadv/pwritev 
interface, you can make a block backend in userspace that performs just 
as well as if it were in the kernel.

The posix-aio interface simply doesn't provide a mechanism to do these 
things.

Regards,

Anthony Liguori

> Paul
>
>
>
>   

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 21:13                 ` Gerd Hoffmann
  2007-12-03 21:23                   ` Samuel Thibault
@ 2007-12-03 21:38                   ` Anthony Liguori
  2007-12-04 13:21                     ` Gerd Hoffmann
  1 sibling, 1 reply; 31+ messages in thread
From: Anthony Liguori @ 2007-12-03 21:38 UTC (permalink / raw)
  To: qemu-devel

Gerd Hoffmann wrote:
>   Hi,
>
>   
>> BTW, if everyone thinks it could be a good idea I can port block-raw.c
>> to use linux kernel AIO (without removing POSIX AIO support, of course)
>>     
>
> IMHO it would be a much better idea to kill the aio interface altogether
> and instead make the block drivers reentrant.  Then you can use
> (multiple) posix threads to run the I/O async if you want.
>   

Threads are a poor substitute for a proper AIO interface.  linux-aio 
gives you everything you could possibly want in an interface since it 
allows you to submit multiple vectored operations in a single syscall, 
use an fd to signal request completion, complete multiple requests in a 
single syscall, and inject barriers via fdsync.
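
For instance (a sketch only, with made-up file name and sizes, and noting 
that whether an asynchronous fdsync is actually honoured depends on the 
kernel and filesystem), batching two writes plus an fdsync into a single 
submission could look like this:

    #define _GNU_SOURCE                              /* for O_DIRECT */
    #include <libaio.h>
    #include <fcntl.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        io_context_t ctx = 0;
        struct iocb w1, w2, barrier, *batch[3] = { &w1, &w2, &barrier };
        struct io_event ev[3];
        void *a, *b;

        int fd = open("disk.img", O_WRONLY | O_DIRECT);
        posix_memalign(&a, 512, 4096);               /* aligned buffers for O_DIRECT */
        posix_memalign(&b, 512, 4096);
        memset(a, 0xaa, 4096);
        memset(b, 0xbb, 4096);
        io_setup(64, &ctx);

        io_prep_pwrite(&w1, fd, a, 4096, 0);         /* two writes ...              */
        io_prep_pwrite(&w2, fd, b, 4096, 4096);
        io_prep_fdsync(&barrier, fd);                /* ... with an fdsync behind them */

        io_submit(ctx, 3, batch);                    /* all three in one syscall       */
        io_getevents(ctx, 3, 3, ev, NULL);           /* all completions in one as well */

        io_destroy(ctx);
        close(fd);
        return 0;
    }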

Regards,

Anthony Liguori

> cheers,
>   Gerd
>
>
>
>
>
>   

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 19:16                 ` Paul Brook
  2007-12-03 21:36                   ` Anthony Liguori
@ 2007-12-04  8:13                   ` Laurent Vivier
  1 sibling, 0 replies; 31+ messages in thread
From: Laurent Vivier @ 2007-12-04  8:13 UTC (permalink / raw)
  To: qemu-devel

On Monday 03 December 2007 at 19:16 +0000, Paul Brook wrote:
> > Yes, librt is providing posix-aio, and librt coming with GNU libc uses
> > threads.
> > But if I remember correctly librt coming with RHEL uses a mix of threads
> > and linux kernel AIO (you can have a look to the .srpm of libc).
> >
> > BTW, if everyone thinks it could be a good idea I can port block-raw.c
> > to use linux kernel AIO (without removing POSIX AIO support, of course)
> 
> This seems rather pointless, given a user can just use a linux-AIO librt 
> instead.

Just a comment: to use linux-aio, the file must be opened with O_DIRECT.
(It's a good reason to include my patch, isn't it?)

Laurent
-- 
------------- Laurent.Vivier@bull.net  --------------
       "Any sufficiently advanced technology is
  indistinguishable from magic." - Arthur C. Clarke

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 21:36                   ` Anthony Liguori
@ 2007-12-04 12:49                     ` Gerd Hoffmann
  2007-12-04 13:02                       ` Laurent Vivier
  0 siblings, 1 reply; 31+ messages in thread
From: Gerd Hoffmann @ 2007-12-04 12:49 UTC (permalink / raw)
  To: qemu-devel; +Cc: Laurent Vivier

Anthony Liguori wrote:
> I have a patch that uses linux-aio for the virtio-blk driver I'll be
> posting tomorrow and I'm extremely happy with the results.  In recent
> kernels, you can use an eventfd interface along with linux-aio so that
> polling is unnecessary.

Which kernel version is "recent"?

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-04 12:49                     ` Gerd Hoffmann
@ 2007-12-04 13:02                       ` Laurent Vivier
  0 siblings, 0 replies; 31+ messages in thread
From: Laurent Vivier @ 2007-12-04 13:02 UTC (permalink / raw)
  To: Gerd Hoffmann; +Cc: qemu-devel

On Tuesday 04 December 2007 at 13:49 +0100, Gerd Hoffmann wrote:
> Anthony Liguori wrote:
> > I have a patch that uses linux-aio for the virtio-blk driver I'll be
> > posting tomorrow and I'm extremely happy with the results.  In recent
> > kernels, you can use an eventfd interface along with linux-aio so that
> > polling is unnecessary.
> 
> Which kernel version is "recent"?

I think it is 2.6.22 and later.

Laurent
-- 
------------- Laurent.Vivier@bull.net  --------------
       "Any sufficiently advanced technology is
  indistinguishable from magic." - Arthur C. Clarke

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-03 21:38                   ` Anthony Liguori
@ 2007-12-04 13:21                     ` Gerd Hoffmann
  2007-12-04 15:03                       ` Anthony Liguori
  0 siblings, 1 reply; 31+ messages in thread
From: Gerd Hoffmann @ 2007-12-04 13:21 UTC (permalink / raw)
  To: qemu-devel

Anthony Liguori wrote:
>> IMHO it would be a much better idea to kill the aio interface altogether
>> and instead make the block drivers reentrant.  Then you can use
>> (multiple) posix threads to run the I/O async if you want.
> 
> Threads are a poor substitute for a proper AIO interface.  linux-aio
> gives you everything you could possibly want in an interface since it
> allows you to submit multiple vectored operations in a single syscall,
> use an fd to signal request completion, complete multiple requests in a
> single syscall, and inject barriers via fdsync.

I still think implementing async i/o at block driver level is the wrong
thing to do.  You'll end up reinventing the wheel over and over again
and add complexity to the block drivers which simply doesn't belong
there (or not supporting async I/O for most file formats).  Just look at
the insane file size of the block driver for the simplest possible disk
format: block-raw.c.  It will become even worse when adding a
linux-specific aio variant.

In contrast:  Making the disk drivers reentrant should be easy for most
of them.  For the raw driver it should be just using pread/pwrite
syscalls instead of lseek + read/write (also saves a syscall along the
way, yea!).  Others probably need an additional lock for metadata
updates.  With that in place you can easily implement async I/O via
threads one layer above, and only once, in block.c.
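
As a sketch of that point (illustrative helper names, not the actual 
block-raw.c code): pread() carries its own offset, so two threads can use 
the same file descriptor without racing on the shared seek position:

    #include <sys/types.h>
    #include <unistd.h>

    /* Non-reentrant: the fd's file offset is shared state, so another
     * thread can move it between the lseek() and the read(). */
    static ssize_t raw_read_seek(int fd, void *buf, size_t len, off_t offset)
    {
        if (lseek(fd, offset, SEEK_SET) == (off_t)-1)
            return -1;
        return read(fd, buf, len);
    }

    /* Reentrant: the offset travels with the call, no lock is needed,
     * and it is one syscall instead of two. */
    static ssize_t raw_read_at(int fd, void *buf, size_t len, off_t offset)
    {
        return pread(fd, buf, len, offset);
    }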

IMHO the only alternative to that scheme would be to turn the block
drivers in some kind of remapping drivers for the various file formats
which don't actually perform the I/O.  Then you can handle the actual
I/O in a generic way using whatever API is available, be it posix-aio,
linux-aio or slow-sync-io.

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-04 13:21                     ` Gerd Hoffmann
@ 2007-12-04 15:03                       ` Anthony Liguori
  2007-12-04 16:18                         ` Gerd Hoffmann
  0 siblings, 1 reply; 31+ messages in thread
From: Anthony Liguori @ 2007-12-04 15:03 UTC (permalink / raw)
  To: qemu-devel

Gerd Hoffmann wrote:
> Anthony Liguori wrote:
>   
>>> IMHO it would be a much better idea to kill the aio interface altogether
>>> and instead make the block drivers reentrant.  Then you can use
>>> (multiple) posix threads to run the I/O async if you want.
>>>       
>> Threads are a poor substitute for a proper AIO interface.  linux-aio
>> gives you everything you could possibly want in an interface since it
>> allows you to submit multiple vectored operations in a single syscall,
>> use an fd to signal request completion, complete multiple requests in a
>> single syscall, and inject barriers via fdsync.
>>     
>
> I still think implementing async i/o at block driver level is the wrong
> thing to do.  You'll end up reinventing the wheel over and over again
> and add complexity to the block drivers which simply doesn't belong
> there (or not supporting async I/O for most file formats).  Just look at
> the insane file size of the block driver for the simplest possible disk
> format: block-raw.c.  It will become even worse when adding a
> linux-specific aio variant.
>
> In contrast:  Making the disk drivers reentrant should be easy for most
> of them.  For the raw driver it should be just using pread/pwrite
> syscalls instead of lseek + read/write (also saves a syscall along the
> way, yea!).  Others probably need an additional lock for metadata
> updates.  With that in place you can easily implement async I/O via
> threads one layer above, and only once, in block.c.
>   

I really want to use readv/writev though.  With virtio, we get a 
scatter/gather list for each IO request.
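
To illustrate what that buys (GuestSGEntry and the 64-element cap are 
invented for the sketch, not the virtio-blk code), a whole scatter/gather 
request can be pushed down as one vectored call:

    #include <sys/types.h>
    #include <sys/uio.h>
    #include <stddef.h>

    struct GuestSGEntry {
        void   *host_addr;   /* guest buffer already mapped into host memory */
        size_t  len;
    };

    /* One writev() covers the whole request instead of a write() per
     * segment.  Note writev() takes no offset argument, hence the wish
     * for a pwritev()-style syscall. */
    static ssize_t sg_write(int fd, const struct GuestSGEntry *sg, int n)
    {
        struct iovec iov[64];            /* sketch assumes n <= 64 */
        int i;

        for (i = 0; i < n; i++) {
            iov[i].iov_base = sg[i].host_addr;
            iov[i].iov_len  = sg[i].len;
        }
        return writev(fd, iov, n);
    }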

Once I post the virtio-blk driver, I'll follow up a little later with 
some refactoring of the block device layers.  I think it can be made 
much simpler while still remaining asynchronous.

> IMHO the only alternative to that scheme would be to turn the block
> drivers in some kind of remapping drivers for the various file formats
> which don't actually perform the I/O.  Then you can handle the actual
> I/O in a generic way using whatever API is available, be it posix-aio,
> linux-aio or slow-sync-io.
>   

That's part of my plan.

Regards,

Anthony Liguori

> cheers,
>   Gerd
>
>
>
>   

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-04 15:03                       ` Anthony Liguori
@ 2007-12-04 16:18                         ` Gerd Hoffmann
  2007-12-05 14:47                           ` Anthony Liguori
  0 siblings, 1 reply; 31+ messages in thread
From: Gerd Hoffmann @ 2007-12-04 16:18 UTC (permalink / raw)
  To: qemu-devel

Anthony Liguori wrote:
> Gerd Hoffmann wrote:
  Hi,

> I really want to use readv/writev though.  With virtio, we get a
> scatter/gather list for each IO request.

Yep, I've also missed pwritev (or whatever that syscall would be named).

> Once I post the virtio-blk driver, I'll follow up a little later with
> some refactoring of the block device layers.  I think it can be made
> much simpler while still remaining asynchronous.
> 
>> IMHO the only alternative to that scheme would be to turn the block
>> drivers in some kind of remapping drivers for the various file formats
>> which don't actually perform the I/O.  Then you can handle the actual
>> I/O in a generic way using whatever API is available, be it posix-aio,
>> linux-aio or slow-sync-io.
> 
> That's part of my plan.

Oh, cool.  Can you also turn them into a sane shared library while you're
at it?  The current approach of compiling it once for qemu and once for
qemu-img with -DQEMU_TOOL isn't that great.  But if you factor out the
actual I/O, the block-raw.c code should have no need to mess with qemu
internals any more and can become much cleaner and simpler ...

cheers,
  Gerd

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O
  2007-12-04 16:18                         ` Gerd Hoffmann
@ 2007-12-05 14:47                           ` Anthony Liguori
  0 siblings, 0 replies; 31+ messages in thread
From: Anthony Liguori @ 2007-12-05 14:47 UTC (permalink / raw)
  To: qemu-devel

Gerd Hoffmann wrote:
> Anthony Liguori wrote:
>   
>> Gerd Hoffmann wrote:
>>     
>   Hi,
>
>   
>> I really want to use readv/writev though.  With virtio, we get a
>> scatter/gather list for each IO request.
>>     
>
> Yep, I've also missed pwritev (or whatever that syscall would be named).
>
>   
>> Once I post the virtio-blk driver, I'll follow up a little later with
>> some refactoring of the block device layers.  I think it can be made
>> much simpler while still remaining asynchronous.
>>
>>     
>>> IMHO the only alternative to that scheme would be to turn the block
>>> drivers in some kind of remapping drivers for the various file formats
>>> which don't actually perform the I/O.  Then you can handle the actual
>>> I/O in a generic way using whatever API is available, be it posix-aio,
>>> linux-aio or slow-sync-io.
>>>       
>> That's part of my plan.
>>     
>
> Oh, cool.  Can you also turn them into a sane shared library while you're
> at it?  The current approach of compiling it once for qemu and once for
> qemu-img with -DQEMU_TOOL isn't that great.  But if you factor out the
> actual I/O, the block-raw.c code should have no need to mess with qemu
> internals any more and can become much cleaner and simpler ...
>   

Yeah, it is definitely something that should be turned into a shared 
library.  I don't think I'll attempt that at first, but I do agree it's 
the right direction to move toward.

Regards,

Anthony Liguori

> cheers,
>   Gerd
>
>
>
>
>   

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2007-12-05 14:47 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-12-03 10:09 [Qemu-devel] [PATCH 0/2 v2] Open disk images with O_DIRECT Laurent Vivier
2007-12-03 10:09 ` [Qemu-devel] [PATCH 1/2 v2] Add "cache" parameter to "-drive" Laurent Vivier
2007-12-03 10:09   ` [Qemu-devel] [PATCH 2/2 v2] Direct IDE I/O Laurent Vivier
2007-12-03 10:23     ` Fabrice Bellard
2007-12-03 10:30       ` Laurent Vivier
2007-12-03 11:40         ` Markus Hitter
2007-12-03 15:39           ` Paul Brook
2007-12-03 19:26             ` Samuel Thibault
2007-12-03 15:54         ` Anthony Liguori
2007-12-03 17:08           ` Samuel Thibault
2007-12-03 17:17             ` Paul Brook
2007-12-03 17:49               ` Jamie Lokier
2007-12-03 18:08                 ` Anthony Liguori
2007-12-03 18:40                   ` Jamie Lokier
2007-12-03 18:06             ` Anthony Liguori
2007-12-03 19:10               ` Laurent Vivier
2007-12-03 19:16                 ` Paul Brook
2007-12-03 21:36                   ` Anthony Liguori
2007-12-04 12:49                     ` Gerd Hoffmann
2007-12-04 13:02                       ` Laurent Vivier
2007-12-04  8:13                   ` Laurent Vivier
2007-12-03 21:13                 ` Gerd Hoffmann
2007-12-03 21:23                   ` Samuel Thibault
2007-12-03 21:38                   ` Anthony Liguori
2007-12-04 13:21                     ` Gerd Hoffmann
2007-12-04 15:03                       ` Anthony Liguori
2007-12-04 16:18                         ` Gerd Hoffmann
2007-12-05 14:47                           ` Anthony Liguori
2007-12-03 19:14               ` Paul Brook
2007-12-03 19:00           ` Laurent Vivier
2007-12-03 11:14       ` Johannes Schindelin
