* [Qemu-devel] [PATCH 02/26] FVD: extend qemu-io to do fully automated testing
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 03/26] FVD: add fully automated test-qcow2.sh Chunqiang Tang
` (23 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch extends qemu-io in two ways. First, it adds the 'sim' command to
work with the simulated block device driver 'blksim', which allows a developer
to fully control the order of disk I/Os, the order of callbacks, and the
return code of every I/O operation. Second, it adds a fully automated testing
mode, 'qemu-io --auto'. This mode can, e.g., simulate 1,000 threads
concurrently submitting overlapping disk I/O requests to QEMU block drivers,
use blksim to inject I/O errors and race conditions, and automatically verify
the correctness of I/O results. This tool can run unattended to exercise an
unlimited number of randomized test cases. Once it finds a bug, the bug is
precisely repeatable with the help of blksim, even if it is a rare race
condition bug. This makes debugging much easier.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
qemu-io-auto.c | 947 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
qemu-io-sim.c | 127 ++++++++
qemu-io.c | 50 +++-
qemu-tool.c | 107 ++++++-
4 files changed, 1209 insertions(+), 22 deletions(-)
create mode 100644 qemu-io-auto.c
create mode 100644 qemu-io-sim.c
diff --git a/qemu-io-auto.c b/qemu-io-auto.c
new file mode 100644
index 0000000..73d79c7
--- /dev/null
+++ b/qemu-io-auto.c
@@ -0,0 +1,947 @@
+/*
+ * Extension of qemu-io to perform automated random tests
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/*=============================================================================
+ * This module implements a fully automated testing tool for block device
+ * drivers. It works with block/blksim.c to test race conditions by
+ * randomizing event timing. It is recommended to perform automated testing
+ * on a ramdisk or tmpfs, which stores files in memory and avoids wearing out
+ * the disk. Below is one example of using qemu-io to perform a fully
+ * automated testing.
+
+# mount -t tmpfs none /var/tmpfs -o size=4G
+# dd if=/dev/zero of=/var/tmpfs/truth.raw count=0 bs=1 seek=1G
+# dd if=/dev/zero of=/var/tmpfs/zero-500M.raw count=0 bs=1 seek=500M
+# qemu-img create -f qcow2 -obacking_fmt=blksim -b /var/tmpfs/zero-500M.raw \
+ /var/tmpfs/test.qcow2 1G
+# qemu-io --auto --seed=1 --truth=/var/tmpfs/truth.raw --format=qcow2 \
+ --test=blksim:/var/tmpfs/test.qcow2 --verify_write=true \
+ --compare_before=false --compare_after=true --round=100000 \
+ --parallel=1000 --io_size=10485760 --fail_prob=0 --cancel_prob=0 \
+ --instant_qemubh=true
+ *=============================================================================
+ */
+
+#include "qemu-timer.h"
+#include "qemu-common.h"
+#include "block_int.h"
+#include "block/blksim.h"
+
+/* Compile-time tracing switch: with '#if 1' QDEBUG compiles to nothing;
+ * change it to '#if 0' to route QDEBUG through printf for verbose traces. */
+#if 1
+# define QDEBUG(format,...) do {} while (0)
+#else
+# define QDEBUG printf
+#endif
+
+/* Report an unrecoverable test failure with source location, then abort()
+ * so a core dump is produced at the failure point. */
+#define die(format,...) \
+ do { \
+ fprintf (stderr, "%s:%d --- ", __FILE__, __LINE__); \
+ fprintf (stderr, format, ##__VA_ARGS__); \
+ abort(); \
+ } while(0)
+
+/* Request types issued by the testers. OP_NULL only reserves index 0 so
+ * that op_type_str[] indexes line up with the enum values. */
+typedef enum { OP_NULL = 0, OP_READ, OP_WRITE, OP_FLUSH,
+ OP_AIO_FLUSH } op_type_t;
+const char *op_type_str[] = { "NULL", "READ", "WRITE", "FLUSH", "AIO_FLUSH"};
+
+/* State of one in-flight full-image comparison between the truth image and
+ * the test image (see compare_full_images() / compare_full_images_cb()). */
+typedef struct CompareFullCB
+{
+ QEMUIOVector qiov;
+ struct iovec iov;
+ int64_t sector_num; /* next sector to compare */
+ int nb_sectors; /* sectors in the current chunk */
+ int max_nb_sectors; /* chunk size cap */
+ uint8_t *truth_buf; /* data read back from the truth file */
+} CompareFullCB;
+
+/* Per-tester state. Each of the 'parallel' testers keeps exactly one
+ * request in flight at any time. */
+typedef struct RandomIO
+{
+ QEMUIOVector qiov;
+ int64_t sector_num;
+ int nb_sectors;
+ uint8_t *truth_buf; /* expected data, read from the truth file */
+ uint8_t *test_buf; /* data written to / read back from the test image */
+ op_type_t type; /* operation currently in flight */
+ int tester; /* index of this tester in testers[] */
+ int64_t uuid; /* unique id of the current request, for tracing */
+ int allow_cancel; /* cleared while re-reading a just-written region */
+ BlockDriverAIOCB *acb;
+} RandomIO;
+
+/* Test configuration and run state, filled in by auto_test() from the
+ * command line; see auto_test_usage() for the meaning of each knob. */
+static int fd; /* file descriptor of the truth image */
+static int64_t total_sectors;
+static int64_t io_size = 262144; /* bytes from the CLI; later converted to sectors */
+static bool verify_write = false;
+static int parallel = 1;
+static int max_iov = 10;
+static int64_t round = 10;
+static int64_t finished_round = 0;
+static RandomIO *testers = NULL;
+static double fail_prob = 0;
+static double cancel_prob = 0;
+static double aio_flush_prob = 0;
+static double flush_prob = 0;
+static int64_t rand_time = 1000;
+static int64_t test_uuid = 0;
+static int finished_testers = 0;
+
+static void rand_io_cb(void *opaque, int ret);
+static void perform_next_io(RandomIO * r);
+
+/* Print the command-line usage of 'qemu-io --auto' and exit(1). */
+static void auto_test_usage(void)
+{
+ printf("%s --auto [--help]\n"
+ "\t[--truth=<truth_img>]\n"
+ "\t[--test=<img_to_test>]\n"
+ "\t[--seed=<#d>]\n"
+ "\t[--format=<test_img_fmt>]\n"
+ "\t[--round=<#d>]\n"
+ "\t[--instant_qemubh=<true|false>]\n"
+ "\t[--fail_prob=<#f>]\n"
+ "\t[--cancel_prob=<#f>]\n"
+ "\t[--aio_flush_prob=<#f>]\n"
+ "\t[--flush_prob=<#f>]\n"
+ "\t[--io_size=<#d>]\n"
+ "\t[--verify_write=[true|false]]\n"
+ "\t[--parallel=[#d]\n"
+ "\t[--max_iov=[#d]\n"
+ "\t[--compare_before=[true|false]]\n"
+ "\t[--compare_after=[true|false]]\n"
+ "\t[--rand_time=<#d>]\n"
+ "\t[--cache=<writethrough|writeback|none>]\n"
+ "\t[--create=<file>]\n"
+ "\t[--file_size=<#d>]\n"
+ "\t[--block_size=<#d>]\n"
+ "\t[--empty_block_prob=<#f>]\n"
+ "\t[--empty_block_chain=<#d>]\n"
+ "\n", progname);
+ exit(1);
+}
+
+/* Synchronously read from (do_read != 0) or write to the truth image at
+ * the given sector range. Loops until the full range is transferred,
+ * retrying short transfers and EINTR; any other error calls die().
+ * Returns 0 on success (die() aborts, so -1 is never actually returned). */
+static int truth_io(void *buf, int64_t sector_num, int nb_sectors, int do_read)
+{
+ off_t offset = sector_num * 512;
+ size_t size = nb_sectors * 512;
+
+ if (lseek(fd, offset, SEEK_SET) < 0) {
+ die("lseek\n");
+ }
+
+ while (size > 0) {
+ int r;
+ if (do_read) {
+ r = read(fd, buf, size);
+ } else {
+ r = write(fd, buf, size);
+ }
+ if (r >= 0) {
+ /* Partial transfer: advance and keep going. */
+ size -= r;
+ offset += r;
+ buf = (void *)(((char *)buf) + r);
+ } else if (errno != EINTR) {
+ die("I/O error on the truth file.\n");
+ return -1; /* not reached: die() aborts */
+ }
+ }
+
+ return 0;
+}
+
+/* Compare nb_sectors of test data against the truth data, sector by
+ * sector. Sectors whose first truth byte is 0 were never written by the
+ * test and are skipped (the test image may hold garbage there). On the
+ * first mismatch, dump a byte-level diff via QDEBUG and abort(). */
+static int verify(uint8_t * truth_buf, uint8_t * test_buf,
+ int64_t sector_num, int nb_sectors)
+{
+ int i;
+ for (i = 0; i < nb_sectors; i++) {
+ int64_t offset = i * 512;
+ if (truth_buf[offset] == (uint8_t)0) {
+ /* Skip this sector as data were never written to it before. It is
+ * possible that the test image has random garbage data in this
+ * sector. */
+ continue;
+ }
+
+ if (memcmp(&truth_buf[offset], &test_buf[offset], 512) != 0) {
+ int j;
+ fprintf(stderr, "Sector %"PRId64" differs, discovered by "
+ "process %d\n", sector_num + i, getpid());
+ QDEBUG("Sector %"PRId64" differs.\noffset\texpect\tactual\n",
+ sector_num + i);
+ for (j = 0; j < 512; j++) {
+ if (truth_buf[offset + j] == test_buf[offset + j]) {
+ QDEBUG("%02d:\t%02X\t%02X\n", j, truth_buf[offset + j],
+ test_buf[offset + j]);
+ } else {
+ QDEBUG("%02d:\t%02X\t%02X ***\n", j,
+ truth_buf[offset + j], test_buf[offset + j]);
+ }
+ }
+ abort();
+ return -1; /* not reached: abort() above */
+ }
+ }
+
+ return 0;
+}
+
+/* Completion callback of the full-image comparison read. On success,
+ * read the same range from the truth file, verify it, then chain the next
+ * chunk until the whole image has been compared. A failed read is simply
+ * resubmitted. */
+static void compare_full_images_cb(void *opaque, int ret)
+{
+ CompareFullCB *cf = opaque;
+
+ if (ret) {
+ /* Failed. Retry the operation. */
+ bdrv_aio_readv(bs, cf->sector_num, &cf->qiov, cf->nb_sectors,
+ compare_full_images_cb, cf);
+ return;
+ }
+
+ truth_io(cf->truth_buf, cf->sector_num, cf->nb_sectors, true);
+ verify(cf->truth_buf, cf->iov.iov_base, cf->sector_num, cf->nb_sectors);
+
+ cf->sector_num += cf->nb_sectors;
+ if (cf->sector_num >= total_sectors) {
+ /* Finished. Release buffers with the allocator-matching function:
+ * truth_buf came from qemu_memalign() and iov_base from
+ * qemu_blockalign(), so both must go through qemu_vfree(). */
+ qemu_vfree(cf->truth_buf);
+ qemu_vfree(cf->iov.iov_base);
+ qemu_free(cf);
+ return;
+ }
+
+ /* Read more data to compare. */
+ if (cf->sector_num + cf->max_nb_sectors > total_sectors) {
+ cf->nb_sectors = total_sectors - cf->sector_num;
+ } else {
+ cf->nb_sectors = cf->max_nb_sectors;
+ }
+ cf->iov.iov_len = cf->nb_sectors * 512;
+ qemu_iovec_init_external(&cf->qiov, &cf->iov, 1);
+ QDEBUG("FULL IMAGE COMPARISON: read sector_num=%" PRId64 " nb_sectors=%d\n",
+ cf->sector_num, cf->nb_sectors);
+
+ if (!bdrv_aio_readv(bs, cf->sector_num, &cf->qiov,
+ cf->nb_sectors, compare_full_images_cb, cf)) {
+ die("bdrv_aio_readv\n");
+ }
+}
+
+/* Kick off a full comparison of the truth image against the test image in
+ * 1MB chunks, then drive blksim until all chained reads complete. Any
+ * mismatch aborts inside verify(); therefore reaching the return means
+ * the images matched. Always returns 0. */
+static int compare_full_images(void)
+{
+ CompareFullCB *cf;
+
+ printf("Performing a full comparison of the truth image and "
+ "the test image...\n");
+ fflush(stdout);
+
+ cf = qemu_malloc(sizeof(CompareFullCB));
+ cf->max_nb_sectors = 1048576L / 512; /* compare in 1MB chunks */
+ cf->nb_sectors = MIN(cf->max_nb_sectors, total_sectors);
+ cf->truth_buf = qemu_memalign(512, cf->max_nb_sectors * 512);
+ cf->iov.iov_base = qemu_blockalign(bs, cf->max_nb_sectors * 512);
+ cf->iov.iov_len = cf->nb_sectors * 512;
+ cf->sector_num = 0;
+ qemu_iovec_init_external(&cf->qiov, &cf->iov, 1);
+ QDEBUG("FULL IMAGE COMPARISON: read sector_num=%" PRId64 " nb_sectors=%d\n",
+ cf->sector_num, cf->nb_sectors);
+ if (!bdrv_aio_readv(bs, cf->sector_num, &cf->qiov,
+ cf->nb_sectors, compare_full_images_cb, cf)) {
+ die("bdrv_aio_readv\n");
+ }
+
+ /* Run all queued simulated tasks; the callback chains further reads. */
+ blksim_run_all_tasks();
+ QDEBUG("Finished full image comparison.\n");
+ return 0;
+}
+
+/* Return a pseudo-random non-negative 62-bit value by combining two rand()
+ * results. rand() yields at most 31 bits, so bit 63 of f3 is always 0 and
+ * f3 is already non-negative; the final test is a defensive no-op. */
+static inline int64_t rand64(void)
+{
+ int64_t f1 = rand();
+ int64_t f2 = rand();
+ int64_t f3 = (f1 << 32) | f2;
+ return f3 >= 0 ? f3 : -f3;
+}
+
+/* Return true if r's sector range overlaps a range another tester is
+ * currently operating on, unless both operations are reads (read/read
+ * overlap is harmless). Flush requests have no range and never conflict.
+ * Used to keep concurrent writes from racing on the same sectors, which
+ * would make the truth image and test image diverge. */
+static bool check_conflict(RandomIO * r)
+{
+ int i;
+
+ for (i = 0; i < parallel; i++) {
+ RandomIO *s = &testers[i];
+ if (s == r || s->type == OP_AIO_FLUSH || s->type == OP_FLUSH ||
+ (r->type == OP_READ && s->type == OP_READ)) {
+ continue;
+ }
+
+ /* Overlap test: either range starts inside the other. */
+ if ((r->sector_num <= s->sector_num &&
+ s->sector_num < r->sector_num + r->nb_sectors) ||
+ (s->sector_num <= r->sector_num &&
+ r->sector_num < s->sector_num + s->nb_sectors)) {
+ return true; /* Conflict. */
+ }
+ }
+
+ return false; /* No conflict. */
+}
+
+/* Submit tester r's prepared request to the block driver, with optional
+ * fault injection: with probability fail_prob the request is forced to
+ * fail (either a NULL ACB or -EIO via blksim), and with probability
+ * cancel_prob a successfully submitted request is immediately cancelled.
+ * Return false if the submitted request is cancelled or already finished. */
+static bool submit_rand_io(RandomIO * r)
+{
+ BlockDriverAIOCB *acb = NULL;
+ int ret;
+ const char *fail;
+
+ /* Decide whether to inject a failure for this request. Flushes and
+ * non-cancellable requests (write-verification reads) never fail. */
+ if (!r->allow_cancel || r->type == OP_FLUSH || r->type == OP_AIO_FLUSH
+ || fail_prob <= 0) {
+ ret = 0;
+ } else if (rand() / (double)RAND_MAX > fail_prob) {
+ ret = 0;
+ } else if (rand() % 10 == 0) {
+ /* Tell blksim to return NULL acb. */
+ ret = RETURN_CODE_FOR_NULL_ACB;
+ } else {
+ /* Tell blksim to fail I/O operations with error code -EIO. */
+ ret = -EIO;
+ }
+
+ if (ret == 0) {
+ fail = "";
+ } else {
+ fail = "fail ";
+ }
+ QDEBUG("TESTER %03d: %s%s test%" PRIX64 " sector_num=%" PRId64
+ " nb_sectors=%d niov=%d\n", r->tester, fail, op_type_str[r->type],
+ r->uuid, r->sector_num, r->nb_sectors, r->qiov.niov);
+ printf("TESTER %03d: %s%s sector_num=%" PRId64 " nb_sectors=%d "
+ "niov=%d\n", r->tester, fail, op_type_str[r->type],
+ r->sector_num, r->nb_sectors, r->qiov.niov);
+
+ if (r->type == OP_FLUSH) {
+ /* This is special because it is a synchronous operation. */
+ if (bdrv_flush(bs) != 0) {
+ die("bdrv_flush failed");
+ }
+ return false;
+ }
+
+ /* Ensure all subrequests triggered by one outermost request either
+ * succeed together or fail together. Otherwise, the truth image and the
+ * test image will diverge. */
+ blksim_set_disk_io_return_code(ret);
+
+ switch (r->type) {
+ case OP_READ:
+ acb = bdrv_aio_readv(bs, r->sector_num, &r->qiov, r->nb_sectors,
+ rand_io_cb, r);
+ break;
+ case OP_WRITE:
+ acb = bdrv_aio_writev(bs, r->sector_num, &r->qiov, r->nb_sectors,
+ rand_io_cb, r);
+ break;
+ case OP_AIO_FLUSH:
+ acb = bdrv_aio_flush(bs, rand_io_cb, r);
+ break;
+ default:
+ die("Unknown OP");
+ break;
+ }
+
+ blksim_set_disk_io_return_code(0);
+
+ if (!acb) {
+ /* A NULL ACB is only legal when we asked blksim to produce one. */
+ if (ret != RETURN_CODE_FOR_NULL_ACB) {
+ die("Unexpected NULL ACB");
+ }
+ return false;
+ }
+
+ if (r->allow_cancel && cancel_prob > 0 &&
+ rand() / (double)RAND_MAX <= cancel_prob) {
+ QDEBUG("TESTER %03d: cancel %s test%" PRIX64 " sector_num=%" PRId64
+ " nb_sectors=%d niov=%d\n", r->tester, op_type_str[r->type],
+ r->uuid, r->sector_num, r->nb_sectors, r->qiov.niov);
+ printf("TESTER %03d: cancel %s sector_num=%" PRId64
+ " nb_sectors=%d niov=%d\n", r->tester, op_type_str[r->type],
+ r->sector_num, r->nb_sectors, r->qiov.niov);
+ bdrv_aio_cancel(acb);
+ return false;
+ } else {
+ return true;
+ }
+}
+
+/* Pick a random, conflict-free read or write request for tester r: choose
+ * the operation type, sector range (either fully random or adjacent to
+ * another tester's in-flight request, to provoke interactions), fill the
+ * write buffer with tagged random data, and split the buffer into a
+ * random number of iovecs (up to max_iov). */
+static void prepare_read_write(RandomIO * r)
+{
+ /* Find the next region to perform io. */
+ do {
+ /* Do a READ or WRITE? */
+ if (rand() % 2) {
+ r->type = OP_READ;
+ } else {
+ r->type = OP_WRITE;
+ }
+
+ if (parallel <= 1 || (rand() % 2 == 0)) {
+ /* Perform a random I/O. */
+ r->sector_num = rand64() % total_sectors;
+ } else {
+ /* Perform an I/O next to a currently ongoing I/O. */
+ int id;
+ do {
+ id = rand() % parallel;
+ } while (id == r->tester);
+
+ RandomIO *p = &testers[id];
+ r->sector_num =
+ p->sector_num + 2 * io_size - rand64() % (4 * io_size);
+ if (r->sector_num < 0) {
+ r->sector_num = 0;
+ } else if (r->sector_num >= total_sectors) {
+ r->sector_num = total_sectors - 1;
+ }
+ }
+
+ r->nb_sectors = 1 + rand64() % io_size;
+ if (r->sector_num + r->nb_sectors > total_sectors) {
+ r->nb_sectors = total_sectors - r->sector_num;
+ }
+ } while (check_conflict(r));
+
+ if (r->type == OP_WRITE) {
+ /* Fill test_buf with random data. */
+ int i, j;
+ for (i = 0; i < r->nb_sectors; i++) {
+ const uint64_t TEST_MAGIC = 0x0123456789ABCDEFULL;
+ /* This first byte is always 0xBB to indicate that this is not an
+ * empty sector (see verify(), which skips never-written sectors).
+ * The next 7 bytes of the sector store the request uuid. The next
+ * 8 bytes store a magic number. This info helps debugging. */
+ uint64_t *p = (uint64_t *)&r->test_buf[i * 512];
+ *p = r->uuid;
+ cpu_to_be64s(p);
+ r->test_buf[i * 512] = 0xBB; /* First byte marks sector non-empty */
+
+ p++;
+ *p = TEST_MAGIC;
+ cpu_to_be64s(p);
+
+ /* The rest of the sector are filled with random data. */
+ uint32_t *q = (uint32_t *) (p + 1);
+ int n = (512 - 2 * sizeof(uint64_t)) / sizeof(uint32_t);
+ for (j = 0; j < n; j++) {
+ *q++ = rand();
+ }
+ }
+ }
+
+ /* Determine the number of iov. */
+ int niov = 0;
+ uint8_t *p = r->test_buf;
+ int left = r->nb_sectors;
+ do {
+ if (niov == max_iov - 1) {
+ /* Last allowed iovec: it must take all remaining sectors. */
+ r->qiov.iov[niov].iov_len = left * 512;
+ r->qiov.iov[niov].iov_base = p;
+ niov++;
+ break;
+ }
+
+ int nb = 1 + rand() % left;
+ r->qiov.iov[niov].iov_len = nb * 512;
+ r->qiov.iov[niov].iov_base = p;
+ p += r->qiov.iov[niov].iov_len;
+ left -= nb;
+ niov++;
+ } while (left > 0);
+
+ qemu_iovec_init_external(&r->qiov, r->qiov.iov, niov);
+}
+
+/* Start tester r's next request, or mark the tester finished once the
+ * global round budget is exhausted. Retries request generation until
+ * submit_rand_io() reports a request that is actually in flight. */
+static void perform_next_io(RandomIO * r)
+{
+ if (finished_round >= round) {
+ finished_testers++;
+ return;
+ }
+
+ finished_round++;
+ r->allow_cancel = true;
+
+ do {
+ r->uuid = test_uuid++;
+
+ /* Choose the operation: aio_flush, synchronous flush, or a
+ * read/write prepared by prepare_read_write(). */
+ if (aio_flush_prob > 0 && rand() / (double)RAND_MAX < aio_flush_prob) {
+ r->type = OP_AIO_FLUSH;
+ } else if (flush_prob > 0 && rand() / (double)RAND_MAX < flush_prob) {
+ r->type = OP_FLUSH;
+ }
+ else {
+ prepare_read_write(r);
+ }
+ } while (!submit_rand_io(r));
+}
+
+/* Completion callback of every tester request. Injected failures are
+ * retried; successful reads are verified against the truth image;
+ * successful writes are recorded in the truth image and, with
+ * --verify_write, immediately re-read (cancel disabled) and verified. */
+static void rand_io_cb(void *opaque, int ret)
+{
+ RandomIO *r = opaque;
+
+ if (ret) {
+ if (fail_prob <= 0) {
+ /* No fault injection is active, so a failure is a driver bug. */
+ die("Request %s sector_num=%"PRId64" nb_sectors=%d "
+ "failed while fail_prob=0.\n",
+ op_type_str[r->type], r->sector_num, r->nb_sectors);
+ } else {
+ /* Failed. Retry the operation. */
+ QDEBUG("TESTER %03d: retry %s test%" PRIX64 " sector_num=%"
+ PRId64 " nb_sectors=%d niov=%d\n",
+ r->tester, op_type_str[r->type], r->uuid,
+ r->sector_num, r->nb_sectors, r->qiov.niov);
+ if (!submit_rand_io(r)) {
+ perform_next_io(r);
+ }
+ return;
+ }
+ } else {
+ QDEBUG("TESTER %03d: finished %s test%" PRIX64 " sector_num=%" PRId64
+ " nb_sectors=%d niov=%d\n", r->tester, op_type_str[r->type],
+ r->uuid, r->sector_num, r->nb_sectors, r->qiov.niov);
+ }
+
+ switch (r->type) {
+ case OP_AIO_FLUSH:
+ perform_next_io(r);
+ return;
+
+ case OP_READ:
+ truth_io(r->truth_buf, r->sector_num, r->nb_sectors, true);
+ verify(r->truth_buf, r->test_buf, r->sector_num, r->nb_sectors);
+ perform_next_io(r);
+ return;
+
+ case OP_WRITE:
+ /* Write data to the truth image. */
+ truth_io(r->test_buf, r->sector_num, r->nb_sectors, false);
+ if (verify_write) {
+ r->type = OP_READ; /* Perform a read for the same data. */
+ r->allow_cancel = false; /* Ensure verification happens. */
+ r->qiov.niov = 1; /* read back as one contiguous iovec */
+ r->qiov.iov[0].iov_len = r->qiov.size;
+ memset(r->test_buf, 0xA5, r->qiov.size); /* Fill in garbage. */
+ submit_rand_io(r);
+ } else {
+ perform_next_io(r);
+ }
+ return;
+
+ default:
+ die("Unknown OP");
+ return;
+ }
+}
+
+/* Parse a boolean command-line value. Accepts exactly "true" or "false";
+ * anything else prints a diagnostic and exits via auto_test_usage().
+ * Fix: the original compared the global 'optarg' instead of its 'arg'
+ * parameter, which only worked because every caller happened to pass
+ * optarg. */
+static int read_bool(const char *arg)
+{
+ int val = true;
+ if (strcmp(arg, "true") == 0) {
+ val = true;
+ } else if (strcmp(arg, "false") == 0) {
+ val = false;
+ } else {
+ printf("%s is neither 'true' nor 'false'\n", arg);
+ auto_test_usage();
+ }
+
+ return val;
+}
+
+/* (Re)create the global BlockDriverState 'bs' and open the test image with
+ * the given flags, forcing the driver named by 'format' if non-NULL.
+ * Any failure aborts the test. */
+static void open_test_file(const char *format, const char *test_file, int flags)
+{
+ if (flags & BDRV_O_RDWR) {
+ QDEBUG("Open image for test.\n");
+ } else {
+ QDEBUG("Open image for comparison.\n");
+ }
+
+ bs = bdrv_new("hda");
+ if (!bs) {
+ die("bdrv_new failed\n");
+ }
+
+ BlockDriver *drv = NULL;
+ if (format) {
+ drv = bdrv_find_format(format);
+ if (!drv) {
+ die("Found no driver for format '%s'.\n", format);
+ }
+ }
+
+ /* drv == NULL lets bdrv_open probe the format. */
+ if (bdrv_open(bs, test_file, flags, drv) < 0) {
+ die("Failed to open '%s'\n", test_file);
+ }
+}
+
+/* Top-level driver of one automated run: open the truth and test images,
+ * optionally compare them first, run 'parallel' testers for 'round'
+ * random requests, then optionally compare the images again and release
+ * all resources. */
+static void perform_test(const char *truth_file, const char *test_file,
+ const char *format, int compare_before, int compare_after,
+ int cache_flag)
+{
+ int i;
+
+ if (compare_before) {
+ /* Open as read-only to compare. */
+ open_test_file(format, test_file, cache_flag);
+ } else {
+ open_test_file(format, test_file, BDRV_O_RDWR | cache_flag);
+ }
+
+ fd = open(truth_file, O_RDWR | O_LARGEFILE, 0);
+ if (fd < 0) {
+ die("Failed to open '%s'\n", truth_file);
+ }
+
+ /* The truth image must be at least as large as the test image. */
+ int64_t l0 = lseek(fd, 0, SEEK_END);
+ int64_t l1 = bdrv_getlength(bs);
+ if (l0 < 0 || l1 < 0 || l0 < l1) {
+ die("Mismatch: truth image %s length %"PRId64", test image %s "
+ "length %"PRId64"\n", truth_file, l0, test_file, l1);
+ }
+
+ total_sectors = l1 / 512;
+ if (total_sectors <= 1) {
+ die("Total sectors: %" PRId64 "\n", total_sectors);
+ }
+
+ /* io_size was given in bytes on the command line; convert it to
+ * sectors and clamp it to at most half of the image. */
+ io_size /= 512;
+ if (io_size <= 0) {
+ io_size = 1;
+ } else if (io_size > total_sectors / 2) {
+ io_size = total_sectors / 2;
+ }
+
+ if (compare_before) {
+ if (compare_full_images()) {
+ die("The original two files do not match.\n");
+ }
+
+ /* After comparison, reopen as writeable. */
+ bdrv_delete(bs);
+ open_test_file(format, test_file, BDRV_O_RDWR | cache_flag);
+ }
+
+ if (round > 0) {
+ /* Create testers. */
+ testers = qemu_malloc(sizeof(RandomIO) * parallel);
+ for (i = 0; i < parallel; i++) {
+ RandomIO *r = &testers[i];
+ r->test_buf = qemu_blockalign(bs, io_size * 512);
+ r->truth_buf = qemu_memalign(512, io_size * 512);
+ r->qiov.iov = qemu_malloc(sizeof(struct iovec) * max_iov);
+ r->sector_num = 0;
+ r->nb_sectors = 0;
+ r->type = OP_READ;
+ r->tester = i;
+ }
+ for (i = 0; i < parallel; i++) {
+ perform_next_io(&testers[i]);
+ }
+ }
+
+ /* Run the tests. It is possible that all testers have finished but there
+ * are still tasks in blksim due to copy_on_read or prefetching. Those
+ * tasks are ignored and a properly implemented driver should cancel
+ * those I/Os in bdrv_close() anyway. */
+ while (blksim_qemu_aio_wait() && finished_testers < parallel);
+
+ if (rand() % 10 == 0) {
+ /* With a random probability, finish the remaining tasks (especially
+ * prefetching) so that it can test more code paths. */
+ while (blksim_qemu_aio_wait());
+ }
+
+ if (round > 0) {
+ if (compare_after) {
+ /* Reopen as read-only to compare. */
+ bdrv_delete(bs);
+ if (blksim_has_task()) {
+ die("blksim still has tasks after the device is closed.\n"
+ "This indicates that the device driver's bdrv_close() did "
+ "not fully clean up timers, QEMUBH, copy_on_read, "
+ "or prefetch.\n");
+ }
+ open_test_file(format, test_file, cache_flag);
+ if (compare_full_images()) {
+ die("The two files do not match after I/O operations.\n");
+ }
+ }
+
+ /* Release tester resources with allocator-matching functions:
+ * test_buf (qemu_blockalign) and truth_buf (qemu_memalign) must
+ * both be freed with qemu_vfree(), not free(). */
+ for (i = 0; i < parallel; i++) {
+ RandomIO *r = &testers[i];
+ qemu_vfree(r->test_buf);
+ qemu_vfree(r->truth_buf);
+ qemu_free(r->qiov.iov);
+ }
+ qemu_free(testers);
+ }
+
+ printf("Test process %d finished successfully\n", getpid());
+ bdrv_delete(bs);
+ close(fd);
+}
+
+/* Create a raw test file of 'file_size' bytes filled with a random mix of
+ * non-empty blocks (first 4 bytes of each 512-byte sector forced non-zero)
+ * and chains of all-zero blocks, controlled by empty_block_prob and
+ * empty_block_chain. Returns 0 on success or a negative errno value.
+ * NOTE(review): the 'offset % 512' sector-start check assumes block_size
+ * is a multiple of 512 — confirm for non-default block sizes. */
+static int create_test_file(int seed, const char *file, int64_t file_size,
+ int block_size, double empty_block_prob,
+ int empty_block_chain)
+{
+ if (file_size <= 0) {
+ fprintf (stderr, "file_size is not positive: %"PRId64"\n", file_size);
+ return -EINVAL;
+ }
+
+ fd = qemu_open(file, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+ if (fd < 0) {
+ fprintf(stderr, "Failed to create %s.\n", file);
+ return -errno;
+ }
+
+ /* Fall back to sane defaults for nonsensical parameters. */
+ if (empty_block_chain <= 0) {
+ empty_block_chain = 1;
+ }
+ if (block_size <= 0) {
+ block_size = 65536;
+ }
+
+ srand (seed);
+ uint8_t *buf = qemu_malloc (empty_block_chain * block_size);
+ ssize_t len;
+
+ printf ("Creating test file...\n");
+ int64_t offset = 0;
+ while (file_size > 0) {
+ if (empty_block_prob == 0 ||
+ rand() / (double)RAND_MAX >= empty_block_prob) {
+ /* Generate a random, non-empty block. */
+ int i;
+ uint32_t *q = (uint32_t *)buf;
+ len = MIN(file_size, block_size);
+ for (i = 0; i < len / sizeof(uint32_t); i++) {
+ if (offset % 512 == 0) {
+ *q++ = UINT32_C(0xAAAAAAAA); /* Make sector non-empty. */
+ } else {
+ *q++ = rand();
+ }
+ offset += sizeof(uint32_t);
+ }
+ } else {
+ /* Generate a chain of empty blocks. */
+ int n = 1 + rand () % empty_block_chain;
+ len = n * block_size;
+ if (len > file_size) {
+ len = file_size;
+ }
+ memset (buf, 0, len);
+ offset += len;
+ }
+
+ if (qemu_write_full(fd, buf, len) != len) {
+ die("Error in writing %s: %s\n", file, strerror(errno));
+ }
+
+ file_size -= len;
+ }
+
+ qemu_free (buf);
+ close (fd);
+ return 0;
+}
+
+/* Entry point of 'qemu-io --auto': parse the test options, then either
+ * create a test file (--create) or run perform_test(). Long-only options
+ * (e.g. --rand_time, --aio_flush_prob, --flush_prob) are dispatched via
+ * the 'val' field of lopt even though their short letters are absent from
+ * the getopt string. Returns 0 on success, non-zero on usage errors. */
+static int auto_test(int argc, char **argv)
+{
+ int c;
+ const char *truth_file = NULL;
+ const char *test_file = NULL;
+ const char *format = NULL;
+ int compare_before = false;
+ int compare_after = true;
+ int seed = 0;
+ const char *create_file = NULL;
+ int block_size = 65536;
+ double empty_block_prob = 0.2;
+ int empty_block_chain = 10;
+ int64_t file_size = 0;
+ int cache_flag = BDRV_O_CACHE_WB;
+
+ const struct option lopt[] = {
+ {"auto", 0, 0, 'a'},
+ {"help", 0, 0, 'h'},
+ {"seed", 1, 0, 'd'},
+ {"truth", 1, 0, 'b'},
+ {"test", 1, 0, 't'},
+ {"format", 1, 0, 'f'},
+ {"rand_time", 1, 0, 'n'},
+ {"fail_prob", 1, 0, 'u'},
+ {"cancel_prob", 1, 0, 'c'},
+ {"aio_flush_prob", 1, 0, 'w'},
+ {"flush_prob", 1, 0, 'y'},
+ {"round", 1, 0, 'r'},
+ {"parallel", 1, 0, 'p'},
+ {"compare_before", 1, 0, 'm'},
+ {"verify_write", 1, 0, 'v'},
+ {"compare_after", 1, 0, 'e'},
+ {"max_iov", 1, 0, 'i'},
+ {"io_size", 1, 0, 's'},
+ {"instant_qemubh", 1, 0, 'q'},
+ {"create", 1, 0, 'g'},
+ {"file_size", 1, 0, 'j'},
+ {"block_size", 1, 0, 'k'},
+ {"empty_block_prob", 1, 0, 'l'},
+ {"empty_block_chain", 1, 0, 'x'},
+ {"cache", 1, 0, 'z'},
+ {NULL, 0, NULL, 0}
+ };
+
+ /* Reset getopt state: main() has already scanned argv once. */
+ optind = 1;
+ while ((c = getopt_long(argc, argv,
+ "ahc:u:p:q:i:f:d:b:t:r:m:v:e:s:g:j:k:l:x:z:", lopt, NULL)) != -1) {
+ switch (c) {
+ case 'a':
+ break;
+
+ case 'h':
+ auto_test_usage();
+ return 0;
+
+ case 'q':
+ blksim_set_instant_qemubh(read_bool(optarg));
+ break;
+
+ case 'w':
+ aio_flush_prob = atof(optarg);
+ break;
+
+ case 'y':
+ flush_prob = atof(optarg);
+ break;
+
+ case 'c':
+ cancel_prob = atof(optarg);
+ break;
+
+ case 'u':
+ fail_prob = atof(optarg);
+ break;
+
+ case 'n':
+ rand_time = atoll(optarg);
+ break;
+
+ case 'i':
+ max_iov = atoi(optarg);
+ break;
+
+ case 'p':
+ parallel = atoi(optarg);
+ break;
+
+ case 'v':
+ verify_write = read_bool(optarg);
+ break;
+
+ case 'm':
+ compare_before = read_bool(optarg);
+ break;
+
+ case 'e':
+ compare_after = read_bool(optarg);
+ break;
+
+ case 'd':
+ /* NOTE(review): seed is an int but parsed with atoll; large
+ * values are silently truncated. */
+ seed = atoll(optarg);
+ break;
+
+ case 'f':
+ format = optarg;
+ break;
+
+ case 'b':
+ truth_file = optarg;
+ break;
+
+ case 't':
+ test_file = optarg;
+ break;
+
+ case 's':
+ io_size = atoll(optarg);
+ break;
+
+ case 'r':
+ round = atoll(optarg);
+ break;
+
+ case 'g':
+ create_file = optarg;
+ break;
+
+ case 'k':
+ block_size = atoi(optarg);
+ break;
+
+ case 'j':
+ file_size = atoll(optarg);
+ break;
+
+ case 'l':
+ empty_block_prob = atof(optarg);
+ break;
+
+ case 'x':
+ empty_block_chain = atoi(optarg);
+ break;
+
+ case 'z':
+ if (!strcasecmp(optarg, "writethrough")) {
+ cache_flag = 0;
+ } else if (!strcasecmp(optarg, "writeback")) {
+ cache_flag = BDRV_O_CACHE_WB;
+ } else if (!strcasecmp(optarg, "none")) {
+ cache_flag = BDRV_O_NOCACHE;
+ } else {
+ die ("Unknown cache option: %s\n", optarg);
+ }
+ break;
+
+ default:
+ auto_test_usage();
+ return 1;
+ }
+ }
+
+ /* --create mode only generates a test file; no I/O test is run. */
+ if (create_file) {
+ return create_test_file(seed, create_file, file_size, block_size,
+ empty_block_prob, empty_block_chain);
+ }
+
+ if (!truth_file || !test_file) {
+ auto_test_usage();
+ return 1;
+ }
+
+ /* A hack to convince FVD that it is not running in a qemu-tool so that
+ * prefetching and copy_on_read can be enabled for testing. Note that
+ * prefetching and copy_on_read are disabled for qemu-nbd. */
+ rt_clock = (QEMUClock *) - 1;
+
+ if (parallel <= 0) {
+ parallel = 1;
+ }
+ init_blksim(false /*no print */ , rand_time);
+ bdrv_init();
+ srand(seed);
+ perform_test(truth_file, test_file, format, compare_before,
+ compare_after, cache_flag);
+ return 0;
+}
diff --git a/qemu-io-sim.c b/qemu-io-sim.c
new file mode 100644
index 0000000..923c1b8
--- /dev/null
+++ b/qemu-io-sim.c
@@ -0,0 +1,127 @@
+/*
+ * Extension of qemu-io to work with the simulated block device driver blksim
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/*=============================================================================
+ * qemu-io-sim works with qemu-io to perform simulated testing. The 'sim'
+ * command allows the user to control the order of disk I/O and callback
+ * activities in order to test rare race conditions. See block/blksim.c
+ * Note that in the manual mode, qemu-io's 'sim' command can only work with
+ * qemu-io's 'aio_read', 'aio_write', and 'flush' commands. The automated
+ * testing mode, 'qemu-io --auto', performs a much more comprehensive fully
+ * automated test (see qemu-io-auto.c). Below is one example of using qemu-io
+ * to perform manual testing in the simulation mode.
+
+$ qemu-img create -f qcow2 -obacking_fmt=blksim -b base.raw img.qcow2
+Formatting 'img.qcow2', fmt=qcow2 size=1073741824 backing_file='base.raw' backing_fmt='blksim' encryption=off cluster_size=0
+
+$ qemu-io -f qcow2 blksim:img.qcow2
+Execute READ blksim:img.qcow2 sector_num=0 nb_sectors=1
+Execute READ blksim:img.qcow2 sector_num=384 nb_sectors=1
+Execute READ blksim:img.qcow2 sector_num=128 nb_sectors=128
+Execute READ blksim:img.qcow2 sector_num=0 nb_sectors=1
+Execute READ blksim:img.qcow2 sector_num=0 nb_sectors=1
+Execute READ blksim:img.qcow2 sector_num=0 nb_sectors=1
+qemu-io> aio_write 0 512
+Execute READ blksim:img.qcow2 sector_num=256 nb_sectors=128
+Execute WRITE blksim:img.qcow2 sector_num=256 nb_sectors=128
+Execute FLUSH blksim:img.qcow2
+Execute WRITE blksim:img.qcow2 sector_num=512 nb_sectors=128
+Execute FLUSH blksim:img.qcow2
+Execute WRITE blksim:img.qcow2 sector_num=384 nb_sectors=1
+Execute WRITE blksim:img.qcow2 sector_num=256 nb_sectors=128
+Queue WRITE uuid=0 filename=blksim:img.qcow2 sector_num=640 nb_sectors=1
+qemu-io> sim list
+uuid=0 WRITE file=blksim:img.qcow2 sector_num=640 nb_sectors=1
+qemu-io> sim 0
+Execute WRITE blksim:img.qcow2 sector_num=640 nb_sectors=1
+Queue WRITE_CALLBACK uuid=1 filename=blksim:img.qcow2 sector_num=640 nb_sectors=1
+qemu-io> sim 1
+Execute READ base.raw sector_num=1 nb_sectors=127
+Execute WRITE blksim:img.qcow2 sector_num=641 nb_sectors=127
+Execute FLUSH blksim:img.qcow2
+Execute WRITE blksim:img.qcow2 sector_num=512 nb_sectors=128
+wrote 512/512 bytes at offset 0
+512.000000 bytes, 1 ops; 0:00:09.00 (53.333589 bytes/sec and 0.1042 ops/sec)
+qemu-io> aio_write 65536 1024
+Execute WRITE blksim:img.qcow2 sector_num=256 nb_sectors=128
+Queue WRITE uuid=2 filename=blksim:img.qcow2 sector_num=768 nb_sectors=2
+qemu-io> aio_read 1048576 1024
+Queue READ uuid=3 filename=base.raw sector_num=2048 nb_sectors=2
+qemu-io> sim list
+uuid=2 WRITE file=blksim:img.qcow2 sector_num=768 nb_sectors=2
+uuid=3 READ file=base.raw sector_num=2048 nb_sectors=2
+qemu-io> sim 2
+Execute WRITE blksim:img.qcow2 sector_num=768 nb_sectors=2
+Queue WRITE_CALLBACK uuid=4 filename=blksim:img.qcow2 sector_num=768 nb_sectors=2
+qemu-io> sim list
+uuid=3 READ file=base.raw sector_num=2048 nb_sectors=2
+uuid=4 CALLBACK WRITE file=blksim:img.qcow2 sector_num=768 nb_sectors=2
+qemu-io> sim 4
+Execute READ base.raw sector_num=130 nb_sectors=126
+Execute WRITE blksim:img.qcow2 sector_num=770 nb_sectors=126
+Execute FLUSH blksim:img.qcow2
+Execute WRITE blksim:img.qcow2 sector_num=512 nb_sectors=128
+wrote 1024/1024 bytes at offset 65536
+1 KiB, 1 ops; 0:00:20.00 (50.304774 bytes/sec and 0.0491 ops/sec)
+
+*=============================================================================*/
+
+#include "block/blksim.h"
+
+/* Print the usage of the interactive 'sim' command (see sim_f()). */
+static void sim_help(void)
+{
+ printf("\nsim list\t\tlist all simulation tasks\n"
+ "\nsim <#task> [#ret]\trun a simulation task, optionally "
+ "using #ret as the return value of a read/write operation\n"
+ "\nsim all [#ret]\t\trun all tasks, optionally using #ret as "
+ "the return value of read/write tasks\n"
+ "\nsim prefetch\t\tstart prefetching\n");
+}
+
+/* Handler of the 'sim' command: 'sim list' shows queued blksim tasks;
+ * 'sim all [ret]' runs every task; 'sim <uuid> [ret]' runs one task. The
+ * optional numeric argument is installed as the forced disk I/O return
+ * code for the duration of the run, then reset to 0. Always returns 0. */
+static int sim_f(int argc, char **argv)
+{
+ int ret = 0;
+
+ if (argc == 3) {
+ ret = atoi(argv[2]);
+ } else if (argc != 2) {
+ sim_help();
+ return 0;
+ }
+
+ if (strcmp(argv[1], "list") == 0) {
+ blksim_list_tasks();
+ } else if (strcmp(argv[1], "all") == 0) {
+ blksim_set_disk_io_return_code(ret);
+ int n = blksim_run_all_tasks();
+ blksim_set_disk_io_return_code(0);
+ printf("Executed %d tasks.\n", n);
+ } else {
+ /* Argument is a task uuid as printed by 'sim list'. */
+ blksim_set_disk_io_return_code(ret);
+ blksim_run_task_by_uuid(atoll(argv[1]));
+ blksim_set_disk_io_return_code(0);
+ }
+
+ return 0;
+}
+
+/* Registration record of the 'sim' command for the qemu-io command table.
+ * argmin/argmax bound the argument count: 'sim <subcmd> [ret]'. */
+static const cmdinfo_t sim_cmd = {
+ .name = "sim",
+ .altname = "s",
+ .cfunc = sim_f,
+ .argmin = 1,
+ .argmax = 2,
+ .args = "",
+ .oneline = "use simulation to control the order of disk I/Os and callbacks",
+ .help = sim_help,
+};
diff --git a/qemu-io.c b/qemu-io.c
index 4470e49..27591f0 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -1584,7 +1584,7 @@ static const cmdinfo_t close_cmd = {
.oneline = "close the current open file",
};
-static int openfile(char *name, int flags, int growable)
+static int openfile(char *name, const char *fmt, int flags, int growable)
{
if (bs) {
fprintf(stderr, "file open already, try 'help close'\n");
@@ -1597,9 +1597,17 @@ static int openfile(char *name, int flags, int growable)
return 1;
}
} else {
+ BlockDriver *drv = NULL;
+ if (fmt && !(drv = bdrv_find_format (fmt))) {
+ fprintf(stderr, "%s: can't find driver for format "
+ "%s \n", progname, fmt);
+ bs = NULL;
+ return 1;
+ }
+
bs = bdrv_new("hda");
- if (bdrv_open(bs, name, flags, NULL) < 0) {
+ if (bdrv_open(bs, name, flags, drv) < 0) {
fprintf(stderr, "%s: can't open device %s\n", progname, name);
bs = NULL;
return 1;
@@ -1636,7 +1644,7 @@ static const cmdinfo_t open_cmd = {
.argmin = 1,
.argmax = -1,
.flags = CMD_NOFILE_OK,
- .args = "[-Crsn] [path]",
+ .args = "[-Crsn] [-f <format>] [path]",
.oneline = "open the file specified by path",
.help = open_help,
};
@@ -1648,8 +1656,9 @@ open_f(int argc, char **argv)
int readonly = 0;
int growable = 0;
int c;
+ const char *fmt = NULL;
- while ((c = getopt(argc, argv, "snrg")) != EOF) {
+ while ((c = getopt(argc, argv, "snrgf:")) != EOF) {
switch (c) {
case 's':
flags |= BDRV_O_SNAPSHOT;
@@ -1663,6 +1672,9 @@ open_f(int argc, char **argv)
case 'g':
growable = 1;
break;
+ case 'f':
+ fmt = optarg;
+ break;
default:
return command_usage(&open_cmd);
}
@@ -1675,7 +1687,7 @@ open_f(int argc, char **argv)
if (optind != argc - 1)
return command_usage(&open_cmd);
- return openfile(argv[optind], flags, growable);
+ return openfile(argv[optind], fmt, flags, growable);
}
static int
@@ -1701,10 +1713,13 @@ init_check_command(
return 1;
}
+#include "qemu-io-sim.c"
+#include "qemu-io-auto.c"
+
static void usage(const char *name)
{
printf(
-"Usage: %s [-h] [-V] [-rsnm] [-c cmd] ... [file]\n"
+"Usage: %s [-h] [-a] [-V] [-rsnm] [-c cmd] ... [file]\n"
"QEMU Disk exerciser\n"
"\n"
" -c, --cmd command to execute\n"
@@ -1714,18 +1729,19 @@ static void usage(const char *name)
" -g, --growable allow file to grow (only applies to protocols)\n"
" -m, --misalign misalign allocations for O_DIRECT\n"
" -k, --native-aio use kernel AIO implementation (on Linux only)\n"
+" -f, --format image format of the file\n"
+" -a, --auto fully automated test\n"
" -h, --help display this help and exit\n"
" -V, --version output version information and exit\n"
"\n",
name);
}
-
int main(int argc, char **argv)
{
int readonly = 0;
int growable = 0;
- const char *sopt = "hVc:rsnmgk";
+ const char *sopt = "hVc:rsnmgkaf:d";
const struct option lopt[] = {
{ "help", 0, NULL, 'h' },
{ "version", 0, NULL, 'V' },
@@ -1737,11 +1753,15 @@ int main(int argc, char **argv)
{ "misalign", 0, NULL, 'm' },
{ "growable", 0, NULL, 'g' },
{ "native-aio", 0, NULL, 'k' },
+ { "format", 1, NULL, 'f' },
+ { "auto", 0, NULL, 'a' },
+ { "sim", 0, NULL, 'd' },
{ NULL, 0, NULL, 0 }
};
int c;
int opt_index = 0;
int flags = 0;
+ const char *fmt = NULL;
progname = basename(argv[0]);
@@ -1756,6 +1776,12 @@ int main(int argc, char **argv)
case 'c':
add_user_command(optarg);
break;
+ case 'd':
+ /* A hack to convince FVD that it is running in a
+ * qemu-tool so that prefetching and copy_on_read can
+ * be enabled for testing with blksim. */
+ rt_clock = (QEMUClock *) - 1;
+ break;
case 'r':
readonly = 1;
break;
@@ -1768,6 +1794,11 @@ int main(int argc, char **argv)
case 'k':
flags |= BDRV_O_NATIVE_AIO;
break;
+ case 'f':
+ fmt = optarg;
+ break;
+ case 'a':
+ return auto_test(argc, argv);
case 'V':
printf("%s version %s\n", progname, VERSION);
exit(0);
@@ -1807,6 +1838,7 @@ int main(int argc, char **argv)
add_command(&discard_cmd);
add_command(&alloc_cmd);
add_command(&map_cmd);
+ add_command(&sim_cmd);
add_args_command(init_args_command);
add_check_command(init_check_command);
@@ -1817,7 +1849,7 @@ int main(int argc, char **argv)
}
if ((argc - optind) == 1)
- openfile(argv[optind], flags, growable);
+ openfile(argv[optind], fmt, flags, growable);
command_loop();
/*
diff --git a/qemu-tool.c b/qemu-tool.c
index 392e1c9..ebf7355 100644
--- a/qemu-tool.c
+++ b/qemu-tool.c
@@ -16,11 +16,11 @@
#include "qemu-timer.h"
#include "qemu-log.h"
#include "sysemu.h"
+#include "block/blksim.h"
#include <sys/time.h>
QEMUClock *rt_clock;
-
FILE *logfile;
struct QEMUBH
@@ -73,34 +73,115 @@ void monitor_protocol_event(MonitorEvent event, QObject *data)
{
}
-QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque)
+/*
+ * In the simulation mode, the QEMUBH and time related functions are handled
+ * differently through simulation.
+ */
+int64_t qemu_get_clock (QEMUClock * clock)
+{
+ if (using_blksim()) {
+ return blksim_get_time ();
+ }
+ else {
+ qemu_timeval tv;
+ qemu_gettimeofday (&tv);
+ return (tv.tv_sec * 1000000000LL + (tv.tv_usec * 1000)) / 1000000;
+ }
+}
+
+void qemu_mod_timer (QEMUTimer * ts, int64_t expire_time)
+{
+ if (using_blksim()) {
+ blksim_mod_timer (ts, expire_time);
+ }
+ else {
+ fprintf (stderr, "A QEMU tool should not invoke qemu_mod_timer() "
+ "unless it is in the simulation mode.\n");
+ exit (1);
+ }
+}
+
+QEMUTimer *qemu_new_timer (QEMUClock * clock, QEMUTimerCB * cb, void *opaque)
+{
+ if (using_blksim()) {
+ return blksim_new_timer (cb, opaque);
+ }
+ else {
+ fprintf (stderr, "A QEMU tool should not invoke qemu_new_timer() "
+ "unless it is in the simulation mode.\n");
+ exit (1);
+ return NULL;
+ }
+}
+
+void qemu_free_timer (QEMUTimer * ts)
{
- QEMUBH *bh;
+ if (using_blksim()) {
+ blksim_free_timer (ts);
+ }
+ else {
+ fprintf (stderr, "A QEMU tool should not invoke qemu_free_timer() "
+ "unless it is in the simulation mode.\n");
+ exit (1);
+ }
+}
- bh = qemu_malloc(sizeof(*bh));
- bh->cb = cb;
- bh->opaque = opaque;
+void qemu_del_timer (QEMUTimer * ts)
+{
+ if (using_blksim()) {
+ blksim_del_timer (ts);
+ }
+ else {
+ fprintf (stderr, "A QEMU tool should not invoke qemu_del_timer() "
+ "unless it is in the simulation mode.\n");
+ exit (1);
+ }
+}
- return bh;
+QEMUBH *qemu_bh_new (QEMUBHFunc * cb, void *opaque)
+{
+ if (using_blksim()) {
+ return blksim_new_timer (cb, opaque);
+ }
+ else {
+ QEMUBH *bh;
+ bh = qemu_malloc (sizeof (*bh));
+ bh->cb = cb;
+ bh->opaque = opaque;
+ return bh;
+ }
}
-int qemu_bh_poll(void)
+int qemu_bh_poll (void)
{
return 0;
}
-void qemu_bh_schedule(QEMUBH *bh)
+void qemu_bh_schedule (QEMUBH * bh)
{
- bh->cb(bh->opaque);
+ if (using_blksim()) {
+ blksim_bh_schedule (bh);
+ }
+ else {
+ bh->cb (bh->opaque);
+ }
}
-void qemu_bh_cancel(QEMUBH *bh)
+void qemu_bh_cancel (QEMUBH * bh)
{
+ if (using_blksim()) {
+ blksim_del_timer (bh);
+ }
}
-void qemu_bh_delete(QEMUBH *bh)
+void qemu_bh_delete (QEMUBH * bh)
{
- qemu_free(bh);
+ if (using_blksim()) {
+ blksim_free_timer (bh);
+ }
+ else {
+ qemu_free (bh);
+ }
}
int qemu_set_fd_handler2(int fd,
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 03/26] FVD: add fully automated test-qcow2.sh
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 02/26] FVD: extend qemu-io to do fully automated testing Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 04/26] FVD: add fully automated test-vdi.sh Chunqiang Tang
` (22 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
test-qcow2.sh drives 'qemu-io --auto' to perform fully automated testing for
QCOW2.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
test-qcow2.sh | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 89 insertions(+), 0 deletions(-)
create mode 100755 test-qcow2.sh
diff --git a/test-qcow2.sh b/test-qcow2.sh
new file mode 100755
index 0000000..d1e4dc0
--- /dev/null
+++ b/test-qcow2.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+# Drive 'qemu-io --auto' to test the QCOW2 image format.
+#
+# Copyright IBM, Corp. 2010
+#
+# Authors:
+# Chunqiang Tang <ctang@us.ibm.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+if [ $USER != "root" ]; then
+ echo "This command must be run by root in order to mount tmpfs."
+ exit 1
+fi
+
+QEMU_DIR=.
+QEMU_IMG=$QEMU_DIR/qemu-img
+QEMU_IO=$QEMU_DIR/qemu-io
+
+if [ ! -e $QEMU_IMG ]; then
+ echo "$QEMU_IMG does not exist."
+ exit 1;
+fi
+
+if [ ! -e $QEMU_IO ]; then
+ echo "$QEMU_IO does not exist."
+ exit 1;
+fi
+
+DATA_DIR=/var/ramdisk
+TRUTH_IMG=$DATA_DIR/truth.raw
+TEST_IMG=$DATA_DIR/test.qcow2
+TEST_BASE=$DATA_DIR/zero-500M.raw
+CMD_LOG=./test-qcow2.log
+
+parallel=100
+round=10000
+fail_prob=0.1
+cancel_prob=0
+instant_qemubh=true
+seed=$RANDOM$RANDOM
+count=0
+
+function invoke() {
+ echo "$*" >> $CMD_LOG
+ $*
+ ret=$?
+ if [ $ret -ne 0 ]; then
+ echo "Exit with error code $ret: $*"
+ exit $ret
+ fi
+}
+
+mount | grep $DATA_DIR > /dev/null
+if [ $? -ne 0 ]; then
+ echo "Create tmpfs at $DATA_DIR to store testing images."
+ if [ ! -e $DATA_DIR ]; then mkdir -p $DATA_DIR ; fi
+ invoke "mount -t tmpfs none $DATA_DIR -o size=4G"
+ if [ $? -ne 0 ]; then exit 1; fi
+fi
+
+/bin/rm -f $CMD_LOG $DATA_DIR/*
+touch $CMD_LOG
+
+while [ -t ]; do
+for cache in none writethrough writeback; do
+for cluster_size in 65536 ; do
+for io_size in 1048576 ; do
+ count=$[$count + 1]
+ echo "Round $count" >> $CMD_LOG
+
+ # QCOW2 image is about 1G
+ img_size=$[(1073741824 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+
+ # base image is about 500MB
+ base_size=$[(536870912 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+
+ invoke "/bin/rm -rf $TRUTH_IMG $TEST_IMG $TEST_BASE"
+ invoke "$QEMU_IO --auto --create=$TEST_BASE --seed=$seed --block_size=1048576 --empty_block_prob=0 --empty_block_chain=1 --file_size=$base_size"
+ invoke "cp --sparse=always $TEST_BASE $TRUTH_IMG"
+ invoke "dd if=/dev/zero of=$TRUTH_IMG count=0 bs=1 seek=$img_size"
+ invoke "$QEMU_IMG create -f qcow2 -ocluster_size=$cluster_size,backing_fmt=blksim -b $TEST_BASE $TEST_IMG $img_size"
+
+ invoke "$QEMU_IO --auto --cache=$cache --seed=$seed --truth=$TRUTH_IMG --format=qcow2 --test="blksim:$TEST_IMG" --verify_write=true --compare_before=false --compare_after=true --round=$round --parallel=$parallel --io_size=$io_size --fail_prob=$fail_prob --cancel_prob=$cancel_prob --instant_qemubh=$instant_qemubh"
+
+ seed=$[$seed + 1]
+done; done; done; done
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 04/26] FVD: add fully automated test-vdi.sh
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 02/26] FVD: extend qemu-io to do fully automated testing Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 03/26] FVD: add fully automated test-qcow2.sh Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 05/26] FVD: add the 'qemu-img update' command Chunqiang Tang
` (21 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
test-vdi.sh drives 'qemu-io --auto' to perform fully automated testing for VDI.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
test-vdi.sh | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 83 insertions(+), 0 deletions(-)
create mode 100755 test-vdi.sh
diff --git a/test-vdi.sh b/test-vdi.sh
new file mode 100755
index 0000000..b0bfe65
--- /dev/null
+++ b/test-vdi.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+
+# Drive 'qemu-io --auto' to test the VDI image format.
+#
+# Copyright IBM, Corp. 2010
+#
+# Authors:
+# Chunqiang Tang <ctang@us.ibm.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+if [ $USER != "root" ]; then
+ echo "This command must be run by root in order to mount tmpfs."
+ exit 1
+fi
+
+QEMU_DIR=.
+QEMU_IMG=$QEMU_DIR/qemu-img
+QEMU_IO=$QEMU_DIR/qemu-io
+
+if [ ! -e $QEMU_IMG ]; then
+ echo "$QEMU_IMG does not exist."
+ exit 1;
+fi
+
+if [ ! -e $QEMU_IO ]; then
+ echo "$QEMU_IO does not exist."
+ exit 1;
+fi
+
+DATA_DIR=/var/ramdisk
+TRUTH_IMG=$DATA_DIR/truth.raw
+TEST_IMG=$DATA_DIR/test.vdi
+CMD_LOG=./test-vdi.log
+
+parallel=10
+round=1000
+fail_prob=0.1
+cancel_prob=0
+flush_prob=0
+aio_flush_prob=0
+instant_qemubh=true
+seed=$RANDOM$RANDOM
+count=0
+
+function invoke() {
+ echo "$*" >> $CMD_LOG
+ $*
+ ret=$?
+ if [ $ret -ne 0 ]; then
+ echo "Exit with error code $ret: $*"
+ exit $ret
+ fi
+}
+
+mount | grep $DATA_DIR > /dev/null
+if [ $? -ne 0 ]; then
+ echo "Create tmpfs at $DATA_DIR to store testing images."
+ if [ ! -e $DATA_DIR ]; then mkdir -p $DATA_DIR ; fi
+ invoke "mount -t tmpfs none $DATA_DIR -o size=400M"
+ if [ $? -ne 0 ]; then exit 1; fi
+fi
+
+/bin/rm -f $CMD_LOG
+touch $CMD_LOG
+
+while [ -t ]; do
+for io_size in 3145728; do
+ count=$[$count + 1]
+ echo "Round $count" >> $CMD_LOG
+
+ # VDI image is about 100M
+ img_size=$[(104857600 + ($RANDOM$RANDOM$RANDOM % 10485760)) / 512 * 512]
+
+ invoke "/bin/rm -rf $TRUTH_IMG $TEST_IMG"
+ invoke "dd if=/dev/zero of=$TRUTH_IMG count=0 bs=1 seek=$img_size"
+ invoke "$QEMU_IMG create -f vdi $TEST_IMG $img_size"
+
+ invoke "$QEMU_IO --auto --seed=$seed --truth=$TRUTH_IMG --format=vdi --test="blksim:$TEST_IMG" --verify_write=true --compare_before=false --compare_after=true --round=$round --parallel=$parallel --io_size=$io_size --fail_prob=$fail_prob --cancel_prob=$cancel_prob --aio_flush_prob=$aio_flush_prob --flush_prob=$flush_prob --instant_qemubh=$instant_qemubh"
+
+ seed=$[$seed + 1]
+done; done
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 05/26] FVD: add the 'qemu-img update' command
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (2 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 04/26] FVD: add fully automated test-vdi.sh Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 06/26] FVD: skeleton of Fast Virtual Disk Chunqiang Tang
` (20 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds the 'update' command to qemu-img. It is a general interface
that allows various image format specific manipulations. For example,
'qemu-img rebase' and 'qemu-img resize' can be considered as two special cases
of update.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block_int.h | 3 +
qemu-img-cmds.hx | 6 +++
qemu-img.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++-------
qemu-option.c | 79 ++++++++++++++++++++++++++++++++++
qemu-option.h | 4 ++
5 files changed, 201 insertions(+), 16 deletions(-)
diff --git a/block_int.h b/block_int.h
index 545ad11..8f6b6d0 100644
--- a/block_int.h
+++ b/block_int.h
@@ -98,6 +98,7 @@ struct BlockDriver {
int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs,
const char *snapshot_name);
int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi);
+ int (*bdrv_update)(BlockDriverState *bs, QEMUOptionParameter *options);
int (*bdrv_save_vmstate)(BlockDriverState *bs, const uint8_t *buf,
int64_t pos, int size);
@@ -122,6 +123,8 @@ struct BlockDriver {
/* List of options for creating images, terminated by name == NULL */
QEMUOptionParameter *create_options;
+ /* List of options for updating images, terminated by name == NULL */
+ QEMUOptionParameter *update_options;
/*
* Returns 0 for completed check, -errno for internal errors.
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index 6c7176f..a7ed395 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -39,6 +39,12 @@ STEXI
@item info [-f @var{fmt}] @var{filename}
ETEXI
+DEF("update", img_update,
+ "update [-f fmt] [-o options] filename")
+STEXI
+@item update [-f @var{fmt}] [-o @var{options}] @var{filename} [@var{size}]
+ETEXI
+
DEF("snapshot", img_snapshot,
"snapshot [-l | -a snapshot | -c snapshot | -d snapshot] filename")
STEXI
diff --git a/qemu-img.c b/qemu-img.c
index 7e3cc4c..215e7b9 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -179,10 +179,11 @@ static int read_password(char *buf, int buf_size)
}
#endif
-static int print_block_option_help(const char *filename, const char *fmt)
+static int print_block_option_help(const char *filename, const char *fmt,
+ bool create_options)
{
BlockDriver *drv, *proto_drv;
- QEMUOptionParameter *create_options = NULL;
+ QEMUOptionParameter *options = NULL;
/* Find driver and parse its options */
drv = bdrv_find_format(fmt);
@@ -197,12 +198,15 @@ static int print_block_option_help(const char *filename, const char *fmt)
return 1;
}
- create_options = append_option_parameters(create_options,
- drv->create_options);
- create_options = append_option_parameters(create_options,
- proto_drv->create_options);
- print_option_help(create_options);
- free_option_parameters(create_options);
+ if (create_options) {
+ options = append_option_parameters(options, drv->create_options);
+ options = append_option_parameters(options, proto_drv->create_options);
+ } else {
+ options = append_option_parameters(options, drv->update_options);
+ options = append_option_parameters(options, proto_drv->update_options);
+ }
+ print_option_help(options);
+ free_option_parameters(options);
return 0;
}
@@ -337,7 +341,7 @@ static int img_create(int argc, char **argv)
}
if (options && !strcmp(options, "?")) {
- ret = print_block_option_help(filename, fmt);
+ ret = print_block_option_help(filename, fmt, true /*create*/);
goto out;
}
@@ -631,7 +635,7 @@ static int img_convert(int argc, char **argv)
out_filename = argv[argc - 1];
if (options && !strcmp(options, "?")) {
- ret = print_block_option_help(out_filename, out_fmt);
+ ret = print_block_option_help(out_filename, out_fmt, true /*create*/);
goto out;
}
@@ -869,7 +873,7 @@ static int img_convert(int argc, char **argv)
assume that sectors which are unallocated in the input image
are present in both the output's and input's base images (no
need to copy them). */
- if (out_baseimg) {
+ if (out_baseimg || bs[bs_i]->backing_file[0]==0) {
if (!bdrv_is_allocated(bs[bs_i], sector_num - bs_offset,
n, &n1)) {
sector_num += n1;
@@ -1040,11 +1044,6 @@ static int img_info(int argc, char **argv)
if (bdrv_is_encrypted(bs)) {
printf("encrypted: yes\n");
}
- if (bdrv_get_info(bs, &bdi) >= 0) {
- if (bdi.cluster_size != 0) {
- printf("cluster_size: %d\n", bdi.cluster_size);
- }
- }
bdrv_get_backing_filename(bs, backing_filename, sizeof(backing_filename));
if (backing_filename[0] != '\0') {
path_combine(backing_filename2, sizeof(backing_filename2),
@@ -1053,11 +1052,105 @@ static int img_info(int argc, char **argv)
backing_filename,
backing_filename2);
}
+ if (bdrv_get_info(bs, &bdi) >= 0) {
+ if (bdi.cluster_size != 0)
+ printf("cluster_size: %d\n", bdi.cluster_size);
+ }
dump_snapshots(bs);
bdrv_delete(bs);
return 0;
}
+static int img_update(int argc, char **argv)
+{
+ int c, ret = 0;
+ const char *filename, *fmt = NULL;
+ BlockDriverState *bs;
+ char *options = NULL;
+ QEMUOptionParameter *param = NULL, *option_template = NULL;
+ BlockDriver *drv, *proto_drv;
+ char fmt_name[128];
+
+ for(;;) {
+ c = getopt(argc, argv, "f:o:h");
+ if (c == -1)
+ break;
+ switch(c) {
+ case 'h':
+ help();
+ break;
+ case 'f':
+ fmt = optarg;
+ break;
+ case 'o':
+ options = optarg;
+ break;
+ }
+ }
+ if (optind >= argc)
+ help();
+ filename = argv[optind++];
+ if (!options) {
+ error_report("No options were given\n");
+ return -EINVAL;
+ }
+
+ bs = bdrv_new_open(filename, fmt, BDRV_O_FLAGS | BDRV_O_NO_BACKING
+ | BDRV_O_RDWR);
+ if (!bs) {
+ ret = -EIO;
+ goto out;
+ }
+
+ bdrv_get_format(bs, fmt_name, sizeof(fmt_name));
+
+ if (!strcmp(options, "?")) {
+ return print_block_option_help(filename, fmt_name, false /*update*/);
+ }
+
+ if (!bs->drv->bdrv_update) {
+ error_report("the 'update' command is not supported for the '%s' "
+ "image format.", fmt_name);
+ goto out;
+ }
+
+ /* Find driver and parse its options */
+ drv = bdrv_find_format(fmt_name);
+ if (!drv) {
+ error_report("Unknown file format '%s'", fmt_name);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ proto_drv = bdrv_find_protocol(filename);
+ if (!proto_drv) {
+ error_report("Unknown protocol '%s'", filename);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ option_template = append_option_parameters(option_template,
+ drv->update_options);
+ option_template = append_option_parameters(option_template,
+ proto_drv->update_options);
+
+ if (!(param = parse_specified_parameters(options, option_template))) {
+ error_report("Invalid options for file format '%s'.", fmt_name);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = bs->drv->bdrv_update(bs, param);
+
+out:
+ free_option_parameters(option_template);
+ free_option_parameters(param);
+ if(bs) {
+ bdrv_delete(bs);
+ }
+ return ret;
+}
+
#define SNAPSHOT_LIST 1
#define SNAPSHOT_CREATE 2
#define SNAPSHOT_APPLY 3
diff --git a/qemu-option.c b/qemu-option.c
index 65db542..28b19b5 100644
--- a/qemu-option.c
+++ b/qemu-option.c
@@ -289,6 +289,10 @@ int set_option_parameter(QEMUOptionParameter *list, const char *name,
return -1;
break;
+ case OPT_NUMBER:
+ list->value.n = atoi (value);
+ break;
+
default:
fprintf(stderr, "Bug: Option '%s' has an unknown type\n", name);
return -1;
@@ -391,6 +395,22 @@ QEMUOptionParameter *append_option_parameters(QEMUOptionParameter *dest,
return dest;
}
+QEMUOptionParameter *append_one_option_parameter(QEMUOptionParameter *dest,
+ QEMUOptionParameter *param)
+{
+ QEMUOptionParameter *target;
+ if ((target = get_option_parameter(dest, param->name))) {
+ *target = *param;
+ } else {
+ size_t n = count_option_parameters(dest);
+ dest = qemu_realloc(dest, (n + 2) * sizeof(QEMUOptionParameter));
+ dest[n] = *param;
+ dest[n + 1].name = NULL;
+ }
+
+ return dest;
+}
+
/*
* Parses a parameter string (param) into an option list (dest).
*
@@ -461,6 +481,65 @@ fail:
}
/*
+ * Parses a parameter string (param) into an option list (dest).
+ *
+ * list is the template option list. If list is NULL, this function fails.
+ * Only options explicitly specified in param are returned in dest.
+ */
+QEMUOptionParameter *parse_specified_parameters(const char *param,
+ QEMUOptionParameter *list)
+{
+ QEMUOptionParameter *dest = NULL;
+ char name[256];
+ char value[256];
+ char *param_delim, *value_delim;
+ char next_delim;
+ QEMUOptionParameter *opt;
+
+ if (list == NULL) {
+ return NULL;
+ }
+
+ while (*param) {
+ /* Find parameter name and value in the string. */
+ param_delim = strchr(param, ',');
+ value_delim = strchr(param, '=');
+
+ if (value_delim && (value_delim < param_delim || !param_delim)) {
+ next_delim = '=';
+ } else {
+ next_delim = ',';
+ value_delim = NULL;
+ }
+
+ param = get_opt_name(name, sizeof(name), param, next_delim);
+ if (value_delim) {
+ param = get_opt_value(value, sizeof(value), param + 1);
+ }
+ if (*param != '\0') {
+ param++;
+ }
+
+ /* Set the parameter in the template. */
+ if (set_option_parameter(list, name, value_delim ? value : NULL)) {
+ goto fail;
+ }
+
+ /* Copy from template to dest. */
+ opt = get_option_parameter(list, name);
+ dest = append_one_option_parameter(dest, opt);
+ }
+
+ return dest;
+
+fail:
+ if (dest) {
+ free_option_parameters(dest);
+ }
+ return NULL;
+}
+
+/*
* Prints all options of a list that have a value to stdout
*/
void print_option_parameters(QEMUOptionParameter *list)
diff --git a/qemu-option.h b/qemu-option.h
index b515813..81ca734 100644
--- a/qemu-option.h
+++ b/qemu-option.h
@@ -72,8 +72,12 @@ int set_option_parameter_int(QEMUOptionParameter *list, const char *name,
uint64_t value);
QEMUOptionParameter *append_option_parameters(QEMUOptionParameter *dest,
QEMUOptionParameter *list);
+QEMUOptionParameter *append_one_option_parameter(QEMUOptionParameter *dest,
+ QEMUOptionParameter *param);
QEMUOptionParameter *parse_option_parameters(const char *param,
QEMUOptionParameter *list, QEMUOptionParameter *dest);
+QEMUOptionParameter *parse_specified_parameters(const char *param,
+ QEMUOptionParameter *list);
void free_option_parameters(QEMUOptionParameter *list);
void print_option_parameters(QEMUOptionParameter *list);
void print_option_help(QEMUOptionParameter *list);
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 06/26] FVD: skeleton of Fast Virtual Disk
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (3 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 05/26] FVD: add the 'qemu-img update' command Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 07/26] FVD: extend FVD header fvd.h to be more complete Chunqiang Tang
` (19 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds the skeleton of the block device driver for
Fast Virtual Disk (FVD).
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
Makefile.objs | 2 +-
block/fvd-create.c | 21 +++++++
block/fvd-flush.c | 24 +++++++
block/fvd-misc.c | 37 +++++++++++
block/fvd-open.c | 17 +++++
block/fvd-read.c | 21 +++++++
block/fvd-update.c | 21 +++++++
block/fvd-write.c | 21 +++++++
block/fvd.c | 60 ++++++++++++++++++
block/fvd.h | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++++
10 files changed, 394 insertions(+), 1 deletions(-)
create mode 100644 block/fvd-create.c
create mode 100644 block/fvd-flush.c
create mode 100644 block/fvd-misc.c
create mode 100644 block/fvd-open.c
create mode 100644 block/fvd-read.c
create mode 100644 block/fvd-update.c
create mode 100644 block/fvd-write.c
create mode 100644 block/fvd.c
create mode 100644 block/fvd.h
diff --git a/Makefile.objs b/Makefile.objs
index 264aab3..9185d3e 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -23,7 +23,7 @@ block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow
block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
block-nested-y += qed-check.o
block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
-block-nested-y += blksim.o
+block-nested-y += blksim.o fvd.o
block-nested-$(CONFIG_WIN32) += raw-win32.o
block-nested-$(CONFIG_POSIX) += raw-posix.o
block-nested-$(CONFIG_CURL) += curl.o
diff --git a/block/fvd-create.c b/block/fvd-create.c
new file mode 100644
index 0000000..5593cea
--- /dev/null
+++ b/block/fvd-create.c
@@ -0,0 +1,21 @@
+/*
+ * QEMU Fast Virtual Disk Format bdrv_create()
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static int fvd_create(const char *filename, QEMUOptionParameter * options)
+{
+ return -ENOTSUP;
+}
+
+static QEMUOptionParameter fvd_create_options[] = {
+ {NULL}
+};
diff --git a/block/fvd-flush.c b/block/fvd-flush.c
new file mode 100644
index 0000000..34bd5cb
--- /dev/null
+++ b/block/fvd-flush.c
@@ -0,0 +1,24 @@
+/*
+ * QEMU Fast Virtual Disk Format bdrv_flush() and bdrv_aio_flush()
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState * bs,
+ BlockDriverCompletionFunc * cb,
+ void *opaque)
+{
+ return NULL;
+}
+
+static int fvd_flush(BlockDriverState * bs)
+{
+ return -ENOTSUP;
+}
diff --git a/block/fvd-misc.c b/block/fvd-misc.c
new file mode 100644
index 0000000..f4e1038
--- /dev/null
+++ b/block/fvd-misc.c
@@ -0,0 +1,37 @@
+/*
+ * QEMU Fast Virtual Disk Format Misc Functions of BlockDriver Interface
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static void fvd_close(BlockDriverState * bs)
+{
+}
+
+static int fvd_probe(const uint8_t * buf, int buf_size, const char *filename)
+{
+ return 0;
+}
+
+static int fvd_is_allocated(BlockDriverState * bs, int64_t sector_num,
+ int nb_sectors, int *pnum)
+{
+ return 0;
+}
+
+static int fvd_get_info(BlockDriverState * bs, BlockDriverInfo * bdi)
+{
+ return -ENOTSUP;
+}
+
+static int fvd_has_zero_init(BlockDriverState * bs)
+{
+ return 0;
+}
diff --git a/block/fvd-open.c b/block/fvd-open.c
new file mode 100644
index 0000000..056b994
--- /dev/null
+++ b/block/fvd-open.c
@@ -0,0 +1,17 @@
+/*
+ * QEMU Fast Virtual Disk Format bdrv_file_open()
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static int fvd_open(BlockDriverState * bs, const char *filename, int flags)
+{
+ return -ENOTSUP;
+}
diff --git a/block/fvd-read.c b/block/fvd-read.c
new file mode 100644
index 0000000..b9f3ac9
--- /dev/null
+++ b/block/fvd-read.c
@@ -0,0 +1,21 @@
+/*
+ * QEMU Fast Virtual Disk Format bdrv_aio_readv()
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static BlockDriverAIOCB *fvd_aio_readv(BlockDriverState * bs,
+ int64_t sector_num, QEMUIOVector * qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc * cb,
+ void *opaque)
+{
+ return NULL;
+}
diff --git a/block/fvd-update.c b/block/fvd-update.c
new file mode 100644
index 0000000..2498618
--- /dev/null
+++ b/block/fvd-update.c
@@ -0,0 +1,21 @@
+/*
+ * QEMU Fast Virtual Disk Format bdrv_update
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static int fvd_update(BlockDriverState * bs, QEMUOptionParameter * options)
+{
+ return -ENOTSUP;
+}
+
+static QEMUOptionParameter fvd_update_options[] = {
+ {NULL}
+};
diff --git a/block/fvd-write.c b/block/fvd-write.c
new file mode 100644
index 0000000..a736a37
--- /dev/null
+++ b/block/fvd-write.c
@@ -0,0 +1,21 @@
+/*
+ * QEMU Fast Virtual Disk Format bdrv_aio_writev()
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static BlockDriverAIOCB *fvd_aio_writev(BlockDriverState * bs,
+ int64_t sector_num,
+ QEMUIOVector * qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb,
+ void *opaque)
+{
+ return NULL;
+}
diff --git a/block/fvd.c b/block/fvd.c
new file mode 100644
index 0000000..bc2645c
--- /dev/null
+++ b/block/fvd.c
@@ -0,0 +1,60 @@
+/*
+ * QEMU Fast Virtual Disk Format
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/*=============================================================================
+ * See the following companion papers for a detailed description of FVD:
+ * 1. The so-called "FVD-cow paper":
+ * "FVD: a High-Performance Virtual Machine Image Format for Cloud",
+ * by Chunqiang Tang, 2010.
+ * 2. The so-called "FVD-compact paper":
+ * "FVD: a High-Performance Virtual Machine Image Format for Cloud
+ * with Sparse Image Capability", by Chunqiang Tang, 2010.
+ *============================================================================*/
+
+#include "block/fvd.h"
+
+/* Use include to avoid exposing too many FVD symbols, and to allow inline
+ * function optimization. */
+#include "block/fvd-flush.c"
+#include "block/fvd-update.c"
+#include "block/fvd-misc.c"
+#include "block/fvd-create.c"
+#include "block/fvd-open.c"
+#include "block/fvd-read.c"
+#include "block/fvd-write.c"
+
+static BlockDriver bdrv_fvd = {
+ .format_name = "fvd",
+ .instance_size = sizeof(BDRVFvdState),
+ .bdrv_create = fvd_create,
+ .bdrv_probe = fvd_probe,
+ .bdrv_file_open = fvd_open,
+ .bdrv_close = fvd_close,
+ .bdrv_is_allocated = fvd_is_allocated,
+ .bdrv_flush = fvd_flush,
+ .bdrv_aio_readv = fvd_aio_readv,
+ .bdrv_aio_writev = fvd_aio_writev,
+ .bdrv_aio_flush = fvd_aio_flush,
+ .create_options = fvd_create_options,
+ .update_options = fvd_update_options,
+ .bdrv_get_info = fvd_get_info,
+ .bdrv_update = fvd_update,
+ .bdrv_has_zero_init = fvd_has_zero_init
+};
+
+static void bdrv_fvd_init(void)
+{
+ bdrv_register(&bdrv_fvd);
+}
+
+block_init(bdrv_fvd_init);
diff --git a/block/fvd.h b/block/fvd.h
new file mode 100644
index 0000000..f2da330
--- /dev/null
+++ b/block/fvd.h
@@ -0,0 +1,171 @@
+/*
+ * QEMU Fast Virtual Disk Format
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "block_int.h"
+#include "osdep.h"
+#include "qemu-option.h"
+#include "qemu-timer.h"
+#include "block.h"
+#include "qemu-queue.h"
+#include "qemu-common.h"
+
+enum {
+ FVD_MAGIC = ('F' | 'V' << 8 | 'D' << 16 | '\0' << 24),
+ FVD_VERSION = 1,
+ INCOMPATIBLE_FEATURES_SPACE = 4096, /* in bytes. */
+ DEF_PAGE_SIZE = 4096 /* in bytes. */
+};
+
+/*
+ * The FVD format consists of the following fields in little endian:
+ * + Header fields of FvdHeader.
+ * + Bitmap, starting on a 4KB page boundary at a location specified by
+ * FvdHeader.bitmap_offset.
+ * + Journal, starting on a 4KB page boundary at a location specified by
+ * FvdHeader.journal_offset.
+ * + Table, starting on a 4KB page boundary at a location specified by
+ * FvdHeader.table_offset. When expanding the size of an existing FVD
+ * image, the table can be expanded to borrow space from the next,
+ * "virtual disk data" section, by relocating some data chunks.
+ * + Virtual disk data, starting on a 4KB page boundary. Optionally, disk
+ * data can be stored in a separate data file specified by
+ * FvdHeader.data_file.
+ */
+typedef struct __attribute__ ((__packed__)) FvdHeader {
+ uint32_t magic; /* FVD_MAGIC */
+
+ /* Size of FvdHeader in bytes, rounded up to DEF_PAGE_SIZE. A new FVD
+ * version may add fields to FvdHeader and hence need to increase
+ * header_size. When an old FVD version reads an image created by a new
+ * FVD version, the old version only reads the beginning part of FvdHeader
+ * that it can understand and ignores the new fields at the end of
+ * FvdHeader. */
+ uint32_t header_size;
+
+ /* Version of the FVD software that created the image. */
+ uint32_t create_version;
+
+ /* Version of the FVD software that opened the image most recently. This
+ * field is for forward compatibility. Consider one example. Suppose FVD
+ * version N+1 introduces a compatible feature, e.g., adding a
+ * 'last_modified' timestamp into the FVD image header. Even if FVD
+ * version N is unaware of this new feature, it can still open an image
+ * created by FVD version N+1 without problem, but won't update the
+ * last_modified field. FVD version N sets the image's
+ * 'last_open_version=N' when it opens the image. When FVD version N+1
+ * opens this image, it knows that the 'last_modified' field cannot be
+ * trusted and may take some actions accordingly, e.g., being conservative
+ * in some optimization heuristics that depend on the value of
+ * 'last_modified' to avoid making the optimization counterproductive. */
+ uint32_t last_open_version;
+
+ uint64_t virtual_disk_size; /* in bytes. Disk size perceived by the VM. */
+ uint64_t data_offset; /* in bytes. Aligned on DEF_PAGE_SIZE. */
+
+ /* Data can be optionally stored in a different file. */
+ char data_file[1024];
+ char data_file_fmt[16];
+
+ /* Base image. */
+ char base_img[1024];
+ char base_img_fmt[16];
+ uint64_t base_img_size; /* in bytes. */
+
+ /* Bitmap for base image. */
+ uint64_t bitmap_offset; /* in bytes. Aligned on DEF_PAGE_SIZE. */
+ uint64_t bitmap_size; /* in bytes. Rounded up to DEF_PAGE_SIZE. */
+ uint64_t block_size; /* in bytes. */
+
+ /* Journal */
+ uint64_t journal_offset; /* in bytes. */
+ uint64_t journal_size; /* in bytes. On-disk journal size. */
+ uint32_t clean_shutdown; /* true if VM's last shutdown was graceful. */
+ uint64_t stable_journal_epoch; /* Needed only if a chunk can be relocated.*/
+ uint64_t journal_buf_size; /* in bytes. In-memory buffer size. */
+ uint64_t journal_clean_buf_period; /* in milliseconds. */
+
+ /* Table for compact image. */
+ uint64_t table_offset; /* in bytes. Aligned on DEF_PAGE_SIZE. */
+ uint64_t table_size; /* in bytes. Rounded up to DEF_PAGE_SIZE. */
+ uint64_t chunk_size; /* in bytes. */
+ uint64_t storage_grow_unit; /* in bytes. */
+ char add_storage_cmd[1024];
+ uint32_t chunks_relocated; /* Affect bdrv_has_zero_init(). */
+
+ /* Copy-on-read */
+ uint32_t copy_on_read; /* true or false */
+ uint64_t max_outstanding_copy_on_read_data; /* in bytes. */
+
+ /* Prefetching. */
+ int64_t prefetch_start_delay; /* in seconds. -1 means disabled. */
+ uint32_t base_img_fully_prefetched; /* true or false. */
+ uint32_t num_prefetch_slots; /* Max number of outstanding prefetch writes. */
+ uint64_t bytes_per_prefetch; /* For whole image prefetching. */
+ uint64_t prefetch_read_throughput_measure_time; /* in milliseconds. */
+ uint64_t prefetch_write_throughput_measure_time; /* in milliseconds. */
+ uint64_t prefetch_min_read_throughput; /* in KB/second. */
+ uint64_t prefetch_min_write_throughput; /* in KB/second. */
+ uint64_t prefetch_max_read_throughput; /* in KB/second. */
+ uint64_t prefetch_max_write_throughput; /* in KB/second. */
+ uint64_t prefetch_throttle_time; /* in milliseconds. */
+
+ /* need_zero_init is true if the image mandates that the storage layer
+ * (BDRVFvdState.fvd_data) must return true for bdrv_has_zero_init().
+ * This is the case if the optimization described in Section 3.3.3 of the
+ * FVD-cow paper is enabled (see function search_holes()). If 'qemu-img
+ * create' sets need_zero_init to true, 'qemu-img update' can be used to
+ * manually reset it to false, if the user always manually pre-fills the
+ * storage (e.g., a raw partition) with zeros. If the image is stored on a
+ * file system, it already supports zero_init, and hence there is no need
+ * to manually manipulate this field. */
+ uint32_t need_zero_init;
+
+ /* This field enables adding incompatible features. For example, suppose
+ * FVD version N+1 adds image compression. A compressed image cannot be
+ * opened by FVD version N. Suppose in FVD version N, the value of
+ * INCOMPATIBLE_FEATURES_SPACE is 4096. Introducing image compression
+ * in FVD version N+1 causes the following changes to the header.
+ * In FVD version N:
+ * uint8_t incompatible_features[4096];
+ * In FVD version N+1:
+ * uint8_t image_compressed;
+ * uint8_t incompatible_features[4095];
+ *
+ * When any FVD version X opens an image, it always scans through the
+ * entire array of 'incompatible_features', although the size of
+ * INCOMPATIBLE_FEATURES_SPACE may be different for different FVD
+ * versions. If any bit of 'incompatible_features' is non-zero, FVD
+ * version X refuses to open the image. In the example above, if FVD
+ * version N+1 creates a non-compressed image, it sets
+ * 'image_compressed=0', which then still allows FVD version N to open the
+ * image. Instead of using one byte to represent a new feature, it can
+ * also use one bit to represent a new feature, which then allows a total
+ * of 32768 incompatible features to be added in the future.
+ */
+ uint8_t incompatible_features[INCOMPATIBLE_FEATURES_SPACE];
+
+ /* When a new FVD version introduces a new feature (which may or may not
+ * be backward compatible), an arbitrary number of new fields can be added
+ * to the image header, but those new fields must be added at the end of
+ * 'FvdHeader'. Old FVD versions simply won't read or write those new
+ * fields. Old FVD versions can still correctly access the bitmap, the
+ * journal, and the table, because no FVD version assumes a fixed header
+ * size, but instead accesses the bitmap, the journal, and the table
+ * through bitmap_offset, journal_offset, and table_offset, respectively.
+ * Similarly, if a new data structure of a variable size is added to the
+ * image header in the future, it must also be indexed by an offset field
+ * and a size field. */
+} FvdHeader;
+
+typedef struct BDRVFvdState {
+} BDRVFvdState;
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 07/26] FVD: extend FVD header fvd.h to be more complete
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (4 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 06/26] FVD: skeleton of Fast Virtual Disk Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 08/26] FVD: add debugging utilities Chunqiang Tang
` (18 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch makes FVD's header file fvd.h more complete, by adding type
definition for BDRVFvdState, FvdAIOCB, etc.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd.h | 337 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 337 insertions(+), 0 deletions(-)
diff --git a/block/fvd.h b/block/fvd.h
index f2da330..b83b7aa 100644
--- a/block/fvd.h
+++ b/block/fvd.h
@@ -168,4 +168,341 @@ typedef struct __attribute__ ((__packed__)) FvdHeader {
} FvdHeader;
typedef struct BDRVFvdState {
+ BlockDriverState *fvd_metadata;
+ BlockDriverState *fvd_data;
+ uint64_t virtual_disk_size; /*in bytes. */
+ uint64_t bitmap_offset; /* in sectors */
+ uint64_t bitmap_size; /* in bytes. */
+ uint64_t data_offset; /* in sectors. Begin of real data. */
+ uint64_t base_img_sectors;
+ uint64_t block_size; /* in sectors. */
+ bool copy_on_read;
+ uint64_t max_outstanding_copy_on_read_data; /* in bytes. */
+ uint64_t outstanding_copy_on_read_data; /* in bytes. */
+ bool data_region_prepared;
+ QLIST_HEAD(WriteLocks, FvdAIOCB) write_locks; /* All writes. */
+ QLIST_HEAD(CopyLocks, FvdAIOCB) copy_locks; /* copy-on-read and CoW. */
+
+ /* Keep two copies of bitmap to reduce the overhead of updating the
+ * on-disk bitmap, i.e., copy-on-read and prefetching do not update the
+ * on-disk bitmap. See Section 3.3.4 of the FVD-cow paper. */
+ uint8_t *fresh_bitmap;
+ uint8_t *stale_bitmap;
+
+ /******** Begin: for compact image. *************************************/
+ uint32_t *table; /* Mapping table stored in memory in little endian. */
+ uint64_t table_size; /* in bytes. */
+ uint64_t used_storage; /* in sectors. */
+ uint64_t avail_storage; /* in sectors. */
+ uint64_t chunk_size; /* in sectors. */
+ uint64_t storage_grow_unit; /* in sectors. */
+ uint64_t table_offset; /* in sectors. */
+ char *add_storage_cmd;
+ uint32_t *leaked_chunks;
+ uint32_t num_leaked_chunks;
+ uint32_t next_avail_leaked_chunk;
+ uint32_t chunks_relocated; /* Affect bdrv_has_zero_init(). */
+ /******** End: for compact image. ***************************************/
+
+ /******** Begin: for journal. *******************************************/
+ uint64_t journal_offset; /* in sectors. */
+ uint64_t journal_size; /* in sectors. */
+ uint64_t journal_epoch;
+ uint64_t next_journal_sector; /* in sectors. */
+ bool dirty_image;
+ bool metadata_err_prohibit_write;
+
+ /* There are two different ways of writing metadata changes to the
+ * journal. If cache=writethrough, metadata changes are written to the
+ * journal immediately. If (cache!=writethrough||IN_QEMU_TOOL), metadata
+ * changes are buffered in memory (bjnl.journal_buf below), and later
+ * written to the journal either triggered by bdrv_aio_flush() or by a
+ * timeout (bjnl.clean_buf_timer below). */
+ bool use_bjnl; /* 'bjnl' stands for buffered journal update. */
+ union {
+ /* 'ujnl' stands for unbuffered journal update. */
+ struct {
+ int active_writes;
+ /* Journal writes waiting for journal recycle to finish.
+ * See JournalCB.ujnl_next_wait4_recycle. */
+ QLIST_HEAD(JournalRecycle, FvdAIOCB) wait4_recycle;
+ } ujnl;
+
+ /* 'bjnl' stands for buffered journal update. */
+ struct {
+ uint8_t *buf;
+ size_t buf_size;
+ size_t def_buf_size;
+ size_t buf_used;
+ bool buf_contains_bitmap_update;
+ QEMUTimer *clean_buf_timer;
+ bool timer_scheduled;
+ uint64_t clean_buf_period;
+ /* See JournalCB.bjnl_next_queued_buf. */
+ QTAILQ_HEAD(CleanBuf, FvdAIOCB) queued_bufs;
+ } bjnl;
+ };
+ /******** End: for journal. ********************************************/
+
+ /******** Begin: for prefetching. ***********************************/
+ struct FvdAIOCB **prefetch_acb;
+ int prefetch_state; /* PREFETCH_STATE_RUNNING, FINISHED, or DISABLED. */
+ int num_prefetch_slots;
+ int num_filled_prefetch_slots;
+ int next_prefetch_read_slot;
+ bool prefetch_read_active;
+ bool pause_prefetch_requested;
+ int64_t prefetch_start_delay; /* in seconds */
+ uint64_t unclaimed_prefetch_region_start;
+ uint64_t prefetch_read_time; /* in milliseconds. */
+ uint64_t prefetch_write_time; /* in milliseconds. */
+ uint64_t prefetch_data_read; /* in bytes. */
+ uint64_t prefetch_data_written; /* in bytes. */
+ double prefetch_read_throughput; /* in bytes/millisecond. */
+ double prefetch_write_throughput; /* in bytes/millisecond. */
+ double prefetch_min_read_throughput; /* in bytes/millisecond. */
+ double prefetch_min_write_throughput; /* in bytes/millisecond. */
+ uint64_t prefetch_read_throughput_measure_time; /* in millisecond. */
+ uint64_t prefetch_write_throughput_measure_time; /* in millisecond.*/
+ uint64_t prefetch_throttle_time; /* in millisecond. */
+ uint64_t sectors_per_prefetch;
+ QEMUTimer *prefetch_timer;
+ /******** End: for prefetching. ***********************************/
+
+#ifdef FVD_DEBUG
+ int64_t total_copy_on_read_data; /* in bytes. */
+ int64_t total_prefetch_data; /* in bytes. */
+#endif
} BDRVFvdState;
+
+/* Begin of data type definitions. */
+struct FvdAIOCB;
+
+typedef struct JournalCB {
+ BlockDriverAIOCB *hd_acb;
+ QEMUIOVector qiov;
+ struct iovec iov;
+ bool bitmap_updated;
+ union {
+ QLIST_ENTRY(FvdAIOCB) ujnl_next_wait4_recycle;
+ QTAILQ_ENTRY(FvdAIOCB) bjnl_next_queued_buf;
+ };
+} JournalCB;
+
+/* CopyLock is used by AIOWriteCB and AIOCopyCB. */
+typedef struct CopyLock {
+ QLIST_ENTRY(FvdAIOCB) next;
+ int64_t begin;
+ int64_t end;
+ QLIST_HEAD(DependentWritesHead, FvdAIOCB) dependent_writes;
+} CopyLock;
+
+typedef struct ChildAIOReadCB {
+ BlockDriverAIOCB *hd_acb;
+ struct iovec iov;
+ QEMUIOVector qiov;
+ int64_t sector_num;
+ int nb_sectors;
+ int done;
+} ChildAIOReadCB;
+
+typedef struct AIOReadCB {
+ QEMUIOVector *qiov;
+ int ret;
+ ChildAIOReadCB read_backing;
+ ChildAIOReadCB read_fvd;
+} AIOReadCB;
+
+/* For copy-on-read and prefetching. */
+typedef struct AIOCopyCB {
+ BlockDriverAIOCB *hd_acb;
+ struct iovec iov;
+ QEMUIOVector qiov;
+ uint8_t *buf;
+ int64_t buffered_sector_begin;
+ int64_t buffered_sector_end;
+ int64_t last_prefetch_op_start_time; /* For prefetch only. */
+} AIOCopyCB;
+
+typedef struct AIOWriteCB {
+ BlockDriverAIOCB *hd_acb;
+ QEMUIOVector *qiov;
+ uint8_t *cow_buf;
+ QEMUIOVector *cow_qiov;
+ int64_t cow_start_sector;
+ int ret;
+ union {
+ bool update_table;
+ bool update_bitmap;
+ };
+
+ /* See BDRVFvdState.write_locks */
+ QLIST_ENTRY(FvdAIOCB) next_write_lock;
+
+ /* See FvdAIOCB.write.dependent_writes. */
+ QLIST_ENTRY(FvdAIOCB) next_dependent_write;
+} AIOWriteCB;
+
+/* For AIOStoreCompactCB and AIOLoadCompactCB. */
+typedef struct CompactChildCB {
+ struct FvdAIOCB *acb;
+ BlockDriverAIOCB *hd_acb;
+} CompactChildCB;
+
+/* For storing data to a compact image. */
+typedef struct AIOStoreCompactCB {
+ CompactChildCB one_child;
+ CompactChildCB *children;
+ int update_table;
+ int num_children;
+ int finished_children;
+ struct FvdAIOCB *parent_acb;
+ int ret;
+ int soft_write; /*true if the store is caused by copy-on-read or prefetch.*/
+ QEMUIOVector *orig_qiov;
+} AIOStoreCompactCB;
+
+/* For loading data from a compact image. */
+typedef struct AIOLoadCompactCB {
+ CompactChildCB *children;
+ CompactChildCB one_child;
+ int num_children;
+ int finished_children;
+ struct FvdAIOCB *parent_acb;
+ int ret;
+ QEMUIOVector *orig_qiov;
+} AIOLoadCompactCB;
+
+typedef struct AIOFlushCB {
+ BlockDriverAIOCB *data_acb;
+ BlockDriverAIOCB *metadata_acb;
+ int num_finished;
+ int ret;
+} AIOFlushCB;
+
+typedef struct AIOCleanJournalBufCB {
+ uint8_t *buf;
+} AIOCleanJournalBufCB;
+
+typedef struct AIOWrapperCB {
+ QEMUBH *bh;
+} AIOWrapperCB;
+
+typedef enum { OP_READ = 1, OP_WRITE, OP_COPY, OP_STORE_COMPACT,
+ OP_LOAD_COMPACT, OP_WRAPPER, OP_FLUSH, OP_BJNL_BUF_WRITE, OP_BJNL_FLUSH
+} op_type;
+
+#ifdef FVD_DEBUG
+/* For debugging memory leak. */
+typedef struct alloc_tracer_t {
+ int64_t magic;
+ int alloc_tracer;
+ const char *alloc_file;
+ int alloc_line;
+ size_t size;
+} alloc_tracer_t;
+#endif
+
+typedef struct FvdAIOCB {
+ BlockDriverAIOCB common;
+ op_type type;
+ int64_t sector_num;
+ int nb_sectors;
+ JournalCB jcb; /* For AIOWriteCB and AIOStoreCompactCB. */
+ CopyLock copy_lock; /* For AIOWriteCB and AIOCopyCB. */
+ bool cancel_in_progress;
+
+ /* Use a union so that all requests can efficiently share one big AIOPool.*/
+ union {
+ AIOWrapperCB wrapper;
+ AIOReadCB read;
+ AIOWriteCB write;
+ AIOCopyCB copy;
+ AIOLoadCompactCB load;
+ AIOStoreCompactCB store;
+ AIOFlushCB flush;
+ };
+
+#ifdef FVD_DEBUG
+ int64_t magic;
+ alloc_tracer_t tracer; /* For debugging memory leak. */
+ /* Uniquely identifies a request across all processing activities. */
+ unsigned long long int uuid;
+#endif
+} FvdAIOCB;
+
+static BlockDriver bdrv_fvd;
+static QEMUOptionParameter fvd_create_options[];
+static QEMUOptionParameter fvd_update_options[];
+
+/* Function prototypes. */
+static int fvd_create(const char *filename, QEMUOptionParameter * options);
+static int fvd_probe(const uint8_t * buf, int buf_size, const char *filename);
+static int fvd_open(BlockDriverState * bs, const char *filename, int flags);
+static void fvd_close(BlockDriverState * bs);
+static int fvd_is_allocated(BlockDriverState * bs, int64_t sector_num,
+ int nb_sectors, int *pnum);
+static int fvd_flush(BlockDriverState * bs);
+static BlockDriverAIOCB *fvd_aio_readv(BlockDriverState * bs,
+ int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb, void *opaque);
+static BlockDriverAIOCB *fvd_aio_writev(BlockDriverState * bs,
+ int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb, void *opaque);
+static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState * bs,
+ BlockDriverCompletionFunc * cb, void *opaque);
+static int fvd_get_info(BlockDriverState * bs, BlockDriverInfo * bdi);
+static int fvd_update (BlockDriverState * bs, QEMUOptionParameter * options);
+static int fvd_has_zero_init(BlockDriverState * bs);
+
+/* Default configurations. */
+#define BYTES_PER_PREFETCH 1048576 /* bytes */
+#define PREFETCH_THROTTLING_TIME 30000 /* milliseconds */
+#define NUM_PREFETCH_SLOTS 2
+#define PREFETCH_MIN_MEASURE_READ_TIME 100 /* milliseconds */
+#define PREFETCH_MIN_MEASURE_WRITE_TIME 100 /* milliseconds */
+#define PREFETCH_MIN_READ_THROUGHPUT 5120 /* KB/s */
+#define PREFETCH_MIN_WRITE_THROUGHPUT 5120 /* KB/s */
+#define PREFETCH_MAX_READ_THROUGHPUT 1000000000L /* KB/s */
+#define PREFETCH_MAX_WRITE_THROUGHPUT 1000000000L /* KB/s */
+#define PREFETCH_PERF_CALC_ALPHA 0.8
+#define MAX_OUTSTANDING_COPY_ON_READ_DATA 2000000 /* bytes */
+#define MODERATE_BITMAP_SIZE 4194304L /* bytes */
+#define CHUNK_SIZE 1048576LL /* bytes */
+#define JOURNAL_SIZE 16777216LL /* bytes */
+#define STORAGE_GROW_UNIT 104857600LL /* bytes */
+#define JOURNAL_BUF_SIZE (64*1024) /* bytes */
+#define JOURNAL_CLEAN_BUF_PERIOD 5000 /* milliseconds */
+
+/* State of BDRVFvdState.prefetch_state. */
+#define PREFETCH_STATE_RUNNING 1
+#define PREFETCH_STATE_FINISHED 2
+#define PREFETCH_STATE_DISABLED 3
+
+/* For convenience. */
+#define IN_QEMU_TOOL (rt_clock == NULL) /* a trick */
+#define ROUND_UP(x, base) ((((x)+(base)-1) / (base)) * (base))
+#define ROUND_DOWN(x, base) ((((x) / (base)) * (base)))
+#define BOOL(x) ((x) ? "true" : "false")
+#define EMPTY_TABLE ((uint32_t)0xFFFFFFFF)
+#define DIRTY_TABLE ((uint32_t)0x80000000)
+#define READ_TABLE(entry) (le32_to_cpu(entry) & ~DIRTY_TABLE)
+# define FVDAIOCB_MAGIC ((uint64_t)0x3A8FCE89325B976DULL)
+# define FVD_ALLOC_MAGIC ((uint64_t)0x4A7dCEF9925B976DULL)
+#define IS_EMPTY(entry) ((entry) == EMPTY_TABLE)
+#define IS_DIRTY(entry) (le32_to_cpu(entry) & DIRTY_TABLE)
+#define WRITE_TABLE(entry,id) ((entry) = cpu_to_le32(id))
+#define READ_TABLE2(entry) \
+ ((entry)==EMPTY_TABLE ? EMPTY_TABLE : (le32_to_cpu(entry) & ~DIRTY_TABLE))
+
+#define CLEAN_DIRTY(entry) \
+ do { \
+ if (!IS_EMPTY(entry)) \
+ entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE); \
+ } while (0)
+
+#define CLEAN_DIRTY2(entry) \
+ do { \
+ ASSERT(!IS_EMPTY(entry)); \
+ entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE); \
+ } while (0)
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 08/26] FVD: add debugging utilities
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (5 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 07/26] FVD: extend FVD header fvd.h to be more complete Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 09/26] FVD: add impl of interface bdrv_create() Chunqiang Tang
` (17 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds some debugging utilities to FVD.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/blksim.c | 7 +-
block/fvd-debug.c | 369 +++++++++++++++++++++++++++++++++++++++++++++++++++
block/fvd-ext.h | 71 ++++++++++
block/fvd-journal.c | 23 +++
block/fvd.c | 2 +
block/fvd.h | 1 +
qemu-io-auto.c | 17 ++-
7 files changed, 478 insertions(+), 12 deletions(-)
create mode 100644 block/fvd-debug.c
create mode 100644 block/fvd-ext.h
create mode 100644 block/fvd-journal.c
diff --git a/block/blksim.c b/block/blksim.c
index 5c7ef43..16e44ee 100644
--- a/block/blksim.c
+++ b/block/blksim.c
@@ -19,12 +19,7 @@
#include "qemu-queue.h"
#include "qemu-common.h"
#include "block/blksim.h"
-
-#if 1
-# define QDEBUG(format,...) do {} while (0)
-#else
-# define QDEBUG printf
-#endif
+#include "block/fvd-ext.h"
typedef enum
{
diff --git a/block/fvd-debug.c b/block/fvd-debug.c
new file mode 100644
index 0000000..36b4c43
--- /dev/null
+++ b/block/fvd-debug.c
@@ -0,0 +1,369 @@
+/*
+ * QEMU Fast Virtual Disk Format Debugging Utilities
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef ENABLE_TRACE_IO
+#define TRACE_REQUEST(...) do {} while (0)
+#define TRACE_STORE_IN_FVD(...) do {} while (0)
+
+#else
+
+static void TRACE_REQUEST(int do_write, int64_t sector_num, int nb_sectors)
+{
+ if (do_write) {
+ QDEBUG("TRACE_REQUEST: write sector_num=%" PRId64
+ " nb_sectors=%d [ ", sector_num, nb_sectors);
+ } else {
+ QDEBUG("TRACE_REQUEST: read sector_num=%" PRId64 " nb_sectors=%d "
+ "[ ", sector_num, nb_sectors);
+ }
+
+ int64_t end = sector_num + nb_sectors;
+ int64_t sec;
+ for (sec = sector_num; sec < end; sec++) {
+ QDEBUG("sec%" PRId64 " ", sec);
+ }
+ QDEBUG(" ]\n");
+}
+
+static void TRACE_STORE_IN_FVD(const char *str, int64_t sector_num,
+ int nb_sectors)
+{
+ QDEBUG("TRACE_STORE: %s sector_num=%" PRId64 " nb_sectors=%d [ ",
+ str, sector_num, nb_sectors);
+ int64_t end = sector_num + nb_sectors;
+ int64_t sec;
+ for (sec = sector_num; sec < end; sec++) {
+ QDEBUG("sec%" PRId64 " ", sec);
+ }
+ QDEBUG(" ]\n");
+}
+#endif
+
+#ifndef FVD_DEBUG
+#define my_qemu_malloc qemu_malloc
+#define my_qemu_mallocz qemu_mallocz
+#define my_qemu_blockalign qemu_blockalign
+#define my_qemu_free qemu_free
+#define my_qemu_vfree qemu_vfree
+#define my_qemu_aio_get qemu_aio_get
+#define my_qemu_aio_release qemu_aio_release
+#define COPY_UUID(to,from) do {} while (0)
+
+#else
+FILE *__fvd_debug_fp;
+static unsigned long long int fvd_uuid = 1;
+static int64_t pending_qemu_malloc = 0;
+static int64_t pending_qemu_aio_get = 0;
+static int64_t pending_local_writes = 0;
+static const char *alloc_file;
+static int alloc_line;
+
+#define my_qemu_malloc(size) \
+ ((void*)(alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_malloc(size)))
+
+#define my_qemu_mallocz(size) \
+ ((void*)(alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_mallocz(size)))
+
+#define my_qemu_blockalign(bs,size) \
+ ((void*)(alloc_file=__FILE__, \
+ alloc_line=__LINE__, \
+ _my_qemu_blockalign(bs,size)))
+
+#define my_qemu_aio_get(pool,bs,cb,op) \
+ ((void*)(alloc_file=__FILE__, \
+ alloc_line=__LINE__, \
+ _my_qemu_aio_get(pool,bs,cb,op)))
+
+#define my_qemu_free(p) \
+ (alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_free(p))
+
+#define my_qemu_vfree(p) \
+ (alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_vfree(p))
+
+static void COPY_UUID(FvdAIOCB * to, FvdAIOCB * from)
+{
+ if (from) {
+ to->uuid = from->uuid;
+ FVD_DEBUG_ACB(to);
+ }
+}
+
+#ifdef DEBUG_MEMORY_LEAK
+#define MAX_TRACER 10485760
+static int alloc_tracer_used = 1; /* slot 0 is not used. */
+static void **alloc_tracers = NULL;
+
+static void __attribute__ ((constructor)) init_mem_alloc_tracers(void)
+{
+ if (!alloc_tracers) {
+ alloc_tracers = qemu_mallocz(sizeof(void *) * MAX_TRACER);
+ }
+}
+
+static void trace_alloc(void *p, size_t size)
+{
+ alloc_tracer_t *t = p;
+ t->magic = FVD_ALLOC_MAGIC;
+ t->alloc_file = alloc_file;
+ t->alloc_line = alloc_line;
+ t->size = size;
+
+ if (alloc_tracer_used < MAX_TRACER) {
+ t->alloc_tracer = alloc_tracer_used++;
+ alloc_tracers[t->alloc_tracer] = t;
+ QDEBUG("Allocate memory using tracer%d in %s on line %d.\n",
+ t->alloc_tracer, alloc_file, alloc_line);
+ } else {
+ t->alloc_tracer = 0;
+ }
+
+ /* Set header and footer to detect out-of-range writes. */
+ if (size != (size_t) - 1) {
+ uint8_t *q = (uint8_t *) p;
+ uint64_t *header = (uint64_t *) (q + 512 - sizeof(uint64_t));
+ uint64_t *footer = (uint64_t *) (q + size - 512);
+ *header = FVD_ALLOC_MAGIC;
+ *footer = FVD_ALLOC_MAGIC;
+ }
+}
+
+static void trace_free(void *p)
+{
+ alloc_tracer_t *t = p;
+
+ QDEBUG("Free memory with tracer%d in %s on line %d.\n",
+ t->alloc_tracer, alloc_file, alloc_line);
+ ASSERT(t->magic == FVD_ALLOC_MAGIC && t->alloc_tracer >= 0);
+
+ /* Check header and footer to detect out-of-range writes. */
+ if (t->size != (size_t) - 1) {
+ uint8_t *q = (uint8_t *) p;
+ uint64_t *header = (uint64_t *) (q + 512 - sizeof(uint64_t));
+ uint64_t *footer = (uint64_t *) (q + t->size - 512);
+ ASSERT(*header == FVD_ALLOC_MAGIC);
+ ASSERT(*footer == FVD_ALLOC_MAGIC);
+ }
+
+ if (t->alloc_tracer) {
+ ASSERT(alloc_tracers[t->alloc_tracer] == t);
+ alloc_tracers[t->alloc_tracer] = NULL;
+ t->alloc_tracer = -INT_MAX;
+ } else {
+ t->alloc_tracer *= -1; /* Guard against double free. */
+ }
+}
+
+static void dump_alloc_tracers(void)
+{
+ int unfreed = 0;
+ int i;
+ for (i = 1; i < alloc_tracer_used; i++) {
+ if (!alloc_tracers[i]) {
+ continue;
+ }
+
+ unfreed++;
+ alloc_tracer_t *t = alloc_tracers[i];
+
+ if (t->size == (size_t) - 1) {
+ FvdAIOCB *acb = container_of(alloc_tracers[i], FvdAIOCB, tracer);
+ ASSERT(acb->magic == FVDAIOCB_MAGIC);
+ QDEBUG("Memory %p with tracer%d allocated in %s on line %d "
+ "(FvdAIOCB acb%llu-%p) is not freed. magic %s\n",
+ alloc_tracers[i], i, t->alloc_file, t->alloc_line,
+ acb->uuid, acb,
+ t->magic == FVD_ALLOC_MAGIC ? "correct" : "wrong");
+ } else {
+ QDEBUG("Memory %p with tracer%d allocated in %s on line %d is "
+ "not freed. magic %s\n",
+ alloc_tracers[i], i, t->alloc_file, t->alloc_line,
+ t->magic == FVD_ALLOC_MAGIC ? "correct" : "wrong");
+
+ uint8_t *q = (uint8_t *) t;
+ uint64_t *header = (uint64_t *) (q + 512 - sizeof(uint64_t));
+ uint64_t *footer = (uint64_t *) (q + t->size - 512);
+ ASSERT(*header == FVD_ALLOC_MAGIC);
+ ASSERT(*footer == FVD_ALLOC_MAGIC);
+ }
+ }
+
+ QDEBUG("Unfreed memory allocations: %d\n", unfreed);
+}
+#endif
+
+static inline void *_my_qemu_aio_get(AIOPool * pool, BlockDriverState * bs,
+ BlockDriverCompletionFunc * cb,
+ void *opaque)
+{
+ pending_qemu_aio_get++;
+ FvdAIOCB *acb = (FvdAIOCB *) qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
+ acb->uuid = ++fvd_uuid;
+ acb->magic = FVDAIOCB_MAGIC;
+
+ FVD_DEBUG_ACB(acb);
+
+#ifdef DEBUG_MEMORY_LEAK
+ trace_alloc(&acb->tracer, -1);
+#endif
+
+ return acb;
+}
+
+static inline void my_qemu_aio_release(void *p)
+{
+ pending_qemu_aio_get--;
+ ASSERT(pending_qemu_aio_get >= 0);
+
+#ifdef DEBUG_MEMORY_LEAK
+ FvdAIOCB *acb = p;
+ trace_free(&acb->tracer);
+#endif
+
+ qemu_aio_release(p);
+}
+
+static inline void *_my_qemu_malloc(size_t size)
+{
+ ASSERT(size > 0);
+ pending_qemu_malloc++;
+#ifndef DEBUG_MEMORY_LEAK
+ return qemu_malloc(size);
+#else
+
+ size += 1024; /* 512 bytes header and 512 bytes footer. */
+ uint8_t *ret = qemu_malloc(size);
+ trace_alloc(ret, size);
+ return ret + 512;
+#endif
+}
+
+static inline void *_my_qemu_mallocz(size_t size)
+{
+ ASSERT(size > 0);
+ pending_qemu_malloc++;
+#ifndef DEBUG_MEMORY_LEAK
+ return qemu_mallocz(size);
+#else
+
+ size += 1024; /* 512 bytes header and 512 bytes footer. */
+ uint8_t *ret = qemu_mallocz(size);
+ trace_alloc(ret, size);
+ return ret + 512;
+#endif
+}
+
+static inline void *_my_qemu_blockalign(BlockDriverState * bs, size_t size)
+{
+ ASSERT(size > 0);
+ pending_qemu_malloc++;
+
+#ifndef DEBUG_MEMORY_LEAK
+ return qemu_blockalign(bs, size);
+#else
+
+ size += 1024; /* 512 bytes header and 512 bytes footer. */
+ uint8_t *ret = qemu_blockalign(bs, size);
+ trace_alloc(ret, size);
+ return ret + 512;
+#endif
+}
+
+static inline void _my_qemu_free(void *ptr)
+{
+ pending_qemu_malloc--;
+ ASSERT(pending_qemu_malloc >= 0);
+#ifndef DEBUG_MEMORY_LEAK
+ qemu_free(ptr);
+#else
+
+ uint8_t *q = ((uint8_t *) ptr) - 512;
+ trace_free(q);
+ qemu_free(q);
+#endif
+}
+
+static inline void _my_qemu_vfree(void *ptr)
+{
+ pending_qemu_malloc--;
+ ASSERT(pending_qemu_malloc >= 0);
+#ifndef DEBUG_MEMORY_LEAK
+ qemu_vfree(ptr);
+#else
+
+ uint8_t *q = ((uint8_t *) ptr) - 512;
+ trace_free(q);
+ qemu_vfree(q);
+#endif
+}
+
+static void count_pending_requests(BDRVFvdState * s)
+{
+ int m = 0, n = 0, k = 0;
+ FvdAIOCB *w;
+
+ QLIST_FOREACH(w, &s->copy_locks, copy_lock.next) {
+ m++;
+ QDEBUG("copy_lock: acb%llu-%p\n", w->uuid, w);
+ }
+
+ QLIST_FOREACH(w, &s->write_locks, write.next_write_lock) {
+ k++;
+ QDEBUG("write_lock: acb%llu-%p\n", w->uuid, w);
+ }
+
+ if (s->use_bjnl) {
+ QTAILQ_FOREACH(w, &s->bjnl.queued_bufs, jcb.bjnl_next_queued_buf) {
+ n++;
+ QDEBUG("bjnl.pending_write: acb%llu-%p\n", w->uuid, w);
+ }
+ } else {
+ QLIST_FOREACH(w, &s->ujnl.wait4_recycle, jcb.ujnl_next_wait4_recycle) {
+ n++;
+ QDEBUG("ujnl.wait4_recycle: acb%llu-%p\n", w->uuid, w);
+ }
+ }
+
+ QDEBUG("Debug_memory_leak: copy_locks=%d write_locks=%d "
+ "journal_locks=%d\n", m, k, n);
+}
+
+static void dump_resource_summary(BDRVFvdState * s)
+{
+#ifdef DEBUG_MEMORY_LEAK
+ dump_alloc_tracers();
+#endif
+
+ QDEBUG("Resource summary: outstanding_copy_on_read_data=%" PRId64
+ " total_copy_on_read_data=%" PRId64 " total_prefetch_data=%" PRId64
+ " " " pending_qemu_malloc=%" PRId64 " pending_qemu_aio_get=%" PRId64
+ " pending_local_writes=%" PRId64 "\n",
+ s->outstanding_copy_on_read_data, s->total_copy_on_read_data,
+ s->total_prefetch_data, pending_qemu_malloc, pending_qemu_aio_get,
+ pending_local_writes);
+ count_pending_requests(s);
+}
+
+void init_fvd_debug_fp(void)
+{
+ char buf[256];
+ sprintf(buf, "/tmp/fvd.log-%d", getpid());
+ if ((__fvd_debug_fp = fopen(buf, "wt")) == NULL) {
+ __fvd_debug_fp = stdout;
+ }
+}
+#endif
+
+void fvd_check_memory_leak(void)
+{
+ ASSERT(pending_qemu_malloc == 0);
+}
diff --git a/block/fvd-ext.h b/block/fvd-ext.h
new file mode 100644
index 0000000..641b9e9
--- /dev/null
+++ b/block/fvd-ext.h
@@ -0,0 +1,71 @@
/*
 * QEMU Fast Virtual Disk Format Exported Symbols
 *
 * Copyright IBM, Corp. 2010
 *
 * Authors:
 * Chunqiang Tang <ctang@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING.LIB file in the top-level directory.
 *
 */

/*=============================================================================
 * A short description: this header file contains functions of the FVD block
 * device driver that are used by other external modules. These functions are
 * mainly for testing and debugging purposes.
 *============================================================================*/

/* NOTE(review): identifiers beginning with a double underscore
 * (__fvd_ext_h__, __fvd_debug_fp) are reserved for the implementation by the
 * C standard; renaming them would touch other FVD files, so they are kept
 * here but should be cleaned up tree-wide. */
#ifndef __fvd_ext_h__
#define __fvd_ext_h__

//#define FVD_DEBUG
//#define ENABLE_QDEBUG

void fvd_check_memory_leak (void);
void fvd_init_prefetch (void * bs);
void fvd_emulate_host_crash (bool cond);

#ifndef FVD_DEBUG
/* Release build: all debugging hooks compile away to empty statements. */
# define ASSERT(x) do {} while (0)
# define FVD_DEBUG_ACB(...) do {} while (0)
# define QPAUSE(...) do {} while (0)
# undef ENABLE_QDEBUG

#else

extern FILE *__fvd_debug_fp;
void init_fvd_debug_fp(void);
void FVD_DEBUG_ACB(void *p);

/* On failure, print the location and block on stdin so a debugger can be
 * attached to the still-live process before abort().
 * Fix: dropped the stray trailing backslash after "while (0)", which made
 * the macro silently swallow the following source line. */
# define ASSERT(x) \
    do { \
        if (!(x)) { \
            fprintf(stderr, "Assertion failed in process %d at %s:%d. Wait.", \
                    getpid(),__FILE__, __LINE__); \
            fgetc (stdin); abort(); \
        } \
    } while (0)

/* Print a message and pause the process for interactive debugging. */
# define QPAUSE(format,...) \
    do { \
        fprintf(stderr, format, ##__VA_ARGS__); \
        fprintf(stderr, " Pause process %d for debugging...\n", getpid()); \
        fgetc(stdin); \
    } while (0)

#endif

#ifndef ENABLE_QDEBUG
# define QDEBUG(format,...) do {} while (0)
#else
/* Append to the per-process debug log, opening it lazily on first use. */
# define QDEBUG(format,...) \
    do { \
        if (__fvd_debug_fp==NULL) init_fvd_debug_fp(); \
        fprintf(__fvd_debug_fp, format, ##__VA_ARGS__); \
        fflush(__fvd_debug_fp); \
    } while(0)
#endif

#endif
diff --git a/block/fvd-journal.c b/block/fvd-journal.c
new file mode 100644
index 0000000..5824e35
--- /dev/null
+++ b/block/fvd-journal.c
@@ -0,0 +1,23 @@
+/*
+ * QEMU Fast Virtual Disk Format Metadata Journal
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
/* Controls whether the journal code simulates a host crash (for testing
 * recovery).  Defaults to on only in debug builds. */
#ifdef FVD_DEBUG
static bool emulate_host_crash = true;
#else
static bool emulate_host_crash = false;
#endif

/* Test hook: enable or disable simulated host crashes at runtime.  Called
 * by the qemu-io automated test harness. */
void fvd_emulate_host_crash(bool cond)
{
    emulate_host_crash = cond;
}
diff --git a/block/fvd.c b/block/fvd.c
index bc2645c..13fe940 100644
--- a/block/fvd.c
+++ b/block/fvd.c
@@ -25,6 +25,7 @@
/* Use include to avoid exposing too many FVD symbols, and to allow inline
* function optimization. */
+#include "block/fvd-debug.c"
#include "block/fvd-flush.c"
#include "block/fvd-update.c"
#include "block/fvd-misc.c"
@@ -32,6 +33,7 @@
#include "block/fvd-open.c"
#include "block/fvd-read.c"
#include "block/fvd-write.c"
+#include "block/fvd-journal.c"
static BlockDriver bdrv_fvd = {
.format_name = "fvd",
diff --git a/block/fvd.h b/block/fvd.h
index b83b7aa..9847e7f 100644
--- a/block/fvd.h
+++ b/block/fvd.h
@@ -18,6 +18,7 @@
#include "block.h"
#include "qemu-queue.h"
#include "qemu-common.h"
+#include "block/fvd-ext.h"
enum {
FVD_MAGIC = ('F' | 'V' << 8 | 'D' << 16 | '\0' << 24),
diff --git a/qemu-io-auto.c b/qemu-io-auto.c
index 73d79c7..67c84f8 100644
--- a/qemu-io-auto.c
+++ b/qemu-io-auto.c
@@ -35,14 +35,9 @@
#include "qemu-timer.h"
#include "qemu-common.h"
#include "block_int.h"
+#include "block/fvd-ext.h"
#include "block/blksim.h"
-#if 1
-# define QDEBUG(format,...) do {} while (0)
-#else
-# define QDEBUG printf
-#endif
-
#define die(format,...) \
do { \
fprintf (stderr, "%s:%d --- ", __FILE__, __LINE__); \
@@ -582,6 +577,11 @@ static void open_test_file(const char *format, const char *test_file, int flags)
if (bdrv_open(bs, test_file, flags, drv) < 0) {
die("Failed to open '%s'\n", test_file);
}
+
+ if (!strncmp(bs->drv->format_name, "fvd", 3)) {
+ bool emulate_crash = (rand() % 10 != 0); /* Random crash test. */
+ fvd_emulate_host_crash(emulate_crash);
+ }
}
static void perform_test(const char *truth_file, const char *test_file,
@@ -688,7 +688,12 @@ static void perform_test(const char *truth_file, const char *test_file,
}
printf("Test process %d finished successfully\n", getpid());
+
+ int is_fvd = (strncmp(bs->drv->format_name, "fvd", 3) == 0);
bdrv_delete(bs);
+ if (is_fvd) {
+ fvd_check_memory_leak();
+ }
close(fd);
}
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 09/26] FVD: add impl of interface bdrv_create()
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (6 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 08/26] FVD: add debugging utilities Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 10/26] FVD: add impl of interface bdrv_file_open() Chunqiang Tang
` (16 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds FVD's implementation of the bdrv_create() interface. It
supports FVD image creation.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-create.c | 702 ++++++++++++++++++++++++++++++++++++++++++++++++++-
block/fvd-journal.c | 5 +
block/fvd.c | 2 +-
3 files changed, 707 insertions(+), 2 deletions(-)
diff --git a/block/fvd-create.c b/block/fvd-create.c
index 5593cea..c8912aa 100644
--- a/block/fvd-create.c
+++ b/block/fvd-create.c
@@ -11,11 +11,711 @@
*
*/
+static void fvd_header_cpu_to_le(FvdHeader * header);
+static inline int64_t calc_min_journal_size(int64_t table_entries);
+static inline int search_empty_blocks(int fd, uint8_t * bitmap,
+ BlockDriverState * bs,
+ int64_t nb_sectors,
+ int32_t hole_size,
+ int32_t block_size);
+
static int fvd_create(const char *filename, QEMUOptionParameter * options)
{
- return -ENOTSUP;
+ int fd, ret = 0;
+ FvdHeader *header;
+ int64_t virtual_disk_size = DEF_PAGE_SIZE;
+ int32_t header_size;
+ const char *base_img = NULL;
+ const char *base_img_fmt = NULL;
+ const char *data_file = NULL;
+ const char *data_file_fmt = NULL;
+ int32_t hole_size = 0;
+ int copy_on_read = false;
+ int prefetch_start_delay = -1;
+ BlockDriverState *bs = NULL;
+ int bitmap_size = 0;
+ int64_t base_img_size = 0;
+ int64_t table_size = 0;
+ int64_t journal_size = 0;
+ int32_t block_size = 0;
+ int compact_image = false;
+ uint64_t max_copy_on_read = MAX_OUTSTANDING_COPY_ON_READ_DATA;
+ uint32_t num_prefetch_slots = NUM_PREFETCH_SLOTS;
+ uint64_t bytes_per_prefetch = BYTES_PER_PREFETCH;
+ uint64_t prefetch_throttle_time = PREFETCH_THROTTLING_TIME;
+ uint64_t prefetch_read_measure_time = PREFETCH_MIN_MEASURE_READ_TIME;
+ uint64_t prefetch_write_measure_time = PREFETCH_MIN_MEASURE_WRITE_TIME;
+ uint64_t prefetch_min_read_throughput = PREFETCH_MIN_READ_THROUGHPUT;
+ uint64_t prefetch_min_write_throughput = PREFETCH_MIN_WRITE_THROUGHPUT;
+ uint64_t prefetch_max_read_throughput = PREFETCH_MAX_READ_THROUGHPUT;
+ uint64_t prefetch_max_write_throughput = PREFETCH_MAX_WRITE_THROUGHPUT;
+
+ header_size = sizeof(FvdHeader);
+ header_size = ROUND_UP(header_size, DEF_PAGE_SIZE);
+ header = my_qemu_mallocz(header_size);
+ header->header_size = header_size;
+
+ /* Read out options */
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ virtual_disk_size = options->value.n;
+ } else if (!strcmp(options->name, "prefetch_start_delay")) {
+ if (options->value.n <= 0) {
+ prefetch_start_delay = -1;
+ } else {
+ prefetch_start_delay = options->value.n;
+ }
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+ base_img = options->value.s;
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
+ base_img_fmt = options->value.s;
+ } else if (!strcmp(options->name, "copy_on_read")) {
+ copy_on_read = options->value.n;
+ } else if (!strcmp(options->name, "data_file")) {
+ data_file = options->value.s;
+ } else if (!strcmp(options->name, "data_file_fmt")) {
+ data_file_fmt = options->value.s;
+ } else if (!strcmp(options->name, "optimize_empty_block")) {
+ hole_size = options->value.n;
+ } else if (!strcmp(options->name, "compact_image")) {
+ compact_image = options->value.n;
+ } else if (!strcmp(options->name, "block_size")) {
+ block_size = options->value.n;
+ } else if (!strcmp(options->name, "chunk_size")) {
+ header->chunk_size = options->value.n;
+ } else if (!strcmp(options->name, "journal_size")) {
+ journal_size = options->value.n;
+ } else if (!strcmp(options->name, "journal_buf_size")) {
+ header->journal_buf_size = options->value.n;
+ } else if (!strcmp(options->name, "journal_clean_buf_period")) {
+ header->journal_clean_buf_period = options->value.n;
+ } else if (!strcmp(options->name, "storage_grow_unit")) {
+ header->storage_grow_unit = options->value.n;
+ } else if (!strcmp(options->name, "add_storage_cmd") &&
+ options->value.s) {
+ pstrcpy(header->add_storage_cmd, sizeof(header->add_storage_cmd),
+ options->value.s);
+ } else if (!strcmp(options->name, "num_prefetch_slots") &&
+ options->value.n > 0) {
+ num_prefetch_slots = options->value.n;
+ } else if (!strcmp(options->name, "bytes_per_prefetch") &&
+ options->value.n > 0) {
+ bytes_per_prefetch = options->value.n;
+ } else if (!strcmp(options->name, "prefetch_throttle_time") &&
+ options->value.n > 0) {
+ prefetch_throttle_time = options->value.n;
+ } else if (!strcmp(options->name,
+ "prefetch_read_throughput_measure_time") &&
+ options->value.n > 0) {
+ prefetch_read_measure_time = options->value.n;
+ } else if (!strcmp(options->name,
+ "prefetch_write_throughput_measure_time") &&
+ options->value.n > 0) {
+ prefetch_write_measure_time = options->value.n;
+ } else if (!strcmp(options->name,
+ "prefetch_min_read_throughput") &&
+ options->value.n > 0) {
+ prefetch_min_read_throughput = options->value.n;
+ } else if (!strcmp(options->name,
+ "prefetch_min_write_throughput") &&
+ options->value.n > 0) {
+ prefetch_min_write_throughput = options->value.n;
+ } else if (!strcmp(options->name,
+ "prefetch_max_read_throughput") &&
+ options->value.n > 0) {
+ prefetch_max_read_throughput = options->value.n;
+ } else if (!strcmp(options->name,
+ "prefetch_max_write_throughput") &&
+ options->value.n > 0) {
+ prefetch_max_write_throughput = options->value.n;
+ } else if (!strcmp(options->name,
+ "max_outstanding_copy_on_read_data") &&
+ options->value.n > 0) {
+ max_copy_on_read = options->value.n;
+ }
+ options++;
+ }
+
+ virtual_disk_size = ROUND_UP(virtual_disk_size, 512);
+
+ /* Check if arguments are valid. */
+ if (base_img && strlen(base_img) > 1023) {
+ fprintf(stderr, "The base image name is longer than 1023 characters, "
+ "which is not allowed.\n");
+ return -EINVAL;
+ }
+
+ if (base_img && hole_size > 0) {
+ if (compact_image) {
+ fprintf(stderr, "compact_image and optimize_empty_block cannot be "
+ "enabled together. Please disable optimize_empty_block.\n");
+ return -EINVAL;
+ }
+ header->need_zero_init = true;
+ } else {
+ header->need_zero_init = false;
+ }
+
+ if (data_file) {
+ pstrcpy(header->data_file, 1024, data_file);
+ if (data_file_fmt) {
+ pstrcpy(header->data_file_fmt, 16, data_file_fmt);
+ }
+ }
+
+ header->magic = FVD_MAGIC;
+ header->last_open_version = header->create_version = FVD_VERSION;
+ header->virtual_disk_size = virtual_disk_size;
+ header->clean_shutdown = true;
+
+ if (!base_img) {
+ header->base_img_fully_prefetched = true;
+ } else {
+ /* Handle base image. */
+ int ret;
+
+ bs = bdrv_new("");
+ if (!bs) {
+ fprintf(stderr, "Failed to create a new block driver\n");
+ return -EIO;
+ }
+
+ pstrcpy(header->base_img, 1024, base_img);
+ if (base_img_fmt) {
+ pstrcpy(header->base_img_fmt, 16, base_img_fmt);
+ BlockDriver *drv = bdrv_find_format(base_img_fmt);
+ if (!drv) {
+ fprintf(stderr, "Failed to find driver for format '%s'\n",
+ base_img_fmt);
+ return -EINVAL;
+ }
+ ret = bdrv_open(bs, base_img, 0, drv);
+ } else {
+ ret = bdrv_open(bs, base_img, 0, NULL);
+ }
+
+ if (ret < 0) {
+ fprintf(stderr, "Failed to open the base image %s\n", base_img);
+ return -EIO;
+ }
+
+ base_img_size = bdrv_getlength(bs);
+ base_img_size = MIN(virtual_disk_size, base_img_size);
+ base_img_size = ROUND_UP(base_img_size, 512);
+
+ if (block_size <= 0) {
+ /* No block size is provided. Find the smallest block size that
+ * does not make the bitmap too big. */
+ block_size = 512;
+ while (1) {
+ int64_t blocks = (base_img_size + block_size - 1) / block_size;
+ bitmap_size = (blocks + 7) / 8;
+ if (bitmap_size <= MODERATE_BITMAP_SIZE) {
+ break;
+ }
+ block_size *= 2;
+ }
+ } else {
+ block_size = ROUND_UP(block_size, 512);
+ int64_t blocks = (base_img_size + block_size - 1) / block_size;
+ bitmap_size = (blocks + 7) / 8;
+ }
+
+ bitmap_size = ROUND_UP(bitmap_size, DEF_PAGE_SIZE);
+ header->bitmap_size = bitmap_size;
+ header->block_size = block_size;
+ header->bitmap_offset = header_size;
+ header->base_img_size = base_img_size;
+ header->max_outstanding_copy_on_read_data = max_copy_on_read;
+ header->copy_on_read = copy_on_read;
+ header->prefetch_start_delay = prefetch_start_delay;
+ header->num_prefetch_slots = num_prefetch_slots;
+ header->bytes_per_prefetch = ROUND_UP(bytes_per_prefetch, block_size);
+ header->prefetch_throttle_time = prefetch_throttle_time;
+ header->prefetch_read_throughput_measure_time =
+ prefetch_read_measure_time;
+ header->prefetch_write_throughput_measure_time =
+ prefetch_write_measure_time;
+ header->prefetch_min_read_throughput = prefetch_min_read_throughput;
+ header->prefetch_min_write_throughput = prefetch_min_write_throughput;
+ header->prefetch_max_read_throughput = prefetch_max_read_throughput;
+ header->prefetch_max_write_throughput = prefetch_max_write_throughput;
+ header->base_img_fully_prefetched = false;
+ }
+
+ /* Set the table size. */
+ if (compact_image) {
+ if (header->chunk_size <= 0) {
+ header->chunk_size = CHUNK_SIZE;
+ }
+ if (base_img) {
+ /* chunk_size must be a multiple of block_size. */
+ header->chunk_size = ROUND_UP(header->chunk_size, block_size);
+ } else {
+ header->chunk_size = ROUND_UP(header->chunk_size, DEF_PAGE_SIZE);
+ }
+
+ if (header->storage_grow_unit <= 0) {
+ header->storage_grow_unit = STORAGE_GROW_UNIT;
+ }
+ if (header->storage_grow_unit < header->chunk_size) {
+ header->storage_grow_unit = header->chunk_size;
+ }
+ int64_t table_entries =
+ (virtual_disk_size + header->chunk_size - 1) / header->chunk_size;
+ table_size = sizeof(uint32_t) * table_entries;
+ table_size = ROUND_UP(table_size, DEF_PAGE_SIZE);
+ if (table_size > 0) {
+ header->table_size = table_size;
+ }
+ }
+
+ /* Set the journal size. */
+ if (bitmap_size <= 0 && table_size <= 0) {
+ header->journal_size = 0; /* No need to use journal. */
+ } else if (journal_size < 0) {
+ /* Disable the use of journal, which reduces overhead but may cause
+ * data corruption if the host crashes. This is a valid configuration
+ * for some use cases, where data integrity is not critical. */
+ header->journal_size = 0;
+ } else {
+ if (journal_size == 0) {
+ /* No journal size is specified. Use a default size. */
+ journal_size = JOURNAL_SIZE;
+ }
+ if (table_size > 0) {
+ /* Make sure that the journal is at least large enough to record
+ * all table changes in one shot, which is the extremely unlikely
+ * worst case. */
+ int64_t vsize = virtual_disk_size + header->chunk_size - 1;
+ int64_t table_entries = vsize / header->chunk_size;
+ int64_t min_journal_size = calc_min_journal_size(table_entries);
+ if (journal_size < min_journal_size) {
+ journal_size = min_journal_size;
+ }
+ }
+ journal_size = ROUND_UP(journal_size, DEF_PAGE_SIZE);
+ header->journal_size = journal_size;
+ header->journal_offset = header_size + bitmap_size;
+
+ if (header->journal_buf_size <= 0) {
+ header->journal_buf_size = JOURNAL_BUF_SIZE;
+ }
+ header->journal_buf_size = ROUND_UP(header->journal_buf_size, 512);
+ if (header->journal_buf_size > header->journal_size) {
+ header->journal_buf_size = header->journal_size;
+ }
+ if (header->journal_clean_buf_period == 0) {
+ header->journal_clean_buf_period = JOURNAL_CLEAN_BUF_PERIOD;
+ }
+ }
+
+ if (table_size > 0) {
+ /* Table is located right before the data region. When expanding the
+ * size of an existing FVD image, the table can be expanded to borrow
+ * space from the data region, by relocating some data chunks. */
+ header->table_offset = header_size + bitmap_size + journal_size;
+ }
+
+ header->data_offset = header_size + bitmap_size + table_size +
+ MAX(0, journal_size);
+
+ /* Create the image file. */
+ fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+ if (fd < 0) {
+ fprintf(stderr, "Failed to open %s\n", filename);
+ goto fail;
+ }
+ fvd_header_cpu_to_le(header);
+
+ if (qemu_write_full(fd, header, header_size) != header_size) {
+ fprintf(stderr, "Failed to write the header of %s\n", filename);
+ goto fail;
+ }
+
+ /* Initialize the bitmap. */
+ if (bitmap_size > 0) {
+ uint8_t *bitmap = my_qemu_mallocz(bitmap_size);
+ if (hole_size > 0) {
+ if ((ret = search_empty_blocks(fd, bitmap, bs, base_img_size / 512,
+ hole_size, block_size))) {
+ goto fail;
+ }
+ }
+
+ ret = qemu_write_full(fd, bitmap, bitmap_size);
+ my_qemu_free(bitmap);
+ if (ret != bitmap_size) {
+ fprintf(stderr, "Failed to zero out the bitmap of %s\n", filename);
+ goto fail;
+ }
+ }
+
+ /* Initialize the journal. */
+ if (journal_size > 0) {
+ uint8_t *empty_journal = my_qemu_malloc(journal_size);
+ memset(empty_journal, 0xA5, journal_size); /* EMPTY_JRECORD */
+ ret = qemu_write_full(fd, empty_journal, journal_size);
+ my_qemu_free(empty_journal);
+ if (ret != journal_size) {
+ fprintf(stderr, "Failed to initialize the journal\n");
+ goto fail;
+ }
+ }
+
+ /* Initialize the table. */
+ if (table_size > 0) {
+ /* Set all entries to EMPTY_TABLE (0xFFFFFFFF). */
+ uint8_t *empty_table = my_qemu_malloc(table_size);
+ memset(empty_table, 0xFF, table_size);
+ ret = qemu_write_full(fd, empty_table, table_size);
+ my_qemu_free(empty_table);
+ if (ret != table_size) {
+ fprintf(stderr, "Failed to write the table of %s\n.", filename);
+ goto fail;
+ }
+ }
+
+ if (bs) {
+ bdrv_close(bs);
+ }
+ my_qemu_free(header);
+ return 0;
+
+fail:
+ if (bs) {
+ bdrv_close(bs);
+ }
+ close(fd);
+ my_qemu_free(header);
+ return -EIO;
}
+/* For the optimization called "free write to zero-filled blocks". See Section
+ * 3.3.3 of the FVD-cow paper. Briefly, it finds zero-filled blocks in the
+ * base image and sets the corresponding bits in the bitmap to one. */
+static inline int search_empty_blocks(int fd, uint8_t * bitmap,
+ BlockDriverState * bs,
+ int64_t nb_sectors,
+ int32_t hole_size,
+ int32_t block_size)
+{
+ printf("Searching empty blocks in the base image. Please wait...");
+ fflush(stdout);
+
+ if (hole_size < block_size) {
+ hole_size = block_size;
+ }
+ hole_size = ROUND_UP(hole_size, block_size);
+ nb_sectors = ROUND_DOWN(nb_sectors, hole_size);
+ const int sectors_per_hole = hole_size / 512;
+ const int sectors_per_block = block_size / 512;
+ int num_int64_in_hole = hole_size / 8;
+ int64_t hole_count = 0;
+ int i;
+ int64_t sec = 0;
+ uint8_t *p = my_qemu_blockalign(bs, hole_size);
+
+ while (sec < nb_sectors) {
+ int64_t *q;
+
+ if (bdrv_read(bs, sec, p, sectors_per_hole) < 0) {
+ fprintf(stderr, "Error in reading the base image\n");
+ my_qemu_vfree(p);
+ return -EIO;
+ }
+
+ /* All zeros? */
+ q = (int64_t *) p;
+ for (i = 0; i < num_int64_in_hole; i++) {
+ if (*q != 0) {
+ break;
+ }
+ q++;
+ }
+
+ if (i < num_int64_in_hole) {
+ /* This is not a hole. */
+ sec += sectors_per_hole;
+ } else {
+ /* These sectors consist of only zeros. Set the flag to
+ * indicate that there is no need to read this sector from the
+ * base image. See Section 3.3.3 of the FVD-cow paper for the
+ * rationale. */
+ hole_count++;
+ int64_t end = sec + sectors_per_hole;
+ while (sec < end) {
+ int block_num = sec / sectors_per_block;
+ int64_t bitmap_byte_offset = block_num / 8;
+ uint8_t bitmap_bit_offset = block_num % 8;
+ int8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
+ uint8_t b = bitmap[bitmap_byte_offset];
+ if (!(b & mask)) {
+ b |= mask;
+ bitmap[bitmap_byte_offset] |= mask;
+ }
+ sec += sectors_per_block;
+ }
+ }
+ }
+
+ printf("\nFound %"PRId64" zero-filled empty regions.\n", hole_count);
+ my_qemu_vfree(p);
+ return 0;
+}
+
/* Convert every multi-byte FvdHeader field from CPU byte order to the
 * little-endian on-disk representation, in place.  Must be kept in exact
 * correspondence with fvd_header_le_to_cpu() below and with the FvdHeader
 * field widths.
 *
 * NOTE(review): the casts assert each field's width (e.g. block_size and
 * chunk_size are converted as 64-bit here, while fvd_create() computes them
 * in 32-bit locals) — confirm against the FvdHeader definition, which is
 * outside this chunk. */
static void fvd_header_cpu_to_le(FvdHeader * header)
{
    cpu_to_le32s(&header->magic);
    cpu_to_le32s(&header->header_size);
    cpu_to_le32s(&header->create_version);
    cpu_to_le32s(&header->last_open_version);
    cpu_to_le32s((uint32_t *) & header->base_img_fully_prefetched);
    cpu_to_le64s((uint64_t *) & header->data_offset);
    cpu_to_le64s((uint64_t *) & header->virtual_disk_size);
    cpu_to_le64s((uint64_t *) & header->base_img_size);
    cpu_to_le64s((uint64_t *) & header->max_outstanding_copy_on_read_data);
    cpu_to_le64s((uint64_t *) & header->bitmap_offset);
    cpu_to_le64s((uint64_t *) & header->bitmap_size);
    cpu_to_le32s((uint32_t *) & header->copy_on_read);
    cpu_to_le32s((uint32_t *) & header->need_zero_init);
    cpu_to_le64s((uint64_t *) & header->prefetch_start_delay);
    cpu_to_le32s((uint32_t *) & header->num_prefetch_slots);
    cpu_to_le64s((uint64_t *) & header->bytes_per_prefetch);
    cpu_to_le64s((uint64_t *) & header->prefetch_throttle_time);
    cpu_to_le64s((uint64_t *) & header->prefetch_read_throughput_measure_time);
    cpu_to_le64s((uint64_t *) & header->prefetch_write_throughput_measure_time);
    cpu_to_le64s((uint64_t *) & header->prefetch_min_read_throughput);
    cpu_to_le64s((uint64_t *) & header->prefetch_min_write_throughput);
    cpu_to_le64s((uint64_t *) & header->prefetch_max_read_throughput);
    cpu_to_le64s((uint64_t *) & header->prefetch_max_write_throughput);
    cpu_to_le64s((uint64_t *) & header->block_size);
    cpu_to_le64s((uint64_t *) & header->chunk_size);
    cpu_to_le64s((uint64_t *) & header->storage_grow_unit);
    cpu_to_le64s((uint64_t *) & header->table_offset);
    cpu_to_le32s((uint32_t *) & header->clean_shutdown);
    cpu_to_le64s((uint64_t *) & header->journal_offset);
    cpu_to_le64s((uint64_t *) & header->journal_size);
    cpu_to_le64s((uint64_t *) & header->stable_journal_epoch);
    cpu_to_le64s((uint64_t *) & header->journal_buf_size);
    cpu_to_le64s((uint64_t *) & header->journal_clean_buf_period);
}
+
/* Convert every multi-byte FvdHeader field from the little-endian on-disk
 * representation to CPU byte order, in place.  The exact mirror of
 * fvd_header_cpu_to_le() above: the two field lists (and assumed widths)
 * must stay identical. */
static void fvd_header_le_to_cpu(FvdHeader * header)
{
    le32_to_cpus(&header->magic);
    le32_to_cpus(&header->header_size);
    le32_to_cpus(&header->create_version);
    le32_to_cpus(&header->last_open_version);
    le32_to_cpus((uint32_t *) & header->base_img_fully_prefetched);
    le64_to_cpus((uint64_t *) & header->data_offset);
    le64_to_cpus((uint64_t *) & header->virtual_disk_size);
    le64_to_cpus((uint64_t *) & header->base_img_size);
    le64_to_cpus((uint64_t *) & header->max_outstanding_copy_on_read_data);
    le64_to_cpus((uint64_t *) & header->bitmap_offset);
    le64_to_cpus((uint64_t *) & header->bitmap_size);
    le32_to_cpus((uint32_t *) & header->copy_on_read);
    le32_to_cpus((uint32_t *) & header->need_zero_init);
    le64_to_cpus((uint64_t *) & header->prefetch_start_delay);
    /* NOTE(review): num_prefetch_slots is converted as 32-bit in
     * cpu_to_le above but 64-bit here — likely a width mismatch that
     * corrupts neighboring fields on big-endian hosts; confirm against
     * FvdHeader. */
    le64_to_cpus((uint64_t *) & header->num_prefetch_slots);
    le64_to_cpus((uint64_t *) & header->bytes_per_prefetch);
    le64_to_cpus((uint64_t *) & header->prefetch_throttle_time);
    le64_to_cpus((uint64_t *) & header->prefetch_read_throughput_measure_time);
    le64_to_cpus((uint64_t *) & header->prefetch_write_throughput_measure_time);
    le64_to_cpus((uint64_t *) & header->prefetch_min_read_throughput);
    le64_to_cpus((uint64_t *) & header->prefetch_min_write_throughput);
    le64_to_cpus((uint64_t *) & header->prefetch_max_read_throughput);
    le64_to_cpus((uint64_t *) & header->prefetch_max_write_throughput);
    le64_to_cpus((uint64_t *) & header->block_size);
    le64_to_cpus((uint64_t *) & header->chunk_size);
    le64_to_cpus((uint64_t *) & header->storage_grow_unit);
    le64_to_cpus((uint64_t *) & header->table_offset);
    le32_to_cpus((uint32_t *) & header->clean_shutdown);
    le64_to_cpus((uint64_t *) & header->journal_offset);
    le64_to_cpus((uint64_t *) & header->journal_size);
    le64_to_cpus((uint64_t *) & header->stable_journal_epoch);
    le64_to_cpus((uint64_t *) & header->journal_buf_size);
    le64_to_cpus((uint64_t *) & header->journal_clean_buf_period);
}
+
+/* This function can handle incompatibility issues between different FVD
+ * versions, specifically, FvdHeader might have different sizes. */
+static int read_fvd_header(BDRVFvdState * s, FvdHeader * header)
+{
+ /* FvdHeader of different FVD versions might have different sizes. Read
+ * header->header_size first. */
+ if (bdrv_pread(s->fvd_metadata, 0, header, 512) != 512) {
+ fprintf(stderr, "Failed to read the FVD header.\n");
+ return -EIO;
+ }
+
+ /* Now read the part of FvdHeader that is commonly understandable to the
+ * FVD version that created the image and this FVD version. */
+ le32_to_cpus(&header->header_size);
+ int common_size = MIN(header->header_size, sizeof(FvdHeader));
+ if (bdrv_pread(s->fvd_metadata, 0, header, common_size) != common_size) {
+ fprintf(stderr, "Failed to read the FVD header.\n");
+ return -EIO;
+ }
+
+ fvd_header_le_to_cpu(header);
+
+ if (header->magic != FVD_MAGIC) {
+ fprintf(stderr, "Error: image does not have the correct FVD format "
+ "magic number in header\n");
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/* This function can handle incompatibility issues between different FVD
+ * versions, specifically, FvdHeader might have different sizes. */
+static int update_fvd_header(BDRVFvdState * s, FvdHeader * header)
+{
+ /* FvdHeader of different FVD versions might have different sizes. Only
+ * write the part of FvdHeader that is commonly understandable to the
+ * FVD version that created the image and this FVD version. */
+ int common_size = MIN(header->header_size, sizeof(FvdHeader));
+ fvd_header_cpu_to_le(header);
+ int ret = bdrv_pwrite(s->fvd_metadata, 0, header, common_size);
+
+ if (ret != common_size) {
+ fprintf(stderr, "Failed to update the FVD header.\n");
+ ASSERT(false);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static inline void update_clean_shutdown_flag(BDRVFvdState * s, int clean)
+{
+ FvdHeader header;
+ if (!read_fvd_header(s, &header)) {
+ header.last_open_version = FVD_VERSION;
+ header.clean_shutdown = clean;
+
+ if (!update_fvd_header(s, &header)) {
+ QDEBUG("Set clean_shutdown to %s\n", BOOL(clean));
+ if (bdrv_flush(s->fvd_metadata)) {
+ s->metadata_err_prohibit_write = true;
+ }
+ }
+ }
+}
+
+
static QEMUOptionParameter fvd_create_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"},
+ {
+ .name = "compact_image",
+ .type = OPT_FLAG,
+ .help = "compact_image=on|off"},
+ {
+ .name = "block_size",
+ .type = OPT_SIZE,
+ .help = "Block size"},
+ {
+ .name = "chunk_size",
+ .type = OPT_SIZE,
+ .help = "Chunk size"},
+ {
+ .name = "storage_grow_unit",
+ .type = OPT_SIZE,
+ .help = "Storage grow unit"},
+ {
+ .name = "add_storage_cmd",
+ .type = OPT_STRING,
+ .help = "Command to add storage when running out of space"},
+ {
+ .name = BLOCK_OPT_BACKING_FILE,
+ .type = OPT_STRING,
+ .help = "File name of a backing image"},
+ {
+ .name = BLOCK_OPT_BACKING_FMT,
+ .type = OPT_STRING,
+ .help = "Image format of the backing image"},
+ {
+ .name = "data_file",
+ .type = OPT_STRING,
+ .help = "File name of a data file"},
+ {
+ .name = "data_file_fmt",
+ .type = OPT_STRING,
+ .help = "Image format of the data file"},
+ {
+ .name = "copy_on_read",
+ .type = OPT_FLAG,
+ .help = "copy_on_read=on|off"},
+ {
+ .name = "prefetch_start_delay",
+ .type = OPT_NUMBER,
+ .help = "Delay in seconds before starting whole image prefetching. "},
+ {
+ .name = "journal_size",
+ .type = OPT_SIZE,
+ .help = "Journal size"},
+ {
+ .name = "max_outstanding_copy_on_read_data",
+ .type = OPT_SIZE,
+ .help = "copy_on_read is temporarily disabled when the unsaved data "
+ "exceed this threshold (in bytes)"},
+ {
+ .name = "journal_buf_size",
+ .type = OPT_SIZE,
+ .help = "size of in-memory journal buffer (in bytes)"},
+ {
+ .name = "journal_clean_buf_period",
+ .type = OPT_NUMBER,
+ .help = "(milliseconds)"},
+ {
+ .name = "num_prefetch_slots",
+ .type = OPT_NUMBER,
+ .help = "number of concurrent prefetches allowed"},
+ {
+ .name = "bytes_per_prefetch",
+ .type = OPT_NUMBER,
+ .help = "data to read per prefetch"},
+ {
+ .name = "bytes_per_prefetch",
+ .type = OPT_NUMBER,
+ .help = "data to read per prefetch"},
+ {
+ .name = "prefetch_over_threshold_throttle_time",
+ .type = OPT_NUMBER,
+ .help = "in milliseconds"},
+ {
+ .name = "prefetch_read_throughput_measure_time",
+ .type = OPT_NUMBER,
+ .help = "in milliseconds"},
+ {
+ .name = "prefetch_write_throughput_measure_time",
+ .type = OPT_NUMBER,
+ .help = "in milliseconds"},
+ {
+ .name = "prefetch_min_read_throughput",
+ .type = OPT_NUMBER,
+ .help = "in KB/s"},
+ {
+ .name = "prefetch_max_read_throughput",
+ .type = OPT_NUMBER,
+ .help = "in KB/s"},
+ {
+ .name = "prefetch_min_write_throughput",
+ .type = OPT_NUMBER,
+ .help = "in KB/s"},
+ {
+ .name = "prefetch_max_write_throughput",
+ .type = OPT_NUMBER,
+ .help = "in KB/s"},
+ {
+ .name = "optimize_empty_block",
+ .type = OPT_SIZE,
+ .help = "Minimum size (in bytes) of a zero-filled region whose state will "
+ "be preset in the bitmap. Settingto 0 turns off this optimization"},
{NULL}
};
diff --git a/block/fvd-journal.c b/block/fvd-journal.c
index 5824e35..246f425 100644
--- a/block/fvd-journal.c
+++ b/block/fvd-journal.c
@@ -17,6 +17,11 @@ static bool emulate_host_crash = true;
static bool emulate_host_crash = false;
#endif
/* Minimum journal size (in bytes) needed to record 'table_entries' table
 * updates in one shot.  NOTE(review): this is a placeholder in this patch —
 * table_entries is ignored and a single 512-byte sector is returned;
 * presumably a later patch in the series supplies the real formula. */
static inline int64_t calc_min_journal_size(int64_t table_entries)
{
    return 512;
}
+
void fvd_emulate_host_crash(bool cond)
{
emulate_host_crash = cond;
diff --git a/block/fvd.c b/block/fvd.c
index 13fe940..d6263e7 100644
--- a/block/fvd.c
+++ b/block/fvd.c
@@ -27,13 +27,13 @@
* function optimization. */
#include "block/fvd-debug.c"
#include "block/fvd-flush.c"
-#include "block/fvd-update.c"
#include "block/fvd-misc.c"
#include "block/fvd-create.c"
#include "block/fvd-open.c"
#include "block/fvd-read.c"
#include "block/fvd-write.c"
#include "block/fvd-journal.c"
+#include "block/fvd-update.c"
static BlockDriver bdrv_fvd = {
.format_name = "fvd",
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 10/26] FVD: add impl of interface bdrv_file_open()
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (7 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 09/26] FVD: add impl of interface bdrv_create() Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 11/26] FVD: add impl of interface bdrv_aio_writev() Chunqiang Tang
` (15 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds FVD's implementation of the bdrv_file_open() interface.
It supports opening an FVD image.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-journal.c | 6 +
block/fvd-open.c | 445 +++++++++++++++++++++++++++++++++++++++++++++++++-
block/fvd-prefetch.c | 17 ++
block/fvd.c | 1 +
4 files changed, 468 insertions(+), 1 deletions(-)
create mode 100644 block/fvd-prefetch.c
diff --git a/block/fvd-journal.c b/block/fvd-journal.c
index 246f425..5ba34bd 100644
--- a/block/fvd-journal.c
+++ b/block/fvd-journal.c
@@ -22,6 +22,12 @@ static inline int64_t calc_min_journal_size(int64_t table_entries)
return 512;
}
/* Initialize (and, on a non-clean shutdown, recover) the metadata journal
 * when an image is opened.  NOTE(review): stub in this patch — always
 * returns -ENOTSUP; the real implementation arrives later in the series. */
static int init_journal(int read_only, BlockDriverState * bs,
                        FvdHeader * header)
{
    return -ENOTSUP;
}
+
void fvd_emulate_host_crash(bool cond)
{
emulate_host_crash = cond;
diff --git a/block/fvd-open.c b/block/fvd-open.c
index 056b994..8caf8d3 100644
--- a/block/fvd-open.c
+++ b/block/fvd-open.c
@@ -11,7 +11,450 @@
*
*/
+static void init_prefetch_timer(BlockDriverState * bs, BDRVFvdState * s);
+static int init_data_file(BDRVFvdState * s, FvdHeader * header, int flags);
+static int init_bitmap(BlockDriverState * bs, BDRVFvdState * s,
+ FvdHeader * header, const char *const filename);
+static int load_table(BDRVFvdState * s, FvdHeader * header,
+ const char *const filename);
+static int init_journal(int read_only, BlockDriverState * bs,
+ FvdHeader * header);
+static int init_compact_image(BDRVFvdState * s, FvdHeader * header,
+ const char *const filename);
+
static int fvd_open(BlockDriverState * bs, const char *filename, int flags)
{
- return -ENOTSUP;
+ BDRVFvdState *s = bs->opaque;
+ int ret;
+ FvdHeader header;
+ BlockDriver *drv;
+ int i;
+
+ const char *protocol = strchr(filename, ':');
+ if (protocol) {
+ drv = bdrv_find_protocol(filename);
+ filename = protocol + 1;
+ } else {
+ /* Use "raw" instead of "file" to allow storing the image on device. */
+ drv = bdrv_find_format("raw");
+ if (!drv) {
+ fprintf(stderr, "Failed to find the block device driver\n");
+ return -EINVAL;
+ }
+ }
+
+ s->fvd_metadata = bdrv_new("");
+ ret = bdrv_open(s->fvd_metadata, filename, flags, drv);
+ if (ret < 0) {
+ fprintf(stderr, "Failed to open %s\n", filename);
+ return ret;
+ }
+
+ /* Initialize so that jumping to 'fail' would do cleanup properly. */
+ s->stale_bitmap = NULL;
+ s->fresh_bitmap = NULL;
+ s->table = NULL;
+ s->outstanding_copy_on_read_data = 0;
+ QLIST_INIT(&s->write_locks);
+ QLIST_INIT(&s->copy_locks);
+ s->prefetch_acb = NULL;
+ s->add_storage_cmd = NULL;
+#ifdef FVD_DEBUG
+ s->total_copy_on_read_data = s->total_prefetch_data = 0;
+#endif
+
+ if (bdrv_pread(s->fvd_metadata, 0, &header, sizeof(header)) !=
+ sizeof(header)) {
+ fprintf(stderr, "Failed to read the header of %s\n", filename);
+ ret = -EIO;
+ goto fail;
+ }
+
+ fvd_header_le_to_cpu(&header);
+
+ if (header.magic != FVD_MAGIC) {
+ fprintf(stderr, "Incorrect magic number in header: %0X\n",
+ header.magic);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* Check incompatible features. */
+ for (i = 0; i < INCOMPATIBLE_FEATURES_SPACE; i++) {
+ if (header.incompatible_features[i] != 0) {
+ fprintf(stderr, "The image was created by FVD version %d "
+ " and uses features not supported by this FVD version %d\n",
+ header.create_version, FVD_VERSION);
+ ret = -ENOTSUP;
+ }
+ }
+
+ if (header.virtual_disk_size % 512 != 0) {
+ fprintf(stderr, "Disk size %" PRId64 " in the header of %s is not "
+ "a multple of 512.\n", header.virtual_disk_size, filename);
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* Initialize the fields of BDRVFvdState. */
+ s->chunks_relocated = header.chunks_relocated;
+ s->dirty_image = false;
+ s->metadata_err_prohibit_write = false;
+ s->block_size = header.block_size / 512;
+ s->bitmap_size = header.bitmap_size;
+ s->prefetch_timer = NULL;
+ s->sectors_per_prefetch = (header.bytes_per_prefetch + 511) / 512;
+ s->prefetch_throttle_time = header.prefetch_throttle_time;
+ s->prefetch_read_throughput_measure_time =
+ header.prefetch_read_throughput_measure_time;
+ s->prefetch_write_throughput_measure_time =
+ header.prefetch_write_throughput_measure_time;
+
+ /* Convert KB/s to bytes/millisec. */
+ s->prefetch_min_read_throughput =
+ ((double)header.prefetch_min_read_throughput) * 1024.0 / 1000.0;
+ s->prefetch_min_write_throughput =
+ ((double)header.prefetch_min_write_throughput) * 1024.0 / 1000.0;
+
+ if (header.base_img[0] != 0 && s->sectors_per_prefetch % s->block_size!=0) {
+ fprintf(stderr, "sectors_per_prefetch (%" PRIu64 ") is not a multiple "
+ "of block_size (%" PRIu64 ")\n",
+ s->sectors_per_prefetch * 512, s->block_size * 512);
+ }
+ s->max_outstanding_copy_on_read_data =
+ header.max_outstanding_copy_on_read_data;
+ if (s->max_outstanding_copy_on_read_data < header.block_size * 2) {
+ s->max_outstanding_copy_on_read_data = header.block_size;
+ }
+
+ if (header.num_prefetch_slots < 1) {
+ s->num_prefetch_slots = 1;
+ } else {
+ s->num_prefetch_slots = header.num_prefetch_slots;
+ }
+
+ const int read_only = !(flags & BDRV_O_RDWR);
+
+ if (read_only || IN_QEMU_TOOL) {
+ /* Disable prefetching and copy_on_read. */
+ s->prefetch_start_delay = -1;
+ s->copy_on_read = false;
+ } else {
+ s->prefetch_start_delay = header.prefetch_start_delay;
+ s->copy_on_read = header.copy_on_read;
+ }
+ s->virtual_disk_size = header.virtual_disk_size;
+ s->bitmap_offset = header.bitmap_offset / 512;
+ s->base_img_sectors = header.base_img_size / 512;
+ bs->total_sectors = s->virtual_disk_size / 512;
+
+ if ((ret = init_data_file(s, &header, flags))) {
+ goto fail;
+ }
+
+ if ((ret = init_bitmap(bs, s, &header, filename))) {
+ goto fail;
+ }
+
+ if ((ret = load_table(s, &header, filename))) {
+ goto fail;
+ }
+
+ if ((ret = init_journal(read_only, bs, &header))) {
+ goto fail;
+ }
+
+ /* This must be done after init_journal() because it may use metadata
+ * recovered from the journal. */
+ if ((ret = init_compact_image(s, &header, filename))) {
+ goto fail;
+ }
+
+ if (!read_only) {
+ /* This flag will be cleaned when the image is shut down gracefully. */
+ update_clean_shutdown_flag(s, false);
+ init_prefetch_timer(bs, s);
+ }
+
+ QDEBUG("copy_on_read=%s compact_image=%s block_size=%" PRIu64
+ " chunk_size=%"PRId64
+ " journal_size=%" PRId64 " prefetching_delay=%" PRId64
+ " prefetch_slots=%d "
+ "prefetch_read_threshold_KB=%.0lf "
+ "prefetch_write_threshold_KB=%.0lf "
+ "prefetch_throttle_time=%" PRIu64 " bytes_per_prefetch=%" PRIu64
+ " max_outstanding_copy_on_read_data=%" PRId64 "\n",
+ BOOL(s->copy_on_read), BOOL(s->table_offset > 0),
+ s->block_size * 512, s->chunk_size * 512,
+ s->journal_size * 512, s->prefetch_start_delay,
+ s->num_prefetch_slots,
+ s->prefetch_min_read_throughput * 1000.0 / 1024.0,
+ s->prefetch_min_write_throughput * 1000.0 / 1024.0,
+ s->prefetch_throttle_time, s->sectors_per_prefetch * 512,
+ s->max_outstanding_copy_on_read_data);
+
+ return 0;
+
+fail:
+ fprintf(stderr, "Failed to open %s using the FVD format.\n", filename);
+ fvd_close(bs);
+ return ret;
+}
+
+static int load_table(BDRVFvdState * s, FvdHeader * header,
+ const char *const filename)
+{
+ if (header->table_offset <= 0) {
+ return 0; /* Not a compact image and no table. */
+ }
+
+ /* Initialize the table. */
+ s->table_offset = header->table_offset / 512;
+ s->table_size = header->table_size;
+ s->chunk_size = header->chunk_size / 512;
+ s->table = my_qemu_blockalign(s->fvd_metadata, s->table_size);
+
+ if (bdrv_pread(s->fvd_metadata, header->table_offset, s->table,
+ (int)s->table_size) != (int)s->table_size) {
+ fprintf(stderr, "Failed to read the table of %s\n", filename);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int init_compact_image(BDRVFvdState * s, FvdHeader * header,
+ const char *const filename)
+{
+ s->leaked_chunks = NULL;
+ s->num_leaked_chunks = 0;
+ s->next_avail_leaked_chunk = 0;
+
+ if (header->table_offset <= 0) {
+ /* Not a compact image. */
+ s->data_region_prepared = false;
+ return 0;
+ }
+
+ /* Scan the table to find the max used chunk and leaked chunks. */
+ uint32_t i;
+ uint32_t max_chunk = 0;
+ uint32_t table_entries = ROUND_UP(header->virtual_disk_size,
+ header->chunk_size) / header->chunk_size;
+ uint8_t *used_chunks = my_qemu_mallocz(table_entries);
+ for (i = 0; i < table_entries; i++) {
+ if (!IS_EMPTY(s->table[i])) {
+ uint32_t id = READ_TABLE(s->table[i]);
+ if (id >= max_chunk) {
+ max_chunk = id + 1;
+ }
+ if (used_chunks[id]) {
+ fprintf(stderr, "ERROR: corrupted image with multiple "
+ "virtual chunks mapped to physical chunk %u\n", id);
+ my_qemu_free(used_chunks);
+ return -EIO;
+ }
+ used_chunks[id] = true;
+ }
+ }
+
+ /* Count the number of leaked chunks. */
+ uint32_t num_leaked_chunks = 0;
+ for (i = 0; i < max_chunk; i++) {
+ if (!used_chunks[i]) {
+ num_leaked_chunks++;
+ }
+ }
+ QDEBUG("leaked_chunks=%u max_chunk=%u\n", num_leaked_chunks, max_chunk);
+
+ /* Record leaked chunks, which will be used later. */
+ if (num_leaked_chunks > 0) {
+ s->num_leaked_chunks = num_leaked_chunks;
+ s->leaked_chunks = my_qemu_malloc(sizeof(uint32_t) * num_leaked_chunks);
+ num_leaked_chunks = 0;
+ for (i = 0; i < max_chunk; i++) {
+ if (!used_chunks[i]) {
+ s->leaked_chunks[num_leaked_chunks++] = i;
+ QDEBUG("Recover leaked physical chunk %u\n", i);
+ }
+ }
+ }
+ s->used_storage = max_chunk * s->chunk_size;
+ s->storage_grow_unit = header->storage_grow_unit / 512;
+ my_qemu_free(used_chunks);
+
+ /* Check if the image is directly stored on a raw device, including
+ * logical volume. If so, figure out the size of the device. */
+ struct stat stat_buf;
+ if (stat(filename, &stat_buf) != 0) {
+ fprintf(stderr, "Failed to stat() %s\n", filename);
+ return -EIO;
+ }
+
+ /* Check how much storage space is already allocated. */
+ int64_t size = bdrv_getlength(s->fvd_data);
+ if (size < 0) {
+ fprintf(stderr, "Failed in bdrv_getlength(%s)\n", filename);
+ return -EIO;
+ }
+
+ if (S_ISBLK(stat_buf.st_mode) || S_ISCHR(stat_buf.st_mode)) {
+ const int64_t min_size = (s->data_offset + s->used_storage) * 512;
+ if (size < min_size) {
+ fprintf(stderr, "The size of device %s is not even big enough to "
+ "store already allocated data.\n", filename);
+ return -EIO;
+ }
+
+ /* Initialize the command to grow storage space. */
+ char cmd[2048];
+ if (header->add_storage_cmd[0] == 0) {
+ s->add_storage_cmd = NULL;
+ } else {
+ if (strcmp(header->add_storage_cmd, "builtin:lvextend") == 0) {
+ /* Note the following:
+ * 1. lvextend may generate warning messages like "File
+ * descriptor...leaked...", * which is fine. See the
+ * following from LVM manual: "On invocation, lvm requires
+ * that only the standard file descriptors stdin,
+ * stdout * and stderr are available. If others are
+ * found, they get closed and messages are issued warning
+ * about the leak."
+ * 2. Instead of using the lvextend command line, one
+ * option is to use liblvm directly, which avoids creating
+ * a process to resize a LV.
+ * 3. On Ubuntu, /bin/sh is linked to /bin/dash, which
+ * does not support ">&" for stdout and stderr
+ * redirection. */
+ snprintf(cmd, sizeof(cmd) - 1, "/sbin/lvextend -L+%" PRIu64
+ "B %s >/dev/null 2>/dev/null",
+ header->storage_grow_unit,
+ header->data_file[0] ? header->data_file : filename);
+ } else {
+ snprintf(cmd, sizeof(cmd) - 1, "%s %" PRIu64
+ " %s >/dev/null 2>/dev/null",
+ header->add_storage_cmd, header->storage_grow_unit,
+ header->data_file[0] ? header->data_file : filename);
+ }
+ int len = strlen(cmd);
+ s->add_storage_cmd = my_qemu_malloc(len + 1);
+ memcpy(s->add_storage_cmd, cmd, len + 1);
+ }
+ }
+
+ s->avail_storage = size / 512 - s->data_offset;
+ s->fvd_data->growable = true;
+ s->data_region_prepared = true;
+
+ return 0;
+}
+
+static int init_data_file(BDRVFvdState * s, FvdHeader * header, int flags)
+{
+ int ret;
+
+ if (header->data_file[0]) {
+ /* Open a separate data file. */
+ s->data_offset = 0;
+ s->fvd_data = bdrv_new("");
+ if (!s->fvd_data) {
+ fprintf(stderr, "Failed to create a new block device driver.\n");
+ return -EIO;
+ }
+
+ if (header->data_file_fmt[0] == 0) {
+ ret = bdrv_open(s->fvd_data, header->data_file, flags, NULL);
+ } else {
+ BlockDriver *data_drv = bdrv_find_format(header->data_file_fmt);
+ if (!data_drv) {
+ fprintf(stderr, "Failed to find driver for image format "
+ "'%s' of data file %s\n",
+ header->data_file_fmt, header->data_file);
+ return -EINVAL;
+ }
+ ret = bdrv_open(s->fvd_data, header->data_file, flags, data_drv);
+ }
+ if (ret != 0) {
+ fprintf(stderr, "Failed to open data file %s\n", header->data_file);
+ return -EIO;
+ }
+ } else {
+ s->data_offset = header->data_offset / 512; /* In sectors. */
+ s->fvd_data = s->fvd_metadata;
+ }
+
+ if (header->need_zero_init && !bdrv_has_zero_init(s->fvd_data)) {
+ if (IN_QEMU_TOOL) {
+ /* Only give a warning to allow 'qemu-img update' to modify
+ * need_zero_init if the user manually zero-init the device. */
+ fprintf(stderr, "Warning: image needs zero_init but it is not "
+ "supported by the storage media.\n");
+ } else {
+ fprintf(stderr, "Error: image needs zero_init but it is not "
+ "supported by the storage media.\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int init_bitmap(BlockDriverState * bs, BDRVFvdState * s,
+ FvdHeader * header, const char *const filename)
+{
+ if (header->base_img_fully_prefetched) {
+ /* This also covers the case of no base image. */
+ s->prefetch_state = PREFETCH_STATE_FINISHED;
+ s->copy_on_read = false;
+ s->prefetch_start_delay = -1;
+
+ if (bs->backing_file[0] != 0) {
+ /* No need to use the base image. It may operate without problem
+ * even if the base image is no longer accessible. */
+ bs->backing_file[0] = 0;
+ }
+ } else {
+ ASSERT(header->base_img[0] != 0);
+ pstrcpy(bs->backing_file, 1024, header->base_img);
+ pstrcpy(bs->backing_format, 16, header->base_img_fmt);
+
+ /* This will be enabled in init_prefetch() after a timer expires. */
+ s->prefetch_state = PREFETCH_STATE_DISABLED;
+ s->stale_bitmap = my_qemu_blockalign(s->fvd_metadata, s->bitmap_size);
+ if (bdrv_pread(s->fvd_metadata, header->bitmap_offset,
+ s->stale_bitmap, s->bitmap_size) != s->bitmap_size) {
+ fprintf(stderr, "Failed to read the bitmap of %s.\n", filename);
+ return -EIO;
+ }
+
+ if (s->copy_on_read || (s->prefetch_state != PREFETCH_STATE_FINISHED &&
+ s->prefetch_start_delay > 0)) {
+ /* Use two bitmaps only if copy_on_read or prefetching is enabled.
+ * See Section 3.3.4 of the FVD-cow paper. */
+ s->fresh_bitmap = my_qemu_blockalign(s->fvd_metadata,
+ s->bitmap_size);
+ memcpy(s->fresh_bitmap, s->stale_bitmap, s->bitmap_size);
+ } else {
+ s->fresh_bitmap = s->stale_bitmap;
+ }
+ }
+
+ return 0;
+}
+
+static void init_prefetch_timer(BlockDriverState * bs, BDRVFvdState * s)
+{
+ if (IN_QEMU_TOOL) {
+ return;
+ }
+
+ if (s->prefetch_state == PREFETCH_STATE_FINISHED ||
+ s->prefetch_start_delay <= 0) {
+ return;
+ }
+
+ /* Start prefetching after a delay. Times 1000 to convert sec to ms. */
+ int64_t expire = qemu_get_clock(rt_clock) + s->prefetch_start_delay * 1000;
+ s->prefetch_timer = qemu_new_timer(rt_clock, fvd_init_prefetch, bs);
+ qemu_mod_timer(s->prefetch_timer, expire);
}
diff --git a/block/fvd-prefetch.c b/block/fvd-prefetch.c
new file mode 100644
index 0000000..5844aa7
--- /dev/null
+++ b/block/fvd-prefetch.c
@@ -0,0 +1,17 @@
+/*
+ * QEMU Fast Virtual Disk Format Adaptive Prefetching
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+void fvd_init_prefetch(void *opaque)
+{
+ /* To be implemented. */
+}
diff --git a/block/fvd.c b/block/fvd.c
index d6263e7..e41f419 100644
--- a/block/fvd.c
+++ b/block/fvd.c
@@ -33,6 +33,7 @@
#include "block/fvd-read.c"
#include "block/fvd-write.c"
#include "block/fvd-journal.c"
+#include "block/fvd-prefetch.c"
#include "block/fvd-update.c"
static BlockDriver bdrv_fvd = {
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 11/26] FVD: add impl of interface bdrv_aio_writev()
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (8 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 10/26] FVD: add impl of interface bdrv_file_open() Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 12/26] FVD: add impl of interface bdrv_aio_readv() Chunqiang Tang
` (14 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds FVD's implementation of the bdrv_aio_writev() interface. It
supports copy-on-write in FVD.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-bitmap.c | 150 ++++++++++++++++
block/fvd-journal.c | 4 +
block/fvd-store.c | 20 +++
block/fvd-write.c | 468 ++++++++++++++++++++++++++++++++++++++++++++++++++-
block/fvd.c | 4 +-
block/fvd.h | 1 +
6 files changed, 645 insertions(+), 2 deletions(-)
create mode 100644 block/fvd-bitmap.c
create mode 100644 block/fvd-store.c
diff --git a/block/fvd-bitmap.c b/block/fvd-bitmap.c
new file mode 100644
index 0000000..7e96201
--- /dev/null
+++ b/block/fvd-bitmap.c
@@ -0,0 +1,150 @@
+/*
+ * QEMU Fast Virtual Disk Format Utility Functions for Bitmap
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static inline bool stale_bitmap_show_sector_in_base_img(int64_t sector_num,
+ const BDRVFvdState * s)
+{
+ if (sector_num >= s->base_img_sectors) {
+ return false;
+ }
+
+ int64_t block_num = sector_num / s->block_size;
+ int64_t bitmap_byte_offset = block_num / 8;
+ uint8_t bitmap_bit_offset = block_num % 8;
+ uint8_t b = s->stale_bitmap[bitmap_byte_offset];
+ return 0 == (int)((b >> bitmap_bit_offset) & 0x01);
+}
+
+static inline bool fresh_bitmap_show_sector_in_base_img(int64_t sector_num,
+ const BDRVFvdState * s)
+{
+ if (sector_num >= s->base_img_sectors) {
+ return false;
+ }
+
+ int64_t block_num = sector_num / s->block_size;
+ int64_t bitmap_byte_offset = block_num / 8;
+ uint8_t bitmap_bit_offset = block_num % 8;
+ uint8_t b = s->fresh_bitmap[bitmap_byte_offset];
+ return 0 == (int)((b >> bitmap_bit_offset) & 0x01);
+}
+
+static inline void update_fresh_bitmap(int64_t sector_num, int nb_sectors,
+ const BDRVFvdState * s)
+{
+ if (sector_num >= s->base_img_sectors) {
+ return;
+ }
+
+ int64_t end = sector_num + nb_sectors;
+ if (end > s->base_img_sectors) {
+ end = s->base_img_sectors;
+ }
+
+ int64_t block_num = sector_num / s->block_size;
+ int64_t block_end = (end - 1) / s->block_size;
+
+ for (; block_num <= block_end; block_num++) {
+ int64_t bitmap_byte_offset = block_num / 8;
+ uint8_t bitmap_bit_offset = block_num % 8;
+ uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
+ uint8_t b = s->fresh_bitmap[bitmap_byte_offset];
+ if (!(b & mask)) {
+ b |= mask;
+ s->fresh_bitmap[bitmap_byte_offset] = b;
+ }
+ }
+}
+
+static inline bool bitmap_show_sector_in_base_img(int64_t sector_num,
+ const BDRVFvdState * s,
+ int bitmap_offset,
+ uint8_t * bitmap)
+{
+ if (sector_num >= s->base_img_sectors) {
+ return false;
+ }
+
+ int64_t block_num = sector_num / s->block_size;
+ int64_t bitmap_byte_offset = block_num / 8 - bitmap_offset;
+ uint8_t bitmap_bit_offset = block_num % 8;
+ uint8_t b = bitmap[bitmap_byte_offset];
+ return 0 == (int)((b >> bitmap_bit_offset) & 0x01);
+}
+
+static inline bool stale_bitmap_need_update(FvdAIOCB * acb)
+{
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+ int64_t end = acb->sector_num + acb->nb_sectors;
+
+ if (end > s->base_img_sectors) {
+ end = s->base_img_sectors;
+ }
+ int64_t block_end = (end - 1) / s->block_size;
+ int64_t block_num = acb->sector_num / s->block_size;
+
+ for (; block_num <= block_end; block_num++) {
+ int64_t bitmap_byte_offset = block_num / 8;
+ uint8_t bitmap_bit_offset = block_num % 8;
+ uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
+ uint8_t b = s->stale_bitmap[bitmap_byte_offset];
+ if (!(b & mask)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/* Return true if stable_bitmap needs update. */
+static bool update_fresh_bitmap_and_check_stale_bitmap(FvdAIOCB * acb)
+{
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ if (acb->sector_num >= s->base_img_sectors) {
+ return false;
+ }
+
+ bool need_update = false;
+ int64_t end = acb->sector_num + acb->nb_sectors;
+
+ if (end > s->base_img_sectors) {
+ end = s->base_img_sectors;
+ }
+
+ int64_t block_end = (end - 1) / s->block_size;
+ int64_t block_num = acb->sector_num / s->block_size;
+
+ for (; block_num <= block_end; block_num++) {
+ int64_t bitmap_byte_offset = block_num / 8;
+ uint8_t bitmap_bit_offset = block_num % 8;
+ uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
+ uint8_t b = s->stale_bitmap[bitmap_byte_offset];
+ if (b & mask) {
+ /* If the bit in stale_bitmap is set, the corresponding bit in
+ * fresh_bitmap must be set already. */
+ continue;
+ }
+
+ need_update = true;
+ b = s->fresh_bitmap[bitmap_byte_offset];
+ if (!(b & mask)) {
+ b |= mask;
+ s->fresh_bitmap[bitmap_byte_offset] = b;
+ }
+ }
+
+ return need_update;
+}
diff --git a/block/fvd-journal.c b/block/fvd-journal.c
index 5ba34bd..2edfc70 100644
--- a/block/fvd-journal.c
+++ b/block/fvd-journal.c
@@ -28,6 +28,10 @@ static int init_journal(int read_only, BlockDriverState * bs,
return -ENOTSUP;
}
+static void write_metadata_to_journal(struct FvdAIOCB *acb, bool update_bitmap)
+{
+}
+
void fvd_emulate_host_crash(bool cond)
{
emulate_host_crash = cond;
diff --git a/block/fvd-store.c b/block/fvd-store.c
new file mode 100644
index 0000000..85e45d4
--- /dev/null
+++ b/block/fvd-store.c
@@ -0,0 +1,20 @@
+/*
+ * QEMU Fast Virtual Disk Format Store Data in Compact Image
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+static inline BlockDriverAIOCB *store_data(int soft_write,
+ FvdAIOCB * parent_acb, BlockDriverState * bs,
+ int64_t sector_num, QEMUIOVector * orig_qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb, void *opaque)
+{
+ return NULL;
+}
diff --git a/block/fvd-write.c b/block/fvd-write.c
index a736a37..f0580d4 100644
--- a/block/fvd-write.c
+++ b/block/fvd-write.c
@@ -11,11 +11,477 @@
*
*/
+static void write_metadata_to_journal(struct FvdAIOCB *acb, bool update_bitmap);
+static int do_aio_write(struct FvdAIOCB *acb);
+static void restart_dependent_writes(struct FvdAIOCB *acb);
+static void free_write_resource(struct FvdAIOCB *acb);
+static inline BlockDriverAIOCB *store_data(int soft_write,
+ FvdAIOCB * parent_acb, BlockDriverState * bs,
+ int64_t sector_num, QEMUIOVector * orig_qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb, void *opaque);
+
+static inline void init_data_region(BDRVFvdState * s)
+{
+ bdrv_truncate(s->fvd_data, s->data_offset * 512 + s->virtual_disk_size);
+ s->data_region_prepared = true;
+}
+
static BlockDriverAIOCB *fvd_aio_writev(BlockDriverState * bs,
int64_t sector_num,
QEMUIOVector * qiov, int nb_sectors,
BlockDriverCompletionFunc * cb,
void *opaque)
{
- return NULL;
+ BDRVFvdState *s = bs->opaque;
+ FvdAIOCB *acb;
+
+ TRACE_REQUEST(true, sector_num, nb_sectors);
+
+ if (s->metadata_err_prohibit_write) {
+ return NULL;
+ }
+
+ if (!s->data_region_prepared) {
+ init_data_region(s);
+ }
+
+ if (s->prefetch_state == PREFETCH_STATE_FINISHED
+ || sector_num >= s->base_img_sectors) {
+ /* This is an efficient case. See Section 3.3.5 of the FVD-cow paper.
+ * This also covers the case of no base image. */
+ return store_data(false, NULL, bs, sector_num, qiov,
+ nb_sectors, cb, opaque);
+ }
+
+ /* Check if all requested sectors are in the FVD data file. */
+ int64_t sec = ROUND_DOWN(sector_num, s->block_size);
+ int64_t sec_in_last_block = ROUND_DOWN(sector_num + nb_sectors - 1,
+ s->block_size);
+ do {
+ if (stale_bitmap_show_sector_in_base_img(sec, s)) {
+ goto slow_path;
+ }
+ sec += s->block_size;
+ } while (sec <= sec_in_last_block);
+
+ /* This is the fast path, as all requested data are in the FVD data file
+ * and no need to update the bitmap. */
+ return store_data(false, NULL, bs, sector_num, qiov,
+ nb_sectors, cb, opaque);
+
+slow_path:
+ acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
+ if (!acb) {
+ return NULL;
+ }
+
+ acb->type = OP_WRITE;
+ acb->cancel_in_progress = false;
+ acb->sector_num = sector_num;
+ acb->nb_sectors = nb_sectors;
+ acb->write.ret = 0;
+ acb->write.update_table = false;
+ acb->write.qiov = qiov;
+ acb->write.hd_acb = NULL;
+ acb->write.cow_buf = NULL;
+ acb->copy_lock.next.le_prev = NULL;
+ acb->write.next_write_lock.le_prev = NULL;
+ acb->write.next_dependent_write.le_prev = NULL;
+ acb->jcb.iov.iov_base = NULL;
+ acb->jcb.hd_acb = NULL;
+ acb->jcb.ujnl_next_wait4_recycle.le_prev = NULL;
+ QLIST_INIT(&acb->copy_lock.dependent_writes);
+
+ QDEBUG("WRITE: acb%llu-%p start sector_num=%" PRId64 " nb_sectors=%d\n",
+ acb->uuid, acb, acb->sector_num, acb->nb_sectors);
+
+ if (do_aio_write(acb) < 0) {
+ my_qemu_aio_release(acb);
+ return NULL;
+ }
+#ifdef FVD_DEBUG
+ pending_local_writes++;
+#endif
+ return &acb->common;
+}
+
+static void free_write_resource(FvdAIOCB * acb)
+{
+ if (acb->write.next_write_lock.le_prev) {
+ QLIST_REMOVE(acb, write.next_write_lock);
+ }
+ if (acb->copy_lock.next.le_prev) {
+ QLIST_REMOVE(acb, copy_lock.next);
+ restart_dependent_writes(acb);
+ }
+ if (acb->write.cow_buf) {
+ my_qemu_vfree(acb->write.cow_buf);
+ }
+ if (acb->jcb.iov.iov_base != NULL) {
+ my_qemu_vfree(acb->jcb.iov.iov_base);
+ }
+
+ my_qemu_aio_release(acb);
+
+#ifdef FVD_DEBUG
+ pending_local_writes--;
+#endif
+}
+
+static inline void finish_write(FvdAIOCB * acb, int ret)
+{
+ QDEBUG("WRITE: acb%llu-%p completely_finished ret=%d\n", acb->uuid, acb,
+ ret);
+ acb->common.cb(acb->common.opaque, ret);
+ free_write_resource(acb);
+}
+
+static void write_data_cb(void *opaque, int ret)
+{
+ FvdAIOCB *acb = opaque;
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ acb->write.ret = ret;
+ acb->write.hd_acb = NULL;
+
+ if (ret != 0) {
+ QDEBUG("WRITE: acb%llu-%p write_data_cb error ret=%d\n",
+ acb->uuid, acb, ret);
+ finish_write(acb, ret);
+ return;
+ }
+
+ QDEBUG("WRITE: acb%llu-%p write_data_cb\n", acb->uuid, acb);
+
+ /* Figure out whether to update metadata or not. */
+ if (s->fresh_bitmap == s->stale_bitmap) {
+ /* Neither copy_on_read nor prefetching is enabled. Cannot update
+ * fresh_bitmap until the on-disk metadata is updated. */
+ if (stale_bitmap_need_update(acb)) {
+ write_metadata_to_journal(acb, true);
+ } else if (acb->write.update_table) {
+ write_metadata_to_journal(acb, false);
+ } else {
+ finish_write(acb, ret); /* No need to update metadata. */
+ }
+
+ return;
+ }
+
+ /* stale_bitmap and fresh_bitmap are different. Update fresh_bitmap now
+ * and stale_bitmap will be updated after on-disk metadata are updated. */
+ bool bitmap_need_update = update_fresh_bitmap_and_check_stale_bitmap(acb);
+
+ /* Release lock on data now since fresh_bitmap has been updated. */
+ QLIST_REMOVE(acb, write.next_write_lock);
+ acb->write.next_write_lock.le_prev = NULL;
+ if (acb->copy_lock.next.le_prev) {
+ QLIST_REMOVE(acb, copy_lock.next);
+ restart_dependent_writes(acb);
+ }
+
+ if (bitmap_need_update) {
+ write_metadata_to_journal(acb, true);
+ } else if (acb->write.update_table) {
+ write_metadata_to_journal(acb, false);
+ } else {
+ finish_write(acb, ret);
+ }
+}
+
+static void read_backing_for_copy_on_write_cb(void *opaque, int ret)
+{
+ FvdAIOCB *acb = (FvdAIOCB *) opaque;
+ BlockDriverState *bs = acb->common.bs;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ if (ret != 0) {
+ QDEBUG("WRITE: acb%llu-%p read_backing with error "
+ "ret=%d\n", acb->uuid, acb, ret);
+ finish_write(acb, ret);
+ } else {
+ QDEBUG("WRITE: acb%llu-%p "
+ "finish_read_from_backing_and_start_write_data\n",
+ acb->uuid, acb);
+ acb->write.hd_acb = store_data(false, acb, bs,
+ acb->write.cow_start_sector,
+ acb->write.cow_qiov,
+ acb->write.cow_qiov->size / 512,
+ write_data_cb, acb);
+ if (!acb->write.hd_acb) {
+ finish_write(acb, -EIO);
+ }
+ }
+}
+
+static int do_aio_write(FvdAIOCB * acb)
+{
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ /* Calculate the data region need be locked. */
+ const int64_t sector_end = acb->sector_num + acb->nb_sectors;
+ const int64_t block_begin = ROUND_DOWN(acb->sector_num, s->block_size);
+ int64_t block_end = ROUND_UP(sector_end, s->block_size);
+
+ /* Check for conflicting copy-on-reads. */
+ FvdAIOCB *old;
+ QLIST_FOREACH(old, &s->copy_locks, copy_lock.next) {
+ if (old->copy_lock.end > acb->sector_num &&
+ sector_end > old->copy_lock.begin) {
+ QLIST_INSERT_HEAD(&old->copy_lock.dependent_writes, acb,
+ write.next_dependent_write);
+ QDEBUG("WRITE: acb%llu-%p put_on_hold_due_to_data_conflict "
+ "with %s acb%llu-%p\n", acb->uuid, acb,
+ old->type == OP_WRITE ? "write" : "copy_on_read",
+ old->uuid, old);
+ return 0;
+ }
+ }
+
+ /* No conflict. check if this write updates partial blocks and need to
+ * read those blocks from the base image and merge with this write. */
+ int read_first_block, read_last_block;
+ if (acb->sector_num % s->block_size == 0) {
+ read_first_block = false;
+ } else if (fresh_bitmap_show_sector_in_base_img(acb->sector_num, s)) {
+ read_first_block = true;
+ } else {
+ read_first_block = false;
+ }
+
+ if (sector_end % s->block_size == 0) {
+ read_last_block = false;
+ } else if (fresh_bitmap_show_sector_in_base_img(sector_end, s)) {
+ read_last_block = true;
+ } else {
+ read_last_block = false;
+ }
+
+ if (read_first_block) {
+ if (read_last_block) {
+ /* Case 1: Read all the blocks involved from the base image. */
+ const QEMUIOVector *old_qiov = acb->write.qiov;
+ if (block_end > s->base_img_sectors) {
+ block_end = s->base_img_sectors;
+ }
+
+ int buf_size = (block_end - block_begin) * 512
+ + 2 * sizeof(QEMUIOVector)
+ + sizeof(struct iovec) * (old_qiov->niov + 3);
+ buf_size = ROUND_UP(buf_size, 512);
+ acb->write.cow_buf = my_qemu_blockalign(bs->backing_hd, buf_size);
+
+ /* For reading from the base image. */
+ QEMUIOVector *read_qiov = (QEMUIOVector *) (acb->write.cow_buf +
+ (block_end - block_begin) * 512);
+ read_qiov->iov = (struct iovec *)(read_qiov + 1);
+ read_qiov->nalloc = -1;
+ read_qiov->niov = 1;
+ read_qiov->iov[0].iov_base = acb->write.cow_buf;
+ read_qiov->iov[0].iov_len = read_qiov->size =
+ (block_end - block_begin) * 512;
+
+ /* For writing to the FVD data file. */
+ QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
+ write_qiov->iov = (struct iovec *)(write_qiov + 1);
+ write_qiov->nalloc = -1;
+ write_qiov->niov = old_qiov->niov + 2;
+ write_qiov->size = read_qiov->size;
+
+ /* The first entry is for data read from the base image. */
+ write_qiov->iov[0].iov_base = acb->write.cow_buf;
+ write_qiov->iov[0].iov_len = (acb->sector_num - block_begin) * 512;
+ memcpy(&write_qiov->iov[1], old_qiov->iov,
+ sizeof(struct iovec) * old_qiov->niov);
+
+ /* The last entry is for data read from the base image. */
+ const int last = old_qiov->niov + 1;
+ write_qiov->iov[last].iov_base = acb->write.cow_buf
+ + (sector_end - block_begin) * 512;
+ write_qiov->iov[last].iov_len = (block_end - sector_end) * 512;
+ acb->write.cow_qiov = write_qiov;
+ acb->write.cow_start_sector = block_begin;
+
+ acb->write.hd_acb = bdrv_aio_readv(bs->backing_hd, block_begin,
+ read_qiov, block_end - block_begin,
+ read_backing_for_copy_on_write_cb, acb);
+ if (!acb->write.hd_acb) {
+ goto fail;
+ }
+
+ acb->copy_lock.begin = block_begin;
+ acb->copy_lock.end = block_end;
+ QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
+ QDEBUG("WRITE: acb%llu-%p "
+ "read_first_last_partial_blocks_from_backing sector_num=%"
+ PRId64 " nb_sectors=%d\n", acb->uuid, acb, block_begin,
+ (int)(block_end - block_begin));
+ } else {
+ /* Case 2: Read the first block from the base image. */
+ int nb = acb->sector_num - block_begin;
+ const QEMUIOVector *old_qiov = acb->write.qiov;
+
+ /* Space for data and metadata. */
+ int buf_size = nb * 512 + 2 * sizeof(QEMUIOVector)
+ + sizeof(struct iovec) * (old_qiov->niov + 2);
+ buf_size = ROUND_UP(buf_size, 512);
+ acb->write.cow_buf = my_qemu_blockalign(bs->backing_hd, buf_size);
+
+ /* For reading from the base image. */
+ QEMUIOVector *read_qiov =
+ (QEMUIOVector *) (acb->write.cow_buf + nb * 512);
+ read_qiov->iov = (struct iovec *)(read_qiov + 1);
+ read_qiov->nalloc = -1;
+ read_qiov->niov = 1;
+ read_qiov->iov[0].iov_base = acb->write.cow_buf;
+ read_qiov->iov[0].iov_len = read_qiov->size = nb * 512;
+
+ /* For writing to the FVD data file. */
+ QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
+ write_qiov->iov = (struct iovec *)(write_qiov + 1);
+ write_qiov->nalloc = -1;
+ write_qiov->niov = old_qiov->niov + 1;
+ write_qiov->size = old_qiov->size + read_qiov->size;
+
+ /* The first entry is added for data read from the base image. */
+ write_qiov->iov[0].iov_base = acb->write.cow_buf;
+ write_qiov->iov[0].iov_len = read_qiov->size;
+ memcpy(&write_qiov->iov[1], old_qiov->iov,
+ sizeof(struct iovec) * old_qiov->niov);
+ acb->write.cow_qiov = write_qiov;
+ acb->write.cow_start_sector = block_begin;
+
+ acb->write.hd_acb = bdrv_aio_readv(bs->backing_hd,
+ block_begin, read_qiov, nb,
+ read_backing_for_copy_on_write_cb, acb);
+ if (!acb->write.hd_acb) {
+ goto fail;
+ }
+
+ acb->copy_lock.begin = block_begin;
+ acb->copy_lock.end = block_begin + s->block_size;
+ QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
+ QDEBUG("WRITE: acb%llu-%p read_first_partial_block_from_backing "
+ "sector_num=%" PRId64 " nb_sectors=%d\n",
+ acb->uuid, acb, block_begin, nb);
+ }
+ } else {
+ if (read_last_block) {
+ /* Case 3: Read the last block from the base image. */
+ int nb;
+ if (block_end < s->base_img_sectors) {
+ nb = block_end - sector_end;
+ } else {
+ nb = s->base_img_sectors - sector_end;
+ }
+ const QEMUIOVector *old_qiov = acb->write.qiov;
+
+ /* Space for data and metadata. */
+ int buf_size = nb * 512 + 2 * sizeof(QEMUIOVector)
+ + sizeof(struct iovec) * (old_qiov->niov + 2);
+ buf_size = ROUND_UP(buf_size, 512);
+ acb->write.cow_buf = my_qemu_blockalign(bs->backing_hd, buf_size);
+
+ /* For reading from the base image. */
+ QEMUIOVector *read_qiov = (QEMUIOVector *) (acb->write.cow_buf
+ + nb * 512);
+ read_qiov->iov = (struct iovec *)(read_qiov + 1);
+ read_qiov->nalloc = -1;
+ read_qiov->niov = 1;
+ read_qiov->iov[0].iov_base = acb->write.cow_buf;
+ read_qiov->iov[0].iov_len = read_qiov->size = nb * 512;
+
+ /* For writing to the FVD data file. */
+ QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
+ write_qiov->iov = (struct iovec *)(write_qiov + 1);
+ write_qiov->nalloc = -1;
+ write_qiov->niov = old_qiov->niov + 1;
+ write_qiov->size = old_qiov->size + read_qiov->size;
+ memcpy(write_qiov->iov, old_qiov->iov,
+ sizeof(struct iovec) * old_qiov->niov);
+
+ /* The last appended entry is for data read from the base image. */
+ write_qiov->iov[old_qiov->niov].iov_base = acb->write.cow_buf;
+ write_qiov->iov[old_qiov->niov].iov_len = read_qiov->size;
+ acb->write.cow_qiov = write_qiov;
+ acb->write.cow_start_sector = acb->sector_num;
+
+ acb->write.hd_acb = bdrv_aio_readv(bs->backing_hd,
+ sector_end, read_qiov, nb,
+ read_backing_for_copy_on_write_cb, acb);
+ if (!acb->write.hd_acb) {
+ goto fail;
+ }
+
+ acb->copy_lock.end = block_end;
+ acb->copy_lock.begin = block_end - s->block_size;
+ QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
+ QDEBUG("WRITE: acb%llu-%p read_last_partial_block_from_backing "
+ "sector_num=%" PRId64 " nb_sectors=%d\n",
+ acb->uuid, acb, sector_end, nb);
+ } else {
+ /* Case 4: Can write directly and no need to merge with data from
+ * the base image. */
+ QDEBUG("WRITE: acb%llu-%p "
+ "write_fvd_without_read_partial_block_from_backing\n",
+ acb->uuid, acb);
+ acb->write.hd_acb = store_data(false, acb, bs, acb->sector_num,
+ acb->write.qiov, acb->nb_sectors,
+ write_data_cb, acb);
+ if (!acb->write.hd_acb) {
+ goto fail;
+ }
+ }
+ }
+
+ QLIST_INSERT_HEAD(&s->write_locks, acb, write.next_write_lock);
+ return 0;
+
+fail:
+ if (acb->write.cow_buf) {
+ my_qemu_vfree(acb->write.cow_buf);
+ }
+ return -EIO;
+}
+
+/* Resubmit all write requests that were queued on acb->copy_lock.dependent_writes
+ * (writes that conflicted with the region locked by 'acb'). Each request is
+ * retried via do_aio_write(); on failure the request's completion callback is
+ * invoked with -1 and the request is released. Called for both OP_WRITE and
+ * copy-on-read completions, as the QDEBUG branches below show. */
+static void restart_dependent_writes(FvdAIOCB * acb)
+{
+ acb->copy_lock.next.le_prev = NULL;
+ FvdAIOCB *req = acb->copy_lock.dependent_writes.lh_first;
+
+ while (req) {
+ /* Keep a copy of 'next' as it may be changed in do_aio_write(). */
+ FvdAIOCB *next = req->write.next_dependent_write.le_next;
+
+ /* Indicate that this write is no longer on any dependent list. This
+ * helps fvd_aio_cancel_read() work properly. */
+ req->write.next_dependent_write.le_prev = NULL;
+
+ if (acb->type == OP_WRITE) {
+ QDEBUG("WRITE: acb%llu-%p finished_and_restart_conflict_write "
+ "acb%llu-%p\n", acb->uuid, acb, req->uuid, req);
+ } else {
+ QDEBUG("READ: copy_on_read acb%llu-%p "
+ "finished_and_restart_conflict_write acb%llu-%p\n",
+ acb->uuid, acb, req->uuid, req);
+ }
+
+ if (do_aio_write(req) < 0) {
+ QDEBUG("WRITE: acb%llu-%p finished with error ret=%d\n",
+ req->uuid, req, -1);
+ req->common.cb(req->common.opaque, -1);
+ my_qemu_aio_release(req);
+ }
+
+ req = next;
+ }
}
diff --git a/block/fvd.c b/block/fvd.c
index e41f419..5b3dcac 100644
--- a/block/fvd.c
+++ b/block/fvd.c
@@ -27,11 +27,13 @@
* function optimization. */
#include "block/fvd-debug.c"
#include "block/fvd-flush.c"
+#include "block/fvd-bitmap.c"
#include "block/fvd-misc.c"
#include "block/fvd-create.c"
#include "block/fvd-open.c"
-#include "block/fvd-read.c"
#include "block/fvd-write.c"
+#include "block/fvd-read.c"
+#include "block/fvd-store.c"
#include "block/fvd-journal.c"
#include "block/fvd-prefetch.c"
#include "block/fvd-update.c"
diff --git a/block/fvd.h b/block/fvd.h
index 9847e7f..34ea2b4 100644
--- a/block/fvd.h
+++ b/block/fvd.h
@@ -432,6 +432,7 @@ typedef struct FvdAIOCB {
#endif
} FvdAIOCB;
+static AIOPool fvd_aio_pool;
static BlockDriver bdrv_fvd;
static QEMUOptionParameter fvd_create_options[];
static QEMUOptionParameter fvd_update_options[];
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 12/26] FVD: add impl of interface bdrv_aio_readv()
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (9 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 11/26] FVD: add impl of interface bdrv_aio_writev() Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 13/26] FVD: add impl of storing data in compact image Chunqiang Tang
` (13 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds FVD's implementation of the bdrv_aio_readv() interface. It
supports read and copy-on-read in FVD.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-bitmap.c | 88 ++++++++++
block/fvd-load.c | 20 +++
block/fvd-read.c | 484 +++++++++++++++++++++++++++++++++++++++++++++++++++-
block/fvd-utils.c | 44 +++++
block/fvd.c | 2 +
5 files changed, 637 insertions(+), 1 deletions(-)
create mode 100644 block/fvd-load.c
create mode 100644 block/fvd-utils.c
diff --git a/block/fvd-bitmap.c b/block/fvd-bitmap.c
index 7e96201..30e4a4b 100644
--- a/block/fvd-bitmap.c
+++ b/block/fvd-bitmap.c
@@ -148,3 +148,91 @@ static bool update_fresh_bitmap_and_check_stale_bitmap(FvdAIOCB * acb)
return need_update;
}
+
+/* Find, within [*from, *to), a region whose data still resides in the base
+ * image (per the fresh bitmap) and that does not conflict with any write in
+ * s->write_locks. On success, narrows *from/*to to that region and returns
+ * true; returns false if no such region exists. The scan advances in units
+ * of s->block_size; the region end is clipped to s->base_img_sectors. */
+static bool find_region_in_base_img(BDRVFvdState * s, int64_t * from,
+ int64_t * to)
+{
+ int64_t sec = *from;
+ int64_t region_end = *to;
+
+ if (region_end > s->base_img_sectors) {
+ region_end = s->base_img_sectors;
+ }
+
+check_next_region:
+ if (sec >= region_end) {
+ return false;
+ }
+
+ if (!fresh_bitmap_show_sector_in_base_img(sec, s)) {
+ /* Find the first sector in the base image. */
+
+ sec = ROUND_UP(sec + 1, s->block_size); /* Begin of next block. */
+ while (1) {
+ if (sec >= region_end) {
+ return false;
+ }
+ if (fresh_bitmap_show_sector_in_base_img(sec, s)) {
+ break;
+ }
+ sec += s->block_size; /* Begin of the next block. */
+ }
+ }
+
+ /* Find the end of the region in the base image. */
+ int64_t first_sec = sec;
+ sec = ROUND_UP(sec + 1, s->block_size); /* Begin of next block. */
+ while (1) {
+ if (sec >= region_end) {
+ sec = region_end;
+ break;
+ }
+ if (!fresh_bitmap_show_sector_in_base_img(sec, s)) {
+ break;
+ }
+ sec += s->block_size; /* Begin of the next block. */
+ }
+ int64_t last_sec = sec;
+
+ /* Check conflicting writes: trim the candidate region so it does not
+ * overlap either end of any in-flight write (rounded to block bounds). */
+ FvdAIOCB *old;
+ QLIST_FOREACH(old, &s->write_locks, write.next_write_lock) {
+ int64_t old_begin = ROUND_DOWN(old->sector_num, s->block_size);
+ int64_t old_end = old->sector_num + old->nb_sectors;
+ old_end = ROUND_UP(old_end, s->block_size);
+ if (old_begin <= first_sec && first_sec < old_end) {
+ first_sec = old_end;
+ }
+ if (old_begin < last_sec && last_sec <= old_end) {
+ last_sec = old_begin;
+ }
+ }
+
+ if (first_sec >= last_sec) {
+ /* The region in [first_sec, sec) is fully covered. */
+ goto check_next_region;
+ }
+
+ /* This loop cannot be merged with the loop above. Otherwise, the logic
+ * would be incorrect. This loop covers the case that an old request
+ * spans over a subset of the region being checked. */
+ QLIST_FOREACH(old, &s->write_locks, write.next_write_lock) {
+ int64_t old_begin = ROUND_DOWN(old->sector_num, s->block_size);
+ if (first_sec <= old_begin && old_begin < last_sec) {
+ last_sec = old_begin;
+ }
+ }
+
+ if (first_sec >= last_sec) {
+ /* The region in [first_sec, sec) is fully covered. */
+ goto check_next_region;
+ }
+
+ ASSERT(first_sec % s->block_size == 0 && (last_sec % s->block_size == 0 ||
+ last_sec == s->base_img_sectors));
+
+ *from = first_sec;
+ *to = last_sec;
+ return true;
+}
diff --git a/block/fvd-load.c b/block/fvd-load.c
new file mode 100644
index 0000000..80ab32c
--- /dev/null
+++ b/block/fvd-load.c
@@ -0,0 +1,20 @@
+/*
+ * QEMU Fast Virtual Disk Format Load Data from Compact Image
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/* Load guest data from a compact image. Placeholder: always returns NULL
+ * (i.e. failure to start the request) in this patch; the real implementation
+ * is presumably supplied by the follow-on patch "FVD: add impl of loading
+ * data from compact image" — confirm against that patch. */
+static inline BlockDriverAIOCB *load_data(FvdAIOCB * parent_acb,
+ BlockDriverState * bs, int64_t sector_num,
+ QEMUIOVector * orig_qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb, void *opaque)
+{
+ return NULL;
+}
diff --git a/block/fvd-read.c b/block/fvd-read.c
index b9f3ac9..cd041e5 100644
--- a/block/fvd-read.c
+++ b/block/fvd-read.c
@@ -11,11 +11,493 @@
*
*/
+static void read_backing_for_copy_on_read_cb(void *opaque, int ret);
+static void read_fvd_cb(void *opaque, int ret);
+static inline void calc_read_region(BDRVFvdState * s, int64_t sector_num,
+ int nb_sectors, int64_t * p_first_sec_in_fvd,
+ int64_t * p_last_sec_in_fvd,
+ int64_t * p_first_sec_in_backing,
+ int64_t * p_last_sec_in_backing);
+static inline BlockDriverAIOCB *load_data(FvdAIOCB * parent_acb,
+ BlockDriverState * bs, int64_t sector_num,
+ QEMUIOVector * orig_qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb, void *opaque);
+
+/* FVD implementation of bdrv_aio_readv(). Determines whether the requested
+ * sectors live in the FVD data file, the base (backing) image, or both, and
+ * issues the corresponding sub-reads. May also round the backing-image read
+ * up to block boundaries to enable copy-on-read. Returns NULL on failure to
+ * start the request. */
static BlockDriverAIOCB *fvd_aio_readv(BlockDriverState * bs,
int64_t sector_num, QEMUIOVector * qiov,
int nb_sectors,
BlockDriverCompletionFunc * cb,
void *opaque)
{
- return NULL;
+ BDRVFvdState *s = bs->opaque;
+ TRACE_REQUEST(false, sector_num, nb_sectors);
+
+ if (!s->data_region_prepared) {
+ init_data_region(s);
+ }
+
+ if (s->prefetch_state == PREFETCH_STATE_FINISHED
+ || sector_num >= s->base_img_sectors) {
+ /* This is an efficient case. See Section 3.3.5 of the FVD-cow paper.
+ * This also covers the case of no base image. */
+ return load_data(NULL, bs, sector_num, qiov, nb_sectors, cb, opaque);
+ }
+
+ /* Figure out data regions in the base image and in the FVD data file. */
+ int64_t last_sec_in_backing, first_sec_in_backing;
+ int64_t last_sec_in_fvd, first_sec_in_fvd;
+ calc_read_region(s, sector_num, nb_sectors, &first_sec_in_fvd,
+ &last_sec_in_fvd, &first_sec_in_backing,
+ &last_sec_in_backing);
+
+ if (first_sec_in_backing < 0) {
+ /* A simple case: all requested data are in the FVD data file. */
+ return load_data(NULL, bs, sector_num, qiov, nb_sectors, cb, opaque);
+ }
+
+ /* Do copy-on-read only if the context id is 0, i.e., it is not emulating
+ * synchronous I/O. Doing copy-on-read in emulated synchronous I/O may
+ * leave the copy-on-read callbacks never being processed due to
+ * mismatching contextid. */
+ const bool copy_on_read = s->copy_on_read && (get_async_context_id() == 0);
+
+ if (first_sec_in_fvd < 0 && !copy_on_read) {
+ /* A simple case: all requested data are in the base image and no need
+ * to do copy_on_read. */
+ return bdrv_aio_readv(bs->backing_hd, sector_num, qiov, nb_sectors, cb,
+ opaque);
+ }
+
+ /* The remaining cases are more complicated, which can be: 1. Data are
+ * only in the base image and copy-on-read is needed. 2. Data are in both
+ * the base image and the FVD data file. Copy-on-read may be either true
+ * or false. */
+ FvdAIOCB *acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
+ if (!acb) {
+ return NULL;
+ }
+
+ QDEBUG("READ: acb%llu-%p start sector_num=%" PRId64 " nb_sectors=%d\n",
+ acb->uuid, acb, sector_num, nb_sectors);
+
+ acb->type = OP_READ;
+ acb->cancel_in_progress = false;
+ acb->sector_num = sector_num;
+ acb->nb_sectors = nb_sectors;
+ acb->read.qiov = qiov;
+ acb->read.ret = 0;
+ acb->read.read_backing.hd_acb = NULL;
+ acb->read.read_backing.done = false;
+ acb->read.read_backing.iov.iov_base = NULL;
+ acb->read.read_fvd.hd_acb = NULL;
+ acb->read.read_fvd.iov.iov_base = NULL;
+ /* If nothing needs to be read from the FVD data file, mark that
+ * sub-request as already done so finish_read() fires after the
+ * backing-image read alone. */
+ acb->read.read_fvd.done = (first_sec_in_fvd < 0);
+
+ /* Read from the base image. */
+ if (copy_on_read) {
+ /* Round the request to the block boundary. */
+ acb->read.read_backing.sector_num =
+ ROUND_DOWN(first_sec_in_backing, s->block_size);
+ int64_t end = ROUND_UP(last_sec_in_backing + 1, s->block_size);
+ if (end > s->base_img_sectors) {
+ end = s->base_img_sectors;
+ }
+ acb->read.read_backing.nb_sectors =
+ end - acb->read.read_backing.sector_num;
+ } else {
+ acb->read.read_backing.sector_num = first_sec_in_backing;
+ acb->read.read_backing.nb_sectors =
+ last_sec_in_backing - first_sec_in_backing + 1;
+ }
+
+ acb->read.read_backing.iov.iov_len = acb->read.read_backing.nb_sectors*512;
+ acb->read.read_backing.iov.iov_base =
+ my_qemu_blockalign(bs->backing_hd, acb->read.read_backing.iov.iov_len);
+ qemu_iovec_init_external(&acb->read.read_backing.qiov,
+ &acb->read.read_backing.iov, 1);
+ acb->read.read_backing.hd_acb =
+ bdrv_aio_readv(bs->backing_hd, acb->read.read_backing.sector_num,
+ &acb->read.read_backing.qiov,
+ acb->read.read_backing.nb_sectors,
+ read_backing_for_copy_on_read_cb, acb);
+ QDEBUG("READ: acb%llu-%p read_backing backing_sector_num=%" PRId64
+ " backing_nb_sectors=%d\n", acb->uuid, acb,
+ acb->read.read_backing.sector_num,
+ acb->read.read_backing.nb_sectors);
+
+ if (!acb->read.read_backing.hd_acb) {
+ my_qemu_vfree(acb->read.read_backing.iov.iov_base);
+ my_qemu_aio_release(acb);
+ return NULL;
+ }
+
+ if (first_sec_in_fvd >= 0) {
+ /* Read the FVD data file. */
+ acb->read.read_fvd.sector_num = first_sec_in_fvd;
+ acb->read.read_fvd.nb_sectors = last_sec_in_fvd - first_sec_in_fvd + 1;
+ acb->read.read_fvd.iov.iov_len = acb->read.read_fvd.nb_sectors * 512;
+
+ /* Make a copy of the current bitmap because it may change when the
+ * read requests finish. The copy is stashed in the same allocation,
+ * right after the data buffer. */
+ int64_t b = MIN(acb->read.read_backing.sector_num,
+ acb->read.read_fvd.sector_num);
+ b = b / s->block_size / 8; /* First byte of the bitmap we need. */
+ int64_t e1 = acb->read.read_backing.sector_num +
+ acb->read.read_backing.nb_sectors;
+ int64_t e2 = acb->read.read_fvd.sector_num +
+ acb->read.read_fvd.nb_sectors;
+ int64_t e = MAX(e1, e2);
+ if (e > s->base_img_sectors) {
+ e = s->base_img_sectors;
+ }
+ e = (e - 1) / s->block_size / 8; /* Last byte of the bitmap we need. */
+ int bitmap_bytes = e - b + 1;
+ int buf_size = acb->read.read_fvd.iov.iov_len +
+ ROUND_UP(bitmap_bytes, 512);
+ acb->read.read_fvd.iov.iov_base =
+ my_qemu_blockalign(s->fvd_data, buf_size);
+ uint8_t *saved_bitmap = ((uint8_t *) acb->read.read_fvd.iov.iov_base) +
+ acb->read.read_fvd.iov.iov_len;
+ memcpy(saved_bitmap, s->fresh_bitmap + b, bitmap_bytes);
+
+ qemu_iovec_init_external(&acb->read.read_fvd.qiov,
+ &acb->read.read_fvd.iov, 1);
+ QDEBUG("READ: acb%llu-%p read_fvd fvd_sector_num=%" PRId64
+ " fvd_nb_sectors=%d\n", acb->uuid, acb,
+ acb->read.read_fvd.sector_num, acb->read.read_fvd.nb_sectors);
+ acb->read.read_fvd.hd_acb = load_data(acb, bs, first_sec_in_fvd,
+ &acb->read.read_fvd.qiov,
+ acb->read.read_fvd.nb_sectors,
+ read_fvd_cb, acb);
+ if (!acb->read.read_fvd.hd_acb) {
+ if (acb->read.read_backing.hd_acb) {
+ bdrv_aio_cancel(acb->read.read_backing.hd_acb);
+ my_qemu_vfree(acb->read.read_backing.iov.iov_base);
+ }
+ my_qemu_vfree(acb->read.read_fvd.iov.iov_base);
+ my_qemu_aio_release(acb);
+ return NULL;
+ }
+ }
+
+ return &acb->common;
+}
+
+/* Completion callback for a copy-on-read write into the FVD data file.
+ * On success, marks the copied sectors in the fresh bitmap, releases the
+ * region lock (restarting any dependent writes), and then tries to start the
+ * next copy-on-read chunk from the data still buffered in acb->copy.buf.
+ * When no further region qualifies, frees the buffer and releases the acb. */
+static void copy_on_read_cb(void *opaque, int ret)
+{
+ FvdAIOCB *acb = opaque;
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ if (ret == 0) {
+ /* Update fresh_bitmap but do not update stale_bitmap or the on-disk
+ * bitmap. See Section 3.3.4 of the FVD-cow paper. */
+ update_fresh_bitmap(acb->sector_num, acb->nb_sectors, s);
+ }
+
+ s->outstanding_copy_on_read_data -= acb->nb_sectors * 512;
+
+#ifdef FVD_DEBUG
+ s->total_copy_on_read_data += acb->nb_sectors * 512;
+#endif
+ QDEBUG("READ: acb%llu-%p copy_on_read_cb buffer_sector_num=%" PRId64
+ " buffer_nb_sectors=%d write_sector_num=%" PRId64
+ " write_nb_sectors=%d outstanding_copy_on_read=%" PRId64 "\n",
+ acb->uuid, acb, acb->copy.buffered_sector_begin,
+ (int)(acb->copy.buffered_sector_end -
+ acb->copy.buffered_sector_begin), acb->sector_num,
+ acb->nb_sectors, s->outstanding_copy_on_read_data);
+
+ QLIST_REMOVE(acb, copy_lock.next);
+ restart_dependent_writes(acb);
+
+ /* Continue with the rest of the buffered data, if any still needs to be
+ * copied from the base image. */
+ int64_t begin = acb->sector_num + acb->nb_sectors;
+ int64_t end = acb->copy.buffered_sector_end;
+
+ if (find_region_in_base_img(s, &begin, &end)) {
+ acb->sector_num = begin;
+ acb->nb_sectors = end - begin;
+ acb->copy.iov.iov_base = acb->copy.buf +
+ (begin - acb->copy.buffered_sector_begin) * 512;
+ acb->copy.iov.iov_len = acb->nb_sectors * 512;
+ qemu_iovec_init_external(&acb->copy.qiov, &acb->copy.iov, 1);
+ QDEBUG("READ: acb%llu-%p copy_on_read buffer_sector_num=%" PRId64
+ " buffer_nb_sectors=%d write_sector_num=%" PRId64
+ " write_nb_sectors=%d outstanding_copy_on_read=%" PRId64 "\n",
+ acb->uuid, acb, acb->copy.buffered_sector_begin,
+ (int)(acb->copy.buffered_sector_end -
+ acb->copy.buffered_sector_begin), acb->sector_num,
+ acb->nb_sectors, s->outstanding_copy_on_read_data);
+ acb->copy.hd_acb = store_data(true, acb, bs, acb->sector_num,
+ &acb->copy.qiov, acb->nb_sectors,
+ copy_on_read_cb, acb);
+ if (acb->copy.hd_acb) {
+ QLIST_INIT(&acb->copy_lock.dependent_writes);
+ acb->copy_lock.begin = begin;
+ acb->copy_lock.end = end;
+ QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
+ s->outstanding_copy_on_read_data += acb->copy.iov.iov_len;
+ return;
+ }
+ }
+
+ QDEBUG("READ: acb%llu-%p no_more_copy_on_read\n", acb->uuid, acb);
+ my_qemu_vfree(acb->copy.buf);
+ my_qemu_aio_release(acb);
+}
+
+/* Called when both sub-reads (backing image and FVD data file) of a read
+ * request have completed. On error, propagates acb->read.ret to the caller
+ * and frees everything. On success, assembles the caller's qiov — either
+ * directly from the backing-image buffer, or by merging the two buffers under
+ * the guidance of the bitmap snapshot taken at submit time — invokes the
+ * completion callback, and then optionally repurposes the acb as an OP_COPY
+ * request to write the buffered backing-image data into the FVD data file
+ * (copy-on-read). */
+static void finish_read(FvdAIOCB * acb)
+{
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ if (acb->read.ret != 0) {
+ QDEBUG("READ: acb%llu-%p finish_read error ret=%d sector_num=%" PRId64
+ " nb_sectors=%d\n", acb->uuid, acb, acb->read.ret,
+ acb->sector_num, acb->nb_sectors);
+ acb->common.cb(acb->common.opaque, acb->read.ret);
+ if (acb->read.read_backing.iov.iov_base) {
+ my_qemu_vfree(acb->read.read_backing.iov.iov_base);
+ }
+ if (acb->read.read_fvd.iov.iov_base) {
+ my_qemu_vfree(acb->read.read_fvd.iov.iov_base);
+ }
+ my_qemu_aio_release(acb);
+
+ return;
+ }
+
+ if (!acb->read.read_fvd.iov.iov_base) {
+ /* Only read data from the base image. */
+ uint8_t *data = ((uint8_t *) acb->read.read_backing.iov.iov_base) +
+ (acb->sector_num - acb->read.read_backing.sector_num) * 512;
+ qemu_iovec_from_buffer(acb->read.qiov, data, acb->nb_sectors * 512);
+ } else {
+ /* Under the guidance of the saved bitmap, merge data from the FVD
+ * data file and the base image. The saved bitmap sits right after
+ * the data in read_fvd.iov, as laid out by fvd_aio_readv(). */
+ uint8_t *saved_bitmap = ((uint8_t *) acb->read.read_fvd.iov.iov_base) +
+ acb->read.read_fvd.iov.iov_len;
+ int64_t bitmap_offset = MIN(acb->read.read_backing.sector_num,
+ acb->read.read_fvd.sector_num);
+ bitmap_offset = bitmap_offset / s->block_size / 8;
+ int iov_index = 0;
+ uint8_t *iov_buf = acb->read.qiov->iov[0].iov_base;
+ int iov_left = acb->read.qiov->iov[0].iov_len;
+ int64_t sec = acb->sector_num;
+ const int64_t end = acb->sector_num + acb->nb_sectors;
+ int64_t first_sec;
+ uint8_t *source;
+
+ if (bitmap_show_sector_in_base_img(sec,s,bitmap_offset,saved_bitmap)) {
+ goto in_backing;
+ }
+
+ while (1) {
+ /* For a section of data in the FVD data file. */
+ if (sec >= end) {
+ break;
+ }
+
+ first_sec = sec;
+ do {
+ sec++;
+ } while (sec < end && !bitmap_show_sector_in_base_img(sec, s,
+ bitmap_offset, saved_bitmap));
+
+ source = ((uint8_t *) acb->read.read_fvd.iov.iov_base) +
+ (first_sec - acb->read.read_fvd.sector_num) * 512;
+ copy_iov(acb->read.qiov->iov, &iov_index, &iov_buf, &iov_left,
+ source, (sec - first_sec) * 512);
+
+in_backing:
+ /* For a section of data in the base image. */
+ if (sec >= end) {
+ break;
+ }
+
+ first_sec = sec;
+ do {
+ sec++;
+ } while (sec < end && bitmap_show_sector_in_base_img(sec, s,
+ bitmap_offset, saved_bitmap));
+
+ source = ((uint8_t *) acb->read.read_backing.iov.iov_base) +
+ (first_sec - acb->read.read_backing.sector_num) * 512;
+ copy_iov(acb->read.qiov->iov, &iov_index, &iov_buf, &iov_left,
+ source, (sec - first_sec) * 512);
+ }
+
+ ASSERT(iov_index == acb->read.qiov->niov - 1 && iov_left == 0);
+ my_qemu_vfree(acb->read.read_fvd.iov.iov_base);
+ }
+
+ QDEBUG("READ: acb%llu-%p finish_read ret=%d\n", acb->uuid, acb,
+ acb->read.ret);
+ acb->common.cb(acb->common.opaque, acb->read.ret);
+
+ if (!s->copy_on_read || get_async_context_id() != 0) {
+ /* Do copy-on-read only if the context id is 0, i.e., it is not
+ * emulating synchronous I/O. Doing copy-on-read in emulated
+ * synchronous I/O may leave the copy-on-read callbacks never being
+ * processed due to mismatching context id. */
+ my_qemu_vfree(acb->read.read_backing.iov.iov_base);
+ my_qemu_aio_release(acb);
+ return;
+ }
+
+ /* Convert AIOReadCB into an AIOCopyCB for copy-on-read. */
+ uint8_t *buf = acb->read.read_backing.iov.iov_base;
+ int64_t begin = acb->read.read_backing.sector_num;
+ int64_t end = begin + acb->read.read_backing.nb_sectors;
+
+ acb->type = OP_COPY;
+ acb->copy.buf = buf;
+ acb->copy.buffered_sector_begin = begin;
+ acb->copy.buffered_sector_end = end;
+
+ if (s->outstanding_copy_on_read_data < s->max_outstanding_copy_on_read_data
+ && find_region_in_base_img(s, &begin, &end)) {
+ /* Write to the FVD data file. */
+ acb->sector_num = begin;
+ acb->nb_sectors = end - begin;
+ acb->copy.iov.iov_base =
+ buf + (begin - acb->copy.buffered_sector_begin) * 512;
+ acb->copy.iov.iov_len = acb->nb_sectors * 512;
+ qemu_iovec_init_external(&acb->copy.qiov, &acb->copy.iov, 1);
+ QDEBUG("READ: acb%llu-%p copy_on_read buffer_sector_num=%" PRId64
+ " buffer_nb_sectors=%d write_sector_num=%" PRId64
+ " write_nb_sectors=%d outstanding_copy_on_read=%" PRId64 "\n",
+ acb->uuid, acb, acb->copy.buffered_sector_begin,
+ (int)(acb->copy.buffered_sector_end -
+ acb->copy.buffered_sector_begin), acb->sector_num,
+ acb->nb_sectors, s->outstanding_copy_on_read_data);
+ acb->copy.hd_acb = store_data(true, acb, bs, acb->sector_num,
+ &acb->copy.qiov, acb->nb_sectors,
+ copy_on_read_cb, acb);
+ if (acb->copy.hd_acb) {
+ QLIST_INIT(&acb->copy_lock.dependent_writes);
+ acb->copy_lock.begin = begin;
+ acb->copy_lock.end = end;
+ QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
+ s->outstanding_copy_on_read_data += acb->copy.iov.iov_len;
+ return;
+ }
+ }
+
+ /* No more copy-on-read to do. */
+ my_qemu_vfree(acb->copy.buf);
+ my_qemu_aio_release(acb);
+}
+
+/* Completion callback for the FVD-data-file half of a read. Records the
+ * first error seen and, once the backing-image half has also completed,
+ * finalizes the whole request via finish_read(). */
+static void read_fvd_cb(void *opaque, int ret)
+{
+ FvdAIOCB *acb = opaque;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ QDEBUG("READ: acb%llu-%p read_fvd_cb ret=%d\n", acb->uuid, acb, ret);
+ acb->read.read_fvd.hd_acb = NULL;
+ acb->read.read_fvd.done = true;
+ if (acb->read.ret == 0) {
+ acb->read.ret = ret;
+ }
+
+ if (acb->read.read_backing.done) {
+ finish_read(acb); /* The other request also finished. */
+ }
+}
+
+/* Completion callback for the backing-image half of a read. Records the
+ * first error seen and, once the FVD-data-file half has also completed,
+ * finalizes the whole request via finish_read(). */
+static void read_backing_for_copy_on_read_cb(void *opaque, int ret)
+{
+ FvdAIOCB *acb = opaque;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ QDEBUG("READ: acb%llu-%p read_backing_cb ret=%d\n", acb->uuid, acb, ret);
+ acb->read.read_backing.hd_acb = NULL;
+ acb->read.read_backing.done = true;
+ if (acb->read.ret == 0) {
+ acb->read.ret = ret;
+ }
+
+ if (acb->read.read_fvd.done) {
+ finish_read(acb); /* The other request also finished. */
+ }
+}
+
+/* Walk the fresh bitmap one block at a time over [sector_num,
+ * sector_num + nb_sectors) and report which sectors must come from the FVD
+ * data file and which from the backing image. Each output is set to -1 when
+ * the corresponding range is empty. Sectors at or beyond
+ * s->base_img_sectors are always attributed to the FVD data file. */
+static inline void calc_read_region(BDRVFvdState * s, int64_t sector_num,
+ int nb_sectors,
+ int64_t * p_first_sec_in_fvd,
+ int64_t * p_last_sec_in_fvd,
+ int64_t * p_first_sec_in_backing,
+ int64_t * p_last_sec_in_backing)
+{
+ int64_t last_sec_in_backing = -1, first_sec_in_backing = -1;
+ int64_t last_sec_in_fvd = -1, first_sec_in_fvd = -1;
+ int prev_block_in_backing;
+
+ if (fresh_bitmap_show_sector_in_base_img(sector_num, s)) {
+ first_sec_in_backing = last_sec_in_backing = sector_num;
+ prev_block_in_backing = true;
+ } else {
+ first_sec_in_fvd = last_sec_in_fvd = sector_num;
+ prev_block_in_backing = false;
+ }
+
+ /* Begin of next block. */
+ int64_t sec = ROUND_UP(sector_num + 1, s->block_size);
+
+ const int64_t sec_end = sector_num + nb_sectors;
+ int64_t last_sec = MIN(sec_end, s->base_img_sectors) - 1;
+
+ while (1) {
+ if (sec > last_sec) {
+ sec = last_sec;
+ }
+
+ if (fresh_bitmap_show_sector_in_base_img(sec, s)) {
+ if (first_sec_in_backing < 0) {
+ first_sec_in_backing = sec;
+ }
+ if (!prev_block_in_backing) {
+ last_sec_in_fvd = sec - 1;
+ prev_block_in_backing = true;
+ }
+ last_sec_in_backing = sec;
+ } else {
+ if (first_sec_in_fvd < 0) {
+ first_sec_in_fvd = sec;
+ }
+ if (prev_block_in_backing) {
+ last_sec_in_backing = sec - 1;
+ prev_block_in_backing = false;
+ }
+ last_sec_in_fvd = sec;
+ }
+
+ if (sec == last_sec) {
+ break;
+ }
+ sec += s->block_size;
+ }
+
+ /* The tail beyond the base image can only live in the FVD data file. */
+ if (sec_end > s->base_img_sectors) {
+ if (first_sec_in_fvd < 0) {
+ first_sec_in_fvd = s->base_img_sectors;
+ }
+ last_sec_in_fvd = sec_end - 1;
+ }
+
+ *p_first_sec_in_fvd = first_sec_in_fvd;
+ *p_last_sec_in_fvd = last_sec_in_fvd;
+ *p_first_sec_in_backing = first_sec_in_backing;
+ *p_last_sec_in_backing = last_sec_in_backing;
}
diff --git a/block/fvd-utils.c b/block/fvd-utils.c
new file mode 100644
index 0000000..ff2bb8f
--- /dev/null
+++ b/block/fvd-utils.c
@@ -0,0 +1,44 @@
+/*
+ * QEMU Fast Virtual Disk Format Utility Functions
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/* Copy 'total' bytes from 'source' into the iovec array, resuming at the
+ * cursor position described by (*p_index, *p_buf, *p_left) and spilling into
+ * subsequent iovec entries as each one fills up. Updates the cursor on
+ * return so repeated calls scatter consecutive data. The caller must ensure
+ * the iovec array has enough remaining capacity for 'total' bytes. */
+static inline void copy_iov(struct iovec *iov, int *p_index,
+ uint8_t ** p_buf, int *p_left,
+ uint8_t * source, int total)
+{
+ int index = *p_index;
+ uint8_t *buf = *p_buf;
+ int left = *p_left;
+
+ /* Current entry exhausted: advance to the next iovec entry. */
+ if (left <= 0) {
+ index++;
+ buf = iov[index].iov_base;
+ left = iov[index].iov_len;
+ }
+
+ while (1) {
+ if (left >= total) {
+ memcpy(buf, source, total);
+ *p_buf = buf + total;
+ *p_left = left - total;
+ *p_index = index;
+ return;
+ }
+
+ /* Fill the rest of this entry, then move on to the next one. */
+ memcpy(buf, source, left);
+ total -= left;
+ source += left;
+ index++;
+ buf = iov[index].iov_base;
+ left = iov[index].iov_len;
+ }
+}
diff --git a/block/fvd.c b/block/fvd.c
index 5b3dcac..74845e7 100644
--- a/block/fvd.c
+++ b/block/fvd.c
@@ -27,6 +27,7 @@
* function optimization. */
#include "block/fvd-debug.c"
#include "block/fvd-flush.c"
+#include "block/fvd-utils.c"
#include "block/fvd-bitmap.c"
#include "block/fvd-misc.c"
#include "block/fvd-create.c"
@@ -34,6 +35,7 @@
#include "block/fvd-write.c"
#include "block/fvd-read.c"
#include "block/fvd-store.c"
+#include "block/fvd-load.c"
#include "block/fvd-journal.c"
#include "block/fvd-prefetch.c"
#include "block/fvd-update.c"
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 13/26] FVD: add impl of storing data in compact image
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (10 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 12/26] FVD: add impl of interface bdrv_aio_readv() Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 14/26] FVD: add impl of loading data from " Chunqiang Tang
` (12 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds the implementation of storing data in a compact image. This
capability is needed for both copy-on-write (see fvd_aio_writev()) and
copy-on-read (see fvd_aio_readv()).
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-store.c | 459 +++++++++++++++++++++++++++++++++++++++++++++++++++++
block/fvd-utils.c | 65 ++++++++
2 files changed, 524 insertions(+), 0 deletions(-)
diff --git a/block/fvd-store.c b/block/fvd-store.c
index 85e45d4..fe670eb 100644
--- a/block/fvd-store.c
+++ b/block/fvd-store.c
@@ -11,10 +11,469 @@
*
*/
+static uint32_t allocate_chunk(BlockDriverState * bs);
+static inline FvdAIOCB *init_store_acb(int soft_write,
+ QEMUIOVector * orig_qiov, BlockDriverState * bs,
+ int64_t sector_num, int nb_sectors, FvdAIOCB * parent_acb,
+ BlockDriverCompletionFunc * cb, void *opaque);
+static BlockDriverAIOCB *store_data_in_compact_image(int soft_write,
+ struct FvdAIOCB *parent_acb, BlockDriverState * bs,
+ int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb, void *opaque);
+static void store_data_in_compact_image_cb(void *opaque, int ret);
+
static inline BlockDriverAIOCB *store_data(int soft_write,
    FvdAIOCB * parent_acb, BlockDriverState * bs,
    int64_t sector_num, QEMUIOVector * orig_qiov, int nb_sectors,
    BlockDriverCompletionFunc * cb, void *opaque)
{
    /* Dispatch a write either straight to the data file (non-compact
     * image) or through the chunk-mapping path (compact image). */
    BDRVFvdState *s = bs->opaque;

    TRACE_STORE_IN_FVD("store_data", sector_num, nb_sectors);

    if (s->table) {
        /* Compact image: sectors must be remapped through the table. */
        return store_data_in_compact_image(soft_write, parent_acb, bs,
                                           sector_num, orig_qiov,
                                           nb_sectors, cb, opaque);
    }

    /* Not a compact image: the sector address needs no translation. */
    return bdrv_aio_writev(s->fvd_data, s->data_offset + sector_num,
                           orig_qiov, nb_sectors, cb, opaque);
}
+
+/* Store data in the compact image. The argument 'soft_write' means
+ * the store was caused by copy-on-read or prefetching, which need not
+ * update metadata immediately.
+ *
+ * Returns the ACB of the submitted write(s), or NULL if chunk
+ * allocation, ACB allocation, or request submission fails. */
+static BlockDriverAIOCB *store_data_in_compact_image(int soft_write,
+ FvdAIOCB * parent_acb,
+ BlockDriverState * bs,
+ int64_t sector_num,
+ QEMUIOVector * orig_qiov,
+ const int nb_sectors,
+ BlockDriverCompletionFunc
+ * cb, void *opaque)
+{
+ BDRVFvdState *s = bs->opaque;
+ FvdAIOCB *acb;
+ const uint32_t first_chunk = sector_num / s->chunk_size;
+ const uint32_t last_chunk = (sector_num + nb_sectors - 1) / s->chunk_size;
+ int table_dirty = false;
+ uint32_t chunk;
+ int64_t start_sec;
+
+ /* Check if storage space is allocated. */
+ for (chunk = first_chunk; chunk <= last_chunk; chunk++) {
+ if (IS_EMPTY(s->table[chunk])) {
+ uint32_t id = allocate_chunk(bs);
+ if (IS_EMPTY(id)) {
+ return NULL;
+ }
+ QDEBUG ("STORE: map chunk %u to %u\n", chunk, id);
+ id |= DIRTY_TABLE;
+ WRITE_TABLE(s->table[chunk], id);
+ table_dirty = true;
+ } else if (IS_DIRTY(s->table[chunk])) {
+ /* This is possible in several cases. 1) If a previous soft-write
+ * allocated the storage space but did not flush the table entry
+ * change to the journal and hence did not clean the dirty bit. 2)
+ * This is possible if a previous hard-write was canceled before
+ * it could write the table entry to disk. 3) Finally, this is
+ * also possible with two concurrent hard-writes. The first
+ * hard-write allocated the storage space but has not flushed the
+ * table entry change to the journal yet and hence the table entry
+ * remains dirty. In this case, the second hard-write will also
+ * try to flush this dirty table entry to the journal. The outcome
+ * is correct since they store the same metadata change in the
+ * journal (although twice). For this race condition, we prefer to
+ * have two writes to the journal rather than introducing a
+ * locking mechanism, because this happens rarely and those two
+ * writes to the journal are likely to be merged by the kernel
+ * into a single write since they are likely to update
+ * back-to-back sectors in the journal. A locking mechanism would
+ * be less efficient, because the large size of chunks would cause
+ * unnecessary locking due to ``false sharing'' of a chunk by two
+ * writes. */
+ table_dirty = true;
+ }
+ }
+
+ if (!(acb = init_store_acb(soft_write, orig_qiov, bs, sector_num,
+ nb_sectors, parent_acb, cb, opaque))) {
+ return NULL;
+ }
+
+ const bool update_table = (!soft_write && table_dirty);
+ size_t iov_left;
+ uint8_t *iov_buf;
+ int nb, iov_index, nqiov, niov;
+ uint32_t prev;
+
+ if (first_chunk == last_chunk) {
+ goto handle_one_continuous_region;
+ }
+
+ /* First pass: count the number of qiov and iov needed to cover the
+ * continuous regions of the compact image. No request is submitted yet. */
+ iov_left = orig_qiov->iov[0].iov_len;
+ iov_buf = orig_qiov->iov[0].iov_base;
+ iov_index = 0;
+ nqiov = 0;
+ niov = 0;
+ prev = READ_TABLE(s->table[first_chunk]);
+
+ /* Data in the first chunk. */
+ /* NOTE(review): 'nb' is an int accumulating sector counts from int64_t
+ * data_size; assumes a continuous region never exceeds INT_MAX sectors
+ * -- confirm. */
+ nb = s->chunk_size - (sector_num % s->chunk_size);
+
+ for (chunk = first_chunk + 1; chunk <= last_chunk; chunk++) {
+ uint32_t current = READ_TABLE(s->table[chunk]);
+ int64_t data_size;
+ if (chunk < last_chunk) {
+ data_size = s->chunk_size;
+ } else {
+ data_size = (sector_num + nb_sectors) % s->chunk_size;
+ if (data_size == 0) {
+ data_size = s->chunk_size;
+ }
+ }
+
+ if (current == prev + 1) {
+ nb += data_size; /* Continue the previous region. */
+ } else {
+ /* Terminate the previous region. */
+ niov += count_iov(orig_qiov->iov, &iov_index, &iov_buf,
+ &iov_left, nb * 512);
+ nqiov++;
+ nb = data_size; /* Data in the new region. */
+ }
+ prev = current;
+ }
+
+ if (nqiov == 0) {
+handle_one_continuous_region:
+ /* A simple case. All data can be written out in one qiov and no new
+ * chunks are allocated. */
+ start_sec = READ_TABLE(s->table[first_chunk]) * s->chunk_size +
+ (sector_num % s->chunk_size);
+
+ acb->store.update_table = update_table;
+ acb->store.num_children = 1;
+ acb->store.one_child.hd_acb =
+ bdrv_aio_writev(s->fvd_data, s->data_offset + start_sec, orig_qiov,
+ nb_sectors, store_data_in_compact_image_cb,
+ &acb->store.one_child);
+ if (acb->store.one_child.hd_acb) {
+ acb->store.one_child.acb = acb;
+ return &acb->common;
+ } else {
+ my_qemu_aio_release(acb);
+ return NULL;
+ }
+ }
+
+ /* qiov for the last continuous region. */
+ niov += count_iov(orig_qiov->iov, &iov_index, &iov_buf,
+ &iov_left, nb * 512);
+ nqiov++;
+ ASSERT(iov_index == orig_qiov->niov - 1 && iov_left == 0);
+
+ /* Need to submit multiple requests to the lower layer. */
+ acb->store.update_table = update_table;
+ acb->store.num_children = nqiov;
+
+ if (!parent_acb) {
+ QDEBUG("STORE: acb%llu-%p start sector_num=%" PRId64
+ " nb_sectors=%d\n", acb->uuid, acb, acb->sector_num,
+ acb->nb_sectors);
+ }
+
+ /* Allocate memory and create multiple requests. The children array,
+ * the QEMUIOVectors, and the iovecs all live in one allocation. */
+ const size_t metadata_size = nqiov * (sizeof(CompactChildCB) +
+ sizeof(QEMUIOVector)) +
+ niov * sizeof(struct iovec);
+ acb->store.children = (CompactChildCB *) my_qemu_malloc(metadata_size);
+ QEMUIOVector *q = (QEMUIOVector *) (acb->store.children + nqiov);
+ struct iovec *v = (struct iovec *)(q + nqiov);
+
+ /* Second pass: mirrors the counting loop above, but this time actually
+ * builds the iovec lists and submits one child write per region. */
+ start_sec = READ_TABLE(s->table[first_chunk]) * s->chunk_size +
+ (sector_num % s->chunk_size);
+ nqiov = 0;
+ iov_index = 0;
+ iov_left = orig_qiov->iov[0].iov_len;
+ iov_buf = orig_qiov->iov[0].iov_base;
+ prev = READ_TABLE(s->table[first_chunk]);
+
+ /* Data in the first chunk. */
+ if (first_chunk == last_chunk) {
+ nb = nb_sectors;
+ } else {
+ nb = s->chunk_size - (sector_num % s->chunk_size);
+ }
+
+ for (chunk = first_chunk + 1; chunk <= last_chunk; chunk++) {
+ uint32_t current = READ_TABLE(s->table[chunk]);
+ int64_t data_size;
+ if (chunk < last_chunk) {
+ data_size = s->chunk_size;
+ } else {
+ data_size = (sector_num + nb_sectors) % s->chunk_size;
+ if (data_size == 0) {
+ data_size = s->chunk_size;
+ }
+ }
+
+ if (current == prev + 1) {
+ nb += data_size; /* Continue the previous region. */
+ } else {
+ /* Terminate the previous continuous region. */
+ niov = setup_iov(orig_qiov->iov, v, &iov_index,
+ &iov_buf, &iov_left, nb * 512);
+ qemu_iovec_init_external(q, v, niov);
+ QDEBUG("STORE: acb%llu-%p create_child %d sector_num=%" PRId64
+ " nb_sectors=%zu niov=%d\n", acb->uuid, acb, nqiov,
+ start_sec, q->size / 512, q->niov);
+ acb->store.children[nqiov].hd_acb =
+ bdrv_aio_writev(s->fvd_data, s->data_offset + start_sec, q,
+ q->size / 512, store_data_in_compact_image_cb,
+ &acb->store.children[nqiov]);
+ if (!acb->store.children[nqiov].hd_acb) {
+ goto fail;
+ }
+ acb->store.children[nqiov].acb = acb;
+ v += niov;
+ q++;
+ nqiov++;
+ start_sec = current * s->chunk_size; /* Begin of the new region. */
+ nb = data_size; /* Data in the new region. */
+ }
+ prev = current;
+ }
+
+ /* Request for the last chunk. */
+ niov = setup_iov(orig_qiov->iov, v, &iov_index, &iov_buf,
+ &iov_left, nb * 512);
+ ASSERT(iov_index == orig_qiov->niov - 1 && iov_left == 0);
+ qemu_iovec_init_external(q, v, niov);
+
+ QDEBUG("STORE: acb%llu-%p create_child_last %d sector_num=%" PRId64
+ " nb_sectors=%zu niov=%d\n", acb->uuid, acb, nqiov, start_sec,
+ q->size / 512, q->niov);
+ acb->store.children[nqiov].hd_acb =
+ bdrv_aio_writev(s->fvd_data, s->data_offset + start_sec, q,
+ q->size / 512, store_data_in_compact_image_cb,
+ &acb->store.children[nqiov]);
+ if (acb->store.children[nqiov].hd_acb) {
+ acb->store.children[nqiov].acb = acb;
+ return &acb->common;
+ }
+
+ int i;
+fail:
+ /* Cancel children already submitted, then undo the allocations. */
+ QDEBUG("STORE: acb%llu-%p failed\n", acb->uuid, acb);
+ for (i = 0; i < nqiov; i++) {
+ bdrv_aio_cancel(acb->store.children[i].hd_acb);
+ }
+ my_qemu_free(acb->store.children);
+ my_qemu_aio_release(acb);
return NULL;
}
+
+/* Allocate one physical chunk of storage and return its chunk number.
+ * A previously leaked chunk is reused first; otherwise the chunk is
+ * carved off the end of the data file, growing the file via the
+ * configured s->add_storage_cmd if there is not enough room.
+ * Returns EMPTY_TABLE on failure. */
+static uint32_t allocate_chunk(BlockDriverState * bs)
+{
+ BDRVFvdState *s = bs->opaque;
+ uint32_t physical_chunk;
+
+ /* Reuse a previously leaked chunk if possible. */
+ if (s->next_avail_leaked_chunk < s->num_leaked_chunks) {
+ physical_chunk = s->leaked_chunks[s->next_avail_leaked_chunk++];
+ QDEBUG("Reuse leaked physical chunk %u\n", physical_chunk);
+ if (s->next_avail_leaked_chunk == s->num_leaked_chunks) {
+ /* All leaked chunks have been used. */
+ my_qemu_free(s->leaked_chunks);
+ s->leaked_chunks = NULL;
+ s->num_leaked_chunks = s->next_avail_leaked_chunk = 0;
+ QDEBUG("All leaked physical chunks reused\n");
+ }
+ if (!s->chunks_relocated) {
+ s->chunks_relocated = true;
+ /* Record in the on-disk header that chunks have been relocated.
+ * On any metadata I/O failure, further writes are prohibited. */
+ FvdHeader header;
+ if (read_fvd_header(s, &header)) {
+ s->metadata_err_prohibit_write = true;
+ } else {
+ header.chunks_relocated = true;
+ if (update_fvd_header(s, &header)
+ || bdrv_flush(s->fvd_metadata)) {
+ s->metadata_err_prohibit_write = true;
+ }
+ }
+ }
+ return physical_chunk;
+ }
+
+ /* Grow storage space if needed. */
+ if (s->add_storage_cmd &&
+ s->used_storage + s->chunk_size > s->avail_storage) {
+ /* NOTE(review): executes a shell command taken from driver
+ * configuration; add_storage_cmd must come from a trusted source. */
+ if (system(s->add_storage_cmd)) {
+ fprintf(stderr, "Error in executing %s\n", s->add_storage_cmd);
+ }
+
+ /* Check how much storage is available now. */
+ int64_t size = bdrv_getlength(s->fvd_data);
+ if (size < 0) {
+ fprintf(stderr, "Error in bdrv_getlength(%s)\n", bs->filename);
+ return EMPTY_TABLE;
+ }
+ s->avail_storage = size / 512 - s->data_offset;
+ if (s->used_storage + s->chunk_size > s->avail_storage) {
+ fprintf(stderr, "Could not allocate more storage space.\n");
+ return EMPTY_TABLE;
+ }
+
+ QDEBUG("Increased storage to %" PRId64 " bytes.\n", size);
+ }
+
+ physical_chunk = s->used_storage / s->chunk_size;
+ s->used_storage += s->chunk_size;
+ return physical_chunk;
+}
+
+/* Completion callback shared by every child write of a compact-image
+ * store. It records the child's result, and when the last child
+ * finishes it frees the per-child metadata, advances the frontier of
+ * written sectors (s->avail_storage), and either completes the request
+ * directly or triggers a journal update for still-dirty table
+ * entries. */
+static void store_data_in_compact_image_cb(void *opaque, int ret)
+{
+ CompactChildCB *child = opaque;
+ FvdAIOCB *acb = child->acb;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ /* Now fvd_aio_cancel_store_compact() won't cancel this child request. */
+ child->hd_acb = NULL;
+
+ /* Keep the first error seen across all children. */
+ if (acb->store.ret == 0) {
+ acb->store.ret = ret;
+ } else {
+ QDEBUG("STORE: acb%llu-%p store_child=%d total_children=%d error "
+ "ret=%d\n", acb->uuid, acb, acb->store.finished_children,
+ acb->store.num_children, ret);
+ }
+
+ acb->store.finished_children++;
+ if (acb->store.finished_children < acb->store.num_children) {
+ QDEBUG("STORE: acb%llu-%p store_finished_children=%d "
+ "total_children=%d\n", acb->uuid, acb,
+ acb->store.finished_children, acb->store.num_children);
+ return;
+ }
+
+ /* All child requests finished. Free buffers. */
+ if (acb->store.children) {
+ my_qemu_free(acb->store.children);
+ acb->store.children = NULL;
+ }
+
+ if (acb->store.ret) { /* error */
+ QDEBUG("STORE: acb%llu-%p store_last_child_finished_with_error "
+ "ret=%d\n", acb->uuid, acb, acb->store.ret);
+ acb->common.cb(acb->common.opaque, acb->store.ret);
+ my_qemu_aio_release(acb);
+ return;
+ }
+
+ /* Update the frontier of sectors already written (i.e.,avail_storage).
+ * This affects load_data_from_compact_image(). A load from unwritten
+ * sectors in allocated chunks should return an array of zeros. Also
+ * check whether the table entries are still dirty. Note that while saving
+ * this write to disk, other writes might have already flushed the dirty
+ * table entries to the journal. If those table entries are no longer
+ * dirty, depending on the behavior of parent_acb, it might be able to
+ * skip a journal update. */
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+ const uint32_t first_chunk = acb->sector_num / s->chunk_size;
+ const uint32_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
+ / s->chunk_size;
+ bool update_table = false;
+ uint32_t chunk;
+ for (chunk = first_chunk; chunk <= last_chunk; chunk ++) {
+ int64_t end;
+ if (chunk == last_chunk) {
+ int64_t data = (acb->sector_num + acb->nb_sectors) % s->chunk_size;
+ if (data == 0) {
+ data = s->chunk_size;
+ }
+ end = READ_TABLE(s->table[chunk]) * s->chunk_size + data;
+ } else {
+ end = (READ_TABLE(s->table[chunk]) + 1) * s->chunk_size;
+ }
+ if (end > s->avail_storage) {
+ s->avail_storage = end;
+ }
+
+ if (IS_DIRTY(s->table[chunk])) {
+ update_table = true;
+ }
+ }
+
+ if (!acb->store.update_table) {
+ QDEBUG("STORE: acb%llu-%p "
+ "store_last_child_finished_without_table_update\n",
+ acb->uuid, acb);
+ acb->common.cb(acb->common.opaque, acb->store.ret);
+ my_qemu_aio_release(acb);
+ return;
+ }
+
+ if (acb->store.parent_acb) {
+ /* Metadata update will be handled by the parent write. */
+ ASSERT(acb->store.parent_acb->type == OP_WRITE);
+ QDEBUG("STORE: acb%llu-%p "
+ "store_last_child_finished_with_parent_do_table_update\n",
+ acb->uuid, acb);
+ acb->store.parent_acb->write.update_table = update_table;
+ acb->common.cb(acb->common.opaque, acb->store.ret);
+ my_qemu_aio_release(acb);
+ } else if (update_table) {
+ QDEBUG("STORE: acb%llu-%p "
+ "store_last_child_finished_and_start_table_update\n",
+ acb->uuid, acb);
+ write_metadata_to_journal(acb, false);
+ } else {
+ QDEBUG("STORE: acb%llu-%p "
+ "store_last_child_finished_without_table_update\n",
+ acb->uuid, acb);
+ acb->common.cb(acb->common.opaque, acb->store.ret);
+ my_qemu_aio_release(acb);
+ }
+}
+
+static inline FvdAIOCB *init_store_acb(int soft_write,
+ QEMUIOVector * orig_qiov,
+ BlockDriverState * bs,
+ int64_t sector_num, int nb_sectors,
+ FvdAIOCB * parent_acb,
+ BlockDriverCompletionFunc * cb,
+ void *opaque)
+{
+ FvdAIOCB *acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
+ if (!acb) {
+ return NULL;
+ }
+ acb->type = OP_STORE_COMPACT;
+ acb->cancel_in_progress = false;
+ acb->sector_num = sector_num;
+ acb->nb_sectors = nb_sectors;
+ acb->store.soft_write = soft_write;
+ acb->store.orig_qiov = orig_qiov;
+ acb->store.parent_acb = parent_acb;
+ acb->store.finished_children = 0;
+ acb->store.num_children = 0;
+ acb->store.one_child.hd_acb = NULL;
+ acb->store.children = NULL;
+ acb->store.ret = 0;
+ acb->jcb.iov.iov_base = NULL;
+ acb->jcb.hd_acb = NULL;
+ acb->jcb.ujnl_next_wait4_recycle.le_prev = NULL;
+ COPY_UUID(acb, parent_acb);
+ return acb;
+}
diff --git a/block/fvd-utils.c b/block/fvd-utils.c
index ff2bb8f..9feaa35 100644
--- a/block/fvd-utils.c
+++ b/block/fvd-utils.c
@@ -42,3 +42,68 @@ static inline void copy_iov(struct iovec *iov, int *p_index,
left = iov[index].iov_len;
}
}
+
/*
 * Count how many iovec elements are needed to cover the next 'total'
 * bytes of orig_iov, starting at the cursor (*p_index, *p_buf, *p_left).
 * On return the cursor is advanced past those bytes. The caller must
 * guarantee orig_iov holds at least 'total' more bytes.
 *
 * Returns the number of (possibly partial) iovec elements consumed.
 */
static int count_iov(struct iovec *orig_iov, int *p_index, uint8_t ** p_buf,
                     size_t * p_left, size_t total)
{
    int index = *p_index;
    uint8_t *buf = *p_buf;
    /* Was declared 'int', which truncated iov_len values larger than
     * INT_MAX and forced a signed/unsigned comparison against 'total'. */
    size_t left = *p_left;
    int count = 0;

    if (left == 0) {
        /* Current element exhausted; move to the next one. */
        index++;
        buf = orig_iov[index].iov_base;
        left = orig_iov[index].iov_len;
    }

    while (1) {
        if (left >= total) {
            /* The current element finishes the span: park the cursor
             * 'total' bytes into it. */
            *p_buf = buf + total;
            *p_left = left - total;
            *p_index = index;
            return count + 1;
        }

        /* Consume the rest of this element and continue. */
        total -= left;
        index++;
        buf = orig_iov[index].iov_base;
        left = orig_iov[index].iov_len;
        count++;
    }
}
+
/*
 * Fill new_iov with entries covering the next 'total' bytes of orig_iov,
 * starting at the cursor (*p_index, *p_buf, *p_left), and advance the
 * cursor past those bytes. new_iov must have room for as many elements
 * as count_iov() reported for the same span. The caller must guarantee
 * orig_iov holds at least 'total' more bytes.
 *
 * Returns the number of new_iov entries written.
 */
static int setup_iov(struct iovec *orig_iov, struct iovec *new_iov,
                     int *p_index, uint8_t ** p_buf, size_t * p_left,
                     size_t total)
{
    int index = *p_index;
    uint8_t *buf = *p_buf;
    /* Was declared 'int', which truncated iov_len values larger than
     * INT_MAX and forced a signed/unsigned comparison against 'total'. */
    size_t left = *p_left;
    int count = 0;

    if (left == 0) {
        /* Current element exhausted; move to the next one. */
        index++;
        buf = orig_iov[index].iov_base;
        left = orig_iov[index].iov_len;
    }

    while (1) {
        if (left >= total) {
            /* Final (possibly partial) element of the span. */
            new_iov[count].iov_base = buf;
            new_iov[count].iov_len = total;
            *p_buf = buf + total;
            *p_left = left - total;
            *p_index = index;
            return count + 1;
        }

        /* Take the rest of this element and continue with the next. */
        new_iov[count].iov_base = buf;
        new_iov[count].iov_len = left;
        total -= left;
        index++;
        buf = orig_iov[index].iov_base;
        left = orig_iov[index].iov_len;
        count++;
    }
}
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 14/26] FVD: add impl of loading data from compact image
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (11 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 13/26] FVD: add impl of storing data in compact image Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 15/26] FVD: add basic journal functionality Chunqiang Tang
` (11 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds the implementation of loading data from a compact image. This
capability is to support fvd_aio_readv() when FVD is configured to use its
one-level lookup table to do storage allocation.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-load.c | 448 +++++++++++++++++++++++++++++++++++++++++++++++++++++
block/fvd-utils.c | 40 +++++
2 files changed, 488 insertions(+), 0 deletions(-)
diff --git a/block/fvd-load.c b/block/fvd-load.c
index 80ab32c..88e5fb4 100644
--- a/block/fvd-load.c
+++ b/block/fvd-load.c
@@ -11,10 +11,458 @@
*
*/
+static void load_data_from_compact_image_cb(void *opaque, int ret);
+static BlockDriverAIOCB *load_data_from_compact_image(FvdAIOCB *parent_acb,
+ BlockDriverState * bs, int64_t sector_num,
+ QEMUIOVector * qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb, void *opaque);
+static inline FvdAIOCB *init_load_acb(FvdAIOCB * parent_acb,
+ BlockDriverState * bs, int64_t sector_num,
+ QEMUIOVector * orig_qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb, void *opaque);
+static int load_create_child_requests(bool count_only, BDRVFvdState *s,
+ QEMUIOVector * orig_qiov, int64_t sector_num,
+ int nb_sectors, int *p_nziov, int *p_niov, int *p_nqiov,
+ FvdAIOCB *acb, QEMUIOVector *q, struct iovec *v);
+
static inline BlockDriverAIOCB *load_data(FvdAIOCB * parent_acb,
    BlockDriverState * bs, int64_t sector_num,
    QEMUIOVector * orig_qiov, int nb_sectors,
    BlockDriverCompletionFunc * cb, void *opaque)
{
    /* Dispatch a read either straight from the data file (non-compact
     * image) or through the chunk-mapping path (compact image). */
    BDRVFvdState *s = bs->opaque;

    if (s->table) {
        /* Compact image: sector addresses go through the lookup table. */
        return load_data_from_compact_image(parent_acb, bs, sector_num,
                                            orig_qiov, nb_sectors, cb,
                                            opaque);
    }

    /* Not a compact image: read from the same sector offset directly. */
    return bdrv_aio_readv(s->fvd_data, s->data_offset + sector_num,
                          orig_qiov, nb_sectors, cb, opaque);
}
+
+/* Read data from a compact image, translating sector addresses through
+ * the chunk table. Fast paths: a request that is entirely unallocated
+ * is satisfied by zero-filling the buffers and completing via a bottom
+ * half; a single continuous allocated region is read with one
+ * bdrv_aio_readv reusing orig_qiov. Otherwise multiple child reads are
+ * submitted, one per continuous region. Returns NULL on failure. */
+static BlockDriverAIOCB *load_data_from_compact_image(FvdAIOCB * parent_acb,
+ BlockDriverState * bs, int64_t sector_num,
+ QEMUIOVector * orig_qiov, int nb_sectors,
+ BlockDriverCompletionFunc * cb, void *opaque)
+{
+ BDRVFvdState *s = bs->opaque;
+ FvdAIOCB * acb;
+ int64_t start_sec = -1;
+ int nziov = 0;
+ int nqiov = 0;
+ int niov = 0;
+ int i;
+
+ /* Count the number of qiov and iov needed to cover the continuous regions
+ * of the compact image. */
+ load_create_child_requests(true/*count_only*/, s, orig_qiov, sector_num,
+ nb_sectors, &nziov, &niov, &nqiov, NULL, NULL, NULL);
+
+ if (nqiov + nziov == 1) {
+ /* All data can be read in one qiov. Reuse orig_qiov. */
+ if (nziov == 1) {
+ /* This is a zero-filled region. */
+ for (i = 0; i < orig_qiov->niov; i++) {
+ memset(orig_qiov->iov[i].iov_base,
+ 0, orig_qiov->iov[i].iov_len);
+ }
+
+ /* Use a bh to invoke the callback. */
+ if (!(acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque))) {
+ return NULL;
+ }
+ COPY_UUID(acb, parent_acb);
+ QDEBUG("LOAD: acb%llu-%p load_fill_all_with_zeros\n",
+ acb->uuid, acb);
+ acb->type = OP_WRAPPER;
+ acb->cancel_in_progress = false;
+ acb->wrapper.bh = qemu_bh_new(aio_wrapper_bh, acb);
+ qemu_bh_schedule(acb->wrapper.bh);
+ return &acb->common;
+ } else {
+ /* A non-empty region. */
+ const uint32_t first_chunk = sector_num / s->chunk_size;
+ start_sec = READ_TABLE(s->table[first_chunk]) * s->chunk_size +
+ (sector_num % s->chunk_size);
+ if (parent_acb) {
+ QDEBUG("LOAD: acb%llu-%p "
+ "load_directly_as_one_continuous_region\n",
+ parent_acb->uuid, parent_acb);
+ }
+ return bdrv_aio_readv(s->fvd_data, s->data_offset + start_sec,
+ orig_qiov, nb_sectors, cb, opaque);
+ }
+ }
+
+ /* Need to submit multiple requests to the lower layer. Initialize acb. */
+ if (!(acb = init_load_acb(parent_acb, bs, sector_num, orig_qiov,
+ nb_sectors, cb, opaque))) {
+ return NULL;
+ }
+ acb->load.num_children = nqiov;
+
+ /* Allocate memory and create multiple requests. The children array,
+ * the QEMUIOVectors, and the iovecs share one allocation. */
+ acb->load.children = my_qemu_malloc((sizeof(CompactChildCB) +
+ sizeof(QEMUIOVector)) * nqiov +
+ sizeof(struct iovec) * niov);
+ QEMUIOVector *q = (QEMUIOVector *) (acb->load.children + nqiov);
+ struct iovec *v = (struct iovec *)(q + nqiov);
+
+ if (!load_create_child_requests(false/*count_only*/, s, orig_qiov,
+ sector_num, nb_sectors, NULL, NULL,
+ &nqiov, acb, q, v)) {
+ return &acb->common;
+ }
+
+ /* Clean up after failure. nqiov is the no. of submitted child requests. */
+ for (i = 0; i < nqiov; i++) {
+ bdrv_aio_cancel(acb->load.children[i].hd_acb);
+ }
+ my_qemu_free(acb->load.children);
+ my_qemu_aio_release(acb);
return NULL;
}
+
+/* Completion callback shared by every child read of a compact-image
+ * load. Records the child's result; when the last child finishes,
+ * invokes the caller's callback and releases the ACB. */
+static void load_data_from_compact_image_cb(void *opaque, int ret)
+{
+ CompactChildCB *child = opaque;
+ FvdAIOCB *acb = child->acb;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ /* Clearing hd_acb means the cancel path will no longer try to cancel
+ * this child request. (NOTE(review): original comment referenced
+ * fvd_aio_cancel_store_compact(); presumably the load-compact cancel
+ * helper is meant here -- confirm.) */
+ child->hd_acb = NULL;
+
+ /* Keep the first error seen across all children. */
+ if (acb->load.ret == 0) {
+ acb->load.ret = ret;
+ } else {
+ QDEBUG("LOAD: acb%llu-%p load_child=%d total_children=%d "
+ "error ret=%d\n", acb->uuid, acb, acb->load.finished_children,
+ acb->load.num_children, ret);
+ }
+
+ acb->load.finished_children++;
+ if (acb->load.finished_children < acb->load.num_children) {
+ QDEBUG("LOAD: acb%llu-%p load_finished_children=%d "
+ "total_children=%d\n", acb->uuid, acb,
+ acb->load.finished_children, acb->load.num_children);
+ return;
+ }
+
+ QDEBUG("LOAD: acb%llu-%p load_last_child_finished ret=%d\n", acb->uuid,
+ acb, acb->load.ret);
+ acb->common.cb(acb->common.opaque, acb->load.ret);
+ if (acb->load.children) {
+ my_qemu_free(acb->load.children);
+ }
+ my_qemu_aio_release(acb);
+}
+
+static inline FvdAIOCB *init_load_acb(FvdAIOCB * parent_acb,
+ BlockDriverState * bs,
+ int64_t sector_num,
+ QEMUIOVector * orig_qiov,
+ int nb_sectors,
+ BlockDriverCompletionFunc * cb,
+ void *opaque)
+{
+ FvdAIOCB *const acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
+ if (!acb) {
+ return NULL;
+ }
+ acb->type = OP_LOAD_COMPACT;
+ acb->cancel_in_progress = false;
+ acb->sector_num = sector_num;
+ acb->nb_sectors = nb_sectors;
+ acb->load.parent_acb = parent_acb;
+ acb->load.finished_children = 0;
+ acb->load.children = NULL;
+ acb->load.one_child.hd_acb = NULL;
+ acb->load.orig_qiov = orig_qiov;
+ acb->load.ret = 0;
+ COPY_UUID(acb, parent_acb);
+ return acb;
+}
+
+/* Handle one continuous region of a compact-image read. In counting
+ * mode (count_only) it only advances the iov cursor and tallies how
+ * many qiov/iov structures the region needs. Otherwise it either
+ * zero-fills the region's buffers (empty region) or builds an iovec
+ * list and submits a child read request. Returns -1 if a child read
+ * cannot be submitted, 0 otherwise. */
+static inline int load_create_one_child(bool count_only, bool empty,
+ QEMUIOVector * orig_qiov, int *iov_index, size_t *iov_left,
+ uint8_t **iov_buf, int64_t start_sec, int sectors_in_region,
+ int *p_niov, int *p_nziov, int *p_nqiov, BDRVFvdState *s,
+ FvdAIOCB *acb, QEMUIOVector **q, struct iovec **v)
+{
+ int niov;
+
+ if (count_only) {
+ if (empty) {
+ count_iov(orig_qiov->iov, iov_index, iov_buf,
+ iov_left, sectors_in_region * 512);
+ (*p_nziov)++;
+ } else {
+ niov = count_iov(orig_qiov->iov, iov_index, iov_buf,
+ iov_left, sectors_in_region * 512);
+ *p_niov += niov;
+ (*p_nqiov)++;
+ }
+ return 0;
+ }
+
+ /* Not count_only, need to take real actions. */
+ if (empty) {
+ /* Fill iov data with zeros. */
+ zero_iov(orig_qiov->iov, iov_index, iov_buf, iov_left,
+ sectors_in_region * 512);
+ return 0;
+ }
+
+ /* Create a child request to read data. */
+ niov = setup_iov(orig_qiov->iov, *v, iov_index, iov_buf,
+ iov_left, sectors_in_region * 512);
+ qemu_iovec_init_external(*q, *v, niov);
+ QDEBUG("LOAD: acb%llu-%p create_child %d sector_num=%" PRId64
+ " nb_sectors=%d niov=%d\n", acb->uuid, acb, *p_nqiov,
+ start_sec, sectors_in_region, niov);
+ acb->load.children[*p_nqiov].hd_acb =
+ bdrv_aio_readv(s->fvd_data, s->data_offset + start_sec, *q,
+ sectors_in_region, load_data_from_compact_image_cb,
+ &acb->load.children[*p_nqiov]);
+ if (!acb->load.children[*p_nqiov].hd_acb) {
+ return -1;
+ }
+ acb->load.children[*p_nqiov].acb = acb;
+ /* Advance the caller's cursors past the structures just consumed. */
+ *v = *v + niov;
+ (*q)++;
+ (*p_nqiov)++;
+
+ return 0;
+}
+
+/* Walk the chunks covered by [sector_num, sector_num+nb_sectors) and
+ * partition the request into continuous regions, merging physically
+ * adjacent chunks and splitting any chunk that straddles the frontier
+ * of written data (s->avail_storage). Invoked twice by
+ * load_data_from_compact_image(): first with count_only=true to size
+ * the allocation, then with count_only=false to zero-fill empty
+ * regions and submit one child read per non-empty region. On failure
+ * returns -1 with *p_nqiov set to the number of children already
+ * submitted; returns 0 on success. */
+static int load_create_child_requests(bool count_only, BDRVFvdState *s,
+ QEMUIOVector * orig_qiov, int64_t sector_num, int nb_sectors, int *p_nziov,
+ int *p_niov, int *p_nqiov, FvdAIOCB *acb, QEMUIOVector *q, struct iovec *v)
+{
+ const uint32_t first_chunk = sector_num / s->chunk_size;
+ const uint32_t last_chunk = (sector_num + nb_sectors - 1) / s->chunk_size;
+ int iov_index = 0;
+ size_t iov_left = orig_qiov->iov[0].iov_len;
+ uint8_t *iov_buf = orig_qiov->iov[0].iov_base;
+ int nziov = 0; /* Number of empty regions. */
+ int nqiov = 0;
+ int niov = 0;
+ int64_t prev = READ_TABLE2(s->table[first_chunk]);
+ int64_t start_sec = -1;
+ int sectors_in_region;
+ int32_t chunk;
+ int64_t chunk_end;
+ int64_t last_chunk_data;
+
+ /* Calculate data in the last chunk. */
+ last_chunk_data = (sector_num + nb_sectors) % s->chunk_size;
+ if (last_chunk_data == 0) {
+ last_chunk_data = s->chunk_size;
+ }
+
+ /* Calculate data in the first chunk. */
+ if (first_chunk < last_chunk) {
+ sectors_in_region = s->chunk_size - (sector_num % s->chunk_size);
+ } else {
+ sectors_in_region = nb_sectors;
+ }
+
+ /* Check if the first chunk spans over s->avail_storage. If so, the part
+ * beyond avail_storage must be filled with zeros rather than reading from
+ * the underlying storage as it may not be written yet, which is possible.
+ * This is explained using the following example. Suppose a chunk consists
+ * of 4 sectors (i.e., chunk_size=4) and the last allocated chunk,
+ * c=[s0 s1 s2 s3], was allocated when the VM wrote to sector s1.
+ * Although the table indicates the full chunk is allocated, the
+ * underlying host file system only grows the image file to the size just
+ * enough to accommodating sector s1, as s1 is the frontier of the sectors
+ * written. This frontier (s1 in this example) is recorded in
+ * s->avail_storage. If the VM reads sector s2, which is beyond the
+ * frontier, the driver should return an array of zeros rather than trying
+ * to read from the underlying host file system. Otherwise, it will cause
+ * a read error as sector s2 is beyond the current size of the image file.
+ */
+ if (!IS_EMPTY(prev)) {
+ start_sec = prev * s->chunk_size + (sector_num % s->chunk_size);
+
+ if (start_sec >= s->avail_storage) {
+ prev = EMPTY_TABLE; /* Pretend the first chunk is empty. */
+ } else {
+ if (first_chunk < last_chunk) {
+ chunk_end = (prev + 1) * s->chunk_size;
+ } else {
+ chunk_end = prev * s->chunk_size + last_chunk_data;
+ }
+
+ if (s->avail_storage < chunk_end) {
+ /* First chunk spans over s->avail_storage. Split it into
+ * two regions. The first region is read from disk while the
+ * second region is filled with zeros. */
+
+ /* Handle the first region. */
+ sectors_in_region = (s->avail_storage % s->chunk_size) -
+ (sector_num % s->chunk_size);
+
+ if (load_create_one_child(count_only, false/*!empty*/,
+ orig_qiov, &iov_index, &iov_left,
+ &iov_buf, start_sec, sectors_in_region,
+ &niov, &nziov, &nqiov, s,
+ acb, &q, &v)) {
+ goto fail;
+ }
+
+ /* Start the second, empty region. */
+ prev = EMPTY_TABLE;
+ if (first_chunk < last_chunk) {
+ sectors_in_region = s->chunk_size -
+ (s->avail_storage % s->chunk_size);
+ } else {
+ sectors_in_region = nb_sectors - sectors_in_region;
+ }
+ }
+ }
+ }
+
+ /* Walk the remaining chunks, extending or terminating the current
+ * region as physical adjacency and the frontier dictate. */
+ for (chunk = first_chunk + 1; chunk <= last_chunk; chunk++) {
+ uint32_t current = READ_TABLE2(s->table[chunk]);
+ int64_t data_size;
+
+ /* Check if the chunk spans over s->avail_storage. */
+ if (!IS_EMPTY(current)) {
+ if (current * s->chunk_size >= s->avail_storage) {
+ current = EMPTY_TABLE; /* Pretend this chunk is empty. */
+ } else {
+ if (chunk < last_chunk) {
+ chunk_end = (current + 1) * s->chunk_size;
+ } else {
+ chunk_end = current * s->chunk_size + last_chunk_data;
+ }
+
+ if (s->avail_storage < chunk_end) {
+ /* This chunk spans over s->avail_storage. Split
+ * it into two regions. The first region is read from disk
+ * while the second region is filled with zeros. */
+ if (IS_EMPTY(prev)) {
+ /* Terminate the previous empty region. */
+ load_create_one_child(count_only, true/*empty*/,
+ orig_qiov, &iov_index, &iov_left,
+ &iov_buf, start_sec,
+ sectors_in_region, &niov, &nziov,
+ &nqiov, s, acb, &q, &v);
+
+ /* Start the first region of this split chunk. */
+ start_sec = current * s->chunk_size;
+ sectors_in_region = s->avail_storage % s->chunk_size;
+
+ } else {
+ if (current == prev + 1) {
+ /* Append the first region to the previous one. */
+ sectors_in_region +=
+ s->avail_storage % s->chunk_size;
+ } else {
+ /* Terminate the previous region. */
+ if (load_create_one_child(count_only,
+ false/*!empty*/, orig_qiov, &iov_index,
+ &iov_left, &iov_buf, start_sec,
+ sectors_in_region, &niov, &nziov, &nqiov,
+ s, acb, &q, &v)) {
+ goto fail;
+ }
+
+ /* Start the first region of this split chunk. */
+ start_sec = current * s->chunk_size;
+ sectors_in_region =
+ s->avail_storage % s->chunk_size;
+ }
+ }
+
+ /* Terminate the first region of this split chunk. */
+ if (load_create_one_child(count_only, false/*!empty*/,
+ orig_qiov, &iov_index, &iov_left, &iov_buf,
+ start_sec, sectors_in_region, &niov, &nziov,
+ &nqiov, s, acb, &q, &v)) {
+ goto fail;
+ }
+
+ /* Start the second, empty region of this split chunk. */
+ prev = EMPTY_TABLE;
+ sectors_in_region = chunk_end - s->avail_storage;
+ continue; /* This chunk is done. Go to handle next chunk. */
+ }
+ }
+ }
+
+ /* Simple case: not spanning over s->avail_storage. */
+ if (chunk < last_chunk) {
+ data_size = s->chunk_size;
+ } else {
+ data_size = last_chunk_data;
+ }
+
+ if ((IS_EMPTY(prev) && IS_EMPTY(current)) ||
+ (!IS_EMPTY(prev) && !IS_EMPTY(current) && current == prev + 1)) {
+ /* Continue the previous region. */
+ sectors_in_region += data_size;
+ } else {
+ /* Terminate the previous region. */
+ if (load_create_one_child(count_only, IS_EMPTY(prev), orig_qiov,
+ &iov_index, &iov_left, &iov_buf, start_sec,
+ sectors_in_region, &niov, &nziov, &nqiov, s, acb, &q, &v)) {
+ goto fail;
+ }
+
+ /* Start the next region. */
+ start_sec = current * s->chunk_size;
+ sectors_in_region = data_size;
+ }
+ prev = current;
+ }
+
+ /* Handle the last continuous region. */
+ if (count_only) {
+ if (IS_EMPTY(prev)) {
+ nziov++;
+ } else {
+ niov += count_iov(orig_qiov->iov, &iov_index, &iov_buf,
+ &iov_left, sectors_in_region * 512);
+ nqiov++;
+ }
+
+ *p_nqiov = nqiov;
+ *p_nziov = nziov;
+ *p_niov = niov;
+ return 0;
+ }
+
+ /* Handle the last continuous region. */
+ if (IS_EMPTY(prev)) {
+ zero_iov(orig_qiov->iov, &iov_index, &iov_buf, &iov_left,
+ sectors_in_region * 512);
+ } else {
+ niov = setup_iov(orig_qiov->iov, v, &iov_index, &iov_buf,
+ &iov_left, sectors_in_region * 512);
+ qemu_iovec_init_external(q, v, niov);
+ QDEBUG("LOAD: acb%llu-%p create_child %d sector_num=%" PRId64
+ " nb_sectors=%d niov=%d\n", acb->uuid, acb, nqiov, start_sec,
+ sectors_in_region, niov);
+ acb->load.children[nqiov].hd_acb =
+ bdrv_aio_readv(s->fvd_data, s->data_offset + start_sec, q,
+ sectors_in_region, load_data_from_compact_image_cb,
+ &acb->load.children[nqiov]);
+ if (!acb->load.children[nqiov].hd_acb) {
+ goto fail;
+ }
+ acb->load.children[nqiov].acb = acb;
+ }
+ ASSERT(iov_index == orig_qiov->niov - 1 && iov_left == 0);
+ return 0;
+
+fail:
+ *p_nqiov = nqiov; /* The number of children already created. */
+ return -1;
+}
diff --git a/block/fvd-utils.c b/block/fvd-utils.c
index 9feaa35..578eed4 100644
--- a/block/fvd-utils.c
+++ b/block/fvd-utils.c
@@ -107,3 +107,43 @@ static int setup_iov(struct iovec *orig_iov, struct iovec *new_iov,
count++;
}
}
+
+/* Zero out 'total' bytes of the scatter list 'orig_iov', starting at the
+ * current position described by (*p_index, *p_buf, *p_left). On return the
+ * three position cursors are advanced past the zeroed region. Returns the
+ * number of iovec elements touched.
+ * NOTE(review): 'left' is declared int while *p_left is size_t — an iov_len
+ * larger than INT_MAX would truncate. Presumably iov sizes here are small;
+ * confirm against callers. */
+static int zero_iov(struct iovec *orig_iov, int *p_index, uint8_t ** p_buf,
+ size_t * p_left, size_t total)
+{
+ int index = *p_index;
+ uint8_t *buf = *p_buf;
+ int left = *p_left;
+ int count = 0;
+
+ /* Current element exhausted: advance to the next iovec element. */
+ if (left <= 0) {
+ index++;
+ buf = orig_iov[index].iov_base;
+ left = orig_iov[index].iov_len;
+ }
+
+ while (1) {
+ if (left >= total) {
+ /* The remainder fits in the current element; zero it, advance
+ * the cursors, and report the elements consumed. */
+ memset(buf, 0, total);
+ *p_buf = buf + total;
+ *p_left = left - total;
+ *p_index = index;
+ return count + 1;
+ }
+
+ /* Zero the rest of this element and move on to the next one. */
+ memset(buf, 0, left);
+ total -= left;
+ index++;
+ buf = orig_iov[index].iov_base;
+ left = orig_iov[index].iov_len;
+ count++;
+ }
+}
+
+/* Bottom-half handler that completes a wrapped AIO request: invoke the
+ * user callback with success (ret=0), then free the bottom half and
+ * release the FvdAIOCB. */
+static void aio_wrapper_bh(void *opaque)
+{
+ FvdAIOCB *acb = opaque;
+ acb->common.cb(acb->common.opaque, 0);
+ qemu_bh_delete(acb->wrapper.bh);
+ my_qemu_aio_release(acb);
+}
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 15/26] FVD: add basic journal functionality
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (12 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 14/26] FVD: add impl of loading data from " Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 16/26] FVD: add impl for buffered journal updates Chunqiang Tang
` (10 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds the basic journal functionality to FVD. The journal provides
several benefits. First, updating both the bitmap and the lookup table
requires only a single write to journal. Second, K concurrent updates to any
potions of the bitmap or the lookup table are converted to K sequential writes
in the journal, which can be merged into a single write by the host Linux
kernel. Third, it increases concurrency by avoiding locking the bitmap or the
lookup table. For example, updating one bit in the bitmap requires writing a
512-byte sector to the on-disk bitmap. This bitmap sector covers a total of
512*8*64K=256MB data, and any two writes to that same bitmap sector cannot
proceed concurrently. The journal solves this problem and eliminates
unnecessary locking.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block.c | 2 +-
block/fvd-bitmap.c | 57 ++++
block/fvd-journal-buf.c | 34 ++
block/fvd-journal.c | 814 ++++++++++++++++++++++++++++++++++++++++++++++-
block/fvd-write.c | 1 +
block/fvd.c | 19 ++
6 files changed, 920 insertions(+), 7 deletions(-)
create mode 100644 block/fvd-journal-buf.c
diff --git a/block.c b/block.c
index f7d91a2..8b3083d 100644
--- a/block.c
+++ b/block.c
@@ -58,7 +58,7 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors);
-static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
+QTAILQ_HEAD(, BlockDriverState) bdrv_states =
QTAILQ_HEAD_INITIALIZER(bdrv_states);
static QLIST_HEAD(, BlockDriver) bdrv_drivers =
diff --git a/block/fvd-bitmap.c b/block/fvd-bitmap.c
index 30e4a4b..06d7912 100644
--- a/block/fvd-bitmap.c
+++ b/block/fvd-bitmap.c
@@ -66,6 +66,63 @@ static inline void update_fresh_bitmap(int64_t sector_num, int nb_sectors,
}
}
+/* Mark in the stale bitmap the blocks covered by [sector_num,
+ * sector_num + nb_sectors). Only sectors below base_img_sectors are
+ * tracked; the range is clipped accordingly. A bit may only be set in
+ * the stale bitmap if it is already set in the fresh bitmap (or the two
+ * bitmaps are the same object), which the ASSERT below verifies. */
+static void update_stale_bitmap(BDRVFvdState * s, int64_t sector_num,
+ int nb_sectors)
+{
+ if (sector_num >= s->base_img_sectors) {
+ return;
+ }
+
+ int64_t end = sector_num + nb_sectors;
+ if (end > s->base_img_sectors) {
+ end = s->base_img_sectors;
+ }
+
+ int64_t block_num = sector_num / s->block_size;
+ const int64_t block_end = (end - 1) / s->block_size;
+
+ for (; block_num <= block_end; block_num++) {
+ /* One bit per block: locate the byte and bit for this block. */
+ int64_t bitmap_byte_offset = block_num / 8;
+ uint8_t bitmap_bit_offset = block_num % 8;
+ uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
+ uint8_t b = s->stale_bitmap[bitmap_byte_offset];
+ if (!(b & mask)) {
+ ASSERT(s->stale_bitmap == s->fresh_bitmap ||
+ (s->fresh_bitmap[bitmap_byte_offset] & mask));
+ b |= mask;
+ s->stale_bitmap[bitmap_byte_offset] = b;
+ }
+ }
+}
+
+/* Set the bits for [sector_num, sector_num + nb_sectors) in BOTH the
+ * fresh and the stale bitmap. Used during journal recovery, where the
+ * two bitmaps must end up identical. Range is clipped to
+ * base_img_sectors, as in update_stale_bitmap(). */
+static void update_both_bitmaps(BDRVFvdState * s, int64_t sector_num,
+ int nb_sectors)
+{
+ if (sector_num >= s->base_img_sectors) {
+ return;
+ }
+
+ int64_t end = sector_num + nb_sectors;
+ if (end > s->base_img_sectors) {
+ end = s->base_img_sectors;
+ }
+
+ int64_t block_num = sector_num / s->block_size;
+ const int64_t block_end = (end - 1) / s->block_size;
+
+ for (; block_num <= block_end; block_num++) {
+ int64_t bitmap_byte_offset = block_num / 8;
+ uint8_t bitmap_bit_offset = block_num % 8;
+ uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
+ uint8_t b = s->fresh_bitmap[bitmap_byte_offset];
+ if (!(b & mask)) {
+ b |= mask;
+ /* Write the same updated byte to both bitmaps. If the two
+ * pointers alias, this is simply a double store. */
+ s->fresh_bitmap[bitmap_byte_offset] =
+ s->stale_bitmap[bitmap_byte_offset] = b;
+ }
+ }
+}
+
static inline bool bitmap_show_sector_in_base_img(int64_t sector_num,
const BDRVFvdState * s,
int bitmap_offset,
diff --git a/block/fvd-journal-buf.c b/block/fvd-journal-buf.c
new file mode 100644
index 0000000..3efdd47
--- /dev/null
+++ b/block/fvd-journal-buf.c
@@ -0,0 +1,34 @@
+/*
+ * QEMU Fast Virtual Disk Format Metadata Journal
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+/*=============================================================================
+ * There are two different ways of writing metadata changes to the journal:
+ * immediate write or buffered write. If cache=writethrough, metadata changes
+ * are written to the journal immediately. If cache!=writethrough, metadata
+ * changes are buffered in memory and later written to the journal either
+ * triggered by bdrv_aio_flush() or by a timeout. This module implements the
+ * case for cache!=writethrough.
+ *============================================================================*/
+
+/* Stub: reserve 'record_size' bytes in the in-memory journal buffer for
+ * buffered (cache!=writethrough) journal updates. To be implemented;
+ * returning NULL currently means no buffer space is handed out. */
+static uint8_t * bjnl_alloc_journal_records_from_buf(BlockDriverState *bs,
+ bool update_bitmap,
+ size_t record_size)
+{
+ return NULL;
+}
+
+
+/* Stub: timer callback that will flush the buffered journal records to
+ * disk after journal_clean_buf_period elapses. */
+static void bjnl_clean_buf_timer_cb(BlockDriverState * bs)
+{
+ /* To be implemented. */
+}
diff --git a/block/fvd-journal.c b/block/fvd-journal.c
index 2edfc70..11796b0 100644
--- a/block/fvd-journal.c
+++ b/block/fvd-journal.c
@@ -11,28 +11,830 @@
*
*/
-#ifdef FVD_DEBUG
-static bool emulate_host_crash = true;
+/*=============================================================================
+ * A short description: this FVD module implements a journal for committing
+ * metadata changes. Each sector in the journal is self-contained so that
+ * updates are atomic. A sector may contain one or multiple journal records.
+ * There are two types of journal records:
+ * bitmap_update and table_update.
+ * Format of a bitmap_update record:
+ * BITMAP_JRECORD (uint32_t)
+ * num_dirty_sectors (uint32_t)
+ * dirty_sector_begin (int64_t)
+ * Format of a table_update record:
+ * TABLE_JRECORD (uint32_t)
+ * journal_epoch (uint64_t)
+ * num_dirty_table_entries (uint32_t)
+ * dirty_table_begin (uint32_t)
+ * table_entry_1 (uint32_t)
+ * table_entry_2 (uint32_t)
+ * ...
+ * If both the bitmap and the table need update, one sector contains a
+ * TABLE_JRECORD and a BITMAP_JRECORD, and these two records cover the same
+ * range of virtual disk data so that the corresponding parts of the bitmap
+ * and the table are always updated in one atomic operation.
+ *
+ * There are two different ways of writing metadata changes to the journal:
+ * immediate write or buffered write. If cache=writethrough, metadata changes
+ * are written to the journal immediately. If cache!=writethrough, metadata
+ * changes are buffered in memory and later written to the journal either
+ * triggered by bdrv_aio_flush() or by a timeout.
+ *
+ * Immediate journal write is implemented in this file, see
+ * ujnl_write_metadata_to_journal_now(). Buffered journal write is implemented
+ * in fvd-journal-buf.c, see bjnl_alloc_journal_records_from_buf().
+ *============================================================================*/
+
+#define BITMAP_JRECORD ((uint32_t)0xEF2AB8ED)
+#define TABLE_JRECORD ((uint32_t)0xB4E6F7AC)
+#define EMPTY_JRECORD ((uint32_t)0xA5A5A5A5)
+#define BITMAP_JRECORD_SIZE (2 * sizeof(uint32_t) + sizeof(int64_t))
+#define TABLE_JRECORD_HDR_SIZE (3 * sizeof(uint32_t) + sizeof(uint64_t))
+#define TABLE_JRECORDS_PER_SECTOR \
+ ((512 - TABLE_JRECORD_HDR_SIZE)/sizeof(uint32_t))
+
+/* One BITMAP_JRECORD plus a TABLE_JRECORD covering this number of table
+ * entries can fit in one journal sector. */
+#define MIXED_JRECORDS_PER_SECTOR ((512 - TABLE_JRECORD_HDR_SIZE - \
+ BITMAP_JRECORD_SIZE) / sizeof(uint32_t))
+
+#ifndef ENABLE_QDEBUG
+# define PRINT_TABLE_JRECORD(type) do{}while(0)
#else
-static bool emulate_host_crash = false;
+static void print_table_jrecord(uint32_t * type);
+# define PRINT_TABLE_JRECORD print_table_jrecord
#endif
+static int flush_metadata_to_disk (BlockDriverState * bs,
+ bool update_journal_epoch,
+ bool update_base_img_prefetched);
+static int64_t ujnl_allocate_journal_sectors(BlockDriverState * bs,
+ FvdAIOCB * acb, int nb_sectors);
+static void ujnl_write_metadata_to_journal_now(FvdAIOCB * acb,
+ bool update_bitmap, uint8_t *buf,
+ int64_t journal_sec, int nb_journal_sectors);
+static void bjnl_clean_buf_timer_cb(BlockDriverState * bs);
+static uint8_t * bjnl_alloc_journal_records_from_buf(BlockDriverState *bs,
+ bool update_bitmap, size_t record_size);
+
static inline int64_t calc_min_journal_size(int64_t table_entries)
{
- return 512;
+ /* Worst case: every table entry is dirty and each journal sector holds
+ * a TABLE_JRECORD plus a BITMAP_JRECORD, i.e. MIXED_JRECORDS_PER_SECTOR
+ * table entries per 512-byte sector (round up). */
+ return (table_entries + MIXED_JRECORDS_PER_SECTOR - 1)
+ / MIXED_JRECORDS_PER_SECTOR * 512;
}
/* Initialize the journal state at image-open time and, if the image was
 * not shut down cleanly, replay the journal to recover the in-memory
 * table and bitmaps. Returns 0 on success, -EINVAL if a journal-less
 * dirty image may not be booted, or -EIO if the journal cannot be read. */
static int init_journal(int read_only, BlockDriverState * bs,
FvdHeader * header)
{
- return -ENOTSUP;
+ BDRVFvdState *s = bs->opaque;
+ /* Sizes/offsets are stored in bytes in the header; convert to sectors. */
+ s->journal_size = header->journal_size / 512;
+ s->journal_offset = header->journal_offset / 512;
+ s->journal_epoch = header->stable_journal_epoch + 1;
+ s->next_journal_sector = 0;
+ s->use_bjnl = false;
+ QLIST_INIT(&s->ujnl.wait4_recycle);
+ s->ujnl.active_writes = 0;
+
+ if (s->journal_size <= 0) {
+ if (!s->table && !s->fresh_bitmap) {
+ return 0; /* No need to use the journal. */
+ }
+
+ /* Journal disabled but metadata exists: a previous unclean shutdown
+ * cannot be recovered from, so refuse to boot the VM. */
+ if (!header->clean_shutdown) {
+ fprintf(stderr, "ERROR: the image may be corrupted because it was "
+ "not shut down gracefully last\ntime and it does not use "
+ "a journal. You may continue to use the image at your\n"
+ "own risk by manually resetting the clean_shutdown flag "
+ "in the image.\n\n");
+ s->dirty_image = true;
+ if (IN_QEMU_TOOL) {
+ return 0; /* Allow qemu tools to use the image. */
+ } else {
+ /* Do not allow booting the VM until the clean_shutdown flag
+ * is manually cleaned. */
+ return -EINVAL;
+ }
+ }
+
+ QDEBUG("Journal is disabled\n");
+ return 0;
+ }
+
+ /* Enable buffered journal updates only for a writable image opened by
+ * a real VM with write cache enabled and a configured buffer size. */
+ if (!read_only && !IN_QEMU_TOOL && s->fvd_metadata->enable_write_cache
+ && header->journal_buf_size > 0) {
+ s->use_bjnl = true;
+ QTAILQ_INIT(&s->bjnl.queued_bufs);
+ s->bjnl.buf = NULL;
+ s->bjnl.def_buf_size = header->journal_buf_size;
+ s->bjnl.clean_buf_period = header->journal_clean_buf_period;
+ s->bjnl.buf_contains_bitmap_update = false;
+ s->bjnl.clean_buf_timer = qemu_new_timer(rt_clock,
+ (QEMUTimerCB *)bjnl_clean_buf_timer_cb, bs);
+ s->bjnl.timer_scheduled = false;
+ }
+
+ if (header->clean_shutdown) {
+ QDEBUG("Journal is skipped as the VM was shut down gracefully "
+ "last time.\n");
+ return 0;
+ }
+
+ QDEBUG("Recover from the journal as the VM was not shut down gracefully "
+ "last time.\n");
+
+ /* Read the whole journal into memory and replay it sector by sector. */
+ uint8_t *journal = my_qemu_blockalign(s->fvd_metadata,
+ s->journal_size * 512);
+ int ret = bdrv_read(s->fvd_metadata, s->journal_offset,
+ journal, s->journal_size);
+ if (ret < 0) {
+ my_qemu_vfree(journal);
+ fprintf(stderr, "Failed to read the journal (%" PRId64 ") bytes\n",
+ s->journal_size * 512);
+ return -EIO;
+ }
+
+ /* Go through every journal sector. */
+ uint64_t max_epoch = 0;
+ uint8_t *sector = journal;
+ uint8_t *journal_end = journal + s->journal_size * 512;
+ uint64_t *chunk_epoch = NULL;
+
+ /* chunk_epoch[i] tracks the newest epoch seen for chunk i, so that an
+ * older mapping in the journal never overrides a newer one.
+ * NOTE(review): chunk_epoch stays NULL when table_offset <= 0; the
+ * TABLE_JRECORD branch below dereferences it, which assumes a journal
+ * of a table-less image never contains TABLE_JRECORDs — confirm. */
+ if (header->table_offset > 0) {
+ int table_entries = ROUND_UP(header->virtual_disk_size,
+ header->chunk_size) / header->chunk_size;
+ chunk_epoch = my_qemu_mallocz(sizeof(uint64_t) * table_entries);
+ }
+
+ while (sector < journal_end) {
+ uint32_t *type = (uint32_t *) sector; /* Journal record type. */
+ while ((uint8_t *) type < (sector + 512)) {
+ if (le32_to_cpu(*type) == BITMAP_JRECORD) {
+ uint32_t *nb_sectors = type + 1;
+ int64_t *sector_num = (int64_t *) (type + 2);
+ if (s->stale_bitmap) {
+ update_both_bitmaps(s, le64_to_cpu(*sector_num),
+ le32_to_cpu(*nb_sectors));
+ QDEBUG("JOURNAL: recover BITMAP_JRECORD sector_num=%"
+ PRId64 " nb_sectors=%u\n",
+ le64_to_cpu(*sector_num), le32_to_cpu(*nb_sectors));
+ }
+
+ /* First field of the next journal record. */
+ type = (uint32_t *) (sector_num + 1);
+ } else if (le32_to_cpu(*type) == TABLE_JRECORD) {
+ uint64_t *epoch = (uint64_t *) (type + 1);
+ uint32_t *count = (uint32_t *) (epoch + 1);
+ uint32_t *offset = count + 1;
+ uint32_t *content = offset + 1;
+ const uint32_t chunk = le32_to_cpu(*offset);
+ const uint64_t epo = le64_to_cpu(*epoch);
+ const uint32_t n = le32_to_cpu(*count);
+ uint32_t i;
+ QDEBUG("JOURNAL: recover TABLE_JRECORD epoch=%" PRIu64
+ " chunk_start=%u " "nb_chunks=%u\n", epo, chunk, n);
+ for (i = 0; i < n; i++) {
+ /* If a chunk can be mapped to different locations at
+ * different times, e.g., due to defragmentation
+ * activities that move chunks, the epoch number is used
+ * to identify the last effective mapping of a chunk. */
+ if (epo > header->stable_journal_epoch &&
+ epo > chunk_epoch[chunk + i]) {
+ chunk_epoch[chunk + i] = epo;
+ /* content[i] is stored as-is; presumably table
+ * entries are kept in on-disk byte order in memory
+ * — confirm against the table load code. */
+ s->table[chunk + i] = content[i];
+
+ /* The dirty bit was not cleaned when the table entry
+ * was saved in the journal. */
+ CLEAN_DIRTY2(s->table[chunk + i]);
+ QDEBUG("\tAccept mapping chunk %u to %u\n",
+ chunk + i, READ_TABLE(content[i]));
+ } else {
+ QDEBUG("\tReject mapping chunk %u to %u\n",
+ chunk + i, READ_TABLE(content[i]));
+ }
+ }
+ type = content + n; /* First field of the next record. */
+ if (epo > max_epoch) {
+ max_epoch = epo;
+ }
+ } else {
+ /* End of valid records in this journal sector. */
+ ASSERT(le32_to_cpu(*type) == EMPTY_JRECORD);
+ break;
+ }
+ }
+
+ sector += 512;
+ }
+ my_qemu_vfree(journal);
+ if (chunk_epoch) {
+ my_qemu_free(chunk_epoch);
+ }
+
+ /* Continue numbering epochs after the newest one found in the journal. */
+ if (++max_epoch > s->journal_epoch) {
+ s->journal_epoch = max_epoch;
+ }
+ QDEBUG("JOURNAL: journal_epoch=%" PRIu64 "\n", s->journal_epoch);
+
+ if (!read_only) {
+ /* Write the recovered metadata. */
+ flush_metadata_to_disk(bs, true /*journal */ , false /*prefetch */ );
+ }
+
+ return 0;
}
-static void write_metadata_to_journal(struct FvdAIOCB *acb, bool update_bitmap)
+/*
+ * This function first flushes in-memory metadata to disk and then recycle the
+ * used journal sectors. It is possible to make this operation asynchronous so
+ * that the performance is better. However, the overall performance
+ * improvement may be limited since recycling the journal happens very
+ * infrequently and updating on-disk metadata finishes quickly because of the
+ * small size of the metadata.
+ */
+/* Flush in-memory metadata to disk and reset next_journal_sector to 0 so
+ * the journal space can be reused. Returns the result of
+ * flush_metadata_to_disk(); on failure the journal is NOT recycled. */
+static int recycle_journal(BlockDriverState * bs)
{
+ BDRVFvdState *s = bs->opaque;
+ int ret;
+
+#ifdef ENABLE_QDEBUG
+ static int64_t recycle_count = 0;
+ QDEBUG("JOURNAL: start journal recycle %" PRId64 ".\n", recycle_count);
+ recycle_count++;
+ int64_t begin_time = qemu_get_clock(rt_clock);
+#endif
+
+ ret = flush_metadata_to_disk(bs, true /*journal */ , false /*prefetch */);
+ if (ret == 0) {
+ /* Metadata is stable on disk; all journal sectors are now free. */
+ s->next_journal_sector = 0;
+ }
+
+#ifdef ENABLE_QDEBUG
+ int64_t end_time = qemu_get_clock(rt_clock);
+ QDEBUG("JOURNAL: journal recycle took %" PRId64 " ms.\n",
+ (end_time - begin_time));
+#endif
+
+ return ret;
+}
+
+/* Completion callback after metadata records were written to the journal
+ * (or were handled without a journal write). On success, commit the
+ * corresponding in-memory state: clear the dirty bits of the affected
+ * table entries and update the stale bitmap. Then complete the parent
+ * request and, for unbuffered journal mode, release the journal sectors. */
+static void write_metadata_to_journal_cb(void *opaque, int ret)
+{
+ FvdAIOCB *acb = (FvdAIOCB *) opaque;
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ if (ret == 0) {
+ QDEBUG("JOURNAL: acb%llu-%p write_metadata_to_journal_cb\n",
+ acb->uuid, acb);
+
+ if (s->table) {
+ /* Update the table. */
+ int i;
+ const uint32_t first_chunk = acb->sector_num / s->chunk_size;
+ const uint32_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
+ / s->chunk_size;
+ for (i = first_chunk; i <= last_chunk; i++) {
+ CLEAN_DIRTY2(s->table[i]);
+ }
+ }
+
+ if (s->stale_bitmap) {
+ /* If fresh_bitmap differs from stale_bitmap, fresh_bitmap has
+ * already been updated in write_data_cb() when invoking
+ * update_fresh_bitmap_and_check_stale_bitmap(). */
+ update_stale_bitmap(s, acb->sector_num, acb->nb_sectors);
+ }
+ } else {
+ QDEBUG("JOURNAL: acb%llu-%p write_metadata_to_journal_cb err ret=%d\n",
+ acb->uuid, acb, ret);
+ }
+
+ /* Clean up. */
+ if (acb->type == OP_STORE_COMPACT) {
+ acb->common.cb(acb->common.opaque, ret);
+ if (acb->jcb.iov.iov_base != NULL) {
+ my_qemu_vfree(acb->jcb.iov.iov_base);
+ }
+ my_qemu_aio_release(acb);
+ } else {
+ ASSERT(acb->type == OP_WRITE);
+ finish_write(acb, ret);
+ }
+
+ /* Unbuffered mode: release journal sectors and possibly restart
+ * requests waiting for a journal recycle. */
+ if (!s->use_bjnl) {
+ ujnl_free_journal_sectors(bs);
+ }
+}
+
+/* Allocate space for journal records of 'buf_size' bytes. In buffered
+ * mode, hand out space from the in-memory journal buffer. In unbuffered
+ * mode, reserve journal sectors (the sector index is returned via
+ * *journal_sec) and allocate an aligned buffer for them. Returns NULL if
+ * no space is available now; the request is then restarted later from
+ * ujnl_free_journal_sectors(). */
+static inline uint8_t * alloc_journal_records(FvdAIOCB *acb,
+ bool update_bitmap,
+ size_t buf_size,
+ int64_t *journal_sec)
+{
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ if (s->use_bjnl) {
+ /* Use buffered journal update. */
+ return bjnl_alloc_journal_records_from_buf(bs, update_bitmap, buf_size);
+ }
+
+ /* Allocate journal sectors for unbuffered journal update. */
+ size_t nb_sectors = (buf_size + 511) / 512;
+ *journal_sec = ujnl_allocate_journal_sectors(bs, acb, nb_sectors);
+ if (*journal_sec < 0) {
+ /* No journal sector is available now. It will be woken up later
+ * in ujnl_free_journal_sectors(). */
+ return NULL;
+ }
+
+ uint8_t *buf = my_qemu_blockalign(s->fvd_metadata, 512 * nb_sectors);
+ if (buf_size % 512 != 0) {
+ *((uint32_t*)(buf + buf_size)) = EMPTY_JRECORD; /* Mark buffer end. */
+ }
+ return buf;
+}
+
+/* Build the journal records describing the metadata changes of 'acb' into
+ * a newly allocated buffer. Three cases: bitmap-only (one BITMAP_JRECORD),
+ * table-only (one or more TABLE_JRECORDs), or both (self-contained sectors
+ * each holding a TABLE_JRECORD plus a matching BITMAP_JRECORD). Returns
+ * the buffer (and via *p_journal_sec / *p_buf_size the reserved journal
+ * sector and record size), or NULL if no journal space is available yet. */
+static uint8_t * create_journal_records(FvdAIOCB * acb,
+ bool update_bitmap,
+ int64_t *p_journal_sec,
+ size_t *p_buf_size)
+{
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+ uint8_t *buf;
+ int64_t journal_sec = -1;
+ size_t buf_size;
+
+ if (update_bitmap && !s->table) {
+ /* Only update the bitmap. */
+ buf_size = BITMAP_JRECORD_SIZE;
+ buf = alloc_journal_records(acb, update_bitmap, buf_size, &journal_sec);
+ if (!buf) {
+ return NULL; /* Wake up later in ujnl_free_journal_sectors(). */
+ }
+
+ uint32_t *type = (uint32_t *)buf; /*BITMAP_JRECORD*/
+ uint32_t *nb_sectors = type + 1;
+ int64_t *sector_num = (int64_t *) (type + 2);
+ *type = cpu_to_le32(BITMAP_JRECORD);
+ *nb_sectors = cpu_to_le32((uint32_t) acb->nb_sectors);
+ *sector_num = cpu_to_le64(acb->sector_num);
+ QDEBUG("JOURNAL: record BITMAP_JRECORD sector_num=%" PRId64
+ " nb_sectors=%u\n", acb->sector_num, acb->nb_sectors);
+
+ } else if (!update_bitmap) {
+ /* Only update the table. */
+
+ const int64_t first_chunk = acb->sector_num / s->chunk_size;
+ const int64_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
+ / s->chunk_size;
+ int num_chunks = last_chunk - first_chunk + 1;
+
+ /* Buf space for complete journal sectors. */
+ buf_size = (num_chunks / TABLE_JRECORDS_PER_SECTOR) * 512;
+
+ /* Buf space for the last partial journal sectors. */
+ int rc = num_chunks % TABLE_JRECORDS_PER_SECTOR;
+ if (rc > 0) {
+ buf_size += TABLE_JRECORD_HDR_SIZE + sizeof(uint32_t) * rc;
+ }
+
+ buf = alloc_journal_records(acb, update_bitmap, buf_size, &journal_sec);
+ if (!buf) {
+ return NULL; /* Wake up later in ujnl_free_journal_sectors(). */
+ }
+
+ uint32_t *type = (uint32_t *)buf; /* TABLE_JRECORD */
+ int64_t chunk = first_chunk;
+
+ while (1) {
+ /* Start a new journal sector. */
+ uint64_t *epoch = (uint64_t *) (type + 1);
+ uint32_t *count = (uint32_t *) (epoch + 1);
+ uint32_t *offset = count + 1;
+ uint32_t *content = offset + 1;
+
+ *type = cpu_to_le32(TABLE_JRECORD);
+ *offset = cpu_to_le32(chunk);
+ *epoch = cpu_to_le64(s->journal_epoch);
+ s->journal_epoch++;
+ if (num_chunks <= TABLE_JRECORDS_PER_SECTOR) {
+ /* This is the last journal sector. */
+ *count = cpu_to_le32(num_chunks);
+ memcpy(content, &s->table[chunk],
+ sizeof(uint32_t) * num_chunks);
+ PRINT_TABLE_JRECORD(type);
+ break;
+ }
+
+ *count = cpu_to_le32(TABLE_JRECORDS_PER_SECTOR);
+ memcpy(content, &s->table[chunk],
+ sizeof(uint32_t) * TABLE_JRECORDS_PER_SECTOR);
+ chunk += TABLE_JRECORDS_PER_SECTOR;
+ num_chunks -= TABLE_JRECORDS_PER_SECTOR;
+ PRINT_TABLE_JRECORD(type);
+
+ /* Next TABLE_JRECORD field 1. */
+ type = content + TABLE_JRECORDS_PER_SECTOR;
+ }
+ } else {
+ /* Update both the table and the bitmap. It may use multiple journal
+ * sectors. Each sector is self-contained, including a TABLE_JRECORD
+ * and a BITMAP_JRECORD. The two records on the same sector cover the
+ * same range of virtual disk data. The purpose is to update the
+ * corresponding parts of the bitmap and the table in one atomic
+ * operation. */
+ const int64_t first_chunk = acb->sector_num / s->chunk_size;
+ const int64_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
+ / s->chunk_size;
+ int num_chunks = last_chunk - first_chunk + 1;
+
+ /* Buf space for complete journal sectors. */
+ buf_size = (num_chunks / MIXED_JRECORDS_PER_SECTOR) * 512;
+
+ /* Buf space for the last partial journal sectors. */
+ int rc = num_chunks % MIXED_JRECORDS_PER_SECTOR;
+ if (rc > 0) {
+ buf_size += BITMAP_JRECORD_SIZE + TABLE_JRECORD_HDR_SIZE
+ + sizeof(uint32_t) * rc;
+ }
+
+ buf = alloc_journal_records(acb, update_bitmap, buf_size, &journal_sec);
+ if (!buf) {
+ return NULL; /* Wake up later in ujnl_free_journal_sectors(). */
+ }
+
+ uint32_t *type = (uint32_t *)buf; /*TABLE_JRECORD*/
+ int64_t chunk = first_chunk;
+ int64_t sector_num = acb->sector_num;
+ uint32_t nb_sectors;
+
+ /* Determine the number of data sectors whose bitmap change fits in
+ * the first journal sector. */
+ if (buf_size <= 512) {
+ nb_sectors = acb->nb_sectors; /* All fit in one journal sector. */
+ } else {
+ nb_sectors = (first_chunk + MIXED_JRECORDS_PER_SECTOR)
+ * s->chunk_size - acb->sector_num;
+ }
+
+ while (1) {
+ /* Start a new journal sector. */
+ uint64_t *epoch = (uint64_t *) (type + 1);
+ uint32_t *count = (uint32_t *) (epoch + 1);
+ uint32_t *offset = count + 1;
+ uint32_t *content = offset + 1;
+
+ *type = cpu_to_le32(TABLE_JRECORD);
+ *offset = cpu_to_le32(chunk);
+ *epoch = cpu_to_le64(s->journal_epoch);
+ s->journal_epoch++;
+
+ if (num_chunks <= MIXED_JRECORDS_PER_SECTOR) {
+ /* This is the last journal sector. */
+ *count = cpu_to_le32(num_chunks);
+ memcpy(content, &s->table[chunk],
+ sizeof(uint32_t) * num_chunks);
+ PRINT_TABLE_JRECORD(type);
+
+ /* A BITMAP_JRECORD follows a TABLE_JRECORD so that they are
+ * updated in one atomic operation. */
+ type = content + num_chunks; /* BITMAP_JRECORD. */
+ uint32_t *p_nb_sectors = type + 1;
+ int64_t *p_sector_num = (int64_t *) (type + 2);
+ *type = cpu_to_le32(BITMAP_JRECORD);
+ *p_nb_sectors = cpu_to_le32(nb_sectors);
+ *p_sector_num = cpu_to_le64(sector_num);
+ QDEBUG("JOURNAL: record BITMAP_JRECORD sector_num=%" PRId64
+ " nb_sectors=%u\n", sector_num, nb_sectors);
+ break;
+ }
+
+ *count = cpu_to_le32(MIXED_JRECORDS_PER_SECTOR);
+ memcpy(content, &s->table[chunk],
+ sizeof(uint32_t) * MIXED_JRECORDS_PER_SECTOR);
+ PRINT_TABLE_JRECORD(type);
+
+ /* A BITMAP_JRECORD follows a TABLE_JRECORD so that they are
+ * updated in one atomic operation. */
+ type = content + MIXED_JRECORDS_PER_SECTOR; /* BITMAP_JRECORD */
+ uint32_t *p_nb_sectors = type + 1;
+ int64_t *p_sector_num = (int64_t *) (type + 2);
+ *type = cpu_to_le32(BITMAP_JRECORD);
+ *p_nb_sectors = cpu_to_le32(nb_sectors);
+ *p_sector_num = cpu_to_le64(sector_num);
+ QDEBUG("JOURNAL: record BITMAP_JRECORD sector_num=%" PRId64
+ " nb_sectors=%u\n", sector_num, nb_sectors);
+
+ /* Prepare for the next journal sector. */
+ type = (uint32_t *) (p_sector_num + 1);
+ chunk += MIXED_JRECORDS_PER_SECTOR;
+ sector_num = chunk * s->chunk_size;
+ num_chunks -= MIXED_JRECORDS_PER_SECTOR;
+ if (num_chunks <= MIXED_JRECORDS_PER_SECTOR) {
+ /* Data sectors covered by the last journal sector. */
+ nb_sectors = (acb->sector_num + acb->nb_sectors)
+ - chunk * s->chunk_size;
+ } else {
+ nb_sectors = s->chunk_size * MIXED_JRECORDS_PER_SECTOR;
+ }
+ }
+ }
+
+ if (p_journal_sec) {
+ *p_journal_sec = journal_sec;
+ }
+ if (p_buf_size) {
+ *p_buf_size = buf_size;
+ }
+ return buf;
+}
+
+/* Record the metadata changes of 'acb' (table and/or bitmap) in the
+ * journal. Fails the request immediately on a prior metadata error, and
+ * short-circuits to the completion callback when the journal is disabled.
+ * Otherwise builds the records and either leaves them in the buffer
+ * (buffered mode) or writes them to the journal now (unbuffered mode). */
+static void write_metadata_to_journal(FvdAIOCB * acb, bool update_bitmap)
+{
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ ASSERT((s->table || s->fresh_bitmap)
+ && (!update_bitmap || (s->fresh_bitmap && acb->type == OP_WRITE))
+ && (update_bitmap || s->table)
+ && (acb->type == OP_WRITE || acb->type == OP_STORE_COMPACT));
+
+ if (acb->type == OP_WRITE) {
+ /* Save update_bitmap as it may be needed later if this request is
+ * queued.*/
+ acb->write.update_bitmap = update_bitmap;
+ }
+
+ if (s->metadata_err_prohibit_write) {
+ write_metadata_to_journal_cb(acb, -EIO); /* Fail the request now. */
+ return;
+ }
+
+ if (s->journal_size <= 0) {
+ write_metadata_to_journal_cb(acb, 0); /* Journal is disabled. */
+ return;
+ }
+
+ int64_t journal_sec = -1;
+ size_t buf_size = -1;
+ uint8_t *buf = create_journal_records(acb, update_bitmap,
+ &journal_sec, &buf_size);
+
+ /* Depending on the cache mode, either write metadata changes to journal
+ * immediately, or put it in buffer first. */
+ if (s->use_bjnl) {
+ /* Done for now. The buffer will be written to the journal later. */
+ write_metadata_to_journal_cb(acb, 0);
+ } else if (buf) {
+ int nb_sectors = (buf_size + 511) / 512;
+ ujnl_write_metadata_to_journal_now(acb, update_bitmap, buf,
+ journal_sec, nb_sectors);
+ }
+ /* If buf == NULL in unbuffered mode, the request was queued on
+ * ujnl.wait4_recycle and will be restarted after a journal recycle. */
+}
+
+/* Write the in-memory table and fresh bitmap to their on-disk locations,
+ * optionally bumping the stable journal epoch and/or the
+ * base_img_fully_prefetched flag in the header. Returns 0 on success or
+ * -EIO on any failure; a failure also sets metadata_err_prohibit_write
+ * so that no further metadata writes are attempted. */
+static int flush_metadata_to_disk(BlockDriverState * bs,
+ bool update_journal_epoch,
+ bool update_base_img_prefetched)
+{
+ BDRVFvdState *s = bs->opaque;
+
+ if (bs->read_only) {
+ return 0;
+ }
+
+ /* Clean DIRTY_TABLE bit and write the table to disk. */
+ if (s->table) {
+ int i;
+ int table_entries = ROUND_UP(s->virtual_disk_size,
+ s->chunk_size * 512) / (s->chunk_size *
+ 512);
+ for (i = 0; i < table_entries; i++) {
+ CLEAN_DIRTY(s->table[i]);
+ }
+
+ int nb = (int)(s->table_size / 512);
+ QDEBUG("JOURNAL: flush table (%d sectors) to disk\n", nb);
+
+ if (bdrv_write(s->fvd_metadata, s->table_offset, (uint8_t *) s->table,
+ nb) < 0) {
+ goto fail;
+ }
+ }
+
+ /* Write fresh_bitmap to disk. */
+ if (s->fresh_bitmap) {
+ /* Ensure copy-on-read and prefetching data are stable. */
+ if (bdrv_flush(s->fvd_data)) {
+ goto fail;
+ }
+
+ if (s->fvd_data != s->fvd_metadata && s->table) {
+ /* Ensure table is stable before updating bitmap. */
+ if (bdrv_flush(s->fvd_metadata)) {
+ goto fail;
+ }
+ }
+
+ int nb = (int)(s->bitmap_size / 512);
+ QDEBUG("JOURNAL: flush bitmap (%d sectors) to disk\n", nb);
+
+ if (bdrv_write(s->fvd_metadata, s->bitmap_offset,
+ s->fresh_bitmap, nb) < 0) {
+ goto fail;
+ }
+ /* The on-disk bitmap is now current, so the stale copy can catch
+ * up with the fresh copy. */
+ if (s->fresh_bitmap != s->stale_bitmap) {
+ memcpy(s->stale_bitmap, s->fresh_bitmap, s->bitmap_size);
+ }
+ }
+
+ if (update_journal_epoch || update_base_img_prefetched) {
+ /* Update the header. */
+ FvdHeader header;
+ if (read_fvd_header(s, &header)) {
+ goto fail;
+ }
+ if (update_base_img_prefetched) {
+ header.base_img_fully_prefetched = true;
+ }
+ if (update_journal_epoch) {
+ /* Journal records with epoch <= stable_journal_epoch are
+ * ignored at recovery time; see init_journal(). */
+ header.stable_journal_epoch = s->journal_epoch++;
+ }
+ if (update_fvd_header(s, &header)) {
+ goto fail;
+ }
+ }
+
+ /* Perform a final flush to ensure all metadata are stable. */
+ if (!bdrv_flush(s->fvd_metadata)) {
+ return 0;
+ }
+
+fail:
+ s->metadata_err_prohibit_write = true;
+ return -EIO;
+}
+
+#ifdef FVD_DEBUG
+static bool emulate_host_crash = true;
+#else
+static bool emulate_host_crash = false;
+#endif
+
+/* Called from the process-exit destructor: flush metadata and mark the
+ * image cleanly shut down, unless a debug build is deliberately emulating
+ * a host crash (in which case recovery must come from the journal). */
+static void flush_metadata_to_disk_on_exit(BlockDriverState * bs)
+{
+ BDRVFvdState *s = bs->opaque;
+
+ if (bs->read_only || !s->fvd_metadata) {
+ return;
+ }
+
+ /* If (emulate_host_crash==true), do not flush metadata to disk
+ * so that it has to rely on journal for recovery. */
+ if (s->journal_size <= 0 || !emulate_host_crash) {
+ if (!flush_metadata_to_disk(bs, true, false) && !s->dirty_image) {
+ update_clean_shutdown_flag(s, true);
+ }
+ }
+}
void fvd_emulate_host_crash(bool cond)
{
emulate_host_crash = cond;
}
+
+#ifdef ENABLE_QDEBUG
+/* Debug helper: decode and print one TABLE_JRECORD located at 'type'.
+ * NOTE(review): 'i' is int32_t compared against the unsigned 'count';
+ * harmless for realistic record counts, but mixed-sign comparison. */
+static void print_table_jrecord(uint32_t * type)
+{
+ int32_t i;
+ uint64_t *p_epoch = (uint64_t *) (type + 1);
+ uint32_t *p_count = (uint32_t *) (p_epoch + 1);
+ uint32_t *p_offset = p_count + 1;
+ uint32_t *content = p_offset + 1;
+
+ uint64_t epoch = le64_to_cpu(*p_epoch);
+ uint32_t count = le32_to_cpu(*p_count);
+ uint32_t offset = le32_to_cpu(*p_offset);
+
+ QDEBUG("JOURNAL: record TABLE_JRECORD epoch=%" PRIu64
+ " chunk_start=%u " "nb_chunks=%u\n", epoch, offset, count);
+ for (i = 0; i < count; i++) {
+ QDEBUG("\tMap chunk %u to %u\n", offset + i, READ_TABLE(content[i]));
+ }
+}
+#endif
+
+/* Only used for unbuffered journal update. Issue the asynchronous write
+ * of the prepared journal records ('buf', owned by acb->jcb.iov until
+ * freed in the completion path) to the reserved journal sectors. On
+ * submission failure, complete immediately with an error. */
+static void ujnl_write_metadata_to_journal_now(FvdAIOCB * acb,
+ bool update_bitmap,
+ uint8_t *buf,
+ int64_t journal_sec,
+ int nb_journal_sectors)
+{
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ QDEBUG("JOURNAL: acb%llu-%p write_metadata_to_journal journal_sec=%"
+ PRId64 " nb_journal_sectors=%d\n", acb->uuid, acb, journal_sec,
+ nb_journal_sectors);
+
+ acb->jcb.iov.iov_base = buf;
+ acb->jcb.iov.iov_len = 512 * nb_journal_sectors;
+ qemu_iovec_init_external(&acb->jcb.qiov, &acb->jcb.iov, 1);
+ acb->jcb.hd_acb = bdrv_aio_writev(s->fvd_metadata,
+ s->journal_offset + journal_sec,
+ &acb->jcb.qiov, nb_journal_sectors,
+ write_metadata_to_journal_cb, acb);
+ if (!acb->jcb.hd_acb) {
+ write_metadata_to_journal_cb(acb, -1);
+ }
+}
+
+/* Only used for unbuffered journal update. Called when a journal write
+ * finishes: decrement the count of in-flight journal writes, and once the
+ * last one drains while requests are queued for free sectors, recycle the
+ * journal and restart every queued request. */
+static void ujnl_free_journal_sectors(BlockDriverState * bs)
+{
+ BDRVFvdState *s = bs->opaque;
+
+ if (s->journal_size <= 0) {
+ return;
+ }
+
+ s->ujnl.active_writes--;
+ ASSERT(s->ujnl.active_writes >= 0);
+
+ if (s->ujnl.active_writes > 0 || QLIST_EMPTY(&s->ujnl.wait4_recycle)) {
+ return;
+ }
+
+ /* Some requests are waiting for the journal to be recycled in order to
+ * get free journal sectors. */
+ recycle_journal(bs);
+
+ /* Restart requests in the ujnl.wait4_recycle list. First make a copy of
+ * the head and then empty the head. */
+ FvdAIOCB *acb = QLIST_FIRST(&s->ujnl.wait4_recycle);
+ QLIST_INIT(&s->ujnl.wait4_recycle);
+ FvdAIOCB *next;
+
+ /* Restart all dependent requests. Cannot use QLIST_FOREACH here, because
+ * the next link might not be the same any more after the callback. */
+ while (acb) {
+ next = acb->jcb.ujnl_next_wait4_recycle.le_next;
+ acb->jcb.ujnl_next_wait4_recycle.le_prev = NULL;
+ QDEBUG("WRITE: acb%llu-%p restart_write_metadata_to_journal "
+ "after recycle_journal\n", acb->uuid, acb);
+ if (acb->type == OP_WRITE) {
+ /* update_bitmap was saved on the acb when first queued. */
+ write_metadata_to_journal(acb, acb->write.update_bitmap);
+ } else {
+ write_metadata_to_journal(acb, false);
+ }
+ acb = next;
+ }
+}
+
+/* Only used for unbuffered journal update. Reserve 'nb_sectors' journal
+ * sectors for 'acb'. Returns the first reserved sector index, or -1 if
+ * the request was queued on ujnl.wait4_recycle to wait for a journal
+ * recycle (it is restarted later in ujnl_free_journal_sectors()). */
+static int64_t ujnl_allocate_journal_sectors(BlockDriverState * bs,
+ FvdAIOCB * acb, int nb_sectors)
+{
+ BDRVFvdState *s = bs->opaque;
+ ASSERT(nb_sectors <= s->journal_size);
+
+ if (!QLIST_EMPTY(&s->ujnl.wait4_recycle)) {
+ /* Waiting for journal recycle to finish. */
+ ASSERT(s->ujnl.active_writes > 0);
+ QDEBUG("WRITE: acb%llu-%p wait4_journal_recycle active_writes=%d\n",
+ acb->uuid, acb, s->ujnl.active_writes);
+ QLIST_INSERT_HEAD(&s->ujnl.wait4_recycle, acb,
+ jcb.ujnl_next_wait4_recycle);
+ return -1;
+ }
+
+ int64_t journal_sec;
+ if (s->next_journal_sector + nb_sectors <= s->journal_size) {
+ /* Fast path: enough free sectors at the current allocation point. */
+ journal_sec = s->next_journal_sector;
+ s->next_journal_sector += nb_sectors;
+ s->ujnl.active_writes++;
+ return journal_sec;
+ }
+
+ /* No free journal sector is available. Check if the journal can be
+ * recycled now. */
+ if (s->ujnl.active_writes == 0) {
+ recycle_journal(bs);
+ s->next_journal_sector = nb_sectors;
+ s->ujnl.active_writes = 1;
+ return 0; /* Use the first sector. */
+ }
+
+ /* Waiting for journal recycle to finish. It will be woken up later in
+ * ujnl_free_journal_sectors(). */
+ QLIST_INSERT_HEAD(&s->ujnl.wait4_recycle, acb, jcb.ujnl_next_wait4_recycle);
+ QDEBUG("WRITE: acb%llu-%p wait4_journal_recycle active_writes=%d\n",
+ acb->uuid, acb, s->ujnl.active_writes);
+ return -1;
+}
diff --git a/block/fvd-write.c b/block/fvd-write.c
index f0580d4..623ec83 100644
--- a/block/fvd-write.c
+++ b/block/fvd-write.c
@@ -15,6 +15,7 @@ static void write_metadata_to_journal(struct FvdAIOCB *acb, bool update_bitmap);
static int do_aio_write(struct FvdAIOCB *acb);
static void restart_dependent_writes(struct FvdAIOCB *acb);
static void free_write_resource(struct FvdAIOCB *acb);
+static void ujnl_free_journal_sectors(BlockDriverState * bs);
static inline BlockDriverAIOCB *store_data(int soft_write,
FvdAIOCB * parent_acb, BlockDriverState * bs,
int64_t sector_num, QEMUIOVector * orig_qiov, int nb_sectors,
diff --git a/block/fvd.c b/block/fvd.c
index 74845e7..2402a94 100644
--- a/block/fvd.c
+++ b/block/fvd.c
@@ -37,6 +37,7 @@
#include "block/fvd-store.c"
#include "block/fvd-load.c"
#include "block/fvd-journal.c"
+#include "block/fvd-journal-buf.c"
#include "block/fvd-prefetch.c"
#include "block/fvd-update.c"
@@ -65,3 +66,21 @@ static void bdrv_fvd_init(void)
}
block_init(bdrv_fvd_init);
+
+/* Since bdrv_close may not be properly invoked on a VM shutdown, we use a
+ * destructor to flush metadata to disk. This only affects performance and
+ * does not affect correctness. See Section 3.3.4 of the FVD-cow paper for
+ * the rationale. */
+extern QTAILQ_HEAD(, BlockDriverState) bdrv_states;
+ static void __attribute__ ((destructor)) flush_fvd_bitmap_to_disk(void)
+{
+ BlockDriverState *bs;
+ QTAILQ_FOREACH(bs, &bdrv_states, list) {
+ if (bs->drv == &bdrv_fvd) {
+ flush_metadata_to_disk_on_exit(bs);
+#ifdef FVD_DEBUG
+ dump_resource_summary(bs->opaque);
+#endif
+ }
+ }
+}
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 16/26] FVD: add impl for buffered journal updates
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (13 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 15/26] FVD: add basic journal functionality Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 17/26] FVD: add impl of bdrv_flush() and bdrv_aio_flush() Chunqiang Tang
` (9 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch enhances FVD's journal with the capability of buffering
multiple metadata updates and sending them to the journal in a single write.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-journal-buf.c | 336 ++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 333 insertions(+), 3 deletions(-)
diff --git a/block/fvd-journal-buf.c b/block/fvd-journal-buf.c
index 3efdd47..b4077ce 100644
--- a/block/fvd-journal-buf.c
+++ b/block/fvd-journal-buf.c
@@ -20,15 +20,345 @@
* case for cache!=writethrough.
*============================================================================*/
+static inline int bjnl_write_buf(FvdAIOCB *acb);
+static void bjnl_send_current_buf_to_write_queue(BlockDriverState *bs);
+
+/* Retire a journal-buffer write acb: free its data buffer, unlink it from
+ * bjnl.queued_bufs and release the acb. A nonzero ret marks the whole
+ * driver as write-prohibited to avoid further metadata corruption. */
+static inline void bjnl_finish_write_buf(FvdAIOCB *acb, int ret)
+{
+ ASSERT (acb->type == OP_BJNL_BUF_WRITE);
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ QDEBUG("JOURNAL: bjnl_finish_write_buf acb%llu-%p\n", acb->uuid, acb);
+
+ my_qemu_vfree(acb->jcb.iov.iov_base);
+ QTAILQ_REMOVE(&s->bjnl.queued_bufs, acb, jcb.bjnl_next_queued_buf);
+ my_qemu_aio_release(acb);
+
+ if (ret != 0) {
+ s->metadata_err_prohibit_write = true;
+ }
+}
+
+/* Start the next queued journal-buffer write. Keeps advancing past entries
+ * whose bjnl_write_buf() returns nonzero (already finished or failed). */
+static inline void bjnl_write_next_buf(BDRVFvdState *s)
+{
+ FvdAIOCB *acb;
+ while ((acb = QTAILQ_FIRST(&s->bjnl.queued_bufs))) {
+ if (bjnl_write_buf(acb) == 0) {
+ return; /* An asynchronous write is now in flight. */
+ }
+ }
+}
+
+/* Completion callback for the flush started on behalf of an OP_BJNL_FLUSH
+ * acb; forwards the result to the original fvd_aio_flush() caller. */
+static inline void bjnl_aio_flush_cb(void *opaque, int ret)
+{
+ FvdAIOCB *acb = (FvdAIOCB *) opaque;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ QDEBUG("JOURNAL: bjnl_aio_flush_cb acb%llu-%p\n", acb->uuid, acb);
+
+ /* Invoke the callback initially provided to fvd_aio_flush(). */
+ acb->common.cb(acb->common.opaque, ret);
+ my_qemu_aio_release(acb);
+}
+
+/* Completion callback of the journal write issued by bjnl_write_buf_start().
+ * Retires the finished buffer and kicks off the next queued one. */
+static inline void bjnl_write_buf_cb(void *opaque, int ret)
+{
+ FvdAIOCB *acb = (FvdAIOCB *) opaque;
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ QDEBUG("JOURNAL: bjnl_write_buf_cb acb%llu-%p\n", acb->uuid, acb);
+ bjnl_finish_write_buf(acb, ret);
+ bjnl_write_next_buf(s);
+}
+
+#ifndef ENABLE_QDEBUG
+# define PRINT_JRECORDS(buf,len) do{}while(0)
+#else
+static void print_jrecords(const uint8_t *buf, size_t len);
+# define PRINT_JRECORDS print_jrecords
+#endif
+
+/* Issue the asynchronous journal write for a buffered-journal acb, recycling
+ * the journal first if it lacks room. Returns 0 when the write is in flight;
+ * on failure the buffer is retired via bjnl_finish_write_buf() and a nonzero
+ * error is returned. */
+static int bjnl_write_buf_start(FvdAIOCB *acb)
+{
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+ int64_t journal_sec;
+ int nb_sectors = acb->jcb.iov.iov_len / 512;
+ int ret;
+
+ ASSERT (nb_sectors <= s->journal_size);
+ QDEBUG("JOURNAL: bjnl_write_buf_start acb%llu-%p\n", acb->uuid, acb);
+
+ if (s->next_journal_sector + nb_sectors <= s->journal_size) {
+ journal_sec = s->next_journal_sector;
+ s->next_journal_sector += nb_sectors;
+ } else {
+ /* Journal is full; recycle it and restart from the first sector. */
+ if ((ret = recycle_journal(bs))) {
+ goto fail;
+ }
+ journal_sec = 0;
+ s->next_journal_sector = nb_sectors;
+ }
+
+ PRINT_JRECORDS(acb->jcb.iov.iov_base, acb->jcb.iov.iov_len);
+
+ acb->jcb.hd_acb = bdrv_aio_writev(s->fvd_metadata,
+ s->journal_offset + journal_sec,
+ &acb->jcb.qiov, nb_sectors,
+ bjnl_write_buf_cb, acb);
+ if (acb->jcb.hd_acb) {
+ return 0;
+ } else {
+ ret = -EIO;
+ }
+
+fail:
+ bjnl_finish_write_buf(acb, ret);
+ return ret;
+}
+
+/* Callback of the fvd_data flush that must precede committing bitmap
+ * updates to the journal (see bjnl_write_buf()). On success it starts the
+ * journal write; on any failure it retires this buffer and moves on. */
+static void bjnl_flush_data_before_update_bitmap_cb(void *opaque, int ret)
+{
+ FvdAIOCB *acb = opaque;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ QDEBUG("JOURNAL: bjnl_flush_data_before_update_bitmap_cb acb%llu-%p\n",
+ acb->uuid, acb);
+
+ if (ret != 0) {
+ bjnl_finish_write_buf(acb, ret);
+ } else if (bjnl_write_buf_start(acb) == 0) {
+ return; /* Journal write started successfully. */
+ }
+
+ /* Either the flush or the write start failed; try the next buffer. */
+ bjnl_write_next_buf(acb->common.bs->opaque);
+}
+
+/* Write one queued journal buffer. If the buffer carries bitmap updates,
+ * fvd_data is flushed first; otherwise the journal write starts directly.
+ * Returns 0 when an asynchronous operation is in flight. */
+static inline int bjnl_write_buf(FvdAIOCB *acb)
+{
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ QDEBUG("JOURNAL: bjnl_write_buf acb%llu-%p\n", acb->uuid, acb);
+
+ if (!acb->jcb.bitmap_updated) {
+ return bjnl_write_buf_start(acb);
+ }
+
+ /* If bitmap_updated, fvd_data need be flushed first before bitmap changes
+ * can be committed. Otherwise, if the host crashes after bitmap metadata
+ * are updated but before the corresponding data are persisted on disk,
+ * the VM will get corrupted data, as correct data may be in the base
+ * image. */
+ acb->jcb.hd_acb = bdrv_aio_flush(s->fvd_data,
+ bjnl_flush_data_before_update_bitmap_cb,
+ acb);
+ if (acb->jcb.hd_acb) {
+ return 0;
+ } else {
+ bjnl_finish_write_buf(acb, -1);
+ return -1;
+ }
+}
+
+/* Seal the current in-memory journal buffer and append it to
+ * bjnl.queued_bufs as an OP_BJNL_BUF_WRITE acb. Starts the write
+ * immediately if the queue was empty, otherwise lets the queue (or the
+ * clean-buf timer) drive it. Clears s->bjnl.buf in all cases. */
+static void bjnl_send_current_buf_to_write_queue(BlockDriverState *bs)
+{
+ BDRVFvdState *s = bs->opaque;
+
+ if (!s->bjnl.buf) {
+ return;
+ }
+ if (s->bjnl.buf_used == 0) {
+ /* Nothing buffered; just drop the empty buffer. */
+ my_qemu_vfree (s->bjnl.buf);
+ s->bjnl.buf = NULL;
+ return;
+ }
+ if (s->bjnl.buf_used < s->bjnl.buf_size) {
+ /* Mark the end of the buffer as EMPTY_JRECORD. */
+ *((uint32_t*)(s->bjnl.buf + s->bjnl.buf_used)) = EMPTY_JRECORD;
+ }
+
+ /* Create a new acb and put it in the queue of bjnl.queued_bufs. */
+ FvdAIOCB *acb = my_qemu_aio_get(&fvd_aio_pool, bs, NULL, NULL);
+ if (!acb) {
+ /* Cannot queue the updates; prohibit writes to stay safe. */
+ s->metadata_err_prohibit_write = true;
+ my_qemu_vfree (s->bjnl.buf);
+ s->bjnl.buf = NULL;
+ return;
+ }
+
+ acb->type = OP_BJNL_BUF_WRITE;
+ acb->cancel_in_progress = false;
+ acb->jcb.iov.iov_base = s->bjnl.buf;
+ acb->jcb.iov.iov_len = ROUND_UP(s->bjnl.buf_used, 512); /* Full jnl sector*/
+ acb->jcb.hd_acb = NULL;
+ acb->jcb.bitmap_updated = s->bjnl.buf_contains_bitmap_update;
+ s->bjnl.buf_contains_bitmap_update = false;
+ qemu_iovec_init_external(&acb->jcb.qiov, &acb->jcb.iov, 1);
+ QTAILQ_INSERT_TAIL(&s->bjnl.queued_bufs, acb, jcb.bjnl_next_queued_buf);
+
+ PRINT_JRECORDS(acb->jcb.iov.iov_base, acb->jcb.iov.iov_len);
+
+ /* If no ongoing journal write, start this one. */
+ if (acb == QTAILQ_FIRST(&s->bjnl.queued_bufs)) {
+ /* Since this acb is not owned by any VM-generated request, it can
+ * only be started in context_id 0. Otherwise, qemu_aio_wait() may
+ * never process the callbacks generated by bjnl_write_buf(). */
+ if (get_async_context_id() == 0) {
+ bjnl_write_buf(acb);
+ } else {
+ /* Start journal write in the timer callback. */
+ qemu_mod_timer(s->bjnl.clean_buf_timer, qemu_get_clock(rt_clock));
+ s->bjnl.timer_scheduled = true;
+ }
+ }
+
+ s->bjnl.buf = NULL;
+ QDEBUG("JOURNAL: acb%llu-%p added to bjnl_write_queue\n", acb->uuid, acb);
+}
+
static uint8_t * bjnl_alloc_journal_records_from_buf(BlockDriverState *bs,
bool update_bitmap,
size_t record_size)
{
- return NULL;
-}
+ /* Reserve record_size bytes in the in-memory journal buffer for one
+ * journal record, never letting a record straddle a 512-byte sector
+ * boundary. Starts a fresh buffer (queuing the full one) when needed,
+ * and arms the clean-buf timer so dirty data is written out later. */
+ BDRVFvdState *s = bs->opaque;
+
+ if (!s->bjnl.timer_scheduled) {
+ QDEBUG("JOURNAL: bjnl_start_timer\n");
+ /* Now we have dirty data. Start a timer to write it out later. */
+ int64_t expire = qemu_get_clock(rt_clock) + s->bjnl.clean_buf_period;
+ qemu_mod_timer(s->bjnl.clean_buf_timer, expire);
+ s->bjnl.timer_scheduled = true;
+ }
+
+ if (s->bjnl.buf && s->bjnl.buf_used + record_size <= s->bjnl.buf_size) {
+ size_t current_sector_left = 512 - s->bjnl.buf_used % 512;
+ if (current_sector_left >= record_size) {
+ /* Continue to use current sector.*/
+use_current_buf:
+ QDEBUG("JOURNAL: bjnl_alloc_buf buf_used=%zu new=%zu limit=%zu\n",
+ s->bjnl.buf_used, record_size, s->bjnl.buf_size);
+ uint8_t *buf = s->bjnl.buf + s->bjnl.buf_used;
+ s->bjnl.buf_used += record_size;
+ if (update_bitmap) {
+ s->bjnl.buf_contains_bitmap_update = true;
+ }
+ return buf;
+ }
+
+ /* Mark the end of the valid section of the current buffer sector
+ * and start to use the next sector.*/
+ *((uint32_t*)(s->bjnl.buf + s->bjnl.buf_used)) = EMPTY_JRECORD;
+ s->bjnl.buf_used += current_sector_left;
+ if (s->bjnl.buf_used + record_size <= s->bjnl.buf_size) {
+ goto use_current_buf;
+ }
+ }
+
+ if (s->bjnl.buf) {
+ QDEBUG("JOURNAL: bjnl_buf_full_start_new buf_used=%zu new=%zu "
+ "limit=%zu\n", s->bjnl.buf_used, record_size, s->bjnl.buf_size);
+ } else {
+ QDEBUG("JOURNAL: bjnl_buf_full_start_new no_current_buf\n");
+ }
+
+ /* Need to start a new buffer. Send current buffer to write queue first. */
+ bjnl_send_current_buf_to_write_queue(bs);
+
+ s->bjnl.buf_used = record_size;
+ record_size = ROUND_UP(record_size, 512);
+ s->bjnl.buf_size = MAX(record_size, s->bjnl.def_buf_size);
+ s->bjnl.buf_contains_bitmap_update = update_bitmap;
+ s->bjnl.buf = my_qemu_blockalign(s->fvd_metadata, s->bjnl.buf_size);
+
+ return s->bjnl.buf;
+}
static void bjnl_clean_buf_timer_cb(BlockDriverState * bs)
{
- /* To be implemented. */
+ /* Timer callback: flush the current in-memory journal buffer to the
+ * write queue and, running in context_id 0, start the first queued
+ * write if none is in flight. Re-armed on demand by
+ * bjnl_alloc_journal_records_from_buf(). */
+ BDRVFvdState *s = bs->opaque;
+ FvdAIOCB *acb;
+
+ ASSERT (get_async_context_id() == 0);
+ QDEBUG("JOURNAL: bjnl_timer_expired\n");
+
+ /* Clean the current buffer. */
+ if (s->bjnl.buf && s->bjnl.buf_used > 0) {
+ QDEBUG ("JOURNAL: timer bjnl_send_current_buf_to_write_queue\n");
+ bjnl_send_current_buf_to_write_queue(bs);
+ }
+
+ /* Start writing the first buffer if it is not already started. */
+ while ((acb = QTAILQ_FIRST(&s->bjnl.queued_bufs)) && !acb->jcb.hd_acb) {
+ QDEBUG("JOURNAL: acb%llu-%p bjnl_write_buf by timer",
+ acb->uuid, acb);
+ if (bjnl_write_buf(acb) == 0) {
+ break;
+ }
+ }
+
+ /* The timer is no longer scheduled. It will be scheduled when needed. */
+ s->bjnl.timer_scheduled = false;
+}
+
+#ifdef ENABLE_QDEBUG
+/* Debug-only pretty-printer: walk len bytes (a multiple of 512) of journal
+ * sectors and log every BITMAP_JRECORD and TABLE_JRECORD, stopping each
+ * sector at its EMPTY_JRECORD terminator. */
+static void print_jrecords(const uint8_t *sector, size_t len)
+{
+ const uint8_t *end = sector + len;
+ ASSERT(len % 512 == 0);
+
+ QDEBUG("JOURNAL: write bjnl_records\n");
+ while (sector < end) {
+ uint32_t *type = (uint32_t *) sector; /* Journal record type. */
+ while ((uint8_t *) type < (sector + 512)) {
+ if (le32_to_cpu(*type) == BITMAP_JRECORD) {
+ uint32_t *nb_sectors = type + 1;
+ int64_t *sector_num = (int64_t *) (type + 2);
+
+ QDEBUG("JOURNAL: write BITMAP_JRECORD sector_num=%" PRId64
+ " nb_sectors=%u\n", le64_to_cpu(*sector_num),
+ le32_to_cpu(*nb_sectors));
+
+ /* First field of the next journal record. */
+ type = (uint32_t *) (sector_num + 1);
+ } else if (le32_to_cpu(*type) == TABLE_JRECORD) {
+ uint64_t *epoch = (uint64_t *) (type + 1);
+ uint32_t *count = (uint32_t *) (epoch + 1);
+ uint32_t *offset = count + 1;
+ uint32_t *content = offset + 1;
+ const uint32_t chunk = le32_to_cpu(*offset);
+ const uint64_t epo = le64_to_cpu(*epoch);
+ const uint32_t n = le32_to_cpu(*count);
+ uint32_t i;
+
+ QDEBUG("JOURNAL: write TABLE_JRECORD epoch=%" PRIu64
+ " chunk_start=%u " "nb_chunks=%u\n", epo, chunk, n);
+ for (i = 0; i < n; i++) {
+ QDEBUG("\tMap chunk %u to %u\n", chunk + i,
+ READ_TABLE(content[i]));
+ }
+
+ type = content + n; /* First field of the next record. */
+ } else {
+ /* End of valid records in this journal sector. */
+ ASSERT(le32_to_cpu(*type) == EMPTY_JRECORD);
+ break;
+ }
+ }
+
+ sector += 512;
+ }
}
+#endif
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 17/26] FVD: add impl of bdrv_flush() and bdrv_aio_flush()
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (14 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 16/26] FVD: add impl for buffered journal updates Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 18/26] FVD: add support for base image prefetching Chunqiang Tang
` (8 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds FVD's implementation of the bdrv_flush() and bdrv_aio_flush()
interfaces.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-flush.c | 176 +++++++++++++++++++++++++++++++++++++-
block/fvd-journal-buf.c | 218 +++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 390 insertions(+), 4 deletions(-)
diff --git a/block/fvd-flush.c b/block/fvd-flush.c
index 34bd5cb..6658d27 100644
--- a/block/fvd-flush.c
+++ b/block/fvd-flush.c
@@ -1,5 +1,5 @@
/*
- * QEMU Fast Virtual Disk Format bdrv_flush() and bdrv_aio_flush()
+ * QEMU Fast Virtual Disk Format Misc Functions of BlockDriver Interface
*
* Copyright IBM, Corp. 2010
*
@@ -11,14 +11,182 @@
*
*/
+static void aio_wrapper_bh(void *opaque);
+static int bjnl_sync_flush(BlockDriverState * bs);
+static bool bjnl_clean_buf_on_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc * cb,
+ void *opaque, BlockDriverAIOCB **p_acb);
+static BlockDriverAIOCB *fvd_aio_flush_start(BlockDriverState * bs,
+ BlockDriverCompletionFunc * cb,
+ void *opaque, FvdAIOCB *parent_acb);
+
+/* Synchronous flush entry point of the FVD BlockDriver. Returns 0 on
+ * success or a negative errno. For buffered journal mode, delegates to
+ * bjnl_sync_flush(); otherwise flushes fvd_data and fvd_metadata. */
+static int fvd_flush(BlockDriverState * bs)
+{
+ BDRVFvdState *s = bs->opaque;
+ int ret;
+
+ QDEBUG("fvd_flush() invoked\n");
+
+ if (s->metadata_err_prohibit_write) {
+ return -EIO;
+ }
+
+ if (!s->fvd_metadata->enable_write_cache) {
+ /* No need to flush since it uses O_DSYNC. */
+ return 0;
+ }
+
+ if (s->use_bjnl) {
+ return bjnl_sync_flush(bs);
+ }
+
+ /* Simply flush for unbuffered journal update. */
+ if ((ret = bdrv_flush(s->fvd_data))) {
+ return ret;
+ }
+ if (s->fvd_metadata == s->fvd_data) {
+ /* Data and metadata share one file; a single flush suffices. */
+ return 0;
+ }
+ return bdrv_flush(s->fvd_metadata);
+}
+
static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState * bs,
BlockDriverCompletionFunc * cb,
void *opaque)
{
- return NULL;
+ /* Asynchronous flush entry point of the FVD BlockDriver. Returns the
+ * acb of the in-flight flush, or NULL on failure. For buffered journal
+ * mode the journal buffers are drained before the real flush starts. */
+ BDRVFvdState *s = bs->opaque;
+ BlockDriverAIOCB * pacb;
+ FvdAIOCB *acb;
+
+ QDEBUG("fvd_aio_flush() invoked\n");
+
+ if (s->metadata_err_prohibit_write) {
+ return NULL;
+ }
+
+ if (!s->fvd_data->enable_write_cache) {
+ /* Need to flush since it uses O_DSYNC. Use a QEMUBH to invoke the
+ * callback. */
+
+ if (!(acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque))) {
+ return NULL;
+ }
+
+ acb->type = OP_WRAPPER;
+ acb->cancel_in_progress = false;
+ acb->wrapper.bh = qemu_bh_new(aio_wrapper_bh, acb);
+ qemu_bh_schedule(acb->wrapper.bh);
+ return &acb->common;
+ }
+
+ if (!s->use_bjnl) {
+ QDEBUG("FLUSH: start now for unbuffered journal update");
+ return fvd_aio_flush_start(bs, cb, opaque, NULL);
+ }
+
+ if (bjnl_clean_buf_on_aio_flush(bs, cb, opaque, &pacb)) {
+ /* Waiting for the journal buffer to be cleaned first. */
+ return pacb;
+ }
+
+ /* No buffered journal data. Start flush now. */
+ QDEBUG("FLUSH: start now as no buffered journal data");
+ return fvd_aio_flush_start(bs, cb, opaque, NULL);
+}
+
+/* Deliver the final result of an OP_FLUSH acb to its caller and release
+ * the acb. Called once both sub-flushes have completed. */
+static inline void finish_flush(FvdAIOCB * acb)
+{
+ QDEBUG("FLUSH: acb%llu-%p finish_flush ret=%d\n",
+ acb->uuid, acb, acb->flush.ret);
+ acb->common.cb(acb->common.opaque, acb->flush.ret);
+ my_qemu_aio_release(acb);
}
-static int fvd_flush(BlockDriverState * bs)
+/* Completion callback of the fvd_data sub-flush of an OP_FLUSH acb.
+ * Records the first error seen and finishes once both sub-flushes
+ * (data and metadata) are done. */
+static void flush_data_cb(void *opaque, int ret)
{
- return -ENOTSUP;
+ FvdAIOCB *acb = opaque;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ QDEBUG("FLUSH: acb%llu-%p flush_data_cb ret=%d\n", acb->uuid, acb, ret);
+
+ if (acb->flush.ret == 0) {
+ acb->flush.ret = ret;
+ }
+
+ acb->flush.data_acb = NULL;
+ acb->flush.num_finished++;
+ if (acb->flush.num_finished == 2) {
+ finish_flush(acb);
+ }
+}
+
+/* Completion callback of the fvd_metadata sub-flush of an OP_FLUSH acb.
+ * Mirrors flush_data_cb(): records the first error and finishes once
+ * both sub-flushes are done. */
+static void flush_metadata_cb(void *opaque, int ret)
+{
+ FvdAIOCB *acb = opaque;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ QDEBUG("FLUSH: acb%llu-%p flush_metadata_cb ret=%d\n",
+ acb->uuid, acb, ret);
+
+ if (acb->flush.ret == 0) {
+ acb->flush.ret = ret;
+ }
+
+ acb->flush.metadata_acb = NULL;
+ acb->flush.num_finished++;
+ if (acb->flush.num_finished == 2) {
+ finish_flush(acb);
+ }
+}
+
+/* Start the real flush. When fvd_data and fvd_metadata are the same file,
+ * a single bdrv_aio_flush() suffices; otherwise an OP_FLUSH acb fans out
+ * to both and completes when the second callback fires. Returns NULL on
+ * failure to start either sub-flush. */
+static BlockDriverAIOCB *fvd_aio_flush_start(BlockDriverState * bs,
+ BlockDriverCompletionFunc * cb,
+ void *opaque, FvdAIOCB *parent_acb)
+{
+ BDRVFvdState *s = bs->opaque;
+ FvdAIOCB *acb;
+
+ if (s->fvd_data == s->fvd_metadata) {
+ if (parent_acb) {
+ QDEBUG("FLUSH: acb%llu-%p started.\n",parent_acb->uuid,parent_acb);
+ }
+ return bdrv_aio_flush(s->fvd_metadata, cb, opaque);
+ }
+
+ acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
+ if (!acb) {
+ return NULL;
+ }
+ COPY_UUID(acb, parent_acb); /* UUID helps debugging. */
+
+ /* fvd_data and fvd_metadata are different. Need to flush both. The order
+ * is not important. If (cache != writethrough && bitmap_updated), a flush
+ * on fvd_data must have already been performed by write_journal_buf(). */
+
+ acb->type = OP_FLUSH;
+ acb->cancel_in_progress = false;
+ acb->flush.num_finished = 0;
+ acb->flush.ret = 0;
+ acb->flush.data_acb = bdrv_aio_flush(s->fvd_data, flush_data_cb, acb);
+ if (!acb->flush.data_acb) {
+ my_qemu_aio_release(acb);
+ return NULL;
+ }
+
+ acb->flush.metadata_acb = bdrv_aio_flush(s->fvd_metadata,
+ flush_metadata_cb, acb);
+ if (!acb->flush.metadata_acb) {
+ /* Could not start the second flush; cancel the first and fail. */
+ bdrv_aio_cancel(acb->flush.data_acb);
+ my_qemu_aio_release(acb);
+ return NULL;
+ }
+
+ QDEBUG("FLUSH: acb%llu-%p started.\n", acb->uuid, acb);
+ return &acb->common;
}
diff --git a/block/fvd-journal-buf.c b/block/fvd-journal-buf.c
index b4077ce..e99a585 100644
--- a/block/fvd-journal-buf.c
+++ b/block/fvd-journal-buf.c
@@ -23,6 +23,48 @@
static inline int bjnl_write_buf(FvdAIOCB *acb);
static void bjnl_send_current_buf_to_write_queue(BlockDriverState *bs);
+/* Return false if no buffered journal data. Invoked by fvd_aio_flush().
+ * Otherwise queue the current buffer for writing (if non-empty) and append
+ * an OP_BJNL_FLUSH marker acb to bjnl.queued_bufs so the aio_flush callback
+ * runs only after all pending journal writes; *p_acb receives the marker
+ * acb (NULL on allocation failure) and true is returned. */
+static bool bjnl_clean_buf_on_aio_flush(BlockDriverState *bs,
+ BlockDriverCompletionFunc * cb,
+ void *opaque, BlockDriverAIOCB **p_acb)
+{
+ BDRVFvdState *s = bs->opaque;
+ FvdAIOCB *acb;
+
+ if (!s->bjnl.buf || s->bjnl.buf_used == 0) {
+ /* The current journal buffer is empty. */
+
+ if (QTAILQ_EMPTY(&s->bjnl.queued_bufs)) {
+ return false; /* Indicate no previously buffered journal data. */
+ }
+ } else {
+ QDEBUG("JOURNAL: bjnl_clean_buf_on_aio_flush invoke "
+ "bjnl_send_current_buf_to_write_queue\n");
+ bjnl_send_current_buf_to_write_queue(bs);
+ }
+
+ /* Append an acb at the tail of bjnl.queued_bufs to invoke the aio_flush
+ * callback after all previous pending journal writes finish. See
+ * bjnl_write_next_buf() -> bjnl_write_buf(). */
+
+ acb = my_qemu_aio_get(&fvd_aio_pool, bs, cb, opaque);
+ if (!acb) {
+ *p_acb = NULL; /* Indicate failure. */
+ return true;
+ }
+
+ acb->type = OP_BJNL_FLUSH;
+ acb->cancel_in_progress = false;
+ acb->jcb.iov.iov_base = NULL; /* Indicate no data. */
+ acb->jcb.hd_acb = NULL;
+ acb->jcb.bitmap_updated = false;
+ QTAILQ_INSERT_TAIL(&s->bjnl.queued_bufs, acb, jcb.bjnl_next_queued_buf);
+ *p_acb = &acb->common;
+
+ QDEBUG("JOURNAL: inserted OP_BJNL_FLUSH acb%llu-%p\n", acb->uuid, acb);
+ return true;
+}
+
static inline void bjnl_finish_write_buf(FvdAIOCB *acb, int ret)
{
ASSERT (acb->type == OP_BJNL_BUF_WRITE);
@@ -65,6 +107,30 @@ static inline void bjnl_aio_flush_cb(void *opaque, int ret)
my_qemu_aio_release(acb);
}
+/* This acb is inserted by bjnl_clean_buf_on_aio_flush() on behalf of a
+ * pending bdrv_aio_flush(). Reached when the queue drains down to this
+ * marker: start the real flush, or report -EIO to the caller if writes
+ * are prohibited or the flush cannot be started. */
+static inline void bjnl_handle_aio_flush(FvdAIOCB *acb)
+{
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ QTAILQ_REMOVE(&s->bjnl.queued_bufs, acb, jcb.bjnl_next_queued_buf);
+
+ if (!s->metadata_err_prohibit_write) {
+ /* Buffered data have been written to journal. Now start flush. */
+ QDEBUG("JOURNAL: bjnl_start_flush for acb%llu-%p\n", acb->uuid, acb);
+ acb->jcb.hd_acb = fvd_aio_flush_start(bs, bjnl_aio_flush_cb, acb, acb);
+ if (acb->jcb.hd_acb) {
+ return;
+ }
+ }
+
+ QDEBUG("JOURNAL: bjnl_handle_aio_flush err acb%llu-%p\n", acb->uuid, acb);
+ /* Failed. Invoke aio_flush callback. */
+ acb->common.cb(acb->common.opaque, -EIO);
+ my_qemu_aio_release(acb);
+}
+
static inline void bjnl_write_buf_cb(void *opaque, int ret)
{
FvdAIOCB *acb = (FvdAIOCB *) opaque;
@@ -153,6 +219,14 @@ static inline int bjnl_write_buf(FvdAIOCB *acb)
QDEBUG("JOURNAL: bjnl_write_buf acb%llu-%p\n", acb->uuid, acb);
+ if (acb->type == OP_BJNL_FLUSH) {
+ bjnl_handle_aio_flush(acb);
+
+ /* Return -1 to tell bjnl_write_next_buf() to move on to the next
+ * buffer write as no buffered journal data are being written.*/
+ return -1;
+ }
+
if (!acb->jcb.bitmap_updated) {
return bjnl_write_buf_start(acb);
}
@@ -313,6 +387,150 @@ static void bjnl_clean_buf_timer_cb(BlockDriverState * bs)
s->bjnl.timer_scheduled = false;
}
+/* Perform a synchronous flush. Invoked by fvd_close() and fvd_flush().
+ * Gathers all buffered metadata updates (current buffer plus every queued
+ * OP_BJNL_BUF_WRITE), writes them to the journal with one synchronous
+ * write, flushes fvd_data/fvd_metadata, then frees all buffers, completes
+ * any queued OP_BJNL_FLUSH callbacks and empties the queue. */
+static int bjnl_sync_flush(BlockDriverState * bs)
+{
+ BDRVFvdState *s = bs->opaque;
+ FvdAIOCB *acb, *a;
+ int ret = 0;
+ size_t buf_size;
+ uint8_t *p, *buf = NULL;
+ bool bitmap_updated = false;
+ int nb_sectors;
+ int64_t journal_sec;
+
+ /* Calculate the total buffered metadata updates. Check the current buffer
+ * first. */
+ if (!s->bjnl.buf) {
+ buf_size = 0;
+ } else if (s->bjnl.buf_used == 0) {
+ buf_size = 0;
+ } else {
+ if (s->bjnl.buf_used < s->bjnl.buf_size) {
+ /* Mark the end of the buffer as EMPTY_JRECORD. */
+ *((uint32_t*)(s->bjnl.buf + s->bjnl.buf_used)) = EMPTY_JRECORD;
+ }
+ buf_size = s->bjnl.buf_used = ROUND_UP(s->bjnl.buf_used, 512);
+ bitmap_updated = s->bjnl.buf_contains_bitmap_update;
+ }
+
+ /* Go through the queued buffers. */
+ acb = QTAILQ_FIRST(&s->bjnl.queued_bufs);
+ if (acb) {
+ if (acb->jcb.hd_acb) {
+ /* The first acb is the ongoing operation. Cancel and re-do it
+ * synchronously below. */
+ QDEBUG("JOURNAL: bjnl_sync_flush cancel ongoing buf_write "
+ "acb%llu-%p\n", acb->uuid, acb);
+ bdrv_aio_cancel(acb->jcb.hd_acb);
+ }
+
+ /* Calculate buf_size. */
+ while (acb) {
+ if (acb->type == OP_BJNL_BUF_WRITE) {
+ buf_size += acb->jcb.iov.iov_len;
+ if (acb->jcb.bitmap_updated) {
+ bitmap_updated = true;
+ }
+ }
+ acb = QTAILQ_NEXT(acb, jcb.bjnl_next_queued_buf);
+ }
+ }
+
+ if (buf_size == 0) {
+ QDEBUG("JOURNAL: bjnl_sync_flush no_data\n");
+ goto done; /* No buffered metadata updates. */
+ }
+
+ if (bitmap_updated) {
+ /* Need a flush to ensure the correct semantics of copy-on-write in
+ * the event of a host crash. */
+ QDEBUG("JOURNAL: bjnl_sync_flush bitmap_updated flush_fvd_data\n");
+ if ((ret = bdrv_flush(s->fvd_data))) {
+ goto cleanup;
+ }
+ }
+
+ /* Allocate journal sectors. */
+ ASSERT(buf_size % 512 == 0);
+ nb_sectors = buf_size / 512;
+ if (s->next_journal_sector + nb_sectors > s->journal_size) {
+ QDEBUG("JOURNAL: bjnl_sync_flush recycle_journal\n");
+ ret = recycle_journal(bs);
+ /* Journal recycle writes out the entire bitmap and table. Therefore,
+ * there is no need to write buffered metadata updates to journal. */
+ /* NOTE(review): if recycle_journal() failed, ret is overwritten by
+ * the bdrv_flush() calls under 'done:' below — confirm the error is
+ * not meant to be propagated. */
+ goto done;
+ }
+ journal_sec = s->next_journal_sector;
+ s->next_journal_sector += nb_sectors;
+
+ /* Copy all metadata updates into one buffer. */
+ p = buf = my_qemu_blockalign(s->fvd_metadata, buf_size);
+ acb = QTAILQ_FIRST(&s->bjnl.queued_bufs);
+ while (acb) {
+ if (acb->type == OP_BJNL_BUF_WRITE) {
+ QDEBUG("JOURNAL: bjnl_sync_flush takes care buf_write acb%llu-%p\n",
+ acb->uuid, acb);
+ ASSERT(acb->jcb.iov.iov_len > 0);
+ memcpy(p, acb->jcb.iov.iov_base, acb->jcb.iov.iov_len);
+ PRINT_JRECORDS(p, acb->jcb.iov.iov_len);
+ p += acb->jcb.iov.iov_len;
+ }
+ acb = QTAILQ_NEXT(acb, jcb.bjnl_next_queued_buf);
+ }
+
+ if (s->bjnl.buf && s->bjnl.buf_used > 0) {
+ /* Copy the current buffer. */
+ memcpy(p, s->bjnl.buf, s->bjnl.buf_used);
+ PRINT_JRECORDS(p, s->bjnl.buf_used);
+ }
+
+ /* Write all metadata updates synchronously. */
+ QDEBUG("JOURNAL: bjnl_sync_flush write_buffer\n");
+ if ((ret=bdrv_write(s->fvd_metadata, s->journal_offset + journal_sec,
+ buf, nb_sectors)) < 0) {
+ goto cleanup;
+ }
+
+done:
+ /* Flush finally. */
+ QDEBUG("JOURNAL: bjnl_sync_flush do final flush\n");
+ if (s->fvd_data != s->fvd_metadata) {
+ if ((ret = bdrv_flush(s->fvd_data)) != 0) {
+ goto cleanup;
+ }
+ }
+ ret = bdrv_flush(s->fvd_metadata);
+
+cleanup:
+ if (buf) {
+ my_qemu_vfree(buf);
+ }
+ if (s->bjnl.buf) {
+ my_qemu_vfree (s->bjnl.buf);
+ s->bjnl.buf = NULL;
+ }
+
+ /* Release every queued acb; pending aio_flush callbacks get ret. */
+ acb = QTAILQ_FIRST(&s->bjnl.queued_bufs);
+ QTAILQ_INIT(&s->bjnl.queued_bufs);
+ while (acb) {
+ if (acb->type == OP_BJNL_BUF_WRITE) {
+ my_qemu_vfree(acb->jcb.iov.iov_base);
+ } else {
+ ASSERT(acb->type == OP_BJNL_FLUSH);
+ /* Invoke the callback for bdrv_aio_flush(). */
+ QDEBUG("JOURNAL: aio_flush acb%llu-%p finished by sync_flush\n",
+ acb->uuid, acb);
+ acb->common.cb(acb->common.opaque, ret);
+ }
+ a = acb;
+ acb = QTAILQ_NEXT(acb, jcb.bjnl_next_queued_buf);
+ my_qemu_aio_release(a);
+ }
+
+ return ret;
+}
+
#ifdef ENABLE_QDEBUG
static void print_jrecords(const uint8_t *sector, size_t len)
{
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 18/26] FVD: add support for base image prefetching
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (15 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 17/26] FVD: add impl of bdrv_flush() and bdrv_aio_flush() Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:37 ` [Qemu-devel] [PATCH 19/26] FVD: add support for aio_cancel Chunqiang Tang
` (7 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds adaptive prefetching of base image to FVD. FVD supports both
copy-on-write and copy-on-read of base image. Adaptive prefetching is similar
to copy-on-read except that it is initiated by the FVD driver rather than
triggered by the VM's read requests. FVD's prefetching is conservative in
that, if it detects resource contention, it will back off and temporarily
pause prefetching.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-prefetch.c | 600 +++++++++++++++++++++++++++++++++++++++++++++++++-
block/fvd-read.c | 1 +
qemu-io-sim.c | 13 +
3 files changed, 613 insertions(+), 1 deletions(-)
diff --git a/block/fvd-prefetch.c b/block/fvd-prefetch.c
index 5844aa7..b8be98c 100644
--- a/block/fvd-prefetch.c
+++ b/block/fvd-prefetch.c
@@ -11,7 +11,605 @@
*
*/
+static void prefetch_read_cb(void *opaque, int ret);
+static void resume_prefetch(BlockDriverState * bs);
+static void do_next_prefetch_read(BlockDriverState * bs, int64_t current_time);
+
void fvd_init_prefetch(void *opaque)
{
- /* To be implemented. */
+    /* Start adaptive prefetching of the base image.
+     *
+     * Allocates one FvdAIOCB and one buffer per prefetch slot, resets the
+     * throughput-measurement state, and kicks off the first read with
+     * do_next_prefetch_read(). On allocation failure, everything acquired so
+     * far is rolled back and prefetching simply does not start. */
+    BlockDriverState *bs = opaque;
+    BDRVFvdState *s = bs->opaque;
+    FvdAIOCB *acb;
+    int i;
+
+    QDEBUG("Start prefetching\n");
+
+    if (!s->data_region_prepared) {
+        init_data_region(s);
+    }
+
+    s->prefetch_acb = my_qemu_malloc(sizeof(FvdAIOCB *)*s->num_prefetch_slots);
+
+    for (i = 0; i < s->num_prefetch_slots; i++) {
+        acb = my_qemu_aio_get(&fvd_aio_pool, bs, prefetch_null_cb, NULL);
+        if (!acb) {
+            /* Roll back the slots acquired so far and give up. */
+            int j;
+            for (j = 0; j < i; j++) {
+                my_qemu_aio_release(s->prefetch_acb[j]);
+                s->prefetch_acb[j] = NULL;
+            }
+
+            my_qemu_free(s->prefetch_acb);
+            s->prefetch_acb = NULL;
+            fprintf(stderr, "No acb and cannot start prefetching.\n");
+            return;
+        }
+
+        s->prefetch_acb[i] = acb;
+        acb->type = OP_COPY;
+        acb->cancel_in_progress = false;
+    }
+
+    s->prefetch_state = PREFETCH_STATE_RUNNING;
+
+    for (i = 0; i < s->num_prefetch_slots; i++) {
+        acb = s->prefetch_acb[i];
+        acb->copy.buffered_sector_begin = acb->copy.buffered_sector_end = 0;
+        QLIST_INIT(&acb->copy_lock.dependent_writes);
+        acb->copy_lock.next.le_prev = NULL;
+        acb->copy.hd_acb = NULL;
+        acb->sector_num = 0;
+        acb->nb_sectors = 0;
+        acb->copy.iov.iov_len = s->sectors_per_prefetch * 512;
+        acb->copy.buf = acb->copy.iov.iov_base =
+            my_qemu_blockalign(bs->backing_hd, acb->copy.iov.iov_len);
+        qemu_iovec_init_external(&acb->copy.qiov, &acb->copy.iov, 1);
+    }
+
+    /* BUG FIX: the original code created the timer only when one already
+     * existed (if (s->prefetch_timer) { free; create; }), which leaves
+     * s->prefetch_timer NULL on the first invocation; pause_prefetch()
+     * then passes NULL to qemu_mod_timer(). Free any stale timer and
+     * always create a fresh one. */
+    if (s->prefetch_timer) {
+        qemu_free_timer(s->prefetch_timer);
+    }
+    s->prefetch_timer =
+        qemu_new_timer(rt_clock, (QEMUTimerCB *) resume_prefetch, bs);
+
+    s->pause_prefetch_requested = false;
+    s->unclaimed_prefetch_region_start = 0;
+    s->prefetch_read_throughput = -1;   /* Indicate not initialized. */
+    s->prefetch_write_throughput = -1;  /* Indicate not initialized. */
+    s->prefetch_read_time = 0;
+    s->prefetch_write_time = 0;
+    s->prefetch_data_read = 0;
+    s->prefetch_data_written = 0;
+    s->next_prefetch_read_slot = 0;
+    s->num_filled_prefetch_slots = 0;
+    s->prefetch_read_active = false;
+
+    do_next_prefetch_read(bs, qemu_get_clock(rt_clock));
+}
+
+/* Back off for a random interval in [1, s->prefetch_throttle_time] ms.
+ * When the timer fires it invokes resume_prefetch() (the timer callback
+ * installed in fvd_init_prefetch()). */
+static void pause_prefetch(BDRVFvdState * s)
+{
+    double fraction = rand() / ((double)RAND_MAX);
+    int64_t delay = 1 + (int64_t)(fraction * s->prefetch_throttle_time);
+    QDEBUG("Pause prefetch for %" PRId64 " milliseconds\n", delay);
+    /* When the timer expires, it goes to resume_prefetch(). */
+    qemu_mod_timer(s->prefetch_timer, qemu_get_clock(rt_clock) + delay);
+}
+
+/* Return true if every bit of freshbitmap is set to 1, i.e. every block of
+ * the base image has been prefetched (or overwritten). */
+static bool all_data_prefetched(BDRVFvdState *s)
+{
+    /* Fast path: scan the bitmap one 64-bit word at a time. */
+    const uint64_t full_words =
+        s->base_img_sectors / s->block_size / sizeof(uint64_t) / 8;
+    const uint64_t *word = (const uint64_t *)s->fresh_bitmap;
+    uint64_t idx;
+
+    for (idx = 0; idx < full_words; idx++) {
+        if (word[idx] != UINT64_C(0xFFFFFFFFFFFFFFFF)) {
+            return false;
+        }
+    }
+
+    /* Tail: the remaining blocks are checked bit by bit. */
+    uint64_t sector = full_words * sizeof(uint64_t) * 8 * s->block_size;
+    for (; sector < s->base_img_sectors; sector += s->block_size) {
+        if (fresh_bitmap_show_sector_in_base_img(sector, s)) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/* Tear down prefetching and move to 'final_state'.
+ *
+ * Must only be called when no prefetch read or write is in flight (asserted
+ * below). Frees the per-slot buffers/ACBs and the throttle timer, then
+ * persists the outcome via flush_metadata_to_disk(). A requested FINISHED
+ * state is downgraded to DISABLED if the bitmap shows unfetched blocks. */
+static void terminate_prefetch(BlockDriverState * bs, int final_state)
+{
+    BDRVFvdState *s = bs->opaque;
+    int i;
+
+    ASSERT(!s->prefetch_read_active && s->num_filled_prefetch_slots == 0);
+
+    /* The NULL test on s->prefetch_acb is loop-invariant (it tests the array
+     * pointer, not an element), so hoist it out of the loop instead of
+     * re-evaluating it on every iteration as the original code did. */
+    if (s->prefetch_acb) {
+        for (i = 0; i < s->num_prefetch_slots; i++) {
+            my_qemu_vfree(s->prefetch_acb[i]->copy.buf);
+            my_qemu_aio_release(s->prefetch_acb[i]);
+            s->prefetch_acb[i] = NULL;
+        }
+        my_qemu_free(s->prefetch_acb);
+        s->prefetch_acb = NULL;
+    }
+
+    if (s->prefetch_timer) {
+        qemu_del_timer(s->prefetch_timer);
+        qemu_free_timer(s->prefetch_timer);
+        s->prefetch_timer = NULL;
+    }
+
+    if (final_state == PREFETCH_STATE_FINISHED) {
+        /* Only claim FINISHED if the bitmap confirms full coverage. */
+        if (all_data_prefetched(s)) {
+            s->prefetch_state = PREFETCH_STATE_FINISHED;
+            s->copy_on_read = false;
+        } else {
+            s->prefetch_state = PREFETCH_STATE_DISABLED;
+        }
+    } else {
+        s->prefetch_state = final_state;
+    }
+
+    if (s->prefetch_state == PREFETCH_STATE_FINISHED) {
+        flush_metadata_to_disk(bs, false/*journal*/, true/*all_prefetched*/);
+        QDEBUG("FVD prefetching finished successfully.\n");
+    } else {
+        flush_metadata_to_disk(bs, false/*journal*/, false/*all_prefetched*/);
+        QDEBUG("FVD prefetching disabled.\n");
+    }
+}
+
+/* Claim the next region of the base image that still needs prefetching and
+ * start an asynchronous read of it from the backing image into the slot at
+ * s->next_prefetch_read_slot. If the whole base image has been scanned and
+ * no writes are pending, terminate prefetching as FINISHED. */
+static void do_next_prefetch_read(BlockDriverState * bs, int64_t current_time)
+{
+ FvdAIOCB *acb;
+ BDRVFvdState *s = bs->opaque;
+ int64_t begin, end;
+
+ ASSERT(!s->prefetch_read_active
+ && s->num_filled_prefetch_slots < s->num_prefetch_slots
+ && !s->pause_prefetch_requested);
+
+ /* Find the next region to prefetch. */
+ begin = s->unclaimed_prefetch_region_start;
+ while (1) {
+ /*Check the bitmap to determine if it is truly finished. If not
+ schedule a timer to retry again. */
+
+ if (begin >= s->base_img_sectors) {
+ /* Scanned past the end of the base image; nothing left to claim. */
+ s->unclaimed_prefetch_region_start = s->base_img_sectors;
+ if (s->num_filled_prefetch_slots == 0) {
+ terminate_prefetch(bs, PREFETCH_STATE_FINISHED);
+ }
+ return;
+ }
+ end = begin + s->sectors_per_prefetch;
+ if (end > s->base_img_sectors) {
+ end = s->base_img_sectors;
+ }
+ if (find_region_in_base_img(s, &begin, &end)) {
+ break;
+ }
+ /* This window is already covered; slide to the next one. */
+ begin = end;
+ }
+
+ ASSERT(begin % s->block_size == 0 && (end % s->block_size == 0
+ || end == s->base_img_sectors));
+
+ /* Record the claimed region in the slot and issue the backing read. */
+ acb = s->prefetch_acb[s->next_prefetch_read_slot];
+ acb->copy.buffered_sector_begin = acb->sector_num = begin;
+ acb->copy.buffered_sector_end = s->unclaimed_prefetch_region_start = end;
+ acb->nb_sectors = end - begin;
+ acb->copy.qiov.size = acb->copy.iov.iov_len = acb->nb_sectors * 512;
+ acb->copy.iov.iov_base = acb->copy.buf;
+ acb->copy.last_prefetch_op_start_time = current_time;
+ acb->copy.hd_acb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
+ &acb->copy.qiov, acb->nb_sectors,
+ prefetch_read_cb, acb);
+
+
+ if (acb->copy.hd_acb == NULL) {
+ QDEBUG("PREFETCH: error when starting read for sector_num=%" PRId64
+ " nb_sectors=%d\n", acb->sector_num, acb->nb_sectors);
+ s->prefetch_state = PREFETCH_STATE_DISABLED;
+ if (s->num_filled_prefetch_slots == 0) {
+ terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+ }
+ } else {
+ s->prefetch_read_active = true;
+ /* NOTE(review): s->total_prefetch_data is only updated under
+ * FVD_DEBUG; presumably QDEBUG compiles out otherwise — confirm. */
+ QDEBUG("PREFETCH: start read for sector_num=%" PRId64
+ " nb_sectors=%d total_prefetched_bytes=%" PRId64 "\n",
+ acb->sector_num, acb->nb_sectors, s->total_prefetch_data);
+#ifdef FVD_DEBUG
+ s->total_prefetch_data += acb->copy.iov.iov_len;
+#endif
+ }
+}
+
+/* Completion callback for a write of prefetched data into the FVD image.
+ * 'ret' is the status of the finished write (0 = success).
+ *
+ * On success: update the fresh bitmap, measure write throughput (possibly
+ * requesting a pause — see Section 3.4.2 of the FVD-cow paper), continue
+ * writing the remainder of the current slot, then drain the other filled
+ * slots, and restart the reader if it was idle. On error or when the state
+ * is DISABLED: wind everything down via terminate_prefetch(). */
+static void prefetch_write_cb(void *opaque, int ret)
+{
+ FvdAIOCB *acb = (FvdAIOCB *) opaque;
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+ int64_t begin, end;
+ const int64_t current_time = qemu_get_clock(rt_clock);
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ ASSERT(acb->nb_sectors > 0 && s->num_filled_prefetch_slots > 0);
+
+ if (ret == 0) {
+ /* No need to update the on-disk bitmap or the stale bitmap.
+ * See Section 3.3.4 of the FVD-cow paper. */
+ update_fresh_bitmap(acb->sector_num, acb->nb_sectors, s);
+ }
+
+ /* Drop the copy lock taken when the write was issued and wake up any
+ * guest writes that were blocked on this region. */
+ QLIST_REMOVE(acb, copy_lock.next);
+ restart_dependent_writes(acb);
+ acb->copy.hd_acb = NULL;
+ QLIST_INIT(&acb->copy_lock.dependent_writes);
+
+ if (ret != 0) {
+ QDEBUG("PREFETCH: finished write with error for sector_num=%" PRId64
+ " nb_sectors=%d\n", acb->sector_num, acb->nb_sectors);
+ s->num_filled_prefetch_slots = 0;
+ s->prefetch_state = PREFETCH_STATE_DISABLED;
+ if (!s->prefetch_read_active) {
+ terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+ }
+ return;
+ }
+
+ const int64_t write_time =
+ current_time - acb->copy.last_prefetch_op_start_time;
+ s->prefetch_write_time += write_time;
+ s->prefetch_data_written += acb->nb_sectors * 512;
+
+ QDEBUG("PREFETCH: write_finished sector_num=%" PRId64
+ " nb_sectors=%d write_time=%"PRId64" (ms)\n", acb->sector_num,
+ acb->nb_sectors, write_time);
+
+ /* Calculate throughput and determine if it needs to pause prefetching due
+ * to low throughput. */
+ if (s->prefetch_timer && s->prefetch_throttle_time > 0
+ && !s->pause_prefetch_requested
+ && s->prefetch_write_time > s->prefetch_write_throughput_measure_time) {
+ const double this_round_throughput =
+ s->prefetch_data_written / (double)s->prefetch_write_time;
+ if (s->prefetch_write_throughput < 0) {
+ /* Previously not initialized. */
+ s->prefetch_write_throughput = this_round_throughput;
+ } else {
+ /* Exponentially-weighted moving average of write throughput. */
+ s->prefetch_write_throughput =
+ PREFETCH_PERF_CALC_ALPHA * s->prefetch_write_throughput +
+ (1 - PREFETCH_PERF_CALC_ALPHA) * this_round_throughput;
+ }
+ if (s->prefetch_write_throughput < s->prefetch_min_write_throughput) {
+ QDEBUG("PREFETCH: slow_write this_write=%"PRId64" (ms) "
+ "this_write_throughput=%.3lf (MB/s) "
+ "avg_write_throughput=%.3lf (MB/s)\n",
+ write_time, this_round_throughput / 1048576 * 1000,
+ s->prefetch_write_throughput / 1048576 * 1000);
+
+ /* Make a randomized decision to pause prefetching. This avoids
+ * pausing all contending FVD drivers. See Section 3.4.2 of the
+ * FVD-cow paper. */
+ if (rand() > (RAND_MAX / 2)) {
+ QDEBUG("PREFETCH: pause requested.\n");
+ s->pause_prefetch_requested = true;
+ } else {
+ QDEBUG("PREFETCH: continue due to 50%% probability, despite "
+ "slow write.\n");
+ s->prefetch_write_throughput = -1; /*Indicate not initialized*/
+ }
+ } else {
+ QDEBUG("PREFETCH: this_write_throughput=%.3lf (MB/s) "
+ "avg_write_throughput=%.3lf (MB/s)\n",
+ this_round_throughput / 1048576 * 1000,
+ s->prefetch_write_throughput / 1048576 * 1000);
+ }
+
+ /* Preparing for measuring the next round of throughput. */
+ s->prefetch_data_written = 0;
+ s->prefetch_write_time = 0;
+ }
+
+ /* Find in this prefetch slot the next section of prefetched but
+ * not-yet-written data. */
+ begin = acb->sector_num + acb->nb_sectors;
+ if (begin < acb->copy.buffered_sector_end) {
+ end = acb->copy.buffered_sector_end;
+ if (find_region_in_base_img(s, &begin, &end)) {
+ acb->sector_num = begin;
+ acb->nb_sectors = end - begin;
+ acb->copy.iov.iov_base = acb->copy.buf +
+ (begin - acb->copy.buffered_sector_begin) * 512;
+ acb->copy.qiov.size = acb->copy.iov.iov_len = acb->nb_sectors * 512;
+ QDEBUG("PREFETCH: write_data sector_num=%" PRId64
+ " nb_sectors=%d\n", acb->sector_num, acb->nb_sectors);
+ acb->copy.hd_acb = store_data(true, acb, bs, acb->sector_num,
+ &acb->copy.qiov, acb->nb_sectors,
+ prefetch_write_cb, acb);
+ if (acb->copy.hd_acb == NULL) {
+ QDEBUG("PREFETCH: error in starting bdrv_aio_writev().\n");
+ s->num_filled_prefetch_slots = 0;
+ s->prefetch_state = PREFETCH_STATE_DISABLED;
+ if (!s->prefetch_read_active) {
+ terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+ }
+ } else {
+ /* Lock the region so overlapping guest writes wait for us. */
+ acb->copy_lock.begin = begin;
+ acb->copy_lock.end = end;
+ QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
+ }
+
+ return;
+ }
+ }
+
+ /* The current slot has been fully written out. */
+ s->num_filled_prefetch_slots--;
+
+ if (s->prefetch_state == PREFETCH_STATE_DISABLED) {
+ if (s->num_filled_prefetch_slots == 0 && !s->prefetch_read_active) {
+ terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+ }
+ return;
+ }
+
+ if (begin >= s->base_img_sectors) {
+ /* Prefetching finished. */
+ ASSERT(s->num_filled_prefetch_slots == 0 && !s->prefetch_read_active);
+ terminate_prefetch(bs, PREFETCH_STATE_FINISHED);
+ return;
+ }
+
+ if (s->pause_prefetch_requested) {
+ if (s->num_filled_prefetch_slots == 0) {
+ if (!s->prefetch_read_active) {
+ pause_prefetch(s);
+ } else {
+ QDEBUG("PREFETCH: wait for the read operation to finish in "
+ "order to pause prefetch.\n");
+ }
+ return;
+ }
+ }
+
+ /* Write out data in the next prefetched slot. */
+ while (s->num_filled_prefetch_slots > 0) {
+ /* Oldest filled slot, counting back from the next read slot. */
+ int k = s->next_prefetch_read_slot - s->num_filled_prefetch_slots;
+ if (k < 0) {
+ k += s->num_prefetch_slots;
+ }
+ acb = s->prefetch_acb[k];
+
+ /* NOTE(review): these shadow the outer 'begin'/'end' intentionally;
+ * the outer 'begin' was already consumed above. */
+ int64_t begin = acb->copy.buffered_sector_begin;
+ int64_t end = acb->copy.buffered_sector_end;
+ if (find_region_in_base_img(s, &begin, &end)) {
+ acb->copy.last_prefetch_op_start_time = current_time;
+ acb->sector_num = begin;
+ acb->nb_sectors = end - begin;
+ acb->copy.iov.iov_base =
+ acb->copy.buf + (begin - acb->copy.buffered_sector_begin) * 512;
+ acb->copy.qiov.size = acb->copy.iov.iov_len = acb->nb_sectors * 512;
+ QDEBUG("PREFETCH: writes data: sector_num=%" PRId64
+ " nb_sectors=%d\n", acb->sector_num, acb->nb_sectors);
+ acb->copy.hd_acb = store_data(true, acb, bs, acb->sector_num,
+ &acb->copy.qiov, acb->nb_sectors,
+ prefetch_write_cb, acb);
+
+ if (acb->copy.hd_acb == NULL) {
+ QDEBUG("PREFETCH: error cannot get a control block to write "
+ "a prefetched block.\n");
+ s->prefetch_state = PREFETCH_STATE_DISABLED;
+ s->num_filled_prefetch_slots = 0;
+ if (!s->prefetch_read_active) {
+ terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+ }
+ return;
+ }
+
+ acb->copy_lock.begin = begin;
+ acb->copy_lock.end = end;
+ QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
+ break;
+ } else {
+ QDEBUG("PREFETCH: discard prefetched data as they have been "
+ "covered: sector_num=%" PRId64 " nb_sectors=%d\n",
+ acb->sector_num, acb->nb_sectors);
+ s->num_filled_prefetch_slots--;
+ }
+ }
+
+ /* If the reader was stopped due to lack of slots, start the reader. */
+ if (!s->prefetch_read_active && !s->pause_prefetch_requested) {
+ do_next_prefetch_read(bs, current_time);
+ }
+}
+
+/* Completion callback for a prefetch read from the backing image.
+ * 'ret' is the status of the finished read (0 = success).
+ *
+ * On success: measure read throughput (possibly requesting a pause — see
+ * Section 3.4.2 of the FVD-cow paper), then either queue the slot behind an
+ * already-active writer, or start writing this slot out immediately, and
+ * finally issue the next read if a slot is free. On error or when the state
+ * is DISABLED: wind down via terminate_prefetch(). */
+static void prefetch_read_cb(void *opaque, int ret)
+{
+ FvdAIOCB *acb = (FvdAIOCB *) opaque;
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ if (acb->cancel_in_progress) {
+ return;
+ }
+
+ ASSERT(s->prefetch_read_active && s->num_filled_prefetch_slots >= 0
+ && s->num_filled_prefetch_slots < s->num_prefetch_slots);
+
+ s->prefetch_read_active = false;
+ acb->copy.hd_acb = NULL;
+
+ if (s->prefetch_state == PREFETCH_STATE_DISABLED) {
+ if (s->num_filled_prefetch_slots == 0) {
+ terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+ }
+ return;
+ }
+
+ if (ret != 0) {
+ QDEBUG("PREFETCH: read_error sector_num=%" PRId64 " nb_sectors=%d.\n",
+ acb->sector_num, acb->nb_sectors);
+ s->prefetch_state = PREFETCH_STATE_DISABLED;
+ if (s->num_filled_prefetch_slots == 0) {
+ terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+ }
+ return;
+ }
+
+ const int64_t current_time = qemu_get_clock(rt_clock);
+ const int64_t read_time = current_time -
+ acb->copy.last_prefetch_op_start_time;
+ s->prefetch_read_time += read_time;
+ s->prefetch_data_read += acb->nb_sectors * 512;
+
+ QDEBUG("PREFETCH: read_finished sector_num=%" PRId64
+ " nb_sectors=%d read_time=%"PRId64" (ms)\n", acb->sector_num,
+ acb->nb_sectors, read_time);
+
+ /* Calculate throughput and determine if it needs to pause prefetching due
+ * to low throughput. */
+ if (s->prefetch_timer && s->prefetch_throttle_time > 0
+ && !s->pause_prefetch_requested
+ && s->prefetch_read_time > s->prefetch_read_throughput_measure_time) {
+ const double this_round_throughput =
+ s->prefetch_data_read / (double)s->prefetch_read_time;
+ if (s->prefetch_read_throughput < 0) {
+ /* Previously not initialized. */
+ s->prefetch_read_throughput = this_round_throughput;
+ } else {
+ /* Exponentially-weighted moving average of read throughput. */
+ s->prefetch_read_throughput = PREFETCH_PERF_CALC_ALPHA *
+ s->prefetch_read_throughput +
+ (1 - PREFETCH_PERF_CALC_ALPHA) * this_round_throughput;
+ }
+ if (s->prefetch_read_throughput < s->prefetch_min_read_throughput) {
+ QDEBUG("PREFETCH: slow_read read_time=%"PRId64" (ms) "
+ "this_read_throughput=%.3lf (MB/s) "
+ "avg_read_throughput=%.3lf (MB/s)\n",
+ read_time, this_round_throughput / 1048576 * 1000,
+ s->prefetch_read_throughput / 1048576 * 1000);
+
+ /* Make a randomized decision to pause prefetching. This avoids
+ * pausing all contending FVD drivers. See Section 3.4.2 of the
+ * FVD-cow paper. */
+ if (rand() > (RAND_MAX / 2)) {
+ QDEBUG("PREFETCH: pause requested.\n");
+ s->pause_prefetch_requested = true;
+ } else {
+ QDEBUG("PREFETCH: continue due to 50%% probability, "
+ "despite slow read.\n");
+ s->prefetch_read_throughput = -1; /*Indicate not initialized*/
+ }
+ } else {
+ QDEBUG("PREFETCH: this_read_throughput=%.3lf (MB/s) "
+ "avg_read_throughput=%.3lf (MB/s)\n",
+ this_round_throughput / 1048576 * 1000,
+ s->prefetch_read_throughput / 1048576 * 1000);
+ }
+
+ /* Preparing for measuring the next round of throughput. */
+ s->prefetch_data_read = 0;
+ s->prefetch_read_time = 0;
+ }
+
+ if (s->num_filled_prefetch_slots > 0) {
+ /* There is one ongoing write for prefetched data. This slot will be
+ * written out later. */
+ s->num_filled_prefetch_slots++;
+ s->next_prefetch_read_slot++;
+ if (s->next_prefetch_read_slot >= s->num_prefetch_slots) {
+ s->next_prefetch_read_slot = 0;
+ }
+ } else {
+ /* The writer is not active. Start the writer. */
+ int64_t begin = acb->copy.buffered_sector_begin;
+ int64_t end = acb->copy.buffered_sector_end;
+ if (find_region_in_base_img(s, &begin, &end)) {
+ acb->copy.last_prefetch_op_start_time = current_time;
+ acb->sector_num = begin;
+ acb->nb_sectors = end - begin;
+ acb->copy.iov.iov_base =
+ acb->copy.buf + (begin - acb->copy.buffered_sector_begin) * 512;
+ acb->copy.qiov.size = acb->copy.iov.iov_len = acb->nb_sectors * 512;
+ QDEBUG("PREFETCH: writes_data sector_num=%" PRId64
+ " nb_sectors=%d\n", acb->sector_num, acb->nb_sectors);
+ acb->copy.hd_acb = store_data(true, acb, bs, acb->sector_num,
+ &acb->copy.qiov, acb->nb_sectors,
+ prefetch_write_cb, acb);
+
+ if (acb->copy.hd_acb == NULL) {
+ QDEBUG("PREFETCH: error cannot get control block to write a "
+ "prefetched block.\n");
+ s->prefetch_state = PREFETCH_STATE_DISABLED;
+ if (s->num_filled_prefetch_slots == 0) {
+ terminate_prefetch(bs, PREFETCH_STATE_DISABLED);
+ }
+ return;
+ }
+
+ /* Lock the region so overlapping guest writes wait for us. */
+ acb->copy_lock.begin = begin;
+ acb->copy_lock.end = end;
+ QLIST_INSERT_HEAD(&s->copy_locks, acb, copy_lock.next);
+ s->num_filled_prefetch_slots++;
+ s->next_prefetch_read_slot++;
+ if (s->next_prefetch_read_slot >= s->num_prefetch_slots) {
+ s->next_prefetch_read_slot = 0;
+ }
+ } else {
+ /* The current prefetch slot will be reused to prefetch the next
+ * bunch of data. */
+ QDEBUG("PREFETCH: discard prefetched data as they have been "
+ "covered: sector_num=%" PRId64 " nb_sectors=%d\n",
+ acb->sector_num, acb->nb_sectors);
+ }
+ }
+
+ if (s->num_filled_prefetch_slots >= s->num_prefetch_slots) {
+ QDEBUG("PREFETCH: halt read because no slot is available.\n");
+ } else {
+ if (s->pause_prefetch_requested) {
+ if (s->num_filled_prefetch_slots == 0) {
+ pause_prefetch(s);
+ }
+ } else {
+ do_next_prefetch_read(bs, current_time);
+ }
+ }
+}
+
+/* Timer callback installed by fvd_init_prefetch(): wake up after a throttle
+ * pause, reset the throughput bookkeeping, and restart the read pipeline.
+ * Does nothing if prefetching is no longer in the RUNNING state. */
+static void resume_prefetch(BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+
+    if (s->prefetch_state != PREFETCH_STATE_RUNNING) {
+        return;
+    }
+
+    ASSERT(s->num_filled_prefetch_slots == 0 && !s->prefetch_read_active);
+    QDEBUG("PREFETCH: resume.\n");
+
+    s->pause_prefetch_requested = false;
+
+    /* Start a fresh measurement round; -1 marks "not yet initialized". */
+    s->prefetch_read_time = 0;
+    s->prefetch_data_read = 0;
+    s->prefetch_read_throughput = -1;
+    s->prefetch_write_time = 0;
+    s->prefetch_data_written = 0;
+    s->prefetch_write_throughput = -1;
+
+    do_next_prefetch_read(bs, qemu_get_clock(rt_clock));
+}
+
+/* Sentinel completion callback for prefetch (OP_COPY) requests. Its address
+ * is compared (e.g. in fvd_aio_cancel_copy()) to tell prefetch ACBs apart
+ * from copy-on-read ACBs; it must never actually run. */
+static void prefetch_null_cb(void *opaque, int ret)
+{
+ /* Nothing to do and will never be invoked. Only need it to distinguish
+ * copy-on-read from prefetch. */
+ ASSERT(false);
}
diff --git a/block/fvd-read.c b/block/fvd-read.c
index cd041e5..675af9e 100644
--- a/block/fvd-read.c
+++ b/block/fvd-read.c
@@ -11,6 +11,7 @@
*
*/
+static void prefetch_null_cb(void *opaque, int ret);
static void read_backing_for_copy_on_read_cb(void *opaque, int ret);
static void read_fvd_cb(void *opaque, int ret);
static inline void calc_read_region(BDRVFvdState * s, int64_t sector_num,
diff --git a/qemu-io-sim.c b/qemu-io-sim.c
index 923c1b8..d420fdb 100644
--- a/qemu-io-sim.c
+++ b/qemu-io-sim.c
@@ -77,6 +77,17 @@ wrote 1024/1024 bytes at offset 65536
*=============================================================================*/
#include "block/blksim.h"
+#include "block/fvd-ext.h"
+
+/* 'sim prefetch' command: start base-image prefetching on the open image.
+ * Only the FVD driver family implements prefetching. */
+static void sim_start_prefetch(void)
+{
+    /* BUG FIX: the original test used !strncmp(), which is TRUE when the
+     * format name IS "fvd...", so prefetching was refused for exactly the
+     * images that support it. Refuse only when the name is missing or does
+     * NOT start with "fvd". */
+    if (!bs->drv->format_name || strncmp(bs->drv->format_name, "fvd", 3) != 0) {
+        printf("This image does not support prefetching.\n");
+        return;
+    }
+    fvd_init_prefetch(bs);
+    printf("Prefetching started\n");
+}
static void sim_help(void)
{
@@ -101,6 +112,8 @@ static int sim_f(int argc, char **argv)
if (strcmp(argv[1], "list") == 0) {
blksim_list_tasks();
+ } else if (strcmp(argv[1], "prefetch") == 0) {
+ sim_start_prefetch();
} else if (strcmp(argv[1], "all") == 0) {
blksim_set_disk_io_return_code(ret);
int n = blksim_run_all_tasks();
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 19/26] FVD: add support for aio_cancel
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (16 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 18/26] FVD: add support for base image prefetching Chunqiang Tang
@ 2011-02-25 22:37 ` Chunqiang Tang
2011-02-25 22:38 ` [Qemu-devel] [PATCH 20/26] FVD: add impl of interface bdrv_get_info() Chunqiang Tang
` (6 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:37 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds the support for aio_cancel into FVD. FVD faithfully cleans up
all resources upon aio_cancel.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-journal-buf.c | 16 +++++++++++
block/fvd-load.c | 24 +++++++++++++++++
block/fvd-misc.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++
block/fvd-read.c | 37 ++++++++++++++++++++++++++
block/fvd-store.c | 31 +++++++++++++++++++++
block/fvd-write.c | 23 +++++++++++++++-
block/fvd.c | 25 +++++++++++++++++
7 files changed, 222 insertions(+), 1 deletions(-)
diff --git a/block/fvd-journal-buf.c b/block/fvd-journal-buf.c
index e99a585..c6b60f9 100644
--- a/block/fvd-journal-buf.c
+++ b/block/fvd-journal-buf.c
@@ -360,6 +360,22 @@ use_current_buf:
return s->bjnl.buf;
}
+/* Cancel an OP_BJNL_FLUSH request: unlink it from the queue of buffered
+ * journal buffers awaiting flush, then release the control block. */
+static void fvd_aio_cancel_bjnl_flush(FvdAIOCB * acb)
+{
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+ QTAILQ_REMOVE(&s->bjnl.queued_bufs, acb, jcb.bjnl_next_queued_buf);
+ my_qemu_aio_release(acb);
+}
+
+/* Cancellation entry for OP_BJNL_BUF_WRITE; deliberately fatal. */
+static void fvd_aio_cancel_bjnl_buf_write(FvdAIOCB * acb)
+{
+ /* OP_BJNL_BUF_WRITE is never exposed to any external entity, and this
+ * should not be invoked. Internal cancellation of OP_BJNL_BUF_WRITE
+ * is handled by bjnl_sync_flush(). */
+ abort();
+}
+
static void bjnl_clean_buf_timer_cb(BlockDriverState * bs)
{
BDRVFvdState *s = bs->opaque;
diff --git a/block/fvd-load.c b/block/fvd-load.c
index 88e5fb4..9789cc5 100644
--- a/block/fvd-load.c
+++ b/block/fvd-load.c
@@ -188,6 +188,30 @@ static inline FvdAIOCB *init_load_acb(FvdAIOCB * parent_acb,
return acb;
}
+/* Cancel an OP_WRAPPER request: stop its pending bottom half (cancel before
+ * delete, so it cannot fire during teardown), then release the ACB. */
+static void fvd_aio_cancel_wrapper(FvdAIOCB * acb)
+{
+ qemu_bh_cancel(acb->wrapper.bh);
+ qemu_bh_delete(acb->wrapper.bh);
+ my_qemu_aio_release(acb);
+}
+
+/* Cancel an OP_LOAD_COMPACT request: cancel every outstanding child read
+ * (the multi-child array and/or the inline single child), free the child
+ * array, and release the ACB. */
+static void fvd_aio_cancel_load_compact(FvdAIOCB * acb)
+{
+ if (acb->load.children) {
+ int i;
+ for (i = 0; i < acb->load.num_children; i++) {
+ if (acb->load.children[i].hd_acb) {
+ bdrv_aio_cancel(acb->load.children[i].hd_acb);
+ }
+ }
+ my_qemu_free(acb->load.children);
+ }
+ if (acb->load.one_child.hd_acb) {
+ bdrv_aio_cancel(acb->load.one_child.hd_acb);
+ }
+ my_qemu_aio_release(acb);
+}
+
static inline int load_create_one_child(bool count_only, bool empty,
QEMUIOVector * orig_qiov, int *iov_index, size_t *iov_left,
uint8_t **iov_buf, int64_t start_sec, int sectors_in_region,
diff --git a/block/fvd-misc.c b/block/fvd-misc.c
index f4e1038..a42bfac 100644
--- a/block/fvd-misc.c
+++ b/block/fvd-misc.c
@@ -11,6 +11,73 @@
*
*/
+static void fvd_aio_cancel_bjnl_buf_write(FvdAIOCB * acb);
+static void fvd_aio_cancel_bjnl_flush(FvdAIOCB * acb);
+static void fvd_aio_cancel_read(FvdAIOCB * acb);
+static void fvd_aio_cancel_write(FvdAIOCB * acb);
+static void fvd_aio_cancel_copy(FvdAIOCB * acb);
+static void fvd_aio_cancel_load_compact(FvdAIOCB * acb);
+static void fvd_aio_cancel_store_compact(FvdAIOCB * acb);
+static void fvd_aio_cancel_wrapper(FvdAIOCB * acb);
+static void flush_metadata_to_disk_on_exit (BlockDriverState *bs);
+
+/* Cancel an OP_FLUSH request: cancel the outstanding data and metadata
+ * flushes (either may be absent), then release the ACB. */
+static void fvd_aio_cancel_flush(FvdAIOCB * acb)
+{
+ if (acb->flush.data_acb) {
+ bdrv_aio_cancel(acb->flush.data_acb);
+ }
+ if (acb->flush.metadata_acb) {
+ bdrv_aio_cancel(acb->flush.metadata_acb);
+ }
+ my_qemu_aio_release(acb);
+}
+
+/* BlockDriver aio-cancel entry point (installed via fvd_aio_pool).
+ *
+ * Marks the request as being cancelled — the in-flight completion callbacks
+ * check acb->cancel_in_progress and bail out — then dispatches to the
+ * per-operation-type cancellation routine, which releases all resources. */
+static void fvd_aio_cancel(BlockDriverAIOCB * blockacb)
+{
+    FvdAIOCB *acb = container_of(blockacb, FvdAIOCB, common);
+
+    QDEBUG("CANCEL: acb%llu-%p\n", acb->uuid, acb);
+    acb->cancel_in_progress = true;
+
+    switch (acb->type) {
+    case OP_READ:
+        fvd_aio_cancel_read(acb);
+        break;
+
+    case OP_WRITE:
+        fvd_aio_cancel_write(acb);
+        break;
+
+    case OP_COPY:
+        fvd_aio_cancel_copy(acb);
+        break;
+
+    case OP_LOAD_COMPACT:
+        fvd_aio_cancel_load_compact(acb);
+        break;
+
+    case OP_STORE_COMPACT:
+        fvd_aio_cancel_store_compact(acb);
+        break;
+
+    case OP_WRAPPER:
+        fvd_aio_cancel_wrapper(acb);
+        break;
+
+    case OP_FLUSH:
+        fvd_aio_cancel_flush(acb);
+        break;
+
+    case OP_BJNL_BUF_WRITE:
+        fvd_aio_cancel_bjnl_buf_write(acb);
+        break;
+
+    case OP_BJNL_FLUSH:
+        fvd_aio_cancel_bjnl_flush(acb);
+        break;
+
+    default:
+        /* Unknown/corrupted operation type: a logic error. Previously this
+         * fell through silently, leaking the request's resources. */
+        abort();
+    }
+}
+
/* Intentionally empty at this point in the series; a later patch
 * ("FVD: add impl of interface bdrv_close()", PATCH 21/26) fills it in. */
static void fvd_close(BlockDriverState * bs)
{
}
diff --git a/block/fvd-read.c b/block/fvd-read.c
index 675af9e..b18fdf2 100644
--- a/block/fvd-read.c
+++ b/block/fvd-read.c
@@ -502,3 +502,40 @@ static inline void calc_read_region(BDRVFvdState * s, int64_t sector_num,
*p_first_sec_in_backing = first_sec_in_backing;
*p_last_sec_in_backing = last_sec_in_backing;
}
+
+/* Cancel an OP_READ request: cancel the outstanding backing-image and FVD
+ * reads (cancellations first, so the buffers are no longer referenced),
+ * free the bounce buffers, and release the ACB. */
+static void fvd_aio_cancel_read(FvdAIOCB * acb)
+{
+ if (acb->read.read_backing.hd_acb) {
+ bdrv_aio_cancel(acb->read.read_backing.hd_acb);
+ }
+ if (acb->read.read_fvd.hd_acb) {
+ bdrv_aio_cancel(acb->read.read_fvd.hd_acb);
+ }
+ if (acb->read.read_backing.iov.iov_base) {
+ my_qemu_vfree(acb->read.read_backing.iov.iov_base);
+ }
+ if (acb->read.read_fvd.iov.iov_base) {
+ my_qemu_vfree(acb->read.read_fvd.iov.iov_base);
+ }
+ my_qemu_aio_release(acb);
+}
+
+/* Cancel an OP_COPY request (copy-on-read or prefetch): cancel the pending
+ * I/O, drop the copy lock and wake any guest writes that were waiting on
+ * it, free the buffer, and adjust the copy-on-read accounting. Prefetch
+ * ACBs are recognized by their prefetch_null_cb completion callback. */
+static void fvd_aio_cancel_copy(FvdAIOCB * acb)
+{
+ BlockDriverState *bs = acb->common.bs;
+ BDRVFvdState *s = bs->opaque;
+
+ if (acb->copy.hd_acb) {
+ bdrv_aio_cancel(acb->copy.hd_acb);
+ }
+ if (acb->copy_lock.next.le_prev != NULL) {
+ QLIST_REMOVE(acb, copy_lock.next);
+ restart_dependent_writes(acb);
+ }
+ my_qemu_vfree(acb->copy.buf);
+ if (acb->common.cb != prefetch_null_cb) {
+ /* This is a copy-on-read operation. */
+ s->outstanding_copy_on_read_data -= acb->nb_sectors * 512;
+ }
+ my_qemu_aio_release(acb);
+}
diff --git a/block/fvd-store.c b/block/fvd-store.c
index fe670eb..ec23fd7 100644
--- a/block/fvd-store.c
+++ b/block/fvd-store.c
@@ -477,3 +477,34 @@ static inline FvdAIOCB *init_store_acb(int soft_write,
COPY_UUID(acb, parent_acb);
return acb;
}
+
+/* Cancel an OP_STORE_COMPACT request: cancel all child writes, cancel the
+ * journal write (returning its journal sectors when the uncompressed
+ * journal is in use), free the journal iovec buffer, unlink from the
+ * journal-recycle wait list, and release the ACB. */
+static void fvd_aio_cancel_store_compact(FvdAIOCB * acb)
+{
+ if (acb->store.children) {
+ int i;
+ for (i = 0; i < acb->store.num_children; i++) {
+ if (acb->store.children[i].hd_acb) {
+ bdrv_aio_cancel(acb->store.children[i].hd_acb);
+ }
+ }
+ my_qemu_free(acb->store.children);
+ }
+ if (acb->store.one_child.hd_acb) {
+ bdrv_aio_cancel(acb->store.one_child.hd_acb);
+ }
+ if (acb->jcb.hd_acb) {
+ bdrv_aio_cancel(acb->jcb.hd_acb);
+ BDRVFvdState *s = acb->common.bs->opaque;
+ if (!s->use_bjnl) {
+ ujnl_free_journal_sectors(acb->common.bs);
+ }
+ }
+ if (acb->jcb.iov.iov_base != NULL) {
+ my_qemu_vfree(acb->jcb.iov.iov_base);
+ }
+ if (acb->jcb.ujnl_next_wait4_recycle.le_prev) {
+ QLIST_REMOVE(acb, jcb.ujnl_next_wait4_recycle);
+ }
+
+ my_qemu_aio_release(acb);
+}
diff --git a/block/fvd-write.c b/block/fvd-write.c
index 623ec83..a74dc5d 100644
--- a/block/fvd-write.c
+++ b/block/fvd-write.c
@@ -15,7 +15,7 @@ static void write_metadata_to_journal(struct FvdAIOCB *acb, bool update_bitmap);
static int do_aio_write(struct FvdAIOCB *acb);
static void restart_dependent_writes(struct FvdAIOCB *acb);
static void free_write_resource(struct FvdAIOCB *acb);
-static void ujnl_free_journal_sectors(BlockDriverState * bs);
+static void ujnl_free_journal_sectors(BlockDriverState *bs);
static inline BlockDriverAIOCB *store_data(int soft_write,
FvdAIOCB * parent_acb, BlockDriverState * bs,
int64_t sector_num, QEMUIOVector * orig_qiov, int nb_sectors,
@@ -106,6 +106,27 @@ slow_path:
return &acb->common;
}
+/* Cancel an OP_WRITE request: cancel the data write and the journal write
+ * (returning journal sectors when the uncompressed journal is in use),
+ * unlink from the recycle and dependent-write lists, then hand the rest of
+ * the teardown (locks, buffers, ACB release) to free_write_resource(). */
+static void fvd_aio_cancel_write(FvdAIOCB * acb)
+{
+ if (acb->write.hd_acb) {
+ bdrv_aio_cancel(acb->write.hd_acb);
+ }
+ if (acb->jcb.hd_acb) {
+ bdrv_aio_cancel(acb->jcb.hd_acb);
+ BDRVFvdState *s = acb->common.bs->opaque;
+ if (!s->use_bjnl) {
+ ujnl_free_journal_sectors(acb->common.bs);
+ }
+ }
+ if (acb->jcb.ujnl_next_wait4_recycle.le_prev) {
+ QLIST_REMOVE(acb, jcb.ujnl_next_wait4_recycle);
+ }
+ if (acb->write.next_dependent_write.le_prev) {
+ QLIST_REMOVE(acb, write.next_dependent_write);
+ }
+ free_write_resource(acb);
+}
+
static void free_write_resource(FvdAIOCB * acb)
{
if (acb->write.next_write_lock.le_prev) {
diff --git a/block/fvd.c b/block/fvd.c
index 2402a94..c779d65 100644
--- a/block/fvd.c
+++ b/block/fvd.c
@@ -23,6 +23,16 @@
#include "block/fvd.h"
+#define ENABLE_TRACE_IO
+//#define DEBUG_MEMORY_LEAK
+
+#ifndef FVD_DEBUG
+#undef DEBUG_MEMORY_LEAK
+#endif
+#ifndef ENABLE_QDEBUG
+#undef ENABLE_TRACE_IO
+#endif
+
/* Use include to avoid exposing too many FVD symbols, and to allow inline
* function optimization. */
#include "block/fvd-debug.c"
@@ -41,6 +51,11 @@
#include "block/fvd-prefetch.c"
#include "block/fvd-update.c"
+/* AIO pool for all FVD requests: every FvdAIOCB is allocated from this pool
+ * and cancelled through fvd_aio_cancel(). */
+static AIOPool fvd_aio_pool = {
+ .aiocb_size = sizeof(FvdAIOCB),
+ .cancel = fvd_aio_cancel,
+};
+
static BlockDriver bdrv_fvd = {
.format_name = "fvd",
.instance_size = sizeof(BDRVFvdState),
@@ -62,6 +77,8 @@ static BlockDriver bdrv_fvd = {
static void bdrv_fvd_init(void)
{
+ /* Random numbers are used in fvd-prefetch.c. */
+ /* NOTE(review): rand() is called here before srand() has ever run, so it
+ * contributes a fixed value to the seed — presumably harmless since
+ * time() and getpid() already vary; confirm this is intentional. */
+ srand(time(NULL) + getpid() + getpid() * 987654 + rand());
bdrv_register(&bdrv_fvd);
}
@@ -84,3 +101,11 @@ extern QTAILQ_HEAD(, BlockDriverState) bdrv_states;
}
}
}
+
+/*
+ * TODOs:
+ * - Cap the prefetch throughput at the upper limit. See Section 3.4.2 of
+ * the FVD-cow paper. Related metadata are
+ * FvdHeader.prefetch_max_read_throughput and
+ * FvdHeader.prefetch_max_write_throughput.
+ */
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 20/26] FVD: add impl of interface bdrv_get_info()
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (17 preceding siblings ...)
2011-02-25 22:37 ` [Qemu-devel] [PATCH 19/26] FVD: add support for aio_cancel Chunqiang Tang
@ 2011-02-25 22:38 ` Chunqiang Tang
2011-02-25 22:38 ` [Qemu-devel] [PATCH 21/26] FVD: add impl of interface bdrv_close() Chunqiang Tang
` (5 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds FVD's implementation of the bdrv_get_info() interface.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-misc.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 97 insertions(+), 1 deletions(-)
diff --git a/block/fvd-misc.c b/block/fvd-misc.c
index a42bfac..c515d74 100644
--- a/block/fvd-misc.c
+++ b/block/fvd-misc.c
@@ -11,6 +11,7 @@
*
*/
+static int read_fvd_header(BDRVFvdState * s, FvdHeader * header);
static void fvd_aio_cancel_bjnl_buf_write(FvdAIOCB * acb);
static void fvd_aio_cancel_bjnl_flush(FvdAIOCB * acb);
static void fvd_aio_cancel_read(FvdAIOCB * acb);
@@ -95,7 +96,102 @@ static int fvd_is_allocated(BlockDriverState * bs, int64_t sector_num,
static int fvd_get_info(BlockDriverState * bs, BlockDriverInfo * bdi)
{
- return -ENOTSUP;
+ BDRVFvdState *s = bs->opaque;
+ FvdHeader header;
+
+ if (read_fvd_header(s, &header) < 0) {
+ return -1;
+ }
+
+ printf("========= Begin of FVD specific information ==================\n");
+ printf("magic\t\t\t\t\t\t%0X\n", header.magic);
+ printf("header_size\t\t\t\t\t%d\n", header.header_size);
+ printf("create_version\t\t\t\t\t%d\n", header.create_version);
+ printf("last_open_version\t\t\t\t%d\n", header.last_open_version);
+ printf("virtual_disk_size (bytes)\t\t\t%" PRId64 "\n",
+ header.virtual_disk_size);
+ printf("disk_metadata_size (bytes)\t\t\t%" PRId64 "\n", header.data_offset);
+ if (header.data_file[0]) {
+ printf("data_file\t\t\t\t\t%s\n", header.data_file);
+ }
+ if (header.data_file_fmt[0]) {
+ printf("data_file_fmt\t\t\t\t\t%s\n", header.data_file_fmt);
+ }
+
+ if (header.table_offset > 0) {
+ printf("table_size (bytes)\t\t\t\t%" PRId64 "\n", header.table_size);
+ printf("avail_storage (bytes)\t\t\t\t%" PRId64 "\n",
+ s->avail_storage * 512);
+ printf("chunk_size (bytes)\t\t\t\t%" PRId64 "\n", header.chunk_size);
+ printf("used_chunks (bytes)\t\t\t\t%" PRId64 "\n",
+ s->used_storage * 512);
+ printf("storage_grow_unit (bytes)\t\t\t%" PRId64 "\n",
+ header.storage_grow_unit);
+ printf("table_offset (bytes)\t\t\t\t%" PRId64 "\n",
+ header.table_offset);
+ printf("table_size (bytes)\t\t\t\t%" PRId64 "\n", s->table_size);
+ printf("chunks_relocated\t\t\t\t%s\n", BOOL(s->chunks_relocated));
+
+ if (header.add_storage_cmd[0] != 0) {
+ printf("add_storage_cmd\t\t\t\t\t%s\n", header.add_storage_cmd);
+ }
+ }
+
+ printf("clean_shutdown\t\t\t\t\t%s\n", BOOL(header.clean_shutdown));
+ if (header.journal_size > 0) {
+ printf("journal_offset\t\t\t\t\t%" PRId64 "\n", header.journal_offset);
+ printf("journal_size\t\t\t\t\t%" PRId64 "\n", header.journal_size);
+ printf("stable_journal_epoch\t\t\t\t%" PRId64 "\n",
+ header.stable_journal_epoch);
+ printf("journal_buf_size (bytes)\t\t\t%" PRId64 "\n",
+ header.journal_buf_size);
+ printf("journal_clean_buf_period (ms)\t\t\t%" PRId64 "\n",
+ header.journal_clean_buf_period);
+ }
+
+ if (header.base_img[0] != 0) {
+ printf("base_img_fully_prefetched\t\t\t%s\n",
+ BOOL(header.base_img_fully_prefetched));
+ printf("base_img\t\t\t\t\t%s\n", header.base_img);
+ if (header.base_img_fmt[0]) {
+ printf("base_img_fmt\t\t\t\t\t%s\n", header.base_img_fmt);
+ }
+ printf("base_img_size (bytes)\t\t\t\t%" PRId64 "\n",
+ header.base_img_size);
+ printf("bitmap_offset (bytes)\t\t\t\t%" PRId64 "\n",
+ header.bitmap_offset);
+ printf("bitmap_size (bytes)\t\t\t\t%" PRId64 "\n", header.bitmap_size);
+ printf("block_size\t\t\t\t\t%" PRId64 "\n", header.block_size);
+ printf("copy_on_read\t\t\t\t\t%s\n", BOOL(header.copy_on_read));
+ printf("max_outstanding_copy_on_read_data (bytes)\t%" PRId64 "\n",
+ header.max_outstanding_copy_on_read_data);
+ printf("need_zero_init\t\t\t\t\t%s\n", BOOL(header.need_zero_init));
+ printf("prefetch_start_delay (sec)\t\t\t%" PRId64 "\n",
+ header.prefetch_start_delay);
+ printf("num_prefetch_slots\t\t\t\t%d\n", header.num_prefetch_slots);
+ printf("bytes_per_prefetch\t\t\t\t%" PRIu64 "\n",
+ header.bytes_per_prefetch);
+ printf("prefetch_over_threshold_throttle_time (ms)\t%" PRIu64 "\n",
+ header.prefetch_throttle_time);
+ printf("prefetch_read_throughput_measure_time (ms)\t%" PRIu64 "\n",
+ header.prefetch_read_throughput_measure_time);
+ printf("prefetch_write_throughput_measure_time (ms)\t%" PRIu64 "\n",
+ header.prefetch_write_throughput_measure_time);
+ printf("prefetch_min_read_throughput (KB/s)\t\t%" PRIu64 "\n",
+ header.prefetch_min_read_throughput);
+ printf("prefetch_min_write_throughput (KB/s)\t\t%" PRIu64 "\n",
+ header.prefetch_min_write_throughput);
+ printf("prefetch_max_read_throughput (KB/s)\t\t%" PRIu64 "\n",
+ header.prefetch_max_read_throughput);
+ printf("prefetch_max_write_throughput (KB/s)\t\t%" PRIu64 "\n",
+ header.prefetch_max_write_throughput);
+ }
+
+ printf("========= End of FVD specific information ====================\n");
+
+ bdi->cluster_size = 0;
+ bdi->vm_state_offset = 0;
+ return 0;
}
static int fvd_has_zero_init(BlockDriverState * bs)
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 21/26] FVD: add impl of interface bdrv_close()
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (18 preceding siblings ...)
2011-02-25 22:38 ` [Qemu-devel] [PATCH 20/26] FVD: add impl of interface bdrv_get_info() Chunqiang Tang
@ 2011-02-25 22:38 ` Chunqiang Tang
2011-02-25 22:38 ` [Qemu-devel] [PATCH 22/26] FVD: add impl of interface bdrv_update() Chunqiang Tang
` (4 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds FVD's implementation of the bdrv_close() interface.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-misc.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 78 insertions(+), 0 deletions(-)
diff --git a/block/fvd-misc.c b/block/fvd-misc.c
index c515d74..63ed168 100644
--- a/block/fvd-misc.c
+++ b/block/fvd-misc.c
@@ -81,6 +81,84 @@ static void fvd_aio_cancel(BlockDriverAIOCB * blockacb)
static void fvd_close(BlockDriverState * bs)
{
+ BDRVFvdState *s = bs->opaque;
+ FvdAIOCB *acb;
+ int i;
+
+ if (s->prefetch_state == PREFETCH_STATE_RUNNING) {
+ s->prefetch_state = PREFETCH_STATE_DISABLED;
+ }
+ if (s->prefetch_timer) {
+ qemu_del_timer(s->prefetch_timer);
+ qemu_free_timer(s->prefetch_timer);
+ s->prefetch_timer = NULL;
+ }
+
+ if (s->prefetch_acb) {
+ /* Clean up prefetch operations. */
+ for (i = 0; i < s->num_prefetch_slots; i++) {
+ if (s->prefetch_acb[i] != NULL) {
+ fvd_aio_cancel_copy(s->prefetch_acb[i]);
+ s->prefetch_acb[i] = NULL;
+ }
+ }
+ my_qemu_free(s->prefetch_acb);
+ s->prefetch_acb = NULL;
+ }
+
+ if (s->use_bjnl) {
+ /* Clean up buffered journal update. */
+ bjnl_sync_flush(bs);
+ if (s->bjnl.timer_scheduled) {
+ qemu_del_timer(s->bjnl.clean_buf_timer);
+ }
+ qemu_free_timer(s->bjnl.clean_buf_timer);
+ }
+
+ /* Clean up unfinished copy_on_read operations. */
+ QLIST_FOREACH(acb, &s->copy_locks, copy_lock.next) {
+ fvd_aio_cancel_copy(acb);
+ }
+
+ flush_metadata_to_disk_on_exit(bs);
+
+ if (s->stale_bitmap) {
+ my_qemu_vfree(s->stale_bitmap);
+ if (s->fresh_bitmap != s->stale_bitmap) {
+ my_qemu_vfree(s->fresh_bitmap);
+ }
+ s->stale_bitmap = NULL;
+ s->fresh_bitmap = NULL;
+ }
+
+ if (s->table) {
+ my_qemu_vfree(s->table);
+ s->table = NULL;
+ }
+
+ if (s->fvd_metadata) {
+ if (s->fvd_metadata != s->fvd_data) {
+ bdrv_delete(s->fvd_metadata);
+ }
+ s->fvd_metadata = NULL;
+ }
+ if (s->fvd_data) {
+ bdrv_delete(s->fvd_data);
+ s->fvd_data = NULL;
+ }
+
+ if (s->add_storage_cmd) {
+ my_qemu_free(s->add_storage_cmd);
+ s->add_storage_cmd = NULL;
+ }
+
+ if (s->leaked_chunks) {
+ my_qemu_free(s->leaked_chunks);
+ s->leaked_chunks = NULL;
+ }
+#ifdef FVD_DEBUG
+ dump_resource_summary(s);
+#endif
}
static int fvd_probe(const uint8_t * buf, int buf_size, const char *filename)
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 22/26] FVD: add impl of interface bdrv_update()
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (19 preceding siblings ...)
2011-02-25 22:38 ` [Qemu-devel] [PATCH 21/26] FVD: add impl of interface bdrv_close() Chunqiang Tang
@ 2011-02-25 22:38 ` Chunqiang Tang
2011-02-25 22:38 ` [Qemu-devel] [PATCH 23/26] FVD: add impl of interface bdrv_is_allocated() Chunqiang Tang
` (3 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds FVD's implementation of the bdrv_update() interface.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-update.c | 274 +++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 272 insertions(+), 2 deletions(-)
diff --git a/block/fvd-update.c b/block/fvd-update.c
index 2498618..4ef4969 100644
--- a/block/fvd-update.c
+++ b/block/fvd-update.c
@@ -1,5 +1,5 @@
/*
- * QEMU Fast Virtual Disk Format bdrv_update
+ * QEMU Fast Virtual Disk Format Misc Functions of BlockDriver Interface
*
* Copyright IBM, Corp. 2010
*
@@ -13,9 +13,279 @@
static int fvd_update(BlockDriverState * bs, QEMUOptionParameter * options)
{
- return -ENOTSUP;
+ BDRVFvdState *s = bs->opaque;
+ FvdHeader header;
+ int ret;
+
+ if ((ret = read_fvd_header(s, &header)) < 0) { return ret; }
+
+ while (options && options->name) {
+ if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
+ if (header.table_offset > 0) {
+ fprintf(stderr, "Cannot resize a compact FVD image.\n");
+ return -EINVAL;
+ }
+ if (options->value.n < header.virtual_disk_size) {
+ printf("Warning: image's new size %" PRId64
+ " is smaller than the original size %" PRId64
+ ". Some image data will be truncated.\n",
+ options->value.n, header.virtual_disk_size);
+ }
+ header.virtual_disk_size = options->value.n;
+ printf("Image resized to %" PRId64 " bytes.\n", options->value.n);
+ } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
+ if (strlen(options->value.s) > 1023) {
+ fprintf(stderr, "Error: the new base image name is longer "
+ "than 1023, which is not allowed.\n");
+ return -EINVAL;
+ }
+ memset(header.base_img, 0, 1024);
+ pstrcpy(header.base_img, 1024, options->value.s);
+ printf("Backing file updated to '%s'.\n", options->value.s);
+ } else if (!strcmp(options->name, "data_file")) {
+ if (strlen(options->value.s) > 1023) {
+ fprintf(stderr, "Error: the new data file name is longer "
+ "than 1023, which is not allowed.\n");
+ return -EINVAL;
+ }
+
+ memset(header.data_file, 0, 1024);
+ pstrcpy(header.data_file, 1024, options->value.s);
+ printf("Data file updated to '%s'.\n", options->value.s);
+ } else if (!strcmp(options->name, "need_zero_init")) {
+ header.need_zero_init = options->value.n;
+ if (header.need_zero_init) {
+ printf("need_zero_init is turned on.\n");
+ } else {
+ printf("need_zero_init is turned off.\n");
+ }
+ } else if (!strcmp(options->name, "copy_on_read")) {
+ header.copy_on_read = options->value.n;
+ if (header.copy_on_read) {
+ printf("Copy on read is enabled for this disk.\n");
+ } else {
+ printf("Copy on read is disabled for this disk.\n");
+ }
+ } else if (!strcmp(options->name, "clean_shutdown")) {
+ header.clean_shutdown = options->value.n;
+ if (header.clean_shutdown) {
+ printf("clean_shutdown is manually set to true\n");
+ } else {
+ printf("clean_shutdown is manually set to false\n");
+ }
+ } else if (!strcmp(options->name, "journal_buf_size")) {
+ header.journal_buf_size = options->value.n;
+ printf("journal_buf_size is updated to %"PRIu64" bytes.\n",
+ header.journal_buf_size);
+ } else if (!strcmp(options->name, "journal_clean_buf_period")) {
+ header.journal_clean_buf_period = options->value.n;
+ printf("journal_clean_buf_period is updated to %"PRIu64
+ " milliseconds.\n",
+ header.journal_clean_buf_period);
+ } else if (!strcmp(options->name,"max_outstanding_copy_on_read_data")) {
+ header.max_outstanding_copy_on_read_data = options->value.n;
+ if (header.max_outstanding_copy_on_read_data <= 0) {
+ fprintf(stderr, "Error: max_outstanding_copy_on_read_data "
+ "must be positive.\n");
+ return -EINVAL;
+ }
+ printf("max_outstanding_copy_on_read_data updated to %" PRId64
+ ".\n", header.max_outstanding_copy_on_read_data);
+ } else if (!strcmp(options->name, "init_data_region")) {
+ if (options->value.n && !s->data_region_prepared) {
+ init_data_region(s);
+ }
+ } else if (!strcmp(options->name, "prefetch_start_delay")) {
+ if (options->value.n <= 0) {
+ header.prefetch_start_delay = -1;
+ } else {
+ header.prefetch_start_delay = options->value.n;
+ }
+ if (header.prefetch_start_delay > 0) {
+ printf("Prefetch starting delay updated to %" PRId64
+ " seconds.\n", header.prefetch_start_delay);
+ } else {
+ printf("Prefetch starting delay updated to %" PRId64
+ " seconds. "
+ "Because of the negative value, prefetching is "
+ "disabled for this image.\n",
+ header.prefetch_start_delay);
+ }
+ } else if (!strcmp(options->name, "num_prefetch_slots")) {
+ header.num_prefetch_slots = options->value.n;
+ if (header.num_prefetch_slots < 1) {
+ fprintf(stderr, "Error: num_prefetch_slots "
+ "%d is not a positive integer.\n",
+ header.num_prefetch_slots);
+ return -EINVAL;
+ }
+ printf("num_prefetch_slots updated to %d.\n",
+ header.num_prefetch_slots);
+ } else if (!strcmp(options->name, "bytes_per_prefetch")) {
+ header.bytes_per_prefetch = options->value.n;
+ if (header.bytes_per_prefetch < DEF_PAGE_SIZE) {
+ fprintf(stderr, "Error: bytes_per_prefetch cannot be smaller "
+ "than %d.\n", DEF_PAGE_SIZE);
+ return -EINVAL;
+ }
+ printf("bytes_per_prefetch updated to %" PRIu64 ".\n",
+ header.bytes_per_prefetch);
+ } else if (!strcmp(options->name, "prefetch_min_read_throughput")) {
+ header.prefetch_min_read_throughput = options->value.n;
+ printf("prefetch_min_read_throughput updated to %"
+ PRIu64 " KB/s\n", header.prefetch_min_read_throughput);
+ } else if (!strcmp(options->name, "prefetch_min_write_throughput")) {
+ header.prefetch_min_write_throughput = options->value.n;
+ printf("prefetch_min_write_throughput updated to %"
+ PRIu64 " KB/s\n", header.prefetch_min_write_throughput);
+ } else if (!strcmp(options->name,
+ "prefetch_read_throughput_measure_time")) {
+ header.prefetch_read_throughput_measure_time = options->value.n;
+ printf("prefetch_read_throughput_measure_time updated to %" PRIu64
+ " ms\n", header.prefetch_read_throughput_measure_time);
+ } else if (!strcmp(options->name,
+ "prefetch_write_throughput_measure_time")) {
+ header.prefetch_write_throughput_measure_time = options->value.n;
+ printf("prefetch_write_throughput_measure_time updated to %" PRIu64
+ " ms\n", header.prefetch_write_throughput_measure_time);
+ } else if (!strcmp(options->name,
+ "prefetch_over_threshold_throttle_time")) {
+ header.prefetch_throttle_time = options->value.n;
+ if (header.prefetch_throttle_time > 0) {
+ printf("prefetch_over_threshold_throttle_time updated to %"
+ PRIu64 "ms.\n", header.prefetch_throttle_time);
+ } else {
+ printf("prefetch_over_threshold_throttle_time updated to %"
+ PRIu64 "ms. It is not positive and hence no "
+ "throttling will be applied to prefetch.\n",
+ header.prefetch_throttle_time);
+ }
+ } else if (!strcmp(options->name, "storage_grow_unit")) {
+ header.storage_grow_unit = options->value.n;
+ if (header.storage_grow_unit < header.chunk_size) {
+ header.storage_grow_unit = header.chunk_size;
+ }
+ printf("storage_grow_unit updated to %" PRIu64 "\n",
+ header.storage_grow_unit);
+ } else if (!strcmp(options->name, "add_storage_cmd")) {
+ if (strlen(options->value.s) > 1023) {
+ fprintf(stderr, "Error: add_storage_cmd is longer than 1023, "
+ "which is not allowed.\n");
+ return -EINVAL;
+ }
+ pstrcpy(header.add_storage_cmd, 1024, options->value.s);
+ } else {
+ fprintf(stderr, "Error: unknown option '%s=%s'\n",
+ options->name, options->value.s);
+ return -EINVAL;
+ }
+ options++;
+ }
+
+ if ((ret = update_fvd_header(s, &header))) {
+ return ret;
+ }
+ ret = bdrv_flush(s->fvd_metadata);
+ return ret;
}
static QEMUOptionParameter fvd_update_options[] = {
+ {
+ .name = BLOCK_OPT_SIZE,
+ .type = OPT_SIZE,
+ .help = "Virtual disk size"},
+ {
+ .name = "storage_grow_unit",
+ .type = OPT_SIZE,
+ .help = "Storage grow unit"},
+ {
+ .name = "add_storage_cmd",
+ .type = OPT_STRING,
+ .help = "Command to add storage when running out of space"},
+ {
+ .name = BLOCK_OPT_BACKING_FILE,
+ .type = OPT_STRING,
+ .help = "File name of a backing image"},
+ {
+ .name = BLOCK_OPT_BACKING_FMT,
+ .type = OPT_STRING,
+ .help = "Image format of the backing image"},
+ {
+ .name = "data_file",
+ .type = OPT_STRING,
+ .help = "File name of a data file"},
+ {
+ .name = "data_file_fmt",
+ .type = OPT_STRING,
+ .help = "Image format of the data file"},
+ {
+ .name = "copy_on_read",
+ .type = OPT_FLAG,
+ .help = "copy_on_read=on|off"},
+ {
+ .name = "prefetch_start_delay",
+ .type = OPT_NUMBER,
+ .help = "Delay in seconds before starting whole image prefetching. "},
+ {
+ .name = "journal_size",
+ .type = OPT_SIZE,
+ .help = "Journal size"},
+ {
+ .name = "need_zero_init",
+ .type = OPT_FLAG,
+ .help = "need_zero_init=on|off"},
+ {
+ .name = "max_outstanding_copy_on_read_data",
+ .type = OPT_SIZE,
+ .help = "copy_on_read is temporarily disabled when unsaved data exceed "
+ "this threshold (in bytes)"},
+ {
+ .name = "init_data_region",
+ .type = OPT_FLAG,
+ .help = "if enabled the image file will be expanded to its full size"},
+ {
+ .name = "journal_buf_size",
+ .type = OPT_SIZE,
+ .help = "size of in-memory journal buffer (in bytes)"},
+ {
+ .name = "journal_clean_buf_period",
+ .type = OPT_NUMBER,
+ .help = "(milliseconds)"},
+ {
+ .name = "num_prefetch_slots",
+ .type = OPT_NUMBER,
+ .help = "Number of concurrent prefetches allowed"},
+ {
+ .name = "bytes_per_prefetch",
+ .type = OPT_NUMBER,
+ .help = "Data to read per prefetch"},
+ {
+ .name = "prefetch_over_threshold_throttle_time",
+ .type = OPT_NUMBER,
+ .help = "(in milliseconds)"},
+ {
+ .name = "prefetch_read_throughput_measure_time",
+ .type = OPT_NUMBER,
+ .help = "(in milliseconds)"},
+ {
+ .name = "prefetch_write_throughput_measure_time",
+ .type = OPT_NUMBER,
+ .help = "(in milliseconds)"},
+ {
+ .name = "prefetch_min_read_throughput",
+ .type = OPT_NUMBER,
+ .help = "(in KB/s)"},
+ {
+ .name = "prefetch_max_read_throughput",
+ .type = OPT_NUMBER,
+ .help = "(in KB/s)"},
+ {
+ .name = "prefetch_min_write_throughput",
+ .type = OPT_NUMBER,
+ .help = "(in KB/s)"},
+ {
+ .name = "prefetch_max_write_throughput",
+ .type = OPT_NUMBER,
+ .help = "(in KB/s)"},
{NULL}
};
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 23/26] FVD: add impl of interface bdrv_is_allocated()
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (20 preceding siblings ...)
2011-02-25 22:38 ` [Qemu-devel] [PATCH 22/26] FVD: add impl of interface bdrv_update() Chunqiang Tang
@ 2011-02-25 22:38 ` Chunqiang Tang
2011-02-25 22:38 ` [Qemu-devel] [PATCH 24/26] FVD: add impl of interface bdrv_has_zero_init() Chunqiang Tang
` (2 subsequent siblings)
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds FVD's implementation of the bdrv_is_allocated() interface.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-misc.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 67 insertions(+), 0 deletions(-)
diff --git a/block/fvd-misc.c b/block/fvd-misc.c
index 63ed168..766b62b 100644
--- a/block/fvd-misc.c
+++ b/block/fvd-misc.c
@@ -169,6 +169,73 @@ static int fvd_probe(const uint8_t * buf, int buf_size, const char *filename)
static int fvd_is_allocated(BlockDriverState * bs, int64_t sector_num,
int nb_sectors, int *pnum)
{
+ BDRVFvdState *s = bs->opaque;
+
+ if (s->prefetch_state == PREFETCH_STATE_FINISHED ||
+ sector_num >= s->base_img_sectors ||
+ !fresh_bitmap_show_sector_in_base_img(sector_num, s)) {
+ /* For the three cases that data may be saved in the FVD data file, we
+ * still need to check the underlying storage because those data could
+ * be holes in a sparse image, due to the optimization of "free write
+ * to zero-filled blocks". See Section 3.3.3 of the FVD-cow paper.
+ * This also covers the case of no base image. */
+
+ if (!s->table) {
+ return bdrv_is_allocated(s->fvd_data, s->data_offset + sector_num,
+ nb_sectors, pnum);
+ }
+
+ /* Use the table to figure it out. */
+ int64_t first_chunk = sector_num / s->chunk_size;
+ int64_t last_chunk = (sector_num + nb_sectors - 1) / s->chunk_size;
+ int allocated = !IS_EMPTY(s->table[first_chunk]);
+ int count;
+
+ if (first_chunk == last_chunk) {
+ /* All data in one chunk. */
+ *pnum = nb_sectors;
+ return allocated;
+ }
+
+ /* Data in the first chunk. */
+ count = s->chunk_size - (sector_num % s->chunk_size);
+
+ /* Full chunks. */
+ first_chunk++;
+ while (first_chunk < last_chunk) {
+ if ((allocated && IS_EMPTY(s->table[first_chunk]))
+ || (!allocated && !IS_EMPTY(s->table[first_chunk]))) {
+ *pnum = count;
+ return allocated;
+ }
+
+ count += s->chunk_size;
+ first_chunk++;
+ }
+
+ /* Data in the last chunk. */
+ if ((allocated && !IS_EMPTY(s->table[last_chunk]))
+ || (!allocated && IS_EMPTY(s->table[last_chunk]))) {
+ int nb = (sector_num + nb_sectors) % s->chunk_size;
+ count += nb ? nb : s->chunk_size;
+ }
+
+ *pnum = count;
+ return allocated;
+ }
+
+ /* Use the FVD metadata to find out sectors in the base image. */
+ int64_t end = sector_num + nb_sectors;
+ if (end > s->base_img_sectors) {
+ end = s->base_img_sectors;
+ }
+
+ int64_t next = sector_num + 1;
+ while (next < end && fresh_bitmap_show_sector_in_base_img(next, s)) {
+ next++;
+ }
+
+ *pnum = next - sector_num;
return 0;
}
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 24/26] FVD: add impl of interface bdrv_has_zero_init()
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (21 preceding siblings ...)
2011-02-25 22:38 ` [Qemu-devel] [PATCH 23/26] FVD: add impl of interface bdrv_is_allocated() Chunqiang Tang
@ 2011-02-25 22:38 ` Chunqiang Tang
2011-02-25 22:38 ` [Qemu-devel] [PATCH 25/26] FVD: add impl of interface bdrv_probe() Chunqiang Tang
2011-02-25 22:38 ` [Qemu-devel] [PATCH 26/26] FVD: add fully automated test-fvd.sh Chunqiang Tang
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds FVD's implementation of the bdrv_has_zero_init() interface.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-misc.c | 9 ++++++++-
1 files changed, 8 insertions(+), 1 deletions(-)
diff --git a/block/fvd-misc.c b/block/fvd-misc.c
index 766b62b..61e39bb 100644
--- a/block/fvd-misc.c
+++ b/block/fvd-misc.c
@@ -341,5 +341,12 @@ static int fvd_get_info(BlockDriverState * bs, BlockDriverInfo * bdi)
static int fvd_has_zero_init(BlockDriverState * bs)
{
- return 0;
+ BDRVFvdState *s = bs->opaque;
+
+ /* For a non-compact image, chunks_relocated is always false. For a
+ * compact image with chunks_relocated=true, it can no longer guarantee
+ * zero init even if the file system does that. This is because a partialy
+ * written chunk X may be relocated to a location previously used by
+ * another chunk Y and some garbage data are left there by Y. */
+ return s->chunks_relocated ? 0 : bdrv_has_zero_init(s->fvd_data);
}
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 25/26] FVD: add impl of interface bdrv_probe()
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (22 preceding siblings ...)
2011-02-25 22:38 ` [Qemu-devel] [PATCH 24/26] FVD: add impl of interface bdrv_has_zero_init() Chunqiang Tang
@ 2011-02-25 22:38 ` Chunqiang Tang
2011-02-25 22:38 ` [Qemu-devel] [PATCH 26/26] FVD: add fully automated test-fvd.sh Chunqiang Tang
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
This patch adds FVD's implementation of the bdrv_probe() interface.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
block/fvd-misc.c | 9 ++++++++-
1 files changed, 8 insertions(+), 1 deletions(-)
diff --git a/block/fvd-misc.c b/block/fvd-misc.c
index 61e39bb..6315218 100644
--- a/block/fvd-misc.c
+++ b/block/fvd-misc.c
@@ -163,7 +163,14 @@ static void fvd_close(BlockDriverState * bs)
static int fvd_probe(const uint8_t * buf, int buf_size, const char *filename)
{
- return 0;
+ const FvdHeader *header = (const void *)buf;
+
+ if (buf_size >= sizeof(uint32_t) &&
+ le32_to_cpu(header->magic) == FVD_MAGIC) {
+ return 100;
+ } else {
+ return 0;
+ }
}
static int fvd_is_allocated(BlockDriverState * bs, int64_t sector_num,
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread
* [Qemu-devel] [PATCH 26/26] FVD: add fully automated test-fvd.sh
2011-02-25 22:37 [Qemu-devel] [PATCH 01/26] FVD: add simulated block driver 'blksim' Chunqiang Tang
` (23 preceding siblings ...)
2011-02-25 22:38 ` [Qemu-devel] [PATCH 25/26] FVD: add impl of interface bdrv_probe() Chunqiang Tang
@ 2011-02-25 22:38 ` Chunqiang Tang
24 siblings, 0 replies; 26+ messages in thread
From: Chunqiang Tang @ 2011-02-25 22:38 UTC (permalink / raw)
To: qemu-devel; +Cc: Chunqiang Tang
This patch is part of the Fast Virtual Disk (FVD) proposal.
See http://wiki.qemu.org/Features/FVD.
test-fvd.sh drives 'qemu-io --auto' to perform fully automated testing for FVD.
Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
test-fvd.sh | 161 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 161 insertions(+), 0 deletions(-)
create mode 100755 test-fvd.sh
diff --git a/test-fvd.sh b/test-fvd.sh
new file mode 100755
index 0000000..3d67c3f
--- /dev/null
+++ b/test-fvd.sh
@@ -0,0 +1,161 @@
+#!/bin/bash
+
+# Drive 'qemu-io --auto' to test the FVD image format.
+#
+# Copyright IBM, Corp. 2010
+#
+# Authors:
+# Chunqiang Tang <ctang@us.ibm.com>
+#
+# This work is licensed under the terms of the GNU GPL, version 2 or later.
+# See the COPYING.LIB file in the top-level directory.
+
+if [ $USER != "root" ]; then
+ echo "This command must be run by root in order to mount tmpfs."
+ exit 1
+fi
+
+QEMU_DIR=.
+QEMU_IMG=$QEMU_DIR/qemu-img
+QEMU_IO=$QEMU_DIR/qemu-io
+
+if [ ! -e $QEMU_IMG ]; then
+ echo "$QEMU_IMG does not exist."
+ exit 1;
+fi
+
+if [ ! -e $QEMU_IO ]; then
+ echo "$QEMU_IO does not exist."
+ exit 1;
+fi
+
+DATA_DIR=/var/ramdisk
+TRUTH_IMG=$DATA_DIR/truth.raw
+TEST_IMG=$DATA_DIR/test.fvd
+TEST_BASE=$DATA_DIR/zero-500M.raw
+TEST_IMG_DATA=$DATA_DIR/test.dat
+CMD_LOG=./test-fvd.log
+
+G1=1073741824
+MAX_MEM=536870912
+MAX_ROUND=1000000
+MAX_IO_SIZE=100000000
+fail_prob=0.1
+cancel_prob=0.1
+flush_prob_base=0.05
+aio_flush_prob_base=0.1
+seed=$RANDOM$RANDOM
+count=0
+
+function invoke() {
+ echo "$*" >> $CMD_LOG
+ sync
+ $*
+ ret=$?
+ if [ $ret -ne 0 ]; then
+ echo "Exit with error code $ret: $*"
+ exit $ret
+ fi
+}
+
+mount | grep $DATA_DIR > /dev/null
+if [ $? -ne 0 ]; then
+ echo "Create tmpfs at $DATA_DIR to store testing images."
+ if [ ! -e $DATA_DIR ]; then mkdir -p $DATA_DIR ; fi
+ invoke "mount -t tmpfs none $DATA_DIR -o size=4G"
+ if [ $? -ne 0 ]; then exit 1; fi
+fi
+
+/bin/rm -f $CMD_LOG $DATA_DIR/*
+touch $CMD_LOG
+
+while true; do
+ for block_size in 7680 512 1024 15872 65536 65024 1048576 1048064; do
+ for chunk_mult in 5 1 2 3 7 9 12 16 33 99 ; do
+ for cache in writeback writethrough ; do
+ #for compact_image in on off ; do
+ for compact_image in on ; do
+ for prefetch_delay in 1 0; do
+ for copy_on_read in on off; do
+ for base_img in "-b $TEST_BASE" "" ; do
+ chunk_size=$[$block_size * $chunk_mult]
+ large_io_size=$[$chunk_size * 5]
+ if [ $large_io_size -gt $MAX_IO_SIZE ]; then large_io_size=$MAX_IO_SIZE; fi
+ for io_size in $large_io_size 1048576 ; do
+ for use_data_file in "" "data_file=$TEST_IMG_DATA," ; do
+
+ if [ "$cache" == "writethrough" ]; then
+ JOURNAL_BUF_SIZE=0
+ JOURNAL_CLEAN_BUF_PERIOD=0
+ else
+ JOURNAL_BUF_SIZE="512 1024 65536"
+ JOURNAL_CLEAN_BUF_PERIOD="5000 1000 60000"
+ fi
+
+ for journal_buf_size in $JOURNAL_BUF_SIZE ; do
+ for journal_clean_buf_period in $JOURNAL_CLEAN_BUF_PERIOD ; do
+ /bin/rm -rf /tmp/fvd.log*
+
+ # FVD image is about 1G
+ img_size=$[(1073741824 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+
+ # base image is about 500MB
+ base_size=$[(536870912 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+
+ count=$[$count + 1]
+ echo "Round $count" >> $CMD_LOG
+
+ invoke "/bin/rm -rf $TRUTH_IMG $TEST_IMG $TEST_BASE $TEST_IMG_DATA"
+
+ if [ -z "$base_img" ]; then
+ # Use zero-filled empty images.
+ invoke "dd if=/dev/zero of=$TRUTH_IMG count=0 bs=1 seek=$img_size"
+ else
+ # Use images with random contents.
+ invoke "$QEMU_IO --auto --create=$TEST_BASE --seed=$seed --block_size=$block_size --empty_block_prob=0.2 --empty_block_chain=10 --file_size=$base_size"
+ invoke "cp --sparse=always $TEST_BASE $TRUTH_IMG"
+ invoke "dd if=/dev/zero of=$TRUTH_IMG count=0 bs=1 seek=$img_size"
+ fi
+
+ if [ ! -z "$use_data_file" ]; then invoke "touch $TEST_IMG_DATA"; fi
+
+ # Ensure the journal is large enough to hold at least one write.
+ mixed_records_per_journal_sector=119
+ if [ "$cache" == "writethrough" ]; then
+ journal_size_factor=1000
+ else
+ journal_size_factor=100
+ fi
+ journal_size=$[(((($io_size / $chunk_size ) + 1 ) / $mixed_records_per_journal_sector ) + 1) * 512 * (1 + $RANDOM$RANDOM % $journal_size_factor) ]
+
+ invoke "$QEMU_IMG create -f fvd $base_img -ojournal_buf_size=$journal_buf_size,journal_clean_buf_period=$journal_clean_buf_period,${use_data_file}data_file_fmt=blksim,backing_fmt=blksim,compact_image=$compact_image,copy_on_read=$copy_on_read,block_size=$block_size,chunk_size=$chunk_size,journal_size=$journal_size,prefetch_start_delay=$prefetch_delay $TEST_IMG $img_size"
+ invoke "$QEMU_IMG update -oinit_data_region=on $TEST_IMG"
+ if [ $prefetch_delay -eq 1 ]; then invoke "$QEMU_IMG update -f fvd -oprefetch_over_threshold_throttle_time=0 $TEST_IMG" ; fi
+
+ # Use no more 1GB memory.
+ mem=$[$io_size * 1000]
+ if [ $mem -gt $MAX_MEM ]; then
+ parallel=$[$MAX_MEM / $io_size]
+ else
+ parallel=200
+ fi
+ parallel=$[${RANDOM}${RANDOM} % $parallel]
+
+ flush_prob=`echo $flush_prob_base / $parallel | bc -l`
+ aio_flush_prob=`echo $aio_flush_prob_base / $parallel | bc -l`
+
+ round=$[$G1 * 10 / $io_size]
+ if [ $round -gt $MAX_ROUND ]; then round=$MAX_ROUND; fi
+
+ b3=$[$round * 2 / 3]
+ [ $b3 -eq 0 ] && b3=1
+ for rep in 0 1 2 3 4 5 6 7 8 ; do
+ if [ $rep -eq 0 ]; then
+ compare_before=false
+ else
+ compare_before=true
+ fi
+ r=$[${RANDOM}${RANDOM} % $b3]
+ seed=$[$seed + 1]
+ invoke "$QEMU_IO --auto --cache=$cache --truth=$TRUTH_IMG --format=fvd --test="blksim:$TEST_IMG" --verify_write=true --parallel=$parallel --io_size=$io_size --fail_prob=$fail_prob --cancel_prob=$cancel_prob --aio_flush_prob=$aio_flush_prob --flush_prob=$flush_prob --compare_after=true --round=$r --compare_before=$compare_before --instant_qemubh=false --seed=$seed"
+done; done; done; done; done; done; done; done; done; done; done; done; done
--
1.7.0.4
^ permalink raw reply related [flat|nested] 26+ messages in thread