* [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1
@ 2011-01-19 22:04 Chunqiang Tang
From: Chunqiang Tang @ 2011-01-19 22:04 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chunqiang Tang

Part 1 of the block device driver for the proposed FVD image format.
The work is split into multiple patches to keep each patch at a
manageable size. This patch contains the changes to existing files
that FVD requires.

See the related discussions at
http://lists.gnu.org/archive/html/qemu-devel/2011-01/msg00426.html .
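
Among the changes to existing files, qemu-img gains an 'update'
subcommand (see qemu-img-cmds.hx and img_update() in qemu-img.c below)
that dispatches to a new per-driver bdrv_update() callback in
BlockDriver, letting a format change image attributes given on the
command line. As a purely illustrative sketch (the attribute names and
file name are placeholders; the real FVD attributes are defined by the
FVD driver in later parts of this series), an invocation looks like:

    qemu-img update -f fvd disk.fvd attr1=val1 attr2=val2

Formats that do not implement bdrv_update() report that the 'update'
command is not supported. A driver that wants to support the command
would provide a callback with the new signature; the sketch below is
hypothetical and not part of this patch, and only illustrates the
calling convention (each argv[i] is one "attr=val" string taken from
the qemu-img command line):

    /* Hypothetical example only -- not part of this patch. */
    static int my_format_update(BlockDriverState *bs, int argc, char **argv)
    {
        int i;
        for (i = 0; i < argc; i++) {
            char *eq = strchr(argv[i], '=');
            if (!eq) {
                fprintf(stderr, "Expected attr=val, got '%s'\n", argv[i]);
                return -1;
            }
            /* Look up the attribute named by argv[i] (up to '=') and
             * apply the new value (eq + 1) to the image metadata. */
        }
        return 0;
    }

The driver would then set .bdrv_update = my_format_update in its
BlockDriver definition, next to callbacks such as .bdrv_get_info.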

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 Makefile         |   10 +++++---
 Makefile.objs    |    1 +
 block.c          |   12 +++++-----
 block_int.h      |    5 ++-
 configure        |    2 +-
 qemu-img-cmds.hx |    6 +++++
 qemu-img.c       |   62 ++++++++++++++++++++++++++++++++++++++++++++---------
 qemu-io.c        |    3 ++
 qemu-option.c    |    4 +++
 qemu-tool.c      |   36 -------------------------------
 10 files changed, 81 insertions(+), 60 deletions(-)

diff --git a/Makefile b/Makefile
index 6d601ee..da4d777 100644
--- a/Makefile
+++ b/Makefile
@@ -151,13 +151,15 @@ version-obj-$(CONFIG_WIN32) += version.o
 ######################################################################
 
 qemu-img.o: qemu-img-cmds.h
-qemu-img.o qemu-tool.o qemu-nbd.o qemu-io.o cmd.o: $(GENERATED_HEADERS)
+qemu-img.o qemu-tool.o qemu-nbd.o qemu-io.o cmd.o qemu-test.o: $(GENERATED_HEADERS)
 
-qemu-img$(EXESUF): qemu-img.o qemu-tool.o qemu-error.o $(oslib-obj-y) $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) $(version-obj-y) qemu-timer-common.o
+qemu-img$(EXESUF): qemu-img.o qemu-tool.o qemu-tool-time.o qemu-error.o $(oslib-obj-y) $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) $(version-obj-y) qemu-timer-common.o
 
-qemu-nbd$(EXESUF): qemu-nbd.o qemu-tool.o qemu-error.o $(oslib-obj-y) $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) $(version-obj-y) qemu-timer-common.o
+qemu-nbd$(EXESUF): qemu-nbd.o qemu-tool.o qemu-tool-time.o qemu-error.o $(oslib-obj-y) $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) $(version-obj-y) qemu-timer-common.o
 
-qemu-io$(EXESUF): qemu-io.o cmd.o qemu-tool.o qemu-error.o $(oslib-obj-y) $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) $(version-obj-y) qemu-timer-common.o
+qemu-io$(EXESUF): qemu-io.o cmd.o qemu-tool.o qemu-tool-time.o qemu-error.o $(oslib-obj-y) $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) $(version-obj-y) qemu-timer-common.o
+
+qemu-test$(EXESUF): qemu-test.o qemu-tool.o qemu-error.o $(oslib-obj-y) $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) $(version-obj-y) qemu-timer-common.o
 
 qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx
 	$(call quiet-command,sh $(SRC_PATH)/hxtool -h < $< > $@,"  GEN   $@")
diff --git a/Makefile.objs b/Makefile.objs
index c3e52c5..c0c1155 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -23,6 +23,7 @@ block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-nested-y += blksim.o fvd.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_CURL) += curl.o
diff --git a/block.c b/block.c
index ff2795b..856bb1a 100644
--- a/block.c
+++ b/block.c
@@ -58,7 +58,7 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
 static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
                          const uint8_t *buf, int nb_sectors);
 
-static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
+QTAILQ_HEAD(, BlockDriverState) bdrv_states =
     QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
 static QLIST_HEAD(, BlockDriver) bdrv_drivers =
@@ -768,7 +768,7 @@ int bdrv_commit(BlockDriverState *bs)
 
     if (!drv)
         return -ENOMEDIUM;
-    
+
     if (!bs->backing_hd) {
         return -ENOTSUP;
     }
@@ -1538,7 +1538,7 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
  * 'nb_sectors' is the max value 'pnum' should be set to.
  */
 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-	int *pnum)
+        int *pnum)
 {
     int64_t n;
     if (!bs->drv->bdrv_is_allocated) {
@@ -2050,9 +2050,9 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
                               cb, opaque);
 
     if (ret) {
-	/* Update stats even though technically transfer has not happened. */
-	bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
-	bs->rd_ops ++;
+        /* Update stats even though technically transfer has not happened. */
+        bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
+        bs->rd_ops ++;
     }
 
     return ret;
diff --git a/block_int.h b/block_int.h
index 12663e8..2343d07 100644
--- a/block_int.h
+++ b/block_int.h
@@ -28,8 +28,8 @@
 #include "qemu-option.h"
 #include "qemu-queue.h"
 
-#define BLOCK_FLAG_ENCRYPT	1
-#define BLOCK_FLAG_COMPAT6	4
+#define BLOCK_FLAG_ENCRYPT        1
+#define BLOCK_FLAG_COMPAT6        4
 
 #define BLOCK_OPT_SIZE          "size"
 #define BLOCK_OPT_ENCRYPT       "encryption"
@@ -98,6 +98,7 @@ struct BlockDriver {
     int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs,
                                   const char *snapshot_name);
     int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi);
+    int (*bdrv_update)(BlockDriverState *bs, int argc, char **argv);
 
     int (*bdrv_save_vmstate)(BlockDriverState *bs, const uint8_t *buf,
                              int64_t pos, int size);
diff --git a/configure b/configure
index d68f862..35b29e9 100755
--- a/configure
+++ b/configure
@@ -2362,7 +2362,7 @@ confdir=$sysconfdir$confsuffix
 
 tools=
 if test "$softmmu" = yes ; then
-  tools="qemu-img\$(EXESUF) qemu-io\$(EXESUF) $tools"
+  tools="qemu-img\$(EXESUF) qemu-io\$(EXESUF) qemu-test\$(EXESUF) $tools"
   if [ "$linux" = "yes" -o "$bsd" = "yes" -o "$solaris" = "yes" ] ; then
       tools="qemu-nbd\$(EXESUF) $tools"
     if [ "$check_utests" = "yes" ]; then
diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index 6c7176f..1ad378b 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -39,6 +39,12 @@ STEXI
 @item info [-f @var{fmt}] @var{filename}
 ETEXI
 
+DEF("update", img_update,
+    "update [-f fmt] filename [attr1=val1 attr2=val2 ...]")
+STEXI
+@item update [-f @var{fmt}] @var{filename} [@var{attr1=val1 attr2=val2 ...}]
+ETEXI
+
 DEF("snapshot", img_snapshot,
     "snapshot [-l | -a snapshot | -c snapshot | -d snapshot] filename")
 STEXI
diff --git a/qemu-img.c b/qemu-img.c
index afd9ed2..1694206 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -637,7 +637,7 @@ static int img_convert(int argc, char **argv)
         ret = -1;
         goto out;
     }
-        
+
     bs = qemu_mallocz(bs_n * sizeof(BlockDriverState *));
 
     total_sectors = 0;
@@ -865,7 +865,7 @@ static int img_convert(int argc, char **argv)
                    assume that sectors which are unallocated in the input image
                    are present in both the output's and input's base images (no
                    need to copy them). */
-                if (out_baseimg) {
+                if (out_baseimg || bs[bs_i]->backing_file[0] == 0) {
                     if (!bdrv_is_allocated(bs[bs_i], sector_num - bs_offset,
                                            n, &n1)) {
                         sector_num += n1;
@@ -941,10 +941,10 @@ static int64_t get_allocated_file_size(const char *filename)
     /* WinNT support GetCompressedFileSize to determine allocate size */
     get_compressed = (get_compressed_t) GetProcAddress(GetModuleHandle("kernel32"), "GetCompressedFileSizeA");
     if (get_compressed) {
-    	DWORD high, low;
-    	low = get_compressed(filename, &high);
-    	if (low != 0xFFFFFFFFlu || GetLastError() == NO_ERROR)
-	    return (((int64_t) high) << 32) + low;
+        DWORD high, low;
+        low = get_compressed(filename, &high);
+        if (low != 0xFFFFFFFFlu || GetLastError() == NO_ERROR)
+            return (((int64_t) high) << 32) + low;
     }
 
     if (_stati64(filename, &st) < 0)
@@ -1036,11 +1036,6 @@ static int img_info(int argc, char **argv)
     if (bdrv_is_encrypted(bs)) {
         printf("encrypted: yes\n");
     }
-    if (bdrv_get_info(bs, &bdi) >= 0) {
-        if (bdi.cluster_size != 0) {
-            printf("cluster_size: %d\n", bdi.cluster_size);
-        }
-    }
     bdrv_get_backing_filename(bs, backing_filename, sizeof(backing_filename));
     if (backing_filename[0] != '\0') {
         path_combine(backing_filename2, sizeof(backing_filename2),
@@ -1049,11 +1044,56 @@ static int img_info(int argc, char **argv)
                backing_filename,
                backing_filename2);
     }
+    if (bdrv_get_info(bs, &bdi) >= 0) {
+        if (bdi.cluster_size != 0)
+            printf("cluster_size: %d\n", bdi.cluster_size);
+    }
     dump_snapshots(bs);
     bdrv_delete(bs);
     return 0;
 }
 
+static int img_update(int argc, char **argv)
+{
+    int c;
+    const char *filename, *fmt;
+    BlockDriverState *bs;
+
+    fmt = NULL;
+    for(;;) {
+        c = getopt(argc, argv, "f:h");
+        if (c == -1)
+            break;
+        switch(c) {
+        case 'h':
+            help();
+            break;
+        case 'f':
+            fmt = optarg;
+            break;
+        }
+    }
+    if (optind >= argc)
+        help();
+    filename = argv[optind++];
+
+    bs = bdrv_new_open(filename, fmt, BDRV_O_FLAGS | BDRV_O_NO_BACKING | BDRV_O_RDWR);
+    if (!bs) {
+        return 1;
+    }
+
+    if (bs->drv->bdrv_update == NULL) {
+        char fmt_name[128];
+        bdrv_get_format(bs, fmt_name, sizeof(fmt_name));
+        error_report("the 'update' command is not supported by the '%s' image format", fmt_name);
+        bdrv_delete(bs);
+        return 1;
+    }
+
+    bs->drv->bdrv_update(bs, argc-optind, &argv[optind]);
+
+    bdrv_delete(bs);
+    return 0;
+}
+
 #define SNAPSHOT_LIST   1
 #define SNAPSHOT_CREATE 2
 #define SNAPSHOT_APPLY  3
diff --git a/qemu-io.c b/qemu-io.c
index 5b24c5e..c32f8d4 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -1701,6 +1701,8 @@ init_check_command(
 	return 1;
 }
 
+#include "qemu-io-sim.c"
+
 static void usage(const char *name)
 {
 	printf(
@@ -1807,6 +1809,7 @@ int main(int argc, char **argv)
 	add_command(&discard_cmd);
 	add_command(&alloc_cmd);
 	add_command(&map_cmd);
+	add_command(&sim_cmd);
 
 	add_args_command(init_args_command);
 	add_check_command(init_check_command);
diff --git a/qemu-option.c b/qemu-option.c
index 65db542..10ef45f 100644
--- a/qemu-option.c
+++ b/qemu-option.c
@@ -289,6 +289,10 @@ int set_option_parameter(QEMUOptionParameter *list, const char *name,
             return -1;
         break;
 
+    case OPT_NUMBER:
+        list->value.n = strtoull(value, NULL, 0);
+        break;
+
     default:
         fprintf(stderr, "Bug: Option '%s' has an unknown type\n", name);
         return -1;
diff --git a/qemu-tool.c b/qemu-tool.c
index 392e1c9..fdcb2f8 100644
--- a/qemu-tool.c
+++ b/qemu-tool.c
@@ -23,12 +23,6 @@ QEMUClock *rt_clock;
 
 FILE *logfile;
 
-struct QEMUBH
-{
-    QEMUBHFunc *cb;
-    void *opaque;
-};
-
 void qemu_service_io(void)
 {
 }
@@ -73,36 +67,6 @@ void monitor_protocol_event(MonitorEvent event, QObject *data)
 {
 }
 
-QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque)
-{
-    QEMUBH *bh;
-
-    bh = qemu_malloc(sizeof(*bh));
-    bh->cb = cb;
-    bh->opaque = opaque;
-
-    return bh;
-}
-
-int qemu_bh_poll(void)
-{
-    return 0;
-}
-
-void qemu_bh_schedule(QEMUBH *bh)
-{
-    bh->cb(bh->opaque);
-}
-
-void qemu_bh_cancel(QEMUBH *bh)
-{
-}
-
-void qemu_bh_delete(QEMUBH *bh)
-{
-    qemu_free(bh);
-}
-
 int qemu_set_fd_handler2(int fd,
                          IOCanReadHandler *fd_read_poll,
                          IOHandler *fd_read,
-- 
1.7.0.4


* [Qemu-devel] [PATCH 2/5] Fast Virtual Disk (FVD) Proposal Part 2
From: Chunqiang Tang @ 2011-01-19 22:04 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chunqiang Tang

Part 2 of the block device driver for the proposed FVD image format.
The work is split into multiple patches to keep each patch at a
manageable size. This patch adds the new testing tools developed
together with FVD: the simulated block driver block/blksim.c, the
automated tester qemu-test, the 'sim' command for qemu-io, and the
test scripts test-fvd.sh and test-qcow2.sh.

See the related discussions at
http://lists.gnu.org/archive/html/qemu-devel/2011-01/msg00426.html .
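
For orientation, the new tools are driven as follows (file names and
numeric values are illustrative only; see usage() in qemu-test.c and
sim_help() in qemu-io-sim.c below for the authoritative option lists).
qemu-test runs randomized I/O against a "test image" in the format
under test, replays the same I/O against a raw "truth image" of at
least the same size, and compares the two as it goes:

    qemu-test --truth=truth.raw --test=test.fvd --format=fvd \
        --round=1000 --parallel=10 --io_size=262144 \
        --fail_prob=0.1 --cancel_prob=0.1 --compare_after=true

Within qemu-io, the new 'sim' command controls the order in which the
simulated disk I/Os and callbacks of the "blksim" driver execute:

    sim enable          enable simulation (before opening the image)
    sim list            list all pending simulation tasks
    sim <#task> [#ret]  run one task, optionally forcing its return value
    sim all [#ret]      run all pending tasks
    sim prefetch        start prefetching (FVD images only)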

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 block/blksim.c   |  688 ++++++++++++++++++++++++++++++++++++++++++++++
 block/blksim.h   |   30 ++
 qemu-io-sim.c    |  107 ++++++++
 qemu-test.c      |  794 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 qemu-tool-time.c |   88 ++++++
 test-fvd.sh      |  120 ++++++++
 test-qcow2.sh    |   75 +++++
 7 files changed, 1902 insertions(+), 0 deletions(-)
 create mode 100644 block/blksim.c
 create mode 100644 block/blksim.h
 create mode 100644 qemu-io-sim.c
 create mode 100644 qemu-test.c
 create mode 100644 qemu-tool-time.c
 create mode 100755 test-fvd.sh
 create mode 100755 test-qcow2.sh

diff --git a/block/blksim.c b/block/blksim.c
new file mode 100644
index 0000000..35d918f
--- /dev/null
+++ b/block/blksim.c
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this module implements a simulated block device
+ *  driver "blksim". It works with qemu-io and qemu-test to perform testing,
+ *  allowing changing the  order of disk I/O and callback activities to test
+ *  rare race conditions. See qemu-test.c, qemu-io.c, and qemu-io-sim.c.
+ *============================================================================*/
+
+#include <sys/vfs.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <execinfo.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include "block_int.h"
+#include "osdep.h"
+#include "qemu-option.h"
+#include "qemu-timer.h"
+#include "block.h"
+#include "qemu-queue.h"
+#include "qemu-common.h"
+#include "block/blksim.h"
+#include "block/fvd-ext.h"
+
+typedef enum {
+    SIM_NULL,
+    SIM_READ,
+    SIM_WRITE,
+    SIM_FLUSH,
+    SIM_READ_CALLBACK,
+    SIM_WRITE_CALLBACK,
+    SIM_FLUSH_CALLBACK,
+    SIM_TIMER
+} sim_op_t;
+
+static void sim_aio_cancel (BlockDriverAIOCB * acb);
+static int64_t sim_uuid = 0;
+static int64_t current_time = 0;
+static int64_t rand_time = 0;
+static int interactive_print = FALSE;
+struct SimAIOCB;
+
+/*
+ * Note: disk_io_return_code, sim_set_disk_io_return_code(), and insert_task()
+ * work together to ensure that multiple subrequests triggered by the same
+ * outermost request either succeed together or fail together. This behavior
+ * is required by qemu-test.  Here is one example of problems caused by
+ * departing from this behavior.  Consider a write request that generates
+ * two subrequests, w1 and w2. If w1 succeeds but w2 fails, the data will not
+ * be written into qemu-test's "truth image" but the part of the data handled
+ * by w1 will be written into qemu-test's "test image". As a result, their
+ * contents diverge and automated testing cannot continue.
+ */
+static int disk_io_return_code = 0;
+
+typedef struct BDRVSimState {
+    int fd;
+} BDRVSimState;
+
+typedef struct SimAIOCB {
+    BlockDriverAIOCB common;
+    int64_t uuid;
+    sim_op_t op;
+    int64_t sector_num;
+    QEMUIOVector *qiov;
+    int nb_sectors;
+    int ret;
+    int64_t time;
+    struct SimAIOCB *next;
+    struct SimAIOCB *prev;
+
+} SimAIOCB;
+
+static AIOPool sim_aio_pool = {
+    .aiocb_size = sizeof (SimAIOCB),
+    .cancel = sim_aio_cancel,
+};
+
+static SimAIOCB head = {
+    .uuid = -1,
+    .time = (int64_t) (9223372036854775807ULL),
+    .op = SIM_NULL,
+    .next = &head,
+    .prev = &head,
+};
+
+/* Debug a specific task.*/
+#if 1
+# define CHECK_TASK(acb) do { } while (0)
+#else
+static inline void CHECK_TASK (int64_t uuid)
+{
+    if (uuid == 19LL) {
+        printf ("CHECK_TASK pause for task %" PRId64 "\n", uuid);
+    }
+}
+#endif
+
+/* do_io() should never fail. A failure indicates a bug in the upper layer
+ * block device driver, or failure in the real hardware. */
+static int do_io (BlockDriverState * bs, int64_t sector_num, uint8_t * buf,
+                  int nb_sectors, int do_read)
+{
+    BDRVSimState *s = bs->opaque;
+    size_t size = nb_sectors * 512;
+    int ret;
+
+    if (lseek (s->fd, sector_num * 512, SEEK_SET) < 0) {
+        fprintf (stderr, "Error: lseek %s sector_num=%" PRId64 ". "
+                 "Pause process %d for debugging...\n",
+                 bs->filename, sector_num, getpid ());
+        fgetc (stdin);
+    }
+
+    while (size > 0) {
+
+        if (do_read) {
+            ret = read (s->fd, buf, size);
+            if (ret == 0) {
+                fprintf (stderr,
+                         "Error: read beyond the size of %s sector_num=%" PRId64
+                         " nb_sectors=%d. Pause process %d for debugging...\n",
+                         bs->filename, sector_num, nb_sectors, getpid ());
+                fgetc (stdin);
+            }
+        } else {
+            ret = write (s->fd, buf, size);
+        }
+
+        if (ret >= 0) {
+            size -= ret;
+            buf += ret;
+        } else if (errno != EINTR) {
+            fprintf (stderr, "Error: %s %s sector_num=%" PRId64
+                     " nb_sectors=%d. Pause process %d for debugging...\n",
+                     do_read ? "READ" : "WRITE", bs->filename, sector_num,
+                     nb_sectors, getpid ());
+            fgetc (stdin);
+            return -errno;
+        }
+    }
+
+    return 0;
+}
+
+static int sim_read (BlockDriverState * bs, int64_t sector_num, uint8_t * buf,
+                     int nb_sectors)
+{
+    return do_io (bs, sector_num, buf, nb_sectors, TRUE);
+}
+
+static int sim_write (BlockDriverState * bs, int64_t sector_num,
+                      const uint8_t * buf, int nb_sectors)
+{
+    return do_io (bs, sector_num, (uint8_t *) buf, nb_sectors, FALSE);
+}
+
+static void insert_in_list (SimAIOCB * acb)
+{
+    int64_t new_id = sim_uuid++;
+    CHECK_TASK (new_id);
+    acb->uuid = new_id;
+
+    if (rand_time <= 0) {
+        /* Working with qemu-io.c and not doing delay randomization.
+         * Insert it to the tail. */
+        acb->time = 0;
+        acb->prev = head.prev;
+        acb->next = &head;
+        head.prev->next = acb;
+        head.prev = acb;
+        return;
+    }
+
+    if (acb->time >= 0) {
+        /* Introduce a random delay to better trigger rare race conditions. */
+        acb->time += random () % rand_time;
+    }
+
+    /* Find the position to insert. The list is sorted in ascending time. */
+    SimAIOCB *p = head.next;
+    while (1) {
+        if (p->time > acb->time) {
+            break;
+        }
+        if (p->time == acb->time && (random () % 2 == 0)) {
+            break;
+        }
+        p = p->next;
+    }
+
+    /* Insert acb before p. */
+    acb->next = p;
+    acb->prev = p->prev;
+    p->prev->next = acb;
+    p->prev = acb;
+}
+
+/* Debug problems related to reusing task objects. Problem already solved.*/
+#if 1
+# define my_qemu_aio_get qemu_aio_get
+# define my_qemu_aio_release qemu_aio_release
+
+#else
+static SimAIOCB *search_task_list (SimAIOCB * acb)
+{
+    SimAIOCB *p;
+    for (p = head.next; p != &head; p = p->next) {
+        if (p == acb) {
+            return p;
+        }
+    }
+
+    return NULL;
+}
+
+static inline void *my_qemu_aio_get (AIOPool * pool, BlockDriverState * bs,
+                                     BlockDriverCompletionFunc * cb,
+                                     void *opaque)
+{
+    SimAIOCB *acb = (SimAIOCB *) qemu_aio_get (&sim_aio_pool, bs, cb, opaque);
+    QDEBUG ("SIM: qemu_aio_get reuse old task%" PRId64 "\n", acb->uuid);
+    ASSERT (!search_task_list (acb));
+    return acb;
+}
+
+static inline void my_qemu_aio_release (SimAIOCB * acb)
+{
+    QDEBUG ("SIM: qemu_aio_release task%" PRId64 "\n", acb->uuid);
+    qemu_aio_release (acb);
+}
+#endif
+
+static BlockDriverAIOCB *insert_task (int op, BlockDriverState * bs,
+                                      int64_t sector_num, QEMUIOVector * qiov,
+                                      int nb_sectors,
+                                      BlockDriverCompletionFunc * cb,
+                                      void *opaque)
+{
+    SimAIOCB *acb = my_qemu_aio_get (&sim_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+
+    acb->op = op;
+    acb->sector_num = sector_num;
+    acb->qiov = qiov;
+    acb->nb_sectors = nb_sectors;
+    acb->ret = disk_io_return_code;
+    acb->time = current_time;
+    insert_in_list (acb);
+
+    if (interactive_print) {
+        if (op == SIM_READ) {
+            printf ("Added READ uuid=%" PRId64 "  filename=%s  sector_num=%"
+                    PRId64 "  nb_sectors=%d\n", acb->uuid,
+                    acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+        } else if (op == SIM_WRITE) {
+            printf ("Added WRITE uuid=%" PRId64 "  filename=%s  sector_num=%"
+                    PRId64 "  nb_sectors=%d\n", acb->uuid,
+                    acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+        } else {
+            fprintf (stderr, "Unknown op %d\n", op);
+            exit (1);
+        }
+    }
+
+    return &acb->common;
+}
+
+static void insert_aio_callback (SimAIOCB * acb)
+{
+    acb->time = current_time;
+    insert_in_list (acb);
+
+    if (acb->op == SIM_FLUSH) {
+        acb->op = SIM_FLUSH_CALLBACK;
+        if (interactive_print) {
+            printf ("Added FLUSH_CALLBACK uuid=%" PRId64 "  filename=%s\n",
+                    acb->uuid, acb->common.bs->filename);
+        }
+    } else if (acb->op == SIM_READ) {
+        acb->op = SIM_READ_CALLBACK;
+        if (interactive_print) {
+            printf ("Added READ_CALLBACK uuid=%" PRId64
+                    "  filename=%s  sector_num=%" PRId64 "  nb_sectors=%d\n",
+                    acb->uuid, acb->common.bs->filename, acb->sector_num,
+                    acb->nb_sectors);
+        }
+    } else if (acb->op == SIM_WRITE) {
+        acb->op = SIM_WRITE_CALLBACK;
+        if (interactive_print) {
+            printf ("Added WRITE_CALLBACK uuid=%" PRId64
+                    "  filename=%s  sector_num=%" PRId64 "  nb_sectors=%d\n",
+                    acb->uuid, acb->common.bs->filename, acb->sector_num,
+                    acb->nb_sectors);
+        }
+    } else {
+        fprintf (stderr, "Wrong op %d\n", acb->op);
+        exit (1);
+    }
+}
+
+void sim_list_tasks (void)
+{
+    SimAIOCB *acb;
+
+    for (acb = head.next; acb != &head; acb = acb->next) {
+        if (acb->op == SIM_READ) {
+            printf ("uuid=%" PRId64 "  READ           file=%s  sector_num=%"
+                    PRIu64 "  nb_sectors=%d\n", acb->uuid,
+                    acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+        } else if (acb->op == SIM_WRITE) {
+            printf ("uuid=%" PRId64 "  WRITE          file=%s  sector_num=%"
+                    PRIu64 "  nb_sectors=%d\n", acb->uuid,
+                    acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+        } else if (acb->op == SIM_READ_CALLBACK) {
+            printf ("uuid=%" PRId64 "  CALLBACK READ  file=%s  sector_num=%"
+                    PRIu64 "  nb_sectors=%d\n", acb->uuid,
+                    acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+        } else if (acb->op == SIM_WRITE_CALLBACK) {
+            printf ("uuid=%" PRId64 "  CALLBACK WRITE file=%s  sector_num=%"
+                    PRIu64 "  nb_sectors=%d\n", acb->uuid,
+                    acb->common.bs->filename, acb->sector_num, acb->nb_sectors);
+        } else {
+            fprintf (stderr, "Wrong OP %d\n", acb->op);
+            exit (1);
+        }
+    }
+}
+
+static inline void sim_callback (SimAIOCB * acb)
+{
+    ASSERT (disk_io_return_code == 0);
+    FVD_DEBUG_ACB (acb->common.opaque);
+    acb->common.cb (acb->common.opaque, acb->ret);
+}
+
+int64_t sim_get_time (void)
+{
+    return current_time;
+}
+
+void *sim_new_timer (void *cb, void *opaque)
+{
+    SimAIOCB *acb = my_qemu_aio_get (&sim_aio_pool, NULL, cb, opaque);
+    acb->op = SIM_TIMER;
+    acb->prev = NULL;
+    return acb;
+}
+
+void sim_mod_timer (void *ts, int64_t expire_time)
+{
+    SimAIOCB *acb = ts;
+
+    if (acb->prev) {
+        /* Remove it first. */
+        acb->next->prev = acb->prev;
+        acb->prev->next = acb->next;
+    }
+    acb->time = expire_time;
+    insert_in_list (acb);
+}
+
+void sim_free_timer (void *ts)
+{
+    SimAIOCB *acb = ts;
+    CHECK_TASK (acb->uuid);
+    my_qemu_aio_release (acb);
+}
+
+void sim_del_timer (void *ts)
+{
+    SimAIOCB *acb = ts;
+
+    CHECK_TASK (acb->uuid);
+    if (acb->prev) {
+        /* Remove it from the list. */
+        acb->next->prev = acb->prev;
+        acb->prev->next = acb->next;
+
+        /* Mark it as not in list. */
+        acb->prev = NULL;
+    }
+}
+
+void sim_set_disk_io_return_code (int ret)
+{
+    disk_io_return_code = ret;
+}
+
+static void sim_task_by_acb (SimAIOCB * acb)
+{
+    CHECK_TASK (acb->uuid);
+
+    /* Remove it from the list. */
+    acb->next->prev = acb->prev;
+    acb->prev->next = acb->next;
+    acb->prev = NULL;        /* Indicate that it is no longer in the list. */
+
+    if (acb->time > current_time) {
+        current_time = acb->time;
+    }
+
+    if (acb->op == SIM_TIMER) {
+        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64 " TIMER \n",
+                acb->uuid, acb->time);
+
+        FVD_DEBUG_ACB (acb->common.opaque);
+        ((QEMUTimerCB *) acb->common.cb) (acb->common.opaque);
+        return;
+    }
+
+    BlockDriverState *bs = acb->common.bs;
+
+    if (acb->op == SIM_READ) {
+        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64
+                " READ sector_num=%" PRId64 " nb_sectors=%d\n",
+                acb->uuid, acb->time, acb->sector_num, acb->nb_sectors);
+
+        if (acb->ret == 0) {
+            if (acb->qiov->niov == 1) {
+                if (sim_read
+                    (bs, acb->sector_num, acb->qiov->iov->iov_base,
+                     acb->nb_sectors) != 0) {
+                    fprintf (stderr, "Error in reading %s sector_num=%lld "
+                             "nb_sectors=%d\n", acb->common.bs->filename,
+                             acb->sector_num, acb->nb_sectors);
+                    exit (1);
+                }
+            } else {
+                uint8_t *buf =
+                    qemu_blockalign (acb->common.bs, acb->qiov->size);
+                if (sim_read (bs, acb->sector_num, buf, acb->nb_sectors) != 0) {
+                    fprintf (stderr, "Error in reading %s sector_num=%lld "
+                             "nb_sectors=%d\n", acb->common.bs->filename,
+                             acb->sector_num, acb->nb_sectors);
+                    exit (1);
+                }
+                qemu_iovec_from_buffer (acb->qiov, buf, acb->qiov->size);
+                qemu_vfree (buf);
+            }
+        }
+
+        insert_aio_callback (acb);
+    } else if (acb->op == SIM_WRITE) {
+        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64
+                " WRITE sector_num=%" PRId64 " nb_sectors=%d\n",
+                acb->uuid, acb->time, acb->sector_num, acb->nb_sectors);
+
+        if (acb->ret == 0) {
+            if (acb->qiov->niov == 1) {
+                if (sim_write
+                    (bs, acb->sector_num, acb->qiov->iov->iov_base,
+                     acb->nb_sectors) != 0) {
+                    fprintf (stderr, "Error in writing %s sector_num=%lld "
+                             "nb_sectors=%d\n", acb->common.bs->filename,
+                             acb->sector_num, acb->nb_sectors);
+                    exit (1);
+                }
+            } else {
+                uint8_t *buf = qemu_blockalign (acb->common.bs,
+                                                acb->qiov->size);
+                qemu_iovec_to_buffer (acb->qiov, buf);
+                if (sim_write (bs, acb->sector_num, buf, acb->nb_sectors) != 0) {
+                    fprintf (stderr, "Error in writing %s sector_num=%" PRId64
+                             " nb_sectors=%d\n", acb->common.bs->filename,
+                             acb->sector_num, acb->nb_sectors);
+                    exit (1);
+                }
+                qemu_vfree (buf);
+            }
+        }
+
+        insert_aio_callback (acb);
+    } else if (acb->op == SIM_FLUSH) {
+        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64 " FLUSH\n",
+                acb->uuid, acb->time);
+        /* Skip real flushing to speed up simulation:
+         *     if (ret == 0) { fdatasync (s->fd); } */
+        insert_aio_callback (acb);
+    } else if (acb->op == SIM_WRITE_CALLBACK || acb->op == SIM_READ_CALLBACK
+               || acb->op == SIM_FLUSH_CALLBACK) {
+        QDEBUG ("SIM: execute task%" PRId64 " time=%" PRId64 " CALLBACK\n",
+                acb->uuid, acb->time);
+        sim_callback (acb);
+        CHECK_TASK (acb->uuid);
+        my_qemu_aio_release (acb);
+    } else {
+        fprintf (stderr, "Unknown op %d\n", acb->op);
+        exit (1);
+    }
+}
+
+int sim_task_by_uuid (int64_t uuid)
+{
+    SimAIOCB *acb;
+
+    for (acb = head.next; acb != &head; acb = acb->next) {
+        if (acb->uuid == uuid) {
+            sim_task_by_acb (acb);
+            return 0;
+        }
+    }
+
+    return -1;
+}
+
+int sim_all_tasks (void)
+{
+    int n = 0;
+
+    while (1) {
+        SimAIOCB *acb = head.next;
+        if (acb == &head) {
+            return n;
+        }
+
+        sim_task_by_acb (acb);
+        n++;
+    }
+}
+
+static BlockDriverAIOCB *sim_aio_readv (BlockDriverState * bs,
+                                        int64_t sector_num,
+                                        QEMUIOVector * qiov,
+                                        int nb_sectors,
+                                        BlockDriverCompletionFunc * cb,
+                                        void *opaque)
+{
+    return insert_task (SIM_READ, bs, sector_num, qiov, nb_sectors, cb, opaque);
+}
+
+static BlockDriverAIOCB *sim_aio_writev (BlockDriverState * bs,
+                                         int64_t sector_num,
+                                         QEMUIOVector * qiov,
+                                         int nb_sectors,
+                                         BlockDriverCompletionFunc * cb,
+                                         void *opaque)
+{
+    return insert_task (SIM_WRITE, bs, sector_num, qiov, nb_sectors, cb,
+                        opaque);
+}
+
+static BlockDriverAIOCB *sim_aio_flush (BlockDriverState * bs,
+                                        BlockDriverCompletionFunc * cb,
+                                        void *opaque)
+{
+    return insert_task (SIM_FLUSH, bs, 0, NULL, 0, cb, opaque);
+}
+
+static void sim_aio_cancel (BlockDriverAIOCB * blockacb)
+{
+    SimAIOCB *acb = container_of (blockacb, SimAIOCB, common);
+
+    CHECK_TASK (acb->uuid);
+
+    if (acb->prev) {
+        acb->next->prev = acb->prev;
+        acb->prev->next = acb->next;
+        acb->prev = NULL;
+        my_qemu_aio_release (acb);
+    } else {
+        ASSERT (FALSE);        /* Cancel a task not in the list. */
+    }
+}
+
+static int sim_probe (const uint8_t * buf, int buf_size, const char *filename)
+{
+    /* Return a score higher than RAW so that the image will be opened using
+     * the 'blksim' format. */
+    return 2;
+}
+
+static int sim_open (BlockDriverState * bs, const char *filename,
+                     int bdrv_flags)
+{
+    BDRVSimState *s = bs->opaque;
+    int open_flags = O_BINARY | O_LARGEFILE;
+
+    if ((bdrv_flags & BDRV_O_RDWR)) {
+        open_flags |= O_RDWR;
+    } else {
+        open_flags |= O_RDONLY;
+    }
+
+    if ((bdrv_flags & BDRV_O_NOCACHE)) {
+        open_flags |= O_DIRECT;
+    } else if (!(bdrv_flags & BDRV_O_CACHE_WB)) {
+        open_flags |= O_DSYNC;
+    }
+
+    /* Parse the "blksim:" prefix */
+    if (!strncmp(filename, "blksim:", strlen("blksim:"))) {
+        filename += strlen("blksim:");
+    }
+
+    s->fd = open (filename, open_flags);
+    if (s->fd < 0)
+        return -1;
+
+    int64_t len = lseek (s->fd, 0, SEEK_END);
+    if (len >= 0) {
+        bs->total_sectors = len / 512;
+    } else {
+        bs->total_sectors = 0;
+    }
+
+    bs->growable = 1;
+    return 0;
+}
+
+static void sim_close (BlockDriverState * bs)
+{
+    BDRVSimState *s = bs->opaque;
+    close (s->fd);
+}
+
+static int sim_flush (BlockDriverState * bs)
+{
+    /*
+     * Skip real flushing to speed up simulation.
+     * BDRVSimState *s = bs->opaque;
+     * fdatasync (s->fd);
+     */
+    return 0;
+}
+
+static int sim_has_zero_init (BlockDriverState * bs)
+{
+    struct stat buf;
+
+    if (stat (bs->filename, &buf) != 0) {
+        fprintf (stderr, "Failed to stat() %s\n", bs->filename);
+        exit (1);
+    }
+
+    if (S_ISBLK (buf.st_mode) || S_ISCHR (buf.st_mode)) {
+        return 0;
+    }
+
+    return 1;
+}
+
+static int sim_truncate (BlockDriverState * bs, int64_t offset)
+{
+    BDRVSimState *s = bs->opaque;
+    return ftruncate (s->fd, offset);
+}
+
+BlockDriver bdrv_sim = {
+    .format_name = "blksim",
+    .protocol_name = "blksim",
+    .instance_size = sizeof (BDRVSimState),
+    .bdrv_probe = sim_probe,
+    .bdrv_file_open = sim_open,
+    .bdrv_close = sim_close,
+    .bdrv_flush = sim_flush,
+    .bdrv_read = sim_read,
+    .bdrv_write = sim_write,
+    .bdrv_aio_readv = sim_aio_readv,
+    .bdrv_aio_writev = sim_aio_writev,
+    .bdrv_aio_flush = sim_aio_flush,
+    .bdrv_has_zero_init = sim_has_zero_init,
+    .bdrv_truncate = sim_truncate,
+};
+
+void enable_block_sim (int print, int64_t _rand_time)
+{
+    BlockDriver *drv = bdrv_find_format ("blksim");
+    if (!drv) {
+        bdrv_register (&bdrv_sim);
+    }
+    interactive_print = print;
+    rand_time = _rand_time;
+}
diff --git a/block/blksim.h b/block/blksim.h
new file mode 100644
index 0000000..7afca98
--- /dev/null
+++ b/block/blksim.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this is the header of the simulated block device
+ *  driver "sim".
+ *============================================================================*/
+
+#ifndef __block_sim_h__
+#define __block_sim_h__
+
+void enable_block_sim (int print, int64_t _rand_time);
+void sim_list_tasks (void);
+int sim_task_by_uuid (int64_t uuid);
+int sim_all_tasks (void);
+int64_t sim_get_time (void);
+void *sim_new_timer (void *cb, void *opaque);
+void sim_mod_timer (void *ts, int64_t expire_time);
+void sim_free_timer (void *ts);
+void sim_del_timer (void *ts);
+void sim_set_disk_io_return_code (int ret);
+
+#endif
diff --git a/qemu-io-sim.c b/qemu-io-sim.c
new file mode 100644
index 0000000..1e7a2aa
--- /dev/null
+++ b/qemu-io-sim.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ * qemu-io-sim works with qemu-io to perform simulated testing. The 'sim'
+ * command allows the user to control the order of disk I/O and callback
+ * activities in order to test rare race conditions. Note that once 'sim
+ * enable' is done, it can only test aio_read and aio_write. See block/blksim.c
+ * for the simulated block device driver.
+ *============================================================================*/
+
+#include "block/blksim.h"
+
+void fvd_init_prefetch (BlockDriverState * bs);
+static void sim_start_prefetch (void)
+{
+    if (!bs->drv->format_name || strncmp (bs->drv->format_name, "fvd", 3) != 0) {
+        printf ("This image does not support prefetching.\n");
+        return;
+    }
+    fvd_init_prefetch (bs);
+    printf ("Prefetching started\n");
+}
+
+static void sim_help (void)
+{
+    printf ("\n"
+            " sim enable\t\tenable simulation\n"
+            " sim list\t\tlist all simulation tasks\n"
+            " sim <#task> [#ret]\trun a simulation task, optionally uing #ret as the return value of a read/write operation\n"
+            " sim all [#ret]\t\trun all tasks, optionally using #ret as the return value of read/write tasks\n"
+            " sim prefetch\t\tstart prefetching\n");
+}
+
+static int sim_f (int argc, char **argv)
+{
+    int ret = 0;
+
+    if (argc == 3) {
+        ret = atoi (argv[2]);
+    }
+    else if (argc != 2) {
+        sim_help ();
+        return 0;
+    }
+
+    if (strcmp (argv[1], "enable") == 0) {
+        if (bs) {
+            printf ("Please close the image first. \"sim enable\" must be done before the\n"
+                    "image is openned so that the image is openned with simulation support.\n");
+        }
+        else {
+            enable_block_sim(1/*print*/, 0 /*no random time*/);
+            printf ("Block device simulation is enabled.\n");
+        }
+        return 0;
+    }
+
+    if (!bs) {
+        fprintf(stderr, "no file open, try 'help open'\n");
+        return 0;
+    }
+
+    if (!bdrv_find_format("blksim")) {
+        printf ("\"sim enable\" must be done before invoking any other sim commands.\n");
+        return 0;
+    }
+
+    if (strcmp (argv[1], "list") == 0) {
+        sim_list_tasks ();
+    }
+    else if (strcmp (argv[1], "prefetch") == 0) {
+        sim_start_prefetch ();
+    }
+    else if (strcmp (argv[1], "all") == 0) {
+        sim_set_disk_io_return_code (ret);
+        int n = sim_all_tasks ();
+        sim_set_disk_io_return_code (0);
+        printf ("Executed %d tasks.\n", n);
+    }
+    else {
+        sim_set_disk_io_return_code (ret);
+        sim_task_by_uuid (atoll (argv[1]));
+        sim_set_disk_io_return_code (0);
+    }
+
+    return 0;
+}
+
+static const cmdinfo_t sim_cmd = {
+    .name = "sim",
+    .altname = "s",
+    .cfunc = sim_f,
+    .argmin = 1,
+    .argmax = 2,
+    .args = "",
+    .oneline = "use simulation to control the order of disk I/Os and callbacks",
+    .flags = CMD_NOFILE_OK,
+    .help = sim_help,
+};
diff --git a/qemu-test.c b/qemu-test.c
new file mode 100644
index 0000000..12aefa3
--- /dev/null
+++ b/qemu-test.c
@@ -0,0 +1,794 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *        Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this module implements a fully automated testing tool
+ *  for block device drivers. It works with block/blksim.c.
+ *=============================================================================
+ */
+
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <getopt.h>
+
+#include "qemu-timer.h"
+#include "qemu-common.h"
+#include "block_int.h"
+#include "block/fvd-ext.h"
+#include "block/blksim.h"
+
+#define die(format,...) \
+    do { \
+        fprintf (stderr, "%s:%d --- ", __FILE__, __LINE__); \
+        fprintf (stderr, format, ##__VA_ARGS__); \
+        exit (-1);\
+    } while(0)
+
+typedef enum { OP_NULL = 0, OP_READ, OP_WRITE, OP_FLUSH } op_type_t;
+const char *op_type_str[] = { "NULL ", "READ ", "WRITE", "FLUSH" };
+
+typedef struct CompareFullCB {
+    QEMUIOVector qiov;
+    struct iovec iov;
+    int64_t sector_num;
+    int nb_sectors;
+    int max_nb_sectors;
+    uint8_t *truth_buf;
+} CompareFullCB;
+
+typedef struct RandomIO {
+    QEMUIOVector qiov;
+    int64_t sector_num;
+    int nb_sectors;
+    uint8_t *truth_buf;
+    uint8_t *test_buf;
+    op_type_t type;
+    int tester;
+    int64_t uuid;
+    int allow_cancel;
+    BlockDriverAIOCB *acb;
+} RandomIO;
+
+static char *progname;
+static BlockDriverState *bs;
+static int fd;
+static int64_t total_sectors;
+static int64_t io_size = 262144;
+static int verify_write = TRUE;
+static int parallel = 1;
+static int max_iov = 10;
+static int64_t round = 10;
+static int64_t finished_round = 0;
+static RandomIO *testers = NULL;
+static double fail_prob = 0;
+static double cancel_prob = 0;
+static double flush_prob = 0;
+static int64_t rand_time = 1000;
+static int64_t test_uuid = 0;
+static int instant_qemubh = FALSE;
+
+static void rand_io_cb (void *opaque, int ret);
+static void perform_next_io (RandomIO * r);
+
+int64_t qemu_get_clock (QEMUClock * clock)
+{
+    return sim_get_time ();
+}
+
+void qemu_mod_timer (QEMUTimer * ts, int64_t expire_time)
+{
+    sim_mod_timer (ts, expire_time);
+}
+
+QEMUTimer *qemu_new_timer (QEMUClock * clock, QEMUTimerCB * cb, void *opaque)
+{
+    return sim_new_timer (cb, opaque);
+}
+
+void qemu_free_timer (QEMUTimer * ts)
+{
+    sim_free_timer (ts);
+}
+
+void qemu_del_timer (QEMUTimer * ts)
+{
+    sim_del_timer (ts);
+}
+
+QEMUBH *qemu_bh_new (QEMUBHFunc * cb, void *opaque)
+{
+    return sim_new_timer (cb, opaque);
+}
+
+int qemu_bh_poll (void)
+{
+    return 0;
+}
+
+void qemu_bh_schedule (QEMUBH * bh)
+{
+    if (instant_qemubh) {
+        sim_mod_timer (bh, -1);        /* Run this bh next. */
+    } else {
+        sim_mod_timer (bh, sim_get_time ());
+    }
+}
+
+void qemu_bh_cancel (QEMUBH * bh)
+{
+    sim_del_timer (bh);
+}
+
+void qemu_bh_delete (QEMUBH * bh)
+{
+    sim_free_timer (bh);
+}
+
+static void usage (void)
+{
+    printf ("%s [--help]\n"
+            "\t--truth=<truth_img>\n"
+            "\t--test=<img_to_test>\n"
+            "\t[--format=<test_img_fmt>]\n"
+            "\t[--round=<#d>]\n"
+            "\t[--instant_qemubh=<true|false>]\n"
+            "\t[--fail_prob=<#f>]\n"
+            "\t[--cancel_prob=<#f>]\n"
+            "\t[--flush_prob=<#f>]\n"
+            "\t[--io_size=<#d>]\n"
+            "\t[--verify_write=[true|false]]\n"
+            "\t[--parallel=[#d]\n"
+            "\t[--max_iov=[#d]\n"
+            "\t[--compare_before=[true|false]]\n"
+            "\t[--compare_after=[true|false]]\n" "\n", progname);
+    exit (1);
+}
+
+static int truth_io (void *buf, int64_t sector_num, int nb_sectors, int do_read)
+{
+    off_t offset = sector_num * 512;
+    size_t size = nb_sectors * 512;
+
+    while (size > 0) {
+        int r;
+        if (do_read) {
+            r = pread (fd, buf, size, offset);
+        } else {
+            r = pwrite (fd, buf, size, offset);
+        }
+        if (r >= 0) {
+            size -= r;
+            offset += r;
+            buf = (void *) (((char *) buf) + r);
+        } else if (errno != EINTR) {
+            perror ("io");
+            die ("I/O error on the truth file.\n");
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+static int verify (uint8_t * truth_buf, uint8_t * test_buf,
+                   int64_t sector_num, int nb_sectors)
+{
+    int i;
+    for (i = 0; i < nb_sectors; i++) {
+        int64_t offset = i * 512;
+        if (memcmp (&truth_buf[offset], &test_buf[offset], 512) != 0) {
+            int j;
+            printf ("Sector %lld differs\n", sector_num + i);
+            QDEBUG ("Sector %lld differs\n", sector_num + i);
+            for (j = 0; j < 512; j++) {
+                if (truth_buf[offset + j] == test_buf[offset + j]) {
+                    QDEBUG ("%02d: %02X  %02X\n", j, truth_buf[offset + j],
+                            test_buf[offset + j]);
+                } else {
+                    QDEBUG ("%02d: %02X  %02X   ***\n", j,
+                            truth_buf[offset + j], test_buf[offset + j]);
+                }
+            }
+
+            fprintf (stderr, "Pause process %d for debugging...\n", getpid ());
+            fgetc (stdin);
+
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
+static void compare_full_images_cb (void *opaque, int ret)
+{
+    CompareFullCB *cf = opaque;
+
+    if (ret) {
+        /* Failed. Retry the operation. */
+        bdrv_aio_readv (bs, cf->sector_num, &cf->qiov, cf->nb_sectors,
+                        compare_full_images_cb, cf);
+        return;
+    }
+
+    truth_io (cf->truth_buf, cf->sector_num, cf->nb_sectors, TRUE);
+    verify (cf->truth_buf, cf->iov.iov_base, cf->sector_num, cf->nb_sectors);
+
+    cf->sector_num += cf->nb_sectors;
+    if (cf->sector_num >= total_sectors) {
+        /* Finished. */
+        free (cf->truth_buf);
+        qemu_vfree (cf->iov.iov_base);
+        qemu_free (cf);
+        return;
+    }
+
+    /* Read more data to compare. */
+    if (cf->sector_num + cf->max_nb_sectors > total_sectors) {
+        cf->nb_sectors = total_sectors - cf->sector_num;
+    } else {
+        cf->nb_sectors = cf->max_nb_sectors;
+    }
+    cf->iov.iov_len = cf->nb_sectors * 512;
+    qemu_iovec_init_external (&cf->qiov, &cf->iov, 1);
+    if (!bdrv_aio_readv (bs, cf->sector_num, &cf->qiov,
+                         cf->nb_sectors, compare_full_images_cb, cf)) {
+        die ("bdrv_aio_readv\n");
+    }
+}
+
+static int compare_full_images (void)
+{
+    CompareFullCB *cf;
+    int old_copy_on_read = FALSE;
+
+    printf ("Performing a full comparison of the truth image and "
+            "the test image...\n");
+
+    if (!strncmp (bs->drv->format_name, "fvd", 3)) {
+        /* Disable copy-on-read when scanning through the entire image. */
+        old_copy_on_read = fvd_get_copy_on_read (bs);
+        fvd_set_copy_on_read (bs, FALSE);
+    }
+
+    cf = qemu_malloc (sizeof (CompareFullCB));
+    cf->max_nb_sectors = 1048576L / 512;
+    cf->nb_sectors = MIN (cf->max_nb_sectors, total_sectors);
+    if (posix_memalign ((void **) &cf->truth_buf, 512,
+                        cf->max_nb_sectors * 512) != 0) {
+        die ("posix_memalign");
+    }
+    cf->iov.iov_base = qemu_blockalign (bs, cf->max_nb_sectors * 512);
+    cf->iov.iov_len = cf->nb_sectors * 512;
+    cf->sector_num = 0;
+    qemu_iovec_init_external (&cf->qiov, &cf->iov, 1);
+    if (!bdrv_aio_readv (bs, cf->sector_num, &cf->qiov,
+                         cf->nb_sectors, compare_full_images_cb, cf)) {
+        die ("bdrv_aio_readv\n");
+    }
+
+    sim_all_tasks ();
+
+    if (!strncmp (bs->drv->format_name, "fvd", 3)) {
+        fvd_set_copy_on_read (bs, old_copy_on_read);
+    }
+
+    return 0;
+}
+
+static inline int64_t rand64 (void)
+{
+    int64_t f1 = random ();
+    int64_t f2 = random ();
+    int64_t f3 = (f1 << 32) | f2;
+    return f3 >= 0 ? f3 : -f3;
+}
+
+static int check_conflict (RandomIO * r)
+{
+    int i;
+
+    for (i = 0; i < parallel; i++) {
+        RandomIO *s = &testers[i];
+        if (s == r || s->type == OP_FLUSH ||
+            (r->type == OP_READ && s->type == OP_READ)) {
+            continue;
+        }
+
+        if ((r->sector_num <= s->sector_num &&
+             s->sector_num < r->sector_num + r->nb_sectors) ||
+            (s->sector_num <= r->sector_num &&
+             r->sector_num < s->sector_num + s->nb_sectors)) {
+            return 1;        /* Conflict. */
+        }
+    }
+
+    return 0;        /* No conflict. */
+}
+
+/* Return FALSE if the submitted request is cancelled. */
+static int submit_rand_io (RandomIO * r)
+{
+    BlockDriverAIOCB *acb = NULL;
+
+    QDEBUG ("TESTER %03d:  %s  test%" PRIX64 " sector_num=%" PRId64
+            " nb_sectors=%d niov=%d\n", r->tester, op_type_str[r->type],
+            r->uuid, r->sector_num, r->nb_sectors, r->qiov.niov);
+    printf ("TESTER %03d:  %s  sector_num=%" PRId64 " nb_sectors=%d niov=%d\n",
+            r->tester, op_type_str[r->type], r->sector_num, r->nb_sectors,
+            r->qiov.niov);
+
+    int ret;
+    if (fail_prob <= 0) {
+        ret = 0;
+    } else if (random () / (double) RAND_MAX <= fail_prob) {
+        ret = -EIO;
+    } else {
+        ret = 0;
+    }
+
+    /* This affects whether this request will fail or not. */
+    sim_set_disk_io_return_code (ret);
+
+    switch (r->type) {
+    case OP_READ:
+        if (!(acb = bdrv_aio_readv (bs, r->sector_num, &r->qiov, r->nb_sectors,
+                             rand_io_cb, r))) {
+            die ("bdrv_aio_readv\n");
+        }
+        break;
+    case OP_WRITE:
+        if (!(acb = bdrv_aio_writev (bs, r->sector_num, &r->qiov, r->nb_sectors,
+                              rand_io_cb, r))) {
+            die ("bdrv_aio_writev\n");
+        }
+        break;
+    case OP_FLUSH:
+        if (!(acb = bdrv_aio_flush (bs, rand_io_cb, r))) {
+            die ("bdrv_aio_flush\n");
+        }
+        break;
+    case OP_NULL:
+        die ("OP_NULL");
+        break;
+    }
+
+    sim_set_disk_io_return_code (0);        /* Reset to no failure state. */
+
+    if (r->allow_cancel && cancel_prob > 0 &&
+                random () / (double) RAND_MAX <= cancel_prob) {
+        QDEBUG ("TESTER %03d:  cancel %s test%" PRIX64 " sector_num=%" PRId64
+                " nb_sectors=%d niov=%d\n", r->tester, op_type_str[r->type],
+                r->uuid, r->sector_num, r->nb_sectors, r->qiov.niov);
+        printf ("TESTER %03d:  cancel %s sector_num=%" PRId64
+                " nb_sectors=%d niov=%d\n", r->tester, op_type_str[r->type],
+                r->sector_num, r->nb_sectors, r->qiov.niov);
+        bdrv_aio_cancel (acb);
+        return FALSE;
+    } else {
+        return TRUE;
+    }
+}
+
+static void prepare_read_write (RandomIO * r)
+{
+    /* Do a READ or WRITE? */
+    if (random () % 2) {
+        r->type = OP_READ;
+    } else {
+        r->type = OP_WRITE;
+    }
+
+    /* Find the next region to perform io. */
+    do {
+        if (parallel <= 1 || (random () % 2 == 0)) {
+            /* Perform a random I/O. */
+            r->sector_num = rand64 () % total_sectors;
+        } else {
+            /* Perform an I/O next to a currently ongoing I/O. */
+            int id;
+            do {
+                id = random () % parallel;
+            } while (id == r->tester);
+
+            RandomIO *p = &testers[id];
+            r->sector_num =
+                p->sector_num + 2 * io_size - rand64 () % (4 * io_size);
+            if (r->sector_num < 0) {
+                r->sector_num = 0;
+            } else if (r->sector_num >= total_sectors) {
+                r->sector_num = total_sectors - 1;
+            }
+        }
+
+        r->nb_sectors = 1 + rand64 () % io_size;
+        if (r->sector_num + r->nb_sectors > total_sectors) {
+            r->nb_sectors = total_sectors - r->sector_num;
+        }
+    } while (check_conflict (r));
+
+    if (r->type == OP_WRITE) {
+        /* Fill test_buf with random data. */
+        int i, j;
+        for (i = 0; i < r->nb_sectors; i++) {
+            const uint64_t TEST_MAGIC = 0x0123456789ABCDEFULL;
+            /* The first 8 bytes of the sector store the uuid of the current
+             * test request. The next 8 bytes store a magic number.  This
+             * info helps debugging. */
+            uint64_t *p = (uint64_t *) & r->test_buf[i * 512];
+            *p = r->uuid;
+            cpu_to_be64s (p);
+            p++;
+            *p = TEST_MAGIC;
+            cpu_to_be64s (p);
+
+            /* The rest of the sector is filled with random data. */
+            uint32_t *q = (uint32_t *) (p + 1);
+            int n = (512 - 2 * sizeof (uint64_t)) / sizeof (uint32_t);
+            for (j = 0; j < n; j++) {
+                *q++ = random ();
+            }
+        }
+    }
+
+    /* Determine the number of iov. */
+    int niov = 0;
+    uint8_t *p = r->test_buf;
+    int left = r->nb_sectors;
+    do {
+        if (niov == max_iov - 1) {
+            r->qiov.iov[niov].iov_len = left * 512;
+            r->qiov.iov[niov].iov_base = p;
+            niov++;
+            break;
+        }
+
+        int nb = 1 + random () % left;
+        r->qiov.iov[niov].iov_len = nb * 512;
+        r->qiov.iov[niov].iov_base = p;
+        p += r->qiov.iov[niov].iov_len;
+        left -= nb;
+        niov++;
+    } while (left > 0);
+
+    qemu_iovec_init_external (&r->qiov, r->qiov.iov, niov);
+}
+
+static void perform_next_io (RandomIO * r)
+{
+    if (finished_round >= round) {
+        return;
+    }
+
+    finished_round++;
+    r->allow_cancel = TRUE;
+
+    do {
+        r->uuid = test_uuid++;
+
+        if (flush_prob > 0 && random () / (double) RAND_MAX < flush_prob) {
+            r->type = OP_FLUSH;
+        } else {
+            prepare_read_write (r);
+        }
+    } while (!submit_rand_io (r));
+}
+
+static void rand_io_cb (void *opaque, int ret)
+{
+    RandomIO *r = opaque;
+
+    if (ret) {
+        if (fail_prob <= 0) {
+            fprintf (stderr, "Request %s sector_num=%" PRId64
+                     " nb_sectors=%d failed while fail_prob=0. "
+                     "Pause for debugging...\n",
+                     op_type_str[r->type], r->sector_num, r->nb_sectors);
+            fgetc (stdin);
+        } else {
+            /* Failed. Retry the operation. */
+            QDEBUG ("TESTER %03d:  retry %s  test%" PRIX64 " sector_num=%"
+                    PRId64 " nb_sectors=%d niov=%d\n",
+                    r->tester, op_type_str[r->type], r->uuid,
+                    r->sector_num, r->nb_sectors, r->qiov.niov);
+            if (!submit_rand_io (r)) {
+                perform_next_io (r);
+            }
+            return;
+        }
+    } else {
+        QDEBUG ("TESTER %03d:  finished %s  test%" PRIX64 " sector_num=%"PRId64
+                " nb_sectors=%d niov=%d\n", r->tester, op_type_str[r->type],
+                r->uuid, r->sector_num, r->nb_sectors, r->qiov.niov);
+    }
+
+    switch (r->type) {
+    case OP_FLUSH:
+        perform_next_io (r);
+        return;
+
+    case OP_READ:
+        truth_io (r->truth_buf, r->sector_num, r->nb_sectors, TRUE);
+        verify (r->truth_buf, r->test_buf, r->sector_num, r->nb_sectors);
+        perform_next_io (r);
+        return;
+
+    case OP_WRITE:
+        truth_io (r->test_buf, r->sector_num, r->nb_sectors, FALSE);
+        if (verify_write) {
+            /* Perform a read for the same data. */
+            r->type = OP_READ;
+
+            /* To verify the write, this read cannot be cancelled. */
+            r->allow_cancel = FALSE;
+            r->qiov.niov = 1;
+            r->qiov.iov[0].iov_len = r->qiov.size;
+            memset (r->test_buf, 0xA5, r->qiov.size); /* Fill in garbage. */
+            submit_rand_io (r);
+        } else {
+            perform_next_io (r);
+        }
+        return;
+
+    case OP_NULL:
+        die ("OP_NULL");
+        return;
+    }
+}
+
+static int read_bool (const char *arg)
+{
+    int val = TRUE;
+    if (strcmp (optarg, "true") == 0) {
+        val = TRUE;
+    } else if (strcmp (optarg, "false") == 0) {
+        val = FALSE;
+    } else {
+        printf ("%s is neither 'true' nor 'false'\n", arg);
+        usage ();
+    }
+
+    return val;
+}
+
+
+static void perform_test(const char *truth_file, const char *test_file,
+                         const char *format, int compare_before,
+                         int compare_after)
+{
+    int flags, i;
+
+    bs = bdrv_new ("hda");
+    if (!bs) {
+        die ("bdrv_new failed\n");
+    }
+
+    BlockDriver *drv = NULL;
+    if (format) {
+        drv = bdrv_find_format (format);
+        if (!drv) {
+            die ("Found no driver for format '%s'.\n", format);
+        }
+    }
+
+    flags = BDRV_O_RDWR | BDRV_O_CACHE_WB;
+
+    if (bdrv_open (bs, test_file, flags, drv) < 0) {
+        die ("Failed to open '%s'\n", test_file);
+    }
+
+    fd = open (truth_file, O_RDWR | O_LARGEFILE, 0);
+    if (fd < 0) {
+        perror ("open");
+        die ("Failed to open '%s'\n", truth_file);
+    }
+
+    int64_t l0 = lseek (fd, 0, SEEK_END);
+    int64_t l1 = bdrv_getlength (bs);
+    if (l0 < 0 || l1 < 0 || l0 < l1) {
+        die ("Mismatch: truth image %s length %lld, test image %s "
+             "length %lld\n", truth_file, l0, test_file, l1);
+    }
+
+    total_sectors = l1 / 512;
+    if (total_sectors <= 1) {
+        die ("Total sectors: %" PRId64 "\n", total_sectors);
+    }
+
+    io_size /= 512;
+    if (io_size <= 0) {
+        io_size = 1;
+    } else if (io_size > total_sectors / 2) {
+        io_size = total_sectors / 2;
+    }
+
+    if (compare_before) {
+        if (compare_full_images ()) {
+            die ("The original two files do not match.\n");
+        }
+    }
+
+    if (round > 0) {
+        /* Create testers. */
+        testers = qemu_malloc (sizeof (RandomIO) * parallel);
+        for (i = 0; i < parallel; i++) {
+            RandomIO *r = &testers[i];
+            r->test_buf = qemu_blockalign (bs, io_size * 512);
+            if (posix_memalign ((void **) &r->truth_buf, 512, io_size * 512)) {
+                die ("posix_memalign");
+            }
+            r->qiov.iov = qemu_malloc (sizeof (struct iovec) * max_iov);
+            r->sector_num = 0;
+            r->nb_sectors = 0;
+            r->type = OP_READ;
+            r->tester = i;
+        }
+        for (i = 0; i < parallel; i++) {
+            perform_next_io (&testers[i]);
+        }
+    }
+
+    sim_all_tasks ();        /* Run tests. */
+
+    if (round > 0) {
+        /* Verify the results and clean up the testers. */
+        if (compare_after) {
+            if (compare_full_images ()) {
+                die ("The two files do not match after I/O operations.\n");
+            }
+        }
+
+        for (i = 0; i < parallel; i++) {
+            RandomIO *r = &testers[i];
+            qemu_vfree (r->test_buf);
+            free (r->truth_buf);
+            qemu_free (r->qiov.iov);
+        }
+        qemu_free (testers);
+    }
+
+    printf ("Test process %d finished successfully\n", getpid ());
+
+    int fvd = (strncmp (bs->drv->format_name, "fvd", 3) == 0);
+    bdrv_delete (bs);
+    if (fvd) {
+        fvd_check_memory_usage ();
+    }
+    close (fd);
+}
+
+int main (int argc, char **argv)
+{
+    int c;
+    const char *truth_file = NULL;
+    const char *test_file = NULL;
+    const char *format = NULL;
+    int compare_before = FALSE;
+    int compare_after = TRUE;
+    int seed = 0;
+
+    const struct option lopt[] = {
+        {"help", 0, 0, 'h'},
+        {"seed", 1, 0, 'd'},
+        {"truth", 1, 0, 'b'},
+        {"test", 1, 0, 't'},
+        {"format", 1, 0, 'f'},
+        {"rand_time", 1, 0, 'n'},
+        {"fail_prob", 1, 0, 'u'},
+        {"cancel_prob", 1, 0, 'c'},
+        {"flush_prob", 1, 0, 'w'},
+        {"round", 1, 0, 'r'},
+        {"parallel", 1, 0, 'p'},
+        {"compare_before", 1, 0, 'm'},
+        {"verify_write", 1, 0, 'v'},
+        {"compare_after", 1, 0, 'a'},
+        {"max_iov", 1, 0, 'i'},
+        {"io_size", 1, 0, 's'},
+        {"instant_qemubh", 1, 0, 'q'},
+        {NULL, 0, NULL, 0}
+    };
+
+    progname = basename (argv[0]);
+
+    while ((c = getopt_long (argc, argv, "hc:u:p:q:i:f:d:b:t:r:m:v:a:s:n:w:",
+                             lopt, NULL)) != -1) {
+        switch (c) {
+        case 'h':
+            usage ();
+            return 0;
+
+        case 'q':
+            instant_qemubh = read_bool (optarg);
+            break;
+
+        case 'w':
+            flush_prob = atof (optarg);
+            break;
+
+        case 'c':
+            cancel_prob = atof (optarg);
+            break;
+
+        case 'u':
+            fail_prob = atof (optarg);
+            break;
+
+        case 'n':
+            rand_time = atoll (optarg);
+            break;
+
+        case 'i':
+            max_iov = atoi (optarg);
+            break;
+
+        case 'p':
+            parallel = atoi (optarg);
+            break;
+
+        case 'v':
+            verify_write = read_bool (optarg);
+            break;
+
+        case 'm':
+            compare_before = read_bool (optarg);
+            break;
+
+        case 'a':
+            compare_after = read_bool (optarg);
+            break;
+
+        case 'd':
+            seed = atoll (optarg);
+            break;
+
+        case 'f':
+            format = optarg;
+            break;
+
+        case 'b':
+            truth_file = optarg;
+            break;
+
+        case 't':
+            test_file = optarg;
+            break;
+
+        case 's':
+            io_size = atoll (optarg);
+            break;
+
+        case 'r':
+            round = atoll (optarg);
+            break;
+
+        default:
+            usage ();
+            return 1;
+        }
+    }
+
+    if (!truth_file || !test_file) {
+        usage ();
+        return 1;
+    }
+
+    if (parallel <= 0) {
+        parallel = 1;
+    }
+    srandom (seed);
+    rt_clock = (QEMUClock *) - 1; /* Convince FVD this is not in a qemu-tool. */
+    enable_block_sim (FALSE /*no print */ , rand_time);
+    fvd_enable_host_crash_test ();
+    bdrv_init ();
+    perform_test (truth_file, test_file, format, compare_before, compare_after);
+    return 0;
+}
diff --git a/qemu-tool-time.c b/qemu-tool-time.c
new file mode 100644
index 0000000..4aa2466
--- /dev/null
+++ b/qemu-tool-time.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this module implements the qemu-tool functions that
+ *  are related to time. In simulation mode (see block/sim.c), these
+ *  functions are instead implemented in qemu-test.c because they must
+ *  work with the simulation engine in block/sim.c.
+ *============================================================================*/
+
+#include "qemu-timer.h"
+#include "sysemu.h"
+
+struct QEMUBH {
+    QEMUBHFunc *cb;
+    void *opaque;
+};
+
+#if 1
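+/* A minimal stand-in for the tool builds: it derives the current time from
+ * gettimeofday() and reduces it to milliseconds, which is assumed here to be
+ * the resolution that callers of qemu_get_clock(rt_clock) expect. */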
+int64_t qemu_get_clock (QEMUClock * clock)
+{
+    qemu_timeval tv;
+    qemu_gettimeofday (&tv);
+    return (tv.tv_sec * 1000000000LL + (tv.tv_usec * 1000)) / 1000000;
+}
+#endif
+
+QEMUBH *qemu_bh_new (QEMUBHFunc * cb, void *opaque)
+{
+    QEMUBH *bh;
+
+    bh = qemu_malloc (sizeof (*bh));
+    bh->cb = cb;
+    bh->opaque = opaque;
+
+    return bh;
+}
+
+int qemu_bh_poll (void)
+{
+    return 0;
+}
+
+void qemu_bh_schedule (QEMUBH * bh)
+{
+    bh->cb (bh->opaque);
+}
+
+void qemu_bh_cancel (QEMUBH * bh)
+{
+}
+
+void qemu_bh_delete (QEMUBH * bh)
+{
+    qemu_free (bh);
+}
+
+void qemu_mod_timer (QEMUTimer * ts, int64_t expire_time)
+{
+    fprintf (stderr, "qemu_mod_timer() should not be invoked in qemu-tool\n");
+    exit (1);
+}
+
+QEMUTimer *qemu_new_timer (QEMUClock * clock, QEMUTimerCB * cb, void *opaque)
+{
+    fprintf (stderr, "qemu_new_timer() should not be invoked in qemu-tool\n");
+    exit (1);
+    return NULL;
+}
+
+void qemu_free_timer (QEMUTimer * ts)
+{
+    fprintf (stderr, "qemu_free_timer() should not be invoked in qemu-tool\n");
+    exit (1);
+}
+
+void qemu_del_timer (QEMUTimer * ts)
+{
+    fprintf (stderr, "qemu_del_timer() should not be invoked in qemu-tool\n");
+    exit (1);
+}
diff --git a/test-fvd.sh b/test-fvd.sh
new file mode 100755
index 0000000..adf4e1f
--- /dev/null
+++ b/test-fvd.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+
+if [ "$USER" != "root" ]; then
+    echo "This command must be run by root in order to mount tmpfs."
+    exit 1
+fi
+
+QEMU_DIR=.
+QEMU_IMG=$QEMU_DIR/qemu-img
+QEMU_TEST=$QEMU_DIR/qemu-test
+
+if [ ! -e $QEMU_IMG ]; then
+    echo "$QEMU_IMG does not exist."
+    exit 1;
+fi
+
+if [ ! -e $QEMU_TEST ]; then
+    echo "$QEMU_TEST does not exist."
+    exit 1;
+fi
+
+DATA_DIR=/var/ramdisk
+TRUTH_IMG=$DATA_DIR/truth.raw
+TEST_IMG=$DATA_DIR/test.fvd
+TEST_BASE=$DATA_DIR/zero-500M.raw
+TEST_IMG_DATA=$DATA_DIR/test.dat
+CMD_LOG=/tmp/test-fvd.log
+
+mount | grep $DATA_DIR > /dev/null
+if [ $? -ne 0 ]; then
+    echo "Create tmpfs at $DATA_DIR to store testing images."
+    if [ ! -e $DATA_DIR ]; then mkdir -p $DATA_DIR ; fi
+    mount -t tmpfs none $DATA_DIR -o size=4G
+    if [ $? -ne 0 ]; then exit 1; fi
+fi
+
+G1=1073741824
+MAX_MEM=536870912
+MAX_ROUND=1000000
+MAX_IO_SIZE=100000000
+fail_prob=0.1
+cancel_prob=0.1
+flush_prob=0.01
+seed=$RANDOM$RANDOM
+count=0
+
+function invoke() {
+    echo "$*" >> $CMD_LOG
+    sync
+    $*
+    ret=$?
+    if [ $ret -ne 0 ]; then
+        echo "$Exit with error code $ret: $*"
+        exit $ret;
+    fi
+}
+
+/bin/rm -f $CMD_LOG
+touch $CMD_LOG
+
+while true; do
+    for compact_image in on off ; do
+    for prefetch_delay in 1 0; do
+    for copy_on_read in on off; do
+    for block_size in 7680 512 1024 15872 65536 65024 1048576 1048064; do
+    for chunk_mult in 5 1 2 3 7 9 12 16 33 99 ; do
+    for base_img in ""  "-b $TEST_BASE"; do
+        chunk_size=$[$block_size * $chunk_mult]
+        large_io_size=$[$chunk_size * 5]
+        if [ $large_io_size -gt $MAX_IO_SIZE ]; then large_io_size=$MAX_IO_SIZE; fi
+    for io_size in $large_io_size 1048576 ; do
+    for use_data_file in "" "data_file=$TEST_IMG_DATA," ; do
+
+        # FVD image is about 1G
+        img_size=$[(1073741824 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+
+        # base image is about 500MB
+        base_size=$[(536870912 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+
+        count=$[$count + 1]
+        echo "Round $count" >> $CMD_LOG
+
+        invoke "/bin/rm -rf $TRUTH_IMG $TEST_IMG $TEST_BASE $TEST_IMG_DATA"
+        invoke "dd if=/dev/zero of=$TRUTH_IMG count=0 bs=1 seek=$img_size"
+        invoke "dd if=/dev/zero of=$TEST_BASE count=0 bs=1 seek=$base_size"
+        if [ -n "$use_data_file" ]; then invoke "touch $TEST_IMG_DATA"; fi
+
+        mixed_records_per_journal_sector=121
+        journal_size=$[(((($io_size / $chunk_size ) + 1 ) / $mixed_records_per_journal_sector ) + 1) * 512 * 100]
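+        # Worked example of the formula above (values are only illustrative):
+        # with io_size=5242880 and chunk_size=1048576, one request can touch
+        # at most 5+1=6 chunk-table entries, so the formula yields
+        # ((6 / 121) + 1) * 512 * 100 = 51200 bytes, i.e. 100 journal sectors
+        # of headroom for the table updates of a single request.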
+
+        invoke "$QEMU_IMG create -f fvd $base_img -o${use_data_file}data_file_fmt=blksim,compact_image=$compact_image,copy_on_read=$copy_on_read,block_size=$block_size,chunk_size=$chunk_size,journal_size=$journal_size,prefetch_start_delay=$prefetch_delay $TEST_IMG $img_size"
+        if [ $prefetch_delay -eq 1 ]; then $QEMU_IMG update $TEST_IMG prefetch_over_threshold_throttle_time=0; fi
+
+        # Use no more than about 1 GB of memory.
+        mem=$[$io_size * 1000]
+        if [ $mem -gt $MAX_MEM ]; then
+            parallel=$[$MAX_MEM / $io_size]
+        else
+            parallel=1000
+        fi
+        parallel=$[${RANDOM}${RANDOM} % $parallel]
+
+        round=$[$G1 * 10 / $io_size]
+        if [ $round -gt $MAX_ROUND ]; then round=$MAX_ROUND; fi
+
+        b3=$[$round * 2 / 3]
+        [ $b3 -eq 0 ] && b3=1
+        for rep in 0 1 2 ; do
+            if [ $rep -eq 0 ]; then
+                compare_before=false
+            else
+                compare_before=true
+            fi
+            r=$[${RANDOM}${RANDOM} % $b3]
+            seed=$[$seed + 1]
+            invoke "$QEMU_TEST --truth=$TRUTH_IMG --format=fvd --test="blksim:$TEST_IMG" --verify_write=true --parallel=$parallel --io_size=$io_size --fail_prob=$fail_prob --cancel_prob=$cancel_prob --flush_prob=$flush_prob --compare_after=true --round=$r --compare_before=$compare_before --seed=$seed"
+        done
+
+        /bin/rm -rf /tmp/fvd.log*
+done; done; done; done; done; done; done; done; done
diff --git a/test-qcow2.sh b/test-qcow2.sh
new file mode 100755
index 0000000..1b6a39b
--- /dev/null
+++ b/test-qcow2.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+if [ "$USER" != "root" ]; then
+    echo "This command must be run by root in order to mount tmpfs."
+    exit 1
+fi
+
+QEMU_DIR=.
+QEMU_IMG=$QEMU_DIR/qemu-img
+QEMU_TEST=$QEMU_DIR/qemu-test
+
+if [ ! -e $QEMU_IMG ]; then
+    echo "$QEMU_IMG does not exist."
+    exit 1;
+fi
+
+if [ ! -e $QEMU_TEST ]; then
+    echo "$QEMU_TEST does not exist."
+    exit 1;
+fi
+
+DATA_DIR=/var/ramdisk
+TRUTH_IMG=$DATA_DIR/truth.raw
+TEST_IMG=$DATA_DIR/test.qcow2
+TEST_BASE=$DATA_DIR/zero-500M.raw
+CMD_LOG=/tmp/test-qcow2.log
+
+mount | grep $DATA_DIR > /dev/null
+if [ $? -ne 0 ]; then
+    echo "Create tmpfs at $DATA_DIR to store testing images."
+    if [ ! -e $DATA_DIR ]; then mkdir -p $DATA_DIR ; fi
+    mount -t tmpfs none $DATA_DIR -o size=4G
+    if [ $? -ne 0 ]; then exit 1; fi
+fi
+
+parallel=100
+round=100000
+fail_prob=0
+cancel_prob=0
+instant_qemubh=true
+seed=$RANDOM$RANDOM
+count=0
+
+function invoke() {
+    echo "$*" >> $CMD_LOG
+    $*
+    ret=$?
+    if [ $ret -ne 0 ]; then
+        echo "Exit with error code $ret: $*"
+    fi
+}
+
+/bin/rm -f $CMD_LOG
+touch $CMD_LOG
+
+while true; do
+for cluster_size in 65536 7680 512 1024 15872 65024 1048576 1048064; do
+for io_size in 10485760 ; do
+    count=$[$count + 1]
+    echo "Round $count" >> $CMD_LOG
+
+    # QCOW2 image is about 1G
+    img_size=$[(1073741824 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+
+    # base image is about 500MB
+    base_size=$[(536870912 + ($RANDOM$RANDOM$RANDOM % 104857600)) / 512 * 512]
+
+    invoke "/bin/rm -rf $TRUTH_IMG $TEST_IMG $TEST_BASE"
+    invoke "dd if=/dev/zero of=$TRUTH_IMG count=0 bs=1 seek=$img_size"
+    invoke "dd if=/dev/zero of=$TEST_BASE count=0 bs=1 seek=$base_size"
+    invoke "$QEMU_IMG create -f qcow2 -ocluster_size=$cluster_size -b $TEST_BASE $TEST_IMG $img_size"
+
+    invoke "$QEMU_TEST --seed=$seed --truth=$TRUTH_IMG --format=qcow2 --test="blksim:$TEST_IMG" --verify_write=true --compare_before=false --compare_after=true --round=$round --parallel=$parallel --io_size=$io_size --fail_prob=$fail_prob --cancel_prob=$cancel_prob --instant_qemubh=$instant_qemubh"
+
+    seed=$[$seed + 1]
+done; done; done
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Qemu-devel] [PATCH 3/5] Fast Virtual Disk (FVD) Proposal Part 3
  2011-01-19 22:04 [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1 Chunqiang Tang
  2011-01-19 22:04 ` [Qemu-devel] [PATCH 2/5] Fast Virtual Disk (FVD) Proposal Part 2 Chunqiang Tang
@ 2011-01-19 22:04 ` Chunqiang Tang
  2011-01-21 22:57   ` Anthony Liguori
  2011-01-19 22:04 ` [Qemu-devel] [PATCH 4/5] Fast Virtual Disk (FVD) Proposal Part 4 Chunqiang Tang
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 18+ messages in thread
From: Chunqiang Tang @ 2011-01-19 22:04 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chunqiang Tang

Part 3 of the block device driver for the proposed FVD image format.
Multiple patches are used in order to manage the size of each patch.
This patch includes some new files for FVD.

See the related discussions at
http://lists.gnu.org/archive/html/qemu-devel/2011-01/msg00426.html .

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 block/fvd-create.c |  475 +++++++++++++++++++++++++++++++++++++++++++++++++++
 block/fvd-debug.c  |  406 ++++++++++++++++++++++++++++++++++++++++++++
 block/fvd-ext.h    |   71 ++++++++
 block/fvd.c        |  127 ++++++++++++++
 block/fvd.h        |  481 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 1560 insertions(+), 0 deletions(-)
 create mode 100644 block/fvd-create.c
 create mode 100644 block/fvd-debug.c
 create mode 100644 block/fvd-ext.h
 create mode 100644 block/fvd.c
 create mode 100644 block/fvd.h

diff --git a/block/fvd-create.c b/block/fvd-create.c
new file mode 100644
index 0000000..b978ecb
--- /dev/null
+++ b/block/fvd-create.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this module implements bdrv_create() for FVD.
+ *============================================================================*/
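+/*
+ * A typical invocation (a sketch; option values here are only examples):
+ *
+ *   qemu-img create -f fvd -b base.raw \
+ *       -o copy_on_read=on,block_size=65536,journal_size=16777216 img.fvd 10G
+ *
+ * which reaches fvd_create() below through bdrv_create().
+ */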
+
+static inline int64_t calc_min_journal_size (int64_t table_entries);
+static inline int search_holes(const char *filename, size_t bitmap_size,
+                    int32_t bitmap_start_offset, BlockDriverState * bs,
+                    int64_t nb_sectors, int32_t hole_size, int32_t block_size);
+
+static int fvd_create (const char *filename, QEMUOptionParameter * options)
+{
+    int fd, ret;
+    FvdHeader *header;
+    int64_t virtual_disk_size = DEF_PAGE_SIZE;
+    int32_t header_size;
+    const char *base_img = NULL;
+    const char *base_img_fmt = NULL;
+    const char *data_file = NULL;
+    const char *data_file_fmt = NULL;
+    int32_t hole_size = 0;
+    int copy_on_read = FALSE;
+    int prefetch_start_delay = -1;
+    int64_t prefetch_profile_size = 0;
+    BlockDriverState *bs = NULL;
+    int bitmap_size = 0;
+    int64_t base_img_size = 0;
+    int64_t table_size = 0;
+    int64_t journal_size = 0;
+    int32_t block_size = 0;
+
+    header_size = sizeof (FvdHeader);
+    header_size = ROUND_UP (header_size, DEF_PAGE_SIZE);
+    header = my_qemu_mallocz (header_size);
+
+    /* Read out options */
+    while (options && options->name) {
+        if (!strcmp (options->name, BLOCK_OPT_SIZE)) {
+            virtual_disk_size = options->value.n;
+        } else if (!strcmp (options->name,"prefetch_start_delay")) {
+            if (options->value.n <= 0) {
+                prefetch_start_delay = -1;
+            } else {
+                prefetch_start_delay = options->value.n;
+            }
+        } else if (!strcmp (options->name, BLOCK_OPT_BACKING_FILE)) {
+            base_img = options->value.s;
+        } else if (!strcmp (options->name, BLOCK_OPT_BACKING_FMT)) {
+            base_img_fmt = options->value.s;
+        } else if (!strcmp (options->name, "copy_on_read")) {
+            copy_on_read = options->value.n;
+        } else if (!strcmp (options->name, "data_file")) {
+            data_file = options->value.s;
+        } else if (!strcmp (options->name, "data_file_fmt")) {
+            data_file_fmt = options->value.s;
+        } else if (!strcmp (options->name, "detect_sparse_hole")) {
+            hole_size = options->value.n;
+        } else if (!strcmp (options->name, "compact_image")) {
+            header->compact_image = options->value.n;
+        } else if (!strcmp (options->name, "block_size")) {
+            block_size = options->value.n;
+        } else if (!strcmp (options->name, "chunk_size")) {
+            header->chunk_size = options->value.n;
+        } else if (!strcmp (options->name, "journal_size")) {
+            journal_size = options->value.n;
+        } else if (!strcmp (options->name, "storage_grow_unit")) {
+            header->storage_grow_unit = options->value.n;
+        } else if (!strcmp (options->name, "add_storage_cmd")
+                   && options->value.s) {
+            pstrcpy (header->add_storage_cmd, sizeof (header->add_storage_cmd),
+                     options->value.s);
+        }
+        options++;
+    }
+
+    virtual_disk_size = ROUND_UP (virtual_disk_size, 512);
+
+    /* Check if arguments are valid. */
+    if (base_img && strlen (base_img) > 1023) {
+        fprintf (stderr, "The base image name is longer than 1023 characters, "
+                 "which is not allowed.\n");
+        return -EINVAL;
+    }
+
+    if (base_img && hole_size > 0) {
+        if (header->compact_image) {
+            fprintf (stderr, "compact_image and detect_sparse_hole cannot be "
+                     "enabled together. Please disable detect_sparse_hole. \n");
+            return -EINVAL;
+        }
+        header->need_zero_init = TRUE;
+    } else {
+        header->need_zero_init = FALSE;
+    }
+
+    if (data_file) {
+        pstrcpy (header->data_file, 1024, data_file);
+        if (data_file_fmt) {
+            pstrcpy (header->data_file_fmt, 16, data_file_fmt);
+        }
+    }
+
+    header->magic = FVD_MAGIC;
+    header->version = FVD_VERSION;
+    header->virtual_disk_size = virtual_disk_size;
+    header->clean_shutdown = TRUE;
+
+    if (!base_img) {
+        header->all_data_in_fvd_img = TRUE;
+    } else {
+        int ret;
+
+        bs = bdrv_new ("");
+        if (!bs) {
+            fprintf (stderr, "Failed to create a new block driver\n");
+            return -1;
+        }
+
+        pstrcpy (header->base_img, 1024, base_img);
+        if (base_img_fmt) {
+            pstrcpy (header->base_img_fmt, 16, base_img_fmt);
+            BlockDriver *drv = bdrv_find_format (base_img_fmt);
+            if (!drv) {
+                fprintf (stderr, "Failed to find driver for format '%s'\n",
+                         base_img_fmt);
+                return -1;
+            }
+            ret = bdrv_open (bs, base_img, 0, drv);
+        } else {
+            ret = bdrv_open (bs, base_img, 0, NULL);
+        }
+
+        if (ret < 0) {
+            fprintf (stderr, "Failed to open the base image %s\n", base_img);
+            return -1;
+        }
+
+        base_img_size = bdrv_getlength (bs);
+        base_img_size = MIN (virtual_disk_size, base_img_size);
+        base_img_size = ROUND_UP (base_img_size, 512);
+
+        if (block_size <= 0) {
+            /* No block size is provided. Find the smallest block size that
+             * does not make the bitmap too big. */
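+            /* For example (a sketch of the loop below): a 64 GB base image
+             * with a 64 KB block size needs 1 M bits, i.e. a 128 KB bitmap;
+             * each doubling of block_size halves the bitmap until it is no
+             * larger than MODERATE_BITMAP_SIZE. */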
+            block_size = 512;
+            while (1) {
+                int64_t blocks = (base_img_size + block_size - 1) / block_size;
+                bitmap_size = (blocks + 7) / 8;
+                if (bitmap_size <= MODERATE_BITMAP_SIZE) {
+                    break;
+                }
+                block_size *= 2;
+            }
+        } else {
+            block_size = ROUND_UP (block_size, 512);
+            int64_t blocks = (base_img_size + block_size - 1) / block_size;
+            bitmap_size = (blocks + 7) / 8;
+        }
+
+        bitmap_size = ROUND_UP (bitmap_size, DEF_PAGE_SIZE);
+        header->bitmap_size = bitmap_size;
+        header->block_size = block_size;
+        header->bitmap_offset = header_size;
+
+        prefetch_profile_size = header->prefetch_profile_entries *
+                                    sizeof (PrefetchProfileEntry);
+        prefetch_profile_size = ROUND_UP (prefetch_profile_size, DEF_PAGE_SIZE);
+        header->base_img_size = base_img_size;
+        header->max_outstanding_copy_on_read_data =
+                                    MAX_OUTSTANDING_COPY_ON_READ_DATA;
+        header->copy_on_read = copy_on_read;
+        header->prefetch_start_delay = prefetch_start_delay;
+        header->num_prefetch_slots = NUM_PREFETCH_SLOTS;
+        header->bytes_per_prefetch = ROUND_UP (BYTES_PER_PREFETCH, block_size);
+        header->prefetch_throttle_time = PREFETCH_THROTTLING_TIME;
+        header->prefetch_read_throughput_measure_time =
+                                    PREFETCH_MIN_MEASURE_READ_TIME;
+        header->prefetch_write_throughput_measure_time =
+                                    PREFETCH_MIN_MEASURE_WRITE_TIME;
+        header->prefetch_perf_calc_alpha = PREFETCH_PERF_CALC_ALPHA;
+        header->prefetch_min_read_throughput = PREFETCH_MIN_READ_THROUGHPUT;
+        header->prefetch_min_write_throughput = PREFETCH_MIN_WRITE_THROUGHPUT;
+        header->prefetch_max_read_throughput = PREFETCH_MAX_READ_THROUGHPUT;
+        header->prefetch_max_write_throughput = PREFETCH_MAX_WRITE_THROUGHPUT;
+        header->all_data_in_fvd_img = FALSE;
+        header->unit_of_PrefetchProfileEntry_len = DEF_PAGE_SIZE;
+        header->generate_prefetch_profile = FALSE; /* To be implemented. */
+        header->profile_directed_prefetch_start_delay = -1;/*To be implemented*/
+    }
+
+    /* Set the table size. */
+    if (header->compact_image) {
+        if (header->chunk_size <= 0) {
+            header->chunk_size = CHUNK_SIZE;
+        }
+        header->chunk_size = ROUND_UP (header->chunk_size, DEF_PAGE_SIZE);
+        if (header->storage_grow_unit <= 0) {
+            header->storage_grow_unit = STORAGE_GROW_UNIT;
+        }
+        if (header->storage_grow_unit < header->chunk_size) {
+            header->storage_grow_unit = header->chunk_size;
+        }
+        int64_t table_entries =
+            (virtual_disk_size + header->chunk_size - 1) / header->chunk_size;
+        table_size = sizeof (uint32_t) * table_entries;
+        table_size = ROUND_UP (table_size, DEF_PAGE_SIZE);
+        header->table_offset = header_size + bitmap_size;
+    }
+
+    /* Set the journal size. */
+    if (bitmap_size <= 0 && table_size <= 0) {
+        header->journal_size = 0;        /* No need to use journal. */
+    } else if (journal_size < 0) {
+        /* Disable the use of journal, which reduces overhead but may cause
+         * data corruption if the host crashes. This is a valid configuration
+         * for some use cases, where data integrity is not critical.  */
+        header->journal_size = 0;
+    } else {
+        if (journal_size == 0) {
+            /* No journal size is specified. Use a default size. */
+            journal_size = JOURNAL_SIZE;
+        }
+        if (table_size > 0) {
+            /* Make sure that the journal is at least large enough to record
+             * all table changes in one shot, which is the extremely unlikely
+             * worst case. */
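+            /* A rough sketch of the bound (assuming calc_min_journal_size()
+             * packs several fixed-size table-update records into each
+             * 512-byte journal sector): min_journal_size is on the order of
+             * ceil(table_entries / records_per_sector) * 512 bytes. */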
+            int64_t vsize = virtual_disk_size + header->chunk_size - 1;
+            int64_t table_entries = vsize / header->chunk_size;
+            int64_t min_journal_size = calc_min_journal_size (table_entries);
+            if (journal_size < min_journal_size) {
+                journal_size = min_journal_size;
+            }
+        }
+        journal_size = ROUND_UP (journal_size, DEF_PAGE_SIZE);
+        header->journal_size = journal_size;
+        header->journal_offset = header_size + bitmap_size + table_size;
+    }
+
+    const int64_t metadata_size = header_size + bitmap_size + table_size +
+                                prefetch_profile_size + MAX (0, journal_size);
+    header->metadata_size = metadata_size;
+
+    fd = open (filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
+    if (fd < 0) {
+        fprintf (stderr, "Failed to open %s\n", filename);
+        goto fail;
+    }
+    fvd_header_cpu_to_le (header);
+
+    if (qemu_write_full (fd, header, header_size) != header_size) {
+        fprintf (stderr, "Failed to write the header of %s\n", filename);
+        goto fail;
+    }
+
+    /* Initialize the bitmap. */
+    if (bitmap_size > 0) {
+        uint8_t *bitmap = my_qemu_mallocz (bitmap_size);
+        ret = qemu_write_full (fd, bitmap, bitmap_size);
+        my_qemu_free (bitmap);
+        if (ret != bitmap_size) {
+            fprintf (stderr, "Failed to zero out the bitmap of %s\n", filename);
+            goto fail;
+        }
+    }
+
+    /* Initialize the table. */
+    if (table_size > 0) {
+        /* Set all entries to EMPTY_TABLE (0xFFFFFFFF). */
+        uint8_t *empty_table = my_qemu_malloc (table_size);
+        memset (empty_table, 0xFF, table_size);
+        ret = qemu_write_full (fd, empty_table, table_size);
+        my_qemu_free (empty_table);
+        if (ret != table_size) {
+            fprintf (stderr, "Failed to write the table of %s\n.", filename);
+            goto fail;
+        }
+    }
+
+    /* Initialize the journal. */
+    if (journal_size > 0) {
+        uint8_t *empty_journal = my_qemu_mallocz (journal_size);
+        ret = qemu_write_full (fd, empty_journal, journal_size);
+        my_qemu_free (empty_journal);
+        if (ret != journal_size) {
+            fprintf (stderr, "Failed to initialize the journal for %s\n.",
+                     filename);
+            goto fail;
+        }
+    }
+
+    close (fd);
+    ret = 0;
+
+    if (bs && hole_size > 0) {
+        ret = search_holes (filename, (size_t) bitmap_size, header_size, bs,
+                            base_img_size / 512, hole_size, block_size);
+    }
+
+    if (bs) {
+        bdrv_close (bs);
+    }
+    my_qemu_free (header);
+    return ret;
+
+  fail:
+    if (bs) {
+        bdrv_close (bs);
+    }
+    if (fd >= 0) {
+        close (fd);
+    }
+    my_qemu_free (header);
+    return -1;
+}
+
+/* For the optimization called "free write to zero-filled blocks". See Section
+ * 3.3.3 of the FVD-cow paper. */
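+/* In short: any aligned region of the base image that contains only zero
+ * bytes gets its block bit(s) pre-set in the bitmap at image-creation time,
+ * so the base image is never consulted for those blocks again; this is why
+ * need_zero_init is set for images created with detect_sparse_hole (see
+ * fvd_create() above). */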
+static inline int search_holes (const char *filename, size_t bitmap_size,
+                                int32_t bitmap_start_offset,
+                                BlockDriverState * bs, int64_t nb_sectors,
+                                int32_t hole_size, int32_t block_size)
+{
+    const int fd = open (filename, O_RDWR | O_BINARY | O_LARGEFILE, 0);
+    if (fd < 0) {
+        fprintf (stderr, "Failed to open %s for read and write.\n", filename);
+        return -1;
+    }
+
+    printf ("Searching zero-filled sectors in the base image. Please wait...");
+    fflush (stdout);
+
+    uint8_t *bitmap =
+        (uint8_t *) mmap (NULL, bitmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
+                          fd, (off_t) bitmap_start_offset);
+    if (bitmap == MAP_FAILED) {
+        fprintf (stderr, "Failed to mmap() %s\n", filename);
+        close (fd);
+        return -1;
+    }
+
+    if (hole_size < block_size) {
+        hole_size = block_size;
+    }
+    hole_size = ROUND_UP (hole_size, block_size);
+    nb_sectors = ROUND_DOWN (nb_sectors, hole_size);
+    const int sectors_per_hole = hole_size / 512;
+    const int sectors_per_block = block_size / 512;
+    int num_int64_in_hole = hole_size / 8;
+    int64_t hole_count = 0;
+    int i, ret = 0;
+    int64_t sec = 0;
+    uint8_t *p = my_qemu_blockalign (bs, hole_size);
+
+    while (sec < nb_sectors) {
+        int64_t *q;
+
+        if (bdrv_read (bs, sec, p, sectors_per_hole) < 0) {
+            fprintf (stderr, "Error in reading the base image\n");
+            ret = -1;
+            goto done;
+        }
+
+        /* All zeros? */
+        q = (int64_t *) p;
+        for (i = 0; i < num_int64_in_hole; i++) {
+            if (*q != 0) {
+                break;
+            }
+            q++;
+        }
+
+        if (i < num_int64_in_hole) {
+            /* This is not a hole. */
+            sec += sectors_per_hole;
+        } else {
+             /* These sectors consist of only zeros.  Set the flag to
+              * indicate that there is no need to read this sector from the
+              * base image.  See Section 3.3.3 of the FVD-cow paper for the
+              * rationale. */
+            hole_count++;
+            int64_t end = sec + sectors_per_hole;
+            while (sec < end) {
+                int block_num = sec / sectors_per_block;
+                int64_t bitmap_byte_offset = block_num / 8;
+                uint8_t bitmap_bit_offset = block_num % 8;
+                uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
+                if (!(bitmap[bitmap_byte_offset] & mask)) {
+                    bitmap[bitmap_byte_offset] |= mask;
+                }
+                sec += sectors_per_block;
+            }
+        }
+    }
+
+  done:
+    printf ("\nFound %" PRId64
+            " zero-filled hole regions. Image creation done.\n", hole_count);
+    my_qemu_vfree (p);
+    munmap (bitmap, bitmap_size);
+    close (fd);
+    return ret;
+}
+
+static QEMUOptionParameter fvd_create_options[] = {
+    {
+     .name = BLOCK_OPT_SIZE,
+     .type = OPT_SIZE,
+     .help = "Virtual disk size"},
+    {
+     .name = "compact_image",
+     .type = OPT_FLAG,
+     .help = "compact_image=on|off"},
+    {
+     .name = "block_size",
+     .type = OPT_SIZE,
+     .help = "Block size"},
+    {
+     .name = "chunk_size",
+     .type = OPT_SIZE,
+     .help = "Chunk size"},
+    {
+     .name = "storage_grow_unit",
+     .type = OPT_SIZE,
+     .help = "Storage grow unit"},
+    {
+     .name = "add_storage_cmd",
+     .type = OPT_STRING,
+     .help = "Command to add storage when FSI runs out of space"},
+    {
+     .name = BLOCK_OPT_BACKING_FILE,
+     .type = OPT_STRING,
+     .help = "File name of a backing image"},
+    {
+     .name = BLOCK_OPT_BACKING_FMT,
+     .type = OPT_STRING,
+     .help = "Image format of the backing image"},
+    {
+     .name = "data_file",
+     .type = OPT_STRING,
+     .help = "File name of a separate data file"},
+    {
+     .name = "data_file_fmt",
+     .type = OPT_STRING,
+     .help = "Image format of the separate data file"},
+    {
+     .name = "copy_on_read",
+     .type = OPT_FLAG,
+     .help = "copy_on_read=on|off"},
+    {
+     .name = "prefetch_start_delay",
+     .type = OPT_NUMBER,
+     .help = "Delay in seconds before starting whole image prefetching. "
+         "Prefetching is disabled if the delay is not a positive number."},
+    {
+     .name = "detect_sparse_hole",
+     .type = OPT_SIZE,
+     .help = "Minimum size (in bytes) of a continuous zero-filled region to be "
+         "considered as a sparse file hole in the backing image (setting it "
+         "to 0 turns off sparse file detection)"},
+    {
+     .name = "journal_size",
+     .type = OPT_SIZE,
+     .help = "Journal size"},
+    {NULL}
+};
diff --git a/block/fvd-debug.c b/block/fvd-debug.c
new file mode 100644
index 0000000..4cef5ec
--- /dev/null
+++ b/block/fvd-debug.c
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this module implements debugging functions for
+ *  the Fast Virtual Disk (FVD) format.
+ *============================================================================*/
+
+#ifndef ENABLE_TRACE_IO
+# define TRACE_REQUEST(...) do {} while (0)
+# define TRACE_STORE_IN_FVD(...) do {} while (0)
+
+#else
+/* Monitor IO on a specific sector that triggers bugs. */
+static inline void debug_sector (int64_t sector_num)
+{
+    if (FALSE) {
+        if (sector_num == ((int64_t) 1023990LL)) {
+            QPAUSE ("right sector");
+        }
+    }
+}
+
+static void TRACE_REQUEST (int do_write, int64_t sector_num, int nb_sectors)
+{
+    if (do_write) {
+        QDEBUG ("TRACE_REQUEST: write sector_num=%" PRId64
+                " nb_sectors=%d\n    [ ", sector_num, nb_sectors);
+    } else {
+        QDEBUG ("TRACE_REQUEST: read  sector_num=%" PRId64 " nb_sectors=%d\n"
+                "[ ", sector_num, nb_sectors);
+    }
+
+    int64_t end = sector_num + nb_sectors;
+    int64_t sec;
+    for (sec = sector_num; sec < end; sec++) {
+        QDEBUG ("sec%" PRId64 " ", sec);
+        debug_sector (sec);
+    }
+    QDEBUG (" ]\n");
+}
+
+static void TRACE_STORE_IN_FVD (const char *str, int64_t sector_num,
+                                int nb_sectors)
+{
+    QDEBUG ("TRACE_STORE: %s sector_num=%" PRId64 " nb_sectors=%d\n    [ ",
+            str, sector_num, nb_sectors);
+    int64_t end = sector_num + nb_sectors;
+    int64_t sec;
+    for (sec = sector_num; sec < end; sec++) {
+        QDEBUG ("sec%" PRId64 " ", sec);
+        debug_sector (sec);
+    }
+    QDEBUG (" ]\n");
+}
+#endif
+
+#ifndef FVD_DEBUG
+# define my_qemu_malloc qemu_malloc
+# define my_qemu_mallocz qemu_mallocz
+# define my_qemu_blockalign qemu_blockalign
+# define my_qemu_free qemu_free
+# define my_qemu_vfree qemu_vfree
+# define my_qemu_aio_get qemu_aio_get
+# define my_qemu_aio_release qemu_aio_release
+# define COPY_UUID(to,from) do {} while (0)
+
+#else
+FILE *__fvd_debug_fp;
+static unsigned long long int fvd_uuid = 1;
+static int64_t pending_qemu_malloc = 0;
+static int64_t pending_qemu_aio_get = 0;
+static int64_t pending_local_writes = 0;
+static const char *alloc_file;
+static int alloc_line;
+
+#define my_qemu_malloc(size) \
+    ((void*)(alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_malloc(size)))
+
+#define my_qemu_mallocz(size) \
+    ((void*)(alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_mallocz(size)))
+
+#define my_qemu_blockalign(bs,size) \
+    ((void*)(alloc_file=__FILE__, \
+             alloc_line=__LINE__, \
+             _my_qemu_blockalign(bs,size)))
+
+#define my_qemu_aio_get(pool,bs,cb,op) \
+    ((void*)(alloc_file=__FILE__, \
+             alloc_line=__LINE__, \
+             _my_qemu_aio_get(pool,bs,cb,op)))
+
+#define my_qemu_free(p) \
+    (alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_free(p))
+
+#define my_qemu_vfree(p) \
+    (alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_vfree(p))
+
+static void COPY_UUID (FvdAIOCB * to, FvdAIOCB * from)
+{
+    if (from) {
+        to->uuid = from->uuid;
+        FVD_DEBUG_ACB (to);
+    }
+}
+
+#ifdef DEBUG_MEMORY_LEAK
+# define MAX_TRACER 10485760
+static int alloc_tracer_used = 1;        /* slot 0 is not used. */
+static void **alloc_tracers = NULL;
+
+static void __attribute__ ((constructor)) init_mem_alloc_tracers (void)
+{
+    if (!alloc_tracers) {
+        alloc_tracers = qemu_mallocz (sizeof (void *) * MAX_TRACER);
+    }
+}
+
+static void trace_alloc (void *p, size_t size)
+{
+    alloc_tracer_t *t = p;
+    t->magic = FVD_ALLOC_MAGIC;
+    t->alloc_file = alloc_file;
+    t->alloc_line = alloc_line;
+    t->size = size;
+
+    if (alloc_tracer_used < MAX_TRACER) {
+        t->alloc_tracer = alloc_tracer_used++;
+        alloc_tracers[t->alloc_tracer] = t;
+        QDEBUG ("Allocate memory using tracer%d in %s on line %d.\n",
+                t->alloc_tracer, alloc_file, alloc_line);
+    } else {
+        t->alloc_tracer = 0;
+    }
+
+    /* Set header and footer to detect out-of-range writes. */
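+    /* Buffer layout used by the _my_qemu_*alloc() wrappers below:
+     *   [ 512-byte guard header | caller's buffer | 512-byte guard footer ]
+     * with FVD_ALLOC_MAGIC stored in the last 8 bytes of the header region
+     * and the first 8 bytes of the footer region. */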
+    if (size != (size_t) - 1) {
+        uint8_t *q = (uint8_t *) p;
+        uint64_t *header = (uint64_t *) (q + 512 - sizeof (uint64_t));
+        uint64_t *footer = (uint64_t *) (q + size - 512);
+        *header = FVD_ALLOC_MAGIC;
+        *footer = FVD_ALLOC_MAGIC;
+    }
+}
+
+static void trace_free (void *p)
+{
+    alloc_tracer_t *t = p;
+
+    QDEBUG ("Free memory with tracer%d in %s on line %d.\n",
+            t->alloc_tracer, alloc_file, alloc_line);
+    ASSERT (t->magic == FVD_ALLOC_MAGIC && t->alloc_tracer >= 0);
+
+    /* Check header and footer to detect out-of-range writes. */
+    if (t->size != (size_t) - 1) {
+        uint8_t *q = (uint8_t *) p;
+        uint64_t *header = (uint64_t *) (q + 512 - sizeof (uint64_t));
+        uint64_t *footer = (uint64_t *) (q + t->size - 512);
+        ASSERT (*header == FVD_ALLOC_MAGIC);
+        ASSERT (*footer == FVD_ALLOC_MAGIC);
+    }
+
+    if (t->alloc_tracer) {
+        ASSERT (alloc_tracers[t->alloc_tracer] == t);
+        alloc_tracers[t->alloc_tracer] = NULL;
+        t->alloc_tracer = -INT_MAX;
+    } else {
+        t->alloc_tracer *= -1;        /* Guard against double free. */
+    }
+}
+
+static void dump_alloc_tracers (void)
+{
+    int unfreed = 0;
+    int i;
+    for (i = 1; i < alloc_tracer_used; i++) {
+        if (!alloc_tracers[i]) {
+            continue;
+        }
+
+        unfreed++;
+        alloc_tracer_t *t = alloc_tracers[i];
+
+        if (t->size == (size_t) - 1) {
+            FvdAIOCB *acb = container_of (alloc_tracers[i], FvdAIOCB, tracer);
+            ASSERT (acb->magic == FVDAIOCB_MAGIC);
+            QDEBUG ("Memory %p with tracer%d allocated in %s on line %d "
+                    "(FvdAIOCB acb%llu-%p) is not freed. magic %s\n",
+                    alloc_tracers[i], i, t->alloc_file, t->alloc_line,
+                    acb->uuid, acb,
+                    t->magic == FVD_ALLOC_MAGIC ? "correct" : "wrong");
+        } else {
+            QDEBUG ("Memory %p with tracer%d allocated in %s on line %d is "
+                    "not freed. magic %s\n",
+                    alloc_tracers[i], i, t->alloc_file, t->alloc_line,
+                    t->magic == FVD_ALLOC_MAGIC ? "correct" : "wrong");
+
+            uint8_t *q = (uint8_t *) t;
+            uint64_t *header = (uint64_t *) (q + 512 - sizeof (uint64_t));
+            uint64_t *footer = (uint64_t *) (q + t->size - 512);
+            ASSERT (*header == FVD_ALLOC_MAGIC);
+            ASSERT (*footer == FVD_ALLOC_MAGIC);
+        }
+    }
+
+    QDEBUG ("Unfreed memory allocations: %d\n", unfreed);
+}
+#endif
+
+static inline void *_my_qemu_aio_get (AIOPool * pool, BlockDriverState * bs,
+                                      BlockDriverCompletionFunc * cb,
+                                      void *opaque)
+{
+    pending_qemu_aio_get++;
+    FvdAIOCB *acb = (FvdAIOCB *) qemu_aio_get (&fvd_aio_pool, bs, cb, opaque);
+    acb->uuid = ++fvd_uuid;
+    acb->magic = FVDAIOCB_MAGIC;
+
+    FVD_DEBUG_ACB (acb);
+
+#ifdef DEBUG_MEMORY_LEAK
+    trace_alloc (&acb->tracer, -1);
+#endif
+
+    return acb;
+}
+
+static inline void my_qemu_aio_release (void *p)
+{
+    pending_qemu_aio_get--;
+    ASSERT (pending_qemu_aio_get >= 0);
+
+#ifdef DEBUG_MEMORY_LEAK
+    FvdAIOCB *acb = p;
+    trace_free (&acb->tracer);
+#endif
+
+    qemu_aio_release (p);
+}
+
+static inline void *_my_qemu_malloc (size_t size)
+{
+    ASSERT (size > 0);
+    pending_qemu_malloc++;
+#ifndef DEBUG_MEMORY_LEAK
+    return qemu_malloc (size);
+#else
+
+    size += 1024;        /* 512 bytes header and 512 bytes footer. */
+    uint8_t *ret = qemu_malloc (size);
+    trace_alloc (ret, size);
+    return ret + 512;
+#endif
+}
+
+static inline void *_my_qemu_mallocz (size_t size)
+{
+    ASSERT (size > 0);
+    pending_qemu_malloc++;
+#ifndef DEBUG_MEMORY_LEAK
+    return qemu_mallocz (size);
+#else
+
+    size += 1024;        /* 512 bytes header and 512 bytes footer. */
+    uint8_t *ret = qemu_mallocz (size);
+    trace_alloc (ret, size);
+    return ret + 512;
+#endif
+}
+
+static inline void *_my_qemu_blockalign (BlockDriverState * bs, size_t size)
+{
+    ASSERT (size > 0);
+    pending_qemu_malloc++;
+
+#ifndef DEBUG_MEMORY_LEAK
+    return qemu_blockalign (bs, size);
+#else
+
+    size += 1024;        /* 512 bytes header and 512 bytes footer. */
+    uint8_t *ret = qemu_blockalign (bs, size);
+    trace_alloc (ret, size);
+    return ret + 512;
+#endif
+}
+
+static inline void _my_qemu_free (void *ptr)
+{
+    pending_qemu_malloc--;
+    ASSERT (pending_qemu_malloc >= 0);
+#ifndef DEBUG_MEMORY_LEAK
+    qemu_free (ptr);
+#else
+
+    uint8_t *q = ((uint8_t *) ptr) - 512;
+    trace_free (q);
+    qemu_free (q);
+#endif
+}
+
+static inline void _my_qemu_vfree (void *ptr)
+{
+    pending_qemu_malloc--;
+    ASSERT (pending_qemu_malloc >= 0);
+#ifndef DEBUG_MEMORY_LEAK
+    qemu_vfree (ptr);
+#else
+
+    uint8_t *q = ((uint8_t *) ptr) - 512;
+    trace_free (q);
+    qemu_vfree (q);
+#endif
+}
+
+static void count_pending_requests (BDRVFvdState * s)
+{
+    int m = 0, k = 0;
+    FvdAIOCB *w;
+
+    QLIST_FOREACH (w, &s->copy_locks, copy_lock.next) {
+        m++;
+        QDEBUG ("copy_lock: acb%llu-%p\n", w->uuid, w);
+    }
+
+    QLIST_FOREACH (w, &s->write_locks, write.next_write_lock) {
+        k++;
+        QDEBUG ("write_lock: acb%llu-%p\n", w->uuid, w);
+    }
+
+    QDEBUG ("Debug_memory_leak: copy_locks=%d  write_locks=%d\n", m, k);
+}
+
+static void dump_resource_summary (BDRVFvdState * s)
+{
+#ifdef DEBUG_MEMORY_LEAK
+    dump_alloc_tracers ();
+#endif
+
+    QDEBUG ("Resource summary: outstanding_copy_on_read_data=%" PRId64
+            " total_copy_on_read_data=%" PRId64 " total_prefetch_data=%" PRId64
+            " " " pending_qemu_malloc=%" PRId64 " pending_qemu_aio_get=%" PRId64
+            " pending_local_writes=%" PRId64 "\n",
+            s->outstanding_copy_on_read_data, s->total_copy_on_read_data,
+            s->total_prefetch_data, pending_qemu_malloc, pending_qemu_aio_get,
+            pending_local_writes);
+    count_pending_requests (s);
+}
+
+/* Monitor processing a specific FvdAIOCB that triggers bugs. */
+void FVD_DEBUG_ACB (void *p)
+{
+    if (FALSE) {
+        FvdAIOCB *acb = p;
+
+        /* Is it FvdAIOCB? */
+        if (acb->magic != FVDAIOCB_MAGIC || acb->common.bs->drv != &bdrv_fvd) {
+            /* Is it CompactChildCB? */
+            CompactChildCB *child = p;
+            acb = child->acb;
+            if (acb->magic != FVDAIOCB_MAGIC
+                || acb->common.bs->drv != &bdrv_fvd
+                || (acb->type != OP_LOAD_COMPACT
+                    && acb->type != OP_STORE_COMPACT)) {
+                return;
+            }
+        }
+
+        if (acb->uuid == 20ULL) {
+            QPAUSE ("Processing the right acb");
+        }
+    }
+}
+
+void init_fvd_debug_fp (void)
+{
+    char buf[256];
+    sprintf (buf, "/tmp/fvd.log-%d", getpid ());
+    if ((__fvd_debug_fp = fopen (buf, "wt")) == NULL) {
+        __fvd_debug_fp = stdout;
+    }
+}
+#endif
+
+void fvd_check_memory_usage (void)
+{
+    ASSERT (pending_qemu_malloc == 0);
+}
+
+int fvd_get_copy_on_read (BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+    return s->copy_on_read;
+}
+
+void fvd_set_copy_on_read (BlockDriverState * bs, int copy_on_read)
+{
+    BDRVFvdState *s = bs->opaque;
+    s->copy_on_read = copy_on_read;
+}
diff --git a/block/fvd-ext.h b/block/fvd-ext.h
new file mode 100644
index 0000000..6839e25
--- /dev/null
+++ b/block/fvd-ext.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this header file contains functions of the FVD block
+ *  device driver that are used by other external modules. These functions are
+ *  mainly for testing and debugging purposes.
+ *============================================================================*/
+
+#ifndef __fvd_debug_h__
+#define __fvd_debug_h__
+
+//#define FVD_DEBUG
+
+int fvd_get_copy_on_read (BlockDriverState *bs);
+void fvd_set_copy_on_read (BlockDriverState *bs, int copy_on_read);
+void fvd_check_memory_usage (void);
+void fvd_init_prefetch(void * bs);
+void fvd_enable_host_crash_test (void);
+
+#ifndef TRUE
+# define TRUE 1
+#endif
+#ifndef FALSE
+# define FALSE 0
+#endif
+
+#ifndef FVD_DEBUG
+# define QDEBUG(format,...) do {} while (0)
+# define ASSERT(x) do {} while (0)
+# define FVD_DEBUG_ACB(...) do {} while (0)
+# define QPAUSE(...) do {} while (0)
+
+#else
+
+extern FILE *__fvd_debug_fp;
+void init_fvd_debug_fp (void);
+void FVD_DEBUG_ACB (void *p);
+# define QDEBUG(format,...) \
+    do { \
+        if (__fvd_debug_fp==NULL) init_fvd_debug_fp(); \
+        fprintf (__fvd_debug_fp, format, ##__VA_ARGS__); \
+        fflush(__fvd_debug_fp); \
+    } while(0)
+
+# define ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf (stderr, "Assertion failed in process %d at %s:%d. " \
+                "Waiting for debugging...\n", getpid(),__FILE__, __LINE__); \
+            fgetc (stdin); exit (1);  \
+        } \
+    } while (0) \
+
+# define QPAUSE(format,...) \
+    do { \
+        printf (format, ##__VA_ARGS__); \
+        printf (" Pause process %d for debugging...\n", getpid()); \
+        fgetc (stdin); \
+    } while (0)
+
+#endif
+
+#endif
diff --git a/block/fvd.c b/block/fvd.c
new file mode 100644
index 0000000..311ff58
--- /dev/null
+++ b/block/fvd.c
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this module implements the QEMU block device driver
+ *  for the Fast Virtual Disk (FVD) format.  See the following companion
+ *  papers for a detailed description of FVD:
+ *  1. The so-called "FVD-cow paper":
+ *          "FVD: a High-Performance Virtual Machine Image Format for Cloud",
+ *      by Chunqiang Tang, 2010.
+ *  2. The so-called "FVD-compact paper":
+ *          "FVD: a High-Performance Virtual Machine Image Format for Cloud
+ *           with Sparse Image Capability", by Chunqiang Tang, 2010.
+ *============================================================================*/
+
+#include "block/fvd.h"
+
+//#define ENABLE_TRACE_IO
+//#define DEBUG_MEMORY_LEAK
+//#define SIMULATED_TEST_WITH_QEMU_IO
+
+#ifndef FVD_DEBUG
+#undef DEBUG_MEMORY_LEAK
+#undef ENABLE_TRACE_IO
+#undef SIMULATED_TEST_WITH_QEMU_IO
+#endif
+
+/* Use include to avoid exposing too many FVD symbols, and to allow inline
+ * function optimization. */
+#include "block/fvd-utils.c"
+#include "block/fvd-debug.c"
+#include "block/fvd-misc.c"
+#include "block/fvd-create.c"
+#include "block/fvd-open.c"
+#include "block/fvd-read.c"
+#include "block/fvd-write.c"
+#include "block/fvd-load.c"
+#include "block/fvd-store.c"
+#include "block/fvd-journal.c"
+#include "block/fvd-prefetch.c"
+
+static AIOPool fvd_aio_pool = {
+    .aiocb_size = sizeof (FvdAIOCB),
+    .cancel = fvd_aio_cancel,
+};
+
+static BlockDriver bdrv_fvd = {
+    .format_name = "fvd",
+    .instance_size = sizeof (BDRVFvdState),
+    .bdrv_create = fvd_create,
+    .bdrv_probe = fvd_probe,
+    .bdrv_file_open = fvd_open,
+    .bdrv_close = fvd_close,
+    .bdrv_is_allocated = fvd_is_allocated,
+    .bdrv_flush = fvd_flush,
+    .bdrv_aio_readv = fvd_aio_readv,
+    .bdrv_aio_writev = fvd_aio_writev,
+    .bdrv_aio_flush = fvd_aio_flush,
+    .create_options = fvd_create_options,
+    .bdrv_get_info = fvd_get_info,
+    .bdrv_update = fvd_update,
+    .bdrv_has_zero_init = fvd_has_zero_init
+};
+
+static void bdrv_fvd_init (void)
+{
+    bdrv_register (&bdrv_fvd);
+}
+
+block_init (bdrv_fvd_init);
+
+/*
+ * Since bdrv_close may not be properly invoked on a VM shutdown, we
+ * use a destructor to flush metadata to disk. This only affects
+ * performance and does not affect correctness.
+ * See Section 3.3.4 of the FVD-cow paper for the rationale.
+ */
+extern QTAILQ_HEAD (, BlockDriverState) bdrv_states;
+static void __attribute__ ((destructor)) flush_fvd_bitmap_to_disk (void)
+{
+    BlockDriverState *bs;
+    QTAILQ_FOREACH (bs, &bdrv_states, list) {
+        if (bs->drv == &bdrv_fvd) {
+            flush_metadata_to_disk_on_exit (bs);
+
+#ifdef FVD_DEBUG
+            dump_resource_summary (bs->opaque);
+#endif
+        }
+    }
+}
+
+/*
+ * TODOs: Below are some potential enhancements for future development:
+ * 1. Handle storage leak on failure.
+ *
+ * 2. Profile-directed prefetch. See Section 3.4.1 of the FVD-cow paper.
+ * Related metadata are FvdHeader.prefetch_profile_offset and
+ * FvdHeader.prefetch_profile_entries,
+ * FvdHeader.profile_directed_prefetch_start_delay,
+ * FvdHeader.generate_prefetch_profile.
+ *
+ * 3.  Cap the prefetch throughput at the upper limit. See Section 3.4.2 of
+ * the FVD-cow paper.  Related metadata are
+ * FvdHeader.prefetch_max_read_throughput and
+ * FvdHeader.prefetch_max_write_throughput.
+ *
+ * 4. Support write through to the base image. When a VM issues a write
+ * request, in addition to saving the data in the FVD data file, also save the
+ * data in the base image if the address of the write request is not beyond
+ * the size of the base image (this of course requires the base image NOT to
+ * be 'read_only'). This feature changes the semantics of copy-on-write, but it
+ * suits a different use case, where the base image is stored on a remote
+ * storage server, and the FVD image is stored on a local disk and acts as a
+ * write-through cache of the base image. This can be used to cache and
+ * improve the performance of persistent storage on network-attached storage,
+ * e.g., Amazon EBS.  This feature is not described in the FVD-cow paper as it
+ * would complicate the discussion.  Related metadata are
+ * FvdHeader.write_updates_base_img.
+ */
diff --git a/block/fvd.h b/block/fvd.h
new file mode 100644
index 0000000..cce8cc8
--- /dev/null
+++ b/block/fvd.h
@@ -0,0 +1,481 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this is the header of the FVD block device driver.
+ *============================================================================*/
+
+#include <sys/vfs.h>
+#include <sys/mman.h>
+#include <pthread.h>
+#include <execinfo.h>
+#include <stdlib.h>
+#include <sys/ioctl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <inttypes.h>
+#include "block_int.h"
+#include "osdep.h"
+#include "qemu-option.h"
+#include "qemu-timer.h"
+#include "block.h"
+#include "qemu-queue.h"
+#include "qemu-common.h"
+#include "block/blksim.h"
+#include "block/fvd-ext.h"
+
+#define FVD_MAGIC         (('Q' << 24) | ('C' << 16) | (0xF5 << 8) | 0xA9)
+#define FVD_VERSION         1
+
+/* Profile-directed prefetch. (to be implemented). */
+typedef struct __attribute__ ((__packed__)) PrefetchProfileEntry {
+    int64_t offset;        /* in bytes */
+
+    /* In the unit of FvdHeader.prefetch_profile_entry_len_unit, i.e.,
+     * len_in_bytes = len * FvdHeader.unit_of_PrefetchProfileEntry_len. */
+    uint32_t len;
+} PrefetchProfileEntry;
+
+/*
+ * The FVD format consists of:
+ *   + Header fields of FvdHeader.
+ *   + Bitmap, starting on a 4KB page boundary at a location specified by
+ *     FvdHeader.bitmap_offset.
+ *   + Table, starting on a 4KB page boundary at a location specified by
+ *     FvdHeader.table_offset.
+ *   + Journal, starting on a 4KB page boundary at a location specified by
+ *     FvdHeader.journal_offset.
+ *   + Prefetch profile entries, starting on a 4KB page boundary at a location
+ *     specified by FvdHeader.prefetch_profile_offset. (to be implemented)
+ *   + Virtual disk data, starting on a 4KB page boundary. Optionally, disk
+ *     data can be stored in a separate data file specified by
+ *     FvdHeader.data_file.
+ */
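+/*
+ * As laid out by fvd_create() in this patch, the metadata sections are simply
+ * stacked after the header (all sizes already rounded to DEF_PAGE_SIZE):
+ *     bitmap_offset  = header_size
+ *     table_offset   = header_size + bitmap_size
+ *     journal_offset = header_size + bitmap_size + table_size
+ */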
+typedef struct __attribute__ ((__packed__)) FvdHeader {
+    uint32_t magic;
+    uint32_t version;
+
+    /* This field is set to TRUE after whole-image prefetching finishes. */
+    int32_t all_data_in_fvd_img;
+
+    int64_t virtual_disk_size;        /* in bytes. Disk size perceived by the VM. */
+    int64_t metadata_size;        /* in bytes. */
+    char base_img[1024];
+    char base_img_fmt[16];
+    int64_t base_img_size;        /* in bytes. */
+    int64_t bitmap_offset;        /* in bytes. Aligned on DEF_PAGE_SIZE. */
+    int64_t bitmap_size;        /* in bytes. Rounded up to DEF_PAGE_SIZE */
+    int32_t block_size;                /* in bytes. */
+    int32_t copy_on_read;        /* TRUE or FALSE */
+    int64_t max_outstanding_copy_on_read_data;        /* in bytes. */
+
+    /* If (data_file[0]==0), the FVD metadata and data are stored in one file.*/
+    char data_file[1024];
+    char data_file_fmt[16];
+
+    /******** Begin: for prefetching. *******************************/
+    /* in seconds. -1 means disable whole image prefetching. */
+    int32_t prefetch_start_delay;
+
+    /* in bytes. Aligned on DEF_PAGE_SIZE. (to be implemented) */
+    int64_t prefetch_profile_offset;
+
+    /* Number of PrefetchProfileEntry. (to be implemented) */
+    int64_t prefetch_profile_entries;
+
+    int32_t num_prefetch_slots;        /* Max number of outstanding prefetch writes. */
+    int32_t bytes_per_prefetch;        /* For whole image prefetching. */
+    int32_t prefetch_read_throughput_measure_time;        /* in milliseconds. */
+    int32_t prefetch_write_throughput_measure_time;        /* in milliseconds. */
+
+    /* Controls the calculation of the moving average of throughput. Must be a
+     * value in [0, 100]:
+     *   actual_normalized_alpha = prefetch_perf_calc_alpha / 100.0 */
+    int32_t prefetch_perf_calc_alpha;
+
+    int32_t prefetch_min_read_throughput;        /* in KB/second. */
+    int32_t prefetch_min_write_throughput;        /* in KB/second. */
+    int32_t prefetch_max_read_throughput;        /* in KB/second. */
+    int32_t prefetch_max_write_throughput;        /* in KB/second. */
+
+    /* in milliseconds. When prefetch read/write throughput is low, prefetch
+     * pauses for a random time uniformly distributed in
+     * [0, prefetch_throttle_time]. */
+    int32_t prefetch_throttle_time;
+    /******** End: for prefetching. *******************************/
+
+    /******** Begin: for compact image. *****************************/
+    int32_t compact_image;        /* TRUE or FALSE */
+    int64_t table_offset;        /* in bytes. */
+    int64_t chunk_size;                /* in bytes. */
+    int64_t storage_grow_unit;        /* in bytes. */
+    char add_storage_cmd[2048];
+    /******** End: for compact image. *******************************/
+
+    /******** Begin: for journal. ***********************************/
+    int64_t journal_offset;        /* in bytes. */
+    int64_t journal_size;        /* in bytes. */
+    int32_t clean_shutdown;        /* TRUE if VM's last shutdown was graceful. */
+    /******** End: for journal. *************************************/
+
+    /*
+     * This field is TRUE if the image mandates that the storage layer
+     * (BDRVFvdState.fvd_data) must return TRUE for bdrv_has_zero_init().
+     * This is the case if the optimization described in Section 3.3.3 of the
+     * FVD-cow paper is enabled (see function search_holes()). If 'qemu-img
+     * create' sets need_zero_init to TRUE, 'qemu-img update' can be used to
+     * manually reset it to FALSE, if the user always manually pre-fills the
+     * storage (e.g., a raw partition) with zeros. If the image is stored on a
+     * file system, it already supports zero_init, and hence there is no need
+     * to manually manipulate this field.
+     */
+    int32_t need_zero_init;
+
+    /* If TRUE, FVD dumps a prefetch profile after the VM shuts down.
+     * (to be implemented) */
+    int32_t generate_prefetch_profile;
+
+    /* See the comment on PrefetchProfileEntry.len. (to be implemented) */
+    int32_t unit_of_PrefetchProfileEntry_len;
+
+    /* in seconds. -1 means disable profile-directed prefetching.
+     * (to be implemented) */
+    int32_t profile_directed_prefetch_start_delay;
+
+    /* Possible values are "no", "writethrough", "writeback", or
+     * "writenocache". (to be implemented) */
+    char write_updates_base_img[16];
+} FvdHeader;
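+
+/* An illustrative, non-normative sketch of how the regions listed above can be
+ * packed back to back with 4KB alignment; the authoritative values are
+ * whatever fvd_create() wrote into the header (table_size below is shorthand
+ * for the size of the table region of a compact image):
+ *
+ *   int64_t off = ROUND_UP (sizeof (FvdHeader), DEF_PAGE_SIZE);
+ *   bitmap_offset  = off;  off += ROUND_UP (bitmap_size, DEF_PAGE_SIZE);
+ *   table_offset   = off;  off += ROUND_UP (table_size, DEF_PAGE_SIZE);
+ *   journal_offset = off;  off += ROUND_UP (journal_size, DEF_PAGE_SIZE);
+ *   metadata_size  = off;  (virtual disk data may start here)
+ */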
+
+typedef struct BDRVFvdState {
+    BlockDriverState *fvd_metadata;
+    BlockDriverState *fvd_data;
+    int64_t virtual_disk_size;        /* in bytes. */
+    int64_t bitmap_offset;        /* in sectors */
+    int64_t bitmap_size;        /* in bytes. */
+    int64_t data_offset;        /* in sectors. Begin of real data. */
+    int64_t nb_sectors_in_base_img;
+    int32_t block_size;        /* in sectors. */
+    int copy_on_read;        /* TRUE or FALSE */
+    int64_t max_outstanding_copy_on_read_data;        /* in bytes. */
+    int64_t outstanding_copy_on_read_data;        /* in bytes. */
+    int data_region_prepared;        /* TRUE or FALSE */
+    QLIST_HEAD(WriteLocks, FvdAIOCB) write_locks; /* All writes. */
+    QLIST_HEAD(CopyLocks, FvdAIOCB) copy_locks; /* copy-on-read and CoW. */
+
+    /* Keep two copies of bitmap to reduce the overhead of updating the
+     * on-disk bitmap, i.e., copy-on-read and prefetching do not update the
+     * on-disk bitmap. See Section 3.3.4 of the FVD-cow paper. */
+    uint8_t *fresh_bitmap;
+    uint8_t *stale_bitmap;
+
+    /******** Begin: for prefetching. ***********************************/
+    struct FvdAIOCB **prefetch_acb;
+    int prefetch_state; /* PREFETCH_STATE_RUNNING, FINISHED, or DISABLED. */
+    int prefetch_error;        /* TRUE or FALSE */
+    int num_prefetch_slots;
+    int num_filled_prefetch_slots;
+    int next_prefetch_read_slot;
+    int prefetch_read_active;                        /* TRUE or FALSE */
+    int pause_prefetch_requested;                /* TRUE or FALSE */
+    int prefetch_start_delay;        /* in seconds  */
+    int64_t unclaimed_prefetch_region_start;
+    int64_t prefetch_read_time;                        /* in milliseconds. */
+    int64_t prefetch_write_time;                /* in milliseconds. */
+    int64_t prefetch_data_read;                        /* in bytes. */
+    int64_t prefetch_data_written;                /* in bytes. */
+    double prefetch_read_throughput;                /* in bytes/millisecond. */
+    double prefetch_write_throughput;                /* in bytes/millisecond. */
+    double prefetch_min_read_throughput;        /* in bytes/millisecond. */
+    double prefetch_min_write_throughput;        /* in bytes/millisecond. */
+    int64_t prefetch_read_throughput_measure_time;        /* in millisecond. */
+    int64_t prefetch_write_throughput_measure_time;        /* in millisecond.*/
+    int prefetch_throttle_time;        /* in millisecond. */
+    int sectors_per_prefetch;
+    QEMUTimer *prefetch_timer;
+    /* prefetch_perf_calc_alpha = FvdHeader.prefetch_perf_calc_alpha/100.0 */
+    double prefetch_perf_calc_alpha;
+    /******** End: for prefetching. ***********************************/
+
+    /******** Begin: for compact image. *************************************/
+    uint32_t *table;        /* Mapping table stored in memory in little endian. */
+    int64_t data_storage;        /* in sectors. */
+    int64_t used_storage;        /* in sectors. */
+    int64_t chunk_size;        /* in sectors. */
+    int64_t storage_grow_unit;        /* in sectors. */
+    int64_t table_offset;        /* in sectors. */
+    char *add_storage_cmd;
+    /******** End: for compact image. ***************************************/
+
+    /******** Begin: for journal. *******************************************/
+    int64_t journal_offset;        /* in sectors. */
+    int64_t journal_size;        /* in sectors. */
+    int64_t next_journal_sector;        /* in sectors. */
+    int ongoing_journal_updates;        /* Number of ongoing journal updates. */
+    int dirty_image;        /* TRUE or FALSE. */
+
+    /* Requests waiting for metadata flush and journal recycle to finish. */
+    QLIST_HEAD(JournalFlush, FvdAIOCB) wait_for_journal;
+    /******** End: for journal. ********************************************/
+
+#ifdef FVD_DEBUG
+    int64_t total_copy_on_read_data;                /* in bytes. */
+    int64_t total_prefetch_data;                /* in bytes. */
+#endif
+} BDRVFvdState;
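+
+/* A sketch (assumption, not code from this patch) of the bitmap addressing
+ * used by the two bitmaps above: one bit covers one block of block_size
+ * sectors, so a lookup for sector_num is roughly
+ *
+ *   int64_t block = sector_num / s->block_size;
+ *   int set = s->fresh_bitmap[block / 8] & (1 << (block % 8));
+ *
+ * The real helpers (update_both_bitmaps(), update_stale_bitmap(), ...) live in
+ * the other FVD modules of this series and define the exact bit order.
+ */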
+
+/* Begin of data type definitions. */
+struct FvdAIOCB;
+
+typedef struct JournalCB {
+    BlockDriverAIOCB *hd_acb;
+    QEMUIOVector qiov;
+    struct iovec iov;
+    QLIST_ENTRY(FvdAIOCB) next_wait_for_journal;
+} JournalCB;
+
+/* CopyLock is used by AIOWriteCB and AIOCopyCB. */
+typedef struct CopyLock {
+    QLIST_ENTRY(FvdAIOCB) next;
+    int64_t begin;
+    int64_t end;
+    QLIST_HEAD(DependentWritesHead, FvdAIOCB) dependent_writes;
+} CopyLock;
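+
+/* A minimal sketch (assumption; the real code is in the FVD read/write paths)
+ * of how a new request could be serialized against the ranges held in
+ * BDRVFvdState.copy_locks, with begin/end in sectors:
+ *
+ *   FvdAIOCB *old;
+ *   QLIST_FOREACH (old, &s->copy_locks, copy_lock.next) {
+ *       if (new_begin < old->copy_lock.end && old->copy_lock.begin < new_end) {
+ *           QLIST_INSERT_HEAD (&old->copy_lock.dependent_writes, new_acb,
+ *                              write.next_dependent_write);
+ *           return;   (restarted later by restart_dependent_writes())
+ *       }
+ *   }
+ */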
+
+typedef struct ChildAIOReadCB {
+    BlockDriverAIOCB *hd_acb;
+    struct iovec iov;
+    QEMUIOVector qiov;
+    int64_t sector_num;
+    int nb_sectors;
+    int done;
+} ChildAIOReadCB;
+
+typedef struct AIOReadCB {
+    QEMUIOVector *qiov;
+    int ret;
+    ChildAIOReadCB read_backing;
+    ChildAIOReadCB read_fvd;
+} AIOReadCB;
+
+/* For copy-on-read and prefetching. */
+typedef struct AIOCopyCB {
+    BlockDriverAIOCB *hd_acb;
+    struct iovec iov;
+    QEMUIOVector qiov;
+    uint8_t *buf;
+    int64_t buffered_sector_begin;
+    int64_t buffered_sector_end;
+    int64_t last_prefetch_op_start_time;        /* For prefetch only. */
+} AIOCopyCB;
+
+typedef struct AIOWriteCB {
+    BlockDriverAIOCB *hd_acb;
+    QEMUIOVector *qiov;
+    uint8_t *cow_buf;
+    QEMUIOVector *cow_qiov;
+    int64_t cow_start_sector;
+    int update_table;        /* TRUE or FALSE. */
+    int ret;
+    QLIST_ENTRY(FvdAIOCB) next_write_lock;   /* See BDRVFvdState.write_locks */
+
+    /* See FvdAIOCB.copy_lock.dependent_writes. */
+    QLIST_ENTRY(FvdAIOCB) next_dependent_write;
+} AIOWriteCB;
+
+/* For AIOStoreCompactCB and AIOLoadCompactCB. */
+typedef struct CompactChildCB {
+    struct FvdAIOCB *acb;
+    BlockDriverAIOCB *hd_acb;
+} CompactChildCB;
+
+/* For storing data to a compact image. */
+typedef struct AIOStoreCompactCB {
+    CompactChildCB one_child;
+    CompactChildCB *children;
+    int update_table;
+    int num_children;
+    int finished_children;
+    struct FvdAIOCB *parent_acb;
+    int ret;
+    int soft_write; /*TRUE if the store is caused by copy-on-read or prefetch.*/
+    QEMUIOVector *orig_qiov;
+} AIOStoreCompactCB;
+
+/* For loading data from a compact image. */
+typedef struct AIOLoadCompactCB {
+    CompactChildCB *children;
+    CompactChildCB one_child;
+    int num_children;
+    int finished_children;
+    struct FvdAIOCB *parent_acb;
+    int ret;
+    QEMUIOVector *orig_qiov;
+} AIOLoadCompactCB;
+
+typedef struct AIOFlushCB {
+    BlockDriverAIOCB *data_acb;
+    BlockDriverAIOCB *metadata_acb;
+    int num_finished;
+    int ret;
+} AIOFlushCB;
+
+typedef struct AIOWrapperCB {
+    QEMUBH *bh;
+} AIOWrapperCB;
+
+typedef enum { OP_READ = 1, OP_WRITE, OP_COPY, OP_STORE_COMPACT,
+    OP_LOAD_COMPACT, OP_WRAPPER, OP_FLUSH } op_type;
+
+#ifdef FVD_DEBUG
+/* For debugging memory leaks. */
+typedef struct alloc_tracer_t {
+    int64_t magic;
+    int alloc_tracer;
+    const char *alloc_file;
+    int alloc_line;
+    size_t size;
+} alloc_tracer_t;
+#endif
+
+typedef struct FvdAIOCB {
+    BlockDriverAIOCB common;
+    op_type type;
+    int64_t sector_num;
+    int nb_sectors;
+    JournalCB jcb;        /* For AIOWriteCB and AIOStoreCompactCB. */
+    CopyLock copy_lock;        /* For AIOWriteCB and AIOCopyCB. */
+
+    /* Use a union so that all requests can efficiently share one big AIOPool.*/
+    union {
+        AIOWrapperCB wrapper;
+        AIOReadCB read;
+        AIOWriteCB write;
+        AIOCopyCB copy;
+        AIOLoadCompactCB load;
+        AIOStoreCompactCB store;
+        AIOFlushCB flush;
+    };
+
+#ifdef FVD_DEBUG
+    int64_t magic;
+    alloc_tracer_t tracer;
+
+    /* Uniquely identifies a request across all processing activities. */
+    unsigned long long int uuid;
+#endif
+} FvdAIOCB;
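+
+/* All request types share one AIOPool sized for the full FvdAIOCB; acb->type
+ * records which union member is live. A typical (illustrative) allocation is:
+ *
+ *   FvdAIOCB *acb = my_qemu_aio_get (&fvd_aio_pool, bs, cb, opaque);
+ *   acb->type = OP_LOAD_COMPACT;
+ *   ... from here on only acb->load may be touched ...
+ */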
+
+static AIOPool fvd_aio_pool;
+static BlockDriver bdrv_fvd;
+static QEMUOptionParameter fvd_create_options[];
+
+/* Function prototypes. */
+static int do_aio_write(struct FvdAIOCB *acb);
+static void finish_write_data(void *opaque, int ret);
+static void restart_dependent_writes(struct FvdAIOCB *acb);
+static void finish_prefetch_read(void *opaque, int ret);
+static int read_fvd_header(BDRVFvdState * s, FvdHeader * header);
+static int update_fvd_header(BDRVFvdState * s, FvdHeader * header);
+static void fvd_aio_cancel(BlockDriverAIOCB * blockacb);
+static BlockDriverAIOCB *store_data_in_compact_image(struct FvdAIOCB *acb,
+            int soft_write, struct FvdAIOCB *parent_acb, BlockDriverState * bs,
+            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+            BlockDriverCompletionFunc * cb, void *opaque);
+static BlockDriverAIOCB *load_data_from_compact_image(struct FvdAIOCB *acb,
+            struct FvdAIOCB *parent_acb, BlockDriverState * bs,
+            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+            BlockDriverCompletionFunc * cb, void *opaque);
+static void free_write_resource(struct FvdAIOCB *acb);
+static void write_metadata_to_journal(struct FvdAIOCB *acb);
+static void flush_metadata_to_disk(BlockDriverState * bs);
+static void free_journal_sectors(BDRVFvdState * s);
+static int fvd_create(const char *filename, QEMUOptionParameter * options);
+static int fvd_probe(const uint8_t * buf, int buf_size, const char *filename);
+static int fvd_open(BlockDriverState * bs, const char *filename, int flags);
+static void fvd_close(BlockDriverState * bs);
+static int fvd_is_allocated(BlockDriverState * bs, int64_t sector_num,
+                            int nb_sectors, int *pnum);
+static int fvd_flush(BlockDriverState * bs);
+static BlockDriverAIOCB *fvd_aio_readv(BlockDriverState * bs,
+            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+            BlockDriverCompletionFunc * cb, void *opaque);
+static BlockDriverAIOCB *fvd_aio_writev(BlockDriverState * bs,
+            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
+            BlockDriverCompletionFunc * cb, void *opaque);
+static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState * bs,
+            BlockDriverCompletionFunc * cb, void *opaque);
+static int fvd_get_info(BlockDriverState * bs, BlockDriverInfo * bdi);
+static int fvd_update(BlockDriverState * bs, int argc, char **argv);
+static int fvd_has_zero_init(BlockDriverState * bs);
+static void fvd_read_cancel(FvdAIOCB * acb);
+static void fvd_write_cancel(FvdAIOCB * acb);
+static void fvd_copy_cancel(FvdAIOCB * acb);
+static void fvd_load_compact_cancel(FvdAIOCB * acb);
+static void fvd_store_compact_cancel(FvdAIOCB * acb);
+static void fvd_wrapper_cancel(FvdAIOCB * acb);
+static void flush_metadata_to_disk_on_exit (BlockDriverState *bs);
+static inline BlockDriverAIOCB *load_data(FvdAIOCB * parent_acb,
+            BlockDriverState * bs, int64_t sector_num, QEMUIOVector * orig_qiov,
+            int nb_sectors, BlockDriverCompletionFunc * cb, void *opaque);
+static inline BlockDriverAIOCB *store_data(int soft_write,
+            FvdAIOCB * parent_acb, BlockDriverState * bs, int64_t sector_num,
+            QEMUIOVector * orig_qiov, int nb_sectors,
+            BlockDriverCompletionFunc * cb, void *opaque);
+
+/* Default configurations. */
+#define DEF_PAGE_SIZE                                 4096        /* bytes */
+#define BYTES_PER_PREFETCH                        1048576        /* bytes */
+#define PREFETCH_THROTTLING_TIME                30000        /* milliseconds */
+#define NUM_PREFETCH_SLOTS                        2
+#define PREFETCH_MIN_MEASURE_READ_TIME                 100        /* milliseconds */
+#define PREFETCH_MIN_MEASURE_WRITE_TIME         100        /* milliseconds */
+#define PREFETCH_MIN_READ_THROUGHPUT                 5120        /* KB/s */
+#define PREFETCH_MIN_WRITE_THROUGHPUT                 5120        /* KB/s */
+#define PREFETCH_MAX_READ_THROUGHPUT                 1000000000L        /* KB/s */
+#define PREFETCH_MAX_WRITE_THROUGHPUT                 1000000000L        /* KB/s */
+#define PREFETCH_PERF_CALC_ALPHA                80        /* in [0,100]. */
+#define MAX_OUTSTANDING_COPY_ON_READ_DATA        2000000                /* bytes */
+#define MODERATE_BITMAP_SIZE                         4194304L        /* bytes */
+#define CHUNK_SIZE                                1048576LL        /* bytes */
+#define JOURNAL_SIZE                                16777216LL        /* bytes */
+#define STORAGE_GROW_UNIT                        104857600LL        /* bytes */
+
+/* State of BDRVFvdState.prefetch_state. */
+#define PREFETCH_STATE_RUNNING                        1
+#define PREFETCH_STATE_FINISHED                        2
+#define PREFETCH_STATE_DISABLED                        3
+
+/* For convenience. */
+#define ROUND_UP(x, base)           ((((x)+(base)-1) / (base)) * (base))
+#define ROUND_DOWN(x, base)           ((((x) / (base)) * (base)))
+#define BOOL(x)                 ((x) ? "true" : "false")
+#define EMPTY_TABLE                ((uint32_t)0xFFFFFFFF)
+#define DIRTY_TABLE                ((uint32_t)0x80000000)
+#define READ_TABLE(entry)         (le32_to_cpu(entry) & ~DIRTY_TABLE)
+#define FVDAIOCB_MAGIC         ((uint64_t)0x3A8FCE89325B976DULL)
+#define FVD_ALLOC_MAGIC         ((uint64_t)0x4A7dCEF9925B976DULL)
+#define IS_EMPTY(entry)         ((entry) == EMPTY_TABLE)
+#define IS_DIRTY(entry)         (le32_to_cpu(entry) & DIRTY_TABLE)
+#define WRITE_TABLE(entry,id)         ((entry) = cpu_to_le32(id))
+#define READ_TABLE2(entry) \
+    ((entry)==EMPTY_TABLE ? EMPTY_TABLE : (le32_to_cpu(entry) & ~DIRTY_TABLE))
+
+#define CLEAN_DIRTY(entry) \
+    do {  \
+        if (!IS_EMPTY(entry))  \
+            entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE);  \
+    } while (0)
+
+#define CLEAN_DIRTY2(entry) \
+    do { \
+        ASSERT(!IS_EMPTY(entry)); \
+        entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE);  \
+    } while (0)
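+
+/* Example of the table-entry encoding implied by the macros above: an entry is
+ * a little-endian uint32_t holding the physical chunk id, with the most
+ * significant bit as the dirty flag and 0xFFFFFFFF meaning "not allocated":
+ *
+ *   uint32_t e = EMPTY_TABLE;            IS_EMPTY (e)  -> true
+ *   WRITE_TABLE (e, 7 | DIRTY_TABLE);    IS_DIRTY (e)  -> true
+ *   READ_TABLE (e)                       -> 7
+ *   CLEAN_DIRTY (e);                     IS_DIRTY (e)  -> false
+ */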
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Qemu-devel] [PATCH 4/5] Fast Virtual Disk (FVD) Proposal Part 4
  2011-01-19 22:04 [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1 Chunqiang Tang
  2011-01-19 22:04 ` [Qemu-devel] [PATCH 2/5] Fast Virtual Disk (FVD) Proposal Part 2 Chunqiang Tang
  2011-01-19 22:04 ` [Qemu-devel] [PATCH 3/5] Fast Virtual Disk (FVD) Proposal Part 3 Chunqiang Tang
@ 2011-01-19 22:04 ` Chunqiang Tang
  2011-01-19 22:04 ` [Qemu-devel] [PATCH 5/5] Fast Virtual Disk (FVD) Proposal Part 5 Chunqiang Tang
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 18+ messages in thread
From: Chunqiang Tang @ 2011-01-19 22:04 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chunqiang Tang

Part 4 of the block device driver for the proposed FVD image format.
Multiple patches are used in order to manage the size of each patch.
This patch includes some new files for FVD.

See the related discussions at
http://lists.gnu.org/archive/html/qemu-devel/2011-01/msg00426.html .

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 block/fvd-journal.c  |  558 +++++++++++++++++++++++++++++++++++++++++++++
 block/fvd-load.c     |  364 +++++++++++++++++++++++++++++
 block/fvd-misc.c     |  616 ++++++++++++++++++++++++++++++++++++++++++++++++++
 block/fvd-open.c     |  446 ++++++++++++++++++++++++++++++++++++
 block/fvd-prefetch.c |  598 ++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 2582 insertions(+), 0 deletions(-)
 create mode 100644 block/fvd-journal.c
 create mode 100644 block/fvd-load.c
 create mode 100644 block/fvd-misc.c
 create mode 100644 block/fvd-open.c
 create mode 100644 block/fvd-prefetch.c

diff --git a/block/fvd-journal.c b/block/fvd-journal.c
new file mode 100644
index 0000000..7bd316a
--- /dev/null
+++ b/block/fvd-journal.c
@@ -0,0 +1,558 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this FVD module implements a journal for committing
+ *  metadata changes. Each sector in the journal is self-contained so that
+ *  updates are atomic. A sector may contain one or multiple journal records.
+ *  There are two types of journal records: bitmap_update and table_update.
+ *   Format of a bitmap_update record:
+ *         + BITMAP_JRECORD (uint32_t)
+ *         + num_dirty_sectors (uint32_t)
+ *         + dirty_sector_begin (int64_t)
+ *   Format of a table_update record:
+ *         + TABLE_JRECORD (uint32_t)
+ *         + dirty_table_offset (uint32_t)
+ *         + num_dirty_table_entries (uint32_t)
+ *         +   table_entry_1 (uint32_t)
+ *         +   table_entry_2 (uint32_t)
+ *         +   ...
+ * If both the bitmap and the table need to be updated, one sector contains a
+ * TABLE_JRECORD and a BITMAP_JRECORD, and these two records cover
+ * the same range of virtual disk data so that the corresponding parts of the
+ * bitmap and the table are always updated in one atomic operation.
+ *============================================================================*/
+
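+/* Illustrative layout of one 512-byte journal sector that carries both record
+ * types (the mixed case used when the table and the bitmap change together),
+ * as written by write_metadata_to_journal() below; offsets in bytes, all
+ * integers little-endian:
+ *
+ *   [0]       TABLE_JRECORD magic          (uint32_t)
+ *   [4]       dirty_table_offset           (uint32_t)
+ *   [8]       num_dirty_table_entries = N  (uint32_t)
+ *   [12]      N table entries              (N * uint32_t)
+ *   [12+4N]   BITMAP_JRECORD magic         (uint32_t)
+ *   [16+4N]   num_dirty_sectors            (uint32_t)
+ *   [20+4N]   dirty_sector_begin           (int64_t)
+ *   [28+4N]   EMPTY_JRECORD marker, if any room is left
+ */
+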
+#define BITMAP_JRECORD                 ((uint32_t)0x3F2AB8ED)
+#define TABLE_JRECORD                ((uint32_t)0xB4E6F7AC)
+#define EMPTY_JRECORD                ((uint32_t)0)
+#define BITMAP_JRECORD_SIZE         (2*sizeof(uint32_t) + sizeof(int64_t))
+#define TABLE_JRECORD_HDR_SIZE         (3*sizeof(uint32_t))
+#define TABLE_JRECORDS_PER_SECTOR \
+                ((512 - TABLE_JRECORD_HDR_SIZE)/sizeof(uint32_t))
+
+/* One BITMAP_JRECORD plus a TABLE_JRECORD with this number of table entries
+ * can fit in one journal sector. */
+#define MIXED_JRECORDS_PER_SECTOR ((512 - TABLE_JRECORD_HDR_SIZE - \
+                                BITMAP_JRECORD_SIZE) / sizeof(uint32_t))
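+
+/* With 512-byte journal sectors the constants above work out to:
+ *   BITMAP_JRECORD_SIZE       = 2*4 + 8 = 16 bytes
+ *   TABLE_JRECORD_HDR_SIZE    = 3*4     = 12 bytes
+ *   TABLE_JRECORDS_PER_SECTOR = (512 - 12) / 4      = 125 table entries
+ *   MIXED_JRECORDS_PER_SECTOR = (512 - 12 - 16) / 4 = 121 table entries
+ * i.e. a sector carrying both a TABLE_JRECORD and a BITMAP_JRECORD covers at
+ * most 121 chunks of virtual disk data.
+ */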
+
+static inline int64_t calc_min_journal_size (int64_t table_entries)
+{
+    return (table_entries + MIXED_JRECORDS_PER_SECTOR - 1)
+                            / MIXED_JRECORDS_PER_SECTOR * 512;
+}
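+
+/* Worked example: a 1TB virtual disk with 1MB chunks has 1048576 table
+ * entries, so the minimum journal size is
+ *   ceil (1048576 / 121) * 512 = 8666 * 512 = 4436992 bytes (about 4.2MB),
+ * i.e. enough room for one mixed record covering every chunk before the
+ * journal has to be recycled.
+ */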
+
+static int init_journal (int read_only, BlockDriverState * bs,
+                         FvdHeader * header)
+{
+    /* A trick to figure out whether it is running in a qemu tool. */
+    const int in_qemu_tool = (rt_clock == NULL);
+
+    BDRVFvdState *s = bs->opaque;
+    s->journal_size = header->journal_size / 512;
+    s->journal_offset = header->journal_offset / 512;
+    s->next_journal_sector = 0;
+
+    if (read_only) {
+        return 0;
+    }
+
+    if (s->journal_size <= 0) {
+        if (!s->table && !s->fresh_bitmap) {
+            return 0;        /* No need to use the journal. */
+        }
+
+        if (!header->clean_shutdown) {
+            fprintf (stderr, "ERROR: the image may be corrupted because it was "
+                     "not shut down gracefully last\ntime and it does not use "
+                     "a journal. You may continue to use the image at your\n"
+                     "own risk by manually resetting the clean_shutdown flag "
+                     "in the image.\n\n");
+            s->dirty_image = TRUE;
+            if (in_qemu_tool) {
+                return 0;        /* Allow qemu tools to use the image. */
+            } else {
+                /* Do not allow booting the VM until the clean_shutdown flag
+                 * is manually reset. */
+                return -1;
+            }
+        }
+
+        QDEBUG ("Journal is disabled\n");
+        return 0;
+    }
+
+    if (header->clean_shutdown) {
+        QDEBUG ("Journal is skipped as the VM was shut down gracefully "
+                "last time.\n");
+        return 0;
+    }
+
+    QDEBUG ("Recover from the journal as the VM was not shut down gracefully "
+            "last time.\n");
+
+    uint8_t *journal = my_qemu_blockalign (s->fvd_metadata,
+                                           s->journal_size * 512);
+    int ret = bdrv_read (s->fvd_metadata, s->journal_offset,
+                         journal, s->journal_size);
+    if (ret < 0) {
+        my_qemu_vfree (journal);
+        fprintf (stderr, "Failed to read the journal (%" PRId64 " bytes)\n",
+                 s->journal_size * 512);
+        return -1;
+    }
+
+    /* Go through every journal sector. */
+    uint8_t *sector = journal;
+    uint8_t *journal_end = journal + s->journal_size * 512;
+    while (sector < journal_end) {
+        uint32_t *type = (uint32_t *) sector;        /* Journal record type. */
+        while ((uint8_t *) type < (sector + 512)) {
+            if (le32_to_cpu (*type) == BITMAP_JRECORD) {
+                uint32_t *nb_sectors = type + 1; /* BITMAP_JRECORD field 2. */
+                int64_t *sector_num = (int64_t *) (type + 2);        /* field 3. */
+                if (s->stale_bitmap) {
+                    update_both_bitmaps (s, le64_to_cpu (*sector_num),
+                                     le32_to_cpu (*nb_sectors));
+                    QDEBUG ("JOURNAL: recover BITMAP_JRECORD sector_num=%"
+                            PRId64 " nb_sectors=%u\n",
+                            le64_to_cpu (*sector_num),
+                            le32_to_cpu (*nb_sectors));
+                }
+
+                /* First field of the next journal record. */
+                type = (uint32_t *) sector_num + 1;
+            } else if (le32_to_cpu (*type) == TABLE_JRECORD) {
+                uint32_t *offset = type + 1;        /* TABLE_JRECORD field 2. */
+                uint32_t *count = type + 2;        /* TABLE_JRECORD field 3. */
+                uint32_t *content = type + 3;        /* fields 4 and beyond. */
+                const uint32_t chunk = le32_to_cpu (*offset);
+                const uint32_t n = le32_to_cpu (*count);
+                uint32_t i;
+                for (i = 0; i < n; i++) {
+                    s->table[chunk + i] = content[i];
+
+                    /* The dirty bit was not cleaned when the table entry was
+                     * saved in the journal. */
+                    CLEAN_DIRTY2 (s->table[chunk + i]);
+                }
+                type = content + n; /* First field of the next record. */
+                QDEBUG ("JOURNAL: recover TABLE_JRECORD chunk_start=%u "
+                        "nb_chunks=%u\n", chunk, n);
+            } else {
+                /* End of valid records in this journal sector. */
+                ASSERT (le32_to_cpu (*type) == EMPTY_JRECORD);
+                break;
+            }
+        }
+
+        sector += 512;
+    }
+    my_qemu_vfree (journal);
+    flush_metadata_to_disk (bs);        /* Write the recovered metadata. */
+
+    return 0;
+}
+
+/*
+ * This function first flushes the in-memory metadata to disk and then
+ * recycles the used journal sectors. This operation could be made
+ * asynchronous to improve performance, but the overall gain is likely
+ * limited: recycling the journal happens infrequently, and updating the
+ * on-disk metadata finishes quickly because the metadata is small.
+ */
+static void recycle_journal (BDRVFvdState * s)
+{
+#ifdef FVD_DEBUG
+    static int64_t recycle_count = 0;
+    QDEBUG ("JOURNAL: start journal recycle %" PRId64 ".\n", recycle_count);
+    recycle_count++;
+    int64_t begin_time = qemu_get_clock (rt_clock);
+#endif
+
+    /* Write fresh_bitmap to disk. */
+    if (s->fresh_bitmap) {
+        int nb = (int) (s->bitmap_size / 512);
+        QDEBUG ("JOURNAL: flush bitmap (%d sectors) to disk\n", nb);
+
+        /* How to recover if this write fails? */
+        bdrv_write (s->fvd_metadata, s->bitmap_offset, s->fresh_bitmap, nb);
+
+        if (s->fresh_bitmap != s->stale_bitmap) {
+            memcpy (s->stale_bitmap, s->fresh_bitmap, s->bitmap_size);
+        }
+    }
+
+    /* Clean DIRTY_TABLE bit and write the table to disk. */
+    if (s->table) {
+        int table_entries =
+            (int) (ROUND_UP (s->virtual_disk_size, s->chunk_size * 512) /
+                   (s->chunk_size * 512));
+        int i;
+        for (i = 0; i < table_entries; i++) {
+            CLEAN_DIRTY (s->table[i]);
+        }
+
+        int64_t table_size = sizeof (uint32_t) * table_entries;
+        table_size = ROUND_UP (table_size, DEF_PAGE_SIZE);
+        int nb = (int) (table_size / 512);
+        QDEBUG ("JOURNAL: flush table (%d sectors) to disk\n", nb);
+
+        /* How to recover if this write fails? */
+        bdrv_write (s->fvd_metadata, s->table_offset, (uint8_t *) s->table, nb);
+    }
+    s->next_journal_sector = 0;
+
+#ifdef FVD_DEBUG
+    int64_t end_time = qemu_get_clock (rt_clock);
+    QDEBUG ("JOURNAL: journal recycle took %" PRId64 " ms.\n",
+            (end_time - begin_time));
+#endif
+}
+
+static void free_journal_sectors (BDRVFvdState * s)
+{
+    if (s->journal_size <= 0) {
+        return;
+    }
+
+    s->ongoing_journal_updates--;
+    ASSERT (s->ongoing_journal_updates >= 0);
+    if (s->ongoing_journal_updates > 0 || QLIST_EMPTY (&s->wait_for_journal)) {
+        return;
+    }
+
+    /* Some requests are waiting for the journal to be recycled in order to
+     * get free journal sectors. */
+    recycle_journal (s);
+
+    /* Restart requests in the wait_for_journal list.  First make a copy of
+     * the head and then empty the head. */
+    FvdAIOCB *acb = QLIST_FIRST (&s->wait_for_journal);
+    QLIST_INIT (&s->wait_for_journal);
+    FvdAIOCB *next;
+
+    /* Restart all dependent requests. Cannot use QLIST_FOREACH here, because
+     * the next link might not be the same any more after the callback. */
+    while (acb) {
+        next = acb->jcb.next_wait_for_journal.le_next;
+        acb->jcb.next_wait_for_journal.le_prev = NULL;
+        QDEBUG ("WRITE: acb%llu-%p  restart_write_metadata_to_journal "
+                "after recycle_journal\n", acb->uuid, acb);
+        write_metadata_to_journal (acb);
+        acb = next;
+    }
+}
+
+static int64_t allocate_journal_sectors (BDRVFvdState * s, FvdAIOCB * acb,
+                                         int num_sectors)
+{
+    ASSERT (num_sectors <= s->journal_size);
+
+    if (!QLIST_EMPTY (&s->wait_for_journal)) {
+        /* Waiting for journal recycle to finish. */
+        ASSERT (s->ongoing_journal_updates > 0);
+        QDEBUG ("WRITE: acb%llu-%p  wait_for_journal_recycle\n",
+                acb->uuid, acb);
+        QLIST_INSERT_HEAD (&s->wait_for_journal, acb,
+                           jcb.next_wait_for_journal);
+        return -1;
+    }
+
+    int64_t journal_sec;
+    if (s->next_journal_sector + num_sectors <= s->journal_size) {
+      alloc_sector:
+        journal_sec = s->next_journal_sector;
+        s->next_journal_sector += num_sectors;
+        s->ongoing_journal_updates++;
+        return journal_sec;
+    }
+
+    /* No free journal sector is available. Check if the journal can be
+     * recycled now. */
+    if (s->ongoing_journal_updates == 0) {
+        recycle_journal (s);
+        goto alloc_sector;
+    }
+
+    /* Waiting for journal recycle to finish. It will be woken up later in
+     * free_journal_sectors(). */
+    QLIST_INSERT_HEAD (&s->wait_for_journal, acb, jcb.next_wait_for_journal);
+    QDEBUG ("WRITE: acb%llu-%p  wait_for_journal_recycle\n", acb->uuid, acb);
+    return -1;
+}
+
+static void finish_write_journal (void *opaque, int ret)
+{
+    FvdAIOCB *acb = (FvdAIOCB *) opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    if (ret == 0) {
+        QDEBUG ("JOURNAL: acb%llu-%p  finish_write_journal\n", acb->uuid, acb);
+
+        if (s->table) {
+            /* Update the table. */
+            int i;
+            const uint32_t first_chunk = acb->sector_num / s->chunk_size;
+            const uint32_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
+                                                            / s->chunk_size;
+            for (i = first_chunk; i <= last_chunk; i++) {
+                CLEAN_DIRTY2 (s->table[i]);
+            }
+        }
+
+        if (s->stale_bitmap) {
+            /* If fresh_bitmap differs from stale_bitmap, fresh_bitmap has
+             * already been updated in finish_write_data() when invoking
+             * update_fresh_bitmap_and_check_stale_bitmap(). */
+            update_stale_bitmap (s, acb->sector_num, acb->nb_sectors);
+        }
+    } else {
+        QDEBUG ("JOURNAL: acb%llu-%p  finish_write_journal error ret=%d\n",
+                acb->uuid, acb, ret);
+    }
+
+    /* Clean up. */
+    if (acb->type == OP_STORE_COMPACT) {
+        acb->common.cb (acb->common.opaque, ret);
+        if (acb->jcb.iov.iov_base != NULL) {
+            my_qemu_vfree (acb->jcb.iov.iov_base);
+        }
+        my_qemu_aio_release (acb);
+    } else {
+        ASSERT (acb->type == OP_WRITE);
+        finish_write (acb, ret);
+    }
+
+    free_journal_sectors (s);
+}
+
+static void write_metadata_to_journal (FvdAIOCB * acb)
+{
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+    int64_t journal_sec;
+    int num_journal_sectors;
+
+    ASSERT ((s->table || s->fresh_bitmap)
+            && (acb->type == OP_WRITE || acb->type == OP_STORE_COMPACT));
+
+    /* Is the journal disabled? */
+    if (s->journal_size <= 0) {
+        finish_write_journal (acb, 0);
+        return;
+    }
+
+    if (!s->table) {
+        /* Only update the bitmap. */
+        num_journal_sectors = 1;
+        journal_sec = allocate_journal_sectors (s, acb, num_journal_sectors);
+        if (journal_sec < 0) {
+            /* No journal sector is available now. It will be woken up later
+             * in free_journal_sectors(). */
+            return;
+        }
+        acb->jcb.iov.iov_len = 512;
+        acb->jcb.iov.iov_base = my_qemu_blockalign (s->fvd_metadata, 512);
+
+        uint32_t *type = (uint32_t *) acb->jcb.iov.iov_base; /* Field 1. */
+        uint32_t *nb_sectors = type + 1;        /* BITMAP_JRECORD field 2. */
+        int64_t *sector_num = (int64_t *) (type + 2);        /* field 3. */
+        *type = cpu_to_le32 (BITMAP_JRECORD);
+        *nb_sectors = cpu_to_le32 ((uint32_t) acb->nb_sectors);
+        *sector_num = cpu_to_le64 (acb->sector_num);
+        *((uint32_t *) (sector_num + 1)) = EMPTY_JRECORD;/* Mark record end. */
+
+    } else if (!s->fresh_bitmap) {
+        /* Only update the table. */
+        const int64_t first_chunk = acb->sector_num / s->chunk_size;
+        const int64_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
+                                                            / s->chunk_size;
+        int num_chunks = last_chunk - first_chunk + 1;
+        num_journal_sectors = (num_chunks + TABLE_JRECORDS_PER_SECTOR - 1)
+                                                / TABLE_JRECORDS_PER_SECTOR;
+        journal_sec = allocate_journal_sectors (s, acb, num_journal_sectors);
+        if (journal_sec < 0) {
+            /* No journal sector is available now. It will be woken up later
+             * in free_journal_sectors(). */
+            return;
+        }
+
+        acb->jcb.iov.iov_len = num_journal_sectors * 512;
+        acb->jcb.iov.iov_base = my_qemu_blockalign (s->fvd_metadata,
+                                                    acb->jcb.iov.iov_len);
+
+        uint32_t *type = (uint32_t *) acb->jcb.iov.iov_base; /* Field 1. */
+        int64_t chunk = first_chunk;
+
+        while (1) {
+            /* Start a new journal sector. */
+            uint32_t *offset = type + 1;        /* TABLE_JRECORD field 2. */
+            uint32_t *count = type + 2;        /* TABLE_JRECORD field 3. */
+            uint32_t *content = type + 3;        /* Fields 4 and beyond. */
+            *type = cpu_to_le32 (TABLE_JRECORD);
+            *offset = cpu_to_le32 (chunk);
+
+            if (num_chunks <= TABLE_JRECORDS_PER_SECTOR) {
+                /* This is the last journal sector. */
+                *count = cpu_to_le32 (num_chunks);
+                memcpy (content, &s->table[chunk],
+                        sizeof (uint32_t) * num_chunks);
+                if (num_chunks < TABLE_JRECORDS_PER_SECTOR) {
+                    *(content + num_chunks) = EMPTY_JRECORD; /* Mark end. */
+                }
+                break;
+            }
+
+            *count = cpu_to_le32 (TABLE_JRECORDS_PER_SECTOR);
+            memcpy (content, &s->table[chunk],
+                    sizeof (uint32_t) * TABLE_JRECORDS_PER_SECTOR);
+            chunk += TABLE_JRECORDS_PER_SECTOR;
+            num_chunks -= TABLE_JRECORDS_PER_SECTOR;
+
+            /* Next TABLE_JRECORD field 1. */
+            type = content + TABLE_JRECORDS_PER_SECTOR;
+        }
+    } else {
+        /* Update both the table and the bitmap. It may use multiple journal
+         * sectors. Each sector is self-contained, including a TABLE_JRECORD
+         * and a BITMAP_JRECORD. The two records in the same sector cover the
+         * same range of virtual disk data.  The purpose is to update the
+         * corresponding parts of the bitmap and the table in one atomic
+         * operation. */
+        const int64_t first_chunk = acb->sector_num / s->chunk_size;
+        const int64_t last_chunk = (acb->sector_num + acb->nb_sectors - 1)
+                                                / s->chunk_size;
+        int num_chunks = last_chunk - first_chunk + 1;
+        num_journal_sectors = (num_chunks + MIXED_JRECORDS_PER_SECTOR - 1)
+                                                / MIXED_JRECORDS_PER_SECTOR;
+        journal_sec = allocate_journal_sectors (s, acb, num_journal_sectors);
+        if (journal_sec < 0) {
+            /* No journal sector is available now. It will be woken up later
+             * in free_journal_sectors(). */
+            return;
+        }
+        acb->jcb.iov.iov_len = num_journal_sectors * 512;
+        acb->jcb.iov.iov_base = my_qemu_blockalign (s->fvd_metadata,
+                                                    acb->jcb.iov.iov_len);
+
+        uint32_t *type = (uint32_t *) acb->jcb.iov.iov_base; /* Field 1. */
+        int64_t chunk = first_chunk;
+        int64_t sector_num = acb->sector_num;
+        uint32_t nb_sectors;
+        if (num_journal_sectors == 1) {
+            nb_sectors = acb->nb_sectors;
+        } else {
+            /* Number of data sectors covered by the first journal sector. */
+            nb_sectors = (first_chunk + MIXED_JRECORDS_PER_SECTOR)
+                                    * s->chunk_size - acb->sector_num;
+        }
+
+        while (1) {
+            /* Start a new journal sector. */
+            uint32_t *offset = type + 1;        /* TABLE_JRECORD field 2. */
+            uint32_t *count = type + 2;                /* TABLE_JRECORD field 3. */
+            uint32_t *content = type + 3;         /* Fields 4 and beyond. */
+            *type = cpu_to_le32 (TABLE_JRECORD);
+            *offset = cpu_to_le32 (chunk);
+
+            if (num_chunks <= MIXED_JRECORDS_PER_SECTOR) {
+                /* This is the last journal sector. */
+                *count = cpu_to_le32 (num_chunks);
+                memcpy (content, &s->table[chunk],
+                        sizeof (uint32_t) * num_chunks);
+
+                /* A BITMAP_JRECORD follows a TABLE_JRECORD so that they are
+                 * updated in one atomic operation. */
+                type = content + num_chunks;        /* BITMAP_JRECORD field 1. */
+                uint32_t *p_nb_sectors = type + 1; /* BITMAP_JRECORD field 2. */
+                int64_t *p_sector_num = (int64_t *) (type + 2);        /* Field 3. */
+                *type = cpu_to_le32 (BITMAP_JRECORD);
+                *p_nb_sectors = cpu_to_le32 (nb_sectors);
+                *p_sector_num = cpu_to_le64 (sector_num);
+
+                if (num_chunks < MIXED_JRECORDS_PER_SECTOR) {
+                    *((uint32_t *) (p_sector_num + 1)) = EMPTY_JRECORD;        /*End*/
+                }
+                break;
+            }
+
+            *count = cpu_to_le32 (MIXED_JRECORDS_PER_SECTOR);
+            memcpy (content, &s->table[chunk],
+                    sizeof (uint32_t) * MIXED_JRECORDS_PER_SECTOR);
+
+            /* A BITMAP_JRECORD follows a TABLE_JRECORD so that they are
+             * updated in one atomic operation. */
+            type = content + MIXED_JRECORDS_PER_SECTOR;                /* Field 1. */
+            uint32_t *p_nb_sectors = type + 1;        /* BITMAP_JRECORD field 2. */
+            int64_t *p_sector_num = (int64_t *) (type + 2);        /* Field 3. */
+            *type = cpu_to_le32 (BITMAP_JRECORD);
+            *p_nb_sectors = cpu_to_le32 (nb_sectors);
+            *p_sector_num = cpu_to_le64 (sector_num);
+
+            /* Prepare for the next journal sector. */
+            type = (uint32_t *) (p_sector_num + 1);
+            chunk += MIXED_JRECORDS_PER_SECTOR;
+            sector_num = chunk * s->chunk_size;
+            num_chunks -= MIXED_JRECORDS_PER_SECTOR;
+            if (num_chunks <= MIXED_JRECORDS_PER_SECTOR) {
+                /* Data sectors covered by the last journal sector. */
+                nb_sectors = (acb->sector_num + acb->nb_sectors)
+                                            - chunk * s->chunk_size;
+            } else {
+                nb_sectors = s->chunk_size * MIXED_JRECORDS_PER_SECTOR;
+            }
+        }
+    }
+
+    QDEBUG ("JOURNAL: acb%llu-%p  write_metadata_to_journal journal_sec=%"
+            PRId64 " nb_journal_sectors=%d\n", acb->uuid, acb, journal_sec,
+            num_journal_sectors);
+    qemu_iovec_init_external (&acb->jcb.qiov, &acb->jcb.iov, 1);
+    acb->jcb.hd_acb = bdrv_aio_writev (s->fvd_metadata,
+                                       s->journal_offset + journal_sec,
+                                       &acb->jcb.qiov, num_journal_sectors,
+                                       finish_write_journal, acb);
+    if (!acb->jcb.hd_acb) {
+        finish_write_journal (acb, -1);
+    }
+}
+
+#ifdef FVD_DEBUG
+static int emulate_host_crash = TRUE;
+#else
+static int emulate_host_crash = FALSE;
+#endif
+
+static void flush_metadata_to_disk_on_exit (BlockDriverState *bs)
+{
+    BDRVFvdState *s = bs->opaque;
+
+    if (bs->read_only || !s->fvd_metadata) {
+        return;
+    }
+
+    /* If (emulate_host_crash==TRUE), do not flush metadata to disk
+     * so that it has to rely on journal for recovery. */
+    if (s->journal_size <= 0 || !emulate_host_crash) {
+        flush_metadata_to_disk (bs);
+        if (!s->dirty_image) {
+            update_clean_shutdown_flag (s, TRUE);
+        }
+    }
+}
+
+void fvd_enable_host_crash_test (void)
+{
+    emulate_host_crash = TRUE;
+}
diff --git a/block/fvd-load.c b/block/fvd-load.c
new file mode 100644
index 0000000..fd72e31
--- /dev/null
+++ b/block/fvd-load.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this FVD module implements loading data from a
+ *  compact image.
+ *============================================================================*/
+
+static void aio_wrapper_bh (void *opaque);
+static void finish_load_data_from_compact_image (void *opaque, int ret);
+static inline FvdAIOCB *init_load_acb (FvdAIOCB * parent_acb,
+                                       BlockDriverState * bs,
+                                       int64_t sector_num,
+                                       QEMUIOVector * orig_qiov, int nb_sectors,
+                                       BlockDriverCompletionFunc * cb,
+                                       void *opaque);
+
+static inline BlockDriverAIOCB *load_data (FvdAIOCB * parent_acb,
+                                           BlockDriverState * bs,
+                                           int64_t sector_num,
+                                           QEMUIOVector * orig_qiov,
+                                           int nb_sectors,
+                                           BlockDriverCompletionFunc * cb,
+                                           void *opaque)
+{
+    BDRVFvdState *s = bs->opaque;
+
+    if (!s->table) {
+        /* Load directly since it is not a compact image. */
+        return bdrv_aio_readv (s->fvd_data, s->data_offset + sector_num,
+                               orig_qiov, nb_sectors, cb, opaque);
+    } else {
+        return load_data_from_compact_image (NULL, parent_acb, bs, sector_num,
+                                             orig_qiov, nb_sectors, cb, opaque);
+    }
+}
+
+static BlockDriverAIOCB *
+load_data_from_compact_image (FvdAIOCB * acb, FvdAIOCB * parent_acb,
+                              BlockDriverState * bs, int64_t sector_num,
+                              QEMUIOVector * orig_qiov, int nb_sectors,
+                              BlockDriverCompletionFunc * cb, void *opaque)
+{
+    BDRVFvdState *s = bs->opaque;
+    const uint32_t first_chunk = sector_num / s->chunk_size;
+    const uint32_t last_chunk = (sector_num + nb_sectors - 1) / s->chunk_size;
+    uint32_t chunk;
+    int64_t start_sec;
+    int i;
+
+    if (first_chunk == last_chunk) {
+        goto handle_one_continuous_region;
+    }
+
+    /* Count the number of qiov and iov needed to cover the continuous regions
+     * of the compact image. */
+    int iov_index = 0;
+    size_t iov_left = orig_qiov->iov[0].iov_len;
+    uint8_t *iov_buf = orig_qiov->iov[0].iov_base;
+    int nqiov = 0;
+    int nziov = 0;        /* Number of empty regions. */
+    int niov = 0;
+    uint32_t prev = READ_TABLE2 (s->table[first_chunk]);
+
+    /* Amount of data in the first chunk. */
+    int nb = s->chunk_size - (sector_num % s->chunk_size);
+
+    for (chunk = first_chunk + 1; chunk <= last_chunk; chunk++) {
+        uint32_t current = READ_TABLE2 (s->table[chunk]);
+        int64_t data_size;
+        if (chunk < last_chunk) {
+            data_size = s->chunk_size;
+        } else {
+            data_size = (sector_num + nb_sectors) % s->chunk_size;
+            if (data_size == 0) {
+                data_size = s->chunk_size;
+            }
+        }
+
+        if ((IS_EMPTY (current) && IS_EMPTY (prev)) ||
+            (!IS_EMPTY (prev) && !IS_EMPTY (current) && current == prev + 1)) {
+            nb += data_size;        /* Belongs to the previous continuous region. */
+        } else {
+            /* Terminate the previous continuous region. */
+            if (IS_EMPTY (prev)) {
+                /* Skip this empty region. */
+                count_iov (orig_qiov->iov, &iov_index, &iov_buf,
+                           &iov_left, nb * 512);
+                nziov++;
+            } else {
+                niov += count_iov (orig_qiov->iov, &iov_index, &iov_buf,
+                                   &iov_left, nb * 512);
+                nqiov++;
+            }
+            nb = data_size;        /* Data in the new region. */
+        }
+        prev = current;
+    }
+
+    if (nqiov == 0 && nziov == 0) {
+        /* All data can be read in one qiov. Reuse orig_qiov. */
+      handle_one_continuous_region:
+        if (IS_EMPTY (s->table[first_chunk])) {
+            /* Fill qiov with zeros. */
+            for (i = 0; i < orig_qiov->niov; i++) {
+                memset (orig_qiov->iov[i].iov_base,
+                        0, orig_qiov->iov[i].iov_len);
+            }
+
+            /* Use a bh to invoke the callback. */
+            if (!acb) {
+                if (!(acb = my_qemu_aio_get (&fvd_aio_pool, bs, cb, opaque))) {
+                    return NULL;
+                }
+                COPY_UUID (acb, parent_acb);
+            }
+
+            QDEBUG ("LOAD: acb%llu-%p  load_fill_all_with_zeros\n",
+                    acb->uuid, acb);
+            acb->type = OP_WRAPPER;
+            acb->wrapper.bh = qemu_bh_new (aio_wrapper_bh, acb);
+            qemu_bh_schedule (acb->wrapper.bh);
+            return &acb->common;
+        }
+
+        /* A non-empty region. */
+        start_sec = READ_TABLE (s->table[first_chunk]) * s->chunk_size +
+                                    (sector_num % s->chunk_size);
+        if (!acb) {
+            if (parent_acb) {
+                QDEBUG ("LOAD: acb%llu-%p  "
+                        "load_directly_as_one_continuous_region\n",
+                        parent_acb->uuid, acb);
+            }
+            return bdrv_aio_readv (s->fvd_data, s->data_offset + start_sec,
+                                   orig_qiov, nb_sectors, cb, opaque);
+        }
+
+        QDEBUG ("LOAD: acb%llu-%p  load_directly_as_one_continuous_region\n",
+                acb->uuid, acb);
+        acb->load.num_children = 1;
+        acb->load.one_child.hd_acb =
+            bdrv_aio_readv (s->fvd_data, s->data_offset + start_sec, orig_qiov,
+                            nb_sectors, finish_load_data_from_compact_image,
+                            &acb->load.one_child);
+        if (acb->load.one_child.hd_acb) {
+            acb->load.one_child.acb = acb;
+            return &acb->common;
+        } else {
+            my_qemu_aio_release (acb);
+            return NULL;
+        }
+    }
+
+    /* qiov for the last continuous region. */
+    if (!IS_EMPTY (prev)) {
+        niov += count_iov (orig_qiov->iov, &iov_index, &iov_buf,
+                           &iov_left, nb * 512);
+        nqiov++;
+        ASSERT (iov_index == orig_qiov->niov - 1 && iov_left == 0);
+    }
+
+    /* Need to submit multiple requests to the lower layer. Initialize acb. */
+    if (!acb && !(acb = init_load_acb (parent_acb, bs, sector_num,
+                                       orig_qiov, nb_sectors, cb, opaque))) {
+        return NULL;
+    }
+    acb->load.num_children = nqiov;
+
+    /* Allocate memory and create multiple requests. */
+    acb->load.children = my_qemu_malloc ((sizeof (CompactChildCB) +
+                                          sizeof (QEMUIOVector)) * nqiov +
+                                         sizeof (struct iovec) * niov);
+    QEMUIOVector *q = (QEMUIOVector *) (acb->load.children + nqiov);
+    struct iovec *v = (struct iovec *) (q + nqiov);
+
+    /* Set up iov and qiov. */
+    nqiov = 0;
+    iov_index = 0;
+    iov_left = orig_qiov->iov[0].iov_len;
+    iov_buf = orig_qiov->iov[0].iov_base;
+    nb = s->chunk_size - (sector_num % s->chunk_size); /* Data in first chunk.*/
+    prev = READ_TABLE2 (s->table[first_chunk]);
+
+    /* if (IS_EMPTY(prev)), start_sec will not be used later, and hence safe. */
+    start_sec = prev * s->chunk_size + (sector_num % s->chunk_size);
+
+    for (chunk = first_chunk + 1; chunk <= last_chunk; chunk++) {
+        uint32_t current = READ_TABLE2 (s->table[chunk]);
+        int64_t data_size;
+        if (chunk < last_chunk) {
+            data_size = s->chunk_size;
+        } else {
+            data_size = (sector_num + nb_sectors) % s->chunk_size;
+            if (data_size == 0) {
+                data_size = s->chunk_size;
+            }
+        }
+
+        if ((IS_EMPTY (prev) && IS_EMPTY (current)) ||
+            (!IS_EMPTY (prev) && !IS_EMPTY (current) && current == prev + 1)) {
+            nb += data_size;        /* Continue the previous region. */
+        } else {
+            /* Terminate the previous continuous region. */
+            if (IS_EMPTY (prev)) {
+                zero_iov (orig_qiov->iov, &iov_index, &iov_buf, &iov_left,
+                          nb * 512);        /* Fill iov data with zeros. */
+            } else {
+                niov = setup_iov (orig_qiov->iov, v, &iov_index, &iov_buf,
+                                  &iov_left, nb * 512);
+                qemu_iovec_init_external (q, v, niov);
+                QDEBUG ("LOAD: acb%llu-%p  create_child %d sector_num=%" PRId64
+                        " nb_sectors=%d niov=%d\n", acb->uuid, acb, nqiov,
+                        start_sec, nb, niov);
+                acb->load.children[nqiov].hd_acb =
+                    bdrv_aio_readv (s->fvd_data, s->data_offset + start_sec, q,
+                                    nb, finish_load_data_from_compact_image,
+                                    &acb->load.children[nqiov]);
+                if (!acb->load.children[nqiov].hd_acb) {
+                    goto fail;
+                }
+                acb->load.children[nqiov].acb = acb;
+                v += niov;
+                q++;
+                nqiov++;
+            }
+
+            nb = data_size;
+
+            /* if (IS_EMPTY(current)), start_sec will not be used later. */
+            start_sec = current * s->chunk_size;
+        }
+        prev = current;
+    }
+
+    /* The last continuous region. */
+    if (IS_EMPTY (prev)) {
+        zero_iov (orig_qiov->iov, &iov_index, &iov_buf, &iov_left, nb * 512);
+    } else {
+        niov = setup_iov (orig_qiov->iov, v, &iov_index, &iov_buf,
+                          &iov_left, nb * 512);
+        qemu_iovec_init_external (q, v, niov);
+        QDEBUG ("LOAD: acb%llu-%p  create_child %d sector_num=%" PRId64
+                " nb_sectors=%d niov=%d\n", acb->uuid, acb, nqiov, start_sec,
+                nb, niov);
+        acb->load.children[nqiov].hd_acb =
+            bdrv_aio_readv (s->fvd_data, s->data_offset + start_sec, q, nb,
+                            finish_load_data_from_compact_image,
+                            &acb->load.children[nqiov]);
+        if (!acb->load.children[nqiov].hd_acb) {
+            goto fail;
+        }
+        acb->load.children[nqiov].acb = acb;
+    }
+    ASSERT (iov_index == orig_qiov->niov - 1 && iov_left == 0);
+
+    return &acb->common;
+
+  fail:
+    for (i = 0; i < nqiov; i++) {
+        bdrv_aio_cancel (acb->load.children[i].hd_acb);
+    }
+    my_qemu_free (acb->load.children);
+    my_qemu_aio_release (acb);
+    return NULL;
+}
+
+static void aio_wrapper_bh (void *opaque)
+{
+    FvdAIOCB *acb = opaque;
+    acb->common.cb (acb->common.opaque, 0);
+    qemu_bh_delete (acb->wrapper.bh);
+    my_qemu_aio_release (acb);
+}
+
+static void finish_load_data_from_compact_image (void *opaque, int ret)
+{
+    CompactChildCB *child = opaque;
+    FvdAIOCB *acb = child->acb;
+
+    /* Now fvd_load_compact_cancel(), if invoked, won't cancel this child
+     * request. */
+    child->hd_acb = NULL;
+
+    if (acb->load.ret == 0) {
+        acb->load.ret = ret;
+    } else {
+        QDEBUG ("LOAD: acb%llu-%p  load_child=%d total_children=%d "
+                "error ret=%d\n", acb->uuid, acb, acb->load.finished_children,
+                acb->load.num_children, ret);
+    }
+
+    acb->load.finished_children++;
+    if (acb->load.finished_children < acb->load.num_children) {
+        QDEBUG ("LOAD: acb%llu-%p  load_finished_children=%d "
+                "total_children=%d\n", acb->uuid, acb,
+                acb->load.finished_children, acb->load.num_children);
+        return;
+    }
+
+    QDEBUG ("LOAD: acb%llu-%p  load_last_child_finished ret=%d\n", acb->uuid,
+            acb, acb->load.ret);
+    acb->common.cb (acb->common.opaque, acb->load.ret);
+    if (acb->load.children) {
+        my_qemu_free (acb->load.children);
+    }
+    my_qemu_aio_release (acb);
+}
+
+static inline FvdAIOCB *init_load_acb (FvdAIOCB * parent_acb,
+                                       BlockDriverState * bs,
+                                       int64_t sector_num,
+                                       QEMUIOVector * orig_qiov,
+                                       int nb_sectors,
+                                       BlockDriverCompletionFunc * cb,
+                                       void *opaque)
+{
+    FvdAIOCB *const acb = my_qemu_aio_get (&fvd_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+    acb->type = OP_LOAD_COMPACT;
+    acb->sector_num = sector_num;
+    acb->nb_sectors = nb_sectors;
+    acb->load.parent_acb = parent_acb;
+    acb->load.finished_children = 0;
+    acb->load.children = NULL;
+    acb->load.one_child.hd_acb = NULL;
+    acb->load.orig_qiov = orig_qiov;
+    acb->load.ret = 0;
+    COPY_UUID (acb, parent_acb);
+    return acb;
+}
+
+static void fvd_wrapper_cancel (FvdAIOCB * acb)
+{
+    qemu_bh_cancel (acb->wrapper.bh);
+    qemu_bh_delete (acb->wrapper.bh);
+    my_qemu_aio_release (acb);
+}
+
+static void fvd_load_compact_cancel (FvdAIOCB * acb)
+{
+    if (acb->load.children) {
+        int i;
+        for (i = 0; i < acb->load.num_children; i++) {
+            if (acb->load.children[i].hd_acb) {
+                bdrv_aio_cancel (acb->load.children[i].hd_acb);
+            }
+        }
+        my_qemu_free (acb->load.children);
+    }
+    if (acb->load.one_child.hd_acb) {
+        bdrv_aio_cancel (acb->load.one_child.hd_acb);
+    }
+    my_qemu_aio_release (acb);
+}
diff --git a/block/fvd-misc.c b/block/fvd-misc.c
new file mode 100644
index 0000000..da184c8
--- /dev/null
+++ b/block/fvd-misc.c
@@ -0,0 +1,616 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this module implements misc functions of the
+ *  BlockDriver interface for the Fast Virtual Disk (FVD) format.
+ *===========================================================================*/
+
+static void fvd_flush_cancel (FvdAIOCB * acb)
+{
+    if (acb->flush.data_acb) {
+        bdrv_aio_cancel (acb->flush.data_acb);
+    }
+    if (acb->flush.metadata_acb) {
+        bdrv_aio_cancel (acb->flush.metadata_acb);
+    }
+    my_qemu_aio_release (acb);
+}
+
+static void fvd_aio_cancel (BlockDriverAIOCB * blockacb)
+{
+    FvdAIOCB *acb = container_of (blockacb, FvdAIOCB, common);
+
+    QDEBUG ("CANCEL: acb%llu-%p\n", acb->uuid, acb);
+
+    switch (acb->type) {
+    case OP_READ:
+        fvd_read_cancel (acb);
+        break;
+
+    case OP_WRITE:
+        fvd_write_cancel (acb);
+        break;
+
+    case OP_COPY:
+        fvd_copy_cancel (acb);
+        break;
+
+    case OP_LOAD_COMPACT:
+        fvd_load_compact_cancel (acb);
+        break;
+
+    case OP_STORE_COMPACT:
+        fvd_store_compact_cancel (acb);
+        break;
+
+    case OP_WRAPPER:
+        fvd_wrapper_cancel (acb);
+        break;
+
+    case OP_FLUSH:
+        fvd_flush_cancel (acb);
+        break;
+    }
+}
+
+static inline void finish_flush (FvdAIOCB * acb)
+{
+    QDEBUG ("FLUSH: acb%llu-%p  finish_flush ret=%d\n",
+            acb->uuid, acb, acb->flush.ret);
+    acb->common.cb (acb->common.opaque, acb->flush.ret);
+    my_qemu_aio_release (acb);
+}
+
+static void finish_flush_data (void *opaque, int ret)
+{
+    FvdAIOCB *acb = opaque;
+
+    QDEBUG ("FLUSH: acb%llu-%p  finish_flush_data ret=%d\n",
+            acb->uuid, acb, ret);
+
+    if (acb->flush.ret == 0) {
+        acb->flush.ret = ret;
+    }
+
+    acb->flush.data_acb = NULL;
+    acb->flush.num_finished++;
+    if (acb->flush.num_finished == 2) {
+        finish_flush (acb);
+    }
+}
+
+static void finish_flush_metadata (void *opaque, int ret)
+{
+    FvdAIOCB *acb = opaque;
+
+    QDEBUG ("FLUSH: acb%llu-%p  finish_flush_metadata ret=%d\n",
+            acb->uuid, acb, ret);
+
+    if (acb->flush.ret == 0) {
+        acb->flush.ret = ret;
+    }
+
+    acb->flush.metadata_acb = NULL;
+    acb->flush.num_finished++;
+    if (acb->flush.num_finished == 2) {
+        finish_flush (acb);
+    }
+}
+
+static BlockDriverAIOCB *fvd_aio_flush (BlockDriverState * bs,
+                                BlockDriverCompletionFunc * cb, void *opaque)
+{
+    BDRVFvdState *s = bs->opaque;
+    if (s->fvd_data == s->fvd_metadata) {
+        return bdrv_aio_flush (s->fvd_metadata, cb, opaque);
+    }
+
+    FvdAIOCB *acb = my_qemu_aio_get (&fvd_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+
+    acb->type = OP_FLUSH;
+    acb->flush.num_finished = 0;
+    acb->flush.ret = 0;
+    acb->flush.data_acb = bdrv_aio_flush (s->fvd_data, finish_flush_data, acb);
+    if (!acb->flush.data_acb) {
+        my_qemu_aio_release (acb);
+        return NULL;
+    }
+
+    acb->flush.metadata_acb = bdrv_aio_flush (s->fvd_metadata,
+                                              finish_flush_metadata, acb);
+    if (!acb->flush.metadata_acb) {
+        bdrv_aio_cancel (acb->flush.data_acb);
+        my_qemu_aio_release (acb);
+        return NULL;
+    }
+
+    QDEBUG ("FLUSH: acb%llu-%p  start\n", acb->uuid, acb);
+    return &acb->common;
+}
+
+static int fvd_flush (BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+    int ret;
+
+    QDEBUG ("fvd_flush() invoked\n");
+
+    if (s->fvd_data) {
+        if ((ret = bdrv_flush (s->fvd_data))) {
+            return ret;
+        }
+    }
+    if (s->fvd_metadata == s->fvd_data) {
+        return 0;
+    }
+
+    return bdrv_flush (s->fvd_metadata);
+}
+
+static void fvd_close (BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+    FvdAIOCB *acb;
+    int i;
+
+    if (s->prefetch_state == PREFETCH_STATE_RUNNING) {
+        s->prefetch_state = PREFETCH_STATE_DISABLED;
+    }
+    if (s->prefetch_timer) {
+        qemu_del_timer (s->prefetch_timer);
+        qemu_free_timer (s->prefetch_timer);
+        s->prefetch_timer = NULL;
+    }
+
+    /* Clean up prefetch operations. */
+    if (s->prefetch_acb) {
+        for (i = 0; i < s->num_prefetch_slots; i++) {
+            if (s->prefetch_acb[i] != NULL) {
+                acb = s->prefetch_acb[i];
+                if (acb->copy.hd_acb) {
+                    bdrv_aio_cancel (acb->copy.hd_acb);
+                }
+                my_qemu_vfree (s->prefetch_acb[i]->copy.buf);
+                my_qemu_aio_release (s->prefetch_acb[i]);
+                s->prefetch_acb[i] = NULL;
+            }
+        }
+        my_qemu_free (s->prefetch_acb);
+        s->prefetch_acb = NULL;
+    }
+
+    flush_metadata_to_disk_on_exit (bs);
+
+    if (s->stale_bitmap) {
+        my_qemu_vfree (s->stale_bitmap);
+        if (s->fresh_bitmap != s->stale_bitmap) {
+            my_qemu_vfree (s->fresh_bitmap);
+        }
+        s->stale_bitmap = NULL;
+        s->fresh_bitmap = NULL;
+    }
+
+    if (s->table) {
+        my_qemu_vfree (s->table);
+        s->table = NULL;
+    }
+
+    if (s->fvd_metadata) {
+        if (s->fvd_metadata != s->fvd_data) {
+            bdrv_delete (s->fvd_metadata);
+        }
+        s->fvd_metadata = NULL;
+    }
+    if (s->fvd_data) {
+        bdrv_delete (s->fvd_data);
+        s->fvd_data = NULL;
+    }
+
+    if (s->add_storage_cmd) {
+        my_qemu_free (s->add_storage_cmd);
+        s->add_storage_cmd = NULL;
+    }
+#ifdef FVD_DEBUG
+    dump_resource_summary (s);
+#endif
+}
+
+static int fvd_probe (const uint8_t * buf, int buf_size, const char *filename)
+{
+    const FvdHeader *header = (const void *) buf;
+
+    if (buf_size >= 2 * sizeof (uint32_t)
+        && le32_to_cpu (header->magic) == FVD_MAGIC
+        && le32_to_cpu (header->version) == FVD_VERSION) {
+        return 100;
+    } else {
+        return 0;
+    }
+}
+
+static int fvd_is_allocated (BlockDriverState * bs, int64_t sector_num,
+                             int nb_sectors, int *pnum)
+{
+    BDRVFvdState *s = bs->opaque;
+
+    if (s->prefetch_state == PREFETCH_STATE_FINISHED
+        || sector_num >= s->nb_sectors_in_base_img
+        || !fresh_bitmap_show_sector_in_base_img (sector_num, s)) {
+        /* In these three cases the data may live in the FVD data file, but we
+         * still need to check the underlying storage because the data could
+         * be holes in a sparse image, due to the "free write to zero-filled
+         * blocks" optimization. See Section 3.3.3 of the FVD-cow paper.
+         * This also covers the case of no base image. */
+
+        if (!s->table) {
+            return bdrv_is_allocated (s->fvd_data, s->data_offset + sector_num,
+                                      nb_sectors, pnum);
+        }
+
+        /* Use the table to figure it out. */
+        int64_t first_chunk = sector_num / s->chunk_size;
+        int64_t last_chunk = (sector_num + nb_sectors - 1) / s->chunk_size;
+        int allocated = !IS_EMPTY (s->table[first_chunk]);
+        int count;
+
+        if (first_chunk == last_chunk) {
+            /* All data in one chunk. */
+            *pnum = nb_sectors;
+            return allocated;
+        }
+
+        /* Data in the first chunk. */
+        count = s->chunk_size - (sector_num % s->chunk_size);
+
+        /* Full chunks. */
+        first_chunk++;
+        while (first_chunk < last_chunk) {
+            if ((allocated && IS_EMPTY (s->table[first_chunk]))
+                || (!allocated && !IS_EMPTY (s->table[first_chunk]))) {
+                *pnum = count;
+                return allocated;
+            }
+
+            count += s->chunk_size;
+            first_chunk++;
+        }
+
+        /* Data in the last chunk. */
+        if ((allocated && !IS_EMPTY (s->table[last_chunk]))
+            || (!allocated && IS_EMPTY (s->table[last_chunk]))) {
+            int nb = (sector_num + nb_sectors) % s->chunk_size;
+            count += nb ? nb : s->chunk_size;
+        }
+
+        *pnum = count;
+        return allocated;
+    }
+
+    /* Use the FVD metadata to find out which sectors are in the base image. */
+    int64_t end = sector_num + nb_sectors;
+    if (end > s->nb_sectors_in_base_img) {
+        end = s->nb_sectors_in_base_img;
+    }
+
+    int64_t next = sector_num + 1;
+    while (next < end && fresh_bitmap_show_sector_in_base_img (next, s)) {
+        next++;
+    }
+
+    *pnum = next - sector_num;
+    return FALSE;
+}
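+
+/* Worked example for the table-based branch of fvd_is_allocated() above
+ * (illustrative numbers only): with chunk_size = 2048 sectors, a 4000-sector
+ * query starting at sector 1000 spans chunks 0..2.  'allocated' is taken
+ * from table[0], the first chunk contributes 2048 - 1000 = 1048 sectors, and
+ * counting stops at the first chunk whose allocation state differs from
+ * table[0].  *pnum therefore reports the length of the longest run of
+ * sectors, starting at sector_num, that share a single allocation state. */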
+
+static void update_usage (void)
+{
+    printf ("Usage: update <image_file> [attribute=val]\n       See outputs of"
+            "the 'info' command for all available attributes.\n");
+}
+
+static int fvd_get_info (BlockDriverState * bs, BlockDriverInfo * bdi)
+{
+    BDRVFvdState *s = bs->opaque;
+    FvdHeader header;
+
+    if (read_fvd_header (s, &header) < 0) {
+        return -1;
+    }
+
+    printf ("========= Begin of FVD specific information ==================\n");
+    printf ("magic\t\t\t\t\t\t%0X\n", header.magic);
+    printf ("version\t\t\t\t\t\t%d\n", header.version);
+    printf ("virtual_disk_size (bytes)\t\t\t%" PRId64 "\n",
+            header.virtual_disk_size);
+    printf ("disk_metadata_size (bytes)\t\t\t%" PRId64 "\n",
+            header.metadata_size);
+    if (header.data_file[0]) {
+        printf ("data_file\t\t\t\t\t%s\n", header.data_file);
+    }
+    if (header.data_file_fmt[0]) {
+        printf ("data_file_fmt\t\t\t\t%s\n", header.data_file_fmt);
+    }
+
+    if (header.base_img[0] != 0) {
+        printf ("base_img\t\t\t\t\t%s\n", header.base_img);
+        printf ("all_data_in_fvd_img\t\t\t\t%s\n",
+                BOOL (header.all_data_in_fvd_img));
+        printf ("base_img_size (bytes)\t\t\t\t%" PRId64 "\n",
+                header.base_img_size);
+        printf ("bitmap_offset (bytes)\t\t\t\t%" PRId64 "\n",
+                header.bitmap_offset);
+        printf ("bitmap_size (bytes)\t\t\t\t%" PRId64 "\n", header.bitmap_size);
+        printf ("prefetch_profile_offset (bytes)\t\t\t%" PRId64 "\n",
+                header.prefetch_profile_offset);
+        printf ("prefetch_profile_entries\t\t\t%" PRId64 "\n",
+                header.prefetch_profile_entries);
+        printf ("prefetch_profile_entry_len_unit\t\t\t%d\n",
+                header.unit_of_PrefetchProfileEntry_len);
+        printf ("block_size\t\t\t\t\t%d\n", header.block_size);
+        printf ("copy_on_read\t\t\t\t\t%s\n", BOOL (header.copy_on_read));
+        printf ("max_outstanding_copy_on_read_data (bytes)\t%" PRId64 "\n",
+                header.max_outstanding_copy_on_read_data);
+        printf ("prefetch_start_delay (sec)\t\t\t%d\n",
+                header.prefetch_start_delay);
+        printf ("profile_directed_prefetch_start_delay (sec)\t%d\n",
+                header.profile_directed_prefetch_start_delay);
+        printf ("max_num_outstanding_prefetch_writes\t\t%d\n",
+                header.num_prefetch_slots);
+        printf ("bytes_per_prefetch\t\t\t\t%d\n", header.bytes_per_prefetch);
+        printf ("prefetch_over_threshold_throttle_time (ms)\t%d\n",
+                header.prefetch_throttle_time);
+        printf ("prefetch_read_throughput_measure_time (ms)\t%d\n",
+                header.prefetch_read_throughput_measure_time);
+        printf ("prefetch_write_throughput_measure_time (ms)\t%d\n",
+                header.prefetch_write_throughput_measure_time);
+        printf ("prefetch_min_read_throughput_threshold (KB/s)\t%d\n",
+                header.prefetch_min_read_throughput);
+        printf ("prefetch_min_write_throughput_threshold (KB/s)\t%d\n",
+                header.prefetch_min_write_throughput);
+        printf ("prefetch_max_read_throughput_threshold (KB/s)\t%d\n",
+                header.prefetch_max_read_throughput);
+        printf ("prefetch_max_write_throughput_threshold (KB/s)\t%d\n",
+                header.prefetch_max_write_throughput);
+        printf ("prefetch_perf_calc_alpha\t\t\t%d\n",
+                header.prefetch_perf_calc_alpha);
+        printf ("generate_prefetch_profile\t\t\t%s\n",
+                BOOL (header.generate_prefetch_profile));
+    }
+
+    printf ("need_zero_init\t\t\t\t\t%s\n", BOOL (header.need_zero_init));
+    printf ("compact_image\t\t\t\t\t%s\n", BOOL (header.compact_image));
+    if (header.compact_image) {
+        printf ("data_storage (bytes)\t\t\t\t%" PRId64 "\n",
+                s->data_storage * 512);
+        printf ("chunk_size (bytes)\t\t\t\t%" PRId64 "\n", header.chunk_size);
+        printf ("used_chunks (bytes)\t\t\t\t%" PRId64 "\n",
+                s->used_storage * 512);
+        printf ("storage_grow_unit (bytes)\t\t\t%" PRId64 "\n",
+                header.storage_grow_unit);
+        printf ("table_offset (bytes)\t\t\t\t%" PRId64 "\n",
+                header.table_offset);
+        int64_t vsize = ROUND_UP (s->virtual_disk_size, s->chunk_size * 512);
+        int table_entries = vsize / (s->chunk_size * 512);
+        int64_t table_size = sizeof (uint32_t) * table_entries;
+        table_size = ROUND_UP (table_size, DEF_PAGE_SIZE);
+        printf ("table_size (bytes)\t\t\t\t%" PRId64 "\n", table_size);
+
+        if (header.add_storage_cmd[0] != 0) {
+            printf ("add_storage_cmd\t\t\t\t\t%s\n", header.add_storage_cmd);
+        }
+    }
+    printf ("clean_shutdown\t\t\t\t\t%s\n", BOOL (header.clean_shutdown));
+    if (header.journal_size > 0) {
+        printf ("journal_offset\t\t\t\t\t%" PRId64 "\n", header.journal_offset);
+        printf ("journal_size\t\t\t\t\t%" PRId64 "\n", header.journal_size);
+    }
+    printf ("========= End of FVD specific information ====================\n");
+
+    bdi->cluster_size = 0;
+    bdi->vm_state_offset = 0;
+    return 0;
+}
+
+static int fvd_has_zero_init (BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+    return bdrv_has_zero_init (s->fvd_data);
+}
+
+static int fvd_update (BlockDriverState * bs, int argc, char **argv)
+{
+    BDRVFvdState *s = bs->opaque;
+    FvdHeader header;
+    int i;
+
+    if (argc <= 0) {
+        update_usage ();
+        return -1;
+    }
+
+    if (strcmp (argv[0], "-h") == 0 || strcmp (argv[0], "--help") == 0
+        || strcmp (argv[0], "-o") == 0) {
+        update_usage ();
+        return 0;
+    }
+
+    read_fvd_header (s, &header);
+
+    for (i = 0; i < argc; i++) {
+        char *attr = argv[i];
+        char *val = strchr (attr, '=');
+        if (val == NULL) {
+            fprintf (stderr, "Error: string '%s' is not in the format of "
+                     "'attribute=val' without spaces.\n", attr);
+            return -1;
+        }
+        *val = 0;
+        val++;
+
+        if (strcmp (attr, "size") == 0) {
+            int64_t new_size;
+            new_size = atoll (val);
+            int len = strlen (val);
+            if (val[len - 1] == 'G') {
+                new_size *= ((int64_t) 1024) * 1024 * 1024;
+            } else if (val[len - 1] == 'M') {
+                new_size *= ((int64_t) 1024) * 1024;
+            } else if (val[len - 1] == 'K') {
+                new_size *= ((int64_t) 1024);
+            } else if (val[len - 1] == 'B') {
+                /* No change to new_size as it is already in bytes. */
+            } else {
+                /* If no unit is specified, the default unit is KB. */
+                new_size *= ((int64_t) 1024);
+            }
+
+            if (new_size <= 0) {
+                fprintf (stderr, "Error: size %s is not positive.\n", val);
+                return -1;
+            }
+
+            new_size = ROUND_UP (new_size, 512);
+            if (new_size < header.virtual_disk_size) {
+                printf ("Warning: image's new size %" PRId64
+                        " is smaller than the original size %" PRId64
+                        ". Some image data will be truncated.\n",
+                        new_size, header.virtual_disk_size);
+            }
+            header.virtual_disk_size = new_size;
+            printf ("Image resized to %" PRId64 " bytes.\n", new_size);
+        } else if (strcmp (attr, "base_img") == 0) {
+            if (strlen (val) > 1023) {
+                fprintf (stderr, "Error: the new base image name is longer "
+                         "than 1023, which is not allowed.\n");
+                return -1;
+            }
+
+            memset (header.base_img, 0, 1024);
+            pstrcpy (header.base_img, 1024, val);
+            printf ("Backing file updated to '%s'.\n", val);
+        } else if (strcmp (attr, "data_file") == 0) {
+            if (strlen (val) > 1023) {
+                fprintf (stderr, "Error: the new data file name is longer "
+                         "than 1023, which is not allowed.\n");
+                return -1;
+            }
+
+            memset (header.data_file, 0, 1024);
+            pstrcpy (header.data_file, 1024, val);
+            printf ("Data file updated to '%s'.\n", val);
+        } else if (strcmp (attr, "need_zero_init") == 0) {
+            if (strcasecmp (val, "true") == 0 || strcasecmp (val, "on") == 0) {
+                header.need_zero_init = TRUE;
+                printf ("need_zero_init is turned on for this disk.\n");
+            } else {
+                header.need_zero_init = FALSE;
+                printf ("need_zero_init is turned off for this disk.\n");
+            }
+        } else if (strcmp (attr, "copy_on_read") == 0) {
+            if (strcasecmp (val, "true") == 0 || strcasecmp (val, "on") == 0) {
+                header.copy_on_read = TRUE;
+                printf ("Copy on read is enabled for this disk.\n");
+            } else {
+                header.copy_on_read = FALSE;
+                printf ("Copy on read is disabled for this disk.\n");
+            }
+        } else if (strcmp (attr, "clean_shutdown") == 0) {
+            if (strcasecmp (val, "true") == 0 || strcasecmp (val, "on") == 0) {
+                header.clean_shutdown = TRUE;
+                printf ("clean_shutdown is manually set to true\n");
+            } else {
+                header.clean_shutdown = FALSE;
+                printf ("clean_shutdown is manually set to false\n");
+            }
+        } else if (strcmp (attr, "max_outstanding_copy_on_read_data") == 0) {
+            header.max_outstanding_copy_on_read_data = atoll (val);
+            if (header.max_outstanding_copy_on_read_data <= 0) {
+                fprintf (stderr, "Error: max_outstanding_copy_on_read_data "
+                         "must be positive while the provided value is %"
+                         PRId64 ".\n",
+                         header.max_outstanding_copy_on_read_data);
+                return -1;
+            }
+            printf ("max_outstanding_copy_on_read_data updated to %" PRId64
+                    ".\n", header.max_outstanding_copy_on_read_data);
+        } else if (strcmp (attr, "prefetch_start_delay") == 0) {
+            header.prefetch_start_delay = atoi (val);
+            if (header.prefetch_start_delay >= 0) {
+                printf ("Prefetch starting delay updated to %d seconds.\n",
+                        header.prefetch_start_delay);
+            }
+            else {
+                printf ("Prefetch starting delay updated to %d seconds. "
+                        "Because of the negative value, prefetching is "
+                        "disabled for this image.\n",
+                        header.prefetch_start_delay);
+            }
+        } else if (strcmp (attr, "max_num_outstanding_prefetch_writes") == 0) {
+            header.num_prefetch_slots = atoi (val);
+            if (header.num_prefetch_slots < 1) {
+                fprintf (stderr, "Error: max_num_outstanding_prefetch_writes "
+                         "%d is not a positive integer.\n",
+                         header.num_prefetch_slots);
+                return -1;
+            }
+            printf ("max_num_outstanding_prefetch_writes updated to %d.\n",
+                    header.num_prefetch_slots);
+        } else if (strcmp (attr, "bytes_per_prefetch") == 0) {
+            header.bytes_per_prefetch = atoi (val);
+            if (header.bytes_per_prefetch < DEF_PAGE_SIZE) {
+                fprintf (stderr, "Error: bytes_per_prefetch cannot be smaller "
+                         "than %d.\n", DEF_PAGE_SIZE);
+                return -1;
+            }
+            printf ("bytes_per_prefetch updated to %d.\n",
+                    header.bytes_per_prefetch);
+        } else if (strcmp (attr, "prefetch_min_read_throughput_threshold")==0) {
+            header.prefetch_min_read_throughput = atoi (val);
+            printf ("prefetch_min_read_throughput_threshold updated to %d "
+                    "KB/s\n", header.prefetch_min_read_throughput);
+        } else if (strcmp (attr,"prefetch_min_write_throughput_threshold")==0) {
+            header.prefetch_min_write_throughput = atoi (val);
+            printf ("prefetch_min_write_throughput_threshold updated to %d "
+                    "KB/s\n", header.prefetch_min_write_throughput);
+        } else if (strcmp (attr, "prefetch_perf_calc_alpha") == 0) {
+            header.prefetch_perf_calc_alpha = atoi (val);
+            printf ("prefetch_perf_calc_alpha updated to %d\n",
+                    header.prefetch_perf_calc_alpha);
+        } else if (strcmp (attr, "prefetch_read_throughput_measure_time")==0) {
+            header.prefetch_read_throughput_measure_time = atoi (val);
+            printf ("prefetch_read_throughput_measure_time updated to %d ms\n",
+                    header.prefetch_read_throughput_measure_time);
+        } else if (strcmp (attr, "prefetch_write_throughput_measure_time")==0) {
+            header.prefetch_write_throughput_measure_time = atoi (val);
+            printf ("prefetch_write_throughput_measure_time updated to %d ms\n",
+                    header.prefetch_write_throughput_measure_time);
+        } else if (strcmp (attr, "prefetch_over_threshold_throttle_time")==0) {
+            header.prefetch_throttle_time = atoi (val);
+            if (header.prefetch_throttle_time > 0) {
+                printf ("prefetch_over_threshold_throttle_time updated to %d "
+                        "milliseconds.\n", header.prefetch_throttle_time);
+            } else {
+                printf ("prefetch_over_threshold_throttle_time updated to %d "
+                        "milliseconds. It is not positive and hence no "
+                        "throttling will be applied to prefetch.\n",
+                        header.prefetch_throttle_time);
+            }
+        } else {
+            fprintf (stderr, "Error: unknown setting '%s=%s'\n", attr, val);
+            return -1;
+        }
+    }
+
+    update_fvd_header (s, &header);
+    return 0;
+}
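+
+/* Hypothetical usage sketch for fvd_update() above, assuming the 'update'
+ * subcommand added elsewhere in this series passes its trailing
+ * attribute=val arguments straight through to this function:
+ *
+ *     qemu-img update disk.fvd size=20G copy_on_read=on \
+ *                     prefetch_start_delay=30
+ *
+ * 'size' accepts the suffixes G, M, K and B; a bare number defaults to KB,
+ * and the result is rounded up to a multiple of 512 bytes. */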
diff --git a/block/fvd-open.c b/block/fvd-open.c
new file mode 100644
index 0000000..9ca8e2e
--- /dev/null
+++ b/block/fvd-open.c
@@ -0,0 +1,446 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this module implements bdrv_file_open() for FVD.
+ *============================================================================*/
+
+static void init_prefetch_timer (BlockDriverState * bs, BDRVFvdState * s);
+static int init_data_file (BDRVFvdState * s, FvdHeader * header, int flags);
+static int init_bitmap (BlockDriverState * bs, BDRVFvdState * s,
+                        FvdHeader * header, const char *const filename);
+static int load_table (BDRVFvdState * s, FvdHeader * header,
+                       const char *const filename);
+static int init_journal (int read_only, BlockDriverState * bs,
+                         FvdHeader * header);
+static int init_compact_image (BDRVFvdState * s, FvdHeader * header,
+                               const char *const filename);
+
+static int fvd_open (BlockDriverState * bs, const char *filename, int flags)
+{
+    BDRVFvdState *s = bs->opaque;
+    int ret;
+    FvdHeader header;
+    BlockDriver *drv;
+
+    /* A trick to detect whether this runs in a qemu tool such as qemu-nbd. */
+    const int in_qemu_tool = (rt_clock == NULL);
+
+    const char * protocol = strchr (filename, ':');
+    if (protocol) {
+        drv = bdrv_find_protocol (filename);
+        filename = protocol + 1;
+    }
+    else {
+        /* Use "raw" instead of "file" to allow storing the image on device. */
+        drv = bdrv_find_format ("raw");
+        if (!drv) {
+            fprintf (stderr, "Failed to find the block device driver\n");
+            return -EINVAL;
+        }
+    }
+
+    s->fvd_metadata = bdrv_new ("");
+    ret = bdrv_open (s->fvd_metadata, filename, flags, drv);
+    if (ret < 0) {
+        fprintf (stderr, "Failed to open %s\n", filename);
+        return ret;
+    }
+
+    /* Initialize so that jumping to 'fail' would do cleanup properly. */
+    s->stale_bitmap = NULL;
+    s->fresh_bitmap = NULL;
+    s->table = NULL;
+    s->outstanding_copy_on_read_data = 0;
+    QLIST_INIT (&s->write_locks);
+    QLIST_INIT (&s->copy_locks);
+    QLIST_INIT (&s->wait_for_journal);
+    s->ongoing_journal_updates = 0;
+    s->prefetch_acb = NULL;
+    s->add_storage_cmd = NULL;
+#ifdef FVD_DEBUG
+    s->total_copy_on_read_data = s->total_prefetch_data = 0;
+#endif
+
+    if (bdrv_pread (s->fvd_metadata, 0, &header, sizeof (header)) !=
+        sizeof (header)) {
+        fprintf (stderr, "Failed to read the header of %s\n", filename);
+        goto fail;
+    }
+
+    fvd_header_le_to_cpu (&header);
+
+    if (header.magic != FVD_MAGIC || header.version != FVD_VERSION) {
+        fprintf (stderr, "Incorrect magic number in the header of %s: "
+                 "magic=%0X version=%d expect_magic=%0X expect_version=%d\n",
+                 filename, header.magic, header.version, FVD_MAGIC,
+                 FVD_VERSION);
+        goto fail;
+    }
+    if (header.virtual_disk_size % 512 != 0) {
+        fprintf (stderr, "Disk size %"PRId64" in the header of %s is not "
+                 "a multple of 512.\n", header.virtual_disk_size, filename);
+        goto fail;
+    }
+
+    /* Initialize the fields of BDRVFvdState. */
+    s->dirty_image = FALSE;
+    s->block_size = header.block_size / 512;
+    s->bitmap_size = header.bitmap_size;
+    s->prefetch_error = FALSE;
+    s->prefetch_timer = NULL;
+    s->sectors_per_prefetch = (header.bytes_per_prefetch + 511) / 512;
+    s->prefetch_throttle_time = header.prefetch_throttle_time;
+    s->prefetch_perf_calc_alpha = header.prefetch_perf_calc_alpha / 100.0;
+    s->prefetch_read_throughput_measure_time =
+                        header.prefetch_read_throughput_measure_time;
+    s->prefetch_write_throughput_measure_time =
+                        header.prefetch_write_throughput_measure_time;
+
+    /* Convert KB/s to bytes/millisec. */
+    s->prefetch_min_read_throughput =
+            ((double) header.prefetch_min_read_throughput) * 1024.0 / 1000.0;
+    s->prefetch_min_write_throughput =
+            ((double) header.prefetch_min_write_throughput) * 1024.0 / 1000.0;
+
+    if (header.base_img[0] != 0 && s->sectors_per_prefetch%s->block_size != 0) {
+        fprintf (stderr, "sectors_per_prefetch (%d) is not a multiple of "
+                 "block_size (%d)\n",
+                 s->sectors_per_prefetch * 512, s->block_size * 512);
+    }
+    s->max_outstanding_copy_on_read_data =
+        header.max_outstanding_copy_on_read_data;
+    if (s->max_outstanding_copy_on_read_data < header.block_size * 2) {
+        s->max_outstanding_copy_on_read_data = header.block_size;
+    }
+
+    if (header.num_prefetch_slots < 1) {
+        s->num_prefetch_slots = 1;
+    } else {
+        s->num_prefetch_slots = header.num_prefetch_slots;
+    }
+    if (in_qemu_tool) {
+        /* No prefetching in a qemu tool. */
+        s->prefetch_start_delay = -1;
+
+#ifndef SIMULATED_TEST_WITH_QEMU_IO
+        s->copy_on_read = FALSE;        /* No prefetching in a qemu tool. */
+#else
+        /* But allow debugging copy_on_read in qemu-io if configured. */
+        s->copy_on_read = header.copy_on_read;
+#endif
+    } else {
+        s->prefetch_start_delay = header.prefetch_start_delay;
+        s->copy_on_read = header.copy_on_read;
+    }
+    s->virtual_disk_size = header.virtual_disk_size;
+    s->bitmap_offset = header.bitmap_offset / 512;
+    s->nb_sectors_in_base_img = header.base_img_size / 512;
+    bs->total_sectors = s->virtual_disk_size / 512;
+
+    if (init_data_file (s, &header, flags)) {
+        goto fail;
+    }
+
+    if (init_bitmap (bs, s, &header, filename)) {
+        goto fail;
+    }
+
+    if (load_table (s, &header, filename)) {
+        goto fail;
+    }
+
+    const int read_only = !(flags & BDRV_O_RDWR);
+    if (init_journal (read_only, bs, &header)) {
+        goto fail;
+    }
+
+    /* This must be done after init_journal() because it may use metadata
+     * recovered from the journal. */
+    if (init_compact_image (s, &header, filename)) {
+        goto fail;
+    }
+
+    if (!read_only) {
+        /* This flag will be cleaned later when the image is shut down
+         * gracefully. */
+        update_clean_shutdown_flag (s, FALSE);
+    }
+    init_prefetch_timer (bs, s);
+
+    QDEBUG ("copy_on_read=%s block_size=%d journal_size=%" PRId64
+            " prefetching_delay=%d prefetch_slots=%d "
+            "prefetch_read_threshold_KB=%.0lf "
+            "prefetch_write_threshold_KB=%.0lf "
+            "prefetch_throttle_time=%d bytes_per_prefetch=%d "
+            "max_outstanding_copy_on_read_data=%"PRId64"\n",
+            BOOL (s->copy_on_read), s->block_size * 512,
+            s->journal_size * 512, s->prefetch_start_delay,
+            s->num_prefetch_slots,
+            s->prefetch_min_read_throughput * 1000.0 / 1024.0,
+            s->prefetch_min_write_throughput * 1000.0 / 1024.0,
+            s->prefetch_throttle_time, s->sectors_per_prefetch * 512,
+            s->max_outstanding_copy_on_read_data);
+
+    return 0;
+
+  fail:
+    fprintf (stderr, "Failed to open %s using the FVD format.\n", filename);
+    fvd_close (bs);
+    return -1;
+}
+
+static int load_table (BDRVFvdState * s, FvdHeader * header,
+                       const char *const filename)
+{
+    if (!header->compact_image) {
+        return 0;
+    }
+
+    /* Initialize the table. */
+    s->table_offset = header->table_offset / 512;
+    s->chunk_size = header->chunk_size / 512;
+    int64_t vsize = header->virtual_disk_size + header->chunk_size - 1;
+    int table_entries = vsize / header->chunk_size;
+    int64_t table_size = sizeof (uint32_t) * table_entries;
+    table_size = ROUND_UP (table_size, DEF_PAGE_SIZE);
+    s->table = my_qemu_blockalign (s->fvd_metadata, (size_t) table_size);
+
+    if (bdrv_pread (s->fvd_metadata, header->table_offset, s->table, table_size)
+        != table_size) {
+        fprintf (stderr, "Failed to read the table of %s\n", filename);
+        return -1;
+    }
+
+    return 0;
+}
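+
+/* Sizing sketch for the table loaded by load_table() above, assuming purely
+ * for illustration that DEF_PAGE_SIZE is 4096: a 10 GB virtual disk with a
+ * 1 MB chunk_size (1024-based units) needs 10240 entries of 4 bytes each,
+ * i.e. 40960 bytes, which ROUND_UP() leaves unchanged because it is already
+ * page-aligned. */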
+
+static int init_compact_image (BDRVFvdState * s, FvdHeader * header,
+                               const char *const filename)
+{
+    if (!header->compact_image) {
+        s->data_region_prepared = FALSE;
+        return 0;
+    }
+
+    /* Scan the table to find the max allocated chunk. */
+    int i;
+    uint32_t max_chunk = 0;
+    int empty_disk = TRUE;
+    int table_entries =
+        (int) (ROUND_UP (header->virtual_disk_size, header->chunk_size) /
+               header->chunk_size);
+    for (i = 0; i < table_entries; i++) {
+        if (!IS_EMPTY (s->table[i])) {
+            empty_disk = FALSE;
+            uint32_t id = READ_TABLE (s->table[i]);
+            if (id > max_chunk) {
+                max_chunk = id;
+            }
+        }
+    }
+    if (!empty_disk) {
+        max_chunk++;
+    }
+    s->used_storage = max_chunk * s->chunk_size;
+    s->storage_grow_unit = header->storage_grow_unit / 512;
+
+    /* Check if the image is directly stored on a raw device, including
+     * logical volume. If so, figure out the size of the device. */
+    struct stat stat_buf;
+    if (stat (filename, &stat_buf) != 0) {
+        fprintf (stderr, "Failed to stat() %s\n", filename);
+        return -1;
+    }
+
+    /* Check how much storage space is already allocated. */
+    int64_t size = bdrv_getlength (s->fvd_data);
+    if (size < 0) {
+        fprintf (stderr, "Failed in bdrv_getlength(%s)\n", filename);
+        return -1;
+    }
+    const int64_t min_size = (s->data_offset + s->used_storage) * 512;
+    if (size < min_size) {
+        fprintf (stderr, "The size of device %s is not even big enough to "
+                 "store already allocated data.\n",
+                 filename);
+        return -1;
+    }
+
+    if (S_ISBLK (stat_buf.st_mode) || S_ISCHR (stat_buf.st_mode)) {
+        /* Initialize the command to grow storage space. */
+        char cmd[2048];
+        if (header->add_storage_cmd[0] == 0) {
+            s->add_storage_cmd = NULL;
+        } else {
+            if (strcmp (header->add_storage_cmd, "builtin:lvextend") == 0) {
+                /* Note the following:
+                 *     1. lvextend may generate warning messages like "File
+                 *     descriptor...leaked...", which is fine. See the
+                 *     following from the LVM manual: "On invocation, lvm
+                 *     requires that only the standard file descriptors stdin,
+                 *     stdout and stderr are available. If others are found,
+                 *     they get closed and messages are issued warning about
+                 *     the leak."
+                 *     2. Instead of using the lvextend command line, one
+                 *     option is to use liblvm directly, which avoids creating
+                 *     a process to resize an LV.
+                 *     3. On Ubuntu, /bin/sh is linked to /bin/dash, which
+                 *     does not support ">&" for stdout and stderr
+                 *     redirection. */
+                snprintf (cmd, sizeof (cmd) - 1, "/sbin/lvextend -L+%" PRId64
+                          "B %s >/dev/null 2>/dev/null",
+                          header->storage_grow_unit,
+                          header->data_file[0] ? header->data_file : filename);
+            } else {
+                snprintf (cmd, sizeof (cmd) - 1, "%s %" PRId64
+                          " %s >/dev/null 2>/dev/null",
+                          header->add_storage_cmd, header->storage_grow_unit,
+                          header->data_file[0] ? header->data_file : filename);
+            }
+
+            int len = strlen (cmd);
+            s->add_storage_cmd = my_qemu_malloc (len + 1);
+            memcpy (s->add_storage_cmd, cmd, len + 1);
+        }
+    }
+
+    s->data_storage = size / 512 - s->data_offset;
+    s->fvd_data->growable = TRUE;
+    s->data_region_prepared = TRUE;
+
+    return 0;
+}
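+
+/* Example of the storage-grow command constructed by init_compact_image()
+ * above: with add_storage_cmd set to "builtin:lvextend", a storage_grow_unit
+ * of 1073741824 bytes, and data stored directly on /dev/vg0/lv0 (illustrative
+ * values), the generated command is
+ *
+ *     /sbin/lvextend -L+1073741824B /dev/vg0/lv0 >/dev/null 2>/dev/null
+ */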
+
+static int init_data_file (BDRVFvdState * s, FvdHeader * header, int flags)
+{
+    int ret;
+
+    if (header->data_file[0]) {
+        /* Open a separate data file. */
+        s->data_offset = 0;
+        s->fvd_data = bdrv_new ("");
+        if (!s->fvd_data) {
+            fprintf (stderr, "Failed to create a new block device driver.\n");
+            return -1;
+        }
+
+        if (header->data_file_fmt[0] == 0) {
+            ret = bdrv_open (s->fvd_data, header->data_file, flags, NULL);
+        } else {
+            BlockDriver *data_drv = bdrv_find_format (header->data_file_fmt);
+            if (!data_drv) {
+                fprintf (stderr, "Failed to find driver for image format "
+                         "'%s' of data file %s\n",
+                         header->data_file_fmt, header->data_file);
+                return -1;
+            }
+            ret = bdrv_open (s->fvd_data, header->data_file, flags, data_drv);
+        }
+        if (ret != 0) {
+            fprintf (stderr, "Failed to open data file %s\n",
+                     header->data_file);
+            return -1;
+        }
+    } else {
+        s->data_offset = header->metadata_size / 512;        /* In sectors. */
+        s->fvd_data = s->fvd_metadata;
+    }
+
+    if (header->need_zero_init && !bdrv_has_zero_init (s->fvd_data)) {
+        /* A trick to figure out whether it runs a qemu tool such as qemu-nbd.*/
+        const int in_qemu_tool = (rt_clock == NULL);
+        if (in_qemu_tool) {
+            /* Only give a warning, so that 'qemu-img update' can still modify
+             * need_zero_init once the user has manually zero-initialized
+             * the device. */
+            fprintf (stderr, "Warning: image needs zero_init but it is not "
+                     "supported by the storage media.\n");
+        } else {
+            fprintf (stderr, "Error: image needs zero_init but it is not "
+                     "supported by the storage media.\n");
+            return -EINVAL;
+        }
+    }
+
+    return 0;
+}
+
+static int init_bitmap (BlockDriverState * bs, BDRVFvdState * s,
+                        FvdHeader * header, const char *const filename)
+{
+    if (header->all_data_in_fvd_img) {
+        /* This also covers the case of no base image. */
+        s->prefetch_state = PREFETCH_STATE_FINISHED;
+        s->copy_on_read = FALSE;
+        s->prefetch_start_delay = -1;
+
+        if (bs->backing_file[0] != 0) {
+            /* No need to use the base image. The image can operate without
+             * problems even if the base image is no longer accessible. */
+            bs->backing_file[0] = 0;
+        }
+    } else {
+        ASSERT (header->base_img[0] != 0);
+        pstrcpy (bs->backing_file, 1024, header->base_img);
+        const int flags = O_RDONLY | O_BINARY | O_LARGEFILE;
+        int test_backing_fd = open (bs->backing_file, flags);
+        if (test_backing_fd < 0) {
+            fprintf (stderr, "Failed to open the base image %s for read.\n",
+                     bs->backing_file);
+            return -1;
+        }
+        close (test_backing_fd);
+
+        /* This will be enabled in init_prefetch() after a timer expires. */
+        s->prefetch_state = PREFETCH_STATE_DISABLED;
+
+        s->stale_bitmap = my_qemu_blockalign (s->fvd_metadata,
+                                              (size_t) s->bitmap_size);
+        if (bdrv_pread (s->fvd_metadata, header->bitmap_offset,
+                        s->stale_bitmap, s->bitmap_size) != s->bitmap_size) {
+            fprintf (stderr, "Failed to the bitmap of %s.\n", filename);
+            return -1;
+        }
+
+        if (s->copy_on_read || (s->prefetch_state != PREFETCH_STATE_FINISHED &&
+                                s->prefetch_start_delay > 0)) {
+            /* Use two bitmaps only if copy_on_read or prefetching is enabled.
+             * See Section 3.3.4 of the FVD-cow paper. */
+            s->fresh_bitmap = my_qemu_blockalign (s->fvd_metadata,
+                                                  s->bitmap_size);
+            memcpy (s->fresh_bitmap, s->stale_bitmap, s->bitmap_size);
+        } else {
+            s->fresh_bitmap = s->stale_bitmap;
+        }
+    }
+
+    return 0;
+}
+
+static void init_prefetch_timer (BlockDriverState * bs, BDRVFvdState * s)
+{
+#ifndef SIMULATED_TEST_WITH_QEMU_IO
+    /* A trick to detect whether this is running in a qemu tool. */
+    const int in_qemu_tool = (rt_clock == NULL);
+    if (in_qemu_tool) {
+        return;
+    }
+#endif
+
+    if (s->prefetch_state == PREFETCH_STATE_FINISHED ||
+        s->prefetch_start_delay <= 0) {
+        return;
+    }
+
+    /* Start prefetching after a delay. Times 1000 to convert sec to ms. */
+    int64_t expire = qemu_get_clock (rt_clock) + s->prefetch_start_delay * 1000;
+    s->prefetch_timer = qemu_new_timer (rt_clock, fvd_init_prefetch, bs);
+    qemu_mod_timer (s->prefetch_timer, expire);
+}
diff --git a/block/fvd-prefetch.c b/block/fvd-prefetch.c
new file mode 100644
index 0000000..0ad8a8e
--- /dev/null
+++ b/block/fvd-prefetch.c
@@ -0,0 +1,598 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this FVD module prefetches data from the base image
+ *  and stores it in the FVD image.
+ *============================================================================*/
+
+static void resume_prefetch (BlockDriverState * bs, int64_t current_time);
+static void do_next_prefetch_read (BlockDriverState * bs, int64_t current_time);
+
+void fvd_init_prefetch (void *opaque)
+{
+    BlockDriverState * bs = opaque;
+    BDRVFvdState *s = bs->opaque;
+    FvdAIOCB *acb;
+    int i;
+
+    QDEBUG ("Start prefetching\n");
+
+    if (bdrv_find_format ("blksim") == NULL) {
+        /* In simulation mode, the random seed should not be initialized here.*/
+        srandom (time (NULL) + getpid () + getpid () * 987654 + random ());
+    }
+
+    s->prefetch_acb =
+        my_qemu_malloc (sizeof (FvdAIOCB *) * s->num_prefetch_slots);
+
+    for (i = 0; i < s->num_prefetch_slots; i++) {
+        acb = s->prefetch_acb[i] =
+            my_qemu_aio_get (&fvd_aio_pool, bs, null_prefetch_cb, NULL);
+
+        if (!acb) {
+            s->prefetch_error = TRUE;
+            int j;
+            for (j = 0; j < i; j++) {
+                my_qemu_aio_release (s->prefetch_acb[j]);
+                s->prefetch_acb[j] = NULL;
+            }
+
+            my_qemu_free (s->prefetch_acb);
+            s->prefetch_acb = NULL;
+            fprintf (stderr,
+                     "qemu_aio_get() failed and cannot start prefetching.\n");
+            return;
+        }
+
+        acb->type = OP_COPY;
+    }
+
+    s->prefetch_state = PREFETCH_STATE_RUNNING;
+
+    for (i = 0; i < s->num_prefetch_slots; i++) {
+        acb = s->prefetch_acb[i];
+        acb->copy.buffered_sector_begin = acb->copy.buffered_sector_end = 0;
+        QLIST_INIT (&acb->copy_lock.dependent_writes);
+        acb->copy_lock.next.le_prev = NULL;
+        acb->copy.hd_acb = NULL;
+        acb->sector_num = 0;
+        acb->nb_sectors = 0;
+        acb->copy.iov.iov_len = s->sectors_per_prefetch * 512;
+        acb->copy.buf = acb->copy.iov.iov_base =
+            my_qemu_blockalign (bs->backing_hd, acb->copy.iov.iov_len);
+        qemu_iovec_init_external (&acb->copy.qiov, &acb->copy.iov, 1);
+    }
+
+    if (s->prefetch_timer) {
+        qemu_free_timer (s->prefetch_timer);
+        s->prefetch_timer =
+            qemu_new_timer (rt_clock, (QEMUTimerCB *) resume_prefetch, bs);
+    }
+
+    s->pause_prefetch_requested = FALSE;
+    s->unclaimed_prefetch_region_start = 0;
+    s->prefetch_read_throughput = -1;        /* Indicate not initialized. */
+    s->prefetch_write_throughput = -1;        /* Indicate not initialized. */
+    s->prefetch_read_time = 0;
+    s->prefetch_write_time = 0;
+    s->prefetch_data_read = 0;
+    s->prefetch_data_written = 0;
+    s->next_prefetch_read_slot = 0;
+    s->num_filled_prefetch_slots = 0;
+    s->prefetch_read_active = FALSE;
+
+    do_next_prefetch_read (bs, qemu_get_clock (rt_clock));
+}
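+
+/* The prefetch slots set up in fvd_init_prefetch() above form a small
+ * read/write pipeline: at most one read from the base image is in flight
+ * (prefetch_read_active), filling the slot indexed by
+ * next_prefetch_read_slot, while previously filled slots
+ * (num_filled_prefetch_slots of them) are drained by writes into the FVD
+ * image.  do_next_prefetch_read() claims the next region to read, and
+ * finish_prefetch_read()/finish_prefetch_write() advance the pipeline. */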
+
+static void pause_prefetch (BDRVFvdState * s)
+{
+    int64_t ms = 1 + (int64_t) ((random () / ((double) RAND_MAX))
+                                * s->prefetch_throttle_time);
+    QDEBUG ("Pause prefetch for %" PRId64 " milliseconds\n", ms);
+    /* When the timer expires, it goes to resume_prefetch(). */
+    qemu_mod_timer (s->prefetch_timer, qemu_get_clock (rt_clock) + ms);
+}
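+
+/* The pause scheduled by pause_prefetch() above lasts between 1 and
+ * prefetch_throttle_time + 1 milliseconds, drawn (roughly uniformly) at
+ * random so that contending FVD instances do not all resume prefetching at
+ * the same moment. */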
+
+static void terminate_prefetch (BlockDriverState * bs, int final_state)
+{
+    BDRVFvdState *s = bs->opaque;
+    int i;
+
+    ASSERT (!s->prefetch_read_active && s->num_filled_prefetch_slots == 0);
+
+    for (i = 0; i < s->num_prefetch_slots; i++) {
+        if (s->prefetch_acb) {
+            my_qemu_vfree (s->prefetch_acb[i]->copy.buf);
+            my_qemu_aio_release (s->prefetch_acb[i]);
+            s->prefetch_acb[i] = NULL;
+        }
+    }
+    my_qemu_free (s->prefetch_acb);
+    s->prefetch_acb = NULL;
+
+    if (s->prefetch_timer) {
+        qemu_del_timer (s->prefetch_timer);
+        qemu_free_timer (s->prefetch_timer);
+        s->prefetch_timer = NULL;
+    }
+
+    if (final_state == PREFETCH_STATE_FINISHED) {
+        if (s->prefetch_error) {
+            s->prefetch_state = PREFETCH_STATE_DISABLED;
+        } else {
+            s->prefetch_state = PREFETCH_STATE_FINISHED;
+        }
+    } else {
+        s->prefetch_state = final_state;
+    }
+
+    if (s->prefetch_state == PREFETCH_STATE_FINISHED) {
+        QDEBUG ("FVD prefetching finished successfully.\n");
+
+        if (s->stale_bitmap) {
+            memset (s->stale_bitmap, 0xFF, s->bitmap_size);
+            if (s->fresh_bitmap && s->fresh_bitmap != s->stale_bitmap) {
+                memset (s->fresh_bitmap, 0xFF, s->bitmap_size);
+            }
+        }
+
+        /* Flush the table since its entries may be dirty due to 'soft-write'
+         * by prefetching or copy-on-read. */
+        flush_metadata_to_disk (bs);
+
+        /* Update the on-disk header. */
+        FvdHeader header;
+        read_fvd_header (s, &header);
+        header.all_data_in_fvd_img = TRUE;
+        update_fvd_header (s, &header);
+        s->copy_on_read = FALSE;
+    } else if (s->prefetch_state == PREFETCH_STATE_DISABLED) {
+        QDEBUG ("FVD disk prefetching disabled.\n");
+    }
+}
+
+static void do_next_prefetch_read (BlockDriverState * bs, int64_t current_time)
+{
+    FvdAIOCB *acb;
+    BDRVFvdState *s = bs->opaque;
+    int64_t begin, end;
+
+    ASSERT (!s->prefetch_read_active
+            && s->num_filled_prefetch_slots < s->num_prefetch_slots
+            && !s->pause_prefetch_requested);
+
+    /* Find the next region to prefetch. */
+    begin = s->unclaimed_prefetch_region_start;
+    while (1) {
+        if (begin >= s->nb_sectors_in_base_img) {
+            s->unclaimed_prefetch_region_start = s->nb_sectors_in_base_img;
+            if (s->num_filled_prefetch_slots == 0) {
+                terminate_prefetch (bs, PREFETCH_STATE_FINISHED);
+            }
+            return;
+        }
+        end = begin + s->sectors_per_prefetch;
+        if (end > s->nb_sectors_in_base_img) {
+            end = s->nb_sectors_in_base_img;
+        }
+        if (find_region_in_base_img (s, &begin, &end)) {
+            break;
+        }
+        begin = end;
+    }
+
+    ASSERT (begin % s->block_size == 0
+            && (end % s->block_size == 0 || end == s->nb_sectors_in_base_img));
+
+    acb = s->prefetch_acb[s->next_prefetch_read_slot];
+    acb->copy.buffered_sector_begin = acb->sector_num = begin;
+    acb->copy.buffered_sector_end = s->unclaimed_prefetch_region_start = end;
+    acb->nb_sectors = end - begin;
+    acb->copy.qiov.size = acb->copy.iov.iov_len = acb->nb_sectors * 512;
+    acb->copy.iov.iov_base = acb->copy.buf;
+    acb->copy.last_prefetch_op_start_time = current_time;
+    acb->copy.hd_acb = bdrv_aio_readv (bs->backing_hd, acb->sector_num,
+                                       &acb->copy.qiov, acb->nb_sectors,
+                                       finish_prefetch_read, acb);
+
+    if (acb->copy.hd_acb == NULL) {
+        QDEBUG ("PREFETCH: error when starting read for sector_num=%" PRId64
+                " nb_sectors=%d\n", acb->sector_num, acb->nb_sectors);
+        s->prefetch_error = TRUE;
+        s->prefetch_state = PREFETCH_STATE_DISABLED;
+        if (s->num_filled_prefetch_slots == 0) {
+            terminate_prefetch (bs, PREFETCH_STATE_DISABLED);
+        }
+    } else {
+        s->prefetch_read_active = TRUE;
+        QDEBUG ("PREFETCH: start read for sector_num=%" PRId64
+                " nb_sectors=%d total_prefetched_bytes=%" PRId64 "\n",
+                acb->sector_num, acb->nb_sectors, s->total_prefetch_data);
+#ifdef FVD_DEBUG
+        s->total_prefetch_data += acb->copy.iov.iov_len;
+#endif
+    }
+}
+
+static void finish_prefetch_write (void *opaque, int ret)
+{
+    FvdAIOCB *acb = (FvdAIOCB *) opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+    int64_t begin, end;
+    const int64_t current_time = qemu_get_clock (rt_clock);
+
+    ASSERT (acb->nb_sectors > 0 && s->num_filled_prefetch_slots > 0);
+
+    QLIST_REMOVE (acb, copy_lock.next);
+    restart_dependent_writes (acb);
+    acb->copy.hd_acb = NULL;
+    QLIST_INIT (&acb->copy_lock.dependent_writes);
+
+    if (ret != 0) {
+        QDEBUG ("PREFETCH: finished write with error for sector_num=%" PRId64
+                " nb_sectors=%d\n", acb->sector_num, acb->nb_sectors);
+        s->num_filled_prefetch_slots = 0;
+        s->prefetch_error = TRUE;
+        s->prefetch_state = PREFETCH_STATE_DISABLED;
+        if (!s->prefetch_read_active) {
+            terminate_prefetch (bs, PREFETCH_STATE_DISABLED);
+        }
+        return;
+    }
+
+    /* No need to update the on-disk bitmap or the stale bitmap. See Section
+     * 3.3.4 of the FVD-cow paper. */
+    update_fresh_bitmap (acb->sector_num, acb->nb_sectors, s);
+
+    const int64_t write_time =
+        current_time - acb->copy.last_prefetch_op_start_time;
+    s->prefetch_write_time += write_time;
+    s->prefetch_data_written += acb->nb_sectors * 512;
+
+    QDEBUG ("PREFETCH: write_finished  sector_num=%" PRId64
+            " nb_sectors=%d  write_time=%d (ms)\n", acb->sector_num,
+            acb->nb_sectors, (int) write_time);
+
+    /* Calculate throughput and determine if it needs to pause prefetching due
+     * to low throughput. */
+    if (s->prefetch_timer && s->prefetch_throttle_time > 0
+        && !s->pause_prefetch_requested
+        && s->prefetch_write_time > s->prefetch_write_throughput_measure_time) {
+        const double this_round_throughput =
+            s->prefetch_data_written / (double) s->prefetch_write_time;
+        if (s->prefetch_write_throughput < 0) {
+            /* Previously not initialized. */
+            s->prefetch_write_throughput = this_round_throughput;
+        } else {
+            s->prefetch_write_throughput =
+                s->prefetch_perf_calc_alpha * s->prefetch_write_throughput +
+                (1 - s->prefetch_perf_calc_alpha) * this_round_throughput;
+        }
+        if (s->prefetch_write_throughput < s->prefetch_min_write_throughput) {
+            QDEBUG ("PREFETCH: slow_write  this_write=%d (ms)  "
+                    "this_write_throughput=%.3lf (MB/s)   "
+                    "avg_write_throughput=%.3lf (MB/s)\n",
+                    (int) write_time,
+                    this_round_throughput / 1048576 * 1000,
+                    s->prefetch_write_throughput / 1048576 * 1000);
+
+            /* Make a randomized decision to pause prefetching. This avoids
+             * pausing all contending FVD drivers. See Section 3.4.2 of the
+             * FVD-cow paper. */
+            if (random () > (RAND_MAX / 2)) {
+                QDEBUG ("PREFETCH: pause requested.\n");
+                s->pause_prefetch_requested = TRUE;
+            } else {
+                QDEBUG ("PREFETCH: continue due to 50%% probability, despite "
+                        "slow write.\n");
+                s->prefetch_write_throughput = -1; /*Indicate not initialized.*/
+            }
+        } else {
+            QDEBUG ("PREFETCH: this_write_throughput=%.3lf (MB/s)   "
+                    "avg_write_throughput=%.3lf (MB/s)\n",
+                    this_round_throughput / 1048576 * 1000,
+                    s->prefetch_write_throughput / 1048576 * 1000);
+        }
+
+        /* Prepare to measure the next round of throughput. */
+        s->prefetch_data_written = 0;
+        s->prefetch_write_time = 0;
+    }
+
+    /* Find in this prefetch slot the next section of prefetched but
+     * not-yet-written data. */
+    begin = acb->sector_num + acb->nb_sectors;
+    if (begin < acb->copy.buffered_sector_end) {
+        end = acb->copy.buffered_sector_end;
+        if (find_region_in_base_img (s, &begin, &end)) {
+            acb->sector_num = begin;
+            acb->nb_sectors = end - begin;
+            acb->copy.iov.iov_base = acb->copy.buf +
+                            (begin - acb->copy.buffered_sector_begin) * 512;
+            acb->copy.qiov.size = acb->copy.iov.iov_len = acb->nb_sectors * 512;
+            QDEBUG ("PREFETCH: write_data  sector_num=%" PRId64
+                    " nb_sectors=%d\n", acb->sector_num, acb->nb_sectors);
+            acb->copy.hd_acb = store_data (TRUE, acb, bs, acb->sector_num,
+                                           &acb->copy.qiov, acb->nb_sectors,
+                                           finish_prefetch_write, acb);
+            if (acb->copy.hd_acb == NULL) {
+                QDEBUG ("PREFETCH: error in starting bdrv_aio_writev().\n");
+                s->num_filled_prefetch_slots = 0;
+                s->prefetch_error = TRUE;
+                s->prefetch_state = PREFETCH_STATE_DISABLED;
+                if (!s->prefetch_read_active) {
+                    terminate_prefetch (bs, PREFETCH_STATE_DISABLED);
+                }
+            } else {
+                acb->copy_lock.begin = begin;
+                acb->copy_lock.end = end;
+                QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
+            }
+
+            return;
+        }
+    }
+
+    s->num_filled_prefetch_slots--;
+
+    if (s->prefetch_state == PREFETCH_STATE_DISABLED) {
+        if (s->num_filled_prefetch_slots == 0 && !s->prefetch_read_active) {
+            terminate_prefetch (bs, PREFETCH_STATE_DISABLED);
+        }
+        return;
+    }
+
+    if (begin >= s->nb_sectors_in_base_img) {
+        /* Prefetching finished. */
+        ASSERT (s->num_filled_prefetch_slots == 0 && !s->prefetch_read_active);
+        terminate_prefetch (bs, PREFETCH_STATE_FINISHED);
+        return;
+    }
+
+    if (s->pause_prefetch_requested) {
+        if (s->num_filled_prefetch_slots == 0) {
+            if (!s->prefetch_read_active) {
+                pause_prefetch (s);
+            } else {
+                QDEBUG ("PREFETCH: wait for the read operation to finish in "
+                        "order to pause prefetch.\n");
+            }
+            return;
+        }
+    }
+
+    /* Write out data in the next prefetched slot. */
+    while (s->num_filled_prefetch_slots > 0) {
+        int k = s->next_prefetch_read_slot - s->num_filled_prefetch_slots;
+        if (k < 0) {
+            k += s->num_prefetch_slots;
+        }
+        acb = s->prefetch_acb[k];
+
+        int64_t begin = acb->copy.buffered_sector_begin;
+        int64_t end = acb->copy.buffered_sector_end;
+        if (find_region_in_base_img (s, &begin, &end)) {
+            acb->copy.last_prefetch_op_start_time = current_time;
+            acb->sector_num = begin;
+            acb->nb_sectors = end - begin;
+            acb->copy.iov.iov_base =
+                acb->copy.buf + (begin - acb->copy.buffered_sector_begin) * 512;
+            acb->copy.qiov.size = acb->copy.iov.iov_len = acb->nb_sectors * 512;
+            QDEBUG ("PREFETCH: writes data: sector_num=%" PRId64
+                    " nb_sectors=%d\n", acb->sector_num, acb->nb_sectors);
+            acb->copy.hd_acb = store_data (TRUE, acb, bs, acb->sector_num,
+                                           &acb->copy.qiov, acb->nb_sectors,
+                                           finish_prefetch_write, acb);
+
+            if (acb->copy.hd_acb == NULL) {
+                QDEBUG ("PREFETCH: error cannot get a control block to write "
+                        "a prefetched block.\n");
+                s->prefetch_error = TRUE;
+                s->prefetch_state = PREFETCH_STATE_DISABLED;
+                s->num_filled_prefetch_slots = 0;
+                if (!s->prefetch_read_active) {
+                    terminate_prefetch (bs, PREFETCH_STATE_DISABLED);
+                }
+                return;
+            }
+
+            acb->copy_lock.begin = begin;
+            acb->copy_lock.end = end;
+            QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
+            break;
+        } else {
+            QDEBUG ("PREFETCH: discard prefetched data as they have been "
+                    "covered: sector_num=%" PRId64 " nb_sectors=%d\n",
+                    acb->sector_num, acb->nb_sectors);
+            s->num_filled_prefetch_slots--;
+        }
+    }
+
+    /* If the reader was stopped due to lack of slots, start the reader. */
+    if (!s->prefetch_read_active && !s->pause_prefetch_requested) {
+        do_next_prefetch_read (bs, current_time);
+    }
+}
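+
+/* Throughput smoothing used in finish_prefetch_write() above and
+ * finish_prefetch_read() below: with alpha = prefetch_perf_calc_alpha (a
+ * fraction of 1, converted from the header's percentage in fvd_open()), the
+ * running estimate, in bytes per millisecond, is updated as
+ *
+ *     avg = alpha * avg_old + (1 - alpha) * this_round
+ *
+ * For example (illustrative numbers), with alpha = 0.8, avg_old = 100 and
+ * this_round = 50, the new average is 0.8 * 100 + 0.2 * 50 = 90, so a single
+ * slow round only nudges the estimate; a pause is requested (with 50%
+ * probability) only when the smoothed value drops below the configured
+ * minimum throughput threshold. */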
+
+static void finish_prefetch_read (void *opaque, int ret)
+{
+    FvdAIOCB *acb = (FvdAIOCB *) opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    ASSERT (s->prefetch_read_active && s->num_filled_prefetch_slots >= 0
+            && s->num_filled_prefetch_slots < s->num_prefetch_slots);
+
+    s->prefetch_read_active = FALSE;
+    acb->copy.hd_acb = NULL;
+
+    if (s->prefetch_state == PREFETCH_STATE_DISABLED) {
+        if (s->num_filled_prefetch_slots == 0) {
+            terminate_prefetch (bs, PREFETCH_STATE_DISABLED);
+        }
+        return;
+    }
+
+    if (ret != 0) {
+        QDEBUG ("PREFETCH: read_error  sector_num=%" PRId64 " nb_sectors=%d.\n",
+                acb->sector_num, acb->nb_sectors);
+        s->prefetch_error = TRUE;
+        s->prefetch_state = PREFETCH_STATE_DISABLED;
+        if (s->num_filled_prefetch_slots == 0) {
+            terminate_prefetch (bs, PREFETCH_STATE_DISABLED);
+        }
+        return;
+    }
+
+    const int64_t current_time = qemu_get_clock (rt_clock);
+    const int64_t read_time = current_time -
+                        acb->copy.last_prefetch_op_start_time;
+    s->prefetch_read_time += read_time;
+    s->prefetch_data_read += acb->nb_sectors * 512;
+
+    QDEBUG ("PREFETCH: read_finished  sector_num=%" PRId64
+            " nb_sectors=%d  read_time=%d (ms)\n", acb->sector_num,
+            acb->nb_sectors, (int) read_time);
+
+    /* Calculate the read throughput and decide whether prefetching should be
+     * paused because the throughput is too low. */
+    if (s->prefetch_timer && s->prefetch_throttle_time > 0
+        && !s->pause_prefetch_requested
+        && s->prefetch_read_time > s->prefetch_read_throughput_measure_time) {
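+        /* prefetch_data_read is in bytes and prefetch_read_time is in
+         * milliseconds, so the throughput values below are in bytes per
+         * millisecond; they are converted to MB/s only for the debug output.
+         * The average is an exponentially weighted moving average, with
+         * weight prefetch_perf_calc_alpha given to the history. */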
+        const double this_round_throughput =
+            s->prefetch_data_read / (double) s->prefetch_read_time;
+        if (s->prefetch_read_throughput < 0) {
+            /* Previously not initialized. */
+            s->prefetch_read_throughput = this_round_throughput;
+        } else {
+            s->prefetch_read_throughput = s->prefetch_perf_calc_alpha *
+                s->prefetch_read_throughput +
+                (1 - s->prefetch_perf_calc_alpha) * this_round_throughput;
+        }
+        if (s->prefetch_read_throughput < s->prefetch_min_read_throughput) {
+            QDEBUG ("PREFETCH: slow_read read_time=%d (ms)   "
+                    "this_read_throughput=%.3lf (MB/s) "
+                    "avg_read_throughput=%.3lf (MB/s)\n",
+                    (int) read_time, this_round_throughput / 1048576 * 1000,
+                    s->prefetch_read_throughput / 1048576 * 1000);
+
+            /* Make a randomized decision to pause prefetching. This avoids
+             * pausing all contending FVD drivers. See Section 3.4.2 of the
+             * FVD-cow paper. */
+            if (random () > (RAND_MAX / 2)) {
+                QDEBUG ("PREFETCH: pause requested.\n");
+                s->pause_prefetch_requested = TRUE;
+            } else {
+                QDEBUG ("PREFETCH: continue due to 50%% probability, "
+                        "despite slow read.\n");
+                s->prefetch_read_throughput = -1; /*Indicate not initialized.*/
+            }
+        } else {
+            QDEBUG ("PREFETCH: this_read_throughput=%.3lf (MB/s)    "
+                    "avg_read_throughput=%.3lf (MB/s)\n",
+                    this_round_throughput / 1048576 * 1000,
+                    s->prefetch_read_throughput / 1048576 * 1000);
+        }
+
+        /* Prepare to measure the next round of throughput. */
+        s->prefetch_data_read = 0;
+        s->prefetch_read_time = 0;
+    }
+
+    if (s->num_filled_prefetch_slots > 0) {
+        /* There is one ongoing write for prefetched data. This slot will be
+         * written out later. */
+        s->num_filled_prefetch_slots++;
+        s->next_prefetch_read_slot++;
+        if (s->next_prefetch_read_slot >= s->num_prefetch_slots) {
+            s->next_prefetch_read_slot = 0;
+        }
+    } else {
+        /* The writer is not active. Start the writer. */
+        int64_t begin = acb->copy.buffered_sector_begin;
+        int64_t end = acb->copy.buffered_sector_end;
+        if (find_region_in_base_img (s, &begin, &end)) {
+            acb->copy.last_prefetch_op_start_time = current_time;
+            acb->sector_num = begin;
+            acb->nb_sectors = end - begin;
+            acb->copy.iov.iov_base =
+                acb->copy.buf + (begin - acb->copy.buffered_sector_begin) * 512;
+            acb->copy.qiov.size = acb->copy.iov.iov_len = acb->nb_sectors * 512;
+            QDEBUG ("PREFETCH: writes_data sector_num=%" PRId64
+                    " nb_sectors=%d\n", acb->sector_num, acb->nb_sectors);
+            acb->copy.hd_acb = store_data (TRUE, acb, bs, acb->sector_num,
+                                           &acb->copy.qiov, acb->nb_sectors,
+                                           finish_prefetch_write, acb);
+
+            if (acb->copy.hd_acb == NULL) {
+                QDEBUG ("PREFETCH: error cannot get control block to write a "
+                        "prefetched block.\n");
+                s->prefetch_error = TRUE;
+                s->prefetch_state = PREFETCH_STATE_DISABLED;
+                if (s->num_filled_prefetch_slots == 0) {
+                    terminate_prefetch (bs, PREFETCH_STATE_DISABLED);
+                }
+                return;
+            }
+
+            acb->copy_lock.begin = begin;
+            acb->copy_lock.end = end;
+            QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
+            s->num_filled_prefetch_slots++;
+            s->next_prefetch_read_slot++;
+            if (s->next_prefetch_read_slot >= s->num_prefetch_slots) {
+                s->next_prefetch_read_slot = 0;
+            }
+        } else {
+            /* The current prefetch slot will be reused to prefetch the next
+             * batch of data. */
+            QDEBUG ("PREFETCH: discard prefetched data as they have been "
+                    "covered: sector_num=%" PRId64 " nb_sectors=%d\n",
+                    acb->sector_num, acb->nb_sectors);
+        }
+    }
+
+    if (s->num_filled_prefetch_slots >= s->num_prefetch_slots) {
+        QDEBUG ("PREFETCH: halt read because no slot is available.\n");
+    } else {
+        if (s->pause_prefetch_requested) {
+            if (s->num_filled_prefetch_slots == 0) {
+                pause_prefetch (s);
+            }
+        } else {
+            do_next_prefetch_read (bs, current_time);
+        }
+    }
+}
+
+static void resume_prefetch (BlockDriverState * bs, int64_t current_time)
+{
+    BDRVFvdState *s = bs->opaque;
+
+    if (s->prefetch_state != PREFETCH_STATE_RUNNING) {
+        return;
+    }
+
+    ASSERT (s->num_filled_prefetch_slots == 0 && !s->prefetch_read_active);
+    QDEBUG ("PREFETCH: resume.\n");
+
+    s->pause_prefetch_requested = FALSE;
+    s->prefetch_read_throughput = -1;        /* Indicate not initialized. */
+    s->prefetch_write_throughput = -1;        /* Indicate not initialized. */
+    s->prefetch_read_time = 0;
+    s->prefetch_write_time = 0;
+    s->prefetch_data_read = 0;
+    s->prefetch_data_written = 0;
+
+    do_next_prefetch_read (bs, qemu_get_clock (rt_clock));
+}
-- 
1.7.0.4

^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [Qemu-devel] [PATCH 5/5] Fast Virtual Disk (FVD) Proposal Part 5
  2011-01-19 22:04 [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1 Chunqiang Tang
                   ` (2 preceding siblings ...)
  2011-01-19 22:04 ` [Qemu-devel] [PATCH 4/5] Fast Virtual Disk (FVD) Proposal Part 4 Chunqiang Tang
@ 2011-01-19 22:04 ` Chunqiang Tang
  2011-01-20 13:01 ` [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1 Christoph Hellwig
  2011-01-21 22:41 ` Anthony Liguori
  5 siblings, 0 replies; 18+ messages in thread
From: Chunqiang Tang @ 2011-01-19 22:04 UTC (permalink / raw)
  To: qemu-devel; +Cc: Chunqiang Tang

Part 5 of the block device driver for the proposed FVD image format.
Multiple patches are used in order to manage the size of each patch.
This patch includes some new files for FVD.

See the related discussions at
http://lists.gnu.org/archive/html/qemu-devel/2011-01/msg00426.html .

Signed-off-by: Chunqiang Tang <ctang@us.ibm.com>
---
 block/fvd-read.c  |  562 ++++++++++++++++++++++++++++++++++++++++++++++++
 block/fvd-store.c |  494 ++++++++++++++++++++++++++++++++++++++++++
 block/fvd-utils.c |  612 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 block/fvd-write.c |  449 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 2117 insertions(+), 0 deletions(-)
 create mode 100644 block/fvd-read.c
 create mode 100644 block/fvd-store.c
 create mode 100644 block/fvd-utils.c
 create mode 100644 block/fvd-write.c

diff --git a/block/fvd-read.c b/block/fvd-read.c
new file mode 100644
index 0000000..b0cfb91
--- /dev/null
+++ b/block/fvd-read.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this module implements bdrv_aio_readv() for FVD.
+ *============================================================================*/
+
+static void finish_read_backing_for_copy_on_read (void *opaque, int ret);
+static void finish_read_fvd (void *opaque, int ret);
+static inline void calc_read_region (BDRVFvdState * s, int64_t sector_num,
+                                     int nb_sectors,
+                                     int64_t * p_first_sec_in_fvd,
+                                     int64_t * p_last_sec_in_fvd,
+                                     int64_t * p_first_sec_in_backing,
+                                     int64_t * p_last_sec_in_backing);
+
+static BlockDriverAIOCB *fvd_aio_readv (BlockDriverState * bs,
+                                        int64_t sector_num, QEMUIOVector * qiov,
+                                        int nb_sectors,
+                                        BlockDriverCompletionFunc * cb,
+                                        void *opaque)
+{
+    BDRVFvdState *s = bs->opaque;
+
+    TRACE_REQUEST (FALSE, sector_num, nb_sectors);
+
+    if (!s->data_region_prepared) {
+        init_data_region (s);
+    }
+
+    if (s->prefetch_state == PREFETCH_STATE_FINISHED
+        || sector_num >= s->nb_sectors_in_base_img) {
+        /* This is an efficient case. See Section 3.3.5 of the FVD-cow paper.
+         * This also covers the case of no base image. */
+        return load_data (NULL, bs, sector_num, qiov, nb_sectors, cb, opaque);
+    }
+
+    /* Figure out data regions in the base image and in the FVD data file. */
+    int64_t last_sec_in_backing, first_sec_in_backing;
+    int64_t last_sec_in_fvd, first_sec_in_fvd;
+    calc_read_region (s, sector_num, nb_sectors, &first_sec_in_fvd,
+                      &last_sec_in_fvd, &first_sec_in_backing,
+                      &last_sec_in_backing);
+
+    if (first_sec_in_backing < 0) {
+        /* A simple case: all requested data are in the FVD data file. */
+        return load_data (NULL, bs, sector_num, qiov, nb_sectors, cb, opaque);
+    }
+
+    /* Do copy-on-read only if the context id is 0, i.e., it is not emulating
+     * synchronous I/O.  Doing copy-on-read in emulated synchronous I/O may
+     * leave the copy-on-read callbacks never processed due to a mismatching
+     * context id. */
+    const int copy_on_read = s->copy_on_read && (get_async_context_id () == 0);
+
+    if (first_sec_in_fvd < 0 && !copy_on_read) {
+        /* A simple case: all requested data are in the base image and no need
+         * to do copy_on_read. */
+        return bdrv_aio_readv (bs->backing_hd, sector_num, qiov, nb_sectors, cb,
+                               opaque);
+    }
+
+    /* The remaining cases are more complicated: 1. Data are only in the base
+     * image and copy-on-read is needed.  2. Data are in both the base image
+     * and the FVD data file, and copy-on-read may be either TRUE or FALSE. */
+    FvdAIOCB *acb = my_qemu_aio_get (&fvd_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+
+    QDEBUG ("READ: acb%llu-%p  start  sector_num=%" PRId64 " nb_sectors=%d\n",
+            acb->uuid, acb, sector_num, nb_sectors);
+
+    acb->type = OP_READ;
+    acb->sector_num = sector_num;
+    acb->nb_sectors = nb_sectors;
+    acb->read.qiov = qiov;
+    acb->read.ret = 0;
+    acb->read.read_backing.hd_acb = NULL;
+    acb->read.read_backing.done = FALSE;
+    acb->read.read_backing.iov.iov_base = NULL;
+    acb->read.read_fvd.hd_acb = NULL;
+    acb->read.read_fvd.iov.iov_base = NULL;
+    acb->read.read_fvd.done = (first_sec_in_fvd < 0);
+
+    /* Read from the base image. */
+    if (copy_on_read) {
+        /* Round the request to the block boundary. */
+        acb->read.read_backing.sector_num =
+            ROUND_DOWN (first_sec_in_backing, s->block_size);
+        int64_t end = ROUND_UP (last_sec_in_backing + 1, s->block_size);
+        if (end > s->nb_sectors_in_base_img) {
+            end = s->nb_sectors_in_base_img;
+        }
+        acb->read.read_backing.nb_sectors =
+            end - acb->read.read_backing.sector_num;
+    } else {
+        acb->read.read_backing.sector_num = first_sec_in_backing;
+        acb->read.read_backing.nb_sectors =
+            last_sec_in_backing - first_sec_in_backing + 1;
+    }
+
+    acb->read.read_backing.iov.iov_len =
+        acb->read.read_backing.nb_sectors * 512;
+    acb->read.read_backing.iov.iov_base =
+        my_qemu_blockalign (bs->backing_hd, acb->read.read_backing.iov.iov_len);
+    qemu_iovec_init_external (&acb->read.read_backing.qiov,
+                              &acb->read.read_backing.iov, 1);
+    acb->read.read_backing.hd_acb =
+        bdrv_aio_readv (bs->backing_hd, acb->read.read_backing.sector_num,
+                        &acb->read.read_backing.qiov,
+                        acb->read.read_backing.nb_sectors,
+                        finish_read_backing_for_copy_on_read, acb);
+    QDEBUG ("READ: acb%llu-%p  read_backing  backing_sector_num=%" PRId64
+            " backing_nb_sectors=%d\n", acb->uuid, acb,
+            acb->read.read_backing.sector_num,
+            acb->read.read_backing.nb_sectors);
+
+    if (!acb->read.read_backing.hd_acb) {
+        my_qemu_vfree (acb->read.read_backing.iov.iov_base);
+        my_qemu_aio_release (acb);
+        return NULL;
+    }
+
+    if (first_sec_in_fvd >= 0) {
+        /* Read the FVD data file. */
+        acb->read.read_fvd.sector_num = first_sec_in_fvd;
+        acb->read.read_fvd.nb_sectors = last_sec_in_fvd - first_sec_in_fvd + 1;
+        acb->read.read_fvd.iov.iov_len = acb->read.read_fvd.nb_sectors * 512;
+
+        /* Make a copy of the current bitmap because it may change when the
+         * read requests finish. */
+        int64_t b = MIN (acb->read.read_backing.sector_num,
+                         acb->read.read_fvd.sector_num);
+        b = b / s->block_size / 8;        /* First byte of the bitmap we need. */
+        int64_t e1 = acb->read.read_backing.sector_num +
+                            acb->read.read_backing.nb_sectors;
+        int64_t e2 = acb->read.read_fvd.sector_num +
+                            acb->read.read_fvd.nb_sectors;
+        int64_t e = MAX (e1, e2);
+        if (e > s->nb_sectors_in_base_img) {
+            e = s->nb_sectors_in_base_img;
+        }
+        e = (e - 1) / s->block_size / 8;/* Last byte of the bitmap we need. */
+        int bitmap_bytes = e - b + 1;
+        int buf_size = acb->read.read_fvd.iov.iov_len +
+                                    ROUND_UP (bitmap_bytes, 512);
+        acb->read.read_fvd.iov.iov_base =
+            my_qemu_blockalign (s->fvd_data, buf_size);
+        uint8_t *saved_bitmap = ((uint8_t *) acb->read.read_fvd.iov.iov_base) +
+                                    acb->read.read_fvd.iov.iov_len;
+        memcpy (saved_bitmap, s->fresh_bitmap + b, bitmap_bytes);
+
+        qemu_iovec_init_external (&acb->read.read_fvd.qiov,
+                                  &acb->read.read_fvd.iov, 1);
+        QDEBUG ("READ: acb%llu-%p  read_fvd  fvd_sector_num=%" PRId64
+                " fvd_nb_sectors=%d\n", acb->uuid, acb,
+                acb->read.read_fvd.sector_num, acb->read.read_fvd.nb_sectors);
+        acb->read.read_fvd.hd_acb = load_data (acb, bs, first_sec_in_fvd,
+                                               &acb->read.read_fvd.qiov,
+                                               acb->read.read_fvd.nb_sectors,
+                                               finish_read_fvd, acb);
+        if (!acb->read.read_fvd.hd_acb) {
+            if (acb->read.read_backing.hd_acb) {
+                bdrv_aio_cancel (acb->read.read_backing.hd_acb);
+                my_qemu_vfree (acb->read.read_backing.iov.iov_base);
+            }
+            my_qemu_vfree (acb->read.read_fvd.iov.iov_base);
+            my_qemu_aio_release (acb);
+            return NULL;
+        }
+    }
+
+    return &acb->common;
+}
+
+static void finish_copy_on_read (void *opaque, int ret)
+{
+    FvdAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    if (ret == 0) {
+        /* Update fresh_bitmap but do not update stale_bitmap or the on-disk
+         * bitmap. See Section 3.3.4 of the FVD-cow paper. */
+        update_fresh_bitmap (acb->sector_num, acb->nb_sectors, s);
+    }
+
+    s->outstanding_copy_on_read_data -= acb->nb_sectors * 512;
+
+#ifdef FVD_DEBUG
+    s->total_copy_on_read_data += acb->nb_sectors * 512;
+#endif
+    QDEBUG ("READ: acb%llu-%p  finish_copy_on_read  buffer_sector_num=%" PRId64
+            " buffer_nb_sectors=%d write_sector_num=%" PRId64
+            " write_nb_sectors=%d outstanding_copy_on_read=%" PRId64 "\n",
+            acb->uuid, acb, acb->copy.buffered_sector_begin,
+            (int) (acb->copy.buffered_sector_end -
+                   acb->copy.buffered_sector_begin), acb->sector_num,
+            acb->nb_sectors, s->outstanding_copy_on_read_data);
+
+    QLIST_REMOVE (acb, copy_lock.next);
+    restart_dependent_writes (acb);
+
+    int64_t begin = acb->sector_num + acb->nb_sectors;
+    int64_t end = acb->copy.buffered_sector_end;
+
+    if (find_region_in_base_img (s, &begin, &end)) {
+        acb->sector_num = begin;
+        acb->nb_sectors = end - begin;
+        acb->copy.iov.iov_base = acb->copy.buf +
+                                (begin - acb->copy.buffered_sector_begin) * 512;
+        acb->copy.iov.iov_len = acb->nb_sectors * 512;
+        qemu_iovec_init_external (&acb->copy.qiov, &acb->copy.iov, 1);
+        QDEBUG ("READ: acb%llu-%p  copy_on_read  buffer_sector_num=%" PRId64
+                " buffer_nb_sectors=%d write_sector_num=%" PRId64
+                " write_nb_sectors=%d outstanding_copy_on_read=%" PRId64 "\n",
+                acb->uuid, acb, acb->copy.buffered_sector_begin,
+                (int) (acb->copy.buffered_sector_end -
+                       acb->copy.buffered_sector_begin), acb->sector_num,
+                acb->nb_sectors, s->outstanding_copy_on_read_data);
+        acb->copy.hd_acb = store_data (TRUE, acb, bs, acb->sector_num,
+                                       &acb->copy.qiov, acb->nb_sectors,
+                                       finish_copy_on_read, acb);
+        if (acb->copy.hd_acb) {
+            QLIST_INIT (&acb->copy_lock.dependent_writes);
+            acb->copy_lock.begin = begin;
+            acb->copy_lock.end = end;
+            QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
+            s->outstanding_copy_on_read_data += acb->copy.iov.iov_len;
+            return;
+        }
+    }
+
+    QDEBUG ("READ: acb%llu-%p  no_more_copy_on_read\n", acb->uuid, acb);
+    my_qemu_vfree (acb->copy.buf);
+    my_qemu_aio_release (acb);
+}
+
+static void finish_read (FvdAIOCB * acb)
+{
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    if (acb->read.ret != 0) {
+        QDEBUG ("READ: acb%llu-%p  finish_read error ret=%d sector_num=%" PRId64
+                " nb_sectors=%d\n", acb->uuid, acb, acb->read.ret,
+                acb->sector_num, acb->nb_sectors);
+        acb->common.cb (acb->common.opaque, acb->read.ret);
+        if (acb->read.read_backing.iov.iov_base) {
+            my_qemu_vfree (acb->read.read_backing.iov.iov_base);
+        }
+        if (acb->read.read_fvd.iov.iov_base) {
+            my_qemu_vfree (acb->read.read_fvd.iov.iov_base);
+        }
+        my_qemu_aio_release (acb);
+
+        return;
+    }
+
+    if (!acb->read.read_fvd.iov.iov_base) {
+        /* Only read data from the base image. */
+        uint8_t *data = ((uint8_t *) acb->read.read_backing.iov.iov_base) +
+                    (acb->sector_num - acb->read.read_backing.sector_num) * 512;
+        qemu_iovec_from_buffer (acb->read.qiov, data, acb->nb_sectors * 512);
+    } else {
+        /* Under the guidance of the saved bitmap, merge data from the FVD
+         * data file and the base image. */
+        uint8_t *saved_bitmap = ((uint8_t *) acb->read.read_fvd.iov.iov_base) +
+                                            acb->read.read_fvd.iov.iov_len;
+        int64_t bitmap_offset = MIN (acb->read.read_backing.sector_num,
+                                     acb->read.read_fvd.sector_num);
+        bitmap_offset = bitmap_offset / s->block_size / 8;
+        int iov_index = 0;
+        uint8_t *iov_buf = acb->read.qiov->iov[0].iov_base;
+        int iov_left = acb->read.qiov->iov[0].iov_len;
+        int64_t sec = acb->sector_num;
+        const int64_t end = acb->sector_num + acb->nb_sectors;
+        int64_t first_sec;
+        uint8_t *source;
+
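+        /* Walk the request in alternating runs: sectors whose saved bitmap
+         * bit says they are in the FVD data file are copied from the
+         * read_fvd buffer, the others from the read_backing buffer.
+         * copy_to_iov() advances through the caller's scatter-gather list as
+         * the runs are copied. */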
+        if (bitmap_show_sector_in_base_img
+            (sec, s, bitmap_offset, saved_bitmap)) {
+            goto in_backing;
+        }
+
+        while (1) {
+            /* For a section of data in the FVD data file. */
+            if (sec >= end) {
+                break;
+            }
+
+            first_sec = sec;
+            do {
+                sec++;
+            } while (sec < end && !bitmap_show_sector_in_base_img (sec, s,
+                                        bitmap_offset, saved_bitmap));
+
+            source = ((uint8_t *) acb->read.read_fvd.iov.iov_base) +
+                            (first_sec - acb->read.read_fvd.sector_num) * 512;
+            copy_to_iov (acb->read.qiov->iov, &iov_index, &iov_buf, &iov_left,
+                         source, (sec - first_sec) * 512);
+
+          in_backing:
+            /* For a section of data in the base image. */
+            if (sec >= end) {
+                break;
+            }
+
+            first_sec = sec;
+            do {
+                sec++;
+            } while (sec < end && bitmap_show_sector_in_base_img (sec, s,
+                                                bitmap_offset, saved_bitmap));
+
+            source = ((uint8_t *) acb->read.read_backing.iov.iov_base) +
+                        (first_sec - acb->read.read_backing.sector_num) * 512;
+            copy_to_iov (acb->read.qiov->iov, &iov_index, &iov_buf, &iov_left,
+                         source, (sec - first_sec) * 512);
+        }
+
+        ASSERT (iov_index == acb->read.qiov->niov - 1 && iov_left == 0);
+        my_qemu_vfree (acb->read.read_fvd.iov.iov_base);
+    }
+
+    QDEBUG ("READ: acb%llu-%p  finish_read  ret=%d\n", acb->uuid, acb,
+            acb->read.ret);
+    acb->common.cb (acb->common.opaque, acb->read.ret);
+
+    if (!s->copy_on_read || get_async_context_id () != 0) {
+        /* Do copy-on-read only if the context id is 0, i.e., it is not
+         * emulating synchronous I/O.  Doing copy-on-read in emulated
+         * synchronous I/O may leave the copy-on-read callbacks never being
+         * processed due to mismatching context id. */
+        my_qemu_vfree (acb->read.read_backing.iov.iov_base);
+        my_qemu_aio_release (acb);
+        return;
+    }
+
+    /* Convert the AIOReadCB into an AIOCopyCB for copy-on-read. */
+    uint8_t *buf = acb->read.read_backing.iov.iov_base;
+    int64_t begin = acb->read.read_backing.sector_num;
+    int64_t end = begin + acb->read.read_backing.nb_sectors;
+
+    acb->type = OP_COPY;
+    acb->copy.buf = buf;
+    acb->copy.buffered_sector_begin = begin;
+    acb->copy.buffered_sector_end = end;
+
+    if (s->outstanding_copy_on_read_data < s->max_outstanding_copy_on_read_data
+        && find_region_in_base_img (s, &begin, &end)) {
+        /* Write to the FVD data file. */
+        acb->sector_num = begin;
+        acb->nb_sectors = end - begin;
+        acb->copy.iov.iov_base =
+            buf + (begin - acb->copy.buffered_sector_begin) * 512;
+        acb->copy.iov.iov_len = acb->nb_sectors * 512;
+        qemu_iovec_init_external (&acb->copy.qiov, &acb->copy.iov, 1);
+        QDEBUG ("READ: acb%llu-%p  copy_on_read  buffer_sector_num=%" PRId64
+                " buffer_nb_sectors=%d write_sector_num=%" PRId64
+                " write_nb_sectors=%d outstanding_copy_on_read=%" PRId64 "\n",
+                acb->uuid, acb, acb->copy.buffered_sector_begin,
+                (int) (acb->copy.buffered_sector_end -
+                       acb->copy.buffered_sector_begin), acb->sector_num,
+                acb->nb_sectors, s->outstanding_copy_on_read_data);
+        acb->copy.hd_acb = store_data (TRUE, acb, bs, acb->sector_num,
+                                       &acb->copy.qiov, acb->nb_sectors,
+                                       finish_copy_on_read, acb);
+        if (acb->copy.hd_acb) {
+            QLIST_INIT (&acb->copy_lock.dependent_writes);
+            acb->copy_lock.begin = begin;
+            acb->copy_lock.end = end;
+            QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
+            s->outstanding_copy_on_read_data += acb->copy.iov.iov_len;
+            return;
+        }
+    }
+
+    /* No more copy-on-read to do. */
+    my_qemu_vfree (acb->copy.buf);
+    my_qemu_aio_release (acb);
+}
+
+static void finish_read_fvd (void *opaque, int ret)
+{
+    FvdAIOCB *acb = opaque;
+
+    QDEBUG ("READ: acb%llu-%p  finish_read_fvd ret=%d\n", acb->uuid, acb, ret);
+    acb->read.read_fvd.hd_acb = NULL;
+    acb->read.read_fvd.done = TRUE;
+    if (acb->read.ret == 0) {
+        acb->read.ret = ret;
+    }
+
+    if (acb->read.read_backing.done) {
+        finish_read (acb);        /* The other request also finished. */
+    }
+}
+
+static void finish_read_backing_for_copy_on_read (void *opaque, int ret)
+{
+    FvdAIOCB *acb = opaque;
+
+    QDEBUG ("READ: acb%llu-%p  finish_read_backing ret=%d\n", acb->uuid, acb,
+            ret);
+    acb->read.read_backing.hd_acb = NULL;
+    acb->read.read_backing.done = TRUE;
+    if (acb->read.ret == 0) {
+        acb->read.ret = ret;
+    }
+
+    if (acb->read.read_fvd.done) {
+        finish_read (acb);        /* The other request also finished. */
+    }
+}
+
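+/* Scan the request [sector_num, sector_num + nb_sectors) at block
+ * granularity and report which parts must be read from the backing image and
+ * which from the FVD data file. An output value of -1 means the request
+ * needs no data from that source. */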
+static inline void calc_read_region (BDRVFvdState * s, int64_t sector_num,
+                                     int nb_sectors,
+                                     int64_t * p_first_sec_in_fvd,
+                                     int64_t * p_last_sec_in_fvd,
+                                     int64_t * p_first_sec_in_backing,
+                                     int64_t * p_last_sec_in_backing)
+{
+    int64_t last_sec_in_backing = -1, first_sec_in_backing = -1;
+    int64_t last_sec_in_fvd = -1, first_sec_in_fvd = -1;
+    int prev_block_in_backing;
+
+    if (fresh_bitmap_show_sector_in_base_img (sector_num, s)) {
+        first_sec_in_backing = last_sec_in_backing = sector_num;
+        prev_block_in_backing = TRUE;
+    } else {
+        first_sec_in_fvd = last_sec_in_fvd = sector_num;
+        prev_block_in_backing = FALSE;
+    }
+
+    /* Begin of next block. */
+    int64_t sec = ROUND_UP (sector_num + 1, s->block_size);
+
+    const int64_t sec_end = sector_num + nb_sectors;
+    int64_t last_sec = MIN (sec_end, s->nb_sectors_in_base_img) - 1;
+
+    while (1) {
+        if (sec > last_sec) {
+            sec = last_sec;
+        }
+
+        if (fresh_bitmap_show_sector_in_base_img (sec, s)) {
+            if (first_sec_in_backing < 0) {
+                first_sec_in_backing = sec;
+            }
+            if (!prev_block_in_backing) {
+                last_sec_in_fvd = sec - 1;
+                prev_block_in_backing = TRUE;
+            }
+            last_sec_in_backing = sec;
+        } else {
+            if (first_sec_in_fvd < 0) {
+                first_sec_in_fvd = sec;
+            }
+            if (prev_block_in_backing) {
+                last_sec_in_backing = sec - 1;
+                prev_block_in_backing = FALSE;
+            }
+            last_sec_in_fvd = sec;
+        }
+
+        if (sec == last_sec) {
+            break;
+        }
+        sec += s->block_size;
+    }
+
+    if (sec_end > s->nb_sectors_in_base_img) {
+        if (first_sec_in_fvd < 0) {
+            first_sec_in_fvd = s->nb_sectors_in_base_img;
+        }
+        last_sec_in_fvd = sec_end - 1;
+    }
+
+    *p_first_sec_in_fvd = first_sec_in_fvd;
+    *p_last_sec_in_fvd = last_sec_in_fvd;
+    *p_first_sec_in_backing = first_sec_in_backing;
+    *p_last_sec_in_backing = last_sec_in_backing;
+}
+
+static void fvd_read_cancel (FvdAIOCB * acb)
+{
+    if (acb->read.read_backing.hd_acb) {
+        bdrv_aio_cancel (acb->read.read_backing.hd_acb);
+    }
+    if (acb->read.read_fvd.hd_acb) {
+        bdrv_aio_cancel (acb->read.read_fvd.hd_acb);
+    }
+    if (acb->read.read_backing.iov.iov_base) {
+        my_qemu_vfree (acb->read.read_backing.iov.iov_base);
+    }
+    if (acb->read.read_fvd.iov.iov_base) {
+        my_qemu_vfree (acb->read.read_fvd.iov.iov_base);
+    }
+    my_qemu_aio_release (acb);
+}
+
+static void fvd_copy_cancel (FvdAIOCB * acb)
+{
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    if (acb->copy.hd_acb) {
+        bdrv_aio_cancel (acb->copy.hd_acb);
+    }
+    if (acb->copy_lock.next.le_prev != NULL) {
+        QLIST_REMOVE (acb, copy_lock.next);
+        restart_dependent_writes (acb);
+    }
+    my_qemu_vfree (acb->copy.buf);
+    if (acb->common.cb != null_prefetch_cb) {
+        /* This is a copy-on-read operation. */
+        s->outstanding_copy_on_read_data -= acb->nb_sectors * 512;
+    }
+    my_qemu_aio_release (acb);
+}
+
+static void restart_dependent_writes (FvdAIOCB * acb)
+{
+    acb->copy_lock.next.le_prev = NULL;
+    FvdAIOCB *req = acb->copy_lock.dependent_writes.lh_first;
+
+    while (req) {
+        /* Keep a copy of 'next' as it may be changed in do_aio_write(). */
+        FvdAIOCB *next = req->write.next_dependent_write.le_next;
+
+        /* Indicate that this write is no longer on any dependent list. This
+         * helps fvd_read_cancel() work properly. */
+        req->write.next_dependent_write.le_prev = NULL;
+
+        if (acb->type == OP_WRITE) {
+            QDEBUG ("WRITE: acb%llu-%p  finished_and_restart_conflict_write "
+                    "acb%llu-%p\n", acb->uuid, acb, req->uuid, req);
+        } else {
+            QDEBUG ("READ: copy_on_read acb%llu-%p  "
+                    "finished_and_restart_conflict_write acb%llu-%p\n",
+                    acb->uuid, acb, req->uuid, req);
+        }
+
+        if (do_aio_write (req) < 0) {
+            QDEBUG ("WRITE: acb%llu-%p  finished with error ret=%d\n",
+                    req->uuid, req, -1);
+            req->common.cb (req->common.opaque, -1);
+            my_qemu_aio_release (req);
+        }
+
+        req = next;
+    }
+}
diff --git a/block/fvd-store.c b/block/fvd-store.c
new file mode 100644
index 0000000..ae7f045
--- /dev/null
+++ b/block/fvd-store.c
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this FVD module implements storing data to a
+ *  compact image.
+ *===========================================================================*/
+
+static uint32_t allocate_chunk (BlockDriverState * bs);
+static inline FvdAIOCB *init_store_acb (int soft_write,
+                                        QEMUIOVector * orig_qiov,
+                                        BlockDriverState * bs,
+                                        int64_t sector_num, int nb_sectors,
+                                        FvdAIOCB * parent_acb,
+                                        BlockDriverCompletionFunc * cb,
+                                        void *opaque);
+static void finish_store_data_in_compact_image (void *opaque, int ret);
+
+static inline BlockDriverAIOCB *store_data (int soft_write,
+                                            FvdAIOCB * parent_acb,
+                                            BlockDriverState * bs,
+                                            int64_t sector_num,
+                                            QEMUIOVector * orig_qiov,
+                                            int nb_sectors,
+                                            BlockDriverCompletionFunc * cb,
+                                            void *opaque)
+{
+    BDRVFvdState *s = bs->opaque;
+
+    TRACE_STORE_IN_FVD ("store_data", sector_num, nb_sectors);
+
+    if (!s->table) {
+        /* Write directly since it is not a compact image. */
+        return bdrv_aio_writev (s->fvd_data, s->data_offset + sector_num,
+                                orig_qiov, nb_sectors, cb, opaque);
+    } else {
+        return store_data_in_compact_image (NULL, soft_write, parent_acb, bs,
+                                            sector_num, orig_qiov, nb_sectors,
+                                            cb, opaque);
+    }
+}
+
+/* Store data in the compact image. The argument 'soft_write' means
+ * the store was caused by copy-on-read or prefetching, which need not
+ * update metadata immediately. */
+static BlockDriverAIOCB *store_data_in_compact_image (FvdAIOCB * acb,
+                                                      int soft_write,
+                                                      FvdAIOCB * parent_acb,
+                                                      BlockDriverState * bs,
+                                                      int64_t sector_num,
+                                                      QEMUIOVector * orig_qiov,
+                                                      const int nb_sectors,
+                                                      BlockDriverCompletionFunc
+                                                      * cb, void *opaque)
+{
+    BDRVFvdState *s = bs->opaque;
+
+    const uint32_t first_chunk = sector_num / s->chunk_size;
+    const uint32_t last_chunk = (sector_num + nb_sectors - 1) / s->chunk_size;
+    int table_dirty = FALSE;
+    uint32_t chunk;
+    int64_t start_sec;
+
+    /* Check if storage space is allocated. */
+    for (chunk = first_chunk; chunk <= last_chunk; chunk++) {
+        if (IS_EMPTY (s->table[chunk])) {
+            uint32_t id = allocate_chunk (bs);
+            if (IS_EMPTY (id)) {
+                return NULL;
+            }
+            id |= DIRTY_TABLE;
+            WRITE_TABLE (s->table[chunk], id);
+
+            table_dirty = TRUE;
+        } else if (IS_DIRTY (s->table[chunk])) {
+            /* This is possible if a previous soft-write allocated the storage
+             * space but did not flush the table entry change to the journal
+             * and hence did not clean the dirty bit. This is also possible
+             * with two concurrent hard-writes. The first hard-write allocated
+             * the storage space but has not flushed the table entry change to
+             * the journal yet and hence the table entry remains dirty. In
+             * this case, the second hard-write will also try to flush this
+             * dirty table entry to the journal. The outcome is correct since
+             * they store the same metadata change in the journal (although
+             * twice). For this race condition, we prefer to have two writes
+             * to the journal rather than introducing a locking mechanism,
+             * because this happens rarely and those two writes to the journal
+             * are likely to be merged by the kernel into a single write since
+             * they are likely to update back-to-back sectors in the journal.
+             * A locking mechanism would be less efficient, because the large
+             * size of chunks would cause unnecessary locking due to ``false
+             * sharing'' of a chunk by two writes. */
+            table_dirty = TRUE;
+        }
+    }
+
+    const int update_table = (!soft_write && table_dirty);
+    size_t iov_left;
+    uint8_t *iov_buf;
+    int nb, iov_index, nqiov, niov;
+    uint32_t prev;
+
+    if (first_chunk == last_chunk) {
+        goto handle_one_continuous_region;
+    }
+
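+    /* The write may span chunks that are not contiguous in the compact
+     * image. The first pass below only counts how many child requests
+     * (nqiov) and iov entries (niov) are needed; the second pass then
+     * allocates their metadata in a single buffer and submits one
+     * bdrv_aio_writev() per continuous region. */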
+    /* Count the number of qiov and iov needed to cover the continuous regions
+     * of the compact image. */
+    iov_left = orig_qiov->iov[0].iov_len;
+    iov_buf = orig_qiov->iov[0].iov_base;
+    iov_index = 0;
+    nqiov = 0;
+    niov = 0;
+    prev = READ_TABLE (s->table[first_chunk]);
+
+    /* Data in the first chunk. */
+    nb = s->chunk_size - (sector_num % s->chunk_size);
+
+    for (chunk = first_chunk + 1; chunk <= last_chunk; chunk++) {
+        uint32_t current = READ_TABLE (s->table[chunk]);
+        int64_t data_size;
+        if (chunk < last_chunk) {
+            data_size = s->chunk_size;
+        } else {
+            data_size = (sector_num + nb_sectors) % s->chunk_size;
+            if (data_size == 0) {
+                data_size = s->chunk_size;
+            }
+        }
+
+        if (current == prev + 1) {
+            nb += data_size;        /* Continue the previous region. */
+        } else {
+            /* Terminate the previous region. */
+            niov +=
+                count_iov (orig_qiov->iov, &iov_index, &iov_buf, &iov_left,
+                           nb * 512);
+            nqiov++;
+            nb = data_size;        /* Data in the new region. */
+        }
+        prev = current;
+    }
+
+    if (nqiov == 0) {
+      handle_one_continuous_region:
+        /* A simple case. All data can be written out in one qiov and no new
+         * chunks are allocated. */
+        start_sec = READ_TABLE (s->table[first_chunk]) * s->chunk_size +
+                                        (sector_num % s->chunk_size);
+
+        if (!update_table && !acb) {
+            if (parent_acb) {
+                QDEBUG ("STORE: acb%llu-%p  "
+                        "store_directly_without_table_update\n",
+                        parent_acb->uuid, parent_acb);
+            }
+            return bdrv_aio_writev (s->fvd_data, s->data_offset + start_sec,
+                                    orig_qiov, nb_sectors, cb, opaque);
+        }
+
+        if (!acb && !(acb = init_store_acb (soft_write, orig_qiov, bs,
+                            sector_num, nb_sectors, parent_acb, cb, opaque))) {
+            return NULL;
+        }
+
+        QDEBUG ("STORE: acb%llu-%p  store_directly  sector_num=%" PRId64
+                " nb_sectors=%d\n", acb->uuid, acb, acb->sector_num,
+                acb->nb_sectors);
+
+        acb->store.update_table = update_table;
+        acb->store.num_children = 1;
+        acb->store.one_child.hd_acb =
+            bdrv_aio_writev (s->fvd_data, s->data_offset + start_sec, orig_qiov,
+                             nb_sectors, finish_store_data_in_compact_image,
+                             &acb->store.one_child);
+        if (acb->store.one_child.hd_acb) {
+            acb->store.one_child.acb = acb;
+            return &acb->common;
+        } else {
+            my_qemu_aio_release (acb);
+            return NULL;
+        }
+    }
+
+    /* qiov for the last continuous region. */
+    niov += count_iov (orig_qiov->iov, &iov_index, &iov_buf,
+                       &iov_left, nb * 512);
+    nqiov++;
+    ASSERT (iov_index == orig_qiov->niov - 1 && iov_left == 0);
+
+    /* Need to submit multiple requests to the lower layer. */
+    if (!acb && !(acb = init_store_acb (soft_write, orig_qiov, bs, sector_num,
+                                        nb_sectors, parent_acb, cb, opaque))) {
+        return NULL;
+    }
+    acb->store.update_table = update_table;
+    acb->store.num_children = nqiov;
+
+    if (!parent_acb) {
+        QDEBUG ("STORE: acb%llu-%p  start  sector_num=%" PRId64
+                " nb_sectors=%d\n", acb->uuid, acb, acb->sector_num,
+                acb->nb_sectors);
+    }
+
+    /* Allocate memory and create multiple requests. */
+    const size_t metadata_size = nqiov * (sizeof (CompactChildCB) +
+                                          sizeof (QEMUIOVector))
+                                    + niov * sizeof (struct iovec);
+    acb->store.children = (CompactChildCB *) my_qemu_malloc (metadata_size);
+    QEMUIOVector *q = (QEMUIOVector *) (acb->store.children + nqiov);
+    struct iovec *v = (struct iovec *) (q + nqiov);
+
+    start_sec = READ_TABLE (s->table[first_chunk]) * s->chunk_size +
+                                        (sector_num % s->chunk_size);
+    nqiov = 0;
+    iov_index = 0;
+    iov_left = orig_qiov->iov[0].iov_len;
+    iov_buf = orig_qiov->iov[0].iov_base;
+    prev = READ_TABLE (s->table[first_chunk]);
+
+    /* Data in the first chunk. */
+    if (first_chunk == last_chunk) {
+        nb = nb_sectors;
+    } else {
+        nb = s->chunk_size - (sector_num % s->chunk_size);
+    }
+
+    for (chunk = first_chunk + 1; chunk <= last_chunk; chunk++) {
+        uint32_t current = READ_TABLE (s->table[chunk]);
+        int64_t data_size;
+        if (chunk < last_chunk) {
+            data_size = s->chunk_size;
+        } else {
+            data_size = (sector_num + nb_sectors) % s->chunk_size;
+            if (data_size == 0) {
+                data_size = s->chunk_size;
+            }
+        }
+
+        if (current == prev + 1) {
+            nb += data_size;        /* Continue the previous region. */
+        } else {
+            /* Terminate the previous continuous region. */
+            niov = setup_iov (orig_qiov->iov, v, &iov_index,
+                              &iov_buf, &iov_left, nb * 512);
+            qemu_iovec_init_external (q, v, niov);
+            QDEBUG ("STORE: acb%llu-%p  create_child %d sector_num=%" PRId64
+                    " nb_sectors=%d niov=%d\n", acb->uuid, acb, nqiov,
+                    start_sec, q->size / 512, q->niov);
+            acb->store.children[nqiov].hd_acb =
+                bdrv_aio_writev (s->fvd_data, s->data_offset + start_sec, q,
+                                 q->size / 512,
+                                 finish_store_data_in_compact_image,
+                                 &acb->store.children[nqiov]);
+            if (!acb->store.children[nqiov].hd_acb) {
+                goto fail;
+            }
+            acb->store.children[nqiov].acb = acb;
+            v += niov;
+            q++;
+            nqiov++;
+            start_sec = current * s->chunk_size; /* Begin of the new region. */
+            nb = data_size;        /* Data in the new region. */
+        }
+        prev = current;
+    }
+
+    /* Request for the last chunk. */
+    niov = setup_iov (orig_qiov->iov, v, &iov_index, &iov_buf,
+                      &iov_left, nb * 512);
+    ASSERT (iov_index == orig_qiov->niov - 1 && iov_left == 0);
+    qemu_iovec_init_external (q, v, niov);
+
+    QDEBUG ("STORE: acb%llu-%p  create_child_last %d sector_num=%" PRId64
+            " nb_sectors=%d niov=%d\n", acb->uuid, acb, nqiov, start_sec,
+            q->size / 512, q->niov);
+    acb->store.children[nqiov].hd_acb =
+        bdrv_aio_writev (s->fvd_data, s->data_offset + start_sec, q,
+                         q->size / 512, finish_store_data_in_compact_image,
+                         &acb->store.children[nqiov]);
+    if (acb->store.children[nqiov].hd_acb) {
+        acb->store.children[nqiov].acb = acb;
+        return &acb->common;
+    }
+
+    int i;
+  fail:
+    QDEBUG ("STORE: acb%llu-%p  failed\n", acb->uuid, acb);
+    for (i = 0; i < nqiov; i++) {
+        bdrv_aio_cancel (acb->store.children[i].hd_acb);
+    }
+    my_qemu_free (acb->store.children);
+    my_qemu_aio_release (acb);
+    return NULL;
+}
+
+static uint32_t allocate_chunk (BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+
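+    /* Chunks are allocated sequentially from the end of the used storage
+     * area; the returned id identifies a region of chunk_size sectors in the
+     * data file. */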
+    /* Check if there is sufficient storage space. */
+    if (s->used_storage + s->chunk_size > s->data_storage) {
+        if (s->add_storage_cmd) {
+            if (system (s->add_storage_cmd)) {
+                fprintf (stderr, "Error in executing %s\n", s->add_storage_cmd);
+            }
+        } else {
+            /* If the image is stored on a file system, the image file size
+             * can be increased by bdrv_truncate. */
+            int64_t new_size = (s->data_offset + s->used_storage +
+                                s->storage_grow_unit) * 512;
+            bdrv_truncate (s->fvd_data, new_size);
+        }
+
+        /* Check how much storage is available now. */
+        int64_t size = bdrv_getlength (s->fvd_data);
+        if (size < 0) {
+            fprintf (stderr, "Error in bdrv_getlength(%s)\n", bs->filename);
+            return EMPTY_TABLE;
+        }
+        s->data_storage = size / 512 - s->data_offset;
+        if (s->used_storage + s->chunk_size > s->data_storage) {
+            fprintf (stderr, "Could not allocate more storage space.\n");
+            return EMPTY_TABLE;
+        }
+
+        QDEBUG ("Increased storage to %" PRId64 " bytes.\n", size);
+    }
+
+    uint32_t allocated_chunk_id = s->used_storage / s->chunk_size;
+    s->used_storage += s->chunk_size;
+    return allocated_chunk_id;
+}
+
+static void finish_store_data_in_compact_image (void *opaque, int ret)
+{
+    CompactChildCB *child = opaque;
+    FvdAIOCB *acb = child->acb;
+
+    /* Now fvd_store_compact_cancel(), if invoked, won't cancel this child
+     * request. */
+    child->hd_acb = NULL;
+
+    if (acb->store.ret == 0) {
+        acb->store.ret = ret;
+    } else {
+        QDEBUG ("STORE: acb%llu-%p  store_child=%d total_children=%d error "
+                "ret=%d\n", acb->uuid, acb, acb->store.finished_children,
+             acb->store.num_children, ret);
+    }
+
+    acb->store.finished_children++;
+    if (acb->store.finished_children < acb->store.num_children) {
+        QDEBUG ("STORE: acb%llu-%p  store_finished_children=%d "
+                "total_children=%d\n", acb->uuid, acb,
+                acb->store.finished_children, acb->store.num_children);
+        return;
+    }
+
+    /* All child requests finished. Free buffers. */
+    if (acb->store.children) {
+        my_qemu_free (acb->store.children);
+        acb->store.children = NULL;
+    }
+
+    if (acb->store.ret) {        /* error */
+        QDEBUG ("STORE: acb%llu-%p  "
+                "store_last_child_finished_with_error ret=%d\n",
+                acb->uuid, acb, acb->store.ret);
+        acb->common.cb (acb->common.opaque, acb->store.ret);
+        my_qemu_aio_release (acb);
+        return;
+    }
+
+    if (!acb->store.update_table) {
+        QDEBUG ("STORE: acb%llu-%p  "
+                "store_last_child_finished_without_table_update\n",
+                acb->uuid, acb);
+        acb->common.cb (acb->common.opaque, acb->store.ret);
+        my_qemu_aio_release (acb);
+        return;
+    }
+
+    /* Check whether the table entries are still dirty. Note that while this
+     * write was being saved to disk, other writes might have already flushed
+     * the dirty table entries to the journal. If those table entries are no
+     * longer dirty, then depending on the behavior of parent_acb, this
+     * request might be able to skip the journal update. */
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+    uint32_t first_chunk = acb->sector_num / s->chunk_size;
+    const uint32_t last_chunk =
+        (acb->sector_num + acb->nb_sectors - 1) / s->chunk_size;
+    int update_table = FALSE;
+    uint32_t chunk;
+    for (chunk = first_chunk; chunk <= last_chunk; chunk++) {
+        if (IS_DIRTY (s->table[chunk])) {
+            update_table = TRUE;
+            break;
+        }
+    }
+
+    if (acb->store.parent_acb) {
+        /* Metadata update will be handled by the parent write. */
+        ASSERT (acb->store.parent_acb->type == OP_WRITE);
+        QDEBUG ("STORE: acb%llu-%p  "
+                "store_last_child_finished_with_parent_do_table_update\n",
+                acb->uuid, acb);
+        acb->store.parent_acb->write.update_table = update_table;
+        acb->common.cb (acb->common.opaque, acb->store.ret);
+        my_qemu_aio_release (acb);
+        return;
+    }
+
+    if (update_table) {
+        QDEBUG ("STORE: acb%llu-%p  "
+                "store_last_child_finished_and_start_table_update\n",
+                acb->uuid, acb);
+        write_metadata_to_journal (acb);
+    } else {
+        QDEBUG ("STORE: acb%llu-%p  "
+                "store_last_child_finished_without_table_update\n",
+                acb->uuid, acb);
+        acb->common.cb (acb->common.opaque, acb->store.ret);
+        my_qemu_aio_release (acb);
+    }
+}
+
+static inline FvdAIOCB *init_store_acb (int soft_write,
+                                        QEMUIOVector * orig_qiov,
+                                        BlockDriverState * bs,
+                                        int64_t sector_num, int nb_sectors,
+                                        FvdAIOCB * parent_acb,
+                                        BlockDriverCompletionFunc * cb,
+                                        void *opaque)
+{
+    FvdAIOCB *acb = my_qemu_aio_get (&fvd_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+    acb->type = OP_STORE_COMPACT;
+    acb->sector_num = sector_num;
+    acb->nb_sectors = nb_sectors;
+    acb->store.soft_write = soft_write;
+    acb->store.orig_qiov = orig_qiov;
+    acb->store.parent_acb = parent_acb;
+    acb->store.finished_children = 0;
+    acb->store.num_children = 0;
+    acb->store.one_child.hd_acb = NULL;
+    acb->store.children = NULL;
+    acb->store.ret = 0;
+    acb->jcb.iov.iov_base = NULL;
+    acb->jcb.hd_acb = NULL;
+    acb->jcb.next_wait_for_journal.le_prev = NULL;
+    COPY_UUID (acb, parent_acb);
+
+    return acb;
+}
+
+static void fvd_store_compact_cancel (FvdAIOCB * acb)
+{
+    if (acb->store.children) {
+        int i;
+        for (i = 0; i < acb->store.num_children; i++) {
+            if (acb->store.children[i].hd_acb) {
+                bdrv_aio_cancel (acb->store.children[i].hd_acb);
+            }
+        }
+        my_qemu_free (acb->store.children);
+    }
+    if (acb->store.one_child.hd_acb) {
+        bdrv_aio_cancel (acb->store.one_child.hd_acb);
+    }
+    if (acb->jcb.hd_acb) {
+        bdrv_aio_cancel (acb->jcb.hd_acb);
+        free_journal_sectors (acb->common.bs->opaque);
+    }
+    if (acb->jcb.iov.iov_base != NULL) {
+        my_qemu_vfree (acb->jcb.iov.iov_base);
+    }
+    if (acb->jcb.next_wait_for_journal.le_prev) {
+        QLIST_REMOVE (acb, jcb.next_wait_for_journal);
+    }
+
+    my_qemu_aio_release (acb);
+}
diff --git a/block/fvd-utils.c b/block/fvd-utils.c
new file mode 100644
index 0000000..3f7d4ec
--- /dev/null
+++ b/block/fvd-utils.c
@@ -0,0 +1,612 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*==============================================================================
+ *  A short description: this module implements basic utility functions for
+ *  the Fast Virtual Disk (FVD) format.
+ *============================================================================*/
+
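+/* Each bit of a bitmap covers one block of s->block_size sectors. A 0 bit
+ * means the block's data still lives in the base image; a 1 bit means the
+ * block has been written to the FVD data file. fresh_bitmap is always
+ * updated first; update_stale_bitmap() asserts that the corresponding fresh
+ * bit is already set (unless both pointers refer to the same bitmap). */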
+static inline int stale_bitmap_show_sector_in_base_img (int64_t sector_num,
+                                                const BDRVFvdState * s)
+{
+    if (sector_num >= s->nb_sectors_in_base_img) {
+        return FALSE;
+    }
+
+    int64_t block_num = sector_num / s->block_size;
+    int64_t bitmap_byte_offset = block_num / 8;
+    uint8_t bitmap_bit_offset = block_num % 8;
+    uint8_t b = s->stale_bitmap[bitmap_byte_offset];
+    return 0 == (int) ((b >> bitmap_bit_offset) & 0x01);
+}
+
+static inline int
+fresh_bitmap_show_sector_in_base_img (int64_t sector_num,
+                                              const BDRVFvdState * s)
+{
+    if (sector_num >= s->nb_sectors_in_base_img) {
+        return FALSE;
+    }
+
+    int64_t block_num = sector_num / s->block_size;
+    int64_t bitmap_byte_offset = block_num / 8;
+    uint8_t bitmap_bit_offset = block_num % 8;
+    uint8_t b = s->fresh_bitmap[bitmap_byte_offset];
+    return 0 == (int) ((b >> bitmap_bit_offset) & 0x01);
+}
+
+static inline void update_fresh_bitmap (int64_t sector_num, int nb_sectors,
+                                           const BDRVFvdState * s)
+{
+    if (sector_num >= s->nb_sectors_in_base_img) {
+        return;
+    }
+
+    int64_t end = sector_num + nb_sectors;
+    if (end > s->nb_sectors_in_base_img) {
+        end = s->nb_sectors_in_base_img;
+    }
+
+    int64_t block_num = sector_num / s->block_size;
+    int64_t block_end = (end - 1) / s->block_size;
+
+    for (; block_num <= block_end; block_num++) {
+        int64_t bitmap_byte_offset = block_num / 8;
+        uint8_t bitmap_bit_offset = block_num % 8;
+        uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
+        uint8_t b = s->fresh_bitmap[bitmap_byte_offset];
+        if (!(b & mask)) {
+            b |= mask;
+            s->fresh_bitmap[bitmap_byte_offset] = b;
+        }
+    }
+}
+
+static void update_stale_bitmap (BDRVFvdState * s, int64_t sector_num,
+                                 int nb_sectors)
+{
+    if (sector_num >= s->nb_sectors_in_base_img) {
+        return;
+    }
+
+    int64_t end = sector_num + nb_sectors;
+    if (end > s->nb_sectors_in_base_img) {
+        end = s->nb_sectors_in_base_img;
+    }
+
+    int64_t block_num = sector_num / s->block_size;
+    const int64_t block_end = (end - 1) / s->block_size;
+
+    for (; block_num <= block_end; block_num++) {
+        int64_t bitmap_byte_offset = block_num / 8;
+        uint8_t bitmap_bit_offset = block_num % 8;
+        uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
+        uint8_t b = s->stale_bitmap[bitmap_byte_offset];
+        if (!(b & mask)) {
+            ASSERT (s->stale_bitmap == s->fresh_bitmap ||
+                    (s->fresh_bitmap[bitmap_byte_offset] & mask));
+            b |= mask;
+            s->stale_bitmap[bitmap_byte_offset] = b;
+        }
+    }
+}
+
+static void update_both_bitmaps (BDRVFvdState * s, int64_t sector_num,
+                                 int nb_sectors)
+{
+    if (sector_num >= s->nb_sectors_in_base_img) {
+        return;
+    }
+
+    int64_t end = sector_num + nb_sectors;
+    if (end > s->nb_sectors_in_base_img) {
+        end = s->nb_sectors_in_base_img;
+    }
+
+    int64_t block_num = sector_num / s->block_size;
+    const int64_t block_end = (end - 1) / s->block_size;
+
+    for (; block_num <= block_end; block_num++) {
+        int64_t bitmap_byte_offset = block_num / 8;
+        uint8_t bitmap_bit_offset = block_num % 8;
+        uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
+        uint8_t b = s->fresh_bitmap[bitmap_byte_offset];
+        if (!(b & mask)) {
+            b |= mask;
+            s->fresh_bitmap[bitmap_byte_offset] =
+                s->stale_bitmap[bitmap_byte_offset] = b;
+        }
+    }
+}
+
+/* Return TRUE if a valid region is found. */
+static int find_region_in_base_img (BDRVFvdState * s, int64_t * from,
+                                    int64_t * to)
+{
+    int64_t sec = *from;
+    int64_t last_sec = *to;
+
+    if (last_sec > s->nb_sectors_in_base_img) {
+        last_sec = s->nb_sectors_in_base_img;
+    }
+
+    if (sec >= last_sec) {
+        return FALSE;
+    }
+
+    if (!fresh_bitmap_show_sector_in_base_img (sec, s)) {
+        /* Find the first sector in the base image. */
+
+        sec = ROUND_UP (sec + 1, s->block_size); /* Begin of next block. */
+        while (1) {
+            if (sec >= last_sec) {
+                return FALSE;
+            }
+            if (fresh_bitmap_show_sector_in_base_img (sec, s)) {
+                break;
+            }
+            sec += s->block_size;        /* Begin of the next block. */
+        }
+    }
+
+    /* Find the end of the region in the base image. */
+    int64_t first_sec = sec;
+    sec = ROUND_UP (sec + 1, s->block_size);        /* Begin of next block. */
+    while (1) {
+        if (sec >= last_sec) {
+            sec = last_sec;
+            break;
+        }
+        if (!fresh_bitmap_show_sector_in_base_img (sec, s)) {
+            break;
+        }
+        sec += s->block_size;        /* Begin of the next block. */
+    }
+    last_sec = sec;
+
+    /* Check conflicting copy-on-reads. */
+    FvdAIOCB *old;
+    QLIST_FOREACH (old, &s->copy_locks, copy_lock.next) {
+        if (old->copy_lock.begin <= first_sec
+                && first_sec < old->copy_lock.end) {
+            first_sec = old->copy_lock.end;
+        }
+        if (old->copy_lock.begin < last_sec && last_sec <= old->copy_lock.end) {
+            last_sec = old->copy_lock.begin;
+        }
+    }
+
+    if (first_sec >= last_sec) {
+        return FALSE;        /* The entire region is already covered. */
+    }
+
+    /* This loop cannot be merged with the loop above; otherwise the logic
+     * would be incorrect. It covers the case where an old request spans only
+     * a subset of the region being checked. */
+    QLIST_FOREACH (old, &s->copy_locks, copy_lock.next) {
+        if (first_sec <= old->copy_lock.begin
+            && old->copy_lock.begin < last_sec) {
+            last_sec = old->copy_lock.begin;
+        }
+    }
+
+    /* Check conflicting writes. */
+    QLIST_FOREACH (old, &s->write_locks, write.next_write_lock) {
+        int64_t old_end = old->sector_num + old->nb_sectors;
+        if (old->sector_num <= first_sec && first_sec < old_end) {
+            first_sec = old_end;
+        }
+        if (old->sector_num < last_sec && last_sec <= old_end) {
+            last_sec = old->sector_num;
+        }
+    }
+
+    if (first_sec >= last_sec) {
+        return FALSE;        /* The entire region is already covered. */
+    }
+
+     /* This loop cannot be merged with the loop above; otherwise the logic
+      * would be incorrect. It handles the case where an old request lies
+      * entirely within the region being checked. */
+    QLIST_FOREACH (old, &s->write_locks, write.next_write_lock) {
+        if (first_sec <= old->sector_num && old->sector_num < last_sec) {
+            last_sec = old->sector_num;
+        }
+    }
+
+    ASSERT (first_sec % s->block_size == 0 && (last_sec % s->block_size == 0
+                || last_sec == s->nb_sectors_in_base_img));
+
+    *from = first_sec;
+    *to = last_sec;
+    return TRUE;
+}
+
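+/* Return TRUE if the block containing sector_num is still only in the base
+ * image, according to 'bitmap', which holds the portion of the full bitmap
+ * starting at byte offset 'bitmap_offset'. */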
+static inline int bitmap_show_sector_in_base_img (int64_t sector_num,
+                                                       const BDRVFvdState * s,
+                                                       int bitmap_offset,
+                                                       uint8_t * bitmap)
+{
+    if (sector_num >= s->nb_sectors_in_base_img) {
+        return FALSE;
+    }
+
+    int64_t block_num = sector_num / s->block_size;
+    int64_t bitmap_byte_offset = block_num / 8 - bitmap_offset;
+    uint8_t bitmap_bit_offset = block_num % 8;
+    uint8_t b = bitmap[bitmap_byte_offset];
+    return 0 == (int) ((b >> bitmap_bit_offset) & 0x01);
+}
+
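+/* Copy 'total' bytes from 'source' into the vector 'iov', continuing from the
+ * position given by (*p_index, *p_buf, *p_left), and advance that position. */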
+static inline void copy_to_iov (struct iovec *iov, int *p_index,
+                                uint8_t ** p_buf, int *p_left,
+                                uint8_t * source, int total)
+{
+    int index = *p_index;
+    uint8_t *buf = *p_buf;
+    int left = *p_left;
+
+    if (left <= 0) {
+        index++;
+        buf = iov[index].iov_base;
+        left = iov[index].iov_len;
+    }
+
+    while (1) {
+        if (left >= total) {
+            memcpy (buf, source, total);
+            *p_buf = buf + total;
+            *p_left = left - total;
+            *p_index = index;
+            return;
+        }
+
+        memcpy (buf, source, left);
+        total -= left;
+        source += left;
+        index++;
+        buf = iov[index].iov_base;
+        left = iov[index].iov_len;
+    }
+}
+
+static inline void init_data_region (BDRVFvdState * s)
+{
+    bdrv_truncate (s->fvd_data, s->data_offset * 512 + s->virtual_disk_size);
+    s->data_region_prepared = TRUE;
+}
+
+static inline void update_clean_shutdown_flag (BDRVFvdState * s, int clean)
+{
+    FvdHeader header;
+    if (!read_fvd_header (s, &header)) {
+        header.clean_shutdown = clean;
+
+        if (!update_fvd_header (s, &header)) {
+            QDEBUG ("Set clean_shutdown to %s\n", BOOL (clean));
+        }
+    }
+}
+
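+/* Return TRUE if the write described by 'acb' covers a block whose bit is not
+ * yet set in the stale bitmap, i.e., the on-disk metadata still needs to be
+ * updated for this write. */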
+static inline int stale_bitmap_need_update (FvdAIOCB * acb)
+{
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+    int64_t end = acb->sector_num + acb->nb_sectors;
+
+    if (end > s->nb_sectors_in_base_img) {
+        end = s->nb_sectors_in_base_img;
+    }
+    int64_t block_end = (end - 1) / s->block_size;
+    int64_t block_num = acb->sector_num / s->block_size;
+
+    for (; block_num <= block_end; block_num++) {
+        int64_t bitmap_byte_offset = block_num / 8;
+        uint8_t bitmap_bit_offset = block_num % 8;
+        uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
+        uint8_t b = s->stale_bitmap[bitmap_byte_offset];
+        if (!(b & mask)) {
+            return TRUE;
+        }
+    }
+
+    return FALSE;
+}
+
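+/* Set the fresh_bitmap bits for the region written by 'acb'. Return TRUE if
+ * the stale bitmap (and hence the on-disk metadata) still needs updating. */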
+static int update_fresh_bitmap_and_check_stale_bitmap (FvdAIOCB * acb)
+{
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    if (acb->sector_num >= s->nb_sectors_in_base_img) {
+        return FALSE;
+    }
+
+    int need_update = FALSE;
+    int64_t end = acb->sector_num + acb->nb_sectors;
+
+    if (end > s->nb_sectors_in_base_img) {
+        end = s->nb_sectors_in_base_img;
+    }
+
+    int64_t block_end = (end - 1) / s->block_size;
+    int64_t block_num = acb->sector_num / s->block_size;
+
+    for (; block_num <= block_end; block_num++) {
+        int64_t bitmap_byte_offset = block_num / 8;
+        uint8_t bitmap_bit_offset = block_num % 8;
+        uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
+        uint8_t b = s->stale_bitmap[bitmap_byte_offset];
+        if (b & mask) {
+            /* If the bit in stale_bitmap is set, the corresponding bit in
+             * fresh_bitmap must be set already. */
+            continue;
+        }
+
+        need_update = TRUE;
+        b = s->fresh_bitmap[bitmap_byte_offset];
+        if (!(b & mask)) {
+            b |= mask;
+            s->fresh_bitmap[bitmap_byte_offset] = b;
+        }
+    }
+
+    return need_update;
+}
+
+static void fvd_header_cpu_to_le (FvdHeader * header)
+{
+    cpu_to_le32s (&header->magic);
+    cpu_to_le32s (&header->version);
+    cpu_to_le32s ((uint32_t *) & header->all_data_in_fvd_img);
+    cpu_to_le32s ((uint32_t *) & header->generate_prefetch_profile);
+    cpu_to_le64s ((uint64_t *) & header->metadata_size);
+    cpu_to_le64s ((uint64_t *) & header->virtual_disk_size);
+    cpu_to_le64s ((uint64_t *) & header->base_img_size);
+    cpu_to_le64s ((uint64_t *) & header->max_outstanding_copy_on_read_data);
+    cpu_to_le64s ((uint64_t *) & header->bitmap_offset);
+    cpu_to_le64s ((uint64_t *) & header->prefetch_profile_offset);
+    cpu_to_le64s ((uint64_t *) & header->prefetch_profile_entries);
+    cpu_to_le64s ((uint64_t *) & header->bitmap_size);
+    cpu_to_le32s ((uint32_t *) & header->copy_on_read);
+    cpu_to_le32s ((uint32_t *) & header->need_zero_init);
+    cpu_to_le32s ((uint32_t *) & header->prefetch_start_delay);
+    cpu_to_le32s ((uint32_t *) & header->profile_directed_prefetch_start_delay);
+    cpu_to_le32s ((uint32_t *) & header->num_prefetch_slots);
+    cpu_to_le32s ((uint32_t *) & header->bytes_per_prefetch);
+    cpu_to_le32s ((uint32_t *) & header->prefetch_throttle_time);
+    cpu_to_le32s ((uint32_t *) & header->prefetch_read_throughput_measure_time);
+    cpu_to_le32s ((uint32_t *) &header->prefetch_write_throughput_measure_time);
+    cpu_to_le32s ((uint32_t *) & header->prefetch_perf_calc_alpha);
+    cpu_to_le32s ((uint32_t *) & header->prefetch_min_read_throughput);
+    cpu_to_le32s ((uint32_t *) & header->prefetch_min_write_throughput);
+    cpu_to_le32s ((uint32_t *) & header->prefetch_max_read_throughput);
+    cpu_to_le32s ((uint32_t *) & header->prefetch_max_write_throughput);
+    cpu_to_le32s ((uint32_t *) & header->block_size);
+    cpu_to_le32s ((uint32_t *) & header->unit_of_PrefetchProfileEntry_len);
+    cpu_to_le32s ((uint32_t *) & header->compact_image);
+    cpu_to_le64s ((uint64_t *) & header->chunk_size);
+    cpu_to_le64s ((uint64_t *) & header->storage_grow_unit);
+    cpu_to_le64s ((uint64_t *) & header->table_offset);
+    cpu_to_le32s ((uint32_t *) & header->clean_shutdown);
+    cpu_to_le64s ((uint64_t *) & header->journal_offset);
+    cpu_to_le64s ((uint64_t *) & header->journal_size);
+}
+
+static void fvd_header_le_to_cpu (FvdHeader * header)
+{
+    le32_to_cpus (&header->magic);
+    le32_to_cpus (&header->version);
+    le32_to_cpus ((uint32_t *) & header->all_data_in_fvd_img);
+    le32_to_cpus ((uint32_t *) & header->generate_prefetch_profile);
+    le64_to_cpus ((uint64_t *) & header->metadata_size);
+    le64_to_cpus ((uint64_t *) & header->virtual_disk_size);
+    le64_to_cpus ((uint64_t *) & header->base_img_size);
+    le64_to_cpus ((uint64_t *) & header->max_outstanding_copy_on_read_data);
+    le64_to_cpus ((uint64_t *) & header->bitmap_offset);
+    le64_to_cpus ((uint64_t *) & header->prefetch_profile_offset);
+    le64_to_cpus ((uint64_t *) & header->prefetch_profile_entries);
+    le64_to_cpus ((uint64_t *) & header->bitmap_size);
+    le32_to_cpus ((uint32_t *) & header->copy_on_read);
+    le32_to_cpus ((uint32_t *) & header->need_zero_init);
+    le32_to_cpus ((uint32_t *) & header->prefetch_start_delay);
+    le32_to_cpus ((uint32_t *) & header->profile_directed_prefetch_start_delay);
+    le32_to_cpus ((uint32_t *) & header->num_prefetch_slots);
+    le32_to_cpus ((uint32_t *) & header->bytes_per_prefetch);
+    le32_to_cpus ((uint32_t *) & header->prefetch_throttle_time);
+    le32_to_cpus ((uint32_t *) & header->prefetch_read_throughput_measure_time);
+    le32_to_cpus ((uint32_t *) &header->prefetch_write_throughput_measure_time);
+    le32_to_cpus ((uint32_t *) & header->prefetch_perf_calc_alpha);
+    le32_to_cpus ((uint32_t *) & header->prefetch_min_read_throughput);
+    le32_to_cpus ((uint32_t *) & header->prefetch_min_write_throughput);
+    le32_to_cpus ((uint32_t *) & header->prefetch_max_read_throughput);
+    le32_to_cpus ((uint32_t *) & header->prefetch_max_write_throughput);
+    le32_to_cpus ((uint32_t *) & header->block_size);
+    le32_to_cpus ((uint32_t *) & header->unit_of_PrefetchProfileEntry_len);
+    le32_to_cpus ((uint32_t *) & header->compact_image);
+    le64_to_cpus ((uint64_t *) & header->chunk_size);
+    le64_to_cpus ((uint64_t *) & header->storage_grow_unit);
+    le64_to_cpus ((uint64_t *) & header->table_offset);
+    le32_to_cpus ((uint32_t *) & header->clean_shutdown);
+    le64_to_cpus ((uint64_t *) & header->journal_offset);
+    le64_to_cpus ((uint64_t *) & header->journal_size);
+}
+
+static void flush_metadata_to_disk (BlockDriverState * bs)
+{
+    BDRVFvdState *s = bs->opaque;
+
+    if (bs->read_only || !s->fvd_metadata) {
+        return;
+    }
+
+    if (s->stale_bitmap) {
+        /* Flush fresh_bitmap to disk. */
+        int nb = (int) (s->bitmap_size / 512);
+        QDEBUG ("Flush FVD bitmap (%d sectors) to disk\n", nb);
+        bdrv_write (s->fvd_metadata, s->bitmap_offset, s->fresh_bitmap, nb);
+    }
+
+    if (s->table) {
+        /* Flush table to disk. */
+        int table_entries =
+            (int) (ROUND_UP (s->virtual_disk_size, s->chunk_size * 512) /
+                   (s->chunk_size * 512));
+
+        /* Clean the DIRTY_TABLE bit. */
+        int i;
+        for (i = 0; i < table_entries; i++) {
+            CLEAN_DIRTY (s->table[i]);
+        }
+
+        int64_t table_size = sizeof (uint32_t) * table_entries;
+        table_size = ROUND_UP (table_size, DEF_PAGE_SIZE);
+        int nb = (int) (table_size / 512);
+        QDEBUG ("Flush FVD table (%d sectors) to disk\n", nb);
+        bdrv_write (s->fvd_metadata, s->table_offset, (uint8_t *) s->table, nb);
+    }
+}
+
+static int read_fvd_header (BDRVFvdState * s, FvdHeader * header)
+{
+    if (bdrv_pread (s->fvd_metadata, 0, header, sizeof (FvdHeader)) !=
+        sizeof (FvdHeader)) {
+        fprintf (stderr, "Failed to read the FVD header.\n");
+        return -1;
+    }
+
+    fvd_header_le_to_cpu (header);
+
+    if (header->magic != FVD_MAGIC || header->version != FVD_VERSION) {
+        fprintf (stderr, "Error: image does not have the correct FVD format "
+                 "magic number in header\n");
+        return -1;
+    }
+
+    return 0;
+}
+
+static int update_fvd_header (BDRVFvdState * s, FvdHeader * header)
+{
+    fvd_header_cpu_to_le (header);
+    int ret = bdrv_pwrite (s->fvd_metadata, 0, header, sizeof (FvdHeader));
+
+    if (ret != sizeof (FvdHeader)) {
+        fprintf (stderr, "Failed to update the FVD header.\n");
+        ASSERT (FALSE);
+        return -EIO;
+    }
+
+    return 0;
+}
+
+static void null_prefetch_cb (void *opaque, int ret)
+{
+    /* Nothing to do; this callback is never invoked. It exists only to
+     * distinguish copy-on-read from prefetch. */
+    ASSERT (FALSE);
+}
+
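+/* Count how many entries of 'orig_iov' are needed to cover the next 'total'
+ * bytes, starting from the position (*p_index, *p_buf, *p_left), and advance
+ * that position. */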
+static int count_iov (struct iovec *orig_iov, int *p_index, uint8_t ** p_buf,
+                      size_t * p_left, size_t total)
+{
+    int index = *p_index;
+    uint8_t *buf = *p_buf;
+    int left = *p_left;
+    int count = 0;
+
+    if (left <= 0) {
+        index++;
+        buf = orig_iov[index].iov_base;
+        left = orig_iov[index].iov_len;
+    }
+
+    while (1) {
+        if (left >= total) {
+            *p_buf = buf + total;
+            *p_left = left - total;
+            *p_index = index;
+            return count + 1;
+        }
+
+        total -= left;
+        index++;
+        buf = orig_iov[index].iov_base;
+        left = orig_iov[index].iov_len;
+        count++;
+    }
+}
+
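+/* Fill 'new_iov' with entries covering the next 'total' bytes of 'orig_iov',
+ * starting from the position (*p_index, *p_buf, *p_left). Advance that
+ * position and return the number of entries written to 'new_iov'. */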
+static int setup_iov (struct iovec *orig_iov, struct iovec *new_iov,
+                      int *p_index, uint8_t ** p_buf, size_t * p_left,
+                      size_t total)
+{
+    int index = *p_index;
+    uint8_t *buf = *p_buf;
+    int left = *p_left;
+    int count = 0;
+
+    if (left <= 0) {
+        index++;
+        buf = orig_iov[index].iov_base;
+        left = orig_iov[index].iov_len;
+    }
+
+    while (1) {
+        if (left >= total) {
+            new_iov[count].iov_base = buf;
+            new_iov[count].iov_len = total;
+            *p_buf = buf + total;
+            *p_left = left - total;
+            *p_index = index;
+            return count + 1;
+        }
+
+        new_iov[count].iov_base = buf;
+        new_iov[count].iov_len = left;
+        total -= left;
+        index++;
+        buf = orig_iov[index].iov_base;
+        left = orig_iov[index].iov_len;
+        count++;
+    }
+}
+
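+/* Zero the next 'total' bytes of 'orig_iov', starting from the position
+ * (*p_index, *p_buf, *p_left). Advance that position and return the number
+ * of entries touched. */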
+static int zero_iov (struct iovec *orig_iov, int *p_index, uint8_t ** p_buf,
+                     size_t * p_left, size_t total)
+{
+    int index = *p_index;
+    uint8_t *buf = *p_buf;
+    int left = *p_left;
+    int count = 0;
+
+    if (left <= 0) {
+        index++;
+        buf = orig_iov[index].iov_base;
+        left = orig_iov[index].iov_len;
+    }
+
+    while (1) {
+        if (left >= total) {
+            memset (buf, 0, total);
+            *p_buf = buf + total;
+            *p_left = left - total;
+            *p_index = index;
+            return count + 1;
+        }
+
+        memset (buf, 0, left);
+        total -= left;
+        index++;
+        buf = orig_iov[index].iov_base;
+        left = orig_iov[index].iov_len;
+        count++;
+    }
+}
diff --git a/block/fvd-write.c b/block/fvd-write.c
new file mode 100644
index 0000000..90350ce
--- /dev/null
+++ b/block/fvd-write.c
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) 2010-2011 IBM
+ *
+ * Authors:
+ *         Chunqiang Tang <ctang@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*=============================================================================
+ *  A short description: this module implements bdrv_aio_writev() for FVD.
+ *===========================================================================*/
+
+static BlockDriverAIOCB *fvd_aio_writev (BlockDriverState * bs,
+                                         int64_t sector_num,
+                                         QEMUIOVector * qiov, int nb_sectors,
+                                         BlockDriverCompletionFunc * cb,
+                                         void *opaque)
+{
+    BDRVFvdState *s = bs->opaque;
+    FvdAIOCB *acb;
+
+    TRACE_REQUEST (TRUE, sector_num, nb_sectors);
+
+    if (!s->data_region_prepared) {
+        init_data_region (s);
+    }
+
+    if (s->prefetch_state == PREFETCH_STATE_FINISHED
+        || sector_num >= s->nb_sectors_in_base_img) {
+        /* This is an efficient case. See Section 3.3.5 of the FVD-cow paper.
+         * This also covers the case of no base image. */
+        return store_data (FALSE, NULL, bs, sector_num, qiov,
+                           nb_sectors, cb, opaque);
+    }
+
+    /* Check if all requested sectors are in the FVD data file. */
+    int64_t sec = ROUND_DOWN (sector_num, s->block_size);
+    int64_t sec_in_last_block = ROUND_DOWN (sector_num + nb_sectors - 1,
+                                            s->block_size);
+    do {
+        if (stale_bitmap_show_sector_in_base_img (sec, s)) {
+            goto slow_path;
+        }
+        sec += s->block_size;
+    } while (sec <= sec_in_last_block);
+
+    /* This is the fast path: all requested data are already in the FVD data
+     * file and the bitmap does not need to be updated. */
+    return store_data (FALSE, NULL, bs, sector_num, qiov,
+                       nb_sectors, cb, opaque);
+
+  slow_path:
+    acb = my_qemu_aio_get (&fvd_aio_pool, bs, cb, opaque);
+    if (!acb) {
+        return NULL;
+    }
+
+    acb->type = OP_WRITE;
+    acb->sector_num = sector_num;
+    acb->nb_sectors = nb_sectors;
+    acb->write.ret = 0;
+    acb->write.update_table = FALSE;
+    acb->write.qiov = qiov;
+    acb->write.hd_acb = NULL;
+    acb->write.cow_buf = NULL;
+    acb->copy_lock.next.le_prev = NULL;
+    acb->write.next_write_lock.le_prev = NULL;
+    acb->write.next_dependent_write.le_prev = NULL;
+    acb->jcb.iov.iov_base = NULL;
+    acb->jcb.hd_acb = NULL;
+    acb->jcb.next_wait_for_journal.le_prev = NULL;
+    QLIST_INIT (&acb->copy_lock.dependent_writes);
+
+    QDEBUG ("WRITE: acb%llu-%p  start  sector_num=%" PRId64 " nb_sectors=%d\n",
+            acb->uuid, acb, acb->sector_num, acb->nb_sectors);
+
+    if (do_aio_write (acb) < 0) {
+        my_qemu_aio_release (acb);
+        return NULL;
+    }
+#ifdef FVD_DEBUG
+    pending_local_writes++;
+#endif
+    return &acb->common;
+}
+
+static void fvd_write_cancel (FvdAIOCB * acb)
+{
+    if (acb->write.hd_acb) {
+        bdrv_aio_cancel (acb->write.hd_acb);
+    }
+    if (acb->jcb.hd_acb) {
+        bdrv_aio_cancel (acb->jcb.hd_acb);
+        free_journal_sectors (acb->common.bs->opaque);
+    }
+    if (acb->jcb.next_wait_for_journal.le_prev) {
+        QLIST_REMOVE (acb, jcb.next_wait_for_journal);
+    }
+    if (acb->write.next_dependent_write.le_prev) {
+        QLIST_REMOVE (acb, write.next_dependent_write);
+    }
+    free_write_resource (acb);
+}
+
+static void free_write_resource (FvdAIOCB * acb)
+{
+    if (acb->write.next_write_lock.le_prev) {
+        QLIST_REMOVE (acb, write.next_write_lock);
+    }
+    if (acb->copy_lock.next.le_prev) {
+        QLIST_REMOVE (acb, copy_lock.next);
+        restart_dependent_writes (acb);
+    }
+    if (acb->write.cow_buf) {
+        my_qemu_vfree (acb->write.cow_buf);
+    }
+    if (acb->jcb.iov.iov_base != NULL) {
+        my_qemu_vfree (acb->jcb.iov.iov_base);
+    }
+
+    my_qemu_aio_release (acb);
+
+#ifdef FVD_DEBUG
+    pending_local_writes--;
+#endif
+}
+
+static inline void finish_write (FvdAIOCB * acb, int ret)
+{
+    QDEBUG ("WRITE: acb%llu-%p  completely_finished ret=%d\n", acb->uuid, acb,
+            ret);
+    acb->common.cb (acb->common.opaque, ret);
+    free_write_resource (acb);
+}
+
+static void finish_write_data (void *opaque, int ret)
+{
+    FvdAIOCB *acb = opaque;
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    acb->write.ret = ret;
+    acb->write.hd_acb = NULL;
+
+    if (ret != 0) {
+        QDEBUG ("WRITE: acb%llu-%p  finish_write_data error ret=%d\n",
+                acb->uuid, acb, ret);
+        finish_write (acb, ret);
+        return;
+    }
+
+    QDEBUG ("WRITE: acb%llu-%p  finish_write_data\n", acb->uuid, acb);
+
+    /* Figure out whether to update metadata or not. */
+    if (s->fresh_bitmap == s->stale_bitmap) {
+        /* This is the case if neither copy_on_read nor prefetching is
+         * enabled. Cannot update fresh_bitmap until the on-disk metadata is
+         * updated. */
+        if (acb->write.update_table || stale_bitmap_need_update (acb)) {
+            /* Cannot release the lock on the data yet, since fresh_bitmap has
+             * not been updated. Otherwise, a copy-on-write or copy-on-read
+             * operation might use data from the backing image to overwrite
+             * the data that was just written. */
+            write_metadata_to_journal (acb);
+        } else {
+            finish_write (acb, ret);        /* No need to update metadata. */
+        }
+        return;
+    }
+
+    /* stale_bitmap and fresh_bitmap are different. Now we can update
+     * fresh_bitmap. stale_bitmap will be updated after the on-disk metadata
+     * are updated. */
+    int update_stale_bitmap = update_fresh_bitmap_and_check_stale_bitmap (acb);
+
+    if (acb->write.update_table || update_stale_bitmap) {
+        /* Release lock on data now since fresh_bitmap has been updated. */
+        QLIST_REMOVE (acb, write.next_write_lock);
+        acb->write.next_write_lock.le_prev = NULL;
+        if (acb->copy_lock.next.le_prev) {
+            QLIST_REMOVE (acb, copy_lock.next);
+            restart_dependent_writes (acb);
+        }
+
+        write_metadata_to_journal (acb);
+    } else {
+        finish_write (acb, ret);
+    }
+}
+
+static void finish_read_backing_for_copy_on_write (void *opaque, int ret)
+{
+    FvdAIOCB *acb = (FvdAIOCB *) opaque;
+    BlockDriverState *bs = acb->common.bs;
+
+    if (ret != 0) {
+        QDEBUG ("WRITE: acb%llu-%p  finish_read_from_backing with error "
+                "ret=%d\n", acb->uuid, acb, ret);
+        finish_write (acb, ret);
+    } else {
+        QDEBUG ("WRITE: acb%llu-%p  "
+                "finish_read_from_backing_and_start_write_data\n",
+                acb->uuid, acb);
+        acb->write.hd_acb = store_data (FALSE, acb, bs,
+                                        acb->write.cow_start_sector,
+                                        acb->write.cow_qiov,
+                                        acb->write.cow_qiov->size / 512,
+                                        finish_write_data, acb);
+        if (!acb->write.hd_acb) {
+            finish_write (acb, -1);
+        }
+    }
+}
+
+static int do_aio_write (FvdAIOCB * acb)
+{
+    BlockDriverState *bs = acb->common.bs;
+    BDRVFvdState *s = bs->opaque;
+
+    /* Calculate the data region that needs to be locked. */
+    const int64_t sector_end = acb->sector_num + acb->nb_sectors;
+    const int64_t block_begin = ROUND_DOWN (acb->sector_num, s->block_size);
+    int64_t block_end = ROUND_UP (sector_end, s->block_size);
+
+    /* Check for conflicting copy-on-reads. */
+    FvdAIOCB *old;
+    QLIST_FOREACH (old, &s->copy_locks, copy_lock.next) {
+        if (old->copy_lock.end > acb->sector_num &&
+            sector_end > old->copy_lock.begin) {
+            QLIST_INSERT_HEAD (&old->copy_lock.dependent_writes, acb,
+                               write.next_dependent_write);
+            QDEBUG ("WRITE: acb%llu-%p  put_on_hold_due_to_data_conflict "
+                    "with %s acb%llu-%p\n", acb->uuid, acb,
+                    old->type == OP_WRITE ? "write" : "copy_on_read",
+                    old->uuid, old);
+            return 0;
+        }
+    }
+
+    /* No conflict. Now check whether this write updates partial blocks and
+     * hence needs to read those blocks from the base image and merge them
+     * with this write. */
+    int read_first_block, read_last_block;
+    if (acb->sector_num % s->block_size == 0) {
+        read_first_block = FALSE;
+    } else if (fresh_bitmap_show_sector_in_base_img (acb->sector_num, s)) {
+        read_first_block = TRUE;
+    } else {
+        read_first_block = FALSE;
+    }
+
+    if (sector_end % s->block_size == 0) {
+        read_last_block = FALSE;
+    } else if (fresh_bitmap_show_sector_in_base_img (sector_end - 1, s)) {
+        read_last_block = TRUE;
+    } else {
+        read_last_block = FALSE;
+    }
+
+    if (read_first_block) {
+        if (read_last_block) {
+            /* Case 1: Read all the blocks involved from the base image. */
+            const QEMUIOVector *old_qiov = acb->write.qiov;
+            if (block_end > s->nb_sectors_in_base_img) {
+                block_end = s->nb_sectors_in_base_img;
+            }
+
+            int buf_size = (block_end - block_begin) * 512
+                    + 2 * sizeof (QEMUIOVector)
+                    + sizeof (struct iovec) * (old_qiov->niov + 3);
+            buf_size = ROUND_UP (buf_size, 512);
+            acb->write.cow_buf = my_qemu_blockalign (bs->backing_hd, buf_size);
+
+            /* For reading from the base image. */
+            QEMUIOVector *read_qiov = (QEMUIOVector *) (acb->write.cow_buf +
+                                  (block_end - block_begin) * 512);
+            read_qiov->iov = (struct iovec *) (read_qiov + 1);
+            read_qiov->nalloc = -1;
+            read_qiov->niov = 1;
+            read_qiov->iov[0].iov_base = acb->write.cow_buf;
+            read_qiov->iov[0].iov_len = read_qiov->size =
+                (block_end - block_begin) * 512;
+
+            /* For writing to the FVD data file. */
+            QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
+            write_qiov->iov = (struct iovec *) (write_qiov + 1);
+            write_qiov->nalloc = -1;
+            write_qiov->niov = old_qiov->niov + 2;
+            write_qiov->size = read_qiov->size;
+
+            /* The first entry is for data read from the base image. */
+            write_qiov->iov[0].iov_base = acb->write.cow_buf;
+            write_qiov->iov[0].iov_len = (acb->sector_num - block_begin) * 512;
+            memcpy (&write_qiov->iov[1], old_qiov->iov,
+                    sizeof (struct iovec) * old_qiov->niov);
+
+            /* The last entry is for data read from the base image. */
+            write_qiov->iov[old_qiov->niov + 1].iov_base = acb->write.cow_buf
+                                            + (sector_end - block_begin) * 512;
+            write_qiov->iov[old_qiov->niov + 1].iov_len =
+                                                (block_end - sector_end) * 512;
+            acb->write.cow_qiov = write_qiov;
+            acb->write.cow_start_sector = block_begin;
+
+            acb->write.hd_acb = bdrv_aio_readv (bs->backing_hd, block_begin,
+                                    read_qiov, block_end - block_begin,
+                                    finish_read_backing_for_copy_on_write, acb);
+            if (!acb->write.hd_acb) {
+                goto fail;
+            }
+
+            acb->copy_lock.begin = block_begin;
+            acb->copy_lock.end = block_end;
+            QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
+            QDEBUG ("WRITE: acb%llu-%p  "
+                    "read_first_last_partial_blocks_from_backing  sector_num=%"
+                    PRId64 " nb_sectors=%d\n", acb->uuid, acb, block_begin,
+                    (int) (block_end - block_begin));
+        } else {
+            /* Case 2: Read the first block from the base image. */
+            int nb = acb->sector_num - block_begin;
+            const QEMUIOVector *old_qiov = acb->write.qiov;
+
+            /* Space for data and metadata. */
+            int buf_size = nb * 512 + 2 * sizeof (QEMUIOVector)
+                                + sizeof (struct iovec) * (old_qiov->niov + 2);
+            buf_size = ROUND_UP (buf_size, 512);
+            acb->write.cow_buf = my_qemu_blockalign (bs->backing_hd, buf_size);
+
+            /* For reading from the base image. */
+            QEMUIOVector *read_qiov =
+                (QEMUIOVector *) (acb->write.cow_buf + nb * 512);
+            read_qiov->iov = (struct iovec *) (read_qiov + 1);
+            read_qiov->nalloc = -1;
+            read_qiov->niov = 1;
+            read_qiov->iov[0].iov_base = acb->write.cow_buf;
+            read_qiov->iov[0].iov_len = read_qiov->size = nb * 512;
+
+            /* For writing to the FVD data file. */
+            QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
+            write_qiov->iov = (struct iovec *) (write_qiov + 1);
+            write_qiov->nalloc = -1;
+            write_qiov->niov = old_qiov->niov + 1;
+            write_qiov->size = old_qiov->size + read_qiov->size;
+
+            /* The first entry is added for data read from the base image. */
+            write_qiov->iov[0].iov_base = acb->write.cow_buf;
+            write_qiov->iov[0].iov_len = read_qiov->size;
+            memcpy (&write_qiov->iov[1], old_qiov->iov,
+                    sizeof (struct iovec) * old_qiov->niov);
+            acb->write.cow_qiov = write_qiov;
+            acb->write.cow_start_sector = block_begin;
+
+            acb->write.hd_acb = bdrv_aio_readv (bs->backing_hd,
+                                    block_begin, read_qiov, nb,
+                                    finish_read_backing_for_copy_on_write, acb);
+            if (!acb->write.hd_acb) {
+                goto fail;
+            }
+
+            acb->copy_lock.begin = block_begin;
+            acb->copy_lock.end = block_begin + s->block_size;
+            QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
+            QDEBUG ("WRITE: acb%llu-%p  read_first_partial_block_from_backing  "
+                    "sector_num=%" PRId64 " nb_sectors=%d\n",
+                    acb->uuid, acb, block_begin, nb);
+        }
+    } else {
+        if (read_last_block) {
+            /* Case 3: Read the last block from the base image. */
+            int nb;
+            if (block_end < s->nb_sectors_in_base_img) {
+                nb = block_end - sector_end;
+            } else {
+                nb = s->nb_sectors_in_base_img - sector_end;
+            }
+            const QEMUIOVector *old_qiov = acb->write.qiov;
+
+            /* Space for data and metadata. */
+            int buf_size = nb * 512 + 2 * sizeof (QEMUIOVector)
+                                + sizeof (struct iovec) * (old_qiov->niov + 2);
+            buf_size = ROUND_UP (buf_size, 512);
+            acb->write.cow_buf = my_qemu_blockalign (bs->backing_hd, buf_size);
+
+            /* For reading from the base image. */
+            QEMUIOVector *read_qiov = (QEMUIOVector *) (acb->write.cow_buf
+                                                        + nb * 512);
+            read_qiov->iov = (struct iovec *) (read_qiov + 1);
+            read_qiov->nalloc = -1;
+            read_qiov->niov = 1;
+            read_qiov->iov[0].iov_base = acb->write.cow_buf;
+            read_qiov->iov[0].iov_len = read_qiov->size = nb * 512;
+
+            /* For writing to the FVD data file. */
+            QEMUIOVector *write_qiov = (QEMUIOVector *) (read_qiov->iov + 1);
+            write_qiov->iov = (struct iovec *) (write_qiov + 1);
+            write_qiov->nalloc = -1;
+            write_qiov->niov = old_qiov->niov + 1;
+            write_qiov->size = old_qiov->size + read_qiov->size;
+            memcpy (write_qiov->iov, old_qiov->iov,
+                    sizeof (struct iovec) * old_qiov->niov);
+
+            /* The last appended entry is for data read from the base image. */
+            write_qiov->iov[old_qiov->niov].iov_base = acb->write.cow_buf;
+            write_qiov->iov[old_qiov->niov].iov_len = read_qiov->size;
+            acb->write.cow_qiov = write_qiov;
+            acb->write.cow_start_sector = acb->sector_num;
+
+            acb->write.hd_acb = bdrv_aio_readv (bs->backing_hd,
+                                    sector_end, read_qiov, nb,
+                                    finish_read_backing_for_copy_on_write, acb);
+            if (!acb->write.hd_acb) {
+                goto fail;
+            }
+
+            acb->copy_lock.end = block_end;
+            acb->copy_lock.begin = block_end - s->block_size;
+            QLIST_INSERT_HEAD (&s->copy_locks, acb, copy_lock.next);
+            QDEBUG ("WRITE: acb%llu-%p  read_last_partial_block_from_backing  "
+                    "sector_num=%" PRId64 " nb_sectors=%d\n",
+                    acb->uuid, acb, sector_end, nb);
+        } else {
+            /* Case 4: Can write directly; no need to merge with data from
+             * the base image. */
+            QDEBUG ("WRITE: acb%llu-%p  "
+                    "write_fvd_without_read_partial_block_from_backing\n",
+                    acb->uuid, acb);
+            acb->write.hd_acb = store_data (FALSE, acb, bs, acb->sector_num,
+                                            acb->write.qiov, acb->nb_sectors,
+                                            finish_write_data, acb);
+            if (!acb->write.hd_acb) {
+                goto fail;
+            }
+        }
+    }
+
+    QLIST_INSERT_HEAD (&s->write_locks, acb, write.next_write_lock);
+    return 0;
+
+  fail:
+    if (acb->write.cow_buf) {
+        my_qemu_vfree (acb->write.cow_buf);
+    }
+    return -1;
+}
-- 
1.7.0.4


* Re: [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1
  2011-01-19 22:04 [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1 Chunqiang Tang
                   ` (3 preceding siblings ...)
  2011-01-19 22:04 ` [Qemu-devel] [PATCH 5/5] Fast Virtual Disk (FVD) Proposal Part 5 Chunqiang Tang
@ 2011-01-20 13:01 ` Christoph Hellwig
  2011-01-20 14:49   ` Chunqiang Tang
  2011-01-21 22:41 ` Anthony Liguori
  5 siblings, 1 reply; 18+ messages in thread
From: Christoph Hellwig @ 2011-01-20 13:01 UTC (permalink / raw)
  To: Chunqiang Tang; +Cc: qemu-devel

On Wed, Jan 19, 2011 at 05:04:44PM -0500, Chunqiang Tang wrote:
> Part 1 of the block device driver for the proposed FVD image format.
> Multiple patches are used in order to manage the size of each patch.
> This patch includes existing files that are modified by FVD.

Please try to split the patches into logical parts, and use descriptive
subject lines for each patch.

E.g. adding the new sim command to qemu-io could be one patch, adding
the img_update (why not just update?) command to qemu-img another,
moving code into qemu-tool-time.c one more, etc.


> -    
> +

Please do not introduce random whitespace changes in patches.


* Re: [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1
  2011-01-20 13:01 ` [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1 Christoph Hellwig
@ 2011-01-20 14:49   ` Chunqiang Tang
  2011-01-20 17:08     ` Stefan Weil
  0 siblings, 1 reply; 18+ messages in thread
From: Chunqiang Tang @ 2011-01-20 14:49 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: qemu-devel

> Please try to split the patches into logical parts, and use descriptive
> subject lines for each patch.
> E.g. adding the new sim command to qemu-io could be one patch, adding
> the img_update (why not just update?) command to qemu-img another,
> moving code into qemu-tool-time.c one more, etc.

Will do and thank you for the detailed instructions. 
 
> > - 
> > +
> Please do not introduce random whitespace changes in patches.

Stefan Weil previously suggested removing spaces at the end of a line, and 
I used a script to do that. It seems that in this example, the old code 
has multiple spaces on an empty line, which were automatically removed by 
the script.


* Re: [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1
  2011-01-20 14:49   ` Chunqiang Tang
@ 2011-01-20 17:08     ` Stefan Weil
  2011-01-22  9:02       ` Peter Maydell
  0 siblings, 1 reply; 18+ messages in thread
From: Stefan Weil @ 2011-01-20 17:08 UTC (permalink / raw)
  To: Chunqiang Tang; +Cc: Christoph Hellwig, qemu-devel

Am 20.01.2011 15:49, schrieb Chunqiang Tang:
>> Please try to split the patches into logical parts, and use descriptive
>> subject lines for each patch.
>> E.g. adding the new sim command to qemu-io could be one patch, adding
>> the img_update (why not just update?) command to qemu-img another,
>> moving code into qemu-tool-time.c one more, etc.
>
> Will do and thank you for the detailed instructions.
>
>>> -
>>> +
>> Please do not introduce random whitespace changes in patches.
>
> Stefan Weil previously suggested removing spaces at the end of a line, 
> and
> I used a script to do that. It seems that in this example, the old code
> has multiple spaces on an empty line, which were automatically removed by
> the script.

Yes, that's a problem with some parts of the old code.
For files which you want to modify, you could remove
the spaces with your script before applying your other
modifications and create a separate patch which only
removes the superfluous spaces.

So your patch series would start with patches which
only remove spaces at line endings (and say so in the
patch descriptions). Then these changes are no longer
random whitespace changes.

Regards,
Stefan Weil


* Re: [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1
  2011-01-19 22:04 [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1 Chunqiang Tang
                   ` (4 preceding siblings ...)
  2011-01-20 13:01 ` [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1 Christoph Hellwig
@ 2011-01-21 22:41 ` Anthony Liguori
  2011-01-22  2:51   ` Chunqiang Tang
  5 siblings, 1 reply; 18+ messages in thread
From: Anthony Liguori @ 2011-01-21 22:41 UTC (permalink / raw)
  To: Chunqiang Tang; +Cc: qemu-devel

On 01/19/2011 04:04 PM, Chunqiang Tang wrote:
> Part 1 of the block device driver for the proposed FVD image format.
> Multiple patches are used in order to manage the size of each patch.
> This patch includes existing files that are modified by FVD.
>
> See the related discussions at
> http://lists.gnu.org/archive/html/qemu-devel/2011-01/msg00426.html .
>
> Signed-off-by: Chunqiang Tang<ctang@us.ibm.com>
> ---
>   Makefile         |   10 +++++---
>   Makefile.objs    |    1 +
>   block.c          |   12 +++++-----
>   block_int.h      |    5 ++-
>   configure        |    2 +-
>   qemu-img-cmds.hx |    6 +++++
>   qemu-img.c       |   62 ++++++++++++++++++++++++++++++++++++++++++++---------
>   qemu-io.c        |    3 ++
>   qemu-option.c    |    4 +++
>   qemu-tool.c      |   36 -------------------------------
>   10 files changed, 81 insertions(+), 60 deletions(-)
>
> diff --git a/Makefile b/Makefile
> index 6d601ee..da4d777 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -151,13 +151,15 @@ version-obj-$(CONFIG_WIN32) += version.o
>   ######################################################################
>
>   qemu-img.o: qemu-img-cmds.h
> -qemu-img.o qemu-tool.o qemu-nbd.o qemu-io.o cmd.o: $(GENERATED_HEADERS)
> +qemu-img.o qemu-tool.o qemu-nbd.o qemu-io.o cmd.o qemu-test.o: $(GENERATED_HEADERS)
>
> -qemu-img$(EXESUF): qemu-img.o qemu-tool.o qemu-error.o $(oslib-obj-y) $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) $(version-obj-y) qemu-timer-common.o
> +qemu-img$(EXESUF): qemu-img.o qemu-tool.o qemu-tool-time.o qemu-error.o $(oslib-obj-y) $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) $(version-obj-y) qemu-timer-common.o
>
> -qemu-nbd$(EXESUF): qemu-nbd.o qemu-tool.o qemu-error.o $(oslib-obj-y) $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) $(version-obj-y) qemu-timer-common.o
> +qemu-nbd$(EXESUF): qemu-nbd.o qemu-tool.o qemu-tool-time.o qemu-error.o $(oslib-obj-y) $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) $(version-obj-y) qemu-timer-common.o
>
> -qemu-io$(EXESUF): qemu-io.o cmd.o qemu-tool.o qemu-error.o $(oslib-obj-y) $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) $(version-obj-y) qemu-timer-common.o
> +qemu-io$(EXESUF): qemu-io.o cmd.o qemu-tool.o qemu-tool-time.o qemu-error.o $(oslib-obj-y) $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) $(version-obj-y) qemu-timer-common.o
> +
> +qemu-test$(EXESUF): qemu-test.o qemu-tool.o qemu-error.o $(oslib-obj-y) $(trace-obj-y) $(block-obj-y) $(qobject-obj-y) $(version-obj-y) qemu-timer-common.o
>
>   qemu-img-cmds.h: $(SRC_PATH)/qemu-img-cmds.hx
>   	$(call quiet-command,sh $(SRC_PATH)/hxtool -h<  $<  >  $@,"  GEN   $@")
> diff --git a/Makefile.objs b/Makefile.objs
> index c3e52c5..c0c1155 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -23,6 +23,7 @@ block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o
>   block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>   block-nested-y += qed-check.o
>   block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
> +block-nested-y += blksim.o fvd.o
>   block-nested-$(CONFIG_WIN32) += raw-win32.o
>   block-nested-$(CONFIG_POSIX) += raw-posix.o
>   block-nested-$(CONFIG_CURL) += curl.o
> diff --git a/block.c b/block.c
> index ff2795b..856bb1a 100644
> --- a/block.c
> +++ b/block.c
> @@ -58,7 +58,7 @@ static int bdrv_read_em(BlockDriverState *bs, int64_t sector_num,
>   static int bdrv_write_em(BlockDriverState *bs, int64_t sector_num,
>                            const uint8_t *buf, int nb_sectors);
>
> -static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
> +QTAILQ_HEAD(, BlockDriverState) bdrv_states =
>       QTAILQ_HEAD_INITIALIZER(bdrv_states);
>    

This looks suspicious and indicates you're doing something bad.
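
If other code only needs to walk the device list, one alternative (sketch
only; the callback name is illustrative) is to keep bdrv_states static and
go through the existing bdrv_iterate() accessor in block.c:

    /* Illustrative: count open block devices without exporting the
     * bdrv_states list head from block.c. */
    static void count_one_bs(void *opaque, BlockDriverState *bs)
    {
        int *count = opaque;

        (*count)++;
    }

    static int count_block_devices(void)
    {
        int count = 0;

        bdrv_iterate(count_one_bs, &count);
        return count;
    }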

>
>   static QLIST_HEAD(, BlockDriver) bdrv_drivers =
> @@ -768,7 +768,7 @@ int bdrv_commit(BlockDriverState *bs)
>
>       if (!drv)
>           return -ENOMEDIUM;
> -
> +
>       if (!bs->backing_hd) {
>           return -ENOTSUP;
>       }
> @@ -1538,7 +1538,7 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
>    * 'nb_sectors' is the max value 'pnum' should be set to.
>    */
>   int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
> -	int *pnum)
> +        int *pnum)
>   {
>       int64_t n;
>       if (!bs->drv->bdrv_is_allocated) {
> @@ -2050,9 +2050,9 @@ BlockDriverAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num,
>                                 cb, opaque);
>
>       if (ret) {
> -	/* Update stats even though technically transfer has not happened. */
> -	bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> -	bs->rd_ops ++;
> +        /* Update stats even though technically transfer has not happened. */
> +        bs->rd_bytes += (unsigned) nb_sectors * BDRV_SECTOR_SIZE;
> +        bs->rd_ops ++;
>       }
>
>       return ret;
> diff --git a/block_int.h b/block_int.h
> index 12663e8..2343d07 100644
> --- a/block_int.h
> +++ b/block_int.h
> @@ -28,8 +28,8 @@
>   #include "qemu-option.h"
>   #include "qemu-queue.h"
>
> -#define BLOCK_FLAG_ENCRYPT	1
> -#define BLOCK_FLAG_COMPAT6	4
> +#define BLOCK_FLAG_ENCRYPT        1
> +#define BLOCK_FLAG_COMPAT6        4
>
>   #define BLOCK_OPT_SIZE          "size"
>   #define BLOCK_OPT_ENCRYPT       "encryption"
> @@ -98,6 +98,7 @@ struct BlockDriver {
>       int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs,
>                                     const char *snapshot_name);
>       int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi);
> +    int (*bdrv_update)(BlockDriverState *bs, int argc, char **argv);
>    

argc/argv is an awful interface because the semantics end up varying 
widely.  If we want to support changing disk format parameters, we 
should use a structured option format so we can ensure it's exposed to 
the user in a consistent way.  IOW, a size is always parsed as 
<integer>[SUFFIX] and not 8 different variations of that theme.
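
For example (just a sketch; the option names and the fvd_update() callback
below are purely illustrative), the driver could publish a
QEMUOptionParameter list and receive the parsed result, the same way
create_options/bdrv_create already work:

    /* In struct BlockDriver, instead of argc/argv (sketch): */
    QEMUOptionParameter *update_options;
    int (*bdrv_update)(BlockDriverState *bs, QEMUOptionParameter *options);

    /* In the FVD driver (option names are illustrative): */
    static QEMUOptionParameter fvd_update_options[] = {
        {
            .name = "copy_on_read",
            .type = OPT_FLAG,
            .help = "enable or disable copy-on-read"
        },
        {
            .name = "prefetch_start_delay",
            .type = OPT_NUMBER,
            .help = "delay in seconds before prefetching starts"
        },
        { NULL }
    };

    static int fvd_update(BlockDriverState *bs, QEMUOptionParameter *options)
    {
        while (options && options->name) {
            if (!strcmp(options->name, "copy_on_read")) {
                /* update the copy_on_read flag in the FVD header ... */
            } else if (!strcmp(options->name, "prefetch_start_delay")) {
                /* update the prefetch delay in the FVD header ... */
            }
            options++;
        }
        return 0;
    }

qemu-img could then parse "-o name=value,..." with the existing option
code, and sizes, numbers and flags would behave the same for every driver.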

>       bs = qemu_mallocz(bs_n * sizeof(BlockDriverState *));
>
>       total_sectors = 0;
> @@ -865,7 +865,7 @@ static int img_convert(int argc, char **argv)
>                      assume that sectors which are unallocated in the input image
>                      are present in both the output's and input's base images (no
>                      need to copy them). */
> -                if (out_baseimg) {
> +                if (out_baseimg || bs[bs_i]->backing_file[0]==0) {
>    

This looks like a bug fix of some sort and should be its own patch with
an explanation.

>                       if (!bdrv_is_allocated(bs[bs_i], sector_num - bs_offset,
>                                              n,&n1)) {
>                           sector_num += n1;
> @@ -941,10 +941,10 @@ static int64_t get_allocated_file_size(const char *filename)
>       /* WinNT support GetCompressedFileSize to determine allocate size */
>       get_compressed = (get_compressed_t) GetProcAddress(GetModuleHandle("kernel32"), "GetCompressedFileSizeA");
>       if (get_compressed) {
> -    	DWORD high, low;
> -    	low = get_compressed(filename,&high);
> -    	if (low != 0xFFFFFFFFlu || GetLastError() == NO_ERROR)
> -	    return (((int64_t) high)<<  32) + low;
> +            DWORD high, low;
> +            low = get_compressed(filename,&high);
> +            if (low != 0xFFFFFFFFlu || GetLastError() == NO_ERROR)
> +            return (((int64_t) high)<<  32) + low;
>       }
>
>       if (_stati64(filename,&st)<  0)
> @@ -1036,11 +1036,6 @@ static int img_info(int argc, char **argv)
>       if (bdrv_is_encrypted(bs)) {
>           printf("encrypted: yes\n");
>       }
> -    if (bdrv_get_info(bs,&bdi)>= 0) {
> -        if (bdi.cluster_size != 0) {
> -            printf("cluster_size: %d\n", bdi.cluster_size);
> -        }
> -    }
>       bdrv_get_backing_filename(bs, backing_filename, sizeof(backing_filename));
>       if (backing_filename[0] != '\0') {
>           path_combine(backing_filename2, sizeof(backing_filename2),
> @@ -1049,11 +1044,56 @@ static int img_info(int argc, char **argv)
>                  backing_filename,
>                  backing_filename2);
>       }
> +    if (bdrv_get_info(bs,&bdi)>= 0) {
> +        if (bdi.cluster_size != 0)
> +            printf("cluster_size: %d\n", bdi.cluster_size);
> +    }
>       dump_snapshots(bs);
>       bdrv_delete(bs);
>       return 0;
>   }
>
> +static int img_update(int argc, char **argv)
> +{
> +    int c;
> +    const char *filename, *fmt;
> +    BlockDriverState *bs;
> +
> +    fmt = NULL;
> +    for(;;) {
> +        c = getopt(argc, argv, "f:h");
> +        if (c == -1)
> +            break;
> +        switch(c) {
> +        case 'h':
> +            help();
> +            break;
> +        case 'f':
> +            fmt = optarg;
> +            break;
> +        }
> +    }
> +    if (optind>= argc)
> +        help();
> +    filename = argv[optind++];
> +
> +    bs = bdrv_new_open(filename, fmt, BDRV_O_FLAGS | BDRV_O_NO_BACKING | BDRV_O_RDWR);
> +    if (!bs) {
> +        return 1;
> +    }
> +
> +    if (bs->drv->bdrv_update==NULL) {
> +        char fmt_name[128];
> +        bdrv_get_format(bs, fmt_name, sizeof(fmt_name));
> +        error_report ("the 'update' command is not supported for the '%s' image format.", fmt_name);
> +    }
> +
> +    bs->drv->bdrv_update(bs, argc-optind,&argv[optind]);
> +
> +    bdrv_delete(bs);
> +    return 0;
> +}
> +
>   #define SNAPSHOT_LIST   1
>   #define SNAPSHOT_CREATE 2
>   #define SNAPSHOT_APPLY  3
> diff --git a/qemu-io.c b/qemu-io.c
> index 5b24c5e..c32f8d4 100644
> --- a/qemu-io.c
> +++ b/qemu-io.c
> @@ -1701,6 +1701,8 @@ init_check_command(
>   	return 1;
>   }
>
> +#include "qemu-io-sim.c"
> +
>    

1) I don't see qemu-io-sim.c in this patch, which means this breaks the build.

2) Including C files is evil.
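
A sketch of the usual alternative (the header name and guard below are
illustrative): declare the command in a small header, compile
qemu-io-sim.c as its own object, and include only the header here:

    /* qemu-io-sim.h (illustrative) */
    #ifndef QEMU_IO_SIM_H
    #define QEMU_IO_SIM_H

    #include "cmd.h"

    extern const cmdinfo_t sim_cmd;

    #endif

qemu-io.c would then just #include "qemu-io-sim.h" and keep the
add_command(&sim_cmd) call, with qemu-io-sim.o added to the qemu-io
objects in the Makefile.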

>   static void usage(const char *name)
>   {
>   	printf(
> @@ -1807,6 +1809,7 @@ int main(int argc, char **argv)
>   	add_command(&discard_cmd);
>   	add_command(&alloc_cmd);
>   	add_command(&map_cmd);
> +        add_command(&sim_cmd);
>
>   	add_args_command(init_args_command);
>   	add_check_command(init_check_command);
> diff --git a/qemu-option.c b/qemu-option.c
> index 65db542..10ef45f 100644
> --- a/qemu-option.c
> +++ b/qemu-option.c
> @@ -289,6 +289,10 @@ int set_option_parameter(QEMUOptionParameter *list, const char *name,
>               return -1;
>           break;
>
> +    case OPT_NUMBER:
> +        list->value.n = atoi (value);
> +        break;
> +
>       default:
>           fprintf(stderr, "Bug: Option '%s' has an unknown type\n", name);
>           return -1;
> diff --git a/qemu-tool.c b/qemu-tool.c
> index 392e1c9..fdcb2f8 100644
> --- a/qemu-tool.c
> +++ b/qemu-tool.c
> @@ -23,12 +23,6 @@ QEMUClock *rt_clock;
>
>   FILE *logfile;
>
> -struct QEMUBH
> -{
> -    QEMUBHFunc *cb;
> -    void *opaque;
> -};
> -
>   void qemu_service_io(void)
>   {
>   }
> @@ -73,36 +67,6 @@ void monitor_protocol_event(MonitorEvent event, QObject *data)
>   {
>   }
>
> -QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque)
> -{
> -    QEMUBH *bh;
> -
> -    bh = qemu_malloc(sizeof(*bh));
> -    bh->cb = cb;
> -    bh->opaque = opaque;
> -
> -    return bh;
> -}
> -
> -int qemu_bh_poll(void)
> -{
> -    return 0;
> -}
> -
> -void qemu_bh_schedule(QEMUBH *bh)
> -{
> -    bh->cb(bh->opaque);
> -}
> -
> -void qemu_bh_cancel(QEMUBH *bh)
> -{
> -}
> -
> -void qemu_bh_delete(QEMUBH *bh)
> -{
> -    qemu_free(bh);
> -}
> -
>   int qemu_set_fd_handler2(int fd,
>                            IOCanReadHandler *fd_read_poll,
>                            IOHandler *fd_read,
>    

These functions surely cannot just be deleted like this.

Regards,

Anthony Liguori


* Re: [Qemu-devel] [PATCH 3/5] Fast Virtual Disk (FVD) Proposal Part 3
  2011-01-19 22:04 ` [Qemu-devel] [PATCH 3/5] Fast Virtual Disk (FVD) Proposal Part 3 Chunqiang Tang
@ 2011-01-21 22:57   ` Anthony Liguori
  2011-01-21 23:09     ` Anthony Liguori
  2011-01-24 15:29     ` Chunqiang Tang
  0 siblings, 2 replies; 18+ messages in thread
From: Anthony Liguori @ 2011-01-21 22:57 UTC (permalink / raw)
  To: Chunqiang Tang; +Cc: qemu-devel

On 01/19/2011 04:04 PM, Chunqiang Tang wrote:
> Part 3 of the block device driver for the proposed FVD image format.
> Multiple patches are used in order to manage the size of each patch.
> This patch includes some new files for FVD.
>
> See the related discussions at
> http://lists.gnu.org/archive/html/qemu-devel/2011-01/msg00426.html .
>    

Before going any further with this series, I'd like to see

1) a specification (on the QEMU wiki) describing this image format that 
can be reviewed

2) a concise explanation of why qcow2/qed cannot satisfy the use cases 
addressed by FVD

3) performance data to backup the claims of (2)

I don't want a reference to a website or a paper.  Just a concise and 
clear explanation on the mailing list.

I think blocksim might be worth merging but I'm still extremely 
sceptical of the claimed benefits of FVD.

Comparing FVD performance to qcow2 is not really all that interesting.  
qcow2 is terrible from a performance perspective in the face of any 
rigorous benchmarking.  I would suggest focusing on QED in terms of 
comparative performance.

Regards,

Anthony Liguori

> Signed-off-by: Chunqiang Tang<ctang@us.ibm.com>
> ---
>   block/fvd-create.c |  475 +++++++++++++++++++++++++++++++++++++++++++++++++++
>   block/fvd-debug.c  |  406 ++++++++++++++++++++++++++++++++++++++++++++
>   block/fvd-ext.h    |   71 ++++++++
>   block/fvd.c        |  127 ++++++++++++++
>   block/fvd.h        |  481 ++++++++++++++++++++++++++++++++++++++++++++++++++++
>   5 files changed, 1560 insertions(+), 0 deletions(-)
>   create mode 100644 block/fvd-create.c
>   create mode 100644 block/fvd-debug.c
>   create mode 100644 block/fvd-ext.h
>   create mode 100644 block/fvd.c
>   create mode 100644 block/fvd.h
>
> diff --git a/block/fvd-create.c b/block/fvd-create.c
> new file mode 100644
> index 0000000..b978ecb
> --- /dev/null
> +++ b/block/fvd-create.c
> @@ -0,0 +1,475 @@
> +/*
> + * Copyright (c) 2010-2011 IBM
> + *
> + * Authors:
> + *         Chunqiang Tang<ctang@us.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +/*=============================================================================
> + *  A short description: this module implements bdrv_create() for FVD.
> + *============================================================================*/
> +
> +static inline int64_t calc_min_journal_size (int64_t table_entries);
> +static inline int search_holes(const char *filename, size_t bitmap_size,
> +                    int32_t bitmap_start_offset, BlockDriverState * bs,
> +                    int64_t nb_sectors, int32_t hole_size, int32_t block_size);
> +
> +static int fvd_create (const char *filename, QEMUOptionParameter * options)
> +{
> +    int fd, ret;
> +    FvdHeader *header;
> +    int64_t virtual_disk_size = DEF_PAGE_SIZE;
> +    int32_t header_size;
> +    const char *base_img = NULL;
> +    const char *base_img_fmt = NULL;
> +    const char *data_file = NULL;
> +    const char *data_file_fmt = NULL;
> +    int32_t hole_size = 0;
> +    int copy_on_read = FALSE;
> +    int prefetch_start_delay = -1;
> +    int64_t prefetch_profile_size = 0;
> +    BlockDriverState *bs = NULL;
> +    int bitmap_size = 0;
> +    int64_t base_img_size = 0;
> +    int64_t table_size = 0;
> +    int64_t journal_size = 0;
> +    int32_t block_size = 0;
> +
> +    header_size = sizeof (FvdHeader);
> +    header_size = ROUND_UP (header_size, DEF_PAGE_SIZE);
> +    header = my_qemu_mallocz (header_size);
> +
> +    /* Read out options */
> +    while (options&&  options->name) {
> +        if (!strcmp (options->name, BLOCK_OPT_SIZE)) {
> +            virtual_disk_size = options->value.n;
> +        } else if (!strcmp (options->name,"prefetch_start_delay")) {
> +            if (options->value.n<= 0) {
> +                prefetch_start_delay = -1;
> +            } else {
> +                prefetch_start_delay = options->value.n;
> +            }
> +        } else if (!strcmp (options->name, BLOCK_OPT_BACKING_FILE)) {
> +            base_img = options->value.s;
> +        } else if (!strcmp (options->name, BLOCK_OPT_BACKING_FMT)) {
> +            base_img_fmt = options->value.s;
> +        } else if (!strcmp (options->name, "copy_on_read")) {
> +            copy_on_read = options->value.n;
> +        } else if (!strcmp (options->name, "data_file")) {
> +            data_file = options->value.s;
> +        } else if (!strcmp (options->name, "data_file_fmt")) {
> +            data_file_fmt = options->value.s;
> +        } else if (!strcmp (options->name, "detect_sparse_hole")) {
> +            hole_size = options->value.n;
> +        } else if (!strcmp (options->name, "compact_image")) {
> +            header->compact_image = options->value.n;
> +        } else if (!strcmp (options->name, "block_size")) {
> +            block_size = options->value.n;
> +        } else if (!strcmp (options->name, "chunk_size")) {
> +            header->chunk_size = options->value.n;
> +        } else if (!strcmp (options->name, "journal_size")) {
> +            journal_size = options->value.n;
> +        } else if (!strcmp (options->name, "storage_grow_unit")) {
> +            header->storage_grow_unit = options->value.n;
> +        } else if (!strcmp (options->name, "add_storage_cmd")
> +&&  options->value.s) {
> +            pstrcpy (header->add_storage_cmd, sizeof (header->add_storage_cmd),
> +                     options->value.s);
> +        }
> +        options++;
> +    }
> +
> +    virtual_disk_size = ROUND_UP (virtual_disk_size, 512);
> +
> +    /* Check if arguments are valid. */
> +    if (base_img&&  strlen (base_img)>  1023) {
> +        fprintf (stderr, "The base image name is longer than 1023 characters, "
> +                 "which is not allowed.\n");
> +        return -EINVAL;
> +    }
> +
> +    if (base_img&&  hole_size>  0) {
> +        if (header->compact_image) {
> +            fprintf (stderr, "compact_image and detect_sparse_hole cannot be "
> +                     "enabled together. Please disable detect_sparse_hole. \n");
> +            return -EINVAL;
> +        }
> +        header->need_zero_init = TRUE;
> +    } else {
> +        header->need_zero_init = FALSE;
> +    }
> +
> +    if (data_file) {
> +        pstrcpy (header->data_file, 1024, data_file);
> +        if (data_file_fmt) {
> +            pstrcpy (header->data_file_fmt, 16, data_file_fmt);
> +        }
> +    }
> +
> +    header->magic = FVD_MAGIC;
> +    header->version = FVD_VERSION;
> +    header->virtual_disk_size = virtual_disk_size;
> +    header->clean_shutdown = TRUE;
> +
> +    if (!base_img) {
> +        header->all_data_in_fvd_img = TRUE;
> +    } else {
> +        int ret;
> +
> +        bs = bdrv_new ("");
> +        if (!bs) {
> +            fprintf (stderr, "Failed to create a new block driver\n");
> +            return -1;
> +        }
> +
> +        pstrcpy (header->base_img, 1024, base_img);
> +        if (base_img_fmt) {
> +            pstrcpy (header->base_img_fmt, 16, base_img_fmt);
> +            BlockDriver *drv = bdrv_find_format (base_img_fmt);
> +            if (!drv) {
> +                fprintf (stderr, "Failed to find driver for format '%s'\n",
> +                         base_img_fmt);
> +                return -1;
> +            }
> +            ret = bdrv_open (bs, base_img, 0, drv);
> +        } else {
> +            ret = bdrv_open (bs, base_img, 0, NULL);
> +        }
> +
> +        if (ret < 0) {
> +            fprintf (stderr, "Failed to open the base image %s\n", base_img);
> +            return -1;
> +        }
> +
> +        base_img_size = bdrv_getlength (bs);
> +        base_img_size = MIN (virtual_disk_size, base_img_size);
> +        base_img_size = ROUND_UP (base_img_size, 512);
> +
> +        if (block_size <= 0) {
> +            /* No block size is provided. Find the smallest block size that
> +             * does not make the bitmap too big. */
> +            block_size = 512;
> +            while (1) {
> +                int64_t blocks = (base_img_size + block_size - 1) / block_size;
> +                bitmap_size = (blocks + 7) / 8;
> +                if (bitmap_size <= MODERATE_BITMAP_SIZE) {
> +                    break;
> +                }
> +                block_size *= 2;
> +            }
> +        } else {
> +            block_size = ROUND_UP (block_size, 512);
> +            int64_t blocks = (base_img_size + block_size - 1) / block_size;
> +            bitmap_size = (blocks + 7) / 8;
> +        }
> +
> +        bitmap_size = ROUND_UP (bitmap_size, DEF_PAGE_SIZE);
> +        header->bitmap_size = bitmap_size;
> +        header->block_size = block_size;
> +        header->bitmap_offset = header_size;
> +
> +        prefetch_profile_size = header->prefetch_profile_entries *
> +                                    sizeof (PrefetchProfileEntry);
> +        prefetch_profile_size = ROUND_UP (prefetch_profile_size, DEF_PAGE_SIZE);
> +        header->base_img_size = base_img_size;
> +        header->max_outstanding_copy_on_read_data =
> +                                    MAX_OUTSTANDING_COPY_ON_READ_DATA;
> +        header->copy_on_read = copy_on_read;
> +        header->prefetch_start_delay =
> +                                    prefetch_start_delay;
> +        header->num_prefetch_slots = NUM_PREFETCH_SLOTS;
> +        header->bytes_per_prefetch = ROUND_UP (BYTES_PER_PREFETCH, block_size);
> +        header->prefetch_throttle_time = PREFETCH_THROTTLING_TIME;
> +        header->prefetch_read_throughput_measure_time =
> +                                    PREFETCH_MIN_MEASURE_READ_TIME;
> +        header->prefetch_write_throughput_measure_time =
> +                                    PREFETCH_MIN_MEASURE_WRITE_TIME;
> +        header->prefetch_perf_calc_alpha = PREFETCH_PERF_CALC_ALPHA;
> +        header->prefetch_min_read_throughput = PREFETCH_MIN_READ_THROUGHPUT;
> +        header->prefetch_min_write_throughput = PREFETCH_MIN_WRITE_THROUGHPUT;
> +        header->prefetch_max_read_throughput = PREFETCH_MAX_READ_THROUGHPUT;
> +        header->prefetch_max_write_throughput = PREFETCH_MAX_WRITE_THROUGHPUT;
> +        header->all_data_in_fvd_img = FALSE;
> +        header->unit_of_PrefetchProfileEntry_len = DEF_PAGE_SIZE;
> +        header->generate_prefetch_profile = FALSE; /* To be implemented. */
> +        header->profile_directed_prefetch_start_delay = -1;/*To be implemented*/
> +    }
> +
> +    /* Set the table size. */
> +    if (header->compact_image) {
> +        if (header->chunk_size <= 0) {
> +            header->chunk_size = CHUNK_SIZE;
> +        }
> +        header->chunk_size = ROUND_UP (header->chunk_size, DEF_PAGE_SIZE);
> +        if (header->storage_grow_unit <= 0) {
> +            header->storage_grow_unit = STORAGE_GROW_UNIT;
> +        }
> +        if (header->storage_grow_unit < header->chunk_size) {
> +            header->storage_grow_unit = header->chunk_size;
> +        }
> +        int64_t table_entries =
> +            (virtual_disk_size + header->chunk_size - 1) / header->chunk_size;
> +        table_size = sizeof (uint32_t) * table_entries;
> +        table_size = ROUND_UP (table_size, DEF_PAGE_SIZE);
> +        header->table_offset = header_size + bitmap_size;
> +    }
> +
> +    /* Set the journal size. */
> +    if (bitmap_size <= 0 && table_size <= 0) {
> +        header->journal_size = 0;        /* No need to use journal. */
> +    } else if (journal_size < 0) {
> +        /* Disable the use of journal, which reduces overhead but may cause
> +         * data corruption if the host crashes. This is a valid configuration
> +         * for some use cases, where data integrity is not critical.  */
> +        header->journal_size = 0;
> +    } else {
> +        if (journal_size == 0) {
> +            /* No journal size is specified. Use a default size. */
> +            journal_size = JOURNAL_SIZE;
> +        }
> +        if (table_size > 0) {
> +            /* Make sure that the journal is at least large enough to record
> +             * all table changes in one shot, which is the extremely unlikely
> +             * worst case. */
> +            int64_t vsize = virtual_disk_size + header->chunk_size - 1;
> +            int64_t table_entries = vsize / header->chunk_size;
> +            int64_t min_journal_size = calc_min_journal_size (table_entries);
> +            if (journal_size < min_journal_size) {
> +                journal_size = min_journal_size;
> +            }
> +        }
> +        journal_size = ROUND_UP (journal_size, DEF_PAGE_SIZE);
> +        header->journal_size = journal_size;
> +        header->journal_offset = header_size + bitmap_size + table_size;
> +    }
> +
> +    const int64_t metadata_size = header_size + bitmap_size + table_size +
> +                                prefetch_profile_size + MAX (0, journal_size);
> +    header->metadata_size = metadata_size;
> +
> +    fd = open (filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
> +    if (fd < 0) {
> +        fprintf (stderr, "Failed to open %s\n", filename);
> +        goto fail;
> +    }
> +    fvd_header_cpu_to_le (header);
> +
> +    if (qemu_write_full (fd, header, header_size) != header_size) {
> +        fprintf (stderr, "Failed to write the header of %s\n", filename);
> +        goto fail;
> +    }
> +
> +    /* Initialize the bitmap. */
> +    if (bitmap_size > 0) {
> +        uint8_t *bitmap = my_qemu_mallocz (bitmap_size);
> +        ret = qemu_write_full (fd, bitmap, bitmap_size);
> +        my_qemu_free (bitmap);
> +        if (ret != bitmap_size) {
> +            fprintf (stderr, "Failed to zero out the bitmap of %s\n", filename);
> +            goto fail;
> +        }
> +    }
> +
> +    /* Initialize the table. */
> +    if (table_size > 0) {
> +        /* Set all entries to EMPTY_TABLE (0xFFFFFFFF). */
> +        uint8_t *empty_table = my_qemu_malloc (table_size);
> +        memset (empty_table, 0xFF, table_size);
> +        ret = qemu_write_full (fd, empty_table, table_size);
> +        my_qemu_free (empty_table);
> +        if (ret != table_size) {
> +            fprintf (stderr, "Failed to write the table of %s\n.", filename);
> +            goto fail;
> +        }
> +    }
> +
> +    /* Initialize the journal. */
> +    if (journal_size > 0) {
> +        uint8_t *empty_journal = my_qemu_mallocz (journal_size);
> +        ret = qemu_write_full (fd, empty_journal, journal_size);
> +        my_qemu_free (empty_journal);
> +        if (ret != journal_size) {
> +            fprintf (stderr, "Failed to initialize the journal for %s\n.",
> +                     filename);
> +            goto fail;
> +        }
> +    }
> +
> +    close (fd);
> +    ret = 0;
> +
> +    if (bs && hole_size > 0) {
> +        ret = search_holes (filename, (size_t) bitmap_size, header_size, bs,
> +                            base_img_size / 512, hole_size, block_size);
> +    }
> +
> +    if (bs) {
> +        bdrv_close (bs);
> +    }
> +    my_qemu_free (header);
> +    return ret;
> +
> +  fail:
> +    if (bs) {
> +        bdrv_close (bs);
> +    }
> +    close (fd);
> +    my_qemu_free (header);
> +    return -1;
> +}
> +
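As a quick sanity check of the block-size auto-selection loop above (this snippet is not part of the patch; the 100 GiB base image is an arbitrary example and 4194304 mirrors MODERATE_BITMAP_SIZE from block/fvd.h):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        const int64_t moderate_bitmap_size = 4194304LL;            /* 4 MiB */
        const int64_t base_img_size = 100LL * 1024 * 1024 * 1024;  /* 100 GiB */
        int64_t block_size = 512, bitmap_size;

        /* Same loop as in fvd_create(): double the block size until the
         * one-bit-per-block bitmap fits within MODERATE_BITMAP_SIZE. */
        for (;;) {
            int64_t blocks = (base_img_size + block_size - 1) / block_size;
            bitmap_size = (blocks + 7) / 8;
            if (bitmap_size <= moderate_bitmap_size) {
                break;
            }
            block_size *= 2;
        }

        /* Prints: block_size=4096 bitmap_size=3276800 (about 3.1 MiB). */
        printf ("block_size=%lld bitmap_size=%lld\n",
                (long long) block_size, (long long) bitmap_size);
        return 0;
    }

With the defaults, a 100 GiB backing image therefore ends up with 4 KiB blocks and a bitmap of roughly 3.1 MiB.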
> +/* For the optimization called "free write to zero-filled blocks". See Section
> + * 3.3.3 of the FVD-cow paper. */
> +static inline int search_holes (const char *filename, size_t bitmap_size,
> +                                int32_t bitmap_start_offset,
> +                                BlockDriverState * bs, int64_t nb_sectors,
> +                                int32_t hole_size, int32_t block_size)
> +{
> +    const int fd = open (filename, O_RDWR | O_BINARY | O_LARGEFILE, 0);
> +    if (fd < 0) {
> +        fprintf (stderr, "Failed to open %s for read and write.\n", filename);
> +        return -1;
> +    }
> +
> +    printf ("Searching zero-filled sectors in the base image. Please wait...");
> +    fflush (stdout);
> +
> +    uint8_t *bitmap =
> +        (uint8_t *) mmap (NULL, bitmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
> +                          fd, (off_t) bitmap_start_offset);
> +    if (bitmap == MAP_FAILED) {
> +        fprintf (stderr, "Failed to mmap() %s\n", filename);
> +        close (fd);
> +        return -1;
> +    }
> +
> +    if (hole_size < block_size) {
> +        hole_size = block_size;
> +    }
> +    hole_size = ROUND_UP (hole_size, block_size);
> +    nb_sectors = ROUND_DOWN (nb_sectors, hole_size);
> +    const int sectors_per_hole = hole_size / 512;
> +    const int sectors_per_block = block_size / 512;
> +    int num_int64_in_hole = hole_size / 8;
> +    int64_t hole_count = 0;
> +    int i, ret = 0;
> +    int64_t sec = 0;
> +    uint8_t *p = my_qemu_blockalign (bs, hole_size);
> +
> +    while (sec < nb_sectors) {
> +        int64_t *q;
> +
> +        if (bdrv_read (bs, sec, p, sectors_per_hole) < 0) {
> +            fprintf (stderr, "Error in reading the base image\n");
> +            ret = -1;
> +            goto done;
> +        }
> +
> +        /* All zeros? */
> +        q = (int64_t *) p;
> +        for (i = 0; i < num_int64_in_hole; i++) {
> +            if (*q != 0) {
> +                break;
> +            }
> +            q++;
> +        }
> +
> +        if (i < num_int64_in_hole) {
> +            /* This is not a hole. */
> +            sec += sectors_per_hole;
> +        } else {
> +             /* These sectors consist of only zeros.  Set the flags to
> +              * indicate that there is no need to read these sectors from the
> +              * base image.  See Section 3.3.3 of the FVD-cow paper for the
> +              * rationale. */
> +            hole_count++;
> +            int64_t end = sec + sectors_per_hole;
> +            while (sec < end) {
> +                int block_num = sec / sectors_per_block;
> +                int64_t bitmap_byte_offset = block_num / 8;
> +                uint8_t bitmap_bit_offset = block_num % 8;
> +                uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
> +                uint8_t b = bitmap[bitmap_byte_offset];
> +                if (!(b & mask)) {
> +                    b |= mask;
> +                    bitmap[bitmap_byte_offset] |= mask;
> +                }
> +                sec += sectors_per_block;
> +            }
> +        }
> +    }
> +
> +  done:
> +    printf ("\nFound %" PRId64
> +            " zero-filled hole regions. Image creation done.\n", hole_count);
> +    my_qemu_vfree (p);
> +    munmap (bitmap, bitmap_size);
> +    close (fd);
> +    return ret;
> +}
> +
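The bit addressing used in the inner loop of search_holes() (one bit per block, eight blocks per bitmap byte) can be isolated as follows; this is an illustration only, not part of the patch, and the sector number and block size are made-up example values:

    #include <stdio.h>
    #include <stdint.h>

    /* Mirror of the arithmetic in search_holes(): map a sector number to
     * its bitmap byte and bit mask. block_size is in bytes. */
    static void locate_bit (int64_t sec, int32_t block_size,
                            int64_t *byte, uint8_t *mask)
    {
        int sectors_per_block = block_size / 512;
        int64_t block_num = sec / sectors_per_block;

        *byte = block_num / 8;
        *mask = (uint8_t) (0x01 << (block_num % 8));
    }

    int main(void)
    {
        int64_t byte;
        uint8_t mask;

        /* Sector 100000 with 4 KiB blocks: block 12500 -> byte 1562, bit 4. */
        locate_bit (100000, 4096, &byte, &mask);
        printf ("byte=%lld mask=0x%02x\n", (long long) byte, mask);
        return 0;
    }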
> +static QEMUOptionParameter fvd_create_options[] = {
> +    {
> +     .name = BLOCK_OPT_SIZE,
> +     .type = OPT_SIZE,
> +     .help = "Virtual disk size"},
> +    {
> +     .name = "compact_image",
> +     .type = OPT_FLAG,
> +     .help = "compact_image=on|off"},
> +    {
> +     .name = "block_size",
> +     .type = OPT_SIZE,
> +     .help = "Block size"},
> +    {
> +     .name = "chunk_size",
> +     .type = OPT_SIZE,
> +     .help = "Chunk size"},
> +    {
> +     .name = "storage_grow_unit",
> +     .type = OPT_SIZE,
> +     .help = "Storage grow unit"},
> +    {
> +     .name = "add_storage_cmd",
> +     .type = OPT_STRING,
> +     .help = "Command to add storage when FSI runs out of space"},
> +    {
> +     .name = BLOCK_OPT_BACKING_FILE,
> +     .type = OPT_STRING,
> +     .help = "File name of a backing image"},
> +    {
> +     .name = BLOCK_OPT_BACKING_FMT,
> +     .type = OPT_STRING,
> +     .help = "Image format of the backing image"},
> +    {
> +     .name = "data_file",
> +     .type = OPT_STRING,
> +     .help = "File name of a separate data file"},
> +    {
> +     .name = "data_file_fmt",
> +     .type = OPT_STRING,
> +     .help = "Image format of the separate data file"},
> +    {
> +     .name = "copy_on_read",
> +     .type = OPT_FLAG,
> +     .help = "copy_on_read=on|off"},
> +    {
> +     .name = "prefetch_start_delay",
> +     .type = OPT_NUMBER,
> +     .help = "Delay in seconds before starting whole image prefetching. "
> +         "Prefetching is disabled if the delay is not a positive number."},
> +    {
> +     .name = "detect_sparse_hole",
> +     .type = OPT_SIZE,
> +     .help = "Minimum size (in bytes) of a continuous zero-filled region to be "
> +         "considered as a sparse file hole in the backing image (setting it "
> +         "to 0 turns off sparse file detection)"},
> +    {
> +     .name = "journal_size",
> +     .type = OPT_SIZE,
> +     .help = "Journal size"},
> +    {NULL}
> +};
> diff --git a/block/fvd-debug.c b/block/fvd-debug.c
> new file mode 100644
> index 0000000..4cef5ec
> --- /dev/null
> +++ b/block/fvd-debug.c
> @@ -0,0 +1,406 @@
> +/*
> + * Copyright (c) 2010-2011 IBM
> + *
> + * Authors:
> + *         Chunqiang Tang <ctang@us.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +/*=============================================================================
> + *  A short description: this module implements debugging functions for
> + *  the Fast Virtual Disk (FVD) format.
> + *============================================================================*/
> +
> +#ifndef ENABLE_TRACE_IO
> +# define TRACE_REQUEST(...) do {} while (0)
> +# define TRACE_STORE_IN_FVD(...) do {} while (0)
> +
> +#else
> +/* Monitor IO on a specific sector that triggers bugs. */
> +static inline void debug_sector (int64_t sector_num)
> +{
> +    if (FALSE) {
> +        if (sector_num == ((int64_t) 1023990LL)) {
> +            QPAUSE ("right sector");
> +        }
> +    }
> +}
> +
> +static void TRACE_REQUEST (int do_write, int64_t sector_num, int nb_sectors)
> +{
> +    if (do_write) {
> +        QDEBUG ("TRACE_REQUEST: write sector_num=%" PRId64
> +                " nb_sectors=%d\n    [ ", sector_num, nb_sectors);
> +    } else {
> +        QDEBUG ("TRACE_REQUEST: read  sector_num=%" PRId64 " nb_sectors=%d\n"
> +                "[ ", sector_num, nb_sectors);
> +    }
> +
> +    int64_t end = sector_num + nb_sectors;
> +    int64_t sec;
> +    for (sec = sector_num; sec < end; sec++) {
> +        QDEBUG ("sec%" PRId64 " ", sec);
> +        debug_sector (sec);
> +    }
> +    QDEBUG (" ]\n");
> +}
> +
> +static void TRACE_STORE_IN_FVD (const char *str, int64_t sector_num,
> +                                int nb_sectors)
> +{
> +    QDEBUG ("TRACE_STORE: %s sector_num=%" PRId64 " nb_sectors=%d\n    [ ",
> +            str, sector_num, nb_sectors);
> +    int64_t end = sector_num + nb_sectors;
> +    int64_t sec;
> +    for (sec = sector_num; sec < end; sec++) {
> +        QDEBUG ("sec%" PRId64 " ", sec);
> +        debug_sector (sec);
> +    }
> +    QDEBUG (" ]\n");
> +}
> +#endif
> +
> +#ifndef FVD_DEBUG
> +# define my_qemu_malloc qemu_malloc
> +# define my_qemu_mallocz qemu_mallocz
> +# define my_qemu_blockalign qemu_blockalign
> +# define my_qemu_free qemu_free
> +# define my_qemu_vfree qemu_vfree
> +# define my_qemu_aio_get qemu_aio_get
> +# define my_qemu_aio_release qemu_aio_release
> +# define COPY_UUID(to,from) do {} while (0)
> +
> +#else
> +FILE *__fvd_debug_fp;
> +static unsigned long long int fvd_uuid = 1;
> +static int64_t pending_qemu_malloc = 0;
> +static int64_t pending_qemu_aio_get = 0;
> +static int64_t pending_local_writes = 0;
> +static const char *alloc_file;
> +static int alloc_line;
> +
> +#define my_qemu_malloc(size) \
> +    ((void*)(alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_malloc(size)))
> +
> +#define my_qemu_mallocz(size) \
> +    ((void*)(alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_mallocz(size)))
> +
> +#define my_qemu_blockalign(bs,size) \
> +    ((void*)(alloc_file=__FILE__, \
> +             alloc_line=__LINE__, \
> +             _my_qemu_blockalign(bs,size)))
> +
> +#define my_qemu_aio_get(pool,bs,cb,op) \
> +    ((void*)(alloc_file=__FILE__, \
> +             alloc_line=__LINE__, \
> +             _my_qemu_aio_get(pool,bs,cb,op)))
> +
> +#define my_qemu_free(p) \
> +    (alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_free(p))
> +
> +#define my_qemu_vfree(p) \
> +    (alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_vfree(p))
> +
> +static void COPY_UUID (FvdAIOCB * to, FvdAIOCB * from)
> +{
> +    if (from) {
> +        to->uuid = from->uuid;
> +        FVD_DEBUG_ACB (to);
> +    }
> +}
> +
> +#ifdef DEBUG_MEMORY_LEAK
> +# define MAX_TRACER 10485760
> +static int alloc_tracer_used = 1;        /* slot 0 is not used. */
> +static void **alloc_tracers = NULL;
> +
> +static void __attribute__ ((constructor)) init_mem_alloc_tracers (void)
> +{
> +    if (!alloc_tracers) {
> +        alloc_tracers = qemu_mallocz (sizeof (void *) * MAX_TRACER);
> +    }
> +}
> +
> +static void trace_alloc (void *p, size_t size)
> +{
> +    alloc_tracer_t *t = p;
> +    t->magic = FVD_ALLOC_MAGIC;
> +    t->alloc_file = alloc_file;
> +    t->alloc_line = alloc_line;
> +    t->size = size;
> +
> +    if (alloc_tracer_used < MAX_TRACER) {
> +        t->alloc_tracer = alloc_tracer_used++;
> +        alloc_tracers[t->alloc_tracer] = t;
> +        QDEBUG ("Allocate memory using tracer%d in %s on line %d.\n",
> +                t->alloc_tracer, alloc_file, alloc_line);
> +    } else {
> +        t->alloc_tracer = 0;
> +    }
> +
> +    /* Set header and footer to detect out-of-range writes. */
> +    if (size != (size_t) - 1) {
> +        uint8_t *q = (uint8_t *) p;
> +        uint64_t *header = (uint64_t *) (q + 512 - sizeof (uint64_t));
> +        uint64_t *footer = (uint64_t *) (q + size - 512);
> +        *header = FVD_ALLOC_MAGIC;
> +        *footer = FVD_ALLOC_MAGIC;
> +    }
> +}
> +
> +static void trace_free (void *p)
> +{
> +    alloc_tracer_t *t = p;
> +
> +    QDEBUG ("Free memory with tracer%d in %s on line %d.\n",
> +            t->alloc_tracer, alloc_file, alloc_line);
> +    ASSERT (t->magic == FVD_ALLOC_MAGIC && t->alloc_tracer >= 0);
> +
> +    /* Check header and footer to detect out-of-range writes. */
> +    if (t->size != (size_t) - 1) {
> +        uint8_t *q = (uint8_t *) p;
> +        uint64_t *header = (uint64_t *) (q + 512 - sizeof (uint64_t));
> +        uint64_t *footer = (uint64_t *) (q + t->size - 512);
> +        ASSERT (*header == FVD_ALLOC_MAGIC);
> +        ASSERT (*footer == FVD_ALLOC_MAGIC);
> +    }
> +
> +    if (t->alloc_tracer) {
> +        ASSERT (alloc_tracers[t->alloc_tracer] == t);
> +        alloc_tracers[t->alloc_tracer] = NULL;
> +        t->alloc_tracer = -INT_MAX;
> +    } else {
> +        t->alloc_tracer *= -1;        /* Guard against double free. */
> +    }
> +}
> +
> +static void dump_alloc_tracers (void)
> +{
> +    int unfreed = 0;
> +    int i;
> +    for (i = 1; i < alloc_tracer_used; i++) {
> +        if (!alloc_tracers[i]) {
> +            continue;
> +        }
> +
> +        unfreed++;
> +        alloc_tracer_t *t = alloc_tracers[i];
> +
> +        if (t->size == (size_t) - 1) {
> +            FvdAIOCB *acb = container_of (alloc_tracers[i], FvdAIOCB, tracer);
> +            ASSERT (acb->magic == FVDAIOCB_MAGIC);
> +            QDEBUG ("Memory %p with tracer%d allocated in %s on line %d "
> +                    "(FvdAIOCB acb%llu-%p) is not freed. magic %s\n",
> +                    alloc_tracers[i], i, t->alloc_file, t->alloc_line,
> +                    acb->uuid, acb,
> +                    t->magic == FVD_ALLOC_MAGIC ? "correct" : "wrong");
> +        } else {
> +            QDEBUG ("Memory %p with tracer%d allocated in %s on line %d is "
> +                    "not freed. magic %s\n",
> +                    alloc_tracers[i], i, t->alloc_file, t->alloc_line,
> +                    t->magic == FVD_ALLOC_MAGIC ? "correct" : "wrong");
> +
> +            uint8_t *q = (uint8_t *) t;
> +            uint64_t *header = (uint64_t *) (q + 512 - sizeof (uint64_t));
> +            uint64_t *footer = (uint64_t *) (q + t->size - 512);
> +            ASSERT (*header == FVD_ALLOC_MAGIC);
> +            ASSERT (*footer == FVD_ALLOC_MAGIC);
> +        }
> +    }
> +
> +    QDEBUG ("Unfreed memory allocations: %d\n", unfreed);
> +}
> +#endif
> +
> +static inline void *_my_qemu_aio_get (AIOPool * pool, BlockDriverState * bs,
> +                                      BlockDriverCompletionFunc * cb,
> +                                      void *opaque)
> +{
> +    pending_qemu_aio_get++;
> +    FvdAIOCB *acb = (FvdAIOCB *) qemu_aio_get (&fvd_aio_pool, bs, cb, opaque);
> +    acb->uuid = ++fvd_uuid;
> +    acb->magic = FVDAIOCB_MAGIC;
> +
> +    FVD_DEBUG_ACB (acb);
> +
> +#ifdef DEBUG_MEMORY_LEAK
> +    trace_alloc (&acb->tracer, -1);
> +#endif
> +
> +    return acb;
> +}
> +
> +static inline void my_qemu_aio_release (void *p)
> +{
> +    pending_qemu_aio_get--;
> +    ASSERT (pending_qemu_aio_get >= 0);
> +
> +#ifdef DEBUG_MEMORY_LEAK
> +    FvdAIOCB *acb = p;
> +    trace_free (&acb->tracer);
> +#endif
> +
> +    qemu_aio_release (p);
> +}
> +
> +static inline void *_my_qemu_malloc (size_t size)
> +{
> +    ASSERT (size > 0);
> +    pending_qemu_malloc++;
> +#ifndef DEBUG_MEMORY_LEAK
> +    return qemu_malloc (size);
> +#else
> +
> +    size += 1024;        /* 512 bytes header and 512 bytes footer. */
> +    uint8_t *ret = qemu_malloc (size);
> +    trace_alloc (ret, size);
> +    return ret + 512;
> +#endif
> +}
> +
> +static inline void *_my_qemu_mallocz (size_t size)
> +{
> +    ASSERT (size > 0);
> +    pending_qemu_malloc++;
> +#ifndef DEBUG_MEMORY_LEAK
> +    return qemu_mallocz (size);
> +#else
> +
> +    size += 1024;        /* 512 bytes header and 512 bytes footer. */
> +    uint8_t *ret = qemu_mallocz (size);
> +    trace_alloc (ret, size);
> +    return ret + 512;
> +#endif
> +}
> +
> +static inline void *_my_qemu_blockalign (BlockDriverState * bs, size_t size)
> +{
> +    ASSERT (size > 0);
> +    pending_qemu_malloc++;
> +
> +#ifndef DEBUG_MEMORY_LEAK
> +    return qemu_blockalign (bs, size);
> +#else
> +
> +    size += 1024;        /* 512 bytes header and 512 bytes footer. */
> +    uint8_t *ret = qemu_blockalign (bs, size);
> +    trace_alloc (ret, size);
> +    return ret + 512;
> +#endif
> +}
> +
> +static inline void _my_qemu_free (void *ptr)
> +{
> +    pending_qemu_malloc--;
> +    ASSERT (pending_qemu_malloc >= 0);
> +#ifndef DEBUG_MEMORY_LEAK
> +    qemu_free (ptr);
> +#else
> +
> +    uint8_t *q = ((uint8_t *) ptr) - 512;
> +    trace_free (q);
> +    qemu_free (q);
> +#endif
> +}
> +
> +static inline void _my_qemu_vfree (void *ptr)
> +{
> +    pending_qemu_malloc--;
> +    ASSERT (pending_qemu_malloc >= 0);
> +#ifndef DEBUG_MEMORY_LEAK
> +    qemu_vfree (ptr);
> +#else
> +
> +    uint8_t *q = ((uint8_t *) ptr) - 512;
> +    trace_free (q);
> +    qemu_vfree (q);
> +#endif
> +}
> +
> +static void count_pending_requests (BDRVFvdState * s)
> +{
> +    int m = 0, k = 0;
> +    FvdAIOCB *w;
> +
> +    QLIST_FOREACH (w, &s->copy_locks, copy_lock.next) {
> +        m++;
> +        QDEBUG ("copy_lock: acb%llu-%p\n", w->uuid, w);
> +    }
> +
> +    QLIST_FOREACH (w, &s->write_locks, write.next_write_lock) {
> +        k++;
> +        QDEBUG ("write_lock: acb%llu-%p\n", w->uuid, w);
> +    }
> +
> +    QDEBUG ("Debug_memory_leak: copy_locks=%d  write_locks=%d\n", m, k);
> +}
> +
> +static void dump_resource_summary (BDRVFvdState * s)
> +{
> +#ifdef DEBUG_MEMORY_LEAK
> +    dump_alloc_tracers ();
> +#endif
> +
> +    QDEBUG ("Resource summary: outstanding_copy_on_read_data=%" PRId64
> +            " total_copy_on_read_data=%" PRId64 " total_prefetch_data=%" PRId64
> +            " " " pending_qemu_malloc=%" PRId64 " pending_qemu_aio_get=%" PRId64
> +            " pending_local_writes=%" PRId64 "\n",
> +            s->outstanding_copy_on_read_data, s->total_copy_on_read_data,
> +            s->total_prefetch_data, pending_qemu_malloc, pending_qemu_aio_get,
> +            pending_local_writes);
> +    count_pending_requests (s);
> +}
> +
> +/* Monitor processing a specific FvdAIOCB that triggers bugs. */
> +void FVD_DEBUG_ACB (void *p)
> +{
> +    if (FALSE) {
> +        FvdAIOCB *acb = p;
> +
> +        /* Is it FvdAIOCB? */
> +        if (acb->magic != FVDAIOCB_MAGIC || acb->common.bs->drv != &bdrv_fvd) {
> +            /* Is it CompactChildCB? */
> +            CompactChildCB *child = p;
> +            acb = child->acb;
> +            if (acb->magic != FVDAIOCB_MAGIC
> +                || acb->common.bs->drv != &bdrv_fvd
> +                || (acb->type != OP_LOAD_COMPACT
> +                    && acb->type != OP_STORE_COMPACT)) {
> +                return;
> +            }
> +        }
> +
> +        if (acb->uuid == 20ULL) {
> +            QPAUSE ("Processing the right acb");
> +        }
> +    }
> +}
> +
> +void init_fvd_debug_fp (void)
> +{
> +    char buf[256];
> +    sprintf (buf, "/tmp/fvd.log-%d", getpid ());
> +    if ((__fvd_debug_fp = fopen (buf, "wt")) == NULL) {
> +        __fvd_debug_fp = stdout;
> +    }
> +}
> +#endif
> +
> +void fvd_check_memory_usage (void)
> +{
> +    ASSERT (pending_qemu_malloc == 0);
> +}
> +
> +int fvd_get_copy_on_read (BlockDriverState * bs)
> +{
> +    BDRVFvdState *s = bs->opaque;
> +    return s->copy_on_read;
> +}
> +
> +void fvd_set_copy_on_read (BlockDriverState * bs, int copy_on_read)
> +{
> +    BDRVFvdState *s = bs->opaque;
> +    s->copy_on_read = copy_on_read;
> +}
> diff --git a/block/fvd-ext.h b/block/fvd-ext.h
> new file mode 100644
> index 0000000..6839e25
> --- /dev/null
> +++ b/block/fvd-ext.h
> @@ -0,0 +1,71 @@
> +/*
> + * Copyright (c) 2010-2011 IBM
> + *
> + * Authors:
> + *         Chunqiang Tang <ctang@us.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +/*=============================================================================
> + *  A short description: this header file contains functions of the FVD block
> + *  device driver that are used by other external modules. These functions are
> + *  mainly for testing and debugging purposes.
> + *============================================================================*/
> +
> +#ifndef __fvd_debug_h__
> +#define __fvd_debug_h__
> +
> +//#define FVD_DEBUG
> +
> +int fvd_get_copy_on_read (BlockDriverState *bs);
> +void fvd_set_copy_on_read (BlockDriverState *bs, int copy_on_read);
> +void fvd_check_memory_usage (void);
> +void fvd_init_prefetch(void * bs);
> +void fvd_enable_host_crash_test (void);
> +
> +#ifndef TRUE
> +# define TRUE 1
> +#endif
> +#ifndef FALSE
> +# define FALSE 0
> +#endif
> +
> +#ifndef FVD_DEBUG
> +# define QDEBUG(format,...) do {} while (0)
> +# define ASSERT(x) do {} while (0)
> +# define FVD_DEBUG_ACB(...) do {} while (0)
> +# define QPAUSE(...) do {} while (0)
> +
> +#else
> +
> +extern FILE *__fvd_debug_fp;
> +void init_fvd_debug_fp (void);
> +void FVD_DEBUG_ACB (void *p);
> +# define QDEBUG(format,...) \
> +    do { \
> +        if (__fvd_debug_fp==NULL) init_fvd_debug_fp(); \
> +        fprintf (__fvd_debug_fp, format, ##__VA_ARGS__); \
> +        fflush(__fvd_debug_fp); \
> +    } while(0)
> +
> +# define ASSERT(x) \
> +    do { \
> +        if (!(x)) { \
> +            fprintf (stderr, "Assertion failed in process %d at %s:%d. " \
> +                "Waiting for debugging...\n", getpid(),__FILE__, __LINE__); \
> +            fgetc (stdin); exit (1);  \
> +        } \
> +    } while (0) \
> +
> +# define QPAUSE(format,...) \
> +    do { \
> +        printf (format, ##__VA_ARGS__); \
> +        printf (" Pause process %d for debugging...\n", getpid()); \
> +        fgetc (stdin); \
> +    } while (0)
> +
> +#endif
> +
> +#endif
> diff --git a/block/fvd.c b/block/fvd.c
> new file mode 100644
> index 0000000..311ff58
> --- /dev/null
> +++ b/block/fvd.c
> @@ -0,0 +1,127 @@
> +/*
> + * Copyright (c) 2010-2011 IBM
> + *
> + * Authors:
> + *         Chunqiang Tang <ctang@us.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +/*=============================================================================
> + *  A short description: this module implements the QEMU block device driver
> + *  for the Fast Virtual Disk (FVD) format.  See the following companion
> + *  papers for a detailed description of FVD:
> + *  1. The so-called "FVD-cow paper":
> + *          "FVD: a High-Performance Virtual Machine Image Format for Cloud",
> + *      by Chunqiang Tang, 2010.
> + *  2. The so-called "FVD-compact paper":
> + *          "FVD: a High-Performance Virtual Machine Image Format for Cloud
> + *           with Sparse Image Capability", by Chunqiang Tang, 2010.
> + *============================================================================*/
> +
> +#include "block/fvd.h"
> +
> +//#define ENABLE_TRACE_IO
> +//#define DEBUG_MEMORY_LEAK
> +//#define SIMULATED_TEST_WITH_QEMU_IO
> +
> +#ifndef FVD_DEBUG
> +#undef DEBUG_MEMORY_LEAK
> +#undef ENABLE_TRACE_IO
> +#undef SIMULATED_TEST_WITH_QEMU_IO
> +#endif
> +
> +/* Use include to avoid exposing too many FVD symbols, and to allow inline
> + * function optimization. */
> +#include "block/fvd-utils.c"
> +#include "block/fvd-debug.c"
> +#include "block/fvd-misc.c"
> +#include "block/fvd-create.c"
> +#include "block/fvd-open.c"
> +#include "block/fvd-read.c"
> +#include "block/fvd-write.c"
> +#include "block/fvd-load.c"
> +#include "block/fvd-store.c"
> +#include "block/fvd-journal.c"
> +#include "block/fvd-prefetch.c"
> +
> +static AIOPool fvd_aio_pool = {
> +    .aiocb_size = sizeof (FvdAIOCB),
> +    .cancel = fvd_aio_cancel,
> +};
> +
> +static BlockDriver bdrv_fvd = {
> +    .format_name = "fvd",
> +    .instance_size = sizeof (BDRVFvdState),
> +    .bdrv_create = fvd_create,
> +    .bdrv_probe = fvd_probe,
> +    .bdrv_file_open = fvd_open,
> +    .bdrv_close = fvd_close,
> +    .bdrv_is_allocated = fvd_is_allocated,
> +    .bdrv_flush = fvd_flush,
> +    .bdrv_aio_readv = fvd_aio_readv,
> +    .bdrv_aio_writev = fvd_aio_writev,
> +    .bdrv_aio_flush = fvd_aio_flush,
> +    .create_options = fvd_create_options,
> +    .bdrv_get_info = fvd_get_info,
> +    .bdrv_update = fvd_update,
> +    .bdrv_has_zero_init = fvd_has_zero_init
> +};
> +
> +static void bdrv_fvd_init (void)
> +{
> +    bdrv_register (&bdrv_fvd);
> +}
> +
> +block_init (bdrv_fvd_init);
> +
> +/*
> + * Since bdrv_close may not be properly invoked on a VM shutdown, we
> + * use a destructor to flush metadata to disk. This only affects
> + * performance and does not affect correctness.
> + * See Section 3.3.4 of the FVD-cow paper for the rationale.
> + */
> +extern QTAILQ_HEAD (, BlockDriverState) bdrv_states;
> +static void __attribute__ ((destructor)) flush_fvd_bitmap_to_disk (void)
> +{
> +    BlockDriverState *bs;
> +    QTAILQ_FOREACH (bs, &bdrv_states, list) {
> +        if (bs->drv == &bdrv_fvd) {
> +            flush_metadata_to_disk_on_exit (bs);
> +
> +#ifdef FVD_DEBUG
> +            dump_resource_summary (bs->opaque);
> +#endif
> +        }
> +    }
> +}
> +
> +/*
> + * TODOs: Below are some potential enhancements for future development:
> + * 1. Handle storage leak on failure.
> + *
> + * 2. Profile-directed prefetch. See Section 3.4.1 of the FVD-cow paper.
> + * Related metadata are FvdHeader.prefetch_profile_offset and
> + * FvdHeader.prefetch_profile_entries,
> + * FvdHeader.profile_directed_prefetch_start_delay,
> + * FvdHeader.generate_prefetch_profile.
> + *
> + * 3.  Cap the prefetch throughput at the upper limit. See Section 3.4.2 of
> + * the FVD-cow paper.  Related metadata are
> + * FvdHeader.prefetch_max_read_throughput and
> + * FvdHeader.prefetch_max_write_throughput.
> + *
> + * 4. Support write through to the base image. When a VM issues a write
> + * request, in addition to saving the data in the FVD data file, also save the
> + * data in the base image if the address of write request is not beyond the
> + * size of the base image (this of course requires the base image NOT to be
> + * 'read_only'. This feature changes the semantics of copy-on-write, but it
> + * suits a different use case, where the base image is stored on a remote
> + * storage server, and the FVD image is stored on a local disk and acts as a
> + * write-through cache of the base image. This can be used to cache and
> + * improve the performance of persistent storage on network-attached storage,
> + * e.g., Amazon EBS.  This feature is not described in the FVD-cow paper as it
> + * would complicate the discussion.  Related metadata are
> + * FvdHeader.write_updates_base_img.
> + */
> diff --git a/block/fvd.h b/block/fvd.h
> new file mode 100644
> index 0000000..cce8cc8
> --- /dev/null
> +++ b/block/fvd.h
> @@ -0,0 +1,481 @@
> +/*
> + * Copyright (c) 2010-2011 IBM
> + *
> + * Authors:
> + *         Chunqiang Tang <ctang@us.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +/*=============================================================================
> + *  A short description: this is the header of the FVD block device driver.
> + *============================================================================*/
> +
> +#include <sys/vfs.h>
> +#include <sys/mman.h>
> +#include <pthread.h>
> +#include <execinfo.h>
> +#include <stdlib.h>
> +#include <sys/ioctl.h>
> +#include <stdint.h>
> +#include <stdio.h>
> +#include <inttypes.h>
> +#include "block_int.h"
> +#include "osdep.h"
> +#include "qemu-option.h"
> +#include "qemu-timer.h"
> +#include "block.h"
> +#include "qemu-queue.h"
> +#include "qemu-common.h"
> +#include "block/blksim.h"
> +#include "block/fvd-ext.h"
> +
> +#define FVD_MAGIC         (('Q' << 24) | ('C' << 16) | (0xF5 << 8) | 0xA9)
> +#define FVD_VERSION         1
> +
> +/* Profile-directed prefetch. (to be implemented). */
> +typedef struct __attribute__ ((__packed__)) PrefetchProfileEntry {
> +    int64_t offset;        /* in bytes */
> +
> +    /* In the unit of FvdHeader.unit_of_PrefetchProfileEntry_len, i.e.,
> +     * len_in_bytes = len * FvdHeader.unit_of_PrefetchProfileEntry_len. */
> +    uint32_t len;
> +} PrefetchProfileEntry;
> +
> +/*
> + * The FVD format consists of:
> + *   + Header fields of FvdHeader.
> + *   + Bitmap, starting on a 4KB page boundary at a location specified by
> + *     FvdHeader.bitmap_offset.
> + *   + Table, starting on a 4KB page boundary at a location specified by
> + *     FvdHeader.table_offset.
> + *   + Journal, starting on a 4KB page boundary at a location specified by
> + *     FvdHeader.journal_offset.
> + *   + Prefetch profile entries, starting on a 4KB page boundary at a location
> + *     specified by FvdHeader.prefetch_profile_offset. (to be implemented)
> + *   + Virtual disk data,  starting on a 4KB page boundary. Optionally, disk
> + *     data can be stored in a separate data file specified by
> + *     FvdHeader.data_file.
> + */
> +typedef struct __attribute__ ((__packed__)) FvdHeader {
> +    uint32_t magic;
> +    uint32_t version;
> +
> +    /* This field is set to TRUE after whole-image prefetching finishes. */
> +    int32_t all_data_in_fvd_img;
> +
> +    int64_t virtual_disk_size;        /* in bytes. Disk size perceived by the VM. */
> +    int64_t metadata_size;        /* in bytes. */
> +    char base_img[1024];
> +    char base_img_fmt[16];
> +    int64_t base_img_size;        /* in bytes. */
> +    int64_t bitmap_offset;        /* in bytes. Aligned on DEF_PAGE_SIZE. */
> +    int64_t bitmap_size;        /* in bytes. Rounded up to DEF_PAGE_SIZE */
> +    int32_t block_size;                /* in bytes. */
> +    int32_t copy_on_read;        /* TRUE or FALSE */
> +    int64_t max_outstanding_copy_on_read_data;        /* in bytes. */
> +
> +    /* If (data_file[0]==0), the FVD metadata and data are stored in one file.*/
> +    char data_file[1024];
> +    char data_file_fmt[16];
> +
> +    /******** Begin: for prefetching. *******************************/
> +    /* in seconds. -1 means disable whole image prefetching. */
> +    int32_t prefetch_start_delay;
> +
> +    /* in bytes. Aligned on DEF_PAGE_SIZE. (to be implemented) */
> +    int64_t prefetch_profile_offset;
> +
> +    /* Number of PrefetchProfileEntry. (to be implemented) */
> +    int64_t prefetch_profile_entries;
> +
> +    int32_t num_prefetch_slots;        /* Max number of outstanding prefetch writes. */
> +    int32_t bytes_per_prefetch;        /* For whole image prefetching. */
> +    int32_t prefetch_read_throughput_measure_time;        /* in milliseconds. */
> +    int32_t prefetch_write_throughput_measure_time;        /* in milliseconds. */
> +
> +    /* Controls the calculation of the moving average of throughput. Must be a
> +     * value between [0,100].
> +     *   actual_normalized_alpha = prefetch_perf_calc_alpha / 100.0 */
> +    int32_t prefetch_perf_calc_alpha;
> +
> +    int32_t prefetch_min_read_throughput;        /* in KB/second. */
> +    int32_t prefetch_min_write_throughput;        /* in KB/second. */
> +    int32_t prefetch_max_read_throughput;        /* in KB/second. */
> +    int32_t prefetch_max_write_throughput;        /* in KB/second. */
> +
> +    /* in milliseconds. When prefetch read/write throughput is low, prefetch
> +     * pauses for a random time uniformly distributed in
> +     * [0, prefetch_throttle_time]. */
> +    int32_t prefetch_throttle_time;
> +    /******** End: for prefetching. *******************************/
> +
> +    /******** Begin: for compact image. *****************************/
> +    int32_t compact_image;        /* TRUE or FALSE */
> +    int64_t table_offset;        /* in bytes. */
> +    int64_t chunk_size;                /* in bytes. */
> +    int64_t storage_grow_unit;        /* in bytes. */
> +    char add_storage_cmd[2048];
> +    /******** End: for compact image. *******************************/
> +
> +    /******** Begin: for journal. ***********************************/
> +    int64_t journal_offset;        /* in bytes. */
> +    int64_t journal_size;        /* in bytes. */
> +    int32_t clean_shutdown;        /* TRUE if VM's last shutdown was graceful. */
> +    /******** End: for journal. *************************************/
> +
> +    /*
> +     * This field is TRUE if the image mandates that the storage layer
> +     * (BDRVFvdState.fvd_data) must return TRUE for bdrv_has_zero_init().
> +     * This is the case if the optimization described in Section 3.3.3 of the
> +     * FVD-cow paper is enabled (see function search_holes()). If 'qemu-img
> +     * create' sets need_zero_init to TRUE, 'qemu-img update' can be used to
> +     * manually reset it to FALSE, if the user always manually pre-fills the
> +     * storage (e.g., a raw partition) with zeros. If the image is stored on a
> +     * file system, it already supports zero_init, and hence there is no need
> +     * to manually manipulate this field.
> +     */
> +    int32_t need_zero_init;
> +
> +    /* If TRUE, FVD dumps a prefetch profile after the VM shuts down.
> +     * (to be implemented) */
> +    int32_t generate_prefetch_profile;
> +
> +    /* See the comment on PrefetchProfileEntry.len. (to be implemented) */
> +    int32_t unit_of_PrefetchProfileEntry_len;
> +
> +    /* in seconds. -1 means disable profile-directed prefetching.
> +     * (to be implemented) */
> +    int32_t profile_directed_prefetch_start_delay;
> +
> +    /* Possible values are "no", "writethrough", "writeback", or
> +     * "writenocache". (to be implemented) */
> +    char write_updates_base_img[16];
> +} FvdHeader;
> +
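To make the layout described above concrete, here is a stand-alone sketch (not part of the patch) of the offsets that fvd_create() in this series writes out; the header and bitmap sizes below are invented example values, and only the ordering and the 4 KiB (DEF_PAGE_SIZE) alignment come from the patch:

    #include <stdio.h>
    #include <stdint.h>

    #define DEF_PAGE_SIZE 4096LL
    #define ROUND_UP(x, base) ((((x) + (base) - 1) / (base)) * (base))

    int main(void)
    {
        /* Example values only. */
        int64_t header_size  = ROUND_UP (3000, DEF_PAGE_SIZE);     /* one page */
        int64_t bitmap_size  = ROUND_UP (3276800, DEF_PAGE_SIZE);  /* ~3.1 MiB */
        int64_t table_size   = 0;                   /* compact_image=off */
        int64_t journal_size = 16777216LL;          /* JOURNAL_SIZE default */

        printf ("bitmap_offset  = %lld\n", (long long) header_size);
        /* table_offset is only meaningful when compact_image=on. */
        printf ("table_offset   = %lld\n",
                (long long) (header_size + bitmap_size));
        printf ("journal_offset = %lld\n",
                (long long) (header_size + bitmap_size + table_size));
        printf ("metadata_size  = %lld\n",
                (long long) (header_size + bitmap_size + table_size + journal_size));
        return 0;
    }

The virtual disk data then follows on a page boundary, or lives in the separate data_file when one is configured, per the comment above.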
> +typedef struct BDRVFvdState {
> +    BlockDriverState *fvd_metadata;
> +    BlockDriverState *fvd_data;
> +    int64_t virtual_disk_size;        /*in bytes. */
> +    int64_t bitmap_offset;        /* in sectors */
> +    int64_t bitmap_size;        /* in bytes. */
> +    int64_t data_offset;        /* in sectors. Begin of real data. */
> +    int64_t nb_sectors_in_base_img;
> +    int32_t block_size;        /* in sectors. */
> +    int copy_on_read;        /* TRUE or FALSE */
> +    int64_t max_outstanding_copy_on_read_data;        /* in bytes. */
> +    int64_t outstanding_copy_on_read_data;        /* in bytes. */
> +    int data_region_prepared;        /* TRUE or FALSE */
> +     QLIST_HEAD(WriteLocks, FvdAIOCB) write_locks; /* All writes. */
> +     QLIST_HEAD(CopyLocks, FvdAIOCB) copy_locks; /* copy-on-read and CoW. */
> +
> +    /* Keep two copies of bitmap to reduce the overhead of updating the
> +     * on-disk bitmap, i.e., copy-on-read and prefetching do not update the
> +     * on-disk bitmap. See Section 3.3.4 of the FVD-cow paper. */
> +    uint8_t *fresh_bitmap;
> +    uint8_t *stale_bitmap;
> +
> +    /******** Begin: for prefetching. ***********************************/
> +    struct FvdAIOCB **prefetch_acb;
> +    int prefetch_state; /* PREFETCH_STATE_RUNNING, FINISHED, or DISABLED. */
> +    int prefetch_error;        /* TRUE or FALSE */
> +    int num_prefetch_slots;
> +    int num_filled_prefetch_slots;
> +    int next_prefetch_read_slot;
> +    int prefetch_read_active;                        /* TRUE or FALSE */
> +    int pause_prefetch_requested;                /* TRUE or FALSE */
> +    int prefetch_start_delay;        /* in seconds  */
> +    int64_t unclaimed_prefetch_region_start;
> +    int64_t prefetch_read_time;                        /* in milliseconds. */
> +    int64_t prefetch_write_time;                /* in milliseconds. */
> +    int64_t prefetch_data_read;                        /* in bytes. */
> +    int64_t prefetch_data_written;                /* in bytes. */
> +    double prefetch_read_throughput;                /* in bytes/millisecond. */
> +    double prefetch_write_throughput;                /* in bytes/millisecond. */
> +    double prefetch_min_read_throughput;        /* in bytes/millisecond. */
> +    double prefetch_min_write_throughput;        /* in bytes/millisecond. */
> +    int64_t prefetch_read_throughput_measure_time;        /* in millisecond. */
> +    int64_t prefetch_write_throughput_measure_time;        /* in millisecond.*/
> +    int prefetch_throttle_time;        /* in millisecond. */
> +    int sectors_per_prefetch;
> +    QEMUTimer *prefetch_timer;
> +    /* prefetch_perf_calc_alpha = FvdHeader.prefetch_perf_calc_alpha/100.0 */
> +    double prefetch_perf_calc_alpha;
> +    /******** End: for prefetching. ***********************************/
> +
> +    /******** Begin: for compact image. *************************************/
> +    uint32_t *table;        /* Mapping table stored in memory in little endian. */
> +    int64_t data_storage;        /* in sectors. */
> +    int64_t used_storage;        /* in sectors. */
> +    int64_t chunk_size;        /* in sectors. */
> +    int64_t storage_grow_unit;        /* in sectors. */
> +    int64_t table_offset;        /* in sectors. */
> +    char *add_storage_cmd;
> +    /******** End: for compact image. ***************************************/
> +
> +    /******** Begin: for journal. *******************************************/
> +    int64_t journal_offset;        /* in sectors. */
> +    int64_t journal_size;        /* in sectors. */
> +    int64_t next_journal_sector;        /* in sector. */
> +    int ongoing_journal_updates;        /* Number of ongoing journal updates. */
> +    int dirty_image;        /* TRUE or FALSE. */
> +
> +    /* Requests waiting for metadata flush and journal recycle to finish. */
> +    QLIST_HEAD(JournalFlush, FvdAIOCB) wait_for_journal;
> +    /******** End: for journal. ********************************************/
> +
> +#ifdef FVD_DEBUG
> +    int64_t total_copy_on_read_data;                /* in bytes. */
> +    int64_t total_prefetch_data;                /* in bytes. */
> +#endif
> +} BDRVFvdState;
> +
> +/* Begin of data type definitions. */
> +struct FvdAIOCB;
> +
> +typedef struct JournalCB {
> +    BlockDriverAIOCB *hd_acb;
> +    QEMUIOVector qiov;
> +    struct iovec iov;
> +     QLIST_ENTRY(FvdAIOCB) next_wait_for_journal;
> +} JournalCB;
> +
> +/* CopyLock is used by AIOWriteCB and AIOCopyCB. */
> +typedef struct CopyLock {
> +    QLIST_ENTRY(FvdAIOCB) next;
> +    int64_t begin;
> +    int64_t end;
> +     QLIST_HEAD(DependentWritesHead, FvdAIOCB) dependent_writes;
> +} CopyLock;
> +
> +typedef struct ChildAIOReadCB {
> +    BlockDriverAIOCB *hd_acb;
> +    struct iovec iov;
> +    QEMUIOVector qiov;
> +    int64_t sector_num;
> +    int nb_sectors;
> +    int done;
> +} ChildAIOReadCB;
> +
> +typedef struct AIOReadCB {
> +    QEMUIOVector *qiov;
> +    int ret;
> +    ChildAIOReadCB read_backing;
> +    ChildAIOReadCB read_fvd;
> +} AIOReadCB;
> +
> +/* For copy-on-read and prefetching. */
> +typedef struct AIOCopyCB {
> +    BlockDriverAIOCB *hd_acb;
> +    struct iovec iov;
> +    QEMUIOVector qiov;
> +    uint8_t *buf;
> +    int64_t buffered_sector_begin;
> +    int64_t buffered_sector_end;
> +    int64_t last_prefetch_op_start_time;        /* For prefetch only. */
> +} AIOCopyCB;
> +
> +typedef struct AIOWriteCB {
> +    BlockDriverAIOCB *hd_acb;
> +    QEMUIOVector *qiov;
> +    uint8_t *cow_buf;
> +    QEMUIOVector *cow_qiov;
> +    int64_t cow_start_sector;
> +    int update_table;        /* TRUE or FALSE. */
> +    int ret;
> +    QLIST_ENTRY(FvdAIOCB) next_write_lock;   /* See BDRVFvdState.write_locks */
> +
> +    /* See FvdAIOCB.write.dependent_writes. */
> +    QLIST_ENTRY(FvdAIOCB) next_dependent_write;
> +} AIOWriteCB;
> +
> +/* For AIOStoreCompactCB and AIOLoadCompactCB. */
> +typedef struct CompactChildCB {
> +    struct FvdAIOCB *acb;
> +    BlockDriverAIOCB *hd_acb;
> +} CompactChildCB;
> +
> +/* For storing data to a compact image. */
> +typedef struct AIOStoreCompactCB {
> +    CompactChildCB one_child;
> +    CompactChildCB *children;
> +    int update_table;
> +    int num_children;
> +    int finished_children;
> +    struct FvdAIOCB *parent_acb;
> +    int ret;
> +    int soft_write; /*TRUE if the store is caused by copy-on-read or prefetch.*/
> +    QEMUIOVector *orig_qiov;
> +} AIOStoreCompactCB;
> +
> +/* For loading data from a compact image. */
> +typedef struct AIOLoadCompactCB {
> +    CompactChildCB *children;
> +    CompactChildCB one_child;
> +    int num_children;
> +    int finished_children;
> +    struct FvdAIOCB *parent_acb;
> +    int ret;
> +    QEMUIOVector *orig_qiov;
> +} AIOLoadCompactCB;
> +
> +typedef struct AIOFlushCB {
> +    BlockDriverAIOCB *data_acb;
> +    BlockDriverAIOCB *metadata_acb;
> +    int num_finished;
> +    int ret;
> +} AIOFlushCB;
> +
> +typedef struct AIOWrapperCB {
> +    QEMUBH *bh;
> +} AIOWrapperCB;
> +
> +typedef enum { OP_READ = 1, OP_WRITE, OP_COPY, OP_STORE_COMPACT,
> +    OP_LOAD_COMPACT, OP_WRAPPER, OP_FLUSH } op_type;
> +
> +#ifdef FVD_DEBUG
> +/* For debugging memory leaks. */
> +typedef struct alloc_tracer_t {
> +    int64_t magic;
> +    int alloc_tracer;
> +    const char *alloc_file;
> +    int alloc_line;
> +    size_t size;
> +} alloc_tracer_t;
> +#endif
> +
> +typedef struct FvdAIOCB {
> +    BlockDriverAIOCB common;
> +    op_type type;
> +    int64_t sector_num;
> +    int nb_sectors;
> +    JournalCB jcb;        /* For AIOWriteCB and AIOStoreCompactCB. */
> +    CopyLock copy_lock;        /* For AIOWriteCB and AIOCopyCB. */
> +
> +    /* Use a union so that all requests can efficiently share one big AIOPool.*/
> +    union {
> +        AIOWrapperCB wrapper;
> +        AIOReadCB read;
> +        AIOWriteCB write;
> +        AIOCopyCB copy;
> +        AIOLoadCompactCB load;
> +        AIOStoreCompactCB store;
> +        AIOFlushCB flush;
> +    };
> +
> +#ifdef FVD_DEBUG
> +    int64_t magic;
> +    alloc_tracer_t tracer;
> +
> +    /* Uniquely identifies a request across all processing activities. */
> +    unsigned long long int uuid;
> +#endif
> +} FvdAIOCB;
> +
> +static AIOPool fvd_aio_pool;
> +static BlockDriver bdrv_fvd;
> +static QEMUOptionParameter fvd_create_options[];
> +
> +/* Function prototypes. */
> +static int do_aio_write(struct FvdAIOCB *acb);
> +static void finish_write_data(void *opaque, int ret);
> +static void restart_dependent_writes(struct FvdAIOCB *acb);
> +static void finish_prefetch_read(void *opaque, int ret);
> +static int read_fvd_header(BDRVFvdState * s, FvdHeader * header);
> +static int update_fvd_header(BDRVFvdState * s, FvdHeader * header);
> +static void fvd_aio_cancel(BlockDriverAIOCB * blockacb);
> +static BlockDriverAIOCB *store_data_in_compact_image(struct FvdAIOCB *acb,
> +            int soft_write, struct FvdAIOCB *parent_acb, BlockDriverState * bs,
> +            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
> +            BlockDriverCompletionFunc * cb, void *opaque);
> +static BlockDriverAIOCB *load_data_from_compact_image(struct FvdAIOCB *acb,
> +            struct FvdAIOCB *parent_acb, BlockDriverState * bs,
> +            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
> +            BlockDriverCompletionFunc * cb, void *opaque);
> +static void free_write_resource(struct FvdAIOCB *acb);
> +static void write_metadata_to_journal(struct FvdAIOCB *acb);
> +static void flush_metadata_to_disk(BlockDriverState * bs);
> +static void free_journal_sectors(BDRVFvdState * s);
> +static int fvd_create(const char *filename, QEMUOptionParameter * options);
> +static int fvd_probe(const uint8_t * buf, int buf_size, const char *filename);
> +static int fvd_open(BlockDriverState * bs, const char *filename, int flags);
> +static void fvd_close(BlockDriverState * bs);
> +static int fvd_is_allocated(BlockDriverState * bs, int64_t sector_num,
> +                            int nb_sectors, int *pnum);
> +static int fvd_flush(BlockDriverState * bs);
> +static BlockDriverAIOCB *fvd_aio_readv(BlockDriverState * bs,
> +            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
> +            BlockDriverCompletionFunc * cb, void *opaque);
> +static BlockDriverAIOCB *fvd_aio_writev(BlockDriverState * bs,
> +            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
> +            BlockDriverCompletionFunc * cb, void *opaque);
> +static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState * bs,
> +            BlockDriverCompletionFunc * cb, void *opaque);
> +static int fvd_get_info(BlockDriverState * bs, BlockDriverInfo * bdi);
> +static int fvd_update(BlockDriverState * bs, int argc, char **argv);
> +static int fvd_has_zero_init(BlockDriverState * bs);
> +static void fvd_read_cancel(FvdAIOCB * acb);
> +static void fvd_write_cancel(FvdAIOCB * acb);
> +static void fvd_copy_cancel(FvdAIOCB * acb);
> +static void fvd_load_compact_cancel(FvdAIOCB * acb);
> +static void fvd_store_compact_cancel(FvdAIOCB * acb);
> +static void fvd_wrapper_cancel(FvdAIOCB * acb);
> +static void flush_metadata_to_disk_on_exit (BlockDriverState *bs);
> +static inline BlockDriverAIOCB *load_data(FvdAIOCB * parent_acb,
> +            BlockDriverState * bs, int64_t sector_num, QEMUIOVector * orig_qiov,
> +            int nb_sectors, BlockDriverCompletionFunc * cb, void *opaque);
> +static inline BlockDriverAIOCB *store_data(int soft_write,
> +            FvdAIOCB * parent_acb, BlockDriverState * bs, int64_t sector_num,
> +            QEMUIOVector * orig_qiov, int nb_sectors,
> +            BlockDriverCompletionFunc * cb, void *opaque);
> +
> +/* Default configurations. */
> +#define DEF_PAGE_SIZE                                 4096        /* bytes */
> +#define BYTES_PER_PREFETCH                        1048576        /* bytes */
> +#define PREFETCH_THROTTLING_TIME                30000        /* milliseconds */
> +#define NUM_PREFETCH_SLOTS                        2
> +#define PREFETCH_MIN_MEASURE_READ_TIME                 100        /* milliseconds */
> +#define PREFETCH_MIN_MEASURE_WRITE_TIME         100        /* milliseconds */
> +#define PREFETCH_MIN_READ_THROUGHPUT                 5120        /* KB/s */
> +#define PREFETCH_MIN_WRITE_THROUGHPUT                 5120        /* KB/s */
> +#define PREFETCH_MAX_READ_THROUGHPUT                 1000000000L        /* KB/s */
> +#define PREFETCH_MAX_WRITE_THROUGHPUT                 1000000000L        /* KB/s */
> +#define PREFETCH_PERF_CALC_ALPHA                80        /* in [0,100]. */
> +#define MAX_OUTSTANDING_COPY_ON_READ_DATA        2000000                /* bytes */
> +#define MODERATE_BITMAP_SIZE                         4194304L        /* bytes */
> +#define CHUNK_SIZE                                1048576LL        /* bytes */
> +#define JOURNAL_SIZE                                16777216LL        /* bytes */
> +#define STORAGE_GROW_UNIT                        104857600LL        /* bytes */
> +
> +/* State of BDRVFvdState.prefetch_state. */
> +#define PREFETCH_STATE_RUNNING                        1
> +#define PREFETCH_STATE_FINISHED                        2
> +#define PREFETCH_STATE_DISABLED                        3
> +
> +/* For convenience. */
> +#define ROUND_UP(x, base)           ((((x)+(base)-1) / (base)) * (base))
> +#define ROUND_DOWN(x, base)           ((((x) / (base)) * (base)))
> +#define BOOL(x)                 ((x) ? "true" : "false")
> +#define EMPTY_TABLE                ((uint32_t)0xFFFFFFFF)
> +#define DIRTY_TABLE                ((uint32_t)0x80000000)
> +#define READ_TABLE(entry)         (le32_to_cpu(entry) & ~DIRTY_TABLE)
> +# define FVDAIOCB_MAGIC         ((uint64_t)0x3A8FCE89325B976DULL)
> +# define FVD_ALLOC_MAGIC         ((uint64_t)0x4A7dCEF9925B976DULL)
> +#define IS_EMPTY(entry)         ((entry) == EMPTY_TABLE)
> +#define IS_DIRTY(entry)         (le32_to_cpu(entry) & DIRTY_TABLE)
> +#define WRITE_TABLE(entry,id)         ((entry) = cpu_to_le32(id))
> +#define READ_TABLE2(entry) \
> +    ((entry) == EMPTY_TABLE ? EMPTY_TABLE : (le32_to_cpu(entry) & ~DIRTY_TABLE))
> +
> +#define CLEAN_DIRTY(entry) \
> +    do {  \
> +        if (!IS_EMPTY(entry))  \
> +            entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE);  \
> +    } while (0)
> +
> +#define CLEAN_DIRTY2(entry) \
> +    do { \
> +        ASSERT(!IS_EMPTY(entry)); \
> +        entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE);  \
> +    } while (0)
>    
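The table-entry encoding at the end of fvd.h can be exercised on its own; in this sketch the le32/cpu conversion is stubbed out (little-endian host assumed), and OR-ing DIRTY_TABLE into the chunk id is merely one plausible way a caller could mark an entry dirty, not something taken from the patch:

    #include <stdio.h>
    #include <stdint.h>

    #define le32_to_cpu(x) (x)               /* stub: little-endian host */
    #define cpu_to_le32(x) (x)

    #define EMPTY_TABLE ((uint32_t)0xFFFFFFFF)
    #define DIRTY_TABLE ((uint32_t)0x80000000)
    #define READ_TABLE(entry)  (le32_to_cpu(entry) & ~DIRTY_TABLE)
    #define IS_EMPTY(entry)    ((entry) == EMPTY_TABLE)
    #define IS_DIRTY(entry)    (le32_to_cpu(entry) & DIRTY_TABLE)
    #define WRITE_TABLE(entry, id) ((entry) = cpu_to_le32(id))
    #define CLEAN_DIRTY(entry) \
        do { \
            if (!IS_EMPTY(entry)) \
                entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE); \
        } while (0)

    int main(void)
    {
        uint32_t entry = EMPTY_TABLE;
        printf ("empty=%d\n", IS_EMPTY (entry));                  /* 1 */

        /* Map the chunk to physical chunk id 42 and flag it dirty
         * (the dirty flag occupies the entry's top bit). */
        WRITE_TABLE (entry, 42 | DIRTY_TABLE);
        printf ("id=%u dirty=%d\n", READ_TABLE (entry), !!IS_DIRTY (entry)); /* 42 1 */

        CLEAN_DIRTY (entry);                  /* clear the dirty flag */
        printf ("id=%u dirty=%d\n", READ_TABLE (entry), !!IS_DIRTY (entry)); /* 42 0 */
        return 0;
    }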

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 3/5] Fast Virtual Disk (FVD) Proposal Part 3
  2011-01-21 22:57   ` Anthony Liguori
@ 2011-01-21 23:09     ` Anthony Liguori
  2011-01-24 15:29     ` Chunqiang Tang
  1 sibling, 0 replies; 18+ messages in thread
From: Anthony Liguori @ 2011-01-21 23:09 UTC (permalink / raw)
  To: Chunqiang Tang; +Cc: qemu-devel

On 01/21/2011 04:57 PM, Anthony Liguori wrote:
> On 01/19/2011 04:04 PM, Chunqiang Tang wrote:
>> Part 3 of the block device driver for the proposed FVD image format.
>> Multiple patches are used in order to manage the size of each patch.
>> This patch includes some new files for FVD.
>>
>> See the related discussions at
>> http://lists.gnu.org/archive/html/qemu-devel/2011-01/msg00426.html .
>
> Before going any further with this series, I'd like to see
>
> 1) a specification (on the QEMU wiki) describing this image format 
> that can be reviewed
>
> 2) a concise explanation of why qcow2/qed cannot satisfy the use cases 
> addressed by FVD

FYI, I don't accept fragmentation avoidance as a justification.  This 
has been discussed at great length in the past on the mailing list but 
essentially:

a. any reasonable block format can perform online defragmentation such 
that for a fully allocated image, with a fixed cost that's amortized 
over the lifetime of the image, you have an ideal layout

b. it's only applicable to simple scenarios where you're directly on top 
of raw spindles, which is increasingly rare.  File systems reorder data 
on sparsely allocated files such that you're no better off with a growing 
image vs. a sparse image if the extent size is large enough.

c. most storage has some degree of virtualization today so assumptions 
about contiguous allocation are more or less irrelevant

d. flash drives don't have a seek cost

Regards,

Anthony Liguori

> 3) performance data to back up the claims of (2)
>
> I don't want a reference to a website or a paper.  Just a concise and 
> clear explanation on the mailing list.
>
> I think blocksim might be worth merging but I'm still extremely 
> sceptical of the claimed benefits of FVD.
>
> Comparing FVD performance to qcow2 is not really all that 
> interesting.  qcow2 is terrible from a performance perspective in the 
> face of any rigorous benchmarking.  I would suggest focusing on QED in 
> terms of comparative performance.
>
> Regards,
>
> Anthony Liguori
>
>> Signed-off-by: Chunqiang Tang<ctang@us.ibm.com>
>> ---
>>   block/fvd-create.c |  475 
>> +++++++++++++++++++++++++++++++++++++++++++++++++++
>>   block/fvd-debug.c  |  406 ++++++++++++++++++++++++++++++++++++++++++++
>>   block/fvd-ext.h    |   71 ++++++++
>>   block/fvd.c        |  127 ++++++++++++++
>>   block/fvd.h        |  481 
>> ++++++++++++++++++++++++++++++++++++++++++++++++++++
>>   5 files changed, 1560 insertions(+), 0 deletions(-)
>>   create mode 100644 block/fvd-create.c
>>   create mode 100644 block/fvd-debug.c
>>   create mode 100644 block/fvd-ext.h
>>   create mode 100644 block/fvd.c
>>   create mode 100644 block/fvd.h
>>
>> diff --git a/block/fvd-create.c b/block/fvd-create.c
>> new file mode 100644
>> index 0000000..b978ecb
>> --- /dev/null
>> +++ b/block/fvd-create.c
>> @@ -0,0 +1,475 @@
>> +/*
>> + * Copyright (c) 2010-2011 IBM
>> + *
>> + * Authors:
>> + *         Chunqiang Tang<ctang@us.ibm.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.
>> + * See the COPYING file in the top-level directory.
>> + */
>> +
>> +/*=============================================================================
>> + *  A short description: this module implements bdrv_create() for FVD.
>> + *============================================================================*/
>> +
>> +static inline int64_t calc_min_journal_size (int64_t table_entries);
>> +static inline int search_holes(const char *filename, size_t 
>> bitmap_size,
>> +                    int32_t bitmap_start_offset, BlockDriverState * bs,
>> +                    int64_t nb_sectors, int32_t hole_size, int32_t 
>> block_size);
>> +
>> +static int fvd_create (const char *filename, QEMUOptionParameter * 
>> options)
>> +{
>> +    int fd, ret;
>> +    FvdHeader *header;
>> +    int64_t virtual_disk_size = DEF_PAGE_SIZE;
>> +    int32_t header_size;
>> +    const char *base_img = NULL;
>> +    const char *base_img_fmt = NULL;
>> +    const char *data_file = NULL;
>> +    const char *data_file_fmt = NULL;
>> +    int32_t hole_size = 0;
>> +    int copy_on_read = FALSE;
>> +    int prefetch_start_delay = -1;
>> +    int64_t prefetch_profile_size = 0;
>> +    BlockDriverState *bs = NULL;
>> +    int bitmap_size = 0;
>> +    int64_t base_img_size = 0;
>> +    int64_t table_size = 0;
>> +    int64_t journal_size = 0;
>> +    int32_t block_size = 0;
>> +
>> +    header_size = sizeof (FvdHeader);
>> +    header_size = ROUND_UP (header_size, DEF_PAGE_SIZE);
>> +    header = my_qemu_mallocz (header_size);
>> +
>> +    /* Read out options */
>> +    while (options&&  options->name) {
>> +        if (!strcmp (options->name, BLOCK_OPT_SIZE)) {
>> +            virtual_disk_size = options->value.n;
>> +        } else if (!strcmp (options->name,"prefetch_start_delay")) {
>> +            if (options->value.n<= 0) {
>> +                prefetch_start_delay = -1;
>> +            } else {
>> +                prefetch_start_delay = options->value.n;
>> +            }
>> +        } else if (!strcmp (options->name, BLOCK_OPT_BACKING_FILE)) {
>> +            base_img = options->value.s;
>> +        } else if (!strcmp (options->name, BLOCK_OPT_BACKING_FMT)) {
>> +            base_img_fmt = options->value.s;
>> +        } else if (!strcmp (options->name, "copy_on_read")) {
>> +            copy_on_read = options->value.n;
>> +        } else if (!strcmp (options->name, "data_file")) {
>> +            data_file = options->value.s;
>> +        } else if (!strcmp (options->name, "data_file_fmt")) {
>> +            data_file_fmt = options->value.s;
>> +        } else if (!strcmp (options->name, "detect_sparse_hole")) {
>> +            hole_size = options->value.n;
>> +        } else if (!strcmp (options->name, "compact_image")) {
>> +            header->compact_image = options->value.n;
>> +        } else if (!strcmp (options->name, "block_size")) {
>> +            block_size = options->value.n;
>> +        } else if (!strcmp (options->name, "chunk_size")) {
>> +            header->chunk_size = options->value.n;
>> +        } else if (!strcmp (options->name, "journal_size")) {
>> +            journal_size = options->value.n;
>> +        } else if (!strcmp (options->name, "storage_grow_unit")) {
>> +            header->storage_grow_unit = options->value.n;
>> +        } else if (!strcmp (options->name, "add_storage_cmd")
>> +&&  options->value.s) {
>> +            pstrcpy (header->add_storage_cmd, sizeof 
>> (header->add_storage_cmd),
>> +                     options->value.s);
>> +        }
>> +        options++;
>> +    }
>> +
>> +    virtual_disk_size = ROUND_UP (virtual_disk_size, 512);
>> +
>> +    /* Check if arguments are valid. */
>> +    if (base_img&&  strlen (base_img)>  1023) {
>> +        fprintf (stderr, "The base image name is longer than 1023 
>> characters, "
>> +                 "which is not allowed.\n");
>> +        return -EINVAL;
>> +    }
>> +
>> +    if (base_img&&  hole_size>  0) {
>> +        if (header->compact_image) {
>> +            fprintf (stderr, "compact_image and detect_sparse_hole 
>> cannot be "
>> +                     "enabled together. Please disable 
>> detect_sparse_hole. \n");
>> +            return -EINVAL;
>> +        }
>> +        header->need_zero_init = TRUE;
>> +    } else {
>> +        header->need_zero_init = FALSE;
>> +    }
>> +
>> +    if (data_file) {
>> +        pstrcpy (header->data_file, 1024, data_file);
>> +        if (data_file_fmt) {
>> +            pstrcpy (header->data_file_fmt, 16, data_file_fmt);
>> +        }
>> +    }
>> +
>> +    header->magic = FVD_MAGIC;
>> +    header->version = FVD_VERSION;
>> +    header->virtual_disk_size = virtual_disk_size;
>> +    header->clean_shutdown = TRUE;
>> +
>> +    if (!base_img) {
>> +        header->all_data_in_fvd_img = TRUE;
>> +    } else {
>> +        int ret;
>> +
>> +        bs = bdrv_new ("");
>> +        if (!bs) {
>> +            fprintf (stderr, "Failed to create a new block driver\n");
>> +            return -1;
>> +        }
>> +
>> +        pstrcpy (header->base_img, 1024, base_img);
>> +        if (base_img_fmt) {
>> +            pstrcpy (header->base_img_fmt, 16, base_img_fmt);
>> +            BlockDriver *drv = bdrv_find_format (base_img_fmt);
>> +            if (!drv) {
>> +                fprintf (stderr, "Failed to find driver for format 
>> '%s'\n",
>> +                         base_img_fmt);
>> +                return -1;
>> +            }
>> +            ret = bdrv_open (bs, base_img, 0, drv);
>> +        } else {
>> +            ret = bdrv_open (bs, base_img, 0, NULL);
>> +        }
>> +
>> +        if (ret<  0) {
>> +            fprintf (stderr, "Failed to open the base image %s\n", 
>> base_img);
>> +            return -1;
>> +        }
>> +
>> +        base_img_size = bdrv_getlength (bs);
>> +        base_img_size = MIN (virtual_disk_size, base_img_size);
>> +        base_img_size = ROUND_UP (base_img_size, 512);
>> +
>> +        if (block_size<= 0) {
>> +            /* No block size is provided. Find the smallest block 
>> size that
>> +             * does not make the bitmap too big. */
>> +            block_size = 512;
>> +            while (1) {
>> +                int64_t blocks = (base_img_size + block_size - 1) / 
>> block_size;
>> +                bitmap_size = (blocks + 7) / 8;
>> +                if (bitmap_size<= MODERATE_BITMAP_SIZE) {
>> +                    break;
>> +                }
>> +                block_size *= 2;
>> +            }
>> +        } else {
>> +            block_size = ROUND_UP (block_size, 512);
>> +            int64_t blocks = (base_img_size + block_size - 1) / 
>> block_size;
>> +            bitmap_size = (blocks + 7) / 8;
>> +        }
>> +
>> +        bitmap_size = ROUND_UP (bitmap_size, DEF_PAGE_SIZE);
>> +        header->bitmap_size = bitmap_size;
>> +        header->block_size = block_size;
>> +        header->bitmap_offset = header_size;
>> +
>> +        prefetch_profile_size = header->prefetch_profile_entries *
>> +                                    sizeof (PrefetchProfileEntry);
>> +        prefetch_profile_size = ROUND_UP (prefetch_profile_size, 
>> DEF_PAGE_SIZE);
>> +        header->base_img_size = base_img_size;
>> +        header->max_outstanding_copy_on_read_data =
>> +                                    MAX_OUTSTANDING_COPY_ON_READ_DATA;
>> +        header->copy_on_read = copy_on_read;
>> +        header->prefetch_start_delay =
>> +                                    prefetch_start_delay;
>> +        header->num_prefetch_slots = NUM_PREFETCH_SLOTS;
>> +        header->bytes_per_prefetch = ROUND_UP (BYTES_PER_PREFETCH, 
>> block_size);
>> +        header->prefetch_throttle_time = PREFETCH_THROTTLING_TIME;
>> +        header->prefetch_read_throughput_measure_time =
>> +                                    PREFETCH_MIN_MEASURE_READ_TIME;
>> +        header->prefetch_write_throughput_measure_time =
>> +                                    PREFETCH_MIN_MEASURE_WRITE_TIME;
>> +        header->prefetch_perf_calc_alpha = PREFETCH_PERF_CALC_ALPHA;
>> +        header->prefetch_min_read_throughput = 
>> PREFETCH_MIN_READ_THROUGHPUT;
>> +        header->prefetch_min_write_throughput = 
>> PREFETCH_MIN_WRITE_THROUGHPUT;
>> +        header->prefetch_max_read_throughput = 
>> PREFETCH_MAX_READ_THROUGHPUT;
>> +        header->prefetch_max_write_throughput = 
>> PREFETCH_MAX_WRITE_THROUGHPUT;
>> +        header->all_data_in_fvd_img = FALSE;
>> +        header->unit_of_PrefetchProfileEntry_len = DEF_PAGE_SIZE;
>> +        header->generate_prefetch_profile = FALSE; /* To be 
>> implemented. */
>> +        header->profile_directed_prefetch_start_delay = -1;/*To be 
>> implemented*/
>> +    }
>> +
>> +    /* Set the table size. */
>> +    if (header->compact_image) {
>> +        if (header->chunk_size<= 0) {
>> +            header->chunk_size = CHUNK_SIZE;
>> +        }
>> +        header->chunk_size = ROUND_UP (header->chunk_size, 
>> DEF_PAGE_SIZE);
>> +        if (header->storage_grow_unit<= 0) {
>> +            header->storage_grow_unit = STORAGE_GROW_UNIT;
>> +        }
>> +        if (header->storage_grow_unit<  header->chunk_size) {
>> +            header->storage_grow_unit = header->chunk_size;
>> +        }
>> +        int64_t table_entries =
>> +            (virtual_disk_size + header->chunk_size - 1) / 
>> header->chunk_size;
>> +        table_size = sizeof (uint32_t) * table_entries;
>> +        table_size = ROUND_UP (table_size, DEF_PAGE_SIZE);
>> +        header->table_offset = header_size + bitmap_size;
>> +    }
>> +
>> +    /* Set the journal size. */
>> +    if (bitmap_size<= 0&&  table_size<= 0) {
>> +        header->journal_size = 0;        /* No need to use journal. */
>> +    } else if (journal_size<  0) {
>> +        /* Disable the use of journal, which reduces overhead but 
>> may cause
>> +         * data corruption if the host crashes. This is a valid 
>> configuration
>> +         * for some use cases, where data integrity is not 
>> critical.  */
>> +        header->journal_size = 0;
>> +    } else {
>> +        if (journal_size == 0) {
>> +            /* No journal size is specified. Use a default size. */
>> +            journal_size = JOURNAL_SIZE;
>> +        }
>> +        if (table_size>  0) {
>> +            /* Make sure that the journal is at least large enough 
>> to record
>> +             * all table changes in one shot, which is the extremely 
>> unlikely
>> +             * worst case. */
>> +            int64_t vsize = virtual_disk_size + header->chunk_size - 1;
>> +            int64_t table_entries = vsize / header->chunk_size;
>> +            int64_t min_journal_size = calc_min_journal_size 
>> (table_entries);
>> +            if (journal_size<  min_journal_size) {
>> +                journal_size = min_journal_size;
>> +            }
>> +        }
>> +        journal_size = ROUND_UP (journal_size, DEF_PAGE_SIZE);
>> +        header->journal_size = journal_size;
>> +        header->journal_offset = header_size + bitmap_size + 
>> table_size;
>> +    }
>> +
>> +    const int64_t metadata_size = header_size + bitmap_size + 
>> table_size +
>> +                                prefetch_profile_size + MAX (0, 
>> journal_size);
>> +    header->metadata_size = metadata_size;
>> +
>> +    fd = open (filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 
>> 0644);
>> +    if (fd<  0) {
>> +        fprintf (stderr, "Failed to open %s\n", filename);
>> +        goto fail;
>> +    }
>> +    fvd_header_cpu_to_le (header);
>> +
>> +    if (qemu_write_full (fd, header, header_size) != header_size) {
>> +        fprintf (stderr, "Failed to write the header of %s\n", 
>> filename);
>> +        goto fail;
>> +    }
>> +
>> +    /* Initialize the bitmap. */
>> +    if (bitmap_size>  0) {
>> +        uint8_t *bitmap = my_qemu_mallocz (bitmap_size);
>> +        ret = qemu_write_full (fd, bitmap, bitmap_size);
>> +        my_qemu_free (bitmap);
>> +        if (ret != bitmap_size) {
>> +            fprintf (stderr, "Failed to zero out the bitmap of 
>> %s\n", filename);
>> +            goto fail;
>> +        }
>> +    }
>> +
>> +    /* Initialize the table. */
>> +    if (table_size>  0) {
>> +        /* Set all entries to EMPTY_TABLE (0xFFFFFFFF). */
>> +        uint8_t *empty_table = my_qemu_malloc (table_size);
>> +        memset (empty_table, 0xFF, table_size);
>> +        ret = qemu_write_full (fd, empty_table, table_size);
>> +        my_qemu_free (empty_table);
>> +        if (ret != table_size) {
>> +            fprintf (stderr, "Failed to write the table of %s\n.", 
>> filename);
>> +            goto fail;
>> +        }
>> +    }
>> +
>> +    /* Initialize the journal. */
>> +    if (journal_size>  0) {
>> +        uint8_t *empty_journal = my_qemu_mallocz (journal_size);
>> +        ret = qemu_write_full (fd, empty_journal, journal_size);
>> +        my_qemu_free (empty_journal);
>> +        if (ret != journal_size) {
>> +            fprintf (stderr, "Failed to initialize the journal for 
>> %s\n.",
>> +                     filename);
>> +            goto fail;
>> +        }
>> +    }
>> +
>> +    close (fd);
>> +    ret = 0;
>> +
>> +    if (bs&&  hole_size>  0) {
>> +        ret = search_holes (filename, (size_t) bitmap_size, 
>> header_size, bs,
>> +                            base_img_size / 512, hole_size, 
>> block_size);
>> +    }
>> +
>> +    if (bs) {
>> +        bdrv_close (bs);
>> +    }
>> +    my_qemu_free (header);
>> +    return ret;
>> +
>> +  fail:
>> +    if (bs) {
>> +        bdrv_close (bs);
>> +    }
>> +    close (fd);
>> +    my_qemu_free (header);
>> +    return -1;
>> +}
>> +
>> +/* For the optimization called "free write to zero-filled blocks". 
>> See Section
>> + * 3.3.3 of the FVD-cow paper. */
>> +static inline int search_holes (const char *filename, size_t 
>> bitmap_size,
>> +                                int32_t bitmap_start_offset,
>> +                                BlockDriverState * bs, int64_t 
>> nb_sectors,
>> +                                int32_t hole_size, int32_t block_size)
>> +{
>> +    const int fd = open (filename, O_RDWR | O_BINARY | O_LARGEFILE, 0);
>> +    if (fd<  0) {
>> +        fprintf (stderr, "Failed to open %s for read and write.\n", 
>> filename);
>> +        return -1;
>> +    }
>> +
>> +    printf ("Searching zero-filled sectors in the base image. Please 
>> wait...");
>> +    fflush (stdout);
>> +
>> +    uint8_t *bitmap =
>> +        (uint8_t *) mmap (NULL, bitmap_size, PROT_READ | PROT_WRITE, 
>> MAP_SHARED,
>> +                          fd, (off_t) bitmap_start_offset);
>> +    if (bitmap == MAP_FAILED) {
>> +        fprintf (stderr, "Failed to mmap() %s\n", filename);
>> +        close (fd);
>> +        return -1;
>> +    }
>> +
>> +    if (hole_size<  block_size) {
>> +        hole_size = block_size;
>> +    }
>> +    hole_size = ROUND_UP (hole_size, block_size);
>> +    nb_sectors = ROUND_DOWN (nb_sectors, hole_size);
>> +    const int sectors_per_hole = hole_size / 512;
>> +    const int sectors_per_block = block_size / 512;
>> +    int num_int64_in_hole = hole_size / 8;
>> +    int64_t hole_count = 0;
>> +    int i, ret = 0;
>> +    int64_t sec = 0;
>> +    uint8_t *p = my_qemu_blockalign (bs, hole_size);
>> +
>> +    while (sec<  nb_sectors) {
>> +        int64_t *q;
>> +
>> +        if (bdrv_read (bs, sec, p, sectors_per_hole)<  0) {
>> +            fprintf (stderr, "Error in reading the base image\n");
>> +            ret = -1;
>> +            goto done;
>> +        }
>> +
>> +        /* All zeros? */
>> +        q = (int64_t *) p;
>> +        for (i = 0; i<  num_int64_in_hole; i++) {
>> +            if (*q != 0) {
>> +                break;
>> +            }
>> +            q++;
>> +        }
>> +
>> +        if (i<  num_int64_in_hole) {
>> +            /* This is not a hole. */
>> +            sec += sectors_per_hole;
>> +        } else {
>> +             /* These sectors contain only zeros.  Set the flag to
>> +              * indicate that there is no need to read these sectors from the
>> +              * base image.  See Section 3.3.3 of the FVD-cow paper for the
>> +              * rationale. */
>> +            hole_count++;
>> +            int64_t end = sec + sectors_per_hole;
>> +            while (sec<  end) {
>> +                int block_num = sec / sectors_per_block;
>> +                int64_t bitmap_byte_offset = block_num / 8;
>> +                uint8_t bitmap_bit_offset = block_num % 8;
>> +                uint8_t mask = (uint8_t) (0x01 << bitmap_bit_offset);
>> +                uint8_t b = bitmap[bitmap_byte_offset];
>> +                if (!(b&  mask)) {
>> +                    b |= mask;
>> +                    bitmap[bitmap_byte_offset] |= mask;
>> +                }
>> +                sec += sectors_per_block;
>> +            }
>> +        }
>> +    }
>> +
>> +  done:
>> +    printf ("\nFound %" PRId64
>> +            " zero-filled hole regions. Image creation done.\n", 
>> hole_count);
>> +    my_qemu_vfree (p);
>> +    munmap (bitmap, bitmap_size);
>> +    close (fd);
>> +    return ret;
>> +}
>> +
>> +static QEMUOptionParameter fvd_create_options[] = {
>> +    {
>> +     .name = BLOCK_OPT_SIZE,
>> +     .type = OPT_SIZE,
>> +     .help = "Virtual disk size"},
>> +    {
>> +     .name = "compact_image",
>> +     .type = OPT_FLAG,
>> +     .help = "compact_image=on|off"},
>> +    {
>> +     .name = "block_size",
>> +     .type = OPT_SIZE,
>> +     .help = "Block size"},
>> +    {
>> +     .name = "chunk_size",
>> +     .type = OPT_SIZE,
>> +     .help = "Chunk size"},
>> +    {
>> +     .name = "storage_grow_unit",
>> +     .type = OPT_SIZE,
>> +     .help = "Storage grow unit"},
>> +    {
>> +     .name = "add_storage_cmd",
>> +     .type = OPT_STRING,
>> +     .help = "Command to add storage when FSI runs out of space"},
>> +    {
>> +     .name = BLOCK_OPT_BACKING_FILE,
>> +     .type = OPT_STRING,
>> +     .help = "File name of a backing image"},
>> +    {
>> +     .name = BLOCK_OPT_BACKING_FMT,
>> +     .type = OPT_STRING,
>> +     .help = "Image format of the backing image"},
>> +    {
>> +     .name = "data_file",
>> +     .type = OPT_STRING,
>> +     .help = "File name of a separate data file"},
>> +    {
>> +     .name = "data_file_fmt",
>> +     .type = OPT_STRING,
>> +     .help = "Image format of the separate data file"},
>> +    {
>> +     .name = "copy_on_read",
>> +     .type = OPT_FLAG,
>> +     .help = "copy_on_read=on|off"},
>> +    {
>> +     .name = "prefetch_start_delay",
>> +     .type = OPT_NUMBER,
>> +     .help = "Delay in seconds before starting whole image 
>> prefetching. "
>> +         "Prefetching is disabled if the delay is not a positive 
>> number."},
>> +    {
>> +     .name = "detect_sparse_hole",
>> +     .type = OPT_SIZE,
>> +     .help = "Minimum size (in bytes) of a continuous zero-filled 
>> region to be "
>> +         "considered as a sparse file hole in the backing image 
>> (setting it "
>> +         "to 0 turns off sparse file detection)"},
>> +    {
>> +     .name = "journal_size",
>> +     .type = OPT_SIZE,
>> +     .help = "Journal size"},
>> +    {NULL}
>> +};
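For context (not part of the patch): these options are consumed through the
standard qemu-img -o interface, so an illustrative invocation, with arbitrary
file names and sizes, could look like:

    qemu-img create -f fvd \
        -o backing_file=base.raw,copy_on_read=on,compact_image=on,journal_size=16M \
        fvd-test.img 10G

copy_on_read and compact_image are the OPT_FLAG entries above, while
journal_size is an OPT_SIZE entry, so the usual size suffixes apply.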
>> diff --git a/block/fvd-debug.c b/block/fvd-debug.c
>> new file mode 100644
>> index 0000000..4cef5ec
>> --- /dev/null
>> +++ b/block/fvd-debug.c
>> @@ -0,0 +1,406 @@
>> +/*
>> + * Copyright (c) 2010-2011 IBM
>> + *
>> + * Authors:
>> + *         Chunqiang Tang<ctang@us.ibm.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.
>> + * See the COPYING file in the top-level directory.
>> + */
>> +
>> +/*=============================================================================
>> + *  A short description: this module implements debugging functions for
>> + *  the Fast Virtual Disk (FVD) format.
>> + *============================================================================*/
>> +
>> +#ifndef ENABLE_TRACE_IO
>> +# define TRACE_REQUEST(...) do {} while (0)
>> +# define TRACE_STORE_IN_FVD(...) do {} while (0)
>> +
>> +#else
>> +/* Monitor IO on a specific sector that triggers bugs. */
>> +static inline void debug_sector (int64_t sector_num)
>> +{
>> +    if (FALSE) {
>> +        if (sector_num == ((int64_t) 1023990LL)) {
>> +            QPAUSE ("right sector");
>> +        }
>> +    }
>> +}
>> +
>> +static void TRACE_REQUEST (int do_write, int64_t sector_num, int 
>> nb_sectors)
>> +{
>> +    if (do_write) {
>> +        QDEBUG ("TRACE_REQUEST: write sector_num=%" PRId64
>> +                " nb_sectors=%d\n    [ ", sector_num, nb_sectors);
>> +    } else {
>> +        QDEBUG ("TRACE_REQUEST: read  sector_num=%" PRId64 " 
>> nb_sectors=%d\n"
>> +                "[ ", sector_num, nb_sectors);
>> +    }
>> +
>> +    int64_t end = sector_num + nb_sectors;
>> +    int64_t sec;
>> +    for (sec = sector_num; sec<  end; sec++) {
>> +        QDEBUG ("sec%" PRId64 " ", sec);
>> +        debug_sector (sec);
>> +    }
>> +    QDEBUG (" ]\n");
>> +}
>> +
>> +static void TRACE_STORE_IN_FVD (const char *str, int64_t sector_num,
>> +                                int nb_sectors)
>> +{
>> +    QDEBUG ("TRACE_STORE: %s sector_num=%" PRId64 " 
>> nb_sectors=%d\n    [ ",
>> +            str, sector_num, nb_sectors);
>> +    int64_t end = sector_num + nb_sectors;
>> +    int64_t sec;
>> +    for (sec = sector_num; sec<  end; sec++) {
>> +        QDEBUG ("sec%" PRId64 " ", sec);
>> +        debug_sector (sec);
>> +    }
>> +    QDEBUG (" ]\n");
>> +}
>> +#endif
>> +
>> +#ifndef FVD_DEBUG
>> +# define my_qemu_malloc qemu_malloc
>> +# define my_qemu_mallocz qemu_mallocz
>> +# define my_qemu_blockalign qemu_blockalign
>> +# define my_qemu_free qemu_free
>> +# define my_qemu_vfree qemu_vfree
>> +# define my_qemu_aio_get qemu_aio_get
>> +# define my_qemu_aio_release qemu_aio_release
>> +# define COPY_UUID(to,from) do {} while (0)
>> +
>> +#else
>> +FILE *__fvd_debug_fp;
>> +static unsigned long long int fvd_uuid = 1;
>> +static int64_t pending_qemu_malloc = 0;
>> +static int64_t pending_qemu_aio_get = 0;
>> +static int64_t pending_local_writes = 0;
>> +static const char *alloc_file;
>> +static int alloc_line;
>> +
>> +#define my_qemu_malloc(size) \
>> +    ((void*)(alloc_file=__FILE__, alloc_line=__LINE__, 
>> _my_qemu_malloc(size)))
>> +
>> +#define my_qemu_mallocz(size) \
>> +    ((void*)(alloc_file=__FILE__, alloc_line=__LINE__, 
>> _my_qemu_mallocz(size)))
>> +
>> +#define my_qemu_blockalign(bs,size) \
>> +    ((void*)(alloc_file=__FILE__, \
>> +             alloc_line=__LINE__, \
>> +             _my_qemu_blockalign(bs,size)))
>> +
>> +#define my_qemu_aio_get(pool,bs,cb,op) \
>> +    ((void*)(alloc_file=__FILE__, \
>> +             alloc_line=__LINE__, \
>> +             _my_qemu_aio_get(pool,bs,cb,op)))
>> +
>> +#define my_qemu_free(p) \
>> +    (alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_free(p))
>> +
>> +#define my_qemu_vfree(p) \
>> +    (alloc_file=__FILE__, alloc_line=__LINE__, _my_qemu_vfree(p))
>> +
>> +static void COPY_UUID (FvdAIOCB * to, FvdAIOCB * from)
>> +{
>> +    if (from) {
>> +        to->uuid = from->uuid;
>> +        FVD_DEBUG_ACB (to);
>> +    }
>> +}
>> +
>> +#ifdef DEBUG_MEMORY_LEAK
>> +# define MAX_TRACER 10485760
>> +static int alloc_tracer_used = 1;        /* slot 0 is not used. */
>> +static void **alloc_tracers = NULL;
>> +
>> +static void __attribute__ ((constructor)) init_mem_alloc_tracers (void)
>> +{
>> +    if (!alloc_tracers) {
>> +        alloc_tracers = qemu_mallocz (sizeof (void *) * MAX_TRACER);
>> +    }
>> +}
>> +
>> +static void trace_alloc (void *p, size_t size)
>> +{
>> +    alloc_tracer_t *t = p;
>> +    t->magic = FVD_ALLOC_MAGIC;
>> +    t->alloc_file = alloc_file;
>> +    t->alloc_line = alloc_line;
>> +    t->size = size;
>> +
>> +    if (alloc_tracer_used<  MAX_TRACER) {
>> +        t->alloc_tracer = alloc_tracer_used++;
>> +        alloc_tracers[t->alloc_tracer] = t;
>> +        QDEBUG ("Allocate memory using tracer%d in %s on line %d.\n",
>> +                t->alloc_tracer, alloc_file, alloc_line);
>> +    } else {
>> +        t->alloc_tracer = 0;
>> +    }
>> +
>> +    /* Set header and footer to detect out-of-range writes. */
>> +    if (size != (size_t) - 1) {
>> +        uint8_t *q = (uint8_t *) p;
>> +        uint64_t *header = (uint64_t *) (q + 512 - sizeof (uint64_t));
>> +        uint64_t *footer = (uint64_t *) (q + size - 512);
>> +        *header = FVD_ALLOC_MAGIC;
>> +        *footer = FVD_ALLOC_MAGIC;
>> +    }
>> +}
>> +
>> +static void trace_free (void *p)
>> +{
>> +    alloc_tracer_t *t = p;
>> +
>> +    QDEBUG ("Free memory with tracer%d in %s on line %d.\n",
>> +            t->alloc_tracer, alloc_file, alloc_line);
>> +    ASSERT (t->magic == FVD_ALLOC_MAGIC&&  t->alloc_tracer>= 0);
>> +
>> +    /* Check header and footer to detect out-of-range writes. */
>> +    if (t->size != (size_t) - 1) {
>> +        uint8_t *q = (uint8_t *) p;
>> +        uint64_t *header = (uint64_t *) (q + 512 - sizeof (uint64_t));
>> +        uint64_t *footer = (uint64_t *) (q + t->size - 512);
>> +        ASSERT (*header == FVD_ALLOC_MAGIC);
>> +        ASSERT (*footer == FVD_ALLOC_MAGIC);
>> +    }
>> +
>> +    if (t->alloc_tracer) {
>> +        ASSERT (alloc_tracers[t->alloc_tracer] == t);
>> +        alloc_tracers[t->alloc_tracer] = NULL;
>> +        t->alloc_tracer = -INT_MAX;
>> +    } else {
>> +        t->alloc_tracer *= -1;        /* Guard against double free. */
>> +    }
>> +}
>> +
>> +static void dump_alloc_tracers (void)
>> +{
>> +    int unfreed = 0;
>> +    int i;
>> +    for (i = 1; i<  alloc_tracer_used; i++) {
>> +        if (!alloc_tracers[i]) {
>> +            continue;
>> +        }
>> +
>> +        unfreed++;
>> +        alloc_tracer_t *t = alloc_tracers[i];
>> +
>> +        if (t->size == (size_t) - 1) {
>> +            FvdAIOCB *acb = container_of (alloc_tracers[i], 
>> FvdAIOCB, tracer);
>> +            ASSERT (acb->magic == FVDAIOCB_MAGIC);
>> +            QDEBUG ("Memory %p with tracer%d allocated in %s on line 
>> %d "
>> +                    "(FvdAIOCB acb%llu-%p) is not freed. magic %s\n",
>> +                    alloc_tracers[i], i, t->alloc_file, t->alloc_line,
>> +                    acb->uuid, acb,
>> +                    t->magic == FVD_ALLOC_MAGIC ? "correct" : "wrong");
>> +        } else {
>> +            QDEBUG ("Memory %p with tracer%d allocated in %s on line 
>> %d is "
>> +                    "not freed. magic %s\n",
>> +                    alloc_tracers[i], i, t->alloc_file, t->alloc_line,
>> +                    t->magic == FVD_ALLOC_MAGIC ? "correct" : "wrong");
>> +
>> +            uint8_t *q = (uint8_t *) t;
>> +            uint64_t *header = (uint64_t *) (q + 512 - sizeof 
>> (uint64_t));
>> +            uint64_t *footer = (uint64_t *) (q + t->size - 512);
>> +            ASSERT (*header == FVD_ALLOC_MAGIC);
>> +            ASSERT (*footer == FVD_ALLOC_MAGIC);
>> +        }
>> +    }
>> +
>> +    QDEBUG ("Unfreed memory allocations: %d\n", unfreed);
>> +}
>> +#endif
>> +
>> +static inline void *_my_qemu_aio_get (AIOPool * pool, 
>> BlockDriverState * bs,
>> +                                      BlockDriverCompletionFunc * cb,
>> +                                      void *opaque)
>> +{
>> +    pending_qemu_aio_get++;
>> +    FvdAIOCB *acb = (FvdAIOCB *) qemu_aio_get (&fvd_aio_pool, bs, 
>> cb, opaque);
>> +    acb->uuid = ++fvd_uuid;
>> +    acb->magic = FVDAIOCB_MAGIC;
>> +
>> +    FVD_DEBUG_ACB (acb);
>> +
>> +#ifdef DEBUG_MEMORY_LEAK
>> +    trace_alloc (&acb->tracer, -1);
>> +#endif
>> +
>> +    return acb;
>> +}
>> +
>> +static inline void my_qemu_aio_release (void *p)
>> +{
>> +    pending_qemu_aio_get--;
>> +    ASSERT (pending_qemu_aio_get>= 0);
>> +
>> +#ifdef DEBUG_MEMORY_LEAK
>> +    FvdAIOCB *acb = p;
>> +    trace_free (&acb->tracer);
>> +#endif
>> +
>> +    qemu_aio_release (p);
>> +}
>> +
>> +static inline void *_my_qemu_malloc (size_t size)
>> +{
>> +    ASSERT (size>  0);
>> +    pending_qemu_malloc++;
>> +#ifndef DEBUG_MEMORY_LEAK
>> +    return qemu_malloc (size);
>> +#else
>> +
>> +    size += 1024;        /* 512 bytes header and 512 bytes footer. */
>> +    uint8_t *ret = qemu_malloc (size);
>> +    trace_alloc (ret, size);
>> +    return ret + 512;
>> +#endif
>> +}
>> +
>> +static inline void *_my_qemu_mallocz (size_t size)
>> +{
>> +    ASSERT (size>  0);
>> +    pending_qemu_malloc++;
>> +#ifndef DEBUG_MEMORY_LEAK
>> +    return qemu_mallocz (size);
>> +#else
>> +
>> +    size += 1024;        /* 512 bytes header and 512 bytes footer. */
>> +    uint8_t *ret = qemu_mallocz (size);
>> +    trace_alloc (ret, size);
>> +    return ret + 512;
>> +#endif
>> +}
>> +
>> +static inline void *_my_qemu_blockalign (BlockDriverState * bs, 
>> size_t size)
>> +{
>> +    ASSERT (size>  0);
>> +    pending_qemu_malloc++;
>> +
>> +#ifndef DEBUG_MEMORY_LEAK
>> +    return qemu_blockalign (bs, size);
>> +#else
>> +
>> +    size += 1024;        /* 512 bytes header and 512 bytes footer. */
>> +    uint8_t *ret = qemu_blockalign (bs, size);
>> +    trace_alloc (ret, size);
>> +    return ret + 512;
>> +#endif
>> +}
>> +
>> +static inline void _my_qemu_free (void *ptr)
>> +{
>> +    pending_qemu_malloc--;
>> +    ASSERT (pending_qemu_malloc>= 0);
>> +#ifndef DEBUG_MEMORY_LEAK
>> +    qemu_free (ptr);
>> +#else
>> +
>> +    uint8_t *q = ((uint8_t *) ptr) - 512;
>> +    trace_free (q);
>> +    qemu_free (q);
>> +#endif
>> +}
>> +
>> +static inline void _my_qemu_vfree (void *ptr)
>> +{
>> +    pending_qemu_malloc--;
>> +    ASSERT (pending_qemu_malloc>= 0);
>> +#ifndef DEBUG_MEMORY_LEAK
>> +    qemu_vfree (ptr);
>> +#else
>> +
>> +    uint8_t *q = ((uint8_t *) ptr) - 512;
>> +    trace_free (q);
>> +    qemu_vfree (q);
>> +#endif
>> +}
>> +
>> +static void count_pending_requests (BDRVFvdState * s)
>> +{
>> +    int m = 0, k = 0;
>> +    FvdAIOCB *w;
>> +
>> +    QLIST_FOREACH (w,&s->copy_locks, copy_lock.next) {
>> +        m++;
>> +        QDEBUG ("copy_lock: acb%llu-%p\n", w->uuid, w);
>> +    }
>> +
>> +    QLIST_FOREACH (w,&s->write_locks, write.next_write_lock) {
>> +        k++;
>> +        QDEBUG ("write_lock: acb%llu-%p\n", w->uuid, w);
>> +    }
>> +
>> +    QDEBUG ("Debug_memory_leak: copy_locks=%d  write_locks=%d\n", m, 
>> k);
>> +}
>> +
>> +static void dump_resource_summary (BDRVFvdState * s)
>> +{
>> +#ifdef DEBUG_MEMORY_LEAK
>> +    dump_alloc_tracers ();
>> +#endif
>> +
>> +    QDEBUG ("Resource summary: outstanding_copy_on_read_data=%" PRId64
>> +            " total_copy_on_read_data=%" PRId64 " 
>> total_prefetch_data=%" PRId64
>> +            " " " pending_qemu_malloc=%" PRId64 " 
>> pending_qemu_aio_get=%" PRId64
>> +            " pending_local_writes=%" PRId64 "\n",
>> +            s->outstanding_copy_on_read_data, 
>> s->total_copy_on_read_data,
>> +            s->total_prefetch_data, pending_qemu_malloc, 
>> pending_qemu_aio_get,
>> +            pending_local_writes);
>> +    count_pending_requests (s);
>> +}
>> +
>> +/* Monitor processing a specific FvdAIOCB that triggers bugs. */
>> +void FVD_DEBUG_ACB (void *p)
>> +{
>> +    if (FALSE) {
>> +        FvdAIOCB *acb = p;
>> +
>> +        /* Is it FvdAIOCB? */
>> +        if (acb->magic != FVDAIOCB_MAGIC || acb->common.bs->drv 
>> !=&bdrv_fvd) {
>> +            /* Is it CompactChildCB? */
>> +            CompactChildCB *child = p;
>> +            acb = child->acb;
>> +            if (acb->magic != FVDAIOCB_MAGIC
>> +                || acb->common.bs->drv !=&bdrv_fvd
>> +                || (acb->type != OP_LOAD_COMPACT
>> +&&  acb->type != OP_STORE_COMPACT)) {
>> +                return;
>> +            }
>> +        }
>> +
>> +        if (acb->uuid == 20ULL) {
>> +            QPAUSE ("Processing the right acb");
>> +        }
>> +    }
>> +}
>> +
>> +void init_fvd_debug_fp (void)
>> +{
>> +    char buf[256];
>> +    sprintf (buf, "/tmp/fvd.log-%d", getpid ());
>> +    if ((__fvd_debug_fp = fopen (buf, "wt")) == NULL) {
>> +        __fvd_debug_fp = stdout;
>> +    }
>> +}
>> +#endif
>> +
>> +void fvd_check_memory_usage (void)
>> +{
>> +    ASSERT (pending_qemu_malloc == 0);
>> +}
>> +
>> +int fvd_get_copy_on_read (BlockDriverState * bs)
>> +{
>> +    BDRVFvdState *s = bs->opaque;
>> +    return s->copy_on_read;
>> +}
>> +
>> +void fvd_set_copy_on_read (BlockDriverState * bs, int copy_on_read)
>> +{
>> +    BDRVFvdState *s = bs->opaque;
>> +    s->copy_on_read = copy_on_read;
>> +}
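An aside on the DEBUG_MEMORY_LEAK guard words (not part of the patch):
trace_alloc()/trace_free() above pad every traced allocation with a 512-byte
header and footer and stamp FVD_ALLOC_MAGIC at the end of the header and the
start of the footer. A stripped-down sketch of the same guard scheme, with the
payload size passed explicitly instead of being recorded in the tracer:

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    #define GUARD_MAGIC ((uint64_t)0x4A7dCEF9925B976DULL)

    void *guarded_malloc(size_t payload)
    {
        uint8_t *q = malloc(payload + 1024);          /* 512-byte header and footer */
        *(uint64_t *)(q + 512 - sizeof(uint64_t)) = GUARD_MAGIC;
        *(uint64_t *)(q + 512 + payload) = GUARD_MAGIC;
        return q + 512;                               /* hand back the payload region */
    }

    void guarded_free(void *p, size_t payload)
    {
        uint8_t *q = (uint8_t *)p - 512;
        /* A trashed magic word means something wrote outside its buffer. */
        assert(*(uint64_t *)(q + 512 - sizeof(uint64_t)) == GUARD_MAGIC);
        assert(*(uint64_t *)(q + 512 + payload) == GUARD_MAGIC);
        free(q);
    }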
>> diff --git a/block/fvd-ext.h b/block/fvd-ext.h
>> new file mode 100644
>> index 0000000..6839e25
>> --- /dev/null
>> +++ b/block/fvd-ext.h
>> @@ -0,0 +1,71 @@
>> +/*
>> + * Copyright (c) 2010-2011 IBM
>> + *
>> + * Authors:
>> + *         Chunqiang Tang<ctang@us.ibm.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.
>> + * See the COPYING file in the top-level directory.
>> + */
>> +
>> +/*=============================================================================
>> + *  A short description: this header file contains functions of the FVD block
>> + *  device driver that are used by other external modules. These functions are
>> + *  mainly for testing and debugging purposes.
>> + *============================================================================*/
>> +
>> +#ifndef __fvd_debug_h__
>> +#define __fvd_debug_h__
>> +
>> +//#define FVD_DEBUG
>> +
>> +int fvd_get_copy_on_read (BlockDriverState *bs);
>> +void fvd_set_copy_on_read (BlockDriverState *bs, int copy_on_read);
>> +void fvd_check_memory_usage (void);
>> +void fvd_init_prefetch(void * bs);
>> +void fvd_enable_host_crash_test (void);
>> +
>> +#ifndef TRUE
>> +# define TRUE 1
>> +#endif
>> +#ifndef FALSE
>> +# define FALSE 0
>> +#endif
>> +
>> +#ifndef FVD_DEBUG
>> +# define QDEBUG(format,...) do {} while (0)
>> +# define ASSERT(x) do {} while (0)
>> +# define FVD_DEBUG_ACB(...) do {} while (0)
>> +# define QPAUSE(...) do {} while (0)
>> +
>> +#else
>> +
>> +extern FILE *__fvd_debug_fp;
>> +void init_fvd_debug_fp (void);
>> +void FVD_DEBUG_ACB (void *p);
>> +# define QDEBUG(format,...) \
>> +    do { \
>> +        if (__fvd_debug_fp==NULL) init_fvd_debug_fp(); \
>> +        fprintf (__fvd_debug_fp, format, ##__VA_ARGS__); \
>> +        fflush(__fvd_debug_fp); \
>> +    } while(0)
>> +
>> +# define ASSERT(x) \
>> +    do { \
>> +        if (!(x)) { \
>> +            fprintf (stderr, "Assertion failed in process %d at 
>> %s:%d. " \
>> +                "Waiting for debugging...\n", getpid(),__FILE__, 
>> __LINE__); \
>> +            fgetc (stdin); exit (1);  \
>> +        } \
>> +    } while (0) \
>> +
>> +# define QPAUSE(format,...) \
>> +    do { \
>> +        printf (format, ##__VA_ARGS__); \
>> +        printf (" Pause process %d for debugging...\n", getpid()); \
>> +        fgetc (stdin); \
>> +    } while (0)
>> +
>> +#endif
>> +
>> +#endif
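A small usage sketch (not from the patch; the probe point and values are made
up) assuming FVD_DEBUG is defined so the macros are live rather than no-ops;
QDEBUG output goes to /tmp/fvd.log-<pid>, opened lazily by init_fvd_debug_fp()
in fvd-debug.c:

    static void example_probe_point (void)
    {
        int64_t virtual_disk_size = 10737418240LL;     /* hypothetical: 10 GB */
        QDEBUG ("fvd_open: virtual_disk_size=%" PRId64 "\n", virtual_disk_size);
        ASSERT (virtual_disk_size % 512 == 0); /* on failure: print file:line, wait on stdin, exit */
        QPAUSE ("Hit the request of interest."); /* prints, then blocks on stdin for a debugger */
    }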
>> diff --git a/block/fvd.c b/block/fvd.c
>> new file mode 100644
>> index 0000000..311ff58
>> --- /dev/null
>> +++ b/block/fvd.c
>> @@ -0,0 +1,127 @@
>> +/*
>> + * Copyright (c) 2010-2011 IBM
>> + *
>> + * Authors:
>> + *         Chunqiang Tang<ctang@us.ibm.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.
>> + * See the COPYING file in the top-level directory.
>> + */
>> +
>> +/*=============================================================================
>> + *  A short description: this module implements the QEMU block device driver
>> + *  for the Fast Virtual Disk (FVD) format.  See the following companion
>> + *  papers for a detailed description of FVD:
>> + *  1. The so-called "FVD-cow paper":
>> + *          "FVD: a High-Performance Virtual Machine Image Format for Cloud",
>> + *      by Chunqiang Tang, 2010.
>> + *  2. The so-called "FVD-compact paper":
>> + *          "FVD: a High-Performance Virtual Machine Image Format for Cloud
>> + *           with Sparse Image Capability", by Chunqiang Tang, 2010.
>> + *============================================================================*/
>> +
>> +#include "block/fvd.h"
>> +
>> +//#define ENABLE_TRACE_IO
>> +//#define DEBUG_MEMORY_LEAK
>> +//#define SIMULATED_TEST_WITH_QEMU_IO
>> +
>> +#ifndef FVD_DEBUG
>> +#undef DEBUG_MEMORY_LEAK
>> +#undef ENABLE_TRACE_IO
>> +#undef SIMULATED_TEST_WITH_QEMU_IO
>> +#endif
>> +
>> +/* Use include to avoid exposing too many FVD symbols, and to allow 
>> inline
>> + * function optimization. */
>> +#include "block/fvd-utils.c"
>> +#include "block/fvd-debug.c"
>> +#include "block/fvd-misc.c"
>> +#include "block/fvd-create.c"
>> +#include "block/fvd-open.c"
>> +#include "block/fvd-read.c"
>> +#include "block/fvd-write.c"
>> +#include "block/fvd-load.c"
>> +#include "block/fvd-store.c"
>> +#include "block/fvd-journal.c"
>> +#include "block/fvd-prefetch.c"
>> +
>> +static AIOPool fvd_aio_pool = {
>> +    .aiocb_size = sizeof (FvdAIOCB),
>> +    .cancel = fvd_aio_cancel,
>> +};
>> +
>> +static BlockDriver bdrv_fvd = {
>> +    .format_name = "fvd",
>> +    .instance_size = sizeof (BDRVFvdState),
>> +    .bdrv_create = fvd_create,
>> +    .bdrv_probe = fvd_probe,
>> +    .bdrv_file_open = fvd_open,
>> +    .bdrv_close = fvd_close,
>> +    .bdrv_is_allocated = fvd_is_allocated,
>> +    .bdrv_flush = fvd_flush,
>> +    .bdrv_aio_readv = fvd_aio_readv,
>> +    .bdrv_aio_writev = fvd_aio_writev,
>> +    .bdrv_aio_flush = fvd_aio_flush,
>> +    .create_options = fvd_create_options,
>> +    .bdrv_get_info = fvd_get_info,
>> +    .bdrv_update = fvd_update,
>> +    .bdrv_has_zero_init = fvd_has_zero_init
>> +};
>> +
>> +static void bdrv_fvd_init (void)
>> +{
>> +    bdrv_register (&bdrv_fvd);
>> +}
>> +
>> +block_init (bdrv_fvd_init);
>> +
>> +/*
>> + * Since bdrv_close may not be properly invoked on a VM shutdown, we
>> + * use a destructor to flush metadata to disk. This only affects
>> + * performance and does not affect correctness.
>> + * See Section 3.3.4 of the FVD-cow paper for the rationale.
>> + */
>> +extern QTAILQ_HEAD (, BlockDriverState) bdrv_states;
>> +static void __attribute__ ((destructor)) flush_fvd_bitmap_to_disk 
>> (void)
>> +{
>> +    BlockDriverState *bs;
>> +    QTAILQ_FOREACH (bs,&bdrv_states, list) {
>> +        if (bs->drv ==&bdrv_fvd) {
>> +            flush_metadata_to_disk_on_exit (bs);
>> +
>> +#ifdef FVD_DEBUG
>> +            dump_resource_summary (bs->opaque);
>> +#endif
>> +        }
>> +    }
>> +}
>> +
>> +/*
>> + * TODOs: Below are some potential enhancements for future development:
>> + * 1. Handle storage leak on failure.
>> + *
>> + * 2. Profile-directed prefetch. See Section 3.4.1 of the FVD-cow 
>> paper.
>> + * Related metadata are FvdHeader.prefetch_profile_offset and
>> + * FvdHeader.prefetch_profile_entries,
>> + * FvdHeader.profile_directed_prefetch_start_delay,
>> + * FvdHeader.generate_prefetch_profile.
>> + *
>> + * 3.  Cap the prefetch throughput at the upper limit. See Section 
>> 3.4.2 of
>> + * the FVD-cow paper.  Related metadata are
>> + * FvdHeader.prefetch_max_read_throughput and
>> + * FvdHeader.prefetch_max_write_throughput.
>> + *
>> + * 4. Support write through to the base image. When a VM issues a write
>> + * request, in addition to saving the data in the FVD data file, 
>> also save the
>> + * data in the base image if the address of write request is not 
>> beyond the
>> + * size of the base image (this of course requires the base image 
>> NOT to be
>> + * 'read_only'. This feature changes the semantics of copy-on-write, 
>> but it
>> + * suits a different use case, where the base image is stored on a 
>> remote
>> + * storage server, and the FVD image is stored on a local disk and 
>> acts as a
>> + * write-through cache of the base image. This can be used to cache and
>> + * improve the performance of persistent storage on network-attached 
>> storage,
>> + * e.g., Amazon EBS.  This feature is not described in the FVD-cow 
>> paper as it
>> + * would complicate the discussion.  Related metadata are
>> + * FvdHeader.write_updates_base_img.
>> + */
>> diff --git a/block/fvd.h b/block/fvd.h
>> new file mode 100644
>> index 0000000..cce8cc8
>> --- /dev/null
>> +++ b/block/fvd.h
>> @@ -0,0 +1,481 @@
>> +/*
>> + * Copyright (c) 2010-2011 IBM
>> + *
>> + * Authors:
>> + *         Chunqiang Tang<ctang@us.ibm.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.
>> + * See the COPYING file in the top-level directory.
>> + */
>> +
>> +/*=============================================================================
>> + *  A short description: this is the header of the FVD block device driver.
>> + *============================================================================*/
>> +
>> +#include <sys/vfs.h>
>> +#include <sys/mman.h>
>> +#include <pthread.h>
>> +#include <execinfo.h>
>> +#include <stdlib.h>
>> +#include <sys/ioctl.h>
>> +#include <stdint.h>
>> +#include <stdio.h>
>> +#include <inttypes.h>
>> +#include "block_int.h"
>> +#include "osdep.h"
>> +#include "qemu-option.h"
>> +#include "qemu-timer.h"
>> +#include "block.h"
>> +#include "qemu-queue.h"
>> +#include "qemu-common.h"
>> +#include "block/blksim.h"
>> +#include "block/fvd-ext.h"
>> +
>> +#define FVD_MAGIC         (('Q' << 24) | ('C' << 16) | (0xF5 << 8) | 0xA9)
>> +#define FVD_VERSION         1
>> +
>> +/* Profile-directed prefetch. (to be implemented). */
>> +typedef struct __attribute__ ((__packed__)) PrefetchProfileEntry {
>> +    int64_t offset;        /* in bytes */
>> +
>> +    /* In the unit of FvdHeader.prefetch_profile_entry_len_unit, i.e.,
>> +     * len_in_bytes = len * 
>> FvdHeader.unit_of_PrefetchProfileEntry_len. */
>> +    uint32_t len;
>> +} PrefetchProfileEntry;
>> +
>> +/*
>> + * The FVD format consists of:
>> + *   + Header fields of FvdHeader.
>> + *   + Bitmap, starting on a 4KB page boundary at a location 
>> specified by
>> + *     FvdHeader.bitmap_offset.
>> + *   + Table, starting on a 4KB page boundary at a location 
>> specified by
>> + *     FvdHeader.table_offset.
>> + *   + Journal, starting on a 4KB page boundary at a location 
>> specified by
>> + *     FvdHeader.journal_offset.
>> + *   + Prefetch profile entries, starting on a 4KB page boundary at 
>> a location
>> + *     specified by FvdHeader.prefetch_profile_offset. (to be 
>> implemented)
>> + *   + Virtual disk data,  starting on a 4KB page boundary. 
>> Optionally, disk
>> + *     data can be stored in a separate data file specified by
>> + *     FvdHeader.data_file.
>> + */
>> +typedef struct __attribute__ ((__packed__)) FvdHeader {
>> +    uint32_t magic;
>> +    uint32_t version;
>> +
>> +    /* This field is set to TRUE after whole-image prefetching 
>> finishes. */
>> +    int32_t all_data_in_fvd_img;
>> +
>> +    int64_t virtual_disk_size;        /* in bytes. Disk size 
>> perceived by the VM. */
>> +    int64_t metadata_size;        /* in bytes. */
>> +    char base_img[1024];
>> +    char base_img_fmt[16];
>> +    int64_t base_img_size;        /* in bytes. */
>> +    int64_t bitmap_offset;        /* in bytes. Aligned on 
>> DEF_PAGE_SIZE. */
>> +    int64_t bitmap_size;        /* in bytes. Rounded up to 
>> DEF_PAGE_SIZE */
>> +    int32_t block_size;                /* in bytes. */
>> +    int32_t copy_on_read;        /* TRUE or FALSE */
>> +    int64_t max_outstanding_copy_on_read_data;        /* in bytes. */
>> +
>> +    /* If (data_file[0]==0), the FVD metadata and data are stored in 
>> one file.*/
>> +    char data_file[1024];
>> +    char data_file_fmt[16];
>> +
>> +    /******** Begin: for prefetching. *******************************/
>> +    /* in seconds. -1 means disable whole image prefetching. */
>> +    int32_t prefetch_start_delay;
>> +
>> +    /* in bytes. Aligned on DEF_PAGE_SIZE. (to be implemented) */
>> +    int64_t prefetch_profile_offset;
>> +
>> +    /* Number of PrefetchProfileEntry. (to be implemented) */
>> +    int64_t prefetch_profile_entries;
>> +
>> +    int32_t num_prefetch_slots;        /* Max number of outstanding 
>> prefetch writes. */
>> +    int32_t bytes_per_prefetch;        /* For whole image 
>> prefetching. */
>> +    int32_t prefetch_read_throughput_measure_time;        /* in 
>> milliseconds. */
>> +    int32_t prefetch_write_throughput_measure_time;        /* in 
>> milliseconds. */
>> +
>> +    /* Controls the calculation of the moving average of throughput. 
>> Must be a
>> +     * value between [0,100].
>> +     *   actual_normalized_alpha = prefetch_perf_calc_alpha / 100.0 */
>> +    int32_t prefetch_perf_calc_alpha;
>> +
>> +    int32_t prefetch_min_read_throughput;        /* in KB/second. */
>> +    int32_t prefetch_min_write_throughput;        /* in KB/second. */
>> +    int32_t prefetch_max_read_throughput;        /* in KB/second. */
>> +    int32_t prefetch_max_write_throughput;        /* in KB/second. */
>> +
>> +    /* in milliseconds. When prefetch read/write throughput is low, 
>> prefetch
>> +     * pauses for a random time uniformly distributed in
>> +     * [0, prefetch_throttle_time]. */
>> +    int32_t prefetch_throttle_time;
>> +    /******** End: for prefetching. *******************************/
>> +
>> +    /******** Begin: for compact image. *****************************/
>> +    int32_t compact_image;        /* TRUE or FALSE */
>> +    int64_t table_offset;        /* in bytes. */
>> +    int64_t chunk_size;                /* in bytes. */
>> +    int64_t storage_grow_unit;        /* in bytes. */
>> +    char add_storage_cmd[2048];
>> +    /******** End: for compact image. *******************************/
>> +
>> +    /******** Begin: for journal. ***********************************/
>> +    int64_t journal_offset;        /* in bytes. */
>> +    int64_t journal_size;        /* in bytes. */
>> +    int32_t clean_shutdown;        /* TRUE if VM's last shutdown was 
>> graceful. */
>> +    /******** End: for journal. *************************************/
>> +
>> +    /*
>> +     * This field is TRUE if the image mandates that the storage layer
>> +     * (BDRVFvdState.fvd_data) must return TRUE for 
>> bdrv_has_zero_init().
>> +     * This is the case if the optimization described in Section 
>> 3.3.3 of the
>> +     * FVD-cow paper is enabled (see function search_holes()). If 
>> 'qemu-img
>> +     * create' sets need_zero_init to TRUE, 'qemu-img update' can be 
>> used to
>> +     * manually reset it to FALSE, if the user always manually 
>> pre-fills the
>> +     * storage (e.g., a raw partition) with zeros. If the image is 
>> stored on a
>> +     * file system, it already supports zero_init, and hence there 
>> is no need
>> +     * to manually manipulate this field.
>> +     */
>> +    int32_t need_zero_init;
>> +
>> +    /* If TRUE, FVD dumps a prefetch profile after the VM shuts down.
>> +     * (to be implemented) */
>> +    int32_t generate_prefetch_profile;
>> +
>> +    /* See the comment on PrefetchProfileEntry.len. (to be 
>> implemented) */
>> +    int32_t unit_of_PrefetchProfileEntry_len;
>> +
>> +    /* in seconds. -1 means disable profile-directed prefetching.
>> +     * (to be implemented) */
>> +    int32_t profile_directed_prefetch_start_delay;
>> +
>> +    /* Possible values are "no", "writethrough", "writeback", or
>> +     * "writenocache". (to be implemented) */
>> +    char write_updates_base_img[16];
>> +} FvdHeader;
>> +
>> +typedef struct BDRVFvdState {
>> +    BlockDriverState *fvd_metadata;
>> +    BlockDriverState *fvd_data;
>> +    int64_t virtual_disk_size;        /*in bytes. */
>> +    int64_t bitmap_offset;        /* in sectors */
>> +    int64_t bitmap_size;        /* in bytes. */
>> +    int64_t data_offset;        /* in sectors. Begin of real data. */
>> +    int64_t nb_sectors_in_base_img;
>> +    int32_t block_size;        /* in sectors. */
>> +    int copy_on_read;        /* TRUE or FALSE */
>> +    int64_t max_outstanding_copy_on_read_data;        /* in bytes. */
>> +    int64_t outstanding_copy_on_read_data;        /* in bytes. */
>> +    int data_region_prepared;        /* TRUE or FALSE */
>> +     QLIST_HEAD(WriteLocks, FvdAIOCB) write_locks; /* All writes. */
>> +     QLIST_HEAD(CopyLocks, FvdAIOCB) copy_locks; /* copy-on-read and 
>> CoW. */
>> +
>> +    /* Keep two copies of bitmap to reduce the overhead of updating the
>> +     * on-disk bitmap, i.e., copy-on-read and prefetching do not 
>> update the
>> +     * on-disk bitmap. See Section 3.3.4 of the FVD-cow paper. */
>> +    uint8_t *fresh_bitmap;
>> +    uint8_t *stale_bitmap;
>> +
>> +    /******** Begin: for prefetching. 
>> ***********************************/
>> +    struct FvdAIOCB **prefetch_acb;
>> +    int prefetch_state; /* PREFETCH_STATE_RUNNING, FINISHED, or 
>> DISABLED. */
>> +    int prefetch_error;        /* TRUE or FALSE */
>> +    int num_prefetch_slots;
>> +    int num_filled_prefetch_slots;
>> +    int next_prefetch_read_slot;
>> +    int prefetch_read_active;                        /* TRUE or 
>> FALSE */
>> +    int pause_prefetch_requested;                /* TRUE or FALSE */
>> +    int prefetch_start_delay;        /* in seconds  */
>> +    int64_t unclaimed_prefetch_region_start;
>> +    int64_t prefetch_read_time;                        /* in 
>> milliseconds. */
>> +    int64_t prefetch_write_time;                /* in milliseconds. */
>> +    int64_t prefetch_data_read;                        /* in bytes. */
>> +    int64_t prefetch_data_written;                /* in bytes. */
>> +    double prefetch_read_throughput;                /* in 
>> bytes/millisecond. */
>> +    double prefetch_write_throughput;                /* in 
>> bytes/millisecond. */
>> +    double prefetch_min_read_throughput;        /* in 
>> bytes/millisecond. */
>> +    double prefetch_min_write_throughput;        /* in 
>> bytes/millisecond. */
>> +    int64_t prefetch_read_throughput_measure_time;        /* in 
>> millisecond. */
>> +    int64_t prefetch_write_throughput_measure_time;        /* in 
>> millisecond.*/
>> +    int prefetch_throttle_time;        /* in millisecond. */
>> +    int sectors_per_prefetch;
>> +    QEMUTimer *prefetch_timer;
>> +    /* prefetch_perf_calc_alpha = 
>> FvdHeader.prefetch_perf_calc_alpha/100.0 */
>> +    double prefetch_perf_calc_alpha;
>> +    /******** End: for prefetching. 
>> ***********************************/
>> +
>> +    /******** Begin: for compact image. 
>> *************************************/
>> +    uint32_t *table;        /* Mapping table stored in memory in 
>> little endian. */
>> +    int64_t data_storage;        /* in sectors. */
>> +    int64_t used_storage;        /* in sectors. */
>> +    int64_t chunk_size;        /* in sectors. */
>> +    int64_t storage_grow_unit;        /* in sectors. */
>> +    int64_t table_offset;        /* in sectors. */
>> +    char *add_storage_cmd;
>> +    /******** Begin: for compact image. 
>> *************************************/
>> +
>> +    /******** Begin: for journal. 
>> *******************************************/
>> +    int64_t journal_offset;        /* in sectors. */
>> +    int64_t journal_size;        /* in sectors. */
>> +    int64_t next_journal_sector;        /* in sector. */
>> +    int ongoing_journal_updates;        /* Number of ongoing journal 
>> updates. */
>> +    int dirty_image;        /* TRUE or FALSE. */
>> +
>> +    /* Requests waiting for metadata flush and journal recycle to 
>> finish. */
>> +    QLIST_HEAD(JournalFlush, FvdAIOCB) wait_for_journal;
>> +    /******** End: for journal. 
>> ********************************************/
>> +
>> +#ifdef FVD_DEBUG
>> +    int64_t total_copy_on_read_data;                /* in bytes. */
>> +    int64_t total_prefetch_data;                /* in bytes. */
>> +#endif
>> +} BDRVFvdState;
>> +
>> +/* Begin of data type definitions. */
>> +struct FvdAIOCB;
>> +
>> +typedef struct JournalCB {
>> +    BlockDriverAIOCB *hd_acb;
>> +    QEMUIOVector qiov;
>> +    struct iovec iov;
>> +     QLIST_ENTRY(FvdAIOCB) next_wait_for_journal;
>> +} JournalCB;
>> +
>> +/* CopyLock is used by AIOWriteCB and AIOCopyCB. */
>> +typedef struct CopyLock {
>> +    QLIST_ENTRY(FvdAIOCB) next;
>> +    int64_t begin;
>> +    int64_t end;
>> +     QLIST_HEAD(DependentWritesHead, FvdAIOCB) dependent_writes;
>> +} CopyLock;
>> +
>> +typedef struct ChildAIOReadCB {
>> +    BlockDriverAIOCB *hd_acb;
>> +    struct iovec iov;
>> +    QEMUIOVector qiov;
>> +    int64_t sector_num;
>> +    int nb_sectors;
>> +    int done;
>> +} ChildAIOReadCB;
>> +
>> +typedef struct AIOReadCB {
>> +    QEMUIOVector *qiov;
>> +    int ret;
>> +    ChildAIOReadCB read_backing;
>> +    ChildAIOReadCB read_fvd;
>> +} AIOReadCB;
>> +
>> +/* For copy-on-read and prefetching. */
>> +typedef struct AIOCopyCB {
>> +    BlockDriverAIOCB *hd_acb;
>> +    struct iovec iov;
>> +    QEMUIOVector qiov;
>> +    uint8_t *buf;
>> +    int64_t buffered_sector_begin;
>> +    int64_t buffered_sector_end;
>> +    int64_t last_prefetch_op_start_time;        /* For prefetch 
>> only. */
>> +} AIOCopyCB;
>> +
>> +typedef struct AIOWriteCB {
>> +    BlockDriverAIOCB *hd_acb;
>> +    QEMUIOVector *qiov;
>> +    uint8_t *cow_buf;
>> +    QEMUIOVector *cow_qiov;
>> +    int64_t cow_start_sector;
>> +    int update_table;        /* TRUE or FALSE. */
>> +    int ret;
>> +    QLIST_ENTRY(FvdAIOCB) next_write_lock;   /* See 
>> BDRVFvdState.write_locks */
>> +
>> +    /* See FvdAIOCB.write.dependent_writes. */
>> +    QLIST_ENTRY(FvdAIOCB) next_dependent_write;
>> +} AIOWriteCB;
>> +
>> +/* For AIOStoreCompactCB and AIOLoadCompactCB. */
>> +typedef struct CompactChildCB {
>> +    struct FvdAIOCB *acb;
>> +    BlockDriverAIOCB *hd_acb;
>> +} CompactChildCB;
>> +
>> +/* For storing data to a compact image. */
>> +typedef struct AIOStoreCompactCB {
>> +    CompactChildCB one_child;
>> +    CompactChildCB *children;
>> +    int update_table;
>> +    int num_children;
>> +    int finished_children;
>> +    struct FvdAIOCB *parent_acb;
>> +    int ret;
>> +    int soft_write; /*TRUE if the store is caused by copy-on-read or 
>> prefetch.*/
>> +    QEMUIOVector *orig_qiov;
>> +} AIOStoreCompactCB;
>> +
>> +/* For loading data from a compact image. */
>> +typedef struct AIOLoadCompactCB {
>> +    CompactChildCB *children;
>> +    CompactChildCB one_child;
>> +    int num_children;
>> +    int finished_children;
>> +    struct FvdAIOCB *parent_acb;
>> +    int ret;
>> +    QEMUIOVector *orig_qiov;
>> +} AIOLoadCompactCB;
>> +
>> +typedef struct AIOFlushCB {
>> +    BlockDriverAIOCB *data_acb;
>> +    BlockDriverAIOCB *metadata_acb;
>> +    int num_finished;
>> +    int ret;
>> +} AIOFlushCB;
>> +
>> +typedef struct AIOWrapperCB {
>> +    QEMUBH *bh;
>> +} AIOWrapperCB;
>> +
>> +typedef enum { OP_READ = 1, OP_WRITE, OP_COPY, OP_STORE_COMPACT,
>> +    OP_LOAD_COMPACT, OP_WRAPPER, OP_FLUSH } op_type;
>> +
>> +#ifdef FVD_DEBUG
>> +/* For debugging memory leaks. */
>> +typedef struct alloc_tracer_t {
>> +    int64_t magic;
>> +    int alloc_tracer;
>> +    const char *alloc_file;
>> +    int alloc_line;
>> +    size_t size;
>> +} alloc_tracer_t;
>> +#endif
>> +
>> +typedef struct FvdAIOCB {
>> +    BlockDriverAIOCB common;
>> +    op_type type;
>> +    int64_t sector_num;
>> +    int nb_sectors;
>> +    JournalCB jcb;        /* For AIOWriteCB and AIOStoreCompactCB. */
>> +    CopyLock copy_lock;        /* For AIOWriteCB and AIOCopyCB. */
>> +
>> +    /* Use a union so that all requests can efficiently share one 
>> big AIOPool.*/
>> +    union {
>> +        AIOWrapperCB wrapper;
>> +        AIOReadCB read;
>> +        AIOWriteCB write;
>> +        AIOCopyCB copy;
>> +        AIOLoadCompactCB load;
>> +        AIOStoreCompactCB store;
>> +        AIOFlushCB flush;
>> +    };
>> +
>> +#ifdef FVD_DEBUG
>> +    int64_t magic;
>> +    alloc_tracer_t tracer;
>> +
>> +    /* Uniquely identifies a request across all processing activities. */
>> +    unsigned long long int uuid;
>> +#endif
>> +} FvdAIOCB;
>> +
>> +static AIOPool fvd_aio_pool;
>> +static BlockDriver bdrv_fvd;
>> +static QEMUOptionParameter fvd_create_options[];
>> +
>> +/* Function prototypes. */
>> +static int do_aio_write(struct FvdAIOCB *acb);
>> +static void finish_write_data(void *opaque, int ret);
>> +static void restart_dependent_writes(struct FvdAIOCB *acb);
>> +static void finish_prefetch_read(void *opaque, int ret);
>> +static int read_fvd_header(BDRVFvdState * s, FvdHeader * header);
>> +static int update_fvd_header(BDRVFvdState * s, FvdHeader * header);
>> +static void fvd_aio_cancel(BlockDriverAIOCB * blockacb);
>> +static BlockDriverAIOCB *store_data_in_compact_image(struct FvdAIOCB *acb,
>> +            int soft_write, struct FvdAIOCB *parent_acb, BlockDriverState * bs,
>> +            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
>> +            BlockDriverCompletionFunc * cb, void *opaque);
>> +static BlockDriverAIOCB *load_data_from_compact_image(struct FvdAIOCB *acb,
>> +            struct FvdAIOCB *parent_acb, BlockDriverState * bs,
>> +            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
>> +            BlockDriverCompletionFunc * cb, void *opaque);
>> +static void free_write_resource(struct FvdAIOCB *acb);
>> +static void write_metadata_to_journal(struct FvdAIOCB *acb);
>> +static void flush_metadata_to_disk(BlockDriverState * bs);
>> +static void free_journal_sectors(BDRVFvdState * s);
>> +static int fvd_create(const char *filename, QEMUOptionParameter * options);
>> +static int fvd_probe(const uint8_t * buf, int buf_size, const char *filename);
>> +static int fvd_open(BlockDriverState * bs, const char *filename, int flags);
>> +static void fvd_close(BlockDriverState * bs);
>> +static int fvd_is_allocated(BlockDriverState * bs, int64_t sector_num,
>> +                            int nb_sectors, int *pnum);
>> +static int fvd_flush(BlockDriverState * bs);
>> +static BlockDriverAIOCB *fvd_aio_readv(BlockDriverState * bs,
>> +            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
>> +            BlockDriverCompletionFunc * cb, void *opaque);
>> +static BlockDriverAIOCB *fvd_aio_writev(BlockDriverState * bs,
>> +            int64_t sector_num, QEMUIOVector * qiov, int nb_sectors,
>> +            BlockDriverCompletionFunc * cb, void *opaque);
>> +static BlockDriverAIOCB *fvd_aio_flush(BlockDriverState * bs,
>> +            BlockDriverCompletionFunc * cb, void *opaque);
>> +static int fvd_get_info(BlockDriverState * bs, BlockDriverInfo * bdi);
>> +static int fvd_update(BlockDriverState * bs, int argc, char **argv);
>> +static int fvd_has_zero_init(BlockDriverState * bs);
>> +static void fvd_read_cancel(FvdAIOCB * acb);
>> +static void fvd_write_cancel(FvdAIOCB * acb);
>> +static void fvd_copy_cancel(FvdAIOCB * acb);
>> +static void fvd_load_compact_cancel(FvdAIOCB * acb);
>> +static void fvd_store_compact_cancel(FvdAIOCB * acb);
>> +static void fvd_wrapper_cancel(FvdAIOCB * acb);
>> +static void flush_metadata_to_disk_on_exit (BlockDriverState *bs);
>> +static inline BlockDriverAIOCB *load_data(FvdAIOCB * parent_acb,
>> +            BlockDriverState * bs, int64_t sector_num, QEMUIOVector * orig_qiov,
>> +            int nb_sectors, BlockDriverCompletionFunc * cb, void *opaque);
>> +static inline BlockDriverAIOCB *store_data(int soft_write,
>> +            FvdAIOCB * parent_acb, BlockDriverState * bs, int64_t sector_num,
>> +            QEMUIOVector * orig_qiov, int nb_sectors,
>> +            BlockDriverCompletionFunc * cb, void *opaque);
>> +
>> +/* Default configurations. */
>> +#define DEF_PAGE_SIZE                        4096         /* bytes */
>> +#define BYTES_PER_PREFETCH                   1048576      /* bytes */
>> +#define PREFETCH_THROTTLING_TIME             30000        /* milliseconds */
>> +#define NUM_PREFETCH_SLOTS                   2
>> +#define PREFETCH_MIN_MEASURE_READ_TIME       100          /* milliseconds */
>> +#define PREFETCH_MIN_MEASURE_WRITE_TIME      100          /* milliseconds */
>> +#define PREFETCH_MIN_READ_THROUGHPUT         5120         /* KB/s */
>> +#define PREFETCH_MIN_WRITE_THROUGHPUT        5120         /* KB/s */
>> +#define PREFETCH_MAX_READ_THROUGHPUT         1000000000L  /* KB/s */
>> +#define PREFETCH_MAX_WRITE_THROUGHPUT        1000000000L  /* KB/s */
>> +#define PREFETCH_PERF_CALC_ALPHA             80           /* in [0,100]. */
>> +#define MAX_OUTSTANDING_COPY_ON_READ_DATA    2000000      /* bytes */
>> +#define MODERATE_BITMAP_SIZE                 4194304L     /* bytes */
>> +#define CHUNK_SIZE                           1048576LL    /* bytes */
>> +#define JOURNAL_SIZE                         16777216LL   /* bytes */
>> +#define STORAGE_GROW_UNIT                    104857600LL  /* bytes */
>> +
>> +/* State of BDRVFvdState.prefetch_state. */
>> +#define PREFETCH_STATE_RUNNING                        1
>> +#define PREFETCH_STATE_FINISHED                        2
>> +#define PREFETCH_STATE_DISABLED                        3
>> +
>> +/* For convenience. */
>> +#define ROUND_UP(x, base)           ((((x)+(base)-1) / (base)) * (base))
>> +#define ROUND_DOWN(x, base)           ((((x) / (base)) * (base)))
>> +#define BOOL(x)                 ((x) ? "true" : "false")
>> +#define EMPTY_TABLE                ((uint32_t)0xFFFFFFFF)
>> +#define DIRTY_TABLE                ((uint32_t)0x80000000)
>> +#define READ_TABLE(entry)         (le32_to_cpu(entry) & ~DIRTY_TABLE)
>> +# define FVDAIOCB_MAGIC         ((uint64_t)0x3A8FCE89325B976DULL)
>> +# define FVD_ALLOC_MAGIC         ((uint64_t)0x4A7dCEF9925B976DULL)
>> +#define IS_EMPTY(entry)         ((entry) == EMPTY_TABLE)
>> +#define IS_DIRTY(entry)         (le32_to_cpu(entry) & DIRTY_TABLE)
>> +#define WRITE_TABLE(entry,id)         ((entry) = cpu_to_le32(id))
>> +#define READ_TABLE2(entry) \
>> +    ((entry)==EMPTY_TABLE ? EMPTY_TABLE : (le32_to_cpu(entry) & ~DIRTY_TABLE))
>> +
>> +#define CLEAN_DIRTY(entry) \
>> +    do {  \
>> +        if (!IS_EMPTY(entry))  \
>> +            entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE);  \
>> +    } while (0)
>> +
>> +#define CLEAN_DIRTY2(entry) \
>> +    do { \
>> +        ASSERT(!IS_EMPTY(entry)); \
>> +        entry = cpu_to_le32(le32_to_cpu(entry) & ~DIRTY_TABLE);  \
>> +    } while (0)
>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1
  2011-01-21 22:41 ` Anthony Liguori
@ 2011-01-22  2:51   ` Chunqiang Tang
  2011-01-23 23:27     ` Anthony Liguori
  0 siblings, 1 reply; 18+ messages in thread
From: Chunqiang Tang @ 2011-01-22  2:51 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel

> > -void qemu_bh_schedule(QEMUBH *bh)
> > -{
> > -    bh->cb(bh->opaque);
> > -}
> > -
> > -void qemu_bh_cancel(QEMUBH *bh)
> > -{
> > -}
> > -
> > -void qemu_bh_delete(QEMUBH *bh)
> > -{
> > -    qemu_free(bh);
> > -}
> > -
> >   int qemu_set_fd_handler2(int fd,
> >                            IOCanReadHandler *fd_read_poll,
> >                            IOHandler *fd_read,
> > 
> 
> These functions surely cannot just be deleted like this.

These functions were not deleted but instead moved into a separate file,
qemu-tool-time.c, because those functions are time-related and their
implementations differ between the simulation mode and the real mode. In
the latest patches, these functions are kept in qemu-tool.c, but their
implementations support both cases based on a switch.
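
For illustration, a minimal sketch of what such a switch could look like
follows. The flag, the queue, and the helper names are invented for this
example and are not taken from the actual patches; the point is only that
one QEMUBH implementation can serve both the real tools and the
blksim-based simulation, selected at run time.

#include <stdio.h>
#include <stdlib.h>

typedef void QEMUBHFunc(void *opaque);

typedef struct QEMUBH {
    QEMUBHFunc *cb;
    void *opaque;
    struct QEMUBH *next;        /* used only by the simulated queue */
} QEMUBH;

static int in_simulation_mode;  /* the run-time "switch" (hypothetical name) */
static QEMUBH *sim_queue;       /* hypothetical stand-in for the simulator's queue */

QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque)
{
    QEMUBH *bh = calloc(1, sizeof(*bh));
    bh->cb = cb;
    bh->opaque = opaque;
    return bh;
}

void qemu_bh_schedule(QEMUBH *bh)
{
    if (!in_simulation_mode) {
        bh->cb(bh->opaque);     /* real mode: run immediately, as the old stub did */
    } else {
        bh->next = sim_queue;   /* simulation mode: defer to the simulator */
        sim_queue = bh;
    }
}

/* Drain deferred bottom halves; a real simulator would dispatch them
 * according to simulated time rather than in reverse insertion order. */
static void sim_run_pending(void)
{
    while (sim_queue) {
        QEMUBH *bh = sim_queue;
        sim_queue = bh->next;
        bh->cb(bh->opaque);
    }
}

static void hello(void *opaque)
{
    printf("bh ran: %s\n", (const char *)opaque);
}

int main(void)
{
    QEMUBH *bh = qemu_bh_new(hello, "real mode");
    qemu_bh_schedule(bh);           /* runs immediately */

    in_simulation_mode = 1;
    bh->opaque = "simulation mode";
    qemu_bh_schedule(bh);           /* deferred... */
    sim_run_pending();              /* ...until the simulator drains its queue */
    free(bh);
    return 0;
}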

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1
  2011-01-20 17:08     ` Stefan Weil
@ 2011-01-22  9:02       ` Peter Maydell
  2011-01-24 14:56         ` Chunqiang Tang
  0 siblings, 1 reply; 18+ messages in thread
From: Peter Maydell @ 2011-01-22  9:02 UTC (permalink / raw)
  To: Stefan Weil; +Cc: Chunqiang Tang, Christoph Hellwig, qemu-devel

On 20 January 2011 17:08, Stefan Weil <weil@mail.berlios.de> wrote:
> Yes, that's a problem with some parts of the old code.
> For files which you want to modify, you could remove
> the spaces with your script before applying your other
> modifications and create a separate patch which only
> removes the superfluous spaces.

(This kind of came up in the other thread about fixing
non-C89 comments. I don't have any particular interest in
this area of the qemu source so this is a general remark.)

I definitely dislike patches which change whitespace or
indentation for an entire file, even if they are standalone
"only fixing whitespace" patches; they make it much harder
to deal with forks and branches of qemu. I would prefer
it if we only fix whitespace, indent and bracing for lines
we're touching anyway.

-- PMM

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1
  2011-01-22  2:51   ` Chunqiang Tang
@ 2011-01-23 23:27     ` Anthony Liguori
  2011-01-24 14:50       ` Chunqiang Tang
  0 siblings, 1 reply; 18+ messages in thread
From: Anthony Liguori @ 2011-01-23 23:27 UTC (permalink / raw)
  To: Chunqiang Tang; +Cc: qemu-devel

On 01/21/2011 08:51 PM, Chunqiang Tang wrote:
>>> -void qemu_bh_schedule(QEMUBH *bh)
>>> -{
>>> -    bh->cb(bh->opaque);
>>> -}
>>> -
>>> -void qemu_bh_cancel(QEMUBH *bh)
>>> -{
>>> -}
>>> -
>>> -void qemu_bh_delete(QEMUBH *bh)
>>> -{
>>> -    qemu_free(bh);
>>> -}
>>> -
>>>    int qemu_set_fd_handler2(int fd,
>>>                             IOCanReadHandler *fd_read_poll,
>>>                             IOHandler *fd_read,
>>>
>>>        
>> These functions surely cannot just be deleted like this.
>>      
> These functions were not deleted but instead moved into a separate file,
> qemu-tool-time.c, because those functions are time-related and their
> implementations differ between the simulation mode and the real mode. In
> the latest patches, these functions are kept in qemu-tool.c, but their
> implementations support both cases based on a switch.
>    

I think the root of the problem is that your series didn't maintain 
bisectability.

IOW, each patch needs to be able to be applied one at a time such that 
at each point, the build doesn't break and functionality doesn't break.

Otherwise, tools like git bisect don't work.

Regards,

Anthony Liguori

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1
  2011-01-23 23:27     ` Anthony Liguori
@ 2011-01-24 14:50       ` Chunqiang Tang
  2011-01-27 12:23         ` Jes Sorensen
  0 siblings, 1 reply; 18+ messages in thread
From: Chunqiang Tang @ 2011-01-24 14:50 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel

> I think the root of the problem is that your series didn't maintain 
> bisectability.
> 
> IOW, each patch needs to be able to be applied one at a time such that 
> at each point, the build doesn't break and functionality doesn't break.
> 
> Otherwise, tools like git bisect don't work.

This was true of the old, big FVD patches you reviewed. Following 
Christoph Hellwig's suggestion, the new series of FVD patches submitted 
last Friday, e.g., "FVD: Added the simulated 'blksim' driver", adds 
individual smaller functions and breaks neither compilation nor execution. 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1
  2011-01-22  9:02       ` Peter Maydell
@ 2011-01-24 14:56         ` Chunqiang Tang
  0 siblings, 0 replies; 18+ messages in thread
From: Chunqiang Tang @ 2011-01-24 14:56 UTC (permalink / raw)
  To: Peter Maydell; +Cc: Christoph Hellwig, qemu-devel

> On 20 January 2011 17:08, Stefan Weil <weil@mail.berlios.de> wrote:
> > Yes, that's a problem with some parts of the old code.
> > For files which you want to modify, you could remove
> > the spaces with your script before applying your other
> > modifications and create a separate patch which only
> > removes the superfluous spaces.
> 
> (This kind of came up in the other thread about fixing
> non-C89 comments. I don't have any particular interest in
> this area of the qemu source so this is a general remark.)
> 
> I definitely dislike patches which change whitespace or
> indentation for an entire file, even if they are standalone
> "only fixing whitespace" patches; they make it much harder
> to deal with forks and branches of qemu. I would prefer
> it if we only fix whitespace, indent and bracing for lines
> we're touching anyway.

I agree with this, i.e., only fixing the lines we are touching anyway. 
This is the approach the new series of FVD patches took.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 3/5] Fast Virtual Disk (FVD) Proposal Part 3
  2011-01-21 22:57   ` Anthony Liguori
  2011-01-21 23:09     ` Anthony Liguori
@ 2011-01-24 15:29     ` Chunqiang Tang
  1 sibling, 0 replies; 18+ messages in thread
From: Chunqiang Tang @ 2011-01-24 15:29 UTC (permalink / raw)
  To: Anthony Liguori; +Cc: qemu-devel

> Before going any further with this series, I'd like to see
> 
> 1) a specification (on the QEMU wiki) describing this image format that 
> can be reviewed
> 
> 2) a concise explanation of why qcow2/qed cannot satisfy the use cases 
> addressed by FVD
> 
> 3) performance data to backup the claims of (2)

Page created at http://wiki.qemu.org/Features/FVD. Content will be added 
soon.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1
  2011-01-24 14:50       ` Chunqiang Tang
@ 2011-01-27 12:23         ` Jes Sorensen
  0 siblings, 0 replies; 18+ messages in thread
From: Jes Sorensen @ 2011-01-27 12:23 UTC (permalink / raw)
  To: Chunqiang Tang; +Cc: qemu-devel

On 01/24/11 15:50, Chunqiang Tang wrote:
>> I think the root of the problem is that your series didn't maintain 
>> bisectability.
>>
>> IOW, each patch needs to be able to be applied one at a time such that 
>> at each point, the build doesn't break and functionality doesn't break.
>>
>> Otherwise, tools like git bisect don't work.
> 
> This was true with the old, big FVD patches you reviewed. Following 
> Christoph Hellwig's suggestion, the new series of FVD patches submitted 
> last Friday, e.g., "FVD: Added the simulated 'blksim' driver", add 
> individual smaller functions and breaks neither compilation nor execution. 

Then you need to break up the new patch into smaller chunks and make
sure they can each be applied without breaking the build.

Having a separate patch that moves functions to another file, because
they are to be shared, is a completely valid patch.


Jes

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2011-01-27 12:23 UTC | newest]

Thread overview: 18+ messages
2011-01-19 22:04 [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1 Chunqiang Tang
2011-01-19 22:04 ` [Qemu-devel] [PATCH 2/5] Fast Virtual Disk (FVD) Proposal Part 2 Chunqiang Tang
2011-01-19 22:04 ` [Qemu-devel] [PATCH 3/5] Fast Virtual Disk (FVD) Proposal Part 3 Chunqiang Tang
2011-01-21 22:57   ` Anthony Liguori
2011-01-21 23:09     ` Anthony Liguori
2011-01-24 15:29     ` Chunqiang Tang
2011-01-19 22:04 ` [Qemu-devel] [PATCH 4/5] Fast Virtual Disk (FVD) Proposal Part 4 Chunqiang Tang
2011-01-19 22:04 ` [Qemu-devel] [PATCH 5/5] Fast Virtual Disk (FVD) Proposal Part 5 Chunqiang Tang
2011-01-20 13:01 ` [Qemu-devel] [PATCH 1/5] Fast Virtual Disk (FVD) Proposal Part 1 Christoph Hellwig
2011-01-20 14:49   ` Chunqiang Tang
2011-01-20 17:08     ` Stefan Weil
2011-01-22  9:02       ` Peter Maydell
2011-01-24 14:56         ` Chunqiang Tang
2011-01-21 22:41 ` Anthony Liguori
2011-01-22  2:51   ` Chunqiang Tang
2011-01-23 23:27     ` Anthony Liguori
2011-01-24 14:50       ` Chunqiang Tang
2011-01-27 12:23         ` Jes Sorensen
