qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH v2 0/4] Reduce down time during migration without shared storage
@ 2010-01-21 15:24 Liran Schour
  2010-01-21 15:24 ` [Qemu-devel] [PATCH v2 1/4] Remove unused code Liran Schour
  2010-01-25  9:28 ` [Qemu-devel] [PATCH v2 0/4] Reduce down time during migration without shared storage Pierre Riteau
  0 siblings, 2 replies; 9+ messages in thread
From: Liran Schour @ 2010-01-21 15:24 UTC (permalink / raw)
  To: qemu-devel; +Cc: Liran Schour

This series of patches reduces the down time of the guest during a migration
without shared storage. It does so by starting to transfer dirty blocks in the
iterative phase. In the current code, transferring of dirty blocks begins only
during the full phase, while the guest is suspended. Therefore the guest will
be suspended for a time linear in the amount of data written to disk during
migration.

Changes from v1: - infer storage performance by get_clock()
- remove dirty max iterations - user is responsible for migration convergence
- remove trailing whitespaces
- minor cleanups

 block-migration.c |  244 +++++++++++++++++++++++++++++++++++------------------
 block.c           |   20 ++++-
 block.h           |    1 +
 block_int.h       |    1 +
 4 files changed, 181 insertions(+), 85 deletions(-)

Signed-off-by: Liran Schour <lirans@il.ibm.com>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [Qemu-devel] [PATCH v2 1/4] Remove unused code
  2010-01-21 15:24 [Qemu-devel] [PATCH v2 0/4] Reduce down time during migration without shared storage Liran Schour
@ 2010-01-21 15:24 ` Liran Schour
  2010-01-21 15:24   ` [Qemu-devel] [PATCH v2 2/4] Transfer dirty blocks during iterative phase Liran Schour
  2010-01-25  9:28 ` [Qemu-devel] [PATCH v2 0/4] Reduce down time during migration without shared storage Pierre Riteau
  1 sibling, 1 reply; 9+ messages in thread
From: Liran Schour @ 2010-01-21 15:24 UTC (permalink / raw)
  To: qemu-devel; +Cc: Liran Schour

blk_mig_save_bulked_block is never called with the sync flag set. Remove the
sync flag and calculate bulk completion during blk_mig_save_bulked_block.

Changes from v1: remove trailing whitespaces and minor cleanups.

Signed-off-by: Liran Schour <lirans@il.ibm.com>
---
 block-migration.c |   59 +++++++++++++++++-----------------------------------
 1 files changed, 19 insertions(+), 40 deletions(-)

diff --git a/block-migration.c b/block-migration.c
index 258a88a..f9bb42c 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -72,6 +72,7 @@ typedef struct BlkMigState {
     int transferred;
     int64_t total_sector_sum;
     int prev_progress;
+    int bulk_completed;
 } BlkMigState;
 
 static BlkMigState block_mig_state;
@@ -138,7 +139,7 @@ static void blk_mig_read_cb(void *opaque, int ret)
 }
 
 static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
-                                BlkMigDevState *bmds, int is_async)
+                                BlkMigDevState *bmds)
 {
     int64_t total_sectors = bmds->total_sectors;
     int64_t cur_sector = bmds->cur_sector;
@@ -175,27 +176,16 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
     blk->bmds = bmds;
     blk->sector = cur_sector;
 
-    if (is_async) {
-        blk->iov.iov_base = blk->buf;
-        blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
-        qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
-
-        blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
-                                    nr_sectors, blk_mig_read_cb, blk);
-        if (!blk->aiocb) {
-            goto error;
-        }
-        block_mig_state.submitted++;
-    } else {
-        if (bdrv_read(bs, cur_sector, blk->buf, nr_sectors) < 0) {
-            goto error;
-        }
-        blk_send(f, blk);
+    blk->iov.iov_base = blk->buf;
+    blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
+    qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
 
-        qemu_free(blk->buf);
-        qemu_free(blk);
+    blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
+                                nr_sectors, blk_mig_read_cb, blk);
+    if (!blk->aiocb) {
+        goto error;
     }
-
+    block_mig_state.submitted++;
     bdrv_reset_dirty(bs, cur_sector, nr_sectors);
     bmds->cur_sector = cur_sector + nr_sectors;
 
@@ -229,6 +219,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
     block_mig_state.transferred = 0;
     block_mig_state.total_sector_sum = 0;
     block_mig_state.prev_progress = -1;
+    block_mig_state.bulk_completed = 0;
 
     for (bs = bdrv_first; bs != NULL; bs = bs->next) {
         if (bs->type == BDRV_TYPE_HD) {
@@ -260,7 +251,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
     }
 }
 
-static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async)
+static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f)
 {
     int64_t completed_sector_sum = 0;
     BlkMigDevState *bmds;
@@ -269,7 +260,7 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async)
 
     QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
         if (bmds->bulk_completed == 0) {
-            if (mig_save_device_bulk(mon, f, bmds, is_async) == 1) {
+            if (mig_save_device_bulk(mon, f, bmds) == 1) {
                 /* completed bulk section for this device */
                 bmds->bulk_completed = 1;
             }
@@ -362,19 +353,8 @@ static void flush_blks(QEMUFile* f)
 
 static int is_stage2_completed(void)
 {
-    BlkMigDevState *bmds;
-
-    if (block_mig_state.submitted > 0) {
-        return 0;
-    }
-
-    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
-        if (bmds->bulk_completed == 0) {
-            return 0;
-        }
-    }
-
-    return 1;
+    return (block_mig_state.submitted == 0 &&
+	    block_mig_state.bulk_completed);
 }
 
 static void blk_mig_cleanup(Monitor *mon)
@@ -432,8 +412,9 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
     while ((block_mig_state.submitted +
             block_mig_state.read_done) * BLOCK_SIZE <
            qemu_file_get_rate_limit(f)) {
-        if (blk_mig_save_bulked_block(mon, f, 1) == 0) {
-            /* no more bulk blocks for now */
+        if (blk_mig_save_bulked_block(mon, f) == 0) {
+            /* finish saving bulk on all devices */
+            block_mig_state.bulk_completed = 1;
             break;
         }
     }
@@ -446,9 +427,7 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
     }
 
     if (stage == 3) {
-        while (blk_mig_save_bulked_block(mon, f, 0) != 0) {
-            /* empty */
-        }
+        /* we now for sure that save bulk is completed */
 
         blk_mig_save_dirty_blocks(mon, f);
         blk_mig_cleanup(mon);
-- 
1.6.0.4

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [Qemu-devel] [PATCH v2 2/4] Transfer dirty blocks during iterative phase
  2010-01-21 15:24 ` [Qemu-devel] [PATCH v2 1/4] Remove unused code Liran Schour
@ 2010-01-21 15:24   ` Liran Schour
  2010-01-21 15:24     ` [Qemu-devel] [PATCH v2 3/4] Count dirty blocks and expose an API to get dirty count Liran Schour
  0 siblings, 1 reply; 9+ messages in thread
From: Liran Schour @ 2010-01-21 15:24 UTC (permalink / raw)
  To: qemu-devel; +Cc: Liran Schour

Start transferring dirty blocks during the iterative stage. That will
reduce the time that the guest will be suspended.

Changes from v1: remove trailing whitespaces and remove max iterations limit.

Signed-off-by: Liran Schour <lirans@il.ibm.com>
---
 block-migration.c |  135 +++++++++++++++++++++++++++++++++++++++--------------
 1 files changed, 99 insertions(+), 36 deletions(-)

diff --git a/block-migration.c b/block-migration.c
index f9bb42c..16df75f 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -45,6 +45,7 @@ typedef struct BlkMigDevState {
     int bulk_completed;
     int shared_base;
     int64_t cur_sector;
+    int64_t cur_dirty;
     int64_t completed_sectors;
     int64_t total_sectors;
     int64_t dirty;
@@ -73,6 +74,7 @@ typedef struct BlkMigState {
     int64_t total_sector_sum;
     int prev_progress;
     int bulk_completed;
+    int dirty_iterations;
 } BlkMigState;
 
 static BlkMigState block_mig_state;
@@ -186,6 +188,7 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
         goto error;
     }
     block_mig_state.submitted++;
+
     bdrv_reset_dirty(bs, cur_sector, nr_sectors);
     bmds->cur_sector = cur_sector + nr_sectors;
 
@@ -284,39 +287,88 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f)
     return ret;
 }
 
-#define MAX_NUM_BLOCKS 4
-
-static void blk_mig_save_dirty_blocks(Monitor *mon, QEMUFile *f)
+static void blk_mig_reset_dirty_cursor(void)
 {
     BlkMigDevState *bmds;
-    BlkMigBlock blk;
+
+    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
+        bmds->cur_dirty = 0;
+    }
+}
+
+static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
+                                 BlkMigDevState *bmds, int is_async)
+{
+    BlkMigBlock *blk;
+    int64_t total_sectors = bmds->total_sectors;
     int64_t sector;
+    int nr_sectors;
 
-    blk.buf = qemu_malloc(BLOCK_SIZE);
+    for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
+        if (bdrv_get_dirty(bmds->bs, sector)) {
 
-    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
-        for (sector = 0; sector < bmds->cur_sector;) {
-            if (bdrv_get_dirty(bmds->bs, sector)) {
-                if (bdrv_read(bmds->bs, sector, blk.buf,
-                              BDRV_SECTORS_PER_DIRTY_CHUNK) < 0) {
-                    monitor_printf(mon, "Error reading sector %" PRId64 "\n",
-                                   sector);
-                    qemu_file_set_error(f);
-                    qemu_free(blk.buf);
-                    return;
+            if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
+                nr_sectors = total_sectors - sector;
+            } else {
+                nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
+            }
+            blk = qemu_malloc(sizeof(BlkMigBlock));
+            blk->buf = qemu_malloc(BLOCK_SIZE);
+            blk->bmds = bmds;
+            blk->sector = sector;
+
+            if(is_async) {
+                blk->iov.iov_base = blk->buf;
+                blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
+                qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
+
+                blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
+                                            nr_sectors, blk_mig_read_cb, blk);
+                if (!blk->aiocb) {
+                    goto error;
+                }
+                block_mig_state.submitted++;
+            } else {
+                if (bdrv_read(bmds->bs, sector, blk->buf,
+                              nr_sectors) < 0) {
+                    goto error;
                 }
-                blk.bmds = bmds;
-                blk.sector = sector;
-                blk_send(f, &blk);
+                blk_send(f, blk);
 
-                bdrv_reset_dirty(bmds->bs, sector,
-                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
+                qemu_free(blk->buf);
+                qemu_free(blk);
             }
-            sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+            bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
+            break;
         }
+        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
+        bmds->cur_dirty = sector;
     }
 
-    qemu_free(blk.buf);
+    return (bmds->cur_dirty >= bmds->total_sectors);
+
+ error:
+    monitor_printf(mon, "Error reading sector %" PRId64 "\n", sector);
+    qemu_file_set_error(f);
+    qemu_free(blk->buf);
+    qemu_free(blk);
+    return 0;
+}
+
+static int blk_mig_save_dirty_block(Monitor *mon, QEMUFile *f, int is_async)
+{
+    BlkMigDevState *bmds;
+    int ret = 0;
+
+    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
+        if(mig_save_device_dirty(mon, f, bmds, is_async) == 0) {
+            ret = 1;
+            break;
+        }
+    }
+
+    return ret;
 }
 
 static void flush_blks(QEMUFile* f)
@@ -408,28 +460,39 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
         return 0;
     }
 
-    /* control the rate of transfer */
-    while ((block_mig_state.submitted +
-            block_mig_state.read_done) * BLOCK_SIZE <
-           qemu_file_get_rate_limit(f)) {
-        if (blk_mig_save_bulked_block(mon, f) == 0) {
-            /* finish saving bulk on all devices */
-            block_mig_state.bulk_completed = 1;
-            break;
+    blk_mig_reset_dirty_cursor();
+
+    if(stage == 2) {
+        /* control the rate of transfer */
+        while ((block_mig_state.submitted +
+                block_mig_state.read_done) * BLOCK_SIZE <
+               qemu_file_get_rate_limit(f)) {
+            if (block_mig_state.bulk_completed == 0) {
+                /* first finish the bulk phase */
+                if (blk_mig_save_bulked_block(mon, f) == 0) {
+                    /* finish saving bulk on all devices */
+                    block_mig_state.bulk_completed = 1;
+                }
+            } else {
+                if (blk_mig_save_dirty_block(mon, f, 1) == 0) {
+                    /* no more dirty blocks */
+                    break;
+                }
+            }
         }
-    }
 
-    flush_blks(f);
+        flush_blks(f);
 
-    if (qemu_file_has_error(f)) {
-        blk_mig_cleanup(mon);
-        return 0;
+        if (qemu_file_has_error(f)) {
+            blk_mig_cleanup(mon);
+            return 0;
+        }
     }
 
     if (stage == 3) {
         /* we now for sure that save bulk is completed */
 
-        blk_mig_save_dirty_blocks(mon, f);
+        while(blk_mig_save_dirty_block(mon, f, 0) != 0);
         blk_mig_cleanup(mon);
 
         /* report completion */
-- 
1.6.0.4

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [Qemu-devel] [PATCH v2 3/4] Count dirty blocks and expose an API to get dirty count
  2010-01-21 15:24   ` [Qemu-devel] [PATCH v2 2/4] Transfer dirty blocks during iterative phase Liran Schour
@ 2010-01-21 15:24     ` Liran Schour
  2010-01-21 15:24       ` [Qemu-devel] [PATCH v2 4/4] Try not to exceed max downtime on stage3 Liran Schour
  0 siblings, 1 reply; 9+ messages in thread
From: Liran Schour @ 2010-01-21 15:24 UTC (permalink / raw)
  To: qemu-devel; +Cc: Liran Schour

This will maintain a dirty counter for each device and will allow the
dirty count to be queried from above.

Changes from v1: remove trailing whitespaces.

Signed-off-by: Liran Schour <lirans@il.ibm.com>
---
 block.c     |   16 ++++++++++++++--
 block.h     |    1 +
 block_int.h |    1 +
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index 30ae2b1..a6381ad 100644
--- a/block.c
+++ b/block.c
@@ -653,9 +653,15 @@ static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
         bit = start % (sizeof(unsigned long) * 8);
         val = bs->dirty_bitmap[idx];
         if (dirty) {
-            val |= 1 << bit;
+            if (!(val & (1 << bit))) {
+                bs->dirty_count++;
+                val |= 1 << bit;
+            }
         } else {
-            val &= ~(1 << bit);
+            if (val & (1 << bit)) {
+                bs->dirty_count--;
+                val &= ~(1 << bit);
+            }
         }
         bs->dirty_bitmap[idx] = val;
     }
@@ -2116,6 +2122,7 @@ void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
 {
     int64_t bitmap_size;
 
+    bs->dirty_count = 0;
     if (enable) {
         if (!bs->dirty_bitmap) {
             bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
@@ -2150,3 +2157,8 @@ void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
 {
     set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
 }
+
+int64_t bdrv_get_dirty_count(BlockDriverState *bs)
+{
+    return bs->dirty_count;
+}
diff --git a/block.h b/block.h
index fa51ddf..1012303 100644
--- a/block.h
+++ b/block.h
@@ -201,4 +201,5 @@ void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable);
 int bdrv_get_dirty(BlockDriverState *bs, int64_t sector);
 void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
                       int nr_sectors);
+int64_t bdrv_get_dirty_count(BlockDriverState *bs);
 #endif
diff --git a/block_int.h b/block_int.h
index 9a3b2e0..8d5d9bc 100644
--- a/block_int.h
+++ b/block_int.h
@@ -172,6 +172,7 @@ struct BlockDriverState {
     int type;
     char device_name[32];
     unsigned long *dirty_bitmap;
+    int64_t dirty_count;
     BlockDriverState *next;
     void *private;
 };
-- 
1.6.0.4

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [Qemu-devel] [PATCH v2 4/4] Try not to exceed max downtime on stage3
  2010-01-21 15:24     ` [Qemu-devel] [PATCH v2 3/4] Count dirty blocks and expose an API to get dirty count Liran Schour
@ 2010-01-21 15:24       ` Liran Schour
  2010-01-21 18:03         ` Pierre Riteau
  0 siblings, 1 reply; 9+ messages in thread
From: Liran Schour @ 2010-01-21 15:24 UTC (permalink / raw)
  To: qemu-devel; +Cc: Liran Schour

Move to stage3 only when remaining work can be done below max downtime.

Changes from v1: remove max iterations. Try to infer storage performance and use that to calculate the remaining work.

Signed-off-by: Liran Schour <lirans@il.ibm.com>
---
 block-migration.c |  136 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 132 insertions(+), 4 deletions(-)

diff --git a/block-migration.c b/block-migration.c
index 16df75f..5ef3eb8 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -17,6 +17,7 @@
 #include "qemu-queue.h"
 #include "monitor.h"
 #include "block-migration.h"
+#include "migration.h"
 #include <assert.h>
 
 #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
@@ -60,6 +61,7 @@ typedef struct BlkMigBlock {
     QEMUIOVector qiov;
     BlockDriverAIOCB *aiocb;
     int ret;
+    long double time;
     QSIMPLEQ_ENTRY(BlkMigBlock) entry;
 } BlkMigBlock;
 
@@ -74,11 +76,79 @@ typedef struct BlkMigState {
     int64_t total_sector_sum;
     int prev_progress;
     int bulk_completed;
-    int dirty_iterations;
+    long double total_time;
+    int reads;
 } BlkMigState;
 
 static BlkMigState block_mig_state;
 
+static int64_t get_clock_realtime(void)
+{
+    struct timeval tv;
+
+    gettimeofday(&tv, NULL);
+    return tv.tv_sec * 1000000000LL + (tv.tv_usec * 1000);
+}
+
+#ifdef WIN32
+
+static int64_t clock_freq;
+
+static void init_get_clock(void)
+{
+    LARGE_INTEGER freq;
+    int ret;
+    ret = QueryPerformanceFrequency(&freq);
+    if (ret == 0) {
+        fprintf(stderr, "Could not calibrate ticks\n");
+        exit(1);
+    }
+    clock_freq = freq.QuadPart;
+}
+
+static int64_t get_clock(void)
+{
+    LARGE_INTEGER ti;
+    QueryPerformanceCounter(&ti);
+    return muldiv64(ti.QuadPart, get_ticks_per_sec(), clock_freq);
+}
+
+#else
+
+static int use_rt_clock;
+
+static void init_get_clock(void)
+{
+    use_rt_clock = 0;
+#if defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD_version >= 500000) \
+    || defined(__DragonFly__) || defined(__FreeBSD_kernel__)
+    {
+        struct timespec ts;
+        if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
+            use_rt_clock = 1;
+        }
+    }
+#endif
+}
+
+static int64_t get_clock(void)
+{
+#if defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD_version >= 500000) \
+	|| defined(__DragonFly__) || defined(__FreeBSD_kernel__)
+    if (use_rt_clock) {
+        struct timespec ts;
+        clock_gettime(CLOCK_MONOTONIC, &ts);
+        return ts.tv_sec * 1000000000LL + ts.tv_nsec;
+    } else
+#endif
+    {
+        /* XXX: using gettimeofday leads to problems if the date
+           changes, so it should be avoided. */
+        return get_clock_realtime();
+    }
+}
+#endif
+
 static void blk_send(QEMUFile *f, BlkMigBlock * blk)
 {
     int len;
@@ -127,12 +197,28 @@ uint64_t blk_mig_bytes_total(void)
     return sum << BDRV_SECTOR_BITS;
 }
 
+static inline void add_avg_read_time(long double time)
+{
+    block_mig_state.reads++;
+    block_mig_state.total_time += time;
+}
+
+static inline long double compute_read_bwidth(void)
+{
+    assert(block_mig_state.total_time != 0);
+    return  (block_mig_state.reads * BLOCK_SIZE)/ block_mig_state.total_time;
+}
+
 static void blk_mig_read_cb(void *opaque, int ret)
 {
     BlkMigBlock *blk = opaque;
 
     blk->ret = ret;
 
+    blk->time = get_clock() - blk->time;
+
+    add_avg_read_time(blk->time);
+
     QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
 
     block_mig_state.submitted--;
@@ -182,6 +268,8 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
     blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
     qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
 
+    blk->time = get_clock();
+
     blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
                                 nr_sectors, blk_mig_read_cb, blk);
     if (!blk->aiocb) {
@@ -223,6 +311,8 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
     block_mig_state.total_sector_sum = 0;
     block_mig_state.prev_progress = -1;
     block_mig_state.bulk_completed = 0;
+    block_mig_state.total_time = 0;
+    block_mig_state.reads = 0;
 
     for (bs = bdrv_first; bs != NULL; bs = bs->next) {
         if (bs->type == BDRV_TYPE_HD) {
@@ -321,6 +411,8 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
                 blk->iov.iov_base = blk->buf;
                 blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
                 qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
+
+		blk->time = get_clock();
 
                 blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
                                             nr_sectors, blk_mig_read_cb, blk);
@@ -403,10 +495,42 @@ static void flush_blks(QEMUFile* f)
             block_mig_state.transferred);
 }
 
+static int64_t get_remaining_dirty(void)
+{
+    BlkMigDevState *bmds;
+    int64_t dirty = 0;
+
+    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
+        dirty += bdrv_get_dirty_count(bmds->bs);
+    }
+
+    return dirty * BLOCK_SIZE;
+}
+
 static int is_stage2_completed(void)
 {
-    return (block_mig_state.submitted == 0 &&
-	    block_mig_state.bulk_completed);
+    int64_t remaining_dirty;
+    long double bwidth;
+
+    if (block_mig_state.bulk_completed == 1) {
+
+        remaining_dirty = get_remaining_dirty();
+	if(remaining_dirty == 0) {
+	    return 1;
+	}
+
+	bwidth = compute_read_bwidth();
+
+	if ((remaining_dirty / bwidth) <=
+            migrate_max_downtime()) {
+            /* finish stage2 because we think that we can finish remaing work
+               below max_downtime */
+
+            return 1;
+        }
+    }
+
+    return 0;
 }
 
 static void blk_mig_cleanup(Monitor *mon)
@@ -490,7 +614,9 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
     }
 
     if (stage == 3) {
-        /* we now for sure that save bulk is completed */
+        /* we know for sure that save bulk is completed and
+           all async read completed */
+        assert(block_mig_state.submitted == 0);
 
         while(blk_mig_save_dirty_block(mon, f, 0) != 0);
         blk_mig_cleanup(mon);
@@ -580,4 +706,6 @@ void blk_mig_init(void)
 
     register_savevm_live("block", 0, 1, block_set_params, block_save_live,
                          NULL, block_load, &block_mig_state);
+
+    init_get_clock();
 }
-- 
1.6.0.4

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH v2 4/4] Try not to exceed max downtime on stage3
  2010-01-21 15:24       ` [Qemu-devel] [PATCH v2 4/4] Try not to exceed max downtime on stage3 Liran Schour
@ 2010-01-21 18:03         ` Pierre Riteau
  2010-01-25  8:57           ` Liran Schour
  0 siblings, 1 reply; 9+ messages in thread
From: Pierre Riteau @ 2010-01-21 18:03 UTC (permalink / raw)
  To: Liran Schour; +Cc: qemu-devel

On 21 janv. 2010, at 16:24, Liran Schour wrote:

> Move to stage3 only when remaining work can be done below max downtime.
> 
> Changes from v1: remove max iterations. Try to infer storage performance and by that calculate remaining work.
> 
> Signed-off-by: Liran Schour <lirans@il.ibm.com>
> ---
> block-migration.c |  136 +++++++++++++++++++++++++++++++++++++++++++++++++++--
> 1 files changed, 132 insertions(+), 4 deletions(-)
> 
> diff --git a/block-migration.c b/block-migration.c
> index 16df75f..5ef3eb8 100644
> --- a/block-migration.c
> +++ b/block-migration.c
> @@ -17,6 +17,7 @@
> #include "qemu-queue.h"
> #include "monitor.h"
> #include "block-migration.h"
> +#include "migration.h"
> #include <assert.h>
> 
> #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
> @@ -60,6 +61,7 @@ typedef struct BlkMigBlock {
>     QEMUIOVector qiov;
>     BlockDriverAIOCB *aiocb;
>     int ret;
> +    long double time;
>     QSIMPLEQ_ENTRY(BlkMigBlock) entry;
> } BlkMigBlock;
> 
> @@ -74,11 +76,79 @@ typedef struct BlkMigState {
>     int64_t total_sector_sum;
>     int prev_progress;
>     int bulk_completed;
> -    int dirty_iterations;
> +    long double total_time;
> +    int reads;
> } BlkMigState;
> 
> static BlkMigState block_mig_state;
> 
> +static int64_t get_clock_realtime(void)
> +{
> +    struct timeval tv;
> +
> +    gettimeofday(&tv, NULL);
> +    return tv.tv_sec * 1000000000LL + (tv.tv_usec * 1000);
> +}
> +
> +#ifdef WIN32
> +
> +static int64_t clock_freq;
> +
> +static void init_get_clock(void)
> +{
> +    LARGE_INTEGER freq;
> +    int ret;
> +    ret = QueryPerformanceFrequency(&freq);
> +    if (ret == 0) {
> +        fprintf(stderr, "Could not calibrate ticks\n");
> +        exit(1);
> +    }
> +    clock_freq = freq.QuadPart;
> +}
> +
> +static int64_t get_clock(void)
> +{
> +    LARGE_INTEGER ti;
> +    QueryPerformanceCounter(&ti);
> +    return muldiv64(ti.QuadPart, get_ticks_per_sec(), clock_freq);
> +}
> +
> +#else
> +
> +static int use_rt_clock;
> +
> +static void init_get_clock(void)
> +{
> +    use_rt_clock = 0;
> +#if defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD_version >= 500000) \
> +    || defined(__DragonFly__) || defined(__FreeBSD_kernel__)
> +    {
> +        struct timespec ts;
> +        if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
> +            use_rt_clock = 1;
> +        }
> +    }
> +#endif
> +}
> +
> +static int64_t get_clock(void)
> +{
> +#if defined(__linux__) || (defined(__FreeBSD__) && __FreeBSD_version >= 500000) \
> +	|| defined(__DragonFly__) || defined(__FreeBSD_kernel__)
> +    if (use_rt_clock) {
> +        struct timespec ts;
> +        clock_gettime(CLOCK_MONOTONIC, &ts);
> +        return ts.tv_sec * 1000000000LL + ts.tv_nsec;
> +    } else
> +#endif
> +    {
> +        /* XXX: using gettimeofday leads to problems if the date
> +           changes, so it should be avoided. */
> +        return get_clock_realtime();
> +    }
> +}
> +#endif
> +
> static void blk_send(QEMUFile *f, BlkMigBlock * blk)
> {
>     int len;
> @@ -127,12 +197,28 @@ uint64_t blk_mig_bytes_total(void)
>     return sum << BDRV_SECTOR_BITS;
> }
> 
> +static inline void add_avg_read_time(long double time)
> +{
> +    block_mig_state.reads++;
> +    block_mig_state.total_time += time;
> +}
> +
> +static inline long double compute_read_bwidth(void)
> +{
> +    assert(block_mig_state.total_time != 0);
> +    return  (block_mig_state.reads * BLOCK_SIZE)/ block_mig_state.total_time;
> +}
> +
> static void blk_mig_read_cb(void *opaque, int ret)
> {
>     BlkMigBlock *blk = opaque;
> 
>     blk->ret = ret;
> 
> +    blk->time = get_clock() - blk->time;
> +
> +    add_avg_read_time(blk->time);
> +
>     QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
> 
>     block_mig_state.submitted--;
> @@ -182,6 +268,8 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
>     blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
>     qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
> 
> +    blk->time = get_clock();
> +
>     blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
>                                 nr_sectors, blk_mig_read_cb, blk);
>     if (!blk->aiocb) {
> @@ -223,6 +311,8 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
>     block_mig_state.total_sector_sum = 0;
>     block_mig_state.prev_progress = -1;
>     block_mig_state.bulk_completed = 0;
> +    block_mig_state.total_time = 0;
> +    block_mig_state.reads = 0;
> 
>     for (bs = bdrv_first; bs != NULL; bs = bs->next) {
>         if (bs->type == BDRV_TYPE_HD) {
> @@ -321,6 +411,8 @@ static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
>                 blk->iov.iov_base = blk->buf;
>                 blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
>                 qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
> +
> +		blk->time = get_clock();
> 
>                 blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
>                                             nr_sectors, blk_mig_read_cb, blk);
> @@ -403,10 +495,42 @@ static void flush_blks(QEMUFile* f)
>             block_mig_state.transferred);
> }
> 
> +static int64_t get_remaining_dirty(void)
> +{
> +    BlkMigDevState *bmds;
> +    int64_t dirty = 0;
> +
> +    QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
> +        dirty += bdrv_get_dirty_count(bmds->bs);
> +    }
> +
> +    return dirty * BLOCK_SIZE;
> +}
> +
> static int is_stage2_completed(void)
> {
> -    return (block_mig_state.submitted == 0 &&
> -	    block_mig_state.bulk_completed);
> +    int64_t remaining_dirty;
> +    long double bwidth;
> +
> +    if (block_mig_state.bulk_completed == 1) {
> +
> +        remaining_dirty = get_remaining_dirty();
> +	if(remaining_dirty == 0) {
> +	    return 1;
> +	}
> +
> +	bwidth = compute_read_bwidth();
> +
> +	if ((remaining_dirty / bwidth) <=
> +            migrate_max_downtime()) {
> +            /* finish stage2 because we think that we can finish remaing work
> +               below max_downtime */
> +
> +            return 1;
> +        }
> +    }
> +
> +    return 0;
> }
> 
> static void blk_mig_cleanup(Monitor *mon)
> @@ -490,7 +614,9 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
>     }
> 
>     if (stage == 3) {
> -        /* we now for sure that save bulk is completed */
> +        /* we know for sure that save bulk is completed and
> +           all async read completed */
> +        assert(block_mig_state.submitted == 0);
> 
>         while(blk_mig_save_dirty_block(mon, f, 0) != 0);
>         blk_mig_cleanup(mon);
> @@ -580,4 +706,6 @@ void blk_mig_init(void)
> 
>     register_savevm_live("block", 0, 1, block_set_params, block_save_live,
>                          NULL, block_load, &block_mig_state);
> +
> +    init_get_clock();
> }
> -- 
> 1.6.0.4
> 
> 
> 


I haven't read the patch in detail but I think we should be able to avoid duplicating code from vl.c by using qemu_get_clock.
Also, is floating point really necessary?

-- 
Pierre Riteau -- PhD student, Myriads team, IRISA, Rennes, France
http://perso.univ-rennes1.fr/pierre.riteau/

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH v2 4/4] Try not to exceed max downtime on stage3
  2010-01-21 18:03         ` Pierre Riteau
@ 2010-01-25  8:57           ` Liran Schour
  2010-01-25  9:16             ` Pierre Riteau
  0 siblings, 1 reply; 9+ messages in thread
From: Liran Schour @ 2010-01-25  8:57 UTC (permalink / raw)
  To: Pierre Riteau; +Cc: qemu-devel



Pierre Riteau <Pierre.Riteau@irisa.fr> wrote on 21/01/2010 20:03:32:

> On 21 janv. 2010, at 16:24, Liran Schour wrote:
>
> > Move to stage3 only when remaining work can be done below max downtime.
> >
> > Changes from v1: remove max iterations. Try to infer storage
> performance and by that calculate remaining work.
...
>
> I haven't read the patch in detail but I think we should be able to
> avoid duplicating code from vl.c by using qemu_get_clock.
> Also, is floating point really necessary?

I thought that qemu_get_clock would return a value at 1000 Hz (and that is
too low a resolution). But now I see that I can use qemu_get_clock
(host_clock) and get nanosecond resolution. I will switch it to
qemu_get_clock(host_clock) to avoid duplicating code.
And I think we can avoid floating point here.

Thanks for the review,
- Liran

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH v2 4/4] Try not to exceed max downtime on stage3
  2010-01-25  8:57           ` Liran Schour
@ 2010-01-25  9:16             ` Pierre Riteau
  0 siblings, 0 replies; 9+ messages in thread
From: Pierre Riteau @ 2010-01-25  9:16 UTC (permalink / raw)
  To: Liran Schour; +Cc: qemu-devel

On 25 janv. 2010, at 09:57, Liran Schour wrote:

> 
> 
> Pierre Riteau <Pierre.Riteau@irisa.fr> wrote on 21/01/2010 20:03:32:
> 
>> On 21 janv. 2010, at 16:24, Liran Schour wrote:
>> 
>>> Move to stage3 only when remaining work can be done below max downtime.
>>> 
>>> Changes from v1: remove max iterations. Try to infer storage
>> performance and by that calculate remaining work.
> ...
>> 
>> I haven't read the patch in detail but I think we should be able to
>> avoid duplicating code from vl.c by using qemu_get_clock.
>> Also, is floating point really necessary?
> 
> I thought that qemu_get_clock will return a value in 1000HZ (and that is
> too low resolution). But now I see that I can use qemu_get_clock
> (host_clock) and get nanoseconds resolution. I will switch it to
> qemu_get_clock(host_clock) to avoid duplicating of code.
> And I think we can avoid floating point here.
> 
> Thanks for the review,
> - Liran
> 

You probably don't want to use qemu_get_clock(host_clock): it calls get_clock_realtime(), which uses gettimeofday(). If the clock is modified by NTP, you could get wrong values.
Instead, you could simply introduce code to get the value you want in nanoseconds. Paolo Bonzini has a patch for this in his tree:
http://github.com/bonzini/qemu/commit/cbff458ad6a021582bfddb0f11c4628bbb2cd1e5

-- 
Pierre Riteau -- PhD student, Myriads team, IRISA, Rennes, France
http://perso.univ-rennes1.fr/pierre.riteau/

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [Qemu-devel] [PATCH v2 0/4] Reduce down time during migration without shared storage
  2010-01-21 15:24 [Qemu-devel] [PATCH v2 0/4] Reduce down time during migration without shared storage Liran Schour
  2010-01-21 15:24 ` [Qemu-devel] [PATCH v2 1/4] Remove unused code Liran Schour
@ 2010-01-25  9:28 ` Pierre Riteau
  1 sibling, 0 replies; 9+ messages in thread
From: Pierre Riteau @ 2010-01-25  9:28 UTC (permalink / raw)
  To: Liran Schour; +Cc: qemu-devel

On 21 janv. 2010, at 16:24, Liran Schour wrote:

> This series of patches reduces the down time of the guest during a migration
> without shared storage. It does that by starting to transfer dirty blocks in the 
> iterative phase. In the current code, transferring of dirty blocks begins only 
> during the full phase, while the guest is suspended. Therefore the guest will 
> be suspended for a time linear in the amount of data that was written to disk during
> migration.
> 
> Changes from v1: - infer storage performance by get_clock()
> - remove dirty max iterations - user is responsible for migration convergence
> - remove trailing whitespaces
> - minor cleanups
> 
> block-migration.c |  244 +++++++++++++++++++++++++++++++++++------------------
> block.c           |   20 ++++-
> block.h           |    1 +
> block_int.h       |    1 +
> 4 files changed, 181 insertions(+), 85 deletions(-)
> 
> Signed-off-by: Liran Schour <lirans@il.ibm.com>

I've cleaned up a little bit this patch series. You can find it at http://github.com/priteau/qemu/commits/lirans
Feel free to squash my commits with yours before submitting again.

Also, I don't think comments on changes from the last patch submission should be part of the commit message.

-- 
Pierre Riteau -- PhD student, Myriads team, IRISA, Rennes, France
http://perso.univ-rennes1.fr/pierre.riteau/

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2010-01-25  9:28 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2010-01-21 15:24 [Qemu-devel] [PATCH v2 0/4] Reduce down time during migration without shared storage Liran Schour
2010-01-21 15:24 ` [Qemu-devel] [PATCH v2 1/4] Remove unused code Liran Schour
2010-01-21 15:24   ` [Qemu-devel] [PATCH v2 2/4] Transfer dirty blocks during iterative phase Liran Schour
2010-01-21 15:24     ` [Qemu-devel] [PATCH v2 3/4] Count dirty blocks and expose an API to get dirty count Liran Schour
2010-01-21 15:24       ` [Qemu-devel] [PATCH v2 4/4] Try not to exceed max downtime on stage3 Liran Schour
2010-01-21 18:03         ` Pierre Riteau
2010-01-25  8:57           ` Liran Schour
2010-01-25  9:16             ` Pierre Riteau
2010-01-25  9:28 ` [Qemu-devel] [PATCH v2 0/4] Reduce down time during migration without shared storage Pierre Riteau

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).