* [Qemu-devel] [PATCH 0/4] Reduce down time during migration without shared storage
@ 2010-01-12 8:27 Liran Schour
2010-01-12 8:27 ` [Qemu-devel] [PATCH 1/4] Remove unused code Liran Schour
0 siblings, 1 reply; 13+ messages in thread
From: Liran Schour @ 2010-01-12 8:27 UTC (permalink / raw)
To: qemu-devel; +Cc: Liran Schour
This series of patches reduces the down time of the guest during a migration
without shared storage. It does so by starting to transfer dirty blocks in the
iterative phase. In the current code, transfer of dirty blocks begins only
during the full phase, while the guest is suspended. Therefore the guest will
be suspended for a time linear in the amount of data that was written to disk
during migration.
block-migration.c | 244 +++++++++++++++++++++++++++++++++++------------------
block.c | 20 ++++-
block.h | 1 +
block_int.h | 1 +
4 files changed, 181 insertions(+), 85 deletions(-)
Signed-off-by: Liran Schour <lirans@il.ibm.com>
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Qemu-devel] [PATCH 1/4] Remove unused code
2010-01-12 8:27 [Qemu-devel] [PATCH 0/4] Reduce down time during migration without shared storage Liran Schour
@ 2010-01-12 8:27 ` Liran Schour
2010-01-12 8:27 ` [Qemu-devel] [PATCH 2/4] Tranfer dirty blocks during iterative phase Liran Schour
2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 1/4] Remove unused code Jan Kiszka
0 siblings, 2 replies; 13+ messages in thread
From: Liran Schour @ 2010-01-12 8:27 UTC (permalink / raw)
To: qemu-devel; +Cc: Liran Schour
blk_mig_save_bulked_block is never called with the sync flag set. Remove the sync
flag. Calculate bulk completion during blk_mig_save_bulked_block.
Signed-off-by: Liran Schour <lirans@il.ibm.com>
---
block-migration.c | 63 ++++++++++++++++++++--------------------------------
1 files changed, 24 insertions(+), 39 deletions(-)
diff --git a/block-migration.c b/block-migration.c
index 258a88a..6957909 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -71,7 +71,8 @@ typedef struct BlkMigState {
int read_done;
int transferred;
int64_t total_sector_sum;
- int prev_progress;
+ int prev_progress;
+ int bulk_completed;
} BlkMigState;
static BlkMigState block_mig_state;
@@ -138,7 +139,7 @@ static void blk_mig_read_cb(void *opaque, int ret)
}
static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
- BlkMigDevState *bmds, int is_async)
+ BlkMigDevState *bmds)
{
int64_t total_sectors = bmds->total_sectors;
int64_t cur_sector = bmds->cur_sector;
@@ -175,27 +176,17 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
blk->bmds = bmds;
blk->sector = cur_sector;
- if (is_async) {
- blk->iov.iov_base = blk->buf;
- blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
- qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
-
- blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
- nr_sectors, blk_mig_read_cb, blk);
- if (!blk->aiocb) {
- goto error;
- }
- block_mig_state.submitted++;
- } else {
- if (bdrv_read(bs, cur_sector, blk->buf, nr_sectors) < 0) {
- goto error;
- }
- blk_send(f, blk);
+ blk->iov.iov_base = blk->buf;
+ blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
+ qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
- qemu_free(blk->buf);
- qemu_free(blk);
+ blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
+ nr_sectors, blk_mig_read_cb, blk);
+ if (!blk->aiocb) {
+ goto error;
}
-
+ block_mig_state.submitted++;
+
bdrv_reset_dirty(bs, cur_sector, nr_sectors);
bmds->cur_sector = cur_sector + nr_sectors;
@@ -229,6 +220,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
block_mig_state.transferred = 0;
block_mig_state.total_sector_sum = 0;
block_mig_state.prev_progress = -1;
+ block_mig_state.bulk_completed = 0;
for (bs = bdrv_first; bs != NULL; bs = bs->next) {
if (bs->type == BDRV_TYPE_HD) {
@@ -260,7 +252,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
}
}
-static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async)
+static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f)
{
int64_t completed_sector_sum = 0;
BlkMigDevState *bmds;
@@ -269,7 +261,7 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async)
QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
if (bmds->bulk_completed == 0) {
- if (mig_save_device_bulk(mon, f, bmds, is_async) == 1) {
+ if (mig_save_device_bulk(mon, f, bmds) == 1) {
/* completed bulk section for this device */
bmds->bulk_completed = 1;
}
@@ -289,7 +281,7 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async)
monitor_printf(mon, "Completed %d %%\r", progress);
monitor_flush(mon);
}
-
+
return ret;
}
@@ -362,19 +354,13 @@ static void flush_blks(QEMUFile* f)
static int is_stage2_completed(void)
{
- BlkMigDevState *bmds;
- if (block_mig_state.submitted > 0) {
+ if (block_mig_state.submitted == 0 &&
+ block_mig_state.bulk_completed == 1) {
+ return 1;
+ } else {
return 0;
}
-
- QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
- if (bmds->bulk_completed == 0) {
- return 0;
- }
- }
-
- return 1;
}
static void blk_mig_cleanup(Monitor *mon)
@@ -432,8 +418,9 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
while ((block_mig_state.submitted +
block_mig_state.read_done) * BLOCK_SIZE <
qemu_file_get_rate_limit(f)) {
- if (blk_mig_save_bulked_block(mon, f, 1) == 0) {
- /* no more bulk blocks for now */
+ if (blk_mig_save_bulked_block(mon, f) == 0) {
+ /* finish saving bulk on all devices */
+ block_mig_state.bulk_completed = 1;
break;
}
}
@@ -446,9 +433,7 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
}
if (stage == 3) {
- while (blk_mig_save_bulked_block(mon, f, 0) != 0) {
- /* empty */
- }
+ /* we now for sure that save bulk is completed */
blk_mig_save_dirty_blocks(mon, f);
blk_mig_cleanup(mon);
--
1.5.2.4
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Qemu-devel] [PATCH 2/4] Tranfer dirty blocks during iterative phase
2010-01-12 8:27 ` [Qemu-devel] [PATCH 1/4] Remove unused code Liran Schour
@ 2010-01-12 8:27 ` Liran Schour
2010-01-12 8:27 ` [Qemu-devel] [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Liran Schour
2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 2/4] Tranfer dirty blocks during iterative phase Jan Kiszka
2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 1/4] Remove unused code Jan Kiszka
1 sibling, 2 replies; 13+ messages in thread
From: Liran Schour @ 2010-01-12 8:27 UTC (permalink / raw)
To: qemu-devel; +Cc: Liran Schour
Start transferring dirty blocks during the iterative stage. That will
reduce the time that the guest will be suspended.
Signed-off-by: Liran Schour <lirans@il.ibm.com>
---
block-migration.c | 158 +++++++++++++++++++++++++++++++++++++++--------------
1 files changed, 116 insertions(+), 42 deletions(-)
diff --git a/block-migration.c b/block-migration.c
index 6957909..90c84b1 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -29,6 +29,7 @@
#define MAX_BLOCKS_READ 10000
#define BLOCKS_READ_CHANGE 100
#define INITIAL_BLOCKS_READ 100
+#define MAX_DIRTY_ITERATIONS 100
//#define DEBUG_BLK_MIGRATION
@@ -45,6 +46,7 @@ typedef struct BlkMigDevState {
int bulk_completed;
int shared_base;
int64_t cur_sector;
+ int64_t cur_dirty;
int64_t completed_sectors;
int64_t total_sectors;
int64_t dirty;
@@ -73,6 +75,7 @@ typedef struct BlkMigState {
int64_t total_sector_sum;
int prev_progress;
int bulk_completed;
+ int dirty_iterations;
} BlkMigState;
static BlkMigState block_mig_state;
@@ -221,6 +224,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
block_mig_state.total_sector_sum = 0;
block_mig_state.prev_progress = -1;
block_mig_state.bulk_completed = 0;
+ block_mig_state.dirty_iterations = 0;
for (bs = bdrv_first; bs != NULL; bs = bs->next) {
if (bs->type == BDRV_TYPE_HD) {
@@ -285,39 +289,88 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f)
return ret;
}
-#define MAX_NUM_BLOCKS 4
-
-static void blk_mig_save_dirty_blocks(Monitor *mon, QEMUFile *f)
+static void blk_mig_reset_dirty_curser(void)
{
BlkMigDevState *bmds;
- BlkMigBlock blk;
- int64_t sector;
+
+ QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
+ bmds->cur_dirty = 0;
+ }
+}
- blk.buf = qemu_malloc(BLOCK_SIZE);
+static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
+ BlkMigDevState *bmds, int is_async)
+{
+ BlkMigBlock *blk;
+ int64_t total_sectors = bmds->total_sectors;
+ int64_t sector;
+ int nr_sectors;
- QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
- for (sector = 0; sector < bmds->cur_sector;) {
- if (bdrv_get_dirty(bmds->bs, sector)) {
- if (bdrv_read(bmds->bs, sector, blk.buf,
- BDRV_SECTORS_PER_DIRTY_CHUNK) < 0) {
- monitor_printf(mon, "Error reading sector %" PRId64 "\n",
- sector);
- qemu_file_set_error(f);
- qemu_free(blk.buf);
- return;
+ for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
+ if (bdrv_get_dirty(bmds->bs, sector)) {
+
+ if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
+ nr_sectors = total_sectors - sector;
+ } else {
+ nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
+ }
+ blk = qemu_malloc(sizeof(BlkMigBlock));
+ blk->buf = qemu_malloc(BLOCK_SIZE);
+ blk->bmds = bmds;
+ blk->sector = sector;
+
+ if(is_async) {
+ blk->iov.iov_base = blk->buf;
+ blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
+ qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
+
+ blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
+ nr_sectors, blk_mig_read_cb, blk);
+ if (!blk->aiocb) {
+ goto error;
}
- blk.bmds = bmds;
- blk.sector = sector;
- blk_send(f, &blk);
-
- bdrv_reset_dirty(bmds->bs, sector,
- BDRV_SECTORS_PER_DIRTY_CHUNK);
+ block_mig_state.submitted++;
+ } else {
+ if (bdrv_read(bmds->bs, sector, blk->buf,
+ nr_sectors) < 0) {
+ goto error;
+ }
+ blk_send(f, blk);
+
+ qemu_free(blk->buf);
+ qemu_free(blk);
}
- sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+ bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
+ break;
}
+ sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
+ bmds->cur_dirty = sector;
}
+
+ return (bmds->cur_dirty >= bmds->total_sectors);
+
+ error:
+ monitor_printf(mon, "Error reading sector %" PRId64 "\n", sector);
+ qemu_file_set_error(f);
+ qemu_free(blk->buf);
+ qemu_free(blk);
+ return 0;
+}
+
+static int blk_mig_save_dirty_block(Monitor *mon, QEMUFile *f, int is_async)
+{
+ BlkMigDevState *bmds;
+ int ret = 0;
- qemu_free(blk.buf);
+ QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
+ if(mig_save_device_dirty(mon, f, bmds, is_async) == 0) {
+ ret = 1;
+ break;
+ }
+ }
+
+ return ret;
}
static void flush_blks(QEMUFile* f)
@@ -386,6 +439,8 @@ static void blk_mig_cleanup(Monitor *mon)
static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
{
+ int dirty_iteration = 0;
+
dprintf("Enter save live stage %d submitted %d transferred %d\n",
stage, block_mig_state.submitted, block_mig_state.transferred);
@@ -413,29 +468,48 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
blk_mig_cleanup(mon);
return 0;
}
-
- /* control the rate of transfer */
- while ((block_mig_state.submitted +
- block_mig_state.read_done) * BLOCK_SIZE <
- qemu_file_get_rate_limit(f)) {
- if (blk_mig_save_bulked_block(mon, f) == 0) {
- /* finish saving bulk on all devices */
- block_mig_state.bulk_completed = 1;
- break;
+
+ blk_mig_reset_dirty_curser();
+
+ if(stage == 2) {
+ /* control the rate of transfer */
+ while ((block_mig_state.submitted +
+ block_mig_state.read_done) * BLOCK_SIZE <
+ qemu_file_get_rate_limit(f)) {
+ if (block_mig_state.bulk_completed == 0) {
+ /* first finish the bulk phase */
+ if (blk_mig_save_bulked_block(mon, f) == 0) {
+ /* finish saving bulk on all devices */
+ block_mig_state.bulk_completed = 1;
+ }
+ } else if (block_mig_state.dirty_iterations < MAX_DIRTY_ITERATIONS) {
+ if (dirty_iteration == 0) {
+ /* increment dirty iteration only once per round */
+ dirty_iteration = 1;
+ block_mig_state.dirty_iterations++;
+ }
+ if (blk_mig_save_dirty_block(mon, f, 1) == 0) {
+ /* no more dirty blocks */
+ break;
+ }
+ } else {
+ /* if we got here stop the loop */
+ break;
+ }
+ }
+
+ flush_blks(f);
+
+ if (qemu_file_has_error(f)) {
+ blk_mig_cleanup(mon);
+ return 0;
}
}
-
- flush_blks(f);
-
- if (qemu_file_has_error(f)) {
- blk_mig_cleanup(mon);
- return 0;
- }
-
+
if (stage == 3) {
/* we now for sure that save bulk is completed */
- blk_mig_save_dirty_blocks(mon, f);
+ while(blk_mig_save_dirty_block(mon, f, 0) != 0);
blk_mig_cleanup(mon);
/* report completion */
--
1.5.2.4
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Qemu-devel] [PATCH 3/4] Count dirty blocks and expose an API to get dirty count
2010-01-12 8:27 ` [Qemu-devel] [PATCH 2/4] Tranfer dirty blocks during iterative phase Liran Schour
@ 2010-01-12 8:27 ` Liran Schour
2010-01-12 8:27 ` [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3 Liran Schour
2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Jan Kiszka
2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 2/4] Tranfer dirty blocks during iterative phase Jan Kiszka
1 sibling, 2 replies; 13+ messages in thread
From: Liran Schour @ 2010-01-12 8:27 UTC (permalink / raw)
To: qemu-devel; +Cc: Liran Schour
This will manage a dirty counter for each device and will allow retrieving the
dirty counter from above.
Signed-off-by: Liran Schour <lirans@il.ibm.com>
---
block.c | 20 ++++++++++++++++----
block.h | 1 +
block_int.h | 1 +
3 files changed, 18 insertions(+), 4 deletions(-)
diff --git a/block.c b/block.c
index 853f025..ecfcba4 100644
--- a/block.c
+++ b/block.c
@@ -652,9 +652,15 @@ static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
bit = start % (sizeof(unsigned long) * 8);
val = bs->dirty_bitmap[idx];
if (dirty) {
- val |= 1 << bit;
+ if (!(val & (1 << bit))) {
+ bs->dirty_count++;
+ val |= 1 << bit;
+ }
} else {
- val &= ~(1 << bit);
+ if (val & (1 << bit)) {
+ bs->dirty_count--;
+ val &= ~(1 << bit);
+ }
}
bs->dirty_bitmap[idx] = val;
}
@@ -1972,14 +1978,15 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size)
void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
{
int64_t bitmap_size;
-
+
+ bs->dirty_count = 0;
if (enable) {
if (!bs->dirty_bitmap) {
bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
- bs->dirty_bitmap = qemu_mallocz(bitmap_size);
+ bs->dirty_bitmap = qemu_mallocz(bitmap_size);
}
} else {
if (bs->dirty_bitmap) {
@@ -2007,3 +2014,8 @@ void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
{
set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
}
+
+int64_t bdrv_get_dirty_count(BlockDriverState *bs)
+{
+ return bs->dirty_count;
+}
diff --git a/block.h b/block.h
index 4a8b628..bf489d0 100644
--- a/block.h
+++ b/block.h
@@ -198,4 +198,5 @@ void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable);
int bdrv_get_dirty(BlockDriverState *bs, int64_t sector);
void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
int nr_sectors);
+int64_t bdrv_get_dirty_count(BlockDriverState *bs);
#endif
diff --git a/block_int.h b/block_int.h
index 9a3b2e0..8d5d9bc 100644
--- a/block_int.h
+++ b/block_int.h
@@ -172,6 +172,7 @@ struct BlockDriverState {
int type;
char device_name[32];
unsigned long *dirty_bitmap;
+ int64_t dirty_count;
BlockDriverState *next;
void *private;
};
--
1.5.2.4
^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3
2010-01-12 8:27 ` [Qemu-devel] [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Liran Schour
@ 2010-01-12 8:27 ` Liran Schour
2010-01-12 9:52 ` Pierre Riteau
2010-01-12 11:51 ` [Qemu-devel] " Jan Kiszka
2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Jan Kiszka
1 sibling, 2 replies; 13+ messages in thread
From: Liran Schour @ 2010-01-12 8:27 UTC (permalink / raw)
To: qemu-devel; +Cc: Liran Schour
Move to stage3 only when the remaining work can be done below max downtime.
To make sure the process will converge, we will try only MAX_DIRTY_ITERATIONS.
Signed-off-by: Liran Schour <lirans@il.ibm.com>
---
block-migration.c | 67 +++++++++++++++++++++++++++++++++++-----------------
1 files changed, 45 insertions(+), 22 deletions(-)
diff --git a/block-migration.c b/block-migration.c
index 90c84b1..9ae04c4 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -17,6 +17,7 @@
#include "qemu-queue.h"
#include "monitor.h"
#include "block-migration.h"
+#include "migration.h"
#include <assert.h>
#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
@@ -30,6 +31,7 @@
#define BLOCKS_READ_CHANGE 100
#define INITIAL_BLOCKS_READ 100
#define MAX_DIRTY_ITERATIONS 100
+#define DISK_RATE (30 << 20) //30 MB/sec
//#define DEBUG_BLK_MIGRATION
@@ -135,10 +137,11 @@ static void blk_mig_read_cb(void *opaque, int ret)
blk->ret = ret;
QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
-
+
block_mig_state.submitted--;
block_mig_state.read_done++;
assert(block_mig_state.submitted >= 0);
+
}
static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
@@ -225,7 +228,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
block_mig_state.prev_progress = -1;
block_mig_state.bulk_completed = 0;
block_mig_state.dirty_iterations = 0;
-
+
for (bs = bdrv_first; bs != NULL; bs = bs->next) {
if (bs->type == BDRV_TYPE_HD) {
sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
@@ -405,15 +408,41 @@ static void flush_blks(QEMUFile* f)
block_mig_state.transferred);
}
-static int is_stage2_completed(void)
+static int64_t get_remaining_dirty(void)
{
+ BlkMigDevState *bmds;
+ int64_t dirty = 0;
+
+ QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
+ dirty += bdrv_get_dirty_count(bmds->bs);
+ }
+
+ return dirty;
+}
- if (block_mig_state.submitted == 0 &&
- block_mig_state.bulk_completed == 1) {
- return 1;
- } else {
- return 0;
+static int is_stage2_completed(void)
+{
+ int64_t remaining_dirty;
+
+ if (block_mig_state.bulk_completed == 1) {
+ if (block_mig_state.dirty_iterations++ > MAX_DIRTY_ITERATIONS) {
+ /* finish stage2 because we have too much dirty iterations */
+
+ return 1;
+ }
+
+ remaining_dirty = get_remaining_dirty();
+
+ if ((remaining_dirty * BLOCK_SIZE) * 1000000000 / DISK_RATE <=
+ migrate_max_downtime()) {
+ /* finish stage2 because we think that we can finish remaing work
+ below max_downtime */
+
+ return 1;
+ }
}
+
+ return 0;
}
static void blk_mig_cleanup(Monitor *mon)
@@ -438,9 +467,7 @@ static void blk_mig_cleanup(Monitor *mon)
}
static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
-{
- int dirty_iteration = 0;
-
+{
dprintf("Enter save live stage %d submitted %d transferred %d\n",
stage, block_mig_state.submitted, block_mig_state.transferred);
@@ -482,19 +509,12 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
/* finish saving bulk on all devices */
block_mig_state.bulk_completed = 1;
}
- } else if (block_mig_state.dirty_iterations < MAX_DIRTY_ITERATIONS) {
- if (dirty_iteration == 0) {
- /* increment dirty iteration only once per round */
- dirty_iteration = 1;
- block_mig_state.dirty_iterations++;
- }
+ } else {
+
if (blk_mig_save_dirty_block(mon, f, 1) == 0) {
/* no more dirty blocks */
break;
}
- } else {
- /* if we got here stop the loop */
- break;
}
}
@@ -507,9 +527,12 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
}
if (stage == 3) {
- /* we now for sure that save bulk is completed */
-
+ /* we know for sure that save bulk is completed and
+ all async read completed */
+ assert(block_mig_state.submitted == 0);
+
while(blk_mig_save_dirty_block(mon, f, 0) != 0);
+
blk_mig_cleanup(mon);
/* report completion */
--
1.5.2.4
^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3
2010-01-12 8:27 ` [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3 Liran Schour
@ 2010-01-12 9:52 ` Pierre Riteau
2010-01-12 11:56 ` Liran Schour
2010-01-12 11:51 ` [Qemu-devel] " Jan Kiszka
1 sibling, 1 reply; 13+ messages in thread
From: Pierre Riteau @ 2010-01-12 9:52 UTC (permalink / raw)
To: Liran Schour; +Cc: qemu-devel
On 12 janv. 2010, at 09:27, Liran Schour wrote:
> Move to stage3 only when remaining work can be done below max downtime.
> To make sure the process will converge we will try only MAX_DIRTY_ITERATIONS.
>
> Signed-off-by: Liran Schour <lirans@il.ibm.com>
> ---
> block-migration.c | 67 +++++++++++++++++++++++++++++++++++-----------------
> 1 files changed, 45 insertions(+), 22 deletions(-)
>
> diff --git a/block-migration.c b/block-migration.c
> index 90c84b1..9ae04c4 100644
> --- a/block-migration.c
> +++ b/block-migration.c
> @@ -17,6 +17,7 @@
> #include "qemu-queue.h"
> #include "monitor.h"
> #include "block-migration.h"
> +#include "migration.h"
> #include <assert.h>
>
> #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
> @@ -30,6 +31,7 @@
> #define BLOCKS_READ_CHANGE 100
> #define INITIAL_BLOCKS_READ 100
> #define MAX_DIRTY_ITERATIONS 100
> +#define DISK_RATE (30 << 20) //30 MB/sec
This number seems rather arbitrary. We should try to infer the storage performance from previous reads instead (but it could be difficult; for example, when we switch from bulk copy to dirty blocks only, we may switch from sequential reads to random reads).
Also, shouldn't the migration speed limit (migrate_set_speed) be taken into account?
> //#define DEBUG_BLK_MIGRATION
>
> @@ -135,10 +137,11 @@ static void blk_mig_read_cb(void *opaque, int ret)
> blk->ret = ret;
>
> QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
> -
> +
Please don't add trailing white space...
> block_mig_state.submitted--;
> block_mig_state.read_done++;
> assert(block_mig_state.submitted >= 0);
> +
> }
... and unnecessary new lines. Comments valid for the rest of this patch and the other ones as well.
>
> static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
> @@ -225,7 +228,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
> block_mig_state.prev_progress = -1;
> block_mig_state.bulk_completed = 0;
> block_mig_state.dirty_iterations = 0;
> -
> +
> for (bs = bdrv_first; bs != NULL; bs = bs->next) {
> if (bs->type == BDRV_TYPE_HD) {
> sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
> @@ -405,15 +408,41 @@ static void flush_blks(QEMUFile* f)
> block_mig_state.transferred);
> }
>
> -static int is_stage2_completed(void)
> +static int64_t get_remaining_dirty(void)
> {
> + BlkMigDevState *bmds;
> + int64_t dirty = 0;
> +
> + QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
> + dirty += bdrv_get_dirty_count(bmds->bs);
> + }
> +
> + return dirty;
> +}
>
> - if (block_mig_state.submitted == 0 &&
> - block_mig_state.bulk_completed == 1) {
> - return 1;
> - } else {
> - return 0;
> +static int is_stage2_completed(void)
> +{
> + int64_t remaining_dirty;
> +
> + if (block_mig_state.bulk_completed == 1) {
> + if (block_mig_state.dirty_iterations++ > MAX_DIRTY_ITERATIONS) {
> + /* finish stage2 because we have too much dirty iterations */
> +
> + return 1;
> + }
> +
> + remaining_dirty = get_remaining_dirty();
> +
> + if ((remaining_dirty * BLOCK_SIZE) * 1000000000 / DISK_RATE <=
> + migrate_max_downtime()) {
> + /* finish stage2 because we think that we can finish remaing work
> + below max_downtime */
> +
> + return 1;
> + }
> }
> +
> + return 0;
> }
>
> static void blk_mig_cleanup(Monitor *mon)
> @@ -438,9 +467,7 @@ static void blk_mig_cleanup(Monitor *mon)
> }
>
> static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
> -{
> - int dirty_iteration = 0;
> -
> +{
> dprintf("Enter save live stage %d submitted %d transferred %d\n",
> stage, block_mig_state.submitted, block_mig_state.transferred);
>
> @@ -482,19 +509,12 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
> /* finish saving bulk on all devices */
> block_mig_state.bulk_completed = 1;
> }
> - } else if (block_mig_state.dirty_iterations < MAX_DIRTY_ITERATIONS) {
> - if (dirty_iteration == 0) {
> - /* increment dirty iteration only once per round */
> - dirty_iteration = 1;
> - block_mig_state.dirty_iterations++;
> - }
> + } else {
> +
> if (blk_mig_save_dirty_block(mon, f, 1) == 0) {
> /* no more dirty blocks */
> break;
> }
> - } else {
> - /* if we got here stop the loop */
> - break;
> }
> }
>
> @@ -507,9 +527,12 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
> }
>
> if (stage == 3) {
> - /* we now for sure that save bulk is completed */
> -
> + /* we know for sure that save bulk is completed and
> + all async read completed */
> + assert(block_mig_state.submitted == 0);
> +
> while(blk_mig_save_dirty_block(mon, f, 0) != 0);
> +
> blk_mig_cleanup(mon);
>
> /* report completion */
> --
> 1.5.2.4
>
>
>
--
Pierre Riteau -- http://perso.univ-rennes1.fr/pierre.riteau/
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Qemu-devel] Re: [PATCH 1/4] Remove unused code
2010-01-12 8:27 ` [Qemu-devel] [PATCH 1/4] Remove unused code Liran Schour
2010-01-12 8:27 ` [Qemu-devel] [PATCH 2/4] Tranfer dirty blocks during iterative phase Liran Schour
@ 2010-01-12 11:50 ` Jan Kiszka
1 sibling, 0 replies; 13+ messages in thread
From: Jan Kiszka @ 2010-01-12 11:50 UTC (permalink / raw)
To: Liran Schour; +Cc: qemu-devel
Liran Schour wrote:
> blk_mig_save_bulked_block is never called with sync flag. Remove the sync
> flag. Calculate bulk completion during blk_mig_save_bulked_block.
>
> Signed-off-by: Liran Schour <lirans@il.ibm.com>
> ---
> block-migration.c | 63 ++++++++++++++++++++--------------------------------
> 1 files changed, 24 insertions(+), 39 deletions(-)
>
> diff --git a/block-migration.c b/block-migration.c
> index 258a88a..6957909 100644
> --- a/block-migration.c
> +++ b/block-migration.c
> @@ -71,7 +71,8 @@ typedef struct BlkMigState {
> int read_done;
> int transferred;
> int64_t total_sector_sum;
> - int prev_progress;
> + int prev_progress;
> + int bulk_completed;
Trailing whitespaces - here and elsewhere in the series. Please make
sure to clean such things up. Almost all editors come with visualization
for trailing spaces and also tabs.
> } BlkMigState;
>
> static BlkMigState block_mig_state;
> @@ -138,7 +139,7 @@ static void blk_mig_read_cb(void *opaque, int ret)
> }
>
> static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
> - BlkMigDevState *bmds, int is_async)
> + BlkMigDevState *bmds)
> {
> int64_t total_sectors = bmds->total_sectors;
> int64_t cur_sector = bmds->cur_sector;
> @@ -175,27 +176,17 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f,
> blk->bmds = bmds;
> blk->sector = cur_sector;
>
> - if (is_async) {
> - blk->iov.iov_base = blk->buf;
> - blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
> - qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
> -
> - blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
> - nr_sectors, blk_mig_read_cb, blk);
> - if (!blk->aiocb) {
> - goto error;
> - }
> - block_mig_state.submitted++;
> - } else {
> - if (bdrv_read(bs, cur_sector, blk->buf, nr_sectors) < 0) {
> - goto error;
> - }
> - blk_send(f, blk);
> + blk->iov.iov_base = blk->buf;
> + blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
> + qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
>
> - qemu_free(blk->buf);
> - qemu_free(blk);
> + blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov,
> + nr_sectors, blk_mig_read_cb, blk);
> + if (!blk->aiocb) {
> + goto error;
> }
> -
> + block_mig_state.submitted++;
> +
> bdrv_reset_dirty(bs, cur_sector, nr_sectors);
> bmds->cur_sector = cur_sector + nr_sectors;
>
> @@ -229,6 +220,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
> block_mig_state.transferred = 0;
> block_mig_state.total_sector_sum = 0;
> block_mig_state.prev_progress = -1;
> + block_mig_state.bulk_completed = 0;
>
> for (bs = bdrv_first; bs != NULL; bs = bs->next) {
> if (bs->type == BDRV_TYPE_HD) {
> @@ -260,7 +252,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
> }
> }
>
> -static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async)
> +static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f)
> {
> int64_t completed_sector_sum = 0;
> BlkMigDevState *bmds;
> @@ -269,7 +261,7 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async)
>
> QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
> if (bmds->bulk_completed == 0) {
> - if (mig_save_device_bulk(mon, f, bmds, is_async) == 1) {
> + if (mig_save_device_bulk(mon, f, bmds) == 1) {
> /* completed bulk section for this device */
> bmds->bulk_completed = 1;
> }
> @@ -289,7 +281,7 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async)
> monitor_printf(mon, "Completed %d %%\r", progress);
> monitor_flush(mon);
> }
> -
> +
Such hunks should always ring a bell... :)
> return ret;
> }
>
> @@ -362,19 +354,13 @@ static void flush_blks(QEMUFile* f)
>
> static int is_stage2_completed(void)
> {
> - BlkMigDevState *bmds;
>
> - if (block_mig_state.submitted > 0) {
> + if (block_mig_state.submitted == 0 &&
> + block_mig_state.bulk_completed == 1) {
> + return 1;
> + } else {
return (block_mig_state.submitted == 0 &&
block_mig_state.bulk_completed);
> return 0;
> }
> -
> - QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
> - if (bmds->bulk_completed == 0) {
> - return 0;
> - }
> - }
> -
> - return 1;
> }
>
> static void blk_mig_cleanup(Monitor *mon)
> @@ -432,8 +418,9 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
> while ((block_mig_state.submitted +
> block_mig_state.read_done) * BLOCK_SIZE <
> qemu_file_get_rate_limit(f)) {
> - if (blk_mig_save_bulked_block(mon, f, 1) == 0) {
> - /* no more bulk blocks for now */
> + if (blk_mig_save_bulked_block(mon, f) == 0) {
> + /* finish saving bulk on all devices */
> + block_mig_state.bulk_completed = 1;
> break;
> }
> }
> @@ -446,9 +433,7 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
> }
>
> if (stage == 3) {
> - while (blk_mig_save_bulked_block(mon, f, 0) != 0) {
> - /* empty */
> - }
> + /* we now for sure that save bulk is completed */
>
> blk_mig_save_dirty_blocks(mon, f);
> blk_mig_cleanup(mon);
Otherwise nice cleanup.
Jan
--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Qemu-devel] Re: [PATCH 2/4] Tranfer dirty blocks during iterative phase
2010-01-12 8:27 ` [Qemu-devel] [PATCH 2/4] Tranfer dirty blocks during iterative phase Liran Schour
2010-01-12 8:27 ` [Qemu-devel] [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Liran Schour
@ 2010-01-12 11:50 ` Jan Kiszka
1 sibling, 0 replies; 13+ messages in thread
From: Jan Kiszka @ 2010-01-12 11:50 UTC (permalink / raw)
To: Liran Schour; +Cc: qemu-devel
Liran Schour wrote:
> Start transfer dirty blocks during the iterative stage. That will
> reduce the time that the guest will be suspended
>
> Signed-off-by: Liran Schour <lirans@il.ibm.com>
> ---
> block-migration.c | 158 +++++++++++++++++++++++++++++++++++++++--------------
> 1 files changed, 116 insertions(+), 42 deletions(-)
>
> diff --git a/block-migration.c b/block-migration.c
> index 6957909..90c84b1 100644
> --- a/block-migration.c
> +++ b/block-migration.c
> @@ -29,6 +29,7 @@
> #define MAX_BLOCKS_READ 10000
> #define BLOCKS_READ_CHANGE 100
> #define INITIAL_BLOCKS_READ 100
> +#define MAX_DIRTY_ITERATIONS 100
>
> //#define DEBUG_BLK_MIGRATION
>
> @@ -45,6 +46,7 @@ typedef struct BlkMigDevState {
> int bulk_completed;
> int shared_base;
> int64_t cur_sector;
> + int64_t cur_dirty;
> int64_t completed_sectors;
> int64_t total_sectors;
> int64_t dirty;
> @@ -73,6 +75,7 @@ typedef struct BlkMigState {
> int64_t total_sector_sum;
> int prev_progress;
> int bulk_completed;
> + int dirty_iterations;
> } BlkMigState;
>
> static BlkMigState block_mig_state;
> @@ -221,6 +224,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f)
> block_mig_state.total_sector_sum = 0;
> block_mig_state.prev_progress = -1;
> block_mig_state.bulk_completed = 0;
> + block_mig_state.dirty_iterations = 0;
>
> for (bs = bdrv_first; bs != NULL; bs = bs->next) {
> if (bs->type == BDRV_TYPE_HD) {
> @@ -285,39 +289,88 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f)
> return ret;
> }
>
> -#define MAX_NUM_BLOCKS 4
> -
> -static void blk_mig_save_dirty_blocks(Monitor *mon, QEMUFile *f)
> +static void blk_mig_reset_dirty_curser(void)
My translation helper makes me believe that 'curser' is not what you
intended to call this thing. :)
> {
> BlkMigDevState *bmds;
> - BlkMigBlock blk;
> - int64_t sector;
> +
> + QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
> + bmds->cur_dirty = 0;
> + }
> +}
>
> - blk.buf = qemu_malloc(BLOCK_SIZE);
> +static int mig_save_device_dirty(Monitor *mon, QEMUFile *f,
> + BlkMigDevState *bmds, int is_async)
> +{
> + BlkMigBlock *blk;
> + int64_t total_sectors = bmds->total_sectors;
> + int64_t sector;
> + int nr_sectors;
>
> - QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
> - for (sector = 0; sector < bmds->cur_sector;) {
> - if (bdrv_get_dirty(bmds->bs, sector)) {
> - if (bdrv_read(bmds->bs, sector, blk.buf,
> - BDRV_SECTORS_PER_DIRTY_CHUNK) < 0) {
> - monitor_printf(mon, "Error reading sector %" PRId64 "\n",
> - sector);
> - qemu_file_set_error(f);
> - qemu_free(blk.buf);
> - return;
> + for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) {
> + if (bdrv_get_dirty(bmds->bs, sector)) {
> +
> + if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) {
> + nr_sectors = total_sectors - sector;
> + } else {
> + nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK;
> + }
> + blk = qemu_malloc(sizeof(BlkMigBlock));
> + blk->buf = qemu_malloc(BLOCK_SIZE);
> + blk->bmds = bmds;
> + blk->sector = sector;
> +
> + if(is_async) {
> + blk->iov.iov_base = blk->buf;
> + blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
> + qemu_iovec_init_external(&blk->qiov, &blk->iov, 1);
> +
> + blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov,
> + nr_sectors, blk_mig_read_cb, blk);
> + if (!blk->aiocb) {
> + goto error;
> }
> - blk.bmds = bmds;
> - blk.sector = sector;
> - blk_send(f, &blk);
> -
> - bdrv_reset_dirty(bmds->bs, sector,
> - BDRV_SECTORS_PER_DIRTY_CHUNK);
> + block_mig_state.submitted++;
> + } else {
> + if (bdrv_read(bmds->bs, sector, blk->buf,
> + nr_sectors) < 0) {
> + goto error;
> + }
> + blk_send(f, blk);
> +
> + qemu_free(blk->buf);
> + qemu_free(blk);
> }
> - sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
> +
> + bdrv_reset_dirty(bmds->bs, sector, nr_sectors);
> + break;
> }
> + sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
> + bmds->cur_dirty = sector;
> }
> +
> + return (bmds->cur_dirty >= bmds->total_sectors);
> +
> + error:
Please do not indent the label.
> + monitor_printf(mon, "Error reading sector %" PRId64 "\n", sector);
> + qemu_file_set_error(f);
> + qemu_free(blk->buf);
> + qemu_free(blk);
> + return 0;
> +}
> +
> +static int blk_mig_save_dirty_block(Monitor *mon, QEMUFile *f, int is_async)
> +{
> + BlkMigDevState *bmds;
> + int ret = 0;
>
> - qemu_free(blk.buf);
> + QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) {
> + if(mig_save_device_dirty(mon, f, bmds, is_async) == 0) {
> + ret = 1;
> + break;
> + }
> + }
> +
> + return ret;
> }
>
> static void flush_blks(QEMUFile* f)
> @@ -386,6 +439,8 @@ static void blk_mig_cleanup(Monitor *mon)
>
> static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
> {
> + int dirty_iteration = 0;
> +
> dprintf("Enter save live stage %d submitted %d transferred %d\n",
> stage, block_mig_state.submitted, block_mig_state.transferred);
>
> @@ -413,29 +468,48 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque)
> blk_mig_cleanup(mon);
> return 0;
> }
> -
> - /* control the rate of transfer */
> - while ((block_mig_state.submitted +
> - block_mig_state.read_done) * BLOCK_SIZE <
> - qemu_file_get_rate_limit(f)) {
> - if (blk_mig_save_bulked_block(mon, f) == 0) {
> - /* finish saving bulk on all devices */
> - block_mig_state.bulk_completed = 1;
> - break;
> +
> + blk_mig_reset_dirty_curser();
> +
> + if(stage == 2) {
> + /* control the rate of transfer */
> + while ((block_mig_state.submitted +
> + block_mig_state.read_done) * BLOCK_SIZE <
> + qemu_file_get_rate_limit(f)) {
> + if (block_mig_state.bulk_completed == 0) {
> + /* first finish the bulk phase */
> + if (blk_mig_save_bulked_block(mon, f) == 0) {
> + /* finish saving bulk on all devices */
> + block_mig_state.bulk_completed = 1;
> + }
> + } else if (block_mig_state.dirty_iterations < MAX_DIRTY_ITERATIONS) {
> + if (dirty_iteration == 0) {
> + /* increment dirty iteration only once per round */
> + dirty_iteration = 1;
> + block_mig_state.dirty_iterations++;
> + }
> + if (blk_mig_save_dirty_block(mon, f, 1) == 0) {
> + /* no more dirty blocks */
> + break;
> + }
> + } else {
> + /* if we got here stop the loop */
> + break;
> + }
I did not yet get the purpose and effect of the iteration limitation.
Can you help me?
> + }
> +
> + flush_blks(f);
> +
> + if (qemu_file_has_error(f)) {
> + blk_mig_cleanup(mon);
> + return 0;
> }
> }
> -
> - flush_blks(f);
> -
> - if (qemu_file_has_error(f)) {
> - blk_mig_cleanup(mon);
> - return 0;
> - }
> -
> +
> if (stage == 3) {
> /* we now for sure that save bulk is completed */
>
> - blk_mig_save_dirty_blocks(mon, f);
> + while(blk_mig_save_dirty_block(mon, f, 0) != 0);
while (...
> blk_mig_cleanup(mon);
>
> /* report completion */
Jan
--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Qemu-devel] Re: [PATCH 3/4] Count dirty blocks and expose an API to get dirty count
2010-01-12 8:27 ` [Qemu-devel] [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Liran Schour
2010-01-12 8:27 ` [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3 Liran Schour
@ 2010-01-12 11:50 ` Jan Kiszka
1 sibling, 0 replies; 13+ messages in thread
From: Jan Kiszka @ 2010-01-12 11:50 UTC (permalink / raw)
To: Liran Schour; +Cc: qemu-devel
Liran Schour wrote:
> This will manage dirty counter for each device and will allow to get the
> dirty counter from above.
>
> Signed-off-by: Liran Schour <lirans@il.ibm.com>
> ---
> block.c | 20 ++++++++++++++++----
> block.h | 1 +
> block_int.h | 1 +
> 3 files changed, 18 insertions(+), 4 deletions(-)
>
> diff --git a/block.c b/block.c
> index 853f025..ecfcba4 100644
> --- a/block.c
> +++ b/block.c
> @@ -652,9 +652,15 @@ static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num,
> bit = start % (sizeof(unsigned long) * 8);
> val = bs->dirty_bitmap[idx];
> if (dirty) {
> - val |= 1 << bit;
> + if (!(val & (1 << bit))) {
> + bs->dirty_count++;
> + val |= 1 << bit;
> + }
> } else {
> - val &= ~(1 << bit);
> + if (val & (1 << bit)) {
> + bs->dirty_count--;
> + val &= ~(1 << bit);
> + }
> }
> bs->dirty_bitmap[idx] = val;
> }
> @@ -1972,14 +1978,15 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size)
> void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable)
> {
> int64_t bitmap_size;
> -
> +
> + bs->dirty_count = 0;
> if (enable) {
> if (!bs->dirty_bitmap) {
> bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) +
> BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1;
> bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8;
>
> - bs->dirty_bitmap = qemu_mallocz(bitmap_size);
> + bs->dirty_bitmap = qemu_mallocz(bitmap_size);
> }
> } else {
> if (bs->dirty_bitmap) {
> @@ -2007,3 +2014,8 @@ void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
> {
> set_dirty_bitmap(bs, cur_sector, nr_sectors, 0);
> }
> +
> +int64_t bdrv_get_dirty_count(BlockDriverState *bs)
> +{
> + return bs->dirty_count;
> +}
> diff --git a/block.h b/block.h
> index 4a8b628..bf489d0 100644
> --- a/block.h
> +++ b/block.h
> @@ -198,4 +198,5 @@ void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable);
> int bdrv_get_dirty(BlockDriverState *bs, int64_t sector);
> void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector,
> int nr_sectors);
> +int64_t bdrv_get_dirty_count(BlockDriverState *bs);
> #endif
> diff --git a/block_int.h b/block_int.h
> index 9a3b2e0..8d5d9bc 100644
> --- a/block_int.h
> +++ b/block_int.h
> @@ -172,6 +172,7 @@ struct BlockDriverState {
> int type;
> char device_name[32];
> unsigned long *dirty_bitmap;
> + int64_t dirty_count;
> BlockDriverState *next;
> void *private;
> };
Looks good (except for the whitespace issues).
Jan
--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Qemu-devel] Re: [PATCH 4/4] Try not to exceed max downtime on stage3
2010-01-12 8:27 ` [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3 Liran Schour
2010-01-12 9:52 ` Pierre Riteau
@ 2010-01-12 11:51 ` Jan Kiszka
2010-01-12 15:07 ` Anthony Liguori
2010-01-12 15:07 ` Liran Schour
1 sibling, 2 replies; 13+ messages in thread
From: Jan Kiszka @ 2010-01-12 11:51 UTC (permalink / raw)
To: Liran Schour; +Cc: qemu-devel
Liran Schour wrote:
> Move to stage3 only when remaining work can be done below max downtime.
> To make sure the process will converge we will try only MAX_DIRTY_ITERATIONS.
OK, that explains now patch 2. But do we have such barrier for memory
migration as well? I don't think so, and I don't think this hard-coded
limit is the right approach. Such thing should be derived from the
bandwidth the user can control during runtime.
>
> Signed-off-by: Liran Schour <lirans@il.ibm.com>
> ---
> block-migration.c | 67 +++++++++++++++++++++++++++++++++++-----------------
> 1 files changed, 45 insertions(+), 22 deletions(-)
>
> diff --git a/block-migration.c b/block-migration.c
> index 90c84b1..9ae04c4 100644
> --- a/block-migration.c
> +++ b/block-migration.c
> @@ -17,6 +17,7 @@
> #include "qemu-queue.h"
> #include "monitor.h"
> #include "block-migration.h"
> +#include "migration.h"
> #include <assert.h>
>
> #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
> @@ -30,6 +31,7 @@
> #define BLOCKS_READ_CHANGE 100
> #define INITIAL_BLOCKS_READ 100
> #define MAX_DIRTY_ITERATIONS 100
> +#define DISK_RATE (30 << 20) //30 MB/sec
IMHO a bad idea (e.g. mine was 6 MB/s last time I tried). Measure it
during runtime just like the mem-migration does.
<skipping the rest of the patch>
Jan
--
Siemens AG, Corporate Technology, CT T DE IT 1
Corporate Competence Center Embedded Linux
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3
2010-01-12 9:52 ` Pierre Riteau
@ 2010-01-12 11:56 ` Liran Schour
0 siblings, 0 replies; 13+ messages in thread
From: Liran Schour @ 2010-01-12 11:56 UTC (permalink / raw)
To: Pierre Riteau; +Cc: qemu-devel
Pierre Riteau <Pierre.Riteau@irisa.fr> wrote on 12/01/2010 11:52:18:
> On 12 janv. 2010, at 09:27, Liran Schour wrote:
>
> > Move to stage3 only when remaining work can be done below max downtime.
> > To make sure the process will converge we will try only
> MAX_DIRTY_ITERATIONS.
> >
> > Signed-off-by: Liran Schour <lirans@il.ibm.com>
> > ---
> > block-migration.c | 67 ++++++++++++++++++++++++++++++++++
> +-----------------
> > 1 files changed, 45 insertions(+), 22 deletions(-)
> >
> > diff --git a/block-migration.c b/block-migration.c
> > index 90c84b1..9ae04c4 100644
> > --- a/block-migration.c
> > +++ b/block-migration.c
> > @@ -17,6 +17,7 @@
> > #include "qemu-queue.h"
> > #include "monitor.h"
> > #include "block-migration.h"
> > +#include "migration.h"
> > #include <assert.h>
> >
> > #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
> > @@ -30,6 +31,7 @@
> > #define BLOCKS_READ_CHANGE 100
> > #define INITIAL_BLOCKS_READ 100
> > #define MAX_DIRTY_ITERATIONS 100
> > +#define DISK_RATE (30 << 20) //30 MB/sec
>
> This number seems rather arbitrary. We should try to infer the
> storage performance from previous reads instead (but it could be
> difficult, for example when we switch from bulk copy to dirty blocks
> only, we may switch from sequential reads to random reads).
> Also, shouldn't the migration speed limit (migrate_set_speed) be
> taken into account?
>
It will be hard to estimate the disk rate in advance. We need to estimate
the rate of sync writes (during the final stage) while we are writing async
to disk iteratively. Anyone has a simple solution for this?
About the migration speed limit, we do not need to take this into account
because on stage 3 we will transfer the data without referring to migration
speed limit.
Thanks,
- Liran
^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 4/4] Try not to exceed max downtime on stage3
2010-01-12 11:51 ` [Qemu-devel] " Jan Kiszka
@ 2010-01-12 15:07 ` Anthony Liguori
2010-01-12 15:07 ` Liran Schour
1 sibling, 0 replies; 13+ messages in thread
From: Anthony Liguori @ 2010-01-12 15:07 UTC (permalink / raw)
To: Jan Kiszka; +Cc: qemu-devel, Liran Schour
On 01/12/2010 05:51 AM, Jan Kiszka wrote:
> Liran Schour wrote:
>
>> Move to stage3 only when remaining work can be done below max downtime.
>> To make sure the process will converge we will try only MAX_DIRTY_ITERATIONS.
>>
> OK, that explains now patch 2. But do we have such barrier for memory
> migration as well?
No, we explicitly don't because making that decision is a management
tool job.
A management tool can force convergence by explicitly stopping the guest.
Iterations are a bad metric because they have no real, useful meaning to a user.
Time is probably the best metric and it's easy for a management tool to
set a timer to stop the guest if it hasn't completed by a certain time
period.
Regards,
Anthony Liguori
^ permalink raw reply [flat|nested] 13+ messages in thread
* [Qemu-devel] Re: [PATCH 4/4] Try not to exceed max downtime on stage3
2010-01-12 11:51 ` [Qemu-devel] " Jan Kiszka
2010-01-12 15:07 ` Anthony Liguori
@ 2010-01-12 15:07 ` Liran Schour
1 sibling, 0 replies; 13+ messages in thread
From: Liran Schour @ 2010-01-12 15:07 UTC (permalink / raw)
To: Jan Kiszka; +Cc: qemu-devel
Jan Kiszka <jan.kiszka@siemens.com> wrote on 12/01/2010 13:51:09:
> Liran Schour wrote:
> > Move to stage3 only when remaining work can be done below max downtime.
> > To make sure the process will converge we will try only
> MAX_DIRTY_ITERATIONS.
>
> OK, that explains now patch 2. But do we have such barrier for memory
> migration as well? I don't think so, and I don't think this hard-coded
> limit is the right approach. Such thing should be derived from the
> bandwidth the user can control during runtime.
So if I understand you correctly you say that the way to assure convergence
is by the user controlling the bandwidth during runtime. So we can remove
this MAX_DIRTY_ITERATIONS.
> >
> > Signed-off-by: Liran Schour <lirans@il.ibm.com>
> > ---
> > block-migration.c | 67 ++++++++++++++++++++++++++++++++++
> +-----------------
> > 1 files changed, 45 insertions(+), 22 deletions(-)
> >
> > diff --git a/block-migration.c b/block-migration.c
> > index 90c84b1..9ae04c4 100644
> > --- a/block-migration.c
> > +++ b/block-migration.c
> > @@ -17,6 +17,7 @@
> > #include "qemu-queue.h"
> > #include "monitor.h"
> > #include "block-migration.h"
> > +#include "migration.h"
> > #include <assert.h>
> >
> > #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
> > @@ -30,6 +31,7 @@
> > #define BLOCKS_READ_CHANGE 100
> > #define INITIAL_BLOCKS_READ 100
> > #define MAX_DIRTY_ITERATIONS 100
> > +#define DISK_RATE (30 << 20) //30 MB/sec
>
> IMHO a bad idea (e.g. mine was 6 MB/s last time I tried). Measure it
> during runtime just like the mem-migration does.
How about measuring the performance of reading BLOCK_SIZE and then computing
the disk rate? I can compute the average time to read BLOCK_SIZE and then
estimate the remaining time to complete the work.
Thanks,
- Liran
^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2010-01-12 15:09 UTC | newest]
Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-01-12 8:27 [Qemu-devel] [PATCH 0/4] Reduce down time during migration without shared storage Liran Schour
2010-01-12 8:27 ` [Qemu-devel] [PATCH 1/4] Remove unused code Liran Schour
2010-01-12 8:27 ` [Qemu-devel] [PATCH 2/4] Transfer dirty blocks during iterative phase Liran Schour
2010-01-12 8:27 ` [Qemu-devel] [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Liran Schour
2010-01-12 8:27 ` [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3 Liran Schour
2010-01-12 9:52 ` Pierre Riteau
2010-01-12 11:56 ` Liran Schour
2010-01-12 11:51 ` [Qemu-devel] " Jan Kiszka
2010-01-12 15:07 ` Anthony Liguori
2010-01-12 15:07 ` Liran Schour
2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Jan Kiszka
2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 2/4] Transfer dirty blocks during iterative phase Jan Kiszka
2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 1/4] Remove unused code Jan Kiszka
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).