* [Qemu-devel] [PATCH 0/4] Reduce down time during migration without shared storage @ 2010-01-12 8:27 Liran Schour 2010-01-12 8:27 ` [Qemu-devel] [PATCH 1/4] Remove unused code Liran Schour 0 siblings, 1 reply; 13+ messages in thread From: Liran Schour @ 2010-01-12 8:27 UTC (permalink / raw) To: qemu-devel; +Cc: Liran Schour This series of patches reduce the down time of the guest during a migration without shared storage. It does that by start transfer dirty blocks in the iterative phase. In the current code transferring of dirty blocks begins only during the full phase while the guest is suspended. Therefore the guest will be suspended linear to the amount of data that was written to disk during migration. block-migration.c | 244 +++++++++++++++++++++++++++++++++++------------------ block.c | 20 ++++- block.h | 1 + block_int.h | 1 + 4 files changed, 181 insertions(+), 85 deletions(-) Signed-off-by: Liran Schour <lirans@il.ibm.com> ^ permalink raw reply [flat|nested] 13+ messages in thread
* [Qemu-devel] [PATCH 1/4] Remove unused code 2010-01-12 8:27 [Qemu-devel] [PATCH 0/4] Reduce down time during migration without shared storage Liran Schour @ 2010-01-12 8:27 ` Liran Schour 2010-01-12 8:27 ` [Qemu-devel] [PATCH 2/4] Tranfer dirty blocks during iterative phase Liran Schour 2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 1/4] Remove unused code Jan Kiszka 0 siblings, 2 replies; 13+ messages in thread From: Liran Schour @ 2010-01-12 8:27 UTC (permalink / raw) To: qemu-devel; +Cc: Liran Schour blk_mig_save_bulked_block is never called with sync flag. Remove the sync flag. Calculate bulk completion during blk_mig_save_bulked_block. Signed-off-by: Liran Schour <lirans@il.ibm.com> --- block-migration.c | 63 ++++++++++++++++++++-------------------------------- 1 files changed, 24 insertions(+), 39 deletions(-) diff --git a/block-migration.c b/block-migration.c index 258a88a..6957909 100644 --- a/block-migration.c +++ b/block-migration.c @@ -71,7 +71,8 @@ typedef struct BlkMigState { int read_done; int transferred; int64_t total_sector_sum; - int prev_progress; + int prev_progress; + int bulk_completed; } BlkMigState; static BlkMigState block_mig_state; @@ -138,7 +139,7 @@ static void blk_mig_read_cb(void *opaque, int ret) } static int mig_save_device_bulk(Monitor *mon, QEMUFile *f, - BlkMigDevState *bmds, int is_async) + BlkMigDevState *bmds) { int64_t total_sectors = bmds->total_sectors; int64_t cur_sector = bmds->cur_sector; @@ -175,27 +176,17 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f, blk->bmds = bmds; blk->sector = cur_sector; - if (is_async) { - blk->iov.iov_base = blk->buf; - blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; - qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); - - blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov, - nr_sectors, blk_mig_read_cb, blk); - if (!blk->aiocb) { - goto error; - } - block_mig_state.submitted++; - } else { - if (bdrv_read(bs, cur_sector, blk->buf, nr_sectors) < 0) { - goto error; - } - 
blk_send(f, blk); + blk->iov.iov_base = blk->buf; + blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; + qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); - qemu_free(blk->buf); - qemu_free(blk); + blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov, + nr_sectors, blk_mig_read_cb, blk); + if (!blk->aiocb) { + goto error; } - + block_mig_state.submitted++; + bdrv_reset_dirty(bs, cur_sector, nr_sectors); bmds->cur_sector = cur_sector + nr_sectors; @@ -229,6 +220,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f) block_mig_state.transferred = 0; block_mig_state.total_sector_sum = 0; block_mig_state.prev_progress = -1; + block_mig_state.bulk_completed = 0; for (bs = bdrv_first; bs != NULL; bs = bs->next) { if (bs->type == BDRV_TYPE_HD) { @@ -260,7 +252,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f) } } -static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async) +static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f) { int64_t completed_sector_sum = 0; BlkMigDevState *bmds; @@ -269,7 +261,7 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async) QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { if (bmds->bulk_completed == 0) { - if (mig_save_device_bulk(mon, f, bmds, is_async) == 1) { + if (mig_save_device_bulk(mon, f, bmds) == 1) { /* completed bulk section for this device */ bmds->bulk_completed = 1; } @@ -289,7 +281,7 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async) monitor_printf(mon, "Completed %d %%\r", progress); monitor_flush(mon); } - + return ret; } @@ -362,19 +354,13 @@ static void flush_blks(QEMUFile* f) static int is_stage2_completed(void) { - BlkMigDevState *bmds; - if (block_mig_state.submitted > 0) { + if (block_mig_state.submitted == 0 && + block_mig_state.bulk_completed == 1) { + return 1; + } else { return 0; } - - QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - if (bmds->bulk_completed == 0) { - return 0; - } - 
} - - return 1; } static void blk_mig_cleanup(Monitor *mon) @@ -432,8 +418,9 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) while ((block_mig_state.submitted + block_mig_state.read_done) * BLOCK_SIZE < qemu_file_get_rate_limit(f)) { - if (blk_mig_save_bulked_block(mon, f, 1) == 0) { - /* no more bulk blocks for now */ + if (blk_mig_save_bulked_block(mon, f) == 0) { + /* finish saving bulk on all devices */ + block_mig_state.bulk_completed = 1; break; } } @@ -446,9 +433,7 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) } if (stage == 3) { - while (blk_mig_save_bulked_block(mon, f, 0) != 0) { - /* empty */ - } + /* we now for sure that save bulk is completed */ blk_mig_save_dirty_blocks(mon, f); blk_mig_cleanup(mon); -- 1.5.2.4 ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Qemu-devel] [PATCH 2/4] Tranfer dirty blocks during iterative phase 2010-01-12 8:27 ` [Qemu-devel] [PATCH 1/4] Remove unused code Liran Schour @ 2010-01-12 8:27 ` Liran Schour 2010-01-12 8:27 ` [Qemu-devel] [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Liran Schour 2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 2/4] Tranfer dirty blocks during iterative phase Jan Kiszka 2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 1/4] Remove unused code Jan Kiszka 1 sibling, 2 replies; 13+ messages in thread From: Liran Schour @ 2010-01-12 8:27 UTC (permalink / raw) To: qemu-devel; +Cc: Liran Schour Start transfer dirty blocks during the iterative stage. That will reduce the time that the guest will be suspended Signed-off-by: Liran Schour <lirans@il.ibm.com> --- block-migration.c | 158 +++++++++++++++++++++++++++++++++++++++-------------- 1 files changed, 116 insertions(+), 42 deletions(-) diff --git a/block-migration.c b/block-migration.c index 6957909..90c84b1 100644 --- a/block-migration.c +++ b/block-migration.c @@ -29,6 +29,7 @@ #define MAX_BLOCKS_READ 10000 #define BLOCKS_READ_CHANGE 100 #define INITIAL_BLOCKS_READ 100 +#define MAX_DIRTY_ITERATIONS 100 //#define DEBUG_BLK_MIGRATION @@ -45,6 +46,7 @@ typedef struct BlkMigDevState { int bulk_completed; int shared_base; int64_t cur_sector; + int64_t cur_dirty; int64_t completed_sectors; int64_t total_sectors; int64_t dirty; @@ -73,6 +75,7 @@ typedef struct BlkMigState { int64_t total_sector_sum; int prev_progress; int bulk_completed; + int dirty_iterations; } BlkMigState; static BlkMigState block_mig_state; @@ -221,6 +224,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f) block_mig_state.total_sector_sum = 0; block_mig_state.prev_progress = -1; block_mig_state.bulk_completed = 0; + block_mig_state.dirty_iterations = 0; for (bs = bdrv_first; bs != NULL; bs = bs->next) { if (bs->type == BDRV_TYPE_HD) { @@ -285,39 +289,88 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f) return ret; 
} -#define MAX_NUM_BLOCKS 4 - -static void blk_mig_save_dirty_blocks(Monitor *mon, QEMUFile *f) +static void blk_mig_reset_dirty_curser(void) { BlkMigDevState *bmds; - BlkMigBlock blk; - int64_t sector; + + QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { + bmds->cur_dirty = 0; + } +} - blk.buf = qemu_malloc(BLOCK_SIZE); +static int mig_save_device_dirty(Monitor *mon, QEMUFile *f, + BlkMigDevState *bmds, int is_async) +{ + BlkMigBlock *blk; + int64_t total_sectors = bmds->total_sectors; + int64_t sector; + int nr_sectors; - QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - for (sector = 0; sector < bmds->cur_sector;) { - if (bdrv_get_dirty(bmds->bs, sector)) { - if (bdrv_read(bmds->bs, sector, blk.buf, - BDRV_SECTORS_PER_DIRTY_CHUNK) < 0) { - monitor_printf(mon, "Error reading sector %" PRId64 "\n", - sector); - qemu_file_set_error(f); - qemu_free(blk.buf); - return; + for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) { + if (bdrv_get_dirty(bmds->bs, sector)) { + + if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { + nr_sectors = total_sectors - sector; + } else { + nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; + } + blk = qemu_malloc(sizeof(BlkMigBlock)); + blk->buf = qemu_malloc(BLOCK_SIZE); + blk->bmds = bmds; + blk->sector = sector; + + if(is_async) { + blk->iov.iov_base = blk->buf; + blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; + qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); + + blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov, + nr_sectors, blk_mig_read_cb, blk); + if (!blk->aiocb) { + goto error; } - blk.bmds = bmds; - blk.sector = sector; - blk_send(f, &blk); - - bdrv_reset_dirty(bmds->bs, sector, - BDRV_SECTORS_PER_DIRTY_CHUNK); + block_mig_state.submitted++; + } else { + if (bdrv_read(bmds->bs, sector, blk->buf, + nr_sectors) < 0) { + goto error; + } + blk_send(f, blk); + + qemu_free(blk->buf); + qemu_free(blk); } - sector += BDRV_SECTORS_PER_DIRTY_CHUNK; + + bdrv_reset_dirty(bmds->bs, sector, 
nr_sectors); + break; } + sector += BDRV_SECTORS_PER_DIRTY_CHUNK; + bmds->cur_dirty = sector; } + + return (bmds->cur_dirty >= bmds->total_sectors); + + error: + monitor_printf(mon, "Error reading sector %" PRId64 "\n", sector); + qemu_file_set_error(f); + qemu_free(blk->buf); + qemu_free(blk); + return 0; +} + +static int blk_mig_save_dirty_block(Monitor *mon, QEMUFile *f, int is_async) +{ + BlkMigDevState *bmds; + int ret = 0; - qemu_free(blk.buf); + QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { + if(mig_save_device_dirty(mon, f, bmds, is_async) == 0) { + ret = 1; + break; + } + } + + return ret; } static void flush_blks(QEMUFile* f) @@ -386,6 +439,8 @@ static void blk_mig_cleanup(Monitor *mon) static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) { + int dirty_iteration = 0; + dprintf("Enter save live stage %d submitted %d transferred %d\n", stage, block_mig_state.submitted, block_mig_state.transferred); @@ -413,29 +468,48 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) blk_mig_cleanup(mon); return 0; } - - /* control the rate of transfer */ - while ((block_mig_state.submitted + - block_mig_state.read_done) * BLOCK_SIZE < - qemu_file_get_rate_limit(f)) { - if (blk_mig_save_bulked_block(mon, f) == 0) { - /* finish saving bulk on all devices */ - block_mig_state.bulk_completed = 1; - break; + + blk_mig_reset_dirty_curser(); + + if(stage == 2) { + /* control the rate of transfer */ + while ((block_mig_state.submitted + + block_mig_state.read_done) * BLOCK_SIZE < + qemu_file_get_rate_limit(f)) { + if (block_mig_state.bulk_completed == 0) { + /* first finish the bulk phase */ + if (blk_mig_save_bulked_block(mon, f) == 0) { + /* finish saving bulk on all devices */ + block_mig_state.bulk_completed = 1; + } + } else if (block_mig_state.dirty_iterations < MAX_DIRTY_ITERATIONS) { + if (dirty_iteration == 0) { + /* increment dirty iteration only once per round */ + dirty_iteration = 1; + 
block_mig_state.dirty_iterations++; + } + if (blk_mig_save_dirty_block(mon, f, 1) == 0) { + /* no more dirty blocks */ + break; + } + } else { + /* if we got here stop the loop */ + break; + } + } + + flush_blks(f); + + if (qemu_file_has_error(f)) { + blk_mig_cleanup(mon); + return 0; } } - - flush_blks(f); - - if (qemu_file_has_error(f)) { - blk_mig_cleanup(mon); - return 0; - } - + if (stage == 3) { /* we now for sure that save bulk is completed */ - blk_mig_save_dirty_blocks(mon, f); + while(blk_mig_save_dirty_block(mon, f, 0) != 0); blk_mig_cleanup(mon); /* report completion */ -- 1.5.2.4 ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Qemu-devel] [PATCH 3/4] Count dirty blocks and expose an API to get dirty count 2010-01-12 8:27 ` [Qemu-devel] [PATCH 2/4] Tranfer dirty blocks during iterative phase Liran Schour @ 2010-01-12 8:27 ` Liran Schour 2010-01-12 8:27 ` [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3 Liran Schour 2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Jan Kiszka 2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 2/4] Tranfer dirty blocks during iterative phase Jan Kiszka 1 sibling, 2 replies; 13+ messages in thread From: Liran Schour @ 2010-01-12 8:27 UTC (permalink / raw) To: qemu-devel; +Cc: Liran Schour This will manage dirty counter for each device and will allow to get the dirty counter from above. Signed-off-by: Liran Schour <lirans@il.ibm.com> --- block.c | 20 ++++++++++++++++---- block.h | 1 + block_int.h | 1 + 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/block.c b/block.c index 853f025..ecfcba4 100644 --- a/block.c +++ b/block.c @@ -652,9 +652,15 @@ static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num, bit = start % (sizeof(unsigned long) * 8); val = bs->dirty_bitmap[idx]; if (dirty) { - val |= 1 << bit; + if (!(val & (1 << bit))) { + bs->dirty_count++; + val |= 1 << bit; + } } else { - val &= ~(1 << bit); + if (val & (1 << bit)) { + bs->dirty_count--; + val &= ~(1 << bit); + } } bs->dirty_bitmap[idx] = val; } @@ -1972,14 +1978,15 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size) void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable) { int64_t bitmap_size; - + + bs->dirty_count = 0; if (enable) { if (!bs->dirty_bitmap) { bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8; - bs->dirty_bitmap = qemu_mallocz(bitmap_size); + bs->dirty_bitmap = qemu_mallocz(bitmap_size); } } else { if (bs->dirty_bitmap) { @@ -2007,3 +2014,8 @@ void bdrv_reset_dirty(BlockDriverState 
*bs, int64_t cur_sector, { set_dirty_bitmap(bs, cur_sector, nr_sectors, 0); } + +int64_t bdrv_get_dirty_count(BlockDriverState *bs) +{ + return bs->dirty_count; +} diff --git a/block.h b/block.h index 4a8b628..bf489d0 100644 --- a/block.h +++ b/block.h @@ -198,4 +198,5 @@ void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable); int bdrv_get_dirty(BlockDriverState *bs, int64_t sector); void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors); +int64_t bdrv_get_dirty_count(BlockDriverState *bs); #endif diff --git a/block_int.h b/block_int.h index 9a3b2e0..8d5d9bc 100644 --- a/block_int.h +++ b/block_int.h @@ -172,6 +172,7 @@ struct BlockDriverState { int type; char device_name[32]; unsigned long *dirty_bitmap; + int64_t dirty_count; BlockDriverState *next; void *private; }; -- 1.5.2.4 ^ permalink raw reply related [flat|nested] 13+ messages in thread
* [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3 2010-01-12 8:27 ` [Qemu-devel] [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Liran Schour @ 2010-01-12 8:27 ` Liran Schour 2010-01-12 9:52 ` Pierre Riteau 2010-01-12 11:51 ` [Qemu-devel] " Jan Kiszka 2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Jan Kiszka 1 sibling, 2 replies; 13+ messages in thread From: Liran Schour @ 2010-01-12 8:27 UTC (permalink / raw) To: qemu-devel; +Cc: Liran Schour Move to stage3 only when remaining work can be done below max downtime. To make sure the process will converge we will try only MAX_DIRTY_ITERATIONS. Signed-off-by: Liran Schour <lirans@il.ibm.com> --- block-migration.c | 67 +++++++++++++++++++++++++++++++++++----------------- 1 files changed, 45 insertions(+), 22 deletions(-) diff --git a/block-migration.c b/block-migration.c index 90c84b1..9ae04c4 100644 --- a/block-migration.c +++ b/block-migration.c @@ -17,6 +17,7 @@ #include "qemu-queue.h" #include "monitor.h" #include "block-migration.h" +#include "migration.h" #include <assert.h> #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS) @@ -30,6 +31,7 @@ #define BLOCKS_READ_CHANGE 100 #define INITIAL_BLOCKS_READ 100 #define MAX_DIRTY_ITERATIONS 100 +#define DISK_RATE (30 << 20) //30 MB/sec //#define DEBUG_BLK_MIGRATION @@ -135,10 +137,11 @@ static void blk_mig_read_cb(void *opaque, int ret) blk->ret = ret; QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry); - + block_mig_state.submitted--; block_mig_state.read_done++; assert(block_mig_state.submitted >= 0); + } static int mig_save_device_bulk(Monitor *mon, QEMUFile *f, @@ -225,7 +228,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f) block_mig_state.prev_progress = -1; block_mig_state.bulk_completed = 0; block_mig_state.dirty_iterations = 0; - + for (bs = bdrv_first; bs != NULL; bs = bs->next) { if (bs->type == BDRV_TYPE_HD) { sectors = 
bdrv_getlength(bs) >> BDRV_SECTOR_BITS; @@ -405,15 +408,41 @@ static void flush_blks(QEMUFile* f) block_mig_state.transferred); } -static int is_stage2_completed(void) +static int64_t get_remaining_dirty(void) { + BlkMigDevState *bmds; + int64_t dirty = 0; + + QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { + dirty += bdrv_get_dirty_count(bmds->bs); + } + + return dirty; +} - if (block_mig_state.submitted == 0 && - block_mig_state.bulk_completed == 1) { - return 1; - } else { - return 0; +static int is_stage2_completed(void) +{ + int64_t remaining_dirty; + + if (block_mig_state.bulk_completed == 1) { + if (block_mig_state.dirty_iterations++ > MAX_DIRTY_ITERATIONS) { + /* finish stage2 because we have too much dirty iterations */ + + return 1; + } + + remaining_dirty = get_remaining_dirty(); + + if ((remaining_dirty * BLOCK_SIZE) * 1000000000 / DISK_RATE <= + migrate_max_downtime()) { + /* finish stage2 because we think that we can finish remaing work + below max_downtime */ + + return 1; + } } + + return 0; } static void blk_mig_cleanup(Monitor *mon) @@ -438,9 +467,7 @@ static void blk_mig_cleanup(Monitor *mon) } static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) -{ - int dirty_iteration = 0; - +{ dprintf("Enter save live stage %d submitted %d transferred %d\n", stage, block_mig_state.submitted, block_mig_state.transferred); @@ -482,19 +509,12 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) /* finish saving bulk on all devices */ block_mig_state.bulk_completed = 1; } - } else if (block_mig_state.dirty_iterations < MAX_DIRTY_ITERATIONS) { - if (dirty_iteration == 0) { - /* increment dirty iteration only once per round */ - dirty_iteration = 1; - block_mig_state.dirty_iterations++; - } + } else { + if (blk_mig_save_dirty_block(mon, f, 1) == 0) { /* no more dirty blocks */ break; } - } else { - /* if we got here stop the loop */ - break; } } @@ -507,9 +527,12 @@ static int 
block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) } if (stage == 3) { - /* we now for sure that save bulk is completed */ - + /* we know for sure that save bulk is completed and + all async read completed */ + assert(block_mig_state.submitted == 0); + while(blk_mig_save_dirty_block(mon, f, 0) != 0); + blk_mig_cleanup(mon); /* report completion */ -- 1.5.2.4 ^ permalink raw reply related [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3 2010-01-12 8:27 ` [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3 Liran Schour @ 2010-01-12 9:52 ` Pierre Riteau 2010-01-12 11:56 ` Liran Schour 2010-01-12 11:51 ` [Qemu-devel] " Jan Kiszka 1 sibling, 1 reply; 13+ messages in thread From: Pierre Riteau @ 2010-01-12 9:52 UTC (permalink / raw) To: Liran Schour; +Cc: qemu-devel On 12 janv. 2010, at 09:27, Liran Schour wrote: > Move to stage3 only when remaining work can be done below max downtime. > To make sure the process will converge we will try only MAX_DIRTY_ITERATIONS. > > Signed-off-by: Liran Schour <lirans@il.ibm.com> > --- > block-migration.c | 67 +++++++++++++++++++++++++++++++++++----------------- > 1 files changed, 45 insertions(+), 22 deletions(-) > > diff --git a/block-migration.c b/block-migration.c > index 90c84b1..9ae04c4 100644 > --- a/block-migration.c > +++ b/block-migration.c > @@ -17,6 +17,7 @@ > #include "qemu-queue.h" > #include "monitor.h" > #include "block-migration.h" > +#include "migration.h" > #include <assert.h> > > #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS) > @@ -30,6 +31,7 @@ > #define BLOCKS_READ_CHANGE 100 > #define INITIAL_BLOCKS_READ 100 > #define MAX_DIRTY_ITERATIONS 100 > +#define DISK_RATE (30 << 20) //30 MB/sec This number seems\x0e rather arbitrary. We should try to infer the storage performance from previous reads instead (but it could be difficult, for example when we switch from bulk copy to dirty blocks only, we may switch from sequential reads to random reads). Also, shouldn't the migration speed limit (migrate_set_speed) be taken into account? > //#define DEBUG_BLK_MIGRATION > > @@ -135,10 +137,11 @@ static void blk_mig_read_cb(void *opaque, int ret) > blk->ret = ret; > > QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry); > - > + Please don't add trailing white space... 
> block_mig_state.submitted--; > block_mig_state.read_done++; > assert(block_mig_state.submitted >= 0); > + > } ... and unnecessary new lines. Comments valid for the rest of this patch and the other ones as well. > > static int mig_save_device_bulk(Monitor *mon, QEMUFile *f, > @@ -225,7 +228,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f) > block_mig_state.prev_progress = -1; > block_mig_state.bulk_completed = 0; > block_mig_state.dirty_iterations = 0; > - > + > for (bs = bdrv_first; bs != NULL; bs = bs->next) { > if (bs->type == BDRV_TYPE_HD) { > sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS; > @@ -405,15 +408,41 @@ static void flush_blks(QEMUFile* f) > block_mig_state.transferred); > } > > -static int is_stage2_completed(void) > +static int64_t get_remaining_dirty(void) > { > + BlkMigDevState *bmds; > + int64_t dirty = 0; > + > + QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { > + dirty += bdrv_get_dirty_count(bmds->bs); > + } > + > + return dirty; > +} > > - if (block_mig_state.submitted == 0 && > - block_mig_state.bulk_completed == 1) { > - return 1; > - } else { > - return 0; > +static int is_stage2_completed(void) > +{ > + int64_t remaining_dirty; > + > + if (block_mig_state.bulk_completed == 1) { > + if (block_mig_state.dirty_iterations++ > MAX_DIRTY_ITERATIONS) { > + /* finish stage2 because we have too much dirty iterations */ > + > + return 1; > + } > + > + remaining_dirty = get_remaining_dirty(); > + > + if ((remaining_dirty * BLOCK_SIZE) * 1000000000 / DISK_RATE <= > + migrate_max_downtime()) { > + /* finish stage2 because we think that we can finish remaing work > + below max_downtime */ > + > + return 1; > + } > } > + > + return 0; > } > > static void blk_mig_cleanup(Monitor *mon) > @@ -438,9 +467,7 @@ static void blk_mig_cleanup(Monitor *mon) > } > > static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) > -{ > - int dirty_iteration = 0; > - > +{ > dprintf("Enter save live stage %d submitted %d 
transferred %d\n", > stage, block_mig_state.submitted, block_mig_state.transferred); > > @@ -482,19 +509,12 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) > /* finish saving bulk on all devices */ > block_mig_state.bulk_completed = 1; > } > - } else if (block_mig_state.dirty_iterations < MAX_DIRTY_ITERATIONS) { > - if (dirty_iteration == 0) { > - /* increment dirty iteration only once per round */ > - dirty_iteration = 1; > - block_mig_state.dirty_iterations++; > - } > + } else { > + > if (blk_mig_save_dirty_block(mon, f, 1) == 0) { > /* no more dirty blocks */ > break; > } > - } else { > - /* if we got here stop the loop */ > - break; > } > } > > @@ -507,9 +527,12 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) > } > > if (stage == 3) { > - /* we now for sure that save bulk is completed */ > - > + /* we know for sure that save bulk is completed and > + all async read completed */ > + assert(block_mig_state.submitted == 0); > + > while(blk_mig_save_dirty_block(mon, f, 0) != 0); > + > blk_mig_cleanup(mon); > > /* report completion */ > -- > 1.5.2.4 > > > -- Pierre Riteau -- http://perso.univ-rennes1.fr/pierre.riteau/ ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3 2010-01-12 9:52 ` Pierre Riteau @ 2010-01-12 11:56 ` Liran Schour 0 siblings, 0 replies; 13+ messages in thread From: Liran Schour @ 2010-01-12 11:56 UTC (permalink / raw) To: Pierre Riteau; +Cc: qemu-devel Pierre Riteau <Pierre.Riteau@irisa.fr> wrote on 12/01/2010 11:52:18: > On 12 janv. 2010, at 09:27, Liran Schour wrote: > > > Move to stage3 only when remaining work can be done below max downtime. > > To make sure the process will converge we will try only > MAX_DIRTY_ITERATIONS. > > > > Signed-off-by: Liran Schour <lirans@il.ibm.com> > > --- > > block-migration.c | 67 ++++++++++++++++++++++++++++++++++ > +----------------- > > 1 files changed, 45 insertions(+), 22 deletions(-) > > > > diff --git a/block-migration.c b/block-migration.c > > index 90c84b1..9ae04c4 100644 > > --- a/block-migration.c > > +++ b/block-migration.c > > @@ -17,6 +17,7 @@ > > #include "qemu-queue.h" > > #include "monitor.h" > > #include "block-migration.h" > > +#include "migration.h" > > #include <assert.h> > > > > #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS) > > @@ -30,6 +31,7 @@ > > #define BLOCKS_READ_CHANGE 100 > > #define INITIAL_BLOCKS_READ 100 > > #define MAX_DIRTY_ITERATIONS 100 > > +#define DISK_RATE (30 << 20) //30 MB/sec > > This number seems\x0e rather arbitrary. We should try to infer the > storage performance from previous reads instead (but it could be > difficult, for example when we switch from bulk copy to dirty blocks > only, we may switch from sequential reads to random reads). > Also, shouldn't the migration speed limit (migrate_set_speed) be > taken into account? > It will be hard to estimate the disk rate in advance. We need to estimate the rate of sync writes (during the final stage) while we are writing async to disk iteratively. Anyone has a simple solution for this? 
About the migration speed limit, we do not need to take this into account because on stage 3 we will transfer the data without referring to the migration speed limit. Thanks, - Liran ^ permalink raw reply [flat|nested] 13+ messages in thread
* [Qemu-devel] Re: [PATCH 4/4] Try not to exceed max downtime on stage3 2010-01-12 8:27 ` [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3 Liran Schour 2010-01-12 9:52 ` Pierre Riteau @ 2010-01-12 11:51 ` Jan Kiszka 2010-01-12 15:07 ` Anthony Liguori 2010-01-12 15:07 ` Liran Schour 1 sibling, 2 replies; 13+ messages in thread From: Jan Kiszka @ 2010-01-12 11:51 UTC (permalink / raw) To: Liran Schour; +Cc: qemu-devel Liran Schour wrote: > Move to stage3 only when remaining work can be done below max downtime. > To make sure the process will converge we will try only MAX_DIRTY_ITERATIONS. OK, that explains now patch 2. But do we have such barrier for memory migration as well? I don't thinks so, and I don't think this hard-coded limit is the right approach. Such thing should be derived from the bandwidth the user can control during runtime. > > Signed-off-by: Liran Schour <lirans@il.ibm.com> > --- > block-migration.c | 67 +++++++++++++++++++++++++++++++++++----------------- > 1 files changed, 45 insertions(+), 22 deletions(-) > > diff --git a/block-migration.c b/block-migration.c > index 90c84b1..9ae04c4 100644 > --- a/block-migration.c > +++ b/block-migration.c > @@ -17,6 +17,7 @@ > #include "qemu-queue.h" > #include "monitor.h" > #include "block-migration.h" > +#include "migration.h" > #include <assert.h> > > #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS) > @@ -30,6 +31,7 @@ > #define BLOCKS_READ_CHANGE 100 > #define INITIAL_BLOCKS_READ 100 > #define MAX_DIRTY_ITERATIONS 100 > +#define DISK_RATE (30 << 20) //30 MB/sec IMHO a bad idea (e.g. mine was 6 MB/s last time I tried). Measure it during runtime just like the mem-migration does. <skipping the rest of the patch> Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux ^ permalink raw reply [flat|nested] 13+ messages in thread
* Re: [Qemu-devel] Re: [PATCH 4/4] Try not to exceed max downtime on stage3 2010-01-12 11:51 ` [Qemu-devel] " Jan Kiszka @ 2010-01-12 15:07 ` Anthony Liguori 2010-01-12 15:07 ` Liran Schour 1 sibling, 0 replies; 13+ messages in thread From: Anthony Liguori @ 2010-01-12 15:07 UTC (permalink / raw) To: Jan Kiszka; +Cc: qemu-devel, Liran Schour On 01/12/2010 05:51 AM, Jan Kiszka wrote: > Liran Schour wrote: > >> Move to stage3 only when remaining work can be done below max downtime. >> To make sure the process will converge we will try only MAX_DIRTY_ITERATIONS. >> > OK, that explains now patch 2. But do we have such barrier for memory > migration as well? No, we explicitly don't because making that decision is a management tool job. A management tool can force convergence by explicitly stopping the guest. Iterations is a bad metric because there's no real useful meaning to a user. Time is probably the best metric and it's easy for a management tool to set a timer to stop the guest if it hasn't completed by a certain time period. Regards, Anthony Liguori ^ permalink raw reply [flat|nested] 13+ messages in thread
* [Qemu-devel] Re: [PATCH 4/4] Try not to exceed max downtime on stage3 2010-01-12 11:51 ` [Qemu-devel] " Jan Kiszka 2010-01-12 15:07 ` Anthony Liguori @ 2010-01-12 15:07 ` Liran Schour 1 sibling, 0 replies; 13+ messages in thread From: Liran Schour @ 2010-01-12 15:07 UTC (permalink / raw) To: Jan Kiszka; +Cc: qemu-devel Jan Kiszka <jan.kiszka@siemens.com> wrote on 12/01/2010 13:51:09: > Liran Schour wrote: > > Move to stage3 only when remaining work can be done below max downtime. > > To make sure the process will converge we will try only > MAX_DIRTY_ITERATIONS. > > OK, that explains now patch 2. But do we have such barrier for memory > migration as well? I don't thinks so, and I don't think this hard-coded > limit is the right approach. Such thing should be derived from the > bandwidth the user can control during runtime. So if I understand you correctly you say that the way to assure convergence is by the user controlling the bandwidth during runtime. So we can remove this MAX_DIRTY_ITERATIONS. > > > > Signed-off-by: Liran Schour <lirans@il.ibm.com> > > --- > > block-migration.c | 67 ++++++++++++++++++++++++++++++++++ > +----------------- > > 1 files changed, 45 insertions(+), 22 deletions(-) > > > > diff --git a/block-migration.c b/block-migration.c > > index 90c84b1..9ae04c4 100644 > > --- a/block-migration.c > > +++ b/block-migration.c > > @@ -17,6 +17,7 @@ > > #include "qemu-queue.h" > > #include "monitor.h" > > #include "block-migration.h" > > +#include "migration.h" > > #include <assert.h> > > > > #define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS) > > @@ -30,6 +31,7 @@ > > #define BLOCKS_READ_CHANGE 100 > > #define INITIAL_BLOCKS_READ 100 > > #define MAX_DIRTY_ITERATIONS 100 > > +#define DISK_RATE (30 << 20) //30 MB/sec > > IMHO a bad idea (e.g. mine was 6 MB/s last time I tried). Measure it > during runtime just like the mem-migration does. How about measuring the performance of reading BLOCK_SIZE and then compute the disk rate? 
I can compute the average time to read BLOCK_SIZE and then estimate the remaining time to complete the work. Thanks, - Liran ^ permalink raw reply [flat|nested] 13+ messages in thread
* [Qemu-devel] Re: [PATCH 3/4] Count dirty blocks and expose an API to get dirty count 2010-01-12 8:27 ` [Qemu-devel] [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Liran Schour 2010-01-12 8:27 ` [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3 Liran Schour @ 2010-01-12 11:50 ` Jan Kiszka 1 sibling, 0 replies; 13+ messages in thread From: Jan Kiszka @ 2010-01-12 11:50 UTC (permalink / raw) To: Liran Schour; +Cc: qemu-devel Liran Schour wrote: > This will manage dirty counter for each device and will allow to get the > dirty counter from above. > > Signed-off-by: Liran Schour <lirans@il.ibm.com> > --- > block.c | 20 ++++++++++++++++---- > block.h | 1 + > block_int.h | 1 + > 3 files changed, 18 insertions(+), 4 deletions(-) > > diff --git a/block.c b/block.c > index 853f025..ecfcba4 100644 > --- a/block.c > +++ b/block.c > @@ -652,9 +652,15 @@ static void set_dirty_bitmap(BlockDriverState *bs, int64_t sector_num, > bit = start % (sizeof(unsigned long) * 8); > val = bs->dirty_bitmap[idx]; > if (dirty) { > - val |= 1 << bit; > + if (!(val & (1 << bit))) { > + bs->dirty_count++; > + val |= 1 << bit; > + } > } else { > - val &= ~(1 << bit); > + if (val & (1 << bit)) { > + bs->dirty_count--; > + val &= ~(1 << bit); > + } > } > bs->dirty_bitmap[idx] = val; > } > @@ -1972,14 +1978,15 @@ void *qemu_blockalign(BlockDriverState *bs, size_t size) > void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable) > { > int64_t bitmap_size; > - > + > + bs->dirty_count = 0; > if (enable) { > if (!bs->dirty_bitmap) { > bitmap_size = (bdrv_getlength(bs) >> BDRV_SECTOR_BITS) + > BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; > bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8; > > - bs->dirty_bitmap = qemu_mallocz(bitmap_size); > + bs->dirty_bitmap = qemu_mallocz(bitmap_size); > } > } else { > if (bs->dirty_bitmap) { > @@ -2007,3 +2014,8 @@ void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, > { > set_dirty_bitmap(bs, cur_sector, nr_sectors, 0); > 
} > + > +int64_t bdrv_get_dirty_count(BlockDriverState *bs) > +{ > + return bs->dirty_count; > +} > diff --git a/block.h b/block.h > index 4a8b628..bf489d0 100644 > --- a/block.h > +++ b/block.h > @@ -198,4 +198,5 @@ void bdrv_set_dirty_tracking(BlockDriverState *bs, int enable); > int bdrv_get_dirty(BlockDriverState *bs, int64_t sector); > void bdrv_reset_dirty(BlockDriverState *bs, int64_t cur_sector, > int nr_sectors); > +int64_t bdrv_get_dirty_count(BlockDriverState *bs); > #endif > diff --git a/block_int.h b/block_int.h > index 9a3b2e0..8d5d9bc 100644 > --- a/block_int.h > +++ b/block_int.h > @@ -172,6 +172,7 @@ struct BlockDriverState { > int type; > char device_name[32]; > unsigned long *dirty_bitmap; > + int64_t dirty_count; > BlockDriverState *next; > void *private; > }; Looks good (except for the whitespace issues). Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux ^ permalink raw reply [flat|nested] 13+ messages in thread
* [Qemu-devel] Re: [PATCH 2/4] Tranfer dirty blocks during iterative phase 2010-01-12 8:27 ` [Qemu-devel] [PATCH 2/4] Tranfer dirty blocks during iterative phase Liran Schour 2010-01-12 8:27 ` [Qemu-devel] [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Liran Schour @ 2010-01-12 11:50 ` Jan Kiszka 1 sibling, 0 replies; 13+ messages in thread From: Jan Kiszka @ 2010-01-12 11:50 UTC (permalink / raw) To: Liran Schour; +Cc: qemu-devel Liran Schour wrote: > Start transfer dirty blocks during the iterative stage. That will > reduce the time that the guest will be suspended > > Signed-off-by: Liran Schour <lirans@il.ibm.com> > --- > block-migration.c | 158 +++++++++++++++++++++++++++++++++++++++-------------- > 1 files changed, 116 insertions(+), 42 deletions(-) > > diff --git a/block-migration.c b/block-migration.c > index 6957909..90c84b1 100644 > --- a/block-migration.c > +++ b/block-migration.c > @@ -29,6 +29,7 @@ > #define MAX_BLOCKS_READ 10000 > #define BLOCKS_READ_CHANGE 100 > #define INITIAL_BLOCKS_READ 100 > +#define MAX_DIRTY_ITERATIONS 100 > > //#define DEBUG_BLK_MIGRATION > > @@ -45,6 +46,7 @@ typedef struct BlkMigDevState { > int bulk_completed; > int shared_base; > int64_t cur_sector; > + int64_t cur_dirty; > int64_t completed_sectors; > int64_t total_sectors; > int64_t dirty; > @@ -73,6 +75,7 @@ typedef struct BlkMigState { > int64_t total_sector_sum; > int prev_progress; > int bulk_completed; > + int dirty_iterations; > } BlkMigState; > > static BlkMigState block_mig_state; > @@ -221,6 +224,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f) > block_mig_state.total_sector_sum = 0; > block_mig_state.prev_progress = -1; > block_mig_state.bulk_completed = 0; > + block_mig_state.dirty_iterations = 0; > > for (bs = bdrv_first; bs != NULL; bs = bs->next) { > if (bs->type == BDRV_TYPE_HD) { > @@ -285,39 +289,88 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f) > return ret; > } > > -#define MAX_NUM_BLOCKS 4 > - > 
-static void blk_mig_save_dirty_blocks(Monitor *mon, QEMUFile *f) > +static void blk_mig_reset_dirty_curser(void) My translation help makes me believe that 'curser' is not what you intended call this thing. :) > { > BlkMigDevState *bmds; > - BlkMigBlock blk; > - int64_t sector; > + > + QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { > + bmds->cur_dirty = 0; > + } > +} > > - blk.buf = qemu_malloc(BLOCK_SIZE); > +static int mig_save_device_dirty(Monitor *mon, QEMUFile *f, > + BlkMigDevState *bmds, int is_async) > +{ > + BlkMigBlock *blk; > + int64_t total_sectors = bmds->total_sectors; > + int64_t sector; > + int nr_sectors; > > - QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { > - for (sector = 0; sector < bmds->cur_sector;) { > - if (bdrv_get_dirty(bmds->bs, sector)) { > - if (bdrv_read(bmds->bs, sector, blk.buf, > - BDRV_SECTORS_PER_DIRTY_CHUNK) < 0) { > - monitor_printf(mon, "Error reading sector %" PRId64 "\n", > - sector); > - qemu_file_set_error(f); > - qemu_free(blk.buf); > - return; > + for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) { > + if (bdrv_get_dirty(bmds->bs, sector)) { > + > + if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { > + nr_sectors = total_sectors - sector; > + } else { > + nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; > + } > + blk = qemu_malloc(sizeof(BlkMigBlock)); > + blk->buf = qemu_malloc(BLOCK_SIZE); > + blk->bmds = bmds; > + blk->sector = sector; > + > + if(is_async) { > + blk->iov.iov_base = blk->buf; > + blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; > + qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); > + > + blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov, > + nr_sectors, blk_mig_read_cb, blk); > + if (!blk->aiocb) { > + goto error; > } > - blk.bmds = bmds; > - blk.sector = sector; > - blk_send(f, &blk); > - > - bdrv_reset_dirty(bmds->bs, sector, > - BDRV_SECTORS_PER_DIRTY_CHUNK); > + block_mig_state.submitted++; > + } else { > + if (bdrv_read(bmds->bs, sector, 
blk->buf, > + nr_sectors) < 0) { > + goto error; > + } > + blk_send(f, blk); > + > + qemu_free(blk->buf); > + qemu_free(blk); > } > - sector += BDRV_SECTORS_PER_DIRTY_CHUNK; > + > + bdrv_reset_dirty(bmds->bs, sector, nr_sectors); > + break; > } > + sector += BDRV_SECTORS_PER_DIRTY_CHUNK; > + bmds->cur_dirty = sector; > } > + > + return (bmds->cur_dirty >= bmds->total_sectors); > + > + error: Please do not indent the label. > + monitor_printf(mon, "Error reading sector %" PRId64 "\n", sector); > + qemu_file_set_error(f); > + qemu_free(blk->buf); > + qemu_free(blk); > + return 0; > +} > + > +static int blk_mig_save_dirty_block(Monitor *mon, QEMUFile *f, int is_async) > +{ > + BlkMigDevState *bmds; > + int ret = 0; > > - qemu_free(blk.buf); > + QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { > + if(mig_save_device_dirty(mon, f, bmds, is_async) == 0) { > + ret = 1; > + break; > + } > + } > + > + return ret; > } > > static void flush_blks(QEMUFile* f) > @@ -386,6 +439,8 @@ static void blk_mig_cleanup(Monitor *mon) > > static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) > { > + int dirty_iteration = 0; > + > dprintf("Enter save live stage %d submitted %d transferred %d\n", > stage, block_mig_state.submitted, block_mig_state.transferred); > > @@ -413,29 +468,48 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) > blk_mig_cleanup(mon); > return 0; > } > - > - /* control the rate of transfer */ > - while ((block_mig_state.submitted + > - block_mig_state.read_done) * BLOCK_SIZE < > - qemu_file_get_rate_limit(f)) { > - if (blk_mig_save_bulked_block(mon, f) == 0) { > - /* finish saving bulk on all devices */ > - block_mig_state.bulk_completed = 1; > - break; > + > + blk_mig_reset_dirty_curser(); > + > + if(stage == 2) { > + /* control the rate of transfer */ > + while ((block_mig_state.submitted + > + block_mig_state.read_done) * BLOCK_SIZE < > + qemu_file_get_rate_limit(f)) { > + if 
(block_mig_state.bulk_completed == 0) { > + /* first finish the bulk phase */ > + if (blk_mig_save_bulked_block(mon, f) == 0) { > + /* finish saving bulk on all devices */ > + block_mig_state.bulk_completed = 1; > + } > + } else if (block_mig_state.dirty_iterations < MAX_DIRTY_ITERATIONS) { > + if (dirty_iteration == 0) { > + /* increment dirty iteration only once per round */ > + dirty_iteration = 1; > + block_mig_state.dirty_iterations++; > + } > + if (blk_mig_save_dirty_block(mon, f, 1) == 0) { > + /* no more dirty blocks */ > + break; > + } > + } else { > + /* if we got here stop the loop */ > + break; > + } I did not yet get the purpose and effect of the iteration limitation. Can you help me? > + } > + > + flush_blks(f); > + > + if (qemu_file_has_error(f)) { > + blk_mig_cleanup(mon); > + return 0; > } > } > - > - flush_blks(f); > - > - if (qemu_file_has_error(f)) { > - blk_mig_cleanup(mon); > - return 0; > - } > - > + > if (stage == 3) { > /* we now for sure that save bulk is completed */ > > - blk_mig_save_dirty_blocks(mon, f); > + while(blk_mig_save_dirty_block(mon, f, 0) != 0); while (... > blk_mig_cleanup(mon); > > /* report completion */ Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux ^ permalink raw reply [flat|nested] 13+ messages in thread
* [Qemu-devel] Re: [PATCH 1/4] Remove unused code 2010-01-12 8:27 ` [Qemu-devel] [PATCH 1/4] Remove unused code Liran Schour 2010-01-12 8:27 ` [Qemu-devel] [PATCH 2/4] Tranfer dirty blocks during iterative phase Liran Schour @ 2010-01-12 11:50 ` Jan Kiszka 1 sibling, 0 replies; 13+ messages in thread From: Jan Kiszka @ 2010-01-12 11:50 UTC (permalink / raw) To: Liran Schour; +Cc: qemu-devel Liran Schour wrote: > blk_mig_save_bulked_block is never called with sync flag. Remove the sync > flag. Calculate bulk completion during blk_mig_save_bulked_block. > > Signed-off-by: Liran Schour <lirans@il.ibm.com> > --- > block-migration.c | 63 ++++++++++++++++++++-------------------------------- > 1 files changed, 24 insertions(+), 39 deletions(-) > > diff --git a/block-migration.c b/block-migration.c > index 258a88a..6957909 100644 > --- a/block-migration.c > +++ b/block-migration.c > @@ -71,7 +71,8 @@ typedef struct BlkMigState { > int read_done; > int transferred; > int64_t total_sector_sum; > - int prev_progress; > + int prev_progress; > + int bulk_completed; Trailing whitespaces - here and elsewhere in the series. Please make sure to clean such things up. Almost all editors come with visualization for trailing spaces and also tabs. 
> } BlkMigState; > > static BlkMigState block_mig_state; > @@ -138,7 +139,7 @@ static void blk_mig_read_cb(void *opaque, int ret) > } > > static int mig_save_device_bulk(Monitor *mon, QEMUFile *f, > - BlkMigDevState *bmds, int is_async) > + BlkMigDevState *bmds) > { > int64_t total_sectors = bmds->total_sectors; > int64_t cur_sector = bmds->cur_sector; > @@ -175,27 +176,17 @@ static int mig_save_device_bulk(Monitor *mon, QEMUFile *f, > blk->bmds = bmds; > blk->sector = cur_sector; > > - if (is_async) { > - blk->iov.iov_base = blk->buf; > - blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; > - qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); > - > - blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov, > - nr_sectors, blk_mig_read_cb, blk); > - if (!blk->aiocb) { > - goto error; > - } > - block_mig_state.submitted++; > - } else { > - if (bdrv_read(bs, cur_sector, blk->buf, nr_sectors) < 0) { > - goto error; > - } > - blk_send(f, blk); > + blk->iov.iov_base = blk->buf; > + blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; > + qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); > > - qemu_free(blk->buf); > - qemu_free(blk); > + blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov, > + nr_sectors, blk_mig_read_cb, blk); > + if (!blk->aiocb) { > + goto error; > } > - > + block_mig_state.submitted++; > + > bdrv_reset_dirty(bs, cur_sector, nr_sectors); > bmds->cur_sector = cur_sector + nr_sectors; > > @@ -229,6 +220,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f) > block_mig_state.transferred = 0; > block_mig_state.total_sector_sum = 0; > block_mig_state.prev_progress = -1; > + block_mig_state.bulk_completed = 0; > > for (bs = bdrv_first; bs != NULL; bs = bs->next) { > if (bs->type == BDRV_TYPE_HD) { > @@ -260,7 +252,7 @@ static void init_blk_migration(Monitor *mon, QEMUFile *f) > } > } > > -static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async) > +static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f) > { > int64_t 
completed_sector_sum = 0; > BlkMigDevState *bmds; > @@ -269,7 +261,7 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async) > > QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { > if (bmds->bulk_completed == 0) { > - if (mig_save_device_bulk(mon, f, bmds, is_async) == 1) { > + if (mig_save_device_bulk(mon, f, bmds) == 1) { > /* completed bulk section for this device */ > bmds->bulk_completed = 1; > } > @@ -289,7 +281,7 @@ static int blk_mig_save_bulked_block(Monitor *mon, QEMUFile *f, int is_async) > monitor_printf(mon, "Completed %d %%\r", progress); > monitor_flush(mon); > } > - > + Such hunks should always ring a bell... :) > return ret; > } > > @@ -362,19 +354,13 @@ static void flush_blks(QEMUFile* f) > > static int is_stage2_completed(void) > { > - BlkMigDevState *bmds; > > - if (block_mig_state.submitted > 0) { > + if (block_mig_state.submitted == 0 && > + block_mig_state.bulk_completed == 1) { > + return 1; > + } else { return (block_mig_state.submitted == 0 && block_mig_state.bulk_completed); > return 0; > } > - > - QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { > - if (bmds->bulk_completed == 0) { > - return 0; > - } > - } > - > - return 1; > } > > static void blk_mig_cleanup(Monitor *mon) > @@ -432,8 +418,9 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) > while ((block_mig_state.submitted + > block_mig_state.read_done) * BLOCK_SIZE < > qemu_file_get_rate_limit(f)) { > - if (blk_mig_save_bulked_block(mon, f, 1) == 0) { > - /* no more bulk blocks for now */ > + if (blk_mig_save_bulked_block(mon, f) == 0) { > + /* finish saving bulk on all devices */ > + block_mig_state.bulk_completed = 1; > break; > } > } > @@ -446,9 +433,7 @@ static int block_save_live(Monitor *mon, QEMUFile *f, int stage, void *opaque) > } > > if (stage == 3) { > - while (blk_mig_save_bulked_block(mon, f, 0) != 0) { > - /* empty */ > - } > + /* we now for sure that save bulk is completed */ > > 
blk_mig_save_dirty_blocks(mon, f); > blk_mig_cleanup(mon); Otherwise nice cleanup. Jan -- Siemens AG, Corporate Technology, CT T DE IT 1 Corporate Competence Center Embedded Linux ^ permalink raw reply [flat|nested] 13+ messages in thread
end of thread, other threads:[~2010-01-12 15:09 UTC | newest] Thread overview: 13+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2010-01-12 8:27 [Qemu-devel] [PATCH 0/4] Reduce down time during migration without shared storage Liran Schour 2010-01-12 8:27 ` [Qemu-devel] [PATCH 1/4] Remove unused code Liran Schour 2010-01-12 8:27 ` [Qemu-devel] [PATCH 2/4] Tranfer dirty blocks during iterative phase Liran Schour 2010-01-12 8:27 ` [Qemu-devel] [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Liran Schour 2010-01-12 8:27 ` [Qemu-devel] [PATCH 4/4] Try not to exceed max downtime on stage3 Liran Schour 2010-01-12 9:52 ` Pierre Riteau 2010-01-12 11:56 ` Liran Schour 2010-01-12 11:51 ` [Qemu-devel] " Jan Kiszka 2010-01-12 15:07 ` Anthony Liguori 2010-01-12 15:07 ` Liran Schour 2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 3/4] Count dirty blocks and expose an API to get dirty count Jan Kiszka 2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 2/4] Tranfer dirty blocks during iterative phase Jan Kiszka 2010-01-12 11:50 ` [Qemu-devel] Re: [PATCH 1/4] Remove unused code Jan Kiszka
This is a public inbox; see mirroring instructions for how to clone and mirror all data and code used for this inbox, as well as URLs for NNTP newsgroup(s).