[Qemu-devel] [PATCH 13/18] savevm: New save live migration method: pending

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: Juan Quintela <quintela@redhat.com>
To: qemu-devel@nongnu.org
Cc: owasserm@redhat.com, mtosatti@redhat.com, avi@redhat.com,
	pbonzini@redhat.com
Subject: [Qemu-devel] [PATCH 13/18] savevm: New save live migration method: pending
Date: Mon, 29 Oct 2012 15:11:38 +0100	[thread overview]
Message-ID: <1351519903-26607-14-git-send-email-quintela@redhat.com> (raw)
In-Reply-To: <1351519903-26607-1-git-send-email-quintela@redhat.com>

Code just now does (simplified for clarity)

    if (qemu_savevm_state_iterate(s->file) == 1) {
       vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
       qemu_savevm_state_complete(s->file);
    }

Problem here is that qemu_savevm_state_iterate() returns 1 when it
knows that remaining memory to sent takes less than max downtime.

But this means that we could end spending 2x max_downtime, one
downtime in qemu_savevm_iterate, and the other in
qemu_savevm_state_complete.

Changed code to:

    pending_size = qemu_savevm_state_pending(s->file, max_size);
    DPRINTF("pending size %lu max %lu\n", pending_size, max_size);
    if (pending_size >= max_size) {
        ret = qemu_savevm_state_iterate(s->file);
     } else {
        vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
        qemu_savevm_state_complete(s->file);
     }

So what we do is: at current network speed, we calculate the maximum
number of bytes we can sent: max_size.

Then we ask every save_live section how much they have pending.  If
they are less than max_size, we move to complete phase, otherwise we
do an iterate one.

This makes things much simpler, because now individual sections don't
have to caluclate the bandwidth (it was implossible to do right from
there).

Signed-off-by: Juan Quintela <quintela@redhat.com>
---
 arch_init.c       | 48 ++++++++++++++++++------------------------------
 block-migration.c | 49 ++++++++++---------------------------------------
 buffered_file.c   | 23 +++++++++++++++++------
 migration.c       | 22 +++++++++++++++-------
 migration.h       |  2 +-
 savevm.c          | 19 +++++++++++++++++++
 sysemu.h          |  1 +
 vmstate.h         |  1 +
 8 files changed, 82 insertions(+), 83 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 2d29828..1eefef8 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -608,12 +608,9 @@ static int ram_save_setup(QEMUFile *f, void *opaque)

 static int ram_save_iterate(QEMUFile *f, void *opaque)
 {
-    uint64_t bytes_transferred_last;
-    double bwidth = 0;
     int ret;
     int i;
-    uint64_t expected_downtime;
-    MigrationState *s = migrate_get_current();
+    int64_t t0;

     qemu_mutex_lock_ramlist();

@@ -621,9 +618,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
         reset_ram_globals();
     }

-    bytes_transferred_last = bytes_transferred;
-    bwidth = qemu_get_clock_ns(rt_clock);
-
+    t0 = qemu_get_clock_ns(rt_clock);
     i = 0;
     while ((ret = qemu_file_rate_limit(f)) == 0) {
         int bytes_sent;
@@ -641,7 +636,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
            iterations
         */
         if ((i & 63) == 0) {
-            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - bwidth) / 1000000;
+            uint64_t t1 = (qemu_get_clock_ns(rt_clock) - t0) / 1000000;
             if (t1 > MAX_WAIT) {
                 DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n",
                         t1, i);
@@ -655,31 +650,10 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
         return ret;
     }

-    bwidth = qemu_get_clock_ns(rt_clock) - bwidth;
-    bwidth = (bytes_transferred - bytes_transferred_last) / bwidth;
-
-    /* if we haven't transferred anything this round, force
-     * expected_downtime to a very high value, but without
-     * crashing */
-    if (bwidth == 0) {
-        bwidth = 0.000001;
-    }
-
     qemu_mutex_unlock_ramlist();
     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);

-    expected_downtime = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
-    DPRINTF("ram_save_live: expected(%" PRIu64 ") <= max(" PRIu64 ")?\n",
-            expected_downtime, migrate_max_downtime());
-
-    if (expected_downtime <= migrate_max_downtime()) {
-        migration_bitmap_sync();
-        expected_downtime = ram_save_remaining() * TARGET_PAGE_SIZE / bwidth;
-        s->expected_downtime = expected_downtime / 1000000; /* ns -> ms */
-
-        return expected_downtime <= migrate_max_downtime();
-    }
-    return 0;
+    return i;
 }

 static int ram_save_complete(QEMUFile *f, void *opaque)
@@ -712,6 +686,19 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
     return 0;
 }

+static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+{
+    uint64_t remaining_size;
+
+    remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
+
+    if (remaining_size < max_size) {
+        migration_bitmap_sync();
+        remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
+    }
+    return remaining_size;
+}
+
 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
 {
     int ret, rc = 0;
@@ -897,6 +884,7 @@ SaveVMHandlers savevm_ram_handlers = {
     .save_live_setup = ram_save_setup,
     .save_live_iterate = ram_save_iterate,
     .save_live_complete = ram_save_complete,
+    .save_live_pending = ram_save_pending,
     .load_state = ram_load,
     .cancel = ram_migration_cancel,
 };
diff --git a/block-migration.c b/block-migration.c
index 71b9601..5db01fe 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -77,9 +77,7 @@ typedef struct BlkMigState {
     int64_t total_sector_sum;
     int prev_progress;
     int bulk_completed;
-    long double total_time;
     long double prev_time_offset;
-    int reads;
 } BlkMigState;

 static BlkMigState block_mig_state;
@@ -132,12 +130,6 @@ uint64_t blk_mig_bytes_total(void)
     return sum << BDRV_SECTOR_BITS;
 }

-static inline long double compute_read_bwidth(void)
-{
-    assert(block_mig_state.total_time != 0);
-    return (block_mig_state.reads / block_mig_state.total_time) * BLOCK_SIZE;
-}
-
 static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector)
 {
     int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
@@ -191,8 +183,6 @@ static void blk_mig_read_cb(void *opaque, int ret)

     blk->ret = ret;

-    block_mig_state.reads++;
-    block_mig_state.total_time += (curr_time - block_mig_state.prev_time_offset);
     block_mig_state.prev_time_offset = curr_time;

     QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry);
@@ -310,8 +300,6 @@ static void init_blk_migration(QEMUFile *f)
     block_mig_state.total_sector_sum = 0;
     block_mig_state.prev_progress = -1;
     block_mig_state.bulk_completed = 0;
-    block_mig_state.total_time = 0;
-    block_mig_state.reads = 0;

     bdrv_iterate(init_blk_migration_it, NULL);
 }
@@ -493,32 +481,6 @@ static int64_t get_remaining_dirty(void)
     return dirty * BLOCK_SIZE;
 }

-static int is_stage2_completed(void)
-{
-    int64_t remaining_dirty;
-    long double bwidth;
-
-    if (block_mig_state.bulk_completed == 1) {
-
-        remaining_dirty = get_remaining_dirty();
-        if (remaining_dirty == 0) {
-            return 1;
-        }
-
-        bwidth = compute_read_bwidth();
-
-        if ((remaining_dirty / bwidth) <=
-            migrate_max_downtime()) {
-            /* finish stage2 because we think that we can finish remaining work
-               below max_downtime */
-
-            return 1;
-        }
-    }
-
-    return 0;
-}
-
 static void blk_mig_cleanup(void)
 {
     BlkMigDevState *bmds;
@@ -619,7 +581,7 @@ static int block_save_iterate(QEMUFile *f, void *opaque)

     qemu_put_be64(f, BLK_MIG_FLAG_EOS);

-    return is_stage2_completed();
+    return 0;
 }

 static int block_save_complete(QEMUFile *f, void *opaque)
@@ -659,6 +621,14 @@ static int block_save_complete(QEMUFile *f, void *opaque)
     return 0;
 }

+static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+{
+
+    DPRINTF("Enter save live pending  %ld\n", get_remaining_dirty());
+
+    return get_remaining_dirty();
+}
+
 static int block_load(QEMUFile *f, void *opaque, int version_id)
 {
     static int banner_printed;
@@ -755,6 +725,7 @@ SaveVMHandlers savevm_block_handlers = {
     .save_live_setup = block_save_setup,
     .save_live_iterate = block_save_iterate,
     .save_live_complete = block_save_complete,
+    .save_live_pending = block_save_pending,
     .load_state = block_load,
     .cancel = block_migration_cancel,
     .is_active = block_is_active,
diff --git a/buffered_file.c b/buffered_file.c
index 017745d..fbd11c4 100644
--- a/buffered_file.c
+++ b/buffered_file.c
@@ -181,7 +181,9 @@ static int64_t buffered_get_rate_limit(void *opaque)
 static void *buffered_file_thread(void *opaque)
 {
     QEMUFileBuffered *s = opaque;
-    int64_t expire_time = qemu_get_clock_ms(rt_clock) + BUFFER_DELAY;
+    int64_t initial_time = qemu_get_clock_ms(rt_clock);
+    int64_t max_size = 0;
+    bool last_round = false;

     while (true) {
         int64_t current_time = qemu_get_clock_ms(rt_clock);
@@ -189,20 +191,29 @@ static void *buffered_file_thread(void *opaque)
         if (s->migration_state->complete) {
             break;
         }
-        if (current_time >= expire_time) {
+        if (current_time >= initial_time + BUFFER_DELAY) {
+            uint64_t transferred_bytes = s->bytes_xfer;
+            uint64_t time_spent = current_time - initial_time;
+            double bandwidth = transferred_bytes / time_spent;
+            max_size = bandwidth * migrate_max_downtime() / 1000000;
+
+            DPRINTF("transferred %" PRIu64 " time_spent %" PRIu64
+                    " bandwidth %g max_size %" PRId64 "\n",
+                    transferred_bytes, time_spent, bandwidth, max_size);
+
             s->bytes_xfer = 0;
-            expire_time = current_time + BUFFER_DELAY;
+            initial_time = current_time;
         }
-        if (s->bytes_xfer >= s->xfer_limit) {
+        if (!last_round && (s->bytes_xfer >= s->xfer_limit)) {
             /* usleep expects microseconds */
-            g_usleep((expire_time - current_time)*1000);
+            g_usleep((initial_time + BUFFER_DELAY - current_time)*1000);
         }
         buffered_flush(s);

         DPRINTF("file is ready\n");
         if (s->bytes_xfer < s->xfer_limit) {
             DPRINTF("notifying client\n");
-            migrate_fd_put_ready(s->migration_state);
+            last_round = migrate_fd_put_ready(s->migration_state, max_size);
         }
     }

diff --git a/migration.c b/migration.c
index 9c409d7..769ae56 100644
--- a/migration.c
+++ b/migration.c
@@ -297,16 +297,18 @@ ssize_t migrate_fd_put_buffer(MigrationState *s, const void *data,
     return ret;
 }

-void migrate_fd_put_ready(MigrationState *s)
+bool migrate_fd_put_ready(MigrationState *s, uint64_t max_size)
 {
     int ret;
     static bool first_time = true;
+    uint64_t pending_size;
+    bool last_round = false;

     qemu_mutex_lock_iothread();
     if (s->state != MIG_STATE_ACTIVE) {
         DPRINTF("put_ready returning because of non-active state\n");
         qemu_mutex_unlock_iothread();
-        return;
+        return false;
     }
     if (first_time) {
         first_time = false;
@@ -316,15 +318,19 @@ void migrate_fd_put_ready(MigrationState *s)
             DPRINTF("failed, %d\n", ret);
             migrate_fd_error(s);
             qemu_mutex_unlock_iothread();
-            return;
+            return false;
         }
     }

     DPRINTF("iterate\n");
-    ret = qemu_savevm_state_iterate(s->file);
-    if (ret < 0) {
-        migrate_fd_error(s);
-    } else if (ret == 1) {
+    pending_size = qemu_savevm_state_pending(s->file, max_size);
+    DPRINTF("pending size %lu max %lu\n", pending_size, max_size);
+    if (pending_size >= max_size) {
+        ret = qemu_savevm_state_iterate(s->file);
+        if (ret < 0) {
+            migrate_fd_error(s);
+        }
+    } else {
         int old_vm_running = runstate_is_running();
         int64_t start_time, end_time;

@@ -350,9 +356,11 @@ void migrate_fd_put_ready(MigrationState *s)
                 vm_start();
             }
         }
+        last_round = true;
     }
     qemu_mutex_unlock_iothread();

+    return last_round;
 }

 static void migrate_fd_cancel(MigrationState *s)
diff --git a/migration.h b/migration.h
index 5ff68eb..72e638b 100644
--- a/migration.h
+++ b/migration.h
@@ -80,7 +80,7 @@ void migrate_fd_connect(MigrationState *s);

 ssize_t migrate_fd_put_buffer(MigrationState *s, const void *data,
                               size_t size);
-void migrate_fd_put_ready(MigrationState *s);
+bool migrate_fd_put_ready(MigrationState *s, uint64_t max_size);
 int migrate_fd_close(MigrationState *s);

 void add_migration_state_change_notifier(Notifier *notify);
diff --git a/savevm.c b/savevm.c
index 69f1768..a68c37b 100644
--- a/savevm.c
+++ b/savevm.c
@@ -1681,6 +1681,25 @@ int qemu_savevm_state_complete(QEMUFile *f)
     return qemu_file_get_error(f);
 }

+uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size)
+{
+    SaveStateEntry *se;
+    uint64_t ret = 0;
+
+    QTAILQ_FOREACH(se, &savevm_handlers, entry) {
+        if (!se->ops || !se->ops->save_live_pending) {
+            continue;
+        }
+        if (se->ops && se->ops->is_active) {
+            if (!se->ops->is_active(se->opaque)) {
+                continue;
+            }
+        }
+        ret += se->ops->save_live_pending(f, se->opaque, max_size);
+    }
+    return ret;
+}
+
 void qemu_savevm_state_cancel(QEMUFile *f)
 {
     SaveStateEntry *se;
diff --git a/sysemu.h b/sysemu.h
index 0c39a3a..0276205 100644
--- a/sysemu.h
+++ b/sysemu.h
@@ -83,6 +83,7 @@ int qemu_savevm_state_begin(QEMUFile *f,
 int qemu_savevm_state_iterate(QEMUFile *f);
 int qemu_savevm_state_complete(QEMUFile *f);
 void qemu_savevm_state_cancel(QEMUFile *f);
+uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size);
 int qemu_loadvm_state(QEMUFile *f);

 /* SLIRP */
diff --git a/vmstate.h b/vmstate.h
index c9c320e..241d962 100644
--- a/vmstate.h
+++ b/vmstate.h
@@ -35,6 +35,7 @@ typedef struct SaveVMHandlers {
     int (*save_live_setup)(QEMUFile *f, void *opaque);
     int (*save_live_iterate)(QEMUFile *f, void *opaque);
     int (*save_live_complete)(QEMUFile *f, void *opaque);
+    uint64_t (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size);
     void (*cancel)(void *opaque);
     LoadStateHandler *load_state;
     bool (*is_active)(void *opaque);
-- 
1.7.11.7

next prev parent reply	other threads:[~2012-10-29 14:12 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-10-29 14:11 [Qemu-devel] [PATCH 00/18] Migration thread lite (20121029) Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 01/18] split MRU ram list Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 02/18] add a version number to ram_list Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 03/18] protect the ramlist with a separate mutex Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 04/18] buffered_file: Move from using a timer to use a thread Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 05/18] migration: make qemu_fopen_ops_buffered() return void Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 06/18] migration: stop all cpus correctly Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 07/18] migration: make writes blocking Juan Quintela
2012-10-29 15:51   ` Markus Armbruster
2012-10-29 17:32     ` Juan Quintela
2012-10-29 22:13       ` Paolo Bonzini
2012-10-30  7:36       ` Markus Armbruster
2012-10-30  8:23         ` Juan Quintela
2012-10-30 10:50           ` Markus Armbruster
2012-10-30 11:08             ` Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 08/18] migration: remove unfreeze logic Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 09/18] migration: take finer locking Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 10/18] buffered_file: Unfold the trick to restart generating migration data Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 11/18] buffered_file: don't flush on put buffer Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 12/18] buffered_file: unfold buffered_append in buffered_put_buffer Juan Quintela
2012-10-29 14:11 ` Juan Quintela [this message]
2012-10-29 14:11 ` [Qemu-devel] [PATCH 14/18] migration: include qemu-file.h Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 15/18] migration-fd: remove duplicate include Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 16/18] memory: introduce memory_region_test_and_clear_dirty Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 17/18] ram: Use memory_region_test_and_clear_dirty Juan Quintela
2012-10-29 14:11 ` [Qemu-devel] [PATCH 18/18] ram: optimize migration bitmap walking Juan Quintela

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:2d29828 dfblob:1eefef8 dfblob:71b9601 dfblob:5db01fe
dfblob:017745d dfblob:fbd11c4 dfblob:9c409d7 dfblob:769ae56
dfblob:5ff68eb dfblob:72e638b dfblob:69f1768 dfblob:a68c37b
dfblob:0c39a3a dfblob:0276205 dfblob:c9c320e dfblob:241d962 )
 OR (
bs:"[Qemu-devel] [PATCH 13/18] savevm: New save live migration method: pending" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1351519903-26607-14-git-send-email-quintela@redhat.com \
    --to=quintela@redhat.com \
    --cc=avi@redhat.com \
    --cc=mtosatti@redhat.com \
    --cc=owasserm@redhat.com \
    --cc=pbonzini@redhat.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).