qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Md Haris Iqbal <haris.phnx@gmail.com>
To: qemu-devel@nongnu.org
Cc: dgilbert@redhat.com, Md Haris Iqbal <haris.phnx@gmail.com>
Subject: [Qemu-devel] [Qemu-devel [RFC] [WIP] v2] Keeping the Source side alive incase of network failure (Migration recovery from network failure)
Date: Wed,  8 Jun 2016 23:43:04 +0530	[thread overview]
Message-ID: <1465409584-16308-1-git-send-email-haris.phnx@gmail.com> (raw)

---
 include/migration/migration.h |  1 +
 migration/migration.c         | 76 ++++++++++++++++++++++++++++++++++++++++---
 qapi-schema.json              | 11 +++++--
 vl.c                          |  4 +++
 4 files changed, 85 insertions(+), 7 deletions(-)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index 4a3201b..59e26e6 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -326,6 +326,7 @@ void global_state_store_running(void);
 void flush_page_queue(MigrationState *ms);
 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
                          ram_addr_t start, ram_addr_t len);
+int qemu_migrate_postcopy_outgoing_recovery(MigrationState *ms);
 
 PostcopyState postcopy_state_get(void);
 /* Set the state and return the old state */
diff --git a/migration/migration.c b/migration/migration.c
index a77f62e..41c28e1 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -676,6 +676,33 @@ MigrationInfo *qmp_query_migrate(Error **errp)
     case MIGRATION_STATUS_CANCELLED:
         info->has_status = true;
         break;
+    case MIGRATION_STATUS_POSTCOPY_RECOVERY:
+        info->has_status = true;
+        info->has_total_time = true;
+        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+        info->has_ram = true;
+        info->ram = g_malloc0(sizeof(*info->ram));
+        info->ram->transferred = ram_bytes_transferred();
+        info->ram->remaining = ram_bytes_remaining();
+        info->ram->total = ram_bytes_total();
+        info->ram->duplicate = dup_mig_pages_transferred();
+        info->ram->skipped = skipped_mig_pages_transferred();
+        info->ram->normal = norm_mig_pages_transferred();
+        info->ram->normal_bytes = norm_mig_bytes_transferred();
+        info->ram->dirty_pages_rate = s->dirty_pages_rate;
+        info->ram->mbps = s->mbps;
+        info->ram->dirty_sync_count = s->dirty_sync_count;
+
+        if (blk_mig_active()) {
+            info->has_disk = true;
+            info->disk = g_malloc0(sizeof(*info->disk));
+            info->disk->transferred = blk_mig_bytes_transferred();
+            info->disk->remaining = blk_mig_bytes_remaining();
+            info->disk->total = blk_mig_bytes_total();
+        }
+
+        get_xbzrle_cache_stats(info);
     }
     info->status = s->state;
 
@@ -1660,6 +1687,8 @@ static void *migration_thread(void *opaque)
     /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
     enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
 
+    int ret;
+
     rcu_register_thread();
 
     qemu_savevm_state_header(s->to_dst_file);
@@ -1726,11 +1755,32 @@ static void *migration_thread(void *opaque)
             }
         }
 
-        if (qemu_file_get_error(s->to_dst_file)) {
-            migrate_set_state(&s->state, current_active_state,
-                              MIGRATION_STATUS_FAILED);
-            trace_migration_thread_file_err();
-            break;
+        if ((ret = qemu_file_get_error(s->to_dst_file))) {
+            fprintf(stderr, "1 : Error %s %d\n", strerror(-ret), -ret);
+
+            /*  This check is based on how the error is set during the network
+             *  recv(). When recv() returns 0 (i.e. no data to read), the error
+             *  is set to -EIO. For all other network errors, it is set
+             *  according to the return value received.
+             */
+            if (ret != -EIO && s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+                /* Network Failure during postcopy */
+
+                current_active_state = MIGRATION_STATUS_POSTCOPY_RECOVERY;
+                runstate_set(RUN_STATE_POSTMIGRATE_RECOVERY);
+                fprintf(stderr, "1.1 : Error %s %d\n", strerror(-ret), -ret);
+                ret = qemu_migrate_postcopy_outgoing_recovery(s);
+                if(ret < 0) {
+                    break;
+                }
+
+            } else {
+                migrate_set_state(&s->state, current_active_state,
+                                             MIGRATION_STATUS_FAILED);
+                fprintf(stderr, "1.2 : Error %s %d\n", strerror(-ret), -ret);
+                trace_migration_thread_file_err();
+                break;
+            }
         }
         current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
         if (current_time >= initial_time + BUFFER_DELAY) {
@@ -1831,6 +1881,22 @@ void migrate_fd_connect(MigrationState *s)
     s->migration_thread_running = true;
 }
 
+int qemu_migrate_postcopy_outgoing_recovery(MigrationState* ms)
+{
+    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+                                  MIGRATION_STATUS_POSTCOPY_RECOVERY);
+
+    ms->in_recovery = true;
+    /* Code for network recovery to be added here */
+    while(atomic_mb_read(&ms->in_recovery) == true) {
+        fprintf(stderr, "Not letting it fail %p\n", ms->to_dst_file);
+        sleep(5);
+    }
+
+    return -1;
+
+}
+
 PostcopyState  postcopy_state_get(void)
 {
     return atomic_mb_read(&incoming_postcopy_state);
diff --git a/qapi-schema.json b/qapi-schema.json
index 1b7b1e1..613f8d2 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -154,12 +154,14 @@
 # @watchdog: the watchdog action is configured to pause and has been triggered
 #
 # @guest-panicked: guest has been panicked as a result of guest OS panic
+#
+# @postmigrate-recovery: guest is paused for recovery after a network failure
 ##
 { 'enum': 'RunState',
   'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused',
             'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm',
             'running', 'save-vm', 'shutdown', 'suspended', 'watchdog',
-            'guest-panicked' ] }
+            'guest-panicked', 'postmigrate-recovery' ] }
 
 ##
 # @StatusInfo:
@@ -434,12 +436,15 @@
 #
 # @failed: some error occurred during migration process.
 #
+# @postcopy-recovery: in recovery mode, after a network failure.
+#
 # Since: 2.3
 #
 ##
 { 'enum': 'MigrationStatus',
   'data': [ 'none', 'setup', 'cancelling', 'cancelled',
-            'active', 'postcopy-active', 'completed', 'failed' ] }
+            'active', 'postcopy-active', 'completed', 'failed',
+            'postcopy-recovery' ] }
 
 ##
 # @MigrationInfo
@@ -2058,6 +2063,8 @@
 #
 # @uri: the Uniform Resource Identifier of the destination VM
 #
+# @recover: #optional recover from a broken migration
+#
 # @blk: #optional do block migration (full disk copy)
 #
 # @inc: #optional incremental disk copy migration
diff --git a/vl.c b/vl.c
index 5fd22cb..c237140 100644
--- a/vl.c
+++ b/vl.c
@@ -618,6 +618,10 @@ static const RunStateTransition runstate_transitions_def[] = {
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
     { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PRELAUNCH },
+    { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE_RECOVERY },
+
+    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_FINISH_MIGRATE },
+    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_SHUTDOWN },
 
     { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING },
     { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH },
-- 
2.7.4

             reply	other threads:[~2016-06-08 18:13 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-06-08 18:13 Md Haris Iqbal [this message]
2016-06-08 20:41 ` [Qemu-devel] [Qemu-devel [RFC] [WIP] v2] Keeping the Source side alive incase of network failure (Migration recovery from network failure) Eric Blake
2016-06-13  6:38   ` haris iqbal
2016-06-13 15:38     ` Eric Blake
2016-06-15 13:03       ` Dr. David Alan Gilbert
2016-06-15 13:10         ` Eric Blake
2016-06-15 13:02 ` Dr. David Alan Gilbert

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1465409584-16308-1-git-send-email-haris.phnx@gmail.com \
    --to=haris.phnx@gmail.com \
    --cc=dgilbert@redhat.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).