Re: [Qemu-devel] [Qemu-devel [RFC] [WIP] v2] Keeping the Source side alive incase of network failure (Migration recovery from network failure)

qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed

From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
To: Md Haris Iqbal <haris.phnx@gmail.com>
Cc: qemu-devel@nongnu.org
Subject: Re: [Qemu-devel] [Qemu-devel [RFC] [WIP] v2] Keeping the Source side alive incase of network failure (Migration recovery from network failure)
Date: Wed, 15 Jun 2016 14:02:50 +0100	[thread overview]
Message-ID: <20160615130249.GE2272@work-vm> (raw)
In-Reply-To: <1465409584-16308-1-git-send-email-haris.phnx@gmail.com>

* Md Haris Iqbal (haris.phnx@gmail.com) wrote:
> ---
>  include/migration/migration.h |  1 +
>  migration/migration.c         | 76 ++++++++++++++++++++++++++++++++++++++++---
>  qapi-schema.json              | 11 +++++--
>  vl.c                          |  4 +++
>  4 files changed, 85 insertions(+), 7 deletions(-)

You need to pull this pair of patches together as a set of patches rather
than two individual patches; it just gets too confusing (like with the
version numbers).
Pull them together in git, and then we can also get the patches split
up into smaller parts and you can keep upto date with current qemu.

> diff --git a/include/migration/migration.h b/include/migration/migration.h
> index 4a3201b..59e26e6 100644
> --- a/include/migration/migration.h
> +++ b/include/migration/migration.h
> @@ -326,6 +326,7 @@ void global_state_store_running(void);
>  void flush_page_queue(MigrationState *ms);
>  int ram_save_queue_pages(MigrationState *ms, const char *rbname,
>                           ram_addr_t start, ram_addr_t len);
> +int qemu_migrate_postcopy_outgoing_recovery(MigrationState *ms);
>  
>  PostcopyState postcopy_state_get(void);
>  /* Set the state and return the old state */
> diff --git a/migration/migration.c b/migration/migration.c
> index a77f62e..41c28e1 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -676,6 +676,33 @@ MigrationInfo *qmp_query_migrate(Error **errp)
>      case MIGRATION_STATUS_CANCELLED:
>          info->has_status = true;
>          break;
> +    case MIGRATION_STATUS_POSTCOPY_RECOVERY:
> +        info->has_status = true;
> +        info->has_total_time = true;
> +        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> +
> +        info->has_ram = true;
> +        info->ram = g_malloc0(sizeof(*info->ram));
> +        info->ram->transferred = ram_bytes_transferred();
> +        info->ram->remaining = ram_bytes_remaining();
> +        info->ram->total = ram_bytes_total();
> +        info->ram->duplicate = dup_mig_pages_transferred();
> +        info->ram->skipped = skipped_mig_pages_transferred();
> +        info->ram->normal = norm_mig_pages_transferred();
> +        info->ram->normal_bytes = norm_mig_bytes_transferred();
> +        info->ram->dirty_pages_rate = s->dirty_pages_rate;
> +        info->ram->mbps = s->mbps;
> +        info->ram->dirty_sync_count = s->dirty_sync_count;

If you update to the current qemu git you'll see there's a
populate_ram_info function that avoids a lot of this duplication.

> +        if (blk_mig_active()) {
> +            info->has_disk = true;
> +            info->disk = g_malloc0(sizeof(*info->disk));
> +            info->disk->transferred = blk_mig_bytes_transferred();
> +            info->disk->remaining = blk_mig_bytes_remaining();
> +            info->disk->total = blk_mig_bytes_total();
> +        }
> +
> +        get_xbzrle_cache_stats(info);
>      }
>      info->status = s->state;
>  
> @@ -1660,6 +1687,8 @@ static void *migration_thread(void *opaque)
>      /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
>      enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
>  
> +    int ret;
> +
>      rcu_register_thread();
>  
>      qemu_savevm_state_header(s->to_dst_file);
> @@ -1726,11 +1755,32 @@ static void *migration_thread(void *opaque)
>              }
>          }
>  
> -        if (qemu_file_get_error(s->to_dst_file)) {
> -            migrate_set_state(&s->state, current_active_state,
> -                              MIGRATION_STATUS_FAILED);
> -            trace_migration_thread_file_err();
> -            break;
> +        if ((ret = qemu_file_get_error(s->to_dst_file))) {
> +            fprintf(stderr, "1 : Error %s %d\n", strerror(-ret), -ret);
> +
> +            /*  This check is based on how the error is set during the network
> +             *  recv(). When recv() returns 0 (i.e. no data to read), the error
> +             *  is set to -EIO. For all other network errors, it is set
> +             *  according to the return value received.
> +             */
> +            if (ret != -EIO && s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {

shouldn't that be   ret == -EIO ?

> +                /* Network Failure during postcopy */
> +
> +                current_active_state = MIGRATION_STATUS_POSTCOPY_RECOVERY;
> +                runstate_set(RUN_STATE_POSTMIGRATE_RECOVERY);
> +                fprintf(stderr, "1.1 : Error %s %d\n", strerror(-ret), -ret);

All the fprintf's need to come out for the final version; use traces instaed.

> +                ret = qemu_migrate_postcopy_outgoing_recovery(s);
> +                if(ret < 0) {
> +                    break;
> +                }
> +
> +            } else {
> +                migrate_set_state(&s->state, current_active_state,
> +                                             MIGRATION_STATUS_FAILED);
> +                fprintf(stderr, "1.2 : Error %s %d\n", strerror(-ret), -ret);
> +                trace_migration_thread_file_err();
> +                break;
> +            }
>          }
>          current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>          if (current_time >= initial_time + BUFFER_DELAY) {
> @@ -1831,6 +1881,22 @@ void migrate_fd_connect(MigrationState *s)
>      s->migration_thread_running = true;
>  }
>  
> +int qemu_migrate_postcopy_outgoing_recovery(MigrationState* ms)
> +{
> +    migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> +                                  MIGRATION_STATUS_POSTCOPY_RECOVERY);
> +
> +    ms->in_recovery = true;
> +    /* Code for network recovery to be added here */
> +    while(atomic_mb_read(&ms->in_recovery) == true) {
> +        fprintf(stderr, "Not letting it fail %p\n", ms->to_dst_file);
> +        sleep(5);
> +    }
> +
> +    return -1;
> +
> +}
> +
>  PostcopyState  postcopy_state_get(void)
>  {
>      return atomic_mb_read(&incoming_postcopy_state);
> diff --git a/qapi-schema.json b/qapi-schema.json
> index 1b7b1e1..613f8d2 100644
> --- a/qapi-schema.json
> +++ b/qapi-schema.json
> @@ -154,12 +154,14 @@
>  # @watchdog: the watchdog action is configured to pause and has been triggered
>  #
>  # @guest-panicked: guest has been panicked as a result of guest OS panic
> +#
> +# @postmigrate-recovery: guest is paused for recovery after a network failure
>  ##
>  { 'enum': 'RunState',
>    'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused',
>              'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm',
>              'running', 'save-vm', 'shutdown', 'suspended', 'watchdog',
> -            'guest-panicked' ] }
> +            'guest-panicked', 'postmigrate-recovery' ] }

Would it be possible to have that as postcopy-recovery rather than postmigrate?

>  ##
>  # @StatusInfo:
> @@ -434,12 +436,15 @@
>  #
>  # @failed: some error occurred during migration process.
>  #
> +# @postcopy-recovery: in recovery mode, after a network failure.
> +#
>  # Since: 2.3
>  #
>  ##
>  { 'enum': 'MigrationStatus',
>    'data': [ 'none', 'setup', 'cancelling', 'cancelled',
> -            'active', 'postcopy-active', 'completed', 'failed' ] }
> +            'active', 'postcopy-active', 'completed', 'failed',
> +            'postcopy-recovery' ] }
>  
>  ##
>  # @MigrationInfo
> @@ -2058,6 +2063,8 @@
>  #
>  # @uri: the Uniform Resource Identifier of the destination VM
>  #
> +# @recover: #optional recover from a broken migration
> +#

That looks like it belongs in a different patch.

>  # @blk: #optional do block migration (full disk copy)
>  #
>  # @inc: #optional incremental disk copy migration
> diff --git a/vl.c b/vl.c
> index 5fd22cb..c237140 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -618,6 +618,10 @@ static const RunStateTransition runstate_transitions_def[] = {
>      { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
>      { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
>      { RUN_STATE_FINISH_MIGRATE, RUN_STATE_PRELAUNCH },
> +    { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE_RECOVERY },
> +
> +    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_FINISH_MIGRATE },
> +    { RUN_STATE_POSTMIGRATE_RECOVERY, RUN_STATE_SHUTDOWN },
>  
>      { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING },
>      { RUN_STATE_RESTORE_VM, RUN_STATE_PRELAUNCH },
> -- 
> 2.7.4
> 

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

     prev parent reply	other threads:[~2016-06-15 13:03 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-06-08 18:13 [Qemu-devel] [Qemu-devel [RFC] [WIP] v2] Keeping the Source side alive incase of network failure (Migration recovery from network failure) Md Haris Iqbal
2016-06-08 20:41 ` Eric Blake
2016-06-13  6:38   ` haris iqbal
2016-06-13 15:38     ` Eric Blake
2016-06-15 13:03       ` Dr. David Alan Gilbert
2016-06-15 13:10         ` Eric Blake
2016-06-15 13:02 ` Dr. David Alan Gilbert [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20160615130249.GE2272@work-vm \
    --to=dgilbert@redhat.com \
    --cc=haris.phnx@gmail.com \
    --cc=qemu-devel@nongnu.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).