From mboxrd@z Thu Jan 1 00:00:00 1970 From: Hongyang Yang Subject: Re: [RFC PATCH 2/3] remus: implement remus checkpoint in v2 save Date: Thu, 10 Jul 2014 11:25:29 +0800 Message-ID: <53BE07A9.3070608@cn.fujitsu.com> References: <1404892050-24650-1-git-send-email-yanghy@cn.fujitsu.com> <1404892050-24650-3-git-send-email-yanghy@cn.fujitsu.com> <53BD1F13.2040301@citrix.com> Mime-Version: 1.0 Content-Type: text/plain; charset="us-ascii"; Format="flowed" Content-Transfer-Encoding: 7bit Return-path: In-Reply-To: <53BD1F13.2040301@citrix.com> List-Unsubscribe: , List-Post: List-Help: List-Subscribe: , Sender: xen-devel-bounces@lists.xen.org Errors-To: xen-devel-bounces@lists.xen.org To: Andrew Cooper , xen-devel@lists.xen.org Cc: rshriram@cs.ubc.ca, Ian.Jackson@eu.citrix.com, Ian.Campbell@citrix.com List-Id: xen-devel@lists.xenproject.org On 07/09/2014 06:53 PM, Andrew Cooper wrote: > On 09/07/14 08:47, Yang Hongyang wrote: >> implement remus checkpoint in v2 save >> >> Signed-off-by: Yang Hongyang >> --- >> tools/libxc/saverestore/common.h | 1 + >> tools/libxc/saverestore/save.c | 88 ++++++++++++++++++++++++---------------- >> 2 files changed, 55 insertions(+), 34 deletions(-) >> >> diff --git a/tools/libxc/saverestore/common.h b/tools/libxc/saverestore/common.h >> index 24ba95b..1dd9f51 100644 >> --- a/tools/libxc/saverestore/common.h >> +++ b/tools/libxc/saverestore/common.h >> @@ -153,6 +153,7 @@ struct xc_sr_context >> >> xc_dominfo_t dominfo; >> bool checkpointed; >> + bool firsttime; > > This is also only used on the save side. Yes, the restore side won't use this by now, but I'm not sure it will be used later, maybe it can be moved to .save union now, and when we need to use it in restore side, we can then move it out. In v2, the checkpointed_stream parameter in xc_domain_restore() seems not necessary, can we remove it? cause remove it will breaks the API... > >> >> union >> { >> diff --git a/tools/libxc/saverestore/save.c b/tools/libxc/saverestore/save.c >> index d2fa8a6..98a5c2f 100644 >> --- a/tools/libxc/saverestore/save.c >> +++ b/tools/libxc/saverestore/save.c >> @@ -375,6 +375,8 @@ static int send_domain_memory_live(struct xc_sr_context *ctx) >> goto out; >> } >> >> + if ( ctx->checkpointed && !ctx->firsttime ) >> + goto lastiter; >> /* This juggling is required if logdirty is already on, e.g. VRAM tracking */ >> if ( xc_shadow_control(xch, ctx->domid, >> XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY, >> @@ -436,6 +438,7 @@ static int send_domain_memory_live(struct xc_sr_context *ctx) >> break; >> } >> >> +lastiter: >> rc = suspend_domain(ctx); >> if ( rc ) >> goto out; >> @@ -570,44 +573,60 @@ static int save(struct xc_sr_context *ctx, uint16_t guest_type) >> if ( rc ) >> goto err; >> >> - rc = ctx->save.ops.start_of_stream(ctx); >> - if ( rc ) >> - goto err; >> + do { >> + rc = ctx->save.ops.start_of_stream(ctx); >> + if ( rc ) >> + goto err; > > I am not sure start_of_stream() wants to be inside the loop. For PV > guests, it sends the X86_PV_INFO which is only expected to be sent > once. The X86_PV_P2M_FRAMES record is deliberately safe to send > multiple times (in the hope that someone might evenutally fix the > ballooning issues), but is a waste of time to send like this, as its > content wont be changing. It you make sure all records that has been sent in start_of_stream() wont be changing, then we can surely move this out of the loop. > >> >> - if ( ctx->save.live ) >> - { >> - DPRINTF("Starting live migrate"); >> - rc = send_domain_memory_live(ctx); >> - } >> - else >> - { >> - DPRINTF("Starting nonlive save"); >> - rc = send_domain_memory_nonlive(ctx); >> - } >> + if ( ctx->save.live ) >> + { >> + DPRINTF("Starting live migrate"); >> + rc = send_domain_memory_live(ctx); >> + } >> + else >> + { >> + DPRINTF("Starting nonlive save"); >> + rc = send_domain_memory_nonlive(ctx); >> + } >> >> - if ( rc ) >> - goto err; >> + if ( rc ) >> + goto err; >> >> - /* Refresh domain information now it has paused. */ >> - if ( (xc_domain_getinfo(xch, ctx->domid, 1, &ctx->dominfo) != 1) || >> - (ctx->dominfo.domid != ctx->domid) ) >> - { >> - PERROR("Unable to refresh domain information"); >> - rc = -1; >> - goto err; >> - } >> - else if ( (!ctx->dominfo.shutdown || >> - ctx->dominfo.shutdown_reason != SHUTDOWN_suspend ) && >> - !ctx->dominfo.paused ) >> - { >> - ERROR("Domain has not been suspended"); >> - rc = -1; >> - goto err; >> - } >> + /* Refresh domain information now it has paused. */ >> + if ( (xc_domain_getinfo(xch, ctx->domid, 1, &ctx->dominfo) != 1) || >> + (ctx->dominfo.domid != ctx->domid) ) >> + { >> + PERROR("Unable to refresh domain information"); >> + rc = -1; >> + goto err; >> + } >> + else if ( (!ctx->dominfo.shutdown || >> + ctx->dominfo.shutdown_reason != SHUTDOWN_suspend ) && >> + !ctx->dominfo.paused ) >> + { >> + ERROR("Domain has not been suspended"); >> + rc = -1; >> + goto err; >> + } >> >> - rc = ctx->save.ops.end_of_stream(ctx); >> - if ( rc ) >> - goto err; >> + rc = ctx->save.ops.end_of_stream(ctx); >> + if ( rc ) >> + goto err; >> + >> + if ( ctx->checkpointed ) { >> + if ( ctx->firsttime ) >> + ctx->firsttime = false; >> + >> + ctx->save.callbacks->postcopy(ctx->save.callbacks->data); > > Can postcopy() fail? It can, this was inherited from the legacy code, but should be improved here by adding error check. > > ~Andrew > >> + >> + rc = ctx->save.callbacks->checkpoint(ctx->save.callbacks->data); >> + if ( rc > 0 ) { >> + IPRINTF("Next checkpoint\n"); >> + } else { >> + ctx->checkpointed = false; >> + } >> + } >> + } while ( ctx->checkpointed ); >> >> rc = write_end_record(ctx); >> if ( rc ) >> @@ -653,6 +672,7 @@ int xc_domain_save2(xc_interface *xch, int io_fd, uint32_t dom, uint32_t max_ite >> ctx.save.live = !!(flags & XCFLAGS_LIVE); >> ctx.save.debug = !!(flags & XCFLAGS_DEBUG); >> ctx.checkpointed = !!(flags & XCFLAGS_CHECKPOINTED); >> + ctx.firsttime = true; >> >> if ( ctx.checkpointed ) { >> /* This is a checkpointed save, we need these callbacks */ > > . > -- Thanks, Yang.