From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:58731) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1dcUc0-0004tC-L9 for qemu-devel@nongnu.org; Tue, 01 Aug 2017 06:41:17 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1dcUbx-0004fc-NG for qemu-devel@nongnu.org; Tue, 01 Aug 2017 06:41:16 -0400 Received: from mx1.redhat.com ([209.132.183.28]:58814) by eggs.gnu.org with esmtps (TLS1.0:DHE_RSA_AES_256_CBC_SHA1:32) (Exim 4.71) (envelope-from ) id 1dcUbx-0004f3-E3 for qemu-devel@nongnu.org; Tue, 01 Aug 2017 06:41:13 -0400 Date: Tue, 1 Aug 2017 11:41:05 +0100 From: "Dr. David Alan Gilbert" Message-ID: <20170801104105.GH2079@work-vm> References: <1501229198-30588-1-git-send-email-peterx@redhat.com> <1501229198-30588-16-git-send-email-peterx@redhat.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <1501229198-30588-16-git-send-email-peterx@redhat.com> Subject: Re: [Qemu-devel] [RFC 15/29] migration: allow fault thread to pause List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Peter Xu Cc: qemu-devel@nongnu.org, Laurent Vivier , Alexey Perevalov , Juan Quintela , Andrea Arcangeli * Peter Xu (peterx@redhat.com) wrote: > Allows the fault thread to stop handling page faults temporarily. When a > network failure happens (and if we expect a recovery afterwards), we > should not allow the fault thread to continue sending things to the source; > instead, it should halt for a while until the connection is rebuilt. > > When the dest main thread notices the failure, it kicks the fault thread > to switch to the paused state. > > Signed-off-by: Peter Xu Reviewed-by: Dr. 
David Alan Gilbert > --- > migration/migration.c | 1 + > migration/migration.h | 1 + > migration/postcopy-ram.c | 50 ++++++++++++++++++++++++++++++++++++++++++++---- > migration/savevm.c | 3 +++ > migration/trace-events | 2 ++ > 5 files changed, 53 insertions(+), 4 deletions(-) > > diff --git a/migration/migration.c b/migration/migration.c > index 9a0b5b0..9d93836 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -147,6 +147,7 @@ MigrationIncomingState *migration_incoming_get_current(void) > qemu_mutex_init(&mis_current.rp_mutex); > qemu_event_init(&mis_current.main_thread_load_event, false); > qemu_sem_init(&mis_current.postcopy_pause_sem_dst, 0); > + qemu_sem_init(&mis_current.postcopy_pause_sem_fault, 0); > once = true; > } > return &mis_current; > diff --git a/migration/migration.h b/migration/migration.h > index 047872b..574fedd 100644 > --- a/migration/migration.h > +++ b/migration/migration.h > @@ -63,6 +63,7 @@ struct MigrationIncomingState { > > /* notify PAUSED postcopy incoming migrations to try to continue */ > QemuSemaphore postcopy_pause_sem_dst; > + QemuSemaphore postcopy_pause_sem_fault; > }; > > MigrationIncomingState *migration_incoming_get_current(void); > diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c > index 9ce391d..ba53155 100644 > --- a/migration/postcopy-ram.c > +++ b/migration/postcopy-ram.c > @@ -418,6 +418,17 @@ static int ram_block_enable_notify(const char *block_name, void *host_addr, > return 0; > } > > +static bool postcopy_pause_fault_thread(MigrationIncomingState *mis) > +{ > + trace_postcopy_pause_fault_thread(); > + > + qemu_sem_wait(&mis->postcopy_pause_sem_fault); > + > + trace_postcopy_pause_fault_thread_continued(); > + > + return true; > +} > + > /* > * Handle faults detected by the USERFAULT markings > */ > @@ -465,6 +476,22 @@ static void *postcopy_ram_fault_thread(void *opaque) > } > } > > + if (!mis->to_src_file) { > + /* > + * Possibly someone tells us that the return path is > + 
* broken already using the event. We should hold until > + * the channel is rebuilt. > + */ > + if (postcopy_pause_fault_thread(mis)) { > + last_rb = NULL; > + /* Continue to read the userfaultfd */ > + } else { > + error_report("%s: paused but don't allow to continue", > + __func__); > + break; > + } > + } > + > ret = read(mis->userfault_fd, &msg, sizeof(msg)); > if (ret != sizeof(msg)) { > if (errno == EAGAIN) { > @@ -504,18 +531,33 @@ static void *postcopy_ram_fault_thread(void *opaque) > qemu_ram_get_idstr(rb), > rb_offset); > > +retry: > /* > * Send the request to the source - we want to request one > * of our host page sizes (which is >= TPS) > */ > if (rb != last_rb) { > last_rb = rb; > - migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), > - rb_offset, qemu_ram_pagesize(rb)); > + ret = migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb), > + rb_offset, qemu_ram_pagesize(rb)); > } else { > /* Save some space */ > - migrate_send_rp_req_pages(mis, NULL, > - rb_offset, qemu_ram_pagesize(rb)); > + ret = migrate_send_rp_req_pages(mis, NULL, > + rb_offset, qemu_ram_pagesize(rb)); > + } > + > + if (ret) { > + /* May be network failure, try to wait for recovery */ > + if (ret == -EIO && postcopy_pause_fault_thread(mis)) { > + /* We got reconnected somehow, try to continue */ > + last_rb = NULL; > + goto retry; > + } else { > + /* This is an unavoidable fault */ > + error_report("%s: migrate_send_rp_req_pages() get %d", > + __func__, ret); > + break; > + } > } > } > trace_postcopy_ram_fault_thread_exit(); > diff --git a/migration/savevm.c b/migration/savevm.c > index 1f62268..386788d 100644 > --- a/migration/savevm.c > +++ b/migration/savevm.c > @@ -1974,6 +1974,9 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis) > mis->to_src_file = NULL; > qemu_mutex_unlock(&mis->rp_mutex); > > + /* Notify the fault thread for the invalidated file handle */ > + postcopy_fault_thread_notify(mis); > + > while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) { > 
qemu_sem_wait(&mis->postcopy_pause_sem_dst); > } > diff --git a/migration/trace-events b/migration/trace-events > index a269eec..dbb4971 100644 > --- a/migration/trace-events > +++ b/migration/trace-events > @@ -100,6 +100,8 @@ open_return_path_on_source_continue(void) "" > postcopy_start(void) "" > postcopy_pause_return_path(void) "" > postcopy_pause_return_path_continued(void) "" > +postcopy_pause_fault_thread(void) "" > +postcopy_pause_fault_thread_continued(void) "" > postcopy_pause_continued(void) "" > postcopy_pause_incoming(void) "" > postcopy_pause_incoming_continued(void) "" > -- > 2.7.4 > -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK