From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:40638) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1bNbbl-0001qO-Tp for qemu-devel@nongnu.org; Thu, 14 Jul 2016 04:02:59 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1bNbbh-00010m-0o for qemu-devel@nongnu.org; Thu, 14 Jul 2016 04:02:57 -0400 Received: from szxga03-in.huawei.com ([119.145.14.66]:52835) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1bNbbg-00010H-0g for qemu-devel@nongnu.org; Thu, 14 Jul 2016 04:02:52 -0400 References: <1452169208-840-1-git-send-email-zhang.zhanghailiang@huawei.com> <1452169208-840-14-git-send-email-zhang.zhanghailiang@huawei.com> <20160713175253.GF4573@work-vm> From: Hailiang Zhang Message-ID: <57874706.8050505@huawei.com> Date: Thu, 14 Jul 2016 16:02:14 +0800 MIME-Version: 1.0 In-Reply-To: <20160713175253.GF4573@work-vm> Content-Type: text/plain; charset="windows-1252"; format=flowed Content-Transfer-Encoding: 7bit Subject: Re: [Qemu-devel] [RFC 13/13] snapshot: Remove page's write-protect and copy the content during setup stage List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: "Dr. David Alan Gilbert" Cc: peter.huangpeng@huawei.com, qemu-devel@nongnu.org, aarcange@redhat.com, quintela@redhat.com, amit.shah@redhat.com, hanweidong@huawei.com On 2016/7/14 1:52, Dr. David Alan Gilbert wrote: > * zhanghailiang (zhang.zhanghailiang@huawei.com) wrote: >> If we modify VM's RAM (pages) during setup stage after enable write-protect >> notification in snapshot thread, the modification action will get stuck because >> we only remove the page's write-protect in savevm process, it blocked by itself. >> >> To fix this bug, we will remove page's write-protect in fault thread during >> the setup stage. Besides, we should not try to get global lock after setup stage, >> or there maybe an deadlock error. > > Hmm this complicates things a bit more doesn't it. > What's the order of: > a) setup > b) savings devices > c) Being able to transmit the pages? > > Are these pages that are being modified during setup, being modified > as part of the device state save? > Yes, I'm not sure if the problem still exist or not after exchanging the sequence of 'save devices' and 'enable ram notify'. I'll look into it. Hailaing > Dave > >> >> Signed-off-by: zhanghailiang >> --- >> include/migration/migration.h | 4 ++-- >> migration/migration.c | 2 +- >> migration/postcopy-ram.c | 17 ++++++++++++++++- >> migration/ram.c | 37 +++++++++++++++++++++++++++++++------ >> 4 files changed, 50 insertions(+), 10 deletions(-) >> >> diff --git a/include/migration/migration.h b/include/migration/migration.h >> index ef4c071..435de31 100644 >> --- a/include/migration/migration.h >> +++ b/include/migration/migration.h >> @@ -127,7 +127,7 @@ struct MigrationSrcPageRequest { >> RAMBlock *rb; >> hwaddr offset; >> hwaddr len; >> - >> + uint8_t *pages_copy_addr; >> QSIMPLEQ_ENTRY(MigrationSrcPageRequest) next_req; >> }; >> >> @@ -333,7 +333,7 @@ void global_state_store_running(void); >> >> void flush_page_queue(MigrationState *ms); >> int ram_save_queue_pages(MigrationState *ms, const char *rbname, >> - ram_addr_t start, ram_addr_t len); >> + ram_addr_t start, ram_addr_t len, bool copy_pages); >> >> PostcopyState postcopy_state_get(void); >> /* Set the state and return the old state */ >> diff --git a/migration/migration.c b/migration/migration.c >> index 3765c3b..bf4c7a1 100644 >> --- a/migration/migration.c >> +++ b/migration/migration.c >> @@ -1248,7 +1248,7 @@ static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname, >> return; >> } >> >> - if (ram_save_queue_pages(ms, rbname, start, len)) { >> + if (ram_save_queue_pages(ms, rbname, start, len, false)) { >> mark_source_rp_bad(ms); >> } >> } >> diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c >> index 61392d3..2cf477d 100644 >> --- a/migration/postcopy-ram.c >> +++ b/migration/postcopy-ram.c >> @@ -543,13 +543,28 @@ static void *postcopy_ram_fault_thread(void *opaque) >> MigrationState *ms = container_of(us, MigrationState, >> userfault_state); >> ret = ram_save_queue_pages(ms, qemu_ram_get_idstr(rb), rb_offset, >> - hostpagesize); >> + hostpagesize, true); >> >> if (ret < 0) { >> error_report("%s: Save: %"PRIx64 " failed!", >> __func__, (uint64_t)msg.arg.pagefault.address); >> break; >> } >> + >> + /* Note: In the setup process, snapshot_thread may modify VM's >> + * write-protected pages, we should not block it there, or there >> + * will be an deadlock error. >> + */ >> + if (migration_in_setup(ms)) { >> + uint64_t host = msg.arg.pagefault.address; >> + >> + host &= ~(hostpagesize - 1); >> + ret = ram_set_pages_wp(host, getpagesize(), true, >> + us->userfault_fd); >> + if (ret < 0) { >> + error_report("Remove page's write-protect failed"); >> + } >> + } >> } >> } >> trace_postcopy_ram_fault_thread_exit(); >> diff --git a/migration/ram.c b/migration/ram.c >> index 8656719..747f9aa 100644 >> --- a/migration/ram.c >> +++ b/migration/ram.c >> @@ -233,6 +233,7 @@ struct PageSearchStatus { >> ram_addr_t offset; >> /* Set once we wrap around */ >> bool complete_round; >> + uint8_t *pages_copy; >> }; >> typedef struct PageSearchStatus PageSearchStatus; >> >> @@ -742,7 +743,12 @@ static int ram_save_page(QEMUFile *f, PageSearchStatus *pss, >> RAMBlock *block = pss->block; >> ram_addr_t offset = pss->offset; >> >> - p = block->host + offset; >> + /* If we have a copy of this page, use the backup page first */ >> + if (pss->pages_copy) { >> + p = pss->pages_copy; >> + } else { >> + p = block->host + offset; >> + } >> >> /* In doubt sent page as normal */ >> bytes_xmit = 0; >> @@ -926,7 +932,12 @@ static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, >> RAMBlock *block = pss->block; >> ram_addr_t offset = pss->offset; >> >> - p = block->host + offset; >> + /* If we have a copy of this page, use the backup first */ >> + if (pss->pages_copy) { >> + p = pss->pages_copy; >> + } else { >> + p = block->host + offset; >> + } >> >> bytes_xmit = 0; >> ret = ram_control_save_page(f, block->offset, >> @@ -1043,7 +1054,7 @@ static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss, >> * Returns: block (or NULL if none available) >> */ >> static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset, >> - ram_addr_t *ram_addr_abs) >> + ram_addr_t *ram_addr_abs, uint8 **pages_copy_addr) >> { >> RAMBlock *block = NULL; >> >> @@ -1055,7 +1066,7 @@ static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset, >> *offset = entry->offset; >> *ram_addr_abs = (entry->offset + entry->rb->offset) & >> TARGET_PAGE_MASK; >> - >> + *pages_copy_addr = entry->pages_copy_addr; >> if (entry->len > TARGET_PAGE_SIZE) { >> entry->len -= TARGET_PAGE_SIZE; >> entry->offset += TARGET_PAGE_SIZE; >> @@ -1086,9 +1097,10 @@ static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss, >> RAMBlock *block; >> ram_addr_t offset; >> bool dirty; >> + uint8 *pages_backup_addr = NULL; >> >> do { >> - block = unqueue_page(ms, &offset, ram_addr_abs); >> + block = unqueue_page(ms, &offset, ram_addr_abs, &pages_backup_addr); >> /* >> * We're sending this page, and since it's postcopy nothing else >> * will dirty it, and we must make sure it doesn't get sent again >> @@ -1130,6 +1142,7 @@ static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss, >> */ >> pss->block = block; >> pss->offset = offset; >> + pss->pages_copy = pages_backup_addr; >> } >> >> return !!block; >> @@ -1166,7 +1179,7 @@ void flush_page_queue(MigrationState *ms) >> * Return: 0 on success >> */ >> int ram_save_queue_pages(MigrationState *ms, const char *rbname, >> - ram_addr_t start, ram_addr_t len) >> + ram_addr_t start, ram_addr_t len, bool copy_pages) >> { >> RAMBlock *ramblock; >> >> @@ -1206,6 +1219,17 @@ int ram_save_queue_pages(MigrationState *ms, const char *rbname, >> new_entry->rb = ramblock; >> new_entry->offset = start; >> new_entry->len = len; >> + if (copy_pages) { >> + /* Fix me: Better to realize a memory pool */ >> + new_entry->pages_copy_addr = g_try_malloc0(len); >> + >> + if (!new_entry->pages_copy_addr) { >> + error_report("%s: Failed to alloc memory", __func__); >> + return -1; >> + } >> + >> + memcpy(new_entry->pages_copy_addr, ramblock_ptr(ramblock, start), len); >> + } >> >> memory_region_ref(ramblock->mr); >> qemu_mutex_lock(&ms->src_page_req_mutex); >> @@ -1342,6 +1366,7 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage, >> pss.block = last_seen_block; >> pss.offset = last_offset; >> pss.complete_round = false; >> + pss.pages_copy = NULL; >> >> if (!pss.block) { >> pss.block = QLIST_FIRST_RCU(&ram_list.blocks); >> -- >> 1.8.3.1 >> >> > -- > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK > > . >