From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:43137) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1ZD2M6-0005wB-75 for qemu-devel@nongnu.org; Wed, 08 Jul 2015 23:18:35 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1ZD2M4-0007ct-5U for qemu-devel@nongnu.org; Wed, 08 Jul 2015 23:18:34 -0400 Received: from [119.145.14.65] (port=38390 helo=szxga02-in.huawei.com) by eggs.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1ZD2M3-0007cH-FR for qemu-devel@nongnu.org; Wed, 08 Jul 2015 23:18:32 -0400 From: zhanghailiang Date: Thu, 9 Jul 2015 11:16:42 +0800 Message-ID: <1436411802-181876-35-git-send-email-zhang.zhanghailiang@huawei.com> In-Reply-To: <1436411802-181876-1-git-send-email-zhang.zhanghailiang@huawei.com> References: <1436411802-181876-1-git-send-email-zhang.zhanghailiang@huawei.com> MIME-Version: 1.0 Content-Type: text/plain Subject: [Qemu-devel] [PATCH COLO-Frame v7 34/34] COLO: Add block replication into colo process List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: qemu-devel@nongnu.org Cc: lizhijian@cn.fujitsu.com, quintela@redhat.com, yunhong.jiang@intel.com, eddie.dong@intel.com, peter.huangpeng@huawei.com, dgilbert@redhat.com, arei.gonglei@huawei.com, amit.shah@redhat.com, Yang Hongyang , zhanghailiang Make sure the master starts block replication only after the slave's block replication has started. 
Signed-off-by: zhanghailiang Signed-off-by: Wen Congyang Signed-off-by: Yang Hongyang Signed-off-by: Li Zhijian --- migration/colo.c | 71 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- trace-events | 2 ++ 2 files changed, 70 insertions(+), 3 deletions(-) diff --git a/migration/colo.c b/migration/colo.c index d6f2c73..067102f 100644 --- a/migration/colo.c +++ b/migration/colo.c @@ -20,6 +20,7 @@ #include "qapi-event.h" #include "net/colo-nic.h" #include "qmp-commands.h" +#include "block/block_int.h" /* * We should not do checkpoint one after another without any time interval, @@ -119,6 +120,8 @@ static bool colo_runstate_is_stopped(void) static void secondary_vm_do_failover(void) { int old_state; + Error *local_err = NULL; + /* Can not do failover during the process of VM's loading VMstate, Or * it will break the secondary VM. */ @@ -137,6 +140,12 @@ static void secondary_vm_do_failover(void) } colo_proxy_destroy(COLO_MODE_SECONDARY); + bdrv_stop_replication_all(true, &local_err); + if (local_err) { + error_report_err(local_err); + } + trace_colo_stop_block_replication("failover"); + /* It means that VM exit from COLO state */ colo = NULL; @@ -163,6 +172,7 @@ static void primary_vm_do_failover(void) { MigrationState *s = migrate_get_current(); int old_state; + Error *local_err = NULL; if (!colo_runstate_is_stopped()) { vm_stop_force_state(RUN_STATE_COLO); @@ -174,6 +184,12 @@ static void primary_vm_do_failover(void) migrate_set_state(s, MIGRATION_STATUS_COLO, MIGRATION_STATUS_COMPLETED); } + bdrv_stop_replication_all(true, &local_err); + if (local_err) { + error_report_err(local_err); + } + trace_colo_stop_block_replication("failover"); + vm_start(); old_state = failover_set_state(FAILOVER_STATUS_HANDLING, @@ -252,6 +268,7 @@ static int colo_do_checkpoint_transaction(MigrationState *s, QEMUFile *control) int colo_shutdown, ret; size_t size; QEMUFile *trans = NULL; + Error *local_err = NULL; ret = colo_ctl_put(s->file, COLO_CHECKPOINT_NEW); if (ret < 0) { 
@@ -305,6 +322,16 @@ static int colo_do_checkpoint_transaction(MigrationState *s, QEMUFile *control) goto out; } + /* we call this api although this may do nothing on primary side */ + qemu_mutex_lock_iothread(); + bdrv_do_checkpoint_all(&local_err); + qemu_mutex_unlock_iothread(); + if (local_err) { + error_report_err(local_err); + ret = -1; + goto out; + } + ret = colo_ctl_put(s->file, COLO_CHECKPOINT_SEND); if (ret < 0) { goto out; @@ -336,6 +363,10 @@ static int colo_do_checkpoint_transaction(MigrationState *s, QEMUFile *control) trace_colo_receive_message("COLO_CHECKPOINT_LOADED"); if (colo_shutdown) { + qemu_mutex_lock_iothread(); + bdrv_stop_replication_all(false, NULL); + trace_colo_stop_block_replication("shutdown"); + qemu_mutex_unlock_iothread(); colo_ctl_put(s->file, COLO_GUEST_SHUTDOWN); qemu_fflush(s->file); colo_shutdown_requested = 0; @@ -367,6 +398,7 @@ static void *colo_thread(void *opaque) QEMUFile *colo_control = NULL; int64_t current_time, checkpoint_time = qemu_clock_get_ms(QEMU_CLOCK_HOST); int i, ret; + Error *local_err = NULL; if (colo_proxy_init(COLO_MODE_PRIMARY) != 0) { error_report("Init colo proxy error"); @@ -398,6 +430,12 @@ static void *colo_thread(void *opaque) } qemu_mutex_lock_iothread(); + /* start block replication */ + bdrv_start_replication_all(REPLICATION_MODE_PRIMARY, &local_err); + if (local_err) { + goto out; + } + trace_colo_start_block_replication(); vm_start(); qemu_mutex_unlock_iothread(); trace_colo_vm_state_change("stop", "run"); @@ -448,7 +486,11 @@ do_checkpoint: } out: - error_report("colo: some error happens in colo_thread"); + if (local_err) { + error_report_err(local_err); + } else { + error_report("colo: some error happens in colo_thread"); + } qapi_event_send_colo_exit("primary", true, "unknown", NULL);; /* Give users time (2s) to get involved in this verdict */ for (i = 0; i < 10; i++) { @@ -533,6 +575,8 @@ static int colo_wait_handle_cmd(QEMUFile *f, int *checkpoint_request) case COLO_GUEST_SHUTDOWN: 
qemu_mutex_lock_iothread(); vm_stop_force_state(RUN_STATE_COLO); + bdrv_stop_replication_all(false, NULL); + trace_colo_stop_block_replication("shutdown"); qemu_system_shutdown_request_core(); qemu_mutex_unlock_iothread(); trace_colo_receive_message("COLO_GUEST_SHUTDOWN"); @@ -556,6 +600,7 @@ void *colo_process_incoming_checkpoints(void *opaque) QEMUFile *ctl = NULL, *fb = NULL; int i, ret; uint64_t total_size; + Error *local_err = NULL; qdev_hotplug = 0; @@ -585,6 +630,15 @@ void *colo_process_incoming_checkpoints(void *opaque) goto out; } + qemu_mutex_lock_iothread(); + /* start block replication */ + bdrv_start_replication_all(REPLICATION_MODE_SECONDARY, &local_err); + if (local_err) { + goto out; + } + qemu_mutex_unlock_iothread(); + trace_colo_start_block_replication(); + ret = colo_ctl_put(ctl, COLO_CHECPOINT_READY); if (ret < 0) { goto out; @@ -671,8 +725,15 @@ void *colo_process_incoming_checkpoints(void *opaque) goto out; } - vmstate_loading = false; + /* discard colo disk buffer */ + bdrv_do_checkpoint_all(&local_err); qemu_mutex_unlock_iothread(); + if (local_err) { + vmstate_loading = false; + goto out; + } + + vmstate_loading = false; if (failover_get_state() == FAILOVER_STATUS_RELAUNCH) { failover_set_state(FAILOVER_STATUS_RELAUNCH, FAILOVER_STATUS_NONE); @@ -695,7 +756,11 @@ void *colo_process_incoming_checkpoints(void *opaque) } out: - error_report("Detect some error or get a failover request"); + if (local_err) { + error_report_err(local_err); + } else { + error_report("Detect some error or get a failover request"); + } /* * Here, we raise a qmp event to the user, * It can help user to know what happens, and help deciding whether to diff --git a/trace-events b/trace-events index 66bcd08..914eb90 100644 --- a/trace-events +++ b/trace-events @@ -1477,6 +1477,8 @@ colo_vm_state_change(const char *old, const char *new) "Change '%s' => '%s'" colo_receive_message(const char *msg) "Receive '%s'" failover_set_state(int new_state) "new state %d" 
colo_rcv_pkt(int result) "Result of net packets comparing is different: %d" +colo_start_block_replication(void) "Block replication is started" +colo_stop_block_replication(const char *reason) "Block replication is stopped(reason: '%s')" # kvm-all.c kvm_ioctl(int type, void *arg) "type 0x%x, arg %p" -- 1.7.12.4