* [RFC Patch v4 01/18] move remus related codes to libxl_remus.c
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
@ 2014-10-24 7:05 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 02/18] rename remus device to checkpoint device Wen Congyang
` (18 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:05 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Shriram Rajagopalan, Yang Hongyang, Lai Jiangshan
Move the Remus-related code into libxl_remus.c. libxl_domain_remus_start() is an external API, so it is not moved and stays in libxl.c.
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Cc: Shriram Rajagopalan <rshriram@cs.ubc.ca>
---
tools/libxl/Makefile | 2 +-
tools/libxl/libxl.c | 57 +--------
tools/libxl/libxl_dom.c | 220 +-------------------------------
tools/libxl/libxl_remus.c | 318 ++++++++++++++++++++++++++++++++++++++++++++++
tools/libxl/libxl_remus.h | 28 ++++
5 files changed, 352 insertions(+), 273 deletions(-)
create mode 100644 tools/libxl/libxl_remus.c
create mode 100644 tools/libxl/libxl_remus.h
diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile
index df08c8a..4fb98ab 100644
--- a/tools/libxl/Makefile
+++ b/tools/libxl/Makefile
@@ -56,7 +56,7 @@ else
LIBXL_OBJS-y += libxl_nonetbuffer.o
endif
-LIBXL_OBJS-y += libxl_remus_device.o libxl_remus_disk_drbd.o
+LIBXL_OBJS-y += libxl_remus.o libxl_remus_device.o libxl_remus_disk_drbd.o
LIBXL_OBJS-$(CONFIG_X86) += libxl_cpuid.o libxl_x86.o libxl_psr.o
LIBXL_OBJS-$(CONFIG_ARM) += libxl_nocpuid.o libxl_arm.o
diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index e0522f7..b3d01f8 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -17,6 +17,7 @@
#include "libxl_osdeps.h"
#include "libxl_internal.h"
+#include "libxl_remus.h"
#define PAGE_TO_MEMKB(pages) ((pages) * 4)
#define BACKEND_STRING_SIZE 5
@@ -808,11 +809,6 @@ out:
GC_FREE;
return ptr;
}
-
-static void libxl__remus_setup_done(libxl__egc *egc,
- libxl__remus_devices_state *rds, int rc);
-static void libxl__remus_setup_failed(libxl__egc *egc,
- libxl__remus_devices_state *rds, int rc);
static void remus_failover_cb(libxl__egc *egc,
libxl__domain_suspend_state *dss, int rc);
@@ -861,63 +857,14 @@ int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info,
assert(info);
- /* Convenience aliases */
- libxl__remus_devices_state *const rds = &dss->rds;
-
- if (libxl_defbool_val(info->netbuf)) {
- if (!libxl__netbuffer_enabled(gc)) {
- LOG(ERROR, "Remus: No support for network buffering");
- rc = ERROR_FAIL;
- goto out;
- }
- rds->device_kind_flags |= (1 << LIBXL__DEVICE_KIND_VIF);
- }
-
- if (libxl_defbool_val(info->diskbuf))
- rds->device_kind_flags |= (1 << LIBXL__DEVICE_KIND_VBD);
-
- rds->ao = ao;
- rds->domid = domid;
- rds->callback = libxl__remus_setup_done;
-
/* Point of no return */
- libxl__remus_devices_setup(egc, rds);
+ libxl__remus_setup(egc, dss);
return AO_INPROGRESS;
out:
return AO_ABORT(rc);
}
-static void libxl__remus_setup_done(libxl__egc *egc,
- libxl__remus_devices_state *rds, int rc)
-{
- libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
- STATE_AO_GC(dss->ao);
-
- if (!rc) {
- libxl__domain_suspend(egc, dss);
- return;
- }
-
- LOG(ERROR, "Remus: failed to setup device for guest with domid %u, rc %d",
- dss->domid, rc);
- rds->callback = libxl__remus_setup_failed;
- libxl__remus_devices_teardown(egc, rds);
-}
-
-static void libxl__remus_setup_failed(libxl__egc *egc,
- libxl__remus_devices_state *rds, int rc)
-{
- libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
- STATE_AO_GC(dss->ao);
-
- if (rc)
- LOG(ERROR, "Remus: failed to teardown device after setup failed"
- " for guest with domid %u, rc %d", dss->domid, rc);
-
- dss->callback(egc, dss, rc);
-}
-
static void remus_failover_cb(libxl__egc *egc,
libxl__domain_suspend_state *dss, int rc)
{
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index 224f865..30c0de0 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -19,6 +19,7 @@
#include "libxl_internal.h"
#include "libxl_arch.h"
+#include "libxl_remus.h"
#include <xc_dom.h>
#include <xen/hvm/hvm_info_table.h>
@@ -1640,194 +1641,6 @@ static void domain_suspend_callback_common_done(libxl__egc *egc,
libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
}
-/*----- remus callbacks -----*/
-static void remus_domain_suspend_callback_common_done(libxl__egc *egc,
- libxl__domain_suspend_state2 *dss2, int ok);
-static void remus_devices_postsuspend_cb(libxl__egc *egc,
- libxl__remus_devices_state *rds,
- int rc);
-static void remus_devices_preresume_cb(libxl__egc *egc,
- libxl__remus_devices_state *rds,
- int rc);
-
-static void libxl__remus_domain_suspend_callback(void *data)
-{
- libxl__save_helper_state *shs = data;
- libxl__egc *egc = shs->egc;
- libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
-
- /* Convenience aliases */
- libxl__domain_suspend_state2 *const dss2 = &dss->dss2;
-
- dss2->callback_common_done = remus_domain_suspend_callback_common_done;
- domain_suspend_callback_common(egc, dss2);
-}
-
-static void remus_domain_suspend_callback_common_done(libxl__egc *egc,
- libxl__domain_suspend_state2 *dss2, int ok)
-{
- libxl__domain_suspend_state *dss = CONTAINER_OF(dss2, *dss, dss2);
-
- if (!ok)
- goto out;
-
- libxl__remus_devices_state *const rds = &dss->rds;
- rds->callback = remus_devices_postsuspend_cb;
- libxl__remus_devices_postsuspend(egc, rds);
- return;
-
-out:
- libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
-}
-
-static void remus_devices_postsuspend_cb(libxl__egc *egc,
- libxl__remus_devices_state *rds,
- int rc)
-{
- int ok = 0;
- libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
-
- if (rc)
- goto out;
-
- ok = 1;
-
-out:
- libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
-}
-
-static void libxl__remus_domain_resume_callback(void *data)
-{
- libxl__save_helper_state *shs = data;
- libxl__egc *egc = shs->egc;
- libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
- STATE_AO_GC(dss->ao);
-
- libxl__remus_devices_state *const rds = &dss->rds;
- rds->callback = remus_devices_preresume_cb;
- libxl__remus_devices_preresume(egc, rds);
-}
-
-static void remus_devices_preresume_cb(libxl__egc *egc,
- libxl__remus_devices_state *rds,
- int rc)
-{
- int ok = 0;
- libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
- STATE_AO_GC(dss->ao);
-
- if (rc)
- goto out;
-
- /* Resumes the domain and the device model */
- rc = libxl__domain_resume(gc, dss->domid, /* Fast Suspend */1);
- if (rc)
- goto out;
-
- ok = 1;
-
-out:
- libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
-}
-
-/*----- remus asynchronous checkpoint callback -----*/
-
-static void remus_checkpoint_dm_saved(libxl__egc *egc,
- libxl__domain_suspend_state *dss, int rc);
-static void remus_devices_commit_cb(libxl__egc *egc,
- libxl__remus_devices_state *rds,
- int rc);
-static void remus_next_checkpoint(libxl__egc *egc, libxl__ev_time *ev,
- const struct timeval *requested_abs);
-
-static void libxl__remus_domain_checkpoint_callback(void *data)
-{
- libxl__save_helper_state *shs = data;
- libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
- libxl__egc *egc = dss->shs.egc;
- STATE_AO_GC(dss->ao);
-
- /* This would go into tailbuf. */
- if (dss->hvm) {
- libxl__domain_save_device_model(egc, dss, remus_checkpoint_dm_saved);
- } else {
- remus_checkpoint_dm_saved(egc, dss, 0);
- }
-}
-
-static void remus_checkpoint_dm_saved(libxl__egc *egc,
- libxl__domain_suspend_state *dss, int rc)
-{
- /* Convenience aliases */
- libxl__remus_devices_state *const rds = &dss->rds;
-
- STATE_AO_GC(dss->ao);
-
- if (rc) {
- LOG(ERROR, "Failed to save device model. Terminating Remus..");
- goto out;
- }
-
- rds->callback = remus_devices_commit_cb;
- libxl__remus_devices_commit(egc, rds);
-
- return;
-
-out:
- libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 0);
-}
-
-static void remus_devices_commit_cb(libxl__egc *egc,
- libxl__remus_devices_state *rds,
- int rc)
-{
- libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
-
- STATE_AO_GC(dss->ao);
-
- if (rc) {
- LOG(ERROR, "Failed to do device commit op."
- " Terminating Remus..");
- goto out;
- }
-
- /*
- * At this point, we have successfully checkpointed the guest and
- * committed it at the backup. We'll come back after the checkpoint
- * interval to checkpoint the guest again. Until then, let the guest
- * continue execution.
- */
-
- /* Set checkpoint interval timeout */
- rc = libxl__ev_time_register_rel(gc, &dss->checkpoint_timeout,
- remus_next_checkpoint,
- dss->interval);
-
- if (rc)
- goto out;
-
- return;
-
-out:
- libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 0);
-}
-
-static void remus_next_checkpoint(libxl__egc *egc, libxl__ev_time *ev,
- const struct timeval *requested_abs)
-{
- libxl__domain_suspend_state *dss =
- CONTAINER_OF(ev, *dss, checkpoint_timeout);
-
- STATE_AO_GC(dss->ao);
-
- /*
- * Time to checkpoint the guest again. We return 1 to libxc
- * (xc_domain_save.c). in order to continue executing the infinite loop
- * (suspend, checkpoint, resume) in xc_domain_save().
- */
- libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 1);
-}
-
/*----- main code for suspending, in order of execution -----*/
void libxl__domain_suspend2(libxl__egc *egc,
libxl__domain_suspend_state2 *dss2)
@@ -2051,10 +1864,6 @@ static void save_device_model_datacopier_done(libxl__egc *egc,
dss->save_dm_callback(egc, dss, our_rc);
}
-static void remus_teardown_done(libxl__egc *egc,
- libxl__remus_devices_state *rds,
- int rc);
-
static void domain_suspend_done(libxl__egc *egc,
libxl__domain_suspend_state *dss, int rc)
{
@@ -2070,34 +1879,11 @@ static void domain_suspend_done(libxl__egc *egc,
xc_suspend_evtchn_release(CTX->xch, CTX->xce, domid,
dss2->guest_evtchn.port, &dss2->guest_evtchn_lockfd);
- if (!dss->remus) {
- remus_teardown_done(egc, &dss->rds, rc);
+ if (dss->remus) {
+ libxl__remus_teardown(egc, dss, rc);
return;
}
- /*
- * With Remus, if we reach this point, it means either
- * backup died or some network error occurred preventing us
- * from sending checkpoints. Teardown the network buffers and
- * release netlink resources. This is an async op.
- */
- LOG(WARN, "Remus: Domain suspend terminated with rc %d,"
- " teardown Remus devices...", rc);
- dss->rds.callback = remus_teardown_done;
- libxl__remus_devices_teardown(egc, &dss->rds);
-}
-
-static void remus_teardown_done(libxl__egc *egc,
- libxl__remus_devices_state *rds,
- int rc)
-{
- libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
- STATE_AO_GC(dss->ao);
-
- if (rc)
- LOG(ERROR, "Remus: failed to teardown device for guest with domid %u,"
- " rc %d", dss->domid, rc);
-
dss->callback(egc, dss, rc);
}
diff --git a/tools/libxl/libxl_remus.c b/tools/libxl/libxl_remus.c
new file mode 100644
index 0000000..b555715
--- /dev/null
+++ b/tools/libxl/libxl_remus.c
@@ -0,0 +1,318 @@
+/*
+ * Copyright (C) 2014 FUJITSU LIMITED
+ * Author Wen Congyang <wency@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#include "libxl_internal.h"
+#include "libxl_remus.h"
+
+
+/*----- remus: setup the environment -----*/
+static void libxl__remus_setup_done(libxl__egc *egc,
+ libxl__remus_devices_state *rds, int rc);
+static void libxl__remus_setup_failed(libxl__egc *egc,
+ libxl__remus_devices_state *rds, int rc);
+
+void libxl__remus_setup(libxl__egc *egc,
+ libxl__domain_suspend_state *dss)
+{
+ /* Convenience aliases */
+ libxl__remus_devices_state *const rds = &dss->rds;
+ const libxl_domain_remus_info *const info = dss->remus;
+
+ STATE_AO_GC(dss->ao);
+ /* Select the device kinds to buffer from the caller's Remus options. */
+ if (libxl_defbool_val(info->netbuf)) {
+ if (!libxl__netbuffer_enabled(gc)) {
+ LOG(ERROR, "Remus: No support for network buffering");
+ goto out;
+ }
+ rds->device_kind_flags |= (1 << LIBXL__DEVICE_KIND_VIF);
+ }
+
+ if (libxl_defbool_val(info->diskbuf))
+ rds->device_kind_flags |= (1 << LIBXL__DEVICE_KIND_VBD);
+
+ rds->ao = ao;
+ rds->domid = dss->domid;
+ rds->callback = libxl__remus_setup_done;
+ /* The outcome is reported to libxl__remus_setup_done() via rds->callback. */
+ libxl__remus_devices_setup(egc, rds);
+ return;
+
+out: /* nothing was set up, so no teardown is involved; just report failure */
+ dss->callback(egc, dss, ERROR_FAIL);
+}
+
+static void libxl__remus_setup_done(libxl__egc *egc,
+ libxl__remus_devices_state *rds, int rc)
+{
+ libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
+ STATE_AO_GC(dss->ao);
+
+ if (!rc) {
+ libxl__domain_suspend(egc, dss);
+ return;
+ }
+
+ LOG(ERROR, "Remus: failed to setup device for guest with domid %u, rc %d",
+ dss->domid, rc);
+ rds->callback = libxl__remus_setup_failed;
+ libxl__remus_devices_teardown(egc, rds);
+}
+
+static void libxl__remus_setup_failed(libxl__egc *egc,
+ libxl__remus_devices_state *rds,
+ int rc)
+{
+ libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
+ STATE_AO_GC(dss->ao);
+
+ if (rc)
+ LOG(ERROR, "Remus: failed to teardown device after setup failed"
+ " for guest with domid %u, rc %d", dss->domid, rc);
+
+ dss->callback(egc, dss, rc);
+}
+
+
+/*----- remus: teardown the environment -----*/
+static void remus_teardown_done(libxl__egc *egc,
+ libxl__remus_devices_state *rds,
+ int rc);
+
+void libxl__remus_teardown(libxl__egc *egc,
+ libxl__domain_suspend_state *dss,
+ int rc)
+{
+ EGC_GC;
+
+ /*
+ * Reaching this point means that either the backup died or a network
+ * error prevented us from sending checkpoints. Tear down the network
+ * buffers and release netlink resources. This is an async op; its
+ * result is passed to remus_teardown_done().
+ */
+ LOG(WARN, "Remus: Domain suspend terminated with rc %d,"
+ " teardown Remus devices...", rc);
+ dss->rds.callback = remus_teardown_done;
+ libxl__remus_devices_teardown(egc, &dss->rds);
+}
+
+static void remus_teardown_done(libxl__egc *egc,
+ libxl__remus_devices_state *rds,
+ int rc)
+{
+ libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
+ STATE_AO_GC(dss->ao);
+
+ if (rc)
+ LOG(ERROR, "Remus: failed to teardown device for guest with domid %u,"
+ " rc %d", dss->domid, rc);
+
+ dss->callback(egc, dss, rc);
+}
+
+
+/*----- remus: suspend the guest -----*/
+static void remus_domain_suspend_callback_common_done(libxl__egc *egc,
+ libxl__domain_suspend_state2 *dss2, int ok);
+static void remus_devices_postsuspend_cb(libxl__egc *egc,
+ libxl__remus_devices_state *rds,
+ int rc);
+
+void libxl__remus_domain_suspend_callback(void *data)
+{
+ libxl__save_helper_state *shs = data;
+ libxl__egc *egc = shs->egc;
+ libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
+
+ /* Convenience aliases */
+ libxl__domain_suspend_state2 *const dss2 = &dss->dss2;
+
+ dss2->callback_common_done = remus_domain_suspend_callback_common_done;
+ libxl__domain_suspend2(egc, dss2);
+}
+
+static void remus_domain_suspend_callback_common_done(libxl__egc *egc,
+ libxl__domain_suspend_state2 *dss2, int ok)
+{
+ libxl__domain_suspend_state *dss = CONTAINER_OF(dss2, *dss, dss2);
+
+ if (!ok)
+ goto out;
+
+ libxl__remus_devices_state *const rds = &dss->rds;
+ rds->callback = remus_devices_postsuspend_cb;
+ libxl__remus_devices_postsuspend(egc, rds);
+ return;
+
+out:
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
+}
+
+static void remus_devices_postsuspend_cb(libxl__egc *egc,
+ libxl__remus_devices_state *rds,
+ int rc)
+{
+ int ok = 0;
+ libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
+
+ if (rc)
+ goto out;
+
+ ok = 1;
+
+out:
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
+}
+
+
+/*----- remus: resume the guest -----*/
+static void remus_devices_preresume_cb(libxl__egc *egc,
+ libxl__remus_devices_state *rds,
+ int rc);
+
+void libxl__remus_domain_resume_callback(void *data)
+{
+ libxl__save_helper_state *shs = data;
+ libxl__egc *egc = shs->egc;
+ libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
+ STATE_AO_GC(dss->ao);
+
+ libxl__remus_devices_state *const rds = &dss->rds;
+ rds->callback = remus_devices_preresume_cb;
+ libxl__remus_devices_preresume(egc, rds);
+}
+
+static void remus_devices_preresume_cb(libxl__egc *egc,
+ libxl__remus_devices_state *rds,
+ int rc)
+{
+ int ok = 0;
+ libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
+ STATE_AO_GC(dss->ao);
+
+ if (rc)
+ goto out;
+
+ /* Resumes the domain and the device model */
+ rc = libxl__domain_resume(gc, dss->domid, /* Fast Suspend */1);
+ if (rc)
+ goto out;
+
+ ok = 1;
+
+out:
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
+}
+
+
+/*----- remus: wait for a new checkpoint -----*/
+static void remus_checkpoint_dm_saved(libxl__egc *egc,
+ libxl__domain_suspend_state *dss, int rc);
+static void remus_devices_commit_cb(libxl__egc *egc,
+ libxl__remus_devices_state *rds,
+ int rc);
+static void remus_next_checkpoint(libxl__egc *egc, libxl__ev_time *ev,
+ const struct timeval *requested_abs);
+
+void libxl__remus_domain_checkpoint_callback(void *data)
+{
+ libxl__save_helper_state *shs = data;
+ libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
+ libxl__egc *egc = dss->shs.egc;
+ STATE_AO_GC(dss->ao);
+
+ /* This would go into tailbuf. */
+ if (dss->hvm) {
+ libxl__domain_save_device_model(egc, dss, remus_checkpoint_dm_saved);
+ } else {
+ remus_checkpoint_dm_saved(egc, dss, 0);
+ }
+}
+
+static void remus_checkpoint_dm_saved(libxl__egc *egc,
+ libxl__domain_suspend_state *dss, int rc)
+{
+ /* Convenience aliases */
+ libxl__remus_devices_state *const rds = &dss->rds;
+
+ STATE_AO_GC(dss->ao);
+
+ if (rc) {
+ LOG(ERROR, "Failed to save device model. Terminating Remus..");
+ goto out;
+ }
+
+ rds->callback = remus_devices_commit_cb;
+ libxl__remus_devices_commit(egc, rds);
+
+ return;
+
+out:
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 0);
+}
+
+static void remus_devices_commit_cb(libxl__egc *egc,
+ libxl__remus_devices_state *rds,
+ int rc)
+{
+ libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
+
+ STATE_AO_GC(dss->ao);
+
+ if (rc) {
+ LOG(ERROR, "Failed to do device commit op."
+ " Terminating Remus..");
+ goto out;
+ }
+
+ /*
+ * At this point, we have successfully checkpointed the guest and
+ * committed it at the backup. We'll come back after the checkpoint
+ * interval to checkpoint the guest again. Until then, let the guest
+ * continue execution.
+ */
+
+ /* Set checkpoint interval timeout */
+ rc = libxl__ev_time_register_rel(gc, &dss->checkpoint_timeout,
+ remus_next_checkpoint,
+ dss->interval);
+
+ if (rc)
+ goto out;
+
+ return;
+
+out:
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 0);
+}
+
+static void remus_next_checkpoint(libxl__egc *egc, libxl__ev_time *ev,
+ const struct timeval *requested_abs)
+{
+ libxl__domain_suspend_state *dss =
+ CONTAINER_OF(ev, *dss, checkpoint_timeout);
+
+ STATE_AO_GC(dss->ao);
+
+ /*
+ * Time to checkpoint the guest again. We return 1 to libxc
+ * (xc_domain_save.c) in order to continue executing the infinite loop
+ * (suspend, checkpoint, resume) in xc_domain_save().
+ */
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 1);
+}
diff --git a/tools/libxl/libxl_remus.h b/tools/libxl/libxl_remus.h
new file mode 100644
index 0000000..53e5e81
--- /dev/null
+++ b/tools/libxl/libxl_remus.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2014 FUJITSU LIMITED
+ * Author Wen Congyang <wency@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#ifndef LIBXL_REMUS_H
+#define LIBXL_REMUS_H
+
+void libxl__remus_setup(libxl__egc *egc,
+ libxl__domain_suspend_state *dss);
+void libxl__remus_teardown(libxl__egc *egc,
+ libxl__domain_suspend_state *dss,
+ int rc);
+void libxl__remus_domain_suspend_callback(void *data);
+void libxl__remus_domain_resume_callback(void *data);
+void libxl__remus_domain_checkpoint_callback(void *data);
+
+#endif
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread
* [RFC Patch v4 02/18] rename remus device to checkpoint device
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
2014-10-24 7:05 ` [RFC Patch v4 01/18] move remus related codes to libxl_remus.c Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 03/18] adjust the indentation Wen Congyang
` (17 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Shriram Rajagopalan, Yang Hongyang, Lai Jiangshan
This patch is auto-generated by the following commands:
1. git mv tools/libxl/libxl_remus_device.c tools/libxl/libxl_checkpoint_device.c
2. perl -pi -e 's/libxl_remus_device/libxl_checkpoint_device/g' tools/libxl/Makefile
3. perl -pi -e 's/\blibxl__remus_devices/libxl__checkpoint_devices/g' tools/libxl/*.[ch]
4. perl -pi -e 's/\blibxl__remus_device\b/libxl__checkpoint_device/g' tools/libxl/*.[ch]
5. perl -pi -e 's/\blibxl__remus_device_instance_ops\b/libxl__checkpoint_device_instance_ops/g' tools/libxl/*.[ch]
6. perl -pi -e 's/\blibxl__remus_callback\b/libxl__checkpoint_callback/g' tools/libxl/*.[ch]
7. perl -pi -e 's/\bremus_device_init\b/checkpoint_device_init/g' tools/libxl/*.[ch]
8. perl -pi -e 's/\bremus_devices_setup\b/checkpoint_devices_setup/g' tools/libxl/*.[ch]
9. perl -pi -e 's/\bdefine_remus_checkpoint_api\b/define_checkpoint_api/g' tools/libxl/*.[ch]
10. perl -pi -e 's/\brds\b/cds/g' tools/libxl/*.[ch]
11. perl -pi -e 's/REMUS_DEVICE/CHECKPOINT_DEVICE/g' tools/libxl/*.[ch] tools/libxl/*.idl
12. perl -pi -e 's/REMUS_DEVOPS/CHECKPOINT_DEVOPS/g' tools/libxl/*.[ch] tools/libxl/*.idl
13. perl -pi -e 's/\bremus\b/checkpoint/g' tools/libxl/libxl_checkpoint_device.[ch]
14. perl -pi -e 's/\bremus device/checkpoint device/g' tools/libxl/libxl_internal.h
15. perl -pi -e 's/\bRemus device/checkpoint device/g' tools/libxl/libxl_internal.h
16. perl -pi -e 's/\bremus abstract/checkpoint abstract/g' tools/libxl/libxl_internal.h
17. perl -pi -e 's/\bremus invocation/checkpoint invocation/g' tools/libxl/libxl_internal.h
18. perl -pi -e 's/\blibxl__remus_device_\(/libxl__checkpoint_device_(/g' tools/libxl/libxl_internal.h
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Cc: Shriram Rajagopalan <rshriram@cs.ubc.ca>
---
tools/libxl/Makefile | 2 +-
...xl_remus_device.c => libxl_checkpoint_device.c} | 198 ++++++++++-----------
tools/libxl/libxl_internal.h | 112 ++++++------
tools/libxl/libxl_netbuffer.c | 108 +++++------
tools/libxl/libxl_nonetbuffer.c | 10 +-
tools/libxl/libxl_remus.c | 78 ++++----
tools/libxl/libxl_remus_disk_drbd.c | 52 +++---
tools/libxl/libxl_types.idl | 4 +-
8 files changed, 282 insertions(+), 282 deletions(-)
rename tools/libxl/{libxl_remus_device.c => libxl_checkpoint_device.c} (52%)
diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile
index 4fb98ab..c970e7e 100644
--- a/tools/libxl/Makefile
+++ b/tools/libxl/Makefile
@@ -56,7 +56,7 @@ else
LIBXL_OBJS-y += libxl_nonetbuffer.o
endif
-LIBXL_OBJS-y += libxl_remus.o libxl_remus_device.o libxl_remus_disk_drbd.o
+LIBXL_OBJS-y += libxl_remus.o libxl_checkpoint_device.o libxl_remus_disk_drbd.o
LIBXL_OBJS-$(CONFIG_X86) += libxl_cpuid.o libxl_x86.o libxl_psr.o
LIBXL_OBJS-$(CONFIG_ARM) += libxl_nocpuid.o libxl_arm.o
diff --git a/tools/libxl/libxl_remus_device.c b/tools/libxl/libxl_checkpoint_device.c
similarity index 52%
rename from tools/libxl/libxl_remus_device.c
rename to tools/libxl/libxl_checkpoint_device.c
index a6cb7f6..109cd23 100644
--- a/tools/libxl/libxl_remus_device.c
+++ b/tools/libxl/libxl_checkpoint_device.c
@@ -17,9 +17,9 @@
#include "libxl_internal.h"
-extern const libxl__remus_device_instance_ops remus_device_nic;
-extern const libxl__remus_device_instance_ops remus_device_drbd_disk;
-static const libxl__remus_device_instance_ops *remus_ops[] = {
+extern const libxl__checkpoint_device_instance_ops remus_device_nic;
+extern const libxl__checkpoint_device_instance_ops remus_device_drbd_disk;
+static const libxl__checkpoint_device_instance_ops *remus_ops[] = {
&remus_device_nic,
&remus_device_drbd_disk,
NULL,
@@ -27,18 +27,18 @@ static const libxl__remus_device_instance_ops *remus_ops[] = {
/*----- helper functions -----*/
-static int init_device_subkind(libxl__remus_devices_state *rds)
+static int init_device_subkind(libxl__checkpoint_devices_state *cds)
{
/* init device subkind-specific state in the libxl ctx */
int rc;
- STATE_AO_GC(rds->ao);
+ STATE_AO_GC(cds->ao);
if (libxl__netbuffer_enabled(gc)) {
- rc = init_subkind_nic(rds);
+ rc = init_subkind_nic(cds);
if (rc) goto out;
}
- rc = init_subkind_drbd_disk(rds);
+ rc = init_subkind_drbd_disk(cds);
if (rc) goto out;
rc = 0;
@@ -46,15 +46,15 @@ out:
return rc;
}
-static void cleanup_device_subkind(libxl__remus_devices_state *rds)
+static void cleanup_device_subkind(libxl__checkpoint_devices_state *cds)
{
/* cleanup device subkind-specific state in the libxl ctx */
- STATE_AO_GC(rds->ao);
+ STATE_AO_GC(cds->ao);
if (libxl__netbuffer_enabled(gc))
- cleanup_subkind_nic(rds);
+ cleanup_subkind_nic(cds);
- cleanup_subkind_drbd_disk(rds);
+ cleanup_subkind_drbd_disk(cds);
}
/*----- setup() and teardown() -----*/
@@ -70,103 +70,103 @@ static void devices_teardown_cb(libxl__egc *egc,
libxl__multidev *multidev,
int rc);
-/* remus device setup and teardown */
+/* checkpoint device setup and teardown */
-static libxl__remus_device* remus_device_init(libxl__egc *egc,
- libxl__remus_devices_state *rds,
+static libxl__checkpoint_device* checkpoint_device_init(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
libxl__device_kind kind,
void *libxl_dev)
{
- libxl__remus_device *dev = NULL;
+ libxl__checkpoint_device *dev = NULL;
- STATE_AO_GC(rds->ao);
+ STATE_AO_GC(cds->ao);
GCNEW(dev);
dev->backend_dev = libxl_dev;
dev->kind = kind;
- dev->rds = rds;
+ dev->cds = cds;
return dev;
}
-static void remus_devices_setup(libxl__egc *egc,
- libxl__remus_devices_state *rds);
+static void checkpoint_devices_setup(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds);
-void libxl__remus_devices_setup(libxl__egc *egc, libxl__remus_devices_state *rds)
+void libxl__checkpoint_devices_setup(libxl__egc *egc, libxl__checkpoint_devices_state *cds)
{
int i, rc;
- STATE_AO_GC(rds->ao);
+ STATE_AO_GC(cds->ao);
- rc = init_device_subkind(rds);
+ rc = init_device_subkind(cds);
if (rc)
goto out;
- rds->num_devices = 0;
- rds->num_nics = 0;
- rds->num_disks = 0;
+ cds->num_devices = 0;
+ cds->num_nics = 0;
+ cds->num_disks = 0;
- if (rds->device_kind_flags & (1 << LIBXL__DEVICE_KIND_VIF))
- rds->nics = libxl_device_nic_list(CTX, rds->domid, &rds->num_nics);
+ if (cds->device_kind_flags & (1 << LIBXL__DEVICE_KIND_VIF))
+ cds->nics = libxl_device_nic_list(CTX, cds->domid, &cds->num_nics);
- if (rds->device_kind_flags & (1 << LIBXL__DEVICE_KIND_VBD))
- rds->disks = libxl_device_disk_list(CTX, rds->domid, &rds->num_disks);
+ if (cds->device_kind_flags & (1 << LIBXL__DEVICE_KIND_VBD))
+ cds->disks = libxl_device_disk_list(CTX, cds->domid, &cds->num_disks);
- if (rds->num_nics == 0 && rds->num_disks == 0)
+ if (cds->num_nics == 0 && cds->num_disks == 0)
goto out;
- GCNEW_ARRAY(rds->devs, rds->num_nics + rds->num_disks);
+ GCNEW_ARRAY(cds->devs, cds->num_nics + cds->num_disks);
- for (i = 0; i < rds->num_nics; i++) {
- rds->devs[rds->num_devices++] = remus_device_init(egc, rds,
+ for (i = 0; i < cds->num_nics; i++) {
+ cds->devs[cds->num_devices++] = checkpoint_device_init(egc, cds,
LIBXL__DEVICE_KIND_VIF,
- &rds->nics[i]);
+ &cds->nics[i]);
}
- for (i = 0; i < rds->num_disks; i++) {
- rds->devs[rds->num_devices++] = remus_device_init(egc, rds,
+ for (i = 0; i < cds->num_disks; i++) {
+ cds->devs[cds->num_devices++] = checkpoint_device_init(egc, cds,
LIBXL__DEVICE_KIND_VBD,
- &rds->disks[i]);
+ &cds->disks[i]);
}
- remus_devices_setup(egc, rds);
+ checkpoint_devices_setup(egc, cds);
return;
out:
- rds->callback(egc, rds, rc);
+ cds->callback(egc, cds, rc);
}
-static void remus_devices_setup(libxl__egc *egc,
- libxl__remus_devices_state *rds)
+static void checkpoint_devices_setup(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds)
{
int i, rc;
- STATE_AO_GC(rds->ao);
+ STATE_AO_GC(cds->ao);
- libxl__multidev_begin(ao, &rds->multidev);
- rds->multidev.callback = all_devices_setup_cb;
- for (i = 0; i < rds->num_devices; i++) {
- libxl__remus_device *dev = rds->devs[i];
+ libxl__multidev_begin(ao, &cds->multidev);
+ cds->multidev.callback = all_devices_setup_cb;
+ for (i = 0; i < cds->num_devices; i++) {
+ libxl__checkpoint_device *dev = cds->devs[i];
dev->ops_index = -1;
- libxl__multidev_prepare_with_aodev(&rds->multidev, &dev->aodev);
+ libxl__multidev_prepare_with_aodev(&cds->multidev, &dev->aodev);
- dev->aodev.rc = ERROR_REMUS_DEVICE_NOT_SUPPORTED;
+ dev->aodev.rc = ERROR_CHECKPOINT_DEVICE_NOT_SUPPORTED;
dev->aodev.callback = device_setup_iterate;
device_setup_iterate(egc,&dev->aodev);
}
rc = 0;
- libxl__multidev_prepared(egc, &rds->multidev, rc);
+ libxl__multidev_prepared(egc, &cds->multidev, rc);
}
static void device_setup_iterate(libxl__egc *egc, libxl__ao_device *aodev)
{
- libxl__remus_device *dev = CONTAINER_OF(aodev, *dev, aodev);
+ libxl__checkpoint_device *dev = CONTAINER_OF(aodev, *dev, aodev);
EGC_GC;
- if (aodev->rc != ERROR_REMUS_DEVICE_NOT_SUPPORTED &&
- aodev->rc != ERROR_REMUS_DEVOPS_DOES_NOT_MATCH)
+ if (aodev->rc != ERROR_CHECKPOINT_DEVICE_NOT_SUPPORTED &&
+ aodev->rc != ERROR_CHECKPOINT_DEVOPS_DOES_NOT_MATCH)
/* might be success or disaster */
goto out;
@@ -186,16 +186,16 @@ static void device_setup_iterate(libxl__egc *egc, libxl__ao_device *aodev)
domid = disk->backend_domid;
devid = libxl__device_disk_dev_number(disk->vdev, NULL, NULL);
} else {
- LOG(ERROR,"device kind not handled by remus: %s",
+ LOG(ERROR,"device kind not handled by checkpoint: %s",
libxl__device_kind_to_string(dev->kind));
aodev->rc = ERROR_FAIL;
goto out;
}
- LOG(ERROR,"device not handled by remus"
+ LOG(ERROR,"device not handled by checkpoint"
" (device=%s:%"PRId32"/%"PRId32")",
libxl__device_kind_to_string(dev->kind),
domid, devid);
- aodev->rc = ERROR_REMUS_DEVICE_NOT_SUPPORTED;
+ aodev->rc = ERROR_CHECKPOINT_DEVICE_NOT_SUPPORTED;
goto out;
}
} while (dev->ops->kind != dev->kind);
@@ -216,32 +216,32 @@ static void all_devices_setup_cb(libxl__egc *egc,
STATE_AO_GC(multidev->ao);
/* Convenience aliases */
- libxl__remus_devices_state *const rds =
- CONTAINER_OF(multidev, *rds, multidev);
+ libxl__checkpoint_devices_state *const cds =
+ CONTAINER_OF(multidev, *cds, multidev);
- rds->callback(egc, rds, rc);
+ cds->callback(egc, cds, rc);
}
-void libxl__remus_devices_teardown(libxl__egc *egc,
- libxl__remus_devices_state *rds)
+void libxl__checkpoint_devices_teardown(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds)
{
int i;
- libxl__remus_device *dev;
+ libxl__checkpoint_device *dev;
- STATE_AO_GC(rds->ao);
+ STATE_AO_GC(cds->ao);
- libxl__multidev_begin(ao, &rds->multidev);
- rds->multidev.callback = devices_teardown_cb;
- for (i = 0; i < rds->num_devices; i++) {
- dev = rds->devs[i];
+ libxl__multidev_begin(ao, &cds->multidev);
+ cds->multidev.callback = devices_teardown_cb;
+ for (i = 0; i < cds->num_devices; i++) {
+ dev = cds->devs[i];
if (!dev->ops || !dev->matched)
continue;
- libxl__multidev_prepare_with_aodev(&rds->multidev, &dev->aodev);
+ libxl__multidev_prepare_with_aodev(&cds->multidev, &dev->aodev);
dev->ops->teardown(egc,dev);
}
- libxl__multidev_prepared(egc, &rds->multidev, 0);
+ libxl__multidev_prepared(egc, &cds->multidev, 0);
}
static void devices_teardown_cb(libxl__egc *egc,
@@ -253,26 +253,26 @@ static void devices_teardown_cb(libxl__egc *egc,
STATE_AO_GC(multidev->ao);
/* Convenience aliases */
- libxl__remus_devices_state *const rds =
- CONTAINER_OF(multidev, *rds, multidev);
+ libxl__checkpoint_devices_state *const cds =
+ CONTAINER_OF(multidev, *cds, multidev);
/* clean nic */
- for (i = 0; i < rds->num_nics; i++)
- libxl_device_nic_dispose(&rds->nics[i]);
- free(rds->nics);
- rds->nics = NULL;
- rds->num_nics = 0;
+ for (i = 0; i < cds->num_nics; i++)
+ libxl_device_nic_dispose(&cds->nics[i]);
+ free(cds->nics);
+ cds->nics = NULL;
+ cds->num_nics = 0;
/* clean disk */
- for (i = 0; i < rds->num_disks; i++)
- libxl_device_disk_dispose(&rds->disks[i]);
- free(rds->disks);
- rds->disks = NULL;
- rds->num_disks = 0;
+ for (i = 0; i < cds->num_disks; i++)
+ libxl_device_disk_dispose(&cds->disks[i]);
+ free(cds->disks);
+ cds->disks = NULL;
+ cds->num_disks = 0;
- cleanup_device_subkind(rds);
+ cleanup_device_subkind(cds);
- rds->callback(egc, rds, rc);
+ cds->callback(egc, cds, rc);
}
/*----- checkpointing APIs -----*/
@@ -285,33 +285,33 @@ static void devices_checkpoint_cb(libxl__egc *egc,
/* API implementations */
-#define define_remus_checkpoint_api(api) \
-void libxl__remus_devices_##api(libxl__egc *egc, \
- libxl__remus_devices_state *rds) \
+#define define_checkpoint_api(api) \
+void libxl__checkpoint_devices_##api(libxl__egc *egc, \
+ libxl__checkpoint_devices_state *cds) \
{ \
int i; \
- libxl__remus_device *dev; \
+ libxl__checkpoint_device *dev; \
\
- STATE_AO_GC(rds->ao); \
+ STATE_AO_GC(cds->ao); \
\
- libxl__multidev_begin(ao, &rds->multidev); \
- rds->multidev.callback = devices_checkpoint_cb; \
- for (i = 0; i < rds->num_devices; i++) { \
- dev = rds->devs[i]; \
+ libxl__multidev_begin(ao, &cds->multidev); \
+ cds->multidev.callback = devices_checkpoint_cb; \
+ for (i = 0; i < cds->num_devices; i++) { \
+ dev = cds->devs[i]; \
if (!dev->matched || !dev->ops->api) \
continue; \
- libxl__multidev_prepare_with_aodev(&rds->multidev, &dev->aodev);\
+ libxl__multidev_prepare_with_aodev(&cds->multidev, &dev->aodev);\
dev->ops->api(egc,dev); \
} \
\
- libxl__multidev_prepared(egc, &rds->multidev, 0); \
+ libxl__multidev_prepared(egc, &cds->multidev, 0); \
}
-define_remus_checkpoint_api(postsuspend);
+define_checkpoint_api(postsuspend);
-define_remus_checkpoint_api(preresume);
+define_checkpoint_api(preresume);
-define_remus_checkpoint_api(commit);
+define_checkpoint_api(commit);
static void devices_checkpoint_cb(libxl__egc *egc,
libxl__multidev *multidev,
@@ -320,8 +320,8 @@ static void devices_checkpoint_cb(libxl__egc *egc,
STATE_AO_GC(multidev->ao);
/* Convenience aliases */
- libxl__remus_devices_state *const rds =
- CONTAINER_OF(multidev, *rds, multidev);
+ libxl__checkpoint_devices_state *const cds =
+ CONTAINER_OF(multidev, *cds, multidev);
- rds->callback(egc, rds, rc);
+ cds->callback(egc, cds, rc);
}
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 3036bb2..aa00d33 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2633,9 +2633,9 @@ typedef struct libxl__save_helper_state {
* marshalling and xc callback functions */
} libxl__save_helper_state;
-/*----- remus device related state structure -----*/
+/*----- checkpoint device related state structure -----*/
/*
- * The abstract Remus device layer exposes a common
+ * The abstract checkpoint device layer exposes a common
* set of API to [external] libxl for manipulating devices attached to
* a guest protected by Remus. The device layer also exposes a set of
* [internal] interfaces that every device type must implement.
@@ -2643,34 +2643,34 @@ typedef struct libxl__save_helper_state {
* The following API are exposed to libxl:
*
* One-time configuration operations:
- * +libxl__remus_devices_setup
+ * +libxl__checkpoint_devices_setup
* > Enable output buffering for NICs, setup disk replication, etc.
- * +libxl__remus_devices_teardown
+ * +libxl__checkpoint_devices_teardown
* > Disable output buffering and disk replication; teardown any
* associated external setups like qdiscs for NICs.
*
* Operations executed every checkpoint (in order of invocation):
- * +libxl__remus_devices_postsuspend
- * +libxl__remus_devices_preresume
- * +libxl__remus_devices_commit
+ * +libxl__checkpoint_devices_postsuspend
+ * +libxl__checkpoint_devices_preresume
+ * +libxl__checkpoint_devices_commit
*
* Each device type needs to implement the interfaces specified in
- * the libxl__remus_device_instance_ops if it wishes to support Remus.
+ * the libxl__checkpoint_device_instance_ops if it wishes to support Remus.
*
- * The high-level control flow through the Remus device layer is shown below:
+ * The high-level control flow through the checkpoint device layer is shown below:
*
* xl remus
* |-> libxl_domain_remus_start
- * |-> libxl__remus_devices_setup
- * |-> Per-checkpoint libxl__remus_devices_[postsuspend,preresume,commit]
+ * |-> libxl__checkpoint_devices_setup
+ * |-> Per-checkpoint libxl__checkpoint_devices_[postsuspend,preresume,commit]
* ...
* |-> On backup failure, network error or other internal errors:
- * libxl__remus_devices_teardown
+ * libxl__checkpoint_devices_teardown
*/
-typedef struct libxl__remus_device libxl__remus_device;
-typedef struct libxl__remus_devices_state libxl__remus_devices_state;
-typedef struct libxl__remus_device_instance_ops libxl__remus_device_instance_ops;
+typedef struct libxl__checkpoint_device libxl__checkpoint_device;
+typedef struct libxl__checkpoint_devices_state libxl__checkpoint_devices_state;
+typedef struct libxl__checkpoint_device_instance_ops libxl__checkpoint_device_instance_ops;
/*
* Interfaces to be implemented by every device subkind that wishes to
@@ -2680,7 +2680,7 @@ typedef struct libxl__remus_device_instance_ops libxl__remus_device_instance_ops
* synchronous and call dev->aodev.callback directly (as the last
* thing they do).
*/
-struct libxl__remus_device_instance_ops {
+struct libxl__checkpoint_device_instance_ops {
/* the device kind this ops belongs to... */
libxl__device_kind kind;
@@ -2691,12 +2691,12 @@ struct libxl__remus_device_instance_ops {
* Asynchronous.
*/
- void (*postsuspend)(libxl__egc *egc, libxl__remus_device *dev);
- void (*preresume)(libxl__egc *egc, libxl__remus_device *dev);
- void (*commit)(libxl__egc *egc, libxl__remus_device *dev);
+ void (*postsuspend)(libxl__egc *egc, libxl__checkpoint_device *dev);
+ void (*preresume)(libxl__egc *egc, libxl__checkpoint_device *dev);
+ void (*commit)(libxl__egc *egc, libxl__checkpoint_device *dev);
/*
- * setup() and teardown() are refer to the actual remus device.
+ * setup() and teardown() refer to the actual checkpoint device.
* Asynchronous.
* teardown is called even if setup fails.
*/
@@ -2705,45 +2705,45 @@ struct libxl__remus_device_instance_ops {
* device. If matched, the device will then be managed with this set of
* subkind operations.
* Yields 0 if the device successfully set up.
- * REMUS_DEVOPS_DOES_NOT_MATCH if the ops does not match the device.
+ * CHECKPOINT_DEVOPS_DOES_NOT_MATCH if the ops does not match the device.
* any other rc indicates failure.
*/
- void (*setup)(libxl__egc *egc, libxl__remus_device *dev);
- void (*teardown)(libxl__egc *egc, libxl__remus_device *dev);
+ void (*setup)(libxl__egc *egc, libxl__checkpoint_device *dev);
+ void (*teardown)(libxl__egc *egc, libxl__checkpoint_device *dev);
};
-int init_subkind_nic(libxl__remus_devices_state *rds);
-void cleanup_subkind_nic(libxl__remus_devices_state *rds);
-int init_subkind_drbd_disk(libxl__remus_devices_state *rds);
-void cleanup_subkind_drbd_disk(libxl__remus_devices_state *rds);
+int init_subkind_nic(libxl__checkpoint_devices_state *cds);
+void cleanup_subkind_nic(libxl__checkpoint_devices_state *cds);
+int init_subkind_drbd_disk(libxl__checkpoint_devices_state *cds);
+void cleanup_subkind_drbd_disk(libxl__checkpoint_devices_state *cds);
-typedef void libxl__remus_callback(libxl__egc *,
- libxl__remus_devices_state *, int rc);
+typedef void libxl__checkpoint_callback(libxl__egc *,
+ libxl__checkpoint_devices_state *, int rc);
/*
- * State associated with a remus invocation, including parameters
- * passed to the remus abstract device layer by the remus
+ * State associated with a checkpoint invocation, including parameters
+ * passed to the checkpoint abstract device layer by the remus
* save/restore machinery.
*/
-struct libxl__remus_devices_state {
- /*---- must be set by caller of libxl__remus_device_(setup|teardown) ----*/
+struct libxl__checkpoint_devices_state {
+ /*---- must be set by caller of libxl__checkpoint_devices_(setup|teardown) ----*/
libxl__ao *ao;
uint32_t domid;
- libxl__remus_callback *callback;
+ libxl__checkpoint_callback *callback;
int device_kind_flags;
/*----- private for abstract layer only -----*/
int num_devices;
/*
- * this array is allocated before setup the remus devices by the
- * remus abstract layer.
- * devs may be NULL, means there's no remus devices that has been set up.
+ * this array is allocated by the checkpoint abstract layer before the
+ * checkpoint devices are set up.
+ * devs may be NULL, which means no checkpoint devices have been set up.
* the size of this array is 'num_devices', which is the total number
* of libxl nic devices and disk devices(num_nics + num_disks).
*/
- libxl__remus_device **devs;
+ libxl__checkpoint_device **devs;
libxl_device_nic *nics;
int num_nics;
@@ -2765,20 +2765,20 @@ struct libxl__remus_devices_state {
/*
* Information about a single device being handled by remus.
- * Allocated by the remus abstract layer.
+ * Allocated by the checkpoint abstract layer.
*/
-struct libxl__remus_device {
+struct libxl__checkpoint_device {
/*----- shared between abstract and concrete layers -----*/
/*
* if this is true, that means the subkind ops match the device
*/
bool matched;
- /*----- set by remus device abstruct layer -----*/
- /* libxl__device_* which this remus device related to */
+ /*----- set by checkpoint device abstract layer -----*/
+ /* libxl__device_* which this checkpoint device is related to */
const void *backend_dev;
libxl__device_kind kind;
- libxl__remus_devices_state *rds;
+ libxl__checkpoint_devices_state *cds;
libxl__ao_device aodev;
/*----- private for abstract layer only -----*/
@@ -2789,7 +2789,7 @@ struct libxl__remus_device {
* individual devices.
*/
int ops_index;
- const libxl__remus_device_instance_ops *ops;
+ const libxl__checkpoint_device_instance_ops *ops;
/*----- private for concrete (device-specific) layer -----*/
@@ -2797,17 +2797,17 @@ struct libxl__remus_device {
void *concrete_data;
};
-/* the following 5 APIs are async ops, call rds->callback when done */
-_hidden void libxl__remus_devices_setup(libxl__egc *egc,
- libxl__remus_devices_state *rds);
-_hidden void libxl__remus_devices_teardown(libxl__egc *egc,
- libxl__remus_devices_state *rds);
-_hidden void libxl__remus_devices_postsuspend(libxl__egc *egc,
- libxl__remus_devices_state *rds);
-_hidden void libxl__remus_devices_preresume(libxl__egc *egc,
- libxl__remus_devices_state *rds);
-_hidden void libxl__remus_devices_commit(libxl__egc *egc,
- libxl__remus_devices_state *rds);
+/* the following 5 APIs are async ops, call cds->callback when done */
+_hidden void libxl__checkpoint_devices_setup(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds);
+_hidden void libxl__checkpoint_devices_teardown(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds);
+_hidden void libxl__checkpoint_devices_postsuspend(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds);
+_hidden void libxl__checkpoint_devices_preresume(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds);
+_hidden void libxl__checkpoint_devices_commit(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds);
_hidden int libxl__netbuffer_enabled(libxl__gc *gc);
/*----- Domain suspend (save) state structure -----*/
@@ -2871,7 +2871,7 @@ struct libxl__domain_suspend_state {
libxl__domain_suspend_state2 dss2;
int hvm;
int xcflags;
- libxl__remus_devices_state rds;
+ libxl__checkpoint_devices_state cds;
libxl__ev_time checkpoint_timeout; /* used for Remus checkpoint */
int interval; /* checkpoint interval (for Remus) */
libxl__save_helper_state shs;
diff --git a/tools/libxl/libxl_netbuffer.c b/tools/libxl/libxl_netbuffer.c
index edc6843..2d668dd 100644
--- a/tools/libxl/libxl_netbuffer.c
+++ b/tools/libxl/libxl_netbuffer.c
@@ -38,21 +38,21 @@ int libxl__netbuffer_enabled(libxl__gc *gc)
return 1;
}
-int init_subkind_nic(libxl__remus_devices_state *rds)
+int init_subkind_nic(libxl__checkpoint_devices_state *cds)
{
int rc, ret;
- libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, cds);
- STATE_AO_GC(rds->ao);
+ STATE_AO_GC(cds->ao);
- rds->nlsock = nl_socket_alloc();
- if (!rds->nlsock) {
+ cds->nlsock = nl_socket_alloc();
+ if (!cds->nlsock) {
LOG(ERROR, "cannot allocate nl socket");
rc = ERROR_FAIL;
goto out;
}
- ret = nl_connect(rds->nlsock, NETLINK_ROUTE);
+ ret = nl_connect(cds->nlsock, NETLINK_ROUTE);
if (ret) {
LOG(ERROR, "failed to open netlink socket: %s",
nl_geterror(ret));
@@ -61,7 +61,7 @@ int init_subkind_nic(libxl__remus_devices_state *rds)
}
/* get list of all qdiscs installed on network devs. */
- ret = rtnl_qdisc_alloc_cache(rds->nlsock, &rds->qdisc_cache);
+ ret = rtnl_qdisc_alloc_cache(cds->nlsock, &cds->qdisc_cache);
if (ret) {
LOG(ERROR, "failed to allocate qdisc cache: %s",
nl_geterror(ret));
@@ -70,9 +70,9 @@ int init_subkind_nic(libxl__remus_devices_state *rds)
}
if (dss->remus->netbufscript) {
- rds->netbufscript = libxl__strdup(gc, dss->remus->netbufscript);
+ cds->netbufscript = libxl__strdup(gc, dss->remus->netbufscript);
} else {
- rds->netbufscript = GCSPRINTF("%s/remus-netbuf-setup",
+ cds->netbufscript = GCSPRINTF("%s/remus-netbuf-setup",
libxl__xen_script_dir_path());
}
@@ -82,22 +82,22 @@ out:
return rc;
}
-void cleanup_subkind_nic(libxl__remus_devices_state *rds)
+void cleanup_subkind_nic(libxl__checkpoint_devices_state *cds)
{
- STATE_AO_GC(rds->ao);
+ STATE_AO_GC(cds->ao);
/* free qdisc cache */
- if (rds->qdisc_cache) {
- nl_cache_clear(rds->qdisc_cache);
- nl_cache_free(rds->qdisc_cache);
- rds->qdisc_cache = NULL;
+ if (cds->qdisc_cache) {
+ nl_cache_clear(cds->qdisc_cache);
+ nl_cache_free(cds->qdisc_cache);
+ cds->qdisc_cache = NULL;
}
/* close & free nlsock */
- if (rds->nlsock) {
- nl_close(rds->nlsock);
- nl_socket_free(rds->nlsock);
- rds->nlsock = NULL;
+ if (cds->nlsock) {
+ nl_close(cds->nlsock);
+ nl_socket_free(cds->nlsock);
+ cds->nlsock = NULL;
}
}
@@ -111,17 +111,17 @@ void cleanup_subkind_nic(libxl__remus_devices_state *rds)
* it must ONLY be used for remus because if driver domains
* were in use it would constitute a security vulnerability.
*/
-static const char *get_vifname(libxl__remus_device *dev,
+static const char *get_vifname(libxl__checkpoint_device *dev,
const libxl_device_nic *nic)
{
const char *vifname = NULL;
const char *path;
int rc;
- STATE_AO_GC(dev->rds->ao);
+ STATE_AO_GC(dev->cds->ao);
/* Convenience aliases */
- const uint32_t domid = dev->rds->domid;
+ const uint32_t domid = dev->cds->domid;
path = GCSPRINTF("%s/backend/vif/%d/%d/vifname",
libxl__xs_get_dompath(gc, 0), domid, nic->devid);
@@ -144,19 +144,19 @@ static void free_qdisc(libxl__remus_device_nic *remus_nic)
remus_nic->qdisc = NULL;
}
-static int init_qdisc(libxl__remus_devices_state *rds,
+static int init_qdisc(libxl__checkpoint_devices_state *cds,
libxl__remus_device_nic *remus_nic)
{
int rc, ret, ifindex;
struct rtnl_link *ifb = NULL;
struct rtnl_qdisc *qdisc = NULL;
- STATE_AO_GC(rds->ao);
+ STATE_AO_GC(cds->ao);
/* Now that we have brought up REMUS_IFB device with plug qdisc for
* this vif, so we need to refill the qdisc cache.
*/
- ret = nl_cache_refill(rds->nlsock, rds->qdisc_cache);
+ ret = nl_cache_refill(cds->nlsock, cds->qdisc_cache);
if (ret) {
LOG(ERROR, "cannot refill qdisc cache: %s", nl_geterror(ret));
rc = ERROR_FAIL;
@@ -164,7 +164,7 @@ static int init_qdisc(libxl__remus_devices_state *rds,
}
/* get a handle to the REMUS_IFB interface */
- ret = rtnl_link_get_kernel(rds->nlsock, 0, remus_nic->ifb, &ifb);
+ ret = rtnl_link_get_kernel(cds->nlsock, 0, remus_nic->ifb, &ifb);
if (ret) {
LOG(ERROR, "cannot obtain handle for %s: %s", remus_nic->ifb,
nl_geterror(ret));
@@ -187,7 +187,7 @@ static int init_qdisc(libxl__remus_devices_state *rds,
* There is no need to explicitly free this qdisc as its just a
* reference from the qdisc cache we allocated earlier.
*/
- qdisc = rtnl_qdisc_get_by_parent(rds->qdisc_cache, ifindex, TC_H_ROOT);
+ qdisc = rtnl_qdisc_get_by_parent(cds->qdisc_cache, ifindex, TC_H_ROOT);
if (qdisc) {
const char *tc_kind = rtnl_tc_get_kind(TC_CAST(qdisc));
/* Sanity check: Ensure that the root qdisc is a plug qdisc. */
@@ -231,19 +231,19 @@ static void netbuf_teardown_script_cb(libxl__egc *egc,
* $REMUS_IFB (for teardown)
* setup/teardown as command line arg.
*/
-static void setup_async_exec(libxl__remus_device *dev, char *op)
+static void setup_async_exec(libxl__checkpoint_device *dev, char *op)
{
int arraysize, nr = 0;
char **env = NULL, **args = NULL;
libxl__remus_device_nic *remus_nic = dev->concrete_data;
- libxl__remus_devices_state *rds = dev->rds;
+ libxl__checkpoint_devices_state *cds = dev->cds;
libxl__async_exec_state *aes = &dev->aodev.aes;
- STATE_AO_GC(rds->ao);
+ STATE_AO_GC(cds->ao);
/* Convenience aliases */
- char *const script = libxl__strdup(gc, rds->netbufscript);
- const uint32_t domid = rds->domid;
+ char *const script = libxl__strdup(gc, cds->netbufscript);
+ const uint32_t domid = cds->domid;
const int dev_id = remus_nic->devid;
const char *const vif = remus_nic->vif;
const char *const ifb = remus_nic->ifb;
@@ -269,7 +269,7 @@ static void setup_async_exec(libxl__remus_device *dev, char *op)
args[nr++] = NULL;
assert(nr == arraysize);
- aes->ao = dev->rds->ao;
+ aes->ao = dev->cds->ao;
aes->what = GCSPRINTF("%s %s", args[0], args[1]);
aes->env = env;
aes->args = args;
@@ -286,13 +286,13 @@ static void setup_async_exec(libxl__remus_device *dev, char *op)
/* setup() and teardown() */
-static void nic_setup(libxl__egc *egc, libxl__remus_device *dev)
+static void nic_setup(libxl__egc *egc, libxl__checkpoint_device *dev)
{
int rc;
libxl__remus_device_nic *remus_nic;
const libxl_device_nic *nic = dev->backend_dev;
- STATE_AO_GC(dev->rds->ao);
+ STATE_AO_GC(dev->cds->ao);
/*
* thers's no subkind of nic devices, so nic ops is always matched
@@ -330,16 +330,16 @@ static void netbuf_setup_script_cb(libxl__egc *egc,
int status)
{
libxl__ao_device *aodev = CONTAINER_OF(aes, *aodev, aes);
- libxl__remus_device *dev = CONTAINER_OF(aodev, *dev, aodev);
+ libxl__checkpoint_device *dev = CONTAINER_OF(aodev, *dev, aodev);
libxl__remus_device_nic *remus_nic = dev->concrete_data;
- libxl__remus_devices_state *rds = dev->rds;
+ libxl__checkpoint_devices_state *cds = dev->cds;
const char *out_path_base, *hotplug_error = NULL;
int rc;
- STATE_AO_GC(rds->ao);
+ STATE_AO_GC(cds->ao);
/* Convenience aliases */
- const uint32_t domid = rds->domid;
+ const uint32_t domid = cds->domid;
const int devid = remus_nic->devid;
const char *const vif = remus_nic->vif;
const char **const ifb = &remus_nic->ifb;
@@ -373,7 +373,7 @@ static void netbuf_setup_script_cb(libxl__egc *egc,
if (hotplug_error) {
LOG(ERROR, "netbuf script %s setup failed for vif %s: %s",
- rds->netbufscript, vif, hotplug_error);
+ cds->netbufscript, vif, hotplug_error);
rc = ERROR_FAIL;
goto out;
}
@@ -384,17 +384,17 @@ static void netbuf_setup_script_cb(libxl__egc *egc,
}
LOG(DEBUG, "%s will buffer packets from vif %s", *ifb, vif);
- rc = init_qdisc(rds, remus_nic);
+ rc = init_qdisc(cds, remus_nic);
out:
aodev->rc = rc;
aodev->callback(egc, aodev);
}
-static void nic_teardown(libxl__egc *egc, libxl__remus_device *dev)
+static void nic_teardown(libxl__egc *egc, libxl__checkpoint_device *dev)
{
int rc;
- STATE_AO_GC(dev->rds->ao);
+ STATE_AO_GC(dev->cds->ao);
setup_async_exec(dev, "teardown");
@@ -415,7 +415,7 @@ static void netbuf_teardown_script_cb(libxl__egc *egc,
{
int rc;
libxl__ao_device *aodev = CONTAINER_OF(aes, *aodev, aes);
- libxl__remus_device *dev = CONTAINER_OF(aodev, *dev, aodev);
+ libxl__checkpoint_device *dev = CONTAINER_OF(aodev, *dev, aodev);
libxl__remus_device_nic *remus_nic = dev->concrete_data;
if (status)
@@ -440,12 +440,12 @@ enum {
/* API implementations */
static int remus_netbuf_op(libxl__remus_device_nic *remus_nic,
- libxl__remus_devices_state *rds,
+ libxl__checkpoint_devices_state *cds,
int buffer_op)
{
int rc, ret;
- STATE_AO_GC(rds->ao);
+ STATE_AO_GC(cds->ao);
if (buffer_op == tc_buffer_start)
ret = rtnl_qdisc_plug_buffer(remus_nic->qdisc);
@@ -457,7 +457,7 @@ static int remus_netbuf_op(libxl__remus_device_nic *remus_nic,
goto out;
}
- ret = rtnl_qdisc_add(rds->nlsock, remus_nic->qdisc, NLM_F_REQUEST);
+ ret = rtnl_qdisc_add(cds->nlsock, remus_nic->qdisc, NLM_F_REQUEST);
if (ret) {
rc = ERROR_FAIL;
goto out;
@@ -474,33 +474,33 @@ out:
return rc;
}
-static void nic_postsuspend(libxl__egc *egc, libxl__remus_device *dev)
+static void nic_postsuspend(libxl__egc *egc, libxl__checkpoint_device *dev)
{
int rc;
libxl__remus_device_nic *remus_nic = dev->concrete_data;
- STATE_AO_GC(dev->rds->ao);
+ STATE_AO_GC(dev->cds->ao);
- rc = remus_netbuf_op(remus_nic, dev->rds, tc_buffer_start);
+ rc = remus_netbuf_op(remus_nic, dev->cds, tc_buffer_start);
dev->aodev.rc = rc;
dev->aodev.callback(egc, &dev->aodev);
}
-static void nic_commit(libxl__egc *egc, libxl__remus_device *dev)
+static void nic_commit(libxl__egc *egc, libxl__checkpoint_device *dev)
{
int rc;
libxl__remus_device_nic *remus_nic = dev->concrete_data;
- STATE_AO_GC(dev->rds->ao);
+ STATE_AO_GC(dev->cds->ao);
- rc = remus_netbuf_op(remus_nic, dev->rds, tc_buffer_release);
+ rc = remus_netbuf_op(remus_nic, dev->cds, tc_buffer_release);
dev->aodev.rc = rc;
dev->aodev.callback(egc, &dev->aodev);
}
-const libxl__remus_device_instance_ops remus_device_nic = {
+const libxl__checkpoint_device_instance_ops remus_device_nic = {
.kind = LIBXL__DEVICE_KIND_VIF,
.setup = nic_setup,
.teardown = nic_teardown,
diff --git a/tools/libxl/libxl_nonetbuffer.c b/tools/libxl/libxl_nonetbuffer.c
index 3c659c2..4b68152 100644
--- a/tools/libxl/libxl_nonetbuffer.c
+++ b/tools/libxl/libxl_nonetbuffer.c
@@ -22,25 +22,25 @@ int libxl__netbuffer_enabled(libxl__gc *gc)
return 0;
}
-int init_subkind_nic(libxl__remus_devices_state *rds)
+int init_subkind_nic(libxl__checkpoint_devices_state *cds)
{
return 0;
}
-void cleanup_subkind_nic(libxl__remus_devices_state *rds)
+void cleanup_subkind_nic(libxl__checkpoint_devices_state *cds)
{
return;
}
-static void nic_setup(libxl__egc *egc, libxl__remus_device *dev)
+static void nic_setup(libxl__egc *egc, libxl__checkpoint_device *dev)
{
- STATE_AO_GC(dev->rds->ao);
+ STATE_AO_GC(dev->cds->ao);
dev->aodev.rc = ERROR_FAIL;
dev->aodev.callback(egc, &dev->aodev);
}
-const libxl__remus_device_instance_ops remus_device_nic = {
+const libxl__checkpoint_device_instance_ops remus_device_nic = {
.kind = LIBXL__DEVICE_KIND_VIF,
.setup = nic_setup,
};
diff --git a/tools/libxl/libxl_remus.c b/tools/libxl/libxl_remus.c
index b555715..211216c 100644
--- a/tools/libxl/libxl_remus.c
+++ b/tools/libxl/libxl_remus.c
@@ -21,15 +21,15 @@
/*----- remus: setup the environment -----*/
static void libxl__remus_setup_done(libxl__egc *egc,
- libxl__remus_devices_state *rds, int rc);
+ libxl__checkpoint_devices_state *cds, int rc);
static void libxl__remus_setup_failed(libxl__egc *egc,
- libxl__remus_devices_state *rds, int rc);
+ libxl__checkpoint_devices_state *cds, int rc);
void libxl__remus_setup(libxl__egc *egc,
libxl__domain_suspend_state *dss)
{
/* Convenience aliases */
- libxl__remus_devices_state *const rds = &dss->rds;
+ libxl__checkpoint_devices_state *const cds = &dss->cds;
const libxl_domain_remus_info *const info = dss->remus;
STATE_AO_GC(dss->ao);
@@ -39,27 +39,27 @@ void libxl__remus_setup(libxl__egc *egc,
LOG(ERROR, "Remus: No support for network buffering");
goto out;
}
- rds->device_kind_flags |= (1 << LIBXL__DEVICE_KIND_VIF);
+ cds->device_kind_flags |= (1 << LIBXL__DEVICE_KIND_VIF);
}
if (libxl_defbool_val(info->diskbuf))
- rds->device_kind_flags |= (1 << LIBXL__DEVICE_KIND_VBD);
+ cds->device_kind_flags |= (1 << LIBXL__DEVICE_KIND_VBD);
- rds->ao = ao;
- rds->domid = dss->domid;
- rds->callback = libxl__remus_setup_done;
+ cds->ao = ao;
+ cds->domid = dss->domid;
+ cds->callback = libxl__remus_setup_done;
- libxl__remus_devices_setup(egc, rds);
+ libxl__checkpoint_devices_setup(egc, cds);
return;
out:
- libxl__remus_setup_failed(egc, rds, ERROR_FAIL);
+ libxl__remus_setup_failed(egc, cds, ERROR_FAIL);
}
static void libxl__remus_setup_done(libxl__egc *egc,
- libxl__remus_devices_state *rds, int rc)
+ libxl__checkpoint_devices_state *cds, int rc)
{
- libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, cds);
STATE_AO_GC(dss->ao);
if (!rc) {
@@ -69,15 +69,15 @@ static void libxl__remus_setup_done(libxl__egc *egc,
LOG(ERROR, "Remus: failed to setup device for guest with domid %u, rc %d",
dss->domid, rc);
- rds->callback = libxl__remus_setup_failed;
- libxl__remus_devices_teardown(egc, rds);
+ cds->callback = libxl__remus_setup_failed;
+ libxl__checkpoint_devices_teardown(egc, cds);
}
static void libxl__remus_setup_failed(libxl__egc *egc,
- libxl__remus_devices_state *rds,
+ libxl__checkpoint_devices_state *cds,
int rc)
{
- libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, cds);
STATE_AO_GC(dss->ao);
if (rc)
@@ -90,7 +90,7 @@ static void libxl__remus_setup_failed(libxl__egc *egc,
/*----- remus: teardown the environment -----*/
static void remus_teardown_done(libxl__egc *egc,
- libxl__remus_devices_state *rds,
+ libxl__checkpoint_devices_state *cds,
int rc);
void libxl__remus_teardown(libxl__egc *egc,
@@ -107,15 +107,15 @@ void libxl__remus_teardown(libxl__egc *egc,
*/
LOG(WARN, "Remus: Domain suspend terminated with rc %d,"
" teardown Remus devices...", rc);
- dss->rds.callback = remus_teardown_done;
- libxl__remus_devices_teardown(egc, &dss->rds);
+ dss->cds.callback = remus_teardown_done;
+ libxl__checkpoint_devices_teardown(egc, &dss->cds);
}
static void remus_teardown_done(libxl__egc *egc,
- libxl__remus_devices_state *rds,
+ libxl__checkpoint_devices_state *cds,
int rc)
{
- libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, cds);
STATE_AO_GC(dss->ao);
if (rc)
@@ -130,7 +130,7 @@ static void remus_teardown_done(libxl__egc *egc,
static void remus_domain_suspend_callback_common_done(libxl__egc *egc,
libxl__domain_suspend_state2 *dss2, int ok);
static void remus_devices_postsuspend_cb(libxl__egc *egc,
- libxl__remus_devices_state *rds,
+ libxl__checkpoint_devices_state *cds,
int rc);
void libxl__remus_domain_suspend_callback(void *data)
@@ -154,9 +154,9 @@ static void remus_domain_suspend_callback_common_done(libxl__egc *egc,
if (!ok)
goto out;
- libxl__remus_devices_state *const rds = &dss->rds;
- rds->callback = remus_devices_postsuspend_cb;
- libxl__remus_devices_postsuspend(egc, rds);
+ libxl__checkpoint_devices_state *const cds = &dss->cds;
+ cds->callback = remus_devices_postsuspend_cb;
+ libxl__checkpoint_devices_postsuspend(egc, cds);
return;
out:
@@ -164,11 +164,11 @@ out:
}
static void remus_devices_postsuspend_cb(libxl__egc *egc,
- libxl__remus_devices_state *rds,
+ libxl__checkpoint_devices_state *cds,
int rc)
{
int ok = 0;
- libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, cds);
if (rc)
goto out;
@@ -182,7 +182,7 @@ out:
/*----- remus: resume the guest -----*/
static void remus_devices_preresume_cb(libxl__egc *egc,
- libxl__remus_devices_state *rds,
+ libxl__checkpoint_devices_state *cds,
int rc);
void libxl__remus_domain_resume_callback(void *data)
@@ -192,17 +192,17 @@ void libxl__remus_domain_resume_callback(void *data)
libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
STATE_AO_GC(dss->ao);
- libxl__remus_devices_state *const rds = &dss->rds;
- rds->callback = remus_devices_preresume_cb;
- libxl__remus_devices_preresume(egc, rds);
+ libxl__checkpoint_devices_state *const cds = &dss->cds;
+ cds->callback = remus_devices_preresume_cb;
+ libxl__checkpoint_devices_preresume(egc, cds);
}
static void remus_devices_preresume_cb(libxl__egc *egc,
- libxl__remus_devices_state *rds,
+ libxl__checkpoint_devices_state *cds,
int rc)
{
int ok = 0;
- libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, cds);
STATE_AO_GC(dss->ao);
if (rc)
@@ -224,7 +224,7 @@ out:
static void remus_checkpoint_dm_saved(libxl__egc *egc,
libxl__domain_suspend_state *dss, int rc);
static void remus_devices_commit_cb(libxl__egc *egc,
- libxl__remus_devices_state *rds,
+ libxl__checkpoint_devices_state *cds,
int rc);
static void remus_next_checkpoint(libxl__egc *egc, libxl__ev_time *ev,
const struct timeval *requested_abs);
@@ -248,7 +248,7 @@ static void remus_checkpoint_dm_saved(libxl__egc *egc,
libxl__domain_suspend_state *dss, int rc)
{
/* Convenience aliases */
- libxl__remus_devices_state *const rds = &dss->rds;
+ libxl__checkpoint_devices_state *const cds = &dss->cds;
STATE_AO_GC(dss->ao);
@@ -257,8 +257,8 @@ static void remus_checkpoint_dm_saved(libxl__egc *egc,
goto out;
}
- rds->callback = remus_devices_commit_cb;
- libxl__remus_devices_commit(egc, rds);
+ cds->callback = remus_devices_commit_cb;
+ libxl__checkpoint_devices_commit(egc, cds);
return;
@@ -267,10 +267,10 @@ out:
}
static void remus_devices_commit_cb(libxl__egc *egc,
- libxl__remus_devices_state *rds,
+ libxl__checkpoint_devices_state *cds,
int rc)
{
- libxl__domain_suspend_state *dss = CONTAINER_OF(rds, *dss, rds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, cds);
STATE_AO_GC(dss->ao);
diff --git a/tools/libxl/libxl_remus_disk_drbd.c b/tools/libxl/libxl_remus_disk_drbd.c
index 3215f93..039ffb1 100644
--- a/tools/libxl/libxl_remus_disk_drbd.c
+++ b/tools/libxl/libxl_remus_disk_drbd.c
@@ -26,30 +26,30 @@ typedef struct libxl__remus_drbd_disk {
int ackwait;
} libxl__remus_drbd_disk;
-int init_subkind_drbd_disk(libxl__remus_devices_state *rds)
+int init_subkind_drbd_disk(libxl__checkpoint_devices_state *cds)
{
- STATE_AO_GC(rds->ao);
+ STATE_AO_GC(cds->ao);
- rds->drbd_probe_script = GCSPRINTF("%s/block-drbd-probe",
+ cds->drbd_probe_script = GCSPRINTF("%s/block-drbd-probe",
libxl__xen_script_dir_path());
return 0;
}
-void cleanup_subkind_drbd_disk(libxl__remus_devices_state *rds)
+void cleanup_subkind_drbd_disk(libxl__checkpoint_devices_state *cds)
{
return;
}
/*----- helper functions, for async calls -----*/
static void drbd_async_call(libxl__egc *egc,
- libxl__remus_device *dev,
- void func(libxl__remus_device *),
+ libxl__checkpoint_device *dev,
+ void func(libxl__checkpoint_device *),
libxl__ev_child_callback callback)
{
int pid = -1, rc;
libxl__ao_device *aodev = &dev->aodev;
- STATE_AO_GC(dev->rds->ao);
+ STATE_AO_GC(dev->cds->ao);
/* Fork and call */
pid = libxl__ev_child_fork(gc, &aodev->child, callback);
@@ -82,21 +82,21 @@ static void match_async_exec_cb(libxl__egc *egc,
/* implementations */
-static void match_async_exec(libxl__egc *egc, libxl__remus_device *dev);
+static void match_async_exec(libxl__egc *egc, libxl__checkpoint_device *dev);
-static void drbd_setup(libxl__egc *egc, libxl__remus_device *dev)
+static void drbd_setup(libxl__egc *egc, libxl__checkpoint_device *dev)
{
- STATE_AO_GC(dev->rds->ao);
+ STATE_AO_GC(dev->cds->ao);
match_async_exec(egc, dev);
}
-static void match_async_exec(libxl__egc *egc, libxl__remus_device *dev)
+static void match_async_exec(libxl__egc *egc, libxl__checkpoint_device *dev)
{
int arraysize, nr = 0, rc;
const libxl_device_disk *disk = dev->backend_dev;
libxl__async_exec_state *aes = &dev->aodev.aes;
- STATE_AO_GC(dev->rds->ao);
+ STATE_AO_GC(dev->cds->ao);
/* setup env & args */
arraysize = 1;
@@ -107,12 +107,12 @@ static void match_async_exec(libxl__egc *egc, libxl__remus_device *dev)
arraysize = 3;
nr = 0;
GCNEW_ARRAY(aes->args, arraysize);
- aes->args[nr++] = dev->rds->drbd_probe_script;
+ aes->args[nr++] = dev->cds->drbd_probe_script;
aes->args[nr++] = disk->pdev_path;
aes->args[nr++] = NULL;
assert(nr <= arraysize);
- aes->ao = dev->rds->ao;
+ aes->ao = dev->cds->ao;
aes->what = GCSPRINTF("%s %s", aes->args[0], aes->args[1]);
aes->timeout_ms = LIBXL_HOTPLUG_TIMEOUT * 1000;
aes->callback = match_async_exec_cb;
@@ -137,14 +137,14 @@ static void match_async_exec_cb(libxl__egc *egc,
{
int rc;
libxl__ao_device *aodev = CONTAINER_OF(aes, *aodev, aes);
- libxl__remus_device *dev = CONTAINER_OF(aodev, *dev, aodev);
+ libxl__checkpoint_device *dev = CONTAINER_OF(aodev, *dev, aodev);
libxl__remus_drbd_disk *drbd_disk;
const libxl_device_disk *disk = dev->backend_dev;
STATE_AO_GC(aodev->ao);
if (status) {
- rc = ERROR_REMUS_DEVOPS_DOES_NOT_MATCH;
+ rc = ERROR_CHECKPOINT_DEVOPS_DOES_NOT_MATCH;
goto out;
}
@@ -167,10 +167,10 @@ out:
aodev->callback(egc, aodev);
}
-static void drbd_teardown(libxl__egc *egc, libxl__remus_device *dev)
+static void drbd_teardown(libxl__egc *egc, libxl__checkpoint_device *dev)
{
libxl__remus_drbd_disk *drbd_disk = dev->concrete_data;
- STATE_AO_GC(dev->rds->ao);
+ STATE_AO_GC(dev->cds->ao);
close(drbd_disk->ctl_fd);
dev->aodev.rc = 0;
@@ -187,9 +187,9 @@ static void checkpoint_async_call_done(libxl__egc *egc,
/* API implementations */
/* this op will not wait and block, so implement as sync op */
-static void drbd_postsuspend(libxl__egc *egc, libxl__remus_device *dev)
+static void drbd_postsuspend(libxl__egc *egc, libxl__checkpoint_device *dev)
{
- STATE_AO_GC(dev->rds->ao);
+ STATE_AO_GC(dev->cds->ao);
libxl__remus_drbd_disk *rdd = dev->concrete_data;
@@ -203,16 +203,16 @@ static void drbd_postsuspend(libxl__egc *egc, libxl__remus_device *dev)
}
-static void drbd_preresume_async(libxl__remus_device *dev);
+static void drbd_preresume_async(libxl__checkpoint_device *dev);
-static void drbd_preresume(libxl__egc *egc, libxl__remus_device *dev)
+static void drbd_preresume(libxl__egc *egc, libxl__checkpoint_device *dev)
{
- STATE_AO_GC(dev->rds->ao);
+ STATE_AO_GC(dev->cds->ao);
drbd_async_call(egc, dev, drbd_preresume_async, checkpoint_async_call_done);
}
-static void drbd_preresume_async(libxl__remus_device *dev)
+static void drbd_preresume_async(libxl__checkpoint_device *dev)
{
libxl__remus_drbd_disk *rdd = dev->concrete_data;
int ackwait = rdd->ackwait;
@@ -231,7 +231,7 @@ static void checkpoint_async_call_done(libxl__egc *egc,
{
int rc;
libxl__ao_device *aodev = CONTAINER_OF(child, *aodev, child);
- libxl__remus_device *dev = CONTAINER_OF(aodev, *dev, aodev);
+ libxl__checkpoint_device *dev = CONTAINER_OF(aodev, *dev, aodev);
libxl__remus_drbd_disk *rdd = dev->concrete_data;
STATE_AO_GC(aodev->ao);
@@ -249,7 +249,7 @@ out:
aodev->callback(egc, aodev);
}
-const libxl__remus_device_instance_ops remus_device_drbd_disk = {
+const libxl__checkpoint_device_instance_ops remus_device_drbd_disk = {
.kind = LIBXL__DEVICE_KIND_VBD,
.setup = drbd_setup,
.teardown = drbd_teardown,
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index 4614667..a5890f0 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -61,8 +61,8 @@ libxl_error = Enumeration("error", [
(-15, "LOCK_FAIL"),
(-16, "JSON_CONFIG_EMPTY"),
(-17, "DEVICE_EXISTS"),
- (-18, "REMUS_DEVOPS_DOES_NOT_MATCH"),
- (-19, "REMUS_DEVICE_NOT_SUPPORTED"),
+ (-18, "CHECKPOINT_DEVOPS_DOES_NOT_MATCH"),
+ (-19, "CHECKPOINT_DEVICE_NOT_SUPPORTED"),
], value_namespace = "")
libxl_domain_type = Enumeration("domain_type", [
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 03/18] adjust the indentation
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
2014-10-24 7:05 ` [RFC Patch v4 01/18] move remus related codes to libxl_remus.c Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 02/18] rename remus device to checkpoint device Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 04/18] don't touch remus in checkpoint_device Wen Congyang
` (16 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Yang Hongyang, Lai Jiangshan
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
tools/libxl/libxl_checkpoint_device.c | 23 ++++++++++++-----------
tools/libxl/libxl_internal.h | 21 ++++++++++++---------
tools/libxl/libxl_remus.c | 12 ++++++++----
3 files changed, 32 insertions(+), 24 deletions(-)
diff --git a/tools/libxl/libxl_checkpoint_device.c b/tools/libxl/libxl_checkpoint_device.c
index 109cd23..0cfabc3 100644
--- a/tools/libxl/libxl_checkpoint_device.c
+++ b/tools/libxl/libxl_checkpoint_device.c
@@ -73,9 +73,9 @@ static void devices_teardown_cb(libxl__egc *egc,
/* checkpoint device setup and teardown */
static libxl__checkpoint_device* checkpoint_device_init(libxl__egc *egc,
- libxl__checkpoint_devices_state *cds,
- libxl__device_kind kind,
- void *libxl_dev)
+ libxl__checkpoint_devices_state *cds,
+ libxl__device_kind kind,
+ void *libxl_dev)
{
libxl__checkpoint_device *dev = NULL;
@@ -89,9 +89,10 @@ static libxl__checkpoint_device* checkpoint_device_init(libxl__egc *egc,
}
static void checkpoint_devices_setup(libxl__egc *egc,
- libxl__checkpoint_devices_state *cds);
+ libxl__checkpoint_devices_state *cds);
-void libxl__checkpoint_devices_setup(libxl__egc *egc, libxl__checkpoint_devices_state *cds)
+void libxl__checkpoint_devices_setup(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds)
{
int i, rc;
@@ -137,7 +138,7 @@ out:
}
static void checkpoint_devices_setup(libxl__egc *egc,
- libxl__checkpoint_devices_state *cds)
+ libxl__checkpoint_devices_state *cds)
{
int i, rc;
@@ -223,7 +224,7 @@ static void all_devices_setup_cb(libxl__egc *egc,
}
void libxl__checkpoint_devices_teardown(libxl__egc *egc,
- libxl__checkpoint_devices_state *cds)
+ libxl__checkpoint_devices_state *cds)
{
int i;
libxl__checkpoint_device *dev;
@@ -285,12 +286,12 @@ static void devices_checkpoint_cb(libxl__egc *egc,
/* API implementations */
-#define define_checkpoint_api(api) \
-void libxl__checkpoint_devices_##api(libxl__egc *egc, \
- libxl__checkpoint_devices_state *cds) \
+#define define_checkpoint_api(api) \
+void libxl__checkpoint_devices_##api(libxl__egc *egc, \
+ libxl__checkpoint_devices_state *cds) \
{ \
int i; \
- libxl__checkpoint_device *dev; \
+ libxl__checkpoint_device *dev; \
\
STATE_AO_GC(cds->ao); \
\
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index aa00d33..25b1041 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2657,7 +2657,8 @@ typedef struct libxl__save_helper_state {
* Each device type needs to implement the interfaces specified in
* the libxl__checkpoint_device_instance_ops if it wishes to support Remus.
*
- * The high-level control flow through the checkpoint device layer is shown below:
+ * The high-level control flow through the checkpoint device layer is shown
+ * below:
*
* xl remus
* |-> libxl_domain_remus_start
@@ -2718,7 +2719,8 @@ int init_subkind_drbd_disk(libxl__checkpoint_devices_state *cds);
void cleanup_subkind_drbd_disk(libxl__checkpoint_devices_state *cds);
typedef void libxl__checkpoint_callback(libxl__egc *,
- libxl__checkpoint_devices_state *, int rc);
+ libxl__checkpoint_devices_state *,
+ int rc);
/*
* State associated with a checkpoint invocation, including parameters
@@ -2726,7 +2728,7 @@ typedef void libxl__checkpoint_callback(libxl__egc *,
* save/restore machinery.
*/
struct libxl__checkpoint_devices_state {
- /*---- must be set by caller of libxl__checkpoint_device_(setup|teardown) ----*/
+ /*-- must be set by caller of libxl__checkpoint_device_(setup|teardown) --*/
libxl__ao *ao;
uint32_t domid;
@@ -2739,7 +2741,8 @@ struct libxl__checkpoint_devices_state {
/*
* this array is allocated before setup the checkpoint devices by the
* checkpoint abstract layer.
- * devs may be NULL, means there's no checkpoint devices that has been set up.
+ * devs may be NULL, means there's no checkpoint devices that has been
+ * set up.
* the size of this array is 'num_devices', which is the total number
* of libxl nic devices and disk devices(num_nics + num_disks).
*/
@@ -2799,15 +2802,15 @@ struct libxl__checkpoint_device {
/* the following 5 APIs are async ops, call cds->callback when done */
_hidden void libxl__checkpoint_devices_setup(libxl__egc *egc,
- libxl__checkpoint_devices_state *cds);
+ libxl__checkpoint_devices_state *cds);
_hidden void libxl__checkpoint_devices_teardown(libxl__egc *egc,
- libxl__checkpoint_devices_state *cds);
+ libxl__checkpoint_devices_state *cds);
_hidden void libxl__checkpoint_devices_postsuspend(libxl__egc *egc,
- libxl__checkpoint_devices_state *cds);
+ libxl__checkpoint_devices_state *cds);
_hidden void libxl__checkpoint_devices_preresume(libxl__egc *egc,
- libxl__checkpoint_devices_state *cds);
+ libxl__checkpoint_devices_state *cds);
_hidden void libxl__checkpoint_devices_commit(libxl__egc *egc,
- libxl__checkpoint_devices_state *cds);
+ libxl__checkpoint_devices_state *cds);
_hidden int libxl__netbuffer_enabled(libxl__gc *gc);
/*----- Domain suspend (save) state structure -----*/
diff --git a/tools/libxl/libxl_remus.c b/tools/libxl/libxl_remus.c
index 211216c..5afa618 100644
--- a/tools/libxl/libxl_remus.c
+++ b/tools/libxl/libxl_remus.c
@@ -21,9 +21,11 @@
/*----- remus: setup the environment -----*/
static void libxl__remus_setup_done(libxl__egc *egc,
- libxl__checkpoint_devices_state *cds, int rc);
+ libxl__checkpoint_devices_state *cds,
+ int rc);
static void libxl__remus_setup_failed(libxl__egc *egc,
- libxl__checkpoint_devices_state *cds, int rc);
+ libxl__checkpoint_devices_state *cds,
+ int rc);
void libxl__remus_setup(libxl__egc *egc,
libxl__domain_suspend_state *dss)
@@ -57,7 +59,8 @@ out:
}
static void libxl__remus_setup_done(libxl__egc *egc,
- libxl__checkpoint_devices_state *cds, int rc)
+ libxl__checkpoint_devices_state *cds,
+ int rc)
{
libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, cds);
STATE_AO_GC(dss->ao);
@@ -222,7 +225,8 @@ out:
/*----- remus: wait a new checkpoint -----*/
static void remus_checkpoint_dm_saved(libxl__egc *egc,
- libxl__domain_suspend_state *dss, int rc);
+ libxl__domain_suspend_state *dss,
+ int rc);
static void remus_devices_commit_cb(libxl__egc *egc,
libxl__checkpoint_devices_state *cds,
int rc);
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 04/18] don't touch remus in checkpoint_device
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (2 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 03/18] adjust the indentation Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 05/18] Update libxl_save_msgs_gen.pl to support return data from xl to xc Wen Congyang
` (15 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Shriram Rajagopalan, Yang Hongyang, Lai Jiangshan
The checkpoint device is an abstraction layer for checkpointing, and
COLO can also use it. However, some code in the checkpoint device
layer still touches Remus directly:
1. remus_ops: the checkpoint device code uses the Remus ops directly.
Store the ops in the checkpoint devices state instead.
2. The concrete layer's private members: add a new structure,
libxl__remus_state, and move these members into it.
3. Init/cleanup of device subkinds: (init|cleanup)_subkind_nic and
(init|cleanup)_subkind_drbd_disk are called directly in the checkpoint
device code. Call them before calling libxl__checkpoint_devices_setup()
or after calling libxl__checkpoint_devices_teardown() instead.
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Cc: Shriram Rajagopalan <rshriram@cs.ubc.ca>
---
tools/libxl/libxl.c | 2 +-
tools/libxl/libxl_checkpoint_device.c | 52 ++------------------
tools/libxl/libxl_dom.c | 3 +-
tools/libxl/libxl_internal.h | 37 ++++++++++-----
tools/libxl/libxl_netbuffer.c | 51 +++++++++++---------
tools/libxl/libxl_remus.c | 89 +++++++++++++++++++++++++++--------
tools/libxl/libxl_remus.h | 5 +-
tools/libxl/libxl_remus_disk_drbd.c | 9 ++--
8 files changed, 136 insertions(+), 112 deletions(-)
diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index b3d01f8..fa990fe 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -858,7 +858,7 @@ int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info,
assert(info);
/* Point of no return */
- libxl__remus_setup(egc, dss);
+ libxl__remus_setup(egc, &dss->rs);
return AO_INPROGRESS;
out:
diff --git a/tools/libxl/libxl_checkpoint_device.c b/tools/libxl/libxl_checkpoint_device.c
index 0cfabc3..2b7318f 100644
--- a/tools/libxl/libxl_checkpoint_device.c
+++ b/tools/libxl/libxl_checkpoint_device.c
@@ -17,46 +17,6 @@
#include "libxl_internal.h"
-extern const libxl__checkpoint_device_instance_ops remus_device_nic;
-extern const libxl__checkpoint_device_instance_ops remus_device_drbd_disk;
-static const libxl__checkpoint_device_instance_ops *remus_ops[] = {
- &remus_device_nic,
- &remus_device_drbd_disk,
- NULL,
-};
-
-/*----- helper functions -----*/
-
-static int init_device_subkind(libxl__checkpoint_devices_state *cds)
-{
- /* init device subkind-specific state in the libxl ctx */
- int rc;
- STATE_AO_GC(cds->ao);
-
- if (libxl__netbuffer_enabled(gc)) {
- rc = init_subkind_nic(cds);
- if (rc) goto out;
- }
-
- rc = init_subkind_drbd_disk(cds);
- if (rc) goto out;
-
- rc = 0;
-out:
- return rc;
-}
-
-static void cleanup_device_subkind(libxl__checkpoint_devices_state *cds)
-{
- /* cleanup device subkind-specific state in the libxl ctx */
- STATE_AO_GC(cds->ao);
-
- if (libxl__netbuffer_enabled(gc))
- cleanup_subkind_nic(cds);
-
- cleanup_subkind_drbd_disk(cds);
-}
-
/*----- setup() and teardown() -----*/
/* callbacks */
@@ -94,14 +54,10 @@ static void checkpoint_devices_setup(libxl__egc *egc,
void libxl__checkpoint_devices_setup(libxl__egc *egc,
libxl__checkpoint_devices_state *cds)
{
- int i, rc;
+ int i;
STATE_AO_GC(cds->ao);
- rc = init_device_subkind(cds);
- if (rc)
- goto out;
-
cds->num_devices = 0;
cds->num_nics = 0;
cds->num_disks = 0;
@@ -134,7 +90,7 @@ void libxl__checkpoint_devices_setup(libxl__egc *egc,
return;
out:
- cds->callback(egc, cds, rc);
+ cds->callback(egc, cds, 0);
}
static void checkpoint_devices_setup(libxl__egc *egc,
@@ -172,7 +128,7 @@ static void device_setup_iterate(libxl__egc *egc, libxl__ao_device *aodev)
goto out;
do {
- dev->ops = remus_ops[++dev->ops_index];
+ dev->ops = dev->cds->ops[++dev->ops_index];
if (!dev->ops) {
libxl_device_nic * nic = NULL;
libxl_device_disk * disk = NULL;
@@ -271,8 +227,6 @@ static void devices_teardown_cb(libxl__egc *egc,
cds->disks = NULL;
cds->num_disks = 0;
- cleanup_device_subkind(cds);
-
cds->callback(egc, cds, rc);
}
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index 30c0de0..3359a9f 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -1698,7 +1698,6 @@ void libxl__domain_suspend(libxl__egc *egc, libxl__domain_suspend_state *dss)
dss2->save_dm = 1;
if (r_info != NULL) {
- dss->interval = r_info->interval;
if (libxl_defbool_val(r_info->compression))
dss->xcflags |= XCFLAGS_CHECKPOINT_COMPRESS;
}
@@ -1880,7 +1879,7 @@ static void domain_suspend_done(libxl__egc *egc,
dss2->guest_evtchn.port, &dss2->guest_evtchn_lockfd);
if (dss->remus) {
- libxl__remus_teardown(egc, dss, rc);
+ libxl__remus_teardown(egc, &dss->rs, rc);
return;
}
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 25b1041..040dee5 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2734,6 +2734,8 @@ struct libxl__checkpoint_devices_state {
uint32_t domid;
libxl__checkpoint_callback *callback;
int device_kind_flags;
+ /* The ops must be pointer array, and the last ops must be NULL */
+ const libxl__checkpoint_device_instance_ops **ops;
/*----- private for abstract layer only -----*/
@@ -2754,16 +2756,6 @@ struct libxl__checkpoint_devices_state {
int num_disks;
libxl__multidev multidev;
-
- /*----- private for concrete (device-specific) layer only -----*/
-
- /* private for nic device subkind ops */
- char *netbufscript;
- struct nl_sock *nlsock;
- struct nl_cache *qdisc_cache;
-
- /* private for drbd disk subkind ops */
- char *drbd_probe_script;
};
/*
@@ -2811,6 +2803,27 @@ _hidden void libxl__checkpoint_devices_preresume(libxl__egc *egc,
libxl__checkpoint_devices_state *cds);
_hidden void libxl__checkpoint_devices_commit(libxl__egc *egc,
libxl__checkpoint_devices_state *cds);
+
+/*----- Remus related state structure -----*/
+typedef struct libxl__remus_state libxl__remus_state;
+struct libxl__remus_state {
+ /* private */
+ libxl__ev_time checkpoint_timeout; /* used for Remus checkpoint */
+ int interval; /* checkpoint interval */
+
+ /* abstract layer */
+ libxl__checkpoint_devices_state cds;
+
+ /*----- private for concrete (device-specific) layer only -----*/
+ /* private for nic device subkind ops */
+ char *netbufscript;
+ struct nl_sock *nlsock;
+ struct nl_cache *qdisc_cache;
+
+ /* private for drbd disk subkind ops */
+ char *drbd_probe_script;
+};
+
_hidden int libxl__netbuffer_enabled(libxl__gc *gc);
/*----- Domain suspend (save) state structure -----*/
@@ -2874,9 +2887,7 @@ struct libxl__domain_suspend_state {
libxl__domain_suspend_state2 dss2;
int hvm;
int xcflags;
- libxl__checkpoint_devices_state cds;
- libxl__ev_time checkpoint_timeout; /* used for Remus checkpoint */
- int interval; /* checkpoint interval (for Remus) */
+ libxl__remus_state rs;
libxl__save_helper_state shs;
libxl__logdirty_switch logdirty;
/* private for libxl__domain_save_device_model */
diff --git a/tools/libxl/libxl_netbuffer.c b/tools/libxl/libxl_netbuffer.c
index 2d668dd..69a261b 100644
--- a/tools/libxl/libxl_netbuffer.c
+++ b/tools/libxl/libxl_netbuffer.c
@@ -41,18 +41,19 @@ int libxl__netbuffer_enabled(libxl__gc *gc)
int init_subkind_nic(libxl__checkpoint_devices_state *cds)
{
int rc, ret;
- libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, cds);
+ libxl__remus_state *rs = CONTAINER_OF(cds, *rs, cds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(rs, *dss, rs);
STATE_AO_GC(cds->ao);
- cds->nlsock = nl_socket_alloc();
- if (!cds->nlsock) {
+ rs->nlsock = nl_socket_alloc();
+ if (!rs->nlsock) {
LOG(ERROR, "cannot allocate nl socket");
rc = ERROR_FAIL;
goto out;
}
- ret = nl_connect(cds->nlsock, NETLINK_ROUTE);
+ ret = nl_connect(rs->nlsock, NETLINK_ROUTE);
if (ret) {
LOG(ERROR, "failed to open netlink socket: %s",
nl_geterror(ret));
@@ -61,7 +62,7 @@ int init_subkind_nic(libxl__checkpoint_devices_state *cds)
}
/* get list of all qdiscs installed on network devs. */
- ret = rtnl_qdisc_alloc_cache(cds->nlsock, &cds->qdisc_cache);
+ ret = rtnl_qdisc_alloc_cache(rs->nlsock, &rs->qdisc_cache);
if (ret) {
LOG(ERROR, "failed to allocate qdisc cache: %s",
nl_geterror(ret));
@@ -70,10 +71,10 @@ int init_subkind_nic(libxl__checkpoint_devices_state *cds)
}
if (dss->remus->netbufscript) {
- cds->netbufscript = libxl__strdup(gc, dss->remus->netbufscript);
+ rs->netbufscript = libxl__strdup(gc, dss->remus->netbufscript);
} else {
- cds->netbufscript = GCSPRINTF("%s/remus-netbuf-setup",
- libxl__xen_script_dir_path());
+ rs->netbufscript = GCSPRINTF("%s/remus-netbuf-setup",
+ libxl__xen_script_dir_path());
}
rc = 0;
@@ -84,20 +85,22 @@ out:
void cleanup_subkind_nic(libxl__checkpoint_devices_state *cds)
{
+ libxl__remus_state *rs = CONTAINER_OF(cds, *rs, cds);
+
STATE_AO_GC(cds->ao);
/* free qdisc cache */
- if (cds->qdisc_cache) {
- nl_cache_clear(cds->qdisc_cache);
- nl_cache_free(cds->qdisc_cache);
- cds->qdisc_cache = NULL;
+ if (rs->qdisc_cache) {
+ nl_cache_clear(rs->qdisc_cache);
+ nl_cache_free(rs->qdisc_cache);
+ rs->qdisc_cache = NULL;
}
/* close & free nlsock */
- if (cds->nlsock) {
- nl_close(cds->nlsock);
- nl_socket_free(cds->nlsock);
- cds->nlsock = NULL;
+ if (rs->nlsock) {
+ nl_close(rs->nlsock);
+ nl_socket_free(rs->nlsock);
+ rs->nlsock = NULL;
}
}
@@ -150,13 +153,14 @@ static int init_qdisc(libxl__checkpoint_devices_state *cds,
int rc, ret, ifindex;
struct rtnl_link *ifb = NULL;
struct rtnl_qdisc *qdisc = NULL;
+ libxl__remus_state *rs = CONTAINER_OF(cds, *rs, cds);
STATE_AO_GC(cds->ao);
/* Now that we have brought up REMUS_IFB device with plug qdisc for
* this vif, so we need to refill the qdisc cache.
*/
- ret = nl_cache_refill(cds->nlsock, cds->qdisc_cache);
+ ret = nl_cache_refill(rs->nlsock, rs->qdisc_cache);
if (ret) {
LOG(ERROR, "cannot refill qdisc cache: %s", nl_geterror(ret));
rc = ERROR_FAIL;
@@ -164,7 +168,7 @@ static int init_qdisc(libxl__checkpoint_devices_state *cds,
}
/* get a handle to the REMUS_IFB interface */
- ret = rtnl_link_get_kernel(cds->nlsock, 0, remus_nic->ifb, &ifb);
+ ret = rtnl_link_get_kernel(rs->nlsock, 0, remus_nic->ifb, &ifb);
if (ret) {
LOG(ERROR, "cannot obtain handle for %s: %s", remus_nic->ifb,
nl_geterror(ret));
@@ -187,7 +191,7 @@ static int init_qdisc(libxl__checkpoint_devices_state *cds,
* There is no need to explicitly free this qdisc as its just a
* reference from the qdisc cache we allocated earlier.
*/
- qdisc = rtnl_qdisc_get_by_parent(cds->qdisc_cache, ifindex, TC_H_ROOT);
+ qdisc = rtnl_qdisc_get_by_parent(rs->qdisc_cache, ifindex, TC_H_ROOT);
if (qdisc) {
const char *tc_kind = rtnl_tc_get_kind(TC_CAST(qdisc));
/* Sanity check: Ensure that the root qdisc is a plug qdisc. */
@@ -238,11 +242,12 @@ static void setup_async_exec(libxl__checkpoint_device *dev, char *op)
libxl__remus_device_nic *remus_nic = dev->concrete_data;
libxl__checkpoint_devices_state *cds = dev->cds;
libxl__async_exec_state *aes = &dev->aodev.aes;
+ libxl__remus_state *rs = CONTAINER_OF(cds, *rs, cds);
STATE_AO_GC(cds->ao);
/* Convenience aliases */
- char *const script = libxl__strdup(gc, cds->netbufscript);
+ char *const script = libxl__strdup(gc, rs->netbufscript);
const uint32_t domid = cds->domid;
const int dev_id = remus_nic->devid;
const char *const vif = remus_nic->vif;
@@ -333,6 +338,7 @@ static void netbuf_setup_script_cb(libxl__egc *egc,
libxl__checkpoint_device *dev = CONTAINER_OF(aodev, *dev, aodev);
libxl__remus_device_nic *remus_nic = dev->concrete_data;
libxl__checkpoint_devices_state *cds = dev->cds;
+ libxl__remus_state *rs = CONTAINER_OF(cds, *rs, cds);
const char *out_path_base, *hotplug_error = NULL;
int rc;
@@ -373,7 +379,7 @@ static void netbuf_setup_script_cb(libxl__egc *egc,
if (hotplug_error) {
LOG(ERROR, "netbuf script %s setup failed for vif %s: %s",
- cds->netbufscript, vif, hotplug_error);
+ rs->netbufscript, vif, hotplug_error);
rc = ERROR_FAIL;
goto out;
}
@@ -444,6 +450,7 @@ static int remus_netbuf_op(libxl__remus_device_nic *remus_nic,
int buffer_op)
{
int rc, ret;
+ libxl__remus_state *rs = CONTAINER_OF(cds, *rs, cds);
STATE_AO_GC(cds->ao);
@@ -457,7 +464,7 @@ static int remus_netbuf_op(libxl__remus_device_nic *remus_nic,
goto out;
}
- ret = rtnl_qdisc_add(cds->nlsock, remus_nic->qdisc, NLM_F_REQUEST);
+ ret = rtnl_qdisc_add(rs->nlsock, remus_nic->qdisc, NLM_F_REQUEST);
if (ret) {
rc = ERROR_FAIL;
goto out;
diff --git a/tools/libxl/libxl_remus.c b/tools/libxl/libxl_remus.c
index 5afa618..e393c2e 100644
--- a/tools/libxl/libxl_remus.c
+++ b/tools/libxl/libxl_remus.c
@@ -18,6 +18,45 @@
#include "libxl_internal.h"
#include "libxl_remus.h"
+extern const libxl__checkpoint_device_instance_ops remus_device_nic;
+extern const libxl__checkpoint_device_instance_ops remus_device_drbd_disk;
+static const libxl__checkpoint_device_instance_ops *remus_ops[] = {
+ &remus_device_nic,
+ &remus_device_drbd_disk,
+ NULL,
+};
+
+/*----- helper functions -----*/
+
+static int init_device_subkind(libxl__checkpoint_devices_state *cds)
+{
+ /* init device subkind-specific state in the libxl ctx */
+ int rc;
+ STATE_AO_GC(cds->ao);
+
+ if (libxl__netbuffer_enabled(gc)) {
+ rc = init_subkind_nic(cds);
+ if (rc) goto out;
+ }
+
+ rc = init_subkind_drbd_disk(cds);
+ if (rc) goto out;
+
+ rc = 0;
+out:
+ return rc;
+}
+
+static void cleanup_device_subkind(libxl__checkpoint_devices_state *cds)
+{
+ /* cleanup device subkind-specific state in the libxl ctx */
+ STATE_AO_GC(cds->ao);
+
+ if (libxl__netbuffer_enabled(gc))
+ cleanup_subkind_nic(cds);
+
+ cleanup_subkind_drbd_disk(cds);
+}
/*----- remus: setup the environment -----*/
static void libxl__remus_setup_done(libxl__egc *egc,
@@ -27,11 +66,12 @@ static void libxl__remus_setup_failed(libxl__egc *egc,
libxl__checkpoint_devices_state *cds,
int rc);
-void libxl__remus_setup(libxl__egc *egc,
- libxl__domain_suspend_state *dss)
+void libxl__remus_setup(libxl__egc *egc, libxl__remus_state *rs)
{
+ libxl__domain_suspend_state *dss = CONTAINER_OF(rs, *dss, rs);
+
/* Convenience aliases */
- libxl__checkpoint_devices_state *const cds = &dss->cds;
+ libxl__checkpoint_devices_state *const cds = &rs->cds;
const libxl_domain_remus_info *const info = dss->remus;
STATE_AO_GC(dss->ao);
@@ -50,19 +90,24 @@ void libxl__remus_setup(libxl__egc *egc,
cds->ao = ao;
cds->domid = dss->domid;
cds->callback = libxl__remus_setup_done;
+ cds->ops = remus_ops;
+ rs->interval = info->interval;
+
+ if (init_device_subkind(cds))
+ goto out;
libxl__checkpoint_devices_setup(egc, cds);
return;
out:
- libxl__remus_setup_failed(egc, cds, ERROR_FAIL);
+ dss->callback(egc, dss, ERROR_FAIL);
}
static void libxl__remus_setup_done(libxl__egc *egc,
libxl__checkpoint_devices_state *cds,
int rc)
{
- libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, cds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, rs.cds);
STATE_AO_GC(dss->ao);
if (!rc) {
@@ -80,13 +125,15 @@ static void libxl__remus_setup_failed(libxl__egc *egc,
libxl__checkpoint_devices_state *cds,
int rc)
{
- libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, cds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, rs.cds);
STATE_AO_GC(dss->ao);
if (rc)
LOG(ERROR, "Remus: failed to teardown device after setup failed"
" for guest with domid %u, rc %d", dss->domid, rc);
+ cleanup_device_subkind(cds);
+
dss->callback(egc, dss, rc);
}
@@ -97,9 +144,11 @@ static void remus_teardown_done(libxl__egc *egc,
int rc);
void libxl__remus_teardown(libxl__egc *egc,
- libxl__domain_suspend_state *dss,
+ libxl__remus_state *rs,
int rc)
{
+ libxl__domain_suspend_state *dss = CONTAINER_OF(rs, *dss, rs);
+
EGC_GC;
/*
@@ -110,21 +159,23 @@ void libxl__remus_teardown(libxl__egc *egc,
*/
LOG(WARN, "Remus: Domain suspend terminated with rc %d,"
" teardown Remus devices...", rc);
- dss->cds.callback = remus_teardown_done;
- libxl__checkpoint_devices_teardown(egc, &dss->cds);
+ dss->rs.cds.callback = remus_teardown_done;
+ libxl__checkpoint_devices_teardown(egc, &dss->rs.cds);
}
static void remus_teardown_done(libxl__egc *egc,
libxl__checkpoint_devices_state *cds,
int rc)
{
- libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, cds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, rs.cds);
STATE_AO_GC(dss->ao);
if (rc)
LOG(ERROR, "Remus: failed to teardown device for guest with domid %u,"
" rc %d", dss->domid, rc);
+ cleanup_device_subkind(cds);
+
dss->callback(egc, dss, rc);
}
@@ -157,7 +208,7 @@ static void remus_domain_suspend_callback_common_done(libxl__egc *egc,
if (!ok)
goto out;
- libxl__checkpoint_devices_state *const cds = &dss->cds;
+ libxl__checkpoint_devices_state *const cds = &dss->rs.cds;
cds->callback = remus_devices_postsuspend_cb;
libxl__checkpoint_devices_postsuspend(egc, cds);
return;
@@ -171,7 +222,7 @@ static void remus_devices_postsuspend_cb(libxl__egc *egc,
int rc)
{
int ok = 0;
- libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, cds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, rs.cds);
if (rc)
goto out;
@@ -195,7 +246,7 @@ void libxl__remus_domain_resume_callback(void *data)
libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
STATE_AO_GC(dss->ao);
- libxl__checkpoint_devices_state *const cds = &dss->cds;
+ libxl__checkpoint_devices_state *const cds = &dss->rs.cds;
cds->callback = remus_devices_preresume_cb;
libxl__checkpoint_devices_preresume(egc, cds);
}
@@ -205,7 +256,7 @@ static void remus_devices_preresume_cb(libxl__egc *egc,
int rc)
{
int ok = 0;
- libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, cds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, rs.cds);
STATE_AO_GC(dss->ao);
if (rc)
@@ -252,7 +303,7 @@ static void remus_checkpoint_dm_saved(libxl__egc *egc,
libxl__domain_suspend_state *dss, int rc)
{
/* Convenience aliases */
- libxl__checkpoint_devices_state *const cds = &dss->cds;
+ libxl__checkpoint_devices_state *const cds = &dss->rs.cds;
STATE_AO_GC(dss->ao);
@@ -274,7 +325,7 @@ static void remus_devices_commit_cb(libxl__egc *egc,
libxl__checkpoint_devices_state *cds,
int rc)
{
- libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, cds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(cds, *dss, rs.cds);
STATE_AO_GC(dss->ao);
@@ -292,9 +343,9 @@ static void remus_devices_commit_cb(libxl__egc *egc,
*/
/* Set checkpoint interval timeout */
- rc = libxl__ev_time_register_rel(gc, &dss->checkpoint_timeout,
+ rc = libxl__ev_time_register_rel(gc, &dss->rs.checkpoint_timeout,
remus_next_checkpoint,
- dss->interval);
+ dss->rs.interval);
if (rc)
goto out;
@@ -309,7 +360,7 @@ static void remus_next_checkpoint(libxl__egc *egc, libxl__ev_time *ev,
const struct timeval *requested_abs)
{
libxl__domain_suspend_state *dss =
- CONTAINER_OF(ev, *dss, checkpoint_timeout);
+ CONTAINER_OF(ev, *dss, rs.checkpoint_timeout);
STATE_AO_GC(dss->ao);
diff --git a/tools/libxl/libxl_remus.h b/tools/libxl/libxl_remus.h
index 53e5e81..15bbbe8 100644
--- a/tools/libxl/libxl_remus.h
+++ b/tools/libxl/libxl_remus.h
@@ -16,10 +16,9 @@
#ifndef LIBXL_REMUS_H
#define LIBXL_REMUS_H
-void libxl__remus_setup(libxl__egc *egc,
- libxl__domain_suspend_state *dss);
+void libxl__remus_setup(libxl__egc *egc, libxl__remus_state *rs);
void libxl__remus_teardown(libxl__egc *egc,
- libxl__domain_suspend_state *dss,
+ libxl__remus_state *rs,
int rc);
void libxl__remus_domain_suspend_callback(void *data);
void libxl__remus_domain_resume_callback(void *data);
diff --git a/tools/libxl/libxl_remus_disk_drbd.c b/tools/libxl/libxl_remus_disk_drbd.c
index 039ffb1..59d5305 100644
--- a/tools/libxl/libxl_remus_disk_drbd.c
+++ b/tools/libxl/libxl_remus_disk_drbd.c
@@ -28,10 +28,12 @@ typedef struct libxl__remus_drbd_disk {
int init_subkind_drbd_disk(libxl__checkpoint_devices_state *cds)
{
+ libxl__remus_state *rs = CONTAINER_OF(cds, *rs, cds);
+
STATE_AO_GC(cds->ao);
- cds->drbd_probe_script = GCSPRINTF("%s/block-drbd-probe",
- libxl__xen_script_dir_path());
+ rs->drbd_probe_script = GCSPRINTF("%s/block-drbd-probe",
+ libxl__xen_script_dir_path());
return 0;
}
@@ -96,6 +98,7 @@ static void match_async_exec(libxl__egc *egc, libxl__checkpoint_device *dev)
int arraysize, nr = 0, rc;
const libxl_device_disk *disk = dev->backend_dev;
libxl__async_exec_state *aes = &dev->aodev.aes;
+ libxl__remus_state *rs = CONTAINER_OF(dev->cds, *rs, cds);
STATE_AO_GC(dev->cds->ao);
/* setup env & args */
@@ -107,7 +110,7 @@ static void match_async_exec(libxl__egc *egc, libxl__checkpoint_device *dev)
arraysize = 3;
nr = 0;
GCNEW_ARRAY(aes->args, arraysize);
- aes->args[nr++] = dev->cds->drbd_probe_script;
+ aes->args[nr++] = rs->drbd_probe_script;
aes->args[nr++] = disk->pdev_path;
aes->args[nr++] = NULL;
assert(nr <= arraysize);
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 05/18] Update libxl_save_msgs_gen.pl to support return data from xl to xc
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (3 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 04/18] don't touch remus in checkpoint_device Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 06/18] Allow slave sends data to master Wen Congyang
` (14 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Yang Hongyang, Lai Jiangshan
Currently, all callbacks return either an integer value or void, so a
callback cannot return a data buffer to xc. Update libxl_save_msgs_gen.pl
to support callbacks that return a pointer to data.
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
tools/libxl/libxl_internal.h | 3 ++
tools/libxl/libxl_save_callout.c | 31 ++++++++++++++++++
tools/libxl/libxl_save_helper.c | 17 ++++++++++
tools/libxl/libxl_save_msgs_gen.pl | 65 ++++++++++++++++++++++++++++++++++----
4 files changed, 109 insertions(+), 7 deletions(-)
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 040dee5..d4aa209 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -3184,6 +3184,9 @@ _hidden void libxl__xc_domain_save_done(libxl__egc*, void *dss_void,
* When they are ready to indicate completion, they call this. */
void libxl__xc_domain_saverestore_async_callback_done(libxl__egc *egc,
libxl__save_helper_state *shs, int return_value);
+void libxl__xc_domain_saverestore_async_callback_done_with_data(libxl__egc *egc,
+ libxl__save_helper_state *shs,
+ const void *data, uint64_t size);
_hidden void libxl__domain_suspend_common_switch_qemu_logdirty
diff --git a/tools/libxl/libxl_save_callout.c b/tools/libxl/libxl_save_callout.c
index 40b25e4..477e633 100644
--- a/tools/libxl/libxl_save_callout.c
+++ b/tools/libxl/libxl_save_callout.c
@@ -145,6 +145,15 @@ void libxl__xc_domain_saverestore_async_callback_done(libxl__egc *egc,
shs->egc = 0;
}
+void libxl__xc_domain_saverestore_async_callback_done_with_data(libxl__egc *egc,
+ libxl__save_helper_state *shs,
+ const void *data, uint64_t size)
+{
+ shs->egc = egc;
+ libxl__srm_callout_sendreply_data(data, size, shs);
+ shs->egc = 0;
+}
+
/*----- helper execution -----*/
static void run_helper(libxl__egc *egc, libxl__save_helper_state *shs,
@@ -370,6 +379,28 @@ void libxl__srm_callout_sendreply(int r, void *user)
helper_failed(egc, shs, ERROR_FAIL);
}
+void libxl__srm_callout_sendreply_data(const void *data, uint64_t size, void *user)
+{
+ libxl__save_helper_state *shs = user;
+ libxl__egc *egc = shs->egc;
+ STATE_AO_GC(shs->ao);
+ int errnoval;
+
+ errnoval = libxl_write_exactly(CTX, libxl__carefd_fd(shs->pipes[0]),
+ &size, sizeof(size), shs->stdin_what,
+ "callback return data length");
+ if (errnoval)
+ goto out;
+
+ errnoval = libxl_write_exactly(CTX, libxl__carefd_fd(shs->pipes[0]),
+ data, size, shs->stdin_what,
+ "callback return data");
+
+out:
+ if (errnoval)
+ helper_failed(egc, shs, ERROR_FAIL);
+}
+
void libxl__srm_callout_callback_log(uint32_t level, uint32_t errnoval,
const char *context, const char *formatted, void *user)
{
diff --git a/tools/libxl/libxl_save_helper.c b/tools/libxl/libxl_save_helper.c
index 74826a1..44c5807 100644
--- a/tools/libxl/libxl_save_helper.c
+++ b/tools/libxl/libxl_save_helper.c
@@ -155,6 +155,23 @@ int helper_getreply(void *user)
return v;
}
+uint8_t *helper_getreply_data(void *user)
+{
+ uint64_t size;
+ int r = read_exactly(0, &size, sizeof(size));
+ uint8_t *data;
+
+ if (r <= 0)
+ exit(-2);
+
+ data = helper_allocbuf(size, user);
+ r = read_exactly(0, data, size);
+ if (r <= 0)
+ exit(-2);
+
+ return data;
+}
+
/*----- other callbacks -----*/
static int toolstack_save_fd;
diff --git a/tools/libxl/libxl_save_msgs_gen.pl b/tools/libxl/libxl_save_msgs_gen.pl
index 6b4b65e..41ee000 100755
--- a/tools/libxl/libxl_save_msgs_gen.pl
+++ b/tools/libxl/libxl_save_msgs_gen.pl
@@ -15,6 +15,7 @@ our @msgs = (
# and its null-ness needs to be passed through to the helper's xc
# W - needs a return value; callback is synchronous
# A - needs a return value; callback is asynchronous
+ # B - return value is a pointer
[ 1, 'sr', "log", [qw(uint32_t level
uint32_t errnoval
STRING context
@@ -99,23 +100,28 @@ our $libxl = "libxl__srm";
our $callback = "${libxl}_callout_callback";
our $receiveds = "${libxl}_callout_received";
our $sendreply = "${libxl}_callout_sendreply";
+our $sendreply_data = "${libxl}_callout_sendreply_data";
our $getcallbacks = "${libxl}_callout_get_callbacks";
our $enumcallbacks = "${libxl}_callout_enumcallbacks";
sub cbtype ($) { "${libxl}_".$_[0]."_autogen_callbacks"; };
f_decl($sendreply, 'callout', 'void', "(int r, void *user)");
+f_decl($sendreply_data, 'callout', 'void',
+ "(const void *data, uint64_t size, void *user)");
our $helper = "helper";
our $encode = "${helper}_stub";
our $allocbuf = "${helper}_allocbuf";
our $transmit = "${helper}_transmitmsg";
our $getreply = "${helper}_getreply";
+our $getreply_data = "${helper}_getreply_data";
our $setcallbacks = "${helper}_setcallbacks";
f_decl($allocbuf, 'helper', 'unsigned char *', '(int len, void *user)');
f_decl($transmit, 'helper', 'void',
'(unsigned char *msg_freed, int len, void *user)');
f_decl($getreply, 'helper', 'int', '(void *user)');
+f_decl($getreply_data, 'helper', 'uint8_t *', '(void *user)');
sub typeid ($) { my ($t) = @_; $t =~ s/\W/_/; return $t; };
@@ -259,12 +265,36 @@ foreach my $msginfo (@msgs) {
$f_more_sr->(" case $msgnum: { /* $name */\n");
if ($flags =~ m/W/) {
- $f_more_sr->(" int r;\n");
+ if ($flags =~ m/B/) {
+ $f_more_sr->(" uint8_t *data;\n".
+ " uint64_t size;\n");
+ } else {
+ $f_more_sr->(" int r;\n");
+ }
}
- my $c_rtype_helper = $flags =~ m/[WA]/ ? 'int' : 'void';
- my $c_rtype_callout = $flags =~ m/W/ ? 'int' : 'void';
+ my $c_rtype_helper;
+ if ($flags =~ m/[WA]/) {
+ if ($flags =~ m/B/) {
+ $c_rtype_helper = 'uint8_t *'
+ } else {
+ $c_rtype_helper = 'int'
+ }
+ } else {
+ $c_rtype_helper = 'void';
+ }
+ my $c_rtype_callout;
+ if ($flags =~ m/W/) {
+ if ($flags =~ m/B/) {
+ $c_rtype_callout = 'uint8_t *';
+ } else {
+ $c_rtype_callout = 'int';
+ }
+ } else {
+ $c_rtype_callout = 'void';
+ }
my $c_decl = '(';
+ my $c_helper_decl = '';
my $c_callback_args = '';
f_more("${encode}_$name",
@@ -305,7 +335,15 @@ END_ALWAYS
f_more("${encode}_$name", " ${typeid}_put(buf, &len, $c_args);\n");
}
$f_more_sr->($c_recv);
+ $c_helper_decl = $c_decl;
+ if ($flags =~ m/W/ and $flags =~ m/B/) {
+ $c_decl .= "uint64_t *size, "
+ }
$c_decl .= "void *user)";
+ $c_helper_decl .= "void *user)";
+ if ($flags =~ m/W/ and $flags =~ m/B/) {
+ $c_callback_args .= "&size, "
+ }
$c_callback_args .= "user";
$f_more_sr->(" if (msg != endmsg) return 0;\n");
@@ -326,10 +364,12 @@ END_ALWAYS
my $c_make_callback = "$c_callback($c_callback_args)";
if ($flags !~ m/W/) {
$f_more_sr->(" $c_make_callback;\n");
+ } elsif ($flags =~ m/B/) {
+ $f_more_sr->(" data = $c_make_callback;\n".
+ " $sendreply_data(data, size, user);\n");
} else {
$f_more_sr->(" r = $c_make_callback;\n".
" $sendreply(r, user);\n");
- f_decl($sendreply, 'callout', 'void', '(int r, void *user)');
}
if ($flags =~ m/x/) {
my $c_v = "(1u<<$msgnum)";
@@ -340,7 +380,7 @@ END_ALWAYS
}
$f_more_sr->(" return 1;\n }\n\n");
f_decl("${callback}_$name", 'callout', $c_rtype_callout, $c_decl);
- f_decl("${encode}_$name", 'helper', $c_rtype_helper, $c_decl);
+ f_decl("${encode}_$name", 'helper', $c_rtype_helper, $c_helper_decl);
f_more("${encode}_$name",
" if (buf) break;
buf = ${helper}_allocbuf(len, user);
@@ -352,12 +392,23 @@ END_ALWAYS
${transmit}(buf, len, user);
");
if ($flags =~ m/[WA]/) {
- f_more("${encode}_$name",
- (<<END_ALWAYS.($debug ? <<END_DEBUG : '').<<END_ALWAYS));
+ if ($flags =~ m/B/) {
+ f_more("${encode}_$name",
+ (<<END_ALWAYS.($debug ? <<END_DEBUG : '')));
+ uint8_t *r = ${helper}_getreply_data(user);
+END_ALWAYS
+ fprintf(stderr,"libxl-save-helper: $name got reply data\\n");
+END_DEBUG
+ } else {
+ f_more("${encode}_$name",
+ (<<END_ALWAYS.($debug ? <<END_DEBUG : '')));
int r = ${helper}_getreply(user);
END_ALWAYS
fprintf(stderr,"libxl-save-helper: $name got reply %d\\n",r);
END_DEBUG
+ }
+
+ f_more("${encode}_$name", (<<END_ALWAYS));
return r;
END_ALWAYS
}
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 06/18] Allow slave sends data to master
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (4 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 05/18] Update libxl_save_msgs_gen.pl to support return data from xl to xc Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 07/18] secondary vm suspend/resume/checkpoint code Wen Congyang
` (13 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Yang Hongyang, Lai Jiangshan
In COLO mode, the slave needs to send data to the master, but io_fd
can only be written on the master and can only be read on the slave.
Save recv_fd in domain_suspend_state, and send_fd in
domain_create_state.
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
tools/libxl/libxl.c | 2 +-
tools/libxl/libxl_create.c | 14 ++++++++++----
tools/libxl/libxl_internal.h | 2 ++
tools/libxl/libxl_types.idl | 7 +++++++
tools/libxl/xl_cmdimpl.c | 7 +++++++
5 files changed, 27 insertions(+), 5 deletions(-)
diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index fa990fe..145e897 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -849,7 +849,7 @@ int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info,
dss->callback = remus_failover_cb;
dss->domid = domid;
dss->fd = send_fd;
- /* TODO do something with recv_fd */
+ dss->recv_fd = recv_fd;
dss->type = type;
dss->live = 1;
dss->debug = 0;
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index b1ff5ae..5f61514 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -1491,8 +1491,8 @@ static void domain_create_cb(libxl__egc *egc,
int rc, uint32_t domid);
static int do_domain_create(libxl_ctx *ctx, libxl_domain_config *d_config,
- uint32_t *domid,
- int restore_fd, int checkpointed_stream,
+ uint32_t *domid, int restore_fd,
+ int send_fd, int checkpointed_stream,
const libxl_asyncop_how *ao_how,
const libxl_asyncprogress_how *aop_console_how)
{
@@ -1505,6 +1505,7 @@ static int do_domain_create(libxl_ctx *ctx, libxl_domain_config *d_config,
libxl_domain_config_init(&cdcs->dcs.guest_config_saved);
libxl_domain_config_copy(ctx, &cdcs->dcs.guest_config_saved, d_config);
cdcs->dcs.restore_fd = restore_fd;
+ cdcs->dcs.send_fd = send_fd;
cdcs->dcs.callback = domain_create_cb;
cdcs->dcs.checkpointed_stream = checkpointed_stream;
libxl__ao_progress_gethow(&cdcs->dcs.aop_console_how, aop_console_how);
@@ -1533,7 +1534,7 @@ int libxl_domain_create_new(libxl_ctx *ctx, libxl_domain_config *d_config,
const libxl_asyncop_how *ao_how,
const libxl_asyncprogress_how *aop_console_how)
{
- return do_domain_create(ctx, d_config, domid, -1, 0,
+ return do_domain_create(ctx, d_config, domid, -1, -1, 0,
ao_how, aop_console_how);
}
@@ -1543,7 +1544,12 @@ int libxl_domain_create_restore(libxl_ctx *ctx, libxl_domain_config *d_config,
const libxl_asyncop_how *ao_how,
const libxl_asyncprogress_how *aop_console_how)
{
- return do_domain_create(ctx, d_config, domid, restore_fd,
+ int send_fd = -1;
+
+ if (params->checkpointed_stream == LIBXL_CHECKPOINTED_STREAM_COLO)
+ send_fd = params->send_fd;
+
+ return do_domain_create(ctx, d_config, domid, restore_fd, send_fd,
params->checkpointed_stream, ao_how, aop_console_how);
}
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index d4aa209..76d57fb 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2879,6 +2879,7 @@ struct libxl__domain_suspend_state {
uint32_t domid;
int fd;
+ int recv_fd;
libxl_domain_type type;
int live;
int debug;
@@ -3144,6 +3145,7 @@ struct libxl__domain_create_state {
libxl_domain_config *guest_config;
libxl_domain_config guest_config_saved; /* vanilla config */
int restore_fd;
+ int send_fd;
libxl__domain_create_cb *callback;
libxl_asyncprogress_how aop_console_how;
/* private to domain_create */
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index a5890f0..ed567f4 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -194,6 +194,12 @@ libxl_viridian_enlightenment = Enumeration("viridian_enlightenment", [
(2, "time_ref_count"),
])
+libxl_checkpointed_stream = Enumeration("checkpointed_stream", [
+ (0, "NONE"),
+ (1, "REMUS"),
+ (2, "COLO"),
+ ], init_val = 0)
+
#
# Complex libxl types
#
@@ -340,6 +346,7 @@ libxl_domain_create_info = Struct("domain_create_info",[
libxl_domain_restore_params = Struct("domain_restore_params", [
("checkpointed_stream", integer),
+ ("send_fd", integer),
])
libxl_domain_sched_params = Struct("domain_sched_params",[
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index 3c9f146..fea17ea 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -154,6 +154,7 @@ struct domain_create {
const char *extra_config; /* extra config string */
const char *restore_file;
int migrate_fd; /* -1 means none */
+ int send_fd; /* -1 means none */
char **migration_domname_r; /* from malloc */
};
@@ -2309,6 +2310,7 @@ static uint32_t create_domain(struct domain_create *dom_info)
void *config_data = 0;
int config_len = 0;
int restore_fd = -1;
+ int send_fd = -1;
const libxl_asyncprogress_how *autoconnect_console_how;
struct save_file_header hdr;
@@ -2325,6 +2327,7 @@ static uint32_t create_domain(struct domain_create *dom_info)
if (migrate_fd >= 0) {
restore_source = "<incoming migration stream>";
restore_fd = migrate_fd;
+ send_fd = dom_info->send_fd;
} else {
restore_source = restore_file;
restore_fd = open(restore_file, O_RDONLY);
@@ -2498,6 +2501,7 @@ start:
 libxl_domain_restore_params_init(&params);
params.checkpointed_stream = dom_info->checkpointed_stream;
+ params.send_fd = send_fd;
ret = libxl_domain_create_restore(ctx, &d_config,
&domid, restore_fd,
 &params,
@@ -4032,6 +4036,7 @@ static void migrate_receive(int debug, int daemonize, int monitor,
dom_info.monitor = monitor;
dom_info.paused = 1;
dom_info.migrate_fd = recv_fd;
+ dom_info.send_fd = send_fd;
dom_info.migration_domname_r = &migration_domname;
dom_info.checkpointed_stream = remus;
@@ -4202,6 +4207,7 @@ int main_restore(int argc, char **argv)
dom_info.config_file = config_file;
dom_info.restore_file = checkpoint_file;
dom_info.migrate_fd = -1;
+ dom_info.send_fd = -1;
dom_info.vnc = vnc;
dom_info.vncautopass = vncautopass;
dom_info.console_autoconnect = console_autoconnect;
@@ -4641,6 +4647,7 @@ int main_create(int argc, char **argv)
dom_info.config_file = filename;
dom_info.extra_config = extra_config;
dom_info.migrate_fd = -1;
+ dom_info.send_fd = -1;
dom_info.vnc = vnc;
dom_info.vncautopass = vncautopass;
dom_info.console_autoconnect = console_autoconnect;
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 07/18] secondary vm suspend/resume/checkpoint code
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (5 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 06/18] Allow slave sends data to master Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 08/18] primary vm suspend/get_dirty_pfn/resume/checkpoint code Wen Congyang
` (12 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Yang Hongyang, Lai Jiangshan
The secondary vm runs in COLO mode, so we repeat
the following steps for every checkpoint:
1. Resume secondary vm
a. Send LIBXL_COLO_SVM_READY to master
b. If it is resumed the first time, call libxl__xc_domain_restore_done()
to build the secondary vm. We should also enable secondary vm's logdirty.
Otherwise, call libxl__domain_resume() to resume secondary vm.
c. Send LIBXL_COLO_SVM_RESUMED to master
2. Wait for a new checkpoint
a. Read LIBXL_COLO_NEW_CHECKPOINT from master
3. Suspend secondary vm
a. Suspend secondary vm
b. Get secondary vm's dirty page information
c. Send LIBXL_COLO_SVM_SUSPENDED to master
d. Send secondary vm's dirty page information to master(count + pfn list)
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
tools/libxc/include/xenguest.h | 20 +
tools/libxl/Makefile | 1 +
tools/libxl/libxl_colo.h | 38 ++
tools/libxl/libxl_colo_restore.c | 896 +++++++++++++++++++++++++++++++++++++
tools/libxl/libxl_create.c | 116 ++++-
tools/libxl/libxl_dom.c | 2 +-
tools/libxl/libxl_internal.h | 22 +
tools/libxl/libxl_save_callout.c | 6 +-
tools/libxl/libxl_save_msgs_gen.pl | 6 +-
9 files changed, 1100 insertions(+), 7 deletions(-)
create mode 100644 tools/libxl/libxl_colo.h
create mode 100644 tools/libxl/libxl_colo_restore.c
diff --git a/tools/libxc/include/xenguest.h b/tools/libxc/include/xenguest.h
index 40bbac8..d3061c7 100644
--- a/tools/libxc/include/xenguest.h
+++ b/tools/libxc/include/xenguest.h
@@ -91,6 +91,26 @@ int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t max_iter
/* callbacks provided by xc_domain_restore */
struct restore_callbacks {
+ /* Called after a new checkpoint to suspend the guest.
+ */
+ int (*suspend)(void* data);
+
+ /* Called after the secondary vm is ready to resume.
+ * Callback function resumes the guest & the device model,
+ * returns to xc_domain_restore.
+ */
+ int (*postcopy)(void* data);
+
+ /* callback to wait for a new checkpoint
+ *
+ * returns:
+ * 0: terminate checkpointing gracefully
+ * 1: take another checkpoint */
+ int (*checkpoint)(void* data);
+
+ /* Enable qemu-dm logging dirty pages to xen */
+ int (*switch_qemu_logdirty)(int domid, unsigned enable, void *data); /* HVM only */
+
/* callback to restore toolstack specific data */
int (*toolstack_restore)(uint32_t domid, const uint8_t *buf,
uint32_t size, void* data);
diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile
index c970e7e..9cfff46 100644
--- a/tools/libxl/Makefile
+++ b/tools/libxl/Makefile
@@ -57,6 +57,7 @@ LIBXL_OBJS-y += libxl_nonetbuffer.o
endif
LIBXL_OBJS-y += libxl_remus.o libxl_checkpoint_device.o libxl_remus_disk_drbd.o
+LIBXL_OBJS-y += libxl_colo_restore.o
LIBXL_OBJS-$(CONFIG_X86) += libxl_cpuid.o libxl_x86.o libxl_psr.o
LIBXL_OBJS-$(CONFIG_ARM) += libxl_nocpuid.o libxl_arm.o
diff --git a/tools/libxl/libxl_colo.h b/tools/libxl/libxl_colo.h
new file mode 100644
index 0000000..91df275
--- /dev/null
+++ b/tools/libxl/libxl_colo.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2014 FUJITSU LIMITED
+ * Author: Wen Congyang <wency@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#ifndef LIBXL_COLO_H
+#define LIBXL_COLO_H
+
+/*
+ * values to control suspend/resume primary vm and secondary vm
+ * at the same time
+ */
+enum {
+ LIBXL_COLO_NEW_CHECKPOINT = 1,
+ LIBXL_COLO_SVM_SUSPENDED,
+ LIBXL_COLO_SVM_READY,
+ LIBXL_COLO_SVM_RESUMED,
+};
+
+extern void libxl__colo_restore_done(libxl__egc *egc, void *dcs_void,
+ int ret, int retval, int errnoval);
+extern void libxl__colo_restore_setup(libxl__egc *egc,
+ libxl__colo_restore_state *crs);
+extern void libxl__colo_restore_teardown(libxl__egc *egc,
+ libxl__colo_restore_state *crs,
+ int rc);
+
+#endif
diff --git a/tools/libxl/libxl_colo_restore.c b/tools/libxl/libxl_colo_restore.c
new file mode 100644
index 0000000..a61caa0
--- /dev/null
+++ b/tools/libxl/libxl_colo_restore.c
@@ -0,0 +1,896 @@
+/*
+ * Copyright (C) 2014 FUJITSU LIMITED
+ * Author: Wen Congyang <wency@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#include "libxl_internal.h"
+#include "libxl_colo.h"
+#include "../libxc/xg_private.h"
+#include "../libxc/xc_bitops.h"
+
+enum {
+ LIBXL_COLO_SETUPED,
+ LIBXL_COLO_SUSPENDED,
+ LIBXL_COLO_RESUMED,
+};
+
+typedef struct libxl__colo_restore_checkpoint_state libxl__colo_restore_checkpoint_state;
+struct libxl__colo_restore_checkpoint_state {
+ xc_hypercall_buffer_t _dirty_bitmap;
+ xc_hypercall_buffer_t *dirty_bitmap;
+ unsigned long p2m_size;
+ libxl__domain_suspend_state2 dss2;
+ /* for sending data to master */
+ libxl__datacopier_state dc;
+ /* for reading data from master */
+ libxl__datareader_state drs;
+ uint8_t section;
+ libxl__logdirty_switch lds;
+ libxl__colo_restore_state *crs;
+ int status;
+
+ void (*callback)(libxl__egc *,
+ libxl__colo_restore_checkpoint_state *,
+ int);
+
+ /*
+ * 0: secondary vm's dirty bitmap for domain @domid
+ * 1: secondary vm is ready(domain @domid)
+ * 2: secondary vm is resumed(domain @domid)
+ */
+ const char *copywhat[3];
+};
+
+
+static void libxl__colo_restore_domain_resume_callback(void *data);
+static void libxl__colo_restore_domain_checkpoint_callback(void *data);
+static void libxl__colo_restore_domain_suspend_callback(void *data);
+
+/* ===================== colo: common functions ===================== */
+static void colo_enable_logdirty(libxl__colo_restore_state *crs, libxl__egc *egc)
+{
+ libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+ libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+
+ /* Convenience aliases */
+ const uint32_t domid = crs->domid;
+ libxl__logdirty_switch *const lds = &crcs->lds;
+
+ STATE_AO_GC(crs->ao);
+
+ /* we need to know which pages are dirty to restore the guest */
+ if (xc_shadow_control(CTX->xch, domid,
+ XEN_DOMCTL_SHADOW_OP_ENABLE_LOGDIRTY,
+ NULL, 0, NULL, 0, NULL) < 0) {
+ LOG(ERROR, "cannot enable secondary vm's logdirty");
+ lds->callback(egc, lds, ERROR_FAIL);
+ return;
+ }
+
+ if (crs->hvm) {
+ libxl__domain_common_switch_qemu_logdirty(domid, 1, lds, egc);
+ return;
+ }
+
+ lds->callback(egc, lds, 0);
+}
+
+static void colo_disable_logdirty(libxl__colo_restore_state *crs,
+ libxl__egc *egc)
+{
+ libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+ libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+
+ /* Convenience aliases */
+ const uint32_t domid = crs->domid;
+ libxl__logdirty_switch *const lds = &crcs->lds;
+
+ STATE_AO_GC(crs->ao);
+
+ /* logdirty is no longer needed; a failure to turn it off is not fatal */
+ if (xc_shadow_control(CTX->xch, domid, XEN_DOMCTL_SHADOW_OP_OFF,
+ NULL, 0, NULL, 0, NULL) < 0)
+ LOG(WARN, "cannot disable secondary vm's logdirty");
+
+ if (crs->hvm) {
+ libxl__domain_common_switch_qemu_logdirty(domid, 0, lds, egc);
+ return;
+ }
+
+ lds->callback(egc, lds, 0);
+}
+
+static void colo_resume_vm(libxl__egc *egc,
+ libxl__colo_restore_checkpoint_state *crcs,
+ int restore_device_model)
+{
+ libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
+ int rc;
+
+ /* Convenience aliases */
+ libxl__colo_restore_state *const crs = crcs->crs;
+
+ STATE_AO_GC(crs->ao);
+
+ if (!crs->saved_cb) {
+ /* TODO: sync mmu for hvm? */
+ if (restore_device_model) {
+ rc = libxl__domain_restore(gc, crs->domid);
+ if (rc) {
+ LOG(ERROR, "cannot restore device model for secondary vm");
+ crcs->callback(egc, crcs, rc);
+ return;
+ }
+ }
+ rc = libxl__domain_resume(gc, crs->domid, 0);
+ if (rc)
+ LOG(ERROR, "cannot resume secondary vm");
+
+ crcs->callback(egc, crcs, rc);
+ return;
+ }
+
+ /*
+ * TODO: get store mfn and console mfn
+ * We should call the callback restore_results in
+ * xc_domain_restore() before resuming the guest.
+ */
+ libxl__xc_domain_restore_done(egc, dcs, 0, 0, 0);
+
+ return;
+}
+
+
+/* ================ colo: setup restore environment ================ */
+static void libxl__colo_domain_create_cb(libxl__egc *egc,
+ libxl__domain_create_state *dcs,
+ int rc, uint32_t domid);
+
+static int init_dss2(libxl__domain_suspend_state2 *dss2)
+{
+ int rc = ERROR_FAIL;
+ libxl_domain_type type;
+
+ STATE_AO_GC(dss2->ao);
+
+ type = libxl__domain_type(gc, dss2->domid);
+ if (type == LIBXL_DOMAIN_TYPE_INVALID)
+ goto out;
+
+ libxl__xswait_init(&dss2->pvcontrol);
+ libxl__ev_evtchn_init(&dss2->guest_evtchn);
+ libxl__ev_xswatch_init(&dss2->guest_watch);
+ libxl__ev_time_init(&dss2->guest_timeout);
+
+ if (type == LIBXL_DOMAIN_TYPE_HVM)
+ dss2->hvm = 1;
+ else
+ dss2->hvm = 0;
+
+ dss2->guest_evtchn.port = -1;
+ dss2->guest_evtchn_lockfd = -1;
+ dss2->guest_responded = 0;
+ dss2->dm_savefile = libxl__device_model_savefile(gc, dss2->domid);
+ dss2->save_dm = 0;
+
+ /* Secondary vm is not created, so we cannot get evtchn port */
+
+ rc = 0;
+
+out:
+ return rc;
+}
+
+void libxl__colo_restore_setup(libxl__egc *egc,
+ libxl__colo_restore_state *crs)
+{
+ libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+ libxl__colo_restore_checkpoint_state *crcs;
+ DECLARE_HYPERCALL_BUFFER(unsigned long, dirty_bitmap);
+ int rc = ERROR_FAIL;
+ int bsize;
+
+ /* Convenience aliases */
+ libxl__srm_restore_autogen_callbacks *const callbacks =
+ &dcs->shs.callbacks.restore.a;
+ const int domid = crs->domid;
+
+ STATE_AO_GC(crs->ao);
+
+ GCNEW(crcs);
+ crs->crcs = crcs;
+ crcs->crs = crs;
+
+ crcs->p2m_size = xc_domain_maximum_gpfn(CTX->xch, domid) + 1;
+
+ crcs->copywhat[0] = GCSPRINTF("secondary vm's dirty bitmap for domain %"PRIu32,
+ domid);
+ crcs->copywhat[1] = GCSPRINTF("secondary vm is ready(domain %"PRIu32")",
+ domid);
+ crcs->copywhat[2] = GCSPRINTF("secondary vm is resumed(domain %"PRIu32")",
+ domid);
+
+ bsize = bitmap_size(crcs->p2m_size);
+ dirty_bitmap = xc_hypercall_buffer_alloc_pages(CTX->xch, dirty_bitmap,
+ NRPAGES(bsize));
+ if (!dirty_bitmap) {
+ rc = ERROR_NOMEM;
+ goto err;
+ }
+ memset(dirty_bitmap, 0, bsize);
+ crcs->_dirty_bitmap = *HYPERCALL_BUFFER(dirty_bitmap);
+ crcs->dirty_bitmap = &crcs->_dirty_bitmap;
+
+ /* setup dss2 */
+ crcs->dss2.ao = ao;
+ crcs->dss2.domid = domid;
+ if (init_dss2(&crcs->dss2))
+ goto err_init_dss2;
+
+ callbacks->suspend = libxl__colo_restore_domain_suspend_callback;
+ callbacks->postcopy = libxl__colo_restore_domain_resume_callback;
+ callbacks->checkpoint = libxl__colo_restore_domain_checkpoint_callback;
+
+ /*
+ * Secondary vm is running in colo mode, so we need to call
+ * libxl__xc_domain_restore_done() to create secondary vm.
+ * But we will exit in domain_create_cb(). So replace the
+ * callback here.
+ */
+ crs->saved_cb = dcs->callback;
+ dcs->callback = libxl__colo_domain_create_cb;
+ crcs->status = LIBXL_COLO_SETUPED;
+
+ logdirty_init(&crcs->lds);
+ crcs->lds.ao = ao;
+
+ rc = 0;
+
+out:
+ crs->callback(egc, crs, rc);
+ return;
+
+err_init_dss2:
+ xc_hypercall_buffer_free_pages(CTX->xch, dirty_bitmap, NRPAGES(bsize));
+ crcs->dirty_bitmap = NULL;
+err:
+ goto out;
+}
+
+static void libxl__colo_domain_create_cb(libxl__egc *egc,
+ libxl__domain_create_state *dcs,
+ int rc, uint32_t domid)
+{
+ libxl__colo_restore_checkpoint_state *crcs = dcs->crs.crcs;
+
+ crcs->callback(egc, crcs, rc);
+}
+
+
+/* ================ colo: teardown restore environment ================ */
+static void do_failover_done(libxl__egc *egc,
+ libxl__colo_restore_checkpoint_state* crcs,
+ int rc);
+static void colo_disable_logdirty_done(libxl__egc *egc,
+ libxl__logdirty_switch *lds,
+ int rc);
+
+static void do_failover(libxl__egc *egc, libxl__colo_restore_state *crs)
+{
+ libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+
+ /* Convenience aliases */
+ const int status = crcs->status;
+ libxl__logdirty_switch *const lds = &crcs->lds;
+
+ STATE_AO_GC(crs->ao);
+
+ switch(status) {
+ case LIBXL_COLO_SETUPED:
+ /* We don't enable logdirty now */
+ colo_resume_vm(egc, crcs, 0);
+ return;
+ case LIBXL_COLO_SUSPENDED:
+ case LIBXL_COLO_RESUMED:
+ /* disable logdirty first */
+ lds->callback = colo_disable_logdirty_done;
+ colo_disable_logdirty(crs, egc);
+ return;
+ default:
+ LOG(ERROR, "invalid status: %d", status);
+ crcs->callback(egc, crcs, ERROR_FAIL);
+ }
+}
+
+void libxl__colo_restore_teardown(libxl__egc *egc,
+ libxl__colo_restore_state *crs,
+ int rc)
+{
+ libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+ DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, crcs->dirty_bitmap);
+ int bsize = bitmap_size(crcs->p2m_size);
+ libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+
+ EGC_GC;
+
+ if (!dirty_bitmap)
+ goto do_failover;
+
+ xc_hypercall_buffer_free_pages(CTX->xch, dirty_bitmap, NRPAGES(bsize));
+
+do_failover:
+ if (!rc) {
+ crcs->callback = do_failover_done;
+ do_failover(egc, crs);
+ return;
+ }
+
+ if (crs->saved_cb) {
+ dcs->callback = crs->saved_cb;
+ crs->saved_cb = NULL;
+ }
+ crs->callback(egc, crs, rc);
+}
+
+static void do_failover_done(libxl__egc *egc,
+ libxl__colo_restore_checkpoint_state* crcs,
+ int rc)
+{
+ libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
+
+ /* Convenience aliases */
+ libxl__colo_restore_state *const crs = crcs->crs;
+
+ STATE_AO_GC(crs->ao);
+
+ if (rc)
+ LOG(ERROR, "cannot do failover");
+
+ if (crs->saved_cb) {
+ dcs->callback = crs->saved_cb;
+ crs->saved_cb = NULL;
+ }
+
+ crs->callback(egc, crs, rc);
+}
+
+static void colo_disable_logdirty_done(libxl__egc *egc,
+ libxl__logdirty_switch *lds,
+ int rc)
+{
+ libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(lds, *crcs, lds);
+
+ STATE_AO_GC(lds->ao);
+
+ if (rc)
+ LOG(WARN, "cannot disable logdirty");
+
+ if (crcs->status == LIBXL_COLO_SUSPENDED) {
+ /*
+ * failover when reading state from master, so no need to
+ * call libxl__domain_restore().
+ */
+ colo_resume_vm(egc, crcs, 0);
+ return;
+ }
+
+ /* If we cannot disable logdirty, we still can do failover */
+ crcs->callback(egc, crcs, 0);
+}
+
+/*
+ * checkpoint callbacks are called in the following order:
+ * 1. resume
+ * 2. checkpoint
+ * 3. suspend
+ */
+static void colo_common_send_data_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int onwrite, int errnoval);
+/* ===================== colo: resume secondary vm ===================== */
+/*
+ * Do the following things when resuming secondary vm:
+ * 1. write LIBXL_COLO_SVM_READY
+ * 2. resume secondary vm
+ * 3. write LIBXL_COLO_SVM_RESUMED
+ */
+static void colo_send_svm_ready_done(libxl__egc *egc,
+ libxl__colo_restore_checkpoint_state *crcs,
+ int rc);
+static void colo_resume_vm_done(libxl__egc *egc,
+ libxl__colo_restore_checkpoint_state *crcs,
+ int rc);
+static void colo_write_svm_resumed(libxl__egc *egc,
+ libxl__colo_restore_checkpoint_state *crcs);
+static void colo_enable_logdirty_done(libxl__egc *egc,
+ libxl__logdirty_switch *lds,
+ int retval);
+static void colo_reenable_logdirty(libxl__egc *egc,
+ libxl__logdirty_switch *lds,
+ int rc);
+static void colo_reenable_logdirty_done(libxl__egc *egc,
+ libxl__logdirty_switch *lds,
+ int rc);
+
+/*
+ * Save-helper "resume" callback on the secondary side.
+ *
+ * Starts a datacopier on crs->send_fd and queues the one-byte
+ * LIBXL_COLO_SVM_READY marker.  colo_common_send_data_done() then
+ * invokes colo_send_svm_ready_done() through crcs->callback.  If the
+ * datacopier cannot be started, the helper callback is completed
+ * with 0.
+ */
+static void libxl__colo_restore_domain_resume_callback(void *data)
+{
+    libxl__save_helper_state *shs = data;
+    libxl__domain_create_state *dcs = CONTAINER_OF(shs, *dcs, shs);
+    libxl__colo_restore_checkpoint_state *crcs = dcs->crs.crcs;
+    uint8_t section = LIBXL_COLO_SVM_READY;
+    int rc;
+
+    /* Convenience aliases */
+    libxl__colo_restore_state *const crs = &dcs->crs;
+    const int send_fd = crs->send_fd;
+    libxl__datacopier_state *const dc = &crcs->dc;
+
+    STATE_AO_GC(crs->ao);
+
+    memset(dc, 0, sizeof(*dc));
+    dc->ao = ao;
+    dc->readfd = -1;
+    dc->writefd = send_fd;
+    dc->maxsz = INT_MAX;
+    dc->copywhat = crcs->copywhat[1];
+    dc->writewhat = "colo stream";
+    dc->callback = colo_common_send_data_done;
+    crcs->callback = colo_send_svm_ready_done;
+
+    rc = libxl__datacopier_start(dc);
+    if (rc) {
+        LOG(ERROR, "libxl__datacopier_start() fails");
+        goto out;
+    }
+
+    /* tell master that secondary vm is ready */
+    libxl__datacopier_prefixdata(shs->egc, dc, &section, sizeof(section));
+
+    return;
+
+out:
+    libxl__xc_domain_saverestore_async_callback_done(shs->egc, shs, 0);
+}
+
+/*
+ * LIBXL_COLO_SVM_READY has been sent: resume the secondary vm next.
+ * The rc argument is not examined here.
+ */
+static void colo_send_svm_ready_done(libxl__egc *egc,
+                                     libxl__colo_restore_checkpoint_state *crcs,
+                                     int rc)
+{
+    crcs->callback = colo_resume_vm_done;
+    colo_resume_vm(egc, crcs, 1);
+}
+
+/*
+ * Called when colo_resume_vm() has finished.
+ *
+ * On failure the helper callback is completed with 0.  On success the
+ * status becomes LIBXL_COLO_RESUMED.  If a domain-create callback is
+ * still stashed, it is restored and log-dirty is enabled before
+ * continuing; otherwise LIBXL_COLO_SVM_RESUMED is written immediately.
+ */
+static void colo_resume_vm_done(libxl__egc *egc,
+                                libxl__colo_restore_checkpoint_state *crcs,
+                                int rc)
+{
+    libxl__colo_restore_state *const crs = crcs->crs;
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+
+    STATE_AO_GC(crs->ao);
+
+    if (rc) {
+        LOG(ERROR, "cannot resume secondary vm");
+        libxl__xc_domain_saverestore_async_callback_done(egc, &dcs->shs, 0);
+        return;
+    }
+
+    crcs->status = LIBXL_COLO_RESUMED;
+
+    if (!crs->saved_cb) {
+        colo_write_svm_resumed(egc, crcs);
+        return;
+    }
+
+    /* avoid calling libxl__xc_domain_restore_done() more than once */
+    dcs->callback = crs->saved_cb;
+    crs->saved_cb = NULL;
+
+    crcs->lds.callback = colo_enable_logdirty_done;
+    colo_enable_logdirty(crs, egc);
+}
+
+/*
+ * Tell the master that the secondary vm has been resumed.
+ *
+ * Starts a datacopier on crs->send_fd and queues the one-byte
+ * LIBXL_COLO_SVM_RESUMED marker.  crcs->callback is cleared, so
+ * colo_common_send_data_done() completes the helper callback itself
+ * once the data has been sent.  If the datacopier cannot be started,
+ * the helper callback is completed with 0.
+ */
+static void colo_write_svm_resumed(libxl__egc *egc,
+                                   libxl__colo_restore_checkpoint_state *crcs)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
+    uint8_t section = LIBXL_COLO_SVM_RESUMED;
+    int rc;
+
+    /* Convenience aliases */
+    libxl__colo_restore_state *const crs = crcs->crs;
+    const int send_fd = crs->send_fd;
+    libxl__datacopier_state *const dc = &crcs->dc;
+    libxl__save_helper_state *const shs = &dcs->shs;
+
+    STATE_AO_GC(crs->ao);
+
+    memset(dc, 0, sizeof(*dc));
+    dc->ao = ao;
+    dc->readfd = -1;
+    dc->writefd = send_fd;
+    dc->maxsz = INT_MAX;
+    dc->copywhat = crcs->copywhat[2];
+    dc->writewhat = "colo stream";
+    dc->callback = colo_common_send_data_done;
+    /* TODO: configure network */
+    crcs->callback = NULL;
+
+    rc = libxl__datacopier_start(dc);
+    if (rc) {
+        LOG(ERROR, "libxl__datacopier_start() fails");
+        goto out;
+    }
+
+    /* tell master that secondary vm is resumed */
+    libxl__datacopier_prefixdata(egc, dc, &section, sizeof(section));
+
+    return;
+
+out:
+    libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+}
+
+/*
+ * Runs after the first attempt to enable log-dirty on the secondary vm.
+ *
+ * On failure, log-dirty is disabled and then re-enabled through
+ * colo_reenable_logdirty().  On success the vm is unpaused and
+ * LIBXL_COLO_SVM_RESUMED is written.
+ */
+static void colo_enable_logdirty_done(libxl__egc *egc,
+                                      libxl__logdirty_switch *lds,
+                                      int rc)
+{
+    libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(lds, *crcs, lds);
+    libxl__colo_restore_state *const crs = crcs->crs;
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+
+    STATE_AO_GC(crs->ao);
+
+    if (rc) {
+        /*
+         * log-dirty already enabled? There's no test op,
+         * so attempt to disable then reenable it
+         */
+        lds->callback = colo_reenable_logdirty;
+        colo_disable_logdirty(crs, egc);
+        return;
+    }
+
+    /* We have enabled secondary vm's logdirty, so we can unpause it now */
+    if (libxl__domain_unpause(gc, crs->domid)) {
+        LOG(ERROR, "cannot unpause secondary vm");
+        libxl__xc_domain_saverestore_async_callback_done(egc, &dcs->shs, 0);
+        return;
+    }
+
+    colo_write_svm_resumed(egc, crcs);
+}
+
+/*
+ * Runs after log-dirty was disabled as part of the disable/re-enable
+ * fallback.  A non-zero rc completes the helper callback with 0;
+ * otherwise enabling is attempted again.
+ */
+static void colo_reenable_logdirty(libxl__egc *egc,
+                                   libxl__logdirty_switch *lds,
+                                   int rc)
+{
+    libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(lds, *crcs, lds);
+    libxl__colo_restore_state *const crs = crcs->crs;
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+
+    STATE_AO_GC(crs->ao);
+
+    if (rc) {
+        LOG(ERROR, "cannot enable logdirty");
+        libxl__xc_domain_saverestore_async_callback_done(egc, &dcs->shs, 0);
+        return;
+    }
+
+    lds->callback = colo_reenable_logdirty_done;
+    colo_enable_logdirty(crs, egc);
+}
+
+/*
+ * Runs after the second attempt to enable log-dirty.  On success the
+ * secondary vm is unpaused and LIBXL_COLO_SVM_RESUMED is written; any
+ * failure completes the helper callback with 0.
+ */
+static void colo_reenable_logdirty_done(libxl__egc *egc,
+                                        libxl__logdirty_switch *lds,
+                                        int rc)
+{
+    libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(lds, *crcs, lds);
+    libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
+    libxl__save_helper_state *const shs = &dcs->shs;
+
+    STATE_AO_GC(crcs->crs->ao);
+
+    if (rc) {
+        LOG(ERROR, "cannot enable logdirty");
+        libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+        return;
+    }
+
+    /* We have enabled secondary vm's logdirty, so we can unpause it now */
+    if (libxl__domain_unpause(gc, crcs->crs->domid)) {
+        LOG(ERROR, "cannot unpause secondary vm");
+        libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+        return;
+    }
+
+    colo_write_svm_resumed(egc, crcs);
+}
+
+
+/* ===================== colo: wait new checkpoint ===================== */
+static void colo_stream_read_done(libxl__egc *egc,
+ libxl__datareader_state *drs,
+ ssize_t real_size, int errnoval);
+
+/*
+ * Save-helper "checkpoint" callback on the secondary side.
+ *
+ * Arms a datareader that reads sizeof(crcs->section) bytes from
+ * recv_fd into crcs->section; colo_stream_read_done() checks the
+ * result.  If the reader cannot be started, the helper callback is
+ * completed with 0.
+ */
+static void libxl__colo_restore_domain_checkpoint_callback(void *data)
+{
+    libxl__save_helper_state *shs = data;
+    libxl__domain_create_state *dcs = CONTAINER_OF(shs, *dcs, shs);
+    libxl__colo_restore_checkpoint_state *crcs = dcs->crs.crcs;
+    libxl__datareader_state *const reader = &crcs->drs;
+
+    STATE_AO_GC(dcs->crs.ao);
+
+    memset(reader, 0, sizeof(*reader));
+    reader->ao = ao;
+    reader->readfd = dcs->crs.recv_fd;
+    reader->readsize = sizeof(crcs->section);
+    reader->readwhat = "colo stream";
+    reader->callback = colo_stream_read_done;
+    reader->buf = &crcs->section;
+
+    if (!libxl__datareader_start(reader))
+        return;
+
+    LOG(ERROR, "libxl__datareader_start() fails");
+    libxl__xc_domain_saverestore_async_callback_done(shs->egc, shs, 0);
+}
+
+/*
+ * Datareader completion for the checkpoint wait.  The helper callback
+ * is completed with 1 only when the full section was read and it is
+ * LIBXL_COLO_NEW_CHECKPOINT; otherwise with 0.
+ */
+static void colo_stream_read_done(libxl__egc *egc,
+                                  libxl__datareader_state *drs,
+                                  ssize_t real_size, int errnoval)
+{
+    libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(drs, *crcs, drs);
+    libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
+    int ok = 1;
+
+    STATE_AO_GC(drs->ao);
+
+    if (real_size < drs->readsize) {
+        LOG(ERROR, "reading data fails: %lld", (long long)real_size);
+        ok = 0;
+    } else if (crcs->section != LIBXL_COLO_NEW_CHECKPOINT) {
+        LOG(ERROR, "invalid section: %d", crcs->section);
+        ok = 0;
+    }
+
+    libxl__xc_domain_saverestore_async_callback_done(egc, &dcs->shs, ok);
+}
+
+
+/* ===================== colo: suspend secondary vm ===================== */
+/*
+ * Do the following things when suspending secondary vm:
+ * 1. suspend secondary vm
+ * 2. get secondary vm's dirty page information
+ * 3. send LIBXL_COLO_SVM_SUSPENDED
+ * 4. send secondary vm's dirty page information(count + pfn list)
+ */
+static void colo_suspend_vm_done(libxl__egc *egc,
+ libxl__domain_suspend_state2 *dss2,
+ int ok);
+static void colo_append_pfn_type(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ unsigned long *dirty_bitmap,
+ unsigned long p2m_size);
+
+/*
+ * Save-helper "suspend" callback on the secondary side: start
+ * suspending the secondary vm, with colo_suspend_vm_done() as the
+ * completion handler.
+ */
+static void libxl__colo_restore_domain_suspend_callback(void *data)
+{
+    libxl__save_helper_state *shs = data;
+    libxl__domain_create_state *dcs = CONTAINER_OF(shs, *dcs, shs);
+    libxl__colo_restore_checkpoint_state *crcs = dcs->crs.crcs;
+
+    STATE_AO_GC(dcs->ao);
+
+    /* suspend secondary vm */
+    crcs->dss2.callback_common_done = colo_suspend_vm_done;
+    libxl__domain_suspend2(shs->egc, &crcs->dss2);
+}
+
+/*
+ * Runs when the secondary vm suspend has completed.
+ *
+ * On success: mark the state LIBXL_COLO_SUSPENDED, fetch and clear the
+ * secondary vm's dirty bitmap, count the dirty pages, then queue on
+ * send_fd: the LIBXL_COLO_SVM_SUSPENDED byte, the 64-bit count, and
+ * the list of dirty pfns.  crcs->callback is cleared, so
+ * colo_common_send_data_done() completes the helper callback itself.
+ *
+ * Any failure completes the helper callback with 0.
+ */
+static void colo_suspend_vm_done(libxl__egc *egc,
+                                 libxl__domain_suspend_state2 *dss2,
+                                 int ok)
+{
+    libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(dss2, *crcs, dss2);
+    libxl__colo_restore_state *crs = crcs->crs;
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    DECLARE_HYPERCALL_BUFFER_SHADOW(unsigned long, dirty_bitmap, crcs->dirty_bitmap);
+    uint8_t section = LIBXL_COLO_SVM_SUSPENDED;
+    unsigned long i;
+    int rc;
+    uint64_t count;
+
+    /* Convenience aliases */
+    const int send_fd = crs->send_fd;
+    const unsigned long p2m_size = crcs->p2m_size;
+    const uint32_t domid = crs->domid;
+    libxl__datacopier_state *const dc = &crcs->dc;
+
+    STATE_AO_GC(crs->ao);
+
+    if (!ok) {
+        LOG(ERROR, "cannot suspend secondary vm");
+        goto out;
+    }
+
+    crcs->status = LIBXL_COLO_SUSPENDED;
+
+    /*
+     * Secondary vm is running, so there are some dirty pages
+     * that are non-dirty in master. Get dirty bitmap and
+     * send it to master.
+     */
+    if (xc_shadow_control(CTX->xch, domid, XEN_DOMCTL_SHADOW_OP_CLEAN,
+                          HYPERCALL_BUFFER(dirty_bitmap), p2m_size,
+                          NULL, 0, NULL) != p2m_size) {
+        LOG(ERROR, "getting secondary vm's dirty bitmap fails");
+        goto out;
+    }
+
+    /* The index has the same type as p2m_size: no signed/unsigned compare. */
+    count = 0;
+    for (i = 0; i < p2m_size; i++) {
+        if (test_bit(i, dirty_bitmap))
+            count++;
+    }
+
+    memset(dc, 0, sizeof(*dc));
+    dc->ao = ao;
+    dc->readfd = -1;
+    dc->writefd = send_fd;
+    dc->maxsz = INT_MAX;
+    dc->copywhat = crcs->copywhat[0];
+    dc->writewhat = "colo stream";
+    dc->callback = colo_common_send_data_done;
+    crcs->callback = NULL;
+
+    rc = libxl__datacopier_start(dc);
+    if (rc) {
+        LOG(ERROR, "libxl__datacopier_start() fails");
+        goto out;
+    }
+
+    /* tell master that secondary vm is suspended */
+    libxl__datacopier_prefixdata(egc, dc, &section, sizeof(section));
+
+    /* send dirty pages to master */
+    libxl__datacopier_prefixdata(egc, dc, &count, sizeof(count));
+    colo_append_pfn_type(egc, dc, dirty_bitmap, p2m_size);
+    return;
+
+out:
+    ok = 0;
+    libxl__xc_domain_saverestore_async_callback_done(egc, &dcs->shs, ok);
+}
+
+/*
+ * Queue the pfn of every set bit in dirty_bitmap (bits 0..p2m_size-1)
+ * on the datacopier as 64-bit values.
+ *
+ * The pfns are gathered into a scratch array sized to one datacopier
+ * buffer and handed to libxl__datacopier_prefixdata() each time the
+ * array fills, with one final partial batch at the end.
+ */
+static void colo_append_pfn_type(libxl__egc *egc,
+                                 libxl__datacopier_state *dc,
+                                 unsigned long *dirty_bitmap,
+                                 unsigned long p2m_size)
+{
+    /* Same type as p2m_size, so the loop bound compare is not mixed-sign. */
+    unsigned long i;
+    int count;
+    /* Hack, buf->buf is private member... */
+    libxl__datacopier_buf *buf = NULL;
+    int max_batch = sizeof(buf->buf) / sizeof(uint64_t);
+    int buf_size = max_batch * sizeof(uint64_t);
+    uint64_t *pfn;
+
+    STATE_AO_GC(dc->ao);
+
+    pfn = libxl__zalloc(NOGC, buf_size);
+
+    count = 0;
+    for (i = 0; i < p2m_size; i++) {
+        if (!test_bit(i, dirty_bitmap))
+            continue;
+
+        pfn[count++] = i;
+        if (count == max_batch) {
+            /* Scratch array is full: queue it and start a new batch. */
+            libxl__datacopier_prefixdata(egc, dc, pfn, buf_size);
+            count = 0;
+        }
+    }
+
+    if (count)
+        libxl__datacopier_prefixdata(egc, dc, pfn, count * sizeof(uint64_t));
+
+    /*
+     * NOTE(review): freeing here relies on libxl__datacopier_prefixdata()
+     * having copied the data rather than keeping the pointer - confirm.
+     */
+    free(pfn);
+}
+
+
+/* ===================== colo: common callback ===================== */
+/*
+ * Shared datacopier completion handler for everything sent on the
+ * colo stream.
+ *
+ *  - onwrite == -1:          complete the helper callback with 0
+ *  - errnoval != 0:          complete the helper callback with 2
+ *  - no crcs->callback set:  complete the helper callback with 1
+ *  - otherwise:              continue with crcs->callback(egc, crcs, 0)
+ */
+static void colo_common_send_data_done(libxl__egc *egc,
+                                       libxl__datacopier_state *dc,
+                                       int onwrite, int errnoval)
+{
+    libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(dc, *crcs, dc);
+    libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
+    int ok;
+    STATE_AO_GC(dc->ao);
+
+    if (onwrite == -1) {
+        LOG(ERROR, "sending data fails");
+        ok = 0;
+    } else if (errnoval) {
+        /* failure happens when reading/writing, do failover? */
+        ok = 2;
+    } else if (!crcs->callback) {
+        /* Everything is OK */
+        ok = 1;
+    } else {
+        crcs->callback(egc, crcs, 0);
+        return;
+    }
+
+    libxl__xc_domain_saverestore_async_callback_done(egc, &dcs->shs, ok);
+}
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 5f61514..985c7cd 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -19,6 +19,7 @@
#include "libxl_internal.h"
#include "libxl_arch.h"
+#include "libxl_colo.h"
#include <xc_dom.h>
#include <xenguest.h>
@@ -937,6 +938,96 @@ static void domcreate_console_available(libxl__egc *egc,
dcs->aop_console_how.for_event));
}
+/*
+ * Runs when the colo restore teardown has finished.  The secondary vm
+ * is destroyed only when rc is non-zero and shs->retval is zero; then
+ * the domain-create callback is invoked with rc.
+ */
+static void libxl__colo_restore_teardown_done(libxl__egc *egc,
+                                              libxl__colo_restore_state *crs,
+                                              int rc)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    STATE_AO_GC(crs->ao);
+
+    /* convenience aliases */
+    libxl__save_helper_state *const shs = &dcs->shs;
+    const int domid = crs->domid;
+    const libxl_ctx *const ctx = libxl__gc_owner(gc);
+    xc_interface *const xch = ctx->xch;
+
+    /*
+     * rc == 0 means failover, so the secondary vm must be kept.
+     *
+     * shs->retval stores the return value of xc_domain_restore().
+     * If it is not 0, we have destroyed the secondary vm in
+     * xc_domain_restore();
+     */
+    if (rc && !shs->retval)
+        xc_domain_destroy(xch, domid);
+
+    dcs->callback(egc, dcs, rc, crs->domid);
+}
+
+/*
+ * Completion callback of the restore helper in COLO mode.  Maps
+ * (ret, retval) to rc (0 only if both are 0), logs which case it is,
+ * and starts the teardown with libxl__colo_restore_teardown_done() as
+ * its completion handler.
+ */
+void libxl__colo_restore_done(libxl__egc *egc, void *dcs_void,
+                              int ret, int retval, int errnoval)
+{
+    libxl__domain_create_state *dcs = dcs_void;
+    libxl__colo_restore_state *const crs = &dcs->crs;
+    const int rc = (ret == 0 && retval == 0) ? 0 : 1;
+
+    STATE_AO_GC(crs->ao);
+
+    /* teardown and failover */
+    crs->callback = libxl__colo_restore_teardown_done;
+
+    LOG(INFO, "%s", rc ? "colo fails" : "failover");
+    libxl__colo_restore_teardown(egc, crs, rc);
+}
+
+/*
+ * Installed as crs->callback while checkpoints are being processed.
+ * Translates rc into the helper's ok flag (1 when rc is 0, else 0) and
+ * completes the pending save/restore helper callback.
+ *
+ * The egc passed to this callback is used rather than shs->egc, in
+ * line with the other completion handlers that receive an egc.
+ * NOTE(review): shs->egc is presumably only valid while a helper
+ * message is being dispatched - confirm against libxl_save_callout.c.
+ */
+static void libxl__colo_restore_cp_done(libxl__egc *egc,
+                                        libxl__colo_restore_state *crs,
+                                        int rc)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    int ok = 0;
+
+    /* convenience aliases */
+    libxl__save_helper_state *const shs = &dcs->shs;
+
+    if (!rc)
+        ok = 1;
+
+    libxl__xc_domain_saverestore_async_callback_done(egc, shs, ok);
+}
+
+/*
+ * Runs when the colo restore setup has finished.  A failure is reported
+ * through libxl__xc_domain_restore_done(); on success the checkpoint
+ * callback is installed and the actual restore is started.
+ */
+static void libxl__colo_restore_setup_done(libxl__egc *egc,
+                                           libxl__colo_restore_state *crs,
+                                           int rc)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+
+    STATE_AO_GC(crs->ao);
+
+    if (rc) {
+        LOG(ERROR, "colo restore setup fails: %d", rc);
+        libxl__xc_domain_restore_done(egc, dcs, rc, 0, 0);
+        return;
+    }
+
+    crs->callback = libxl__colo_restore_cp_done;
+    libxl__xc_domain_restore(egc, dcs,
+                             crs->hvm, crs->pae, crs->superpages);
+}
+
static void domcreate_bootloader_done(libxl__egc *egc,
libxl__bootloader_state *bl,
int rc)
@@ -952,6 +1043,8 @@ static void domcreate_bootloader_done(libxl__egc *egc,
libxl__domain_build_state *const state = &dcs->build_state;
libxl__srm_restore_autogen_callbacks *const callbacks =
&dcs->shs.callbacks.restore.a;
+ const int checkpointed_stream = dcs->checkpointed_stream;
+ libxl__colo_restore_state *const crs = &dcs->crs;
if (rc) {
domcreate_rebuild_done(egc, dcs, rc);
@@ -980,6 +1073,13 @@ static void domcreate_bootloader_done(libxl__egc *egc,
/* Restore */
+ /* COLO only supports HVM now */
+ if (info->type != LIBXL_DOMAIN_TYPE_HVM &&
+ checkpointed_stream == LIBXL_CHECKPOINTED_STREAM_COLO) {
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
rc = libxl__build_pre(gc, domid, d_config, state);
if (rc)
goto out;
@@ -1002,8 +1102,20 @@ static void domcreate_bootloader_done(libxl__egc *egc,
rc = ERROR_INVAL;
goto out;
}
- libxl__xc_domain_restore(egc, dcs,
- hvm, pae, superpages);
+
+ if (checkpointed_stream == LIBXL_CHECKPOINTED_STREAM_COLO) {
+ crs->ao = ao;
+ crs->domid = domid;
+ crs->send_fd = dcs->send_fd;
+ crs->recv_fd = restore_fd;
+ crs->hvm = hvm;
+ crs->superpages = superpages;
+ crs->pae = pae;
+ crs->callback = libxl__colo_restore_setup_done;
+ libxl__colo_restore_setup(egc, crs);
+ } else
+ libxl__xc_domain_restore(egc, dcs,
+ hvm, pae, superpages);
return;
out:
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index 3359a9f..ad190f9 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -947,7 +947,7 @@ static void switch_logdirty_xswatch(libxl__egc *egc, libxl__ev_xswatch*,
static void switch_logdirty_done(libxl__egc *egc,
libxl__logdirty_switch *lds, int ok);
-static void logdirty_init(libxl__logdirty_switch *lds)
+void logdirty_init(libxl__logdirty_switch *lds)
{
lds->cmd_path = 0;
libxl__ev_xswatch_init(&lds->watch);
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 76d57fb..8f07d1b 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2848,6 +2848,7 @@ struct libxl__logdirty_switch {
libxl__ev_xswatch watch;
libxl__ev_time timeout;
};
+_hidden void logdirty_init(libxl__logdirty_switch *lds);
/*
* libxl__domain_suspend_state is for saving guest, not
@@ -3139,6 +3140,26 @@ typedef void libxl__domain_create_cb(libxl__egc *egc,
libxl__domain_create_state*,
int rc, uint32_t domid);
+/* colo related structure: state of a COLO restore on the secondary side */
+typedef struct libxl__colo_restore_state libxl__colo_restore_state;
+typedef void libxl__colo_callback(libxl__egc *,
+ libxl__colo_restore_state *, int rc);
+struct libxl__colo_restore_state {
+ /* must be set by the caller of libxl__colo_restore_(setup|teardown) */
+ libxl__ao *ao;
+ uint32_t domid;
+ int send_fd; /* fd the colo stream is written to */
+ int recv_fd; /* fd the colo stream is read from */
+ int hvm; /* passed on to libxl__xc_domain_restore() */
+ int pae; /* passed on to libxl__xc_domain_restore() */
+ int superpages; /* passed on to libxl__xc_domain_restore() */
+ libxl__colo_callback *callback; /* invoked with the result rc */
+
+ /* private, colo restore checkpoint state */
+ libxl__domain_create_cb *saved_cb; /* stashed dcs->callback, NULL once restored */
+ void *crcs; /* used as a libxl__colo_restore_checkpoint_state pointer */
+};
+
struct libxl__domain_create_state {
/* filled in by user */
libxl__ao *ao;
@@ -3152,6 +3173,7 @@ struct libxl__domain_create_state {
int guest_domid;
int checkpointed_stream;
libxl__domain_build_state build_state;
+ libxl__colo_restore_state crs;
libxl__bootloader_state bl;
libxl__stub_dm_spawn_state dmss;
/* If we're not doing stubdom, we use only dmss.dm,
diff --git a/tools/libxl/libxl_save_callout.c b/tools/libxl/libxl_save_callout.c
index 477e633..9082b66 100644
--- a/tools/libxl/libxl_save_callout.c
+++ b/tools/libxl/libxl_save_callout.c
@@ -15,6 +15,7 @@
#include "libxl_osdeps.h"
#include "libxl_internal.h"
+#include "libxl_colo.h"
/* stream_fd is as from the caller (eventually, the application).
* It may be 0, 1 or 2, in which case we need to dup it elsewhere.
@@ -65,7 +66,10 @@ void libxl__xc_domain_restore(libxl__egc *egc, libxl__domain_create_state *dcs,
dcs->shs.ao = ao;
dcs->shs.domid = domid;
dcs->shs.recv_callback = libxl__srm_callout_received_restore;
- dcs->shs.completion_callback = libxl__xc_domain_restore_done;
+ if (dcs->checkpointed_stream == LIBXL_CHECKPOINTED_STREAM_COLO)
+ dcs->shs.completion_callback = libxl__colo_restore_done;
+ else
+ dcs->shs.completion_callback = libxl__xc_domain_restore_done;
dcs->shs.caller_state = dcs;
dcs->shs.need_results = 1;
dcs->shs.toolstack_data_file = 0;
diff --git a/tools/libxl/libxl_save_msgs_gen.pl b/tools/libxl/libxl_save_msgs_gen.pl
index 41ee000..0239cac 100755
--- a/tools/libxl/libxl_save_msgs_gen.pl
+++ b/tools/libxl/libxl_save_msgs_gen.pl
@@ -24,9 +24,9 @@ our @msgs = (
STRING doing_what),
'unsigned long', 'done',
'unsigned long', 'total'] ],
- [ 3, 'scxA', "suspend", [] ],
- [ 4, 'scxA', "postcopy", [] ],
- [ 5, 'scxA', "checkpoint", [] ],
+ [ 3, 'srcxA', "suspend", [] ],
+ [ 4, 'srcxA', "postcopy", [] ],
+ [ 5, 'srcxA', "checkpoint", [] ],
[ 6, 'scxA', "switch_qemu_logdirty", [qw(int domid
unsigned enable)] ],
# toolstack_save done entirely `by hand'
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 08/18] primary vm suspend/get_dirty_pfn/resume/checkpoint code
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (6 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 07/18] secondary vm suspend/resume/checkpoint code Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 09/18] xc_domain_save: flush cache before calling callbacks->postcopy() in colo mode Wen Congyang
` (11 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Yang Hongyang, Lai Jiangshan
We will do the following things again and again:
1. Suspend primary vm
a. Suspend primary vm
b. do postsuspend
c. Read LIBXL_COLO_SVM_SUSPENDED from slave
d. Read secondary vm's dirty page information from slave (count + pfn list)
2. Get dirty pfn list
a. Return secondary vm's dirty pfn list
3. Resume primary vm
a. Read LIBXL_COLO_SVM_READY from slave
b. Do presume
c. Resume primary vm
d. Read LIBXL_COLO_SVM_RESUMED from slave
4. Wait a new checkpoint
a. Wait a new checkpoint(not implemented)
b. Send LIBXL_COLO_NEW_CHECKPOINT to slave
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
tools/libxc/include/xenguest.h | 12 +
tools/libxl/Makefile | 2 +-
tools/libxl/libxl.c | 6 +-
tools/libxl/libxl_colo.h | 10 +
tools/libxl/libxl_colo_save.c | 608 +++++++++++++++++++++++++++++++++++++
tools/libxl/libxl_dom.c | 13 +-
tools/libxl/libxl_internal.h | 32 +-
tools/libxl/libxl_save_msgs_gen.pl | 1 +
tools/libxl/libxl_types.idl | 1 +
9 files changed, 677 insertions(+), 8 deletions(-)
create mode 100644 tools/libxl/libxl_colo_save.c
diff --git a/tools/libxc/include/xenguest.h b/tools/libxc/include/xenguest.h
index d3061c7..1aeaad2 100644
--- a/tools/libxc/include/xenguest.h
+++ b/tools/libxc/include/xenguest.h
@@ -72,6 +72,18 @@ struct save_callbacks {
*/
int (*toolstack_save)(uint32_t domid, uint8_t **buf, uint32_t *len, void *data);
+ /* Called after the guest is suspended.
+ *
+ * returns the list of dirty pfn:
+ * struct {
+ * uint64_t count;
+ * uint64_t pfn[];
+ * };
+ *
+ * Note: the caller must free the return value.
+ */
+ uint8_t *(*get_dirty_pfn)(void *data);
+
/* to be provided as the last argument to each callback function */
void* data;
};
diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile
index 9cfff46..ab565ee 100644
--- a/tools/libxl/Makefile
+++ b/tools/libxl/Makefile
@@ -57,7 +57,7 @@ LIBXL_OBJS-y += libxl_nonetbuffer.o
endif
LIBXL_OBJS-y += libxl_remus.o libxl_checkpoint_device.o libxl_remus_disk_drbd.o
-LIBXL_OBJS-y += libxl_colo_restore.o
+LIBXL_OBJS-y += libxl_colo_restore.o libxl_colo_save.o
LIBXL_OBJS-$(CONFIG_X86) += libxl_cpuid.o libxl_x86.o libxl_psr.o
LIBXL_OBJS-$(CONFIG_ARM) += libxl_nocpuid.o libxl_arm.o
diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 145e897..b05a4bf 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -18,6 +18,7 @@
#include "libxl_internal.h"
#include "libxl_remus.h"
+#include "libxl_colo.h"
#define PAGE_TO_MEMKB(pages) ((pages) * 4)
#define BACKEND_STRING_SIZE 5
@@ -858,7 +859,10 @@ int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info,
assert(info);
/* Point of no return */
- libxl__remus_setup(egc, &dss->rs);
+ if (libxl_defbool_val(info->colo))
+ libxl__colo_save_setup(egc, &dss->css);
+ else
+ libxl__remus_setup(egc, &dss->rs);
return AO_INPROGRESS;
out:
diff --git a/tools/libxl/libxl_colo.h b/tools/libxl/libxl_colo.h
index 91df275..26a2563 100644
--- a/tools/libxl/libxl_colo.h
+++ b/tools/libxl/libxl_colo.h
@@ -35,4 +35,14 @@ extern void libxl__colo_restore_teardown(libxl__egc *egc,
libxl__colo_restore_state *crs,
int rc);
+extern void libxl__colo_save_domain_suspend_callback(void *data);
+extern void libxl__colo_save_domain_resume_callback(void *data);
+extern void libxl__colo_save_domain_checkpoint_callback(void *data);
+extern void libxl__colo_save_get_dirty_pfn_callback(void *data);
+extern void libxl__colo_save_setup(libxl__egc *egc,
+ libxl__colo_save_state *css);
+extern void libxl__colo_save_teardown(libxl__egc *egc,
+ libxl__colo_save_state *css,
+ int rc);
+
#endif
diff --git a/tools/libxl/libxl_colo_save.c b/tools/libxl/libxl_colo_save.c
new file mode 100644
index 0000000..6fbff9f
--- /dev/null
+++ b/tools/libxl/libxl_colo_save.c
@@ -0,0 +1,608 @@
+/*
+ * Copyright (C) 2014 FUJITSU LIMITED
+ * Author: Wen Congyang <wency@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#include "libxl_internal.h"
+#include "libxl_colo.h"
+
+static const libxl__checkpoint_device_instance_ops *colo_ops[] = {
+ NULL,
+};
+
+/* ================= colo: setup save environment ================= */
+static void colo_save_setup_done(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
+ int rc);
+static void colo_save_setup_failed(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
+ int rc);
+
+/*
+ * Prepare the primary side for COLO.  Only HVM guests are accepted;
+ * any other type completes the ao with ERROR_FAIL.  Otherwise the
+ * stream fds are recorded and the checkpoint device setup is started,
+ * with colo_save_setup_done() as its completion handler.
+ */
+void libxl__colo_save_setup(libxl__egc *egc, libxl__colo_save_state *css)
+{
+    libxl__domain_suspend_state *dss = CONTAINER_OF(css, *dss, css);
+    libxl__checkpoint_devices_state *const cds = &css->cds;
+
+    STATE_AO_GC(dss->ao);
+
+    if (dss->type != LIBXL_DOMAIN_TYPE_HVM) {
+        LOG(ERROR, "COLO only supports hvm now");
+        libxl__ao_complete(egc, ao, ERROR_FAIL);
+        return;
+    }
+
+    css->send_fd = dss->fd;
+    css->recv_fd = dss->recv_fd;
+    css->svm_running = false;
+
+    /* TODO: disk/nic support */
+    cds->device_kind_flags = 0;
+    cds->ops = colo_ops;
+    cds->callback = colo_save_setup_done;
+    cds->ao = ao;
+    cds->domid = dss->domid;
+
+    libxl__checkpoint_devices_setup(egc, cds);
+}
+
+/*
+ * Completion of the checkpoint device setup.  On success the domain
+ * suspend is started; on failure the devices are torn down again, with
+ * colo_save_setup_failed() as the completion handler.
+ */
+static void colo_save_setup_done(libxl__egc *egc,
+                                 libxl__checkpoint_devices_state *cds,
+                                 int rc)
+{
+    libxl__colo_save_state *css = CONTAINER_OF(cds, *css, cds);
+    libxl__domain_suspend_state *dss = CONTAINER_OF(css, *dss, css);
+    STATE_AO_GC(cds->ao);
+
+    if (rc) {
+        LOG(ERROR, "COLO: failed to setup device for guest with domid %u",
+            dss->domid);
+        cds->callback = colo_save_setup_failed;
+        libxl__checkpoint_devices_teardown(egc, cds);
+        return;
+    }
+
+    libxl__domain_suspend(egc, dss);
+}
+
+/*
+ * Completion of the device teardown that follows a failed setup: log a
+ * teardown error, if any, and complete the ao with rc.
+ */
+static void colo_save_setup_failed(libxl__egc *egc,
+                                   libxl__checkpoint_devices_state *cds,
+                                   int rc)
+{
+    STATE_AO_GC(cds->ao);
+
+    if (rc != 0) {
+        LOG(ERROR, "COLO: failed to teardown device after setup failed"
+            " for guest with domid %u, rc %d", cds->domid, rc);
+    }
+
+    libxl__ao_complete(egc, ao, rc);
+}
+
+
+/* ================= colo: teardown save environment ================= */
+static void colo_teardown_done(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
+ int rc);
+
+/*
+ * Tear down the COLO checkpoint devices on the primary side.  rc is
+ * only logged here; colo_teardown_done() runs when the device teardown
+ * has finished.
+ */
+void libxl__colo_save_teardown(libxl__egc *egc,
+                               libxl__colo_save_state *css,
+                               int rc)
+{
+    libxl__checkpoint_devices_state *const cds = &css->cds;
+
+    STATE_AO_GC(cds->ao);
+
+    LOG(WARN, "COLO: Domain suspend terminated with rc %d,"
+        " teardown COLO devices...", rc);
+    cds->callback = colo_teardown_done;
+    libxl__checkpoint_devices_teardown(egc, cds);
+}
+
+/*
+ * Device teardown finished: pass rc on to the domain suspend callback.
+ */
+static void colo_teardown_done(libxl__egc *egc,
+                               libxl__checkpoint_devices_state *cds,
+                               int rc)
+{
+    libxl__colo_save_state *const css = CONTAINER_OF(cds, *css, cds);
+    libxl__domain_suspend_state *const dss = CONTAINER_OF(css, *dss, css);
+
+    dss->callback(egc, dss, rc);
+}
+
+/*
+ * checkpoint callbacks are called in the following order:
+ * 1. suspend
+ * 2. resume
+ * 3. checkpoint
+ */
+static void colo_common_read_done(libxl__egc *egc,
+ libxl__datareader_state *drs,
+ ssize_t real_size, int errnoval);
+/* ===================== colo: suspend primary vm ===================== */
+/*
+ * Do the following things when suspending primary vm:
+ * 1. suspend primary vm
+ * 2. do postsuspend
+ * 3. read LIBXL_COLO_SVM_SUSPENDED
+ * 4. read secondary vm's dirty pages
+ */
+static void colo_suspend_primary_vm_done(libxl__egc *egc,
+ libxl__domain_suspend_state2 *dss2,
+ int ok);
+static void colo_postsuspend_cb(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
+ int rc);
+static void colo_read_pfn(libxl__egc *egc, libxl__colo_save_state *css);
+
+/*
+ * Save-helper "suspend" callback on the primary side: start suspending
+ * the primary vm, with colo_suspend_primary_vm_done() as the
+ * completion handler.
+ */
+void libxl__colo_save_domain_suspend_callback(void *data)
+{
+    libxl__save_helper_state *shs = data;
+    libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
+
+    dss->dss2.callback_common_done = colo_suspend_primary_vm_done;
+    libxl__domain_suspend2(shs->egc, &dss->dss2);
+}
+
+/*
+ * Runs when the primary vm suspend has completed.  On failure the
+ * helper callback is completed with the (zero) ok value; on success
+ * the checkpoint devices' postsuspend step is started, with
+ * colo_postsuspend_cb() as its completion handler.
+ */
+static void colo_suspend_primary_vm_done(libxl__egc *egc,
+                                         libxl__domain_suspend_state2 *dss2,
+                                         int ok)
+{
+    libxl__domain_suspend_state *dss = CONTAINER_OF(dss2, *dss, dss2);
+
+    STATE_AO_GC(dss2->ao);
+
+    if (!ok) {
+        LOG(ERROR, "cannot suspend primary vm");
+        libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
+        return;
+    }
+
+    dss->css.cds.callback = colo_postsuspend_cb;
+    libxl__checkpoint_devices_postsuspend(egc, &dss->css.cds);
+}
+
+/*
+ * Completion of the checkpoint devices' postsuspend step.
+ *
+ *  - rc != 0:                 complete the helper callback with 0
+ *  - secondary vm not running: complete the helper callback with 1
+ *  - otherwise: read sizeof(css->temp_buff) bytes from recv_fd into
+ *    css->temp_buff, with colo_read_pfn() installed as css->callback.
+ *    If the reader cannot be started, complete the helper callback
+ *    with 0.
+ */
+static void colo_postsuspend_cb(libxl__egc *egc,
+                                libxl__checkpoint_devices_state *cds,
+                                int rc)
+{
+    libxl__colo_save_state *css = CONTAINER_OF(cds, *css, cds);
+    libxl__domain_suspend_state *dss = CONTAINER_OF(css, *dss, css);
+    libxl__datareader_state *const reader = &css->drs;
+
+    STATE_AO_GC(cds->ao);
+
+    if (rc) {
+        LOG(ERROR, "postsuspend fails");
+        libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 0);
+        return;
+    }
+
+    if (!css->svm_running) {
+        libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 1);
+        return;
+    }
+
+    /*
+     * read LIBXL_COLO_SVM_SUSPENDED and the count of
+     * secondary vm's dirty pages.
+     */
+    memset(reader, 0, sizeof(*reader));
+    reader->ao = ao;
+    reader->readfd = css->recv_fd;
+    reader->readsize = sizeof(css->temp_buff);
+    reader->readwhat = "colo stream";
+    reader->callback = colo_common_read_done;
+    reader->buf = css->temp_buff;
+    css->callback = colo_read_pfn;
+
+    if (!libxl__datareader_start(reader))
+        return;
+
+    LOG(ERROR, "libxl__datareader_start() fails");
+    libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 0);
+}
+
+/*
+ * Parse the header read into css->temp_buff (section byte followed by
+ * a 64-bit dirty page count) and, if pages are dirty, start reading
+ * the pfn list into a freshly allocated css->buff.
+ *
+ * css->buff[0] holds the count; the pfns follow from css->buff[1].
+ * The helper callback is completed with 1 when there are no dirty
+ * pages, and with 0 on any error.  On the error paths css->buff is
+ * left (or reset to) NULL.
+ */
+static void colo_read_pfn(libxl__egc *egc, libxl__colo_save_state *css)
+{
+    int ok = 0;
+    uint64_t count;
+    libxl__domain_suspend_state *dss = CONTAINER_OF(css, *dss, css);
+    STATE_AO_GC(css->cds.ao);
+
+    /* Convenience aliases */
+    libxl__datareader_state *const drs = &css->drs;
+
+    assert(!css->buff);
+    css->section = css->temp_buff[0];
+    /*
+     * The count starts one element into temp_buff, so copy it out with
+     * memcpy instead of dereferencing a cast pointer, which may be
+     * misaligned if temp_buff is a byte array.
+     */
+    memcpy(&count, &css->temp_buff[1], sizeof(count));
+    css->count = count;
+
+    if (css->section != LIBXL_COLO_SVM_SUSPENDED) {
+        LOG(ERROR, "invalid section: %d, expected: %d",
+            css->section, LIBXL_COLO_SVM_SUSPENDED);
+        goto out;
+    }
+
+    /* The count comes from the peer: refuse sizes that would wrap. */
+    if (count > SIZE_MAX / sizeof(uint64_t) - 1) {
+        LOG(ERROR, "invalid dirty page count: %llu",
+            (unsigned long long)count);
+        goto out;
+    }
+
+    css->buff = libxl__zalloc(NOGC, sizeof(uint64_t) * (css->count + 1));
+    css->buff[0] = css->count;
+
+    if (css->count == 0) {
+        /* no dirty pages */
+        ok = 1;
+        goto out;
+    }
+
+    /* read the pfn of secondary vm's dirty pages */
+    memset(drs, 0, sizeof(*drs));
+    drs->ao = ao;
+    drs->readfd = css->recv_fd;
+    drs->readsize = css->count * sizeof(uint64_t);
+    drs->readwhat = "colo stream";
+    drs->callback = colo_common_read_done;
+    drs->buf = css->buff + 1;
+    css->callback = NULL;
+
+    if (libxl__datareader_start(drs)) {
+        LOG(ERROR, "libxl__datareader_start() fails");
+        /*
+         * Nothing will fill the buffer on this path: release it so it
+         * is not leaked and the assert above holds on a later call.
+         */
+        free(css->buff);
+        css->buff = NULL;
+        goto out;
+    }
+
+    return;
+
+out:
+    libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
+}
+
+
+/* ===================== colo: get dirty pfn ===================== */
+void libxl__colo_save_get_dirty_pfn_callback(void *data)
+{
+ libxl__save_helper_state *shs = data;
+ libxl__egc *egc = shs->egc;
+ libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
+ uint64_t size;
+
+ /* Convenience aliases */
+ libxl__colo_save_state *const css = &dss->css;
+
+ assert(css->buff);
+ size = sizeof(uint64_t) * (css->count + 1);
+
+ libxl__xc_domain_saverestore_async_callback_done_with_data(egc, shs,
+ (uint8_t *)css->buff,
+ size);
+ free(css->buff);
+ css->buff = NULL;
+}
+
+
+/* ===================== colo: resume primary vm ===================== */
+/*
+ * Do the following things when resuming primary vm:
+ * 1. read LIBXL_COLO_SVM_READY
+ * 2. do preresume
+ * 3. resume primary vm
+ * 4. read LIBXL_COLO_SVM_RESUMED
+ */
+static void colo_preresume_dm_saved(libxl__egc *egc,
+ libxl__domain_suspend_state *dss, int rc);
+static void colo_read_svm_ready_done(libxl__egc *egc,
+ libxl__colo_save_state *css);
+static void colo_preresume_cb(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
+ int rc);
+static void colo_read_svm_resumed_done(libxl__egc *egc,
+ libxl__colo_save_state *css);
+
+void libxl__colo_save_domain_resume_callback(void *data)
+{
+ libxl__save_helper_state *shs = data;
+ libxl__egc *egc = shs->egc;
+ libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
+
+ /* This would go into tailbuf. */
+ if (dss->hvm) {
+ libxl__domain_save_device_model(egc, dss, colo_preresume_dm_saved);
+ } else {
+ colo_preresume_dm_saved(egc, dss, 0);
+ }
+
+ return;
+}
+
+static void colo_preresume_dm_saved(libxl__egc *egc,
+ libxl__domain_suspend_state *dss, int rc)
+{
+ /* Convenience aliases */
+ libxl__colo_save_state *const css = &dss->css;
+ libxl__datareader_state *const drs = &css->drs;
+
+ STATE_AO_GC(css->cds.ao);
+
+ if (rc) {
+ LOG(ERROR, "Failed to save device model. Terminating COLO..");
+ goto out;
+ }
+
+ /* read LIBXL_COLO_SVM_READY */
+ memset(drs, 0, sizeof(*drs));
+ drs->ao = ao;
+ drs->readfd = css->recv_fd;
+ drs->readsize = sizeof(css->section);
+ drs->readwhat = "colo stream";
+ drs->callback = colo_common_read_done;
+ drs->buf = &css->section;
+ css->callback = colo_read_svm_ready_done;
+
+ if (libxl__datareader_start(drs)) {
+ LOG(ERROR, "libxl__datareader_start() fails");
+ goto out;
+ }
+
+ return;
+
+out:
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 0);
+}
+
+static void colo_read_svm_ready_done(libxl__egc *egc,
+ libxl__colo_save_state *css)
+{
+ libxl__domain_suspend_state *dss = CONTAINER_OF(css, *dss, css);
+
+ STATE_AO_GC(css->cds.ao);
+
+ if (css->section != LIBXL_COLO_SVM_READY) {
+ LOG(ERROR, "invalid section: %d, expected: %d",
+ css->section, LIBXL_COLO_SVM_READY);
+ goto out;
+ }
+
+ css->svm_running = true;
+ css->cds.callback = colo_preresume_cb;
+ libxl__checkpoint_devices_preresume(egc, &css->cds);
+
+ return;
+
+out:
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 0);
+}
+
+static void colo_preresume_cb(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
+ int rc)
+{
+ libxl__colo_save_state *css = CONTAINER_OF(cds, *css, cds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(css, *dss, css);
+
+ /* Convenience aliases */
+ libxl__datareader_state *const drs = &css->drs;
+
+ STATE_AO_GC(cds->ao);
+
+ if (rc) {
+ LOG(ERROR, "preresume fails");
+ goto out;
+ }
+
+ /* Resumes the domain and the device model */
+ if (libxl__domain_resume(gc, dss->domid, /* Fast Suspend */1)) {
+ LOG(ERROR, "cannot resume primary vm");
+ goto out;
+ }
+
+ /* read LIBXL_COLO_SVM_RESUMED */
+ memset(drs, 0, sizeof(*drs));
+ drs->ao = ao;
+ drs->readfd = css->recv_fd;
+ drs->readsize = sizeof(css->section);
+ drs->readwhat = "colo stream";
+ drs->callback = colo_common_read_done;
+ drs->buf = &css->section;
+ css->callback = colo_read_svm_resumed_done;
+
+ if (libxl__datareader_start(drs)) {
+ LOG(ERROR, "libxl__datareader_start() fails");
+ goto out;
+ }
+
+ return;
+
+out:
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 0);
+}
+
+static void colo_read_svm_resumed_done(libxl__egc *egc,
+ libxl__colo_save_state *css)
+{
+ int ok = 0;
+ libxl__domain_suspend_state *dss = CONTAINER_OF(css, *dss, css);
+
+ STATE_AO_GC(css->cds.ao);
+
+ if (css->section != LIBXL_COLO_SVM_RESUMED) {
+ LOG(ERROR, "invalid section: %d, expected: %d",
+ css->section, LIBXL_COLO_SVM_RESUMED);
+ goto out;
+ }
+
+ ok = 1;
+
+out:
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
+}
+
+
+/* ===================== colo: wait new checkpoint ===================== */
+/*
+ * Do the following things:
+ * 1. do commit
+ * 2. wait for a new checkpoint
+ * 3. write LIBXL_COLO_NEW_CHECKPOINT
+ */
+static void colo_device_commit_cb(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
+ int rc);
+static void colo_start_new_checkpoint(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
+ int rc);
+static void colo_send_data_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int onwrite, int errnoval);
+
+void libxl__colo_save_domain_checkpoint_callback(void *data)
+{
+ libxl__save_helper_state *shs = data;
+ libxl__domain_suspend_state *dss = CONTAINER_OF(shs, *dss, shs);
+ libxl__egc *egc = dss->shs.egc;
+
+ /* Convenience aliases */
+ libxl__checkpoint_devices_state *const cds = &dss->css.cds;
+
+ cds->callback = colo_device_commit_cb;
+ libxl__checkpoint_devices_commit(egc, cds);
+}
+
+static void colo_device_commit_cb(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
+ int rc)
+{
+ libxl__colo_save_state *css = CONTAINER_OF(cds, *css, cds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(css, *dss, css);
+
+ STATE_AO_GC(cds->ao);
+
+ if (rc) {
+ LOG(ERROR, "commit fails");
+ goto out;
+ }
+
+ /* TODO: wait a new checkpoint */
+ colo_start_new_checkpoint(egc, cds, 0);
+ return;
+
+out:
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 0);
+}
+
+static void colo_start_new_checkpoint(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
+ int rc)
+{
+ libxl__colo_save_state *css = CONTAINER_OF(cds, *css, cds);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(css, *dss, css);
+ uint8_t section = LIBXL_COLO_NEW_CHECKPOINT;
+
+ /* Convenience aliases */
+ libxl__datacopier_state *const dc = &css->dc;
+
+ STATE_AO_GC(cds->ao);
+
+ if (rc)
+ goto out;
+
+ /* write LIBXL_COLO_NEW_CHECKPOINT */
+ memset(dc, 0, sizeof(*dc));
+ dc->ao = ao;
+ dc->readfd = -1;
+ dc->writefd = css->send_fd;
+ dc->maxsz = INT_MAX;
+ dc->copywhat = "new checkpoint is triggered";
+ dc->writewhat = "colo stream";
+ dc->callback = colo_send_data_done;
+
+ rc = libxl__datacopier_start(dc);
+ if (rc) {
+ LOG(ERROR, "libxl__datacopier_start() fails");
+ goto out;
+ }
+
+ /* tell slave that a new checkpoint is triggered */
+ libxl__datacopier_prefixdata(egc, dc, &section, sizeof(section));
+ return;
+
+out:
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, 0);
+}
+
+static void colo_send_data_done(libxl__egc *egc,
+ libxl__datacopier_state *dc,
+ int onwrite, int errnoval)
+{
+ libxl__colo_save_state *css = CONTAINER_OF(dc, *css, dc);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(css, *dss, css);
+ int ok;
+
+ STATE_AO_GC(dc->ao);
+
+ if (onwrite == -1 || errnoval) {
+ LOG(ERROR, "cannot start a new checkpoint");
+ ok = 0;
+ goto out;
+ }
+
+ /* Everything is OK */
+ ok = 1;
+
+out:
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
+}
+
+
+/* ===================== colo: common callback ===================== */
+static void colo_common_read_done(libxl__egc *egc,
+ libxl__datareader_state *drs,
+ ssize_t real_size, int errnoval)
+{
+ int ok = 0;
+ libxl__colo_save_state *css = CONTAINER_OF(drs, *css, drs);
+ libxl__domain_suspend_state *dss = CONTAINER_OF(css, *dss, css);
+ STATE_AO_GC(drs->ao);
+
+ if (real_size < drs->readsize) {
+ LOG(ERROR, "reading data fails: %lld", (long long)real_size);
+ goto out;
+ }
+
+ if (!css->callback) {
+ /* Everything is OK */
+ ok = 1;
+ goto out;
+ }
+
+ css->callback(egc, css);
+ return;
+
+out:
+ libxl__xc_domain_saverestore_async_callback_done(egc, &dss->shs, ok);
+}
diff --git a/tools/libxl/libxl_dom.c b/tools/libxl/libxl_dom.c
index ad190f9..03a17a9 100644
--- a/tools/libxl/libxl_dom.c
+++ b/tools/libxl/libxl_dom.c
@@ -20,6 +20,7 @@
#include "libxl_internal.h"
#include "libxl_arch.h"
#include "libxl_remus.h"
+#include "libxl_colo.h"
#include <xc_dom.h>
#include <xen/hvm/hvm_info_table.h>
@@ -1717,7 +1718,12 @@ void libxl__domain_suspend(libxl__egc *egc, libxl__domain_suspend_state *dss)
}
memset(callbacks, 0, sizeof(*callbacks));
- if (r_info != NULL) {
+ if (r_info != NULL && libxl_defbool_val(r_info->colo)) {
+ callbacks->suspend = libxl__colo_save_domain_suspend_callback;
+ callbacks->postcopy = libxl__colo_save_domain_resume_callback;
+ callbacks->checkpoint = libxl__colo_save_domain_checkpoint_callback;
+ callbacks->get_dirty_pfn = libxl__colo_save_get_dirty_pfn_callback;
+ } else if (r_info != NULL) {
callbacks->suspend = libxl__remus_domain_suspend_callback;
callbacks->postcopy = libxl__remus_domain_resume_callback;
callbacks->checkpoint = libxl__remus_domain_checkpoint_callback;
@@ -1878,7 +1884,10 @@ static void domain_suspend_done(libxl__egc *egc,
xc_suspend_evtchn_release(CTX->xch, CTX->xce, domid,
dss2->guest_evtchn.port, &dss2->guest_evtchn_lockfd);
- if (dss->remus) {
+ if (dss->remus && libxl_defbool_val(dss->remus->colo)) {
+ libxl__colo_save_teardown(egc, &dss->css, rc);
+ return;
+ } else if (dss->remus) {
libxl__remus_teardown(egc, &dss->rs, rc);
return;
}
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 8f07d1b..924f8a3 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2637,7 +2637,7 @@ typedef struct libxl__save_helper_state {
/*
* The abstract checkpoint device layer exposes a common
* set of API to [external] libxl for manipulating devices attached to
- * a guest protected by Remus. The device layer also exposes a set of
+ * a guest protected by Remus/COLO. The device layer also exposes a set of
* [internal] interfaces that every device type must implement.
*
* The following API are exposed to libxl:
@@ -2655,7 +2655,7 @@ typedef struct libxl__save_helper_state {
* +libxl__checkpoint_devices_commit
*
* Each device type needs to implement the interfaces specified in
- * the libxl__checkpoint_device_instance_ops if it wishes to support Remus.
+ * the libxl__checkpoint_device_instance_ops if it wishes to support Remus/COLO.
*
* The high-level control flow through the checkpoint device layer is shown
* below:
@@ -2675,7 +2675,7 @@ typedef struct libxl__checkpoint_device_instance_ops libxl__checkpoint_device_in
/*
* Interfaces to be implemented by every device subkind that wishes to
- * support Remus. Functions must be implemented unless otherwise
+ * support Remus/COLO. Functions must be implemented unless otherwise
* stated. Many of these functions are asynchronous. They call
* dev->aodev.callback when done. The actual implementations may be
* synchronous and call dev->aodev.callback directly (as the last
@@ -2826,6 +2826,25 @@ struct libxl__remus_state {
_hidden int libxl__netbuffer_enabled(libxl__gc *gc);
+/*----- colo related state structure -----*/
+typedef struct libxl__colo_save_state libxl__colo_save_state;
+struct libxl__colo_save_state {
+ libxl__checkpoint_devices_state cds;
+ int send_fd;
+ int recv_fd;
+
+ /* private */
+ libxl__datacopier_state dc;
+ libxl__datareader_state drs;
+ uint8_t section;
+ uint64_t count;
+ uint64_t *buff;
+ /* read section and count, and then store it in temp_buff */
+ uint8_t temp_buff[9];
+ void (*callback)(libxl__egc *, libxl__colo_save_state *);
+ bool svm_running;
+};
+
/*----- Domain suspend (save) state structure -----*/
typedef struct libxl__domain_suspend_state libxl__domain_suspend_state;
@@ -2889,7 +2908,12 @@ struct libxl__domain_suspend_state {
libxl__domain_suspend_state2 dss2;
int hvm;
int xcflags;
- libxl__remus_state rs;
+ union {
+ /* for Remus */
+ libxl__remus_state rs;
+ /* for COLO */
+ libxl__colo_save_state css;
+ };
libxl__save_helper_state shs;
libxl__logdirty_switch logdirty;
/* private for libxl__domain_save_device_model */
diff --git a/tools/libxl/libxl_save_msgs_gen.pl b/tools/libxl/libxl_save_msgs_gen.pl
index 0239cac..fbb2d67 100755
--- a/tools/libxl/libxl_save_msgs_gen.pl
+++ b/tools/libxl/libxl_save_msgs_gen.pl
@@ -36,6 +36,7 @@ our @msgs = (
'unsigned long', 'console_mfn'] ],
[ 9, 'srW', "complete", [qw(int retval
int errnoval)] ],
+ [ 10, 'scxAB', "get_dirty_pfn", [] ],
);
#----------------------------------------
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index ed567f4..54e1684 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -664,6 +664,7 @@ libxl_domain_remus_info = Struct("domain_remus_info",[
("netbuf", libxl_defbool),
("netbufscript", string),
("diskbuf", libxl_defbool),
+ ("colo", libxl_defbool)
])
libxl_event_type = Enumeration("event_type", [
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 09/18] xc_domain_save: flush cache before calling callbacks->postcopy() in colo mode
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (7 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 08/18] primary vm suspend/get_dirty_pfn/resume/checkpoint code Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 10/18] COLO: xc related codes Wen Congyang
` (10 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Yang Hongyang, Lai Jiangshan
In colo mode, secondary vm is running. We will use the io_fd to
ensure that both primary vm and secondary vm are resumed
at the same time. So we should call postcopy later.
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
tools/libxc/xc_domain_save.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c
index d96fd24..36ebb8a 100644
--- a/tools/libxc/xc_domain_save.c
+++ b/tools/libxc/xc_domain_save.c
@@ -2082,10 +2082,15 @@ int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t max_iter
out_rc:
completed = 1;
- if ( !rc && callbacks->postcopy )
+ /*
+ * COLO: secondary vm is running. We will use the io_fd to
+ * ensure that both primary vm and secondary vm are resumed
+ * at the same time. So we should call postcopy later.
+ */
+ if ( !rc && callbacks->postcopy && !callbacks->get_dirty_pfn )
callbacks->postcopy(callbacks->data);
- /* guest has been resumed. Now we can compress data
+ /* Remus: guest has been resumed. Now we can compress data
* at our own pace.
*/
if (!rc && compressing)
@@ -2113,6 +2118,13 @@ int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t max_iter
discard_file_cache(xch, io_fd, 1 /* flush */);
+ /*
+ * COLO: send qemu device state and resume both
+ * primary vm and secondary vm now.
+ */
+ if ( !rc && callbacks->postcopy && callbacks->get_dirty_pfn )
+ callbacks->postcopy(callbacks->data);
+
/* Enable compression now, finally */
compressing = (flags & XCFLAGS_CHECKPOINT_COMPRESS);
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 10/18] COLO: xc related codes
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (8 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 09/18] xc_domain_save: flush cache before calling callbacks->postcopy() in colo mode Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 11/18] send store mfn and console mfn to xl before resuming secondary vm Wen Congyang
` (9 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Yang Hongyang, Lai Jiangshan
Save:
1. send XC_SAVE_ID_LAST_CHECKPOINT, so secondary vm can be resumed
2. call callbacks->get_dirty_pfn() after suspend primary vm if we
are doing checkpoint.
Restore:
1. call the callbacks resume/checkpoint/suspend if secondary vm's
status is the same as primary vm's status.
2. zero out tdata because we will use it to zero out pagebuf.tdata.
3. don't apply the secondary vm's state when we failed to get new
secondary vm's state, because we have applied it every checkpoint.
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
tools/libxc/xc_domain_restore.c | 82 +++++++++++++++++++++++++++++++++++------
tools/libxc/xc_domain_save.c | 49 +++++++++++++++++++++++-
2 files changed, 117 insertions(+), 14 deletions(-)
diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c
index d8bd9b3..07ac49a 100644
--- a/tools/libxc/xc_domain_restore.c
+++ b/tools/libxc/xc_domain_restore.c
@@ -1454,7 +1454,7 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
int nraces = 0;
/* The new domain's shared-info frame number. */
- unsigned long shared_info_frame;
+ unsigned long shared_info_frame = 0;
unsigned char shared_info_page[PAGE_SIZE]; /* saved contents from file */
shared_info_any_t *old_shared_info =
(shared_info_any_t *)shared_info_page;
@@ -1504,6 +1504,8 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
DPRINTF("%s: starting restore of new domid %u", __func__, dom);
+ n = m = 0;
+
pagebuf_init(&pagebuf);
memset(&tailbuf, 0, sizeof(tailbuf));
tailbuf.ishvm = hvm;
@@ -1629,7 +1631,6 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
* We uncanonicalise page tables as we go.
*/
- n = m = 0;
loadpages:
for ( ; ; )
{
@@ -1793,26 +1794,45 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
goto finish;
}
+new_checkpoint:
// DPRINTF("Buffered checkpoint\n");
if ( pagebuf_get(xch, ctx, &pagebuf, io_fd, dom) ) {
PERROR("error when buffering batch, finishing");
- /*
- * Remus: discard the current incomplete checkpoint and restore
- * backup from the last complete checkpoint.
- */
- goto finish;
+ if ( callbacks && callbacks->checkpoint )
+ {
+ /* COLO: discard the current incomplete checkpoint */
+ rc = 0;
+ goto failover;
+ }
+ else
+ {
+ /*
+ * Remus: discard the current incomplete checkpoint and restore
+ * backup from the last complete checkpoint.
+ */
+ goto finish;
+ }
}
memset(&tmptail, 0, sizeof(tmptail));
tmptail.ishvm = hvm;
if ( buffer_tail(xch, ctx, &tmptail, io_fd, max_vcpu_id, vcpumap,
ext_vcpucontext, vcpuextstate_size) < 0 ) {
ERROR ("error buffering image tail, finishing");
- /*
- * Remus: discard the current incomplete checkpoint and restore
- * backup from the last complete checkpoint.
- */
- goto finish;
+ if ( callbacks && callbacks->checkpoint )
+ {
+ /* COLO: discard the current incomplete checkpoint */
+ rc = 0;
+ goto failover;
+ }
+ else
+ {
+ /*
+ * Remus: discard the current incomplete checkpoint and restore
+ * backup from the last complete checkpoint.
+ */
+ goto finish;
+ }
}
tailbuf_free(&tailbuf);
memcpy(&tailbuf, &tmptail, sizeof(tailbuf));
@@ -2301,6 +2321,7 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
free(tdata.data);
goto out;
}
+ memset(&tdata, 0, sizeof(tdata));
}
/* Dump the QEMU state to a state file for QEMU to load */
@@ -2368,6 +2389,43 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
rc = 0;
out:
+ if ( !rc && callbacks && callbacks->checkpoint )
+ {
+#define HANDLE_CALLBACK_RETURN_VALUE(frc) \
+ do { \
+ if ( frc == 0 ) \
+ { \
+ /* Some internal error happens */ \
+ rc = 1; \
+ goto out; \
+ } \
+ else if ( frc == 2 ) \
+ { \
+ /* Reading/writing error, do failover */ \
+ rc = 0; \
+ goto failover; \
+ } \
+ } while (0)
+ /* COLO */
+
+ /* TODO: call restore_results */
+
+ /* Resume secondary vm */
+ frc = callbacks->postcopy(callbacks->data);
+ HANDLE_CALLBACK_RETURN_VALUE(frc);
+
+ /* wait for new checkpoint */
+ frc = callbacks->checkpoint(callbacks->data);
+ HANDLE_CALLBACK_RETURN_VALUE(frc);
+
+ /* suspend secondary vm */
+ frc = callbacks->suspend(callbacks->data);
+ HANDLE_CALLBACK_RETURN_VALUE(frc);
+
+ goto new_checkpoint;
+ }
+
+failover:
if ( (rc != 0) && (dom != 0) )
xc_domain_destroy(xch, dom);
xc_hypercall_buffer_free(xch, ctxt);
diff --git a/tools/libxc/xc_domain_save.c b/tools/libxc/xc_domain_save.c
index 36ebb8a..db9c725 100644
--- a/tools/libxc/xc_domain_save.c
+++ b/tools/libxc/xc_domain_save.c
@@ -377,6 +377,31 @@ static int suspend_and_state(int (*suspend)(void*), void* data,
return 0;
}
+/*
+ * Merge the secondary vm's dirty pages into the to_send bitmap.
+ *
+ * get_dirty_pfn(data) returns a heap-allocated array of uint64_t: element 0
+ * is the number of pfns and elements 1..count are the pfns themselves.
+ * The array is freed here on every path.
+ *
+ * Returns 0 on success, or -1 with errno set to EINVAL when a pfn is out
+ * of range.
+ */
+static int update_dirty_bitmap(uint8_t *(*get_dirty_pfn)(void *), void *data,
+                               unsigned long p2m_size, unsigned long *to_send)
+{
+    uint64_t *pfn_list;
+    uint64_t count, i;
+    uint64_t pfn;
+    int rc = 0;
+
+    pfn_list = (uint64_t *)get_dirty_pfn(data);
+    assert(pfn_list);
+
+    count = pfn_list[0];
+    for (i = 0; i < count; i++) {
+        pfn = pfn_list[i + 1];
+        /* p2m_size is a count, so p2m_size itself is already out of range */
+        if (pfn >= p2m_size) {
+            rc = -1;
+            break;
+        }
+
+        set_bit(pfn, to_send);
+    }
+
+    free(pfn_list);
+    /* set errno last so that free() cannot disturb it */
+    if (rc)
+        errno = EINVAL;
+    return rc;
+}
+
/*
** Map the top-level page of MFNs from the guest. The guest might not have
** finished resuming from a previous restore operation, so we wait a while for
@@ -1769,11 +1794,14 @@ int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t max_iter
free(buf);
}
- if ( !callbacks->checkpoint )
+ if ( !callbacks->checkpoint || callbacks->get_dirty_pfn )
{
/*
* If this is not a checkpointed save then this must be the first and
* last checkpoint.
+ *
+ * If we are in colo mode, send last checkpoint to resume secondary
+ * vm.
*/
i = XC_SAVE_ID_LAST_CHECKPOINT;
if ( wrexact(io_fd, &i, sizeof(int)) )
@@ -2123,7 +2151,14 @@ int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t max_iter
* primary vm and secondary vm now.
*/
if ( !rc && callbacks->postcopy && callbacks->get_dirty_pfn )
- callbacks->postcopy(callbacks->data);
+ {
+ errno = 0;
+ if ( !callbacks->postcopy(callbacks->data) )
+ {
+ ERROR("postcopy fails");
+ rc = errno ? : -1;
+ }
+ }
/* Enable compression now, finally */
compressing = (flags & XCFLAGS_CHECKPOINT_COMPRESS);
@@ -2153,6 +2188,16 @@ int xc_domain_save(xc_interface *xch, int io_fd, uint32_t dom, uint32_t max_iter
PERROR("Error flushing shadow PT");
}
+ if ( callbacks->get_dirty_pfn )
+ {
+ if ( update_dirty_bitmap(callbacks->get_dirty_pfn, callbacks->data,
+ dinfo->p2m_size, to_send) )
+ {
+ ERROR("getting secondary vm's dirty pages failed");
+ goto out;
+ }
+ }
+
goto copypages;
}
else
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 11/18] send store mfn and console mfn to xl before resuming secondary vm
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (9 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 10/18] COLO: xc related codes Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 12/18] implement the cmdline for COLO Wen Congyang
` (8 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Yang Hongyang, Lai Jiangshan
We will call libxl__xc_domain_restore_done() to rebuild the secondary vm, but
rebuilding it needs the store mfn and the console mfn. So make
restore_results a function pointer in the callbacks struct and in struct
{save,restore}_callbacks, and use this callback to send the store mfn and
console mfn to xl.
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
tools/libxc/include/xenguest.h | 8 ++++++++
tools/libxc/xc_domain_restore.c | 2 +-
tools/libxl/libxl_colo_restore.c | 5 -----
tools/libxl/libxl_create.c | 1 +
tools/libxl/libxl_save_msgs_gen.pl | 2 +-
5 files changed, 11 insertions(+), 7 deletions(-)
diff --git a/tools/libxc/include/xenguest.h b/tools/libxc/include/xenguest.h
index 1aeaad2..be8afd4 100644
--- a/tools/libxc/include/xenguest.h
+++ b/tools/libxc/include/xenguest.h
@@ -123,6 +123,14 @@ struct restore_callbacks {
/* Enable qemu-dm logging dirty pages to xen */
int (*switch_qemu_logdirty)(int domid, unsigned enable, void *data); /* HVM only */
+ /*
+ * callback to send store mfn and console mfn to xl
+ * if we want to resume vm before xc_domain_save()
+ * exits.
+ */
+ void (*restore_results)(unsigned long store_mfn, unsigned long console_mfn,
+ void *data);
+
/* callback to restore toolstack specific data */
int (*toolstack_restore)(uint32_t domid, const uint8_t *buf,
uint32_t size, void* data);
diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c
index 07ac49a..c7ef39f 100644
--- a/tools/libxc/xc_domain_restore.c
+++ b/tools/libxc/xc_domain_restore.c
@@ -2408,7 +2408,7 @@ new_checkpoint:
} while (0)
/* COLO */
- /* TODO: call restore_results */
+ callbacks->restore_results(*store_mfn, *console_mfn, callbacks->data);
/* Resume secondary vm */
frc = callbacks->postcopy(callbacks->data);
diff --git a/tools/libxl/libxl_colo_restore.c b/tools/libxl/libxl_colo_restore.c
index a61caa0..805d51f 100644
--- a/tools/libxl/libxl_colo_restore.c
+++ b/tools/libxl/libxl_colo_restore.c
@@ -142,11 +142,6 @@ static void colo_resume_vm(libxl__egc *egc,
return;
}
- /*
- * TODO: get store mfn and console mfn
- * We should call the callback restore_results in
- * xc_domain_restore() before resuming the guest.
- */
libxl__xc_domain_restore_done(egc, dcs, 0, 0, 0);
return;
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 985c7cd..0c1c09c 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -1102,6 +1102,7 @@ static void domcreate_bootloader_done(libxl__egc *egc,
rc = ERROR_INVAL;
goto out;
}
+ callbacks->restore_results = libxl__srm_callout_callback_restore_results;
if (checkpointed_stream == LIBXL_CHECKPOINTED_STREAM_COLO) {
crs->ao = ao;
diff --git a/tools/libxl/libxl_save_msgs_gen.pl b/tools/libxl/libxl_save_msgs_gen.pl
index fbb2d67..2ecd25d 100755
--- a/tools/libxl/libxl_save_msgs_gen.pl
+++ b/tools/libxl/libxl_save_msgs_gen.pl
@@ -32,7 +32,7 @@ our @msgs = (
# toolstack_save done entirely `by hand'
[ 7, 'rcxW', "toolstack_restore", [qw(uint32_t domid
BLOCK tsdata)] ],
- [ 8, 'r', "restore_results", ['unsigned long', 'store_mfn',
+ [ 8, 'rcx', "restore_results", ['unsigned long', 'store_mfn',
'unsigned long', 'console_mfn'] ],
[ 9, 'srW', "complete", [qw(int retval
int errnoval)] ],
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 12/18] implement the cmdline for COLO
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (10 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 11/18] send store mfn and console mfn to xl before resuming secondary vm Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 13/18] tools: xc_doamin_restore: zero ioreq page only one time Wen Congyang
` (7 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Yang Hongyang, Lai Jiangshan
Add a new option -c to the command 'xl remus'. If you want
to use COLO HA instead of Remus HA, please use -c option.
Update man pages to reflect the addition of a new option to
'xl remus' command.
Also add a new option -c to the internal command 'xl migrate-receive'.
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
docs/man/xl.pod.1 | 12 +++++++++--
tools/libxl/libxl.c | 16 ++++++++++++++
tools/libxl/xl_cmdimpl.c | 53 +++++++++++++++++++++++++++++++++++++++--------
tools/libxl/xl_cmdtable.c | 4 +++-
4 files changed, 73 insertions(+), 12 deletions(-)
diff --git a/docs/man/xl.pod.1 b/docs/man/xl.pod.1
index 6b89ba8..4c79ea0 100644
--- a/docs/man/xl.pod.1
+++ b/docs/man/xl.pod.1
@@ -432,12 +432,15 @@ Print huge (!) amount of debug during the migration process.
=item B<remus> [I<OPTIONS>] I<domain-id> I<host>
-Enable Remus HA for domain. By default B<xl> relies on ssh as a transport
-mechanism between the two hosts.
+Enable Remus HA or COLO HA for domain. By default B<xl> relies on ssh as a
+transport mechanism between the two hosts.
N.B: Remus support in xl is still in experimental (proof-of-concept) phase.
Disk replication support is limited to DRBD disks.
+ COLO support in xl is still in experimental (proof-of-concept) phase.
+ There is no support for network or disk at the moment.
+
B<OPTIONS>
=over 4
@@ -483,6 +486,11 @@ Disable network output buffering. Requires enabling unsafe mode.
Disable disk replication. Requires enabling unsafe mode.
+=item B<-c>
+
+Enable COLO HA. It conflicts with B<-i> and B<-b>, and memory
+checkpoint compression must be disabled.
+
=back
=item B<pause> I<domain-id>
diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index b05a4bf..822f1f3 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -828,6 +828,22 @@ int libxl_domain_remus_start(libxl_ctx *ctx, libxl_domain_remus_info *info,
goto out;
}
+ /* The caller must set this defbool */
+ if (libxl_defbool_is_default(info->colo)) {
+ LOG(ERROR, "colo mode must be enabled/disabled");
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
+ if (libxl_defbool_val(info->colo)) {
+ libxl_defbool_setdefault(&info->compression, false);
+ if (libxl_defbool_val(info->compression)) {
+ LOG(ERROR, "cannot use memory checkpoint compression in COLO mode");
+ rc = ERROR_FAIL;
+ goto out;
+ }
+ }
+
libxl_defbool_setdefault(&info->allow_unsafe, false);
libxl_defbool_setdefault(&info->blackhole, false);
libxl_defbool_setdefault(&info->compression, true);
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index fea17ea..ec77217 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -4039,6 +4039,9 @@ static void migrate_receive(int debug, int daemonize, int monitor,
dom_info.send_fd = send_fd;
dom_info.migration_domname_r = &migration_domname;
dom_info.checkpointed_stream = remus;
+ if (remus == LIBXL_CHECKPOINTED_STREAM_COLO)
+ /* COLO uses stdout to send control message to master */
+ dom_info.quiet = 1;
rc = create_domain(&dom_info);
if (rc < 0) {
@@ -4053,7 +4056,8 @@ static void migrate_receive(int debug, int daemonize, int monitor,
/* If we are here, it means that the sender (primary) has crashed.
* TODO: Split-Brain Check.
*/
- fprintf(stderr, "migration target: Remus Failover for domain %u\n",
+ fprintf(stderr, "migration target: %s Failover for domain %u\n",
+ remus == LIBXL_CHECKPOINTED_STREAM_COLO ? "COLO" : "Remus",
domid);
/*
@@ -4070,15 +4074,21 @@ static void migrate_receive(int debug, int daemonize, int monitor,
rc = libxl_domain_rename(ctx, domid, migration_domname,
common_domname);
if (rc)
- fprintf(stderr, "migration target (Remus): "
+ fprintf(stderr, "migration target (%s): "
"Failed to rename domain from %s to %s:%d\n",
+ remus == LIBXL_CHECKPOINTED_STREAM_COLO ? "COLO" : "Remus",
migration_domname, common_domname, rc);
}
+ if (remus == LIBXL_CHECKPOINTED_STREAM_COLO)
+ /* The guest is running after failover in COLO mode */
+ exit(rc ? -ERROR_FAIL: 0);
+
rc = libxl_domain_unpause(ctx, domid);
if (rc)
- fprintf(stderr, "migration target (Remus): "
+ fprintf(stderr, "migration target (%s): "
"Failed to unpause domain %s (id: %u):%d\n",
+ remus == LIBXL_CHECKPOINTED_STREAM_COLO ? "COLO" : "Remus",
common_domname, domid, rc);
exit(rc ? -ERROR_FAIL: 0);
@@ -4224,7 +4234,7 @@ int main_migrate_receive(int argc, char **argv)
int debug = 0, daemonize = 1, monitor = 1, remus = 0;
int opt;
- SWITCH_FOREACH_OPT(opt, "Fedr", NULL, "migrate-receive", 0) {
+ SWITCH_FOREACH_OPT(opt, "Fedrc", NULL, "migrate-receive", 0) {
case 'F':
daemonize = 0;
break;
@@ -4236,8 +4246,10 @@ int main_migrate_receive(int argc, char **argv)
debug = 1;
break;
case 'r':
- remus = 1;
+ remus = LIBXL_CHECKPOINTED_STREAM_REMUS;
break;
+ case 'c':
+ remus = LIBXL_CHECKPOINTED_STREAM_COLO;
}
if (argc-optind != 0) {
@@ -7711,15 +7723,18 @@ int main_remus(int argc, char **argv)
pid_t child = -1;
uint8_t *config_data;
int config_len;
+ int interval = 0;
memset(&r_info, 0, sizeof(libxl_domain_remus_info));
/* Defaults */
r_info.interval = 200;
libxl_defbool_setdefault(&r_info.blackhole, false);
+ libxl_defbool_setdefault(&r_info.colo, false);
- SWITCH_FOREACH_OPT(opt, "Fbundi:s:N:e", NULL, "remus", 2) {
+ SWITCH_FOREACH_OPT(opt, "Fbundi:s:N:ec", NULL, "remus", 2) {
case 'i':
r_info.interval = atoi(optarg);
+ interval = 1;
break;
case 'F':
libxl_defbool_set(&r_info.allow_unsafe, true);
@@ -7745,11 +7760,28 @@ int main_remus(int argc, char **argv)
case 'e':
daemonize = 0;
break;
+ case 'c':
+ libxl_defbool_set(&r_info.colo, true);
}
domid = find_domain(argv[optind]);
host = argv[optind + 1];
+ if (libxl_defbool_val(r_info.colo)) {
+ if (!interval)
+ r_info.interval = 0;
+
+ if (r_info.interval || libxl_defbool_val(r_info.blackhole)) {
+ perror("option -c is conflict with -i or -b");
+ exit(-1);
+ }
+
+ if (libxl_defbool_is_default(r_info.compression)) {
+ perror("option -u must be specified when using COLO");
+ exit(-1);
+ }
+ }
+
if (!r_info.netbufscript)
r_info.netbufscript = default_remus_netbufscript;
@@ -7764,8 +7796,9 @@ int main_remus(int argc, char **argv)
if (!ssh_command[0]) {
rune = host;
} else {
- if (asprintf(&rune, "exec %s %s xl migrate-receive -r %s",
+ if (asprintf(&rune, "exec %s %s xl migrate-receive %s %s",
ssh_command, host,
+ libxl_defbool_val(r_info.colo) ? "-c" : "-r",
daemonize ? "" : " -e") < 0)
return 1;
}
@@ -7794,7 +7827,8 @@ int main_remus(int argc, char **argv)
* domain to force failover
*/
if (libxl_domain_info(ctx, 0, domid)) {
- fprintf(stderr, "Remus: Primary domain has been destroyed.\n");
+ fprintf(stderr, "%s: Primary domain has been destroyed.\n",
+ libxl_defbool_val(r_info.colo) ? "COLO" : "Remus");
close(send_fd);
return 0;
}
@@ -7806,7 +7840,8 @@ int main_remus(int argc, char **argv)
if (rc == ERROR_GUEST_TIMEDOUT)
fprintf(stderr, "Failed to suspend domain at primary.\n");
else {
- fprintf(stderr, "Remus: Backup failed? resuming domain at primary.\n");
+ fprintf(stderr, "%s: Backup failed? resuming domain at primary.\n",
+ libxl_defbool_val(r_info.colo) ? "COLO" : "Remus");
libxl_domain_resume(ctx, domid, 1, 0);
}
diff --git a/tools/libxl/xl_cmdtable.c b/tools/libxl/xl_cmdtable.c
index 4b30d3d..6b1283c 100644
--- a/tools/libxl/xl_cmdtable.c
+++ b/tools/libxl/xl_cmdtable.c
@@ -513,7 +513,9 @@ struct cmd_spec cmd_table[] = {
"-b Replicate memory checkpoints to /dev/null (blackhole).\n"
" Works only in unsafe mode.\n"
"-n Disable network output buffering. Works only in unsafe mode.\n"
- "-d Disable disk replication. Works only in unsafe mode."
+ "-d Disable disk replication. Works only in unsafe mode.\n"
+ "-c Enable COLO HA. It is conflict with -i and -b, and memory\n"
+ " checkpoint must be disabled"
},
#endif
{ "devd",
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 13/18] tools: xc_doamin_restore: zero ioreq page only one time
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (11 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 12/18] implement the cmdline for COLO Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 14/18] block-colo: implement colo disk replication Wen Congyang
` (6 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Yang Hongyang, Lai Jiangshan
The ioreq page contains an evtchn which is set when the secondary VM
is resumed for the first time. The hypervisor checks whether the
evtchn is corrupted, so we cannot zero the ioreq page more than once.
The ioreq->state is always STATE_IOREQ_NONE after the VM is
suspended, so it is OK to zero the page only once.
---
tools/libxc/xc_domain_restore.c | 24 +++++++++++++++++++++---
1 file changed, 21 insertions(+), 3 deletions(-)
diff --git a/tools/libxc/xc_domain_restore.c b/tools/libxc/xc_domain_restore.c
index c7ef39f..497dadb 100644
--- a/tools/libxc/xc_domain_restore.c
+++ b/tools/libxc/xc_domain_restore.c
@@ -1501,6 +1501,7 @@ int xc_domain_restore(xc_interface *xch, int io_fd, uint32_t dom,
struct restore_ctx _ctx;
struct restore_ctx *ctx = &_ctx;
struct domain_info_context *dinfo = &ctx->dinfo;
+ int skip_clear_ioreq_page = 0;
DPRINTF("%s: starting restore of new domid %u", __func__, dom);
@@ -2331,13 +2332,30 @@ new_checkpoint:
}
/* These comms pages need to be zeroed at the start of day */
- if ( xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[0]) ||
- xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[1]) ||
- xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[2]) )
+ if ( xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[2]) )
{
PERROR("error zeroing magic pages");
goto out;
}
+ if ( !skip_clear_ioreq_page )
+ {
+ if ( xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[0]) ||
+ xc_clear_domain_page(xch, dom, tailbuf.u.hvm.magicpfns[1]) )
+ {
+ PERROR("error zeroing magic pages");
+ goto out;
+ }
+ /*
+ * ioreq page contains evtchn which will be set when we resume the
+ * secondary vm the first time. The hypervisor will check if the
+ * evtchn is corrupted, so we cannot clear the ioreq page more
+ * than one time.
+ *
+ * The ioreq->state is always STATE_IOREQ_NONE after the vm is
+ * suspended, so it is OK if we only clear it one time.
+ */
+ skip_clear_ioreq_page = 1;
+ }
if ( (frc = xc_hvm_param_set(xch, dom,
HVM_PARAM_IOREQ_PFN, tailbuf.u.hvm.magicpfns[0]))
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 14/18] block-colo: implement colo disk replication
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (12 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 13/18] tools: xc_doamin_restore: zero ioreq page only one time Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 15/18] libxl/colo: setup and control disk replication for blktap2 backends Wen Congyang
` (5 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Shriram Rajagopalan, Yang Hongyang, Lai Jiangshan
TODO:
update block-remus to use async I/O instead of mread/mwrite.
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Cc: Shriram Rajagopalan <rshriram@cs.ubc.ca>
---
tools/blktap2/drivers/Makefile | 3 +
tools/blktap2/drivers/block-colo.c | 1132 +++++++++++++++++++++++++++++
tools/blktap2/drivers/block-remus.c | 4 +-
tools/blktap2/drivers/block-replication.c | 262 ++++++-
tools/blktap2/drivers/block-replication.h | 77 +-
tools/blktap2/drivers/tapdisk-disktype.c | 9 +
tools/blktap2/drivers/tapdisk-disktype.h | 3 +-
7 files changed, 1476 insertions(+), 14 deletions(-)
create mode 100644 tools/blktap2/drivers/block-colo.c
diff --git a/tools/blktap2/drivers/Makefile b/tools/blktap2/drivers/Makefile
index a7f45c7..7b16e05 100644
--- a/tools/blktap2/drivers/Makefile
+++ b/tools/blktap2/drivers/Makefile
@@ -31,6 +31,8 @@ REMUS-OBJS += hashtable_itr.o
REMUS-OBJS += hashtable_utility.o
REMUS-OBJS += block-replication.o
+COLO-OBJS += block-colo.o
+
tapdisk2 tapdisk-stream tapdisk-diff $(QCOW_UTIL): AIOLIBS := -laio
MEMSHRLIBS :=
@@ -77,6 +79,7 @@ BLK-OBJS-y += aes.o
BLK-OBJS-y += md5.o
BLK-OBJS-y += $(PORTABLE-OBJS-y)
BLK-OBJS-y += $(REMUS-OBJS)
+BLK-OBJS-y += $(COLO-OBJS)
all: $(IBIN) lock-util qcow-util
diff --git a/tools/blktap2/drivers/block-colo.c b/tools/blktap2/drivers/block-colo.c
new file mode 100644
index 0000000..c8877cf
--- /dev/null
+++ b/tools/blktap2/drivers/block-colo.c
@@ -0,0 +1,1132 @@
+/*
+ * Copyright (C) 2014 FUJITSU LIMITED
+ * Author: Wen Congyang <wency@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "tapdisk.h"
+#include "tapdisk-server.h"
+#include "tapdisk-driver.h"
+#include "tapdisk-interface.h"
+#include "block-replication.h"
+
+#include <errno.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/un.h>
+#include <unistd.h>
+
+/* connect retry timeout (seconds) */
+#define COLO_CONNRETRY_TIMEOUT 1
+
+/* timeout for reads and writes, in seconds */
+#define HEARTBEAT_S 1
+
+/*
+ * TAPDISK_DATA_REQUESTS I/O requests + commit flag.
+ * The expansion is parenthesized so the macro can be used safely inside
+ * larger expressions (e.g. multiplication or comparison).
+ */
+#define MAX_COLO_REQUEST (TAPDISK_DATA_REQUESTS + 1)
+
+#undef DPRINTF
+#undef EPRINTF
+#define DPRINTF(_f, _a...) syslog (LOG_DEBUG, "COLO: " _f, ## _a)
+#define EPRINTF(_f, _a...) syslog (LOG_ERR, "COLO: " _f, ## _a)
+
+#define TDCOLO_WRITE "wreq"
+#define TDCOLO_COMMIT "creq"
+#define TDCOLO_DONE "done"
+#define TDCOLO_FAIL "fail"
+
+/*
+ * Operating mode of a COLO tapdisk instance. switch_mode() installs the
+ * read/write queue handlers that correspond to the selected mode.
+ */
+enum tdcolo_mode {
+ mode_invalid = 0, /* value left by the memset() in tdcolo_open() */
+ mode_unprotected, /* I/O is passed straight through */
+ mode_primary, /* requests are queued and writes forwarded to backup */
+ mode_backup, /* writes are cached in the ramdisk */
+
+ /*
+ * If we find some internal error in backup mode, we cannot
+ * switch to unprotected mode.
+ */
+ mode_failed, /* every request is completed with -EIO */
+};
+
+/* Kind of entry stored in the queued I/O ring (queued_io_t.type). */
+enum {
+ colo_io, /* a guest I/O request; queued_io_t.treq is valid */
+ colo_commit, /* a checkpoint commit marker; queued_io_t.buff is valid */
+};
+
+/* One slot of the primary's queued I/O ring. */
+typedef struct queued_io {
+ int type; /* colo_io or colo_commit; selects the union member */
+ union {
+ td_request_t treq; /* the queued guest request (colo_io) */
+ char *buff; /* TDCOLO_COMMIT */
+ };
+} queued_io_t;
+
+/*
+ * Fixed-size circular queue of pending entries. Entries are added at
+ * prod and consumed at cons; both indices advance through ring_next().
+ */
+struct queued_io_ring {
+ /* waste one slot to distinguish between empty and full */
+ queued_io_t qio[MAX_COLO_REQUEST + 1];
+ unsigned int prod; /* index of the next free slot */
+ unsigned int cons; /* index of the oldest queued entry */
+};
+
+/* State of the local control channel (a UNIX stream socket). */
+typedef struct colo_control {
+ /*
+ * socket file, the user writes "flush" to this socket, and then
+ * we write the result to it.
+ */
+ char *path; /* heap-allocated socket path, freed in colo_control_close() */
+ int listen_fd; /* listening socket, -1 when closed */
+ event_id_t listen_id; /* event watching listen_fd, -1 when unregistered */
+
+ int io_fd; /* the single accepted control connection, -1 if none */
+ event_id_t io_id; /* event watching io_fd, -1 when unregistered */
+} colo_control_t;
+
+/* Per-disk private state of the COLO tapdisk driver. */
+struct tdcolo_state {
+ colo_control_t ctl;
+
+ /* async connection */
+ td_replication_connect_t t;
+ /* replication channel */
+ td_async_io_t rio, wio;
+
+ /*
+ * queue I/O requests, and they will be forwarded to backup
+ * asynchronously.
+ */
+ struct queued_io_ring qio_ring;
+
+ /* ramdisk data */
+ struct ramdisk ramdisk;
+
+ /* mode methods */
+ enum tdcolo_mode mode;
+ /* It will be called when switching mode */
+ int (*queue_flush)(struct tdcolo_state *c);
+
+ /* 4-byte protocol message plus the NUL the read callbacks store at [4] */
+ char request[5];
+ /* write header: uint32_t sector count followed by uint64_t start sector */
+ char header[sizeof(uint32_t) + sizeof(uint64_t)];
+ /*
+ * Outstanding commit steps on the primary: client_flush() sets it to 2,
+ * primary_forward_done() decrements it once the commit marker is sent and
+ * primary_read_done() decrements it when the backup's reply arrives.
+ */
+ int commit;
+ /* backup only: receive buffer of bsize bytes allocated in backup_start() */
+ void *buff;
+ int bsize;
+ /* copied from driver->info.sector_size in tdcolo_open() */
+ int sector_size;
+};
+
+struct tap_disk tapdisk_colo;
+
+static void colo_control_respond(colo_control_t *ctl, const char *response);
+static int switch_mode(struct tdcolo_state *c, enum tdcolo_mode mode);
+
+/* ======== common functions ======== */
+/*
+ * Classify the outcome of an asynchronous read.
+ *
+ * Returns 0 when at least rio->size bytes were read, ERROR_INTERNAL when
+ * realsize is negative, and ERROR_IO when fewer bytes than requested
+ * arrived (timeout or I/O error). Failures are logged, naming the peer
+ * given in target.
+ */
+static int check_read_result(td_async_io_t *rio, int realsize,
+                             const char *target)
+{
+    if (realsize >= 0 && realsize >= rio->size)
+        return 0;
+
+    EPRINTF("error reading from %s\n", target);
+
+    /* negative size: internal error; short read: timeout or I/O error */
+    return realsize < 0 ? ERROR_INTERNAL : ERROR_IO;
+}
+
+/*
+ * Classify the outcome of an asynchronous write.
+ *
+ * Returns 0 when realsize is positive, ERROR_INTERNAL when it is
+ * negative, and ERROR_IO when it is zero (timeout or I/O error).
+ * Failures are logged, naming the peer given in target.
+ */
+static int check_write_result(td_async_io_t *wio, int realsize,
+                              const char * target)
+{
+    if (realsize > 0)
+        return 0;
+
+    EPRINTF("error writing to %s\n", target);
+
+    /* negative size: internal error; zero: timeout or I/O error */
+    return realsize < 0 ? ERROR_INTERNAL : ERROR_IO;
+}
+
+/* ======= ring functions ======== */
+/*
+ * Return the ring index that follows pos, wrapping to 0 after the last
+ * slot (index MAX_COLO_REQUEST).
+ */
+static inline unsigned int ring_next(unsigned int pos)
+{
+    if (pos >= MAX_COLO_REQUEST)
+        return 0;
+
+    return pos + 1;
+}
+
+/* Non-zero when no entry is queued, i.e. producer and consumer coincide. */
+static inline int ring_isempty(struct queued_io_ring* ring)
+{
+    return ring->prod == ring->cons;
+}
+
+/*
+ * Non-zero when adding one more entry would make the producer index
+ * catch up with the consumer index.
+ */
+static inline int ring_isfull(struct queued_io_ring* ring)
+{
+    unsigned int after_prod = ring_next(ring->prod);
+
+    return after_prod == ring->cons;
+}
+
+/*
+ * Append a copy of *treq to the ring as a colo_io entry.
+ * Terminates the process if the ring is already full.
+ */
+static void ring_add_request(struct queued_io_ring *ring,
+                             const td_request_t *treq)
+{
+    queued_io_t *slot;
+
+    /* If ring is full, it means that tapdisk2 has some bug */
+    if (ring_isfull(ring)) {
+        EPRINTF("OOPS, ring is full\n");
+        exit(1);
+    }
+
+    slot = &ring->qio[ring->prod];
+    slot->type = colo_io;
+    slot->treq = *treq;
+    ring->prod = ring_next(ring->prod);
+}
+
+/*
+ * Append a commit marker (colo_commit entry pointing at TDCOLO_COMMIT)
+ * to the ring. Terminates the process if the ring is already full.
+ */
+static void ring_add_commit_flag(struct queued_io_ring *ring)
+{
+    queued_io_t *slot;
+
+    /* If ring is full, it means that tapdisk2 has some bug */
+    if (ring_isfull(ring)) {
+        EPRINTF("OOPS, ring is full\n");
+        exit(1);
+    }
+
+    slot = &ring->qio[ring->prod];
+    slot->type = colo_commit;
+    slot->buff = TDCOLO_COMMIT;
+    ring->prod = ring_next(ring->prod);
+}
+
+/*
+ * Return the oldest queued entry without consuming it, or NULL when the
+ * ring is empty.
+ */
+static queued_io_t *ring_peek(struct queued_io_ring *ring)
+{
+    return ring_isempty(ring) ? NULL : &ring->qio[ring->cons];
+}
+
+/*
+ * Consume the oldest queued entry and return it, or return NULL when
+ * the ring is empty. The returned pointer refers to the ring slot itself.
+ */
+static queued_io_t *ring_get(struct queued_io_ring *ring)
+{
+    unsigned int head;
+
+    if (ring_isempty(ring))
+        return NULL;
+
+    head = ring->cons;
+    ring->cons = ring_next(head);
+    return &ring->qio[head];
+}
+
+/* ======== primary read/write functions ======== */
+static void primary_write_header(td_async_io_t *wio, int realsize, int errnoval);
+static void primary_write_data(td_async_io_t *wio, int realsize, int errnoval);
+static void primary_forward_done(td_async_io_t *wio, int realsize, int errnoval);
+static void primary_read_done(td_async_io_t *rio, int realsize, int errnoval);
+
+/*
+ * It is called when we cannot connect to backup, or find I/O error when
+ * reading/writing.
+ *
+ * Tears down the replication connection and both async I/O channels, logs
+ * the reason for ERROR_INTERNAL and ERROR_CLOSE, and switches the driver
+ * to unprotected mode. The result of switch_mode() is not checked.
+ */
+static void primary_failed(struct tdcolo_state *c, int rc)
+{
+ td_replication_connect_kill(&c->t);
+ td_async_io_kill(&c->rio);
+ td_async_io_kill(&c->wio);
+ if (rc == ERROR_INTERNAL)
+ EPRINTF("switch to unprotected mode due to internal error");
+ if (rc == ERROR_CLOSE)
+ DPRINTF("switch to unprotected mode before closing");
+ switch_mode(c, mode_unprotected);
+}
+
+/*
+ * Start an asynchronous write of size bytes from buff to the backup over
+ * the replication fd, using a HEARTBEAT_S timeout. callback is invoked
+ * when the write finishes. If the write cannot be started the primary
+ * falls back via primary_failed(ERROR_INTERNAL).
+ */
+static void primary_waio(struct tdcolo_state *c, void *buff, size_t size,
+ taio_callback *callback)
+{
+ td_async_io_t *wio = &c->wio;
+
+ wio->fd = c->t.fd;
+ wio->timeout_s = HEARTBEAT_S;
+ wio->mode = td_async_write;
+ wio->buff = buff;
+ wio->size = size;
+ wio->callback = callback;
+
+ if (td_async_io_start(wio))
+ primary_failed(c, ERROR_INTERNAL);
+}
+
+/*
+ * Start an asynchronous read of one 4-byte message from the backup into
+ * c->request; primary_read_done() handles the result. Does nothing when
+ * the replication fd is not open.
+ */
+static void primary_raio(struct tdcolo_state *c)
+{
+ td_async_io_t *rio = &c->rio;
+
+ if (c->t.fd < 0)
+ return;
+
+ rio->fd = c->t.fd;
+ /* NOTE(review): 0 presumably means "no timeout" - confirm in td_async_io */
+ rio->timeout_s = 0;
+ rio->mode = td_async_read;
+ rio->buff = c->request;
+ /* the last byte of c->request is reserved for the terminating NUL */
+ rio->size = sizeof(c->request) - 1;
+ rio->callback = primary_read_done;
+
+ if (td_async_io_start(rio))
+ primary_failed(c, ERROR_INTERNAL);
+}
+
+/*
+ * Process the queued entries in order.
+ *
+ * A commit marker or a write request starts an asynchronous transfer to
+ * the backup and returns; the entry stays at the head of the ring until
+ * primary_forward_done() consumes it and calls back into this function.
+ * Any other request (i.e. a read) is forwarded locally and consumed
+ * immediately.
+ */
+static void primary_handle_queued_io(struct tdcolo_state *c)
+{
+    struct queued_io_ring *qring = &c->qio_ring;
+    queued_io_t *qio;
+
+    while (!ring_isempty(qring)) {
+        qio = ring_peek(qring);
+        if (qio->type == colo_commit) {
+            /* send the commit message itself */
+            primary_waio(c, qio->buff, strlen(qio->buff),
+                         primary_forward_done);
+            return;
+        }
+
+        if (qio->treq.op == TD_OP_WRITE) {
+            /* announce the write; header and data follow in callbacks */
+            primary_waio(c, TDCOLO_WRITE, strlen(TDCOLO_WRITE),
+                         primary_write_header);
+            return;
+        }
+
+        td_forward_request(qio->treq);
+        ring_get(qring);
+    }
+}
+
+/*
+ * wait for "done" message to commit checkpoint
+ *
+ * Read callback for the 4-byte reply from the backup. A reply is only
+ * valid while c->commit is 1; the reply must be TDCOLO_DONE. On success
+ * TDCOLO_DONE is reported on the control socket and the next read is
+ * armed. On any failure TDCOLO_FAIL is reported and primary_failed() is
+ * called.
+ */
+static void primary_read_done(td_async_io_t *rio, int realsize, int errnoval)
+{
+ struct tdcolo_state *c = CONTAINER_OF(rio, *c, rio);
+ char *req = c->request;
+ int rc;
+
+ rc = check_read_result(rio, realsize, "backup");
+ if (rc)
+ goto err;
+
+ /* every failure below is reported as an internal error */
+ rc = ERROR_INTERNAL;
+ req[4] = '\0';
+
+ if (c->commit != 1) {
+ EPRINTF("received unexpected message: %s\n", req);
+ goto err;
+ }
+
+ /* the reply has arrived: no commit step is outstanding any more */
+ c->commit--;
+
+ if (strcmp(req, TDCOLO_DONE)) {
+ EPRINTF("received unknown message: %s\n", req);
+ goto err;
+ }
+
+ /* checkpoint committed, inform the control socket client */
+ colo_control_respond(&c->ctl, TDCOLO_DONE);
+ primary_raio(c);
+
+ return;
+err:
+ colo_control_respond(&c->ctl, TDCOLO_FAIL);
+ primary_failed(c, rc);
+}
+
+/*
+ * Write callback invoked after the TDCOLO_WRITE message has been sent.
+ *
+ * Builds the write header for the request at the head of the ring (a
+ * uint32_t sector count followed by a uint64_t start sector, both in host
+ * byte order) and sends it; primary_write_data() continues with the
+ * payload.
+ *
+ * The header bytes are filled with memcpy() rather than through casted
+ * pointers: c->header is a char array with no alignment guarantee for
+ * uint32_t/uint64_t accesses.
+ */
+static void primary_write_header(td_async_io_t *wio, int realsize, int errnoval)
+{
+    struct tdcolo_state *c = CONTAINER_OF(wio, *c, wio);
+    queued_io_t *qio = ring_peek(&c->qio_ring);
+    uint32_t sectors;
+    uint64_t sector;
+    int rc;
+
+    rc = check_write_result(wio, realsize, "backup");
+    if (rc) {
+        primary_failed(c, rc);
+        return;
+    }
+
+    sectors = qio->treq.secs;
+    sector = qio->treq.sec;
+    memcpy(c->header, &sectors, sizeof(sectors));
+    memcpy(c->header + sizeof(sectors), &sector, sizeof(sector));
+
+    primary_waio(c, c->header, sizeof(c->header), primary_write_data);
+}
+
+/*
+ * Write callback invoked after the write header has been sent. Sends the
+ * payload of the request at the head of the ring (secs * sector_size
+ * bytes from treq.buf); primary_forward_done() runs when it completes.
+ */
+static void primary_write_data(td_async_io_t *wio, int realsize, int errnoval)
+{
+ struct tdcolo_state *c = CONTAINER_OF(wio, *c, wio);
+ queued_io_t *qio = ring_peek(&c->qio_ring);
+ int rc;
+
+ rc = check_write_result(wio, realsize, "backup");
+ if (rc) {
+ primary_failed(c, rc);
+ return;
+ }
+
+ primary_waio(c, qio->treq.buf, qio->treq.secs * c->sector_size,
+ primary_forward_done);
+}
+
+/*
+ * Write callback invoked when the head entry of the ring has been fully
+ * sent to the backup (write payload or commit message).
+ *
+ * Consumes the entry: a guest request is now forwarded to the local
+ * disk, a commit marker decrements the outstanding commit count. Then
+ * continues with the remaining queued entries.
+ */
+static void primary_forward_done(td_async_io_t *wio, int realsize, int errnoval)
+{
+    struct tdcolo_state *c = CONTAINER_OF(wio, *c, wio);
+    queued_io_t *qio;
+    int rc;
+
+    rc = check_write_result(wio, realsize, "backup");
+    if (rc) {
+        primary_failed(c, rc);
+        return;
+    }
+
+    qio = ring_get(&c->qio_ring);
+    if (qio->type == colo_io)
+        td_forward_request(qio->treq);
+    else
+        c->commit--;
+
+    primary_handle_queued_io(c);
+}
+
+/*
+ * Read handler in primary mode. With nothing queued the read is
+ * forwarded at once; otherwise it is appended behind the queued entries
+ * and the queue is kicked if the connection status is 1 and no
+ * forwarding write is currently running.
+ */
+static void primary_queue_read(td_driver_t *driver, td_request_t treq)
+{
+    struct tdcolo_state *state = driver->data;
+    struct queued_io_ring *pending = &state->qio_ring;
+
+    if (!ring_isempty(pending)) {
+        ring_add_request(pending, &treq);
+
+        if (td_replication_connect_status(&state->t) == 1 &&
+            !td_async_io_is_running(&state->wio))
+            primary_handle_queued_io(state);
+        return;
+    }
+
+    /* just pass read through */
+    td_forward_request(treq);
+}
+
+/*
+ * Write handler in primary mode. The request is always queued; the queue
+ * is kicked if the connection status is 1 and no forwarding write is
+ * currently running.
+ */
+static void primary_queue_write(td_driver_t *driver, td_request_t treq)
+{
+    struct tdcolo_state *state = driver->data;
+
+    ring_add_request(&state->qio_ring, &treq);
+
+    if (td_replication_connect_status(&state->t) == 1 &&
+        !td_async_io_is_running(&state->wio))
+        primary_handle_queued_io(state);
+}
+
+/*
+ * Called when the user writes "flush" to the control socket.
+ *
+ * Queues a commit marker behind the pending requests and sets c->commit
+ * to 2 (one step for sending the marker, one for the backup's reply).
+ * Returns 0 without doing anything when the connection status is not 1.
+ * Returns -1 after reporting TDCOLO_FAIL and failing the primary when a
+ * previous commit is still outstanding.
+ */
+static int client_flush(struct tdcolo_state *c)
+{
+ if (td_replication_connect_status(&c->t) != 1)
+ return 0;
+
+ if (c->commit > 0) {
+ EPRINTF("the last commit is not finished\n");
+ colo_control_respond(&c->ctl, TDCOLO_FAIL);
+ primary_failed(c, ERROR_INTERNAL);
+ return -1;
+ }
+
+ ring_add_commit_flag(&c->qio_ring);
+ c->commit = 2;
+ /* kick the queue only if no forwarding write is already in flight */
+ if (!td_async_io_is_running(&c->wio))
+ primary_handle_queued_io(c);
+
+ return 0;
+}
+
+/*
+ * Called when switching the mode from primary to unprotected.
+ *
+ * Empties the ring: queued guest requests are forwarded to the local
+ * disk, and a queued commit marker is answered with TDCOLO_FAIL on the
+ * control socket while the commit counter is reset. Always returns 0.
+ */
+static int primary_flush(struct tdcolo_state *c)
+{
+    struct queued_io_ring *pending = &c->qio_ring;
+    queued_io_t *entry;
+
+    while ((entry = ring_get(pending)) != NULL) {
+        if (entry->type != colo_commit) {
+            td_forward_request(entry->treq);
+            continue;
+        }
+
+        colo_control_respond(&c->ctl, TDCOLO_FAIL);
+        c->commit = 0;
+    }
+
+    return 0;
+}
+
+/*
+ * Connection callback in primary mode. A non-zero rc fails the primary;
+ * otherwise the requests queued while connecting are processed and the
+ * read for the backup's replies is armed.
+ */
+static void colo_client_established(td_replication_connect_t *t, int rc)
+{
+    struct tdcolo_state *state = CONTAINER_OF(t, *state, t);
+
+    if (rc != 0) {
+        primary_failed(state, rc);
+    } else {
+        /* the connect succeeded and handle the queued requests */
+        primary_handle_queued_io(state);
+        primary_raio(state);
+    }
+}
+
+/*
+ * Enter primary mode: install the primary queue handlers and flush hook,
+ * then start the client side of the replication connection. Returns the
+ * result of td_replication_client_start().
+ */
+static int primary_start(struct tdcolo_state *c)
+{
+ DPRINTF("activating client mode\n");
+
+ tapdisk_colo.td_queue_read = primary_queue_read;
+ tapdisk_colo.td_queue_write = primary_queue_write;
+ c->queue_flush = primary_flush;
+
+ /* replaces the server callback installed by tdcolo_open() */
+ c->t.callback = colo_client_established;
+ return td_replication_client_start(&c->t);
+}
+
+/* ======== backup read/write functions ======== */
+static void backup_read_header_done(td_async_io_t *rio, int realsize,
+ int errnoval);
+static void backup_read_data_done(td_async_io_t *rio, int realsize,
+ int errnoval);
+static void backup_write_done(td_async_io_t *wio, int realsize, int errnoval);
+
+/*
+ * Tear down the replication connection and both async I/O channels on
+ * the backup, then leave backup mode. ERROR_INTERNAL switches to failed
+ * mode; every other rc switches to unprotected mode. The result of
+ * switch_mode() is not checked.
+ */
+static void backup_failed(struct tdcolo_state *c, int rc)
+{
+ td_replication_connect_kill(&c->t);
+ td_async_io_kill(&c->rio);
+ td_async_io_kill(&c->wio);
+
+ if (rc == ERROR_INTERNAL) {
+ EPRINTF("switch to failed mode due to internal error");
+ switch_mode(c, mode_failed);
+ return;
+ }
+
+ if (rc == ERROR_CLOSE)
+ DPRINTF("switch to unprotected mode before closing");
+
+ switch_mode(c, mode_unprotected);
+}
+
+/*
+ * Start an asynchronous read of size bytes from the primary into buff,
+ * with the given timeout in seconds; callback is invoked when the read
+ * finishes. If the read cannot be started the backup fails with
+ * ERROR_INTERNAL.
+ */
+static void backup_raio(struct tdcolo_state *c, void *buff, int size,
+ int timeout_s, taio_callback *callback)
+{
+ td_async_io_t *rio = &c->rio;
+
+ rio->fd = c->t.fd;
+ rio->timeout_s = timeout_s;
+ rio->mode = td_async_read;
+ rio->buff = buff;
+ rio->size = size;
+ rio->callback = callback;
+
+ if (td_async_io_start(rio)) {
+ EPRINTF("cannot start read aio\n");
+ backup_failed(c, ERROR_INTERNAL);
+ }
+}
+
+/*
+ * Start an asynchronous write of the TDCOLO_DONE message to the primary,
+ * using a HEARTBEAT_S timeout; backup_write_done() handles the result.
+ * If the write cannot be started the backup fails with ERROR_INTERNAL.
+ */
+static void backup_waio(struct tdcolo_state *c)
+{
+ td_async_io_t *wio = &c->wio;
+
+ wio->fd = c->t.fd;
+ wio->timeout_s = HEARTBEAT_S;
+ wio->mode = td_async_write;
+ wio->buff = TDCOLO_DONE;
+ wio->size = strlen(TDCOLO_DONE);
+ wio->callback = backup_write_done;
+
+ if (td_async_io_start(wio)) {
+ EPRINTF("cannot start write aio\n");
+ backup_failed(c, ERROR_INTERNAL);
+ }
+}
+
+/*
+ * Read callback for a 4-byte request message from the primary.
+ *
+ * TDCOLO_WRITE continues with reading the write header. TDCOLO_COMMIT
+ * clears the secondary cache, starts flushing the primary cache and
+ * replies with TDCOLO_DONE. Anything else is an internal error.
+ *
+ * The error code is propagated to backup_failed(): a timeout or I/O
+ * error on the read (ERROR_IO) therefore switches to unprotected mode,
+ * like the other backup callbacks, while protocol and ramdisk errors
+ * are reported as ERROR_INTERNAL and switch to failed mode.
+ */
+static void backup_read_req_done(td_async_io_t *rio, int realsize,
+                                 int errnoval)
+{
+    struct tdcolo_state *c = CONTAINER_OF(rio, *c, rio);
+    char *req = c->request;
+    int rc;
+
+    rc = check_read_result(rio, realsize, "primary");
+    if (rc)
+        goto err;
+
+    /* every failure below is an internal error */
+    rc = ERROR_INTERNAL;
+    req[4] = '\0';
+
+    if (!strcmp(req, TDCOLO_WRITE)) {
+        backup_raio(c, c->header, sizeof(c->header), HEARTBEAT_S,
+                    backup_read_header_done);
+        return;
+    } else if (!strcmp(req, TDCOLO_COMMIT)) {
+        if (ramdisk_clear_cache(&c->ramdisk, 0)) {
+            EPRINTF("error clearing secondary cache\n");
+            goto err;
+        }
+        if (ramdisk_start_flush(&c->ramdisk, 1)) {
+            EPRINTF("error flushing queued I/O\n");
+            goto err;
+        }
+
+        backup_waio(c);
+    } else {
+        EPRINTF("unsupported request: %s\n", req);
+        goto err;
+    }
+
+    return;
+
+err:
+    backup_failed(c, rc);
+    return;
+}
+
+/*
+ * Read callback for the write header sent by the primary.
+ *
+ * Validates the announced sector count against the receive buffer and
+ * starts reading the payload into c->buff. The bound is checked on the
+ * sector count itself (by division) so that a huge count cannot wrap the
+ * 32-bit byte-size product and slip past the check. The header field is
+ * extracted with memcpy() because c->header is a char array with no
+ * alignment guarantee.
+ */
+static void backup_read_header_done(td_async_io_t *rio, int realsize,
+                                    int errnoval)
+{
+    struct tdcolo_state *c = CONTAINER_OF(rio, *c, rio);
+    uint32_t sectors;
+    int rc;
+
+    rc = check_read_result(rio, realsize, "primary");
+    if (rc)
+        goto err;
+
+    memcpy(&sectors, c->header, sizeof(sectors));
+
+    rc = ERROR_INTERNAL;
+    if (c->sector_size <= 0 || c->bsize <= 0 ||
+        sectors > (uint32_t)(c->bsize / c->sector_size)) {
+        EPRINTF("write request is too large: %u sectors of %d bytes/%d\n",
+                (unsigned int)sectors, c->sector_size, c->bsize);
+        goto err;
+    }
+
+    /* cannot overflow: sectors * sector_size <= bsize */
+    backup_raio(c, c->buff, (int)(sectors * (uint32_t)c->sector_size),
+                HEARTBEAT_S, backup_read_data_done);
+
+    return;
+err:
+    backup_failed(c, rc);
+}
+
+/*
+ * Read callback for the write payload sent by the primary.
+ *
+ * Stores the received sectors in the ramdisk's primary cache and arms the
+ * read for the next request message. The header fields are extracted with
+ * memcpy() because c->header is a char array with no alignment guarantee
+ * for uint32_t/uint64_t accesses.
+ */
+static void backup_read_data_done(td_async_io_t *rio, int realsize,
+                                  int errnoval)
+{
+    struct tdcolo_state *c = CONTAINER_OF(rio, *c, rio);
+    uint32_t sectors;
+    uint64_t sector;
+    int rc;
+
+    rc = check_read_result(rio, realsize, "primary");
+    if (rc)
+        goto err;
+
+    memcpy(&sectors, c->header, sizeof(sectors));
+    memcpy(&sector, c->header + sizeof(sectors), sizeof(sector));
+
+    rc = ramdisk_cache_write_request(&c->ramdisk, sector, sectors,
+                                     c->sector_size, c->buff, "COLO", 1);
+    if (rc) {
+        EPRINTF("cannot write primary data to hashtable\n");
+        rc = ERROR_INTERNAL;
+        goto err;
+    }
+
+    backup_raio(c, c->request, sizeof(c->request) - 1, 0,
+                backup_read_req_done);
+
+    return;
+err:
+    backup_failed(c, rc);
+}
+
+/*
+ * Write callback for the TDCOLO_DONE reply. On success the read for the
+ * next request message from the primary is armed; on failure the backup
+ * fails with the classified error.
+ */
+static void backup_write_done(td_async_io_t *wio, int realsize, int errnoval)
+{
+    struct tdcolo_state *state = CONTAINER_OF(wio, *state, wio);
+    int err = check_write_result(wio, realsize, "primary");
+
+    if (err == 0)
+        backup_raio(state, state->request, sizeof(state->request) - 1, 0,
+                    backup_read_req_done);
+    else
+        backup_failed(state, err);
+}
+
+/*
+ * Connection callback in backup mode. A non-zero rc fails the backup;
+ * otherwise the read for the first request message from the primary is
+ * armed.
+ */
+static void colo_server_established(td_replication_connect_t *t, int rc)
+{
+    struct tdcolo_state *state = CONTAINER_OF(t, *state, t);
+
+    if (rc != 0)
+        backup_failed(state, rc);
+    else
+        backup_raio(state, state->request, sizeof(state->request) - 1, 0,
+                    backup_read_req_done);
+}
+
+/*
+ * It is called when switching the mode from backup to unprotected.
+ *
+ * Starts flushing the ramdisk with the second argument 0 (the other call
+ * site passes 1 when committing primary data). A flush error is logged
+ * only: the function always returns 0, so switch_mode() never sees a
+ * failure from it.
+ */
+static int backup_flush(struct tdcolo_state *c)
+{
+ int rc;
+
+ rc = ramdisk_start_flush(&c->ramdisk, 0);
+ if (rc)
+ EPRINTF("error flushing local queued I/O\n");
+
+ return 0;
+}
+
+/*
+ * Read handler in backup mode. The request is served from the ramdisk
+ * cache selected by the final argument 0 when all sectors are present
+ * there; otherwise it is forwarded to the underlying image.
+ */
+static void backup_queue_read(td_driver_t *driver, td_request_t treq)
+{
+    struct tdcolo_state *state = driver->data;
+    int miss;
+
+    miss = ramdisk_read_from_cache(&state->ramdisk, treq.sec, treq.secs,
+                                   state->sector_size, treq.buf, 0);
+    if (!miss) {
+        /* complete the request */
+        td_complete_request(treq, 0);
+        return;
+    }
+
+    /* FIXME */
+    td_forward_request(treq);
+}
+
+/*
+ * Write handler in backup mode. The data is stored in the ramdisk cache
+ * selected by the final argument 0 and the request is completed at once:
+ * with 0 on success, with -EBUSY when caching failed.
+ */
+static void backup_queue_write(td_driver_t *driver, td_request_t treq)
+{
+    struct tdcolo_state *state = driver->data;
+    int failed;
+
+    failed = ramdisk_cache_write_request(&state->ramdisk, treq.sec,
+                                         treq.secs, state->sector_size,
+                                         treq.buf, "COLO", 0);
+    td_complete_request(treq, failed ? -EBUSY : 0);
+}
+
+/*
+ * Enter backup mode: install the backup queue handlers and flush hook
+ * and allocate the one-page receive buffer for write payloads.
+ * Returns 0 on success, -1 if the buffer cannot be allocated.
+ */
+static int backup_start(struct tdcolo_state *c)
+{
+ tapdisk_colo.td_queue_read = backup_queue_read;
+ tapdisk_colo.td_queue_write = backup_queue_write;
+ c->queue_flush = backup_flush;
+
+ /* NOTE(review): the sysconf() result is not checked for -1 */
+ c->bsize = sysconf(_SC_PAGESIZE);
+ c->buff = malloc(c->bsize);
+ if (!c->buff)
+ return -1;
+
+ return 0;
+}
+
+/* ======== unprotected read/write functions ======== */
+/*
+ * Read/write handler in unprotected mode. While ramdisk writes are still
+ * in flight the request is completed with -EBUSY after the pended ramdisk
+ * requests are flushed; otherwise it is forwarded unchanged.
+ * NOTE(review): unlike the other handlers this one is not static.
+ */
+void unprotected_queue_io(td_driver_t *driver, td_request_t treq)
+{
+ struct tdcolo_state *c = driver->data;
+
+ /* wait for previous ramdisk to flush before servicing I/O */
+ if (ramdisk_writes_inflight(&c->ramdisk)) {
+ ramdisk_flush_pended_requests(&c->ramdisk);
+ td_complete_request(treq, -EBUSY);
+ } else {
+ /* here we just pass I/O through */
+ td_forward_request(treq);
+ }
+}
+
+/*
+ * Enter unprotected mode: install the pass-through handlers and clear the
+ * flush hook. Always returns 0.
+ */
+static int unprotected_start(struct tdcolo_state *c)
+{
+ DPRINTF("failure detected, activating passthrough\n");
+
+ /* install the unprotected read/write handlers */
+ tapdisk_colo.td_queue_read = unprotected_queue_io;
+ tapdisk_colo.td_queue_write = unprotected_queue_io;
+ c->queue_flush = NULL;
+
+ return 0;
+}
+
+/* ======== failed read/write functions ======== */
+/* Read/write handler in failed mode: every request completes with -EIO. */
+static void failed_queue_io(td_driver_t *driver, td_request_t treq)
+{
+ td_complete_request(treq, -EIO);
+}
+
+/*
+ * Enter failed mode: install the failing handlers and clear the flush
+ * hook. Always returns 0.
+ */
+static int failed_start(struct tdcolo_state *c)
+{
+ tapdisk_colo.td_queue_read = failed_queue_io;
+ tapdisk_colo.td_queue_write = failed_queue_io;
+ c->queue_flush = NULL;
+
+ return 0;
+}
+
+/* ======== control ======== */
+static void colo_control_accept(event_id_t id, char mode, void *private);
+static void colo_control_handle_request(event_id_t id, char mode,
+ void *private);
+static void colo_control_close(colo_control_t *ctl);
+
+/*
+ * Mark both control sockets and their events as unused (-1). The path
+ * member is not touched here.
+ */
+static void colo_control_init(colo_control_t *ctl)
+{
+    /* file descriptors */
+    ctl->listen_fd = -1;
+    ctl->io_fd = -1;
+
+    /* scheduler events */
+    ctl->listen_id = -1;
+    ctl->io_id = -1;
+}
+
+/*
+ * Create the UNIX control socket for the device called name.
+ *
+ * The socket path is BLKTAP_CTRL_DIR "/colo_<name>", with ':' and '/' in
+ * the name part replaced by '_'. A stale socket file is removed, the
+ * socket is bound and put into listening state, and colo_control_accept()
+ * is registered for incoming connections.
+ *
+ * Returns 0 on success or a negative value on failure; on failure
+ * everything set up so far is released through colo_control_close().
+ * A path that does not fit into sockaddr_un.sun_path is rejected with
+ * -ENAMETOOLONG instead of being silently truncated.
+ */
+static int colo_create_control_socket(colo_control_t *ctl, const char *name)
+{
+    size_t i, l;
+    struct sockaddr_un saddr;
+    event_id_t id;
+    int rc;
+
+    /* first we must ensure that BLKTAP_CTRL_DIR exists */
+    if (mkdir(BLKTAP_CTRL_DIR, 0755) && errno != EEXIST) {
+        rc = -errno;
+        EPRINTF("error creating directory %s: %d\n",
+                BLKTAP_CTRL_DIR, errno);
+        goto fail;
+    }
+
+    /* use the device name to create the control socket path */
+    if (asprintf(&ctl->path, BLKTAP_CTRL_DIR "/colo_%s", name) < 0) {
+        /*
+         * The pointer is indeterminate after a failed asprintf(); reset
+         * it so the cleanup below does not unlink/free garbage.
+         */
+        ctl->path = NULL;
+        rc = -ENOMEM;
+        goto fail;
+    }
+
+    /* scrub socket pathname */
+    l = strlen(ctl->path);
+    for (i = strlen(BLKTAP_CTRL_DIR) + 1; i < l; i++) {
+        if (strchr(":/", ctl->path[i]))
+            ctl->path[i] = '_';
+    }
+
+    /* the path, including its terminating NUL, must fit into sun_path */
+    if (l >= sizeof(saddr.sun_path)) {
+        rc = -ENAMETOOLONG;
+        EPRINTF("control socket path %s is too long\n", ctl->path);
+        goto fail;
+    }
+
+    if (unlink(ctl->path) && errno != ENOENT) {
+        rc = -errno;
+        EPRINTF("failed to unlink %s: %d\n", ctl->path, errno);
+        goto fail;
+    }
+
+    ctl->listen_fd = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (ctl->listen_fd == -1) {
+        rc = -errno;
+        EPRINTF("failed to create control socket: %d\n", errno);
+        goto fail;
+    }
+
+    memset(&saddr, 0, sizeof(saddr));
+    memcpy(saddr.sun_path, ctl->path, l + 1);
+    saddr.sun_family = AF_UNIX;
+
+    rc = bind(ctl->listen_fd, (const struct sockaddr *)&saddr,
+              sizeof(saddr));
+    if (rc == -1) {
+        rc = -errno;
+        EPRINTF("failed to bind to %s: %d\n", saddr.sun_path, errno);
+        goto fail;
+    }
+
+    rc = listen(ctl->listen_fd, 10);
+    if (rc == -1) {
+        rc = -errno;
+        EPRINTF("failed to listen: %d\n", errno);
+        goto fail;
+    }
+
+    id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                       ctl->listen_fd, 0,
+                                       colo_control_accept, ctl);
+    if (id < 0) {
+        EPRINTF("failed to add watch: %d\n", id);
+        rc = id;
+        goto fail;
+    }
+
+    ctl->listen_id = id;
+    return 0;
+
+fail:
+    colo_control_close(ctl);
+    return rc;
+}
+
+/*
+ * Event handler for the listening control socket. Accepts one connection
+ * and registers colo_control_handle_request() for it. Only a single
+ * control connection is kept: a second one is accepted and closed again
+ * immediately.
+ */
+static void colo_control_accept(event_id_t id, char mode, void *private)
+{
+ colo_control_t *ctl = private;
+ int fd;
+
+ fd = accept(ctl->listen_fd, NULL, NULL);
+ if (fd == -1) {
+ EPRINTF("failed to accept new control connection: %d\n", errno);
+ return;
+ }
+
+ if (ctl->io_fd >= 0) {
+ EPRINTF("cannot accept two control connections\n");
+ close(fd);
+ return;
+ }
+
+ /* the id parameter is reused to hold the new connection's event id */
+ id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+ fd, 0,
+ colo_control_handle_request,
+ ctl);
+ if (id < 0) {
+ close(fd);
+ EPRINTF("failed to register new control event: %d\n", id);
+ return;
+ }
+
+ ctl->io_fd = fd;
+ ctl->io_id = id;
+}
+
+/*
+ * Event handler for the accepted control connection. Reads up to five
+ * bytes; the only supported command is "flush", which is accepted in
+ * primary mode only and handed to client_flush(). Unknown commands and
+ * a wrong mode are answered with TDCOLO_FAIL. End of file or a read
+ * error closes the connection and unregisters its event.
+ */
+static void colo_control_handle_request(event_id_t id, char mode, void *private)
+{
+ colo_control_t *ctl = private;
+ struct tdcolo_state *c = CONTAINER_OF(ctl, *c, ctl);
+ char req[6];
+ int rc;
+
+ /* leave room for the terminating NUL stored below */
+ rc = read(ctl->io_fd, req, sizeof(req) - 1);
+ if (!rc) {
+ EPRINTF("0-byte read received, close control socket\n");
+ goto err;
+ }
+
+ if (rc < 0) {
+ EPRINTF("error reading from control socket: %d\n", errno);
+ goto err;
+ }
+
+ req[rc] = '\0';
+ if (strncmp(req, "flush", 5)) {
+ EPRINTF("unknown command: %s\n", req);
+ colo_control_respond(ctl, TDCOLO_FAIL);
+ return;
+ }
+
+ if (c->mode != mode_primary) {
+ EPRINTF("invalid mode: %d\n", c->mode);
+ colo_control_respond(ctl, TDCOLO_FAIL);
+ return;
+ }
+
+ /* the result is reported over the control socket, not via the return */
+ client_flush(c);
+ return;
+
+err:
+ UNREGISTER_EVENT(ctl->io_id);
+ CLOSE_FD(ctl->io_fd);
+ return;
+}
+
+/*
+ * Send response to the connected control client, if there is one.
+ *
+ * On a write error the connection is dropped: its event is unregistered
+ * before the fd is closed, so no event is left watching a closed (and
+ * possibly reused) descriptor. This mirrors the error path of
+ * colo_control_handle_request().
+ */
+static void colo_control_respond(colo_control_t *ctl, const char *response)
+{
+    int rc;
+
+    if (ctl->io_fd < 0)
+        return;
+
+    rc = write(ctl->io_fd, response, strlen(response));
+    if (rc < 0) {
+        EPRINTF("error writing notification: %d\n", errno);
+        UNREGISTER_EVENT(ctl->io_id);
+        CLOSE_FD(ctl->io_fd);
+    }
+}
+
+/*
+ * Release the control channel: unregister both events, close both
+ * sockets, and remove and free the socket path if one is set.
+ */
+static void colo_control_close(colo_control_t *ctl)
+{
+ UNREGISTER_EVENT(ctl->listen_id);
+ UNREGISTER_EVENT(ctl->io_id);
+ CLOSE_FD(ctl->listen_fd);
+ CLOSE_FD(ctl->io_fd);
+
+ if (ctl->path) {
+ unlink(ctl->path);
+ free(ctl->path);
+ /* cleared so a second call does not free the path again */
+ ctl->path = NULL;
+ }
+}
+
+/* ======== interface ======== */
+static int tdcolo_close(td_driver_t *driver);
+
+/*
+ * Switch the driver to the requested mode.
+ *
+ * Does nothing when the mode is unchanged. Otherwise the current mode's
+ * flush hook (if any) runs first; if it fails, the target becomes
+ * unprotected mode. The start routine of the target mode is then called
+ * and c->mode is updated only when it succeeds. Returns that routine's
+ * result, or -1 for an unknown mode.
+ */
+static int switch_mode(struct tdcolo_state *c, enum tdcolo_mode mode)
+{
+ int rc;
+
+ if (mode == c->mode)
+ return 0;
+
+ if (c->queue_flush)
+ if ((rc = c->queue_flush(c)) < 0) {
+ /* fall back to unprotected mode on error */
+ EPRINTF("switch_mode: error flushing queue (old: %d, new: %d)",
+ c->mode, mode);
+ mode = mode_unprotected;
+ }
+
+ if (mode == mode_unprotected)
+ rc = unprotected_start(c);
+ else if (mode == mode_primary)
+ rc = primary_start(c);
+ else if (mode == mode_backup)
+ rc = backup_start(c);
+ else if (mode == mode_failed)
+ rc = failed_start(c);
+ else {
+ EPRINTF("unknown mode requested: %d\n", mode);
+ rc = -1;
+ }
+
+ /* on failure the previous mode value is kept */
+ if (!rc)
+ c->mode = mode;
+
+ return rc;
+}
+
+/*
+ * Open a COLO disk.
+ *
+ * Initializes the ramdisk, the async I/O channels, the control socket
+ * and the replication connection. The driver then tries to start as the
+ * replication server: on success it enters backup mode; when
+ * td_replication_server_start() returns -2 it enters primary mode.
+ *
+ * Returns 0 on success. On failure every resource acquired so far is
+ * released (including the ramdisk on the early failure paths) and a
+ * negative error code is returned.
+ */
+static int tdcolo_open(td_driver_t *driver, td_image_t *image, td_uuid_t uuid)
+{
+    struct tdcolo_state *c = driver->data;
+    td_replication_connect_t *t = &c->t;
+    colo_control_t *ctl = &c->ctl;
+    ramdisk_t *ramdisk = &c->ramdisk;
+    int rc;
+    const char *name = image->name;
+
+    DPRINTF("opening %s\n", name);
+
+    memset(c, 0, sizeof(*c));
+
+    /* init ramdisk */
+    ramdisk->log_prefix = "COLO";
+    ramdisk->sector_size = driver->info.sector_size;
+    ramdisk->image = image;
+    if (ramdisk_init(ramdisk))
+        return -ENOMEM;
+
+    /* init async I/O */
+    td_async_io_init(&c->rio);
+    td_async_io_init(&c->wio);
+
+    c->sector_size = driver->info.sector_size;
+
+    /* init control socket */
+    colo_control_init(ctl);
+    rc = colo_create_control_socket(ctl, name);
+    if (rc) {
+        /* the control socket cleans up after itself; drop the ramdisk */
+        ramdisk_destroy(ramdisk);
+        return rc;
+    }
+
+    /* init async connection */
+    t->log_prefix = "COLO";
+    t->retry_timeout_s = COLO_CONNRETRY_TIMEOUT;
+    t->max_connections = 1;
+    t->callback = colo_server_established;
+    rc = td_replication_connect_init(t, name);
+    if (rc) {
+        colo_control_close(ctl);
+        ramdisk_destroy(ramdisk);
+        return rc;
+    }
+
+    rc = td_replication_server_start(t);
+    if (!rc)
+        rc = switch_mode(c, mode_backup);
+    else if (rc == -2)
+        rc = switch_mode(c, mode_primary);
+
+    if (!rc)
+        return 0;
+
+    tdcolo_close(driver);
+    return -EIO;
+}
+
+/*
+ * Pre-close hook. Only acts in primary mode while the connection status
+ * is 0: requests may have been queued while connecting, so the primary
+ * is failed with ERROR_CLOSE, which switches to unprotected mode and
+ * flushes the queue. Always returns 0.
+ */
+static int tdcolo_pre_close(td_driver_t *driver)
+{
+ struct tdcolo_state *c = driver->data;
+
+ if (c->mode != mode_primary)
+ return 0;
+
+ if (td_replication_connect_status(&c->t))
+ return 0;
+
+ /*
+ * The connection is in progress, and we may queue some
+ * I/O requests.
+ */
+ primary_failed(c, ERROR_CLOSE);
+ return 0;
+}
+
+/*
+ * Close the disk: destroy the ramdisk, kill the replication connection
+ * and both async I/O channels, close the control channel and free the
+ * backup receive buffer. Always returns 0.
+ */
+static int tdcolo_close(td_driver_t *driver)
+{
+ struct tdcolo_state *c = driver->data;
+
+ DPRINTF("closing\n");
+ ramdisk_destroy(&c->ramdisk);
+ td_replication_connect_kill(&c->t);
+ td_async_io_kill(&c->rio);
+ td_async_io_kill(&c->wio);
+ colo_control_close(&c->ctl);
+ /* buff is NULL unless backup_start() allocated it; free(NULL) is a no-op */
+ free(c->buff);
+
+ return 0;
+}
+
+/* Parent lookup hook: a COLO disk reports no parent, so -EINVAL is returned. */
+static int tdcolo_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+ /* we shouldn't have a parent... for now */
+ return -EINVAL;
+}
+
+/* Parent validation hook: performs no checks and always returns 0. */
+static int tdcolo_validate_parent(td_driver_t *driver,
+ td_driver_t *pdriver, td_flag_t flags)
+{
+ return 0;
+}
+
+/*
+ * Driver descriptor for the COLO disk type. The queue handlers listed
+ * here are only the initial values: the *_start() routines overwrite
+ * td_queue_read/td_queue_write whenever the mode changes.
+ */
+struct tap_disk tapdisk_colo = {
+ .disk_type = "tapdisk_colo",
+ .private_data_size = sizeof(struct tdcolo_state),
+ .td_open = tdcolo_open,
+ .td_queue_read = unprotected_queue_io,
+ .td_queue_write = unprotected_queue_io,
+ .td_pre_close = tdcolo_pre_close,
+ .td_close = tdcolo_close,
+ .td_get_parent_id = tdcolo_get_parent_id,
+ .td_validate_parent = tdcolo_validate_parent,
+};
diff --git a/tools/blktap2/drivers/block-remus.c b/tools/blktap2/drivers/block-remus.c
index c7b429c..fc630b4 100644
--- a/tools/blktap2/drivers/block-remus.c
+++ b/tools/blktap2/drivers/block-remus.c
@@ -672,7 +672,7 @@ static void server_do_wreq(td_driver_t *driver)
if (ramdisk_cache_write_request(&s->ramdisk, *sector, *sectors,
driver->info.sector_size, buf,
- "remus") < 0) {
+ "remus", 1) < 0) {
rc = ERROR_INTERNAL;
goto err;
}
@@ -693,7 +693,7 @@ static void server_do_creq(td_driver_t *driver)
// RPRINTF("committing buffer\n");
- ramdisk_start_flush(&s->ramdisk);
+ ramdisk_start_flush(&s->ramdisk, 1);
/* XXX this message should not be sent until flush completes! */
if (write(s->stream_fd.fd, TDREMUS_DONE, strlen(TDREMUS_DONE)) != 4)
diff --git a/tools/blktap2/drivers/block-replication.c b/tools/blktap2/drivers/block-replication.c
index 82d7609..ad8018d 100644
--- a/tools/blktap2/drivers/block-replication.c
+++ b/tools/blktap2/drivers/block-replication.c
@@ -732,6 +732,12 @@ int ramdisk_init(ramdisk_t *ramdisk)
if (!ramdisk->primary_cache)
return -1;
+ ramdisk->secondary_cache = ramdisk_new_hashtable();
+ if (!ramdisk->secondary_cache) {
+ HASHTABLE_DESTROY(ramdisk->primary_cache, 0);
+ return -1;
+ }
+
return 0;
}
@@ -780,14 +786,46 @@ int ramdisk_read(ramdisk_t *ramdisk, uint64_t sector,
return 0;
}
-int ramdisk_cache_write_request(ramdisk_t *ramdisk, uint64_t sector,
- int nb_sectors, size_t sector_size,
- char *buf, const char *log_prefix)
+int ramdisk_read_from_cache(ramdisk_t *ramdisk, uint64_t sector,
+ int nb_sectors, int sector_size,
+ char *buf, int use_primary_cache)
{
- int i, rc;
+ int i;
+ uint64_t key;
+ char *v;
+ struct hashtable *cache;
+
+ if (use_primary_cache)
+ cache = ramdisk->primary_cache;
+ else
+ cache = ramdisk->secondary_cache;
for (i = 0; i < nb_sectors; i++) {
- rc = ramdisk_write_hash(ramdisk->primary_cache, sector + i,
+ key = sector + i;
+ v = hashtable_search(cache, &key);
+ if (!v)
+ return -1;
+ memcpy(buf + i * sector_size, v, sector_size);
+ }
+
+ return 0;
+}
+
+int ramdisk_cache_write_request(ramdisk_t *ramdisk, uint64_t sector,
+ int nb_sectors, size_t sector_size,
+ char *buf, const char *log_prefix,
+ int use_primary_cache)
+{
+ int i, rc;
+ struct hashtable *cache;
+
+ if (use_primary_cache)
+ cache = ramdisk->primary_cache;
+ else
+ cache = ramdisk->secondary_cache;
+
+ for (i = 0; i < nb_sectors; i++) {
+ rc = ramdisk_write_hash(cache, sector + i,
buf + i * sector_size,
sector_size, log_prefix);
if (rc)
@@ -870,7 +908,7 @@ int ramdisk_flush_pended_requests(ramdisk_t *ramdisk)
return 0;
}
-int ramdisk_start_flush(ramdisk_t *ramdisk)
+int ramdisk_start_flush(ramdisk_t *ramdisk, int flush_primary_cache)
{
uint64_t *key;
char *buf;
@@ -880,7 +918,10 @@ int ramdisk_start_flush(ramdisk_t *ramdisk)
const char *log_prefix = ramdisk->log_prefix;
struct hashtable *cache;
- cache = ramdisk->primary_cache;
+ if (flush_primary_cache)
+ cache = ramdisk->primary_cache;
+ else
+ cache = ramdisk->secondary_cache;
if (!hashtable_count(cache))
return 0;
@@ -910,8 +951,12 @@ int ramdisk_start_flush(ramdisk_t *ramdisk)
* We create a new hashtable so that new writes can be performed before
* the old hashtable is completely drained.
*/
- ramdisk->primary_cache = ramdisk_new_hashtable();
- if (!ramdisk->primary_cache) {
+ cache = ramdisk_new_hashtable();
+ if (flush_primary_cache)
+ ramdisk->primary_cache = cache;
+ else
+ ramdisk->secondary_cache = cache;
+ if (!cache) {
EPRINTF("ramdisk_start_flush: creating cache table failed: OOM\n");
return -1;
}
@@ -919,6 +964,28 @@ int ramdisk_start_flush(ramdisk_t *ramdisk)
return ramdisk_flush_pended_requests(ramdisk);
}
+/*
+ * Drop every write request held in the selected cache (the primary cache
+ * if use_primary_cache is non-zero, the secondary cache otherwise): the
+ * table is destroyed with hashtable_destroy(cache, 1) and replaced by a
+ * new one from ramdisk_new_hashtable().
+ *
+ * Returns 0 on success, 1 if the replacement table could not be created;
+ * in that case the selected cache pointer is left NULL.
+ */
+int ramdisk_clear_cache(ramdisk_t *ramdisk, int use_primary_cache)
+{
+	struct hashtable **slot;
+
+	slot = use_primary_cache ? &ramdisk->primary_cache
+				 : &ramdisk->secondary_cache;
+
+	hashtable_destroy(*slot, 1);
+	*slot = ramdisk_new_hashtable();
+
+	return *slot ? 0 : 1;
+}
+
int ramdisk_writes_inflight(ramdisk_t *ramdisk)
{
if (!ramdisk->inflight && !ramdisk->prev)
@@ -926,3 +993,180 @@ int ramdisk_writes_inflight(ramdisk_t *ramdisk)
return 1;
}
+
+/* async I/O */
+static void td_async_io_readable(event_id_t id, char mode, void *private);
+static void td_async_io_writeable(event_id_t id, char mode, void *private);
+static void td_async_io_timeout(event_id_t id, char mode, void *private);
+
+/*
+ * Put *taio into its idle state: every field zeroed, then fd and both
+ * event ids set to -1.
+ */
+void td_async_io_init(td_async_io_t *taio)
+{
+	memset(taio, 0, sizeof(*taio));
+
+	taio->io_id = -1;
+	taio->timeout_id = -1;
+	taio->fd = -1;
+}
+
+/*
+ * Start the transfer described by *taio.
+ *
+ * Fails (-1) if a transfer is already running, if size is not positive,
+ * if fd is negative, if mode is neither td_async_read nor td_async_write,
+ * or if an event cannot be registered.  On a registration failure
+ * everything registered so far is torn down again with td_async_io_kill().
+ * Returns 0 once the I/O event (and, when timeout_s is non-zero, the
+ * timeout event) is registered.
+ */
+int td_async_io_start(td_async_io_t *taio)
+{
+	event_id_t id;
+
+	if (taio->running || taio->size <= 0 || taio->fd < 0)
+		return -1;
+
+	taio->running = 1;
+
+	switch (taio->mode) {
+	case td_async_read:
+		id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+						   taio->fd, 0,
+						   td_async_io_readable,
+						   taio);
+		break;
+	case td_async_write:
+		id = tapdisk_server_register_event(SCHEDULER_POLL_WRITE_FD,
+						   taio->fd, 0,
+						   td_async_io_writeable,
+						   taio);
+		break;
+	default:
+		id = -1;
+		break;
+	}
+	if (id < 0)
+		goto fail;
+	taio->io_id = id;
+
+	if (taio->timeout_s) {
+		id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT,
+						   -1, taio->timeout_s,
+						   td_async_io_timeout, taio);
+		if (id < 0)
+			goto fail;
+		taio->timeout_id = id;
+	}
+
+	taio->used = 0;
+	return 0;
+
+fail:
+	td_async_io_kill(taio);
+	return -1;
+}
+
+/*
+ * Finish a transfer: the async I/O is killed first, then the user
+ * callback is invoked with the given realsize/errnoval.
+ */
+static void td_async_io_callback(td_async_io_t *taio, int realsize,
+				 int errnoval)
+{
+	taio_callback *done = taio->callback;
+
+	td_async_io_kill(taio);
+	done(taio, realsize, errnoval);
+}
+
+/*
+ * Re-arm the timeout event.  Does nothing when timeout_s is 0.  The old
+ * timeout event is unregistered and a new one registered; if that fails
+ * the transfer is ended through the callback with realsize -1.
+ */
+static void td_async_io_update_timeout(td_async_io_t *taio)
+{
+	event_id_t timer;
+
+	if (!taio->timeout_s)
+		return;
+
+	tapdisk_server_unregister_event(taio->timeout_id);
+	taio->timeout_id = -1;
+
+	timer = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT,
+					      -1, taio->timeout_s,
+					      td_async_io_timeout, taio);
+	if (timer < 0) {
+		/*
+		 * NOTE(review): the negative id itself is handed over as
+		 * errnoval -- confirm callers expect that sign.
+		 */
+		td_async_io_callback(taio, -1, timer);
+		return;
+	}
+
+	taio->timeout_id = timer;
+}
+
+/*
+ * Event handler: taio->fd is readable.  Keeps reading into the buffer
+ * until it is full or EOF is seen (callback with the byte count and
+ * errnoval 0), a real error occurs (callback with realsize 0 and errno),
+ * or the read would block (timeout re-armed, wait for the next event).
+ * EINTR is retried.
+ */
+static void td_async_io_readable(event_id_t id, char mode, void *private)
+{
+	td_async_io_t *taio = private;
+	int nread;
+
+	for (;;) {
+		nread = read(taio->fd, taio->buff + taio->used,
+			     taio->size - taio->used);
+		if (nread < 0) {
+			if (errno == EINTR)
+				continue;
+			if (errno == EWOULDBLOCK || errno == EAGAIN) {
+				td_async_io_update_timeout(taio);
+				return;
+			}
+
+			td_async_io_callback(taio, 0, errno);
+			return;
+		}
+
+		/* nread == 0 is EOF and leaves used unchanged */
+		taio->used += nread;
+		if (nread == 0 || taio->used == taio->size) {
+			td_async_io_callback(taio, taio->used, 0);
+			return;
+		}
+	}
+}
+
+/*
+ * Event handler: taio->fd is writeable.  Keeps writing from the buffer
+ * until everything has been written (callback with the byte count and
+ * errnoval 0), a real error occurs (callback with realsize 0 and errno),
+ * or the write would block (timeout re-armed, wait for the next event).
+ * EINTR is retried.
+ */
+static void td_async_io_writeable(event_id_t id, char mode, void *private)
+{
+	td_async_io_t *taio = private;
+	int nwritten;
+
+	for (;;) {
+		nwritten = write(taio->fd, taio->buff + taio->used,
+				 taio->size - taio->used);
+		if (nwritten < 0) {
+			if (errno == EINTR)
+				continue;
+			if (errno == EWOULDBLOCK || errno == EAGAIN) {
+				td_async_io_update_timeout(taio);
+				return;
+			}
+
+			td_async_io_callback(taio, 0, errno);
+			return;
+		}
+
+		taio->used += nwritten;
+		if (taio->used == taio->size) {
+			td_async_io_callback(taio, taio->used, 0);
+			return;
+		}
+	}
+}
+
+/*
+ * Event handler: the timeout expired.  The transfer is killed and the
+ * callback is told realsize 0 / ETIME.
+ */
+static void td_async_io_timeout(event_id_t id, char mode, void *private)
+{
+	/* td_async_io_callback() kills the I/O before calling back */
+	td_async_io_callback((td_async_io_t *)private, 0, ETIME);
+}
+
+int td_async_io_is_running(td_async_io_t *taio)
+{
+ return taio->running; /* set by td_async_io_start(), cleared by td_async_io_kill() */
+}
+
+/*
+ * Stop a running transfer without invoking the callback: any registered
+ * timeout and I/O events are unregistered and the running flag cleared.
+ * Does nothing if the transfer is not running.
+ */
+void td_async_io_kill(td_async_io_t *taio)
+{
+	if (taio->running) {
+		if (taio->timeout_id >= 0) {
+			tapdisk_server_unregister_event(taio->timeout_id);
+			taio->timeout_id = -1;
+		}
+
+		if (taio->io_id >= 0) {
+			tapdisk_server_unregister_event(taio->io_id);
+			taio->io_id = -1;
+		}
+
+		taio->running = 0;
+	}
+}
diff --git a/tools/blktap2/drivers/block-replication.h b/tools/blktap2/drivers/block-replication.h
index cbdac3c..c17be13 100644
--- a/tools/blktap2/drivers/block-replication.h
+++ b/tools/blktap2/drivers/block-replication.h
@@ -139,6 +139,12 @@ struct ramdisk {
* the checkpoint finishes.
*/
struct hashtable *primary_cache;
+ /*
+ * The secondary vm write request is queued in this
+ * hashtable, and will be dropped when the checkpoint
+ * finishes or flushed to ramdisk after failover.
+ */
+ struct hashtable *secondary_cache;
};
int ramdisk_init(ramdisk_t *ramdisk);
@@ -152,12 +158,21 @@ int ramdisk_read(ramdisk_t *ramdisk, uint64_t sector,
int nb_sectors, char *buf);
/*
+ * try to read from ramdisk's cache. Return -1 if some sectors are not in
+ * ramdisk's cache. Otherwise, return 0.
+ */
+int ramdisk_read_from_cache(ramdisk_t *ramdisk, uint64_t sector,
+ int nb_sectors, int sector_size,
+ char *buf, int use_primary_cache);
+
+/*
* cache the write requests, and it will be flushed after a
* new checkpoint finishes
*/
int ramdisk_cache_write_request(ramdisk_t *ramdisk, uint64_t sector,
int nb_sectors, size_t sector_size,
- char* buf, const char *log_prefix);
+ char* buf, const char *log_prefix,
+ int use_primary_cache);
/* flush pended write requests to disk */
int ramdisk_flush_pended_requests(ramdisk_t *ramdisk);
@@ -168,11 +183,69 @@ int ramdisk_flush_pended_requests(ramdisk_t *ramdisk);
* are flushed to disk. This function don't wait all write requests
* are flushed to disk.
*/
-int ramdisk_start_flush(ramdisk_t *ramdisk);
+int ramdisk_start_flush(ramdisk_t *ramdisk, int flush_primary_cache);
+/*
+ * clear the write requests that are stored in the cache, all write requests
+ * will be dropped.
+ */
+int ramdisk_clear_cache(ramdisk_t *ramdisk, int use_primary_cache);
/*
* Return true if some write reqeusts are inprogress or pended,
* otherwise return false
*/
int ramdisk_writes_inflight(ramdisk_t *ramdisk);
+/* async I/O, don't support read/write at the same time */
+typedef struct td_async_io td_async_io_t;
+enum {
+ td_async_read, /* td_async_io.mode value: read from fd into buff */
+ td_async_write, /* td_async_io.mode value: write buff to fd */
+};
+
+/*
+ * realsize >= 1 means all data was read/written
+ * realsize == 0 means failure happened when reading/writing, and
+ * errnoval is valid
+ * realsize == -1 means some other internal failure happened, and
+ * errnoval is also valid
+ * In all cases async_io is killed before calling this callback
+ *
+ * If we don't read/write any more data in timeout_s seconds, realsize is
+ * 0, and errnoval is ETIME
+ *
+ * If timeout_s is 0, timeout will be disabled.
+ *
+ * NOTE: realsize is less than taio->size, if we read EOF.
+ */
+typedef void taio_callback(td_async_io_t *taio, int realsize,
+ int errnoval);
+
+struct td_async_io {
+ /* caller must fill these in, and they must all remain valid */
+ int fd; /* descriptor the handlers read() from or write() to */
+ int timeout_s; /* timeout in seconds; 0 disables the timeout */
+ int mode; /* td_async_read or td_async_write */
+ /*
+ * read: store the data to buff
+ * write: point to the data to be written
+ */
+ void *buff;
+ int size; /* number of bytes to transfer; td_async_io_start() rejects values <= 0 */
+ taio_callback *callback; /* called when the transfer completes, fails or times out */
+
+ /* private */
+ event_id_t timeout_id, io_id; /* registered event ids, -1 when not registered */
+ int used; /* bytes transferred so far */
+ int running; /* non-zero between td_async_io_start() and td_async_io_kill() */
+};
+
+/* Don't call it when td_async_io is running */
+void td_async_io_init(td_async_io_t *taio);
+/* return -1 if we find some error. Otherwise, return 0 */
+int td_async_io_start(td_async_io_t *taio);
+/* return 1 if td_async_io is running, otherwise return 0 */
+int td_async_io_is_running(td_async_io_t *taio);
+/* The callback will not be called */
+void td_async_io_kill(td_async_io_t *taio);
+
#endif
diff --git a/tools/blktap2/drivers/tapdisk-disktype.c b/tools/blktap2/drivers/tapdisk-disktype.c
index 8d1383b..aa2afab 100644
--- a/tools/blktap2/drivers/tapdisk-disktype.c
+++ b/tools/blktap2/drivers/tapdisk-disktype.c
@@ -94,6 +94,12 @@ static const disk_info_t remus_disk = {
0,
};
+static const disk_info_t colo_disk = { /* entry installed at DISK_TYPE_COLO in tapdisk_disk_types[] */
+ "colo",
+ "colo disk replicator (COLO)",
+ 0,
+};
+
const disk_info_t *tapdisk_disk_types[] = {
[DISK_TYPE_AIO] = &aio_disk,
[DISK_TYPE_SYNC] = &sync_disk,
@@ -105,6 +111,7 @@ const disk_info_t *tapdisk_disk_types[] = {
[DISK_TYPE_BLOCK_CACHE] = &block_cache_disk,
[DISK_TYPE_LOG] = &log_disk,
[DISK_TYPE_REMUS] = &remus_disk,
+ [DISK_TYPE_COLO] = &colo_disk,
[DISK_TYPE_MAX] = NULL,
};
@@ -119,6 +126,7 @@ extern struct tap_disk tapdisk_block_cache;
extern struct tap_disk tapdisk_vhd_index;
extern struct tap_disk tapdisk_log;
extern struct tap_disk tapdisk_remus;
+extern struct tap_disk tapdisk_colo;
const struct tap_disk *tapdisk_disk_drivers[] = {
[DISK_TYPE_AIO] = &tapdisk_aio,
@@ -132,6 +140,7 @@ const struct tap_disk *tapdisk_disk_drivers[] = {
[DISK_TYPE_BLOCK_CACHE] = &tapdisk_block_cache,
[DISK_TYPE_LOG] = &tapdisk_log,
[DISK_TYPE_REMUS] = &tapdisk_remus,
+ [DISK_TYPE_COLO] = &tapdisk_colo,
[DISK_TYPE_MAX] = NULL,
};
diff --git a/tools/blktap2/drivers/tapdisk-disktype.h b/tools/blktap2/drivers/tapdisk-disktype.h
index c574990..ee8cb02 100644
--- a/tools/blktap2/drivers/tapdisk-disktype.h
+++ b/tools/blktap2/drivers/tapdisk-disktype.h
@@ -39,7 +39,8 @@
#define DISK_TYPE_BLOCK_CACHE 7
#define DISK_TYPE_LOG 8
#define DISK_TYPE_REMUS 9
-#define DISK_TYPE_MAX 10
+#define DISK_TYPE_COLO 10
+#define DISK_TYPE_MAX 11
#define DISK_TYPE_NAME_MAX 32
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 15/18] libxl/colo: setup and control disk replication for blktap2 backends
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (13 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 14/18] block-colo: implement colo disk replication Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 16/18] setup and control colo-agent for primary vm Wen Congyang
` (4 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Yang Hongyang, Lai Jiangshan
This patch adds the machinery required for protecting a guest's
disk state, when the guest disk uses a blktap2 disk backend.
1. COLO blktap2 disk device: Implements the interfaces required by the
checkpoint abstract device layer. A note about the implementation:
a) setup() is called for each disk attached to the guest.
During setup():
i) perform the sanity check: backend type should be LIBXL_DISK_BACKEND_TAP
and format should be LIBXL_DISK_FORMAT_COLO.
ii) connect to the control socket: /var/run/tap/colo_xxx, where xxx is
"host:port" (the characters ':' and '/' are replaced with '_').
b) The postsuspend callback() will write "flush" to this socket
c) The commit callback() will wait and read "done" from this socket
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
docs/man/xl.pod.1 | 3 +-
tools/libxl/Makefile | 2 +-
tools/libxl/libxl_colo_save.c | 36 ++++-
tools/libxl/libxl_colo_save_disk_blktap2.c | 219 +++++++++++++++++++++++++++++
tools/libxl/libxl_create.c | 8 ++
tools/libxl/libxl_internal.h | 2 +
tools/libxl/libxl_noblktap2.c | 29 ++++
7 files changed, 295 insertions(+), 4 deletions(-)
create mode 100644 tools/libxl/libxl_colo_save_disk_blktap2.c
diff --git a/docs/man/xl.pod.1 b/docs/man/xl.pod.1
index 4c79ea0..ef1ff1f 100644
--- a/docs/man/xl.pod.1
+++ b/docs/man/xl.pod.1
@@ -439,7 +439,8 @@ N.B: Remus support in xl is still in experimental (proof-of-concept) phase.
Disk replication support is limited to DRBD disks.
COLO support in xl is still in experimental (proof-of-concept) phase.
- There is no support for network or disk at the moment.
+ There is no support for network at the moment.
+ Disk replication support is limited to blktap2 disks.
B<OPTIONS>
diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile
index ab565ee..ed46af3 100644
--- a/tools/libxl/Makefile
+++ b/tools/libxl/Makefile
@@ -45,7 +45,7 @@ LIBXLU_LIBS =
LIBXL_OBJS-y = osdeps.o libxl_paths.o libxl_bootloader.o flexarray.o
ifeq ($(LIBXL_BLKTAP),y)
-LIBXL_OBJS-y += libxl_blktap2.o
+LIBXL_OBJS-y += libxl_blktap2.o libxl_colo_save_disk_blktap2.o
else
LIBXL_OBJS-y += libxl_noblktap2.o
endif
diff --git a/tools/libxl/libxl_colo_save.c b/tools/libxl/libxl_colo_save.c
index 6fbff9f..516b913 100644
--- a/tools/libxl/libxl_colo_save.c
+++ b/tools/libxl/libxl_colo_save.c
@@ -18,10 +18,36 @@
#include "libxl_internal.h"
#include "libxl_colo.h"
+extern const libxl__checkpoint_device_instance_ops colo_save_device_blktap2_disk;
+
static const libxl__checkpoint_device_instance_ops *colo_ops[] = {
+ &colo_save_device_blktap2_disk,
NULL,
};
+/* ================= helper functions ================= */
+/*
+ * Initialise the device-subkind state needed by COLO.
+ *
+ * The only checkpoint device COLO registers in colo_ops[] is the blktap2
+ * disk, and cleanup_device_subkind() tears down the blktap2 subkind, so
+ * that is the subkind initialised here (not the DRBD one used by Remus).
+ *
+ * Returns 0 on success, otherwise the error from the subkind init.
+ */
+static int init_device_subkind(libxl__checkpoint_devices_state *cds)
+{
+    /* init device subkind-specific state in the libxl ctx */
+    STATE_AO_GC(cds->ao);
+
+    return init_subkind_blktap2_disk(cds);
+}
+
+static void cleanup_device_subkind(libxl__checkpoint_devices_state *cds)
+{
+ /* cleanup device subkind-specific state in the libxl ctx */
+ STATE_AO_GC(cds->ao);
+
+ cleanup_subkind_blktap2_disk(cds); /* the blktap2 disk is the only subkind torn down here */
+}
+
/* ================= colo: setup save environment ================= */
static void colo_save_setup_done(libxl__egc *egc,
libxl__checkpoint_devices_state *cds,
@@ -48,13 +74,16 @@ void libxl__colo_save_setup(libxl__egc *egc, libxl__colo_save_state *css)
css->recv_fd = dss->recv_fd;
css->svm_running = false;
- /* TODO: disk/nic support */
- cds->device_kind_flags = 0;
+ /* TODO: nic support */
+ cds->device_kind_flags = (1 << LIBXL__DEVICE_KIND_VBD);
cds->ops = colo_ops;
cds->callback = colo_save_setup_done;
cds->ao = ao;
cds->domid = dss->domid;
+ if (init_device_subkind(cds))
+ goto out;
+
libxl__checkpoint_devices_setup(egc, &css->cds);
return;
@@ -92,6 +121,7 @@ static void colo_save_setup_failed(libxl__egc *egc,
LOG(ERROR, "COLO: failed to teardown device after setup failed"
" for guest with domid %u, rc %d", cds->domid, rc);
+ cleanup_device_subkind(cds);
libxl__ao_complete(egc, ao, rc);
}
@@ -122,6 +152,8 @@ static void colo_teardown_done(libxl__egc *egc,
{
libxl__colo_save_state *css = CONTAINER_OF(cds, *css, cds);
libxl__domain_suspend_state *dss = CONTAINER_OF(css, *dss, css);
+
+ cleanup_device_subkind(cds);
dss->callback(egc, dss, rc);
}
diff --git a/tools/libxl/libxl_colo_save_disk_blktap2.c b/tools/libxl/libxl_colo_save_disk_blktap2.c
new file mode 100644
index 0000000..86782a4
--- /dev/null
+++ b/tools/libxl/libxl_colo_save_disk_blktap2.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (C) 2014 FUJITSU LIMITED
+ * Author: Wen Congyang <wency@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#include "libxl_internal.h"
+
+#include <string.h>
+#include <sys/un.h>
+
+#define BLKTAP2_REQUEST "flush"
+#define BLKTAP2_RESPONSE "done"
+#define BLKTAP_CTRL_DIR "/var/run/tap"
+
+typedef struct libxl__colo_blktap2_disk {
+ char *name; /* copy of the disk's filter_params */
+ char *ctl_socket_path; /* control socket path under BLKTAP_CTRL_DIR, built from name */
+ int fd; /* connected control socket, or -1 when not connected */
+ libxl__ev_fd ev; /* POLLIN registration used while waiting for the response */
+ libxl__checkpoint_device *dev; /* checkpoint device this state belongs to */
+}libxl__colo_blktap2_disk;
+
+/* ========== init() and cleanup() ========== */
+int init_subkind_blktap2_disk(libxl__checkpoint_devices_state *cds)
+{
+ return 0; /* no subkind-wide state to set up */
+}
+
+void cleanup_subkind_blktap2_disk(libxl__checkpoint_devices_state *cds)
+{ /* no subkind-wide state to release */
+}
+
+/* ========== setup() and teardown() ========== */
+/*
+ * Connect to the control socket at blktap2_disk->ctl_socket_path.
+ *
+ * On success the connected descriptor is stored in blktap2_disk->fd and
+ * 0 is returned.  On any failure ERROR_FAIL is returned and
+ * blktap2_disk->fd is left untouched.
+ */
+static int blktap2_control_connect(libxl__gc *gc,
+                                   libxl__colo_blktap2_disk *blktap2_disk)
+{
+    struct sockaddr_un saddr;
+    const char *path = blktap2_disk->ctl_socket_path;
+    int fd;
+
+    /*
+     * The path is derived from the disk's filter_params, so its length
+     * is not bounded; refuse it rather than overflow sun_path.
+     */
+    if (strlen(path) >= sizeof(saddr.sun_path)) {
+        LOG(ERROR, "control socket path %s is too long", path);
+        return ERROR_FAIL;
+    }
+
+    fd = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (fd < 0) {
+        LOG(ERROR, "cannot create socket fd");
+        return ERROR_FAIL;
+    }
+
+    memset(&saddr, 0, sizeof(saddr));
+    saddr.sun_family = AF_UNIX;
+    strcpy(saddr.sun_path, path); /* length checked above */
+
+    if (connect(fd, (const struct sockaddr *)&saddr, sizeof(saddr))) {
+        LOG(ERROR, "cannot connect to %s", path);
+        close(fd);
+        return ERROR_FAIL;
+    }
+
+    blktap2_disk->fd = fd;
+    return 0;
+}
+
+/*
+ * Checkpoint-device setup hook for one disk.
+ *
+ * The disk matches only if its backend is LIBXL_DISK_BACKEND_TAP and its
+ * filter is "colo"; otherwise ERROR_CHECKPOINT_DEVOPS_DOES_NOT_MATCH is
+ * reported.  For a matching disk the per-disk state is allocated from
+ * the gc, the control socket path is derived from filter_params
+ * ("host:port", with ':' and '/' replaced by '_') and the control socket
+ * is connected.  The result is always delivered through
+ * dev->aodev.callback.
+ */
+static void blktap2_colo_setup(libxl__egc *egc, libxl__checkpoint_device *dev)
+{
+    const libxl_device_disk *disk = dev->backend_dev;
+    libxl__colo_blktap2_disk *blktap2_disk;
+    char *p;
+    int rc;
+
+    STATE_AO_GC(dev->cds->ao);
+
+    if (disk->backend != LIBXL_DISK_BACKEND_TAP ||
+        !disk->filter ||
+        strcmp(disk->filter, "colo")) {
+        rc = ERROR_CHECKPOINT_DEVOPS_DOES_NOT_MATCH;
+        goto out;
+    }
+
+    dev->matched = 1;
+    GCNEW(blktap2_disk);
+    dev->concrete_data = blktap2_disk;
+    blktap2_disk->fd = -1;
+    blktap2_disk->dev = dev;
+    /* Initialise the ev before any failure path so teardown is safe. */
+    libxl__ev_fd_init(&blktap2_disk->ev);
+
+    /* Without filter_params there is nothing to build the path from. */
+    if (!disk->filter_params) {
+        LOG(ERROR, "colo filter requires filter_params (host:port)");
+        rc = ERROR_FAIL;
+        goto out;
+    }
+
+    blktap2_disk->name = libxl__strdup(gc, disk->filter_params);
+    blktap2_disk->ctl_socket_path = libxl__sprintf(gc, "%s/colo_%s",
+                                                   BLKTAP_CTRL_DIR,
+                                                   blktap2_disk->name);
+    /* scrub socket pathname: only the part after "<dir>/" is touched */
+    for (p = blktap2_disk->ctl_socket_path + strlen(BLKTAP_CTRL_DIR) + 1;
+         *p; p++) {
+        if (*p == ':' || *p == '/')
+            *p = '_';
+    }
+
+    rc = blktap2_control_connect(gc, blktap2_disk);
+
+out:
+    dev->aodev.rc = rc;
+    dev->aodev.callback(egc, &dev->aodev);
+}
+
+/*
+ * Checkpoint-device teardown hook: stops watching the control socket,
+ * closes it if it is open, and reports success through
+ * dev->aodev.callback.
+ */
+static void blktap2_colo_teardown(libxl__egc *egc,
+                                  libxl__checkpoint_device *dev)
+{
+    libxl__colo_blktap2_disk *blktap2_disk = dev->concrete_data;
+
+    EGC_GC;
+
+    /* Never leave an event registered on an fd that is about to close. */
+    libxl__ev_fd_deregister(gc, &blktap2_disk->ev);
+
+    /* 0 is a valid descriptor; only -1 means "not connected". */
+    if (blktap2_disk->fd >= 0) {
+        close(blktap2_disk->fd);
+        blktap2_disk->fd = -1;
+    }
+
+    dev->aodev.rc = 0;
+    dev->aodev.callback(egc, &dev->aodev);
+}
+
+/* ========== checkpointing APIs ========== */
+/*
+ * When a new checkpoint is triggered, we do the following thing:
+ * 1. send BLKTAP2_REQUEST to tapdisk2
+ * 2. tapdisk2 send "creq"
+ * 3. secondary vm's tapdisk2 reply "done"
+ * 4. tapdisk2 writes BLKTAP2_RESPONSE to the socket
+ * 5. read BLKTAP2_RESPONSE from the socket
+ * Step1 and 5 are implemented here.
+ */
+static void blktap2_control_readable(libxl__egc *egc, libxl__ev_fd *ev,
+ int fd, short events, short revents);
+
+/*
+ * Post-suspend hook: writes BLKTAP2_REQUEST to the control socket.
+ * Reports ERROR_FAIL through dev->aodev.callback if the write fails or
+ * is short, 0 otherwise.
+ */
+static void blktap2_colo_postsuspend(libxl__egc *egc,
+                                     libxl__checkpoint_device *dev)
+{
+    libxl__colo_blktap2_disk *blktap2_disk = dev->concrete_data;
+    const size_t len = strlen(BLKTAP2_REQUEST);
+    ssize_t ret;
+    int rc = 0;
+
+    /* unix socket fd, so this is not expected to block */
+    ret = write(blktap2_disk->fd, BLKTAP2_REQUEST, len);
+    /*
+     * Check the sign first: comparing a negative return directly with a
+     * size_t would convert it to a huge unsigned value and hide the error.
+     */
+    if (ret < 0 || (size_t)ret < len)
+        rc = ERROR_FAIL;
+
+    dev->aodev.rc = rc;
+    dev->aodev.callback(egc, &dev->aodev);
+}
+
+/*
+ * Commit hook: registers for POLLIN on the control socket.  When the
+ * registration succeeds the result is reported later from
+ * blktap2_control_readable(); when it fails the error is reported
+ * immediately through dev->aodev.callback.
+ */
+static void blktap2_colo_commit(libxl__egc *egc,
+                                libxl__checkpoint_device *dev)
+{
+    libxl__colo_blktap2_disk *disk_state = dev->concrete_data;
+    int rc;
+
+    STATE_AO_GC(dev->cds->ao);
+
+    rc = libxl__ev_fd_register(gc, &disk_state->ev, blktap2_control_readable,
+                               disk_state->fd, POLLIN);
+    if (!rc)
+        return;
+
+    dev->aodev.rc = rc;
+    dev->aodev.callback(egc, &dev->aodev);
+}
+
+/*
+ * fd event callback armed by blktap2_colo_commit(): reads the response
+ * from the control socket and checks that it is BLKTAP2_RESPONSE.
+ * The event is deregistered first, so it fires once per commit.  The
+ * result (0 or ERROR_FAIL) is reported through dev->aodev.callback.
+ */
+static void blktap2_control_readable(libxl__egc *egc, libxl__ev_fd *ev,
+                                     int fd, short events, short revents)
+{
+    libxl__colo_blktap2_disk *blktap2_disk =
+        CONTAINER_OF(ev, *blktap2_disk, ev);
+    char response[sizeof(BLKTAP2_RESPONSE)];
+    const size_t want = sizeof(response) - 1;
+    ssize_t got;
+    int rc = 0;
+
+    /* Convenience aliases */
+    libxl__checkpoint_device *const dev = blktap2_disk->dev;
+
+    EGC_GC;
+
+    libxl__ev_fd_deregister(gc, ev);
+
+    if (revents & ~POLLIN) {
+        LOG(ERROR, "unexpected poll event 0x%x (should be POLLIN)", revents);
+        rc = ERROR_FAIL;
+        goto out;
+    }
+
+    got = read(blktap2_disk->fd, response, want);
+    /*
+     * Check the sign first: a direct comparison with a size_t would turn
+     * -1 into a huge unsigned value and let a failed read through.
+     */
+    if (got < 0 || (size_t)got < want) {
+        rc = ERROR_FAIL;
+        goto out;
+    }
+
+    response[want] = '\0';
+    if (strcmp(response, BLKTAP2_RESPONSE))
+        rc = ERROR_FAIL;
+
+out:
+    dev->aodev.rc = rc;
+    dev->aodev.callback(egc, &dev->aodev);
+}
+
+const libxl__checkpoint_device_instance_ops colo_save_device_blktap2_disk = { /* ops table referenced from colo_ops[] in libxl_colo_save.c */
+ .kind = LIBXL__DEVICE_KIND_VBD,
+ .setup = blktap2_colo_setup, /* match the disk and connect the control socket */
+ .teardown = blktap2_colo_teardown, /* close the control socket */
+ .postsuspend = blktap2_colo_postsuspend, /* write BLKTAP2_REQUEST */
+ .commit = blktap2_colo_commit, /* wait for BLKTAP2_RESPONSE */
+};
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 0c1c09c..27e6002 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -870,6 +870,14 @@ static void initiate_domain_create(libxl__egc *egc,
for (i = 0; i < d_config->num_disks; i++) {
ret = libxl__device_disk_setdefault(gc, &d_config->disks[i]);
if (ret) goto error_out;
+
+ /* TODO: cleanup it when destroying the domain */
+ if (d_config->disks[i].backend == LIBXL_DISK_BACKEND_TAP &&
+ d_config->disks[i].filter)
+ libxl__blktap_devpath(gc, d_config->disks[i].pdev_path,
+ d_config->disks[i].format,
+ d_config->disks[i].filter,
+ d_config->disks[i].filter_params);
}
dcs->bl.ao = ao;
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 924f8a3..1659845 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2717,6 +2717,8 @@ int init_subkind_nic(libxl__checkpoint_devices_state *cds);
void cleanup_subkind_nic(libxl__checkpoint_devices_state *cds);
int init_subkind_drbd_disk(libxl__checkpoint_devices_state *cds);
void cleanup_subkind_drbd_disk(libxl__checkpoint_devices_state *cds);
+int init_subkind_blktap2_disk(libxl__checkpoint_devices_state *cds);
+void cleanup_subkind_blktap2_disk(libxl__checkpoint_devices_state *cds);
typedef void libxl__checkpoint_callback(libxl__egc *,
libxl__checkpoint_devices_state *,
diff --git a/tools/libxl/libxl_noblktap2.c b/tools/libxl/libxl_noblktap2.c
index ba3120b..c65583a 100644
--- a/tools/libxl/libxl_noblktap2.c
+++ b/tools/libxl/libxl_noblktap2.c
@@ -37,6 +37,35 @@ int libxl__device_destroy_tapdisk(libxl__gc *gc,
return 0;
}
+/*
+ * blktap2 support is compiled out.  Provide the subkind hooks declared in
+ * libxl_internal.h as no-ops, and a checkpoint-device ops table whose
+ * setup always fails, so that libxl_colo_save.c still links.  The hook
+ * signatures and the .kind value mirror the real implementation in
+ * libxl_colo_save_disk_blktap2.c.
+ */
+int init_subkind_blktap2_disk(libxl__checkpoint_devices_state *cds)
+{
+    return 0;
+}
+
+void cleanup_subkind_blktap2_disk(libxl__checkpoint_devices_state *cds)
+{
+}
+
+/* No blktap2 backend is available: setup always reports ERROR_FAIL. */
+static void blktap2_colo_setup(libxl__egc *egc, libxl__checkpoint_device *dev)
+{
+    dev->aodev.rc = ERROR_FAIL;
+    dev->aodev.callback(egc, &dev->aodev);
+}
+
+/* Nothing was set up, so teardown only has to report completion. */
+static void blktap2_colo_teardown(libxl__egc *egc,
+                                  libxl__checkpoint_device *dev)
+{
+    dev->aodev.rc = 0;
+    dev->aodev.callback(egc, &dev->aodev);
+}
+
+const libxl__checkpoint_device_instance_ops colo_save_device_blktap2_disk = {
+    .kind = LIBXL__DEVICE_KIND_VBD,
+    .setup = blktap2_colo_setup,
+    .teardown = blktap2_colo_teardown,
+};
+
+
/*
* Local variables:
* mode: C
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 16/18] setup and control colo-agent for primary vm
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (14 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 15/18] libxl/colo: setup and control disk replication for blktap2 backends Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 17/18] setup and control colo-agent for secondary vm Wen Congyang
` (3 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Yang Hongyang, Lai Jiangshan
This patch adds the machinery required for protecting a primary vm's
network device state. This patch comprises three parts:
1. Hotplug scripts: The colo-agent-setup script is responsible for
setting up and tearing down the necessary infrastructure required
for COLO agent. This script should be invoked by libxl for each
of the guest's network interfaces, when starting or stopping COLO.
The COLO agent compares the packets from the primary vm and the
secondary vm, and triggers a new checkpoint if they are
different.
Apart from returning success/failure indication via the usual hotplug
entries in xenstore, this script also writes to xenstore, the name of
the ifb device to be used to control COLO agent.
The script relies on libnl3 command line utilities to perform various
setup/teardown functions. The script is confined to Linux platforms only
since NetBSD does not seem to have libnl3.
2. Checkpoint network device: Implements the interfaces required by the
checkpoint abstract device layer. A note about the implementation:
a) setup() and teardown() are called for each vif attached to the
primary vm.
During setup(), the hotplug script is called to setup COLO agent for
given vif. The script does the following things:
i) choose two available IFB devices from system(called IFB_PRIMARY,
and IFB_SECONDARY), and set up the colo qdisc on these two ifb
devices.
ii) copy and forward the egress traffic to the FORWARD device
iii) redirect vif egress traffic to the IFB_PRIMARY device
iv) redirect FORWARD device egress traffic to the IFB_SECONDARY device
During teardown(), the hotplug scripts are called again for each
vif. The script does the following things:
i) remove the vif->IFB_PRIMARY traffic redirection
ii) remove the FORWARD->IFB_SECONDARY traffic redirection
iii) remove the vif->FORWARD traffic forwarding.
iv) release the two ifb devices and the colo qdisc associated with
them
b) The checkpoint callbacks are not for each vif. So we implement it in
libxl_colo_save.c
3. colo-tc: a simple tc-like command that only handles the colo qdisc.
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
.gitignore | 1 +
tools/hotplug/Linux/Makefile | 2 +
tools/hotplug/Linux/colo-agent-setup | 210 ++++++++++++
tools/hotplug/Linux/remus-netbuf-setup | 45 +--
tools/hotplug/Linux/xen-network-ft.sh | 102 ++++++
tools/libxl/Makefile | 8 +-
tools/libxl/colo-tc.c | 589 +++++++++++++++++++++++++++++++++
tools/libxl/libxl.c | 6 +
tools/libxl/libxl_colo_nic.c | 289 ++++++++++++++++
tools/libxl/libxl_colo_save.c | 177 +++++++++-
tools/libxl/libxl_internal.h | 8 +
tools/libxl/libxl_types.idl | 1 +
tools/libxl/xl_cmdimpl.c | 3 +
13 files changed, 1390 insertions(+), 51 deletions(-)
create mode 100755 tools/hotplug/Linux/colo-agent-setup
create mode 100644 tools/hotplug/Linux/xen-network-ft.sh
create mode 100644 tools/libxl/colo-tc.c
create mode 100644 tools/libxl/libxl_colo_nic.c
diff --git a/.gitignore b/.gitignore
index b24e905..137de89 100644
--- a/.gitignore
+++ b/.gitignore
@@ -286,6 +286,7 @@ tools/libxl/*.pyc
tools/libxl/libxl-save-helper
tools/libxl/test_timedereg
tools/libxl/xen-init-dom0
+tools/libxl/colo-tc
tools/blktap2/control/tap-ctl
tools/firmware/etherboot/eb-roms.h
tools/firmware/etherboot/gpxe-git-snapshot.tar.gz
diff --git a/tools/hotplug/Linux/Makefile b/tools/hotplug/Linux/Makefile
index 1706c05..d8bd6d4 100644
--- a/tools/hotplug/Linux/Makefile
+++ b/tools/hotplug/Linux/Makefile
@@ -26,12 +26,14 @@ XEN_SCRIPTS += vscsi
XEN_SCRIPTS += block-iscsi
XEN_SCRIPTS += block-drbd-probe
XEN_SCRIPTS += $(XEN_SCRIPTS-y)
+XEN_SCRIPTS += colo-agent-setup
SUBDIRS-$(CONFIG_SYSTEMD) += systemd
XEN_SCRIPT_DATA = xen-script-common.sh locking.sh logging.sh
XEN_SCRIPT_DATA += xen-hotplug-common.sh xen-network-common.sh vif-common.sh
XEN_SCRIPT_DATA += block-common.sh
+XEN_SCRIPT_DATA += xen-network-ft.sh
UDEV_RULES_DIR = $(CONFIG_DIR)/udev
UDEV_RULES = xen-backend.rules $(UDEV_RULES-y)
diff --git a/tools/hotplug/Linux/colo-agent-setup b/tools/hotplug/Linux/colo-agent-setup
new file mode 100755
index 0000000..8cdc41a
--- /dev/null
+++ b/tools/hotplug/Linux/colo-agent-setup
@@ -0,0 +1,210 @@
+#! /bin/bash
+
+dir=$(dirname "$0")
+. "$dir/xen-hotplug-common.sh"
+. "$dir/hotplugpath.sh"
+. "$dir/xen-network-ft.sh"
+
+findCommand "$@"
+
+if [ "$command" != "setup" -a "$command" != "teardown" ]
+then
+ echo "Invalid command: $command"
+ log err "Invalid command: $command"
+ exit 1
+fi
+
+evalVariables "$@"
+
+: ${vifname:?}
+: ${XENBUS_PATH:?}
+: ${forwarddev:?}
+: ${mode:?}
+: ${vmid:?}
+
+if [ "$mode" != "primary" -a "$mode" != "secondary" ]
+then
+ echo "Invalid mode: $mode"
+ log err "Invalid mode: $mode"
+ exit 1
+fi
+
+# redirect input packets from src_nic to dst_nic
+function redirect_nic_traffic()
+{
+ local src_nic=$1
+ local dst_nic=$2
+
+ if ! tc qdisc add dev $src_nic ingress > /dev/null 2>&1
+ then
+ fatal "Unable to add ingress qdisc to nic $src_nic"
+ fi
+
+ if ! tc filter add dev $src_nic parent ffff: protocol ip prio 10 \
+ u32 match u32 0 0 flowid 1:2 action mirred egress redirect dev \
+ $dst_nic > /dev/null 2>&1
+ then
+ fatal "Unable to redirect ip packets from $src_nic to $dst_nic"
+ fi
+
+ if ! tc filter add dev $src_nic parent ffff: protocol arp prio 11 \
+ u32 match u32 0 0 flowid 1:2 action mirred egress redirect dev \
+ $dst_nic > /dev/null 2>&1
+ then
+ fatal "Unable to redirect arp packets from $src_nic to $dst_nic"
+ fi
+}
+
+function stop_redirect_nic_traffic()
+{
+ local src_nic=$1
+
+ do_without_error tc filter del dev $src_nic parent ffff: protocol ip prio 10 u32
+ do_without_error tc filter del dev $src_nic parent ffff: protocol arp prio 11 u32
+ do_without_error tc qdisc del dev $src_nic ingress
+}
+
+# copy and forward input packets from src_nic to dst_nic
+function copy_and_forward_nic_traffic()
+{
+ local src_nic=$1
+ local dst_nic=$2
+
+ if ! tc qdisc add dev $src_nic root handle 1: prio > /dev/null 2>&1
+ then
+ fatal "Unable to add qdisc prio to nic $src_nic"
+ fi
+
+ if ! tc filter add dev $src_nic parent 1: protocol ip prio 10 \
+ u32 match u32 0 0 flowid 1:2 action mirred egress mirror dev \
+ $dst_nic > /dev/null 2>&1
+ then
+ fatal "Unable to copy and forward ip packets from $src_nic to $dst_nic"
+ fi
+
+ if ! tc filter add dev $src_nic parent 1: protocol arp prio 11 \
+ u32 match u32 0 0 flowid 1:2 action mirred egress mirror dev \
+ $dst_nic > /dev/null 2>&1
+ then
+ fatal "Unable to copy and forward arp packets from $src_nic to $dst_nic"
+ fi
+}
+
+function stop_copy_and_forward_nic_traffic()
+{
+ local src_nic=$1
+
+ do_without_error tc filter del dev $src_nic parent 1: protocol ip prio 10 u32
+ do_without_error tc filter del dev $src_nic parent 1: protocol arp prio 11 u32
+ do_without_error tc qdisc del dev $src_nic root handle 1: prio
+}
+
+function teardown_ifb()
+{
+ local ifb=$1
+
+ if [ -z "$ifb" ]
+ then
+ return
+ fi
+
+ do_without_error ip link set dev "$ifb" down
+ do_without_error tc qdisc del dev "$ifb" root handle 1: colo
+}
+
+function setup_primary()
+{
+ $LIBEXEC_BIN/colo-tc qdisc add dev $IFB_PRIMARY root handle 1: colo \
+ dev $IFB_SECONDARY primary vmid $vmid
+ $LIBEXEC_BIN/colo-tc qdisc add dev $IFB_SECONDARY root handle 1: colo \
+ dev $IFB_PRIMARY secondary vmid $vmid
+
+ redirect_nic_traffic $forwarddev $IFB_SECONDARY
+ copy_and_forward_nic_traffic $vifname $forwarddev
+ redirect_nic_traffic $vifname $IFB_PRIMARY
+ if ! ifconfig $forwarddev promisc
+ then
+ fatal "device $forwarddev cannot enter promiscuous mode"
+ fi
+}
+
+function teardown_primary()
+{
+ local ifb=
+ # The ifb names were stored under $XENBUS_PATH by setup_ifb during setup.
+ if xenstore-exists "$XENBUS_PATH/ifb_primary"
+ then
+ ifb=`xenstore-read "$XENBUS_PATH/ifb_primary" 2>/dev/null || true`
+ IFB_PRIMARY=$ifb
+ teardown_ifb "$ifb"
+ fi
+
+ if xenstore-exists "$XENBUS_PATH/ifb_secondary"
+ then
+ ifb=`xenstore-read "$XENBUS_PATH/ifb_secondary" 2>/dev/null || true`
+ IFB_SECONDARY=$ifb
+ teardown_ifb "$ifb"
+ fi
+
+ stop_redirect_nic_traffic $forwarddev
+ stop_redirect_nic_traffic $vifname
+ stop_copy_and_forward_nic_traffic $vifname
+
+ do_without_error ifconfig $forwarddev -promisc
+}
+
+function setup_secondary()
+{
+ redirect_nic_traffic $forwarddev $vifname
+ redirect_nic_traffic $vifname $forwarddev
+
+ if ! ifconfig $forwarddev promisc
+ then
+ fatal "device $forwarddev cannot enter promiscuous mode"
+ fi
+}
+
+function teardown_secondary()
+{
+ stop_redirect_nic_traffic $vifname
+ stop_redirect_nic_traffic $forwarddev
+
+ do_without_error ifconfig $forwarddev -promisc
+}
+
+case "$command" in
+ setup)
+ if [ "$mode" = "primary" ]
+ then
+ claim_lock "pickifb"
+ setup_ifb $vifname ifb_primary
+ IFB_PRIMARY=$ifb
+ setup_ifb $vifname ifb_secondary
+ IFB_SECONDARY=$ifb
+ setup_primary
+ release_lock "pickifb"
+ else
+ setup_secondary
+ fi
+
+ success
+ ;;
+ teardown)
+ if [ "$mode" = "primary" ]
+ then
+ teardown_primary
+ else
+ teardown_secondary
+ fi
+ ;;
+esac
+
+if [ "$mode" = "primary" ]
+then
+ log debug "Successful colo-agent-setup $command for $vifname." \
+ " ifb_primary: $IFB_PRIMARY, ifb_secondary: $IFB_SECONDARY, " \
+ "forwarddev: $forwarddev."
+else
+ log debug "Successful colo-agent-setup $command for $vifname."\
+ " forwarddev $forwarddev."
+fi
diff --git a/tools/hotplug/Linux/remus-netbuf-setup b/tools/hotplug/Linux/remus-netbuf-setup
index 87dfa69..9391d1e 100644
--- a/tools/hotplug/Linux/remus-netbuf-setup
+++ b/tools/hotplug/Linux/remus-netbuf-setup
@@ -76,6 +76,7 @@
#specific setup code such as renaming.
dir=$(dirname "$0")
. "$dir/xen-hotplug-common.sh"
+. "$dir/xen-network-ft.sh"
findCommand "$@"
@@ -116,47 +117,6 @@ check_modules() {
done
}
-#return 0 if the ifb is free
-check_ifb() {
- local installed=`nl-qdisc-list -d $1`
- [ -n "$installed" ] && return 1
-
- for domid in `xenstore-list "/local/domain" 2>/dev/null || true`
- do
- [ $domid -eq 0 ] && continue
- xenstore-exists "/libxl/$domid/remus/netbuf" || continue
- for devid in `xenstore-list "/libxl/$domid/remus/netbuf" 2>/dev/null || true`
- do
- local path="/libxl/$domid/remus/netbuf/$devid/ifb"
- xenstore-exists $path || continue
- local ifb=`xenstore-read "$path" 2>/dev/null || true`
- [ "$ifb" = "$1" ] && return 1
- done
- done
-
- return 0
-}
-
-setup_ifb() {
-
- for ifb in `ifconfig -a -s|egrep ^ifb|cut -d ' ' -f1`
- do
- check_ifb "$ifb" || continue
- REMUS_IFB="$ifb"
- break
- done
-
- if [ -z "$REMUS_IFB" ]
- then
- fatal "Unable to find a free ifb device for $vifname"
- fi
-
- #not using xenstore_write that automatically exits on error
- #because we need to cleanup
- xenstore_write "$XENBUS_PATH/ifb" "$REMUS_IFB"
- do_or_die ip link set dev "$REMUS_IFB" up
-}
-
redirect_vif_traffic() {
local vif=$1
local ifb=$2
@@ -215,7 +175,8 @@ case "$command" in
check_modules
claim_lock "pickifb"
- setup_ifb
+ setup_ifb $vifname ifb
+ REMUS_IFB=$ifb
redirect_vif_traffic "$vifname" "$REMUS_IFB"
add_plug_qdisc "$vifname" "$REMUS_IFB"
release_lock "pickifb"
diff --git a/tools/hotplug/Linux/xen-network-ft.sh b/tools/hotplug/Linux/xen-network-ft.sh
new file mode 100644
index 0000000..9c642e4
--- /dev/null
+++ b/tools/hotplug/Linux/xen-network-ft.sh
@@ -0,0 +1,102 @@
+#
+# Copyright (C) 2014 FUJITSU LIMITED
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of version 2.1 of the GNU Lesser General Public
+# License as published by the Free Software Foundation.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+
+# ifb is stored in /libxl/<domid>/<path>/<devid>/<file>
+PATH_LIST=(
+ remus/netbuf
+ colo_agent
+ colo_agent
+)
+
+FILE_LIST=(
+ ifb
+ ifb_primary
+ ifb_secondary
+)
+
+# check_one_ifb_file $domid $path $file $ifb_name; returns 1 if $ifb_name is in use
+function check_one_ifb_file()
+{
+ local domid=$1
+ local path=$2
+ local file=$3
+ local full_path=
+ local ifb=
+ local wanted=$4
+ for devid in `xenstore-list "/libxl/$domid/$path" 2>/dev/null || true`
+ do
+ full_path="/libxl/$domid/$path/$devid/$file"
+ xenstore-exists $full_path || continue
+ ifb=`xenstore-read "$full_path" 2>/dev/null || true`
+ [ "$ifb" = "$wanted" ] && return 1
+ done
+
+ return 0
+}
+
+# check_ifb $ifb_name; return 0 if the ifb is free
+function check_ifb()
+{
+ local installed=`nl-qdisc-list -d $1`
+ local path=
+ local file=
+ local -i index=0
+
+ [ -n "$installed" ] && return 1
+
+ for domid in `xenstore-list "/local/domain" 2>/dev/null || true`
+ do
+ [ $domid -eq 0 ] && continue
+
+ index=0
+ for path in "${PATH_LIST[@]}"; do
+ # PATH_LIST[i] pairs with FILE_LIST[i]: read the file before advancing
+ file=${FILE_LIST[index]}
+ index=$((index + 1))
+ xenstore-exists "/libxl/$domid/$path" || continue
+ check_one_ifb_file $domid $path $file "$1" || return 1
+ done
+ done
+
+ return 0
+}
+
+# setup_ifb $nic_name $file_name
+# Note:
+# 1. The caller should acquire the lock pickifb
+# 2. ifb name will be stored in $XENBUS_PATH/$file_name
+function setup_ifb()
+{
+ local nic_name=$1
+ local file_name=$2
+ local found=0
+
+ for ifb in `ifconfig -a -s|egrep ^ifb|cut -d ' ' -f1`
+ do
+ check_ifb "$ifb" || continue
+ found=1
+ break
+ done
+
+ if [ $found -eq 0 ]
+ then
+ fatal "Unable to find a free ifb device for $nic_name"
+ fi
+
+ xenstore_write "$XENBUS_PATH/$file_name" "$ifb"
+ do_or_die ip link set dev "$ifb" up
+}
diff --git a/tools/libxl/Makefile b/tools/libxl/Makefile
index ed46af3..1d2d8d9 100644
--- a/tools/libxl/Makefile
+++ b/tools/libxl/Makefile
@@ -57,7 +57,7 @@ LIBXL_OBJS-y += libxl_nonetbuffer.o
endif
LIBXL_OBJS-y += libxl_remus.o libxl_checkpoint_device.o libxl_remus_disk_drbd.o
-LIBXL_OBJS-y += libxl_colo_restore.o libxl_colo_save.o
+LIBXL_OBJS-y += libxl_colo_restore.o libxl_colo_save.o libxl_colo_nic.o
LIBXL_OBJS-$(CONFIG_X86) += libxl_cpuid.o libxl_x86.o libxl_psr.o
LIBXL_OBJS-$(CONFIG_ARM) += libxl_nocpuid.o libxl_arm.o
@@ -126,7 +126,7 @@ LIBXLU_OBJS = libxlu_cfg_y.o libxlu_cfg_l.o libxlu_cfg.o \
libxlu_disk_l.o libxlu_disk.o libxlu_vif.o libxlu_pci.o
$(LIBXLU_OBJS): CFLAGS += $(CFLAGS_libxenctrl) # For xentoollog.h
-CLIENTS = xl testidl libxl-save-helper xen-init-dom0
+CLIENTS = xl testidl libxl-save-helper xen-init-dom0 colo-tc
CFLAGS_XL += $(CFLAGS_libxenlight)
CFLAGS_XL += -Wshadow
@@ -249,6 +249,9 @@ libxl-save-helper: $(SAVE_HELPER_OBJS) libxenlight.so
testidl: testidl.o libxlutil.so libxenlight.so
$(CC) $(LDFLAGS) -o $@ testidl.o libxlutil.so $(LDLIBS_libxenlight) $(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
+colo-tc: colo-tc.o
+ $(CC) $(LDFLAGS) -o $@ colo-tc.o
+
.PHONY: install
install: all
$(INSTALL_DIR) $(DESTDIR)$(SBINDIR)
@@ -259,6 +262,7 @@ install: all
$(INSTALL_PROG) xl $(DESTDIR)$(SBINDIR)
$(INSTALL_PROG) xen-init-dom0 $(DESTDIR)$(LIBEXEC_BIN)
$(INSTALL_PROG) libxl-save-helper $(DESTDIR)$(LIBEXEC_BIN)
+ $(INSTALL_PROG) colo-tc $(DESTDIR)$(LIBEXEC_BIN)
$(INSTALL_SHLIB) libxenlight.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)
$(SYMLINK_SHLIB) libxenlight.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libxenlight.so.$(MAJOR)
$(SYMLINK_SHLIB) libxenlight.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libxenlight.so
diff --git a/tools/libxl/colo-tc.c b/tools/libxl/colo-tc.c
new file mode 100644
index 0000000..76093db
--- /dev/null
+++ b/tools/libxl/colo-tc.c
@@ -0,0 +1,589 @@
+/*
+ * Copyright (C) 2014 FUJITSU LIMITED
+ * Author: Wen Congyang <wency@cn.fujitsu.com>
+ *
+ * Almost all codes are copied from iproute.
+ *
+ * colo-agent introduces a new qdisc colo, and needs some parameter.
+ * tc only supports new qdisc without parameter, so we introduce
+ * a new simple command to support this new qdisc.
+ *
+ * The licenses of iproute is GPLv2 or later.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/socket.h>
+#include <linux/socket.h>
+#include <linux/rtnetlink.h>
+#include <linux/pkt_sched.h>
+#include <net/if.h>
+#include <errno.h>
+#include <unistd.h>
+#include <time.h>
+
+#define TCA_BUF_MAX (64*1024)
+#define NEXT_ARG() \
+ do { \
+ argv++; \
+ if (--argc <= 0) { \
+ fprintf(stderr, "Command line is not complete." \
+ " Try option \"help\"\n"); \
+ return -1; \
+ } \
+ } while(0)
+
+enum {
+ TCA_COLO_UNSPEC,
+ TCA_COLO_DEV_IDX,
+ TCA_COLO_FLAGS,
+ TCA_COLO_VM_IDX,
+ __TCA_COLO_MAX,
+};
+
+struct colo_idx {
+ uint32_t this_idx;
+ uint32_t other_idx;
+};
+
+/* flags */
+#define IS_PRIMARY (1 << 0)
+
+
+struct rtnl_handle
+{
+ int fd;
+ struct sockaddr_nl local;
+ struct sockaddr_nl peer;
+ __u32 seq;
+};
+
+#define NLMSG_TAIL(nmsg) \
+ ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
+
+static int rtnl_open(struct rtnl_handle *rth, unsigned subscriptions)
+{
+ socklen_t addr_len;
+ int sndbuf = 32768;
+ int rcvbuf = 1024 * 1024;
+
+ memset(rth, 0, sizeof(*rth));
+
+ rth->fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE);
+ if (rth->fd < 0) {
+ perror("Cannot open netlink socket");
+ return -1;
+ }
+
+ if (setsockopt(rth->fd, SOL_SOCKET, SO_SNDBUF, &sndbuf,
+ sizeof(sndbuf)) < 0) {
+ perror("SO_SNDBUF");
+ return -1;
+ }
+
+ if (setsockopt(rth->fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf,
+ sizeof(rcvbuf)) < 0) {
+ perror("SO_RCVBUF");
+ return -1;
+ }
+
+ memset(&rth->local, 0, sizeof(rth->local));
+ rth->local.nl_family = AF_NETLINK;
+ rth->local.nl_groups = subscriptions;
+ if (bind(rth->fd, (struct sockaddr*)&rth->local, sizeof(rth->local)) < 0) {
+ perror("Cannot bind netlink socket");
+ return -1;
+ }
+
+ addr_len = sizeof(rth->local);
+ if (getsockname(rth->fd, (struct sockaddr*)&rth->local, &addr_len) < 0) {
+ perror("Cannot getsockname");
+ return -1;
+ }
+ if (addr_len != sizeof(rth->local)) {
+ fprintf(stderr, "Wrong address length %d\n", addr_len);
+ return -1;
+ }
+ if (rth->local.nl_family != AF_NETLINK) {
+ fprintf(stderr, "Wrong address family %d\n", rth->local.nl_family);
+ return -1;
+ }
+
+ rth->seq = time(NULL);
+ return 0;
+}
+
+static int rtnl_talk(struct rtnl_handle *rtnl, struct nlmsghdr *n, pid_t peer,
+ unsigned groups, struct nlmsghdr *answer)
+{
+ int status;
+ unsigned seq;
+ struct nlmsghdr *h;
+ struct sockaddr_nl nladdr;
+ struct iovec iov = {
+ .iov_base = (void*) n,
+ .iov_len = n->nlmsg_len
+ };
+ struct msghdr msg = {
+ .msg_name = &nladdr,
+ .msg_namelen = sizeof(nladdr),
+ .msg_iov = &iov,
+ .msg_iovlen = 1,
+ };
+ char buf[16384];
+
+ memset(&nladdr, 0, sizeof(nladdr));
+ nladdr.nl_family = AF_NETLINK;
+ nladdr.nl_pid = peer;
+ nladdr.nl_groups = groups;
+
+ n->nlmsg_seq = seq = ++rtnl->seq;
+
+ if (answer == NULL)
+ n->nlmsg_flags |= NLM_F_ACK;
+
+ status = sendmsg(rtnl->fd, &msg, 0);
+
+ if (status < 0) {
+ perror("Cannot talk to rtnetlink");
+ return -1;
+ }
+
+ memset(buf,0,sizeof(buf));
+
+ iov.iov_base = buf;
+
+ while (1) {
+ iov.iov_len = sizeof(buf);
+ status = recvmsg(rtnl->fd, &msg, 0);
+
+ if (status < 0) {
+ if (errno == EINTR || errno == EAGAIN)
+ continue;
+ fprintf(stderr, "netlink receive error %s (%d)\n",
+ strerror(errno), errno);
+ return -1;
+ }
+ if (status == 0) {
+ fprintf(stderr, "EOF on netlink\n");
+ return -1;
+ }
+ if (msg.msg_namelen != sizeof(nladdr)) {
+ fprintf(stderr, "sender address length == %d\n", msg.msg_namelen);
+ exit(1);
+ }
+ for (h = (struct nlmsghdr*)buf; status >= sizeof(*h); ) {
+ int len = h->nlmsg_len;
+ int l = len - sizeof(*h);
+
+ if (l < 0 || len>status) {
+ if (msg.msg_flags & MSG_TRUNC) {
+ fprintf(stderr, "Truncated message\n");
+ return -1;
+ }
+ fprintf(stderr, "!!!malformed message: len=%d\n", len);
+ exit(1);
+ }
+
+ if (nladdr.nl_pid != peer ||
+ h->nlmsg_pid != rtnl->local.nl_pid ||
+ h->nlmsg_seq != seq) {
+ /* Don't forget to skip that message. */
+ status -= NLMSG_ALIGN(len);
+ h = (struct nlmsghdr*)((char*)h + NLMSG_ALIGN(len));
+ continue;
+ }
+
+ if (h->nlmsg_type == NLMSG_ERROR) {
+ struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(h);
+ if (l < sizeof(struct nlmsgerr)) {
+ fprintf(stderr, "ERROR truncated\n");
+ } else {
+ if (!err->error) {
+ if (answer)
+ memcpy(answer, h, h->nlmsg_len);
+ return 0;
+ }
+
+ fprintf(stderr, "RTNETLINK answers: %s\n", strerror(-err->error));
+ errno = -err->error;
+ }
+ return -1;
+ }
+ if (answer) {
+ memcpy(answer, h, h->nlmsg_len);
+ return 0;
+ }
+
+ fprintf(stderr, "Unexpected reply!!!\n");
+
+ status -= NLMSG_ALIGN(len);
+ h = (struct nlmsghdr*)((char*)h + NLMSG_ALIGN(len));
+ }
+ if (msg.msg_flags & MSG_TRUNC) {
+ fprintf(stderr, "Message truncated\n");
+ continue;
+ }
+ if (status) {
+ fprintf(stderr, "!!!Remnant of size %d\n", status);
+ exit(1);
+ }
+ }
+}
+
+static void rtnl_close(struct rtnl_handle *rth)
+{
+ if (rth->fd >= 0) {
+ close(rth->fd);
+ rth->fd = -1;
+ }
+}
+
+static int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data,
+ int alen)
+{
+ int len = RTA_LENGTH(alen);
+ struct rtattr *rta;
+
+ if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len) > maxlen) {
+ fprintf(stderr, "addattr_l ERROR: message exceeded bound of %d\n",
+ maxlen);
+ return -1;
+ }
+ rta = NLMSG_TAIL(n);
+ rta->rta_type = type;
+ rta->rta_len = len;
+ memcpy(RTA_DATA(rta), data, alen);
+ n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len);
+ return 0;
+}
+
+static void duparg(const char *key, const char *arg)
+{
+ fprintf(stderr, "Error: duplicate \"%s\": \"%s\" is the second value.\n",
+ key, arg);
+ exit(1);
+}
+
+static void invarg(const char *msg, const char *arg)
+{
+ fprintf(stderr, "Error: argument \"%s\" is wrong: %s\n", arg, msg);
+ exit(1);
+}
+
+static int usage(void)
+{
+ fprintf(stderr, "Usage: tc qdisc [ add | del | replace | change ] dev STRING\n");
+ fprintf(stderr, " [ handle QHANDLE ] [ root | parent CLASSID ]\n");
+ fprintf(stderr, " QDISC_KIND [ dev STRING ]\n");
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Where:\n");
+ fprintf(stderr, "QDISC_KIND := { primary | secondary. }\n");
+ return -1;
+}
+
+struct rtnl_handle rth;
+
+static int get_qdisc_handle(__u32 *h, const char *str)
+{
+ __u32 maj;
+ char *p;
+
+ maj = TC_H_UNSPEC;
+ if (strcmp(str, "none") == 0)
+ goto ok;
+ maj = strtoul(str, &p, 16);
+ if (p == str)
+ return -1;
+ maj <<= 16;
+ if (*p != ':' && *p!=0)
+ return -1;
+ok:
+ *h = maj;
+ return 0;
+}
+
+static int get_tc_classid(__u32 *h, const char *str)
+{
+ __u32 maj, min;
+ char *p;
+
+ maj = TC_H_ROOT;
+ if (strcmp(str, "root") == 0)
+ goto ok;
+ maj = TC_H_UNSPEC;
+ if (strcmp(str, "none") == 0)
+ goto ok;
+ maj = strtoul(str, &p, 16);
+ if (p == str) {
+ maj = 0;
+ if (*p != ':')
+ return -1;
+ }
+ if (*p == ':') {
+ if (maj >= (1<<16))
+ return -1;
+ maj <<= 16;
+ str = p+1;
+ min = strtoul(str, &p, 16);
+ if (*p != 0)
+ return -1;
+ if (min >= (1<<16))
+ return -1;
+ maj |= min;
+ } else if (*p != 0)
+ return -1;
+
+ok:
+ *h = maj;
+ return 0;
+}
+
+static uint32_t get_idx(const char *name)
+{
+ uint32_t idx;
+
+ idx = if_nametoindex(name);
+ if (!idx)
+ fprintf(stderr, "Cannot find device \"%s\"\n", name);
+
+ return idx;
+}
+
+static int parse_opt(int argc, char **argv, struct nlmsghdr *n, int cmd, int this_idx)
+{
+ struct colo_idx idx;
+ struct rtattr *tail;
+ int is_primary, is_secondary;
+ uint32_t flags = 0;
+ uint32_t vmidx = 0;
+ char *p;
+ /* Options are parsed only for RTM_NEWQDISC; other commands return 0. */
+ if (cmd != RTM_NEWQDISC)
+ return 0;
+
+ is_primary = 0;
+ is_secondary = 0;
+ memset(&idx, 0, sizeof(idx));
+
+ while (argc > 0) {
+ if (strcmp(*argv, "dev") ==0) {
+ NEXT_ARG();
+ if (idx.other_idx)
+ duparg("dev", *argv);
+
+ idx.other_idx = get_idx(*argv);
+ if (!idx.other_idx)
+ return -1;
+
+ idx.this_idx = this_idx;
+ if (idx.this_idx == idx.other_idx) {
+ fprintf(stderr, "Cannot use the same device\n");
+ return -1;
+ }
+ } else if (strcmp(*argv, "primary") == 0) {
+ if (is_secondary) {
+ fprintf(stderr, "\"primary\" conflicts with \"secondary\"\n");
+ return -1;
+ }
+
+ is_primary = 1;
+ } else if (strcmp(*argv, "secondary") == 0) {
+ if (is_primary) {
+ fprintf(stderr, "\"secondary\" conflicts with \"primary\"\n");
+ return -1;
+ }
+
+ is_secondary = 1;
+ } else if (strcmp(*argv, "vmid") == 0) {
+ NEXT_ARG();
+ if (vmidx)
+ duparg("vmid", *argv);
+
+ vmidx = strtoul(*argv, &p, 10);
+ if (*p != '\0' || !vmidx) {
+ fprintf(stderr, "invalid vmid value %s\n", *argv);
+ return -1;
+ }
+ } else {
+ fprintf(stderr, "unsupported option \"%s\"\n", *argv);
+ return -1;
+ }
+ argc--;
+ argv++;
+ }
+
+ if (!idx.other_idx) {
+ fprintf(stderr, "missing option dev\n");
+ return -1;
+ }
+
+ if (!is_primary && !is_secondary) {
+ fprintf(stderr, "missing option primary or secondary\n");
+ return -1;
+ }
+
+ if (!vmidx) {
+ fprintf(stderr, "missing option vmid\n");
+ return -1;
+ }
+
+ if (is_primary)
+ flags |= IS_PRIMARY;
+ /* Nest the colo attributes inside TCA_OPTIONS, then fix up its length. */
+ tail = NLMSG_TAIL(n);
+ addattr_l(n, 1024, TCA_OPTIONS, NULL, 0);
+ addattr_l(n, 1024, TCA_COLO_DEV_IDX, &idx, sizeof(idx));
+ addattr_l(n, 1024, TCA_COLO_FLAGS, &flags, sizeof(flags));
+ addattr_l(n, 1024, TCA_COLO_VM_IDX, &vmidx, sizeof(vmidx));
+ tail->rta_len = (void *) NLMSG_TAIL(n) - (void *) tail;
+ return 0;
+}
+
+static int tc_qdisc_modify(int cmd, unsigned flags, int argc, char **argv)
+{
+ struct {
+ struct nlmsghdr n;
+ struct tcmsg t;
+ char buff[TCA_BUF_MAX];
+ } req;
+ char k[16];
+ uint32_t handle = 0, idx = 0;
+
+ memset(&req, 0, sizeof(req));
+ memset(k, 0, sizeof(k));
+
+ req.n.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
+ req.n.nlmsg_flags = NLM_F_REQUEST|flags;
+ req.n.nlmsg_type = cmd;
+ req.t.tcm_family = AF_UNSPEC;
+
+ while (argc > 0) {
+ if (strcmp(*argv, "dev") == 0) {
+ NEXT_ARG();
+ if (req.t.tcm_ifindex)
+ duparg("dev", *argv);
+
+ idx = get_idx(*argv);
+ if (!idx)
+ return -1;
+ req.t.tcm_ifindex = idx;
+ } else if (strcmp(*argv, "handle") == 0) {
+ NEXT_ARG();
+ if (req.t.tcm_handle)
+ duparg("handle", *argv);
+ if (get_qdisc_handle(&handle, *argv))
+ invarg(*argv, "invalid qdisc ID");
+ req.t.tcm_handle = handle;
+ } else if (strcmp(*argv, "root") == 0) {
+ if (req.t.tcm_parent) {
+ fprintf(stderr, "Error: \"root\" is duplicate parent ID\n");
+ return -1;
+ }
+ req.t.tcm_parent = TC_H_ROOT;
+ } else if (strcmp(*argv, "parent") == 0) {
+ NEXT_ARG();
+ if (req.t.tcm_parent)
+ duparg("parent", *argv);
+ if (get_tc_classid(&handle, *argv))
+ invarg(*argv, "invalid parent ID");
+ req.t.tcm_parent = handle;
+ } else if (strcmp(*argv, "colo") == 0) {
+ strncpy(k, *argv, sizeof(k) - 1);
+ argc--;
+ argv++;
+ break;
+ } else if (strcmp(*argv, "help") == 0){
+ usage();
+ return 0;
+ } else {
+ fprintf(stderr, "unsupported qdisc %s\n", *argv);
+ return -1;
+ }
+ argc--;
+ argv++;
+ }
+
+ if (!k[0]) {
+ fprintf(stderr, "no qdisc is specified\n");
+ return -1;
+ }
+
+ addattr_l(&req.n, sizeof(req), TCA_KIND, k, strlen(k)+1);
+ if (parse_opt(argc, argv, &req.n, cmd, idx))
+ return -1;
+
+ if (rtnl_talk(&rth, &req.n, 0, 0, NULL) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int matches(const char *cmd, const char *pattern)
+{
+ int len = strlen(cmd);
+ if (len > strlen(pattern))
+ return -1;
+ return memcmp(pattern, cmd, len);
+}
+
+static int do_qdisc(int argc, char *argv[])
+{
+ if (matches(*argv, "add") == 0)
+ return tc_qdisc_modify(RTM_NEWQDISC, NLM_F_EXCL|NLM_F_CREATE, argc-1, argv+1);
+ if (matches(*argv, "change") == 0)
+ return tc_qdisc_modify(RTM_NEWQDISC, 0, argc-1, argv+1);
+ if (matches(*argv, "replace") == 0)
+ return tc_qdisc_modify(RTM_NEWQDISC, NLM_F_CREATE|NLM_F_REPLACE, argc-1, argv+1);
+ if (matches(*argv, "link") == 0)
+ return tc_qdisc_modify(RTM_NEWQDISC, NLM_F_REPLACE, argc-1, argv+1);
+ if (matches(*argv, "delete") == 0)
+ return tc_qdisc_modify(RTM_DELQDISC, 0, argc-1, argv+1);
+
+ fprintf(stderr, "Command \"%s\" is unknown, try \"tc qdisc help\".\n", *argv);
+ return -1;
+}
+
+int main(int argc, char *argv[])
+{
+ int ret;
+
+ if (rtnl_open(&rth, 0) < 0) {
+ fprintf(stderr, "Cannot open rtnetlink\n");
+ exit(1);
+ }
+
+ /*
+ * Expect "qdisc" followed by at least a sub-command. argc is checked
+ * first because argv[1] is NULL when no argument is given, and
+ * matches() would pass it to strlen().
+ */
+ if (argc < 3 || matches(argv[1], "qdisc")) {
+ usage();
+ exit(1);
+ }
+
+ argc -= 2;
+ argv += 2;
+
+ ret = do_qdisc(argc, argv);
+
+ rtnl_close(&rth);
+
+ if (ret)
+ return 1;
+
+ return 0;
+}
diff --git a/tools/libxl/libxl.c b/tools/libxl/libxl.c
index 822f1f3..f8c0bab 100644
--- a/tools/libxl/libxl.c
+++ b/tools/libxl/libxl.c
@@ -3371,6 +3371,11 @@ void libxl__device_nic_add(libxl__egc *egc, uint32_t domid,
flexarray_append(back, nic->ifname);
}
+ if (nic->forwarddev) {
+ flexarray_append(back, "forwarddev");
+ flexarray_append(back, nic->forwarddev);
+ }
+
flexarray_append(back, "mac");
flexarray_append(back,libxl__sprintf(gc,
LIBXL_MAC_FMT, LIBXL_MAC_BYTES(nic->mac)));
@@ -3494,6 +3499,7 @@ static int libxl__device_nic_from_xs_be(libxl__gc *gc,
nic->ip = READ_BACKEND(NOGC, "ip");
nic->bridge = READ_BACKEND(NOGC, "bridge");
nic->script = READ_BACKEND(NOGC, "script");
+ nic->forwarddev = READ_BACKEND(NOGC, "forwarddev");
/* vif_ioemu nics use the same xenstore entries as vif interfaces */
tmp = READ_BACKEND(gc, "type");
diff --git a/tools/libxl/libxl_colo_nic.c b/tools/libxl/libxl_colo_nic.c
new file mode 100644
index 0000000..0578973
--- /dev/null
+++ b/tools/libxl/libxl_colo_nic.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (C) 2014 FUJITSU LIMITED
+ * Author: Wen Congyang <wency@cn.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published
+ * by the Free Software Foundation; version 2.1 only. with the special
+ * exception on linking described in file LICENSE.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ */
+
+#include "libxl_osdeps.h" /* must come before any other headers */
+
+#include "libxl_internal.h"
+
+typedef struct libxl__colo_device_nic {
+ int devid;
+ const char *vif;
+} libxl__colo_device_nic;
+
+enum {
+ primary,
+ secondary,
+};
+
+
+/* ========== init() and cleanup() ========== */
+int init_subkind_colo_nic(libxl__checkpoint_devices_state *cds)
+{
+ return 0;
+}
+
+void cleanup_subkind_colo_nic(libxl__checkpoint_devices_state *cds)
+{
+}
+
+/* ========== helper functions ========== */
+static void colo_save_setup_script_cb(libxl__egc *egc,
+ libxl__async_exec_state *aes,
+ int status);
+static void colo_save_teardown_script_cb(libxl__egc *egc,
+ libxl__async_exec_state *aes,
+ int status);
+
+/*
+ * If the device has a vifname, then use that instead of
+ * the vifX.Y format.
+ * it must ONLY be used for remus because if driver domains
+ * were in use it would constitute a security vulnerability.
+ */
+static const char *get_vifname(libxl__checkpoint_device *dev,
+ const libxl_device_nic *nic)
+{
+ const char *vifname = NULL;
+ const char *path;
+ int rc;
+
+ STATE_AO_GC(dev->cds->ao);
+
+ /* Convenience aliases */
+ const uint32_t domid = dev->cds->domid;
+
+ path = GCSPRINTF("%s/backend/vif/%d/%d/vifname",
+ libxl__xs_get_dompath(gc, 0), domid, nic->devid);
+ rc = libxl__xs_read_checked(gc, XBT_NULL, path, &vifname);
+ if (!rc && !vifname) {
+ vifname = libxl__device_nic_devname(gc, domid,
+ nic->devid,
+ nic->nictype);
+ }
+
+ return vifname;
+}
+
+/*
+ * the script needs the following env & args
+ * $vifname
+ * $XENBUS_PATH (/libxl/<domid>/colo_agent/<devid>/)
+ * $forwarddev
+ * $mode(primary/secondary)
+ * $vmid
+ * setup/teardown as command line arg.
+ */
+static void setup_async_exec(libxl__checkpoint_device *dev, char *op, int side,
+ char *colo_agent_script)
+{
+ int arraysize, nr = 0;
+ char **env = NULL, **args = NULL;
+ libxl__colo_device_nic *colo_nic = dev->concrete_data;
+ libxl__checkpoint_devices_state *cds = dev->cds;
+ libxl__async_exec_state *aes = &dev->aodev.aes;
+ const libxl_device_nic *nic = dev->backend_dev;
+
+ STATE_AO_GC(cds->ao);
+
+ /* Convenience aliases */
+ const uint32_t domid = cds->domid;
+ const int devid = colo_nic->devid;
+ const char *const vif = colo_nic->vif;
+
+ arraysize = 11;
+ GCNEW_ARRAY(env, arraysize);
+ env[nr++] = "vifname";
+ env[nr++] = libxl__strdup(gc, vif);
+ env[nr++] = "XENBUS_PATH";
+ env[nr++] = GCSPRINTF("%s/colo_agent/%d",
+ libxl__xs_libxl_path(gc, domid), devid);
+ env[nr++] = "forwarddev";
+ env[nr++] = libxl__strdup(gc, nic->forwarddev);
+ env[nr++] = "mode";
+ if (side == primary)
+ env[nr++] = "primary";
+ else
+ env[nr++] = "secondary";
+ env[nr++] = "vmid";
+ env[nr++] = GCSPRINTF("%u", domid);
+ env[nr++] = NULL;
+ assert(nr == arraysize);
+
+ arraysize = 3; nr = 0;
+ GCNEW_ARRAY(args, arraysize);
+ args[nr++] = colo_agent_script;
+ args[nr++] = op;
+ args[nr++] = NULL;
+ assert(nr == arraysize);
+
+ aes->ao = dev->cds->ao;
+ aes->what = GCSPRINTF("%s %s", args[0], args[1]);
+ aes->env = env;
+ aes->args = args;
+ aes->timeout_ms = LIBXL_HOTPLUG_TIMEOUT * 1000;
+ aes->stdfds[0] = -1;
+ aes->stdfds[1] = -1;
+ aes->stdfds[2] = -1;
+
+ if (!strcmp(op, "teardown"))
+ aes->callback = colo_save_teardown_script_cb;
+ else
+ aes->callback = colo_save_setup_script_cb;
+}
+
+/* ========== setup() and teardown() ========== */
+static void colo_nic_setup(libxl__egc *egc, libxl__checkpoint_device *dev,
+ int side, char *colo_agent_script)
+{
+ int rc;
+ libxl__colo_device_nic *colo_nic;
+ const libxl_device_nic *nic = dev->backend_dev;
+
+ STATE_AO_GC(dev->cds->ao);
+
+ /*
+ * There is no subkind of nic devices, so nic ops always match
+ * nic devices; begin setting up the nic device.
+ */
+ dev->matched = 1;
+ /* The agent script is passed $forwarddev; fail early without it. */
+ if (!nic->forwarddev) {
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
+ GCNEW(colo_nic);
+ dev->concrete_data = colo_nic;
+ colo_nic->devid = nic->devid;
+ colo_nic->vif = get_vifname(dev, nic);
+ if (!colo_nic->vif) {
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
+ setup_async_exec(dev, "setup", side, colo_agent_script);
+ rc = libxl__async_exec_start(gc, &dev->aodev.aes);
+ if (rc)
+ goto out;
+ /* Script started; colo_save_setup_script_cb is set as its callback. */
+ return;
+
+out:
+ dev->aodev.rc = rc;
+ dev->aodev.callback(egc, &dev->aodev);
+}
+
+static void colo_save_setup_script_cb(libxl__egc *egc,
+ libxl__async_exec_state *aes,
+ int status)
+{
+ libxl__ao_device *aodev = CONTAINER_OF(aes, *aodev, aes);
+ libxl__checkpoint_device *dev = CONTAINER_OF(aodev, *dev, aodev);
+ libxl__colo_device_nic *colo_nic = dev->concrete_data;
+ libxl__checkpoint_devices_state *cds = dev->cds;
+ const char *out_path_base, *hotplug_error = NULL;
+ int rc;
+
+ STATE_AO_GC(cds->ao);
+
+ /* Convenience aliases */
+ const uint32_t domid = cds->domid;
+ const int devid = colo_nic->devid;
+ const char *const vif = colo_nic->vif;
+
+ out_path_base = GCSPRINTF("%s/colo_agent/%d",
+ libxl__xs_libxl_path(gc, domid), devid);
+
+ rc = libxl__xs_read_checked(gc, XBT_NULL,
+ GCSPRINTF("%s/hotplug-error", out_path_base),
+ &hotplug_error);
+ if (rc)
+ goto out;
+
+ if (hotplug_error) {
+ LOG(ERROR, "colo_agent script %s setup failed for vif %s: %s",
+ aes->args[0], vif, hotplug_error);
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
+ if (status) {
+ rc = ERROR_FAIL;
+ goto out;
+ }
+
+ rc = 0;
+
+out:
+ aodev->rc = rc;
+ aodev->callback(egc, aodev);
+}
+
+static void colo_nic_teardown(libxl__egc *egc, libxl__checkpoint_device *dev,
+ int side, char *colo_agent_script)
+{
+ int rc;
+ STATE_AO_GC(dev->cds->ao);
+
+ setup_async_exec(dev, "teardown", side, colo_agent_script);
+
+ rc = libxl__async_exec_start(gc, &dev->aodev.aes);
+ if (rc)
+ goto out;
+
+ return;
+
+out:
+ dev->aodev.rc = rc;
+ dev->aodev.callback(egc, &dev->aodev);
+}
+
+static void colo_save_teardown_script_cb(libxl__egc *egc,
+ libxl__async_exec_state *aes,
+ int status)
+{
+ int rc;
+ libxl__ao_device *aodev = CONTAINER_OF(aes, *aodev, aes);
+
+ if (status)
+ rc = ERROR_FAIL;
+ else
+ rc = 0;
+
+ aodev->rc = rc;
+ aodev->callback(egc, aodev);
+}
+
+/* ======== primary ======== */
+static void colo_nic_save_setup(libxl__egc *egc, libxl__checkpoint_device *dev)
+{
+ libxl__colo_save_state *css = CONTAINER_OF(dev->cds, *css, cds);
+
+ colo_nic_setup(egc, dev, primary, css->colo_agent_script);
+}
+
+static void colo_nic_save_teardown(libxl__egc *egc,
+ libxl__checkpoint_device *dev)
+{
+ libxl__colo_save_state *css = CONTAINER_OF(dev->cds, *css, cds);
+
+ colo_nic_teardown(egc, dev, primary, css->colo_agent_script);
+}
+
+const libxl__checkpoint_device_instance_ops colo_save_device_nic = {
+ .kind = LIBXL__DEVICE_KIND_VIF,
+ .setup = colo_nic_save_setup,
+ .teardown = colo_nic_save_teardown,
+};
diff --git a/tools/libxl/libxl_colo_save.c b/tools/libxl/libxl_colo_save.c
index 516b913..b9680a3 100644
--- a/tools/libxl/libxl_colo_save.c
+++ b/tools/libxl/libxl_colo_save.c
@@ -19,12 +19,162 @@
#include "libxl_colo.h"
extern const libxl__checkpoint_device_instance_ops colo_save_device_blktap2_disk;
+extern const libxl__checkpoint_device_instance_ops colo_save_device_nic;
static const libxl__checkpoint_device_instance_ops *colo_ops[] = {
&colo_save_device_blktap2_disk,
+ &colo_save_device_nic,
NULL,
};
+/* ================= colo-agent: setup, wait and teardown ================= */
+static void colo_start_new_checkpoint(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
+ int rc);
+static void colo_agent_async_wait_for_checkpoint(libxl__colo_save_state *css);
+static void colo_agent_async_call_done(libxl__egc *egc,
+ libxl__ev_child *child,
+ int pid,
+ int status);
+
+/* ioctl requests issued on css->vm_fd (the per-VM descriptor) */
+#define COMP_IOC_MAGIC 'k'
+#define COMP_IOCTWAIT _IO(COMP_IOC_MAGIC, 0)
+#define COMP_IOCTFLUSH _IO(COMP_IOC_MAGIC, 1)
+#define COMP_IOCTRESUME _IO(COMP_IOC_MAGIC, 2)
+
+/* ioctl requests issued on css->fd (the /dev/HA_compare descriptor) */
+#define COLO_IO 0x33
+#define COLO_CREATE_VM _IO(COLO_IO, 0x00)
+#define COLO_RELEASE_VM _IO(COLO_IO, 0x01)
+
+/* argument passed to COMP_IOCTWAIT; presumably milliseconds -- TODO confirm */
+#define COMP_IOCTWAIT_TIMEOUT 5000
+
+/*
+ * Open the colo-agent control device and register the domain with it.
+ *
+ * On success, css->fd holds the /dev/HA_compare descriptor, css->vm_fd
+ * holds the value returned by the COLO_CREATE_VM ioctl, and 0 is
+ * returned.  On failure ERROR_FAIL is returned, css->fd is negative and
+ * css->vm_fd is left untouched.
+ */
+static int colo_agent_setup(libxl__colo_save_state *css, int domid)
+{
+    int ret;
+
+    STATE_AO_GC(css->cds.ao);
+
+    css->fd = open("/dev/HA_compare", O_RDWR);
+    if (css->fd < 0) {
+        /* LOGE records errno, so the reason for the failure is logged */
+        LOGE(ERROR, "cannot open /dev/HA_compare");
+        return ERROR_FAIL;
+    }
+
+    ret = ioctl(css->fd, COLO_CREATE_VM, domid);
+    if (ret < 0) {
+        LOGE(ERROR, "cannot pass vmid to colo-agent");
+        goto out;
+    }
+
+    css->vm_fd = ret;
+
+    return 0;
+
+out:
+    close(css->fd);
+    css->fd = -1;
+    return ERROR_FAIL;
+}
+
+/*
+ * Issue COMP_IOCTFLUSH on the per-VM colo-agent descriptor.
+ * A failure cannot be returned to the caller (void), so it is logged
+ * with errno instead of being dropped silently.
+ */
+static void colo_agent_preresume(libxl__colo_save_state *css)
+{
+    STATE_AO_GC(css->cds.ao);
+
+    if (ioctl(css->vm_fd, COMP_IOCTFLUSH) < 0)
+        LOGE(WARN, "colo-agent COMP_IOCTFLUSH ioctl failed");
+}
+
+/*
+ * Issue COMP_IOCTRESUME on the per-VM colo-agent descriptor.
+ * A failure cannot be returned to the caller (void), so it is logged
+ * with errno instead of being dropped silently.
+ */
+static void colo_agent_postresume(libxl__colo_save_state *css)
+{
+    STATE_AO_GC(css->cds.ao);
+
+    if (ioctl(css->vm_fd, COMP_IOCTRESUME) < 0)
+        LOGE(WARN, "colo-agent COMP_IOCTRESUME ioctl failed");
+}
+
+/*
+ * Run func(css) in a forked child and have `callback` invoked through
+ * css->child.
+ *
+ * func is expected not to return; if it does, the child aborts.  If the
+ * fork itself fails, callback is invoked directly with pid -1 and a
+ * non-zero status (1).
+ */
+static void colo_agent_async_call(libxl__egc *egc,
+                                  libxl__colo_save_state *css,
+                                  void func(libxl__colo_save_state *),
+                                  libxl__ev_child_callback callback)
+{
+    int child_pid;
+
+    STATE_AO_GC(css->cds.ao);
+
+    child_pid = libxl__ev_child_fork(gc, &css->child, callback);
+
+    if (child_pid == -1) {
+        LOG(ERROR, "unable to fork");
+        callback(egc, &css->child, -1, 1);
+        return;
+    }
+
+    if (child_pid == 0) {
+        /* child */
+        func(css);
+        /* notreached */
+        abort();
+    }
+
+    /* parent: nothing more to do here */
+}
+
+/*
+ * Run colo_agent_async_wait_for_checkpoint() in a forked child via
+ * colo_agent_async_call(); colo_agent_async_call_done() is passed as
+ * the child callback.
+ */
+static void colo_agent_wait_for_checkpoint(libxl__egc *egc,
+ libxl__colo_save_state *css)
+{
+ colo_agent_async_call(egc, css,
+ colo_agent_async_wait_for_checkpoint,
+ colo_agent_async_call_done);
+}
+
+/*
+ * Issue COMP_IOCTWAIT on css->vm_fd and never return.
+ *
+ * Exit status: 0 if the ioctl succeeds or fails with ETIME, 1 on any
+ * other error.  The ioctl is reissued for as long as it fails with
+ * ERESTART.
+ *
+ * NOTE(review): EINTR is not retried and ends in exit status 1 --
+ * confirm this is intended.
+ */
+static void colo_agent_async_wait_for_checkpoint(libxl__colo_save_state *css)
+{
+    for (;;) {
+        if (ioctl(css->vm_fd, COMP_IOCTWAIT, COMP_IOCTWAIT_TIMEOUT) >= 0)
+            _exit(0);
+
+        if (errno != ERESTART)
+            break;
+    }
+
+    /* a timeout is reported as success, everything else as failure */
+    _exit(errno == ETIME ? 0 : 1);
+}
+
+/*
+ * Child callback for the checkpoint wait: calls
+ * colo_start_new_checkpoint() with ERROR_FAIL when the child status is
+ * non-zero (after logging), and with 0 otherwise.
+ */
+static void colo_agent_async_call_done(libxl__egc *egc,
+                                       libxl__ev_child *child,
+                                       int pid,
+                                       int status)
+{
+    libxl__colo_save_state *css = CONTAINER_OF(child, *css, child);
+    int rc = 0;
+
+    EGC_GC;
+
+    if (status) {
+        LOG(ERROR, "failed to wait for new checkpoint");
+        rc = ERROR_FAIL;
+    }
+
+    colo_start_new_checkpoint(egc, &css->cds, rc);
+}
+
+/*
+ * Release the colo-agent descriptors held in css.
+ *
+ * Each descriptor is closed only if it is >= 0 and is reset to -1
+ * afterwards, so a second call does nothing.
+ */
+static void colo_agent_teardown(libxl__colo_save_state *css, int domid)
+{
+ if (css->vm_fd >= 0) {
+ close(css->vm_fd);
+ css->vm_fd = -1;
+ /* NOTE(review): the result of COLO_RELEASE_VM is ignored */
+ ioctl(css->fd, COLO_RELEASE_VM, domid);
+ }
+
+ if (css->fd >= 0) {
+ close(css->fd);
+ css->fd = -1;
+ }
+}
+
/* ================= helper functions ================= */
static int init_device_subkind(libxl__checkpoint_devices_state *cds)
{
@@ -32,6 +182,9 @@ static int init_device_subkind(libxl__checkpoint_devices_state *cds)
int rc;
STATE_AO_GC(cds->ao);
+ rc = init_subkind_colo_nic(cds);
+ if (rc) goto out;
+
rc = init_subkind_drbd_disk(cds);
if (rc) goto out;
@@ -46,6 +199,7 @@ static void cleanup_device_subkind(libxl__checkpoint_devices_state *cds)
STATE_AO_GC(cds->ao);
cleanup_subkind_blktap2_disk(cds);
+ cleanup_subkind_colo_nic(cds);
}
/* ================= colo: setup save environment ================= */
@@ -73,9 +227,12 @@ void libxl__colo_save_setup(libxl__egc *egc, libxl__colo_save_state *css)
css->send_fd = dss->fd;
css->recv_fd = dss->recv_fd;
css->svm_running = false;
+ css->fd = -1;
+ css->vm_fd = -1;
+ libxl__ev_child_init(&css->child);
- /* TODO: nic support */
- cds->device_kind_flags = (1 << LIBXL__DEVICE_KIND_VBD);
+ cds->device_kind_flags = (1 << LIBXL__DEVICE_KIND_VBD) |
+ (1 << LIBXL__DEVICE_KIND_VIF);
cds->ops = colo_ops;
cds->callback = colo_save_setup_done;
cds->ao = ao;
@@ -101,12 +258,17 @@ static void colo_save_setup_done(libxl__egc *egc,
STATE_AO_GC(cds->ao);
if (!rc) {
+ rc = colo_agent_setup(css, dss->domid);
+ if (rc)
+ goto failed;
libxl__domain_suspend(egc, dss);
return;
}
LOG(ERROR, "COLO: failed to setup device for guest with domid %u",
dss->domid);
+
+failed:
css->cds.callback = colo_save_setup_failed;
libxl__checkpoint_devices_teardown(egc, &css->cds);
}
@@ -154,6 +316,7 @@ static void colo_teardown_done(libxl__egc *egc,
libxl__domain_suspend_state *dss = CONTAINER_OF(css, *dss, css);
cleanup_device_subkind(cds);
+ colo_agent_teardown(css, dss->domid);
dss->callback(egc, dss, rc);
}
@@ -420,6 +583,8 @@ static void colo_read_svm_ready_done(libxl__egc *egc,
goto out;
}
+ colo_agent_preresume(css);
+
css->svm_running = true;
css->cds.callback = colo_preresume_cb;
libxl__checkpoint_devices_preresume(egc, &css->cds);
@@ -488,6 +653,8 @@ static void colo_read_svm_resumed_done(libxl__egc *egc,
goto out;
}
+ colo_agent_postresume(css);
+
ok = 1;
out:
@@ -505,9 +672,6 @@ out:
static void colo_device_commit_cb(libxl__egc *egc,
libxl__checkpoint_devices_state *cds,
int rc);
-static void colo_start_new_checkpoint(libxl__egc *egc,
- libxl__checkpoint_devices_state *cds,
- int rc);
static void colo_send_data_done(libxl__egc *egc,
libxl__datacopier_state *dc,
int onwrite, int errnoval);
@@ -539,8 +703,7 @@ static void colo_device_commit_cb(libxl__egc *egc,
goto out;
}
- /* TODO: wait a new checkpoint */
- colo_start_new_checkpoint(egc, cds, 0);
+ colo_agent_wait_for_checkpoint(egc, css);
return;
out:
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 1659845..cdd8d1e 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2719,6 +2719,8 @@ int init_subkind_drbd_disk(libxl__checkpoint_devices_state *cds);
void cleanup_subkind_drbd_disk(libxl__checkpoint_devices_state *cds);
int init_subkind_blktap2_disk(libxl__checkpoint_devices_state *cds);
void cleanup_subkind_blktap2_disk(libxl__checkpoint_devices_state *cds);
+int init_subkind_colo_nic(libxl__checkpoint_devices_state *cds);
+void cleanup_subkind_colo_nic(libxl__checkpoint_devices_state *cds);
typedef void libxl__checkpoint_callback(libxl__egc *,
libxl__checkpoint_devices_state *,
@@ -2834,6 +2836,7 @@ struct libxl__colo_save_state {
libxl__checkpoint_devices_state cds;
int send_fd;
int recv_fd;
+ char *colo_agent_script;
/* private */
libxl__datacopier_state dc;
@@ -2845,6 +2848,11 @@ struct libxl__colo_save_state {
uint8_t temp_buff[9];
void (*callback)(libxl__egc *, libxl__colo_save_state *);
bool svm_running;
+
+ /* private, used by colo-agent */
+ int fd;
+ int vm_fd;
+ libxl__ev_child child;
};
/*----- Domain suspend (save) state structure -----*/
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index 54e1684..7ee3333 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -514,6 +514,7 @@ libxl_device_nic = Struct("device_nic", [
("rate_bytes_per_interval", uint64),
("rate_interval_usecs", uint32),
("gatewaydev", string),
+ ("forwarddev", string)
])
libxl_device_pci = Struct("device_pci", [
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index ec77217..7a47dd2 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -1615,6 +1615,9 @@ static void parse_config_data(const char *config_source,
} else if (!strcmp(p, "gatewaydev")) {
free(nic->gatewaydev);
nic->gatewaydev = strdup(p2 + 1);
+ } else if (!strcmp(p, "forwarddev")) {
+ free(nic->forwarddev);
+ nic->forwarddev = strdup(p2 + 1);
}
} while ((p = strtok(NULL, ",")) != NULL);
skip_nic:
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 17/18] setup and control colo-agent for secondary vm
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (15 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 16/18] setup and control colo-agent for primary vm Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [RFC Patch v4 18/18] colo: cmdline switches and config vars to control colo-agent Wen Congyang
` (2 subsequent siblings)
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Yang Hongyang, Lai Jiangshan
This patch adds the machinery required for protecting a secondary vm's
network device state. This patch implements the interfaces required by
the checkpoint abstract device layer. A note about the implementation:
a) setup() and teardown() are called for each vif attached to the
secondary vm.
During setup(), the hotplug script is called to set up the COLO agent for
the given vif. The script does the following things:
i) redirect vif egress traffic to the FORWARD device
ii) redirect FORWARD device egress traffic to vif
During teardown(), the hotplug script is called again for each
vif. The script does the following things:
i) remove the vif->FORWARD traffic redirection
ii) remove the FORWARD->vif traffic redirection
b) Nothing should be done for secondary vm's network device.
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
tools/libxl/libxl_colo_nic.c | 23 ++++++
tools/libxl/libxl_colo_restore.c | 152 +++++++++++++++++++++++++++++++++------
tools/libxl/libxl_internal.h | 2 +
3 files changed, 157 insertions(+), 20 deletions(-)
diff --git a/tools/libxl/libxl_colo_nic.c b/tools/libxl/libxl_colo_nic.c
index 0578973..391e9d4 100644
--- a/tools/libxl/libxl_colo_nic.c
+++ b/tools/libxl/libxl_colo_nic.c
@@ -287,3 +287,26 @@ const libxl__checkpoint_device_instance_ops colo_save_device_nic = {
.setup = colo_nic_save_setup,
.teardown = colo_nic_save_teardown,
};
+
+/* ======== secondary ======== */
+/* Secondary side: set up the nic using the script stored in the restore state. */
+static void colo_nic_restore_setup(libxl__egc *egc,
+                                   libxl__checkpoint_device *dev)
+{
+    libxl__colo_restore_state *restore = CONTAINER_OF(dev->cds, *restore, cds);
+    char *script = restore->colo_agent_script;
+
+    colo_nic_setup(egc, dev, secondary, script);
+}
+
+/* Secondary side: tear down the nic using the script stored in the restore state. */
+static void colo_nic_restore_teardown(libxl__egc *egc,
+                                      libxl__checkpoint_device *dev)
+{
+    libxl__colo_restore_state *restore = CONTAINER_OF(dev->cds, *restore, cds);
+    char *script = restore->colo_agent_script;
+
+    colo_nic_teardown(egc, dev, secondary, script);
+}
+
+/*
+ * Checkpoint-device ops for VIF devices on the secondary side.  Only
+ * setup and teardown are provided here.
+ */
+const libxl__checkpoint_device_instance_ops colo_restore_device_nic = {
+ .kind = LIBXL__DEVICE_KIND_VIF,
+ .setup = colo_nic_restore_setup,
+ .teardown = colo_nic_restore_teardown,
+};
diff --git a/tools/libxl/libxl_colo_restore.c b/tools/libxl/libxl_colo_restore.c
index 805d51f..5183051 100644
--- a/tools/libxl/libxl_colo_restore.c
+++ b/tools/libxl/libxl_colo_restore.c
@@ -40,6 +40,9 @@ struct libxl__colo_restore_checkpoint_state {
libxl__logdirty_switch lds;
libxl__colo_restore_state *crs;
int status;
+ /* used for teardown */
+ int teardown_devices;
+ int saved_rc;
void (*callback)(libxl__egc *,
libxl__colo_restore_checkpoint_state *,
@@ -58,6 +61,13 @@ static void libxl__colo_restore_domain_resume_callback(void *data);
static void libxl__colo_restore_domain_checkpoint_callback(void *data);
static void libxl__colo_restore_domain_suspend_callback(void *data);
+extern const libxl__checkpoint_device_instance_ops colo_restore_device_nic;
+
+static const libxl__checkpoint_device_instance_ops *colo_restore_ops[] = {
+ &colo_restore_device_nic,
+ NULL,
+};
+
/* ===================== colo: common functions ===================== */
static void colo_enable_logdirty(libxl__colo_restore_state *crs, libxl__egc *egc)
{
@@ -147,6 +157,28 @@ static void colo_resume_vm(libxl__egc *egc,
return;
}
+/*
+ * Init device subkind-specific state in the libxl ctx.
+ * Returns 0 on success, or the error from the subkind initialiser.
+ */
+static int init_device_subkind(libxl__checkpoint_devices_state *cds)
+{
+    STATE_AO_GC(cds->ao);
+
+    /* the colo nic is the only subkind initialised here */
+    return init_subkind_colo_nic(cds);
+}
+
+/*
+ * Counterpart of init_device_subkind(): cleans up the colo nic subkind
+ * state.
+ */
+static void cleanup_device_subkind(libxl__checkpoint_devices_state *cds)
+{
+ /* cleanup device subkind-specific state in the libxl ctx */
+ STATE_AO_GC(cds->ao);
+
+ cleanup_subkind_colo_nic(cds);
+}
+
/* ================ colo: setup restore environment ================ */
static void libxl__colo_domain_create_cb(libxl__egc *egc,
@@ -275,6 +307,9 @@ static void libxl__colo_domain_create_cb(libxl__egc *egc,
/* ================ colo: teardown restore environment ================ */
+static void colo_restore_teardown_done(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
+ int rc);
static void do_failover_done(libxl__egc *egc,
libxl__colo_restore_checkpoint_state* crcs,
int rc);
@@ -321,11 +356,38 @@ void libxl__colo_restore_teardown(libxl__egc *egc,
EGC_GC;
if (!dirty_bitmap)
- goto do_failover;
+ goto teardown_devices;
xc_hypercall_buffer_free_pages(CTX->xch, dirty_bitmap, NRPAGES(bsize));
-do_failover:
+teardown_devices:
+ crcs->saved_rc = rc;
+ if (!crcs->teardown_devices) {
+ colo_restore_teardown_done(egc, &crs->cds, 0);
+ return;
+ }
+
+ crs->cds.callback = colo_restore_teardown_done;
+ libxl__checkpoint_devices_teardown(egc, &crs->cds);
+}
+
+static void colo_restore_teardown_done(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
+ int rc)
+{
+ libxl__colo_restore_state *crs = CONTAINER_OF(cds, *crs, cds);
+ libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+ libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+
+ EGC_GC;
+
+ if (rc)
+ LOG(ERROR, "COLO: failed to teardown device after setup failed"
+ " for guest with domid %u, rc %d", cds->domid, rc);
+
+ cleanup_device_subkind(cds);
+
+ rc = crcs->saved_rc;
if (!rc) {
crcs->callback = do_failover_done;
do_failover(egc, crs);
@@ -418,6 +480,11 @@ static void colo_reenable_logdirty(libxl__egc *egc,
static void colo_reenable_logdirty_done(libxl__egc *egc,
libxl__logdirty_switch *lds,
int rc);
+static void colo_setup_checkpoint_devices(libxl__egc *egc,
+ libxl__colo_restore_state *crs);
+static void colo_restore_setup_cds_done(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
+ int rc);
static void libxl__colo_restore_domain_resume_callback(void *data)
{
@@ -529,7 +596,6 @@ static void colo_write_svm_resumed(libxl__egc *egc,
dc->copywhat = crcs->copywhat[2];
dc->writewhat = "colo stream";
dc->callback = colo_common_send_data_done;
- /* TODO: configure network */
crcs->callback = NULL;
rc = libxl__datacopier_start(dc);
@@ -552,12 +618,9 @@ static void colo_enable_logdirty_done(libxl__egc *egc,
int rc)
{
libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(lds, *crcs, lds);
- libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
/* Convenience aliases */
libxl__colo_restore_state *const crs = crcs->crs;
- libxl__save_helper_state *const shs = &dcs->shs;
- const uint32_t domid = crs->domid;
STATE_AO_GC(crs->ao);
@@ -571,19 +634,7 @@ static void colo_enable_logdirty_done(libxl__egc *egc,
return;
}
- /* We have enabled secondary vm's logdirty, so we can unpause it now */
- rc = libxl__domain_unpause(gc, domid);
- if (rc) {
- LOG(ERROR, "cannot unpause secondary vm");
- goto out;
- }
-
- colo_write_svm_resumed(egc, crcs);
-
- return;
-
-out:
- libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+ colo_setup_checkpoint_devices(egc, crs);
}
static void colo_reenable_logdirty(libxl__egc *egc,
@@ -622,7 +673,6 @@ static void colo_reenable_logdirty_done(libxl__egc *egc,
/* Convenience aliases */
libxl__save_helper_state *const shs = &dcs->shs;
- const uint32_t domid = crcs->crs->domid;
STATE_AO_GC(crcs->crs->ao);
@@ -631,6 +681,68 @@ static void colo_reenable_logdirty_done(libxl__egc *egc,
goto out;
}
+ colo_setup_checkpoint_devices(egc, crcs->crs);
+
+ return;
+
+out:
+ libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+}
+
+/*
+ * Checkpoint devices are set up here rather than in
+ * libxl__colo_restore_setup(), because the guest is not ready at that
+ * point.
+ *
+ * Fills in crs->cds and starts the device setup, which completes in
+ * colo_restore_setup_cds_done().  If the subkind initialisation fails,
+ * the save/restore helper callback is completed with 0 instead.
+ */
+static void colo_setup_checkpoint_devices(libxl__egc *egc,
+                                          libxl__colo_restore_state *crs)
+{
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    libxl__checkpoint_devices_state *const cds = &crs->cds;
+
+    STATE_AO_GC(crs->ao);
+
+    /* from now on libxl__colo_restore_teardown() tears the devices down */
+    crcs->teardown_devices = 1;
+
+    cds->ao = ao;
+    cds->domid = crs->domid;
+    cds->ops = colo_restore_ops;
+    cds->device_kind_flags = (1 << LIBXL__DEVICE_KIND_VIF);
+    cds->callback = colo_restore_setup_cds_done;
+
+    if (init_device_subkind(cds)) {
+        libxl__xc_domain_saverestore_async_callback_done(egc, &dcs->shs, 0);
+        return;
+    }
+
+    libxl__checkpoint_devices_setup(egc, cds);
+}
+
+static void colo_restore_setup_cds_done(libxl__egc *egc,
+ libxl__checkpoint_devices_state *cds,
+ int rc)
+{
+ libxl__colo_restore_state *crs = CONTAINER_OF(cds, *crs, cds);
+ libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+ libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+
+ /* Convenience aliases */
+ libxl__save_helper_state *const shs = &dcs->shs;
+ const uint32_t domid = crs->domid;
+
+ STATE_AO_GC(cds->ao);
+
+ if (rc) {
+ LOG(ERROR, "COLO: failed to setup device for guest with domid %u",
+ cds->domid);
+ goto out;
+ }
+
/* We have enabled secondary vm's logdirty, so we can unpause it now */
rc = libxl__domain_unpause(gc, domid);
if (rc) {
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index cdd8d1e..8bca96c 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -3188,10 +3188,12 @@ struct libxl__colo_restore_state {
int pae;
int superpages;
libxl__colo_callback *callback;
+ char *colo_agent_script;
/* private, colo restore checkpoint state */
libxl__domain_create_cb *saved_cb;
void *crcs;
+ libxl__checkpoint_devices_state cds;
};
struct libxl__domain_create_state {
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [RFC Patch v4 18/18] colo: cmdline switches and config vars to control colo-agent
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (16 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 17/18] setup and control colo-agent for secondary vm Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 7:06 ` [Qemu-devel] [PATCH 19/18] Introduce "xen-load-devices-state" Wen Congyang
2014-10-24 7:06 ` Wen Congyang
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Ian Jackson, Jiang Yunhong,
Dong Eddie, Yang Hongyang, Lai Jiangshan
Add cmdline switches to 'xl migrate-receive' command to specify
a domain-specific hotplug script to setup COLO agent.
Add a new config var 'colo.default.agentscript' to xl.conf, that
allows the user to override the default global script used to
setup COLO agent.
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
docs/man/xl.conf.pod.5 | 6 ++++++
docs/man/xl.pod.1 | 1 -
tools/libxl/libxl_colo_save.c | 6 ++++++
tools/libxl/libxl_create.c | 17 +++++++++++++---
tools/libxl/libxl_internal.h | 1 +
tools/libxl/libxl_types.idl | 1 +
tools/libxl/xl.c | 3 +++
tools/libxl/xl.h | 1 +
tools/libxl/xl_cmdimpl.c | 47 ++++++++++++++++++++++++++++++++-----------
9 files changed, 67 insertions(+), 16 deletions(-)
diff --git a/docs/man/xl.conf.pod.5 b/docs/man/xl.conf.pod.5
index 8ae19bb..c65213d 100644
--- a/docs/man/xl.conf.pod.5
+++ b/docs/man/xl.conf.pod.5
@@ -111,6 +111,12 @@ Configures the default script used by Remus to setup network buffering.
Default: C</etc/xen/scripts/remus-netbuf-setup>
+=item B<colo.default.agentscript="PATH">
+
+Configures the default script used by COLO to setup colo-agent.
+
+Default: C</etc/xen/scripts/colo-agent-setup>
+
=item B<output_format="json|sxp">
Configures the default output format used by xl when printing "machine
diff --git a/docs/man/xl.pod.1 b/docs/man/xl.pod.1
index ef1ff1f..f46e85e 100644
--- a/docs/man/xl.pod.1
+++ b/docs/man/xl.pod.1
@@ -439,7 +439,6 @@ N.B: Remus support in xl is still in experimental (proof-of-concept) phase.
Disk replication support is limited to DRBD disks.
COLO support in xl is still in experimental (proof-of-concept) phase.
- There is no support for network at the moment.
Disk replication support is limited to blktap2 disks.
B<OPTIONS>
diff --git a/tools/libxl/libxl_colo_save.c b/tools/libxl/libxl_colo_save.c
index b9680a3..7a9fdd5 100644
--- a/tools/libxl/libxl_colo_save.c
+++ b/tools/libxl/libxl_colo_save.c
@@ -231,6 +231,12 @@ void libxl__colo_save_setup(libxl__egc *egc, libxl__colo_save_state *css)
css->vm_fd = -1;
libxl__ev_child_init(&css->child);
+ if (dss->remus->netbufscript)
+ css->colo_agent_script = libxl__strdup(gc, dss->remus->netbufscript);
+ else
+ css->colo_agent_script = GCSPRINTF("%s/colo-agent-setup",
+ libxl__xen_script_dir_path());
+
cds->device_kind_flags = (1 << LIBXL__DEVICE_KIND_VBD) |
(1 << LIBXL__DEVICE_KIND_VIF);
cds->ops = colo_ops;
diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 27e6002..6b72c4c 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -1121,6 +1121,11 @@ static void domcreate_bootloader_done(libxl__egc *egc,
crs->superpages = superpages;
crs->pae = pae;
crs->callback = libxl__colo_restore_setup_done;
+ if (dcs->colo_agent_script)
+ crs->colo_agent_script = libxl__strdup(gc, dcs->colo_agent_script);
+ else
+ crs->colo_agent_script = GCSPRINTF("%s/colo-agent-setup",
+ libxl__xen_script_dir_path());
libxl__colo_restore_setup(egc, crs);
} else
libxl__xc_domain_restore(egc, dcs,
@@ -1614,6 +1619,7 @@ static void domain_create_cb(libxl__egc *egc,
static int do_domain_create(libxl_ctx *ctx, libxl_domain_config *d_config,
uint32_t *domid, int restore_fd,
int send_fd, int checkpointed_stream,
+ const char *colo_agent_script,
const libxl_asyncop_how *ao_how,
const libxl_asyncprogress_how *aop_console_how)
{
@@ -1629,6 +1635,7 @@ static int do_domain_create(libxl_ctx *ctx, libxl_domain_config *d_config,
cdcs->dcs.send_fd = send_fd;
cdcs->dcs.callback = domain_create_cb;
cdcs->dcs.checkpointed_stream = checkpointed_stream;
+ cdcs->dcs.colo_agent_script = colo_agent_script;
libxl__ao_progress_gethow(&cdcs->dcs.aop_console_how, aop_console_how);
cdcs->domid_out = domid;
@@ -1655,7 +1662,7 @@ int libxl_domain_create_new(libxl_ctx *ctx, libxl_domain_config *d_config,
const libxl_asyncop_how *ao_how,
const libxl_asyncprogress_how *aop_console_how)
{
- return do_domain_create(ctx, d_config, domid, -1, -1, 0,
+ return do_domain_create(ctx, d_config, domid, -1, -1, 0, NULL,
ao_how, aop_console_how);
}
@@ -1666,12 +1673,16 @@ int libxl_domain_create_restore(libxl_ctx *ctx, libxl_domain_config *d_config,
const libxl_asyncprogress_how *aop_console_how)
{
int send_fd = -1;
+ char *colo_agent_script = NULL;
- if (params->checkpointed_stream == LIBXL_CHECKPOINTED_STREAM_COLO)
+ if (params->checkpointed_stream == LIBXL_CHECKPOINTED_STREAM_COLO) {
send_fd = params->send_fd;
+ colo_agent_script = params->colo_agent_script;
+ }
return do_domain_create(ctx, d_config, domid, restore_fd, send_fd,
- params->checkpointed_stream, ao_how, aop_console_how);
+ params->checkpointed_stream, colo_agent_script,
+ ao_how, aop_console_how);
}
/*
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 8bca96c..0ab5678 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -3208,6 +3208,7 @@ struct libxl__domain_create_state {
/* private to domain_create */
int guest_domid;
int checkpointed_stream;
+ const char *colo_agent_script;
libxl__domain_build_state build_state;
libxl__colo_restore_state crs;
libxl__bootloader_state bl;
diff --git a/tools/libxl/libxl_types.idl b/tools/libxl/libxl_types.idl
index 7ee3333..6097c0e 100644
--- a/tools/libxl/libxl_types.idl
+++ b/tools/libxl/libxl_types.idl
@@ -347,6 +347,7 @@ libxl_domain_create_info = Struct("domain_create_info",[
libxl_domain_restore_params = Struct("domain_restore_params", [
("checkpointed_stream", integer),
("send_fd", integer),
+ ("colo_agent_script", string),
])
libxl_domain_sched_params = Struct("domain_sched_params",[
diff --git a/tools/libxl/xl.c b/tools/libxl/xl.c
index f014306..3d746f5 100644
--- a/tools/libxl/xl.c
+++ b/tools/libxl/xl.c
@@ -45,6 +45,7 @@ char *default_bridge = NULL;
char *default_gatewaydev = NULL;
char *default_vifbackend = NULL;
char *default_remus_netbufscript = NULL;
+char *default_colo_agent_script = NULL;
enum output_format default_output_format = OUTPUT_FORMAT_JSON;
int claim_mode = 1;
bool progress_use_cr = 0;
@@ -179,6 +180,8 @@ static void parse_global_config(const char *configfile,
xlu_cfg_replace_string (config, "remus.default.netbufscript",
&default_remus_netbufscript, 0);
+ xlu_cfg_replace_string (config, "colo.default.agentscript",
+ &default_colo_agent_script, 0);
xlu_cfg_destroy(config);
}
diff --git a/tools/libxl/xl.h b/tools/libxl/xl.h
index c91de4f..70be7bc 100644
--- a/tools/libxl/xl.h
+++ b/tools/libxl/xl.h
@@ -178,6 +178,7 @@ extern char *default_bridge;
extern char *default_gatewaydev;
extern char *default_vifbackend;
extern char *default_remus_netbufscript;
+extern char *default_colo_agent_script;
extern char *blkdev_start;
enum output_format {
diff --git a/tools/libxl/xl_cmdimpl.c b/tools/libxl/xl_cmdimpl.c
index 7a47dd2..0acfca5 100644
--- a/tools/libxl/xl_cmdimpl.c
+++ b/tools/libxl/xl_cmdimpl.c
@@ -153,6 +153,7 @@ struct domain_create {
const char *config_file;
const char *extra_config; /* extra config string */
const char *restore_file;
+ char *colo_agent_script;
int migrate_fd; /* -1 means none */
int send_fd; /* -1 means none */
char **migration_domname_r; /* from malloc */
@@ -2505,6 +2506,7 @@ start:
params.checkpointed_stream = dom_info->checkpointed_stream;
params.send_fd = send_fd;
+ params.colo_agent_script = dom_info->colo_agent_script;
ret = libxl_domain_create_restore(ctx, &d_config,
&domid, restore_fd,
¶ms,
@@ -4015,7 +4017,8 @@ static void migrate_domain(uint32_t domid, const char *rune, int debug,
}
static void migrate_receive(int debug, int daemonize, int monitor,
- int send_fd, int recv_fd, int remus)
+ int send_fd, int recv_fd, int remus,
+ char *colo_agent_script)
{
uint32_t domid;
int rc, rc2;
@@ -4042,6 +4045,7 @@ static void migrate_receive(int debug, int daemonize, int monitor,
dom_info.send_fd = send_fd;
dom_info.migration_domname_r = &migration_domname;
dom_info.checkpointed_stream = remus;
+ dom_info.colo_agent_script = colo_agent_script;
if (remus == LIBXL_CHECKPOINTED_STREAM_COLO)
/* COLO uses stdout to send control message to master */
dom_info.quiet = 1;
@@ -4236,8 +4240,9 @@ int main_migrate_receive(int argc, char **argv)
{
int debug = 0, daemonize = 1, monitor = 1, remus = 0;
int opt;
+ char *script = NULL;
- SWITCH_FOREACH_OPT(opt, "Fedrc", NULL, "migrate-receive", 0) {
+ SWITCH_FOREACH_OPT(opt, "Fedrcn:", NULL, "migrate-receive", 0) {
case 'F':
daemonize = 0;
break;
@@ -4253,6 +4258,10 @@ int main_migrate_receive(int argc, char **argv)
break;
case 'c':
remus = LIBXL_CHECKPOINTED_STREAM_COLO;
+        break; /* -c takes no argument: must not fall through into -n */
+    case 'n':
+        script = optarg;
+        break;
}
if (argc-optind != 0) {
@@ -4261,7 +4268,7 @@ int main_migrate_receive(int argc, char **argv)
}
migrate_receive(debug, daemonize, monitor,
STDOUT_FILENO, STDIN_FILENO,
- remus);
+ remus, script);
return 0;
}
@@ -7774,8 +7781,11 @@ int main_remus(int argc, char **argv)
if (!interval)
r_info.interval = 0;
- if (r_info.interval || libxl_defbool_val(r_info.blackhole)) {
- perror("option -c is conflict with -i or -b");
+        if (r_info.interval || libxl_defbool_val(r_info.blackhole) ||
+            !libxl_defbool_is_default(r_info.netbuf) ||
+            !libxl_defbool_is_default(r_info.diskbuf)) {
+            /* usage error, not an errno failure: perror() would append a bogus reason */
+            fprintf(stderr, "option -c conflicts with -i, -d, -n or -b\n");
exit(-1);
}
@@ -7785,8 +7794,12 @@ int main_remus(int argc, char **argv)
}
}
- if (!r_info.netbufscript)
- r_info.netbufscript = default_remus_netbufscript;
+ if (!r_info.netbufscript) {
+ if (libxl_defbool_val(r_info.colo))
+ r_info.netbufscript = default_colo_agent_script;
+ else
+ r_info.netbufscript = default_remus_netbufscript;
+ }
if (libxl_defbool_val(r_info.blackhole)) {
send_fd = open("/dev/null", O_RDWR, 0644);
@@ -7799,11 +7812,21 @@ int main_remus(int argc, char **argv)
if (!ssh_command[0]) {
rune = host;
} else {
- if (asprintf(&rune, "exec %s %s xl migrate-receive %s %s",
- ssh_command, host,
- libxl_defbool_val(r_info.colo) ? "-c" : "-r",
- daemonize ? "" : " -e") < 0)
- return 1;
+ if (!libxl_defbool_val(r_info.colo)) {
+ if (asprintf(&rune, "exec %s %s xl migrate-receive %s %s",
+ ssh_command, host,
+ "-r",
+ daemonize ? "" : " -e") < 0)
+ return 1;
+ } else {
+ if (asprintf(&rune, "exec %s %s xl migrate-receive %s %s %s %s",
+ ssh_command, host,
+ "-c",
+ r_info.netbufscript ? "-n" : "",
+ r_info.netbufscript ? r_info.netbufscript : "",
+ daemonize ? "" : " -e") < 0)
+ return 1;
+ }
}
save_domain_core_begin(domid, NULL, &config_data, &config_len);
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* [Qemu-devel] [PATCH 19/18] Introduce "xen-load-devices-state"
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (17 preceding siblings ...)
2014-10-24 7:06 ` [RFC Patch v4 18/18] colo: cmdline switches and config vars to control colo-agent Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
2014-10-24 14:04 ` Eric Blake
` (2 more replies)
2014-10-24 7:06 ` Wen Congyang
19 siblings, 3 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Stefano Stabellini, Ian Jackson, Jiang Yunhong,
Dong Eddie, qemu-devl, Paolo Bonzini, Yang Hongyang,
Lai Jiangshan
Introduce a "xen-load-devices-state" QAPI command that can be used to load
the state of all devices, but not the RAM or the block devices of the
VM.
We currently have only the HMP commands savevm/loadvm and the QMP command
xen-save-devices-state.
We use this new command for COLO:
1. suspend both primary vm and secondary vm
2. sync the state
3. resume both primary vm and secondary vm
In such a case, we need to be able to update the state of all devices at any time.
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Cc: qemu-devl <qemu-devel@nongnu.org>
Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
---
qapi-schema.json | 18 ++++++++++++++++++
qmp-commands.hx | 27 +++++++++++++++++++++++++++
savevm.c | 36 ++++++++++++++++++++++++++++++++++++
3 files changed, 81 insertions(+)
diff --git a/qapi-schema.json b/qapi-schema.json
index 391356f..c569856 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -4689,3 +4689,21 @@
'btn' : 'InputBtnEvent',
'rel' : 'InputMoveEvent',
'abs' : 'InputMoveEvent' } }
+
+##
+# @xen-load-devices-state:
+#
+# Load the state of all devices from file. The RAM and the block devices
+# of the VM are not loaded by this command.
+#
+# @filename: the file to load the state of the devices from as binary
+# data. See xen-save-devices-state.txt for a description of the binary
+# format.
+#
+# Returns: Nothing on success
+# If @filename cannot be opened, OpenFileFailed
+# If an I/O error occurs while reading the file, IOError
+#
+# Since: 2.0
+##
+{ 'command': 'xen-load-devices-state', 'data': {'filename': 'str'} }
diff --git a/qmp-commands.hx b/qmp-commands.hx
index ed3ab92..b796be5 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -586,6 +586,33 @@ Example:
EQMP
{
+ .name = "xen-load-devices-state",
+ .args_type = "filename:F",
+ .mhandler.cmd_new = qmp_marshal_input_xen_load_devices_state,
+ },
+
+SQMP
+xen-load-devices-state
+-------
+
+Load the state of all devices from file. The RAM and the block devices
+of the VM are not loaded by this command.
+
+Arguments:
+
+- "filename": the file to load the state of the devices from as binary
+data. See xen-save-devices-state.txt for a description of the binary
+format.
+
+Example:
+
+-> { "execute": "xen-load-devices-state",
+ "arguments": { "filename": "/tmp/resume" } }
+<- { "return": {} }
+
+EQMP
+
+ {
.name = "xen-set-global-dirty-log",
.args_type = "enable:b",
.mhandler.cmd_new = qmp_marshal_input_xen_set_global_dirty_log,
diff --git a/savevm.c b/savevm.c
index 22123be..3ebc01f 100644
--- a/savevm.c
+++ b/savevm.c
@@ -41,6 +41,7 @@
#include "qemu/iov.h"
#include "block/snapshot.h"
#include "block/qapi.h"
+#include "hw/xen/xen.h"
#define SELF_ANNOUNCE_ROUNDS 5
@@ -802,6 +803,14 @@ int qemu_loadvm_state(QEMUFile *f)
goto out;
}
+ /* Validate if it is a device's state */
+ if (xen_enabled() && se->is_ram) {
+ fprintf(stderr, "loadvm: %s RAM loading not allowed on Xen\n",
+ idstr);
+ ret = -EINVAL;
+ goto out;
+ }
+
/* Add entry */
le = g_malloc0(sizeof(*le));
@@ -1027,6 +1036,33 @@ void qmp_xen_save_devices_state(const char *filename, Error **errp)
}
}
+void qmp_xen_load_devices_state(const char *filename, Error **errp)
+{
+ QEMUFile *f;
+ int saved_vm_running;
+ int ret;
+
+ saved_vm_running = runstate_is_running();
+ vm_stop(RUN_STATE_RESTORE_VM);
+
+ f = qemu_fopen(filename, "rb");
+ if (!f) {
+ error_setg_file_open(errp, errno, filename);
+ goto out;
+ }
+
+ ret = qemu_loadvm_state(f);
+ qemu_fclose(f);
+ if (ret < 0) {
+ error_set(errp, QERR_IO_ERROR);
+ }
+
+out:
+ if (saved_vm_running) {
+ vm_start();
+ }
+}
+
int load_vmstate(const char *name)
{
BlockDriverState *bs, *bs_vm_state;
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread* Re: [Qemu-devel] [PATCH 19/18] Introduce "xen-load-devices-state"
2014-10-24 7:06 ` [Qemu-devel] [PATCH 19/18] Introduce "xen-load-devices-state" Wen Congyang
@ 2014-10-24 14:04 ` Eric Blake
2014-10-25 15:11 ` Stefano Stabellini
2014-10-25 15:11 ` Stefano Stabellini
2 siblings, 0 replies; 27+ messages in thread
From: Eric Blake @ 2014-10-24 14:04 UTC (permalink / raw)
To: Wen Congyang, xen devel
Cc: Ian Campbell, Stefano Stabellini, Ian Jackson, Jiang Yunhong,
Dong Eddie, qemu-devl, Paolo Bonzini, Yang Hongyang,
Lai Jiangshan
[-- Attachment #1: Type: text/plain, Size: 1267 bytes --]
On 10/24/2014 01:06 AM, Wen Congyang wrote:
> Introduce a "xen-load-devices-state" QAPI command that can be used to load
> the state of all devices, but not the RAM or the block devices of the
> VM.
>
> We only have hmp commands savevm/loadvm, and qmp commands
> xen-save-devices-state.
>
> We use this new command for COLO:
> 1. suspend both primay vm and secondary vm
> 2. sync the state
> 3. resume both primary vm and secondary vm
>
> In such case, we need to update all devices's state in any time.
>
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> Cc: qemu-devl <qemu-devel@nongnu.org>
> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> ---
> qapi-schema.json | 18 ++++++++++++++++++
> qmp-commands.hx | 27 +++++++++++++++++++++++++++
> savevm.c | 36 ++++++++++++++++++++++++++++++++++++
> 3 files changed, 81 insertions(+)
>
> +#
> +# Since: 2.0
> +##
> +{ 'command': 'xen-load-devices-state', 'data': {'filename': 'str'} }
s/2.0/2.2/ - if you even get it in 2.2 (haven't we already passed soft
freeze, but this is a feature addition?)
--
Eric Blake eblake redhat com +1-919-301-3266
Libvirt virtualization library http://libvirt.org
[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 539 bytes --]
^ permalink raw reply [flat|nested] 27+ messages in thread* Re: [Qemu-devel] [PATCH 19/18] Introduce "xen-load-devices-state"
@ 2014-10-24 14:04 ` Eric Blake
0 siblings, 0 replies; 27+ messages in thread
From: Eric Blake @ 2014-10-24 14:04 UTC (permalink / raw)
To: Wen Congyang, xen devel
Cc: Ian Campbell, Stefano Stabellini, Ian Jackson, Jiang Yunhong,
Dong Eddie, qemu-devl, Paolo Bonzini, Yang Hongyang,
Lai Jiangshan
[-- Attachment #1.1: Type: text/plain, Size: 1267 bytes --]
On 10/24/2014 01:06 AM, Wen Congyang wrote:
> Introduce a "xen-load-devices-state" QAPI command that can be used to load
> the state of all devices, but not the RAM or the block devices of the
> VM.
>
> We only have hmp commands savevm/loadvm, and qmp commands
> xen-save-devices-state.
>
> We use this new command for COLO:
> 1. suspend both primay vm and secondary vm
> 2. sync the state
> 3. resume both primary vm and secondary vm
>
> In such case, we need to update all devices's state in any time.
>
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> Cc: qemu-devl <qemu-devel@nongnu.org>
> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
> ---
> qapi-schema.json | 18 ++++++++++++++++++
> qmp-commands.hx | 27 +++++++++++++++++++++++++++
> savevm.c | 36 ++++++++++++++++++++++++++++++++++++
> 3 files changed, 81 insertions(+)
>
> +#
> +# Since: 2.0
> +##
> +{ 'command': 'xen-load-devices-state', 'data': {'filename': 'str'} }
s/2.0/2.2/ - if you even get it in 2.2 (haven't we already passed soft
freeze, but this is a feature addition?)
--
Eric Blake eblake redhat com +1-919-301-3266
Libvirt virtualization library http://libvirt.org
[-- Attachment #1.2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 539 bytes --]
[-- Attachment #2: Type: text/plain, Size: 126 bytes --]
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xen.org
http://lists.xen.org/xen-devel
^ permalink raw reply [flat|nested] 27+ messages in thread* Re: [Qemu-devel] [PATCH 19/18] Introduce "xen-load-devices-state"
2014-10-24 14:04 ` Eric Blake
(?)
@ 2014-10-27 1:26 ` Wen Congyang
-1 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-27 1:26 UTC (permalink / raw)
To: Eric Blake, xen devel
Cc: Ian Campbell, Stefano Stabellini, Ian Jackson, Jiang Yunhong,
Dong Eddie, qemu-devl, Paolo Bonzini, Yang Hongyang,
Lai Jiangshan
On 10/24/2014 10:04 PM, Eric Blake wrote:
> On 10/24/2014 01:06 AM, Wen Congyang wrote:
>> Introduce a "xen-load-devices-state" QAPI command that can be used to load
>> the state of all devices, but not the RAM or the block devices of the
>> VM.
>>
>> We only have hmp commands savevm/loadvm, and qmp commands
>> xen-save-devices-state.
>>
>> We use this new command for COLO:
>> 1. suspend both primay vm and secondary vm
>> 2. sync the state
>> 3. resume both primary vm and secondary vm
>>
>> In such case, we need to update all devices's state in any time.
>>
>> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
>> Cc: qemu-devl <qemu-devel@nongnu.org>
>> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>> ---
>> qapi-schema.json | 18 ++++++++++++++++++
>> qmp-commands.hx | 27 +++++++++++++++++++++++++++
>> savevm.c | 36 ++++++++++++++++++++++++++++++++++++
>> 3 files changed, 81 insertions(+)
>>
>
>> +#
>> +# Since: 2.0
>> +##
>> +{ 'command': 'xen-load-devices-state', 'data': {'filename': 'str'} }
>
> s/2.0/2.2/ - if you even get it in 2.2 (haven't we already passed soft
> freeze, but this is a feature addition?)
>
I forgot to update it. I will update it in the next version. But this qmp
command is for COLO, so I don't post the next version until xen-4.5 is
released.
Thanks
Wen Congyang
^ permalink raw reply [flat|nested] 27+ messages in thread* Re: [Qemu-devel] [PATCH 19/18] Introduce "xen-load-devices-state"
2014-10-24 14:04 ` Eric Blake
(?)
(?)
@ 2014-10-27 1:26 ` Wen Congyang
-1 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-27 1:26 UTC (permalink / raw)
To: Eric Blake, xen devel
Cc: Ian Campbell, Stefano Stabellini, Ian Jackson, Jiang Yunhong,
Dong Eddie, qemu-devl, Paolo Bonzini, Yang Hongyang,
Lai Jiangshan
On 10/24/2014 10:04 PM, Eric Blake wrote:
> On 10/24/2014 01:06 AM, Wen Congyang wrote:
>> Introduce a "xen-load-devices-state" QAPI command that can be used to load
>> the state of all devices, but not the RAM or the block devices of the
>> VM.
>>
>> We only have hmp commands savevm/loadvm, and qmp commands
>> xen-save-devices-state.
>>
>> We use this new command for COLO:
>> 1. suspend both primay vm and secondary vm
>> 2. sync the state
>> 3. resume both primary vm and secondary vm
>>
>> In such case, we need to update all devices's state in any time.
>>
>> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
>> Cc: qemu-devl <qemu-devel@nongnu.org>
>> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
>> Cc: Paolo Bonzini <pbonzini@redhat.com>
>> ---
>> qapi-schema.json | 18 ++++++++++++++++++
>> qmp-commands.hx | 27 +++++++++++++++++++++++++++
>> savevm.c | 36 ++++++++++++++++++++++++++++++++++++
>> 3 files changed, 81 insertions(+)
>>
>
>> +#
>> +# Since: 2.0
>> +##
>> +{ 'command': 'xen-load-devices-state', 'data': {'filename': 'str'} }
>
> s/2.0/2.2/ - if you even get it in 2.2 (haven't we already passed soft
> freeze, but this is a feature addition?)
>
I forgot to update it. I will update it in the next version. But this qmp
command is for COLO, so I don't post the next version until xen-4.5 is
released.
Thanks
Wen Congyang
^ permalink raw reply [flat|nested] 27+ messages in thread
* Re: [Qemu-devel] [PATCH 19/18] Introduce "xen-load-devices-state"
2014-10-24 7:06 ` [Qemu-devel] [PATCH 19/18] Introduce "xen-load-devices-state" Wen Congyang
2014-10-24 14:04 ` Eric Blake
@ 2014-10-25 15:11 ` Stefano Stabellini
2014-10-25 15:11 ` Stefano Stabellini
2 siblings, 0 replies; 27+ messages in thread
From: Stefano Stabellini @ 2014-10-25 15:11 UTC (permalink / raw)
To: Wen Congyang
Cc: Ian Campbell, Stefano Stabellini, Ian Jackson, Jiang Yunhong,
Dong Eddie, qemu-devl, xen devel, Paolo Bonzini, Yang Hongyang,
Lai Jiangshan
On Fri, 24 Oct 2014, Wen Congyang wrote:
> Introduce a "xen-load-devices-state" QAPI command that can be used to load
> the state of all devices, but not the RAM or the block devices of the
> VM.
>
> We only have hmp commands savevm/loadvm, and qmp commands
> xen-save-devices-state.
>
> We use this new command for COLO:
> 1. suspend both primay vm and secondary vm
> 2. sync the state
> 3. resume both primary vm and secondary vm
>
> In such case, we need to update all devices's state in any time.
>
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> Cc: qemu-devl <qemu-devel@nongnu.org>
> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
The patch looks OK to me, far better than the previous version, but I am
no QMP expert.
> qapi-schema.json | 18 ++++++++++++++++++
> qmp-commands.hx | 27 +++++++++++++++++++++++++++
> savevm.c | 36 ++++++++++++++++++++++++++++++++++++
> 3 files changed, 81 insertions(+)
>
> diff --git a/qapi-schema.json b/qapi-schema.json
> index 391356f..c569856 100644
> --- a/qapi-schema.json
> +++ b/qapi-schema.json
> @@ -4689,3 +4689,21 @@
> 'btn' : 'InputBtnEvent',
> 'rel' : 'InputMoveEvent',
> 'abs' : 'InputMoveEvent' } }
> +
> +##
> +# @xen-load-devices-state:
> +#
> +# Load the state of all devices from file. The RAM and the block devices
> +# of the VM are not loaded by this command.
> +#
> +# @filename: the file to load the state of the devices from as binary
> +# data. See xen-save-devices-state.txt for a description of the binary
> +# format.
> +#
> +# Returns: Nothing on success
> +# If @filename cannot be opened, OpenFileFailed
> +# If an I/O error occurs while reading the file, IOError
> +#
> +# Since: 2.0
> +##
> +{ 'command': 'xen-load-devices-state', 'data': {'filename': 'str'} }
> diff --git a/qmp-commands.hx b/qmp-commands.hx
> index ed3ab92..b796be5 100644
> --- a/qmp-commands.hx
> +++ b/qmp-commands.hx
> @@ -586,6 +586,33 @@ Example:
> EQMP
>
> {
> + .name = "xen-load-devices-state",
> + .args_type = "filename:F",
> + .mhandler.cmd_new = qmp_marshal_input_xen_load_devices_state,
> + },
> +
> +SQMP
> +xen-load-devices-state
> +-------
> +
> +Load the state of all devices from file. The RAM and the block devices
> +of the VM are not loaded by this command.
> +
> +Arguments:
> +
> +- "filename": the file to load the state of the devices from as binary
> +data. See xen-save-devices-state.txt for a description of the binary
> +format.
> +
> +Example:
> +
> +-> { "execute": "xen-load-devices-state",
> + "arguments": { "filename": "/tmp/resume" } }
> +<- { "return": {} }
> +
> +EQMP
> +
> + {
> .name = "xen-set-global-dirty-log",
> .args_type = "enable:b",
> .mhandler.cmd_new = qmp_marshal_input_xen_set_global_dirty_log,
> diff --git a/savevm.c b/savevm.c
> index 22123be..3ebc01f 100644
> --- a/savevm.c
> +++ b/savevm.c
> @@ -41,6 +41,7 @@
> #include "qemu/iov.h"
> #include "block/snapshot.h"
> #include "block/qapi.h"
> +#include "hw/xen/xen.h"
>
> #define SELF_ANNOUNCE_ROUNDS 5
>
> @@ -802,6 +803,14 @@ int qemu_loadvm_state(QEMUFile *f)
> goto out;
> }
>
> + /* Validate if it is a device's state */
> + if (xen_enabled() && se->is_ram) {
> + fprintf(stderr, "loadvm: %s RAM loading not allowed on Xen\n",
> + idstr);
> + ret = -EINVAL;
> + goto out;
> + }
> +
> /* Add entry */
> le = g_malloc0(sizeof(*le));
>
> @@ -1027,6 +1036,33 @@ void qmp_xen_save_devices_state(const char *filename, Error **errp)
> }
> }
>
> +void qmp_xen_load_devices_state(const char *filename, Error **errp)
> +{
> + QEMUFile *f;
> + int saved_vm_running;
> + int ret;
> +
> + saved_vm_running = runstate_is_running();
> + vm_stop(RUN_STATE_RESTORE_VM);
> +
> + f = qemu_fopen(filename, "rb");
> + if (!f) {
> + error_setg_file_open(errp, errno, filename);
> + goto out;
> + }
> +
> + ret = qemu_loadvm_state(f);
> + qemu_fclose(f);
> + if (ret < 0) {
> + error_set(errp, QERR_IO_ERROR);
> + }
> +
> +out:
> + if (saved_vm_running) {
> + vm_start();
> + }
> +}
> +
> int load_vmstate(const char *name)
> {
> BlockDriverState *bs, *bs_vm_state;
> --
> 1.9.3
>
>
^ permalink raw reply [flat|nested] 27+ messages in thread* Re: [Qemu-devel] [PATCH 19/18] Introduce "xen-load-devices-state"
2014-10-24 7:06 ` [Qemu-devel] [PATCH 19/18] Introduce "xen-load-devices-state" Wen Congyang
2014-10-24 14:04 ` Eric Blake
2014-10-25 15:11 ` Stefano Stabellini
@ 2014-10-25 15:11 ` Stefano Stabellini
2 siblings, 0 replies; 27+ messages in thread
From: Stefano Stabellini @ 2014-10-25 15:11 UTC (permalink / raw)
To: Wen Congyang
Cc: Ian Campbell, Stefano Stabellini, Ian Jackson, Jiang Yunhong,
Dong Eddie, qemu-devl, xen devel, Paolo Bonzini, Yang Hongyang,
Lai Jiangshan
On Fri, 24 Oct 2014, Wen Congyang wrote:
> Introduce a "xen-load-devices-state" QAPI command that can be used to load
> the state of all devices, but not the RAM or the block devices of the
> VM.
>
> We only have hmp commands savevm/loadvm, and qmp commands
> xen-save-devices-state.
>
> We use this new command for COLO:
> 1. suspend both primay vm and secondary vm
> 2. sync the state
> 3. resume both primary vm and secondary vm
>
> In such case, we need to update all devices's state in any time.
>
> Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
> Cc: qemu-devl <qemu-devel@nongnu.org>
> Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
> Cc: Paolo Bonzini <pbonzini@redhat.com>
The patch looks OK to me, far better than the previous version, but I am
no QMP expert.
> qapi-schema.json | 18 ++++++++++++++++++
> qmp-commands.hx | 27 +++++++++++++++++++++++++++
> savevm.c | 36 ++++++++++++++++++++++++++++++++++++
> 3 files changed, 81 insertions(+)
>
> diff --git a/qapi-schema.json b/qapi-schema.json
> index 391356f..c569856 100644
> --- a/qapi-schema.json
> +++ b/qapi-schema.json
> @@ -4689,3 +4689,21 @@
> 'btn' : 'InputBtnEvent',
> 'rel' : 'InputMoveEvent',
> 'abs' : 'InputMoveEvent' } }
> +
> +##
> +# @xen-load-devices-state:
> +#
> +# Load the state of all devices from file. The RAM and the block devices
> +# of the VM are not loaded by this command.
> +#
> +# @filename: the file to load the state of the devices from as binary
> +# data. See xen-save-devices-state.txt for a description of the binary
> +# format.
> +#
> +# Returns: Nothing on success
> +# If @filename cannot be opened, OpenFileFailed
> +# If an I/O error occurs while reading the file, IOError
> +#
> +# Since: 2.0
> +##
> +{ 'command': 'xen-load-devices-state', 'data': {'filename': 'str'} }
> diff --git a/qmp-commands.hx b/qmp-commands.hx
> index ed3ab92..b796be5 100644
> --- a/qmp-commands.hx
> +++ b/qmp-commands.hx
> @@ -586,6 +586,33 @@ Example:
> EQMP
>
> {
> + .name = "xen-load-devices-state",
> + .args_type = "filename:F",
> + .mhandler.cmd_new = qmp_marshal_input_xen_load_devices_state,
> + },
> +
> +SQMP
> +xen-load-devices-state
> +-------
> +
> +Load the state of all devices from file. The RAM and the block devices
> +of the VM are not loaded by this command.
> +
> +Arguments:
> +
> +- "filename": the file to load the state of the devices from as binary
> +data. See xen-save-devices-state.txt for a description of the binary
> +format.
> +
> +Example:
> +
> +-> { "execute": "xen-load-devices-state",
> + "arguments": { "filename": "/tmp/resume" } }
> +<- { "return": {} }
> +
> +EQMP
> +
> + {
> .name = "xen-set-global-dirty-log",
> .args_type = "enable:b",
> .mhandler.cmd_new = qmp_marshal_input_xen_set_global_dirty_log,
> diff --git a/savevm.c b/savevm.c
> index 22123be..3ebc01f 100644
> --- a/savevm.c
> +++ b/savevm.c
> @@ -41,6 +41,7 @@
> #include "qemu/iov.h"
> #include "block/snapshot.h"
> #include "block/qapi.h"
> +#include "hw/xen/xen.h"
>
> #define SELF_ANNOUNCE_ROUNDS 5
>
> @@ -802,6 +803,14 @@ int qemu_loadvm_state(QEMUFile *f)
> goto out;
> }
>
> + /* Validate if it is a device's state */
> + if (xen_enabled() && se->is_ram) {
> + fprintf(stderr, "loadvm: %s RAM loading not allowed on Xen\n",
> + idstr);
> + ret = -EINVAL;
> + goto out;
> + }
> +
> /* Add entry */
> le = g_malloc0(sizeof(*le));
>
> @@ -1027,6 +1036,33 @@ void qmp_xen_save_devices_state(const char *filename, Error **errp)
> }
> }
>
> +void qmp_xen_load_devices_state(const char *filename, Error **errp)
> +{
> + QEMUFile *f;
> + int saved_vm_running;
> + int ret;
> +
> + saved_vm_running = runstate_is_running();
> + vm_stop(RUN_STATE_RESTORE_VM);
> +
> + f = qemu_fopen(filename, "rb");
> + if (!f) {
> + error_setg_file_open(errp, errno, filename);
> + goto out;
> + }
> +
> + ret = qemu_loadvm_state(f);
> + qemu_fclose(f);
> + if (ret < 0) {
> + error_set(errp, QERR_IO_ERROR);
> + }
> +
> +out:
> + if (saved_vm_running) {
> + vm_start();
> + }
> +}
> +
> int load_vmstate(const char *name)
> {
> BlockDriverState *bs, *bs_vm_state;
> --
> 1.9.3
>
>
^ permalink raw reply [flat|nested] 27+ messages in thread
* [PATCH 19/18] Introduce "xen-load-devices-state"
2014-10-24 7:05 [RFC Patch v4 00/18] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
` (18 preceding siblings ...)
2014-10-24 7:06 ` [Qemu-devel] [PATCH 19/18] Introduce "xen-load-devices-state" Wen Congyang
@ 2014-10-24 7:06 ` Wen Congyang
19 siblings, 0 replies; 27+ messages in thread
From: Wen Congyang @ 2014-10-24 7:06 UTC (permalink / raw)
To: xen devel
Cc: Ian Campbell, Wen Congyang, Stefano Stabellini, Ian Jackson,
Jiang Yunhong, Dong Eddie, qemu-devl, Paolo Bonzini,
Yang Hongyang, Lai Jiangshan
Introduce a "xen-load-devices-state" QAPI command that can be used to load
the state of all devices, but not the RAM or the block devices of the
VM.
Currently we only have the HMP commands savevm/loadvm and the QMP command
xen-save-devices-state.
We use this new command for COLO:
1. suspend both primary vm and secondary vm
2. sync the state
3. resume both primary vm and secondary vm
In this case, we need to be able to update the state of all devices at any time.
Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
Cc: qemu-devl <qemu-devel@nongnu.org>
Cc: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
---
qapi-schema.json | 18 ++++++++++++++++++
qmp-commands.hx | 27 +++++++++++++++++++++++++++
savevm.c | 36 ++++++++++++++++++++++++++++++++++++
3 files changed, 81 insertions(+)
diff --git a/qapi-schema.json b/qapi-schema.json
index 391356f..c569856 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -4689,3 +4689,21 @@
'btn' : 'InputBtnEvent',
'rel' : 'InputMoveEvent',
'abs' : 'InputMoveEvent' } }
+
+##
+# @xen-load-devices-state:
+#
+# Load the state of all devices from file. The RAM and the block devices
+# of the VM are not loaded by this command.
+#
+# @filename: the file to load the state of the devices from as binary
+# data. See xen-save-devices-state.txt for a description of the binary
+# format.
+#
+# Returns: Nothing on success
+# If @filename cannot be opened, OpenFileFailed
+# If an I/O error occurs while reading the file, IOError
+#
+# Since: 2.0
+##
+{ 'command': 'xen-load-devices-state', 'data': {'filename': 'str'} }
diff --git a/qmp-commands.hx b/qmp-commands.hx
index ed3ab92..b796be5 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -586,6 +586,33 @@ Example:
EQMP
{
+ .name = "xen-load-devices-state",
+ .args_type = "filename:F",
+ .mhandler.cmd_new = qmp_marshal_input_xen_load_devices_state,
+ },
+
+SQMP
+xen-load-devices-state
+-------
+
+Load the state of all devices from file. The RAM and the block devices
+of the VM are not loaded by this command.
+
+Arguments:
+
+- "filename": the file to load the state of the devices from as binary
+data. See xen-save-devices-state.txt for a description of the binary
+format.
+
+Example:
+
+-> { "execute": "xen-load-devices-state",
+ "arguments": { "filename": "/tmp/resume" } }
+<- { "return": {} }
+
+EQMP
+
+ {
.name = "xen-set-global-dirty-log",
.args_type = "enable:b",
.mhandler.cmd_new = qmp_marshal_input_xen_set_global_dirty_log,
diff --git a/savevm.c b/savevm.c
index 22123be..3ebc01f 100644
--- a/savevm.c
+++ b/savevm.c
@@ -41,6 +41,7 @@
#include "qemu/iov.h"
#include "block/snapshot.h"
#include "block/qapi.h"
+#include "hw/xen/xen.h"
#define SELF_ANNOUNCE_ROUNDS 5
@@ -802,6 +803,14 @@ int qemu_loadvm_state(QEMUFile *f)
goto out;
}
+ /* Validate if it is a device's state */
+ if (xen_enabled() && se->is_ram) {
+ fprintf(stderr, "loadvm: %s RAM loading not allowed on Xen\n",
+ idstr);
+ ret = -EINVAL;
+ goto out;
+ }
+
/* Add entry */
le = g_malloc0(sizeof(*le));
@@ -1027,6 +1036,33 @@ void qmp_xen_save_devices_state(const char *filename, Error **errp)
}
}
+void qmp_xen_load_devices_state(const char *filename, Error **errp)
+{
+ QEMUFile *f;
+ int saved_vm_running;
+ int ret;
+
+ saved_vm_running = runstate_is_running();
+ vm_stop(RUN_STATE_RESTORE_VM);
+
+ f = qemu_fopen(filename, "rb");
+ if (!f) {
+ error_setg_file_open(errp, errno, filename);
+ goto out;
+ }
+
+ ret = qemu_loadvm_state(f);
+ qemu_fclose(f);
+ if (ret < 0) {
+ error_set(errp, QERR_IO_ERROR);
+ }
+
+out:
+ if (saved_vm_running) {
+ vm_start();
+ }
+}
+
int load_vmstate(const char *name)
{
BlockDriverState *bs, *bs_vm_state;
--
1.9.3
^ permalink raw reply related [flat|nested] 27+ messages in thread