[RFC Patch v3 21/22] setup and control colo-agent for secondary vm

xen-devel.lists.xenproject.org archive mirror
 help / color / mirror / Atom feed

From: Wen Congyang <wency@cn.fujitsu.com>
To: xen devel <xen-devel@lists.xen.org>
Cc: Ian Campbell <Ian.Campbell@citrix.com>,
	Wen Congyang <wency@cn.fujitsu.com>,
	Ian Jackson <Ian.Jackson@eu.citrix.com>,
	Jiang Yunhong <yunhong.jiang@intel.com>,
	Dong Eddie <eddie.dong@intel.com>,
	Yang Hongyang <yanghy@cn.fujitsu.com>,
	Lai Jiangshan <laijs@cn.fujitsu.com>
Subject: [RFC Patch v3 21/22] setup and control colo-agent for secondary vm
Date: Fri, 5 Sep 2014 17:25:56 +0800	[thread overview]
Message-ID: <1409909158-19243-22-git-send-email-wency@cn.fujitsu.com> (raw)
In-Reply-To: <1409909158-19243-1-git-send-email-wency@cn.fujitsu.com>

This patch adds the machinery required for protecting a secondary vm's
network device state. This patch implements the interfaces required by
the checkpoint abstract device layer. A note about the implementation:
   a) setup() and teardown() are called for each vif attached to the
      secondary vm.
      During setup(), the hotplug script is called to set up the COLO agent
      for the given vif. The script does the following things:
      i)  redirect vif egress traffic to the FORWARD device
      ii) redirect FORWARD device egress traffic to vif

      During teardown(), the hotplug scripts are called again for each
      vif. The scripts do the following things:
      i)  remove the vif->FORWARD traffic redirection
      ii) remove the FORWARD->vif traffic redirection

   b) Nothing should be done for secondary vm's network device.

Signed-off-by: Wen Congyang <wency@cn.fujitsu.com>
---
 tools/libxl/libxl_colo_nic.c     |  21 ++++++
 tools/libxl/libxl_colo_restore.c | 152 +++++++++++++++++++++++++++++++++------
 tools/libxl/libxl_internal.h     |   2 +
 3 files changed, 155 insertions(+), 20 deletions(-)

diff --git a/tools/libxl/libxl_colo_nic.c b/tools/libxl/libxl_colo_nic.c
index 56bccad..9955ef1 100644
--- a/tools/libxl/libxl_colo_nic.c
+++ b/tools/libxl/libxl_colo_nic.c
@@ -286,3 +286,24 @@ const libxl__checkpoint_device_instance_ops colo_save_device_nic = {
     .setup = colo_nic_save_setup,
     .teardown = colo_nic_save_teardown,
 };
+
+/* ======== secondary ======== */
+static void colo_nic_restore_setup(libxl__checkpoint_device *dev)
+{
+    libxl__colo_restore_state *crs = CONTAINER_OF(dev->cds, *crs, cds);
+
+    colo_nic_setup(dev, secondary, crs->colo_agent_script);
+}
+
+static void colo_nic_restore_teardown(libxl__checkpoint_device *dev)
+{
+    libxl__colo_restore_state *crs = CONTAINER_OF(dev->cds, *crs, cds);
+
+    colo_nic_teardown(dev, secondary, crs->colo_agent_script);
+}
+
+const libxl__checkpoint_device_instance_ops colo_restore_device_nic = {
+    .kind = LIBXL__DEVICE_KIND_CHECKPOINT_NIC,
+    .setup = colo_nic_restore_setup,
+    .teardown = colo_nic_restore_teardown,
+};
diff --git a/tools/libxl/libxl_colo_restore.c b/tools/libxl/libxl_colo_restore.c
index 3fd587b..c9b57a4 100644
--- a/tools/libxl/libxl_colo_restore.c
+++ b/tools/libxl/libxl_colo_restore.c
@@ -40,6 +40,9 @@ struct libxl__colo_restore_checkpoint_state {
     libxl__logdirty_switch lds;
     libxl__colo_restore_state *crs;
     int status;
+    /* used for teardown */
+    int teardown_devices;
+    int saved_rc;
 
     void (*callback)(libxl__egc *,
                      libxl__colo_restore_checkpoint_state *,
@@ -58,6 +61,13 @@ static void libxl__colo_restore_domain_resume_callback(void *data);
 static void libxl__colo_restore_domain_checkpoint_callback(void *data);
 static void libxl__colo_restore_domain_suspend_callback(void *data);
 
+extern const libxl__checkpoint_device_instance_ops colo_restore_device_nic;
+
+static const libxl__checkpoint_device_instance_ops *colo_restore_ops[] = {
+    &colo_restore_device_nic,
+    NULL,
+};
+
 /* ===================== colo: common functions ===================== */
 static void colo_enable_logdirty(libxl__colo_restore_state *crs, libxl__egc *egc)
 {
@@ -138,6 +148,28 @@ static void colo_resume_vm(libxl__egc *egc,
     return;
 }
 
+static int init_device_subkind(libxl__checkpoint_devices_state *cds)
+{
+    /* init device subkind-specific state in the libxl ctx */
+    int rc;
+    STATE_AO_GC(cds->ao);
+
+    rc = init_subkind_colo_nic(cds);
+    if (rc) goto out;
+
+    rc = 0;
+out:
+    return rc;
+}
+
+static void cleanup_device_subkind(libxl__checkpoint_devices_state *cds)
+{
+    /* cleanup device subkind-specific state in the libxl ctx */
+    STATE_AO_GC(cds->ao);
+
+    cleanup_subkind_colo_nic(cds);
+}
+
 
 /* ================ colo: setup restore environment ================ */
 static void libxl__colo_domain_create_cb(libxl__egc *egc,
@@ -266,6 +298,9 @@ static void libxl__colo_domain_create_cb(libxl__egc *egc,
 
 
 /* ================ colo: teardown restore environment ================ */
+static void colo_restore_teardown_done(libxl__egc *egc,
+                                       libxl__checkpoint_devices_state *cds,
+                                       int rc);
 static void do_failover_done(libxl__egc *egc,
                              libxl__colo_restore_checkpoint_state* crcs,
                              int rc);
@@ -312,11 +347,38 @@ void libxl__colo_restore_teardown(libxl__egc *egc,
     EGC_GC;
 
     if (!dirty_bitmap)
-        goto do_failover;
+        goto teardown_devices;
 
     xc_hypercall_buffer_free_pages(CTX->xch, dirty_bitmap, NRPAGES(bsize));
 
-do_failover:
+teardown_devices:
+    crcs->saved_rc = rc;
+    if (!crcs->teardown_devices) {
+        colo_restore_teardown_done(egc, &crs->cds, 0);
+        return;
+    }
+
+    crs->cds.callback = colo_restore_teardown_done;
+    libxl__checkpoint_devices_teardown(egc, &crs->cds);
+}
+
+static void colo_restore_teardown_done(libxl__egc *egc,
+                                       libxl__checkpoint_devices_state *cds,
+                                       int rc)
+{
+    libxl__colo_restore_state *crs = CONTAINER_OF(cds, *crs, cds);
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+
+    EGC_GC;
+
+    if (rc)
+        LOG(ERROR, "COLO: failed to teardown device after setup failed"
+            " for guest with domid %u, rc %d", cds->domid, rc);
+
+    cleanup_device_subkind(cds);
+
+    rc = crcs->saved_rc;
     if (!rc) {
         crcs->callback = do_failover_done;
         do_failover(egc, crs);
@@ -405,6 +467,11 @@ static void colo_reenable_logdirty(libxl__egc *egc,
 static void colo_reenable_logdirty_done(libxl__egc *egc,
                                         libxl__logdirty_switch *lds,
                                         int rc);
+static void colo_setup_checkpoint_devices(libxl__egc *egc,
+                                          libxl__colo_restore_state *crs);
+static void colo_restore_setup_cds_done(libxl__egc *egc,
+                                        libxl__checkpoint_devices_state *cds,
+                                        int rc);
 
 static void libxl__colo_restore_domain_resume_callback(void *data)
 {
@@ -516,7 +583,6 @@ static void colo_write_svm_resumed(libxl__egc *egc,
     dc->copywhat = crcs->copywhat[2];
     dc->writewhat = "colo stream";
     dc->callback = colo_common_send_data_done;
-    /* TODO: configure network */
     crcs->callback = NULL;
 
     rc = libxl__datacopier_start(dc);
@@ -539,12 +605,9 @@ static void colo_enable_logdirty_done(libxl__egc *egc,
                                       int rc)
 {
     libxl__colo_restore_checkpoint_state *crcs = CONTAINER_OF(lds, *crcs, lds);
-    libxl__domain_create_state *dcs = CONTAINER_OF(crcs->crs, *dcs, crs);
 
     /* Convenience aliases */
     libxl__colo_restore_state *const crs = crcs->crs;
-    libxl__save_helper_state *const shs = &dcs->shs;
-    const uint32_t domid = crs->domid;
 
     STATE_AO_GC(crs->ao);
 
@@ -558,19 +621,7 @@ static void colo_enable_logdirty_done(libxl__egc *egc,
         return;
     }
 
-    /* We have enabled secondary vm's logdirty, so we can unpause it now */
-    rc = libxl__domain_unpause(gc, domid);
-    if (rc) {
-        LOG(ERROR, "cannot unpause secondary vm");
-        goto out;
-    }
-
-    colo_write_svm_resumed(egc, crcs);
-
-    return;
-
-out:
-    libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+    colo_setup_checkpoint_devices(egc, crs);
 }
 
 static void colo_reenable_logdirty(libxl__egc *egc,
@@ -609,7 +660,6 @@ static void colo_reenable_logdirty_done(libxl__egc *egc,
 
     /* Convenience aliases */
     libxl__save_helper_state *const shs = &dcs->shs;
-    const uint32_t domid = crcs->crs->domid;
 
     STATE_AO_GC(crcs->crs->ao);
 
@@ -618,6 +668,68 @@ static void colo_reenable_logdirty_done(libxl__egc *egc,
         goto out;
     }
 
+    colo_setup_checkpoint_devices(egc, crcs->crs);
+
+    return;
+
+out:
+    libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+}
+
+/*
+ * We cannot setup checkpoint devices in libxl__colo_restore_setup(),
+ * because the guest is not ready.
+ */
+static void colo_setup_checkpoint_devices(libxl__egc *egc,
+                                          libxl__colo_restore_state *crs)
+{
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+
+    /* Convenience aliases */
+    libxl__checkpoint_devices_state *cds = &crs->cds;
+    libxl__save_helper_state *const shs = &dcs->shs;
+
+    STATE_AO_GC(crs->ao);
+
+    crcs->teardown_devices = 1;
+
+    cds->device_kind_flags = (1 << LIBXL__DEVICE_KIND_CHECKPOINT_NIC);
+    cds->callback = colo_restore_setup_cds_done;
+    cds->ao = ao;
+    cds->domid = crs->domid;
+    cds->ops = colo_restore_ops;
+
+    if (init_device_subkind(cds))
+        goto out;
+
+    libxl__checkpoint_devices_setup(egc, cds);
+    return;
+
+out:
+    libxl__xc_domain_saverestore_async_callback_done(egc, shs, 0);
+}
+
+static void colo_restore_setup_cds_done(libxl__egc *egc,
+                                        libxl__checkpoint_devices_state *cds,
+                                        int rc)
+{
+    libxl__colo_restore_state *crs = CONTAINER_OF(cds, *crs, cds);
+    libxl__domain_create_state *dcs = CONTAINER_OF(crs, *dcs, crs);
+    libxl__colo_restore_checkpoint_state *crcs = crs->crcs;
+
+    /* Convenience aliases */
+    libxl__save_helper_state *const shs = &dcs->shs;
+    const uint32_t domid = crs->domid;
+
+    STATE_AO_GC(cds->ao);
+
+    if (rc) {
+        LOG(ERROR, "COLO: failed to setup device for guest with domid %u",
+            cds->domid);
+        goto out;
+    }
+
     /* We have enabled secondary vm's logdirty, so we can unpause it now */
     rc = libxl__domain_unpause(gc, domid);
     if (rc) {
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index b1a7208..00dfa1e 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -3078,10 +3078,12 @@ struct libxl__colo_restore_state {
     int pae;
     int superpages;
     libxl__colo_callback *callback;
+    char *colo_agent_script;
 
     /* private, colo restore checkpoint state */
     libxl__domain_create_cb *saved_cb;
     void *crcs;
+    libxl__checkpoint_devices_state cds;
 };
 
 struct libxl__domain_create_state {
-- 
1.9.3

next prev parent reply	other threads:[~2014-09-05  9:25 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-09-05  9:25 [RFC Patch v3 00/22] COarse-grain LOck-stepping Virtual Machines for Non-stop Service Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 01/22] move remus related codes to libxl_remus.c Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 02/22] rename remus device to checkpoint device Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 03/22] adjust the indentation Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 04/22] don't touch remus in checkpoint_device Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 05/22] Update libxl_save_msgs_gen.pl to support return data from xl to xc Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 06/22] Allow slave sends data to master Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 07/22] secondary vm suspend/resume/checkpoint code Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 08/22] primary vm suspend/get_dirty_pfn/resume/checkpoint code Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 09/22] xc_domain_save: flush cache before calling callbacks->postcopy() in colo mode Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 10/22] COLO: xc related codes Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 11/22] send store mfn and console mfn to xl before resuming secondary vm Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 12/22] implement the cmdline for COLO Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 13/22] blktap2: connect to backup asynchronously Wen Congyang
2014-09-24 19:11   ` Shriram Rajagopalan
2014-09-25  5:40     ` Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 14/22] switch to unprotected mode before closing Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 15/22] blktap2: move async connect related codes to block-replication.c Wen Congyang
2014-09-24 18:48   ` Shriram Rajagopalan
2014-09-05  9:25 ` [RFC Patch v3 16/22] blktap2: move ramdisk " Wen Congyang
2014-09-24 18:44   ` Shriram Rajagopalan
2014-09-26  5:18     ` Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 17/22] block-colo: implement colo disk replication Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 18/22] support blktap COLO in xl: Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 19/22] libxl/colo: setup and control disk replication for blktap2 backends Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 20/22] setup and control colo-agent for primary vm Wen Congyang
2014-09-05  9:25 ` Wen Congyang [this message]
2014-09-05  9:25 ` [RFC Patch v3 22/22] colo: cmdline switches and config vars to control colo-agent Wen Congyang
2014-09-05  9:25 ` [RFC Patch v3 23/22] Introduce "xen-load-devices-state" Wen Congyang
2014-09-05 21:57   ` Stefano Stabellini
     [not found]   ` <alpine.DEB.2.02.1409052229550.2334@kaball.uk.xensource.com>
2014-09-09  2:47     ` Wen Congyang
     [not found]     ` <540E6A44.8090507@cn.fujitsu.com>
2014-09-10 19:15       ` Stefano Stabellini
     [not found]       ` <alpine.DEB.2.02.1409102005450.8137@kaball.uk.xensource.com>
2014-09-11  5:03         ` Wen Congyang

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:56bccad dfblob:9955ef1 dfblob:3fd587b dfblob:c9b57a4
dfblob:b1a7208 dfblob:00dfa1e )
 OR (
bs:"[RFC Patch v3 21/22] setup and control colo-agent for secondary vm" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1409909158-19243-22-git-send-email-wency@cn.fujitsu.com \
    --to=wency@cn.fujitsu.com \
    --cc=Ian.Campbell@citrix.com \
    --cc=Ian.Jackson@eu.citrix.com \
    --cc=eddie.dong@intel.com \
    --cc=laijs@cn.fujitsu.com \
    --cc=xen-devel@lists.xen.org \
    --cc=yanghy@cn.fujitsu.com \
    --cc=yunhong.jiang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).