* [PATCH 1 of 2] libxl: add libxl_device_<type>_list_free helpers
2012-08-07 9:08 ` [PATCH 0 of 2] xl: fix localhost migration after 25733:353bc0801b11 Ian Campbell
@ 2012-08-07 9:08 ` Ian Campbell
2012-08-07 9:08 ` [PATCH 2 of 2] xl: migrate: destroy disks on source before giving destination the "go" Ian Campbell
2012-08-07 10:33 ` [PATCH 0 of 2] xl: fix localhost migration after 25733:353bc0801b11 Ian Jackson
2 siblings, 0 replies; 9+ messages in thread
From: Ian Campbell @ 2012-08-07 9:08 UTC (permalink / raw)
To: xen-devel; +Cc: Ian Jackson
# HG changeset patch
# User Ian Campbell <ian.campbell@citrix.com>
# Date 1344326374 -3600
# Node ID 0a67d3147a174a26dda2e268ca801e56a9c5b711
# Parent a7ad22e5525831dd491d7ee1fe538b7543404ac7
libxl: add libxl_device_<type>_list_free helpers
Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
diff -r a7ad22e55258 -r 0a67d3147a17 tools/libxl/libxl.c
--- a/tools/libxl/libxl.c Mon Aug 06 12:31:02 2012 +0100
+++ b/tools/libxl/libxl.c Tue Aug 07 08:59:34 2012 +0100
@@ -2090,6 +2090,14 @@ out_err:
return NULL;
}
+void libxl_device_disk_list_free(libxl_device_disk *list, int nr)
+{
+ int i;
+ for (i = 0; i < nr; i++)
+ libxl_device_disk_dispose(&list[i]);
+ free(list);
+}
+
int libxl_device_disk_getinfo(libxl_ctx *ctx, uint32_t domid,
libxl_device_disk *disk, libxl_diskinfo *diskinfo)
{
@@ -2759,6 +2767,14 @@ out_err:
return NULL;
}
+void libxl_device_nic_list_free(libxl_device_nic *list, int nr)
+{
+ int i;
+ for (i = 0; i < nr; i++)
+ libxl_device_nic_dispose(&list[i]);
+ free(list);
+}
+
int libxl_device_nic_getinfo(libxl_ctx *ctx, uint32_t domid,
libxl_device_nic *nic, libxl_nicinfo *nicinfo)
{
diff -r a7ad22e55258 -r 0a67d3147a17 tools/libxl/libxl.h
--- a/tools/libxl/libxl.h Mon Aug 06 12:31:02 2012 +0100
+++ b/tools/libxl/libxl.h Tue Aug 07 08:59:34 2012 +0100
@@ -668,6 +668,11 @@ void libxl_vcpuinfo_list_free(libxl_vcpu
* Initialises info with details of the given device which must be
* attached to the specified domain.
*
+ * libxl_device_<type>_list_free(*devs, int nr_dev):
+ *
+ * Dispose nr_dev elements of the array devs and then free the array
+ * itself.
+ *
* Creation / Control
* ------------------
*
@@ -714,6 +719,8 @@ int libxl_device_disk_destroy(libxl_ctx
LIBXL_EXTERNAL_CALLERS_ONLY;
libxl_device_disk *libxl_device_disk_list(libxl_ctx *ctx, uint32_t domid, int *num);
+void libxl_device_disk_list_free(libxl_device_disk *disks, int nr_disks);
+
int libxl_device_disk_getinfo(libxl_ctx *ctx, uint32_t domid,
libxl_device_disk *disk, libxl_diskinfo *diskinfo);
@@ -739,6 +746,7 @@ int libxl_device_nic_destroy(libxl_ctx *
LIBXL_EXTERNAL_CALLERS_ONLY;
libxl_device_nic *libxl_device_nic_list(libxl_ctx *ctx, uint32_t domid, int *num);
+void libxl_device_nic_list_free(libxl_device_nic *nics, int nr_nics);
int libxl_device_nic_getinfo(libxl_ctx *ctx, uint32_t domid,
libxl_device_nic *nic, libxl_nicinfo *nicinfo);
@@ -784,6 +792,7 @@ int libxl_device_pci_destroy(libxl_ctx *
libxl_device_pci *libxl_device_pci_list(libxl_ctx *ctx, uint32_t domid,
int *num);
+void libxl_device_pci_list_free(libxl_device_pci *pcis, int nr_pcis);
/*
* Functions related to making devices assignable -- that is, bound to
diff -r a7ad22e55258 -r 0a67d3147a17 tools/libxl/libxl_pci.c
--- a/tools/libxl/libxl_pci.c Mon Aug 06 12:31:02 2012 +0100
+++ b/tools/libxl/libxl_pci.c Tue Aug 07 08:59:34 2012 +0100
@@ -1406,6 +1406,14 @@ out:
return pcidevs;
}
+void libxl_device_pci_list_free(libxl_device_pci *list, int nr)
+{
+ int i;
+ for (i = 0; i < nr; i++)
+ libxl_device_pci_dispose(&list[i]);
+ free(list);
+}
+
int libxl__device_pci_destroy_all(libxl__gc *gc, uint32_t domid)
{
libxl_ctx *ctx = libxl__gc_owner(gc);
^ permalink raw reply [flat|nested] 9+ messages in thread* [PATCH 2 of 2] xl: migrate: destroy disks on source before giving destination the "go"
2012-08-07 9:08 ` [PATCH 0 of 2] xl: fix localhost migration after 25733:353bc0801b11 Ian Campbell
2012-08-07 9:08 ` [PATCH 1 of 2] libxl: add libxl_device_<type>_list_free helpers Ian Campbell
@ 2012-08-07 9:08 ` Ian Campbell
2012-08-07 10:33 ` [PATCH 0 of 2] xl: fix localhost migration after 25733:353bc0801b11 Ian Jackson
2 siblings, 0 replies; 9+ messages in thread
From: Ian Campbell @ 2012-08-07 9:08 UTC (permalink / raw)
To: xen-devel; +Cc: Ian Jackson
# HG changeset patch
# User Ian Campbell <ian.campbell@citrix.com>
# Date 1344330518 -3600
# Node ID 773de9b98cd183ee2b595f1856fb9768709e20f7
# Parent 0a67d3147a174a26dda2e268ca801e56a9c5b711
xl: migrate: destroy disks on source before giving destination the "go"
25733:353bc0801b11 "libxl: support custom block hotplug scripts which
had the (intended) side effect of re-enabling the hotplug script's
device sharing checks, which in turn has exposed the fact that during
migration xl currently has both devices in existence (but thankfully
not active).
Fix this by destroying the disk backends before sending the GO message
to the destination end (and recreating them on failure).
This is a bit of an ad-hoc solution for 4.2, we should revisit the
sequencing of the operations during migration for 4.3.
Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
diff -r 0a67d3147a17 -r 773de9b98cd1 tools/libxl/xl_cmdimpl.c
--- a/tools/libxl/xl_cmdimpl.c Tue Aug 07 08:59:34 2012 +0100
+++ b/tools/libxl/xl_cmdimpl.c Tue Aug 07 10:08:38 2012 +0100
@@ -3021,6 +3021,8 @@ static void migrate_domain(const char *d
char rc_buf;
uint8_t *config_data;
int config_len;
+ libxl_device_disk *disks;
+ int i, nr_disks = 0;
save_domain_core_begin(domain_spec, override_config_file,
&config_data, &config_len);
@@ -3072,6 +3074,18 @@ static void migrate_domain(const char *d
if (rc) goto failed_resume;
}
+ /* In order to avoid having the same disk active twice
+ * simultaneously on a migration, which will trigger the hotplug
+ * script sharing detection, we must first remove the disks from
+ * the source domain. */
+ disks = libxl_device_disk_list(ctx, domid, &nr_disks);
+ if (!disks) goto failed_badly;
+
+ for (i = 0 ; i < nr_disks ; i++) {
+ rc = libxl_device_disk_destroy(ctx, domid, &disks[i], NULL);
+ if (rc) goto failed_badly;
+ }
+
/* point of no return - as soon as we have tried to say
* "go" to the receiver, it's not safe to carry on. We leave
* the domain renamed to %s--migratedaway in case that's helpful.
@@ -3107,6 +3121,13 @@ static void migrate_domain(const char *d
fprintf(stderr, "migration sender: Trying to resume at our end.\n");
+ /* Re-add disks removed above. */
+ for (i = 0 ; i < nr_disks ; i++) {
+ rc = libxl_device_disk_add(ctx, domid, &disks[i], NULL);
+ if (rc) goto failed_badly;
+ }
+ libxl_device_disk_list_free(disks, nr_disks);
+
if (common_domname) {
libxl_domain_rename(ctx, domid, away_domname, common_domname);
}
@@ -3120,6 +3141,8 @@ static void migrate_domain(const char *d
fprintf(stderr, "migration sender: Target reports successful startup.\n");
libxl_domain_destroy(ctx, domid, 0); /* bang! */
fprintf(stderr, "Migration successful.\n");
+
+ libxl_device_disk_list_free(disks, nr_disks);
exit(0);
failed_suspend:
@@ -3132,6 +3155,7 @@ static void migrate_domain(const char *d
close(send_fd);
migration_child_report(recv_fd);
fprintf(stderr, "Migration failed, resuming at sender.\n");
+ assert(!nr_disks); /* This failure path is always before disk shutdown */
libxl_domain_resume(ctx, domid, 0);
exit(-ERROR_FAIL);
@@ -3146,6 +3170,7 @@ static void migrate_domain(const char *d
close(send_fd);
migration_child_report(recv_fd);
+ libxl_device_disk_list_free(disks, nr_disks);
exit(-ERROR_BADFAIL);
}
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [PATCH 0 of 2] xl: fix localhost migration after 25733:353bc0801b11
2012-08-07 9:08 ` [PATCH 0 of 2] xl: fix localhost migration after 25733:353bc0801b11 Ian Campbell
2012-08-07 9:08 ` [PATCH 1 of 2] libxl: add libxl_device_<type>_list_free helpers Ian Campbell
2012-08-07 9:08 ` [PATCH 2 of 2] xl: migrate: destroy disks on source before giving destination the "go" Ian Campbell
@ 2012-08-07 10:33 ` Ian Jackson
2012-08-07 10:36 ` Ian Campbell
2012-08-07 11:02 ` Ian Campbell
2 siblings, 2 replies; 9+ messages in thread
From: Ian Jackson @ 2012-08-07 10:33 UTC (permalink / raw)
To: Ian Campbell; +Cc: xen-devel@lists.xen.org
Ian Campbell writes ("[PATCH 0 of 2] xl: fix localhost migration after 25733:353bc0801b11"):
> On Tue, 2012-08-07 at 09:25 +0100, Ian Campbell wrote:
> > I'm going to investigate if I can make xl sequence the device
> > teardowns in the migration so as to make it work.
>
> Here is a short series which does this.
>
> It's a bit ad-hoc, we probably want to revist this for 4.3 (if not
> now).
I think the right answer at this stage of the release is to back out
the part of the recent change which is causing the problem.
In this case that means abolishing the execution of
/etc/xen/scripts/block, which "libxl: support custom block hotplug
scripts" had as an intended side-effect.
Ian.
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [PATCH 0 of 2] xl: fix localhost migration after 25733:353bc0801b11
2012-08-07 10:33 ` [PATCH 0 of 2] xl: fix localhost migration after 25733:353bc0801b11 Ian Jackson
@ 2012-08-07 10:36 ` Ian Campbell
2012-08-07 11:02 ` Ian Campbell
1 sibling, 0 replies; 9+ messages in thread
From: Ian Campbell @ 2012-08-07 10:36 UTC (permalink / raw)
To: Ian Jackson; +Cc: xen-devel@lists.xen.org
On Tue, 2012-08-07 at 11:33 +0100, Ian Jackson wrote:
> Ian Campbell writes ("[PATCH 0 of 2] xl: fix localhost migration after 25733:353bc0801b11"):
> > On Tue, 2012-08-07 at 09:25 +0100, Ian Campbell wrote:
> > > I'm going to investigate if I can make xl sequence the device
> > > teardowns in the migration so as to make it work.
> >
> > Here is a short series which does this.
> >
> > It's a bit ad-hoc, we probably want to revist this for 4.3 (if not
> > now).
>
> I think the right answer at this stage of the release is to back out
> the part of the recent change which is causing the problem.
>
> In this case that means abolishing the execution of
> /etc/xen/scripts/block, which "libxl: support custom block hotplug
> scripts" had as an intended side-effect.
OK, I'll look into this.
Ian.
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [PATCH 0 of 2] xl: fix localhost migration after 25733:353bc0801b11
2012-08-07 10:33 ` [PATCH 0 of 2] xl: fix localhost migration after 25733:353bc0801b11 Ian Jackson
2012-08-07 10:36 ` Ian Campbell
@ 2012-08-07 11:02 ` Ian Campbell
2012-08-07 13:38 ` Ian Campbell
1 sibling, 1 reply; 9+ messages in thread
From: Ian Campbell @ 2012-08-07 11:02 UTC (permalink / raw)
To: Ian Jackson; +Cc: xen-devel@lists.xen.org
On Tue, 2012-08-07 at 11:33 +0100, Ian Jackson wrote:
> Ian Campbell writes ("[PATCH 0 of 2] xl: fix localhost migration after 25733:353bc0801b11"):
> > On Tue, 2012-08-07 at 09:25 +0100, Ian Campbell wrote:
> > > I'm going to investigate if I can make xl sequence the device
> > > teardowns in the migration so as to make it work.
> >
> > Here is a short series which does this.
> >
> > It's a bit ad-hoc, we probably want to revist this for 4.3 (if not
> > now).
>
> I think the right answer at this stage of the release is to back out
> the part of the recent change which is causing the problem.
>
> In this case that means abolishing the execution of
> /etc/xen/scripts/block, which "libxl: support custom block hotplug
> scripts" had as an intended side-effect.
Here it is. This approach has the side effect that we now require block
hotplug scripts to properly nest/reference count for localhost migration
to work. e.g. the effect of add->add->remove calls to the script should
be that the device remains connected until a second remove. This may be
more or less trivial depending on the device
I suppose we can live with that limitation since localhost migration is
mostly for test purposes.
I'm slightly concerned that this may also effect non-localhost
migration. e.g. an iSCSI target which only permits a single login.
8<------------------------------------------------------------
# HG changeset patch
# User Ian Campbell <ian.campbell@citrix.com>
# Date 1344337292 -3600
# Node ID c5f673d8b330d6195e2aa3bbf63bb594b4bc99ee
# Parent a7ad22e5525831dd491d7ee1fe538b7543404ac7
libxl: write physical-device node if user did not supply a block script
This reverts one of the intentional changes from 25733:353bc0801b11.
That change exposed an issue with the xl migration protocol, which
although safe triggers the hotplug scripts device sharing logic.
For 4.2 we disable this logic by writing the physical-device xenstore
node ourselves if a user did not supply a script. If the user did
supply a script then we continue to rely on it to write the
physical-device node (not least because the script may create the
device and therefore it is not available before we run the script).
This means that to support localhost migration a block hotplug script
needs to be robust against adding a device twice and should not
deactivate the device until it has been removed twice.
This should be revisted for 4.3.
Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
diff -r a7ad22e55258 -r c5f673d8b330 tools/libxl/libxl.c
--- a/tools/libxl/libxl.c Mon Aug 06 12:31:02 2012 +0100
+++ b/tools/libxl/libxl.c Tue Aug 07 12:01:32 2012 +0100
@@ -1845,18 +1845,31 @@ static void device_disk_add(libxl__egc *
case LIBXL_DISK_BACKEND_PHY:
dev = disk->pdev_path;
- script = libxl__abs_path(gc, disk->script ?: "block",
- libxl__xen_script_dir_path());
-
do_backend_phy:
flexarray_append(back, "params");
flexarray_append(back, dev);
- assert(script);
+ script = libxl__abs_path(gc, disk->script?: "block",
+ libxl__xen_script_dir_path());
flexarray_append_pair(back, "script", script);
+ /* If the user did not supply a block script then we
+ * write the physical-device node ourselves.
+ *
+ * If the user did supply a script then that script is
+ * responsible for this since the block device may not
+ * exist yet.
+ */
+ if (!disk->script) {
+ int major, minor;
+ libxl__device_physdisk_major_minor(dev, &major, &minor);
+ flexarray_append_pair(back, "physical-device",
+ libxl__sprintf(gc, "%x:%x", major, minor));
+ }
+
assert(device->backend_kind == LIBXL__DEVICE_KIND_VBD);
break;
+
case LIBXL_DISK_BACKEND_TAP:
dev = libxl__blktap_devpath(gc, disk->pdev_path, disk->format);
if (!dev) {
@@ -1870,12 +1883,9 @@ static void device_disk_add(libxl__egc *
libxl__device_disk_string_of_format(disk->format),
disk->pdev_path));
- /*
- * tap devices do not support custom block scripts and
- * always use the plain block script.
- */
- script = libxl__abs_path(gc, "block",
- libxl__xen_script_dir_path());
+ /* tap backends with scripts are rejected by
+ * libxl__device_disk_set_backend */
+ assert(!disk->script);
/* now create a phy device to export the device to the guest */
goto do_backend_phy;
^ permalink raw reply [flat|nested] 9+ messages in thread* Re: [PATCH 0 of 2] xl: fix localhost migration after 25733:353bc0801b11
2012-08-07 11:02 ` Ian Campbell
@ 2012-08-07 13:38 ` Ian Campbell
0 siblings, 0 replies; 9+ messages in thread
From: Ian Campbell @ 2012-08-07 13:38 UTC (permalink / raw)
To: Ian Jackson; +Cc: xen-devel@lists.xen.org
On Tue, 2012-08-07 at 12:02 +0100, Ian Campbell wrote:
> # HG changeset patch
> # User Ian Campbell <ian.campbell@citrix.com>
> # Date 1344337292 -3600
> # Node ID c5f673d8b330d6195e2aa3bbf63bb594b4bc99ee
> # Parent a7ad22e5525831dd491d7ee1fe538b7543404ac7
> libxl: write physical-device node if user did not supply a block script
>
> This reverts one of the intentional changes from 25733:353bc0801b11.
> That change exposed an issue with the xl migration protocol, which
> although safe triggers the hotplug scripts device sharing logic.
>
> For 4.2 we disable this logic by writing the physical-device xenstore
> node ourselves if a user did not supply a script. If the user did
> supply a script then we continue to rely on it to write the
> physical-device node (not least because the script may create the
> device and therefore it is not available before we run the script).
>
> This means that to support localhost migration a block hotplug script
> needs to be robust against adding a device twice and should not
> deactivate the device until it has been removed twice.
>
> This should be revisted for 4.3.
^i
>
> Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Ian J acked this IRL so I have committed.
Our reasoning is that at this stage in the release it is better to go
back to the previous behaviour, which although strictly speaking wrong
is harmless and has no API implications, rather than try to invent new
correct APIs on short notice etc.
Ian.
^ permalink raw reply [flat|nested] 9+ messages in thread