qemu-devel.nongnu.org archive mirror
 help / color / mirror / Atom feed
From: Michael Roth <mdroth@linux.vnet.ibm.com>
To: libvir-list@redhat.com
Cc: qemu-devel@nongnu.org, qemu-ppc@nongnu.org, aik@ozlabs.ru,
	alex.williamson@redhat.com, abologna@redhat.com, laine@laine.org,
	pkrempa@redhat.com, berrange@redhat.com, mprivozn@redhat.com,
	sbhat@linux.vnet.ibm.com
Subject: [Qemu-devel] [RFC PATCH 4/5] qemu: hotplug: unbind VFIO devices as a group
Date: Wed, 28 Jun 2017 19:24:59 -0500	[thread overview]
Message-ID: <1498695900-1648-5-git-send-email-mdroth@linux.vnet.ibm.com> (raw)
In-Reply-To: <1498695900-1648-1-git-send-email-mdroth@linux.vnet.ibm.com>

Currently we bind a managed hostdev back to the host driver (or
"unbind" from the perspective of the stub driver) immediately upon
receiving a DEVICE_DELETED event from QEMU. In cases where we have
more one device from the group attached to a guest, this runs the risk
of putting the group in a "non-viable" state where both a guest and
host are using devices from a group simultaneously.

This patch addresses this by deferring the unbind step until all
hostdevs from a group have been detached from the guest. In the
meantime, they are left on the drvManager's inactiveList, in a similar
state as they would be if they were unmanaged devices that were bound
to VFIO via nodedev-detach but not yet plugged into a guest.

Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
---
 src/libvirt_private.syms |  3 ++
 src/qemu/qemu_hostdev.c  | 16 +++++++++
 src/qemu/qemu_hostdev.h  |  4 +++
 src/qemu/qemu_hotplug.c  | 16 ++++++++-
 src/util/virhostdev.c    | 90 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/util/virhostdev.h    |  8 +++++
 src/util/virpci.c        | 27 +++++++++++++++
 src/util/virpci.h        |  1 +
 8 files changed, 164 insertions(+), 1 deletion(-)

diff --git a/src/libvirt_private.syms b/src/libvirt_private.syms
index 2bd3581..ba7fa39 100644
--- a/src/libvirt_private.syms
+++ b/src/libvirt_private.syms
@@ -1783,6 +1783,8 @@ virHostCPUStatsAssign;
 virHostdevFindUSBDevice;
 virHostdevIsSCSIDevice;
 virHostdevManagerGetDefault;
+virHostdevPCIDeviceGroupUnbind;
+virHostdevPCIDeviceGroupUnbindable;
 virHostdevPCINodeDeviceDetach;
 virHostdevPCINodeDeviceReAttach;
 virHostdevPCINodeDeviceReset;
@@ -2342,6 +2344,7 @@ virPCIDeviceWaitForCleanup;
 virPCIEDeviceInfoFree;
 virPCIGetDeviceAddressFromSysfsLink;
 virPCIGetHeaderType;
+virPCIGetIOMMUGroupList;
 virPCIGetNetName;
 virPCIGetPhysicalFunction;
 virPCIGetVirtualFunctionIndex;
diff --git a/src/qemu/qemu_hostdev.c b/src/qemu/qemu_hostdev.c
index 73d26f4..fdc52fe 100644
--- a/src/qemu/qemu_hostdev.c
+++ b/src/qemu/qemu_hostdev.c
@@ -384,6 +384,22 @@ qemuHostdevPrepareDomainDevices(virQEMUDriverPtr driver,
 }
 
 void
+qemuHostdevReleasePCIDevices(virQEMUDriverPtr driver,
+                             const char *name,
+                             virDomainHostdevDefPtr *hostdevs,
+                             int nhostdevs)
+{
+    virQEMUDriverConfigPtr cfg = virQEMUDriverGetConfig(driver);
+    const char *oldStateDir = cfg->stateDir;
+    virHostdevManagerPtr hostdev_mgr = driver->hostdevMgr;
+
+    virHostdevReleasePCIDevices(hostdev_mgr, QEMU_DRIVER_NAME, name,
+                                hostdevs, nhostdevs, oldStateDir);
+
+    virObjectUnref(cfg);
+}
+
+void
 qemuHostdevReAttachPCIDevices(virQEMUDriverPtr driver,
                               const char *name,
                               virDomainHostdevDefPtr *hostdevs,
diff --git a/src/qemu/qemu_hostdev.h b/src/qemu/qemu_hostdev.h
index 9a7c7f1..b010085 100644
--- a/src/qemu/qemu_hostdev.h
+++ b/src/qemu/qemu_hostdev.h
@@ -74,6 +74,10 @@ void qemuHostdevReAttachPCIDevices(virQEMUDriverPtr driver,
                                    const char *name,
                                    virDomainHostdevDefPtr *hostdevs,
                                    int nhostdevs);
+void qemuHostdevReleasePCIDevices(virQEMUDriverPtr driver,
+                                  const char *name,
+                                  virDomainHostdevDefPtr *hostdevs,
+                                  int nhostdevs);
 void qemuHostdevReAttachUSBDevices(virQEMUDriverPtr driver,
                                    const char *name,
                                    virDomainHostdevDefPtr *hostdevs,
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
index b557e82..af5ee6f 100644
--- a/src/qemu/qemu_hotplug.c
+++ b/src/qemu/qemu_hotplug.c
@@ -3896,7 +3896,10 @@ qemuDomainRemoveHostDevice(virQEMUDriverPtr driver,
 
     switch ((virDomainHostdevSubsysType) hostdev->source.subsys.type) {
     case VIR_DOMAIN_HOSTDEV_SUBSYS_TYPE_PCI:
-        qemuHostdevReAttachPCIDevices(driver, vm->def->name, &hostdev, 1);
+        if (is_vfio)
+            qemuHostdevReleasePCIDevices(driver, vm->def->name, &hostdev, 1);
+        else
+            qemuHostdevReAttachPCIDevices(driver, vm->def->name, &hostdev, 1);
         qemuDomainReleaseDeviceAddress(vm, hostdev->info, NULL);
         /* QEMU might no longer need to lock as much memory, eg. we just
          * detached the last VFIO device, so adjust the limit here */
@@ -3925,6 +3928,17 @@ qemuDomainRemoveHostDevice(virQEMUDriverPtr driver,
         virDomainNetDefFree(net);
     }
 
+    if (is_vfio) {
+        int iommu_group =
+            virPCIDeviceAddressGetIOMMUGroupNum(&hostdev->source.subsys.u.pci.addr);
+        if (virHostdevPCIDeviceGroupUnbindable(driver->hostdevMgr,
+                                               iommu_group)) {
+            virHostdevPCIDeviceGroupUnbind(driver->hostdevMgr,
+                                           iommu_group);
+        }
+    }
+
+
     ret = 0;
 
  cleanup:
diff --git a/src/util/virhostdev.c b/src/util/virhostdev.c
index 2cd3f34..a7f04fe 100644
--- a/src/util/virhostdev.c
+++ b/src/util/virhostdev.c
@@ -905,6 +905,96 @@ virHostdevPreparePCIDevices(virHostdevManagerPtr mgr,
     return ret;
 }
 
+static bool
+virHostdevPCIDeviceUnbindableInternal(virHostdevManagerPtr mgr,
+                                      int iommu_group)
+{
+    struct virHostdevIsPCINodeDeviceUsedData data = { mgr, NULL, true };
+
+    if (virPCIIOMMUGroupIterate(iommu_group,
+                                virHostdevIsPCINodeDeviceUsed,
+                                &data) < 0) {
+        VIR_DEBUG("IOMMU group %d is not unbindable", iommu_group);
+        return false;
+    }
+
+    VIR_DEBUG("IOMMU group %d is unbindable", iommu_group);
+    return true;
+}
+
+/*
+ * Check if devices within IOMMU group are in use by any domains
+ */
+bool
+virHostdevPCIDeviceGroupUnbindable(virHostdevManagerPtr mgr,
+                                   int iommu_group)
+{
+    bool result;
+
+    virObjectLock(mgr->activePCIHostdevs);
+    result = virHostdevPCIDeviceUnbindableInternal(mgr, iommu_group);
+    virObjectUnlock(mgr->activePCIHostdevs);
+
+    return result;
+}
+
+/*
+ * Confirm all devices in IOMMU group are in inactiveList
+ * before attempting to reattach to host driver. Devices in IOMMU
+ * group that aren't in either activeList or inactiveList are considered
+ * outside our control, so we treat them as inactive as well.
+ *
+ * Callers can check virHostdevPCIDeviceGroupUnbindable() beforehand
+ * for some indication that the group is ready for reattach to the
+ * host, but since it's possible for a hostdev from the group to get
+ * re-attached to a guest prior to subsequently calling this function
+ * there is no guarantee of this, which should be fine since it would
+ * only be immediately rebound to the stub driver anyway.
+ */
+void
+virHostdevPCIDeviceGroupUnbind(virHostdevManagerPtr mgr,
+                               int iommu_group)
+{
+    virPCIDeviceListPtr pcidevs = NULL;
+    size_t i;
+
+    virObjectLock(mgr->activePCIHostdevs);
+    virObjectLock(mgr->inactivePCIHostdevs);
+
+    if (!virHostdevPCIDeviceUnbindableInternal(mgr, iommu_group)) {
+        VIR_DEBUG("IOMMU group %d still in use, deferring reattach "
+                  "of PCI devices to host", iommu_group);
+        goto cleanup;
+    }
+
+    pcidevs = virPCIGetIOMMUGroupList(iommu_group);
+    for (i = 0; i < virPCIDeviceListCount(pcidevs); i++) {
+        virPCIDevicePtr actual, pci = virPCIDeviceListGet(pcidevs, i);
+        virPCIDeviceAddressPtr devAddr = virPCIDeviceGetAddress(pci);
+
+        actual = virPCIDeviceListFindByIDs(mgr->inactivePCIHostdevs,
+                                           devAddr->domain,
+                                           devAddr->bus,
+                                           devAddr->slot,
+                                           devAddr->function);
+        if (actual) {
+            VIR_DEBUG("Reattaching PCI device %s", virPCIDeviceGetName(actual));
+            if (virPCIDeviceGetManaged(actual))
+                if (virPCIDeviceReattach(actual, mgr->activePCIHostdevs,
+                                         mgr->inactivePCIHostdevs) < 0) {
+                    VIR_ERROR(_("Failed to re-attach PCI device: %s"),
+                              virGetLastErrorMessage());
+                    virResetLastError();
+                }
+        }
+    }
+
+ cleanup:
+    virObjectUnref(pcidevs);
+    virObjectUnlock(mgr->activePCIHostdevs);
+    virObjectUnlock(mgr->inactivePCIHostdevs);
+}
+
 /*
  * Pre-condition: inactivePCIHostdevs & activePCIHostdevs
  * are locked
diff --git a/src/util/virhostdev.h b/src/util/virhostdev.h
index fbc7fbd..2ab8101 100644
--- a/src/util/virhostdev.h
+++ b/src/util/virhostdev.h
@@ -122,6 +122,14 @@ virHostdevReleasePCIDevices(virHostdevManagerPtr mgr,
                             const char *oldStateDir)
     ATTRIBUTE_NONNULL(1);
 void
+virHostdevPCIDeviceGroupUnbind(virHostdevManagerPtr mgr,
+                               int iommu_group)
+    ATTRIBUTE_NONNULL(1);
+bool
+virHostdevPCIDeviceGroupUnbindable(virHostdevManagerPtr mgr,
+                                   int iommu_group)
+    ATTRIBUTE_NONNULL(1);
+void
 virHostdevReAttachUSBDevices(virHostdevManagerPtr hostdev_mgr,
                               const char *drv_name,
                               const char *dom_name,
diff --git a/src/util/virpci.c b/src/util/virpci.c
index b842f44..a8e5190 100644
--- a/src/util/virpci.c
+++ b/src/util/virpci.c
@@ -2298,6 +2298,33 @@ virPCIDeviceGetIOMMUGroupList(virPCIDevicePtr dev)
 }
 
 
+/*
+ * virPCIGetIOMMUGroupList - return a virPCIDeviceList containing
+ * all of the devices in @iommu_group.
+ *
+ * Return the new list, or NULL on failure
+ */
+virPCIDeviceListPtr
+virPCIGetIOMMUGroupList(int iommu_group)
+{
+    virPCIDeviceListPtr groupList = virPCIDeviceListNew();
+
+    if (!groupList)
+        goto error;
+
+    if (virPCIIOMMUGroupIterate(iommu_group,
+                                virPCIDeviceGetIOMMUGroupAddOne,
+                                groupList) < 0)
+        goto error;
+
+    return groupList;
+
+ error:
+    virObjectUnref(groupList);
+    return NULL;
+}
+
+
 typedef struct {
     virPCIDeviceAddressPtr **iommuGroupDevices;
     size_t *nIommuGroupDevices;
diff --git a/src/util/virpci.h b/src/util/virpci.h
index 5ec1306..5bcacb2 100644
--- a/src/util/virpci.h
+++ b/src/util/virpci.h
@@ -180,6 +180,7 @@ int virPCIIOMMUGroupIterate(int iommu_group,
                             virPCIDeviceAddressActor actor,
                             void *opaque);
 virPCIDeviceListPtr virPCIDeviceGetIOMMUGroupList(virPCIDevicePtr dev);
+virPCIDeviceListPtr virPCIGetIOMMUGroupList(int iommu_group);
 int virPCIDeviceAddressGetIOMMUGroupAddresses(virPCIDeviceAddressPtr devAddr,
                                               virPCIDeviceAddressPtr **iommuGroupDevices,
                                               size_t *nIommuGroupDevices);
-- 
2.7.4

  parent reply	other threads:[~2017-06-29  0:26 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-06-29  0:24 [Qemu-devel] [RFC PATCH 0/5] hotplug: fix premature rebinding of VFIO devices to host Michael Roth
2017-06-29  0:24 ` [Qemu-devel] [RFC PATCH 1/5] virhostdev: factor release out from reattach and export it for use later Michael Roth
2017-06-29  0:24 ` [Qemu-devel] [RFC PATCH 2/5] qemu_hotplug: squash qemuDomainRemovePCIHostDevice into caller Michael Roth
2017-06-29  0:24 ` [Qemu-devel] [RFC PATCH 3/5] virpci: introduce virPCIIOMMUGroupIterate() Michael Roth
2017-06-29  0:24 ` Michael Roth [this message]
2017-06-29  0:25 ` [Qemu-devel] [RFC PATCH 5/5] qemu: hotplug: wait for VFIO group FD close before unbind Michael Roth
2017-06-29  8:33 ` [Qemu-devel] [RFC PATCH 0/5] hotplug: fix premature rebinding of VFIO devices to host Daniel P. Berrange
2017-06-29 18:22   ` Michael Roth
2017-06-29 19:31     ` Alex Williamson
2017-06-29 19:28   ` Alex Williamson
2017-06-29 20:21     ` Michael Roth
2017-06-29 18:50 ` Laine Stump
2017-06-29 19:44   ` Alex Williamson
2017-06-30  2:27     ` Laine Stump
2017-06-29 20:59   ` Michael Roth
2017-06-30  6:59     ` Andrea Bolognani

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1498695900-1648-5-git-send-email-mdroth@linux.vnet.ibm.com \
    --to=mdroth@linux.vnet.ibm.com \
    --cc=abologna@redhat.com \
    --cc=aik@ozlabs.ru \
    --cc=alex.williamson@redhat.com \
    --cc=berrange@redhat.com \
    --cc=laine@laine.org \
    --cc=libvir-list@redhat.com \
    --cc=mprivozn@redhat.com \
    --cc=pkrempa@redhat.com \
    --cc=qemu-devel@nongnu.org \
    --cc=qemu-ppc@nongnu.org \
    --cc=sbhat@linux.vnet.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).