All of lore.kernel.org
 help / color / mirror / Atom feed
From: Jason Gunthorpe <jgg@nvidia.com>
To: Matthew Rosato <mjrosato@linux.ibm.com>
Cc: Robin Murphy <robin.murphy@arm.com>,
	iommu@lists.linux.dev,
	Alex Williamson <alex.williamson@redhat.com>,
	linux-s390@vger.kernel.org, schnelle@linux.ibm.com,
	pmorel@linux.ibm.com, borntraeger@linux.ibm.com,
	hca@linux.ibm.com, gor@linux.ibm.com,
	gerald.schaefer@linux.ibm.com, agordeev@linux.ibm.com,
	svens@linux.ibm.com, joro@8bytes.org, will@kernel.org,
	linux-kernel@vger.kernel.org
Subject: Re: [PATCH v4 1/2] iommu/s390: Fix race with release_device ops
Date: Thu, 1 Sep 2022 17:37:21 -0300	[thread overview]
Message-ID: <YxEYAcFK0EdahXzJ@nvidia.com> (raw)
In-Reply-To: <369ad331-8bdc-d385-a227-f674bd410599@linux.ibm.com>

On Thu, Sep 01, 2022 at 12:14:24PM -0400, Matthew Rosato wrote:
> On 9/1/22 6:25 AM, Robin Murphy wrote:
> > On 2022-08-31 21:12, Matthew Rosato wrote:
> >> With commit fa7e9ecc5e1c ("iommu/s390: Tolerate repeat attach_dev
> >> calls") s390-iommu is supposed to handle dynamic switching between IOMMU
> >> domains and the DMA API handling.  However, this commit does not
> >> sufficiently handle the case where the device is released via a call
> >> to the release_device op as it may occur at the same time as an opposing
> >> attach_dev or detach_dev since the group mutex is not held over
> >> release_device.  This was observed if the device is deconfigured during a
> >> small window during vfio-pci initialization and can result in WARNs and
> >> potential kernel panics.
> > 
> > Hmm, the more I think about it, something doesn't sit right about this whole situation... release_device is called via the notifier from device_del() after the device has been removed from its parent bus and largely dismantled; it should definitely not still have a driver bound by that point, so how is VFIO doing things that manage to race at all?
> > 
> > Robin.
> 
> So, I generally have seen the issue manifest as one of the calls
> into the iommu core from __vfio_group_unset_container
> (e.g. iommu_deatch_group via vfio_type1_iommu) failing with a WARN.
> This happens when the vfio group fd is released, which could be
> coming e.g. from a userspace ioctl VFIO_GROUP_UNSET_CONTAINER.
> AFAICT there's nothing serializing the notion of calling into the
> iommu core here against a device that is simultaneously going
> through release_device (because we don't enter release_device with
> the group mutex held), resulting in unpredictable behavior between
> the dueling attach_dev/detach_dev and the release_device for
> s390-iommu at least.

Oh, this is a vfio bug.

dev->iommu_group is only a valid pointer as long as a driver is attach
to the device.

vfio copies the dev->iommu_group into struct vfio_group during probe()
but then lets vfio_group live independently. Particularly the driver
can be removed()'d and the vfio_group keeps on going.

Thus it is possible to UAF the iommu_group by referencing it through
the vfio_group.

We must wait during remove for all the vfio_groups to stop
referencing iommu_group.

Something like this or so:

diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c
index eb714a484662fc..d8f40b22c98ddb 100644
--- a/drivers/vfio/vfio_main.c
+++ b/drivers/vfio/vfio_main.c
@@ -65,7 +65,15 @@ struct vfio_container {
 struct vfio_group {
 	struct device 			dev;
 	struct cdev			cdev;
+	/*
+	 * When drivers is non-zero a driver is attached to the struct device
+	 * that provided the iommu_group and thus the iommu_group is a valid
+	 * pointer. When drivers is 0 the driver is being detached. Once users
+	 * reaches 0 then the iommu_group is invalid.
+	 */
+	refcount_t			drivers;
 	refcount_t			users;
+	struct completion		comp;
 	unsigned int			container_users;
 	struct iommu_group		*iommu_group;
 	struct vfio_container		*container;
@@ -276,8 +284,6 @@ void vfio_unregister_iommu_driver(const struct vfio_iommu_driver_ops *ops)
 }
 EXPORT_SYMBOL_GPL(vfio_unregister_iommu_driver);
 
-static void vfio_group_get(struct vfio_group *group);
-
 /*
  * Container objects - containers are created when /dev/vfio/vfio is
  * opened, but their lifecycle extends until the last user is done, so
@@ -305,16 +311,21 @@ static void vfio_container_put(struct vfio_container *container)
 /*
  * Group objects - create, release, get, put, search
  */
+
+ /*
+  * This returns a driver reference. It can only be used in the probe function
+  * of a device_driver, eg as part of the internal implementation of
+  * __vfio_register_dev().
+  */
 static struct vfio_group *
 __vfio_group_get_from_iommu(struct iommu_group *iommu_group)
 {
 	struct vfio_group *group;
 
 	list_for_each_entry(group, &vfio.group_list, vfio_next) {
-		if (group->iommu_group == iommu_group) {
-			vfio_group_get(group);
+		if (group->iommu_group == iommu_group &&
+		    refcount_inc_not_zero(&group->drivers))
 			return group;
-		}
 	}
 	return NULL;
 }
@@ -364,6 +375,8 @@ static struct vfio_group *vfio_group_alloc(struct iommu_group *iommu_group,
 	group->cdev.owner = THIS_MODULE;
 
 	refcount_set(&group->users, 1);
+	refcount_set(&group->drivers, 1);
+	init_completion(&group->comp);
 	init_rwsem(&group->group_rwsem);
 	INIT_LIST_HEAD(&group->device_list);
 	mutex_init(&group->device_lock);
@@ -422,8 +435,28 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group,
 
 static void vfio_group_put(struct vfio_group *group)
 {
-	if (!refcount_dec_and_mutex_lock(&group->users, &vfio.group_lock))
-		return;
+	if (refcount_dec_and_test(&group->users))
+		complete(&group->comp);
+}
+
+/*
+ * When the drivers count reaches 0 then the group must be destroyed
+ * immediately. A zero driver group is a zombie awaiting destruction.
+ */
+static void vfio_group_remove(struct vfio_group *group)
+{
+	/* Matches the get from vfio_group_alloc() */
+	vfio_group_put(group);
+
+	cdev_device_del(&group->cdev, &group->dev);
+
+	/*
+	 * Before we allow the last driver in the group to be unplugged the
+	 * group must be sanitized so nothing else is or can reference it. This
+	 * is because the group->iommu_group pointer is only valid so long as a
+	 * VFIO device driver is attached to a device belonging to the group.
+	 */
+	wait_for_completion(&group->comp);
 
 	/*
 	 * These data structures all have paired operations that can only be
@@ -434,19 +467,15 @@ static void vfio_group_put(struct vfio_group *group)
 	WARN_ON(!list_empty(&group->device_list));
 	WARN_ON(group->container || group->container_users);
 	WARN_ON(group->notifier.head);
+	group->iommu_group = NULL;
 
+	mutex_lock(&vfio.group_lock);
 	list_del(&group->vfio_next);
-	cdev_device_del(&group->cdev, &group->dev);
 	mutex_unlock(&vfio.group_lock);
 
 	put_device(&group->dev);
 }
 
-static void vfio_group_get(struct vfio_group *group)
-{
-	refcount_inc(&group->users);
-}
-
 /*
  * Device objects - create, release, get, put, search
  */
@@ -462,22 +491,6 @@ static bool vfio_device_try_get(struct vfio_device *device)
 	return refcount_inc_not_zero(&device->refcount);
 }
 
-static struct vfio_device *vfio_group_get_device(struct vfio_group *group,
-						 struct device *dev)
-{
-	struct vfio_device *device;
-
-	mutex_lock(&group->device_lock);
-	list_for_each_entry(device, &group->device_list, group_next) {
-		if (device->dev == dev && vfio_device_try_get(device)) {
-			mutex_unlock(&group->device_lock);
-			return device;
-		}
-	}
-	mutex_unlock(&group->device_lock);
-	return NULL;
-}
-
 /*
  * VFIO driver API
  */
@@ -576,8 +589,10 @@ static struct vfio_group *vfio_group_find_or_alloc(struct device *dev)
 static int __vfio_register_dev(struct vfio_device *device,
 		struct vfio_group *group)
 {
-	struct vfio_device *existing_device;
-
+	/*
+	 * In all cases group is the output of one of the group allocation functions
+	 * and we have group->drivers incremetned for us
+	 */
 	if (IS_ERR(group))
 		return PTR_ERR(group);
 
@@ -588,18 +603,6 @@ static int __vfio_register_dev(struct vfio_device *device,
 	if (!device->dev_set)
 		vfio_assign_device_set(device, device);
 
-	existing_device = vfio_group_get_device(group, device->dev);
-	if (existing_device) {
-		dev_WARN(device->dev, "Device already exists on group %d\n",
-			 iommu_group_id(group->iommu_group));
-		vfio_device_put(existing_device);
-		if (group->type == VFIO_NO_IOMMU ||
-		    group->type == VFIO_EMULATED_IOMMU)
-			iommu_group_remove_device(device->dev);
-		vfio_group_put(group);
-		return -EBUSY;
-	}
-
 	/* Our reference on group is moved to the device */
 	device->group = group;
 
@@ -702,8 +705,9 @@ void vfio_unregister_group_dev(struct vfio_device *device)
 	if (group->type == VFIO_NO_IOMMU || group->type == VFIO_EMULATED_IOMMU)
 		iommu_group_remove_device(device->dev);
 
-	/* Matches the get in vfio_register_group_dev() */
-	vfio_group_put(group);
+	/* Matches the alloc get in vfio_register_group_dev() */
+	if (refcount_dec_and_test(&group->drivers))
+		vfio_group_remove(group);
 }
 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
 

  reply	other threads:[~2022-09-01 20:37 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-08-31 20:12 [PATCH v4 0/2] iommu/s390: fixes related to repeat attach_dev calls Matthew Rosato
2022-08-31 20:12 ` [PATCH v4 1/2] iommu/s390: Fix race with release_device ops Matthew Rosato
2022-09-01  7:56   ` Pierre Morel
2022-09-01  9:37     ` Niklas Schnelle
2022-09-01 11:01       ` Robin Murphy
2022-09-01 13:42         ` Niklas Schnelle
2022-09-01 14:17           ` Niklas Schnelle
2022-09-01 14:29           ` Robin Murphy
2022-09-01 14:34             ` Jason Gunthorpe
2022-09-01 15:03               ` Robin Murphy
2022-09-01 15:49                 ` Jason Gunthorpe
2022-09-01 17:00                   ` Robin Murphy
2022-09-01 20:28       ` Matthew Rosato
2022-09-02  7:49         ` Niklas Schnelle
2022-09-01 10:25   ` Robin Murphy
2022-09-01 16:14     ` Matthew Rosato
2022-09-01 20:37       ` Jason Gunthorpe [this message]
2022-09-02 17:11         ` Matthew Rosato
2022-09-02 17:21           ` Jason Gunthorpe
2022-09-02 18:20             ` Matthew Rosato
2022-09-05  9:46             ` Robin Murphy
2022-09-06 13:36               ` Jason Gunthorpe
2022-09-02 10:48       ` Robin Murphy
2022-08-31 20:12 ` [PATCH v4 2/2] iommu/s390: fix leak of s390_domain_device Matthew Rosato

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=YxEYAcFK0EdahXzJ@nvidia.com \
    --to=jgg@nvidia.com \
    --cc=agordeev@linux.ibm.com \
    --cc=alex.williamson@redhat.com \
    --cc=borntraeger@linux.ibm.com \
    --cc=gerald.schaefer@linux.ibm.com \
    --cc=gor@linux.ibm.com \
    --cc=hca@linux.ibm.com \
    --cc=iommu@lists.linux.dev \
    --cc=joro@8bytes.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-s390@vger.kernel.org \
    --cc=mjrosato@linux.ibm.com \
    --cc=pmorel@linux.ibm.com \
    --cc=robin.murphy@arm.com \
    --cc=schnelle@linux.ibm.com \
    --cc=svens@linux.ibm.com \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.