Linux CXL
 help / color / mirror / Atom feed
* [PATCH v2] cxl/region: Fix a race bug in delete_region_store
@ 2026-04-22  4:56 Sungwoo Kim
  2026-04-22 15:10 ` Dave Jiang
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Sungwoo Kim @ 2026-04-22  4:56 UTC (permalink / raw)
  To: Davidlohr Bueso, Jonathan Cameron, Dave Jiang, Alison Schofield,
	Vishal Verma, Ira Weiny, Dan Williams, Ben Widawsky
  Cc: Dave Tian, Sungwoo Kim, Dan Williams, Jonathan Cameron, linux-cxl,
	linux-kernel

devm_release_action() cannot find a matching entry in the following two
race scenarios.

scenario 1: delete two same regions concurrently

CPU 0                     CPU 1
======                    ======
delete_region_store()
  cxlr = cxl_find_region_by_name()
                          delete_region_store()
                            cxlr = cxl_find_region_by_name()
  devm_release_action()
                            devm_release_action()
                            // cannot find the action, WARN_ON()

scenario 2: delete parent and child concurrently [1]

CPU0                                    CPU1
devres_release_all()
   // take devres_lock
   remove_nodes(devres_head) // mv to local todo
   // drop devres_lock                   delete_region_store()
                                           cxlr = cxl_find_region_by_name()  // success
                                           devm_release_action(unregister_region)
                                             devres_release()
                                               devres_remove()
                                                 // hold devres_lock
                                                 find_dr(devres_head) // does not find it
                                             WARN_ON(-ENOENT)
   release_nodes() // drain todo
     unregister_region(cxlr) // release() cb
       device_del()

To fix scenario 1, delete_region_store() directly calls
unregister_region() with a test_and_set_bit(CXL_REGION_F_UNREGISTER).
Also, replace devm_release_action() to devm_remove_action() as
unregister_region() is now called directly.

To fix scenario 2, delete_region_store() removes actions only if the
driver is still attached. To ensure this, scoped_guard(device,
port->uport_dev) is required to check port->uport_dev->driver.
To hold this lock, a workqueue is required for a clean context.

Splat:

WARNING: drivers/base/devres.c:824 at devm_release_action drivers/base/devres.c:824 [inline], CPU#0: syz.1.12224/47589
WARNING: drivers/base/devres.c:824 at devm_release_action+0x2b2/0x360 drivers/base/devres.c:817, CPU#0: syz.1.12224/47589

[1] https://lore.kernel.org/linux-cxl/20260310183644.4rwc7ilmzy4t5xp6@offworld/

Fixes: 779dd20cfb56 ("cxl/region: Add region creation support")
Suggested-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Sungwoo Kim <iam@sung-woo.kim>
---
V1: https://lore.kernel.org/linux-cxl/20260308185958.2453707-2-iam@sung-woo.kim/
V1->V2:
- Made devm_remove_action() asynchronous.
- Made unregister_region() idempotent.
- Addressed Dan's comments and added the suggested-by tag.

 drivers/cxl/core/region.c | 45 ++++++++++++++++++++++++++++++++++++---
 drivers/cxl/cxl.h         |  8 +++++++
 2 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
index e50dc716d4e8..64db0d332c13 100644
--- a/drivers/cxl/core/region.c
+++ b/drivers/cxl/core/region.c
@@ -39,6 +39,7 @@
 static nodemask_t nodemask_region_seen = NODE_MASK_NONE;
 
 static struct cxl_region *to_cxl_region(struct device *dev);
+static void remove_devm_actions_work(struct work_struct *work);
 
 #define __ACCESS_ATTR_RO(_level, _name) {				\
 	.attr	= { .name = __stringify(_name), .mode = 0444 },		\
@@ -2543,6 +2544,9 @@ static void unregister_region(void *_cxlr)
 	struct cxl_region_params *p = &cxlr->params;
 	int i;
 
+	if (test_and_set_bit(CXL_REGION_F_UNREGISTER, &cxlr->flags))
+		return;
+
 	device_del(&cxlr->dev);
 
 	/*
@@ -2589,6 +2593,8 @@ static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int i
 	dev->type = &cxl_region_type;
 	cxl_region_setup_flags(cxlr, &cxlrd->cxlsd.cxld);
 
+	INIT_WORK(&cxlr->remove_work, remove_devm_actions_work);
+
 	return cxlr;
 }
 
@@ -2831,20 +2837,53 @@ cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name)
 	return to_cxl_region(region_dev);
 }
 
+static void remove_devm_actions_work(struct work_struct *work)
+{
+	struct cxl_region *cxlr = container_of(work, typeof(*cxlr), remove_work);
+	struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
+	struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
+
+	if (test_and_set_bit(CXL_REGION_F_DEVM_REMOVE, &cxlr->flags)) {
+		put_device(&cxlr->dev);
+		return;
+	}
+
+	scoped_guard(device, port->uport_dev) {
+		if (port->uport_dev->driver)
+			devm_remove_action(port->uport_dev, unregister_region, cxlr);
+	}
+
+	put_device(&cxlr->dev);
+}
+
+static int remove_devm_actions(struct cxl_region *cxlr)
+{
+	if (!schedule_work(&cxlr->remove_work))
+		return -EBUSY;
+
+	return 0;
+}
+
 static ssize_t delete_region_store(struct device *dev,
 				   struct device_attribute *attr,
 				   const char *buf, size_t len)
 {
 	struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
-	struct cxl_port *port = to_cxl_port(dev->parent);
 	struct cxl_region *cxlr;
+	int rc;
 
+	/* remove_devm_actions_work() will put cxlr->dev. */
 	cxlr = cxl_find_region_by_name(cxlrd, buf);
 	if (IS_ERR(cxlr))
 		return PTR_ERR(cxlr);
 
-	devm_release_action(port->uport_dev, unregister_region, cxlr);
-	put_device(&cxlr->dev);
+	unregister_region(cxlr);
+
+	rc = remove_devm_actions(cxlr);
+	if (rc) {
+		put_device(&cxlr->dev);
+		return rc;
+	}
 
 	return len;
 }
diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
index 1297594beaec..75ec292a9f42 100644
--- a/drivers/cxl/cxl.h
+++ b/drivers/cxl/cxl.h
@@ -447,6 +447,12 @@ struct cxl_region_params {
  */
 #define CXL_REGION_F_NORMALIZED_ADDRESSING 3
 
+/* Indicate that this region is being unregistered to prevent a race. */
+#define CXL_REGION_F_UNREGISTER 4
+
+/* Indicate that this region called devm_remove_action. */
+#define CXL_REGION_F_DEVM_REMOVE 5
+
 /**
  * struct cxl_region - CXL region
  * @dev: This region's device
@@ -462,6 +468,7 @@ struct cxl_region_params {
  * @coord: QoS access coordinates for the region
  * @node_notifier: notifier for setting the access coordinates to node
  * @adist_notifier: notifier for calculating the abstract distance of node
+ * @remove_work: trigger the remove action in a safe context to acquire locks
  */
 struct cxl_region {
 	struct device dev;
@@ -477,6 +484,7 @@ struct cxl_region {
 	struct access_coordinate coord[ACCESS_COORDINATE_MAX];
 	struct notifier_block node_notifier;
 	struct notifier_block adist_notifier;
+	struct work_struct remove_work;
 };
 
 struct cxl_nvdimm_bridge {
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2026-04-23  2:53 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-04-22  4:56 [PATCH v2] cxl/region: Fix a race bug in delete_region_store Sungwoo Kim
2026-04-22 15:10 ` Dave Jiang
2026-04-22 23:13   ` Sungwoo Kim
2026-04-22 23:19     ` Dave Jiang
2026-04-23  0:19 ` Dave Jiang
2026-04-23  2:52 ` Sungwoo Kim

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox