* Re: [PATCH v2] cxl/region: Fix a race bug in delete_region_store
2026-04-22 4:56 [PATCH v2] cxl/region: Fix a race bug in delete_region_store Sungwoo Kim
@ 2026-04-22 15:10 ` Dave Jiang
2026-04-22 23:13 ` Sungwoo Kim
2026-04-23 0:19 ` Dave Jiang
2026-04-23 2:52 ` Sungwoo Kim
2 siblings, 1 reply; 6+ messages in thread
From: Dave Jiang @ 2026-04-22 15:10 UTC (permalink / raw)
To: Sungwoo Kim, Davidlohr Bueso, Jonathan Cameron, Alison Schofield,
Vishal Verma, Ira Weiny, Dan Williams, Ben Widawsky
Cc: Dave Tian, Dan Williams, Jonathan Cameron, linux-cxl,
linux-kernel
On 4/21/26 9:56 PM, Sungwoo Kim wrote:
> devm_release_action() cannot find a matching entry in the following two
> race scenarios.
>
> scenario 1: delete two same regions concurrently
>
> CPU 0 CPU 1
> ====== ======
> delete_region_store()
> cxlr = cxl_find_region_by_name()
> delete_region_store()
> cxlr = cxl_find_region_by_name()
> devm_release_action()
> devm_release_action()
> // cannot find the action, WARN_ON()
>
> scenario 2: delete parent and child concurrently [1]
>
> CPU0 CPU1
> devres_release_all()
> // take devres_lock
> remove_nodes(devres_head) // mv to local todo
> // drop devres_lock delete_region_store()
> cxlr = cxl_find_region_by_name() // success
> devm_release_action(unregister_region)
> devres_release()
> devres_remove()
> // hold devres_lock
> find_dr(devres_head) // does not find it
> WARN_ON(-ENOENT)
> release_nodes() // drain todo
> unregister_region(cxlr) // release() cb
> device_del()
>
> To fix scenario 1, delete_region_store() directly calls
> unregister_region() with a test_and_set_bit(CXL_REGION_F_UNREGISTER).
> Also, replace devm_release_action() to devm_remove_action() as
> unregister_region() is now called directly.
>
> To fix scenario 2, delete_region_store() removes actions only if the
> driver is still attached. To ensure this, scoped_guard(device,
> port->uport_dev) is required to check port->uport_dev->driver.
> To hold this lock, a workqueue is required for a clean context.
Thank you Sungwoo for the fix. Given that this deals with 2 different scenarios, can this be split into 2 patches each address a scenario uniquely?
For scenario 2, can you please put a bit more context in its commit log to explain why a workqueue is needed from your discussion with Dan for future readers?
Also for the commit log please add context on how these issues were found and what impact was observed and how that effects a user. This information necessary for the Fixes tag dispatching upstream.
Thanks!
DJ
>
> Splat:
>
> WARNING: drivers/base/devres.c:824 at devm_release_action drivers/base/devres.c:824 [inline], CPU#0: syz.1.12224/47589
> WARNING: drivers/base/devres.c:824 at devm_release_action+0x2b2/0x360 drivers/base/devres.c:817, CPU#0: syz.1.12224/47589
>
> [1] https://lore.kernel.org/linux-cxl/20260310183644.4rwc7ilmzy4t5xp6@offworld/
>
> Fixes: 779dd20cfb56 ("cxl/region: Add region creation support")
> Suggested-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Sungwoo Kim <iam@sung-woo.kim>
> ---
> V1: https://lore.kernel.org/linux-cxl/20260308185958.2453707-2-iam@sung-woo.kim/
> V1->V2:
> - Made devm_remove_action() asynchronous.
> - Made unregister_region() idempotent.
> - Addressed Dan's comments and added the suggested-by tag.
>
> drivers/cxl/core/region.c | 45 ++++++++++++++++++++++++++++++++++++---
> drivers/cxl/cxl.h | 8 +++++++
> 2 files changed, 50 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index e50dc716d4e8..64db0d332c13 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -39,6 +39,7 @@
> static nodemask_t nodemask_region_seen = NODE_MASK_NONE;
>
> static struct cxl_region *to_cxl_region(struct device *dev);
> +static void remove_devm_actions_work(struct work_struct *work);
>
> #define __ACCESS_ATTR_RO(_level, _name) { \
> .attr = { .name = __stringify(_name), .mode = 0444 }, \
> @@ -2543,6 +2544,9 @@ static void unregister_region(void *_cxlr)
> struct cxl_region_params *p = &cxlr->params;
> int i;
>
> + if (test_and_set_bit(CXL_REGION_F_UNREGISTER, &cxlr->flags))
> + return;
> +
> device_del(&cxlr->dev);
>
> /*
> @@ -2589,6 +2593,8 @@ static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int i
> dev->type = &cxl_region_type;
> cxl_region_setup_flags(cxlr, &cxlrd->cxlsd.cxld);
>
> + INIT_WORK(&cxlr->remove_work, remove_devm_actions_work);
> +
> return cxlr;
> }
>
> @@ -2831,20 +2837,53 @@ cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name)
> return to_cxl_region(region_dev);
> }
>
> +static void remove_devm_actions_work(struct work_struct *work)
> +{
> + struct cxl_region *cxlr = container_of(work, typeof(*cxlr), remove_work);
> + struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
> + struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
> +
> + if (test_and_set_bit(CXL_REGION_F_DEVM_REMOVE, &cxlr->flags)) {
> + put_device(&cxlr->dev);
> + return;
> + }
> +
> + scoped_guard(device, port->uport_dev) {
> + if (port->uport_dev->driver)
> + devm_remove_action(port->uport_dev, unregister_region, cxlr);
> + }
> +
> + put_device(&cxlr->dev);
> +}
> +
> +static int remove_devm_actions(struct cxl_region *cxlr)
> +{
> + if (!schedule_work(&cxlr->remove_work))
> + return -EBUSY;
> +
> + return 0;
> +}
> +
> static ssize_t delete_region_store(struct device *dev,
> struct device_attribute *attr,
> const char *buf, size_t len)
> {
> struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
> - struct cxl_port *port = to_cxl_port(dev->parent);
> struct cxl_region *cxlr;
> + int rc;
>
> + /* remove_devm_actions_work() will put cxlr->dev. */
> cxlr = cxl_find_region_by_name(cxlrd, buf);
> if (IS_ERR(cxlr))
> return PTR_ERR(cxlr);
>
> - devm_release_action(port->uport_dev, unregister_region, cxlr);
> - put_device(&cxlr->dev);
> + unregister_region(cxlr);
> +
> + rc = remove_devm_actions(cxlr);
> + if (rc) {
> + put_device(&cxlr->dev);
> + return rc;
> + }
>
> return len;
> }
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 1297594beaec..75ec292a9f42 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -447,6 +447,12 @@ struct cxl_region_params {
> */
> #define CXL_REGION_F_NORMALIZED_ADDRESSING 3
>
> +/* Indicate that this region is being unregistered to prevent a race. */
> +#define CXL_REGION_F_UNREGISTER 4
> +
> +/* Indicate that this region called devm_remove_action. */
> +#define CXL_REGION_F_DEVM_REMOVE 5
> +
> /**
> * struct cxl_region - CXL region
> * @dev: This region's device
> @@ -462,6 +468,7 @@ struct cxl_region_params {
> * @coord: QoS access coordinates for the region
> * @node_notifier: notifier for setting the access coordinates to node
> * @adist_notifier: notifier for calculating the abstract distance of node
> + * @remove_work: trigger the remove action in a safe context to acquire locks
> */
> struct cxl_region {
> struct device dev;
> @@ -477,6 +484,7 @@ struct cxl_region {
> struct access_coordinate coord[ACCESS_COORDINATE_MAX];
> struct notifier_block node_notifier;
> struct notifier_block adist_notifier;
> + struct work_struct remove_work;
> };
>
> struct cxl_nvdimm_bridge {
^ permalink raw reply [flat|nested] 6+ messages in thread* Re: [PATCH v2] cxl/region: Fix a race bug in delete_region_store
2026-04-22 15:10 ` Dave Jiang
@ 2026-04-22 23:13 ` Sungwoo Kim
2026-04-22 23:19 ` Dave Jiang
0 siblings, 1 reply; 6+ messages in thread
From: Sungwoo Kim @ 2026-04-22 23:13 UTC (permalink / raw)
To: Dave Jiang
Cc: Davidlohr Bueso, Jonathan Cameron, Alison Schofield, Vishal Verma,
Ira Weiny, Dan Williams, Ben Widawsky, Dave Tian, Dan Williams,
Jonathan Cameron, linux-cxl, linux-kernel
On Wed, Apr 22, 2026 at 11:10 AM Dave Jiang <dave.jiang@intel.com> wrote:
>
>
>
> On 4/21/26 9:56 PM, Sungwoo Kim wrote:
> > devm_release_action() cannot find a matching entry in the following two
> > race scenarios.
> >
> > scenario 1: delete two same regions concurrently
> >
> > CPU 0 CPU 1
> > ====== ======
> > delete_region_store()
> > cxlr = cxl_find_region_by_name()
> > delete_region_store()
> > cxlr = cxl_find_region_by_name()
> > devm_release_action()
> > devm_release_action()
> > // cannot find the action, WARN_ON()
> >
> > scenario 2: delete parent and child concurrently [1]
> >
> > CPU0 CPU1
> > devres_release_all()
> > // take devres_lock
> > remove_nodes(devres_head) // mv to local todo
> > // drop devres_lock delete_region_store()
> > cxlr = cxl_find_region_by_name() // success
> > devm_release_action(unregister_region)
> > devres_release()
> > devres_remove()
> > // hold devres_lock
> > find_dr(devres_head) // does not find it
> > WARN_ON(-ENOENT)
> > release_nodes() // drain todo
> > unregister_region(cxlr) // release() cb
> > device_del()
> >
> > To fix scenario 1, delete_region_store() directly calls
> > unregister_region() with a test_and_set_bit(CXL_REGION_F_UNREGISTER).
> > Also, replace devm_release_action() to devm_remove_action() as
> > unregister_region() is now called directly.
> >
> > To fix scenario 2, delete_region_store() removes actions only if the
> > driver is still attached. To ensure this, scoped_guard(device,
> > port->uport_dev) is required to check port->uport_dev->driver.
> > To hold this lock, a workqueue is required for a clean context.
>
> Thank you Sungwoo for the fix. Given that this deals with 2 different scenarios, can this be split into 2 patches each address a scenario uniquely?
Sure thing. I think reforming it into a single patchset is better than
splitting it into two distinct patches, since they are highly
relevant.
>
> For scenario 2, can you please put a bit more context in its commit log to explain why a workqueue is needed from your discussion with Dan for future readers?
>
> Also for the commit log please add context on how these issues were found and what impact was observed and how that effects a user. This information necessary for the Fixes tag dispatching upstream.
Will do in v3. How do you think about impact? I guess the impact (the
warning in devm_release_action()) is low, since the region has already
been released. So, it does not affect users, except for a negligible
warning message.
Apart from the impact, this bug is a case study in cleaning up a
resource, especially when it can be cleaned up by both upper and lower
devices. Since CXL is hierarchical, existing and future CXL components
may have a similar issue with different impacts and need to adopt this
pattern to avoid the same race condition. I'm personally interested in
finding bugs in this direction :)
Thank you!
Sungwoo.
>
> Thanks!
>
> DJ
>
> >
> > Splat:
> >
> > WARNING: drivers/base/devres.c:824 at devm_release_action drivers/base/devres.c:824 [inline], CPU#0: syz.1.12224/47589
> > WARNING: drivers/base/devres.c:824 at devm_release_action+0x2b2/0x360 drivers/base/devres.c:817, CPU#0: syz.1.12224/47589
> >
> > [1] https://lore.kernel.org/linux-cxl/20260310183644.4rwc7ilmzy4t5xp6@offworld/
> >
> > Fixes: 779dd20cfb56 ("cxl/region: Add region creation support")
> > Suggested-by: Dan Williams <dan.j.williams@intel.com>
> > Signed-off-by: Sungwoo Kim <iam@sung-woo.kim>
> > ---
> > V1: https://lore.kernel.org/linux-cxl/20260308185958.2453707-2-iam@sung-woo.kim/
> > V1->V2:
> > - Made devm_remove_action() asynchronous.
> > - Made unregister_region() idempotent.
> > - Addressed Dan's comments and added the suggested-by tag.
> >
> > drivers/cxl/core/region.c | 45 ++++++++++++++++++++++++++++++++++++---
> > drivers/cxl/cxl.h | 8 +++++++
> > 2 files changed, 50 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> > index e50dc716d4e8..64db0d332c13 100644
> > --- a/drivers/cxl/core/region.c
> > +++ b/drivers/cxl/core/region.c
> > @@ -39,6 +39,7 @@
> > static nodemask_t nodemask_region_seen = NODE_MASK_NONE;
> >
> > static struct cxl_region *to_cxl_region(struct device *dev);
> > +static void remove_devm_actions_work(struct work_struct *work);
> >
> > #define __ACCESS_ATTR_RO(_level, _name) { \
> > .attr = { .name = __stringify(_name), .mode = 0444 }, \
> > @@ -2543,6 +2544,9 @@ static void unregister_region(void *_cxlr)
> > struct cxl_region_params *p = &cxlr->params;
> > int i;
> >
> > + if (test_and_set_bit(CXL_REGION_F_UNREGISTER, &cxlr->flags))
> > + return;
> > +
> > device_del(&cxlr->dev);
> >
> > /*
> > @@ -2589,6 +2593,8 @@ static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int i
> > dev->type = &cxl_region_type;
> > cxl_region_setup_flags(cxlr, &cxlrd->cxlsd.cxld);
> >
> > + INIT_WORK(&cxlr->remove_work, remove_devm_actions_work);
> > +
> > return cxlr;
> > }
> >
> > @@ -2831,20 +2837,53 @@ cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name)
> > return to_cxl_region(region_dev);
> > }
> >
> > +static void remove_devm_actions_work(struct work_struct *work)
> > +{
> > + struct cxl_region *cxlr = container_of(work, typeof(*cxlr), remove_work);
> > + struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
> > + struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
> > +
> > + if (test_and_set_bit(CXL_REGION_F_DEVM_REMOVE, &cxlr->flags)) {
> > + put_device(&cxlr->dev);
> > + return;
> > + }
> > +
> > + scoped_guard(device, port->uport_dev) {
> > + if (port->uport_dev->driver)
> > + devm_remove_action(port->uport_dev, unregister_region, cxlr);
> > + }
> > +
> > + put_device(&cxlr->dev);
> > +}
> > +
> > +static int remove_devm_actions(struct cxl_region *cxlr)
> > +{
> > + if (!schedule_work(&cxlr->remove_work))
> > + return -EBUSY;
> > +
> > + return 0;
> > +}
> > +
> > static ssize_t delete_region_store(struct device *dev,
> > struct device_attribute *attr,
> > const char *buf, size_t len)
> > {
> > struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
> > - struct cxl_port *port = to_cxl_port(dev->parent);
> > struct cxl_region *cxlr;
> > + int rc;
> >
> > + /* remove_devm_actions_work() will put cxlr->dev. */
> > cxlr = cxl_find_region_by_name(cxlrd, buf);
> > if (IS_ERR(cxlr))
> > return PTR_ERR(cxlr);
> >
> > - devm_release_action(port->uport_dev, unregister_region, cxlr);
> > - put_device(&cxlr->dev);
> > + unregister_region(cxlr);
> > +
> > + rc = remove_devm_actions(cxlr);
> > + if (rc) {
> > + put_device(&cxlr->dev);
> > + return rc;
> > + }
> >
> > return len;
> > }
> > diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> > index 1297594beaec..75ec292a9f42 100644
> > --- a/drivers/cxl/cxl.h
> > +++ b/drivers/cxl/cxl.h
> > @@ -447,6 +447,12 @@ struct cxl_region_params {
> > */
> > #define CXL_REGION_F_NORMALIZED_ADDRESSING 3
> >
> > +/* Indicate that this region is being unregistered to prevent a race. */
> > +#define CXL_REGION_F_UNREGISTER 4
> > +
> > +/* Indicate that this region called devm_remove_action. */
> > +#define CXL_REGION_F_DEVM_REMOVE 5
> > +
> > /**
> > * struct cxl_region - CXL region
> > * @dev: This region's device
> > @@ -462,6 +468,7 @@ struct cxl_region_params {
> > * @coord: QoS access coordinates for the region
> > * @node_notifier: notifier for setting the access coordinates to node
> > * @adist_notifier: notifier for calculating the abstract distance of node
> > + * @remove_work: trigger the remove action in a safe context to acquire locks
> > */
> > struct cxl_region {
> > struct device dev;
> > @@ -477,6 +484,7 @@ struct cxl_region {
> > struct access_coordinate coord[ACCESS_COORDINATE_MAX];
> > struct notifier_block node_notifier;
> > struct notifier_block adist_notifier;
> > + struct work_struct remove_work;
> > };
> >
> > struct cxl_nvdimm_bridge {
>
>
^ permalink raw reply [flat|nested] 6+ messages in thread* Re: [PATCH v2] cxl/region: Fix a race bug in delete_region_store
2026-04-22 23:13 ` Sungwoo Kim
@ 2026-04-22 23:19 ` Dave Jiang
0 siblings, 0 replies; 6+ messages in thread
From: Dave Jiang @ 2026-04-22 23:19 UTC (permalink / raw)
To: Sungwoo Kim
Cc: Davidlohr Bueso, Jonathan Cameron, Alison Schofield, Vishal Verma,
Ira Weiny, Dan Williams, Ben Widawsky, Dave Tian, Dan Williams,
Jonathan Cameron, linux-cxl, linux-kernel
On 4/22/26 4:13 PM, Sungwoo Kim wrote:
> On Wed, Apr 22, 2026 at 11:10 AM Dave Jiang <dave.jiang@intel.com> wrote:
>>
>>
>>
>> On 4/21/26 9:56 PM, Sungwoo Kim wrote:
>>> devm_release_action() cannot find a matching entry in the following two
>>> race scenarios.
>>>
>>> scenario 1: delete two same regions concurrently
>>>
>>> CPU 0 CPU 1
>>> ====== ======
>>> delete_region_store()
>>> cxlr = cxl_find_region_by_name()
>>> delete_region_store()
>>> cxlr = cxl_find_region_by_name()
>>> devm_release_action()
>>> devm_release_action()
>>> // cannot find the action, WARN_ON()
>>>
>>> scenario 2: delete parent and child concurrently [1]
>>>
>>> CPU0 CPU1
>>> devres_release_all()
>>> // take devres_lock
>>> remove_nodes(devres_head) // mv to local todo
>>> // drop devres_lock delete_region_store()
>>> cxlr = cxl_find_region_by_name() // success
>>> devm_release_action(unregister_region)
>>> devres_release()
>>> devres_remove()
>>> // hold devres_lock
>>> find_dr(devres_head) // does not find it
>>> WARN_ON(-ENOENT)
>>> release_nodes() // drain todo
>>> unregister_region(cxlr) // release() cb
>>> device_del()
>>>
>>> To fix scenario 1, delete_region_store() directly calls
>>> unregister_region() with a test_and_set_bit(CXL_REGION_F_UNREGISTER).
>>> Also, replace devm_release_action() to devm_remove_action() as
>>> unregister_region() is now called directly.
>>>
>>> To fix scenario 2, delete_region_store() removes actions only if the
>>> driver is still attached. To ensure this, scoped_guard(device,
>>> port->uport_dev) is required to check port->uport_dev->driver.
>>> To hold this lock, a workqueue is required for a clean context.
>>
>> Thank you Sungwoo for the fix. Given that this deals with 2 different scenarios, can this be split into 2 patches each address a scenario uniquely?
>
> Sure thing. I think reforming it into a single patchset is better than
> splitting it into two distinct patches, since they are highly
> relevant.
>
Yes. A series of 2 patches would make sense.
>>
>> For scenario 2, can you please put a bit more context in its commit log to explain why a workqueue is needed from your discussion with Dan for future readers?
>>
>> Also for the commit log please add context on how these issues were found and what impact was observed and how that effects a user. This information necessary for the Fixes tag dispatching upstream.
>
> Will do in v3. How do you think about impact? I guess the impact (the
> warning in devm_release_action()) is low, since the region has already
> been released. So, it does not affect users, except for a negligible
> warning message.
If that is the case, then put the above explanation in the commit log. Impact low.
>
> Apart from the impact, this bug is a case study in cleaning up a
> resource, especially when it can be cleaned up by both upper and lower
> devices. Since CXL is hierarchical, existing and future CXL components
> may have a similar issue with different impacts and need to adopt this
> pattern to avoid the same race condition. I'm personally interested in
> finding bugs in this direction :)
>
> Thank you!
> Sungwoo.
>
>>
>> Thanks!
>>
>> DJ
>>
>>>
>>> Splat:
>>>
>>> WARNING: drivers/base/devres.c:824 at devm_release_action drivers/base/devres.c:824 [inline], CPU#0: syz.1.12224/47589
>>> WARNING: drivers/base/devres.c:824 at devm_release_action+0x2b2/0x360 drivers/base/devres.c:817, CPU#0: syz.1.12224/47589
>>>
>>> [1] https://lore.kernel.org/linux-cxl/20260310183644.4rwc7ilmzy4t5xp6@offworld/
>>>
>>> Fixes: 779dd20cfb56 ("cxl/region: Add region creation support")
>>> Suggested-by: Dan Williams <dan.j.williams@intel.com>
>>> Signed-off-by: Sungwoo Kim <iam@sung-woo.kim>
>>> ---
>>> V1: https://lore.kernel.org/linux-cxl/20260308185958.2453707-2-iam@sung-woo.kim/
>>> V1->V2:
>>> - Made devm_remove_action() asynchronous.
>>> - Made unregister_region() idempotent.
>>> - Addressed Dan's comments and added the suggested-by tag.
>>>
>>> drivers/cxl/core/region.c | 45 ++++++++++++++++++++++++++++++++++++---
>>> drivers/cxl/cxl.h | 8 +++++++
>>> 2 files changed, 50 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
>>> index e50dc716d4e8..64db0d332c13 100644
>>> --- a/drivers/cxl/core/region.c
>>> +++ b/drivers/cxl/core/region.c
>>> @@ -39,6 +39,7 @@
>>> static nodemask_t nodemask_region_seen = NODE_MASK_NONE;
>>>
>>> static struct cxl_region *to_cxl_region(struct device *dev);
>>> +static void remove_devm_actions_work(struct work_struct *work);
>>>
>>> #define __ACCESS_ATTR_RO(_level, _name) { \
>>> .attr = { .name = __stringify(_name), .mode = 0444 }, \
>>> @@ -2543,6 +2544,9 @@ static void unregister_region(void *_cxlr)
>>> struct cxl_region_params *p = &cxlr->params;
>>> int i;
>>>
>>> + if (test_and_set_bit(CXL_REGION_F_UNREGISTER, &cxlr->flags))
>>> + return;
>>> +
>>> device_del(&cxlr->dev);
>>>
>>> /*
>>> @@ -2589,6 +2593,8 @@ static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int i
>>> dev->type = &cxl_region_type;
>>> cxl_region_setup_flags(cxlr, &cxlrd->cxlsd.cxld);
>>>
>>> + INIT_WORK(&cxlr->remove_work, remove_devm_actions_work);
>>> +
>>> return cxlr;
>>> }
>>>
>>> @@ -2831,20 +2837,53 @@ cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name)
>>> return to_cxl_region(region_dev);
>>> }
>>>
>>> +static void remove_devm_actions_work(struct work_struct *work)
>>> +{
>>> + struct cxl_region *cxlr = container_of(work, typeof(*cxlr), remove_work);
>>> + struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
>>> + struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
>>> +
>>> + if (test_and_set_bit(CXL_REGION_F_DEVM_REMOVE, &cxlr->flags)) {
>>> + put_device(&cxlr->dev);
>>> + return;
>>> + }
>>> +
>>> + scoped_guard(device, port->uport_dev) {
>>> + if (port->uport_dev->driver)
>>> + devm_remove_action(port->uport_dev, unregister_region, cxlr);
>>> + }
>>> +
>>> + put_device(&cxlr->dev);
>>> +}
>>> +
>>> +static int remove_devm_actions(struct cxl_region *cxlr)
>>> +{
>>> + if (!schedule_work(&cxlr->remove_work))
>>> + return -EBUSY;
>>> +
>>> + return 0;
>>> +}
>>> +
>>> static ssize_t delete_region_store(struct device *dev,
>>> struct device_attribute *attr,
>>> const char *buf, size_t len)
>>> {
>>> struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
>>> - struct cxl_port *port = to_cxl_port(dev->parent);
>>> struct cxl_region *cxlr;
>>> + int rc;
>>>
>>> + /* remove_devm_actions_work() will put cxlr->dev. */
>>> cxlr = cxl_find_region_by_name(cxlrd, buf);
>>> if (IS_ERR(cxlr))
>>> return PTR_ERR(cxlr);
>>>
>>> - devm_release_action(port->uport_dev, unregister_region, cxlr);
>>> - put_device(&cxlr->dev);
>>> + unregister_region(cxlr);
>>> +
>>> + rc = remove_devm_actions(cxlr);
>>> + if (rc) {
>>> + put_device(&cxlr->dev);
>>> + return rc;
>>> + }
>>>
>>> return len;
>>> }
>>> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
>>> index 1297594beaec..75ec292a9f42 100644
>>> --- a/drivers/cxl/cxl.h
>>> +++ b/drivers/cxl/cxl.h
>>> @@ -447,6 +447,12 @@ struct cxl_region_params {
>>> */
>>> #define CXL_REGION_F_NORMALIZED_ADDRESSING 3
>>>
>>> +/* Indicate that this region is being unregistered to prevent a race. */
>>> +#define CXL_REGION_F_UNREGISTER 4
>>> +
>>> +/* Indicate that this region called devm_remove_action. */
>>> +#define CXL_REGION_F_DEVM_REMOVE 5
>>> +
>>> /**
>>> * struct cxl_region - CXL region
>>> * @dev: This region's device
>>> @@ -462,6 +468,7 @@ struct cxl_region_params {
>>> * @coord: QoS access coordinates for the region
>>> * @node_notifier: notifier for setting the access coordinates to node
>>> * @adist_notifier: notifier for calculating the abstract distance of node
>>> + * @remove_work: trigger the remove action in a safe context to acquire locks
>>> */
>>> struct cxl_region {
>>> struct device dev;
>>> @@ -477,6 +484,7 @@ struct cxl_region {
>>> struct access_coordinate coord[ACCESS_COORDINATE_MAX];
>>> struct notifier_block node_notifier;
>>> struct notifier_block adist_notifier;
>>> + struct work_struct remove_work;
>>> };
>>>
>>> struct cxl_nvdimm_bridge {
>>
>>
>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH v2] cxl/region: Fix a race bug in delete_region_store
2026-04-22 4:56 [PATCH v2] cxl/region: Fix a race bug in delete_region_store Sungwoo Kim
2026-04-22 15:10 ` Dave Jiang
@ 2026-04-23 0:19 ` Dave Jiang
2026-04-23 2:52 ` Sungwoo Kim
2 siblings, 0 replies; 6+ messages in thread
From: Dave Jiang @ 2026-04-23 0:19 UTC (permalink / raw)
To: Sungwoo Kim, Davidlohr Bueso, Jonathan Cameron, Alison Schofield,
Vishal Verma, Ira Weiny, Dan Williams, Ben Widawsky
Cc: Dave Tian, Dan Williams, Jonathan Cameron, linux-cxl,
linux-kernel
On 4/21/26 9:56 PM, Sungwoo Kim wrote:
> devm_release_action() cannot find a matching entry in the following two
> race scenarios.
>
> scenario 1: delete two same regions concurrently
>
> CPU 0 CPU 1
> ====== ======
> delete_region_store()
> cxlr = cxl_find_region_by_name()
> delete_region_store()
> cxlr = cxl_find_region_by_name()
> devm_release_action()
> devm_release_action()
> // cannot find the action, WARN_ON()
>
> scenario 2: delete parent and child concurrently [1]
>
> CPU0 CPU1
> devres_release_all()
> // take devres_lock
> remove_nodes(devres_head) // mv to local todo
> // drop devres_lock delete_region_store()
> cxlr = cxl_find_region_by_name() // success
> devm_release_action(unregister_region)
> devres_release()
> devres_remove()
> // hold devres_lock
> find_dr(devres_head) // does not find it
> WARN_ON(-ENOENT)
> release_nodes() // drain todo
> unregister_region(cxlr) // release() cb
> device_del()
>
> To fix scenario 1, delete_region_store() directly calls
> unregister_region() with a test_and_set_bit(CXL_REGION_F_UNREGISTER).
> Also, replace devm_release_action() to devm_remove_action() as
> unregister_region() is now called directly.
>
> To fix scenario 2, delete_region_store() removes actions only if the
> driver is still attached. To ensure this, scoped_guard(device,
> port->uport_dev) is required to check port->uport_dev->driver.
> To hold this lock, a workqueue is required for a clean context.
>
> Splat:
>
> WARNING: drivers/base/devres.c:824 at devm_release_action drivers/base/devres.c:824 [inline], CPU#0: syz.1.12224/47589
> WARNING: drivers/base/devres.c:824 at devm_release_action+0x2b2/0x360 drivers/base/devres.c:817, CPU#0: syz.1.12224/47589
>
> [1] https://lore.kernel.org/linux-cxl/20260310183644.4rwc7ilmzy4t5xp6@offworld/
>
> Fixes: 779dd20cfb56 ("cxl/region: Add region creation support")
> Suggested-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Sungwoo Kim <iam@sung-woo.kim>
Sungwoo,
I saw this on sashiko but have not reviewed the validity. Maybe take a look.
https://sashiko.dev/#/patchset/20260422045637.3048249-2-iam%40sung-woo.kim
DJ
> ---
> V1: https://lore.kernel.org/linux-cxl/20260308185958.2453707-2-iam@sung-woo.kim/
> V1->V2:
> - Made devm_remove_action() asynchronous.
> - Made unregister_region() idempotent.
> - Addressed Dan's comments and added the suggested-by tag.
>
> drivers/cxl/core/region.c | 45 ++++++++++++++++++++++++++++++++++++---
> drivers/cxl/cxl.h | 8 +++++++
> 2 files changed, 50 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index e50dc716d4e8..64db0d332c13 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -39,6 +39,7 @@
> static nodemask_t nodemask_region_seen = NODE_MASK_NONE;
>
> static struct cxl_region *to_cxl_region(struct device *dev);
> +static void remove_devm_actions_work(struct work_struct *work);
>
> #define __ACCESS_ATTR_RO(_level, _name) { \
> .attr = { .name = __stringify(_name), .mode = 0444 }, \
> @@ -2543,6 +2544,9 @@ static void unregister_region(void *_cxlr)
> struct cxl_region_params *p = &cxlr->params;
> int i;
>
> + if (test_and_set_bit(CXL_REGION_F_UNREGISTER, &cxlr->flags))
> + return;
> +
> device_del(&cxlr->dev);
>
> /*
> @@ -2589,6 +2593,8 @@ static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int i
> dev->type = &cxl_region_type;
> cxl_region_setup_flags(cxlr, &cxlrd->cxlsd.cxld);
>
> + INIT_WORK(&cxlr->remove_work, remove_devm_actions_work);
> +
> return cxlr;
> }
>
> @@ -2831,20 +2837,53 @@ cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name)
> return to_cxl_region(region_dev);
> }
>
> +static void remove_devm_actions_work(struct work_struct *work)
> +{
> + struct cxl_region *cxlr = container_of(work, typeof(*cxlr), remove_work);
> + struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
> + struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
> +
> + if (test_and_set_bit(CXL_REGION_F_DEVM_REMOVE, &cxlr->flags)) {
> + put_device(&cxlr->dev);
> + return;
> + }
> +
> + scoped_guard(device, port->uport_dev) {
> + if (port->uport_dev->driver)
> + devm_remove_action(port->uport_dev, unregister_region, cxlr);
> + }
> +
> + put_device(&cxlr->dev);
> +}
> +
> +static int remove_devm_actions(struct cxl_region *cxlr)
> +{
> + if (!schedule_work(&cxlr->remove_work))
> + return -EBUSY;
> +
> + return 0;
> +}
> +
> static ssize_t delete_region_store(struct device *dev,
> struct device_attribute *attr,
> const char *buf, size_t len)
> {
> struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
> - struct cxl_port *port = to_cxl_port(dev->parent);
> struct cxl_region *cxlr;
> + int rc;
>
> + /* remove_devm_actions_work() will put cxlr->dev. */
> cxlr = cxl_find_region_by_name(cxlrd, buf);
> if (IS_ERR(cxlr))
> return PTR_ERR(cxlr);
>
> - devm_release_action(port->uport_dev, unregister_region, cxlr);
> - put_device(&cxlr->dev);
> + unregister_region(cxlr);
> +
> + rc = remove_devm_actions(cxlr);
> + if (rc) {
> + put_device(&cxlr->dev);
> + return rc;
> + }
>
> return len;
> }
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 1297594beaec..75ec292a9f42 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -447,6 +447,12 @@ struct cxl_region_params {
> */
> #define CXL_REGION_F_NORMALIZED_ADDRESSING 3
>
> +/* Indicate that this region is being unregistered to prevent a race. */
> +#define CXL_REGION_F_UNREGISTER 4
> +
> +/* Indicate that this region called devm_remove_action. */
> +#define CXL_REGION_F_DEVM_REMOVE 5
> +
> /**
> * struct cxl_region - CXL region
> * @dev: This region's device
> @@ -462,6 +468,7 @@ struct cxl_region_params {
> * @coord: QoS access coordinates for the region
> * @node_notifier: notifier for setting the access coordinates to node
> * @adist_notifier: notifier for calculating the abstract distance of node
> + * @remove_work: trigger the remove action in a safe context to acquire locks
> */
> struct cxl_region {
> struct device dev;
> @@ -477,6 +484,7 @@ struct cxl_region {
> struct access_coordinate coord[ACCESS_COORDINATE_MAX];
> struct notifier_block node_notifier;
> struct notifier_block adist_notifier;
> + struct work_struct remove_work;
> };
>
> struct cxl_nvdimm_bridge {
^ permalink raw reply [flat|nested] 6+ messages in thread* Re: [PATCH v2] cxl/region: Fix a race bug in delete_region_store
2026-04-22 4:56 [PATCH v2] cxl/region: Fix a race bug in delete_region_store Sungwoo Kim
2026-04-22 15:10 ` Dave Jiang
2026-04-23 0:19 ` Dave Jiang
@ 2026-04-23 2:52 ` Sungwoo Kim
2 siblings, 0 replies; 6+ messages in thread
From: Sungwoo Kim @ 2026-04-23 2:52 UTC (permalink / raw)
To: Davidlohr Bueso, Jonathan Cameron, Dave Jiang, Alison Schofield,
Vishal Verma, Ira Weiny, Dan Williams, Ben Widawsky
Cc: Dave Tian, Dan Williams, Jonathan Cameron, linux-cxl,
linux-kernel
This is a follow-up based on Sashiko's review.
https://sashiko.dev/#/patchset/20260422045637.3048249-2-iam%40sung-woo.kim
In sum, using a global system wq was a wrong choice. Instead, use a
cxl's wq to match wq and the driver's lifetime.
Also, Sashiko figured out a new race scenario in construct_region().
On Wed, Apr 22, 2026 at 1:00 AM Sungwoo Kim <iam@sung-woo.kim> wrote:
>
> devm_release_action() cannot find a matching entry in the following two
> race scenarios.
>
> scenario 1: delete two same regions concurrently
>
> CPU 0 CPU 1
> ====== ======
> delete_region_store()
> cxlr = cxl_find_region_by_name()
> delete_region_store()
> cxlr = cxl_find_region_by_name()
> devm_release_action()
> devm_release_action()
> // cannot find the action, WARN_ON()
>
> scenario 2: delete parent and child concurrently [1]
>
> CPU0 CPU1
> devres_release_all()
> // take devres_lock
> remove_nodes(devres_head) // mv to local todo
> // drop devres_lock delete_region_store()
> cxlr = cxl_find_region_by_name() // success
> devm_release_action(unregister_region)
> devres_release()
> devres_remove()
> // hold devres_lock
> find_dr(devres_head) // does not find it
> WARN_ON(-ENOENT)
> release_nodes() // drain todo
> unregister_region(cxlr) // release() cb
> device_del()
>
> To fix scenario 1, delete_region_store() directly calls
> unregister_region() with a test_and_set_bit(CXL_REGION_F_UNREGISTER).
> Also, replace devm_release_action() to devm_remove_action() as
> unregister_region() is now called directly.
>
> To fix scenario 2, delete_region_store() removes actions only if the
> driver is still attached. To ensure this, scoped_guard(device,
> port->uport_dev) is required to check port->uport_dev->driver.
> To hold this lock, a workqueue is required for a clean context.
>
> Splat:
>
> WARNING: drivers/base/devres.c:824 at devm_release_action drivers/base/devres.c:824 [inline], CPU#0: syz.1.12224/47589
> WARNING: drivers/base/devres.c:824 at devm_release_action+0x2b2/0x360 drivers/base/devres.c:817, CPU#0: syz.1.12224/47589
>
> [1] https://lore.kernel.org/linux-cxl/20260310183644.4rwc7ilmzy4t5xp6@offworld/
>
> Fixes: 779dd20cfb56 ("cxl/region: Add region creation support")
> Suggested-by: Dan Williams <dan.j.williams@intel.com>
> Signed-off-by: Sungwoo Kim <iam@sung-woo.kim>
> ---
> V1: https://lore.kernel.org/linux-cxl/20260308185958.2453707-2-iam@sung-woo.kim/
> V1->V2:
> - Made devm_remove_action() asynchronous.
> - Made unregister_region() idempotent.
> - Addressed Dan's comments and added the suggested-by tag.
>
> drivers/cxl/core/region.c | 45 ++++++++++++++++++++++++++++++++++++---
> drivers/cxl/cxl.h | 8 +++++++
> 2 files changed, 50 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index e50dc716d4e8..64db0d332c13 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -39,6 +39,7 @@
> static nodemask_t nodemask_region_seen = NODE_MASK_NONE;
>
> static struct cxl_region *to_cxl_region(struct device *dev);
> +static void remove_devm_actions_work(struct work_struct *work);
>
> #define __ACCESS_ATTR_RO(_level, _name) { \
> .attr = { .name = __stringify(_name), .mode = 0444 }, \
> @@ -2543,6 +2544,9 @@ static void unregister_region(void *_cxlr)
> struct cxl_region_params *p = &cxlr->params;
> int i;
>
> + if (test_and_set_bit(CXL_REGION_F_UNREGISTER, &cxlr->flags))
> + return;
> +
> device_del(&cxlr->dev);
>
> /*
> @@ -2589,6 +2593,8 @@ static struct cxl_region *cxl_region_alloc(struct cxl_root_decoder *cxlrd, int i
> dev->type = &cxl_region_type;
> cxl_region_setup_flags(cxlr, &cxlrd->cxlsd.cxld);
>
> + INIT_WORK(&cxlr->remove_work, remove_devm_actions_work);
> +
> return cxlr;
> }
>
> @@ -2831,20 +2837,53 @@ cxl_find_region_by_name(struct cxl_root_decoder *cxlrd, const char *name)
> return to_cxl_region(region_dev);
> }
>
> +static void remove_devm_actions_work(struct work_struct *work)
> +{
> + struct cxl_region *cxlr = container_of(work, typeof(*cxlr), remove_work);
> + struct cxl_root_decoder *cxlrd = cxlr->cxlrd;
> + struct cxl_port *port = to_cxl_port(cxlrd->cxlsd.cxld.dev.parent);
> +
> + if (test_and_set_bit(CXL_REGION_F_DEVM_REMOVE, &cxlr->flags)) {
> + put_device(&cxlr->dev);
> + return;
> + }
> +
> + scoped_guard(device, port->uport_dev) {
> + if (port->uport_dev->driver)
A rapid driver unbind-and-rebind can happen before this work. If so,
the work becomes stale, and devm_remove_action() is called on the
rebound driver, which has no reason to remove an action.
To fix this, use a driver's wq instead of system wq so unbinding can
drain this work.
> + devm_remove_action(port->uport_dev, unregister_region, cxlr);
> + }
> +
> + put_device(&cxlr->dev);
> +}
> +
> +static int remove_devm_actions(struct cxl_region *cxlr)
> +{
> + if (!schedule_work(&cxlr->remove_work))
This code can introduce use-after-free because the global system wq is
not flushed when the kernel module is unloaded.
After unloading the module, this can execute free'd remove_devm_actions_work().
To fix this, use a driver's wq instead of system wq so unbinding can
drain this work.
> + return -EBUSY;
> +
> + return 0;
> +}
Sashiko suggested a new race scenario. construct_region() can also
race with delete_region_store().
drivers/cxl/core/region.c:construct_region() {
...
/* remove_devm_actions_work() can be scheduled and run at this point */
rc = __construct_region(cxlr, ctx);
if (rc) {
/* this will fail to find an action and trigger WARN_ON */
devm_release_action(port->uport_dev, unregister_region, cxlr);
return ERR_PTR(rc);
}
...
}
To fix this, construct_region() should directly call
unregister_region() and schedule remove_devm_actions_work().
> +
> static ssize_t delete_region_store(struct device *dev,
> struct device_attribute *attr,
> const char *buf, size_t len)
> {
> struct cxl_root_decoder *cxlrd = to_cxl_root_decoder(dev);
> - struct cxl_port *port = to_cxl_port(dev->parent);
> struct cxl_region *cxlr;
> + int rc;
>
> + /* remove_devm_actions_work() will put cxlr->dev. */
> cxlr = cxl_find_region_by_name(cxlrd, buf);
> if (IS_ERR(cxlr))
> return PTR_ERR(cxlr);
>
> - devm_release_action(port->uport_dev, unregister_region, cxlr);
> - put_device(&cxlr->dev);
> + unregister_region(cxlr);
> +
> + rc = remove_devm_actions(cxlr);
> + if (rc) {
> + put_device(&cxlr->dev);
> + return rc;
> + }
>
> return len;
> }
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index 1297594beaec..75ec292a9f42 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -447,6 +447,12 @@ struct cxl_region_params {
> */
> #define CXL_REGION_F_NORMALIZED_ADDRESSING 3
>
> +/* Indicate that this region is being unregistered to prevent a race. */
> +#define CXL_REGION_F_UNREGISTER 4
> +
> +/* Indicate that this region called devm_remove_action. */
> +#define CXL_REGION_F_DEVM_REMOVE 5
> +
> /**
> * struct cxl_region - CXL region
> * @dev: This region's device
> @@ -462,6 +468,7 @@ struct cxl_region_params {
> * @coord: QoS access coordinates for the region
> * @node_notifier: notifier for setting the access coordinates to node
> * @adist_notifier: notifier for calculating the abstract distance of node
> + * @remove_work: trigger the remove action in a safe context to acquire locks
> */
> struct cxl_region {
> struct device dev;
> @@ -477,6 +484,7 @@ struct cxl_region {
> struct access_coordinate coord[ACCESS_COORDINATE_MAX];
> struct notifier_block node_notifier;
> struct notifier_block adist_notifier;
> + struct work_struct remove_work;
> };
>
> struct cxl_nvdimm_bridge {
> --
> 2.47.3
>
^ permalink raw reply [flat|nested] 6+ messages in thread