From: Tejun Heo <tj@kernel.org>
To: Herbert Xu <herbert@gondor.apana.org.au>
Cc: Thomas Graf <tgraf@suug.ch>,
Andrew Morton <akpm@linux-foundation.org>,
linux-crypto@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: [PATCH v3] rhashtable: Bounce deferred worker kick through irq_work
Date: Mon, 20 Apr 2026 20:03:26 -1000 [thread overview]
Message-ID: <20260421060326.2836354-1-tj@kernel.org> (raw)
In-Reply-To: <4ff731fc-3791-4b96-a997-89c3bcd2d69b@kernel.org>
Inserts past 75% load call schedule_work(&ht->run_work) to kick an
async resize. If a caller holds a raw spinlock (e.g. an
insecure_elasticity user), schedule_work() under that lock records
caller_lock -> pool->lock -> pi_lock -> rq->__lock
A cycle forms if any of these locks is acquired in the reverse
direction elsewhere. sched_ext, the only current insecure_elasticity
user, hits this: it holds scx_sched_lock across rhashtable inserts of
sub-schedulers, while scx_bypass() takes rq->__lock -> scx_sched_lock.
Exercising the resize path produces:
Chain exists of:
&pool->lock --> &rq->__lock --> scx_sched_lock
Bounce the kick from the insert paths through irq_work so
schedule_work() runs from hard IRQ context with the caller's lock no
longer held. rht_deferred_worker()'s self-rearm on error stays on
schedule_work(&ht->run_work) - the worker runs in process context with
no caller lock held, and keeping the self-requeue on @run_work lets
cancel_work_sync() in rhashtable_free_and_destroy() drain it.
v3: Keep rht_deferred_worker()'s self-rearm on schedule_work(&run_work).
Routing it through irq_work in v2 broke cancel_work_sync()'s
self-requeue handling - an irq_work queued after irq_work_sync()
returned but while cancel_work_sync() was still waiting could fire
post-teardown.
v2: Bounce unconditionally instead of gating on insecure_elasticity,
as suggested by Herbert.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
Herbert, dropped your Ack on v2. v2 routed rht_deferred_worker()'s
self-rearm through irq_work, which would race with
cancel_work_sync() in rhashtable_free_and_destroy(). v3 keeps only
the insert-path kicks on irq_work; the worker's self-rearm stays on
schedule_work(&ht->run_work).
include/linux/rhashtable-types.h | 3 +++
include/linux/rhashtable.h | 3 ++-
lib/rhashtable.c | 31 ++++++++++++++++++++++++++++---
3 files changed, 33 insertions(+), 4 deletions(-)
diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h
index 72082428d6c6..fc2f596a6df1 100644
--- a/include/linux/rhashtable-types.h
+++ b/include/linux/rhashtable-types.h
@@ -12,6 +12,7 @@
#include <linux/alloc_tag.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
+#include <linux/irq_work_types.h>
#include <linux/mutex.h>
#include <linux/workqueue_types.h>
@@ -77,6 +78,7 @@ struct rhashtable_params {
* @p: Configuration parameters
* @rhlist: True if this is an rhltable
* @run_work: Deferred worker to expand/shrink asynchronously
+ * @run_irq_work: Bounces the @run_work kick through hard IRQ context.
* @mutex: Mutex to protect current/future table swapping
* @lock: Spin lock to protect walker list
* @nelems: Number of elements in table
@@ -88,6 +90,7 @@ struct rhashtable {
struct rhashtable_params p;
bool rhlist;
struct work_struct run_work;
+ struct irq_work run_irq_work;
struct mutex mutex;
spinlock_t lock;
atomic_t nelems;
diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index 7def3f0f556b..ef5230cece36 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -20,6 +20,7 @@
#include <linux/err.h>
#include <linux/errno.h>
+#include <linux/irq_work.h>
#include <linux/jhash.h>
#include <linux/list_nulls.h>
#include <linux/workqueue.h>
@@ -847,7 +848,7 @@ static __always_inline void *__rhashtable_insert_fast(
rht_assign_unlock(tbl, bkt, obj, flags);
if (rht_grow_above_75(ht, tbl))
- schedule_work(&ht->run_work);
+ irq_work_queue(&ht->run_irq_work);
data = NULL;
out:
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index fb2b7bc137ba..7a67ef5b67b6 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -441,10 +441,33 @@ static void rht_deferred_worker(struct work_struct *work)
mutex_unlock(&ht->mutex);
+ /*
+ * Re-arm via @run_work, not @run_irq_work.
+ * rhashtable_free_and_destroy() drains async work as irq_work_sync()
+ * followed by cancel_work_sync(). If this site queued irq_work while
+ * cancel_work_sync() was waiting for us, irq_work_sync() would already
+ * have returned and the stale irq_work could fire post-teardown.
+ * cancel_work_sync() natively handles self-requeue on @run_work.
+ */
if (err)
schedule_work(&ht->run_work);
}
+/*
+ * Insert-path callers can run under a raw spinlock (e.g. an insecure_elasticity
+ * user). Calling schedule_work() under that lock records caller_lock ->
+ * pool->lock -> pi_lock -> rq->__lock, closing a locking cycle if any of
+ * these is acquired in the reverse direction elsewhere. Bounce through
+ * irq_work so the schedule_work() runs with the caller's lock no longer held.
+ */
+static void rht_deferred_irq_work(struct irq_work *irq_work)
+{
+ struct rhashtable *ht = container_of(irq_work, struct rhashtable,
+ run_irq_work);
+
+ schedule_work(&ht->run_work);
+}
+
static int rhashtable_insert_rehash(struct rhashtable *ht,
struct bucket_table *tbl)
{
@@ -477,7 +500,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
if (err == -EEXIST)
err = 0;
} else
- schedule_work(&ht->run_work);
+ irq_work_queue(&ht->run_irq_work);
return err;
@@ -488,7 +511,7 @@ static int rhashtable_insert_rehash(struct rhashtable *ht,
/* Schedule async rehash to retry allocation in process context. */
if (err == -ENOMEM)
- schedule_work(&ht->run_work);
+ irq_work_queue(&ht->run_irq_work);
return err;
}
@@ -630,7 +653,7 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
rht_unlock(tbl, bkt, flags);
if (inserted && rht_grow_above_75(ht, tbl))
- schedule_work(&ht->run_work);
+ irq_work_queue(&ht->run_irq_work);
}
} while (!IS_ERR_OR_NULL(new_tbl));
@@ -1085,6 +1108,7 @@ int rhashtable_init_noprof(struct rhashtable *ht,
RCU_INIT_POINTER(ht->tbl, tbl);
INIT_WORK(&ht->run_work, rht_deferred_worker);
+ init_irq_work(&ht->run_irq_work, rht_deferred_irq_work);
return 0;
}
@@ -1150,6 +1174,7 @@ void rhashtable_free_and_destroy(struct rhashtable *ht,
struct bucket_table *tbl, *next_tbl;
unsigned int i;
+ irq_work_sync(&ht->run_irq_work);
cancel_work_sync(&ht->run_work);
mutex_lock(&ht->mutex);
--
2.53.0
next prev parent reply other threads:[~2026-04-21 6:03 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-04-19 18:19 [RFC PATCH] rhashtable: Bounce deferred worker kick through irq_work when insecure_elasticity is set Tejun Heo
2026-04-20 8:44 ` Herbert Xu
2026-04-20 17:02 ` Tejun Heo
2026-04-20 18:12 ` [PATCH v2] rhashtable: Bounce deferred worker kick through irq_work Tejun Heo
2026-04-21 3:02 ` Herbert Xu
2026-04-21 6:03 ` Tejun Heo [this message]
2026-04-21 6:06 ` [PATCH v3] " Herbert Xu
2026-04-21 6:14 ` Tejun Heo
2026-05-12 6:07 ` Hillf Danton
2026-04-27 23:12 ` [RFC PATCH] rhashtable: Bounce deferred worker kick through irq_work when insecure_elasticity is set kernel test robot
2026-04-28 7:44 ` kernel test robot
2026-05-07 8:56 ` kernel test robot
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260421060326.2836354-1-tj@kernel.org \
--to=tj@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=herbert@gondor.apana.org.au \
--cc=linux-crypto@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=tgraf@suug.ch \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.