The Linux Kernel Mailing List
 help / color / mirror / Atom feed
From: wen.yang@linux.dev
To: Gabriele Monaco <gmonaco@redhat.com>,
	Steven Rostedt <rostedt@goodmis.org>
Cc: linux-trace-kernel@vger.kernel.org, linux-kernel@vger.kernel.org,
	Wen Yang <wen.yang@linux.dev>
Subject: [RFC PATCH v2 04/10] rv/da: add pre-allocated storage pool for per-object monitors
Date: Tue, 12 May 2026 02:24:50 +0800	[thread overview]
Message-ID: <2774332570ee823be60cfe84ba85e9573b4df478.1778522945.git.wen.yang@linux.dev> (raw)
In-Reply-To: <cover.1778522945.git.wen.yang@linux.dev>

From: Wen Yang <wen.yang@linux.dev>

da_create_empty_storage() uses kmalloc_nolock(), which requires
CONFIG_HAVE_ALIGNED_STRUCT_PAGE; on UML and some PREEMPT_RT
configurations kmalloc_nolock() therefore always returns NULL.
Calling kmalloc from scheduler tracepoint handlers also adds unwanted
latency and can fail under memory pressure.

Add da_monitor_init_prealloc(N) as an opt-in alternative to
da_monitor_init().  It allocates N da_monitor_storage slots with
GFP_KERNEL up-front and manages them via a LIFO free stack protected
by a spinlock, so da_create_or_get() never calls kmalloc on the hot
path.

Monitors that do not call da_monitor_init_prealloc() are unaffected.

Signed-off-by: Wen Yang <wen.yang@linux.dev>
---
 include/rv/da_monitor.h | 208 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 186 insertions(+), 22 deletions(-)

diff --git a/include/rv/da_monitor.h b/include/rv/da_monitor.h
index d04bb3229c75..7d6f62766251 100644
--- a/include/rv/da_monitor.h
+++ b/include/rv/da_monitor.h
@@ -433,18 +433,6 @@ static inline da_id_type da_get_id(struct da_monitor *da_mon)
 	return container_of(da_mon, struct da_monitor_storage, rv.da_mon)->id;
 }
 
-/*
- * da_create_or_get - create the per-object storage if not already there
- *
- * This needs a lookup so should be guarded by RCU, the condition is checked
- * directly in da_create_storage()
- */
-static inline void da_create_or_get(da_id_type id, monitor_target target)
-{
-	guard(rcu)();
-	da_create_storage(id, target, da_get_monitor(id, target));
-}
-
 /*
  * da_fill_empty_storage - store the target in a pre-allocated storage
  *
@@ -475,15 +463,121 @@ static inline monitor_target da_get_target_by_id(da_id_type id)
 	return mon_storage->target;
 }
 
+/*
+ * Per-object pool state.
+ *
+ * Zero-initialised by default (storage == NULL ⟹ kmalloc mode).  A monitor
+ * opts into pool mode by calling da_monitor_init_prealloc(N) instead of
+ * da_monitor_init(), which sets storage to a non-NULL kcalloc'd array.
+ *
+ * Because every field is wrapped in this struct and the struct itself is a
+ * per-TU static, each monitor that includes this header gets a completely
+ * independent pool.  A kmalloc monitor (e.g. nomiss) and a pool monitor
+ * (e.g. tlob) therefore coexist without any interference.
+ *
+ * da_pool_return_cb runs from softirq on non-PREEMPT_RT, so irqsave is
+ * required to prevent deadlock with task-context callers.  On PREEMPT_RT
+ * it runs from an rcuc kthread where spinlock_t is a sleeping lock.
+ */
+struct da_per_obj_pool {
+	struct da_monitor_storage  *storage;  /* non-NULL ⟹ pool mode */
+	struct da_monitor_storage **free;     /* kmalloc'd pointer stack */
+	unsigned int                free_top;
+	spinlock_t                  lock;
+};
+
+static struct da_per_obj_pool da_pool = {
+	.lock = __SPIN_LOCK_UNLOCKED(da_pool.lock),
+};
+
+static void da_pool_return_cb(struct rcu_head *head)
+{
+	struct da_monitor_storage *ms =
+		container_of(head, struct da_monitor_storage, rcu);
+	unsigned long flags;
+
+	spin_lock_irqsave(&da_pool.lock, flags);
+	da_pool.free[da_pool.free_top++] = ms;
+	spin_unlock_irqrestore(&da_pool.lock, flags);
+}
+
+/* Pops a slot from the pre-allocated pool; returns -ENOSPC if exhausted. */
+static inline int da_create_or_get_pool(da_id_type id, monitor_target target)
+{
+	struct da_monitor_storage *mon_storage;
+	unsigned long flags;
+
+	spin_lock_irqsave(&da_pool.lock, flags);
+	if (!da_pool.free_top) {
+		spin_unlock_irqrestore(&da_pool.lock, flags);
+		return -ENOSPC;
+	}
+	mon_storage = da_pool.free[--da_pool.free_top];
+	spin_unlock_irqrestore(&da_pool.lock, flags);
+
+	mon_storage->id = id;
+	mon_storage->target = target;
+	guard(rcu)();
+	hash_add_rcu(da_monitor_ht, &mon_storage->node, id);
+	return 0;
+}
+
+/*
+ * Tries da_create_storage() first (lock-free via kmalloc_nolock); falls back
+ * to kmalloc(GFP_KERNEL).  Must be called from task context.
+ */
+static inline int da_create_or_get_kmalloc(da_id_type id, monitor_target target)
+{
+	struct da_monitor_storage *mon_storage;
+
+	scoped_guard(rcu) {
+		if (da_create_storage(id, target, da_get_monitor(id, target)))
+			return 0;
+	}
+
+	/*
+	 * da_create_storage() failed because kmalloc_nolock() returned NULL.
+	 * Allocate with GFP_KERNEL outside the RCU read section: GFP_KERNEL
+	 * may sleep for memory reclaim, which is illegal while the RCU read
+	 * lock is held (preemption disabled on !PREEMPT_RT).
+	 */
+	mon_storage = kmalloc_obj(*mon_storage, GFP_KERNEL | __GFP_ZERO);
+	if (!mon_storage)
+		return -ENOMEM;
+	mon_storage->id = id;
+	mon_storage->target = target;
+
+	/*
+	 * Re-check for a concurrent insertion before linking: another
+	 * caller may have succeeded while we slept in kmalloc().
+	 * Discard our allocation and let the winner's entry stand.
+	 */
+	scoped_guard(rcu) {
+		if (da_get_monitor(id, target)) {
+			kfree(mon_storage);
+			return 0;
+		}
+		hash_add_rcu(da_monitor_ht, &mon_storage->node, id);
+	}
+	return 0;
+}
+
+/* Create the per-object storage if not already there. */
+static inline int da_create_or_get(da_id_type id, monitor_target target)
+{
+	if (da_pool.storage)
+		return da_create_or_get_pool(id, target);
+	return da_create_or_get_kmalloc(id, target);
+}
+
 /*
  * da_destroy_storage - destroy the per-object storage
  *
- * The caller is responsible to synchronise writers, either with locks or
- * implicitly. For instance, if da_destroy_storage is called at sched_exit and
- * da_create_storage can never occur after that, it's safe to call this without
- * locks.
- * This function includes an RCU read-side critical section to synchronise
- * against da_monitor_destroy().
+ * Pool mode: removes from hash and returns the slot via call_rcu().
+ * Kmalloc mode: removes from hash and frees via kfree_rcu().
+ *
+ * Includes an RCU read-side critical section to synchronise against
+ * da_monitor_destroy().
  */
 static inline void da_destroy_storage(da_id_type id)
 {
@@ -491,15 +585,17 @@ static inline void da_destroy_storage(da_id_type id)
 
 	guard(rcu)();
 	mon_storage = __da_get_mon_storage(id);
-
 	if (!mon_storage)
 		return;
 	da_monitor_reset_hook(&mon_storage->rv.da_mon);
 	hash_del_rcu(&mon_storage->node);
-	kfree_rcu(mon_storage, rcu);
+	if (da_pool.storage)
+		call_rcu(&mon_storage->rcu, da_pool_return_cb);
+	else
+		kfree_rcu(mon_storage, rcu);
 }
 
-static void da_monitor_reset_all(void)
+static __maybe_unused void da_monitor_reset_all(void)
 {
 	struct da_monitor_storage *mon_storage;
 	int bkt;
@@ -510,13 +606,65 @@ static void da_monitor_reset_all(void)
 	rcu_read_unlock();
 }
 
+/*
+ * da_monitor_init_prealloc - initialise with a pre-allocated storage pool
+ *
+ * Allocates @prealloc_count storage slots up-front so that da_create_or_get()
+ * and da_destroy_storage() never call kmalloc/kfree.  Must be called instead
+ * of da_monitor_init() for monitors that require pool mode.
+ */
+static inline int da_monitor_init_prealloc(unsigned int prealloc_count)
+{
+	hash_init(da_monitor_ht);
+
+	da_pool.storage = kcalloc(prealloc_count, sizeof(*da_pool.storage),
+				  GFP_KERNEL);
+	if (!da_pool.storage)
+		return -ENOMEM;
+
+	da_pool.free = kmalloc_array(prealloc_count, sizeof(*da_pool.free),
+				     GFP_KERNEL);
+	if (!da_pool.free) {
+		kfree(da_pool.storage);
+		da_pool.storage = NULL;
+		return -ENOMEM;
+	}
+
+	da_pool.free_top = 0;
+	for (unsigned int i = 0; i < prealloc_count; i++)
+		da_pool.free[da_pool.free_top++] = &da_pool.storage[i];
+	return 0;
+}
+
+/*
+ * da_monitor_init - initialise in kmalloc mode (no pre-allocation)
+ */
 static inline int da_monitor_init(void)
 {
 	hash_init(da_monitor_ht);
 	return 0;
 }
 
-static inline void da_monitor_destroy(void)
+static inline void da_monitor_destroy_pool(void)
+{
+	WARN_ON_ONCE(!hash_empty(da_monitor_ht));
+	/*
+	 * Wait for all in-flight da_pool_return_cb() callbacks to
+	 * complete before freeing da_pool.free.  synchronize_rcu() is
+	 * not sufficient: it only waits for callbacks registered before
+	 * it was called, but call_rcu() from concurrent da_destroy_storage()
+	 * calls may have been enqueued later.  rcu_barrier() drains every
+	 * pending callback.
+	 */
+	rcu_barrier();
+	kfree(da_pool.storage);
+	da_pool.storage = NULL;
+	kfree(da_pool.free);
+	da_pool.free = NULL;
+	da_pool.free_top = 0;
+}
+
+static inline void da_monitor_destroy_kmalloc(void)
 {
 	struct da_monitor_storage *mon_storage;
 	struct hlist_node *tmp;
@@ -534,6 +682,22 @@ static inline void da_monitor_destroy(void)
 	}
 }
 
+/*
+ * da_monitor_destroy - tear down the per-object monitor
+ *
+ * Pool mode: the hash must already be empty (caller must have drained all
+ * tasks first); calls rcu_barrier() to drain all pending da_pool_return_cb()
+ * callbacks before freeing pool arrays.
+ * Kmalloc mode: drains any remaining entries after synchronize_rcu().
+ */
+static inline void da_monitor_destroy(void)
+{
+	if (da_pool.storage)
+		da_monitor_destroy_pool();
+	else
+		da_monitor_destroy_kmalloc();
+}
+
 /*
  * Allow the per-object monitors to run allocation manually, necessary if the
  * start condition is in a context problematic for allocation (e.g. scheduling).
-- 
2.25.1


  parent reply	other threads:[~2026-05-11 18:25 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-11 18:24 [RFC PATCH v2 00/10] rv/tlob: Add task latency over budget RV monitor wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 01/10] rv/da: fix monitor start ordering and memory ordering for monitoring flag wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 02/10] rv/da: fix per-task da_monitor_destroy() ordering and sync wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 03/10] selftests/verification: fix verificationtest-ktap for out-of-tree execution wen.yang
2026-05-11 18:24 ` wen.yang [this message]
2026-05-11 18:24 ` [RFC PATCH v2 05/10] rv: add generic uprobe infrastructure for RV monitors wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 06/10] rvgen: support reset() on the __init arrow for global-window HA clocks wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 07/10] rv/tlob: add tlob model DOT file wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 08/10] rv/tlob: add tlob hybrid automaton monitor wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 09/10] rv/tlob: add KUnit tests for the tlob monitor wen.yang
2026-05-11 18:24 ` [RFC PATCH v2 10/10] selftests/verification: add tlob selftests wen.yang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=2774332570ee823be60cfe84ba85e9573b4df478.1778522945.git.wen.yang@linux.dev \
    --to=wen.yang@linux.dev \
    --cc=gmonaco@redhat.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-trace-kernel@vger.kernel.org \
    --cc=rostedt@goodmis.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox