The Linux Kernel Mailing List
 help / color / mirror / Atom feed
From: Jeff Layton <jlayton@kernel.org>
To: Trond Myklebust <trondmy@kernel.org>,
	Anna Schumaker <anna@kernel.org>, NeilBrown <neil@brown.name>,
	Olga Kornievskaia <okorniev@redhat.com>,
	Dai Ngo <Dai.Ngo@oracle.com>, Tom Talpey <tom@talpey.com>,
	Chuck Lever <cel@kernel.org>
Cc: linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org,
	Jeff Layton <jlayton@kernel.org>
Subject: [PATCH 5/4] sunrpc: protect the svc_pool_map pool_to[] array with RCU
Date: Tue, 30 Jun 2026 08:48:47 -0400	[thread overview]
Message-ID: <20260630124847.289974-1-jlayton@kernel.org> (raw)
In-Reply-To: <20260629-sunrpc-pool-mode-v3-0-d92676606dfd@kernel.org>

svc_pool_map_get_node() reads the global svc_pool_map without holding
svc_pool_map_mutex and dereferences m->pool_to[]. The array was both
published and torn down without any synchronisation against that
lockless reader:

 - svc_pool_map_get() incremented m->count to one before
   svc_pool_map_init_pernode() allocated and filled the arrays, so a
   reader observing the map as "in use" could see a NULL (or partially
   built) pool_to[] and oops.

 - svc_pool_map_put() freed the arrays as soon as the last reference
   went away, so a reader that had already started dereferencing
   pool_to[] could use it after free.

svc_new_thread() takes this lockless path for every service, including
unpooled ones that hold no map reference, so the reader genuinely can
run concurrently with another service's startup or shutdown.

Publish pool_to[] with rcu_assign_pointer() only after it is fully
built in a private allocation, and have svc_pool_map_get_node()
dereference it under rcu_read_lock(). On teardown, clear the pointer
and defer the free past a grace period with kfree_rcu_mightsleep().

svc_pool_map_set_cpumask() also reads pool_to[], but its caller holds a
map reference (it checks sv_nrpools > 1) so the array is stable; it uses
rcu_dereference_protected() rather than taking the read lock.

to_pool[] needs no such treatment: it is only read by services that
hold a map reference, so it cannot be freed under a reader.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
Assisted-by: Claude:claude-opus-4-8
---
 net/sunrpc/svc.c | 91 +++++++++++++++++++++++++-----------------------
 1 file changed, 48 insertions(+), 43 deletions(-)

Sashiko pointed out this (preexisting) problem during review of the
other 4 in the series. I figure we might as well fix it too while we're
in here.

diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 4fff0725ef8f..ebd953248e34 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -46,13 +46,13 @@ static void svc_unregister(const struct svc_serv *serv, struct net *net);
 struct svc_pool_map {
 	int count;			/* How many svc_servs use us */
 	unsigned int npools;
-	unsigned int *pool_to;		/* maps pool id to node */
+	unsigned int __rcu *pool_to;	/* maps pool id to node */
 	unsigned int *to_pool;		/* maps node to pool id */
 };
 
 static struct svc_pool_map svc_pool_map;
 
-static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */
+static DEFINE_MUTEX(svc_pool_map_mutex);/* serialises svc_pool_map updates */
 
 /*
  * Pool modes that were historically accepted. They no longer select
@@ -102,32 +102,12 @@ param_get_pool_mode(char *buf, const struct kernel_param *kp)
 module_param_call(pool_mode, param_set_pool_mode, param_get_pool_mode,
 		  NULL, 0644);
 
-/*
- * Allocate the to_pool[] and pool_to[] arrays.
- * Returns 0 on success or an errno.
- */
-static int
-svc_pool_map_alloc_arrays(struct svc_pool_map *m, unsigned int maxpools)
-{
-	m->to_pool = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL);
-	if (!m->to_pool)
-		goto fail;
-	m->pool_to = kcalloc(maxpools, sizeof(unsigned int), GFP_KERNEL);
-	if (!m->pool_to)
-		goto fail_free;
-
-	return 0;
-
-fail_free:
-	kfree(m->to_pool);
-	m->to_pool = NULL;
-fail:
-	return -ENOMEM;
-}
-
 /*
  * Initialise the pool map for one pool per NUMA node.
  * Returns number of pools or <0 on error.
+ *
+ * pool_to[] is published with rcu_assign_pointer() only once fully built,
+ * so a lockless reader sees either NULL or a complete map.
  */
 static int
 svc_pool_map_init_pernode(struct svc_pool_map *m)
@@ -135,21 +115,30 @@ svc_pool_map_init_pernode(struct svc_pool_map *m)
 	unsigned int maxpools = nr_node_ids;
 	unsigned int pidx = 0;
 	unsigned int node;
-	int err;
+	unsigned int *to_pool, *pool_to;
 
-	err = svc_pool_map_alloc_arrays(m, maxpools);
-	if (err)
-		return err;
+	to_pool = kcalloc(maxpools, sizeof(*to_pool), GFP_KERNEL);
+	if (!to_pool)
+		return -ENOMEM;
+	pool_to = kcalloc(maxpools, sizeof(*pool_to), GFP_KERNEL);
+	if (!pool_to) {
+		kfree(to_pool);
+		return -ENOMEM;
+	}
 
 	for_each_node_with_cpus(node) {
 		/* some architectures (e.g. SN2) have cpuless nodes */
 		BUG_ON(pidx > maxpools);
-		m->to_pool[node] = pidx;
-		m->pool_to[pidx] = node;
+		to_pool[node] = pidx;
+		pool_to[pidx] = node;
 		pidx++;
 	}
 	/* nodes brought online later all get mapped to pool0, sorry */
 
+	m->npools = pidx;
+	m->to_pool = to_pool;
+	rcu_assign_pointer(m->pool_to, pool_to);
+
 	return pidx;
 }
 
@@ -178,7 +167,6 @@ svc_pool_map_get(void)
 		mutex_unlock(&svc_pool_map_mutex);
 		return 0;
 	}
-	m->npools = npools;
 	mutex_unlock(&svc_pool_map_mutex);
 	return npools;
 }
@@ -195,11 +183,21 @@ svc_pool_map_put(void)
 
 	mutex_lock(&svc_pool_map_mutex);
 	if (!--m->count) {
+		unsigned int *pool_to;
+
+		/* Protected by svc_pool_map_mutex */
+		pool_to = rcu_dereference_protected(m->pool_to, 1);
+
+		/*
+		 * Defer the pool_to[] free past a grace period; a lockless
+		 * reader may still hold it. to_pool[] has no such readers, so
+		 * free it directly.
+		 */
+		rcu_assign_pointer(m->pool_to, NULL);
 		kfree(m->to_pool);
 		m->to_pool = NULL;
-		kfree(m->pool_to);
-		m->pool_to = NULL;
 		m->npools = 0;
+		kfree_rcu_mightsleep(pool_to);
 	}
 	mutex_unlock(&svc_pool_map_mutex);
 }
@@ -207,10 +205,15 @@ svc_pool_map_put(void)
 static int svc_pool_map_get_node(unsigned int pidx)
 {
 	const struct svc_pool_map *m = &svc_pool_map;
+	const unsigned int *pool_to;
+	int node = numa_mem_id();
 
-	if (m->count)
-		return m->pool_to[pidx];
-	return numa_mem_id();
+	rcu_read_lock();
+	pool_to = rcu_dereference(m->pool_to);
+	if (pool_to)
+		node = pool_to[pidx];
+	rcu_read_unlock();
+	return node;
 }
 
 /*
@@ -221,17 +224,19 @@ static inline void
 svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx)
 {
 	struct svc_pool_map *m = &svc_pool_map;
-	unsigned int node = m->pool_to[pidx];
+	const unsigned int *pool_to;
 
 	/*
-	 * The caller checks for sv_nrpools > 1, which
-	 * implies that we've been initialized.
+	 * The caller checks for sv_nrpools > 1, so its service holds a
+	 * reference to the map: pool_to[] is allocated and cannot be freed
+	 * under us. No RCU read lock is needed; the held reference keeps the
+	 * array stable.
 	 */
-	WARN_ON_ONCE(m->count == 0);
-	if (m->count == 0)
+	pool_to = rcu_dereference_protected(m->pool_to, 1);
+	if (WARN_ON_ONCE(!pool_to))
 		return;
 
-	set_cpus_allowed_ptr(task, cpumask_of_node(node));
+	set_cpus_allowed_ptr(task, cpumask_of_node(pool_to[pidx]));
 }
 
 /**
-- 
2.54.0


  parent reply	other threads:[~2026-06-30 12:48 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-29 17:48 [PATCH v3 0/4] sunrpc: hardcode pool_mode to pernode, remove other modes Jeff Layton
2026-06-29 17:48 ` [PATCH v3 1/4] sunrpc: route to a populated pool in svc_pool_for_cpu() Jeff Layton
2026-06-29 17:48 ` [PATCH v3 2/4] sunrpc: hardcode pool_mode to pernode, remove other modes Jeff Layton
2026-06-30 17:24   ` Chuck Lever
2026-06-29 17:48 ` [PATCH v3 3/4] sunrpc: guarantee a thread per CPU-bearing node when auto-distributing Jeff Layton
2026-06-29 17:48 ` [PATCH v3 4/4] sunrpc: eliminate a modulus operation from the enqueueing codepath Jeff Layton
2026-06-30 12:48 ` Jeff Layton [this message]
2026-07-01  1:50   ` [PATCH 5/4] sunrpc: protect the svc_pool_map pool_to[] array with RCU NeilBrown

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260630124847.289974-1-jlayton@kernel.org \
    --to=jlayton@kernel.org \
    --cc=Dai.Ngo@oracle.com \
    --cc=anna@kernel.org \
    --cc=cel@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=neil@brown.name \
    --cc=okorniev@redhat.com \
    --cc=tom@talpey.com \
    --cc=trondmy@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox