Linux NFS development
 help / color / mirror / Atom feed
From: "Chuck Lever" <cel@kernel.org>
To: "Jeff Layton" <jlayton@kernel.org>,
	"Trond Myklebust" <trondmy@kernel.org>,
	"Anna Schumaker" <anna@kernel.org>, NeilBrown <neil@brown.name>,
	"Olga Kornievskaia" <okorniev@redhat.com>,
	"Dai Ngo" <Dai.Ngo@oracle.com>, "Tom Talpey" <tom@talpey.com>
Cc: linux-nfs@vger.kernel.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH v3 2/4] sunrpc: hardcode pool_mode to pernode, remove other modes
Date: Tue, 30 Jun 2026 13:24:45 -0400	[thread overview]
Message-ID: <3f46fdf3-553f-489f-9259-e98e3e1fb9e5@app.fastmail.com> (raw)
In-Reply-To: <20260629-sunrpc-pool-mode-v3-2-d92676606dfd@kernel.org>



On Mon, Jun 29, 2026, at 1:48 PM, Jeff Layton wrote:
> The SVC_POOL_AUTO/GLOBAL/PERCPU/PERNODE pool mode selection machinery
> was added when NUMA was new and the right default was unclear.  The
> default has always been "global" (a single pool for the whole service);
> the other modes were only used when an admin explicitly set the
> pool_mode parameter or asked for "auto", which then picked a mode from
> the host topology.

> diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
> index 82fb7faf563f..2f6938fe28b2 100644
> --- a/net/sunrpc/svc.c
> +++ b/net/sunrpc/svc.c

> @@ -58,62 +45,29 @@ enum {
> 
>  struct svc_pool_map {
>  	int count;			/* How many svc_servs use us */
> -	int mode;			/* Note: int not enum to avoid
> -					 * warnings about "enumeration value
> -					 * not handled in switch" */
>  	unsigned int npools;
> -	unsigned int *pool_to;		/* maps pool id to cpu or node */
> -	unsigned int *to_pool;		/* maps cpu or node to pool id */
> +	unsigned int *pool_to;		/* maps pool id to node */
> +	unsigned int *to_pool;		/* maps node to pool id */
>  };
> 
> -static struct svc_pool_map svc_pool_map = {
> -	.mode = SVC_POOL_DEFAULT
> -};
> +static struct svc_pool_map svc_pool_map;
> 
>  static DEFINE_MUTEX(svc_pool_map_mutex);/* protects svc_pool_map.count only */
> 
> -static int
> -__param_set_pool_mode(const char *val, struct svc_pool_map *m)
> -{
> -	int err, mode;
> -
> -	mutex_lock(&svc_pool_map_mutex);
> -
> -	err = 0;
> -	if (!strncmp(val, "auto", 4))
> -		mode = SVC_POOL_AUTO;
> -	else if (!strncmp(val, "global", 6))
> -		mode = SVC_POOL_GLOBAL;
> -	else if (!strncmp(val, "percpu", 6))
> -		mode = SVC_POOL_PERCPU;
> -	else if (!strncmp(val, "pernode", 7))
> -		mode = SVC_POOL_PERNODE;
> -	else
> -		err = -EINVAL;
> -
> -	if (err)
> -		goto out;
> -
> -	if (m->count == 0)
> -		m->mode = mode;
> -	else if (mode != m->mode)
> -		err = -EBUSY;
> -out:
> -	mutex_unlock(&svc_pool_map_mutex);
> -	return err;
> -}
> -
> -static int
> -param_set_pool_mode(const char *val, const struct kernel_param *kp)
> -{
> -	struct svc_pool_map *m = kp->arg;
> -
> -	return __param_set_pool_mode(val, m);
> -}
> +/*
> + * Pool modes that were historically accepted. They no longer select
> + * anything: the pool mode is always pernode. The names are retained
> + * only so that writing a previously-valid value still succeeds.
> + */
> +static const char * const pool_mode_names[] = {
> +	"auto", "global", "percpu", "pernode",
> +};
> 
>  int sunrpc_set_pool_mode(const char *val)
>  {
> -	return __param_set_pool_mode(val, &svc_pool_map);
> +	int idx = sysfs_match_string(pool_mode_names, val);
> +
> +	return idx < 0 ? idx : 0;
>  }
>  EXPORT_SYMBOL(sunrpc_set_pool_mode);
> 

sysfs_match_string() compares the whole string, whereas the old
__param_set_pool_mode() matched prefixes (strncmp(val, "global", 6)
and friends). An input like "globalx" that the old code accepted is
now rejected with -EINVAL.

The commit message says:

  Writing any previously-accepted value succeeds silently

The four documented names still succeed, including with a trailing
newline since sysfs_streq() tolerates one, so only prefix-extended
strings change.


> @@ -284,14 +158,13 @@ svc_pool_map_init_pernode(struct svc_pool_map *m)
>   * Add a reference to the global map of cpus to pools (and
>   * vice versa) if pools are in use.
>   * Initialise the map if we're the first user.
> - * Returns the number of pools. If this is '1', no reference
> - * was taken.
> + * Returns the number of pools, or 0 on failure.
>   */
>  static unsigned int
>  svc_pool_map_get(void)
>  {
>  	struct svc_pool_map *m = &svc_pool_map;
> -	int npools = -1;
> +	int npools;
> 
>  	mutex_lock(&svc_pool_map_mutex);
>  	if (m->count++) {

Regarding the kdoc modification above: with percpu removed the
map is node-to-pool; "cpus to pools" now contradicts the struct
field comments this same patch corrected ("maps pool id to
node" / "maps node to pool id").


> @@ -299,22 +172,11 @@ svc_pool_map_get(void)
>  		return m->npools;
>  	}
> 
> -	if (m->mode == SVC_POOL_AUTO)
> -		m->mode = svc_pool_map_choose_mode();
> -
> -	switch (m->mode) {
> -	case SVC_POOL_PERCPU:
> -		npools = svc_pool_map_init_percpu(m);
> -		break;
> -	case SVC_POOL_PERNODE:
> -		npools = svc_pool_map_init_pernode(m);
> -		break;
> -	}
> -
> +	npools = svc_pool_map_init_pernode(m);
>  	if (npools <= 0) {
> -		/* default, or memory allocation failure */
> -		npools = 1;
> -		m->mode = SVC_POOL_GLOBAL;
> +		m->count = 0;
> +		mutex_unlock(&svc_pool_map_mutex);
> +		return 0;
>  	}
>  	m->npools = npools;
>  	mutex_unlock(&svc_pool_map_mutex);

svc_pool_map_put() frees pool_to[] and sets it back to NULL when the
last pooled reference drops.

Can an unpooled svc_new_thread() running concurrently with an nfsd
start or stop observe m->count != 0 while pool_to is still NULL (the
first get) or already freed (the last put), and dereference it here?
The reads of m->count and m->pool_to are not under the mutex.

The old default global mode returned numa_mem_id() in this case and
never touched pool_to[]; with pernode as the default this path runs
for every server. Would it help to consult pool_to[] only when the
service holds a reference, the way svc_pool_map_set_cpumask() is
already gated in svc_new_thread()?

        if (serv->sv_nrpools > 1)
                svc_pool_map_set_cpumask(task, pool->sp_id);

An unpooled service (sv_nrpools == 1) could then take numa_mem_id()
without reading the shared map.


> @@ -346,14 +208,11 @@ static int svc_pool_map_get_node(unsigned int pidx)
>  {
>  	const struct svc_pool_map *m = &svc_pool_map;
> 
> -	if (m->count) {
> -		if (m->mode == SVC_POOL_PERCPU)
> -			return cpu_to_node(m->pool_to[pidx]);
> -		if (m->mode == SVC_POOL_PERNODE)
> -			return m->pool_to[pidx];
> -	}
> +	if (m->count)
> +		return m->pool_to[pidx];
>  	return numa_mem_id();
>  }
> +
>  /*
>   * Set the given thread's cpus_allowed mask so that it
>   * will only run on cpus in the given pool.

This path now dereferences pool_to[] whenever m->count is non-zero.

svc_new_thread() calls it for every service, pooled or not, holding
neither svc_pool_map_mutex nor a map reference:

        node = svc_pool_map_get_node(pool->sp_id);

Unpooled services reach this too: lockd and the NFS callback are both
created with svc_create(), which passes npools = 1 and never calls
svc_pool_map_get(), so their m->count is whatever a pooled nfsd has
left set.

svc_pool_map_get() bumps the count before pool_to[] is allocated, and
the whole sequence is serialized only against other map writers:

        mutex_lock(&svc_pool_map_mutex);
        if (m->count++) {
                ...
        }
        npools = svc_pool_map_init_pernode(m);   /* allocates pool_to */


> @@ -372,27 +231,15 @@ svc_pool_map_set_cpumask(struct task_struct 
> *task, unsigned int pidx)
>  	if (m->count == 0)
>  		return;
> 
> -	switch (m->mode) {
> -	case SVC_POOL_PERCPU:
> -	{
> -		set_cpus_allowed_ptr(task, cpumask_of(node));
> -		break;
> -	}
> -	case SVC_POOL_PERNODE:
> -	{
> -		set_cpus_allowed_ptr(task, cpumask_of_node(node));
> -		break;
> -	}
> -	}
> +	set_cpus_allowed_ptr(task, cpumask_of_node(node));
>  }

pool_to[pidx] is dereferenced before the m->count == 0 check, so if
count were ever 0 the NULL deref has already happened and the
WARN/return can never fire. The sv_nrpools > 1 gate at the call site
keeps this safe, so the guard is now dead code.

For the series as a whole, let's consider moving 5/4 to the front.
It looks like a fix that might want to be backported.


-- 
Chuck Lever

  reply	other threads:[~2026-06-30 17:25 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-29 17:48 [PATCH v3 0/4] sunrpc: hardcode pool_mode to pernode, remove other modes Jeff Layton
2026-06-29 17:48 ` [PATCH v3 1/4] sunrpc: route to a populated pool in svc_pool_for_cpu() Jeff Layton
2026-06-29 17:48 ` [PATCH v3 2/4] sunrpc: hardcode pool_mode to pernode, remove other modes Jeff Layton
2026-06-30 17:24   ` Chuck Lever [this message]
2026-06-29 17:48 ` [PATCH v3 3/4] sunrpc: guarantee a thread per CPU-bearing node when auto-distributing Jeff Layton
2026-06-29 17:48 ` [PATCH v3 4/4] sunrpc: eliminate a modulus operation from the enqueueing codepath Jeff Layton
2026-06-30 12:48 ` [PATCH 5/4] sunrpc: protect the svc_pool_map pool_to[] array with RCU Jeff Layton
2026-07-01  1:50   ` NeilBrown

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=3f46fdf3-553f-489f-9259-e98e3e1fb9e5@app.fastmail.com \
    --to=cel@kernel.org \
    --cc=Dai.Ngo@oracle.com \
    --cc=anna@kernel.org \
    --cc=jlayton@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=neil@brown.name \
    --cc=okorniev@redhat.com \
    --cc=tom@talpey.com \
    --cc=trondmy@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox