All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chuck Lever <cel@kernel.org>
To: NeilBrown <neilb@ownmail.net>, Jeff Layton <jlayton@kernel.org>,
	Olga Kornievskaia <okorniev@redhat.com>,
	Dai Ngo <dai.ngo@oracle.com>, Tom Talpey <tom@talpey.com>,
	daire@dneg.com, Mike Snitzer <snitzer@kernel.org>
Cc: <linux-nfs@vger.kernel.org>, Chuck Lever <chuck.lever@oracle.com>
Subject: [RFC PATCH 1/7] workqueue: Automatic affinity scope fallback for single-pod topologies
Date: Thu,  5 Feb 2026 10:57:23 -0500	[thread overview]
Message-ID: <20260205155729.6841-2-cel@kernel.org> (raw)
In-Reply-To: <20260205155729.6841-1-cel@kernel.org>

From: Chuck Lever <chuck.lever@oracle.com>

The default affinity scope WQ_AFFN_CACHE assumes systems have
multiple last-level caches. On systems where all CPUs share a
single LLC (common with Intel monolithic dies), this scope
degenerates to a single worker pool. All queue_work() calls then
contend on that pool's single lock, causing severe performance
degradation under high-throughput workloads.

For example, on a 12-core system with a single shared L3 cache
running NFS over RDMA with 12 fio jobs, perf shows approximately
39% of CPU cycles spent in native_queued_spin_lock_slowpath,
nearly all from __queue_work() contending on the single pool lock.

On such systems the WQ_AFFN_CACHE and WQ_AFFN_NUMA scopes both
collapse to a single pod, while finer scopes such as WQ_AFFN_SMT
still provide one pod per core.

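Add wq_effective_affn_scope() to detect when a selected affinity
scope provides only one pod despite having multiple CPUs, and
automatically fall back to a finer-grained scope. This enables lock
distribution to scale with CPU count without requiring manual
configuration via the workqueue.default_affinity_scope parameter or
per-workqueue sysfs tuning.

Introduce a WQ_AFFN_CLUSTER scope ("cluster", one pod per CPU
cluster as reported by topology_cluster_cpumask()) between
WQ_AFFN_SMT and WQ_AFFN_CACHE, giving the fallback walk an
intermediate step between the LLC and SMT scopes.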

The fallback is conservative: it triggers only when a scope
degenerates to exactly one pod, and respects explicitly configured
(non-default) scopes.

Also update wq_affn_scope_show() to display the effective scope
when fallback occurs, making the behavior transparent to
administrators via sysfs (e.g., "default (cache -> smt)").

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/workqueue.h |  8 ++++-
 kernel/workqueue.c        | 68 +++++++++++++++++++++++++++++++++++----
 2 files changed, 69 insertions(+), 7 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index dabc351cc127..1fca5791337d 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -128,10 +128,16 @@ struct rcu_work {
 	struct workqueue_struct *wq;
 };
 
+/*
+ * Apart from WQ_AFFN_DFL, scopes are ordered from finest to coarsest. This
+ * ordering is used by the automatic fallback logic in wq_effective_affn_scope()
+ * which walks from coarse toward fine when a scope degenerates to a single pod.
+ */
 enum wq_affn_scope {
 	WQ_AFFN_DFL,			/* use system default */
 	WQ_AFFN_CPU,			/* one pod per CPU */
-	WQ_AFFN_SMT,			/* one pod poer SMT */
+	WQ_AFFN_SMT,			/* one pod per SMT */
+	WQ_AFFN_CLUSTER,		/* one pod per cluster */
 	WQ_AFFN_CACHE,			/* one pod per LLC */
 	WQ_AFFN_NUMA,			/* one pod per NUMA node */
 	WQ_AFFN_SYSTEM,			/* one pod across the whole system */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 253311af47c6..32598b9cd1c2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -405,6 +405,7 @@ static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
 	[WQ_AFFN_DFL]		= "default",
 	[WQ_AFFN_CPU]		= "cpu",
 	[WQ_AFFN_SMT]		= "smt",
+	[WQ_AFFN_CLUSTER]	= "cluster",
 	[WQ_AFFN_CACHE]		= "cache",
 	[WQ_AFFN_NUMA]		= "numa",
 	[WQ_AFFN_SYSTEM]	= "system",
@@ -4753,6 +4754,39 @@ static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs,
 		cpumask_copy(attrs->cpumask, unbound_cpumask);
 }
 
+/*
+ * Determine the effective affinity scope. If the configured scope results
+ * in a single pod (e.g., WQ_AFFN_CACHE on a system with one shared LLC),
+ * fall back to a finer-grained scope to distribute pool lock contention.
+ *
+ * The search stops at WQ_AFFN_CPU, which always provides one pod per CPU
+ * and thus cannot degenerate further.
+ *
+ * Returns the scope to actually use, which may differ from the configured
+ * scope on systems where coarser scopes degenerate.
+ */
+static enum wq_affn_scope wq_effective_affn_scope(enum wq_affn_scope scope)
+{
+	struct wq_pod_type *pt;
+
+	/*
+	 * Walk from the requested scope toward finer granularity. Stop
+	 * when a scope provides more than one pod, or when CPU scope is
+	 * reached. CPU scope always provides num_possible_cpus() pods.
+	 */
+	while (scope > WQ_AFFN_CPU) {
+		pt = &wq_pod_types[scope];
+
+		/* Multiple pods at this scope; no fallback needed */
+		if (pt->nr_pods > 1)
+			break;
+
+		scope--;
+	}
+
+	return scope;
+}
+
 /* find wq_pod_type to use for @attrs */
 static const struct wq_pod_type *
 wqattrs_pod_type(const struct workqueue_attrs *attrs)
@@ -4763,8 +4797,13 @@ wqattrs_pod_type(const struct workqueue_attrs *attrs)
 	/* to synchronize access to wq_affn_dfl */
 	lockdep_assert_held(&wq_pool_mutex);
 
+	/*
+	 * For default scope, apply automatic fallback for degenerate
+	 * topologies. Explicit scope selection via sysfs or per-workqueue
+	 * attributes bypasses fallback, preserving administrator intent.
+	 */
 	if (attrs->affn_scope == WQ_AFFN_DFL)
-		scope = wq_affn_dfl;
+		scope = wq_effective_affn_scope(wq_affn_dfl);
 	else
 		scope = attrs->affn_scope;
 
@@ -7206,16 +7245,27 @@ static ssize_t wq_affn_scope_show(struct device *dev,
 				  struct device_attribute *attr, char *buf)
 {
 	struct workqueue_struct *wq = dev_to_wq(dev);
+	enum wq_affn_scope scope, effective;
 	int written;
 
 	mutex_lock(&wq->mutex);
-	if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL)
-		written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
-				    wq_affn_names[WQ_AFFN_DFL],
-				    wq_affn_names[wq_affn_dfl]);
-	else
+	if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL) {
+		scope = wq_affn_dfl;
+		effective = wq_effective_affn_scope(scope);
+		if (wq_pod_types[effective].nr_pods >
+		    wq_pod_types[scope].nr_pods)
+			written = scnprintf(buf, PAGE_SIZE, "%s (%s -> %s)\n",
+					    wq_affn_names[WQ_AFFN_DFL],
+					    wq_affn_names[scope],
+					    wq_affn_names[effective]);
+		else
+			written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
+					    wq_affn_names[WQ_AFFN_DFL],
+					    wq_affn_names[scope]);
+	} else {
 		written = scnprintf(buf, PAGE_SIZE, "%s\n",
 				    wq_affn_names[wq->unbound_attrs->affn_scope]);
+	}
 	mutex_unlock(&wq->mutex);
 
 	return written;
@@ -8023,6 +8073,11 @@ static bool __init cpus_share_smt(int cpu0, int cpu1)
 #endif
 }
 
+static bool __init cpus_share_cluster(int cpu0, int cpu1)
+{
+	return cpumask_test_cpu(cpu0, topology_cluster_cpumask(cpu1));
+}
+
 static bool __init cpus_share_numa(int cpu0, int cpu1)
 {
 	return cpu_to_node(cpu0) == cpu_to_node(cpu1);
@@ -8042,6 +8097,7 @@ void __init workqueue_init_topology(void)
 
 	init_pod_type(&wq_pod_types[WQ_AFFN_CPU], cpus_dont_share);
 	init_pod_type(&wq_pod_types[WQ_AFFN_SMT], cpus_share_smt);
+	init_pod_type(&wq_pod_types[WQ_AFFN_CLUSTER], cpus_share_cluster);
 	init_pod_type(&wq_pod_types[WQ_AFFN_CACHE], cpus_share_cache);
 	init_pod_type(&wq_pod_types[WQ_AFFN_NUMA], cpus_share_numa);
 
-- 
2.52.0


  reply	other threads:[~2026-02-05 15:57 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-05 15:57 [RFC PATCH 0/7] sunrpc: Reduce lock contention for NFSD TCP sockets Chuck Lever
2026-02-05 15:57 ` Chuck Lever [this message]
2026-02-06 14:57   ` [RFC PATCH 1/7] workqueue: Automatic affinity scope fallback for single-pod topologies Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 2/7] sunrpc: split svc_data_ready into protocol-specific callbacks Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 3/7] sunrpc: add per-transport page recycling pool Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 4/7] sunrpc: add dedicated TCP receiver thread Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 5/7] sunrpc: implement flat combining for TCP socket sends Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 6/7] sunrpc: unify fore and backchannel server TCP send paths Chuck Lever
2026-02-05 15:57 ` [RFC PATCH 7/7] SUNRPC: Set explicit TCP socket buffer sizes for NFSD Chuck Lever
2026-03-30 18:57 ` [RFC PATCH 0/7] sunrpc: Reduce lock contention for NFSD TCP sockets Mike Snitzer
2026-03-30 19:04   ` Chuck Lever

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260205155729.6841-2-cel@kernel.org \
    --to=cel@kernel.org \
    --cc=chuck.lever@oracle.com \
    --cc=dai.ngo@oracle.com \
    --cc=daire@dneg.com \
    --cc=jlayton@kernel.org \
    --cc=linux-nfs@vger.kernel.org \
    --cc=neilb@ownmail.net \
    --cc=okorniev@redhat.com \
    --cc=snitzer@kernel.org \
    --cc=tom@talpey.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line
before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.