All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chuck Lever <cel@kernel.org>
To: tj@kernel.org, jiangshanlai@gmail.com
Cc: <linux-kernel@vger.kernel.org>, Chuck Lever <chuck.lever@oracle.com>
Subject: [RFC PATCH] workqueue: Automatic affinity scope fallback for single-pod topologies
Date: Tue,  3 Feb 2026 09:37:44 -0500	[thread overview]
Message-ID: <20260203143744.16578-1-cel@kernel.org> (raw)

From: Chuck Lever <chuck.lever@oracle.com>

The default affinity scope WQ_AFFN_CACHE assumes systems have multiple
last-level caches. On systems where all CPUs share a single LLC (common
with Intel monolithic dies), this scope degenerates to a single worker
pool. All queue_work() calls then contend on that pool's single lock,
causing severe performance degradation under high-throughput workloads.

For example, on a 12-core system with a single shared L3 cache running
NFS over RDMA with 12 fio jobs, perf shows approximately 39% of CPU
cycles spent in native_queued_spin_lock_slowpath, nearly all from
__queue_work() contending on the single pool lock.

On such systems the WQ_AFFN_CACHE, WQ_AFFN_NUMA, and WQ_AFFN_SYSTEM scopes
all collapse to a single pod.

Add wq_effective_affn_scope() to detect when a selected affinity scope
provides only one pod despite having multiple CPUs, and automatically
fall back to a finer-grained scope. This ensures reasonable lock
distribution without requiring manual configuration via the
workqueue.default_affinity_scope parameter or per-workqueue sysfs
tuning.

The fallback is conservative: it triggers only when a scope degenerates
to exactly one pod, and respects explicitly configured (non-default)
scopes.

Also update wq_affn_scope_show() to display the effective scope when
fallback occurs, making the behavior transparent to administrators
via sysfs (e.g., "default (cache -> smt)").

Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 include/linux/workqueue.h |  7 ++++-
 kernel/workqueue.c        | 60 +++++++++++++++++++++++++++++++++++----
 2 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index dabc351cc127..130c452fcecf 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -128,10 +128,15 @@ struct rcu_work {
 	struct workqueue_struct *wq;
 };
 
+/*
+ * Affinity scopes are ordered from finest to coarsest granularity. This
+ * ordering is used by the automatic fallback logic in wq_effective_affn_scope()
+ * which walks from coarse toward fine when a scope degenerates to a single pod.
+ */
 enum wq_affn_scope {
 	WQ_AFFN_DFL,			/* use system default */
 	WQ_AFFN_CPU,			/* one pod per CPU */
-	WQ_AFFN_SMT,			/* one pod poer SMT */
+	WQ_AFFN_SMT,			/* one pod per SMT */
 	WQ_AFFN_CACHE,			/* one pod per LLC */
 	WQ_AFFN_NUMA,			/* one pod per NUMA node */
 	WQ_AFFN_SYSTEM,			/* one pod across the whole system */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 253311af47c6..efbc10ef79fb 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4753,6 +4753,39 @@ static void wqattrs_actualize_cpumask(struct workqueue_attrs *attrs,
 		cpumask_copy(attrs->cpumask, unbound_cpumask);
 }
 
+/*
+ * Determine the effective affinity scope. If the configured scope results
+ * in a single pod (e.g., WQ_AFFN_CACHE on a system with one shared LLC),
+ * fall back to a finer-grained scope to distribute pool lock contention.
+ *
+ * The search stops at WQ_AFFN_CPU, which always provides one pod per CPU
+ * and thus cannot degenerate further.
+ *
+ * Returns the scope to actually use, which may differ from the configured
+ * scope on systems where coarser scopes degenerate.
+ */
+static enum wq_affn_scope wq_effective_affn_scope(enum wq_affn_scope scope)
+{
+	struct wq_pod_type *pt;
+
+	/*
+	 * Walk from the requested scope toward finer granularity. Stop when
+	 * a scope provides more than one pod, or when CPU scope is reached.
+	 * CPU scope always provides num_possible_cpus() pods.
+	 */
+	while (scope > WQ_AFFN_CPU) {
+		pt = &wq_pod_types[scope];
+
+		/* Multiple pods at this scope; no fallback needed */
+		if (pt->nr_pods > 1)
+			break;
+
+		scope--;
+	}
+
+	return scope;
+}
+
 /* find wq_pod_type to use for @attrs */
 static const struct wq_pod_type *
 wqattrs_pod_type(const struct workqueue_attrs *attrs)
@@ -4763,8 +4796,13 @@ wqattrs_pod_type(const struct workqueue_attrs *attrs)
 	/* to synchronize access to wq_affn_dfl */
 	lockdep_assert_held(&wq_pool_mutex);
 
+	/*
+	 * For default scope, apply automatic fallback for degenerate
+	 * topologies. Explicit scope selection via sysfs or per-workqueue
+	 * attributes bypasses fallback, preserving administrator intent.
+	 */
 	if (attrs->affn_scope == WQ_AFFN_DFL)
-		scope = wq_affn_dfl;
+		scope = wq_effective_affn_scope(wq_affn_dfl);
 	else
 		scope = attrs->affn_scope;
 
@@ -7206,16 +7244,26 @@ static ssize_t wq_affn_scope_show(struct device *dev,
 				  struct device_attribute *attr, char *buf)
 {
 	struct workqueue_struct *wq = dev_to_wq(dev);
+	enum wq_affn_scope scope, effective;
 	int written;
 
 	mutex_lock(&wq->mutex);
-	if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL)
-		written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
-				    wq_affn_names[WQ_AFFN_DFL],
-				    wq_affn_names[wq_affn_dfl]);
-	else
+	if (wq->unbound_attrs->affn_scope == WQ_AFFN_DFL) {
+		scope = wq_affn_dfl;
+		effective = wq_effective_affn_scope(scope);
+		if (effective != scope)
+			written = scnprintf(buf, PAGE_SIZE, "%s (%s -> %s)\n",
+					    wq_affn_names[WQ_AFFN_DFL],
+					    wq_affn_names[scope],
+					    wq_affn_names[effective]);
+		else
+			written = scnprintf(buf, PAGE_SIZE, "%s (%s)\n",
+					    wq_affn_names[WQ_AFFN_DFL],
+					    wq_affn_names[scope]);
+	} else {
 		written = scnprintf(buf, PAGE_SIZE, "%s\n",
 				    wq_affn_names[wq->unbound_attrs->affn_scope]);
+	}
 	mutex_unlock(&wq->mutex);
 
 	return written;
-- 
2.52.0


             reply	other threads:[~2026-02-03 14:37 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-02-03 14:37 Chuck Lever [this message]
2026-02-03 19:10 ` [RFC PATCH] workqueue: Automatic affinity scope fallback for single-pod topologies Tejun Heo
2026-02-03 20:14   ` Chuck Lever
2026-02-03 20:29     ` Tejun Heo
2026-02-03 20:34       ` Chuck Lever
2026-02-03 20:43         ` Tejun Heo

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260203143744.16578-1-cel@kernel.org \
    --to=cel@kernel.org \
    --cc=chuck.lever@oracle.com \
    --cc=jiangshanlai@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.