public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Tejun Heo <tj@kernel.org>
To: jiangshanlai@gmail.com
Cc: linux-kernel@vger.kernel.org, Naohiro.Aota@wdc.com,
	kernel-team@meta.com, Tejun Heo <tj@kernel.org>
Subject: [PATCH 08/10] workqueue: Introduce struct wq_node_nr_active
Date: Wed, 20 Dec 2023 16:24:39 +0900	[thread overview]
Message-ID: <20231220072529.1036099-9-tj@kernel.org> (raw)
In-Reply-To: <20231220072529.1036099-1-tj@kernel.org>

Currently, for both percpu and unbound workqueues, max_active applies
per-cpu, which is a recent change for unbound workqueues. The change for
unbound workqueues was a significant departure from the previous behavior of
per-node application. It made some use cases create an undesirable number of
concurrent work items and left no good way of fixing them. To address the
problem, workqueue is implementing a NUMA node segmented global nr_active
mechanism, which will be explained further in the next patch.

As a preparation, this patch introduces struct wq_node_nr_active. It's a
data structure allocated for each workqueue and NUMA node pair and
currently only tracks the workqueue's number of active work items on the
node. This is split out from the next patch to make it easier to understand
and review.

Note that there is an extra wq_node_nr_active allocated for the invalid node
nr_node_ids which is used to track nr_active for pools which don't have a NUMA
node associated, such as the default fallback system-wide pool.

This doesn't cause any behavior changes visible to userland yet. The next
patch will expand to implement the control mechanism on top.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 132 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 129 insertions(+), 3 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9982c63470e5..6aa6f2eee94e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -278,6 +278,16 @@ struct wq_flusher {
 
 struct wq_device;
 
+/*
+ * Unlike in a per-cpu workqueue where max_active limits its concurrency level
+ * on each CPU, in an unbound workqueue, max_active applies to the whole system.
+ * As sharing a single nr_active across multiple sockets can be very expensive,
+ * the counting and enforcement is per NUMA node.
+ */
+struct wq_node_nr_active {
+	atomic_t		count;		/* per-node nr_active count */
+};
+
 /*
  * The externally visible workqueue.  It relays the issued work items to
  * the appropriate worker_pool through its pool_workqueues.
@@ -324,6 +334,7 @@ struct workqueue_struct {
 	/* hot fields used during command issue, aligned to cacheline */
 	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
 	struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
+	struct wq_node_nr_active **node_nr_active; /* I: per-node nr_active */
 };
 
 static struct kmem_cache *pwq_cache;
@@ -1398,6 +1409,31 @@ work_func_t wq_worker_last_func(struct task_struct *task)
 	return worker->last_func;
 }
 
+/**
+ * wq_node_nr_active - Determine wq_node_nr_active to use
+ * @wq: workqueue of interest
+ * @node: NUMA node, can be %NUMA_NO_NODE
+ *
+ * Determine wq_node_nr_active to use for @wq on @node. Returns:
+ *
+ * - %NULL for per-cpu workqueues as they don't need to use shared nr_active.
+ *
+ * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE.
+ *
+ * - Otherwise, node_nr_active[@node].
+ */
+static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq,
+						   int node)
+{
+	if (!(wq->flags & WQ_UNBOUND))
+		return NULL;
+
+	if (node == NUMA_NO_NODE)
+		node = nr_node_ids;
+
+	return wq->node_nr_active[node];
+}
+
 /**
  * get_pwq - get an extra reference on the specified pool_workqueue
  * @pwq: pool_workqueue to get
@@ -1476,12 +1512,17 @@ static bool pwq_activate_work(struct pool_workqueue *pwq,
 			      struct work_struct *work)
 {
 	struct worker_pool *pool = pwq->pool;
+	struct wq_node_nr_active *nna;
 
 	lockdep_assert_held(&pool->lock);
 
 	if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE))
 		return false;
 
+	nna = wq_node_nr_active(pwq->wq, pool->node);
+	if (nna)
+		atomic_inc(&nna->count);
+
 	pwq->nr_active++;
 	__pwq_activate_work(pwq, work);
 	return true;
@@ -1498,12 +1539,21 @@ static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq)
 {
 	struct workqueue_struct *wq = pwq->wq;
 	struct worker_pool *pool = pwq->pool;
-	bool obtained;
+	struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node);
+	bool obtained = false;
 
 	lockdep_assert_held(&pool->lock);
 
-	obtained = pwq->nr_active < wq->max_active;
+	if (!nna) {
+		/* per-cpu workqueue, pwq->nr_active is sufficient */
+		obtained = pwq->nr_active < wq->max_active;
+		goto out;
+	}
+
+	atomic_inc(&nna->count);
+	obtained = true;
 
+out:
 	if (obtained)
 		pwq->nr_active++;
 	return obtained;
@@ -1542,10 +1592,26 @@ static bool pwq_activate_first_inactive(struct pool_workqueue *pwq)
 static void pwq_dec_nr_active(struct pool_workqueue *pwq)
 {
 	struct worker_pool *pool = pwq->pool;
+	struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node);
 
 	lockdep_assert_held(&pool->lock);
 
+	/*
+	 * @pwq->nr_active should be decremented for both percpu and unbound
+	 * workqueues.
+	 */
 	pwq->nr_active--;
+
+	/*
+	 * For a percpu workqueue, it's simple. Just need to kick the first
+	 * inactive work item on @pwq itself.
+	 */
+	if (!nna) {
+		pwq_activate_first_inactive(pwq);
+		return;
+	}
+
+	atomic_dec(&nna->count);
 	pwq_activate_first_inactive(pwq);
 }
 
@@ -4004,12 +4070,64 @@ static void wq_free_lockdep(struct workqueue_struct *wq)
 }
 #endif
 
+static void free_node_nr_active(struct wq_node_nr_active **ptr_ar)
+{
+	int i;
+
+	if (!ptr_ar)
+		return;
+	for (i = 0; i < nr_node_ids + 1; i++)
+		kfree(ptr_ar[i]);
+	kfree(ptr_ar);
+}
+
+static void init_node_nr_active(struct wq_node_nr_active *nna)
+{
+	atomic_set(&nna->count, 0);
+}
+
+/*
+ * Each node's nr_active counter will be accessed mostly from its own node and
+ * should be allocated in the node.
+ */
+static struct wq_node_nr_active **alloc_node_nr_active(void)
+{
+	struct wq_node_nr_active **nna_ar, *nna;
+	int node;
+
+	nna_ar = kcalloc(nr_node_ids + 1, sizeof(*nna_ar), GFP_KERNEL);
+	if (!nna_ar)
+		return NULL;
+
+	for_each_node(node) {
+		nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, node);
+		if (!nna)
+			goto err_free;
+		init_node_nr_active(nna);
+		nna_ar[node] = nna;
+	}
+
+	/* [nr_node_ids] is used as the fallback */
+	nna = kzalloc_node(sizeof(*nna), GFP_KERNEL, NUMA_NO_NODE);
+	if (!nna)
+		goto err_free;
+	init_node_nr_active(nna);
+	nna_ar[nr_node_ids] = nna;
+
+	return nna_ar;
+
+err_free:
+	free_node_nr_active(nna_ar);
+	return NULL;
+}
+
 static void rcu_free_wq(struct rcu_head *rcu)
 {
 	struct workqueue_struct *wq =
 		container_of(rcu, struct workqueue_struct, rcu);
 
 	wq_free_lockdep(wq);
+	free_node_nr_active(wq->node_nr_active);
 	free_percpu(wq->cpu_pwq);
 	free_workqueue_attrs(wq->unbound_attrs);
 	kfree(wq);
@@ -4800,8 +4918,14 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
 	wq_init_lockdep(wq);
 	INIT_LIST_HEAD(&wq->list);
 
+	if (flags & WQ_UNBOUND) {
+		wq->node_nr_active = alloc_node_nr_active();
+		if (!wq->node_nr_active)
+			goto err_unreg_lockdep;
+	}
+
 	if (alloc_and_link_pwqs(wq) < 0)
-		goto err_unreg_lockdep;
+		goto err_free_node_nr_active;
 
 	if (wq_online && init_rescuer(wq) < 0)
 		goto err_destroy;
@@ -4826,6 +4950,8 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
 
 	return wq;
 
+err_free_node_nr_active:
+	free_node_nr_active(wq->node_nr_active);
 err_unreg_lockdep:
 	wq_unregister_lockdep(wq);
 	wq_free_lockdep(wq);
-- 
2.43.0


  parent reply	other threads:[~2023-12-20  7:26 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-12-20  7:24 [PATCHSET wq/for-6.8] workqueue: Implement system-wide max_active for unbound workqueues Tejun Heo
2023-12-20  7:24 ` [PATCH 01/10] workqueue: Move pwq->max_active to wq->max_active Tejun Heo
2023-12-26  9:13   ` Lai Jiangshan
2023-12-26 20:05     ` Tejun Heo
2023-12-26 21:36       ` Tejun Heo
2023-12-20  7:24 ` [PATCH 02/10] workqueue: Factor out pwq_is_empty() Tejun Heo
2023-12-20  7:24 ` [PATCH 03/10] workqueue: Replace pwq_activate_inactive_work() with [__]pwq_activate_work() Tejun Heo
2023-12-20  7:24 ` [PATCH 04/10] workqueue: Move nr_active handling into helpers Tejun Heo
2023-12-26  9:12   ` Lai Jiangshan
2023-12-26 20:06     ` Tejun Heo
2023-12-20  7:24 ` [PATCH 05/10] workqueue: Make wq_adjust_max_active() round-robin pwqs while activating Tejun Heo
2023-12-20  7:24 ` [PATCH 06/10] workqueue: Add first_possible_node and node_nr_cpus[] Tejun Heo
2023-12-20  7:24 ` [PATCH 07/10] workqueue: Move pwq_dec_nr_in_flight() to the end of work item handling Tejun Heo
2023-12-20  7:24 ` Tejun Heo [this message]
2023-12-26  9:14   ` [PATCH 08/10] workqueue: Introduce struct wq_node_nr_active Lai Jiangshan
2023-12-26 20:12     ` Tejun Heo
2023-12-20  7:24 ` [PATCH 09/10] workqueue: Implement system-wide nr_active enforcement for unbound workqueues Tejun Heo
2023-12-20  7:24 ` [PATCH 10/10] workqueue: Reimplement ordered workqueue using shared nr_active Tejun Heo
2024-01-13  0:18   ` Tejun Heo
2023-12-20  9:20 ` [PATCHSET wq/for-6.8] workqueue: Implement system-wide max_active for unbound workqueues Lai Jiangshan
2023-12-21 23:01   ` Tejun Heo
2023-12-22  8:04     ` Lai Jiangshan
2023-12-22  9:08       ` Tejun Heo
2024-01-05  2:44 ` Naohiro Aota
2024-01-12  0:49   ` Tejun Heo
2024-01-13  0:17     ` Tejun Heo
2024-01-15  5:46     ` Naohiro Aota
2024-01-16 21:04       ` Tejun Heo
2024-01-30  2:24         ` Naohiro Aota
2024-01-30 16:11           ` Tejun Heo
  -- strict thread matches above, loose matches on Subject: below --
2024-01-25 17:05 [PATCHSET v3 wq/for-6.9] " Tejun Heo
2024-01-25 17:06 ` [PATCH 08/10] workqueue: Introduce struct wq_node_nr_active Tejun Heo
2024-01-29 16:02   ` Lai Jiangshan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231220072529.1036099-9-tj@kernel.org \
    --to=tj@kernel.org \
    --cc=Naohiro.Aota@wdc.com \
    --cc=jiangshanlai@gmail.com \
    --cc=kernel-team@meta.com \
    --cc=linux-kernel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox