[PATCH 08/14] workqueue: map an unbound workqueues to multiple per-node pool_workqueues

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Tejun Heo <tj@kernel.org>
To: laijs@cn.fujitsu.com
Cc: axboe@kernel.dk, jack@suse.cz, fengguang.wu@intel.com,
	jmoyer@redhat.com, zab@redhat.com, linux-kernel@vger.kernel.org,
	herbert@gondor.apana.org.au, davem@davemloft.net,
	linux-crypto@vger.kernel.org, Tejun Heo <tj@kernel.org>
Subject: [PATCH 08/14] workqueue: map an unbound workqueues to multiple per-node pool_workqueues
Date: Wed, 27 Mar 2013 23:43:34 -0700	[thread overview]
Message-ID: <1364453020-2829-9-git-send-email-tj@kernel.org> (raw)
In-Reply-To: <1364453020-2829-1-git-send-email-tj@kernel.org>

Currently, an unbound workqueue has only one "current" pool_workqueue
associated with it.  It may have multple pool_workqueues but only the
first pool_workqueue servies new work items.  For NUMA affinity, we
want to change this so that there are multiple current pool_workqueues
serving different NUMA nodes.

Introduce workqueue->numa_pwq_tbl[] which is indexed by NUMA node and
points to the pool_workqueue to use for each possible node.  This
replaces first_pwq() in __queue_work() and workqueue_congested().

numa_pwq_tbl[] is currently initialized to point to the same
pool_workqueue as first_pwq() so this patch doesn't make any behavior
changes.

v2: Use rcu_dereference_raw() in unbound_pwq_by_node() as the function
    may be called only with wq->mutex held.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 48 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 37 insertions(+), 11 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9297ea3..5b53705 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -257,6 +257,7 @@ struct workqueue_struct {
 	/* hot fields used during command issue, aligned to cacheline */
 	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
 	struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
+	struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
 };
 
 static struct kmem_cache *pwq_cache;
@@ -525,6 +526,22 @@ static struct pool_workqueue *first_pwq(struct workqueue_struct *wq)
 				      pwqs_node);
 }
 
+/**
+ * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
+ * @wq: the target workqueue
+ * @node: the node ID
+ *
+ * This must be called either with pwq_lock held or sched RCU read locked.
+ * If the pwq needs to be used beyond the locking in effect, the caller is
+ * responsible for guaranteeing that the pwq stays online.
+ */
+static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
+						  int node)
+{
+	assert_rcu_or_wq_mutex(wq);
+	return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
+}
+
 static unsigned int work_color_to_flags(int color)
 {
 	return color << WORK_STRUCT_COLOR_SHIFT;
@@ -1278,14 +1295,14 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
 	    WARN_ON_ONCE(!is_chained_work(wq)))
 		return;
 retry:
+	if (req_cpu == WORK_CPU_UNBOUND)
+		cpu = raw_smp_processor_id();
+
 	/* pwq which will be used unless @work is executing elsewhere */
-	if (!(wq->flags & WQ_UNBOUND)) {
-		if (cpu == WORK_CPU_UNBOUND)
-			cpu = raw_smp_processor_id();
+	if (!(wq->flags & WQ_UNBOUND))
 		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
-	} else {
-		pwq = first_pwq(wq);
-	}
+	else
+		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
 
 	/*
 	 * If @work was previously on a different pool, it might still be
@@ -1315,8 +1332,8 @@ retry:
 	 * pwq is determined and locked.  For unbound pools, we could have
 	 * raced with pwq release and it could already be dead.  If its
 	 * refcnt is zero, repeat pwq selection.  Note that pwqs never die
-	 * without another pwq replacing it as the first pwq or while a
-	 * work item is executing on it, so the retying is guaranteed to
+	 * without another pwq replacing it in the numa_pwq_tbl or while
+	 * work items are executing on it, so the retrying is guaranteed to
 	 * make forward-progress.
 	 */
 	if (unlikely(!pwq->refcnt)) {
@@ -3614,6 +3631,8 @@ static void init_and_link_pwq(struct pool_workqueue *pwq,
 			      struct worker_pool *pool,
 			      struct pool_workqueue **p_last_pwq)
 {
+	int node;
+
 	BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
 
 	pwq->pool = pool;
@@ -3640,8 +3659,11 @@ static void init_and_link_pwq(struct pool_workqueue *pwq,
 	/* link in @pwq */
 	list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
 
-	if (wq->flags & WQ_UNBOUND)
+	if (wq->flags & WQ_UNBOUND) {
 		copy_workqueue_attrs(wq->unbound_attrs, pool->attrs);
+		for_each_node(node)
+			rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
+	}
 
 	mutex_unlock(&wq->mutex);
 }
@@ -3756,12 +3778,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 					       struct lock_class_key *key,
 					       const char *lock_name, ...)
 {
+	size_t tbl_size = 0;
 	va_list args;
 	struct workqueue_struct *wq;
 	struct pool_workqueue *pwq;
 
 	/* allocate wq and format name */
-	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+	if (flags & WQ_UNBOUND)
+		tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
+
+	wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
 	if (!wq)
 		return NULL;
 
@@ -3989,7 +4015,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
 	if (!(wq->flags & WQ_UNBOUND))
 		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
 	else
-		pwq = first_pwq(wq);
+		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
 
 	ret = !list_empty(&pwq->delayed_works);
 	rcu_read_unlock_sched();
-- 
1.8.1.4

WARNING: multiple messages have this Message-ID (diff)

From: Tejun Heo <tj@kernel.org>
To: laijs@cn.fujitsu.com
Cc: axboe@kernel.dk, jack@suse.cz, fengguang.wu@intel.com,
	jmoyer@redhat.com, zab@redhat.com, linux-kernel@vger.kernel.org,
	herbert@gondor.hengli.com.au, davem@davemloft.net,
	linux-crypto@vger.kernel.org, Tejun Heo <tj@kernel.org>
Subject: [PATCH 08/14] workqueue: map an unbound workqueues to multiple per-node pool_workqueues
Date: Wed, 27 Mar 2013 23:43:34 -0700	[thread overview]
Message-ID: <1364453020-2829-9-git-send-email-tj@kernel.org> (raw)
In-Reply-To: <1364453020-2829-1-git-send-email-tj@kernel.org>

Currently, an unbound workqueue has only one "current" pool_workqueue
associated with it.  It may have multple pool_workqueues but only the
first pool_workqueue servies new work items.  For NUMA affinity, we
want to change this so that there are multiple current pool_workqueues
serving different NUMA nodes.

Introduce workqueue->numa_pwq_tbl[] which is indexed by NUMA node and
points to the pool_workqueue to use for each possible node.  This
replaces first_pwq() in __queue_work() and workqueue_congested().

numa_pwq_tbl[] is currently initialized to point to the same
pool_workqueue as first_pwq() so this patch doesn't make any behavior
changes.

v2: Use rcu_dereference_raw() in unbound_pwq_by_node() as the function
    may be called only with wq->mutex held.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/workqueue.c | 48 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 37 insertions(+), 11 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9297ea3..5b53705 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -257,6 +257,7 @@ struct workqueue_struct {
 	/* hot fields used during command issue, aligned to cacheline */
 	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
 	struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
+	struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
 };
 
 static struct kmem_cache *pwq_cache;
@@ -525,6 +526,22 @@ static struct pool_workqueue *first_pwq(struct workqueue_struct *wq)
 				      pwqs_node);
 }
 
+/**
+ * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
+ * @wq: the target workqueue
+ * @node: the node ID
+ *
+ * This must be called either with pwq_lock held or sched RCU read locked.
+ * If the pwq needs to be used beyond the locking in effect, the caller is
+ * responsible for guaranteeing that the pwq stays online.
+ */
+static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
+						  int node)
+{
+	assert_rcu_or_wq_mutex(wq);
+	return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
+}
+
 static unsigned int work_color_to_flags(int color)
 {
 	return color << WORK_STRUCT_COLOR_SHIFT;
@@ -1278,14 +1295,14 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
 	    WARN_ON_ONCE(!is_chained_work(wq)))
 		return;
 retry:
+	if (req_cpu == WORK_CPU_UNBOUND)
+		cpu = raw_smp_processor_id();
+
 	/* pwq which will be used unless @work is executing elsewhere */
-	if (!(wq->flags & WQ_UNBOUND)) {
-		if (cpu == WORK_CPU_UNBOUND)
-			cpu = raw_smp_processor_id();
+	if (!(wq->flags & WQ_UNBOUND))
 		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
-	} else {
-		pwq = first_pwq(wq);
-	}
+	else
+		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
 
 	/*
 	 * If @work was previously on a different pool, it might still be
@@ -1315,8 +1332,8 @@ retry:
 	 * pwq is determined and locked.  For unbound pools, we could have
 	 * raced with pwq release and it could already be dead.  If its
 	 * refcnt is zero, repeat pwq selection.  Note that pwqs never die
-	 * without another pwq replacing it as the first pwq or while a
-	 * work item is executing on it, so the retying is guaranteed to
+	 * without another pwq replacing it in the numa_pwq_tbl or while
+	 * work items are executing on it, so the retrying is guaranteed to
 	 * make forward-progress.
 	 */
 	if (unlikely(!pwq->refcnt)) {
@@ -3614,6 +3631,8 @@ static void init_and_link_pwq(struct pool_workqueue *pwq,
 			      struct worker_pool *pool,
 			      struct pool_workqueue **p_last_pwq)
 {
+	int node;
+
 	BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
 
 	pwq->pool = pool;
@@ -3640,8 +3659,11 @@ static void init_and_link_pwq(struct pool_workqueue *pwq,
 	/* link in @pwq */
 	list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
 
-	if (wq->flags & WQ_UNBOUND)
+	if (wq->flags & WQ_UNBOUND) {
 		copy_workqueue_attrs(wq->unbound_attrs, pool->attrs);
+		for_each_node(node)
+			rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
+	}
 
 	mutex_unlock(&wq->mutex);
 }
@@ -3756,12 +3778,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
 					       struct lock_class_key *key,
 					       const char *lock_name, ...)
 {
+	size_t tbl_size = 0;
 	va_list args;
 	struct workqueue_struct *wq;
 	struct pool_workqueue *pwq;
 
 	/* allocate wq and format name */
-	wq = kzalloc(sizeof(*wq), GFP_KERNEL);
+	if (flags & WQ_UNBOUND)
+		tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
+
+	wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
 	if (!wq)
 		return NULL;
 
@@ -3989,7 +4015,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
 	if (!(wq->flags & WQ_UNBOUND))
 		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
 	else
-		pwq = first_pwq(wq);
+		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
 
 	ret = !list_empty(&pwq->delayed_works);
 	rcu_read_unlock_sched();
-- 
1.8.1.4

next prev parent reply	other threads:[~2013-03-28  6:43 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2013-03-28  6:43 Subject: [PATCHSET v2 wq/for-3.10] workqueue: NUMA affinity for unbound workqueues Tejun Heo
2013-03-28  6:43 ` Tejun Heo
2013-03-28  6:43 ` [PATCH 01/14] workqueue: move pwq_pool_locking outside of get/put_unbound_pool() Tejun Heo
2013-03-28  6:43   ` Tejun Heo
2013-03-28  6:43 ` [PATCH 02/14] workqueue: add wq_numa_tbl_len and wq_numa_possible_cpumask[] Tejun Heo
2013-03-28  6:43   ` Tejun Heo
2013-03-28  6:43 ` [PATCH 03/14] workqueue: drop 'H' from kworker names of unbound worker pools Tejun Heo
2013-03-28  6:43   ` Tejun Heo
2013-03-28  6:43 ` [PATCH 04/14] workqueue: determine NUMA node of workers accourding to the allowed cpumask Tejun Heo
2013-03-28  6:43   ` Tejun Heo
2013-03-28  6:43 ` [PATCH 05/14] workqueue: add workqueue->unbound_attrs Tejun Heo
2013-03-28  6:43   ` Tejun Heo
2013-03-28  6:43 ` [PATCH 06/14] workqueue: make workqueue->name[] fixed len Tejun Heo
2013-03-28  6:43   ` Tejun Heo
2013-03-28  6:43 ` [PATCH 07/14] workqueue: move hot fields of workqueue_struct to the end Tejun Heo
2013-03-28  6:43   ` Tejun Heo
2013-03-28  6:43 ` Tejun Heo [this message]
2013-03-28  6:43   ` [PATCH 08/14] workqueue: map an unbound workqueues to multiple per-node pool_workqueues Tejun Heo
2013-03-28  6:43 ` [PATCH 09/14] workqueue: break init_and_link_pwq() into two functions and introduce alloc_unbound_pwq() Tejun Heo
2013-03-28  6:43   ` Tejun Heo
2013-03-28  6:43 ` [PATCH 10/14] workqueue: use NUMA-aware allocation for pool_workqueues Tejun Heo
2013-03-28  6:43   ` Tejun Heo
2013-03-28  6:43 ` [PATCH 11/14] workqueue: introduce numa_pwq_tbl_install() Tejun Heo
2013-03-28  6:43   ` Tejun Heo
2013-03-28  6:43 ` [PATCH 12/14] workqueue: introduce put_pwq_unlocked() Tejun Heo
2013-03-28  6:43   ` Tejun Heo
2013-03-28  6:43 ` [PATCH 13/14] workqueue: implement NUMA affinity for unbound workqueues Tejun Heo
2013-03-28  6:43   ` Tejun Heo
2013-03-29 22:44   ` [PATCH v4 " Tejun Heo
2013-03-29 22:44     ` Tejun Heo
     [not found]     ` <CACvQF50c3m3eMiGKctagoOe6s3uhehfFy733imBfnLKTXSqZ4A@mail.gmail.com>
     [not found]       ` <CAOS58YOgwB4s4-2e528T6SV36pDxLS3Zx+b5eR0L2kQjiZBEnw@mail.gmail.com>
2013-03-30 17:23         ` Lai Jiangshan
2013-03-30 17:23           ` Lai Jiangshan
2013-03-31 19:06           ` Tejun Heo
2013-03-31 19:06             ` Tejun Heo
2013-04-01 18:28   ` [PATCH v5 " Tejun Heo
2013-04-01 18:28     ` Tejun Heo
2013-03-28  6:43 ` [PATCH 14/14] workqueue: update sysfs interface to reflect NUMA awareness and a kernel param to disable NUMA affinity Tejun Heo
2013-03-28  6:43   ` Tejun Heo
2013-04-01 18:27 ` [PATCH 0.5/14] workqueue: fix memory leak in apply_workqueue_attrs() Tejun Heo
2013-04-01 18:27   ` Tejun Heo
2013-04-01 18:29 ` Subject: [PATCHSET v2 wq/for-3.10] workqueue: NUMA affinity for unbound workqueues Tejun Heo
2013-04-01 18:29   ` Tejun Heo

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:9297ea3 dfblob:5b53705 dfblob:9297ea3 dfblob:5b53705 )
 OR (
bs:"[PATCH 08/14] workqueue: map an unbound workqueues to multiple per-node pool_workqueues" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1364453020-2829-9-git-send-email-tj@kernel.org \
    --to=tj@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=davem@davemloft.net \
    --cc=fengguang.wu@intel.com \
    --cc=herbert@gondor.apana.org.au \
    --cc=jack@suse.cz \
    --cc=jmoyer@redhat.com \
    --cc=laijs@cn.fujitsu.com \
    --cc=linux-crypto@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=zab@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.