From: Tejun Heo <tj@kernel.org>
To: laijs@cn.fujitsu.com
Cc: axboe@kernel.dk, jack@suse.cz, fengguang.wu@intel.com,
jmoyer@redhat.com, zab@redhat.com, linux-kernel@vger.kernel.org,
herbert@gondor.apana.org.au, davem@davemloft.net,
linux-crypto@vger.kernel.org, Tejun Heo <tj@kernel.org>
Subject: [PATCH 14/14] workqueue: update sysfs interface to reflect NUMA awareness and a kernel param to disable NUMA affinity
Date: Wed, 27 Mar 2013 23:43:40 -0700 [thread overview]
Message-ID: <1364453020-2829-15-git-send-email-tj@kernel.org> (raw)
In-Reply-To: <1364453020-2829-1-git-send-email-tj@kernel.org>
Unbound workqueues are now NUMA aware. Let's add some control knobs
and update sysfs interface accordingly.
* Add kernel param workqueue.disable_numa which disables NUMA affinity
globally.
* Replace sysfs file "pool_id" with "pool_ids" which contains
node:pool_id pairs. This change is userland-visible but "pool_id"
hasn't seen a release yet, so this is okay.
* Add a new sysfs file "numa" which can toggle NUMA affinity on
individual workqueues. This is implemented as attrs->no_numa which
is special in that it isn't part of a pool's attributes. It only
affects how apply_workqueue_attrs() picks which pools to use.
After "pool_ids" change, first_pwq() doesn't have any user left.
Removed.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
Documentation/kernel-parameters.txt | 9 ++++
include/linux/workqueue.h | 5 +++
kernel/workqueue.c | 82 ++++++++++++++++++++++++++-----------
3 files changed, 73 insertions(+), 23 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4609e81..c75ea0b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3222,6 +3222,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
or other driver-specific files in the
Documentation/watchdog/ directory.
+ workqueue.disable_numa
+ By default, all work items queued to unbound
+ workqueues are affine to the NUMA nodes they're
+ issued on, which results in better behavior in
+ general. If NUMA affinity needs to be disabled for
+ whatever reason, this option can be used. Note
+ that this also can be controlled per-workqueue for
+ workqueues visible under /sys/bus/workqueue/.
+
x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
default x2apic cluster mode on platforms
supporting x2apic.
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 835d12b..7179756 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -119,10 +119,15 @@ struct delayed_work {
/*
* A struct for workqueue attributes. This can be used to change
* attributes of an unbound workqueue.
+ *
+ * Unlike other fields, ->no_numa isn't a property of a worker_pool. It
+ * only modifies how apply_workqueue_attrs() select pools and thus doesn't
+ * participate in pool hash calculations or equality comparisons.
*/
struct workqueue_attrs {
int nice; /* nice level */
cpumask_var_t cpumask; /* allowed CPUs */
+ bool no_numa; /* disable NUMA affinity */
};
static inline struct delayed_work *to_delayed_work(struct work_struct *work)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 637debe..0b6a3b0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -268,6 +268,9 @@ static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
static cpumask_var_t *wq_numa_possible_cpumask;
/* possible CPUs of each node */
+static bool wq_disable_numa;
+module_param_named(disable_numa, wq_disable_numa, bool, 0444);
+
static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
@@ -517,21 +520,6 @@ static int worker_pool_assign_id(struct worker_pool *pool)
}
/**
- * first_pwq - return the first pool_workqueue of the specified workqueue
- * @wq: the target workqueue
- *
- * This must be called either with wq->mutex held or sched RCU read locked.
- * If the pwq needs to be used beyond the locking in effect, the caller is
- * responsible for guaranteeing that the pwq stays online.
- */
-static struct pool_workqueue *first_pwq(struct workqueue_struct *wq)
-{
- assert_rcu_or_wq_mutex(wq);
- return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue,
- pwqs_node);
-}
-
-/**
* unbound_pwq_by_node - return the unbound pool_workqueue for the given node
* @wq: the target workqueue
* @node: the node ID
@@ -3114,16 +3102,21 @@ static struct device_attribute wq_sysfs_attrs[] = {
__ATTR_NULL,
};
-static ssize_t wq_pool_id_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t wq_pool_ids_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct workqueue_struct *wq = dev_to_wq(dev);
- struct worker_pool *pool;
- int written;
+ const char *delim = "";
+ int node, written = 0;
rcu_read_lock_sched();
- pool = first_pwq(wq)->pool;
- written = scnprintf(buf, PAGE_SIZE, "%d\n", pool->id);
+ for_each_node(node) {
+ written += scnprintf(buf + written, PAGE_SIZE - written,
+ "%s%d:%d", delim, node,
+ unbound_pwq_by_node(wq, node)->pool->id);
+ delim = " ";
+ }
+ written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
rcu_read_unlock_sched();
return written;
@@ -3212,10 +3205,46 @@ static ssize_t wq_cpumask_store(struct device *dev,
return ret ?: count;
}
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%d\n",
+ !wq->unbound_attrs->no_numa);
+ mutex_unlock(&wq->mutex);
+
+ return written;
+}
+
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int v, ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ ret = -EINVAL;
+ if (sscanf(buf, "%d", &v) == 1) {
+ attrs->no_numa = !v;
+ ret = apply_workqueue_attrs(wq, attrs);
+ }
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
static struct device_attribute wq_sysfs_unbound_attrs[] = {
- __ATTR(pool_id, 0444, wq_pool_id_show, NULL),
+ __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+ __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
__ATTR_NULL,
};
@@ -3750,7 +3779,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq)
static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
int cpu_going_down, cpumask_t *cpumask)
{
- if (!wq_numa_enabled)
+ if (!wq_numa_enabled || attrs->no_numa)
goto use_dfl;
/* does @node have any online CPUs @attrs wants? */
@@ -3940,6 +3969,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
cpumask = target_attrs->cpumask;
retry:
mutex_lock(&wq->mutex);
+ if (wq->unbound_attrs->no_numa)
+ goto out_unlock;
copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
pwq = unbound_pwq_by_node(wq, node);
@@ -4757,6 +4788,11 @@ static void __init wq_numa_init(void)
if (num_possible_nodes() <= 1)
return;
+ if (wq_disable_numa) {
+ pr_info("workqueue: NUMA affinity support disabled\n");
+ return;
+ }
+
wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
BUG_ON(!wq_update_unbound_numa_attrs_buf);
--
1.8.1.4
WARNING: multiple messages have this Message-ID (diff)
From: Tejun Heo <tj@kernel.org>
To: laijs@cn.fujitsu.com
Cc: axboe@kernel.dk, jack@suse.cz, fengguang.wu@intel.com,
jmoyer@redhat.com, zab@redhat.com, linux-kernel@vger.kernel.org,
herbert@gondor.hengli.com.au, davem@davemloft.net,
linux-crypto@vger.kernel.org, Tejun Heo <tj@kernel.org>
Subject: [PATCH 14/14] workqueue: update sysfs interface to reflect NUMA awareness and a kernel param to disable NUMA affinity
Date: Wed, 27 Mar 2013 23:43:40 -0700 [thread overview]
Message-ID: <1364453020-2829-15-git-send-email-tj@kernel.org> (raw)
In-Reply-To: <1364453020-2829-1-git-send-email-tj@kernel.org>
Unbound workqueues are now NUMA aware. Let's add some control knobs
and update sysfs interface accordingly.
* Add kernel param workqueue.disable_numa which disables NUMA affinity
globally.
* Replace sysfs file "pool_id" with "pool_ids" which contains
node:pool_id pairs. This change is userland-visible but "pool_id"
hasn't seen a release yet, so this is okay.
* Add a new sysfs file "numa" which can toggle NUMA affinity on
individual workqueues. This is implemented as attrs->no_numa which
is special in that it isn't part of a pool's attributes. It only
affects how apply_workqueue_attrs() picks which pools to use.
After "pool_ids" change, first_pwq() doesn't have any user left.
Removed.
Signed-off-by: Tejun Heo <tj@kernel.org>
---
Documentation/kernel-parameters.txt | 9 ++++
include/linux/workqueue.h | 5 +++
kernel/workqueue.c | 82 ++++++++++++++++++++++++++-----------
3 files changed, 73 insertions(+), 23 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4609e81..c75ea0b 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3222,6 +3222,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
or other driver-specific files in the
Documentation/watchdog/ directory.
+ workqueue.disable_numa
+ By default, all work items queued to unbound
+ workqueues are affine to the NUMA nodes they're
+ issued on, which results in better behavior in
+ general. If NUMA affinity needs to be disabled for
+ whatever reason, this option can be used. Note
+ that this also can be controlled per-workqueue for
+ workqueues visible under /sys/bus/workqueue/.
+
x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of
default x2apic cluster mode on platforms
supporting x2apic.
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 835d12b..7179756 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -119,10 +119,15 @@ struct delayed_work {
/*
* A struct for workqueue attributes. This can be used to change
* attributes of an unbound workqueue.
+ *
+ * Unlike other fields, ->no_numa isn't a property of a worker_pool. It
+ * only modifies how apply_workqueue_attrs() select pools and thus doesn't
+ * participate in pool hash calculations or equality comparisons.
*/
struct workqueue_attrs {
int nice; /* nice level */
cpumask_var_t cpumask; /* allowed CPUs */
+ bool no_numa; /* disable NUMA affinity */
};
static inline struct delayed_work *to_delayed_work(struct work_struct *work)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 637debe..0b6a3b0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -268,6 +268,9 @@ static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
static cpumask_var_t *wq_numa_possible_cpumask;
/* possible CPUs of each node */
+static bool wq_disable_numa;
+module_param_named(disable_numa, wq_disable_numa, bool, 0444);
+
static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
@@ -517,21 +520,6 @@ static int worker_pool_assign_id(struct worker_pool *pool)
}
/**
- * first_pwq - return the first pool_workqueue of the specified workqueue
- * @wq: the target workqueue
- *
- * This must be called either with wq->mutex held or sched RCU read locked.
- * If the pwq needs to be used beyond the locking in effect, the caller is
- * responsible for guaranteeing that the pwq stays online.
- */
-static struct pool_workqueue *first_pwq(struct workqueue_struct *wq)
-{
- assert_rcu_or_wq_mutex(wq);
- return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue,
- pwqs_node);
-}
-
-/**
* unbound_pwq_by_node - return the unbound pool_workqueue for the given node
* @wq: the target workqueue
* @node: the node ID
@@ -3114,16 +3102,21 @@ static struct device_attribute wq_sysfs_attrs[] = {
__ATTR_NULL,
};
-static ssize_t wq_pool_id_show(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t wq_pool_ids_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct workqueue_struct *wq = dev_to_wq(dev);
- struct worker_pool *pool;
- int written;
+ const char *delim = "";
+ int node, written = 0;
rcu_read_lock_sched();
- pool = first_pwq(wq)->pool;
- written = scnprintf(buf, PAGE_SIZE, "%d\n", pool->id);
+ for_each_node(node) {
+ written += scnprintf(buf + written, PAGE_SIZE - written,
+ "%s%d:%d", delim, node,
+ unbound_pwq_by_node(wq, node)->pool->id);
+ delim = " ";
+ }
+ written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
rcu_read_unlock_sched();
return written;
@@ -3212,10 +3205,46 @@ static ssize_t wq_cpumask_store(struct device *dev,
return ret ?: count;
}
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%d\n",
+ !wq->unbound_attrs->no_numa);
+ mutex_unlock(&wq->mutex);
+
+ return written;
+}
+
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int v, ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ ret = -EINVAL;
+ if (sscanf(buf, "%d", &v) == 1) {
+ attrs->no_numa = !v;
+ ret = apply_workqueue_attrs(wq, attrs);
+ }
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
static struct device_attribute wq_sysfs_unbound_attrs[] = {
- __ATTR(pool_id, 0444, wq_pool_id_show, NULL),
+ __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
__ATTR(nice, 0644, wq_nice_show, wq_nice_store),
__ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+ __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
__ATTR_NULL,
};
@@ -3750,7 +3779,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq)
static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
int cpu_going_down, cpumask_t *cpumask)
{
- if (!wq_numa_enabled)
+ if (!wq_numa_enabled || attrs->no_numa)
goto use_dfl;
/* does @node have any online CPUs @attrs wants? */
@@ -3940,6 +3969,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
cpumask = target_attrs->cpumask;
retry:
mutex_lock(&wq->mutex);
+ if (wq->unbound_attrs->no_numa)
+ goto out_unlock;
copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
pwq = unbound_pwq_by_node(wq, node);
@@ -4757,6 +4788,11 @@ static void __init wq_numa_init(void)
if (num_possible_nodes() <= 1)
return;
+ if (wq_disable_numa) {
+ pr_info("workqueue: NUMA affinity support disabled\n");
+ return;
+ }
+
wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
BUG_ON(!wq_update_unbound_numa_attrs_buf);
--
1.8.1.4
next prev parent reply other threads:[~2013-03-28 6:43 UTC|newest]
Thread overview: 42+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-03-28 6:43 Subject: [PATCHSET v2 wq/for-3.10] workqueue: NUMA affinity for unbound workqueues Tejun Heo
2013-03-28 6:43 ` Tejun Heo
2013-03-28 6:43 ` [PATCH 01/14] workqueue: move pwq_pool_locking outside of get/put_unbound_pool() Tejun Heo
2013-03-28 6:43 ` Tejun Heo
2013-03-28 6:43 ` [PATCH 02/14] workqueue: add wq_numa_tbl_len and wq_numa_possible_cpumask[] Tejun Heo
2013-03-28 6:43 ` Tejun Heo
2013-03-28 6:43 ` [PATCH 03/14] workqueue: drop 'H' from kworker names of unbound worker pools Tejun Heo
2013-03-28 6:43 ` Tejun Heo
2013-03-28 6:43 ` [PATCH 04/14] workqueue: determine NUMA node of workers accourding to the allowed cpumask Tejun Heo
2013-03-28 6:43 ` Tejun Heo
2013-03-28 6:43 ` [PATCH 05/14] workqueue: add workqueue->unbound_attrs Tejun Heo
2013-03-28 6:43 ` Tejun Heo
2013-03-28 6:43 ` [PATCH 06/14] workqueue: make workqueue->name[] fixed len Tejun Heo
2013-03-28 6:43 ` Tejun Heo
2013-03-28 6:43 ` [PATCH 07/14] workqueue: move hot fields of workqueue_struct to the end Tejun Heo
2013-03-28 6:43 ` Tejun Heo
2013-03-28 6:43 ` [PATCH 08/14] workqueue: map an unbound workqueues to multiple per-node pool_workqueues Tejun Heo
2013-03-28 6:43 ` Tejun Heo
2013-03-28 6:43 ` [PATCH 09/14] workqueue: break init_and_link_pwq() into two functions and introduce alloc_unbound_pwq() Tejun Heo
2013-03-28 6:43 ` Tejun Heo
2013-03-28 6:43 ` [PATCH 10/14] workqueue: use NUMA-aware allocation for pool_workqueues Tejun Heo
2013-03-28 6:43 ` Tejun Heo
2013-03-28 6:43 ` [PATCH 11/14] workqueue: introduce numa_pwq_tbl_install() Tejun Heo
2013-03-28 6:43 ` Tejun Heo
2013-03-28 6:43 ` [PATCH 12/14] workqueue: introduce put_pwq_unlocked() Tejun Heo
2013-03-28 6:43 ` Tejun Heo
2013-03-28 6:43 ` [PATCH 13/14] workqueue: implement NUMA affinity for unbound workqueues Tejun Heo
2013-03-28 6:43 ` Tejun Heo
2013-03-29 22:44 ` [PATCH v4 " Tejun Heo
2013-03-29 22:44 ` Tejun Heo
[not found] ` <CACvQF50c3m3eMiGKctagoOe6s3uhehfFy733imBfnLKTXSqZ4A@mail.gmail.com>
[not found] ` <CAOS58YOgwB4s4-2e528T6SV36pDxLS3Zx+b5eR0L2kQjiZBEnw@mail.gmail.com>
2013-03-30 17:23 ` Lai Jiangshan
2013-03-30 17:23 ` Lai Jiangshan
2013-03-31 19:06 ` Tejun Heo
2013-03-31 19:06 ` Tejun Heo
2013-04-01 18:28 ` [PATCH v5 " Tejun Heo
2013-04-01 18:28 ` Tejun Heo
2013-03-28 6:43 ` Tejun Heo [this message]
2013-03-28 6:43 ` [PATCH 14/14] workqueue: update sysfs interface to reflect NUMA awareness and a kernel param to disable NUMA affinity Tejun Heo
2013-04-01 18:27 ` [PATCH 0.5/14] workqueue: fix memory leak in apply_workqueue_attrs() Tejun Heo
2013-04-01 18:27 ` Tejun Heo
2013-04-01 18:29 ` Subject: [PATCHSET v2 wq/for-3.10] workqueue: NUMA affinity for unbound workqueues Tejun Heo
2013-04-01 18:29 ` Tejun Heo
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1364453020-2829-15-git-send-email-tj@kernel.org \
--to=tj@kernel.org \
--cc=axboe@kernel.dk \
--cc=davem@davemloft.net \
--cc=fengguang.wu@intel.com \
--cc=herbert@gondor.apana.org.au \
--cc=jack@suse.cz \
--cc=jmoyer@redhat.com \
--cc=laijs@cn.fujitsu.com \
--cc=linux-crypto@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=zab@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.