From: Peter Zijlstra <peterz@infradead.org>
To: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@kernel.org>,
hpa@zytor.com, linux-kernel@vger.kernel.org,
Linus Torvalds <torvalds@linux-foundation.org>,
pjt@google.com, cl@linux.com, riel@redhat.com,
bharata.rao@gmail.com, Andrew Morton <akpm@linux-foundation.org>,
Lee.Schermerhorn@hp.com, aarcange@redhat.com, danms@us.ibm.com,
suresh.b.siddha@intel.com, tglx@linutronix.de,
linux-tip-commits@vger.kernel.org
Subject: Re: [tip:sched/numa] sched/numa: Introduce sys_numa_{t,m}bind()
Date: Fri, 01 Jun 2012 00:03:46 +0200 [thread overview]
Message-ID: <1338501826.28384.133.camel@twins> (raw)
In-Reply-To: <1337934953.9783.162.camel@laptop>
On Fri, 2012-05-25 at 10:35 +0200, Peter Zijlstra wrote:
> What does the node distance table on that thing look like?
The patch below makes it boot and, I think, even balance correctly. But I'm not
happy with the patch, as I think it can be done more simply. The resulting
domain setup isn't minimal either.
---
Subject:
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu May 31 14:47:33 CEST 2012
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
include/linux/sched.h | 11 ++++++++
kernel/sched/core.c | 64 +++++++++++++++++++++++++++++++++++++++++++-------
kernel/sched/fair.c | 5 ++-
kernel/sched/sched.h | 2 +
4 files changed, 72 insertions(+), 10 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -878,6 +878,8 @@ struct sched_group_power {
* Number of busy cpus in this group.
*/
atomic_t nr_busy_cpus;
+
+ unsigned long cpumask[0]; /* iteration mask */
};
struct sched_group {
@@ -902,6 +904,15 @@ static inline struct cpumask *sched_grou
return to_cpumask(sg->cpumask);
}
+/*
+ * cpumask masking which cpus in the group are allowed to iterate up the domain
+ * tree.
+ */
+static inline struct cpumask *sched_group_mask(struct sched_group *sg)
+{
+ return to_cpumask(sg->sgp->cpumask);
+}
+
/**
* group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
* @group: The group whose first cpu is to be returned.
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6008,6 +6008,44 @@ struct sched_domain_topology_level {
struct sd_data data;
};
+/*
+ * Build an iteration mask that can exclude certain CPUs from the upwards
+ * domain traversal.
+ *
+ * Asymmetric node setups can result in situations where the domain tree is of
+ * unequal depth; make sure to skip domains that already cover the entire
+ * range.
+ *
+ * In that case build_sched_domains() will have terminated the iteration early
+ * and our sibling sd spans will be empty. Domains should always include the
+ * cpu they're built on, so check that.
+ *
+ */
+static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
+{
+ const struct cpumask *span = sched_domain_span(sd);
+ struct sd_data *sdd = sd->private;
+ struct sched_domain *sibling;
+ int i;
+
+ for_each_cpu(i, span) {
+ sibling = *per_cpu_ptr(sdd->sd, i);
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+ continue;
+
+ cpumask_set_cpu(i, sched_group_mask(sg));
+ }
+}
+
+/*
+ * Return the canonical balance cpu for this group; this is the first cpu
+ * of this group that's also in the iteration mask.
+ */
+int group_balance_cpu(struct sched_group *sg)
+{
+ return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
+}
+
static int
build_overlap_sched_groups(struct sched_domain *sd, int cpu)
{
@@ -6026,6 +6064,12 @@ build_overlap_sched_groups(struct sched_
if (cpumask_test_cpu(i, covered))
continue;
+ child = *per_cpu_ptr(sdd->sd, i);
+
+ /* See the comment near build_group_mask(). */
+ if (!cpumask_test_cpu(i, sched_domain_span(child)))
+ continue;
+
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
GFP_KERNEL, cpu_to_node(cpu));
@@ -6033,8 +6077,6 @@ build_overlap_sched_groups(struct sched_
goto fail;
sg_span = sched_group_cpus(sg);
-
- child = *per_cpu_ptr(sdd->sd, i);
if (child->child) {
child = child->child;
cpumask_copy(sg_span, sched_domain_span(child));
@@ -6044,13 +6086,18 @@ build_overlap_sched_groups(struct sched_
cpumask_or(covered, covered, sg_span);
sg->sgp = *per_cpu_ptr(sdd->sgp, i);
- atomic_inc(&sg->sgp->ref);
+ if (atomic_inc_return(&sg->sgp->ref) == 1)
+ build_group_mask(sd, sg);
+
+ /*
+ * Make sure the first group of this domain contains the
+ * canonical balance cpu. Otherwise the sched_domain iteration
+ * breaks. See update_sg_lb_stats().
+ */
if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
- cpumask_first(sg_span) == cpu) {
- WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span));
+ group_balance_cpu(sg) == cpu)
groups = sg;
- }
if (!first)
first = sg;
@@ -6123,6 +6170,7 @@ build_sched_groups(struct sched_domain *
cpumask_clear(sched_group_cpus(sg));
sg->sgp->power = 0;
+ cpumask_setall(sched_group_mask(sg));
for_each_cpu(j, span) {
if (get_group(j, sdd, NULL) != group)
@@ -6164,7 +6212,7 @@ static void init_sched_groups_power(int
sg = sg->next;
} while (sg != sd->groups);
- if (cpu != group_first_cpu(sg))
+ if (cpu != group_balance_cpu(sg))
return;
update_group_power(sd, cpu);
@@ -6572,7 +6620,7 @@ static int __sdt_alloc(const struct cpum
*per_cpu_ptr(sdd->sg, j) = sg;
- sgp = kzalloc_node(sizeof(struct sched_group_power),
+ sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sgp)
return -ENOMEM;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3846,7 +3846,7 @@ static inline void update_sg_lb_stats(st
int i;
if (local_group)
- balance_cpu = group_first_cpu(group);
+ balance_cpu = group_balance_cpu(group);
/* Tally up the load of all CPUs in the group */
max_cpu_load = 0;
@@ -3861,7 +3861,8 @@ static inline void update_sg_lb_stats(st
/* Bias balancing toward cpus of our domain */
if (local_group) {
- if (idle_cpu(i) && !first_idle_cpu) {
+ if (idle_cpu(i) && !first_idle_cpu &&
+ cpumask_test_cpu(i, sched_group_mask(group))) {
first_idle_cpu = 1;
balance_cpu = i;
}
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -542,6 +542,8 @@ DECLARE_PER_CPU(struct sched_domain *, s
DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain *, sd_node);
+extern int group_balance_cpu(struct sched_group *sg);
+
#endif /* CONFIG_SMP */
#include "stats.h"
next prev parent reply other threads:[~2012-05-31 22:04 UTC|newest]
Thread overview: 30+ messages / expand[flat|nested] mbox.gz Atom feed top
2012-05-18 10:42 [tip:sched/numa] sched/numa: Introduce sys_numa_{t,m}bind() tip-bot for Peter Zijlstra
2012-05-18 15:14 ` Rik van Riel
2012-05-18 15:25 ` Christoph Lameter
2012-05-18 15:33 ` Peter Zijlstra
2012-05-18 15:37 ` Christoph Lameter
2012-05-18 15:47 ` Peter Zijlstra
2012-05-18 15:35 ` Peter Zijlstra
2012-05-18 15:40 ` Peter Zijlstra
2012-05-18 15:47 ` Christoph Lameter
2012-05-18 15:49 ` Peter Zijlstra
2012-05-18 16:00 ` Christoph Lameter
2012-05-18 16:04 ` Peter Zijlstra
2012-05-18 16:07 ` Christoph Lameter
2012-05-18 15:48 ` Rik van Riel
2012-05-18 16:05 ` Peter Zijlstra
2012-05-19 11:19 ` Ingo Molnar
2012-05-19 11:09 ` Ingo Molnar
2012-05-19 10:32 ` Pekka Enberg
2012-05-20 2:23 ` David Rientjes
2012-05-21 8:40 ` Ingo Molnar
2012-05-22 2:16 ` David Rientjes
2012-05-22 2:42 ` David Rientjes
2012-05-22 12:04 ` Peter Zijlstra
2012-05-22 15:00 ` Peter Zijlstra
2012-05-23 16:00 ` Peter Zijlstra
2012-05-24 0:58 ` David Rientjes
2012-05-25 8:35 ` Peter Zijlstra
2012-05-31 22:03 ` Peter Zijlstra [this message]
2012-05-30 13:37 ` [tip:sched/urgent] sched: Fix SD_OVERLAP tip-bot for Peter Zijlstra
2012-05-30 13:38 ` [tip:sched/urgent] sched: Make sure to not re-read variables after validation tip-bot for Peter Zijlstra
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1338501826.28384.133.camel@twins \
--to=peterz@infradead.org \
--cc=Lee.Schermerhorn@hp.com \
--cc=aarcange@redhat.com \
--cc=akpm@linux-foundation.org \
--cc=bharata.rao@gmail.com \
--cc=cl@linux.com \
--cc=danms@us.ibm.com \
--cc=hpa@zytor.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-tip-commits@vger.kernel.org \
--cc=mingo@kernel.org \
--cc=pjt@google.com \
--cc=riel@redhat.com \
--cc=rientjes@google.com \
--cc=suresh.b.siddha@intel.com \
--cc=tglx@linutronix.de \
--cc=torvalds@linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).