* [RFC][PATCH 01/14] sched: Remove obsolete arch_ prefixes
2011-03-14 15:06 [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
@ 2011-03-14 15:06 ` Peter Zijlstra
2011-03-14 15:06 ` [RFC][PATCH 02/14] sched: Simplify cpu_power initialization Peter Zijlstra
` (13 subsequent siblings)
14 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-14 15:06 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel
Cc: Benjamin Herrenschmidt, Anton Blanchard, Srivatsa Vaddagiri,
Suresh Siddha, Venkatesh Pallipadi, Paul Turner, Mike Galbraith,
Thomas Gleixner, Heiko Carstens, Andreas Herrmann, Peter Zijlstra
[-- Attachment #1: sched-remove-arch-prefixes.patch --]
[-- Type: text/plain, Size: 2392 bytes --]
Non-weak static functions are clearly not arch-specific, so remove the
arch_ prefix.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -232,7 +232,7 @@ static void destroy_rt_bandwidth(struct
#endif
/*
- * sched_domains_mutex serializes calls to arch_init_sched_domains,
+ * sched_domains_mutex serializes calls to init_sched_domains,
* detach_destroy_domains and partition_sched_domains.
*/
static DEFINE_MUTEX(sched_domains_mutex);
@@ -7646,7 +7646,7 @@ void free_sched_domains(cpumask_var_t do
* For now this just excludes isolated cpus, but could be used to
* exclude other special cases in the future.
*/
-static int arch_init_sched_domains(const struct cpumask *cpu_map)
+static int init_sched_domains(const struct cpumask *cpu_map)
{
int err;
@@ -7663,7 +7663,7 @@ static int arch_init_sched_domains(const
return err;
}
-static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
+static void destroy_sched_domains(const struct cpumask *cpu_map,
struct cpumask *tmpmask)
{
free_sched_groups(cpu_map, tmpmask);
@@ -7682,7 +7682,7 @@ static void detach_destroy_domains(const
for_each_cpu(i, cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
synchronize_sched();
- arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
+ destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
}
/* handle null as "default" */
@@ -7791,7 +7791,7 @@ void partition_sched_domains(int ndoms_n
}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void arch_reinit_sched_domains(void)
+static void reinit_sched_domains(void)
{
get_online_cpus();
@@ -7824,7 +7824,7 @@ static ssize_t sched_power_savings_store
else
sched_mc_power_savings = level;
- arch_reinit_sched_domains();
+ reinit_sched_domains();
return count;
}
@@ -7950,7 +7950,7 @@ void __init sched_init_smp(void)
#endif
get_online_cpus();
mutex_lock(&sched_domains_mutex);
- arch_init_sched_domains(cpu_active_mask);
+ init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
* [RFC][PATCH 02/14] sched: Simplify cpu_power initialization
2011-03-14 15:06 [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
2011-03-14 15:06 ` [RFC][PATCH 01/14] sched: Remove obsolete arch_ prefixes Peter Zijlstra
@ 2011-03-14 15:06 ` Peter Zijlstra
2011-03-14 21:35 ` Steven Rostedt
2011-03-14 15:06 ` [RFC][PATCH 03/14] sched: Simplify build_sched_groups Peter Zijlstra
` (12 subsequent siblings)
14 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-14 15:06 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel
Cc: Benjamin Herrenschmidt, Anton Blanchard, Srivatsa Vaddagiri,
Suresh Siddha, Venkatesh Pallipadi, Paul Turner, Mike Galbraith,
Thomas Gleixner, Heiko Carstens, Andreas Herrmann, Peter Zijlstra
[-- Attachment #1: sched-simplify-cpu_power.patch --]
[-- Type: text/plain, Size: 2778 bytes --]
The code in update_group_power() does what init_sched_groups_power()
does and more, so remove the special init_ code and call the generic
code instead.
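For reference, init_sched_groups_power() after this patch reduces to
the following (a sketch reconstructed from the hunks below, not a
verbatim copy):

static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
	WARN_ON(!sd || !sd->groups);

	/* Only the first CPU of a group initializes its cpu_power. */
	if (cpu != group_first_cpu(sd->groups))
		return;

	sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));

	/* The generic runtime path already computes cpu_power correctly. */
	update_group_power(sd, cpu);
}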
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
kernel/sched.c | 44 +++++---------------------------------------
1 file changed, 5 insertions(+), 39 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -6655,9 +6655,6 @@ cpu_attach_domain(struct sched_domain *s
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
- for (tmp = sd; tmp; tmp = tmp->parent)
- tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
-
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
struct sched_domain *parent = tmp->parent;
@@ -7135,11 +7132,6 @@ static void free_sched_groups(const stru
*/
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
- struct sched_domain *child;
- struct sched_group *group;
- long power;
- int weight;
-
WARN_ON(!sd || !sd->groups);
if (cpu != group_first_cpu(sd->groups))
@@ -7147,36 +7139,7 @@ static void init_sched_groups_power(int
sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
- child = sd->child;
-
- sd->groups->cpu_power = 0;
-
- if (!child) {
- power = SCHED_LOAD_SCALE;
- weight = cpumask_weight(sched_domain_span(sd));
- /*
- * SMT siblings share the power of a single core.
- * Usually multiple threads get a better yield out of
- * that one core than a single thread would have,
- * reflect that in sd->smt_gain.
- */
- if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
- power *= sd->smt_gain;
- power /= weight;
- power >>= SCHED_LOAD_SHIFT;
- }
- sd->groups->cpu_power += power;
- return;
- }
-
- /*
- * Add cpu_power of each child group to this groups cpu_power.
- */
- group = child->groups;
- do {
- sd->groups->cpu_power += group->cpu_power;
- group = group->next;
- } while (group != child->groups);
+ update_group_power(sd, cpu);
}
/*
@@ -7483,7 +7446,7 @@ static int __build_sched_domains(const s
{
enum s_alloc alloc_state = sa_none;
struct s_data d;
- struct sched_domain *sd;
+ struct sched_domain *sd, *tmp;
int i;
#ifdef CONFIG_NUMA
d.sd_allnodes = 0;
@@ -7506,6 +7469,9 @@ static int __build_sched_domains(const s
sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
+
+ for (tmp = sd; tmp; tmp = tmp->parent)
+ tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
}
for_each_cpu(i, cpu_map) {
* Re: [RFC][PATCH 02/14] sched: Simplify cpu_power initialization
2011-03-14 15:06 ` [RFC][PATCH 02/14] sched: Simplify cpu_power initialization Peter Zijlstra
@ 2011-03-14 21:35 ` Steven Rostedt
2011-03-14 21:46 ` Peter Zijlstra
0 siblings, 1 reply; 23+ messages in thread
From: Steven Rostedt @ 2011-03-14 21:35 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, linux-kernel, Benjamin Herrenschmidt,
Anton Blanchard, Srivatsa Vaddagiri, Suresh Siddha,
Venkatesh Pallipadi, Paul Turner, Mike Galbraith, Thomas Gleixner,
Heiko Carstens, Andreas Herrmann
On Mon, Mar 14, 2011 at 04:06:15PM +0100, Peter Zijlstra wrote:
> The code in update_group_power() does what init_sched_groups_power()
> does and more, so remove the special init_ code and call the generic
> code instead.
>
> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> LKML-Reference: <new-submission>
> ---
> kernel/sched.c | 44 +++++---------------------------------------
> 1 file changed, 5 insertions(+), 39 deletions(-)
>
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -6655,9 +6655,6 @@ cpu_attach_domain(struct sched_domain *s
> struct rq *rq = cpu_rq(cpu);
> struct sched_domain *tmp;
>
> - for (tmp = sd; tmp; tmp = tmp->parent)
> - tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
> -
> /* Remove the sched domains which do not contribute to scheduling. */
> for (tmp = sd; tmp; ) {
> struct sched_domain *parent = tmp->parent;
This and ...
[ snip what was explained in change log ]
>
> /*
> @@ -7483,7 +7446,7 @@ static int __build_sched_domains(const s
> {
> enum s_alloc alloc_state = sa_none;
> struct s_data d;
> - struct sched_domain *sd;
> + struct sched_domain *sd, *tmp;
> int i;
> #ifdef CONFIG_NUMA
> d.sd_allnodes = 0;
> @@ -7506,6 +7469,9 @@ static int __build_sched_domains(const s
> sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
> sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
> sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
> +
> + for (tmp = sd; tmp; tmp = tmp->parent)
> + tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
> }
>
> for_each_cpu(i, cpu_map) {
>
this looks like a separate change from what was explained in the change
log. Did you forget a "quilt new" between these two changes?
-- Steve
* Re: [RFC][PATCH 02/14] sched: Simplify cpu_power initialization
2011-03-14 21:35 ` Steven Rostedt
@ 2011-03-14 21:46 ` Peter Zijlstra
0 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-14 21:46 UTC (permalink / raw)
To: Steven Rostedt
Cc: Ingo Molnar, linux-kernel, Benjamin Herrenschmidt,
Anton Blanchard, Srivatsa Vaddagiri, Suresh Siddha,
Venkatesh Pallipadi, Paul Turner, Mike Galbraith, Thomas Gleixner,
Heiko Carstens, Andreas Herrmann
On Mon, 2011-03-14 at 17:35 -0400, Steven Rostedt wrote:
> On Mon, Mar 14, 2011 at 04:06:15PM +0100, Peter Zijlstra wrote:
> > The code in update_group_power() does what init_sched_groups_power()
> > does and more, so remove the special init_ code and call the generic
> > code instead.
> >
> > Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
> > LKML-Reference: <new-submission>
> > ---
> > kernel/sched.c | 44 +++++---------------------------------------
> > 1 file changed, 5 insertions(+), 39 deletions(-)
> >
> > Index: linux-2.6/kernel/sched.c
> > ===================================================================
> > --- linux-2.6.orig/kernel/sched.c
> > +++ linux-2.6/kernel/sched.c
> > @@ -6655,9 +6655,6 @@ cpu_attach_domain(struct sched_domain *s
> > struct rq *rq = cpu_rq(cpu);
> > struct sched_domain *tmp;
> >
> > - for (tmp = sd; tmp; tmp = tmp->parent)
> > - tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
> > -
> > /* Remove the sched domains which do not contribute to scheduling. */
> > for (tmp = sd; tmp; ) {
> > struct sched_domain *parent = tmp->parent;
>
> This and ...
>
> [ snip what was explained in change log ]
>
> >
> > /*
> > @@ -7483,7 +7446,7 @@ static int __build_sched_domains(const s
> > {
> > enum s_alloc alloc_state = sa_none;
> > struct s_data d;
> > - struct sched_domain *sd;
> > + struct sched_domain *sd, *tmp;
> > int i;
> > #ifdef CONFIG_NUMA
> > d.sd_allnodes = 0;
> > @@ -7506,6 +7469,9 @@ static int __build_sched_domains(const s
> > sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
> > sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
> > sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
> > +
> > + for (tmp = sd; tmp; tmp = tmp->parent)
> > + tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
> > }
> >
> > for_each_cpu(i, cpu_map) {
> >
>
> this looks like a separate change from what was explained in the change
> log. Did you forget a "quilt new" between these two changes?
Ah, no, I simply wrote the changelog several days and a weekend after
the patch and completely forgot about it. It's needed because
update_group_power() needs to have sched_domain::span_weight set up, so
it needs to be done sooner.
I guess I can split it into a separate patch before the other bits
though.
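To illustrate the dependency (a sketch; this mirrors the SMT scaling
that the removed init code in this patch performed, which the generic
path now reproduces via sd->span_weight -- not the verbatim kernel
code):

static void update_cpu_power(struct sched_domain *sd, int cpu)
{
	/* span_weight must already be set up when this runs. */
	unsigned long weight = sd->span_weight;
	unsigned long power = SCHED_LOAD_SCALE;

	/* SMT siblings share the power of a single core. */
	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
		power *= sd->smt_gain;
		power /= weight;
		power >>= SCHED_LOAD_SHIFT;
	}

	sd->groups->cpu_power = power;
}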
* [RFC][PATCH 03/14] sched: Simplify build_sched_groups
2011-03-14 15:06 [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
2011-03-14 15:06 ` [RFC][PATCH 01/14] sched: Remove obsolete arch_ prefixes Peter Zijlstra
2011-03-14 15:06 ` [RFC][PATCH 02/14] sched: Simplify cpu_power initialization Peter Zijlstra
@ 2011-03-14 15:06 ` Peter Zijlstra
2011-03-14 15:06 ` [RFC][PATCH 04/14] sched: Change NODE sched_domain group creation Peter Zijlstra
` (11 subsequent siblings)
14 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-14 15:06 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel
Cc: Benjamin Herrenschmidt, Anton Blanchard, Srivatsa Vaddagiri,
Suresh Siddha, Venkatesh Pallipadi, Paul Turner, Mike Galbraith,
Thomas Gleixner, Heiko Carstens, Andreas Herrmann, Peter Zijlstra
[-- Attachment #1: sched-foo3.patch --]
[-- Type: text/plain, Size: 4741 bytes --]
Notice that the mask being computed is the same as the domain span we
just computed. By using the domain_span we can avoid some mask
allocations and computations.
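Concretely, for the SMT level (the same holds for the MC, BOOK and
physical levels): the domain's span was already computed as

	cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));

so the old group-building code

	cpumask_and(d->this_sibling_map, cpu_map, topology_thread_cpumask(cpu));
	if (cpu == cpumask_first(d->this_sibling_map))
		init_sched_build_groups(d->this_sibling_map, cpu_map, ...);

recomputes exactly that span into a separately allocated mask, and can
instead use sched_domain_span(sd) directly.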
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
kernel/sched.c | 52 ++++++++++++++++------------------------------------
1 file changed, 16 insertions(+), 36 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -6842,9 +6842,6 @@ struct s_data {
cpumask_var_t notcovered;
#endif
cpumask_var_t nodemask;
- cpumask_var_t this_sibling_map;
- cpumask_var_t this_core_map;
- cpumask_var_t this_book_map;
cpumask_var_t send_covered;
cpumask_var_t tmpmask;
struct sched_group **sched_group_nodes;
@@ -6856,9 +6853,6 @@ enum s_alloc {
sa_rootdomain,
sa_tmpmask,
sa_send_covered,
- sa_this_book_map,
- sa_this_core_map,
- sa_this_sibling_map,
sa_nodemask,
sa_sched_group_nodes,
#ifdef CONFIG_NUMA
@@ -7201,12 +7195,6 @@ static void __free_domain_allocs(struct
free_cpumask_var(d->tmpmask); /* fall through */
case sa_send_covered:
free_cpumask_var(d->send_covered); /* fall through */
- case sa_this_book_map:
- free_cpumask_var(d->this_book_map); /* fall through */
- case sa_this_core_map:
- free_cpumask_var(d->this_core_map); /* fall through */
- case sa_this_sibling_map:
- free_cpumask_var(d->this_sibling_map); /* fall through */
case sa_nodemask:
free_cpumask_var(d->nodemask); /* fall through */
case sa_sched_group_nodes:
@@ -7245,14 +7233,8 @@ static enum s_alloc __visit_domain_alloc
#endif
if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
return sa_sched_group_nodes;
- if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
- return sa_nodemask;
- if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
- return sa_this_sibling_map;
- if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
- return sa_this_core_map;
if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
- return sa_this_book_map;
+ return sa_nodemask;
if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
return sa_send_covered;
d->rd = alloc_rootdomain();
@@ -7364,39 +7346,40 @@ static struct sched_domain *__build_smt_
static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
const struct cpumask *cpu_map, int cpu)
{
+ struct sched_domain *sd;
+
switch (l) {
#ifdef CONFIG_SCHED_SMT
case SD_LV_SIBLING: /* set up CPU (sibling) groups */
- cpumask_and(d->this_sibling_map, cpu_map,
- topology_thread_cpumask(cpu));
- if (cpu == cpumask_first(d->this_sibling_map))
- init_sched_build_groups(d->this_sibling_map, cpu_map,
+ sd = &per_cpu(cpu_domains, cpu).sd;
+ if (cpu == cpumask_first(sched_domain_span(sd)))
+ init_sched_build_groups(sched_domain_span(sd), cpu_map,
&cpu_to_cpu_group,
d->send_covered, d->tmpmask);
break;
#endif
#ifdef CONFIG_SCHED_MC
case SD_LV_MC: /* set up multi-core groups */
- cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
- if (cpu == cpumask_first(d->this_core_map))
- init_sched_build_groups(d->this_core_map, cpu_map,
+ sd = &per_cpu(core_domains, cpu).sd;
+ if (cpu == cpumask_first(sched_domain_span(sd)))
+ init_sched_build_groups(sched_domain_span(sd), cpu_map,
&cpu_to_core_group,
d->send_covered, d->tmpmask);
break;
#endif
#ifdef CONFIG_SCHED_BOOK
case SD_LV_BOOK: /* set up book groups */
- cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
- if (cpu == cpumask_first(d->this_book_map))
- init_sched_build_groups(d->this_book_map, cpu_map,
+ sd = &per_cpu(book_domains, cpu).sd;
+ if (cpu == cpumask_first(sched_domain_span(sd)))
+ init_sched_build_groups(sched_domain_span(sd), cpu_map,
&cpu_to_book_group,
d->send_covered, d->tmpmask);
break;
#endif
case SD_LV_CPU: /* set up physical groups */
- cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
- if (!cpumask_empty(d->nodemask))
- init_sched_build_groups(d->nodemask, cpu_map,
+ sd = &per_cpu(phys_domains, cpu).sd;
+ if (cpu == cpumask_first(sched_domain_span(sd)))
+ init_sched_build_groups(sched_domain_span(sd), cpu_map,
&cpu_to_phys_group,
d->send_covered, d->tmpmask);
break;
@@ -7452,11 +7435,8 @@ static int __build_sched_domains(const s
build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
build_sched_groups(&d, SD_LV_MC, cpu_map, i);
- }
-
- /* Set up physical groups */
- for (i = 0; i < nr_node_ids; i++)
build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
+ }
#ifdef CONFIG_NUMA
/* Set up node groups */
* [RFC][PATCH 04/14] sched: Change NODE sched_domain group creation
2011-03-14 15:06 [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
` (2 preceding siblings ...)
2011-03-14 15:06 ` [RFC][PATCH 03/14] sched: Simplify build_sched_groups Peter Zijlstra
@ 2011-03-14 15:06 ` Peter Zijlstra
2011-03-14 15:06 ` [RFC][PATCH 05/14] sched: Clean up some ALLNODES code Peter Zijlstra
` (10 subsequent siblings)
14 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-14 15:06 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel
Cc: Benjamin Herrenschmidt, Anton Blanchard, Srivatsa Vaddagiri,
Suresh Siddha, Venkatesh Pallipadi, Paul Turner, Mike Galbraith,
Thomas Gleixner, Heiko Carstens, Andreas Herrmann, Peter Zijlstra
[-- Attachment #1: sched-foo1.patch --]
[-- Type: text/plain, Size: 11443 bytes --]
The NODE sched_domain is 'special' in that it allocates sched_groups
per CPU, instead of sharing the sched_groups between all CPUs.
While this might have some benefits on large NUMA systems and avoid
remote memory accesses when iterating the sched_groups, it does break
current code that assumes sched_groups are shared between all
sched_domains (since the dynamic cpu_power patches).
So refactor the NODE groups to behave like all other groups.
(The ALLNODES domain, by contrast, already shared its groups across
the CPUs, for some reason.)
If someone does measure a performance decrease due to this change we
need to revisit this and come up with another way to have both dynamic
cpu_power and NUMA work nicely together.
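The refactored NODE level then follows the same convention as the other
levels; a commented sketch of the accessor added below (all CPUs of a
node map to the single group anchored at the node's first CPU):

static int cpu_to_node_group(int cpu, const struct cpumask *cpu_map,
			     struct sched_group **sg,
			     struct cpumask *nodemask)
{
	int group;

	/* All CPUs of this node that are in cpu_map... */
	cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
	/* ...share one sched_group, anchored at the first such CPU. */
	group = cpumask_first(nodemask);

	if (sg)
		*sg = &per_cpu(sched_group_node, group).sg;
	return group;
}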
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
kernel/sched.c | 231 ++++++++-------------------------------------------------
1 file changed, 33 insertions(+), 198 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -6837,29 +6837,18 @@ struct static_sched_domain {
struct s_data {
#ifdef CONFIG_NUMA
int sd_allnodes;
- cpumask_var_t domainspan;
- cpumask_var_t covered;
- cpumask_var_t notcovered;
#endif
cpumask_var_t nodemask;
cpumask_var_t send_covered;
cpumask_var_t tmpmask;
- struct sched_group **sched_group_nodes;
struct root_domain *rd;
};
enum s_alloc {
- sa_sched_groups = 0,
sa_rootdomain,
sa_tmpmask,
sa_send_covered,
sa_nodemask,
- sa_sched_group_nodes,
-#ifdef CONFIG_NUMA
- sa_notcovered,
- sa_covered,
- sa_domainspan,
-#endif
sa_none,
};
@@ -6955,18 +6944,10 @@ cpu_to_phys_group(int cpu, const struct
}
#ifdef CONFIG_NUMA
-/*
- * The init_sched_build_groups can't handle what we want to do with node
- * groups, so roll our own. Now each node has its own list of groups which
- * gets dynamically allocated.
- */
static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
-static struct sched_group ***sched_group_nodes_bycpu;
-
-static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_node);
-static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+static int cpu_to_node_group(int cpu, const struct cpumask *cpu_map,
struct sched_group **sg,
struct cpumask *nodemask)
{
@@ -6976,142 +6957,27 @@ static int cpu_to_allnodes_group(int cpu
group = cpumask_first(nodemask);
if (sg)
- *sg = &per_cpu(sched_group_allnodes, group).sg;
+ *sg = &per_cpu(sched_group_node, group).sg;
return group;
}
-static void init_numa_sched_groups_power(struct sched_group *group_head)
-{
- struct sched_group *sg = group_head;
- int j;
-
- if (!sg)
- return;
- do {
- for_each_cpu(j, sched_group_cpus(sg)) {
- struct sched_domain *sd;
-
- sd = &per_cpu(phys_domains, j).sd;
- if (j != group_first_cpu(sd->groups)) {
- /*
- * Only add "power" once for each
- * physical package.
- */
- continue;
- }
-
- sg->cpu_power += sd->groups->cpu_power;
- }
- sg = sg->next;
- } while (sg != group_head);
-}
+static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
-static int build_numa_sched_groups(struct s_data *d,
- const struct cpumask *cpu_map, int num)
+static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+ struct sched_group **sg,
+ struct cpumask *nodemask)
{
- struct sched_domain *sd;
- struct sched_group *sg, *prev;
- int n, j;
+ int group;
- cpumask_clear(d->covered);
- cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
- if (cpumask_empty(d->nodemask)) {
- d->sched_group_nodes[num] = NULL;
- goto out;
- }
-
- sched_domain_node_span(num, d->domainspan);
- cpumask_and(d->domainspan, d->domainspan, cpu_map);
-
- sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, num);
- if (!sg) {
- printk(KERN_WARNING "Can not alloc domain group for node %d\n",
- num);
- return -ENOMEM;
- }
- d->sched_group_nodes[num] = sg;
-
- for_each_cpu(j, d->nodemask) {
- sd = &per_cpu(node_domains, j).sd;
- sd->groups = sg;
- }
-
- sg->cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), d->nodemask);
- sg->next = sg;
- cpumask_or(d->covered, d->covered, d->nodemask);
-
- prev = sg;
- for (j = 0; j < nr_node_ids; j++) {
- n = (num + j) % nr_node_ids;
- cpumask_complement(d->notcovered, d->covered);
- cpumask_and(d->tmpmask, d->notcovered, cpu_map);
- cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
- if (cpumask_empty(d->tmpmask))
- break;
- cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
- if (cpumask_empty(d->tmpmask))
- continue;
- sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, num);
- if (!sg) {
- printk(KERN_WARNING
- "Can not alloc domain group for node %d\n", j);
- return -ENOMEM;
- }
- sg->cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), d->tmpmask);
- sg->next = prev->next;
- cpumask_or(d->covered, d->covered, d->tmpmask);
- prev->next = sg;
- prev = sg;
- }
-out:
- return 0;
-}
-#endif /* CONFIG_NUMA */
+ cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
+ group = cpumask_first(nodemask);
-#ifdef CONFIG_NUMA
-/* Free memory allocated for various sched_group structures */
-static void free_sched_groups(const struct cpumask *cpu_map,
- struct cpumask *nodemask)
-{
- int cpu, i;
-
- for_each_cpu(cpu, cpu_map) {
- struct sched_group **sched_group_nodes
- = sched_group_nodes_bycpu[cpu];
-
- if (!sched_group_nodes)
- continue;
-
- for (i = 0; i < nr_node_ids; i++) {
- struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
- cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
- if (cpumask_empty(nodemask))
- continue;
-
- if (sg == NULL)
- continue;
- sg = sg->next;
-next_sg:
- oldsg = sg;
- sg = sg->next;
- kfree(oldsg);
- if (oldsg != sched_group_nodes[i])
- goto next_sg;
- }
- kfree(sched_group_nodes);
- sched_group_nodes_bycpu[cpu] = NULL;
- }
-}
-#else /* !CONFIG_NUMA */
-static void free_sched_groups(const struct cpumask *cpu_map,
- struct cpumask *nodemask)
-{
+ if (sg)
+ *sg = &per_cpu(sched_group_allnodes, group).sg;
+ return group;
}
+
#endif /* CONFIG_NUMA */
/*
@@ -7212,9 +7078,6 @@ static void __free_domain_allocs(struct
const struct cpumask *cpu_map)
{
switch (what) {
- case sa_sched_groups:
- free_sched_groups(cpu_map, d->tmpmask); /* fall through */
- d->sched_group_nodes = NULL;
case sa_rootdomain:
free_rootdomain(d->rd); /* fall through */
case sa_tmpmask:
@@ -7223,16 +7086,6 @@ static void __free_domain_allocs(struct
free_cpumask_var(d->send_covered); /* fall through */
case sa_nodemask:
free_cpumask_var(d->nodemask); /* fall through */
- case sa_sched_group_nodes:
-#ifdef CONFIG_NUMA
- kfree(d->sched_group_nodes); /* fall through */
- case sa_notcovered:
- free_cpumask_var(d->notcovered); /* fall through */
- case sa_covered:
- free_cpumask_var(d->covered); /* fall through */
- case sa_domainspan:
- free_cpumask_var(d->domainspan); /* fall through */
-#endif
case sa_none:
break;
}
@@ -7241,24 +7094,8 @@ static void __free_domain_allocs(struct
static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
const struct cpumask *cpu_map)
{
-#ifdef CONFIG_NUMA
- if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
- return sa_none;
- if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
- return sa_domainspan;
- if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
- return sa_covered;
- /* Allocate the per-node list of sched groups */
- d->sched_group_nodes = kcalloc(nr_node_ids,
- sizeof(struct sched_group *), GFP_KERNEL);
- if (!d->sched_group_nodes) {
- printk(KERN_WARNING "Can not alloc sched group node list\n");
- return sa_notcovered;
- }
- sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
-#endif
if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
- return sa_sched_group_nodes;
+ return sa_none;
if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
return sa_nodemask;
if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
@@ -7298,6 +7135,7 @@ static struct sched_domain *__build_numa
if (parent)
parent->child = sd;
cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
+ cpu_to_node_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif
return sd;
}
@@ -7410,6 +7248,13 @@ static void build_sched_groups(struct s_
d->send_covered, d->tmpmask);
break;
#ifdef CONFIG_NUMA
+ case SD_LV_NODE:
+ sd = &per_cpu(node_domains, cpu).sd;
+ if (cpu == cpumask_first(sched_domain_span(sd)))
+ init_sched_build_groups(sched_domain_span(sd), cpu_map,
+ &cpu_to_node_group,
+ d->send_covered, d->tmpmask);
+
case SD_LV_ALLNODES:
init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
d->send_covered, d->tmpmask);
@@ -7438,7 +7283,6 @@ static int __build_sched_domains(const s
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
- alloc_state = sa_sched_groups;
/*
* Set up domains for cpus specified by the cpu_map.
@@ -7462,16 +7306,13 @@ static int __build_sched_domains(const s
build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
build_sched_groups(&d, SD_LV_MC, cpu_map, i);
build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
+ build_sched_groups(&d, SD_LV_NODE, cpu_map, i);
}
#ifdef CONFIG_NUMA
/* Set up node groups */
if (d.sd_allnodes)
build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
-
- for (i = 0; i < nr_node_ids; i++)
- if (build_numa_sched_groups(&d, cpu_map, i))
- goto error;
#endif
/* Calculate CPU power for physical packages and nodes */
@@ -7500,15 +7341,16 @@ static int __build_sched_domains(const s
}
#ifdef CONFIG_NUMA
- for (i = 0; i < nr_node_ids; i++)
- init_numa_sched_groups_power(d.sched_group_nodes[i]);
+ for_each_cpu(i, cpu_map) {
+ sd = &per_cpu(node_domains, i).sd;
+ init_sched_groups_power(i, sd);
+ }
if (d.sd_allnodes) {
- struct sched_group *sg;
-
- cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
- d.tmpmask);
- init_numa_sched_groups_power(sg);
+ for_each_cpu(i, cpu_map) {
+ sd = &per_cpu(allnodes_domains, i).sd;
+ init_sched_groups_power(i, sd);
+ }
}
#endif
@@ -7526,7 +7368,6 @@ static int __build_sched_domains(const s
cpu_attach_domain(sd, d.rd, i);
}
- d.sched_group_nodes = NULL; /* don't free this we still need it */
__free_domain_allocs(&d, sa_tmpmask, cpu_map);
return 0;
@@ -7612,7 +7453,6 @@ static int init_sched_domains(const stru
static void destroy_sched_domains(const struct cpumask *cpu_map,
struct cpumask *tmpmask)
{
- free_sched_groups(cpu_map, tmpmask);
}
/*
@@ -7889,11 +7729,6 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
-#if defined(CONFIG_NUMA)
- sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
- GFP_KERNEL);
- BUG_ON(sched_group_nodes_bycpu == NULL);
-#endif
get_online_cpus();
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);
* [RFC][PATCH 05/14] sched: Clean up some ALLNODES code
2011-03-14 15:06 [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
` (3 preceding siblings ...)
2011-03-14 15:06 ` [RFC][PATCH 04/14] sched: Change NODE sched_domain group creation Peter Zijlstra
@ 2011-03-14 15:06 ` Peter Zijlstra
2011-03-14 15:06 ` [RFC][PATCH 06/14] Simplify group creation Peter Zijlstra
` (9 subsequent siblings)
14 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-14 15:06 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel
Cc: Benjamin Herrenschmidt, Anton Blanchard, Srivatsa Vaddagiri,
Suresh Siddha, Venkatesh Pallipadi, Paul Turner, Mike Galbraith,
Thomas Gleixner, Heiko Carstens, Andreas Herrmann, Peter Zijlstra
[-- Attachment #1: sched-foo4.patch --]
[-- Type: text/plain, Size: 1227 bytes --]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
kernel/sched.c | 11 ++++-------
1 file changed, 4 insertions(+), 7 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -7256,7 +7256,9 @@ static void build_sched_groups(struct s_
d->send_covered, d->tmpmask);
case SD_LV_ALLNODES:
- init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
+ if (cpu == cpumask_first(cpu_map))
+ init_sched_build_groups(cpu_map, cpu_map,
+ &cpu_to_allnodes_group,
d->send_covered, d->tmpmask);
break;
#endif
@@ -7307,14 +7309,9 @@ static int __build_sched_domains(const s
build_sched_groups(&d, SD_LV_MC, cpu_map, i);
build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
build_sched_groups(&d, SD_LV_NODE, cpu_map, i);
+ build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, i);
}
-#ifdef CONFIG_NUMA
- /* Set up node groups */
- if (d.sd_allnodes)
- build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
-#endif
-
/* Calculate CPU power for physical packages and nodes */
#ifdef CONFIG_SCHED_SMT
for_each_cpu(i, cpu_map) {
* [RFC][PATCH 06/14] Simplify group creation
2011-03-14 15:06 [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
` (4 preceding siblings ...)
2011-03-14 15:06 ` [RFC][PATCH 05/14] sched: Clean up some ALLNODES code Peter Zijlstra
@ 2011-03-14 15:06 ` Peter Zijlstra
2011-03-14 15:06 ` [RFC][PATCH 07/14] sched: Simplify finding the lowest sched_domain Peter Zijlstra
` (8 subsequent siblings)
14 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-14 15:06 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel
Cc: Benjamin Herrenschmidt, Anton Blanchard, Srivatsa Vaddagiri,
Suresh Siddha, Venkatesh Pallipadi, Paul Turner, Mike Galbraith,
Thomas Gleixner, Heiko Carstens, Andreas Herrmann, Peter Zijlstra
[-- Attachment #1: sched-foo5.patch --]
[-- Type: text/plain, Size: 3198 bytes --]
Instead of calling build_sched_groups() for each possible sched_domain
we might have created, note that we can simply iterate the
sched_domain tree and call it for each sched_domain present.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
kernel/sched.c | 24 +++++-------------------
1 file changed, 5 insertions(+), 19 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -7207,15 +7207,12 @@ static struct sched_domain *__build_smt_
return sd;
}
-static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
+static void build_sched_groups(struct s_data *d, struct sched_domain *sd,
const struct cpumask *cpu_map, int cpu)
{
- struct sched_domain *sd;
-
- switch (l) {
+ switch (sd->level) {
#ifdef CONFIG_SCHED_SMT
case SD_LV_SIBLING: /* set up CPU (sibling) groups */
- sd = &per_cpu(cpu_domains, cpu).sd;
if (cpu == cpumask_first(sched_domain_span(sd)))
init_sched_build_groups(sched_domain_span(sd), cpu_map,
&cpu_to_cpu_group,
@@ -7224,7 +7221,6 @@ static void build_sched_groups(struct s_
#endif
#ifdef CONFIG_SCHED_MC
case SD_LV_MC: /* set up multi-core groups */
- sd = &per_cpu(core_domains, cpu).sd;
if (cpu == cpumask_first(sched_domain_span(sd)))
init_sched_build_groups(sched_domain_span(sd), cpu_map,
&cpu_to_core_group,
@@ -7233,7 +7229,6 @@ static void build_sched_groups(struct s_
#endif
#ifdef CONFIG_SCHED_BOOK
case SD_LV_BOOK: /* set up book groups */
- sd = &per_cpu(book_domains, cpu).sd;
if (cpu == cpumask_first(sched_domain_span(sd)))
init_sched_build_groups(sched_domain_span(sd), cpu_map,
&cpu_to_book_group,
@@ -7241,7 +7236,6 @@ static void build_sched_groups(struct s_
break;
#endif
case SD_LV_CPU: /* set up physical groups */
- sd = &per_cpu(phys_domains, cpu).sd;
if (cpu == cpumask_first(sched_domain_span(sd)))
init_sched_build_groups(sched_domain_span(sd), cpu_map,
&cpu_to_phys_group,
@@ -7249,7 +7243,6 @@ static void build_sched_groups(struct s_
break;
#ifdef CONFIG_NUMA
case SD_LV_NODE:
- sd = &per_cpu(node_domains, cpu).sd;
if (cpu == cpumask_first(sched_domain_span(sd)))
init_sched_build_groups(sched_domain_span(sd), cpu_map,
&cpu_to_node_group,
@@ -7299,17 +7292,10 @@ static int __build_sched_domains(const s
sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
- for (tmp = sd; tmp; tmp = tmp->parent)
+ for (tmp = sd; tmp; tmp = tmp->parent) {
tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
- }
-
- for_each_cpu(i, cpu_map) {
- build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
- build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
- build_sched_groups(&d, SD_LV_MC, cpu_map, i);
- build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
- build_sched_groups(&d, SD_LV_NODE, cpu_map, i);
- build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, i);
+ build_sched_groups(&d, tmp, cpu_map, i);
+ }
}
/* Calculate CPU power for physical packages and nodes */
* [RFC][PATCH 07/14] sched: Simplify finding the lowest sched_domain
2011-03-14 15:06 [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
` (5 preceding siblings ...)
2011-03-14 15:06 ` [RFC][PATCH 06/14] Simplify group creation Peter Zijlstra
@ 2011-03-14 15:06 ` Peter Zijlstra
2011-03-14 15:06 ` [RFC][PATCH 08/14] sched: Simplify sched_groups_power initialization Peter Zijlstra
` (7 subsequent siblings)
14 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-14 15:06 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel
Cc: Benjamin Herrenschmidt, Anton Blanchard, Srivatsa Vaddagiri,
Suresh Siddha, Venkatesh Pallipadi, Paul Turner, Mike Galbraith,
Thomas Gleixner, Heiko Carstens, Andreas Herrmann, Peter Zijlstra
[-- Attachment #1: sched-foo6.patch --]
[-- Type: text/plain, Size: 2456 bytes --]
Instead of relying on knowing the build order and the various CONFIG_
flags, simply remember the bottom-most sched_domain when we create the
domain hierarchy.
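Condensed, the mechanism (from the hunks below) is a per-cpu pointer
filled in right after the last, bottom-most, domain is built:

	d->sd = alloc_percpu(struct sched_domain *);
	...
	sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
	*per_cpu_ptr(d.sd, i) = sd;	/* remember the lowest domain */
	...
	sd = *per_cpu_ptr(d.sd, i);	/* no CONFIG_ guessing needed */
	cpu_attach_domain(sd, d.rd, i);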
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
include/asm-generic/percpu.h | 4 ++++
kernel/sched.c | 23 +++++++++++++----------
2 files changed, 17 insertions(+), 10 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -6841,11 +6841,13 @@ struct s_data {
cpumask_var_t nodemask;
cpumask_var_t send_covered;
cpumask_var_t tmpmask;
+ struct sched_domain ** __percpu sd;
struct root_domain *rd;
};
enum s_alloc {
sa_rootdomain,
+ sa_sd,
sa_tmpmask,
sa_send_covered,
sa_nodemask,
@@ -7080,6 +7082,8 @@ static void __free_domain_allocs(struct
switch (what) {
case sa_rootdomain:
free_rootdomain(d->rd); /* fall through */
+ case sa_sd:
+ free_percpu(d->sd); /* fall through */
case sa_tmpmask:
free_cpumask_var(d->tmpmask); /* fall through */
case sa_send_covered:
@@ -7100,10 +7104,15 @@ static enum s_alloc __visit_domain_alloc
return sa_nodemask;
if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
return sa_send_covered;
+ d->sd = alloc_percpu(struct sched_domain *);
+ if (!d->sd) {
+ printk(KERN_WARNING "Cannot alloc per-cpu pointers\n");
+ return sa_tmpmask;
+ }
d->rd = alloc_rootdomain();
if (!d->rd) {
printk(KERN_WARNING "Cannot alloc root domain\n");
- return sa_tmpmask;
+ return sa_sd;
}
return sa_rootdomain;
}
@@ -7292,6 +7301,8 @@ static int __build_sched_domains(const s
sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
+ *per_cpu_ptr(d.sd, i) = sd;
+
for (tmp = sd; tmp; tmp = tmp->parent) {
tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
build_sched_groups(&d, tmp, cpu_map, i);
@@ -7339,15 +7350,7 @@ static int __build_sched_domains(const s
/* Attach the domains */
for_each_cpu(i, cpu_map) {
-#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i).sd;
-#elif defined(CONFIG_SCHED_MC)
- sd = &per_cpu(core_domains, i).sd;
-#elif defined(CONFIG_SCHED_BOOK)
- sd = &per_cpu(book_domains, i).sd;
-#else
- sd = &per_cpu(phys_domains, i).sd;
-#endif
+ sd = *per_cpu_ptr(d.sd, i);
cpu_attach_domain(sd, d.rd, i);
}
* [RFC][PATCH 08/14] sched: Simplify sched_groups_power initialization
2011-03-14 15:06 [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
` (6 preceding siblings ...)
2011-03-14 15:06 ` [RFC][PATCH 07/14] sched: Simplify finding the lowest sched_domain Peter Zijlstra
@ 2011-03-14 15:06 ` Peter Zijlstra
2011-03-14 15:06 ` [RFC][PATCH 09/14] sched: Dynamically allocate sched_domain/sched_group data-structures Peter Zijlstra
` (6 subsequent siblings)
14 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-14 15:06 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel
Cc: Benjamin Herrenschmidt, Anton Blanchard, Srivatsa Vaddagiri,
Suresh Siddha, Venkatesh Pallipadi, Paul Turner, Mike Galbraith,
Thomas Gleixner, Heiko Carstens, Andreas Herrmann, Peter Zijlstra
[-- Attachment #1: sched-foo7.patch --]
[-- Type: text/plain, Size: 1997 bytes --]
Again, instead of relying on knowing the possible domains and their
order, simply rely on the sched_domain tree and whatever domains are
present in there to initialize the sched_group cpu_power.
Note: we need to iterate the CPU mask backwards because of the
cpumask_first() condition for iterating up the tree. By iterating the
mask backwards we ensure all groups of a domain are set up before
starting on the parent groups, which rely on their children being
completely done.
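A commented sketch of the resulting loop (from the hunk below):

	/*
	 * Groups are only initialized from their first (lowest-numbered)
	 * CPU, and a parent group's cpu_power sums its child groups'
	 * cpu_power.  Walking the CPUs from high to low guarantees that
	 * when we reach a parent group's first CPU, the first CPU of
	 * every other child group has already been visited; the child
	 * group containing this CPU itself is handled first by the
	 * bottom-up sd->parent walk.
	 */
	for (i = nr_cpumask_bits - 1; i >= 0; i--) {
		if (!cpumask_test_cpu(i, cpu_map))
			continue;

		sd = *per_cpu_ptr(d.sd, i);
		for (; sd; sd = sd->parent)	/* children before parents */
			init_sched_groups_power(i, sd);
	}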
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
kernel/sched.c | 39 +++++----------------------------------
1 file changed, 5 insertions(+), 34 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -7310,43 +7310,14 @@ static int __build_sched_domains(const s
}
/* Calculate CPU power for physical packages and nodes */
-#ifdef CONFIG_SCHED_SMT
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(cpu_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
-#endif
-#ifdef CONFIG_SCHED_MC
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(core_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
-#endif
-#ifdef CONFIG_SCHED_BOOK
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(book_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
-#endif
-
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(phys_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
-
-#ifdef CONFIG_NUMA
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(node_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
+ for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ if (!cpumask_test_cpu(i, cpu_map))
+ continue;
- if (d.sd_allnodes) {
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(allnodes_domains, i).sd;
+ sd = *per_cpu_ptr(d.sd, i);
+ for (; sd; sd = sd->parent)
init_sched_groups_power(i, sd);
- }
}
-#endif
/* Attach the domains */
for_each_cpu(i, cpu_map) {
* [RFC][PATCH 09/14] sched: Dynamically allocate sched_domain/sched_group data-structures
2011-03-14 15:06 [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
` (7 preceding siblings ...)
2011-03-14 15:06 ` [RFC][PATCH 08/14] sched: Simplify sched_groups_power initialization Peter Zijlstra
@ 2011-03-14 15:06 ` Peter Zijlstra
2011-03-18 9:08 ` Bharata B Rao
2011-03-19 1:23 ` Venkatesh Pallipadi
2011-03-14 15:06 ` [RFC][PATCH 10/14] sched: Simplify the free path some Peter Zijlstra
` (5 subsequent siblings)
14 siblings, 2 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-14 15:06 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel
Cc: Benjamin Herrenschmidt, Anton Blanchard, Srivatsa Vaddagiri,
Suresh Siddha, Venkatesh Pallipadi, Paul Turner, Mike Galbraith,
Thomas Gleixner, Heiko Carstens, Andreas Herrmann, Peter Zijlstra
[-- Attachment #1: sched-foo8.patch --]
[-- Type: text/plain, Size: 26574 bytes --]
Instead of relying on static allocations for the sched_domain and
sched_group trees, dynamically allocate and RCU free them.
Allocating this dynamically also allows for some build_sched_groups()
simplification since we can now (like with other simplifications) rely
on the sched_domain tree instead of hard-coded knowledge.
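Concretely, the storage added below is one pair of per-cpu pointer
arrays per domain level; domains and groups that end up used in the
final tree are "claimed" (their sd_data slots NULLed) so the common
error path will not free them (condensed from the hunks):

	struct sd_data {
		struct sched_domain **__percpu sd;
		struct sched_group **__percpu sg;
	};

	struct s_data {
		...
		struct sd_data sdd[SD_LV_MAX];	/* per-level backing store */
		...
	};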
One tricky thing to note is that detach_destroy_domains() needs to hold
rcu_read_lock() over the entire tear-down; doing it per-cpu is not
sufficient since that can lead to partial sched_group existence (this
could possibly be solved by doing the tear-down backwards, but holding
the lock throughout is much more robust).
A consequence of the above is that we can no longer print the
sched_domain debug stuff from cpu_attach_domain() since that might now
run with preemption disabled (due to classic RCU etc.) and
sched_domain_debug() does some GFP_KERNEL allocations.
Another thing to note is that we now fully rely on normal RCU and not
RCU-sched; with the new and existing RCU flavours we have grown over
the years, BH doesn't necessarily hold off RCU-sched grace periods
(-rt is known to break this). This would in fact already cause us
grief since we do sched_domain/sched_group iterations from softirq
context.
This patch is somewhat larger than I would like it to be, but I didn't
find any means of shrinking/splitting this.
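The lifetime rule this buys us, as a reader-side sketch (using the
existing for_each_domain() iterator; illustration only, not part of
this patch):

	rcu_read_lock();	/* plain RCU, not rcu_read_lock_sched() */
	for_each_domain(cpu, sd) {
		/*
		 * sd and sd->groups stay valid until rcu_read_unlock(),
		 * even if detach_destroy_domains() runs concurrently;
		 * the actual kfree() is deferred via call_rcu().
		 */
	}
	rcu_read_unlock();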
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
include/linux/sched.h | 5
kernel/sched.c | 464 +++++++++++++++++++-------------------------------
kernel/sched_fair.c | 32 ++-
3 files changed, 214 insertions(+), 287 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -867,6 +867,7 @@ static inline int sd_power_saving_flags(
struct sched_group {
struct sched_group *next; /* Must be a circular list */
+ atomic_t ref;
/*
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@ -972,6 +973,10 @@ struct sched_domain {
#ifdef CONFIG_SCHED_DEBUG
char *name;
#endif
+ union {
+ void *private; /* used during construction */
+ struct rcu_head rcu; /* used during destruction */
+ };
unsigned int span_weight;
/*
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -418,6 +418,7 @@ struct rt_rq {
*/
struct root_domain {
atomic_t refcount;
+ struct rcu_head rcu;
cpumask_var_t span;
cpumask_var_t online;
@@ -572,7 +573,7 @@ static inline int cpu_of(struct rq *rq)
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
- rcu_read_lock_sched_held() || \
+ rcu_read_lock_held() || \
lockdep_is_held(&sched_domains_mutex))
/*
@@ -6548,12 +6549,11 @@ sd_parent_degenerate(struct sched_domain
return 1;
}
-static void free_rootdomain(struct root_domain *rd)
+static void free_rootdomain(struct rcu_head *rcu)
{
- synchronize_sched();
+ struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
cpupri_cleanup(&rd->cpupri);
-
free_cpumask_var(rd->rto_mask);
free_cpumask_var(rd->online);
free_cpumask_var(rd->span);
@@ -6594,7 +6594,7 @@ static void rq_attach_root(struct rq *rq
raw_spin_unlock_irqrestore(&rq->lock, flags);
if (old_rd)
- free_rootdomain(old_rd);
+ call_rcu_sched(&old_rd->rcu, free_rootdomain);
}
static int init_rootdomain(struct root_domain *rd)
@@ -6645,6 +6645,25 @@ static struct root_domain *alloc_rootdom
return rd;
}
+static void free_sched_domain(struct rcu_head *rcu)
+{
+ struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+ if (atomic_dec_and_test(&sd->groups->ref))
+ kfree(sd->groups);
+ kfree(sd);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+{
+ call_rcu(&sd->rcu, free_sched_domain);
+}
+
+static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+{
+ for (; sd; sd = sd->parent)
+ destroy_sched_domain(sd, cpu);
+}
+
/*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
@@ -6665,20 +6684,25 @@ cpu_attach_domain(struct sched_domain *s
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
+ destroy_sched_domain(parent, cpu);
} else
tmp = tmp->parent;
}
if (sd && sd_degenerate(sd)) {
+ tmp = sd;
sd = sd->parent;
+ destroy_sched_domain(tmp, cpu);
if (sd)
sd->child = NULL;
}
- sched_domain_debug(sd, cpu);
+// sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd);
+ tmp = rq->sd;
rcu_assign_pointer(rq->sd, sd);
+ destroy_sched_domains(tmp, cpu);
}
/* cpus with isolated domains */
@@ -6694,56 +6718,6 @@ static int __init isolated_cpu_setup(cha
__setup("isolcpus=", isolated_cpu_setup);
-/*
- * init_sched_build_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
- * init_sched_build_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_power to 0.
- */
-static void
-init_sched_build_groups(const struct cpumask *span,
- const struct cpumask *cpu_map,
- int (*group_fn)(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg,
- struct cpumask *tmpmask),
- struct cpumask *covered, struct cpumask *tmpmask)
-{
- struct sched_group *first = NULL, *last = NULL;
- int i;
-
- cpumask_clear(covered);
-
- for_each_cpu(i, span) {
- struct sched_group *sg;
- int group = group_fn(i, cpu_map, &sg, tmpmask);
- int j;
-
- if (cpumask_test_cpu(i, covered))
- continue;
-
- cpumask_clear(sched_group_cpus(sg));
- sg->cpu_power = 0;
-
- for_each_cpu(j, span) {
- if (group_fn(j, cpu_map, NULL, tmpmask) != group)
- continue;
-
- cpumask_set_cpu(j, covered);
- cpumask_set_cpu(j, sched_group_cpus(sg));
- }
- if (!first)
- first = sg;
- if (last)
- last->next = sg;
- last = sg;
- }
- last->next = first;
-}
-
#define SD_NODES_PER_DOMAIN 16
#ifdef CONFIG_NUMA
@@ -6834,154 +6808,96 @@ struct static_sched_domain {
DECLARE_BITMAP(span, CONFIG_NR_CPUS);
};
+struct sd_data {
+ struct sched_domain **__percpu sd;
+ struct sched_group **__percpu sg;
+};
+
struct s_data {
#ifdef CONFIG_NUMA
int sd_allnodes;
#endif
cpumask_var_t nodemask;
cpumask_var_t send_covered;
- cpumask_var_t tmpmask;
struct sched_domain ** __percpu sd;
+ struct sd_data sdd[SD_LV_MAX];
struct root_domain *rd;
};
enum s_alloc {
sa_rootdomain,
sa_sd,
- sa_tmpmask,
+ sa_sd_storage,
sa_send_covered,
sa_nodemask,
sa_none,
};
/*
- * SMT sched-domains:
+ * Assumes the sched_domain tree is fully constructed
*/
-#ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
-
-static int
-cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *unused)
+static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
{
- if (sg)
- *sg = &per_cpu(sched_groups, cpu).sg;
- return cpu;
-}
-#endif /* CONFIG_SCHED_SMT */
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ struct sched_domain *child = sd->child;
-/*
- * multi-core sched-domains:
- */
-#ifdef CONFIG_SCHED_MC
-static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
+ if (child)
+ cpu = cpumask_first(sched_domain_span(child));
-static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *mask)
-{
- int group;
-#ifdef CONFIG_SCHED_SMT
- cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- group = cpumask_first(mask);
-#else
- group = cpu;
-#endif
if (sg)
- *sg = &per_cpu(sched_group_core, group).sg;
- return group;
+ *sg = *per_cpu_ptr(sdd->sg, cpu);
+
+ return cpu;
}
-#endif /* CONFIG_SCHED_MC */
/*
- * book sched-domains:
+ * build_sched_groups takes the cpumask we wish to span, and a pointer
+ * to a function which identifies what group(along with sched group) a CPU
+ * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
+ * (due to the fact that we keep track of groups covered with a struct cpumask).
+ *
+ * build_sched_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly,
+ * and ->cpu_power to 0.
*/
-#ifdef CONFIG_SCHED_BOOK
-static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
-
-static int
-cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *mask)
-{
- int group = cpu;
-#ifdef CONFIG_SCHED_MC
- cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
- group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_SMT)
- cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- group = cpumask_first(mask);
-#endif
- if (sg)
- *sg = &per_cpu(sched_group_book, group).sg;
- return group;
-}
-#endif /* CONFIG_SCHED_BOOK */
-
-static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
-
-static int
-cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *mask)
+static void
+build_sched_groups(struct sched_domain *sd, struct cpumask *covered)
{
- int group;
-#ifdef CONFIG_SCHED_BOOK
- cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
- group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_MC)
- cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
- group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_SMT)
- cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- group = cpumask_first(mask);
-#else
- group = cpu;
-#endif
- if (sg)
- *sg = &per_cpu(sched_group_phys, group).sg;
- return group;
-}
-
-#ifdef CONFIG_NUMA
-static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_node);
+ struct sched_group *first = NULL, *last = NULL;
+ struct sd_data *sdd = sd->private;
+ const struct cpumask *span = sched_domain_span(sd);
+ int i;
-static int cpu_to_node_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg,
- struct cpumask *nodemask)
-{
- int group;
+ cpumask_clear(covered);
- cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
- group = cpumask_first(nodemask);
+ for_each_cpu(i, span) {
+ struct sched_group *sg;
+ int group = get_group(i, sdd, &sg);
+ int j;
- if (sg)
- *sg = &per_cpu(sched_group_node, group).sg;
- return group;
-}
+ if (cpumask_test_cpu(i, covered))
+ continue;
-static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
+ cpumask_clear(sched_group_cpus(sg));
+ sg->cpu_power = 0;
-static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg,
- struct cpumask *nodemask)
-{
- int group;
+ for_each_cpu(j, span) {
+ if (get_group(j, sdd, NULL) != group)
+ continue;
- cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
- group = cpumask_first(nodemask);
+ cpumask_set_cpu(j, covered);
+ cpumask_set_cpu(j, sched_group_cpus(sg));
+ }
- if (sg)
- *sg = &per_cpu(sched_group_allnodes, group).sg;
- return group;
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
+ }
+ last->next = first;
}
-#endif /* CONFIG_NUMA */
-
/*
* Initialize sched groups cpu_power.
*
@@ -7015,15 +6931,15 @@ static void init_sched_groups_power(int
# define SD_INIT_NAME(sd, type) do { } while (0)
#endif
-#define SD_INIT(sd, type) sd_init_##type(sd)
-
-#define SD_INIT_FUNC(type) \
-static noinline void sd_init_##type(struct sched_domain *sd) \
-{ \
- memset(sd, 0, sizeof(*sd)); \
- *sd = SD_##type##_INIT; \
- sd->level = SD_LV_##type; \
- SD_INIT_NAME(sd, type); \
+#define SD_INIT_FUNC(type) \
+static noinline struct sched_domain *sd_init_##type(struct s_data *d, int cpu) \
+{ \
+ struct sched_domain *sd = *per_cpu_ptr(d->sdd[SD_LV_##type].sd, cpu); \
+ *sd = SD_##type##_INIT; \
+ sd->level = SD_LV_##type; \
+ SD_INIT_NAME(sd, type); \
+ sd->private = &d->sdd[SD_LV_##type]; \
+ return sd; \
}
SD_INIT_FUNC(CPU)
@@ -7079,13 +6995,22 @@ static void set_domain_attribute(struct
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
const struct cpumask *cpu_map)
{
+ int i, j;
+
switch (what) {
case sa_rootdomain:
- free_rootdomain(d->rd); /* fall through */
+ free_rootdomain(&d->rd->rcu); /* fall through */
case sa_sd:
free_percpu(d->sd); /* fall through */
- case sa_tmpmask:
- free_cpumask_var(d->tmpmask); /* fall through */
+ case sa_sd_storage:
+ for (i = 0; i < SD_LV_MAX; i++) {
+ for_each_cpu(j, cpu_map) {
+ kfree(*per_cpu_ptr(d->sdd[i].sd, j));
+ kfree(*per_cpu_ptr(d->sdd[i].sg, j));
+ }
+ free_percpu(d->sdd[i].sd);
+ free_percpu(d->sdd[i].sg);
+ } /* fall through */
case sa_send_covered:
free_cpumask_var(d->send_covered); /* fall through */
case sa_nodemask:
@@ -7098,25 +7023,70 @@ static void __free_domain_allocs(struct
static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
const struct cpumask *cpu_map)
{
+ int i, j;
+
+ memset(d, 0, sizeof(*d));
+
if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
return sa_none;
if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
return sa_nodemask;
- if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
- return sa_send_covered;
- d->sd = alloc_percpu(struct sched_domain *);
- if (!d->sd) {
- printk(KERN_WARNING "Cannot alloc per-cpu pointers\n");
- return sa_tmpmask;
+ for (i = 0; i < SD_LV_MAX; i++) {
+ d->sdd[i].sd = alloc_percpu(struct sched_domain *);
+ if (!d->sdd[i].sd)
+ return sa_sd_storage;
+
+ d->sdd[i].sg = alloc_percpu(struct sched_group *);
+ if (!d->sdd[i].sg)
+ return sa_sd_storage;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sd)
+ return sa_sd_storage;
+
+ *per_cpu_ptr(d->sdd[i].sd, j) = sd;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sg)
+ return sa_sd_storage;
+
+ *per_cpu_ptr(d->sdd[i].sg, j) = sg;
+ }
}
+ d->sd = alloc_percpu(struct sched_domain *);
+ if (!d->sd)
+ return sa_sd_storage;
d->rd = alloc_rootdomain();
- if (!d->rd) {
- printk(KERN_WARNING "Cannot alloc root domain\n");
+ if (!d->rd)
return sa_sd;
- }
return sa_rootdomain;
}
+/*
+ * NULL the sd_data elements we've used to build the sched_domain and
+ * sched_group structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
+{
+ struct sd_data *sdd = sd->private;
+ struct sched_group *sg = sd->groups;
+
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+ *per_cpu_ptr(sdd->sd, cpu) = NULL;
+
+ if (cpu == cpumask_first(sched_group_cpus(sg))) {
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+ *per_cpu_ptr(sdd->sg, cpu) = NULL;
+ }
+}
+
static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
{
@@ -7127,24 +7097,20 @@ static struct sched_domain *__build_numa
d->sd_allnodes = 0;
if (cpumask_weight(cpu_map) >
SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
- sd = &per_cpu(allnodes_domains, i).sd;
- SD_INIT(sd, ALLNODES);
+ sd = sd_init_ALLNODES(d, i);
set_domain_attribute(sd, attr);
cpumask_copy(sched_domain_span(sd), cpu_map);
- cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
d->sd_allnodes = 1;
}
parent = sd;
- sd = &per_cpu(node_domains, i).sd;
- SD_INIT(sd, NODE);
+ sd = sd_init_NODE(d, i);
set_domain_attribute(sd, attr);
sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
sd->parent = parent;
if (parent)
parent->child = sd;
cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
- cpu_to_node_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif
return sd;
}
@@ -7154,14 +7120,12 @@ static struct sched_domain *__build_cpu_
struct sched_domain *parent, int i)
{
struct sched_domain *sd;
- sd = &per_cpu(phys_domains, i).sd;
- SD_INIT(sd, CPU);
+ sd = sd_init_CPU(d, i);
set_domain_attribute(sd, attr);
cpumask_copy(sched_domain_span(sd), d->nodemask);
sd->parent = parent;
if (parent)
parent->child = sd;
- cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
return sd;
}
@@ -7171,13 +7135,11 @@ static struct sched_domain *__build_book
{
struct sched_domain *sd = parent;
#ifdef CONFIG_SCHED_BOOK
- sd = &per_cpu(book_domains, i).sd;
- SD_INIT(sd, BOOK);
+ sd = sd_init_BOOK(d, i);
set_domain_attribute(sd, attr);
cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
sd->parent = parent;
parent->child = sd;
- cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif
return sd;
}
@@ -7188,13 +7150,11 @@ static struct sched_domain *__build_mc_s
{
struct sched_domain *sd = parent;
#ifdef CONFIG_SCHED_MC
- sd = &per_cpu(core_domains, i).sd;
- SD_INIT(sd, MC);
+ sd = sd_init_MC(d, i);
set_domain_attribute(sd, attr);
cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
sd->parent = parent;
parent->child = sd;
- cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif
return sd;
}
@@ -7205,70 +7165,15 @@ static struct sched_domain *__build_smt_
{
struct sched_domain *sd = parent;
#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i).sd;
- SD_INIT(sd, SIBLING);
+ sd = sd_init_SIBLING(d, i);
set_domain_attribute(sd, attr);
cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
sd->parent = parent;
parent->child = sd;
- cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif
return sd;
}
-static void build_sched_groups(struct s_data *d, struct sched_domain *sd,
- const struct cpumask *cpu_map, int cpu)
-{
- switch (sd->level) {
-#ifdef CONFIG_SCHED_SMT
- case SD_LV_SIBLING: /* set up CPU (sibling) groups */
- if (cpu == cpumask_first(sched_domain_span(sd)))
- init_sched_build_groups(sched_domain_span(sd), cpu_map,
- &cpu_to_cpu_group,
- d->send_covered, d->tmpmask);
- break;
-#endif
-#ifdef CONFIG_SCHED_MC
- case SD_LV_MC: /* set up multi-core groups */
- if (cpu == cpumask_first(sched_domain_span(sd)))
- init_sched_build_groups(sched_domain_span(sd), cpu_map,
- &cpu_to_core_group,
- d->send_covered, d->tmpmask);
- break;
-#endif
-#ifdef CONFIG_SCHED_BOOK
- case SD_LV_BOOK: /* set up book groups */
- if (cpu == cpumask_first(sched_domain_span(sd)))
- init_sched_build_groups(sched_domain_span(sd), cpu_map,
- &cpu_to_book_group,
- d->send_covered, d->tmpmask);
- break;
-#endif
- case SD_LV_CPU: /* set up physical groups */
- if (cpu == cpumask_first(sched_domain_span(sd)))
- init_sched_build_groups(sched_domain_span(sd), cpu_map,
- &cpu_to_phys_group,
- d->send_covered, d->tmpmask);
- break;
-#ifdef CONFIG_NUMA
- case SD_LV_NODE:
- if (cpu == cpumask_first(sched_domain_span(sd)))
- init_sched_build_groups(sched_domain_span(sd), cpu_map,
- &cpu_to_node_group,
- d->send_covered, d->tmpmask);
-
- case SD_LV_ALLNODES:
- if (cpu == cpumask_first(cpu_map))
- init_sched_build_groups(cpu_map, cpu_map,
- &cpu_to_allnodes_group,
- d->send_covered, d->tmpmask);
- break;
-#endif
- default:
- break;
- }
-}
-
/*
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
@@ -7277,20 +7182,15 @@ static int __build_sched_domains(const s
struct sched_domain_attr *attr)
{
enum s_alloc alloc_state = sa_none;
+ struct sched_domain *sd;
struct s_data d;
- struct sched_domain *sd, *tmp;
int i;
-#ifdef CONFIG_NUMA
- d.sd_allnodes = 0;
-#endif
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
- /*
- * Set up domains for cpus specified by the cpu_map.
- */
+ /* Set up domains for cpus specified by the cpu_map. */
for_each_cpu(i, cpu_map) {
cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
cpu_map);
@@ -7302,10 +7202,19 @@ static int __build_sched_domains(const s
sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
*per_cpu_ptr(d.sd, i) = sd;
+ }
- for (tmp = sd; tmp; tmp = tmp->parent) {
- tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
- build_sched_groups(&d, tmp, cpu_map, i);
+ /* Build the groups for the domains */
+ for_each_cpu(i, cpu_map) {
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ sd->span_weight = cpumask_weight(sched_domain_span(sd));
+ get_group(i, sd->private, &sd->groups);
+ atomic_inc(&sd->groups->ref);
+
+ if (i != cpumask_first(sched_domain_span(sd)))
+ continue;
+
+ build_sched_groups(sd, d.send_covered);
}
}
@@ -7314,18 +7223,20 @@ static int __build_sched_domains(const s
if (!cpumask_test_cpu(i, cpu_map))
continue;
- sd = *per_cpu_ptr(d.sd, i);
- for (; sd; sd = sd->parent)
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ claim_allocations(i, sd);
init_sched_groups_power(i, sd);
+ }
}
/* Attach the domains */
for_each_cpu(i, cpu_map) {
sd = *per_cpu_ptr(d.sd, i);
cpu_attach_domain(sd, d.rd, i);
+ sched_domain_debug(sd, i);
}
- __free_domain_allocs(&d, sa_tmpmask, cpu_map);
+ __free_domain_allocs(&d, sa_sd, cpu_map);
return 0;
error:
@@ -7407,25 +7318,18 @@ static int init_sched_domains(const stru
return err;
}
-static void destroy_sched_domains(const struct cpumask *cpu_map,
- struct cpumask *tmpmask)
-{
-}
-
/*
* Detach sched domains from a group of cpus specified in cpu_map
* These cpus will now be attached to the NULL domain
*/
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
- /* Save because hotplug lock held. */
- static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
int i;
+ rcu_read_lock();
for_each_cpu(i, cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
- synchronize_sched();
- destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
+ rcu_read_unlock();
}
/* handle null as "default" */
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -1621,6 +1621,7 @@ static int select_idle_sibling(struct ta
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
+ rcu_read_lock();
for_each_domain(target, sd) {
if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
break;
@@ -1640,6 +1641,7 @@ static int select_idle_sibling(struct ta
cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
break;
}
+ rcu_read_unlock();
return target;
}
@@ -1672,6 +1674,7 @@ select_task_rq_fair(struct rq *rq, struc
new_cpu = prev_cpu;
}
+ rcu_read_lock();
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
continue;
@@ -1721,10 +1724,11 @@ select_task_rq_fair(struct rq *rq, struc
}
if (affine_sd) {
- if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
- return select_idle_sibling(p, cpu);
- else
- return select_idle_sibling(p, prev_cpu);
+ if (wake_affine(affine_sd, p, sync))
+ prev_cpu = cpu;
+
+ new_cpu = select_idle_sibling(p, prev_cpu);
+ goto unlock;
}
while (sd) {
@@ -1765,6 +1769,8 @@ select_task_rq_fair(struct rq *rq, struc
}
/* while loop will break here if sd == NULL */
}
+unlock:
+ rcu_read_unlock();
return new_cpu;
}
@@ -3466,6 +3472,7 @@ static void idle_balance(int this_cpu, s
raw_spin_unlock(&this_rq->lock);
update_shares(this_cpu);
+ rcu_read_lock();
for_each_domain(this_cpu, sd) {
unsigned long interval;
int balance = 1;
@@ -3487,6 +3494,7 @@ static void idle_balance(int this_cpu, s
break;
}
}
+ rcu_read_unlock();
raw_spin_lock(&this_rq->lock);
@@ -3535,6 +3543,7 @@ static int active_load_balance_cpu_stop(
double_lock_balance(busiest_rq, target_rq);
/* Search for an sd spanning us and the target CPU. */
+ rcu_read_lock();
for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) &&
cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3550,6 +3559,7 @@ static int active_load_balance_cpu_stop(
else
schedstat_inc(sd, alb_failed);
}
+ rcu_read_unlock();
double_unlock_balance(busiest_rq, target_rq);
out_unlock:
busiest_rq->active_balance = 0;
@@ -3676,6 +3686,7 @@ static int find_new_ilb(int cpu)
{
struct sched_domain *sd;
struct sched_group *ilb_group;
+ int ilb = nr_cpu_ids;
/*
* Have idle load balancer selection from semi-idle packages only
@@ -3691,20 +3702,25 @@ static int find_new_ilb(int cpu)
if (cpumask_weight(nohz.idle_cpus_mask) < 2)
goto out_done;
+ rcu_read_lock();
for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
ilb_group = sd->groups;
do {
- if (is_semi_idle_group(ilb_group))
- return cpumask_first(nohz.grp_idle_mask);
+ if (is_semi_idle_group(ilb_group)) {
+ ilb = cpumask_first(nohz.grp_idle_mask);
+ goto unlock;
+ }
ilb_group = ilb_group->next;
} while (ilb_group != sd->groups);
}
+unlock:
+ rcu_read_unlock();
out_done:
- return nr_cpu_ids;
+ return ilb;
}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
static inline int find_new_ilb(int call_cpu)
@@ -3838,6 +3854,7 @@ static void rebalance_domains(int cpu, e
update_shares(cpu);
+ rcu_read_lock();
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
@@ -3886,6 +3903,7 @@ static void rebalance_domains(int cpu, e
if (!balance)
break;
}
+ rcu_read_unlock();
/*
* next_balance will be updated only when there is a need.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [RFC][PATCH 09/14] sched: Dynamically allocate sched_domain/sched_group data-structures
2011-03-14 15:06 ` [RFC][PATCH 09/14] sched: Dynamically allocate sched_domain/sched_group data-structures Peter Zijlstra
@ 2011-03-18 9:08 ` Bharata B Rao
2011-03-18 9:37 ` Peter Zijlstra
2011-03-19 1:23 ` Venkatesh Pallipadi
1 sibling, 1 reply; 23+ messages in thread
From: Bharata B Rao @ 2011-03-18 9:08 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, linux-kernel, Benjamin Herrenschmidt,
Anton Blanchard, Srivatsa Vaddagiri, Suresh Siddha,
Venkatesh Pallipadi, Paul Turner, Mike Galbraith, Thomas Gleixner,
Heiko Carstens, Andreas Herrmann
On Mon, Mar 14, 2011 at 8:36 PM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
>
> enum s_alloc {
> sa_rootdomain,
> sa_sd,
> - sa_tmpmask,
> + sa_sd_storage,
> sa_send_covered,
> sa_nodemask,
> sa_none,
> };
>
>
>
> SD_INIT_FUNC(CPU)
> @@ -7079,13 +6995,22 @@ static void set_domain_attribute(struct
> static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
> const struct cpumask *cpu_map)
> {
> + int i, j;
> +
> switch (what) {
> case sa_rootdomain:
> - free_rootdomain(d->rd); /* fall through */
> + free_rootdomain(&d->rd->rcu); /* fall through */
> case sa_sd:
> free_percpu(d->sd); /* fall through */
> - case sa_tmpmask:
> - free_cpumask_var(d->tmpmask); /* fall through */
> + case sa_sd_storage:
> + for (i = 0; i < SD_LV_MAX; i++) {
> + for_each_cpu(j, cpu_map) {
> + kfree(*per_cpu_ptr(d->sdd[i].sd, j));
> + kfree(*per_cpu_ptr(d->sdd[i].sg, j));
> + }
> + free_percpu(d->sdd[i].sd);
> + free_percpu(d->sdd[i].sg);
> + } /* fall through */
> case sa_send_covered:
> free_cpumask_var(d->send_covered); /* fall through */
> case sa_nodemask:
> @@ -7098,25 +7023,70 @@ static void __free_domain_allocs(struct
> static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
> const struct cpumask *cpu_map)
> {
> + int i, j;
> +
> + memset(d, 0, sizeof(*d));
> +
> if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
> return sa_none;
> if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
> return sa_nodemask;
> - if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
> - return sa_send_covered;
The sa_send_covered enum member can be removed, since you no longer seem
to be using it.
Regards,
Bharata.
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [RFC][PATCH 09/14] sched: Dynamically allocate sched_domain/sched_group data-structures
2011-03-18 9:08 ` Bharata B Rao
@ 2011-03-18 9:37 ` Peter Zijlstra
0 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-18 9:37 UTC (permalink / raw)
To: Bharata B Rao
Cc: Ingo Molnar, linux-kernel, Benjamin Herrenschmidt,
Anton Blanchard, Srivatsa Vaddagiri, Suresh Siddha,
Venkatesh Pallipadi, Paul Turner, Mike Galbraith, Thomas Gleixner,
Heiko Carstens, Andreas Herrmann
On Fri, 2011-03-18 at 14:38 +0530, Bharata B Rao wrote:
> The sa_send_covered enum member can be removed, since you no longer seem
> to be using it.
Indeed, thanks!
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [RFC][PATCH 09/14] sched: Dynamically allocate sched_domain/sched_group data-structures
2011-03-14 15:06 ` [RFC][PATCH 09/14] sched: Dynamically allocate sched_domain/sched_group data-structures Peter Zijlstra
2011-03-18 9:08 ` Bharata B Rao
@ 2011-03-19 1:23 ` Venkatesh Pallipadi
2011-03-25 21:06 ` Peter Zijlstra
1 sibling, 1 reply; 23+ messages in thread
From: Venkatesh Pallipadi @ 2011-03-19 1:23 UTC (permalink / raw)
To: Peter Zijlstra
Cc: Ingo Molnar, linux-kernel, Benjamin Herrenschmidt,
Anton Blanchard, Srivatsa Vaddagiri, Suresh Siddha, Paul Turner,
Mike Galbraith, Thomas Gleixner, Heiko Carstens, Andreas Herrmann
On Mon, Mar 14, 2011 at 8:06 AM, Peter Zijlstra <a.p.zijlstra@chello.nl> wrote:
> @@ -1721,10 +1724,11 @@ select_task_rq_fair(struct rq *rq, struc
> }
>
> if (affine_sd) {
> - if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
> - return select_idle_sibling(p, cpu);
> - else
> - return select_idle_sibling(p, prev_cpu);
> + if (wake_affine(affine_sd, p, sync))
> + prev_cpu = cpu;
> +
> + new_cpu = select_idle_sibling(p, prev_cpu);
> + goto unlock;
> }
>
> while (sd) {
This would result in going through wake_affine() and doing all the
effective_load stuff even when cpu == prev_cpu. No?
So we need either if (cpu != prev_cpu && wake_affine(affine_sd, p,
sync)) or a check at the start to set want_affine = 0 for this case.
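A minimal sketch of the first variant, applied to the new affine_sd
block (illustrative only, not a posted patch):

	if (affine_sd) {
		/*
		 * Skip the effective_load() work in wake_affine() when
		 * the waking cpu is already the task's previous cpu.
		 */
		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
			prev_cpu = cpu;

		new_cpu = select_idle_sibling(p, prev_cpu);
		goto unlock;
	}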
Overall patchset looks great!
Thanks,
Venki
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [RFC][PATCH 09/14] sched: Dynamically allocate sched_domain/sched_group data-structures
2011-03-19 1:23 ` Venkatesh Pallipadi
@ 2011-03-25 21:06 ` Peter Zijlstra
0 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-25 21:06 UTC (permalink / raw)
To: Venkatesh Pallipadi
Cc: Ingo Molnar, linux-kernel, Benjamin Herrenschmidt,
Anton Blanchard, Srivatsa Vaddagiri, Suresh Siddha, Paul Turner,
Mike Galbraith, Thomas Gleixner, Heiko Carstens, Andreas Herrmann
On Fri, 2011-03-18 at 18:23 -0700, Venkatesh Pallipadi wrote:
> > if (affine_sd) {
> > - if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
> > - return select_idle_sibling(p, cpu);
> > - else
> > - return select_idle_sibling(p, prev_cpu);
> > + if (wake_affine(affine_sd, p, sync))
> > + prev_cpu = cpu;
> > +
> > + new_cpu = select_idle_sibling(p, prev_cpu);
> > + goto unlock;
> > }
> >
> > while (sd) {
>
> This would result in going through wake_affine() and doing all the
> effective_load stuff even when cpu == prev_cpu. No?
> So we need either if (cpu != prev_cpu && wake_affine(affine_sd, p,
> sync)) or a check at the start to set want_affine = 0 for this case.
D'0h yeah, I missed the conditional execution of wake_affine there, silly me.
^ permalink raw reply [flat|nested] 23+ messages in thread
* [RFC][PATCH 10/14] sched: Simplify the free path some
2011-03-14 15:06 [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
` (8 preceding siblings ...)
2011-03-14 15:06 ` [RFC][PATCH 09/14] sched: Dynamically allocate sched_domain/sched_group data-structures Peter Zijlstra
@ 2011-03-14 15:06 ` Peter Zijlstra
2011-03-14 15:06 ` [RFC][PATCH 11/14] sched: Reduce some allocation pressure Peter Zijlstra
` (4 subsequent siblings)
14 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-14 15:06 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel
Cc: Benjamin Herrenschmidt, Anton Blanchard, Srivatsa Vaddagiri,
Suresh Siddha, Venkatesh Pallipadi, Paul Turner, Mike Galbraith,
Thomas Gleixner, Heiko Carstens, Andreas Herrmann, Peter Zijlstra
[-- Attachment #1: sched-foo9.patch --]
[-- Type: text/plain, Size: 1416 bytes --]
If we check the root_domain reference count we can see whether it has
been used or not; use this observation to simplify some of the return
paths.
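Condensed, the resulting control flow in __build_sched_domains() looks
like this (a sketch assembled from the hunks below, not a literal
excerpt):

	static int __build_sched_domains(const struct cpumask *cpu_map,
					 struct sched_domain_attr *attr)
	{
		enum s_alloc alloc_state;
		struct s_data d;
		int ret = -ENOMEM;

		alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
		if (alloc_state != sa_rootdomain)
			goto error;

		/* ... build, group and attach the domains ... */

		ret = 0;
	error:
		/*
		 * On success (alloc_state == sa_rootdomain) this tears down
		 * only temporary storage: claim_allocations() has NULLed the
		 * per-cpu slots still in use, and the new refcount check
		 * skips the attached root_domain.
		 */
		__free_domain_allocs(&d, alloc_state, cpu_map);
		return ret;
	}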
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
kernel/sched.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -6999,7 +6999,8 @@ static void __free_domain_allocs(struct
switch (what) {
case sa_rootdomain:
- free_rootdomain(&d->rd->rcu); /* fall through */
+ if (!atomic_read(&d->rd->refcount))
+ free_rootdomain(&d->rd->rcu); /* fall through */
case sa_sd:
free_percpu(d->sd); /* fall through */
case sa_sd_storage:
@@ -7184,7 +7185,7 @@ static int __build_sched_domains(const s
enum s_alloc alloc_state = sa_none;
struct sched_domain *sd;
struct s_data d;
- int i;
+ int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
@@ -7236,12 +7237,10 @@ static int __build_sched_domains(const s
sched_domain_debug(sd, i);
}
- __free_domain_allocs(&d, sa_sd, cpu_map);
- return 0;
-
+ ret = 0;
error:
__free_domain_allocs(&d, alloc_state, cpu_map);
- return -ENOMEM;
+ return ret;
}
static int build_sched_domains(const struct cpumask *cpu_map)
^ permalink raw reply [flat|nested] 23+ messages in thread
* [RFC][PATCH 11/14] sched: Reduce some allocation pressure
2011-03-14 15:06 [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
` (9 preceding siblings ...)
2011-03-14 15:06 ` [RFC][PATCH 10/14] sched: Simplify the free path some Peter Zijlstra
@ 2011-03-14 15:06 ` Peter Zijlstra
2011-03-14 15:06 ` [RFC][PATCH 12/14] sched: Simplify NODE/ALLNODES domain creation Peter Zijlstra
` (3 subsequent siblings)
14 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-14 15:06 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel
Cc: Benjamin Herrenschmidt, Anton Blanchard, Srivatsa Vaddagiri,
Suresh Siddha, Venkatesh Pallipadi, Paul Turner, Mike Galbraith,
Thomas Gleixner, Heiko Carstens, Andreas Herrmann, Peter Zijlstra
[-- Attachment #1: sched-foo10.patch --]
[-- Type: text/plain, Size: 991 bytes --]
Since we now allocate SD_LV_MAX * nr_cpu_ids sched_domain/sched_group
structures when rebuilding the scheduler topology, it might make sense
to shrink that depending on the CONFIG_ options.
This is only needed until we get rid of SD_LV_* altogether and
provide a full dynamic topology interface.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
include/linux/sched.h | 8 ++++++++
1 file changed, 8 insertions(+)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -895,12 +895,20 @@ static inline struct cpumask *sched_grou
enum sched_domain_level {
SD_LV_NONE = 0,
+#ifdef CONFIG_SCHED_SMT
SD_LV_SIBLING,
+#endif
+#ifdef CONFIG_SCHED_MC
SD_LV_MC,
+#endif
+#ifdef CONFIG_SCHED_BOOK
SD_LV_BOOK,
+#endif
SD_LV_CPU,
+#ifdef CONFIG_NUMA
SD_LV_NODE,
SD_LV_ALLNODES,
+#endif
SD_LV_MAX
};
^ permalink raw reply [flat|nested] 23+ messages in thread
* [RFC][PATCH 12/14] sched: Simplify NODE/ALLNODES domain creation
2011-03-14 15:06 [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
` (10 preceding siblings ...)
2011-03-14 15:06 ` [RFC][PATCH 11/14] sched: Reduce some allocation pressure Peter Zijlstra
@ 2011-03-14 15:06 ` Peter Zijlstra
2011-03-14 15:06 ` [RFC][PATCH 13/14] sched: Remove nodemask allocation Peter Zijlstra
` (2 subsequent siblings)
14 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-14 15:06 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel
Cc: Benjamin Herrenschmidt, Anton Blanchard, Srivatsa Vaddagiri,
Suresh Siddha, Venkatesh Pallipadi, Paul Turner, Mike Galbraith,
Thomas Gleixner, Heiko Carstens, Andreas Herrmann, Peter Zijlstra
[-- Attachment #1: sched-foo11.patch --]
[-- Type: text/plain, Size: 2839 bytes --]
Don't treat ALLNODES/NODE differently for difference's sake. Simply
always create the ALLNODES domain and let the sd_degenerate() checks
kill it when it's redundant. This simplifies the code flow.
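For reference, the degeneration test this leans on already exists in
kernel/sched.c; paraphrased (sketch only, with the flag lists
abbreviated), it reads:

	static int sd_degenerate(struct sched_domain *sd)
	{
		/* A domain spanning a single cpu has nothing to balance. */
		if (cpumask_weight(sched_domain_span(sd)) == 1)
			return 1;

		/* Balancing flags are only useful with two or more groups. */
		if ((sd->flags & SD_LOAD_BALANCE) &&
		    sd->groups != sd->groups->next)
			return 0;

		/* Flags that don't use groups still keep the domain alive. */
		if (sd->flags & SD_WAKE_AFFINE)
			return 0;

		return 1;
	}

cpu_attach_domain() applies this together with sd_parent_degenerate(),
which compares a parent's span and flags against its child's, so an
unconditionally created ALLNODES domain that adds nothing over NODE is
simply collapsed.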
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
kernel/sched.c | 40 ++++++++++++++++++++++------------------
1 file changed, 22 insertions(+), 18 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -6814,9 +6814,6 @@ struct sd_data {
};
struct s_data {
-#ifdef CONFIG_NUMA
- int sd_allnodes;
-#endif
cpumask_var_t nodemask;
cpumask_var_t send_covered;
struct sched_domain ** __percpu sd;
@@ -7088,30 +7085,35 @@ static void claim_allocations(int cpu, s
}
}
-static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
+static struct sched_domain *__build_allnodes_sched_domain(struct s_data *d,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *parent, int i)
{
struct sched_domain *sd = NULL;
#ifdef CONFIG_NUMA
- struct sched_domain *parent;
-
- d->sd_allnodes = 0;
- if (cpumask_weight(cpu_map) >
- SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
- sd = sd_init_ALLNODES(d, i);
- set_domain_attribute(sd, attr);
- cpumask_copy(sched_domain_span(sd), cpu_map);
- d->sd_allnodes = 1;
- }
- parent = sd;
+ sd = sd_init_ALLNODES(d, i);
+ set_domain_attribute(sd, attr);
+ cpumask_copy(sched_domain_span(sd), cpu_map);
+ sd->parent = parent;
+ if (parent)
+ parent->child = sd;
+#endif
+ return sd;
+}
+static struct sched_domain *__build_node_sched_domain(struct s_data *d,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *parent, int i)
+{
+ struct sched_domain *sd = NULL;
+#ifdef CONFIG_NUMA
sd = sd_init_NODE(d, i);
set_domain_attribute(sd, attr);
sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
+ cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
sd->parent = parent;
if (parent)
parent->child = sd;
- cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
#endif
return sd;
}
@@ -7196,7 +7198,9 @@ static int __build_sched_domains(const s
cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
cpu_map);
- sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
+ sd = NULL;
+ sd = __build_allnodes_sched_domain(&d, cpu_map, attr, sd, i);
+ sd = __build_node_sched_domain(&d, cpu_map, attr, sd, i);
sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
^ permalink raw reply [flat|nested] 23+ messages in thread
* [RFC][PATCH 13/14] sched: Remove nodemask allocation
2011-03-14 15:06 [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
` (11 preceding siblings ...)
2011-03-14 15:06 ` [RFC][PATCH 12/14] sched: Simplify NODE/ALLNODES domain creation Peter Zijlstra
@ 2011-03-14 15:06 ` Peter Zijlstra
2011-03-14 15:06 ` [RFC][PATCH 14/14] sched: Remove some dead code Peter Zijlstra
2011-03-25 21:46 ` [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
14 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-14 15:06 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel
Cc: Benjamin Herrenschmidt, Anton Blanchard, Srivatsa Vaddagiri,
Suresh Siddha, Venkatesh Pallipadi, Paul Turner, Mike Galbraith,
Thomas Gleixner, Heiko Carstens, Andreas Herrmann, Peter Zijlstra
[-- Attachment #1: sched-foo12.patch --]
[-- Type: text/plain, Size: 2155 bytes --]
There's only one nodemask user left, so replace it with a direct
computation; this saves some memory and reduces some code-flow
complexity.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
kernel/sched.c | 14 +++-----------
1 file changed, 3 insertions(+), 11 deletions(-)
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -6814,7 +6814,6 @@ struct sd_data {
};
struct s_data {
- cpumask_var_t nodemask;
cpumask_var_t send_covered;
struct sched_domain ** __percpu sd;
struct sd_data sdd[SD_LV_MAX];
@@ -6826,7 +6825,6 @@ enum s_alloc {
sa_sd,
sa_sd_storage,
sa_send_covered,
- sa_nodemask,
sa_none,
};
@@ -7011,8 +7009,6 @@ static void __free_domain_allocs(struct
} /* fall through */
case sa_send_covered:
free_cpumask_var(d->send_covered); /* fall through */
- case sa_nodemask:
- free_cpumask_var(d->nodemask); /* fall through */
case sa_none:
break;
}
@@ -7025,10 +7021,8 @@ static enum s_alloc __visit_domain_alloc
memset(d, 0, sizeof(*d));
- if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
- return sa_none;
if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
- return sa_nodemask;
+ return sa_none;
for (i = 0; i < SD_LV_MAX; i++) {
d->sdd[i].sd = alloc_percpu(struct sched_domain *);
if (!d->sdd[i].sd)
@@ -7125,7 +7119,8 @@ static struct sched_domain *__build_cpu_
struct sched_domain *sd;
sd = sd_init_CPU(d, i);
set_domain_attribute(sd, attr);
- cpumask_copy(sched_domain_span(sd), d->nodemask);
+ cpumask_and(sched_domain_span(sd),
+ cpumask_of_node(cpu_to_node(i)), cpu_map);
sd->parent = parent;
if (parent)
parent->child = sd;
@@ -7195,9 +7190,6 @@ static int __build_sched_domains(const s
/* Set up domains for cpus specified by the cpu_map. */
for_each_cpu(i, cpu_map) {
- cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
- cpu_map);
-
sd = NULL;
sd = __build_allnodes_sched_domain(&d, cpu_map, attr, sd, i);
sd = __build_node_sched_domain(&d, cpu_map, attr, sd, i);
^ permalink raw reply [flat|nested] 23+ messages in thread
* [RFC][PATCH 14/14] sched: Remove some dead code
2011-03-14 15:06 [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
` (12 preceding siblings ...)
2011-03-14 15:06 ` [RFC][PATCH 13/14] sched: Remove nodemask allocation Peter Zijlstra
@ 2011-03-14 15:06 ` Peter Zijlstra
2011-03-25 21:46 ` [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
14 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-14 15:06 UTC (permalink / raw)
To: Ingo Molnar, linux-kernel
Cc: Benjamin Herrenschmidt, Anton Blanchard, Srivatsa Vaddagiri,
Suresh Siddha, Venkatesh Pallipadi, Paul Turner, Mike Galbraith,
Thomas Gleixner, Heiko Carstens, Andreas Herrmann, Peter Zijlstra
[-- Attachment #1: sched-foo13.patch --]
[-- Type: text/plain, Size: 1967 bytes --]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
---
include/linux/sched.h | 6 ------
kernel/sched.c | 16 ----------------
2 files changed, 22 deletions(-)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -881,9 +881,6 @@ struct sched_group {
* NOTE: this field is variable length. (Allocated dynamically
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
- *
- * It is also be embedded into static data structures at build
- * time. (See 'struct static_sched_group' in kernel/sched.c)
*/
unsigned long cpumask[0];
};
@@ -992,9 +989,6 @@ struct sched_domain {
* NOTE: this field is variable length. (Allocated dynamically
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
- *
- * It is also be embedded into static data structures at build
- * time. (See 'struct static_sched_domain' in kernel/sched.c)
*/
unsigned long span[0];
};
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -6793,22 +6793,6 @@ static void sched_domain_node_span(int n
int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-/*
- * The cpus mask in sched_group and sched_domain hangs off the end.
- *
- * ( See the the comments in include/linux/sched.h:struct sched_group
- * and struct sched_domain. )
- */
-struct static_sched_group {
- struct sched_group sg;
- DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
-};
-
-struct static_sched_domain {
- struct sched_domain sd;
- DECLARE_BITMAP(span, CONFIG_NR_CPUS);
-};
-
struct sd_data {
struct sched_domain **__percpu sd;
struct sched_group **__percpu sg;
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation
2011-03-14 15:06 [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
` (13 preceding siblings ...)
2011-03-14 15:06 ` [RFC][PATCH 14/14] sched: Remove some dead code Peter Zijlstra
@ 2011-03-25 21:46 ` Peter Zijlstra
2011-03-25 21:53 ` Peter Zijlstra
14 siblings, 1 reply; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-25 21:46 UTC (permalink / raw)
To: Ingo Molnar
Cc: linux-kernel, Benjamin Herrenschmidt, Anton Blanchard,
Srivatsa Vaddagiri, Suresh Siddha, Venkatesh Pallipadi,
Paul Turner, Mike Galbraith, Thomas Gleixner, Heiko Carstens,
Andreas Herrmann, Bharata B Rao
On Mon, 2011-03-14 at 16:06 +0100, Peter Zijlstra wrote:
> So I got annoyed with the whole sched_domain/sched_group creation mess again
> and decided to actually do something about it. See here ;-)
>
> Its not completely done yet, but much of the ground-work is there.
>
> The final goal is to be able to have a simple way to dynamically specify the
> architecture topology and have the generic code in charge of building the data
> structures. The architecture would need to provide a function that maps a cpu
> to a cpumask and a function that initializes the sched_domain (flags etc.),
> this would replace the now still hard-coded __build_*_sched_domain() calls in
> __build_sched_domains().
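A hypothetical sketch of what such an interface could look like (the
names below are illustrative, not taken from these patches):

	struct sched_domain_topology_level {
		/* arch-provided: map a cpu to its domain span at this level */
		const struct cpumask *(*mask)(int cpu);
		/* arch-provided: initialize flags etc. for this level */
		struct sched_domain *(*sd_init)(struct s_data *d, int cpu);
	};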
Got to here a week ago last Friday, and started on:
> With that we can also kill the current NODE and ALLNODES mess and generate
> appropriate masks from say the ACPI SLIT table by grouping CPUs on their node
> distance.
which I just finished.
> This would allow us to kill the horrid mess in x86's cpu_coregroup_mask() and
> properly support the AMD Magny-Cours stuff.
Still todo.
> Anyway, not quite there yet.. patches build and boot on a 2*6*2 wsm box.
Still boots, although a 2 node system isn't very interesting.
I pushed out an updated version to:
git://git.kernel.org/pub/scm/linux/kernel/git/peterz/linux-2.6-sched.git sched_domain
we're now at:
include/linux/sched.h | 26 +-
include/linux/topology.h | 25 -
kernel/cpuset.c | 2 +-
kernel/sched.c | 1094 ++++++++++++++++------------------------------
kernel/sched_fair.c | 32 +-
5 files changed, 407 insertions(+), 772 deletions(-)
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation
2011-03-25 21:46 ` [RFC][PATCH 00/14] Rewrite sched_domain/sched_group creation Peter Zijlstra
@ 2011-03-25 21:53 ` Peter Zijlstra
0 siblings, 0 replies; 23+ messages in thread
From: Peter Zijlstra @ 2011-03-25 21:53 UTC (permalink / raw)
To: Ingo Molnar
Cc: linux-kernel, Benjamin Herrenschmidt, Anton Blanchard,
Srivatsa Vaddagiri, Suresh Siddha, Venkatesh Pallipadi,
Paul Turner, Mike Galbraith, Thomas Gleixner, Heiko Carstens,
Andreas Herrmann, Bharata B Rao
On Fri, 2011-03-25 at 22:46 +0100, Peter Zijlstra wrote:
> > Anyway, not quite there yet.. patches build and boot on a 2*6*2 wsm
> box.
>
> Still boots, although a 2 node system isn't very interesting.
>
OK, halt made the domain stuff explode (also didn't test hotplug), so
there's something fishy.
^ permalink raw reply [flat|nested] 23+ messages in thread