* [PATCH] ia64 cpuset + build_sched_domains() mangles structures
From: John Hawkes @ 2005-08-20 14:50 UTC
To: linux-ia64, linux-kernel; +Cc: mingo, pj, dino, nickpiggin, akpm
I've already sent this to the maintainers, and this is now being sent to a
larger community audience. I have fixed a problem with the ia64 version of
build_sched_domains(), but a similar fix still needs to be made to the
generic build_sched_domains() in kernel/sched.c.
The "dynamic sched domains" functionality has recently been merged into
2.6.13-rcN that sees the dynamic declaration of a cpu-exclusive (a.k.a.
"isolated") cpuset and rebuilds the CPU Scheduler sched domains and sched
groups to separate away the CPUs in this cpu-exclusive cpuset from the
remainder of the non-isolated CPUs. This allows the non-isolated CPUs to
completely ignore the isolated CPUs when doing load-balancing.
Unfortunately, build_sched_domains() expects that a sched domain will
include all the CPUs of each node in the domain, i.e., that no node will
belong in both an isolated cpuset and a non-isolated cpuset. Declaring
a cpuset that violates this presumption will produce flawed data
structures and will oops the kernel.
To trigger the problem (on a NUMA system with >1 CPUs per node):
cd /dev/cpuset
mkdir newcpuset
cd newcpuset
echo 0 >cpus
echo 0 >mems
echo 1 >cpu_exclusive
I have fixed this shortcoming for ia64 NUMA (with multiple CPUs per node).
A similar shortcoming exists in the generic build_sched_domains() (in
kernel/sched.c) for NUMA, and that needs to be fixed also. The fix involves
dynamically allocating sched_group_nodes[] and sched_group_allnodes[] for
each invocation of build_sched_domains(), rather than using global arrays
for these structures. Care must be taken to remember kmalloc() addresses
so that arch_destroy_sched_domains() can properly kfree() the new dynamic
structures.
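For illustration, here is a minimal sketch of the bookkeeping pattern described
above (simplified, and not the patch itself; it reuses the 2.6.13-era kernel
helpers such as kmalloc(), first_cpu() and for_each_cpu_mask(), and omits the
actual domain/group construction):

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/cpumask.h>
#include <linux/nodemask.h>

/* Per-partition allocations, remembered under the partition's first CPU. */
static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];

void build_sched_domains(const cpumask_t *cpu_map)
{
	struct sched_group **sched_group_nodes;

	/* Each invocation gets its own table of per-node group lists. */
	sched_group_nodes = kmalloc(sizeof(struct sched_group *) * MAX_NUMNODES,
				    GFP_KERNEL);
	if (!sched_group_nodes)
		return;
	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;

	/* ... build the domains and groups for the CPUs in *cpu_map ... */
}

void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
	int cpu;

	for_each_cpu_mask(cpu, *cpu_map) {
		struct sched_group **sched_group_nodes =
					sched_group_nodes_bycpu[cpu];

		kfree(sched_group_allnodes_bycpu[cpu]);
		sched_group_allnodes_bycpu[cpu] = NULL;

		if (!sched_group_nodes)
			continue;

		/* ... walk and kfree each node's circular list of groups ... */

		kfree(sched_group_nodes);
		sched_group_nodes_bycpu[cpu] = NULL;
	}
}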
This is a patch against 2.6.13-rc6.
Signed-off-by: John Hawkes <hawkes@sgi.com>
Index: linux/arch/ia64/kernel/domain.c
===================================================================
--- linux.orig/arch/ia64/kernel/domain.c 2005-08-19 08:54:00.000000000 -0700
+++ linux/arch/ia64/kernel/domain.c 2005-08-20 07:39:32.000000000 -0700
@@ -120,10 +120,10 @@
* gets dynamically allocated.
*/
static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group *sched_group_nodes[MAX_NUMNODES];
+static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static struct sched_group sched_group_allnodes[MAX_NUMNODES];
+static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
static int cpu_to_allnodes_group(int cpu)
{
@@ -138,6 +138,21 @@
void build_sched_domains(const cpumask_t *cpu_map)
{
int i;
+#ifdef CONFIG_NUMA
+ struct sched_group **sched_group_nodes = NULL;
+ struct sched_group *sched_group_allnodes = NULL;
+
+ /*
+ * Allocate the per-node list of sched groups
+ */
+ sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
+ GFP_ATOMIC);
+ if (!sched_group_nodes) {
+ printk(KERN_WARNING "Can not alloc sched group node list\n");
+ return;
+ }
+ sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+#endif
/*
* Set up domains for cpus specified by the cpu_map.
@@ -150,8 +165,21 @@
cpus_and(nodemask, nodemask, *cpu_map);
#ifdef CONFIG_NUMA
- if (num_online_cpus()
+ if (cpus_weight(*cpu_map)
> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+ if (!sched_group_allnodes) {
+ sched_group_allnodes
+ = kmalloc(sizeof(struct sched_group)
+ * MAX_NUMNODES,
+ GFP_KERNEL);
+ if (!sched_group_allnodes) {
+ printk(KERN_WARNING
+ "Can not alloc allnodes sched group\n");
+ break;
+ }
+ sched_group_allnodes_bycpu[i]
+ = sched_group_allnodes;
+ }
sd = &per_cpu(allnodes_domains, i);
*sd = SD_ALLNODES_INIT;
sd->span = *cpu_map;
@@ -214,8 +242,9 @@
}
#ifdef CONFIG_NUMA
- init_sched_build_groups(sched_group_allnodes, *cpu_map,
- &cpu_to_allnodes_group);
+ if (sched_group_allnodes)
+ init_sched_build_groups(sched_group_allnodes, *cpu_map,
+ &cpu_to_allnodes_group);
for (i = 0; i < MAX_NUMNODES; i++) {
/* Set up node groups */
@@ -226,8 +255,10 @@
int j;
cpus_and(nodemask, nodemask, *cpu_map);
- if (cpus_empty(nodemask))
+ if (cpus_empty(nodemask)) {
+ sched_group_nodes[i] = NULL;
continue;
+ }
domainspan = sched_domain_node_span(i);
cpus_and(domainspan, domainspan, *cpu_map);
@@ -341,7 +372,7 @@
#endif
/* Attach the domains */
- for_each_online_cpu(i) {
+ for_each_cpu_mask(i, *cpu_map) {
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i);
@@ -372,25 +403,42 @@
{
#ifdef CONFIG_NUMA
int i;
- for (i = 0; i < MAX_NUMNODES; i++) {
- cpumask_t nodemask = node_to_cpumask(i);
- struct sched_group *oldsg, *sg = sched_group_nodes[i];
+ int cpu;
- cpus_and(nodemask, nodemask, *cpu_map);
- if (cpus_empty(nodemask))
- continue;
+ for_each_cpu_mask(cpu, *cpu_map) {
+ struct sched_group *sched_group_allnodes
+ = sched_group_allnodes_bycpu[cpu];
+ struct sched_group **sched_group_nodes
+ = sched_group_nodes_bycpu[cpu];
+
+ if (sched_group_allnodes) {
+ kfree(sched_group_allnodes);
+ sched_group_allnodes_bycpu[cpu] = NULL;
+ }
- if (sg == NULL)
+ if (!sched_group_nodes)
continue;
- sg = sg->next;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ cpumask_t nodemask = node_to_cpumask(i);
+ struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ if (cpus_empty(nodemask))
+ continue;
+
+ if (sg == NULL)
+ continue;
+ sg = sg->next;
next_sg:
- oldsg = sg;
- sg = sg->next;
- kfree(oldsg);
- if (oldsg != sched_group_nodes[i])
- goto next_sg;
- sched_group_nodes[i] = NULL;
+ oldsg = sg;
+ sg = sg->next;
+ kfree(oldsg);
+ if (oldsg != sched_group_nodes[i])
+ goto next_sg;
+ }
+ kfree(sched_group_nodes);
+ sched_group_nodes_bycpu[cpu] = NULL;
}
#endif
}
-
* Re: [PATCH] ia64 cpuset + build_sched_domains() mangles structures
From: Ingo Molnar @ 2005-08-22 7:08 UTC
To: John Hawkes; +Cc: linux-ia64, linux-kernel, pj, dino, nickpiggin, akpm
* John Hawkes <hawkes@jackhammer.engr.sgi.com> wrote:
> The "dynamic sched domains" functionality has recently been merged
> into 2.6.13-rcN that sees the dynamic declaration of a cpu-exclusive
> (a.k.a. "isolated") cpuset and rebuilds the CPU Scheduler sched
> domains and sched groups to separate away the CPUs in this
> cpu-exclusive cpuset from the remainder of the non-isolated CPUs.
> This allows the non-isolated CPUs to completely ignore the isolated
> CPUs when doing load-balancing.
>
> Unfortunately, build_sched_domains() expects that a sched domain will
> include all the CPUs of each node in the domain, i.e., that no node
> will belong in both an isolated cpuset and a non-isolated cpuset.
> Declaring a cpuset that violates this presumption will produce flawed
> data structures and will oops the kernel.
ouch.
looks good to me, but in terms of impact we can only do it in 2.6.14. Is
there something simpler for 2.6.13, to make sure the kernel doesn't oops?
in terms of 2.6.14, the replacement patch below also does what i always
wanted to do: to merge the ia64-specific build_sched_domains() code back
into kernel/sched.c. I've done this by taking your improved dynamic
build-domains code and putting it into kernel/sched.c.
it builds/boots fine on x86 up to 8-way non-NUMA, but the question is
ppc64 NUMA. It should mostly work though. (Patch is against the tail of
the scheduler queue in -mm.)
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Ingo
-----
I've already sent this to the maintainers, and this is now being sent to a
larger community audience. I have fixed a problem with the ia64 version of
build_sched_domains(), but a similar fix still needs to be made to the
generic build_sched_domains() in kernel/sched.c.
The "dynamic sched domains" functionality has recently been merged into
2.6.13-rcN that sees the dynamic declaration of a cpu-exclusive (a.k.a.
"isolated") cpuset and rebuilds the CPU Scheduler sched domains and sched
groups to separate away the CPUs in this cpu-exclusive cpuset from the
remainder of the non-isolated CPUs. This allows the non-isolated CPUs to
completely ignore the isolated CPUs when doing load-balancing.
Unfortunately, build_sched_domains() expects that a sched domain will
include all the CPUs of each node in the domain, i.e., that no node will
belong in both an isolated cpuset and a non-isolated cpuset. Declaring
a cpuset that violates this presumption will produce flawed data
structures and will oops the kernel.
To trigger the problem (on a NUMA system with >1 CPUs per node):
cd /dev/cpuset
mkdir newcpuset
cd newcpuset
echo 0 >cpus
echo 0 >mems
echo 1 >cpu_exclusive
I have fixed this shortcoming for ia64 NUMA (with multiple CPUs per node).
A similar shortcoming exists in the generic build_sched_domains() (in
kernel/sched.c) for NUMA, and that needs to be fixed also. The fix involves
dynamically allocating sched_group_nodes[] and sched_group_allnodes[] for
each invocation of build_sched_domains(), rather than using global arrays
for these structures. Care must be taken to remember kmalloc() addresses
so that arch_destroy_sched_domains() can properly kfree() the new dynamic
structures.
This is a patch against 2.6.13-rc6.
Signed-off-by: John Hawkes <hawkes@sgi.com>
reworked the patch to also move the ia64 domain setup code to the generic
code.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
arch/ia64/kernel/domain.c | 400 -------------------------------------------
arch/ia64/kernel/Makefile | 2
include/asm-ia64/processor.h | 3
include/asm-ia64/topology.h | 22 --
include/linux/sched.h | 9
include/linux/topology.h | 22 ++
kernel/sched.c | 292 +++++++++++++++++++++++++------
7 files changed, 260 insertions(+), 490 deletions(-)
Index: linux-sched-curr/arch/ia64/kernel/Makefile
===================================================================
--- linux-sched-curr.orig/arch/ia64/kernel/Makefile
+++ linux-sched-curr/arch/ia64/kernel/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += acp
obj-$(CONFIG_IA64_PALINFO) += palinfo.o
obj-$(CONFIG_IOSAPIC) += iosapic.o
obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_SMP) += smp.o smpboot.o domain.o
+obj-$(CONFIG_SMP) += smp.o smpboot.o
obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o
obj-$(CONFIG_IA64_CYCLONE) += cyclone.o
Index: linux-sched-curr/arch/ia64/kernel/domain.c
===================================================================
--- linux-sched-curr.orig/arch/ia64/kernel/domain.c
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * arch/ia64/kernel/domain.c
- * Architecture specific sched-domains builder.
- *
- * Copyright (C) 2004 Jesse Barnes
- * Copyright (C) 2004 Silicon Graphics, Inc.
- */
-
-#include <linux/sched.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/cpumask.h>
-#include <linux/init.h>
-#include <linux/topology.h>
-#include <linux/nodemask.h>
-
-#define SD_NODES_PER_DOMAIN 16
-
-#ifdef CONFIG_NUMA
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, unsigned long *used_nodes)
-{
- int i, n, val, min_val, best_node = 0;
-
- min_val = INT_MAX;
-
- for (i = 0; i < MAX_NUMNODES; i++) {
- /* Start at @node */
- n = (node + i) % MAX_NUMNODES;
-
- if (!nr_cpus_node(n))
- continue;
-
- /* Skip already used nodes */
- if (test_bit(n, used_nodes))
- continue;
-
- /* Simple min distance search */
- val = node_distance(node, n);
-
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- set_bit(best_node, used_nodes);
- return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @size: number of nodes to include in this span
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static cpumask_t sched_domain_node_span(int node)
-{
- int i;
- cpumask_t span, nodemask;
- DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
-
- cpus_clear(span);
- bitmap_zero(used_nodes, MAX_NUMNODES);
-
- nodemask = node_to_cpumask(node);
- cpus_or(span, span, nodemask);
- set_bit(node, used_nodes);
-
- for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
- int next_node = find_next_best_node(node, used_nodes);
- nodemask = node_to_cpumask(next_node);
- cpus_or(span, span, nodemask);
- }
-
- return span;
-}
-#endif
-
-/*
- * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
- * can switch it on easily if needed.
- */
-#ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static struct sched_group sched_group_cpus[NR_CPUS];
-static int cpu_to_cpu_group(int cpu)
-{
- return cpu;
-}
-#endif
-
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
-static int cpu_to_phys_group(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- return first_cpu(cpu_sibling_map[cpu]);
-#else
- return cpu;
-#endif
-}
-
-#ifdef CONFIG_NUMA
-/*
- * The init_sched_build_groups can't handle what we want to do with node
- * groups, so roll our own. Now each node has its own list of groups which
- * gets dynamically allocated.
- */
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group *sched_group_nodes[MAX_NUMNODES];
-
-static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static struct sched_group sched_group_allnodes[MAX_NUMNODES];
-
-static int cpu_to_allnodes_group(int cpu)
-{
- return cpu_to_node(cpu);
-}
-#endif
-
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-void build_sched_domains(const cpumask_t *cpu_map)
-{
- int i;
-
- /*
- * Set up domains for cpus specified by the cpu_map.
- */
- for_each_cpu_mask(i, *cpu_map) {
- int group;
- struct sched_domain *sd = NULL, *p;
- cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
-
- cpus_and(nodemask, nodemask, *cpu_map);
-
-#ifdef CONFIG_NUMA
- if (num_online_cpus()
- > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
- sd = &per_cpu(allnodes_domains, i);
- *sd = SD_ALLNODES_INIT;
- sd->span = *cpu_map;
- group = cpu_to_allnodes_group(i);
- sd->groups = &sched_group_allnodes[group];
- p = sd;
- } else
- p = NULL;
-
- sd = &per_cpu(node_domains, i);
- *sd = SD_NODE_INIT;
- sd->span = sched_domain_node_span(cpu_to_node(i));
- sd->parent = p;
- cpus_and(sd->span, sd->span, *cpu_map);
-#endif
-
- p = sd;
- sd = &per_cpu(phys_domains, i);
- group = cpu_to_phys_group(i);
- *sd = SD_CPU_INIT;
- sd->span = nodemask;
- sd->parent = p;
- sd->groups = &sched_group_phys[group];
-
-#ifdef CONFIG_SCHED_SMT
- p = sd;
- sd = &per_cpu(cpu_domains, i);
- group = cpu_to_cpu_group(i);
- *sd = SD_SIBLING_INIT;
- sd->span = cpu_sibling_map[i];
- cpus_and(sd->span, sd->span, *cpu_map);
- sd->parent = p;
- sd->groups = &sched_group_cpus[group];
-#endif
- }
-
-#ifdef CONFIG_SCHED_SMT
- /* Set up CPU (sibling) groups */
- for_each_cpu_mask(i, *cpu_map) {
- cpumask_t this_sibling_map = cpu_sibling_map[i];
- cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
- if (i != first_cpu(this_sibling_map))
- continue;
-
- init_sched_build_groups(sched_group_cpus, this_sibling_map,
- &cpu_to_cpu_group);
- }
-#endif
-
- /* Set up physical groups */
- for (i = 0; i < MAX_NUMNODES; i++) {
- cpumask_t nodemask = node_to_cpumask(i);
-
- cpus_and(nodemask, nodemask, *cpu_map);
- if (cpus_empty(nodemask))
- continue;
-
- init_sched_build_groups(sched_group_phys, nodemask,
- &cpu_to_phys_group);
- }
-
-#ifdef CONFIG_NUMA
- init_sched_build_groups(sched_group_allnodes, *cpu_map,
- &cpu_to_allnodes_group);
-
- for (i = 0; i < MAX_NUMNODES; i++) {
- /* Set up node groups */
- struct sched_group *sg, *prev;
- cpumask_t nodemask = node_to_cpumask(i);
- cpumask_t domainspan;
- cpumask_t covered = CPU_MASK_NONE;
- int j;
-
- cpus_and(nodemask, nodemask, *cpu_map);
- if (cpus_empty(nodemask))
- continue;
-
- domainspan = sched_domain_node_span(i);
- cpus_and(domainspan, domainspan, *cpu_map);
-
- sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
- sched_group_nodes[i] = sg;
- for_each_cpu_mask(j, nodemask) {
- struct sched_domain *sd;
- sd = &per_cpu(node_domains, j);
- sd->groups = sg;
- if (sd->groups == NULL) {
- /* Turn off balancing if we have no groups */
- sd->flags = 0;
- }
- }
- if (!sg) {
- printk(KERN_WARNING
- "Can not alloc domain group for node %d\n", i);
- continue;
- }
- sg->cpu_power = 0;
- sg->cpumask = nodemask;
- cpus_or(covered, covered, nodemask);
- prev = sg;
-
- for (j = 0; j < MAX_NUMNODES; j++) {
- cpumask_t tmp, notcovered;
- int n = (i + j) % MAX_NUMNODES;
-
- cpus_complement(notcovered, covered);
- cpus_and(tmp, notcovered, *cpu_map);
- cpus_and(tmp, tmp, domainspan);
- if (cpus_empty(tmp))
- break;
-
- nodemask = node_to_cpumask(n);
- cpus_and(tmp, tmp, nodemask);
- if (cpus_empty(tmp))
- continue;
-
- sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
- if (!sg) {
- printk(KERN_WARNING
- "Can not alloc domain group for node %d\n", j);
- break;
- }
- sg->cpu_power = 0;
- sg->cpumask = tmp;
- cpus_or(covered, covered, tmp);
- prev->next = sg;
- prev = sg;
- }
- prev->next = sched_group_nodes[i];
- }
-#endif
-
- /* Calculate CPU power for physical packages and nodes */
- for_each_cpu_mask(i, *cpu_map) {
- int power;
- struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i);
- power = SCHED_LOAD_SCALE;
- sd->groups->cpu_power = power;
-#endif
-
- sd = &per_cpu(phys_domains, i);
- power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
- (cpus_weight(sd->groups->cpumask)-1) / 10;
- sd->groups->cpu_power = power;
-
-#ifdef CONFIG_NUMA
- sd = &per_cpu(allnodes_domains, i);
- if (sd->groups) {
- power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
- (cpus_weight(sd->groups->cpumask)-1) / 10;
- sd->groups->cpu_power = power;
- }
-#endif
- }
-
-#ifdef CONFIG_NUMA
- for (i = 0; i < MAX_NUMNODES; i++) {
- struct sched_group *sg = sched_group_nodes[i];
- int j;
-
- if (sg == NULL)
- continue;
-next_sg:
- for_each_cpu_mask(j, sg->cpumask) {
- struct sched_domain *sd;
- int power;
-
- sd = &per_cpu(phys_domains, j);
- if (j != first_cpu(sd->groups->cpumask)) {
- /*
- * Only add "power" once for each
- * physical package.
- */
- continue;
- }
- power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
- (cpus_weight(sd->groups->cpumask)-1) / 10;
-
- sg->cpu_power += power;
- }
- sg = sg->next;
- if (sg != sched_group_nodes[i])
- goto next_sg;
- }
-#endif
-
- /* Attach the domains */
- for_each_online_cpu(i) {
- struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i);
-#else
- sd = &per_cpu(phys_domains, i);
-#endif
- cpu_attach_domain(sd, i);
- }
- /*
- * Tune cache-hot values:
- */
- calibrate_migration_costs();
-}
-/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- */
-void arch_init_sched_domains(const cpumask_t *cpu_map)
-{
- cpumask_t cpu_default_map;
-
- /*
- * Setup mask for cpus without special case scheduling requirements.
- * For now this just excludes isolated cpus, but could be used to
- * exclude other special cases in the future.
- */
- cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
-
- build_sched_domains(&cpu_default_map);
-}
-
-void arch_destroy_sched_domains(const cpumask_t *cpu_map)
-{
-#ifdef CONFIG_NUMA
- int i;
- for (i = 0; i < MAX_NUMNODES; i++) {
- cpumask_t nodemask = node_to_cpumask(i);
- struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
- cpus_and(nodemask, nodemask, *cpu_map);
- if (cpus_empty(nodemask))
- continue;
-
- if (sg == NULL)
- continue;
- sg = sg->next;
-next_sg:
- oldsg = sg;
- sg = sg->next;
- kfree(oldsg);
- if (oldsg != sched_group_nodes[i])
- goto next_sg;
- sched_group_nodes[i] = NULL;
- }
-#endif
-}
-
Index: linux-sched-curr/include/asm-ia64/processor.h
===================================================================
--- linux-sched-curr.orig/include/asm-ia64/processor.h
+++ linux-sched-curr/include/asm-ia64/processor.h
@@ -20,9 +20,6 @@
#include <asm/ptrace.h>
#include <asm/ustack.h>
-/* Our arch specific arch_init_sched_domain is in arch/ia64/kernel/domain.c */
-#define ARCH_HAS_SCHED_DOMAIN
-
#define IA64_NUM_DBG_REGS 8
/*
* Limits for PMC and PMD are set to less than maximum architected values
Index: linux-sched-curr/include/asm-ia64/topology.h
===================================================================
--- linux-sched-curr.orig/include/asm-ia64/topology.h
+++ linux-sched-curr/include/asm-ia64/topology.h
@@ -96,28 +96,6 @@ void build_cpu_to_node_map(void);
.nr_balance_failed = 0, \
}
-/* sched_domains SD_ALLNODES_INIT for IA64 NUMA machines */
-#define SD_ALLNODES_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 64, \
- .max_interval = 64*num_online_cpus(), \
- .busy_factor = 128, \
- .imbalance_pct = 133, \
- .cache_nice_tries = 1, \
- .busy_idx = 3, \
- .idle_idx = 3, \
- .newidle_idx = 0, /* unused */ \
- .wake_idx = 0, /* unused */ \
- .forkexec_idx = 0, /* unused */ \
- .per_cpu_gain = 100, \
- .flags = SD_LOAD_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 64, \
- .nr_balance_failed = 0, \
-}
-
#endif /* CONFIG_NUMA */
#include <asm-generic/topology.h>
Index: linux-sched-curr/include/linux/sched.h
===================================================================
--- linux-sched-curr.orig/include/linux/sched.h
+++ linux-sched-curr/include/linux/sched.h
@@ -546,15 +546,6 @@ struct sched_domain {
extern void partition_sched_domains(cpumask_t *partition1,
cpumask_t *partition2);
-#ifdef ARCH_HAS_SCHED_DOMAIN
-/* Useful helpers that arch setup code may use. Defined in kernel/sched.c */
-extern cpumask_t cpu_isolated_map;
-extern void init_sched_build_groups(struct sched_group groups[],
- cpumask_t span, int (*group_fn)(int cpu));
-extern void cpu_attach_domain(struct sched_domain *sd, int cpu);
-
-#endif /* ARCH_HAS_SCHED_DOMAIN */
-
/*
* Maximum cache size the migration-costs auto-tuning code will
* search from:
Index: linux-sched-curr/include/linux/topology.h
===================================================================
--- linux-sched-curr.orig/include/linux/topology.h
+++ linux-sched-curr/include/linux/topology.h
@@ -133,6 +133,28 @@
}
#endif
+/* sched_domains SD_ALLNODES_INIT for NUMA machines */
+#define SD_ALLNODES_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 64, \
+ .max_interval = 64*num_online_cpus(), \
+ .busy_factor = 128, \
+ .imbalance_pct = 133, \
+ .cache_nice_tries = 1, \
+ .busy_idx = 3, \
+ .idle_idx = 3, \
+ .newidle_idx = 0, /* unused */ \
+ .wake_idx = 0, /* unused */ \
+ .forkexec_idx = 0, /* unused */ \
+ .per_cpu_gain = 100, \
+ .flags = SD_LOAD_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 64, \
+ .nr_balance_failed = 0, \
+}
+
#ifdef CONFIG_NUMA
#ifndef SD_NODE_INIT
#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
Index: linux-sched-curr/kernel/sched.c
===================================================================
--- linux-sched-curr.orig/kernel/sched.c
+++ linux-sched-curr/kernel/sched.c
@@ -4947,7 +4947,7 @@ static int sd_parent_degenerate(struct s
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
*/
-void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
{
runqueue_t *rq = cpu_rq(cpu);
struct sched_domain *tmp;
@@ -4970,7 +4970,7 @@ void cpu_attach_domain(struct sched_doma
}
/* cpus with isolated domains */
-cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
@@ -4998,8 +4998,8 @@ __setup ("isolcpus=", isolated_cpu_setup
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
-void init_sched_build_groups(struct sched_group groups[],
- cpumask_t span, int (*group_fn)(int cpu))
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+ int (*group_fn)(int cpu))
{
struct sched_group *first = NULL, *last = NULL;
cpumask_t covered = CPU_MASK_NONE;
@@ -5513,12 +5513,85 @@ void __devinit calibrate_migration_costs
local_irq_restore(flags);
}
+#define SD_NODES_PER_DOMAIN 16
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void build_sched_domains(const cpumask_t *cpu_map);
-extern void arch_init_sched_domains(const cpumask_t *cpu_map);
-extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
-#else
+#ifdef CONFIG_NUMA
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain. Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+ int i, n, val, min_val, best_node = 0;
+
+ min_val = INT_MAX;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ /* Start at @node */
+ n = (node + i) % MAX_NUMNODES;
+
+ if (!nr_cpus_node(n))
+ continue;
+
+ /* Skip already used nodes */
+ if (test_bit(n, used_nodes))
+ continue;
+
+ /* Simple min distance search */
+ val = node_distance(node, n);
+
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ set_bit(best_node, used_nodes);
+ return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span. It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+ int i;
+ cpumask_t span, nodemask;
+ DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+ cpus_clear(span);
+ bitmap_zero(used_nodes, MAX_NUMNODES);
+
+ nodemask = node_to_cpumask(node);
+ cpus_or(span, span, nodemask);
+ set_bit(node, used_nodes);
+
+ for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+ int next_node = find_next_best_node(node, used_nodes);
+ nodemask = node_to_cpumask(next_node);
+ cpus_or(span, span, nodemask);
+ }
+
+ return span;
+}
+#endif
+
+/*
+ * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
+ * can switch it on easily if needed.
+ */
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
@@ -5540,36 +5613,20 @@ static int cpu_to_phys_group(int cpu)
}
#ifdef CONFIG_NUMA
-
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int cpu_to_node_group(int cpu)
-{
- return cpu_to_node(cpu);
-}
-#endif
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
/*
- * The domains setup code relies on siblings not spanning
- * multiple nodes. Make sure the architecture has a proper
- * siblings map:
+ * The init_sched_build_groups can't handle what we want to do with node
+ * groups, so roll our own. Now each node has its own list of groups which
+ * gets dynamically allocated.
*/
-static void check_sibling_maps(void)
-{
- int i, j;
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+static struct sched_group *sched_group_nodes[MAX_NUMNODES];
- for_each_online_cpu(i) {
- for_each_cpu_mask(j, cpu_sibling_map[i]) {
- if (cpu_to_node(i) != cpu_to_node(j)) {
- printk(KERN_INFO "warning: CPU %d siblings map "
- "to different node - isolating "
- "them.\n", i);
- cpu_sibling_map[i] = cpumask_of_cpu(i);
- break;
- }
- }
- }
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static struct sched_group sched_group_allnodes[MAX_NUMNODES];
+
+static int cpu_to_allnodes_group(int cpu)
+{
+ return cpu_to_node(cpu);
}
#endif
@@ -5577,7 +5634,7 @@ static void check_sibling_maps(void)
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
*/
-static void build_sched_domains(const cpumask_t *cpu_map)
+void build_sched_domains(const cpumask_t *cpu_map)
{
int i;
@@ -5592,11 +5649,22 @@ static void build_sched_domains(const cp
cpus_and(nodemask, nodemask, *cpu_map);
#ifdef CONFIG_NUMA
+ if (num_online_cpus()
+ > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+ sd = &per_cpu(allnodes_domains, i);
+ *sd = SD_ALLNODES_INIT;
+ sd->span = *cpu_map;
+ group = cpu_to_allnodes_group(i);
+ sd->groups = &sched_group_allnodes[group];
+ p = sd;
+ } else
+ p = NULL;
+
sd = &per_cpu(node_domains, i);
- group = cpu_to_node_group(i);
*sd = SD_NODE_INIT;
- sd->span = *cpu_map;
- sd->groups = &sched_group_nodes[group];
+ sd->span = sched_domain_node_span(cpu_to_node(i));
+ sd->parent = p;
+ cpus_and(sd->span, sd->span, *cpu_map);
#endif
p = sd;
@@ -5621,7 +5689,7 @@ static void build_sched_domains(const cp
#ifdef CONFIG_SCHED_SMT
/* Set up CPU (sibling) groups */
- for_each_online_cpu(i) {
+ for_each_cpu_mask(i, *cpu_map) {
cpumask_t this_sibling_map = cpu_sibling_map[i];
cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
if (i != first_cpu(this_sibling_map))
@@ -5646,8 +5714,74 @@ static void build_sched_domains(const cp
#ifdef CONFIG_NUMA
/* Set up node groups */
- init_sched_build_groups(sched_group_nodes, *cpu_map,
- &cpu_to_node_group);
+ init_sched_build_groups(sched_group_allnodes, *cpu_map,
+ &cpu_to_allnodes_group);
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ /* Set up node groups */
+ struct sched_group *sg, *prev;
+ cpumask_t nodemask = node_to_cpumask(i);
+ cpumask_t domainspan;
+ cpumask_t covered = CPU_MASK_NONE;
+ int j;
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ if (cpus_empty(nodemask))
+ continue;
+
+ domainspan = sched_domain_node_span(i);
+ cpus_and(domainspan, domainspan, *cpu_map);
+
+ sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ sched_group_nodes[i] = sg;
+ for_each_cpu_mask(j, nodemask) {
+ struct sched_domain *sd;
+ sd = &per_cpu(node_domains, j);
+ sd->groups = sg;
+ if (sd->groups == NULL) {
+ /* Turn off balancing if we have no groups */
+ sd->flags = 0;
+ }
+ }
+ if (!sg) {
+ printk(KERN_WARNING
+ "Can not alloc domain group for node %d\n", i);
+ continue;
+ }
+ sg->cpu_power = 0;
+ sg->cpumask = nodemask;
+ cpus_or(covered, covered, nodemask);
+ prev = sg;
+
+ for (j = 0; j < MAX_NUMNODES; j++) {
+ cpumask_t tmp, notcovered;
+ int n = (i + j) % MAX_NUMNODES;
+
+ cpus_complement(notcovered, covered);
+ cpus_and(tmp, notcovered, *cpu_map);
+ cpus_and(tmp, tmp, domainspan);
+ if (cpus_empty(tmp))
+ break;
+
+ nodemask = node_to_cpumask(n);
+ cpus_and(tmp, tmp, nodemask);
+ if (cpus_empty(tmp))
+ continue;
+
+ sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ if (!sg) {
+ printk(KERN_WARNING
+ "Can not alloc domain group for node %d\n", j);
+ break;
+ }
+ sg->cpu_power = 0;
+ sg->cpumask = tmp;
+ cpus_or(covered, covered, tmp);
+ prev->next = sg;
+ prev = sg;
+ }
+ prev->next = sched_group_nodes[i];
+ }
#endif
/* Calculate CPU power for physical packages and nodes */
@@ -5666,16 +5800,48 @@ static void build_sched_domains(const cp
sd->groups->cpu_power = power;
#ifdef CONFIG_NUMA
- if (i == first_cpu(sd->groups->cpumask)) {
- /* Only add "power" once for each physical package. */
- sd = &per_cpu(node_domains, i);
- sd->groups->cpu_power += power;
+ sd = &per_cpu(allnodes_domains, i);
+ if (sd->groups) {
+ power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+ (cpus_weight(sd->groups->cpumask)-1) / 10;
+ sd->groups->cpu_power = power;
}
#endif
}
+#ifdef CONFIG_NUMA
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ struct sched_group *sg = sched_group_nodes[i];
+ int j;
+
+ if (sg == NULL)
+ continue;
+next_sg:
+ for_each_cpu_mask(j, sg->cpumask) {
+ struct sched_domain *sd;
+ int power;
+
+ sd = &per_cpu(phys_domains, j);
+ if (j != first_cpu(sd->groups->cpumask)) {
+ /*
+ * Only add "power" once for each
+ * physical package.
+ */
+ continue;
+ }
+ power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+ (cpus_weight(sd->groups->cpumask)-1) / 10;
+
+ sg->cpu_power += power;
+ }
+ sg = sg->next;
+ if (sg != sched_group_nodes[i])
+ goto next_sg;
+ }
+#endif
+
/* Attach the domains */
- for_each_cpu_mask(i, *cpu_map) {
+ for_each_online_cpu(i) {
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i);
@@ -5692,13 +5858,10 @@ static void build_sched_domains(const cp
/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
*/
-static void arch_init_sched_domains(cpumask_t *cpu_map)
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
{
cpumask_t cpu_default_map;
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
- check_sibling_maps();
-#endif
/*
* Setup mask for cpus without special case scheduling requirements.
* For now this just excludes isolated cpus, but could be used to
@@ -5711,10 +5874,29 @@ static void arch_init_sched_domains(cpum
static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
- /* Do nothing: everything is statically allocated. */
-}
+#ifdef CONFIG_NUMA
+ int i;
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ cpumask_t nodemask = node_to_cpumask(i);
+ struct sched_group *oldsg, *sg = sched_group_nodes[i];
-#endif /* ARCH_HAS_SCHED_DOMAIN */
+ cpus_and(nodemask, nodemask, *cpu_map);
+ if (cpus_empty(nodemask))
+ continue;
+
+ if (sg == NULL)
+ continue;
+ sg = sg->next;
+next_sg:
+ oldsg = sg;
+ sg = sg->next;
+ kfree(oldsg);
+ if (oldsg != sched_group_nodes[i])
+ goto next_sg;
+ sched_group_nodes[i] = NULL;
+ }
+#endif
+}
/*
* Detach sched domains from a group of cpus specified in cpu_map
* Re: [PATCH] ia64 cpuset + build_sched_domains() mangles structures
From: Dinakar Guniguntala @ 2005-08-22 14:14 UTC
To: Ingo Molnar; +Cc: John Hawkes, linux-ia64, linux-kernel, pj, nickpiggin, akpm
On Mon, Aug 22, 2005 at 09:08:34AM +0200, Ingo Molnar wrote:
>
> in terms of 2.6.14, the replacement patch below also does what i always
> wanted to do: to merge the ia64-specific build_sched_domains() code back
> into kernel/sched.c. I've done this by taking your improved dynamic
> build-domains code and putting it into kernel/sched.c.
>
Ingo, one change required to your patch and the exclusive
cpuset functionality seems to work fine on a NUMA ppc64 box.
I am still running some of my dynamic sched domain tests. So far
it seems to be holding ok.
Any idea why the ia64 stuff was forked in the first place?
The patch below is on top of your patch. (This is the earlier patch
John had sent)
-Dinakar
[-- Attachment #2: sd-numa.patch --]
diff -Naurp linux-2.6.13-rc6.ingo/kernel/sched.c linux-2.6.13-rc6/kernel/sched.c
--- linux-2.6.13-rc6.ingo/kernel/sched.c 2005-08-22 19:23:06.000000000 +0530
+++ linux-2.6.13-rc6/kernel/sched.c 2005-08-22 19:36:45.000000000 +0530
@@ -5192,7 +5192,7 @@ next_sg:
#endif
/* Attach the domains */
- for_each_online_cpu(i) {
+ for_each_cpu_mask(i, *cpu_map) {
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i);
* Re: [PATCH] ia64 cpuset + build_sched_domains() mangles structures
From: Ingo Molnar @ 2005-08-22 16:07 UTC
To: Dinakar Guniguntala
Cc: John Hawkes, linux-ia64, linux-kernel, pj, nickpiggin, akpm
* Dinakar Guniguntala <dino@in.ibm.com> wrote:
> On Mon, Aug 22, 2005 at 09:08:34AM +0200, Ingo Molnar wrote:
> >
> > in terms of 2.6.14, the replacement patch below also does what i always
> > wanted to do: to merge the ia64-specific build_sched_domains() code back
> > into kernel/sched.c. I've done this by taking your improved dynamic
> > build-domains code and putting it into kernel/sched.c.
> >
>
> Ingo, one change required to your patch and the exclusive
> cpuset functionality seems to work fine on a NUMA ppc64 box.
> I am still running some of my dynamic sched domain tests. So far
> it seems to be holding ok.
great! Andrew, i'd suggest we try the merged patch attached below in
-mm.
> Any idea why the ia64 stuff was forked in the first place?
most of the NUMA domain-trees stuff happened in the ia64 space so there
was a natural desire to keep it more hackable there. But now i think
it's getting counterproductive.
Ingo
-----
I've already sent this to the maintainers, and this is now being sent to a
larger community audience. I have fixed a problem with the ia64 version of
build_sched_domains(), but a similar fix still needs to be made to the
generic build_sched_domains() in kernel/sched.c.
The "dynamic sched domains" functionality has recently been merged into
2.6.13-rcN that sees the dynamic declaration of a cpu-exclusive (a.k.a.
"isolated") cpuset and rebuilds the CPU Scheduler sched domains and sched
groups to separate away the CPUs in this cpu-exclusive cpuset from the
remainder of the non-isolated CPUs. This allows the non-isolated CPUs to
completely ignore the isolated CPUs when doing load-balancing.
Unfortunately, build_sched_domains() expects that a sched domain will
include all the CPUs of each node in the domain, i.e., that no node will
belong in both an isolated cpuset and a non-isolated cpuset. Declaring
a cpuset that violates this presumption will produce flawed data
structures and will oops the kernel.
To trigger the problem (on a NUMA system with >1 CPUs per node):
cd /dev/cpuset
mkdir newcpuset
cd newcpuset
echo 0 >cpus
echo 0 >mems
echo 1 >cpu_exclusive
I have fixed this shortcoming for ia64 NUMA (with multiple CPUs per node).
A similar shortcoming exists in the generic build_sched_domains() (in
kernel/sched.c) for NUMA, and that needs to be fixed also. The fix involves
dynamically allocating sched_group_nodes[] and sched_group_allnodes[] for
each invocation of build_sched_domains(), rather than using global arrays
for these structures. Care must be taken to remember kmalloc() addresses
so that arch_destroy_sched_domains() can properly kfree() the new dynamic
structures.
This is a patch against 2.6.13-rc6.
Signed-off-by: John Hawkes <hawkes@sgi.com>
reworked the patch to also move the ia64 domain setup code to the generic
code.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
ppc64 fix
From: Dinakar Guniguntala <dino@in.ibm.com>
arch/ia64/kernel/domain.c | 400 -------------------------------------------
arch/ia64/kernel/Makefile | 2
include/asm-ia64/processor.h | 3
include/asm-ia64/topology.h | 22 --
include/linux/sched.h | 9
include/linux/topology.h | 22 ++
kernel/sched.c | 290 +++++++++++++++++++++++++------
7 files changed, 259 insertions(+), 489 deletions(-)
Index: linux-sched-curr/arch/ia64/kernel/Makefile
===================================================================
--- linux-sched-curr.orig/arch/ia64/kernel/Makefile
+++ linux-sched-curr/arch/ia64/kernel/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += acp
obj-$(CONFIG_IA64_PALINFO) += palinfo.o
obj-$(CONFIG_IOSAPIC) += iosapic.o
obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_SMP) += smp.o smpboot.o domain.o
+obj-$(CONFIG_SMP) += smp.o smpboot.o
obj-$(CONFIG_NUMA) += numa.o
obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o
obj-$(CONFIG_IA64_CYCLONE) += cyclone.o
Index: linux-sched-curr/arch/ia64/kernel/domain.c
===================================================================
--- linux-sched-curr.orig/arch/ia64/kernel/domain.c
+++ /dev/null
@@ -1,400 +0,0 @@
-/*
- * arch/ia64/kernel/domain.c
- * Architecture specific sched-domains builder.
- *
- * Copyright (C) 2004 Jesse Barnes
- * Copyright (C) 2004 Silicon Graphics, Inc.
- */
-
-#include <linux/sched.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-#include <linux/cpumask.h>
-#include <linux/init.h>
-#include <linux/topology.h>
-#include <linux/nodemask.h>
-
-#define SD_NODES_PER_DOMAIN 16
-
-#ifdef CONFIG_NUMA
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain. Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int find_next_best_node(int node, unsigned long *used_nodes)
-{
- int i, n, val, min_val, best_node = 0;
-
- min_val = INT_MAX;
-
- for (i = 0; i < MAX_NUMNODES; i++) {
- /* Start at @node */
- n = (node + i) % MAX_NUMNODES;
-
- if (!nr_cpus_node(n))
- continue;
-
- /* Skip already used nodes */
- if (test_bit(n, used_nodes))
- continue;
-
- /* Simple min distance search */
- val = node_distance(node, n);
-
- if (val < min_val) {
- min_val = val;
- best_node = n;
- }
- }
-
- set_bit(best_node, used_nodes);
- return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @size: number of nodes to include in this span
- *
- * Given a node, construct a good cpumask for its sched_domain to span. It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-static cpumask_t sched_domain_node_span(int node)
-{
- int i;
- cpumask_t span, nodemask;
- DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
-
- cpus_clear(span);
- bitmap_zero(used_nodes, MAX_NUMNODES);
-
- nodemask = node_to_cpumask(node);
- cpus_or(span, span, nodemask);
- set_bit(node, used_nodes);
-
- for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
- int next_node = find_next_best_node(node, used_nodes);
- nodemask = node_to_cpumask(next_node);
- cpus_or(span, span, nodemask);
- }
-
- return span;
-}
-#endif
-
-/*
- * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
- * can switch it on easily if needed.
- */
-#ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static struct sched_group sched_group_cpus[NR_CPUS];
-static int cpu_to_cpu_group(int cpu)
-{
- return cpu;
-}
-#endif
-
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group sched_group_phys[NR_CPUS];
-static int cpu_to_phys_group(int cpu)
-{
-#ifdef CONFIG_SCHED_SMT
- return first_cpu(cpu_sibling_map[cpu]);
-#else
- return cpu;
-#endif
-}
-
-#ifdef CONFIG_NUMA
-/*
- * The init_sched_build_groups can't handle what we want to do with node
- * groups, so roll our own. Now each node has its own list of groups which
- * gets dynamically allocated.
- */
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group *sched_group_nodes[MAX_NUMNODES];
-
-static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
-static struct sched_group sched_group_allnodes[MAX_NUMNODES];
-
-static int cpu_to_allnodes_group(int cpu)
-{
- return cpu_to_node(cpu);
-}
-#endif
-
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-void build_sched_domains(const cpumask_t *cpu_map)
-{
- int i;
-
- /*
- * Set up domains for cpus specified by the cpu_map.
- */
- for_each_cpu_mask(i, *cpu_map) {
- int group;
- struct sched_domain *sd = NULL, *p;
- cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
-
- cpus_and(nodemask, nodemask, *cpu_map);
-
-#ifdef CONFIG_NUMA
- if (num_online_cpus()
- > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
- sd = &per_cpu(allnodes_domains, i);
- *sd = SD_ALLNODES_INIT;
- sd->span = *cpu_map;
- group = cpu_to_allnodes_group(i);
- sd->groups = &sched_group_allnodes[group];
- p = sd;
- } else
- p = NULL;
-
- sd = &per_cpu(node_domains, i);
- *sd = SD_NODE_INIT;
- sd->span = sched_domain_node_span(cpu_to_node(i));
- sd->parent = p;
- cpus_and(sd->span, sd->span, *cpu_map);
-#endif
-
- p = sd;
- sd = &per_cpu(phys_domains, i);
- group = cpu_to_phys_group(i);
- *sd = SD_CPU_INIT;
- sd->span = nodemask;
- sd->parent = p;
- sd->groups = &sched_group_phys[group];
-
-#ifdef CONFIG_SCHED_SMT
- p = sd;
- sd = &per_cpu(cpu_domains, i);
- group = cpu_to_cpu_group(i);
- *sd = SD_SIBLING_INIT;
- sd->span = cpu_sibling_map[i];
- cpus_and(sd->span, sd->span, *cpu_map);
- sd->parent = p;
- sd->groups = &sched_group_cpus[group];
-#endif
- }
-
-#ifdef CONFIG_SCHED_SMT
- /* Set up CPU (sibling) groups */
- for_each_cpu_mask(i, *cpu_map) {
- cpumask_t this_sibling_map = cpu_sibling_map[i];
- cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
- if (i != first_cpu(this_sibling_map))
- continue;
-
- init_sched_build_groups(sched_group_cpus, this_sibling_map,
- &cpu_to_cpu_group);
- }
-#endif
-
- /* Set up physical groups */
- for (i = 0; i < MAX_NUMNODES; i++) {
- cpumask_t nodemask = node_to_cpumask(i);
-
- cpus_and(nodemask, nodemask, *cpu_map);
- if (cpus_empty(nodemask))
- continue;
-
- init_sched_build_groups(sched_group_phys, nodemask,
- &cpu_to_phys_group);
- }
-
-#ifdef CONFIG_NUMA
- init_sched_build_groups(sched_group_allnodes, *cpu_map,
- &cpu_to_allnodes_group);
-
- for (i = 0; i < MAX_NUMNODES; i++) {
- /* Set up node groups */
- struct sched_group *sg, *prev;
- cpumask_t nodemask = node_to_cpumask(i);
- cpumask_t domainspan;
- cpumask_t covered = CPU_MASK_NONE;
- int j;
-
- cpus_and(nodemask, nodemask, *cpu_map);
- if (cpus_empty(nodemask))
- continue;
-
- domainspan = sched_domain_node_span(i);
- cpus_and(domainspan, domainspan, *cpu_map);
-
- sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
- sched_group_nodes[i] = sg;
- for_each_cpu_mask(j, nodemask) {
- struct sched_domain *sd;
- sd = &per_cpu(node_domains, j);
- sd->groups = sg;
- if (sd->groups == NULL) {
- /* Turn off balancing if we have no groups */
- sd->flags = 0;
- }
- }
- if (!sg) {
- printk(KERN_WARNING
- "Can not alloc domain group for node %d\n", i);
- continue;
- }
- sg->cpu_power = 0;
- sg->cpumask = nodemask;
- cpus_or(covered, covered, nodemask);
- prev = sg;
-
- for (j = 0; j < MAX_NUMNODES; j++) {
- cpumask_t tmp, notcovered;
- int n = (i + j) % MAX_NUMNODES;
-
- cpus_complement(notcovered, covered);
- cpus_and(tmp, notcovered, *cpu_map);
- cpus_and(tmp, tmp, domainspan);
- if (cpus_empty(tmp))
- break;
-
- nodemask = node_to_cpumask(n);
- cpus_and(tmp, tmp, nodemask);
- if (cpus_empty(tmp))
- continue;
-
- sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
- if (!sg) {
- printk(KERN_WARNING
- "Can not alloc domain group for node %d\n", j);
- break;
- }
- sg->cpu_power = 0;
- sg->cpumask = tmp;
- cpus_or(covered, covered, tmp);
- prev->next = sg;
- prev = sg;
- }
- prev->next = sched_group_nodes[i];
- }
-#endif
-
- /* Calculate CPU power for physical packages and nodes */
- for_each_cpu_mask(i, *cpu_map) {
- int power;
- struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i);
- power = SCHED_LOAD_SCALE;
- sd->groups->cpu_power = power;
-#endif
-
- sd = &per_cpu(phys_domains, i);
- power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
- (cpus_weight(sd->groups->cpumask)-1) / 10;
- sd->groups->cpu_power = power;
-
-#ifdef CONFIG_NUMA
- sd = &per_cpu(allnodes_domains, i);
- if (sd->groups) {
- power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
- (cpus_weight(sd->groups->cpumask)-1) / 10;
- sd->groups->cpu_power = power;
- }
-#endif
- }
-
-#ifdef CONFIG_NUMA
- for (i = 0; i < MAX_NUMNODES; i++) {
- struct sched_group *sg = sched_group_nodes[i];
- int j;
-
- if (sg == NULL)
- continue;
-next_sg:
- for_each_cpu_mask(j, sg->cpumask) {
- struct sched_domain *sd;
- int power;
-
- sd = &per_cpu(phys_domains, j);
- if (j != first_cpu(sd->groups->cpumask)) {
- /*
- * Only add "power" once for each
- * physical package.
- */
- continue;
- }
- power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
- (cpus_weight(sd->groups->cpumask)-1) / 10;
-
- sg->cpu_power += power;
- }
- sg = sg->next;
- if (sg != sched_group_nodes[i])
- goto next_sg;
- }
-#endif
-
- /* Attach the domains */
- for_each_online_cpu(i) {
- struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i);
-#else
- sd = &per_cpu(phys_domains, i);
-#endif
- cpu_attach_domain(sd, i);
- }
- /*
- * Tune cache-hot values:
- */
- calibrate_migration_costs();
-}
-/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
- */
-void arch_init_sched_domains(const cpumask_t *cpu_map)
-{
- cpumask_t cpu_default_map;
-
- /*
- * Setup mask for cpus without special case scheduling requirements.
- * For now this just excludes isolated cpus, but could be used to
- * exclude other special cases in the future.
- */
- cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
-
- build_sched_domains(&cpu_default_map);
-}
-
-void arch_destroy_sched_domains(const cpumask_t *cpu_map)
-{
-#ifdef CONFIG_NUMA
- int i;
- for (i = 0; i < MAX_NUMNODES; i++) {
- cpumask_t nodemask = node_to_cpumask(i);
- struct sched_group *oldsg, *sg = sched_group_nodes[i];
-
- cpus_and(nodemask, nodemask, *cpu_map);
- if (cpus_empty(nodemask))
- continue;
-
- if (sg == NULL)
- continue;
- sg = sg->next;
-next_sg:
- oldsg = sg;
- sg = sg->next;
- kfree(oldsg);
- if (oldsg != sched_group_nodes[i])
- goto next_sg;
- sched_group_nodes[i] = NULL;
- }
-#endif
-}
-
Index: linux-sched-curr/include/asm-ia64/processor.h
===================================================================
--- linux-sched-curr.orig/include/asm-ia64/processor.h
+++ linux-sched-curr/include/asm-ia64/processor.h
@@ -20,9 +20,6 @@
#include <asm/ptrace.h>
#include <asm/ustack.h>
-/* Our arch specific arch_init_sched_domain is in arch/ia64/kernel/domain.c */
-#define ARCH_HAS_SCHED_DOMAIN
-
#define IA64_NUM_DBG_REGS 8
/*
* Limits for PMC and PMD are set to less than maximum architected values
Index: linux-sched-curr/include/asm-ia64/topology.h
===================================================================
--- linux-sched-curr.orig/include/asm-ia64/topology.h
+++ linux-sched-curr/include/asm-ia64/topology.h
@@ -96,28 +96,6 @@ void build_cpu_to_node_map(void);
.nr_balance_failed = 0, \
}
-/* sched_domains SD_ALLNODES_INIT for IA64 NUMA machines */
-#define SD_ALLNODES_INIT (struct sched_domain) { \
- .span = CPU_MASK_NONE, \
- .parent = NULL, \
- .groups = NULL, \
- .min_interval = 64, \
- .max_interval = 64*num_online_cpus(), \
- .busy_factor = 128, \
- .imbalance_pct = 133, \
- .cache_nice_tries = 1, \
- .busy_idx = 3, \
- .idle_idx = 3, \
- .newidle_idx = 0, /* unused */ \
- .wake_idx = 0, /* unused */ \
- .forkexec_idx = 0, /* unused */ \
- .per_cpu_gain = 100, \
- .flags = SD_LOAD_BALANCE, \
- .last_balance = jiffies, \
- .balance_interval = 64, \
- .nr_balance_failed = 0, \
-}
-
#endif /* CONFIG_NUMA */
#include <asm-generic/topology.h>
Index: linux-sched-curr/include/linux/sched.h
===================================================================
--- linux-sched-curr.orig/include/linux/sched.h
+++ linux-sched-curr/include/linux/sched.h
@@ -546,15 +546,6 @@ struct sched_domain {
extern void partition_sched_domains(cpumask_t *partition1,
cpumask_t *partition2);
-#ifdef ARCH_HAS_SCHED_DOMAIN
-/* Useful helpers that arch setup code may use. Defined in kernel/sched.c */
-extern cpumask_t cpu_isolated_map;
-extern void init_sched_build_groups(struct sched_group groups[],
- cpumask_t span, int (*group_fn)(int cpu));
-extern void cpu_attach_domain(struct sched_domain *sd, int cpu);
-
-#endif /* ARCH_HAS_SCHED_DOMAIN */
-
/*
* Maximum cache size the migration-costs auto-tuning code will
* search from:
Index: linux-sched-curr/include/linux/topology.h
===================================================================
--- linux-sched-curr.orig/include/linux/topology.h
+++ linux-sched-curr/include/linux/topology.h
@@ -133,6 +133,28 @@
}
#endif
+/* sched_domains SD_ALLNODES_INIT for NUMA machines */
+#define SD_ALLNODES_INIT (struct sched_domain) { \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 64, \
+ .max_interval = 64*num_online_cpus(), \
+ .busy_factor = 128, \
+ .imbalance_pct = 133, \
+ .cache_nice_tries = 1, \
+ .busy_idx = 3, \
+ .idle_idx = 3, \
+ .newidle_idx = 0, /* unused */ \
+ .wake_idx = 0, /* unused */ \
+ .forkexec_idx = 0, /* unused */ \
+ .per_cpu_gain = 100, \
+ .flags = SD_LOAD_BALANCE, \
+ .last_balance = jiffies, \
+ .balance_interval = 64, \
+ .nr_balance_failed = 0, \
+}
+
#ifdef CONFIG_NUMA
#ifndef SD_NODE_INIT
#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
Index: linux-sched-curr/kernel/sched.c
===================================================================
--- linux-sched-curr.orig/kernel/sched.c
+++ linux-sched-curr/kernel/sched.c
@@ -4947,7 +4947,7 @@ static int sd_parent_degenerate(struct s
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
*/
-void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void cpu_attach_domain(struct sched_domain *sd, int cpu)
{
runqueue_t *rq = cpu_rq(cpu);
struct sched_domain *tmp;
@@ -4970,7 +4970,7 @@ void cpu_attach_domain(struct sched_doma
}
/* cpus with isolated domains */
-cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
@@ -4998,8 +4998,8 @@ __setup ("isolcpus=", isolated_cpu_setup
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
-void init_sched_build_groups(struct sched_group groups[],
- cpumask_t span, int (*group_fn)(int cpu))
+static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+ int (*group_fn)(int cpu))
{
struct sched_group *first = NULL, *last = NULL;
cpumask_t covered = CPU_MASK_NONE;
@@ -5513,12 +5513,85 @@ void __devinit calibrate_migration_costs
local_irq_restore(flags);
}
+#define SD_NODES_PER_DOMAIN 16
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void build_sched_domains(const cpumask_t *cpu_map);
-extern void arch_init_sched_domains(const cpumask_t *cpu_map);
-extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
-#else
+#ifdef CONFIG_NUMA
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain. Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int find_next_best_node(int node, unsigned long *used_nodes)
+{
+ int i, n, val, min_val, best_node = 0;
+
+ min_val = INT_MAX;
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ /* Start at @node */
+ n = (node + i) % MAX_NUMNODES;
+
+ if (!nr_cpus_node(n))
+ continue;
+
+ /* Skip already used nodes */
+ if (test_bit(n, used_nodes))
+ continue;
+
+ /* Simple min distance search */
+ val = node_distance(node, n);
+
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ set_bit(best_node, used_nodes);
+ return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span. It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+static cpumask_t sched_domain_node_span(int node)
+{
+ int i;
+ cpumask_t span, nodemask;
+ DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+ cpus_clear(span);
+ bitmap_zero(used_nodes, MAX_NUMNODES);
+
+ nodemask = node_to_cpumask(node);
+ cpus_or(span, span, nodemask);
+ set_bit(node, used_nodes);
+
+ for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
+ int next_node = find_next_best_node(node, used_nodes);
+ nodemask = node_to_cpumask(next_node);
+ cpus_or(span, span, nodemask);
+ }
+
+ return span;
+}
+#endif
+
+/*
+ * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
+ * can switch it on easily if needed.
+ */
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
@@ -5540,44 +5613,28 @@ static int cpu_to_phys_group(int cpu)
}
#ifdef CONFIG_NUMA
-
+/*
+ * The init_sched_build_groups can't handle what we want to do with node
+ * groups, so roll our own. Now each node has its own list of groups which
+ * gets dynamically allocated.
+ */
static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int cpu_to_node_group(int cpu)
+static struct sched_group *sched_group_nodes[MAX_NUMNODES];
+
+static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+static struct sched_group sched_group_allnodes[MAX_NUMNODES];
+
+static int cpu_to_allnodes_group(int cpu)
{
return cpu_to_node(cpu);
}
#endif
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-/*
- * The domains setup code relies on siblings not spanning
- * multiple nodes. Make sure the architecture has a proper
- * siblings map:
- */
-static void check_sibling_maps(void)
-{
- int i, j;
-
- for_each_online_cpu(i) {
- for_each_cpu_mask(j, cpu_sibling_map[i]) {
- if (cpu_to_node(i) != cpu_to_node(j)) {
- printk(KERN_INFO "warning: CPU %d siblings map "
- "to different node - isolating "
- "them.\n", i);
- cpu_sibling_map[i] = cpumask_of_cpu(i);
- break;
- }
- }
- }
-}
-#endif
-
/*
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
*/
-static void build_sched_domains(const cpumask_t *cpu_map)
+void build_sched_domains(const cpumask_t *cpu_map)
{
int i;
@@ -5592,11 +5649,22 @@ static void build_sched_domains(const cp
cpus_and(nodemask, nodemask, *cpu_map);
#ifdef CONFIG_NUMA
+ if (num_online_cpus()
+ > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+ sd = &per_cpu(allnodes_domains, i);
+ *sd = SD_ALLNODES_INIT;
+ sd->span = *cpu_map;
+ group = cpu_to_allnodes_group(i);
+ sd->groups = &sched_group_allnodes[group];
+ p = sd;
+ } else
+ p = NULL;
+
sd = &per_cpu(node_domains, i);
- group = cpu_to_node_group(i);
*sd = SD_NODE_INIT;
- sd->span = *cpu_map;
- sd->groups = &sched_group_nodes[group];
+ sd->span = sched_domain_node_span(cpu_to_node(i));
+ sd->parent = p;
+ cpus_and(sd->span, sd->span, *cpu_map);
#endif
p = sd;
@@ -5621,7 +5689,7 @@ static void build_sched_domains(const cp
#ifdef CONFIG_SCHED_SMT
/* Set up CPU (sibling) groups */
- for_each_online_cpu(i) {
+ for_each_cpu_mask(i, *cpu_map) {
cpumask_t this_sibling_map = cpu_sibling_map[i];
cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
if (i != first_cpu(this_sibling_map))
@@ -5646,8 +5714,74 @@ static void build_sched_domains(const cp
#ifdef CONFIG_NUMA
/* Set up node groups */
- init_sched_build_groups(sched_group_nodes, *cpu_map,
- &cpu_to_node_group);
+ init_sched_build_groups(sched_group_allnodes, *cpu_map,
+ &cpu_to_allnodes_group);
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ /* Set up node groups */
+ struct sched_group *sg, *prev;
+ cpumask_t nodemask = node_to_cpumask(i);
+ cpumask_t domainspan;
+ cpumask_t covered = CPU_MASK_NONE;
+ int j;
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ if (cpus_empty(nodemask))
+ continue;
+
+ domainspan = sched_domain_node_span(i);
+ cpus_and(domainspan, domainspan, *cpu_map);
+
+ sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ sched_group_nodes[i] = sg;
+ for_each_cpu_mask(j, nodemask) {
+ struct sched_domain *sd;
+ sd = &per_cpu(node_domains, j);
+ sd->groups = sg;
+ if (sd->groups == NULL) {
+ /* Turn off balancing if we have no groups */
+ sd->flags = 0;
+ }
+ }
+ if (!sg) {
+ printk(KERN_WARNING
+ "Can not alloc domain group for node %d\n", i);
+ continue;
+ }
+ sg->cpu_power = 0;
+ sg->cpumask = nodemask;
+ cpus_or(covered, covered, nodemask);
+ prev = sg;
+
+ for (j = 0; j < MAX_NUMNODES; j++) {
+ cpumask_t tmp, notcovered;
+ int n = (i + j) % MAX_NUMNODES;
+
+ cpus_complement(notcovered, covered);
+ cpus_and(tmp, notcovered, *cpu_map);
+ cpus_and(tmp, tmp, domainspan);
+ if (cpus_empty(tmp))
+ break;
+
+ nodemask = node_to_cpumask(n);
+ cpus_and(tmp, tmp, nodemask);
+ if (cpus_empty(tmp))
+ continue;
+
+ sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL);
+ if (!sg) {
+ printk(KERN_WARNING
+ "Can not alloc domain group for node %d\n", j);
+ break;
+ }
+ sg->cpu_power = 0;
+ sg->cpumask = tmp;
+ cpus_or(covered, covered, tmp);
+ prev->next = sg;
+ prev = sg;
+ }
+ prev->next = sched_group_nodes[i];
+ }
#endif
/* Calculate CPU power for physical packages and nodes */
@@ -5666,14 +5800,46 @@ static void build_sched_domains(const cp
sd->groups->cpu_power = power;
#ifdef CONFIG_NUMA
- if (i == first_cpu(sd->groups->cpumask)) {
- /* Only add "power" once for each physical package. */
- sd = &per_cpu(node_domains, i);
- sd->groups->cpu_power += power;
+ sd = &per_cpu(allnodes_domains, i);
+ if (sd->groups) {
+ power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+ (cpus_weight(sd->groups->cpumask)-1) / 10;
+ sd->groups->cpu_power = power;
}
#endif
}
+#ifdef CONFIG_NUMA
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ struct sched_group *sg = sched_group_nodes[i];
+ int j;
+
+ if (sg == NULL)
+ continue;
+next_sg:
+ for_each_cpu_mask(j, sg->cpumask) {
+ struct sched_domain *sd;
+ int power;
+
+ sd = &per_cpu(phys_domains, j);
+ if (j != first_cpu(sd->groups->cpumask)) {
+ /*
+ * Only add "power" once for each
+ * physical package.
+ */
+ continue;
+ }
+ power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+ (cpus_weight(sd->groups->cpumask)-1) / 10;
+
+ sg->cpu_power += power;
+ }
+ sg = sg->next;
+ if (sg != sched_group_nodes[i])
+ goto next_sg;
+ }
+#endif
+
/* Attach the domains */
for_each_cpu_mask(i, *cpu_map) {
struct sched_domain *sd;
@@ -5692,13 +5858,10 @@ static void build_sched_domains(const cp
/*
* Set up scheduler domains and groups. Callers must hold the hotplug lock.
*/
-static void arch_init_sched_domains(cpumask_t *cpu_map)
+static void arch_init_sched_domains(const cpumask_t *cpu_map)
{
cpumask_t cpu_default_map;
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
- check_sibling_maps();
-#endif
/*
* Setup mask for cpus without special case scheduling requirements.
* For now this just excludes isolated cpus, but could be used to
@@ -5711,10 +5874,29 @@ static void arch_init_sched_domains(cpum
static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
- /* Do nothing: everything is statically allocated. */
-}
+#ifdef CONFIG_NUMA
+ int i;
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ cpumask_t nodemask = node_to_cpumask(i);
+ struct sched_group *oldsg, *sg = sched_group_nodes[i];
+
+ cpus_and(nodemask, nodemask, *cpu_map);
+ if (cpus_empty(nodemask))
+ continue;
-#endif /* ARCH_HAS_SCHED_DOMAIN */
+ if (sg == NULL)
+ continue;
+ sg = sg->next;
+next_sg:
+ oldsg = sg;
+ sg = sg->next;
+ kfree(oldsg);
+ if (oldsg != sched_group_nodes[i])
+ goto next_sg;
+ sched_group_nodes[i] = NULL;
+ }
+#endif
+}
/*
* Detach sched domains from a group of cpus specified in cpu_map
* Re: [PATCH] ia64 cpuset + build_sched_domains() mangles structures
2005-08-22 16:07 ` Ingo Molnar
@ 2005-08-22 20:16 ` Dinakar Guniguntala
2005-08-22 20:18 ` Dinakar Guniguntala
2005-09-02 14:47 ` Dinakar Guniguntala
1 sibling, 1 reply; 8+ messages in thread
From: Dinakar Guniguntala @ 2005-08-22 20:16 UTC (permalink / raw)
To: Ingo Molnar; +Cc: John Hawkes, linux-ia64, linux-kernel, pj, nickpiggin, akpm
On Mon, Aug 22, 2005 at 06:07:19PM +0200, Ingo Molnar wrote:
> great! Andrew, i'd suggest we try the merged patch attached below in
> -mm.
>
Ingo, unfortunately I am hitting panics on stress testing. The panic
screen is attached in the .png below.
On debugging I found that the panic happens consistently at this line
of code in find_busiest_group():
*imbalance = min((max_load - avg_load) * busiest->cpu_power,
(avg_load - this_load) * this->cpu_power)
/ SCHED_LOAD_SCALE;
Here I find that the "this" pointer is still NULL. I verified this with
a quick hack, shown below, in the same function; with this hack it seems
to run for hours:
- if (!busiest || this_load >= max_load)
+ if (!this || !busiest || this_load >= max_load)
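[Editor's aside, for readers following the arithmetic: below is a minimal,
standalone sketch of the guarded computation, written in plain C (where
"this" is not a keyword). It is an illustration only, not code from the
kernel or from this thread; the struct, the min_ul() helper, the load
figures and the SCHED_LOAD_SCALE stand-in are all made-up assumptions, and
only the shape of the guard and of the imbalance formula mirror the lines
quoted above.]

/*
 * Editorial sketch: the find_busiest_group() imbalance math with the
 * extra "!this" guard.  All names and numbers are stand-ins.
 */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* stand-in value, not taken from 2.6.13 */

struct sched_group { unsigned long cpu_power; };

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	struct sched_group busiest_grp = { .cpu_power = 128 };
	struct sched_group this_grp = { .cpu_power = 128 };
	struct sched_group *busiest = &busiest_grp;
	struct sched_group *this = &this_grp;	/* set to NULL to walk the bug path */
	unsigned long max_load = 384, avg_load = 256, this_load = 128;
	unsigned long imbalance;

	/* The hacked guard: bail out before dereferencing a NULL "this". */
	if (!this || !busiest || this_load >= max_load) {
		printf("nothing to balance\n");
		return 0;
	}

	imbalance = min_ul((max_load - avg_load) * busiest->cpu_power,
			   (avg_load - this_load) * this->cpu_power)
			/ SCHED_LOAD_SCALE;
	printf("imbalance = %lu\n", imbalance);
	return 0;
}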
This can only happen if none of the sched groups pointed to by the
'sd' of the current cpu contain the current cpu. I was wondering if
this had anything to do with the way that we are using RCU to assign/
read the 'sd' pointer.
Any thoughts ??
-Dinakar
* Re: [PATCH] ia64 cpuset + build_sched_domains() mangles structures
2005-08-22 20:16 ` Dinakar Guniguntala
@ 2005-08-22 20:18 ` Dinakar Guniguntala
0 siblings, 0 replies; 8+ messages in thread
From: Dinakar Guniguntala @ 2005-08-22 20:18 UTC (permalink / raw)
To: Ingo Molnar; +Cc: John Hawkes, linux-ia64, linux-kernel, pj, nickpiggin, akpm
On Tue, Aug 23, 2005 at 01:46:26AM +0530, Dinakar Guniguntala wrote:
> On Mon, Aug 22, 2005 at 06:07:19PM +0200, Ingo Molnar wrote:
> > great! Andrew, i'd suggest we try the merged patch attached below in
> > -mm.
> >
>
> Ingo, unfortunately I am hitting panics on stress testing. The panic
> screen is attached in the .png below.
Sorry, forgot to add the .png. Here it is...
>
> On debugging I found that the panic happens consistently in this line
> of code in function find_busiest_group
>
> *imbalance = min((max_load - avg_load) * busiest->cpu_power,
> (avg_load - this_load) * this->cpu_power)
> / SCHED_LOAD_SCALE;
>
> Here I find that the "this" pointer is still NULL. I verified this by
> a quick hack as below in the same function and with this hack it seems
> to run for hours
>
> - if (!busiest || this_load >= max_load)
> + if (!this || !busiest || this_load >= max_load)
>
> This can only happen if the none of the sched groups pointed to by the
> 'sd' of the current cpu contain the current cpu. I was wondering if
> this had anything to do with the way that we are using RCU to assign/
> read the 'sd' pointer.
>
> Any thoughts ??
>
> -Dinakar
>
[-- Attachment #2: sd-panic.png --]
[-- Type: image/png, Size: 5726 bytes --]
* Re: [PATCH] ia64 cpuset + build_sched_domains() mangles structures
2005-08-20 14:50 [PATCH] ia64 cpuset + build_sched_domains() mangles structures John Hawkes
2005-08-22 7:08 ` Ingo Molnar
@ 2005-08-22 20:38 ` Paul Jackson
1 sibling, 0 replies; 8+ messages in thread
From: Paul Jackson @ 2005-08-22 20:38 UTC (permalink / raw)
To: John Hawkes; +Cc: linux-ia64, linux-kernel, mingo, dino, nickpiggin, akpm
[P.S. - I just noticed that Dinakar is reporting another problem,
something about a panic on stress testing. Dinakar - should
we adopt what I call choice (2) below, to disable this feature
in 2.6.13, instead of the choice (3) I recommend below? -pj ]
In separate email, Nick suggested disabling some of this mechanism in
2.6.13, in order to provide more soak time for Hawkes' dynamic sched
domain fix for partial nodes in 2.6.14.
Nick's hope was to find some quick and easy (and safe) way to avoid the
oops that Hawkes found for 2.6.13 (which is almost out the door).
There are three easy ways I see offhand to avoid this
oops in 2.6.13:
1) disable 'cpu_exclusive' cpusets
2) disable Dinakar's patch entirely.
3) disable Dinakar's patch just for partial node cpusets.
The 'cpu_exclusive' cpusets are already in the kernel, as of 2.6.12,
though until Dinakar's patch is added, they are of limited usefulness.
I'd mildly prefer not to disable them in 2.6.13 (choice (1) above),
unless we can't find a better way. A few poor folks off in some corner
of cpuset land would probably notice that something that worked in
2.6.12 was disconnected in 2.6.13.
What Dinakar's patch, added for 2.6.13, does is (roughly):
If the cpuset code notices that it is setting up a cpuset
marked 'cpu_exclusive', then the cpuset code calls the
dynamic sched domain code to define a sched domain along
the boundaries of that cpuset.
The safest, mind-numbingly simple thing to do that would avoid
the oops that Hawkes reported is to simply not have the cpuset
code call the code to set up a dynamic sched domain. This is
choice (2) above, and could be done at the last hour with
relative safety.
Here is an untested patch that does (2):
=====
Index: linux-2.6.13-cpuset-mempolicy-migrate/kernel/cpuset.c
===================================================================
--- linux-2.6.13-cpuset-mempolicy-migrate.orig/kernel/cpuset.c
+++ linux-2.6.13-cpuset-mempolicy-migrate/kernel/cpuset.c
@@ -627,6 +627,15 @@ static int validate_change(const struct
* Call with cpuset_sem held. May nest a call to the
* lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
*/
+
+/*
+ * Hack to avoid 2.6.13 partial node dynamic sched domain bug.
+ * Disable letting 'cpu_exclusive' cpusets define dynamic sched
+ * domains, until the sched domain can handle partial nodes.
+ * Remove this ifdef hackery when sched domains fixed.
+ */
+#define DISABLE_EXCLUSIVE_CPU_DOMAINS 1
+#ifndef DISABLE_EXCLUSIVE_CPU_DOMAINS
static void update_cpu_domains(struct cpuset *cur)
{
struct cpuset *c, *par = cur->parent;
@@ -667,6 +676,11 @@ static void update_cpu_domains(struct cp
partition_sched_domains(&pspan, &cspan);
unlock_cpu_hotplug();
}
+#else
+static void update_cpu_domains(struct cpuset *cur)
+{
+}
+#endif
static int update_cpumask(struct cpuset *cs, char *buf)
{
=====
For a half-dozen easy lines of code, if we have a few hours and
a chance for Dinakar and Hawkes to concur, the best solution is
choice (3) above. The cpuset code that calls out to the code
to set up a dynamic sched domain already avoids that call out for
a variety of conditions (such as an empty cpuset or the top
cpuset). It can just as easily avoid the call out for one more
reason: that the cpuset in question is not on node boundaries
(i.e., for some node it uses a proper, non-empty subset of the
cpus on that node).
Here is an untested patch that does this:
=====
Index: linux-2.6.13-cpuset-mempolicy-migrate/kernel/cpuset.c
===================================================================
--- linux-2.6.13-cpuset-mempolicy-migrate.orig/kernel/cpuset.c
+++ linux-2.6.13-cpuset-mempolicy-migrate/kernel/cpuset.c
@@ -636,6 +636,23 @@ static void update_cpu_domains(struct cp
return;
/*
+ * Hack to avoid 2.6.13 partial node dynamic sched domain bug.
+ * Require the 'cpu_exclusive' cpuset to include all (or none)
+ * of the CPUs on each node, or return w/o changing sched domains.
+ * Remove this hack when dynamic sched domains fixed.
+ */
+ {
+ int i, j;
+
+ for_each_cpu_mask(i, cur->cpus_allowed) {
+ for_each_cpu_mask(j, node_to_cpumask(cpu_to_node(i))) {
+ if (!cpu_isset(j, cur->cpus_allowed))
+ return;
+ }
+ }
+ }
+
+ /*
* Get all cpus from parent's cpus_allowed not part of exclusive
* children
*/
=====
I recommend this last patch above - choice (3). It provides what
Dinakar intended, allowing user control of sched domains, while
avoiding the corner case that Hawkes reported, where a sched
domain not on a node boundary could oops the kernel.
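[Editor's aside: to make the node-boundary test concrete, here is a small
standalone sketch of the predicate choice (3) relies on. It is an
illustration only, not part of either patch above; it uses plain bitmasks
and an assumed two-CPUs-per-node topology in place of cpumask_t and the
real topology helpers.]

/*
 * Editorial sketch of the choice (3) predicate: a cpuset sits on node
 * boundaries iff, for every CPU it contains, it also contains every
 * other CPU of that CPU's node.  The topology below is an assumption.
 */
#include <stdio.h>

#define NR_CPUS		8
#define CPUS_PER_NODE	2	/* assumed topology: nodes {0,1}, {2,3}, ... */

static int cpu_to_node(int cpu)
{
	return cpu / CPUS_PER_NODE;
}

static unsigned long node_to_cpumask(int node)
{
	return ((1UL << CPUS_PER_NODE) - 1) << (node * CPUS_PER_NODE);
}

static int cpuset_on_node_boundaries(unsigned long cpus_allowed)
{
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!(cpus_allowed & (1UL << cpu)))
			continue;
		/* Some CPU of this node missing from the cpuset? */
		if (node_to_cpumask(cpu_to_node(cpu)) & ~cpus_allowed)
			return 0;
	}
	return 1;
}

int main(void)
{
	printf("cpus 0-1 (all of node 0): %d\n",
	       cpuset_on_node_boundaries(0x3UL));	/* 1: on node boundary */
	printf("cpu 0 only (partial node): %d\n",
	       cpuset_on_node_boundaries(0x1UL));	/* 0: skip the call out */
	return 0;
}

[In the first case the cpuset would still get its own sched domain under
choice (3); in the second case the call out would simply be skipped until
the generic code can handle partial nodes.]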
I will prepare a real patch that does (3), and send it out later
today.
--
I won't rest till it's the best ...
Programmer, Linux Scalability
Paul Jackson <pj@sgi.com> 1.925.600.0401
* Re: [PATCH] ia64 cpuset + build_sched_domains() mangles structures
2005-08-22 16:07 ` Ingo Molnar
2005-08-22 20:16 ` Dinakar Guniguntala
@ 2005-09-02 14:47 ` Dinakar Guniguntala
1 sibling, 0 replies; 8+ messages in thread
From: Dinakar Guniguntala @ 2005-09-02 14:47 UTC (permalink / raw)
To: Ingo Molnar; +Cc: John Hawkes, linux-ia64, linux-kernel, pj, nickpiggin, akpm
Andrew,
Please include the patch below into -mm. I had reported a problem
with this patch earlier on 2.6.13-rc6, but I am just not able to
reproduce the problem on newer kernels (2.6.13 and 2.6.13-mm1).
I have tested this extensively on a Power5 box and I believe
that John Hawkes has tested this on ia64 as well.
The patch is here
http://marc.theaimsgroup.com/?l=linux-ia64&m=112474434128996&w=2
Regards,
Dinakar
On Mon, Aug 22, 2005 at 06:07:19PM +0200, Ingo Molnar wrote:
>
> * Dinakar Guniguntala <dino@in.ibm.com> wrote:
>
> > On Mon, Aug 22, 2005 at 09:08:34AM +0200, Ingo Molnar wrote:
> > >
> > > in terms of 2.6.14, the replacement patch below also does what i always
> > > wanted to do: to merge the ia64-specific build_sched_domains() code back
> > > into kernel/sched.c. I've done this by taking your improved dynamic
> > > build-domains code and putting it into kernel/sched.c.
> > >
> >
> > Ingo, one change required to your patch and the exclusive
> > cpuset functionality seems to work fine on a NUMA ppc64 box.
> > I am still running some of my dynamic sched domain tests. So far
> > it seems to be holding ok.
>
> great! Andrew, i'd suggest we try the merged patch attached below in
> -mm.
>
> > Any idea why the ia64 stuff was forked in the first place?
>
> most of the NUMA domain-trees stuff happened in the ia64 space so there
> was a natural desire to keep it more hackable there. But now i think
> it's getting counterproductive.
>
> Ingo