public inbox for linux-ia64@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] add scheduler domains for ia64
@ 2004-08-13 18:08 Jesse Barnes
  2004-08-14  3:23 ` Nick Piggin
  0 siblings, 1 reply; 10+ messages in thread
From: Jesse Barnes @ 2004-08-13 18:08 UTC (permalink / raw)
  To: linux-kernel, linux-ia64, Nick Piggin; +Cc: John Hawkes

[-- Attachment #1: Type: text/plain, Size: 417 bytes --]

Nick, how does this look?  It adds scheduler domain code for ia64 and replaces 
the patch in Andrew's tree.  It also adds SD_NODE_INIT macros to each arch 
that has ARCH_HAS_SCHED_DOMAIN so that the balance values are more easily 
tweaked.  Since the cpu span of the nodes on ia64 is smaller than the whole 
system, I also removed a WARN_ON in active_load_balance, but I'm not sure if 
that's correct.

Thanks,
Jesse

[-- Attachment #2: sched-domains-ia64.patch --]
[-- Type: text/plain, Size: 9474 bytes --]

===== arch/ia64/kernel/smpboot.c 1.56 vs edited =====
--- 1.56/arch/ia64/kernel/smpboot.c	2004-08-04 10:50:16 -07:00
+++ edited/arch/ia64/kernel/smpboot.c	2004-08-13 11:03:29 -07:00
@@ -719,3 +719,182 @@
 		printk(KERN_ERR "SMP: Can't set SAL AP Boot Rendezvous: %s\n",
 		       ia64_sal_strerror(sal_ret));
 }
+
+#ifdef CONFIG_NUMA
+
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.  Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int __init find_next_best_node(int node, unsigned long *used_nodes)
+{
+	int i, n, val, min_val, best_node = 0;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < numnodes; i++) {
+		/* Start at @node */
+		n = (node + i) % numnodes;
+
+		/* Skip already used nodes */
+		if (test_bit(n, used_nodes))
+			continue;
+
+		/* Simple min distance search */
+		val = node_distance(node, i);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	set_bit(best_node, used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+cpumask_t __init sched_domain_node_span(int node, int size)
+{
+	int i;
+	cpumask_t span;
+	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+	cpus_clear(span);
+	bitmap_zero(used_nodes, MAX_NUMNODES);
+
+	for (i = 0; i < size; i++) {
+		int next_node = find_next_best_node(node, used_nodes);
+		cpus_or(span, span, node_to_cpumask(next_node));
+	}
+
+	return span;
+}
+
+static struct sched_group sched_group_cpus[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+
+/* Number of nearby nodes in a node's scheduling domain */
+#define SD_NODES_PER_DOMAIN 4
+
+static struct sched_group sched_group_nodes[MAX_NUMNODES];
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+void __init arch_init_sched_domains(void)
+{
+	int i;
+	struct sched_group *first_node = NULL, *last_node = NULL;
+
+	/* Set up domains */
+	for_each_cpu(i) {
+		int node = cpu_to_node(i);
+		cpumask_t nodemask = node_to_cpumask(node);
+		struct sched_domain *node_sd = &per_cpu(node_domains, i);
+		struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
+
+		*node_sd = SD_NODE_INIT;
+		node_sd->span = sched_domain_node_span(i, SD_NODES_PER_DOMAIN);
+		node_sd->groups = &sched_group_nodes[cpu_to_node(i)];
+
+		*cpu_sd = SD_CPU_INIT;
+		cpus_and(cpu_sd->span, nodemask, cpu_possible_map);
+		cpu_sd->groups = &sched_group_cpus[i];
+		cpu_sd->parent = node_sd;
+	}
+
+	/* Set up groups */
+	for (i = 0; i < MAX_NUMNODES; i++) {
+		cpumask_t tmp = node_to_cpumask(i);
+		cpumask_t nodemask;
+		struct sched_group *first_cpu = NULL, *last_cpu = NULL;
+		struct sched_group *node = &sched_group_nodes[i];
+		int j;
+
+		cpus_and(nodemask, tmp, cpu_possible_map);
+
+		if (cpus_empty(nodemask))
+			continue;
+
+		node->cpumask = nodemask;
+		node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask);
+
+		for_each_cpu_mask(j, node->cpumask) {
+			struct sched_group *cpu = &sched_group_cpus[j];
+
+			cpus_clear(cpu->cpumask);
+			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
+
+			if (!first_cpu)
+				first_cpu = cpu;
+			if (last_cpu)
+				last_cpu->next = cpu;
+			last_cpu = cpu;
+		}
+		last_cpu->next = first_cpu;
+
+		if (!first_node)
+			first_node = node;
+		if (last_node)
+			last_node->next = node;
+		last_node = node;
+	}
+	last_node->next = first_node;
+
+	mb();
+	for_each_cpu(i) {
+		struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
+		cpu_attach_domain(cpu_sd, i);
+	}
+}
+#else /* !CONFIG_NUMA */
+static void __init arch_init_sched_domains(void)
+{
+	int i;
+	struct sched_group *first_cpu = NULL, *last_cpu = NULL;
+
+	/* Set up domains */
+	for_each_cpu(i) {
+		struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
+
+		*cpu_sd = SD_CPU_INIT;
+		cpu_sd->span = cpu_possible_map;
+		cpu_sd->groups = &sched_group_cpus[i];
+	}
+
+	/* Set up CPU groups */
+	for_each_cpu_mask(i, cpu_possible_map) {
+		struct sched_group *cpu = &sched_group_cpus[i];
+
+		cpus_clear(cpu->cpumask);
+		cpu_set(i, cpu->cpumask);
+		cpu->cpu_power = SCHED_LOAD_SCALE;
+
+		if (!first_cpu)
+			first_cpu = cpu;
+		if (last_cpu)
+			last_cpu->next = cpu;
+		last_cpu = cpu;
+	}
+	last_cpu->next = first_cpu;
+
+	mb(); /* domains were modified outside the lock */
+	for_each_cpu(i) {
+		struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
+		cpu_attach_domain(cpu_sd, i);
+	}
+}
+#endif /* CONFIG_NUMA */
===== include/asm-i386/processor.h 1.67 vs edited =====
--- 1.67/include/asm-i386/processor.h	2004-06-27 00:19:26 -07:00
+++ edited/include/asm-i386/processor.h	2004-08-13 10:37:06 -07:00
@@ -647,6 +647,24 @@
 
 #ifdef CONFIG_SCHED_SMT
 #define ARCH_HAS_SCHED_DOMAIN
+#define SD_NODE_INIT (struct sched_domain) {		\
+	.span			= CPU_MASK_NONE,	\
+	.parent			= NULL,			\
+	.groups			= NULL,			\
+	.min_interval		= 8,			\
+	.max_interval		= 32,			\
+	.busy_factor		= 32,			\
+	.imbalance_pct		= 125,			\
+	.cache_hot_time		= (10*1000000),		\
+	.cache_nice_tries	= 1,			\
+	.per_cpu_gain		= 100,			\
+	.flags			= SD_BALANCE_EXEC	\
+				| SD_BALANCE_CLONE	\
+				| SD_WAKE_BALANCE,	\
+	.last_balance		= jiffies,		\
+	.balance_interval	= 1,			\
+	.nr_balance_failed	= 0,			\
+}
 #define ARCH_HAS_SCHED_WAKE_IDLE
 #endif
 
===== include/asm-ia64/processor.h 1.61 vs edited =====
--- 1.61/include/asm-ia64/processor.h	2004-07-26 22:26:50 -07:00
+++ edited/include/asm-ia64/processor.h	2004-08-13 10:08:03 -07:00
@@ -334,6 +334,29 @@
 /* Prepare to copy thread state - unlazy all lazy status */
 #define prepare_to_copy(tsk)	do { } while (0)
 
+#ifdef CONFIG_NUMA
+/* smpboot.c defines a numa specific scheduler domain routine */
+#define ARCH_HAS_SCHED_DOMAIN
+#define SD_NODE_INIT (struct sched_domain) {		\
+	.span			= CPU_MASK_NONE,	\
+	.parent			= NULL,			\
+	.groups			= NULL,			\
+	.min_interval		= 80,			\
+	.max_interval		= 320,			\
+	.busy_factor		= 320,			\
+	.imbalance_pct		= 125,			\
+	.cache_hot_time		= (10*1000000),		\
+	.cache_nice_tries	= 1,			\
+	.per_cpu_gain		= 100,			\
+	.flags			= SD_BALANCE_EXEC	\
+				| SD_BALANCE_CLONE	\
+				| SD_WAKE_BALANCE,	\
+	.last_balance		= jiffies,		\
+	.balance_interval	= 10,			\
+	.nr_balance_failed	= 0,			\
+}
+#endif
+
 /*
  * This is the mechanism for creating a new kernel thread.
  *
===== include/asm-ppc64/processor.h 1.48 vs edited =====
--- 1.48/include/asm-ppc64/processor.h	2004-07-26 15:13:12 -07:00
+++ edited/include/asm-ppc64/processor.h	2004-08-13 10:37:19 -07:00
@@ -628,6 +628,24 @@
 
 #ifdef CONFIG_SCHED_SMT
 #define ARCH_HAS_SCHED_DOMAIN
+#define SD_NODE_INIT (struct sched_domain) {		\
+	.span			= CPU_MASK_NONE,	\
+	.parent			= NULL,			\
+	.groups			= NULL,			\
+	.min_interval		= 8,			\
+	.max_interval		= 32,			\
+	.busy_factor		= 32,			\
+	.imbalance_pct		= 125,			\
+	.cache_hot_time		= (10*1000000),		\
+	.cache_nice_tries	= 1,			\
+	.per_cpu_gain		= 100,			\
+	.flags			= SD_BALANCE_EXEC	\
+				| SD_BALANCE_CLONE	\
+				| SD_WAKE_BALANCE,	\
+	.last_balance		= jiffies,		\
+	.balance_interval	= 1,			\
+	.nr_balance_failed	= 0,			\
+}
 #define ARCH_HAS_SCHED_WAKE_IDLE
 #endif
 
===== include/asm-x86_64/processor.h 1.36 vs edited =====
--- 1.36/include/asm-x86_64/processor.h	2004-06-27 00:19:26 -07:00
+++ edited/include/asm-x86_64/processor.h	2004-08-13 10:37:36 -07:00
@@ -458,6 +458,24 @@
 
 #ifdef CONFIG_SCHED_SMT
 #define ARCH_HAS_SCHED_DOMAIN
+#define SD_NODE_INIT (struct sched_domain) {		\
+	.span			= CPU_MASK_NONE,	\
+	.parent			= NULL,			\
+	.groups			= NULL,			\
+	.min_interval		= 8,			\
+	.max_interval		= 32,			\
+	.busy_factor		= 32,			\
+	.imbalance_pct		= 125,			\
+	.cache_hot_time		= (10*1000000),		\
+	.cache_nice_tries	= 1,			\
+	.per_cpu_gain		= 100,			\
+	.flags			= SD_BALANCE_EXEC	\
+				| SD_BALANCE_CLONE	\
+				| SD_WAKE_BALANCE,	\
+	.last_balance		= jiffies,		\
+	.balance_interval	= 1,			\
+	.nr_balance_failed	= 0,			\
+}
 #define ARCH_HAS_SCHED_WAKE_IDLE
 #endif
 
===== include/linux/sched.h 1.228 vs edited =====
--- 1.228/include/linux/sched.h	2004-07-28 21:58:54 -07:00
+++ edited/include/linux/sched.h	2004-08-13 10:06:05 -07:00
@@ -17,6 +17,7 @@
 #include <asm/system.h>
 #include <asm/semaphore.h>
 #include <asm/page.h>
+#include <asm/processor.h>
 #include <asm/ptrace.h>
 #include <asm/mmu.h>
 
@@ -654,6 +655,7 @@
 }
 
 #ifdef CONFIG_NUMA
+#ifndef ARCH_HAS_SCHED_DOMAIN
 /* Common values for NUMA nodes */
 #define SD_NODE_INIT (struct sched_domain) {		\
 	.span			= CPU_MASK_NONE,	\
@@ -673,6 +675,7 @@
 	.balance_interval	= 1,			\
 	.nr_balance_failed	= 0,			\
 }
+#endif
 #endif
 
 extern void cpu_attach_domain(struct sched_domain *sd, int cpu);
===== kernel/sched.c 1.319 vs edited =====
--- 1.319/kernel/sched.c	2004-08-02 01:00:40 -07:00
+++ edited/kernel/sched.c	2004-08-13 10:59:53 -07:00
@@ -1826,10 +1826,8 @@
 	for_each_domain(busiest_cpu, sd)
 		if (cpu_isset(busiest->push_cpu, sd->span))
 			break;
-	if (!sd) {
-		WARN_ON(1);
+	if (!sd)
 		return;
-	}
 
  	group = sd->groups;
 	while (!cpu_isset(busiest_cpu, group->cpumask))

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] add scheduler domains for ia64
  2004-08-13 18:08 [PATCH] add scheduler domains for ia64 Jesse Barnes
@ 2004-08-14  3:23 ` Nick Piggin
  2004-08-14 20:52   ` Jesse Barnes
  0 siblings, 1 reply; 10+ messages in thread
From: Nick Piggin @ 2004-08-14  3:23 UTC (permalink / raw)
  To: Jesse Barnes; +Cc: linux-kernel, linux-ia64, John Hawkes, Ingo Molnar

Jesse Barnes wrote:
> Nick, how does this look?  It adds scheduler domain code for ia64 and replaces 
> the patch in Andrew's tree.  It also adds SD_NODE_INIT macros to each arch 
> that has ARCH_HAS_SCHED_DOMAIN so that the balance values are more easily 
> tweaked.  Since the cpu span of the nodes on ia64 is smaller than the whole 
> system, I also removed a WARN_ON in active_load_balance, but I'm not sure if 
> that's correct.

Hi Jesse,
Andrew's latest tree should have a number of improvements and changes
to the sched domains code which you will need to synch up to.

One issue you may have is that Ingo removed the ability to have arch
code override the domain structure due to it being too hazardous for
architectures to use in this form (which I don't entirely disagree with).

Now I guess your patch could go into the generic code because it is
pretty general - however are you guys going to want to do anything
more fancy with these things?

Nick

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] add scheduler domains for ia64
  2004-08-14  3:23 ` Nick Piggin
@ 2004-08-14 20:52   ` Jesse Barnes
  2004-08-15  0:54     ` Nick Piggin
  0 siblings, 1 reply; 10+ messages in thread
From: Jesse Barnes @ 2004-08-14 20:52 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-kernel, linux-ia64, John Hawkes, Ingo Molnar

On Friday, August 13, 2004 8:23 pm, Nick Piggin wrote:
> Andrew's latest tree should have a number of improvements and changes
> to the sched domains code which you will need to synch up to.

Yeah, I forgot about those.  I'll respin against your consolidation stuff.

> One issue you may have is that Ingo removed the ability to have arch
> code override the domain structure due to it being too hazardous for
> architectures to use in this form (which I don't entirely disagree with).
>
> Now I guess your patch could go into the generic code because it is
> pretty general - however are you guys going to want to do anything
> more fancy with these things?

Maybe, we haven't figured out the best way to schedule on a 512p yet, but most 
or all of this code is generic.  In order for things to work at all though, 
we'll need to change some of the SD_NODE_INIT values, maybe we can keep that 
as per-arch?

Thanks,
Jesse

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] add scheduler domains for ia64
  2004-08-14 20:52   ` Jesse Barnes
@ 2004-08-15  0:54     ` Nick Piggin
  2004-08-17 20:57       ` Jesse Barnes
  0 siblings, 1 reply; 10+ messages in thread
From: Nick Piggin @ 2004-08-15  0:54 UTC (permalink / raw)
  To: Jesse Barnes; +Cc: linux-kernel, linux-ia64, John Hawkes, Ingo Molnar

Jesse Barnes wrote:
> On Friday, August 13, 2004 8:23 pm, Nick Piggin wrote:
> 
>>Andrew's latest tree should have a number of improvements and changes
>>to the sched domains code which you will need to synch up to.
> 
> 
> Yeah, I forgot about those.  I'll respin against your consolidation stuff.
> 

Thanks.

> 
>>One issue you may have is that Ingo removed the ability to have arch
>>code override the domain structure due to it being too hazardous for
>>architectures to use in this form (which I don't entirely disagree with).
>>
>>Now I guess your patch could go into the generic code because it is
>>pretty general - however are you guys going to want to do anything
>>more fancy with these things?
> 
> 
> Maybe, we haven't figured out the best way to schedule on a 512p yet, but most 
> or all of this code is generic.  In order for things to work at all though, 
> we'll need to change some of the SD_NODE_INIT values, maybe we can keep that 
> as per-arch?
> 

Yeah, all the SD_*_INIT values are overridable. We could even say, put
in an SD_NODE2_INIT for a 2nd level NUMA domain in the generic code,
for example.

I'd say your closest-node setup would probably get close to what you want.
The main thing you want is to not do huge amounts of balancing work in
interrupt context, and also not to move a task from one side of the
system to the other when one node is a little bit out of balance.

I guess if you want to do anything fancier then we can take a look at
re-exporting the domain setup.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] add scheduler domains for ia64
  2004-08-15  0:54     ` Nick Piggin
@ 2004-08-17 20:57       ` Jesse Barnes
  2004-08-20  2:11         ` Nick Piggin
  0 siblings, 1 reply; 10+ messages in thread
From: Jesse Barnes @ 2004-08-17 20:57 UTC (permalink / raw)
  To: Nick Piggin; +Cc: linux-kernel, linux-ia64, John Hawkes, Ingo Molnar

[-- Attachment #1: Type: text/plain, Size: 903 bytes --]

On Saturday, August 14, 2004 8:54 pm, Nick Piggin wrote:
> Yeah, all the SD_*_INIT values are overridable. We could even say, put
> in an SD_NODE2_INIT for a 2nd level NUMA domain in the generic code,
> for example.

Yeah, we'll need different values for each level in the hierarchy.

> I'd say your closest-node setup would probably get close to what you want.
> The main thing you want is to not do huge amounts of balancing work in
> interrupt context, and also not to move a task from one side of the
> system to the other when one node is a little bit out of balance.
>
> I guess if you want to do anything fancier then we can take a look at
> re-exporting the domain setup.

Ok, sounds good.  How does this look?  It sits on top of 2.6.8.1-mm1, ripping 
out the ia64 specific bits and moving things to sched.c.  I've also added an 
ia64 specific SD_NODE_INIT and an #if !defined to sched.c

Jesse

[-- Attachment #2: node-span.patch --]
[-- Type: text/x-diff, Size: 6020 bytes --]

diff -Napur -X /home/jbarnes/dontdiff linux-2.6.8.1-mm1/arch/ia64/kernel/smpboot.c linux-2.6.8.1-mm1.nodespan/arch/ia64/kernel/smpboot.c
--- linux-2.6.8.1-mm1/arch/ia64/kernel/smpboot.c	2004-08-17 13:41:43.000000000 -0700
+++ linux-2.6.8.1-mm1.nodespan/arch/ia64/kernel/smpboot.c	2004-08-17 13:34:28.000000000 -0700
@@ -707,69 +707,3 @@ init_smp_config(void)
 		       ia64_sal_strerror(sal_ret));
 }
 
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain.  Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int __init find_next_best_node(int node, unsigned long *used_nodes)
-{
-	int i, n, val, min_val, best_node = 0;
-
-	min_val = INT_MAX;
-
-	for (i = 0; i < numnodes; i++) {
-		/* Start at @node */
-		n = (node + i) % numnodes;
-
-		/* Skip already used nodes */
-		if (test_bit(n, used_nodes))
-			continue;
-
-		/* Simple min distance search */
-		val = node_distance(node, i);
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	set_bit(best_node, used_nodes);
-	return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @size: number of nodes to include in this span
- *
- * Given a node, construct a good cpumask for its sched_domain to span.  It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-cpumask_t __init sched_domain_node_span(int node, int size)
-{
-	int i;
-	cpumask_t span;
-	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
-
-	cpus_clear(span);
-	bitmap_zero(used_nodes, MAX_NUMNODES);
-
-	for (i = 0; i < size; i++) {
-		int next_node = find_next_best_node(node, used_nodes);
-		cpus_or(span, span, node_to_cpumask(next_node));
-	}
-
-	return span;
-}
-#endif /* CONFIG_NUMA */
-
diff -Napur -X /home/jbarnes/dontdiff linux-2.6.8.1-mm1/include/asm-ia64/processor.h linux-2.6.8.1-mm1.nodespan/include/asm-ia64/processor.h
--- linux-2.6.8.1-mm1/include/asm-ia64/processor.h	2004-08-17 13:41:22.000000000 -0700
+++ linux-2.6.8.1-mm1.nodespan/include/asm-ia64/processor.h	2004-08-17 13:37:13.000000000 -0700
@@ -335,8 +335,23 @@ struct task_struct;
 #define prepare_to_copy(tsk)	do { } while (0)
 
 #ifdef CONFIG_NUMA
-/* smpboot.c defines a numa specific scheduler domain routine */
-#define ARCH_HAS_SCHED_DOMAIN
+#define SD_NODE_INIT (struct sched_domain) {		\
+	.span			= CPU_MASK_NONE,	\
+	.parent			= NULL,			\
+	.groups			= NULL,			\
+	.min_interval		= 80,			\
+	.max_interval		= 320,			\
+	.busy_factor		= 320,			\
+	.imbalance_pct		= 125,			\
+	.cache_hot_time		= (10*1000000),		\
+	.cache_nice_tries	= 1,			\
+	.per_cpu_gain		= 100,			\
+	.flags			= SD_BALANCE_EXEC	\
+				| SD_WAKE_BALANCE,	\
+	.last_balance		= jiffies,		\
+	.balance_interval	= 10,			\
+	.nr_balance_failed	= 0,			\
+}
 #endif
 
 /*
diff -Napur -X /home/jbarnes/dontdiff linux-2.6.8.1-mm1/kernel/sched.c linux-2.6.8.1-mm1.nodespan/kernel/sched.c
--- linux-2.6.8.1-mm1/kernel/sched.c	2004-08-17 13:41:37.000000000 -0700
+++ linux-2.6.8.1-mm1.nodespan/kernel/sched.c	2004-08-17 13:43:36.000000000 -0700
@@ -401,7 +401,8 @@ struct sched_domain {
 	.nr_balance_failed	= 0,			\
 }
 
-#ifdef CONFIG_NUMA
+/* Arch can override this macro in processor.h */
+#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT)
 /* Common values for NUMA nodes */
 #define SD_NODE_INIT (struct sched_domain) {		\
 	.span			= CPU_MASK_NONE,	\
@@ -2218,10 +2219,8 @@ static void active_load_balance(runqueue
 	for_each_domain(busiest_cpu, sd)
 		if (cpu_isset(busiest->push_cpu, sd->span))
 			break;
-	if (!sd) {
-		WARN_ON(1);
+	if (!sd)
 		return;
-	}
 
 	group = sd->groups;
 	while (!cpu_isset(busiest_cpu, group->cpumask))
@@ -4121,15 +4120,74 @@ static void cpu_attach_domain(struct sch
 }
 
 #ifdef CONFIG_NUMA
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern cpumask_t __init sched_domain_node_span(int node, int size);
-#else
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.  Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int __init find_next_best_node(int node, unsigned long *used_nodes)
+{
+	int i, n, val, min_val, best_node = 0;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < numnodes; i++) {
+		/* Start at @node */
+		n = (node + i) % numnodes;
+
+		/* Skip already used nodes */
+		if (test_bit(n, used_nodes))
+			continue;
+
+		/* Simple min distance search */
+		val = node_distance(node, i);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	set_bit(best_node, used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+cpumask_t __init sched_domain_node_span(int node, int size)
+{
+	int i;
+	cpumask_t span;
+	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+	cpus_clear(span);
+	bitmap_zero(used_nodes, MAX_NUMNODES);
+
+	for (i = 0; i < size; i++) {
+		int next_node = find_next_best_node(node, used_nodes);
+		cpus_or(span, span, node_to_cpumask(next_node));
+	}
+
+	return span;
+}
+#else /* !CONFIG_NUMA */
 static cpumask_t __init sched_domain_node_span(int node, int size)
 {
 	return cpu_possible_map;
 }
-#endif /* ARCH_HAS_SCHED_DOMAIN */
-#endif
+#endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] add scheduler domains for ia64
  2004-08-17 20:57       ` Jesse Barnes
@ 2004-08-20  2:11         ` Nick Piggin
  2004-08-20  2:22           ` Jesse Barnes
  0 siblings, 1 reply; 10+ messages in thread
From: Nick Piggin @ 2004-08-20  2:11 UTC (permalink / raw)
  To: Jesse Barnes; +Cc: linux-kernel, linux-ia64, John Hawkes, Ingo Molnar

Jesse Barnes wrote:
> On Saturday, August 14, 2004 8:54 pm, Nick Piggin wrote:
> 
>>Yeah, all the SD_*_INIT values are overridable. We could even say, put
>>in an SD_NODE2_INIT for a 2nd level NUMA domain in the generic code,
>>for example.
> 
> 
> Yeah, we'll need different values for each level in the hierarchy.
> 
> 
>>I'd say your closest-node setup would probably get close to what you want.
>>The main thing you want is to not do huge amounts of balancing work in
>>interrupt context, and also not to move a task from one side of the
>>system to the other when one node is a little bit out of balance.
>>
>>I guess if you want to do anything fancier then we can take a look at
>>re-exporting the domain setup.
> 
> 
> Ok, sounds good.  How does this look?  It sits on top of 2.6.8.1-mm1, ripping 
> out the ia64 specific bits and moving things to sched.c.  I've also added an 
> ia64 specific SD_NODE_INIT and an #if !defined to sched.c
> 

Sorry I haven't replied earlier.  I think this looks good, provided
it does the right thing for you (I can't test it myself). Send it to
Andrew to get merged if you'd like.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] add scheduler domains for ia64
  2004-08-20  2:11         ` Nick Piggin
@ 2004-08-20  2:22           ` Jesse Barnes
  2004-08-20  6:28             ` Andrew Morton
  2004-08-20  8:06             ` Ingo Molnar
  0 siblings, 2 replies; 10+ messages in thread
From: Jesse Barnes @ 2004-08-20  2:22 UTC (permalink / raw)
  To: Nick Piggin, akpm; +Cc: linux-kernel, linux-ia64, John Hawkes, Ingo Molnar

[-- Attachment #1: Type: text/plain, Size: 1520 bytes --]

On Thursday, August 19, 2004 10:11 pm, Nick Piggin wrote:
> Sorry I haven't replied earlier.  I think this looks good, provided
> it does the right thing for you (I can't test it myself). Send it to
> Andrew to get merged if you'd like.

Yep, it's been working ok so far.  There's still more we can do, but this is a 
good start I think.  Andrew, this version applies on top of 2.6.8.1-mm2 but 
overwrites most of the earlier node-span patch by moving bits from arch/ia64 
to kernel/sched.c, so let me know if you want the patch in a different 
format.

This patch adds some more NUMA specific logic to the creation of scheduler 
domains.  Domains spanning all CPUs in a large system are too large to 
schedule across efficiently, leading to livelocks and inordinate amounts of 
time being spent in scheduler routines.  With this patch applied, the node 
scheduling domains for NUMA platforms will only contain a specified number of 
nearby CPUs, based on the value of SD_NODES_PER_DOMAIN.  It also allows 
arches to override SD_NODE_INIT, which sets the domain scheduling parameters 
for each node's domain.  This is necessary especially for large systems.

Possible future directions:
  o multilevel node hierarchy (e.g. node domains could contain 4 nodes worth 
of CPUs, supernode domains could contain 32 nodes worth, etc. each with their 
own SD_NODE_INIT values)
  o more tweaking of SD_NODE_INIT values for good load balancing vs. overhead 
tradeoffs

Signed-off-by: Jesse Barnes <jbarnes@sgi.com>

Thanks,
Jesse

[-- Attachment #2: node-span.patch --]
[-- Type: text/x-diff, Size: 6020 bytes --]

diff -Napur -X /home/jbarnes/dontdiff linux-2.6.8.1-mm1/arch/ia64/kernel/smpboot.c linux-2.6.8.1-mm1.nodespan/arch/ia64/kernel/smpboot.c
--- linux-2.6.8.1-mm1/arch/ia64/kernel/smpboot.c	2004-08-17 13:41:43.000000000 -0700
+++ linux-2.6.8.1-mm1.nodespan/arch/ia64/kernel/smpboot.c	2004-08-17 13:34:28.000000000 -0700
@@ -707,69 +707,3 @@ init_smp_config(void)
 		       ia64_sal_strerror(sal_ret));
 }
 
-#ifdef CONFIG_NUMA
-
-/**
- * find_next_best_node - find the next node to include in a sched_domain
- * @node: node whose sched_domain we're building
- * @used_nodes: nodes already in the sched_domain
- *
- * Find the next node to include in a given scheduling domain.  Simply
- * finds the closest node not already in the @used_nodes map.
- *
- * Should use nodemask_t.
- */
-static int __init find_next_best_node(int node, unsigned long *used_nodes)
-{
-	int i, n, val, min_val, best_node = 0;
-
-	min_val = INT_MAX;
-
-	for (i = 0; i < numnodes; i++) {
-		/* Start at @node */
-		n = (node + i) % numnodes;
-
-		/* Skip already used nodes */
-		if (test_bit(n, used_nodes))
-			continue;
-
-		/* Simple min distance search */
-		val = node_distance(node, i);
-
-		if (val < min_val) {
-			min_val = val;
-			best_node = n;
-		}
-	}
-
-	set_bit(best_node, used_nodes);
-	return best_node;
-}
-
-/**
- * sched_domain_node_span - get a cpumask for a node's sched_domain
- * @node: node whose cpumask we're constructing
- * @size: number of nodes to include in this span
- *
- * Given a node, construct a good cpumask for its sched_domain to span.  It
- * should be one that prevents unnecessary balancing, but also spreads tasks
- * out optimally.
- */
-cpumask_t __init sched_domain_node_span(int node, int size)
-{
-	int i;
-	cpumask_t span;
-	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
-
-	cpus_clear(span);
-	bitmap_zero(used_nodes, MAX_NUMNODES);
-
-	for (i = 0; i < size; i++) {
-		int next_node = find_next_best_node(node, used_nodes);
-		cpus_or(span, span, node_to_cpumask(next_node));
-	}
-
-	return span;
-}
-#endif /* CONFIG_NUMA */
-
diff -Napur -X /home/jbarnes/dontdiff linux-2.6.8.1-mm1/include/asm-ia64/processor.h linux-2.6.8.1-mm1.nodespan/include/asm-ia64/processor.h
--- linux-2.6.8.1-mm1/include/asm-ia64/processor.h	2004-08-17 13:41:22.000000000 -0700
+++ linux-2.6.8.1-mm1.nodespan/include/asm-ia64/processor.h	2004-08-17 13:37:13.000000000 -0700
@@ -335,8 +335,23 @@ struct task_struct;
 #define prepare_to_copy(tsk)	do { } while (0)
 
 #ifdef CONFIG_NUMA
-/* smpboot.c defines a numa specific scheduler domain routine */
-#define ARCH_HAS_SCHED_DOMAIN
+#define SD_NODE_INIT (struct sched_domain) {		\
+	.span			= CPU_MASK_NONE,	\
+	.parent			= NULL,			\
+	.groups			= NULL,			\
+	.min_interval		= 80,			\
+	.max_interval		= 320,			\
+	.busy_factor		= 320,			\
+	.imbalance_pct		= 125,			\
+	.cache_hot_time		= (10*1000000),		\
+	.cache_nice_tries	= 1,			\
+	.per_cpu_gain		= 100,			\
+	.flags			= SD_BALANCE_EXEC	\
+				| SD_WAKE_BALANCE,	\
+	.last_balance		= jiffies,		\
+	.balance_interval	= 10,			\
+	.nr_balance_failed	= 0,			\
+}
 #endif
 
 /*
diff -Napur -X /home/jbarnes/dontdiff linux-2.6.8.1-mm1/kernel/sched.c linux-2.6.8.1-mm1.nodespan/kernel/sched.c
--- linux-2.6.8.1-mm1/kernel/sched.c	2004-08-17 13:41:37.000000000 -0700
+++ linux-2.6.8.1-mm1.nodespan/kernel/sched.c	2004-08-17 13:43:36.000000000 -0700
@@ -401,7 +401,8 @@ struct sched_domain {
 	.nr_balance_failed	= 0,			\
 }
 
-#ifdef CONFIG_NUMA
+/* Arch can override this macro in processor.h */
+#if defined(CONFIG_NUMA) && !defined(SD_NODE_INIT)
 /* Common values for NUMA nodes */
 #define SD_NODE_INIT (struct sched_domain) {		\
 	.span			= CPU_MASK_NONE,	\
@@ -2218,10 +2219,8 @@ static void active_load_balance(runqueue
 	for_each_domain(busiest_cpu, sd)
 		if (cpu_isset(busiest->push_cpu, sd->span))
 			break;
-	if (!sd) {
-		WARN_ON(1);
+	if (!sd)
 		return;
-	}
 
 	group = sd->groups;
 	while (!cpu_isset(busiest_cpu, group->cpumask))
@@ -4121,15 +4120,74 @@ static void cpu_attach_domain(struct sch
 }
 
 #ifdef CONFIG_NUMA
-#ifdef ARCH_HAS_SCHED_DOMAIN
-extern cpumask_t __init sched_domain_node_span(int node, int size);
-#else
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: nodes already in the sched_domain
+ *
+ * Find the next node to include in a given scheduling domain.  Simply
+ * finds the closest node not already in the @used_nodes map.
+ *
+ * Should use nodemask_t.
+ */
+static int __init find_next_best_node(int node, unsigned long *used_nodes)
+{
+	int i, n, val, min_val, best_node = 0;
+
+	min_val = INT_MAX;
+
+	for (i = 0; i < numnodes; i++) {
+		/* Start at @node */
+		n = (node + i) % numnodes;
+
+		/* Skip already used nodes */
+		if (test_bit(n, used_nodes))
+			continue;
+
+		/* Simple min distance search */
+		val = node_distance(node, i);
+
+		if (val < min_val) {
+			min_val = val;
+			best_node = n;
+		}
+	}
+
+	set_bit(best_node, used_nodes);
+	return best_node;
+}
+
+/**
+ * sched_domain_node_span - get a cpumask for a node's sched_domain
+ * @node: node whose cpumask we're constructing
+ * @size: number of nodes to include in this span
+ *
+ * Given a node, construct a good cpumask for its sched_domain to span.  It
+ * should be one that prevents unnecessary balancing, but also spreads tasks
+ * out optimally.
+ */
+cpumask_t __init sched_domain_node_span(int node, int size)
+{
+	int i;
+	cpumask_t span;
+	DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
+
+	cpus_clear(span);
+	bitmap_zero(used_nodes, MAX_NUMNODES);
+
+	for (i = 0; i < size; i++) {
+		int next_node = find_next_best_node(node, used_nodes);
+		cpus_or(span, span, node_to_cpumask(next_node));
+	}
+
+	return span;
+}
+#else /* !CONFIG_NUMA */
 static cpumask_t __init sched_domain_node_span(int node, int size)
 {
 	return cpu_possible_map;
 }
-#endif /* ARCH_HAS_SCHED_DOMAIN */
-#endif
+#endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SCHED_SMT
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] add scheduler domains for ia64
  2004-08-20  2:22           ` Jesse Barnes
@ 2004-08-20  6:28             ` Andrew Morton
  2004-08-20 14:57               ` Jesse Barnes
  2004-08-20  8:06             ` Ingo Molnar
  1 sibling, 1 reply; 10+ messages in thread
From: Andrew Morton @ 2004-08-20  6:28 UTC (permalink / raw)
  To: Jesse Barnes; +Cc: nickpiggin, linux-kernel, linux-ia64, hawkes, mingo

Jesse Barnes <jbarnes@engr.sgi.com> wrote:
>
> Yep, it's been working ok so far.  There's still more we can do, but this is a 
>  good start I think.  Andrew, this version applies on top of 2.6.8.1-mm2 but 
>  overwrites most of the earlier node-span patch by moving bits from arch/ia64 
>  to kernel/sched.c, so let me know if you want the patch in a different 
>  format.

Is OK.  I wiggled it into the logical place so we'll end up with a sane
patch series.

Watch the warnings please...



kernel/sched.c:3732: warning: `sched_domain_node_span' defined but not used

Signed-off-by: Andrew Morton <akpm@osdl.org>
---

 25-akpm/kernel/sched.c |    5 -----
 1 files changed, 5 deletions(-)

diff -puN kernel/sched.c~sched-domain-node-span-4-update-warning-fix kernel/sched.c
--- 25/kernel/sched.c~sched-domain-node-span-4-update-warning-fix	2004-08-19 23:28:24.395974232 -0700
+++ 25-akpm/kernel/sched.c	2004-08-19 23:28:24.400973472 -0700
@@ -3727,11 +3727,6 @@ cpumask_t __init sched_domain_node_span(
 
 	return span;
 }
-#else /* !CONFIG_NUMA */
-static cpumask_t __init sched_domain_node_span(int node, int size)
-{
-	return cpu_possible_map;
-}
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SCHED_SMT
_


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] add scheduler domains for ia64
  2004-08-20  2:22           ` Jesse Barnes
  2004-08-20  6:28             ` Andrew Morton
@ 2004-08-20  8:06             ` Ingo Molnar
  1 sibling, 0 replies; 10+ messages in thread
From: Ingo Molnar @ 2004-08-20  8:06 UTC (permalink / raw)
  To: Jesse Barnes; +Cc: Nick Piggin, akpm, linux-kernel, linux-ia64, John Hawkes


* Jesse Barnes <jbarnes@engr.sgi.com> wrote:

> This patch adds some more NUMA specific logic to the creation of
> scheduler domains.  Domains spanning all CPUs in a large system are
> too large to schedule across efficiently, leading to livelocks and
> inordinate amounts of time being spent in scheduler routines.  With
> this patch applied, the node scheduling domains for NUMA platforms
> will only contain a specified number of nearby CPUs, based on the
> value of SD_NODES_PER_DOMAIN.  It also allows arches to override
> SD_NODE_INIT, which sets the domain scheduling parameters for each
> node's domain.  This is necessary especially for large systems.

looks good to me too.

Signed-off-by: Ingo Molnar <mingo@elte.hu>

	Ingo

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] add scheduler domains for ia64
  2004-08-20  6:28             ` Andrew Morton
@ 2004-08-20 14:57               ` Jesse Barnes
  0 siblings, 0 replies; 10+ messages in thread
From: Jesse Barnes @ 2004-08-20 14:57 UTC (permalink / raw)
  To: Andrew Morton; +Cc: nickpiggin, linux-kernel, linux-ia64, hawkes, mingo

On Friday, August 20, 2004 2:28 am, Andrew Morton wrote:
> Jesse Barnes <jbarnes@engr.sgi.com> wrote:
> > Yep, it's been working ok so far.  There's still more we can do, but this
> > is a good start I think.  Andrew, this version applies on top of
> > 2.6.8.1-mm2 but overwrites most of the earlier node-span patch by moving
> > bits from arch/ia64 to kernel/sched.c, so let me know if you want the
> > patch in a different format.
>
> Is OK.  I wiggled it into the logical place so we'll end up with a sane
> patch series.
>
> Watch the warnings please...
>
>
>
> kernel/sched.c:3732: warning: `sched_domain_node_span' defined but not used

Oops, sorry about that!  I meant to test with CONFIG_NUMA=n but fell asleep.  
I'll be more careful in the future.

Jesse

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2004-08-20 14:57 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2004-08-13 18:08 [PATCH] add scheduler domains for ia64 Jesse Barnes
2004-08-14  3:23 ` Nick Piggin
2004-08-14 20:52   ` Jesse Barnes
2004-08-15  0:54     ` Nick Piggin
2004-08-17 20:57       ` Jesse Barnes
2004-08-20  2:11         ` Nick Piggin
2004-08-20  2:22           ` Jesse Barnes
2004-08-20  6:28             ` Andrew Morton
2004-08-20 14:57               ` Jesse Barnes
2004-08-20  8:06             ` Ingo Molnar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox