* 2.6.5-rc3-mm4 x86_64 sched domains patch
@ 2004-04-08 23:22 Darren Hart
2004-04-08 23:42 ` Nick Piggin
2004-04-14 13:44 ` Andi Kleen
0 siblings, 2 replies; 11+ messages in thread
From: Darren Hart @ 2004-04-08 23:22 UTC (permalink / raw)
To: lkml; +Cc: piggin, ak, Martin J Bligh, Rick Lindsley, akpm
The current default implementation of arch_init_sched_domains
constructs either a flat or a two level topology. The two level
topology is built if CONFIG_NUMA is set. It seems that CONFIG_NUMA is
not the appropriate flag to use for constructing a two level topology,
since some architectures which define CONFIG_NUMA would be better served
by a flat topology. x86_64, for example, will construct a two level
topology with one CPU per node, causing performance problems because
balancing within nodes is pointless and balancing across nodes doesn't
occur as often.
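To make the degenerate case concrete: with one CPU per node, the span of each node-local balancing domain collapses to a single CPU, so the intra-node balance pass can never move anything. A minimal sketch of that arithmetic (the helper names are illustrative stand-ins, not kernel code):

```c
#include <assert.h>

#define NR_CPUS 4   /* e.g. a 4-socket Opteron box */

/* On this class of x86_64 machine each CPU is its own NUMA node,
 * so the CPU-to-node mapping is the identity (hypothetical stand-in
 * for the kernel's cpu_to_node()). */
static int cpu_to_node(int cpu)
{
	return cpu;
}

/* Number of CPUs sharing @cpu's node, i.e. the span of the
 * node-local sched domain built for that CPU. */
static int node_domain_span(int cpu)
{
	int span = 0, other;

	for (other = 0; other < NR_CPUS; other++)
		if (cpu_to_node(other) == cpu_to_node(cpu))
			span++;
	return span;
}
```

With one CPU per node every span is 1: the node-local rebalance still runs but has no sibling CPU to pull from, while the cross-node level, which is the only one that could help, runs at the slower inter-node interval.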
This patch introduces a new CONFIG_SCHED_NUMA flag and uses it to decide
between a flat or two level topology of sched_domains. The patch is
minimally invasive as it primarily modifies Kconfig files and sets the
appropriate default (off for x86_64, on for everything that used to
export CONFIG_NUMA) and should only change the sched_domains topology
constructed on x86_64 systems. I have verified this on a 4 node x86
NUMAQ, but need someone to test x86_64.
This patch is intended as a quick fix for the x86_64 problem, and
doesn't solve the problem of how to build generic sched domain
topologies. We can certainly conceive of various topologies for x86
systems, so even arch specific topologies may not be sufficient. Would
sub-arch (ie NUMAQ) be the right way to handle different topologies, or
will we be able to autodiscover the appropriate topology? I will be
looking into this more, but thought some might benefit from an immediate
x86_64 fix. I am very interested in hearing your ideas on this.
Regards,
Darren Hart
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-rc3-mm4/arch/alpha/Kconfig linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/arch/alpha/Kconfig
--- linux-2.6.5-rc3-mm4/arch/alpha/Kconfig 2004-04-02 06:42:46.000000000 -0800
+++ linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/arch/alpha/Kconfig 2004-04-02 16:16:58.000000000 -0800
@@ -519,6 +519,14 @@ config NUMA
Access). This option is for configuring high-end multiprocessor
server machines. If in doubt, say N.
+config SCHED_NUMA
+ bool "Two level sched domains"
+ depends on NUMA
+ default y
+ help
+ Enable two level sched domains hierarchy.
+ Say Y if unsure.
+
# LARGE_VMALLOC is racy, if you *really* need it then fix it first
config ALPHA_LARGE_VMALLOC
bool
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-rc3-mm4/arch/i386/Kconfig linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/arch/i386/Kconfig
--- linux-2.6.5-rc3-mm4/arch/i386/Kconfig 2004-04-02 06:42:52.000000000 -0800
+++ linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/arch/i386/Kconfig 2004-04-07 11:57:41.000000000 -0700
@@ -772,6 +772,14 @@ config NUMA
default n if X86_PC
default y if (X86_NUMAQ || X86_SUMMIT)
+config SCHED_NUMA
+ bool "Two level sched domains"
+ depends on NUMA
+ default y
+ help
+ Enable two level sched domains hierarchy.
+ Say Y if unsure.
+
# Need comments to help the hapless user trying to turn on NUMA support
comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support"
depends on X86_NUMAQ && (!HIGHMEM64G || !SMP)
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-rc3-mm4/arch/ia64/Kconfig linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/arch/ia64/Kconfig
--- linux-2.6.5-rc3-mm4/arch/ia64/Kconfig 2004-04-02 06:42:52.000000000 -0800
+++ linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/arch/ia64/Kconfig 2004-04-02 16:16:57.000000000 -0800
@@ -172,6 +172,14 @@ config NUMA
Access). This option is for configuring high-end multiprocessor
server systems. If in doubt, say N.
+config SCHED_NUMA
+ bool "Two level sched domains"
+ depends on NUMA
+ default y
+ help
+ Enable two level sched domains hierarchy.
+ Say Y if unsure.
+
config VIRTUAL_MEM_MAP
bool "Virtual mem map"
default y if !IA64_HP_SIM
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-rc3-mm4/arch/mips/Kconfig linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/arch/mips/Kconfig
--- linux-2.6.5-rc3-mm4/arch/mips/Kconfig 2004-04-02 06:42:46.000000000 -0800
+++ linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/arch/mips/Kconfig 2004-04-02 16:16:58.000000000 -0800
@@ -337,6 +337,14 @@ config NUMA
Access). This option is for configuring high-end multiprocessor
server machines. If in doubt, say N.
+config SCHED_NUMA
+ bool "Two level sched domains"
+ depends on NUMA
+ default y
+ help
+ Enable two level sched domains hierarchy.
+ Say Y if unsure.
+
config MAPPED_KERNEL
bool "Mapped kernel support"
depends on SGI_IP27
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-rc3-mm4/arch/ppc64/Kconfig linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/arch/ppc64/Kconfig
--- linux-2.6.5-rc3-mm4/arch/ppc64/Kconfig 2004-04-02 06:42:52.000000000 -0800
+++ linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/arch/ppc64/Kconfig 2004-04-02 16:16:59.000000000 -0800
@@ -173,6 +173,14 @@ config NUMA
bool "NUMA support"
depends on DISCONTIGMEM
+config SCHED_NUMA
+ bool "Two level sched domains"
+ depends on NUMA
+ default y
+ help
+ Enable two level sched domains hierarchy.
+ Say Y if unsure.
+
config SCHED_SMT
bool "SMT (Hyperthreading) scheduler support"
depends on SMP
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-rc3-mm4/arch/x86_64/Kconfig linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/arch/x86_64/Kconfig
--- linux-2.6.5-rc3-mm4/arch/x86_64/Kconfig 2004-04-02 06:42:52.000000000 -0800
+++ linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/arch/x86_64/Kconfig 2004-04-02 16:17:00.000000000 -0800
@@ -261,6 +261,14 @@ config NUMA
depends on K8_NUMA
default y
+config SCHED_NUMA
+ bool "Two level sched domains"
+ depends on NUMA
+ default n
+ help
+ Enable two level sched domains hierarchy.
+ Say N if unsure.
+
config HAVE_DEC_LOCK
bool
depends on SMP
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-rc3-mm4/include/linux/sched.h linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/include/linux/sched.h
--- linux-2.6.5-rc3-mm4/include/linux/sched.h 2004-04-02 06:42:53.000000000 -0800
+++ linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/include/linux/sched.h 2004-04-02 16:17:01.000000000 -0800
@@ -623,7 +623,7 @@ struct sched_domain {
.nr_balance_failed = 0, \
}
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SCHED_NUMA
/* Common values for NUMA nodes */
#define SD_NODE_INIT (struct sched_domain) { \
.span = CPU_MASK_NONE, \
@@ -656,7 +656,7 @@ static inline int set_cpus_allowed(task_
extern unsigned long long sched_clock(void);
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SCHED_NUMA
extern void sched_balance_exec(void);
#else
#define sched_balance_exec() {}
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-rc3-mm4/kernel/sched.c linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/kernel/sched.c
--- linux-2.6.5-rc3-mm4/kernel/sched.c 2004-04-02 06:42:53.000000000 -0800
+++ linux-2.6.5-rc3-mm4-x86_64_arch_sched_domain/kernel/sched.c 2004-04-07 11:50:11.000000000 -0700
@@ -42,7 +42,7 @@
#include <linux/percpu.h>
#include <linux/kthread.h>
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SCHED_NUMA
#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
#else
#define cpu_to_node_mask(cpu) (cpu_online_map)
@@ -1142,7 +1142,7 @@ enum idle_type
};
#ifdef CONFIG_SMP
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SCHED_NUMA
/*
* If dest_cpu is allowed for this process, migrate the task to it.
* This is accomplished by forcing the cpu_allowed mask to only
@@ -1241,7 +1241,7 @@ void sched_balance_exec(void)
out:
put_cpu();
}
-#endif /* CONFIG_NUMA */
+#endif /* CONFIG_SCHED_NUMA */
/*
* double_lock_balance - lock the busiest runqueue, this_rq is locked already.
@@ -3461,7 +3461,7 @@ extern void __init arch_init_sched_domai
#else
static struct sched_group sched_group_cpus[NR_CPUS];
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SCHED_NUMA
static struct sched_group sched_group_nodes[MAX_NUMNODES];
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static void __init arch_init_sched_domains(void)
@@ -3532,7 +3532,7 @@ static void __init arch_init_sched_domai
}
}
-#else /* !CONFIG_NUMA */
+#else /* !CONFIG_SCHED_NUMA */
static void __init arch_init_sched_domains(void)
{
int i;
@@ -3570,7 +3570,7 @@ static void __init arch_init_sched_domai
}
}
-#endif /* CONFIG_NUMA */
+#endif /* CONFIG_SCHED_NUMA */
#endif /* ARCH_HAS_SCHED_DOMAIN */
#define SCHED_DOMAIN_DEBUG
^ permalink raw reply	[flat|nested] 11+ messages in thread
* Re: 2.6.5-rc3-mm4 x86_64 sched domains patch
2004-04-08 23:22 2.6.5-rc3-mm4 x86_64 sched domains patch Darren Hart
@ 2004-04-08 23:42 ` Nick Piggin
2004-04-11 8:57 ` shai
2004-04-14 13:44 ` Andi Kleen
1 sibling, 1 reply; 11+ messages in thread
From: Nick Piggin @ 2004-04-08 23:42 UTC (permalink / raw)
To: Darren Hart; +Cc: lkml, ak, Martin J Bligh, Rick Lindsley, akpm, Ingo Molnar
Darren Hart wrote:
>The current default implementation of arch_init_sched_domains
>constructs either a flat or a two level topology. The two level
>topology is built if CONFIG_NUMA is set. It seems that CONFIG_NUMA is
>not the appropriate flag to use for constructing a two level topology
>since some architectures which define CONFIG_NUMA would be better served
>with a flat topology. x86_64 for example will construct a two level
>topology with one CPU per node, causing performance problems because
>balancing within nodes is pointless and balancing across nodes doesn't
>occur as often.
>
>
This is correct, although I don't know why there would be
performance problems. The rebalance in the degenerate node-local
domain should be basically unmeasurable. It would be nice to
get rid of it at some time. I have code to prune off degenerate
domains, which I will submit soonish.
The NUMA rebalance should occur more often than the old numasched
did, but perhaps with some recent Altix-centric changes to the
generic setup, this is no longer the case.
The STREAM performance problem is due mainly to the more
conservative nature of balancing, which is otherwise a good thing.
I think we can fix this in the short term by having x86_64 balance
between nodes more often. In the long term, we can merge Ingo's
balance on clone stuff, and the interested people can play with
that.
>This patch introduces a new CONFIG_SCHED_NUMA flag and uses it to decide
>between a flat or two level topology of sched_domains. The patch is
>minimally invasive as it primarily modifies Kconfig files and sets the
>appropriate default (off for x86_64, on for everything that used to
>export CONFIG_NUMA) and should only change the sched_domains topology
>constructed on x86_64 systems. I have verified this on a 4 node x86
>NUMAQ, but need someone to test x86_64.
>
>
I guess I can't see a big problem with this, other than more
complexity. In the long run, we should obviously have the arch
code set up optimal domains depending on the machine and config.
>This patch is intended as a quick fix for the x86_64 problem, and
>doesn't solve the problem of how to build generic sched domain
>topologies. We can certainly conceive of various topologies for x86
>systems, so even arch specific topologies may not be sufficient. Would
>sub-arch (ie NUMAQ) be the right way to handle different topologies, or
>will we be able to autodiscover the appropriate topology? I will be
>looking into this more, but thought some might benefit from an immediate
>x86_64 fix. I am very interested in hearing your ideas on this.
>
>
SGI want to do sub arch domains so they can do specific things
with their systems. I don't really care what the arch code does
with them, but it would be wise to only specialise it when there
is a genuine need. I'm glad you'll be looking into it, thanks.
Nick
* RE: 2.6.5-rc3-mm4 x86_64 sched domains patch
2004-04-08 23:42 ` Nick Piggin
@ 2004-04-11 8:57 ` shai
2004-04-11 9:57 ` Rick Lindsley
2004-04-11 15:07 ` Martin J. Bligh
0 siblings, 2 replies; 11+ messages in thread
From: shai @ 2004-04-11 8:57 UTC (permalink / raw)
To: 'Nick Piggin', 'Darren Hart'
Cc: 'lkml', ak, 'Martin J Bligh',
'Rick Lindsley', akpm, 'Ingo Molnar'
Can SLIT/SRAT be used here to define topology for the generic case?
SRAT is being used by i386 to build zonelists, but not for the scheduler -
any good reason why?
--Shai
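For reference, SLIT (the ACPI System Locality Information Table) gives node-to-node distances, with the local distance defined as 10, while SRAT maps CPUs and memory to proximity domains. A hedged sketch of how such data *could* drive the flat-vs-two-level decision; the threshold and helper below are hypothetical, not an existing kernel API:

```c
#include <assert.h>

#define NR_NODES       4
#define LOCAL_DISTANCE 10   /* ACPI SLIT defines local distance as 10 */

/* Hypothetical policy helper: a two level sched domain hierarchy only
 * pays off when (a) some node pair is genuinely remote and (b) at
 * least one node contains more than one CPU; otherwise the node-local
 * level is degenerate and a flat topology is the better choice. */
static int want_two_level(const int cpus_per_node[NR_NODES],
			  const unsigned char slit[NR_NODES][NR_NODES])
{
	int i, j, remote = 0, multi_cpu_node = 0;

	for (i = 0; i < NR_NODES; i++) {
		if (cpus_per_node[i] > 1)
			multi_cpu_node = 1;
		for (j = 0; j < NR_NODES; j++)
			if (slit[i][j] > LOCAL_DISTANCE)
				remote = 1;
	}
	return remote && multi_cpu_node;
}
```

Under this sketch a one-CPU-per-node Opteron would come out flat, and a multi-CPU-per-node NUMA-Q would come out two level, matching the defaults the patch picks by hand.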
-----Original Message-----
From: linux-kernel-owner@vger.kernel.org
[mailto:linux-kernel-owner@vger.kernel.org] On Behalf Of Nick Piggin
Sent: Thursday, April 08, 2004 16:42
To: Darren Hart
Cc: lkml; ak@suse.de; Martin J Bligh; Rick Lindsley; akpm@osdl.org; Ingo
Molnar
Subject: Re: 2.6.5-rc3-mm4 x86_64 sched domains patch
Darren Hart wrote:
>The current default implementation of arch_init_sched_domains
>constructs either a flat or a two level topology. The two level
>topology is built if CONFIG_NUMA is set. It seems that CONFIG_NUMA is
>not the appropriate flag to use for constructing a two level topology
>since some architectures which define CONFIG_NUMA would be better served
>with a flat topology. x86_64 for example will construct a two level
>topology with one CPU per node, causing performance problems because
>balancing within nodes is pointless and balancing across nodes doesn't
>occur as often.
>
>
This is correct, although I don't know why there would be
performance problems. The rebalance in the degenerate node-local
domain should be basically unmeasurable. It would be nice to
get rid of it at some time. I have code to prune off degenerate
domains, which I will submit soonish.
The NUMA rebalance should occur more often than the old numasched
did, but perhaps with some recent Altix-centric changes to the
generic setup, this is no longer the case.
The STREAM performance problem is due mainly to the more
conservative nature of balancing, which is otherwise a good thing.
I think we can fix this in the short term by having x86_64 balance
between nodes more often. In the long term, we can merge Ingo's
balance on clone stuff, and the interested people can play with
that.
>This patch introduces a new CONFIG_SCHED_NUMA flag and uses it to decide
>between a flat or two level topology of sched_domains. The patch is
>minimally invasive as it primarily modifies Kconfig files and sets the
>appropriate default (off for x86_64, on for everything that used to
>export CONFIG_NUMA) and should only change the sched_domains topology
>constructed on x86_64 systems. I have verified this on a 4 node x86
>NUMAQ, but need someone to test x86_64.
>
>
I guess I can't see a big problem with this, other than more
complexity. In the long run, we should obviously have the arch
code set up optimal domains depending on the machine and config.
>This patch is intended as a quick fix for the x86_64 problem, and
>doesn't solve the problem of how to build generic sched domain
>topologies. We can certainly conceive of various topologies for x86
>systems, so even arch specific topologies may not be sufficient. Would
>sub-arch (ie NUMAQ) be the right way to handle different topologies, or
>will we be able to autodiscover the appropriate topology? I will be
>looking into this more, but thought some might benefit from an immediate
>x86_64 fix. I am very interested in hearing your ideas on this.
>
>
SGI want to do sub arch domains so they can do specific things
with their systems. I don't really care what the arch code does
with them, but it would be wise to only specialise it when there
is a genuine need. I'm glad you'll be looking into it, thanks.
Nick
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
* Re: 2.6.5-rc3-mm4 x86_64 sched domains patch
2004-04-11 8:57 ` shai
@ 2004-04-11 9:57 ` Rick Lindsley
2004-04-11 15:07 ` Martin J. Bligh
1 sibling, 0 replies; 11+ messages in thread
From: Rick Lindsley @ 2004-04-11 9:57 UTC (permalink / raw)
To: shai
Cc: 'Nick Piggin', 'Darren Hart', 'lkml', ak,
'Martin J Bligh', akpm, 'Ingo Molnar'
Can SLIT/SRAT be used here to define topology for the generic case?
SRAT is being used by i386 to build zonelists, but not for the scheduler -
any good reason why?
I can think of some possible reasons, but I'm not familiar with SLIT/SRAT
... can you describe it for me?
Rick
* RE: 2.6.5-rc3-mm4 x86_64 sched domains patch
2004-04-11 8:57 ` shai
2004-04-11 9:57 ` Rick Lindsley
@ 2004-04-11 15:07 ` Martin J. Bligh
1 sibling, 0 replies; 11+ messages in thread
From: Martin J. Bligh @ 2004-04-11 15:07 UTC (permalink / raw)
To: shai, 'Nick Piggin', 'Darren Hart'
Cc: 'lkml', ak, 'Rick Lindsley', akpm,
'Ingo Molnar'
> Can SLIT/SRAT be used here to define topology for the generic case?
>
> SRAT is being used by i386 to build zonelists, but not for the scheduler -
> any good reason why?
Because it's not generic to all machines.
M.
* Re: 2.6.5-rc3-mm4 x86_64 sched domains patch
2004-04-08 23:22 2.6.5-rc3-mm4 x86_64 sched domains patch Darren Hart
2004-04-08 23:42 ` Nick Piggin
@ 2004-04-14 13:44 ` Andi Kleen
2004-04-14 14:14 ` Nick Piggin
2004-04-14 17:24 ` Darren Hart
1 sibling, 2 replies; 11+ messages in thread
From: Andi Kleen @ 2004-04-14 13:44 UTC (permalink / raw)
To: Darren Hart; +Cc: linux-kernel, piggin, mjbligh, ricklind, akpm
On Thu, 08 Apr 2004 16:22:09 -0700
Darren Hart <dvhltc@us.ibm.com> wrote:
>
> This patch is intended as a quick fix for the x86_64 problem, and
Ingo's latest tweaks seemed to already cure STREAM, but agreed, some
more tuning is probably a good idea.
> doesn't solve the problem of how to build generic sched domain
> topologies. We can certainly conceive of various topologies for x86
> systems, so even arch specific topologies may not be sufficient. Would
> sub-arch (ie NUMAQ) be the right way to handle different topologies, or
> will we be able to autodiscover the appropriate topology? I will be
> looking into this more, but thought some might benefit from an immediate
> x86_64 fix. I am very interested in hearing your ideas on this.
The patch doesn't apply against 2.6.5-mm5 anymore. Can you generate a new patch?
I will test it then.
Also it will need merging with the patch that adds SMT support for IA32e machines
on x86-64.
-Andi
* Re: 2.6.5-rc3-mm4 x86_64 sched domains patch
2004-04-14 13:44 ` Andi Kleen
@ 2004-04-14 14:14 ` Nick Piggin
2004-04-14 14:41 ` Andi Kleen
2004-04-14 17:24 ` Darren Hart
1 sibling, 1 reply; 11+ messages in thread
From: Nick Piggin @ 2004-04-14 14:14 UTC (permalink / raw)
To: Andi Kleen
Cc: Darren Hart, linux-kernel, Ingo Molnar, mjbligh, ricklind, akpm
Andi Kleen wrote:
> On Thu, 08 Apr 2004 16:22:09 -0700
> Darren Hart <dvhltc@us.ibm.com> wrote:
>
>
>
>>This patch is intended as a quick fix for the x86_64 problem, and
>
>
> Ingo's latest tweaks seemed to already cure STREAM, but some more
> tuning is probably a good idea agreed.
>
Where is STREAM versus other kernels? You said you got
best performance on a custom 2.4 kernel. Do we match
that?
How is your performance for other things? I recall you
may have told me about some other (smaller) issues you
were seeing?
* Re: 2.6.5-rc3-mm4 x86_64 sched domains patch
2004-04-14 14:14 ` Nick Piggin
@ 2004-04-14 14:41 ` Andi Kleen
2004-04-15 5:51 ` Nick Piggin
0 siblings, 1 reply; 11+ messages in thread
From: Andi Kleen @ 2004-04-14 14:41 UTC (permalink / raw)
To: Nick Piggin; +Cc: dvhltc, linux-kernel, mingo, mjbligh, ricklind, akpm
On Thu, 15 Apr 2004 00:14:19 +1000
Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> Andi Kleen wrote:
> > On Thu, 08 Apr 2004 16:22:09 -0700
> > Darren Hart <dvhltc@us.ibm.com> wrote:
> >
> >
> >
> >>This patch is intended as a quick fix for the x86_64 problem, and
> >
> >
> > Ingo's latest tweaks seemed to already cure STREAM, but some more
> > tuning is probably a good idea agreed.
> >
>
> Where is STREAM versus other kernels? You said you got
> best performance on a custom 2.4 kernel. Do we match
> that?
Differences were below the measurement error, so I consider it fixed.
>
> How is your performance for other things? I recall you
> may have told me about some other (smaller) issues you
> were seeing?
I haven't tested much yet. I can compare kernel compilations later.
Also I'm still somewhat hoping that the IBM benchmark team will take a stab at
it - they are much better than me at running many tests.
-Andi
* Re: 2.6.5-rc3-mm4 x86_64 sched domains patch
2004-04-14 14:41 ` Andi Kleen
@ 2004-04-15 5:51 ` Nick Piggin
0 siblings, 0 replies; 11+ messages in thread
From: Nick Piggin @ 2004-04-15 5:51 UTC (permalink / raw)
To: Andi Kleen; +Cc: dvhltc, linux-kernel, mingo, mjbligh, ricklind, akpm
Andi Kleen wrote:
> On Thu, 15 Apr 2004 00:14:19 +1000
> Nick Piggin <nickpiggin@yahoo.com.au> wrote:
>>Where is STREAM versus other kernels? You said you got
>>best performance on a custom 2.4 kernel. Do we match
>>that?
>
>
> Differences were below the measurement error, so I consider it fixed.
>
great.
>
>>How is your performance for other things? I recall you
>>may have told me about some other (smaller) issues you
>>were seeing?
>
>
> I haven't tested much yet. I can compare kernel compilations later.
>
That would be good. I don't expect you to do all the work,
but with Opteron being a non-traditional NUMA machine, and most
of my testing done on an old NUMA-Q, your results are quite
important. Even some numbers from a couple of random benchmarks
would be great.
> Also I'm still somewhat hoping that the IBM benchmark team will take a stab at
> it - they are much better than me at running many tests.
>
Well we've survived OSDL's STP tests as far as I know. A
couple of regressions were found and fixed there, so that
was good.
* Re: 2.6.5-rc3-mm4 x86_64 sched domains patch
2004-04-14 13:44 ` Andi Kleen
2004-04-14 14:14 ` Nick Piggin
@ 2004-04-14 17:24 ` Darren Hart
1 sibling, 0 replies; 11+ messages in thread
From: Darren Hart @ 2004-04-14 17:24 UTC (permalink / raw)
To: Andi Kleen; +Cc: lkml, piggin, Martin J Bligh, Rick Lindsley, akpm
On Wed, 2004-04-14 at 06:44, Andi Kleen wrote:
> On Thu, 08 Apr 2004 16:22:09 -0700
> Darren Hart <dvhltc@us.ibm.com> wrote:
> > This patch is intended as a quick fix for the x86_64 problem, and
>
> Ingo's latest tweaks seemed to already cure STREAM, but some more
> tuning is probably a good idea agreed.
> ...
> The patch doesn't apply against 2.6.5-mm5 anymore. Can you generate a new patch?
> I will test it then.
Find below the patch updated for akpm's 2.6.5-mm5-1.bz2 patch. As with
the previous patch I verified it works properly on a 4 node, 16 CPU
NUMA-Q. Please test both CONFIG_SCHED_NUMA=n (the improved case,
default) and CONFIG_SCHED_NUMA=y (pre-patch equivalent) on x86_64, and
thanks!
>
> Also it will need merging with the patch that adds SMT support for IA32e machines
> on x86-64.
Where is this patch?
-- Darren
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-mm5/arch/alpha/Kconfig linux-2.6.5-mm5-x86_64_arch_sched_domain/arch/alpha/Kconfig
--- linux-2.6.5-mm5/arch/alpha/Kconfig 2004-04-03 19:37:40.000000000 -0800
+++ linux-2.6.5-mm5-x86_64_arch_sched_domain/arch/alpha/Kconfig 2004-04-14 09:39:40.000000000 -0700
@@ -519,6 +519,14 @@ config NUMA
Access). This option is for configuring high-end multiprocessor
server machines. If in doubt, say N.
+config SCHED_NUMA
+ bool "Two level sched domains"
+ depends on NUMA
+ default y
+ help
+ Enable two level sched domains hierarchy.
+ Say Y if unsure.
+
# LARGE_VMALLOC is racy, if you *really* need it then fix it first
config ALPHA_LARGE_VMALLOC
bool
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-mm5/arch/i386/Kconfig linux-2.6.5-mm5-x86_64_arch_sched_domain/arch/i386/Kconfig
--- linux-2.6.5-mm5/arch/i386/Kconfig 2004-04-14 09:37:40.000000000 -0700
+++ linux-2.6.5-mm5-x86_64_arch_sched_domain/arch/i386/Kconfig 2004-04-14 09:39:40.000000000 -0700
@@ -724,6 +724,14 @@ config NUMA
default n if X86_PC
default y if (X86_NUMAQ || X86_SUMMIT)
+config SCHED_NUMA
+ bool "Two level sched domains"
+ depends on NUMA
+ default y
+ help
+ Enable two level sched domains hierarchy.
+ Say Y if unsure.
+
# Need comments to help the hapless user trying to turn on NUMA support
comment "NUMA (NUMA-Q) requires SMP, 64GB highmem support"
depends on X86_NUMAQ && (!HIGHMEM64G || !SMP)
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-mm5/arch/ia64/Kconfig linux-2.6.5-mm5-x86_64_arch_sched_domain/arch/ia64/Kconfig
--- linux-2.6.5-mm5/arch/ia64/Kconfig 2004-04-14 09:37:41.000000000 -0700
+++ linux-2.6.5-mm5-x86_64_arch_sched_domain/arch/ia64/Kconfig 2004-04-14 09:39:40.000000000 -0700
@@ -172,6 +172,14 @@ config NUMA
Access). This option is for configuring high-end multiprocessor
server systems. If in doubt, say N.
+config SCHED_NUMA
+ bool "Two level sched domains"
+ depends on NUMA
+ default y
+ help
+ Enable two level sched domains hierarchy.
+ Say Y if unsure.
+
config VIRTUAL_MEM_MAP
bool "Virtual mem map"
default y if !IA64_HP_SIM
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-mm5/arch/mips/Kconfig linux-2.6.5-mm5-x86_64_arch_sched_domain/arch/mips/Kconfig
--- linux-2.6.5-mm5/arch/mips/Kconfig 2004-04-03 19:37:06.000000000 -0800
+++ linux-2.6.5-mm5-x86_64_arch_sched_domain/arch/mips/Kconfig 2004-04-14 09:39:40.000000000 -0700
@@ -337,6 +337,14 @@ config NUMA
Access). This option is for configuring high-end multiprocessor
server machines. If in doubt, say N.
+config SCHED_NUMA
+ bool "Two level sched domains"
+ depends on NUMA
+ default y
+ help
+ Enable two level sched domains hierarchy.
+ Say Y if unsure.
+
config MAPPED_KERNEL
bool "Mapped kernel support"
depends on SGI_IP27
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-mm5/arch/ppc64/Kconfig linux-2.6.5-mm5-x86_64_arch_sched_domain/arch/ppc64/Kconfig
--- linux-2.6.5-mm5/arch/ppc64/Kconfig 2004-04-14 09:37:43.000000000 -0700
+++ linux-2.6.5-mm5-x86_64_arch_sched_domain/arch/ppc64/Kconfig 2004-04-14 09:39:40.000000000 -0700
@@ -173,6 +173,14 @@ config NUMA
bool "NUMA support"
depends on DISCONTIGMEM
+config SCHED_NUMA
+ bool "Two level sched domains"
+ depends on NUMA
+ default y
+ help
+ Enable two level sched domains hierarchy.
+ Say Y if unsure.
+
config SCHED_SMT
bool "SMT (Hyperthreading) scheduler support"
depends on SMP
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-mm5/arch/x86_64/Kconfig linux-2.6.5-mm5-x86_64_arch_sched_domain/arch/x86_64/Kconfig
--- linux-2.6.5-mm5/arch/x86_64/Kconfig 2004-04-14 09:37:46.000000000 -0700
+++ linux-2.6.5-mm5-x86_64_arch_sched_domain/arch/x86_64/Kconfig 2004-04-14 09:39:40.000000000 -0700
@@ -261,6 +261,14 @@ config NUMA
depends on K8_NUMA
default y
+config SCHED_NUMA
+ bool "Two level sched domains"
+ depends on NUMA
+ default n
+ help
+ Enable two level sched domains hierarchy.
+ Say N if unsure.
+
config HAVE_DEC_LOCK
bool
depends on SMP
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-mm5/include/linux/sched.h linux-2.6.5-mm5-x86_64_arch_sched_domain/include/linux/sched.h
--- linux-2.6.5-mm5/include/linux/sched.h 2004-04-14 09:38:08.000000000 -0700
+++ linux-2.6.5-mm5-x86_64_arch_sched_domain/include/linux/sched.h 2004-04-14 09:41:35.000000000 -0700
@@ -670,7 +670,7 @@ struct sched_domain {
.nr_balance_failed = 0, \
}
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SCHED_NUMA
/* Common values for NUMA nodes */
#define SD_NODE_INIT (struct sched_domain) { \
.span = CPU_MASK_NONE, \
diff -aurpN -X /home/dvhart/.diff.exclude linux-2.6.5-mm5/kernel/sched.c linux-2.6.5-mm5-x86_64_arch_sched_domain/kernel/sched.c
--- linux-2.6.5-mm5/kernel/sched.c 2004-04-14 09:38:09.000000000 -0700
+++ linux-2.6.5-mm5-x86_64_arch_sched_domain/kernel/sched.c 2004-04-14 09:45:34.000000000 -0700
@@ -45,7 +45,7 @@
#include <linux/seq_file.h>
#include <linux/times.h>
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SCHED_NUMA
#define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu))
#else
#define cpu_to_node_mask(cpu) (cpu_online_map)
@@ -3735,7 +3735,7 @@ extern void __init arch_init_sched_domai
#else
static struct sched_group sched_group_cpus[NR_CPUS];
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-#ifdef CONFIG_NUMA
+#ifdef CONFIG_SCHED_NUMA
static struct sched_group sched_group_nodes[MAX_NUMNODES];
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static void __init arch_init_sched_domains(void)
@@ -3806,7 +3806,7 @@ static void __init arch_init_sched_domai
}
}
-#else /* !CONFIG_NUMA */
+#else /* !CONFIG_SCHED_NUMA */
static void __init arch_init_sched_domains(void)
{
int i;
@@ -3845,7 +3845,7 @@ static void __init arch_init_sched_domai
}
}
-#endif /* CONFIG_NUMA */
+#endif /* CONFIG_SCHED_NUMA */
#endif /* ARCH_HAS_SCHED_DOMAIN */
#define SCHED_DOMAIN_DEBUG
* RE: 2.6.5-rc3-mm4 x86_64 sched domains patch
@ 2004-04-14 23:20 Siddha, Suresh B
0 siblings, 0 replies; 11+ messages in thread
From: Siddha, Suresh B @ 2004-04-14 23:20 UTC (permalink / raw)
To: Darren Hart, Andi Kleen; +Cc: lkml, piggin, Martin J Bligh, Rick Lindsley, akpm
[-- Attachment #1: Type: text/plain, Size: 311 bytes --]
Darren Hart wrote:
> On Wed, 2004-04-14 at 06:44, Andi Kleen wrote:
> > Also it will need merging with the patch that adds SMT
> support for IA32e machines
> > on x86-64.
>
> Where is this patch?
>
> -- Darren
Attached is the patch, which goes on top of a slightly older mm tree.
thanks,
suresh
[-- Attachment #2: smt.diff --]
[-- Type: application/octet-stream, Size: 17472 bytes --]
diff -Nru linux-2.6.4-rc1/arch/i386/kernel/Makefile linux-64/arch/i386/kernel/Makefile
--- linux-2.6.4-rc1/arch/i386/kernel/Makefile 2004-03-02 09:45:43.214170408 -0800
+++ linux-64/arch/i386/kernel/Makefile 2004-03-01 17:34:12.000000000 -0800
@@ -32,6 +32,7 @@
obj-$(CONFIG_HPET_TIMER) += time_hpet.o
obj-$(CONFIG_EFI) += efi.o efi_stub.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
+obj-$(CONFIG_SCHED_SMT) += init_sched_domains.o
EXTRA_AFLAGS := -traditional
diff -Nru linux-2.6.4-rc1/arch/i386/kernel/smpboot.c linux-64/arch/i386/kernel/smpboot.c
--- linux-2.6.4-rc1/arch/i386/kernel/smpboot.c 2004-03-02 09:45:43.245165696 -0800
+++ linux-64/arch/i386/kernel/smpboot.c 2004-03-01 17:34:12.000000000 -0800
@@ -1123,215 +1123,6 @@
synchronize_tsc_bp();
}
-#ifdef CONFIG_SCHED_SMT
-#ifdef CONFIG_NUMA
-static struct sched_group sched_group_cpus[NR_CPUS];
-static struct sched_group sched_group_phys[NR_CPUS];
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-__init void arch_init_sched_domains(void)
-{
- int i;
- struct sched_group *first_cpu = NULL, *last_cpu = NULL;
-
- /* Set up domains */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = cpu_sched_domain(i);
- struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
- struct sched_domain *node_domain = &per_cpu(node_domains, i);
- int node = cpu_to_node(i);
- cpumask_t nodemask = node_to_cpumask(node);
-
- *cpu_domain = SD_SIBLING_INIT;
- cpu_domain->span = cpu_sibling_map[i];
-
- *phys_domain = SD_CPU_INIT;
- phys_domain->span = nodemask;
-
- *node_domain = SD_NODE_INIT;
- node_domain->span = cpu_possible_map;
- }
-
- /* Set up CPU (sibling) groups */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = cpu_sched_domain(i);
- int j;
- first_cpu = last_cpu = NULL;
-
- if (i != first_cpu(cpu_domain->span))
- continue;
-
- for_each_cpu_mask(j, cpu_domain->span) {
- struct sched_group *cpu = &sched_group_cpus[j];
-
- cpu->cpumask = CPU_MASK_NONE;
- cpu_set(j, cpu->cpumask);
- cpu->cpu_power = SCHED_LOAD_SCALE;
-
- if (!first_cpu)
- first_cpu = cpu;
- if (last_cpu)
- last_cpu->next = cpu;
- last_cpu = cpu;
- }
- last_cpu->next = first_cpu;
- }
-
- for (i = 0; i < MAX_NUMNODES; i++) {
- int j;
- cpumask_t nodemask;
- struct sched_group *node = &sched_group_nodes[i];
- cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map);
-
- if (cpus_empty(nodemask))
- continue;
-
- first_cpu = last_cpu = NULL;
- /* Set up physical groups */
- for_each_cpu_mask(j, nodemask) {
- struct sched_domain *cpu_domain = cpu_sched_domain(j);
- struct sched_group *cpu = &sched_group_phys[j];
-
- if (j != first_cpu(cpu_domain->span))
- continue;
-
- cpu->cpumask = cpu_domain->span;
- /*
- * Make each extra sibling increase power by 10% of
- * the basic CPU. This is very arbitrary.
- */
- cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
- node->cpu_power += cpu->cpu_power;
-
- if (!first_cpu)
- first_cpu = cpu;
- if (last_cpu)
- last_cpu->next = cpu;
- last_cpu = cpu;
- }
- last_cpu->next = first_cpu;
- }
-
- /* Set up nodes */
- first_cpu = last_cpu = NULL;
- for (i = 0; i < MAX_NUMNODES; i++) {
- struct sched_group *cpu = &sched_group_nodes[i];
- cpumask_t nodemask;
- cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map);
-
- if (cpus_empty(nodemask))
- continue;
-
- cpu->cpumask = nodemask;
- /* ->cpu_power already setup */
-
- if (!first_cpu)
- first_cpu = cpu;
- if (last_cpu)
- last_cpu->next = cpu;
- last_cpu = cpu;
- }
- last_cpu->next = first_cpu;
-
- mb();
- for_each_cpu(i) {
- int node = cpu_to_node(i);
- struct sched_domain *cpu_domain = cpu_sched_domain(i);
- struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
- struct sched_domain *node_domain = &per_cpu(node_domains, i);
- struct sched_group *cpu_group = &sched_group_cpus[i];
- struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)];
- struct sched_group *node_group = &sched_group_nodes[node];
-
- cpu_domain->parent = phys_domain;
- phys_domain->parent = node_domain;
-
- node_domain->groups = node_group;
- phys_domain->groups = phys_group;
- cpu_domain->groups = cpu_group;
- }
-}
-#else /* CONFIG_NUMA */
-static struct sched_group sched_group_cpus[NR_CPUS];
-static struct sched_group sched_group_phys[NR_CPUS];
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-__init void arch_init_sched_domains(void)
-{
- int i;
- struct sched_group *first_cpu = NULL, *last_cpu = NULL;
-
- /* Set up domains */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = cpu_sched_domain(i);
- struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
-
- *cpu_domain = SD_SIBLING_INIT;
- cpu_domain->span = cpu_sibling_map[i];
-
- *phys_domain = SD_CPU_INIT;
- phys_domain->span = cpu_possible_map;
- }
-
- /* Set up CPU (sibling) groups */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = cpu_sched_domain(i);
- int j;
- first_cpu = last_cpu = NULL;
-
- if (i != first_cpu(cpu_domain->span))
- continue;
-
- for_each_cpu_mask(j, cpu_domain->span) {
- struct sched_group *cpu = &sched_group_cpus[j];
-
- cpus_clear(cpu->cpumask);
- cpu_set(j, cpu->cpumask);
- cpu->cpu_power = SCHED_LOAD_SCALE;
-
- if (!first_cpu)
- first_cpu = cpu;
- if (last_cpu)
- last_cpu->next = cpu;
- last_cpu = cpu;
- }
- last_cpu->next = first_cpu;
- }
-
- first_cpu = last_cpu = NULL;
- /* Set up physical groups */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = cpu_sched_domain(i);
- struct sched_group *cpu = &sched_group_phys[i];
-
- if (i != first_cpu(cpu_domain->span))
- continue;
-
- cpu->cpumask = cpu_domain->span;
- /* See SMT+NUMA setup for comment */
- cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
-
- if (!first_cpu)
- first_cpu = cpu;
- if (last_cpu)
- last_cpu->next = cpu;
- last_cpu = cpu;
- }
- last_cpu->next = first_cpu;
-
- mb();
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = cpu_sched_domain(i);
- struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
- struct sched_group *cpu_group = &sched_group_cpus[i];
- struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)];
- cpu_domain->parent = phys_domain;
- phys_domain->groups = phys_group;
- cpu_domain->groups = cpu_group;
- }
-}
-#endif /* CONFIG_NUMA */
-#endif /* CONFIG_SCHED_SMT */
/* These are wrappers to interface to the new boot process. Someone
who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
diff -Nru linux-2.6.4-rc1/arch/i386/kernel/init_sched_domains.c linux-64/arch/i386/kernel/init_sched_domains.c
--- linux-2.6.4-rc1/arch/i386/kernel/init_sched_domains.c 1969-12-31 16:00:00.000000000 -0800
+++ linux-64/arch/i386/kernel/init_sched_domains.c 2004-03-01 17:34:12.000000000 -0800
@@ -0,0 +1,212 @@
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+
+
+#ifdef CONFIG_NUMA
+static struct sched_group sched_group_cpus[NR_CPUS];
+static struct sched_group sched_group_phys[NR_CPUS];
+static struct sched_group sched_group_nodes[MAX_NUMNODES];
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+__init void arch_init_sched_domains(void)
+{
+ int i;
+ struct sched_group *first_cpu = NULL, *last_cpu = NULL;
+
+ /* Set up domains */
+ for_each_cpu(i) {
+ struct sched_domain *cpu_domain = cpu_sched_domain(i);
+ struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+ struct sched_domain *node_domain = &per_cpu(node_domains, i);
+ int node = cpu_to_node(i);
+ cpumask_t nodemask = node_to_cpumask(node);
+
+ *cpu_domain = SD_SIBLING_INIT;
+ cpu_domain->span = cpu_sibling_map[i];
+
+ *phys_domain = SD_CPU_INIT;
+ phys_domain->span = nodemask;
+
+ *node_domain = SD_NODE_INIT;
+ node_domain->span = cpu_possible_map;
+ }
+
+ /* Set up CPU (sibling) groups */
+ for_each_cpu(i) {
+ struct sched_domain *cpu_domain = cpu_sched_domain(i);
+ int j;
+ first_cpu = last_cpu = NULL;
+
+ if (i != first_cpu(cpu_domain->span))
+ continue;
+
+ for_each_cpu_mask(j, cpu_domain->span) {
+ struct sched_group *cpu = &sched_group_cpus[j];
+
+ cpu->cpumask = CPU_MASK_NONE;
+ cpu_set(j, cpu->cpumask);
+ cpu->cpu_power = SCHED_LOAD_SCALE;
+
+ if (!first_cpu)
+ first_cpu = cpu;
+ if (last_cpu)
+ last_cpu->next = cpu;
+ last_cpu = cpu;
+ }
+ last_cpu->next = first_cpu;
+ }
+
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ int j;
+ cpumask_t nodemask;
+ struct sched_group *node = &sched_group_nodes[i];
+ cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map);
+
+ if (cpus_empty(nodemask))
+ continue;
+
+ first_cpu = last_cpu = NULL;
+ /* Set up physical groups */
+ for_each_cpu_mask(j, nodemask) {
+ struct sched_domain *cpu_domain = cpu_sched_domain(j);
+ struct sched_group *cpu = &sched_group_phys[j];
+
+ if (j != first_cpu(cpu_domain->span))
+ continue;
+
+ cpu->cpumask = cpu_domain->span;
+ /*
+ * Make each extra sibling increase power by 10% of
+ * the basic CPU. This is very arbitrary.
+ */
+ cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
+ node->cpu_power += cpu->cpu_power;
+
+ if (!first_cpu)
+ first_cpu = cpu;
+ if (last_cpu)
+ last_cpu->next = cpu;
+ last_cpu = cpu;
+ }
+ last_cpu->next = first_cpu;
+ }
+
+ /* Set up nodes */
+ first_cpu = last_cpu = NULL;
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ struct sched_group *cpu = &sched_group_nodes[i];
+ cpumask_t nodemask;
+ cpus_and(nodemask, node_to_cpumask(i), cpu_possible_map);
+
+ if (cpus_empty(nodemask))
+ continue;
+
+ cpu->cpumask = nodemask;
+ /* ->cpu_power already setup */
+
+ if (!first_cpu)
+ first_cpu = cpu;
+ if (last_cpu)
+ last_cpu->next = cpu;
+ last_cpu = cpu;
+ }
+ last_cpu->next = first_cpu;
+
+ mb();
+ for_each_cpu(i) {
+ int node = cpu_to_node(i);
+ struct sched_domain *cpu_domain = cpu_sched_domain(i);
+ struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+ struct sched_domain *node_domain = &per_cpu(node_domains, i);
+ struct sched_group *cpu_group = &sched_group_cpus[i];
+ struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)];
+ struct sched_group *node_group = &sched_group_nodes[node];
+
+ cpu_domain->parent = phys_domain;
+ phys_domain->parent = node_domain;
+
+ node_domain->groups = node_group;
+ phys_domain->groups = phys_group;
+ cpu_domain->groups = cpu_group;
+ }
+}
+#else /* CONFIG_NUMA */
+static struct sched_group sched_group_cpus[NR_CPUS];
+static struct sched_group sched_group_phys[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+__init void arch_init_sched_domains(void)
+{
+ int i;
+ struct sched_group *first_cpu = NULL, *last_cpu = NULL;
+
+ /* Set up domains */
+ for_each_cpu(i) {
+ struct sched_domain *cpu_domain = cpu_sched_domain(i);
+ struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+
+ *cpu_domain = SD_SIBLING_INIT;
+ cpu_domain->span = cpu_sibling_map[i];
+
+ *phys_domain = SD_CPU_INIT;
+ phys_domain->span = cpu_possible_map;
+ }
+
+ /* Set up CPU (sibling) groups */
+ for_each_cpu(i) {
+ struct sched_domain *cpu_domain = cpu_sched_domain(i);
+ int j;
+ first_cpu = last_cpu = NULL;
+
+ if (i != first_cpu(cpu_domain->span))
+ continue;
+
+ for_each_cpu_mask(j, cpu_domain->span) {
+ struct sched_group *cpu = &sched_group_cpus[j];
+
+ cpus_clear(cpu->cpumask);
+ cpu_set(j, cpu->cpumask);
+ cpu->cpu_power = SCHED_LOAD_SCALE;
+
+ if (!first_cpu)
+ first_cpu = cpu;
+ if (last_cpu)
+ last_cpu->next = cpu;
+ last_cpu = cpu;
+ }
+ last_cpu->next = first_cpu;
+ }
+
+ first_cpu = last_cpu = NULL;
+ /* Set up physical groups */
+ for_each_cpu(i) {
+ struct sched_domain *cpu_domain = cpu_sched_domain(i);
+ struct sched_group *cpu = &sched_group_phys[i];
+
+ if (i != first_cpu(cpu_domain->span))
+ continue;
+
+ cpu->cpumask = cpu_domain->span;
+ /* See SMT+NUMA setup for comment */
+ cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
+
+ if (!first_cpu)
+ first_cpu = cpu;
+ if (last_cpu)
+ last_cpu->next = cpu;
+ last_cpu = cpu;
+ }
+ last_cpu->next = first_cpu;
+
+ mb();
+ for_each_cpu(i) {
+ struct sched_domain *cpu_domain = cpu_sched_domain(i);
+ struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
+ struct sched_group *cpu_group = &sched_group_cpus[i];
+ struct sched_group *phys_group = &sched_group_phys[first_cpu(cpu_domain->span)];
+ cpu_domain->parent = phys_domain;
+ phys_domain->groups = phys_group;
+ cpu_domain->groups = cpu_group;
+ }
+}
+#endif /* CONFIG_NUMA */
diff -Nru linux-2.6.4-rc1/arch/x86_64/Kconfig linux-64/arch/x86_64/Kconfig
--- linux-2.6.4-rc1/arch/x86_64/Kconfig 2004-03-02 09:45:44.055042576 -0800
+++ linux-64/arch/x86_64/Kconfig 2004-03-01 17:34:12.000000000 -0800
@@ -222,6 +222,16 @@
If you don't know what to do here, say N.
+config SCHED_SMT
+ bool "SMT (Hyperthreading) scheduler support"
+ depends on SMP
+	default n
+ help
+ SMT scheduler support improves the CPU scheduler's decision making
+ when dealing with Intel Pentium 4 chips with HyperThreading at a
+ cost of slightly increased overhead in some places. If unsure say
+ N here.
+
config PREEMPT
bool "Preemptible Kernel"
---help---
diff -Nru linux-2.6.4-rc1/arch/x86_64/kernel/Makefile linux-64/arch/x86_64/kernel/Makefile
--- linux-2.6.4-rc1/arch/x86_64/kernel/Makefile 2004-03-02 09:45:44.111034064 -0800
+++ linux-64/arch/x86_64/kernel/Makefile 2004-03-01 17:34:34.000000000 -0800
@@ -25,6 +25,7 @@
obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
+obj-$(CONFIG_SCHED_SMT) += init_sched_domains.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_KGDB) += kgdb_stub.o
@@ -36,3 +37,4 @@
topology-y += ../../i386/mach-default/topology.o
swiotlb-$(CONFIG_SWIOTLB) += ../../ia64/lib/swiotlb.o
microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o
+init_sched_domains-$(CONFIG_SCHED_SMT) += ../../i386/kernel/init_sched_domains.o
diff -Nru linux-2.6.4-rc1/arch/x86_64/kernel/smpboot.c linux-64/arch/x86_64/kernel/smpboot.c
--- linux-2.6.4-rc1/arch/x86_64/kernel/smpboot.c 2004-03-02 09:45:57.274032984 -0800
+++ linux-64/arch/x86_64/kernel/smpboot.c 2004-03-01 17:39:19.000000000 -0800
@@ -75,7 +75,7 @@
/* Set when the idlers are all forked */
int smp_threads_ready;
-int cpu_sibling_map[NR_CPUS] __cacheline_aligned;
+cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned;
/*
* Trampoline 80x86 program as an array.
@@ -872,35 +872,38 @@
Dprintk("Before bogocount - setting activated=1.\n");
}
+ Dprintk("Boot done.\n");
+
/*
- * If Hyper-Threading is avaialble, construct cpu_sibling_map[], so
- * that we can tell the sibling CPU efficiently.
+ * construct cpu_sibling_map[], so that we can tell sibling CPUs
+ * efficiently.
*/
- if (cpu_has_ht && smp_num_siblings > 1) {
- for (cpu = 0; cpu < NR_CPUS; cpu++)
- cpu_sibling_map[cpu] = NO_PROC_ID;
-
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- int i;
- if (!cpu_isset(cpu, cpu_callout_map))
- continue;
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ cpus_clear(cpu_sibling_map[cpu]);
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ int siblings = 0;
+ int i;
+ if (!cpu_isset(cpu, cpu_callout_map))
+ continue;
+ if (smp_num_siblings > 1) {
for (i = 0; i < NR_CPUS; i++) {
- if (i == cpu || !cpu_isset(i, cpu_callout_map))
+ if (!cpu_isset(i, cpu_callout_map))
continue;
if (phys_proc_id[cpu] == phys_proc_id[i]) {
- cpu_sibling_map[cpu] = i;
- break;
+ siblings++;
+ cpu_set(i, cpu_sibling_map[cpu]);
}
}
- if (cpu_sibling_map[cpu] == NO_PROC_ID) {
- smp_num_siblings = 1;
- printk(KERN_WARNING "WARNING: No sibling found for CPU %d.\n", cpu);
- }
+ } else {
+ siblings++;
+ cpu_set(cpu, cpu_sibling_map[cpu]);
}
- }
- Dprintk("Boot done.\n");
+ if (siblings != smp_num_siblings)
+ printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings);
+ }
/*
* Here we can be sure that there is an IO-APIC in the system. Let's
diff -Nru linux-2.6.4-rc1/include/asm-x86_64/processor.h linux-64/include/asm-x86_64/processor.h
--- linux-2.6.4-rc1/include/asm-x86_64/processor.h 2004-03-02 09:45:50.190109904 -0800
+++ linux-64/include/asm-x86_64/processor.h 2004-03-01 17:34:12.000000000 -0800
@@ -451,4 +451,10 @@
ti->task; \
})
+
+#ifdef CONFIG_SCHED_SMT
+#define ARCH_HAS_SCHED_DOMAIN
+#define ARCH_HAS_SCHED_WAKE_BALANCE
+#endif
+
#endif /* __ASM_X86_64_PROCESSOR_H */
diff -Nru linux-2.6.4-rc1/include/asm-x86_64/smp.h linux-64/include/asm-x86_64/smp.h
--- linux-2.6.4-rc1/include/asm-x86_64/smp.h 2004-03-02 09:45:57.284031464 -0800
+++ linux-64/include/asm-x86_64/smp.h 2004-03-01 17:39:43.000000000 -0800
@@ -47,7 +47,7 @@
extern void (*mtrr_hook) (void);
extern void zap_low_mappings(void);
void smp_stop_cpu(void);
-extern int cpu_sibling_map[];
+extern cpumask_t cpu_sibling_map[];
#define SMP_TRAMPOLINE_BASE 0x6000