From: Nick Piggin <nickpiggin@yahoo.com.au>
To: Jesse Barnes <jbarnes@engr.sgi.com>
Cc: "Martin J. Bligh" <mbligh@aracnet.com>,
linux-kernel <linux-kernel@vger.kernel.org>,
John Hawkes <hawkes@sgi.com>
Subject: Re: [PATCH] reduce inter-node balancing frequency
Date: Sat, 17 Jul 2004 13:00:46 +1000
Message-ID: <40F8965E.6070809@yahoo.com.au>
In-Reply-To: <200407161045.38983.jbarnes@engr.sgi.com>
[-- Attachment #1: Type: text/plain, Size: 968 bytes --]
Jesse Barnes wrote:
> On Friday, July 16, 2004 1:53 am, Nick Piggin wrote:
>>Instead of a top level domain spanning all CPUs, have each CPU's top level
>>domain just span all CPUs within a couple of hops (enough to get, say 16 to
>>64 CPUs into each top level domain). I could give you a hand with this if
>>you need.
>
>
> Yeah, that's what I had in mind. I'll wait for the patch you mentioned above
> and hack on top of that...
>
The patch is attached, although it needs a bit of commenting and testing.
Also, the init_sched_build_groups helper function in kernel/sched.c probably
wants to be exported for use by architecture code.
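For illustration, arch code could then do something like this (a rough
sketch only -- cpu_to_hopgroup() and the hop_domains/sched_group_hops
names are made up here, and it assumes init_sched_build_groups grows a
prototype in include/linux/sched.h):

	static DEFINE_PER_CPU(struct sched_domain, hop_domains);
	static struct sched_group sched_group_hops[MAX_NUMNODES];

	/* Map each CPU to the group for its nearby-node cluster */
	static int cpu_to_hopgroup(int cpu)
	{
		return cpu_to_node(cpu);
	}

	static void __init arch_setup_hop_groups(void)
	{
		/* Build the circular group list over the whole span */
		init_sched_build_groups(sched_group_hops, cpu_possible_map,
					&cpu_to_hopgroup);
	}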
Out of interest, what sort of performance problems are you seeing with
this high rate of global balancing? I have a couple of patches to cut down
runqueue locking to almost zero in interrupt paths, although I imagine the
main problem you are having is pulling a cacheline off every remote CPU
when calculating runqueue loads?
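As a strawman for the "span only nearby CPUs" idea quoted above (purely
a sketch, not part of the attached patch; node_distance() and the
cutoff value are assumptions -- an arch would substitute its own
distance information):

	#define NEARBY_DISTANCE	25	/* arbitrary SLIT cutoff, an assumption */

	static cpumask_t __init nearby_cpu_map(int cpu)
	{
		cpumask_t span = CPU_MASK_NONE;
		int n, my_node = cpu_to_node(cpu);

		/* gather CPUs on all nodes within "a couple of hops" */
		for (n = 0; n < MAX_NUMNODES; n++) {
			if (node_distance(my_node, n) <= NEARBY_DISTANCE) {
				cpumask_t nodemask = node_to_cpumask(n);
				cpus_or(span, span, nodemask);
			}
		}

		return span;
	}

Each CPU's top level domain would then get this as its ->span instead
of cpu_possible_map, which keeps the balancing traffic within 16-64
CPUs or so.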
[-- Attachment #2: sched-consolidate-domains.patch --]
[-- Type: text/x-patch, Size: 27530 bytes --]
---
linux-2.6-npiggin/arch/i386/kernel/smpboot.c | 207 -------------------
linux-2.6-npiggin/arch/ppc64/kernel/smp.c | 227 +--------------------
linux-2.6-npiggin/arch/x86_64/kernel/Makefile | 1
linux-2.6-npiggin/arch/x86_64/kernel/Makefile-HEAD | 1
linux-2.6-npiggin/include/asm-i386/processor.h | 5
linux-2.6-npiggin/include/asm-ppc64/processor.h | 5
linux-2.6-npiggin/include/asm-ppc64/smp.h | 3
linux-2.6-npiggin/include/asm-x86_64/processor.h | 5
linux-2.6-npiggin/include/linux/sched.h | 5
linux-2.6-npiggin/kernel/sched.c | 203 +++++++++++-------
linux-2.6/arch/x86_64/kernel/domain.c | 93 --------
11 files changed, 142 insertions(+), 613 deletions(-)
diff -puN kernel/sched.c~sched-consolidate-domains kernel/sched.c
--- linux-2.6/kernel/sched.c~sched-consolidate-domains 2004-07-16 15:55:58.000000000 +1000
+++ linux-2.6-npiggin/kernel/sched.c 2004-07-16 20:37:39.000000000 +1000
@@ -3674,118 +3674,159 @@ void cpu_attach_domain(struct sched_doma
#ifdef ARCH_HAS_SCHED_DOMAIN
extern void __init arch_init_sched_domains(void);
#else
-static struct sched_group sched_group_cpus[NR_CPUS];
+
+#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+static struct sched_group sched_group_cpus[NR_CPUS];
+__init static int cpu_to_cpu_group(int cpu)
+{
+ return cpu;
+}
+#endif
+
+static DEFINE_PER_CPU(struct sched_domain, phys_domains);
+static struct sched_group sched_group_phys[NR_CPUS];
+__init static int cpu_to_phys_group(int cpu)
+{
+ return first_cpu(cpu_sibling_map[cpu]);
+}
+
#ifdef CONFIG_NUMA
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
static DEFINE_PER_CPU(struct sched_domain, node_domains);
-static void __init arch_init_sched_domains(void)
+static struct sched_group sched_group_nodes[MAX_NUMNODES];
+__init static int cpu_to_node_group(int cpu)
{
- int i;
- struct sched_group *first_node = NULL, *last_node = NULL;
+ return cpu_to_node(cpu);
+}
+#endif
- /* Set up domains */
- for_each_cpu(i) {
- int node = cpu_to_node(i);
- cpumask_t nodemask = node_to_cpumask(node);
- struct sched_domain *node_sd = &per_cpu(node_domains, i);
- struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
-
- *node_sd = SD_NODE_INIT;
- node_sd->span = cpu_possible_map;
- node_sd->groups = &sched_group_nodes[cpu_to_node(i)];
-
- *cpu_sd = SD_CPU_INIT;
- cpus_and(cpu_sd->span, nodemask, cpu_possible_map);
- cpu_sd->groups = &sched_group_cpus[i];
- cpu_sd->parent = node_sd;
- }
+__init static void init_sched_build_groups(struct sched_group groups[],
+ cpumask_t span, int (*group_fn)(int cpu))
+{
+ struct sched_group *first = NULL, *last = NULL;
+ cpumask_t covered = CPU_MASK_NONE;
+ int i;
- /* Set up groups */
- for (i = 0; i < MAX_NUMNODES; i++) {
- cpumask_t tmp = node_to_cpumask(i);
- cpumask_t nodemask;
- struct sched_group *first_cpu = NULL, *last_cpu = NULL;
- struct sched_group *node = &sched_group_nodes[i];
+ for_each_cpu_mask(i, span) {
+ int group = group_fn(i);
+ struct sched_group *sg = &groups[group];
int j;
- cpus_and(nodemask, tmp, cpu_possible_map);
-
- if (cpus_empty(nodemask))
+ if (cpu_isset(i, covered))
continue;
- node->cpumask = nodemask;
- node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask);
+ sg->cpumask = CPU_MASK_NONE;
+ sg->cpu_power = 0;
- for_each_cpu_mask(j, node->cpumask) {
- struct sched_group *cpu = &sched_group_cpus[j];
+ for_each_cpu_mask(j, span) {
+ if (group_fn(j) != group)
+ continue;
- cpus_clear(cpu->cpumask);
- cpu_set(j, cpu->cpumask);
- cpu->cpu_power = SCHED_LOAD_SCALE;
-
- if (!first_cpu)
- first_cpu = cpu;
- if (last_cpu)
- last_cpu->next = cpu;
- last_cpu = cpu;
+ cpu_set(j, covered);
+ cpu_set(j, sg->cpumask);
}
- last_cpu->next = first_cpu;
-
- if (!first_node)
- first_node = node;
- if (last_node)
- last_node->next = node;
- last_node = node;
- }
- last_node->next = first_node;
-
- mb();
- for_each_cpu(i) {
- struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
- cpu_attach_domain(cpu_sd, i);
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
}
+ last->next = first;
}
-#else /* !CONFIG_NUMA */
-static void __init arch_init_sched_domains(void)
+__init void arch_init_sched_domains(void)
{
int i;
- struct sched_group *first_cpu = NULL, *last_cpu = NULL;
/* Set up domains */
for_each_cpu(i) {
- struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
+ int group;
+ struct sched_domain *sd = NULL, *p;
+ cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
- *cpu_sd = SD_CPU_INIT;
- cpu_sd->span = cpu_possible_map;
- cpu_sd->groups = &sched_group_cpus[i];
+#ifdef CONFIG_NUMA
+ sd = &per_cpu(node_domains, i);
+ group = cpu_to_node_group(i);
+ *sd = SD_NODE_INIT;
+ sd->span = cpu_possible_map;
+ sd->groups = &sched_group_nodes[group];
+#endif
+
+ p = sd;
+ sd = &per_cpu(phys_domains, i);
+ group = cpu_to_phys_group(i);
+ *sd = SD_CPU_INIT;
+ sd->span = nodemask;
+ sd->parent = p;
+ sd->groups = &sched_group_phys[group];
+
+#ifdef CONFIG_SCHED_SMT
+ p = sd;
+ sd = &per_cpu(cpu_domains, i);
+ group = cpu_to_cpu_group(i);
+ *sd = SD_SIBLING_INIT;
+ sd->span = cpu_sibling_map[i];
+ sd->parent = p;
+ sd->groups = &sched_group_cpus[group];
+#endif
+ }
+
+#ifdef CONFIG_SCHED_SMT
+ /* Set up CPU (sibling) groups */
+ for_each_cpu(i) {
+ if (i != first_cpu(cpu_sibling_map[i]))
+ continue;
+
+ init_sched_build_groups(sched_group_cpus, cpu_sibling_map[i],
+ &cpu_to_cpu_group);
+ }
+#endif
+
+ /* Set up physical groups */
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ cpumask_t nodemask = node_to_cpumask(i);
+
+ cpus_and(nodemask, nodemask, cpu_possible_map);
+ if (cpus_empty(nodemask))
+ continue;
+
+ init_sched_build_groups(sched_group_phys, nodemask,
+ &cpu_to_phys_group);
}
- /* Set up CPU groups */
- for_each_cpu_mask(i, cpu_possible_map) {
- struct sched_group *cpu = &sched_group_cpus[i];
+#ifdef CONFIG_NUMA
+ /* Set up node groups */
+ init_sched_build_groups(sched_group_nodes, cpu_possible_map,
+ &cpu_to_node_group);
+#endif
- cpus_clear(cpu->cpumask);
- cpu_set(i, cpu->cpumask);
- cpu->cpu_power = SCHED_LOAD_SCALE;
+ /* Calculate CPU power for physical packages and nodes */
+ for_each_cpu(i) {
+ int power;
+ struct sched_domain *sd;
+#ifdef CONFIG_SCHED_SMT
+ sd = &per_cpu(cpu_domains, i);
+ power = SCHED_LOAD_SCALE;
+ sd->groups->cpu_power = power;
+#endif
+
+ sd = &per_cpu(phys_domains, i);
+ power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
+ (cpus_weight(sd->groups->cpumask)-1) / 10;
+ sd->groups->cpu_power = power;
- if (!first_cpu)
- first_cpu = cpu;
- if (last_cpu)
- last_cpu->next = cpu;
- last_cpu = cpu;
+#ifdef CONFIG_NUMA
+ sd = &per_cpu(node_domains, i);
+ sd->groups->cpu_power += power;
+#endif
}
- last_cpu->next = first_cpu;
- mb(); /* domains were modified outside the lock */
+ /* Attach the domains */
for_each_cpu(i) {
- struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
- cpu_attach_domain(cpu_sd, i);
+ struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
+ cpu_attach_domain(cpu_domain, i);
}
}
-
-#endif /* CONFIG_NUMA */
#endif /* ARCH_HAS_SCHED_DOMAIN */
#define SCHED_DOMAIN_DEBUG
diff -puN include/linux/sched.h~sched-consolidate-domains include/linux/sched.h
--- linux-2.6/include/linux/sched.h~sched-consolidate-domains 2004-07-16 15:56:02.000000000 +1000
+++ linux-2.6-npiggin/include/linux/sched.h 2004-07-16 20:47:42.000000000 +1000
@@ -615,6 +615,9 @@ struct sched_domain {
unsigned int nr_balance_failed; /* initialise to 0 */
};
+#ifndef ARCH_HAS_SCHED_TUNE
+#ifdef CONFIG_SCHED_SMT
+#define ARCH_HAS_SCHED_WAKE_IDLE
/* Common values for SMT siblings */
#define SD_SIBLING_INIT (struct sched_domain) { \
.span = CPU_MASK_NONE, \
@@ -636,6 +639,7 @@ struct sched_domain {
.balance_interval = 1, \
.nr_balance_failed = 0, \
}
+#endif
/* Common values for CPUs */
#define SD_CPU_INIT (struct sched_domain) { \
@@ -678,6 +682,7 @@ struct sched_domain {
.nr_balance_failed = 0, \
}
#endif
+#endif /* ARCH_HAS_SCHED_TUNE */
extern void cpu_attach_domain(struct sched_domain *sd, int cpu);
diff -puN arch/i386/kernel/smpboot.c~sched-consolidate-domains arch/i386/kernel/smpboot.c
--- linux-2.6/arch/i386/kernel/smpboot.c~sched-consolidate-domains 2004-07-16 15:56:15.000000000 +1000
+++ linux-2.6-npiggin/arch/i386/kernel/smpboot.c 2004-07-16 15:56:45.000000000 +1000
@@ -1136,213 +1136,6 @@ static void __init smp_boot_cpus(unsigne
smp_tune_scheduling();
}
-#ifdef CONFIG_SCHED_SMT
-#ifdef CONFIG_NUMA
-static struct sched_group sched_group_cpus[NR_CPUS];
-static struct sched_group sched_group_phys[NR_CPUS];
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-__init void arch_init_sched_domains(void)
-{
- int i;
- struct sched_group *first = NULL, *last = NULL;
-
- /* Set up domains */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
- struct sched_domain *node_domain = &per_cpu(node_domains, i);
- int node = cpu_to_node(i);
- cpumask_t nodemask = node_to_cpumask(node);
-
- *cpu_domain = SD_SIBLING_INIT;
- cpu_domain->span = cpu_sibling_map[i];
- cpu_domain->parent = phys_domain;
- cpu_domain->groups = &sched_group_cpus[i];
-
- *phys_domain = SD_CPU_INIT;
- phys_domain->span = nodemask;
- phys_domain->parent = node_domain;
- phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
-
- *node_domain = SD_NODE_INIT;
- node_domain->span = cpu_possible_map;
- node_domain->groups = &sched_group_nodes[cpu_to_node(i)];
- }
-
- /* Set up CPU (sibling) groups */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- int j;
- first = last = NULL;
-
- if (i != first_cpu(cpu_domain->span))
- continue;
-
- for_each_cpu_mask(j, cpu_domain->span) {
- struct sched_group *cpu = &sched_group_cpus[j];
-
- cpu->cpumask = CPU_MASK_NONE;
- cpu_set(j, cpu->cpumask);
- cpu->cpu_power = SCHED_LOAD_SCALE;
-
- if (!first)
- first = cpu;
- if (last)
- last->next = cpu;
- last = cpu;
- }
- last->next = first;
- }
-
- for (i = 0; i < MAX_NUMNODES; i++) {
- int j;
- cpumask_t nodemask;
- struct sched_group *node = &sched_group_nodes[i];
- cpumask_t node_cpumask = node_to_cpumask(i);
-
- cpus_and(nodemask, node_cpumask, cpu_possible_map);
-
- if (cpus_empty(nodemask))
- continue;
-
- first = last = NULL;
- /* Set up physical groups */
- for_each_cpu_mask(j, nodemask) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j);
- struct sched_group *cpu = &sched_group_phys[j];
-
- if (j != first_cpu(cpu_domain->span))
- continue;
-
- cpu->cpumask = cpu_domain->span;
- /*
- * Make each extra sibling increase power by 10% of
- * the basic CPU. This is very arbitrary.
- */
- cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
- node->cpu_power += cpu->cpu_power;
-
- if (!first)
- first = cpu;
- if (last)
- last->next = cpu;
- last = cpu;
- }
- last->next = first;
- }
-
- /* Set up nodes */
- first = last = NULL;
- for (i = 0; i < MAX_NUMNODES; i++) {
- struct sched_group *cpu = &sched_group_nodes[i];
- cpumask_t nodemask;
- cpumask_t node_cpumask = node_to_cpumask(i);
-
- cpus_and(nodemask, node_cpumask, cpu_possible_map);
-
- if (cpus_empty(nodemask))
- continue;
-
- cpu->cpumask = nodemask;
- /* ->cpu_power already setup */
-
- if (!first)
- first = cpu;
- if (last)
- last->next = cpu;
- last = cpu;
- }
- last->next = first;
-
- mb();
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- cpu_attach_domain(cpu_domain, i);
- }
-}
-#else /* !CONFIG_NUMA */
-static struct sched_group sched_group_cpus[NR_CPUS];
-static struct sched_group sched_group_phys[NR_CPUS];
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-__init void arch_init_sched_domains(void)
-{
- int i;
- struct sched_group *first = NULL, *last = NULL;
-
- /* Set up domains */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
-
- *cpu_domain = SD_SIBLING_INIT;
- cpu_domain->span = cpu_sibling_map[i];
- cpu_domain->parent = phys_domain;
- cpu_domain->groups = &sched_group_cpus[i];
-
- *phys_domain = SD_CPU_INIT;
- phys_domain->span = cpu_possible_map;
- phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
- }
-
- /* Set up CPU (sibling) groups */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- int j;
- first = last = NULL;
-
- if (i != first_cpu(cpu_domain->span))
- continue;
-
- for_each_cpu_mask(j, cpu_domain->span) {
- struct sched_group *cpu = &sched_group_cpus[j];
-
- cpus_clear(cpu->cpumask);
- cpu_set(j, cpu->cpumask);
- cpu->cpu_power = SCHED_LOAD_SCALE;
-
- if (!first)
- first = cpu;
- if (last)
- last->next = cpu;
- last = cpu;
- }
- last->next = first;
- }
-
- first = last = NULL;
- /* Set up physical groups */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- struct sched_group *cpu = &sched_group_phys[i];
-
- if (i != first_cpu(cpu_domain->span))
- continue;
-
- cpu->cpumask = cpu_domain->span;
- /* See SMT+NUMA setup for comment */
- cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
-
- if (!first)
- first = cpu;
- if (last)
- last->next = cpu;
- last = cpu;
- }
- last->next = first;
-
- mb();
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- cpu_attach_domain(cpu_domain, i);
- }
-}
-#endif /* CONFIG_NUMA */
-#endif /* CONFIG_SCHED_SMT */
-
/* These are wrappers to interface to the new boot process. Someone
who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */
void __init smp_prepare_cpus(unsigned int max_cpus)
diff -puN include/asm-ppc64/processor.h~sched-consolidate-domains include/asm-ppc64/processor.h
--- linux-2.6/include/asm-ppc64/processor.h~sched-consolidate-domains 2004-07-16 19:53:13.000000000 +1000
+++ linux-2.6-npiggin/include/asm-ppc64/processor.h 2004-07-16 20:00:32.000000000 +1000
@@ -626,11 +626,6 @@ static inline void prefetchw(const void
#define spin_lock_prefetch(x) prefetchw(x)
-#ifdef CONFIG_SCHED_SMT
-#define ARCH_HAS_SCHED_DOMAIN
-#define ARCH_HAS_SCHED_WAKE_IDLE
-#endif
-
#endif /* ASSEMBLY */
/*
diff -puN include/asm-i386/processor.h~sched-consolidate-domains include/asm-i386/processor.h
--- linux-2.6/include/asm-i386/processor.h~sched-consolidate-domains 2004-07-16 19:53:18.000000000 +1000
+++ linux-2.6-npiggin/include/asm-i386/processor.h 2004-07-16 20:02:24.000000000 +1000
@@ -649,9 +649,4 @@ extern void select_idle_routine(const st
#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
-#ifdef CONFIG_SCHED_SMT
-#define ARCH_HAS_SCHED_DOMAIN
-#define ARCH_HAS_SCHED_WAKE_IDLE
-#endif
-
#endif /* __ASM_I386_PROCESSOR_H */
diff -puN include/asm-x86_64/processor.h~sched-consolidate-domains include/asm-x86_64/processor.h
--- linux-2.6/include/asm-x86_64/processor.h~sched-consolidate-domains 2004-07-16 19:53:22.000000000 +1000
+++ linux-2.6-npiggin/include/asm-x86_64/processor.h 2004-07-16 20:02:29.000000000 +1000
@@ -458,9 +458,4 @@ static inline void __mwait(unsigned long
#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
-#ifdef CONFIG_SCHED_SMT
-#define ARCH_HAS_SCHED_DOMAIN
-#define ARCH_HAS_SCHED_WAKE_IDLE
-#endif
-
#endif /* __ASM_X86_64_PROCESSOR_H */
diff -puN arch/ppc64/kernel/smp.c~sched-consolidate-domains arch/ppc64/kernel/smp.c
--- linux-2.6/arch/ppc64/kernel/smp.c~sched-consolidate-domains 2004-07-16 19:56:33.000000000 +1000
+++ linux-2.6-npiggin/arch/ppc64/kernel/smp.c 2004-07-16 20:14:54.000000000 +1000
@@ -55,6 +55,9 @@
#include <asm/rtas.h>
int smp_threads_ready;
+#ifdef CONFIG_SCHED_SMT
+cpumask_t cpu_sibling_map[NR_CPUS];
+#endif
unsigned long cache_decay_ticks;
cpumask_t cpu_possible_map = CPU_MASK_NONE;
@@ -436,6 +439,15 @@ static inline void look_for_more_cpus(vo
/* Make those cpus (which might appear later) possible too. */
for (i = 0; i < maxcpus; i++)
cpu_set(i, cpu_possible_map);
+
+#ifdef CONFIG_SCHED_SMT
+ memset(cpu_sibling_map, 0, sizeof(cpu_sibling_map));
+ for_each_cpu(i) {
+ cpu_set(i, cpu_sibling_map[i]);
+ if (cur_cpu_spec->cpu_features & CPU_FTR_SMT)
+ cpu_set(i^1, cpu_sibling_map[i]);
+ }
+#endif
}
#else /* ... CONFIG_HOTPLUG_CPU */
static inline int __devinit smp_startup_cpu(unsigned int lcpu)
@@ -990,218 +1002,3 @@ void __init smp_cpus_done(unsigned int m
set_cpus_allowed(current, old_mask);
}
-
-#ifdef CONFIG_SCHED_SMT
-#ifdef CONFIG_NUMA
-static struct sched_group sched_group_cpus[NR_CPUS];
-static struct sched_group sched_group_phys[NR_CPUS];
-static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static DEFINE_PER_CPU(struct sched_domain, node_domains);
-__init void arch_init_sched_domains(void)
-{
- int i;
- struct sched_group *first = NULL, *last = NULL;
-
- /* Set up domains */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
- struct sched_domain *node_domain = &per_cpu(node_domains, i);
- int node = cpu_to_node(i);
- cpumask_t nodemask = node_to_cpumask(node);
- cpumask_t my_cpumask = cpumask_of_cpu(i);
- cpumask_t sibling_cpumask = cpumask_of_cpu(i ^ 0x1);
-
- *cpu_domain = SD_SIBLING_INIT;
- if (cur_cpu_spec->cpu_features & CPU_FTR_SMT)
- cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask);
- else
- cpu_domain->span = my_cpumask;
- cpu_domain->parent = phys_domain;
- cpu_domain->groups = &sched_group_cpus[i];
-
- *phys_domain = SD_CPU_INIT;
- phys_domain->span = nodemask;
- phys_domain->parent = node_domain;
- phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
-
- *node_domain = SD_NODE_INIT;
- node_domain->span = cpu_possible_map;
- node_domain->groups = &sched_group_nodes[node];
- }
-
- /* Set up CPU (sibling) groups */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- int j;
- first = last = NULL;
-
- if (i != first_cpu(cpu_domain->span))
- continue;
-
- for_each_cpu_mask(j, cpu_domain->span) {
- struct sched_group *cpu = &sched_group_cpus[j];
-
- cpus_clear(cpu->cpumask);
- cpu_set(j, cpu->cpumask);
- cpu->cpu_power = SCHED_LOAD_SCALE;
-
- if (!first)
- first = cpu;
- if (last)
- last->next = cpu;
- last = cpu;
- }
- last->next = first;
- }
-
- for (i = 0; i < MAX_NUMNODES; i++) {
- int j;
- cpumask_t nodemask;
- struct sched_group *node = &sched_group_nodes[i];
- cpumask_t node_cpumask = node_to_cpumask(i);
- cpus_and(nodemask, node_cpumask, cpu_possible_map);
-
- if (cpus_empty(nodemask))
- continue;
-
- first = last = NULL;
- /* Set up physical groups */
- for_each_cpu_mask(j, nodemask) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j);
- struct sched_group *cpu = &sched_group_phys[j];
-
- if (j != first_cpu(cpu_domain->span))
- continue;
-
- cpu->cpumask = cpu_domain->span;
- /*
- * Make each extra sibling increase power by 10% of
- * the basic CPU. This is very arbitrary.
- */
- cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
- node->cpu_power += cpu->cpu_power;
-
- if (!first)
- first = cpu;
- if (last)
- last->next = cpu;
- last = cpu;
- }
- last->next = first;
- }
-
- /* Set up nodes */
- first = last = NULL;
- for (i = 0; i < MAX_NUMNODES; i++) {
- struct sched_group *cpu = &sched_group_nodes[i];
- cpumask_t nodemask;
- cpumask_t node_cpumask = node_to_cpumask(i);
- cpus_and(nodemask, node_cpumask, cpu_possible_map);
-
- if (cpus_empty(nodemask))
- continue;
-
- cpu->cpumask = nodemask;
- /* ->cpu_power already setup */
-
- if (!first)
- first = cpu;
- if (last)
- last->next = cpu;
- last = cpu;
- }
- last->next = first;
-
- mb();
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- cpu_attach_domain(cpu_domain, i);
- }
-}
-#else /* !CONFIG_NUMA */
-static struct sched_group sched_group_cpus[NR_CPUS];
-static struct sched_group sched_group_phys[NR_CPUS];
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-__init void arch_init_sched_domains(void)
-{
- int i;
- struct sched_group *first = NULL, *last = NULL;
-
- /* Set up domains */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
- cpumask_t my_cpumask = cpumask_of_cpu(i);
- cpumask_t sibling_cpumask = cpumask_of_cpu(i ^ 0x1);
-
- *cpu_domain = SD_SIBLING_INIT;
- if (cur_cpu_spec->cpu_features & CPU_FTR_SMT)
- cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask);
- else
- cpu_domain->span = my_cpumask;
- cpu_domain->parent = phys_domain;
- cpu_domain->groups = &sched_group_cpus[i];
-
- *phys_domain = SD_CPU_INIT;
- phys_domain->span = cpu_possible_map;
- phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
- }
-
- /* Set up CPU (sibling) groups */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- int j;
- first = last = NULL;
-
- if (i != first_cpu(cpu_domain->span))
- continue;
-
- for_each_cpu_mask(j, cpu_domain->span) {
- struct sched_group *cpu = &sched_group_cpus[j];
-
- cpus_clear(cpu->cpumask);
- cpu_set(j, cpu->cpumask);
- cpu->cpu_power = SCHED_LOAD_SCALE;
-
- if (!first)
- first = cpu;
- if (last)
- last->next = cpu;
- last = cpu;
- }
- last->next = first;
- }
-
- first = last = NULL;
- /* Set up physical groups */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- struct sched_group *cpu = &sched_group_phys[i];
-
- if (i != first_cpu(cpu_domain->span))
- continue;
-
- cpu->cpumask = cpu_domain->span;
- /* See SMT+NUMA setup for comment */
- cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
-
- if (!first)
- first = cpu;
- if (last)
- last->next = cpu;
- last = cpu;
- }
- last->next = first;
-
- mb();
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- cpu_attach_domain(cpu_domain, i);
- }
-}
-#endif /* CONFIG_NUMA */
-#endif /* CONFIG_SCHED_SMT */
diff -L arch/x86_64/kernel/domain.c -puN arch/x86_64/kernel/domain.c~sched-consolidate-domains /dev/null
--- linux-2.6/arch/x86_64/kernel/domain.c
+++ /dev/null 2004-06-24 18:17:02.000000000 +1000
@@ -1,93 +0,0 @@
-#include <linux/init.h>
-#include <linux/sched.h>
-
-/* Don't do any NUMA setup on Opteron right now. They seem to be
- better off with flat scheduling. This is just for SMT. */
-
-#ifdef CONFIG_SCHED_SMT
-
-static struct sched_group sched_group_cpus[NR_CPUS];
-static struct sched_group sched_group_phys[NR_CPUS];
-static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-__init void arch_init_sched_domains(void)
-{
- int i;
- struct sched_group *first = NULL, *last = NULL;
-
- /* Set up domains */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- struct sched_domain *phys_domain = &per_cpu(phys_domains, i);
-
- *cpu_domain = SD_SIBLING_INIT;
- /* Disable SMT NICE for CMP */
- /* RED-PEN use a generic flag */
- if (cpu_data[i].x86_vendor == X86_VENDOR_AMD)
- cpu_domain->flags &= ~SD_SHARE_CPUPOWER;
- cpu_domain->span = cpu_sibling_map[i];
- cpu_domain->parent = phys_domain;
- cpu_domain->groups = &sched_group_cpus[i];
-
- *phys_domain = SD_CPU_INIT;
- phys_domain->span = cpu_possible_map;
- phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)];
- }
-
- /* Set up CPU (sibling) groups */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- int j;
- first = last = NULL;
-
- if (i != first_cpu(cpu_domain->span))
- continue;
-
- for_each_cpu_mask(j, cpu_domain->span) {
- struct sched_group *cpu = &sched_group_cpus[j];
-
- cpus_clear(cpu->cpumask);
- cpu_set(j, cpu->cpumask);
- cpu->cpu_power = SCHED_LOAD_SCALE;
-
- if (!first)
- first = cpu;
- if (last)
- last->next = cpu;
- last = cpu;
- }
- last->next = first;
- }
-
- first = last = NULL;
- /* Set up physical groups */
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- struct sched_group *cpu = &sched_group_phys[i];
-
- if (i != first_cpu(cpu_domain->span))
- continue;
-
- cpu->cpumask = cpu_domain->span;
- /*
- * Make each extra sibling increase power by 10% of
- * the basic CPU. This is very arbitrary.
- */
- cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
-
- if (!first)
- first = cpu;
- if (last)
- last->next = cpu;
- last = cpu;
- }
- last->next = first;
-
- mb();
- for_each_cpu(i) {
- struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i);
- cpu_attach_domain(cpu_domain, i);
- }
-}
-
-#endif
diff -puN arch/x86_64/kernel/Makefile~sched-consolidate-domains arch/x86_64/kernel/Makefile
--- linux-2.6/arch/x86_64/kernel/Makefile~sched-consolidate-domains 2004-07-16 20:04:19.000000000 +1000
+++ linux-2.6-npiggin/arch/x86_64/kernel/Makefile 2004-07-16 20:04:31.000000000 +1000
@@ -25,7 +25,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_prin
obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
-obj-$(CONFIG_SCHED_SMT) += domain.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_KGDB) += kgdb_stub.o
diff -puN arch/x86_64/kernel/Makefile-HEAD~sched-consolidate-domains arch/x86_64/kernel/Makefile-HEAD
--- linux-2.6/arch/x86_64/kernel/Makefile-HEAD~sched-consolidate-domains 2004-07-16 20:04:24.000000000 +1000
+++ linux-2.6-npiggin/arch/x86_64/kernel/Makefile-HEAD 2004-07-16 20:04:34.000000000 +1000
@@ -25,7 +25,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_prin
obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o
obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o
obj-$(CONFIG_SWIOTLB) += swiotlb.o
-obj-$(CONFIG_SCHED_SMT) += domain.o
obj-$(CONFIG_MODULES) += module.o
diff -puN include/asm-ppc64/smp.h~sched-consolidate-domains include/asm-ppc64/smp.h
--- linux-2.6/include/asm-ppc64/smp.h~sched-consolidate-domains 2004-07-16 20:06:13.000000000 +1000
+++ linux-2.6-npiggin/include/asm-ppc64/smp.h 2004-07-16 20:07:37.000000000 +1000
@@ -73,6 +73,9 @@ void smp_init_pSeries(void);
extern int __cpu_disable(void);
extern void __cpu_die(unsigned int cpu);
extern void cpu_die(void) __attribute__((noreturn));
+#ifdef CONFIG_SCHED_SMT
+extern cpumask_t cpu_sibling_map[NR_CPUS];
+#endif
#endif /* !(CONFIG_SMP) */
#define get_hard_smp_processor_id(CPU) (paca[(CPU)].hw_cpu_id)
_
Thread overview: 14+ messages
2004-07-15 22:29 [PATCH] reduce inter-node balancing frequency Jesse Barnes
2004-07-16 0:14 ` Martin J. Bligh
2004-07-16 0:38 ` Jesse Barnes
2004-07-16 1:48 ` Nick Piggin
2004-07-16 1:58 ` Jesse Barnes
2004-07-16 5:40 ` Martin J. Bligh
2004-07-16 5:53 ` Nick Piggin
2004-07-16 14:45 ` Jesse Barnes
2004-07-16 15:04 ` Chris Wedgwood
2004-07-16 15:30 ` Jesse Barnes
2004-07-17 3:00 ` Nick Piggin [this message]
2004-07-17 16:44 ` Jesse Barnes
2004-07-16 14:42 ` Jesse Barnes
2004-07-18 13:12 ` Jes Sorensen