From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S264843AbUGQDBv (ORCPT ); Fri, 16 Jul 2004 23:01:51 -0400 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S264855AbUGQDBv (ORCPT ); Fri, 16 Jul 2004 23:01:51 -0400 Received: from smtp107.mail.sc5.yahoo.com ([66.163.169.227]:23131 "HELO smtp107.mail.sc5.yahoo.com") by vger.kernel.org with SMTP id S264843AbUGQDA4 (ORCPT ); Fri, 16 Jul 2004 23:00:56 -0400 Message-ID: <40F8965E.6070809@yahoo.com.au> Date: Sat, 17 Jul 2004 13:00:46 +1000 From: Nick Piggin User-Agent: Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7) Gecko/20040707 Debian/1.7-5 X-Accept-Language: en MIME-Version: 1.0 To: Jesse Barnes CC: "Martin J. Bligh" , linux-kernel , John Hawkes Subject: Re: [PATCH] reduce inter-node balancing frequency References: <200407151829.20069.jbarnes@engr.sgi.com> <2700000.1089956404@[10.10.2.4]> <40F76D3F.8050309@yahoo.com.au> <200407161045.38983.jbarnes@engr.sgi.com> In-Reply-To: <200407161045.38983.jbarnes@engr.sgi.com> Content-Type: multipart/mixed; boundary="------------040108020208060701010304" Sender: linux-kernel-owner@vger.kernel.org X-Mailing-List: linux-kernel@vger.kernel.org This is a multi-part message in MIME format. --------------040108020208060701010304 Content-Type: text/plain; charset=us-ascii; format=flowed Content-Transfer-Encoding: 7bit Jesse Barnes wrote: > On Friday, July 16, 2004 1:53 am, Nick Piggin wrote: >>Instead of a top level domain spanning all CPUs, have each CPU's top level >>domain just span all CPUs within a couple of hops (enough to get, say 16 to >>64 CPUs into each top level domain). I could give you a hand with this if >>you need. > > > Yeah, that's what I had in mind. I'll wait for the patch you mentioned above > and hack on top of that... > The patch is attached, although it needs a bit of commenting and testing. Also, the init_sched_build_groups helper function in kernel/sched.c probably wants to be exported for use by architecture code. Out of interest, what sort of performance problems are you seeing with this high rate of global balancing? I have a couple of patches to cut down runqueue locking to almost zero in interrupt paths, although I imagine the main problem you are having is pulling a cacheline off every remote CPU when calculating runqueue loads? --------------040108020208060701010304 Content-Type: text/x-patch; name="sched-consolidate-domains.patch" Content-Transfer-Encoding: 7bit Content-Disposition: inline; filename="sched-consolidate-domains.patch" --- linux-2.6-npiggin/arch/i386/kernel/smpboot.c | 207 ------------------- linux-2.6-npiggin/arch/ppc64/kernel/smp.c | 227 +-------------------- linux-2.6-npiggin/arch/x86_64/kernel/Makefile | 1 linux-2.6-npiggin/arch/x86_64/kernel/Makefile-HEAD | 1 linux-2.6-npiggin/include/asm-i386/processor.h | 5 linux-2.6-npiggin/include/asm-ppc64/processor.h | 5 linux-2.6-npiggin/include/asm-ppc64/smp.h | 3 linux-2.6-npiggin/include/asm-x86_64/processor.h | 5 linux-2.6-npiggin/include/linux/sched.h | 5 linux-2.6-npiggin/kernel/sched.c | 203 +++++++++++------- linux-2.6/arch/x86_64/kernel/domain.c | 93 -------- 11 files changed, 142 insertions(+), 613 deletions(-) diff -puN kernel/sched.c~sched-consolidate-domains kernel/sched.c --- linux-2.6/kernel/sched.c~sched-consolidate-domains 2004-07-16 15:55:58.000000000 +1000 +++ linux-2.6-npiggin/kernel/sched.c 2004-07-16 20:37:39.000000000 +1000 @@ -3674,118 +3674,159 @@ void cpu_attach_domain(struct sched_doma #ifdef ARCH_HAS_SCHED_DOMAIN extern void __init arch_init_sched_domains(void); #else -static struct sched_group sched_group_cpus[NR_CPUS]; + +#ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); +static struct sched_group sched_group_cpus[NR_CPUS]; +__init static int cpu_to_cpu_group(int cpu) +{ + return cpu; +} +#endif + +static DEFINE_PER_CPU(struct sched_domain, phys_domains); +static struct sched_group sched_group_phys[NR_CPUS]; +__init static int cpu_to_phys_group(int cpu) +{ + return first_cpu(cpu_sibling_map[cpu]); +} + #ifdef CONFIG_NUMA -static struct sched_group sched_group_nodes[MAX_NUMNODES]; static DEFINE_PER_CPU(struct sched_domain, node_domains); -static void __init arch_init_sched_domains(void) +static struct sched_group sched_group_nodes[MAX_NUMNODES]; +__init static int cpu_to_node_group(int cpu) { - int i; - struct sched_group *first_node = NULL, *last_node = NULL; + return cpu_to_node(cpu); +} +#endif - /* Set up domains */ - for_each_cpu(i) { - int node = cpu_to_node(i); - cpumask_t nodemask = node_to_cpumask(node); - struct sched_domain *node_sd = &per_cpu(node_domains, i); - struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); - - *node_sd = SD_NODE_INIT; - node_sd->span = cpu_possible_map; - node_sd->groups = &sched_group_nodes[cpu_to_node(i)]; - - *cpu_sd = SD_CPU_INIT; - cpus_and(cpu_sd->span, nodemask, cpu_possible_map); - cpu_sd->groups = &sched_group_cpus[i]; - cpu_sd->parent = node_sd; - } +__init static void init_sched_build_groups(struct sched_group groups[], + cpumask_t span, int (*group_fn)(int cpu)) +{ + struct sched_group *first = NULL, *last = NULL; + cpumask_t covered = CPU_MASK_NONE; + int i; - /* Set up groups */ - for (i = 0; i < MAX_NUMNODES; i++) { - cpumask_t tmp = node_to_cpumask(i); - cpumask_t nodemask; - struct sched_group *first_cpu = NULL, *last_cpu = NULL; - struct sched_group *node = &sched_group_nodes[i]; + for_each_cpu_mask(i, span) { + int group = group_fn(i); + struct sched_group *sg = &groups[group]; int j; - cpus_and(nodemask, tmp, cpu_possible_map); - - if (cpus_empty(nodemask)) + if (cpu_isset(i, covered)) continue; - node->cpumask = nodemask; - node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask); + sg->cpumask = CPU_MASK_NONE; + sg->cpu_power = 0; - for_each_cpu_mask(j, node->cpumask) { - struct sched_group *cpu = &sched_group_cpus[j]; + for_each_cpu_mask(j, span) { + if (group_fn(j) != group) + continue; - cpus_clear(cpu->cpumask); - cpu_set(j, cpu->cpumask); - cpu->cpu_power = SCHED_LOAD_SCALE; - - if (!first_cpu) - first_cpu = cpu; - if (last_cpu) - last_cpu->next = cpu; - last_cpu = cpu; + cpu_set(j, covered); + cpu_set(j, sg->cpumask); } - last_cpu->next = first_cpu; - - if (!first_node) - first_node = node; - if (last_node) - last_node->next = node; - last_node = node; - } - last_node->next = first_node; - - mb(); - for_each_cpu(i) { - struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); - cpu_attach_domain(cpu_sd, i); + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; } + last->next = first; } -#else /* !CONFIG_NUMA */ -static void __init arch_init_sched_domains(void) +__init void arch_init_sched_domains(void) { int i; - struct sched_group *first_cpu = NULL, *last_cpu = NULL; /* Set up domains */ for_each_cpu(i) { - struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); + int group; + struct sched_domain *sd = NULL, *p; + cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); - *cpu_sd = SD_CPU_INIT; - cpu_sd->span = cpu_possible_map; - cpu_sd->groups = &sched_group_cpus[i]; +#ifdef CONFIG_NUMA + sd = &per_cpu(node_domains, i); + group = cpu_to_node_group(i); + *sd = SD_NODE_INIT; + sd->span = cpu_possible_map; + sd->groups = &sched_group_nodes[group]; +#endif + + p = sd; + sd = &per_cpu(phys_domains, i); + group = cpu_to_phys_group(i); + *sd = SD_CPU_INIT; + sd->span = nodemask; + sd->parent = p; + sd->groups = &sched_group_phys[group]; + +#ifdef CONFIG_SCHED_SMT + p = sd; + sd = &per_cpu(cpu_domains, i); + group = cpu_to_cpu_group(i); + *sd = SD_SIBLING_INIT; + sd->span = cpu_sibling_map[i]; + sd->parent = p; + sd->groups = &sched_group_cpus[group]; +#endif + } + +#ifdef CONFIG_SCHED_SMT + /* Set up CPU (sibling) groups */ + for_each_cpu(i) { + if (i != first_cpu(cpu_sibling_map[i])) + continue; + + init_sched_build_groups(sched_group_cpus, cpu_sibling_map[i], + &cpu_to_cpu_group); + } +#endif + + /* Set up physical groups */ + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + + cpus_and(nodemask, nodemask, cpu_possible_map); + if (cpus_empty(nodemask)) + continue; + + init_sched_build_groups(sched_group_phys, nodemask, + &cpu_to_phys_group); } - /* Set up CPU groups */ - for_each_cpu_mask(i, cpu_possible_map) { - struct sched_group *cpu = &sched_group_cpus[i]; +#ifdef CONFIG_NUMA + /* Set up node groups */ + init_sched_build_groups(sched_group_nodes, cpu_possible_map, + &cpu_to_node_group); +#endif - cpus_clear(cpu->cpumask); - cpu_set(i, cpu->cpumask); - cpu->cpu_power = SCHED_LOAD_SCALE; + /* Calculate CPU power for physical packages and nodes */ + for_each_cpu(i) { + int power; + struct sched_domain *sd; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i); + power = SCHED_LOAD_SCALE; + sd->groups->cpu_power = power; +#endif + + sd = &per_cpu(phys_domains, i); + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; - if (!first_cpu) - first_cpu = cpu; - if (last_cpu) - last_cpu->next = cpu; - last_cpu = cpu; +#ifdef CONFIG_NUMA + sd = &per_cpu(node_domains, i); + sd->groups->cpu_power += power; +#endif } - last_cpu->next = first_cpu; - mb(); /* domains were modified outside the lock */ + /* Attach the domains */ for_each_cpu(i) { - struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); - cpu_attach_domain(cpu_sd, i); + struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); + cpu_attach_domain(cpu_domain, i); } } - -#endif /* CONFIG_NUMA */ #endif /* ARCH_HAS_SCHED_DOMAIN */ #define SCHED_DOMAIN_DEBUG diff -puN include/linux/sched.h~sched-consolidate-domains include/linux/sched.h --- linux-2.6/include/linux/sched.h~sched-consolidate-domains 2004-07-16 15:56:02.000000000 +1000 +++ linux-2.6-npiggin/include/linux/sched.h 2004-07-16 20:47:42.000000000 +1000 @@ -615,6 +615,9 @@ struct sched_domain { unsigned int nr_balance_failed; /* initialise to 0 */ }; +#ifndef ARCH_HAS_SCHED_TUNE +#ifdef CONFIG_SCHED_SMT +#define ARCH_HASH_SCHED_WAKE_IDLE /* Common values for SMT siblings */ #define SD_SIBLING_INIT (struct sched_domain) { \ .span = CPU_MASK_NONE, \ @@ -636,6 +639,7 @@ struct sched_domain { .balance_interval = 1, \ .nr_balance_failed = 0, \ } +#endif /* Common values for CPUs */ #define SD_CPU_INIT (struct sched_domain) { \ @@ -678,6 +682,7 @@ struct sched_domain { .nr_balance_failed = 0, \ } #endif +#endif /* __ARCH_HAS_SCHED_TUNE */ extern void cpu_attach_domain(struct sched_domain *sd, int cpu); diff -puN arch/i386/kernel/smpboot.c~sched-consolidate-domains arch/i386/kernel/smpboot.c --- linux-2.6/arch/i386/kernel/smpboot.c~sched-consolidate-domains 2004-07-16 15:56:15.000000000 +1000 +++ linux-2.6-npiggin/arch/i386/kernel/smpboot.c 2004-07-16 15:56:45.000000000 +1000 @@ -1136,213 +1136,6 @@ static void __init smp_boot_cpus(unsigne smp_tune_scheduling(); } -#ifdef CONFIG_SCHED_SMT -#ifdef CONFIG_NUMA -static struct sched_group sched_group_cpus[NR_CPUS]; -static struct sched_group sched_group_phys[NR_CPUS]; -static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static DEFINE_PER_CPU(struct sched_domain, node_domains); -__init void arch_init_sched_domains(void) -{ - int i; - struct sched_group *first = NULL, *last = NULL; - - /* Set up domains */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_domain *phys_domain = &per_cpu(phys_domains, i); - struct sched_domain *node_domain = &per_cpu(node_domains, i); - int node = cpu_to_node(i); - cpumask_t nodemask = node_to_cpumask(node); - - *cpu_domain = SD_SIBLING_INIT; - cpu_domain->span = cpu_sibling_map[i]; - cpu_domain->parent = phys_domain; - cpu_domain->groups = &sched_group_cpus[i]; - - *phys_domain = SD_CPU_INIT; - phys_domain->span = nodemask; - phys_domain->parent = node_domain; - phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; - - *node_domain = SD_NODE_INIT; - node_domain->span = cpu_possible_map; - node_domain->groups = &sched_group_nodes[cpu_to_node(i)]; - } - - /* Set up CPU (sibling) groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - int j; - first = last = NULL; - - if (i != first_cpu(cpu_domain->span)) - continue; - - for_each_cpu_mask(j, cpu_domain->span) { - struct sched_group *cpu = &sched_group_cpus[j]; - - cpu->cpumask = CPU_MASK_NONE; - cpu_set(j, cpu->cpumask); - cpu->cpu_power = SCHED_LOAD_SCALE; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - } - - for (i = 0; i < MAX_NUMNODES; i++) { - int j; - cpumask_t nodemask; - struct sched_group *node = &sched_group_nodes[i]; - cpumask_t node_cpumask = node_to_cpumask(i); - - cpus_and(nodemask, node_cpumask, cpu_possible_map); - - if (cpus_empty(nodemask)) - continue; - - first = last = NULL; - /* Set up physical groups */ - for_each_cpu_mask(j, nodemask) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j); - struct sched_group *cpu = &sched_group_phys[j]; - - if (j != first_cpu(cpu_domain->span)) - continue; - - cpu->cpumask = cpu_domain->span; - /* - * Make each extra sibling increase power by 10% of - * the basic CPU. This is very arbitrary. - */ - cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; - node->cpu_power += cpu->cpu_power; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - } - - /* Set up nodes */ - first = last = NULL; - for (i = 0; i < MAX_NUMNODES; i++) { - struct sched_group *cpu = &sched_group_nodes[i]; - cpumask_t nodemask; - cpumask_t node_cpumask = node_to_cpumask(i); - - cpus_and(nodemask, node_cpumask, cpu_possible_map); - - if (cpus_empty(nodemask)) - continue; - - cpu->cpumask = nodemask; - /* ->cpu_power already setup */ - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - - mb(); - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - cpu_attach_domain(cpu_domain, i); - } -} -#else /* !CONFIG_NUMA */ -static struct sched_group sched_group_cpus[NR_CPUS]; -static struct sched_group sched_group_phys[NR_CPUS]; -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -__init void arch_init_sched_domains(void) -{ - int i; - struct sched_group *first = NULL, *last = NULL; - - /* Set up domains */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_domain *phys_domain = &per_cpu(phys_domains, i); - - *cpu_domain = SD_SIBLING_INIT; - cpu_domain->span = cpu_sibling_map[i]; - cpu_domain->parent = phys_domain; - cpu_domain->groups = &sched_group_cpus[i]; - - *phys_domain = SD_CPU_INIT; - phys_domain->span = cpu_possible_map; - phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; - } - - /* Set up CPU (sibling) groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - int j; - first = last = NULL; - - if (i != first_cpu(cpu_domain->span)) - continue; - - for_each_cpu_mask(j, cpu_domain->span) { - struct sched_group *cpu = &sched_group_cpus[j]; - - cpus_clear(cpu->cpumask); - cpu_set(j, cpu->cpumask); - cpu->cpu_power = SCHED_LOAD_SCALE; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - } - - first = last = NULL; - /* Set up physical groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_group *cpu = &sched_group_phys[i]; - - if (i != first_cpu(cpu_domain->span)) - continue; - - cpu->cpumask = cpu_domain->span; - /* See SMT+NUMA setup for comment */ - cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - - mb(); - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - cpu_attach_domain(cpu_domain, i); - } -} -#endif /* CONFIG_NUMA */ -#endif /* CONFIG_SCHED_SMT */ - /* These are wrappers to interface to the new boot process. Someone who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ void __init smp_prepare_cpus(unsigned int max_cpus) diff -puN include/asm-ppc64/processor.h~sched-consolidate-domains include/asm-ppc64/processor.h --- linux-2.6/include/asm-ppc64/processor.h~sched-consolidate-domains 2004-07-16 19:53:13.000000000 +1000 +++ linux-2.6-npiggin/include/asm-ppc64/processor.h 2004-07-16 20:00:32.000000000 +1000 @@ -626,11 +626,6 @@ static inline void prefetchw(const void #define spin_lock_prefetch(x) prefetchw(x) -#ifdef CONFIG_SCHED_SMT -#define ARCH_HAS_SCHED_DOMAIN -#define ARCH_HAS_SCHED_WAKE_IDLE -#endif - #endif /* ASSEMBLY */ /* diff -puN include/asm-i386/processor.h~sched-consolidate-domains include/asm-i386/processor.h --- linux-2.6/include/asm-i386/processor.h~sched-consolidate-domains 2004-07-16 19:53:18.000000000 +1000 +++ linux-2.6-npiggin/include/asm-i386/processor.h 2004-07-16 20:02:24.000000000 +1000 @@ -649,9 +649,4 @@ extern void select_idle_routine(const st #define cache_line_size() (boot_cpu_data.x86_cache_alignment) -#ifdef CONFIG_SCHED_SMT -#define ARCH_HAS_SCHED_DOMAIN -#define ARCH_HAS_SCHED_WAKE_IDLE -#endif - #endif /* __ASM_I386_PROCESSOR_H */ diff -puN include/asm-x86_64/processor.h~sched-consolidate-domains include/asm-x86_64/processor.h --- linux-2.6/include/asm-x86_64/processor.h~sched-consolidate-domains 2004-07-16 19:53:22.000000000 +1000 +++ linux-2.6-npiggin/include/asm-x86_64/processor.h 2004-07-16 20:02:29.000000000 +1000 @@ -458,9 +458,4 @@ static inline void __mwait(unsigned long #define cache_line_size() (boot_cpu_data.x86_cache_alignment) -#ifdef CONFIG_SCHED_SMT -#define ARCH_HAS_SCHED_DOMAIN -#define ARCH_HAS_SCHED_WAKE_IDLE -#endif - #endif /* __ASM_X86_64_PROCESSOR_H */ diff -puN arch/ppc64/kernel/smp.c~sched-consolidate-domains arch/ppc64/kernel/smp.c --- linux-2.6/arch/ppc64/kernel/smp.c~sched-consolidate-domains 2004-07-16 19:56:33.000000000 +1000 +++ linux-2.6-npiggin/arch/ppc64/kernel/smp.c 2004-07-16 20:14:54.000000000 +1000 @@ -55,6 +55,9 @@ #include int smp_threads_ready; +#ifdef CONFIG_SCHED_SMT +cpumask_t cpu_sibling_map[NR_CPUS]; +#endif unsigned long cache_decay_ticks; cpumask_t cpu_possible_map = CPU_MASK_NONE; @@ -436,6 +439,15 @@ static inline void look_for_more_cpus(vo /* Make those cpus (which might appear later) possible too. */ for (i = 0; i < maxcpus; i++) cpu_set(i, cpu_possible_map); + +#ifdef CONFIG_SCHED_SMT + memset(cpu_sibling_map, 0, sizeof(cpu_sibling_map)); + for_each_cpu(i) { + cpu_set(i, cpu_sibling_map[i]); + if (cur_cpu_spec->cpu_features & CPU_FTR_SMT) + cpu_set(i^1, cpu_sibling_map[i]); + } +#endif } #else /* ... CONFIG_HOTPLUG_CPU */ static inline int __devinit smp_startup_cpu(unsigned int lcpu) @@ -990,218 +1002,3 @@ void __init smp_cpus_done(unsigned int m set_cpus_allowed(current, old_mask); } - -#ifdef CONFIG_SCHED_SMT -#ifdef CONFIG_NUMA -static struct sched_group sched_group_cpus[NR_CPUS]; -static struct sched_group sched_group_phys[NR_CPUS]; -static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -static DEFINE_PER_CPU(struct sched_domain, node_domains); -__init void arch_init_sched_domains(void) -{ - int i; - struct sched_group *first = NULL, *last = NULL; - - /* Set up domains */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_domain *phys_domain = &per_cpu(phys_domains, i); - struct sched_domain *node_domain = &per_cpu(node_domains, i); - int node = cpu_to_node(i); - cpumask_t nodemask = node_to_cpumask(node); - cpumask_t my_cpumask = cpumask_of_cpu(i); - cpumask_t sibling_cpumask = cpumask_of_cpu(i ^ 0x1); - - *cpu_domain = SD_SIBLING_INIT; - if (cur_cpu_spec->cpu_features & CPU_FTR_SMT) - cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask); - else - cpu_domain->span = my_cpumask; - cpu_domain->parent = phys_domain; - cpu_domain->groups = &sched_group_cpus[i]; - - *phys_domain = SD_CPU_INIT; - phys_domain->span = nodemask; - phys_domain->parent = node_domain; - phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; - - *node_domain = SD_NODE_INIT; - node_domain->span = cpu_possible_map; - node_domain->groups = &sched_group_nodes[node]; - } - - /* Set up CPU (sibling) groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - int j; - first = last = NULL; - - if (i != first_cpu(cpu_domain->span)) - continue; - - for_each_cpu_mask(j, cpu_domain->span) { - struct sched_group *cpu = &sched_group_cpus[j]; - - cpus_clear(cpu->cpumask); - cpu_set(j, cpu->cpumask); - cpu->cpu_power = SCHED_LOAD_SCALE; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - } - - for (i = 0; i < MAX_NUMNODES; i++) { - int j; - cpumask_t nodemask; - struct sched_group *node = &sched_group_nodes[i]; - cpumask_t node_cpumask = node_to_cpumask(i); - cpus_and(nodemask, node_cpumask, cpu_possible_map); - - if (cpus_empty(nodemask)) - continue; - - first = last = NULL; - /* Set up physical groups */ - for_each_cpu_mask(j, nodemask) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, j); - struct sched_group *cpu = &sched_group_phys[j]; - - if (j != first_cpu(cpu_domain->span)) - continue; - - cpu->cpumask = cpu_domain->span; - /* - * Make each extra sibling increase power by 10% of - * the basic CPU. This is very arbitrary. - */ - cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; - node->cpu_power += cpu->cpu_power; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - } - - /* Set up nodes */ - first = last = NULL; - for (i = 0; i < MAX_NUMNODES; i++) { - struct sched_group *cpu = &sched_group_nodes[i]; - cpumask_t nodemask; - cpumask_t node_cpumask = node_to_cpumask(i); - cpus_and(nodemask, node_cpumask, cpu_possible_map); - - if (cpus_empty(nodemask)) - continue; - - cpu->cpumask = nodemask; - /* ->cpu_power already setup */ - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - - mb(); - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - cpu_attach_domain(cpu_domain, i); - } -} -#else /* !CONFIG_NUMA */ -static struct sched_group sched_group_cpus[NR_CPUS]; -static struct sched_group sched_group_phys[NR_CPUS]; -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -__init void arch_init_sched_domains(void) -{ - int i; - struct sched_group *first = NULL, *last = NULL; - - /* Set up domains */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_domain *phys_domain = &per_cpu(phys_domains, i); - cpumask_t my_cpumask = cpumask_of_cpu(i); - cpumask_t sibling_cpumask = cpumask_of_cpu(i ^ 0x1); - - *cpu_domain = SD_SIBLING_INIT; - if (cur_cpu_spec->cpu_features & CPU_FTR_SMT) - cpus_or(cpu_domain->span, my_cpumask, sibling_cpumask); - else - cpu_domain->span = my_cpumask; - cpu_domain->parent = phys_domain; - cpu_domain->groups = &sched_group_cpus[i]; - - *phys_domain = SD_CPU_INIT; - phys_domain->span = cpu_possible_map; - phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; - } - - /* Set up CPU (sibling) groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - int j; - first = last = NULL; - - if (i != first_cpu(cpu_domain->span)) - continue; - - for_each_cpu_mask(j, cpu_domain->span) { - struct sched_group *cpu = &sched_group_cpus[j]; - - cpus_clear(cpu->cpumask); - cpu_set(j, cpu->cpumask); - cpu->cpu_power = SCHED_LOAD_SCALE; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - } - - first = last = NULL; - /* Set up physical groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_group *cpu = &sched_group_phys[i]; - - if (i != first_cpu(cpu_domain->span)) - continue; - - cpu->cpumask = cpu_domain->span; - /* See SMT+NUMA setup for comment */ - cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - - mb(); - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - cpu_attach_domain(cpu_domain, i); - } -} -#endif /* CONFIG_NUMA */ -#endif /* CONFIG_SCHED_SMT */ diff -L arch/x86_64/kernel/domain.c -puN arch/x86_64/kernel/domain.c~sched-consolidate-domains /dev/null --- linux-2.6/arch/x86_64/kernel/domain.c +++ /dev/null 2004-06-24 18:17:02.000000000 +1000 @@ -1,93 +0,0 @@ -#include -#include - -/* Don't do any NUMA setup on Opteron right now. They seem to be - better off with flat scheduling. This is just for SMT. */ - -#ifdef CONFIG_SCHED_SMT - -static struct sched_group sched_group_cpus[NR_CPUS]; -static struct sched_group sched_group_phys[NR_CPUS]; -static DEFINE_PER_CPU(struct sched_domain, cpu_domains); -static DEFINE_PER_CPU(struct sched_domain, phys_domains); -__init void arch_init_sched_domains(void) -{ - int i; - struct sched_group *first = NULL, *last = NULL; - - /* Set up domains */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_domain *phys_domain = &per_cpu(phys_domains, i); - - *cpu_domain = SD_SIBLING_INIT; - /* Disable SMT NICE for CMP */ - /* RED-PEN use a generic flag */ - if (cpu_data[i].x86_vendor == X86_VENDOR_AMD) - cpu_domain->flags &= ~SD_SHARE_CPUPOWER; - cpu_domain->span = cpu_sibling_map[i]; - cpu_domain->parent = phys_domain; - cpu_domain->groups = &sched_group_cpus[i]; - - *phys_domain = SD_CPU_INIT; - phys_domain->span = cpu_possible_map; - phys_domain->groups = &sched_group_phys[first_cpu(cpu_domain->span)]; - } - - /* Set up CPU (sibling) groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - int j; - first = last = NULL; - - if (i != first_cpu(cpu_domain->span)) - continue; - - for_each_cpu_mask(j, cpu_domain->span) { - struct sched_group *cpu = &sched_group_cpus[j]; - - cpus_clear(cpu->cpumask); - cpu_set(j, cpu->cpumask); - cpu->cpu_power = SCHED_LOAD_SCALE; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - } - - first = last = NULL; - /* Set up physical groups */ - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - struct sched_group *cpu = &sched_group_phys[i]; - - if (i != first_cpu(cpu_domain->span)) - continue; - - cpu->cpumask = cpu_domain->span; - /* - * Make each extra sibling increase power by 10% of - * the basic CPU. This is very arbitrary. - */ - cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10; - - if (!first) - first = cpu; - if (last) - last->next = cpu; - last = cpu; - } - last->next = first; - - mb(); - for_each_cpu(i) { - struct sched_domain *cpu_domain = &per_cpu(cpu_domains, i); - cpu_attach_domain(cpu_domain, i); - } -} - -#endif diff -puN arch/x86_64/kernel/Makefile~sched-consolidate-domains arch/x86_64/kernel/Makefile --- linux-2.6/arch/x86_64/kernel/Makefile~sched-consolidate-domains 2004-07-16 20:04:19.000000000 +1000 +++ linux-2.6-npiggin/arch/x86_64/kernel/Makefile 2004-07-16 20:04:31.000000000 +1000 @@ -25,7 +25,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_prin obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o obj-$(CONFIG_SWIOTLB) += swiotlb.o -obj-$(CONFIG_SCHED_SMT) += domain.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_KGDB) += kgdb_stub.o diff -puN arch/x86_64/kernel/Makefile-HEAD~sched-consolidate-domains arch/x86_64/kernel/Makefile-HEAD --- linux-2.6/arch/x86_64/kernel/Makefile-HEAD~sched-consolidate-domains 2004-07-16 20:04:24.000000000 +1000 +++ linux-2.6-npiggin/arch/x86_64/kernel/Makefile-HEAD 2004-07-16 20:04:34.000000000 +1000 @@ -25,7 +25,6 @@ obj-$(CONFIG_EARLY_PRINTK) += early_prin obj-$(CONFIG_GART_IOMMU) += pci-gart.o aperture.o obj-$(CONFIG_DUMMY_IOMMU) += pci-nommu.o pci-dma.o obj-$(CONFIG_SWIOTLB) += swiotlb.o -obj-$(CONFIG_SCHED_SMT) += domain.o obj-$(CONFIG_MODULES) += module.o diff -puN include/asm-ppc64/smp.h~sched-consolidate-domains include/asm-ppc64/smp.h --- linux-2.6/include/asm-ppc64/smp.h~sched-consolidate-domains 2004-07-16 20:06:13.000000000 +1000 +++ linux-2.6-npiggin/include/asm-ppc64/smp.h 2004-07-16 20:07:37.000000000 +1000 @@ -73,6 +73,9 @@ void smp_init_pSeries(void); extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); extern void cpu_die(void) __attribute__((noreturn)); +#ifdef CONFIG_SCHED_SMT +extern cpumask_t cpu_sibling_map[NR_CPUS]; +#endif #endif /* !(CONFIG_SMP) */ #define get_hard_smp_processor_id(CPU) (paca[(CPU)].hw_cpu_id) _ --------------040108020208060701010304--