From: Jesse Barnes <jbarnes@engr.sgi.com>
To: linux-kernel@vger.kernel.org, linux-ia64@vger.kernel.org,
Nick Piggin <nickpiggin@yahoo.com.au>
Cc: John Hawkes <hawkes@sgi.com>
Subject: [PATCH] add scheduler domains for ia64
Date: Fri, 13 Aug 2004 18:08:40 +0000 [thread overview]
Message-ID: <200408131108.40502.jbarnes@engr.sgi.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 417 bytes --]
Nick, how does this look? It adds scheduler domain code for ia64 and replaces
the patch in Andrew's tree. It also adds SD_NODE_INIT macros to each arch
that has ARCH_HAS_SCHED_DOMAIN so that the balance values are more easily
tweaked. Since the cpu span of the nodes on ia64 is smaller than the whole
system, I also removed a WARN_ON in active_load_balance, but I'm not sure if
that's correct.
Thanks,
Jesse
[-- Attachment #2: sched-domains-ia64.patch --]
[-- Type: text/plain, Size: 9474 bytes --]
===== arch/ia64/kernel/smpboot.c 1.56 vs edited =====
--- 1.56/arch/ia64/kernel/smpboot.c 2004-08-04 10:50:16 -07:00
+++ edited/arch/ia64/kernel/smpboot.c 2004-08-13 11:03:29 -07:00
@@ -719,3 +719,182 @@
printk(KERN_ERR "SMP: Can't set SAL AP Boot Rendezvous: %s\n",
ia64_sal_strerror(sal_ret));
}
+
+#ifdef CONFIG_NUMA
+
+/**
+ * find_next_best_node - find the next node to include in a sched_domain
+ * @node: node whose sched_domain we're building
+ * @used_nodes: bitmap of nodes already in the sched_domain; updated here
+ *
+ * Picks the node closest to @node (by node_distance()) that is not yet set
+ * in @used_nodes, marks it used and returns it. Returns 0 if all are used.
+ *
+ * Should use nodemask_t.
+ */
+static int __init find_next_best_node(int node, unsigned long *used_nodes)
+{
+ int i, n, val, min_val, best_node = 0;
+
+ min_val = INT_MAX;
+
+ for (i = 0; i < numnodes; i++) {
+ /* Start at @node */
+ n = (node + i) % numnodes;
+
+ /* Skip already used nodes */
+ if (test_bit(n, used_nodes))
+ continue;
+
+ /* Simple min distance search; measure to candidate n, not offset i */
+ val = node_distance(node, n);
+
+ if (val < min_val) {
+ min_val = val;
+ best_node = n;
+ }
+ }
+
+ set_bit(best_node, used_nodes);
+ return best_node;
+}
+
+/**
+ * sched_domain_node_span - build the cpumask spanned by a node's sched_domain
+ * @node: node the span is centred on
+ * @size: how many nodes to fold into the span
+ *
+ * Calls find_next_best_node() @size times, starting from an empty set of
+ * used nodes, and ORs the cpumask of each node it returns into the result.
+ * Returns the accumulated cpumask by value.
+ */
+cpumask_t __init sched_domain_node_span(int node, int size)
+{
+ DECLARE_BITMAP(picked, MAX_NUMNODES);
+ cpumask_t mask;
+ int remaining;
+
+ bitmap_zero(picked, MAX_NUMNODES);
+ cpus_clear(mask);
+
+ for (remaining = size; remaining > 0; remaining--) {
+ int nearest = find_next_best_node(node, picked);
+ cpus_or(mask, mask, node_to_cpumask(nearest));
+ }
+
+ return mask;
+}
+
+static struct sched_group sched_group_cpus[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+
+/* Number of nearby nodes in a node's scheduling domain */
+#define SD_NODES_PER_DOMAIN 4
+
+static struct sched_group sched_group_nodes[MAX_NUMNODES];
+static DEFINE_PER_CPU(struct sched_domain, node_domains);
+/* Give every CPU a per-node CPU domain whose parent is a node-level domain. */
+void __init arch_init_sched_domains(void)
+{
+ int i;
+ struct sched_group *first_node = NULL, *last_node = NULL;
+
+ /* Set up domains */
+ for_each_cpu(i) {
+ int node = cpu_to_node(i);
+ cpumask_t nodemask = node_to_cpumask(node);
+ struct sched_domain *node_sd = &per_cpu(node_domains, i);
+ struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
+ /* Node level: the span is keyed by this CPU's node, not its CPU id */
+ *node_sd = SD_NODE_INIT;
+ node_sd->span = sched_domain_node_span(node, SD_NODES_PER_DOMAIN);
+ node_sd->groups = &sched_group_nodes[node];
+ /* CPU level: the possible CPUs of this node, parented to node_sd */
+ *cpu_sd = SD_CPU_INIT;
+ cpus_and(cpu_sd->span, nodemask, cpu_possible_map);
+ cpu_sd->groups = &sched_group_cpus[i];
+ cpu_sd->parent = node_sd;
+ }
+
+ /* Set up groups: a ring of node groups plus a ring of CPU groups per node */
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ cpumask_t tmp = node_to_cpumask(i);
+ cpumask_t nodemask;
+ struct sched_group *first_cpu = NULL, *last_cpu = NULL;
+ struct sched_group *node = &sched_group_nodes[i];
+ int j;
+
+ cpus_and(nodemask, tmp, cpu_possible_map);
+ if (cpus_empty(nodemask)) /* node has no possible CPUs: no group */
+ continue;
+
+ node->cpumask = nodemask;
+ node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask);
+
+ for_each_cpu_mask(j, node->cpumask) {
+ struct sched_group *cpu = &sched_group_cpus[j];
+
+ cpus_clear(cpu->cpumask);
+ cpu_set(j, cpu->cpumask);
+ cpu->cpu_power = SCHED_LOAD_SCALE;
+
+ if (!first_cpu)
+ first_cpu = cpu;
+ if (last_cpu)
+ last_cpu->next = cpu;
+ last_cpu = cpu;
+ }
+ last_cpu->next = first_cpu; /* close this node's CPU ring */
+
+ if (!first_node)
+ first_node = node;
+ if (last_node)
+ last_node->next = node;
+ last_node = node;
+ }
+ last_node->next = first_node; /* close the node ring */
+
+ mb(); /* domains were modified outside the lock */
+ for_each_cpu(i) {
+ struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
+ cpu_attach_domain(cpu_sd, i);
+ }
+}
+#else /* !CONFIG_NUMA */
+/* Own copies: the definitions above are compiled only under CONFIG_NUMA. */
+static struct sched_group sched_group_cpus[NR_CPUS];
+static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
+static void __init arch_init_sched_domains(void) /* NOTE(review): static, unlike the NUMA version - confirm */
+{
+ int i;
+ struct sched_group *first_cpu = NULL, *last_cpu = NULL;
+
+ /* Set up domains: each CPU's domain spans every possible CPU */
+ for_each_cpu(i) {
+ struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
+
+ *cpu_sd = SD_CPU_INIT;
+ cpu_sd->span = cpu_possible_map;
+ cpu_sd->groups = &sched_group_cpus[i];
+ }
+ /* Set up CPU groups: one single-CPU group each, linked into a ring */
+ for_each_cpu_mask(i, cpu_possible_map) {
+ struct sched_group *cpu = &sched_group_cpus[i];
+
+ cpus_clear(cpu->cpumask);
+ cpu_set(i, cpu->cpumask);
+ cpu->cpu_power = SCHED_LOAD_SCALE;
+ if (!first_cpu)
+ first_cpu = cpu;
+ if (last_cpu)
+ last_cpu->next = cpu;
+ last_cpu = cpu;
+ }
+ last_cpu->next = first_cpu; /* close the ring */
+ mb(); /* domains were modified outside the lock */
+ for_each_cpu(i) {
+ struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
+ cpu_attach_domain(cpu_sd, i);
+ }
+}
+#endif /* CONFIG_NUMA */
===== include/asm-i386/processor.h 1.67 vs edited =====
--- 1.67/include/asm-i386/processor.h 2004-06-27 00:19:26 -07:00
+++ edited/include/asm-i386/processor.h 2004-08-13 10:37:06 -07:00
@@ -647,6 +647,24 @@
#ifdef CONFIG_SCHED_SMT
#define ARCH_HAS_SCHED_DOMAIN
+#define SD_NODE_INIT (struct sched_domain) { /* arch-local NUMA node-domain defaults, kept here so they can be tuned */ \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 8, \
+ .max_interval = 32, \
+ .busy_factor = 32, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (10*1000000), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_BALANCE_EXEC \
+ | SD_BALANCE_CLONE \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, /* read where the macro is expanded */ \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
#define ARCH_HAS_SCHED_WAKE_IDLE
#endif
===== include/asm-ia64/processor.h 1.61 vs edited =====
--- 1.61/include/asm-ia64/processor.h 2004-07-26 22:26:50 -07:00
+++ edited/include/asm-ia64/processor.h 2004-08-13 10:08:03 -07:00
@@ -334,6 +334,29 @@
/* Prepare to copy thread state - unlazy all lazy status */
#define prepare_to_copy(tsk) do { } while (0)
+#ifdef CONFIG_NUMA
+/* smpboot.c defines a numa specific scheduler domain routine */
+#define ARCH_HAS_SCHED_DOMAIN
+#define SD_NODE_INIT (struct sched_domain) { /* ia64 NUMA node-domain defaults used by smpboot.c */ \
+ .span = CPU_MASK_NONE, /* overwritten by arch_init_sched_domains() */ \
+ .parent = NULL, \
+ .groups = NULL, /* overwritten by arch_init_sched_domains() */ \
+ .min_interval = 80, /* 10x the 8 used by the other arch copies in this patch */ \
+ .max_interval = 320, /* 10x the other arch copies' 32 */ \
+ .busy_factor = 320, /* 10x the other arch copies' 32 */ \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (10*1000000), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_BALANCE_EXEC \
+ | SD_BALANCE_CLONE \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, /* read where the macro is expanded */ \
+ .balance_interval = 10, /* 10x the other arch copies' 1 */ \
+ .nr_balance_failed = 0, \
+}
+#endif
+
/*
* This is the mechanism for creating a new kernel thread.
*
===== include/asm-ppc64/processor.h 1.48 vs edited =====
--- 1.48/include/asm-ppc64/processor.h 2004-07-26 15:13:12 -07:00
+++ edited/include/asm-ppc64/processor.h 2004-08-13 10:37:19 -07:00
@@ -628,6 +628,24 @@
#ifdef CONFIG_SCHED_SMT
#define ARCH_HAS_SCHED_DOMAIN
+#define SD_NODE_INIT (struct sched_domain) { /* arch-local NUMA node-domain defaults, kept here so they can be tuned */ \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 8, \
+ .max_interval = 32, \
+ .busy_factor = 32, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (10*1000000), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_BALANCE_EXEC \
+ | SD_BALANCE_CLONE \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, /* read where the macro is expanded */ \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
#define ARCH_HAS_SCHED_WAKE_IDLE
#endif
===== include/asm-x86_64/processor.h 1.36 vs edited =====
--- 1.36/include/asm-x86_64/processor.h 2004-06-27 00:19:26 -07:00
+++ edited/include/asm-x86_64/processor.h 2004-08-13 10:37:36 -07:00
@@ -458,6 +458,24 @@
#ifdef CONFIG_SCHED_SMT
#define ARCH_HAS_SCHED_DOMAIN
+#define SD_NODE_INIT (struct sched_domain) { /* arch-local NUMA node-domain defaults, kept here so they can be tuned */ \
+ .span = CPU_MASK_NONE, \
+ .parent = NULL, \
+ .groups = NULL, \
+ .min_interval = 8, \
+ .max_interval = 32, \
+ .busy_factor = 32, \
+ .imbalance_pct = 125, \
+ .cache_hot_time = (10*1000000), \
+ .cache_nice_tries = 1, \
+ .per_cpu_gain = 100, \
+ .flags = SD_BALANCE_EXEC \
+ | SD_BALANCE_CLONE \
+ | SD_WAKE_BALANCE, \
+ .last_balance = jiffies, /* read where the macro is expanded */ \
+ .balance_interval = 1, \
+ .nr_balance_failed = 0, \
+}
#define ARCH_HAS_SCHED_WAKE_IDLE
#endif
===== include/linux/sched.h 1.228 vs edited =====
--- 1.228/include/linux/sched.h 2004-07-28 21:58:54 -07:00
+++ edited/include/linux/sched.h 2004-08-13 10:06:05 -07:00
@@ -17,6 +17,7 @@
#include <asm/system.h>
#include <asm/semaphore.h>
#include <asm/page.h>
+#include <asm/processor.h>
#include <asm/ptrace.h>
#include <asm/mmu.h>
@@ -654,6 +655,7 @@
}
#ifdef CONFIG_NUMA
+#ifndef ARCH_HAS_SCHED_DOMAIN
/* Common values for NUMA nodes */
#define SD_NODE_INIT (struct sched_domain) { \
.span = CPU_MASK_NONE, \
@@ -673,6 +675,7 @@
.balance_interval = 1, \
.nr_balance_failed = 0, \
}
+#endif
#endif
extern void cpu_attach_domain(struct sched_domain *sd, int cpu);
===== kernel/sched.c 1.319 vs edited =====
--- 1.319/kernel/sched.c 2004-08-02 01:00:40 -07:00
+++ edited/kernel/sched.c 2004-08-13 10:59:53 -07:00
@@ -1826,10 +1826,8 @@
for_each_domain(busiest_cpu, sd)
if (cpu_isset(busiest->push_cpu, sd->span))
break;
- if (!sd) {
- WARN_ON(1);
+ if (!sd)
return;
- }
group = sd->groups;
while (!cpu_isset(busiest_cpu, group->cpumask))
next reply other threads:[~2004-08-13 18:08 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
2004-08-13 18:08 Jesse Barnes [this message]
2004-08-14 3:23 ` [PATCH] add scheduler domains for ia64 Nick Piggin
2004-08-14 20:52 ` Jesse Barnes
2004-08-15 0:54 ` Nick Piggin
2004-08-17 20:57 ` Jesse Barnes
2004-08-20 2:11 ` Nick Piggin
2004-08-20 2:22 ` Jesse Barnes
2004-08-20 6:28 ` Andrew Morton
2004-08-20 14:57 ` Jesse Barnes
2004-08-20 8:06 ` Ingo Molnar
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=200408131108.40502.jbarnes@engr.sgi.com \
--to=jbarnes@engr.sgi.com \
--cc=hawkes@sgi.com \
--cc=linux-ia64@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=nickpiggin@yahoo.com.au \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox