All of lore.kernel.org
 help / color / mirror / Atom feed
From: Xavier Bru <xavier.bru@bull.net>
To: linux-ia64@vger.kernel.org
Subject: ia64 sched-domains initialisation
Date: Fri, 27 May 2005 15:57:53 +0000	[thread overview]
Message-ID: <42974381.9080000@bull.net> (raw)

[-- Attachment #1: Type: text/plain, Size: 7180 bytes --]

Hello Jesse and all,

There is currently an issue with sched domain initialisation on some 
NUMA platforms:

Current ia64 implementation provides a SD_NODES_PER_DOMAIN #define that 
is used to build a top level domain when there are 2 levels of Numa in 
the platform.
This value is different on some platforms: for example a 2 modules * 4 
nodes * 4 cpus platform should use SD_NODES_PER_DOMAIN = 4 instead of 
the current value 6.
It is easy to provide the SD_NODES_PER_DOMAIN as a config parameter or 
boot parameter.
But, even with the correct value for the platform, there are side 
effects when the configuration has some asymmetry.
For example: with SD_NODES_PER_DOMAIN=4:
    .  on a 1 module of 4 nodes * 4 cpus where there is a missing cpu 
(then a 3 nodes * 4 cpus plus a 1 node * 3 cpus), sched_domain 
initialisation tries to build a top-level domain for the node that 
contains 3 cpus and we get an "ERROR: domain->cpu_power not set" error.
    . on a 2 modules * 4 nodes configuration with 1 node missing (then a 
4 * nodes module and a 3 * nodes module), there is 1 node that is part 
of both node domains.
An alternative is setting SD_NODES_PER_DOMAIN to the maximum number of 
nodes (thus losing the ability to have 2 levels of sched domains to 
take into account the NUMA topology).
Another alternative is using the node_distance() that comes from the 
SLIT to build the sched domains instead of using
SD_NODES_PER_DOMAIN on the platform.
The following patch sets SD_NODES_PER_DOMAIN as a config/boot parameter, 
and when the value is 0 uses the node_distance to build the sched domains.

This patch allows configuring the sched-domains based on the SLIT table 
on ia64 platforms.
It should allow asymmetric configurations, such as different 
numbers of cpus per node or missing nodes,
when a top level domain is used.
The current limitation is 2-level NUMA.

diff --exclude-from /home17/xb/proc/patch.exclude -Nurp 
linux-2.6.11-kgdbr/arch/ia64/Kconfig linux-2.6.11-kgdb/arch/ia64/Kconfig
--- linux-2.6.11-kgdbr/arch/ia64/Kconfig    2005-03-02 
08:38:26.000000000 +0100
+++ linux-2.6.11-kgdb/arch/ia64/Kconfig    2005-05-26 13:52:10.362718582 
+0200
@@ -174,6 +174,18 @@ config NUMA
       Access).  This option is for configuring high-end multiprocessor
       server systems.  If in doubt, say N.
 
+config SD_NODES_PER_DOMAIN
+    int "Number of nodes per base sched_domains"
+    default "6"
+    help
+      Number of nodes per base sched_domains.
+
+      Should be 6 for SGI platforms.
+      Should be 0 for platforms that rely on SLIT table
+      to build the sched_domains (Eg: Bull Novascale)
+      This value can be provided at boot time using the
+      sd_nodes_per_domain boot parameter.
+       
 config VIRTUAL_MEM_MAP
     bool "Virtual mem map"
     default y if !IA64_HP_SIM
diff --exclude-from /home17/xb/proc/patch.exclude -Nurp 
linux-2.6.11-kgdbr/arch/ia64/kernel/domain.c 
linux-2.6.11-kgdb/arch/ia64/kernel/domain.c
--- linux-2.6.11-kgdbr/arch/ia64/kernel/domain.c    2005-03-02 
08:38:33.000000000 +0100
+++ linux-2.6.11-kgdb/arch/ia64/kernel/domain.c    2005-05-26 
16:11:49.299139378 +0200
@@ -14,20 +14,29 @@
 #include <linux/topology.h>
 #include <linux/nodemask.h>
 
-#define SD_NODES_PER_DOMAIN 6
-
 #ifdef CONFIG_NUMA
+
+static int numa_lvls = -1;
+static int sd_nodes_per_domain = CONFIG_SD_NODES_PER_DOMAIN;
+
+static int __init set_sd_nodes_per_domain(char *str)
+{
+    get_option(&str, &sd_nodes_per_domain);
+    return 1;
+}
+__setup("sd_nodes_per_domain=", set_sd_nodes_per_domain);
+
 /**
  * find_next_best_node - find the next node to include in a sched_domain
  * @node: node whose sched_domain we're building
  * @used_nodes: nodes already in the sched_domain
- *
+ * @dist: distance to node
  * Find the next node to include in a given scheduling domain.  Simply
  * finds the closest node not already in the @used_nodes map.
  *
  * Should use nodemask_t.
  */
-static int __devinit find_next_best_node(int node, unsigned long 
*used_nodes)
+static int __devinit find_next_best_node(int node, unsigned long 
*used_nodes, int *dist)
 {
     int i, n, val, min_val, best_node = 0;
 
@@ -54,6 +63,7 @@ static int __devinit find_next_best_node
     }
 
     set_bit(best_node, used_nodes);
+    *dist = min_val;
     return best_node;
 }
 
@@ -70,6 +80,7 @@ static cpumask_t __devinit sched_domain_
 {
     int i;
     cpumask_t span, nodemask;
+    int dist_min = INT_MAX;
     DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
 
     cpus_clear(span);
@@ -79,8 +90,13 @@ static cpumask_t __devinit sched_domain_
     cpus_or(span, span, nodemask);
     set_bit(node, used_nodes);
 
-    for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
-        int next_node = find_next_best_node(node, used_nodes);
+    for (i = 1; i < sd_nodes_per_domain; i++) {
+        int dist;
+        int next_node = find_next_best_node(node, used_nodes, &dist);
+        if ((numa_lvls >= 0) && (dist > dist_min))
+            /* keep only nearest nodes when building sched domains 
based on node distance */
+            break;
+        dist_min = dist;
         nodemask = node_to_cpumask(next_node);
         cpus_or(span, span, nodemask);
     }
@@ -132,6 +148,26 @@ static int __devinit cpu_to_allnodes_gro
 #endif
 
 /*
+ *    returns number of numa levels based on node_distance()
+ */
+
+static int find_numa_lvls(void)
+{
+    int i, j, dist[MAX_NUMNODES]={0}, numa_lvls=0;
+   
+    for (i = 0; i < MAX_NUMNODES; i++) {
+        if (!nr_cpus_node(i))
+            continue;
+        for (j = 0; j < MAX_NUMNODES; j++)
+            if (node_distance(0,i) == dist[j])
+                break;
+        if (j == MAX_NUMNODES)
+            dist[numa_lvls++] = node_distance(0,i);
+    }
+    return numa_lvls - 1;
+}
+
+/*
  * Set up scheduler domains and groups.  Callers must hold the hotplug 
lock.
  */
 void __devinit arch_init_sched_domains(void)
@@ -139,6 +175,19 @@ void __devinit arch_init_sched_domains(v
     int i;
     cpumask_t cpu_default_map;
 
+    if (sd_nodes_per_domain == 0) {
+   
+        /* sched domain configuration relies on node distances */
+
+        numa_lvls = find_numa_lvls();
+        sd_nodes_per_domain = MAX_NUMNODES;
+
+        /* Currently 2-level numa maximum support */
+
+        if (numa_lvls > 2)
+            BUG();
+    }
+
     /*
      * Setup mask for cpus without special case scheduling requirements.
      * For now this just excludes isolated cpus, but could be used to
@@ -158,8 +207,8 @@ void __devinit arch_init_sched_domains(v
         cpus_and(nodemask, nodemask, cpu_default_map);
 
 #ifdef CONFIG_NUMA
-        if (num_online_cpus()
-                > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
+         if ((numa_lvls == 2) || (num_online_cpus()
+                 > sd_nodes_per_domain*cpus_weight(nodemask))) {
             sd = &per_cpu(allnodes_domains, i);
             *sd = SD_ALLNODES_INIT;
             sd->span = cpu_default_map;

-- 

	Sincères salutations.


[-- Attachment #2: xavier.bru.vcf --]
[-- Type: text/x-vcard, Size: 306 bytes --]

begin:vcard
fn:Xavier Bru
n:Bru;Xavier
adr:;;1 rue de Provence, BP 208;Echirolles;;38432 Cedex;France
email;internet:Xavier.Bru@bull.net
title:BULL/DT/Open Software/linux/ia64
tel;work:+33 (0)4 76 29 77 45
tel;fax:+33 (0)4 76 29 77 70
x-mozilla-html:TRUE
url:http://www-frec.bull.fr
version:2.1
end:vcard


             reply	other threads:[~2005-05-27 15:57 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-05-27 15:57 Xavier Bru [this message]
2005-05-27 16:13 ` ia64 sched-domains initialisation Jesse Barnes
2005-05-29  5:49 ` Nick Piggin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=42974381.9080000@bull.net \
    --to=xavier.bru@bull.net \
    --cc=linux-ia64@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.