Hello Jesse and all, There is currently an issue with sched domain initialisations on some Numa platforms: The current ia64 implementation provides a SD_NODES_PER_DOMAIN #define that is used to build a top-level domain when there are 2 levels of Numa in the platform. This value is different on some platforms: for example, a 2 modules * 4 nodes * 4 cpus platform should use SD_NODES_PER_DOMAIN = 4 instead of the current value 6. It is easy to provide SD_NODES_PER_DOMAIN as a config parameter or boot parameter. But, even with the correct value for the platform, there are side effects when the configuration has some asymmetry. For example, with SD_NODES_PER_DOMAIN=4: . on a 1 module of 4 nodes * 4 cpus where there is a missing cpu (then a 3 nodes * 4 cpus plus a 1 node * 3 cpus), sched_domain initialisation tries to build a top-level domain for the node that contains 3 cpus and we get an "ERROR: domain->cpu_power not set" error. . on a 2 modules * 4 nodes configuration with 1 node missing (then a 4-node module and a 3-node module), there is 1 node that is part of both node domains. An alternative is setting SD_NODES_PER_DOMAIN to the maximum number of nodes (thus losing the ability to have 2 levels of sched domains to take the Numa topology into account). Another alternative is using the node_distance() that comes from the SLIT to build the sched domains instead of using SD_NODES_PER_DOMAIN on the platform. The following patch sets SD_NODES_PER_DOMAIN as a config/boot parameter, and when the value is 0 uses node_distance() to build the sched domains. This patch allows configuring the sched domains based on the SLIT table on ia64 platforms. It should allow asymmetric configurations, such as different numbers of cpus per node or missing nodes, when a top-level domain is used. The current limitation is 2-level Numa. 
diff --exclude-from /home17/xb/proc/patch.exclude -Nurp linux-2.6.11-kgdbr/arch/ia64/Kconfig linux-2.6.11-kgdb/arch/ia64/Kconfig --- linux-2.6.11-kgdbr/arch/ia64/Kconfig 2005-03-02 08:38:26.000000000 +0100 +++ linux-2.6.11-kgdb/arch/ia64/Kconfig 2005-05-26 13:52:10.362718582 +0200 @@ -174,6 +174,18 @@ config NUMA Access). This option is for configuring high-end multiprocessor server systems. If in doubt, say N. +config SD_NODES_PER_DOMAIN + int "Number of nodes per base sched_domains" + default "6" + help + Number of nodes per base sched_domains. + + Should be 6 for SGI platforms. + Should be 0 for platforms that rely on SLIT table + to build the sched_domains (Eg: Bull Novascale) + This value can be provided at boot time using the + sd_nodes_per_domain boot parameter. + config VIRTUAL_MEM_MAP bool "Virtual mem map" default y if !IA64_HP_SIM diff --exclude-from /home17/xb/proc/patch.exclude -Nurp linux-2.6.11-kgdbr/arch/ia64/kernel/domain.c linux-2.6.11-kgdb/arch/ia64/kernel/domain.c --- linux-2.6.11-kgdbr/arch/ia64/kernel/domain.c 2005-03-02 08:38:33.000000000 +0100 +++ linux-2.6.11-kgdb/arch/ia64/kernel/domain.c 2005-05-26 16:11:49.299139378 +0200 @@ -14,20 +14,29 @@ #include #include -#define SD_NODES_PER_DOMAIN 6 - #ifdef CONFIG_NUMA + +static int numa_lvls = -1; +static int sd_nodes_per_domain = CONFIG_SD_NODES_PER_DOMAIN; + +static int __init set_sd_nodes_per_domain(char *str) +{ + get_option(&str, &sd_nodes_per_domain); + return 1; +} +__setup("sd_nodes_per_domain=", set_sd_nodes_per_domain); + /** * find_next_best_node - find the next node to include in a sched_domain * @node: node whose sched_domain we're building * @used_nodes: nodes already in the sched_domain - * + * @dist: distance to node * Find the next node to include in a given scheduling domain. Simply * finds the closest node not already in the @used_nodes map. * * Should use nodemask_t. 
*/ -static int __devinit find_next_best_node(int node, unsigned long *used_nodes) +static int __devinit find_next_best_node(int node, unsigned long *used_nodes, int *dist) { int i, n, val, min_val, best_node = 0; @@ -54,6 +63,7 @@ static int __devinit find_next_best_node } set_bit(best_node, used_nodes); + *dist = min_val; return best_node; } @@ -70,6 +80,7 @@ static cpumask_t __devinit sched_domain_ { int i; cpumask_t span, nodemask; + int dist_min = INT_MAX; DECLARE_BITMAP(used_nodes, MAX_NUMNODES); cpus_clear(span); @@ -79,8 +90,13 @@ static cpumask_t __devinit sched_domain_ cpus_or(span, span, nodemask); set_bit(node, used_nodes); - for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { - int next_node = find_next_best_node(node, used_nodes); + for (i = 1; i < sd_nodes_per_domain; i++) { + int dist; + int next_node = find_next_best_node(node, used_nodes, &dist); + if ((numa_lvls >= 0) && (dist > dist_min)) + /* keep only nearest nodes when building sched domains based on node distance */ + break; + dist_min = dist; nodemask = node_to_cpumask(next_node); cpus_or(span, span, nodemask); } @@ -132,6 +148,26 @@ static int __devinit cpu_to_allnodes_gro #endif /* + * returns number of numa levels based on node_distance() + */ + +static int find_numa_lvls(void) +{ + int i, j, dist[MAX_NUMNODES]={0}, numa_lvls=0; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (!nr_cpus_node(i)) + continue; + for (j = 0; j < MAX_NUMNODES; j++) + if (node_distance(0,i) == dist[j]) + break; + if (j == MAX_NUMNODES) + dist[numa_lvls++] = node_distance(0,i); + } + return numa_lvls - 1; +} + +/* * Set up scheduler domains and groups. Callers must hold the hotplug lock. 
*/ void __devinit arch_init_sched_domains(void) @@ -139,6 +175,19 @@ void __devinit arch_init_sched_domains(v int i; cpumask_t cpu_default_map; + if (sd_nodes_per_domain == 0) { + + /* sched domain configuration relies on node distances */ + + numa_lvls = find_numa_lvls(); + sd_nodes_per_domain = MAX_NUMNODES; + + /* Currently 2-level numa maximum support */ + + if (numa_lvls > 2) + BUG(); + } + /* * Setup mask for cpus without special case scheduling requirements. * For now this just excludes isolated cpus, but could be used to @@ -158,8 +207,8 @@ void __devinit arch_init_sched_domains(v cpus_and(nodemask, nodemask, cpu_default_map); #ifdef CONFIG_NUMA - if (num_online_cpus() - > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + if ((numa_lvls == 2) || (num_online_cpus() + > sd_nodes_per_domain*cpus_weight(nodemask))) { sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; sd->span = cpu_default_map; -- Sincères salutations.