All of lore.kernel.org
 help / color / mirror / Atom feed
From: Andreas Herrmann <andreas.herrmann3@amd.com>
To: Peter Zijlstra <peterz@infradead.org>, Ingo Molnar <mingo@elte.hu>
Cc: linux-kernel@vger.kernel.org
Subject: [PATCH 12/15] sched: Allow NODE domain to be parent of MC instead of CPU domain
Date: Thu, 20 Aug 2009 15:42:45 +0200	[thread overview]
Message-ID: <20090820134245.GA29327@alberich.amd.com> (raw)
In-Reply-To: <20090820131243.GO29327@alberich.amd.com>


The level of the NODE domain's child domain is provided in
s_data.numa_child_level. Several adaptations are then required when
creating the domain hierarchy.
If the NODE domain is the parent of the MC domain, we have to:
- limit the NODE domain's span in sched_domain_node_span() so that it does
  not exceed the corresponding topology_core_cpumask.
- fix the CPU domain span to cover the entire cpu_map
- fix the CPU domain sched groups to cover entire physical groups instead of
  covering a node (a node sched_group might be a proper subset of a CPU
  sched_group).
- use the correct child domain in init_numa_sched_groups_power() when
  calculating sched_group.__cpu_power in the NODE domain
- calculate the group_power of the NODE domain after that of its child domain

Note: As I have no idea when the ALLNODES domain is required,
      I assumed that an ALLNODES domain exists only if the NODE domain
      is the parent of the CPU domain.

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
---
 kernel/sched.c |  106 ++++++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 73 insertions(+), 33 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 464b6ba..b03701d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8161,7 +8161,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
  * should be one that prevents unnecessary balancing, but also spreads tasks
  * out optimally.
  */
-static void sched_domain_node_span(int node, struct cpumask *span)
+static void sched_domain_node_span(int node, struct cpumask *span,
+				   enum sched_domain_level child_level)
 {
 	nodemask_t used_nodes;
 	int i;
@@ -8177,6 +8178,10 @@ static void sched_domain_node_span(int node, struct cpumask *span)
 
 		cpumask_or(span, span, cpumask_of_node(next_node));
 	}
+
+	if (child_level == SD_LV_MC)
+		cpumask_and(span, span, topology_core_cpumask(
+			      cpumask_first(cpumask_of_node(node))));
 }
 #endif /* CONFIG_NUMA */
 
@@ -8201,6 +8206,7 @@ struct static_sched_domain {
 };
 
 struct s_data {
+	enum sched_domain_level numa_child_level;
 #ifdef CONFIG_NUMA
 	int			sd_allnodes;
 	cpumask_var_t		domainspan;
@@ -8354,7 +8360,8 @@ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
 	return group;
 }
 
-static void init_numa_sched_groups_power(struct sched_group *group_head)
+static void init_numa_sched_groups_power(struct sched_group *group_head,
+					 enum sched_domain_level child_level)
 {
 	struct sched_group *sg = group_head;
 	int j;
@@ -8365,7 +8372,11 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 		for_each_cpu(j, sched_group_cpus(sg)) {
 			struct sched_domain *sd;
 
-			sd = &per_cpu(phys_domains, j).sd;
+			if (child_level == SD_LV_CPU)
+				sd = &per_cpu(phys_domains, j).sd;
+			else /* SD_LV_MC */
+				sd = &per_cpu(core_domains, j).sd;
+
 			if (j != group_first_cpu(sd->groups)) {
 				/*
 				 * Only add "power" once for each
@@ -8394,7 +8405,7 @@ static int build_numa_sched_groups(struct s_data *d,
 		goto out;
 	}
 
-	sched_domain_node_span(num, d->domainspan);
+	sched_domain_node_span(num, d->domainspan, d->numa_child_level);
 	cpumask_and(d->domainspan, d->domainspan, cpu_map);
 
 	sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
@@ -8699,15 +8710,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 }
 
 static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
-	const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
+	const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+	struct sched_domain *parent, int i)
 {
-	struct sched_domain *sd = NULL;
+	struct sched_domain *sd = parent;
 #ifdef CONFIG_NUMA
-	struct sched_domain *parent;
-
 	d->sd_allnodes = 0;
-	if (cpumask_weight(cpu_map) >
-	    SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
+	if ((cpumask_weight(cpu_map) >
+	     SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) &&
+	    (d->numa_child_level == SD_LV_CPU)) {
 		sd = &per_cpu(allnodes_domains, i).sd;
 		SD_INIT(sd, ALLNODES);
 		set_domain_attribute(sd, attr);
@@ -8720,7 +8731,8 @@ static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
 	sd = &per_cpu(node_domains, i).sd;
 	SD_INIT(sd, NODE);
 	set_domain_attribute(sd, attr);
-	sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
+	sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd),
+			       d->numa_child_level);
 	sd->parent = parent;
 	if (parent)
 		parent->child = sd;
@@ -8737,10 +8749,12 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
 	sd = &per_cpu(phys_domains, i).sd;
 	SD_INIT(sd, CPU);
 	set_domain_attribute(sd, attr);
-	cpumask_copy(sched_domain_span(sd), d->nodemask);
 	sd->parent = parent;
-	if (parent)
+	if (parent) {
+		cpumask_copy(sched_domain_span(sd), d->nodemask);
 		parent->child = sd;
+	} else
+		cpumask_copy(sched_domain_span(sd), cpu_map);
 	cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
 	return sd;
 }
@@ -8831,11 +8845,18 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
 		break;
 #endif
 	case SD_LV_CPU: /* set up physical groups */
-		cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
-		if (!cpumask_empty(d->nodemask))
-			init_sched_build_groups(d->nodemask, cpu_map,
-						&cpu_to_phys_group,
-						d->send_covered, d->tmpmask);
+		if (d->numa_child_level == SD_LV_MC) {
+			init_sched_build_groups(cpu_map, cpu_map,
+                                                &cpu_to_phys_group,
+                                                d->send_covered, d->tmpmask);
+		} else {
+			cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
+			if (!cpumask_empty(d->nodemask))
+				init_sched_build_groups(d->nodemask, cpu_map,
+							&cpu_to_phys_group,
+							d->send_covered,
+							d->tmpmask);
+		}
 		break;
 #ifdef CONFIG_NUMA
 	case SD_LV_ALLNODES:
@@ -8859,9 +8880,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 	struct s_data d;
 	struct sched_domain *sd;
 	int i;
-#ifdef CONFIG_NUMA
-	d.sd_allnodes = 0;
-#endif
+
+	d.numa_child_level = SD_LV_NONE;
 
 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
 	if (alloc_state != sa_rootdomain)
@@ -8875,9 +8895,18 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
 			    cpu_map);
 
-		sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
-		sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
-		sd = __build_mn_sched_domain(&d, cpu_map, attr, sd, i);
+		if (d.numa_child_level == SD_LV_CPU) {
+			sd = __build_numa_sched_domains(&d, cpu_map, attr,
+							NULL, i);
+			sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+			sd = __build_mn_sched_domain(&d, cpu_map, attr, sd, i);
+		} else {
+			sd = __build_cpu_sched_domain(&d, cpu_map, attr,
+						      NULL, i);
+			sd = __build_mn_sched_domain(&d, cpu_map, attr, sd, i);
+			sd = __build_numa_sched_domains(&d, cpu_map, attr,
+							sd, i);
+		}
 		sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
 		sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
 	}
@@ -8915,6 +8944,15 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		init_sched_groups_power(i, sd);
 	}
 #endif
+
+#ifdef CONFIG_NUMA
+	if (d.numa_child_level == SD_LV_MC)
+		for (i = 0; i < nr_node_ids; i++)
+			init_numa_sched_groups_power(d.sched_group_nodes[i],
+						     d.numa_child_level);
+#endif
+
+
 #ifdef CONFIG_SCHED_MN
 	for_each_cpu(i, cpu_map) {
 		sd = &per_cpu(cpu_node_domains, i).sd;
@@ -8928,15 +8966,17 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 	}
 
 #ifdef CONFIG_NUMA
-	for (i = 0; i < nr_node_ids; i++)
-		init_numa_sched_groups_power(d.sched_group_nodes[i]);
-
-	if (d.sd_allnodes) {
-		struct sched_group *sg;
-
-		cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
-								d.tmpmask);
-		init_numa_sched_groups_power(sg);
+	if (d.numa_child_level == SD_LV_CPU) {
+		for (i = 0; i < nr_node_ids; i++)
+			init_numa_sched_groups_power(d.sched_group_nodes[i],
+						     d.numa_child_level);
+
+		if (d.sd_allnodes) {
+			struct sched_group *sg;
+			cpu_to_allnodes_group(cpumask_first(cpu_map),
+					      cpu_map, &sg, d.tmpmask);
+			init_numa_sched_groups_power(sg, d.numa_child_level);
+		}
 	}
 #endif
 
-- 
1.6.0.4




  parent reply	other threads:[~2009-08-20 13:43 UTC|newest]

Thread overview: 64+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-08-20 13:12 [RFC][PATCH 0/15] sched: Fix scheduling for multi-node processors Andreas Herrmann
2009-08-20 13:15 ` [PATCH 1/15] x86, sched: Add config option for multi-node CPU scheduling Andreas Herrmann
2009-08-21 13:50   ` Valdis.Kletnieks
2009-08-24  8:49     ` Andreas Herrmann
2009-08-20 13:34 ` [PATCH 2/15] sched, x86: Provide initializer for MN scheduling domain, define MN level Andreas Herrmann
2009-08-20 13:34 ` [PATCH 3/15] sched: Add cpumask to be used when building MN domain Andreas Herrmann
2009-08-20 13:35 ` [PATCH 4/15] sched: Define per CPU variables and cpu_to_group function for " Andreas Herrmann
2009-08-20 13:36 ` [PATCH 5/15] sched: Add function to build MN sched domain Andreas Herrmann
2009-08-20 13:37 ` [PATCH 6/15] sched: Add support for MN domain in build_sched_groups Andreas Herrmann
2009-08-20 13:38 ` [PATCH 7/15] sched: Activate build of MN domains Andreas Herrmann
2009-08-20 13:39 ` [PATCH 8/15] sched: Add parameter sched_mn_power_savings to control MN domain sched policy Andreas Herrmann
2009-08-24 14:56   ` Peter Zijlstra
2009-08-24 15:32     ` Vaidyanathan Srinivasan
2009-08-24 15:45       ` Peter Zijlstra
2009-08-25  7:52         ` Andreas Herrmann
2009-08-25  7:50       ` Andreas Herrmann
2009-08-25  6:24     ` Andreas Herrmann
2009-08-25  6:41       ` Peter Zijlstra
2009-08-25  8:38         ` Andreas Herrmann
2009-08-26  9:30   ` Gautham R Shenoy
2009-08-27 12:47     ` Andreas Herrmann
2009-08-20 13:40 ` [PATCH 9/15] sched: Check sched_mn_power_savings when setting flags for CPU and MN domains Andreas Herrmann
2009-08-24 14:57   ` Peter Zijlstra
2009-08-25  9:34     ` Gautham R Shenoy
2009-08-26 10:01   ` Gautham R Shenoy
2009-08-20 13:41 ` [PATCH 10/15] sched: Check for sched_mn_power_savings when doing load balancing Andreas Herrmann
2009-08-24 15:03   ` Peter Zijlstra
2009-08-24 15:40     ` Vaidyanathan Srinivasan
2009-08-25  8:00       ` Andreas Herrmann
2009-08-20 13:41 ` [PATCH 11/15] sched: Pass unlimited __cpu_power information to upper domain level groups Andreas Herrmann
2009-08-24 15:21   ` Peter Zijlstra
2009-08-24 16:44     ` Balbir Singh
2009-08-24 17:26       ` Peter Zijlstra
2009-08-24 18:19         ` Balbir Singh
2009-08-25  7:11           ` Peter Zijlstra
2009-08-25  8:04             ` Balbir Singh
2009-08-25  8:30               ` Peter Zijlstra
2009-08-25  8:51     ` Andreas Herrmann
2009-08-20 13:42 ` Andreas Herrmann [this message]
2009-08-24 15:32   ` [PATCH 12/15] sched: Allow NODE domain to be parent of MC instead of CPU domain Peter Zijlstra
2009-08-25  8:55     ` Andreas Herrmann
2009-08-20 13:43 ` [PATCH 13/15] sched: Detect child domain of NUMA (aka NODE) domain Andreas Herrmann
2009-08-24 15:34   ` Peter Zijlstra
2009-08-25  9:13     ` Andreas Herrmann
2009-08-20 13:45 ` [PATCH 14/15] sched: Conditionally limit __cpu_power when child sched domain has type NODE Andreas Herrmann
2009-08-24 15:35   ` Peter Zijlstra
2009-08-25  9:19     ` Andreas Herrmann
2009-08-20 13:46 ` [PATCH 15/15] x86: Fix cpu_coregroup_mask to return correct cpumask on multi-node processors Andreas Herrmann
2009-08-24 15:36   ` Peter Zijlstra
2009-08-24 18:21     ` Ingo Molnar
2009-08-25 10:13       ` Andreas Herrmann
2009-08-25 10:36         ` Ingo Molnar
2009-08-27 13:18           ` Andreas Herrmann
2009-08-25  9:31     ` Andreas Herrmann
2009-08-25  9:55       ` Peter Zijlstra
2009-08-25 10:20         ` Ingo Molnar
2009-08-25 10:24         ` Andreas Herrmann
2009-08-25 10:28           ` Ingo Molnar
2009-08-25 10:35           ` Peter Zijlstra
2009-08-27 15:42             ` Andreas Herrmann
2009-08-27 15:25         ` Andreas Herrmann
2009-08-28 10:39           ` Peter Zijlstra
2009-08-28 12:03             ` Andreas Herrmann
2009-08-28 12:50               ` Peter Zijlstra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090820134245.GA29327@alberich.amd.com \
    --to=andreas.herrmann3@amd.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mingo@elte.hu \
    --cc=peterz@infradead.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.