public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH] 2.6.3-rc3-mm1: sched-group-power
@ 2004-02-19  9:24 Nick Piggin
  2004-02-20  1:17 ` Rick Lindsley
  0 siblings, 1 reply; 5+ messages in thread
From: Nick Piggin @ 2004-02-19  9:24 UTC (permalink / raw)
  To: Andrew Morton
  Cc: LSE, Nakajima, Jun, Rick Lindsley, Anton Blanchard, Rick Lindsley,
	linux-kernel

[-- Attachment #1: Type: text/plain, Size: 435 bytes --]

The following patch implements a cpu_power member to
struct sched_group.

This allows special casing to be removed for SMT groups
in the balancing code. It does not take CPU hotplug into
account yet, but that shouldn't be too hard.

I have tested it on the NUMAQ by pretending it has SMT.
Works as expected. Active balances across nodes.

Andrew, please apply. I suppose you'd better not send
the scheduler changes to Linus quite yet.

[-- Attachment #2: sched-group-power.patch --]
[-- Type: text/plain, Size: 10495 bytes --]

 linux-2.6-npiggin/arch/i386/kernel/smpboot.c |   15 ++-
 linux-2.6-npiggin/include/linux/sched.h      |   12 ++
 linux-2.6-npiggin/kernel/sched.c             |  131 ++++++++++++++-------------
 3 files changed, 91 insertions(+), 67 deletions(-)

diff -puN include/linux/sched.h~sched-group-power include/linux/sched.h
--- linux-2.6/include/linux/sched.h~sched-group-power	2004-02-19 16:56:22.000000000 +1100
+++ linux-2.6-npiggin/include/linux/sched.h	2004-02-19 16:56:23.000000000 +1100
@@ -530,15 +530,25 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
 
 #ifdef CONFIG_SMP
+#define SCHED_LOAD_SHIFT 7	/* increase resolution of load calculations */
+#define SCHED_LOAD_SCALE (1 << SCHED_LOAD_SHIFT)
+
 #define SD_FLAG_NEWIDLE		1	/* Balance when about to become idle */
 #define SD_FLAG_EXEC		2	/* Balance on exec */
 #define SD_FLAG_WAKE		4	/* Balance on task wakeup */
 #define SD_FLAG_FASTMIGRATE	8	/* Sync wakes put task on waking CPU */
-#define SD_FLAG_IDLE		16	/* Should not have all CPUs idle */
 
 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
 	cpumask_t cpumask;
+
+	/*
+	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
+	 * single CPU. This should be read only (except for setup). Although
+	 * it will need to be written to at cpu hot(un)plug time, perhaps the
+	 * cpucontrol semaphore will provide enough exclusion?
+	 */
+	unsigned long cpu_power;
 };
 
 struct sched_domain {
diff -puN kernel/sched.c~sched-group-power kernel/sched.c
--- linux-2.6/kernel/sched.c~sched-group-power	2004-02-19 16:56:22.000000000 +1100
+++ linux-2.6-npiggin/kernel/sched.c	2004-02-19 17:09:47.000000000 +1100
@@ -190,9 +190,6 @@ struct prio_array {
 	struct list_head queue[MAX_PRIO];
 };
 
-#define SCHED_LOAD_SHIFT 7	/* increase resolution of load calculations */
-#define SCHED_LOAD_SCALE (1 << SCHED_LOAD_SHIFT)
-
 /*
  * This is the main, per-CPU runqueue data structure.
  *
@@ -1350,16 +1347,14 @@ find_busiest_group(struct sched_domain *
 				unsigned long *imbalance, enum idle_type idle)
 {
 	unsigned long max_load, avg_load, total_load, this_load;
-	int modify, total_nr_cpus, busiest_nr_cpus, this_nr_cpus;
-	enum idle_type package_idle = IDLE;
-	struct sched_group *busiest = NULL, *group = domain->groups;
+	unsigned int total_pwr;
+	int modify;
+	struct sched_group *busiest = NULL, *this = NULL, *group = domain->groups;
 
 	max_load = 0;
 	this_load = 0;
 	total_load = 0;
-	total_nr_cpus = 0;
-	busiest_nr_cpus = 0;
-	this_nr_cpus = 0;
+	total_pwr = 0;
 
 	if (group == NULL)
 		goto out_balanced;
@@ -1390,8 +1385,6 @@ find_busiest_group(struct sched_domain *
 			/* Bias balancing toward cpus of our domain */
 			if (local_group) {
 				load = get_high_cpu_load(i, modify);
-				if (!idle_cpu(i))
-					package_idle = NOT_IDLE;
 			} else
 				load = get_low_cpu_load(i, modify);
 
@@ -1403,48 +1396,34 @@ find_busiest_group(struct sched_domain *
 			goto nextgroup;
 
 		total_load += avg_load;
+		total_pwr += group->cpu_power;
 
-		/*
-		 * Load is cumulative over SD_FLAG_IDLE domains, but
-		 * spread over !SD_FLAG_IDLE domains. For example, 2
-		 * processes running on an SMT CPU puts a load of 2 on
-		 * that CPU, however 2 processes running on 2 CPUs puts
-		 * a load of 1 on that domain.
-		 *
-		 * This should be configurable so as SMT siblings become
-		 * more powerful, they can "spread" more load - for example,
-		 * the above case might only count as a load of 1.7.
-		 */
-		if (!(domain->flags & SD_FLAG_IDLE)) {
-			avg_load /= nr_cpus;
-			total_nr_cpus += nr_cpus;
-		} else
-			total_nr_cpus++;
-
-		if (avg_load > max_load)
-			max_load = avg_load;
+		/* Adjust by relative CPU power of the group */
+		avg_load = (avg_load << SCHED_LOAD_SHIFT) / group->cpu_power;
 
 		if (local_group) {
 			this_load = avg_load;
-			this_nr_cpus = nr_cpus;
-		} else if (avg_load >= max_load) {
+			this = group;
+			goto nextgroup;
+		}
+		if (avg_load > max_load) {
+			max_load = avg_load;
 			busiest = group;
-			busiest_nr_cpus = nr_cpus;
 		}
 nextgroup:
 		group = group->next;
 	} while (group != domain->groups);
 
-	if (!busiest)
+	if (!busiest || this_load >= max_load)
 		goto out_balanced;
 
-	avg_load = total_load / total_nr_cpus;
-
-	if (this_load >= avg_load)
-		goto out_balanced;
+	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
 
-	if (idle == NOT_IDLE && 100*max_load <= domain->imbalance_pct*this_load)
+	if (idle == NOT_IDLE) {
+		if (this_load >= avg_load ||
+			100*max_load <= domain->imbalance_pct*this_load)
 		goto out_balanced;
+	}
 
 	/*
 	 * We're trying to get all the cpus to the average_load, so we don't
@@ -1458,15 +1437,45 @@ nextgroup:
 	 * appear as very large values with unsigned longs.
 	 */
 	*imbalance = (min(max_load - avg_load, avg_load - this_load) + 1) / 2;
-	/* Get rid of the scaling factor, rounding *up* as we divide */
-	*imbalance = (*imbalance + SCHED_LOAD_SCALE/2 + 1)
-					>> SCHED_LOAD_SHIFT;
 
-	if (*imbalance == 0)
-		goto out_balanced;
+	if (*imbalance <= SCHED_LOAD_SCALE/2) {
+		unsigned long pwr_now = 0, pwr_move = 0;
+		unsigned long load;
+		unsigned long tmp;
+
+		/*
+		 * OK, we don't have enough imbalance to justify moving tasks,
+		 * however we may be able to increase total CPU power used by
+		 * moving them.
+		 */
+
+		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
+		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+		pwr_now >>= SCHED_LOAD_SHIFT;
+
+		/* Amount of load we'd subtract */
+		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+		if (max_load > tmp)
+			pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
+							max_load - tmp);
+
+		/* Amount of load we'd add */
+		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
+		pwr_move += this->cpu_power*min(this->cpu_power, this_load + tmp);
+		pwr_move >>= SCHED_LOAD_SHIFT;
+
+		/* Move if we gain another 8th of a CPU worth of throughput */
+		if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
+			goto out_balanced;
+		*imbalance = 1;
+		return busiest;
+	}
 
 	/* How many tasks to actually move to equalise the imbalance */
-	*imbalance *= min(busiest_nr_cpus, this_nr_cpus);
+	*imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
+				>> SCHED_LOAD_SHIFT;
+	/* Get rid of the scaling factor, rounding *up* as we divide */
+	*imbalance = (*imbalance + SCHED_LOAD_SCALE/2) >> SCHED_LOAD_SHIFT;
 
 	return busiest;
 
@@ -1542,26 +1551,19 @@ out:
 	if (!balanced && nr_moved == 0)
 		failed = 1;
 
-	if (domain->flags & SD_FLAG_IDLE && failed && busiest &&
+	if (failed && busiest &&
 	   		domain->nr_balance_failed > domain->cache_nice_tries) {
-		int i;
-		for_each_cpu_mask(i, group->cpumask) {
-			int wake = 0;
+		int wake = 0;
 
-			if (!cpu_online(i))
-				continue;
-
-			busiest = cpu_rq(i);
-			spin_lock(&busiest->lock);
-			if (!busiest->active_balance) {
-				busiest->active_balance = 1;
-				busiest->push_cpu = this_cpu;
-				wake = 1;
-			}
-			spin_unlock(&busiest->lock);
-			if (wake)
-				wake_up_process(busiest->migration_thread);
-		}
+		spin_lock(&busiest->lock);
+		if (!busiest->active_balance) {
+			busiest->active_balance = 1;
+			busiest->push_cpu = this_cpu;
+			wake = 1;
+		}
+		spin_unlock(&busiest->lock);
+		if (wake)
+			wake_up_process(busiest->migration_thread);
 	}
 
 	if (failed)
@@ -3324,12 +3326,14 @@ static void __init arch_init_sched_domai
 			continue;
 
 		node->cpumask = nodemask;
+		node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask);
 
 		for_each_cpu_mask(j, node->cpumask) {
 			struct sched_group *cpu = &sched_group_cpus[j];
 
 			cpus_clear(cpu->cpumask);
 			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -3376,6 +3380,7 @@ static void __init arch_init_sched_domai
 
 		cpus_clear(cpu->cpumask);
 		cpu_set(i, cpu->cpumask);
+		cpu->cpu_power = SCHED_LOAD_SCALE;
 
 		if (!first_cpu)
 			first_cpu = cpu;
diff -puN arch/i386/kernel/smpboot.c~sched-group-power arch/i386/kernel/smpboot.c
--- linux-2.6/arch/i386/kernel/smpboot.c~sched-group-power	2004-02-19 16:56:23.000000000 +1100
+++ linux-2.6-npiggin/arch/i386/kernel/smpboot.c	2004-02-19 16:56:23.000000000 +1100
@@ -1149,7 +1149,6 @@ __init void arch_init_sched_domains(void
 
 		*phys_domain = SD_CPU_INIT;
 		phys_domain->span = nodemask;
-		phys_domain->flags |= SD_FLAG_IDLE;
 
 		*node_domain = SD_NODE_INIT;
 		node_domain->span = cpu_online_map;
@@ -1169,6 +1168,7 @@ __init void arch_init_sched_domains(void
 
 			cpu->cpumask = CPU_MASK_NONE;
 			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -1182,6 +1182,7 @@ __init void arch_init_sched_domains(void
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		int j;
 		cpumask_t nodemask;
+		struct sched_group *node = &sched_group_nodes[i];
 		cpus_and(nodemask, node_to_cpumask(i), cpu_online_map);
 
 		if (cpus_empty(nodemask))
@@ -1197,6 +1198,12 @@ __init void arch_init_sched_domains(void
 				continue;
 
 			cpu->cpumask = cpu_domain->span;
+			/*
+			 * Make each extra sibling increase power by 10% of
+			 * the basic CPU. This is very arbitrary.
+			 */
+			cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
+			node->cpu_power += cpu->cpu_power;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -1218,6 +1225,7 @@ __init void arch_init_sched_domains(void
 			continue;
 
 		cpu->cpumask = nodemask;
+		/* ->cpu_power already setup */
 
 		if (!first_cpu)
 			first_cpu = cpu;
@@ -1227,7 +1235,6 @@ __init void arch_init_sched_domains(void
 	}
 	last_cpu->next = first_cpu;
 
-
 	mb();
 	for_each_cpu_mask(i, cpu_online_map) {
 		int node = cpu_to_node(i);
@@ -1265,7 +1272,6 @@ __init void arch_init_sched_domains(void
 
 		*phys_domain = SD_CPU_INIT;
 		phys_domain->span = cpu_online_map;
-		phys_domain->flags |= SD_FLAG_IDLE;
 	}
 
 	/* Set up CPU (sibling) groups */
@@ -1282,6 +1288,7 @@ __init void arch_init_sched_domains(void
 
 			cpus_clear(cpu->cpumask);
 			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -1302,6 +1309,8 @@ __init void arch_init_sched_domains(void
 			continue;
 
 		cpu->cpumask = cpu_domain->span;
+		/* See SMT+NUMA setup for comment */
+		cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
 
 		if (!first_cpu)
 			first_cpu = cpu;

_

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] 2.6.3-rc3-mm1: sched-group-power
  2004-02-19  9:24 [PATCH] 2.6.3-rc3-mm1: sched-group-power Nick Piggin
@ 2004-02-20  1:17 ` Rick Lindsley
  2004-02-20  1:37   ` Nick Piggin
  0 siblings, 1 reply; 5+ messages in thread
From: Rick Lindsley @ 2004-02-20  1:17 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, LSE, Nakajima, Jun, Anton Blanchard, linux-kernel

Nick, I'm not sure what capability this patch adds .. perhaps some words
of explanation.

So we have SMT/HT situations where we'd prefer to balance across cores;
that is, if 0, 1, 2, and 3 share a core and 4, 5, 6, and 7 share a core,
you'd like two processes to arrange themselves so one is on [0123] and
another is on [4567].  This is what the SD_FLAG_IDLE flag indicated before.

With this patch, we can "weight" the load imposed by certain cpus, right?
What advantage does this give us?  On a given machine, won't the "weight"
of any one set of SMT siblings and cores be uniform with respect to all
the cores and siblings anyway?

Rick

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] 2.6.3-rc3-mm1: sched-group-power
  2004-02-20  1:17 ` Rick Lindsley
@ 2004-02-20  1:37   ` Nick Piggin
  2004-02-20 23:46     ` Rick Lindsley
  0 siblings, 1 reply; 5+ messages in thread
From: Nick Piggin @ 2004-02-20  1:37 UTC (permalink / raw)
  To: Rick Lindsley
  Cc: Andrew Morton, LSE, Nakajima, Jun, Anton Blanchard, linux-kernel



Rick Lindsley wrote:

>Nick, I'm not sure what capability this patch adds .. perhaps some words
>of explanation.
>
>So we have SMT/HT situations where we'd prefer to balance across cores;
>that is, if 0, 1, 2, and 3 share a core and 4, 5, 6, and 7 share a core,
>you'd like two processes to arrange themselves so one is on [0123] and
>another is on [4567].  This is what the SD_IDLE flag indicated before.
>
>With this patch, we can "weight" the load imposed by certain cpus, right?
>What advantage does this give us?  On a given machine, won't the "weight"
>of any one set of SMT siblings and cores be uniform with respect to all
>the cores and siblings anyway?
>
>

It is difficult to propagate the SD_FLAG_IDLE attribute up
multiple domains.

For example, with SMT + CPU + NODE domains you can get into
the following situation:

01, 23 are 4 siblings in 2 cores on node 0,
45, 67 are " " " on node 1.

The top level balancing domain now spans 01234567, and wants to
balance between groups 0123, and 4567. We don't want SD_FLAG_IDLE
semantics here, because that would mean if two tasks were running
on node 0, one would be migrated to node 1. We want to migrate 1
task if one node is idle, and the other has 3 processes running for
example.

Also this copes with siblings becoming much more powerful, or
some groups with SMT turned off, some on (think hotplug cpu),
different speed CPUs, etc.


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] 2.6.3-rc3-mm1: sched-group-power
  2004-02-20  1:37   ` Nick Piggin
@ 2004-02-20 23:46     ` Rick Lindsley
  2004-02-21  1:39       ` Nick Piggin
  0 siblings, 1 reply; 5+ messages in thread
From: Rick Lindsley @ 2004-02-20 23:46 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Andrew Morton, LSE, Nakajima, Jun, Anton Blanchard, linux-kernel

So let me try a diagram. Each of these groups of numbers represent a
cpu_group, and the labels to the left are individual sched_domains.

SD1		     01234567
SD2-SD3		 0123        4567
SD4-SD7	       01    23    45    67
SD8-SD15      0  1  2  3  4  5  6  7

Currently, we assume each cpu has a power of 1, so each cpu group in
domains SD8-SD15 would have a power of 1, each cpu group in SD4-SD7
would have a power of 2, each of SD2 and SD3 would have a power of 4,
and collectively, all CPUs as represented in SD1 would have a power of 8.
Of course, we don't really make use of this assumption but this just
enumerates our assumption that all nodes, all cpus are created equal.

Your new power code would assign each cpu group a static power other
than this, making SMT pairs, for instance, 1.2 instead of 2.  In the
case of four siblings, 1.4 instead of 4.  Correct?  In the example above,
SD2 and SD3 would have a power rating of 2.4, and SD1 would have a power
rating of 4*1.2 or 4.8, right?

With your current code, we only consult the power ratings if we've already
decided that we are currently "balanced enough".  I'd go one step further
and say that manipulating for power only makes sense if you have an idle
processor somewhere.  If all processors are busy, then short of some
quality-of-process assessment, how can you improve power?  (You could
improve fairness, I suppose, but that would require lots more stats and
history than we have here.) If one set of procs is slower than another,
won't that make itself apparent by a longer queue developing there? (or
shorter queues forming somewhere else?) and it being load-balanced
by the existing algorithm?  Seems to me we only need to make power
decisions when we want to consider an idle processor stealing a task (a
possibly *running* task) from another processor because this processor
is faster/stronger/better.

Rick

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] 2.6.3-rc3-mm1: sched-group-power
  2004-02-20 23:46     ` Rick Lindsley
@ 2004-02-21  1:39       ` Nick Piggin
  0 siblings, 0 replies; 5+ messages in thread
From: Nick Piggin @ 2004-02-21  1:39 UTC (permalink / raw)
  To: Rick Lindsley
  Cc: Andrew Morton, LSE, Nakajima, Jun, Anton Blanchard, linux-kernel



Rick Lindsley wrote:

>So let me try a diagram. Each of these groups of numbers represent a
>cpu_group, and the labels to the left are individual sched_domains.
>
>SD1		     01234567
>SD2-SD3		 0123        4567
>SD4-SD7	       01    23    45    67
>SD8-SD15      0  1  2  3  4  5  6  7
>
>Currently, we assume each cpu has a power of 1, so each cpu group in
>domains SD8-SD15 would have a power of 1, each cpu group in SD4-SD7
>would have a power of 2, each of SD2 and SD3 would have a power of 4,
>and collectively, all CPUs as represented in SD1 would have a power of 8.
>Of course, we don't really make use of this assumption but this just
>enumerates our assumption that all nodes, all cpus are created equal.
>
>

Well we used to sum up the number of CPUs in each group, so it
wasn't quite that bad. We assumed all CPUs are created equal.

>Your new power code would assign each cpu group a static power other
>than this, making SMT pairs, for instance, 1.2 instead of 2.  In the
>case of four siblings, 1.4 instead of 4.  Correct?  In the example above,
>SD2 and SD3 would have a power rating of 2.4, and SD1 would have a power
>rating of 4*1.2 or 4.8, right?
>
>

Right.

>With your current code, we only consult the power ratings if we've already
>decided that we are currently "balanced enough".
>

Well we do work out the per group loads by dividing with the power
rating instead of cpus-in-the-group too.

>  I'd go one step further
>and say that manipulating for power only makes sense if you have an idle
>processor somewhere.  If all processors are busy, then short of some
>quality-of-process assessment, how can you improve power?  (You could
>improve fairness, I suppose, but that would require lots more stats and
>history than we have here.) If one set of procs is slower than another,
>won't that make itself apparent by a longer queue developing there? (or
>shorter queues forming somewhere else?) and it being load-balanced
>by the existing algorithm?  Seems to me we only need to make power
>decisions when we want to consider an idle processor stealing a task (a
>possibly *running* task) from another processor because this processor
>is faster/stronger/better.
>
>

Yeah, probably we could change that test to:
if (*imbalance <= SCHED_LOAD_SCALE / 2
            && this_load < SCHED_LOAD_SCALE)

Either way, the calculation should be done in such a way that
if your CPUs are not idle, it wouldn't predict a performance
increase.

No doubt there is room for improvement, but hopefully it is now
at a "good enough" stage...


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2004-02-21  1:39 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-02-19  9:24 [PATCH] 2.6.3-rc3-mm1: sched-group-power Nick Piggin
2004-02-20  1:17 ` Rick Lindsley
2004-02-20  1:37   ` Nick Piggin
2004-02-20 23:46     ` Rick Lindsley
2004-02-21  1:39       ` Nick Piggin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox