public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: Nick Piggin <piggin@cyberone.com.au>
To: Andrew Morton <akpm@osdl.org>
Cc: LSE <lse-tech@lists.sourceforge.net>,
	"Nakajima, Jun" <jun.nakajima@intel.com>,
	Rick Lindsley <ricklind@us.ibm.com>,
	Anton Blanchard <anton@samba.org>,
	Rick Lindsley <ricklind@us.ibm.com>,
	linux-kernel <linux-kernel@vger.kernel.org>
Subject: [PATCH] 2.6.3-rc3-mm1: sched-group-power
Date: Thu, 19 Feb 2004 20:24:26 +1100	[thread overview]
Message-ID: <403480CA.7050804@cyberone.com.au> (raw)

[-- Attachment #1: Type: text/plain, Size: 435 bytes --]

The following patch implements a cpu_power member to
struct sched_group.

This allows special casing to be removed for SMT groups
in the balancing code. It does not take CPU hotplug into
account yet, but that shouldn't be too hard.

I have tested it on the NUMAQ by pretending it has SMT.
Works as expected. Active balances across nodes.

Andrew, please apply. I suppose you'd better not send
the scheduler changes to Linus quite yet.

[-- Attachment #2: sched-group-power.patch --]
[-- Type: text/plain, Size: 10495 bytes --]

 linux-2.6-npiggin/arch/i386/kernel/smpboot.c |   15 ++-
 linux-2.6-npiggin/include/linux/sched.h      |   12 ++
 linux-2.6-npiggin/kernel/sched.c             |  131 ++++++++++++++-------------
 3 files changed, 91 insertions(+), 67 deletions(-)

diff -puN include/linux/sched.h~sched-group-power include/linux/sched.h
--- linux-2.6/include/linux/sched.h~sched-group-power	2004-02-19 16:56:22.000000000 +1100
+++ linux-2.6-npiggin/include/linux/sched.h	2004-02-19 16:56:23.000000000 +1100
@@ -530,15 +530,25 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
 
 #ifdef CONFIG_SMP
+#define SCHED_LOAD_SHIFT 7	/* increase resolution of load calculations */
+#define SCHED_LOAD_SCALE (1 << SCHED_LOAD_SHIFT)
+
 #define SD_FLAG_NEWIDLE		1	/* Balance when about to become idle */
 #define SD_FLAG_EXEC		2	/* Balance on exec */
 #define SD_FLAG_WAKE		4	/* Balance on task wakeup */
 #define SD_FLAG_FASTMIGRATE	8	/* Sync wakes put task on waking CPU */
-#define SD_FLAG_IDLE		16	/* Should not have all CPUs idle */
 
 struct sched_group {
 	struct sched_group *next;	/* Must be a circular list */
 	cpumask_t cpumask;
+
+	/*
+	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
+	 * single CPU. This should be read only (except for setup). Although
+	 * it will need to be written to at cpu hot(un)plug time, perhaps the
+	 * cpucontrol semaphore will provide enough exclusion?
+	 */
+	unsigned long cpu_power;
 };
 
 struct sched_domain {
diff -puN kernel/sched.c~sched-group-power kernel/sched.c
--- linux-2.6/kernel/sched.c~sched-group-power	2004-02-19 16:56:22.000000000 +1100
+++ linux-2.6-npiggin/kernel/sched.c	2004-02-19 17:09:47.000000000 +1100
@@ -190,9 +190,6 @@ struct prio_array {
 	struct list_head queue[MAX_PRIO];
 };
 
-#define SCHED_LOAD_SHIFT 7	/* increase resolution of load calculations */
-#define SCHED_LOAD_SCALE (1 << SCHED_LOAD_SHIFT)
-
 /*
  * This is the main, per-CPU runqueue data structure.
  *
@@ -1350,16 +1347,14 @@ find_busiest_group(struct sched_domain *
 				unsigned long *imbalance, enum idle_type idle)
 {
 	unsigned long max_load, avg_load, total_load, this_load;
-	int modify, total_nr_cpus, busiest_nr_cpus, this_nr_cpus;
-	enum idle_type package_idle = IDLE;
-	struct sched_group *busiest = NULL, *group = domain->groups;
+	unsigned int total_pwr;
+	int modify;
+	struct sched_group *busiest = NULL, *this = NULL, *group = domain->groups;
 
 	max_load = 0;
 	this_load = 0;
 	total_load = 0;
-	total_nr_cpus = 0;
-	busiest_nr_cpus = 0;
-	this_nr_cpus = 0;
+	total_pwr = 0;
 
 	if (group == NULL)
 		goto out_balanced;
@@ -1390,8 +1385,6 @@ find_busiest_group(struct sched_domain *
 			/* Bias balancing toward cpus of our domain */
 			if (local_group) {
 				load = get_high_cpu_load(i, modify);
-				if (!idle_cpu(i))
-					package_idle = NOT_IDLE;
 			} else
 				load = get_low_cpu_load(i, modify);
 
@@ -1403,48 +1396,34 @@ find_busiest_group(struct sched_domain *
 			goto nextgroup;
 
 		total_load += avg_load;
+		total_pwr += group->cpu_power;
 
-		/*
-		 * Load is cumulative over SD_FLAG_IDLE domains, but
-		 * spread over !SD_FLAG_IDLE domains. For example, 2
-		 * processes running on an SMT CPU puts a load of 2 on
-		 * that CPU, however 2 processes running on 2 CPUs puts
-		 * a load of 1 on that domain.
-		 *
-		 * This should be configurable so as SMT siblings become
-		 * more powerful, they can "spread" more load - for example,
-		 * the above case might only count as a load of 1.7.
-		 */
-		if (!(domain->flags & SD_FLAG_IDLE)) {
-			avg_load /= nr_cpus;
-			total_nr_cpus += nr_cpus;
-		} else
-			total_nr_cpus++;
-
-		if (avg_load > max_load)
-			max_load = avg_load;
+		/* Adjust by relative CPU power of the group */
+		avg_load = (avg_load << SCHED_LOAD_SHIFT) / group->cpu_power;
 
 		if (local_group) {
 			this_load = avg_load;
-			this_nr_cpus = nr_cpus;
-		} else if (avg_load >= max_load) {
+			this = group;
+			goto nextgroup;
+		}
+		if (avg_load > max_load) {
+			max_load = avg_load;
 			busiest = group;
-			busiest_nr_cpus = nr_cpus;
 		}
 nextgroup:
 		group = group->next;
 	} while (group != domain->groups);
 
-	if (!busiest)
+	if (!busiest || this_load >= max_load)
 		goto out_balanced;
 
-	avg_load = total_load / total_nr_cpus;
-
-	if (this_load >= avg_load)
-		goto out_balanced;
+	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
 
-	if (idle == NOT_IDLE && 100*max_load <= domain->imbalance_pct*this_load)
+	if (idle == NOT_IDLE) {
+		if (this_load >= avg_load ||
+			100*max_load <= domain->imbalance_pct*this_load)
 		goto out_balanced;
+	}
 
 	/*
 	 * We're trying to get all the cpus to the average_load, so we don't
@@ -1458,15 +1437,45 @@ nextgroup:
 	 * appear as very large values with unsigned longs.
 	 */
 	*imbalance = (min(max_load - avg_load, avg_load - this_load) + 1) / 2;
-	/* Get rid of the scaling factor, rounding *up* as we divide */
-	*imbalance = (*imbalance + SCHED_LOAD_SCALE/2 + 1)
-					>> SCHED_LOAD_SHIFT;
 
-	if (*imbalance == 0)
-		goto out_balanced;
+	if (*imbalance <= SCHED_LOAD_SCALE/2) {
+		unsigned long pwr_now = 0, pwr_move = 0;
+		unsigned long load;
+		unsigned long tmp;
+
+		/*
+		 * OK, we don't have enough imbalance to justify moving tasks,
+		 * however we may be able to increase total CPU power used by
+		 * moving them.
+		 */
+
+		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
+		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
+		pwr_now >>= SCHED_LOAD_SHIFT;
+
+		/* Amount of load we'd subtract */
+		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
+		if (max_load > tmp)
+			pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
+							max_load - tmp);
+
+		/* Amount of load we'd add */
+		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
+		pwr_move += this->cpu_power*min(this->cpu_power, this_load + tmp);
+		pwr_move >>= SCHED_LOAD_SHIFT;
+
+		/* Move if we gain another 8th of a CPU worth of throughput */
+		if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
+			goto out_balanced;
+		*imbalance = 1;
+		return busiest;
+	}
 
 	/* How many tasks to actually move to equalise the imbalance */
-	*imbalance *= min(busiest_nr_cpus, this_nr_cpus);
+	*imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
+				>> SCHED_LOAD_SHIFT;
+	/* Get rid of the scaling factor, rounding *up* as we divide */
+	*imbalance = (*imbalance + SCHED_LOAD_SCALE/2) >> SCHED_LOAD_SHIFT;
 
 	return busiest;
 
@@ -1542,26 +1551,19 @@ out:
 	if (!balanced && nr_moved == 0)
 		failed = 1;
 
-	if (domain->flags & SD_FLAG_IDLE && failed && busiest &&
+	if (failed && busiest &&
 	   		domain->nr_balance_failed > domain->cache_nice_tries) {
-		int i;
-		for_each_cpu_mask(i, group->cpumask) {
-			int wake = 0;
+		int wake = 0;
 
-			if (!cpu_online(i))
-				continue;
-
-			busiest = cpu_rq(i);
-			spin_lock(&busiest->lock);
-			if (!busiest->active_balance) {
-				busiest->active_balance = 1;
-				busiest->push_cpu = this_cpu;
-				wake = 1;
-			}
-			spin_unlock(&busiest->lock);
-			if (wake)
-				wake_up_process(busiest->migration_thread);
-		}
+		spin_lock(&busiest->lock);
+		if (!busiest->active_balance) {
+			busiest->active_balance = 1;
+			busiest->push_cpu = this_cpu;
+			wake = 1;
+		}
+		spin_unlock(&busiest->lock);
+		if (wake)
+			wake_up_process(busiest->migration_thread);
 	}
 
 	if (failed)
@@ -3324,12 +3326,14 @@ static void __init arch_init_sched_domai
 			continue;
 
 		node->cpumask = nodemask;
+		node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask);
 
 		for_each_cpu_mask(j, node->cpumask) {
 			struct sched_group *cpu = &sched_group_cpus[j];
 
 			cpus_clear(cpu->cpumask);
 			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -3376,6 +3380,7 @@ static void __init arch_init_sched_domai
 
 		cpus_clear(cpu->cpumask);
 		cpu_set(i, cpu->cpumask);
+		cpu->cpu_power = SCHED_LOAD_SCALE;
 
 		if (!first_cpu)
 			first_cpu = cpu;
diff -puN arch/i386/kernel/smpboot.c~sched-group-power arch/i386/kernel/smpboot.c
--- linux-2.6/arch/i386/kernel/smpboot.c~sched-group-power	2004-02-19 16:56:23.000000000 +1100
+++ linux-2.6-npiggin/arch/i386/kernel/smpboot.c	2004-02-19 16:56:23.000000000 +1100
@@ -1149,7 +1149,6 @@ __init void arch_init_sched_domains(void
 
 		*phys_domain = SD_CPU_INIT;
 		phys_domain->span = nodemask;
-		phys_domain->flags |= SD_FLAG_IDLE;
 
 		*node_domain = SD_NODE_INIT;
 		node_domain->span = cpu_online_map;
@@ -1169,6 +1168,7 @@ __init void arch_init_sched_domains(void
 
 			cpu->cpumask = CPU_MASK_NONE;
 			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -1182,6 +1182,7 @@ __init void arch_init_sched_domains(void
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		int j;
 		cpumask_t nodemask;
+		struct sched_group *node = &sched_group_nodes[i];
 		cpus_and(nodemask, node_to_cpumask(i), cpu_online_map);
 
 		if (cpus_empty(nodemask))
@@ -1197,6 +1198,12 @@ __init void arch_init_sched_domains(void
 				continue;
 
 			cpu->cpumask = cpu_domain->span;
+			/*
+			 * Make each extra sibling increase power by 10% of
+			 * the basic CPU. This is very arbitrary.
+			 */
+			cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
+			node->cpu_power += cpu->cpu_power;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -1218,6 +1225,7 @@ __init void arch_init_sched_domains(void
 			continue;
 
 		cpu->cpumask = nodemask;
+		/* ->cpu_power already setup */
 
 		if (!first_cpu)
 			first_cpu = cpu;
@@ -1227,7 +1235,6 @@ __init void arch_init_sched_domains(void
 	}
 	last_cpu->next = first_cpu;
 
-
 	mb();
 	for_each_cpu_mask(i, cpu_online_map) {
 		int node = cpu_to_node(i);
@@ -1265,7 +1272,6 @@ __init void arch_init_sched_domains(void
 
 		*phys_domain = SD_CPU_INIT;
 		phys_domain->span = cpu_online_map;
-		phys_domain->flags |= SD_FLAG_IDLE;
 	}
 
 	/* Set up CPU (sibling) groups */
@@ -1282,6 +1288,7 @@ __init void arch_init_sched_domains(void
 
 			cpus_clear(cpu->cpumask);
 			cpu_set(j, cpu->cpumask);
+			cpu->cpu_power = SCHED_LOAD_SCALE;
 
 			if (!first_cpu)
 				first_cpu = cpu;
@@ -1302,6 +1309,8 @@ __init void arch_init_sched_domains(void
 			continue;
 
 		cpu->cpumask = cpu_domain->span;
+		/* See SMT+NUMA setup for comment */
+		cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
 
 		if (!first_cpu)
 			first_cpu = cpu;

_

             reply	other threads:[~2004-02-19  9:24 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2004-02-19  9:24 Nick Piggin [this message]
2004-02-20  1:17 ` [PATCH] 2.6.3-rc3-mm1: sched-group-power Rick Lindsley
2004-02-20  1:37   ` Nick Piggin
2004-02-20 23:46     ` Rick Lindsley
2004-02-21  1:39       ` Nick Piggin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=403480CA.7050804@cyberone.com.au \
    --to=piggin@cyberone.com.au \
    --cc=akpm@osdl.org \
    --cc=anton@samba.org \
    --cc=jun.nakajima@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lse-tech@lists.sourceforge.net \
    --cc=ricklind@us.ibm.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox