All of lore.kernel.org
 help / color / mirror / Atom feed
From: Erich Focht <efocht@hpce.nec.com>
To: "linux-kernel" <linux-kernel@vger.kernel.org>,
	LSE <lse-tech@lists.sourceforge.net>
Cc: "Martin J. Bligh" <Martin.Bligh@us.ibm.com>,
	Andi Kleen <ak@muc.de>,
	torvalds@osdl.org
Subject: [patch] scheduler fix for 1cpu/node case
Date: Mon, 28 Jul 2003 21:16:46 +0200	[thread overview]
Message-ID: <200307280548.53976.efocht@gmx.net> (raw)

[-- Attachment #1: Type: text/plain, Size: 1686 bytes --]

Hi,

after talking to several people at OLS about the current NUMA
scheduler the conclusion was:
(1) it sucks (for particular workloads),
(2) on x86_64 (embarassingly simple NUMA) it's useless, goto (1).

Fact is that the current separation of local and global balancing,
where global balancing is done only in the timer interrupt at a fixed
rate is way too unflexible. A CPU going idle inside a well balanced
node will stay idle for a while even if there's a lot of work to
do. Especially in the corner case of one CPU per node this is
condemning that CPU to idleness for at least 5 ms. So x86_64 platforms
(but not only those!) suffer and whish to switch off the NUMA
scheduler while keeping NUMA memory management on.

The attached patch is a simple solution which
- solves the 1 CPU / node problem,
- lets other systems behave (almost) as before,
- opens the way to other optimisations like multi-level node
  hierarchies (by tuning the retry rate)
- simpifies the NUMA scheduler and deletes more lines of code than it
  adds.

The timer interrupt based global rebalancing might appear to be a
simple and good idea but it takes the scheduler a lot of
flexibility. In the patch the global rebalancing is done after a
certain number of failed attempts to locally balance. The number of
attempts is proportional to the number of CPUs in the current
node. For only 1 CPU in the current node the scheduler doesn't even
try to balance locally, it wouldn't make sense anyway. Of course one
could instead set IDLE_NODE_REBALANCE_TICK = IDLE_REBALANCE_TICK, but
this is more ugly (IMHO) and only helps when all nodes have 1 CPU /
node.

Please consider this for inclusion.

Thanks,
Erich




[-- Attachment #2: 1cpufix-lb-2.6.0t1.patch --]
[-- Type: text/x-diff, Size: 4255 bytes --]

diff -urNp 2.6.0t1/kernel/sched.c 2.6.0t1-1cpufix/kernel/sched.c
--- 2.6.0t1/kernel/sched.c	2003-07-14 05:37:14.000000000 +0200
+++ 2.6.0t1-1cpufix/kernel/sched.c	2003-07-28 05:32:28.000000000 +0200
@@ -164,6 +164,7 @@ struct runqueue {
 	prio_array_t *active, *expired, arrays[2];
 	int prev_cpu_load[NR_CPUS];
 #ifdef CONFIG_NUMA
+	unsigned int nr_lb_failed;
 	atomic_t *node_nr_running;
 	int prev_node_load[MAX_NUMNODES];
 #endif
@@ -856,6 +857,35 @@ static int find_busiest_node(int this_no
 	return node;
 }
 
+/*
+ * Decide whether the scheduler should balance locally (inside the same node)
+ * or globally depending on the number of failed local balance attempts.
+ * The number of failed local balance attempts depends on the number of cpus
+ * in the current node. In case it's just one, go immediately for global
+ * balancing. On a busy cpu the number of retries is smaller.
+ */
+static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq)
+{
+	int node, retries, this_node = cpu_to_node(this_cpu);
+
+	retries = nr_cpus_node(this_node) - 1;
+	if (this_rq->curr != this_rq->idle)
+		retries >>= 1;
+	if (this_rq->nr_lb_failed >= retries) {
+		node = find_busiest_node(this_node);
+		this_rq->nr_lb_failed = 0;
+		if (node >= 0)
+			return (node_to_cpu_mask(node) | (1UL << this_cpu));
+	}
+	return node_to_cpu_mask(this_node);
+}
+
+#else /* !CONFIG_NUMA */
+
+static inline unsigned long cpus_to_balance(int this_cpu, runqueue_t *this_rq)
+{
+	return cpu_online_map;
+}
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_SMP
@@ -960,6 +990,12 @@ static inline runqueue_t *find_busiest_q
 		busiest = NULL;
 	}
 out:
+#ifdef CONFIG_NUMA
+	if (!busiest)
+		this_rq->nr_lb_failed++;
+	else
+		this_rq->nr_lb_failed = 0;
+#endif
 	return busiest;
 }
 
@@ -995,7 +1031,7 @@ static inline void pull_task(runqueue_t 
  * We call this with the current runqueue locked,
  * irqs disabled.
  */
-static void load_balance(runqueue_t *this_rq, int idle, unsigned long cpumask)
+static void load_balance(runqueue_t *this_rq, int idle)
 {
 	int imbalance, idx, this_cpu = smp_processor_id();
 	runqueue_t *busiest;
@@ -1003,7 +1039,8 @@ static void load_balance(runqueue_t *thi
 	struct list_head *head, *curr;
 	task_t *tmp;
 
-	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance, cpumask);
+	busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance,
+				     cpus_to_balance(this_cpu, this_rq));
 	if (!busiest)
 		goto out;
 
@@ -1085,29 +1122,9 @@ out:
  */
 #define IDLE_REBALANCE_TICK (HZ/1000 ?: 1)
 #define BUSY_REBALANCE_TICK (HZ/5 ?: 1)
-#define IDLE_NODE_REBALANCE_TICK (IDLE_REBALANCE_TICK * 5)
-#define BUSY_NODE_REBALANCE_TICK (BUSY_REBALANCE_TICK * 2)
-
-#ifdef CONFIG_NUMA
-static void balance_node(runqueue_t *this_rq, int idle, int this_cpu)
-{
-	int node = find_busiest_node(cpu_to_node(this_cpu));
-	unsigned long cpumask, this_cpumask = 1UL << this_cpu;
-
-	if (node >= 0) {
-		cpumask = node_to_cpumask(node) | this_cpumask;
-		spin_lock(&this_rq->lock);
-		load_balance(this_rq, idle, cpumask);
-		spin_unlock(&this_rq->lock);
-	}
-}
-#endif
 
 static void rebalance_tick(runqueue_t *this_rq, int idle)
 {
-#ifdef CONFIG_NUMA
-	int this_cpu = smp_processor_id();
-#endif
 	unsigned long j = jiffies;
 
 	/*
@@ -1119,24 +1136,16 @@ static void rebalance_tick(runqueue_t *t
 	 * are not balanced.)
 	 */
 	if (idle) {
-#ifdef CONFIG_NUMA
-		if (!(j % IDLE_NODE_REBALANCE_TICK))
-			balance_node(this_rq, idle, this_cpu);
-#endif
 		if (!(j % IDLE_REBALANCE_TICK)) {
 			spin_lock(&this_rq->lock);
-			load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
+			load_balance(this_rq, idle);
 			spin_unlock(&this_rq->lock);
 		}
 		return;
 	}
-#ifdef CONFIG_NUMA
-	if (!(j % BUSY_NODE_REBALANCE_TICK))
-		balance_node(this_rq, idle, this_cpu);
-#endif
 	if (!(j % BUSY_REBALANCE_TICK)) {
 		spin_lock(&this_rq->lock);
-		load_balance(this_rq, idle, cpu_to_node_mask(this_cpu));
+		load_balance(this_rq, idle);
 		spin_unlock(&this_rq->lock);
 	}
 }
@@ -1306,7 +1315,7 @@ need_resched:
 pick_next_task:
 	if (unlikely(!rq->nr_running)) {
 #ifdef CONFIG_SMP
-		load_balance(rq, 1, cpu_to_node_mask(smp_processor_id()));
+		load_balance(rq, 1);
 		if (rq->nr_running)
 			goto pick_next_task;
 #endif

             reply	other threads:[~2003-07-28 19:14 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2003-07-28 19:16 Erich Focht [this message]
2003-07-28 19:55 ` [patch] scheduler fix for 1cpu/node case Martin J. Bligh
2003-07-28 20:18   ` Erich Focht
2003-07-28 20:37     ` Martin J. Bligh
2003-07-29  2:24       ` Andrew Theurer
2003-07-29 10:08         ` Erich Focht
2003-07-29 13:33           ` [Lse-tech] " Andrew Theurer
2003-07-30 15:23             ` Erich Focht
2003-07-30 15:44               ` Andrew Theurer
2003-07-29 14:27           ` Martin J. Bligh
2003-08-13 20:49         ` Bill Davidsen
2003-08-22 15:46           ` [Lse-tech] " Andrew Theurer
2003-08-22 22:56             ` Nick Piggin
2003-08-23  0:12               ` Andrew Theurer
2003-08-23  0:29                 ` Nick Piggin
2003-08-23  0:47                   ` William Lee Irwin III
2003-08-23  8:48                     ` Nick Piggin
2003-08-23 14:32                   ` Andrew Theurer
2003-08-23  1:31                 ` Martin J. Bligh
2003-07-29 10:08       ` Erich Focht
2003-07-29 14:41     ` Andi Kleen
2003-07-31 15:05 ` Martin J. Bligh
2003-07-31 21:45   ` Erich Focht
2003-08-01  0:26     ` Martin J. Bligh
2003-08-01 16:30       ` [Lse-tech] " Erich Focht

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=200307280548.53976.efocht@gmx.net \
    --to=efocht@hpce.nec.com \
    --cc=Martin.Bligh@us.ibm.com \
    --cc=ak@muc.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lse-tech@lists.sourceforge.net \
    --cc=torvalds@osdl.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.